Example #1
0
# --- Gaussian kernel density estimation on the digit-"2" subset ---
N, M = np.shape(X)

# Keep only the observations whose label equals 2.
X = X[y.A.ravel() == 2, :]
N, M = np.shape(X)

### Gaussian kernel density estimator
# The kernel width is cross-validated by leave-one-out
# (implemented efficiently inside gausKernelDensity) over a
# geometric grid of candidate widths.
widths = X.var(axis=0).max() * (2.0 ** np.arange(-10, 3))
logP = np.zeros(np.size(widths))
for idx, w in enumerate(widths):
    density, log_density = gausKernelDensity(X, w)
    logP[idx] = log_density.sum()
ind = logP.argmax()
val = logP.max()

width = widths[ind]
print('Optimal estimated width is: {0}'.format(width))

# Re-evaluate the density at the cross-validated width.
density, log_density = gausKernelDensity(X, width)

# Ascending sort: lowest-density observations (outlier candidates) first.
i = (density.argsort(axis=0)).ravel()
density = density[i]

# Plot density estimate of outlier score
Example #2
0
# Load the data matrices and restrict them to images of the digit "2".
X = np.matrix(matdata['X'])
y = np.matrix(matdata['y'])
N, M = np.shape(X)

# Boolean row mask: keep rows whose label is 2.
X = X[y.A.ravel() == 2, :]
N, M = np.shape(X)

### Gaussian kernel density estimator
# Select the kernel width by leave-one-out cross-validation
# (efficiently computed by gausKernelDensity), scanning a
# geometric grid of candidate widths.
widths = X.var(axis=0).max() * (2.0 ** np.arange(-10, 3))
logP = np.zeros(np.size(widths))
for k, w in enumerate(widths):
    density, log_density = gausKernelDensity(X, w)
    logP[k] = log_density.sum()
ind = logP.argmax()
val = logP.max()

width = widths[ind]
print('Optimal estimated width is: {0}'.format(width))

# Re-evaluate the density at the chosen width.
density, log_density = gausKernelDensity(X, width)

# Ascending sort: lowest-density observations first.
i = (density.argsort(axis=0)).ravel()
density = density[i]

# Plot density estimate of outlier score
Example #3
0
def outlierDetection(X, objects=20):
    """Rank and visualize potential outliers in X with four scores.

    Runs (1) a Gaussian kernel density estimate with a leave-one-out
    cross-validated width, (2) a K-nearest-neighbour density, (3) the
    KNN average relative density, and (4) the distance to the K'th
    nearest neighbour.  For each score a bar chart of the `objects`
    most extreme observations is drawn and their indices are printed.

    Parameters
    ----------
    X : array-like, shape (N, M)
        Data matrix, one observation per row.
    objects : int, optional
        Number of top-ranked observations to plot/print (default 20).
    """
    ### Gaussian kernel density estimator
    # Cross-validate the kernel width by leave-one-out cross-validation
    # (efficient implementation in gausKernelDensity) over a geometric
    # grid of candidate widths.
    widths = X.var(axis=0).max() * (2.0 ** np.arange(-10, 3))
    logP = np.zeros(np.size(widths))
    for i, w in enumerate(widths):
        density, log_density = gausKernelDensity(X, w)
        logP[i] = log_density.sum()
    ind = logP.argmax()

    width = widths[ind]
    # FIX: the surrounding code used Python 2 `print x` statements mixed
    # with Python 3 calls; all prints are normalized to print() below.
    print("Optimal estimated width is: {0}".format(width))

    # Evaluate the density at the cross-validated width.
    density, log_density = gausKernelDensity(X, width)

    # Ascending sort: the lowest-density observations come first.
    i = (density.argsort(axis=0)).ravel()
    density = density[i]

    # Plot density estimate of outlier score.
    figure()
    bar(range(objects), density[:objects])
    title("Density estimate")

    # Print possible outliers.
    # NOTE(review): k starts at 1, so i[0] (the single lowest-density
    # observation) is skipped -- this looks like a leftover from the
    # 1-based subplot numbering of removed plotting code; confirm intent.
    print("For Gaussian Kernel Density")
    for k in range(1, objects + 1):
        print(i[k])

    ### K-neighbors density estimator
    # Number of neighbours to use:
    K = 5

    # Find the K nearest neighbours of every observation.
    knn = NearestNeighbors(n_neighbors=K).fit(X)
    D, i = knn.kneighbors(X)

    # Density = inverse of the mean distance to the K neighbours.
    density = 1.0 / (D.sum(axis=1) / K)

    # Sort ascending: lowest density = most outlying.
    i = density.argsort()
    density = density[i]

    # Plot k-neighbour estimate of outlier score (distances).
    figure()
    bar(range(objects), density[:objects])
    title("KNN density: Outlier score")
    print("\n")
    print("For KNN density")
    for k in range(1, objects + 1):
        print(i[k])

    ### K-nearest neighbour average relative density
    # Density relative to the mean density of the K neighbours.
    knn = NearestNeighbors(n_neighbors=K).fit(X)
    D, i = knn.kneighbors(X)
    density = 1.0 / (D.sum(axis=1) / K)
    avg_rel_density = density / (density[i[:, 1:]].sum(axis=1) / K)

    # Sort the avg. rel. densities ascending.
    i_avg_rel = avg_rel_density.argsort()
    avg_rel_density = avg_rel_density[i_avg_rel]

    # Plot k-neighbour estimate of outlier score (distances).
    figure()
    bar(range(objects), avg_rel_density[:objects])
    title("KNN average relative density: Outlier score")
    print("\n")
    print("For KNN average relative density")
    for k in range(1, objects + 1):
        print(i_avg_rel[k])

    ### Distance to the K'th nearest neighbour as outlier score
    K = 25

    # Find the K nearest neighbours.
    knn = NearestNeighbors(n_neighbors=K).fit(X)
    D, i = knn.kneighbors(X)

    # Outlier score: distance to the K'th neighbour, sorted descending.
    score = D[:, K - 1]
    i = score.argsort()
    score = score[i[::-1]]

    # Plot k-neighbour estimate of outlier score (distances).
    figure()
    bar(range(objects), score[:objects])
    title("25th neighbor distance: Outlier score")
    print("\n")
    # FIX: the message said "5'th" while K is 25 here.
    print("For 25'th neighbour distance")
    for k in range(1, objects + 1):
        # FIX: the original hard-coded i[462 - k], which is correct only
        # when the data set has exactly 462 rows; i[-k] is the same index
        # for that size and correct for any size.
        print(i[-k])

    show()
Example #4
0
import matplotlib.pyplot as plot
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Load the data, drop the target column, and standardize the attributes.
data = pd.read_csv('data.csv')
X = data.drop("class", axis=1)
# FIX: DataFrame.as_matrix() was removed in pandas 1.0;
# to_numpy() is the supported replacement and returns the same ndarray.
X = scaler.fit_transform(X.to_numpy())

# Grid-search the kernel bandwidth by maximizing the summed log-density.
# FIX: use float('-inf') instead of the bare name `inf`, which is not
# defined by any import visible in this snippet.
best_logP = float('-inf')
best_bandwidth = None
best_density = None

for bandwidth in np.linspace(0, 5, 1000):
    # FIX: the grid starts at 0, and a zero bandwidth is degenerate for a
    # Gaussian kernel (division by zero inside the estimator) -- skip it.
    if bandwidth <= 0:
        continue
    density, log_density = tb.gausKernelDensity(X, bandwidth)
    logP = log_density.sum()
    if logP > best_logP:
        best_logP = logP
        best_bandwidth = bandwidth
        best_density = density

# First column holds the per-observation density estimates.
kde_density = best_density[:, 0]

# Number of neighbors
K = 200

# Find the k nearest neighbors
knn = NearestNeighbors(n_neighbors=K).fit(X)
D, i = knn.kneighbors(X)
    # NOTE(review): orphaned fragment -- these indented lines have lost their
    # enclosing function/loop during extraction and reference names (plt, d,
    # n_outlier) defined elsewhere; kept byte-identical.
    plt.bar(range(n_outlier),d['ard'][:n_outlier])
    plt.title('KNN average relative density: Outlier score')
    plt.show()

# Per-method containers: d holds the scores, d_idx their sort orders.
d = {}
d_idx = {}

### Attribute normalization
# Scale every attribute so its maximum becomes 1.
X = X / np.max(X, axis=0)

print('Calculating Gaussian Kernel density...')
### Gaussian kernel
# Leave-one-out cross-validated kernel width over a geometric grid.
widths = X.var(axis=0).max() * (2.0 ** np.arange(-10, 3))
logP = np.zeros(np.size(widths))
for j, w in enumerate(widths):
    density, log_density = gausKernelDensity(X, w)
    logP[j] = log_density.sum()

ind = logP.argmax()
val = logP.max()

width = widths[ind]
# width = 0.32228417991810204
print('\tOptimal estimated width is: {0}'.format(width))

# Evaluate the density at the selected width.
d['kde'], log_density = gausKernelDensity(X, width)

# Ascending sort order of the KDE scores; flatten the result to 1-D.
d_idx['kde'] = (d['kde'].argsort(axis=0)).ravel()
d['kde'] = d['kde'][d_idx['kde']].reshape(d['kde'].shape[0])
Example #6
0
def outlierDetection(X, objects=20):
    """Rank and visualize potential outliers in X with four scores.

    Runs (1) a Gaussian kernel density estimate with a leave-one-out
    cross-validated width, (2) a K-nearest-neighbour density, (3) the
    KNN average relative density, and (4) the distance to the K'th
    nearest neighbour.  For each score a bar chart of the `objects`
    most extreme observations is drawn and their indices are printed.

    Parameters
    ----------
    X : array-like, shape (N, M)
        Data matrix, one observation per row.
    objects : int, optional
        Number of top-ranked observations to plot/print (default 20).
    """
    ### Gaussian kernel density estimator
    # Cross-validate the kernel width by leave-one-out cross-validation
    # (efficient implementation in gausKernelDensity) over a geometric
    # grid of candidate widths.
    widths = X.var(axis=0).max() * (2.0**np.arange(-10, 3))
    logP = np.zeros(np.size(widths))
    for i, w in enumerate(widths):
        density, log_density = gausKernelDensity(X, w)
        logP[i] = log_density.sum()
    ind = logP.argmax()

    width = widths[ind]
    # FIX: this function mixed Python 2 `print x` statements with Python 3
    # print() calls; all prints are normalized to print() below.
    print('Optimal estimated width is: {0}'.format(width))

    # Evaluate the density at the cross-validated width.
    density, log_density = gausKernelDensity(X, width)

    # Ascending sort: the lowest-density observations come first.
    i = (density.argsort(axis=0)).ravel()
    density = density[i]

    # Plot density estimate of outlier score.
    figure()
    bar(range(objects), density[:objects])
    title('Density estimate')

    # Print possible outliers.
    # NOTE(review): k starts at 1, so i[0] (the single lowest-density
    # observation) is skipped -- this looks like a leftover from the
    # 1-based subplot numbering of removed plotting code; confirm intent.
    print("For Gaussian Kernel Density")
    for k in range(1, objects + 1):
        print(i[k])

    ### K-neighbors density estimator
    # Number of neighbours to use:
    K = 5

    # Find the K nearest neighbours of every observation.
    knn = NearestNeighbors(n_neighbors=K).fit(X)
    D, i = knn.kneighbors(X)

    # Density = inverse of the mean distance to the K neighbours.
    density = 1. / (D.sum(axis=1) / K)

    # Sort ascending: lowest density = most outlying.
    i = density.argsort()
    density = density[i]

    # Plot k-neighbour estimate of outlier score (distances).
    figure()
    bar(range(objects), density[:objects])
    title('KNN density: Outlier score')
    print("\n")
    print("For KNN density")
    for k in range(1, objects + 1):
        print(i[k])

    ### K-nearest neighbour average relative density
    # Density relative to the mean density of the K neighbours.
    knn = NearestNeighbors(n_neighbors=K).fit(X)
    D, i = knn.kneighbors(X)
    density = 1. / (D.sum(axis=1) / K)
    avg_rel_density = density / (density[i[:, 1:]].sum(axis=1) / K)

    # Sort the avg. rel. densities ascending.
    i_avg_rel = avg_rel_density.argsort()
    avg_rel_density = avg_rel_density[i_avg_rel]

    # Plot k-neighbour estimate of outlier score (distances).
    figure()
    bar(range(objects), avg_rel_density[:objects])
    title('KNN average relative density: Outlier score')
    print("\n")
    print("For KNN average relative density")
    for k in range(1, objects + 1):
        print(i_avg_rel[k])

    ### Distance to the K'th nearest neighbour as outlier score
    K = 25

    # Find the K nearest neighbours.
    knn = NearestNeighbors(n_neighbors=K).fit(X)
    D, i = knn.kneighbors(X)

    # Outlier score: distance to the K'th neighbour, sorted descending.
    score = D[:, K - 1]
    i = score.argsort()
    score = score[i[::-1]]

    # Plot k-neighbour estimate of outlier score (distances).
    figure()
    bar(range(objects), score[:objects])
    title('25th neighbor distance: Outlier score')
    print("\n")
    # FIX: the message said "5'th" while K is 25 here.
    print("For 25'th neighbour distance")
    for k in range(1, objects + 1):
        # FIX: the original hard-coded i[462 - k], which is correct only
        # when the data set has exactly 462 rows; i[-k] is the same index
        # for that size and correct for any size.
        print(i[-k])

    show()
Example #7
0
def Outlier(input_data, index_to_check):
    """Score observations by four outlier-detection methods and plot them.

    The methods are: (1) Gaussian kernel density with a leave-one-out
    cross-validated kernel width, (2) K-nearest-neighbour density,
    (3) KNN average relative density, and (4) distance to the K'th
    nearest neighbour.  For each method the 5 most extreme observations
    are printed and the 20 most extreme are shown in a bar chart.

    Parameters
    ----------
    input_data : data set accepted by split_train_test.
    index_to_check : selector passed through to split_train_test.
    """
    X, y = split_train_test(input_data, index_to_check)

    N, M = np.shape(X)

    ### Gaussian kernel density estimator
    # Cross-validate the kernel width by leave-one-out cross-validation
    # (efficient implementation in gausKernelDensity) over a geometric
    # grid of candidate widths.
    widths = X.var(axis=0).max() * (2.0**np.arange(-10, 3))
    logP = np.zeros(np.size(widths))
    for i, w in enumerate(widths):
        print('Fold {:2d}, w={:f}'.format(i, w))
        density, log_density = gausKernelDensity(X, w)
        logP[i] = log_density.sum()

    ind = logP.argmax()

    width = widths[ind]
    print('Optimal estimated width is: {0}'.format(width))

    # Evaluate the density at the cross-validated width.
    density, log_density = gausKernelDensity(X, width)

    # Ascending sort: lowest-density observations (outlier candidates) first.
    i = (density.argsort(axis=0)).ravel()
    density = density[i].reshape(-1, )
    print('The index of the lowest GKD estimator object: {0}'.format(i[0:5]))
    print('The value of the lowest GKD estimator object: {0}'.format(
        density[0:5]))

    # Plot density estimate of outlier score.
    figure(1)
    bar(range(20), density[:20])
    title('Density estimate')

    ### K-neighbors density estimator
    # Number of neighbours to use:
    K = 5

    # Find the K nearest neighbours of every observation.
    knn = NearestNeighbors(n_neighbors=K).fit(X)
    D, i = knn.kneighbors(X)

    # Density = inverse of the mean distance to the K neighbours.
    density = 1. / (D.sum(axis=1) / K)

    # Sort ascending: lowest density = most outlying.
    i = density.argsort()
    density = density[i]
    print(
        'The index of the lowest KNN 5 neighbours density object: {0}'.format(
            i[0:5]))
    print(
        'The value of the lowest KNN 5 neighbours density object: {0}'.format(
            density[0:5]))

    # Plot k-neighbour estimate of outlier score (distances).
    figure(3)
    bar(range(20), density[:20])
    title('KNN density: Outlier score')

    ### K-nearest neighbour average relative density
    # Density relative to the mean density of the K neighbours.
    knn = NearestNeighbors(n_neighbors=K).fit(X)
    D, i = knn.kneighbors(X)
    density = 1. / (D.sum(axis=1) / K)
    avg_rel_density = density / (density[i[:, 1:]].sum(axis=1) / K)

    # Sort the avg. rel. densities ascending.
    i_avg_rel = avg_rel_density.argsort()
    avg_rel_density = avg_rel_density[i_avg_rel]

    print('The index of the lowest KNN average relative density object: {0}'.
          format(i_avg_rel[0:5]))
    print('The value of the lowest KNN average relative density object: {0}'.
          format(avg_rel_density[0:5]))

    # Plot k-neighbour estimate of outlier score (distances).
    figure(5)
    bar(range(20), avg_rel_density[:20])
    title('KNN average relative density: Outlier score')

    ### Distance to the K'th nearest neighbour as outlier score
    K = 5

    # Find the K nearest neighbours.
    knn = NearestNeighbors(n_neighbors=K).fit(X)
    D, i = knn.kneighbors(X)

    # Outlier score: distance to the K'th neighbour.
    score = D[:, K - 1]
    # FIX: the original sorted i ascending and then reversed only the
    # scores (score = score[i[::-1]]), so the printed i[0:5] were the
    # indices of the *lowest* scores while score[0:5] held the *highest*
    # values -- indices and values did not correspond.  Sorting the index
    # array descending keeps them aligned; the score values are unchanged.
    i = score.argsort()[::-1]
    score = score[i]
    print(
        'The index of the highest KNN 5 neighbours outlier score: {0}'.format(
            i[0:5]))
    print(
        'The value of the highest KNN 5 neighbours outlier score: {0}'.format(
            score[0:5]))

    # Plot k-neighbour estimate of outlier score (distances).
    figure(7)
    bar(range(20), score[:20])
    title('5th neighbor distance: Outlier score')

    show()

    print('Ran Exercise 11.4.1')