def compute_print_scores(normal_users, queue, Ks):

    K_GMM_n, K_KMeans_n, K_GMM_s, K_KMeans_s = Ks

    print('novelty score GMM')
    B = GMM(covariance_type='full', n_components=1)
    B.fit(queue)
    x = [B.score([i]).mean() for i in queue]
    print(get_score_last_item(x, K_GMM_n))

    print('novelty score OneClassSVM')
    x = anom_one_class(queue, [queue[-1]])
    print(x[-1])

    print('novelty score LSA')
    anomalymodel = lsanomaly.LSAnomaly()
    X = np.array(queue)
    anomalymodel.fit(X)
    print(anomalymodel.predict(np.array([queue[-1]])))

    print('novelty score K_means')
    K = KMeans(n_clusters=1)
    K.fit(queue)
    x = [K.score([i]) for i in queue]
    print(get_score_last_item(x, K_KMeans_n))

    normal_and_new = normal_users + [queue[-1]]

    print('degree of belonging to known class GMM')
    B = GMM(covariance_type='full', n_components=1)
    B.fit(normal_users)
    x = [B.score([i]).mean() for i in normal_and_new]
    print(get_score_last_item(x, K_GMM_s))

    print('degree of belonging to known class OneClassSVM')
    x = anom_one_class(normal_users, [queue[-1]])
    print(x[-1])

    print('degree of belonging to known class LSA')
    anomalymodel = lsanomaly.LSAnomaly()
    X = np.array(normal_users)
    anomalymodel.fit(X)
    print(anomalymodel.predict(np.array([queue[-1]])))

    print('degree of belonging to known class K_means')
    K = KMeans(n_clusters=1)
    K.fit(normal_users)
    x = [K.score([i]) for i in normal_and_new]
    print(get_score_last_item(x, K_KMeans_s))
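
# get_score_last_item is called above but not defined in these snippets.
# A minimal sketch of what it might look like, assuming it returns the
# z-score of the last entry's fit score scaled by the curiosity factor K
# (values >= 1 read as anomalous); the exact scaling is an assumption:
import numpy as np

def get_score_last_item(scores, K):
    scores = np.asarray(scores, dtype=float)
    z = (scores[-1] - scores.mean()) / scores.std()
    return abs(z) * K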
Example #2
def plot_results(X,
                 xx,
                 yy,
                 threshold=0.5,
                 sigma_candidates=None,
                 rho_candidates=None):
    _ = plt.figure(figsize=(16, 10))

    for row, sigma in enumerate(sigma_candidates):
        for col, rho in enumerate(rho_candidates):

            # Train the anomaly model
            clf = lsanomaly.LSAnomaly(sigma=sigma, rho=rho)
            clf.fit(X)

            # Get anomaly scores across the grid
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
            Z = Z.reshape(xx.shape)

            # Plot the training data, the anomaly model response, and the
            # decision boundary at the given threshold.
            subplot = plt.subplot(len(sigma_candidates), len(rho_candidates),
                                  row * len(rho_candidates) + col + 1)
            plt.contourf(
                xx,
                yy,
                Z,
                levels=np.linspace(0, 1, 11),
                cmap=plt.cm.get_cmap("GnBu"),
            )
            subplot.contour(xx,
                            yy,
                            Z,
                            levels=[threshold],
                            linewidths=2,
                            colors="red")
            cb = plt.colorbar()
            for t in cb.ax.get_yticklabels():
                t.set_fontsize(10)
            plt.scatter(X[:, 0],
                        X[:, 1],
                        c="black",
                        marker="+",
                        s=50,
                        linewidth=2)
            subplot.set_title(
                r"$\sigma = $ %.3g, $\rho$ = %.3g" % (sigma, rho),
                fontsize=14,
                usetex=True,
            )
            subplot.axes.get_xaxis().set_ticks([])
            subplot.axes.get_yaxis().set_ticks([])

            plt.xlim((-7, 7))
            plt.ylim((-7, 7))

    plt.show()
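
# A minimal usage sketch for plot_results (the data, grid resolution and
# candidate parameter values below are assumptions, not from the original):
import numpy as np
X_demo = np.random.randn(200, 2)
xx, yy = np.meshgrid(np.linspace(-7, 7, 150), np.linspace(-7, 7, 150))
plot_results(X_demo, xx, yy,
             sigma_candidates=[0.5, 1.0, 3.0],
             rho_candidates=[0.1, 1.0, 10.0])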
    def train_with_lsanomaly(self, trainX, testX):
        anomalymodel = lsanomaly.LSAnomaly()
        anomalymodel.fit(trainX)
        y_pred_train = anomalymodel.predict(trainX)
        y_pred_test = anomalymodel.predict(testX)

        # Process results
        self.replace_in_list(y_pred_train, 'anomaly', -1)
        self.replace_in_list(y_pred_test, 'anomaly', -1)
        n_error_train = y_pred_train.count(-1)
        n_error_test = y_pred_test.count(-1)

        return y_pred_train, y_pred_test, n_error_train, n_error_test
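
    # replace_in_list is not defined in this snippet. A plausible sketch,
    # assuming it substitutes matching values in place so that the
    # list.count(-1) calls above work on lsanomaly's list output:
    def replace_in_list(self, values, old, new):
        for i, v in enumerate(values):
            if v == old:
                values[i] = new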
Example #4
def use_model(model, df_list, x_columns, params):
    predicted = []

    if model == 'knn':
        neigh = NearestNeighbors(n_neighbors=params['n'], p=params['p'])
        neigh.fit(df_list[0][x_columns])

        for i in range(len(df_list)):
            pred = neigh.kneighbors(df_list[i][x_columns])
            pred = [np.mean(i) for i in pred[0]]
            predicted.append(pred)

    elif model == 'svm':
        svm = OneClassSVM(kernel=params['kernel'])
        svm.fit(df_list[0][x_columns])

        for i in range(len(df_list)):
            pred = svm.score_samples(df_list[i][x_columns])
            maximum = max(pred)
            pred = [(x * -1) + maximum for x in pred]
            predicted.append(pred)

    elif model == 'isolationForest':
        clf = IsolationForest(n_estimators=params['n_estimators'],
                              random_state=0)
        clf.fit(df_list[0][x_columns])

        for i in range(len(df_list)):
            pred = clf.score_samples(df_list[i][x_columns])
            pred = list(map(abs, pred))
            predicted.append(pred)

    elif model == 'autoencoder':
        clf = AutoEncoder(hidden_neurons=params['hidden_neurons'],
                          verbose=0,
                          random_state=0)
        clf.fit(df_list[0][x_columns])
        for i in range(len(df_list)):
            pred = clf.decision_function(df_list[i][x_columns])
            predicted.append(pred)

    elif model == 'lsanomaly':
        anomalymodel = lsanomaly.LSAnomaly(sigma=params['sigma'],
                                           rho=params['rho'])
        anomalymodel.fit(df_list[0][x_columns].to_numpy())
        for i in range(len(df_list)):
            pred = anomalymodel.predict_proba(df_list[i][x_columns].to_numpy())
            pred = [a[1] for a in pred]
            predicted.append(pred)

    return predicted
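
# Hypothetical usage sketch for use_model (the toy frames, feature columns
# and parameter values below are assumptions, not from the original):
import numpy as np
import pandas as pd
train_df = pd.DataFrame(np.random.randn(100, 2), columns=['f1', 'f2'])
test_df = pd.DataFrame(np.random.randn(50, 2), columns=['f1', 'f2'])
scores = use_model('lsanomaly', [train_df, test_df], ['f1', 'f2'],
                   {'sigma': 3.0, 'rho': 0.1})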
digits = datasets.load_digits()
X = digits.data
y = digits.target

# Split data into training and test sets, then remove all examples of
# class 9 from the training set, leaving only examples of 0-8.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.5)

train_inlier_idx = y_train < 9
X_train = X_train[train_inlier_idx, :]
y_train = y_train[train_inlier_idx]

# Fit the model for inlier classes
anomalymodel = lsanomaly.LSAnomaly()
anomalymodel.fit(X_train, y_train)

# Use the outlier score as a prediction of whether each test point
# belongs to class 9, for which no training data was given.
predictions = anomalymodel.predict_proba(X_test)
fpr, tpr, thresholds = metrics.roc_curve(y_test == 9, predictions[:, -1])
print('AUC=%f' % (metrics.auc(fpr, tpr)))

# Try to assign each test point to classes 0-9, given only training data
# for classes 0-8.
y_pred = anomalymodel.predict(X_test)
y_pred = [w if np.isreal(w) else 9 for w in y_pred]
print('Confusion matrix for all classes:')
print(metrics.confusion_matrix(y_test, y_pred))
Example #6
def evaluate(
    X_train,
    y_train,
    X_test,
    y_test,
    outlier_class,
    method_name,
    current_method_aucs,
    sigma,
    rho=0.1,
    nu=0.5,
):
    """
    Evaluation for a method and data set. Calculates the AUC for a single
    evaluation fold.

    Args:
        X_train (numpy.ndarray): independent training variables

        y_train (numpy.ndarray): training labels

        X_test (numpy.ndarray): independent test variables

        y_test (numpy.ndarray): test labels

        outlier_class (int): index of the outlier class

        method_name (str): method being run

        current_method_aucs (list): input to the *results* dictionary

        sigma (float): kernel lengthscale for LSAD and OCSVM

        rho (float): smoothness parameter for LSAD

        nu (float): OCSVM parameter - see *scikit-learn* documentation

    Raises:
        ValueError: if a `NaN` is encountered in the AUC calculation.

    """
    try:
        if method_name == "LSAD":
            lsanomaly_model = lsanomaly.LSAnomaly(n_kernels_max=500,
                                                  gamma=sigma**-2,
                                                  rho=rho)
            lsanomaly_model.fit(X_train, y_train)
            predictions = lsanomaly_model.predict_proba(X_test)[:, -1]

        elif method_name == "OCSVM":
            svm_anomaly_model = svm.OneClassSVM(gamma=sigma**-2, nu=nu)
            svm_anomaly_model.fit(X_train)
            predictions = 1 - svm_anomaly_model.decision_function(X_test)

        elif method_name == "KNN":
            anomaly_model = neighbors.NearestNeighbors(n_neighbors=10)
            anomaly_model.fit(X_train)
            dists, idx = anomaly_model.kneighbors(X_test)
            predictions = dists[:, -1]

        elif method_name == "KM":
            km = cluster.KMeans(n_clusters=min(X_train.shape[0], 20))
            km.fit(X_train)
            nn = neighbors.NearestNeighbors(n_neighbors=1)
            nn.fit(km.cluster_centers_)
            dists, idx = nn.kneighbors(X_test)
            predictions = dists[:, 0]

        else:
            raise ValueError("unknown method: {}".format(method_name))

        fpr, tpr, thresholds = metrics.roc_curve(y_test == outlier_class,
                                                 predictions)

        metric_auc = metrics.auc(fpr, tpr)
        logger.debug("\tAUC: {:>6.4f}".format(metric_auc))

        if not math.isnan(metric_auc):
            current_method_aucs.append(metric_auc)
        else:
            raise ValueError("NaN encountered in {}".format(method_name))
    except Exception as e:
        logger.exception("\t{} {}: {}".format(method_name, type(e), str(e)),
                         exc_info=True)
        raise
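
# Hypothetical per-fold driver for evaluate (the fold scheme, data names and
# parameter values are assumptions): collect AUCs for one method across splits.
from sklearn import model_selection
aucs = []
for train_idx, test_idx in model_selection.KFold(n_splits=5).split(X):
    evaluate(X[train_idx], y[train_idx], X[test_idx], y[test_idx],
             outlier_class=9, method_name="LSAD",
             current_method_aucs=aucs, sigma=3.0)
print("mean AUC:", sum(aucs) / len(aucs))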
def compute_scores(normal_users, queue, Ks):

    '''
        Calculates the novelty scores (noise and strangeness) for the 4 algorithms.
        Receives the list of normal users, the queue (all users) and the list of curiosity factors Ks.
        Updates the global variables GMM_n, one_n, lsa_n, K_n, GMM_s, one_s, lsa_s and K_s with the results.
    '''

    global GMM_n, one_n, lsa_n, K_n, GMM_s, one_s, lsa_s, K_s  # Novelty scores per algorithm: the *_n lists hold noise scores, the *_s lists hold strangeness scores

    GMM_n = []
    one_n = []
    lsa_n = []
    K_n = []
    GMM_s = []
    one_s = []
    lsa_s = []
    K_s = []

    K_GMM_n, K_KMeans_n, K_GMM_s, K_KMeans_s = Ks  # Ks is the list of the four curiosity factors:
                                                   # K_GMM_n, K_KMeans_n for the noise scores,
                                                   # K_GMM_s, K_KMeans_s for the strangeness scores

    '''
    For One_class_SVM and LSA, predicting the new entry directly returns a label:
        LSA: 'anomaly' or 0 (normal)

        One_class_SVM: -1 (anomaly) or 1 (normal)

    GMM and K-means instead return a fitting score. The novelty score is obtained by
    computing the z-score of the new entry against the scores of all other entries,
    via the function get_score_last_item.
        If the z-score returned is >= 1, the new entry is anomalous.
    '''

    '''
    Noise scores are computed with the queue as the knowledge base, fitting every entry but the last to the algorithm
    '''
    B = GMM(covariance_type='full', n_components = 1)
    B.fit(queue[0:-1])
    x = [B.score([i]).mean() for i in queue]
    GMM_n.append(get_score_last_item(x, K_GMM_n))


    K = KMeans(n_clusters=1)
    K.fit(queue[0:-1])
    x = [K.score([i]) for i in queue]
    K_n.append(get_score_last_item(x, K_KMeans_n))

    oneClassSVM = OneClassSVM(nu=0.1)
    oneClassSVM.fit(queue[0:-1])
    x = oneClassSVM.predict(np.array([queue[-1]]))
    if x[0] == -1:
        one_n.append(1)
    else:
        one_n.append(0)
    
    X = np.array(queue[0:-1])
    anomalymodel = lsanomaly.LSAnomaly()
    anomalymodel.fit(X)
    x = anomalymodel.predict(np.array([queue[-1]]))
    if x == ['anomaly']:
        lsa_n.append(1)
    else:
        lsa_n.append(0)

    '''
    Strangeness scores are computed with the normal users as the knowledge base, fitting the normal users to the algorithm
    '''

    normal_and_new = normal_users + [queue[-1]]  # passed to get_score_last_item to compute the z-score of the last item (the new entry)

    B = GMM(covariance_type='full', n_components = 1)
    B.fit(normal_users)
    x = [B.score([i]).mean() for i in normal_and_new]
    GMM_s.append(get_score_last_item(x, K_GMM_s))


    K = KMeans(n_clusters=1)
    K.fit(normal_users)
    x = [K.score([i]) for i in normal_and_new]
    K_s.append(get_score_last_item(x, K_KMeans_s))

    oneClassSVM = OneClassSVM(nu=0.1)
    oneClassSVM.fit(normal_users)
    x = oneClassSVM.predict(np.array([queue[-1]]))
    if x[0] == -1:
        one_s.append(1)
    else:
        one_s.append(0)

    anomalymodel = lsanomaly.LSAnomaly()
    X = np.array(normal_users)
    anomalymodel.fit(X)
    x = anomalymodel.predict(np.array([queue[-1]]))
    if x == ['anomaly']:
        lsa_s.append(1)
    else:
        lsa_s.append(0)

    return GMM_n, one_n, lsa_n, K_n, GMM_s, one_s, lsa_s, K_s
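
# Hypothetical call sketch for compute_scores: queue holds every entry with
# the newest last, and Ks the four curiosity factors in the unpacking order
# shown above (the values here are placeholders):
results = compute_scores(normal_users, queue, Ks=[1.0, 1.0, 1.0, 1.0])
GMM_n, one_n, lsa_n, K_n, GMM_s, one_s, lsa_s, K_s = results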
Example #8
def eval(
    X_train,
    y_train,
    X_test,
    y_test,
    outlier_class,
    method,
    method_name,
    current_method_aucs,
    sigma,
    rho=0.1,
    nu=0.5,
):
    predictions = None

    try:
        if method_name == "LSAD":
            lsanomaly_model = lsanomaly.LSAnomaly(n_kernels_max=500,
                                                  gamma=sigma**-2,
                                                  rho=rho)
            lsanomaly_model.fit(X_train, y_train)
            predictions = lsanomaly_model.predict_proba(X_test)[:, -1]

        elif method_name == "OCSVM":
            svm_anomaly_model = svm.OneClassSVM(gamma=sigma**-2, nu=nu)
            svm_anomaly_model.fit(X_train)
            predictions = 1 - svm_anomaly_model.decision_function(X_test)

        elif method_name == "KNN":
            anomaly_model = neighbors.NearestNeighbors(n_neighbors=10)
            anomaly_model.fit(X_train)
            dists, idx = anomaly_model.kneighbors(X_test)
            predictions = dists[:, -1]

        elif method_name == "KM":
            km = cluster.KMeans(n_clusters=min(X_train.shape[0], 20))
            km.fit(X_train)
            nn = neighbors.NearestNeighbors(n_neighbors=1)
            nn.fit(km.cluster_centers_)
            dists, idx = nn.kneighbors(X_test)
            predictions = dists[:, 0]

        elif method_name == "DBS":
            dbs_anomaly_model = cluster.DBSCAN(eps=sigma, min_samples=3)
            clusters = dbs_anomaly_model.fit_predict(X_test)

        else:
            raise ValueError("unknown method: {}".format(method_name))

        fpr, tpr, thresholds = metrics.roc_curve(y_test == outlier_class,
                                                 predictions)

        metric_auc = metrics.auc(fpr, tpr)
        logger.debug("\tAUC: {:>6.4f}".format(metric_auc))

        if not math.isnan(metric_auc):
            current_method_aucs.append(metric_auc)
        else:
            raise ValueError("NaN encountered in {}".format(method_name))
    except Exception as e:
        logger.exception(
            "\t{} {}: {}".format(method_name, type(e), str(e)),
            exc_info=True,
        )
        raise
Example #9
Y_test = Y2[0:319, :]


X_train = X2[:5000, :]
X_test = X2[10000:15000, :]


plt.plot(X_test[:, 0])
plt.plot(X_test[:, 1])


# Train the model
anomalymodel = lsanomaly.LSAnomaly(rho=1, sigma=0.5)
anomalymodel.fit(Y_train)

# Predict anomalies statically (assuming iid samples)
y_pred_static = anomalymodel.predict_proba(Y_test)

# Predict anomalies sequentially (assume known transition matrix and
# initial probabilities)
A = np.array([[.999, .001], [.01, .99]])
pi = np.array([.5, .5])
y_pred_dynamic = anomalymodel.predict_sequence(Y_test, A, pi)


plt.clf()
plt.figure(figsize=(10, 6))
Example #10
def generate_neural_classifier(data_list):
    x_train = np.array(data_list)
    clf = lsanomaly.LSAnomaly()
    clf.fit(x_train)
    return clf
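
# Usage sketch for generate_neural_classifier (the toy rows below are
# assumptions): fit on a few feature vectors, then score a distant point.
import numpy as np
clf = generate_neural_classifier([[0.1, 0.2], [0.15, 0.22], [0.09, 0.18]])
print(clf.predict(np.array([[5.0, 5.0]])))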