Exemplo n.º 1
0
def obj_func_LOF(params):
    """Objective function evaluated during Bayesian optimisation of LOF.

    ``params`` is read positionally: ``[0]`` contamination (outlier
    fraction), ``[1]`` n_neighbors, ``[2]`` neighbour-search algorithm and
    ``[3]`` leaf_size.

    The training arrays are loaded from ``Y_train.npy`` / ``X_train.npy`` in
    the current working directory on every call. The return value is
    whatever ``objVal_f`` produces from the R-precision, the predicted labels
    and the true labels.
    """
    contamination, k, search_algorithm, leaf = (params[i] for i in range(4))

    # Bring the data set into this function's scope.
    Y_train = np.load('Y_train.npy')
    X_train = np.load('X_train.npy')

    # Build the detector and fit it on the features only.
    detector = LOF(contamination=contamination,
                   n_neighbors=k,
                   algorithm=search_algorithm,
                   leaf_size=leaf)
    detector.fit(X_train)

    # Raw anomaly scores, negated before they are handed to Rprecision_f.
    negated_scores = -1 * detector.decision_function(X_train)
    r_precision = Rprecision_f(Y_train, negated_scores)
    if glb_verbose:
        print('R Precision : ', r_precision)

    # Outlier/inlier category predicted for each training point.
    predicted_labels = detector.predict(X_train)
    return objVal_f(r_precision, predicted_labels, Y_train)
Exemplo n.º 2
0
def lof_pyod_once(X_nor, X_test, y_test, n_neighbors, contamination=0.05):
    """Fit LOF on ``X_nor`` and score it once against a labelled test set.

    ``X_nor`` is converted with ``.astype(float).values``, so it has to be a
    pandas-like object. Returns ``(tpr, fpr, auc, scores)`` where ``tpr`` and
    ``fpr`` come from the confusion matrix of the predicted labels, ``auc``
    is the ROC AUC of the test scores, and ``scores`` are the test decision
    scores after ``np.nan_to_num``.
    """
    detector = LOF(n_neighbors=n_neighbors, contamination=contamination)
    detector.fit(X_nor.astype(float).values.copy())
    # From here on the decision threshold is determined.

    predicted = detector.predict(X_test)
    scores = np.nan_to_num(detector.decision_function(X_test), copy=True)

    # Confusion-matrix cells in the flattened order tn, fp, fn, tp.
    true_neg, false_pos, false_neg, true_pos = confusion_matrix(
        y_test, predicted).ravel()
    tpr = true_pos / (true_pos + false_neg)
    fpr = false_pos / (true_neg + false_pos)

    return tpr, fpr, roc_auc_score(y_test, scores), scores
def calculate_LOF(given_DT, given_neighbors):
    """Return the LOF decision scores of the data the model was fitted on.

    ``given_DT`` is wrapped in a ``pd.DataFrame`` and its ``.values`` are
    used as the feature matrix; ``given_neighbors`` is passed to LOF as
    ``n_neighbors``.
    """
    feature_matrix = pd.DataFrame(given_DT).values
    detector = LOF(n_neighbors=given_neighbors)
    detector.fit(feature_matrix)
    # Scores stored on the detector by fit(), not a fresh decision_function call.
    return detector.decision_scores_
Exemplo n.º 4
0
def anomaly_detection(data, label):
    """Run Isolation Forest, AutoEncoder and LOF on the numeric columns of ``data``.

    The ``label`` column supplies the ground truth and is dropped from the
    features; the remaining numeric columns are standardised. For each
    detector the function prints a ROC AUC computed from the detector's
    binary ``labels_`` and plots its ``decision_scores_`` through
    ``utilities.plot_outlier_scores``.

    Returns ``(y, ifo_pred, ae_pred, lof_pred)``: the ground-truth array and
    the three detectors' training labels.
    """
    numeric_columns = data.select_dtypes('number').columns.tolist()
    y = data[label].values
    features = data[numeric_columns].drop([label], axis=1)
    features = pd.DataFrame(data=StandardScaler().fit_transform(features),
                            columns=features.columns)

    def _fit_report_plot(detector, roc_message, title_template, title_param):
        # Fit on the scaled features, print the label-based ROC AUC, plot the
        # raw scores, and hand back the training labels.
        detector.fit(features)
        predicted = detector.labels_
        print(roc_message, roc_auc_score(y, predicted))
        utilities.plot_outlier_scores(
            y,
            detector.decision_scores_,
            bw=0.1,
            title=title_template.format(getattr(detector, title_param)))
        return predicted

    ifo_pred = _fit_report_plot(
        IForest(contamination=0.01,
                behaviour='new',
                n_estimators=1000,
                max_samples=1024,
                n_jobs=-1,
                verbose=1),
        'ROC score for Isolation forest: ',
        'Fraud, Isolation forest. (n_estimators={})',
        'n_estimators')

    ae_pred = _fit_report_plot(
        AutoEncoder(hidden_neurons=[25, 20, 15, 20, 25],
                    hidden_activation='relu',
                    output_activation='sigmoid',
                    optimizer='adam',
                    epochs=20,
                    batch_size=128,
                    dropout_rate=0.2,
                    l2_regularizer=0.0,
                    validation_size=0.1,
                    preprocessing=False,
                    verbose=1,
                    random_state=1,
                    contamination=0.01),
        'ROC score for Autoencoder: ',
        'Fraud, Autoencoder. (epochs={})',
        'epochs')

    # Original author's note: too long to train, under-sampling needed.
    # The neighbourhood size is 1.3x the number of positive labels.
    lof_pred = _fit_report_plot(
        LOF(n_neighbors=int(y.sum() * 1.3), contamination=0.01, n_jobs=-1),
        'ROC score for LOF: ',
        'Fraud, Local outliers factor. (n_neighbors={})',
        'n_neighbors')

    return y, ifo_pred, ae_pred, lof_pred
Exemplo n.º 5
0
    def __call__(self):
        """Fit LOF on the buggy run plus every original run, then predict.

        Data is loaded from the ``*_ENTER.csv`` / ``*_EXIT.csv`` pair in
        ``self.data_buggy_dir`` and extended, row-wise, with the pair found
        in each directory that ``os.walk`` reports under
        ``self.data_orig_dir``.

        Raises ``ModelError`` when the fitted model labels no sample as an
        inlier. Returns the result of ``self.predict`` on the fitted model.
        """
        detector = LOF(contamination=0.1)
        data = self.get_data(
            self.get_file(join(self.data_buggy_dir, '*_ENTER.csv')),
            self.get_file(join(self.data_buggy_dir, '*_EXIT.csv')))

        # Append the rows of every run found under self.data_orig_dir.
        for cur_dir, dirs, _files in os.walk(self.data_orig_dir):
            for sub_dir in dirs:
                ext_data = self.get_data(
                    self.get_file(join(cur_dir, sub_dir, '*_ENTER.csv')),
                    self.get_file(join(cur_dir, sub_dir, '*_EXIT.csv')))
                logger.debug('shape of data: {}'.format(data.shape))
                logger.debug('shape of ext_data: {}'.format(ext_data.shape))
                data = np.concatenate((data, ext_data), axis=0)
                logger.debug('shape of data: {}'.format(data.shape))

        detector.fit(data)
        # Binary training labels (0: inliers, 1: outliers).
        unique, counts = np.unique(detector.labels_, return_counts=True)
        logger.debug('unique (train): {}'.format(unique))
        logger.debug('counts (train): {}'.format(counts))
        if 0 not in unique:
            raise ModelError('Model contains no inlier')

        # np.unique returns its values sorted, so with label 0 present
        # counts[0] is the inlier count.
        inliers_size = counts[0]
        if len(counts) > 1:
            outliers_size = counts[1]
        else:
            outliers_size = 0
        logger.debug('num of inliers: {}'.format(inliers_size))
        logger.debug('num of outliers: {}'.format(outliers_size))

        return self.predict(detector)
Exemplo n.º 6
0
def run_LOF_base_detector(data, k, metric='euclidean', p=2):
    """
    Function to fit the LOF base detector on `data`.

    Input:
     - data: pd.DataFrame, to run LOF on; must contain an `outlier` column
       with the ground truth and may contain an `id` column
     - k: integer, parameter to indicate the amount of neighbours to include in relative density determination
     - metric: string, distance metric to use, default `euclidean`
     - p: int, default 2, passed to LOF as `p`; set according to the distance metric

    Output:
     - clf of class pyod.models.lof.LOF with all its properties, plus a
       `true_labels_` attribute holding `data['outlier']`
    """

    # Split data in values and targets: some datasets have an ID column, others don't
    try:
        X = data.drop(['outlier', 'id'], axis=1)
    except KeyError:
        X = data.drop('outlier', axis=1)

    # Construct and fit classifier. The caller's metric is forwarded; it was
    # previously hard-coded to 'euclidean', silently ignoring the argument.
    clf = LOF(n_neighbors=k, metric=metric, p=p)
    clf.fit(X)  # Fit only on features

    # Add ground truth labels for evaluation of the classifier
    clf.true_labels_ = data['outlier']

    # Return the classifier for further processing
    return clf
def getOutlierLOF(dataset):
    '''
    @brief Fits a default-configured LOF detector on the dataset and reports
    which instances it labelled as inliers (0) or outliers (1)
    @param dataset Dataset the detector is fitted on
    @return The labels_ attribute of the fitted detector: 0 means inlier,
    1 means outlier
    '''
    detector = LOF()
    detector.fit(dataset)
    return detector.labels_
Exemplo n.º 8
0
    def runMethod(self):
        '''
        @brief This function is the actual implementation of HICS: it obtains
        the high contrast subspaces, scores every instance in each subspace
        with the detector selected by self.outlier_rank, and stores the
        per-instance average in self.outlier_score
        @exception ValueError if self.outlier_rank is not one of "lof",
        "cof", "cblof", "loci", "hbos" or "sod"
        '''
        # Each entry is a zero-argument factory. The lambdas defer the name
        # lookup, so only the selected detector class has to be importable.
        scorer_factories = {
            "lof": lambda: LOF(),
            "cof": lambda: COF(),
            "cblof": lambda: CBLOF(),
            "loci": lambda: LOCI(),
            "hbos": lambda: HBOS(),
            "sod": lambda: SOD(),
        }
        # Fail fast, before the subspace search, instead of crashing later
        # with "'NoneType' object has no attribute 'fit'".
        if self.outlier_rank not in scorer_factories:
            raise ValueError(
                "Unknown outlier_rank {!r}; expected one of {}".format(
                    self.outlier_rank, sorted(scorer_factories)))
        make_scorer = scorer_factories[self.outlier_rank]

        if self.verbose:
            print("Calculating the subspaces\n")
        # First we obtain the high contrast subspaces
        subspaces = self.hicsFramework()

        if self.verbose:
            print("Now calculating the scoring\n")
        # We initialize the scores for each instance as 0
        scores = np.zeros(len(self.dataset))
        # For each subspace
        for sub in subspaces:
            # A fresh scorer is fitted on the dataset projection
            scorer = make_scorer()
            scorer.fit(self.dataset[:, sub])
            # Adds the scores obtained to the global ones
            scores = scores + scorer.decision_scores_
        # Compute the average
        self.outlier_score = scores / len(subspaces)
        # Marks the calculations as done
        self.calculations_done = True
Exemplo n.º 9
0
    # load dataset
    data_dict = load_dataset(
        dataset,
        subdataset,
        "all",
    )

    x_train = data_dict["train"]
    x_test = data_dict["test"]
    x_test_labels = data_dict["test_labels"]

    # fit LOF on the training split; fitting and scoring are timed together
    start = time.time()
    od = LOF(n_neighbors=n_neighbors, leaf_size=leaf_size, p=p)
    od.fit(x_train)

    # get outlier scores
    anomaly_score = od.decision_function(x_test)

    anomaly_label = x_test_labels

    end = time.time()

    # Stored as `elapsed`, not `time`: assigning to `time` rebinds the module
    # name, and inside a function that makes the earlier `time.time()` call
    # fail with UnboundLocalError.
    elapsed = end - start

    # Make evaluation
    evaluate_all(anomaly_score, anomaly_label)
    salience = compute_salience(anomaly_score, anomaly_label)
    print('time')
    print('   ', elapsed)
Exemplo n.º 10
0
class SolverAECIFAR():
    """LOF baseline on the CIFAR VGG feature dataset.

    Mirrors the constructor of the auto-encoder solvers (many arguments are
    only stored), fits a default LOF in ``train`` and writes the test ROC AUC
    to ``<result_path>/result.npy`` in ``test``.
    """

    def __init__(self, data_name, hidden_dim=256, seed=0, learning_rate=3e-4, normal_class=0, anomaly_ratio=0.1,
                 batch_size=128, concentrated=0, training_ratio=0.8, SN=1, Trim=1, L=1.5, max_epochs=100):
        """Seed the RNGs, load the dataset and split it into train/test parts.

        Raises:
            ValueError: if ``concentrated`` is neither 0 nor 1. Previously
                this surfaced as an UnboundLocalError on ``full_data_name``.
        """
        # Resolve the dataset flavour first so an unsupported value fails
        # with a clear message before any work is done.
        if concentrated == 1.0:
            full_data_name = 'CIFAR10_Concentrated'
        elif concentrated == 0.0:
            full_data_name = 'CIFAR10'
        else:
            raise ValueError(
                "concentrated must be 0 or 1, got {!r}".format(concentrated))

        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if use_cuda else "cpu")
        self.L = L
        self.result_path = "./results/{}_{}_{}/0.0/LOF/{}/".format(
            full_data_name, normal_class, anomaly_ratio, seed
        )
        data_path = "./data/" + data_name + ".npy"
        self.learning_rate = learning_rate
        self.SN = SN
        self.Trim = Trim
        self.dataset = CIFARVGGDataset(data_path, normal_class=normal_class, anomaly_ratio=anomaly_ratio, concentrated=concentrated)
        self.seed = seed
        self.hidden_dim = hidden_dim
        self.max_epochs = max_epochs

        self.data_path = data_path
        self.data_anomaly_ratio = self.dataset.__anomalyratio__()
        self.batch_size = batch_size
        self.input_dim = self.dataset.__dim__()
        self.data_normaly_ratio = 1 - self.data_anomaly_ratio
        n_sample = self.dataset.__len__()
        self.n_train = int(n_sample * training_ratio)
        self.n_test = n_sample - self.n_train
        print('|data dimension: {}|data noise ratio:{}'.format(self.dataset.__dim__(), self.data_anomaly_ratio))

        self.training_data, self.testing_data = data.random_split(dataset=self.dataset,
                                                                  lengths=[
                                                                      self.n_train,
                                                                      self.n_test
                                                                  ])

        # Unused by the LOF baseline; kept so the attribute set matches the
        # other solvers.
        self.ae = None
        self.discriminator = None
        self.model = None

    def train(self):
        """Fit a default-configured LOF and keep it in ``self.model``."""
        self.model = LOF()
        # NOTE(review): `.dataset` on the split object looks like it returns
        # the full underlying dataset rather than the training subset, so the
        # model may be fitted on all samples - confirm this is intended.
        self.model.fit(self.training_data.dataset.x)

    def test(self):
        """Score the data, print the ROC AUC and save it under ``result_path``."""
        # NOTE(review): same `.dataset` access as in train() - presumably this
        # scores the full dataset, not only the held-out part; verify.
        y_test_scores = self.model.decision_function(self.testing_data.dataset.x)
        auc = roc_auc_score(self.testing_data.dataset.y, y_test_scores)

        print("AUC:{:0.4f}".format(
           auc))

        os.makedirs(self.result_path, exist_ok=True)

        # Only the AUC is computed; it is stored under every metric key so the
        # file has the same keys as the other solvers' results.
        np.save(
            self.result_path + "result.npy",
            {
                "accuracy": auc,
                "precision": auc,
                "recall": auc,
                "f1": auc,
                "auc": auc,
            },
        ) # for consistency
        print("result save to {}".format(self.result_path))
# In[19]:

# Notebook cell: obtain the PCA-reduced vectors and the tweet lookup from
# extract_2pca() (defined elsewhere in the notebook).
pca_vectors, tweets_dict = extract_2pca()

# In[22]:

# Display-only cell: shows the shape of the PCA output.
pca_vectors.shape

# ### 5.5.2 Implement LOF model

# In[13]:

# Implement LOF model, extract decision scores
lof = LOF(metric='cosine')  # cosine distance compares vector direction, not magnitude
# The return value of fit() is used as the fitted model below.
lof_model = lof.fit(pca_vectors)
# Scores stored on the model by fit(), one per row of pca_vectors.
scores = lof_model.decision_scores_

# In[32]:

# Display-only cell: largest decision score.
max(scores)

# ### 5.5.3 Implement function to extract top 5 outliers

# In[35]:

# Number of outliers to extract, and the accumulators filled by the loop
# that follows.
top_n = 5
tweet_index_decision_scores = []
decision_scores_tweet_index = []

for index, score in enumerate(scores):
#    cla.append(classifier)
##    cla.append((_*(144//splits),(_+1)*144//(splits)))
#clf =  LOF(n_neighbors=10, contamination=0.1)
#clf.fit(X_train)
#%%
#predictions = {}
#for _ in range (0,splits):
#    predictions['score_cla{}'.format(_)] = _
#predictions

#%%
# One LOF detector per contiguous column block of X_train, collected in `cla`.
# The slice bounds are built from the literal 144, so this presumably expects
# X_train to have 144 columns - TODO confirm. With splits = 3 each block is
# 48 columns wide.
cla = []
splits = 3
for _ in range(0, splits):
    classifier = LOF(n_neighbors=10, contamination=0.1)  #classifier
    # `_` is the split index: columns [_ * 48, (_ + 1) * 48) for splits = 3.
    classifier.fit(X_train[:, _ * (144 // splits):(_ + 1) * 144 // (splits)])
    cla.append(classifier)
    # Only unbinds the loop-local name; the detector stays referenced by `cla`.
    del classifier
#    cla.append((_*(144//splits),(_+1)*144//(splits)))
# Reference detector fitted on all columns at once.
clf = LOF(n_neighbors=10, contamination=0.1)  #classifier
clf.fit(X_train)
#colors = ['red','green','blue']
i = 0
for dt in rrule(DAILY, dtstart=start_date, until=end_date):
    if (data.loc[dt.strftime("%Y-%m-%d")]['value']).values.size != 0:
        data.loc[dt.strftime("%Y-%m-%d"), 'score'] = clf.predict(
            preprocessing.normalize(
                data.loc[dt.strftime("%Y-%m-%d")].value.values.reshape(1, -1)))
        data.loc[dt.strftime("%Y-%m-%d"),
                 'probab'] = (clf.predict_proba((preprocessing.normalize(
                     data.loc[dt.strftime("%Y-%m-%d")].value.values.reshape(
Exemplo n.º 13
0
        KNN(n_neighbors=25, contamination=contamination),
        KNN(n_neighbors=35, contamination=contamination),
        KNN(n_neighbors=45, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        LSCP(detector_list=[LOF(contamination=contamination),
                            LOF(contamination=contamination)])
    ]
    
    
    model = SUOD(base_estimators=base_estimators, n_jobs=6, bps_flag=True, 
                 contamination=contamination, approx_flag_global=False)

    model.fit(X)  # fit all models with X
    model.approximate(X)  # conduct model approximation if it is enabled
    predicted_labels = model.predict(X)  # predict labels on X; for demo purpose only
    predicted_scores = model.decision_function(X)  # predict scores on X; for demo purpose only

    # %%
    evaluate_print('majority vote', y, majority_vote(predicted_labels))
    evaluate_print('average', y, average(predicted_scores))
    evaluate_print('maximization', y, maximization(predicted_scores))

    clf = LOF()
    clf.fit(X)
    evaluate_print('LOF', y, clf.decision_scores_)

    clf = IForest()
    clf.fit(X)
    evaluate_print('IForest', y, clf.decision_scores_)
print "x"
#Number of states
St = 5
Label = []
if args.m == 'lof' or args.m == 'LOF' or args.m == 'Lof':
    print "lof"
    #print "D = ", df[:30]
    numK = int(args.th)
    print "creating LOF"
    clf = LOF(n_neighbors=numK)
    print "fitting LOF"
    #E = np.array(df[:30])
    E = np.array(df)
    E = np.asfarray(E, float)
    print "E = ", E[0:30]
    clf.fit(E)
    #clf.fit(df[:50])
    print "fitting done"
    #print "l = ", clf.decision_scores_
    mn = min(clf.decision_scores_)
    mx = max(clf.decision_scores_)
    R = mx - mn
    print "R = ", R, mn, mx
    for lab in clf.decision_scores_:
        v = int(St * ((lab - mn) / R))
        Label.append(v)
else:
    if args.m == 'IF' or args.m == 'if' or args.m == 'If':
        print "lof"
        #print "D = ", df[:30]
        numE = int(args.th)
Exemplo n.º 15
0
@author: zixing.mei
"""

from pyod.models.lof import LOF

# Train the anomaly detection model, then output an anomaly score for every
# training sample. `x`, `train`, `val` and `feature_lst` come from earlier in
# the script.
clf = LOF(n_neighbors=20,
          algorithm='auto',
          leaf_size=30,
          metric='minkowski',
          p=2,
          metric_params=None,
          contamination=0.1,
          n_jobs=1)
clf.fit(x)

# Anomaly score: column 1 of predict_proba with linear scaling
# (presumably the outlier-class probability - confirm against the pyod docs).
out_pred = clf.predict_proba(x, method='linear')[:, 1]
train['out_pred'] = out_pred

# Keep only the samples whose anomaly score is below the 0.9 quantile, i.e.
# drop the highest-scoring samples.
key = train['out_pred'].quantile(0.9)

x = train[train.out_pred < key][feature_lst]
y = train[train.out_pred < key]['bad_ind']

# The validation set is left unfiltered.
val_x = val[feature_lst]
val_y = val['bad_ind']

# Retrain the model
Exemplo n.º 16
0
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data
    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train LOF detector
    clf_name = 'LOF'
    clf = LOF()
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
Exemplo n.º 17
0
class TestLOF(unittest.TestCase):
    """Checks of the LOF detector on generated data (10% contamination)."""

    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        # Minimum ROC AUC accepted on the test split.
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = LOF(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        # Every fitted attribute must exist and be populated.
        assert_true(hasattr(self.clf, 'decision_scores_') and
                    self.clf.decision_scores_ is not None)
        assert_true(hasattr(self.clf, 'labels_') and
                    self.clf.labels_ is not None)
        assert_true(hasattr(self.clf, 'threshold_') and
                    self.clf.threshold_ is not None)
        assert_true(hasattr(self.clf, '_mu') and
                    self.clf._mu is not None)
        assert_true(hasattr(self.clf, '_sigma') and
                    self.clf._sigma is not None)
        assert_true(hasattr(self.clf, 'n_neighbors_') and
                    self.clf.n_neighbors_ is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        # An unknown conversion method must be rejected.
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        # An unknown scoring name must be rejected.
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        # ranks must lie within [0, n_train]
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        # normalized ranks must lie within [0, 1]
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass
Exemplo n.º 18
0
def outlier_detection(df):
    """Label odometer readings of one hard-coded vehicle with five detectors.

    Rows of ``df`` whose ``Chassis_Number`` equals ``'WBA1C11080J829552'``
    are selected, their ``Kms`` values are rounded to whole numbers, and each
    detector (LOF, LMDD, IsolationForest, mean-KNN, median-KNN) is fitted on
    and then asked to predict that same single-feature column. The
    predictions are added as ``outlier_score_*`` columns on a copy of the
    selected rows, which is printed. ``df`` itself is not modified and
    nothing is returned.
    """
    # .copy() gives an independent frame, so the column assignments below do
    # not target a slice of df (which triggers pandas' chained-assignment
    # warning).
    testing_df = df[(df['Chassis_Number'] == 'WBA1C11080J829552')].copy()
    # testing_df = df[(df['Chassis_Number'] == 'VF3LCYHZPJS332137')].copy()

    # Every detector sees the same input: rounded Kms as an (n, 1) column.
    kms = np.round(np.array(testing_df['Kms'].values).reshape(-1, 1), 0)

    # (output column, detector) pairs; list order is the column order.
    # NOTE(review): if IsolationForest is scikit-learn's estimator, its
    # predict() labels are -1/1 rather than the 0/1 used by the pyod
    # detectors - confirm before comparing the columns.
    detectors = [
        ('outlier_score_lof',
         LOF(n_neighbors=10, contamination=0.1)),
        ('outlier_score_lmdd',
         LMDD(n_iter=100, contamination=0.1)),
        ('outlier_score_isolation_forest',
         IsolationForest(n_estimators=100, contamination=0.1)),
        ('outlier_score_knn_mean',
         KNN(method='mean', n_neighbors=3, contamination=0.1)),
        ('outlier_score_knn_median',
         KNN(method='median', n_neighbors=3, contamination=0.1)),
    ]
    for column, clf in detectors:
        clf.fit(kms)
        testing_df[column] = clf.predict(kms)

    print(testing_df[['Movement_Date', 'Kms', 'Kms_diff', 'outlier_score_lof', 'outlier_score_lmdd', 'outlier_score_isolation_forest', 'outlier_score_knn_mean', 'outlier_score_knn_median']])

    return
Exemplo n.º 19
0
class TestLOF(unittest.TestCase):
    """Checks of the LOF detector on generated data (10% contamination)."""

    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        # Minimum ROC AUC accepted on the test split.
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination)

        self.clf = LOF(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        # Each fitted attribute must exist and be populated. The earlier
        # version passed the message string to assertRaises as the callable,
        # so it never asserted anything, and it probed `n_neighbors` while
        # reading `n_neighbors_`.
        for attr in ('decision_scores_',
                     'labels_',
                     'threshold_',
                     'negative_outlier_factor_',
                     'n_neighbors_'):
            message = '{} is not set'.format(attr)
            self.assertTrue(hasattr(self.clf, attr), message)
            self.assertIsNotNone(getattr(self.clf, attr), message)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        # An unknown conversion method must be rejected.
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_evaluate(self):
        self.clf.fit_predict_evaluate(self.X_test, self.y_test)

    def tearDown(self):
        pass
Exemplo n.º 20
0
    toeplitz_roc = []
    toeplitz_prn = []
    toeplitz_time = []

    pca_roc = []
    pca_prn = []
    pca_time = []

    rp_roc = []
    rp_prn = []
    rp_time = []

    for j in range(n_iter):
        start = time.time()
        clf = LOF()  # change this to other detection algorithms
        clf.fit(X)
        y_train_scores = clf.decision_scores_
        original_time.append(time.time() - start)
        original_roc.append(roc_auc_score(y, y_train_scores))
        original_prn.append(precision_n_scores(y, y_train_scores))

        X_transformed, _ = jl_fit_transform(X, dim_new, "basic")
        start = time.time()
        clf.fit(X_transformed)
        y_train_scores = clf.decision_scores_
        basic_time.append(time.time() - start)
        basic_roc.append(roc_auc_score(y, y_train_scores))
        basic_prn.append(precision_n_scores(y, y_train_scores))

        X_transformed, _ = jl_fit_transform(X, dim_new, "discrete")
        start = time.time()
from pyod.models.mcd import MCD
from pyod.models.lscp import LSCP
# from pyod.models.auto_encoder import AutoEncoder

# One default-configured detector per algorithm under comparison.
clf_knn = KNN()
clf_pca = PCA()
clf_mcd = MCD()
clf_lof = LOF()
clf_cblof = CBLOF()
# clf_lscp = LSCP([clf_knn, clf_pca, clf_mcd ])
# clf_ae = AutoEncoder(epochs=50)

# Fit every detector on the same training encodings.
clf_mcd.fit(encodings_train)
clf_pca.fit(encodings_train)
clf_knn.fit(encodings_train)
clf_lof.fit(encodings_train)
clf_cblof.fit(encodings_train)
# clf_lscp.fit(encodings_train)
# clf_ae.fit(encodings_train)

# Score the training encodings again through decision_function.
# NOTE(review): for neighbour-based detectors this presumably treats each
# training point as unseen data and may differ from the `decision_scores_`
# computed during fit - confirm which one is intended.
anomaly_scores_mcd = clf_mcd.decision_function(encodings_train)
anomaly_scores_pca = clf_pca.decision_function(encodings_train)
anomaly_scores_knn = clf_knn.decision_function(encodings_train)
anomaly_scores_lof = clf_lof.decision_function(encodings_train)
anomaly_scores_cblof = clf_cblof.decision_function(encodings_train)
# anomaly_scores_lscp = clf_lscp.decision_function(encodings_train)
# anomaly_scores_ae = clf_ae.predict_proba(encodings_train)

# y_test_scores = []
# for x,_ in test_loader:
#     encodings_test = encoder(torch.Tensor(x).to(device))
Exemplo n.º 22
0
    toeplitz_roc = []
    toeplitz_prn = []
    toeplitz_time = []

    pca_roc = []
    pca_prn = []
    pca_time = []

    rp_roc = []
    rp_prn = []
    rp_time = []

    for j in range(n_iter):
        start = time.time()
        clf = LOF()  # change this to other detection algorithms
        clf.fit(X)
        y_train_scores = clf.decision_scores_
        original_time.append(time.time() - start)
        original_roc.append(roc_auc_score(y, y_train_scores))
        original_prn.append(precision_n_scores(y, y_train_scores))

        X_transformer, _ = jl_fit_transform(X, dim_new, "basic")
        start = time.time()
        clf.fit(X_transformer)
        y_train_scores = clf.decision_scores_
        basic_time.append(time.time() - start)
        basic_roc.append(roc_auc_score(y, y_train_scores))
        basic_prn.append(precision_n_scores(y, y_train_scores))

        X_transformer, _ = jl_fit_transform(X, dim_new, "discrete")
        start = time.time()
Exemplo n.º 23
0
    def train(self):
        """Fit LOF on ``self.X_train`` and keep it as ``self.best_model``.

        The detector's contamination is taken from
        ``self.data_anomaly_ratio``. ``best_model`` is only assigned once
        fitting has finished.
        """
        detector = LOF(contamination=self.data_anomaly_ratio)
        detector.fit(self.X_train)
        self.best_model = detector
Exemplo n.º 24
0
class TestLOF(unittest.TestCase):
    """Tests for the LOF detector.

    Covers the attributes set by ``fit``, score/label/probability
    predictions (with and without confidence), ``fit_predict``,
    ``fit_predict_score``, rank prediction and cloning.
    """

    def setUp(self):
        """Generate a seeded train/test split and fit a fresh LOF on it."""
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8

        splits = generate_data(n_train=self.n_train,
                               n_test=self.n_test,
                               contamination=self.contamination,
                               random_state=42)
        self.X_train, self.y_train, self.X_test, self.y_test = splits

        self.clf = LOF(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def _assert_in_unit_interval(self, values):
        """Assert that every entry of ``values`` lies within [0, 1]."""
        assert (values.min() >= 0)
        assert (values.max() <= 1)

    def test_parameters(self):
        """Every attribute expected after ``fit`` exists and is not None."""
        fitted_attributes = ('decision_scores_', 'labels_', 'threshold_',
                             '_mu', '_sigma', 'n_neighbors_')
        for attr_name in fitted_attributes:
            assert hasattr(self.clf, attr_name)
            assert getattr(self.clf, attr_name) is not None

    def test_train_scores(self):
        """There is one training score per training sample."""
        n_scores = len(self.clf.decision_scores_)
        assert_equal(n_scores, self.X_train.shape[0])

    def test_prediction_scores(self):
        """Test scores have the right length and reach the ROC floor."""
        test_scores = self.clf.decision_function(self.X_test)

        # one score per test sample
        assert_equal(test_scores.shape[0], self.X_test.shape[0])

        # detection quality must not drop below the configured floor
        test_auc = roc_auc_score(self.y_test, test_scores)
        assert (test_auc >= self.roc_floor)

    def test_prediction_labels(self):
        """Predicted labels have the same shape as the ground truth."""
        labels = self.clf.predict(self.X_test)
        assert_equal(labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        """Default ``predict_proba`` output stays within [0, 1]."""
        self._assert_in_unit_interval(self.clf.predict_proba(self.X_test))

    def test_prediction_proba_linear(self):
        """``predict_proba(method='linear')`` stays within [0, 1]."""
        proba = self.clf.predict_proba(self.X_test, method='linear')
        self._assert_in_unit_interval(proba)

    def test_prediction_proba_unify(self):
        """``predict_proba(method='unify')`` stays within [0, 1]."""
        proba = self.clf.predict_proba(self.X_test, method='unify')
        self._assert_in_unit_interval(proba)

    def test_prediction_proba_parameter(self):
        """An unknown ``method`` is rejected with ValueError."""
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_prediction_labels_confidence(self):
        """``predict(return_confidence=True)`` yields labels plus confidences."""
        labels, conf = self.clf.predict(self.X_test, return_confidence=True)
        assert_equal(labels.shape, self.y_test.shape)
        assert_equal(conf.shape, self.y_test.shape)
        self._assert_in_unit_interval(conf)

    def test_prediction_proba_linear_confidence(self):
        """Linear ``predict_proba`` with confidences: both within [0, 1]."""
        proba, conf = self.clf.predict_proba(self.X_test,
                                             method='linear',
                                             return_confidence=True)
        self._assert_in_unit_interval(proba)

        assert_equal(conf.shape, self.y_test.shape)
        self._assert_in_unit_interval(conf)

    def test_fit_predict(self):
        """``fit_predict`` returns one label per training sample."""
        labels = self.clf.fit_predict(self.X_train)
        assert_equal(labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        """``fit_predict_score`` accepts known metrics and rejects others."""
        self.clf.fit_predict_score(self.X_test, self.y_test)
        for metric in ('roc_auc_score', 'prc_n_score'):
            self.clf.fit_predict_score(self.X_test,
                                       self.y_test,
                                       scoring=metric)
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test,
                                       self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        """Raw ranks keep the score ordering and stay within bounds."""
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test)

        # the ordering of ranks must match the ordering of scores
        assert_allclose(rankdata(ranks), rankdata(scores), atol=2)
        assert_array_less(ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, ranks)

    def test_predict_rank_normalized(self):
        """Normalized ranks keep the score ordering and stay below 1.01."""
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # the ordering of ranks must match the ordering of scores
        assert_allclose(rankdata(ranks), rankdata(scores), atol=2)
        assert_array_less(ranks, 1.01)
        assert_array_less(-0.1, ranks)

    def test_model_clone(self):
        """Cloning the fitted detector must not raise."""
        clone(self.clf)

    def tearDown(self):
        """Nothing to clean up."""
        pass
Exemplo n.º 25
0
#        detector_list, contamination=outliers_fraction,
#        random_state=random_state)
#}
#%%
file_no = 1
# Work on the first dataset; data_1 and anomalies_1 are defined earlier in
# the script.
data = preprocess(data_1)
anomalies = anomalies_1
# Calendar date of the first and last row of data's index (the index must
# expose `.date`, i.e. be datetime-like).
start_date = data.head(1).index.date[0]
end_date = data.tail(1).index.date[0]
# createtraintest returns six values; only the two normalized splits are
# used in the lines visible here.
datatotrain, datatotest, datatotrain_normalized, datatotest_normalized, train_data, test_data = createtraintest(
    data)
X_train, X_test = datatotrain_normalized, datatotest_normalized

#%%
# One LOF fitted on all columns of X_train, plus three LOFs fitted on the
# column slices 0:48, 48:96 and 96:144. All four share the same
# hyper-parameters.
clf = LOF(n_neighbors=10, contamination=0.1)
clf.fit(X_train)
clf1 = LOF(n_neighbors=10, contamination=0.1)
clf1.fit(X_train[:, 0:48])
clf2 = LOF(n_neighbors=10, contamination=0.1)
clf2.fit(X_train[:, 48:96])
clf3 = LOF(n_neighbors=10, contamination=0.1)
clf3.fit(X_train[:, 96:144])

#%%
# Reshape the 'value' column into rows of 144 values each, then normalize
# with the L2 norm.
# NOTE(review): presumably `preprocessing` is sklearn.preprocessing, whose
# normalize() works row-wise by default -- confirm against the imports.
datax = data['value'].values.reshape(-1, 144)
data_n = preprocessing.normalize(datax, norm='l2')
#y_pred = clf.predict(data_n)
# Counter initialised before the per-day loop that follows.
i = 0
for dt in rrule(DAILY, dtstart=start_date, until=end_date):
    if (data.loc[dt.strftime("%Y-%m-%d")]['value']).values.size != 0:
        data.loc[dt.strftime("%Y-%m-%d"), 'ocsvm_score'] = clf.predict(
Exemplo n.º 26
0
    # Settings for the synthetic data set.
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data
    # (2 features; random_state=42 is a fixed seed for the generator)
    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train LOF detector
    # NOTE(review): LOF() is built with its defaults; the `contamination`
    # value above is only passed to generate_data, not to the detector --
    # confirm the detector default matches the intended outlier fraction.
    clf_name = 'LOF'
    clf = LOF()
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    # (evaluation uses the raw scores; the predicted label arrays
    # y_train_pred / y_test_pred are not used in the lines visible here)
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
Exemplo n.º 27
0
class TestLOF(unittest.TestCase):
    """Tests for the LOF detector.

    Covers scikit-learn estimator compliance, the attributes set by
    ``fit``, score/label/probability predictions, ``fit_predict``,
    ``fit_predict_score`` and rank prediction.
    """

    def setUp(self):
        """Generate a seeded train/test split and fit a fresh LOF on it."""
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=42)

        self.clf = LOF(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        """The detector passes scikit-learn's estimator checks."""
        check_estimator(self.clf)

    def test_parameters(self):
        """Every attribute expected after ``fit`` exists and is not None.

        The previous version called ``self.assertRaises(AttributeError,
        '<message>')``, which passes a string where a callable is expected;
        a missing attribute therefore surfaced as ``TypeError: 'str' object
        is not callable`` instead of a readable failure. It also tested
        ``hasattr(..., 'n_neighbors')`` while reading ``n_neighbors_``.
        """
        fitted_attributes = ('decision_scores_', 'labels_', 'threshold_',
                             'negative_outlier_factor_', 'n_neighbors_')
        for attr_name in fitted_attributes:
            # getattr(..., None) folds "missing" and "set to None" into one
            # check that fails with the attribute name in the message.
            self.assertIsNotNone(getattr(self.clf, attr_name, None),
                                 '%s is not set' % attr_name)

    def test_train_scores(self):
        """There is one training score per training sample."""
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        """Test scores have the right length and beat the ROC floor."""
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)

    def test_prediction_labels(self):
        """Predicted labels have the same shape as the ground truth."""
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        """Default ``predict_proba`` output stays within [0, 1]."""
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        """``predict_proba(method='linear')`` stays within [0, 1]."""
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        """``predict_proba(method='unify')`` stays within [0, 1]."""
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        """An unknown ``method`` is rejected with ValueError."""
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        """``fit_predict`` returns one label per training sample."""
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        """``fit_predict_score`` accepts known metrics and rejects others."""
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test,
                                       self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        """Raw ranks keep the score ordering and stay within bounds."""
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        """Normalized ranks keep the score ordering and stay below 1.01."""
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        """Nothing to clean up."""
        pass