Exemplo n.º 1
0
 def outliers_detect(self, columns, outliers_fraction=0.05):
     """Label the rows of ``self.data`` as inliers/outliers using CBLOF.

     The selected ``columns`` are one-hot encoded with ``pd.get_dummies``,
     a CBLOF detector is fitted on the result, and the predicted labels
     are written to a new ``'outlier'`` column of ``self.data``.

     Parameters
     ----------
     columns : column selection used to index ``self.data``
     outliers_fraction : value forwarded to CBLOF as ``contamination``

     Returns
     -------
     None. The result is stored in ``self.data['outlier']``.
     """
     features = pd.get_dummies(self.data[columns])

     detector = CBLOF(
         contamination=outliers_fraction,
         check_estimator=False,
         random_state=0,
     )
     detector.fit(features)

     # Negated raw anomaly scores; computed but not used further here.
     negated_scores = -1 * detector.decision_function(features)

     flags = detector.predict(features)
     self.data['outlier'] = flags.tolist()

     # Tallies of each label; not used further in the visible code.
     n_outliers = np.count_nonzero(flags == 1)
     n_inliers = len(flags) - np.count_nonzero(flags)
Exemplo n.º 2
0
    def cblof(self, X_train, contamination=None, random_state=None):
        """
        Train CBLOF model from PYOD

        Parameters
        __________
        X_train: scaled training data
        contamination: proportion of anomalies expected in the data; when
            None, the argument is not passed and CBLOF uses its own default
        random_state: random number seed

        Returns
        ________
        Tuple ``(anomaly_scores, labels)``: the anomaly scores of X_train
        after ``self.min_max_scaler`` has been applied, and the outlier
        labels predicted for X_train
        """
        # Forward `contamination` only when the caller supplied one: the
        # detector cannot work with None, so passing the default through
        # unchanged would make a plain `cblof(X_train)` call fail.
        detector_kwargs = {'random_state': random_state}
        if contamination is not None:
            detector_kwargs['contamination'] = contamination

        model = CBLOF(**detector_kwargs)
        model.fit(X_train)

        # Outlier labels for the training data (0 or 1)
        labels = model.predict(X_train)

        # Raw anomaly scores, rescaled with the instance's own scaler
        cblof_anomaly_scores = model.decision_function(X_train)
        cblof_anomaly_scores = self.min_max_scaler(cblof_anomaly_scores)
        return cblof_anomaly_scores, labels
Exemplo n.º 3
0
    def get_CBOLF_scores(dataframe, cols, outliers_fraction=0.01):
        '''Label rows of `dataframe` as outliers/inliers with CBLOF.

        Params:
            dataframe: dataframe holding the columns to analyse; it is
                modified in place (see below)
            cols (list): names of the selected columns
            outliers_fraction: value passed to CBLOF as `contamination`

        Side effects:
            * `dataframe[cols]` is overwritten with its min-max scaled values
            * an 'outlier' column holding the predicted labels is added
            * the dataframe is stored on `CheckOutliers.df1`
            * a summary line is printed

        Returns:
            None
        '''
        # Rescale the selected columns to the [0, 1] range
        scaler = MinMaxScaler(feature_range=(0, 1))
        dataframe[cols] = scaler.fit_transform(dataframe[cols])

        # One column vector per selected column, joined side by side
        column_vectors = [
            dataframe[name].values.reshape(-1, 1) for name in cols
        ]
        X = np.concatenate(column_vectors, axis=1)

        detector = CBLOF(
            contamination=outliers_fraction,
            check_estimator=False,
            random_state=0,
        )
        detector.fit(X)

        # Negated raw anomaly scores; computed but not used further here
        negated_scores = detector.decision_function(X) * -1

        # Label per row; rows equal to 1 are counted as outliers
        flags = detector.predict(X)
        outlier_total = np.count_nonzero(flags == 1)
        inlier_total = len(flags) - np.count_nonzero(flags)

        CheckOutliers.df1 = dataframe
        CheckOutliers.df1['outlier'] = flags.tolist()

        print('OUTLIERS:', outlier_total, 'INLIERS:', inlier_total,
              'found with CBLOF')
Exemplo n.º 4
0
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train CBLOF detector
    # NOTE(review): only `random_state` is passed here; the `contamination`
    # value given to the preceding (truncated) call is not forwarded, so the
    # detector uses its own default contamination - confirm this is intended.
    clf_name = 'CBLOF'
    clf = CBLOF(random_state=42)
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    # (both are read from attributes of the fitted detector, no new call)
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    # (evaluation receives the raw scores, not the binary predictions)
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

    # visualize the results
    visualize(clf_name,
              X_train,
              y_train,
              X_test,
              y_test,
              y_train_pred,
Exemplo n.º 5
0
class TestCBLOF(unittest.TestCase):
    """Tests for a CBLOF detector fitted on generated data.

    ``setUp`` builds a fresh train/test split and a freshly fitted detector
    before every test, so tests that refit the detector do not affect the
    others.
    """

    # Attributes that must exist and be non-None once the detector is fitted.
    _FITTED_ATTRIBUTES = (
        'decision_scores_',
        'labels_',
        'threshold_',
        '_mu',
        '_sigma',
        'clustering_estimator_',
        'cluster_labels_',
        'cluster_sizes_',
        'cluster_centers_',
        '_clustering_threshold',
        'small_cluster_labels_',
        'large_cluster_labels_',
        '_large_cluster_centers',
    )

    def setUp(self):
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8

        splits = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=42,
        )
        self.X_train, self.y_train, self.X_test, self.y_test = splits

        self.clf = CBLOF(contamination=self.contamination, random_state=42)
        self.clf.fit(self.X_train)

    def _assert_within_unit_interval(self, values):
        """Assert that every entry of ``values`` lies in [0, 1]."""
        assert (values.min() >= 0)
        assert (values.max() <= 1)

    def test_parameters(self):
        for attr_name in self._FITTED_ATTRIBUTES:
            assert (hasattr(self.clf, attr_name) and
                    getattr(self.clf, attr_name) is not None)

    def test_train_scores(self):
        n_scores = len(self.clf.decision_scores_)
        assert_equal(n_scores, self.X_train.shape[0])

    def test_prediction_scores(self):
        test_scores = self.clf.decision_function(self.X_test)

        # one score per test sample
        assert_equal(test_scores.shape[0], self.X_test.shape[0])

        # ROC AUC must reach the configured floor
        auc = roc_auc_score(self.y_test, test_scores)
        assert (auc >= self.roc_floor)

    def test_prediction_labels(self):
        test_labels = self.clf.predict(self.X_test)
        assert_equal(test_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        self._assert_within_unit_interval(
            self.clf.predict_proba(self.X_test))

    def test_prediction_proba_linear(self):
        self._assert_within_unit_interval(
            self.clf.predict_proba(self.X_test, method='linear'))

    def test_prediction_proba_unify(self):
        self._assert_within_unit_interval(
            self.clf.predict_proba(self.X_test, method='unify'))

    def test_prediction_proba_parameter(self):
        # an unknown method name must be rejected
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_prediction_labels_confidence(self):
        test_labels, confidence = self.clf.predict(
            self.X_test, return_confidence=True)

        assert_equal(test_labels.shape, self.y_test.shape)
        assert_equal(confidence.shape, self.y_test.shape)
        self._assert_within_unit_interval(confidence)

    def test_prediction_proba_linear_confidence(self):
        proba, confidence = self.clf.predict_proba(
            self.X_test, method='linear', return_confidence=True)

        self._assert_within_unit_interval(proba)

        assert_equal(confidence.shape, self.y_test.shape)
        self._assert_within_unit_interval(confidence)

    def test_fit_predict(self):
        train_labels = self.clf.fit_predict(self.X_train)
        assert_equal(train_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        # default scoring first, then each named metric
        self.clf.fit_predict_score(self.X_test, self.y_test)
        for metric in ('roc_auc_score', 'prc_n_score'):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring=metric)

        # an unknown metric name must be rejected
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        test_scores = self.clf.decision_function(self.X_test)
        test_ranks = self.clf._predict_rank(self.X_test)

        # ranks must keep the ordering of the scores (within tolerance)
        assert_allclose(rankdata(test_ranks), rankdata(test_scores), atol=3)
        assert_array_less(test_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, test_ranks)

    def test_predict_rank_normalized(self):
        test_scores = self.clf.decision_function(self.X_test)
        test_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # ranks must keep the ordering of the scores (within tolerance)
        assert_allclose(rankdata(test_ranks), rankdata(test_scores), atol=3)
        assert_array_less(test_ranks, 1.01)
        assert_array_less(-0.1, test_ranks)

    def test_model_clone(self):
        # passes as long as cloning the fitted detector does not raise
        clone(self.clf)

    def tearDown(self):
        pass
Exemplo n.º 6
0
class TestLOF(unittest.TestCase):
    """Tests for a fitted CBLOF detector.

    Despite the class name, the detector built in ``setUp`` is ``CBLOF``.
    ``setUp`` runs before every test, so each test gets its own data split
    and freshly fitted detector.
    """

    # Attributes that must exist and be non-None once the detector is fitted.
    _FITTED_ATTRIBUTES = (
        'decision_scores_',
        'labels_',
        'threshold_',
        '_mu',
        '_sigma',
        'clustering_estimator_',
        'cluster_labels_',
        'cluster_sizes_',
        'cluster_centers_',
        '_clustering_threshold',
        'small_cluster_labels_',
        'large_cluster_labels_',
        '_large_cluster_centers',
    )

    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6

        splits = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=42,
        )
        self.X_train, self.y_train, self.X_test, self.y_test = splits

        self.clf = CBLOF(contamination=self.contamination, random_state=42)
        self.clf.fit(self.X_train)

    def _assert_within_unit_interval(self, values):
        """Assert that every entry of ``values`` lies in [0, 1]."""
        assert_greater_equal(values.min(), 0)
        assert_less_equal(values.max(), 1)

    def test_sklearn_estimator(self):
        # TODO: sklearn examples are too small to form valid
        # check_estimator(self.clf)
        pass

    def test_parameters(self):
        for attr_name in self._FITTED_ATTRIBUTES:
            assert_true(hasattr(self.clf, attr_name) and
                        getattr(self.clf, attr_name) is not None)

    def test_train_scores(self):
        n_scores = len(self.clf.decision_scores_)
        assert_equal(n_scores, self.X_train.shape[0])

    def test_prediction_scores(self):
        test_scores = self.clf.decision_function(self.X_test)

        # one score per test sample
        assert_equal(test_scores.shape[0], self.X_test.shape[0])

        # ROC AUC must be strictly above the configured floor
        auc = roc_auc_score(self.y_test, test_scores)
        assert_greater(auc, self.roc_floor)

    def test_prediction_labels(self):
        test_labels = self.clf.predict(self.X_test)
        assert_equal(test_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        self._assert_within_unit_interval(
            self.clf.predict_proba(self.X_test))

    def test_prediction_proba_linear(self):
        self._assert_within_unit_interval(
            self.clf.predict_proba(self.X_test, method='linear'))

    def test_prediction_proba_unify(self):
        self._assert_within_unit_interval(
            self.clf.predict_proba(self.X_test, method='unify'))

    def test_prediction_proba_parameter(self):
        # an unknown method name must be rejected
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        train_labels = self.clf.fit_predict(self.X_train)
        assert_equal(train_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        # default scoring first, then each named metric
        self.clf.fit_predict_score(self.X_test, self.y_test)
        for metric in ('roc_auc_score', 'prc_n_score'):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring=metric)

        # an unknown metric name must be rejected
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        test_scores = self.clf.decision_function(self.X_test)
        test_ranks = self.clf._predict_rank(self.X_test)

        # ranks must keep the ordering of the scores (within tolerance)
        assert_allclose(rankdata(test_ranks), rankdata(test_scores), atol=2)
        assert_array_less(test_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, test_ranks)

    def test_predict_rank_normalized(self):
        test_scores = self.clf.decision_function(self.X_test)
        test_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # ranks must keep the ordering of the scores (within tolerance)
        assert_allclose(rankdata(test_ranks), rankdata(test_scores), atol=2)
        assert_array_less(test_ranks, 1.01)
        assert_array_less(-0.1, test_ranks)

    def tearDown(self):
        pass
# Fit a CBLOF detector on the 'Sales' and 'Profit' columns of `df`, label
# every row, and split both columns into inlier and outlier arrays.
# `df` is defined outside the visible snippet.

# Each feature as a column vector of shape (n_rows, 1)
X1 = df['Sales'].values.reshape(-1, 1)
X2 = df['Profit'].values.reshape(-1, 1)

# Side-by-side feature matrix of shape (n_rows, 2)
X = np.concatenate((X1, X2), axis=1)

outliers_fraction = 0.01
# 100 x 100 grid over [0, 1] x [0, 1]; not used in the visible lines.
# NOTE(review): presumably for a decision-surface plot further down, which
# would assume both columns were already scaled to [0, 1] - confirm.
xx, yy = np.meshgrid(np.linspace(0, 1, 100), np.linspace(0, 1, 100))
clf = CBLOF(contamination=outliers_fraction,
            check_estimator=False,
            random_state=0)
clf.fit(X)
# predict raw anomaly score
# (sign is flipped; the result is not used in the visible lines)
scores_pred = clf.decision_function(X) * -1

# prediction of a datapoint category outlier or inlier
y_pred = clf.predict(X)
# zero labels are counted as inliers, labels equal to 1 as outliers
n_inliers = len(y_pred) - np.count_nonzero(y_pred)
n_outliers = np.count_nonzero(y_pred == 1)

plt.figure(figsize=(8, 8))

# `df1` is an alias of `df`, not a copy: the 'outlier' column added on the
# next line therefore also appears on `df`.
df1 = df
df1['outlier'] = y_pred.tolist()

# sales - inlier feature 1,  profit - inlier feature 2
inliers_sales = np.array(df1['Sales'][df1['outlier'] == 0]).reshape(-1, 1)
inliers_profit = np.array(df1['Profit'][df1['outlier'] == 0]).reshape(-1, 1)

# sales - outlier feature 1, profit - outlier feature 2
outliers_sales = df1['Sales'][df1['outlier'] == 1].values.reshape(-1, 1)
outliers_profit = df1['Profit'][df1['outlier'] == 1].values.reshape(-1, 1)
Exemplo n.º 8
0
    def get_outliers(dataframe,
                     cols,
                     outliers_fraction,
                     row_id,
                     n,
                     cbolf=True,
                     hbos=True,
                     iforest=True,
                     knn=True):
        '''
        Flag rows that at least `n` of the enabled outlier detectors agree on.

        Params:
            dataframe: dataframe to analyse; it is modified in place (see
                side effects below)
            cols (list): names of the numerical columns fed to the detectors
            outliers_fraction: value passed to every detector as
                `contamination`
            row_id ('str'): unique row identifier on the dataframe
            n (int): Minimum number of times an observation should be flagged
                as an outlier to be considered one
            cbolf (bool): run the CBLOF detector
            hbos (bool): run the HBOS detector
            iforest (bool): run the IForest detector
            knn (bool): run the KNN detector

        Side effects:
            * `dataframe[cols]` is overwritten with its min-max scaled values
            * `dataframe['outlier']` is added/overwritten; after the call it
              holds the labels of the last detector that ran
            * the rows flagged by each enabled detector are stored on
              `Outliers.df1` (CBLOF), `Outliers.df2` (HBOS), `Outliers.df3`
              (IForest) and `Outliers.df4` (KNN); attributes belonging to
              disabled detectors are left untouched
            * the result is also stored on `Outliers.outlier_indices`
            * a summary is printed

        Returns:
            List of index labels for rows in the dataframe that are flagged
            as outliers (empty when no detector is enabled)
        '''
        # Standardize the selected numerical variables to the [0, 1] range
        minmax = MinMaxScaler(feature_range=(0, 1))
        dataframe[cols] = minmax.fit_transform(dataframe[cols])

        # Convert the selected columns to one numpy array, one column each
        X = np.concatenate(
            [dataframe[col].values.reshape(-1, 1) for col in cols], axis=1)

        # (attribute on Outliers, label used in the printout, detector),
        # built only for the detectors the caller enabled.
        detectors = []
        if cbolf:
            # Cluster-Based Local Outlier Factor
            detectors.append(('df1', 'CBLOF',
                              CBLOF(contamination=outliers_fraction,
                                    check_estimator=False,
                                    random_state=0)))
        if hbos:
            # Histogram Based Outlier Score
            detectors.append(('df2', 'HBOS',
                              HBOS(contamination=outliers_fraction)))
        if iforest:
            # Isolation Forest
            detectors.append(('df3', 'IForest',
                              IForest(contamination=outliers_fraction,
                                      random_state=0)))
        if knn:
            # K-Nearest Neighbors
            detectors.append(('df4', 'KNN',
                              KNN(contamination=outliers_fraction)))

        # Only frames produced during THIS call are merged. Reading
        # Outliers.df1..df4 unconditionally would raise AttributeError for a
        # detector that never ran, or silently reuse results of an earlier
        # call.
        flagged_frames = []
        for attr_name, label, clf in detectors:
            clf.fit(X)

            # Label per row; rows equal to 1 are counted as outliers
            y_pred = clf.predict(X)
            n_inliers = len(y_pred) - np.count_nonzero(y_pred)
            n_outliers = np.count_nonzero(y_pred == 1)

            # Hold results on the dataframe, keep the flagged rows
            dataframe['outlier'] = y_pred.tolist()
            flagged = dataframe.loc[dataframe['outlier'] == 1]
            setattr(Outliers, attr_name, flagged)
            flagged_frames.append(flagged)

            print('OUTLIERS:', n_outliers, 'INLIERS:', n_inliers,
                  f'found with {label}')

        if flagged_frames:
            merged_df = pd.concat(flagged_frames)

            # Number of detectors that flagged each observation
            merged_df['count'] = merged_df.groupby(row_id)[row_id].transform(
                'count')

            # Keep observations flagged at least n times, one row each
            common = merged_df.loc[merged_df['count'] >= n]
            common = common.drop_duplicates(keep='last')

            # Index labels to be removed from the main dataframe
            Outliers.outlier_indices = list(common.index)
        else:
            # No detector enabled: nothing to merge (pd.concat would raise)
            Outliers.outlier_indices = []

        # NOTE: the wording says "all algorithms", but the filter above is
        # "flagged by at least n detectors"; the message is kept unchanged.
        print(
            f' \n{len(Outliers.outlier_indices)} outliers commonly found by all algorithms\n'
        )
        print(f'The row index labels are:\n {Outliers.outlier_indices}')
        return Outliers.outlier_indices
# Plot the 'Speed diff' series of one car, then fit CBLOF on the
# 'Speed diff' column of `df` and plot its anomaly score over the value range.
# `x`, `data118457` and `df` are defined outside the visible snippet.
y = data118457['Speed diff']

plt.figure(figsize=(10, 4))
# A label is set, but no legend is drawn in the visible code.
plt.plot(x, y, label='Car 118457')
plt.xlabel('Time')
plt.ylabel('Speed diff')
plt.show()

# In[15]:

# Detector with default settings, fitted on `df` (not on `data118457`),
# using 'Speed diff' as a single-feature column vector.
cblof = CBLOF()
cblof.fit(df['Speed diff'].values.reshape(-1, 1))
# len(df) evenly spaced values from the column minimum to its maximum,
# as a column vector; the detector is evaluated on this grid.
xx = np.linspace(df['Speed diff'].min(), df['Speed diff'].max(),
                 len(df)).reshape(-1, 1)
anomaly_score = cblof.decision_function(xx)
# Labels for the grid points; not used in the visible lines.
outlier = cblof.predict(xx)
plt.figure(figsize=(10, 4))
plt.plot(xx, anomaly_score, label='anomaly score')
plt.ylabel('anomaly score')
plt.xlabel('Speed diff')
plt.show()

# In[16]:

# Rescale the four columns to [0, 1], overwriting them in `df`.
# NOTE(review): 'CarId' is scaled together with the measurements; if it is
# an identifier rather than a feature this is probably unintended - confirm.
minmax = MinMaxScaler(feature_range=(0, 1))
df[['CarId', 'Speed diff', 'Heading diff',
    'Position diff']] = minmax.fit_transform(
        df[['CarId', 'Speed diff', 'Heading diff', 'Position diff']])
# Bare expression: the preview is only displayed in an interactive/notebook
# session; when run as a script the value is discarded.
df[['CarId', 'Speed diff', 'Heading diff', 'Position diff']].head()

# In[17]:
Exemplo n.º 10
0
    # Generate sample data
    # (`n_train`, `n_test` and `contamination` are defined outside the
    # visible snippet)
    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train CBLOF detector
    # NOTE(review): the detector is built with no arguments, so neither the
    # `contamination` used for data generation nor a random seed is passed
    # to it - confirm that the library defaults are intended.
    clf_name = 'CBLOF'
    clf = CBLOF()
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    # (both are read from attributes of the fitted detector, no new call)
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    # (evaluation receives the raw scores, not the binary predictions)
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

    # visualize the results
    # (the figure is shown on screen and not saved to disk)
    visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
              y_test_pred, show_figure=True, save_figure=False)