def outliers_detect(self, columns, outliers_fraction=0.05):
    """Label every row of ``self.data`` as inlier or outlier using CBLOF.

    The selected columns are passed through ``pd.get_dummies`` before
    fitting, and the detector's predictions are written to a new (or
    overwritten) ``'outlier'`` column of ``self.data``.

    Parameters
    ----------
    columns
        Column selection applied to ``self.data`` to build the feature
        matrix.
    outliers_fraction : float, default 0.05
        Value passed to CBLOF as ``contamination``.

    Returns
    -------
    None
        The result is stored in ``self.data['outlier']``.
    """
    X = pd.get_dummies(self.data[columns])

    clf = CBLOF(contamination=outliers_fraction,
                check_estimator=False,
                random_state=0)
    clf.fit(X)

    # Only the labels are kept; the raw scores and the inlier/outlier
    # counts were computed but never used, so they are no longer derived.
    self.data['outlier'] = clf.predict(X).tolist()
def cblof(self, X_train, contamination=None, random_state=None):
    """Fit a PyOD CBLOF model and score the training data.

    Parameters
    ----------
    X_train
        Training data (the original documentation describes it as
        already scaled).
    contamination : float, optional
        Expected fraction of anomalies in the data. When ``None`` the
        argument is not forwarded, so CBLOF falls back to its own
        default instead of receiving ``None``.
    random_state : int, optional
        Random number seed forwarded to CBLOF.

    Returns
    -------
    tuple
        ``(cblof_anomaly_scores, labels)`` where the scores are the
        detector's decision-function values passed through
        ``self.min_max_scaler`` and ``labels`` are the predictions from
        ``model.predict`` on ``X_train``.
    """
    detector_kwargs = {'random_state': random_state}
    if contamination is not None:
        # Forwarding None would leave the detector without a usable
        # threshold setting; omit it so the library default applies.
        detector_kwargs['contamination'] = contamination

    model = CBLOF(**detector_kwargs)
    model.fit(X_train)

    # Outlier labels for the training data.
    labels = model.predict(X_train)

    # Raw outlier scores, rescaled by the class's own scaler helper.
    cblof_anomaly_scores = self.min_max_scaler(
        model.decision_function(X_train))

    return cblof_anomaly_scores, labels
def get_CBOLF_scores(dataframe, cols, outliers_fraction=0.01):
    """Flag outliers in the selected columns with CBLOF.

    The selected columns are min-max scaled to ``[0, 1]`` *in place* on
    ``dataframe``, a CBLOF detector is fitted on them, and its
    predictions are stored in an ``'outlier'`` column. The annotated
    frame is also published as ``CheckOutliers.df1`` and a summary line
    is printed.

    Parameters
    ----------
    dataframe
        DataFrame holding the data; it is modified in place.
    cols : list
        Names of the columns to use as features.
    outliers_fraction : float, default 0.01
        Value passed to CBLOF as ``contamination``.

    Returns
    -------
    DataFrame
        The same ``dataframe`` object, with the ``'outlier'`` column
        added (also available as ``CheckOutliers.df1``).
    """
    # Standardize the selected variables.
    minmax = MinMaxScaler(feature_range=(0, 1))
    dataframe[cols] = minmax.fit_transform(dataframe[cols])

    # One feature column per selected dataframe column.
    X = np.column_stack([dataframe[col].values for col in cols])

    clf = CBLOF(contamination=outliers_fraction,
                check_estimator=False,
                random_state=0)
    clf.fit(X)

    # Predicted category of every data point.
    y_pred = clf.predict(X)
    n_outliers = np.count_nonzero(y_pred == 1)
    n_inliers = len(y_pred) - np.count_nonzero(y_pred)

    CheckOutliers.df1 = dataframe
    CheckOutliers.df1['outlier'] = y_pred.tolist()

    print('OUTLIERS:', n_outliers, 'INLIERS:', n_inliers,
          'found with CBLOF')
    return CheckOutliers.df1
n_test=n_test, n_features=2, contamination=contamination, random_state=42) # train CBLOF detector clf_name = 'CBLOF' clf = CBLOF(random_state=42) clf.fit(X_train) # get the prediction labels and outlier scores of the training data y_train_pred = clf.labels_ # binary labels (0: inliers, 1: outliers) y_train_scores = clf.decision_scores_ # raw outlier scores # get the prediction on the test data y_test_pred = clf.predict(X_test) # outlier labels (0 or 1) y_test_scores = clf.decision_function(X_test) # outlier scores # evaluate and print the results print("\nOn Training Data:") evaluate_print(clf_name, y_train, y_train_scores) print("\nOn Test Data:") evaluate_print(clf_name, y_test, y_test_scores) # visualize the results visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
class TestCBLOF(unittest.TestCase):
    """Tests for a CBLOF detector fitted on generated sample data."""

    def setUp(self):
        # Fixture sizes and the minimum ROC AUC accepted on the test split.
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8

        data_splits = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)
        self.X_train, self.y_train, self.X_test, self.y_test = data_splits

        self.clf = CBLOF(contamination=self.contamination, random_state=42)
        self.clf.fit(self.X_train)

    def _assert_within_unit_interval(self, values):
        # Shared range check used by the probability/confidence tests.
        assert (values.min() >= 0)
        assert (values.max() <= 1)

    def test_parameters(self):
        # Every attribute below must exist and be populated after fit().
        fitted_attributes = (
            'decision_scores_',
            'labels_',
            'threshold_',
            '_mu',
            '_sigma',
            'clustering_estimator_',
            'cluster_labels_',
            'cluster_sizes_',
            'cluster_centers_',
            '_clustering_threshold',
            'small_cluster_labels_',
            'large_cluster_labels_',
            '_large_cluster_centers',
        )
        for attribute in fitted_attributes:
            assert (hasattr(self.clf, attribute) and
                    getattr(self.clf, attribute) is not None)

    def test_train_scores(self):
        n_train_samples = self.X_train.shape[0]
        assert_equal(len(self.clf.decision_scores_), n_train_samples)

    def test_prediction_scores(self):
        test_scores = self.clf.decision_function(self.X_test)

        # One score per test sample.
        assert_equal(test_scores.shape[0], self.X_test.shape[0])

        # Detection quality must reach the configured floor.
        assert (roc_auc_score(self.y_test, test_scores) >= self.roc_floor)

    def test_prediction_labels(self):
        test_labels = self.clf.predict(self.X_test)
        assert_equal(test_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        self._assert_within_unit_interval(
            self.clf.predict_proba(self.X_test))

    def test_prediction_proba_linear(self):
        self._assert_within_unit_interval(
            self.clf.predict_proba(self.X_test, method='linear'))

    def test_prediction_proba_unify(self):
        self._assert_within_unit_interval(
            self.clf.predict_proba(self.X_test, method='unify'))

    def test_prediction_proba_parameter(self):
        # An unknown conversion method must be rejected.
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_prediction_labels_confidence(self):
        test_labels, confidence = self.clf.predict(
            self.X_test, return_confidence=True)
        assert_equal(test_labels.shape, self.y_test.shape)
        assert_equal(confidence.shape, self.y_test.shape)
        self._assert_within_unit_interval(confidence)

    def test_prediction_proba_linear_confidence(self):
        test_proba, confidence = self.clf.predict_proba(
            self.X_test, method='linear', return_confidence=True)
        self._assert_within_unit_interval(test_proba)

        assert_equal(confidence.shape, self.y_test.shape)
        self._assert_within_unit_interval(confidence)

    def test_fit_predict(self):
        train_labels = self.clf.fit_predict(self.X_train)
        assert_equal(train_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        # Default scoring first, then each explicitly named metric.
        self.clf.fit_predict_score(self.X_test, self.y_test)
        for metric in ('roc_auc_score', 'prc_n_score'):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring=metric)

        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        test_scores = self.clf.decision_function(self.X_test)
        test_ranks = self.clf._predict_rank(self.X_test)

        # Ranks must follow the ordering of the scores (small tolerance).
        assert_allclose(rankdata(test_ranks), rankdata(test_scores), atol=3)
        assert_array_less(test_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, test_ranks)

    def test_predict_rank_normalized(self):
        test_scores = self.clf.decision_function(self.X_test)
        test_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # Ranks must follow the ordering of the scores (small tolerance).
        assert_allclose(rankdata(test_ranks), rankdata(test_scores), atol=3)
        assert_array_less(test_ranks, 1.01)
        assert_array_less(-0.1, test_ranks)

    def test_model_clone(self):
        # Only checks that cloning the fitted estimator does not raise.
        clone(self.clf)

    def tearDown(self):
        pass
class TestLOF(unittest.TestCase):
    """Tests for a CBLOF detector fitted on generated sample data.

    NOTE(review): despite its name, this class builds and exercises a
    ``CBLOF`` instance; the name is kept so existing test selectors keep
    working.

    Plain ``assert`` statements are used for truth and comparison checks
    so the suite does not depend on the nose-style ``assert_true`` /
    ``assert_greater`` / ``assert_greater_equal`` / ``assert_less_equal``
    helpers.
    """

    def setUp(self):
        # Fixture sizes and the ROC AUC the detector must exceed.
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = CBLOF(contamination=self.contamination, random_state=42)
        self.clf.fit(self.X_train)

    def _assert_within_unit_interval(self, values):
        # Shared range check used by the probability tests.
        assert values.min() >= 0
        assert values.max() <= 1

    def test_sklearn_estimator(self):
        # TODO: sklearn examples are too small to form valid
        # check_estimator(self.clf)
        pass

    def test_parameters(self):
        # Every attribute below must exist and be populated after fit().
        fitted_attributes = (
            'decision_scores_',
            'labels_',
            'threshold_',
            '_mu',
            '_sigma',
            'clustering_estimator_',
            'cluster_labels_',
            'cluster_sizes_',
            'cluster_centers_',
            '_clustering_threshold',
            'small_cluster_labels_',
            'large_cluster_labels_',
            '_large_cluster_centers',
        )
        for attribute in fitted_attributes:
            assert (hasattr(self.clf, attribute) and
                    getattr(self.clf, attribute) is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # One score per test sample.
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # Detection quality must be strictly above the configured floor.
        assert roc_auc_score(self.y_test, pred_scores) > self.roc_floor

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        self._assert_within_unit_interval(
            self.clf.predict_proba(self.X_test))

    def test_prediction_proba_linear(self):
        self._assert_within_unit_interval(
            self.clf.predict_proba(self.X_test, method='linear'))

    def test_prediction_proba_unify(self):
        self._assert_within_unit_interval(
            self.clf.predict_proba(self.X_test, method='unify'))

    def test_prediction_proba_parameter(self):
        # An unknown conversion method must be rejected.
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        # Default scoring first, then each explicitly named metric.
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # Ranks must follow the ordering of the scores (small tolerance).
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # Ranks must follow the ordering of the scores (small tolerance).
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass
# Build a two-column feature matrix: Sales in column 0, Profit in column 1.
X1 = df['Sales'].values.reshape(-1, 1)
X2 = df['Profit'].values.reshape(-1, 1)
X = np.hstack((X1, X2))

outliers_fraction = 0.01

# 100 x 100 evaluation grid over the unit square.
xx, yy = np.meshgrid(np.linspace(0, 1, 100),
                     np.linspace(0, 1, 100))

clf = CBLOF(contamination=outliers_fraction,
            check_estimator=False,
            random_state=0)
clf.fit(X)

# Raw anomaly scores with the sign flipped.
scores_pred = -1 * clf.decision_function(X)

# Per-point category (0 or 1); rows labelled 1 are treated as outliers below.
y_pred = clf.predict(X)
n_inliers = len(y_pred) - np.count_nonzero(y_pred)
n_outliers = np.count_nonzero(y_pred == 1)

plt.figure(figsize=(8, 8))

# df1 is another name for df, so the label column is added to df as well.
df1 = df
df1['outlier'] = y_pred.tolist()

# Inlier coordinates: feature 1 is Sales, feature 2 is Profit.
inliers_sales = np.array(
    df1.loc[df1['outlier'] == 0, 'Sales']).reshape(-1, 1)
inliers_profit = np.array(
    df1.loc[df1['outlier'] == 0, 'Profit']).reshape(-1, 1)

# Outlier coordinates: feature 1 is Sales, feature 2 is Profit.
outliers_sales = df1.loc[df1['outlier'] == 1, 'Sales'].values.reshape(-1, 1)
outliers_profit = df1.loc[df1['outlier'] == 1, 'Profit'].values.reshape(-1, 1)
def get_outliers(dataframe, cols, outliers_fraction, row_id, n, cbolf=True, hbos=True, iforest=True, knn=True):
    """Return index labels of rows flagged as outliers by several detectors.

    The selected columns are min-max scaled to ``[0, 1]`` *in place* on
    ``dataframe``. Each enabled detector is then fitted on those columns
    and the rows it labels ``1`` are collected. A row is reported when it
    was collected at least ``n`` times across the enabled detectors.

    Side effects, kept for backward compatibility:

    * ``dataframe`` gains (or has overwritten) an ``'outlier'`` column
      holding the labels of the last detector that ran.
    * ``Outliers.df1`` .. ``Outliers.df4`` receive the flagged rows of
      CBLOF, HBOS, IForest and KNN respectively, for enabled detectors
      only.
    * ``Outliers.outlier_indices`` is set to the returned list.

    Params:
        dataframe: DataFrame holding the data; modified in place.
        cols (list): names of the numerical columns to use as features.
        outliers_fraction (float): value passed to every detector as
            ``contamination``.
        row_id (str): unique row identifier column on the dataframe.
        n (int): minimum number of times an observation should be
            flagged as an outlier to be considered one.
        cbolf, hbos, iforest, knn (bool): enable the CBLOF, HBOS,
            Isolation Forest and K-Nearest Neighbors detectors.

    Returns:
        List of index labels for rows in the dataframe that are flagged
        as outliers. Empty when every detector is disabled.
    """
    # Standardize the selected numerical variables.
    minmax = MinMaxScaler(feature_range=(0, 1))
    dataframe[cols] = minmax.fit_transform(dataframe[cols])

    # One feature column per selected dataframe column.
    X = np.column_stack([dataframe[col].values for col in cols])

    # (Outliers attribute, label used in the printout, detector) for every
    # enabled algorithm, in the order the results are merged.
    detectors = []
    if cbolf:
        detectors.append(('df1', 'CBLOF',
                          CBLOF(contamination=outliers_fraction,
                                check_estimator=False,
                                random_state=0)))
    if hbos:
        detectors.append(('df2', 'HBOS',
                          HBOS(contamination=outliers_fraction)))
    if iforest:
        detectors.append(('df3', 'IForest',
                          IForest(contamination=outliers_fraction,
                                  random_state=0)))
    if knn:
        detectors.append(('df4', 'KNN',
                          KNN(contamination=outliers_fraction)))

    # Only frames produced in this call are merged, so a disabled detector
    # can neither raise AttributeError nor leak a stale frame from an
    # earlier call into the result.
    flagged_frames = []
    for attribute, label, clf in detectors:
        clf.fit(X)

        # Predicted category of every data point.
        y_pred = clf.predict(X)
        n_inliers = len(y_pred) - np.count_nonzero(y_pred)
        n_outliers = np.count_nonzero(y_pred == 1)

        # Hold results on the dataframe and keep only the flagged rows.
        dataframe['outlier'] = y_pred.tolist()
        flagged = dataframe.loc[dataframe['outlier'] == 1]
        setattr(Outliers, attribute, flagged)
        flagged_frames.append(flagged)

        print('OUTLIERS:', n_outliers, 'INLIERS:', n_inliers,
              f'found with {label}')

    if flagged_frames:
        merged_df = pd.concat(flagged_frames)

        # Count how many times each observation was flagged.
        merged_df['count'] = merged_df.groupby(row_id)[row_id].transform(
            'count')

        # Keep observations flagged at least n times, once each.
        common = merged_df.loc[merged_df['count'] >= n]
        common = common.drop_duplicates(keep='last')

        # Index labels to be removed on the main dataframe.
        Outliers.outlier_indices = list(common.index)
    else:
        # pd.concat refuses an empty list; no detector means no outliers.
        Outliers.outlier_indices = []

    print(
        f' \n{len(Outliers.outlier_indices)} outliers commonly found by all algorithms\n'
    )
    print(f'The row index labels are:\n {Outliers.outlier_indices}')
    return Outliers.outlier_indices
# Speed difference of car 118457 plotted against x.
y = data118457['Speed diff']
plt.figure(figsize=(10, 4))
plt.plot(x, y, label='Car 118457')
plt.ylabel('Speed diff')
plt.xlabel('Time')
plt.show()

# In[15]:

# Fit CBLOF on the single 'Speed diff' feature (as one column).
cblof = CBLOF()
cblof.fit(df['Speed diff'].values.reshape(-1, 1))

# Evenly spaced values spanning the observed range, one per dataframe row.
xx = np.linspace(
    df['Speed diff'].min(),
    df['Speed diff'].max(),
    len(df),
).reshape(-1, 1)
anomaly_score = cblof.decision_function(xx)
outlier = cblof.predict(xx)

plt.figure(figsize=(10, 4))
plt.plot(xx, anomaly_score, label='anomaly score')
plt.xlabel('Speed diff')
plt.ylabel('anomaly score')
plt.show()

# In[16]:

# Rescale the four numeric columns to the [0, 1] range in place.
minmax = MinMaxScaler(feature_range=(0, 1))
scaled_columns = ['CarId', 'Speed diff', 'Heading diff', 'Position diff']
df[scaled_columns] = minmax.fit_transform(df[scaled_columns])
df[scaled_columns].head()

# In[17]:
# Create the synthetic train/test splits.
X_train, y_train, X_test, y_test = generate_data(
    n_train=n_train,
    n_test=n_test,
    n_features=2,
    contamination=contamination,
    random_state=42,
)

# Fit a CBLOF detector with its default settings.
clf_name = 'CBLOF'
clf = CBLOF()
clf.fit(X_train)

# Results on the training data, read from the fitted detector.
y_train_scores = clf.decision_scores_  # raw outlier scores
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)

# Results on the held-out test data.
y_test_scores = clf.decision_function(X_test)  # outlier scores
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)

# Report the evaluation for both splits.
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)

# Plot the results; the figure is shown but not saved.
visualize(
    clf_name,
    X_train,
    y_train,
    X_test,
    y_test,
    y_train_pred,
    y_test_pred,
    show_figure=True,
    save_figure=False,
)