class TestLSCP(unittest.TestCase):
    """Tests for an LSCP ensemble of two LOF detectors.

    The fixture tries to read ``data/cardio.mat`` next to this file and
    falls back to ``generate_data`` when loading raises ``TypeError`` or
    ``IOError``.
    """

    def setUp(self):
        """Load or generate the data, split it 60/40 and fit the detector."""
        # Define data file and read X and y
        # Generate some data if the source data is missing
        this_directory = path.abspath(path.dirname(__file__))
        mat_file = 'cardio.mat'
        try:
            mat = loadmat(path.join(*[this_directory, 'data', mat_file]))
        except (TypeError, IOError):
            # Both failure modes are handled identically, so they share one
            # handler instead of two copy-pasted ones.
            print('{data_file} does not exist. Use generated data'.format(
                data_file=mat_file))
            X, y = generate_data(train_only=True)  # load data
        else:
            X = mat['X']
            y = mat['y'].ravel()
            X, y = check_X_y(X, y)

        self.X_train, self.X_test, self.y_train, self.y_test = \
            train_test_split(X, y, test_size=0.4, random_state=42)

        self.detector_list = [LOF(), LOF()]
        self.clf = LSCP(self.detector_list)
        self.clf.fit(self.X_train)
        # Minimum ROC AUC accepted by test_prediction_scores.
        self.roc_floor = 0.6

    def test_parameters(self):
        """Fitting must populate the fitted attributes with non-None values."""
        assert (hasattr(self.clf, 'decision_scores_') and
                self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and
                self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_') and
                self.clf.threshold_ is not None)
        assert (hasattr(self.clf, '_mu') and
                self.clf._mu is not None)
        assert (hasattr(self.clf, '_sigma') and
                self.clf._sigma is not None)
        assert (hasattr(self.clf, 'detector_list') and
                self.clf.detector_list is not None)

    def test_train_scores(self):
        """There is one training score per training sample."""
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        """Test scores have the right length and reach the ROC floor."""
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert (roc_auc_score(self.y_test, pred_scores) >= self.roc_floor)

    def test_prediction_labels(self):
        """Predicted labels have the same shape as the ground truth."""
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        """Default probabilities lie in [0, 1]."""
        pred_proba = self.clf.predict_proba(self.X_test)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_linear(self):
        """Probabilities from method='linear' lie in [0, 1]."""
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_unify(self):
        """Probabilities from method='unify' lie in [0, 1]."""
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_parameter(self):
        """An unknown probability method raises ValueError."""
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        """fit_predict returns one label per training sample."""
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        """Known scoring names are accepted; unknown ones are rejected."""
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        """Ranks follow the score ordering and stay within the train size."""
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        """Normalized ranks follow the score ordering and stay below 1.01."""
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        """Nothing to clean up."""
        pass
def identify_outliers(df, features, contamination=0.1, algorithms=None):
    """Score the rows of ``df`` for outlierness and flag the outliers.

    With a single algorithm that detector is fitted directly. With two or
    more, the detectors are combined with LSCP (Locally Selective
    Combination in Parallel outlier ensembles,
    https://arxiv.org/abs/1812.01528).

    Only the numeric columns of ``df`` are used for fitting. The input
    frame itself is left untouched; the renaming to ``features`` is done on
    a copy.

    Parameters
    ----------
    df : DataFrame
        The data to be examined.
    features : list
        Column names assigned to the copy of ``df`` in the output. Must
        have one entry per column of ``df``.
    contamination : float in (0., 0.5)
        The proportion of outliers in the data set. In the LSCP case it is
        passed to LSCP only; the base detectors are built with their
        defaults.
    algorithms : list, optional
        Names of the algorithms to use. Defaults to ``['Isolation Forest']``.
        Supported names:
        ['Isolation Forest', 'Cluster-based Local Outlier Factor',
        'Minimum Covariance Determinant (MCD)',
        'Principal Component Analysis (PCA)',
        'Angle-based Outlier Detector (ABOD)',
        'Histogram-base Outlier Detection (HBOS)',
        'K Nearest Neighbors (KNN)', 'Local Outlier Factor (LOF)',
        'Feature Bagging', 'One-class SVM (OCSVM)']

    Returns
    -------
    df_sorted : DataFrame
        Copy of the data with 3 new columns: anomaly_score, probability
        and prediction. Sorted on descending anomaly score.
    df_styled : Styler
        Styled version of df_sorted for use in Jupyter Notebook
        (i.e. display(df_styled)). Rows predicted as outliers are
        highlighted.

    Raises
    ------
    ValueError
        If ``algorithms`` is empty.
    KeyError
        If ``algorithms`` contains an unsupported name.
    """
    if algorithms is None:
        algorithms = ['Isolation Forest']
    if len(algorithms) == 0:
        raise ValueError('algorithms must name at least one detector')

    # keep only numeric type features
    X = np.asarray(df.select_dtypes(include=[np.number]))

    classifiers = {
        'Isolation Forest': IForest,
        'Cluster-based Local Outlier Factor': CBLOF,
        'Minimum Covariance Determinant (MCD)': MCD,
        'Principal Component Analysis (PCA)': PCA,
        'Angle-based Outlier Detector (ABOD)': ABOD,
        'Histogram-base Outlier Detection (HBOS)': HBOS,
        # NOTE(review): every other entry is a detector class; `knn` is
        # lower-case and may be a module rather than the `KNN` class --
        # confirm against the file's imports.
        'K Nearest Neighbors (KNN)': knn,
        'Local Outlier Factor (LOF)': LOF,
        'Feature Bagging': FeatureBagging,
        'One-class SVM (OCSVM)': OCSVM,
    }

    if len(algorithms) > 1:
        selected_classifiers = [classifiers[name]() for name in algorithms]
        clf = LSCP(selected_classifiers, contamination=contamination)
    else:
        clf = classifiers[algorithms[0]](contamination=contamination)

    clf.fit(X)
    y_pred = np.asarray(clf.predict(X))
    # Column 1 of the probability output is kept as the outlier probability.
    outlier_proba = np.asarray(clf.predict_proba(X, method='unify'))[:, 1]
    anomaly_score = clf.decision_function(X)

    # Build the result columns on df's own index: pd.concat(axis=1) aligns
    # on index labels, so a fresh 0..n-1 index would misalign (and NaN-pad)
    # any frame that was filtered, shuffled or otherwise re-indexed.
    results = pd.DataFrame(
        {'anomaly_score': anomaly_score,
         'probability': outlier_proba,
         'prediction': y_pred},
        columns=['anomaly_score', 'probability', 'prediction'],
        index=df.index)

    # Rename on a copy so the caller's DataFrame is not modified.
    renamed = df.copy()
    renamed.columns = features

    df_with_anomaly_score = pd.concat([renamed, results], axis=1)
    df_sorted = df_with_anomaly_score.sort_values(by='anomaly_score',
                                                  ascending=False)

    # Styler passes each row with `.name` set to its index *label*, so the
    # outliers are collected as labels (not positions), once, in a set.
    outlier_labels = set(df.index[y_pred == 1])

    def highlight_outlier(row):
        style = 'background: MistyRose' if row.name in outlier_labels else ''
        return [style] * len(row)

    cm = sns.diverging_palette(220, 10, sep=80, n=7, as_cmap=True)
    df_styled = df_sorted.style \
        .background_gradient(cmap=cm, subset=['anomaly_score']) \
        .apply(highlight_outlier, axis=1, subset=df_sorted.columns[:-3])

    return df_sorted, df_styled
class TestLSCP(unittest.TestCase):
    """Tests for an LSCP ensemble of two LOF detectors on generated data."""

    def setUp(self):
        """Generate and standardize the data, then fit the detector."""
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6

        generated = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)
        self.X_train, self.y_train, self.X_test, self.y_test = generated

        scaled = standardizer(self.X_train, self.X_test)
        self.X_train, self.X_test = scaled

        self.detector_list = [LOF(), LOF()]
        self.clf = LSCP(self.detector_list,
                        contamination=self.contamination)
        self.clf.fit(self.X_train)

    def _check_proba_bounds(self, proba):
        """Assert that the smallest and largest values lie within [0, 1]."""
        assert_greater_equal(proba.min(), 0)
        assert_less_equal(proba.max(), 1)

    def test_parameters(self):
        """Each fitted attribute exists and is not None."""
        expected_attributes = ('decision_scores_', 'labels_', 'threshold_',
                               '_mu', '_sigma', 'detector_list')
        for attribute in expected_attributes:
            assert_true(
                hasattr(self.clf, attribute) and
                getattr(self.clf, attribute) is not None)

    def test_train_scores(self):
        """There is one training score per training sample."""
        n_scores = len(self.clf.decision_scores_)
        assert_equal(n_scores, self.X_train.shape[0])

    def test_prediction_scores(self):
        """Test scores have the right length and exceed the ROC floor."""
        scores = self.clf.decision_function(self.X_test)

        # one score per test sample
        assert_equal(scores.shape[0], self.X_test.shape[0])

        # detection quality
        auc = roc_auc_score(self.y_test, scores)
        assert_greater(auc, self.roc_floor)

    def test_prediction_labels(self):
        """Predicted labels have the same shape as the ground truth."""
        labels = self.clf.predict(self.X_test)
        assert_equal(labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        """Default probabilities lie in [0, 1]."""
        self._check_proba_bounds(self.clf.predict_proba(self.X_test))

    def test_prediction_proba_linear(self):
        """Probabilities from method='linear' lie in [0, 1]."""
        self._check_proba_bounds(
            self.clf.predict_proba(self.X_test, method='linear'))

    def test_prediction_proba_unify(self):
        """Probabilities from method='unify' lie in [0, 1]."""
        self._check_proba_bounds(
            self.clf.predict_proba(self.X_test, method='unify'))

    def test_prediction_proba_parameter(self):
        """An unknown probability method raises ValueError."""
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        """fit_predict returns one label per training sample."""
        labels = self.clf.fit_predict(self.X_train)
        assert_equal(labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        """Known scoring names are accepted; unknown ones are rejected."""
        X, y = self.X_test, self.y_test
        self.clf.fit_predict_score(X, y)
        self.clf.fit_predict_score(X, y, scoring='roc_auc_score')
        self.clf.fit_predict_score(X, y, scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(X, y, scoring='something')

    def test_predict_rank(self):
        """Ranks follow the score ordering and stay within the train size."""
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test)

        # rank order must agree with score order (within tolerance)
        assert_allclose(rankdata(ranks), rankdata(scores), atol=3)
        upper_bound = self.X_train.shape[0] + 1
        assert_array_less(ranks, upper_bound)
        assert_array_less(-0.1, ranks)

    def test_predict_rank_normalized(self):
        """Normalized ranks follow the score ordering and stay below 1.01."""
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # rank order must agree with score order (within tolerance)
        assert_allclose(rankdata(ranks), rankdata(scores), atol=3)
        assert_array_less(ranks, 1.01)
        assert_array_less(-0.1, ranks)

    def tearDown(self):
        """Nothing to clean up."""
        pass