class MixtureLocalizationOutliers(object):
    """Outlier detector that applies LOF to Gaussian-mixture log-likelihoods.

    A ``GaussianMixture`` is fitted on the data, each sample's log-likelihood
    under that mixture is computed with ``score_samples``, and a novelty-mode
    ``LocalOutlierFactor`` is fitted on those one-dimensional values.
    """

    def __init__(self, n_components='2'):
        """Create the two underlying estimators.

        Parameters
        ----------
        n_components : int or str, default '2'
            Number of mixture components; converted with ``int()``.
        """
        self.GMM = GaussianMixture(int(n_components))
        self.LOF = LocalOutlierFactor(n_neighbors=2, novelty=True,
                                      contamination=1e-4)
        # LOF decision scores produced by the most recent ``predict`` call.
        self.decisions = None

    def _score(self, X):
        """Return ``(log_likelihoods, lof_decision_scores)`` for ``X``."""
        pdfs = self.GMM.score_samples(X)
        lofs = self.LOF.decision_function(pdfs.reshape(-1, 1))
        return pdfs, lofs

    def fit(self, X, y=None):
        """Fit the mixture, then the LOF on the mixture log-likelihoods.

        ``y`` is accepted and ignored. Returns ``self``.
        """
        self.GMM.fit(X)
        pdfs = self.GMM.score_samples(X)
        self.LOF.fit(pdfs.reshape(-1, 1))
        lofs = self.LOF.decision_function(pdfs.reshape(-1, 1))
        # NOTE(review): np.percentile expects percentages in [0, 100], so
        # these are the 0.25th and 0.75th percentiles, not the quartiles.
        # Confirm that this is intended.
        self.lower_lof, self.upper_lof = np.percentile(lofs, [.25, .75])
        return self

    def predict(self, X):
        """Return a list with ``-1`` (outlier) or ``1`` (inlier) per sample.

        The LOF scores computed here are also stored in ``self.decisions``.
        """
        pdfs, lofs = self._score(X)
        # NOTE(review): the second test compares the mixture log-likelihood
        # (not the LOF score) with a threshold derived from LOF scores.
        # Behaviour is kept as is; verify that this is intended.
        preds = [
            -1 if (lof <= self.lower_lof or pdf >= self.upper_lof) else 1
            for pdf, lof in zip(pdfs, lofs)
        ]
        self.decisions = lofs
        return preds

    def decision_function(self, X):
        """Return the LOF decision scores for ``X``.

        The scores are always computed for the ``X`` given here. Returning
        the scores cached by an earlier ``predict`` call would silently hand
        back results for a different data set.
        """
        _, lofs = self._score(X)
        return lofs
class LOFNovelty:
    """Novelty-mode LOF wrapper that prints a short evaluation report."""

    def __init__(self):
        self.clf = LocalOutlierFactor(novelty=True, contamination=0.1)
        # A scaler is created here but is not applied anywhere in this class.
        self.scaler = StandardScaler()

    def train(self, train):
        """Fit the LOF estimator on ``train`` (used as given, unscaled)."""
        self.clf.fit(train)

    def predict(self, valid, anomaly):
        """Predict on both sets and print confusion matrices and hit rates.

        ``valid`` is expected to be labelled ``1`` by the model and
        ``anomaly`` to be labelled ``-1``. Nothing is returned.
        """
        y_pred_valid = self.clf.predict(valid)
        y_pred_outliers = self.clf.predict(anomaly)
        # Decision scores are computed but not reported.
        score_valid = self.clf.decision_function(valid)
        score_anomaly = self.clf.decision_function(anomaly)

        expected_valid = [1] * len(y_pred_valid)
        expected_outliers = [-1] * len(y_pred_outliers)

        print("LOF Novelty result")
        print(confusion_matrix(expected_valid, y_pred_valid).ravel())
        print(confusion_matrix(expected_outliers, y_pred_outliers).ravel())

        valid_hits = sum(1 for label in y_pred_valid if label == 1)
        outlier_hits = sum(1 for label in y_pred_outliers if label == -1)
        print(" Validation data:", valid_hits / y_pred_valid.shape[0])
        print(" Outlier data:", outlier_hits / y_pred_outliers.shape[0])
def perform_outlier_detection(self, X):
    """Return the current-level users with the lowest combined scores.

    LOF and Isolation Forest scores are computed on all rows of ``X`` and
    merged with ``self.combine``. Only the scores after the first
    ``self.len_priors`` entries are considered; they are paired with
    ``self.current_level_users``.

    Returns
    -------
    list
        Users whose combined score is at or below the 8th percentile of the
        considered scores, ordered by descending score. An empty list is
        returned when there are no scores to consider.
    """
    # LOF on all features.
    # NOTE(review): ``_decision_function`` is a private scikit-learn method
    # that is not present in every release; verify against the pinned version.
    lof = LocalOutlierFactor(n_neighbors=20)
    lof.fit(X)
    lof_scores = lof._decision_function(X)

    # Isolation forest on all features.
    forest = IsolationForest()
    forest.fit(X)
    forest_scores = forest.decision_function(X)

    scores = self.combine([lof_scores, forest_scores])

    new_scores = scores[self.len_priors:]
    if len(new_scores) == 0:
        # A percentile of an empty sequence is undefined.
        return []

    user_scores = sorted(zip(self.current_level_users, new_scores),
                         key=lambda x: x[1], reverse=True)
    threshold = np.percentile(new_scores, 8)
    return [user for user, score in user_scores if score <= threshold]
def test_local_outlier_factor_cdist_p3(self):
    """Convert a p=3 LOF with the 'cdist' optimisation and compare outputs."""
    samples = np.array(
        [[-1.1, -1.2], [0.3, 0.2], [0.5, 0.4], [100., 99.]],
        dtype=np.float32)
    lof = LocalOutlierFactor(n_neighbors=2, novelty=True, p=3)
    model = lof.fit(samples)
    model_onnx = to_onnx(model, samples, target_opset=TARGET_OPSET,
                         options={'optim': 'cdist'})
    self.assertIn('CDist', str(model_onnx))

    # Score slightly shifted points rather than the training points.
    shifted = samples.copy()
    shifted[:, 0] += 0.1

    try:
        sess = InferenceSession(model_onnx.SerializeToString())
    except InvalidGraph as e:
        # The test ends silently when the runtime rejects attribute ``p``
        # of CDist; any other graph error is re-raised.
        if "Unrecognized attribute: p for operator CDist" not in str(e):
            raise e
        return

    output_names = [out.name for out in sess.get_outputs()]
    self.assertEqual(output_names, ['label', 'scores'])

    got = sess.run(None, {'X': shifted})
    self.assertEqual(len(got), 2)

    labels, scores = got[0], got[1]
    assert_almost_equal(lof.predict(shifted), labels.ravel())
    assert_almost_equal(lof.decision_function(shifted), scores.ravel())
def test_local_outlier_factor_metric_cdist(self):
    """Check the 'cdist' conversion for the euclidean and sqeuclidean metrics."""
    for metric in ['euclidean', 'sqeuclidean']:
        with self.subTest(metric=metric):
            samples = np.array(
                [[-1.1, -1.2], [0.3, 0.2], [0.5, 0.4], [100., 99.]],
                dtype=np.float32)
            lof = LocalOutlierFactor(n_neighbors=2, novelty=True,
                                     metric=metric)
            model = lof.fit(samples)
            model_onnx = to_onnx(model, samples, target_opset=TARGET_OPSET,
                                 options={'optim': 'cdist'})

            # Score slightly shifted points rather than the training points.
            shifted = samples.copy()
            shifted[:, 0] += 0.1

            sess = InferenceSession(model_onnx.SerializeToString())
            output_names = [out.name for out in sess.get_outputs()]
            self.assertEqual(output_names, ['label', 'scores'])

            got = sess.run(None, {'X': shifted})
            self.assertEqual(len(got), 2)

            labels, scores = got[0], got[1]
            assert_almost_equal(lof.predict(shifted), labels.ravel())
            assert_almost_equal(lof.decision_function(shifted),
                                scores.ravel(), decimal=4)
class LOFNoveltyFilter (StaticFilter, _InputsStatBasedInitializable):
    """Static filter that accepts an input when its LOF score is high enough.

    A novelty-mode ``LocalOutlierFactor`` is fitted on a prefix of the
    training data; an input is "close enough" when its decision-function
    value exceeds ``lof_threshold`` (0.0).
    """

    def __init__(self, name = 'LOF-based novelty', sample_size = 3000,
                 metric = 'cosine', lof_kwds = None, **kwds):
        """
        Parameters
        ----------
        name : str
            Name used in log messages.
        sample_size : int
            Maximum number of training samples used to fit the estimator
            (must be >= 1).
        metric : str
            Distance metric passed to ``LocalOutlierFactor``.
        lof_kwds : dict or None
            Extra keyword arguments for ``LocalOutlierFactor``; ``None``
            means no extra arguments.
        **kwds
            Forwarded to the parent constructors.
        """
        assert (isinstance (sample_size, int) and 1 <= sample_size)
        # A fresh dict per call avoids sharing one mutable default object
        # between instances.
        lof_kwds = {} if lof_kwds is None else lof_kwds
        self.name = name
        self.sample_size = sample_size
        self.lof_threshold = 0.0
        self.lof = LocalOutlierFactor (**lof_kwds, metric = metric, novelty = True)
        super().__init__(**kwds)

    def inputs_stat_initialize (self,
                                train_data: raw_datat = None,
                                test_data: raw_datat = None):
        """Fit the LOF estimator on the first samples of ``train_data.data``.

        At most ``self.sample_size`` samples are used. ``test_data`` is not
        used by this method.
        """
        sample_size = min (self.sample_size, train_data.data.shape[0])
        np1 ('Initializing LOF-based novelty estimator with {} training samples... '
             .format (sample_size))
        # TODO: random sampling (& shuffle)?.
        self.lof.fit (train_data.data[:sample_size])
        c1 ('done')
        p1 ('{} offset is {}'.format (self.name, self.lof.offset_))

    def close_enough (self, i: Input):
        """Return whether the LOF decision value of ``i`` exceeds the threshold.

        ``i`` is reshaped to a single row, so the result is the element-wise
        comparison for that one sample.
        """
        lof = self.lof.decision_function (i.reshape (1, -1))
        return lof > self.lof_threshold
class LocalOutlierFactor_Classifier:
    """LOF-based classifier that labels samples as inliers (1) or outliers (-1)."""

    def __init__(self, save_path):
        """Create the output directory and the underlying estimator.

        Parameters
        ----------
        save_path : str
            Base directory; a ``LocalOutlierFactor`` sub-directory is created
            inside it if it does not exist yet.
        """
        # Default output path.
        self.save_path = os.path.join(save_path, 'LocalOutlierFactor')
        os.makedirs(self.save_path, exist_ok=True)
        self.n_neighbors = 40
        # Proportion of outliers in the data set; used when fitting to define
        # the threshold of the decision function.
        self.contamination = 0.1
        # novelty=True is required because predict / decision_function are
        # applied to data other than the training set.
        self.classifier = LocalOutlierFactor(n_neighbors=self.n_neighbors,
                                             contamination=self.contamination,
                                             novelty=True)

    def fit_model(self, train_data_matrix, test_data_matrix, test_true_label):
        """Train the model, then print accuracy and a classification report.

        The model is fitted on ``train_data_matrix`` and evaluated on
        ``test_data_matrix`` against ``test_true_label``.
        """
        self.classifier.fit(train_data_matrix)
        y_pred_label = self.classifier.predict(test_data_matrix)
        accuracy, report, _ = sklearn_evaluation(test_true_label, y_pred_label)
        print('Accuracy: {} \nClassification Report:\n{}\n'.format(accuracy, report))
        sys.stdout.flush()

    def test_model(self, test_data, test_label):
        """Evaluate the fitted model on ``test_data``.

        Parameters
        ----------
        test_data : array-like
            Samples to score.
        test_label : array-like
            True labels, such as ``[1, 1, -1, ...]``.

        Returns
        -------
        tuple
            ``(y_pred_test, scores_pred, n_errors)``: predicted labels,
            decision-function values, and the number of mismatched labels.
        """
        scores_pred = self.classifier.decision_function(test_data)
        y_pred_test = self.classifier.predict(test_data)
        n_errors = (y_pred_test != test_label).sum()
        return y_pred_test, scores_pred, n_errors
def perform_outlier_detection(self, X, len_priors):
    """Return the current-level users with the highest combined scores.

    LOF and Isolation Forest scores are computed on ``X`` and merged with
    ``self.combine``. Scores after the first ``len_priors`` entries are
    paired with ``self.current_level_users``; users at or above the 95th
    percentile are returned, ordered by descending score.
    """
    # LOF on all features.
    lof = LocalOutlierFactor(n_neighbors=20)
    lof.fit(X)
    check_is_fitted(lof, ["threshold_", "negative_outlier_factor_",
                          "n_neighbors_", "_distances_fit_X_"])
    if X is None:
        lof_scores = lof.negative_outlier_factor_
    else:
        X = check_array(X, accept_sparse='csr')
        lof_scores = lof._decision_function(X)

    # Isolation forest on all features (fitted on the validated X).
    forest = IsolationForest()
    forest.fit(X)
    forest_scores = forest.decision_function(X)

    combined = self.combine(lof_scores, forest_scores)
    new_scores = combined[len_priors:]

    ranked = sorted(zip(self.current_level_users, new_scores),
                    key=lambda pair: pair[1], reverse=True)
    threshold = np.percentile(new_scores, 95)

    outliers = []
    for user, score in ranked:
        if score >= threshold:
            outliers.append(user)
    return outliers
def computeLocalOutlierFactor(dyResult):
    """Fit a novelty-mode LOF on (mean, variance) points and plot its frontier.

    ``ut_data.numpyMeanVariance(dyResult["window"])`` supplies the training
    array and the min/max bounds of both columns. The decision function is
    evaluated on a 500x500 grid that extends beyond those bounds, and the
    learned frontier (level 0) is drawn together with the training points.
    The figure is shown with ``plt.show()``; nothing is returned.
    """
    nDarrayMeanVar, min_mean, max_mean, min_var, max_var = \
        ut_data.numpyMeanVariance(dyResult["window"])

    # Evaluation grid, padded beyond the observed range on both axes.
    xx, yy = np.meshgrid(np.linspace(min_mean - 100, max_mean + 100, 500),
                         np.linspace(min_var - 1000, max_var + 1000, 500))

    clf = LocalOutlierFactor(n_neighbors=15, novelty=True, contamination=0.1)
    clf.fit(nDarrayMeanVar)

    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.title("Novelty Detection with LOF")
    # Negative decision values: shaded from the minimum up to the frontier.
    plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 7), cmap=plt.cm.PuBu)
    a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors="darkred")
    # Non-negative decision values: the inlier region.
    plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors="palevioletred")

    s = 40
    b1 = plt.scatter(nDarrayMeanVar[:, 0], nDarrayMeanVar[:, 1],
                     c="white", s=s, edgecolors="k")

    plt.axis("tight")
    plt.xlim((min_mean, max_mean))
    plt.ylim((min_var, max_var))
    # legend_elements() provides a legend handle without relying on
    # ContourSet.collections, which recent matplotlib releases deprecate.
    frontier_handle = a.legend_elements()[0][0]
    plt.legend(
        [frontier_handle, b1],
        ["learned frontier", "training observations"],
        loc="upper left",
        prop=matplotlib.font_manager.FontProperties(size=11),
    )
    plt.show()
def test_local_outlier_factor_double(self):
    """Convert a LOF fitted on float64 data and compare ONNX and sklearn outputs."""
    samples = np.array(
        [[-1.1, -1.2], [0.3, 0.2], [0.5, 0.4], [100., 99.]],
        dtype=np.float64)
    lof = LocalOutlierFactor(n_neighbors=2, novelty=True)
    model = lof.fit(samples)
    model_onnx = to_onnx(model, samples, target_opset=TARGET_OPSET)

    sess = InferenceSession(model_onnx.SerializeToString())
    output_names = [out.name for out in sess.get_outputs()]
    self.assertEqual(output_names, ['label', 'scores'])

    got = sess.run(None, {'X': samples})
    self.assertEqual(len(got), 2)

    labels, scores = got[0], got[1]
    assert_almost_equal(lof.predict(samples), labels.ravel())
    assert_almost_equal(lof.decision_function(samples), scores.ravel())
def density_contour(
    ax,
    data,
    x,
    y,
    groupby=None,
    c="lightgray",
    single_contour_pad=1,
    linewidth=1,
    palette=None,
):
    """Draw one dashed LOF contour line per group on ``ax``.

    For each group a novelty-mode ``LocalOutlierFactor`` is fitted on the
    ``x``/``y`` columns, its decision function is evaluated on a 500x500 grid
    spanning the zoomed data range, and the contour at level
    ``-single_contour_pad`` is drawn.

    Parameters
    ----------
    ax : object exposing ``contour``
        Target axes.
    data : pandas.DataFrame
        Source table containing columns ``x`` and ``y``.
    x, y : column labels
        Columns used as coordinates.
    groupby : str, array-like or None
        Column name, or per-row group values; ``None`` puts every row into a
        single group.
    c : color
        Colour used when ``palette`` is ``None`` or lacks the group.
    single_contour_pad : number
        The contour is drawn at decision value ``-single_contour_pad``.
    linewidth : number
        Contour line width.
    palette : mapping or None
        Optional group -> colour lookup.
    """
    _data = data.copy()
    if groupby is None:
        _data["groupby"] = "one group"
    elif isinstance(groupby, str):
        _data["groupby"] = data[groupby]
    else:
        _data["groupby"] = groupby

    _contour_kws = dict(
        linewidths=linewidth, levels=(-single_contour_pad,), linestyles="dashed"
    )
    _lof_kws = dict(n_neighbors=25, novelty=True, contamination="auto")

    xmin, ymin = _data[[x, y]].min()
    xmax, ymax = _data[[x, y]].max()
    xmin, xmax = zoom_min_max(xmin, xmax, 1.2)
    ymin, ymax = zoom_min_max(ymin, ymax, 1.2)

    # The evaluation grid is identical for every group, so build it once
    # instead of rebuilding 250,000 points on each iteration.
    xx, yy = np.meshgrid(np.linspace(xmin, xmax, 500),
                         np.linspace(ymin, ymax, 500))
    grid_points = np.c_[xx.ravel(), yy.ravel()]

    for group, sub_data in _data[[x, y, "groupby"]].groupby("groupby"):
        clf = LocalOutlierFactor(**_lof_kws)
        clf.fit(sub_data.iloc[:, :2].values)
        z = clf.decision_function(grid_points).reshape(xx.shape)
        if palette is None:
            _color = c
        else:
            _color = palette[group] if group in palette else c
        # plot contour line(s)
        ax.contour(xx, yy, z, colors=_color, **_contour_kws)
def test_local_outlier_factor_rnd(self):
    """Convert a LOF fitted on seeded random data containing two extreme rows."""
    rng = np.random.RandomState(0)
    samples = rng.randn(100, 4).astype(np.float32)
    # Push the last two rows far away from the rest of the data.
    samples[-1, 2:] = 99.
    samples[-2, :2] = -99.

    lof = LocalOutlierFactor(n_neighbors=2, novelty=True)
    model = lof.fit(samples)
    model_onnx = to_onnx(model, samples, target_opset=TARGET_OPSET)

    sess = InferenceSession(model_onnx.SerializeToString())
    output_names = [out.name for out in sess.get_outputs()]
    self.assertEqual(output_names, ['label', 'scores'])

    got = sess.run(None, {'X': samples})
    self.assertEqual(len(got), 2)

    labels, scores = got[0], got[1]
    assert_almost_equal(lof.predict(samples), labels.ravel())
    assert_almost_equal(lof.decision_function(samples), scores.ravel(),
                        decimal=5)
def compute_values(df, classes, **kwargs):
    """Combine three LOF-based scores into one value per row of ``df``.

    For every row the function computes:

    * ``same_lof``  - negative outlier factor within the row's own class,
    * ``other_lof`` - reciprocal of the novelty decision value against all
      rows of the other classes (10 when the decision value is exactly 0),
    * ``all_lof``   - negative outlier factor within the whole data set,

    and returns ``-(same_lof + alfa * other_lof + beta * all_lof)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table, one row per sample.
    classes : array-like
        Class label of each row.
    **kwargs
        ``alfa`` (default 0.75) and ``beta`` (default 0.25) are the weights
        above; all remaining keyword arguments are passed to
        ``LocalOutlierFactor``.

    Returns
    -------
    numpy.ndarray
        One value per row of ``df``.
    """
    alfa = float(kwargs.pop("alfa", 0.75))
    beta = float(kwargs.pop("beta", 0.25))

    # Use plain arrays for every fit/score call so the estimators see the
    # same input type throughout.
    X = df.values
    n_samples = len(df)

    _, cls_num = np.unique(classes, return_inverse=True)
    clss = cls_num.astype(int)

    lof = LocalOutlierFactor(**kwargs)
    lof.fit(X)
    # Captured before ``lof`` is refitted per class below.
    all_lof = lof.negative_outlier_factor_

    lofn = LocalOutlierFactor(**kwargs, novelty=True)
    same_lof = np.empty(n_samples)
    other_lof = np.empty(n_samples)

    for cls in np.unique(clss):
        is_member = clss == cls
        ind = np.flatnonzero(is_member)
        nind = np.flatnonzero(~is_member)

        lof.fit(X[ind])
        same_lof[ind] = lof.negative_outlier_factor_

        lofn.fit(X[nind])
        # Score the whole class in one call instead of one call per row.
        v = lofn.decision_function(X[ind])
        inverse = np.full(v.shape, 10.0)
        np.divide(1.0, v, out=inverse, where=v != 0)
        other_lof[ind] = inverse

    return -1 * (same_lof + alfa * other_lof + beta * all_lof)
def perform_local_outlier_factor_novelty_detection(data):
    '''
    Run Local Outlier Factor novelty detection on the first four columns of
    ``data`` and plot/save the learned frontier.

    The columns are standardised, projected onto two principal components and
    split 75% / 25% into training and test observations. A novelty-mode LOF
    (20 neighbours, contamination 0.1) is fitted on the training part. The
    percentage of training and test observations flagged as outliers is shown
    in the x-axis label. The figure is displayed and then saved as PNG and
    EPS under ``sequence_mining_analysis/Results/novelty_detection/``.
    '''
    # Importing necessary libraries
    from sklearn.neighbors import LocalOutlierFactor
    from sklearn.model_selection import train_test_split

    X = data.iloc[:, 0:4].values
    # Standardise once and reuse the result for both the PCA fit and the
    # projection.
    X_scaled = StandardScaler().fit_transform(X)
    pca = PCA(n_components=2)
    X = pca.fit(X_scaled).transform(X_scaled)

    # Splitting the observations into 75% training and 25% testing
    X_train, X_test = train_test_split(X, test_size=0.25, random_state=42)

    # Local Outlier Factor classifier initialization and results
    classifier = LocalOutlierFactor(n_neighbors=20,
                                    novelty=True,
                                    contamination=0.1)
    classifier.fit(X_train)

    # In novelty mode predict() must only be used on unseen data. Training
    # outliers are therefore read from the fitted LOF values: samples whose
    # negative_outlier_factor_ is below offset_ are the training outliers.
    n_error_train = int(
        np.sum(classifier.negative_outlier_factor_ < classifier.offset_))
    Y_pred_test = classifier.predict(X_test)
    n_error_test = Y_pred_test[Y_pred_test == -1].size
    error_train = n_error_train / X_train.shape[0] * 100
    error_novel = n_error_test / Y_pred_test.shape[0] * 100

    # Visualization
    plt.clf()
    myFig = plt.figure(figsize=[10, 8])

    xx, yy = np.meshgrid(np.linspace(-3, 8, 500), np.linspace(-2.5, 4, 500))
    Z = classifier.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.contourf(xx,
                 yy,
                 Z,
                 levels=np.linspace(Z.min(), 0, 7),
                 cmap=plt.cm.PuBu)
    a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='darkred')
    plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors='palevioletred')

    s = 60
    b1 = plt.scatter(X_train[:, 0],
                     X_train[:, 1],
                     c='white',
                     s=s,
                     edgecolors='k')
    b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='gold', s=s, edgecolors='k')
    plt.axis('tight')
    # legend_elements() provides a legend handle without relying on
    # ContourSet.collections, which recent matplotlib releases deprecate.
    frontier_handle = a.legend_elements()[0][0]
    plt.legend([frontier_handle, b1, b2], [
        "Learned Frontier", "Training Observations", "New Regular Observations"
    ],
               loc="best",
               prop=matplotlib.font_manager.FontProperties(size=14))
    plt.xlabel("Error Train: %.2f%% and Error Novel Regular: %.2f%%" %
               (error_train, error_novel),
               fontsize=13,
               weight="bold")
    plt.yticks(fontsize=14)
    plt.xticks(fontsize=14)
    plt.title(
        'Novelty Detection using Local Outlier Factor of Ransomware Families\'\nSequence #1, #2, #3, and #4 Counts from 15 minutes of IRP Logs',
        fontsize=14,
        weight='bold')
    plt.show()

    # Save figure
    myFig.savefig(
        'sequence_mining_analysis/Results/novelty_detection/Local_Outlier_Factor/15_mins_sequences_1_2_3_4.png',
        format='png',
        dpi=150)
    myFig.savefig(
        'sequence_mining_analysis/Results/novelty_detection/Local_Outlier_Factor/15_mins_sequences_1_2_3_4.eps',
        format='eps',
        dpi=1200)
# Draw 20 abnormal "novel" observations uniformly from [-4, 4) on both axes.
X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))

# Novelty detection: build the estimator with novelty=True and fit it on the
# training observations only.
clf = LocalOutlierFactor(n_neighbors=20, novelty=True, contamination=0.1)
clf.fit(X_train)

# predict, decision_function and score_samples must NOT be applied to X_train
# (the results would be wrong); use them only on unseen data such as X_test,
# X_outliers or the mesh grid.
y_pred_test = clf.predict(X_test)
y_pred_outliers = clf.predict(X_outliers)
# Regular test points flagged as outliers / abnormal points accepted as inliers.
n_error_test = np.count_nonzero(y_pred_test == -1)
n_error_outliers = np.count_nonzero(y_pred_outliers == 1)

# Evaluate the decision function on the mesh grid and draw the learned
# frontier (level 0) with the shaded regions on either side of it.
Z = clf.decision_function(
    np.column_stack((xx.ravel(), yy.ravel()))).reshape(xx.shape)

plt.title("Novelty Detection with LOF")
plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 7), cmap=plt.cm.PuBu)
a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='darkred')
plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors='palevioletred')

# Training, regular test and abnormal observations share one marker size.
s = 40
b1 = plt.scatter(X_train[:, 0], X_train[:, 1],
                 c='white', s=s, edgecolors='k')
b2 = plt.scatter(X_test[:, 0], X_test[:, 1],
                 c='blueviolet', s=s, edgecolors='k')
c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1],
                c='gold', s=s, edgecolors='k')
plt.axis('tight')
plt.xlim((-5, 5))
# Generate some abnormal novel observations X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2)) # fit the model for novelty detection (novelty=True) clf = LocalOutlierFactor(n_neighbors=20, novelty=True, contamination=0.1) clf.fit(X_train) # DO NOT use predict, decision_function and score_samples on X_train as this # would give wrong results but only on new unseen data (not used in X_train), # e.g. X_test, X_outliers or the meshgrid y_pred_test = clf.predict(X_test) y_pred_outliers = clf.predict(X_outliers) n_error_test = y_pred_test[y_pred_test == -1].size n_error_outliers = y_pred_outliers[y_pred_outliers == 1].size # plot the learned frontier, the points, and the nearest vectors to the plane Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) plt.title("Novelty Detection with LOF") plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 7), cmap=plt.cm.PuBu) a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors="darkred") plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors="palevioletred") s = 40 b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c="white", s=s, edgecolors="k") b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c="blueviolet", s=s, edgecolors="k") c = plt.scatter(X_outliers[:, 0],
offset_:偏移量用于从原始分数中获取二进制标签。negative_outlier_factor小于的观察值offset_ 被检测为异常。默认的偏移设置为-1.5(inliers score around -1),除非提供的污染参数不同于“自动”。在那种情况下,以这样的方式定义偏移量,即我们可以在训练中获得预期的异常值数量。 ''' clf = LocalOutlierFactor(novelty=True) #训练模型 clf.fit(X_train[0:40]) #训练数据集的异常得分 print(clf.negative_outlier_factor_) #预测数据是否是异常值,正常值返回1,异常值返回-1 print(clf.predict(mix_data)) #预测数据的异常度:LOF的值越接近1,越可能是正常样本,LOF的值越大于1,则越可能是异常样本 y_score = -clf.decision_function(mix_data) print(y_score) # 生成画布 fig = plt.figure() #生成子图 ax1 = fig.add_subplot(121) ax1.set_title("标签-异常值显示图", fontproperties=font_set) ax1.scatter(mix_lable, -clf.decision_function(mix_data), c=mix_lable) ax1.set_xlabel('标签', fontproperties=font_set) ax1.set_ylabel('异常度', fontproperties=font_set) ax2 = fig.add_subplot(122) fpr, tpr, threshold = metrics.roc_curve(mix_lable, y_score) auc = metrics.auc(fpr, tpr)
lim_inf = X.min(axis=0) lim_sup = X.max(axis=0) volume_support = (lim_sup - lim_inf).prod() t = np.arange(0, 100 / volume_support, 0.01 / volume_support) axis_alpha = np.arange(alpha_min, alpha_max, 0.0001) unif = np.random.uniform(lim_inf, lim_sup, size=(n_generated, n_features)) # fit: print('IsolationForest processing...') iforest = IsolationForest() iforest.fit(X_train) s_X_iforest = iforest.decision_function(X_train) print('LocalOutlierFactor processing...') lof.fit(X_train) s_X_lof = lof.decision_function(X_train) print('OneClassSVM processing...') ocsvm.fit(X_train) s_X_ocsvm = ocsvm.decision_function(X_train).reshape(1, -1)[0] s_unif_iforest = iforest.decision_function(unif) s_unif_lof = lof.decision_function(unif) s_unif_ocsvm = ocsvm.decision_function(unif).reshape(1, -1)[0] plt.subplot(121) print("t ist: " ,t) print("t_max ist : " , t_max) print("volume_support ist: " , volume_support) print("unif ist: ", unif) auc_iforest, em_iforest, amax_iforest = em(t, t_max,
# Build a data set made of all samples of the inlier class plus a small,
# equal number of randomly drawn samples from each of the nine other classes,
# so that outliers make up roughly ``outlier_ratio`` of the total.
outlier_ratio = 0.05
nomal = inlier_num
data, __ = STL10_read()
anormal = list(range(10))
anormal.remove(nomal)

aa = data[nomal]
# Number of outliers to draw from each of the nine other classes.
o_num = (aa.shape[0] / (1 - outlier_ratio) - aa.shape[0]) / 9
label = nomal * np.ones((np.shape(aa)[0], 1))
for i in anormal:
    class_samples = data[i]
    # The builtin int() replaces np.int, which NumPy no longer provides.
    index = np.random.choice(np.shape(class_samples)[0], int(o_num))
    aa = np.vstack((aa, class_samples[index]))
    label = np.vstack((label, i * np.ones((int(o_num), 1))))

data = aa
# Flatten every sample to a 96 * 96 * 3 feature vector.
data = np.reshape(data, (-1, 96 * 96 * 3))

clf = LocalOutlierFactor(n_neighbors=200,
                         novelty=True,
                         contamination=outlier_ratio)
clf.fit(data)
# NOTE(review): with novelty=True scikit-learn documents that predict and
# decision_function should only be used on data not seen during fit; here
# they are applied to the training data itself. Verify that this is intended.
label_pred = clf.predict(data)
TPR, TNR, F1 = performance(label, label_pred, nomal)

score = -clf.decision_function(data)
fpr, tpr, thresholds = roc_curve(np.reshape(label, [np.shape(data)[0], 1]),
                                 score,
                                 pos_label=inlier_num)
print('auc=')
print(1 - auc(fpr, tpr))
X_test = X[n_samples_train:, :]
y_train = y[:n_samples_train]
y_test = y[n_samples_train:]

print('LocalOutlierFactor processing...')
# novelty=True is required to call decision_function on data other than the
# training set; without it the method is not available.
model = LocalOutlierFactor(n_neighbors=20, novelty=True)
tstart = time()
model.fit(X_train)
fit_time += time() - tstart

tstart = time()
scoring = -model.decision_function(X_test)  # the lower, the more normal
predict_time += time() - tstart
fpr_, tpr_, thresholds_ = roc_curve(y_test, scoring)

if fit_time + predict_time > max_time:
    raise TimeoutError

# Accumulate the ROC curve interpolated on the common x axis.
f = interp1d(fpr_, tpr_)
tpr += f(x_axis)
tpr[0] = 0.

precision_, recall_ = precision_recall_curve(y_test, scoring)[:2]

# cluster: old version of scipy -> interpol1d needs sorted x_input
arg_sorted = recall_.argsort()
recall_ = recall_[arg_sorted]
def get_clustermodel(df_train, df_test, outliers_fraction):
    """Fit Isolation Forest, LOF and One-Class SVM and score ``df_test``.

    Parameters
    ----------
    df_train : array-like
        Training samples used to fit all three models.
    df_test : array-like
        Samples for which decision-function values are computed.
    outliers_fraction : float
        Expected outlier proportion; used as ``contamination`` for Isolation
        Forest and LOF and as ``nu`` for the One-Class SVM.

    Returns
    -------
    tuple
        ``(iforest_model, lof_model, ocsvm_model, iforest_anomalyscore,
        lof_anomalyscore, ocsvm_anomalyscore)``.
    """
    ocsvm_max_train = 10000
    n_samples_train = df_train.shape[0]

    # define models:
    iforest_kwargs = dict(max_samples=100, random_state=42,
                          contamination=outliers_fraction)
    try:
        # Older scikit-learn releases accept behaviour="new".
        iforest = IsolationForest(behaviour="new", **iforest_kwargs)
    except TypeError:
        # Releases that removed the parameter reject it with a TypeError.
        iforest = IsolationForest(**iforest_kwargs)
    lof = LocalOutlierFactor(n_neighbors=20, algorithm='auto', leaf_size=30,
                             metric='minkowski',
                             contamination=outliers_fraction, novelty=True)
    ocsvm = OneClassSVM(kernel='linear', gamma='auto', coef0=0.0, tol=0.001,
                        nu=outliers_fraction, shrinking=True, cache_size=500,
                        verbose=False, max_iter=-1)
    print('end of iForest,lof and OCSVM model creation')

    iforest_model = iforest.fit(df_train)
    print('end of iForest model training')

    # LOF is built with novelty=True, so after fitting on df_train it can
    # score the unseen samples in df_test.
    lof_model = lof.fit(df_train)
    print('Local Outlier Factor test model completed')

    # The One-Class SVM is fitted on a capped prefix of the training data.
    ocsvm_model = ocsvm.fit(
        df_train[:min(ocsvm_max_train, n_samples_train - 1)])
    print('end of ocsvm model training!')

    # Anomaly scores (decision-function values) for the test data.
    iforest_anomalyscore = iforest.decision_function(df_test)
    lof_anomalyscore = lof.decision_function(df_test)
    ocsvm_anomalyscore = ocsvm.decision_function(df_test)
    print('end of models - Anomaly score!')

    return (iforest_model, lof_model, ocsvm_model,
            iforest_anomalyscore, lof_anomalyscore, ocsvm_anomalyscore)
X_test = X[n_samples_train:, :]
y_train = y[:n_samples_train]
y_test = y[n_samples_train:]

print('LocalOutlierFactor processing...')
# novelty=True is required to call decision_function on data other than the
# training set; without it the method is not available.
model = LocalOutlierFactor(n_neighbors=20, novelty=True)
tstart = time()
model.fit(X_train)
fit_time += time() - tstart

tstart = time()
scoring = -model.decision_function(X_test)  # the lower, the more normal
predict_time += time() - tstart
fpr_, tpr_, thresholds_ = roc_curve(y_test, scoring)

if fit_time + predict_time > max_time:
    raise TimeoutError

# Accumulate the ROC curve interpolated on the common x axis.
f = interp1d(fpr_, tpr_)
tpr += f(x_axis)
tpr[0] = 0.

precision_, recall_ = precision_recall_curve(y_test, scoring)[:2]

# cluster: old version of scipy -> interpol1d needs sorted x_input
arg_sorted = recall_.argsort()
recall_ = recall_[arg_sorted]
lim_sup = X.max(axis=0)
# Volume of the axis-aligned box spanned by the data.
volume_support = (lim_sup - lim_inf).prod()
t = np.arange(0, 100 / volume_support, 0.01 / volume_support)
axis_alpha = np.arange(alpha_min, alpha_max, 0.0001)
# Uniform samples drawn inside the bounding box of the data.
unif = np.random.uniform(lim_inf, lim_sup, size=(n_generated, n_features))

# fit:
print('IsolationForest processing...')
iforest = IsolationForest()
iforest.fit(X_train)
s_X_iforest = iforest.decision_function(X_test)

print('LocalOutlierFactor processing...')
# novelty=True is required to call decision_function on X_test and on the
# uniform samples; without it the method is not available.
lof = LocalOutlierFactor(n_neighbors=20, novelty=True)
lof.fit(X_train)
s_X_lof = lof.decision_function(X_test)

print('OneClassSVM processing...')
ocsvm = OneClassSVM()
# The One-Class SVM is fitted on a capped prefix of the training data.
ocsvm.fit(X_train[:min(ocsvm_max_train, n_samples_train - 1)])
s_X_ocsvm = ocsvm.decision_function(X_test).reshape(1, -1)[0]

s_unif_iforest = iforest.decision_function(unif)
s_unif_lof = lof.decision_function(unif)
s_unif_ocsvm = ocsvm.decision_function(unif).reshape(1, -1)[0]

plt.subplot(121)
auc_iforest, em_iforest, amax_iforest = em(t, t_max, volume_support,
                                           s_unif_iforest, s_X_iforest,
                                           n_generated)
auc_lof, em_lof, amax_lof = em(t, t_max, volume_support, s_unif_lof,
                               s_X_lof, n_generated)
# Generate sample data X_train, y_train, X_test, y_test = \ generate_data(n_train=n_train, n_test=n_test, n_features=2, contamination=contamination, random_state=42) # train LocalOutlierFactor clf_name = 'LOF' clf = LocalOutlierFactor(n_neighbors=3, novelty=False) clf.fit(X_train) # get the prediction labels and outlier scores of the training data y_train_pred = clf.predict(X_train) # binary labels (0: inliers, 1: outliers) y_train_scores = clf.decision_function(X_train) # raw outlier scores # get the prediction on the test data, cannot predict train data(normal) clf = LocalOutlierFactor(n_neighbors=3, novelty=True) clf.fit(X_train) y_test_pred = clf.predict(X_test) # outlier labels (0 or 1) y_test_scores = clf.decision_function(X_test) # outlier scores # Step 2: Determine the cut point import matplotlib.pyplot as plt plt.hist(y_test_scores, bins='auto') plt.title("Histogram with LOF Anomaly Scores") plt.show() test_scores = pd.DataFrame({'Scores': y_test_scores, 'Labels': y_test_pred}) pd.DataFrame({
# Mean neighbour distance of every grid point, laid out like the mesh grid.
knn_dist_all_grids, knn_ind_grids = model_knn.kneighbors(autoscaled_x_grids)
knn_dist_grids = knn_dist_all_grids.mean(axis=1).reshape(xx.shape)

# k-NN plot: contour at the distance threshold, drawn with the samples.
plt.title('k-NN')
plt.contour(xx, yy, knn_dist_grids,
            levels=[knn_dist_threshold],
            linewidths=2,
            colors='darkred')
plt.plot(x[:, 0], x[:, 1], 'x')
plt.xlabel('x1')
plt.ylabel('x2')
plt.show()

# LOF: novelty-mode estimator fitted on the autoscaled samples and evaluated
# on the autoscaled grid.
model_lof = LocalOutlierFactor(n_neighbors=k,
                               novelty=True,
                               contamination=rate_of_outliers)
model_lof.fit(autoscaled_x)
lof_grids = model_lof.decision_function(autoscaled_x_grids).reshape(xx.shape)

# LOF plot: the frontier is the contour where the decision function is 0.
plt.title('LOF')
plt.contour(xx, yy, lof_grids,
            levels=[0],
            linewidths=2,
            colors='darkred')
plt.plot(x[:, 0], x[:, 1], 'x')
plt.xlabel('x1')
plt.ylabel('x2')
plt.show()
data_scaled_means.to_csv("../../data/data_scaled_means.csv", index=False)

##############
### Machine learning models

## Isolation Forest
ilf = IsolationForest().fit(data_scaled_means)
answerIF_proba = abs(ilf.score_samples(data_scaled_means))
answerIF_proba = pd.DataFrame({'target': answerIF_proba})
# A context manager guarantees the model file is flushed and closed.
with open("../../data/model/IsolationForest", "wb") as model_file:
    pickle.dump(ilf, model_file)

## Local Outlier Factor
lof = LocalOutlierFactor(n_neighbors=2, novelty=True)
lof.fit(data_scaled_means)
# NOTE(review): with novelty=True scikit-learn documents that
# decision_function should only be used on data not seen during fit; here it
# is applied to the training data. Verify that this is intended.
answerLOF_proba = lof.decision_function(data_scaled_means)
# Min-max normalise, then invert so that the lowest decision value maps to 1.
answerLOF_proba = 1 - ((answerLOF_proba - answerLOF_proba.min()) /
                       (answerLOF_proba.max() - answerLOF_proba.min()))
answerLOF_proba = pd.DataFrame({'target': answerLOF_proba})
with open("../../data/model/LocalOutlierFactor", "wb") as model_file:
    pickle.dump(lof, model_file)

## Elliptic Envelope
ee = EllipticEnvelope()
ee.fit(data_scaled_means)
answerEE_proba = ee.decision_function(data_scaled_means)
answerEE_proba = 1 - (answerEE_proba - 3 * answerEE_proba.min()) * 10**12
answerEE_proba = pd.DataFrame({'target': answerEE_proba})
with open("../../data/model/EllipticEnvelope", "wb") as model_file:
    pickle.dump(ee, model_file)

##############
AE.fit(X_train) ae_pred_proba = AE.predict_proba(X_test)[:, 1] aucs_ae_ws[r] = evaluate.AUC(ae_pred_proba, y_test) auc_ae_ws = np.mean(aucs_ae_ws) # --- one-class-SVM --- # clf = svm.OneClassSVM(kernel="rbf") clf.fit(X_train) sklearn_score_anomalies = clf.decision_function(X_test) original_paper_score = [-1 * s + 0.5 for s in sklearn_score_anomalies] auc_svm_ws = evaluate.AUC(original_paper_score, y_test) # --- LOF --- # lof = LocalOutlierFactor(novelty=True) lof.fit(X_train) sklearn_score_anomalies = lof.decision_function(X_test) original_paper_score = [-1 * s + 0.5 for s in sklearn_score_anomalies] auc_lof_ws = evaluate.AUC(original_paper_score, y_test) # --- LODA --- # aucs_loda_ws = np.zeros(num_of_experiments) for r in tqdm(range(num_of_experiments)): loda = LODA() loda.fit(X_train) y_pred_proba_loda = np.zeros(X_test.shape[0]) for i in tqdm(range(X_test.shape[0])): loda.fit(X_test[i, :].reshape(1, -1)) y_pred_proba_loda[i] = loda.decision_function(X_test[i, :].reshape( 1, -1)) aucs_loda_ws[r] = evaluate.AUC(1 - y_pred_proba_loda, y_test) auc_loda_ws = np.mean(aucs_loda_ws)
class ApplicabilityDomain():
    """Applicability Domain (AD) estimator based on k-NN, LOF or one-class SVM.

    ``fit`` builds the chosen model on training data and derives a threshold
    from ``rate_of_outliers``; ``predict`` returns AD-values shifted by that
    threshold, so values lower than 0 mean "outside of the AD".
    """

    def __init__(self, method_name='ocsvm', rate_of_outliers=0.01, gamma='auto', nu=0.5, n_neighbors=10,
                 metric='minkowski', p=2):
        """
        Applicability Domain (AD)

        Parameters
        ----------
        method_name: str, default 'ocsvm'
            The name of method to set AD. 'knn', 'lof', or 'ocsvm'
        rate_of_outliers: float, default 0.01
            Rate of outlier samples. This is used to set threshold
        gamma : (only for 'ocsvm') float, default 'auto'
            Kernel coefficient for 'rbf'. Current default is 'auto' which
            optimizes gamma to maximize the variance of the Gram matrix
        nu : (only for 'ocsvm') float, default 0.5
            An upper bound on the fraction of training errors and a lower
            bound of the fraction of support vectors. Should be in the
            interval (0, 1]. By default 0.5 will be taken.
            https://scikit-learn.org/stable/modules/generated/sklearn.svm.OneClassSVM.html#sklearn.svm.OneClassSVM
        n_neighbors: (only for 'knn' and 'lof') int, default 10
            Number of neighbors to use for each query
            https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html
        metric : (only for 'knn' and 'lof') string or callable, default 'minkowski'
            Metric to use for distance computation. Any metric from
            scikit-learn or scipy.spatial.distance can be used.
            https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html
        p : (only for 'knn' and 'lof') integer, default 2
            Parameter for the Minkowski metric. When p = 1, this is equivalent
            to using manhattan_distance (l1), and euclidean_distance (l2) for
            p = 2. For arbitrary p, minkowski_distance (l_p) is used.
            https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html

        Raises
        ------
        SystemExit
            If ``method_name`` is not one of 'knn', 'lof' or 'ocsvm'.
        """
        if method_name not in ('knn', 'lof', 'ocsvm'):
            # Kept as sys.exit (SystemExit) so existing callers see the same failure mode.
            sys.exit(
                'There is no ad method named \'{0}\'. Please check the variable of method_name.'
                .format(method_name))

        self.method_name = method_name
        self.rate_of_outliers = rate_of_outliers
        self.gamma = gamma
        self.nu = nu
        self.n_neighbors = n_neighbors
        self.metric = metric
        self.p = p

    def fit(self, x):
        """
        Applicability Domain (AD)

        Set AD

        Parameters
        ----------
        x : numpy.array or pandas.DataFrame
            m x n matrix of X-variables of training data,
            m is the number of training samples and
            n is the number of X-variables
        """
        x = np.array(x)

        if self.method_name == 'ocsvm':
            if self.gamma == 'auto':
                ocsvm_gammas = 2**np.arange(-20, 11, dtype=float)
                # The RBF kernel is exp(-gamma * ||x - x'||^2), so the Gram matrix
                # must be built from *squared* Euclidean distances ('sqeuclidean');
                # 'seuclidean' is the standardized, non-squared distance.
                # The distance matrix does not depend on gamma: compute it once.
                squared_distances = cdist(x, x, metric='sqeuclidean')
                variance_of_gram_matrix = [
                    np.exp(-ocsvm_gamma * squared_distances).var(ddof=1)
                    for ocsvm_gamma in ocsvm_gammas
                ]
                # First gamma candidate that maximizes the Gram-matrix variance.
                self.optimal_gamma = ocsvm_gammas[
                    variance_of_gram_matrix.index(
                        max(variance_of_gram_matrix))]
            else:
                self.optimal_gamma = self.gamma
            self.ad = OneClassSVM(kernel='rbf', gamma=self.optimal_gamma, nu=self.nu)
            self.ad.fit(x)
            ad_values = np.ndarray.flatten(self.ad.decision_function(x))
        elif self.method_name == 'knn':
            # Forward the documented metric settings instead of silently using
            # the scikit-learn defaults.
            self.ad = NearestNeighbors(n_neighbors=self.n_neighbors,
                                       metric=self.metric, p=self.p)
            self.ad.fit(x)
            # kneighbors(None) queries the training samples themselves.
            knn_dist_all, knn_ind_all = self.ad.kneighbors(None)
            ad_values = 1 / (knn_dist_all.mean(axis=1) + 1)
        elif self.method_name == 'lof':
            # n_neighbors / metric / p are documented for 'lof' as well, so they
            # are passed through here too.
            self.ad = LocalOutlierFactor(n_neighbors=self.n_neighbors,
                                         metric=self.metric, p=self.p,
                                         novelty=True,
                                         contamination=self.rate_of_outliers)
            self.ad.fit(x)
            ad_values = self.ad.negative_outlier_factor_ - self.ad.offset_

        # Threshold: the AD-value at the `rate_of_outliers` quantile of the training data.
        self.offset = np.percentile(ad_values, 100 * self.rate_of_outliers)

    def predict(self, x):
        """
        Applicability Domain (AD)

        Predict AD-values

        Parameters
        ----------
        x : numpy.array or pandas.DataFrame
            k x n matrix of X-variables of test data, which is autoscaled with
            training data, and k is the number of test samples

        Returns
        -------
        ad_values : numpy.array, shape (n_samples,)
            values lower than 0 means outside of AD
        """
        x = np.array(x)

        if self.method_name == 'ocsvm':
            ad_values = np.ndarray.flatten(self.ad.decision_function(x))
        elif self.method_name == 'knn':
            knn_dist_all, knn_ind_all = self.ad.kneighbors(x)
            ad_values = 1 / (knn_dist_all.mean(axis=1) + 1)
        elif self.method_name == 'lof':
            ad_values = np.ndarray.flatten(self.ad.decision_function(x))

        return ad_values - self.offset
# Axis-aligned bounding box of the data and its volume.
lim_inf = X.min(axis=0)
lim_sup = X.max(axis=0)
volume_support = (lim_sup - lim_inf).prod()
# Grids passed to the EM computation, scaled by the support volume.
t = np.arange(0, 100 / volume_support, 0.01 / volume_support)
axis_alpha = np.arange(alpha_min, alpha_max, 0.0001)
# Uniform reference sample drawn inside the bounding box.
unif = np.random.uniform(lim_inf, lim_sup, size=(n_generated, n_features))

# fit:
print('IsolationForest processing...')
iforest = IsolationForest()
iforest.fit(X_train)
s_X_iforest = iforest.decision_function(X_test)

print('LocalOutlierFactor processing...')
# novelty=True is required: scikit-learn only exposes decision_function() on
# unseen data (X_test, unif) for a LocalOutlierFactor fitted in novelty mode.
lof = LocalOutlierFactor(n_neighbors=20, novelty=True)
lof.fit(X_train)
s_X_lof = lof.decision_function(X_test)

print('OneClassSVM processing...')
ocsvm = OneClassSVM()
# The SVM is trained on a capped prefix of the training set.
ocsvm.fit(X_train[:min(ocsvm_max_train, n_samples_train - 1)])
s_X_ocsvm = ocsvm.decision_function(X_test).reshape(1, -1)[0]

# Scores of the uniform reference sample under each model.
s_unif_iforest = iforest.decision_function(unif)
s_unif_lof = lof.decision_function(unif)
s_unif_ocsvm = ocsvm.decision_function(unif).reshape(1, -1)[0]

plt.subplot(121)
auc_iforest, em_iforest, amax_iforest = em(t, t_max, volume_support,
                                           s_unif_iforest, s_X_iforest,
                                           n_generated)
auc_lof, em_lof, amax_lof = em(t, t_max, volume_support, s_unif_lof, s_X_lof,
                               n_generated)
def eval_model(self, x, criterion='distance'):
    """Score the samples in ``x`` with the trained representation network.

    Parameters
    ----------
    x : array-like
        Input samples; converted with ``torch.FloatTensor``.
    criterion : str, default 'distance'
        'distance' -- sum of the per-sample MSE gap between ``r_net`` and
        ``r_target_net`` outputs and a pair-wise term computed against a
        shuffled copy of ``x``;
        'lof' -- ``1 - decision_function`` of a LocalOutlierFactor fitted on
        the ``r_net`` representations;
        'iforest' -- ``1 - decision_function`` of an IsolationForest fitted on
        the ``r_net`` representations.

    Returns
    -------
    numpy.ndarray
        One score per input sample.

    Raises
    ------
    ValueError
        If ``criterion`` is not one of the values above.
    """
    self.r_net.eval()
    # The shuffled copy is drawn up-front for every criterion so the amount of
    # global NumPy RNG state consumed per call does not depend on the branch.
    x_random = copy.deepcopy(x)
    np.random.shuffle(x_random)

    if criterion == 'distance':
        print('[INFO] Using criterion distance...')
        x = torch.FloatTensor(x)
        x_random = torch.FloatTensor(x_random)
        if self.USE_GPU:
            x = x.cuda()
            x_random = x_random.cuda()
        # Evaluation only: skip autograd bookkeeping, like the other criteria do.
        with torch.no_grad():
            r_target = self.r_target_net(x)
            r_pred = self.r_net(x)
            gap_loss = torch.mean(F.mse_loss(r_pred, r_target, reduction='none'), dim=1)

            r_target_random = self.r_target_net(x_random)
            r_pred_random = self.r_net(x_random)
            # Element-wise products of L1-normalised representations of each
            # sample and its shuffled partner, for target and predictor nets.
            xy = F.normalize(r_target, p=1, dim=1) * F.normalize(r_target_random, p=1, dim=1)
            x_y_ = F.normalize(r_pred, p=1, dim=1) * F.normalize(r_pred_random, p=1, dim=1)
            pair_wise_loss = torch.mean(F.mse_loss(xy, x_y_, reduction='none'), dim=1)

            scores = gap_loss + pair_wise_loss
        return scores.cpu().numpy()

    if criterion in ('lof', 'iforest'):
        if criterion == 'lof':
            print('[INFO] Using criterion LOF...')
            # NOTE(review): the detector is fitted and scored on the same
            # representations; confirm that is the intended protocol.
            clf = LocalOutlierFactor(novelty=True)
        else:
            print('[INFO] Using criterion iForest...')
            clf = IsolationForest()

        x = torch.FloatTensor(x)
        if self.USE_GPU:
            x = x.cuda()
        with torch.no_grad():
            r_pred = self.r_net(x)
        representations = r_pred.cpu().numpy()

        clf.fit(representations)
        scores = 1 - clf.decision_function(representations)
        return scores

    raise ValueError('Invalid criterion!')
class Profile:
    """Anomaly-detection profile for a single IP address.

    Probes are turned into a 3-feature sample ``[v_ie, v_dc, v_jit]``. The
    first ``train_size`` complete probes are collected as training samples;
    once enough are available a StandardScaler and a LocalOutlierFactor
    (novelty mode) are fitted, and later probes are classified against them.

    ``process`` returns a ``(label, score)`` pair, where ``label`` is
    1 (normal, or still training), -1 (anomaly), -2 (probe only partially
    received) or -3 (every packet of the probe was lost).
    """

    def __init__(self, ip, train_size=100, tx_interval=-1, score_window=10):
        self.ip_addr = ip
        self.tx_interval = tx_interval
        self.train_size = train_size
        self.samples = []  # training samples, each a (1, 3) feature array
        self.detector = LocalOutlierFactor(novelty=True)
        self.scaler = StandardScaler()
        self.KS_population = []  # reference jitter sets used by the KS test
        self._updated = False
        self._last_vjits = ringwindow(15)
        # The averaging window used over the anomaly scores. Larger windows
        # increase robustness but increase detection delay too.
        self.score_window = score_window
        self._last_scores = ringwindow(self.score_window, 1)
        self._last_labels = ringwindow(self.score_window, 1)
        self.n_packets_lost_lastprobe = 0

    def set_ip(self, ip):
        """Change the profiled IP address and mark the profile as updated."""
        self.ip_addr = ip
        self._updated = True

    def set_tx_interval(self, value):
        """Change the transmit interval and mark the profile as updated."""
        self.tx_interval = value
        self._updated = True

    def set_train_size(self, value):
        """Change the training-set size, trimming and refitting if needed."""
        self.train_size = value
        # take top most recent samples
        self.samples = self.samples[max(len(self.samples) - self.train_size, 0):]
        if not self.inTraining():
            # refit model to current samples
            self.scaler.fit(np.vstack(self.samples))
            self.detector = self.detector.fit(
                self.scaler.transform(np.vstack(self.samples)))
        self._updated = True

    def trainProgress(self):
        """Return the fraction of the training set collected so far."""
        return np.double(len(self.samples)) / np.double(self.train_size)

    def inTraining(self):
        """Return True while fewer than ``train_size`` samples are collected."""
        return len(self.samples) < self.train_size

    def process(self, raw_probe, printProgress=False):
        """Validate a raw probe, extract its features and train/score it.

        ``raw_probe`` is indexed as ``[tx_times, rx_times, mls_seq]``; an
        rx time of 0 marks a packet that got no response.

        Returns a ``(label, score)`` tuple (see the class docstring).
        """
        # check probe integrity
        n_lost_packets = np.sum(np.array(raw_probe[1]) == 0)  # number of those with no response
        if n_lost_packets == len(raw_probe[1]):  # all packets were lost
            return -3, self._last_labels.get_mean()
        if n_lost_packets > 0:  # some packets were lost (we can't accurately compute the probe)
            self.n_packets_lost_lastprobe = n_lost_packets
            if n_lost_packets <= 200:  # we will still try and execute if only a few were lost
                if self.inTraining():
                    # Partial probes are never learned from (_process answers
                    # (-2, 1) for them). Answer before the partial feature
                    # extraction, which averages the collected samples and
                    # would raise on an empty sample list.
                    return -2, 1
                # perform partial feature extraction
                x = self.extract_features_partial(raw_probe)
                # execute partial profile
                return self._process(x, printProgress, wasPartial=True)
            return -2, self._last_labels.get_mean()

        # no packets lost:
        self.n_packets_lost_lastprobe = 0
        # perform feature extraction
        x = self.extract_features(raw_probe)
        # train/execute profile
        return self._process(x, printProgress)

    def _process(self, x, printProgress=False, wasPartial=False):
        """Learn from, or score, one feature sample.

        While in training, complete samples are stored and ``(1, 1.0)`` is
        returned; partial samples yield ``(-2, 1)`` without being stored.
        """
        if self.inTraining() and wasPartial:
            return -2, 1
        if self.inTraining():
            self.samples.append(x)
            # take top most recent samples
            self.samples = self.samples[max(len(self.samples) - self.train_size, 0):]
            if not self.inTraining():
                # Training set just became complete: fit scaler and detector.
                self.scaler.fit(np.vstack(self.samples))
                self.detector = self.detector.fit(
                    self.scaler.transform(np.vstack(self.samples)))
            if printProgress:
                progressbar(self.train_size, len(self.samples), pretext="Training")
            self._updated = True
            return 1, 1.0

        # Classify in both cases so the label window is updated, but report a
        # partial probe as -2 regardless of the classification.
        label = self.classify_sample(x)
        if wasPartial:
            label = -2
        score = self._last_labels.get_mean()
        return label, score

    def score_sample(self, x):
        """Return the windowed mean decision value, or None while training."""
        if self.inTraining():
            return  # 1.0
        # model is trained
        return self._last_scores.insert_get_mean(
            self.detector.decision_function(self.scaler.transform(x))[0])

    def classify_sample(self, x):
        """Return 1 (normal) or -1 (anomaly) from the windowed mean label."""
        if self.inTraining():
            return 1
        # model is trained
        m_label = self._last_labels.insert_get_mean(
            self.detector.predict(self.scaler.transform(x))[0])  # 1:normal, -1:anomaly
        return -1 if m_label < 0 else 1

    def extract_features(self, raw_probe):
        """Compute ``[[v_ie, v_dc, v_jit]]`` from a fully received probe."""
        tx_times = np.array(raw_probe[0])
        rx_times = np.array(raw_probe[1])
        mls_seq = np.array(raw_probe[2])

        # Feature 1: v_ie
        rtt = rx_times - tx_times
        rtt_f = np.fft.fft(rtt)
        mls_seq_f = np.fft.fft(mls_seq)
        v_ie = np.sum(np.abs((rtt_f / mls_seq_f))**2) / len(rtt_f)  # total energy of impulse

        # Feature 2: v_dc
        if (mls_seq == 0).all():  # should not happen (means MLS was all zeros)
            v_dc = np.mean(rtt)
        else:
            v_dc = np.mean(rtt[mls_seq == 1])  # the average rtt of the largest payload pings

        # Feature 3: v_jit
        jitter = np.diff(rx_times, n=1)
        if len(self.KS_population) == 0:
            m_pv = 1
        else:
            # ks_2samp returns (statistic, pvalue); the 0.1 threshold and the
            # default of 1 above are p-value semantics, so take index 1.
            pvs = np.array([
                ks_2samp(reference, jitter)[1]
                for reference in self.KS_population
            ])
            m_pv = np.max(pvs)
        # 0.0 when the jitter differs significantly from every reference set.
        v_jit = 0.0 if m_pv < 0.1 else 1.0

        # update KS model
        set_size = 30
        if self.inTraining():
            # Always fill the population up to set_size, afterwards replace at random.
            if (len(self.KS_population) < set_size) or (np.random.rand() > 0.7):
                self.KS_population.append(jitter)
                self.KS_population = self.KS_population[
                    max(len(self.KS_population) - set_size, 0):]
                self._updated = True
        return np.array([[v_ie, v_dc, v_jit]])

    def extract_features_partial(self, raw_probe):
        """Compute features from a probe with lost packets.

        ``v_dc`` is computed from the packets that did arrive; ``v_ie`` and
        ``v_jit`` are replaced by their averages over the stored samples, so
        at least one sample must already have been collected.
        """
        tx_times = np.array(raw_probe[0])
        rx_times = np.array(raw_probe[1])
        mls_seq = np.array(raw_probe[2])
        good = rx_times != 0  # packets that got a response
        rtt = rx_times[good] - tx_times[good]
        average_sample = np.mean(np.vstack(self.samples), axis=0)

        # Feature 1: v_ie AVERAGE (not tested)
        v_ie = average_sample[0]

        # Feature 2: v_dc
        if (mls_seq == 0).all():  # should not happen (means MLS was all zeros)
            v_dc = np.mean(rtt)
        else:
            v_dc = np.mean(rtt[mls_seq[good] == 1])  # the average rtt of the largest payload pings

        # Feature 3: v_jit AVERAGE (not tested)
        v_jit = average_sample[2]

        return np.array([[v_ie, v_dc, v_jit]])
def identify_outliers(df, algorithm=0, detailed=False):
    """Identify outliers in multi-dimensional data.

    The dataset has to be parsed as numeric beforehand. The last column of
    ``df`` is treated as the target and excluded; only numeric feature
    columns are used.

    Steps:
      1. flag, per feature, the rows that are not within 3 standard
         deviations of the feature mean;
      2. show a box plot of the standardized features;
      3. use the share of rows flagged in any feature as the contamination
         estimate for a multivariate detector;
      4. display ``df`` sorted by anomaly score with outliers highlighted.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; the last column is the target.
    algorithm : int, default 0
        2 -> OneClassSVM, 1 -> LocalOutlierFactor, anything else ->
        IsolationForest.
    detailed : bool, default False
        Print the per-feature outlier report.

    Returns
    -------
    tuple
        ``(df_scaled, df_styled, df_outliers, df_pred, outliers_count)``.
    """
    df_exclude_target = df.iloc[:, :-1]
    # keep only numeric type features
    df_numeric = df_exclude_target.select_dtypes(include=[np.number])
    # total length of the dataframe, used for computing contamination later
    total_length = len(df_numeric)
    # number of outliers of each feature
    outliers_count = np.zeros(len(df_numeric.columns))
    df_union = pd.DataFrame()

    for i, col in enumerate(df_numeric.columns):
        column = df_numeric[col]
        # Rows whose value is NOT strictly within +/- 3 standard deviations of
        # the column mean (the negation also flags NaN values).
        within_three_sigma = np.abs(column - column.mean()) < (3 * column.std())
        column_outliers = df_numeric[~within_three_sigma]
        # combine all the rows containing outliers in one feature
        df_union = df_union.combine_first(column_outliers)
        if len(column_outliers) != 0:
            outliers_count[i] = len(column_outliers)
            if detailed:
                print("There are {} outliers in variable {}".format(len(column_outliers), col))
                print(column_outliers[col])
                print("")
        elif detailed:
            print("No outliers are detected in variable {}".format(col))
            print("")

    # boxplot: show outliers in each feature
    # feature scaling
    ss = StandardScaler()
    df_scaled = ss.fit_transform(df_numeric)
    df_scaled = pd.DataFrame(df_scaled, columns=df_numeric.columns)

    # draw box plot for numeric variables
    fig = plt.figure(figsize=(6, 4))
    fig.subplots_adjust(top=0.93, wspace=0)
    ax = sns.boxplot(data=df_scaled, palette="Set1")
    ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
    plt.show()

    # Two options to estimate the proportion of outliers:
    # one is to take the number of outliers in the feature containing most
    # outliers; the other (used here) is to take the length of the union of
    # rows containing outliers in any feature.
    max_outliers = len(df_union)
    contamination = max_outliers / total_length

    # IsolationForest and LocalOutlierFactor only accept 'auto' or a float in
    # (0, 0.5]: fall back to 'auto' when no univariate outlier was found and
    # clamp estimates above one half instead of letting the estimator raise.
    if contamination <= 0:
        detector_contamination = 'auto'
    else:
        detector_contamination = min(contamination, 0.5)

    X = np.asarray(df_numeric)

    if algorithm == 2:
        clf = svm.OneClassSVM(nu=0.95 * contamination + 0.05)
        clf.fit(X)
        y_pred = clf.predict(X)
    elif algorithm == 1:
        clf = LocalOutlierFactor(n_neighbors=20, contamination=detector_contamination)
        y_pred = clf.fit_predict(X)
    else:
        clf = IsolationForest(contamination=detector_contamination)
        clf.fit(X)
        y_pred = clf.predict(X)

    outlier_index, = np.where(y_pred == -1)
    df_outliers = df_numeric.iloc[outlier_index.tolist()]

    if algorithm == 1:
        # decision function is only available for novelty detection in LOF,
        # so the +1 / -1 predictions double as the score.
        anomaly_score = y_pred
    else:
        # The anomaly score of the input samples. The lower, the more abnormal.
        anomaly_score = clf.decision_function(X)

    # Reuse df's index so the scores stay attached to their own rows even when
    # df does not carry a default RangeIndex.
    anomaly_score = pd.DataFrame(anomaly_score, columns=['anomaly_score'], index=df.index)
    df_with_anomaly_score = pd.concat([df, anomaly_score], axis=1)
    df_sorted = df_with_anomaly_score.sort_values(by='anomaly_score')
    cm = sns.diverging_palette(10, 220, sep=80, n=7, as_cmap=True)
    df_styled = df_sorted.style.background_gradient(
        cmap=cm, subset=['anomaly_score']).apply(
            highlight_outlier, subset=df_sorted.columns[:-1])

    df_pred = pd.DataFrame(y_pred, columns=['pred'])
    display(df_styled)
    return df_scaled, df_styled, df_outliers, df_pred, outliers_count