class TestKnnMedian(unittest.TestCase):
    # Unit tests for the pyod KNN detector with method='median'.
    # NOTE(review): another class with this exact name appears later in the
    # file; if both live in one module the later definition shadows this
    # one and these tests never run -- confirm and rename if both wanted.

    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.75
        # NOTE(review): recent pyod versions return (X_train, X_test,
        # y_train, y_test); this unpack order matches the older
        # (X_train, y_train, X_test, y_test) API -- verify against the
        # pinned pyod version.
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)
        self.clf = KNN(contamination=self.contamination, method='median')

    def test_fit(self):
        # fit() must run without raising on the training data
        self.clf.fit(self.X_train)

    def test_decision_function(self):
        self.clf.fit(self.X_train)
        # scoring must work on both seen and unseen data
        self.clf.decision_function(self.X_train)
        self.clf.decision_function(self.X_test)

    def test_sklearn_estimator(self):
        # sklearn API-compliance check on the unfitted-by-check estimator
        check_estimator(self.clf)

    def tearDown(self):
        pass
class TestKnnMedian(unittest.TestCase):
    # Tests for KNN(method='median'); generate_data is unpacked here in the
    # (X_train, X_test, y_train, y_test) order used by newer pyod releases.

    def setUp(self):
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.X_test, self.y_train, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)
        self.clf = KNN(contamination=self.contamination, method='median')

    def test_fit(self):
        # fit() must run without raising on the training data
        self.clf.fit(self.X_train)

    def test_decision_function(self):
        self.clf.fit(self.X_train)
        # scoring must work on both seen and unseen data
        self.clf.decision_function(self.X_train)
        self.clf.decision_function(self.X_test)

    def test_model_clone(self):
        # detector must be clonable via sklearn.base.clone
        clone_clf = clone(self.clf)

    def tearDown(self):
        pass
def knn(X_train, y_train=None, X_test=None, y_test=None):
    """Train a pyod KNN detector, predict on the test set and plot.

    Parameters are the train/test feature matrices and (optional) labels.
    Returns (y_train_pred, y_train_scores).
    """
    clf_name = 'KNN'
    clf = KNN()
    clf.fit(X_train)

    # prediction labels and raw outlier scores of the training data
    y_train_pred = clf.labels_       # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # prediction on the test data
    y_test_pred = clf.predict(X_test)              # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # FIX: pyod's visualize() signature is (clf_name, X_train, y_train,
    # X_test, y_test, y_train_pred, y_test_pred, ...); the y_train/y_test
    # arguments were missing, which shifted the remaining positionals.
    visualize(clf_name, X_train, y_train, X_test, y_test,
              y_train_pred, y_test_pred,
              show_figure=True, save_figure=False)
    return y_train_pred, y_train_scores
def pyodtry():
    """Score log entries with a pyod KNN (method='mean') detector and
    export the rows flagged as outliers to an Excel workbook.

    Relies on module-level dataframes ``df_en_all`` and ``dff2`` and the
    ``reduce`` feature-reduction helper defined elsewhere in this project.
    """
    dfwhole = df_en_all
    df = dff2
    X1 = reduce(dfwhole)   # training features
    X2 = reduce(df)        # features to score
    ddf = pd.read_pickle('LogFileDfs/original')

    outliers_fraction = 0.005
    clf = KNN(method='mean', contamination=outliers_fraction)
    clf.fit(X1)

    # FIX: removed dead code -- an unused 200x200 meshgrid, an unused
    # RandomState, an unused negated decision_function() result, and a
    # commented-out DB connection string carrying hard-coded credentials.
    y_pred = clf.predict(X2)  # 1 = outlier, 0 = inlier
    n_inliers = len(y_pred) - np.count_nonzero(y_pred)
    n_outliers = np.count_nonzero(y_pred == 1)
    print('OUTLIERS : ', n_outliers, 'INLIERS : ', n_inliers)

    # label each scored row, propagate onto the original frame and keep
    # only the flagged ones
    df['authenticated?'] = y_pred.tolist()
    ddf['authenticated?'] = df['authenticated?']
    output = ddf[ddf['authenticated?'] == 1]

    with pd.ExcelWriter(
            '/home/richul/Documents/EnhancingMailServerSecurity/Output/output.xlsx'
    ) as writer:
        output.to_excel(writer, sheet_name='output')
class IForestSupervisedKNN(BaseDetector):
    """Two-stage detector: an IForest screens the data, then a KNN is
    fitted only on the ``get_top`` fraction of most normal-looking samples.
    """

    def __init__(self, get_top=0.8, if_params=None, knn_params=None):
        """
        get_top : fraction (0-1] of lowest-scoring samples kept for KNN.
        if_params / knn_params : kwargs for IForest / KNN.

        FIX: the defaults were mutable dicts ({}); replaced with None
        sentinels to avoid the shared-mutable-default pitfall. Behavior is
        unchanged since the dicts are only unpacked.
        """
        super(IForestSupervisedKNN, self).__init__()
        self.get_top = get_top
        self.is_fitted = False
        self.iforest = IForest(**(if_params or {}))
        self.knn = KNN(**(knn_params or {}))

    def fit(self, X, y=None):
        """Fit IForest on X, then fit KNN on the most normal samples."""
        X = check_array(X)
        self._set_n_classes(y)
        self.iforest.fit(X)
        # per-sample probability of being an outlier
        scores = self.iforest.predict_proba(X)[:, 1]
        # keep the get_top fraction with the lowest outlier probability
        normal_instances = X[np.argsort(scores)[:int(len(X) * self.get_top)]]
        self.knn.fit(normal_instances)
        self.decision_scores_ = self.decision_function(X)
        self._process_decision_scores()
        self.is_fitted = True
        return self

    def decision_function(self, X):
        """Outlier scores from the KNN stage; requires a prior fit()."""
        check_is_fitted(self, ['is_fitted'])
        return self.knn.decision_function(X)
def obj_func_kNN(params):
    """Objective function used in Bayesian optimization of a pyod KNN.

    params : sequence [outlier_fraction, n_neighbors, method, radius].
    Returns the objective value computed by ``objVal_f``.
    """
    outlier_fraction, n_neighbors = params[0], params[1]
    method, radius = params[2], params[3]

    # load the data set into the function workspace
    Y_train = np.load('Y_train.npy')
    X_train = np.load('X_train.npy')

    # build and fit the detector with the candidate hyper-parameters
    model = KNN(contamination=outlier_fraction, n_neighbors=n_neighbors,
                method=method, radius=radius)
    model.fit(X_train)

    # negated raw anomaly scores, as expected by Rprecision_f
    scores_pred = model.decision_function(X_train) * -1
    Rprecision = Rprecision_f(Y_train, scores_pred)
    if glb_verbose:
        print('R Precision : ', Rprecision)

    # per-point outlier/inlier prediction feeds the objective
    y_pred = model.predict(X_train)
    return objVal_f(Rprecision, y_pred, Y_train)
def calculate(method, total_roc, total_prn, x_train, x_test, y_train, y_test):
    """Fit the named detector, print train/test performance and append the
    test ROC-AUC and precision@n to the accumulator lists.

    method : 'KNN' | 'CBLOF' | 'PCA'; anything else falls back to IForest.
    total_roc, total_prn : lists mutated in place.
    """
    if method == 'KNN':
        clf = KNN()
    elif method == 'CBLOF':
        clf = CBLOF()
    elif method == 'PCA':
        clf = PCA()
    else:
        clf = IForest()
    clf.fit(x_train)  # train detector clf on x_train

    # labels (0: inlier, 1: outlier) and raw scores on the training data
    y_train_pred = clf.labels_
    y_train_scores = clf.decision_scores_  # larger = more anomalous
    print("On train Data:")
    evaluate_print(method, y_train, y_train_scores)

    # predict on the unseen data
    y_test_pred = clf.predict(x_test)
    y_test_scores = clf.decision_function(x_test)
    print("On Test Data:")
    evaluate_print(method, y_test, y_test_scores)

    y_true = column_or_1d(y_test)
    y_pred = column_or_1d(y_test_scores)
    check_consistent_length(y_true, y_pred)

    # FIX: the original assignment ended with a stray trailing comma,
    # which made ``roc`` a 1-element tuple instead of a float.
    roc = np.round(roc_auc_score(y_true, y_pred), decimals=4)
    prn = np.round(precision_n_scores(y_true, y_pred), decimals=4)
    total_roc.append(roc)
    total_prn.append(prn)
def detectarOutlierKNN(self, idmodelo, Xtodos, corteOutlier):
    """Flag samples whose KNN outlier score exceeds ``corteOutlier`` by
    setting their ``tpamostra`` column to 'OUTLIER' in the database.

    Returns ``cont`` -- the number of scores iterated (== number of rows
    in Xtodos), NOT the number of rows updated.
    """
    # Outlier detection, stage 1 ------------------------------------------
    clf = KNN()
    clf.fit(Xtodos)

    # raw outlier scores on the fitted data; y_test_scores re-scores the
    # same matrix and is currently unused
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(Xtodos)

    # sample ids for this model, assumed positionally aligned with Xtodos
    # -- TODO confirm selectMatrizY preserves row order
    YCodigoTodosComOutilier = self.selectMatrizY(idmodelo, "ID", "TODOS")

    cont = 0
    amostrasRemovidas = 0
    # NOTE(review): O(n^2) -- the inner loop only seeks index ``cont``;
    # direct indexing would be linear. The SQL is also built by string
    # concatenation; prefer a parameterized query.
    for itemOutilier in y_train_scores:
        if itemOutilier > corteOutlier:
            contTodos = 0
            for item in YCodigoTodosComOutilier:
                # strip the numpy-style brackets from the stringified id
                amostra = str(item)
                amostra = amostra.replace("[", "")
                amostra = amostra.replace("]", "")
                if contTodos == cont:
                    db.execute(
                        " update amostra set tpamostra = 'OUTLIER' where idamostra = "
                        + str(amostra) + " and idmodelo = " + str(
                            idmodelo) + "")
                    print(itemOutilier)
                    amostrasRemovidas = amostrasRemovidas + 1
                    break
                contTodos = contTodos + 1
        cont = cont + 1
    session.commit()
    print("Numero de Amostras Removidas: " + str(amostrasRemovidas))
    return cont
def api_alert(influxdb_ip, influxdb_port, influxdb_user, influxdb_pwd, influxdb_database, influxdb_table, apiid): timelimit = 'time > now()-1d' # 访问influxdb client = InfluxDBClient(influxdb_ip, influxdb_port, influxdb_user, influxdb_pwd, influxdb_database) # 获取当前API一天前的数据 result = client.query('select Average, CallCount, ErrorRate from ' + influxdb_table + ' where ApiId = \'' + apiid + '\' and ' + timelimit + ';') # 把resultset格式的数据转换成list格式 apis_table = list(result.get_points(measurement='apis')) # 把要处理的数据存成DataFrame df = pd.DataFrame(data=apis_table) # 去掉不参与运算的列,取训练集x x = df x = x.drop("time", axis=1) # 数据处理一下,归一化,映射到[0,1] x['CallCount'] = (x['CallCount']-x['CallCount'].min()) / \ (x['CallCount'].max()-x['CallCount'].min()) x['Average'] = (x['Average']-x['Average'].min()) / \ (x['Average'].max()-x['Average'].min()) x['ErrorRate'] = x['ErrorRate'] / 100 # 取最后十秒的数据点作为测试点 x_last = x.tail(1) #df_last = df.tail(1) x = x.drop(x.index[-1]) df = df.drop(df.index[-1]) # 转换成numpy格式准备计算 x = x.values # 训练一个kNN检测器 clf_name = 'kNN' clf = KNN() # 初始化检测器clf clf.fit(x) # 使用X_train训练检测器clf # 给df添加一列显示异常分数 df['score'] = clf.decision_scores_ # 排序分数 df = df.sort_values("score", ascending=False) #print(df.head(20)) # 新数据预测 test_data = x_last test_scores = clf.decision_function(test_data) if (test_scores > 0.8): print('数据点异常程度4,必须报警') elif (test_scores > 0.5): print('数据点异常程度3,需要报警') elif (test_scores > 0.1): print('数据点异常程度2,建议报警') elif (test_scores > 0.05): print('数据点异常程度1,可以报警') #这个分级是根据KNN.py的图像分析出来的,0.05以上的很明显是异常点,0.1以上已经出现了离群现象,0.5以上就距离数据点很远了。 #这个值根据训练用的时间相关,一天的数据0.05比较合适。 return test_scores
def model_test(model_type, y_train, y_test, X_train, X_test, model_file, save_flag):
    """Train the selected detector, optionally pickle it, and report
    train/test performance (evaluate_print + confusion matrices).

    model_type : 'KNN' | 'XGBOD' | 'SOD' | 'VAE'
    save_flag  : '1' to pickle the fitted model to ``model_file``.
    Returns model_file.
    Raises ValueError for an unknown model_type (previously the if-chain
    fell through and the code crashed later with a NameError on ``clf``).
    """
    if model_type == 'KNN':
        clf_name = 'KNN'
        clf = KNN()
        clf.fit(X_train)
    elif model_type == 'XGBOD':
        clf_name = 'XGBOD'
        # scale_pos_weight ~ sum(negative instances) / sum(positive instances)
        clf = XGBOD(random_state=42, scale_pos_weight=50)
        clf.fit(X_train, y_train)
    elif model_type == 'SOD':
        # SOD is meant for high dimensions (d > 2); 2D is used here only
        # for visualization purposes, so expect higher precision in higher
        # dimensions.
        clf_name = 'SOD'
        clf = SOD()
        clf.fit(X_train)
    elif model_type == 'VAE':
        # Beta-VAE detector
        clf_name = 'VAE'
        contamination = 0.01
        clf = VAE(epochs=30, contamination=contamination, gamma=0.8,
                  capacity=0.2)
        clf.fit(X_train)
    else:
        raise ValueError('unknown model_type: {}'.format(model_type))

    # save model if specified
    # FIX: the file handle from open() was never closed; use a context
    # manager.
    if save_flag == '1':
        with open(model_file, "wb") as f:
            pickle.dump(clf, f)

    # labels and raw outlier scores on the training data
    y_train_pred = clf.labels_       # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_

    # predictions on the test data
    y_test_pred = clf.predict(X_test)              # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    conf_train = confusion_matrix(y_train, y_train_pred)
    print("<<<< confusion matrix for train: ", conf_train)

    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
    conf_test = confusion_matrix(y_test, y_test_pred)
    print("<<<< confusion matrix for test: ", conf_test)

    # TODO: visualize() requires 2-d input; re-enable when applicable.
    return model_file
def detect_anomaly(df):
    """Fit a pyod KNN on df.change and append outlier labels/scores.

    Adds columns 'out_label' (0/1) and 'out_score' to df; returns df.
    """
    clf = KNN()
    # reshape the 1-d change series into an (n, 1) feature matrix
    y_values = df.change.values.reshape(df.change.values.shape[0], 1)
    clf.fit(y_values)
    # FIX: predict() was invoked three times (two results discarded) and
    # an unused duplicate x_values array was built; predict once.
    df["out_label"] = clf.predict(y_values)
    df["out_score"] = clf.decision_function(y_values)
    return df
def detect_anomaly(df):
    """Fit a pyod KNN on df.change; append 'label_knn' (0/1) and
    'score_knn' (score rounded to 4 dp) columns and return df.
    """
    # reshape the 1-d change series into an (n, 1) feature matrix
    y_values = df.change.values.reshape(df.change.values.shape[0], 1)
    clf = KNN()
    clf.fit(y_values)
    # FIX: removed the unused x_values array and a duplicate predict()
    # call whose result was discarded.
    df["label_knn"] = clf.predict(y_values)
    df["score_knn"] = clf.decision_function(y_values).round(4)
    return df
def abnormal_KNN(train_npy, test_npy):
    """Fit a pyod kNN detector on a 1-d training sequence and return the
    0/1 outlier labels predicted for the test sequence.
    """
    clf = KNN()
    # column-vector feature matrices for the univariate series
    clf.fit(np.array(train_npy).reshape(-1, 1))
    # FIX: removed an unused decision_function() call and the unused
    # clf_name local; only the predicted labels were ever returned.
    return clf.predict(np.array(test_npy).reshape(-1, 1))
class KNNPredictionHead:
    """Adapter that lets a pyod KNN detector consume torch tensors.

    Inputs are detached and moved to CPU numpy before reaching the
    underlying detector (presumably shaped (n_samples, n_features) --
    TODO confirm with callers).
    """

    def __init__(self, n_neighbors=5, method="largest", **kwargs):
        # FIX: extra keyword arguments were accepted but silently dropped;
        # forward them to the underlying KNN detector.
        self.model = KNN(n_neighbors=n_neighbors, method=method, **kwargs)

    def fit(self, X):
        """Fit the detector on a torch tensor X."""
        return self.model.fit(X.detach().cpu().numpy())

    def decision_function(self, X):
        """Return outlier scores for a torch tensor X."""
        return self.model.decision_function(X.detach().cpu().numpy())
def get_KNN_scores(dataframe, cols, outliers_fraction=0.01, standardize=True):
    """Fit a pyod KNN on the selected columns and mark outliers.

    Parameters
    ----------
    dataframe : pandas DataFrame; modified in place when standardizing and
        when the 'outlier' column is added.
    cols : list of column names to use as features.
    outliers_fraction : contamination passed to KNN.
    standardize : min-max scale the selected columns into [0, 1] first.

    Side effects: stores the annotated frame on ``CheckOutliers.df4`` and
    prints the inlier/outlier counts.
    """
    if standardize:
        # standardize selected variables
        minmax = MinMaxScaler(feature_range=(0, 1))
        dataframe[cols] = minmax.fit_transform(dataframe[cols])

    # assemble the (n_samples, len(cols)) feature matrix column by column
    arrays = [dataframe[col].values.reshape(-1, 1) for col in cols]
    X = np.concatenate(arrays, axis=1)

    # fit
    clf = KNN(contamination=outliers_fraction)
    clf.fit(X)

    # FIX: removed the unused ``scores_pred`` (negated decision_function);
    # only the binary prediction is consumed below.
    y_pred = clf.predict(X)
    n_inliers = len(y_pred) - np.count_nonzero(y_pred)
    n_outliers = np.count_nonzero(y_pred == 1)

    CheckOutliers.df4 = dataframe
    CheckOutliers.df4['outlier'] = y_pred.tolist()
    print('OUTLIERS:', n_outliers, 'INLIERS:', n_inliers, 'found with KNN')
# In[169]: # 训练一个kNN检测器 clf_name = 'kNN' clf = KNN() # 初始化检测器 clf.fit(new_origin_all[:pos]) # 使用训练集训练检测器clf # 返回训练数据X_train上的异常标签和异常分值 y_train_pred = clf.labels_ # 返回训练数据上的分类标签 (0: 正常值, 1: 异常值) y_train_scores = clf.decision_scores_ # 返回训练数据上的异常值 (分值越大越异常) # 用训练好的clf来预测未知数据中的异常值 y_test_pred = clf.predict(new_origin_all[pos:]) # 返回未知数据上的分类标签 (0: 正常值, 1: 异常值) y_test_scores = clf.decision_function(new_origin_all[pos:]) # 返回未知数据上的异常值 show_scatter(clf_name, df, y_train_pred, pos) # In[170]: clf_name = 'COF' clf = COF(n_neighbors=30) clf.fit(new_origin_all[:pos]) # get the prediction labels and outlier scores of the training data y_train_pred = clf.labels_ # binary labels (0: inliers, 1: outliers) y_train_scores = clf.decision_scores_ # raw outlier scores
clf.fit(trainData) # get the prediction labels and outlier scores of the training data y_train_pred = clf.labels_ # binary labels (0: inliers, 1: outliers) #print(y_train_pred) y_train_scores = clf.decision_scores_ # raw outlier scores #print(y_train_scores) # get the prediction on the test data y_test_pred = clf.predict(testData) #X_test) # outlier labels (0 or 1) print(y_test_pred) from sklearn.metrics import accuracy_score accuracy_percentage = accuracy_score(testTarget, y_test_pred) * 100 print("The prediction accuracy is:", end=" ") print(accuracy_percentage) y_test_scores = clf.decision_function(testData) #X_test) # outlier scores #print(y_test_scores) # evaluate and print the results #print("\nOn Training Data:") #evaluate_print(clf_name, y_train, y_train_scores) #print("\nOn Test Data:") #evaluate_print(clf_name, y_test, y_test_scores) #isualize(clf_name, trainData, trainTarget, testData, testTarget, y_train_pred, # y_test_pred, show_figure=True, save_figure=False)
df['score'] = clf.decision_scores_ #df['pred'] = clf.labels_ df = df.sort_values("score", ascending=False) print(df.head(10)) print(df['CallCount'].max()) max_score = df.head(1)['score'].values print(max_score) df['prob'] = df['score'] / max_score print(df.head(10)) ''' # 用训练好的clf来预测未知数据中的异常值 y_test_pred = clf.predict(X_test) # 返回未知数据上的分类标签 (0: 正常值, 1: 异常值) y_test_scores = clf.decision_function(X_test) # 返回未知数据上的异常值 (分值越大越异常)''' # 新数据预测 testdata = {"Average": 150, "CallCount": 130, 'ErrorRate': 100} test = pd.DataFrame(data=testdata, index=[0]) print(test) test_scores = clf.decision_function(test) print(test_scores) alert = pd.DataFrame(df.loc[df['score'] > 10]) # 数据展示 plt.title(timelimit) plt.xlabel('Average') plt.ylabel('CallCount') plt.scatter('Average', 'CallCount', s=1, data=df) plt.scatter('Average', 'CallCount', s=2, c='red', data=alert) #plt.scatter('Average', 'CallCount', s=3, c='green', data=test) plt.show()
y_item = np.array(y_item) contam /= len(y_item) contam = min(0.5, contam) # 定义模型 classifiers = { 'KNN': KNN(contamination=contam), 'LOF': LOF(contamination=contam), 'PCA': PCA(contamination=contam), 'LODA': LODA(contamination=contam) } for cls in classifiers: clf = classifiers[cls] t0 = time.time() x_item = standardizer(x_item) clf.fit(x_item) y_scores = clf.decision_function(x_item) t1 = time.time() duration = round(t1 - t0, ndigits=4) roc = round(roc_auc_score(y_item, y_scores), ndigits=4) prn = round(precision_n_scores(y_item, y_scores), ndigits=4) results[cls].append(roc) print( 'benchmark id:{bench_id}, model:{clf_name}, ROC:{roc}, precision @ rank n:{prn}, ' 'execution time: {duration}s'.format( bench_id=bench_id, clf_name=cls, roc=roc, prn=prn,
# Split the data, fit a KNN detector and record the test ROC-AUC and
# precision @ rank n.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3,
                                                    random_state=1)
clf_name = 'KNN'
clf = KNN()       # initialise the detector
clf.fit(X_train)  # fit on X_train

# labels and raw outlier scores on the training data
y_train_pred = clf.labels_             # 0: inlier, 1: outlier
y_train_scores = clf.decision_scores_  # larger = more anomalous
print("On train Data:")
evaluate_print(clf_name, y_train, y_train_scores)

# score the held-out data
y_test_pred = clf.predict(X_test)              # labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # anomaly scores
print("On Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)

y_true = column_or_1d(y_test)
y_pred = column_or_1d(y_test_scores)
check_consistent_length(y_true, y_pred)

# FIX: a stray trailing comma made ``roc`` a 1-element tuple instead of
# a float.
roc = np.round(roc_auc_score(y_true, y_pred), decimals=4)
prn = np.round(precision_n_scores(y_true, y_pred), decimals=4)
knn_roc.append(roc)
knn_prn.append(prn)

clf_name = 'LOF'
clf = LOF()  # initialise the next detector
plt.ylabel('F2')
plt.show()

'''
KNN -> K-Nearest Neighbors Detector
For an observation, its distance to its kth nearest neighbors could be
viewed as the outlying scores
Method:
-Largest
-Average
-Median
'''
clf_name = 'KNN'
clf = KNN()
clf.fit(X_train)

# labels and raw scores on the training data
X_train_pred = clf.labels_
X_train_score = clf.decision_scores_

# negated scores: higher = more normal (sklearn-style sign convention)
score_pred = clf.decision_function(X_train)*-1
y_pred = clf.predict(X_train)
n_errors = (y_pred != y_train).sum()
print('No of Errors:', clf_name, n_errors)

# visualization: decision surface over a fixed [-10, 10]^2 grid
xx, yy = np.meshgrid(np.linspace(-10, 10, 300), np.linspace(-10, 10, 300))
# score value separating the predicted outlier fraction
threshold = stats.scoreatpercentile(score_pred, 100*outlier_fraction)
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1
Z = Z.reshape(xx.shape)

# fill blue colormap from minimum anomaly score to threshold value
plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 10),
             cmap=plt.cm.Blues_r)
# red contour at the decision threshold; orange region above it
a = plt.contour(xx, yy, Z, levels=[threshold], linewidths=2, colors='red')
plt.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange')
# white = leading inlier rows, black = the trailing n_outliers rows
# (assumes outliers are stored last in X_train -- TODO confirm)
b = plt.scatter(X_train[:-n_outliers, 0], X_train[:-n_outliers, 1],
                c='white', s=20, edgecolor='k')
c = plt.scatter(X_train[-n_outliers:, 0], X_train[-n_outliers:, 1],
                c='black', s=20, edgecolor='k')
class TestKnn(unittest.TestCase):
    # End-to-end tests of the pyod KNN public API on synthetic data;
    # the detector is fitted once in setUp and shared by every test.

    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6  # minimum acceptable test ROC-AUC
        # NOTE(review): newer pyod returns (X_train, X_test, y_train,
        # y_test); confirm this older unpack order matches the pinned
        # version.
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)
        self.clf = KNN(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        # sklearn API-compliance check
        check_estimator(self.clf)

    def test_parameters(self):
        # all fitted attributes must be populated after fit()
        assert_true(hasattr(self.clf, 'decision_scores_') and
                    self.clf.decision_scores_ is not None)
        assert_true(hasattr(self.clf, 'labels_') and
                    self.clf.labels_ is not None)
        assert_true(hasattr(self.clf, 'threshold_') and
                    self.clf.threshold_ is not None)
        assert_true(hasattr(self.clf, '_mu') and
                    self.clf._mu is not None)
        assert_true(hasattr(self.clf, '_sigma') and
                    self.clf._sigma is not None)

    def test_train_scores(self):
        # one score per training sample
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)
        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])
        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores),
                       self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        # probabilities must lie in [0, 1]
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        # an unknown probability-conversion method must raise ValueError
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')
def test_fit_predict(self):
    """fit_predict must return one label per training sample."""
    labels = self.clf.fit_predict(self.X_train)
    assert_equal(labels.shape, self.y_train.shape)

def test_fit_predict_score(self):
    """fit_predict_score supports the built-in scorers, rejects others."""
    self.clf.fit_predict_score(self.X_test, self.y_test)
    for scorer in ('roc_auc_score', 'prc_n_score'):
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring=scorer)
    with assert_raises(NotImplementedError):
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='something')

def test_predict_rank(self):
    """Raw ranks preserve score order and stay within [0, n_train]."""
    scores = self.clf.decision_function(self.X_test)
    ranks = self.clf._predict_rank(self.X_test)
    assert_allclose(rankdata(ranks), rankdata(scores), atol=2)
    assert_array_less(ranks, self.X_train.shape[0] + 1)
    assert_array_less(-0.1, ranks)

def test_predict_rank_normalized(self):
    """Normalized ranks preserve score order and live in [0, 1]."""
    scores = self.clf.decision_function(self.X_test)
    ranks = self.clf._predict_rank(self.X_test, normalized=True)
    assert_allclose(rankdata(ranks), rankdata(scores), atol=2)
    assert_array_less(ranks, 1.01)
    assert_array_less(-0.1, ranks)

def tearDown(self):
    pass
n_features=2, contamination=contamination, random_state=42)
# NOTE(review): the line above is the tail of a generate_data(...) call
# whose opening lies outside this chunk.

# train kNN detector
clf_name = 'KNN'
clf = KNN()
clf.fit(X_train)

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_             # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf.predict(X_test)              # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores

# evaluate and print the results
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)

# visualize the results (this call continues in a following chunk)
visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
          y_test_pred,
class TestKnn(unittest.TestCase):
    # API tests for pyod KNN on synthetic data; detector fitted in setUp.
    # (This chunk is truncated: the last test method continues elsewhere.)

    def setUp(self):
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8  # minimum acceptable test ROC-AUC
        self.X_train, self.X_test, self.y_train, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)
        self.clf = KNN(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_parameters(self):
        # all fitted attributes must be populated after fit()
        assert (hasattr(self.clf, 'decision_scores_') and
                self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and
                self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_') and
                self.clf.threshold_ is not None)
        assert (hasattr(self.clf, '_mu') and
                self.clf._mu is not None)
        assert (hasattr(self.clf, '_sigma') and
                self.clf._sigma is not None)

    def test_train_scores(self):
        # one score per training sample
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)
        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])
        # check performance
        assert (roc_auc_score(self.y_test, pred_scores) >= self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        # probabilities must lie in [0, 1]
        pred_proba = self.clf.predict_proba(self.X_test)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_parameter(self):
        # unknown conversion method must raise ValueError
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_prediction_labels_confidence(self):
        # predict with return_confidence=True yields (labels, confidence);
        # the assertions continue in a following chunk
        pred_labels, confidence = self.clf.predict(self.X_test,
                                                   return_confidence=True)
        # (continuation of test_prediction_labels_confidence from the
        #  previous chunk)
        assert_equal(pred_labels.shape, self.y_test.shape)
        assert_equal(confidence.shape, self.y_test.shape)
        assert (confidence.min() >= 0)
        assert (confidence.max() <= 1)

    def test_prediction_proba_linear_confidence(self):
        # predict_proba with confidence returns both arrays, each bounded
        pred_proba, confidence = self.clf.predict_proba(
            self.X_test, method='linear', return_confidence=True)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)
        assert_equal(confidence.shape, self.y_test.shape)
        assert (confidence.min() >= 0)
        assert (confidence.max() <= 1)

    def test_fit_predict(self):
        # fit_predict returns one label per training sample
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        # built-in scorers work; unknown scorers are rejected
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        # ranks must preserve score order and stay within [0, n_train]
        pred_socres = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)
        # assert the order is reserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_socres), atol=2)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        # normalized ranks live in [0, 1]
        pred_socres = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)
        # assert the order is reserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_socres), atol=2)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def test_model_clone(self):
        # detector must be clonable via sklearn.base.clone
        clone_clf = clone(self.clf)

    def tearDown(self):
        pass
class TestKnnMahalanobis(unittest.TestCase):
    # Tests KNN with a precomputed-covariance mahalanobis metric.
    # (Chunk truncated: the final predict_proba call continues elsewhere.)

    def setUp(self):
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8  # minimum acceptable test ROC-AUC
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        # calculate covariance for mahalanobis distance
        X_train_cov = np.cov(self.X_train, rowvar=False)
        self.clf = KNN(algorithm='auto', metric='mahalanobis',
                       metric_params={'V': X_train_cov})
        self.clf.fit(self.X_train)

    def test_parameters(self):
        # all fitted attributes must be populated after fit()
        assert (hasattr(self.clf, 'decision_scores_') and
                self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and
                self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_') and
                self.clf.threshold_ is not None)
        assert (hasattr(self.clf, '_mu') and
                self.clf._mu is not None)
        assert (hasattr(self.clf, '_sigma') and
                self.clf._sigma is not None)

    def test_train_scores(self):
        # one score per training sample
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)
        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])
        # check performance
        assert (roc_auc_score(self.y_test, pred_scores) >= self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        # probabilities must lie in [0, 1]
        pred_proba = self.clf.predict_proba(self.X_test)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_parameter(self):
        # unknown conversion method must raise ValueError
        # (this call is continued in a following chunk)
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test,
                                   method='something')
        # (the line above completes the predict_proba call started in the
        #  previous chunk)

    def test_fit_predict(self):
        # fit_predict returns one label per training sample
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        # built-in scorers work; unknown scorers are rejected
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        # ranks must preserve score order (looser atol=3 for this metric)
        pred_socres = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)
        # assert the order is reserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_socres), atol=3)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        # normalized ranks live in [0, 1]
        pred_socres = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)
        # assert the order is reserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_socres), atol=3)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass
def get_out_score(X, observers):
    """Fit a KNN detector on the observer set and score every row of X.

    Prints the observer/X size ratio as a sanity check, then returns the
    pyod outlier score of each sample in X.
    """
    ratio = observers.shape[0] / X.shape[0] * 100
    print('>> internal check : r = ', ratio)
    detector = KNN()
    detector.fit(observers)
    return detector.decision_function(X)
# # kNN # In[3]: #kNN clf_name = 'kNN' clf = KNN(method='median') # In[4]: #用训练集训练 clf.fit(X_train) y_train_pred = clf.labels_ y_train_scores = clf.decision_scores_ y_test_pred = clf.predict(X_test) y_test_scores = clf.decision_function(X_test) #评价性能 roc_train = round(roc_auc_score(y_train, y_train_scores), 4) prn_train = round(precision_n_scores(y_train, y_train_scores), ndigits=4) roc_test = round(roc_auc_score(y_test, y_test_scores), 4) prn_test = round(precision_n_scores(y_test, y_test_scores), ndigits=4) # In[5]: #输出计算得到的roc_auc和precision @ rank n print("\nOn Train Data:") print(clf_name, 'roc:', roc_train, 'precision @ rank n:', prn_train) print("\nOn Test Data:") print(clf_name, 'roc:', roc_test, 'precision @ rank n:', prn_test) # In[6]:
Hence, we use a library called **pyod** which hosts a number of outlier
detection algorithms.
"""

# NOTE(review): bare shell command from an exported notebook; as plain
# Python this is a syntax error. It should be `!pip install pyod` in a
# notebook or `pip install pyod` in a shell.
pip install pyod

"""### KNN Classifier (Proximity-Based)"""

from pyod.models.knn import KNN  # kNN detector

# train kNN detector
clf_name = 'KNN'
clf = KNN()
clf.fit(X)

y_pred = clf.predict(X)              # outlier labels (0 or 1)
y_scores = clf.decision_function(X)  # outlier scores

y_pred

"""0 means normal value while 1 means anomalous value."""

colors = np.array(['#377eb8', '#ff7f00'])
# NOTE(review): with y_pred in {0, 1}, (y_pred - 1) // 2 maps inliers to
# index -1 and outliers to 0, which inverts the intended colours;
# colors[y_pred] looks like what was meant -- confirm.
plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[(y_pred - 1) // 2])

"""Finding the ROC Accuracy score for the prediction label."""

# NOTE(review): this scores the model against its own predictions rather
# than ground-truth labels -- confirm that is intended.
clf.fit_predict_score(X[:, 0].reshape(-1,1), y_pred, scoring='roc_auc_score')

"""### Angle-based Outlier Detector (Probabilistic Based Model)"""

from pyod.models import abod
#划分测试集和训练集 X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33) #使用pyod中的KNN算法拟合数据 clf_name = 'KNN' clf = KNN() clf.fit(X_train) #预测得到由0和1组成的数组,1表示离群点,0表示飞离群点 y_train_pred = clf.labels_ # binary labels (0: inliers, 1: outliers) y_train_scores = clf.decision_scores_ # raw outlier scores,The outlier scores of the training data. #预测样本是不是离群点,返回0和1 的数组 y_test_pred = clf.predict(X_test) y_test_scores = clf.decision_function( X_test) # outlier scores,The anomaly score of the input samples. #使用sklearn中的roc_auc_score方法得到auc值,即roc曲线下面的面积 try: sumAuc_train += sklearn.metrics.roc_auc_score(y_train, y_train_scores, average='macro') sumAuc_test += sklearn.metrics.roc_auc_score(y_test, y_test_scores, average='macro') #s=precision_score(y_train, y_train_scores, average='macro') i += 1 print(sumAuc_train, sumAuc_test) except ValueError: pass #得到ROC值和精确度 prn
# Ensemble of n_clf kNN detectors with increasing k; their normalized
# scores are combined by average, maximum and average-of-maximum (AOM).
k_list = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140,
          150, 160, 170, 180, 190, 200]

train_scores = np.zeros([X_train.shape[0], n_clf])
test_scores = np.zeros([X_test.shape[0], n_clf])

print('Combining {n_clf} kNN detectors'.format(n_clf=n_clf))

for i in range(n_clf):
    k = k_list[i]
    clf = KNN(n_neighbors=k, method='largest')
    clf.fit(X_train_norm)
    # one column of scores per base detector
    train_scores[:, i] = clf.decision_scores_
    test_scores[:, i] = clf.decision_function(X_test_norm)

# Decision scores have to be normalized before combination
train_scores_norm, test_scores_norm = standardizer(train_scores,
                                                   test_scores)
# Combination by average
y_by_average = average(test_scores_norm)
evaluate_print('Combination by Average', y_test, y_by_average)

# Combination by max
y_by_maximization = maximization(test_scores_norm)
evaluate_print('Combination by Maximization', y_test, y_by_maximization)

# Combination by aom
y_by_aom = aom(test_scores_norm, n_buckets=5)
evaluate_print('Combination by AOM', y_test, y_by_aom)
# Generate sample data X_train, y_train, X_test, y_test = \ generate_data(n_train=n_train, n_test=n_test, n_features=2, contamination=contamination, random_state=42) # train kNN detector clf_name = 'KNN' clf = KNN() clf.fit(X_train) # get the prediction labels and outlier scores of the training data y_train_pred = clf.labels_ # binary labels (0: inliers, 1: outliers) y_train_scores = clf.decision_scores_ # raw outlier scores # get the prediction on the test data y_test_pred = clf.predict(X_test) # outlier labels (0 or 1) y_test_scores = clf.decision_function(X_test) # outlier scores # evaluate and print the results print("\nOn Training Data:") evaluate_print(clf_name, y_train, y_train_scores) print("\nOn Test Data:") evaluate_print(clf_name, y_test, y_test_scores) # visualize the results visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred, y_test_pred, show_figure=True, save_figure=True)
# Score matrices for an ensemble of n_clf kNN detectors with growing k;
# combined below by average, maximum and average-of-maximum (AOM).
k_list = [
    10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160,
    170, 180, 190, 200
]
train_scores = np.zeros([X_train.shape[0], n_clf])
test_scores = np.zeros([X_test.shape[0], n_clf])

for i in range(n_clf):
    k = k_list[i]
    clf = KNN(n_neighbors=k, method='largest')
    clf.fit(X_train_norm)
    # one column of scores per base detector
    train_scores[:, i] = clf.decision_scores_
    test_scores[:, i] = clf.decision_function(X_test_norm)

# decision scores have to be normalized before combination
train_scores_norm, test_scores_norm = standardizer(train_scores,
                                                   test_scores)
# combination by average
y_by_average = average(test_scores_norm)
evaluate_print('Combination by Average', y_test, y_by_average)

# combination by max
y_by_maximization = maximization(test_scores_norm)
evaluate_print('Combination by Maximization', y_test, y_by_maximization)

# combination by aom
y_by_aom = aom(test_scores_norm, n_buckets=5)
evaluate_print('Combination by AOM', y_test, y_by_aom)
# In[2]: # train kNN detector clf_name = 'KNN' clf = KNN() clf.fit(X_train) # get the prediction labels and outlier scores of the training data y_train_pred = clf.labels_ # binary labels (0: inliers, 1: outliers) y_train_scores = clf.decision_scores_ # raw outlier scores # get the prediction on the test data y_test_pred = clf.predict(X_test) # outlier labels (0 or 1) y_test_scores = clf.decision_function(X_test) # outlier scores # In[3]: # evaluate and print the results print("\nOn Training Data:") evaluate_print(clf_name, y_train, y_train_scores) print("\nOn Test Data:") evaluate_print(clf_name, y_test, y_test_scores) # In[4]: