def detect_anomaly(df):
    """Label and score the ``change`` column of *df* with a pyod KNN detector.

    The detector is fitted on ``df.change`` and then applied to those same
    values.

    :param df: DataFrame with a ``change`` column; it is modified in place.
    :return: the same ``df`` with two added columns: ``label_knn`` (output of
        ``clf.predict``) and ``score_knn`` (output of
        ``clf.decision_function`` rounded to 4 decimals).
    """
    # The detector needs a 2-D input, so reshape the column to (n, 1).
    y_values = df.change.values.reshape(-1, 1)

    clf = KNN()
    clf.fit(y_values)

    df["label_knn"] = clf.predict(y_values)
    df["score_knn"] = clf.decision_function(y_values).round(4)
    return df
def detect_anomaly(df):
    """Label and score the ``change`` column of *df* with a pyod KNN detector.

    The detector is fitted on ``df.change`` and then applied to those same
    values.

    :param df: DataFrame with a ``change`` column; it is modified in place.
    :return: the same ``df`` with two added columns: ``out_label`` (output of
        ``clf.predict``) and ``out_score`` (output of
        ``clf.decision_function``).
    """
    clf = KNN()

    # The detector needs a 2-D input, so reshape the column to (n, 1).
    y_values = df.change.values.reshape(-1, 1)
    clf.fit(y_values)

    df["out_label"] = clf.predict(y_values)
    df["out_score"] = clf.decision_function(y_values)
    return df
def knn(X_train, y_train=None, X_test=None, y_test=None):
    """Fit a KNN detector on *X_train*, plot the result and return train output.

    :param X_train: training samples the detector is fitted on.
    :param y_train: ground-truth labels for *X_train*; only forwarded to
        ``visualize``.
    :param X_test: samples to predict on; passed to ``clf.predict``.
    :param y_test: ground-truth labels for *X_test*; only forwarded to
        ``visualize``.
    :return: tuple ``(y_train_pred, y_train_scores)`` taken from the fitted
        detector's ``labels_`` and ``decision_scores_`` attributes.
    """
    clf_name = 'KNN'
    clf = KNN()
    clf.fit(X_train)

    # Prediction labels and raw outlier scores of the training data.
    y_train_pred = clf.labels_
    y_train_scores = clf.decision_scores_

    # Outlier labels predicted for the test data.
    y_test_pred = clf.predict(X_test)

    # NOTE(review): assumes ``visualize`` is pyod.utils.example.visualize,
    # whose positional order is (clf_name, X_train, y_train, X_test, y_test,
    # y_train_pred, y_test_pred).  The ground-truth labels were previously
    # omitted from the call, shifting every later argument.
    visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
              y_test_pred, show_figure=True, save_figure=False)

    return y_train_pred, y_train_scores
def obj_func_kNN(params):
    """Objective function used in Bayesian optimization of a KNN detector.

    :param params: indexable with four entries, in order: contamination
        (outlier fraction), number of neighbors, scoring method and radius.
    :return: the value computed by ``objVal_f`` for this parameter set.
    """
    contamination, k, scoring_method, ball_radius = (
        params[0], params[1], params[2], params[3])

    # The data set is read from the working directory on every evaluation.
    labels = np.load('Y_train.npy')
    features = np.load('X_train.npy')

    detector = KNN(contamination=contamination,
                   n_neighbors=k,
                   method=scoring_method,
                   radius=ball_radius)
    detector.fit(features)

    # Raw anomaly scores, sign-flipped before being handed to Rprecision_f.
    negated_scores = detector.decision_function(features) * -1
    r_precision = Rprecision_f(labels, negated_scores)
    if glb_verbose:
        print('R Precision : ', r_precision)

    # Category prediction (outlier or inlier) for every data point.
    categories = detector.predict(features)
    return objVal_f(r_precision, categories, labels)
def pyodtry():
    """Fit KNN on the full data set, flag rows of ``dff2`` and export them.

    Reads the module-level frames ``df_en_all`` and ``dff2``, fits a KNN
    detector on the reduced form of the former and predicts on the reduced
    form of the latter.  The predictions are written into ``dff2`` (column
    ``'authenticated?'``), copied onto the pickled original frame, and the
    rows predicted as ``1`` are written to an Excel workbook.

    Side effects: mutates ``dff2``, prints the outlier/inlier counts and
    writes ``output.xlsx`` at a hard-coded path.
    """
    dfwhole = df_en_all
    df = dff2
    X1 = reduce(dfwhole)
    X2 = reduce(df)
    ddf = pd.read_pickle('LogFileDfs/original')

    outliers_fraction = 0.005
    clf = KNN(method='mean', contamination=outliers_fraction)
    clf.fit(X1)
    y_pred = clf.predict(X2)

    n_inliers = len(y_pred) - np.count_nonzero(y_pred)
    n_outliers = np.count_nonzero(y_pred == 1)
    print('OUTLIERS : ', n_outliers, 'INLIERS : ', n_inliers)

    # NOTE(review): a value of 1 in 'authenticated?' is the detector's
    # outlier label, so the exported rows are the flagged ones.
    df['authenticated?'] = y_pred.tolist()
    ddf['authenticated?'] = df['authenticated?']
    output = ddf[ddf['authenticated?'] == 1]

    with pd.ExcelWriter(
            '/home/richul/Documents/EnhancingMailServerSecurity/Output/output.xlsx'
    ) as writer:
        output.to_excel(writer, sheet_name='output')
def get_all_readings_from_person(self, person_tag, remove_outliers=0, additional_where=""):
    """Load all readings of one person and optionally drop KNN outliers.

    :param person_tag: value matched with ``like`` against
        ``self.person_column``.  When *additional_where* is given the tag is
        inserted unquoted, otherwise it is wrapped in single quotes.
    :param remove_outliers: contamination passed to the KNN detector; rows
        predicted as outliers are removed when it is greater than 0.
    :param additional_where: extra SQL appended verbatim after the ``like``
        clause.
    :return: the query result (with outlier rows removed if requested).
        The unfiltered result is also stored in ``self.data``.
    """
    print(self.file_path)

    # NOTE(review): the statement is assembled by string formatting, so
    # person_tag and additional_where must come from trusted code only.
    columns = ', '.join(self.features)
    if len(additional_where) > 0:
        query = "select {} from {} where {} like {} {}".format(
            columns, self.table_name, self.person_column, person_tag,
            additional_where)
    else:
        query = "select {} from {} where {} like '{}'".format(
            columns, self.table_name, self.person_column, person_tag)

    dataset = sqlite3.connect(self.file_path)
    try:
        to_return = self.get_data_sql_query(query, dataset)
    finally:
        # Release the database handle even if the query fails.
        dataset.close()
    self.data = to_return

    if remove_outliers > 0:
        knn = KNN(contamination=remove_outliers)
        # The detector is fitted on the feature columns only; drop() already
        # returns a new frame, so the original is left untouched.
        features_only = to_return.drop(self.label_tag, axis=1)
        knn.fit(features_only)
        pred = knn.predict(features_only)
        # Keep the rows predicted as inliers (label 0).
        to_return = to_return.iloc[np.where(pred == 0)[0], :]

    return to_return
def calculate(method, total_roc, total_prn, x_train, x_test, y_train, y_test):
    """Fit one detector, print its evaluation and record ROC / precision@n.

    :param method: ``'KNN'``, ``'CBLOF'`` or ``'PCA'``; any other value
        selects ``IForest``.  Also used as the label in the printed report.
    :param total_roc: list that receives the rounded test ROC AUC.
    :param total_prn: list that receives the rounded test precision @ rank n.
    :param x_train: training samples the detector is fitted on.
    :param x_test: samples scored with ``decision_function``.
    :param y_train: ground-truth labels for *x_train*.
    :param y_test: ground-truth labels for *x_test*.
    :return: None; results are appended to *total_roc* and *total_prn*.
    """
    if method == 'KNN':
        clf = KNN()
    elif method == 'CBLOF':
        clf = CBLOF()
    elif method == 'PCA':
        clf = PCA()
    else:
        clf = IForest()

    # Train the detector on x_train.
    clf.fit(x_train)

    # Outlier scores on the training data (larger means more abnormal).
    y_train_scores = clf.decision_scores_
    print("On train Data:")
    evaluate_print(method, y_train, y_train_scores)

    # Outlier scores of the trained detector on the unseen data.
    y_test_scores = clf.decision_function(x_test)
    print("On Test Data:")
    evaluate_print(method, y_test, y_test_scores)

    y_true = column_or_1d(y_test)
    y_pred = column_or_1d(y_test_scores)
    check_consistent_length(y_true, y_pred)

    # A stray trailing comma previously turned ``roc`` into a 1-tuple.
    roc = np.round(roc_auc_score(y_true, y_pred), decimals=4)
    prn = np.round(precision_n_scores(y_true, y_pred), decimals=4)

    total_roc.append(roc)
    total_prn.append(prn)
def median_knn(X_train, X_test, Y_train, Y_test):
    """Fit a median-distance KNN detector and report its accuracy in percent.

    :param X_train: samples the detector is fitted on.
    :param X_test: samples to predict on.
    :param Y_train: accepted for signature symmetry; not read.
    :param Y_test: expected labels compared element-wise with the predictions.
    :return: fraction of matching predictions multiplied by 100.  The raw
        fraction is also printed.
    """
    from pyod.models.knn import KNN

    detector = KNN(method='median')
    detector.fit(X_train)
    predicted = detector.predict(X_test)

    n_matches = np.sum(predicted == Y_test)
    acc = n_matches / X_test.shape[0]
    print(acc)
    return (acc * 100)
def model_test(model_type, y_train, y_test, X_train, X_test, model_file, save_flag):
    """Train one detector, optionally pickle it and print its evaluation.

    :param model_type: one of ``'KNN'``, ``'XGBOD'``, ``'SOD'``, ``'VAE'``.
    :param y_train: ground-truth training labels (also used to fit XGBOD).
    :param y_test: ground-truth test labels.
    :param X_train: training samples.
    :param X_test: test samples.
    :param model_file: path the fitted model is pickled to.
    :param save_flag: the model is saved only when this equals the string
        ``'1'``.
    :return: *model_file*, unchanged.
    :raises ValueError: if *model_type* is not one of the supported names.
    """
    if model_type == 'KNN':
        clf = KNN()
        clf.fit(X_train)
    elif model_type == 'XGBOD':
        # scale_pos_weight is meant to be
        # sum(negative instances) / sum(positive instances).
        clf = XGBOD(random_state=42, scale_pos_weight=50)
        clf.fit(X_train, y_train)
    elif model_type == 'SOD':
        # Note that SOD is meant to work in high dimensions d > 2.
        clf = SOD()
        clf.fit(X_train)
    elif model_type == 'VAE':
        # Beta-VAE detector.
        contamination = 0.01
        clf = VAE(epochs=30,
                  contamination=contamination,
                  gamma=0.8,
                  capacity=0.2)
        clf.fit(X_train)
    else:
        # Previously an unknown name fell through and failed later with an
        # unbound ``clf``.
        raise ValueError("Unsupported model_type: {!r}".format(model_type))

    # The report label is the same string as the model type.
    clf_name = model_type

    # Save the model if requested; the context manager closes the file.
    if save_flag == '1':
        with open(model_file, "wb") as model_fh:
            pickle.dump(clf, model_fh)

    # Prediction labels and outlier scores of the training data.
    y_train_pred = clf.labels_
    y_train_scores = clf.decision_scores_

    # Predictions on the test data.
    y_test_pred = clf.predict(X_test)
    y_test_scores = clf.decision_function(X_test)

    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    conf_train = confusion_matrix(y_train, y_train_pred)
    print("<<<< confusion matrix for train: ", conf_train)

    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
    conf_test = confusion_matrix(y_test, y_test_pred)
    print("<<<< confusion matrix for test: ", conf_test)

    # TODO: visualization needs 2-d input data and is therefore not done.
    return model_file
def abnormal_KNN(train_npy, test_npy):
    """Fit KNN on one-dimensional training values and label the test values.

    Both inputs are converted to arrays and reshaped to a single feature
    column before use.

    :param train_npy: array-like of training values.
    :param test_npy: array-like of values to label.
    :return: the output of ``clf.predict`` for the reshaped test values.
    """
    clf = KNN()

    train_npy = np.array(train_npy).reshape(-1, 1)
    clf.fit(train_npy)

    test_npy = np.array(test_npy).reshape(-1, 1)
    return clf.predict(test_npy)
def main(args):
    """Train KNN, PCA and VAE detectors on a .mat data set and print metrics.

    The data is split into train and held-out parts, and the held-out part is
    split again in half into validation and evaluation sets.

    :param args: namespace providing ``filename``, ``train_split`` (passed as
        ``test_size`` of the first split), ``components``, ``epochs``,
        ``contamination``, ``gamma`` and ``capacity``.
    """
    data = loadmat(args.filename)
    trainx, testx, trainy, testy = train_test_split(data['X'],
                                                    data['y'],
                                                    test_size=args.train_split,
                                                    random_state=2)
    # NOTE(review): this second split has no random_state, so the
    # validation/evaluation partition differs between runs.
    valx, evalx, valy, evaly = train_test_split(testx, testy, test_size=0.5)

    data_size = len(trainx[0])
    # Layer widths must be whole numbers, hence floor division.
    encoder_neurons = [data_size, data_size // 2, data_size // 4]

    # Factories keep construction lazy: each detector is only built right
    # before it is fitted.
    detectors = (
        ("KNN", lambda: KNN()),
        ("PCA", lambda: PCA(n_components=args.components)),
        ("VAE", lambda: VAE(encoder_neurons=encoder_neurons,
                            decoder_neurons=encoder_neurons[::-1],
                            epochs=args.epochs,
                            contamination=args.contamination,
                            gamma=args.gamma,
                            capacity=args.capacity)),
    )

    for name, build in detectors:
        clf = build()
        clf.fit(trainx)
        print("Results Validation {}".format(name))
        print_metrics(valy, clf.predict(valx))
        print("Results Evaluation {}".format(name))
        print_metrics(evaly, clf.predict(evalx))
class RemoveOutliers():
    """Transformer that drops the rows a KNN detector predicts as outliers."""

    def __init__(self):
        self.estimator = KNN()

    def _remove(self, X):
        """Return the rows of *X* whose predicted label is 0 (inlier)."""
        preds = self.estimator.predict(X)
        # The predictions are 0/1 labels, not row positions: using them
        # directly as an index would only ever pick rows 0 and 1.  Compare
        # against 0 to obtain a boolean row mask instead.
        return X[preds == 0]

    def fit(self, X, y=None):
        """Fit the underlying detector on *X*; *y* is ignored."""
        self.estimator.fit(X)
        return self

    def transform(self, X, y=None):
        """Return *X* without the rows predicted as outliers."""
        return self._remove(X)
def outlier_detection(dataset, features=None, contamination=0.01, n_neighbors=5,
                      method="mean", metric="minkowski"):
    """Detect outliers with pyod KNN and store the labels in *dataset*.

    Three kNN scoring methods are supported:
      - largest: use the distance to the kth neighbor as the outlier score
      - mean (default here): use the average of all k neighbors
      - median: use the median of the distance to k neighbors

    :param dataset: DataFrame holding the feature columns.  It is modified
        in place: an ``outlier`` column is overwritten if present, otherwise
        inserted.
    :param features: list of column names to detect outliers upon.  When
        omitted, ``distance``, ``average_speed``, ``average_acceleration``,
        ``direction`` and ``stopped`` are used.
    :param contamination: float in (0., 0.5), (default=0.01)
        The proportion of outliers in the data set.
    :param n_neighbors: int, (default=5)
        Number of neighbors to use for k neighbors queries.
    :param method: str, (default='mean') {'largest', 'mean', 'median'}
    :param metric: string or callable, default 'minkowski'
        Metric to use for distance computation.
    :return: *dataset*, with the ``outlier`` column set to 1 for outliers
        and 0 otherwise.
    """
    if features is None:
        # Built per call so the default list is never shared between calls.
        features = ["distance", "average_speed", "average_acceleration",
                    "direction", "stopped"]

    clf = KNN(contamination=contamination, n_neighbors=n_neighbors,
              method=method, metric=metric)

    inp_data = dataset.loc[:, features]
    clf.fit(inp_data)
    labels = clf.predict(inp_data)

    if "outlier" in dataset:
        dataset["outlier"] = labels
    else:
        # Insert as the third column where possible; clamp the position so
        # frames with fewer than two columns do not raise.
        dataset.insert(min(2, dataset.shape[1]), "outlier", labels)
    return dataset
def get_KNN_scores(dataframe, cols, outliers_fraction=0.01, standardize=True):
    """Flag outliers in the selected columns with a pyod KNN detector.

    :param dataframe: DataFrame to analyse.  It is modified in place: the
        selected columns are min-max scaled when *standardize* is true and
        an ``outlier`` column with the predicted labels is added.
    :param cols: list of selected column names.
    :param outliers_fraction: contamination passed to the detector
        (default 0.01).
    :param standardize: scale the selected columns to [0, 1] first.
    :return: *dataframe* with the ``outlier`` column added.  The same frame
        is also stored on ``CheckOutliers.df4``, and the outlier/inlier
        counts are printed.
    """
    if standardize:
        minmax = MinMaxScaler(feature_range=(0, 1))
        dataframe[cols] = minmax.fit_transform(dataframe[cols])

    # Stack the selected columns side by side into an (n_rows, n_cols) array.
    X = dataframe[list(cols)].values

    clf = KNN(contamination=outliers_fraction)
    clf.fit(X)

    # Category of every data point: 1 for outlier, 0 for inlier.
    y_pred = clf.predict(X)
    n_inliers = len(y_pred) - np.count_nonzero(y_pred)
    n_outliers = np.count_nonzero(y_pred == 1)

    CheckOutliers.df4 = dataframe
    CheckOutliers.df4['outlier'] = y_pred.tolist()

    print('OUTLIERS:', n_outliers, 'INLIERS:', n_inliers, 'found with KNN')
    return dataframe
def knn_stat_tropo(df, time_param):
    """Return the rows of one data series that a KNN detector flags.

    The series column is derived from the *name of the global variable* that
    refers to *df*: the text after the last underscore of that name, joined
    with ``file['marker_name'][0]``.

    :param df: DataFrame that must also be bound to a module-level name.
    :param time_param: name of the time column in *df*.
    :return: DataFrame with columns ``date`` and ``data_val`` holding the
        rows predicted as anomalies (label 1).
    :raises IndexError: if no global name refers to *df*.
    """
    knn = pd.DataFrame()

    # Look up which global variable holds this exact object.
    name = [x for x in globals() if globals()[x] is df][0]
    print('dataframe: {}'.format(name))
    param = name.split('_')[-1]

    knn['date'] = df[time_param]
    knn['data_val'] = df[("%s" % param + '_' + file['marker_name'][0])]
    print(knn)
    knn = knn.dropna()
    x_knn = knn['data_val'].values.reshape(-1, 1)

    # Use 21 neighbors, or half the sample count when there are too few
    # samples for that.
    n_neighbors = 21
    if len(x_knn) <= n_neighbors:
        n_neighbors = math.floor(len(x_knn) / 2)
    clf = KNN(contamination=0.01, n_neighbors=n_neighbors, method='median')
    clf.fit(x_knn)

    # Time the prediction step, which is reported on stdout.
    start = time.time()
    an = clf.predict(x_knn)
    end = time.time()
    print('predict comp time {}'.format(end - start))

    # Assign positionally.  Wrapping the labels in a fresh pd.Series would
    # align them on a new 0..n-1 index, which no longer matches the frame's
    # index once dropna() has removed rows.
    knn['anomaly'] = an

    a = knn.loc[knn['anomaly'] == 1, ['date', 'data_val']]
    return (a)
leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=1) clf.fit(trainData) # get the prediction labels and outlier scores of the training data y_train_pred = clf.labels_ # binary labels (0: inliers, 1: outliers) #print(y_train_pred) y_train_scores = clf.decision_scores_ # raw outlier scores #print(y_train_scores) # get the prediction on the test data y_test_pred = clf.predict(testData) #X_test) # outlier labels (0 or 1) print(y_test_pred) from sklearn.metrics import accuracy_score accuracy_percentage = accuracy_score(testTarget, y_test_pred) * 100 print("The prediction accuracy is:", end=" ") print(accuracy_percentage) y_test_scores = clf.decision_function(testData) #X_test) # outlier scores #print(y_test_scores) # evaluate and print the results #print("\nOn Training Data:") #evaluate_print(clf_name, y_train, y_train_scores) #print("\nOn Test Data:")
import random
from matplotlib.colors import cnames

# Columns ordered by absolute correlation with 'deposit'; entries with a
# correlation of 1 (such as 'deposit' itself) are excluded.
corr = df.corr()['deposit'].abs().sort_values(ascending=False)
h_corr_cols = corr[corr < 1].index.tolist()

colors = list(cnames.keys())
sns.set_style('darkgrid')
fig, ax = plt.subplots(4, 3, figsize=(16, 12))
ax = ax.ravel()
for i, col in enumerate(h_corr_cols):
    # Pass the series as ``x`` explicitly so it is not taken for ``data``.
    sns.boxplot(x=df[col], ax=ax[i], color=random.choice(colors))

# Detect outliers on the selected columns and drop them from the frame.
x = df[h_corr_cols].values
model = KNN(contamination=.1)
model.fit(x)
predicted = model.predict(x)

outliers = df.loc[(predicted == 1), :]
inliers = df.loc[(predicted == 0), :]
df = df.drop(index=outliers.index)

"""###### Treating imbalance data"""

df.education.value_counts().to_frame()

# Merge the three 'basic.*' levels into one.  Assign the result back:
# an in-place replace on a column selection may act on a temporary copy.
df['education'] = df['education'].replace({
    'basic.9y': 'basic',
    'basic.4y': 'basic',
    'basic.6y': 'basic'
})

df['education'].value_counts().to_frame()

df.job.value_counts().to_frame()
X_train, X_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=1)

clf_name = 'KNN'
clf = KNN()  # initialise the detector
clf.fit(X_train)  # train the detector on X_train

# Outlier labels and scores on the training data X_train.
y_train_pred = clf.labels_  # class labels (0: normal, 1: outlier)
y_train_scores = clf.decision_scores_  # scores (larger = more abnormal)
print("On train Data:")
evaluate_print(clf_name, y_train, y_train_scores)

# Use the trained detector to predict outliers in the unseen data.
y_test_pred = clf.predict(X_test)  # class labels (0: normal, 1: outlier)
y_test_scores = clf.decision_function(X_test)  # larger = more abnormal
print("On Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)

y_true = column_or_1d(y_test)
y_pred = column_or_1d(y_test_scores)
check_consistent_length(y_true, y_pred)

# A stray trailing comma previously turned ``roc`` into a 1-tuple.
roc = np.round(roc_auc_score(y_true, y_pred), decimals=4)
prn = np.round(precision_n_scores(y_true, y_pred), decimals=4)
knn_roc.append(roc)
knn_prn.append(prn)

clf_name = 'LOF'
class TestKnn(unittest.TestCase):
    """Tests for the KNN detector on generated data."""

    def setUp(self):
        """Generate a seeded train/test set and fit the detector on it."""
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        generated = generate_data(n_train=self.n_train,
                                  n_test=self.n_test,
                                  contamination=self.contamination,
                                  random_state=42)
        self.X_train, self.y_train, self.X_test, self.y_test = generated

        self.clf = KNN(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def _assert_unit_interval(self, proba):
        """Check that every probability lies within [0, 1]."""
        assert_greater_equal(proba.min(), 0)
        assert_less_equal(proba.max(), 1)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        # Every fitted attribute must exist and be populated.
        for attr in ('decision_scores_', 'labels_', 'threshold_', '_mu',
                     '_sigma'):
            assert_true(
                hasattr(self.clf, attr)
                and getattr(self.clf, attr) is not None)

    def test_train_scores(self):
        n_scores = len(self.clf.decision_scores_)
        assert_equal(n_scores, self.X_train.shape[0])

    def test_prediction_scores(self):
        scores = self.clf.decision_function(self.X_test)

        # One score per test sample.
        assert_equal(scores.shape[0], self.X_test.shape[0])

        # Ranking quality must exceed the configured floor.
        assert_greater(roc_auc_score(self.y_test, scores), self.roc_floor)

    def test_prediction_labels(self):
        labels = self.clf.predict(self.X_test)
        assert_equal(labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        self._assert_unit_interval(self.clf.predict_proba(self.X_test))

    def test_prediction_proba_linear(self):
        self._assert_unit_interval(
            self.clf.predict_proba(self.X_test, method='linear'))

    def test_prediction_proba_unify(self):
        self._assert_unit_interval(
            self.clf.predict_proba(self.X_test, method='unify'))

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        labels = self.clf.fit_predict(self.X_train)
        assert_equal(labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        # Default scoring first, then each supported metric by name.
        self.clf.fit_predict_score(self.X_test, self.y_test)
        for metric in ('roc_auc_score', 'prc_n_score'):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring=metric)
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test)

        # The ordering of ranks must follow the ordering of scores.
        assert_allclose(rankdata(ranks), rankdata(scores), atol=2)
        assert_array_less(ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, ranks)

    def test_predict_rank_normalized(self):
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # The ordering of ranks must follow the ordering of scores.
        assert_allclose(rankdata(ranks), rankdata(scores), atol=2)
        assert_array_less(ranks, 1.01)
        assert_array_less(-0.1, ranks)

    def tearDown(self):
        pass
Hence, we use a library called **pyod** which hosts a number of outlier detection algorithms. """ pip install pyod """### KNN Classifier (Proximity-Based)""" from pyod.models.knn import KNN # kNN detector # train kNN detector clf_name = 'KNN' clf = KNN() clf.fit(X) y_pred = clf.predict(X) # outlier labels (0 or 1) y_scores = clf.decision_function(X) # outlier scores y_pred """0 means normal value while 1 means anomalous value.""" colors = np.array(['#377eb8', '#ff7f00']) plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[(y_pred - 1) // 2]) """Finding the ROC Accuracy score for the prediction label.""" clf.fit_predict_score(X[:, 0].reshape(-1,1), y_pred, scoring='roc_auc_score') """### Angle-based Outlier Detector (Probabilistic Based Model)"""
class occ():
    """One-class classifier wrapper for outlier detection.

    Attributes:
        data: raw dictionary returned by ``scipy.io.loadmat``.
        model: detector created by :meth:`train`.
        X: feature matrix.
        Y: label column, or ``None`` when no labels were loaded.
        score: initialised to ``None``; not written by this class.
        X_proj: projection of ``X`` computed by :meth:`load_data_mat`.
    """

    def __init__(self):
        self.data = None
        self.model = None
        self.X = None
        self.Y = None
        self.score = None
        self.X_proj = None

    def load_data_mat(self, file_name):
        # type: (str) -> None
        """Load data from a .mat file with ``X`` and ``y`` entries.

        Note that data from ODDS contains a lot of .mat data including known
        anomalies (Y).  The projection of ``X`` is stored in ``X_proj``.
        """
        self.data = scipy.io.loadmat(file_name)
        self.X = self.data['X']
        self.Y = self.data['y']
        self.X_proj = occ.manipulation(self.X)

    def load_data_csv(self, file_name, Y=False, **kwargs):
        """Load data from a header-less CSV formatted as X0, X1, ..., Xn, Y.

        :param file_name: path of the CSV file.
        :param Y: when true, the last column is also stored as ``self.Y``
            with shape (n_rows, 1).
        :param kwargs: forwarded to ``pandas.read_csv``.
        """
        data = pd.read_csv(file_name, header=None, **kwargs)
        # Every column but the last is a feature.
        self.X = data.iloc[:, :-1].values
        if Y:
            self.Y = data.iloc[:, -1].to_numpy().reshape([data.shape[0], 1])

    def load_data_npz(self, file_name, Y=False):
        """Load ``X`` from *file_name* and, if given, ``Y`` from the path *Y*.

        :param file_name: file passed to ``np.load`` for the features.
        :param Y: file passed to ``np.load`` for the labels; labels are left
            untouched when this is false.
        """
        self.X = np.load(file_name)
        # The check used to be inverted: it tried to load ``False`` and never
        # loaded a label file that was actually supplied.
        if Y:
            self.Y = np.load(Y)

    def _prepare(self, data, **kwargs):
        """Default *data* to ``self.X`` and apply the select_data options."""
        if data is None:
            data = self.X
        return self.select_data(data, **kwargs)

    def train(self, model='ocsvm', data=None, sampling=False, **kwargs):
        # type: (str, Optional[Any], bool, **Any) -> None
        """Create the detector named by *model* and fit it.

        :param model: one of 'ocsvm', 'ocnn', 'ensemble', 'isoForest',
            'autoEncoder', 'vae', 'deepsvdd', 'knn', 'twolineAE'.
        :param data: training data; defaults to ``self.X``.
        :param sampling: (float) Proportion of sampling.  If the raw data
            size is too large, we can use the sampled datasets.
        :param kwargs: optional hyper-parameters (kernel, gamma, epochs, nu,
            batch_size, hidden_neurons, known_normal, kernel_epochs,
            radius_epochs, neighbors) plus the options of select_data.
        :raises ValueError: if *model* is not a known model type.
        """
        if data is None:
            data = self.X
        if sampling:
            # NOTE(review): the sample is always drawn from self.X, even when
            # *data* was passed explicitly - confirm this is intended.
            data = occ.sampling_X(self.X, rate=sampling)

        kernel_set = kwargs.get('kernel', 'poly')
        gamma_set = kwargs.get('gamma', 'scale')
        epochs = kwargs.get('epochs', 10)
        batch_size = kwargs.get('batch_size', 50)
        nu = kwargs.get('nu', 0.1)
        hidden_neurons = kwargs.get('hidden_neurons', None)
        known_normal = kwargs.get('known_normal', False)
        kernel_epochs = kwargs.get('kernel_epochs', 50)
        radius_epochs = kwargs.get('radius_epochs', 100)
        neighbors = kwargs.get('neighbors', 10)

        if model == 'ocsvm':
            self.model = sklearn.svm.OneClassSVM(gamma=gamma_set,
                                                 kernel=kernel_set,
                                                 nu=nu)
        elif model == 'ocnn':
            self.model = ocnn(len(data[0]),
                              epochs=epochs,
                              nu=nu,
                              batch_size=batch_size)
        elif model == 'ensemble':
            self.model = ensemble(nu=nu)
        elif model == 'isoForest':
            self.model = sklearn.ensemble.IsolationForest(contamination=nu)
        elif model == 'autoEncoder':
            self.model = AutoEncoderODD(nu=nu,
                                        hidden_neurons=hidden_neurons,
                                        epochs=epochs,
                                        batch_size=batch_size)
        elif model == 'vae':
            self.model = VAE_ODD(nu=nu,
                                 hidden_neurons=hidden_neurons,
                                 epochs=epochs,
                                 batch_size=batch_size)
        elif model == 'deepsvdd':
            self.model = deep_SVDD(nu=nu,
                                   known_normal=known_normal,
                                   hidden_neurons=hidden_neurons,
                                   kernel_epochs=kernel_epochs,
                                   radius_epochs=radius_epochs,
                                   batch_size=batch_size)
        elif model == 'knn':
            self.model = KNN(contamination=nu, n_neighbors=neighbors)
        elif model == 'twolineAE':
            self.model = twolineAE(nu=nu,
                                   hidden_neurons=hidden_neurons,
                                   epochs=epochs,
                                   batch_size=batch_size)
        else:
            # Fail loudly instead of printing and then fitting ``None`` or a
            # model left over from an earlier call.
            raise ValueError(
                "There is no such model type {}".format(model))

        data = self.select_data(data, **kwargs)
        self.model.fit(data)

    def predict(self, data=None, **kwargs):
        # type: (Optional[Any], **Any) -> ndarray
        """Return the model's predictions for *data* as an (n, 1) array."""
        data = self._prepare(data, **kwargs)
        return self.model.predict(data).reshape(len(data), 1)

    def get_score(self, data=None, **kwargs):
        """Return ``model.score_samples`` for *data* as an (n, 1) array."""
        data = self._prepare(data, **kwargs)
        return self.model.score_samples(data).reshape(len(data), 1)

    def export_csv(self, file_name, score):
        """Write X, Y (when loaded) and *score* side by side to a CSV file."""
        if self.Y is None:
            columns = (self.X, score)
        else:
            columns = (self.X, self.Y, score)
        array = np.concatenate(columns, axis=1)
        pd.DataFrame(array).to_csv(file_name, header=None, index=False)

    def export_outliers(self, file_name, predictions):
        """Write the rows whose prediction equals -1 to a CSV file.

        NOTE(review): only the value -1 is treated as an outlier here -
        confirm that every model selectable in train() labels outliers so.
        """
        if self.Y is None:
            rows = self.X
        else:
            rows = np.concatenate((self.X, self.Y), axis=1)
        outlier_idx = np.where(predictions == -1)[0]
        pd.DataFrame(rows[outlier_idx]).to_csv(file_name,
                                               header=None,
                                               index=False)

    @staticmethod
    def select_data(data, **kwargs):
        # type: (ndarray, **Any) -> ndarray
        """Apply optional preprocessing chosen via ``norm`` / ``manipulate``.

        Normalisation runs first, then the projection; other keyword
        arguments are ignored.
        """
        if kwargs.get('norm', False):
            data = occ.norm(data)
        if kwargs.get('manipulate', False):
            data = occ.manipulation(data)
        return data

    @staticmethod
    def manipulation(data, **kwargs):
        # type: (ndarray, **Any) -> ndarray
        """Project *data* with PCA (``method='pca'``, ``dim=3`` by default).

        Returns ``None`` for any other *method* value.
        """
        method = kwargs.get('method', 'pca')
        dim = kwargs.get('dim', 3)
        if method == 'pca':
            projection = PCA(n_components=dim)
            projection.fit(data)
            return projection.transform(data)

    @staticmethod
    def show_projection(data, label=None, **kwargs):
        # type: (ndarray, ndarray, **Any) -> None
        """Scatter-plot a 2-component PCA projection of *data*.

        Recognised keyword arguments: ``title``, ``markersize``, ``cmap``,
        ``save_file`` and the options of select_data.
        """
        size = kwargs.get('markersize', 25)
        cmap = kwargs.get('cmap', 'viridis')
        title = kwargs.get('title', None)
        save_file = kwargs.get('save_file', None)

        data = occ.select_data(data, **kwargs)
        data_proj = occ.manipulation(data, method='pca', dim=2)
        data_proj_t = data_proj.transpose()

        # All drawing goes through pyplot's current figure/axes.
        plt.subplots(figsize=(10, 10))
        plt.scatter(data_proj_t[0],
                    data_proj_t[1],
                    c=label,
                    s=size,
                    marker='.')
        plt.colorbar()
        plt.set_cmap(cmap)
        if title is not None:
            plt.title(title)
        if save_file is not None:
            plt.savefig(save_file, dpi=300)
        plt.show()

    @staticmethod
    def norm(data):
        # type: (ndarray) -> ndarray
        """Return *data* with every row scaled to unit L2 norm."""
        norm = sklearn.preprocessing.Normalizer(norm='l2', copy=True).fit(data)
        return norm.transform(data)

    @staticmethod
    def proportion(data):
        """Return ``np.where(data < 0)``: the indices of negative entries."""
        return np.where(data < 0)

    @staticmethod
    def sampling_X(X, rate=0.1):
        """Return a random subset of the rows of *X* without replacement.

        ``int(len(X) * rate)`` rows are returned, in shuffled order.
        """
        idx = list(range(len(X)))
        random.shuffle(idx)
        return X[idx[:int(len(X) * rate)]]
# Train kNN detector clf = KNN(contamination=0.02, n_neighbors=5) clf.fit(X) # Get the prediction labels of the training data y_train_pred = clf.labels_ # Outlier scores y_train_scores = clf.decision_scores_ # Import the utility function for model evaluation from pyod.utils import evaluate_print # Evaluate on the training data evaluate_print('KNN', y, y_train_scores) # A total of $1256 X_test_abnormal = np.array([[1256.]]) # Predict clf.predict(X_test_abnormal) # A total of $51896 X_test_abnormal = np.array([[51896.]]) # Predict clf.predict(X_test_abnormal)
class TestKnnMahalanobis(unittest.TestCase):
    """Tests for the KNN detector configured with the Mahalanobis metric."""

    def setUp(self):
        """Generate seeded data and fit a Mahalanobis-distance detector."""
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8
        generated = generate_data(n_train=self.n_train,
                                  n_test=self.n_test,
                                  contamination=self.contamination,
                                  random_state=42)
        self.X_train, self.y_train, self.X_test, self.y_test = generated

        # The Mahalanobis metric needs the covariance of the training data.
        covariance = np.cov(self.X_train, rowvar=False)
        self.clf = KNN(algorithm='auto',
                       metric='mahalanobis',
                       metric_params={'V': covariance})
        self.clf.fit(self.X_train)

    def _assert_unit_interval(self, proba):
        """Check that every probability lies within [0, 1]."""
        assert (proba.min() >= 0)
        assert (proba.max() <= 1)

    def test_parameters(self):
        # Every fitted attribute must exist and be populated.
        for attr in ('decision_scores_', 'labels_', 'threshold_', '_mu',
                     '_sigma'):
            assert (hasattr(self.clf, attr)
                    and getattr(self.clf, attr) is not None)

    def test_train_scores(self):
        n_scores = len(self.clf.decision_scores_)
        assert_equal(n_scores, self.X_train.shape[0])

    def test_prediction_scores(self):
        scores = self.clf.decision_function(self.X_test)

        # One score per test sample.
        assert_equal(scores.shape[0], self.X_test.shape[0])

        # Ranking quality must reach the configured floor.
        assert (roc_auc_score(self.y_test, scores) >= self.roc_floor)

    def test_prediction_labels(self):
        labels = self.clf.predict(self.X_test)
        assert_equal(labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        self._assert_unit_interval(self.clf.predict_proba(self.X_test))

    def test_prediction_proba_linear(self):
        self._assert_unit_interval(
            self.clf.predict_proba(self.X_test, method='linear'))

    def test_prediction_proba_unify(self):
        self._assert_unit_interval(
            self.clf.predict_proba(self.X_test, method='unify'))

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        labels = self.clf.fit_predict(self.X_train)
        assert_equal(labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        # Default scoring first, then each supported metric by name.
        self.clf.fit_predict_score(self.X_test, self.y_test)
        for metric in ('roc_auc_score', 'prc_n_score'):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring=metric)
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test)

        # The ordering of ranks must follow the ordering of scores.
        assert_allclose(rankdata(ranks), rankdata(scores), atol=3)
        assert_array_less(ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, ranks)

    def test_predict_rank_normalized(self):
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # The ordering of ranks must follow the ordering of scores.
        assert_allclose(rankdata(ranks), rankdata(scores), atol=3)
        assert_array_less(ranks, 1.01)
        assert_array_less(-0.1, ranks)

    def tearDown(self):
        pass
# # kNN # In[3]: #kNN clf_name = 'kNN' clf = KNN(method='median') # In[4]: #用训练集训练 clf.fit(X_train) y_train_pred = clf.labels_ y_train_scores = clf.decision_scores_ y_test_pred = clf.predict(X_test) y_test_scores = clf.decision_function(X_test) #评价性能 roc_train = round(roc_auc_score(y_train, y_train_scores), 4) prn_train = round(precision_n_scores(y_train, y_train_scores), ndigits=4) roc_test = round(roc_auc_score(y_test, y_test_scores), 4) prn_test = round(precision_n_scores(y_test, y_test_scores), ndigits=4) # In[5]: #输出计算得到的roc_auc和precision @ rank n print("\nOn Train Data:") print(clf_name, 'roc:', roc_train, 'precision @ rank n:', prn_train) print("\nOn Test Data:") print(clf_name, 'roc:', roc_test, 'precision @ rank n:', prn_test)
def _plot_score_distribution(scores, filename):
    """Draw the distribution of ``scores`` on its own figure and save it."""
    # A fresh figure keeps this plot from being drawn on top of an earlier one.
    plt.figure()
    sns.distplot(scores)
    plt.title('Distance from the Kth Nearest Neighbour')
    plt.savefig(filename)


def _plot_score_scatter(labels, scores, score_column, title):
    """Scatter outlier scores against their 1-based position, coloured by ``Flag``."""
    # Work on a copy so the caller's label frame is not given extra columns.
    combined = labels.copy()
    combined['XPoints'] = np.arange(1, len(scores) + 1)
    combined[score_column] = list(scores)

    plt.figure(figsize=(8, 8))
    plt.scatter(combined['XPoints'],
                combined[score_column],
                c=combined['Flag'],
                cmap=matplotlib.colors.ListedColormap(['green', 'red']))
    plt.title(title)
    plt.legend()


def _print_label_metrics(header, split_name, y_true, y_pred):
    """Print accuracy, F-beta (beta=1.2), precision and recall for one split."""
    print(header)
    print("Final accuracy score on the {} data: {:.4f}".format(
        split_name, accuracy_score(y_true, y_pred)))
    print("Final F-score on the {} data: {:.4f}".format(
        split_name, fbeta_score(y_true, y_pred, beta=1.2)))
    print('precision_score', precision_score(y_true, y_pred))
    print('recall_score', recall_score(y_true, y_pred))


def KNNAlgo(TrainX, TestX, TrainY, TestY):
    """Fit a KNN outlier detector and report/plot its behaviour on both splits.

    The detector (``n_neighbors=20``) is fitted on ``TrainX`` only; the labels
    are used purely for evaluation and for colouring the scatter plots.

    Parameters
    ----------
    TrainX, TestX
        Feature data for the training and test splits.
    TrainY, TestY
        Label frames for the two splits. They must support ``.copy()`` and
        expose a ``'Flag'`` column, which is used to colour the scatter
        plots. The frames are not modified.

    Returns
    -------
    None
        Results are printed and plotted; the two score-distribution plots are
        saved to ``KthDistanceTrainingSet.png`` and ``KthDistanceTestSet.png``.
    """
    clf = KNN(n_neighbors=20)
    clf.fit(TrainX)

    # Training split: labels (0: inliers, 1: outliers) and raw outlier scores
    y_train_pred = clf.labels_
    y_train_scores = clf.decision_scores_

    # Test split: predicted labels (0 or 1) and outlier scores
    y_test_pred = clf.predict(TestX)
    y_test_scores = clf.decision_function(TestX)

    # Outlier-score distributions, one figure per split
    _plot_score_distribution(y_train_scores, 'KthDistanceTrainingSet.png')
    _plot_score_distribution(y_test_scores, 'KthDistanceTestSet.png')

    # Outlier scores per sample, coloured by the known label
    _plot_score_scatter(
        TrainY, y_train_scores, 'TrainScores',
        'Outliers in Traning Set for Normal and Shell Company')
    _plot_score_scatter(
        TestY, y_test_scores, 'TestScores',
        'Outliers in Test Set for Normal and Shell Company')

    # Label-based metrics; each split is reported under its own name
    _print_label_metrics(
        "---------------------------------Accuracy for training Data-------------------------------------------------",
        'training', TrainY, y_train_pred)
    _print_label_metrics(
        "--------------------------------Accuracy for Testing Data----------------------------------------------------",
        'testing', TestY, y_test_pred)

    cm = confusion_matrix(TestY, y_test_pred, labels=[1, 0])
    print("Confusion matrix for test dataset")
    print(cm)

    # The ROC curve and AUC are computed from the continuous outlier scores:
    # thresholded 0/1 predictions would collapse the curve to a single
    # operating point.
    fpr, tpr, _ = roc_curve(TestY, y_test_scores)
    plt.subplots(1, figsize=(12, 6))
    plt.plot(fpr, tpr, color='darkorange', label='Model Performace')
    plt.plot([0, 1], [0, 1], color='gray', label='Random Performace')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(
        'Honest/Shell Analysis ROC Curve for KNN Outlier Detection Algorithm for test Dataset'
    )
    plt.legend(loc="lower right")
    print('Auc Score is : ', roc_auc_score(TestY, y_test_scores))
n_test=n_test, n_features=2, contamination=contamination, random_state=42) # train kNN detector clf_name = 'KNN' clf = KNN() clf.fit(X_train) # get the prediction labels and outlier scores of the training data y_train_pred = clf.labels_ # binary labels (0: inliers, 1: outliers) y_train_scores = clf.decision_scores_ # raw outlier scores # get the prediction on the test data y_test_pred = clf.predict(X_test) # outlier labels (0 or 1) y_test_scores = clf.decision_function(X_test) # outlier scores # evaluate and print the results print("\nOn Training Data:") evaluate_print(clf_name, y_train, y_train_scores) print("\nOn Test Data:") evaluate_print(clf_name, y_test, y_test_scores) # visualize the results visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
# Generate sample data
X_train, y_train, X_test, y_test = generate_data(
    n_train=n_train,
    n_test=n_test,
    n_features=2,
    contamination=contamination,
    random_state=42)

# Train the kNN detector
clf_name = 'KNN'
clf = KNN()
clf.fit(X_train)

# Training data: binary labels (0: inliers, 1: outliers) and raw outlier scores
y_train_pred, y_train_scores = clf.labels_, clf.decision_scores_

# Test data: outlier labels (0 or 1) and outlier scores
y_test_pred = clf.predict(X_test)
y_test_scores = clf.decision_function(X_test)

# Evaluate and print the results for each split
for _heading, _truth, _scores in (
        ("\nOn Training Data:", y_train, y_train_scores),
        ("\nOn Test Data:", y_test, y_test_scores)):
    print(_heading)
    evaluate_print(clf_name, _truth, _scores)

# Visualize the results
visualize(clf_name,
          X_train, y_train,
          X_test, y_test,
          y_train_pred, y_test_pred,
          show_figure=True,
          save_figure=True)
# In[2]:

# Train the kNN detector
clf_name = 'KNN'
clf = KNN()
clf.fit(X_train)

# Training data: binary labels (0: inliers, 1: outliers) and raw outlier scores
y_train_pred, y_train_scores = clf.labels_, clf.decision_scores_

# Test data: outlier labels (0 or 1) and outlier scores
y_test_pred = clf.predict(X_test)
y_test_scores = clf.decision_function(X_test)

# In[3]:

# Evaluate and print the results for each split
for _heading, _truth, _scores in (
        ("\nOn Training Data:", y_train, y_train_scores),
        ("\nOn Test Data:", y_test, y_test_scores)):
    print(_heading)
    evaluate_print(clf_name, _truth, _scores)

# In[4]:
class Remove_Outliers(BaseEstimator, TransformerMixin):
    """Drop the rows that every configured detector flags as an outlier.

    Parameters
    ----------
    target
        Name of the target column; it is excluded from the features handed to
        the detectors.
    contamination
        Contamination passed to the ``iso``, ``knn`` and ``pca`` detectors.
        The ``mcd`` detector uses a fixed contamination of 0.01.
    random_state
        Seed passed to the ``iso`` and ``pca`` detectors.
    methods
        Detector keys to run; recognised keys are ``'iso'``, ``'knn'``,
        ``'pca'`` and ``'mcd'``. The default list is only read, never mutated.

    Attributes set by ``fit_transform``
    -----------------------------------
    iso_forest, knn_out, out_pca, mcd
        The fitted detectors (only those that were requested).
    outliers
        The rows flagged by all detectors, including one 0/1 vote column per
        detector and the ``vote_outlier`` total.
    """

    def __init__(self,
                 target,
                 contamination=.20,
                 random_state=42,
                 methods=['knn', 'iso', 'mcd']):
        self.target = target
        self.contamination = contamination
        self.random_state = random_state
        self.methods = methods

    def fit(self, data, y=None):
        """Do nothing; outliers are only removed by ``fit_transform``."""
        return None

    def transform(self, data, y=None):
        """Return ``data`` unchanged, so unseen data never loses rows."""
        return data

    def fit_transform(self, dataset, y=None):
        """Fit the detectors on ``dataset`` and drop unanimously flagged rows.

        Returns the rows of ``dataset`` (original columns only) whose index
        label does not belong to a row flagged by every detector.
        """
        data = dataset.copy()
        # Build the feature frame once, before any vote column is added, so
        # no detector is trained on the labels produced by an earlier one.
        features = dataset.drop(self.target, axis=1)

        if 'iso' in self.methods:
            self.iso_forest = IForest(contamination=self.contamination,
                                      random_state=self.random_state,
                                      behaviour='new')
            self.iso_forest.fit(features)
            data['iso'] = self.iso_forest.predict(features)

        if 'knn' in self.methods:
            self.knn_out = KNN(contamination=self.contamination)
            self.knn_out.fit(features)
            data['knn'] = self.knn_out.predict(features)

        if 'pca' in self.methods:
            # NOTE(review): presumably PCA_RO.predict yields 0/1 labels like
            # the other detectors above -- confirm against its import.
            self.out_pca = PCA_RO(contamination=self.contamination,
                                  random_state=self.random_state)
            self.out_pca.fit(features)
            data['pca'] = self.out_pca.predict(features)

        # use for those features which are gaussian distributed
        if 'mcd' in self.methods:
            self.mcd = EllipticEnvelope(contamination=0.01)
            self.mcd.fit(features)
            # EllipticEnvelope.predict returns -1 for outliers and +1 for
            # inliers; convert to the 0/1 (1 = outlier) vote used below.
            data['mcd'] = (self.mcd.predict(features) == -1).astype(int)

        data['vote_outlier'] = 0
        for method in self.methods:
            data['vote_outlier'] = data['vote_outlier'] + data[method]

        if len(self.methods) > 0:
            # A row is an outlier only when every detector voted for it.
            self.outliers = data[data['vote_outlier'] == len(self.methods)]
        else:
            # No detectors means no votes: nothing is flagged.
            self.outliers = data.iloc[:0]

        return dataset[~dataset.index.isin(self.outliers.index)]
# Using the obtained classification labels, mark the dimension-reduced data
# and display it in a figure.

# In[169]:

# Train a kNN detector
clf_name = 'kNN'
clf = KNN()  # initialise the detector
_train_rows = new_origin_all[:pos]
_unseen_rows = new_origin_all[pos:]
clf.fit(_train_rows)  # fit the detector clf on the training part

# Outlier labels and outlier scores on the training data:
# labels are 0 for normal values and 1 for outliers; a larger score means
# a more anomalous sample.
y_train_pred, y_train_scores = clf.labels_, clf.decision_scores_

# Use the fitted clf on the unseen data:
# labels (0: normal value, 1: outlier) and outlier scores.
y_test_pred = clf.predict(_unseen_rows)
y_test_scores = clf.decision_function(_unseen_rows)

show_scatter(clf_name, df, y_train_pred, pos)

# In[170]:

clf_name = 'COF'
clf = COF(n_neighbors=30)
clf.fit(new_origin_all[:pos])

# Training data: binary labels (0: inliers, 1: outliers) and raw outlier scores
y_train_pred, y_train_scores = clf.labels_, clf.decision_scores_
class TestKnn(unittest.TestCase):
    """Checks for the KNN detector with its default distance settings."""

    def setUp(self):
        """Generate a small synthetic data set and fit the detector on it."""
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8

        generated = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)
        self.X_train, self.X_test, self.y_train, self.y_test = generated

        self.clf = KNN(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def _assert_within_unit_interval(self, values):
        """Assert that every entry of ``values`` lies in [0, 1]."""
        assert (values.min() >= 0)
        assert (values.max() <= 1)

    def test_parameters(self):
        # Every fitted attribute must exist and be populated after fit().
        for fitted_attr in ('decision_scores_', 'labels_', 'threshold_',
                            '_mu', '_sigma'):
            assert getattr(self.clf, fitted_attr, None) is not None

    def test_train_scores(self):
        n_scores = len(self.clf.decision_scores_)
        assert_equal(n_scores, self.X_train.shape[0])

    def test_prediction_scores(self):
        test_scores = self.clf.decision_function(self.X_test)

        # one score per test sample
        assert_equal(test_scores.shape[0], self.X_test.shape[0])

        # detection quality must not drop below the configured floor
        assert (roc_auc_score(self.y_test, test_scores) >= self.roc_floor)

    def test_prediction_labels(self):
        labels = self.clf.predict(self.X_test)
        assert_equal(labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        self._assert_within_unit_interval(
            self.clf.predict_proba(self.X_test))

    def test_prediction_proba_linear(self):
        self._assert_within_unit_interval(
            self.clf.predict_proba(self.X_test, method='linear'))

    def test_prediction_proba_unify(self):
        self._assert_within_unit_interval(
            self.clf.predict_proba(self.X_test, method='unify'))

    def test_prediction_proba_parameter(self):
        # an unknown conversion method must be rejected
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_prediction_labels_confidence(self):
        labels, confidence = self.clf.predict(self.X_test,
                                              return_confidence=True)
        assert_equal(labels.shape, self.y_test.shape)
        assert_equal(confidence.shape, self.y_test.shape)
        self._assert_within_unit_interval(confidence)

    def test_prediction_proba_linear_confidence(self):
        proba, confidence = self.clf.predict_proba(self.X_test,
                                                   method='linear',
                                                   return_confidence=True)
        self._assert_within_unit_interval(proba)

        assert_equal(confidence.shape, self.y_test.shape)
        self._assert_within_unit_interval(confidence)

    def test_fit_predict(self):
        labels = self.clf.fit_predict(self.X_train)
        assert_equal(labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        # default scoring first, then each named scoring function
        self.clf.fit_predict_score(self.X_test, self.y_test)
        for scoring_name in ('roc_auc_score', 'prc_n_score'):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring=scoring_name)

        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test)

        # the ordering of the scores must be preserved by the ranks
        assert_allclose(rankdata(ranks), rankdata(scores), atol=2)
        assert_array_less(ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, ranks)

    def test_predict_rank_normalized(self):
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # the ordering of the scores must be preserved by the ranks
        assert_allclose(rankdata(ranks), rankdata(scores), atol=2)
        assert_array_less(ranks, 1.01)
        assert_array_less(-0.1, ranks)

    def test_model_clone(self):
        # only checks that cloning the fitted detector does not raise
        clone(self.clf)

    def tearDown(self):
        pass
plt.show()

'''
KNN -> K-Nearest Neighbors Detector
For an observation, its distance to its kth nearest neighbors could be
viewed as the outlying scores
Method: -Largest -Average -Median
'''
clf_name = 'KNN'
clf = KNN()
clf.fit(X_train)

X_train_pred = clf.labels_
X_train_score = clf.decision_scores_

# Negate the raw scores before thresholding and plotting.
score_pred = -clf.decision_function(X_train)

y_pred = clf.predict(X_train)

# Count the predictions that disagree with the known labels.
n_errors = (y_pred != y_train).sum()
print('No of Errors:', clf_name, n_errors)

# visualization: score a 300 x 300 grid covering [-10, 10] on both axes
_grid_ticks = np.linspace(-10, 10, 300)
xx, yy = np.meshgrid(_grid_ticks, _grid_ticks)

threshold = stats.scoreatpercentile(score_pred, 100 * outlier_fraction)

_grid_points = np.c_[xx.ravel(), yy.ravel()]
Z = (clf.decision_function(_grid_points) * -1).reshape(xx.shape)

# fill blue colormap from minimum anomaly score to threshold value
plt.contourf(xx, yy, Z,
             levels=np.linspace(Z.min(), threshold, 10),
             cmap=plt.cm.Blues_r)

# red contour line at the threshold, orange fill above it
a = plt.contour(xx, yy, Z, levels=[threshold], linewidths=2, colors='red')
plt.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange')

# the last n_outliers rows are drawn in black, all earlier rows in white
b = plt.scatter(X_train[:-n_outliers, 0], X_train[:-n_outliers, 1],
                c='white', s=20, edgecolor='k')
c = plt.scatter(X_train[-n_outliers:, 0], X_train[-n_outliers:, 1],
                c='black', s=20, edgecolor='k')

plt.axis('tight')