import numpy as np


def pca(X_train, X_test, Y_train, Y_test):
    from pyod.models.pca import PCA
    model = PCA()
    model.fit(X_train)
    pred = model.predict(X_test)
    acc = np.sum(pred == Y_test) / X_test.shape[0]
    print(acc)
    return acc * 100
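# A minimal usage sketch for the helper above (not part of the original source):
# it assumes pyod's generate_data utility and follows the
# (X_train, y_train, X_test, y_test) return order used elsewhere in this file.
from pyod.utils.data import generate_data

X_train_demo, y_train_demo, X_test_demo, y_test_demo = generate_data(
    n_train=200, n_test=100, n_features=5, contamination=0.1, random_state=42)
print(pca(X_train_demo, X_test_demo, y_train_demo, y_test_demo))  # accuracy in percent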
def print_accuracy(train_arr, test_arr, trader_id):
    if len(train_arr) == 0 or len(test_arr) == 0:
        return
    for i in range(len(train_arr)):
        l1 = len(train_arr[i])
        l2 = len(test_arr[i])
        if l1 == 0 or l2 == 0:
            continue
        # Reshape the i-th series into column vectors
        train_data = np.array([train_arr[i]]).T
        test_data = np.array([test_arr[i]]).T
        # clf = OCSVM(kernel='rbf', gamma=0.5)
        print(len(train_arr))
        clf = PCA(n_components=15)
        clf.fit(train_arr)
        y_pred = clf.predict(train_arr)
        print("TRAINING ACCURACY for TRADER", trader_id, ":",
              100 - (sum(y_pred) * 100 / l1))
        y_pred = clf.predict(test_data)
        print("TESTING ACCURACY: ", sum(y_pred) * 100 / l2)
def pca(self, X_train, n_components=None, contamination=None):
    """
    Train the PCA model from PyOD.

    Parameters
    ----------
    X_train: scaled training data
    contamination: percentage of anomalies in the data
    n_components: number of components to transform

    Returns
    -------
    Min-max scaled anomaly scores and binary outlier labels
    """
    model = PCAOD(n_components=n_components, contamination=contamination)
    model.fit(X_train)

    labels = model.predict(X_train)  # outlier labels (0 or 1)
    pca_anomaly_scores = model.decision_function(X_train)  # raw outlier scores
    pca_anomaly_scores = self.min_max_scaler(pca_anomaly_scores)
    return pca_anomaly_scores, labels
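# The method above relies on a self.min_max_scaler helper that is not shown in this
# excerpt. A minimal sketch of what such a method might look like (an assumption,
# not the original implementation), rescaling scores to the [0, 1] range:
def min_max_scaler(self, scores):
    import numpy as np
    scores = np.asarray(scores, dtype=float)
    score_range = scores.max() - scores.min()
    if score_range == 0:
        return np.zeros_like(scores)  # all scores identical
    return (scores - scores.min()) / score_range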
import numpy as np

train_set = np.array(train_set)
test_set = np.array(test_set)

from pyod.models.ocsvm import OCSVM
from pyod.models.pca import PCA
# from pyod.models.mcd import MCD

clf1 = PCA(standardization=True, contamination=0.2)
# clf1 = MCD(assume_centered=True)
clf2 = OCSVM(kernel='poly', nu=0.25, degree=2, contamination=0.2)
# clf2 = OCSVM(kernel='linear', nu=0.02)

clf1.fit(train_set)
clf2.fit(train_set)

y_pred_train_pca = clf1.predict(train_set)
y_pred_test_pca = clf1.predict(test_set)
y_pred_train_ocsvm = clf2.predict(train_set)
y_pred_test_ocsvm = clf2.predict(test_set)

print(clf1.explained_variance_)
# print(y_pred_test_pca, y_pred_test_ocsvm)

train_pca_correct = 0
train_ocsvm_correct = 0
print("TRAIN SET")
for i in range(len(pred_train_set)):
    # print("Actual:", pred_train_set[i], "PCA", y_pred_train_pca[i], "OCSVM", y_pred_train_ocsvm[i])
    if pred_train_set[i] == y_pred_train_pca[i] and pred_train_set[i] == 1:
        train_pca_correct += 1
    if pred_train_set[i] == y_pred_train_ocsvm[i] and y_pred_train_ocsvm[i] == 1:
        train_ocsvm_correct += 1
class TestPCA(unittest.TestCase):
    def setUp(self):
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test, n_features=10,
            contamination=self.contamination, random_state=42)

        self.clf = PCA(contamination=self.contamination, random_state=42)
        self.clf.fit(self.X_train)

    def test_parameters(self):
        assert (hasattr(self.clf, 'decision_scores_') and
                self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and
                self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_') and
                self.clf.threshold_ is not None)
        assert (hasattr(self.clf, '_mu') and
                self.clf._mu is not None)
        assert (hasattr(self.clf, '_sigma') and
                self.clf._sigma is not None)
        assert (hasattr(self.clf, 'selected_components_') and
                self.clf.selected_components_ is not None)
        assert (hasattr(self.clf, 'selected_w_components_') and
                self.clf.selected_w_components_ is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert (roc_auc_score(self.y_test, pred_scores) >= self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def test_model_clone(self):
        clone_clf = clone(self.clf)

    def tearDown(self):
        pass
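# The test case above can be run directly with the standard unittest runner, for
# example by appending the usual entry point at the end of the test module:
if __name__ == '__main__':
    unittest.main()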
"""### Angle-based Outlier Detector (Probabilistic Based Model)""" from pyod.models import abod clf_abod = abod.ABOD(contamination=0.1, n_neighbors=5, method='fast') clf_abod.fit(X) y_pred = clf_abod.predict(X) # outlier labels (0 or 1) y_scores = clf_abod.decision_function(X) # outlier scores colors = np.array(['#377eb8', '#ff7f00']) plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[(y_pred - 1) // 2]) clf_abod.fit_predict_score(X[:, 0].reshape(-1,1), y_pred, scoring='roc_auc_score') """### Linear Model PCA""" from pyod.models.pca import PCA clf_pca = PCA() clf_pca.fit(X) y_pred = clf_pca.predict(X) # outlier labels (0 or 1) y_scores = clf_pca.decision_function(X) # outlier scores y_pred colors = np.array(['#377eb8', '#ff7f00']) plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[(y_pred - 1) // 2]) clf_pca.fit_predict_score(X[:, 0].reshape(-1,1), y_pred, scoring='roc_auc_score')
def detect(file, amountanom, realtime, dumptocsv):
    """
    Function to apply a very simple anomaly detector
    amountanom: The top number of anomalies we want to print
    realtime: If we want to read the conn.log file in real time (not working)
    """

    # Create a Pandas dataframe from the conn.log
    bro_df = pd.read_csv(
        file, sep="\t", comment='#',
        names=['ts', 'uid', 'id.orig_h', 'id.orig_p', 'id.resp_h', 'id.resp_p',
               'proto', 'service', 'duration', 'orig_bytes', 'resp_bytes',
               'conn_state', 'local_orig', 'local_resp', 'missed_bytes',
               'history', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts',
               'resp_ip_bytes', 'tunnel_parents'])

    # In case you need a label, because some models can work in a
    # semisupervised mode, put it here. For now everything is
    # 'normal', but we are not using this for detection
    bro_df['label'] = 'normal'

    # Replace the rows without data (with '-') with 0.
    # Even though this may add a bias to the algorithms,
    # it is better than not using the lines.
    # Also fill the missing values with 0.
    # Finally give each column a type
    bro_df['orig_bytes'].replace('-', '0', inplace=True)
    bro_df['orig_bytes'] = bro_df['orig_bytes'].fillna(0).astype('int32')
    bro_df['resp_bytes'].replace('-', '0', inplace=True)
    bro_df['resp_bytes'] = bro_df['resp_bytes'].fillna(0).astype('int32')
    bro_df['resp_pkts'].replace('-', '0', inplace=True)
    bro_df['resp_pkts'] = bro_df['resp_pkts'].fillna(0).astype('int32')
    bro_df['orig_ip_bytes'].replace('-', '0', inplace=True)
    bro_df['orig_ip_bytes'] = bro_df['orig_ip_bytes'].fillna(0).astype('int32')
    bro_df['resp_ip_bytes'].replace('-', '0', inplace=True)
    bro_df['resp_ip_bytes'] = bro_df['resp_ip_bytes'].fillna(0).astype('int32')
    bro_df['duration'].replace('-', '0', inplace=True)
    bro_df['duration'] = bro_df['duration'].fillna(0).astype('float64')

    # Save dataframe to disk as CSV
    if dumptocsv != "None":
        bro_df.to_csv(dumptocsv)

    # Add the columns from the log file that we know are numbers. This is only for conn.log files.
    X_train = bro_df[['duration', 'orig_bytes', 'id.resp_p', 'resp_bytes',
                      'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes']]

    # Our y is the label. But we are not using it now.
    y = bro_df.label

    # The X_test is where we are going to search for anomalies. In our case,
    # it is the same set of data as X_train.
    X_test = X_train

    #################
    # Select a model from below

    # ABOD class for Angle-base Outlier Detection. For an observation, the
    # variance of its weighted cosine scores to all neighbors could be
    # viewed as the outlying score.
    # clf = ABOD()

    # LOF
    # clf = LOF()

    # CBLOF
    # clf = CBLOF()

    # LOCI
    # clf = LOCI()

    # LSCP
    # clf = LSCP()

    # MCD
    # clf = MCD()

    # OCSVM
    # clf = OCSVM()

    # PCA. Good and fast!
    clf = PCA()

    # SOD
    # clf = SOD()

    # SO_GAAL
    # clf = SO_GAAL()

    # SOS
    # clf = SOS()

    # XGBOD
    # clf = XGBOD()

    # KNN
    # Good results but slow
    # clf = KNN()
    # clf = KNN(n_neighbors=10)
    #################

    # Fit the model to the train data
    clf.fit(X_train)

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # Convert the ndarrays of scores and predictions to pandas series
    scores_series = pd.Series(y_test_scores)
    pred_series = pd.Series(y_test_pred)

    # Now use the series to add a new column to the X test
    X_test['score'] = scores_series.values
    X_test['pred'] = pred_series.values

    # Add the score to the bro_df also. So we can show it at the end
    bro_df['score'] = X_test['score']

    # Keep the positive predictions only. That is, keep only what we predict is an anomaly.
    X_test_predicted = X_test[X_test.pred == 1]

    # Keep the top X amount of anomalies
    top10 = X_test_predicted.sort_values(by='score', ascending=False).iloc[:amountanom]

    # Print the results
    # Find the predicted anomalies in the original bro dataframe, where the rest of the data is
    df_to_print = bro_df.iloc[top10.index]
    print('\nFlows of the top anomalies')

    # Only print some columns, not all, so it is easier to read.
    df_to_print = df_to_print.drop(['conn_state', 'history', 'local_orig',
                                    'local_resp', 'missed_bytes', 'ts',
                                    'tunnel_parents', 'uid', 'label'], axis=1)
    print(df_to_print)
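# A minimal sketch of how the detector above might be invoked (the call site is not
# part of the original excerpt; the log file name is a placeholder). Passing "None"
# for dumptocsv skips the CSV dump, per the check inside detect():
if __name__ == '__main__':
    detect('conn.log', amountanom=10, realtime=False, dumptocsv="None")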
y = data['s']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33)

# Fit the data with the PCA algorithm from PyOD
clf_name = 'PCA'
clf = PCA()
clf.fit(X_train)

# The prediction is an array of 0s and 1s: 1 marks an outlier, 0 an inlier
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores of the training data

# Predict whether each test sample is an outlier; returns an array of 0s and 1s
y_test_pred = clf.predict(X_test)
y_test_scores = clf.decision_function(X_test)  # anomaly scores of the input samples

# Use sklearn's roc_auc_score to compute the AUC, i.e. the area under the ROC curve
try:
    sumAuc_train += sklearn.metrics.roc_auc_score(y_train, y_train_scores, average='macro')
    sumAuc_test += sklearn.metrics.roc_auc_score(y_test, y_test_scores, average='macro')
    # s = precision_score(y_train, y_train_scores, average='macro')
    i += 1
    print(sumAuc_train, sumAuc_test)
except ValueError:
# Generate sample data
X_train, y_train, X_test, y_test = \
    generate_data(n_train=n_train,
                  n_test=n_test,
                  n_features=2,
                  contamination=contamination,
                  random_state=42)

# train PCA detector
clf_name = 'PCA'
clf = PCA()
clf.fit(X_train)

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores

# evaluate and print the results
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)

# visualize the results
visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
          y_test_pred, show_figure=True, save_figure=False)
class Remove_Outliers(BaseEstimator, TransformerMixin):
    def __init__(self, target, contamination=.20, random_state=42,
                 methods=['knn', 'iso', 'mcd']):
        self.target = target
        self.contamination = contamination
        self.random_state = random_state
        self.methods = methods

    def fit(self, data, y=None):
        return None

    def transform(self, data, y=None):
        return data

    def fit_transform(self, dataset, y=None):
        data = dataset.copy()

        if 'iso' in self.methods:
            self.iso_forest = IForest(contamination=self.contamination,
                                      random_state=self.random_state,
                                      behaviour='new')
            self.iso_forest.fit(data.drop(self.target, axis=1))
            iso_predict = self.iso_forest.predict(data.drop(self.target, axis=1))
            data['iso'] = iso_predict

        if 'knn' in self.methods:
            self.knn_out = KNN(contamination=self.contamination)
            self.knn_out.fit(data.drop(self.target, axis=1))
            knn_predict = self.knn_out.predict(data.drop(self.target, axis=1))
            data['knn'] = knn_predict

        if 'pca' in self.methods:
            self.out_pca = PCA_RO(contamination=self.contamination,
                                  random_state=self.random_state)
            self.out_pca.fit(data.drop(self.target, axis=1))
            pca_predict = self.out_pca.predict(data.drop(self.target, axis=1))
            data['pca'] = pca_predict

        # use for those features which are gaussian distributed
        if 'mcd' in self.methods:
            self.mcd = EllipticEnvelope(contamination=0.01)
            self.mcd.fit(data.drop(self.target, axis=1))
            mcd_predict = self.mcd.predict(data.drop(self.target, axis=1))
            data['mcd'] = mcd_predict

        data['vote_outlier'] = 0
        for i in self.methods:
            data['vote_outlier'] = data['vote_outlier'] + data[i]

        self.outliers = data[data['vote_outlier'] == len(self.methods)]

        return dataset[[True if i not in self.outliers.index else False
                        for i in dataset.index]]
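# A minimal usage sketch for the transformer above (not part of the original source):
# it assumes a pandas DataFrame with a target column named 'y' and the PyOD versions
# expected by the class, and drops the rows that every selected detector flags.
import numpy as np
import pandas as pd

rng = np.random.RandomState(42)
df = pd.DataFrame(rng.normal(size=(200, 3)), columns=['f1', 'f2', 'y'])
df.iloc[:5, :2] += 10  # inject a few obvious outliers

remover = Remove_Outliers(target='y', contamination=0.05, methods=['knn', 'iso'])
df_clean = remover.fit_transform(df)
print(df.shape, df_clean.shape)  # rows voted as outliers by both detectors are removed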
class TestPCA(unittest.TestCase):
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.5
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = PCA(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        if not hasattr(self.clf, 'decision_scores_') or \
                self.clf.decision_scores_ is None:
            self.assertRaises(AttributeError, 'decision_scores_ is not set')
        if not hasattr(self.clf, 'labels_') or self.clf.labels_ is None:
            self.assertRaises(AttributeError, 'labels_ is not set')
        if not hasattr(self.clf, 'threshold_') or self.clf.threshold_ is None:
            self.assertRaises(AttributeError, 'threshold_ is not set')
        if not hasattr(self.clf, 'selected_components_') or \
                self.clf.selected_components_ is None:
            self.assertRaises(AttributeError, 'selected_components_ is not set')
        if not hasattr(self.clf, 'selected_w_components_') or \
                self.clf.selected_w_components_ is None:
            self.assertRaises(AttributeError, 'selected_w_components_ is not set')

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])
        assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass
train_normal_arr_stnd = train_normal_arr_stnd / (train_normal_arr_stnd.std(axis=0) + 1)
test_all_data_stnd = test_all_data - test_all_data.mean(axis=0)
# negatives_mat = negatives_mat - negatives_mat.mean(axis=0)
# all_data_mat = all_data_mat - all_data_mat.mean(axis=0)
test_all_data_stnd = test_all_data_stnd / (test_all_data_stnd.std(axis=0) + 1)

## Generate labels
# normal_complete_data_arr = all_data[0:len(normal_complete_data_arr)]
# test_labels = all_labels[int(0.8)*len(normal_complete_data_arr):len(all_labels)]
test_labels = all_labels[int(0.8 * len(normal_complete_data_arr)):len(all_labels)]
print("test", test_labels.count(0))

clf1 = PCA(n_components=15, n_selected_components=1, standardization=True)
clf1.fit(train_normal_arr_stnd)
predicted = clf1.predict(test_all_data_stnd)

accuracy = 0
recall = 0
for i in range(len(predicted)):
    if predicted[i] == test_labels[i] and test_labels[i] == 1:
        recall += 1
    if predicted[i] == all_labels[i]:
        accuracy += 1
print("PCA Accuracy", accuracy / len(train_normal_arr_stnd))
print("PCA Recall", recall / len(malicious_complete_data_arr))
print(clf1.singular_values_)

## OCSVM
clf1 = OCSVM(kernel='rbf', gamma=1, nu=0.4)
clf1.fit(train_normal_arr_stnd)
predicted = clf1.predict(test_all_data_stnd)
class TestPCA(unittest.TestCase):
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.5
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination)

        self.clf = PCA(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        if not hasattr(self.clf, 'decision_scores_') or \
                self.clf.decision_scores_ is None:
            self.assertRaises(AttributeError, 'decision_scores_ is not set')
        if not hasattr(self.clf, 'labels_') or self.clf.labels_ is None:
            self.assertRaises(AttributeError, 'labels_ is not set')
        if not hasattr(self.clf, 'threshold_') or self.clf.threshold_ is None:
            self.assertRaises(AttributeError, 'threshold_ is not set')
        if not hasattr(self.clf, 'selected_components_') or \
                self.clf.selected_components_ is None:
            self.assertRaises(AttributeError, 'selected_components_ is not set')
        if not hasattr(self.clf, 'selected_w_components_') or \
                self.clf.selected_w_components_ is None:
            self.assertRaises(AttributeError, 'selected_w_components_ is not set')

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # TODO: turn off performance check before a better data generation
        # method is available.
        # check performance
        # assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_evaluate(self):
        self.clf.fit_predict_evaluate(self.X_test, self.y_test)

    def tearDown(self):
        pass
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
y_true, y_pred = test_labels, clf.predict(test_all_data_stnd)
print(classification_report(y_true, y_pred))
print()

clf1 = oc_svm(kernel='rbf', nu=0.03, gamma=0.1)
set_threshold = 0.9705
# scores = cross_val_score(estimator=clf1, X=train_normal_arr_stnd, cv=5)
clf1.fit(train_normal_arr_stnd)
roc = []
recall_list = []
# clf1.threshold_ = i
predicted1 = clf1.predict(test_all_data_stnd)
# print(predicted)
print(clf1.score_samples(test_all_data_stnd))
def detect(file, amountanom, realtime):
    """
    Function to apply a very simple anomaly detector
    amountanom: The top number of anomalies we want to print
    realtime: If we want to read the conn.log file in real time (not working)
    """

    # Create a zeek reader on a given log file. Thanks brothon
    reader = bro_log_reader.BroLogReader(file, tail=realtime)

    # Create a Pandas dataframe from reader
    bro_df = pd.DataFrame(reader.readrows())

    # In case you need a label, because some models can work in a semisupervised
    # mode, put it here. For now everything is 'normal', but we are not using
    # this for detection
    bro_df['label'] = 'normal'

    # Change the datetime delta value to seconds. Scikit does not know how to work with timedeltas
    bro_df['durationsec'] = bro_df.duration.apply(lambda x: x.total_seconds())

    # Replace the rows without data (with '-') with -1. Even though this may add
    # a bias to the algorithms, it is better than not using the lines.
    bro_df['orig_bytes'] = bro_df['orig_bytes'].replace(to_replace='-', value=-1)
    bro_df['resp_bytes'] = bro_df['resp_bytes'].replace(to_replace='-', value=-1)
    bro_df['resp_pkts'] = bro_df['resp_pkts'].replace(to_replace='-', value=-1)
    bro_df['orig_ip_bytes'] = bro_df['orig_ip_bytes'].replace(to_replace='-', value=-1)
    bro_df['resp_ip_bytes'] = bro_df['resp_ip_bytes'].replace(to_replace='-', value=-1)

    # Add the columns from the log file that we know are numbers. This is only for conn.log files.
    X_train = bro_df[['durationsec', 'orig_bytes', 'id.resp_p', 'resp_bytes',
                      'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes']]

    # Our y is the label. But we are not using it now.
    y = bro_df.label

    # The X_test is where we are going to search for anomalies. In our case,
    # it is the same set of data as X_train.
    X_test = X_train

    #################
    # Select a model from below

    # ABOD class for Angle-base Outlier Detection. For an observation, the variance
    # of its weighted cosine scores to all neighbors could be viewed as the outlying score.
    # clf = ABOD()

    # LOF
    # clf = LOF()

    # CBLOF
    # clf = CBLOF()

    # LOCI
    # clf = LOCI()

    # LSCP
    # clf = LSCP()

    # MCD
    # clf = MCD()

    # OCSVM
    # clf = OCSVM()

    # PCA. Good and fast!
    clf = PCA()

    # SOD
    # clf = SOD()

    # SO_GAAL
    # clf = SO_GAAL()

    # SOS
    # clf = SOS()

    # XGBOD
    # clf = XGBOD()

    # KNN
    # Good results but slow
    # clf = KNN()
    # clf = KNN(n_neighbors=10)
    #################

    # Fit the model to the train data
    clf.fit(X_train)

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # Convert the ndarrays of scores and predictions to pandas series
    scores_series = pd.Series(y_test_scores)
    pred_series = pd.Series(y_test_pred)

    # Now use the series to add a new column to the X test
    X_test['score'] = scores_series.values
    X_test['pred'] = pred_series.values

    # Add the score to the bro_df also. So we can show it at the end
    bro_df['score'] = X_test['score']

    # Keep the positive predictions only. That is, keep only what we predict is an anomaly.
    X_test_predicted = X_test[X_test.pred == 1]

    # Keep the top X amount of anomalies
    top10 = X_test_predicted.sort_values(by='score', ascending=False).iloc[:amountanom]

    ## Print the results
    # Find the predicted anomalies in the original bro dataframe, where the rest of the data is
    df_to_print = bro_df.iloc[top10.index]
    print('\nFlows of the top anomalies')

    # Only print some columns, not all, so it is easier to read.
    df_to_print = df_to_print.drop(['conn_state', 'history', 'local_orig',
                                    'local_resp', 'missed_bytes', 'ts',
                                    'tunnel_parents', 'uid', 'label'], axis=1)
    print(df_to_print)
class TestPCA(unittest.TestCase):
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.5
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = PCA(contamination=self.contamination, random_state=42)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        assert_true(hasattr(self.clf, 'decision_scores_') and
                    self.clf.decision_scores_ is not None)
        assert_true(hasattr(self.clf, 'labels_') and
                    self.clf.labels_ is not None)
        assert_true(hasattr(self.clf, 'threshold_') and
                    self.clf.threshold_ is not None)
        assert_true(hasattr(self.clf, '_mu') and
                    self.clf._mu is not None)
        assert_true(hasattr(self.clf, '_sigma') and
                    self.clf._sigma is not None)
        assert_true(hasattr(self.clf, 'selected_components_') and
                    self.clf.selected_components_ is not None)
        assert_true(hasattr(self.clf, 'selected_w_components_') and
                    self.clf.selected_w_components_ is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])
        assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass