def LOF_PCA_for_Clustering_del(final_data_x, isUsePCA=True, ratio=0.7):
    """Label samples with LOF (optionally after PCA) and pass them to ``delete_Singular``.

    :param final_data_x: normalized feature matrix, or data that has already
        been reduced with PCA
    :param isUsePCA: when true, ``final_data_x`` is first reduced through
        ``PCA_mars.getPcaComponent``
    :param ratio: forwarded unchanged to ``PCA_mars.getPcaComponent``
    :return: the result of ``delete_Singular(samples, labels)``
    """
    # The labels are also published through the module-level ``pred_test``.
    global pred_test

    samples = final_data_x
    if isUsePCA:
        samples = PCA_mars.getPcaComponent(final_data_x,
                                           n_components=0.9,
                                           ratio=ratio)
        print('pca_x', samples.shape)

    detector = LocalOutlierFactor(n_neighbors=20,
                                  novelty=True,
                                  contamination=0.1)
    detector.fit(samples)
    # The detector is evaluated on the very samples it was fitted on.
    pred_test = detector.predict(samples)
    return delete_Singular(samples, pred_test)
class LOFNovelty:
    """LOF novelty detector that prints how well it separates clean and anomalous data."""

    def __init__(self):
        # The scaler is instantiated but never applied by this class.
        self.scaler = StandardScaler()
        self.clf = LocalOutlierFactor(novelty=True, contamination=0.1)

    def train(self, train):
        """Fit the detector on ``train``."""
        self.clf.fit(train)

    def predict(self, valid, anomaly):
        """Print confusion matrices and hit rates for ``valid`` and ``anomaly``.

        ``valid`` is treated as all-inlier (expected label 1) and ``anomaly``
        as all-outlier (expected label -1). Nothing is returned.
        """
        detector = self.clf
        y_pred_valid = detector.predict(valid)
        y_pred_outliers = detector.predict(anomaly)
        # Decision scores are evaluated but not reported.
        score_valid = detector.decision_function(valid)
        score_anomaly = detector.decision_function(anomaly)

        expected_valid = [1] * len(y_pred_valid)
        expected_outliers = [-1] * len(y_pred_outliers)
        inlier_hits = list(y_pred_valid).count(1)
        outlier_hits = list(y_pred_outliers).count(-1)

        print("LOF Novelty result")
        print(confusion_matrix(expected_valid, y_pred_valid).ravel())
        print(confusion_matrix(expected_outliers, y_pred_outliers).ravel())
        print(" Validation data:", inlier_hits / y_pred_valid.shape[0])
        print(" Outlier data:", outlier_hits / y_pred_outliers.shape[0])
def lof_scores(self, manifest_metric="euclidean", aggregation="average"):
    """Return LOF labels for the latent and manifest neighbourhoods.

    Two ``LocalOutlierFactor`` models (``novelty=True``) are fitted: one on
    ``self.Z`` and one on ``self.Z_tilde[:, :, 0]``. Each is then evaluated on
    its own fitting data and on the single instance ``self.z`` /
    ``self.z_tilde[:, :, 0]``.

    Note that the values come from ``predict`` and are therefore +1 / -1
    labels; the aggregated entries are the mean of those labels.

    :param manifest_metric: metric name for the manifest model; ``"dtw"``
        selects the ``dtw`` callable available in this module
    :param aggregation: aggregation over the neighbourhood, only
        ``"average"`` is supported
    :return: dict with the keys ``lof_latent_<aggregation>``,
        ``lof_latent_x``, ``lof_manifest_<metric>_<aggregation>`` and
        ``lof_manifest_x_<metric>``
    :raises ValueError: if ``aggregation`` is not supported
    """
    # Fail fast: reject an unsupported aggregation before fitting any model.
    if aggregation != "average":
        raise ValueError("Aggregation method not valid.")

    metric = dtw if manifest_metric == "dtw" else manifest_metric

    # The latent model always uses the euclidean metric.
    latent_clf = LocalOutlierFactor(metric="euclidean", novelty=True)
    latent_clf.fit(self.Z)
    lof_score_latent = latent_clf.predict(self.Z).mean()
    lof_score_latent_x = latent_clf.predict(self.z)[0]

    manifest_neighbourhood = self.Z_tilde[:, :, 0]
    manifest_clf = LocalOutlierFactor(metric=metric, novelty=True)
    manifest_clf.fit(manifest_neighbourhood)
    lof_score_manifest = manifest_clf.predict(manifest_neighbourhood).mean()
    lof_score_manifest_x = manifest_clf.predict(self.z_tilde[:, :, 0])[0]

    return {
        "lof_latent_" + aggregation: lof_score_latent,
        "lof_latent_x": lof_score_latent_x,
        "lof_manifest_" + manifest_metric + "_" + aggregation:
        lof_score_manifest,
        "lof_manifest_x_" + manifest_metric: lof_score_manifest_x
    }
class LocalOutlierFactor_Classifier:
    """Thin wrapper around ``LocalOutlierFactor`` used for novelty detection.

    The estimator is created with ``novelty=True`` because ``predict`` and
    ``decision_function`` on unseen data are only available in that mode.
    """

    def __init__(self, save_path):
        """Create the output directory and the underlying estimator.

        :param save_path: parent directory; a ``LocalOutlierFactor``
            sub-directory is created inside it if missing
        """
        # Default output location.
        self.save_path = os.path.join(save_path, 'LocalOutlierFactor')
        os.makedirs(self.save_path, exist_ok=True)
        self.n_neighbors = 40
        # Proportion of outliers in the data set; it defines the threshold of
        # the decision function when fitting.
        self.contamination = 0.1
        self.classifier = LocalOutlierFactor(n_neighbors=self.n_neighbors,
                                             contamination=self.contamination,
                                             novelty=True)

    def fit_model(self, train_data_matrix, test_data_matrix, test_true_label):
        """Fit on the training matrix and print an evaluation on the test matrix."""
        self.classifier.fit(train_data_matrix)
        y_pred_label = self.classifier.predict(test_data_matrix)
        # Local names avoid shadowing the sklearn functions of the same name.
        accuracy, report, _ = sklearn_evaluation(test_true_label, y_pred_label)
        print('Accuracy: {} \nClassification Report:\n{}\n'.format(
            accuracy, report))
        sys.stdout.flush()

    def test_model(self, test_data, test_label):
        """Evaluate the fitted model on labelled test data.

        :param test_data: samples to score
        :param test_label: expected labels, e.g. ``[1, 1, -1, ...]``
        :return: tuple ``(scores_pred, y_pred_test, n_errors)`` holding the
            decision-function values, the predicted labels and the number of
            mismatches against ``test_label``
        """
        scores_pred = self.classifier.decision_function(test_data)
        y_pred_test = self.classifier.predict(test_data)
        n_errors = (y_pred_test != test_label).sum()
        return scores_pred, y_pred_test, n_errors
def _localoutlierfactor(*, train, test, x_predict=None, metrics, n_neighbors=20, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, contamination='auto', novelty=False, n_jobs=None): """ For more info visit : https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html#sklearn.neighbors.LocalOutlierFactor """ model = LocalOutlierFactor(n_neighbors=n_neighbors, algorithm=algorithm, leaf_size=leaf_size, metric=metric, p=p, metric_params=metric_params, contamination=contamination, novelty=novelty, n_jobs=n_jobs) model.fit(train[0], train[1]) model_name = 'Local Outlier Factor' y_hat = model.predict(test[0]) if metrics == 'accuracy': accuracy = accuracy_score(test[1], y_hat) if metrics == 'f1': accuracy = f1_score(test[1], y_hat) if metrics == 'jaccard': accuracy = jaccard_score(test[1], y_hat) if x_predict is None: return (model_name, accuracy, None) y_predict = model.predict(x_predict) return (model_name, accuracy, y_predict)
def local_outlier_detection(training_vectors, test_vectors_clean, test_vectors_anomalous):
    """Fit LOF in novelty mode and label the three given sample sets.

    Returns ``(result_clean, result_anomalous, result_training)``; each entry
    holds 1 for inliers and -1 for outliers.
    """
    print("Starting Local Outlier Fitting...")
    # novelty=True makes the fitted model usable on unseen samples.
    detector = LocalOutlierFactor(n_neighbors=20,
                                  algorithm='auto',
                                  contamination='auto',
                                  novelty=True,
                                  n_jobs=-1)
    print("Fitting with Parameters: ", detector.get_params())
    detector.fit(training_vectors)
    result_training = detector.predict(training_vectors)
    print("Fitting successful!")

    print("Starting Prediction...")
    result_clean = detector.predict(test_vectors_clean)
    result_anomalous = detector.predict(test_vectors_anomalous)
    print("Predicting successful!")
    print("**************************")
    return result_clean, result_anomalous, result_training
def predict_LocalOutlierFactor(X, fraction_outlier):
    """Fit LOF on ``X`` and write two plots of its inlier/outlier labels.

    :param X: samples with two features per row (grid points are fed to the
        model as pairs)
    :param fraction_outlier: passed to the estimator as ``contamination``
    :return: None; the results are the files
        ``5_pred_LocalOutlierFactor_density.png`` and
        ``5_pred_LocalOutlierFactor.png`` written through ``P``
    """
    xx, yy = get_meshgrid(X)
    x1, x2 = xx.min(), xx.max()
    y1, y2 = yy.min(), yy.max()
    # Plot margin: 10% of the x extent. The original computed (x2 - x2),
    # which is always zero and silently disabled the margin.
    d = (x2 - x1) * 0.1

    A = LocalOutlierFactor(contamination=fraction_outlier, novelty=True)
    A.fit(X)
    Y = A.predict(X)

    # Label the whole grid in one call instead of one predict per point.
    grid_points = numpy.c_[xx.flatten(), yy.flatten()]
    grid_confidence = A.predict(grid_points).astype(int).reshape(xx.shape)

    x_range = [x1 - d, x2 + d]
    y_range = [y1 - d, y2 + d]
    P.plot_contourf(X[Y > 0], X[Y <= 0], xx, yy, grid_confidence,
                    x_range=x_range, y_range=y_range,
                    filename_out='5_pred_LocalOutlierFactor_density.png')
    P.plot_2D_features_multi_Y(X, -Y,
                               x_range=x_range, y_range=y_range,
                               filename_out='5_pred_LocalOutlierFactor.png')
    return
def LOF_ano_score():
    """Print LOF labels for the module-level ``test_normal`` and ``test_ano`` sets.

    The detector is fitted on the module-level ``train_normal``.
    """
    print("each ano score by LOF predict method range -1 to +1")
    detector = LocalOutlierFactor(n_neighbors=10,
                                  novelty=True,
                                  contamination=0.1)
    detector.fit(train_normal)
    # Predicted label per sample: -1 is anomaly, 1 is normal.
    test_a_pred = detector.predict(test_normal)
    # Prediction for the anomalous test data.
    test_b_pred = detector.predict(test_ano)
    print(test_a_pred, test_b_pred)
def lof_predict(train, test, test_label):
    """Fit LOF on ``train``, label ``test`` and plot the confusion matrix.

    The plot is produced by ``plot_confusion_matrix`` with the class names
    ``['anomaly', 'normal']``; nothing is returned.
    """
    from sklearn.neighbors import LocalOutlierFactor

    detector = LocalOutlierFactor(contamination=0.01, novelty=True)
    detector.fit(train)
    predicted = detector.predict(test)
    plot_confusion_matrix(test_label, predicted, ['anomaly', 'normal'],
                          'LOF Confusion-Matrix')
def test_local_outlier_factor_metric_cdist(self):
    """ONNX conversion with the 'cdist' optimisation matches sklearn for both metrics."""
    for metric in ['euclidean', 'sqeuclidean']:
        with self.subTest(metric=metric):
            lof = LocalOutlierFactor(n_neighbors=2, novelty=True,
                                     metric=metric)
            data = np.array(
                [[-1.1, -1.2], [0.3, 0.2], [0.5, 0.4], [100., 99.]],
                dtype=np.float32)
            model = lof.fit(data)
            model_onnx = to_onnx(model, data, target_opset=TARGET_OPSET,
                                 options={'optim': 'cdist'})

            # Score shifted copies so the inputs differ from the fitted data.
            data = data.copy()
            data[:, 0] += 0.1

            sess = InferenceSession(model_onnx.SerializeToString())
            output_names = [out.name for out in sess.get_outputs()]
            self.assertEqual(output_names, ['label', 'scores'])

            got = sess.run(None, {'X': data})
            self.assertEqual(len(got), 2)

            expected_label = lof.predict(data)
            expected_decif = lof.decision_function(data)
            assert_almost_equal(expected_label, got[0].ravel())
            assert_almost_equal(expected_decif, got[1].ravel(), decimal=4)
def test_local_outlier_factor_cdist_p3(self):
    """ONNX conversion with p=3 uses CDist and, when runnable, matches sklearn."""
    lof = LocalOutlierFactor(n_neighbors=2, novelty=True, p=3)
    data = np.array([[-1.1, -1.2], [0.3, 0.2], [0.5, 0.4], [100., 99.]],
                    dtype=np.float32)
    model = lof.fit(data)
    model_onnx = to_onnx(model, data, target_opset=TARGET_OPSET,
                         options={'optim': 'cdist'})
    self.assertIn('CDist', str(model_onnx))

    data = data.copy()
    data[:, 0] += 0.1

    try:
        sess = InferenceSession(model_onnx.SerializeToString())
    except InvalidGraph as e:
        # A runtime whose CDist operator has no 'p' attribute cannot load the
        # graph; the test stops here in that case.
        if "Unrecognized attribute: p for operator CDist" in str(e):
            return
        raise e

    output_names = [out.name for out in sess.get_outputs()]
    self.assertEqual(output_names, ['label', 'scores'])

    got = sess.run(None, {'X': data})
    self.assertEqual(len(got), 2)

    expected_label = lof.predict(data)
    expected_decif = lof.decision_function(data)
    assert_almost_equal(expected_label, got[0].ravel())
    assert_almost_equal(expected_decif, got[1].ravel())
def fix_outliers_by_LocalOutlierFactor(X_train, y_train, X_test, y_test):
    """Drop training rows that LOF flags as outliers in any single column.

    Each column of ``X_train`` with more than two distinct values is fitted
    and labelled on its own; a row is removed from ``X_train`` and ``y_train``
    if any column labels it -1. Both are modified in place and re-indexed.

    The test set is deliberately left untouched so the downstream model is
    evaluated on every kind of input. ``X_test_outlier_index`` therefore stays
    empty and the test-set branch, guarded by the module-level flag
    ``fix_outliers_in_test_data``, only resets the index.

    :return: ``(X_train, y_train, X_test, y_test)``
    """
    from sklearn.neighbors import LocalOutlierFactor

    lof = LocalOutlierFactor(novelty=True)
    train_positions = set()
    X_test_outlier_index = set()

    for var in X_train.columns:
        if X_train[var].nunique() > 2:
            column = X_train[var].values.reshape(-1, 1)
            lof.fit(column)
            train_positions.update(np.where(lof.predict(column) == -1)[0])

    # np.where yields row positions while DataFrame.drop expects index
    # labels, so translate positions to labels first. This matters whenever
    # the frames do not carry a default 0..n-1 index.
    positions = np.array(sorted(train_positions), dtype=np.intp)
    X_train.drop(X_train.index[positions], inplace=True)
    y_train.drop(y_train.index[positions], inplace=True)
    X_train.reset_index(drop=True, inplace=True)
    y_train.reset_index(drop=True, inplace=True)

    if fix_outliers_in_test_data == True:
        X_test.drop(X_test_outlier_index, inplace=True)
        y_test.drop(X_test_outlier_index, inplace=True)
        X_test.reset_index(drop=True, inplace=True)
        y_test.reset_index(drop=True, inplace=True)

    return X_train, y_train, X_test, y_test
def main(args):
    """Train an LOF novelty detector from a CSV file, report on it and save it.

    Reads ``args.features`` (comma-separated column names), ``args.label``,
    ``args.train_data_path``, ``args.n_neighbors``, ``args.n_jobs`` and
    ``args.model_path``.
    """
    np.random.seed(0)

    # Load the data and hold out 20% of it for the report.
    columns = args.features.split(",")
    raw_df = pd.read_csv(args.train_data_path)
    data = raw_df[columns]
    targets = raw_df[args.label]
    x_train, x_test, y_train, y_test = train_test_split(data,
                                                        targets,
                                                        train_size=0.8)

    # novelty=True so the fitted model can label unseen samples.
    lof = LocalOutlierFactor(n_neighbors=args.n_neighbors,
                             novelty=True,
                             n_jobs=args.n_jobs)
    lof.fit(x_train)

    y_pre = lof.predict(x_test)
    report = metrics.classification_report(
        y_test, y_pre, target_names=["outlier", "normValue"])
    print(report)

    ModelUtils.save_model(columns, lof, args.model_path)
def remove_noises(data):
    """Return a copy of the rows of ``data`` that LOF labels as inliers."""
    detector = LocalOutlierFactor(n_neighbors=15, novelty=True)
    detector.fit(data)
    labels = detector.predict(data)
    inlier_mask = labels == 1
    return data[inlier_mask].copy()
def novelty_detection():
    """Fit LOF on a leading slice of the training data and print a confusion matrix.

    Test labels up to 6 count as known (1); every other label counts as
    novel (-1).
    """
    x_train, y_train, x_test, y_test = load_data()

    class_count = len(np.unique(y_train))
    num_per_class = int(x_train.shape[0] / class_count)
    num_known_classes = 7  # number of known classes
    # The remaining classes of the test set act as the novel classes.
    known = np.array([0, 1, 2, 3, 4, 5, 6])
    num_train = num_per_class * num_known_classes
    x_train = x_train[:num_train]

    # Collapse the class labels into 1 (known) / -1 (novel).
    int_labels = y_test.astype(np.int32)
    y_test = np.where(int_labels <= 6, 1, -1).astype(np.int32)
    print(np.unique(y_test))

    # Novelty detection with LOF.
    lof = LocalOutlierFactor(n_neighbors=20, novelty=True, n_jobs=-1)
    print("-----fiting 训练集-----")
    lof.fit(x_train)
    print("-----预测测试集-----")
    y_pred = lof.predict(x_test)
    print(confusion_matrix(y_test, y_pred))
def fit_model(self):
    """Fit one LOF novelty detector per entry of ``self.shot_data_array_list``.

    The fitted detectors are stored, in input order, in ``self.lof_list``.
    The original also ran ``predict`` on each training array and discarded
    the result; that unused pass has been removed.
    """
    self.lof_list = []
    for shot_data_array in self.shot_data_array_list:
        lof = LocalOutlierFactor(novelty=True)
        lof.fit(shot_data_array)
        self.lof_list.append(lof)
class LOF(AnomalyDetector):
    """Anomaly detector backed by scikit-learn's local outlier factor."""

    def __init__(self):
        self._model = LocalOutlierFactor(novelty=True)

    def learn(self, data):
        """Fit the underlying model on ``data``."""
        self._model.fit(data)

    def predict(self, data, obs):
        """Return a boolean array that is True where ``obs`` is labelled an outlier.

        ``data`` is accepted but not used.
        """
        labels = self._model.predict(obs)
        return labels == -1

    def get_score(self, data, epoch=None):
        """Return ``score_samples`` for ``data``; ``epoch`` is not used."""
        return self._model.score_samples(data)

    def anomalies_have_high_score(self):
        """Report that anomalies do not receive high scores."""
        return False

    def get_memory_size(self):
        """Always report a memory size of 0."""
        return 0

    def save(self, filename):
        """Persist the underlying model with joblib."""
        joblib.dump(self._model, filename)

    def load(self, filename):
        """Replace the underlying model with the one stored in ``filename``."""
        self._model = joblib.load(filename)
class LofDetection(BaseEstimator, TransformerMixin):
    """Transformer that blanks out LOF outlier rows and mean-imputes them.

    With ``contamination == 0`` the transformer is a no-op that returns a deep
    copy of its input.
    """

    def __init__(self, contamination=0):
        self.contamination = contamination

    def fit(self, X, y=None):
        """Fit the LOF model unless the transformer is disabled."""
        if self.contamination == 0:
            return self

        self.lof = LocalOutlierFactor(contamination=self.contamination,
                                      novelty=True)
        fit_args = (X,) if y is None else (X, y)
        self.lof.fit(*fit_args)
        return self

    def transform(self, X_):
        """Return a copy of ``X_`` whose outlier rows are replaced by imputed values."""
        X = deepcopy(X_)
        if self.contamination == 0:
            return X

        outlier_rows = self.lof.predict(X) == -1
        X[outlier_rows, :] = np.nan
        # The imputer is fitted on the data being transformed.
        return SimpleImputer().fit_transform(X)
def predict_LOF(x_train, x_test, x_valid, dim):
    """Fit LOF on train+test and count label errors per data set.

    :param dim: 2 flattens the last two axes of every array and loads the CWT
        file of unknown data; any other value keeps the first two axes and
        loads the plain file
    :return: ``(errs_train, errs_test, errs_un)`` - the number of outlier
        labels on train and test, and the number of inlier labels on the
        unknown data. The validation count is computed but not returned.
    """

    def _flatten_2d(x):
        return np.reshape(x, (x.shape[0], x.shape[1] * x.shape[2]))

    def _flatten_1d(x):
        return np.reshape(x, (x.shape[0], x.shape[1]))

    clf = LocalOutlierFactor(n_neighbors=1,
                             contamination=0.5,
                             novelty=True,
                             n_jobs=5)

    if dim == 2:
        unknown_path = 'ptb_xl_data/ptb_xl_3490_15_cwt.pkl'
        flatten = _flatten_2d
    else:
        unknown_path = 'ptb_xl_data/ptb_xl_3490_15.pkl'
        flatten = _flatten_1d

    ux_, uy_ = load_preprocessed_data(unknown_path)
    x_train_, x_test_, x_valid_, ux_ = [
        flatten(block) for block in (x_train, x_test, x_valid, ux_)
    ]

    # The model is fitted on train and test samples together.
    x_all = np.concatenate((x_train_, x_test_), axis=0)
    clf.fit(x_all)

    y_pred = clf.predict(x_train_)
    errs_train = sum(y_pred == -1)
    y_pred_ted = clf.predict(x_test_)
    errs_test = sum(y_pred_ted == -1)
    y_pred_val = clf.predict(x_valid_)
    errs_val = sum(y_pred_val == -1)

    # On the unknown data an inlier label (1) is the error.
    y_pred_ud = clf.predict(ux_)
    errs_un = sum(y_pred_ud == 1)
    return errs_train, errs_test, errs_un
def _outlier_detection_lof(table, input_cols, n_neighbors=20, result_type='add_prediction', new_column_name='is_outlier'):
    """Label the rows of ``table`` with LOF and build the result table and model.

    :param table: input table; it is copied, not modified
    :param input_cols: columns used as features
    :param n_neighbors: number of neighbours for the LOF model
    :param result_type: ``'add_prediction'`` keeps all rows plus the label
        column, ``'remove_outliers'`` keeps inlier rows without the label
        column, ``'both'`` keeps inlier rows with the label column
    :param new_column_name: name of the ``'in'`` / ``'out'`` label column
    :return: dict with the keys ``'out_table'`` and ``'model'``
    """
    out_table = table.copy()
    features = out_table[input_cols]

    lof_model = LocalOutlierFactor(n_neighbors=n_neighbors,
                                   algorithm='auto',
                                   leaf_size=30,
                                   metric='minkowski',
                                   p=2,
                                   novelty=True,
                                   contamination=0.1)
    lof_model.fit(features)
    out_table[new_column_name] = [
        'in' if label == 1 else 'out'
        for label in lof_model.predict(features)
    ]

    if result_type == 'remove_outliers':
        out_table = out_table[out_table[new_column_name] == 'in']
        out_table = out_table.drop(new_column_name, axis=1)
    elif result_type == 'both':
        out_table = out_table[out_table[new_column_name] == 'in']
    elif result_type != 'add_prediction':
        raise_runtime_error("Please check 'result_type'.")

    params = {
        'Input Columns': input_cols,
        'Result Type': result_type,
        'Number of Neighbors': n_neighbors,
    }

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Outlier Detection (Local Outlier Factor) Result
    | ### Parameters
    |
    | {display_params}
    |
    """.format(display_params=dict2MD(params))))

    model = _model_dict('outlier_detection_lof')
    model.update({
        'params': params,
        'lof_model': lof_model,
        'input_cols': input_cols,
        'result_type': result_type,
        'num_neighbors': n_neighbors,
        '_repr_brtc_': rb.get(),
    })

    return {'out_table': out_table, 'model': model}
def lof_each_ano_score(train_a, test_a, test_b):
    """Fit LOF on ``train_a`` and print the labels of ``test_a`` and ``test_b``."""
    detector = LocalOutlierFactor(n_neighbors=5,
                                  novelty=True,
                                  contamination=0.1)
    detector.fit(train_a)
    # Predicted label per sample: -1 is anomaly, 1 is normal.
    test_a_pred = detector.predict(test_a)
    # Prediction for the second test set.
    test_b_pred = detector.predict(test_b)
    print(test_a_pred, test_b_pred)
def main():
    """Label every row of ``./csv_files/*`` with LOF and save the data and model.

    Steps: load and concatenate all CSV files, label-encode the nominal
    columns, drop constant columns and perfectly correlated duplicates, fit a
    ``LocalOutlierFactor`` on the result and label the same rows with it.
    Writes ``./processed_csv/processed.csv`` and pickles the model to
    ``model.sav``.
    """
    # Read all the csv files.
    csvPath = "./csv_files"
    csvFiles = [f for f in listdir(csvPath) if isfile(join(csvPath, f))]
    dfs = []
    for cv in csvFiles:
        print("CSV Processing: "+cv)
        dfs.append(pd.read_csv(csvPath+'/'+cv, index_col=False))
    df = pd.concat(dfs, ignore_index=True)

    # Turn every nominal column into numeric codes.
    nom_cols = ['ip_flags', 'tcp_udp_flags', 'payload']
    for c in nom_cols:
        df[c] = LabelEncoder().fit_transform(df[c])

    # Remove the columns with zero standard deviation.
    df = df.loc[:, df.std() > 0.0]

    # Absolute correlation matrix; keep only the upper triangle so each pair
    # of columns is inspected once. The builtin ``bool`` is used because the
    # ``np.bool`` alias was removed in NumPy 1.24.
    corr_matrix = df.corr().abs()
    upper = corr_matrix.where(
        np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    # Drop every column that is perfectly correlated (|r| == 1) with an
    # earlier one.
    to_drop = [column for column in upper.columns if any(upper[column] == 1)]
    df = df.drop(columns=to_drop)
    print(df.head())

    # Use the local outlier factor to find the anomalies
    # (-1: anomaly, 1: normal).
    clf = LocalOutlierFactor(n_neighbors=2, contamination='auto', novelty=True)
    clf.fit(df)
    df['label'] = clf.predict(df)

    totalNormal = len(df[df['label'] == 1])
    totalAnomalies = len(df[df['label'] == -1])
    print("Normal: "+str(totalNormal))
    print("Anomaly: "+str(totalAnomalies))
    # Share of rows labelled normal.
    print('Accuracy: '+str(totalNormal/float(totalNormal+totalAnomalies)))
    df.to_csv('./processed_csv/'+'processed.csv', index=False)

    # Save the model; the context manager closes the file handle.
    filename = 'model.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(clf, model_file)
def Compute_LOF(neighbors, x_train, x_test):
    """Fit LOF on ``x_train`` and return the labels it predicts for ``x_test``.

    :param neighbors: number of neighbours for the LOF model
    :param x_train: training data (np array)
    :param x_test: data to label (np array)
    :return: array of labels from ``predict`` (1 inlier, -1 outlier)
    """
    detector = LocalOutlierFactor(n_neighbors=neighbors,
                                  novelty=True,
                                  contamination=0.01)
    detector.fit(x_train)
    return detector.predict(x_test)
def run_lof(self, X_train, X_test, features_type):
    """LOF: identify density-based local outliers on the quantitative features.

    Only the columns listed in ``features_type['quantitative']`` are used and
    rows with missing values in them are ignored (never flagged).

    Return
    ------
    outliers_train, outliers_test : numpy.array(boolean)
        Boolean arrays aligned with ``X_train`` / ``X_test`` that indicate
        whether a row is an outlier
    """
    quantitative = features_type['quantitative']
    train_rows = X_train.copy()[quantitative].dropna()
    test_rows = X_test.copy()[quantitative].dropna()

    # Standardize: features on different scales would distort the distance
    # measure. The scaler is fitted on the training rows only.
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(train_rows)
    test_scaled = scaler.transform(test_rows)

    lof = LocalOutlierFactor(novelty=True)
    lof.fit(train_scaled)
    train_labels = lof.predict(train_scaled)
    test_labels = lof.predict(test_scaled)

    # Map the flagged rows back onto the full, un-dropped indexes.
    flagged_train = train_rows[train_labels == -1].index
    flagged_test = test_rows[test_labels == -1].index
    outliers_train = X_train.index.isin(flagged_train)
    outliers_test = X_test.index.isin(flagged_test)
    return outliers_train, outliers_test
class LOF(object):
    """Local Outlier Factor wrapper that reports anomalies as 1 and normal samples as 0."""

    def __init__(self, n_neighbors=20, algorithm='auto', metric='minkowski'):
        """ Local Outlier Factor

        Arguments
        ---------
            n_neighbors : int, default=20
                Number of neighbors to use by default for kneighbors queries.
                If n_neighbors is larger than the number of samples provided,
                all samples will be used.
            algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
                Algorithm used to compute the nearest neighbors.
            metric : str or callable, default='minkowski'
                Metric used for the distance computation. Any metric from
                scikit-learn or scipy.spatial.distance can be used.

        For more information, please visit
        https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html
        """
        self.model = LocalOutlierFactor(n_neighbors=n_neighbors,
                                        algorithm=algorithm,
                                        metric=metric,
                                        contamination=0.00001,
                                        novelty=True)

    def fit(self, x):
        """
        Arguments
        ---------
            x: ndarray, the event count matrix; it is flattened to one row
               per instance before fitting
        """
        print('LOF Fit')
        flat = x.reshape((len(x), -1))
        self.model.fit(flat)

    def predict(self, x):
        """ Predict anomalies with the fitted LOF model

        Arguments
        ---------
            x: the input event count matrix

        Returns
        -------
            y_pred: ndarray, the predicted label vector of shape
                    (num_instances,); 1 marks an anomaly, 0 a normal instance
        """
        print('LOF Predict')
        flat = x.reshape((len(x), -1))
        raw_labels = self.model.predict(flat)
        # sklearn labels inliers with a positive value; flip to 0/1 flags.
        return np.where(raw_labels > 0, 0, 1)
def LOF_PCA_for_Clustering_more(final_data_x, isUsePCA=True, ratio_for_pca=0.7, ratio_for_lof=0.7):
    """Label samples with LOF (optionally after PCA) and pass them to ``replace_Singular``.

    :param final_data_x: normalized feature matrix, or data that has already
        been reduced with PCA
    :param isUsePCA: when true, reduce with ``PCA_mars.getPcaComponent`` and
        split the result into an LOF part and a held-out part
    :param ratio_for_pca: forwarded to ``PCA_mars.getPcaComponent`` as ``ratio``
    :param ratio_for_lof: leading share of the PCA output given to LOF; values
        of 1.0 or more use everything but the last row
    :return: with PCA, the tuple ``(replace_Singular(...), test_x)``; without
        PCA, only the result of ``replace_Singular(...)``
    """
    # The labels are also published through the module-level ``pred_test``.
    global pred_test

    if not isUsePCA:
        clf = LocalOutlierFactor(n_neighbors=20,
                                 novelty=True,
                                 contamination=0.1)
        clf.fit(final_data_x)
        pred_test = clf.predict(final_data_x)
        return replace_Singular(final_data_x, pred_test)

    pca_x = PCA_mars.getPcaComponent(final_data_x,
                                     n_components=0.9,
                                     ratio=ratio_for_pca)
    print('pca_x', pca_x.shape)

    # In both cases the last row of the PCA output is left out.
    if ratio_for_lof >= 1.0:
        lof_data = pca_x[:-1]
        test_x = []
    else:
        split_at = int(len(pca_x) * ratio_for_lof)
        lof_data = pca_x[:split_at]
        test_x = pca_x[split_at:-1]

    clf = LocalOutlierFactor(n_neighbors=20, novelty=True, contamination=0.1)
    clf.fit(lof_data)
    pred_test = clf.predict(lof_data)
    return (replace_Singular(lof_data, pred_test), test_x)
def schedule(self, event_input_name, event_input_value, data_from_pickle, X_predict, X_train, y_train, n_neighbors, algorithm, leaf_size, metric, p, metric_params, contamination, novelty, n_jobs):
    """Handle an ``INIT`` or ``RUN`` event for the LOF block.

    ``INIT`` returns the current state. ``RUN`` without ``data_from_pickle``
    updates the stored hyper-parameters from every non-``None`` argument, then
    fits a new classifier on ``X_train`` / ``y_train``. ``RUN`` with
    ``data_from_pickle`` uses that object as the classifier and predicts the
    single sample ``X_predict``.

    :return: ``[event_input_value, None, classifier, prediction,
        score_samples]`` for ``INIT``, ``[None, event_input_value, classifier,
        prediction, score_samples]`` for ``RUN``, and ``None`` for any other
        event name
    """
    if event_input_name == 'INIT':
        return [event_input_value, None, self.classifier, self.prediction,
                self.score_samples]

    if event_input_name != 'RUN':
        return None

    # Identity test: ``== None`` would invoke the object's own ``__eq__``.
    if data_from_pickle is None:
        # Override the stored defaults only for the values that were given.
        if n_neighbors is not None:
            self.n_neighbors = int(n_neighbors)
        if algorithm is not None:
            self.algorithm = algorithm
        if leaf_size is not None:
            self.leaf_size = int(leaf_size)
        if metric is not None:
            self.metric = metric
        if p is not None:
            self.p = int(p)
        if metric_params is not None:
            self.metric_params = metric_params
        if contamination is not None:
            if contamination == 'auto':
                self.contamination = 'auto'
            else:
                self.contamination = float(contamination)
        if novelty is not None:
            self.novelty = novelty
        if n_jobs is not None:
            self.n_jobs = int(n_jobs)

        classif = LocalOutlierFactor(n_neighbors=self.n_neighbors,
                                     algorithm=self.algorithm,
                                     leaf_size=self.leaf_size,
                                     metric=self.metric,
                                     p=self.p,
                                     metric_params=self.metric_params,
                                     contamination=self.contamination,
                                     novelty=self.novelty,
                                     n_jobs=self.n_jobs)
        classif.fit(np.array(X_train).astype(np.float64),
                    np.array(y_train).astype(np.float64))
        self.classifier = classif
        return [None, event_input_value, self.classifier, self.prediction,
                self.score_samples]

    classif = data_from_pickle
    self.classifier = classif
    # The input is reshaped to a single row, i.e. one sample per call.
    sample = np.array(X_predict).astype(np.float64).reshape(1, -1)
    self.prediction = classif.predict(sample)
    self.score_samples = classif.score_samples(sample)
    return [None, event_input_value, self.classifier, self.prediction,
            self.score_samples]
def perform_LOF(self, n_neighbors=10, target_names=None, novelty=True):
    """LOF algorithm.

    Fits on ``self.X_train``, labels ``self.X_test`` and compares the labels
    with ``self.t_test``.

    :param n_neighbors: number of data neighbours used, defaults to 10
    :type n_neighbors: int, optional
    :param target_names: class names forwarded to the classification report,
        defaults to None
    :param novelty: param necessary to detect anomalies, defaults to True
    :type novelty: bool, optional
    :return: classification report with results
    :rtype: str
    """
    detector = LocalOutlierFactor(n_neighbors=n_neighbors, novelty=novelty)
    detector.fit(self.X_train)
    predicted = detector.predict(self.X_test)
    return classification_report(self.t_test, predicted,
                                 target_names=target_names)
def test_local_outlier_factor_double(self):
    """ONNX conversion of a float64 LOF model reproduces sklearn's outputs."""
    lof = LocalOutlierFactor(n_neighbors=2, novelty=True)
    data = np.array([[-1.1, -1.2], [0.3, 0.2], [0.5, 0.4], [100., 99.]],
                    dtype=np.float64)
    model = lof.fit(data)
    model_onnx = to_onnx(model, data, target_opset=TARGET_OPSET)

    sess = InferenceSession(model_onnx.SerializeToString())
    output_names = [out.name for out in sess.get_outputs()]
    self.assertEqual(output_names, ['label', 'scores'])

    got = sess.run(None, {'X': data})
    self.assertEqual(len(got), 2)

    expected_label = lof.predict(data)
    expected_decif = lof.decision_function(data)
    assert_almost_equal(expected_label, got[0].ravel())
    assert_almost_equal(expected_decif, got[1].ravel())
def train_p2(datasets, model_path):
    """Fit LOF on ``datasets``, print the inlier share and save the model.

    :param datasets: data set used both for fitting and for labelling
    :param model_path: path the fitted model is dumped to with joblib
    :return: the labels predicted for ``datasets``
    """
    # n_neighbors is fixed at 35 here.
    lof = LocalOutlierFactor(n_neighbors=35, novelty=True)
    lof.fit(datasets)
    results = lof.predict(datasets)
    # Share of samples labelled as inliers.
    inlier_share = np.sum(results > 0) / datasets.shape[0]
    print(inlier_share)
    joblib.dump(lof, model_path)
    return results
# Generate normal (not abnormal) training observations X = 0.3 * np.random.randn(100, 2) X_train = np.r_[X + 2, X - 2] # Generate new normal (not abnormal) observations X = 0.3 * np.random.randn(20, 2) X_test = np.r_[X + 2, X - 2] # Generate some abnormal novel observations X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2)) # fit the model for novelty detection (novelty=True) clf = LocalOutlierFactor(n_neighbors=20, novelty=True, contamination=0.1) clf.fit(X_train) # DO NOT use predict, decision_function and score_samples on X_train as this # would give wrong results but only on new unseen data (not used in X_train), # e.g. X_test, X_outliers or the meshgrid y_pred_test = clf.predict(X_test) y_pred_outliers = clf.predict(X_outliers) n_error_test = y_pred_test[y_pred_test == -1].size n_error_outliers = y_pred_outliers[y_pred_outliers == 1].size # plot the learned frontier, the points, and the nearest vectors to the plane Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) plt.title("Novelty Detection with LOF") plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 7), cmap=plt.cm.PuBu) a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='darkred') plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors='palevioletred') s = 40 b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=s, edgecolors='k')