Exemplo n.º 1
0
def LOF_PCA_for_Clustering_del(final_data_x, isUsePCA=True, ratio=0.7):
    """Label samples with a novelty-mode LOF and pass them to ``delete_Singular``.

    :param final_data_x: the initial normalised x values, or values that have
        already been processed with PCA
    :param isUsePCA: whether to reduce the dimensionality with PCA first
    :param ratio: forwarded to ``PCA_mars.getPcaComponent``; only used when
        ``isUsePCA`` is true
    :return: the result of ``delete_Singular`` applied to the (possibly
        PCA-reduced) data and the LOF labels
    """
    global pred_test

    if isUsePCA:
        samples = PCA_mars.getPcaComponent(final_data_x,
                                           n_components=0.9,
                                           ratio=ratio)
        print('pca_x', samples.shape)
    else:
        samples = final_data_x

    detector = LocalOutlierFactor(n_neighbors=20,
                                  novelty=True,
                                  contamination=0.1)
    detector.fit(samples)

    # The labels are also published through the module-level ``pred_test``.
    pred_test = detector.predict(samples)
    return delete_Singular(samples, pred_test)
Exemplo n.º 2
0
class LOFNovelty:
    """Novelty-mode LocalOutlierFactor wrapper that prints a small report."""

    def __init__(self):
        # The scaler is created but not applied by train() or predict().
        self.scaler = StandardScaler()
        self.clf = LocalOutlierFactor(novelty=True, contamination=0.1)

    def train(self, train):
        """Fit the detector on ``train``."""
        self.clf.fit(train)

    def predict(self, valid, anomaly):
        """Predict on both sets and print confusion matrices and hit rates.

        ``valid`` is compared against an all ``1`` reference and ``anomaly``
        against an all ``-1`` reference. Nothing is returned.
        """
        valid_labels = self.clf.predict(valid)
        anomaly_labels = self.clf.predict(anomaly)
        # Decision scores are computed but not reported at the moment.
        score_valid = self.clf.decision_function(valid)
        score_anomaly = self.clf.decision_function(anomaly)

        print("LOF Novelty result")

        valid_reference = [1] * len(valid_labels)
        print(confusion_matrix(valid_reference, valid_labels).ravel())

        anomaly_reference = [-1] * len(anomaly_labels)
        print(confusion_matrix(anomaly_reference, anomaly_labels).ravel())

        # Fraction of validation samples predicted as inliers.
        inlier_rate = list(valid_labels).count(1) / valid_labels.shape[0]
        print(" Validation data:", inlier_rate)

        # Fraction of anomalous samples predicted as outliers.
        outlier_rate = list(anomaly_labels).count(-1) / anomaly_labels.shape[0]
        print(" Outlier data:", outlier_rate)
Exemplo n.º 3
0
    def lof_scores(self, manifest_metric="euclidean", aggregation="average"):
        """Fit LOF detectors on ``self.Z`` and ``self.Z_tilde[:, :, 0]`` and summarise them.

        For each space a novelty-mode LocalOutlierFactor is fitted, its
        ``predict`` output over the fitting data is aggregated, and the first
        ``predict`` output for ``self.z`` / ``self.z_tilde[:, :, 0]`` is kept.

        :param manifest_metric: metric for the manifest detector; ``"dtw"``
            selects the ``dtw`` callable, anything else is passed through
        :param aggregation: only ``"average"`` is supported
        :return: dict with four entries keyed by space, metric and aggregation
        :raises Exception: if ``aggregation`` is not ``"average"`` (raised
            after both detectors have been fitted)
        """
        metric = dtw if manifest_metric == "dtw" else manifest_metric

        # Latent space always uses the euclidean metric.
        latent_lof = LocalOutlierFactor(metric="euclidean", novelty=True)
        latent_lof.fit(self.Z)
        latent_labels = latent_lof.predict(self.Z)
        latent_label_x = latent_lof.predict(self.z)[0]

        manifest_train = self.Z_tilde[:, :, 0]
        manifest_lof = LocalOutlierFactor(metric=metric, novelty=True)
        manifest_lof.fit(manifest_train)
        manifest_labels = manifest_lof.predict(manifest_train)
        manifest_label_x = manifest_lof.predict(self.z_tilde[:, :, 0])[0]

        if aggregation != "average":
            raise Exception("Aggregation method not valid.")

        latent_key = "lof_latent_" + aggregation
        manifest_key = "lof_manifest_" + manifest_metric + "_" + aggregation
        manifest_x_key = "lof_manifest_x_" + manifest_metric

        return {
            latent_key: latent_labels.mean(),
            "lof_latent_x": latent_label_x,
            manifest_key: manifest_labels.mean(),
            manifest_x_key: manifest_label_x,
        }
Exemplo n.º 4
0
class LocalOutlierFactor_Classifier:
  """LocalOutlierFactor wrapper with a fixed configuration.

  The estimator is created in novelty mode so that it can be fitted on one
  data set and then asked to label another one.
  """
  def __init__(self, save_path):
    """Create the output directory ``<save_path>/LocalOutlierFactor`` and the estimator."""
    # Default output path for this model.
    self.save_path = os.path.join(save_path, 'LocalOutlierFactor')
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(self.save_path, exist_ok=True)
    self.n_neighbors = 40
    # Proportion of anomalies in the data set; when fitting it is used to
    # define the threshold of the decision function.
    self.contamination = 0.1

    # novelty=True is required: scikit-learn only exposes predict() and
    # decision_function() on a LocalOutlierFactor created in novelty mode.
    self.classifier = LocalOutlierFactor(n_neighbors=self.n_neighbors,
                                         contamination=self.contamination,
                                         novelty=True)

  def fit_model(self, train_data_matrix, test_data_matrix, test_true_label):
    """Train the model, then print accuracy and a report for the test data.

    The report values come from ``sklearn_evaluation``, which is expected to
    return ``(accuracy, classification_report, confusion_matrix)``.
    """
    self.classifier.fit(train_data_matrix)
    y_pred_label = self.classifier.predict(test_data_matrix)
    # Local names deliberately differ from the sklearn.metrics functions.
    accuracy, report, _ = sklearn_evaluation(test_true_label, y_pred_label)
    print('Accuracy: {} \nClassification Report:\n{}\n'.format(accuracy, report))
    sys.stdout.flush()

  def test_model(self, test_data, test_label):
    """Test the model on ``test_data``.

    ``test_label`` holds the expected labels, such as ``[1, 1, -1, ...]``.

    Returns a tuple ``(predicted_labels, decision_scores, n_errors)`` where
    ``n_errors`` counts the predictions that differ from ``test_label``.
    """
    scores_pred = self.classifier.decision_function(test_data)
    y_pred_test = self.classifier.predict(test_data)

    n_errors = (y_pred_test != test_label).sum()
    return y_pred_test, scores_pred, n_errors
def _localoutlierfactor(*, train, test, x_predict=None, metrics, n_neighbors=20, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, contamination='auto', novelty=False, n_jobs=None):
    """
    For more info visit :
    https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html#sklearn.neighbors.LocalOutlierFactor
    """

    model = LocalOutlierFactor(n_neighbors=n_neighbors, algorithm=algorithm, leaf_size=leaf_size, metric=metric, p=p, metric_params=metric_params, contamination=contamination, novelty=novelty, n_jobs=n_jobs)
    model.fit(train[0], train[1])
    model_name = 'Local Outlier Factor'
    y_hat = model.predict(test[0])

    if metrics == 'accuracy':
        accuracy = accuracy_score(test[1], y_hat)

    if metrics == 'f1':
        accuracy = f1_score(test[1], y_hat)

    if metrics == 'jaccard':
        accuracy = jaccard_score(test[1], y_hat)

    if x_predict is None:
        return (model_name, accuracy, None)

    y_predict = model.predict(x_predict)
    return (model_name, accuracy, y_predict)
Exemplo n.º 6
0
def local_outlier_detection(training_vectors, test_vectors_clean,
                            test_vectors_anomalous):
    """Fit a novelty-mode LOF and label the training and both test sets.

    Returns ``(result_clean, result_anomalous, result_training)``; each is the
    ``predict`` output for the corresponding input.
    """
    print("Starting Local Outlier Fitting...")

    # Configuration of the model used for novel predictions.
    lof_settings = dict(novelty=True,
                        contamination='auto',
                        algorithm='auto',
                        n_neighbors=20,
                        n_jobs=-1)
    detector = LocalOutlierFactor(**lof_settings)
    print("Fitting with Parameters: ", detector.get_params())
    detector.fit(training_vectors)
    result_training = detector.predict(training_vectors)
    print("Fitting successful!")

    print("Starting Prediction...")
    # predict() returns 1 for an inlier and -1 for an outlier.
    result_clean = detector.predict(test_vectors_clean)
    result_anomalous = detector.predict(test_vectors_anomalous)
    print("Predicting successful!")
    print("**************************")

    return result_clean, result_anomalous, result_training
Exemplo n.º 7
0
def predict_LocalOutlierFactor(X, fraction_outlier):
    """Fit a novelty-mode LOF on ``X`` and write two plots of its predictions.

    :param X: samples to fit and label; the grid points are reshaped to two
        columns, so ``X`` is presumably 2-D with two features
    :param fraction_outlier: passed to the estimator as ``contamination``
    :return: None; the results are written through the plotting helper ``P``
    """
    xx, yy = get_meshgrid(X)
    x1, x2 = xx.min(), xx.max()
    y1, y2 = yy.min(), yy.max()
    # Plot margin: 10% of the x extent. (Previously ``x2 - x2``, which is
    # always zero and silently removed the margin.)
    d = (x2 - x1) * 0.1

    A = LocalOutlierFactor(contamination=fraction_outlier, novelty=True)
    A.fit(X)
    Y = A.predict(X)

    # Label every grid point in a single call instead of one call per point.
    grid_points = numpy.c_[xx.flatten(), yy.flatten()]
    # The 100x100 shape is carried over unchanged; get_meshgrid is expected
    # to produce a grid of that size.
    grid_confidence = A.predict(grid_points).astype(int).reshape((100, 100))

    P.plot_contourf(X[Y > 0],
                    X[Y <= 0],
                    xx,
                    yy,
                    grid_confidence,
                    x_range=[x1 - d, x2 + d],
                    y_range=[y1 - d, y2 + d],
                    filename_out='5_pred_LocalOutlierFactor_density.png')
    P.plot_2D_features_multi_Y(X,
                               -Y,
                               x_range=[x1 - d, x2 + d],
                               y_range=[y1 - d, y2 + d],
                               filename_out='5_pred_LocalOutlierFactor.png')
    return
def LOF_ano_score():
    """Fit LOF on ``train_normal`` and print labels for ``test_normal`` and ``test_ano``.

    All three data sets are read from the enclosing module scope.
    """
    print("each ano score by LOF predict method range -1 to +1")
    detector = LocalOutlierFactor(n_neighbors=10, novelty=True, contamination=0.1)
    detector.fit(train_normal)

    # Each LOF prediction is a label: -1 is an anomaly and 1 is normal.
    normal_labels = detector.predict(test_normal)  # prediction for the test data
    anomalous_labels = detector.predict(test_ano)
    print(normal_labels, anomalous_labels)
def lof_predict(train, test, test_label):
    """Fit LOF on ``train``, label ``test`` and plot the confusion matrix."""
    from sklearn.neighbors import LocalOutlierFactor

    detector = LocalOutlierFactor(novelty=True, contamination=0.01)
    detector.fit(train)
    predicted = detector.predict(test)

    class_names = ['anomaly', 'normal']
    plot_confusion_matrix(test_label, predicted, class_names,
                          'LOF Confusion-Matrix')
Exemplo n.º 10
0
    def test_local_outlier_factor_metric_cdist(self):
        """Check the 'cdist'-optimised ONNX conversion for two metrics.

        For each metric the converted model is run on shifted data and its
        labels and scores are compared with scikit-learn's.
        """
        train = np.array(
            [[-1.1, -1.2], [0.3, 0.2], [0.5, 0.4], [100., 99.]],
            dtype=np.float32)

        for metric in ['euclidean', 'sqeuclidean']:
            with self.subTest(metric=metric):
                lof = LocalOutlierFactor(n_neighbors=2,
                                         novelty=True,
                                         metric=metric)
                model = lof.fit(train)
                model_onnx = to_onnx(model,
                                     train,
                                     target_opset=TARGET_OPSET,
                                     options={'optim': 'cdist'})

                # Query points differ from the fitting data.
                shifted = train.copy()
                shifted[:, 0] += 0.1

                sess = InferenceSession(model_onnx.SerializeToString())
                output_names = [o.name for o in sess.get_outputs()]
                self.assertEqual(output_names, ['label', 'scores'])

                got = sess.run(None, {'X': shifted})
                self.assertEqual(len(got), 2)

                onnx_label, onnx_scores = got[0].ravel(), got[1].ravel()
                assert_almost_equal(lof.predict(shifted), onnx_label)
                assert_almost_equal(lof.decision_function(shifted),
                                    onnx_scores,
                                    decimal=4)
Exemplo n.º 11
0
    def test_local_outlier_factor_cdist_p3(self):
        """Check the 'cdist'-optimised ONNX conversion of a LOF built with p=3.

        The test returns early when the runtime rejects the ``p`` attribute
        of the CDist operator.
        """
        train = np.array([[-1.1, -1.2], [0.3, 0.2], [0.5, 0.4], [100., 99.]],
                         dtype=np.float32)
        lof = LocalOutlierFactor(n_neighbors=2, novelty=True, p=3)
        model = lof.fit(train)
        model_onnx = to_onnx(model,
                             train,
                             target_opset=TARGET_OPSET,
                             options={'optim': 'cdist'})
        self.assertIn('CDist', str(model_onnx))

        # Query points differ from the fitting data.
        shifted = train.copy()
        shifted[:, 0] += 0.1

        try:
            sess = InferenceSession(model_onnx.SerializeToString())
        except InvalidGraph as e:
            # Only the known "p is unsupported" failure is tolerated.
            if "Unrecognized attribute: p for operator CDist" not in str(e):
                raise e
            return

        output_names = [o.name for o in sess.get_outputs()]
        self.assertEqual(output_names, ['label', 'scores'])

        got = sess.run(None, {'X': shifted})
        self.assertEqual(len(got), 2)

        onnx_label, onnx_scores = got[0].ravel(), got[1].ravel()
        assert_almost_equal(lof.predict(shifted), onnx_label)
        assert_almost_equal(lof.decision_function(shifted), onnx_scores)
Exemplo n.º 12
0
def fix_outliers_by_LocalOutlierFactor(X_train, y_train, X_test, y_test):
    """Drop training rows that a per-column LOF flags as outliers.

    Every column of ``X_train`` with more than two distinct values is fitted
    and labelled on its own; a row is removed from ``X_train`` and ``y_train``
    when any column flags it. The inputs are modified in place and their
    indexes are reset.

    Outliers are deliberately not removed from the test set, so the model is
    evaluated on all possibilities; when the module-level flag
    ``fix_outliers_in_test_data`` is set, the test indexes are only reset.

    :return: ``(X_train, y_train, X_test, y_test)``
    """
    from sklearn.neighbors import LocalOutlierFactor
    lof = LocalOutlierFactor(novelty=True)
    train_outlier_positions = set()
    # Stays empty: test-set outliers are intentionally never collected.
    X_test_outlier_index = set()
    for var in X_train.columns:
        if X_train[var].nunique() > 2:
            column = X_train[var].values.reshape(-1, 1)
            lof.fit(column)
            train_outlier_positions.update(
                np.where(lof.predict(column) == -1)[0])

    # np.where yields row POSITIONS while drop() expects index LABELS; the two
    # only coincide for a default RangeIndex, so translate explicitly.
    positions = np.array(sorted(train_outlier_positions), dtype=int)
    X_train.drop(X_train.index[positions], inplace=True)
    y_train.drop(y_train.index[positions], inplace=True)
    X_train.reset_index(drop=True, inplace=True)
    y_train.reset_index(drop=True, inplace=True)

    # Flag defined outside this function; the comparison is kept as written.
    if fix_outliers_in_test_data == True:
        X_test.drop(X_test_outlier_index, inplace=True)
        y_test.drop(X_test_outlier_index, inplace=True)
        X_test.reset_index(drop=True, inplace=True)
        y_test.reset_index(drop=True, inplace=True)

    return X_train, y_train, X_test, y_test
Exemplo n.º 13
0
def main(args):
    """Train a novelty-mode LOF on a CSV file, print a report and save the model.

    Reads ``args.train_data_path``, uses the comma-separated ``args.features``
    as inputs and ``args.label`` as the target, and stores the fitted model
    at ``args.model_path`` through ``ModelUtils.save_model``.
    """
    np.random.seed(0)

    # Load the data.
    columns = args.features.split(",")
    raw_df = pd.read_csv(args.train_data_path)
    data = raw_df[columns]
    targets = raw_df[args.label]

    x_train, x_test, y_train, y_test = train_test_split(data,
                                                        targets,
                                                        train_size=0.8)

    # Fit the detector in novelty mode on the training split only.
    lof = LocalOutlierFactor(n_neighbors=args.n_neighbors,
                             novelty=True,
                             n_jobs=args.n_jobs)
    lof = lof.fit(x_train)

    y_pre = lof.predict(x_test)
    report = metrics.classification_report(
        y_test, y_pre, target_names=["outlier", "normValue"])
    print(report)

    ModelUtils.save_model(columns, lof, args.model_path)
Exemplo n.º 14
0
def remove_noises(data):
    """Return a copy of the rows of ``data`` that LOF labels as inliers."""
    detector = LocalOutlierFactor(n_neighbors=15, novelty=True)
    detector.fit(data)

    # predict() labels inliers with 1.
    inlier_mask = detector.predict(data) == 1
    return data[inlier_mask].copy()
Exemplo n.º 15
0
def novelty_detection():
    """Run LOF novelty detection with classes 0-6 as known and print a confusion matrix.

    The training set is truncated to the first ``7 * samples_per_class`` rows;
    test labels up to 6 become ``1`` and all others ``-1``.
    """
    x_train, y_train, x_test, y_test = load_data()

    num_known_classes = 7  # number of known classes
    # The remaining classes of the test set are treated as novel.
    known = np.array([0, 1, 2, 3, 4, 5, 6])

    num_per_class = int(x_train.shape[0] / len(np.unique(y_train)))
    x_train = x_train[:num_per_class * num_known_classes]

    # Map test labels to the detector's convention: 1 = known, -1 = novel.
    y_test = y_test.astype(np.int32).copy()
    is_known = y_test <= 6
    y_test[is_known] = 1
    y_test[~is_known] = -1
    print(np.unique(y_test))

    # Novelty detection with LOF.
    detector = LocalOutlierFactor(n_neighbors=20, novelty=True, n_jobs=-1)
    print("-----fiting 训练集-----")
    detector.fit(x_train)
    print("-----预测测试集-----")
    y_pred = detector.predict(x_test)
    print(confusion_matrix(y_test, y_pred))
Exemplo n.º 16
0
 def fit_model(self):
     """Fit one novelty-mode LOF per entry of ``self.shot_data_array_list``.

     The fitted detectors are stored, in input order, in ``self.lof_list``.
     """
     self.lof_list = []
     for shot_data_array in self.shot_data_array_list:
         lof = LocalOutlierFactor(novelty=True)
         lof.fit(shot_data_array)
         # The former predict() over the fitting data was discarded and has
         # been removed; it did not affect the stored detector.
         self.lof_list.append(lof)
Exemplo n.º 17
0
class LOF(AnomalyDetector):
    """
        Anomaly detector based on local outlier factor
    """
    def __init__(self):
        """Create the underlying estimator in novelty mode."""
        self._model = LocalOutlierFactor(novelty=True)

    def learn(self, data):
        """Fit the estimator on ``data``."""
        self._model.fit(data)

    def predict(self, data, obs):
        """Return a boolean array that is True where ``obs`` is labelled -1.

        ``data`` is not used by this detector.
        """
        labels = self._model.predict(obs)
        return labels == -1

    def get_score(self, data, epoch=None):
        """Return ``score_samples`` for ``data``; ``epoch`` is not used."""
        scores = self._model.score_samples(data)
        return scores

    def anomalies_have_high_score(self):
        """Report that anomalies do not have high scores for this detector."""
        return False

    def get_memory_size(self):
        """Always report a memory size of 0."""
        return 0

    def save(self, filename):
        """Dump the estimator to ``filename`` with joblib."""
        joblib.dump(self._model, filename)

    def load(self, filename):
        """Replace the estimator with the one stored in ``filename``."""
        restored = joblib.load(filename)
        self._model = restored
Exemplo n.º 18
0
class LofDetection(BaseEstimator, TransformerMixin):
    """Transformer that blanks LOF-flagged rows and re-imputes them.

    With ``contamination == 0`` the transformer is a pass-through that only
    copies its input.
    """

    def __init__(self, contamination=0):
        self.contamination = contamination

    def fit(self, X, y=None):
        """Fit a novelty-mode LOF on ``X`` unless the transformer is disabled."""
        if self.contamination == 0:
            return self

        self.lof = LocalOutlierFactor(contamination=self.contamination,
                                      novelty=True)
        # ``y`` is forwarded only when the caller supplied one.
        fit_args = (X,) if y is None else (X, y)
        self.lof.fit(*fit_args)
        return self

    def transform(self, X_):
        """Return a copy of ``X_`` with flagged rows replaced by imputed values.

        Rows labelled -1 are set to NaN and then filled by a default
        ``SimpleImputer`` fitted on the transformed data itself.
        """
        X = deepcopy(X_)
        if self.contamination == 0:
            return X

        flagged_rows = self.lof.predict(X) == -1
        X[flagged_rows, :] = np.nan

        return SimpleImputer().fit_transform(X)
Exemplo n.º 19
0
def predict_LOF(x_train, x_test, x_valid, dim):
    """Fit LOF on train+test and count outlier/inlier predictions per set.

    :param dim: ``2`` selects the CWT data file and flattens the last two
        axes; any other value selects the plain file and reshapes to 2-D
    :return: ``(errs_train, errs_test, errs_un)``: samples labelled -1 in the
        train and test sets, and samples labelled 1 in the unknown data
    """
    def get_2d_input(x):
        return np.reshape(x, (x.shape[0], x.shape[1] * x.shape[2]))

    def get_1d_input(x):
        return np.reshape(x, (x.shape[0], x.shape[1]))

    clf = LocalOutlierFactor(n_neighbors=1,
                             contamination=0.5,
                             novelty=True,
                             n_jobs=5)

    if dim == 2:
        # alternative file: 'ptb_xl_data/ptb_xl_75_25_cwt.pkl'
        unknown_path = 'ptb_xl_data/ptb_xl_3490_15_cwt.pkl'
        to_matrix = get_2d_input
    else:
        # alternative file: 'ptb_xl_data/ptb_xl_6480_14.pkl'
        unknown_path = 'ptb_xl_data/ptb_xl_3490_15.pkl'
        to_matrix = get_1d_input

    ux_, uy_ = load_preprocessed_data(unknown_path)
    x_train_, x_test_, x_valid_, ux_ = [
        to_matrix(block) for block in (x_train, x_test, x_valid, ux_)
    ]

    # The detector is fitted on the train and test samples together.
    clf.fit(np.concatenate((x_train_, x_test_), axis=0))

    errs_train = sum(clf.predict(x_train_) == -1)
    errs_test = sum(clf.predict(x_test_) == -1)
    # Computed for the validation set as well, but not part of the result.
    errs_val = sum(clf.predict(x_valid_) == -1)

    # Test on the unknown data: here an inlier label (1) counts as an error.
    # 75 classes: all 1875 - 120 errors (2d, 6.4%), 47 errors (1d, 2.51%)
    # 2d : 3175 from 52350 (6,06%), 1365 from 52350 (2.61%)
    errs_un = sum(clf.predict(ux_) == 1)

    return errs_train, errs_test, errs_un
Exemplo n.º 20
0
def _outlier_detection_lof(table,
                           input_cols,
                           n_neighbors=20,
                           result_type='add_prediction',
                           new_column_name='is_outlier'):
    """Label the rows of ``table`` with LOF and build the result model.

    :param table: input table; it is copied, not modified
    :param input_cols: columns used as features
    :param n_neighbors: number of neighbours for the estimator
    :param result_type: ``'add_prediction'`` keeps all rows and adds the label
        column, ``'remove_outliers'`` keeps inlier rows without the column,
        ``'both'`` keeps inlier rows with the column
    :param new_column_name: name of the ``'in'`` / ``'out'`` label column
    :return: dict with ``'out_table'`` and ``'model'``
    """
    out_table = table.copy()
    features = out_table[input_cols]

    lof_model = LocalOutlierFactor(n_neighbors,
                                   algorithm='auto',
                                   leaf_size=30,
                                   metric='minkowski',
                                   p=2,
                                   novelty=True,
                                   contamination=0.1)
    lof_model.fit(features)

    # predict() yields 1 for inliers; everything else is reported as 'out'.
    out_table[new_column_name] = [
        'in' if label == 1 else 'out' for label in lof_model.predict(features)
    ]

    if result_type in ('remove_outliers', 'both'):
        out_table = out_table[out_table[new_column_name] == 'in']
        if result_type == 'remove_outliers':
            out_table = out_table.drop(new_column_name, axis=1)
    elif result_type != 'add_prediction':
        raise_runtime_error("Please check 'result_type'.")

    params = {
        'Input Columns': input_cols,
        'Result Type': result_type,
        'Number of Neighbors': n_neighbors,
    }

    rb = BrtcReprBuilder()
    report_md = strip_margin("""
    | ## Outlier Detection (Local Outlier Factor) Result
    | ### Parameters
    |
    | {display_params}
    |
    """.format(display_params=dict2MD(params)))
    rb.addMD(report_md)

    model = _model_dict('outlier_detection_lof')
    model['params'] = params
    model['lof_model'] = lof_model
    model['input_cols'] = input_cols
    model['result_type'] = result_type
    model['num_neighbors'] = n_neighbors
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'model': model}
Exemplo n.º 21
0
def lof_each_ano_score(train_a, test_a, test_b):
    """Fit LOF on ``train_a`` and print the labels of ``test_a`` and ``test_b``."""
    detector = LocalOutlierFactor(n_neighbors=5,
                                  novelty=True,
                                  contamination=0.1)
    detector.fit(train_a)

    # Each LOF prediction is a label: -1 is an anomaly and 1 is normal.
    labels_a = detector.predict(test_a)  # prediction for the test data
    labels_b = detector.predict(test_b)
    print(labels_a, labels_b)
Exemplo n.º 22
0
def main():
  """Merge the CSV files, prune features, label rows with LOF and save the results.

  Reads every file in ``./csv_files``, label-encodes the nominal columns,
  drops zero-variance and perfectly correlated columns, fits a novelty-mode
  LocalOutlierFactor on what remains, writes the labelled table to
  ``./processed_csv/processed.csv`` and pickles the fitted model to
  ``model.sav``.
  """
  # Read all the csv files
  csvPath = "./csv_files"
  csvFiles = [f for f in listdir(csvPath) if isfile(join(csvPath, f))]

  dfs = []
  for cv in csvFiles:
    print("CSV Processing: "+cv)
    dfs.append(pd.read_csv(join(csvPath, cv), index_col=False))

  df = pd.concat(dfs, ignore_index=True)

  # Turn the nominal columns into numeric codes
  nom_cols = ['ip_flags', 'tcp_udp_flags', 'payload']
  for c in nom_cols:
    le = LabelEncoder()
    df[c] = le.fit_transform(df[c])

  # Remove the columns with zero standard deviation
  df = df.loc[:, df.std() > 0.0]

  # Calculate the absolute correlation matrix
  corr_matrix = df.corr().abs()

  # Select the upper triangle of the correlation matrix. The builtin ``bool``
  # is used because the ``np.bool`` alias no longer exists in current NumPy.
  upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

  # Find the feature columns that are perfectly correlated (|corr| == 1)
  # with an earlier column
  to_drop = [column for column in upper.columns if any(upper[column] == 1)]

  df = df.drop(columns=to_drop)

  print(df.head())
  # Use the local outlier factor to find the anomalies -1: anomaly 1:normal
  clf = LocalOutlierFactor(n_neighbors=2, contamination='auto', novelty=True)
  clf.fit(df)
  df['label'] = clf.predict(df)

  totalNormal = len(df[df['label'] == 1])
  totalAnomalies = len(df[df['label'] == -1])
  print("Normal: "+str(totalNormal))
  print("Anomaly: "+str(totalAnomalies))
  # Share of rows labelled as normal
  print('Accuracy: '+str(totalNormal/float(totalNormal+totalAnomalies)))
  df.to_csv('./processed_csv/'+'processed.csv', index=False)

  # Save the model; the context manager guarantees the file is closed
  filename = 'model.sav'
  with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)
Exemplo n.º 23
0
def Compute_LOF(neighbors, x_train, x_test):
    """Fit a novelty-mode LOF on ``x_train`` and return its labels for ``x_test``.

    neighbors: value passed as ``n_neighbors``
    x_train: train data - np array
    x_test: data to label - np array
    """
    detector = LocalOutlierFactor(n_neighbors=neighbors,
                                  contamination=0.01,
                                  novelty=True)
    detector.fit(x_train)
    return detector.predict(x_test)
Exemplo n.º 24
0
    def run_lof(self, X_train, X_test, features_type):
        """
        LOF: Identifying Density-Based Local Outliers

        Only the ``features_type['quantitative']`` columns are used, and rows
        with missing values in them are left out of fitting and prediction.

        Return
        ------
        outliers_train, outliers_test : numpy.array(boolean)
            Boolean arrays, aligned with ``X_train`` and ``X_test``, that
            indicate if a point is an outlier
        """
        quantitative = features_type['quantitative']
        train_quant = X_train.copy()[quantitative].dropna()
        test_quant = X_test.copy()[quantitative].dropna()

        # Standardise first: features on different scales would otherwise
        # distort the distance measure. The scaler is fitted on train only.
        scaler = StandardScaler()
        train_scaled = scaler.fit_transform(train_quant)
        test_scaled = scaler.transform(test_quant)

        detector = LocalOutlierFactor(novelty=True)
        detector.fit(train_scaled)

        train_is_outlier = detector.predict(train_scaled) == -1
        test_is_outlier = detector.predict(test_scaled) == -1

        # Map the flagged rows back onto the original indexes.
        flagged_train_index = train_quant[train_is_outlier].index
        flagged_test_index = test_quant[test_is_outlier].index

        outliers_train = X_train.index.isin(flagged_train_index)
        outliers_test = X_test.index.isin(flagged_test_index)
        return outliers_train, outliers_test
class LOF(object):
    def __init__(self, n_neighbors=20, algorithm='auto', metric='minkowski'):
        """
        Local Outlier Factor
        Arguments
        ---------
        n_neighbors : int, default=20
            Number of neighbors to use by default for kneighbors queries.
            If n_neighbors is larger than the number of samples provided,
            all samples will be used.
        algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
            Algorithm used to compute the nearest neighbors.
        metric : str or callable, default='minkowski'
            Metric used for the distance computation.
            Any metric from scikit-learn or scipy.spatial.distance can be used.

        The estimator is always created in novelty mode with a fixed
        contamination of 0.00001.

        ---------
            For more information, please visit
            https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html
        """
        estimator_options = dict(n_neighbors=n_neighbors,
                                 algorithm=algorithm,
                                 metric=metric,
                                 contamination=0.00001,
                                 novelty=True)
        self.model = LocalOutlierFactor(**estimator_options)

    def fit(self, x):
        """
        Arguments
        ---------
            x: ndarray, the event count matrix; it is flattened to
               num_instances-by-features before fitting
        """
        print('LOF Fit')
        flattened = x.reshape((len(x), -1))
        self.model.fit(flattened)

    def predict(self, x):
        """ Predict anomalies with the fitted estimator
        Arguments
        ---------
            x: the input event count matrix
        Returns
        -------
            y_pred: ndarray, the predicted label vector of shape
                (num_instances,); 1 marks an anomaly and 0 a normal instance
        """
        print('LOF Predict')
        flattened = x.reshape((len(x), -1))
        raw_labels = self.model.predict(flattened)
        # The estimator's positive (inlier) labels map to 0, the rest to 1.
        return np.where(raw_labels > 0, 0, 1)
Exemplo n.º 26
0
def LOF_PCA_for_Clustering_more(final_data_x,
                                isUsePCA=True,
                                ratio_for_pca=0.7,
                                ratio_for_lof=0.7):
    """Label samples with a novelty-mode LOF and pass them to ``replace_Singular``.

    :param final_data_x: the initial normalised x values, or values that have
        already been processed with PCA
    :param isUsePCA: whether to reduce the dimensionality with PCA first
    :param ratio_for_pca: forwarded to ``PCA_mars.getPcaComponent``
    :param ratio_for_lof: leading fraction of the PCA output used for LOF;
        values of 1.0 or more use everything except the last sample
    :return: with PCA, a tuple ``(replace_Singular result, held-out samples)``;
        without PCA, only the ``replace_Singular`` result
    """
    global pred_test

    if not isUsePCA:
        detector = LocalOutlierFactor(n_neighbors=20,
                                      novelty=True,
                                      contamination=0.1)
        detector.fit(final_data_x)
        pred_test = detector.predict(final_data_x)
        return replace_Singular(final_data_x, pred_test)

    pca_x = PCA_mars.getPcaComponent(final_data_x,
                                     n_components=0.9,
                                     ratio=ratio_for_pca)
    print('pca_x', pca_x.shape)

    # The last sample is left out of both parts in either case.
    if ratio_for_lof >= 1.0:
        lof_data = pca_x[:-1]
        test_x = []
    else:
        split_at = int(len(pca_x) * ratio_for_lof)
        lof_data = pca_x[:split_at]
        test_x = pca_x[split_at:-1]

    detector = LocalOutlierFactor(n_neighbors=20,
                                  novelty=True,
                                  contamination=0.1)
    detector.fit(lof_data)
    pred_test = detector.predict(lof_data)
    return (replace_Singular(lof_data, pred_test), test_x)
Exemplo n.º 27
0
    def schedule(self, event_input_name, event_input_value,  data_from_pickle, X_predict, X_train, y_train,
                 n_neighbors, algorithm, leaf_size, metric, p, metric_params, contamination, novelty, n_jobs):
        """Handle an ``INIT`` or ``RUN`` event for the LOF block.

        ``INIT`` echoes the event value together with the current state.
        ``RUN`` without ``data_from_pickle`` updates the stored
        hyper-parameters from every argument that is not ``None`` and fits a
        new estimator on ``X_train`` / ``y_train``. ``RUN`` with
        ``data_from_pickle`` treats that object as a fitted estimator and
        labels and scores the single sample ``X_predict``.

        :return: a 5-item list ``[init_event, run_event, classifier,
            prediction, score_samples]``; any other event name returns ``None``
        """
        if event_input_name == 'INIT':

            return [event_input_value, None, self.classifier, self.prediction, self.score_samples]

        elif event_input_name == 'RUN':

            # Identity test: ``== None`` would invoke a custom ``__eq__`` on
            # the unpickled object.
            if data_from_pickle is None:
                # Only the arguments that were supplied override the stored
                # values; the rest keep whatever is already on ``self``.
                if n_neighbors is not None:
                    self.n_neighbors = int(n_neighbors)
                if algorithm is not None:
                    self.algorithm = algorithm
                if leaf_size is not None:
                    self.leaf_size = int(leaf_size)
                if metric is not None:
                    self.metric = metric
                if p is not None:
                    self.p = int(p)
                if metric_params is not None:
                    self.metric_params = metric_params
                if contamination is not None:
                    if contamination == 'auto':
                        self.contamination = 'auto'
                    else:
                        self.contamination = float(contamination)
                if novelty is not None:
                    self.novelty = novelty
                if n_jobs is not None:
                    self.n_jobs = int(n_jobs)

                classif = LocalOutlierFactor(n_neighbors=self.n_neighbors, algorithm=self.algorithm, leaf_size=self.leaf_size,
                                             metric=self.metric, p=self.p, metric_params=self.metric_params,
                                             contamination=self.contamination, novelty=self.novelty, n_jobs=self.n_jobs)

                classif.fit(np.array(X_train).astype(np.float64), np.array(y_train).astype(np.float64))
                self.classifier = classif

                return [None, event_input_value, self.classifier, self.prediction, self.score_samples]
            else:
                classif = data_from_pickle
                self.classifier = classif
                # Build the single-row query once and reuse it for both calls.
                sample = np.array(X_predict).astype(np.float64).reshape(1, -1)
                self.prediction = classif.predict(sample)
                self.score_samples = classif.score_samples(sample)

                return [None, event_input_value, self.classifier, self.prediction, self.score_samples]
    def perform_LOF(self, n_neighbors=10, target_names=None, novelty=True):
        """LOF algorithm.

        Fits on ``self.X_train`` and compares the labels predicted for
        ``self.X_test`` with ``self.t_test``.

        :param n_neighbors: number of data neighbours used, defaults to 10
        :type n_neighbors: int, optional
        :param target_names: passed to ``classification_report``, defaults to None
        :param novelty: param necessary to detect anomalies, defaults to True
        :type novelty: bool, optional
        :return: classification report with results
        :rtype: str
        """
        detector = LocalOutlierFactor(n_neighbors=n_neighbors, novelty=novelty)
        detector.fit(self.X_train)
        predicted = detector.predict(self.X_test)

        report = classification_report(self.t_test,
                                       predicted,
                                       target_names=target_names)
        return report
Exemplo n.º 29
0
    def test_local_outlier_factor_double(self):
        """Check the ONNX conversion of a LOF fitted on float64 data.

        The converted model is run on the fitting data itself and compared
        with scikit-learn's labels and decision scores.
        """
        samples = np.array([[-1.1, -1.2], [0.3, 0.2], [0.5, 0.4], [100., 99.]],
                           dtype=np.float64)
        lof = LocalOutlierFactor(n_neighbors=2, novelty=True)
        model = lof.fit(samples)
        model_onnx = to_onnx(model, samples, target_opset=TARGET_OPSET)

        sess = InferenceSession(model_onnx.SerializeToString())
        output_names = [o.name for o in sess.get_outputs()]
        self.assertEqual(output_names, ['label', 'scores'])

        got = sess.run(None, {'X': samples})
        self.assertEqual(len(got), 2)

        onnx_label, onnx_scores = got[0].ravel(), got[1].ravel()
        assert_almost_equal(lof.predict(samples), onnx_label)
        assert_almost_equal(lof.decision_function(samples), onnx_scores)
Exemplo n.º 30
0
def train_p2(datasets, model_path):
    """Fit a novelty-mode LOF (35 neighbours), report the inlier share and save it.

    datasets: the data set used for fitting and labelling
    model_path: path where the fitted model is stored with joblib

    Returns the labels predicted for ``datasets``.
    """
    detector = LocalOutlierFactor(n_neighbors=35, novelty=True)  # 35
    detector.fit(datasets)

    results = detector.predict(datasets)
    # Share of samples with a positive (inlier) label.
    inlier_share = np.sum(results > 0) / datasets.shape[0]
    print(inlier_share)

    joblib.dump(detector, model_path)
    return results
# Generate normal (not abnormal) training observations: two clusters of
# 100 points each, shifted by +2 and -2 in both coordinates
X = 0.3 * np.random.randn(100, 2)
X_train = np.r_[X + 2, X - 2]
# Generate new normal (not abnormal) observations around the same two shifts
X = 0.3 * np.random.randn(20, 2)
X_test = np.r_[X + 2, X - 2]
# Generate some abnormal novel observations, uniform over [-4, 4) x [-4, 4)
X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))

# fit the model for novelty detection (novelty=True)
clf = LocalOutlierFactor(n_neighbors=20, novelty=True, contamination=0.1)
clf.fit(X_train)
# DO NOT use predict, decision_function and score_samples on X_train, as this
# would give wrong results; use them only on new unseen data (not used in
# X_train), e.g. X_test, X_outliers or the meshgrid
y_pred_test = clf.predict(X_test)
y_pred_outliers = clf.predict(X_outliers)
# Errors: normal test points labelled -1, and abnormal points labelled 1
n_error_test = y_pred_test[y_pred_test == -1].size
n_error_outliers = y_pred_outliers[y_pred_outliers == 1].size

# plot the learned frontier, the points, and the nearest vectors to the plane
# NOTE(review): xx and yy are not defined in this part of the file; they are
# presumably a meshgrid created earlier - confirm before running.
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.title("Novelty Detection with LOF")
# Filled bands from the minimum decision value up to 0
plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 7), cmap=plt.cm.PuBu)
# The frontier itself: the contour where the decision function is 0
a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='darkred')
# Single filled band from 0 up to the maximum decision value
plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors='palevioletred')

# Marker size used for the scatter plot
s = 40
b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=s, edgecolors='k')