示例#1
0
 def model_init(self, model):
     """Create the detector and the scaler for one model key.

     The detector class is picked from ``self.model``; an unrecognised
     name falls back to HBOS. The new detector is stored in
     ``self.models[model]`` and a fresh ``MinMaxScaler`` in
     ``self.custom_model_scalers[model]``.
     """
     # Zero-argument factories, so only the selected detector is built.
     factories = {
         'pca': lambda: PCA(contamination=self.contamination),
         'loda': lambda: LODA(contamination=self.contamination),
         'iforest': lambda: IForest(n_estimators=50,
                                    bootstrap=True,
                                    behaviour='new',
                                    contamination=self.contamination),
         'cblof': lambda: CBLOF(n_clusters=3,
                                contamination=self.contamination),
         'feature_bagging': lambda: FeatureBagging(
             base_estimator=PCA(contamination=self.contamination),
             contamination=self.contamination),
         'copod': lambda: COPOD(contamination=self.contamination),
         'hbos': lambda: HBOS(contamination=self.contamination),
     }
     make_detector = factories.get(self.model, factories['hbos'])
     self.models[model] = make_detector()
     self.custom_model_scalers[model] = MinMaxScaler()
示例#2
0
文件: model.py 项目: esowc/DAAQS
    def pred_KNN(self, k=5, comp_with="openaq"):
        """Flag outliers with a k-nearest-neighbours detector.

        Parameters
        ----------
        k : int
            Requested number of neighbours. For the "openaq" comparison it
            is lowered to ``n_samples - 1`` when ``self.X_o`` has no more
            than ``k`` rows.
        comp_with : {"openaq", "cams"}
            "openaq" fits one detector on ``self.X_o``; "cams" fits one
            detector per entry of ``self.X_c`` and keeps the label of the
            last row of each entry.

        Returns
        -------
        tuple
            The three values returned by ``self.pred_location(pred)``.

        Raises
        ------
        ValueError
            If ``comp_with`` is neither "openaq" nor "cams" (previously
            this surfaced as an unbound-local error on ``pred``).
        """
        if comp_with not in ("openaq", "cams"):
            raise ValueError(
                "comp_with must be 'openaq' or 'cams', got {!r}".format(
                    comp_with))
        self.comp_with = comp_with

        if comp_with == "openaq":
            # len() works for both lists and arrays; ``X == []`` is not a
            # reliable emptiness test once X is a NumPy array.
            n_samples = len(self.X_o)
            if n_samples > 2:
                # KNN needs more samples than neighbours, so shrink k when
                # there are too few rows.
                n_neighbors = k if n_samples > k else n_samples - 1
                self.clf = KNN(n_neighbors=n_neighbors)
                self.clf.fit(self.X_o)
                pred = self.clf.labels_
            else:
                # Zero, one or two samples: too few to score anything.
                pred = []
        else:
            pred = []
            for each_X in self.X_c:
                # Original note: if each_X exists it has shape (10, 8).
                self.clf = KNN(n_neighbors=k)
                self.clf.fit(each_X)
                pred.append(self.clf.labels_[-1])

        A_location, B_location, C_location = self.pred_location(pred)

        return A_location, B_location, C_location
示例#3
0
def pca(X_train, X_test, Y_train, Y_test):
    """Fit a default PyOD PCA detector and report its accuracy on X_test.

    The detector is fitted on ``X_train`` only; ``Y_train`` is accepted
    but not used. Accuracy is the share of ``X_test`` rows whose predicted
    label equals ``Y_test``. The fraction is printed and the percentage is
    returned.
    """
    from pyod.models.pca import PCA
    detector = PCA()
    detector.fit(X_train)
    n_correct = np.sum(detector.predict(X_test) == Y_test)
    acc = n_correct / X_test.shape[0]
    print(acc)
    return (acc * 100)
示例#4
0
def pca_outlier_detection(X_train, X_test, **kwargs):
    """Return the last ``predict_proba`` column for X_test as a Series.

    A PCA detector built from ``kwargs`` is fitted on ``X_train``. The
    resulting Series is named 'outlier' and reuses the index of ``X_test``
    when that is a DataFrame; otherwise it gets the default index.
    """
    model = PCA(**kwargs)
    model.fit(X_train)
    outlier_prob = model.predict_proba(X_test)[:, -1]

    # index=None makes pandas fall back to its default positional index.
    row_index = X_test.index if isinstance(X_test, pd.DataFrame) else None
    return pd.Series(outlier_prob, name='outlier', index=row_index)
示例#5
0
文件: test_pca.py 项目: deltat99/Pyod
    def setUp(self):
        """Generate a small synthetic dataset and an unfitted PCA detector."""
        self.n_train, self.n_test = 100, 50
        self.contamination = 0.1
        self.roc_floor = 0.6

        dataset = generate_data(n_train=self.n_train,
                                n_test=self.n_test,
                                contamination=self.contamination)
        (self.X_train, self.y_train,
         self.X_test, self.y_test) = dataset

        self.clf = PCA(contamination=self.contamination)
示例#6
0
    def S2(self):
        """Second screening stage: multi-detector vote on rows that passed S1.

        Runs ``self.S1()`` first, then works on the rows of
        ``self.water_data`` whose 'S1' column equals 0:

        * fits IForest, KNN, HBOS and PCA detectors on the standardised
          columns 1..11 and stores the product of their four label vectors
          in a new 'S2' column (presumably 1 only when all four detectors
          flag the row -- assumes 0/1 labels, TODO confirm);
        * for each of the eleven named measurement columns, fits a Gaussian
          kernel density estimate, ranks the densities and records, per row,
          the column with the lowest rank in 'S2_names';
        * writes both results into ``self.result`` and the updated frame
          back to ``self.water_data``;
        * dumps the fitted scaler and the IForest model under
          ``./water_model/``.
        """

        self.S1()
        water_data = self.water_data
        result = self.result

        # Data preprocessing and model training.
        # Only rows that S1 left unflagged (S1 == 0) are used.
        clean_data = water_data[water_data['S1'] == 0]
        Y = pd.DataFrame(index=clean_data.index, columns=['S2'])

        X_train = np.array(clean_data.iloc[:, 1:12])
        # NOTE(review): `name` is not used anywhere else in this method.
        name = list(clean_data.iloc[:, 1:12].columns.values)
        scaler = preprocessing.StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)

        clf1 = IForest(contamination=0.05, max_features=11, bootstrap=True)
        clf2 = KNN(contamination=0.05, n_neighbors=100)
        clf3 = HBOS(contamination=0.05, n_bins=10)
        clf4 = PCA(contamination=0.05)

        clf1.fit(X_train)
        clf2.fit(X_train)
        clf3.fit(X_train)
        clf4.fit(X_train)

        # Element-wise product of the four label vectors.
        Y['S2'] = clf1.labels_ * clf2.labels_ * clf3.labels_ * clf4.labels_
        # Y only covers the clean rows, so the concat leaves 'S2' missing
        # for every other row of water_data.
        water_data = pd.concat([water_data, Y], axis=1)
        # Disabled: water_data.loc[water_data['S2'].isna(),['S2']]=0
        # (would mark rows already flagged abnormal in S1 as 0 in S2).

        # Result column '统计异常' ("statistical anomaly").
        result['统计异常'] = water_data['S2'].values

        # Find the anomalous dimension for each row.
        from sklearn.neighbors import KernelDensity
        clean_data = water_data[water_data['S1'] == 0]
        dens = pd.DataFrame(index=clean_data.index,
                            columns=[
                                'temperature', 'pH', 'EC', 'ORP', 'DO',
                                'turbidity', 'transparency', 'COD', 'P',
                                'NH3N', 'flux'
                            ])

        # One 1-D Gaussian KDE per column, scored on the data it was fitted on.
        for i in dens.columns:
            kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(
                clean_data[i].values.reshape(-1, 1))
            dens[i] = np.exp(
                kde.score_samples(clean_data[i].values.reshape(-1, 1)))
        # Rank densities within each column; the per-row minimum rank picks
        # the column stored in 'S2_names'.
        dens = dens.iloc[:, 0:11].rank()
        dens['S2_names'] = dens.idxmin(axis=1)
        water_data = pd.concat([water_data, dens['S2_names']], axis=1)
        self.water_data = water_data
        # Result column '统计异常维度' ("statistical anomaly dimension").
        result['统计异常维度'] = water_data['S2_names'].values

        # Persist the models (only the scaler and the IForest are saved).
        joblib.dump(scaler, "./water_model/S2_scaler")
        joblib.dump(clf1, "./water_model/S2_Iforest")
示例#7
0
    def setUp(self):
        """Generate seeded synthetic data and fit a seeded PCA detector."""
        self.n_train, self.n_test = 100, 50
        self.contamination = 0.1
        self.roc_floor = 0.5

        dataset = generate_data(n_train=self.n_train,
                                n_test=self.n_test,
                                contamination=self.contamination,
                                random_state=42)
        (self.X_train, self.y_train,
         self.X_test, self.y_test) = dataset

        detector = PCA(contamination=self.contamination, random_state=42)
        self.clf = detector
        self.clf.fit(self.X_train)
示例#8
0
 def models_init(self):
     """Create one detector and one scaler per model in scope.

     The detector type comes from the 'model' configuration entry
     (default 'pca'); an unrecognised name falls back to HBOS. Results are
     stored in ``self.models`` and ``self.custom_model_scalers``, both
     keyed by the entries of ``self.models_in_scope``.
     """
     self.model = self.configuration.get('model', 'pca')

     # Zero-argument factories, so each key gets its own detector instance.
     factories = {
         'pca': lambda: PCA(contamination=self.contamination),
         'loda': lambda: LODA(contamination=self.contamination),
         'iforest': lambda: IForest(n_estimators=50,
                                    bootstrap=True,
                                    behaviour='new',
                                    contamination=self.contamination),
         'cblof': lambda: CBLOF(n_clusters=3,
                                contamination=self.contamination),
         'feature_bagging': lambda: FeatureBagging(
             base_estimator=PCA(contamination=self.contamination),
             contamination=self.contamination),
         'copod': lambda: COPOD(contamination=self.contamination),
         'hbos': lambda: HBOS(contamination=self.contamination),
     }
     make_detector = factories.get(self.model, factories['hbos'])

     self.models = {
         model: make_detector()
         for model in self.models_in_scope
     }
     self.custom_model_scalers = {
         model: MinMaxScaler()
         for model in self.models_in_scope
     }
def getOutlierPCA(dataset):
    '''
    @brief Fit a default PCA detector on the dataset and return the labels
    it assigned to the training instances (0 = inlier, 1 = outlier)
    @param dataset Dataset to fit the detector on
    @return The detector's labels_ attribute, one label per instance
    '''
    detector = PCA()
    # labels_ only exists once the detector has been fitted
    detector.fit(dataset)
    training_labels = detector.labels_
    return training_labels
示例#10
0
def define_classifiers(random_state, outliers_fraction):
    """Build the detectors under comparison, keyed by display name.

    Every detector uses ``outliers_fraction`` as its contamination; those
    that accept a seed also receive ``random_state``.
    """
    plain = {'contamination': outliers_fraction}
    seeded = {'contamination': outliers_fraction,
              'random_state': random_state}

    detectors = {}
    detectors['Angle-based Outlier Detector (ABOD)'] = ABOD(**plain)
    detectors['Cluster-based Local Outlier Factor'] = CBLOF(
        check_estimator=False, **seeded)
    detectors['Feature Bagging'] = FeatureBagging(**seeded)
    detectors['Histogram-base Outlier Detection (HBOS)'] = HBOS(**plain)
    detectors['Isolation Forest'] = IForest(**seeded)
    detectors['K Nearest Neighbors (KNN)'] = KNN(**plain)
    detectors['Local Outlier Factor (LOF)'] = LOF(**plain)
    detectors['Minimum Covariance Determinant (MCD)'] = MCD(**seeded)
    detectors['One-class SVM (OCSVM)'] = OCSVM(**plain)
    detectors['Principal Component Analysis (PCA)'] = PCA(**seeded)
    return detectors
示例#11
0
文件: ml.py 项目: pedrovhb/tcc
def train_model(station: Station) -> LSCP:
    """Fit an LSCP ensemble on the training observations of one station.

    Each training observation contributes the feature row
    ``[rms, peak_to_peak, kurtosis, crest]``. The ensemble is built from
    KNN, LOF and PCA base detectors with a contamination of 0.03, and the
    fitted ensemble is returned.
    """
    t1 = time.time()
    log.info(f'Training model for {station}...')
    log.info('Loading training observations')
    training_query = Observation.select(
        Observation.time,
        Observation.sample_frequency,
        Observation.sample_count,
        Observation.rms,
        Observation.crest,
        Observation.peak_to_peak,
        Observation.kurtosis,
    ).where(Observation.station == station, Observation.is_training)

    feature_rows = [
        [row.rms, row.peak_to_peak, row.kurtosis, row.crest]
        for row in training_query
    ]

    log.info('Fitting LSCP model')
    # NOTE(review): list multiplication repeats the same detector object
    # five times rather than creating five independent detectors.
    base_detectors = [KNN()] * 5 + [LOF()] * 5 + [PCA()] * 5
    ensemble = LSCP(base_detectors, contamination=0.03)
    ensemble.fit(X=feature_rows)
    log.info(f'Trained model in {time.time() - t1}')
    return ensemble
示例#12
0
def train():
    """Train a SUOD ensemble, score it with ROC AUC and log it to MLflow.

    The ensemble combines LOF, PCA and KNN base detectors. Its
    per-detector predictions are reduced with ``vote`` and compared with
    the known ground truth; the resulting AUC is printed and logged as the
    ``auc_score`` metric, and the fitted model is logged as
    ``anomaly_model``.
    """
    # NOTE(review): the ground truth below assumes get_data returns 1000
    # inliers followed by 10 outliers -- confirm against get_data.
    dataset = get_data(1000, 10, 100)
    contamination = 0.01
    with mlflow.start_run():
        base_estimators = [
            LOF(n_neighbors=5, contamination=contamination),
            LOF(n_neighbors=15, contamination=contamination),
            LOF(n_neighbors=25, contamination=contamination),
            PCA(contamination=contamination),
            KNN(n_neighbors=5, contamination=contamination),
            KNN(n_neighbors=15, contamination=contamination),
            KNN(n_neighbors=25, contamination=contamination)]
        model = SUOD(base_estimators=base_estimators, n_jobs=6,
                     rp_flag_global=True,
                     bps_flag=True,
                     approx_flag_global=False,
                     contamination=contamination)
        model.fit(dataset)
        model.approximate(dataset)
        predicted_labels = model.predict(dataset)
        voted_labels = vote(predicted_labels)
        true_labels = [0]*1000 + [1]*10
        # roc_auc_score expects (y_true, y_score); the two arguments were
        # previously passed in the opposite order.
        auc_score = roc_auc_score(true_labels, voted_labels)
        print("The resulted area under the ROC curve score is {}".format(auc_score))
        mlflow.log_metric("auc_score", auc_score)
        mlflow.sklearn.log_model(model, "anomaly_model", conda_env="conda.yaml")
示例#13
0
def main(args):
    """Fit KNN, PCA and VAE detectors on a .mat dataset and print metrics.

    The file named by ``args.filename`` must provide 'X' and 'y'. The data
    is split into a training part and a held-out part; the held-out part
    is halved into validation and evaluation sets. Each detector is fitted
    on the training features only and reported on both held-out sets via
    ``print_metrics``.

    ``args`` is read for: filename, train_split, components, epochs,
    contamination, gamma and capacity.
    """
    data = loadmat(args.filename)
    # NOTE(review): args.train_split is passed as *test_size*, so it is the
    # held-out fraction despite its name -- confirm this is intended.
    trainx, testx, trainy, testy = train_test_split(data['X'],
                                                    data['y'],
                                                    test_size=args.train_split,
                                                    random_state=2)
    valx, evalx, valy, evaly = train_test_split(testx, testy, test_size=0.5)

    data_size = len(trainx[0])
    # Layer widths must be whole numbers: use floor division (true division
    # produced floats) and never let a layer shrink to zero units.
    encoder_neurons = [
        data_size,
        max(1, data_size // 2),
        max(1, data_size // 4),
    ]

    def fit_and_report(label, clf):
        """Fit clf on the training features and print both metric sets."""
        clf.fit(trainx)
        print("Results Validation {}".format(label))
        print_metrics(valy, clf.predict(valx))
        print("Results Evaluation {}".format(label))
        print_metrics(evaly, clf.predict(evalx))

    fit_and_report("KNN", KNN())
    fit_and_report("PCA", PCA(n_components=args.components))
    fit_and_report("VAE", VAE(encoder_neurons=encoder_neurons,
                              decoder_neurons=encoder_neurons[::-1],
                              epochs=args.epochs,
                              contamination=args.contamination,
                              gamma=args.gamma,
                              capacity=args.capacity))
示例#14
0
def calculate(method, total_roc, total_prn, x_train, x_test, y_train, y_test):
    """Fit one detector, print its scores and record test ROC / P@n.

    Parameters
    ----------
    method : str
        'KNN', 'CBLOF' or 'PCA'; any other value selects IForest.
    total_roc, total_prn : list
        Accumulators; the rounded test ROC AUC and precision@n are appended
        to them as plain numbers.
    x_train, x_test : array-like
        Training and test features.
    y_train, y_test : array-like
        Ground-truth labels, used for evaluation only.
    """
    if method == 'KNN':
        clf = KNN()
    elif method == 'CBLOF':
        clf = CBLOF()
    elif method == 'PCA':
        clf = PCA()
    else:
        clf = IForest()
    clf.fit(x_train)  # train the detector on x_train

    # Outlier scores on the training data (higher means more abnormal).
    y_train_scores = clf.decision_scores_
    print("On train Data:")
    evaluate_print(method, y_train, y_train_scores)

    # Outlier scores of the fitted detector on unseen data.
    y_test_scores = clf.decision_function(x_test)
    print("On Test Data:")
    evaluate_print(method, y_test, y_test_scores)

    y_true = column_or_1d(y_test)
    y_pred = column_or_1d(y_test_scores)
    check_consistent_length(y_true, y_pred)

    # A stray trailing comma used to turn `roc` into a 1-tuple, so
    # total_roc collected tuples instead of numbers.
    roc = np.round(roc_auc_score(y_true, y_pred), decimals=4)
    prn = np.round(precision_n_scores(y_true, y_pred), decimals=4)

    total_roc.append(roc)
    total_prn.append(prn)
示例#15
0
def load_classifiers(outliers_fraction):
    """Build the detectors under comparison, keyed by display name.

    ``outliers_fraction`` is capped at 0.5 before being used as every
    detector's contamination. Seeded detectors share one
    ``RandomState(42)`` instance.
    """
    outliers_fraction = min(0.5, outliers_fraction)
    random_state = np.random.RandomState(42)

    plain = {'contamination': outliers_fraction}
    seeded = {'contamination': outliers_fraction,
              'random_state': random_state}

    # Eleven detector configurations, in the order they are reported.
    detectors = {}
    detectors['Angle-based Outlier Detector (ABOD)'] = ABOD(**plain)
    detectors['Cluster-based Local Outlier Factor (CBLOF)'] = CBLOF(
        check_estimator=False, **seeded)
    detectors['Feature Bagging'] = FeatureBagging(LOF(n_neighbors=35),
                                                  **seeded)
    detectors['Histogram-base Outlier Detection (HBOS)'] = HBOS(**plain)
    detectors['Isolation Forest'] = IForest(behaviour="new", **seeded)
    detectors['K Nearest Neighbors (KNN)'] = KNN(**plain)
    detectors['Average KNN'] = KNN(method='mean', **plain)
    detectors['Local Outlier Factor (LOF)'] = LOF(n_neighbors=35, **plain)
    detectors['Minimum Covariance Determinant (MCD)'] = MCD(**seeded)
    detectors['One-class SVM (OCSVM)'] = OCSVM(**plain)
    detectors['Principal Component Analysis (PCA)'] = PCA(**seeded)
    return detectors
示例#16
0
def main():
    """Launch ``runByScaler`` once per scaling mode, in parallel.

    Builds the full table of detector instances and hands it, together
    with the dataset root, the start/count settings and the list of
    "other" model names, to ``runByScaler`` for each of the three scaler
    modes.
    """
    root = 'Unsupervised_Anamaly_Detection_csv'
    scaler_modes = ['no', 'std', 'minmax']
    start, counts = 0, 90
    scaler_workers, model_workers = 3, 4
    run_name = "30_Models"

    other_model_names = [
        'AvgKNN', 'LargestKNN', 'MedKNN', 'PCA', 'COF', 'LODA', 'LOF', 'HBOS',
        'MCD', 'AvgBagging', 'MaxBagging', 'IForest', 'CBLOF', 'COPOD', 'SOD',
        'LSCPwithLODA', 'AveLMDD', 'VarLMDD', 'IqrLMDD', 'SoGaal', 'MoGaal',
        'VAE', 'AutoEncoder'
    ]

    detectors = {
        'BRM': BRM(bootstrap_sample_percent=70),
        'GM': GaussianMixture(),
        'IF': IsolationForest(),
        'OCSVM': OneClassSVM(),
        'EE': EllipticEnvelope(),
        'AvgKNN': KNN(method='mean'),
        'LargestKNN': KNN(method='largest'),
        'MedKNN': KNN(method='median'),
        'PCA': PCA(),
        'COF': COF(),
        'LODA': LODA(),
        'LOF': LOF(),
        'HBOS': HBOS(),
        'MCD': MCD(),
        'AvgBagging': FeatureBagging(combination='average'),
        'MaxBagging': FeatureBagging(combination='max'),
        'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'FactorAnalysis': FactorAnalysis(),
        'KernelDensity': KernelDensity(),
        'COPOD': COPOD(),
        'SOD': SOD(),
        'LSCPwithLODA': LSCP([LODA(), LODA()]),
        'AveLMDD': LMDD(dis_measure='aad'),
        'VarLMDD': LMDD(dis_measure='var'),
        'IqrLMDD': LMDD(dis_measure='iqr'),
        'SoGaal': SO_GAAL(),
        'MoGaal': MO_GAAL(),
        'VAE': VAE(encoder_neurons=[8, 4, 2]),
        'AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'OCKRA': m_OCKRA(),
    }

    # One delayed job per scaler mode; Parallel consumes the generator.
    jobs = (
        delayed(runByScaler)(root, scaler_mode, detectors, start, counts,
                             other_models=other_model_names,
                             CPUS=model_workers,
                             save_name=run_name)
        for scaler_mode in scaler_modes
    )
    Parallel(n_jobs=scaler_workers)(jobs)
示例#17
0
def outlier_detection(x_raw, y_raw):
    """
    Remove the samples that an Isolation Forest flags as outliers.

    The detector is fitted on ``x_raw`` with a contamination of 0.04 and a
    fixed seed; the flagged rows are deleted from both features and labels.

    :param x_raw: feature in ndarray
    :param y_raw: label in ndarray
    :return x_clean, y_clean: cleaned feature and label in ndarray
    """
    print()
    print("Detecting outliers...")
    print("Before outlier detection: {}".format(x_raw.shape))
    outliers_fraction = 0.04
    random_state = np.random.RandomState(42)

    # Isolation Forest is the detector in use. The former dict of candidate
    # detectors was built on every call but never read, so it was dropped.
    clf = IForest(contamination=outliers_fraction, random_state=random_state)
    clf.fit(x_raw)
    y_pred = clf.predict(x_raw)

    # for pyod, 1 means outliers and 0 means inliers
    # for sklearn,  -1 means outliers and 1 means inliers
    # Walk over the actual predictions rather than a hard-coded row count
    # (1212), which broke on any dataset of a different size.
    idx_y_pred = [i for i, label in enumerate(y_pred) if label == 1]
    x_clean = del_rowsorcolumns(x_raw, idx_y_pred, axis=0)
    y_clean = del_rowsorcolumns(y_raw, idx_y_pred, axis=0)
    print("After outlier detection: {}".format(x_clean.shape))
    assert (x_clean.shape[0] == y_clean.shape[0])
    return x_clean, y_clean
def OD_detect(df, id_col=None, contamination=0.05, trans_cols=None):
    """
    Score a dataset with several PyOD detectors and export the results.

    Each detector (HBOS, IForest, CBLOF, PCA) is fitted on the feature
    columns; its decision scores and labels are collected in long format
    (columns: id, name, result, label, job_name), appended to the
    ``wh_v1.t_ml`` table through the module-level ``engine`` and returned.

    :param df: input DataFrame; a copy is made, the caller's frame is not
        modified
    :param id_col: column holding the row identifier; when None the index
        is used as identifier and every column is treated as a feature
    :param contamination: expected outlier fraction passed to each detector
    :param trans_cols: optional columns to power-transform before fitting
    :return: the concatenated results DataFrame
    """
    df = df.copy()
    OD_clfs = {
        "HBOS": HBOS(contamination=contamination),
        "IForest": IForest(contamination=contamination),
        "CBLOF": CBLOF(contamination=contamination, n_clusters=5),
        # "OCSVM": OCSVM(contamination=contamination),
        "PCA": PCA(contamination=contamination)
    }
    results_list = []
    od_cols = ["id", "name", "result", "label"]

    if id_col is None:
        # Previously this branch overwrote od_cols and never defined X_cols,
        # so the function failed whenever id_col was omitted.
        s_id = df.index
        X_cols = df.columns
    else:
        s_id = df[id_col]
        X_cols = df.columns.drop(id_col)

    if trans_cols is not None:
        for col in trans_cols:
            df[col] = PowerTransformer().fit_transform(df[col].values.reshape(
                -1, 1))

    for clf_name, clf in OD_clfs.items():
        od_result = pd.DataFrame(columns=od_cols)  # create an empty  dataframe

        od_result["id"] = s_id

        od_result['name'] = clf_name
        print(f"{clf_name}, {clf}")

        clf.fit(df[X_cols])

        od_result['result'] = clf.decision_scores_
        od_result['label'] = clf.labels_

        results_list.append(od_result)

    od_results_df = pd.concat(results_list, axis=0, ignore_index=True)
    # pd.datetime is no longer available in current pandas; Timestamp.now()
    # supports the same strftime-style format spec.
    job_name = f'{pd.Timestamp.now():%H%M}'
    od_results_df['job_name'] = job_name
    od_results_df.to_sql('t_ml',
                         engine,
                         if_exists='append',
                         schema='wh_v1',
                         method=psql_insert_copy)
    print(
        f"OD results {od_results_df.shape}exported to database{engine},job_name={job_name}"
    )
    return od_results_df
示例#19
0
    def setUp(self):
        """Generate seeded synthetic data and fit a seeded PCA detector."""
        self.n_train, self.n_test = 100, 50
        self.contamination = 0.1
        self.roc_floor = 0.5

        dataset = generate_data(n_train=self.n_train,
                                n_test=self.n_test,
                                contamination=self.contamination,
                                random_state=42)
        (self.X_train, self.y_train,
         self.X_test, self.y_test) = dataset

        detector = PCA(contamination=self.contamination, random_state=42)
        self.clf = detector
        self.clf.fit(self.X_train)
示例#20
0
文件: model.py 项目: esowc/DAAQS
    def pred_PCA(self, n_comp=3, comp_with='openaq'):
        """Flag outliers with a PCA-based detector.

        Parameters
        ----------
        n_comp : int
            Requested number of principal components. For the "openaq"
            comparison it is lowered to ``n_samples - 1`` when
            ``self.X_o`` has no more than ``n_comp`` rows.
        comp_with : {"openaq", "cams"}
            "openaq" fits one detector on ``self.X_o``; "cams" fits one
            detector per entry of ``self.X_c`` and keeps the label of the
            last row of each entry.

        Returns
        -------
        tuple
            The three values returned by ``self.pred_location(pred)``.

        Raises
        ------
        ValueError
            If ``comp_with`` is neither "openaq" nor "cams" (previously
            this surfaced as an unbound-local error on ``pred``).
        """
        if comp_with not in ("openaq", "cams"):
            raise ValueError(
                "comp_with must be 'openaq' or 'cams', got {!r}".format(
                    comp_with))
        self.comp_with = comp_with

        if comp_with == "openaq":
            # len() works for both lists and arrays; ``X == []`` is not a
            # reliable emptiness test once X is a NumPy array.
            n_samples = len(self.X_o)
            if n_samples > 2:
                # The number of samples must exceed the number of
                # components, so shrink n_comp when there are too few rows.
                n_components = n_comp if n_samples > n_comp else n_samples - 1
                self.clf = PCA(n_components=n_components)
                self.clf.fit(self.X_o)
                pred = self.clf.labels_
            else:
                # Zero, one or two samples: too few to score anything.
                pred = []
        else:
            pred = []
            for each_X in self.X_c:
                self.clf = PCA(n_components=n_comp)
                self.clf.fit(each_X)
                pred.append(self.clf.labels_[-1])

        A_location, B_location, C_location = self.pred_location(pred)

        return A_location, B_location, C_location
示例#21
0
    def pca(self, X_train, n_components=None, contamination=None):
        """
        Fit a PyOD PCA detector and score the data it was fitted on.

        Parameters
        __________
        X_train: scaled training data
        n_components: number of components passed to the detector
        contamination: percentage of anomalies in the data

        Returns
        ________
        Tuple of (anomaly scores after ``self.min_max_scaler``,
        outlier labels predicted for ``X_train``)
        """
        detector = PCAOD(n_components=n_components,
                         contamination=contamination)
        detector.fit(X_train)

        labels = detector.predict(X_train)  # outlier labels (0 or 1)
        raw_scores = detector.decision_function(X_train)  # outlier scores
        return self.min_max_scaler(raw_scores), labels
示例#22
0
文件: model.py 项目: esowc/DAAQS
    def pred_COPOD(self, comp_with="openaq"):
        """Flag outliers with a COPOD detector.

        Parameters
        ----------
        comp_with : {"openaq", "cams"}
            "openaq" fits one detector on ``self.X_o`` (skipped when it is
            empty); "cams" fits one detector per entry of ``self.X_c`` and
            keeps the label of the last row of each entry.

        Returns
        -------
        tuple
            The three values returned by ``self.pred_location(pred)``.

        Raises
        ------
        ValueError
            If ``comp_with`` is neither "openaq" nor "cams" (previously
            this surfaced as an unbound-local error on ``pred``).
        """
        if comp_with not in ("openaq", "cams"):
            raise ValueError(
                "comp_with must be 'openaq' or 'cams', got {!r}".format(
                    comp_with))
        self.comp_with = comp_with

        if comp_with == "openaq":
            # len() works for both lists and arrays; ``X == []`` is not a
            # reliable emptiness test once X is a NumPy array.
            if len(self.X_o) == 0:
                pred = []
            else:
                self.clf = COPOD()
                self.clf.fit(self.X_o)
                pred = self.clf.labels_
        else:
            pred = []
            for each_X in self.X_c:
                self.clf = COPOD()
                self.clf.fit(each_X)
                pred.append(self.clf.labels_[-1])

        A_location, B_location, C_location = self.pred_location(pred)

        return A_location, B_location, C_location
示例#23
0
    def fit_transform(self, df_train, df_corrupted):
        """Blank out every cell that was flagged as an outlier.

        Numeric and categorical outlier flags are computed by
        ``num_out_detect`` and ``cat_out_detect`` and inner-joined. For
        each column of ``df_corrupted``, cells whose ``<column>_outlier``
        flag equals 1 are replaced by NaN in the joined frame.

        Returns the joined frame and ``self.predictors``.
        """
        # Original note: n_components = min(n_samples, n_features) default,
        # n_selected_components = None.
        detector = PCA(contamination=0.25)

        numeric_flags = self.num_out_detect(df_train, df_corrupted, detector)
        categorical_flags = self.cat_out_detect(df_train, df_corrupted)
        df_outliers = numeric_flags.join(categorical_flags, how='inner')

        cells = ((col, row_label)
                 for col in df_corrupted.columns
                 for row_label in df_outliers.index)
        for col, row_label in cells:
            if df_outliers.loc[row_label, col + "_outlier"] != 1:
                continue
            df_outliers.loc[row_label, col] = np.nan

        return df_outliers, self.predictors
示例#24
0
    def setUp(self):
        """Generate seeded synthetic data and configure a SUOD ensemble.

        The ensemble holds five LOF detectors with growing neighbourhoods,
        one HBOS, one PCA and one LSCP built from two further LOF
        detectors. Cost-forecast files are looked up next to this test
        module.
        """
        self.n_train, self.n_test = 1000, 500
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.random_state = 42

        dataset = generate_data(n_train=self.n_train,
                                n_test=self.n_test,
                                contamination=self.contamination,
                                random_state=self.random_state)
        (self.X_train, self.y_train,
         self.X_test, self.y_test) = dataset

        lof_detectors = [
            LOF(n_neighbors=n_neighbors, contamination=self.contamination)
            for n_neighbors in (5, 15, 25, 35, 45)
        ]
        nested_lscp = LSCP(detector_list=[
            LOF(n_neighbors=5, contamination=self.contamination),
            LOF(n_neighbors=15, contamination=self.contamination)
        ],
                           random_state=self.random_state)
        self.base_estimators = lof_detectors + [
            HBOS(contamination=self.contamination),
            PCA(contamination=self.contamination),
            nested_lscp,
        ]

        this_directory = os.path.abspath(os.path.dirname(__file__))
        self.cost_forecast_loc_fit_ = os.path.join(this_directory,
                                                   'bps_train.joblib')
        self.cost_forecast_loc_pred_ = os.path.join(this_directory,
                                                    'bps_prediction.joblib')

        suod_options = {
            'base_estimators': self.base_estimators,
            'n_jobs': 2,
            'rp_flag_global': True,
            'bps_flag': True,
            'contamination': self.contamination,
            'approx_flag_global': True,
            'cost_forecast_loc_fit': self.cost_forecast_loc_fit_,
            'cost_forecast_loc_pred': self.cost_forecast_loc_pred_,
            'verbose': True,
        }
        self.model = SUOD(**suod_options)
    def fit_transform(self, dataset, y=None):
        """Drop the rows that every configured detector flags as outliers.

        Each method named in ``self.methods`` ('iso', 'knn', 'pca', 'mcd')
        is fitted on the feature columns (everything except
        ``self.target``) and casts a 0/1 vote per row. Rows whose vote
        count equals the number of methods are stored in ``self.outliers``
        and removed from the result.

        Parameters
        ----------
        dataset : DataFrame
            Input data including the target column; it is not modified.
        y : ignored
            Accepted but not used.

        Returns
        -------
        DataFrame
            ``dataset`` without the unanimously flagged rows.
        """
        data = dataset.copy()
        # Fit every detector on the same untouched feature matrix.
        # Previously the features were re-derived from `data` after vote
        # columns had been added, so each later detector was trained on the
        # earlier detectors' votes as if they were features.
        features = dataset.drop(self.target, axis=1)

        if 'iso' in self.methods:
            self.iso_forest = IForest(contamination=self.contamination,
                                      random_state=self.random_state,
                                      behaviour='new')
            self.iso_forest.fit(features)
            data['iso'] = self.iso_forest.predict(features)

        if 'knn' in self.methods:
            self.knn_out = KNN(contamination=self.contamination)
            self.knn_out.fit(features)
            data['knn'] = self.knn_out.predict(features)

        if 'pca' in self.methods:
            self.out_pca = PCA_RO(contamination=self.contamination,
                                  random_state=self.random_state)
            self.out_pca.fit(features)
            data['pca'] = self.out_pca.predict(features)

        # use for those features which are gaussian distributed
        if 'mcd' in self.methods:
            self.mcd = EllipticEnvelope(contamination=0.01)
            self.mcd.fit(features)
            # scikit-learn labels outliers -1 and inliers +1; convert to the
            # 1 = outlier / 0 = inlier convention of the other votes so the
            # unanimity check below counts this detector correctly.
            data['mcd'] = [
                1 if label == -1 else 0
                for label in self.mcd.predict(features)
            ]

        data['vote_outlier'] = 0

        for method in self.methods:
            data['vote_outlier'] = data['vote_outlier'] + data[method]

        self.outliers = data[data['vote_outlier'] == len(self.methods)]

        return dataset[[
            label not in self.outliers.index for label in dataset.index
        ]]
示例#26
0
    def __init__(self,
                 window_size,
                 step_size=1,
                 contamination=0.1,
                 n_components=None,
                 n_selected_components=None,
                 copy=True,
                 whiten=False,
                 svd_solver='auto',
                 tol=0.0,
                 iterated_power='auto',
                 random_state=None,
                 weighted=True,
                 standardization=True):
        """Store the window settings and build the wrapped PyOD PCA model.

        ``window_size`` and ``step_size`` are kept on the instance; every
        other argument is stored as an attribute of the same name and
        forwarded to ``PCA_PYOD``, whose instance is kept in
        ``self.model_``.
        """
        super(PCA, self).__init__(contamination=contamination)
        self.window_size, self.step_size = window_size, step_size

        # Settings for the PCA detector, mirrored as instance attributes.
        pca_settings = {
            'n_components': n_components,
            'n_selected_components': n_selected_components,
            'copy': copy,
            'whiten': whiten,
            'svd_solver': svd_solver,
            'tol': tol,
            'iterated_power': iterated_power,
            'random_state': random_state,
            'weighted': weighted,
            'standardization': standardization,
        }
        for setting_name, setting_value in pca_settings.items():
            setattr(self, setting_name, setting_value)

        # Initialise the underlying PCA model.
        self.model_ = PCA_PYOD(contamination=self.contamination,
                               **pca_settings)
def choose_model(model, nnet):
    """Return a new PyOD detector selected by its short name.

    Only the requested detector is instantiated. Previously all sixteen
    were built on every call, which also sliced ``nnet`` even when the
    chosen model does not use it.

    Parameters
    ----------
    model : str
        One of 'AE', 'VAE', 'ABOD', 'FeatureBagging', 'HBOS', 'IForest',
        'KNN', 'LOF', 'OCSVM', 'PCA', 'SOS', 'COF', 'CBLOF', 'SOD', 'LOCI',
        'MCD'.
    nnet : sequence of int
        Layer sizes; used only by 'AE' (as given) and 'VAE' (first five
        entries for the encoder, entries from index 4 on for the decoder).

    Raises
    ------
    KeyError
        If ``model`` is not a known name.
    """
    factories = {
        'AE':
        lambda: AutoEncoder(hidden_neurons=nnet, contamination=0.1, epochs=15),
        'VAE':
        lambda: VAE(encoder_neurons=nnet[:5],
                    decoder_neurons=nnet[4:],
                    contamination=0.1,
                    epochs=13),
        'ABOD': ABOD,
        'FeatureBagging': FeatureBagging,
        'HBOS': HBOS,
        'IForest': IForest,
        'KNN': KNN,
        'LOF': LOF,
        'OCSVM': OCSVM,
        'PCA': PCA,
        'SOS': SOS,
        'COF': COF,
        'CBLOF': CBLOF,
        'SOD': SOD,
        'LOCI': LOCI,
        'MCD': MCD,
    }
    return factories[model]()
示例#28
0
def train(doc_list, dataset_name, clf_name, max_files=10):
    """Fit a detector on each benchmark file and report average scores.

    The same detector instance is re-fitted on every file; the ROC AUC and
    precision@n of its training scores are collected per file and averaged.

    Args:
        doc_list: Indexable collection of CSV paths.  Each file is read with
            the first row as header and the first column as index.
        dataset_name: Name used only in the printed summary line.
        clf_name: One of ``"PCA"``, ``"MCD"``, ``"LOF"``, ``"KNN"``,
            ``"LODA"``.
        max_files: Upper bound on how many leading entries of ``doc_list``
            are processed.  Defaults to 10, the previously hard-coded count.

    Returns:
        Tuple ``(mean_roc_auc, mean_precision_at_n)`` over processed files.

    Raises:
        ValueError: If ``clf_name`` is not supported, or if there is no file
            to process.
    """
    if clf_name == "PCA":
        clf = PCA()
    elif clf_name == "MCD":
        clf = MCD()
    elif clf_name == "LOF":
        clf = LOF()
    elif clf_name == "KNN":
        clf = KNN()
    elif clf_name == "LODA":
        clf = LODA()
    else:
        # Previously ``clf`` was simply left unbound and the failure only
        # surfaced later, at ``clf.fit``, as an UnboundLocalError.
        raise ValueError("Unsupported clf_name: " + repr(clf_name))

    # Slicing instead of ``range(10)`` so a shorter list no longer raises
    # IndexError part-way through the run.
    files = doc_list[:max_files]
    if len(files) == 0:
        raise ValueError("doc_list must contain at least one file")

    model_roc = []
    model_prc = []
    for i, path in enumerate(files):
        data = pd.read_csv(path, header=0, index_col=0)
        # ``drop``, ``ground_truth``, ``transfor`` and ``_flatten`` are
        # module-level names defined elsewhere in this file.
        train_x = data.drop(drop + ground_truth, axis=1).values
        train_y = np.array([
            transfor[x]
            for x in list(_flatten(data[ground_truth].values.tolist()))
        ])
        clf.fit(train_x)
        predict = clf.decision_scores_
        roc = roc_auc_score(train_y, predict)
        prc = precision_n_scores(train_y, predict)
        # Progress report every 200 files; only reachable when the caller
        # raises ``max_files`` to at least 200.
        if (i + 1) % 200 == 0:
            print("第" + str(i + 1) + "个文件结果:")
            evaluate_print(clf_name, train_y, predict)
        model_roc.append(roc)
        model_prc.append(prc)

    model_roc_avg = np.mean(model_roc)
    model_prc_avg = np.mean(model_prc)
    print("模型" + clf_name + "在数据集" + dataset_name + "的平均roc_auc为" +
          str(round(model_roc_avg, 4)) + ",平均prc为" +
          str(round(model_prc_avg, 4)) + "。")

    return model_roc_avg, model_prc_avg
示例#29
0
def print_accuracy(train_arr, test_arr, trader_id):
    """Fit a PCA detector per series and print train/test percentages.

    ``train_arr`` and ``test_arr`` are walked in lockstep.  Each pair of
    series is turned into column vectors, a detector is fitted on the
    training column, and two percentages are printed: ``100 - flagged%`` on
    the training column and ``flagged%`` on the test column.

    The previous version built ``train_data`` but then fitted and scored the
    detector on the whole ``train_arr``.  That made the training percentage
    (divided by the length of series ``i``) meaningless and left the feature
    count at fit time different from that of ``test_data`` at predict time.

    Args:
        train_arr: Sequence of training series.
        test_arr: Sequence of test series, parallel to ``train_arr``.
        trader_id: Identifier echoed in the training line of the output.

    Returns:
        None.  Results are printed.
    """
    if len(train_arr) == 0 or len(test_arr) == 0:
        return
    for train_series, test_series in zip(train_arr, test_arr):
        l1 = len(train_series)
        l2 = len(test_series)
        if l1 == 0 or l2 == 0:
            continue
        train_data = np.array([train_series]).T
        test_data = np.array([test_series]).T
        # clf=OCSVM(kernel ='rbf',gamma = 0.5)
        print(len(train_arr))
        # NOTE(review): 15 components was the original request; it is
        # clamped to the data's dimensions because the underlying PCA is
        # expected to reject n_components larger than
        # min(n_samples, n_features) -- confirm the intended value.
        clf = PCA(n_components=min(15, *train_data.shape))
        clf.fit(train_data)
        y_pred = clf.predict(train_data)
        print("TRAINING ACCURACY for TRADER", trader_id, ":",
              100 - (sum(y_pred) * 100 / l1))
        y_pred = clf.predict(test_data)
        print("TESTING ACCURACY: ", sum(y_pred) * 100 / l2)
示例#30
0
    'K Nearest Neighbors (KNN)':
    KNN(contamination=outliers_fraction),
    'Average KNN':
    KNN(method='mean', contamination=outliers_fraction),
    # 'Median KNN': KNN(method='median',
    #                   contamination=outliers_fraction),
    'Local Outlier Factor (LOF)':
    LOF(n_neighbors=35, contamination=outliers_fraction),
    # 'Local Correlation Integral (LOCI)':
    #     LOCI(contamination=outliers_fraction),
    'Minimum Covariance Determinant (MCD)':
    MCD(contamination=outliers_fraction, random_state=random_state),
    'One-class SVM (OCSVM)':
    OCSVM(contamination=outliers_fraction),
    'Principal Component Analysis (PCA)':
    PCA(contamination=outliers_fraction, random_state=random_state),
    # 'Stochastic Outlier Selection (SOS)': SOS(
    #     contamination=outliers_fraction),
    'Locally Selective Combination (LSCP)':
    LSCP(detector_list,
         contamination=outliers_fraction,
         random_state=random_state),
    # 'Connectivity-Based Outlier Factor (COF)':
    #     COF(n_neighbors=35, contamination=outliers_fraction),
    # 'Subspace Outlier Detection (SOD)':
    #     SOD(contamination=outliers_fraction),
}

# Print every configured detector name with a 1-based position.
for i, clf in enumerate(classifiers):
    print('Model', i + 1, clf)
    'Isolation Forest': IForest(contamination=outliers_fraction,
                                random_state=random_state),
    '(KNN) K Nearest Neighbors ': KNN(
        contamination=outliers_fraction),
    'Average KNN': KNN(method='mean',
                       contamination=outliers_fraction),
    # 'Median KNN': KNN(method='median',
    #                   contamination=outliers_fraction),
    '(LOF) Local Outlier Factor ':
        LOF(n_neighbors=35, contamination=outliers_fraction),
    # 'Local Correlation Integral (LOCI)':
    #     LOCI(contamination=outliers_fraction),
    '(MCD) Minimum Covariance Determinant ': MCD(
        contamination=outliers_fraction, random_state=random_state),
    'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
    '(PCA) Principal Component Analysis ': PCA(
        contamination=outliers_fraction, random_state=random_state),
    # 'Stochastic Outlier Selection (SOS)': SOS(
    #     contamination=outliers_fraction),
    '(LSCP) Locally Selective Combination ': LSCP(
        detector_list, contamination=outliers_fraction,
        random_state=random_state),
    # 'Connectivity-Based Outlier Factor (COF)':
    #     COF(n_neighbors=35, contamination=outliers_fraction),
    # 'Subspace Outlier Detection (SOD)':
    #     SOD(contamination=outliers_fraction),
}
# Offer the configured detector names in a drop-down and keep the choice.
st.subheader('SELECT AN ALGORITHM:')
classifier_name = st.selectbox('THE ALGORITHM', list(classifiers))

# Show all detectors
示例#32
0
if __name__ == "__main__":
    # Experiment settings: outlier fraction and train/test sample counts.
    contamination = 0.1
    n_train = 200
    n_test = 100

    # Synthetic 2-feature data set with a fixed seed.
    # NOTE(review): the unpacking order below assumes generate_data returns
    # (X_train, y_train, X_test, y_test) -- confirm against the installed
    # PyOD version.
    X_train, y_train, X_test, y_test = generate_data(
        n_train=n_train,
        n_test=n_test,
        n_features=2,
        contamination=contamination,
        random_state=42)

    # Fit a PCA detector with default settings on the training split.
    clf_name = 'PCA'
    clf = PCA()
    clf.fit(X_train)

    # Training-side results are exposed as fitted attributes:
    # binary labels (0: inliers, 1: outliers) and raw outlier scores.
    y_train_pred = clf.labels_
    y_train_scores = clf.decision_scores_

    # Test-side results are computed on demand: labels, then scores.
    y_test_pred = clf.predict(X_test)
    y_test_scores = clf.decision_function(X_test)

    # Report the evaluation for each split under its own heading.
    for heading, y_true, y_scores in (
            ("\nOn Training Data:", y_train, y_train_scores),
            ("\nOn Test Data:", y_test, y_test_scores)):
        print(heading)
        evaluate_print(clf_name, y_true, y_scores)
示例#33
0
class TestPCA(unittest.TestCase):
    """Checks of the PCA detector's fitted state and prediction API."""

    def setUp(self):
        """Generate a seeded synthetic data set and fit a detector on it."""
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.5

        # NOTE(review): unpacking assumes generate_data returns
        # (X_train, y_train, X_test, y_test) -- confirm for the PyOD version
        # in use.
        data = generate_data(n_train=self.n_train,
                             n_test=self.n_test,
                             contamination=self.contamination,
                             random_state=42)
        self.X_train, self.y_train, self.X_test, self.y_test = data

        self.clf = PCA(contamination=self.contamination, random_state=42)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        """The fitted detector passes ``check_estimator``."""
        check_estimator(self.clf)

    def test_parameters(self):
        """Every expected fitted attribute exists and is not None."""
        fitted_attrs = (
            'decision_scores_',
            'labels_',
            'threshold_',
            '_mu',
            '_sigma',
            'selected_components_',
            'selected_w_components_',
        )
        for attr in fitted_attrs:
            assert_true(hasattr(self.clf, attr) and
                        getattr(self.clf, attr) is not None)

    def test_train_scores(self):
        """There is one training score per training sample."""
        n_samples = self.X_train.shape[0]
        assert_equal(len(self.clf.decision_scores_), n_samples)

    def test_prediction_scores(self):
        """Test scores have the right length and beat the ROC floor."""
        scores = self.clf.decision_function(self.X_test)

        # one score per test sample
        assert_equal(scores.shape[0], self.X_test.shape[0])

        auc = roc_auc_score(self.y_test, scores)
        assert_greater(auc, self.roc_floor)

    def test_prediction_labels(self):
        """Predicted labels have the same shape as the test labels."""
        labels = self.clf.predict(self.X_test)
        assert_equal(labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        """Default probabilities lie within [0, 1]."""
        proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(proba.min(), 0)
        assert_less_equal(proba.max(), 1)

    def test_prediction_proba_linear(self):
        """Probabilities from the 'linear' method lie within [0, 1]."""
        proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(proba.min(), 0)
        assert_less_equal(proba.max(), 1)

    def test_prediction_proba_unify(self):
        """Probabilities from the 'unify' method lie within [0, 1]."""
        proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(proba.min(), 0)
        assert_less_equal(proba.max(), 1)

    def test_prediction_proba_parameter(self):
        """An unknown probability method raises ValueError."""
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        """``fit_predict`` returns one label per training sample."""
        labels = self.clf.fit_predict(self.X_train)
        assert_equal(labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        """Supported scorers run; an unknown one raises NotImplementedError."""
        X, y = self.X_test, self.y_test

        self.clf.fit_predict_score(X, y)
        for scoring in ('roc_auc_score', 'prc_n_score'):
            self.clf.fit_predict_score(X, y, scoring=scoring)

        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(X, y, scoring='something')

    def test_predict_rank(self):
        """Ranks follow the score ordering and stay within the train size."""
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test)

        # the ordering of ranks must agree with the ordering of scores
        assert_allclose(rankdata(ranks), rankdata(scores), atol=3)
        assert_array_less(ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, ranks)

    def test_predict_rank_normalized(self):
        """Normalized ranks follow the score ordering and stay below 1.01."""
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # the ordering of ranks must agree with the ordering of scores
        assert_allclose(rankdata(ranks), rankdata(scores), atol=3)
        assert_array_less(ranks, 1.01)
        assert_array_less(-0.1, ranks)

    def tearDown(self):
        """Nothing to clean up."""
        pass