Example #1
import numpy as np
from pyod.models.knn import KNN
from pyod.models.cblof import CBLOF
from pyod.models.pca import PCA
from pyod.models.iforest import IForest
from pyod.utils.data import evaluate_print
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score
from sklearn.utils import column_or_1d, check_consistent_length


def calculate(method, total_roc, total_prn, x_train, x_test, y_train, y_test):
    if method == 'KNN':
        clf = KNN()
    elif method == 'CBLOF':
        clf = CBLOF()
    elif method == 'PCA':
        clf = PCA()
    else:
        clf = IForest()
    clf.fit(x_train)  # train the detector clf on x_train

    # labels and anomaly scores on the training data
    y_train_pred = clf.labels_  # binary labels on the training data (0: inlier, 1: outlier)
    y_train_scores = clf.decision_scores_  # anomaly scores on the training data (higher means more anomalous)
    print("On train Data:")
    evaluate_print(method, y_train, y_train_scores)

    # use the trained clf to predict outliers in unseen data
    y_test_pred = clf.predict(x_test)  # binary labels on the test data (0: inlier, 1: outlier)
    y_test_scores = clf.decision_function(x_test)  # anomaly scores on the test data (higher means more anomalous)
    print("On Test Data:")
    evaluate_print(method, y_test, y_test_scores)

    y_true = column_or_1d(y_test)
    y_pred = column_or_1d(y_test_scores)
    check_consistent_length(y_true, y_pred)

    roc = np.round(roc_auc_score(y_true, y_pred), decimals=4)
    prn = np.round(precision_n_scores(y_true, y_pred), decimals=4)

    total_roc.append(roc)
    total_prn.append(prn)
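A minimal usage sketch (an addition, not part of the scraped snippet), assuming the imports above. The unpacking order follows the older pyod convention used elsewhere on this page; some pyod versions return the splits as X_train, X_test, y_train, y_test instead, so check your version.

from pyod.utils.data import generate_data

x_train, y_train, x_test, y_test = generate_data(
    n_train=200, n_test=100, contamination=0.1, random_state=42)

total_roc, total_prn = [], []
for method in ['KNN', 'IForest']:
    calculate(method, total_roc, total_prn, x_train, x_test, y_train, y_test)
print(total_roc, total_prn)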
Example #2
def model_test(model_type, y_train, y_test, X_train, X_test, model_file,
               save_flag):
    if model_type == 'KNN':
        clf_name = 'KNN'
        clf = KNN()
        clf.fit(X_train)
    elif model_type == 'XGBOD':
        clf_name = 'XGBOD'
        # set scale_pos_weight to sum(negative instances) / sum(positive instances)
        clf = XGBOD(random_state=42, scale_pos_weight=50)
        clf.fit(X_train, y_train)
    elif model_type == 'SOD':
        # train SOD detector
        # Note that SOD is meant to work in high dimensions d > 2.
        # But here we are using 2D for visualization purpose
        # thus, higher precision is expected in higher dimensions
        clf_name = 'SOD'
        clf = SOD()
        clf.fit(X_train)
    elif model_type == 'VAE':
        # train VAE detector (Beta-VAE)
        clf_name = 'VAE'
        contamination = 0.01
        clf = VAE(epochs=30,
                  contamination=contamination,
                  gamma=0.8,
                  capacity=0.2)
        clf.fit(X_train)

    # save the model if specified
    if save_flag == '1':
        pickle.dump(clf, open(model_file, "wb"))

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    conf_train = confusion_matrix(y_train, y_train_pred)
    print("<<<< confusion matrix for train: ", conf_train)

    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
    conf_test = confusion_matrix(y_test, y_test_pred)
    print("<<<< confusion matrix for test: ", conf_test)

    # visualize the results
    #todo: Input data has to be 2-d for visualization.
    #visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
    #         y_test_pred, show_figure=True, save_figure=False)

    return model_file
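A follow-up sketch (an addition, not in the original): reloading the model that model_test pickled. The file name here is hypothetical; use whatever path you passed as model_file.

import pickle

import numpy as np

with open("knn_model.p", "rb") as f:  # hypothetical path written by model_test
    clf = pickle.load(f)

X_new = np.random.rand(5, 2)  # toy rows in the same feature space as training
print(clf.predict(X_new))            # binary outlier labels (0/1)
print(clf.decision_function(X_new))  # raw outlier scores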
Example #3
def main():
    dataset, label = pre_data()
    from numpy import nan as NA
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(missing_values=NA, strategy="mean")
    dataset = imputer.fit_transform(dataset)
    x_train, x_test, y_train, y_label = train_test_split(dataset,
                                                         label,
                                                         test_size=0.3,
                                                         random_state=44)
    # x_train, x_test, y_train, y_label =[], [], [], []
    # for i in range(1000):
    #     x_train.append(dataset[i])
    #     y_train.append(label[i])
    # for i in range(6000,10000):
    #     x_train.append(dataset[i])
    #     y_train.append(label[i])
    # x_test = dataset[1000:6000]
    # y_label = label[1000:6000]
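    # the fit is repeated three times: IForest is randomized (no random_state
    # is set here), so labels and scores can differ from run to run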
    for i in range(3):
        clf_name = 'IForest'
        clf = IForest()
        clf.fit(x_train)

        # get the prediction label and outlier scores of the training data
        y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_train_scores = clf.decision_scores_  # raw outlier scores
        from sklearn.metrics import accuracy_score
        from sklearn.metrics import precision_score
        from sklearn.metrics import recall_score
        print(accuracy_score(y_train, y_train_pred))
        print(precision_score(y_train, y_train_pred))
        print(recall_score(y_train, y_train_pred))
        # get the prediction on the test data
        y_test_pred = clf.predict(x_test)  # outlier labels (0 or 1)
        y_test_scores = clf.decision_function(x_test)  # outlier scores

        # evaluate and print the results
        print("\nOn Training Data:")
        evaluate_print(clf_name, y_train, y_train_scores)
        print(accuracy_score(y_label, y_test_pred))
        print(precision_score(y_label, y_test_pred))
        print(recall_score(y_label, y_test_pred))
        print("\nOn Test Data:")
        evaluate_print(clf_name, y_label, y_test_scores)
Example #4
def train(doc_list, dataset_name, clf_name):
    model_roc = []
    model_prc = []
    if clf_name == "PCA":
        clf = PCA()
    elif clf_name == "MCD":
        clf = MCD()
    elif clf_name == "LOF":
        clf = LOF()
    elif clf_name == "KNN":
        clf = KNN()
    elif clf_name == "LODA":
        clf = LODA()
    for i in range(10):
        data = pd.read_csv(doc_list[i], header=0, index_col=0)
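        # `drop`, `ground_truth` and `transfor` are module-level globals defined
        # outside this snippet; `_flatten` (presumably tkinter's) flattens the
        # nested label list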
        train_x = data.drop(drop + ground_truth, axis=1).values
        train_y = np.array([
            transfor[x]
            for x in list(_flatten(data[ground_truth].values.tolist()))
        ])
        clf.fit(train_x)
        predict = clf.decision_scores_
        roc = roc_auc_score(train_y, predict)
        prc = precision_n_scores(train_y, predict)
        if (i + 1) % 200 == 0:
            print("Results for file " + str(i + 1) + ":")
            evaluate_print(clf_name, train_y, predict)
        model_roc.append(roc)
        model_prc.append(prc)
    model_roc_avg = np.mean(model_roc)
    model_prc_avg = np.mean(model_prc)
    print("模型" + clf_name + "在数据集" + dataset_name + "的平均roc_auc为" +
          str(round(model_roc_avg, 4)) + ",平均prc为" +
          str(round(model_prc_avg, 4)) + "。")

    return model_roc_avg, model_prc_avg
Example #5
def pyod_anomaly_detection(type, contamination):
    X_train, y_train, X_test, y_test = data(type=type,
                                            contamination=contamination)
    if type == 'MAD':
        # train MAD detector
        clf_name = 'MAD'
        clf = MAD(threshold=3.5)
        clf.fit(X_train)

        # get the prediction labels and outlier scores of the training data
        y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_train_scores = clf.decision_scores_  # raw outlier scores

        # get the prediction on the test data
        y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
        y_test_scores = clf.decision_function(X_test)  # outlier scores
        # evaluate and print the results
        print("\nOn Training Data:")
        evaluate_print(clf_name, y_train, y_train_scores)
        print("\nOn Test Data:")
        evaluate_print(clf_name, y_test, y_test_scores)

        # visualize the results
        # MAD is univariate, so duplicate the single feature into two
        # dimensions purely for visualization
        visualize(clf_name,
                  np.hstack((X_train, X_train)),
                  y_train,
                  np.hstack((X_test, X_test)),
                  y_test,
                  y_train_pred,
                  y_test_pred,
                  show_figure=True,
                  save_figure=False)
    elif type == 'ABOD':
        # train ABOD detector
        clf_name = 'ABOD'
        clf = ABOD()
        clf.fit(X_train)

        # get the prediction labels and outlier scores of the training data
        y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_train_scores = clf.decision_scores_  # raw outlier scores

        # get the prediction on the test data
        y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
        y_test_scores = clf.decision_function(X_test)  # outlier scores

        # evaluate and print the results
        print("\nOn Training Data:")
        evaluate_print(clf_name, y_train, y_train_scores)
        print("\nOn Test Data:")
        evaluate_print(clf_name, y_test, y_test_scores)

        # visualize the results
        visualize(clf_name,
                  X_train,
                  y_train,
                  X_test,
                  y_test,
                  y_train_pred,
                  y_test_pred,
                  show_figure=True,
                  save_figure=False)
    elif type == 'AutoEncoder':
        # train AutoEncoder detector
        clf_name = 'AutoEncoder'
        clf = AutoEncoder(epochs=30, contamination=contamination)
        clf.fit(X_train)

        # get the prediction labels and outlier scores of the training data
        y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_train_scores = clf.decision_scores_  # raw outlier scores

        # get the prediction on the test data
        y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
        y_test_scores = clf.decision_function(X_test)  # outlier scores

        # evaluate and print the results
        print("\nOn Training Data:")
        evaluate_print(clf_name, y_train, y_train_scores)
        print("\nOn Test Data:")
        evaluate_print(clf_name, y_test, y_test_scores)
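The three branches above differ only in how the detector is constructed; the fit/score/evaluate steps are identical. One possible refactor is a constructor lookup. A sketch (DETECTORS and make_detector are hypothetical names, not pyod API):

from pyod.models.abod import ABOD
from pyod.models.auto_encoder import AutoEncoder
from pyod.models.mad import MAD

DETECTORS = {
    'MAD': lambda contamination: MAD(threshold=3.5),
    'ABOD': lambda contamination: ABOD(),
    'AutoEncoder': lambda contamination: AutoEncoder(
        epochs=30, contamination=contamination),
}

def make_detector(name, contamination):
    # look up the constructor once; the shared pipeline handles the rest
    return DETECTORS[name](contamination)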
Example #6
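A hedged setup sketch (an addition, not part of the scraped snippet) so the code below runs stand-alone; the unpacking order follows the older pyod convention used elsewhere on this page.

from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.ocsvm import OCSVM
from pyod.models.lscp import LSCP
from pyod.utils.data import generate_data, evaluate_print
from pyod.utils.example import visualize

X_train, y_train, X_test, y_test = generate_data(
    n_train=200, n_test=100, contamination=0.1, random_state=42)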
detectors = [KNN(), LOF(), OCSVM()]

clf_name = 'LSCP'
clf = LSCP(base_estimators=detectors)
clf.fit(X_train)

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores

# evaluate and print the results
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)

# visualize the results
visualize(clf_name,
          X_train,
          y_train,
          X_test,
          y_test,
          y_train_pred,
          y_test_pred,
          show_figure=True,
          save_figure=False)
Example #7
    print('Combining {n_clf} kNN detectors'.format(n_clf=n_clf))

    for i in range(n_clf):
        k = k_list[i]

        clf = KNN(n_neighbors=k, method='largest')
        clf.fit(X_train_norm)

        train_scores[:, i] = clf.decision_scores_
        test_scores[:, i] = clf.decision_function(X_test_norm)

    # Decision scores have to be normalized before combination
    train_scores_norm, test_scores_norm = standardizer(train_scores,
                                                       test_scores)
    # Combination by average
    y_by_average = average(test_scores_norm)
    evaluate_print('Combination by Average', y_test, y_by_average)

    # Combination by max
    y_by_maximization = maximization(test_scores_norm)
    evaluate_print('Combination by Maximization', y_test, y_by_maximization)

    # Combination by aom
    y_by_aom = aom(test_scores_norm, n_buckets=5)
    evaluate_print('Combination by AOM', y_test, y_by_aom)

    # Combination by moa
    y_by_moa = moa(test_scores_norm, n_buckets=5)
    evaluate_print('Combination by MOA', y_test, y_by_moa)
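The fragment above runs inside a function; a hedged sketch of the setup it assumes (mirroring pyod's combination example; the values in k_list are illustrative):

import numpy as np
from pyod.models.combination import aom, moa, average, maximization
from pyod.models.knn import KNN
from pyod.utils.data import generate_data, evaluate_print
from pyod.utils.utility import standardizer

X_train, y_train, X_test, y_test = generate_data(
    n_train=200, n_test=100, contamination=0.1, random_state=42)
X_train_norm, X_test_norm = standardizer(X_train, X_test)

k_list = [10, 15, 20, 25, 30]  # neighbour counts, one per kNN detector
n_clf = len(k_list)
train_scores = np.zeros([X_train.shape[0], n_clf])
test_scores = np.zeros([X_test.shape[0], n_clf])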
Example #8
    columns=["Dataset", "Dimensions", "PCA", "MCD", "LOF", "KNN", "LODA"])
result_roc
result_prc

# train on the full csv file and visualize the results
clf = PCA()
clf_name = "PCA"
read = r"D:\研一下学期\数据挖掘\作业4\pageb\meta_data\pageb.preproc.csv"
data = pd.read_csv(read, header=0, index_col=0)
train_x = data.drop(drop + ground_truth + ["original.label"], axis=1).values
train_y = np.array(
    [transfor[x] for x in list(_flatten(data[ground_truth].values.tolist()))])
clf.fit(train_x)
label = clf.labels_
predict = clf.decision_scores_
evaluate_print(clf_name, train_y, predict)
pca = decomposition.PCA(n_components=2)
X = pca.fit_transform(train_x)
visualize(clf_name,
          X,
          train_y,
          X,
          train_y,
          label,
          train_y,
          show_figure=True,
          save_figure=True)

clf = MCD()
clf_name = "PCA"
read = r"D:\研一下学期\数据挖掘\作业4\abalone\meta_data\abalone.preproc.csv"
Example #9
                X_test,
                n_estimators,
                # rp_flags[starts[i]:starts[i + 1]],
                jl_transformers,
                approx_flags[starts[i]:starts[i + 1]],
                verbose=True) for i in range(n_jobs))

    print('Orig decision_function time:', time.time() - start)
    print()

    # unfold and generate the label matrix
    predicted_scores_orig = np.zeros([X_test.shape[0], n_estimators])
    for i in range(n_jobs):
        predicted_scores_orig[:, starts[i]:starts[i + 1]] = np.asarray(
            all_results_scores[i]).T
    ##########################################################################
    predicted_scores = standardizer(predicted_scores)
    predicted_scores_orig = standardizer(predicted_scores_orig)

    evaluate_print('orig', y_test, average(predicted_scores_orig))
    evaluate_print('new', y_test, average(predicted_scores))

    evaluate_print('orig max', y_test, maximization(predicted_scores_orig))
    evaluate_print('new max', y_test, maximization(predicted_scores))

    evaluate_print('orig aom', y_test, aom(predicted_scores_orig))
    evaluate_print('new aom', y_test, aom(predicted_scores))

    evaluate_print('orig moa', y_test, moa(predicted_scores_orig))
    evaluate_print('new moa', y_test, moa(predicted_scores))
Example #10
    n_train = 20000  # number of training points
    n_test = 2000  # number of testing points
    n_features = 300  # number of features

    # Generate sample data
    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=n_features,
                      contamination=contamination,
                      random_state=42)

    # train AutoEncoder detector
    clf_name = 'AutoEncoder'
    clf = AutoEncoder(epochs=30, contamination=contamination)
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
Example #11
def fun(dir_path):
    file_list = []
    total_roc = []
    total_prn = []
    count = 0
    for home, dirs, files in os.walk("./"+dir_path+"/benchmarks"):
        for filename in files:
            fullname = os.path.join(home, filename)
            file_list.append(fullname)
    for file_csv in file_list:

        # if count == 2:
        #     break
        
        df = pd.read_csv(file_csv)
        columns = df.columns
        # df = df[columns].fillna('nan')

        data = df.drop(columns = ['point.id', 'motherset', 'origin'])

        class_mapping = {"anomaly": 1, "nominal": 0}
        data['ground.truth'] = data['ground.truth'].map(class_mapping)

        y = data['ground.truth']

        x = data.drop('ground.truth',axis=1)

        X_train, X_test, y_train, y_test = train_test_split(
                x, y, test_size=0.2, random_state=28)

        random_state = np.random.RandomState(42)
        outliers_fraction = 0.05
        # Define seven outlier detection tools to be compared
        classifiers = {
                'Angle-based Outlier Detector (ABOD)': ABOD(contamination=outliers_fraction),
                'Cluster-based Local Outlier Factor (CBLOF)':CBLOF(contamination=outliers_fraction,check_estimator=False, random_state=random_state),
                'Feature Bagging':FeatureBagging(LOF(n_neighbors=35),contamination=outliers_fraction,check_estimator=False,random_state=random_state),
                'Histogram-base Outlier Detection (HBOS)': HBOS(contamination=outliers_fraction),
                'Isolation Forest': IForest(contamination=outliers_fraction,random_state=random_state),
                'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
                'Average KNN': KNN(method='mean',contamination=outliers_fraction)
        }
        p_prn = []
        p_roc = []
        for i, (clf_name, clf) in enumerate(classifiers.items()):
            try:
                clf.fit(X_train)

                # get the prediction labels and outlier scores of the training data
                y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
                y_train_scores = clf.decision_scores_  # raw outlier scores

                # get the prediction on the test data
                y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
                y_test_scores = clf.decision_function(X_test)  # outlier scores

                # evaluate and print the results
                print(str(count)+"is analysing")
                print("\nOn Training Data:")
        
                evaluate_print(clf_name, y_train, y_train_scores)
                print("\nOn Test Data:")
                evaluate_print(clf_name, y_test, y_test_scores)
                roc = np.round(roc_auc_score(y_train, y_train_scores), decimals=4)
                prn = np.round(precision_n_scores(y_test, y_test_scores), decimals=4)

                p_prn.append(prn)
                p_roc.append(roc)
            except Exception:
                p_prn.append(-1)
                p_roc.append(-1)

        total_prn.append(p_prn)
        total_roc.append(p_roc)    
        count += 1
            
    total_prn = json.dumps(total_prn)
    total_roc = json.dumps(total_roc)
    a = open(dir_path+"_prn_list.txt", "w",encoding='UTF-8')
    a.write(total_prn)
    a.close()
    a = open(dir_path+"_roc_list.txt", "w",encoding='UTF-8')
    a.write(total_roc)
    a.close()
Example #12
            verbose=True)
        for i in range(n_jobs))

    print('Orig decision_function time:', time.time() - start)
    print()

    # unfold and generate the label matrix
    predicted_scores_orig = np.zeros([X.shape[0], n_estimators])
    for i in range(n_jobs):
        predicted_scores_orig[:, starts[i]:starts[i + 1]] = np.asarray(
            all_results_scores[i]).T
    ##########################################################################
    predicted_scores = standardizer(predicted_scores)
    predicted_scores_orig = standardizer(predicted_scores_orig)

    evaluate_print('orig', y_test, np.mean(predicted_scores_orig, axis=1))
    evaluate_print('new', y_test, np.mean(predicted_scores, axis=1))
    
#%%

    ##########################################################################
    start = time.time()
    for i in range(n_estimators):
        print(i)
        trained_estimators[i].predict(X)

    print('Orig decision_function time:', time.time() - start)
    print()
    
    ##########################################################################
    start = time.time()
Example #13
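The fragment below comes from inside a function; a hedged setup sketch for the names it expects (the detector choices and normalization flags are illustrative assumptions):

import numpy as np
from pyod.models.hbos import HBOS
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.utils.data import generate_data, evaluate_print
from pyod.utils.utility import standardizer
from xgboost import XGBClassifier

X_train, y_train, X_test, y_test = generate_data(
    n_train=200, n_test=100, contamination=0.1, random_state=42)
X_train_norm, X_test_norm = standardizer(X_train, X_test)

estimator_list = [KNN(), LOF(), HBOS()]   # unsupervised score generators
normalization_list = [True, True, False]  # which estimators see standardized data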
    X_train_add = np.zeros([X_train.shape[0], len(estimator_list)])
    X_test_add = np.zeros([X_test.shape[0], len(estimator_list)])

    # fit the model
    for index, estimator in enumerate(estimator_list):
        if normalization_list[index]:
            estimator.fit(X_train_norm)
            X_train_add[:, index] = estimator.decision_scores_
            X_test_add[:, index] = estimator.decision_function(X_test_norm)
        else:
            estimator.fit(X_train)
            X_train_add[:, index] = estimator.decision_scores_
            X_test_add[:, index] = estimator.decision_function(X_test)

    # prepare the new feature space
    X_train_new = np.concatenate((X_train, X_train_add), axis=1)
    X_test_new = np.concatenate((X_test, X_test_add), axis=1)

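    # Appending the detectors' outlier scores as extra feature columns is the
    # core of the XGBOD idea: the supervised XGBoost model below learns from
    # both the original features and the unsupervised scores.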
    clf = XGBClassifier()
    clf.fit(X_train_new, y_train)
    y_test_scores = clf.predict_proba(X_test)  # class probabilities (column 1 = outlier)

    evaluate_print('XGBOD', y_test, y_test_scores[:, 1])

    clf = XGBClassifier()
    clf.fit(X_train, y_train)
    y_test_scores_orig = clf.predict_proba(X_test)  # class probabilities (column 1 = outlier)

    evaluate_print('old', y_test, y_test_scores_orig[:, 1])
Example #14
from pyod.models.iforest import IForest
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print
import numpy as np
import pickle

X_train = np.loadtxt('X_train.txt', dtype=float)
y_train = np.loadtxt('y_train.txt', dtype=float)
X_test = np.loadtxt('X_test.txt', dtype=float)
y_test = np.loadtxt('y_test.txt', dtype=float)

clf = IForest()
clf.fit(X_train)

y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores
print(y_test_pred)

print("\nOn Test Data:")
evaluate_print('IForest', y_test[:len(y_test_scores)], y_test_scores)

pickle.dump(clf, open("IForest.p", "wb"))
Example #15
    try:
        mat = loadmat(mat_file)
    except TypeError:
        print('{data_file} does not exist. Use generated data'.format(
            data_file=mat_file))
        X, y = generate_data(train_only=True)  # load data
    except IOError:
        print('{data_file} does not exist. Use generated data'.format(
            data_file=mat_file))
        X, y = generate_data(train_only=True)  # load data
    else:
        X = mat['X']
        y = mat['y'].ravel()

    for t in range(ite):
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.4)

        # standardizing data for processing
        X_train_norm, X_test_norm = standardizer(X_train, X_test)

        # train a PCA detector on the standardized data

        clf = PCA()
        clf.fit(X_train_norm)

        train_scores = clf.decision_scores_
        test_scores = clf.decision_function(X_test_norm)

        print()
        evaluate_print('PCA Train', y_train, train_scores)
        evaluate_print('PCA Test', y_test, test_scores)
Example #16
    def test_evaluate_print(self):
        X_train, y_train, X_test, y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination)
        evaluate_print('dummy', y_train, y_train * 0.1)
Example #17
from pyod.models.iforest import IForest
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print
import numpy as np
import pickle

X_train = np.loadtxt('X_train.txt', dtype=float)
y_train = np.loadtxt('y_train.txt', dtype=float)
X_test = np.loadtxt('X_test.txt', dtype=float)
y_test = np.loadtxt('y_test.txt', dtype=float)

clf = IForest()
clf.fit(X_train)

# get the prediction labels and outlier scores of the training data
#y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
#y_train_scores = clf.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores
print(y_test_pred)

# evaluate and print the results
print("\nOn Test Data:")
evaluate_print('IForest', y_test, y_test_scores)

pickle.dump(clf, open("IForest.p", "wb"))
Example #18
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=0.4, random_state=42)

    contamination = y.sum() / len(y)
    base_estimators = get_estimators_small(contamination)

    model = SUOD(base_estimators=base_estimators,
                 n_jobs=6,
                 bps_flag=True,
                 contamination=contamination,
                 approx_flag_global=True)

    model.fit(X_train)  # fit all models with X
    model.approximate(X_train)  # conduct model approximation if it is enabled
    predicted_labels = model.predict(X_test)  # predict labels
    predicted_scores = model.decision_function(X_test)  # predict scores
    predicted_probs = model.predict_proba(X_test)  # predict outlier probabilities

    ###########################################################################
    # compared with other approaches
    evaluate_print('majority vote', y_test, majority_vote(predicted_labels))
    evaluate_print('average', y_test, average(predicted_scores))
    evaluate_print('maximization', y_test, maximization(predicted_scores))

    clf = LOF()
    clf.fit(X_train)
    evaluate_print('LOF', y_test, clf.decision_function(X_test))

    clf = IForest()
    clf.fit(X_train)
    evaluate_print('IForest', y_test, clf.decision_function(X_test))
Example #19
    # model prediction
    all_results_scores = Parallel(
        n_jobs=n_jobs, max_nbytes=None,
        verbose=True)(delayed(_parallel_decision_function)(
            n_estimators_list[i],
            trained_estimators[starts[i]:starts[i + 1]],
            None,
            X_test,
            n_estimators,
            jl_transformers,
            approx_flags[starts[i]:starts[i + 1]],
            verbose=True) for i in range(n_jobs))

    print('Orig decision_function time:', time.time() - start)
    print()

    # unfold and generate the label matrix
    predicted_scores_orig = np.zeros([X_test.shape[0], n_estimators])
    for i in range(n_jobs):
        predicted_scores_orig[:, starts[i]:starts[i + 1]] = np.asarray(
            all_results_scores[i]).T
    ##########################################################################
    predicted_scores = standardizer(predicted_scores)
    predicted_scores_orig = standardizer(predicted_scores_orig)

    evaluate_print('orig', y_test, average(predicted_scores_orig))
    evaluate_print('new', y_test, average(predicted_scores))

    evaluate_print('orig moa', y_test, moa(predicted_scores_orig))
    evaluate_print('new moa', y_test, moa(predicted_scores))
Example #20
    train_scores = np.zeros([X_train.shape[0], n_clf])
    test_scores = np.zeros([X_test.shape[0], n_clf])

    for i in range(n_clf):
        k = k_list[i]

        clf = KNN(n_neighbors=k, method='largest')
        clf.fit(X_train_norm)

        train_scores[:, i] = clf.decision_scores_
        test_scores[:, i] = clf.decision_function(X_test_norm)

    # decision scores have to be normalized before combination
    train_scores_norm, test_scores_norm = standardizer(train_scores,
                                                       test_scores)
    # combination by average
    y_by_average = average(test_scores_norm)
    evaluate_print('Combination by Average', y_test, y_by_average)

    # combination by max
    y_by_maximization = maximization(test_scores_norm)
    evaluate_print('Combination by Maximization', y_test, y_by_maximization)

    # combination by aom
    y_by_aom = aom(test_scores_norm, n_buckets=5)
    evaluate_print('Combination by AOM', y_test, y_by_aom)

    # combination by moa
    y_by_moa = moa(test_scores_norm, n_buckets=5)
    evaluate_print('Combination by MOA', y_test, y_by_moa)
Example #21
from pyod.models.auto_encoder import AutoEncoder
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print
import numpy as np
import pickle

if __name__ == '__main__':

    X_train = np.loadtxt('X_train.txt', dtype=float)
    y_train = np.loadtxt('y_train.txt', dtype=float)
    X_test = np.loadtxt('X_test.txt', dtype=float)
    y_test = np.loadtxt('y_test.txt', dtype=float)

    clf = AutoEncoder(epochs=30, contamination=0.2)
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    #y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    #y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores
    print(y_test_pred)

    # evaluate and print the results
    print("\nOn Test Data:")
    evaluate_print('AutoEncoder', y_test, y_test_scores)

    pickle.dump(clf, open("autoencoder.p", "wb"))
Example #22
    # train IForest detector
    clf_name = 'IForest'
    clf = IForest()
    clf.fit(X_train)

    # get the prediction label and decision_scores_ on the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

    # visualize the results
    visualize(clf_name,
              X_train,
              y_train,
              X_test,
              y_test,
              y_train_pred,
              y_test_pred,
              show_figure=True,
              save_figure=False)
Example #23
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=0.4, random_state=42)

    contamination = y.sum() / len(y)
    base_estimators = get_estimators_small(contamination)

    model = SUOD(base_estimators=base_estimators,
                 n_jobs=6,
                 bps_flag=True,
                 contamination=contamination,
                 approx_flag_global=True)

    model.fit(X_train)  # fit all models with X
    model.approximate(X_train)  # conduct model approximation if it is enabled

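    # `dump` and `load` below are presumably joblib's (from joblib import
    # dump, load), as used in SUOD's own examples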
    # save the model
    dump(model, 'model.joblib')
    # load the model
    model = load('model.joblib')

    predicted_labels = model.predict(X_test)  # predict labels
    predicted_scores = model.decision_function(X_test)  # predict scores
    predicted_probs = model.predict_proba(X_test)  # predict outlier probabilities

    ###########################################################################
    # model evaluation with the loaded model
    evaluate_print('majority vote', y_test, majority_vote(predicted_labels))
    evaluate_print('average', y_test, average(predicted_scores))
    evaluate_print('maximization', y_test, maximization(predicted_scores))