Example #1
def detect_outliers(lst):
    """detect outliers in a list of numpy arrays
    
    Parameters
    ----------
    lst : List
        [description]
    
    Returns
    -------
    inliers : List
        A list of the inliers
    """
    clf = OCSVM(verbose=0)
    clf.fit(lst)

    inlier_idx = []
    outlier_idx = []
    for index, data in enumerate(lst):
        y = clf.predict(data.reshape(1, -1))
        if y:  # y==1 for outliers
            logger.debug('Found outlier: {0}'.format(index))
            outlier_idx.append(index)
        else:
            inlier_idx.append(index)

    logger.info('{:.0%} are outliers'.format(len(outlier_idx) / len(lst)))
    return inlier_idx, outlier_idx
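
A minimal usage sketch for detect_outliers, assuming the pyod OCSVM import and the module-level logger that the function relies on (both set up here for illustration; the sample arrays are likewise made up):

import logging

import numpy as np
from pyod.models.ocsvm import OCSVM  # used inside detect_outliers

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)  # detect_outliers logs through this name

# three 2-feature samples; the last one is deliberately far from the rest
samples = [np.array([1.0, 1.1]), np.array([0.9, 1.0]), np.array([25.0, 30.0])]
inlier_idx, outlier_idx = detect_outliers(samples)
print(inlier_idx, outlier_idx)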
Example #2
def train_model(X, Y, contamination, name, from_scratch=True):
    model_dir = './model'
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)
    file_name = name + '.pkl'

    if from_scratch:
        if name == 'ocsvm':
            model = OCSVM(contamination=contamination)
            model.fit(X)
        elif name == 'iforest':
            model = IForest(contamination=contamination)
            model.fit(X)
        elif name == 'lof':
            model = LOF(contamination=contamination)
            model.fit(X)
        elif name == 'knn':
            model = KNN(contamination=contamination)
            model.fit(X)
        elif name == 'xgbod':
            model = XGBOD(contamination=contamination)
            model.fit(X, Y)
        else:
            raise ValueError('Unknown model name: {}'.format(name))

        save(model, model_dir, file_name)

    else:
        model = load(model_dir, file_name)

    return model
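
A hedged usage sketch for train_model; save and load are assumed to be project-specific persistence helpers defined elsewhere, and the random data below is purely illustrative:

import numpy as np

rng = np.random.RandomState(42)
X = rng.randn(200, 5)               # unlabeled training features
Y = np.zeros(200, dtype=int)        # labels are only consumed by the supervised 'xgbod' branch

# fit an Isolation Forest from scratch and persist it to ./model/iforest.pkl
model = train_model(X, Y, contamination=0.1, name='iforest')

# on a later run, reload the persisted model instead of refitting
model = train_model(X, Y, contamination=0.1, name='iforest', from_scratch=False)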
Example #3
def getOutlierOCSVM(dataset):
    '''
    @brief Runs the OCSVM algorithm on the dataset and obtains labels
    indicating whether each instance is an inlier (0) or an outlier (1)
    @param dataset Dataset on which to run the algorithm
    @return A list of labels: 0 means inlier, 1 means outlier
    '''
    # Initializing the model
    ocsvm = OCSVM()
    # Fits the data and obtains labels
    ocsvm.fit(dataset)
    # Return labels
    return ocsvm.labels_
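
A short usage sketch under the same convention (0 = inlier, 1 = outlier), again assuming OCSVM is imported from pyod.models.ocsvm; the synthetic dataset is illustrative:

import numpy as np

rng = np.random.RandomState(0)
dataset = np.vstack([rng.randn(95, 2),                  # dense inlier cloud
                     rng.uniform(5, 10, size=(5, 2))])  # a few distant points
labels = getOutlierOCSVM(dataset)
print('outliers flagged:', int(labels.sum()))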
Example #4
def print_accuracy(train_arr, test_arr, trader_id):
    if len(train_arr) == 0 or len(test_arr) == 0:
        return
    for i in range(len(train_arr)):
        l1 = len(train_arr[i])
        l2 = len(test_arr[i])
        if l1 == 0 or l2 == 0:
            continue
        train_data = np.array([train_arr[i]]).T
        test_data = np.array([test_arr[i]]).T
        clf = OCSVM()
        clf.fit(train_data)
        y_pred = clf.predict(train_data)
        print("TRAINING ACCURACY for TRADER", trader_id, ":",
              100 - (sum(y_pred) * 100 / l1))
        y_pred = clf.predict(test_data)
        print("TESTING ACCURACY: ", sum(y_pred) * 100 / l2)
Example #5
class TestOCSVM(unittest.TestCase):
    def setUp(self):
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=42)

        self.clf = OCSVM()
        self.clf.fit(self.X_train)

    def test_parameters(self):
        assert (hasattr(self.clf, 'decision_scores_')
                and self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_')
                and self.clf.threshold_ is not None)
        assert (hasattr(self.clf, '_mu') and self.clf._mu is not None)
        assert (hasattr(self.clf, '_sigma') and self.clf._sigma is not None)
        assert (hasattr(self.clf, 'support_')
                and self.clf.support_ is not None)
        assert (hasattr(self.clf, 'support_vectors_')
                and self.clf.support_vectors_ is not None)
        assert (hasattr(self.clf, 'dual_coef_')
                and self.clf.dual_coef_ is not None)
        assert (hasattr(self.clf, 'intercept_')
                and self.clf.intercept_ is not None)

        # only available for linear kernel
        # if not hasattr(self.clf, 'coef_') or self.clf.coef_ is None:
        #     self.assertRaises(AttributeError, 'coef_ is not set')

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test,
                                       self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3.5)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3.5)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass
Example #6
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data
    X_train, X_test, y_train, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train one_class_svm detector
    clf_name = 'OneClassSVM'
    clf = OCSVM()
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
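
For reference, this walkthrough assumes the detector and two pyod helpers are imported at module level, roughly as follows (paths as in pyod's standard layout):

from pyod.models.ocsvm import OCSVM
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print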
Example #7
    pred_test_set.append(1)
# print(len(train_set),len(test_set))
import numpy as np
train_set = np.array(train_set)
test_set = np.array(test_set)

from pyod.models.ocsvm import OCSVM
from pyod.models.pca import PCA
# from pyod.models.mcd import MCD

clf1 = PCA(standardization=True, contamination=0.2)
# clf1 = MCD(assume_centered = True)
clf2 = OCSVM(kernel='poly', nu=0.25, degree=2, contamination=0.2)
# clf2 = OCSVM(kernel = 'linear',nu =0.02)
clf1.fit(train_set)
clf2.fit(train_set)

y_pred_train_pca = clf1.predict(train_set)
y_pred_test_pca = clf1.predict(test_set)

y_pred_train_ocsvm = clf2.predict(train_set)
y_pred_test_ocsvm = clf2.predict(test_set)
print(clf1.explained_variance_)
# print(y_pred_test_pca,y_pred_test_ocsvm)
train_pca_correct = 0
train_ocsvm_correct = 0
print("TRAIN SET")
for i in range(len(pred_train_set)):
    # print("Actual:",pred_train_set[i],"PCA",y_pred_train_pca[i],"OCSVM",y_pred_train_ocsvm[i])
    if pred_train_set[i] == y_pred_train_pca[i] and pred_train_set[i] == 1:
        train_pca_correct += 1
Example #8
                      random_state=42)
    # load pretrained models
    prepare_trained_model()

    # recommended models
    selected_models = select_model(X_train, n_selection=100)

    print("Showing the top recommended models...")
    for i, model in enumerate(selected_models):
        print(i, model)

    print()

    model_1 = LODA(n_bins=5, n_random_cuts=100)
    print(
        "1st model Average Precision",
        average_precision_score(y_train,
                                model_1.fit(X_train).decision_scores_))

    model_10 = LODA(n_bins=5, n_random_cuts=20)
    print(
        "10th model Average Precision",
        average_precision_score(y_train,
                                model_10.fit(X_train).decision_scores_))

    model_50 = OCSVM(kernel='sigmoid', nu=0.6)
    print(
        "50th model Average Precision",
        average_precision_score(y_train,
                                model_50.fit(X_train).decision_scores_))
Example #9
class SolverVAECIFAR():
    def __init__(self,
                 data_name,
                 hidden_dim=256,
                 seed=0,
                 learning_rate=3e-4,
                 normal_class=0,
                 anomaly_ratio=0.1,
                 batch_size=128,
                 concentrated=0,
                 training_ratio=0.8,
                 SN=1,
                 Trim=1,
                 L=1.5,
                 max_epochs=100):
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if use_cuda else "cpu")
        self.L = L
        if concentrated == 1.0:
            full_data_name = 'CIFAR10_Concentrated'
        elif concentrated == 0.0:
            full_data_name = 'CIFAR10'
        else:
            raise ValueError('concentrated must be 0 or 1')
        self.result_path = "./results/{}_{}/0.0/OCSVM/{}/".format(
            full_data_name, normal_class, seed)
        data_path = "./data/" + data_name + ".npy"
        self.learning_rate = learning_rate
        self.SN = SN
        self.Trim = Trim
        # self.dataset = RealGraphDataset(data_path, missing_ratio=0, radius=2)
        self.dataset = CIFARVGGDataset(data_path,
                                       normal_class=normal_class,
                                       anomaly_ratio=anomaly_ratio,
                                       concentrated=concentrated)
        self.seed = seed
        self.hidden_dim = hidden_dim
        self.max_epochs = max_epochs

        self.data_path = data_path
        self.data_anomaly_ratio = self.dataset.__anomalyratio__()
        self.batch_size = batch_size
        self.input_dim = self.dataset.__dim__()
        self.data_normaly_ratio = 1 - self.data_anomaly_ratio
        n_sample = self.dataset.__len__()
        self.n_train = int(n_sample * training_ratio)
        self.n_test = n_sample - self.n_train
        print('|data dimension: {}|data noise ratio:{}'.format(
            self.dataset.__dim__(), self.data_anomaly_ratio))

        self.training_data, self.testing_data = data.random_split(
            dataset=self.dataset, lengths=[self.n_train, self.n_test])

        self.ae = None
        self.discriminator = None
        self.model = None

    def train(self):
        self.model = OCSVM()
        self.model.fit(self.training_data.dataset.x)

    def test(self):
        y_test_scores = self.model.decision_function(
            self.testing_data.dataset.x)
        auc = roc_auc_score(self.testing_data.dataset.y, y_test_scores)

        from sklearn.metrics import precision_recall_fscore_support as prf, accuracy_score

        print("AUC:{:0.4f}".format(auc))

        os.makedirs(self.result_path, exist_ok=True)

        np.save(
            self.result_path + "result.npy",
            {
                "accuracy": auc,
                "precision": auc,
                "recall": auc,
                "f1": auc,
                "auc": auc,
            },
        )  # for consistency
        print("result save to {}".format(self.result_path))
Example #10
class TestOCSVM(unittest.TestCase):
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = OCSVM()
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        assert_true(hasattr(self.clf, 'decision_scores_') and
                    self.clf.decision_scores_ is not None)
        assert_true(hasattr(self.clf, 'labels_') and
                    self.clf.labels_ is not None)
        assert_true(hasattr(self.clf, 'threshold_') and
                    self.clf.threshold_ is not None)
        assert_true(hasattr(self.clf, '_mu') and
                    self.clf._mu is not None)
        assert_true(hasattr(self.clf, '_sigma') and
                    self.clf._sigma is not None)
        assert_true(hasattr(self.clf, 'support_') and
                    self.clf.support_ is not None)
        assert_true(hasattr(self.clf, 'support_vectors_') and
                    self.clf.support_vectors_ is not None)
        assert_true(hasattr(self.clf, 'dual_coef_') and
                    self.clf.dual_coef_ is not None)
        assert_true(hasattr(self.clf, 'intercept_') and
                    self.clf.intercept_ is not None)

        # only available for linear kernel
        # if not hasattr(self.clf, 'coef_') or self.clf.coef_ is None:
        #     self.assertRaises(AttributeError, 'coef_ is not set')

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass
Example #11
class TestOCSVM(unittest.TestCase):
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination)

        self.clf = OCSVM()
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        if not hasattr(self.clf,
                       'decision_scores_') or self.clf.decision_scores_ is None:
            self.assertRaises(AttributeError, 'decision_scores_ is not set')
        if not hasattr(self.clf, 'labels_') or self.clf.labels_ is None:
            self.assertRaises(AttributeError, 'labels_ is not set')
        if not hasattr(self.clf, 'threshold_') or self.clf.threshold_ is None:
            self.assertRaises(AttributeError, 'threshold_ is not set')
        if not hasattr(self.clf,
                       'support_') or self.clf.support_ is None:
            self.assertRaises(AttributeError, 'support_ is not set')
        if not hasattr(self.clf,
                       'support_vectors_') or self.clf.support_vectors_ is None:
            self.assertRaises(AttributeError, 'support_vectors_ is not set')
        if not hasattr(self.clf, 'dual_coef_') or self.clf.dual_coef_ is None:
            self.assertRaises(AttributeError, 'dual_coef_ is not set')

        # only available for linear kernel
        # if not hasattr(self.clf, 'coef_') or self.clf.coef_ is None:
        #     self.assertRaises(AttributeError, 'coef_ is not set')
        if not hasattr(self.clf, 'intercept_') or self.clf.intercept_ is None:
            self.assertRaises(AttributeError, 'intercept_ is not set')

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_evaluate(self):
        self.clf.fit_predict_evaluate(self.X_test, self.y_test)

    def tearDown(self):
        pass
Example #12
 # clf12.fit(np.array([data_vol,data_price]).T)
 # print("TRADER",i,true1,count1,true1/count1,true2,count2,true2/count2,true3,count3,true3/count3)
 data_vol = []
 data_price = []
 data_vol_price = []
 mal_t_stamps1 = []
 mal_t_stamps2 = []
 mal_t_stamps12 = []
 clf1 = OCSVM()
 clf2 = OCSVM()
 clf12 = OCSVM()
 for j in d_transaction[i]:
     data_vol.append(j[2])
     data_price.append(j[1])
     data_vol_price.append([j[2], j[1]])
 clf1.fit(np.array([data_vol]).T)
 clf2.fit(np.array([data_price]).T)
 clf12.fit(np.array(data_vol_price))
 for j in d_transaction[i]:
     p1 = clf1.predict(np.array(j[2]).reshape(1, -1))
     p2 = clf2.predict(np.array(j[1]).reshape(1, -1))
     p3 = clf12.predict(np.array([j[2], j[1]]).T.reshape(1, -1))
     if p1 == 1:
         mal_t_stamps1.append(j[0])
     if p2 == 1:
         mal_t_stamps2.append(j[0])
     if p3 == 1:
         mal_t_stamps12.append(j[0])
 s = set(d_attack[i])
 print("TRADER", i, "VOL", len(s & set(mal_t_stamps1)), "OUT OF",
       len(mal_t_stamps1), "PRICE", len(s & set(mal_t_stamps2)), "OUT OF",
Example #13
def run_all_models(all_array, labels, pca, data_set_name):
    picture_name = all_array.get("# img", 1)
    all_array = all_array.drop("# img", axis=1)

    # standardizing data for processing
    all_array = standardizer(all_array)

    y = labels.get("in").to_numpy()
    x_train, x_test, y_train, y_test, picture_train, picture_test = train_test_split(all_array, y, picture_name,
                                                                                     test_size=0.4)

    if pca:
        transformer = IncrementalPCA()
        all_array = transformer.fit_transform(all_array)

    print("OCSVM")
    now = time()
    clf = OCSVM()
    clf.fit(x_train)
    test_scores = clf.decision_function(x_test)
    temp = print_score(picture_test, test_scores, y_test)
    train_scores = clf.decision_function(x_train)
    scores_train = print_score(picture_train, train_scores, y_train)
    output_table.append(("OCSVM", all_array.shape, temp, data_set_name, time() - now, scores_train))

    print("Auto-encoder")
    now = time()
    clf = AutoEncoder(epochs=30)
    clf.fit(x_train)
    test_scores = clf.decision_function(x_test)
    temp = print_score(picture_test, test_scores, y_test)
    train_scores = clf.decision_function(x_train)
    scores_train = print_score(picture_train, train_scores, y_train)
    output_table.append(("Auto-encoder", all_array.shape, temp, data_set_name, time() - now, scores_train))

    print("HBOS")
    now = time()
    clf = HBOS()
    clf.fit(x_train)
    test_scores = clf.decision_function(x_test)
    temp = print_score(picture_test, test_scores, y_test)
    train_scores = clf.decision_function(x_train)
    scores_train = print_score(picture_train, train_scores, y_train)
    output_table.append(("HBOS", all_array.shape, temp, data_set_name, time() - now, scores_train))

    print("SO_GAAL")
    now = time()
    clf = SO_GAAL()
    clf.fit(x_train)
    test_scores = clf.decision_function(x_test)
    temp = print_score(picture_test, test_scores, y_test)
    train_scores = clf.decision_function(x_train)
    scores_train = print_score(picture_train, train_scores, y_train)
    output_table.append(("SO_GAAL", all_array.shape, temp, data_set_name, time() - now, scores_train))

    print("MO_GAAL")
    now = time()
    clf = MO_GAAL()
    clf.fit(x_train)
    test_scores = clf.decision_function(x_test)
    temp = print_score(picture_test, test_scores, y_test)
    train_scores = clf.decision_function(x_train)
    scores_train = print_score(picture_train, train_scores, y_train)
    output_table.append(("MO_GAAL", all_array.shape, temp, data_set_name, time() - now, scores_train))

    print("MCD")
    now = time()
    clf = MCD()
    clf.fit(x_train)
    test_scores = clf.decision_function(x_test)
    temp = print_score(picture_test, test_scores, y_test)
    train_scores = clf.decision_function(x_train)
    scores_train = print_score(picture_train, train_scores, y_train)
    output_table.append(("MCD", all_array.shape, temp, data_set_name, time() - now, scores_train))

    print("SOS")
    now = time()
    clf = SOS()
    clf.fit(x_train)
    test_scores = clf.decision_function(x_test)
    temp = print_score(picture_test, test_scores, y_test)
    train_scores = clf.decision_function(x_train)
    scores_train = print_score(picture_train, train_scores, y_train)
    output_table.append(("SOS", all_array.shape, temp, data_set_name, time() - now, scores_train))

    print("IForest")
    now = time()
    clf = IForest()
    clf.fit(x_train)
    test_scores = clf.decision_function(x_test)
    temp = print_score(picture_test, test_scores, y_test)
    train_scores = clf.decision_function(x_train)
    scores_train = print_score(picture_train, train_scores, y_train)
    output_table.append(("IFrorest", all_array.shape, temp, data_set_name, time() - now, scores_train))

    print("KNN")
    now = time()
    clf = KNN()
    clf.fit(x_train)
    test_scores = clf.decision_function(x_test)
    temp = print_score(picture_test, test_scores, y_test)
    train_scores = clf.decision_function(x_train)
    scores_train = print_score(picture_train, train_scores, y_train)
    output_table.append(("KNN", all_array.shape, temp, data_set_name, time() - now, scores_train))

    print("PCA")
    now = time()
    clf = PCA()
    clf.fit(x_train)
    test_scores = clf.decision_function(x_test)
    temp = print_score(picture_test, test_scores, y_test)
    train_scores = clf.decision_function(x_train)
    scores_train = print_score(picture_train, train_scores, y_train)
    output_table.append(("PCA", all_array.shape, temp, data_set_name, time() - now, scores_train))
Example #14
def main():
    parser = argparse.ArgumentParser(description='baseline')
    register_data_args(parser)
    parser.add_argument("--mode",
                        type=str,
                        default='A',
                        choices=['A', 'AX', 'X'],
                        help="dropout probability")
    parser.add_argument("--seed",
                        type=int,
                        default=-1,
                        help="random seed, -1 means dont fix seed")
    parser.add_argument(
        "--emb-method",
        type=str,
        default='DeepWalk',
        help="embedding methods: DeepWalk, Node2Vec, LINE, SDNE, Struc2Vec")
    parser.add_argument("--ad-method",
                        type=str,
                        default='OCSVM',
                        help="embedding methods: PCA,OCSVM,IF,AE")
    args = parser.parse_args()

    if args.seed != -1:
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    logging.basicConfig(
        filename="./log/baseline.log",
        filemode="a",
        format="%(asctime)s-%(name)s-%(levelname)s-%(message)s",
        level=logging.INFO)
    logger = logging.getLogger('baseline')

    datadict = emb_dataloader(args)

    dur1 = 0  # embedding time; stays zero when raw features are used
    if args.mode == 'X':
        data = datadict['features']
        #print('X shape',data.shape)
    else:
        t0 = time.time()
        embeddings = embedding(args, datadict)
        dur1 = time.time() - t0

        if args.mode == 'A':
            data = embeddings
            #print('A shape',data.shape)
        if args.mode == 'AX':
            data = np.concatenate((embeddings, datadict['features']), axis=1)
            #print('AX shape',data.shape)

    logger.debug(f'data shape: {data.shape}')

    if args.ad_method == 'OCSVM':
        clf = OCSVM(contamination=0.1)
    if args.ad_method == 'IF':
        clf = IForest(n_estimators=100,
                      contamination=0.1,
                      n_jobs=-1,
                      behaviour="new")
    if args.ad_method == 'PCA':
        clf = PCA(contamination=0.1)
    if args.ad_method == 'AE':
        clf = AutoEncoder(contamination=0.1)

    t1 = time.time()
    clf.fit(data[datadict['train_mask']])
    dur2 = time.time() - t1

    print('training time:', dur1 + dur2)

    logger.info('\n')
    logger.info('\n')
    logger.info(
        f'Parameters dataset:{args.dataset} datamode:{args.mode} ad-method:{args.ad_method} emb-method:{args.emb_method}'
    )
    logger.info('-------------Evaluating Validation Results--------------')

    t2 = time.time()
    y_pred_val = clf.predict(data[datadict['val_mask']])
    y_score_val = clf.decision_function(data[datadict['val_mask']])
    auc, ap, f1, acc, precision, recall = baseline_evaluate(datadict,
                                                            y_pred_val,
                                                            y_score_val,
                                                            val=True)
    dur3 = time.time() - t2
    print('infer time:', dur3)

    logger.info(f'AUC:{round(auc,4)},AP:{round(ap,4)}')
    logger.info(
        f'f1:{round(f1,4)},acc:{round(acc,4)},pre:{round(precision,4)},recall:{round(recall,4)}'
    )

    logger.info('-------------Evaluating Test Results--------------')
    y_pred_test = clf.predict(data[datadict['test_mask']])
    y_score_test = clf.decision_function(data[datadict['test_mask']])
    auc, ap, f1, acc, precision, recall = baseline_evaluate(datadict,
                                                            y_pred_test,
                                                            y_score_test,
                                                            val=False)
    logger.info(f'AUC:{round(auc,4)},AP:{round(ap,4)}')
    logger.info(
        f'f1:{round(f1,4)},acc:{round(acc,4)},pre:{round(precision,4)},recall:{round(recall,4)}'
    )