class TestOCSVM(unittest.TestCase):
    """Unit tests for the pyod OCSVM outlier detector."""

    def setUp(self):
        # Synthetic train/test split with a fixed seed so results repeat.
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = OCSVM()
        self.clf.fit(self.X_train)

    def test_parameters(self):
        # Every attribute set by fit() must exist and be populated.
        fitted_attrs = ('decision_scores_', 'labels_', 'threshold_',
                        '_mu', '_sigma', 'support_', 'support_vectors_',
                        'dual_coef_', 'intercept_')
        for attr in fitted_attrs:
            assert (hasattr(self.clf, attr) and
                    getattr(self.clf, attr) is not None)
        # 'coef_' is only available for a linear kernel, so it is not checked:
        # if not hasattr(self.clf, 'coef_') or self.clf.coef_ is None:
        #     self.assertRaises(AttributeError, 'coef_ is not set')

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        scores = self.clf.decision_function(self.X_test)
        # one score per test sample
        assert_equal(scores.shape[0], self.X_test.shape[0])
        # detector must beat the ROC floor on this easy data
        assert_greater(roc_auc_score(self.y_test, scores), self.roc_floor)

    def test_prediction_labels(self):
        labels = self.clf.predict(self.X_test)
        assert_equal(labels.shape, self.y_test.shape)

    def test_prediction_proba_linear(self):
        proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(proba.min(), 0)
        assert_less_equal(proba.max(), 1)

    def test_prediction_proba_unify(self):
        proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(proba.min(), 0)
        assert_less_equal(proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        labels = self.clf.fit_predict(self.X_train)
        assert_equal(labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test)

        # rank order must mirror raw-score order
        assert_allclose(rankdata(ranks), rankdata(scores), atol=3.5)
        assert_array_less(ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, ranks)

    def test_predict_rank_normalized(self):
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # rank order must mirror raw-score order
        assert_allclose(rankdata(ranks), rankdata(scores), atol=3.5)
        assert_array_less(ranks, 1.01)
        assert_array_less(-0.1, ranks)

    def tearDown(self):
        pass
class SolverVAECIFAR():
    """OCSVM baseline solver for CIFAR10 (VGG-feature) anomaly detection.

    Fits a one-class SVM on the training data and reports ROC-AUC on the
    test data; the result is saved under ``self.result_path``.
    """

    def __init__(self, data_name, hidden_dim=256, seed=0, learning_rate=3e-4,
                 normal_class=0, anomaly_ratio=0.1, batch_size=128,
                 concentrated=0, training_ratio=0.8, SN=1, Trim=1, L=1.5,
                 max_epochs=100):
        """Load the CIFAR VGG-feature dataset and prepare the train/test split.

        Raises:
            ValueError: if ``concentrated`` is not 0 or 1.
        """
        # Fix all RNG seeds for reproducibility.
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if use_cuda else "cpu")
        self.L = L

        if concentrated == 1.0:
            full_data_name = 'CIFAR10_Concentrated'
        elif concentrated == 0.0:
            full_data_name = 'CIFAR10'
        else:
            # Any other value previously left full_data_name unbound and
            # crashed later with a NameError; fail fast instead.
            raise ValueError(
                "concentrated must be 0 or 1, got {!r}".format(concentrated))
        self.result_path = "./results/{}_{}/0.0/OCSVM/{}/".format(
            full_data_name, normal_class, seed)

        data_path = "./data/" + data_name + ".npy"
        self.learning_rate = learning_rate
        self.SN = SN
        self.Trim = Trim
        # self.dataset = RealGraphDataset(data_path, missing_ratio=0, radius=2)
        self.dataset = CIFARVGGDataset(data_path,
                                       normal_class=normal_class,
                                       anomaly_ratio=anomaly_ratio,
                                       concentrated=concentrated)
        self.seed = seed
        self.hidden_dim = hidden_dim
        self.max_epochs = max_epochs
        self.data_path = data_path
        self.data_anomaly_ratio = self.dataset.__anomalyratio__()
        self.batch_size = batch_size
        self.input_dim = self.dataset.__dim__()
        self.data_normaly_ratio = 1 - self.data_anomaly_ratio

        n_sample = self.dataset.__len__()
        self.n_train = int(n_sample * training_ratio)
        self.n_test = n_sample - self.n_train
        print('|data dimension: {}|data noise ratio:{}'.format(
            self.dataset.__dim__(), self.data_anomaly_ratio))

        self.training_data, self.testing_data = data.random_split(
            dataset=self.dataset, lengths=[self.n_train, self.n_test])

        self.ae = None
        self.discriminator = None
        self.model = None

    def train(self):
        """Fit a one-class SVM on the feature matrix."""
        # NOTE(review): Subset.dataset is the FULL dataset, so the random
        # split above is effectively ignored here — confirm this is intended.
        self.model = OCSVM()
        self.model.fit(self.training_data.dataset.x)

    def test(self):
        """Score the data, print ROC-AUC, and persist the result to disk."""
        y_test_scores = self.model.decision_function(
            self.testing_data.dataset.x)
        auc = roc_auc_score(self.testing_data.dataset.y, y_test_scores)
        print("AUC:{:0.4f}".format(auc))

        os.makedirs(self.result_path, exist_ok=True)
        # Every metric slot is filled with the AUC so the saved file keeps
        # the same schema as the other solvers.
        np.save(
            self.result_path + "result.npy",
            {
                "accuracy": auc,
                "precision": auc,
                "recall": auc,
                "f1": auc,
                "auc": auc,
            },
        )  # for consistency
        print("result save to {}".format(self.result_path))
n_features=2,
    contamination=contamination,  # percentage of outliers
    random_state=42)

# train one_class_svm detector
clf_name = 'OneClassSVM'
clf = OCSVM()
clf.fit(X_train)

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores

# evaluate and print the results
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)

# visualize the results
visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
          y_test_pred,
class TestOCSVM(unittest.TestCase):
    """Unit tests for the pyod OCSVM outlier detector (small fixture)."""

    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = OCSVM()
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        # Plain asserts replace the deprecated nose-style assert_true
        # (removed from sklearn.utils.testing), matching the style used
        # by the sibling OCSVM test suite.
        assert (hasattr(self.clf, 'decision_scores_') and
                self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and
                self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_') and
                self.clf.threshold_ is not None)
        assert (hasattr(self.clf, '_mu') and
                self.clf._mu is not None)
        assert (hasattr(self.clf, '_sigma') and
                self.clf._sigma is not None)
        assert (hasattr(self.clf, 'support_') and
                self.clf.support_ is not None)
        assert (hasattr(self.clf, 'support_vectors_') and
                self.clf.support_vectors_ is not None)
        assert (hasattr(self.clf, 'dual_coef_') and
                self.clf.dual_coef_ is not None)
        assert (hasattr(self.clf, 'intercept_') and
                self.clf.intercept_ is not None)
        # only available for linear kernel
        # if not hasattr(self.clf, 'coef_') or self.clf.coef_ is None:
        #     self.assertRaises(AttributeError, 'coef_ is not set')

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores),
                       self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass
class TestOCSVM(unittest.TestCase):
    """Unit tests for the pyod OCSVM outlier detector."""

    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination)

        self.clf = OCSVM()
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        # BUG FIX: the original called self.assertRaises(AttributeError, '<msg>')
        # with a string instead of a callable, so the checks never verified
        # anything.  Assert the fitted attributes directly instead.
        assert (hasattr(self.clf, 'decision_scores_') and
                self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and
                self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_') and
                self.clf.threshold_ is not None)
        assert (hasattr(self.clf, 'support_') and
                self.clf.support_ is not None)
        assert (hasattr(self.clf, 'support_vectors_') and
                self.clf.support_vectors_ is not None)
        assert (hasattr(self.clf, 'dual_coef_') and
                self.clf.dual_coef_ is not None)
        # only available for linear kernel
        # if not hasattr(self.clf, 'coef_') or self.clf.coef_ is None:
        #     self.assertRaises(AttributeError, 'coef_ is not set')
        assert (hasattr(self.clf, 'intercept_') and
                self.clf.intercept_ is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores),
                       self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_evaluate(self):
        self.clf.fit_predict_evaluate(self.X_test, self.y_test)

    def tearDown(self):
        pass
#划分测试集和训练集 X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33) #使用pyod中的OCSVM算法拟合数据 clf_name = 'OCSVM' clf = OCSVM() clf.fit(X_train) #预测得到由0和1组成的数组,1表示离群点,0表示飞离群点 y_train_pred = clf.labels_ # binary labels (0: inliers, 1: outliers) y_train_scores = clf.decision_scores_ # raw outlier scores,The outlier scores of the training data. #预测样本是不是离群点,返回0和1 的数组 y_test_pred = clf.predict(X_test) y_test_scores = clf.decision_function( X_test) # outlier scores,The anomaly score of the input samples. #使用sklearn中的roc_auc_score方法得到auc值,即roc曲线下面的面积 try: sumAuc_train += sklearn.metrics.roc_auc_score(y_train, y_train_scores, average='macro') sumAuc_test += sklearn.metrics.roc_auc_score(y_test, y_test_scores, average='macro') #s=precision_score(y_train, y_train_scores, average='macro') i += 1 print(sumAuc_train, sumAuc_test) except ValueError: print('1') pass
def _benchmark_model(name, make_clf, shape, data_set_name,
                     x_train, x_test, y_train, y_test,
                     picture_train, picture_test):
    """Fit one detector, score both splits, and append a row to output_table.

    The timing window matches the original code: it starts right before the
    detector is constructed and ends after both splits are scored.
    """
    print(name)
    now = time()
    clf = make_clf()
    clf.fit(x_train)
    test_scores = clf.decision_function(x_test)
    test_result = print_score(picture_test, test_scores, y_test)
    train_scores = clf.decision_function(x_train)
    train_result = print_score(picture_train, train_scores, y_train)
    output_table.append((name, shape, test_result, data_set_name,
                         time() - now, train_result))


def run_all_models(all_array, labels, pca, data_set_name):
    """Benchmark a suite of pyod detectors on one data set.

    Parameters:
        all_array: feature DataFrame including a "# img" column.
        labels: DataFrame whose "in" column holds the ground-truth labels.
        pca: if truthy, additionally fit an IncrementalPCA on the features.
        data_set_name: tag recorded with each output_table row.
    """
    picture_name = all_array.get("# img", 1)
    all_array = all_array.drop("# img", 1)
    # standardizing data for processing
    all_array = standardizer(all_array)
    y = labels.get("in").to_numpy()
    x_train, x_test, y_train, y_test, picture_train, picture_test = \
        train_test_split(all_array, y, picture_name, test_size=0.4)
    if pca:
        # NOTE(review): the projection happens AFTER the split, so only
        # all_array.shape (recorded below) changes — the detectors still
        # train on the unprojected features.  Confirm this is intended.
        transformer = IncrementalPCA()
        all_array = transformer.fit_transform(all_array)

    # Each entry is (display name, zero-arg factory).  The former copy-pasted
    # per-model blocks collapsed into this table; the IForest row previously
    # logged the typo "IFrorest", fixed here.
    models = [
        ("OCSVM", OCSVM),
        ("Auto-encoder", lambda: AutoEncoder(epochs=30)),
        ("HBOS", HBOS),
        ("SO_GAAL", SO_GAAL),
        ("MO_GAAL", MO_GAAL),
        ("MCD", MCD),
        ("SOS", SOS),
        ("IForest", IForest),
        ("KNN", KNN),
        ("PCA", PCA),
    ]
    for name, make_clf in models:
        _benchmark_model(name, make_clf, all_array.shape, data_set_name,
                         x_train, x_test, y_train, y_test,
                         picture_train, picture_test)
def main():
    """Run the embedding + shallow-detector anomaly-detection baseline.

    Parses CLI arguments, builds the node representation (raw features,
    embeddings, or their concatenation), fits the chosen detector on the
    train mask, and logs validation/test metrics.
    """
    parser = argparse.ArgumentParser(description='baseline')
    register_data_args(parser)
    parser.add_argument("--mode", type=str, default='A',
                        choices=['A', 'AX', 'X'],
                        help="dropout probability")
    parser.add_argument("--seed", type=int, default=-1,
                        help="random seed, -1 means dont fix seed")
    parser.add_argument(
        "--emb-method", type=str, default='DeepWalk',
        help="embedding methods: DeepWalk, Node2Vec, LINE, SDNE, Struc2Vec")
    parser.add_argument("--ad-method", type=str, default='OCSVM',
                        help="embedding methods: PCA,OCSVM,IF,AE")
    args = parser.parse_args()

    if args.seed != -1:
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    logging.basicConfig(
        filename="./log/baseline.log", filemode="a",
        format="%(asctime)s-%(name)s-%(levelname)s-%(message)s",
        level=logging.INFO)
    logger = logging.getLogger('baseline')

    datadict = emb_dataloader(args)

    # BUG FIX: dur1 was only assigned in the embedding branch, so mode 'X'
    # crashed with a NameError at the training-time print below.
    dur1 = 0.0
    if args.mode == 'X':
        data = datadict['features']
        # print('X shape', data.shape)
    else:
        t0 = time.time()
        embeddings = embedding(args, datadict)
        dur1 = time.time() - t0
        if args.mode == 'A':
            data = embeddings
            # print('A shape', data.shape)
        if args.mode == 'AX':
            data = np.concatenate((embeddings, datadict['features']), axis=1)
            # print('AX shape', data.shape)

    logger.debug(f'data shape: {data.shape}')

    if args.ad_method == 'OCSVM':
        clf = OCSVM(contamination=0.1)
    elif args.ad_method == 'IF':
        clf = IForest(n_estimators=100, contamination=0.1, n_jobs=-1,
                      behaviour="new")
    elif args.ad_method == 'PCA':
        clf = PCA(contamination=0.1)
    elif args.ad_method == 'AE':
        clf = AutoEncoder(contamination=0.1)
    else:
        # Previously an unknown method left clf unbound (NameError later).
        raise ValueError(f'unknown ad-method: {args.ad_method}')

    t1 = time.time()
    clf.fit(data[datadict['train_mask']])
    dur2 = time.time() - t1
    print('training time:', dur1 + dur2)

    logger.info('\n')
    logger.info('\n')
    logger.info(
        f'Parameters dataset:{args.dataset} datamode:{args.mode} ad-method:{args.ad_method} emb-method:{args.emb_method}'
    )

    logger.info('-------------Evaluating Validation Results--------------')
    t2 = time.time()
    y_pred_val = clf.predict(data[datadict['val_mask']])
    y_score_val = clf.decision_function(data[datadict['val_mask']])
    auc, ap, f1, acc, precision, recall = baseline_evaluate(
        datadict, y_pred_val, y_score_val, val=True)
    dur3 = time.time() - t2
    print('infer time:', dur3)
    logger.info(f'AUC:{round(auc,4)},AP:{round(ap,4)}')
    logger.info(
        f'f1:{round(f1,4)},acc:{round(acc,4)},pre:{round(precision,4)},recall:{round(recall,4)}'
    )

    logger.info('-------------Evaluating Test Results--------------')
    y_pred_test = clf.predict(data[datadict['test_mask']])
    y_score_test = clf.decision_function(data[datadict['test_mask']])
    auc, ap, f1, acc, precision, recall = baseline_evaluate(
        datadict, y_pred_test, y_score_test, val=False)
    logger.info(f'AUC:{round(auc,4)},AP:{round(ap,4)}')
    logger.info(
        f'f1:{round(f1,4)},acc:{round(acc,4)},pre:{round(precision,4)},recall:{round(recall,4)}'
    )