Example #1
def train_and_save(args, data):
    rauc = np.zeros(args.runs)
    ap = np.zeros(args.runs)
    if args.mode == "unsupervised":
        real_ano = 0
    elif args.mode == "semi_supervised":
        real_ano = int(args.known_outliers / 2)
    elif args.mode == "supervised":
        real_ano = args.known_outliers
    print("Mode :", args.mode, "Total outliers :", args.known_outliers,
          "Real outliers :", real_ano)
    for i in range(args.runs):
        x_train, y_train = get_train(data, args)
        devnet = DevNet(**args.__dict__)
        devnet.fit(x_train, y_train)
        scores = devnet.decision_function(data.X_val)
        rauc[i], ap[i] = aucPerformance(scores, data.Y_val)

    mean_aucpr = np.mean(ap)
    std_aucpr = np.std(ap)
    df = pd.DataFrame.from_dict(args.__dict__, orient="index").T
    df["real_anomaly"] = real_ano
    df["mean_auc"] = mean_aucpr
    df["std_auc"] = std_aucpr
    try:
        pd.read_csv(args.output)
        df.to_csv(args.output, mode="a", index=False, header=False)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # First write: create the file and include the header row.
        df.to_csv(args.output, mode="a", index=False)
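Every example on this page calls an `aucPerformance` helper that none of the snippets define. A minimal sketch of what it plausibly wraps, assuming scikit-learn's metrics and the (scores, labels) argument order used in Example #1 (later examples pass the arguments in other orders, so treat this as illustrative only):

from sklearn.metrics import average_precision_score, roc_auc_score


def aucPerformance(scores, labels):
    # Hypothetical stand-in: ROC-AUC plus average precision (AUC-PR).
    roc_auc = roc_auc_score(labels, scores)
    ap = average_precision_score(labels, scores)
    print("AUC-ROC: %.4f, AUC-PR: %.4f" % (roc_auc, ap))
    return roc_auc, ap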
Example #2
def evaluating(self, x_test, y_test):
    net = deepcopy(self.net)
    y_pred = net(x_test)
    y_test = y_test.detach().cpu().numpy()
    y_pred = y_pred.detach().cpu().numpy()
    auc_roc, auc_pr, ap = aucPerformance(y_test, y_pred)
    print(auc_roc)
    del net
    return auc_roc, auc_pr, ap
Example #3
def train_and_save(args, data):
    rauc = np.zeros(args.runs)
    ap = np.zeros(args.runs)

    print("Mode :", args.mode, "Rep dim :", args.rep_dim, "L2 weight :",
          args.weight_decay)
    for i in range(args.runs):
        d_svdd = DSVDD(**args.__dict__)
        d_svdd.fit(data.X_train, verbose=False)
        scores = d_svdd.decision_function(data.X_val)
        rauc[i], ap[i] = aucPerformance(scores, data.Y_val)

    mean_aucpr = np.mean(ap)
    std_aucpr = np.std(ap)
    df = pd.DataFrame.from_dict(args.__dict__, orient="index").T
    df["mean_auc"] = mean_aucpr
    df["std_auc"] = std_aucpr
    try:
        pd.read_csv(args.output)
        df.to_csv(args.output, mode="a", index=False, header=False)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # First write: create the file and include the header row.
        df.to_csv(args.output, mode="a", index=False)
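Both train_and_save variants read the whole CSV back just to decide whether the header row has already been written. A simpler equivalent, sketched as a hypothetical append_row helper built on a plain existence check:

import os

import pandas as pd


def append_row(df, path):
    # Write the header only when the file does not exist yet.
    df.to_csv(path, mode="a", index=False, header=not os.path.exists(path))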
Example #4
def run_t(args):

    names = ['x_train_1w_50percent']
    network_depth = int(args.network_depth)
    random_seed = args.ramdn_seed
    for nm in names:
        runs = args.runs
        rauc = np.zeros(runs)
        ap = np.zeros(runs)
        filename = nm.strip()

        x, labels = dataLoading_np(args.input_path + filename + ".npy")
        outlier_indices = np.where(labels == 1)[0]
        outliers = x[outlier_indices]
        n_outliers_org = outliers.shape[0]

        train_time = 0
        test_time = 0
        for i in np.arange(runs):
            x_train, x_test, y_train, y_test = train_test_split(
                x, labels, test_size=0.2, random_state=42, stratify=labels)
            print('x_train', x_train.shape, type(x_train))
            print('y_train', y_train.shape, type(y_train))
            print('x_test', x_test.shape, type(x_test))
            print('y_test', y_test.shape, type(y_test))

            y_train = np.array(y_train)
            y_test = np.array(y_test)
            print(filename + ': round ' + str(i))
            outlier_indices = np.where(y_train == 1)[0]
            inlier_indices = np.where(y_train == 0)[0]
            n_outliers = len(outlier_indices)
            print("Original training size: %d, No. outliers: %d" %
                  (x_train.shape[0], n_outliers))
            rng = np.random.RandomState(random_seed)
            if n_outliers > args.known_outliers:
                mn = n_outliers - args.known_outliers
                remove_idx = rng.choice(outlier_indices, mn, replace=False)
                x_train = np.delete(x_train, remove_idx, axis=0)
                y_train = np.delete(y_train, remove_idx, axis=0)

            outlier_indices = np.where(y_train == 1)[0]
            inlier_indices = np.where(y_train == 0)[0]

            print('training samples num:', y_train.shape[0], 'outlier num:',
                  outlier_indices.shape[0], 'inlier num:',
                  inlier_indices.shape[0])
            n_samples_trn = x_train.shape[0]
            n_outliers = len(outlier_indices)
            print("Training data size: %d, No. outliers: %d" %
                  (x_train.shape[0], n_outliers))

            start_time = time.time()
            input_shape = x_train.shape[1:]
            epochs = args.epochs
            batch_size = args.batch_size
            nb_batch = args.nb_batch

            model = neural_network(input_shape, network_depth)
            model.summary()
            model_filename = filename + "_" + str(
                args.batch_size) + "bs_" + str(
                    args.known_outliers) + "ko_" + str(network_depth) + "d.h5"
            model_name = os.path.join('../model/train_', model_filename)
            checkpointer = ModelCheckpoint(model_name,
                                           monitor='loss',
                                           verbose=0,
                                           save_best_only=True,
                                           save_weights_only=True)
            model.fit_generator(batch_generator_sup(x_train, outlier_indices,
                                                    inlier_indices, batch_size,
                                                    nb_batch, rng),
                                steps_per_epoch=nb_batch,
                                epochs=epochs,
                                callbacks=[checkpointer])

            train_time += time.time() - start_time
            start_time = time.time()
            scores = load_model_weight_predict(model_name, input_shape,
                                               network_depth, x_test)
            test_time += time.time() - start_time
            print(scores.shape)
            rauc[i], ap[i] = aucPerformance(scores, y_test)
            predic_class = (scores > 0.5).astype(int)
            precision_new = precision_score(y_test, predic_class)
            print('new precision', precision_new)
            recall_new = recall_score(y_test, predic_class)
            print('new recall', recall_new)
            f1_new = 2 * ((precision_new * recall_new) /
                          (precision_new + recall_new))
            print('f1 new', f1_new)

            fig3 = plt.figure()
            plt.plot(model.history.history['loss'])
            plt.title('model loss')
            plt.ylabel('loss')
            plt.xlabel('epoch')
            plt.legend(['train'], loc='upper left')
            plt.show()
            fig3.savefig('my_figure3.png')

        mean_auc = np.mean(rauc)
        std_auc = np.std(rauc)
        mean_aucpr = np.mean(ap)
        std_aucpr = np.std(ap)
        train_time = train_time / runs
        test_time = test_time / runs
        print("average AUC-ROC: %.4f, average AUC-PR: %.4f" %
              (mean_auc, mean_aucpr))
        print("average runtime: %.4f seconds" % (train_time + test_time))
        architecture = '2 hidlstm(64)+1dense(32)+ 1dense(1)'
        losses = 'binary-cross-entropy'
        max_precision = 0
        max_recall = 0
        f_one = 0
        writeResults(filename + '_' + str(network_depth),
                     losses,
                     x.shape[0],
                     x.shape[1],
                     n_samples_trn,
                     n_outliers_org,
                     n_outliers,
                     network_depth,
                     mean_auc,
                     mean_aucpr,
                     std_auc,
                     std_aucpr,
                     train_time,
                     test_time,
                     architecture,
                     epochs,
                     batch_size,
                     nb_batch,
                     precision_new,
                     recall_new,
                     f1_new,
                     max_precision,
                     max_recall,
                     f_one,
                     path=args.output)
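batch_generator_sup is used here and again in Example #6 but never shown. In the DevNet reference code it yields class-balanced batches, half drawn from the labelled outliers and half from the inliers; a minimal sketch under that assumption (the original's sampling details may differ):

import numpy as np


def batch_generator_sup(x, outlier_indices, inlier_indices,
                        batch_size, nb_batch, rng):
    # Endlessly yield (x_batch, y_batch) with a 50/50 class mix;
    # nb_batch is accepted for signature compatibility, since Keras
    # drives the per-epoch count through steps_per_epoch.
    n_out = batch_size // 2
    n_in = batch_size - n_out
    while True:
        # Outliers are rare, so sample them with replacement; this
        # assumes at least n_in labelled inliers are available.
        out_idx = rng.choice(outlier_indices, n_out, replace=True)
        in_idx = rng.choice(inlier_indices, n_in, replace=False)
        idx = np.concatenate([out_idx, in_idx])
        y_batch = np.concatenate([np.ones(n_out), np.zeros(n_in)])
        perm = rng.permutation(batch_size)  # interleave the two classes
        yield x[idx][perm], y_batch[perm]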
Example #5
import warnings

import numpy as np
from sklearn.metrics import precision_score, recall_score

model_path = '/content/gdrive/My Drive/devnet/DevNet/DevNet-master copy 2/model/devnet_/x_train_1w_50percent_512bs_943ko_1d.h5'
network_depth = 1
input_shape = [1, 298]

x_test = np.load(
    '/content/gdrive/My Drive/devnet/DevNet/DevNet-master copy 2/dataset/x_test_1w_50percent.npy'
)

y_test = np.load(
    '/content/gdrive/My Drive/devnet/DevNet/DevNet-master copy 2/dataset/y_test_1w_50percent.npy'
)

scores = load_model_weight_predict(model_path,
                                   input_shape=input_shape,
                                   network_depth=network_depth,
                                   x_test=x_test)

predic_class = (scores > 0.5).astype(int)
precision_new = precision_score(y_test, predic_class)
print('precision', precision_new)
recall_new = recall_score(y_test, predic_class)
print('recall', recall_new)
f1_new = 2 * ((precision_new * recall_new) / (precision_new + recall_new))
print('f1', f1_new)

AUC_ROC, AUC_PR = aucPerformance(scores, y_test)
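The hand-rolled F1 above divides by zero when precision and recall are both zero; scikit-learn's combined metric avoids that edge case. An equivalent sketch, reusing y_test and predic_class from the script above:

from sklearn.metrics import precision_recall_fscore_support

precision, recall, f1, _ = precision_recall_fscore_support(
    y_test, predic_class, average='binary', zero_division=0)
print('precision %.4f, recall %.4f, f1 %.4f' % (precision, recall, f1))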
Example #6
def run_devnet(args):
    names = args.data_set.split(',')
    names = ['creditcard_train']  # hard-coded override of args.data_set
    network_depth = int(args.network_depth)
    random_seed = args.ramdn_seed
    for nm in names:
        runs = args.runs
        rauc = np.zeros(runs)
        ap = np.zeros(runs)
        filename = nm.strip()
        global data_format
        data_format = int(args.data_format)
        if data_format == 0:
            x, labels = dataLoading(args.input_path + filename + ".csv")
        else:
            x, labels = get_data_from_svmlight_file(args.input_path +
                                                    filename + ".svm")
            x = x.tocsr()
        outlier_indices = np.where(labels == 1)[0]
        outliers = x[outlier_indices]
        n_outliers_org = outliers.shape[0]

        train_time = 0
        test_time = 0
        for i in np.arange(runs):
            x_train, x_test, y_train, y_test = train_test_split(
                x, labels, test_size=0.2, random_state=42, stratify=labels)
            y_train = np.array(y_train)
            y_test = np.array(y_test)
            print(filename + ': round ' + str(i))
            outlier_indices = np.where(y_train == 1)[0]
            inlier_indices = np.where(y_train == 0)[0]
            n_outliers = len(outlier_indices)
            print("Original training size: %d, No. outliers: %d" %
                  (x_train.shape[0], n_outliers))

            # Noise count that yields the target contamination rate.
            n_noise = int(len(inlier_indices) * args.cont_rate /
                          (1. - args.cont_rate))

            rng = np.random.RandomState(random_seed)
            if data_format == 0:
                if n_outliers > args.known_outliers:
                    mn = n_outliers - args.known_outliers
                    remove_idx = rng.choice(outlier_indices, mn, replace=False)
                    x_train = np.delete(x_train, remove_idx, axis=0)
                    y_train = np.delete(y_train, remove_idx, axis=0)

                noises = inject_noise(outliers, n_noise, random_seed)
                x_train = np.append(x_train, noises, axis=0)
                y_train = np.append(y_train, np.zeros((noises.shape[0], 1)))

            else:
                if n_outliers > args.known_outliers:
                    mn = n_outliers - args.known_outliers
                    remove_idx = rng.choice(outlier_indices, mn, replace=False)
                    retain_idx = np.setdiff1d(np.arange(x_train.shape[0]),
                                              remove_idx)
                    x_train = x_train[retain_idx]
                    y_train = y_train[retain_idx]

                noises = inject_noise_sparse(outliers, n_noise, random_seed)
                x_train = vstack([x_train, noises])
                y_train = np.append(y_train, np.zeros((noises.shape[0], 1)))

            outlier_indices = np.where(y_train == 1)[0]
            inlier_indices = np.where(y_train == 0)[0]
            print('training samples num:', y_train.shape[0], 'outlier num:',
                  outlier_indices.shape[0], 'inlier num:',
                  inlier_indices.shape[0], 'noise num:', n_noise)
            n_samples_trn = x_train.shape[0]
            n_outliers = len(outlier_indices)
            print("Training data size: %d, No. outliers: %d" %
                  (x_train.shape[0], n_outliers))

            start_time = time.time()
            input_shape = x_train.shape[1:]
            epochs = args.epochs
            batch_size = args.batch_size
            nb_batch = args.nb_batch

            model = deviation_network(input_shape, network_depth)
            model.summary()
            model_name = "./model/devnet_" + filename + "_" + str(
                args.cont_rate) + "cr_" + str(args.batch_size) + "bs_" + str(
                    args.known_outliers) + "ko_" + str(network_depth) + "d.h5"
            checkpointer = ModelCheckpoint(model_name,
                                           monitor='loss',
                                           verbose=0,
                                           save_best_only=True,
                                           save_weights_only=True)

            model.fit_generator(batch_generator_sup(x_train, outlier_indices,
                                                    inlier_indices, batch_size,
                                                    nb_batch, rng),
                                steps_per_epoch=nb_batch,
                                epochs=epochs,
                                callbacks=[checkpointer])
            train_time += time.time() - start_time

            start_time = time.time()
            scores = load_model_weight_predict(model_name, input_shape,
                                               network_depth, x_test)
            test_time += time.time() - start_time
            rauc[i], ap[i] = aucPerformance(scores, y_test)

        mean_auc = np.mean(rauc)
        std_auc = np.std(rauc)
        mean_aucpr = np.mean(ap)
        std_aucpr = np.std(ap)
        train_time = train_time / runs
        test_time = test_time / runs
        print("average AUC-ROC: %.4f, average AUC-PR: %.4f" %
              (mean_auc, mean_aucpr))
        print("average runtime: %.4f seconds" % (train_time + test_time))
        writeResults(filename + '_' + str(network_depth),
                     x.shape[0],
                     x.shape[1],
                     n_samples_trn,
                     n_outliers_org,
                     n_outliers,
                     network_depth,
                     mean_auc,
                     mean_aucpr,
                     std_auc,
                     std_aucpr,
                     train_time,
                     test_time,
                     path=args.output)
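inject_noise fabricates the contaminating "noise" points appended to the training set. The DevNet reference implementation builds each one by copying a randomly chosen outlier and overwriting a small random subset of its features with values from a second outlier; a dense-array sketch along those lines (the 5% swap ratio is an assumption):

import numpy as np


def inject_noise(seed, n_out, random_seed, swap_ratio=0.05):
    # Create n_out noisy points by feature-swapping pairs of outliers.
    rng = np.random.RandomState(random_seed)
    n_sample, dim = seed.shape
    n_swap_feat = max(1, int(swap_ratio * dim))
    noise = np.empty((n_out, dim))
    for i in range(n_out):
        o1, o2 = seed[rng.choice(n_sample, 2, replace=False)]
        swap_feats = rng.choice(dim, n_swap_feat, replace=False)
        noise[i] = o1.copy()
        noise[i, swap_feats] = o2[swap_feats]
    return noise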
Example #7
def main():

    torch.manual_seed(666)
    torch.cuda.manual_seed_all(666)
    np.random.seed(666)

    # data_name = 'BlogCatalog'

    print(args)
    nb_epochs = 50
    nb_runs = 16
    nb_try = 16
    nb_batch_maml = 10
    nb_batch = 32
    lr_1 = 0.03
    lr_s = lr_1 * args.task_num
    tr = 0.6
    # features, labels, idx_train, y_train, idx_val, y_val, idx_labeled, train_unlabeled = SGC_process(data_name, degree=2, l_ratio=0.08, tr_ratio=0.8)
    # print(args.task_num)

    aucfile = 'results/auc_' + datetime.now().strftime("%m_%d_%H_%M") + '_yelp.txt'
    with open(aucfile, 'a') as f:
        f.write("settings: {labeled ratio: %f, training ratio: %f, epochs: %d, update_step: %d}\n" % (lr_1, tr, nb_epochs, args.update_step))
        for t in range(nb_try):
            taskData = task(nb_task=args.task_num, degree=2, l_ratio=lr_1, t_ratio=tr, name='yelp')
            taskData.loadNProcess()
            f.write("target data name:" + taskData.f_name[-1] + "\n")
            f.write("%d-th try: \n" % t)
            for i in range(nb_runs):
                # training maml
                print("maml training...")
                print("In %d-th run..." % (i + 1))
                f.write("%d-th run\n" % i)
                feature_list, label, l_list, ul_list, idx_test = taskData.sampleAnomaly()
                config = modelArch(feature_list[0].shape[1], args.n_way)
                device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
                maml = Meta(args, config).to(device)
                # stats of parameters to be updated
                tmp = filter(lambda x: x.requires_grad, maml.parameters())
                num = sum(map(lambda x: np.prod(x.shape), tmp))
                # print(maml)
                print("Total #trainable tensors: ", num)
                batch_gen = DataLoaderN(feature_list, l_list, ul_list, b_size=8, b_size_qry=6, nb_task=args.task_num, device=device)
                maml.train()
                for e in range(1, nb_epochs + 1):
                    print("Running %d-th epoch" % e)
                    epoch_loss = 0
                    epoch_acc = 0
                    for b in range(nb_batch_maml):
                        x_train, y_train, x_qry, y_qry = batch_gen.getBatch(qry=False)
                        y_pred, loss = maml(x_train, y_train, x_qry, y_qry)
                        epoch_loss += loss
                    print("Epoch loss: %f" % epoch_loss)
                print("End of training.")
                # testing
                print("Evaluating the maml model")
                maml.eval()
                x_test, y_test = feature_list[args.task_num-1][idx_test].to(device), label[idx_test].to(device)
                auc_roc, auc_pr, ap = maml.evaluating(x_test, y_test)
                print("End of evaluating.")
                f.write("MAML auc_roc: %.5f, auc_pr: %.5f, ap: %.5f\n" % (auc_roc, auc_pr, ap))

                # g-dev training
                print('G-dev training...')
                features, labels, idx_labeled, idx_unlabeled, idx_test = SGC_process(taskData.target, degree=2, l_ratio=lr_s, tr_ratio=tr)
                # print("finish loading data...")
                attr_dim = features.shape[1]
                # print("%d-th run:" % i)
                # model = FCNet(attr_dim, 1).to(device)
                # model = SGC_original(attr_dim, 1).to(device)
                model = SGC(attr_dim, 1).to(device)
                # print(model)
                optim = torch.optim.Adam(model.parameters(), lr=0.002, weight_decay=0)
                # loss = deviation_loss()
                data_sampler = DataLoader(features, idx_labeled, idx_unlabeled, b_size=8)
                model.float()
                model.train()
                for e in range(1, nb_epochs + 1):
                    # print('Epoch: %d' % e)
                    epoch_loss = 0
                    epoch_acc = 0
                    for b in range(nb_batch):
                        x_b, y_b = data_sampler.getBatch()
                        x_b, y_b = x_b.to(device), y_b.to(device)
                        y_pred = model(x_b)
                        loss = deviation_loss(y_b, y_pred)
                        optim.zero_grad()
                        loss.backward()
                        optim.step()
                        epoch_loss += loss.item()
                    print("epoch loss %f" % epoch_loss)
                # validation
                model.eval()
                # print(idx_val.shape)
                x_val = features[idx_test].to(device)
                # print(x_val.shape)
                y_pred = model(x_val).detach().cpu().numpy()
                y_val = labels[idx_test].detach().cpu().numpy()
                auc_roc, auc_pr, _ = aucPerformance(y_val, y_pred)
                print("G-dev auc_roc: %.5f, auc_pr: %.5f" % (auc_roc, auc_pr))
                f.write("G-Dev auc_roc: %.5f, auc_pr: %.5f\n" % (auc_roc, auc_pr))

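deviation_loss in main above is the deviation loss from the DevNet paper: scores are converted to z-scores against a Gaussian reference of prior anomaly scores, inliers are pulled toward zero deviation, and labelled anomalies are pushed beyond a margin. A PyTorch sketch, with the reference size (5000) and margin (a = 5) taken from the paper's defaults rather than from this code:

import torch


def deviation_loss(y_true, y_pred, margin=5.0, ref_size=5000):
    # Z-score deviation loss: |dev| for inliers, hinge on the margin
    # for labelled outliers.
    ref = torch.randn(ref_size, device=y_pred.device)  # prior N(0, 1) scores
    dev = (y_pred.flatten() - ref.mean()) / ref.std()
    y = y_true.flatten().float()
    inlier_loss = torch.abs(dev)
    outlier_loss = torch.clamp(margin - dev, min=0.0)
    return torch.mean((1 - y) * inlier_loss + y * outlier_loss)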
Example #8
def run_devnet(args):
    print("Chosen mode :", args.mode)
    nm = 'fraud'
    network_depth = int(args.network_depth)
    random_seed = args.ramdn_seed

    runs = args.runs
    rauc = np.zeros(runs)
    ap = np.zeros(runs)
    filename = nm.strip()
    global data_format
    data_format = int(args.data_format)
    
    data = Dataset(mode="other")
    
    if args.mode == "unsupervised":
        # Pseudo-label the training set from LeSiNN outlier scores: the
        # top-known_outliers points become outliers, the rest inliers,
        # keeping as inliers only points whose true label is normal.
        outlier_scores = lesinn(data.X_train, data.X_train)
        ind_scores = np.argsort(outlier_scores.flatten())
        inlier_ids = ind_scores[:-args.known_outliers]
        outlier_ids = ind_scores[-args.known_outliers:]
        inlier_ids = np.intersect1d(inlier_ids, np.where(data.Y_train == 0)[0])

    train_time = 0
    test_time = 0
    for i in np.arange(runs):
        print(filename + ': round ' + str(i))
        x_train, x_test = data.X_train, data.X_val
        y_train, y_test = data.Y_train, data.Y_val

        if args.mode == "unsupervised":
            y_train[inlier_ids] = 0
            y_train[outlier_ids] = 1
        
        outlier_indices = np.where(y_train == 1)[0]
        inlier_indices = np.where(y_train == 0)[0]
        outliers = x_train[outlier_indices]
        n_outliers_org = outliers.shape[0]
        n_outliers = len(outlier_indices)

        n_noise = int(len(inlier_indices) * args.cont_rate /
                      (1. - args.cont_rate))
        
        rng = np.random.RandomState(random_seed)
        if data_format == 0:
            if n_outliers > args.known_outliers:
                mn = n_outliers - args.known_outliers
                remove_idx = rng.choice(outlier_indices, mn, replace=False)
                x_train = np.delete(x_train, remove_idx, axis=0)
                y_train = np.delete(y_train, remove_idx, axis=0)
        if args.cont_rate > 0:
            noises = inject_noise(outliers, n_noise, random_seed)
            x_train = np.append(x_train, noises, axis=0)
            y_train = np.append(y_train, np.zeros((noises.shape[0], 1)))
        
        outlier_indices = np.where(y_train == 1)[0]
        inlier_indices = np.where(y_train == 0)[0]
        n_samples_trn = x_train.shape[0]
        n_outliers = len(outlier_indices)
        print("Training data size: %d, No. outliers: %d" %
              (x_train.shape[0], n_outliers))

        start_time = time.time()
        input_shape = x_train.shape[1:]
        epochs = args.epochs
        batch_size = args.batch_size
        nb_batch = args.nb_batch
        model = deviation_network(input_shape)
        model_name = ("./model/" + args.mode + "_" + str(args.cont_rate) +
                      "cr_" + str(args.known_outliers) + "d.h5")
        checkpointer = ModelCheckpoint(model_name, monitor='loss', verbose=0,
                                       save_best_only=True,
                                       save_weights_only=True)
        
        model.fit_generator(batch_generator_sup(x_train, outlier_indices,
                                                inlier_indices, batch_size,
                                                nb_batch, rng),
                            steps_per_epoch=nb_batch,
                            epochs=epochs,
                            callbacks=[checkpointer],
                            verbose=True)
        train_time += time.time() - start_time

        start_time = time.time()
        scores = load_model_weight_predict(model_name, input_shape,
                                           network_depth, x_test)
        test_time += time.time() - start_time
        rauc[i], ap[i] = aucPerformance(scores, y_test)
    
    mean_auc = np.mean(rauc)
    # std_auc = np.std(rauc)
    mean_aucpr = np.mean(ap)
    std_aucpr = np.std(ap)
    train_time = train_time / runs
    test_time = test_time / runs
    print("average AUC-ROC: %.4f, average AUC-PR: %.4f" %
          (mean_auc, mean_aucpr))
    # print("average runtime: %.4f seconds" % (train_time + test_time))
    writeResults(filename + '_vrai_' + str(network_depth), n_samples_trn,
                 n_outliers_org, n_outliers, mean_aucpr, std_aucpr,
                 args.cont_rate, path=args.output)
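writeResults takes a different argument list in every example, so only its role is certain: append one summary row per configuration to a results file. A minimal sketch matching the call in Example #8 (the column names are assumptions):

import csv
import os


def writeResults(name, n_samples_trn, n_outliers_org, n_outliers,
                 mean_aucpr, std_aucpr, cont_rate, path='./results.csv'):
    # Append one results row, creating the file with a header if needed.
    header = ['name', 'n_samples_trn', 'n_outliers_org', 'n_outliers',
              'mean_aucpr', 'std_aucpr', 'cont_rate']
    row = [name, n_samples_trn, n_outliers_org, n_outliers,
           mean_aucpr, std_aucpr, cont_rate]
    new_file = not os.path.exists(path)
    with open(path, 'a', newline='') as f:
        writer = csv.writer(f)
        if new_file:
            writer.writerow(header)
        writer.writerow(row)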