Example #1
import argparse
import os
import time

import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt

# CVDataLoader, Predictor, and the metric helpers (accuracy, auc, c_index,
# recall, precision, f1) are assumed to be defined elsewhere in this project.


def main():
    # read in command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir",
                        default='data/sampling',
                        help='determine the base dir of the dataset document')
    parser.add_argument("--sample_n",
                        default=1000,
                        type=int,
                        help='starting image index of preprocessing')
    parser.add_argument("--evidence_n",
                        default=20,
                        type=int,
                        help='how many top/bottom tiles to pick from')
    parser.add_argument("--repl_n",
                        default=3,
                        type=int,
                        help='how many resampled replications')
    parser.add_argument("--image_split",
                        action='store_true',
                        help='if use image_split')
    parser.add_argument("--batch_size",
                        default=50,
                        type=int,
                        help="batch size")
    parser.add_argument("--stage_two",
                        action='store_true',
                        help='if only use stage two patients')
    parser.add_argument("--changhai",
                        action='store_true',
                        help='if use additional data')
    args = parser.parse_args()

    feature_size = 32
    #gpu = "cuda:0"
    gpu = None
    # 5-fold cross-validation
    dataloader = CVDataLoader(args, gpu, feature_size)

    n_epoch = 800
    lr = 0.0005
    if args.stage_two:
        weight_decay = 0.008
    else:
        weight_decay = 0.005
    manytimes_n = 8

    if not os.path.isdir('figure'):
        os.mkdir('figure')
    if not os.path.isdir(os.path.join(args.data_dir, 'model')):
        os.mkdir(os.path.join(args.data_dir, 'model'))

    acc_folds = []
    auc_folds = []
    c_index_folds = []
    f1_folds = []
    f1_folds_pos = []
    total_round = 0
    model_count = 0

    loss_function = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(0.8))

    for _ in range(manytimes_n):  # averaging
        for i in range(5):
            train_history = []
            test_history = []
            minimum_loss = None
            auc_fold = None
            acc_fold = None
            early_stop_count = 0

            model = Predictor(evidence_size=args.evidence_n,
                              layers=(100, 50, 1),
                              feature_size=feature_size)
            # model.apply(weight_init)
            if gpu:
                model = model.to(gpu)
            optimizer = torch.optim.RMSprop(model.parameters(),
                                            lr=lr,
                                            weight_decay=weight_decay)
            # optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

            dataloader.set_fold(i)
            X_test, Y_test, df_test = dataloader.get_test()
            # X_train, Y_train, df_train = dataloader.get_train()
            print('starting fold %d' % i)

            for epoch in range(n_epoch):
                #result = model(X_train)
                #loss = nn.functional.binary_cross_entropy(result, Y_train) + nn.functional.mse_loss(result, Y_train)
                # loss = nn.functional.mse_loss(result, Y_train)
                #loss.backward()
                #optimizer.step()
                #optimizer.zero_grad()

                # batch input
                for X_train_batch, Y_train_batch, df_train_batch in dataloader:
                    # print(X_train_batch.shape)
                    result = model(X_train_batch)
                    loss = loss_function(result, Y_train_batch)
                    loss.backward()
                    optimizer.step()
                    optimizer.zero_grad()

            # keep the last batch around for the train-side metrics below
            X_train, Y_train, df_train = X_train_batch, Y_train_batch, df_train_batch

                if epoch % 20 == 0:
                    result_test = model(X_test)
                    loss_test = loss_function(result_test, Y_test)
                    #loss_test = nn.functional.mse_loss(result_test, Y_test)
                    acc_train, acc_test = accuracy(result, Y_train), accuracy(result_test, Y_test)
                    auc_train, auc_test = auc(result, Y_train), auc(result_test, Y_test)
                    if args.changhai:
                        c_index_train, c_index_test = 0, 0
                    else:
                        c_index_train, c_index_test = c_index(result, df_train), c_index(result_test, df_test)
                    recall_train, recall_test = recall(result, Y_train), recall(result_test, Y_test)
                    precision_train, precision_test = precision(result, Y_train), precision(result_test, Y_test)
                    f1_train_pos, f1_test_pos = f1(result, Y_train), f1(result_test, Y_test)
                    f1_train, f1_test = f1(result, Y_train, negative=True), f1(result_test, Y_test, negative=True)
                    # store plain floats so the histories can be plotted later
                    train_history.append(
                        (epoch, loss.item(), acc_train, auc_train, c_index_train))
                    test_history.append(
                        (epoch, loss_test.item(), acc_test, auc_test, c_index_test))
                    if epoch % 40 == 0:
                        print(
                            "%s epoch:%d loss:%.3f/%.3f acc:%.3f/%.3f auc:%.3f/%.3f c_index:%.3f/%.3f recall:%.3f/%.3f prec:%.3f/%.3f f1:%.3f/%.3f f1(neg):%.3f/%.3f"
                            % (time.strftime(
                                '%m.%d %H:%M:%S', time.localtime(
                                    time.time())), epoch, loss, loss_test,
                               acc_train, acc_test, auc_train, auc_test,
                               c_index_train, c_index_test, recall_train,
                               recall_test, precision_train, precision_test,
                               f1_train_pos, f1_test_pos, f1_train, f1_test))
                    # early stop: accept a checkpoint when test loss improves by >0.5%
                    if minimum_loss is None or minimum_loss * 0.995 > loss_test:
                        # if minimum_loss is None or minimum_loss > loss_test:
                        if f1_train == 0:
                            continue  # skip checkpointing while train F1 is still zero
                        minimum_loss = loss_test
                        auc_fold = auc_test
                        acc_fold = acc_test
                        c_index_fold = c_index_test
                        f1_fold_pos = f1_test_pos
                        f1_fold = f1_test
                        early_stop_count = 0
                    elif auc_test > auc_fold and auc_test > 0.5 and acc_test >= acc_fold:
                        minimum_loss = loss_test
                        auc_fold = auc_test
                        acc_fold = acc_test
                        c_index_fold = c_index_test
                        f1_fold_pos = f1_test_pos
                        f1_fold = f1_test
                        early_stop_count = 0
                    else:
                        early_stop_count += 1
                    if early_stop_count > 2 and epoch > 100:
                        if args.stage_two:
                            if auc_fold > 0.55:
                                print('early stop at epoch %d' % epoch)
                                break
                        elif early_stop_count > 3:
                            print('early stop at epoch %d' % epoch)
                            break
                    if epoch > 500:
                        # decay the learning rate for the final stretch of training
                        optimizer = torch.optim.RMSprop(
                            model.parameters(),
                            lr * 0.6,
                            weight_decay=weight_decay * 1.2)

            train_history = np.array(train_history)
            test_history = np.array(test_history)
            acc_folds.append(acc_fold)
            auc_folds.append(auc_fold)
            f1_folds.append(f1_fold)
            f1_folds_pos.append(f1_fold_pos)
            c_index_folds.append(c_index_fold)
            plt.plot(train_history[:, 0], train_history[:, 1], label='train')
            plt.plot(test_history[:, 0], test_history[:, 1], label='test')
            plt.legend()
            plt.savefig('figure/sample_%d_fold%d.png' % (args.sample_n, i))
            plt.cla()
            if acc_fold > 0.7 and auc_fold > 0.6 and model_count < 10:
                model.save(args.data_dir + "/model/model_%d" % model_count)
                model_count += 1
            print("acc:%.3f\tauc:%.3f\tc_index:%.3f\tf1:%.3f" %
                  (acc_fold, auc_fold, c_index_fold, f1_fold))
            total_round += 1
            if gpu:
                del dataloader.X_train, dataloader.Y_train, dataloader.X_test, dataloader.Y_test
                del X_test, Y_test, X_train, Y_train, model, optimizer
                torch.cuda.empty_cache()

    print('CV-acc:%.3f CV-auc:%.3f CV-c-index:%.3f f1:%.3f f1(neg):%.3f' %
          (sum(acc_folds) / 5 / manytimes_n, sum(auc_folds) / 5 / manytimes_n,
           sum(c_index_folds) / 5 / manytimes_n, sum(f1_folds_pos) / 5 /
           manytimes_n, sum(f1_folds) / 5 / manytimes_n))
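
The metric helpers called above (accuracy, auc, f1, and friends) are project-local and not shown on this page. As a rough sketch of their assumed behavior, given that the model emits raw logits (the loss is BCEWithLogitsLoss), they might wrap scikit-learn like this:

import torch
from sklearn.metrics import f1_score, roc_auc_score


def _to_numpy(t):
    # detach first, in case the tensor still carries gradients
    return t.detach().cpu().numpy() if torch.is_tensor(t) else t


def auc(scores, labels):
    # AUC is rank-based, so raw logits work as well as probabilities
    return roc_auc_score(_to_numpy(labels), _to_numpy(scores))


def accuracy(scores, labels, threshold=0.0):
    # a logit above 0 corresponds to a sigmoid probability above 0.5
    preds = (_to_numpy(scores) > threshold).astype(int)
    return float((preds == _to_numpy(labels)).mean())


def f1(scores, labels, negative=False, threshold=0.0):
    # negative=True scores the negative class instead (pos_label=0)
    preds = (_to_numpy(scores) > threshold).astype(int)
    return f1_score(_to_numpy(labels), preds, pos_label=0 if negative else 1)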
Example #2

                       lw=lw,
                       batch_size=batch_size,
                       input_dropout=dropout_p,
                       gpu=gpu)
elif args.model == 'ErmlpAvg':
    model = ERMLP_avg(embedding_dim=embedding_dim,
                      embedding_rel_dim=embedding_rel_dim,
                      mlp_hidden=mlp_hidden,
                      weights=pretrained_weights,
                      n_r=n_r,
                      lw=lw,
                      batch_size=batch_size,
                      input_dropout=dropout_p,
                      gpu=gpu)
else:
    raise Exception('Unknown model!')

model_name = 'models/ConceptNet/model.bin'
state = torch.load(model_name, map_location=lambda storage, loc: storage)
model.load_state_dict(state)

# Test Model
test_s, test_o, test_p = test_data
score_test = model(test_s, test_o, test_p)
score_test = score_test.cpu().data.numpy() if gpu else score_test.data.numpy()
test_acc = get_accuracy(score_test, thresh)
test_auc_score = auc(score_test, test_label)

print('Test Accuracy: {0}'.format(test_acc))
print('Test AUC Score: {0}'.format(test_auc_score))
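
get_accuracy comes from the truncated part of this example; since it is called with only scores and a threshold, the original presumably reads the gold labels from the enclosing scope. A sketch with the labels made explicit:

import numpy as np


def get_accuracy(scores, thresh, labels):
    # threshold the raw scores and compare against 0/1 gold labels
    preds = (np.asarray(scores) > thresh).astype(int)
    return float((preds == np.asarray(labels)).mean())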
Example #3

    n_shapelets=50,
    min_shapelet_size=0,
    max_shapelet_size=1,
    force_dim=x_dimensions,
    metric="euclidean",
    # metric="scaled_euclidean",
    # metric="scaled_dtw",
    # metric_params={"r": 3},
)

# Build the gRSF (generalized Random Shapelet Forest) as a bagging ensemble
# of shapelet trees
bag = BaggingClassifier(
    base_estimator=tree,
    bootstrap=True,
    n_jobs=16,
    n_estimators=100,
    random_state=100,
)

# Results from cross-validation
true, pred = cross_validation.kfold(data, label, bag)

# Print trees for debugging
# for tree in bag.estimators_:
#     print_tree(tree.root_node_)
#     print(tree.root_node_.shapelet.array)

#Evaluation
evaluation.auc(true, pred)
evaluation.rocplot(true, pred)
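
The evaluation module is also project-local. Minimal stand-ins for the two helpers used here, assuming 0/1 labels and probability scores, might be:

import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve


def auc(true, pred):
    score = roc_auc_score(true, pred)
    print("AUC: %.3f" % score)
    return score


def rocplot(true, pred):
    fpr, tpr, _ = roc_curve(true, pred)
    plt.plot(fpr, tpr, label="gRSF")
    plt.plot([0, 1], [0, 1], linestyle="--", label="chance")
    plt.xlabel("false positive rate")
    plt.ylabel("true positive rate")
    plt.legend()
    plt.show()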
Example #4

		train_neg_s, train_neg_o, train_neg_p = train_negative_data
		train_label = np.concatenate((np.ones(len(train_s)), np.zeros(len(train_neg_s))))
		train_s = np.vstack([train_s, train_neg_s])
		train_o = np.vstack([train_o, train_neg_o])
		train_p = np.concatenate((train_p, train_neg_p))
		train_s, train_o, train_p, train_label = shuffle(train_s, train_o, train_p, train_label, random_state=4086)
		score = model(train_s, train_o, train_p)
		loss = model.bce_loss(score, train_label, average=True)
		optimizer.zero_grad()  # clear stale gradients; zero_grad is not visible in this excerpt
		loss.backward()
		optimizer.step()
		if normalize_embed:
			model.normalize_embeddings()
		epoch_loss.append(loss.cpu().data.numpy())
		pred_score = model.predict_proba(score)
		score = score.cpu().data.numpy() if gpu else score.data.numpy()
		train_auc_score = auc(score, train_label)
		print('Epoch {0}\tBatch {1}\tTrain Loss: {2}'.format(epoch, i, stats(epoch_loss)))
		print('Epoch {0}\tBatch {1}\tTraining AUC Score: {2}'.format(epoch, i, train_auc_score))
	
	if epoch % 10 == 0:
		# evaluate on the dev set
		for j, valid_batch_data in enumerate(valid_loader):
			valid_s, valid_o, valid_p, valid_label = valid_batch_data
			score_val = model(valid_s.numpy(), valid_o.numpy(), valid_p.numpy())
			score_val = score_val.cpu().data.numpy() if gpu else score_val.data.numpy()
			val_acc, thresh = find_clf_threshold(score_val)
		
			print('Threshold {0}'.format(thresh))
			val_auc_score = auc(score_val, valid_label)

			print('Epoch {0}\tBatch {1}\tValidation Accuracy: {2}'.format(epoch, j, val_acc))
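
find_clf_threshold is not shown either; judging by its call site it returns an (accuracy, threshold) pair from scores alone, so the original presumably reads the dev labels from the enclosing scope. A sketch with the labels passed in explicitly:

import numpy as np


def find_clf_threshold(scores, labels):
    # sweep candidate thresholds across the score range and keep the one
    # that maximizes dev-set accuracy
    best_acc, best_thresh = 0.0, 0.0
    for thresh in np.linspace(scores.min(), scores.max(), num=100):
        acc = float(((scores > thresh).astype(int) == labels).mean())
        if acc > best_acc:
            best_acc, best_thresh = acc, thresh
    return best_acc, best_thresh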
Example #5
import argparse
import logging
import time

import torch
import torchvision

# CAMdataloader, MyResNet, accuracy, and auc are assumed to be defined
# elsewhere in this project; `gpu` is presumably a module-level device
# string (e.g. "cuda:0") not shown in this excerpt.


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--name", default="", help='save file name')
    args = parser.parse_args()
    # training hyperparameters
    batch_size = 100
    epoch_n = 50
    lr = 0.0005
    weight_decay = 0.00001
    logging.basicConfig(filename='data/cam/log_' + args.name,
                        level=logging.INFO)

    msg = "%s loading data" % time.strftime('%m.%d %H:%M:%S')
    print(msg)
    logging.info(msg)

    dataloader = CAMdataloader(batch_size)
    model = MyResNet(torchvision.models.resnet.BasicBlock,
                     [3, 4, 6, 3]).to(gpu)
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=lr,
                                 weight_decay=weight_decay)
    _, Y_test = dataloader.get_test()
    Y_test = Y_test.to(gpu)

    msg = "%s training start" % time.strftime('%m.%d %H:%M:%S')
    print(msg)
    logging.info(msg)

    for epoch in range(epoch_n):
        total_data = 0
        for X_batch, Y_batch in dataloader:
            total_data += len(X_batch)
            X_batch, Y_batch = X_batch.to(gpu), Y_batch.to(gpu)
            Y_predict = torch.sigmoid(model(X_batch)).view(-1)
            loss = torch.nn.functional.binary_cross_entropy(Y_predict, Y_batch)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            # evaluation
            if total_data % 10000 == 0:
                Y_predict_test = []
                model.eval()
                for X_batch_test, _ in dataloader.test:
                    X_batch_test = X_batch_test.to(gpu)
                    y_predict_test = torch.sigmoid(
                        model(X_batch_test).detach())
                    Y_predict_test.append(y_predict_test)
                    # torch.cuda.empty_cache()
                Y_predict_test = torch.cat(Y_predict_test).view(-1)

                loss_test = torch.nn.functional.binary_cross_entropy(
                    Y_predict_test, Y_test)
                acc_train, acc_test = accuracy(Y_predict, Y_batch), accuracy(Y_predict_test, Y_test)
                auc_train, auc_test = auc(Y_predict, Y_batch), auc(Y_predict_test, Y_test)
                msg = "%s epoch:%d(%d/100000) loss:%.3f acc:%.3f auc:%.3f test_loss:%.3f test_acc:%.3f test_auc:%.3f" % (
                    time.strftime('%m.%d %H:%M:%S', time.localtime(
                        time.time())), epoch, total_data, loss, acc_train,
                    auc_train, loss_test, acc_test, auc_test)
                print(msg)
                logging.info(msg)
                model.train()

    model.save(args.name)
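
MyResNet and its save method are project-local; a minimal sketch consistent with the calls above (a ResNet-34-shaped backbone with a single-logit head; the save path is an assumption) could be:

import torch
import torchvision


class MyResNet(torchvision.models.ResNet):
    def __init__(self, block, layers):
        # single output logit for binary classification, matching the
        # sigmoid + view(-1) in the training loop above
        super().__init__(block, layers, num_classes=1)

    def save(self, name):
        # save path is hypothetical; adjust to the project's layout
        torch.save(self.state_dict(), 'data/cam/model_' + name)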
Example #6
            with torch.no_grad():
                accumulated_pair_auc = []
                for batch in validation_set.pairs(
                        config['evaluation']['batch_size']):
                    node_features, edge_features, from_idx, to_idx, graph_idx, labels = get_graph(
                        batch)
                    labels = labels.to(device)
                    eval_pairs = model(node_features.to(device),
                                       edge_features.to(device),
                                       from_idx.to(device), to_idx.to(device),
                                       graph_idx.to(device),
                                       config['evaluation']['batch_size'] * 2)

                    x, y = reshape_and_split_tensor(eval_pairs, 2)
                    similarity = compute_similarity(config, x, y)
                    pair_auc = auc(similarity, labels)
                    accumulated_pair_auc.append(pair_auc)

                accumulated_triplet_acc = []
                for batch in validation_set.triplets(
                        config['evaluation']['batch_size']):
                    node_features, edge_features, from_idx, to_idx, graph_idx = get_graph(
                        batch)
                    eval_triplets = model(
                        node_features.to(device), edge_features.to(device),
                        from_idx.to(device), to_idx.to(device),
                        graph_idx.to(device),
                        config['evaluation']['batch_size'] * 4)
                    x_1, y, x_2, z = reshape_and_split_tensor(eval_triplets, 4)
                    sim_1 = compute_similarity(config, x_1, y)
                    sim_2 = compute_similarity(config, x_2, z)
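
reshape_and_split_tensor is not shown in this excerpt. Its use above implies it splits a tensor of interleaved graph vectors ([batch * k, D]) into k tensors of shape [batch, D]; a sketch to that effect:

import torch


def reshape_and_split_tensor(tensor, n_splits):
    # rows arrive interleaved (element 1 of pair, element 2 of pair, ...),
    # so fold n_splits consecutive rows into one wide row, then slice
    feature_dim = tensor.shape[-1]
    tensor = tensor.reshape(-1, feature_dim * n_splits)
    return [tensor[:, i * feature_dim:(i + 1) * feature_dim]
            for i in range(n_splits)]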
Example #7
import argparse
import os
import pickle

import torch
import torch.nn as nn
from torch_geometric.loader import DataLoader  # assuming PyTorch Geometric, given data.x / data.edge_index

# GNN, CrossValidationSplitter, construct_graph_dataset, concat_result, and
# the metric helpers (accuracy, auc, c_index, f1) are assumed to be defined
# elsewhere in this project.


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir",
                        default='data/sampling',
                        help='base directory of the dataset')
    parser.add_argument("--sample_n",
                        default=2000,
                        type=int,
                        help='starting image index of preprocessing')
    parser.add_argument("--evidence_n",
                        default=500,
                        type=int,
                        help='how many top/bottom tiles to pick from')
    parser.add_argument("--repl_n",
                        default=3,
                        type=int,
                        help='how many resampled replications')
    parser.add_argument("--image_split",
                        action='store_true',
                        help='if use image_split')
    parser.add_argument("--batch_size",
                        default=200,
                        type=int,
                        help="batch size")
    parser.add_argument("--stage_two",
                        action='store_true',
                        help='if only use stage two patients')
    parser.add_argument("--threshold",
                        default=25,
                        type=float,
                        help='threshold')
    parser.add_argument("--changhai",
                        action='store_true',
                        help='if use additional data')
    parser.add_argument("--TH", action='store_true')
    args = parser.parse_args()

    gpu = "cuda:0"
    n_epoch = 80
    acc_folds = []
    auc_folds = []
    c_index_folds = []
    f1_folds = []
    f1_folds_pos = []
    unsuccessful_count = 0
    model_count = 0
    n_manytimes = 8

    # caching is disabled here; restore the commented condition to re-enable it
    if False:
        # if os.path.exists(os.path.join(args.data_dir, 'graph', 'graph_dataset.pkl')) and os.path.exists(os.path.join(args.data_dir, 'graph', 'graph_df.pkl')):
        print("loading cached graph data")
        with open(os.path.join(args.data_dir, 'graph', 'graph_dataset.pkl'),
                  'rb') as file:
            dataset = pickle.load(file)
        with open(os.path.join(args.data_dir, 'graph', 'graph_df.pkl'),
                  'rb') as file:
            df = pickle.load(file)
    else:
        if not os.path.exists(os.path.join(args.data_dir, 'graph')):
            os.mkdir(os.path.join(args.data_dir, 'graph'))
        dataset, df = construct_graph_dataset(args, gpu)
        with open(os.path.join(args.data_dir, 'graph', 'graph_dataset.pkl'),
                  'wb') as file:
            pickle.dump(dataset, file)
        with open(os.path.join(args.data_dir, 'graph', 'graph_df.pkl'),
                  'wb') as file:
            pickle.dump(df, file)

    splitter = CrossValidationSplitter(dataset,
                                       df,
                                       n=5,
                                       n_manytimes=n_manytimes)
    # criterion = torch.nn.CrossEntropyLoss()
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(0.4))
    fold_num = 0
    if not os.path.isdir(os.path.join(args.data_dir, 'model')):
        os.mkdir(os.path.join(args.data_dir, 'model'))

    for train_dataset, test_dataset, train_df, test_df in splitter:
        print("starting fold %d-%d" % (fold_num // 5, fold_num % 5))
        train_loader = DataLoader(train_dataset, batch_size=args.batch_size)
        test_loader = DataLoader(test_dataset, batch_size=args.batch_size)
        train_history = []
        test_history = []
        minimum_loss = None
        auc_fold = None
        acc_fold = None
        early_stop_count = 0
        model = GNN(32).to(gpu)
        optimizer = torch.optim.RMSprop(model.parameters(),
                                        lr=0.0004,
                                        weight_decay=0.001)

        for epoch in range(n_epoch):
            model.train()
            for data in train_loader:  # iterate in batches over the training set
                # x and edge_index are presumably already on the GPU from
                # construct_graph_dataset, so only the batch vector is moved
                y_pred = model(data.x, data.edge_index, data.batch.to(gpu)).view(-1)
                loss = criterion(y_pred, data.y)
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()

            if epoch % 1 == 0:  # evaluate every epoch
                model.eval()
                y_pred_train, y_train = concat_result(train_loader, model, gpu)
                y_pred_test, y_test = concat_result(test_loader, model, gpu)
                loss_train, loss_test = criterion(y_pred_train, y_train), criterion(y_pred_test, y_test)
                acc_train, acc_test = accuracy(y_pred_train, y_train), accuracy(y_pred_test, y_test)
                auc_train, auc_test = auc(y_pred_train, y_train), auc(y_pred_test, y_test)
                c_index_train, c_index_test = c_index(y_pred_train, train_df), c_index(y_pred_test, test_df)
                f1_train, f1_test = f1(y_pred_train, y_train, negative=True), f1(y_pred_test, y_test, negative=True)
                if epoch % 5 == 0:
                    print(
                        f'Epoch:{epoch:03d} Loss:{loss_train:.3f}/{loss_test:.3f} ACC:{acc_train:.3f}/{acc_test:.3f} AUC:{auc_train:.3f}/{auc_test:.3f} CI:{c_index_train:.3f}/{c_index_test:.3f} f1(neg):{f1_train:.3f}/{f1_test:.3f}'
                    )

                # early stop
                if minimum_loss is None or minimum_loss * 0.997 > loss_test:
                    # if minimum_loss is None or minimum_loss > loss_test:
                    if f1_train == 0:
                        continue
                    minimum_loss = loss_test
                    auc_fold = auc_test
                    acc_fold = acc_test
                    c_index_fold = c_index_test
                    f1_fold = f1_test
                    early_stop_count = 0
                    if acc_fold > 0.75 and auc_fold > 0.75:
                        model.save(args.data_dir +
                                   "/model/graph_%d" % model_count)
                # elif auc_test > auc_fold and auc_test > 0.5 and acc_test >= acc_fold:
                #     minimum_loss = loss_test
                #     auc_fold = auc_test
                #     acc_fold = acc_test
                #     c_index_fold = c_index_test
                #     f1_fold = f1_test
                #     early_stop_count = 0
                # accept the checkpoint when the combined metric score improves
                elif auc_fold + acc_fold + c_index_fold < auc_test + acc_test + c_index_test:
                    minimum_loss = loss_test
                    auc_fold = auc_test
                    acc_fold = acc_test
                    c_index_fold = c_index_test
                    f1_fold = f1_test
                    early_stop_count = 0
                    if acc_fold > 0.75 and auc_fold > 0.75:
                        model.save(args.data_dir +
                                   "/model/graph_%d" % model_count)
                else:
                    early_stop_count += 1
                if early_stop_count > 3 and epoch > 25:
                    if args.stage_two:
                        if auc_fold > 0.55 and acc_fold > 0.55:
                            print('early stop at epoch %d' % epoch)
                            if acc_fold > 0.75 and auc_fold > 0.75:
                                model.load(args.data_dir +
                                           "/model/graph_%d" % model_count)
                                model_count += 1
                            break
                    elif early_stop_count > 3:
                        print('early stop at epoch %d' % epoch)
                        break

        acc_folds.append(acc_fold)
        auc_folds.append(auc_fold)
        f1_folds.append(f1_fold)
        c_index_folds.append(c_index_fold)
        fold_num += 1
        print("acc:%.3f\tauc:%.3f\tc_index:%.3f\tf1:%.3f" %
              (acc_fold, auc_fold, c_index_fold, f1_fold))

    total_count = 5 * n_manytimes
    print('CV-acc:%.3f CV-auc:%.3f CV-c-index:%.3f f1(neg):%.3f' %
          (sum(acc_folds) / total_count, sum(auc_folds) / total_count,
           sum(c_index_folds) / total_count, sum(f1_folds) / total_count))
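
A hypothetical invocation of this example (the script filename is an assumption):

python train_graph.py --data_dir data/sampling --sample_n 2000 --batch_size 200 --stage_two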