# imports needed by this script; CVDataLoader, Predictor, and the metric
# helpers (accuracy, auc, c_index, recall, precision, f1) come from the
# project's own modules
import argparse
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn


def main():
    # reading in
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", default='data/sampling',
                        help='base dir of the dataset document')
    parser.add_argument("--sample_n", default=1000, type=int,
                        help='starting image index of preprocessing')
    parser.add_argument("--evidence_n", default=20, type=int,
                        help='how many top/bottom tiles to pick from')
    parser.add_argument("--repl_n", default=3, type=int,
                        help='how many resampled replications')
    parser.add_argument("--image_split", action='store_true',
                        help='whether to use image_split')
    parser.add_argument("--batch_size", default=50, type=int, help="batch size")
    parser.add_argument("--stage_two", action='store_true',
                        help='whether to use only stage-two patients')
    parser.add_argument("--changhai", action='store_true',
                        help='whether to use the additional data')
    args = parser.parse_args()

    feature_size = 32
    # gpu = "cuda:0"
    gpu = None

    # 5-fold cross validation
    dataloader = CVDataLoader(args, gpu, feature_size)
    n_epoch = 800
    lr = 0.0005
    weight_decay = 0.008 if args.stage_two else 0.005
    manytimes_n = 8  # repeat the whole 5-fold CV this many times and average

    if not os.path.isdir('figure'):
        os.mkdir('figure')
    if not os.path.isdir(os.path.join(args.data_dir, 'model')):
        os.mkdir(os.path.join(args.data_dir, 'model'))

    acc_folds = []
    auc_folds = []
    c_index_folds = []
    f1_folds = []
    f1_folds_pos = []
    total_round = 0
    model_count = 0
    loss_function = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(0.8))

    for _ in range(manytimes_n):  # averaging
        for i in range(5):
            train_history = []
            test_history = []
            minimum_loss = None
            auc_fold = None
            acc_fold = None
            early_stop_count = 0
            model = Predictor(evidence_size=args.evidence_n,
                              layers=(100, 50, 1),
                              feature_size=feature_size)
            # model.apply(weight_init)
            if gpu:
                model = model.to(gpu)
            optimizer = torch.optim.RMSprop(model.parameters(), lr=lr,
                                            weight_decay=weight_decay)
            # optimizer = torch.optim.Adam(model.parameters(), lr=lr,
            #                              weight_decay=weight_decay)
            dataloader.set_fold(i)
            X_test, Y_test, df_test = dataloader.get_test()
            # X_train, Y_train, df_train = dataloader.get_train()
            print('starting fold %d' % i)

            for epoch in range(n_epoch):
                # full-batch variant (disabled):
                # result = model(X_train)
                # loss = nn.functional.binary_cross_entropy(result, Y_train) + \
                #        nn.functional.mse_loss(result, Y_train)
                # loss = nn.functional.mse_loss(result, Y_train)
                # loss.backward()
                # optimizer.step()
                # optimizer.zero_grad()

                # batch input
                for X_train_batch, Y_train_batch, df_train_batch in dataloader:
                    # print(X_train_batch.shape)
                    result = model(X_train_batch)
                    loss = loss_function(result, Y_train_batch)
                    loss.backward()
                    optimizer.step()
                    optimizer.zero_grad()
                # keep the last batch around for train-set metrics
                X_train, Y_train, df_train = X_train_batch, Y_train_batch, df_train_batch

                if epoch % 20 == 0:
                    result_test = model(X_test)
                    loss_test = loss_function(result_test, Y_test)
                    # loss_test = nn.functional.mse_loss(result_test, Y_test)
                    acc_train, acc_test = accuracy(result, Y_train), accuracy(result_test, Y_test)
                    auc_train, auc_test = auc(result, Y_train), auc(result_test, Y_test)
                    if args.changhai:
                        c_index_train, c_index_test = 0, 0
                    else:
                        c_index_train, c_index_test = c_index(result, df_train), c_index(result_test, df_test)
                    recall_train, recall_test = recall(result, Y_train), recall(result_test, Y_test)
                    precision_train, precision_test = precision(result, Y_train), precision(result_test, Y_test)
                    f1_train_pos, f1_test_pos = f1(result, Y_train), f1(result_test, Y_test)
                    f1_train, f1_test = f1(result, Y_train, negative=True), f1(result_test, Y_test, negative=True)
                    # record plain floats (.item()) so the histories can be
                    # turned into arrays and plotted
                    train_history.append((epoch, loss.item(), acc_train, auc_train, c_index_train))
                    test_history.append((epoch, loss_test.item(), acc_test, auc_test, c_index_test))
                    if epoch % 40 == 0:
                        print("%s epoch:%d loss:%.3f/%.3f acc:%.3f/%.3f auc:%.3f/%.3f c_index:%.3f/%.3f recall:%.3f/%.3f prec:%.3f/%.3f f1:%.3f/%.3f f1(neg):%.3f/%.3f"
                              % (time.strftime('%m.%d %H:%M:%S', time.localtime(time.time())),
                                 epoch, loss, loss_test, acc_train, acc_test,
                                 auc_train, auc_test, c_index_train, c_index_test,
                                 recall_train, recall_test, precision_train, precision_test,
                                 f1_train_pos, f1_test_pos, f1_train, f1_test))

                    # early stop
                    if minimum_loss is None or minimum_loss * 0.995 > loss_test:
                        # if minimum_loss is None or minimum_loss > loss_test:
                        if f1_train == 0:
                            continue
                        minimum_loss = loss_test
                        auc_fold = auc_test
                        acc_fold = acc_test
                        c_index_fold = c_index_test
                        f1_fold_pos = f1_test_pos
                        f1_fold = f1_test
                        early_stop_count = 0
                    elif auc_test > auc_fold and auc_test > 0.5 and acc_test >= acc_fold:
                        minimum_loss = loss_test
                        auc_fold = auc_test
                        acc_fold = acc_test
                        c_index_fold = c_index_test
                        f1_fold_pos = f1_test_pos
                        f1_fold = f1_test
                        early_stop_count = 0
                    else:
                        early_stop_count += 1
                    if early_stop_count > 2 and epoch > 100:
                        if args.stage_two:
                            if auc_fold > 0.55:
                                print('early stop at epoch %d' % epoch)
                                break
                        elif early_stop_count > 3:
                            print('early stop at epoch %d' % epoch)
                            break

                if epoch > 500:
                    # decay the learning rate late in training
                    optimizer = torch.optim.RMSprop(model.parameters(), lr * 0.6,
                                                    weight_decay=weight_decay * 1.2)

            train_history = np.array(train_history)
            test_history = np.array(test_history)
            acc_folds.append(acc_fold)
            auc_folds.append(auc_fold)
            f1_folds.append(f1_fold)
            f1_folds_pos.append(f1_fold_pos)
            c_index_folds.append(c_index_fold)
            plt.plot(train_history[:, 0], train_history[:, 1], label='train')
            plt.plot(test_history[:, 0], test_history[:, 1], label='test')
            plt.legend()
            plt.savefig('figure/sample_%d_fold%d.png' % (args.sample_n, i))
            plt.cla()
            if acc_fold > 0.7 and auc_fold > 0.6 and model_count < 10:
                model.save(args.data_dir + "/model/model_%d" % model_count)
                model_count += 1
            print("acc:%.3f\tauc:%.3f\tc_index:%.3f\tf1:%.3f" %
                  (acc_fold, auc_fold, c_index_fold, f1_fold))
            total_round += 1
            if gpu:
                del dataloader.X_train, dataloader.Y_train, dataloader.X_test, dataloader.Y_test
                del X_test, Y_test, X_train, Y_train, model, optimizer
                torch.cuda.empty_cache()

    print('CV-acc:%.3f CV-auc:%.3f CV-c-index:%.3f f1:%.3f f1(neg):%.3f' %
          (sum(acc_folds) / 5 / manytimes_n, sum(auc_folds) / 5 / manytimes_n,
           sum(c_index_folds) / 5 / manytimes_n, sum(f1_folds_pos) / 5 / manytimes_n,
           sum(f1_folds) / 5 / manytimes_n))
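# The metric helpers used above (accuracy, auc, recall, precision, f1) are not
# defined in this file. Below is a minimal sketch of what they are assumed to
# look like, built on sklearn, taking raw logits plus 0/1 targets; the real
# implementations may differ (threshold choice, and c_index needs the survival
# columns of the dataframe, so it is omitted here).
import torch
from sklearn import metrics


def _to_numpy(result, y):
    # logits -> probabilities, tensors -> numpy
    p = torch.sigmoid(result.detach()).view(-1).cpu().numpy()
    t = y.detach().view(-1).cpu().numpy()
    return p, t


def accuracy(result, y, threshold=0.5):
    p, t = _to_numpy(result, y)
    return float(((p > threshold) == (t > 0.5)).mean())


def auc(result, y):
    p, t = _to_numpy(result, y)
    return metrics.roc_auc_score(t, p)


def recall(result, y, threshold=0.5):
    p, t = _to_numpy(result, y)
    return metrics.recall_score(t > 0.5, p > threshold, zero_division=0)


def precision(result, y, threshold=0.5):
    p, t = _to_numpy(result, y)
    return metrics.precision_score(t > 0.5, p > threshold, zero_division=0)


def f1(result, y, negative=False, threshold=0.5):
    # negative=True scores the negative class instead of the positive one
    p, t = _to_numpy(result, y)
    pos_label = 0 if negative else 1
    return metrics.f1_score(t > 0.5, p > threshold,
                            pos_label=pos_label, zero_division=0)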
                         lw=lw, batch_size=batch_size,
                         input_dropout=dropout_p, gpu=gpu)
    elif args.model == 'ErmlpAvg':
        model = ERMLP_avg(embedding_dim=embedding_dim,
                          embedding_rel_dim=embedding_rel_dim,
                          mlp_hidden=mlp_hidden,
                          weights=pretrained_weights,
                          n_r=n_r,
                          lw=lw,
                          batch_size=batch_size,
                          input_dropout=dropout_p,
                          gpu=gpu)
    else:
        raise Exception('Unknown model!')

    model_name = 'models/ConceptNet/model.bin'
    state = torch.load(model_name, map_location=lambda storage, loc: storage)
    model.load_state_dict(state)

    # Test the model
    test_s, test_o, test_p = test_data
    score_test = model.forward(test_s, test_o, test_p)
    score_test = score_test.cpu().data.numpy() if gpu else score_test.data.numpy()
    test_acc = get_accuracy(score_test, thresh)
    test_auc_score = auc(score_test, test_label)
    print('Test Accuracy: {0}'.format(test_acc))
    print('Test AUC Score: {0}'.format(test_auc_score))
        n_shapelets=50,
        min_shapelet_size=0,
        max_shapelet_size=1,
        force_dim=x_dimensions,
        metric="euclidean",
        # metric="scaled_euclidean",
        # metric="scaled_dtw",
        # metric_params={"r": 3},
    )

    # Build the gRSF ensemble
    bag = BaggingClassifier(
        base_estimator=tree,
        bootstrap=True,
        n_jobs=16,
        n_estimators=100,
        random_state=100,
    )

    # Results from cross validation
    true, pred = cross_validation.kfold(data, label, bag)

    # Print trees for debugging
    # for tree in bag.estimators_:
    #     print_tree(tree.root_node_)
    #     print(tree.root_node_.shapelet.array)

    # Evaluation
    evaluation.auc(true, pred)
    evaluation.rocplot(true, pred)
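# cross_validation.kfold above is not shown in this fragment. A minimal sketch
# of what it is assumed to do, using sklearn's StratifiedKFold: fit a fresh
# clone of the classifier on each training split and collect the held-out
# labels and predicted probabilities so they can be scored together. The names
# here are hypothetical; the project's own cross_validation module may differ.
import numpy as np
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold


def kfold(data, label, clf, n_splits=5, random_state=100):
    true, pred = [], []
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True,
                               random_state=random_state)
    for train_idx, test_idx in splitter.split(data, label):
        fold_clf = clone(clf)  # fresh, unfitted copy for each fold
        fold_clf.fit(data[train_idx], label[train_idx])
        true.append(label[test_idx])
        pred.append(fold_clf.predict_proba(data[test_idx])[:, 1])
    return np.concatenate(true), np.concatenate(pred)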
            train_neg_s, train_neg_o, train_neg_p = train_negative_data
            train_label = np.concatenate((np.ones(len(train_s)),
                                          np.zeros(len(train_neg_s))))
            train_s = np.vstack([train_s, train_neg_s])
            train_o = np.vstack([train_o, train_neg_o])
            train_p = np.concatenate((train_p, train_neg_p))
            train_s, train_o, train_p, train_label = shuffle(
                train_s, train_o, train_p, train_label, random_state=4086)

            score = model.forward(train_s, train_o, train_p)
            loss = model.bce_loss(score, train_label, average=True)
            loss.backward()
            optimizer.step()
            if normalize_embed:
                model.normalize_embeddings()

            epoch_loss.append(loss.cpu().data.numpy())
            pred_score = model.predict_proba(score)
            score = score.cpu().data.numpy() if gpu else score.data.numpy()
            train_auc_score = auc(score, train_label)
            print('Epoch {0}\t Batch{1}\tTrain Loss value: {2}'.format(
                epoch, i, stats(epoch_loss)))
            print('Epoch {0}\t Batch{1}\tTraining AUC Score: {2}'.format(
                epoch, i, train_auc_score))

        if epoch % 10 == 0:
            # Evaluate on the dev set
            for j, valid_batch_data in enumerate(valid_loader, 0):
                valid_s, valid_o, valid_p, valid_label = valid_batch_data
                score_val = model.forward(valid_s.numpy(), valid_o.numpy(),
                                          valid_p.numpy())
                score_val = score_val.cpu().data.numpy() if gpu else score_val.data.numpy()
                val_acc, thresh = find_clf_threshold(score_val)
                print('Threshold {0}'.format(thresh))
                val_auc_score = auc(score_val, valid_label)
                print('Epoch {0}\t Batch{1}\t Validation Accuracy: {2}'.format(
                    epoch, j, val_acc))
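# auc and stats above are assumed helpers of this script. A minimal sketch,
# assuming auc wraps sklearn's roc_auc_score with a (score, label) argument
# order and stats reports the running mean of the batch losses; the originals
# may compute more.
import numpy as np
from sklearn.metrics import roc_auc_score


def auc(score, label):
    # score: raw model scores, label: 0/1 ground truth
    return roc_auc_score(label, score)


def stats(values):
    values = np.asarray(values, dtype=float)
    return '%.4f (+/- %.4f)' % (values.mean(), values.std())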
import argparse
import logging
import time

import torch
import torchvision

# assumed module-level device; the original defines gpu outside this snippet
gpu = "cuda:0" if torch.cuda.is_available() else "cpu"


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--name", default="", help='save file name')
    args = parser.parse_args()

    # training hyperparameters
    batch_size = 100
    epoch_n = 50
    lr = 0.0005
    weight_decay = 0.00001

    logging.basicConfig(filename='data/cam/log_' + args.name, level=logging.INFO)
    msg = "%s loading data" % time.strftime('%m.%d %H:%M:%S')
    print(msg)
    logging.info(msg)

    dataloader = CAMdataloader(batch_size)
    model = MyResNet(torchvision.models.resnet.BasicBlock, [3, 4, 6, 3]).to(gpu)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr,
                                 weight_decay=weight_decay)
    _, Y_test = dataloader.get_test()
    Y_test = Y_test.to(gpu)

    msg = "%s training start" % time.strftime('%m.%d %H:%M:%S')
    print(msg)
    logging.info(msg)

    for epoch in range(epoch_n):
        total_data = 0
        for X_batch, Y_batch in dataloader:
            total_data += len(X_batch)
            X_batch, Y_batch = X_batch.to(gpu), Y_batch.to(gpu)
            # torch.nn.functional.sigmoid is deprecated; use torch.sigmoid
            Y_predict = torch.sigmoid(model(X_batch)).view(-1)
            loss = torch.nn.functional.binary_cross_entropy(Y_predict, Y_batch)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            # evaluation every 10000 samples
            if total_data % 10000 == 0:
                Y_predict_test = []
                model.eval()
                for X_batch_test, _ in dataloader.test:
                    X_batch_test = X_batch_test.to(gpu)
                    y_predict_test = torch.sigmoid(model(X_batch_test).detach())
                    Y_predict_test.append(y_predict_test)
                    # torch.cuda.empty_cache()
                Y_predict_test = torch.cat(Y_predict_test).view(-1)
                loss_test = torch.nn.functional.binary_cross_entropy(
                    Y_predict_test, Y_test)
                acc_train, acc_test = accuracy(Y_predict, Y_batch), accuracy(
                    Y_predict_test, Y_test)
                auc_train, auc_test = auc(Y_predict, Y_batch), auc(
                    Y_predict_test, Y_test)
                msg = "%s epoch:%d(%d/100000) loss:%.3f acc:%.3f auc:%.3f test_loss:%.3f test_acc:%.3f test_auc:%.3f" % (
                    time.strftime('%m.%d %H:%M:%S', time.localtime(time.time())),
                    epoch, total_data, loss, acc_train, auc_train,
                    loss_test, acc_test, auc_test)
                print(msg)
                logging.info(msg)
                model.train()
    model.save(args.name)
    with torch.no_grad():
        accumulated_pair_auc = []
        for batch in validation_set.pairs(config['evaluation']['batch_size']):
            node_features, edge_features, from_idx, to_idx, graph_idx, labels = get_graph(batch)
            labels = labels.to(device)
            eval_pairs = model(node_features.to(device),
                               edge_features.to(device),
                               from_idx.to(device),
                               to_idx.to(device),
                               graph_idx.to(device),
                               config['evaluation']['batch_size'] * 2)
            x, y = reshape_and_split_tensor(eval_pairs, 2)
            similarity = compute_similarity(config, x, y)
            pair_auc = auc(similarity, labels)
            accumulated_pair_auc.append(pair_auc)

        accumulated_triplet_acc = []
        for batch in validation_set.triplets(config['evaluation']['batch_size']):
            node_features, edge_features, from_idx, to_idx, graph_idx = get_graph(batch)
            eval_triplets = model(node_features.to(device),
                                  edge_features.to(device),
                                  from_idx.to(device),
                                  to_idx.to(device),
                                  graph_idx.to(device),
                                  config['evaluation']['batch_size'] * 4)
            x_1, y, x_2, z = reshape_and_split_tensor(eval_triplets, 4)
            sim_1 = compute_similarity(config, x_1, y)
            sim_2 = compute_similarity(config, x_2, z)
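# reshape_and_split_tensor is used above to unpack the stacked graph vectors
# (pairs pack 2 graphs per example, triplets 4). A minimal sketch consistent
# with those call sites, assuming the model returns one row per graph and the
# graphs of an example are stored consecutively; the project's own helper may
# be implemented differently.
import torch


def reshape_and_split_tensor(tensor, n_splits):
    # [n_examples * n_splits, d] -> n_splits tensors of shape [n_examples, d]
    feature_dim = tensor.shape[-1]
    tensor = tensor.reshape(-1, feature_dim * n_splits)
    return [tensor[:, feature_dim * i: feature_dim * (i + 1)]
            for i in range(n_splits)]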
import argparse
import os
import pickle

import torch
import torch.nn as nn
# torch_geometric.data.DataLoader in older PyG releases
from torch_geometric.loader import DataLoader


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", default='data/sampling',
                        help='base dir of the dataset document')
    parser.add_argument("--sample_n", default=2000, type=int,
                        help='starting image index of preprocessing')
    parser.add_argument("--evidence_n", default=500, type=int,
                        help='how many top/bottom tiles to pick from')
    parser.add_argument("--repl_n", default=3, type=int,
                        help='how many resampled replications')
    parser.add_argument("--image_split", action='store_true',
                        help='whether to use image_split')
    parser.add_argument("--batch_size", default=200, type=int, help="batch size")
    parser.add_argument("--stage_two", action='store_true',
                        help='whether to use only stage-two patients')
    parser.add_argument("--threshold", default=25, type=float, help='threshold')
    parser.add_argument("--changhai", action='store_true',
                        help='whether to use the additional data')
    parser.add_argument("--TH", action='store_true')
    args = parser.parse_args()

    gpu = "cuda:0"
    n_epoch = 80
    acc_folds = []
    auc_folds = []
    c_index_folds = []
    f1_folds = []
    f1_folds_pos = []
    unsuccessful_count = 0
    model_count = 0
    n_manytimes = 8

    # graph caching (currently disabled; restore the commented condition below
    # to reuse a cached graph)
    if False:
        # if os.path.exists(os.path.join(args.data_dir, 'graph', 'graph_dataset.pkl')) and \
        #         os.path.exists(os.path.join(args.data_dir, 'graph', 'graph_df.pkl')):
        print("loading cached graph data")
        with open(os.path.join(args.data_dir, 'graph', 'graph_dataset.pkl'), 'rb') as file:
            dataset = pickle.load(file)
        with open(os.path.join(args.data_dir, 'graph', 'graph_df.pkl'), 'rb') as file:
            df = pickle.load(file)
    else:
        if not os.path.exists(os.path.join(args.data_dir, 'graph')):
            os.mkdir(os.path.join(args.data_dir, 'graph'))
        dataset, df = construct_graph_dataset(args, gpu)
        with open(os.path.join(args.data_dir, 'graph', 'graph_dataset.pkl'), 'wb') as file:
            pickle.dump(dataset, file)
        with open(os.path.join(args.data_dir, 'graph', 'graph_df.pkl'), 'wb') as file:
            pickle.dump(df, file)

    splitter = CrossValidationSplitter(dataset, df, n=5, n_manytimes=n_manytimes)
    # criterion = torch.nn.CrossEntropyLoss()
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(0.4))
    fold_num = 0
    if not os.path.isdir(os.path.join(args.data_dir, 'model')):
        os.mkdir(os.path.join(args.data_dir, 'model'))

    for train_dataset, test_dataset, train_df, test_df in splitter:
        print("starting fold %d-%d" % (fold_num // 5, fold_num % 5))
        train_loader = DataLoader(train_dataset, batch_size=args.batch_size)
        test_loader = DataLoader(test_dataset, batch_size=args.batch_size)
        train_history = []
        test_history = []
        minimum_loss = None
        auc_fold = None
        acc_fold = None
        early_stop_count = 0
        model = GNN(32).to(gpu)
        optimizer = torch.optim.RMSprop(model.parameters(), lr=0.0004,
                                        weight_decay=0.001)
        for epoch in range(n_epoch):
            model.train()
            for data in train_loader:  # iterate in batches over the training set
                y_pred = model(data.x, data.edge_index,
                               data.batch.to(gpu)).view(-1)  # forward pass
                loss = criterion(y_pred, data.y)  # compute the loss
                loss.backward()                   # derive gradients
                optimizer.step()                  # update parameters
                optimizer.zero_grad()             # clear gradients
            if epoch % 1 == 0:  # evaluate every epoch
                model.eval()
                y_pred_train, y_train = concat_result(train_loader, model, gpu)
                y_pred_test, y_test = concat_result(test_loader, model, gpu)
                loss_train, loss_test = criterion(y_pred_train, y_train), criterion(y_pred_test, y_test)
                # loss_test = nn.functional.mse_loss(result_test, Y_test)
                acc_train, acc_test = accuracy(y_pred_train, y_train), accuracy(y_pred_test, y_test)
                auc_train, auc_test = auc(y_pred_train, y_train), auc(y_pred_test, y_test)
                if False:  # disabled toggle; always compute the c-index
                    c_index_train, c_index_test = 0, 0
                else:
                    c_index_train, c_index_test = c_index(y_pred_train, train_df), c_index(y_pred_test, test_df)
                f1_train, f1_test = f1(y_pred_train, y_train, negative=True), f1(y_pred_test, y_test, negative=True)
                if epoch % 5 == 0:
                    print(f'Epoch:{epoch:03d} Loss:{loss_train:.3f}/{loss_test:.3f} '
                          f'ACC:{acc_train:.3f}/{acc_test:.3f} '
                          f'AUC:{auc_train:.3f}/{auc_test:.3f} '
                          f'CI:{c_index_train:.3f}/{c_index_test:.3f} '
                          f'f1(neg):{f1_train:.3f}/{f1_test:.3f}')

                # early stop
                if minimum_loss is None or minimum_loss * 0.997 > loss_test:
                    # if minimum_loss is None or minimum_loss > loss_test:
                    if f1_train == 0:
                        continue
                    minimum_loss = loss_test
                    auc_fold = auc_test
                    acc_fold = acc_test
                    c_index_fold = c_index_test
                    f1_fold = f1_test
                    early_stop_count = 0
                    if acc_fold > 0.75 and auc_fold > 0.75:
                        model.save(args.data_dir + "/model/graph_%d" % model_count)
                # elif auc_test > auc_fold and auc_test > 0.5 and acc_test >= acc_fold:
                #     ... (superseded by the combined-score rule below)
                elif auc_fold + acc_fold + c_index_fold < auc_test + acc_test + c_index_test:
                    # note: the original compared c_index_fold against itself
                    # here, which reduced the rule to auc+acc only; fixed to
                    # use c_index_test on the right-hand side
                    minimum_loss = loss_test
                    auc_fold = auc_test
                    acc_fold = acc_test
                    c_index_fold = c_index_test
                    f1_fold = f1_test
                    early_stop_count = 0
                    if acc_fold > 0.75 and auc_fold > 0.75:
                        model.save(args.data_dir + "/model/graph_%d" % model_count)
                else:
                    early_stop_count += 1

                if abs(auc_fold - 1) < 0.0001:
                    pass  # debug hook for a suspiciously perfect AUC
                if early_stop_count > 3 and epoch > 25:
                    if args.stage_two:
                        if auc_fold > 0.55 and acc_fold > 0.55:
                            print('early stop at epoch %d' % epoch)
                            if acc_fold > 0.75 and auc_fold > 0.75:
                                model.load(args.data_dir + "/model/graph_%d" % model_count)
                                model_count += 1
                            break
                    elif early_stop_count > 3:
                        print('early stop at epoch %d' % epoch)
                        break

        acc_folds.append(acc_fold)
        auc_folds.append(auc_fold)
        f1_folds.append(f1_fold)
        c_index_folds.append(c_index_fold)
        fold_num += 1
        print("acc:%.3f\tauc:%.3f\tc_index:%.3f\tf1:%.3f" %
              (acc_fold, auc_fold, c_index_fold, f1_fold))

    total_count = 5 * n_manytimes
    print('CV-acc:%.3f CV-auc:%.3f CV-c-index:%.3f f1(neg):%.3f' %
          (sum(acc_folds) / total_count, sum(auc_folds) / total_count,
           sum(c_index_folds) / total_count, sum(f1_folds) / total_count))
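# concat_result above is an assumed helper. A minimal sketch consistent with
# its call sites: run the model over every batch of a loader and return the
# concatenated predictions and labels; the real helper may differ.
@torch.no_grad()
def concat_result(loader, model, gpu):
    y_pred, y_true = [], []
    for data in loader:
        y_pred.append(model(data.x, data.edge_index, data.batch.to(gpu)).view(-1))
        y_true.append(data.y)
    return torch.cat(y_pred), torch.cat(y_true)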