def tune(lr=0.1, dropout=0.3, kernel_num=100, kernel_sizes='3,4,5', embed_dim=100):
    """Train and evaluate one configuration; the keyword arguments only set the
    argparse defaults, so flags passed on the command line still take precedence."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--lr", type=float, default=lr)
    parser.add_argument("--dropout", type=float, default=dropout)
    parser.add_argument("--kernel_num", type=int, default=kernel_num)
    parser.add_argument("--kernel_sizes", type=str, default=kernel_sizes)
    parser.add_argument("--batch_size", type=int, default=16)
    parser.add_argument("--early_stop", type=int, default=10)
    parser.add_argument("--embed_dim", type=int, default=embed_dim)
    parser.add_argument("--max_len", type=int, default=200)
    parser.add_argument("--class_num", type=int, default=3)
    parser.add_argument("--lr_decay", type=float, default=0.5)
    args = parser.parse_args()
    # print("lr", args.lr, "dropout", args.dropout, "kernel_num", args.kernel_num,
    #       "kernel_sizes", args.kernel_sizes, "batch_size", args.batch_size,
    #       "early_stop", args.early_stop, "embed_dim", args.embed_dim,
    #       "max_len", args.max_len, "class_num", args.class_num, "lr_decay", args.lr_decay)

    # load data
    train_loader, dev_loader, test_loader, vocab_size = get_dataloaders(args.batch_size, args.max_len)

    # build model
    model = WordCNN(args, vocab_size, embedding_matrix=None)

    # loss function
    criterion = nn.CrossEntropyLoss()

    # choose optimizer
    optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr)
    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=args.lr_decay)

    model, best_acc = trainer(train_loader, dev_loader, model, optimizer, criterion,
                              early_stop=args.early_stop)
    print('best_dev_acc:{}'.format(best_acc))
    predict(model, test_loader)
    print("This is args", args)
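# A minimal, hypothetical driver for tune() above (not part of the original
# code): it sweeps a couple of hyperparameters by calling tune() repeatedly.
# The grid values and the grid_search name are illustrative only; note that
# tune() still calls parser.parse_args(), so any flags actually passed on the
# command line override the swept values.
def grid_search():
    for lr in (0.01, 0.05, 0.1):
        for dropout in (0.3, 0.5):
            tune(lr=lr, dropout=dropout)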
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--lr", type=float, default=0.005)
    parser.add_argument("--dropout", type=float, default=0.3)
    parser.add_argument("--kernel_num", type=int, default=100)
    parser.add_argument("--kernel_sizes", type=str, default='2,3,4')
    parser.add_argument("--batch_size", type=int, default=16)
    parser.add_argument("--early_stop", type=int, default=15)
    parser.add_argument("--embed_dim", type=int, default=100)
    parser.add_argument("--max_len", type=int, default=200)
    parser.add_argument("--class_num", type=int, default=3)
    parser.add_argument("--lr_decay", type=float, default=0.5)
    args = parser.parse_args()

    # load data
    train_loader, dev_loader, test_loader, vocab_size = get_dataloaders(args.batch_size, args.max_len)

    # build model
    # try to use pretrained embedding here
    model = WordCNN(args, vocab_size, embedding_matrix=None)

    # loss function
    criterion = nn.CrossEntropyLoss()

    # choose optimizer
    optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr)
    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=args.lr_decay)

    model, best_acc = trainer(train_loader, dev_loader, model, optimizer, criterion,
                              early_stop=args.early_stop)
    print('best_dev_acc:{}'.format(best_acc))
    predict(model, test_loader)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--lr", type=float, default=0.1)
    parser.add_argument("--dropout", type=float, default=0.3)
    parser.add_argument("--kernel_num", type=int, default=100)
    parser.add_argument("--kernel_sizes", type=str, default='3,4,5')
    parser.add_argument("--batch_size", type=int, default=16)
    parser.add_argument("--early_stop", type=int, default=3)
    parser.add_argument("--embed_dim", type=int, default=100)
    parser.add_argument("--max_len", type=int, default=200)
    parser.add_argument("--class_num", type=int, default=3)
    parser.add_argument("--lr_decay", type=float, default=0.5)
    parser.add_argument('-dpad', '--dynamic_pad',
                        help='True to use dynamic padding, default is False',
                        action='store_true', default=False)
    parser.add_argument("--file_name", type=str, default='submission.csv')
    args = parser.parse_args()

    # check if dynamic padding flag is true
    if args.dynamic_pad:
        from preprocess_dpad import get_dataloaders
    else:
        from preprocess import get_dataloaders

    # load data
    train_loader, dev_loader, test_loader, vocab_size = get_dataloaders(args.batch_size, args.max_len)

    # build model
    # try to use pretrained embedding here
    # embedding_matrix = np.loadtxt('w_emb_mat.txt')
    model = WordCNN(args, vocab_size, embedding_matrix=None)

    # loss function
    criterion = nn.CrossEntropyLoss()

    # choose optimizer
    optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr)
    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=args.lr_decay)

    model, best_acc = trainer(train_loader, dev_loader, model, optimizer, criterion,
                              early_stop=args.early_stop)
    print('best_dev_acc:{}'.format(best_acc))
    predict(model, test_loader, args.file_name)
def main(args):
    print >> sys.stderr, args
    random.seed(args.seed)

    assert os.path.isdir(args.datapath)
    fold_dir = args.datapath

    X_train, y_train = load_dataset(fold_dir, 'train.ppi.txt')
    print "Positives", y_train.count('Positive'), "Negatives", y_train.count('Negative')

    # build the word vocabulary from the training text
    word_vocab = ['<ZERO>', 'UNK']
    for xx in X_train:
        _, _, _, text = xx
        for s in text:
            for w in s:
                if w.lower() not in word_vocab:
                    word_vocab.append(w.lower())
    print "Vocab", len(word_vocab), word_vocab[:10]

    # cache the vocabulary so the prediction script can reuse it
    vocab_cache = os.path.join(args.datapath, 'word_vocab.mppi.txt')
    with open(vocab_cache, 'w') as f:
        print "Saved vocab to", vocab_cache
        pickle.dump(word_vocab, f)

    embeddings = load_embeddings(args.embeddings_path, word_vocab, 200)

    labels = ['Negative', 'Positive']
    model_name = 'saved_model_ppi'

    if not os.path.exists('{}/scratch'.format(args.datapath)):
        os.mkdir('{}/scratch'.format(args.datapath))

    # move aside any previously saved model directory before creating a fresh one
    if os.path.exists('{}/{}'.format(args.datapath, model_name)):
        os.rename('{}/{}'.format(args.datapath, model_name),
                  '{}/{}_{}'.format(args.datapath, model_name, int(time.time())))
    os.mkdir('{}/{}'.format(args.datapath, model_name))

    # train an ensemble of models, each with a different seed
    for j in range(num_ensembles):
        m = WordCNN(labels, word_vocab, word_embeddings=embeddings, max_sequence_length=1000)
        m.fit(X_train, y_train, num_epoch=args.num_epoch, batch_size=args.batch_size, seed=j)
        save_path = '{}/{}/model_{}'.format(args.datapath, model_name, j)
        m.save(save_path)
        print "Saved model {} to {}".format(j, save_path)
def main(args):
    print >> sys.stderr, args
    random.seed(args.seed)

    assert os.path.isdir(args.datapath)
    fold_dir = args.datapath

    # read the annotated input files and build candidate pairs
    lines = []
    for line in fileinput.input(args.files):
        lines.append(line)
    tokens, pmids = load_annotated(lines)
    X_test = extract_candidates(tokens, pmids)
    #print len(pmids), 'articles'
    #print len(X_test), 'examples'

    # load the vocabulary cached by the training script
    vocab_cache = os.path.join(args.datapath, 'word_vocab.mppi.txt')
    with open(vocab_cache, 'r') as f:
        word_vocab = pickle.load(f)
    #print "Loaded vocab from", vocab_cache
    #print "Vocab", len(word_vocab), word_vocab[:10]

    model_name = 'saved_model_ppi'
    labels = ['Negative', 'Positive']

    # average the predicted probabilities over the ensemble
    proba_cumulative = np.zeros((len(X_test), len(labels)))
    for j in range(num_ensembles):
        m = WordCNN(labels, word_vocab, word_embeddings=None, max_sequence_length=1000)
        save_path = '{}/{}/model_{}'.format(args.datapath, model_name, j)
        m.restore(save_path)
        print >> sys.stderr, "Restoring model {}/{}".format(j + 1, num_ensembles)
        proba_cumulative += m.predict_proba(X_test)
    proba_cumulative /= num_ensembles

    y_pred = np.argmax(proba_cumulative, axis=1)
    positive_pct = [y[labels.index('Positive')] for y in proba_cumulative]

    # group 'Positive' scores by (pmid, entity pair), order-insensitive
    pair_scores = {}
    for inst, pred in zip(X_test, positive_pct):
        pmid, a, b, text = inst
        key = ','.join([pmid] + sorted([a, b]))
        if key not in pair_scores:
            pair_scores[key] = []
        pair_scores[key].append(pred)

    output_pairs = {}
    for k in sorted(pair_scores.keys()):
        #print k, pair_scores[k], np.mean(pair_scores[k])
        meanscore = np.mean(pair_scores[k])
        pmid, a, b = k.split(',')
        if pmid not in output_pairs:
            output_pairs[pmid] = []
        # if a pair was scored only once, duplicate the score so two columns are printed
        if len(pair_scores[k]) == 1:
            pair_scores[k].append(pair_scores[k][0])
        print '\t'.join([pmid, a, b, str(pair_scores[k][0]), str(pair_scores[k][1])])