Example #1
import argparse

import torch
import torch.nn as nn
# Project-local helpers (get_dataloaders, WordCNN, trainer, predict) are
# assumed importable from the original repository.

def tune(lr=0.1, dropout=0.3, kernel_num=100, kernel_sizes='3,4,5', embed_dim=100):
    parser = argparse.ArgumentParser()
    parser.add_argument("--lr", type=float, default=lr)
    parser.add_argument("--dropout", type=float, default=dropout)
    parser.add_argument("--kernel_num", type=int, default=kernel_num)
    parser.add_argument("--kernel_sizes", type=str, default=kernel_sizes)
    parser.add_argument("--batch_size", type=int, default=16)
    parser.add_argument("--early_stop", type=int, default=10)
    parser.add_argument("--embed_dim", type=int, default=embed_dim)
    parser.add_argument("--max_len", type=int, default=200)
    parser.add_argument("--class_num", type=int, default=3)
    parser.add_argument("--lr_decay", type=float, default=0.5)
    # parse_args([]) keeps the function arguments above as the effective
    # values; a bare parse_args() would let sys.argv override them on
    # every call of tune().
    args = parser.parse_args([])

    # print("lr", args.lr, "dropout", args.dropout, "kernel_num", args.kernel_num, "kernel_sizes",args.kernel_sizes, "batch_size", args.batch_size, "early_stop", args.early_stop, "embed_dim", args.embed_dim, "max_len", args.max_len, "class_num", args.class_num, "lr_decay", args.lr_decay)
    train_loader, dev_loader, test_loader, vocab_size = get_dataloaders(args.batch_size, args.max_len)
    model = WordCNN(args, vocab_size, embedding_matrix=None)
    # loss function
    criterion = nn.CrossEntropyLoss()
    # choose optimizer
    optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr)

    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=args.lr_decay)
    model, best_acc = trainer(train_loader, dev_loader, model, optimizer, criterion, early_stop=args.early_stop)

    print('best_dev_acc:{}'.format(best_acc))
    predict(model, test_loader)
    print("This is args", args)
Example #2
import argparse

import torch
import torch.nn as nn
# Project-local helpers (get_dataloaders, WordCNN, trainer, predict) are
# assumed importable from the original repository.

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--lr", type=float, default=0.005)
    parser.add_argument("--dropout", type=float, default=0.3)
    parser.add_argument("--kernel_num", type=int, default=100)
    parser.add_argument("--kernel_sizes", type=str, default='2,3,4')
    parser.add_argument("--batch_size", type=int, default=16)
    parser.add_argument("--early_stop", type=int, default=15)
    parser.add_argument("--embed_dim", type=int, default=100)
    parser.add_argument("--max_len", type=int, default=200)
    parser.add_argument("--class_num", type=int, default=3)
    parser.add_argument("--lr_decay", type=float, default=0.5)
    args = parser.parse_args()
    # load data
    train_loader, dev_loader, test_loader, vocab_size = get_dataloaders(args.batch_size, args.max_len)
    # build the model; a pretrained embedding matrix could be passed instead of None
    model = WordCNN(args, vocab_size, embedding_matrix=None)
    # loss function
    criterion = nn.CrossEntropyLoss()
    # optimize only the trainable parameters
    optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr)

    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=args.lr_decay)
    model, best_acc = trainer(train_loader, dev_loader, model, optimizer, criterion, early_stop=args.early_stop)

    print('best_dev_acc:{}'.format(best_acc))
    predict(model, test_loader)
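Both examples above parse an --lr_decay argument and carry a commented-out StepLR scheduler that never gets wired in. Enabling it would look roughly like the self-contained sketch below; the stand-in model and loop are illustrative, since trainer() (not shown) would have to call scheduler.step() once per epoch:

import torch

# Illustrative only: StepLR multiplies the learning rate by gamma after
# every step_size epochs.
model = torch.nn.Linear(100, 3)    # stand-in for WordCNN
optimizer = torch.optim.SGD(model.parameters(), lr=0.005)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.5)
for epoch in range(3):             # stand-in for the training loop
    optimizer.step()               # real training would happen here
    scheduler.step()               # lr: 0.005 -> 0.0025 -> 0.00125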
Example #3
import argparse

import torch
import torch.nn as nn
# Project-local helpers (WordCNN, trainer, predict) are assumed importable
# from the original repository; get_dataloaders is imported below.

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--lr", type=float, default=0.1)
    parser.add_argument("--dropout", type=float, default=0.3)
    parser.add_argument("--kernel_num", type=int, default=100)
    parser.add_argument("--kernel_sizes", type=str, default='3,4,5')
    parser.add_argument("--batch_size", type=int, default=16)
    parser.add_argument("--early_stop", type=int, default=3)
    parser.add_argument("--embed_dim", type=int, default=100)
    parser.add_argument("--max_len", type=int, default=200)
    parser.add_argument("--class_num", type=int, default=3)
    parser.add_argument("--lr_decay", type=float, default=0.5)
    parser.add_argument('-dpad',
                        '--dynamic_pad',
                        help='use dynamic padding (off by default)',
                        action='store_true')
    parser.add_argument("--file_name", type=str, default='submission.csv')
    args = parser.parse_args()

    # choose the preprocessing module based on the dynamic-padding flag
    if args.dynamic_pad:
        from preprocess_dpad import get_dataloaders
    else:
        from preprocess import get_dataloaders

    # load data
    train_loader, dev_loader, test_loader, vocab_size = get_dataloaders(
        args.batch_size, args.max_len)
    # build the model; a pretrained embedding matrix could be passed instead of None
    # embedding_matrix = np.loadtxt('w_emb_mat.txt')
    model = WordCNN(args, vocab_size, embedding_matrix=None)
    # loss function
    criterion = nn.CrossEntropyLoss()
    # optimize only the trainable parameters
    optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad,
                                       model.parameters()),
                                lr=args.lr)

    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=args.lr_decay)
    model, best_acc = trainer(train_loader,
                              dev_loader,
                              model,
                              optimizer,
                              criterion,
                              early_stop=args.early_stop)

    print('best_dev_acc:{}'.format(best_acc))
    predict(model, test_loader, args.file_name)
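The preprocess_dpad module selected by --dynamic_pad is not shown. Dynamic padding is usually implemented as a DataLoader collate_fn that pads each batch only to its own longest sequence instead of to a global max_len; a minimal sketch of that idea, with illustrative names rather than the repository's actual API:

import torch
from torch.nn.utils.rnn import pad_sequence

# Illustrative collate_fn: pad each batch to its own longest example.
def dynamic_pad_collate(batch):
    seqs, labels = zip(*batch)  # each seq is a 1-D LongTensor of token ids
    padded = pad_sequence(seqs, batch_first=True, padding_value=0)
    return padded, torch.tensor(labels)

# loader = DataLoader(dataset, batch_size=16, collate_fn=dynamic_pad_collate)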
Example #4
import os
import pickle
import random
import sys
import time
# Project-local helpers (load_dataset, load_embeddings, WordCNN) and the
# num_ensembles constant are assumed to be defined in the original module.

def main(args):
    print(args, file=sys.stderr)
    random.seed(args.seed)
    assert os.path.isdir(args.datapath)
    fold_dir = args.datapath

    X_train, y_train = load_dataset(fold_dir, 'train.ppi.txt')

    print("Positives", y_train.count('Positive'),
          "Negatives", y_train.count('Negative'))

    # build the word vocabulary; a set makes the membership test O(1)
    word_vocab = ['<ZERO>', 'UNK']
    seen = set(word_vocab)
    for xx in X_train:
        _, _, _, text = xx
        for s in text:
            for w in s:
                w = w.lower()
                if w not in seen:
                    seen.add(w)
                    word_vocab.append(w)

    print "Vocab", len(word_vocab), word_vocab[:10]
    vocab_cache = os.path.join(args.datapath, 'word_vocab.mppi.txt')
    with open(vocab_cache, 'w') as f:
        print "Saved vocab to", vocab_cache
        pickle.dump(word_vocab, f)

    embeddings = load_embeddings(args.embeddings_path, word_vocab, 200)

    labels = ['Negative', 'Positive']

    model_name = 'saved_model_ppi'
    if not os.path.exists('{}/scratch'.format(args.datapath)):
        os.mkdir('{}/scratch'.format(args.datapath))

    if os.path.exists('{}/{}'.format(args.datapath, model_name)):
        os.rename(
            '{}/{}'.format(args.datapath, model_name),
            '{}/{}_{}'.format(args.datapath, model_name, int(time.time())))

    os.mkdir('{}/{}'.format(args.datapath, model_name))

    for j in range(num_ensembles):
        m = WordCNN(labels,
                    word_vocab,
                    word_embeddings=embeddings,
                    max_sequence_length=1000)

        m.fit(X_train,
              y_train,
              num_epoch=args.num_epoch,
              batch_size=args.batch_size,
              seed=j)

        save_path = '{}/{}/model_{}'.format(args.datapath, model_name, j)
        m.save(save_path)
        print "Saved model {} to {}".format(j, save_path)
Example #5
import fileinput
import os
import pickle
import random
import sys

import numpy as np
# Project-local helpers (load_annotated, extract_candidates, WordCNN) and the
# num_ensembles constant are assumed to be defined in the original module.

def main(args):
    print(args, file=sys.stderr)
    random.seed(args.seed)
    assert os.path.isdir(args.datapath)
    fold_dir = args.datapath

    lines = []
    for line in fileinput.input(args.files):
        lines.append(line)

    tokens, pmids = load_annotated(lines)
    X_test = extract_candidates(tokens, pmids)
    # print(len(pmids), 'articles')
    # print(len(X_test), 'examples')

    vocab_cache = os.path.join(args.datapath, 'word_vocab.mppi.txt')

    # pickle needs a binary file handle
    with open(vocab_cache, 'rb') as f:
        word_vocab = pickle.load(f)
        # print("Loaded vocab from", vocab_cache)

    # print("Vocab", len(word_vocab), word_vocab[:10])
    model_name = 'saved_model_ppi'
    labels = ['Negative', 'Positive']

    proba_cumulative = np.zeros((len(X_test), len(labels)))

    for j in range(num_ensembles):
        m = WordCNN(labels,
                    word_vocab,
                    word_embeddings=None,
                    max_sequence_length=1000)

        save_path = '{}/{}/model_{}'.format(args.datapath, model_name, j)
        m.restore(save_path)
        print >> sys.stderr, "Restoring model {}/{}".format(
            j + 1, num_ensembles)

        proba_cumulative += m.predict_proba(X_test)

    proba_cumulative /= num_ensembles
    y_pred = np.argmax(proba_cumulative, axis=1)  # only positive-class scores are used below
    positive_pct = [y[labels.index('Positive')] for y in proba_cumulative]

    pair_scores = {}
    for inst, pred in zip(X_test, positive_pct):
        pmid, a, b, text = inst
        key = ','.join([pmid] + sorted([a, b]))
        if key not in pair_scores:
            pair_scores[key] = []

        pair_scores[key].append(pred)

    output_pairs = {}

    for k in sorted(pair_scores.keys()):
        # print(k, pair_scores[k], np.mean(pair_scores[k]))
        meanscore = np.mean(pair_scores[k])
        pmid, a, b = k.split(',')

        if pmid not in output_pairs:
            output_pairs[pmid] = []

        # a pair with a single candidate mention gets its score duplicated
        # so the output always has two score columns
        if len(pair_scores[k]) == 1:
            pair_scores[k].append(pair_scores[k][0])

        print('\t'.join(
            [pmid, a, b,
             str(pair_scores[k][0]),
             str(pair_scores[k][1])]))
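meanscore and output_pairs are assigned in the final loop but never consumed, so the snippet was presumably trimmed. One natural continuation (illustrative only, including the 0.5 cutoff) would threshold the averaged mention scores into predicted interacting pairs:

    # Illustrative continuation, not from the original source:
    threshold = 0.5  # assumed decision cutoff
    for k, scores in sorted(pair_scores.items()):
        if np.mean(scores) > threshold:
            pmid, a, b = k.split(',')
            output_pairs.setdefault(pmid, []).append((a, b))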