Example No. 1
def main():
    args = parse_args()

    # input files
    wv_file = args.glove_dir + '/' + args.wv_file
    wv_dim = args.wv_dim

    # output files
    helper.ensure_dir(args.vocab_dir)
    vocab_file = args.vocab_dir + '/vocab.pkl'
    emb_file = args.vocab_dir + '/embedding.npy'

    # load files
    print("loading files...")
    train_file = args.data_dir + '/train.jsonl'
    dev_file = args.data_dir + '/dev.jsonl'
    # test_file = args.data_dir + '/test.jsonl'
    train_tokens = load_tokens(train_file)
    dev_tokens = load_tokens(dev_file)
    # test_tokens = load_tokens(test_file)
    if args.lower:
        train_tokens, dev_tokens = [[t.lower() for t in tokens] for tokens in \
                                                 (train_tokens, dev_tokens)]
        # (train_tokens, dev_tokens, test_tokens)]

    # load glove
    print("loading glove...")
    glove_vocab = vocab.load_glove_vocab(wv_file, wv_dim)
    print("{} words loaded from glove.".format(len(glove_vocab)))

    print("building vocab...")
    # all_tokens = train_tokens + dev_tokens + test_tokens
    all_tokens = train_tokens + dev_tokens
    v = build_vocab(all_tokens, glove_vocab, args.min_freq)

    print("calculating oov...")
    # datasets = {'train': train_tokens, 'dev': dev_tokens, 'test': test_tokens}
    datasets = {'train': train_tokens, 'dev': dev_tokens}
    for dname, d in datasets.items():
        total, oov = count_oov(d, v)
        print("{} oov: {}/{} ({:.2f}%)".format(dname, oov, total,
                                               oov * 100.0 / total))

    print("building embeddings...")
    if args.random:
        print("using random initialization...")
        embedding = random_embedding(v, wv_dim)
    else:
        embedding = vocab.build_embedding(wv_file, v, wv_dim)
    print("embedding size: {} x {}".format(*embedding.shape))

    print("dumping to files...")
    with open(vocab_file, 'wb') as outfile:
        pickle.dump(v, outfile)
    np.save(emb_file, embedding)
    print("all done.")
Example No. 2
def evaluate_model(evalparams):

    torch.manual_seed(evalparams.seed)
    random.seed(1234)
    if evalparams.cpu:
        evalparams.cuda = False
    elif evalparams.cuda:
        torch.cuda.manual_seed(evalparams.seed)

    # load opt
    model_file = evalparams.model_dir + "/" + evalparams.model
    print("Loading model from {}".format(model_file))
    opt = torch_utils.load_config(model_file)
    model = RelationModel(opt)
    model.load(model_file)

    # load vocab
    vocab_file = evalparams.model_dir + '/vocab.pkl'
    vocab = Vocab(vocab_file, load=True)
    assert opt['vocab_size'] == vocab.size, \
        "Vocab size must match that in the saved model."

    # load data
    data_file = opt['data_dir'] + '/{}.json'.format(evalparams.dataset)
    print("Loading data from {} with batch size {}...".format(
        data_file, opt['batch_size']))
    batch = DataLoader(data_file,
                       opt['batch_size'],
                       opt,
                       vocab,
                       evaluation=True)

    helper.print_config(opt)
    id2label = dict([(v, k) for k, v in constant.LABEL_TO_ID.items()])

    predictions = []
    all_probs = []
    for i, b in enumerate(batch):
        preds, probs, _ = model.predict(b)
        predictions += preds
        all_probs += probs
    predictions = [id2label[p] for p in predictions]
    p, r, f1 = scorer.score(batch.gold(), predictions, verbose=True)

    # save probability scores
    if len(evalparams.out) > 0:
        helper.ensure_dir(os.path.dirname(evalparams.out))
        with open(evalparams.out, 'wb') as outfile:
            pickle.dump(all_probs, outfile)
        print("Prediction scores saved to {}.".format(evalparams.out))

    print("Evaluation ended.")

    return (batch.gold(), predictions, model)
Example No. 3
def prepare_vocab(data_dir,
                  vocab_dir,
                  spacy_model,
                  glove_dir="dataset/glove",
                  wv_file="glove.840B.300d.txt",
                  wv_dim=300,
                  min_freq=0,
                  lower=True):
    # input files
    train_file = data_dir + '/train.json'
    dev_file = data_dir + '/dev.json'
    test_file = data_dir + '/test.json'
    wv_file = glove_dir + '/' + wv_file

    # output files
    helper.ensure_dir(vocab_dir)
    vocab_file = vocab_dir + '/vocab.pkl'
    emb_file = vocab_dir + '/embedding.npy'

    # load files
    print("loading files...")
    train_tokens = load_tokens(train_file, spacy_model)
    dev_tokens = load_tokens(dev_file, spacy_model)
    test_tokens = load_tokens(test_file, spacy_model)
    if lower:
        train_tokens, dev_tokens, test_tokens = [[t.lower() for t in tokens] for tokens in \
                                                 (train_tokens, dev_tokens, test_tokens)]

    # load glove
    print("loading glove...")
    glove_vocab = vocab.load_glove_vocab(wv_file, wv_dim)
    print("{} words loaded from glove.".format(len(glove_vocab)))

    print("building vocab...")
    v = build_vocab(train_tokens, glove_vocab, min_freq)

    print("calculating oov...")
    datasets = {'train': train_tokens, 'test': test_tokens}
    for dname, d in datasets.items():
        total, oov = count_oov(d, v)
        print("{} oov: {}/{} ({:.2f}%)".format(dname, oov, total,
                                               oov * 100.0 / total))

    print("building embeddings...")
    embedding = vocab.build_embedding(wv_file, v, wv_dim)
    print("embedding size: {} x {}".format(*embedding.shape))

    print("dumping to files...")
    with open(vocab_file, 'wb') as outfile:
        pickle.dump(v, outfile)
    np.save(emb_file, embedding)
    print("all done.")
Example No. 4
def viz_att(words, attn, name, label):
    sns.set()
    f, ax = plt.subplots(figsize=(20, 20))
    df = pd.DataFrame(attn, index=words, columns=words)
    sns.heatmap(df, xticklabels=words, yticklabels=words, cmap="YlGnBu", ax=ax)
    ax.set_title(name)
    label_y = ax.get_yticklabels()
    plt.setp(label_y, rotation=360, horizontalalignment='right')
    label_x = ax.get_xticklabels()
    plt.setp(label_x, rotation=90, horizontalalignment='right')
    fig_path = "svgs/" + str(label)
    ensure_dir(fig_path)
    f.savefig(fig_path + "/" + name + '.svg',
              format='svg',
              bbox_inches='tight')
Example No. 5
def prepare_vocabulary(vocab_params):
    
    # input files
    train_file = vocab_params.data_dir + '/train.json'
    dev_file = vocab_params.data_dir + '/dev.json'
    test_file = vocab_params.data_dir + '/test.json'
    wv_file = vocab_params.glove_dir + '/' + vocab_params.glove_text_file
    wv_dim = vocab_params.emb_dim

    # output files
    helper.ensure_dir(vocab_params.vocab_dir)
    vocab_file = vocab_params.vocab_dir + vocab_params.vocab_file
    emb_file = vocab_params.vocab_dir + vocab_params.embed_file

    # load files
    print("loading files...")
    train_tokens = load_tokens(train_file)
    dev_tokens = load_tokens(dev_file)
    test_tokens = load_tokens(test_file)
    if vocab_params.lower:
        train_tokens, dev_tokens, test_tokens = [[t.lower() for t in tokens] for tokens in\
                (train_tokens, dev_tokens, test_tokens)]

    # load glove
    print("loading glove...")
    glove_vocab = vocab.load_glove_vocab(wv_file, wv_dim)
    print("{} words loaded from glove.".format(len(glove_vocab)))
    
    print("building vocab...")
    v = build_vocab(train_tokens, glove_vocab, vocab_params.min_freq)

    print("calculating oov...")
    datasets = {'train': train_tokens, 'dev': dev_tokens, 'test': test_tokens}
    for dname, d in datasets.items():
        total, oov = count_oov(d, v)
        print("{} oov: {}/{} ({:.2f}%)".format(dname, oov, total, oov*100.0/total))
    
    print("building embeddings...")
    embedding = vocab.build_embedding(wv_file, v, wv_dim)
    print("embedding size: {} x {}".format(*embedding.shape))

    print("dumping to files...")
    with open(vocab_file, 'wb') as outfile:
        pickle.dump(v, outfile)
    np.save(emb_file, embedding)
    print("all done.")
    return v
Example No. 6
def main():
    args = parse_args()

    # input files
    train_file = args.data_dir + '/train.json'
    dev_file = args.data_dir + '/dev.json'
    test_file = args.data_dir + '/test.json'

    embedding_file = args.ucca_embedding_dir + '/' + args.ucca_embedding_file
    index_file = args.ucca_embedding_dir + '/' + args.ucca_embedding_index_file

    helper.ensure_dir(args.ucca_embedding_dir)

    UccaEmbedding.prepare(args.ucca_embedding_dim,
                          [train_file, dev_file, test_file], index_file,
                          embedding_file, args.ucca_embedding_source)

    return UccaEmbedding(args.ucca_embedding_dim, index_file, embedding_file)
Example No. 7
def split_test_data(coarse_name):
    data = read_tsv('dataset/test.tsv')
    # for i,coarse_name in enumerate(constant.COARSE_INTO_MULTI):
    # save dir
    res_dir = 'result/multi/' + coarse_name
    helper.ensure_dir(res_dir)

    # select test data according to coarse predictions
    coarse_id = constant.COARSE_TO_ID[coarse_name]
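    # coarse predictions were presumably dumped with str(); eval() reads them back as a Python list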
    coarse_prediction = eval(open(constant.BEST_PRED_COARSE_FILE).read())
    tmp_list, index_rec, labels = [], {}, []
    for i, p in enumerate(coarse_prediction):
        if p == coarse_id:
            tmp_list.append(data[i])
            # index_rec[original test.tsv index (0~1500)] = position within this coarse subset
            index_rec[i] = len(tmp_list) - 1
            labels.append(data[i]['label'])

    # save input data of test
    print("\nsaving data...")
    helper.ensure_dir('dataset/multi/' + coarse_name + '/eval/')
    input_path = os.path.join('dataset/multi/' + coarse_name + '/eval/',
                              'test.tsv')
    with open(input_path, 'w') as f:
        for i, p in enumerate(tmp_list):
            f.write('\t'.join([str(p['label']), p['text_a']]) + '\n')
    print("test input file saved to file {}".format(input_path))

    # save index relation
    index_rela_path = os.path.join(res_dir, 'index_relation')
    with open(index_rela_path, 'w') as f:
        f.write(str(index_rec))
    print(
        "index relation between multi test set and test.tsv saved to file {}".
        format(index_rela_path))

    # save corresponding labels
    labels_save_path = os.path.join(res_dir, 'labels')
    with open(labels_save_path, 'w') as f:
        f.write(str(labels))
    print("corresponding labels saved to file {}".format(labels_save_path) +
          "\n")
Example No. 8
def main():
    args = parse_args()
    
    # input files
    train_file = args.data_dir + '/train.json'
    dev_file = args.data_dir + '/dev.json'
    test_file = args.data_dir + '/test.json'
    wv_file = args.glove_dir + '/' + args.wv_file
    wv_dim = args.wv_dim

    # output files
    helper.ensure_dir(args.vocab_dir)
    vocab_file = args.vocab_dir + '/vocab.pkl'
    emb_file = args.vocab_dir + '/embedding.npy'

    # load files
    print("loading files...")
    train_tokens = load_tokens(train_file)
    test_tokens = load_tokens(test_file)
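    # no separate dev split in this setup; reuse the test tokens as dev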
    dev_tokens = test_tokens

    # load glove
    print("loading glove...")
    glove_vocab = load_glove_vocab(wv_file, wv_dim)
    print("{} words loaded from glove.".format(len(glove_vocab)))
    
    print("building vocab...")
    v = build_vocab(train_tokens, glove_vocab)

    print("calculating oov...")
    datasets = {'train': train_tokens, 'dev': dev_tokens, 'test': test_tokens}
    for dname, d in datasets.items():
        total, oov = count_oov(d, v)
        print("{} oov: {}/{} ({:.2f}%)".format(dname, oov, total, oov*100.0/total))
    
    print("building embeddings...")
    embedding = build_embedding(wv_file, v, wv_dim)
    print("embedding size: {} x {}".format(*embedding.shape))

    print("dumping to files...")
    with open(vocab_file, 'wb') as outfile:
        pickle.dump(v, outfile)
    np.save(emb_file, embedding)
    print("all done.")
Example No. 9
helper.print_config(opt)
id2label = dict([(v,k) for k,v in constant.LABEL_TO_ID.items()])

predictions = []
all_probs = []
for i, b in enumerate(batch):
    preds, probs, _ = model.predict(b)
    predictions += preds
    all_probs += probs
predictions = [id2label[p] for p in predictions]
p, r, f1 = scorer.score(batch.gold(), predictions, verbose=True)

# save probability scores
if len(args.out) > 0:
    helper.ensure_dir(os.path.dirname(args.out))
    with open(args.out, 'wb') as outfile:
        pickle.dump(all_probs, outfile)
    print("Prediction scores saved to {}.".format(args.out))

print("Evaluation ended.")









Exemplo n.º 10
0
NER.build_vocab(dataset_vocab)
PST.build_vocab(dataset_vocab)

opt["num_class"] = len(RELATION.vocab)
opt["vocab_pad_id"] = TOKEN.vocab.stoi["<pad>"]
opt["pos_pad_id"] = POS.vocab.stoi["<pad>"]
opt["ner_pad_id"] = NER.vocab.stoi["<pad>"]
opt["pe_pad_id"] = PST.vocab.stoi["<pad>"]
opt["vocab_size"] = len(TOKEN.vocab)
opt["pos_size"] = len(POS.vocab)
opt["ner_size"] = len(NER.vocab)
opt["pe_size"] = len(PST.vocab)
opt["rel_stoi"] = RELATION.vocab.stoi
opt["rel_itos"] = RELATION.vocab.itos

helper.ensure_dir(opt["p_dir"], verbose=True)
helper.ensure_dir(opt["s_dir"], verbose=True)

TOKEN.vocab.load_vectors("glove.840B.300d", cache="./dataset/.vectors_cache")
if TOKEN.vocab.vectors is not None:
    opt["emb_dim"] = TOKEN.vocab.vectors.size(1)


def load_best_model(model_dir, model_type="predictor"):
    model_file = model_dir + "/best_model.pt"
    print("Loading model from {}".format(model_file))
    model_opt = torch_utils.load_config(model_file)
    if model_type == "predictor":
        predictor = Predictor(model_opt)
        model = Trainer(model_opt, predictor, model_type=model_type)
    else:
Example No. 11
def main():
    args = parse_args()

    # input files
    train_file = args.data_dir + '/train.json'
    dev_file = args.data_dir + '/dev.json'
    test_file = args.data_dir + '/test.json'
    schema_file = args.data_dir + '/schemas.json'
    wv_file = args.emb_dir + '/' + args.wv_file
    wv_dim = args.wv_dim

    # output files
    helper.ensure_dir(args.vocab_dir)
    vocab_file = args.vocab_dir + '/vocab.pkl'
    char_file = args.vocab_dir + '/chars.json'
    emb_file = args.vocab_dir + '/embedding.npy'

    # load files
    print("loading files...")
    train_tokens = load_tokens(train_file)
    dev_tokens = load_tokens(dev_file)
    test_tokens = load_tokens(test_file)
    if args.lower:
        train_tokens, dev_tokens, test_tokens = [[t.lower() for t in tokens] for tokens in\
                (train_tokens, dev_tokens, test_tokens)]

    # load glove
    print("loading glove...")
    glove_vocab = vocab.load_glove_vocab(wv_file, wv_dim)
    print("{} words loaded from glove.".format(len(glove_vocab)))

    print("building vocab...")
    v = build_vocab(train_tokens, glove_vocab, args.min_freq)

    print("calculating oov...")
    datasets = {'train': train_tokens, 'dev': dev_tokens, 'test': test_tokens}
    for dname, d in datasets.items():
        total, oov = count_oov(d, v)
        print("{} oov: {}/{} ({:.2f}%)".format(dname, oov, total,
                                               oov * 100.0 / total))

    print("building embeddings...")
    embedding = vocab.build_embedding(wv_file, v, wv_dim)
    print("embedding size: {} x {}".format(*embedding.shape))

    print("dumping embeddings to files...")
    with open(vocab_file, 'wb') as outfile:
        pickle.dump(v, outfile)
    np.save(emb_file, embedding)
    # print("all done.")

    print("building schemas...")
    all_schemas = set()
    subj_type = set()
    obj_type = set()
    min_count = 2
    pos_tags = set()
    chars = defaultdict(int)
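    # scan the training file to collect relation schemas, entity types, POS tags and character frequencies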
    with open(train_file) as f:
        a = json.load(f)
        for ins in a:
            for spo in ins['spo_details']:
                all_schemas.add(spo[3])
                subj_type.add(spo[2])
                obj_type.add(spo[6])
            for pos in ins['pos_tags']:
                pos_tags.add(pos)
            for token in ins['tokens']:
                for char in token:
                    chars[char] += 1
    id2predicate = {i + 1: j for i, j in enumerate(all_schemas)}  # 0 denotes the termination class
    predicate2id = {j: i for i, j in id2predicate.items()}

    id2subj_type = {i + 1: j for i, j in enumerate(subj_type)}  # 0 denotes the termination class
    subj_type2id = {j: i for i, j in id2subj_type.items()}

    id2obj_type = {i + 1: j for i, j in enumerate(obj_type)}  # 0 denotes the termination class
    obj_type2id = {j: i for i, j in id2obj_type.items()}

    with codecs.open(schema_file, 'w', encoding='utf-8') as f:
        json.dump([
            id2predicate, predicate2id, id2subj_type, subj_type2id,
            id2obj_type, obj_type2id
        ],
                  f,
                  indent=4,
                  ensure_ascii=False)

    print("dumping chars to files...")
    with codecs.open(char_file, 'w', encoding='utf-8') as f:
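        # keep only characters that appear at least min_count times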
        chars = {i: j for i, j in chars.items() if j >= min_count}
        id2char = {i + 2: j for i, j in enumerate(chars)}  # padding: 0, unk: 1
        char2id = {j: i for i, j in id2char.items()}
        id2pos = {i + 2: j
                  for i, j in enumerate(pos_tags)}  # padding: 0, unk: 1
        pos2id = {j: i for i, j in id2pos.items()}
        json.dump([id2char, char2id, id2pos, pos2id],
                  f,
                  indent=4,
                  ensure_ascii=False)
Example No. 12
def train_unbiased_model(args, biased_batch_probs):
    # make opt
    opt = vars(args)
    opt["num_class"] = len(constant.LABEL_TO_ID)

    # load vocab
    vocab_file = opt['vocab_dir'] + '/vocab.pkl'
    vocab = Vocab(vocab_file, load=True)
    opt['vocab_size'] = vocab.size
    emb_file = opt['vocab_dir'] + '/embedding.npy'
    emb_matrix = np.load(emb_file)
    assert emb_matrix.shape[0] == vocab.size
    assert emb_matrix.shape[1] == opt['emb_dim']

    # load data
    print("Loading data from {} with batch size {}...".format(
        opt["data_dir"], opt["batch_size"]))
    train_batch = DataLoader(
        opt["data_dir"] + "/" + args.data_name,
        opt["batch_size"],
        opt,
        vocab,
        evaluation=False,
    )
    dev_batch = DataLoader(opt["data_dir"] + "/dev.json",
                           opt["batch_size"],
                           opt,
                           vocab,
                           evaluation=True)

    model_id = opt["id"] if len(opt["id"]) > 1 else "0" + opt["id"]
    model_save_dir = opt["save_dir"] + "/" + model_id
    opt["model_save_dir"] = model_save_dir
    helper.ensure_dir(model_save_dir, verbose=True)

    # save config
    helper.save_config(opt, model_save_dir + "/config.json", verbose=True)
    vocab.save(model_save_dir + "/vocab.pkl")
    file_logger = helper.FileLogger(
        model_save_dir + "/" + opt["log"],
        header="# epoch\ttrain_loss\tdev_loss\tdev_f1")

    # print model info
    helper.print_config(opt)

    # model
    model = RelationModel(opt, emb_matrix=emb_matrix)

    id2label = dict([(v, k) for k, v in constant.LABEL_TO_ID.items()])
    dev_f1_history = []
    current_lr = opt["lr"]

    global_step = 0
    global_start_time = time.time()
    format_str = (
        "{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}"
    )
    max_steps = len(train_batch) * opt["num_epoch"]

    # start training
    for epoch in range(1, opt["num_epoch"] + 1):
        train_loss = 0
        for i, batch in enumerate(train_batch):
            start_time = time.time()
            global_step += 1
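            # hand the biased model's probabilities for this batch to the update step (presumably to reweight the loss for debiasing)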
            loss = model.update(batch,
                                torch.tensor(biased_batch_probs[i]).cuda())
            train_loss += loss
            if global_step % opt["log_step"] == 0:
                duration = time.time() - start_time
                print(
                    format_str.format(
                        datetime.now(),
                        global_step,
                        max_steps,
                        epoch,
                        opt["num_epoch"],
                        loss,
                        duration,
                        current_lr,
                    ))

        # eval on dev
        print("Evaluating on dev set...")
        predictions = []
        dev_loss = 0
        for i, batch in enumerate(dev_batch):
            preds, _, loss = model.predict(batch)
            predictions += preds
            dev_loss += loss
        predictions = [id2label[p] for p in predictions]
        dev_p, dev_r, dev_f1 = scorer.score(dev_batch.gold(), predictions)

        with open("label.txt", "w+") as f:
            f.write(str(dev_batch.gold()))

        train_loss = (train_loss / train_batch.num_examples * opt["batch_size"]
                      )  # avg loss per batch
        dev_loss = dev_loss / dev_batch.num_examples * opt["batch_size"]
        print(
            "epoch {}: train_loss = {:.6f}, dev_loss = {:.6f}, dev_f1 = {:.4f}"
            .format(epoch, train_loss, dev_loss, dev_f1))
        file_logger.log("{}\t{:.6f}\t{:.6f}\t{:.4f}".format(
            epoch, train_loss, dev_loss, dev_f1))

        # save
        model_file = model_save_dir + "/checkpoint_epoch_{}.pt".format(epoch)
        model.save(model_file, epoch)
        if epoch == 1 or dev_f1 > max(dev_f1_history):
            copyfile(model_file, model_save_dir + "/best_model.pt")
            print("new best model saved.")
        if epoch % opt["save_epoch"] != 0:
            os.remove(model_file)

        # lr schedule
        if (len(dev_f1_history) > 10 and dev_f1 <= dev_f1_history[-1]
                and opt["optim"] in ["sgd", "adagrad"]):
            current_lr *= opt["lr_decay"]
            model.update_lr(current_lr)

        dev_f1_history += [dev_f1]
        print("")

    print("Training ended with {} epochs.".format(epoch))
Example No. 13
def trainmodel(config=None):
    if config is not None:
        args.batch_size = config["bsz"]
        args.seed = config["npseed"]
        args.npseed = config["npseed"]
        args.input_dropout = config["inp_drop"]

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    helper.print_arguments(args)

    train_batch, valid_batch, test_batch = get_dataloaders(args, vocab)

    trainer = ABSATrainer(args, emb_matrix=word_emb)
    print(trainer.model)
    print("Total parameters:", _totally_parameters(trainer.model))

    best_path = args.save_dir
    helper.ensure_dir(best_path, verbose=True)

    print("Training Set: {}".format(len(train_batch)))
    print("Valid Set: {}".format(len(valid_batch)))
    print("Test Set: {}".format(len(test_batch)))

    train_acc_history, train_loss_history = [], []
    val_acc_history, val_loss_history, val_f1_score_history = [0.0], [0.0], [0.0]
    patience = 0
    epoch = 0

    for _ in range(1, args.num_epoch + 1):
        epoch += 1
        print("Epoch {}".format(epoch) + "-" * 60)
        train_loss, train_acc, train_step = 0.0, 0.0, 0
        for i, batch in enumerate(train_batch):
            loss, acc = trainer.update(batch)
            train_loss += loss
            train_acc += acc
            train_step += 1
            if train_step % args.log_step == 0:
                print("{}/{} train_loss: {:.6f}, train_acc: {:.6f}".format(
                    i, len(train_batch), train_loss / train_step,
                    train_acc / train_step))
        val_loss, val_acc, val_f1 = evaluate(trainer, valid_batch)

        print(
            "End of {} train_loss: {:.4f}, train_acc: {:.4f}, val_loss: {:.4f}, val_acc: {:.4f}, f1_score: {:.4f}"
            .format(
                epoch,
                train_loss / train_step,
                train_acc / train_step,
                val_loss,
                val_acc,
                val_f1,
            ))

        train_acc_history.append(train_acc / train_step)
        train_loss_history.append(train_loss / train_step)
        val_loss_history.append(val_loss)

        # save best model
        if epoch == 1 or float(val_acc) > max(val_acc_history):
            patience = 0
            torch.save(trainer, best_path + '/best_checkpoint.pt')
            print("new best model saved.")
        else:
            patience += 1

        val_acc_history.append(float(val_acc))
        val_f1_score_history.append(val_f1)

        if patience >= 20:
            print('Reach the max patience, stopping...')
            break

    print("Training ended with {} epochs.".format(epoch))

    # bt_val_acc = max(val_acc_history)
    # bt_val_idx = val_acc_history.index(bt_val_acc)
    # bt_val_f1 = val_f1_score_history[bt_val_idx]
    # bt_val_loss = val_loss_history[bt_val_idx]

    # print(
    #     "Training Summary: Best best_acc_epoch:{}, val_loss:{}, val_acc:{}, val_f1:{}".format(
    #         bt_val_idx, bt_val_loss, bt_val_acc, bt_val_f1
    #     )
    # )
    print("Loading best checkpoints from", best_path + '/best_checkpoint.pt')
    trainer = torch.load(best_path + '/best_checkpoint.pt')
    test_loss, test_acc, test_f1 = evaluate(trainer, test_batch)
    print("Evaluation Results: test_loss:{}, test_acc:{}, test_f1:{}".format(
        test_loss, test_acc, test_f1))
Example No. 14
opt['save_dir'] = "saved_models/" + opt['type'] + "/"
opt['res_dir'] = "result/" + opt['type'] + "/"
label2id = get_current_label2id(opt)
if opt['type'] == 'multi':
    opt['save_dir'] = "saved_models/" + opt['type'] + "/" + opt['coarse_name']
    opt['res_dir'] = "result/" + opt['type'] + "/" + opt['coarse_name'] + "/"
else:
    opt['coarse_name'] = ''
opt['num_class'] = len(label2id)

# print opt
helper.print_config(opt)
id2label = dict([(v, k) for k, v in label2id.items()])

# model save dir
helper.ensure_dir(opt['save_dir'], verbose=True)
helper.ensure_dir(opt['res_dir'], verbose=True)

# save config
helper.save_config(opt,
                   os.path.join(opt['save_dir'], 'config.json'),
                   verbose=True)
file_logger = helper.FileLogger(
    opt['save_dir'] + '/' + opt['log'],
    header="# epoch\ttrain_loss\tdev_loss\ttrain_ACC\ttest_ACC\tF1")

# load data
if opt['type'] == 'multi':
    # split train set into new train set and test set, used in the second level
    split_save_dir = 'dataset/multi/' + opt['coarse_name']
    helper.ensure_dir(split_save_dir)
Example No. 15
random.seed(1234)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

# make opt
opt = vars(args)
label2id = constant.LABEL_TO_ID
opt['num_class'] = len(label2id)

# print opt
helper.print_config(opt)
label2id = constant.LABEL_TO_ID
id2label = dict([(v, k) for k, v in label2id.items()])

# model save dir
helper.ensure_dir(opt['save_dir'], verbose=True)

# save config
helper.save_config(opt, opt['save_dir'] + '/config.json', verbose=True)
file_logger = helper.FileLogger(
    opt['save_dir'] + '/' + opt['log'],
    header="# epoch\ttrain_loss\tdev_loss\ttrain_ACC\ttest_ACC\tF1")

# load data
print("Loading data from {} with batch size {} ...".format(
    opt['data_dir'], opt['batch_size']))
train_batch = DataLoader(opt['data_dir'] + '/train.tsv', opt['batch_size'],
                         opt)
test_batch = DataLoader(opt['data_dir'] + '/test.tsv', opt['batch_size'], opt)

# build model
Example No. 16
def main():
    args = parse_args()

    # input files
    train_file = args.data_dir + '/train.jsonl'
    dev_file = args.data_dir + '/testa.jsonl'
    test_file = args.data_dir + '/testb.jsonl'
    wv_file = args.glove_dir + '/' + args.wv_file
    wv_dim = args.wv_dim

    # output files
    helper.ensure_dir(args.vocab_dir)
    vocab_file = args.vocab_dir + '/vocab.pkl'
    char_vocab_file = args.vocab_dir + '/vocab_char.pkl'
    emb_file = args.vocab_dir + '/embedding.npy'

    # load files
    print("loading files...")
    train_tokens, train_chars = load_tokens(train_file)
    dev_tokens, dev_chars = load_tokens(dev_file)
    test_tokens, test_chars = load_tokens(test_file)
    if args.lower:
        train_tokens, dev_tokens, test_tokens = [[t.lower() for t in tokens] for tokens in\
                (train_tokens, dev_tokens, test_tokens)]
    if args.char_lower and train_chars:
        train_chars, dev_chars, test_chars = [[c.lower() for c in chars] for chars in\
            (train_chars, dev_chars, test_chars)]

    # load glove
    print("loading glove...")
    glove_vocab = vocab.load_glove_vocab(wv_file, wv_dim)
    print("{} words loaded from glove.".format(len(glove_vocab)))

    print("building vocab...")
    if args.all:
        all_tokens = train_tokens + dev_tokens + test_tokens
    else:
        all_tokens = train_tokens
    v = build_vocab(all_tokens, glove_vocab, args.min_freq)

    if train_chars:
        print("building vocab for chars...")
        all_chars = train_chars + dev_chars + test_chars
        char_counter = Counter(all_chars)
        #char_vocab = constant.VOCAB_PREFIX + sorted(char_counter.keys(), key=char_counter.get, reverse=True)
        char_vocab = constant.VOCAB_PREFIX + sorted(list(char_counter.keys()))
        print("vocab built with {} chars.".format(len(char_vocab)))
    else:
        char_vocab = None

    print("calculating oov...")
    datasets = {'train': train_tokens, 'dev': dev_tokens, 'test': test_tokens}
    for dname, d in datasets.items():
        total, oov = count_oov(d, v)
        print("{} oov: {}/{} ({:.2f}%)".format(dname, oov, total,
                                               oov * 100.0 / total))

    print("building embeddings...")
    if args.random:
        print("using random initialization...")
        embedding = random_embedding(v, wv_dim)
    else:
        embedding = vocab.build_embedding(wv_file, v, wv_dim)
    print("embedding size: {} x {}".format(*embedding.shape))

    print("dumping to files...")
    with open(vocab_file, 'wb') as outfile:
        pickle.dump(v, outfile)
    if char_vocab:
        with open(char_vocab_file, 'wb') as outfile:
            pickle.dump(char_vocab, outfile)
    np.save(emb_file, embedding)
    print("all done.")
Example No. 17
def train_model(vocab_params,
                train_params,
                train_batch,
                dev_batch,
                model_id=-1):
    torch.manual_seed(train_params.seed)
    np.random.seed(train_params.seed)
    random.seed(train_params.seed)

    if train_params.cpu:
        train_params.cuda = False
    elif train_params.cuda:
        torch.cuda.manual_seed(train_params.seed)

    # make opt
    opt = vars(vocab_params)

    print(constant.LABEL_TO_ID)
    print(opt)
    opt['num_class'] = len(constant.LABEL_TO_ID)
    #     Combine all the parameters together
    opt.update(vars(train_params))

    # load vocab
    vocab_file = opt['vocab_dir'] + '/vocab.pkl'
    vocab = Vocab(vocab_file, load=True)
    opt['vocab_size'] = vocab.size
    emb_file = opt['vocab_dir'] + '/embedding.npy'
    emb_matrix = np.load(emb_file)
    assert emb_matrix.shape[0] == vocab.size
    assert emb_matrix.shape[1] == opt['emb_dim']

    if (model_id == -1):
        model_id = opt['id'] if len(opt['id']) > 1 else '0' + opt['id']
    model_save_dir = opt['save_dir'] + '/' + model_id
    opt['model_save_dir'] = model_save_dir
    helper.ensure_dir(model_save_dir, verbose=True)

    # save config
    helper.save_config(opt, model_save_dir + '/config.json', verbose=True)
    vocab.save(model_save_dir + '/vocab.pkl')
    file_logger = helper.FileLogger(
        model_save_dir + '/' + opt['log'],
        header="# epoch\ttrain_loss\tdev_loss\tdev_f1")

    # print model info
    helper.print_config(opt)

    # model
    model = RelationModel(opt, emb_matrix=emb_matrix)

    id2label = dict([(v, k) for k, v in constant.LABEL_TO_ID.items()])
    dev_f1_history = []
    current_lr = opt['lr']

    global_step = 0
    global_start_time = time.time()
    format_str = '{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'
    max_steps = len(train_batch) * opt['num_epoch']

    # start training
    for epoch in range(1, opt['num_epoch'] + 1):
        train_loss = 0
        for i, batch in enumerate(train_batch):
            start_time = time.time()
            global_step += 1
            loss = model.update(batch)
            train_loss += loss
            if global_step % opt['log_step'] == 0:
                duration = time.time() - start_time
                print(format_str.format(datetime.now(), global_step, max_steps, epoch,\
                        opt['num_epoch'], loss, duration, current_lr))

        # eval on dev
        print("Evaluating on dev set...")
        predictions = []
        dev_loss = 0
        for i, batch in enumerate(dev_batch):
            preds, _, loss = model.predict(batch)
            predictions += preds
            dev_loss += loss
        predictions = [id2label[p] for p in predictions]
        dev_p, dev_r, dev_f1 = scorer.score(dev_batch.gold(), predictions)

        train_loss = train_loss / train_batch.num_examples * opt[
            'batch_size']  # avg loss per batch
        dev_loss = dev_loss / dev_batch.num_examples * opt['batch_size']
        print("epoch {}: train_loss = {:.6f}, dev_loss = {:.6f}, dev_f1 = {:.4f}".format(epoch,\
                train_loss, dev_loss, dev_f1))
        file_logger.log("{}\t{:.6f}\t{:.6f}\t{:.4f}".format(
            epoch, train_loss, dev_loss, dev_f1))

        # save
        model_file = model_save_dir + '/checkpoint_epoch_{}.pt'.format(epoch)
        model.save(model_file, epoch)
        if epoch == 1 or dev_f1 > max(dev_f1_history):
            copyfile(model_file, model_save_dir + '/best_model.pt')
            print("new best model saved.")
        if epoch % opt['save_epoch'] != 0:
            os.remove(model_file)

        # lr schedule
        if len(dev_f1_history) > 10 and dev_f1 <= dev_f1_history[-1] and \
                opt['optim'] in ['sgd', 'adagrad']:
            current_lr *= opt['lr_decay']
            model.update_lr(current_lr)

        dev_f1_history += [dev_f1]
        print("")

    print("Training ended with {} epochs.".format(epoch))
Example No. 18
def main():
    args = parse_args()

    # input files
    train_file = args.data_dir + "/rationale_train.json"
    dev_file = args.data_dir + "/rationale_dev.json"
    un_file = args.data_dir + "/rationale_un.json"
    wl_file = args.data_dir + "/rationale_wl.json"
    cts_file = args.data_dir + "/rationale_cts.json"
    bc_file = args.data_dir + "/rationale_bc.json"
    wv_file = args.glove_dir + "/" + args.wv_file
    wv_dim = args.wv_dim

    # output files
    helper.ensure_dir(args.vocab_dir)
    vocab_file = args.vocab_dir + "/vocab.pkl"
    emb_file = args.vocab_dir + "/embedding.npy"

    # load files
    print("loading files...")
    train_tokens = load_tokens(train_file)
    dev_tokens = load_tokens(dev_file)
    un_tokens = load_tokens(un_file)
    wl_tokens = load_tokens(wl_file)
    cts_tokens = load_tokens(cts_file)
    bc_tokens = load_tokens(bc_file)

    if args.lower:
        train_tokens, dev_tokens, un_tokens, wl_tokens, cts_tokens, bc_tokens = [
            [t.lower() for t in tokens] for tokens in (
                train_tokens,
                dev_tokens,
                un_tokens,
                wl_tokens,
                cts_tokens,
                bc_tokens,
            )
        ]

    # load glove
    print("loading glove...")
    glove_vocab = vocab.load_glove_vocab(wv_file, wv_dim)
    print("{} words loaded from glove.".format(len(glove_vocab)))

    print("building vocab...")
    v = build_vocab(train_tokens, glove_vocab, args.min_freq)

    print("calculating oov...")
    datasets = {
        "train": train_tokens,
        "dev": dev_tokens,
        "un": un_tokens,
        "wl": wl_tokens,
        "cts": cts_tokens,
        "bc": bc_tokens,
    }
    for dname, d in datasets.items():
        total, oov = count_oov(d, v)
        print("{} oov: {}/{} ({:.2f}%)".format(dname, oov, total,
                                               oov * 100.0 / total))

    print("building embeddings...")
    embedding = vocab.build_embedding(wv_file, v, wv_dim)
    print("embedding size: {} x {}".format(*embedding.shape))

    print("dumping to files...")
    with open(vocab_file, "wb") as outfile:
        pickle.dump(v, outfile)
    np.save(emb_file, embedding)
    print("all done.")
Example No. 19
def main():
    # set top-level random seeds
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    if args.cpu:
        args.cuda = False
    elif args.cuda:
        # force random seed for reproducibility
        # also apply same seed to numpy in every file
        torch.backends.cudnn.deterministic = True
        torch.cuda.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    # make opt
    opt = vars(args)
    opt['num_class'] = len(constant.LABEL_TO_ID)

    # load vocab
    vocab_file = opt['vocab_dir'] + '/vocab.pkl'
    vocab = Vocab(vocab_file, load=True)

    # in some previous experiments we saw that lower vocab size can improve performance
    # but it was in a completely different project although on the same data
    # here it seems it's much harder to get this to work
    # uncomment the following line if this is solved:
    # new_vocab_size = 30000

    opt['vocab_size'] = vocab.size
    emb_file = opt['vocab_dir'] + '/embedding.npy'
    emb_matrix = np.load(emb_file)
    assert emb_matrix.shape[0] == vocab.size
    assert emb_matrix.shape[1] == opt['emb_dim']

    # load data
    print("Loading data from {} with batch size {}...".format(
        opt['data_dir'], opt['batch_size']))
    train_batch = DataLoader(opt['data_dir'] + '/train.json',
                             opt['batch_size'],
                             opt,
                             vocab,
                             evaluation=False)
    dev_batch = DataLoader(opt['data_dir'] + '/dev.json',
                           opt['batch_size'],
                           opt,
                           vocab,
                           evaluation=True)

    model_id = opt['id'] if len(opt['id']) > 1 else '0' + opt['id']
    model_save_dir = opt['save_dir'] + '/' + model_id
    opt['model_save_dir'] = model_save_dir
    helper.ensure_dir(model_save_dir, verbose=True)

    # save config
    helper.save_config(opt, model_save_dir + '/config.json', verbose=True)
    vocab.save(model_save_dir + '/vocab.pkl')
    file_logger = helper.FileLogger(
        model_save_dir + '/' + opt['log'],
        header="# epoch\ttrain_loss\tdev_loss\tdev_p\tdev_r\tdev_f1")

    # print model info
    helper.print_config(opt)

    # model
    model = RelationModel(opt, emb_matrix=emb_matrix)

    id2label = dict([(v, k) for k, v in constant.LABEL_TO_ID.items()])
    dev_f1_history = []
    current_lr = opt['lr']

    global_step = 0

    format_str = '{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'
    max_steps = len(train_batch) * opt['num_epoch']

    # setup the scheduler for lr decay
    # this doesn't seem to work well compared to what we already have
    # scheduler = ReduceLROnPlateau(model.optimizer, mode='min', factor=opt['lr_decay'], patience=1)

    # start training
    for epoch in range(1, opt['num_epoch'] + 1):
        # TODO: if lr warmup is used, the lr console output is not updated
        print(
            "Current params: " + " heads-" + str(opt["n_head"]) +
            " enc_layers-" + str(opt["num_layers_encoder"]),
            " drop-" + str(opt["dropout"]) + " scaled_drop-" +
            str(opt["scaled_dropout"]) + " lr-" + str(opt["lr"]),
            " lr_decay-" + str(opt["lr_decay"]) + " max_grad_norm-" +
            str(opt["max_grad_norm"]))
        print(
            " weight_no_rel-" + str(opt["weight_no_rel"]) + " weight_rest-" +
            str(opt["weight_rest"]) + " attn-" + str(opt["attn"]) +
            " attn_dim-" + str(opt["attn_dim"]),
            " obj_sub_pos-" + str(opt["obj_sub_pos"]) + " new_residual-" +
            str(opt["new_residual"]))
        print(
            " use_batch_norm-" + str(opt["use_batch_norm"]) +
            " relative_positions-" + str(opt["relative_positions"]),
            " decay_epoch-" + str(opt["decay_epoch"]) + " use_lemmas-" +
            str(opt["use_lemmas"]), " hidden_self-" + str(opt["hidden_self"]))

        train_loss = 0
        for i, batch in enumerate(train_batch):

            start_time = time.time()
            global_step += 1

            loss = model.update(batch)
            train_loss += float(loss)

            if global_step % opt['log_step'] == 0:
                duration = time.time() - start_time
                print(
                    format_str.format(datetime.now(), global_step, max_steps,
                                      epoch, opt['num_epoch'], loss, duration,
                                      current_lr))
            # do garbage collection,
            # as per https://discuss.pytorch.org/t/best-practices-for-maximum-gpu-utilization/13863/6
            del loss

        # eval on dev
        print("Evaluating on dev set...")
        predictions = []
        dev_loss = 0
        for i, batch in enumerate(dev_batch):
            preds, _, loss = model.predict(batch)
            predictions += preds
            dev_loss += float(loss)
            del loss

        predictions = [id2label[p] for p in predictions]
        dev_p, dev_r, dev_f1 = scorer.score(dev_batch.gold(), predictions)

        train_loss = train_loss / train_batch.num_examples * opt[
            'batch_size']  # avg loss per batch
        dev_loss = dev_loss / dev_batch.num_examples * opt['batch_size']
        print(
            "epoch {}: train_loss = {:.6f}, dev_loss = {:.6f}, dev_f1 = {:.4f}".format(epoch, \
                                                                                       train_loss, dev_loss, dev_f1)
        )
        file_logger.log("{}\t{:.6f}\t{:.6f}\t{:.4f}\t{:.4f}\t{:.4f}".format(
            epoch, train_loss, dev_loss, dev_p, dev_r, dev_f1))

        # save
        model_file = model_save_dir + '/checkpoint_epoch_{}.pt'.format(epoch)
        model.save(model_file, epoch)
        if epoch == 1 or dev_f1 > max(dev_f1_history):
            copyfile(model_file, model_save_dir + '/best_model.pt')
            print("new best model saved.")
        if epoch % opt['save_epoch'] != 0:
            os.remove(model_file)

        # reduce learning rate if it stagnates by a certain decay rate and within given epoch patience
        # this for some reason works worse than the implementation we have afterwards
        # scheduler.step(dev_loss)

        if opt["optim"] != "noopt_adam" and opt["optim"] != "noopt_nadam":

            # do warm_up_for sgd only instead of adam
            do_warmup_trick = False

            if do_warmup_trick:
                # print("do_warmup_trick")

                # 1 and 5 first worked kind of
                # 10 and 15
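                # epoch-level take on the Noam/Transformer warmup schedule (360 playing the model-dim role, 15 the warmup steps)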
                current_lr = 10 * (360**(-0.5) *
                                   min(epoch**(-0.5), epoch * 15**(-1.5)))
                # print("current_lr", current_lr)
                model.update_lr(current_lr)

            else:
                # decay schedule # 15 is best!
                # simulate patience of x epochs
                if len(dev_f1_history) > opt['decay_epoch'] and dev_f1 <= dev_f1_history[-1]:
                    current_lr *= opt['lr_decay']
                    model.update_lr(current_lr)

        # else, update the learning rate in torch_utils.py

        dev_f1_history += [dev_f1]
        print("")

    print("Training ended with {} epochs.".format(epoch))
Example No. 20
            errors[i]["prop_mentions"] = (
                errors[i]["real_mentions"] / len(errors[i]["mentions"])
                if len(errors[i]["mentions"]) > 0
                else 1
            )


# Convert the bootleg_emmental entity QIDs to the wikidata mentions
def save_csv(obj, name):
    cols = obj[0].keys()
    print(cols)
    with open(name + ".csv", "w") as f:
        w = csv.DictWriter(f, fieldnames=cols)
        w.writeheader()
        for k, v in obj.items():
            w.writerow(v)
    print("Wrote to file!")


# save probability scores
if len(args.out) > 0:
    helper.ensure_dir(args.out)
    save_csv(errors, "{}/{}_{}".format(args.out, timestamp, args.dataset))
    filename_probs = "{}/{}_{}".format(args.out, timestamp, "probs.pkl")
    with open(filename_probs, "wb") as outfile:
        pickle.dump(all_probs, outfile)
    print("Prediction scores saved to {}.".format(args.out))

print("Evaluation ended.")
Example No. 21
    torch.backends.cudnn.benchmark = False


seed = args.seed
seed_everything(seed)

if args.cpu:
    args.use_cuda = False
elif args.use_cuda:
    torch.cuda.manual_seed(args.seed)

opt = vars(args)

# print model info
helper.print_config(opt)
helper.ensure_dir(opt["model_save_dir"], verbose=True)
# save model config
helper.save_config(opt,
                   opt["model_save_dir"] + "/" + opt["id"] + '.config',
                   verbose=True)
# record training log
file_logger = helper.FileLogger(
    opt["model_save_dir"] + '/' + opt['id'] + ".log",
    header="# epoch\ttrain_loss\tprecision5\tNDCG5\tMAP5\tprecision7"
    "\tNDCG7\tMAP7\tprecision10\tNDCG10\tMAP10")

preprocess = Preprocess(opt)
print("Preprocess is done.")
print("Create model TaNP...")

opt['uf_dim'] = preprocess.uf_dim
Example No. 22
def transre_search(ffn, connect, hidden_dim, trans_layers, multi_heads,
                   ffn_ex_size, initial, final):
    opt['weighted'] = False
    opt['rnn'] = False
    opt['ffn'] = ffn
    opt['connect'] = connect
    opt['hidden_dim'] = hidden_dim
    opt['trans_layers'] = trans_layers
    opt['multi_heads'] = multi_heads
    opt['ffn_ex_size'] = ffn_ex_size
    opt['initial'] = initial
    opt['final'] = final

    id = opt['id'] if len(opt['id']) > 1 else '0' + opt['id']
    model_name = str(opt['optim']) + '_' + str(opt['lr']) + str(ffn) + '_' + str(connect) + '_' \
                 + str(hidden_dim) + '_' + str(trans_layers) + '_' + str(multi_heads) + '_' \
                 + str(ffn_ex_size) + '_' + str(initial) + '_' + str(final)
    model_name = model_name + str(opt['memo'])

    model_name = str(id) + "_" + model_name

    model_save_dir = opt['save_dir'] + '/' + model_name
    opt['model_save_dir'] = model_save_dir
    helper.ensure_dir(model_save_dir, verbose=True)

    # save config
    helper.save_config(opt, model_save_dir + '/config.json', verbose=True)
    vocab.save(model_save_dir + '/vocab.pkl')
    file_logger = helper.FileLogger(
        model_save_dir + '/' + opt['log'],
        header="# epoch\ttrain_loss\tdev_loss\tdev_score\tbest_dev_score")
    helper.print_config(opt)

    if not opt['load']:
        trainer = TransTrainer(opt, emb_matrix=emb_matrix)
    else:
        # load pre-train model
        model_file = opt['model_file']
        print("Loading model from {}".format(model_file))
        model_opt = torch_utils.load_config(model_file)
        model_opt['optim'] = opt['optim']
        trainer = TransTrainer(model_opt)
        trainer.load(model_file)

    id2label = dict([(v, k) for k, v in label2id.items()])  # the classification result
    dev_score_history = []
    dev_loss_history = []
    current_lr = opt['lr']

    global_step = 0
    format_str = '{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'
    max_steps = len(train_batch) * opt['num_epoch']

    best_result = "unknown"
    file_logger.log(str(opt['memo']))
    for epoch in range(1, opt['num_epoch'] + 1):
        train_loss = 0
        epoch_start_time = time.time()
        for i, batch in enumerate(train_batch):
            start_time = time.time()
            global_step += 1
            loss, norm = trainer.update(batch)
            train_loss += loss
            if global_step % opt['log_step'] == 0:
                duration = time.time() - start_time
                print(
                    format_str.format(datetime.now(), global_step, max_steps,
                                      epoch, opt['num_epoch'], loss, duration,
                                      current_lr))

        print("Evaluating on dev set...")
        predictions = []
        dev_loss = 0
        for i, batch in enumerate(dev_batch):
            preds, _, loss, _ = trainer.predict(batch)
            predictions += preds
            dev_loss += loss
        predictions = [id2label[p] for p in predictions]
        train_loss = train_loss / train_batch.num_examples * opt[
            'batch_size']  # avg loss per batch
        dev_loss = dev_loss / dev_batch.num_examples * opt['batch_size']

        acc, dev_p, dev_r, dev_f1 = scorer.score(dev_batch.gold(), predictions)
        print(
            "epoch {}: train_loss = {:.6f}, dev_loss = {:.6f}, dev_f1 = {:.4f}"
            .format(epoch, train_loss, dev_loss, dev_f1))
        dev_score = dev_f1
        file_logger.log("{}\t{:.3f}\t{:.6f}\t{:.6f}\t{:.4f}\t{:.4f}".format(
            epoch, acc, train_loss, dev_loss, dev_score,
            max([dev_score] + dev_score_history)))

        # save
        model_file = model_save_dir + '/checkpoint_epoch_{}.pt'.format(epoch)
        trainer.save(model_file, epoch)

        if epoch == 1 or dev_score > max(dev_score_history):
            copyfile(model_file, model_save_dir + '/best_model.pt')
            best_result = (model_name, dev_score)
            print("new best model saved.")
            file_logger.log(
                "new best model saved at epoch {}: {:.2f}\t{:.2f}\t{:.2f}".
                format(epoch, dev_p * 100, dev_r * 100, dev_score * 100))
        if epoch % opt['save_epoch'] != 0:
            os.remove(model_file)

        # lr schedule
        if len(dev_score_history) > opt['decay_epoch'] and dev_score <= dev_score_history[-1] \
                and opt['optim'] in ['sgd', 'adagrad', 'adadelta']:
            current_lr *= opt['lr_decay']
            trainer.update_lr(current_lr)

        dev_score_history += [dev_score]
        dev_loss_history += [dev_loss]
        epoch_end_time = time.time()
        print("epoch time {:.3f}".format(epoch_end_time - epoch_start_time))
    return best_result
Example No. 23
def main():
    args = parse_args()

    # input files
    train_file = args.data_dir + '/train.list'
    test_file = args.data_dir + '/test.list'
    wv_file = args.w2v_dir + '/' + args.wv_file
    wv_dim = args.wv_dim

    # output files
    helper.ensure_dir(args.vocab_dir)
    vocab_file = args.vocab_dir + '/vocab.pkl'
    emb_file = args.vocab_dir + '/embedding.npy'

    # load files
    print("loading files...")
    train_tokens = load_tokens(train_file)
    test_tokens = load_tokens(test_file)

    # load glove
    print("loading word vector...")
    glove_vocab = load_glove_vocab(wv_file, wv_dim)
    print("{} words loaded from glove.".format(len(glove_vocab)))

    print("building vocab...")
    v = build_vocab(train_tokens + test_tokens + constant.ASP_TOKEN,
                    glove_vocab)

    print("calculating oov...")
    datasets = {'train': train_tokens, 'test': test_tokens}
    for dname, d in datasets.items():
        total, oov = count_oov(d, v)
        print("{} oov: {}/{} ({:.2f}%)".format(dname, oov, total,
                                               oov * 100.0 / total))

    print("building embeddings...")
    embedding = build_embedding(wv_file, v, wv_dim)
    print("embedding size: {} x {}".format(*embedding.shape))

    print("building asp embeddings...")
    w2id = {w: i for i, w in enumerate(v)}
    ASP_TO_ID = constant.ASP_TO_ID
    asp_emb = np.random.uniform(-1, 1, (len(ASP_TO_ID), wv_dim))
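    # each aspect embedding is the mean of the word vectors of its jieba-segmented terms; OOV terms fall back to <UNK>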
    for key in ASP_TO_ID.keys():
        ts = list(jieba.cut(key))
        tmp = np.zeros(wv_dim)
        for t in ts:
            if t not in w2id:
                tmp += embedding[w2id['<UNK>']]
                print(t)
            else:
                tmp += embedding[w2id[t]]
        asp_emb[ASP_TO_ID[key]] = tmp / len(ts)
    print("embedding size: {} x {}".format(*asp_emb.shape))

    print("dumping to files...")
    with open(vocab_file, 'wb') as outfile:
        pickle.dump(v, outfile)
    np.save(emb_file, embedding)
    np.save(args.vocab_dir + '/asp_embedding.npy', asp_emb)
    print("all done.")
Example No. 24
    opt['data_dir'], opt['batch_size']))
train_batch = DataLoader(opt['data_dir'] + '/train.json',
                         opt['batch_size'],
                         opt,
                         vocab,
                         evaluation=False)
dev_batch = DataLoader(opt['data_dir'] + '/dev.json',
                       opt['batch_size'],
                       opt,
                       vocab,
                       evaluation=True)

model_id = opt['id'] if len(opt['id']) > 1 else '0' + opt['id']
model_save_dir = opt['save_dir'] + '/' + model_id
opt['model_save_dir'] = model_save_dir
helper.ensure_dir(model_save_dir, verbose=True)

# save config
helper.save_config(opt, model_save_dir + '/config.json', verbose=True)
vocab.save(model_save_dir + '/vocab.pkl')
file_logger = helper.FileLogger(
    model_save_dir + '/' + opt['log'],
    header="# epoch\ttrain_loss\tdev_loss\tdev_score\tbest_dev_score")

# print model info
helper.print_config(opt)

# model
if not opt['load']:
    trainer = GCNTrainer(opt, emb_matrix=emb_matrix)
else:
Example No. 25
def main():
    args = get_parser()

    # set seed and prepare for training
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
    if args.cpu:
        args.cuda = False
    elif args.cuda:
        torch.cuda.manual_seed(args.seed)
    init_time = time.time()

    # make opt
    opt = vars(args)
    TEXT, train_batch, dev_batch = load_data(opt['batch_size'],
                                             device='cuda:0')

    vocab = TEXT.vocab
    opt['vocab_size'] = len(vocab.stoi)
    emb_matrix = vocab.vectors

    assert emb_matrix.shape[0] == opt['vocab_size']
    assert emb_matrix.shape[1] == opt['emb_dim']

    model_id = opt['id'] if len(opt['id']) > 1 else '0' + opt['id']
    model_save_dir = opt['save_dir'] + '/' + str(model_id)
    opt['model_save_dir'] = model_save_dir
    helper.ensure_dir(model_save_dir, verbose=True)

    # save config
    path = os.path.join(model_save_dir, 'config.json')
    helper.save_config(opt, path, verbose=True)
    # vocab.save(os.path.join(model_save_dir, 'vocab.pkl'))
    file_logger = helper.FileLogger(
        os.path.join(model_save_dir, opt['log']),
        header="# epoch\ttrain_loss\tdev_loss\tdev_score\tbest_dev_score")

    # print model info
    helper.print_config(opt)

    # Build Model
    if not opt['load']:
        trainer = LSTMTrainer(opt, emb_matrix)
    else:
        model_file = opt['model_file']
        print("Loading model from {}".format(model_file))
        model_opt = torch_utils.load_config(model_file)
        model_opt['optim'] = opt['optim']
        trainer = LSTMTrainer(model_opt)
        trainer.load(model_file)

    dev_score_history = []
    current_lr = opt['lr']

    global_step = 0
    global_start_time = time.time()
    format_str = '{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'
    max_steps = len(train_batch) * opt['num_epoch']

    # start training
    for epoch in range(1, opt['num_epoch'] + 1):
        train_loss = 0
        for i, batch in enumerate(train_batch):
            start_time = time.time()
            global_step += 1
            loss = trainer.update(batch)
            train_loss += loss
            if global_step % opt['log_step'] == 0:
                duration = time.time() - start_time
                print(format_str.format(datetime.now(), global_step, max_steps, epoch, \
                    opt['num_epoch'], loss, duration, current_lr))

        # eval on dev
        print("Evaluating on dev set ...")
        predictions = []
        golds = []
        dev_loss = 0.0
        for i, batch in enumerate(dev_batch):
            preds, probs, labels, loss = trainer.predict(batch)
            predictions += preds
            golds += labels
            dev_loss += loss
        train_loss = train_loss / len(train_batch)
        dev_loss = dev_loss / len(dev_batch)
        # print(golds)
        # print(predictions)
        print(accuracy_score(golds, predictions))
        dev_roc = roc_auc_score(golds, predictions)
        print(
            "epoch {}: train loss = {:.6f}, dev loss = {:.6f}, dev roc = {:.4f}"
            .format(epoch, train_loss, dev_loss, dev_roc))
        dev_score = dev_roc
        file_logger.log("{}\t{:.6f}\t{:.6f}\t{:.4f}\t{:.4f}".format(
            epoch, train_loss, dev_loss, dev_score,
            max([dev_score] + dev_score_history)))

        # save model
        model_file = os.path.join(model_save_dir,
                                  "checkpoint_epoch_{}.pt".format(epoch))
        trainer.save(model_file, epoch)
        if epoch == 1 or dev_score > max(dev_score_history):
            copyfile(model_file, model_save_dir + '/best_model.pt')
            print("new best model saved.")
            file_logger.log("new best model saved at epoch {}: {:.2f}"\
                .format(epoch, dev_score*100))
        if epoch % opt['save_epoch'] != 0:
            os.remove(model_file)

        if len(dev_score_history) > opt['decay_epoch'] and dev_score <= dev_score_history[-1] and \
            opt['optim'] in ['sgd', 'adagrad', 'adadelta']:
            current_lr *= opt['lr_decay']
            trainer.update_lr(current_lr)

        dev_score_history += [dev_score]
        print("")

    print("Training ended with {} epochs.".format(epoch))