def bilstm_train_and_eval(train_data,
                          dev_data,
                          test_data,
                          charEmbedding,
                          word2id,
                          tag2id,
                          crf=True,
                          remove_O=False):
    """Train a BiLSTM tagger, checkpoint it and dump its test predictions.

    Args:
        train_data: ``(word_lists, tag_lists)`` pair used for training.
        dev_data: ``(word_lists, tag_lists)`` pair handed to ``train`` next
            to the training data.
        test_data: ``(word_lists, tag_lists)`` pair handed to ``test``.
        charEmbedding: forwarded unchanged to ``BILSTM_Model`` as its second
            positional argument.
        word2id: mapping whose length is used as the vocabulary size.
        tag2id: mapping whose length is used as the output size.
        crf: forwarded to ``BILSTM_Model``; also selects the model name
            (``"bilstm_crf"`` or ``"bilstm"``) used in the checkpoint path.
        remove_O: accepted but not used by this function.

    Returns:
        None. The predictions are written to
        ``next_dev_result/<loss>_bilstmcrf_result.txt``, one line per
        predicted tag list with the tags separated by single spaces.
        ``<loss>`` is the first five characters of
        ``str(bilstm_model.best_val_loss)``.
    """
    train_word_lists, train_tag_lists = train_data
    dev_word_lists, dev_tag_lists = dev_data
    test_word_lists, test_tag_lists = test_data
    start = time.time()
    vocab_size = len(word2id)
    out_size = len(tag2id)
    bilstm_model = BILSTM_Model(vocab_size, charEmbedding, out_size, crf=crf)
    bilstm_model.train(train_word_lists, train_tag_lists, dev_word_lists,
                       dev_tag_lists, word2id, tag2id)

    model_name = "bilstm_crf" if crf else "bilstm"
    # Both output files are tagged with the same truncated validation loss.
    loss_prefix = str(bilstm_model.best_val_loss)[:5]
    save_model(bilstm_model, "./ckpts/" + loss_prefix + model_name + ".pkl")

    print("训练完毕,共用时{}秒.".format(int(time.time() - start)))
    print("评估{}模型中...".format(model_name))
    pred_tag_lists, test_tag_lists = bilstm_model.test(test_word_lists,
                                                       test_tag_lists, word2id,
                                                       tag2id)

    # The context manager closes the file even if a write fails part-way.
    # open() does not create the 'next_dev_result/' directory.
    result_path = 'next_dev_result/' + loss_prefix + '_bilstmcrf_result.txt'
    with open(result_path, 'w') as f:
        for pred_tag_list in pred_tag_lists:
            f.write(' '.join(pred_tag_list) + '\n')
Пример #2
0
def bilstm_train_and_eval(train_data,
                          dev_data,
                          test_data,
                          word2id,
                          tag2id,
                          output_dir,
                          crf=True,
                          remove_O=False):
    """Train a BiLSTM tagger, save it under ``output_dir`` and score it.

    Args:
        train_data: ``(word_lists, tag_lists)`` pair used for training.
        dev_data: ``(word_lists, tag_lists)`` pair handed to ``train`` next
            to the training data.
        test_data: ``(word_lists, tag_lists)`` pair handed to ``test``.
        word2id: mapping whose length is used as the vocabulary size.
        tag2id: mapping whose length is used as the output size.
        output_dir: directory the ``<model_name>.pkl`` checkpoint is saved in.
        crf: forwarded to ``BILSTM_Model``; also selects the model name.
        remove_O: forwarded to ``Metrics``.

    Returns:
        The predicted tag lists returned by ``BILSTM_Model.test``.
    """
    # Every split arrives as a (word_lists, tag_lists) pair.
    train_words, train_tags = train_data
    dev_words, dev_tags = dev_data
    test_words, gold_tags = test_data

    began_at = time.time()
    tagger = BILSTM_Model(len(word2id), len(tag2id), crf=crf)
    tagger.train(train_words, train_tags, dev_words, dev_tags, word2id,
                 tag2id)

    if crf:
        model_name = "bilstm_crf"
    else:
        model_name = "bilstm"
    checkpoint_path = os.path.join(output_dir, model_name + ".pkl")
    save_model(tagger, checkpoint_path)

    elapsed_seconds = int(time.time() - began_at)
    print("训练完毕,共用时{}秒.".format(elapsed_seconds))
    print("评估{}模型中...".format(model_name))

    # test() returns a (predictions, tag_lists) pair; the returned tag lists
    # replace the ones passed in and are what Metrics is scored against.
    predictions, gold_tags = tagger.test(test_words, gold_tags, word2id,
                                         tag2id)

    scorer = Metrics(gold_tags, predictions, remove_O=remove_O)
    scorer.report_scores()
    scorer.report_confusion_matrix()

    return predictions
Пример #3
0
def bilstm_train_and_eval(train_data,
                          dev_data,
                          test_data,
                          word2id,
                          tag2id,
                          crf=True):
    """Train a BiLSTM tagger, save it to ``./ckpts/`` and predict on test.

    Args:
        train_data: ``(word_lists, tag_lists)`` pair used for training.
        dev_data: ``(word_lists, tag_lists)`` pair handed to ``train`` next
            to the training data.
        test_data: ``(word_lists, tag_lists)`` pair handed to ``test``.
        word2id: mapping whose length is used as the vocabulary size.
        tag2id: mapping whose length is used as the output size.
        crf: forwarded to ``BILSTM_Model``; also selects the checkpoint name.

    Returns:
        The predicted tag lists returned by ``BILSTM_Model.test``.
    """
    # Every split arrives as a (word_lists, tag_lists) pair.
    train_words, train_tags = train_data
    dev_words, dev_tags = dev_data
    test_words, gold_tags = test_data

    began_at = time.time()
    tagger = BILSTM_Model(len(word2id), len(tag2id), crf=crf)
    tagger.train(train_words, train_tags, dev_words, dev_tags, word2id,
                 tag2id)

    if crf:
        model_name = "bilstm_crf"
    else:
        model_name = "bilstm"
    checkpoint_path = "./ckpts/" + model_name + ".pkl"
    save_model(tagger, checkpoint_path)

    elapsed_seconds = int(time.time() - began_at)
    print("训练完毕,共用时{}秒.".format(elapsed_seconds))
    print("评估{}模型中...".format(model_name))

    # test() returns a (predictions, tag_lists) pair; the returned tag lists
    # replace the ones that were passed in.
    predictions, gold_tags = tagger.test(test_words, gold_tags, word2id,
                                         tag2id)

    # A Metrics object is built, but no report method is called on it here.
    unused_metrics = Metrics(gold_tags, predictions)

    return predictions
def bilstm_train_and_eval(train_data,
                          dev_data,
                          test_data,
                          word2id,
                          tag2id,
                          crf=True,
                          remove_O=False):
    """Train a BiLSTM tagger, save it to ``./ckpts/`` and report test scores.

    Args:
        train_data: ``(word_lists, tag_lists)`` pair used for training.
        dev_data: ``(word_lists, tag_lists)`` pair handed to ``train`` next
            to the training data.
        test_data: ``(word_lists, tag_lists)`` pair handed to ``test``.
        word2id: mapping whose length is used as the vocabulary size.
        tag2id: mapping whose length is used as the output size.
        crf: forwarded to ``BILSTM_Model``; also selects the checkpoint name.
        remove_O: forwarded to ``Metrics``.

    Returns:
        The predicted tag lists returned by ``BILSTM_Model.test``.
    """
    # Every split arrives as a (word_lists, tag_lists) pair.
    train_words, train_tags = train_data
    dev_words, dev_tags = dev_data
    test_words, gold_tags = test_data

    began_at = time.time()
    tagger = BILSTM_Model(len(word2id), len(tag2id), crf=crf)
    tagger.train(train_words, train_tags, dev_words, dev_tags, word2id,
                 tag2id)

    if crf:
        model_name = "bilstm_crf"
    else:
        model_name = "bilstm"
    checkpoint_path = "./ckpts/" + model_name + ".pkl"
    save_model(tagger, checkpoint_path)

    elapsed_seconds = int(time.time() - began_at)
    print("Training completed, {} seconds when sharing.".format(
        elapsed_seconds))
    print("Evaluation{} model:...".format(model_name))

    # test() returns a (predictions, tag_lists) pair; the returned tag lists
    # replace the ones passed in and are what Metrics is scored against.
    predictions, gold_tags = tagger.test(test_words, gold_tags, word2id,
                                         tag2id)

    scorer = Metrics(gold_tags, predictions, remove_O=remove_O)
    scorer.report_scores()
    scorer.report_confusion_matrix()

    return predictions
Пример #5
0
def bilstm_train_and_eval(train_loader, dev_loader, eval_loader,
                          test_loader, token2id, tag2id, method):
    """Train a BiLSTM model, then save it, its metadata and its decodings.

    Args:
        train_loader: forwarded to ``model.train``.
        dev_loader: forwarded to ``model.train``.
        eval_loader: forwarded to ``model.train`` and to ``model.cal_scores``.
        test_loader: forwarded to ``decoding``.
        token2id: mapping whose length is the vocabulary size; also handed to
            ``BILSTM_Model``.
        tag2id: mapping whose length is the output size; also handed to
            ``BILSTM_Model``.
        method: forwarded to ``BILSTM_Model`` as ``method=``.

    Returns:
        None. All artefacts are written below a run directory under the
        hard-coded ``root_dir``: ``model.pkl``, ``meta.json``,
        ``dev_result.txt``, ``min_devLoss_result.txt`` and
        ``max_f1_result.txt``.

    Any ``Exception`` raised while saving or evaluating is printed and the
    process then drops into ``pdb``, so the trained model is still in memory
    for inspection. ``KeyboardInterrupt`` and ``SystemExit`` propagate.
    """

    vocab_size = len(token2id)
    out_size = len(tag2id)
    meta = get_meta([TrainingConfig.__dict__, LSTMConfig.__dict__])

    model = BILSTM_Model(vocab_size, out_size, token2id, tag2id, method=method)
    model.train(train_loader, dev_loader, eval_loader)

    try:
        # Build the directory that holds everything about this run.
        root_dir = "/home/luopx/share_folders/Sohu"
        model_dir = 'ckpts/{}/{}-{}-Len{}-{:.2f}-{:.4f}'.format(
            model.method,
            meta['token_method'],
            meta['tag_schema'],
            meta['max_len'],
            model.best_val_loss,
            model.best_f1_score
        )
        model_dir = join(root_dir, model_dir)

        # The run directory is nested (ckpts/<method>/<run>), so create the
        # missing parents as well; os.mkdir would fail on the first run.
        os.makedirs(model_dir, exist_ok=True)
        save_model(model, join(model_dir, "model.pkl"))

        # Save the configuration metadata next to the model.
        with open(join(model_dir, 'meta.json'), 'w') as w:
            w.write(json.dumps(meta, indent=4))

        # Inspect how the model behaves on the evaluation loader.
        print("评估{}模型中...".format(method))
        # Analyse the results.
        print("分析在验证集上的结果...")
        metrics = model.cal_scores(eval_loader, use_model='best_f1')
        with open(join(model_dir, 'dev_result.txt'), 'w') as outfile:
            metrics.report_details(outfile=outfile)

        # Decode the test loader and save the results to files: first with
        # decoding()'s default model, then with use_model="best_f1".
        print("在val_loss最小的模型上解码...")
        test_result = join(model_dir, 'min_devLoss_result.txt')
        decoding(model, test_loader, test_result)
        print("在f1分数值最大的模型上解码...")
        test_result = join(model_dir, 'max_f1_result.txt')
        decoding(model, test_loader, test_result, use_model="best_f1")

    except Exception:
        # Show what went wrong before handing control to the debugger.
        import pdb
        import traceback
        traceback.print_exc()
        pdb.set_trace()
def bilstm_pred(train_word_lists, train_tag_lists, dev_word_lists,
                dev_tag_lists, test_word_lists, test_tag_lists):
    """Train a plain BiLSTM (no CRF), save it and predict tags for the test set.

    The vocabulary and tag mappings are not parameters: they are read from
    the module-level names ``bilstm_word2id`` and ``bilstm_tag2id``.

    Args:
        train_word_lists: training inputs, forwarded to ``model.train``.
        train_tag_lists: training tags, forwarded to ``model.train``.
        dev_word_lists: dev inputs, forwarded to ``model.train``.
        dev_tag_lists: dev tags, forwarded to ``model.train``.
        test_word_lists: test inputs, forwarded to ``model.test``.
        test_tag_lists: test tags, forwarded to ``model.test``.

    Returns:
        The predicted tag lists returned by ``model.test``.
    """
    start = time.time()
    vocab_size = len(bilstm_word2id)
    out_size = len(bilstm_tag2id)
    model = BILSTM_Model(vocab_size, out_size, crf=False)
    model.train(train_word_lists, train_tag_lists, dev_word_lists,
                dev_tag_lists, bilstm_word2id, bilstm_tag2id)
    save_model(model, "./ckpts/lstm.pkl")
    print("训练完毕,共用时{}秒.".format(int(time.time() - start)))
    # Decode with the same mappings the model was sized and trained with;
    # the original passed the differently named `word2id` / `tag2id` here.
    pred_tag_lists, test_tag_lists = model.test(test_word_lists,
                                                test_tag_lists,
                                                bilstm_word2id,
                                                bilstm_tag2id)
    return pred_tag_lists
def bilstm_train_and_eval(train_data,
                          dev_data,
                          test_data,
                          word2id,
                          tag2id,
                          crf=True,
                          remove_O=False,
                          reload_model=False):
    """Train (or reload) a BiLSTM tagger and evaluate it on the test set.

    Args:
        train_data: ``(word_lists, tag_lists)`` pair used for training.
        dev_data: ``(word_lists, tag_lists)`` pair handed to ``train`` next
            to the training data.
        test_data: ``(word_lists, tag_lists)`` pair handed to ``test``.
        word2id: mapping whose length is used as the vocabulary size.
        tag2id: mapping whose length is used as the output size.
        crf: forwarded to ``BILSTM_Model``; also selects the model name used
            in the weight file name.
        remove_O: forwarded to ``results_print``.
        reload_model: when true, load the model from the weight file instead
            of training and saving a new one.

    Returns:
        The predicted tag lists returned by ``BILSTM_Model.test``.
    """
    # Every split arrives as a (word_lists, tag_lists) pair.
    train_words, train_tags = train_data
    dev_words, dev_tags = dev_data
    test_words, gold_tags = test_data

    began_at = time.time()
    vocab_size = len(word2id)
    out_size = len(tag2id)

    # The weight file is keyed by model kind and the LSTM sizes in the config.
    model_name = "bilstm_crf" if crf else "bilstm"
    emb_size = LSTMConfig.emb_size
    hidden_size = LSTMConfig.hidden_size
    model_file = "".join([
        "./weights/", model_name, '_', str(emb_size), '_', str(hidden_size),
        ".pkl"
    ])

    if not reload_model:
        # Train from scratch and persist the result.
        tagger = BILSTM_Model(vocab_size, out_size, crf=crf)
        tagger.train(train_words, train_tags, dev_words, dev_tags, word2id,
                     tag2id)
        # Open question from the original author: should the model be saved
        # after validation in each epoch instead?
        save_model(tagger, model_file)
    else:
        # Reuse a previously trained model.
        tagger = load_model(model_file)

    elapsed_seconds = int(time.time() - began_at)
    print("Training finished, taken {} seconds!".format(elapsed_seconds))
    print("Evaluating {} model:".format(model_name))

    # test() returns a (predictions, tag_lists) pair; the returned tag lists
    # replace the ones passed in and are what results_print receives.
    predictions, gold_tags = tagger.test(test_words, gold_tags, word2id,
                                         tag2id)
    results_print(gold_tags, predictions, remove_O=remove_O)

    return predictions
Пример #8
0
def bilstm_train_and_eval(train_data,
                          dev_data,
                          test_data,
                          word2id,
                          data2id,
                          crf=True,
                          remove_O=False):
    """Train a BiLSTM model, save it and return its token accuracy on test.

    Args:
        train_data: 5-tuple ``(word_lists, data_lists, wordlabel_lists,
            datalabel_lists, dataptr_lists)`` forwarded to ``train``.
        dev_data: 5-tuple with the same layout, forwarded to ``train``.
        test_data: 5-tuple with the same layout, forwarded to ``test``.
        word2id: mapping whose length is used as the vocabulary size.
        data2id: mapping whose length is handed to ``BILSTM_Model`` as its
            second size argument.
        crf: forwarded to ``BILSTM_Model``; also selects the checkpoint name.
        remove_O: accepted but not used by this function.

    Returns:
        The fraction of compared positions where prediction and reference
        agree, or ``0.0`` when there is nothing to compare.
    """
    (train_word_lists, train_data_lists, train_wordlabel_lists,
     train_datalabel_lists, train_dataptr_lists) = train_data
    (dev_word_lists, dev_data_lists, dev_wordlabel_lists,
     dev_datalabel_lists, dev_dataptr_lists) = dev_data
    (test_word_lists, test_data_lists, test_wordlabel_lists,
     test_datalabel_lists, test_dataptr_lists) = test_data

    start = time.time()

    vocab_size = len(word2id)
    data_size = len(data2id)

    bilstm_model = BILSTM_Model(vocab_size, data_size, crf=crf)
    bilstm_model.train(train_word_lists, train_data_lists,
                       train_wordlabel_lists, train_datalabel_lists,
                       train_dataptr_lists, dev_word_lists, dev_data_lists,
                       dev_wordlabel_lists, dev_datalabel_lists,
                       dev_dataptr_lists, word2id, data2id)

    model_name = "bilstm_crf" if crf else "bilstm"
    save_model(bilstm_model, "./ckpts/" + model_name + ".pkl")

    print("训练完毕,共用时{}秒.".format(int(time.time() - start)))
    print("评估{}模型中...".format(model_name))
    # Keep the two return values apart. The original bound both to the same
    # name, so the accuracy loop compared a list with itself and always
    # reported 1.0.
    # NOTE(review): assumes test() returns (predictions, references) in that
    # order - confirm against BILSTM_Model.test.
    pred_tag_lists, gold_tag_lists = bilstm_model.test(
        test_word_lists, test_data_lists, test_wordlabel_lists,
        test_datalabel_lists, test_dataptr_lists, word2id, data2id)

    allnum = 0
    correct = 0
    for pred, gold in zip(pred_tag_lists, gold_tag_lists):
        # zip stops at the shorter sequence, so extra predictions are ignored.
        for x, y in zip(pred, gold):
            if x == y:
                correct += 1
            allnum += 1

    # Guard against an empty test set instead of raising ZeroDivisionError.
    accuracy = correct / allnum if allnum else 0.0
    print(accuracy)

    return accuracy
def bilstm_train_and_eval(train_data,
                          dev_data,
                          test_data,
                          word2id,
                          tag2id,
                          crf=True,
                          remove_O=False):
    """Train a BiLSTM tagger, save it, log its predictions and report scores.

    Args:
        train_data: ``(word_lists, tag_lists)`` pair used for training.
        dev_data: ``(word_lists, tag_lists)`` pair handed to ``train`` next
            to the training data.
        test_data: ``(word_lists, tag_lists)`` pair handed to ``test``.
        word2id: mapping whose length is used as the vocabulary size.
        tag2id: mapping whose length is used as the output size.
        crf: forwarded to ``BILSTM_Model``; also selects the checkpoint name.
        remove_O: forwarded to ``Metrics``.

    Returns:
        The predicted tag lists returned by ``BILSTM_Model.test``.

    Side effects:
        Appends to ``./result.txt`` one ``"<gold> <pred>"`` line per token,
        with an empty line after each sequence.
    """
    train_word_lists, train_tag_lists = train_data
    dev_word_lists, dev_tag_lists = dev_data
    test_word_lists, test_tag_lists = test_data

    start = time.time()
    vocab_size = len(word2id)
    out_size = len(tag2id)
    bilstm_model = BILSTM_Model(vocab_size, out_size, crf=crf)
    bilstm_model.train(train_word_lists, train_tag_lists, dev_word_lists,
                       dev_tag_lists, word2id, tag2id)

    model_name = "bilstm_crf" if crf else "bilstm"
    save_model(bilstm_model, "./ckpts/" + model_name + ".pkl")

    print("训练完毕,共用时{}秒.".format(int(time.time() - start)))
    print("评估{}模型中...".format(model_name))
    pred_tag_lists, test_tag_lists = bilstm_model.test(test_word_lists,
                                                       test_tag_lists, word2id,
                                                       tag2id)

    # The original concatenated pred_tag_lists[i] with " " and wrote it
    # twice; that raises TypeError for list entries and never records the
    # gold tags. Write gold and predicted tags side by side instead.
    # NOTE(review): assumes both values are lists of per-sequence tag lists
    # whose tags are strings - confirm against BILSTM_Model.test.
    with open("./result.txt", "a+") as f:
        for gold_tags, pred_tags in zip(test_tag_lists, pred_tag_lists):
            for gold_tag, pred_tag in zip(gold_tags, pred_tags):
                f.write(gold_tag + " " + pred_tag + "\n")
            f.write("\n")

    metrics = Metrics(test_tag_lists, pred_tag_lists, remove_O=remove_O)
    metrics.report_scores()
    metrics.report_confusion_matrix()

    return pred_tag_lists