Пример #1
0
def bilstm_train_and_eval(train_data,
                          dev_data,
                          test_data,
                          word2id,
                          tag2id,
                          output_dir,
                          crf=True,
                          remove_O=False):
    """Train a BILSTM_Model, save it under ``output_dir`` and evaluate it.

    Args:
        train_data: Pair ``(word_lists, tag_lists)`` used for training.
        dev_data: Pair ``(word_lists, tag_lists)`` handed to ``train`` as the
            development split.
        test_data: Pair ``(word_lists, tag_lists)`` used for evaluation.
        word2id: Mapping forwarded to ``train``/``test``; its length is used
            as the vocabulary size.
        tag2id: Mapping forwarded to ``train``/``test``; its length is used
            as the output size.
        output_dir: Directory in which ``<model_name>.pkl`` is saved.
        crf: Forwarded to the model constructor; also selects the model name
            ("bilstm_crf" when true, "bilstm" otherwise).
        remove_O: Forwarded to ``Metrics``.

    Returns:
        The first of the two values returned by ``bilstm_model.test``.
    """
    train_words, train_tags = train_data
    dev_words, dev_tags = dev_data
    test_words, test_tags = test_data

    started_at = time.time()
    bilstm_model = BILSTM_Model(len(word2id), len(tag2id), crf=crf)
    bilstm_model.train(train_words, train_tags, dev_words, dev_tags,
                       word2id, tag2id)

    if crf:
        model_name = "bilstm_crf"
    else:
        model_name = "bilstm"
    checkpoint_path = os.path.join(output_dir, "{}.pkl".format(model_name))
    save_model(bilstm_model, checkpoint_path)

    elapsed_seconds = int(time.time() - started_at)
    print("训练完毕,共用时{}秒.".format(elapsed_seconds))
    print("评估{}模型中...".format(model_name))

    # ``test`` hands back two lists; the second one replaces the gold tags
    # that were passed in.
    predictions, test_tags = bilstm_model.test(test_words, test_tags,
                                               word2id, tag2id)

    scorer = Metrics(test_tags, predictions, remove_O=remove_O)
    scorer.report_scores()
    scorer.report_confusion_matrix()

    return predictions
def bilstm_train_and_eval(train_data,
                          dev_data,
                          test_data,
                          charEmbedding,
                          word2id,
                          tag2id,
                          crf=True,
                          remove_O=False):
    """Train a BILSTM_Model, save it, and dump test predictions to a file.

    The checkpoint is written to ``./ckpts/`` and the predictions to
    ``next_dev_result/``; both file names are prefixed with the first five
    characters of ``str(bilstm_model.best_val_loss)``.

    Args:
        train_data: Pair ``(word_lists, tag_lists)`` used for training.
        dev_data: Pair ``(word_lists, tag_lists)`` handed to ``train`` as the
            development split.
        test_data: Pair ``(word_lists, tag_lists)`` used for prediction.
        charEmbedding: Forwarded unchanged as the second constructor argument
            of ``BILSTM_Model``.
        word2id: Mapping forwarded to ``train``/``test``; its length is used
            as the vocabulary size.
        tag2id: Mapping forwarded to ``train``/``test``; its length is used
            as the output size.
        crf: Forwarded to the model constructor; also selects the model name.
        remove_O: Accepted for signature compatibility; not used here.

    Returns:
        None. The predictions are only written to disk.
    """
    train_word_lists, train_tag_lists = train_data
    dev_word_lists, dev_tag_lists = dev_data
    test_word_lists, test_tag_lists = test_data
    start = time.time()
    vocab_size = len(word2id)
    out_size = len(tag2id)
    bilstm_model = BILSTM_Model(vocab_size, charEmbedding, out_size, crf=crf)
    bilstm_model.train(train_word_lists, train_tag_lists, dev_word_lists,
                       dev_tag_lists, word2id, tag2id)

    model_name = "bilstm_crf" if crf else "bilstm"
    save_model(
        bilstm_model,
        "./ckpts/" + str(bilstm_model.best_val_loss)[:5] + model_name + ".pkl")

    print("训练完毕,共用时{}秒.".format(int(time.time() - start)))
    print("评估{}模型中...".format(model_name))
    pred_tag_lists, test_tag_lists = bilstm_model.test(test_word_lists,
                                                       test_tag_lists, word2id,
                                                       tag2id)

    result_path = ('next_dev_result/' + str(bilstm_model.best_val_loss)[:5] +
                   '_bilstmcrf_result.txt')
    # The context manager closes the file even if a write fails, e.g. when a
    # prediction contains a non-string tag.
    with open(result_path, 'w') as f:
        # One line per sentence, tags separated by single spaces.
        f.writelines(' '.join(pred_tag_list) + '\n'
                     for pred_tag_list in pred_tag_lists)
Пример #3
0
def bilstm_train_and_eval(train_data,
                          dev_data,
                          test_data,
                          word2id,
                          tag2id,
                          crf=True):
    """Train a BILSTM_Model, save it under ``./ckpts/`` and predict test tags.

    Args:
        train_data: Pair ``(word_lists, tag_lists)`` used for training.
        dev_data: Pair ``(word_lists, tag_lists)`` handed to ``train`` as the
            development split.
        test_data: Pair ``(word_lists, tag_lists)`` used for prediction.
        word2id: Mapping forwarded to ``train``/``test``; its length is used
            as the vocabulary size.
        tag2id: Mapping forwarded to ``train``/``test``; its length is used
            as the output size.
        crf: Forwarded to the model constructor; also selects the checkpoint
            name ("bilstm_crf" when true, "bilstm" otherwise).

    Returns:
        The first of the two values returned by ``bilstm_model.test``.
    """
    words_train, tags_train = train_data
    words_dev, tags_dev = dev_data
    words_test, tags_test = test_data

    t0 = time.time()
    bilstm_model = BILSTM_Model(len(word2id), len(tag2id), crf=crf)
    bilstm_model.train(words_train, tags_train, words_dev, tags_dev,
                       word2id, tag2id)

    if crf:
        model_name = "bilstm_crf"
    else:
        model_name = "bilstm"
    save_model(bilstm_model, "./ckpts/" + model_name + ".pkl")

    seconds_taken = int(time.time() - t0)
    print("训练完毕,共用时{}秒.".format(seconds_taken))
    print("评估{}模型中...".format(model_name))
    predicted, tags_test = bilstm_model.test(words_test, tags_test,
                                             word2id, tag2id)

    # The Metrics object is built but never queried in this variant; the
    # construction is kept because it may do work of its own.
    metrics = Metrics(tags_test, predicted)

    return predicted
def bilstm_train_and_eval(train_data,
                          dev_data,
                          test_data,
                          word2id,
                          tag2id,
                          crf=True,
                          remove_O=False):
    """Train a BILSTM_Model, save it under ``./ckpts/`` and report test metrics.

    Args:
        train_data: Pair ``(word_lists, tag_lists)`` used for training.
        dev_data: Pair ``(word_lists, tag_lists)`` handed to ``train`` as the
            development split.
        test_data: Pair ``(word_lists, tag_lists)`` used for evaluation.
        word2id: Mapping forwarded to ``train``/``test``; its length is used
            as the vocabulary size.
        tag2id: Mapping forwarded to ``train``/``test``; its length is used
            as the output size.
        crf: Forwarded to the model constructor; also selects the checkpoint
            name ("bilstm_crf" when true, "bilstm" otherwise).
        remove_O: Forwarded to ``Metrics``.

    Returns:
        The first of the two values returned by ``bilstm_model.test``.
    """
    (train_x, train_y) = train_data
    (dev_x, dev_y) = dev_data
    (test_x, test_y) = test_data

    begin = time.time()
    n_words = len(word2id)
    n_tags = len(tag2id)
    bilstm_model = BILSTM_Model(n_words, n_tags, crf=crf)
    bilstm_model.train(train_x, train_y, dev_x, dev_y, word2id, tag2id)

    model_name = "bilstm"
    if crf:
        model_name = "bilstm_crf"
    checkpoint = "./ckpts/" + model_name + ".pkl"
    save_model(bilstm_model, checkpoint)

    duration = int(time.time() - begin)
    print("Training completed, {} seconds when sharing.".format(duration))
    print("Evaluation{} model:...".format(model_name))
    pred_y, test_y = bilstm_model.test(test_x, test_y, word2id, tag2id)

    report = Metrics(test_y, pred_y, remove_O=remove_O)
    report.report_scores()
    report.report_confusion_matrix()

    return pred_y
def bilstm_pred(train_word_lists, train_tag_lists, dev_word_lists,
                dev_tag_lists, test_word_lists, test_tag_lists):
    """Train a non-CRF BILSTM_Model, save it and predict tags for the test set.

    The vocabulary and tag maps are not parameters: the function reads the
    module-level names ``bilstm_word2id`` and ``bilstm_tag2id``, which are
    defined outside this block.

    Args:
        train_word_lists: Training inputs forwarded to ``model.train``.
        train_tag_lists: Training labels forwarded to ``model.train``.
        dev_word_lists: Development inputs forwarded to ``model.train``.
        dev_tag_lists: Development labels forwarded to ``model.train``.
        test_word_lists: Test inputs forwarded to ``model.test``.
        test_tag_lists: Test labels forwarded to ``model.test``.

    Returns:
        The first of the two values returned by ``model.test``.
    """
    start = time.time()
    vocab_size = len(bilstm_word2id)
    out_size = len(bilstm_tag2id)
    model = BILSTM_Model(vocab_size, out_size, crf=False)
    model.train(train_word_lists, train_tag_lists, dev_word_lists,
                dev_tag_lists, bilstm_word2id, bilstm_tag2id)
    save_model(model, "./ckpts/lstm.pkl")
    print("训练完毕,共用时{}秒.".format(int(time.time() - start)))
    # Evaluate with the same maps the model was sized and trained with.
    # The previous code passed the unrelated globals ``word2id``/``tag2id``
    # here. NOTE(review): confirm no caller relied on those globals.
    pred_tag_lists, test_tag_lists = model.test(test_word_lists,
                                                test_tag_lists,
                                                bilstm_word2id,
                                                bilstm_tag2id)
    return pred_tag_lists
def bilstm_train_and_eval(train_data,
                          dev_data,
                          test_data,
                          word2id,
                          tag2id,
                          crf=True,
                          remove_O=False,
                          reload_model=False):
    """Train (or reload) a BILSTM_Model and print its test results.

    The model file lives under ``./weights/`` and is named from the model
    name plus ``LSTMConfig.emb_size`` and ``LSTMConfig.hidden_size``.

    Args:
        train_data: Pair ``(word_lists, tag_lists)`` used for training.
        dev_data: Pair ``(word_lists, tag_lists)`` handed to ``train`` as the
            development split.
        test_data: Pair ``(word_lists, tag_lists)`` used for evaluation.
        word2id: Mapping forwarded to ``train``/``test``; its length is used
            as the vocabulary size.
        tag2id: Mapping forwarded to ``train``/``test``; its length is used
            as the output size.
        crf: Forwarded to the model constructor; also selects the model name
            ("bilstm_crf" when true, "bilstm" otherwise).
        remove_O: Forwarded to ``results_print``.
        reload_model: When true, load the model from the model file instead
            of training and saving a new one.

    Returns:
        The first of the two values returned by ``bilstm_model.test``.
    """
    # Split each dataset into its inputs and labels.
    train_words, train_tags = train_data
    dev_words, dev_tags = dev_data
    test_words, test_tags = test_data

    # The timer also runs when the model is only reloaded.
    tic = time.time()
    n_vocab = len(word2id)
    n_out = len(tag2id)

    # Work out where the model is stored.
    model_name = "bilstm_crf" if crf else "bilstm"
    emb_dim = LSTMConfig.emb_size
    hidden_dim = LSTMConfig.hidden_size
    model_file = "./weights/{}_{}_{}.pkl".format(model_name, str(emb_dim),
                                                 str(hidden_dim))

    if not reload_model:
        # Fresh run: train, then save once training has finished.
        bilstm_model = BILSTM_Model(n_vocab, n_out, crf=crf)
        bilstm_model.train(train_words, train_tags, dev_words, dev_tags,
                           word2id, tag2id)
        save_model(bilstm_model, model_file)
    else:
        # Reuse a previously saved model.
        bilstm_model = load_model(model_file)

    took = int(time.time() - tic)
    print("Training finished, taken {} seconds!".format(took))
    print("Evaluating {} model:".format(model_name))
    predictions, test_tags = bilstm_model.test(test_words, test_tags,
                                               word2id, tag2id)
    results_print(test_tags, predictions, remove_O=remove_O)

    return predictions
Пример #7
0
def bilstm_train_and_eval(train_data,
                          dev_data,
                          test_data,
                          word2id,
                          data2id,
                          crf=True,
                          remove_O=False):
    """Train a BILSTM_Model, save it and return its token-level test accuracy.

    Args:
        train_data: 5-tuple ``(word_lists, data_lists, wordlabel_lists,
            datalabel_lists, dataptr_lists)`` used for training.
        dev_data: 5-tuple with the same layout, handed to ``train`` as the
            development split.
        test_data: 5-tuple with the same layout, used for evaluation.
        word2id: Mapping forwarded to ``train``/``test``; its length is the
            first constructor argument of the model.
        data2id: Mapping forwarded to ``train``/``test``; its length is the
            second constructor argument of the model.
        crf: Forwarded to the model constructor; also selects the checkpoint
            name ("bilstm_crf" when true, "bilstm" otherwise).
        remove_O: Accepted for signature compatibility; not used here.

    Returns:
        The fraction of compared positions where the prediction equals the
        gold label, or ``0.0`` when there is nothing to compare.
    """
    train_word_lists, train_data_lists, train_wordlabel_lists, train_datalabel_lists, train_dataptr_lists = train_data
    dev_word_lists, dev_data_lists, dev_wordlabel_lists, dev_datalabel_lists, dev_dataptr_lists = dev_data
    test_word_lists, test_data_lists, test_wordlabel_lists, test_datalabel_lists, test_dataptr_lists = test_data

    start = time.time()

    vocab_size = len(word2id)
    data_size = len(data2id)

    bilstm_model = BILSTM_Model(vocab_size, data_size, crf=crf)
    bilstm_model.train(train_word_lists, train_data_lists,
                       train_wordlabel_lists, train_datalabel_lists,
                       train_dataptr_lists, dev_word_lists, dev_data_lists,
                       dev_wordlabel_lists, dev_datalabel_lists,
                       dev_dataptr_lists, word2id, data2id)

    model_name = "bilstm_crf" if crf else "bilstm"
    save_model(bilstm_model, "./ckpts/" + model_name + ".pkl")

    print("训练完毕,共用时{}秒.".format(int(time.time() - start)))
    print("评估{}模型中...".format(model_name))
    # Keep the two results apart. The previous code unpacked both into
    # ``pred_tag_lists`` and then compared that list with itself, so the
    # reported accuracy was always 1.0.
    # NOTE(review): assumes ``test`` returns (predictions, gold) in that
    # order - confirm against BILSTM_Model.test.
    pred_tag_lists, test_tag_lists = bilstm_model.test(
        test_word_lists, test_data_lists, test_wordlabel_lists,
        test_datalabel_lists, test_dataptr_lists, word2id, data2id)

    allnum = 0
    correct = 0
    for pred, gold in zip(pred_tag_lists, test_tag_lists):
        # zip stops at the shorter sequence, so predictions past the end of
        # the gold sequence are ignored.
        for x, y in zip(pred, gold):
            if x == y:
                correct += 1
            allnum += 1

    # Avoid ZeroDivisionError on an empty evaluation set.
    accuracy = correct / allnum if allnum else 0.0
    print(accuracy)

    return accuracy
def bilstm_train_and_eval(train_data,
                          dev_data,
                          test_data,
                          word2id,
                          tag2id,
                          crf=True,
                          remove_O=False):
    """Train a BILSTM_Model, save it, dump gold/predicted tags and report metrics.

    The tag pairs are appended to ``./result.txt``: one ``"<gold> <pred>"``
    line per token, with an empty line after each sentence.

    Args:
        train_data: Pair ``(word_lists, tag_lists)`` used for training.
        dev_data: Pair ``(word_lists, tag_lists)`` handed to ``train`` as the
            development split.
        test_data: Pair ``(word_lists, tag_lists)`` used for evaluation.
        word2id: Mapping forwarded to ``train``/``test``; its length is used
            as the vocabulary size.
        tag2id: Mapping forwarded to ``train``/``test``; its length is used
            as the output size.
        crf: Forwarded to the model constructor; also selects the checkpoint
            name ("bilstm_crf" when true, "bilstm" otherwise).
        remove_O: Forwarded to ``Metrics``.

    Returns:
        The first of the two values returned by ``bilstm_model.test``.
    """
    train_word_lists, train_tag_lists = train_data
    dev_word_lists, dev_tag_lists = dev_data
    test_word_lists, test_tag_lists = test_data

    start = time.time()
    vocab_size = len(word2id)
    out_size = len(tag2id)
    bilstm_model = BILSTM_Model(vocab_size, out_size, crf=crf)
    bilstm_model.train(train_word_lists, train_tag_lists, dev_word_lists,
                       dev_tag_lists, word2id, tag2id)

    model_name = "bilstm_crf" if crf else "bilstm"
    save_model(bilstm_model, "./ckpts/" + model_name + ".pkl")

    print("训练完毕,共用时{}秒.".format(int(time.time() - start)))
    print("评估{}模型中...".format(model_name))
    pred_tag_lists, test_tag_lists = bilstm_model.test(test_word_lists,
                                                       test_tag_lists, word2id,
                                                       tag2id)

    # The previous code wrote ``pred_tag_lists[i]`` on both sides of the
    # space, so the gold tags never reached the file, and concatenating a
    # list with " " raises TypeError.
    # NOTE(review): assumes every entry is a sequence of tag strings and that
    # a per-token "gold pred" layout is wanted - confirm with whoever
    # consumes result.txt.
    with open("./result.txt", "a+") as f:
        for gold_tags, pred_tags in zip(test_tag_lists, pred_tag_lists):
            f.writelines(gold + " " + pred + "\n"
                         for gold, pred in zip(gold_tags, pred_tags))
            f.write("\n")

    metrics = Metrics(test_tag_lists, pred_tag_lists, remove_O=remove_O)
    metrics.report_scores()
    metrics.report_confusion_matrix()

    return pred_tag_lists