def bilstm_train_and_eval(train_data, dev_data, test_data, charEmbedding,
                          word2id, tag2id, crf=True, remove_O=False):
    train_word_lists, train_tag_lists = train_data
    dev_word_lists, dev_tag_lists = dev_data
    test_word_lists, test_tag_lists = test_data

    start = time.time()
    vocab_size = len(word2id)
    out_size = len(tag2id)
    bilstm_model = BILSTM_Model(vocab_size, charEmbedding, out_size, crf=crf)
    bilstm_model.train(train_word_lists, train_tag_lists,
                       dev_word_lists, dev_tag_lists, word2id, tag2id)

    model_name = "bilstm_crf" if crf else "bilstm"
    save_model(bilstm_model,
               "./ckpts/" + str(bilstm_model.best_val_loss)[:5] + model_name + ".pkl")

    print("Training finished, took {} seconds.".format(int(time.time() - start)))
    print("Evaluating the {} model...".format(model_name))
    pred_tag_lists, test_tag_lists = bilstm_model.test(
        test_word_lists, test_tag_lists, word2id, tag2id)

    # Write the predicted tag sequence of each sentence on its own line.
    with open('next_dev_result/' + str(bilstm_model.best_val_loss)[:5] +
              '_bilstmcrf_result.txt', 'w') as f:
        for pred_tag_list in pred_tag_lists:
            f.write(' '.join(pred_tag_list) + '\n')

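# A minimal sketch of how the `charEmbedding` argument above might be built.
# It assumes BILSTM_Model accepts a (vocab_size, emb_dim) float tensor of
# pretrained vectors; the embedding file format, `emb_path`, and `emb_dim`
# are assumptions for illustration, not part of the original code.
import numpy as np
import torch


def build_char_embedding(word2id, emb_path, emb_dim=100):
    """Load pretrained vectors for the ids in word2id; unseen ids stay random."""
    embedding = np.random.uniform(-0.25, 0.25, (len(word2id), emb_dim))
    with open(emb_path, encoding='utf-8') as f:
        for line in f:
            fields = line.rstrip().split()
            if len(fields) != emb_dim + 1:
                continue  # skip header or malformed lines
            token, vector = fields[0], fields[1:]
            if token in word2id:
                embedding[word2id[token]] = np.asarray(vector, dtype=np.float32)
    return torch.tensor(embedding, dtype=torch.float32)
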
def bilstm_train_and_eval(train_data, dev_data, test_data, word2id, tag2id,
                          output_dir, crf=True, remove_O=False):
    train_word_lists, train_tag_lists = train_data
    dev_word_lists, dev_tag_lists = dev_data
    test_word_lists, test_tag_lists = test_data

    start = time.time()
    vocab_size = len(word2id)
    out_size = len(tag2id)
    bilstm_model = BILSTM_Model(vocab_size, out_size, crf=crf)
    bilstm_model.train(train_word_lists, train_tag_lists,
                       dev_word_lists, dev_tag_lists, word2id, tag2id)

    model_name = "bilstm_crf" if crf else "bilstm"
    save_model(bilstm_model, os.path.join(output_dir, model_name + ".pkl"))

    print("Training finished, took {} seconds.".format(int(time.time() - start)))
    print("Evaluating the {} model...".format(model_name))
    pred_tag_lists, test_tag_lists = bilstm_model.test(
        test_word_lists, test_tag_lists, word2id, tag2id)

    metrics = Metrics(test_tag_lists, pred_tag_lists, remove_O=remove_O)
    metrics.report_scores()
    metrics.report_confusion_matrix()

    return pred_tag_lists

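# Illustrative call of the variant above on a tiny toy corpus. The toy
# sentences, vocabularies and output directory are made up purely to show
# the expected shapes: parallel lists of token lists and tag lists, plus
# string-to-id maps. If BILSTM_Model additionally requires padding/unknown
# entries (e.g. "<pad>", "<unk>") in word2id/tag2id, they would need to be
# added; that detail is not visible in this snippet.
import os

if __name__ == "__main__":
    toy_words = [["John", "lives", "in", "Paris"]]
    toy_tags = [["B-PER", "O", "O", "B-LOC"]]
    word2id = {w: i for i, w in enumerate(sorted({w for s in toy_words for w in s}))}
    tag2id = {t: i for i, t in enumerate(sorted({t for s in toy_tags for t in s}))}

    os.makedirs("./ckpts", exist_ok=True)
    bilstm_train_and_eval(
        (toy_words, toy_tags),   # train
        (toy_words, toy_tags),   # dev
        (toy_words, toy_tags),   # test
        word2id, tag2id,
        output_dir="./ckpts",
        crf=True)
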
def bilstm_train_and_eval(train_data, dev_data, test_data, word2id, tag2id, crf=True):
    train_word_lists, train_tag_lists = train_data
    dev_word_lists, dev_tag_lists = dev_data
    test_word_lists, test_tag_lists = test_data

    start = time.time()
    vocab_size = len(word2id)
    out_size = len(tag2id)
    bilstm_model = BILSTM_Model(vocab_size, out_size, crf=crf)
    bilstm_model.train(train_word_lists, train_tag_lists,
                       dev_word_lists, dev_tag_lists, word2id, tag2id)

    model_name = "bilstm_crf" if crf else "bilstm"
    save_model(bilstm_model, "./ckpts/" + model_name + ".pkl")

    print("Training finished, took {} seconds.".format(int(time.time() - start)))
    print("Evaluating the {} model...".format(model_name))
    pred_tag_lists, test_tag_lists = bilstm_model.test(
        test_word_lists, test_tag_lists, word2id, tag2id)

    metrics = Metrics(test_tag_lists, pred_tag_lists)

    return pred_tag_lists

def bilstm_train_and_eval(train_data, dev_data, test_data, word2id, tag2id, crf=True, remove_O=False): train_word_lists, train_tag_lists = train_data dev_word_lists, dev_tag_lists = dev_data test_word_lists, test_tag_lists = test_data start = time.time() vocab_size = len(word2id) out_size = len(tag2id) bilstm_model = BILSTM_Model(vocab_size, out_size, crf=crf) bilstm_model.train(train_word_lists, train_tag_lists, dev_word_lists, dev_tag_lists, word2id, tag2id) model_name = "bilstm_crf" if crf else "bilstm" save_model(bilstm_model, "./ckpts/" + model_name + ".pkl") print("Training completed, {} seconds when sharing.".format( int(time.time() - start))) print("Evaluation{} model:...".format(model_name)) pred_tag_lists, test_tag_lists = bilstm_model.test(test_word_lists, test_tag_lists, word2id, tag2id) metrics = Metrics(test_tag_lists, pred_tag_lists, remove_O=remove_O) metrics.report_scores() metrics.report_confusion_matrix() return pred_tag_lists
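# `Metrics` is not defined in this snippet. Below is a minimal sketch of what
# a class with this interface might compute: per-tag precision, recall and F1
# over the flattened gold/predicted tag sequences, optionally dropping the 'O'
# tag, plus a simple confusion matrix. It illustrates the interface only and
# is not the original implementation.
from collections import Counter, defaultdict


class Metrics:
    def __init__(self, golden_tags, predict_tags, remove_O=False):
        # flatten the per-sentence tag lists into two parallel sequences
        self.golden = [t for sent in golden_tags for t in sent]
        self.predict = [t for sent in predict_tags for t in sent]
        if remove_O:
            keep = [i for i, t in enumerate(self.golden) if t != 'O']
            self.golden = [self.golden[i] for i in keep]
            self.predict = [self.predict[i] for i in keep]
        self.tags = sorted(set(self.golden))

    def report_scores(self):
        gold_count = Counter(self.golden)
        pred_count = Counter(self.predict)
        correct = Counter(g for g, p in zip(self.golden, self.predict) if g == p)
        print("{:<10}{:>10}{:>10}{:>10}".format("tag", "precision", "recall", "f1"))
        for tag in self.tags:
            p = correct[tag] / pred_count[tag] if pred_count[tag] else 0.0
            r = correct[tag] / gold_count[tag] if gold_count[tag] else 0.0
            f1 = 2 * p * r / (p + r) if p + r else 0.0
            print("{:<10}{:>10.4f}{:>10.4f}{:>10.4f}".format(tag, p, r, f1))

    def report_confusion_matrix(self):
        matrix = defaultdict(Counter)
        for g, p in zip(self.golden, self.predict):
            matrix[g][p] += 1
        for tag in self.tags:
            print(tag, dict(matrix[tag]))
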
def bilstm_train_and_eval(train_loader, dev_loader, eval_loader, test_loader,
                          token2id, tag2id, method):
    """Train the model, save it, and decode the test set."""
    vocab_size = len(token2id)
    out_size = len(tag2id)
    meta = get_meta([TrainingConfig.__dict__, LSTMConfig.__dict__])
    model = BILSTM_Model(vocab_size, out_size, token2id, tag2id, method=method)
    model.train(train_loader, dev_loader, eval_loader)

    try:
        # Save the model together with its configuration.
        root_dir = "/home/luopx/share_folders/Sohu"
        model_dir = 'ckpts/{}/{}-{}-Len{}-{:.2f}-{:.4f}'.format(
            model.method, meta['token_method'], meta['tag_schema'],
            meta['max_len'], model.best_val_loss, model.best_f1_score)
        model_dir = join(root_dir, model_dir)
        if not os.path.isdir(model_dir):
            os.mkdir(model_dir)
        save_model(model, join(model_dir, "model.pkl"))

        # Save word2id/tag2id and the model settings.
        with open(join(model_dir, 'meta.json'), 'w') as w:
            w.write(json.dumps(meta, indent=4))

        # Inspect the model's behaviour on the validation set.
        print("Evaluating the {} model...".format(method))
        print("Analysing results on the validation set...")
        metrics = model.cal_scores(eval_loader, use_model='best_f1')
        with open(join(model_dir, 'dev_result.txt'), 'w') as outfile:
            metrics.report_details(outfile=outfile)

        # Load the test set, decode it, and save the results to files.
        print("Decoding with the model that has the lowest validation loss...")
        test_result = join(model_dir, 'min_devLoss_result.txt')
        decoding(model, test_loader, test_result)
        print("Decoding with the model that has the highest F1 score...")
        test_result = join(model_dir, 'max_f1_result.txt')
        decoding(model, test_loader, test_result, use_model="best_f1")
    except:
        # Drop into the debugger on any failure so the run can be inspected.
        import pdb
        pdb.set_trace()

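# `get_meta` is not shown in this snippet, so the following is only a guess at
# its shape: merge the plain (non-dunder, JSON-serializable) attributes of the
# config class dicts into one dict, which the caller above expects to contain
# keys such as 'token_method', 'tag_schema' and 'max_len'.
import json


def get_meta(config_dicts):
    meta = {}
    for d in config_dicts:
        for key, value in d.items():
            if key.startswith('__'):
                continue  # skip class internals
            try:
                json.dumps(value)  # keep only values that meta.json can store
            except TypeError:
                value = str(value)
            meta[key] = value
    return meta
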
def bilstm_pred(train_word_lists, train_tag_lists, dev_word_lists, dev_tag_lists,
                test_word_lists, test_tag_lists):
    start = time.time()
    vocab_size = len(bilstm_word2id)
    out_size = len(bilstm_tag2id)
    model = BILSTM_Model(vocab_size, out_size, crf=False)
    model.train(train_word_lists, train_tag_lists,
                dev_word_lists, dev_tag_lists, bilstm_word2id, bilstm_tag2id)
    save_model(model, "./ckpts/lstm.pkl")
    print("Training finished, took {} seconds.".format(int(time.time() - start)))

    pred_tag_lists, test_tag_lists = model.test(
        test_word_lists, test_tag_lists, bilstm_word2id, bilstm_tag2id)

    return pred_tag_lists

def bilstm_train_and_eval(train_data, dev_data, test_data, word2id, tag2id,
                          crf=True, remove_O=False, reload_model=False):
    # data
    train_word_lists, train_tag_lists = train_data
    dev_word_lists, dev_tag_lists = dev_data
    test_word_lists, test_tag_lists = test_data

    # training
    start = time.time()
    vocab_size = len(word2id)
    out_size = len(tag2id)

    # get model_file
    if crf:
        model_name = "bilstm_crf"
    else:
        model_name = "bilstm"
    emb_size = LSTMConfig.emb_size
    hidden_size = LSTMConfig.hidden_size
    model_file = ("./weights/" + model_name + '_' + str(emb_size) + '_' +
                  str(hidden_size) + ".pkl")

    if reload_model:
        # reload a previously trained model
        bilstm_model = load_model(model_file)
    else:
        # train and save the model
        bilstm_model = BILSTM_Model(vocab_size, out_size, crf=crf)
        bilstm_model.train(train_word_lists, train_tag_lists,
                           dev_word_lists, dev_tag_lists, word2id, tag2id)
        # TODO: reconsider when to save the model (after validation at each epoch?)
        save_model(bilstm_model, model_file)

    print("Training finished, took {} seconds!".format(int(time.time() - start)))
    print("Evaluating {} model:".format(model_name))
    pred_tag_lists, test_tag_lists = bilstm_model.test(
        test_word_lists, test_tag_lists, word2id, tag2id)

    results_print(test_tag_lists, pred_tag_lists, remove_O=remove_O)

    return pred_tag_lists

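# `save_model` and `load_model` are not defined in these snippets. A minimal
# sketch, assuming plain pickle serialization of the whole model object (which
# would match the ".pkl" file names used above); the real helpers may differ.
import pickle


def save_model(model, file_name):
    """Serialize the trained model to disk."""
    with open(file_name, "wb") as f:
        pickle.dump(model, f)


def load_model(file_name):
    """Load a previously saved model from disk."""
    with open(file_name, "rb") as f:
        return pickle.load(f)
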
def bilstm_train_and_eval(train_data, dev_data, test_data, word2id, data2id,
                          crf=True, remove_O=False):
    (train_word_lists, train_data_lists, train_wordlabel_lists,
     train_datalabel_lists, train_dataptr_lists) = train_data
    (dev_word_lists, dev_data_lists, dev_wordlabel_lists,
     dev_datalabel_lists, dev_dataptr_lists) = dev_data
    (test_word_lists, test_data_lists, test_wordlabel_lists,
     test_datalabel_lists, test_dataptr_lists) = test_data

    start = time.time()
    vocab_size = len(word2id)
    data_size = len(data2id)
    bilstm_model = BILSTM_Model(vocab_size, data_size, crf=crf)
    bilstm_model.train(train_word_lists, train_data_lists, train_wordlabel_lists,
                       train_datalabel_lists, train_dataptr_lists,
                       dev_word_lists, dev_data_lists, dev_wordlabel_lists,
                       dev_datalabel_lists, dev_dataptr_lists, word2id, data2id)

    model_name = "bilstm_crf" if crf else "bilstm"
    save_model(bilstm_model, "./ckpts/" + model_name + ".pkl")

    print("Training finished, took {} seconds.".format(int(time.time() - start)))
    print("Evaluating the {} model...".format(model_name))
    pred_tag_lists, test_tag_lists = bilstm_model.test(
        test_word_lists, test_data_lists, test_wordlabel_lists,
        test_datalabel_lists, test_dataptr_lists, word2id, data2id)

    # for pred, gold in zip(pred_tag_lists, test_tag_lists):
    #     print(pred, gold)

    # Token-level accuracy: truncate each prediction to the gold length and
    # count matching tags.
    allnum = 0
    correct = 0
    for pred, gold in zip(pred_tag_lists, test_tag_lists):
        pred = pred[:len(gold)]
        for x, y in zip(pred, gold):
            if x == y:
                correct += 1
            allnum += 1
    print(correct / allnum)
    return correct / allnum

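# The accuracy loop above can be factored into a small helper. This mirrors
# the corrected logic (truncate predictions to the gold length, count
# token-level matches) and is a restatement for reuse, not part of the
# original code.
def token_accuracy(pred_tag_lists, gold_tag_lists):
    correct, total = 0, 0
    for pred, gold in zip(pred_tag_lists, gold_tag_lists):
        pred = pred[:len(gold)]
        for p, g in zip(pred, gold):
            if p == g:
                correct += 1
            total += 1
    return correct / total if total else 0.0
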
def bilstm_train_and_eval(train_data, dev_data, test_data, word2id, tag2id, crf=True, remove_O=False): train_word_lists, train_tag_lists = train_data dev_word_lists, dev_tag_lists = dev_data test_word_lists, test_tag_lists = test_data start = time.time() vocab_size = len(word2id) out_size = len(tag2id) bilstm_model = BILSTM_Model(vocab_size, out_size, crf=crf) bilstm_model.train(train_word_lists, train_tag_lists, dev_word_lists, dev_tag_lists, word2id, tag2id) model_name = "bilstm_crf" if crf else "bilstm" save_model(bilstm_model, "./ckpts/" + model_name + ".pkl") print("训练完毕,共用时{}秒.".format(int(time.time() - start))) print("评估{}模型中...".format(model_name)) pred_tag_lists, test_tag_lists = bilstm_model.test(test_word_lists, test_tag_lists, word2id, tag2id) with open("./result.txt", "a+") as f: for i in range(len(pred_tag_lists)): f.write(pred_tag_lists[i] + " " + pred_tag_lists[i] + "\n") metrics = Metrics(test_tag_lists, pred_tag_lists, remove_O=remove_O) metrics.report_scores() metrics.report_confusion_matrix() return pred_tag_lists