def train(config):
    # tag_length should not include start/end tags
    model = BiLSTM_CRF(config)
    optimizer = optim.Adam(model.parameters(), config.lr)
    # f1 score of validation dataset
    valid_f1 = -1000
    stop = False
    start_t = time.time()
    for epoch in range(config.n_epoch):
        if stop:
            break
        # or model.zero_grad(), since all of the model's parameters are in the optimizer
        optimizer.zero_grad()
        _, batch_inputs, batch_outputs, masks, length = random_batch(
            embeddings, x_train, y_train, config.batch_size)
        loss = model.neg_log_likelihood(batch_inputs, batch_outputs, masks, length)
        loss.backward()
        optimizer.step()
        if (epoch + 1) % config.eval_freq == 0:
            print('Epoch: {:04d}, loss: {:.4f}, seconds: {:.4f}'.format(
                epoch, loss.item(), time.time() - start_t))
            entities, new_valid_f1, prec, recall = get_f1(model, config, test=False)
            print('[Validation] f1 score from {:.6f} to {:.6f}'.format(valid_f1, new_valid_f1))
            print('[Validation] precision: {}, recall: {}\n'.format(prec, recall))
            # stop early once the validation f1 plateaus or degrades
            if epoch > 100000 and (abs(new_valid_f1 - valid_f1) < 0.001
                                   or new_valid_f1 < valid_f1):
                stop = True
            # checkpoint only when the validation f1 improves
            if new_valid_f1 > valid_f1:
                valid_f1 = new_valid_f1
                torch.save(model.state_dict(), config.model_save_path)
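
# Usage sketch for the train(config) variant above. The repo's real config
# object is not shown here, so a SimpleNamespace stands in for it (an
# assumption); the field names simply mirror the attributes train() reads.
# As in the original, embeddings, x_train and y_train must already be in scope.
from types import SimpleNamespace

config = SimpleNamespace(
    lr=1e-3,               # Adam learning rate
    n_epoch=50000,         # one random batch is drawn per "epoch"
    eval_freq=500,         # validate every eval_freq steps
    batch_size=32,         # samples drawn by random_batch per step
    model_save_path='./checkpoints/bilstm_crf_best.pt',
)
# train(config)
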
def train(data_loader, data_size, batch_size, embedding_dim, hidden_dim,
          sentence_length, num_layers, epochs, learning_rate, tag2id,
          model_saved_path, train_log_path, validate_log_path,
          train_history_image_path):
    '''
    data_loader:              dataset loaders, built earlier with load_dataset
    data_size:                number of samples in the train and validation sets
    batch_size:               number of samples per batch
    embedding_dim:            dimension of the character embeddings
    hidden_dim:               dimension of the hidden layer
    sentence_length:          maximum text length
    num_layers:               number of stacked LSTM layers
    epochs:                   number of training epochs
    learning_rate:            learning rate
    tag2id:                   dict mapping tags to ids
    model_saved_path:         path where the model is saved
    train_log_path:           path where the training log is saved
    validate_log_path:        path where the validation log is saved
    train_history_image_path: path where the training-history plots are saved
    '''
    # Load the Chinese character-to-id mapping into memory
    char2id = json.load(
        open("./data/char_to_id.json", mode="r", encoding="utf-8"))

    # Initialize the BiLSTM_CRF model
    model = BiLSTM_CRF(vocab_size=len(char2id), tag_to_ix=tag2id,
                       embedding_dim=embedding_dim, hidden_dim=hidden_dim,
                       batch_size=batch_size, num_layers=num_layers,
                       sequence_length=sentence_length)

    # Define the optimizer. SGD was the original choice (in PyTorch, the
    # optimizers that support GPU-accelerated Embedding updates are SGD and
    # SparseAdam).
    #   lr:       optimizer learning rate
    #   momentum: momentum factor that accelerates gradient descent
    # optimizer = optim.SGD(params=model.parameters(), lr=learning_rate,
    #                       momentum=0.85, weight_decay=1e-4)
    optimizer = optim.Adam(params=model.parameters(), lr=learning_rate,
                           betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-4)

    # Learning-rate update policy for the optimizer:
    #   optimizer: the optimizer to schedule
    #   step_size: update frequency, i.e. decay the learning rate every
    #              step_size epochs
    #   gamma:     decay factor applied to the learning rate relative to the
    #              previous epoch, default 0.1
    # Example with initial lr = 0.5, step_size = 20, gamma = 0.1:
    #   lr = 0.5   if epoch < 20
    #   lr = 0.05  if 20 <= epoch < 40
    #   lr = 0.005 if 40 <= epoch < 60
    # scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=5, gamma=0.8)

    # Containers for the training loss, precision, recall and F1 metrics
    train_loss_list = []
    train_acc_list = []
    train_recall_list = []
    train_f1_list = []
    train_log_file = open(train_log_path, mode="w", encoding="utf-8")
    # Containers for the validation loss, precision, recall and F1 metrics
    validate_loss_list = []
    validate_acc_list = []
    validate_recall_list = []
    validate_f1_list = []
    validate_log_file = open(validate_log_path, mode="w", encoding="utf-8")

    # Build the id-to-tag mapping from tag2id
    id2tag = {v: k for k, v in tag2id.items()}
    # Build the id-to-character mapping from char2id
    id2char = {v: k for k, v in char2id.items()}

    # Loop for the configured number of epochs
    for epoch in range(epochs):
        # Print the current epoch before the progress bar starts
        tqdm.write("Epoch {}/{}".format(epoch + 1, epochs))
        # Totals for correctly predicted, predicted and gold entities
        total_acc_entities_length, \
        total_predict_entities_length, \
        total_gold_entities_length = 0, 0, 0
        # Per-batch step counter, cumulative loss, precision and F1
        step, total_loss, correct, f1 = 1, 0.0, 0, 0
        total_acc, total_recall = 0.0, 0.0

        # Training phase of the current epoch
        for inputs, labels in tqdm(data_loader["train"]):
            # Wrap the tensors in Variable (a no-op since PyTorch 0.4,
            # where Variable was merged into Tensor)
            inputs, labels = Variable(inputs), Variable(labels)
            # Zero the gradients before each batch, otherwise they accumulate
            optimizer.zero_grad()
            # Call neg_log_likelihood() on the BiLSTM_CRF model
            loss = model.neg_log_likelihood(inputs, labels)
            # Convert the current step's loss from a tensor to a number
            step_loss = loss.item()
            # Accumulate the per-step loss
            total_loss += step_loss
            # Decode the best tag path; this calls BiLSTM_CRF.forward()
            best_path_list = model(inputs)
            # Evaluation metrics for the current batch: precision, recall,
            # F1 and the corresponding entity counts
            step_acc, step_recall, f1_score, acc_entities_length, \
            predict_entities_length, gold_entities_length = evaluate(
                inputs.tolist(), labels.tolist(), best_path_list, id2char, id2tag)
            # Per-step training log entry
            '''
            log_text = "Epoch: %s | Step: %s " \
                       "| loss: %.5f " \
                       "| acc: %.5f " \
                       "| recall: %.5f " \
                       "| f1 score: %.5f" % \
                       (epoch, step, step_loss, step_acc, step_recall, f1_score)
            '''
            # Accumulate the correct, predicted and gold entity totals
            total_acc_entities_length += acc_entities_length
            total_predict_entities_length += predict_entities_length
            total_gold_entities_length += gold_entities_length
            # Backpropagate the loss
            loss.backward()
            # Update the parameters via optimizer.step()
            optimizer.step()
            # Write the training log
            # train_log_file.write(log_text + "\n")
            step += 1

        # Mean loss for the current epoch: total loss over the number of
        # training samples
        epoch_loss = total_loss / data_size["train"]
        # Precision for the current epoch
        if total_predict_entities_length > 0:
            total_acc = total_acc_entities_length / total_predict_entities_length
        # Recall for the current epoch
        if total_gold_entities_length > 0:
            total_recall = total_acc_entities_length / total_gold_entities_length
        # F1 for the current epoch
        total_f1 = 0
        if total_acc + total_recall != 0:
            total_f1 = 2 * total_acc * total_recall / (total_acc + total_recall)
        log_text = "Epoch: %s " \
                   "| mean loss: %.5f " \
                   "| total acc: %.5f " \
                   "| total recall: %.5f " \
                   "| total f1 score: %.5f" % (epoch, epoch_loss, total_acc,
                                               total_recall, total_f1)
        # Update the learning rate after the epoch; this must come after the
        # optimizer update
        # scheduler.step()
        # Record the epoch's training loss (for plotting), precision,
        # recall and F1
        train_loss_list.append(epoch_loss)
        train_acc_list.append(total_acc)
        train_recall_list.append(total_recall)
        train_f1_list.append(total_f1)
        train_log_file.write(log_text + "\n")

        # Reset the totals for correctly predicted, predicted and gold entities
        total_acc_entities_length, \
        total_predict_entities_length, \
        total_gold_entities_length = 0, 0, 0
        # Reset the per-batch step counter, cumulative loss, precision and F1
        step, total_loss, correct, f1 = 1, 0.0, 0, 0
        total_acc, total_recall = 0.0, 0.0

        # Validation phase of the current epoch
        with torch.no_grad():
            for inputs, labels in tqdm(data_loader["validation"]):
                # Wrap the tensors in Variable
                inputs, labels = Variable(inputs), Variable(labels)
                # Call neg_log_likelihood() on the BiLSTM_CRF model; it
                # returns the CRF log-likelihood loss. Skip batches where
                # the loss cannot be computed.
                try:
                    loss = model.neg_log_likelihood(inputs, labels)
                except Exception:
                    continue
                # Convert the current step's loss from a tensor to a number
                step_loss = loss.item()
                # Accumulate the per-step loss
                total_loss += step_loss
                # Decode the best tag path; this calls BiLSTM_CRF.forward()
                best_path_list = model(inputs)
                # Evaluation metrics for the current batch: precision,
                # recall, F1 and the corresponding entity counts
                step_acc, step_recall, f1_score, acc_entities_length, \
                predict_entities_length, gold_entities_length = evaluate(
                    inputs.tolist(), labels.tolist(), best_path_list, id2char, id2tag)
                # Per-step validation log entry
                '''
                log_text = "Epoch: %s | Step: %s " \
                           "| loss: %.5f " \
                           "| acc: %.5f " \
                           "| recall: %.5f " \
                           "| f1 score: %.5f" % \
                           (epoch, step, step_loss, step_acc, step_recall, f1_score)
                '''
                # Accumulate the correct, predicted and gold entity totals
                total_acc_entities_length += acc_entities_length
                total_predict_entities_length += predict_entities_length
                total_gold_entities_length += gold_entities_length
                # Write the validation log
                # validate_log_file.write(log_text + "\n")
                step += 1

        # Mean loss for the epoch: total loss over the number of validation
        # samples
        epoch_loss = total_loss / data_size["validation"]
        # Overall precision
        if total_predict_entities_length > 0:
            total_acc = total_acc_entities_length / total_predict_entities_length
        # Overall recall
        if total_gold_entities_length > 0:
            total_recall = total_acc_entities_length / total_gold_entities_length
        # Overall F1
        total_f1 = 0
        if total_acc + total_recall != 0.0:
            total_f1 = 2 * total_acc * total_recall / (total_acc + total_recall)
        log_text = "Epoch: %s " \
                   "| mean loss: %.5f " \
                   "| total acc: %.5f " \
                   "| total recall: %.5f " \
                   "| total f1 score: %.5f" % (epoch, epoch_loss, total_acc,
                                               total_recall, total_f1)
        # Record the epoch's validation loss (for plotting), precision,
        # recall and F1
        validate_loss_list.append(epoch_loss)
        validate_acc_list.append(total_acc)
        validate_recall_list.append(total_recall)
        validate_f1_list.append(total_f1)
        validate_log_file.write(log_text + "\n")

    # Save the model
    torch.save(model.state_dict(), model_saved_path)
    # Close the log files
    train_log_file.close()
    validate_log_file.close()

    # Save the loss history as a plot
    save_train_history_image(train_loss_list, validate_loss_list,
                             train_history_image_path, "Loss")
    # Save the precision history as a plot
    save_train_history_image(train_acc_list, validate_acc_list,
                             train_history_image_path, "Acc")
    # Save the recall history as a plot
    save_train_history_image(train_recall_list, validate_recall_list,
                             train_history_image_path, "Recall")
    # Save the F1 history as a plot
    save_train_history_image(train_f1_list, validate_f1_list,
                             train_history_image_path, "F1")

    print("train Finished".center(100, "-"))
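
# The epoch-level metrics above reduce to entity-level precision, recall and
# F1. A minimal self-contained sketch of that arithmetic (entity_prf is a
# hypothetical helper name, not part of the code above):
def entity_prf(correct, predicted, gold):
    """correct: entities predicted with the exact span and type;
    predicted: all entities the model emitted; gold: all true entities."""
    precision = correct / predicted if predicted > 0 else 0.0
    recall = correct / gold if gold > 0 else 0.0
    f1 = (2 * precision * recall / (precision + recall)
          if precision + recall > 0 else 0.0)
    return precision, recall, f1

# Example: 40 correct out of 50 predicted against 60 gold entities
# -> P = 0.80, R ~ 0.667, F1 ~ 0.727
print(entity_prf(40, 50, 60))
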
# Excerpt from a third variant: the body of the per-batch training loop,
# where loss has already been computed for (sentence_in, targets).
        with torch.no_grad():
            output = model(sentence_in)
            I, O, T = cal_for_acc(output[1], targets)
            inter += I
            out_entity_num += O
            target_entity_num += T
            P = inter / (out_entity_num + 1e-18)
            R = inter / (target_entity_num + 1e-18)
            F1 = 2 * P * R / (P + R + 1e-18)
            print('\rtraining:%d/%d\t loss:%0.5f \t P:%0.5f \t R:%0.5f \t F1:%0.5f'
                  % (iter, len(train), loss_ / (iter + 1), P, R, F1),
                  end='', flush=True)
            # print('training:%d/%d\t loss:%0.5f' % (iter, len(train), loss_ / (iter + 1)))

        # Step 4. Compute the loss and gradients, and update the parameters
        # by calling optimizer.step()
        loss.backward()
        optimizer.step()

    # Validation-set accuracy
    with torch.no_grad():
        F1, avg_loss = f1_in_batch_data(val, model)
    print('\n val_acc: \t epoch: %d \tF1 score: %0.5f \n' % (epoch, F1))
    torch.save(model.state_dict(),
               '../model_dict/hidim512_fulldict_' + str(epoch) + '_'
               + str(F1) + '_params.pkl')
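
# Sketch: picking the best checkpoint saved by the loop above. The filename
# embeds "<epoch>_<F1>", so parsing out the F1 field and taking the max
# selects the strongest model. The helper name best_checkpoint and the glob
# pattern are assumptions layered on the torch.save call above.
import glob

def best_checkpoint(pattern='../model_dict/hidim512_fulldict_*_params.pkl'):
    # Filename layout: hidim512_fulldict_<epoch>_<F1>_params.pkl, so
    # rsplit('_', 2)[1] isolates the F1 string.
    paths = glob.glob(pattern)
    return max(paths, key=lambda p: float(p.rsplit('_', 2)[1])) if paths else None

# model.load_state_dict(torch.load(best_checkpoint()))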