def _eval(data):
    model.eval()  # disable BatchNorm and Dropout updates during evaluation
    y_pred = []
    y_true = []
    with torch.no_grad():
        for batch_data in data_iter(data, test_batch_size, shuffle=False):
            torch.cuda.empty_cache()
            batch_inputs, batch_labels = batch2tensor(batch_data)
            batch_outputs = model(batch_inputs)
            # argmax over class logits -> predicted label ids
            y_pred.extend(torch.max(batch_outputs, dim=1)[1].cpu().numpy().tolist())
            y_true.extend(batch_labels.cpu().numpy().tolist())
    score, dev_f1 = get_score(y_true, y_pred)
    return score, dev_f1
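# The eval loop above relies on project-local helpers data_iter and batch2tensor that
# are defined elsewhere. The sketch below is only a hypothetical illustration of what
# this loop assumes they do (chunking examples into batches, padding token ids, and
# moving tensors to the model's device); the example layout (label_id, token_id_list)
# and the device argument are assumptions, not the project's actual implementation.
import random
import torch


def data_iter(data, batch_size, shuffle=False):
    """Yield lists of at most batch_size examples."""
    indices = list(range(len(data)))
    if shuffle:
        random.shuffle(indices)
    for start in range(0, len(indices), batch_size):
        yield [data[i] for i in indices[start:start + batch_size]]


def batch2tensor(batch_data, device="cuda"):
    """Pad token ids to the batch max length and return (inputs, labels) tensors."""
    labels = torch.tensor([label for label, _ in batch_data], dtype=torch.long, device=device)
    max_len = max(len(token_ids) for _, token_ids in batch_data)
    inputs = torch.zeros(len(batch_data), max_len, dtype=torch.long, device=device)
    for row, (_, token_ids) in enumerate(batch_data):
        inputs[row, :len(token_ids)] = torch.tensor(token_ids, dtype=torch.long)
    return inputs, labels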
def run(mtd="fold_split"):

    def _eval(data):
        model.eval()  # disable BatchNorm and Dropout updates during evaluation
        y_pred = []
        y_true = []
        with torch.no_grad():
            for batch_data in dataset_processer.data_iter(
                    data, config['test_batch_size'], shuffle=False):
                torch.cuda.empty_cache()
                batch_inputs, batch_labels = dataset_processer.batch2tensor(batch_data)
                batch_outputs = model(batch_inputs)
                y_pred.extend(torch.max(batch_outputs, dim=1)[1].cpu().numpy().tolist())
                y_true.extend(batch_labels.cpu().numpy().tolist())
        score, dev_f1 = scores.get_score(y_true, y_pred)
        return score, dev_f1

    if mtd == "fold_split":
        demo_preprocess.split_dataset(raw_path, train_path, dev_path, test_path)
    elif mtd == "process_data":
        demo_preprocess.process_data(config, train_path, dev_path)
    elif mtd == "train":
        Train_data = file_utils.read_json(config["train_set"])
        Dev_data = file_utils.read_json(config["dev_set"])
        # Convert the raw records into the format the model consumes
        train_data = dataset_processer.get_examples(Train_data, label_encoder)
        dev_data = dataset_processer.get_examples(Dev_data, label_encoder)
        del Train_data, Dev_data

        # Number of batches per epoch
        batch_num = int(np.ceil(len(train_data) / float(config["train_batch_size"])))
        print("batch_num:{}".format(batch_num))

        # model is assumed to be constructed/loaded at module level (e.g. resumed
        # from the cfg.RESUME_EPOCH checkpoint); the explicit construction is kept
        # here only as a commented-out reference.
        # model = BertSoftmaxModel(cfg.bert_path, label_encoder)
        optimizer = Optimizer(model.all_parameters, steps=batch_num * config["epochs"])  # optimizer + schedulers
        # criterion = nn.CrossEntropyLoss()
        criterion = loss_factory.focal_loss()

        best_train_f1, best_dev_f1 = 0, 0
        early_stop = -1
        EarlyStopEpochs = 10  # stop when the dev metric has not improved for this many epochs

        print("start train")
        for epoch in range(cfg.RESUME_EPOCH + 1, config["epochs"] + 1):
            optimizer.zero_grad()
            model.train()  # enable BatchNorm and Dropout
            overall_losses = 0
            losses = 0
            y_pred = []
            y_true = []
            step = 0
            for batch_data in dataset_processer.data_iter(
                    train_data, config["train_batch_size"], shuffle=True):
                torch.cuda.empty_cache()
                batch_inputs, batch_labels = dataset_processer.batch2tensor(batch_data)
                batch_outputs = model(batch_inputs)
                # print(batch_outputs.shape)
                loss = criterion(batch_outputs, batch_labels)
                loss.backward()

                loss_value = loss.detach().cpu().item()
                losses += loss_value
                overall_losses += loss_value

                y_pred.extend(torch.max(batch_outputs, dim=1)[1].cpu().numpy().tolist())
                y_true.extend(batch_labels.cpu().numpy().tolist())

                # nn.utils.clip_grad_norm_(optimizer.all_params, max_norm=config["clip"])  # gradient clipping
                for cur_optim, scheduler in zip(optimizer.optims, optimizer.schedulers):
                    cur_optim.step()
                    scheduler.step()
                optimizer.zero_grad()
                step += 1

            overall_losses /= batch_num
            overall_losses = scores.reformat(overall_losses, 4)
            score, train_f1 = scores.get_score(y_true, y_pred)
            print("epoch:{}, train_score:{}, train_f1:{}, overall_loss:{}".format(
                epoch, score, train_f1, overall_losses))

            # if set(y_true) == set(y_pred):
            #     report = classification_report(y_true, y_pred, digits=4,
            #                                    target_names=label_encoder.target_names)
            #     print(report)

            # eval
            _, dev_f1 = _eval(data=dev_data)
            if best_dev_f1 < dev_f1:
                best_dev_f1 = dev_f1
                early_stop = 0
                best_train_f1 = train_f1
                save_path = model_utils.save_checkpoint(
                    model, epoch,
                    save_folder=os.path.join(cfg.proj_path, "data/bert_nn"))
                print("save_path:{}".format(save_path))
                # torch.save(model.state_dict(), save_model)
            else:
                early_stop += 1
                if early_stop == EarlyStopEpochs:  # reached the early-stopping patience
                    break
            print("early_stop:{}, score:{}, dev_f1:{}, best_train_f1:{}, best_dev_f1:{}".format(
                early_stop, score, dev_f1, best_train_f1, best_dev_f1))
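# loss_factory.focal_loss() is a project-local factory that is not shown here. The
# sketch below is a common multi-class focal-loss formulation it might return; the
# gamma default and the factory signature are assumptions, not the project's settings.
# It is drop-in compatible with criterion(batch_outputs, batch_labels) above, behaving
# like nn.CrossEntropyLoss but down-weighting easy, well-classified examples.
import torch
import torch.nn as nn
import torch.nn.functional as F


class FocalLoss(nn.Module):
    """Multi-class focal loss over raw logits."""

    def __init__(self, gamma=2.0):
        super().__init__()
        self.gamma = gamma

    def forward(self, logits, targets):
        # Per-sample cross-entropy, then scale by (1 - p_t)^gamma.
        ce = F.cross_entropy(logits, targets, reduction="none")
        pt = torch.exp(-ce)  # probability assigned to the true class
        return ((1.0 - pt) ** self.gamma * ce).mean()


def focal_loss(gamma=2.0):
    """Factory mirroring how loss_factory.focal_loss() is called above (assumed signature)."""
    return FocalLoss(gamma=gamma)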
def run(method="train", save_path=None, infer_texts=[]):
    shuffle_slicer = ShuffleSlicer()

    raw_data_path = "/home/wujinjie/kesci_question_multilabel_classification/data/raw_data/baidu/nlp_db.baidu_text.csv"
    texts = pd.read_csv(raw_data_path)
    train_df, dev_df, test_df = shuffle_slicer.split(texts, dev=True)

    clip = 5.0
    epochs = 100
    # log_interval = 50
    test_batch_size = 128
    train_batch_size = 128

    train_texts, train_labels = process_corpus_dl(train_df)
    Train_data = {'label': train_labels, 'text': train_texts}
    dev_texts, dev_labels = process_corpus_dl(dev_df)
    Dev_data = {'label': dev_labels, 'text': dev_texts}

    vocab = Vocab(Train_data)
    step = 0

    def _eval(data):
        model.eval()  # disable BatchNorm and Dropout updates during evaluation
        y_pred = []
        y_true = []
        with torch.no_grad():
            for batch_data in data_iter(data, test_batch_size, shuffle=False):
                torch.cuda.empty_cache()
                batch_inputs, batch_labels = batch2tensor(batch_data)
                batch_outputs = model(batch_inputs)
                y_pred.extend(torch.max(batch_outputs, dim=1)[1].cpu().numpy().tolist())
                y_true.extend(batch_labels.cpu().numpy().tolist())
        score, dev_f1 = get_score(y_true, y_pred)
        return score, dev_f1

    def _infer(data):
        model.eval()
        y_pred = []
        with torch.no_grad():
            for batch_data in data_iter(data, test_batch_size, shuffle=False):
                torch.cuda.empty_cache()
                batch_inputs, batch_labels = batch2tensor(batch_data)
                batch_outputs = model(batch_inputs)
                y_pred.extend(torch.max(batch_outputs, dim=1)[1].cpu().numpy().tolist())
        print(label_encoder.label2name(y_pred))

    if method == "train":
        model = Model(vocab, label_encoder)
        criterion = nn.CrossEntropyLoss()

        # Convert the raw records into the format the model consumes
        train_data = get_examples_bert(Train_data, model.word_encoder, vocab, label_encoder)
        dev_data = get_examples_bert(Dev_data, model.word_encoder, vocab, label_encoder)

        # Number of batches per epoch
        batch_num = int(np.ceil(len(train_data) / float(train_batch_size)))
        optimizer = Optimizer(model.all_parameters, steps=batch_num * epochs)  # optimizer + schedulers

        best_train_f1, best_dev_f1 = 0, 0
        early_stop = -1
        EarlyStopEpochs = 3  # stop when the dev metric has not improved for this many epochs

        print("start train")
        for epoch in range(1, epochs + 1):
            optimizer.zero_grad()
            model.train()  # enable BatchNorm and Dropout
            overall_losses = 0
            losses = 0
            y_pred = []
            y_true = []
            for batch_data in data_iter(train_data, train_batch_size, shuffle=True):
                torch.cuda.empty_cache()
                batch_inputs, batch_labels = batch2tensor(batch_data)
                batch_outputs = model(batch_inputs)
                loss = criterion(batch_outputs, batch_labels)
                loss.backward()

                loss_value = loss.detach().cpu().item()
                losses += loss_value
                overall_losses += loss_value

                y_pred.extend(torch.max(batch_outputs, dim=1)[1].cpu().numpy().tolist())
                y_true.extend(batch_labels.cpu().numpy().tolist())

                nn.utils.clip_grad_norm_(optimizer.all_params, max_norm=clip)  # gradient clipping
                for cur_optim, scheduler in zip(optimizer.optims, optimizer.schedulers):
                    cur_optim.step()
                    scheduler.step()
                optimizer.zero_grad()
                step += 1

            print(epoch)
            overall_losses /= batch_num
            overall_losses = reformat(overall_losses, 4)
            score, train_f1 = get_score(y_true, y_pred)
            print("score:{}, train_f1:{}".format(score, train_f1))

            # if set(y_true) == set(y_pred):
            #     report = classification_report(y_true, y_pred, digits=4,
            #                                    target_names=label_encoder.target_names)
            #     print(report)

            # eval
            _, dev_f1 = _eval(data=dev_data)
            if best_dev_f1 <= dev_f1:
                best_dev_f1 = dev_f1
                early_stop = 0
                best_train_f1 = train_f1
                save_path = model_utils.save_checkpoint(
                    model, epoch,
                    save_folder="/home/wujinjie/kesci_question_multilabel_classification/data/textbert")
                print("save_path:{}".format(save_path))
                # torch.save(model.state_dict(), save_model)
            else:
                early_stop += 1
                if early_stop == EarlyStopEpochs:  # reached the early-stopping patience
                    break
            print("score:{}, dev_f1:{}, best_train_f1:{}, best_dev_f1:{}".format(
                score, dev_f1, best_train_f1, best_dev_f1))
    else:
        model = model_utils.load_checkpoint(save_path)

    if method == "test":
        # Evaluate on the held-out test split (the original code mistakenly reused train_df here)
        test_texts, test_labels = process_corpus_dl(test_df)
        Test_data = {'label': test_labels, 'text': test_texts}
        test_data = get_examples_bert(Test_data, model.word_encoder, vocab, label_encoder)
        _, dev_f1 = _eval(data=test_data)
        print(dev_f1)
    elif method == "infer":
        infer_texts = list(map(segment, infer_texts))
        Infer_data = {'label': [0] * len(infer_texts), 'text': infer_texts}
        infer_data = get_examples_bert(Infer_data, model.word_encoder, vocab, label_encoder)
        _infer(data=infer_data)
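# Illustrative driver for the run() entry point above. The checkpoint path and the
# inference text are placeholders, not values taken from the project.
if __name__ == "__main__":
    # Train from scratch; the best checkpoint path is printed during training.
    run(method="train")

    # Later, reuse a saved checkpoint for test evaluation or inference, e.g.:
    # run(method="test", save_path="/path/to/saved/checkpoint")
    # run(method="infer",
    #     save_path="/path/to/saved/checkpoint",
    #     infer_texts=["example question text"])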
                overall_losses += loss_value

                y_pred.extend(torch.max(batch_outputs, dim=1)[1].cpu().numpy().tolist())
                y_true.extend(batch_labels.cpu().numpy().tolist())

                nn.utils.clip_grad_norm_(optimizer.all_params, max_norm=clip)  # gradient clipping
                for cur_optim, scheduler in zip(optimizer.optims, optimizer.schedulers):
                    cur_optim.step()
                    scheduler.step()
                optimizer.zero_grad()
                step += 1

            print(epoch)
            overall_losses /= batch_num
            overall_losses = reformat(overall_losses, 4)
            score, train_f1 = get_score(y_true, y_pred)
            print("score:{}, train_f1:{}".format(score, train_f1))

            if set(y_true) == set(y_pred):
                print("report")
                report = classification_report(y_true, y_pred, digits=4,
                                               target_names=vocab.target_names)
                print(report)

            # eval
            model.eval()  # disable BatchNorm and Dropout updates during evaluation
            data = dev_data
            y_pred = []
            y_true = []
            with torch.no_grad():
                for batch_data in data_iter(data, test_batch_size, shuffle=False):
                    torch.cuda.empty_cache()
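# get_score and reformat come from the project's scoring utilities and are not shown
# in this section. The helpers below are a plausible minimal version inferred from how
# they are used above (a composite score plus macro-F1, rounded for logging); they are
# an assumption, not the project's actual implementation.
from sklearn.metrics import accuracy_score, f1_score


def reformat(value, n_digits):
    """Round a float metric for logging."""
    return round(float(value), n_digits)


def get_score(y_true, y_pred):
    """Return (accuracy, macro-F1), both rounded to four decimals."""
    acc = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    return reformat(acc, 4), reformat(macro_f1, 4)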