def main():
    """Entry point: fine-tune and/or evaluate a 5-class sequence-classification model.

    Flow:
      1. Parse CLI arguments and set up output/log directories.
      2. Select device (single/multi-GPU or distributed) and fix all RNG seeds.
      3. Load train/dev/test TSV splits and a pretrained tokenizer + model.
      4. --do_train: train with gradient accumulation, periodically evaluate on
         dev, log to TensorBoard, checkpoint on best dev accuracy, early-stop.
      5. --do_eval: reload the best checkpoint and score the test set, writing
         predictions and logits to the output directory.

    Relies on module-level names: argparse, os, time, random, np, pd, torch,
    logger, AutoTokenizer, AutoModelForSequenceClassification, AdamW,
    get_linear_schedule_with_warmup, TensorDataset, DataLoader, SummaryWriter,
    trange, tqdm, classifiction_metric, evaluate.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--bert_model", default='bert-base-cased', type=str,
                        help="transformers中的模型都可: bert-base-uncased, roberta-base.")
    parser.add_argument("--output_dir", default='output', type=str,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--output_file",
                        # default='output_batch4_gpu4_large_qo_lamda10_fp16.txt',
                        default='output_file.txt', type=str,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--train_file", default='data/sem/ntrain.tsv', type=str)
    parser.add_argument("--test_file", default='data/sem/ntest.tsv', type=str)
    parser.add_argument("--dev_file", default='data/sem/ndev.tsv', type=str)
    # NOTE(review): help text below looks copy-pasted from --loss_scale; --n_gpu
    # is actually the number of GPUs to use.
    parser.add_argument('--n_gpu', type=int, default=2,
                        help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
    parser.add_argument("--max_seq_length", default=512, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--train_batch_size", default=4, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=4, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-6, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=50.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--do_train", default=False, action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", default=False, action='store_true',
                        help="Whether to run eval on the dev set.")
    # Needed when using an uncased (lowercase) model.
    # NOTE(review): default=True combined with action='store_true' means this
    # is ALWAYS True and the flag cannot be turned off from the command line.
    parser.add_argument("--do_lower_case",
                        default=True, action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", default=False, action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--optimize_on_cpu', default=False, action='store_true',
                        help="Whether to perform optimization and keep the optimizer averages on CPU")
    parser.add_argument('--fp16', default=False, action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=4,  # originally 4
                        help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
    # Extra dev-set / logging options.
    parser.add_argument("--dev_batch_size", default=8, type=int,
                        help="Total batch size for dev.")
    parser.add_argument("--print_step", default=50, type=int,
                        help="多少步进行模型保存以及日志信息写入")
    parser.add_argument("--early_stop", type=int, default=50,
                        help="提前终止,多少次dev acc 不再连续增大,就不再训练")
    # NOTE(review): type=list on argparse splits a command-line value into
    # characters; only safe because the default list is always used.
    parser.add_argument("--label_list", default=["0", "1", "2", "3", "4"], type=list,
                        help="我自己加的类别标签")
    parser.add_argument("--predict_test_file", default='ntest_sg_label.tsv', type=str)
    parser.add_argument("--log_dir", default="log_dir", type=str,
                        help="日志目录,主要用于 tensorboard 分析")
    args = parser.parse_args()
    logger.info(args)
    output_eval_file = os.path.join(args.output_dir, args.output_file)
    os.makedirs(args.output_dir, exist_ok=True)
    os.makedirs(args.log_dir, exist_ok=True)  # do not raise if the directory already exists
    with open(output_eval_file, "w") as writer:
        writer.write("%s\t\n" % args)
    # Device selection: single-process (possibly multi-GPU) vs distributed.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = args.n_gpu
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = args.n_gpu
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))
    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))
    # Per-step micro-batch size; the effective batch size stays as requested.
    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)
    # Reproducibility: seed every RNG source and disable cuDNN nondeterminism.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)  # seed all GPUs
    torch.backends.cudnn.enabled = False
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(args.seed)  # disable hash randomization so runs are reproducible

    def seed_worker(worker_id):
        # Seed each DataLoader worker from the torch base seed for determinism.
        worker_seed = torch.initial_seed() % 2 ** 32
        np.random.seed(worker_seed)
        random.seed(worker_seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")
    # Read the TSV splits into DataFrames (expects 'sentence' and 'label' columns).
    df_train = pd.read_csv(args.train_file, sep='\t')
    df_dev = pd.read_csv(args.dev_file, sep='\t')
    df_test = pd.read_csv(args.test_file, sep='\t')
    # Load the pretrained Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    model = AutoModelForSequenceClassification.from_pretrained(args.bert_model, num_labels=5,
                                                               output_attentions=False,
                                                               output_hidden_states=False)
    # tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    # model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=5,
    #     output_attentions=False, output_hidden_states=False)
    model.to(device)
    if args.fp16:
        model.half()
        model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)
    param_optimizer = list(model.named_parameters())
    # hack to remove pooler, which is not used
    # thus it produce None grad that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    # No weight decay for bias / LayerNorm-style parameters.
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]

    def encode_fn(text_list):
        # Tokenize each text to a fixed-length (128) id tensor and stack them.
        all_input_ids = []
        for text in text_list:
            # NOTE(review): max_length is hard-coded to 128 here (ignores
            # --max_seq_length); flagged in the original as needing a change.
            input_ids = tokenizer.encode(text, add_special_tokens=True, max_length=128,
                                         return_tensors='pt', pad_to_max_length=True)
            all_input_ids.append(input_ids)
        all_input_ids = torch.cat(all_input_ids, dim=0)
        return all_input_ids

    criterion = torch.nn.CrossEntropyLoss()  # passed to evaluate(); training loss comes from the model
    criterion = criterion.to(device)
    if args.do_train:
        # Create the data loader
        train_text_values = df_train['sentence'].values
        all_input_ids = encode_fn(train_text_values)
        labels = df_train['label'].values
        labels = torch.tensor(labels - 1)  # shift labels so they start at 0
        train_data = TensorDataset(all_input_ids, labels)
        train_dataloader = DataLoader(train_data, batch_size=args.train_batch_size,
                                      shuffle=True, worker_init_fn=seed_worker)  # _init_fn
        dev_text_values = df_dev['sentence'].values
        dall_input_ids = encode_fn(dev_text_values)
        dlabels = df_dev['label'].values
        dlabels = torch.tensor(dlabels - 1)  # shift labels so they start at 0
        dev_data = TensorDataset(dall_input_ids, dlabels)
        dev_dataloader = DataLoader(dev_data, batch_size=args.dev_batch_size,
                                    worker_init_fn=seed_worker)
        num_train_steps = int(
            len(df_train) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)
        # create optimizer and learning rate schedule
        # correct_bias=False reproduces the original BertAdam behavior.
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False)
        # total_steps = len(train_dataloader) * args.epoch
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(args.warmup_proportion * num_train_steps),
            num_training_steps=num_train_steps)  # warmup steps derived from warmup_proportion
        logger.info("***** Running training *****transformers")
        logger.info(" Num examples = %d", len(df_train))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_steps)
        logger.info("***** Running dev *****")
        logger.info(" Num examples = %d", len(df_dev))
        logger.info(" Batch size = %d", args.dev_batch_size)
        with open(output_eval_file, "a") as writer:
            writer.write("\t\n***** Running training *****transformers\t\n")
            writer.write(" Num examples = %d\t\n" % len(df_train))
            writer.write(" Batch size = %d\t\n" % args.train_batch_size)
            writer.write(" Num steps = %d\t\n" % num_train_steps)
            writer.write("\t\n***** Running dev *****transformers\t\n")
            writer.write(" Num examples = %d\t\n" % len(df_dev))
            writer.write(" Batch size = %d\t\n" % args.dev_batch_size)
        global_step = 0
        best_acc = 0
        early_stop_times = 0
        # TensorBoard writer; note this rebinds `writer` (was the file handle above).
        writer = SummaryWriter(
            log_dir=args.log_dir + '/' + time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime(time.time())))
        num_model = 0
        num_bestacc = 0
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            if early_stop_times >= args.early_stop:
                print('early_stop......')
                break
            print(f'---------------- Epoch: {epoch + 1:02} ----------')
            epoch_loss = 0
            all_preds = np.array([], dtype=int)
            all_labels = np.array([], dtype=int)
            train_steps = 0
            # ncols=50 shortens the tqdm progress bar (default width is larger).
            for step, batch in enumerate(tqdm(train_dataloader, ncols=50, desc="Iteration")):
                # Called per batch so the model is back in train mode after each
                # dev evaluation below.
                model.train()
                # Gradient accumulation: instead of updating after every batch,
                # gradients are accumulated over several batches and the
                # optimizer step / zero_grad happens once per accumulation window.
                # Step 1: forward pass — inputs and labels in, loss and logits out.
                out1 = model(batch[0].to(device), token_type_ids=None,
                             attention_mask=(batch[0] > 0).to(device),
                             labels=batch[1].to(device))
                loss, logits = out1[:2]
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.fp16 and args.loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * args.loss_scale
                # Step 2: backward pass. 2.1 normalize loss over the accumulation window.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                train_steps += 1
                # 2.2 back propagation
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()  # backward pass to compute gradients
                # Bookkeeping for plots / metrics.
                epoch_loss += loss.item()
                preds = logits.detach().cpu().numpy()
                outputs = np.argmax(preds, axis=1)
                all_preds = np.append(all_preds, outputs)
                label_ids = batch[1].to('cpu').numpy()
                all_labels = np.append(all_labels, label_ids)
                # Step 3: after accumulating gradients for the configured number
                # of batches, update parameters and reset gradients.
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # Gradient clipping is no longer built into AdamW; clip the
                    # global norm to 1 to guard against exploding gradients
                    # (training-time only).
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
                    optimizer.step()   # update weights
                    scheduler.step()   # advance the LR schedule
                    optimizer.zero_grad()  # reset gradients
                    global_step += 1
                    # Periodic dev evaluation for tuning / checkpointing.
                    if global_step % args.print_step == 0 and global_step != 0:
                        num_model += 1
                        train_loss = epoch_loss / train_steps
                        train_acc, train_report = classifiction_metric(all_preds, all_labels, args.label_list)
                        dev_loss, dev_acc, dev_report, _, _, _ = evaluate(model, dev_dataloader,
                                                                          criterion, device, args.label_list)
                        c = global_step // args.print_step
                        writer.add_scalar("loss/train", train_loss, c)
                        writer.add_scalar("loss/dev", dev_loss, c)
                        writer.add_scalar("micro_f1/train", train_acc, c)  # formerly acc/train
                        writer.add_scalar("micro_f1/dev", dev_acc, c)      # formerly acc/dev
                        for label in args.label_list:
                            writer.add_scalar(label + "_" + "f1/train", train_report[label]['f1-score'], c)
                            writer.add_scalar(label + "_" + "f1/dev", dev_report[label]['f1-score'], c)
                        print_list = ['macro', 'weighted']
                        for label in print_list:
                            writer.add_scalar(label + "_avg_" + "f1/train",
                                              train_report[label + ' avg']['f1-score'], c)
                            writer.add_scalar(label + "_avg_" + "f1/dev",
                                              dev_report[label + ' avg']['f1-score'], c)
                        # Keep the checkpoint with the best dev accuracy.
                        if dev_acc > best_acc:
                            num_bestacc += 1
                            best_acc = dev_acc
                            # Save a trained model
                            model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
                            output_model_file = os.path.join(args.output_dir, "_pytorch_model.bin")
                            torch.save(model_to_save.state_dict(), output_model_file)
                            early_stop_times = 0
                        else:
                            early_stop_times += 1
        with open(output_eval_file, "a") as writer:
            writer.write("\t\n***** Ending dev *****transformers\t\n")
            writer.write(" global_step : %d\t\n" % global_step)
            writer.write(" num_model : %d\t\n" % num_model)
            writer.write(" num_bestacc : %d\t\n" % num_bestacc)
    if args.do_eval:
        # Build a DataFrame holding text, gold label and predicted label,
        # written out as a labelled prediction file.
        df = pd.DataFrame(columns=['text', 'label', 'predict_label'])
        df['text'] = df_test['sentence']
        # Create the test data loader
        test_text_values = df_test['sentence'].values
        tall_input_ids = encode_fn(test_text_values)
        tlabels = df_test['label'].values
        tlabels = torch.tensor(tlabels - 1)  # shift labels so they start at 0
        pred_data = TensorDataset(tall_input_ids, tlabels)
        pred_dataloader = DataLoader(pred_data, batch_size=args.eval_batch_size,
                                     worker_init_fn=seed_worker)
        logger.info("***** Running evaluation *****transformers")
        logger.info(" Num examples = %d", len(df_test))
        logger.info(" Batch size = %d", args.eval_batch_size)
        output_eval_file = os.path.join(args.output_dir, "result.txt")
        output_model_file = os.path.join(args.output_dir, "_pytorch_model.bin")
        # Reload the best checkpoint saved during training.
        model_state_dict = torch.load(output_model_file)
        model = AutoModelForSequenceClassification.from_pretrained(args.bert_model,
                                                                   num_labels=5,
                                                                   state_dict=model_state_dict,
                                                                   output_attentions=False,
                                                                   output_hidden_states=False)
        # model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=5,
        #     state_dict=model_state_dict, output_attentions=False, output_hidden_states=False)
        model.to(device)
        logger.info("Start evaluating")
        print("=======================")
        print("test_total...")
        _, eval_accuracy, eval_report, all_logits, all_preds, all_labels = evaluate(
            model, pred_dataloader, criterion, device, args.label_list)
        df['predict_label'] = all_preds
        df['label'] = all_labels
        ntest_sg_label = os.path.join(args.output_dir, args.predict_test_file)
        df.to_csv(ntest_sg_label, sep='\t')
        eval_macro_f1 = eval_report['macro avg']['f1-score']
        result = {'eval_accuracy': eval_accuracy, 'eval_macro_f1': eval_macro_f1}
        with open(output_eval_file, "a") as writer:
            writer.write("***** Running evaluation *****transformers\t\n")
            writer.write(" Num examples = %d\t\n" % df.shape[0])
            writer.write(" Batch size = %d\t\n" % args.eval_batch_size)
            logger.info("***** Eval results *****transformers")
            writer.write("\t\n***** Eval results %s *****transformers\t\n" % (
                time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))))
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\t" % (key, str(result[key])))
            writer.write("\t\n")
        np.savetxt(args.output_dir + '/all_logits_transf.txt', all_logits.reshape(-1, 5))
def train(args, config, tokenizer, model):
    """Train `model` on the posterior-augmented training split.

    Steps:
      1. Load posterior dictionary + examples, convert to features, and build
         a (cycled) DataLoader with a random or distributed sampler.
      2. Build an AdamW optimizer (no weight decay on bias/LayerNorm params)
         with a linear warmup schedule.
      3. Loop for `args.train_steps * args.gradient_accumulation_steps` steps:
         forward, average/scale/normalize losses, backward, and update
         parameters once per accumulation window.
      4. Every `args.eval_steps` updates: run perplexity evaluation plus
         per-category generation (BLEU / dist-1 / dist-2), and checkpoint
         whenever the overall BLEU is at least the best seen so far.

    Relies on module-level names: pickle, os, torch, logger, tqdm, cycle,
    read_examples, convert_examples_to_features, TensorDataset, DataLoader,
    RandomSampler, DistributedSampler, AdamW,
    get_linear_schedule_with_warmup, evaluate, test, dist1, dist2.
    """
    # Load and prepare data.
    posterior_dic = pickle.load(
        open(os.path.join(args.posterior_dir, 'posterior-trn.pkl'), 'rb'))
    train_examples = read_examples(os.path.join(args.data_dir, 'ppl-trn.pkl'),
                                   posterior_dic=posterior_dic)
    train_features = convert_examples_to_features(train_examples, tokenizer, args,
                                                  stage='training')
    all_event_ids = torch.tensor([f.event_ids for f in train_features], dtype=torch.long)
    all_context_ids = torch.tensor([f.context_ids for f in train_features], dtype=torch.long)
    all_target_ids = torch.tensor([f.target_ids for f in train_features], dtype=torch.long)
    all_posterior = torch.tensor([f.posterior for f in train_features], dtype=torch.long)
    train_data = TensorDataset(all_event_ids, all_context_ids, all_target_ids, all_posterior)
    if args.local_rank == -1:
        train_sampler = RandomSampler(train_data)
    else:
        train_sampler = DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                  batch_size=args.train_batch_size)
    # cycle() makes the loader infinite; training length is step-driven below.
    train_dataloader = cycle(train_dataloader)
    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    # hack to remove pooler, which is not used
    # thus it produce None grad that break apex
    param_optimizer = [n for n in param_optimizer]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': args.weight_decay
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=args.train_steps)
    # Running training
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_examples))
    logger.info(" Batch size= %d", args.train_batch_size)
    logger.info(" Batch size (including gradient_accumulation_steps)= %d",
                args.train_batch_size * args.gradient_accumulation_steps)
    logger.info(" Num steps = %d",
                args.train_steps * args.gradient_accumulation_steps)
    dev_dataset = {}
    model.train()
    # NOTE(review): `best_bleu` appears twice in this unpacking — harmless
    # (both assign 0) but likely a typo in the original.
    global_step, tr_re_loss, tr_context_loss, tr_reward, tr_clf_loss, nb_tr_examples, nb_tr_steps, best_bleu, best_bleu, eval_flag = 0, 0, 0, 0, 0, 0, 0, 0, 0, True
    bar = tqdm(range(args.train_steps * args.gradient_accumulation_steps),
               total=args.train_steps * args.gradient_accumulation_steps)
    for step in bar:
        batch = next(train_dataloader)
        batch = tuple(t.to(args.device) for t in batch)
        event_ids, context_ids, target_ids, posterior = batch
        # First pass (no grad): sample context ids from the posterior.
        with torch.no_grad():
            context_ids, context_ids_random = model(context_ids=context_ids,
                                                    posterior=posterior)
        (re_loss, context_loss, reward), _, _ = model(event_ids=event_ids,
                                                      context_ids=context_ids,
                                                      context_ids_random=context_ids_random,
                                                      target_ids=target_ids,
                                                      posterior=posterior)
        # mean() to average on multi-gpu.
        if args.n_gpu > 1:
            re_loss = re_loss.mean()
            context_loss = context_loss.mean()
            reward = reward.mean()
        # Static loss scaling for fp16.
        if args.fp16 and args.loss_scale != 1.0:
            re_loss = re_loss * args.loss_scale
            context_loss = context_loss * args.loss_scale
            reward = reward * args.loss_scale
        # Normalize over the gradient-accumulation window.
        if args.gradient_accumulation_steps > 1:
            re_loss = re_loss / args.gradient_accumulation_steps
            context_loss = context_loss / args.gradient_accumulation_steps
            reward = reward / args.gradient_accumulation_steps
        # Print loss information on the progress bar.
        tr_re_loss += re_loss.item()
        tr_context_loss += context_loss.item()
        tr_reward += reward.item()
        train_re_loss = round(
            tr_re_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1), 4)
        train_context_loss = round(
            tr_context_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1), 4)
        train_reward = round(
            tr_reward * args.gradient_accumulation_steps / (nb_tr_steps + 1), 4)
        bar.set_description("re_loss {}, context_loss {}, reward {}".format(
            train_re_loss, train_context_loss, train_reward))
        nb_tr_examples += event_ids.size(0)
        nb_tr_steps += 1
        # Backward.
        loss = re_loss + context_loss
        if args.fp16:
            optimizer.backward(loss)
        else:
            loss.backward()
        # Update parameters once per accumulation window.
        if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0:
            if args.fp16:
                # NOTE(review): `warmup_linear` is not defined in this function;
                # the fp16 path would raise NameError unless it exists at module
                # level — confirm before enabling --fp16.
                lr_this_step = args.learning_rate * warmup_linear.get_lr(
                    global_step, args.warmup_proportion)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
            scheduler.step()
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1
            eval_flag = True
        # Running evaluation (once per eval_steps updates, at most once per update).
        if ((global_step + 1) % args.eval_steps == 0) and eval_flag:
            tr_re_loss, tr_context_loss, tr_reward, tr_clf_loss, nb_tr_examples, nb_tr_steps, eval_flag = 0, 0, 0, 0, 0, 0, False
            result = evaluate(args, config, tokenizer, model,
                              os.path.join(args.data_dir, 'ppl-dev.pkl'),
                              num_sample=10000)
            # Relation categories depend on the dataset.
            if 'event2mind' in args.data_dir:
                category = ["<oReact>", "<xIntent>", "<xReact>"]
            else:
                category = [
                    "<oEffect>", "<oReact>", "<oWant>", "<xAttr>", "<xEffect>",
                    "<xIntent>", "<xNeed>", "<xReact>", "<xWant>"
                ]
            overall_bleu = 0
            overall_dist = []
            for c in category:
                bleu, dist = test(args, config, tokenizer, model,
                                  os.path.join(args.data_dir, 'gen-dev.pkl'), c, 10,
                                  3000 // len(category))
                result[c + ' (bleu,dist1,dist2)'] = [
                    bleu, dist1(dist), dist2(dist)
                ]
                result[c + ' (bleu,dist1,dist2)'] = ' '.join(
                    [str(x) for x in result[c + ' (bleu,dist1,dist2)']])
                overall_bleu += bleu
                overall_dist += dist
            overall_bleu = round(overall_bleu / len(category), 1)
            result['Overall (bleu-2,dist1,dist2)'] = [
                overall_bleu, dist1(overall_dist), dist2(overall_dist)
            ]
            result['Overall (bleu-2,dist1,dist2)'] = ' '.join(
                [str(x) for x in result['Overall (bleu-2,dist1,dist2)']])
            result['global_step'] = global_step + 1
            result['train_loss'] = round(train_re_loss, 5)
            logger.info("***** Result *****")
            # Print result.
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
            logger.info(" " + "*" * 20)
            # Checkpoint on best (>=) overall BLEU.
            if overall_bleu >= best_bleu:
                logger.info(" Best bleu:%s", overall_bleu)
                logger.info(" " + "*" * 20)
                best_bleu = overall_bleu
                # Save a trained model
                model_to_save = model.module if hasattr(
                    model, 'module') else model  # Only save the model it-self
                output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
                torch.save(model_to_save.state_dict(), output_model_file)
def fit(model, training_iter, eval_iter, num_epoch, pbar, num_train_steps, verbose=1):
    """Train and validate a span/classification model for `num_epoch` epochs.

    Args:
        model: the model to train; moved onto the selected device here.
        training_iter: iterable of training batches
            (input_ids, input_mask, segment_ids, obj_ids, express_ids, _,
             start_poses, end_poses).
        eval_iter: iterable of validation batches with the same layout.
        num_epoch: number of epochs to run.
        pbar: progress reporter with a ``show_process(...)`` method.
        num_train_steps: total optimizer steps, used for the LR schedule.
        verbose: kept for interface compatibility (currently unused —
            the per-`verbose`-epoch bookkeeping is commented out below).

    Side effects:
        Saves the best model (by dev object-word F1) via ``save_model`` and
        writes a loss/acc/F1 plot to ``args.output_dir``.

    Relies on module-level names: args, logger, time, torch, AdamW,
    get_linear_schedule_with_warmup, set_seed, loss_fn, evaluate,
    evaluate_pos, save_model, loss_acc_f1_plot, and (when --fp16) apex `amp`.

    Bug fixed vs. original: in the fp16 branch the original called
    ``optimizer.backward(train_loss)`` *after* the ``amp.scale_loss``
    backward — AdamW has no ``backward`` method (AttributeError), and the
    intent would double-accumulate gradients. The stray call is removed;
    the amp-scaled backward alone is correct for apex fp16.
    """
    # ---------------- Device selection ----------------
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()  # multi-GPU
        # n_gpu = 1
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
    model.to(device)
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))
    # ---------------- Optimizer ----------------
    t_total = num_train_steps
    # Prepare optimizer and scheduler (linear warmup and decay);
    # no weight decay on bias / LayerNorm weights.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_step, num_training_steps=t_total
    )
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
    # multi-gpu training (should be after apex fp16 initialization)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank,
            find_unused_parameters=True
        )
    # ---------------- Model init ----------------
    model.zero_grad()
    set_seed(args)
    # History accumulators for the final loss/acc/F1 plot.
    global_train_loss, global_eval_loss = [], []
    train_acc_obj_class_word, train_f1_obj_class_word = [], []
    train_acc_express_word, train_f1_express_word = [], []
    eval_acc_obj_class_word, eval_f1_obj_class_word = [], []
    eval_acc_express_word, eval_f1_express_word = [], []
    history = {
        "train_loss": global_train_loss,
        "eval_loss": global_eval_loss,
        "train_acc_obj_class_word": train_acc_obj_class_word,
        "train_f1_obj_class_word": train_f1_obj_class_word,
        "train_acc_express_word": train_acc_express_word,
        "train_f1_express_word": train_f1_express_word,
        "eval_acc_obj_class_word": eval_acc_obj_class_word,
        "eval_f1_obj_class_word": eval_f1_obj_class_word,
        "eval_acc_express_word": eval_acc_express_word,
        "eval_f1_express_word": eval_f1_express_word
    }
    # ---------------- Training ----------------
    start = time.time()
    best_obj_word_f1 = 0
    global_step = 0
    model.zero_grad()
    set_seed(args)
    for e in range(num_epoch):
        model.train()
        train_obj_predicts, train_obj_labels, train_express_predicts, train_express_labels = [], [], [], []
        loss_epoch = 0
        for step, batch in enumerate(training_iter):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, obj_ids, express_ids, _, start_poses, end_poses = batch
            # Forward: object-class-word logits, expression-word logits, and
            # answer-span start/end logits.
            obj_classify, express_classify, start_logits, end_logits, _, _ = model(
                input_ids, segment_ids, input_mask)
            train_loss = loss_fn(obj_classify, obj_ids, express_classify, express_ids,
                                 start_poses, end_poses, start_logits, end_logits)
            if n_gpu > 1:
                train_loss = train_loss.mean()  # average across DataParallel replicas
            if args.gradient_accumulation_steps > 1:
                train_loss = train_loss / args.gradient_accumulation_steps
            if args.fp16:
                # amp scales the loss to avoid fp16 gradient underflow; the
                # backward on the scaled loss is the only backward needed.
                # (Original also called optimizer.backward(train_loss) here —
                # removed: AdamW has no such method and it would double-count.)
                with amp.scale_loss(train_loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                train_loss.backward()
            # Parameter update once per gradient-accumulation window.
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
            # Per-batch metrics (computed on CPU copies).
            obj_classify = obj_classify.cpu()
            obj_ids = obj_ids.cpu()
            train_obj_acc, train_obj_prf = evaluate(obj_classify, obj_ids)
            express_classify = express_classify.cpu()
            express_ids = express_ids.cpu()
            train_express_acc, train_express_prf = evaluate(express_classify, express_ids)
            start_poses = start_poses.cpu()
            start_logits = start_logits.cpu()
            end_poses = end_poses.cpu()
            end_logits = end_logits.cpu()
            acc_start_pos, f1_start_pos, acc_end_pos, f1_end_pos, start_or_end_crt_acc, start_and_end_crt_acc = \
                evaluate_pos(start_poses, start_logits, end_poses, end_logits)
            loss_epoch += train_loss.item()
            pbar.show_process(train_loss.item(), train_obj_acc, train_obj_prf[2],
                              train_express_acc, train_express_prf[2],
                              acc_start_pos, f1_start_pos, acc_end_pos, f1_end_pos,
                              start_or_end_crt_acc, start_and_end_crt_acc,
                              time.time() - start, step)
            # Sample every 100th update's predictions for epoch-level metrics
            # (keeps memory bounded vs. keeping every batch).
            if global_step % 100 == 0:
                train_obj_predicts.append(obj_classify)
                train_obj_labels.append(obj_ids)
                train_express_predicts.append(express_classify)
                train_express_labels.append(express_ids)
        # Epoch-level training metrics over the sampled batches.
        train_obj_predicted = torch.cat(train_obj_predicts, dim=0).cpu()
        train_obj_labeled = torch.cat(train_obj_labels, dim=0).cpu()
        train_express_predicted = torch.cat(train_express_predicts, dim=0).cpu()
        train_express_labeled = torch.cat(train_express_labels, dim=0).cpu()
        del train_obj_predicts, train_obj_labels, train_express_predicts, train_express_labels
        all_train_obj_acc, all_train_obj_prf = evaluate(train_obj_predicted, train_obj_labeled)
        all_train_express_acc, all_train_express_prf = evaluate(train_express_predicted,
                                                                train_express_labeled)
        global_train_loss.append(loss_epoch / (step + 1))
        train_acc_obj_class_word.append(all_train_obj_acc)
        train_f1_obj_class_word.append(all_train_obj_prf[2])
        train_acc_express_word.append(all_train_express_acc)
        train_f1_express_word.append(all_train_express_prf[2])
        del all_train_obj_acc, all_train_obj_prf, all_train_express_acc, all_train_express_prf
        # ---------------- Validation ----------------
        count = 0
        eval_obj_predicts, eval_obj_labels, eval_express_predicts, eval_express_labels = [], [], [], []
        eval_start_pos_preds, eval_start_pos_true = [], []
        eval_end_pos_preds, eval_end_pos_true = [], []
        eval_losses = 0
        model.eval()
        with torch.no_grad():
            for step, batch in enumerate(eval_iter):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, obj_ids, express_ids, _, start_poses, end_poses = batch
                obj_classify, express_classify, start_logits, end_logits, _, _ = model(
                    input_ids, segment_ids, input_mask)
                eval_loss = loss_fn(obj_classify, obj_ids, express_classify, express_ids,
                                    start_poses, end_poses, start_logits, end_logits)
                eval_losses += eval_loss
                count += 1
                eval_obj_predicts.append(obj_classify)
                eval_obj_labels.append(obj_ids)
                eval_express_predicts.append(express_classify)
                eval_express_labels.append(express_ids)
                eval_start_pos_preds.append(start_logits)
                eval_start_pos_true.append(start_poses)
                eval_end_pos_preds.append(end_logits)
                eval_end_pos_true.append(end_poses)
            eval_obj_predicted = torch.cat(eval_obj_predicts, dim=0).cpu()
            eval_obj_labeled = torch.cat(eval_obj_labels, dim=0).cpu()
            eval_express_predicted = torch.cat(eval_express_predicts, dim=0).cpu()
            eval_express_labeled = torch.cat(eval_express_labels, dim=0).cpu()
            eval_obj_acc, eval_obj_prf = evaluate(eval_obj_predicted, eval_obj_labeled)
            eval_express_acc, eval_express_prf = evaluate(eval_express_predicted, eval_express_labeled)
            eval_acc_obj_class_word.append(eval_obj_acc)
            eval_f1_obj_class_word.append(eval_obj_prf[2])
            eval_acc_express_word.append(eval_express_acc)
            eval_f1_express_word.append(eval_express_prf[2])
            eval_start_pos_preds = torch.cat(eval_start_pos_preds, dim=0).cpu()
            eval_start_pos_true = torch.cat(eval_start_pos_true, dim=0).cpu()
            eval_end_pos_preds = torch.cat(eval_end_pos_preds, dim=0).cpu()
            eval_end_pos_true = torch.cat(eval_end_pos_true, dim=0).cpu()
            acc_start_pos, f1_start_pos, acc_end_pos, f1_end_pos, start_or_end_crt_acc, start_and_end_crt_acc = \
                evaluate_pos(eval_start_pos_true, eval_start_pos_preds,
                             eval_end_pos_true, eval_end_pos_preds)
            avg_eval_loss = eval_losses.item() / count
            global_eval_loss.append(avg_eval_loss)
            logger.info(f"""\nEpoch {e + 1}/{num_epoch} - eval_loss: {avg_eval_loss:.4f} - eval_obj_acc: {eval_obj_acc:.4f} eval_obj_f1:{eval_obj_prf[2]:.4f} - eval_express_acc: {eval_express_acc:.4f} eval_express_f1: {eval_express_prf[2]:.4f} - eval_acc_start_pos: {acc_start_pos:.4f} eval_f1_start_pos: {f1_start_pos:.4f} - eval_acc_end_pos: {acc_end_pos:.4f} eval_f1_end_pos: {f1_end_pos:.4f} - any_crt_acc: {start_or_end_crt_acc:.4f} all_crt_acc {start_and_end_crt_acc:.4f}\n""")
            # Keep the checkpoint with the best object-word F1.
            if eval_obj_prf[2] > best_obj_word_f1:
                best_obj_word_f1 = eval_obj_prf[2]
                save_model(model, optimizer, scheduler, args.output_dir)
            # if e % verbose == 0:
            #     train_losses.append(train_loss.item())
            #     train_f1.append(best_train_f1)
            #     eval_losses.append(eval_loss.item() / count)
            #     eval_f1.append(_eval_f1)
    logger.info(f"best object class word: {best_obj_word_f1:.4f}")
    loss_acc_f1_plot(history, path=args.output_dir + "loss_acc_f1_plot.png")
def main():
    """Entry point for TinyBERT-style general distillation.

    Parses command-line arguments, discovers pregenerated training shards,
    builds the teacher (BERT) and student (TinyBERT) models, and trains the
    student to match the teacher's attention maps and hidden states with an
    MSE loss. A checkpoint is written to --output_dir every --eval_step
    optimizer steps and once more after training finishes.
    """
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument("--pregenerated_data", type=Path, required=True)
    parser.add_argument("--teacher_model", default=None, type=str, required=True)
    parser.add_argument("--student_model", default=None, type=str, required=True)
    parser.add_argument("--output_dir", default=None, type=str, required=True)
    # Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument(
        "--reduce_memory",
        action="store_true",
        help=
        "Store training data as on-disc memmaps to massively reduce memory usage"
    )
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument('--weight_decay',
                        '--wd',
                        default=1e-4,
                        type=float,
                        metavar='W',
                        help='weight decay')
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--continue_train',
                        action='store_true',
                        help='Whether to train from checkpoints')
    # Additional arguments
    parser.add_argument('--eval_step', type=int, default=1000)
    args = parser.parse_args()
    logger.info('args:{}'.format(args))

    # Discover how many epochs of pregenerated data exist on disk.  If there
    # are fewer data epochs than training epochs, the available shards are
    # reused cyclically (see the modulo below).
    samples_per_epoch = []
    for i in range(int(args.num_train_epochs)):
        epoch_file = args.pregenerated_data / "epoch_{}.json".format(i)
        metrics_file = args.pregenerated_data / "epoch_{}_metrics.json".format(
            i)
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                exit("No training data was found!")
            print(
                "Warning! There are fewer epochs of pregenerated data ({}) than training epochs ({})."
                .format(i, args.num_train_epochs))
            print(
                "This script will loop over the available data, but training diversity may be negatively impacted."
            )
            num_data_epochs = i
            break
    else:
        # for/else: every requested training epoch has its own data shard.
        num_data_epochs = args.num_train_epochs

    # Device / distributed setup: single-process (possibly multi-GPU via
    # DataParallel) when local_rank == -1, otherwise one process per GPU.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # The requested batch size is the *effective* one; divide by the
    # accumulation steps to get the per-forward-pass micro-batch size.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    # Seed every RNG for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # Refuse to clobber a previous run's checkpoints.
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Student and teacher share the teacher's vocabulary.
    tokenizer = BertTokenizer.from_pretrained(args.teacher_model,
                                              do_lower_case=args.do_lower_case)

    total_train_examples = 0
    for i in range(int(args.num_train_epochs)):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = int(total_train_examples /
                                       args.train_batch_size /
                                       args.gradient_accumulation_steps)
    if args.local_rank != -1:
        # Each distributed worker only performs its share of the steps.
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
        )

    # Student: resume from a checkpoint or start from a freshly initialized
    # config; teacher: pretrained BERT exposing hidden states and attentions.
    if args.continue_train:
        student_model = TinyBertForPreTraining.from_pretrained(
            args.student_model)
    else:
        student_model = TinyBertForPreTraining.from_scratch(args.student_model)
    teacher_config = BertConfig.from_pretrained(args.teacher_model)
    teacher_config.output_hidden_states = True
    teacher_config.output_attentions = True
    teacher_model = BertModel.from_pretrained(args.teacher_model,
                                              config=teacher_config)
    # student_model = TinyBertForPreTraining.from_scratch(args.student_model, fit_size=teacher_model.config.hidden_size)
    student_model.to(device)
    teacher_model.to(device)

    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        # NOTE(review): only the teacher is wrapped in DDP here — the student's
        # gradients are presumably synchronized elsewhere; confirm.
        teacher_model = DDP(teacher_model)
    elif n_gpu > 1:
        student_model = torch.nn.DataParallel(student_model)
        teacher_model = torch.nn.DataParallel(teacher_model)

    # Log the student's parameter count.
    size = 0
    for n, p in student_model.named_parameters():
        logger.info('n: {}'.format(n))
        logger.info('p: {}'.format(p.nelement()))
        size += p.nelement()
    logger.info('Total parameters: {}'.format(size))

    # Prepare optimizer: no weight decay for biases and LayerNorm parameters.
    param_optimizer = list(student_model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    loss_mse = MSELoss()
    optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=args.learning_rate,
    )
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(num_train_optimization_steps *
                             args.warmup_proportion),
        num_training_steps=num_train_optimization_steps)

    global_step = 0
    logging.info("***** Running training *****")
    logging.info(" Num examples = {}".format(total_train_examples))
    logging.info(" Batch size = %d", args.train_batch_size)
    logging.info(" Num steps = %d", num_train_optimization_steps)
    for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
        epoch_dataset = PregeneratedDataset(
            epoch=epoch,
            training_path=args.pregenerated_data,
            tokenizer=tokenizer,
            num_data_epochs=num_data_epochs,
            reduce_memory=args.reduce_memory)
        if args.local_rank == -1:
            train_sampler = RandomSampler(epoch_dataset)
        else:
            train_sampler = DistributedSampler(epoch_dataset)
        train_dataloader = DataLoader(epoch_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        tr_loss = 0.
        tr_att_loss = 0.
        tr_rep_loss = 0.
        student_model.train()
        nb_tr_examples, nb_tr_steps = 0, 0
        with tqdm(total=len(train_dataloader),
                  desc="Epoch {}".format(epoch)) as pbar:
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration", ascii=True)):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
                # Skip ragged final batches (e.g. last partial batch).
                if input_ids.size()[0] != args.train_batch_size:
                    continue
                att_loss = 0.
                rep_loss = 0.
                student_atts, student_reps = student_model(
                    input_ids, segment_ids, input_mask)
                teacher_outputs = teacher_model(input_ids=input_ids,
                                                attention_mask=input_mask,
                                                token_type_ids=segment_ids)
                # With output_hidden_states/output_attentions enabled above,
                # index 2 is the hidden states and index 3 the attentions.
                teacher_reps = teacher_outputs[2]
                teacher_atts = teacher_outputs[3]
                # Detach teacher tensors — no gradients flow to the teacher.
                teacher_reps = [
                    teacher_rep.detach() for teacher_rep in teacher_reps
                ]  # speedup 1.5x
                teacher_atts = [
                    teacher_att.detach() for teacher_att in teacher_atts
                ]
                # Map every student layer onto a block of teacher layers; the
                # student distills from the last attention layer of each block.
                teacher_layer_num = len(teacher_atts)
                student_layer_num = len(student_atts)
                assert teacher_layer_num % student_layer_num == 0
                layers_per_block = int(teacher_layer_num / student_layer_num)
                new_teacher_atts = [
                    teacher_atts[i * layers_per_block + layers_per_block - 1]
                    for i in range(student_layer_num)
                ]
                for student_att, teacher_att in zip(student_atts,
                                                    new_teacher_atts):
                    # Zero out the large negative attention-mask fill values so
                    # they don't dominate the MSE.
                    student_att = torch.where(
                        student_att <= -1e2,
                        torch.zeros_like(student_att).to(device), student_att)
                    teacher_att = torch.where(
                        teacher_att <= -1e2,
                        torch.zeros_like(teacher_att).to(device), teacher_att)
                    att_loss += loss_mse(student_att, teacher_att)
                # Hidden states include the embedding output, hence the +1.
                new_teacher_reps = [
                    teacher_reps[i * layers_per_block]
                    for i in range(student_layer_num + 1)
                ]
                new_student_reps = student_reps
                for student_rep, teacher_rep in zip(new_student_reps,
                                                    new_teacher_reps):
                    rep_loss += loss_mse(student_rep, teacher_rep)
                loss = att_loss + rep_loss
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_att_loss += att_loss.item()
                tr_rep_loss += rep_loss.item()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                pbar.update(1)
                # Running means reported at each eval checkpoint below.
                mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps
                mean_att_loss = tr_att_loss * args.gradient_accumulation_steps / nb_tr_steps
                mean_rep_loss = tr_rep_loss * args.gradient_accumulation_steps / nb_tr_steps
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()
                    global_step += 1
                    # Periodically log running losses and save a checkpoint.
                    if (global_step + 1) % args.eval_step == 0:
                        result = {}
                        result['global_step'] = global_step
                        result['loss'] = mean_loss
                        result['att_loss'] = mean_att_loss
                        result['rep_loss'] = mean_rep_loss
                        output_eval_file = os.path.join(
                            args.output_dir, "log.txt")
                        with open(output_eval_file, "a") as writer:
                            logger.info("***** Eval results *****")
                            for key in sorted(result.keys()):
                                logger.info(" %s = %s", key, str(result[key]))
                                writer.write("%s = %s\n" %
                                             (key, str(result[key])))
                        # Save a trained model
                        model_name = "step_{}_{}".format(
                            global_step, "pytorch_model.bin")
                        logging.info(
                            "** ** * Saving fine-tuned model ** ** * ")
                        # Only save the model it-self
                        model_to_save = student_model.module if hasattr(
                            student_model, 'module') else student_model
                        output_model_file = os.path.join(
                            args.output_dir, model_name)
                        output_config_file = os.path.join(
                            args.output_dir, "config.json")
                        torch.save(model_to_save.state_dict(),
                                   output_model_file)
                        model_to_save.config.to_json_file(output_config_file)
                        tokenizer.save_vocabulary(args.output_dir)

    # Final checkpoint after the last epoch.
    model_name = "step_{}_{}".format(global_step, "pytorch_model.bin")
    logging.info("** ** * Saving fine-tuned model ** ** * ")
    model_to_save = student_model.module if hasattr(
        student_model, 'module') else student_model
    output_model_file = os.path.join(args.output_dir, model_name)
    output_config_file = os.path.join(args.output_dir, "config.json")
    torch.save(model_to_save.state_dict(), output_model_file)
    model_to_save.config.to_json_file(output_config_file)
    tokenizer.save_vocabulary(args.output_dir)
def train(args, config, tokenizer, model):
    """Train ``model`` on the generation training set.

    Draws batches from an endlessly cycled dataloader for
    ``args.train_steps * args.gradient_accumulation_steps`` micro-steps,
    evaluates on the dev set every ``args.eval_steps`` global steps, and
    saves the weights to ``args.output_dir`` whenever the dev loss improves.

    Args:
        args: parsed command-line namespace (paths, batch sizes, lr, steps...).
        config: model configuration, forwarded to ``test`` for evaluation.
        tokenizer: tokenizer, forwarded to feature conversion and ``test``.
        model: the model to optimize (already on ``args.device``).
    """
    # Load and prepare data
    train_examples = read_examples(os.path.join(args.data_dir, 'gen-trn.pkl'))
    prior_dic = pickle.load(
        open(os.path.join(args.prior_distribution_dir, 'prior-trn.pkl'), 'rb'))
    train_features = convert_examples_to_features(train_examples,
                                                  tokenizer,
                                                  args,
                                                  stage='training',
                                                  prior_dic=prior_dic)
    all_event_ids = torch.tensor([f.event_ids for f in train_features],
                                 dtype=torch.long)
    # NOTE(review): dtype=torch.long truncates any fractional prior values —
    # confirm f.prior really is integral for this dataset.
    all_prior = torch.tensor([f.prior for f in train_features],
                             dtype=torch.long)
    train_data = TensorDataset(all_event_ids, all_prior)
    if args.local_rank == -1:
        train_sampler = RandomSampler(train_data)
    else:
        train_sampler = DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)
    # cycle() yields batches forever, enabling step-based (not epoch-based)
    # training below.
    train_dataloader = cycle(train_dataloader)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    # hack to remove pooler, which is not used
    # thus it produce None grad that break apex
    param_optimizer = [n for n in param_optimizer]
    # No weight decay for biases and LayerNorm parameters.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        args.weight_decay
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=args.train_steps)

    # Running training
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size= %d", args.train_batch_size)
    logger.info("  Batch size (including gradient_accumulation_steps)= %d",
                args.train_batch_size * args.gradient_accumulation_steps)
    logger.info("  Num steps = %d",
                args.train_steps * args.gradient_accumulation_steps)
    dev_dataset = {}
    model.train()
    global_step, tr_loss, nb_tr_examples, nb_tr_steps, best_loss, eval_flag = 0, 0, 0, 0, 1e4, True
    bar = tqdm(range(args.train_steps * args.gradient_accumulation_steps),
               total=args.train_steps * args.gradient_accumulation_steps)
    for step in bar:
        batch = next(train_dataloader)
        batch = tuple(t.to(args.device) for t in batch)
        event_ids, prior = batch
        loss = model(event_ids=event_ids, prior=prior)
        if args.n_gpu > 1:
            loss = loss.mean()  # mean() to average on multi-gpu.
        if args.fp16 and args.loss_scale != 1.0:
            loss = loss * args.loss_scale
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps
        # print loss information
        tr_loss += loss.item()
        train_loss = round(
            tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1), 4)
        bar.set_description("loss {}".format(train_loss))
        nb_tr_examples += event_ids.size(0)
        nb_tr_steps += 1
        # backward
        if args.fp16:
            optimizer.backward(loss)
        else:
            loss.backward()
        # update parameter
        if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0:
            if args.fp16:
                # NOTE(review): `warmup_linear` is not defined in this
                # function's visible scope — this legacy fp16 LR override
                # predates the scheduler and would raise NameError if taken;
                # confirm before enabling fp16.
                lr_this_step = args.learning_rate * warmup_linear.get_lr(
                    global_step, args.warmup_proportion)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
            # Bug fix: step the optimizer *before* the LR scheduler.  With
            # PyTorch >= 1.1, calling scheduler.step() first skips the first
            # value of the schedule (and emits a UserWarning).
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            global_step += 1
            eval_flag = True

        # Running evaluation every `eval_steps` global steps (eval_flag avoids
        # re-evaluating on the accumulation micro-steps in between).
        if ((global_step + 1) % args.eval_steps == 0) and eval_flag:
            tr_loss, nb_tr_examples, nb_tr_steps, eval_flag = 0, 0, 0, False
            prior_dic = pickle.load(
                open(
                    os.path.join(args.prior_distribution_dir,
                                 'prior-dev.pkl'), 'rb'))
            result = test(args,
                          config,
                          tokenizer,
                          model,
                          os.path.join(args.data_dir, 'gen-dev.pkl'),
                          prior_dic=prior_dic)
            result['global_step'] = global_step + 1
            result['train_loss'] = round(train_loss, 5)
            # print result
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
            logger.info("  " + "*" * 20)
            # Keep only the best-dev-loss checkpoint.
            if result['eval_loss'] < best_loss:
                logger.info("  Best loss:%s", round(result['eval_loss'], 5))
                logger.info("  " + "*" * 20)
                best_loss = result['eval_loss']
                # Save a trained model
                model_to_save = model.module if hasattr(
                    model, 'module') else model  # Only save the model it-self
                output_model_file = os.path.join(args.output_dir,
                                                 "pytorch_model.bin")
                torch.save(model_to_save.state_dict(), output_model_file)
def main(args):
    """Fine-tune and/or evaluate a BERT sequence classifier for one GLUE-style task.

    With ``args.do_train``: trains for ``args.num_train_epochs`` epochs,
    validating after each epoch and keeping the best checkpoint at
    ``<output_dir>/<task_name>.pt``.  With ``args.do_eval``: reloads that
    checkpoint and evaluates on the dev segment(s).
    """
    # Device / distributed setup: single process (possibly DataParallel)
    # when local_rank == -1, otherwise one process per GPU.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # The requested batch size is the *effective* one; divide by the
    # accumulation steps to get the per-forward-pass micro-batch size.
    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    # Seed every RNG for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    # Persist the run configuration alongside the outputs.
    os.makedirs(args.output_dir, exist_ok=True)
    json.dump(args.__dict__,
              open(
                  os.path.join(args.output_dir,
                               'opt_{}.json'.format(args.task_name)), 'w'),
              sort_keys=True,
              indent=2)

    # Resolve the task-specific processor and label space.
    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)

    amp_handle = None
    if args.fp16:
        from apex import amp
        amp_handle = amp.init(enable_caching=True)

    # Prepare model: either a fresh pretrained checkpoint or a recovered
    # state dict from args.model_recover_path.
    if (args.model_recover_path is None) or len(args.model_recover_path) == 0:
        model = BertForSequenceClassification.from_pretrained(
            args.bert_model, num_labels=num_labels)
    else:
        if not os.path.exists(args.model_recover_path):
            logger.info("Path does not exist: {0}".format(
                args.model_recover_path))
            sys.exit(0)
        logger.info("***** Recover model: {0} *****".format(
            args.model_recover_path))
        model = BertForSequenceClassification.from_pretrained(
            args.bert_model,
            state_dict=torch.load(args.model_recover_path),
            num_labels=num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer: no weight decay for biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    # note: args.train_batch_size has been changed to (/= args.gradient_accumulation_steps)
    if args.do_train:
        t_total = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)
    else:
        t_total = 1
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      correct_bias=False)
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    if args.fp16:
        try:
            from apex.fp16_utils.fp16_optimizer import FP16_Optimizer
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        # loss_scale == 0 selects apex's dynamic loss scaling.
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)

    logger.info("***** CUDA.empty_cache() *****")
    torch.cuda.empty_cache()

    # STS-B is a regression task, so labels are floats (halves under fp16);
    # every other task uses integer class ids.
    if args.task_name == 'sts-b':
        if args.fp16:
            lbl_type = torch.half
        else:
            lbl_type = torch.float
    else:
        lbl_type = torch.long

    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", t_total)
        train_data = convert_features_to_dataset(train_features, lbl_type)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        best_result = 0.0
        # NOTE(review): trange needs integer bounds — this assumes
        # args.num_train_epochs is an int here; confirm against the arg parser.
        for i_epoch in trange(1, args.num_train_epochs + 1, desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            model.train()
            iter_bar = tqdm(train_dataloader, desc='Iter (loss=X.XXX)')
            for step, batch in enumerate(iter_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                outputs = model(input_ids,
                                attention_mask=input_mask,
                                token_type_ids=segment_ids,
                                labels=label_ids)
                loss = outputs[0]
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                    if amp_handle:
                        amp_handle._clear_cache()
                else:
                    loss.backward()
                tr_loss += loss.item()
                iter_bar.set_description('Iter (loss=%5.3f)' % loss.item())
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

            # Perform validation
            eval_examples = processor.get_dev_examples(args.data_dir)
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer)
            eval_data = convert_features_to_dataset(eval_features, lbl_type)
            eval_segment = processor.get_dev_segments()[0]
            logger.info("***** Running evaluation: {0}-{1} *****".format(
                eval_segment, i_epoch))
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data,
                                         sampler=eval_sampler,
                                         batch_size=args.eval_batch_size)
            model.eval()
            eval_loss, eval_result = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            all_logits, all_label_ids = [], []
            for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)
                with torch.no_grad():
                    outputs = model(input_ids,
                                    attention_mask=input_mask,
                                    token_type_ids=segment_ids,
                                    labels=label_ids)
                    tmp_eval_loss = outputs[0]
                    logits = outputs[1]
                    if amp_handle:
                        amp_handle._clear_cache()
                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                all_logits.append(logits)
                all_label_ids.append(label_ids)
                eval_loss += tmp_eval_loss.mean().item()
                nb_eval_examples += input_ids.size(0)
                nb_eval_steps += 1
            eval_loss = eval_loss / nb_eval_steps
            # compute evaluation metric
            all_logits = np.concatenate(all_logits, axis=0)
            all_label_ids = np.concatenate(all_label_ids, axis=0)
            metric_func = processor.get_metric_func()
            eval_result = metric_func(all_logits, all_label_ids)
            # logging the results
            logger.info("***** Eval results for {0}: {1} *****".format(
                eval_segment, eval_result))
            # Keep only the checkpoint with the best dev metric.
            if eval_result > best_result:
                best_result = eval_result
                # Save a trained model
                model_to_save = model.module if hasattr(
                    model, 'module') else model  # Only save the model it-self
                output_model_file = os.path.join(
                    args.output_dir, "{0}.pt".format(args.task_name))
                torch.save(model_to_save.state_dict(), output_model_file)
                logger.info(
                    "  Saved best model to {0}".format(output_model_file))

        # delete unused variables
        del optimizer
        del param_optimizer
        del optimizer_grouped_parameters

    # Load a trained model that you have fine-tuned
    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        logger.info("***** CUDA.empty_cache() *****")
        torch.cuda.empty_cache()
        del model
        output_model_file = os.path.join(args.output_dir,
                                         "{0}.pt".format(args.task_name))
        model_state_dict = torch.load(output_model_file)
        model = BertForSequenceClassification.from_pretrained(
            args.bert_model, state_dict=model_state_dict,
            num_labels=num_labels)
        model.to(device)
        if n_gpu > 1:
            model = torch.nn.DataParallel(model)

        eval_set_list = []
        for eval_segment in processor.get_dev_segments():
            eval_examples = processor.get_dev_examples(args.data_dir,
                                                       segment=eval_segment)
            eval_set_list.append((eval_segment, eval_examples))
            # NOTE(review): this break keeps only the FIRST dev segment — it
            # looks intentional (quick eval), but confirm it isn't a leftover
            # debugging shortcut.
            break
        for eval_segment, eval_examples in eval_set_list:
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer)
            eval_data = convert_features_to_dataset(eval_features, lbl_type)
            logger.info("***** Running evaluation: %s *****", eval_segment)
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data,
                                         sampler=eval_sampler,
                                         batch_size=args.eval_batch_size)
            model.eval()
            eval_loss, eval_result = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            all_logits, all_label_ids = [], []
            for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)
                with torch.no_grad():
                    outputs = model(input_ids,
                                    attention_mask=input_mask,
                                    token_type_ids=segment_ids,
                                    labels=label_ids)
                    tmp_eval_loss = outputs[0]
                    logits = outputs[1]
                    if amp_handle:
                        amp_handle._clear_cache()
                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                all_logits.append(logits)
                all_label_ids.append(label_ids)
                eval_loss += tmp_eval_loss.mean().item()
                nb_eval_examples += input_ids.size(0)
                nb_eval_steps += 1
            eval_loss = eval_loss / nb_eval_steps
            # compute evaluation metric
            all_logits = np.concatenate(all_logits, axis=0)
            all_label_ids = np.concatenate(all_label_ids, axis=0)
            metric_func = processor.get_metric_func()
            eval_result = metric_func(all_logits, all_label_ids)
            # logging the results
            logger.info("***** Eval results for {0}: {1} *****".format(
                eval_segment, eval_result))
class BertModel(object):
    """Training/evaluation wrapper around ``BertForSequenceClassification``.

    The ``config`` dict supplies all hyper-parameters (pretrained model name,
    learning rate, batch size, seed, fp16 flags, ...) and is munchified into
    ``self.args`` for attribute access.  Supports single-label and
    multi-label classification with early stopping on a validation metric.
    """

    def __init__(self, config):
        self.config = config
        self.init_config()
        self.init_random_seeds()
        self.init_bert()

    def init_config(self):
        """Materialize config values and pin the visible GPUs."""
        self.args = munchify(self.config)
        self.pretrained_model = self.args.pretrained_model
        self.device = self.args.device
        # GPU count: the explicit `gpu_ids` list wins over auto-detection.
        self.n_gpu = (len(self.args.gpu_ids.split(","))
                      if "gpu_ids" in self.config else
                      torch.cuda.device_count())
        if "gpu_ids" in self.config:
            os.environ["CUDA_VISIBLE_DEVICES"] = self.config["gpu_ids"]

    def init_bert(self):
        """Load the pretrained classifier and tokenizer, move them to the device."""
        self.model = BertForSequenceClassification.from_pretrained(
            self.pretrained_model,
            num_labels=self.args.num_labels,
        )
        print_transformer(self.model)
        self.tokenizer = AutoTokenizer.from_pretrained(self.args.tokenizer)
        if self.args.fp16:
            self.model.half()
        self.model.to(self.device)
        if self.n_gpu > 1:
            self.model = torch.nn.DataParallel(self.model)

    def init_optimizer(self, n_examples):
        """Build the optimizer and linear-warmup LR scheduler.

        Args:
            n_examples: number of training examples, used to derive the total
                number of optimization steps.
        """
        num_train_optimization_steps = (
            int(n_examples / self.args.batch_size /
                self.args.gradient_accumulation_steps) * self.args.epochs)
        # Warm up for `warmup_proportion` of the schedule, then decay linearly.
        num_warmup_steps = int(self.args.warmup_proportion *
                               num_train_optimization_steps)
        # Prepare optimizer: no weight decay for biases and LayerNorm weights.
        param_optimizer = list(self.model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.01,
            },
            {
                "params": [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]
        if self.args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer, FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use"
                    " distributed and fp16 training.")
            optimizer = FusedAdam(
                optimizer_grouped_parameters,
                lr=self.args.lr,
                bias_correction=False,
                max_grad_norm=1.0,
            )
            if self.args.loss_scale == 0:
                self.optimizer = FP16_Optimizer(optimizer,
                                                dynamic_loss_scale=True)
            else:
                self.optimizer = FP16_Optimizer(
                    optimizer, static_loss_scale=self.args.loss_scale)
            # Bug fix: get_linear_schedule_with_warmup takes
            # num_warmup_steps/num_training_steps; the previous
            # `warmup=`/`t_total=` kwargs belonged to the old
            # pytorch_pretrained_bert schedules and raised TypeError here.
            self.scheduler = get_linear_schedule_with_warmup(
                self.optimizer,
                num_warmup_steps=num_warmup_steps,
                num_training_steps=num_train_optimization_steps,
            )
        else:
            self.optimizer = AdamW(self.model.parameters(),
                                   lr=self.args.lr,
                                   correct_bias=False)
            # Bug fix: warmup and total steps were swapped — the old code
            # warmed up for the *entire* schedule and set the total length to
            # warmup_proportion * steps, so the LR never decayed as intended.
            self.scheduler = get_linear_schedule_with_warmup(
                self.optimizer,
                num_warmup_steps=num_warmup_steps,
                num_training_steps=num_train_optimization_steps,
            )

    def init_random_seeds(self):
        """Seed python/numpy/torch RNGs for reproducibility."""
        random.seed(self.args.seed)
        np.random.seed(self.args.seed)
        torch.manual_seed(self.args.seed)
        if self.n_gpu > 0:
            torch.cuda.manual_seed_all(self.args.seed)

    def save_pretrained(self, path):
        """Save the (DataParallel-unwrapped) model weights/config to `path`."""
        model_to_save = (self.model.module
                         if hasattr(self.model, "module") else self.model)
        model_to_save.save_pretrained(path)

    @timeit
    def train_an_epoch(self, train_dataloader):
        """Run one optimization epoch over `train_dataloader`."""
        self.model.train()
        for step, batch in enumerate(tqdm(train_dataloader, desc="Training")):
            batch = tuple(t.to(self.args.device) for t in batch)
            input_ids, input_mask, label_ids = batch
            if self.args.is_multilabel:
                # Multi-label: independent per-class BCE on the raw logits.
                logits = self.model(
                    input_ids,
                    token_type_ids=None,
                    attention_mask=input_mask,
                )[0]
                loss = F.binary_cross_entropy_with_logits(
                    logits, label_ids.float())
            else:
                # Single-label: the model computes cross-entropy internally.
                loss, logits = self.model(
                    input_ids,
                    token_type_ids=None,
                    attention_mask=input_mask,
                    labels=label_ids,
                )
            if self.n_gpu > 1:
                loss = loss.mean()
            if self.args.gradient_accumulation_steps > 1:
                loss = loss / self.args.gradient_accumulation_steps
            if self.args.fp16:
                self.optimizer.backward(loss)
            else:
                loss.backward()
            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            self.tr_loss += loss.item()
            self.nb_tr_steps += 1
            if (step + 1) % self.args.gradient_accumulation_steps == 0:
                if self.args.fp16:
                    # NOTE(review): `warmup_linear` and
                    # `self.num_train_optimization_steps` are not defined in
                    # this class's visible scope; this fp16 LR override
                    # predates the scheduler and would raise if taken —
                    # confirm before enabling fp16.
                    lr_this_step = self.args.learning_rate * warmup_linear(
                        self.iterations / self.num_train_optimization_steps,
                        self.args.warmup_proportion,
                    )
                    for param_group in self.optimizer.param_groups:
                        param_group["lr"] = lr_this_step
                self.optimizer.step()
                self.scheduler.step()
                self.optimizer.zero_grad()
                self.iterations += 1

    def train(self, train_set, dev_set):
        """Train with early stopping on `args.valid_metric` over `dev_set`.

        Saves the model to ``args.model_save_dir`` whenever the validation
        metric improves; stops after ``args.patience`` epochs without
        improvement.
        """
        self.iterations, self.nb_tr_steps, self.tr_loss = 0, 0, 0
        self.best_valid_metric, self.unimproved_iters = 0, 0
        self.early_stop = False
        if self.args.gradient_accumulation_steps < 1:
            raise ValueError(
                "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
                .format(self.args.gradient_accumulation_steps))
        # The configured batch size is the effective one; shrink it to the
        # per-forward-pass micro-batch size.
        self.args.batch_size = (self.args.batch_size //
                                self.args.gradient_accumulation_steps)
        self.init_optimizer(len(train_set))
        train_dataset = convert_df_to_dataset(train_set, self.tokenizer,
                                              self.args.max_seq_length)
        dev_dataset = convert_df_to_dataset(dev_set, self.tokenizer,
                                            self.args.max_seq_length)
        train_dataloader = DataLoader(
            train_dataset,
            sampler=RandomSampler(train_dataset),
            batch_size=self.args.batch_size,
        )
        dev_dataloader = DataLoader(
            dev_dataset,
            sampler=SequentialSampler(dev_dataset),
            batch_size=self.args.batch_size,
        )
        for epoch in trange(int(self.args.epochs), desc="Epoch"):
            self.train_an_epoch(train_dataloader)
            # Bug fix: the original chained `.format(epoch, ...)` onto an
            # already-interpolated f-string, which was a dead no-op call.
            tqdm.write(f"[Epoch {epoch}] loss: {self.tr_loss}")
            self.tr_loss = 0
            eval_result = self.eval(dev_dataloader)
            # Update validation results
            if eval_result[self.args.valid_metric] > self.best_valid_metric:
                self.unimproved_iters = 0
                self.best_valid_metric = eval_result[self.args.valid_metric]
                print_dict_as_table(
                    eval_result,
                    tag=f"[Epoch {epoch}]performance on validation set",
                    columns=["metrics", "values"],
                )
                ensureDir(self.args.model_save_dir)
                self.save_pretrained(self.args.model_save_dir)
            else:
                self.unimproved_iters += 1
                if self.unimproved_iters >= self.args.patience:
                    self.early_stop = True
                    tqdm.write(
                        "Early Stopping. Epoch: {}, best_valid_metric ({}): {}"
                        .format(epoch, self.args.valid_metric,
                                self.best_valid_metric))
                    break

    def test(self, test_set):
        """Get a evaluation result for a test set.

        Args:
            test_set: dataframe-like test split.

        Returns:
            dict: metric name -> value, as produced by :meth:`eval`.
        """
        test_dataset = convert_df_to_dataset(test_set, self.tokenizer,
                                             self.args.max_seq_length)
        test_dataloader = DataLoader(
            test_dataset,
            sampler=SequentialSampler(test_dataset),
            batch_size=self.args.batch_size,
        )
        return self.eval(test_dataloader)

    def scores(self, test_dataloader):
        """Get predicted label scores for a test_dataloader.

        Args:
            test_dataloader: DataLoader yielding (input_ids, input_mask, label_ids).

        Returns:
            tuple(ndarray, ndarray): predicted labels and target labels.
        """
        self.model.eval()
        predicted_labels, target_labels = list(), list()
        for input_ids, input_mask, label_ids in tqdm(test_dataloader,
                                                     desc="Evaluating"):
            input_ids = input_ids.to(self.args.device)
            input_mask = input_mask.to(self.args.device)
            label_ids = label_ids.to(self.args.device)
            with torch.no_grad():
                logits = self.model(input_ids,
                                    token_type_ids=None,
                                    attention_mask=input_mask)[0]
            if self.args.is_multilabel:
                # Fix: F.sigmoid has been deprecated for years —
                # torch.sigmoid is the identical supported call.
                predicted_labels.extend(
                    torch.sigmoid(logits).round().long().cpu().detach().numpy())
            else:
                predicted_labels.extend(
                    torch.argmax(logits, dim=1).cpu().detach().numpy())
            target_labels.extend(label_ids.cpu().detach().numpy())
        return np.array(predicted_labels), np.array(target_labels)

    @timeit
    def eval(self, test_dataloader):
        """Get the evaluation performance of a test_dataloader.

        Args:
            test_dataloader: DataLoader over the evaluation split.

        Returns:
            dict: accuracy plus macro/micro P/R/F1 for multi-class problems,
            or accuracy plus binary P/R/F1 for two-class problems.
        """
        predicted_labels, target_labels = self.scores(test_dataloader)
        if self.args.num_labels > 2:
            accuracy = metrics.accuracy_score(target_labels, predicted_labels)
            macro_precision = metrics.precision_score(target_labels,
                                                      predicted_labels,
                                                      average="macro")
            macro_recall = metrics.recall_score(target_labels,
                                                predicted_labels,
                                                average="macro")
            macro_f1 = metrics.f1_score(target_labels,
                                        predicted_labels,
                                        average="macro")
            micro_precision = metrics.precision_score(target_labels,
                                                      predicted_labels,
                                                      average="micro")
            micro_recall = metrics.recall_score(target_labels,
                                                predicted_labels,
                                                average="micro")
            micro_f1 = metrics.f1_score(target_labels,
                                        predicted_labels,
                                        average="micro")
            return {
                "accuracy": accuracy,
                "macro_precision": macro_precision,
                "macro_recall": macro_recall,
                "macro_f1": macro_f1,
                "micro_precision": micro_precision,
                "micro_recall": micro_recall,
                "micro_f1": micro_f1,
            }
        else:
            accuracy = metrics.accuracy_score(target_labels, predicted_labels)
            precision = metrics.precision_score(target_labels,
                                                predicted_labels,
                                                average="binary")
            recall = metrics.recall_score(target_labels,
                                          predicted_labels,
                                          average="binary")
            f1 = metrics.f1_score(target_labels,
                                  predicted_labels,
                                  average="binary")
            return {
                "accuracy": accuracy,
                "precision": precision,
                "recall": recall,
                "f1": f1,
            }

    @timeit
    def predict(self, test_set):
        """Predict labels for a test set.

        Args:
            test_set: list of :obj:InputExample

        Returns:
            ndarray: An array of predicted label scores.
        """
        test_dataset = convert_df_to_dataset(test_set, self.tokenizer,
                                             self.args.max_seq_length)
        test_dataloader = DataLoader(
            test_dataset,
            sampler=SequentialSampler(test_dataset),
            batch_size=self.args.batch_size,
        )
        return self.scores(test_dataloader)[0]
class BERTModel(BaseModel):
    """BERT sequence-classification wrapper: setup, training, evaluation,
    and prediction around ``BertForSequenceClassification``."""

    def __init__(self, args):
        super().__init__()
        self.estimator = None
        self.label_mapping = None
        self.train_examples = None
        self.num_train_optimization_steps = None
        # Hyperparams
        self.max_seq_length = args.max_seq_length
        self.train_batch_size = args.train_batch_size
        self.eval_batch_size = args.eval_batch_size
        # Initial learning rate for Adam optimizer
        self.learning_rate = args.learning_rate
        self.num_epochs = args.epochs
        # Number of linear learning-rate warmup steps.
        self.warmup_steps = args.warmup_steps
        self.no_cuda = args.no_cuda
        # Number of update steps to accumulate before a backward/update pass.
        self.gradient_accumulation_steps = args.gradient_accumulation_steps
        self.seed = args.seed
        # Use 16-bit float precision (instead of 32-bit)
        self.fp16 = args.fp16
        # Loss scaling for fp16 numeric stability; only used when fp16 is True.
        # 0 (default): dynamic loss scaling. Positive power of 2: static value.
        self.loss_scale = args.loss_scale
        # Meta params
        self.write_test_output = args.write_test_output
        self.output_attentions = args.output_attentions
        self.eval_after_epoch = args.eval_after_epoch
        self.username = args.username
        # Model identifier (e.g. a HF model name; 'uncased' in it toggles
        # lowercasing — see _setup_bert).
        self.model_type = args.model_type
        # Paths
        self.train_data_path = os.path.join(args.data_path, args.train_data,
                                            'train.tsv')
        self.dev_data_path = os.path.join(args.data_path, args.dev_data,
                                          'dev.tsv')
        self.test_data_path = os.path.join(args.data_path, args.test_data,
                                           'test.tsv')
        self.other_path = args.other_path
        self.default_output_folder = 'output'
        self.output_path = self.generate_output_path(args.output_path)
        self.model_path = os.path.join(self.other_path, 'bert')
        # Snapshot of all CLI args, persisted to args.json after training.
        self.all_args = vars(args)

    def generate_output_path(self, output_path):
        """Return *output_path*, or build a unique timestamped run directory
        (``output/<timestamp>-<uuid4 prefix>-<username>``) when it is None."""
        if output_path is None:
            # NOTE(review): '%-H' (non-zero-padded hour) is a glibc-only
            # strftime extension and fails on Windows; '%H' was probably
            # intended — confirm.
            output_path = os.path.join(
                self.default_output_folder,
                f"{time.strftime('%Y_%m_%d-%-H_%M_%S')}-{str(uuid.uuid4())[:4]}-{self.username}")
        return output_path

    def create_dirs(self):
        """Create the run's output directory.

        NOTE(review): os.makedirs without exist_ok raises if the directory
        already exists — presumably acceptable since the path is unique per
        run; verify.
        """
        for _dir in [self.output_path]:
            logger.info(f'Creating directory {_dir}')
            os.makedirs(_dir)

    def train(self):
        """Fine-tune the model on the training set and save weights/config/args
        to ``self.output_path``."""
        # Setup (loads model, tokenizer, train examples, step counts).
        self._setup_bert()
        # Prepare optimizer: no weight decay on bias/LayerNorm params.
        param_optimizer = list(self.model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer
                        if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer
                        if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        if self.fp16:
            # apex FusedAdam + FP16_Optimizer handle fp16 loss scaling.
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
            self.optimizer = FusedAdam(optimizer_grouped_parameters,
                                       lr=self.learning_rate,
                                       bias_correction=False,
                                       max_grad_norm=1.0)
            if self.loss_scale == 0:
                self.optimizer = FP16_Optimizer(self.optimizer,
                                                dynamic_loss_scale=True)
            else:
                self.optimizer = FP16_Optimizer(self.optimizer,
                                                static_loss_scale=self.loss_scale)
        else:
            self.optimizer = AdamW(optimizer_grouped_parameters,
                                   lr=self.learning_rate)
        # Linear warmup then linear decay over the full training schedule.
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=self.warmup_steps,
            num_training_steps=self.num_train_optimization_steps)
        # Run training
        global_step = 0
        tr_loss = 0
        train_features = self.convert_examples_to_features(self.train_examples)
        logger.debug("***** Running training *****")
        logger.debug(" Num examples = %d", len(self.train_examples))
        logger.debug(" Batch size = %d", self.train_batch_size)
        logger.debug(" Num steps = %d", self.num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                      batch_size=self.train_batch_size)
        loss_vs_time = []  # NOTE(review): collected nowhere below — dead?
        for epoch in range(int(self.num_epochs)):
            self.model.train()
            nb_tr_examples, nb_tr_steps = 0, 0
            epoch_loss = 0
            pbar = tqdm(train_dataloader)
            for step, batch in enumerate(pbar):
                batch = tuple(t.to(self.device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                # Model returns (loss, logits) when labels are supplied.
                loss, logits = self.model(input_ids,
                                          attention_mask=input_mask,
                                          token_type_ids=segment_ids,
                                          labels=label_ids)
                if self.n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if self.gradient_accumulation_steps > 1:
                    # Scale so accumulated gradients average over micro-batches.
                    loss = loss / self.gradient_accumulation_steps
                if self.fp16:
                    # FP16_Optimizer applies loss scaling inside backward().
                    self.optimizer.backward(loss)
                else:
                    loss.backward()
                loss = loss.item()
                tr_loss += loss
                epoch_loss += loss
                if step > 0:
                    # NOTE(review): epoch_loss includes steps 0..step, so the
                    # true average is epoch_loss/(step+1) — off by one.
                    pbar.set_description(
                        "Loss: {:8.4f} | Average loss/it: {:8.4f}".format(
                            loss, epoch_loss/step))
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                # Only step the optimizer every accumulation-th micro-batch.
                if (step + 1) % self.gradient_accumulation_steps == 0:
                    # Gradient clipping
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                   1.0)
                    self.optimizer.step()
                    self.scheduler.step()
                    self.optimizer.zero_grad()
                    global_step += 1
            # Optionally re-evaluate on the *training* loader after each epoch.
            if self.eval_after_epoch:
                self.model.eval()
                nb_train_steps, nb_train_examples = 0, 0
                train_accuracy, train_loss = 0, 0
                for input_ids, input_mask, segment_ids, label_ids in tqdm(
                        train_dataloader, desc="Evaluating"):
                    input_ids = input_ids.to(self.device)
                    input_mask = input_mask.to(self.device)
                    segment_ids = segment_ids.to(self.device)
                    label_ids = label_ids.to(self.device)
                    with torch.no_grad():
                        loss, logits = self.model(input_ids,
                                                  attention_mask=input_mask,
                                                  token_type_ids=segment_ids,
                                                  labels=label_ids)
                    # accuracy() returns a count of correct predictions.
                    train_accuracy += self.accuracy(logits.to('cpu').numpy(),
                                                    label_ids.to('cpu').numpy())
                    train_loss += loss.mean().item()
                    nb_train_examples += input_ids.size(0)
                    nb_train_steps += 1
                train_loss = train_loss / nb_train_steps
                train_accuracy = 100 * train_accuracy / nb_train_examples
                print("{bar}\nEpoch {}:\nTraining loss: {:8.4f} | Training accuracy: {:.2f}%\n{bar}".format(epoch+1, train_loss, train_accuracy, bar=80*'='))
        # Save model weights, config, and the CLI args used for this run.
        model_to_save = self.model.module if hasattr(self.model, 'module') else self.model  # Unwrap DataParallel; only save the model itself.
        output_model_file = os.path.join(self.output_path, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(self.output_path, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())
        args_output_file = os.path.join(self.output_path, 'args.json')
        with open(args_output_file, 'w') as f:
            json.dump(self.all_args, f)

    def test(self):
        """Evaluate the saved model on the dev set.

        Returns:
            dict: Performance metrics; when ``write_test_output`` is set,
                merged with per-example prediction output.
        """
        # Setup (loads the trained model from self.output_path).
        self._setup_bert(setup_mode='test')
        # Run test — note this reads the *dev* file, not test_data_path.
        eval_examples = self.processor.get_dev_examples(self.dev_data_path)
        eval_features = self.convert_examples_to_features(eval_examples)
        logger.debug("***** Running evaluation *****")
        logger.debug(" Num examples = %d", len(eval_examples))
        logger.debug(" Batch size = %d", self.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                     dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                     batch_size=self.eval_batch_size)
        self.model.eval()
        eval_loss = 0
        nb_eval_steps = 0
        result = {'prediction': [], 'label': [], 'text': []}
        for input_ids, input_mask, segment_ids, label_ids in tqdm(
                eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(self.device)
            input_mask = input_mask.to(self.device)
            segment_ids = segment_ids.to(self.device)
            label_ids = label_ids.to(self.device)
            # NOTE(review): no torch.no_grad() here (unlike train's eval
            # pass) — gradients are tracked unnecessarily; confirm intent.
            tmp_eval_loss, logits = self.model(input_ids,
                                               attention_mask=input_mask,
                                               token_type_ids=segment_ids,
                                               labels=label_ids)
            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            result['prediction'].extend(np.argmax(logits, axis=1).tolist())
            result['label'].extend(label_ids.tolist())
            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
        eval_loss = eval_loss / nb_eval_steps
        label_mapping = self.get_label_mapping()
        result_out = self.performance_metrics(result['label'],
                                              result['prediction'],
                                              label_mapping=label_mapping)
        if self.write_test_output:
            test_output = self.get_full_test_output(
                result['prediction'],
                result['label'],
                label_mapping=label_mapping,
                test_data_path=self.dev_data_path)
            result_out = {**result_out, **test_output}
        return result_out

    def save_results(self, results):
        """Write a results dict as JSON into the run's output directory."""
        result_path = os.path.join(self.output_path, 'results.json')
        logger.info(f'Writing output results to {result_path}...')
        with open(result_path, 'w') as f:
            json.dump(results, f)

    def predict(self, data):
        """Predict data (list of strings).

        Returns:
            list: One formatted prediction (softmax probabilities mapped to
                labels via ``format_predictions``) per input string.
        """
        # Setup (loads the trained model and label mapping).
        self._setup_bert(setup_mode='predict', data=data)
        # Run predict
        predict_examples = self.processor.get_test_examples(data)
        predict_features = self.convert_examples_to_features(predict_examples)
        all_input_ids = torch.tensor([f.input_ids for f in predict_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in predict_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in predict_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in predict_features],
                                     dtype=torch.long)
        predict_data = TensorDataset(all_input_ids, all_input_mask,
                                     all_segment_ids, all_label_ids)
        predict_sampler = SequentialSampler(predict_data)
        predict_dataloader = DataLoader(predict_data,
                                        sampler=predict_sampler,
                                        batch_size=self.eval_batch_size)
        self.model.eval()
        result = []
        for input_ids, input_mask, segment_ids, label_ids in predict_dataloader:
            input_ids = input_ids.to(self.device)
            input_mask = input_mask.to(self.device)
            segment_ids = segment_ids.to(self.device)
            # No labels passed, so output[0] is the logits tensor.
            output = self.model(input_ids, attention_mask=input_mask,
                                token_type_ids=segment_ids)
            logits = output[0]
            probabilities = torch.nn.functional.softmax(logits, dim=1)
            probabilities = probabilities.detach().cpu().numpy()
            res = self.format_predictions(probabilities,
                                          label_mapping=self.label_mapping)
            result.extend(res)
        return result

    def fine_tune(self):
        # Not supported for this model class.
        raise NotImplementedError

    def _setup_bert(self, setup_mode='train', data=None):
        """Shared setup: device/seed config, label mapping, tokenizer, and
        model loading, specialized by *setup_mode* ('train'/'test'/'predict').
        """
        # Create necessary directory structure
        if setup_mode == 'train':
            self.create_dirs()
        # GPU config
        self.device = torch.device("cuda" if torch.cuda.is_available()
                                   and not self.no_cuda else "cpu")
        self.n_gpu = torch.cuda.device_count()
        if self.no_cuda:
            self.n_gpu = 0
        if setup_mode == 'train':
            logger.info("Initialize BERT: device: {}, n_gpu: {}, distributed training: {}, 16-bits training: {}".format(self.device, self.n_gpu, False, self.fp16))
        if self.gradient_accumulation_steps < 1:
            raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(self.gradient_accumulation_steps))
        # Shrink the per-step batch so the *effective* batch stays constant.
        # NOTE(review): runs in every setup_mode, so calling twice on one
        # instance shrinks the batch size twice — confirm single-call usage.
        self.train_batch_size = self.train_batch_size // self.gradient_accumulation_steps
        # Seed all RNGs for reproducibility.
        random.seed(self.seed)
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        if self.n_gpu > 0:
            torch.cuda.manual_seed_all(self.seed)
        # Label mapping: built fresh for training, loaded otherwise.
        if setup_mode == 'train':
            self.label_mapping = self.set_label_mapping()
        elif setup_mode in ['test', 'predict']:
            self.label_mapping = self.get_label_mapping()
        # Build processor/tokenizer.
        self.processor = SentimentClassificationProcessor(self.train_data_path,
                                                          self.label_mapping)
        num_labels = len(self.label_mapping)
        # Lowercase iff the model name says 'uncased'.
        self.do_lower_case = 'uncased' in self.model_type
        self.tokenizer = BertTokenizer.from_pretrained(
            self.model_type, do_lower_case=self.do_lower_case)
        if setup_mode == 'train':
            self.train_examples = self.processor.get_train_examples(
                self.train_data_path)
            # NOTE(review): num_epochs is a float, so this product is a
            # float; downstream %d logging / scheduler accept it — confirm.
            self.num_train_optimization_steps = int(len(self.train_examples) / self.train_batch_size / self.gradient_accumulation_steps) * self.num_epochs
        # Prepare model
        if setup_mode == 'train':
            # TODO: restore fine-tune-path loading (previously commented-out
            # code loaded weights from self.fine_tune_path via state_dict).
            self.model = BertForSequenceClassification.from_pretrained(
                self.model_type, cache_dir=self.model_path,
                num_labels=num_labels)
            if self.fp16:
                self.model.half()
        else:
            # Load a trained model and config that you have trained
            self.model = BertForSequenceClassification.from_pretrained(
                self.output_path)
        self.model.to(self.device)
        if self.n_gpu > 1:
            self.model = torch.nn.DataParallel(self.model)

    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
        """Truncates a sequence pair in place to the maximum length."""
        # This is a simple heuristic which will always truncate the longer
        # sequence one token at a time. This makes more sense than truncating
        # an equal percent of tokens from each, since if one sequence is very
        # short then each token that's truncated likely contains more
        # information than a longer sequence.
        while True:
            total_length = len(tokens_a) + len(tokens_b)
            if total_length <= max_length:
                break
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()

    def accuracy(self, out, labels):
        """Return the *count* of correct argmax predictions (not a ratio —
        callers divide by the number of examples themselves)."""
        outputs = np.argmax(out, axis=1)
        return np.sum(outputs == labels)

    def convert_examples_to_features(self, examples):
        """Loads a data file into a list of `InputBatch`s.

        Tokenizes each example, truncates/pads to ``self.max_seq_length``,
        and returns a list of ``InputFeatures`` with input ids, attention
        mask, segment ids, and the mapped label id.
        """
        features = []
        for (ex_index, example) in enumerate(examples):
            tokens_a = self.tokenizer.tokenize(str(example.text_a))
            tokens_b = None
            if example.text_b:
                tokens_b = self.tokenizer.tokenize(str(example.text_b))
                # Modifies `tokens_a` and `tokens_b` in place so that the
                # total length is less than the specified length.
                # Account for [CLS], [SEP], [SEP] with "- 3"
                self._truncate_seq_pair(tokens_a, tokens_b,
                                        self.max_seq_length - 3)
            else:
                # Account for [CLS] and [SEP] with "- 2"
                if len(tokens_a) > self.max_seq_length - 2:
                    tokens_a = tokens_a[:(self.max_seq_length - 2)]
            # The convention in BERT is:
            # (a) For sequence pairs:
            #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
            #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
            # (b) For single sequences:
            #  tokens:   [CLS] the dog is hairy . [SEP]
            #  type_ids: 0     0   0   0  0     0 0
            #
            # Where "type_ids" are used to indicate whether this is the first
            # sequence or the second sequence. The embedding vectors for
            # `type=0` and `type=1` were learned during pre-training and are
            # added to the wordpiece embedding vector (and position vector).
            # This is not *strictly* necessary since the [SEP] token
            # unambiguously separates the sequences, but it makes it easier
            # for the model to learn the concept of sequences.
            #
            # For classification tasks, the first vector (corresponding to
            # [CLS]) is used as the "sentence vector". Note that this only
            # makes sense because the entire model is fine-tuned.
            tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
            segment_ids = [0] * len(tokens)
            if tokens_b:
                tokens += tokens_b + ["[SEP]"]
                segment_ids += [1] * (len(tokens_b) + 1)
            input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
            # The mask has 1 for real tokens and 0 for padding tokens. Only
            # real tokens are attended to.
            input_mask = [1] * len(input_ids)
            # Zero-pad up to the sequence length.
            padding = [0] * (self.max_seq_length - len(input_ids))
            input_ids += padding
            input_mask += padding
            segment_ids += padding
            assert len(input_ids) == self.max_seq_length
            assert len(input_mask) == self.max_seq_length
            assert len(segment_ids) == self.max_seq_length
            label_id = self.label_mapping[example.label]
            # Log the first few examples for debugging.
            if ex_index < 5:
                logger.debug("*** Example ***")
                logger.debug("guid: %s" % (example.guid))
                logger.debug("tokens: %s" % " ".join(
                    [str(x) for x in tokens]))
                logger.debug("input_ids: %s" % " ".join(
                    [str(x) for x in input_ids]))
                logger.debug("input_mask: %s" % " ".join(
                    [str(x) for x in input_mask]))
                logger.debug(
                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
                logger.debug("label: %s (id = %d)" % (example.label, label_id))
            features.append(
                InputFeatures(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              label_id=label_id))
        return features