def _get_users_subscription(self, user_ids):
    """Fetch each user's subscriptions and merge them into a single set
    of unique group ids (self.users_subscription).

    Takes one parameter: a list of user ids.
    """
    self._set_program_state('Processing data')
    requests_list = [
        'API.groups.get({"user_id":' + str(item) + '}).items,'
        for item in user_ids
    ]
    self._send_message('Preparing additional requests.', state=requests_list)
    if len(requests_list) > 0:
        self._send_message(f'Prepared {len(requests_list)} additional requests')
    one_percent = len(requests_list) / 100
    progressbar = ProgressBar(60)
    while requests_list:
        temp_list = requests_list[:25]
        requests_list = requests_list[25:]
        _requests = ''.join(temp_list)
        percent = len(temp_list) / one_percent
        request = self._api_execute(f'return [{_requests}];')
        for item in request:
            if item:
                for el in item:
                    self.users_subscription.add(el)
        progressbar.set_progress(percent)
    self._send_message(f'Added {len(self.users_subscription)} unique group_ids')
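# The 25-item chunks above match the VK API's `execute` method, which accepts
# at most 25 nested API calls per request. For context, a hypothetical
# _api_execute built on the vk_api package could look like the sketch below;
# the class and attribute names are illustrative, not from the original code.
import vk_api

class VkGroupScanner:
    def __init__(self, token):
        self._session = vk_api.VkApi(token=token)
        self.users_subscription = set()

    def _api_execute(self, code):
        # The VKScript passed to `execute` may contain at most 25 API calls,
        # hence the chunks of 25 in _get_users_subscription.
        return self._session.method('execute', {'code': code})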
def evaluate(args, model, tokenizer, prefix=""):
    eval_task_names = (args.task_name,)
    eval_outputs_dirs = (args.output_dir,)
    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, data_type='dev')
        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)
        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size,
                                     collate_fn=xlnet_collate_fn if args.model_type in ['xlnet'] else collate_fn)
        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        pbar = ProgressBar(n_total=len(eval_dataloader), desc="Evaluating")
        for step, batch in enumerate(eval_dataloader):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)
            with torch.no_grad():
                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          'labels': batch[3]}
                if args.model_type != 'distilbert':
                    # XLM, DistilBERT and RoBERTa don't use segment_ids
                    inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet', 'albert', 'roberta'] else None
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]
                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs['labels'].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
            pbar(step)
        print(' ')
        if 'cuda' in str(args.device):
            torch.cuda.empty_cache()
        eval_loss = eval_loss / nb_eval_steps
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)
        logger.info("  Num examples = %d", len(eval_dataset))
        logger.info("  Batch size = %d", args.eval_batch_size)
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  dev: %s = %s", key, str(result[key]))
    return results
def evaluate(valid_loader, model):
    pbar = ProgressBar(n_total=len(valid_loader), desc='Evaluating')
    valid_loss = AverageMeter()
    valid_acc = AverageMeter()
    model.eval()
    count = 0
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(valid_loader):
            data, target = data.to(device), target.to(device)
            output = model(data)
            loss = loss_fn(output, target).item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct = pred.eq(target.view_as(pred)).sum().item()
            valid_loss.update(loss, n=data.size(0))
            valid_acc.update(correct, n=1)
            count += data.size(0)
            pbar(step=batch_idx)
    return {'valid_loss': valid_loss.avg,
            'valid_acc': valid_acc.sum / count}
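# ProgressBar (constructed with n_total=..., desc=... and invoked as
# pbar(step) or pbar(step, {'loss': ...})) and AverageMeter (exposing
# update(), avg and sum) are not defined in these snippets. The sketch below
# assumes the usual PyTorch training-utility pattern; it is distinct from the
# ProgressBar(60) / set_progress() variant used in the first snippet.
import sys

class AverageMeter:
    """Track a running sum, count and average of a metric."""
    def __init__(self):
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        # `val` is treated as a mean value over `n` samples.
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

class ProgressBar:
    """Console progress line matching the call signature used above."""
    def __init__(self, n_total, desc='Training'):
        self.n_total = n_total
        self.desc = desc

    def __call__(self, step, info=None):
        msg = f'\r{self.desc} [{step + 1}/{self.n_total}]'
        if info:
            msg += ' ' + ' '.join(f'{k}: {v:.4f}' for k, v in info.items())
        sys.stdout.write(msg)
        sys.stdout.flush()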
def validation(model, val_dataloader, device, task_type):
    pre_labels = []
    total = 0
    total_correct = 0
    model.eval()
    loss_total = 0
    with torch.no_grad():
        pbar = ProgressBar(n_total=len(val_dataloader), desc='Validating')
        for step, batch in enumerate(val_dataloader):
            batch = [t.to(device) for t in batch]
            inputs_a = {'input_ids': batch[0],
                        'attention_mask': batch[1],
                        'token_type_ids': batch[2]}
            inputs_b = {'input_ids': batch[3],
                        'attention_mask': batch[4],
                        'token_type_ids': batch[5]}
            labels = batch[6]
            inputs = [inputs_a, inputs_b]
            output = model(inputs)
            if task_type == "classification":
                pred = torch.argmax(output, dim=1)
                pre_labels.extend(pred.detach().cpu().tolist())
                correct = (labels == pred).sum().item()
                total_correct += correct
                total += labels.size()[0]
                loss = F.cross_entropy(output, labels)
                pbar(step, {'loss': loss.item()})
            else:
                loss = F.mse_loss(output, labels)
                loss_total += loss.item()
    if task_type == "classification":
        acc = total_correct / total
        return acc, pre_labels
    else:
        return loss_total
def train(args, loaders, model):
    train_monitor = TrainingMonitor(file_dir='./logs/',
                                    arch=f'resnet18_{args.norm_type}_{args.batch_size}')
    train_loader, valid_loader = loaders['train'], loaders['valid']
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4)
    for epoch in range(1, args.epochs + 1):
        pbar = ProgressBar(n_total=len(train_loader), desc='Training')
        train_loss = AverageMeter()
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = loss_fn(output, target)
            loss.backward()
            optimizer.step()
            pbar(step=batch_idx, info={'loss': loss.item()})
            train_loss.update(loss.item(), n=1)
        valid_log = evaluate(valid_loader, model)
        train_log = {'loss': train_loss.avg}
        logs = dict(train_log, **valid_log)
        show_info = f'\nEpoch: {epoch} - ' + "-".join([f' {key}: {value:.4f} ' for key, value in logs.items()])
        print(show_info)
        train_monitor.epoch_step(logs)
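# `device` and `loss_fn` are referenced by train() and evaluate() above but
# never defined in these snippets; presumably they are module-level globals.
# A minimal setup consistent with that usage (CrossEntropyLoss is an
# assumption based on the argmax accuracy computation):
import torch
import torch.nn as nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
loss_fn = nn.CrossEntropyLoss()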
def predict(args, model, tokenizer, prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    pred_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
    pred_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)
    results = {}
    for pred_task, pred_output_dir in zip(pred_task_names, pred_outputs_dirs):
        pred_dataset = load_and_cache_examples(args, pred_task, tokenizer, data_type='test')
        if not os.path.exists(pred_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(pred_output_dir)
        args.pred_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        pred_sampler = SequentialSampler(pred_dataset) if args.local_rank == -1 else DistributedSampler(pred_dataset)
        pred_dataloader = DataLoader(pred_dataset, sampler=pred_sampler, batch_size=args.pred_batch_size,
                                     collate_fn=xlnet_collate_fn if args.model_type in ['xlnet'] else collate_fn)
        logger.info("***** Running prediction {} *****".format(prefix))
        logger.info("  Num examples = %d", len(pred_dataset))
        logger.info("  Batch size = %d", args.pred_batch_size)
        nb_pred_steps = 0
        preds = None
        pbar = ProgressBar(n_total=len(pred_dataloader), desc="Predicting")
        for step, batch in enumerate(pred_dataloader):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)
            with torch.no_grad():
                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          'labels': batch[3]}
                if args.model_type != 'distilbert':
                    # XLM, DistilBERT and RoBERTa don't use segment_ids
                    inputs['token_type_ids'] = batch[2] if (
                        'bert' in args.model_type or 'xlnet' in args.model_type) else None
                outputs = model(**inputs)
                _, logits = outputs[:2]
            nb_pred_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            pbar(step)
        print(' ')
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
        output_pred_file = os.path.join(pred_output_dir, prefix, "test_prediction.txt")
        with open(output_pred_file, "w") as writer:
            for pred in preds:
                writer.write(str(pred) + '\n')
    return results
def train(args, train_dataset, model, tokenizer, label_list, pad_token_label_id):
    """ Train the model """
    # Load the data
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                  batch_size=args.train_batch_size, collate_fn=collate_fn)
    # Total number of optimization steps
    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
    # Optimizer
    if args.optimizer.lower() == "adamw":
        no_decay = ["bias", "LayerNorm.weight"]
        # bert_parameters = eval('model.{}'.format(args.model_type)).named_parameters()
        base_model_param_optimizer = list(model.BaseModel.named_parameters())
        # Materialize the generators: each one is iterated twice below.
        start_parameters = list(model.start_fc.named_parameters())
        end_parameters = list(model.end_fc.named_parameters())
        args.bert_lr = args.bert_lr if args.bert_lr else args.learning_rate
        args.start_lr = args.start_lr if args.start_lr else args.learning_rate
        args.end_lr = args.end_lr if args.end_lr else args.learning_rate
        optimizer_grouped_parameters = [
            {"params": [p for n, p in base_model_param_optimizer if not any(nd in n for nd in no_decay)],
             "weight_decay": args.weight_decay, "lr": args.bert_lr},
            {"params": [p for n, p in base_model_param_optimizer if any(nd in n for nd in no_decay)],
             "weight_decay": 0.0, "lr": args.bert_lr},
            {"params": [p for n, p in start_parameters if not any(nd in n for nd in no_decay)],
             "weight_decay": args.weight_decay, "lr": args.start_lr},
            {"params": [p for n, p in start_parameters if any(nd in n for nd in no_decay)],
             "weight_decay": 0.0, "lr": args.start_lr},
            {"params": [p for n, p in end_parameters if not any(nd in n for nd in no_decay)],
             "weight_decay": args.weight_decay, "lr": args.end_lr},
            {"params": [p for n, p in end_parameters if any(nd in n for nd in no_decay)],
             "weight_decay": 0.0, "lr": args.end_lr},
        ]
        if "lstm" in args.model_type.lower():
            lstm_param_optimizer = list(model.Bilstm.named_parameters())
            optimizer_grouped_parameters.extend([
                {'params': [p for n, p in lstm_param_optimizer if not any(nd in n for nd in no_decay)],
                 'weight_decay': args.weight_decay, 'lr': args.crf_learning_rate},
                {'params': [p for n, p in lstm_param_optimizer if any(nd in n for nd in no_decay)],
                 'weight_decay': 0.0, 'lr': args.crf_learning_rate}])
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    else:
        optimizer = Lamb(model.parameters())
    # Learning-rate schedule with linear warmup
    args.warmup_steps = int(t_total * args.warmup_proportion)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps,
                                                num_training_steps=t_total)
    # Resume training from a checkpoint
    if args.continue_train and \
            os.path.isfile(os.path.join(args.continue_train_checkpoint, "optimizer.pt")) and \
            os.path.isfile(os.path.join(args.continue_train_checkpoint, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.continue_train_checkpoint, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.continue_train_checkpoint, "scheduler.pt")))
    if args.fp16:
        try:
            from apex import amp
            logger.info("using fp16 !!!")
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
    # Multi-GPU training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    # Adversarial training
    if args.adv_training == 'fgm':
        adv = FGM(model=model, param_name='word_embeddings')
    elif args.adv_training == 'pgd':
        adv = PGD(model=model, param_name='word_embeddings')
    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps)
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)
    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    seed_everything(args.seed)  # for reproducibility
    for epoch in range(int(args.num_train_epochs)):
        logger.info(f"############### Epoch_{epoch} ###############")
        pbar = ProgressBar(n_total=len(train_dataloader), desc='Training')
        for step, batch in enumerate(train_dataloader):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {"input_ids": batch[0], "attention_mask": batch[1],
                      "start_positions": batch[5], "end_positions": batch[6]}
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None)
            outputs = model(**inputs)
            loss = outputs[0]
            if args.n_gpu > 1:
                loss = loss.mean()
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            if args.adv_training:
                adv.adversarial_training(args, inputs, optimizer)
            pbar(step, {'loss': loss.item()})
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # update the learning rate after the optimizer step
                model.zero_grad()
                global_step += 1
                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Evaluate on the dev set
                    logger.info("\n global_step: %s", global_step)
                    logger.info("average tr_loss: %s", tr_loss / global_step)
                    evaluate(args, model, tokenizer, label_list, pad_token_label_id)
                if args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save a checkpoint
                    logger.info("global_step: %s model saved!", global_step)
                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    logger.info("Saving model checkpoint to %s", output_dir)
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    tokenizer.save_vocabulary(output_dir)
                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
    logger.info("\n")
    if 'cuda' in str(args.device):
        torch.cuda.empty_cache()
    return global_step, tr_loss / global_step
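# FGM and PGD are referenced above but not defined in these snippets. FGM
# (Fast Gradient Method) adversarial training typically perturbs the word
# embeddings along the normalized gradient, backpropagates the adversarial
# loss on top of the existing gradients, then restores the weights. Below is
# a minimal sketch matching the FGM(model=..., param_name='word_embeddings')
# construction and the adv.adversarial_training(args, inputs, optimizer)
# call above; the actual implementation may differ.
import torch

class FGM:
    def __init__(self, model, param_name='word_embeddings', epsilon=1.0):
        self.model = model
        self.param_name = param_name
        self.epsilon = epsilon
        self.backup = {}

    def attack(self):
        # Perturb the embedding weights along the gradient direction.
        for name, param in self.model.named_parameters():
            if param.requires_grad and self.param_name in name and param.grad is not None:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0 and not torch.isnan(norm):
                    param.data.add_(self.epsilon * param.grad / norm)

    def restore(self):
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data = self.backup[name]
        self.backup = {}

    def adversarial_training(self, args, inputs, optimizer):
        # args and optimizer are unused in this simplified FGM variant;
        # PGD needs them to manage gradients across multiple attack steps.
        self.attack()
        adv_loss = self.model(**inputs)[0]
        if adv_loss.dim() > 0:
            adv_loss = adv_loss.mean()
        adv_loss.backward()  # accumulate adversarial gradients
        self.restore()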
def predict(args, model, tokenizer, label_list, pad_token_label_id, prefix=""):
    pred_output_dir = args.output_dir
    if not os.path.exists(pred_output_dir):
        os.makedirs(pred_output_dir)
    test_dataset = load_and_cache_examples(args, args.task_name, tokenizer, label_list,
                                           pad_token_label_id, data_type='test')
    # Note that DistributedSampler samples randomly
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=1, collate_fn=collate_fn)
    # Collapse BIO labels into span types (e.g. B-PER/I-PER -> PER)
    span_labels = []
    for label in label_list:
        label = label.split('-')[-1]
        if label not in span_labels:
            span_labels.append(label)
    span_map = {i: label for i, label in enumerate(span_labels)}
    # Eval
    logger.info("***** Running prediction %s *****", prefix)
    logger.info("  Num examples = %d", len(test_dataset))
    logger.info("  Batch size = %d", 1)
    results = []         # all test results
    error_results = []   # mispredicted examples
    true_labels = []     # gold labels
    predict_labels = []  # predicted labels
    output_predict_file = os.path.join(pred_output_dir, prefix, "test_prediction.txt")
    error_predict_file = os.path.join(pred_output_dir, prefix, "Error_test_prediction.txt")
    pbar = ProgressBar(n_total=len(test_dataloader), desc="Predicting")
    if isinstance(model, torch.nn.DataParallel):  # unwrap the multi-GPU wrapper
        model = model.module
    for step, batch in enumerate(test_dataloader):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1],
                      "start_positions": batch[5], "end_positions": batch[6]}
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None)
            outputs = model(**inputs)
            tmp_eval_loss, start_logits, end_logits = outputs[:3]
            start_preds = start_logits.detach().cpu().numpy()
            end_preds = end_logits.detach().cpu().numpy()
        start_preds = np.argmax(start_preds, axis=2)
        end_preds = np.argmax(end_preds, axis=2)
        start_preds_list = [span_map[j] for j in start_preds[0][1:-1]]
        end_preds_list = [span_map[j] for j in end_preds[0][1:-1]]
        batch_true_labels = batch[3].squeeze(0).cpu().numpy().tolist()[1:-1]
        batch_true_labels = [args.id2label.get(i) for i in batch_true_labels]
        true_labels.append(batch_true_labels)
        batch_predict_labels = convert_span_to_bio([start_preds_list], [end_preds_list])
        predict_labels.extend(batch_predict_labels)
        input_ids = inputs["input_ids"].squeeze(0).cpu().numpy().tolist()[1:-1]
        sent = ""
        if_error = False
        for input_id, pre, lab in zip(input_ids, batch_predict_labels[0], batch_true_labels):
            sent += " ".join([tokenizer.ids_to_tokens[input_id], lab, pre]) + "\n"
            if lab != pre:
                if_error = True
        sent += "\n"
        results.append(sent)
        if if_error:
            error_results.append(sent)
        pbar(step)
    # Test-set accuracy, precision, recall, f1
    logger.info("\nTest set statistics:")
    logger.info("accuracy: %s", str(accuracy_score(true_labels, predict_labels)))
    logger.info("p: %s", str(precision_score(true_labels, predict_labels)))
    logger.info("r: %s", str(recall_score(true_labels, predict_labels)))
    logger.info("f1: %s", str(f1_score(true_labels, predict_labels)))
    logger.info("classification report: ")
    logger.info(str(classification_report(true_labels, predict_labels, mode='strict', scheme=IOB2)))
    logger.info("\n")
    with open(output_predict_file, "w", encoding="utf-8") as writer:
        for record in results:
            writer.write(record)
    with open(error_predict_file, "w", encoding="utf-8") as writer:
        for record in error_results:
            writer.write(record)
def evaluate(args, model, tokenizer, label_list, pad_token_label_id):
    eval_output_dir = args.output_dir
    if not os.path.exists(eval_output_dir):
        os.makedirs(eval_output_dir)
    eval_dataset = load_and_cache_examples(args, args.task_name, tokenizer, label_list,
                                           pad_token_label_id, data_type='dev')
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
    # Collapse BIO labels into span types
    span_labels = []
    for label in label_list:
        label = label.split('-')[-1]
        if label not in span_labels:
            span_labels.append(label)
    span_map = {i: label for i, label in enumerate(span_labels)}
    # Eval
    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    true_labels = []
    predict_labels = []
    model.eval()
    pbar = ProgressBar(n_total=len(eval_dataloader), desc="Evaluating")
    for step, batch in enumerate(eval_dataloader):
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1],
                      "start_positions": batch[5], "end_positions": batch[6]}
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None)
            outputs = model(**inputs)
            tmp_eval_loss, start_logits, end_logits = outputs[:3]
            if args.n_gpu > 1:
                tmp_eval_loss = tmp_eval_loss.mean()  # mean() to average on multi-gpu parallel evaluating
            eval_loss += tmp_eval_loss.item()
        nb_eval_steps += 1
        start_preds = start_logits.detach().cpu().numpy()  # [batch, seq_len, num_span_labels]
        end_preds = end_logits.detach().cpu().numpy()
        start_preds = np.argmax(start_preds, axis=2)  # [batch, seq_len]
        end_preds = np.argmax(end_preds, axis=2)
        start_preds_list = []
        end_preds_list = []
        batch_true_labels = batch[4].squeeze(0).cpu().numpy().tolist()
        for index, input_length in enumerate(batch[3]):  # batch[3] holds each sentence's length
            start_preds_list.append([span_map[j] for j in start_preds[index][:input_length]][1:-1])
            end_preds_list.append([span_map[j] for j in end_preds[index][:input_length]][1:-1])
            batch_true = [args.id2label.get(i) for i in batch_true_labels[index][:input_length]][1:-1]
            true_labels.append(batch_true)
        batch_predict_labels = convert_span_to_bio(start_preds_list, end_preds_list)
        predict_labels.extend(batch_predict_labels)
        pbar(step)
    logger.info("\n")
    logger.info("average eval_loss: %s", str(eval_loss / nb_eval_steps))
    logger.info("accuracy: %s", str(accuracy_score(true_labels, predict_labels)))
    logger.info("p: %s", str(precision_score(true_labels, predict_labels)))
    logger.info("r: %s", str(recall_score(true_labels, predict_labels)))
    logger.info("f1: %s", str(f1_score(true_labels, predict_labels)))
    logger.info("classification report: ")
    logger.info(str(classification_report(true_labels, predict_labels, mode='strict', scheme=IOB2)))
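# convert_span_to_bio is used by the two span-NER functions above but is not
# defined in these snippets. The sketch below is a hypothetical
# reconstruction: each non-'O' start label opens an entity that is closed at
# the first later position whose end label carries the same type.
def convert_span_to_bio(start_preds_list, end_preds_list):
    bio_sequences = []
    for starts, ends in zip(start_preds_list, end_preds_list):
        tags = ['O'] * len(starts)
        i = 0
        while i < len(starts):
            s_type = starts[i]
            if s_type != 'O':
                j = i
                while j < len(ends) and ends[j] != s_type:
                    j += 1  # scan for the matching end position
                if j < len(ends):
                    tags[i] = 'B-' + s_type
                    for k in range(i + 1, j + 1):
                        tags[k] = 'I-' + s_type
                    i = j + 1
                    continue
            i += 1
        bio_sequences.append(tags)
    return bio_sequences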
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    # Initialise a pandas DataFrame that stores the training log
    df_path = args.output_dir + "/bert" + "df_log.pickle"
    if not os.path.isfile(df_path):
        df = pd.DataFrame(columns=["epoch", "train_loss", "train_auc", "test_loss", "test_auc"])
        df.to_pickle(df_path)
        print("log DataFrame created!")
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                  batch_size=args.train_batch_size, collate_fn=collate_fn)
    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
    args.warmup_steps = int(t_total * args.warmup_proportion)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)
    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    seed_everything(args.seed)
    for epoch in range(int(args.num_train_epochs)):
        all_predictions, all_labels = [], []
        pbar = ProgressBar(n_total=len(train_dataloader), desc='Train')
        for step, batch in enumerate(train_dataloader):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      'labels': batch[3]}
            if args.model_type != 'distilbert':
                # XLM, DistilBERT don't use segment_ids
                inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet', 'albert', 'roberta'] else None
            outputs = model(**inputs)
            loss = outputs[0]
            # Collect every batch's predictions and labels so far
            # to compute a running AUC over the epoch
            predictions = F.softmax(outputs[1], dim=-1)[:, 1]
            predictions = predictions.detach().cpu().numpy().reshape(-1).tolist()
            labels = batch[3].cpu().numpy().reshape(-1).tolist()
            all_predictions.extend(predictions)
            all_labels.extend(labels)
            fpr, tpr, thresholds = metrics.roc_curve(y_true=all_labels, y_score=all_predictions)
            auc = metrics.auc(fpr, tpr)
            if args.n_gpu > 1:
                loss = loss.mean()
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            tr_loss += loss.item()
            # Append the running metrics to the pickled log DataFrame
            log_dic = {"epoch": epoch, "train_loss": tr_loss / (step + 1),
                       "train_auc": auc, "test_loss": 0, "test_auc": 0}
            if step % 10 == 0:
                print(str({k: v for k, v in log_dic.items() if v != 0}))
                df = pd.read_pickle(df_path)
                df = df.append([log_dic])
                df.reset_index(inplace=True, drop=True)
                df.to_pickle(df_path)
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1
                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    if args.local_rank == -1:
                        results = evaluate(args, model, tokenizer)
                # Save a checkpoint every args.save_steps steps
                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)
                    tokenizer.save_vocabulary(vocab_path=output_dir)
            pbar(step, {'loss': loss.item()})
        # AUC-based learning-rate decay and early stopping
        global all_auc
        global threshold
        all_auc.append(auc)
        best_auc = max(all_auc)
        if all_auc[-1] < best_auc:
            threshold += 1
            args.learning_rate *= 0.8
            optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
            predict(args, model, tokenizer, prefix="")
        else:
            threshold = 0  # reset the patience counter when the AUC improves
        if threshold >= 5:
            print("epoch {} has the best auc".format(int(np.argmax(np.array(all_auc)))))
            print("early stop!")
            break
    print(" ")
    if 'cuda' in str(args.device):
        torch.cuda.empty_cache()
    return global_step, tr_loss / global_step
def evaluate(args, model, tokenizer, label_list, pad_token_label_id):
    eval_output_dir = args.output_dir
    if not os.path.exists(eval_output_dir):
        os.makedirs(eval_output_dir)
    eval_dataset = load_and_cache_examples(args, args.task_name, tokenizer, label_list,
                                           pad_token_label_id, data_type='dev')
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler,
                                 batch_size=args.eval_batch_size, collate_fn=collate_fn)
    # Eval
    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    true_labels = []
    predict_labels = []
    model.eval()
    pbar = ProgressBar(n_total=len(eval_dataloader), desc="Evaluating")
    for step, batch in enumerate(eval_dataloader):
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1],
                      "labels": batch[3], 'input_lens': batch[4]}
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None)
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]
            batch_predict_labels = model.crf.decode(logits, inputs['attention_mask'])
            if args.n_gpu > 1:
                tmp_eval_loss = tmp_eval_loss.mean()  # mean() to average on multi-gpu parallel evaluating
            eval_loss += tmp_eval_loss.item()
        nb_eval_steps += 1
        batch_true_labels = batch[3].squeeze(0).cpu().numpy().tolist()
        pbar(step)
        for index, input_length in enumerate(batch[4]):
            batch_true = [args.id2label.get(i) for i in batch_true_labels[index][:input_length]][1:-1]
            batch_predict = [args.id2label.get(i) for i in batch_predict_labels[index][:input_length]][1:-1]
            true_labels.append(batch_true)
            predict_labels.append(batch_predict)
    logger.info("\n")
    logger.info("average eval_loss: %s", str(eval_loss / nb_eval_steps))
    logger.info("accuracy: %s", str(accuracy_score(true_labels, predict_labels)))
    logger.info("p: %s", str(precision_score(true_labels, predict_labels)))
    logger.info("r: %s", str(recall_score(true_labels, predict_labels)))
    logger.info("f1: %s", str(f1_score(true_labels, predict_labels)))
    logger.info("classification report: ")
    logger.info(str(classification_report(true_labels, predict_labels, mode='strict', scheme=IOB2)))
def train(args):
    logger.info("args: %s", args)
    tokenizer = BertTokenizer.from_pretrained(args.pretrained)
    config = BertConfig.from_pretrained(args.pretrained)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    task_type = args.task_type
    model = SentenceBert.from_pretrained(config=config,
                                         pretrained_model_name_or_path=args.pretrained,
                                         max_len=args.max_len,
                                         tokenizer=tokenizer,
                                         device=device,
                                         task_type=task_type)
    model.to(device)
    train_dataset = DataReader(tokenizer=tokenizer, filepath=args.train_file,
                               max_len=args.max_len, task_type=task_type)
    train_dataloader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
    val_dataset = DataReader(tokenizer=tokenizer, filepath=args.val_file,
                             max_len=args.max_len, task_type=task_type)
    val_dataloader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=True)
    optimizer = AdamW(model.parameters(), lr=args.lr)
    scheduler = ReduceLROnPlateau(optimizer=optimizer, mode='max', factor=0.5, patience=2)
    re_scheduler = ReduceLROnPlateau(optimizer=optimizer, mode='min', factor=0.5, patience=2)
    model.train()
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataloader))
    logger.info("  Num Epochs = %d", args.epochs)
    best_acc = 0.0
    re_loss_min = 1000000.0
    for epoch in range(args.epochs):
        pbar = ProgressBar(n_total=len(train_dataloader), desc='Training')
        for step, batch in enumerate(train_dataloader):
            batch = [t.to(device) for t in batch]
            inputs_a = {'input_ids': batch[0],
                        'attention_mask': batch[1],
                        'token_type_ids': batch[2]}
            inputs_b = {'input_ids': batch[3],
                        'attention_mask': batch[4],
                        'token_type_ids': batch[5]}
            labels = batch[6]
            inputs = [inputs_a, inputs_b]
            output = model(inputs)
            if task_type == "classification":
                loss = F.cross_entropy(output, labels)
            else:
                loss = F.mse_loss(output, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            pbar(step, {'loss': loss.item()})
        time_str = datetime.now().strftime('%Y-%m-%d')
        if task_type == "classification":
            # train_acc, _ = validation(model, train_dataloader, device, task_type)
            val_acc, _ = validation(model, val_dataloader, device, task_type)
            scheduler.step(val_acc)
            if val_acc > best_acc:
                best_acc = val_acc
                save_path = os.path.join(args.model_out, "classification",
                                         "20W_SBert_best_new" + time_str)
                if not os.path.exists(save_path):
                    os.makedirs(save_path)
                logger.info("save model")
                model.save_pretrained(save_path)
                tokenizer.save_vocabulary(save_path)
            # logger.info("train_acc: %.4f------val_acc:%.4f------best_acc:%.4f" % (train_acc, val_acc, best_acc))
            logger.info("val_acc:%.4f------best_acc:%.4f" % (val_acc, best_acc))
        else:
            # re_train_loss = validation(model, train_dataloader, device, task_type)
            re_val_loss = validation(model, val_dataloader, device, task_type)
            re_scheduler.step(re_val_loss)
            if re_loss_min > re_val_loss:
                re_loss_min = re_val_loss
                save_path = os.path.join(args.model_out, "regression",
                                         "20W_SBert_best_new_bak_" + time_str)
                if not os.path.exists(save_path):
                    os.makedirs(save_path)
                logger.info("save model")
                model.save_pretrained(save_path)
                tokenizer.save_vocabulary(save_path)
            # logger.info("re_train_loss:%.4f------re_val_loss: %.4f------re_loss_min:%.4f" % (re_train_loss, re_val_loss, re_loss_min))
            logger.info("re_val_loss: %.4f------re_loss_min:%.4f" % (re_val_loss, re_loss_min))
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size,
                                  collate_fn=xlnet_collate_fn if args.model_type in ['xlnet'] else collate_fn)
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
    args.warmup_steps = int(t_total * args.warmup_proportion)
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         "weight_decay": args.weight_decay},
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps,
                                                num_training_steps=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)
    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps *
                (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)
    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        # set global_step to the global_step of the last saved checkpoint from the model path
        try:
            global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
        except ValueError:
            global_step = 0
        epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
        steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
        logger.info("  Continuing training from epoch %d", epochs_trained)
        logger.info("  Continuing training from global step %d", global_step)
        logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    seed_everything(args.seed)  # Added here for reproducibility (even between python 2 and 3)
    for _ in range(int(args.num_train_epochs)):
        pbar = ProgressBar(n_total=len(train_dataloader), desc='Training')
        for step, batch in enumerate(train_dataloader):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      'labels': batch[3]}
            if args.model_type != 'distilbert':
                # XLM, DistilBERT don't use segment_ids
                inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet', 'albert', 'roberta'] else None
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            pbar(step, {'loss': loss.item()})
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    print(" ")
                    logs = {}
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:
                        # Only evaluate on a single GPU, otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            eval_key = "eval_{}".format(key)
                            logs[eval_key] = value
                    loss_scalar = (tr_loss - logging_loss) / args.logging_steps
                    learning_rate_scalar = scheduler.get_lr()[0]
                    logs["learning_rate"] = learning_rate_scalar
                    logs["loss"] = loss_scalar
                    logging_loss = tr_loss
                    for key, value in logs.items():
                        tb_writer.add_scalar(key, value, global_step)
                    print(json.dumps({**logs, **{"step": global_step}}))
                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (model.module if hasattr(model, "module") else model)  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)
                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)
        print(" ")
        if 'cuda' in str(args.device):
            torch.cuda.empty_cache()
    if args.local_rank in [-1, 0]:
        tb_writer.close()
    return global_step, tr_loss / global_step
def predict(args, model, tokenizer, label_list, prefix=""):
    pred_task_names = (args.task_name,)
    pred_outputs_dirs = (args.output_dir,)
    label_map = {i: label for i, label in enumerate(label_list)}
    for pred_task, pred_output_dir in zip(pred_task_names, pred_outputs_dirs):
        pred_dataset = load_and_cache_examples(args, pred_task, tokenizer, data_type='test')
        if not os.path.exists(pred_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(pred_output_dir)
        args.pred_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        pred_sampler = SequentialSampler(pred_dataset) if args.local_rank == -1 else DistributedSampler(pred_dataset)
        pred_dataloader = DataLoader(pred_dataset, sampler=pred_sampler, batch_size=args.pred_batch_size,
                                     collate_fn=xlnet_collate_fn if args.model_type in ['xlnet'] else collate_fn)
        logger.info("***** Running prediction {} *****".format(prefix))
        logger.info("  Num examples = %d", len(pred_dataset))
        logger.info("  Batch size = %d", args.pred_batch_size)
        nb_pred_steps = 0
        preds = None
        pbar = ProgressBar(n_total=len(pred_dataloader), desc="Predicting")
        for step, batch in enumerate(pred_dataloader):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)
            with torch.no_grad():
                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          'labels': batch[3]}
                if args.model_type != 'distilbert':
                    # XLM, DistilBERT and RoBERTa don't use segment_ids
                    inputs['token_type_ids'] = batch[2] if (
                        'bert' in args.model_type or 'xlnet' in args.model_type) else None
                outputs = model(**inputs)
                _, logits = outputs[:2]
            nb_pred_steps += 1
            if preds is None:
                if pred_task == 'copa':
                    preds = logits.softmax(-1).detach().cpu().numpy()
                else:
                    preds = logits.detach().cpu().numpy()
            else:
                if pred_task == 'copa':
                    preds = np.append(preds, logits.softmax(-1).detach().cpu().numpy(), axis=0)
                else:
                    preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            pbar(step)
        print(' ')
        if args.output_mode == "classification":
            predict_label = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            predict_label = np.squeeze(preds)
        if pred_task == 'copa':
            # Each COPA example yields two consecutive rows (one per candidate);
            # keep the candidate with the higher positive-class probability.
            predict_label = []
            pred_logits = preds[:, 1]
            i = 0
            while i < len(pred_logits) - 1:
                if pred_logits[i] >= pred_logits[i + 1]:
                    predict_label.append(0)
                else:
                    predict_label.append(1)
                i += 2
        output_submit_file = os.path.join(pred_output_dir, prefix, "test_prediction.json")
        output_logits_file = os.path.join(pred_output_dir, prefix, "test_logits")
        # Save the predicted labels
        with open(output_submit_file, "w") as writer:
            for i, pred in enumerate(predict_label):
                json_d = {'id': i, 'label': str(label_map[pred])}
                writer.write(json.dumps(json_d) + '\n')
        # Save the raw prediction scores
        save_numpy(file_path=output_logits_file, data=preds)
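# The COPA decoding above pairs consecutive rows of softmax scores and keeps
# the candidate with the higher positive-class probability. A small
# self-contained illustration of the same pairwise argmax:
import numpy as np

pred_logits = np.array([0.9, 0.1, 0.2, 0.7])  # preds[:, 1] for two COPA examples
predict_label = [0 if pred_logits[i] >= pred_logits[i + 1] else 1
                 for i in range(0, len(pred_logits) - 1, 2)]
assert predict_label == [0, 1]  # first example: candidate 0; second: candidate 1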
def predict(args, model, tokenizer, label_list, pad_token_label_id, prefix=""):
    pred_output_dir = args.output_dir
    if not os.path.exists(pred_output_dir):
        os.makedirs(pred_output_dir)
    test_dataset = load_and_cache_examples(args, args.task_name, tokenizer, label_list,
                                           pad_token_label_id, data_type='test')
    # Note that DistributedSampler samples randomly
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=1,
                                 collate_fn=collate_fn)  # one example per batch
    # Eval
    logger.info("***** Running prediction %s *****", prefix)
    logger.info("  Num examples = %d", len(test_dataset))
    logger.info("  Batch size = %d", 1)
    results = []         # all test results
    error_results = []   # mispredicted examples
    true_labels = []     # gold labels
    predict_labels = []  # predicted labels
    output_predict_file = os.path.join(pred_output_dir, prefix, "test_prediction.txt")
    error_predict_file = os.path.join(pred_output_dir, prefix, "Error_test_prediction.txt")
    pbar = ProgressBar(n_total=len(test_dataloader), desc="Predicting")
    if isinstance(model, torch.nn.DataParallel):  # unwrap the multi-GPU wrapper
        model = model.module
    for step, batch in enumerate(test_dataloader):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1],
                      "labels": None, 'input_lens': batch[4]}
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None)
            outputs = model(**inputs)
            logits = outputs[0]
            batch_predict_labels = model.crf.decode(logits, inputs['attention_mask'])
        batch_predict_labels = batch_predict_labels[0][1:-1]  # strip [CLS] ... [SEP]; one example per batch
        batch_true_labels = batch[3].squeeze(0).cpu().numpy().tolist()[1:-1]
        input_ids = inputs["input_ids"].squeeze(0).cpu().numpy().tolist()[1:-1]
        sent = ""
        if_error = False
        for input_id, pre, lab in zip(input_ids, batch_predict_labels, batch_true_labels):
            sent += " ".join([tokenizer.ids_to_tokens[input_id],
                              args.id2label[lab], args.id2label[pre]]) + "\n"
            if args.id2label[lab] != args.id2label[pre]:
                if_error = True
        sent += "\n"
        results.append(sent)
        if if_error:
            error_results.append(sent)
        pbar(step)
        # Collect labels for test-set accuracy, precision, recall, f1
        batch_true = [args.id2label.get(i) for i in batch_true_labels]
        batch_predict = [args.id2label.get(i) for i in batch_predict_labels]
        assert len(batch_true) == len(batch_predict)
        true_labels.append(batch_true)
        predict_labels.append(batch_predict)
    logger.info("\nTest set statistics:")
    logger.info("accuracy: %s", str(accuracy_score(true_labels, predict_labels)))
    logger.info("p: %s", str(precision_score(true_labels, predict_labels)))
    logger.info("r: %s", str(recall_score(true_labels, predict_labels)))
    logger.info("f1: %s", str(f1_score(true_labels, predict_labels)))
    logger.info("classification report: ")
    logger.info(str(classification_report(true_labels, predict_labels, mode='strict', scheme=IOB2)))
    logger.info("\n")
    with open(output_predict_file, "w", encoding="utf-8") as writer:
        for record in results:
            writer.write(record)
    with open(error_predict_file, "w", encoding="utf-8") as writer:
        for record in error_results:
            writer.write(record)
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size,
                                  collate_fn=xlnet_collate_fn if args.model_type in ['xlnet'] else collate_fn)
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
    args.warmup_steps = int(t_total * args.warmup_proportion)
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)
    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps *
                (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)
    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    seed_everything(args.seed)  # Added here for reproducibility (even between python 2 and 3)
    for _ in range(int(args.num_train_epochs)):
        pbar = ProgressBar(n_total=len(train_dataloader), desc='Training')
        for step, batch in enumerate(train_dataloader):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      'labels': batch[3]}
            if args.model_type != 'distilbert':
                # XLM, DistilBERT don't use segment_ids
                inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet', 'albert', 'roberta'] else None
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            pbar(step, {'loss': loss.item()})
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    print(" ")
                    # Log metrics
                    if args.local_rank == -1:
                        # Only evaluate on a single GPU, otherwise metrics may not average well
                        evaluate(args, model, tokenizer)
                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)
                    tokenizer.save_vocabulary(vocab_path=output_dir)
        print(" ")
        if 'cuda' in str(args.device):
            torch.cuda.empty_cache()
    return global_step, tr_loss / global_step
def predict(args, model, tokenizer, prefix=""):
    pred_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
    pred_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)
    results = {}
    for pred_task, pred_output_dir in zip(pred_task_names, pred_outputs_dirs):
        pred_dataset = load_and_cache_examples(args, pred_task, tokenizer, data_type='test')
        if not os.path.exists(pred_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(pred_output_dir)
        args.pred_batch_size = 64 * max(1, args.n_gpu)  # DEFAULT = 128
        pred_sampler = SequentialSampler(pred_dataset) if args.local_rank == -1 else DistributedSampler(pred_dataset)
        pred_dataloader = DataLoader(pred_dataset, sampler=pred_sampler, batch_size=args.pred_batch_size,
                                     collate_fn=xlnet_collate_fn if args.model_type in ['xlnet'] else collate_fn)
        logger.info("***** Running prediction {} *****".format(prefix))
        logger.info("  Num examples = %d", len(pred_dataset))
        logger.info("  Batch size = %d", args.pred_batch_size)
        nb_pred_steps = 0
        preds = None
        pbar = ProgressBar(n_total=len(pred_dataloader), desc="Predict")
        for step, batch in enumerate(pred_dataloader):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)
            with torch.no_grad():
                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          'labels': batch[3]}
                if args.model_type != 'distilbert':
                    # XLM, DistilBERT and RoBERTa don't use segment_ids
                    inputs['token_type_ids'] = batch[2] if (
                        'bert' in args.model_type or 'xlnet' in args.model_type) else None
                outputs = model(**inputs)
                _, logits = outputs[:2]
            nb_pred_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            pbar(step)
        write_list = session_probability(args.data_dir, preds, 1)
        output_pred_file = os.path.join(pred_output_dir, prefix, "test_prediction.txt")
        with open(output_pred_file, "w") as writer:
            writer.write('SessionId,Probability\n')
            for idx, element in enumerate(write_list):
                writer.write(str(idx + 50069) + "," + str(element) + '\n')
    return results