def evaluate(self, dataloader, metric='accuracy'):
    running_loss = 0.
    classifier = None

    if self.model_type == 'classifier':  # or self.num_classes is not None:
        classifier = Classifier(self.class_names)
        y_pred = []
        y_true = []

    self.eval()
    rmse_ = 0.
    with torch.no_grad():
        for data_batch in dataloader:
            inputs, labels = data_batch[0], data_batch[1]
            inputs = inputs.to(self.device)
            labels = labels.to(self.device)
            outputs = self.forward(inputs)
            loss = self.compute_loss(outputs, labels)[0]
            if self.parallel:
                running_loss += loss.sum()
                outputs = parallel.gather(outputs, self.device)
            else:
                running_loss += loss.item()
            if classifier is not None and metric == 'accuracy':
                classifier.update_accuracies(outputs, labels)
                y_true.extend(list(labels.squeeze(0).cpu().numpy()))
                _, preds = torch.max(torch.exp(outputs), 1)
                y_pred.extend(list(preds.cpu().numpy()))
            elif metric == 'rmse':
                rmse_ += rmse(outputs, labels).cpu().numpy()

    self.train()

    ret = {}
    # print('Running_loss: {:.3f}'.format(running_loss))
    if metric == 'rmse':
        print('Total rmse: {:.3f}'.format(rmse_))
        ret['final_rmse'] = rmse_ / len(dataloader)

    ret['final_loss'] = running_loss / len(dataloader)

    if classifier is not None:
        ret['accuracy'], ret['class_accuracies'] = classifier.get_final_accuracies()
        ret['report'] = classification_report(y_true, y_pred, target_names=self.class_names)
        ret['confusion_matrix'] = confusion_matrix(y_true, y_pred)
        try:
            ret['roc_auc_score'] = roc_auc_score(y_true, y_pred)
        except ValueError:
            # roc_auc_score is undefined when only one class is present (or in
            # unsupported multiclass setups); skip it in that case.
            pass

    return ret
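
# NOTE: the `rmse` helper called in evaluate() above is defined elsewhere in this repo.
# The function below is a hypothetical stand-in (not the original implementation) that
# sketches what it is assumed to compute: batch root-mean-squared error between two
# same-shaped float tensors. `torch` is assumed to be imported at the top of this file.
def rmse_sketch(outputs, labels):
    # Flatten both tensors and take the square root of the mean squared difference.
    return torch.sqrt(torch.mean((outputs.view(-1) - labels.view(-1)) ** 2))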
def save_embeddings(args, model, tokenizer):
    with open(os.path.join(args.data_dir, 'label_map.json'), 'r') as reader:
        label_map = json.load(reader)
    label_id_to_string = {i: label for label, i in label_map.items()}

    # Train set
    pooled_output_file = os.path.join(args.output_dir, 'train_embeddings.csv')
    train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
    train_sampler = SequentialSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.eval_batch_size)
    logger.info("***** Generating Embeddings for Train Examples")
    for batch in tqdm(train_dataloader, desc="Train Embedding"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                # XLM and RoBERTa don't use segment_ids
                'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,
                # 'labels': batch[3]
            }
            outputs = model(**inputs)
            outputs = [outputs[i] for i in range(len(outputs))]
            pooled_outputs = parallel.gather([output[1] for output in outputs],
                                             target_device='cuda:0')
        pooled_outputs = pooled_outputs.detach().cpu().numpy()
        out_label_ids = batch[3].detach().cpu().numpy()
        example_ids = batch[4].detach().cpu().numpy()
        labels = [label_id_to_string[i] for i in out_label_ids]
        pooled_outputs = pd.DataFrame(pooled_outputs)
        pooled_outputs['example_id'] = example_ids
        pooled_outputs['label'] = labels
        # Append this batch's embeddings to the CSV (each append writes its own header).
        pooled_outputs.to_csv(pooled_output_file, mode='a')

    # Dev set
    logger.info("***** Generating Embeddings for Dev Examples")
    pooled_output_file = os.path.join(args.output_dir, 'dev_embeddings.csv')
    eval_task = args.task_name
    eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)
    for batch in tqdm(eval_dataloader, desc="Dev Embedding"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                # XLM and RoBERTa don't use segment_ids
                'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,
                # 'labels': batch[3]
            }
            outputs = model(**inputs)
            outputs = [outputs[i] for i in range(len(outputs))]
            pooled_outputs = parallel.gather([output[1] for output in outputs],
                                             target_device='cuda:0')
        pooled_outputs = pooled_outputs.detach().cpu().numpy()
        out_label_ids = batch[3].detach().cpu().numpy()
        example_ids = batch[4].detach().cpu().numpy()
        labels = [label_id_to_string[i] for i in out_label_ids]
        pooled_outputs = pd.DataFrame(pooled_outputs)
        pooled_outputs['example_id'] = example_ids
        pooled_outputs['label'] = labels
        pooled_outputs.to_csv(pooled_output_file, mode='a')
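
# Because save_embeddings() writes with to_csv(mode='a'), every batch appends its own
# header row (and index column) to the CSV. The function below is a minimal read-back
# sketch, hypothetical and not part of the original pipeline, assuming the file layout
# produced above; it drops the repeated per-batch header rows.
def load_embeddings_sketch(path='train_embeddings.csv'):
    import pandas as pd
    df = pd.read_csv(path)
    # Repeated headers show up as data rows whose 'label' cell is literally the string 'label'.
    return df[df['label'] != 'label'].reset_index(drop=True)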
def evaluate(args, model, tokenizer, prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = (args.task_name, )
    eval_outputs_dirs = (args.output_dir, )

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)
        with open(os.path.join(args.data_dir, 'label_map.json'), 'r') as reader:
            label_map = json.load(reader)
        label_id_to_string = {i: label for label, i in label_map.items()}

        if not os.path.exists(eval_output_dir):
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.eval_batch_size)

        preds = None
        out_label_ids = None
        example_ids = None
        pooled_outputs = None
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)
            with torch.no_grad():
                inputs = {
                    'input_ids': batch[0],
                    'attention_mask': batch[1],
                    # XLM and RoBERTa don't use segment_ids
                    'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,
                    # 'labels': batch[3]
                }
                outputs = model(**inputs)
                # outputs = [outputs[i][0] for i in range(len(outputs))]
                outputs = [outputs[i] for i in range(len(outputs))]
                # Gather logits and pooled outputs onto separate devices to balance memory.
                logits = parallel.gather([output[0] for output in outputs],
                                         target_device='cuda:0')
                pooled_output = parallel.gather([output[1] for output in outputs],
                                                target_device='cuda:1')

            if preds is None:
                preds = logits.detach().cpu()  # .numpy()
                pooled_outputs = pooled_output.detach().cpu().numpy()
                out_label_ids = batch[3].detach().cpu()  # .numpy()
                example_ids = batch[4].detach().cpu().numpy()
            else:
                preds = torch.cat((preds, logits.detach().cpu()), axis=0)
                pooled_outputs = np.append(pooled_outputs,
                                           pooled_output.detach().cpu().numpy(),
                                           axis=0)
                out_label_ids = torch.cat((out_label_ids, batch[3].detach().cpu()), axis=0)
                example_ids = np.append(example_ids,
                                        batch[4].detach().cpu().numpy(),
                                        axis=0)

        k_values = (1, 2, 3, 4, 5)
        topk_accuracies, mistakes_at_k, preds_topk = accuracy(preds, out_label_ids, k_values)
        preds_topk = preds_topk.t()

        # To extract specific examples at top-k, take the difference between the mistakes
        # at k and the mistakes at k-1: those are the examples first predicted correctly
        # at k. Doing this for k = 1..5 gives concrete examples of chief complaints at
        # increasing levels of "difficulty". For instance, mistakes_at_k[0] != mistakes_at_k[1]
        # is True for indices that were wrong at top-1 but correct at top-2; matching those
        # indices with text and labels (here for top-1->2 and top-4->5) highlights the
        # different types of errors we see.

        ### Top-1 to Top-2
        top1to2 = (mistakes_at_k[0] != mistakes_at_k[1])
        df_1to2 = get_errors_topk(top1to2, preds_topk, 2, example_ids, label_id_to_string)
        df_1to2.to_csv(os.path.join(eval_output_dir, 'df_1to2.csv'))

        ### Top-4 to Top-5
        top4to5 = (mistakes_at_k[3] != mistakes_at_k[4])
        df_4to5 = get_errors_topk(top4to5, preds_topk, 5, example_ids, label_id_to_string)
        df_4to5.to_csv(os.path.join(eval_output_dir, 'df_4to5.csv'))

        ### Top-8 to Top-9
        # top8to9 = (mistakes_at_k[7] != mistakes_at_k[8])
        # df_8to9 = get_errors_topk(top8to9, preds_topk, 9, example_ids, label_id_to_string)
        # df_8to9.to_csv(os.path.join(eval_output_dir, 'df_8to9.csv'))

        preds = preds.numpy()
        out_label_ids = out_label_ids.numpy()
        preds = np.argmax(preds, axis=1)
        # result = compute_metrics(eval_task, preds, out_label_ids)
        for k, acc in zip(k_values, topk_accuracies):
            results['top{}_acc'.format(k)] = round(acc.item(), 4)
        # results.update(result)

        labels = [label_id_to_string[i] for i in out_label_ids]
        label_ids = range(len(label_map.keys()))
        label_list = [label_id_to_string[i] for i in label_ids]

        # Copy the non-mistake indices from out_label_ids over to preds, so an example
        # counts as correct at k whenever the true label appears anywhere in its top k.
        for k, mistakes in zip(k_values, mistakes_at_k):
            m_k = mistakes.view(-1).numpy()
            preds[~m_k] = out_label_ids[~m_k]
            result = compute_f1pr_topk(preds, out_label_ids, k)
            if args.final_eval:
                plot_confusion_matrix_topk(out_label_ids, preds, label_list, args.output_dir, k)
            results.update(result)

        metrics = ['acc', 'f1_macro', 'f1_micro', 'f1_weighted', 'precision', 'recall']
        results_lists = []
        for i in k_values:
            metrics_for_k = []
            for m in metrics:
                metrics_for_k.append(results["top{}_{}".format(i, m)])
            results_lists.append(metrics_for_k)
        results_df = pd.DataFrame(results_lists, index=k_values, columns=metrics)
        results_df.to_csv(os.path.join(eval_output_dir, "results_df.csv"))

        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
        with open(output_eval_file, "a") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(results.keys()):
                if 'acc' in key:
                    logger.info(" %s = %s", key, str(results[key]))
                writer.write("%s\t%s\n" % (key, str(round(results[key], 4))))
            writer.write('\n')

    return results
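
# The top-k `accuracy` helper used in evaluate() above is defined elsewhere in the repo.
# The function below is a hypothetical sketch consistent with how it is called: it
# returns per-k accuracies, per-k boolean "mistake" masks, and the (max_k, N) matrix of
# top-k predicted class indices. It is illustrative only, not the original implementation.
def topk_accuracy_sketch(logits, targets, k_values=(1, 2, 3, 4, 5)):
    import torch
    max_k = max(k_values)
    # (N, max_k) indices of the highest-scoring classes, best first; transpose to (max_k, N).
    _, topk = logits.topk(max_k, dim=1, largest=True, sorted=True)
    topk = topk.t()
    correct = topk.eq(targets.view(1, -1).expand_as(topk))
    accuracies, mistakes = [], []
    for k in k_values:
        hit_at_k = correct[:k].any(dim=0)      # True where the label is somewhere in the top k
        accuracies.append(hit_at_k.float().mean())
        mistakes.append(~hit_at_k)             # the per-k mistake masks used above
    return accuracies, mistakes, topk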
def main():
    parser = setup_parser()
    args = parser.parse_args()

    processors = {
        'stsb': StsbProcessor,
        'mednli': MednliProcessor,
        'medsts': MedstsProcessor
    }
    output_modes = {
        'mnli': 'classification',
        'stsb': 'regression',
        'mednli': 'classification',
        'medsts': 'regression'
    }
    bert_types = {
        'discharge': '/home/dc925/project/data/clinicalbert/biobert_pretrain_output_disch_100000',
        'all': '/home/dc925/project/data/clinicalbert/biobert_pretrain_output_all_notes_150000',
        'base_uncased': 'bert-base-uncased',
        'base_cased': 'bert-base-cased'
    }

    ##################################################################################################
    ################################### SETUP DATA, DEVICE, MODEL ####################################
    ##################################################################################################
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device('cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu')
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device('cuda', args.local_rank)
        n_gpu = 1
        # Initialize the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
            device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                args.gradient_accumulation_steps))
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: {}".format(task_name))
    processor = processors[task_name]()
    output_mode = output_modes[task_name]
    label_list = processor.get_labels(output_mode)
    num_labels = len(label_list)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = (
                num_train_optimization_steps // torch.distributed.get_world_size())

    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))
    model = BertForSequenceClassification.from_pretrained(
        args.bert_model, cache_dir=cache_dir, num_labels=num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        # model = torch.nn.DataParallel(model)
        model = DataParallelModel(model)

    ##################################################################################################
    ########################################### OPTIMIZER ############################################
    ##################################################################################################
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        if args.discriminative_finetuning:
            # ULMFiT-style discriminative fine-tuning: lower encoder layers get smaller
            # learning rates, decaying by a factor of 2.6 per group of two layers.
            group1 = ['layer.0', 'layer.1.']
            group2 = ['layer.2', 'layer.3']
            group3 = ['layer.4', 'layer.5']
            group4 = ['layer.6', 'layer.7']
            group5 = ['layer.8', 'layer.9']
            group6 = ['layer.10', 'layer.11']
            group_all = ['layer.0', 'layer.1.', 'layer.2', 'layer.3', 'layer.4', 'layer.5',
                         'layer.6', 'layer.7', 'layer.8', 'layer.9', 'layer.10', 'layer.11']
            optimizer_grouped_parameters = [
                {'params': [p for n, p in param_optimizer
                            if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],
                 'weight_decay': 0.01},
                {'params': [p for n, p in param_optimizer
                            if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],
                 'weight_decay': 0.01, 'lr': args.learning_rate / 2.6**5},
                {'params': [p for n, p in param_optimizer
                            if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],
                 'weight_decay': 0.01, 'lr': args.learning_rate / 2.6**4},
                {'params': [p for n, p in param_optimizer
                            if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],
                 'weight_decay': 0.01, 'lr': args.learning_rate / 2.6**3},
                {'params': [p for n, p in param_optimizer
                            if not any(nd in n for nd in no_decay) and any(nd in n for nd in group4)],
                 'weight_decay': 0.01, 'lr': args.learning_rate / 2.6**2},
                {'params': [p for n, p in param_optimizer
                            if not any(nd in n for nd in no_decay) and any(nd in n for nd in group5)],
                 'weight_decay': 0.01, 'lr': args.learning_rate / 2.6},
                {'params': [p for n, p in param_optimizer
                            if not any(nd in n for nd in no_decay) and any(nd in n for nd in group6)],
                 'weight_decay': 0.01, 'lr': args.learning_rate},
                {'params': [p for n, p in param_optimizer
                            if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],
                 'weight_decay': 0.0},
                {'params': [p for n, p in param_optimizer
                            if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],
                 'weight_decay': 0.0, 'lr': args.learning_rate / 2.6**5},
                {'params': [p for n, p in param_optimizer
                            if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],
                 'weight_decay': 0.0, 'lr': args.learning_rate / 2.6**4},
                {'params': [p for n, p in param_optimizer
                            if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],
                 'weight_decay': 0.0, 'lr': args.learning_rate / 2.6**3},
                {'params': [p for n, p in param_optimizer
                            if any(nd in n for nd in no_decay) and any(nd in n for nd in group4)],
                 'weight_decay': 0.0, 'lr': args.learning_rate / 2.6**2},
                {'params': [p for n, p in param_optimizer
                            if any(nd in n for nd in no_decay) and any(nd in n for nd in group5)],
                 'weight_decay': 0.0, 'lr': args.learning_rate / 2.6},
                {'params': [p for n, p in param_optimizer
                            if any(nd in n for nd in no_decay) and any(nd in n for nd in group6)],
                 'weight_decay': 0.0, 'lr': args.learning_rate},
            ]
        else:
            optimizer_grouped_parameters = [
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                 'weight_decay': 0.01},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                 'weight_decay': 0.0},
            ]

        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )
            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
                                                 t_total=num_train_optimization_steps)
        else:
            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=num_train_optimization_steps)

    ##################################################################################################
    ############################################# TRAIN ##############################################
    ##################################################################################################
    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples, label_list,
                                                      args.max_seq_length, tokenizer, output_mode)
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_examples))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length, tokenizer, output_mode)
        logger.info("***** Running evaluation *****")
        logger.info(" Num examples = %d", len(eval_examples))
        logger.info(" Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)
        all_pids = np.array([f.pid for f in eval_features])
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size,
                                     drop_last=True)

    model.train()
    epoch_metric = {}
    for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch

            # define a new function to compute loss values for both output_modes
            logits = model(input_ids, segment_ids, input_mask, labels=None)
            if output_mode == "classification":
                loss_fct = CrossEntropyLoss()
                loss_fct = DataParallelCriterion(loss_fct)
                logits = [logits[i].view(-1, num_labels) for i in range(len(logits))]
                loss = loss_fct(logits, label_ids.view(-1))
            elif output_mode == "regression":
                loss_fct = MSELoss()
                loss_fct = DataParallelCriterion(loss_fct)
                logits = [logits[i].view(-1) for i in range(len(logits))]
                loss = loss_fct(logits, label_ids.view(-1))

            if n_gpu > 1:
                loss = loss.mean()  # average on multi-gpu
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()

            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    # modify lr with the special warm up BERT uses;
                    # if args.fp16 is False, BertAdam handles this automatically
                    lr_this_step = args.learning_rate * warmup_linear.get_lr(
                        global_step, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

        # Evaluate on the dev set at the end of every epoch
        with torch.no_grad():
            model.eval()
            eval_loss = 0
            nb_eval_steps = 0
            preds = []
            i = 0
            for input_ids, input_mask, segment_ids, label_ids in tqdm(
                    eval_dataloader, desc="Evaluating"):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    logits = model(input_ids, segment_ids, input_mask, labels=None)

                if output_mode == 'classification':
                    # loss_fct = CrossEntropyLoss()
                    # tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
                    loss_fct = CrossEntropyLoss()
                    loss_fct = DataParallelCriterion(loss_fct)
                    logits = [logits[i].view(-1, num_labels) for i in range(len(logits))]
                    tmp_eval_loss = loss_fct(logits, label_ids.view(-1))
                elif output_mode == 'regression':
                    # loss_fct = MSELoss()
                    # tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))
                    loss_fct = MSELoss()
                    loss_fct = DataParallelCriterion(loss_fct)
                    logits = [logits[i].view(-1) for i in range(len(logits))]
                    tmp_eval_loss = loss_fct(logits, label_ids.view(-1))

                eval_loss += tmp_eval_loss.mean().item()
                nb_eval_steps += 1
                logits = parallel.gather(logits, target_device='cuda:0')
                if len(preds) == 0:
                    preds.append(logits.detach().cpu().numpy())
                else:
                    preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)

            eval_loss = eval_loss / nb_eval_steps
            preds = preds[0]
            if output_mode == 'classification':
                preds = np.argmax(preds, axis=1)
            elif output_mode == 'regression':
                preds = np.squeeze(preds)
            all_label_ids = all_label_ids[:preds.shape[0]]
            all_pids = all_pids[:preds.shape[0]]
            errors = generate_errors(preds, all_label_ids.numpy(), all_pids)
            result = compute_metrics(task_name, preds, all_label_ids.numpy())
            loss = tr_loss / global_step if args.do_train else None

            result['eval_loss'] = eval_loss
            result['global_step'] = global_step
            result['loss'] = loss

            logger.info('***** Eval Results *****')
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
            epoch_metric[epoch] = result['pearson'] if output_mode == 'regression' else result['acc']

        # put the model back into training mode for the next epoch
        model.train()

    output_eval_file = os.path.join(args.output_dir, 'eval_results.txt')
    with open(output_eval_file, 'w') as writer:
        logger.info('***** Eval Results *****')
        # for key in sorted(result.keys()):
        #     logger.info(" %s = %s", key, str(result[key]))
        #     writer.write("%s = %s\n" % (key, str(result[key])))
        # writer.write("{} {}\n".format("epoch", "pearson"))
        for key in sorted(epoch_metric.keys()):
            writer.write("{}\t{}\t{}\t{}\n".format(key, str(epoch_metric[key]),
                                                   args.learning_rate,
                                                   args.train_batch_size))
    errors.to_csv('errors.txt', sep='\t', index=False)

    ##################################################################################################
    ########################################## SAVE & RELOAD #########################################
    ##################################################################################################
    if args.do_train:
        # Save a trained model, config, and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # only save the model itself
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Reload the fine-tuned model and tokenizer
        model = BertForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels)
        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
    else:
        model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
    model.to(device)
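
# The discriminative fine-tuning groups in main() hard-code a ULMFiT-style decay factor
# of 2.6 per pair of encoder layers. The hypothetical sketch below reproduces the same
# per-group learning rates more compactly; it is illustrative only and not used above.
def discriminative_lrs_sketch(base_lr, n_groups=6, decay=2.6):
    # Group 0 covers layers 0-1 (smallest lr); group n_groups-1 covers layers 10-11 (base lr).
    return [base_lr / decay ** (n_groups - 1 - g) for g in range(n_groups)]

# Example: discriminative_lrs_sketch(2e-5) ->
#   [2e-5/2.6**5, 2e-5/2.6**4, 2e-5/2.6**3, 2e-5/2.6**2, 2e-5/2.6, 2e-5]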
def do_eval(model, logger, output_dir, device, tr_loss, nb_tr_steps, global_step, processor,
            label_list, tokenizer, eval_dataloader, error_analysis_dict, output_mode, i, task):
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    preds = []
    all_label_ids = []
    all_input_ids = []

    for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc='Evaluating'):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)

        with torch.no_grad():
            logits = model(input_ids, segment_ids, input_mask, i, output_mode)

        if output_mode == 'classification':
            loss_fct = CrossEntropyLoss()
            loss_fct = DataParallelCriterion(loss_fct)
            # j is used here to avoid shadowing the task index `i` passed into do_eval
            logits = [logits[j].view(-1, logits[0].size(1)) for j in range(len(logits))]
            tmp_eval_loss = loss_fct(logits, label_ids.view(-1))
        else:
            loss_fct = MSELoss()
            loss_fct = DataParallelCriterion(loss_fct)
            logits = [logits[j].view(-1) for j in range(len(logits))]
            tmp_eval_loss = loss_fct(logits, label_ids.view(-1))

        logits = gather(logits, target_device='cuda:0')
        if len(preds) == 0:
            preds.append(logits.detach().cpu().numpy())
        else:
            preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)
        if len(all_label_ids) == 0:
            all_label_ids.append(label_ids.detach().cpu().numpy())
        else:
            all_label_ids[0] = np.append(all_label_ids[0],
                                         label_ids.detach().cpu().numpy(),
                                         axis=0)
        if len(all_input_ids) == 0:
            all_input_ids.append(input_ids.detach().cpu().numpy())
        else:
            all_input_ids[0] = np.append(all_input_ids[0],
                                         input_ids.detach().cpu().numpy(),
                                         axis=0)

        eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    preds = preds[0]
    all_label_ids = all_label_ids[0]
    all_input_ids = all_input_ids[0]
    all_pids = error_analysis_dict['pids'][:len(preds)]
    all_text_a = error_analysis_dict['text_a'][:len(preds)]
    all_text_b = error_analysis_dict['text_b'][:len(preds)]
    all_textpair_tokenized = [
        ' '.join(tokenizer.convert_ids_to_tokens(ids)) for ids in all_input_ids
    ]
    assert (len(preds) == len(all_label_ids) == len(all_input_ids) == len(all_pids)
            == len(all_text_a) == len(all_text_b) == len(all_textpair_tokenized))
    all_textpair_tokenized = [tp.replace('[PAD]', '').strip() for tp in all_textpair_tokenized]

    if output_mode == 'classification':
        preds = np.argmax(preds, axis=1)
        preds_rounded = preds
        eval_accuracy = accuracy(preds, all_label_ids)
    else:
        preds = np.squeeze(preds)
        preds_rounded = np.round(preds * 4) / 4  # snap similarity scores to the nearest 0.25
        eval_accuracy = pearsonr(preds, all_label_ids)[0]

    errors = generate_errors(preds, preds_rounded, all_label_ids, all_pids, all_text_a,
                             all_text_b, all_textpair_tokenized)
    if i == 0:
        errors.to_csv(os.path.join(output_dir, 'error_table.csv'), sep=',', index=False)

    result = {
        'task name': task,
        'eval_loss': eval_loss,
        'eval_accuracy': eval_accuracy,
        'global_step': global_step,
        'loss': tr_loss / nb_tr_steps
    }
    output_eval_file = os.path.join(output_dir, 'eval_results.txt')
    with open(output_eval_file, 'w') as writer:
        logger.info('******** Eval Results *****')
        for key in sorted(result.keys()):
            logger.info(' %s = %s', key, str(result[key]))
            # writer.write('{} = {}\n'.format(key, str(result[key])))
    return eval_accuracy
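
# For the regression tasks in do_eval(), predictions are snapped to the nearest 0.25 via
# np.round(preds * 4) / 4 before error analysis (STS-style similarity scores on a 0-5
# scale). A small worked example of that rounding:
#
#   np.round(np.array([2.12, 3.61, 4.88]) * 4) / 4  ->  array([2.0, 3.5, 5.0])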