def random_sample_item(self, run_infer=False, get_random=True): sample_item = FunsdDataset(self.args, self.tokenizer, self.labels, self.pad_token_label_id, mode=self.mode).__getitem__( np.random.randint(30)) return sample_item
def main(): # noqa C901 parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.", ) parser.add_argument( "--model_type", default=None, type=str, required=True, help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), ) parser.add_argument( "--model_name_or_path", default=None, type=str, required=True, # help="Path to pre-trained model or shortcut name selected in the list: " # + ", ".join(ALL_MODELS), ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.", ) ## Other parameters parser.add_argument( "--labels", default="", type=str, help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.", ) parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name", ) parser.add_argument( "--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name", ) parser.add_argument( "--cache_dir", default="", type=str, help="Where do you want to store the pre-trained models downloaded from s3", ) parser.add_argument( "--max_seq_length", default=512, type=int, help="The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.", ) parser.add_argument( "--do_train", action="store_true", help="Whether to run training." ) parser.add_argument( "--do_eval", action="store_true", help="Whether to run eval on the dev set." ) parser.add_argument( "--do_predict", action="store_true", help="Whether to run predictions on the test set.", ) parser.add_argument( "--evaluate_during_training", action="store_true", help="Whether to run evaluation during training at each logging step.", ) parser.add_argument( "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.", ) parser.add_argument( "--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.", ) parser.add_argument( "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.", ) parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.", ) parser.add_argument( "--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.", ) parser.add_argument( "--weight_decay", default=0.0, type=float, help="Weight decay if we apply some." ) parser.add_argument( "--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer." ) parser.add_argument( "--max_grad_norm", default=1.0, type=float, help="Max gradient norm." ) parser.add_argument( "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.", ) parser.add_argument( "--max_steps", default=-1, type=int, help="If > 0: set total number of training steps to perform. Override num_train_epochs.", ) parser.add_argument( "--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps." ) parser.add_argument( "--logging_steps", type=int, default=50, help="Log every X updates steps." ) parser.add_argument( "--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.", ) parser.add_argument( "--eval_all_checkpoints", action="store_true", help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", ) parser.add_argument( "--no_cuda", action="store_true", help="Avoid using CUDA when available" ) parser.add_argument( "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory", ) parser.add_argument( "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets", ) parser.add_argument( "--seed", type=int, default=42, help="random seed for initialization" ) parser.add_argument( "--fp16", action="store_true", help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", ) parser.add_argument( "--fp16_opt_level", type=str, default="O1", help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html", ) parser.add_argument( "--local_rank", type=int, default=-1, help="For distributed training: local_rank", ) parser.add_argument( "--server_ip", type=str, default="", help="For distant debugging." ) parser.add_argument( "--server_port", type=str, default="", help="For distant debugging." ) args = parser.parse_args() if ( os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train ): if not args.overwrite_output_dir: raise ValueError( "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format( args.output_dir ) ) else: if args.local_rank in [-1, 0]: shutil.rmtree(args.output_dir) if not os.path.exists(args.output_dir) and (args.do_eval or args.do_predict): raise ValueError( "Output directory ({}) does not exist. Please train and save the model before inference stage.".format( args.output_dir ) ) if ( not os.path.exists(args.output_dir) and args.do_train and args.local_rank in [-1, 0] ): os.makedirs(args.output_dir) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach( address=(args.server_ip, args.server_port), redirect_output=True ) ptvsd.wait_for_attach() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu" ) args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging logging.basicConfig( filename=os.path.join(args.output_dir, "train.log") if args.local_rank in [-1, 0] else None, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16, ) # Set seed set_seed(args) labels = get_labels(args.labels) num_labels = len(labels) # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later pad_token_label_id = CrossEntropyLoss().ignore_index # Load pretrained model and tokenizer if args.local_rank not in [-1, 0]: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] config = config_class.from_pretrained( args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, cache_dir=args.cache_dir if args.cache_dir else None, ) tokenizer = tokenizer_class.from_pretrained( args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case, cache_dir=args.cache_dir if args.cache_dir else None, ) model = model_class.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, cache_dir=args.cache_dir if args.cache_dir else None, ) if args.local_rank == 0: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab model.to(args.device) logger.info("Training/evaluation parameters %s", args) # Training if args.do_train: train_dataset = FunsdDataset( args, tokenizer, labels, pad_token_label_id, mode="train" ) global_step, tr_loss = train( args, train_dataset, model, tokenizer, labels, pad_token_label_id ) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Create output directory if needed if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model torch.save(args, os.path.join(args.output_dir, "training_args.bin")) # Evaluation results = {} if args.do_eval and args.local_rank in [-1, 0]: tokenizer = tokenizer_class.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case ) checkpoints = [args.output_dir] if args.eval_all_checkpoints: checkpoints = list( os.path.dirname(c) for c in sorted( glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True) ) ) logging.getLogger("pytorch_transformers.modeling_utils").setLevel( logging.WARN ) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else "" model = model_class.from_pretrained(checkpoint) model.to(args.device) result, _ = evaluate( args, model, tokenizer, labels, pad_token_label_id, mode="test", prefix=global_step, ) if global_step: result = {"{}_{}".format(global_step, k): v for k, v in result.items()} results.update(result) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: for key in sorted(results.keys()): writer.write("{} = {}\n".format(key, str(results[key]))) if args.do_predict and args.local_rank in [-1, 0]: tokenizer = tokenizer_class.from_pretrained( args.model_name_or_path, do_lower_case=args.do_lower_case ) model = model_class.from_pretrained(args.output_dir) model.to(args.device) result, predictions = evaluate( args, model, tokenizer, labels, pad_token_label_id, mode="test" ) # Save results output_test_results_file = os.path.join(args.output_dir, "test_results.txt") with open(output_test_results_file, "w") as writer: for key in sorted(result.keys()): writer.write("{} = {}\n".format(key, str(result[key]))) # Save predictions output_test_predictions_file = os.path.join( args.output_dir, "test_predictions.txt" ) with open(output_test_predictions_file, "w", encoding="utf8") as writer: with open( os.path.join(args.data_dir, "test.txt"), "r", encoding="utf8" ) as f: example_id = 0 for line in f: if line.startswith("-DOCSTART-") or line == "" or line == "\n": writer.write(line) if not predictions[example_id]: example_id += 1 elif predictions[example_id]: output_line = ( line.split()[0] + " " + predictions[example_id].pop(0) + "\n" ) writer.write(output_line) else: logger.warning( "Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0], ) return results
def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""): eval_dataset = FunsdDataset(args, tokenizer, labels, pad_token_label_id, mode=mode) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) eval_sampler = SequentialSampler(eval_dataset) eval_dataloader = DataLoader( eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=None, ) # Eval! logger.info("***** Running evaluation %s *****", prefix) logger.info(" Num examples = %d", len(eval_dataset)) logger.info(" Batch size = %d", args.eval_batch_size) eval_loss = 0.0 nb_eval_steps = 0 preds = None out_label_ids = None model.eval() for batch in tqdm(eval_dataloader, desc="Evaluating"): with torch.no_grad(): inputs = { "input_ids": batch[0].to(args.device), "attention_mask": batch[1].to(args.device), "labels": batch[3].to(args.device), } if args.model_type in ["layoutlm"]: inputs["bbox"] = batch[4].to(args.device) inputs["token_type_ids"] = ( batch[2].to(args.device) if args.model_type in ["bert", "layoutlm"] else None ) # RoBERTa don"t use segment_ids outputs = model(**inputs) tmp_eval_loss, logits = outputs[:2] if args.n_gpu > 1: tmp_eval_loss = ( tmp_eval_loss.mean() ) # mean() to average on multi-gpu parallel evaluating eval_loss += tmp_eval_loss.item() nb_eval_steps += 1 if preds is None: preds = logits.detach().cpu().numpy() out_label_ids = inputs["labels"].detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) out_label_ids = np.append( out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0 ) eval_loss = eval_loss / nb_eval_steps preds = np.argmax(preds, axis=2) label_map = {i: label for i, label in enumerate(labels)} out_label_list = [[] for _ in range(out_label_ids.shape[0])] preds_list = [[] for _ in range(out_label_ids.shape[0])] for i in range(out_label_ids.shape[0]): for j in range(out_label_ids.shape[1]): if out_label_ids[i, j] != pad_token_label_id: out_label_list[i].append(label_map[out_label_ids[i][j]]) preds_list[i].append(label_map[preds[i][j]]) results = { "loss": eval_loss, "precision": precision_score(out_label_list, preds_list), "recall": recall_score(out_label_list, preds_list), "f1": f1_score(out_label_list, preds_list), } report = classification_report(out_label_list, preds_list) logger.info("\n" + report) logger.info("***** Eval results %s *****", prefix) for key in sorted(results.keys()): logger.info(" %s = %s", key, str(results[key])) return results, preds_list
def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="test", prefix=""): output_lst = [] eval_dataset = FunsdDataset(args, tokenizer, labels, pad_token_label_id, mode=mode) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) eval_sampler = SequentialSampler(eval_dataset) eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=None) logger.info("***** Running evaluation %s *****", prefix) logger.info(" Num examples = %d", len(eval_dataset)) logger.info(" Batch size = %d", args.eval_batch_size) preds = None out_label_ids = None model.eval() for batch in tqdm(eval_dataloader, desc="Running inference"): with torch.no_grad(): inputs = { "input_ids": batch[0].to(args.device), "attention_mask": batch[1].to(args.device), "labels": batch[3].to(args.device), } if args.model_type in ['layoutlm']: inputs["bbox"] = batch[4].to(args.device) inputs["token_type_ids"] = (batch[2].to(args.device) if args.model_type in ['bert', 'layoutlm'] else None) outputs = model(**inputs) output_lst.append(outputs) temp_eval_loss, logits = outputs[:2] # torch.cuda.empty_cache() NOTE : UNCOMMENT IF GPU OVERLOADS if (preds is None): preds = logits.detach().cpu().numpy() out_label_ids = inputs["labels"].detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0) preds = np.argmax(preds, axis=2) label_map = {i: label for i, label in enumerate(labels)} out_label_list = [[] for _ in range(out_label_ids.shape[0])] preds_list = [[] for _ in range(out_label_ids.shape[0])] print(out_label_ids) print(label_map) print( "===================================PREDICTION DTYPE======================================" ) print("PREDICTION SHAPE : ", type(preds)) print("PREDICTION ARRAY : ", preds) print("PREDICTION SHAPE : ", preds.shape) print( "===================================PREDICTION DTYPE======================================" ) print() for i in range(out_label_ids.shape[0]): for j in range(out_label_ids.shape[1]): if out_label_ids[i, j] != pad_token_label_id: try: out_label_list[i].append(label_map[out_label_ids[i][j]]) preds_list[i].append(label_map[preds[i][j]]) except Exception as e: return { "prediction_list": preds_list, "output_label_list": out_label_list } return {"prediction_list": preds_list, "output_label_list": out_label_list}
def return_sample_output(self, run_infer=False, get_random=True): sample_dataset = FunsdDataset(self.args, self.tokenizer, self.labels, self.pad_token_label_id, mode=self.mode).__getitem__( np.random.randint(30)) print("dataset size : ", len(next(iter(sample_dataset)))) if (get_random): if ( run_infer ): #TODO Remove this run_infer condition and indent code below sample_item = next(iter(DataLoader([sample_dataset]))) self.model.eval() with torch.no_grad(): inputs = { "input_ids": sample_item[0].to("cuda"), "attention_mask": sample_item[1].to("cuda"), "token_type_ids": sample_item[2].to("cuda"), "bbox": sample_item[4].to("cuda") } print("INPUT ID SHAPE : ", inputs["input_ids"].shape) print("ATTENTION MASK SHAPE : ", inputs["attention_mask"].shape) print("TOKEN SHAPE : ", inputs['token_type_ids'].shape) print("BBOX SHAPE : ", inputs['bbox'].shape) print(inputs) model_output = self.model(**inputs) return {"model_output": model_output} else: dataset_sampler = SequentialSampler(sample_dataset) sample_dataset = FunsdDataset(self.args, self.tokenizer, self.labels, self.pad_token_label_id, mode=self.mode) sample_dataloader = DataLoader(sample_dataset, sampler=dataset_sampler, collate_fn=None) for sample_item in sample_dataloader: inputs = { "input_ids": sample_item[0].to("cuda"), "attention_mask": sample_item[1].to("cuda"), "token_type_ids": sample_item[2].to("cuda"), "bbox": sample_item[4].to("cuda") } print("INPUT ID SHAPE : ", inputs["input_ids"].shape) print("ATTENTION MASK SHAPE : ", inputs["attention_mask"].shape) print("TOKEN SHAPE : ", inputs['token_type_ids'].shape) print("BBOX SHAPE : ", inputs['bbox'].shape) model_output = self.model(**inputs) print({"model_output": model_output}) print("model_output_shape:", model_output[0].shape) logits = model_output[-1] preds = logits.detach().cpu().numpy() out_label_ids = sample_item[3].to( "cuda").detach().cpu().numpy() preds = np.argmax(preds, axis=2) label_map = {i: label for i, label in enumerate(self.labels)} out_label_list = [[] for _ in range(out_label_ids.shape[0])] preds_list = [[] for _ in range(out_label_ids.shape[0])] print(out_label_list) print(preds_list) # print(out_label_ids) # print(label_map) print("=====================PREDICTIONS====================") print(preds.shape) print("=====================PREDICTIONS====================") print("label list: ", out_label_ids) for i in range(out_label_ids.shape[0]): for j in range(out_label_ids.shape[1]): if out_label_ids[i, j] != self.pad_token_label_id: out_label_list[i].append( label_map[out_label_ids[i][j]]) preds_list[i].append(label_map[preds[i][j]]) # print("EXCEPTION!!!!") pred_output = { "prediction_list": preds_list, "output_label_list": out_label_list } if (self.args.text_output): with open(f"{int(time())}_prediction_output.txt", "w+", encoding="utf-8") as output_file: output_file.write(str(pred_output)) #NOTE: Convert model_output to readable output annotations from the model - REFER : run_model_sample.py (lines 100-140) return pred_output
def main(args): # noqa C901 if (os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train): if not args.overwrite_output_dir: raise ValueError( "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome." .format(args.output_dir)) else: if args.local_rank in [-1, 0]: shutil.rmtree(args.output_dir) if not os.path.exists(args.output_dir) and (args.do_eval or args.do_predict): raise ValueError( "Output directory ({}) does not exist. Please train and save the model before inference stage." .format(args.output_dir)) if (not os.path.exists(args.output_dir) and args.do_train and args.local_rank in [-1, 0]): os.makedirs(args.output_dir) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging logging.basicConfig( filename=os.path.join(args.output_dir, "train.log") if args.local_rank in [-1, 0] else None, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16, ) # Set seed set_seed(args) labels = get_labels(args.labels) num_labels = len(labels) # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later pad_token_label_id = CrossEntropyLoss().ignore_index # Load pretrained model and tokenizer if args.local_rank not in [-1, 0]: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] config = config_class.from_pretrained( args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, cache_dir=args.cache_dir if args.cache_dir else None, ) tokenizer = tokenizer_class.from_pretrained( args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case, cache_dir=args.cache_dir if args.cache_dir else None, ) model = model_class.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, cache_dir=args.cache_dir if args.cache_dir else None, ) if args.local_rank == 0: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab model.to(args.device) logger.info("Training/evaluation parameters %s", args) # Training if args.do_train: train_dataset = FunsdDataset(args, tokenizer, labels, pad_token_label_id, mode="train") global_step, tr_loss = train(args, train_dataset, model, tokenizer, labels, pad_token_label_id) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Create output directory if needed if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` model_to_save = (model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model torch.save(args, os.path.join(args.output_dir, "training_args.bin")) # Evaluation results = {} if args.do_eval and args.local_rank in [-1, 0]: tokenizer = tokenizer_class.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) checkpoints = [args.output_dir] if args.eval_all_checkpoints: checkpoints = list( os.path.dirname(c) for c in sorted( glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))) logging.getLogger("pytorch_transformers.modeling_utils").setLevel( logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: global_step = checkpoint.split( "-")[-1] if len(checkpoints) > 1 else "" model = model_class.from_pretrained(checkpoint) model.to(args.device) result, _ = evaluate( args, model, tokenizer, labels, pad_token_label_id, mode="test", prefix=global_step, ) if global_step: result = { "{}_{}".format(global_step, k): v for k, v in result.items() } results.update(result) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: for key in sorted(results.keys()): writer.write("{} = {}\n".format(key, str(results[key]))) if args.do_predict and args.local_rank in [-1, 0]: tokenizer = tokenizer_class.from_pretrained( args.model_name_or_path, do_lower_case=args.do_lower_case) model = model_class.from_pretrained(args.output_dir) model.to(args.device) result, predictions = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="test") # Save results output_test_results_file = os.path.join(args.output_dir, "test_results.txt") with open(output_test_results_file, "w") as writer: for key in sorted(result.keys()): writer.write("{} = {}\n".format(key, str(result[key]))) # Save predictions output_test_predictions_file = os.path.join(args.output_dir, "test_predictions.txt") with open(output_test_predictions_file, "w", encoding="utf8") as writer: with open(os.path.join(args.data_dir, "test.txt"), "r", encoding="utf8") as f: example_id = 0 for line in f: if line.startswith( "-DOCSTART-") or line == "" or line == "\n": writer.write(line) if not predictions[example_id]: example_id += 1 elif predictions[example_id]: output_line = (line.split()[0] + " " + predictions[example_id].pop(0) + "\n") writer.write(output_line) else: logger.warning( "Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0], ) return results