def after_pred(self):
    """Generate SquadResults.

    For each entry of ``self.example_indices`` the matching feature is looked
    up in ``self.features``, the per-example slice of every prediction output
    is passed through ``to_detach`` and converted with ``Self.numpy()``, and
    the outcome is stored on ``self.learn.pred`` as a ``SquadResult``.
    """
    for row, feature_idx in enumerate(self.example_indices):
        feature = self.features[feature_idx.item()]
        uid = int(feature.unique_id)

        # NOTE(review): `self.pred` is re-read on every iteration on purpose;
        # if it resolves to `self.learn.pred` (assigned below), later
        # iterations see the value written by earlier ones -- confirm that
        # this callback is only used with one example per batch.
        sliced = [self.pred[key][row] for key in self.pred]
        sliced = apply(Self.numpy(), to_detach(sliced))

        if not isinstance(self.learn.model, self.xmodel_instances):
            # "Simple" models only produce start and end logits.
            start_logits, end_logits = sliced
            self.learn.pred = SquadResult(uid, start_logits, end_logits)
            continue

        # Some models like the ones in `self.xmodel_instances` use 5
        # arguments for their predictions.
        start_logits = sliced[0]
        start_top_index = sliced[1]
        end_logits = sliced[2]
        end_top_index = sliced[3]
        cls_logits = sliced[4]
        self.learn.pred = SquadResult(
            uid,
            start_logits,
            end_logits,
            start_top_index=start_top_index,
            end_top_index=end_top_index,
            cls_logits=cls_logits,
        )
def evaluate(model, tokenizer):
    """Run the model over the evaluation set and score it with SQuAD metrics.

    The evaluation data is obtained from ``load_and_cache_examples`` and fed
    to ``model`` in sequential batches of 8. Each per-feature output is turned
    into a ``SquadResult``; the results are post-processed by
    ``compute_predictions_logits`` (which also writes ``predictions.json`` and
    ``nbest_predictions.json``) and scored by ``squad_evaluate``.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=...,
            token_type_ids=...)`` whose outputs unpack into start and end
            logits.
        tokenizer: Tokenizer forwarded to ``load_and_cache_examples``.

    Returns:
        Whatever ``squad_evaluate`` returns for the computed predictions.
    """
    # Evaluate
    dataset, examples, features = load_and_cache_examples(tokenizer, is_training=False)

    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=8)

    # Eval!
    print("***** Running evaluation *****")
    print(" Num examples = ", len(dataset))

    all_results = []
    start_time = timeit.default_timer()

    # The mode never changes between batches, so switch to eval mode once.
    model.eval()
    for batch in tqdm_notebook(eval_dataloader, desc="Evaluating"):
        # `device` is a module-level name defined outside this function.
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            example_indices = batch[3]
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [output[i].detach().cpu().tolist() for output in outputs]
            start_logits, end_logits = output
            all_results.append(SquadResult(unique_id, start_logits, end_logits))

    evalTime = timeit.default_timer() - start_time
    # print() does not interpolate printf-style arguments the way logging
    # does, so the template has to be formatted explicitly.
    print(
        " Evaluation done in total %f secs (%f sec per example)"
        % (evalTime, evalTime / len(dataset))
    )

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        n_best_size=20,
        max_answer_length=30,
        do_lower_case=False,
        output_prediction_file="predictions.json",
        output_nbest_file="nbest_predictions.json",
        output_null_log_odds_file=None,
        verbose_logging=False,
        version_2_with_negative=False,
        null_score_diff_threshold=0.0,
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
def evaluate_full_dataset(self, data_loader: DataLoader):
    """Score the model on every batch of ``data_loader`` with SQuAD metrics.

    Each batch is moved to CUDA and run through ``self.model``; the outputs
    are converted into ``SquadResult`` objects keyed by the ``unique_id`` of
    the matching entry in ``self.validation_features``. The "task" entry of
    the data config selects SQuAD 1.1 or 2.0 post-processing.

    Raises:
        NameError: If the configured task is neither "SQuAD1.1" nor
            "SQuAD2.0".

    Returns:
        The value returned by ``squad_evaluate``.
    """
    collected = []
    for batch in data_loader:
        model_inputs = {
            "input_ids": batch[0].cuda(),
            "attention_mask": batch[1].cuda(),
            "token_type_ids": batch[2].cuda(),
        }
        feature_indices = batch[3]
        model_outputs = self.model(**model_inputs)

        for row, feature_index in enumerate(feature_indices):
            feature = self.validation_features[feature_index.item()]
            uid = int(feature.unique_id)
            per_feature = [
                tensor[row].detach().cpu().tolist() for tensor in model_outputs
            ]
            start_logits, end_logits = per_feature
            collected.append(SquadResult(uid, start_logits, end_logits))

    task = self.context.get_data_config().get("task")
    version_2_with_negative = task == "SQuAD2.0"
    if not version_2_with_negative and task != "SQuAD1.1":
        raise NameError(f"Incompatible dataset '{task}' detected")

    # TODO: Make verbose logging configurable
    verbose_logging = False

    # No prediction / n-best / null-odds files are written.
    predictions = compute_predictions_logits(
        self.validation_examples,
        self.validation_features,
        collected,
        self.context.get_hparam("n_best_size"),
        self.context.get_hparam("max_answer_length"),
        self.context.get_hparam("do_lower_case"),
        None,
        None,
        None,
        verbose_logging,
        version_2_with_negative,
        self.context.get_hparam("null_score_diff_threshold"),
        self.tokenizer,
    )
    return squad_evaluate(self.validation_examples, predictions)
def test_step(self, batch, batch_nb):
    """Compute test metrics for a single batch.

    The first three batch entries are forwarded to the model as
    ``input_ids``, ``attention_mask`` and ``token_type_ids``; the fourth holds
    the indices used to look up both features and examples. Predictions for
    the batch are produced by ``compute_predictions_logits`` and passed,
    through ``get_metrics_input``, to ``self.calculate_metrics`` with
    ``stage='test'``.

    Returns:
        A new dict containing the entries returned by
        ``self.calculate_metrics``.
    """
    # input_ids, attention_mask, token_type_ids, start_positions, end_positions = batch
    model_inputs = {
        "input_ids": batch[0],
        "attention_mask": batch[1],
        "token_type_ids": batch[2],
    }
    outputs = self.forward(**model_inputs)
    example_indices = batch[3]

    examples = self.test_examples()
    features = self.test_features()

    # NOTE(review): the same indices address `examples` and `features`;
    # confirm the two sequences are aligned one-to-one.
    batch_examples = [examples[idx.item()] for idx in example_indices]

    batch_results = []
    batch_features = []
    for row, idx in enumerate(example_indices):
        feature = features[idx.item()]
        uid = int(feature.unique_id)
        start_logits, end_logits = [to_list(tensor[row]) for tensor in outputs]
        batch_results.append(SquadResult(uid, start_logits, end_logits))
        batch_features.append(feature)

    predictions = compute_predictions_logits(
        examples,
        batch_features,
        batch_results,
        do_lower_case=True,
        version_2_with_negative=True,
        tokenizer=self.tokenizer,
    )
    answers_data, predictions = get_metrics_input(batch_examples, predictions)
    metrics = self.calculate_metrics(predictions, answers_data, stage='test')
    return {**metrics}
def evaluate(self, model, dataset, examples, features):
    """Evaluate ``model`` on ``dataset`` and time the forward passes.

    The data loader comes from ``self.get_dataloader_sampler``. When more
    than one GPU is configured and the model is not already wrapped, it is
    wrapped in ``torch.nn.DataParallel``. Only ``input_ids`` and
    ``attention_mask`` are fed to the model.

    Returns:
        A ``(results, eval_time)`` tuple: the ``squad_evaluate`` output and
        the elapsed evaluation time in seconds.
    """
    eval_batch_size, eval_dataloader = self.get_dataloader_sampler(dataset)

    # multi-gpu evaluate
    already_parallel = isinstance(model, torch.nn.DataParallel)
    if self.args_dict[N_GPU] > 1 and not already_parallel:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(self.global_step))
    logger.info(" Num examples = %d", len(dataset))
    logger.info(" Batch size = %d", self.args_dict[eval_batch_size])

    collected = []
    started = timeit.default_timer()
    model.eval()

    target_device_key = DEVICE
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(self.args_dict[target_device_key]) for t in batch)

        with torch.no_grad():
            model_inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
            example_indices = batch[3]
            outputs = model(**model_inputs)

        for row, example_index in enumerate(example_indices):
            feature = features[example_index.item()]
            uid = int(feature.unique_id)
            start_logits, end_logits = [to_list(tensor[row]) for tensor in outputs]
            collected.append(SquadResult(uid, start_logits, end_logits))

    eval_time = timeit.default_timer() - started
    logger.info(
        " Evaluation done in total %f secs (%f sec per example)",
        eval_time,
        eval_time / len(dataset),
    )

    # Compute predictions
    predictions = self.calcuate_predictions(collected, examples, features)

    # Compute the F1 and exact scores.
    return squad_evaluate(examples, predictions), eval_time
def qa_evaluate(lang, test_set, model_type, loader, bert_model, learner, save_dir):
    """Evaluate a QA learner on ``loader`` and score it with SQuAD metrics.

    Every batch is encoded by ``bert_model`` and passed to ``learner``; the
    mean loss of each batch and one ``SquadResult`` per example are
    collected. Only features whose ``unique_id`` was actually seen in the
    loader are kept for post-processing. Prediction files are written to
    ``<save_dir>/result/<lang>.predictions`` and
    ``<save_dir>/result/<lang>.nbest_predictions``.

    Args:
        lang: Label used in the names of the prediction files.
        test_set: Object exposing ``examples``, ``features`` and
            ``get_by_ids``.
        model_type: Name handed to ``AutoTokenizer.from_pretrained``.
        loader: Iterable of batches laid out as ``(input_ids,
            attention_mask, token_type_ids, labels, unique_ids, ...)``.
        bert_model: Encoder called with the first three batch entries.
        learner: Head called as ``learner(bert_output, labels=...,
            attention_mask=...)``; its output must expose ``loss``,
            ``start_logits`` and ``end_logits``.
        save_dir: Directory under which the ``result`` folder is created.

    Returns:
        A ``(losses, results)`` tuple: a tensor with one mean loss per batch
        and the ``squad_evaluate`` output converted to a plain ``dict``.
    """
    all_results, losses = [], []
    # A set gives O(1) membership tests when the features are filtered
    # below; with a list that filter was quadratic.
    seen_uids = set()

    examples = test_set.examples
    features = test_set.features

    for batch in loader:
        with torch.no_grad():
            input_ids, attention_mask, token_type_ids, labels, unique_ids = (
                batch[0],
                batch[1],
                batch[2],
                batch[3],
                batch[4],
            )
            bert_output = bert_model(input_ids, attention_mask, token_type_ids)
            outputs = learner(bert_output, labels=labels, attention_mask=attention_mask)
            losses.append(outputs.loss.mean().item())

        for i, uid in enumerate(unique_ids):
            unique_id = int(uid.item())
            start_logits = outputs.start_logits[i].detach().cpu().tolist()
            end_logits = outputs.end_logits[i].detach().cpu().tolist()
            all_results.append(SquadResult(unique_id, start_logits, end_logits))
            seen_uids.add(unique_id)

    save_dir = os.path.join(save_dir, "result")
    os.makedirs(save_dir, exist_ok=True)
    output_prediction_file = os.path.join(save_dir, f"{lang}.predictions")
    output_nbest_file = os.path.join(save_dir, f"{lang}.nbest_predictions")

    # Restrict post-processing to the features that were evaluated, and
    # collect their question ids without duplicates, in first-seen order.
    features = [f for f in features if f.unique_id in seen_uids]
    qas_ids = list(dict.fromkeys(f.qas_id for f in features))

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        n_best_size=20,
        max_answer_length=30,
        do_lower_case=False,
        output_prediction_file=output_prediction_file,
        output_nbest_file=output_nbest_file,
        output_null_log_odds_file=None,
        verbose_logging=True,
        version_2_with_negative=False,
        null_score_diff_threshold=-np.inf,
        tokenizer=AutoTokenizer.from_pretrained(model_type),
    )

    results = squad_evaluate(test_set.get_by_ids(qas_ids), predictions)
    return torch.tensor(losses), dict(results)
def get_squad_results(
    model,
    dataset: tf.data.Dataset,
    features: List[SquadFeatures],
    per_gpu_batch_size: int,
    num_batches: int,
    disable_tqdm: bool,
) -> List[SquadResult]:
    """Run ``model`` over ``dataset`` and collect one ``SquadResult`` per example.

    Args:
        model: Callable invoked as ``model(input_dict, training=False)``; its
            first two outputs are used as start and end logits.
        dataset: Batches whose first element is a dict with ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``feature_index`` and
            whose second element has ``start_positions``.
        features: Features indexed by each example's ``feature_index``.
        per_gpu_batch_size: Used for the progress-bar total and description.
        num_batches: When truthy, only this many batches are evaluated.
        disable_tqdm: Disables the progress bar when true.

    Returns:
        The collected results, in dataset order.
    """
    collected = []
    expected_steps = math.ceil(len(features) / per_gpu_batch_size)
    progress = tqdm.tqdm(total=expected_steps, disable=disable_tqdm)
    progress.set_description(f"Evaluating with batch size {per_gpu_batch_size}")

    if num_batches:
        dataset = dataset.take(num_batches)

    for batch in dataset:
        batch_inputs, batch_labels = batch[0], batch[1]
        outputs = model(
            {
                "input_ids": batch_inputs["input_ids"],
                "attention_mask": batch_inputs["attention_mask"],
                "token_type_ids": batch_inputs["token_type_ids"],
            },
            training=False,
        )
        start_logits, end_logits = outputs[0], outputs[1]

        # Size of this particular batch, taken from its labels.
        rows_in_batch = len(batch_labels["start_positions"])
        for row in range(rows_in_batch):
            feature_index = batch[0]["feature_index"][row].numpy().item()
            collected.append(
                SquadResult(
                    unique_id=int(features[feature_index].unique_id),
                    start_logits=start_logits[row].numpy().tolist(),
                    end_logits=end_logits[row].numpy().tolist(),
                )
            )
        # One tick per batch, matching the total computed above.
        progress.update(1)

    progress.close()
    return collected
def evaluate_full_dataset(self, data_loader: DataLoader):
    """Score the model on every batch of ``data_loader`` with SQuAD metrics.

    Batches are moved to CUDA, run through ``self.model`` and converted into
    ``SquadResult`` objects. Post-processing always lower-cases, logs
    verbosely and treats the data as having no unanswerable questions; no
    prediction files are written.

    Returns:
        The value returned by ``squad_evaluate``.
    """

    def _to_results(batch):
        """Run one batch through the model and build its SquadResults."""
        outputs = self.model(
            **{
                "input_ids": batch[0].cuda(),
                "attention_mask": batch[1].cuda(),
                "token_type_ids": batch[2].cuda(),
            }
        )
        batch_results = []
        for row, feature_index in enumerate(batch[3]):
            feature = self.validation_features[feature_index.item()]
            uid = int(feature.unique_id)
            start_logits, end_logits = [
                tensor[row].detach().cpu().tolist() for tensor in outputs
            ]
            batch_results.append(SquadResult(uid, start_logits, end_logits))
        return batch_results

    all_results = []
    for batch in data_loader:
        all_results.extend(_to_results(batch))

    predictions = compute_predictions_logits(
        self.validation_examples,
        self.validation_features,
        all_results,
        self.context.get_hparam("n_best_size"),
        self.context.get_hparam("max_answer_length"),
        True,   # do_lower_case
        None,   # output_prediction_file
        None,   # output_nbest_file
        None,   # output_null_log_odds_file
        True,   # verbose_logging
        False,  # version_2_with_negative
        self.context.get_hparam("null_score_diff_threshold"),
        self.tokenizer,
    )
    return squad_evaluate(self.validation_examples, predictions)
def find_answer(self, question, context, n_best_size=20, max_answer_length=30, full_sentence=False):
    """Answer ``question`` from ``context`` with the wrapped QA model.

    The question/context pair is converted into SQuAD features, run through
    ``self.model`` one feature at a time, and post-processed with
    ``compute_predictions_log_probs`` (model types "xlnet" and "xlm") or
    ``compute_predictions_logits`` (everything else). Post-processing writes
    its output files to ``/tmp/pred.out``, ``/tmp/nbest.out`` and
    ``/tmp/null.out``.

    Args:
        question: Question text.
        context: Text in which the answer is searched.
        n_best_size: Forwarded to the post-processing function.
        max_answer_length: Forwarded to the post-processing function.
        full_sentence: When true, return the first sentence of ``context``
            (as segmented by ``self.nlp``) that contains the predicted span
            instead of the bare span.

    Returns:
        The predicted answer text, or ``None`` when the prediction is empty.
    """
    # heavily inspired by "https://github.com/huggingface/transformers/blob/v2.3.0/examples/run_squad.py#L212-L317"
    example_id = '55555'
    example = SquadExample(example_id, question, context, None, None, None)

    features, dataset = squad_convert_examples_to_features(
        [example],
        self.tokenizer,
        self.max_seq_length,
        self.doc_stride,
        self.max_query_length,
        False,
        return_dataset='pt',
    )

    sampler = SequentialSampler(dataset)
    dataloader = DataLoader(dataset, sampler=sampler, batch_size=1)

    all_results = []
    # The mode does not change between batches, so set it once.
    self.model.eval()
    for batch in dataloader:
        batch = tuple(t.to(self.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            if self.model_type in {"xlm", "roberta", "distilbert"}:
                del inputs["token_type_ids"]

            example_index = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if self.model_type in {"xlnet", "xlm"}:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})

            outputs = self.model(**inputs)

        output = [o.detach().cpu().tolist() for o in outputs]
        # batch_size is 1, so `example_index` holds a single element.
        unique_id = int(features[example_index].unique_id)

        # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
        # models only use two. Index [0] selects the only example in the batch.
        if len(output) >= 5:
            start_logits = output[0]
            start_top_index = output[1]
            end_logits = output[2]
            end_top_index = output[3]
            cls_logits = output[4]

            squad_result = SquadResult(
                unique_id,
                start_logits[0],
                end_logits[0],
                start_top_index=start_top_index[0],
                end_top_index=end_top_index[0],
                cls_logits=cls_logits[0],
            )
        else:
            start_logits, end_logits = output
            squad_result = SquadResult(unique_id, start_logits[0], end_logits[0])

        all_results.append(squad_result)

    # XLNet and XLM use a more complex post-processing procedure
    if self.model_type in {"xlnet", "xlm"}:
        # Use the instance's model and tokenizer: the bare names `model` and
        # `tokenizer` are not defined inside this method.
        if hasattr(self.model, "config"):
            config = self.model.config
        else:
            # Wrapped models expose the real model as `.module`.
            config = self.model.module.config
        start_n_top = config.start_n_top
        end_n_top = config.end_n_top

        predictions = compute_predictions_log_probs(
            [example],
            features,
            all_results,
            n_best_size,
            max_answer_length,
            '/tmp/pred.out',
            '/tmp/nbest.out',
            '/tmp/null.out',
            start_n_top,
            end_n_top,
            self.version_2_with_negative,
            self.tokenizer,
            self.verbose,
        )
    else:
        predictions = compute_predictions_logits(
            [example],
            features,
            all_results,
            n_best_size,
            max_answer_length,
            self.do_lower_case,
            '/tmp/pred.out',
            '/tmp/nbest.out',
            '/tmp/null.out',
            self.verbose,
            self.version_2_with_negative,
            self.null_score_diff_threshold,
        )

    prediction = predictions[example_id]
    logger.debug(f'found prediction: "{prediction}"')

    # empty prediction indicates unknown answer
    if not prediction:
        logger.debug('empty prediction')
        return None

    if full_sentence:
        doc = self.nlp(context)
        for sent in doc.sents:
            if prediction in sent.text:
                prediction = sent.text
                break

    return prediction
def QA_evaluate(args, model, tokenizer, prefix=""):
    """Evaluate ``model`` on the cached SQuAD evaluation set.

    Loads the evaluation data through ``squad_load_and_cache_examples``, sets
    ``args.eval_batch_size`` from the per-GPU batch size and GPU count, runs
    the model over the data sequentially, writes prediction files into
    ``args.output_dir`` and returns the ``squad_evaluate`` scores.

    Args:
        args: Run configuration; ``args.eval_batch_size`` is overwritten.
        model: Model to evaluate; wrapped in ``torch.nn.DataParallel`` when
            more than one GPU is configured and it is not wrapped already.
        tokenizer: Tokenizer used for loading and post-processing.
        prefix: Label inserted into log output and output file names.

    Returns:
        The value returned by ``squad_evaluate``.
    """
    dataset, examples, features = squad_load_and_cache_examples(
        args, tokenizer, evaluate=True, output_examples=True
    )

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_dataloader = DataLoader(
        dataset,
        sampler=SequentialSampler(dataset),
        batch_size=args.eval_batch_size,
        shuffle=False,
    )

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info(" Num examples = %d", len(dataset))
    logger.info(" Batch size = %d", args.eval_batch_size)

    collected = []
    started = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            model_inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            example_indices = batch[3]
            outputs = model(**model_inputs)

        for row, example_index in enumerate(example_indices):
            feature = features[example_index.item()]
            uid = int(feature.unique_id)
            start_logits, end_logits = [to_list(tensor[row]) for tensor in outputs]
            collected.append(SquadResult(uid, start_logits, end_logits))

    elapsed = timeit.default_timer() - started
    logger.info(
        " Evaluation done in total %f secs (%f sec per example)",
        elapsed,
        elapsed / len(dataset),
    )

    # Compute predictions
    output_prediction_file = os.path.join(
        args.output_dir, "squad_predictions_{}.json".format(prefix)
    )
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_squad_predictions_{}.json".format(prefix)
    )
    # The null-odds file is only produced for SQuAD-2.0-style data.
    output_null_log_odds_file = (
        os.path.join(args.output_dir, "squad_null_odds_{}.json".format(prefix))
        if args.version_2_with_negative
        else None
    )

    predictions = compute_predictions_logits(
        examples,
        features,
        collected,
        args.n_best_size,
        args.max_answer_length,
        args.do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        args.verbose_logging,
        args.version_2_with_negative,
        args.null_score_diff_threshold,
        tokenizer,
    )

    # Compute the F1 and exact scores.
    return squad_evaluate(examples, predictions)
def evaluate(args, model, tokenizer, device, prefix=""):
    """Evaluate ``model`` on the validation data and return SQuAD scores.

    Data comes from ``data.load_and_cache_examples`` and
    ``data.get_dataloader``. Model inputs are adapted to ``args.model_type``:
    some types drop ``token_type_ids``; "xlnet" and "xlm" receive
    ``cls_index`` / ``p_mask`` and, when the model config has ``lang2id``, a
    ``langs`` tensor. Prediction files are written to
    ``args.output_data_dir``.

    Returns:
        The value returned by ``squad_metrics.squad_evaluate``.
    """
    eval_dataset, examples, features = data.load_and_cache_examples(
        args.validation,
        tokenizer,
        args,
        evaluate=True,
        output_examples=True,
    )
    eval_dataloader = data.get_dataloader(
        eval_dataset, args.per_gpu_eval_batch_size, evaluate=True
    )

    collected = []
    started = timeit.default_timer()
    eval_batches = 0

    for batch in eval_dataloader:
        model.eval()
        batch = tuple(t.to(device) for t in batch)
        eval_batches += 1

        with torch.no_grad():
            model_inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            if args.model_type in ["xlm", "roberta", "distilbert", "camembert"]:
                del model_inputs["token_type_ids"]

            feature_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                model_inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(model.config, "lang2id"):
                    lang_ids = torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id
                    model_inputs.update({"langs": lang_ids.to(device)})

            outputs = model(**model_inputs)

        # `row` is the position inside the batch; `feature_index` addresses
        # the full `features` list.
        for row, feature_index in enumerate(feature_indices):
            feature = features[feature_index.item()]
            uid = int(feature.unique_id)
            per_feature = [to_list(tensor[row]) for tensor in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(per_feature) < 5:
                start_logits, end_logits = per_feature
                collected.append(SquadResult(uid, start_logits, end_logits))
                continue

            start_logits = per_feature[0]
            start_top_index = per_feature[1]
            end_logits = per_feature[2]
            end_top_index = per_feature[3]
            cls_logits = per_feature[4]
            collected.append(
                SquadResult(
                    uid,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )
            )

    elapsed = timeit.default_timer() - started
    logger.info(
        " Evaluation done in total %f secs (%f sec per example)",
        elapsed,
        elapsed / (eval_batches * args.per_gpu_eval_batch_size),
    )

    # Compute predictions
    output_prediction_file = os.path.join(
        args.output_data_dir, "predictions_{}.json".format(prefix)
    )
    output_nbest_file = os.path.join(
        args.output_data_dir, "nbest_predictions_{}.json".format(prefix)
    )
    output_null_log_odds_file = None
    if args.has_unanswerable:
        output_null_log_odds_file = os.path.join(
            args.output_data_dir, "null_odds_{}.json".format(prefix)
        )

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        # Wrapped models keep the real model (and its config) under `.module`.
        config = model.config if hasattr(model, "config") else model.module.config
        start_n_top = config.start_n_top
        end_n_top = config.end_n_top

        predictions = squad_metrics.compute_predictions_log_probs(
            examples,
            features,
            collected,
            args.n_best_size,
            args.max_answer_len,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            start_n_top,
            end_n_top,
            args.has_unanswerable,
            tokenizer,
            logger.level < logging.INFO,
        )
    else:
        predictions = squad_metrics.compute_predictions_logits(
            examples,
            features,
            collected,
            args.n_best_size,
            args.max_answer_len,
            args.uncased_model,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            logger.level < logging.INFO,
            args.has_unanswerable,
            args.null_score_diff_thresh,
            tokenizer,
        )

    # Compute the F1 and exact scores.
    return squad_metrics.squad_evaluate(examples, predictions)
def evaluate(self, prefix: str, args, tokenizer, dataset, examples, features) -> torch.Tensor:
    """Evaluate ``self.model`` on ``dataset`` and score it with SQuAD metrics.

    Parameters
    ----------
    prefix : str
        Label inserted into the "Running evaluation" log line.
    args :
        Provides the post-processing settings (``n_best_size``,
        ``max_answer_length``, ``do_lower_case``, ``verbose_logging``,
        ``version_2_with_negative``, ``null_score_diff_threshold``).
        Batch size, device and output directory are read from ``self.args``.
    tokenizer :
        Tokenizer forwarded to ``compute_predictions_logits``.
    dataset :
        The evaluation dataset, iterated sequentially.
    examples :
        The examples of the evaluation dataset.
    features :
        Features of the evaluation dataset, looked up by the indices found
        in each batch.

    Returns
    -------
    The value returned by ``squad_evaluate``.
    NOTE(review): the signature annotates ``torch.Tensor`` -- confirm what
    ``squad_evaluate`` actually returns.
    """
    output_dir_missing = not os.path.exists(self.args.output_dir)
    if output_dir_missing and self.args.local_rank in [-1, 0]:
        os.makedirs(self.args.output_dir)

    eval_batch_size = self.args.per_device_eval_batch_size * max(1, self.args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_dataloader = DataLoader(
        dataset,
        sampler=SequentialSampler(dataset),
        batch_size=eval_batch_size,
    )

    # multi-gpu evaluate
    if self.args.n_gpu > 1 and not isinstance(self.model, torch.nn.DataParallel):
        self.model = torch.nn.DataParallel(self.model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info(" Num examples = %d", len(dataset))
    logger.info(" Batch size = %d", eval_batch_size)

    collected = []
    started = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        self.model.eval()
        batch = tuple(t.to(self.args.device) for t in batch)

        with torch.no_grad():
            model_inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            if self.params.model_type in ["xlm", "roberta", "distilbert"]:
                del model_inputs["token_type_ids"]

            outputs = self.model(**model_inputs)

        # NOTE(review): the feature indices are taken from batch[5] here --
        # confirm this matches the layout of `dataset`.
        example_indices = batch[5]

        for row, example_index in enumerate(example_indices):
            feature = features[example_index.item()]
            uid = int(feature.unique_id)
            start_logits, end_logits = [
                tensor_to_list(tensor[row]) for tensor in outputs
            ]
            collected.append(SquadResult(uid, start_logits, end_logits))

    eval_time = timeit.default_timer() - started
    logger.info(
        " Evaluation done in total %f secs (%f sec per example)",
        eval_time,
        eval_time / len(dataset),
    )

    # Compute predictions (no prediction / n-best / null-odds files written)
    predictions = compute_predictions_logits(
        examples,
        features,
        collected,
        args.n_best_size,
        args.max_answer_length,
        args.do_lower_case,
        None,
        None,
        None,
        args.verbose_logging,
        args.version_2_with_negative,
        args.null_score_diff_threshold,
        tokenizer,
    )

    # Compute the F1 and exact scores.
    return squad_evaluate(examples, predictions)
def evaluate(args, model, tokenizer, prefix="", global_step=None):
    """Evaluate ``model`` and write the official evaluation results to disk.

    The evaluation data is loaded through ``load_and_cache_examples`` and run
    through the model sequentially. Prediction files go to
    ``args.output_dir``. The output of ``eval_during_train(args)`` is logged
    and written to ``<args.output_dir>/eval/eval_result_<global_step>.txt``.

    Args:
        args: Run configuration (output directory, batch size, device and
            post-processing settings).
        model: Model to evaluate.
        tokenizer: Tokenizer used for loading and post-processing.
        prefix: Label inserted into log output and prediction file names.
        global_step: Label inserted into the evaluation result file name.

    Returns:
        The value returned by ``squad_evaluate``.
    """
    dataset, examples, features = load_and_cache_examples(
        args, tokenizer, evaluate=True, output_examples=True
    )

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Note that DistributedSampler samples randomly
    eval_dataloader = DataLoader(
        dataset,
        sampler=SequentialSampler(dataset),
        batch_size=args.eval_batch_size,
    )

    # Eval!
    logger.info(f"***** Running evaluation {prefix} *****")
    logger.info(f" Num examples = {len(dataset)}")
    logger.info(f" Batch size = {args.eval_batch_size}")

    collected = []
    started = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Eval"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            model_inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            example_indices = batch[3]
            outputs = model(**model_inputs)

        for row, example_index in enumerate(example_indices):
            feature = features[example_index.item()]
            uid = int(feature.unique_id)
            start_logits, end_logits = [to_list(tensor[row]) for tensor in outputs]
            collected.append(SquadResult(uid, start_logits, end_logits))

    evalTime = timeit.default_timer() - started
    logger.info(
        f" Evaluation done in total {evalTime} secs ({evalTime / len(dataset)} sec per example)"
    )

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir, f"predictions_{prefix}.json")
    output_nbest_file = os.path.join(args.output_dir, f"nbest_predictions_{prefix}.json")
    output_null_log_odds_file = (
        os.path.join(args.output_dir, f"null_odds_{prefix}.json")
        if args.version_2_with_negative
        else None
    )

    predictions = compute_predictions_logits(
        examples,
        features,
        collected,
        args.n_best_size,
        args.max_answer_length,
        False,  # do_lower_case
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        args.verbose_logging,
        args.version_2_with_negative,
        args.null_score_diff_threshold,
        tokenizer,
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)

    # Write the evaluation result on file
    eval_dir = os.path.join(args.output_dir, "eval")
    if not os.path.exists(eval_dir):
        os.makedirs(eval_dir)
    output_eval_file = os.path.join(eval_dir, f"eval_result_{global_step}.txt")

    logger.info("***** Official Eval results *****")
    with open(output_eval_file, "w", encoding="utf-8") as f:
        official_eval_results = eval_during_train(args)
        for key in sorted(official_eval_results.keys()):
            line = f" {key} = {official_eval_results[key]}"
            logger.info(line)
            f.write(line + "\n")

    return results
def train(EXP: str, MODEL_NAME: str, DELTA: float, WEIGHT_DECAY: float, DEVICE: str) -> float: EPOCHS = 3 BATCH_SIZE = 13 SAMPLES = 10 FREEZE = True LOGS = "logs" DOC_STRIDE = 128 MAX_SEQ_LENGTH = 384 MAX_QUERY_LENGTH = 64 MAX_ANSWER_LENGTH = 30 N_BEST_SIZE = 20 NULL_SCORE_THRESH = 0.0 LOWER_CASE = True THREADS = 4 LOADER_OPTIONS = { "num_workers": 10, "pin_memory": True } LR = 5e-5 ADAM_EPSILON = 1e-8 N_WARMUP_STEPS = 0 MAX_GRAD_NORM = 1 DATA_DIR = os.path.join("./dataset/squadv1") dumper = Dumper(f'dumps/dump_{EXP}_{MODEL_NAME}_{DELTA}.dump') os.makedirs(LOGS, exist_ok=True) writer_name = f"bayeformers_bert_squad.{EXP}" writer_path = os.path.join(LOGS, writer_name) writer_suff = f".DELTA_{DELTA}.WEIGHT_DECAY_{WEIGHT_DECAY}" writer = SummaryWriter(writer_path + writer_suff) o_model, tokenizer = setup_model(MODEL_NAME, LOWER_CASE) o_model = torch.nn.DataParallel(o_model, device_ids=[0, 1, 2, 3]) o_model.to(DEVICE) squadv1 = { "max_seq_length" : MAX_SEQ_LENGTH, "doc_stride" : DOC_STRIDE, "max_query_length": MAX_QUERY_LENGTH, "threads" : THREADS } train_dataset, train_examples, train_features = setup_squadv1_dataset(DATA_DIR, tokenizer=tokenizer, test=False, **squadv1) test_dataset, test_examples, test_features = setup_squadv1_dataset(DATA_DIR, tokenizer=tokenizer, test=True, **squadv1) train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, **LOADER_OPTIONS) test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, **LOADER_OPTIONS) decay = [param for name, param in o_model.named_parameters() if name in ["bias", "LayerNorm.weight"]] no_decay = [param for name, param in o_model.named_parameters() if name not in ["bias", "LayerNorm.weight"]] params_decay = { "params": decay, "weight_decay": WEIGHT_DECAY } params_no_decay = { "params": no_decay, "weight_decay": 0.0 } parameters = [params_decay, params_no_decay] criterion = nn.CrossEntropyLoss().to(DEVICE) optim = AdamW(parameters, lr=LR, eps=ADAM_EPSILON) scheduler = 
get_linear_schedule_with_warmup(optim, N_WARMUP_STEPS, EPOCHS) # =========================== FREQUENTIST ================================== report = Report() with dumper("frequentist_train"): for epoch in tqdm(range(EPOCHS), desc="Epoch"): # ============================ TRAIN ====================================== o_model.train() report.reset() with dumper("epoch", epoch): pbar = tqdm(train_loader, desc="Train") for inputs in pbar: inputs = setup_inputs(inputs, MODEL_NAME, o_model) inputs = dic2cuda(inputs, DEVICE) start_positions = inputs["start_positions"] end_positions = inputs["end_positions"] optim.zero_grad() outputs = o_model(**inputs) start_logits = outputs[1] end_logits = outputs[2] ignored_idx = start_logits.size(1) start_logits = start_logits.clamp_(0, ignored_idx) end_logits = end_logits.clamp_(0, ignored_idx) criterion.ignore_index = ignored_idx with dumper(): dumper['start_positions'] = start_positions dumper['end_positions'] = end_positions dumper['start_logits'] = start_logits dumper['end_logits'] = end_logits start_loss = criterion(start_logits, start_positions) end_loss = criterion( end_logits, end_positions) start_acc = (torch.argmax(start_logits, dim=1) == start_positions).float().sum() end_acc = (torch.argmax( end_logits, dim=1) == end_positions).float().sum() loss = 0.5 * (start_loss + end_loss) acc = 0.5 * (start_acc + end_acc) loss.backward() nn.utils.clip_grad_norm_(o_model.parameters(), MAX_GRAD_NORM) optim.step() report.total += loss.item() / len(train_loader) report.acc += acc.item() * 100 / len(train_dataset) pbar.set_postfix(total=report.total, acc=report.acc) scheduler.step() writer.add_scalar("train_nll", report.total, epoch) writer.add_scalar("train_acc", report.acc, epoch) # ============================ TEST ======================================= o_model.eval() report.reset() with dumper.section("frequentist_test"): with torch.no_grad(): results = [] pbar = tqdm(test_loader, desc="Test") for inputs in pbar: inputs = 
setup_inputs(inputs, MODEL_NAME, o_model, True) inputs = dic2cuda(inputs, DEVICE) feature_indices = inputs["feature_indices"] del inputs["feature_indices"] outputs = o_model(**inputs) for i, feature_idx in enumerate(feature_indices): eval_feature = test_features[feature_idx.item()] unique_id = int(eval_feature.unique_id) output = [to_list(output[i]) for output in outputs] start_logits, end_logits = output result = SquadResult(unique_id, start_logits, end_logits) results.append(result) with dumper(): dumper['unique_id'] = unique_id dumper['start_logits'] = start_logits dumper['end_logits'] = end_logits predictions = compute_predictions_logits( test_examples, test_features, results, N_BEST_SIZE, MAX_ANSWER_LENGTH, LOWER_CASE, os.path.join(LOGS, f"preds.frequentist.test.{writer_name + writer_suff}.json"), os.path.join(LOGS, f"nbestpreds.frequentist.test.{writer_name + writer_suff}.json"), None, True, False, NULL_SCORE_THRESH, tokenizer, ) results = squad_evaluate(test_examples, predictions) report.em = results["exact"] report.f1 = results["f1"] report.total = results["total"] print(f'em={report.em}, f1={report.f1}, total={report.total}') writer.add_scalar("test_em", report.em, epoch) writer.add_scalar("test_f1", report.f1, epoch) writer.add_scalar("test_total", report.total, epoch) # ============================ EVALUTATION ==================================== b_model = to_bayesian(o_model, delta=DELTA, freeze=FREEZE) b_model = b_model.to(DEVICE) b_model.eval() report.reset() with dumper("bayesian_eval_before_train"): with torch.no_grad(): results = [] pbar = tqdm(test_loader, desc="Bayesian Eval") for inputs in pbar: inputs = setup_inputs(inputs, MODEL_NAME, o_model, True) inputs = dic2cuda(inputs, DEVICE) feature_indices = inputs["feature_indices"] B = inputs["input_ids"].size(0) del inputs["feature_indices"] samples = sample_bayesian(b_model, inputs, SAMPLES, B, MAX_SEQ_LENGTH, DEVICE) _, _, start_logits, end_logits, log_prior, log_variational_posterior = samples 
start_logits_list = start_logits.tolist() end_logits_list = end_logits.tolist() for i, feature_idx in enumerate(feature_indices): eval_feature = test_features[feature_idx.item()] unique_id = int(eval_feature.unique_id) result = SquadResult(unique_id, start_logits_list[i], end_logits_list[i]) results.append(result) with dumper(): dumper['unique_id'] = unique_id dumper['start_logits'] = start_logits_list[i] dumper['end_logits'] = end_logits_list[i] predictions = compute_predictions_logits( test_examples, test_features, results, N_BEST_SIZE, MAX_ANSWER_LENGTH, LOWER_CASE, os.path.join(LOGS, f"preds.bayesian.eval.{writer_name + writer_suff}.json"), os.path.join(LOGS, f"nbestpreds.bayesian.eval.{writer_name + writer_suff}.json"), None, True, False, NULL_SCORE_THRESH, tokenizer, ) results = squad_evaluate(test_examples, predictions) report.em = results["exact"] report.f1 = results["f1"] report.total = results["total"] print(f'em={report.em}, f1={report.f1}, total={report.total}') writer.add_scalar("bayesian_eval_em", report.em, epoch) writer.add_scalar("bayesian_eval_f1", report.f1, epoch) writer.add_scalar("bayesian_eval_total", report.total, epoch) # ============================ BAYESIAN ====================================== decay = [param for name, param in b_model.named_parameters() if name in ["bias", "LayerNorm.weight"]] no_decay = [param for name, param in b_model.named_parameters() if name not in ["bias", "LayerNorm.weight"]] params_decay = { "params": decay, "weight_decay": WEIGHT_DECAY } params_no_decay = { "params": no_decay, "weight_decay": 0.0 } parameters = [params_decay, params_no_decay] criterion = nn.CrossEntropyLoss().to(DEVICE) optim = AdamW(parameters, lr=LR, eps=ADAM_EPSILON) scheduler = get_linear_schedule_with_warmup(optim, N_WARMUP_STEPS, EPOCHS) with dumper("bayesian_train"): for epoch in tqdm(range(EPOCHS), desc="Bayesian Epoch"): with dumper("epoch", epoch): # ============================ TRAIN ====================================== 
b_model.train() report.reset() pbar = tqdm(train_loader, desc="Bayesian Train") for inputs in pbar: inputs = setup_inputs(inputs, MODEL_NAME, o_model) inputs = dic2cuda(inputs, DEVICE) start_positions = inputs["start_positions"] end_positions = inputs["end_positions"] B = inputs["input_ids"].size(0) optim.zero_grad() samples = sample_bayesian(b_model, inputs, SAMPLES, B, MAX_SEQ_LENGTH, DEVICE) raw_start_logits, raw_end_logits, start_logits, end_logits, log_prior, log_variational_posterior = samples ignored_idx = start_logits.size(1) start_logits = start_logits.clamp_(0, ignored_idx) end_logits = end_logits.clamp_(0, ignored_idx) criterion.ignore_index = ignored_idx with dumper(): dumper['start_positions'] = start_positions dumper['end_positions'] = end_positions dumper['start_logits'] = start_logits dumper['end_logits'] = end_logits dumper['log_prior'] = log_prior dumper['log_variational_posterior'] = log_variational_posterior start_loss = criterion(start_logits, start_positions) end_loss = criterion( end_logits, end_positions) start_acc = (torch.argmax(start_logits, dim=1) == start_positions).float().sum() end_acc = (torch.argmax( end_logits, dim=1) == end_positions).float().sum() start_acc_std = np.std([(torch.argmax(start_logits.clamp(0, ignored_idx), dim=1) == start_positions).float().sum().item() for start_logits in raw_start_logits]) end_acc_std = np.std([(torch.argmax( end_logits.clamp(0, ignored_idx), dim=1) == end_positions).float().sum().item() for end_logits in raw_end_logits]) nll = 0.5 * (start_loss + end_loss) acc = 0.5 * (start_acc + end_acc) acc_std = 0.5 * (start_acc_std + end_acc_std) loss = (log_variational_posterior - log_prior) / len(train_loader) + nll loss.backward() nn.utils.clip_grad_norm_(b_model.parameters(), MAX_GRAD_NORM) optim.step() report.total += loss.item() / len(train_loader) report.nll += nll.item() / len(train_loader) report.log_prior += log_prior.item() / len(train_loader) report.log_variational_posterior += 
log_variational_posterior.item() / len(train_loader) report.acc += acc.item() * 100 / len(train_dataset) report.acc_std += acc_std / len(train_loader) pbar.set_postfix( total=report.total, nll=report.nll, log_prior=report.log_prior, log_variational_posterior=report.log_variational_posterior, acc=report.acc, acc_std=report.acc_std, ) scheduler.step() writer.add_scalar("bayesian_train_nll", report.nll, epoch) writer.add_scalar("bayesian_train_acc", report.acc, epoch) writer.add_scalar("bayesian_train_acc_std", report.acc_std, epoch) # ============================ TEST ======================================= b_model.eval() report.reset() with dumper("bayesian_test_after_train"): with torch.no_grad(): results = [] pbar = tqdm(test_loader, desc="Bayesian Test") for inputs in pbar: inputs = setup_inputs(inputs, MODEL_NAME, o_model, True) inputs = dic2cuda(inputs, DEVICE) feature_indices = inputs["feature_indices"] B = inputs["input_ids"].size(0) del inputs["feature_indices"] samples = sample_bayesian(b_model, inputs, SAMPLES, B, MAX_SEQ_LENGTH, DEVICE) _, _, start_logits, end_logits, log_prior, log_variational_posterior = samples start_logits_list = start_logits.tolist() end_logits_list = end_logits.tolist() for i, feature_idx in enumerate(feature_indices): eval_feature = test_features[feature_idx.item()] unique_id = int(eval_feature.unique_id) result = SquadResult(unique_id, start_logits_list[i], end_logits_list[i]) results.append(result) with dumper(): dumper['unique_id'] = unique_id dumper['start_logits'] = start_logits_list[i] dumper['end_logits'] = end_logits_list[i] predictions = compute_predictions_logits( test_examples, test_features, results, N_BEST_SIZE, MAX_ANSWER_LENGTH, LOWER_CASE, os.path.join(LOGS, f"preds.bayesian.test.{writer_name + writer_suff}.json"), os.path.join(LOGS, f"nbestpreds.bayesian.test.{writer_name + writer_suff}.json"), None, True, False, NULL_SCORE_THRESH, tokenizer, ) results = squad_evaluate(test_examples, predictions) report.em = 
results["exact"] report.f1 = results["f1"] report.total = results["total"] print(f'em={report.em}, f1={report.f1}, total={report.total}') writer.add_scalar("bayesian_test_em", report.em, epoch) writer.add_scalar("bayesian_test_f1", report.f1, epoch) writer.add_scalar("bayesian_test_total", report.total, epoch) # ============================ SAVE ======================================= torch.save({ "weight_decay": WEIGHT_DECAY, "delta" : DELTA, "model" : b_model.state_dict(), "em" : report.em, "f1" : report.f1, "total" : report.total, }, f"{writer_path + writer_suff}.pth") return report.acc
def process_one_question(features,
                         dataset,
                         model,
                         tokenizer,
                         examples,
                         device,
                         use_ir_score=False,
                         mu=0.0,
                         ir_scores=None):
    """Run the QA model over ``dataset`` and return raw results plus n-best predictions.

    Args:
        features: Indexable collection of features; each batch carries indices
            into it, and every feature exposes ``unique_id`` (and
            ``example_index`` when ``use_ir_score`` is set).
        dataset: Dataset whose batches hold ``input_ids``, ``attention_mask``,
            ``token_type_ids`` and the feature indices, in that order.
        model: Model called with those three keyword inputs.
        tokenizer: Forwarded to ``compute_predictions_logits_all``.
        examples: Forwarded to ``compute_predictions_logits_all``.
        device: Device every batch tensor is moved to.
        use_ir_score: When true, interpolate each logit with the retrieval
            score of the example the feature belongs to.
        mu: Interpolation weight: ``logit * (1 - mu) + mu * ir_score``.
        ir_scores: Retrieval scores indexed by ``example_index``; only read
            when ``use_ir_score`` is true.

    Returns:
        ``(all_results, predictions)`` where ``all_results`` is the list of
        ``SquadResult`` objects and ``predictions`` is the content of the
        n-best JSON file written under ``./tmp_dir``.
    """
    all_results = []
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=12)

    # Switching to eval mode is loop-invariant, so do it once up front.
    model.eval()
    for batch in eval_dataloader:
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            example_indices = batch[3]
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]
            start_logits, end_logits = output

            if use_ir_score:
                # Blend every position's logit with the (constant) retrieval
                # score of the example this feature was cut from.
                ir_scores_seq = np.ones(
                    len(start_logits)) * ir_scores[eval_feature.example_index]
                start_logits = list(
                    np.array(start_logits) * (1 - mu) + mu * ir_scores_seq)
                end_logits = list(
                    np.array(end_logits) * (1 - mu) + mu * ir_scores_seq)

            all_results.append(
                SquadResult(unique_id, start_logits, end_logits))

    prefix = ""
    output_dir = "./tmp_dir"
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(output_dir, exist_ok=True)

    output_prediction_file = os.path.join(
        output_dir, curr_date_str + "_predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        output_dir, curr_date_str + "_nbest_predictions_{}.json".format(prefix))
    output_null_log_odds_file = os.path.join(
        output_dir, curr_date_str + "_null_odds_{}.json".format(prefix))

    compute_predictions_logits_all(
        examples,
        features,
        all_results,
        20,  # n_best_size
        384,  # max_answer_length
        True,  # do_lower_case
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        False,  # verbose_logging
        False,  # version_2_with_negative
        0.0,  # null_score_diff_threshold
        tokenizer,
    )

    # Read back the n-best file that was just written; the context manager
    # guarantees the handle is closed even if parsing fails.
    with open(output_nbest_file, 'r') as nbest_file:
        predictions = json.load(nbest_file)

    return all_results, predictions
async def _custom_accuracy(self, examples, features, dataset, prefix=""):
    """Run the model over ``dataset`` and return the computed SQuAD predictions.

    Args:
        examples: Examples forwarded to the prediction post-processing.
        features: Indexable features; each batch carries indices into it.
        dataset: Dataset to evaluate, iterated sequentially.
        prefix: Tag used in the log banner and in the output file names.

    Returns:
        The value returned by ``compute_predictions_log_probs`` (for the
        ``xlnet``/``xlm`` model types) or ``compute_predictions_logits``
        (every other model type).

    Side effects:
        Creates the output directory on the main process if it is missing,
        stores ``eval_batch_size`` on the config, and may replace
        ``self.model`` with a ``DataParallel`` wrapper.
    """
    cfg = self.parent.config

    if not os.path.exists(cfg.output_dir) and cfg.local_rank in [-1, 0]:
        os.makedirs(cfg.output_dir)

    cfg.eval_batch_size = cfg.per_gpu_eval_batch_size * max(1, cfg.n_gpu)

    loader = DataLoader(
        dataset,
        sampler=SequentialSampler(dataset),
        batch_size=cfg.eval_batch_size,
    )

    # Wrap for multi-GPU evaluation, but never wrap twice.
    if cfg.n_gpu > 1 and not isinstance(self.model, torch.nn.DataParallel):
        self.model = torch.nn.DataParallel(self.model)

    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info(" Num examples = %d", len(dataset))
    logger.info(" Batch size = %d", cfg.eval_batch_size)

    no_segment_ids = ("xlm", "roberta", "distilbert", "camembert")
    rich_output_types = ("xlnet", "xlm")

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(loader, desc="Evaluating"):
        self.model.eval()
        batch = tuple(t.to(cfg.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
            }
            if cfg.model_type not in no_segment_ids:
                inputs["token_type_ids"] = batch[2]

            feature_indices = batch[3]

            # XLNet and XLM take extra inputs for their predictions.
            if cfg.model_type in rich_output_types:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # Language-id-sensitive XLM models also need a `langs` tensor.
                if hasattr(self.model, "config") and hasattr(
                        self.model.config, "lang2id"):
                    langs = torch.ones(batch[0].shape,
                                       dtype=torch.int64) * cfg.lang_id
                    inputs.update({"langs": langs.to(cfg.device)})

            outputs = self.model(**inputs)

        for row, feature_index in enumerate(feature_indices):
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)
            per_feature = [self.to_list(tensor[row]) for tensor in outputs]

            if len(per_feature) >= 5:
                # Five-output models: start/end logits, their top indices,
                # and the cls logits.
                result = SquadResult(
                    unique_id,
                    per_feature[0],
                    per_feature[2],
                    start_top_index=per_feature[1],
                    end_top_index=per_feature[3],
                    cls_logits=per_feature[4],
                )
            else:
                start_logits, end_logits = per_feature
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info(
        " Evaluation done in total %f secs (%f sec per example)",
        evalTime,
        evalTime / len(dataset),
    )

    output_prediction_file = os.path.join(
        cfg.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        cfg.output_dir, "nbest_predictions_{}.json".format(prefix))

    if cfg.model_type not in rich_output_types:
        return compute_predictions_logits(
            examples,
            features,
            all_results,
            cfg.n_best_size,
            cfg.max_answer_length,
            cfg.do_lower_case,
            output_prediction_file,
            output_nbest_file,
            None,
            True,
            False,
            cfg.null_score_diff_threshold,
            self.tokenizer,
        )

    # XLNet and XLM use a more complex post-processing procedure; the n-top
    # sizes live on the wrapped module when the model is a DataParallel.
    model_config = (self.model.config if hasattr(self.model, "config") else
                    self.model.module.config)
    return compute_predictions_log_probs(
        examples,
        features,
        all_results,
        cfg.n_best_size,
        cfg.max_answer_length,
        output_prediction_file,
        output_nbest_file,
        None,
        model_config.start_n_top,
        model_config.end_n_top,
        False,
        self.tokenizer,
        True,
    )
def evaluate(args, model, tokenizer, prefix=""):
    """Evaluate ``model`` on the cached evaluation set and return the SQuAD scores.

    Args:
        args: Run configuration. Reads ``output_dir``, ``local_rank``,
            ``per_gpu_eval_batch_size``, ``n_gpu``, ``device``, ``model_type``,
            ``n_best_size``, ``max_answer_length``, ``do_lower_case``,
            ``verbose_logging``, ``version_2_with_negative`` and
            ``null_score_diff_threshold``; writes ``eval_batch_size``.
        model: Model to evaluate; wrapped in ``DataParallel`` when several
            GPUs are configured and it is not wrapped already.
        tokenizer: Used to load the examples and, for XLNet/XLM, in
            post-processing.
        prefix: Tag used in the log banner and in the output file names.

    Returns:
        The value returned by ``squad_evaluate(examples, predictions)``.
    """
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        # exist_ok covers another process creating the directory in between.
        os.makedirs(args.output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # multi-gpu evaluate; guard against wrapping an already wrapped model,
    # which would nest DataParallel modules.
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info(" Num examples = %d", len(dataset))
    logger.info(" Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    # Eval mode is loop-invariant, so set it once.
    model.eval()
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
            if args.model_type != 'distilbert':
                # XLM doesn't use segment ids
                inputs['token_type_ids'] = (None if args.model_type == 'xlm'
                                            else batch[2])
            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[4], 'p_mask': batch[5]})

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions,
            # while the other "simpler" models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]
                result = SquadResult(unique_id,
                                     start_logits,
                                     end_logits,
                                     start_top_index=start_top_index,
                                     end_top_index=end_top_index,
                                     cls_logits=cls_logits)
            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info(" Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ['xlnet', 'xlm']:
        # A DataParallel wrapper exposes the config on `.module` instead.
        start_n_top = model.config.start_n_top if hasattr(
            model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(
            model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples, features, all_results, args.n_best_size,
            args.max_answer_length, output_prediction_file, output_nbest_file,
            output_null_log_odds_file, start_n_top, end_n_top,
            args.version_2_with_negative, tokenizer, args.verbose_logging)
    else:
        # NOTE(review): no tokenizer is passed here; confirm this matches the
        # signature of the compute_predictions_logits in use.
        predictions = compute_predictions_logits(
            examples, features, all_results, args.n_best_size,
            args.max_answer_length, args.do_lower_case, output_prediction_file,
            output_nbest_file, output_null_log_odds_file, args.verbose_logging,
            args.version_2_with_negative, args.null_score_diff_threshold)

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
def evaluate(args, model, tokenizer, prefix=""):
    """Evaluate ``model`` on the cached evaluation set, reporting progress on stdout.

    Args:
        args: Run configuration. Reads ``output_dir``, ``local_rank``,
            ``per_gpu_eval_batch_size``, ``n_gpu``, ``device``,
            ``n_best_size``, ``max_answer_length``, ``do_lower_case``,
            ``verbose_logging``, ``version_2_with_negative`` and
            ``null_score_diff_threshold``; writes ``eval_batch_size``.
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids``; expected to yield start and end logits.
        tokenizer: Used to load the examples and in post-processing.
        prefix: Tag used in the banner and in the output file names.

    Returns:
        The value returned by ``squad_evaluate(examples, predictions)``.
    """
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        # exist_ok covers another process creating the directory in between.
        os.makedirs(args.output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Eval!
    # print() does not interpolate logger-style arguments, so the values are
    # formatted explicitly with the % operator.
    print("***** Running evaluation {} *****".format(prefix))
    print(" Num examples = %d" % len(dataset))
    print(" Batch size = %d" % args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    # Eval mode is loop-invariant, so set it once.
    model.eval()

    # The context manager closes the bar even if a batch raises.
    with tqdm(total=len(dataset),
              position=0,
              leave=True,
              file=sys.stdout,
              bar_format="{l_bar}%s{bar}%s{r_bar}" %
              (Fore.GREEN, Fore.RESET)) as eval_pbar:
        for batch in eval_dataloader:
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                }
                feature_indices = batch[3]
                outputs = model(**inputs)

            for i, feature_index in enumerate(feature_indices):
                eval_feature = features[feature_index.item()]
                unique_id = int(eval_feature.unique_id)

                output = [to_list(output[i]) for output in outputs]
                start_logits, end_logits = output
                all_results.append(
                    SquadResult(unique_id, start_logits, end_logits))

            # The bar counts features, so advance by the batch size.
            eval_pbar.update(batch[0].size(0))

    evalTime = timeit.default_timer() - start_time
    print(" Evaluation done in total %f secs (%f sec per example)" %
          (evalTime, evalTime / len(dataset)))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        args.n_best_size,
        args.max_answer_length,
        args.do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        args.verbose_logging,
        args.version_2_with_negative,
        args.null_score_diff_threshold,
        tokenizer,
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
def evaluate(args, model, tokenizer, prefix=""):
    """Evaluate a positionally-called model and return the SQuAD scores.

    The model is invoked as ``model(input_ids, attention_mask,
    token_type_ids)`` (positional arguments), with the attention mask cast to
    half precision when ``args.data_type == 'fp16'``. For two-output models
    the logits of each feature are cut to the number of non-zero input ids.

    Args:
        args: Run configuration; ``eval_batch_size`` is written back to it.
        model: Model to evaluate; wrapped in ``DataParallel`` when several
            GPUs are configured and it is not wrapped already.
        tokenizer: Used to load the examples and in post-processing.
        prefix: Tag used in the log banner and in the output file names.

    Returns:
        The value returned by ``squad_evaluate(examples, predictions)``.
    """
    dataset, examples, features = load_and_cache_examples(
        args, tokenizer, evaluate=True, output_examples=True)

    main_process = args.local_rank in [-1, 0]
    if not os.path.exists(args.output_dir) and main_process:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    loader = DataLoader(dataset,
                        sampler=SequentialSampler(dataset),
                        batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info(" Num examples = %d", len(dataset))
    logger.info(" Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(loader, desc="Evaluating"):
        model.eval()

        # Count the non-zero input ids per row while the batch is still on
        # the CPU (assumes id 0 is padding - TODO confirm).
        non_zero = (batch[0] != 0).to(torch.int32)
        seq_lens = torch.sum(non_zero, dim=1).numpy()

        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            if args.data_type == 'fp16':
                attention_mask = batch[1].half()
            else:
                attention_mask = batch[1]
            example_indices = batch[3]
            outputs = model(batch[0], attention_mask, batch[2])

        for row, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            per_feature = [to_list(tensor[row]) for tensor in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions,
            # while the other "simpler" models only use two.
            if len(per_feature) >= 5:
                result = SquadResult(
                    unique_id,
                    per_feature[0],
                    per_feature[2],
                    start_top_index=per_feature[1],
                    end_top_index=per_feature[3],
                    cls_logits=per_feature[4],
                )
            else:
                start_logits, end_logits = per_feature
                length = seq_lens[row]
                result = SquadResult(unique_id, start_logits[:length],
                                     end_logits[:length])

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info(" Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))
    output_null_log_odds_file = None
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}.json".format(prefix))

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        args.n_best_size,
        args.max_answer_length,
        args.do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        args.verbose_logging,
        args.version_2_with_negative,
        args.null_score_diff_threshold,
        tokenizer,
    )

    # Compute the F1 and exact scores.
    return squad_evaluate(examples, predictions)
def predict(self, question: Question, contexts: List[Context]) -> List[Answer]:
    """Answer ``question`` against every context and return one ``Answer`` each.

    Args:
        question: The question; its ``language`` is forwarded to the
            post-processing and copied onto every answer.
        contexts: Candidate contexts; each one's ``score`` becomes the
            ``ctx_score`` of the answer at the same position.

    Returns:
        A list of ``Answer`` objects, in the iteration order of the answers
        produced by ``compute_predictions_logits``.
    """
    examples = craft_squad_examples(question, contexts)

    settings = self.args
    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=self.tokenizer,
        max_seq_length=settings["max_seq_length"],
        doc_stride=settings["doc_stride"],
        max_query_length=settings["max_query_length"],
        is_training=False,
        return_dataset="pt",
        threads=settings["threads"],
        tqdm_enabled=settings["tqdm_enabled"])

    # Note that DistributedSampler samples randomly
    loader = DataLoader(dataset,
                        sampler=SequentialSampler(dataset),
                        batch_size=32)

    all_results = []
    for batch in loader:
        self.model.eval()
        batch = tuple(t.to(self.device) for t in batch)

        with torch.no_grad():
            feature_indices = batch[3]
            outputs = self.model(input_ids=batch[0],
                                 attention_mask=batch[1],
                                 token_type_ids=batch[2])

        for row, feature_index in enumerate(feature_indices):
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)
            start_logits, end_logits = [
                to_list(tensor[row]) for tensor in outputs
            ]
            all_results.append(
                SquadResult(unique_id, start_logits, end_logits))

    answers, _ = compute_predictions_logits(
        all_examples=examples,
        all_features=features,
        all_results=all_results,
        n_best_size=settings["n_best_size"],
        max_answer_length=settings["max_answer_length"],
        do_lower_case=settings["do_lower_case"],
        output_prediction_file=settings["output_prediction_file"],
        output_nbest_file=settings["output_nbest_file"],
        output_null_log_odds_file=settings["output_null_log_odds_file"],
        verbose_logging=settings["verbose_logging"],
        version_2_with_negative=settings["version_2_with_negative"],
        null_score_diff_threshold=settings["null_score_diff_threshold"],
        tokenizer=self.tokenizer,
        language=question.language)

    # NOTE(review): pairing answers with contexts by position assumes
    # `answers` iterates in the same order as `contexts` - confirm against
    # craft_squad_examples / compute_predictions_logits.
    return [
        Answer(text=answers[key][0],
               score=answers[key][1],
               ctx_score=contexts[position].score,
               language=question.language)
        for position, key in enumerate(answers)
    ]
def run_prediction_multi(question_texts, context_texts):
    """Predict an answer for every (question, context) pair.

    Modified from run_squad.py to only produce predicted answers. Every
    question is paired with every context, so several answers are produced
    per question.

    Args:
        question_texts: List of questions.
        context_texts: List of contexts.

    Returns:
        The value returned by ``compute_predictions_logits`` for the
        generated examples.
    """
    # NOTE(review): the id is the two indices concatenated without a
    # separator, so e.g. (1, 11) and (11, 1) both yield "111"; left as-is
    # because callers may depend on the current key format.
    examples = [
        SquadExample(
            qas_id=str(question_idx) + str(context_idx),
            question_text=question_text,
            context_text=context_text,
            answer_text=None,
            start_position_character=None,
            title="Predict",
            is_impossible=False,
            answers=None,
        )
        for question_idx, question_text in enumerate(question_texts)
        for context_idx, context_text in enumerate(context_texts)
    ]

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=384,
        doc_stride=128,
        max_query_length=64,
        is_training=False,
        return_dataset="pt",
        threads=1,
    )

    loader = DataLoader(dataset,
                        sampler=SequentialSampler(dataset),
                        batch_size=10)

    all_results = []
    for batch in loader:
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            example_indices = batch[3]
            outputs = model(input_ids=batch[0],
                            attention_mask=batch[1],
                            token_type_ids=batch[2])

        for row, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            start_logits, end_logits = [
                to_list(tensor[row]) for tensor in outputs
            ]
            all_results.append(
                SquadResult(unique_id, start_logits, end_logits))

    return compute_predictions_logits(
        examples,
        features,
        all_results,
        n_best_size,
        max_answer_length,
        do_lower_case,
        "predictions.json",
        "nbest_predictions.json",
        "null_predictions.json",
        False,  # verbose_logging
        True,  # version_2_with_negative
        null_score_diff_threshold,
        tokenizer,
    )
def evaluate(args,
             model,
             tokenizer,
             prefix="",
             save_dir='',
             save_log_path=None):
    """Evaluate a QA model that also classifies answerability, and return the scores.

    Besides the SQuAD metrics, models that yield four outputs
    (``start_logits, end_logits, logits_cls, prob_cls``) are scored on the
    answerability classifier: class ``1`` means "has an answer" and class
    ``0`` means "impossible".

    Args:
        args: Run configuration; ``eval_batch_size`` is written back to it.
            When ``args.force_cls_pred`` is set, an example's prediction is
            replaced by ``""`` if the mean class-0 probability over its
            features is at least 0.8.
        model: Model to evaluate; wrapped in ``DataParallel`` when several
            GPUs are configured and it is not wrapped already.
        tokenizer: Used to load the examples and in post-processing.
        prefix: Tag used in the log banner and in the output file names.
        save_dir: Directory for the prediction files. When non-empty it is
            created on the main process; when empty, file names are used
            as relative paths.
        save_log_path: If given, the result dict is saved there as JSON.

    Returns:
        The dict returned by ``squad_evaluate`` extended with
        ``cls_accuracy``, ``cls_no_ans_accuracy`` and
        ``cls_has_ans_accuracy``. A ratio whose denominator is zero (no
        examples of that kind were classified) is reported as ``0.0``.
    """
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    # Create the output directory only when one was actually requested;
    # os.makedirs('') would raise FileNotFoundError.
    if save_dir and args.local_rank in [-1, 0]:
        os.makedirs(save_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info(" Num examples = %d", len(dataset))
    logger.info(" Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    # Confusion-matrix counters for the answerability classifier
    # ("positive" = the feature has an answer).
    y_cls_tp, y_cls_tn, y_cls_fp, y_cls_fn = 0, 0, 0, 0

    # Eval mode is loop-invariant, so set it once.
    model.eval()
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            if args.model_type in [
                    "xlm", "roberta", "distilbert", "camembert"
            ]:
                del inputs["token_type_ids"]

            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         args.lang_id).to(args.device)
                    })

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            is_impossible = eval_feature.is_impossible

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions,
            # while the other "simpler" models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )
            else:
                start_logits, end_logits, logits_cls, prob_cls = output

                # np.float was removed from NumPy; use the explicit dtype.
                prob_cls = np.asarray(prob_cls, dtype=np.float64)
                predict_cls = np.argmax(prob_cls)

                if predict_cls == int(not is_impossible):
                    if is_impossible:
                        y_cls_tn += 1
                    else:
                        y_cls_tp += 1
                else:
                    if is_impossible:
                        y_cls_fp += 1
                    else:
                        y_cls_fn += 1

                result = SquadResult(unique_id, start_logits, end_logits)

                # Add cls prediction; prob_cls only exists on this branch.
                if args.force_cls_pred:
                    result.prob_cls = prob_cls

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info(" Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(save_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        save_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            save_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        # A DataParallel wrapper exposes the config on `.module` instead.
        start_n_top = model.config.start_n_top if hasattr(
            model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(
            model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            start_n_top,
            end_n_top,
            args.version_2_with_negative,
            tokenizer,
            args.verbose_logging,
        )
    else:
        predictions = compute_predictions_logits(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            args.do_lower_case,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            args.verbose_logging,
            args.version_2_with_negative,
            args.null_score_diff_threshold,
            tokenizer,
        )

    if args.force_cls_pred:
        # Group features by example so the classifier probability can be
        # averaged over all features of one example.
        example_index_to_features = collections.defaultdict(list)
        for feature in features:
            example_index_to_features[feature.example_index].append(feature)

        unique_id_to_result = {
            result.unique_id: result
            for result in all_results
        }

        n_force = 0
        for example_index, example in enumerate(examples):
            eval_features = example_index_to_features[example_index]
            prob = [
                unique_id_to_result[eval_feature.unique_id].prob_cls[0]
                for eval_feature in eval_features
            ]

            # Index 0 is the "impossible" class: force an empty answer when
            # the classifier is confident on average.
            if np.mean(prob) >= 0.8:
                predictions[example.qas_id] = ""
                n_force += 1

        print("\n")
        print("num of force prediction:", n_force)

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)

    # Guard every ratio: a denominator is zero when no feature of that kind
    # went through the classifier branch.
    n_cls_total = y_cls_tn + y_cls_tp + y_cls_fn + y_cls_fp
    n_cls_no_ans = y_cls_tn + y_cls_fp
    n_cls_has_ans = y_cls_tp + y_cls_fn

    cls_accuracy = ((y_cls_tn + y_cls_tp) /
                    n_cls_total if n_cls_total else 0.0)
    cls_no_ans_accuracy = y_cls_tn / n_cls_no_ans if n_cls_no_ans else 0.0
    cls_has_ans_accuracy = y_cls_tp / n_cls_has_ans if n_cls_has_ans else 0.0

    # Add CLS accuracy to result
    results.update({
        'cls_accuracy': cls_accuracy,
        'cls_no_ans_accuracy': cls_no_ans_accuracy,
        'cls_has_ans_accuracy': cls_has_ans_accuracy
    })

    # save log to file
    if save_log_path:
        util.save_json_file(save_log_path, results)

    return results
def evaluate(args, model, tokenizer, prefix="", global_step=None):
    """Evaluate ``model``, write the official evaluation report, and return the SQuAD scores.

    Args:
        args: Run configuration. ``eval_batch_size`` is read here, not
            computed, so it must already be set.
        model: Model to evaluate.
        tokenizer: Used to load the examples and in post-processing.
        prefix: Tag used in the log banner and in the output file names.
        global_step: Step number embedded in the report file name.

    Returns:
        The value returned by ``squad_evaluate(examples, predictions)``. The
        report written under ``<output_dir>/eval`` holds the result of
        ``eval_during_train(args)`` instead.
    """
    dataset, examples, features = load_and_cache_examples(
        args, tokenizer, evaluate=True, output_examples=True)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Note that DistributedSampler samples randomly
    loader = DataLoader(dataset,
                        sampler=SequentialSampler(dataset),
                        batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info(" Num examples = %d", len(dataset))
    logger.info(" Batch size = %d", args.eval_batch_size)

    no_segment_ids = ("xlm", "roberta", "distilbert", "distilkobert",
                      "xlm-roberta")

    all_results = []
    start_time = timeit.default_timer()

    for batch in progress_bar(loader):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            if args.model_type in no_segment_ids:
                inputs.pop("token_type_ids")

            # In the case of reforbert
            if args.model_type in ("reforbert", ):
                inputs.pop("attention_mask")

            example_indices = batch[3]
            outputs = model(**inputs)

        for row, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            per_feature = [to_list(tensor[row]) for tensor in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions,
            # while the other "simpler" models only use two.
            if len(per_feature) >= 5:
                result = SquadResult(
                    unique_id,
                    per_feature[0],
                    per_feature[2],
                    start_top_index=per_feature[1],
                    end_top_index=per_feature[3],
                    cls_logits=per_feature[4],
                )
            else:
                start_logits, end_logits = per_feature
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info(" Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))
    output_null_log_odds_file = None
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}.json".format(prefix))

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        args.n_best_size,
        args.max_answer_length,
        args.do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        args.verbose_logging,
        args.version_2_with_negative,
        args.null_score_diff_threshold,
        tokenizer,
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)

    # Write the evaluation result to a file named after the last non-empty
    # path component of the model name and the global step.
    output_dir = os.path.join(args.output_dir, 'eval')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    name_parts = [
        part for part in args.model_name_or_path.split("/") if part
    ]
    output_eval_file = os.path.join(
        output_dir, "eval_result_{}_{}.txt".format(name_parts[-1],
                                                   global_step))

    logger.info("***** Official Eval results *****")
    with open(output_eval_file, "w", encoding='utf-8') as report:
        official_eval_results = eval_during_train(args)
        for key in sorted(official_eval_results.keys()):
            value = str(official_eval_results[key])
            logger.info(" %s = %s", key, value)
            report.write(" {} = {}\n".format(key, value))

    return results
def predict(self, id_, question, paragraph_texts, paragraph_scores):
    """Answer one question against a set of candidate paragraphs.

    Builds inference examples from the question and paragraphs, converts
    them to features, runs ``self.model`` over them without gradients, and
    post-processes the logits into answers.

    Args:
        id_: Identifier copied verbatim into every returned answer dict.
        question: The question, passed to ``create_inference_examples``.
        paragraph_texts: Candidate paragraphs, passed to
            ``create_inference_examples``.
        paragraph_scores: Per-paragraph scores; indexed by answer position
            to fill ``"paragraph_score"`` in the result.

    Returns:
        A list of dicts with keys ``"id"``, ``"answer"``, ``"phrase_score"``
        and ``"paragraph_score"``, one per entry in the answers returned by
        the post-processing helper.

    Side effects:
        Sets ``self.args.eval_batch_size``; may replace ``self.model`` with a
        ``torch.nn.DataParallel`` wrapper when ``self.args.n_gpu > 1``.
        Output file paths under ``self.args.output_dir`` are handed to the
        post-processing helpers (presumably written there -- confirm in
        ``compute_predictions_*``).
    """
    examples = create_inference_examples(
        question,
        paragraph_texts,
        paragraph_scores,
        chinese=self.args.chinese,
        tokenizer=self.tokenizer,
    )
    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=self.tokenizer,
        max_seq_length=self.args.max_seq_length,
        doc_stride=self.args.doc_stride,
        max_query_length=self.args.max_query_length,
        # Prediction is inference-only, so this is stated explicitly instead
        # of being derived from an unrelated module-level name.
        is_training=False,
        return_dataset="pt",
        threads=self.args.threads,
        tqdm_enabled=False,
    )

    self.args.eval_batch_size = self.args.per_gpu_eval_batch_size * max(
        1, self.args.n_gpu)

    # Sequential order keeps feature indices aligned with `features`.
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=self.args.eval_batch_size)

    # Multi-GPU inference: wrap once, never double-wrap.
    if self.args.n_gpu > 1 and not isinstance(self.model,
                                              torch.nn.DataParallel):
        self.model = torch.nn.DataParallel(self.model)

    # Switching to eval mode is loop-invariant, so do it once up front.
    self.model.eval()

    all_results = []
    for batch in eval_dataloader:
        batch = tuple(t.to(self.args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            feature_indices = batch[3]
            outputs = self.model(**inputs)

        for i, feature_index in enumerate(feature_indices):
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Models returning five or more outputs also provide top-k
            # indices and cls logits; the others return only start/end logits.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )
            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    # Output locations for the post-processing helpers.
    prefix = ""
    output_prediction_file = os.path.join(
        self.args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        self.args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if self.args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            self.args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a different post-processing procedure.
    if self.args.model_type in ["xlnet", "xlm"]:
        # A DataParallel wrapper exposes the config on `.module` instead.
        config = (self.model.config if hasattr(self.model, "config") else
                  self.model.module.config)
        start_n_top = config.start_n_top
        end_n_top = config.end_n_top

        answers, nbest_answers = compute_predictions_log_probs(
            examples, features, all_results, self.args.n_best_size,
            self.args.max_answer_length, output_prediction_file,
            output_nbest_file, output_null_log_odds_file, start_n_top,
            end_n_top, self.args.version_2_with_negative, self.tokenizer,
            self.args.verbose_logging, self.args.chinese)
    else:
        answers, nbest_answers = compute_predictions_logits(
            examples, features, all_results, self.args.n_best_size,
            self.args.max_answer_length, self.args.do_lower_case,
            output_prediction_file, output_nbest_file,
            output_null_log_odds_file, self.args.verbose_logging,
            self.args.version_2_with_negative,
            self.args.null_score_diff_threshold, self.tokenizer,
            self.args.chinese)

    # `answers` is iterated by key; each value is indexed as
    # (answer text, phrase score). The i-th answer is paired with the i-th
    # paragraph score -- assumes both follow paragraph order (TODO confirm).
    all_answers = []
    for answer_id, ans in enumerate(answers):
        all_answers.append({
            "id": id_,
            "answer": answers[ans][0],
            "phrase_score": answers[ans][1],
            "paragraph_score": paragraph_scores[answer_id],
        })
    return all_answers
def evaluate(args, model_path1, model1, model2, model3, tokenizer, prefix=""):
    """Evaluate a fixed three-model ensemble on the cached evaluation set.

    Every batch is run through ``model1``, ``model2`` and ``model3``; their
    start/end logits are blended per position with weights 0.4 / 0.2 / 0.4
    and the blended logits are post-processed with
    ``compute_predictions_logits`` and scored with ``squad_evaluate``.

    Args:
        args: Run configuration (reads ``output_dir``, ``local_rank``,
            ``per_gpu_eval_batch_size``, ``n_gpu``, ``device`` and the
            post-processing options; writes ``eval_batch_size``).
        model_path1: Forwarded to ``load_and_cache_examples``.
        model1, model2, model3: The ensemble members; each must return
            exactly two outputs (start logits, end logits).
        tokenizer: Forwarded to data loading and post-processing.
        prefix: Tag inserted into log messages and output file names.

    Returns:
        Whatever ``squad_evaluate(examples, predictions)`` returns.
    """
    dataset, examples, features = load_and_cache_examples(
        args, model_path1, tokenizer, evaluate=True, output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Sequential sampling keeps feature indices aligned with `features`.
    loader = DataLoader(dataset,
                        sampler=SequentialSampler(dataset),
                        batch_size=args.eval_batch_size)

    # Multi-GPU: wrap each member that is not already wrapped.
    members = [model1, model2, model3]
    if args.n_gpu > 1:
        members = [
            member if isinstance(member, torch.nn.DataParallel) else
            torch.nn.DataParallel(member) for member in members
        ]

    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info(" Num examples = %d", len(dataset))
    logger.info(" Batch size = %d", args.eval_batch_size)

    # Weighted-sum ensembling. An arithmetic mean or a position-wise max of
    # the three logits are drop-in alternatives.
    w1, w2, w3 = 0.4, 0.2, 0.4

    all_results = []
    started = timeit.default_timer()

    for batch in tqdm(loader, desc="Evaluating"):
        for member in members:
            member.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = dict(
                input_ids=batch[0],
                attention_mask=batch[1],
                token_type_ids=batch[2],
            )
            feature_indices = batch[3]
            member_outputs = [member(**inputs) for member in members]

        for row, feature_index in enumerate(feature_indices):
            unique_id = int(features[feature_index.item()].unique_id)

            # One [start_logits, end_logits] pair per ensemble member.
            per_member = [[to_list(tensor[row]) for tensor in outputs]
                          for outputs in member_outputs]
            (start1, end1), (start2, end2), (start3, end3) = per_member

            start_logits = [
                w1 * a + w2 * b + w3 * c
                for a, b, c in zip(start1, start2, start3)
            ]
            end_logits = [
                w1 * a + w2 * b + w3 * c
                for a, b, c in zip(end1, end2, end3)
            ]

            all_results.append(
                SquadResult(unique_id, start_logits, end_logits))

    elapsed = timeit.default_timer() - started
    logger.info(
        " Evaluation done in total %f secs (%f sec per example)",
        elapsed,
        elapsed / len(dataset),
    )

    # Output locations for the post-processing helper.
    output_prediction_file = os.path.join(
        args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))
    output_null_log_odds_file = (os.path.join(
        args.output_dir, "null_odds_{}.json".format(prefix))
                                 if args.version_2_with_negative else None)

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        args.n_best_size,
        args.max_answer_length,
        args.do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        args.verbose_logging,
        args.version_2_with_negative,
        args.null_score_diff_threshold,
        tokenizer,
    )

    # Compute the F1 and exact scores.
    return squad_evaluate(examples, predictions)
def evaluate_ensemble(args, checkpoints, tokenizer, model_class, prefix=""):
    """Evaluate an ensemble of fine-tuned checkpoints by averaging logits.

    Every checkpoint is loaded with ``model_class.from_pretrained``, run on
    each evaluation batch, and the per-position start/end logits are averaged
    across models before post-processing with ``compute_predictions_logits``
    and scoring with ``squad_evaluate``.

    Args:
        args: Run configuration (reads ``model_type``, ``output_dir``,
            ``local_rank``, ``per_gpu_eval_batch_size``, ``n_gpu``,
            ``device`` and the post-processing options; writes
            ``eval_batch_size``).
        checkpoints: Iterable of checkpoint identifiers to ensemble.
        tokenizer: Forwarded to data loading and post-processing.
        model_class: Class exposing ``from_pretrained(ckpt)``.
        prefix: Tag inserted into log messages and output file names.

    Returns:
        Whatever ``squad_evaluate(examples, predictions)`` returns.

    Raises:
        NotImplementedError: For XLNet/XLM model types, or if a model returns
            five or more outputs (that result format is not ensembled).
        ValueError: If ``checkpoints`` is empty.
    """
    # Fail fast: these model types always ended in NotImplementedError, but
    # only after the data and every checkpoint had been loaded.
    if args.model_type in ["xlnet", "xlm"]:
        raise NotImplementedError(
            "Ensembling is not implemented for XLNet/XLM models")

    dataset, examples, features = load_and_cache_examples(
        args, tokenizer, evaluate=True, output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Sequential sampling keeps example indices aligned with `features`.
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info(" Num examples = %d", len(dataset))
    logger.info(" Batch size = %d", args.eval_batch_size)

    # Load every member once and prepare it fully here. The multi-GPU wrap
    # must happen per loaded model: no model exists before this loop.
    model_list = []
    for ckpt in checkpoints:
        logger.info("Evaluate the following fine_tuned_model: %s", ckpt)
        model = model_class.from_pretrained(ckpt)
        model.to(args.device)
        if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
            model = torch.nn.DataParallel(model)
        model.eval()
        model_list.append(model)

    if not model_list:
        raise ValueError("evaluate_ensemble needs at least one checkpoint")

    all_results = []
    # Start timing after loading so the per-example figure reflects
    # inference only.
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(args.device) for t in batch)

        inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "token_type_ids": batch[2],
        }
        # These model types are not fed token type ids.
        if args.model_type in ["roberta", "distilbert"]:
            del inputs["token_type_ids"]

        example_indices = batch[3]

        with torch.no_grad():
            outputs_list = [model(**inputs) for model in model_list]

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            start_logits_list, end_logits_list = [], []
            for outputs in outputs_list:
                output = [to_list(output[i]) for output in outputs]

                # Five or more outputs is the top-k/cls result format, which
                # this averaging scheme does not handle.
                if len(output) >= 5:
                    raise NotImplementedError(
                        "Ensembling is not implemented for models that "
                        "return top-k indices and cls logits")

                start_logits, end_logits = output
                start_logits_list.append(start_logits)
                end_logits_list.append(end_logits)

            # Ensembling method: arithmetic mean over models, per position.
            start_logits = list(np.array(start_logits_list).mean(axis=0))
            end_logits = list(np.array(end_logits_list).mean(axis=0))
            all_results.append(
                SquadResult(unique_id, start_logits, end_logits))

    evalTime = timeit.default_timer() - start_time
    logger.info(" Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(dataset))

    # Output locations for the post-processing helper.
    output_prediction_file = os.path.join(
        args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        args.n_best_size,
        args.max_answer_length,
        args.do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        args.verbose_logging,
        args.version_2_with_negative,
        args.null_score_diff_threshold,
        tokenizer,
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
def evaluate(args, model, tokenizer, prefix="", calibration=False):
    """Run a timed evaluation pass, optionally as a short calibration run.

    The first ``args.warmup`` batches are excluded from timing. With
    ``calibration=True`` the loop stops after roughly 5% of the dataset;
    with ``args.iter > 0`` it stops after ``warmup + iter`` batches. Metrics
    are computed only for a full run (``not calibration and args.iter == 0``).

    Args:
        args: Run configuration (reads ``per_gpu_eval_batch_size``, ``n_gpu``,
            ``device``, ``mkldnn_eval``, ``model_type``, ``warmup``, ``iter``
            and the post-processing options; writes ``eval_batch_size``).
        model: Model to evaluate; wrapped in ``DataParallel`` when
            ``args.n_gpu > 1`` and converted with ``to_mkldnn`` when
            ``args.mkldnn_eval`` is set.
        tokenizer: Forwarded to data loading (and XLNet/XLM post-processing).
        prefix: Tag inserted into log messages and output file names.
        calibration: If true, process only the calibration subset and skip
            metric computation.

    Returns:
        ``(results, perf)``: ``results`` is the ``squad_evaluate`` output for
        a full run, otherwise ``None``; ``perf`` is throughput in samples per
        second, or ``0.0`` when no batch was timed.
    """
    dataset, examples, features = load_and_cache_examples(
        args, tokenizer, evaluate=True, output_examples=True)

    # Prediction files go to a fixed local directory.
    dataset_cached = "./dataset_cached"
    os.makedirs(dataset_cached, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Sequential sampling keeps example indices aligned with `features`.
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Number of batches covering 5% of the dataset, rounded up.
    calibration_iterations = int(
        (len(dataset) * 0.05 + args.eval_batch_size - 1) /
        args.eval_batch_size)

    # Multi-GPU: wrap once, never double-wrap.
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info(" Num examples = %d", len(dataset))
    print(" Batch size = %d" % args.eval_batch_size)

    if args.mkldnn_eval:
        from torch.utils import mkldnn as mkldnn_utils
        model = mkldnn_utils.to_mkldnn(model)
        print(model)

    model.eval()

    all_results = []
    evalTime = 0
    nb_eval_steps = 0
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        if calibration and nb_eval_steps >= calibration_iterations:
            break

        batch = tuple(t.to(args.device) for t in batch)
        timed = nb_eval_steps >= args.warmup

        with torch.no_grad():
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
            if args.model_type != 'distilbert':
                # XLM don't use segment_ids
                inputs['token_type_ids'] = (None if args.model_type == 'xlm'
                                            else batch[2])
            example_indices = batch[3]
            # XLNet and XLM use more arguments for their predictions.
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[4], 'p_mask': batch[5]})

            if timed:
                start_time = timeit.default_timer()
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Models returning five or more outputs also provide top-k
            # indices and cls logits; the others return only start/end logits.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(unique_id,
                                     start_logits,
                                     end_logits,
                                     start_top_index=start_top_index,
                                     end_top_index=end_top_index,
                                     cls_logits=cls_logits)
            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

        # The timed span covers the forward pass and result conversion.
        if timed:
            evalTime += (timeit.default_timer() - start_time)

        nb_eval_steps += 1
        if args.iter > 0 and nb_eval_steps >= (args.warmup + args.iter):
            break

    # Throughput needs at least one timed batch; otherwise report 0.0 rather
    # than dividing by zero or leaving `perf` unbound.
    perf = 0.0
    timed_steps = nb_eval_steps - args.warmup
    if timed_steps > 0 and evalTime > 0:
        perf = timed_steps * args.eval_batch_size / evalTime
        if args.eval_batch_size == 1:
            print('Latency: %.3f ms' % (evalTime / timed_steps * 1000))
        print("Evaluation done in total %f secs (Throughput: %f samples/sec)"
              % (evalTime, perf))
    else:
        logger.info(
            "*****no performance, please check dataset length and warmup number *****"
        )

    # Output locations for the post-processing helpers.
    output_prediction_file = os.path.join(
        dataset_cached, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        dataset_cached, "nbest_predictions_{}.json".format(prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            dataset_cached, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    full_run = not calibration and args.iter == 0

    # XLNet and XLM use a different post-processing procedure.
    if args.model_type in ['xlnet', 'xlm']:
        # A DataParallel wrapper exposes the config on `.module` instead.
        config = (model.config
                  if hasattr(model, "config") else model.module.config)
        start_n_top = config.start_n_top
        end_n_top = config.end_n_top

        predictions = compute_predictions_log_probs(
            examples, features, all_results, args.n_best_size,
            args.max_answer_length, output_prediction_file, output_nbest_file,
            output_null_log_odds_file, start_n_top, end_n_top,
            args.version_2_with_negative, tokenizer, args.verbose_logging)
    elif full_run:
        predictions = compute_predictions_logits(
            examples, features, all_results, args.n_best_size,
            args.max_answer_length, args.do_lower_case,
            output_prediction_file, output_nbest_file,
            output_null_log_odds_file, args.verbose_logging,
            args.version_2_with_negative, args.null_score_diff_threshold)

    # Compute the F1 and exact scores.
    if full_run:
        results = squad_evaluate(examples, predictions)
        # Report the first available metric, in priority order.
        bert_task_acc_keys = ['best_f1', 'f1', 'mcc', 'spearmanr', 'acc']
        acc = next(
            (results[key] for key in bert_task_acc_keys if key in results),
            None)
        if acc is None:
            logger.warning("No accuracy metric found in evaluation results")
        else:
            print("Accuracy: %.5f" % acc)
    else:
        results = None

    return results, perf
def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""):
    """Evaluate a span-extraction model whose inputs include bounding boxes.

    Runs ``model`` over the evaluation set, collects start/end logits per
    feature, post-processes them with ``compute_predictions_logits`` using
    fixed settings, and scores the predictions with ``squad_evaluate``.

    Args:
        args: Run configuration (reads ``per_gpu_eval_batch_size``, ``n_gpu``,
            ``local_rank``, ``device``, ``output_dir``, ``do_lower_case``;
            writes ``eval_batch_size``).
        model: Model returning exactly two outputs (start and end logits).
        tokenizer: Forwarded to data loading and post-processing.
        labels: Forwarded to ``load_and_cache_examples``.
        pad_token_label_id: Forwarded to ``load_and_cache_examples``.
        mode: Forwarded to ``load_and_cache_examples`` as ``mode=``.
        prefix: Tag inserted into log messages and output file names.

    Returns:
        Whatever ``squad_evaluate(examples, predictions)`` returns.
    """
    eval_dataset, features, examples = load_and_cache_examples(
        args, tokenizer, labels, pad_token_label_id, mode=mode)

    # Make sure the directory targeted by the prediction files exists.
    if args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = (SequentialSampler(eval_dataset) if args.local_rank == -1
                    else DistributedSampler(eval_dataset))
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation %s *****", prefix)
    logger.info(" Num examples = %d", len(eval_dataset))
    logger.info(" Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()
    model.eval()
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            # Batch layout as used here: 0 input ids, 1 attention mask,
            # 5 bounding boxes, 6 token type ids, 7 example indices.
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
            }
            inputs["bbox"] = batch[5]
            inputs["token_type_ids"] = batch[6]
            outputs = model(**inputs)

        example_indices = batch[7]
        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]
            start_logits, end_logits = output
            all_results.append(
                SquadResult(unique_id, start_logits, end_logits))

    evalTime = timeit.default_timer() - start_time
    logger.info(" Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(eval_dataset))

    output_prediction_file = os.path.join(
        args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))
    output_null_log_odds_file = os.path.join(
        args.output_dir, "null_odds_{}.json".format(prefix))

    # Fixed post-processing settings, named after the argument positions the
    # sibling evaluation functions fill from `args`.
    n_best_size = 20
    max_answer_length = 30
    verbose_logging = True
    version_2_with_negative = True
    null_score_diff_threshold = 0.0

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        n_best_size,
        max_answer_length,
        args.do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        verbose_logging,
        version_2_with_negative,
        null_score_diff_threshold,
        tokenizer,
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
def evaluate(args, model, tokenizer, prefix="", adapter_names=None):
    """Evaluate ``model`` on the cached evaluation set and return its scores.

    Each batch is run without gradients, with ``adapter_names`` passed to the
    model as a keyword argument. Per-feature outputs are collected as
    ``SquadResult`` objects, post-processed into predictions, and scored with
    ``squad_evaluate``.

    Args:
        args: Run configuration (reads ``output_dir``, ``local_rank``,
            ``per_gpu_eval_batch_size``, ``n_gpu``, ``device``,
            ``model_type``, ``lang_id`` and the post-processing options;
            writes ``eval_batch_size``).
        model: Model to evaluate; wrapped in ``DataParallel`` when
            ``args.n_gpu > 1``.
        tokenizer: Forwarded to data loading and post-processing.
        prefix: Tag inserted into log messages and output file names.
        adapter_names: Forwarded unchanged to every model call.

    Returns:
        Whatever ``squad_evaluate(examples, predictions)`` returns.
    """
    dataset, examples, features = load_and_cache_examples(
        args, tokenizer, evaluate=True, output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Sequential sampling keeps feature indices aligned with `features`.
    loader = DataLoader(dataset,
                        sampler=SequentialSampler(dataset),
                        batch_size=args.eval_batch_size)

    # Multi-GPU: wrap once, never double-wrap.
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info(" Num examples = %d", len(dataset))
    logger.info(" Batch size = %d", args.eval_batch_size)

    skips_token_type_ids = args.model_type in (
        "xlm", "roberta", "distilbert", "camembert")
    needs_extra_inputs = args.model_type in ("xlnet", "xlm")

    all_results = []
    started = timeit.default_timer()

    for batch in tqdm(loader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
            if not skips_token_type_ids:
                inputs["token_type_ids"] = batch[2]
            inputs["adapter_names"] = adapter_names

            feature_indices = batch[3]

            # XLNet and XLM take two additional tensors from the batch.
            if needs_extra_inputs:
                inputs["cls_index"] = batch[4]
                inputs["p_mask"] = batch[5]
                # Models whose config defines `lang2id` also get a
                # language-id tensor shaped like the input ids.
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs["langs"] = (
                        torch.ones(batch[0].shape, dtype=torch.int64) *
                        args.lang_id).to(args.device)

            outputs = model(**inputs)

        for row, feature_index in enumerate(feature_indices):
            unique_id = int(features[feature_index.item()].unique_id)
            fields = [to_list(tensor[row]) for tensor in outputs]

            # Five or more outputs: top-k indices and cls logits are present.
            # Otherwise the model returned only start and end logits.
            if len(fields) >= 5:
                (start_logits, start_top_index, end_logits, end_top_index,
                 cls_logits) = fields[:5]
                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )
            else:
                start_logits, end_logits = fields
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    elapsed = timeit.default_timer() - started
    logger.info(" Evaluation done in total %f secs (%f sec per example)",
                elapsed, elapsed / len(dataset))

    # Output locations for the post-processing helpers.
    output_prediction_file = os.path.join(
        args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))
    output_null_log_odds_file = (os.path.join(
        args.output_dir, "null_odds_{}.json".format(prefix))
                                 if args.version_2_with_negative else None)

    if needs_extra_inputs:
        # XLNet and XLM use a different post-processing procedure. A
        # DataParallel wrapper exposes the config on `.module` instead.
        config = (model.config
                  if hasattr(model, "config") else model.module.config)
        start_n_top = config.start_n_top
        end_n_top = config.end_n_top

        predictions = compute_predictions_log_probs(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            start_n_top,
            end_n_top,
            args.version_2_with_negative,
            tokenizer,
            args.verbose_logging,
        )
    else:
        predictions = compute_predictions_logits(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            args.do_lower_case,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            args.verbose_logging,
            args.version_2_with_negative,
            args.null_score_diff_threshold,
            tokenizer,
        )

    # Compute the F1 and exact scores.
    return squad_evaluate(examples, predictions)
def generate_model_outputs(args, model, tokenizer, is_dev=False, prefix='', save_dir=''):
    """Run ``model`` over a dataset and pickle the raw outputs to ``save_dir``.

    Writes four files into ``save_dir``: ``features.pkl``,
    ``all_results.pkl``, ``tokenizer.pkl`` and ``config.json`` (model name,
    split type, example and feature counts).

    Args:
        args: Run configuration (reads ``local_rank``,
            ``per_gpu_eval_batch_size``, ``n_gpu``, ``device``,
            ``model_type``, ``lang_id``, ``name``; writes
            ``eval_batch_size``).
        model: Model to run; wrapped in ``DataParallel`` when
            ``args.n_gpu > 1``.
        tokenizer: Forwarded to data loading and pickled alongside the
            results.
        is_dev: Passed to ``load_and_cache_examples`` as ``evaluate=`` and
            recorded in ``config.json`` as ``'dev'`` or ``'train'``.
        prefix: Tag inserted into the log banner.
        save_dir: Output directory; created if missing. An empty value
            writes the files relative to the current working directory.

    Returns:
        None.
    """
    dataset, examples, features = load_and_cache_examples(
        args, tokenizer, evaluate=is_dev, output_examples=True)
    logger.info(
        f'REAL number of examples {len(examples)} and features {len(features)}!'
    )

    # Create the target directory when one was given. The previous check was
    # inverted: it only fired for an empty path, which cannot be created.
    if save_dir and args.local_rank in [-1, 0]:
        os.makedirs(save_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Sequential sampling keeps example indices aligned with `features`.
    sampler = SequentialSampler(dataset)
    dataloader = DataLoader(dataset,
                            sampler=sampler,
                            batch_size=args.eval_batch_size)

    # Multi-GPU: wrap once, never double-wrap.
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Output!
    logger.info("***** Generating outputs {} *****".format(prefix))
    logger.info(" Num examples = %d", len(dataset))
    logger.info(" Batch size = %d", args.eval_batch_size)

    all_results = []

    for batch in tqdm(dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            # These model types are not fed token type ids.
            if args.model_type in [
                    "xlm", "roberta", "distilbert", "camembert"
            ]:
                del inputs["token_type_ids"]

            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions.
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # Models whose config defines `lang2id` also get a
                # language-id tensor shaped like the input ids.
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         args.lang_id).to(args.device)
                    })

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Models returning five or more outputs also provide top-k
            # indices and cls logits; the others return only start/end logits.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )
            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    print('# of resuls in all_results:', len(all_results))

    # Save features
    with open(os.path.join(save_dir, 'features.pkl'), 'wb') as f:
        pickle.dump(features, f)

    # Save all_results
    with open(os.path.join(save_dir, 'all_results.pkl'), 'wb') as f:
        pickle.dump(all_results, f)

    # Save tokenizer
    with open(os.path.join(save_dir, 'tokenizer.pkl'), 'wb') as f:
        pickle.dump(tokenizer, f)

    json_to_save = {
        'model_name': args.name,
        'type': 'dev' if is_dev else 'train',
        'num_examples': len(examples),
        'num_features': len(features)
    }
    util.save_json_file(os.path.join(save_dir, 'config.json'), json_to_save)