def post_processing_function(
    examples: datasets.Dataset, features: datasets.Dataset, outputs: EvalLoopOutput, stage="eval"
):
    # Decode the predicted tokens.
    preds = outputs.predictions
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Build a map from each example to its corresponding features.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    feature_per_example = {example_id_to_index[feature["example_id"]]: i for i, feature in enumerate(features)}
    predictions = {}
    # Let's loop over all the examples!
    for example_index, example in enumerate(examples):
        # This is the index of the feature associated with the current example.
        feature_index = feature_per_example[example_index]
        predictions[example["id"]] = decoded_preds[feature_index]

    # Format the result to the format the metric expects.
    if data_args.version_2_with_negative:
        formatted_predictions = [
            {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items()
        ]
    else:
        formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]

    references = [{"id": ex["id"], "answers": ex[answer_column]} for ex in examples]
    return EvalPrediction(predictions=formatted_predictions, label_ids=references)
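# A self-contained sketch of the example-to-feature mapping used by the post-processing functions
# above: each decoded prediction is attached to the example whose "id" matches the feature's
# "example_id". The toy ids and strings below are illustrative only, not from the original code.
example_ids = ["q1", "q2", "q3"]             # examples["id"]
feature_example_ids = ["q2", "q1", "q3"]     # features[i]["example_id"]
decoded = ["Paris", "blue", "42"]            # one decoded string per feature

example_id_to_index = {k: i for i, k in enumerate(example_ids)}
feature_per_example = {example_id_to_index[eid]: i for i, eid in enumerate(feature_example_ids)}
predictions = {example_ids[ex_idx]: decoded[feat_idx]
               for ex_idx, feat_idx in feature_per_example.items()}
assert predictions == {"q2": "Paris", "q1": "blue", "q3": "42"}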
def evaluate():
    model.eval()
    eval_losses: List[float] = []
    preds: torch.Tensor = None
    label_ids: torch.Tensor = None
    for inputs in tqdm(eval_iterator):
        loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only)
        if loss is not None:
            eval_losses.append(loss)
        if logits is not None:
            preds = logits if preds is None else torch.cat((preds, logits), dim=0)
        if labels is not None:
            label_ids = labels if label_ids is None else torch.cat((label_ids, labels), dim=0)

    # Finally, turn the aggregated tensors into numpy arrays.
    if preds is not None:
        preds = preds.cpu().numpy()
    if label_ids is not None:
        label_ids = label_ids.cpu().numpy()

    if self.compute_metrics is not None and preds is not None and label_ids is not None:
        metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids))
    else:
        metrics = {}
    if len(eval_losses) > 0:
        metrics["eval_loss"] = np.mean(eval_losses)
    return metrics
def post_proc(self, xs, features, outs, stage="eval"):
    ps = self.params
    preds = outs.predictions
    if isinstance(preds, tuple):
        preds = preds[0]
    preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Map each example id to its index, then each example index to its feature index.
    example_id_to_index = {k: i for i, k in enumerate(xs["id"])}
    feature_per_example = {example_id_to_index[x["example_id"]]: i for i, x in enumerate(features)}
    ys = {}
    for i, x in enumerate(xs):
        ys[x["id"]] = preds[feature_per_example[i]]

    if ps.version_2_with_negative:
        ys = [
            {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in ys.items()
        ]
    else:
        ys = [{"id": k, "prediction_text": v} for k, v in ys.items()]
    ls = [{"id": x["id"], "answers": x[self.cols[EACH][2]]} for x in xs]
    return EvalPrediction(predictions=ys, label_ids=ls)
def prediction_loop(self, *args, **kwargs) -> PredictionOutput:
    pred_outs = super().prediction_loop(*args, **kwargs)
    preds, label_ids, metrics = pred_outs.predictions, pred_outs.label_ids, pred_outs.metrics
    preds = preds.squeeze()
    if self.compute_metrics is not None:
        metrics_no_label = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids))
    else:
        metrics_no_label = {}

    for key in list(metrics_no_label.keys()):
        if not key.startswith("eval_"):
            metrics_no_label[f"eval_{key}"] = metrics_no_label.pop(key)

    return PredictionOutput(
        predictions=preds, label_ids=label_ids, metrics={**metrics, **metrics_no_label}
    )
def compute_loss(self, model, inputs):
    """
    Override loss computation to calculate and log metrics during training
    """
    outputs = model(**inputs)

    # Custom logging steps (to log training metrics)
    if (self.state.global_step == 1 and self.args.logging_first_step) or (
            self.args.logging_steps > 0 and self.state.global_step > 0
            and self.state.global_step % self.args.logging_steps == 0):
        labels = None
        has_labels = all(inputs.get(k) is not None for k in self.label_names)
        if has_labels:
            labels = nested_detach(tuple(inputs.get(name) for name in self.label_names))
            if len(labels) == 1:
                labels = labels[0]

        # Compute and log metrics only if labels are available
        if labels is not None:
            metrics = self.compute_scores(
                EvalPrediction(
                    predictions=(outputs["word_outputs"], outputs["indexes"]),
                    label_ids=labels,
                ))
            if self.wandb_callback is not None:
                self.wandb_callback.update_metrics(metrics)

    # Save past state if it exists
    # TODO: this needs to be fixed and made cleaner later.
    if self.args.past_index >= 0:
        self._past = outputs[self.args.past_index]

    # We don't use .loss here since the model may return tuples instead of ModelOutput.
    return outputs["loss"] if isinstance(outputs, dict) else outputs[0]
def evaluate_mc_style_verifier_with_reader_and_iselector(
    reader_logits,
    selector_logits,
    verifier_logits,
    label_dict,
):
    merge_ratio = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
    merge_predictions = {k: {r: [] for r in merge_ratio} for k in merge_ratio}
    label_list = []
    for example_id, label_id in label_dict.items():
        label_list.append(label_id)
        verifier_prob = torch.softmax(torch.tensor(verifier_logits[example_id]), -1)
        selector_prob = torch.softmax(torch.tensor(selector_logits[example_id]), -1)
        reader_prob = torch.softmax(torch.tensor(reader_logits[example_id]), -1)
        for merge_selector_ratio in merge_ratio:
            merge_selector_prediction = (
                merge_selector_ratio * selector_prob
                + (1 - merge_selector_ratio) * reader_prob)
            for merge_verifier_ratio in merge_ratio:
                merge_verifier_prediction = (
                    merge_verifier_ratio * verifier_prob
                    + (1 - merge_verifier_ratio) * merge_selector_prediction).tolist()
                merge_predictions[merge_selector_ratio][merge_verifier_ratio].append(merge_verifier_prediction)

    metrics = {}
    for merge_selector_ratio in merge_ratio:
        for merge_verifier_ratio in merge_ratio:
            metrics[f"selector_merge_{merge_selector_ratio}_verifier_merge_{merge_verifier_ratio}_acc"] = \
                compute_mc_metrics(
                    EvalPrediction(
                        predictions=merge_predictions[merge_selector_ratio][merge_verifier_ratio],
                        label_ids=label_list))['accuracy']
    return metrics
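# A minimal, self-contained sketch of the two-level probability interpolation performed above for
# a single multiple-choice example; the logits below are toy values, not from the original code.
import torch

reader_prob = torch.softmax(torch.tensor([2.0, 0.5, 0.1, 0.1]), -1)
selector_prob = torch.softmax(torch.tensor([0.2, 1.5, 0.1, 0.1]), -1)
verifier_prob = torch.softmax(torch.tensor([0.5, 0.5, 2.0, 0.1]), -1)

merge_selector_ratio, merge_verifier_ratio = 0.3, 0.1
merged = merge_selector_ratio * selector_prob + (1 - merge_selector_ratio) * reader_prob
final = merge_verifier_ratio * verifier_prob + (1 - merge_verifier_ratio) * merged
predicted_option = int(final.argmax())  # index of the option with the highest merged probability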
def prediction_loop(
    self, dataloader: DataLoader, description: str, prediction_loss_only: Optional[bool] = None
) -> PredictionOutput:
    """
    Prediction/evaluation loop, shared by :obj:`Trainer.evaluate()` and :obj:`Trainer.predict()`.

    Works both with or without labels.
    """
    if hasattr(self, "_prediction_loop"):
        warnings.warn(
            "The `_prediction_loop` method is deprecated and won't be called in a future version, define `prediction_loop` in your subclass.",
            FutureWarning,
        )
        return self._prediction_loop(dataloader, description, prediction_loss_only=prediction_loss_only)

    prediction_loss_only = (
        prediction_loss_only if prediction_loss_only is not None else self.args.prediction_loss_only
    )

    '''
    assert not getattr(
        self.model.config, "output_attentions", False
    ), "The prediction loop does not work with `output_attentions=True`."
    assert not getattr(
        self.model.config, "output_hidden_states", False
    ), "The prediction loop does not work with `output_hidden_states=True`."
    '''

    model = self.model
    # multi-gpu eval
    if self.args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    else:
        model = self.model
    # Note: in torch.distributed mode, there's no point in wrapping the model
    # inside a DistributedDataParallel as we'll be under `no_grad` anyways.

    '''
    batch_size = dataloader.batch_size
    logger.info("***** Running %s *****", description)
    logger.info(" Num examples = %d", self.num_examples(dataloader))
    logger.info(" Batch size = %d", batch_size)
    '''

    eval_losses: List[float] = []
    preds: torch.Tensor = None
    label_ids: torch.Tensor = None
    model.eval()

    if is_torch_tpu_available():
        dataloader = pl.ParallelLoader(dataloader, [self.args.device]).per_device_loader(self.args.device)

    if self.args.past_index >= 0:
        self._past = None

    disable_tqdm = not self.is_local_process_zero() or self.args.disable_tqdm
    for inputs in tqdm(dataloader, desc=description, disable=disable_tqdm):
        loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only)
        batch_size = inputs[list(inputs.keys())[0]].shape[0]
        if loss is not None:
            eval_losses.extend([loss] * batch_size)
        if logits is not None:
            preds = logits if preds is None else nested_concat(preds, logits, dim=0)
        if labels is not None:
            label_ids = labels if label_ids is None else nested_concat(label_ids, labels, dim=0)

    if self.args.past_index and hasattr(self, "_past"):
        # Clean the state at the end of the evaluation loop
        delattr(self, "_past")

    if self.args.local_rank != -1:
        # In distributed mode, concatenate all results from all nodes:
        if preds is not None:
            preds = distributed_concat(preds, num_total_examples=self.num_examples(dataloader))
        if label_ids is not None:
            label_ids = distributed_concat(label_ids, num_total_examples=self.num_examples(dataloader))
    elif is_torch_tpu_available():
        # tpu-comment: Get all predictions and labels from all worker shards of eval dataset
        if preds is not None:
            preds = nested_xla_mesh_reduce(preds, "eval_preds")
        if label_ids is not None:
            label_ids = nested_xla_mesh_reduce(label_ids, "eval_label_ids")
        if eval_losses is not None:
            eval_losses = xm.mesh_reduce("eval_losses", torch.tensor(eval_losses), torch.cat).tolist()

    # Finally, turn the aggregated tensors into numpy arrays.
    if preds is not None:
        preds = nested_numpify(preds)
    if label_ids is not None:
        label_ids = nested_numpify(label_ids)

    if self.compute_metrics is not None and preds is not None and label_ids is not None:
        metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids))
    else:
        metrics = {}
    if len(eval_losses) > 0:
        if self.args.local_rank != -1:
            metrics["eval_loss"] = (
                distributed_broadcast_scalars(eval_losses, num_total_examples=self.num_examples(dataloader))
                .mean()
                .item()
            )
        else:
            metrics["eval_loss"] = np.mean(eval_losses)

    # Prefix all keys with eval_
    for key in list(metrics.keys()):
        if not key.startswith("eval_"):
            metrics[f"eval_{key}"] = metrics.pop(key)

    return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics)
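# A minimal compute_metrics sketch in the shape the loops above expect: it receives an
# EvalPrediction and returns a dict of named scores. The accuracy metric and the
# classification-logits assumption are illustrative, not part of the original code.
import numpy as np
from transformers import EvalPrediction

def simple_accuracy_metrics(p: EvalPrediction) -> dict:
    logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(logits, axis=-1)
    return {"accuracy": float((preds == p.label_ids).mean())}

# e.g. simple_accuracy_metrics(EvalPrediction(predictions=np.eye(3), label_ids=np.array([0, 1, 2])))
# -> {"accuracy": 1.0}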
def _prediction_loop(
    self, dataloader: DataLoader, description: str, prediction_loss_only: Optional[bool] = None
) -> PredictionOutput:
    """
    Prediction/evaluation loop, shared by `evaluate()` and `predict()`.

    Works both with or without labels.
    """
    prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only

    model = self.model
    # multi-gpu eval
    if self.args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    else:
        model = self.model
    # Note: in torch.distributed mode, there's no point in wrapping the model
    # inside a DistributedDataParallel as we'll be under `no_grad` anyways.

    batch_size = dataloader.batch_size
    logger.info("***** Running %s *****", description)
    logger.info(" Num examples = %d", self.num_examples(dataloader))
    logger.info(" Batch size = %d", batch_size)
    eval_losses: List[float] = []
    preds: torch.Tensor = None
    label_ids: torch.Tensor = None
    model.eval()

    if is_torch_tpu_available():
        dataloader = pl.ParallelLoader(dataloader, [self.args.device]).per_device_loader(self.args.device)

    for inputs in tqdm(dataloader, desc=description):
        has_labels = any(
            inputs.get(k) is not None for k in ["labels", "lm_labels", "masked_lm_labels"])

        for k, v in inputs.items():
            inputs[k] = v.to(self.args.device)

        with torch.no_grad():
            outputs = model(**inputs)
            if has_labels:
                step_eval_loss, logits = outputs[:2]
                eval_losses += [step_eval_loss.mean().item()]
            else:
                logits = outputs[0]

        if not prediction_loss_only:
            if preds is None:
                preds = logits.detach()
            else:
                preds = torch.cat((preds, logits.detach()), dim=0)
            if inputs.get("labels") is not None:
                if label_ids is None:
                    label_ids = inputs["labels"].detach()
                else:
                    label_ids = torch.cat((label_ids, inputs["labels"].detach()), dim=0)

    if self.args.local_rank != -1:
        # In distributed mode, concatenate all results from all nodes:
        if preds is not None:
            preds = self.distributed_concat(preds, num_total_examples=self.num_examples(dataloader))
        if label_ids is not None:
            label_ids = self.distributed_concat(label_ids, num_total_examples=self.num_examples(dataloader))
    elif is_torch_tpu_available():
        # tpu-comment: Get all predictions and labels from all worker shards of eval dataset
        if preds is not None:
            preds = xm.mesh_reduce("eval_preds", preds, torch.cat)
        if label_ids is not None:
            label_ids = xm.mesh_reduce("eval_label_ids", label_ids, torch.cat)

    # Finally, turn the aggregated tensors into numpy arrays.
    if preds is not None:
        preds = preds.cpu().numpy()
    if label_ids is not None:
        label_ids = label_ids.cpu().numpy()

    if self.compute_metrics is not None and preds is not None and label_ids is not None:
        metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids))
    else:
        metrics = {}
    if len(eval_losses) > 0:
        metrics["eval_loss"] = np.mean(eval_losses)

    # Prefix all keys with eval_
    for key in list(metrics.keys()):
        if not key.startswith("eval_"):
            metrics[f"eval_{key}"] = metrics.pop(key)

    return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics)
def prediction_loop(
    self,
    dataloader: DataLoader,
    description: str,
    prediction_loss_only: Optional[bool] = None,
    ignore_keys: Optional[List[str]] = None,
    metric_key_prefix: str = "eval",
) -> PredictionOutput:
    """
    Prediction/evaluation loop, shared by :obj:`Trainer.evaluate()` and :obj:`Trainer.predict()`.

    Works both with or without labels.
    """
    if not isinstance(dataloader.dataset, collections.abc.Sized):
        raise ValueError("dataset must implement __len__")
    prediction_loss_only = (
        prediction_loss_only if prediction_loss_only is not None else self.args.prediction_loss_only
    )

    model = self.model
    # multi-gpu eval
    if self.args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    # Note: in torch.distributed mode, there's no point in wrapping the model
    # inside a DistributedDataParallel as we'll be under `no_grad` anyways.

    batch_size = dataloader.batch_size
    num_examples = self.num_examples(dataloader)
    logger.info("***** Running %s *****", description)
    logger.info(" Num examples = %d", num_examples)
    logger.info(" Batch size = %d", batch_size)
    losses_host: torch.Tensor = None
    preds_host: Union[torch.Tensor, List[torch.Tensor]] = None
    labels_host: Union[torch.Tensor, List[torch.Tensor]] = None

    world_size = 1
    if is_torch_tpu_available():
        world_size = xm.xrt_world_size()
    elif self.args.local_rank != -1:
        world_size = torch.distributed.get_world_size()
    world_size = max(1, world_size)

    eval_losses_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=batch_size)
    if not prediction_loss_only:
        preds_gatherer = DistributedTensorGatherer(world_size, num_examples)
        labels_gatherer = DistributedTensorGatherer(world_size, num_examples)

    model.eval()

    if is_torch_tpu_available():
        dataloader = pl.ParallelLoader(dataloader, [self.args.device]).per_device_loader(self.args.device)

    if self.args.past_index >= 0:
        self._past = None

    self.callback_handler.eval_dataloader = dataloader

    for step, inputs in enumerate(dataloader):
        loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys)
        if loss is not None:
            losses = loss.repeat(batch_size)
            losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0)
        if logits is not None:
            # preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100)
            logits_reduced = logits.argmax(-1)
            preds_host = logits_reduced if preds_host is None else nested_concat(preds_host, logits_reduced, padding_index=-100)
        if labels is not None:
            labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100)
        self.control = self.callback_handler.on_prediction_step(self.args, self.state, self.control)

        # Gather all tensors and put them back on the CPU if we have done enough accumulation steps.
        if self.args.eval_accumulation_steps is not None and (step + 1) % self.args.eval_accumulation_steps == 0:
            eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses"))
            if not prediction_loss_only:
                preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds"))
                labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids"))

            # Set back to None to begin a new accumulation
            losses_host, preds_host, labels_host = None, None, None

    if self.args.past_index and hasattr(self, "_past"):
        # Clean the state at the end of the evaluation loop
        delattr(self, "_past")

    # Gather all remaining tensors and put them back on the CPU
    eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses"))
    if not prediction_loss_only:
        preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds"))
        labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids"))

    eval_loss = eval_losses_gatherer.finalize()
    preds = preds_gatherer.finalize() if not prediction_loss_only else None
    label_ids = labels_gatherer.finalize() if not prediction_loss_only else None

    if self.compute_metrics is not None and preds is not None and label_ids is not None:
        metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids))
    else:
        metrics = {}

    if eval_loss is not None:
        metrics[f"{metric_key_prefix}_loss"] = eval_loss.mean().item()

    # Prefix all keys with metric_key_prefix + '_'
    for key in list(metrics.keys()):
        if not key.startswith(f"{metric_key_prefix}_"):
            metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)

    return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics)
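# A single-process sketch of the DistributedTensorGatherer pattern used by the loops above:
# numpy arrays are added batch by batch and finalize() concatenates and truncates them to
# num_samples. The toy sizes below are illustrative only.
import numpy as np
from transformers.trainer_pt_utils import DistributedTensorGatherer

gatherer = DistributedTensorGatherer(world_size=1, num_samples=5)
gatherer.add_arrays(np.arange(3))       # first "batch" of per-example values
gatherer.add_arrays(np.arange(3, 5))    # second "batch"
assert gatherer.finalize().tolist() == [0, 1, 2, 3, 4]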
def prediction_loop(self, data_loader, world_size):
    num_examples = len(data_loader.dataset)
    batch_size = data_loader.batch_size
    eval_losses_gatherer = DistributedTensorGatherer(
        world_size, num_examples, make_multiple_of=batch_size)
    preds_gatherer = DistributedTensorGatherer(world_size, num_examples)
    labels_gatherer = DistributedTensorGatherer(world_size, num_examples)

    losses_host, preds_host, labels_host = None, None, None
    self.model.eval()
    for step, inputs in enumerate(data_loader):
        loss, logits, labels = self.prediction_step(inputs)
        losses = loss.repeat(batch_size)
        losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0)
        preds_host = logits if preds_host is None else trainer_pt_utils.nested_concat(
            preds_host, logits, padding_index=-100)
        labels_host = labels if labels_host is None else trainer_pt_utils.nested_concat(
            labels_host, labels, padding_index=-100)

        eval_losses_gatherer.add_arrays(trainer_pt_utils.nested_numpify(losses_host))
        preds_gatherer.add_arrays(trainer_pt_utils.nested_numpify(preds_host))
        labels_gatherer.add_arrays(trainer_pt_utils.nested_numpify(labels_host))
        losses_host, preds_host, labels_host = None, None, None

    eval_loss = eval_losses_gatherer.finalize()
    preds = preds_gatherer.finalize()
    labels_ids = labels_gatherer.finalize()

    if self.type_score == "PER":
        preds_ids = np.argmax(preds, axis=-1)
        predicted_phonemes = self.processor.batch_decode(torch.from_numpy(preds_ids))
        true_phonemes = self.processor.batch_decode(torch.from_numpy(labels_ids))
        per = generate_per_score(true_phonemes, predicted_phonemes)
        return per
    elif self.type_score == "WER":
        pred = EvalPrediction(predictions=preds, label_ids=labels_ids)
        pred_logits = pred.predictions
        pred_ids = np.argmax(pred_logits, axis=-1)
        pred.label_ids[pred.label_ids == -100] = self.processor.tokenizer.pad_token_id
        pred_str = self.processor.batch_decode(pred_ids)
        # we do not want to group tokens when computing the metrics
        label_str = self.processor.batch_decode(pred.label_ids, group_tokens=False)
        metrics = compute_wer(pred_str, label_str)
        metrics = denumpify_detensorize(metrics)
        metrics["t_loss"] = eval_loss.mean().item()
        wer = PredictionOutput(preds, labels_ids, metrics).metrics["wer"]
        return wer
def prediction_loop(
    self,
    dataset: tf.data.Dataset,
    steps: int,
    num_examples: int,
    description: str,
    prediction_loss_only: Optional[bool] = None,
) -> PredictionOutput:
    """
    Prediction/evaluation loop, shared by :func:`~transformers.TFTrainer.evaluate` and
    :func:`~transformers.TFTrainer.predict`.

    Works both with or without labels.
    """
    prediction_loss_only = (
        prediction_loss_only if prediction_loss_only is not None else self.args.prediction_loss_only
    )

    logger.info("***** Running %s *****", description)
    logger.info(" Num examples in dataset = %d", num_examples)
    if description == "Evaluation":
        logger.info(" Num examples used in evaluation = %d", self.args.eval_batch_size * steps)
    logger.info(" Batch size = %d", self.args.eval_batch_size)

    label_ids: np.ndarray = None
    preds: np.ndarray = None
    self.eval_loss.reset_states()

    # Reset the past mems state at the beginning of the evaluation if necessary.
    if self.args.past_index >= 0:
        self._past = None

    for step, batch in enumerate(dataset):
        logits = self.distributed_prediction_steps(batch)
        _, labels = batch

        if not prediction_loss_only:
            if isinstance(logits, tuple):
                logits = logits[0]

            if isinstance(labels, tuple):
                labels = labels[0]

            if self.args.n_replicas > 1:
                for val in logits.values:
                    if preds is None:
                        preds = val.numpy()
                    else:
                        preds = np.append(preds, val.numpy(), axis=0)

                for val in labels.values:
                    if label_ids is None:
                        label_ids = val.numpy()
                    else:
                        label_ids = np.append(label_ids, val.numpy(), axis=0)
            else:
                if preds is None:
                    preds = logits.numpy()
                else:
                    preds = np.append(preds, logits.numpy(), axis=0)

                if label_ids is None:
                    label_ids = labels.numpy()
                else:
                    label_ids = np.append(label_ids, labels.numpy(), axis=0)

        if step == steps - 1:
            break

    if self.compute_metrics is not None and preds is not None and label_ids is not None:
        metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids))
    else:
        metrics = {}

    metrics["eval_loss"] = self.eval_loss.result().numpy() / steps

    for key in list(metrics.keys()):
        if not key.startswith("eval_"):
            metrics[f"eval_{key}"] = metrics.pop(key)

    if self.args.past_index and hasattr(self, "_past"):
        # Clean the state at the end of training
        delattr(self, "_past")

    return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics)
def prediction_loop(
    self,
    dataloader: DataLoader,
    description: str,
    prediction_loss_only: Optional[bool] = None,
    ignore_keys: Optional[List[str]] = None,
    metric_key_prefix: str = "eval",
) -> PredictionOutput:
    """
    Prediction/evaluation loop, shared by :obj:`Trainer.evaluate()` and :obj:`Trainer.predict()`.

    Works both with or without labels.
    """
    if not isinstance(dataloader.dataset, collections.abc.Sized):
        raise ValueError("dataset must implement __len__")
    prediction_loss_only = (
        prediction_loss_only if prediction_loss_only is not None else self.args.prediction_loss_only
    )

    if self.args.deepspeed and not self.args.do_train:
        # no harm, but flagging to the user that deepspeed config is ignored for eval
        # flagging only for when --do_train wasn't passed as only then it's redundant
        logger.info("Detected the deepspeed argument but it will not be used for evaluation")

    model = self._wrap_model(self.model, training=False)

    # if full fp16 is wanted on eval and this ``evaluation`` or ``predict`` isn't called while
    # ``train`` is running, half it first and then put on device
    if not self.is_in_train and self.args.fp16_full_eval:
        model = model.half().to(self.args.device)

    batch_size = dataloader.batch_size
    num_examples = self.num_examples(dataloader)
    logger.info("***** Running %s *****", description)
    logger.info(" Num examples = %d", num_examples)
    logger.info(" Batch size = %d", batch_size)

    model.eval()

    self.callback_handler.eval_dataloader = dataloader

    re_labels = None
    pred_relations = None
    entities = None
    for step, inputs in enumerate(dataloader):
        outputs, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys)
        re_labels = labels[1] if re_labels is None else re_labels + labels[1]
        pred_relations = (
            outputs.pred_relations if pred_relations is None else pred_relations + outputs.pred_relations
        )
        entities = outputs.entities if entities is None else entities + outputs.entities

        self.control = self.callback_handler.on_prediction_step(self.args, self.state, self.control)

    gt_relations = []
    for b in range(len(re_labels)):
        rel_sent = []
        for head, tail in zip(re_labels[b]["head"], re_labels[b]["tail"]):
            rel = {}
            rel["head_id"] = head
            rel["head"] = (entities[b]["start"][rel["head_id"]], entities[b]["end"][rel["head_id"]])
            rel["head_type"] = entities[b]["label"][rel["head_id"]]

            rel["tail_id"] = tail
            rel["tail"] = (entities[b]["start"][rel["tail_id"]], entities[b]["end"][rel["tail_id"]])
            rel["tail_type"] = entities[b]["label"][rel["tail_id"]]

            rel["type"] = 1
            rel_sent.append(rel)
        gt_relations.append(rel_sent)

    re_metrics = self.compute_metrics(EvalPrediction(predictions=pred_relations, label_ids=gt_relations))

    re_metrics = {
        "precision": re_metrics["ALL"]["p"],
        "recall": re_metrics["ALL"]["r"],
        "f1": re_metrics["ALL"]["f1"],
    }
    re_metrics[f"{metric_key_prefix}_loss"] = outputs.loss.mean().item()

    metrics = {}

    # Prefix all keys with metric_key_prefix + '_'
    for key in list(re_metrics.keys()):
        if not key.startswith(f"{metric_key_prefix}_"):
            metrics[f"{metric_key_prefix}_{key}"] = re_metrics.pop(key)
        else:
            metrics[f"{key}"] = re_metrics.pop(key)

    return metrics
def _prediction_loop(
    self, dataloader: DataLoader, description: str, prediction_loss_only: Optional[bool] = None
) -> PredictionOutput:
    """
    Prediction/evaluation loop, shared by `evaluate()` and `predict()`.

    Works both with or without labels.
    """
    prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only

    # multi-gpu eval
    if self.args.n_gpu > 1 and not isinstance(self.model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(self.model)
    else:
        model = self.model
    model.to(self.args.device)

    if is_tpu_available():
        batch_size = dataloader._loader._loader.batch_size
    else:
        batch_size = dataloader.batch_size
    logger.info("***** Running %s *****", description)
    logger.info(" Num examples = %d", self.num_examples(dataloader))
    logger.info(" Batch size = %d", batch_size)
    eval_losses: List[float] = []
    preds_t1: np.ndarray = None
    preds_t2: np.ndarray = None
    label_ids_t1: np.ndarray = None
    label_ids_t2: np.ndarray = None
    model.eval()

    for inputs in tqdm(dataloader, desc=description):
        has_labels = any(
            inputs.get(k) is not None
            for k in ["labels", "labels_t1", "labels_t2", "lm_labels", "masked_lm_labels"])

        for k, v in inputs.items():
            inputs[k] = v.to(self.args.device)

        with torch.no_grad():
            outputs = model(**inputs)
            if has_labels:
                if self.alternate:
                    step_eval_loss, logits, task = outputs[:3]
                else:
                    step_eval_loss, logits_t1, logits_t2 = outputs[:3]
                eval_losses += [step_eval_loss.mean().item()]
            else:
                logits = outputs[0]

        if self.alternate:
            if not prediction_loss_only:
                if task == 0:
                    if preds_t1 is None:
                        preds_t1 = logits.detach().cpu().numpy()
                    else:
                        preds_t1 = np.append(preds_t1, logits.detach().cpu().numpy(), axis=0)
                    if inputs.get("labels") is not None:
                        if label_ids_t1 is None:
                            label_ids_t1 = inputs["labels"].detach().cpu().numpy()
                        else:
                            label_ids_t1 = np.append(
                                label_ids_t1, inputs["labels"].detach().cpu().numpy(), axis=0)
                elif task == 1:
                    if preds_t2 is None:
                        preds_t2 = logits.detach().cpu().numpy()
                    else:
                        preds_t2 = np.append(preds_t2, logits.detach().cpu().numpy(), axis=0)
                    if inputs.get("labels") is not None:
                        if label_ids_t2 is None:
                            label_ids_t2 = inputs["labels"].detach().cpu().numpy()
                        else:
                            label_ids_t2 = np.append(
                                label_ids_t2, inputs["labels"].detach().cpu().numpy(), axis=0)
        else:
            if not prediction_loss_only:
                if preds_t1 is None or preds_t2 is None:
                    preds_t1 = logits_t1.detach().cpu().numpy()
                    preds_t2 = logits_t2.detach().cpu().numpy()
                else:
                    preds_t1 = np.append(preds_t1, logits_t1.detach().cpu().numpy(), axis=0)
                    preds_t2 = np.append(preds_t2, logits_t2.detach().cpu().numpy(), axis=0)
                if inputs.get("labels_t1") is not None:
                    if label_ids_t1 is None or label_ids_t2 is None:
                        label_ids_t1 = inputs["labels_t1"].detach().cpu().numpy()
                        label_ids_t2 = inputs["labels_t2"].detach().cpu().numpy()
                    else:
                        label_ids_t1 = np.append(
                            label_ids_t1, inputs["labels_t1"].detach().cpu().numpy(), axis=0)
                        label_ids_t2 = np.append(
                            label_ids_t2, inputs["labels_t2"].detach().cpu().numpy(), axis=0)

    # if is_tpu_available() and preds is not None and label_ids is not None:
    #     # tpu-comment: Get all predictions and labels from all worker shards of eval dataset
    #     preds = xm.mesh_reduce("eval_preds", preds, np.concatenate)
    #     label_ids = xm.mesh_reduce("eval_out_label_ids", label_ids, np.concatenate)

    metrics = {}
    if self.compute_metrics is not None:
        if preds_t1 is not None and label_ids_t1 is not None:
            metrics["task 1"] = self.compute_metrics(
                EvalPrediction(predictions=preds_t1, label_ids=label_ids_t1))
        if preds_t2 is not None and label_ids_t2 is not None:
            metrics["task 2"] = self.compute_metrics(
                EvalPrediction(predictions=preds_t2, label_ids=label_ids_t2))

    if len(eval_losses) > 0:
        metrics["eval_loss"] = np.mean(eval_losses)

    # Prefix all keys with eval_
    for key in list(metrics.keys()):
        if not key.startswith("eval_"):
            metrics[f"eval_{key}"] = metrics.pop(key)

    return (PredictionOutput(predictions=preds_t1, label_ids=label_ids_t1, metrics=metrics),
            PredictionOutput(predictions=preds_t2, label_ids=label_ids_t2, metrics=metrics))
def _prediction_loop(
    self, dataloader: DataLoader, description: str, prediction_loss_only: Optional[bool] = None
) -> PredictionOutput:
    """
    Prediction/evaluation loop, shared by `evaluate()` and `predict()`.

    Works both with or without labels.
    """
    prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only

    # multi-gpu eval
    if self.args.n_gpu > 1 and not isinstance(self.model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(self.model)
    else:
        model = self.model
    model.to(self.args.device)

    if is_tpu_available():
        batch_size = dataloader._loader._loader.batch_size
    else:
        batch_size = dataloader.batch_size
    logger.info("***** Running %s *****", description)
    logger.info(" Num examples = %d", self.num_examples(dataloader))
    logger.info(" Batch size = %d", batch_size)
    eval_losses: List[float] = []
    eval_tag_losses = []
    eval_gen_losses = []
    eval_cov_losses = []
    preds = []
    label_ids = []
    model.eval()

    for inputs in tqdm(dataloader, desc=description):
        for k, v in inputs.items():
            inputs[k] = v.to(self.args.device)

        with torch.no_grad():
            outputs = model(**inputs)
            step_eval_loss, logits = outputs[:2]
            other_loss = outputs[-1]
            eval_losses += [step_eval_loss.mean().item()]
            eval_tag_losses += [other_loss['tag_loss'].mean().item()]
            eval_gen_losses += [other_loss['gen_loss'].mean().item()]
            eval_cov_losses += [other_loss['cov_loss'].mean().item()]

        if not prediction_loss_only:
            preds.append(logits.detach().cpu().numpy().argmax(-1))
            if inputs.get("tgt_token") is not None:
                label_ids.append(inputs["tgt_token"][:, 1:].detach().cpu().numpy())

    if is_tpu_available():
        # tpu-comment: Get all predictions and labels from all worker shards of eval dataset
        preds = xm.mesh_reduce("eval_preds", preds, np.concatenate)
        label_ids = xm.mesh_reduce("eval_out_label_ids", label_ids, np.concatenate)

    if self.compute_metrics is not None and preds is not None and label_ids is not None:
        metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids))
    else:
        metrics = {}
    if len(eval_losses) > 0:
        metrics["eval_loss"] = np.mean(eval_losses)
    if len(eval_tag_losses) > 0:
        metrics["eval_tag_loss"] = np.mean(eval_tag_losses)
    if len(eval_gen_losses) > 0:
        metrics["eval_gen_loss"] = np.mean(eval_gen_losses)
    if len(eval_cov_losses) > 0:
        metrics["eval_cov_loss"] = np.mean(eval_cov_losses)
        if metrics["eval_cov_loss"] != 0:
            metrics["eval_loss"] = metrics["eval_tag_loss"] + metrics["eval_gen_loss"]

    # Prefix all keys with eval_
    for key in list(metrics.keys()):
        if not key.startswith("eval_"):
            metrics[f"eval_{key}"] = metrics.pop(key)

    return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics)
def evaluate_answer_verifier_with_explicit_reader(
    self,
    evidence_reader,
    multiple_choice_dataset,
    answer_verifier_dataset,
):
    evidence_reader = evidence_reader.to(self.args.device)
    _model = self.model
    self.model = evidence_reader
    evidence_reader_output = self.evaluate(
        multiple_choice_dataset,
        description="Evaluation",
        metric_key_prefix="evidence_reader",
        compute_metrics=compute_mc_metrics)
    self.model = _model

    answer_verifier_output = self.evaluate(
        answer_verifier_dataset,
        description="Evaluation",
        metric_key_prefix="intensive_selector",
        compute_metrics=compute_mc_metrics)

    evidence_reader_predictions = {}
    answer_verifier_predictions = {}
    labels = {}
    for prediction, label_id, example_id in zip(*evidence_reader_output[:-1]):
        evidence_reader_predictions[example_id] = torch.softmax(torch.tensor(prediction), -1)
        labels[example_id] = label_id
    for prediction, label_id, example_id in zip(*answer_verifier_output[:-1]):
        answer_verifier_predictions[example_id] = torch.softmax(torch.tensor(prediction), -1)
        assert labels[example_id] == label_id

    merge_ratio = [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 1]
    merge_prediction = {k: [] for k in merge_ratio}
    merge_prediction_dict = {k: {} for k in merge_ratio}
    label_list = []
    all_example_ids = []
    for example_id, label_id in labels.items():
        all_example_ids.append(example_id)
        label_list.append(label_id)
        answer_verifier_prediction = answer_verifier_predictions[example_id]
        evidence_reader_prediction = evidence_reader_predictions[example_id]
        for ratio in merge_ratio:
            merge_prediction[ratio].append(
                (ratio * answer_verifier_prediction
                 + (1 - ratio) * evidence_reader_prediction).tolist())
            merge_prediction_dict[ratio][example_id] = (
                ratio * answer_verifier_prediction
                + (1 - ratio) * evidence_reader_prediction).tolist()

    all_merged_results = {}
    for ratio in merge_ratio:
        merged_results = {
            f'merge_{ratio}_{k}': v
            for k, v in compute_mc_metrics(
                EvalPrediction(predictions=merge_prediction[ratio], label_ids=label_list),
                all_example_ids=all_example_ids).items()
        }
        all_merged_results = {**all_merged_results, **merged_results}

    metrics = {**evidence_reader_output.metrics, **answer_verifier_output.metrics}
    metrics = {**metrics, **all_merged_results}
    return metrics, merge_prediction_dict
def _prediction_loop(
    self, dataloader: DataLoader, description: str, prediction_loss_only: Optional[bool] = None
) -> PredictionOutput:
    prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only

    model = self.model
    batch_size = dataloader.batch_size
    logger.info("***** Running %s *****", description)
    logger.info(" Num examples = %d", self.num_examples(dataloader))
    logger.info(" Batch size = %d", batch_size)
    eval_losses: List[float] = []
    preds: torch.Tensor = None
    label_ids: torch.Tensor = None
    model.eval()

    for inputs in tqdm(dataloader, desc=description):
        has_labels = any(
            inputs.get(k) is not None for k in ["labels", "lm_labels", "masked_lm_labels"])

        for k, v in inputs.items():
            inputs[k] = v.to(self.device)

        with torch.no_grad():
            outputs = model(**inputs)
            if has_labels:
                step_eval_loss, logits = outputs[:2]
                eval_losses += [step_eval_loss.mean().item()]
            else:
                logits = outputs[0]

        if not prediction_loss_only:
            if preds is None:
                preds = logits.detach()
            else:
                preds = torch.cat((preds, logits.detach()), dim=0)
            if inputs.get("labels") is not None:
                if label_ids is None:
                    label_ids = inputs["labels"].detach()
                else:
                    label_ids = torch.cat((label_ids, inputs["labels"].detach()), dim=0)

    if self.args.local_rank != -1:
        # In distributed mode, concatenate all results from all nodes:
        if preds is not None:
            preds = self.distributed_concat(preds, num_total_examples=self.num_examples(dataloader))
        if label_ids is not None:
            label_ids = self.distributed_concat(label_ids, num_total_examples=self.num_examples(dataloader))

    if preds is not None:
        preds = preds.cpu().numpy()
    if label_ids is not None:
        label_ids = label_ids.cpu().numpy()

    if self.compute_metrics is not None and preds is not None and label_ids is not None:
        metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids))
    else:
        metrics = {}
    if len(eval_losses) > 0:
        metrics["eval_loss"] = np.mean(eval_losses)

    # Prefix all keys with eval_
    for key in list(metrics.keys()):
        if not key.startswith("eval_"):
            metrics[f"eval_{key}"] = metrics.pop(key)

    return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics)
def evidence_reading(self,
                     evidence_reader,
                     eval_dataset,
                     prepare_feature_func,
                     metric_key_prefix="fulleval"):
    evidence_reader = evidence_reader.to(self.args.device)
    evidence_reading_data_collator = DataCollatorForMultipleChoice(tokenizer=self.tokenizer)

    processed_datasets = eval_dataset.map(
        prepare_feature_func,
        batched=True,
        remove_columns=eval_dataset.column_names,
        load_from_cache_file=False,
    )
    if 'evidence_sentence' in processed_datasets.column_names:
        evidence_sentences = {
            eid: evidence_sent
            for eid, evidence_sent in zip(
                processed_datasets['example_ids'], processed_datasets['evidence_sentence'])
        }
        processed_datasets = processed_datasets.remove_columns("evidence_sentence")
    else:
        evidence_sentences = {}

    start_time = time.time()
    evidence_generator = self.model
    self.model = evidence_reader
    output = self.evaluate(processed_datasets,
                           data_collator=evidence_reading_data_collator,
                           description="Evaluation",
                           metric_key_prefix=metric_key_prefix,
                           compute_metrics=compute_mc_metrics)
    self.model = evidence_generator

    answer_dict = {}
    for orig_example_id, answer in zip(eval_dataset['example_id'], eval_dataset['answer']):
        answer_dict[orig_example_id] = ord(answer) - ord('A')

    is_answer_option = []
    for processed_example_id in output.example_ids:
        orig_example_id = processed_example_id[:-2]
        corresponding_option = int(processed_example_id[-1])
        is_answer_option.append(int(corresponding_option == answer_dict[orig_example_id]))

    right_option_acc = compute_mc_metrics(
        EvalPrediction(predictions=output.predictions, label_ids=output.label_ids),
        is_answer_option)
    wrong_option_acc = compute_mc_metrics(
        EvalPrediction(predictions=output.predictions, label_ids=output.label_ids),
        1 - np.array(is_answer_option))
    output.metrics.update({
        f'{metric_key_prefix}_right_acc': right_option_acc['accuracy'],
        f'{metric_key_prefix}_wrong_acc': wrong_option_acc['accuracy']
    })

    n_samples = len(processed_datasets)
    output.metrics.update(speed_metrics(metric_key_prefix, start_time, n_samples))
    return output.metrics, evidence_sentences
def evaluate(self,
             dataset,
             data_collator=None,
             description="",
             metric_key_prefix="eval",
             compute_metrics=None):
    # prediction with a single device
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(
        dataset,
        sampler=eval_sampler,
        batch_size=self.args.eval_batch_size,
        collate_fn=self.data_collator if data_collator is None else data_collator,
        num_workers=self.args.dataloader_num_workers)

    batch_size = eval_dataloader.batch_size
    num_examples = len(eval_dataloader.dataset)
    logger.info("***** Running {} *****".format(description))
    logger.info(" Num examples = %d", len(dataset))
    logger.info(" Batch size = %d", self.args.eval_batch_size)
    losses_host: torch.Tensor = None
    preds_host: Union[torch.Tensor, List[torch.Tensor]] = None
    labels_host: Union[torch.Tensor, List[torch.Tensor]] = None

    world_size = max(1, self.args.world_size)
    compute_metrics = self.compute_metrics if compute_metrics is None else compute_metrics
    prediction_loss_only = True if compute_metrics is None else None

    eval_losses_gatherer = DistributedTensorGatherer(
        world_size, num_examples, make_multiple_of=batch_size)
    if not prediction_loss_only:
        # The actual number of eval_sample can be greater than num_examples in distributed settings (when we pass
        # a batch size to the sampler)
        make_multiple_of = None
        if hasattr(eval_dataloader, "sampler") and isinstance(eval_dataloader.sampler, SequentialDistributedSampler):
            make_multiple_of = eval_dataloader.sampler.batch_size
        preds_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of)
        labels_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of)

    model = self._wrap_model(self.model)
    model.eval()

    all_example_ids = []
    start_time = timeit.default_timer()
    for step, inputs in enumerate(tqdm(eval_dataloader)):
        if 'example_ids' in inputs.keys():
            example_ids = inputs.pop('example_ids')
            all_example_ids += example_ids
        loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only)
        if loss is not None:
            losses = loss.repeat(eval_dataloader.batch_size)
            losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0)
        if logits is not None:
            preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100)
        if labels is not None:
            labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100)

    # Gather all remaining tensors and put them back on the CPU
    eval_losses_gatherer.add_arrays(nested_numpify(losses_host))
    if not prediction_loss_only:
        preds_gatherer.add_arrays(nested_numpify(preds_host))
        labels_gatherer.add_arrays(nested_numpify(labels_host))

    eval_loss = eval_losses_gatherer.finalize()
    preds = preds_gatherer.finalize() if not prediction_loss_only else None
    label_ids = labels_gatherer.finalize() if not prediction_loss_only else None

    if compute_metrics is not None and preds is not None and label_ids is not None:
        metrics = compute_metrics(
            EvalPrediction(predictions=preds, label_ids=label_ids),
            all_example_ids=all_example_ids if len(all_example_ids) > 0 else None)
    else:
        metrics = {}

    # To be JSON-serializable, we need to remove numpy types or zero-d tensors
    metrics = denumpify_detensorize(metrics)

    eval_time = timeit.default_timer() - start_time
    logger.info(" Evaluation done in total %f secs (%f sec per example)",
                eval_time, eval_time / len(dataset))

    if eval_loss is not None:
        metrics[f"{metric_key_prefix}_loss"] = eval_loss.mean().item()

    # Prefix all keys with metric_key_prefix + '_'
    for key in list(metrics.keys()):
        if not key.startswith(f"{metric_key_prefix}_"):
            metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)

    return PredictionOutput(
        predictions=preds,
        label_ids=label_ids,
        metrics=metrics,
        example_ids=None if len(all_example_ids) == 0 else all_example_ids)
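# The denumpify_detensorize helper used above converts numpy scalars (and zero-d tensors) in a
# metrics dict into plain Python numbers so the dict is JSON-serializable; a small runnable check:
import numpy as np
from transformers.trainer_utils import denumpify_detensorize

metrics = denumpify_detensorize({"accuracy": np.float32(0.875), "count": np.int64(8)})
assert metrics == {"accuracy": 0.875, "count": 8}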
def _prediction_loop(
    self, dataset: tf.data.Dataset, description: str, prediction_loss_only: Optional[bool] = None
) -> PredictionOutput:
    """
    Prediction/evaluation loop, shared by `evaluate()` and `predict()`.

    Works both with or without labels.
    """
    prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only

    logger.info("***** Running %s *****", description)
    logger.info(" Batch size = %d", self.args.eval_batch_size)

    label_ids: np.ndarray = None
    preds: np.ndarray = None
    step: int = 1

    for features, labels in dataset:
        step = tf.convert_to_tensor(step, dtype=tf.int64)
        loss, logits = self._evaluate_steps(features, labels)
        loss = tf.reduce_mean(loss)

        if not prediction_loss_only:
            if isinstance(logits, tuple):
                logits = logits[0]

            if isinstance(labels, tuple):
                labels = labels[0]

            if self.args.n_gpu > 1:
                for val in logits.values:
                    if preds is None:
                        preds = val.numpy()
                    else:
                        preds = np.append(preds, val.numpy(), axis=0)

                for val in labels.values:
                    if label_ids is None:
                        label_ids = val.numpy()
                    else:
                        label_ids = np.append(label_ids, val.numpy(), axis=0)
            else:
                if preds is None:
                    preds = logits.numpy()
                else:
                    preds = np.append(preds, logits.numpy(), axis=0)

                if label_ids is None:
                    label_ids = labels.numpy()
                else:
                    label_ids = np.append(label_ids, labels.numpy(), axis=0)

        step += 1

    if self.compute_metrics is not None and preds is not None and label_ids is not None:
        metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids))
    else:
        metrics = {}

    metrics["eval_loss"] = loss.numpy()

    for key in list(metrics.keys()):
        if not key.startswith("eval_"):
            metrics[f"eval_{key}"] = metrics.pop(key)

    return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics)