def post_processing_function(
        examples: datasets.Dataset, features: datasets.Dataset, outputs: EvalLoopOutput, stage="eval"
    ):
        # Decode the predicted tokens.
        preds = outputs.predictions
        if isinstance(preds, tuple):
            preds = preds[0]
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

        # Build a map from each example to its corresponding feature.
        example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
        feature_per_example = {example_id_to_index[feature["example_id"]]: i for i, feature in enumerate(features)}
        predictions = {}
        # Let's loop over all the examples!
        for example_index, example in enumerate(examples):
            # This is the index of the feature associated with the current example.
            feature_index = feature_per_example[example_index]
            predictions[example["id"]] = decoded_preds[feature_index]

        # Format the result to the format the metric expects.
        if data_args.version_2_with_negative:
            formatted_predictions = [
                {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items()
            ]
        else:
            formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]

        references = [{"id": ex["id"], "answers": ex[answer_column]} for ex in examples]
        return EvalPrediction(predictions=formatted_predictions, label_ids=references)
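For reference, the EvalPrediction assembled above is normally handed to a compute_metrics callback. A minimal sketch of that counterpart, assuming the `evaluate` library's SQuAD metrics (the metric choice and the `data_args` flag mirror the snippet above and are not part of it):

import evaluate
from transformers import EvalPrediction

metric = evaluate.load("squad_v2" if data_args.version_2_with_negative else "squad")

def compute_metrics(p: EvalPrediction):
    # p.predictions / p.label_ids are the formatted predictions and references
    # returned by post_processing_function above.
    return metric.compute(predictions=p.predictions, references=p.label_ids)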
Example #2
def evaluate():
    model.eval()
    eval_losses: List[float] = []
    preds: torch.Tensor = None
    label_ids: torch.Tensor = None

    for inputs in tqdm(eval_iterator):
        loss, logits, labels = self.prediction_step(model, inputs,
                                                    prediction_loss_only)
        if loss is not None:
            eval_losses.append(loss)
        if logits is not None:
            preds = logits if preds is None else torch.cat(
                (preds, logits), dim=0)
        if labels is not None:
            label_ids = labels if label_ids is None else torch.cat(
                (label_ids, labels), dim=0)

    # Finally, turn the aggregated tensors into numpy arrays.
    if preds is not None:
        preds = preds.cpu().numpy()
    if label_ids is not None:
        label_ids = label_ids.cpu().numpy()

    if self.compute_metrics is not None and preds is not None and label_ids is not None:
        metrics = self.compute_metrics(
            EvalPrediction(predictions=preds, label_ids=label_ids))
    else:
        metrics = {}
    if len(eval_losses) > 0:
        metrics["eval_loss"] = np.mean(eval_losses)
Example #3
 def post_proc(self, xs, features, outs, stage="eval"):
     ps = self.params
     preds = outs.predictions
     if isinstance(preds, tuple):
         preds = preds[0]
     preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
     example_id_to_index = {k: i for i, k in enumerate(xs["id"])}
     feature_per_example = {example_id_to_index[x["example_id"]]: i for i, x in enumerate(features)}
     ys = {}
     for i, x in enumerate(xs):
         ys[x["id"]] = preds[feature_per_example[i]]
     if ps.version_2_with_negative:
         ys = [
             {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in ys.items()
         ]
     else:
         ys = [{"id": k, "prediction_text": v} for k, v in ys.items()]
     ls = [{"id": x["id"], "answers": x[self.cols[EACH][2]]} for x in xs]
     return EvalPrediction(predictions=ys, label_ids=ls)
Example #4
    def prediction_loop(self, *args, **kwargs) -> PredictionOutput:
        pred_outs = super().prediction_loop(*args, **kwargs)
        preds, label_ids, metrics = pred_outs.predictions, pred_outs.label_ids, pred_outs.metrics
        preds = preds.squeeze()
        if self.compute_metrics is not None:
            metrics_no_label = self.compute_metrics(
                EvalPrediction(predictions=preds, label_ids=label_ids))
        else:
            metrics_no_label = {}

        for key in list(metrics_no_label.keys()):
            if not key.startswith("eval_"):
                metrics_no_label[f"eval_{key}"] = metrics_no_label.pop(key)

        return PredictionOutput(predictions=preds,
                                label_ids=label_ids,
                                metrics={
                                    **metrics,
                                    **metrics_no_label
                                })
    def compute_loss(self, model, inputs):
        """
        Override loss computation to calculate and log metrics
        during training
        """
        outputs = model(**inputs)

        # Custom logging steps (to log training metrics)
        if (self.state.global_step == 1 and self.args.logging_first_step) or (
                self.args.logging_steps > 0 and self.state.global_step > 0
                and self.state.global_step % self.args.logging_steps == 0):
            labels = None
            has_labels = all(
                inputs.get(k) is not None for k in self.label_names)
            if has_labels:
                labels = nested_detach(
                    tuple(inputs.get(name) for name in self.label_names))
                if len(labels) == 1:
                    labels = labels[0]

            # Compute and log metrics only if labels are available
            if labels is not None:
                metrics = self.compute_scores(
                    EvalPrediction(
                        predictions=(outputs["word_outputs"],
                                     outputs["indexes"]),
                        label_ids=labels,
                    ))
                if self.wandb_callback is not None:
                    self.wandb_callback.update_metrics(metrics)

        # Save past state if it exists
        # TODO: this needs to be fixed and made cleaner later.
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]
        # We don't use .loss here since the model may return tuples instead of ModelOutput.
        return outputs["loss"] if isinstance(outputs, dict) else outputs[0]
Example #6
def evaluate_mc_style_verifier_with_reader_and_iselector(
    reader_logits,
    selector_logits,
    verifier_logits,
    label_dict,
):
    merge_ratio = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
    merge_predictions = {k: {r: [] for r in merge_ratio} for k in merge_ratio}
    label_list = []
    for example_id, label_id in label_dict.items():
        label_list.append(label_id)
        verifier_prob = torch.softmax(
            torch.tensor(verifier_logits[example_id]), -1)
        selector_prob = torch.softmax(
            torch.tensor(selector_logits[example_id]), -1)
        reader_prob = torch.softmax(torch.tensor(reader_logits[example_id]),
                                    -1)
        for merge_selector_ratio in merge_ratio:
            merge_selector_prediction = (
                merge_selector_ratio * selector_prob +
                (1 - merge_selector_ratio) * reader_prob)
            for merge_verifier_ratio in merge_ratio:
                merge_verifier_prediction = (
                    merge_verifier_ratio * verifier_prob +
                    (1 - merge_verifier_ratio) *
                    merge_selector_prediction).tolist()
                merge_predictions[merge_selector_ratio][
                    merge_verifier_ratio].append(merge_verifier_prediction)

    metrics = {}
    for merge_selector_ratio in merge_ratio:
        for merge_verifier_ratio in merge_ratio:
            metrics[f"selector_merge_{merge_selector_ratio}_verifier_merge_{merge_verifier_ratio}_acc"] = \
                compute_mc_metrics(
                    EvalPrediction(predictions=merge_predictions[merge_selector_ratio][merge_verifier_ratio],
                                   label_ids=label_list))['accuracy']
    return metrics
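compute_mc_metrics is referenced here (and in later examples) but not defined in these snippets. A minimal sketch under the assumption that it reports multiple-choice accuracy from argmax predictions; the name, signature, and the optional all_example_ids argument are taken from the call sites, while the body is illustrative only:

import numpy as np
from transformers import EvalPrediction

def compute_mc_metrics(p: EvalPrediction, all_example_ids=None):
    # p.predictions: per-example option scores; p.label_ids: gold option indices.
    # all_example_ids is accepted for compatibility with the call sites but unused here.
    preds = np.argmax(np.asarray(p.predictions), axis=-1)
    return {"accuracy": float((preds == np.asarray(p.label_ids)).mean())}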
    def prediction_loop(
        self, dataloader: DataLoader, description: str, prediction_loss_only: Optional[bool] = None
    ) -> PredictionOutput:
        """
        Prediction/evaluation loop, shared by :obj:`Trainer.evaluate()` and :obj:`Trainer.predict()`.

        Works both with or without labels.
        """
        if hasattr(self, "_prediction_loop"):
            warnings.warn(
                "The `_prediction_loop` method is deprecated and won't be called in a future version, define `prediction_loop` in your subclass.",
                FutureWarning,
            )
            return self._prediction_loop(dataloader, description, prediction_loss_only=prediction_loss_only)

        prediction_loss_only = (
            prediction_loss_only if prediction_loss_only is not None else self.args.prediction_loss_only
        )

        '''
        assert not getattr(
            self.model.config, "output_attentions", False
        ), "The prediction loop does not work with `output_attentions=True`."
        assert not getattr(
            self.model.config, "output_hidden_states", False
        ), "The prediction loop does not work with `output_hidden_states=True`."
        '''

        model = self.model
        # multi-gpu eval
        if self.args.n_gpu > 1:
            model = torch.nn.DataParallel(model)
        else:
            model = self.model
        # Note: in torch.distributed mode, there's no point in wrapping the model
        # inside a DistributedDataParallel as we'll be under `no_grad` anyways.

        '''
        batch_size = dataloader.batch_size
        logger.info("***** Running %s *****", description)
        logger.info("  Num examples = %d", self.num_examples(dataloader))
        logger.info("  Batch size = %d", batch_size)
        '''
        eval_losses: List[float] = []
        preds: torch.Tensor = None
        label_ids: torch.Tensor = None
        model.eval()

        if is_torch_tpu_available():
            dataloader = pl.ParallelLoader(dataloader, [self.args.device]).per_device_loader(self.args.device)

        if self.args.past_index >= 0:
            self._past = None

        disable_tqdm = not self.is_local_process_zero() or self.args.disable_tqdm
        for inputs in tqdm(dataloader, desc=description, disable=disable_tqdm):
            loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only)
            batch_size = inputs[list(inputs.keys())[0]].shape[0]
            if loss is not None:
                eval_losses.extend([loss] * batch_size)
            if logits is not None:
                preds = logits if preds is None else nested_concat(preds, logits, dim=0)
            if labels is not None:
                label_ids = labels if label_ids is None else nested_concat(label_ids, labels, dim=0)

        if self.args.past_index and hasattr(self, "_past"):
            # Clean the state at the end of the evaluation loop
            delattr(self, "_past")

        if self.args.local_rank != -1:
            # In distributed mode, concatenate all results from all nodes:
            if preds is not None:
                preds = distributed_concat(preds, num_total_examples=self.num_examples(dataloader))
            if label_ids is not None:
                label_ids = distributed_concat(label_ids, num_total_examples=self.num_examples(dataloader))
        elif is_torch_tpu_available():
            # tpu-comment: Get all predictions and labels from all worker shards of eval dataset
            if preds is not None:
                preds = nested_xla_mesh_reduce(preds, "eval_preds")
            if label_ids is not None:
                label_ids = nested_xla_mesh_reduce(label_ids, "eval_label_ids")
            if eval_losses is not None:
                eval_losses = xm.mesh_reduce("eval_losses", torch.tensor(eval_losses), torch.cat).tolist()

        # Finally, turn the aggregated tensors into numpy arrays.
        if preds is not None:
            preds = nested_numpify(preds)
        if label_ids is not None:
            label_ids = nested_numpify(label_ids)

        if self.compute_metrics is not None and preds is not None and label_ids is not None:
            metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids))
        else:
            metrics = {}
        if len(eval_losses) > 0:
            if self.args.local_rank != -1:
                metrics["eval_loss"] = (
                    distributed_broadcast_scalars(eval_losses, num_total_examples=self.num_examples(dataloader))
                    .mean()
                    .item()
                )
            else:
                metrics["eval_loss"] = np.mean(eval_losses)

        # Prefix all keys with eval_
        for key in list(metrics.keys()):
            if not key.startswith("eval_"):
                metrics[f"eval_{key}"] = metrics.pop(key)

        return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics)
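All of these loops end by handing an EvalPrediction to a user-supplied compute_metrics callable and prefixing the resulting keys. A hedged sketch of how such a callable is wired into a stock Trainer (model, eval_dataset, and the accuracy metric are illustrative placeholders, not taken from the snippets above):

import numpy as np
from transformers import Trainer, TrainingArguments, EvalPrediction

def accuracy_metrics(p: EvalPrediction):
    preds = np.argmax(p.predictions, axis=-1)
    return {"accuracy": float((preds == p.label_ids).mean())}

trainer = Trainer(
    model=model,                       # assumed to be defined elsewhere
    args=TrainingArguments(output_dir="out"),
    eval_dataset=eval_dataset,         # assumed to be defined elsewhere
    compute_metrics=accuracy_metrics,  # receives the EvalPrediction built by the loop
)
metrics = trainer.evaluate()           # metric keys come back prefixed with "eval_"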
Example #8
    def _prediction_loop(
            self,
            dataloader: DataLoader,
            description: str,
            prediction_loss_only: Optional[bool] = None) -> PredictionOutput:
        """
        Prediction/evaluation loop, shared by `evaluate()` and `predict()`.

        Works both with or without labels.
        """

        prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only

        model = self.model
        # multi-gpu eval
        if self.args.n_gpu > 1:
            model = torch.nn.DataParallel(model)
        else:
            model = self.model
        # Note: in torch.distributed mode, there's no point in wrapping the model
        # inside a DistributedDataParallel as we'll be under `no_grad` anyways.

        batch_size = dataloader.batch_size
        logger.info("***** Running %s *****", description)
        logger.info("  Num examples = %d", self.num_examples(dataloader))
        logger.info("  Batch size = %d", batch_size)
        eval_losses: List[float] = []
        preds: torch.Tensor = None
        label_ids: torch.Tensor = None
        model.eval()

        if is_torch_tpu_available():
            dataloader = pl.ParallelLoader(
                dataloader,
                [self.args.device]).per_device_loader(self.args.device)

        for inputs in tqdm(dataloader, desc=description):
            has_labels = any(
                inputs.get(k) is not None
                for k in ["labels", "lm_labels", "masked_lm_labels"])

            for k, v in inputs.items():
                inputs[k] = v.to(self.args.device)

            with torch.no_grad():
                outputs = model(**inputs)
                if has_labels:
                    step_eval_loss, logits = outputs[:2]
                    eval_losses += [step_eval_loss.mean().item()]
                else:
                    logits = outputs[0]

            if not prediction_loss_only:
                if preds is None:
                    preds = logits.detach()
                else:
                    preds = torch.cat((preds, logits.detach()), dim=0)
                if inputs.get("labels") is not None:
                    if label_ids is None:
                        label_ids = inputs["labels"].detach()
                    else:
                        label_ids = torch.cat(
                            (label_ids, inputs["labels"].detach()), dim=0)

        if self.args.local_rank != -1:
            # In distributed mode, concatenate all results from all nodes:
            if preds is not None:
                preds = self.distributed_concat(
                    preds, num_total_examples=self.num_examples(dataloader))
            if label_ids is not None:
                label_ids = self.distributed_concat(
                    label_ids,
                    num_total_examples=self.num_examples(dataloader))
        elif is_torch_tpu_available():
            # tpu-comment: Get all predictions and labels from all worker shards of eval dataset
            if preds is not None:
                preds = xm.mesh_reduce("eval_preds", preds, torch.cat)
            if label_ids is not None:
                label_ids = xm.mesh_reduce("eval_label_ids", label_ids,
                                           torch.cat)

        # Finally, turn the aggregated tensors into numpy arrays.
        if preds is not None:
            preds = preds.cpu().numpy()
        if label_ids is not None:
            label_ids = label_ids.cpu().numpy()

        if self.compute_metrics is not None and preds is not None and label_ids is not None:
            metrics = self.compute_metrics(
                EvalPrediction(predictions=preds, label_ids=label_ids))
        else:
            metrics = {}
        if len(eval_losses) > 0:
            metrics["eval_loss"] = np.mean(eval_losses)

        # Prefix all keys with eval_
        for key in list(metrics.keys()):
            if not key.startswith("eval_"):
                metrics[f"eval_{key}"] = metrics.pop(key)

        return PredictionOutput(predictions=preds,
                                label_ids=label_ids,
                                metrics=metrics)
Example #9
    def prediction_loop(
        self,
        dataloader: DataLoader,
        description: str,
        prediction_loss_only: Optional[bool] = None,
        ignore_keys: Optional[List[str]] = None,
        metric_key_prefix: str = "eval",
    ) -> PredictionOutput:
        """
        Prediction/evaluation loop, shared by :obj:`Trainer.evaluate()` and :obj:`Trainer.predict()`.

        Works both with or without labels.
        """
        if not isinstance(dataloader.dataset, collections.abc.Sized):
            raise ValueError("dataset must implement __len__")
        prediction_loss_only = (
            prediction_loss_only if prediction_loss_only is not None else self.args.prediction_loss_only
        )

        model = self.model
        # multi-gpu eval
        if self.args.n_gpu > 1:
            model = torch.nn.DataParallel(model)
        # Note: in torch.distributed mode, there's no point in wrapping the model
        # inside a DistributedDataParallel as we'll be under `no_grad` anyways.

        batch_size = dataloader.batch_size
        num_examples = self.num_examples(dataloader)
        logger.info("***** Running %s *****", description)
        logger.info("  Num examples = %d", num_examples)
        logger.info("  Batch size = %d", batch_size)
        losses_host: torch.Tensor = None
        preds_host: Union[torch.Tensor, List[torch.Tensor]] = None
        labels_host: Union[torch.Tensor, List[torch.Tensor]] = None

        world_size = 1
        if is_torch_tpu_available():
            world_size = xm.xrt_world_size()
        elif self.args.local_rank != -1:
            world_size = torch.distributed.get_world_size()
        world_size = max(1, world_size)

        eval_losses_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=batch_size)
        if not prediction_loss_only:
            preds_gatherer = DistributedTensorGatherer(world_size, num_examples)
            labels_gatherer = DistributedTensorGatherer(world_size, num_examples)

        model.eval()

        if is_torch_tpu_available():
            dataloader = pl.ParallelLoader(dataloader, [self.args.device]).per_device_loader(self.args.device)

        if self.args.past_index >= 0:
            self._past = None

        self.callback_handler.eval_dataloader = dataloader

        for step, inputs in enumerate(dataloader):
            loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys)
            if loss is not None:
                losses = loss.repeat(batch_size)
                losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0)
            if logits is not None:
                # preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100)
                logits_reduced = logits.argmax(-1)
                preds_host = logits_reduced if preds_host is None else nested_concat(preds_host, logits_reduced, padding_index=-100)
            if labels is not None:
                labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100)
            self.control = self.callback_handler.on_prediction_step(self.args, self.state, self.control)

            # Gather all tensors and put them back on the CPU if we have done enough accumulation steps.
            if self.args.eval_accumulation_steps is not None and (step + 1) % self.args.eval_accumulation_steps == 0:
                eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses"))
                if not prediction_loss_only:
                    preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds"))
                    labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids"))

                # Set back to None to begin a new accumulation
                losses_host, preds_host, labels_host = None, None, None

        if self.args.past_index and hasattr(self, "_past"):
            # Clean the state at the end of the evaluation loop
            delattr(self, "_past")

        # Gather all remaining tensors and put them back on the CPU
        eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses"))
        if not prediction_loss_only:
            preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds"))
            labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids"))

        eval_loss = eval_losses_gatherer.finalize()
        preds = preds_gatherer.finalize() if not prediction_loss_only else None
        label_ids = labels_gatherer.finalize() if not prediction_loss_only else None

        if self.compute_metrics is not None and preds is not None and label_ids is not None:
            metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids))
        else:
            metrics = {}

        if eval_loss is not None:
            metrics[f"{metric_key_prefix}_loss"] = eval_loss.mean().item()

        # Prefix all keys with metric_key_prefix + '_'
        for key in list(metrics.keys()):
            if not key.startswith(f"{metric_key_prefix}_"):
                metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)

        return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics)
Example #10
    def prediction_loop(self, data_loader, world_size):
        num_examples = len(data_loader.dataset)
        batch_size = data_loader.batch_size
        eval_losses_gatherer = DistributedTensorGatherer(
            world_size, num_examples, make_multiple_of=batch_size)
        preds_gatherer = DistributedTensorGatherer(world_size, num_examples)
        labels_gatherer = DistributedTensorGatherer(world_size, num_examples)
        losses_host, preds_host, labels_host = None, None, None
        self.model.eval()

        for step, inputs in enumerate(data_loader):
            loss, logits, labels = self.prediction_step(inputs)
            losses = loss.repeat(batch_size)
            losses_host = losses if losses_host is None else torch.cat(
                (losses_host, losses), dim=0)
            preds_host = logits if preds_host is None else trainer_pt_utils.nested_concat(
                preds_host, logits, padding_index=-100)
            labels_host = labels if labels_host is None else trainer_pt_utils.nested_concat(
                labels_host, labels, padding_index=-100)
            eval_losses_gatherer.add_arrays(
                trainer_pt_utils.nested_numpify(losses_host))
            preds_gatherer.add_arrays(
                trainer_pt_utils.nested_numpify(preds_host))
            labels_gatherer.add_arrays(
                trainer_pt_utils.nested_numpify(labels_host))
            losses_host, preds_host, labels_host = None, None, None

        eval_loss = eval_losses_gatherer.finalize()
        preds = preds_gatherer.finalize()
        labels_ids = labels_gatherer.finalize()

        if self.type_score == "PER":
            preds_ids = np.argmax(preds, axis=-1)

            predicted_phonemes = self.processor.batch_decode(
                torch.from_numpy(preds_ids))
            true_phonemes = self.processor.batch_decode(
                torch.from_numpy(labels_ids))

            per = generate_per_score(true_phonemes, predicted_phonemes)

            return per

        elif self.type_score == "WER":
            pred = EvalPrediction(predictions=preds, label_ids=labels_ids)
            pred_logits = pred.predictions
            pred_ids = np.argmax(pred_logits, axis=-1)

            pred.label_ids[pred.label_ids ==
                           -100] = self.processor.tokenizer.pad_token_id

            pred_str = self.processor.batch_decode(pred_ids)

            # we do not want to group tokens when computing the metrics
            label_str = self.processor.batch_decode(pred.label_ids,
                                                    group_tokens=False)

            metrics = compute_wer(pred_str, label_str)
            metrics = denumpify_detensorize(metrics)
            metrics["t_loss"] = eval_loss.mean().item()
            wer = PredictionOutput(preds, labels_ids, metrics).metrics["wer"]

            return wer
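compute_wer and generate_per_score are external to this snippet. A minimal sketch of the WER side, assuming the `evaluate` library's wer metric (the function name compute_wer mirrors the call above; the PER helper is not sketched):

import evaluate

wer_metric = evaluate.load("wer")

def compute_wer(pred_str, label_str):
    # Both arguments are lists of decoded strings, as produced above.
    return {"wer": wer_metric.compute(predictions=pred_str, references=label_str)}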
Example #11
    def prediction_loop(
        self,
        dataset: tf.data.Dataset,
        steps: int,
        num_examples: int,
        description: str,
        prediction_loss_only: Optional[bool] = None,
    ) -> PredictionOutput:
        """
        Prediction/evaluation loop, shared by :func:`~transformers.TFTrainer.evaluate` and
        :func:`~transformers.TFTrainer.predict`.

        Works both with or without labels.
        """

        prediction_loss_only = (
            prediction_loss_only if prediction_loss_only is not None else self.args.prediction_loss_only
        )

        logger.info("***** Running %s *****", description)
        logger.info("  Num examples in dataset = %d", num_examples)
        if description == "Evaluation":
            logger.info("  Num examples in used in evaluation = %d", self.args.eval_batch_size * steps)
        logger.info("  Batch size = %d", self.args.eval_batch_size)

        label_ids: np.ndarray = None
        preds: np.ndarray = None
        self.eval_loss.reset_states()

        # Reset the past mems state at the beginning of the evaluation if necessary.
        if self.args.past_index >= 0:
            self._past = None

        for step, batch in enumerate(dataset):

            logits = self.distributed_prediction_steps(batch)
            _, labels = batch

            if not prediction_loss_only:
                if isinstance(logits, tuple):
                    logits = logits[0]

                if isinstance(labels, tuple):
                    labels = labels[0]

                if self.args.n_replicas > 1:
                    for val in logits.values:
                        if preds is None:
                            preds = val.numpy()
                        else:
                            preds = np.append(preds, val.numpy(), axis=0)

                    for val in labels.values:
                        if label_ids is None:
                            label_ids = val.numpy()
                        else:
                            label_ids = np.append(label_ids, val.numpy(), axis=0)
                else:
                    if preds is None:
                        preds = logits.numpy()
                    else:
                        preds = np.append(preds, logits.numpy(), axis=0)

                    if label_ids is None:
                        label_ids = labels.numpy()
                    else:
                        label_ids = np.append(label_ids, labels.numpy(), axis=0)

                if step == steps - 1:
                    break

        if self.compute_metrics is not None and preds is not None and label_ids is not None:
            metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids))
        else:
            metrics = {}

        metrics["eval_loss"] = self.eval_loss.result().numpy() / steps

        for key in list(metrics.keys()):
            if not key.startswith("eval_"):
                metrics[f"eval_{key}"] = metrics.pop(key)

        if self.args.past_index and hasattr(self, "_past"):
            # Clean the state at the end of training
            delattr(self, "_past")

        return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics)
Example #12
    def prediction_loop(
        self,
        dataloader: DataLoader,
        description: str,
        prediction_loss_only: Optional[bool] = None,
        ignore_keys: Optional[List[str]] = None,
        metric_key_prefix: str = "eval",
    ) -> PredictionOutput:
        """
        Prediction/evaluation loop, shared by :obj:`Trainer.evaluate()` and :obj:`Trainer.predict()`.

        Works both with or without labels.
        """
        if not isinstance(dataloader.dataset, collections.abc.Sized):
            raise ValueError("dataset must implement __len__")
        prediction_loss_only = (prediction_loss_only
                                if prediction_loss_only is not None else
                                self.args.prediction_loss_only)

        if self.args.deepspeed and not self.args.do_train:
            # no harm, but flagging to the user that deepspeed config is ignored for eval
            # flagging only for when --do_train wasn't passed as only then it's redundant
            logger.info(
                "Detected the deepspeed argument but it will not be used for evaluation"
            )

        model = self._wrap_model(self.model, training=False)

        # if full fp16 is wanted on eval and this ``evaluation`` or ``predict`` isn't called while
        # ``train`` is running, half it first and then put on device
        if not self.is_in_train and self.args.fp16_full_eval:
            model = model.half().to(self.args.device)

        batch_size = dataloader.batch_size
        num_examples = self.num_examples(dataloader)
        logger.info("***** Running %s *****", description)
        logger.info("  Num examples = %d", num_examples)
        logger.info("  Batch size = %d", batch_size)

        model.eval()

        self.callback_handler.eval_dataloader = dataloader

        re_labels = None
        pred_relations = None
        entities = None
        for step, inputs in enumerate(dataloader):
            outputs, labels = self.prediction_step(model,
                                                   inputs,
                                                   prediction_loss_only,
                                                   ignore_keys=ignore_keys)
            re_labels = labels[
                1] if re_labels is None else re_labels + labels[1]
            pred_relations = (outputs.pred_relations if pred_relations is None
                              else pred_relations + outputs.pred_relations)
            entities = outputs.entities if entities is None else entities + outputs.entities

            self.control = self.callback_handler.on_prediction_step(
                self.args, self.state, self.control)

        gt_relations = []
        for b in range(len(re_labels)):
            rel_sent = []
            for head, tail in zip(re_labels[b]["head"], re_labels[b]["tail"]):
                rel = {}
                rel["head_id"] = head
                rel["head"] = (entities[b]["start"][rel["head_id"]],
                               entities[b]["end"][rel["head_id"]])
                rel["head_type"] = entities[b]["label"][rel["head_id"]]

                rel["tail_id"] = tail
                rel["tail"] = (entities[b]["start"][rel["tail_id"]],
                               entities[b]["end"][rel["tail_id"]])
                rel["tail_type"] = entities[b]["label"][rel["tail_id"]]

                rel["type"] = 1

                rel_sent.append(rel)

            gt_relations.append(rel_sent)

        re_metrics = self.compute_metrics(
            EvalPrediction(predictions=pred_relations, label_ids=gt_relations))

        re_metrics = {
            "precision": re_metrics["ALL"]["p"],
            "recall": re_metrics["ALL"]["r"],
            "f1": re_metrics["ALL"]["f1"],
        }
        re_metrics[f"{metric_key_prefix}_loss"] = outputs.loss.mean().item()

        metrics = {}

        # Prefix all keys with metric_key_prefix + '_'
        for key in list(re_metrics.keys()):
            if not key.startswith(f"{metric_key_prefix}_"):
                metrics[f"{metric_key_prefix}_{key}"] = re_metrics.pop(key)
            else:
                metrics[f"{key}"] = re_metrics.pop(key)

        return metrics
    def _prediction_loop(
            self,
            dataloader: DataLoader,
            description: str,
            prediction_loss_only: Optional[bool] = None) -> PredictionOutput:
        """
        Prediction/evaluation loop, shared by `evaluate()` and `predict()`.

        Works both with or without labels.
        """

        prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only

        # multi-gpu eval
        if self.args.n_gpu > 1 and not isinstance(self.model,
                                                  torch.nn.DataParallel):
            model = torch.nn.DataParallel(self.model)
        else:
            model = self.model
        model.to(self.args.device)

        if is_tpu_available():
            batch_size = dataloader._loader._loader.batch_size
        else:
            batch_size = dataloader.batch_size
        logger.info("***** Running %s *****", description)
        logger.info("  Num examples = %d", self.num_examples(dataloader))
        logger.info("  Batch size = %d", batch_size)
        eval_losses: List[float] = []
        preds_t1: np.ndarray = None
        preds_t2: np.ndarray = None
        label_ids_t1: np.ndarray = None
        label_ids_t2: np.ndarray = None
        model.eval()

        for inputs in tqdm(dataloader, desc=description):
            has_labels = any(
                inputs.get(k) is not None for k in [
                    "labels", "labels_t1", "labels_t2", "lm_labels",
                    "masked_lm_labels"
                ])

            for k, v in inputs.items():
                inputs[k] = v.to(self.args.device)

            with torch.no_grad():
                outputs = model(**inputs)

                if has_labels:
                    if self.alternate:
                        step_eval_loss, logits, task = outputs[:3]
                    else:
                        step_eval_loss, logits_t1, logits_t2 = outputs[:3]
                    eval_losses += [step_eval_loss.mean().item()]
                else:
                    logits = outputs[0]

            if self.alternate:
                if not prediction_loss_only:
                    if task == 0:
                        if preds_t1 is None:
                            preds_t1 = logits.detach().cpu().numpy()
                        else:
                            preds_t1 = np.append(preds_t1,
                                                 logits.detach().cpu().numpy(),
                                                 axis=0)
                        if inputs.get("labels") is not None:
                            if label_ids_t1 is None:
                                label_ids_t1 = inputs["labels"].detach().cpu(
                                ).numpy()
                            else:
                                label_ids_t1 = np.append(
                                    label_ids_t1,
                                    inputs["labels"].detach().cpu().numpy(),
                                    axis=0)

                    elif task == 1:
                        if preds_t2 is None:
                            preds_t2 = logits.detach().cpu().numpy()
                        else:
                            preds_t2 = np.append(preds_t2,
                                                 logits.detach().cpu().numpy(),
                                                 axis=0)
                        if inputs.get("labels") is not None:
                            if label_ids_t2 is None:
                                label_ids_t2 = inputs["labels"].detach().cpu(
                                ).numpy()
                            else:
                                label_ids_t2 = np.append(
                                    label_ids_t2,
                                    inputs["labels"].detach().cpu().numpy(),
                                    axis=0)

            else:
                if not prediction_loss_only:
                    if preds_t1 is None or preds_t2 is None:
                        preds_t1 = logits_t1.detach().cpu().numpy()
                        preds_t2 = logits_t2.detach().cpu().numpy()
                    else:
                        preds_t1 = np.append(preds_t1,
                                             logits_t1.detach().cpu().numpy(),
                                             axis=0)
                        preds_t2 = np.append(preds_t2,
                                             logits_t2.detach().cpu().numpy(),
                                             axis=0)
                    if inputs.get("labels_t1") is not None:
                        if label_ids_t1 is None or label_ids_t2 is None:
                            label_ids_t1 = inputs["labels_t1"].detach().cpu(
                            ).numpy()
                            label_ids_t2 = inputs["labels_t2"].detach().cpu(
                            ).numpy()
                        else:
                            label_ids_t1 = np.append(
                                label_ids_t1,
                                inputs["labels_t1"].detach().cpu().numpy(),
                                axis=0)
                            label_ids_t2 = np.append(
                                label_ids_t2,
                                inputs["labels_t2"].detach().cpu().numpy(),
                                axis=0)

        # if is_tpu_available() and preds is not None and label_ids is not None:
        #     # tpu-comment: Get all predictions and labels from all worker shards of eval dataset
        #     preds = xm.mesh_reduce("eval_preds", preds, np.concatenate)
        #     label_ids = xm.mesh_reduce("eval_out_label_ids", label_ids, np.concatenate)

        metrics = {}
        if self.compute_metrics is not None:
            if preds_t1 is not None and label_ids_t1 is not None:
                metrics["task 1"] = self.compute_metrics(
                    EvalPrediction(predictions=preds_t1,
                                   label_ids=label_ids_t1))
            if preds_t2 is not None and label_ids_t2 is not None:
                metrics["task 2"] = self.compute_metrics(
                    EvalPrediction(predictions=preds_t2,
                                   label_ids=label_ids_t2))

        if len(eval_losses) > 0:
            metrics["eval_loss"] = np.mean(eval_losses)

        # Prefix all keys with eval_
        for key in list(metrics.keys()):
            if not key.startswith("eval_"):
                metrics[f"eval_{key}"] = metrics.pop(key)

        return (PredictionOutput(predictions=preds_t1,
                                 label_ids=label_ids_t1,
                                 metrics=metrics),
                PredictionOutput(predictions=preds_t2,
                                 label_ids=label_ids_t2,
                                 metrics=metrics))
Example #14
    def _prediction_loop(
            self,
            dataloader: DataLoader,
            description: str,
            prediction_loss_only: Optional[bool] = None) -> PredictionOutput:
        """
        Prediction/evaluation loop, shared by `evaluate()` and `predict()`.

        Works both with or without labels.
        """

        prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only

        # multi-gpu eval
        if self.args.n_gpu > 1 and not isinstance(self.model,
                                                  torch.nn.DataParallel):
            model = torch.nn.DataParallel(self.model)
        else:
            model = self.model
        model.to(self.args.device)

        if is_tpu_available():
            batch_size = dataloader._loader._loader.batch_size
        else:
            batch_size = dataloader.batch_size
        logger.info("***** Running %s *****", description)
        logger.info("  Num examples = %d", self.num_examples(dataloader))
        logger.info("  Batch size = %d", batch_size)
        eval_losses: List[float] = []

        eval_tag_losses = []
        eval_gen_losses = []
        eval_cov_losses = []

        preds = []
        label_ids = []
        model.eval()

        for inputs in tqdm(dataloader, desc=description):
            for k, v in inputs.items():
                inputs[k] = v.to(self.args.device)

            with torch.no_grad():
                outputs = model(**inputs)
                step_eval_loss, logits = outputs[:2]
                other_loss = outputs[-1]

                eval_losses += [step_eval_loss.mean().item()]
                eval_tag_losses += [other_loss['tag_loss'].mean().item()]
                eval_gen_losses += [other_loss['gen_loss'].mean().item()]
                eval_cov_losses += [other_loss['cov_loss'].mean().item()]

            if not prediction_loss_only:

                preds.append(logits.detach().cpu().numpy().argmax(-1))

                if inputs.get("tgt_token") is not None:
                    label_ids.append(
                        inputs["tgt_token"][:, 1:].detach().cpu().numpy())

        if is_tpu_available():
            # tpu-comment: Get all predictions and labels from all worker shards of eval dataset
            preds = xm.mesh_reduce("eval_preds", preds, np.concatenate)
            label_ids = xm.mesh_reduce("eval_out_label_ids", label_ids,
                                       np.concatenate)

        if self.compute_metrics is not None and preds is not None and label_ids is not None:
            metrics = self.compute_metrics(
                EvalPrediction(predictions=preds, label_ids=label_ids))
        else:
            metrics = {}
        if len(eval_losses) > 0:
            metrics["eval_loss"] = np.mean(eval_losses)
        if len(eval_tag_losses) > 0:
            metrics["eval_tag_loss"] = np.mean(eval_tag_losses)
        if len(eval_gen_losses) > 0:
            metrics["eval_gen_loss"] = np.mean(eval_gen_losses)
        if len(eval_cov_losses) > 0:
            metrics["eval_cov_loss"] = np.mean(eval_cov_losses)
        if metrics["eval_cov_loss"] != 0:
            metrics["eval_loss"] = metrics["eval_tag_loss"] + metrics[
                "eval_gen_loss"]

        # Prefix all keys with eval_
        for key in list(metrics.keys()):
            if not key.startswith("eval_"):
                metrics[f"eval_{key}"] = metrics.pop(key)

        return PredictionOutput(predictions=preds,
                                label_ids=label_ids,
                                metrics=metrics)
Example #15
    def evaluate_answer_verifier_with_explicit_reader(
        self,
        evidence_reader,
        multiple_choice_dataset,
        answer_verifier_dataset,
    ):

        evidence_reader = evidence_reader.to(self.args.device)
        _model = self.model
        self.model = evidence_reader
        evidence_reader_output = self.evaluate(
            multiple_choice_dataset,
            description="Evaluation",
            metric_key_prefix="evidence_reader",
            compute_metrics=compute_mc_metrics)
        self.model = _model

        answer_verifier_output = self.evaluate(
            answer_verifier_dataset,
            description="Evaluation",
            metric_key_prefix="intensive_selector",
            compute_metrics=compute_mc_metrics)

        evidence_reader_predictions = {}
        answer_verifier_predictions = {}
        labels = {}
        for prediction, label_id, example_id in zip(
                *evidence_reader_output[:-1]):
            evidence_reader_predictions[example_id] = torch.softmax(
                torch.tensor(prediction), -1)
            labels[example_id] = label_id

        for prediction, label_id, example_id in zip(
                *answer_verifier_output[:-1]):
            answer_verifier_predictions[example_id] = torch.softmax(
                torch.tensor(prediction), -1)
            assert labels[example_id] == label_id

        merge_ratio = [
            0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 1
        ]
        merge_prediction = {k: [] for k in merge_ratio}
        merge_prediction_dict = {k: {} for k in merge_ratio}
        label_list = []
        all_example_ids = []
        for example_id, label_id in labels.items():
            all_example_ids.append(example_id)
            label_list.append(label_id)
            answer_verifier_prediction = answer_verifier_predictions[
                example_id]
            evidence_reader_prediction = evidence_reader_predictions[
                example_id]
            for ratio in merge_ratio:
                merge_prediction[ratio].append(
                    (ratio * answer_verifier_prediction +
                     (1 - ratio) * evidence_reader_prediction).tolist())
                merge_prediction_dict[ratio][example_id] = (
                    ratio * answer_verifier_prediction +
                    (1 - ratio) * evidence_reader_prediction).tolist()

        all_merged_results = {}
        for ratio in merge_ratio:
            merged_results = {
                f'merge_{ratio}_{k}': v
                for k, v in compute_mc_metrics(
                    EvalPrediction(predictions=merge_prediction[ratio],
                                   label_ids=label_list),
                    all_example_ids=all_example_ids).items()
            }
            all_merged_results = {**all_merged_results, **merged_results}

        metrics = {
            **evidence_reader_output.metrics,
            **answer_verifier_output.metrics
        }
        metrics = {**metrics, **all_merged_results}
        return metrics, merge_prediction_dict
Example #16
    def _prediction_loop(
            self,
            dataloader: DataLoader,
            description: str,
            prediction_loss_only: Optional[bool] = None) -> PredictionOutput:
        prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only
        model = self.model
        batch_size = dataloader.batch_size
        logger.info("***** Running %s *****", description)
        logger.info("  Num examples = %d", self.num_examples(dataloader))
        logger.info("  Batch size = %d", batch_size)
        eval_losses: List[float] = []
        preds: torch.Tensor = None
        label_ids: torch.Tensor = None
        model.eval()

        for inputs in tqdm(dataloader, desc=description):
            has_labels = any(
                inputs.get(k) is not None
                for k in ["labels", "lm_labels", "masked_lm_labels"])

            for k, v in inputs.items():
                inputs[k] = v.to(self.device)

            with torch.no_grad():
                outputs = model(**inputs)
                if has_labels:
                    step_eval_loss, logits = outputs[:2]
                    eval_losses += [step_eval_loss.mean().item()]
                else:
                    logits = outputs[0]

            if not prediction_loss_only:
                if preds is None:
                    preds = logits.detach()
                else:
                    preds = torch.cat((preds, logits.detach()), dim=0)
                if inputs.get("labels") is not None:
                    if label_ids is None:
                        label_ids = inputs["labels"].detach()
                    else:
                        label_ids = torch.cat(
                            (label_ids, inputs["labels"].detach()), dim=0)

        if self.args.local_rank != -1:
            # In distributed mode, concatenate all results from all nodes:
            if preds is not None:
                preds = self.distributed_concat(
                    preds, num_total_examples=self.num_examples(dataloader))
            if label_ids is not None:
                label_ids = self.distributed_concat(
                    label_ids,
                    num_total_examples=self.num_examples(dataloader))

        if preds is not None:
            preds = preds.cpu().numpy()
        if label_ids is not None:
            label_ids = label_ids.cpu().numpy()

        if self.compute_metrics is not None and preds is not None and label_ids is not None:
            metrics = self.compute_metrics(
                EvalPrediction(predictions=preds, label_ids=label_ids))
        else:
            metrics = {}
        if len(eval_losses) > 0:
            metrics["eval_loss"] = np.mean(eval_losses)

        # Prefix all keys with eval_
        for key in list(metrics.keys()):
            if not key.startswith("eval_"):
                metrics[f"eval_{key}"] = metrics.pop(key)

        return PredictionOutput(predictions=preds,
                                label_ids=label_ids,
                                metrics=metrics)
Example #17
    def evidence_reading(self,
                         evidence_reader,
                         eval_dataset,
                         prepare_feature_func,
                         metric_key_prefix="fulleval"):
        evidence_reader = evidence_reader.to(self.args.device)

        evidence_reading_data_collator = DataCollatorForMultipleChoice(
            tokenizer=self.tokenizer)

        processed_datasets = eval_dataset.map(
            prepare_feature_func,
            batched=True,
            remove_columns=eval_dataset.column_names,
            load_from_cache_file=False,
        )
        if 'evidence_sentence' in processed_datasets.column_names:
            evidence_sentences = {
                eid: evidence_sent
                for eid, evidence_sent in zip(
                    processed_datasets['example_ids'],
                    processed_datasets['evidence_sentence'])
            }
            processed_datasets = processed_datasets.remove_columns(
                "evidence_sentence")
        else:
            evidence_sentences = {}

        start_time = time.time()
        evidence_generator = self.model
        self.model = evidence_reader
        output = self.evaluate(processed_datasets,
                               data_collator=evidence_reading_data_collator,
                               description="Evaluation",
                               metric_key_prefix=metric_key_prefix,
                               compute_metrics=compute_mc_metrics)
        self.model = evidence_generator

        answer_dict = {}
        for orig_example_id, answer in zip(eval_dataset['example_id'],
                                           eval_dataset['answer']):
            answer_dict[orig_example_id] = ord(answer) - ord('A')

        is_answer_option = []
        for processed_example_id in output.example_ids:
            orig_example_id = processed_example_id[:-2]
            corresponding_option = int(processed_example_id[-1])
            is_answer_option.append(
                int(corresponding_option == answer_dict[orig_example_id]))
        right_option_acc = compute_mc_metrics(
            EvalPrediction(predictions=output.predictions,
                           label_ids=output.label_ids), is_answer_option)
        wrong_option_acc = compute_mc_metrics(
            EvalPrediction(predictions=output.predictions,
                           label_ids=output.label_ids),
            1 - np.array(is_answer_option))
        output.metrics.update({
            f'{metric_key_prefix}_right_acc':
            right_option_acc['accuracy'],
            f'{metric_key_prefix}_wrong_acc':
            wrong_option_acc['accuracy']
        })

        n_samples = len(processed_datasets)
        output.metrics.update(
            speed_metrics(metric_key_prefix, start_time, n_samples))

        return output.metrics, evidence_sentences
Example #18
    def evaluate(self,
                 dataset,
                 data_collator=None,
                 description="",
                 metric_key_prefix="eval",
                 compute_metrics=None):
        # prediction with a single device

        eval_sampler = SequentialSampler(dataset)
        eval_dataloader = DataLoader(
            dataset,
            sampler=eval_sampler,
            batch_size=self.args.eval_batch_size,
            collate_fn=self.data_collator
            if data_collator is None else data_collator,
            num_workers=self.args.dataloader_num_workers)

        batch_size = eval_dataloader.batch_size
        num_examples = len(eval_dataloader.dataset)
        logger.info("***** Running {} *****".format(description))
        logger.info("  Num examples = %d", len(dataset))
        logger.info("  Batch size = %d", self.args.eval_batch_size)
        losses_host: torch.Tensor = None
        preds_host: Union[torch.Tensor, List[torch.Tensor]] = None
        labels_host: Union[torch.Tensor, List[torch.Tensor]] = None

        world_size = max(1, self.args.world_size)
        compute_metrics = self.compute_metrics if compute_metrics is None else compute_metrics
        prediction_loss_only = True if compute_metrics is None else None

        eval_losses_gatherer = DistributedTensorGatherer(
            world_size, num_examples, make_multiple_of=batch_size)
        if not prediction_loss_only:
            # The actual number of eval_sample can be greater than num_examples in distributed settings (when we pass
            # a batch size to the sampler)
            make_multiple_of = None
            if hasattr(eval_dataloader, "sampler") and isinstance(
                    eval_dataloader.sampler, SequentialDistributedSampler):
                make_multiple_of = eval_dataloader.sampler.batch_size
            preds_gatherer = DistributedTensorGatherer(
                world_size, num_examples, make_multiple_of=make_multiple_of)
            labels_gatherer = DistributedTensorGatherer(
                world_size, num_examples, make_multiple_of=make_multiple_of)

        model = self._wrap_model(self.model)
        model.eval()

        all_example_ids = []
        start_time = timeit.default_timer()
        for step, inputs in enumerate(tqdm(eval_dataloader)):
            if 'example_ids' in inputs.keys():
                example_ids = inputs.pop('example_ids')
                all_example_ids += example_ids
            loss, logits, labels = self.prediction_step(
                model, inputs, prediction_loss_only)

            if loss is not None:
                losses = loss.repeat(eval_dataloader.batch_size)
                losses_host = losses if losses_host is None else torch.cat(
                    (losses_host, losses), dim=0)
            if logits is not None:
                preds_host = logits if preds_host is None else nested_concat(
                    preds_host, logits, padding_index=-100)
            if labels is not None:
                labels_host = labels if labels_host is None else nested_concat(
                    labels_host, labels, padding_index=-100)

        # Gather all remaining tensors and put them back on the CPU
        eval_losses_gatherer.add_arrays(nested_numpify(losses_host))
        if not prediction_loss_only:
            preds_gatherer.add_arrays(nested_numpify(preds_host))
            labels_gatherer.add_arrays(nested_numpify(labels_host))

        eval_loss = eval_losses_gatherer.finalize()
        preds = preds_gatherer.finalize() if not prediction_loss_only else None
        label_ids = labels_gatherer.finalize(
        ) if not prediction_loss_only else None

        if compute_metrics is not None and preds is not None and label_ids is not None:
            metrics = compute_metrics(EvalPrediction(predictions=preds,
                                                     label_ids=label_ids),
                                      all_example_ids=all_example_ids
                                      if len(all_example_ids) > 0 else None)
        else:
            metrics = {}

        # To be JSON-serializable, we need to remove numpy types or zero-d tensors
        metrics = denumpify_detensorize(metrics)

        eval_time = timeit.default_timer() - start_time
        logger.info("  Evaluation done in total %f secs (%f sec per example)",
                    eval_time, eval_time / len(dataset))

        if eval_loss is not None:
            metrics[f"{metric_key_prefix}_loss"] = eval_loss.mean().item()

        # Prefix all keys with metric_key_prefix + '_'
        for key in list(metrics.keys()):
            if not key.startswith(f"{metric_key_prefix}_"):
                metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)

        return PredictionOutput(
            predictions=preds,
            label_ids=label_ids,
            metrics=metrics,
            example_ids=None if len(all_example_ids) == 0 else all_example_ids)
Example #19
    def _prediction_loop(
            self,
            dataset: tf.data.Dataset,
            description: str,
            prediction_loss_only: Optional[bool] = None) -> PredictionOutput:
        """
        Prediction/evaluation loop, shared by `evaluate()` and `predict()`.
        Works both with or without labels.
        """

        prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only

        logger.info("***** Running %s *****", description)
        logger.info("  Batch size = %d", self.args.eval_batch_size)

        label_ids: np.ndarray = None
        preds: np.ndarray = None

        step: int = 1

        for features, labels in dataset:
            step = tf.convert_to_tensor(step, dtype=tf.int64)
            loss, logits = self._evaluate_steps(features, labels)
            loss = tf.reduce_mean(loss)

            if not prediction_loss_only:
                if isinstance(logits, tuple):
                    logits = logits[0]

                if isinstance(labels, tuple):
                    labels = labels[0]

                if self.args.n_gpu > 1:
                    for val in logits.values:
                        if preds is None:
                            preds = val.numpy()
                        else:
                            preds = np.append(preds, val.numpy(), axis=0)

                    for val in labels.values:
                        if label_ids is None:
                            label_ids = val.numpy()
                        else:
                            label_ids = np.append(label_ids,
                                                  val.numpy(),
                                                  axis=0)
                else:
                    if preds is None:
                        preds = logits.numpy()
                    else:
                        preds = np.append(preds, logits.numpy(), axis=0)

                    if label_ids is None:
                        label_ids = labels.numpy()
                    else:
                        label_ids = np.append(label_ids,
                                              labels.numpy(),
                                              axis=0)

            step += 1

        if self.compute_metrics is not None and preds is not None and label_ids is not None:
            metrics = self.compute_metrics(
                EvalPrediction(predictions=preds, label_ids=label_ids))
        else:
            metrics = {}

        metrics["eval_loss"] = loss.numpy()

        for key in list(metrics.keys()):
            if not key.startswith("eval_"):
                metrics[f"eval_{key}"] = metrics.pop(key)

        return PredictionOutput(predictions=preds,
                                label_ids=label_ids,
                                metrics=metrics)