Example #1
    def evaluate_fixed_shot(self, model, num_shots) -> Dict[str, Any]:
        """Evaluates task for different metrics.

        Args:
          model: the language model
          num_shots: number of shots

        Returns:
          Dictionary with results for metrics.
        """
        if num_shots:
            samples = make_nshot_dataset(
                self._ds, num_shots, separator=self.few_shot_example_separator)
        else:
            samples = copy.deepcopy(self._ds)

        inputs = [self.task_prefix + sample["input"] for sample in samples]
        if self.verbose:
            print(
                f"Example of formatted input for {num_shots} shots: {inputs[0]}"
            )

        if self.multiple_choice_metrics:
            targets = [sample["choice"] for sample in samples]
            if self.verbose:
                print(
                    f"Example of targets for multiple choice evaluation: {','.join(targets[0])}"
                )
            log_probs = model.cond_log_prob(inputs=inputs, targets=targets)

        if self.generative_metrics:
            generative_responses = model.generate_text(
                inputs=inputs,
                stop_string=self.stop_string,
                output_regex=self.output_regex,
            )

        for i, sample in enumerate(samples):
            if self.multiple_choice_metrics:
                sample["log_prob"] = log_probs[i]
            if self.generative_metrics:
                sample["generative_response"] = generative_responses[i]

        results = {}

        result = metrics.measure_multiple_choice_metrics(
            samples, self.multiple_choice_metrics)
        results.update(result)

        result = metrics.measure_generative_metrics(samples,
                                                    self.generative_metrics)
        results.update(result)

        return results
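
The method above touches the model only through two calls: cond_log_prob(inputs, targets) and generate_text(inputs, stop_string, output_regex). Below is a minimal sketch of that interface for experimentation; the DummyModel class and its random scores are hypothetical and not part of the quoted code, which expects a real language-model wrapper implementing the same two methods.

import random
from typing import List, Optional

class DummyModel:
    """Hypothetical stand-in exposing the two methods used above."""

    def cond_log_prob(self, inputs: List[str],
                      targets: List[List[str]]) -> List[List[float]]:
        # One log probability per answer choice, for each input.
        return [[random.uniform(-5.0, 0.0) for _ in choices]
                for choices in targets]

    def generate_text(self, inputs: List[str],
                      stop_string: Optional[str] = None,
                      output_regex: Optional[str] = None) -> List[str]:
        # One generated string per input.
        return ["dummy response"] * len(inputs)
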
Example #2
def gen_metric_fn(targets: Sequence[Mapping[str, Any]],
                  predictions: Sequence[str]) -> Mapping[str, Any]:
    bb_task = BigBenchJsonTaskFetcher.get_bigbench_json_task_or_subtask(
        task_name=task_name,
        task_path=task_path,
        subtask_name=subtask_name,
        json_util=json_util)
    samples = get_samples_for_gen_eval_examples(
        targets, predictions, len(bb_task.task_prefix))
    gen_metrics = bb_task.generative_metrics
    return task_metrics.measure_generative_metrics(samples, gen_metrics)
Example #3
    def evaluate_fixed_shot(self, model, num_shots) -> Dict[str, Any]:
        """Evaluates task for different metrics.

        Args:
          model: the language model
          num_shots: number of shots

        Returns:
          Dictionary with results for metrics.
        """
        if num_shots:
            samples = make_nshot_dataset(self._ds, num_shots)
        else:
            samples = copy.deepcopy(self._ds)

        inputs = [self.task_prefix + sample["input"] for sample in samples]

        if self.multiple_choice_metrics:
            targets = [sample["choice"] for sample in samples]
            log_probs = model.cond_log_prob(inputs=inputs, targets=targets)

        if self.generative_metrics:
            generative_responses = model.generate_text(inputs=inputs)

        for i, sample in enumerate(samples):
            if self.multiple_choice_metrics:
                sample["log_prob"] = log_probs[i]
            if self.generative_metrics:
                sample["generative_response"] = generative_responses[i]

        results = {}

        result = metrics.measure_multiple_choice_metrics(
            samples, self.multiple_choice_metrics)
        results.update(result)

        result = metrics.measure_generative_metrics(samples,
                                                    self.generative_metrics)
        results.update(result)

        return results
Example #4
    def evaluate_fixed_shot(self,
                            model,
                            num_shots,
                            score: bool = True) -> Dict[str, Any]:
        """Evaluates task for different metrics.

        Args:
          model: the language model
          num_shots: number of shots
          score: if False, skips scoring, which is useful for testing

        Returns:
          Dictionary with results for metrics.
        """
        if num_shots:
            samples = make_nshot_dataset(
                self._ds,
                num_shots,
                rng=self.rng,
                separator=self.few_shot_example_separator,
                max_examples=self.num_examples,
            )
        else:
            if self.max_examples:
                samples = copy.deepcopy(
                    self.rng.choice(self._ds,
                                    min(self.max_examples, len(self._ds)),
                                    replace=False))
            else:
                samples = copy.deepcopy(self._ds)

        self.shot_examples[num_shots] = samples

        inputs = [self.task_prefix + sample["input"] for sample in samples]
        if self.verbose:
            print(
                f"Example of formatted input for {num_shots} shots: {inputs[0]}"
            )

        if self.multiple_choice_metrics:
            targets = [sample["choice"] for sample in samples]
            if self.verbose:
                print(
                    f"Example of targets for multiple choice evaluation: {','.join(targets[0])}"
                )
            log_probs = model.cond_log_prob(
                inputs=inputs,
                targets=targets,
                absolute_normalization=self.absolute_normalization,
            )

        if self.generative_metrics:
            generative_responses = model.generate_text(
                inputs=inputs,
                stop_string=self.stop_string,
                output_regex=self.output_regex,
            )

        for i, sample in enumerate(samples):
            if self.multiple_choice_metrics:
                sample["log_prob"] = log_probs[i]
            if self.generative_metrics:
                sample["generative_response"] = generative_responses[i]

        results = {}

        if not score:
            return results

        result = metrics.measure_multiple_choice_metrics(
            samples, self.multiple_choice_metrics)
        results.update(result)

        result = metrics.measure_generative_metrics(samples,
                                                    self.generative_metrics)
        results.update(result)

        return results
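
In this version the zero-shot branch draws at most self.max_examples items with self.rng.choice(..., replace=False) before deep-copying them. A small self-contained sketch of that sampling step, assuming self.rng is a NumPy generator such as np.random.RandomState (an assumption; the snippet does not show how self.rng is created):

import copy
import numpy as np

rng = np.random.RandomState(0)  # assumed type of self.rng
ds = [{"input": f"question {i}", "target": f"answer {i}"} for i in range(10)]
max_examples = 3

# Mirrors the zero-shot branch above: pick at most max_examples distinct
# examples, then deep-copy them so that later mutation (adding "log_prob",
# "generative_response", ...) leaves the original dataset untouched.
samples = copy.deepcopy(
    rng.choice(ds, min(max_examples, len(ds)), replace=False))
print([s["input"] for s in samples])
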
Example #5
    def evaluate_fixed_shot(
            self,
            model,
            num_shots,
            score: bool = True,
            random_seed: int = 42,
            generate_probs_for_targets: bool = True) -> Dict[str, Any]:
        """Evaluates task for different metrics.

        Args:
          model: the language model
          num_shots: number of shots
          score: if False, skips scoring, which is useful for testing
          random_seed: random number generator seed
          generate_probs_for_targets: if False, skips computing the model's
            conditional log probs for the generative targets

        Returns:
          Dictionary with results for metrics.
        """
        samples = make_nshot_dataset(
            self._ds,
            num_shots,
            random_seed=random_seed,
            separator=self.few_shot_example_separator,
            max_examples=self.num_examples,
        )

        self.shot_examples[num_shots] = samples

        inputs = [self.task_prefix + sample["input"] for sample in samples]
        if self.verbose:
            print(
                f"Example of formatted input for {num_shots} shots: {inputs[0]}"
            )

        if self.multiple_choice_metrics:
            targets = [sample["choice"] for sample in samples]
            if self.verbose:
                print(
                    f"Example of targets for multiple choice evaluation: {','.join(targets[0])}"
                )
            absolute_log_probs = model.cond_log_prob(
                inputs=inputs,
                targets=targets,
                absolute_normalization=True,
            )
            if len(absolute_log_probs[0]) == 1 and len(inputs) == 1:
                absolute_log_probs = [absolute_log_probs]

            normalized_log_probs = [
                list(np.array(x) - logsumexp(x)) for x in absolute_log_probs
            ]

            if self.absolute_normalization:
                log_probs = absolute_log_probs
            else:
                log_probs = normalized_log_probs

        if self.generative_metrics:
            generative_responses = model.generate_text(
                inputs=inputs,
                stop_string=self.stop_string,
                output_regex=self.output_regex,
            )
            if isinstance(generative_responses, str) and len(inputs) == 1:
                generative_responses = [generative_responses]
        if self.scoring_metrics and generate_probs_for_targets:
            ll_targets = [sample["target"] for sample in samples]
            log_probs_gen = model.cond_log_prob(inputs=inputs,
                                                targets=ll_targets,
                                                absolute_normalization=True)
        for i, sample in enumerate(samples):
            if self.multiple_choice_metrics:
                sample["absolute_log_prob"] = absolute_log_probs[i]
                sample["normalized_log_prob"] = normalized_log_probs[i]
                sample["log_prob"] = log_probs[i]
            if self.generative_metrics:
                sample["generative_response"] = generative_responses[i]
            if self.scoring_metrics and generate_probs_for_targets:
                sample["log_prob_gen"] = log_probs_gen[i]

        results = {}

        if not score:
            return results

        result = metrics.measure_multiple_choice_metrics(
            samples, self.multiple_choice_metrics)
        results.update(result)

        result = metrics.measure_generative_metrics(samples,
                                                    self.generative_metrics)
        results.update(result)

        if generate_probs_for_targets:
            result = metrics.measure_scoring_metrics(samples,
                                                     self.scoring_metrics)
            results.update(result)

        return results
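
The multiple-choice path in this version keeps both absolute and normalized log probs; the normalization is a log-softmax over the answer choices. A tiny worked sketch with made-up numbers, assuming logsumexp comes from scipy.special (the import is not shown in the snippet):

import numpy as np
from scipy.special import logsumexp  # assumed source of the snippet's logsumexp

# Hypothetical absolute log probs for one example's three answer choices.
absolute_log_probs = [-2.3, -0.7, -1.6]

# Same operation as in the snippet: shifting by logsumexp makes the
# exponentiated values a probability distribution over the choices.
shift = logsumexp(absolute_log_probs)
normalized_log_probs = list(np.array(absolute_log_probs) - shift)

print(np.exp(normalized_log_probs))                  # relative choice probabilities
print(float(np.sum(np.exp(normalized_log_probs))))   # ~= 1.0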