def evaluate_fixed_shot(self, model, num_shots) -> Dict[str, Any]:
  """Evaluates the task on `model` with a fixed number of shots.

  Args:
    model: the language model; must expose `cond_log_prob` and
      `generate_text`.
    num_shots: number of few-shot examples to prepend to each input.

  Returns:
    Dictionary mapping metric names to their measured values.
  """
  if num_shots:
    samples = make_nshot_dataset(
        self._ds, num_shots, separator=self.few_shot_example_separator)
  else:
    # Deep-copy so the per-sample annotations below don't mutate self._ds.
    samples = copy.deepcopy(self._ds)

  inputs = [self.task_prefix + sample["input"] for sample in samples]
  if self.verbose:
    print(
        f"Example of formatted input for {num_shots} shots: {inputs[0]}"
    )

  if self.multiple_choice_metrics:
    targets = [sample["choice"] for sample in samples]
    if self.verbose:
      print(
          f"Example of targets for multiple choice evaluation: {','.join(targets[0])}"
      )
    log_probs = model.cond_log_prob(inputs=inputs, targets=targets)

  if self.generative_metrics:
    generative_responses = model.generate_text(
        inputs=inputs,
        stop_string=self.stop_string,
        output_regex=self.output_regex,
    )

  # Attach model outputs to each sample so the metric functions can read them.
  for i, sample in enumerate(samples):
    if self.multiple_choice_metrics:
      sample["log_prob"] = log_probs[i]
    if self.generative_metrics:
      sample["generative_response"] = generative_responses[i]

  results = {}
  results.update(
      metrics.measure_multiple_choice_metrics(samples,
                                              self.multiple_choice_metrics))
  results.update(
      metrics.measure_generative_metrics(samples, self.generative_metrics))
  return results
def gen_metric_fn(targets: Sequence[Mapping[str, Any]],
                  predictions: Sequence[str]) -> Mapping[str, Any]:
  """Measures the task's generative metrics over targets and predictions."""
  task = BigBenchJsonTaskFetcher.get_bigbench_json_task_or_subtask(
      task_name=task_name,
      task_path=task_path,
      subtask_name=subtask_name,
      json_util=json_util)
  # The task prefix length is needed to strip the prefix off each example.
  prefix_len = len(task.task_prefix)
  eval_samples = get_samples_for_gen_eval_examples(targets, predictions,
                                                   prefix_len)
  return task_metrics.measure_generative_metrics(eval_samples,
                                                 task.generative_metrics)
def evaluate_fixed_shot(self, model, num_shots) -> Dict[str, Any]:
  """Evaluates the task on `model` with a fixed number of shots.

  Args:
    model: the language model; must expose `cond_log_prob` and
      `generate_text`.
    num_shots: number of few-shot examples to prepend to each input.

  Returns:
    Dictionary mapping metric names to their measured values.
  """
  if num_shots:
    samples = make_nshot_dataset(self._ds, num_shots)
  else:
    # Deep-copy so the per-sample annotations below don't mutate self._ds.
    samples = copy.deepcopy(self._ds)

  inputs = [self.task_prefix + sample["input"] for sample in samples]

  if self.multiple_choice_metrics:
    targets = [sample["choice"] for sample in samples]
    log_probs = model.cond_log_prob(inputs=inputs, targets=targets)
  if self.generative_metrics:
    generative_responses = model.generate_text(inputs=inputs)

  # Attach model outputs to each sample so the metric functions can read them.
  for i, sample in enumerate(samples):
    if self.multiple_choice_metrics:
      sample["log_prob"] = log_probs[i]
    if self.generative_metrics:
      sample["generative_response"] = generative_responses[i]

  results = {}
  results.update(
      metrics.measure_multiple_choice_metrics(samples,
                                              self.multiple_choice_metrics))
  results.update(
      metrics.measure_generative_metrics(samples, self.generative_metrics))
  return results
def evaluate_fixed_shot(self, model, num_shots,
                        score: bool = True) -> Dict[str, Any]:
  """Evaluates the task on `model` with a fixed number of shots.

  Args:
    model: the language model; must expose `cond_log_prob` and
      `generate_text`.
    num_shots: number of few-shot examples to prepend to each input.
    score: if False, skips scoring, which is useful for testing

  Returns:
    Dictionary mapping metric names to their measured values (empty when
    `score` is False).
  """
  if num_shots:
    # NOTE(review): the shot builder caps at self.num_examples while the
    # zero-shot branch below caps at self.max_examples — confirm this
    # asymmetry is intentional.
    samples = make_nshot_dataset(
        self._ds,
        num_shots,
        rng=self.rng,
        separator=self.few_shot_example_separator,
        max_examples=self.num_examples,
    )
  elif self.max_examples:
    # Zero-shot with a cap: sample without replacement, then deep-copy so
    # the per-sample annotations below don't mutate self._ds.
    subset_size = min(self.max_examples, len(self._ds))
    samples = copy.deepcopy(
        self.rng.choice(self._ds, subset_size, replace=False))
  else:
    samples = copy.deepcopy(self._ds)

  # Keep the evaluated samples around, keyed by shot count, for inspection.
  self.shot_examples[num_shots] = samples

  inputs = [self.task_prefix + sample["input"] for sample in samples]
  if self.verbose:
    print(
        f"Example of formatted input for {num_shots} shots: {inputs[0]}"
    )

  if self.multiple_choice_metrics:
    targets = [sample["choice"] for sample in samples]
    if self.verbose:
      print(
          f"Example of targets for multiple choice evaluation: {','.join(targets[0])}"
      )
    log_probs = model.cond_log_prob(
        inputs=inputs,
        targets=targets,
        absolute_normalization=self.absolute_normalization,
    )

  if self.generative_metrics:
    generative_responses = model.generate_text(
        inputs=inputs,
        stop_string=self.stop_string,
        output_regex=self.output_regex,
    )

  # Attach model outputs to each sample so the metric functions can read them.
  for idx, sample in enumerate(samples):
    if self.multiple_choice_metrics:
      sample["log_prob"] = log_probs[idx]
    if self.generative_metrics:
      sample["generative_response"] = generative_responses[idx]

  results = {}
  if not score:
    return results
  results.update(
      metrics.measure_multiple_choice_metrics(samples,
                                              self.multiple_choice_metrics))
  results.update(
      metrics.measure_generative_metrics(samples, self.generative_metrics))
  return results
def evaluate_fixed_shot(
    self,
    model,
    num_shots,
    score: bool = True,
    random_seed: int = 42,
    generate_probs_for_targets: bool = True) -> Dict[str, Any]:
  """Evaluates the task on `model` with a fixed number of shots.

  Args:
    model: the language model; must expose `cond_log_prob` and
      `generate_text`.
    num_shots: number of few-shot examples to prepend to each input.
    score: if False, skips scoring, which is useful for testing
    random_seed: random number generator seed for building the n-shot set.
    generate_probs_for_targets: if False, skips generating model cond log
      probs for generative targets

  Returns:
    Dictionary mapping metric names to their measured values (empty when
    `score` is False).
  """
  samples = make_nshot_dataset(
      self._ds,
      num_shots,
      random_seed=random_seed,
      separator=self.few_shot_example_separator,
      max_examples=self.num_examples,
  )
  # Keep the evaluated samples around, keyed by shot count, for inspection.
  self.shot_examples[num_shots] = samples

  inputs = [self.task_prefix + sample["input"] for sample in samples]
  if self.verbose:
    print(
        f"Example of formatted input for {num_shots} shots: {inputs[0]}"
    )

  if self.multiple_choice_metrics:
    targets = [sample["choice"] for sample in samples]
    if self.verbose:
      print(
          f"Example of targets for multiple choice evaluation: {','.join(targets[0])}"
      )
    # Always query absolute log probs; the normalized variant is derived
    # below so both can be recorded on every sample.
    absolute_log_probs = model.cond_log_prob(
        inputs=inputs,
        targets=targets,
        absolute_normalization=True,
    )
    # Some models return a flat list for a single input; re-wrap so the
    # per-sample indexing below works uniformly.
    if len(absolute_log_probs[0]) == 1 and len(inputs) == 1:
      absolute_log_probs = [absolute_log_probs]
    # Normalize each sample's choice log-probs (softmax in log space).
    normalized_log_probs = [
        list(np.array(x) - logsumexp(x)) for x in absolute_log_probs
    ]
    if self.absolute_normalization:
      log_probs = absolute_log_probs
    else:
      log_probs = normalized_log_probs

  if self.generative_metrics:
    generative_responses = model.generate_text(
        inputs=inputs,
        stop_string=self.stop_string,
        output_regex=self.output_regex,
    )
    # A single-input call may come back as a bare string; wrap for indexing.
    if isinstance(generative_responses, str) and len(inputs) == 1:
      generative_responses = [generative_responses]

  if self.scoring_metrics and generate_probs_for_targets:
    ll_targets = [sample["target"] for sample in samples]
    log_probs_gen = model.cond_log_prob(
        inputs=inputs, targets=ll_targets, absolute_normalization=True)

  # Attach model outputs to each sample so the metric functions can read them.
  for i, sample in enumerate(samples):
    if self.multiple_choice_metrics:
      sample["absolute_log_prob"] = absolute_log_probs[i]
      sample["normalized_log_prob"] = normalized_log_probs[i]
      sample["log_prob"] = log_probs[i]
    if self.generative_metrics:
      sample["generative_response"] = generative_responses[i]
    if self.scoring_metrics and generate_probs_for_targets:
      sample["log_prob_gen"] = log_probs_gen[i]

  results = {}
  if not score:
    return results
  results.update(
      metrics.measure_multiple_choice_metrics(samples,
                                              self.multiple_choice_metrics))
  results.update(
      metrics.measure_generative_metrics(samples, self.generative_metrics))
  # Guard on self.scoring_metrics as well, matching where "log_prob_gen" is
  # populated above; the call previously ran even with no scoring metrics
  # configured.
  if self.scoring_metrics and generate_probs_for_targets:
    results.update(
        metrics.measure_scoring_metrics(samples, self.scoring_metrics))
  return results