def compute_metrics(pred_labels) -> Dict:
    preds, labels = zip(*pred_labels)
    preds = utils.expand_like(preds)
    labels = utils.expand_like(labels)
    # Regression tasks emit a single value; classification tasks take the argmax.
    preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)
    if self.hparams.finetuning_task is not None:
        result = metric.compute(predictions=preds, references=labels)
        # When the metric returns several scores, also report their mean.
        if len(result) > 1:
            result["combined_score"] = np.mean(list(result.values())).item()
        return result
    elif is_regression:
        return {"mse": ((preds - labels) ** 2).mean().item()}
    else:
        return {"accuracy": (preds == labels).astype(np.float32).mean().item()}
def compute_metrics(label_list: List[Any], pred_labels: List[Any]) -> Dict:
    predictions, labels = zip(*pred_labels)
    predictions = utils.expand_like(predictions)
    predictions = np.argmax(predictions, axis=2)
    labels = utils.expand_like(labels)
    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[pr] for (pr, la) in zip(prediction, label) if la != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[la] for (pr, la) in zip(prediction, label) if la != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return {
        "accuracy_score": seq_metrics.accuracy_score(true_labels, true_predictions),
        "precision": seq_metrics.precision_score(true_labels, true_predictions),
        "recall": seq_metrics.recall_score(true_labels, true_predictions),
        "f1": seq_metrics.f1_score(true_labels, true_predictions),
    }
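# A minimal usage sketch for the metrics call above, assuming `seq_metrics` is an
# alias for `seqeval.metrics` (an assumption; the import is not shown in these
# snippets). seqeval expects nested lists of IOB tag strings, one inner list per
# sentence, which is exactly the shape of the true_labels/true_predictions built
# above once the -100 padding positions are filtered out.
def _example_seqeval_usage() -> None:
    from seqeval import metrics as seq_metrics

    true_labels = [["O", "B-PER", "I-PER", "O"], ["B-LOC", "O"]]
    true_predictions = [["O", "B-PER", "I-PER", "O"], ["B-LOC", "O"]]
    # Identical predictions and references give perfect scores.
    assert seq_metrics.accuracy_score(true_labels, true_predictions) == 1.0
    assert seq_metrics.f1_score(true_labels, true_predictions) == 1.0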
def compute_metrics(
    data_config,
    column_names,
    post_processing_function,
    raw_datasets,
    tokenized_datasets,
    model,
    metric,
    predictions,
):
    inds, predictions = zip(*predictions)
    inds = np.hstack(inds)
    sorted_inds = np.argsort(inds)
    predictions = zip(*predictions)
    predictions = [utils.expand_like(p) for p in predictions]
    predictions = [p[sorted_inds] for p in predictions]
    # We need to add back in columns needed for validation.
    tokenized_datasets["validation"].set_format(
        type=tokenized_datasets["validation"].format["type"],
        columns=list(tokenized_datasets["validation"].features.keys()),
    )
    output = post_processing_function(
        examples=raw_datasets["validation"],
        features=tokenized_datasets["validation"],
        predictions=predictions,
        data_args=data_config,
        column_names=column_names,
        prefix="eval",
        model=model,
    )
    result = metric.compute(predictions=output.predictions, references=output.label_ids)
    # Then remove them again so that data collation doesn't break.
    hf.remove_unused_columns(model, tokenized_datasets["validation"])
    return result
def compute_metrics(predictions):
    predictions = zip(*predictions)
    predictions = [utils.expand_like(p) for p in predictions]
    # We need to add back in columns needed for validation.
    self.tokenized_datasets["validation"].set_format(
        type=self.tokenized_datasets["validation"].format["type"],
        columns=list(self.tokenized_datasets["validation"].features.keys()),
    )
    output = self.data_processors.post_processing_function(
        examples=self.raw_datasets["validation"],
        features=self.tokenized_datasets["validation"],
        predictions=predictions,
        data_args=self.data_config,
        column_names=self.column_names,
        prefix="eval",
        model=self.model,
    )
    result = metric.compute(predictions=output.predictions, references=output.label_ids)
    # Then remove them again so that data collation doesn't break.
    hf.remove_unused_columns(self.model, self.tokenized_datasets["validation"])
    return result
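# A standalone illustration of the set_format dance used in the two functions
# above, using the HuggingFace `datasets` library; the dataset contents here are
# made up, and `_example_set_format` is a hypothetical helper, not part of the
# snippets above.
def _example_set_format() -> None:
    import datasets

    ds = datasets.Dataset.from_dict(
        {"input_ids": [[1, 2], [3, 4]], "example_id": ["a", "b"]}
    )
    # Narrow the view to model inputs only, as a data collator would expect.
    ds.set_format(type="numpy", columns=["input_ids"])
    # Re-expose every column for post-processing, mirroring the snippets above...
    ds.set_format(type=ds.format["type"], columns=list(ds.features.keys()))
    # ...which then narrow the view again afterwards so collation keeps working.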
def compute_metrics(pred_labels) -> Dict:
    preds, labels = zip(*pred_labels)
    # Pad the per-batch arrays to a common shape before scoring.
    preds = utils.expand_like(preds)
    labels = utils.expand_like(labels)
    preds = np.argmax(preds, axis=1)
    return metric.compute(predictions=preds, references=labels)
def test_expand_like() -> None:
    array_list = [np.array([[1, 2], [3, 4]]), np.array([[2, 3, 4], [3, 4, 5]])]
    result = utils.expand_like(array_list)
    assert np.array_equal(
        result,
        np.array([[1, 2, -100], [3, 4, -100], [2, 3, 4], [3, 4, 5]]),
    )
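# Every snippet above relies on `utils.expand_like`, whose implementation is not
# shown here. The following is a sketch consistent with the behavior pinned down
# by the test above (not necessarily the project's actual implementation): it
# right-pads 2-D arrays to a common width with -100 and stacks them row-wise.
import numpy as np


def expand_like_sketch(arrays, fill_value=-100):
    """Pad 2-D arrays to the widest array's width with fill_value, then stack rows."""
    max_width = max(a.shape[1] for a in arrays)
    padded = [
        np.pad(a, ((0, 0), (0, max_width - a.shape[1])), constant_values=fill_value)
        for a in arrays
    ]
    return np.concatenate(padded, axis=0)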