def test_prf1_metrics(self) -> None:
    self.assertMetricsAlmostEqual(
        compute_classification_metrics(
            PREDICTIONS1, LABEL_NAMES1, loss=2.0, average_precisions=False
        ),
        ClassificationMetrics(
            accuracy=0.5,
            macro_prf1_metrics=MacroPRF1Metrics(
                per_label_scores={
                    # label1: TP = 1, FP = 0, FN = 1
                    "label1": PRF1Scores(1, 0, 1, 1.0, 0.5, 2.0 / 3),
                    # label2: TP = 1, FP = 1, FN = 1
                    "label2": PRF1Scores(1, 1, 1, 0.5, 0.5, 0.5),
                    # label3: TP = 0, FP = 1, FN = 0
                    "label3": PRF1Scores(0, 1, 0, 0.0, 0.0, 0.0),
                },
                macro_scores=MacroPRF1Scores(3, 0.5, 1.0 / 3, 7.0 / 18),
            ),
            per_label_soft_scores=None,
            mcc=None,
            roc_auc=None,
            loss=2.0,
        ),
    )
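# A quick sanity sketch (illustrative, not part of the test suite) of how the
# macro scores asserted above fall out of the per-label values: macro
# precision/recall/F1 are unweighted means across labels. The helper name
# below is made up for illustration.
import math

def _macro_average(per_label_prf1):
    # Average (precision, recall, f1) tuples across labels with equal weight.
    n = len(per_label_prf1)
    return tuple(sum(values) / n for values in zip(*per_label_prf1))

_per_label = [(1.0, 0.5, 2.0 / 3), (0.5, 0.5, 0.5), (0.0, 0.0, 0.0)]
_precision, _recall, _f1 = _macro_average(_per_label)
assert math.isclose(_precision, 0.5)
assert math.isclose(_recall, 1.0 / 3)
assert math.isclose(_f1, 7.0 / 18)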
def test_compute_mcc(self) -> None:
    metrics = compute_classification_metrics(PREDICTIONS2, LABEL_NAMES2, loss=5.0)
    self.assertAlmostEqual(metrics.mcc, 1.0 / 6)
    # Just to test that the metrics print without errors
    metrics.print_metrics()
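# For reference, a minimal sketch of the binary Matthews correlation
# coefficient that this test exercises; this is the standard textbook formula
# on a confusion matrix, not the library's (multiclass-capable)
# implementation.
import math

def _binary_mcc(tp: int, fp: int, tn: int, fn: int) -> float:
    numerator = tp * tn - fp * fn
    denominator = math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    # MCC is conventionally defined as 0 when any marginal is empty.
    return numerator / denominator if denominator else 0.0

assert _binary_mcc(1, 1, 1, 1) == 0.0  # predictions uncorrelated with labels
assert _binary_mcc(2, 0, 2, 0) == 1.0  # perfect predictions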
def calculate_metric(self):
    return compute_classification_metrics(
        [
            LabelPrediction(scores, pred, expect)
            for scores, pred, expect in zip(
                self.all_scores, self.all_preds, self.all_targets
            )
        ],
        self.label_names,
        self.calculate_loss(),
    )
def calculate_metric(self):
    return compute_classification_metrics(
        list(
            itertools.chain.from_iterable(
                (LabelPrediction(s, p, e) for s, p, e in zip(scores, pred, expect))
                for scores, pred, expect in zip(
                    self.all_scores, self.all_preds, self.all_targets
                )
            )
        ),
        self.label_names,
        self.calculate_loss(),
    )
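# A small illustration (toy data) of why chain.from_iterable appears above:
# unlike the flat variant before it, each example here carries a sequence of
# per-token scores/preds/targets, so the nested generators are flattened into
# a single stream of (scores, pred, target) triples.
import itertools

_nested = [([0.9, 0.1], [0, 1], [0, 1]), ([0.2], [1], [0])]
_flat = list(
    itertools.chain.from_iterable(
        ((s, p, e) for s, p, e in zip(scores, preds, targets))
        for scores, preds, targets in _nested
    )
)
assert _flat == [(0.9, 0, 0), (0.1, 1, 1), (0.2, 1, 0)]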
def calculate_metric(self):
    # If we are running in memory-efficient mode, then scores in
    # LabelPrediction should be an empty list.
    label_predictions = [
        LabelPrediction(scores, pred, expect)
        for scores, pred, expect in zip_longest(
            self.all_scores, self.all_preds, self.all_targets, fillvalue=[]
        )
    ]
    return compute_classification_metrics(
        label_predictions,
        self.label_names,
        self.calculate_loss(),
        # Compute soft metrics only if self.is_memory_efficient is False.
        average_precisions=(not self.is_memory_efficient),
        recall_at_precision_thresholds=self.recall_at_precision_thresholds,
    )
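# A quick illustration (toy data) of the zip_longest trick above: when
# self.all_scores is empty in memory-efficient mode, fillvalue=[] substitutes
# an empty score list for every example instead of truncating the zip to zero
# rows, as a plain zip() would.
from itertools import zip_longest

_scores, _preds, _targets = [], [1, 0], [1, 1]
_rows = list(zip_longest(_scores, _preds, _targets, fillvalue=[]))
assert _rows == [([], 1, 1), ([], 0, 1)]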
def compute_length_metrics(
    all_target_lens: List[int],
    all_target_length_preds: List[List[int]],
    select_length_beam: int,
    log_per_label_metrics: bool = True,
):
    length_metrics = {}
    length_report = {}
    if all_target_length_preds:
        beam = len(all_target_length_preds[0])
        # For each beam width k, score the prediction as the target length if
        # the target appears anywhere in the top-k candidates; otherwise fall
        # back to the top-1 prediction.
        all_length_pred_agg = {i: [] for i in range(beam)}
        for label, preds in zip(all_target_lens, all_target_length_preds):
            for k in range(beam):
                if label in preds[: k + 1]:
                    all_length_pred_agg[k].append(label)
                else:
                    all_length_pred_agg[k].append(preds[0])
        for i in range(beam):
            length_metrics[i] = accuracy_score(all_target_lens, all_length_pred_agg[i])
        # Build one-hot score vectors over all observed lengths so the selected
        # beam can be scored with the standard classification metrics.
        max_len = max(all_target_lens + all_length_pred_agg[select_length_beam])
        all_pairs = [
            LabelPrediction(
                [1 if idx == pred else 0 for idx in range(max_len + 1)], pred, expect
            )
            for pred, expect in zip(
                all_length_pred_agg[select_length_beam], all_target_lens
            )
        ]
        length_report = compute_classification_metrics(
            all_pairs,
            [str(length) for length in range(max_len + 1)],
            0.0,  # placeholder loss
            log_per_label_metrics=log_per_label_metrics,
        )
    return length_metrics, length_report
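# A toy walk-through (invented data) of the beam aggregation above: at beam
# width k, a length prediction counts as correct when the target appears in
# the top-k candidates, and falls back to the top-1 prediction otherwise.
_target_lens = [3, 5]
_length_preds = [[3, 4], [4, 5]]  # top-2 beam per example

_agg_k1 = [t if t in p[:1] else p[0] for t, p in zip(_target_lens, _length_preds)]
_agg_k2 = [t if t in p[:2] else p[0] for t, p in zip(_target_lens, _length_preds)]
assert _agg_k1 == [3, 4]  # second example misses at k=1 -> accuracy 0.5
assert _agg_k2 == [3, 5]  # both hit within the top 2  -> accuracy 1.0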
def calculate_metric(self):
    all_rows = zip(
        self.all_context[self.ROW_INDEX],
        self.all_context[self.ANSWERS_COLUMN],
        self.all_context[self.QUES_COLUMN],
        self.all_context[self.DOC_COLUMN],
        self.all_pred_answers,
        self.all_start_pos_preds,
        self.all_end_pos_preds,
        self.all_has_answer_preds,
        self.all_start_pos_targets,
        self.all_end_pos_targets,
        self.all_has_answer_targets,
        self.all_start_pos_scores,
        self.all_end_pos_scores,
        self.all_has_answer_scores,
    )
    # Group candidate rows by row index, then keep only the candidate with
    # the highest combined start/end position score for each question.
    all_rows_dict = {}
    for row in all_rows:
        all_rows_dict.setdefault(row[0], []).append(row)
    all_rows = []
    for rows in all_rows_dict.values():
        argmax = np.argmax([row[11] + row[12] for row in rows])
        all_rows.append(rows[argmax])
    # Restore the original example order by row index.
    all_rows = sorted(all_rows, key=lambda x: int(x[0]))
    (
        self.all_context[self.ROW_INDEX],
        self.all_context[self.ANSWERS_COLUMN],
        self.all_context[self.QUES_COLUMN],
        self.all_context[self.DOC_COLUMN],
        self.all_pred_answers,
        self.all_start_pos_preds,
        self.all_end_pos_preds,
        self.all_has_answer_preds,
        self.all_start_pos_targets,
        self.all_end_pos_targets,
        self.all_has_answer_targets,
        self.all_start_pos_scores,
        self.all_end_pos_scores,
        self.all_has_answer_scores,
    ) = zip(*all_rows)
    exact_matches = self._compute_exact_matches(
        self.all_pred_answers,
        self.all_context[self.ANSWERS_COLUMN],
        self.all_has_answer_preds,
        self.all_has_answer_targets,
    )
    f1_score = self._compute_f1_score(
        self.all_pred_answers,
        self.all_context[self.ANSWERS_COLUMN],
        self.all_has_answer_preds,
        self.all_has_answer_targets,
    )
    count = len(self.all_has_answer_preds)
    self.all_preds = (
        self.all_pred_answers,
        self.all_start_pos_preds,
        self.all_end_pos_preds,
        self.all_has_answer_preds,
    )
    self.all_targets = (
        self.all_context[self.ANSWERS_COLUMN],
        self.all_start_pos_targets,
        self.all_end_pos_targets,
        self.all_has_answer_targets,
    )
    self.all_scores = (
        self.all_start_pos_scores,
        self.all_end_pos_scores,
        self.all_has_answer_scores,
    )
    label_predictions = None
    if not self.ignore_impossible:
        label_predictions = [
            LabelPrediction(scores, pred, expect)
            for scores, pred, expect in zip_longest(
                self.all_has_answer_scores,
                self.all_has_answer_preds,
                self.all_has_answer_targets,
                fillvalue=[],
            )
        ]
    metrics = SquadMetrics(
        exact_matches=100.0 * exact_matches / count,
        f1_score=100.0 * f1_score / count,
        num_examples=count,
        classification_metrics=compute_classification_metrics(
            label_predictions,
            self.has_answer_labels,
            self.calculate_loss(),
        )
        if label_predictions
        else None,
    )
    return metrics
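# A compact sketch (hypothetical rows of (row_index, start_score, end_score))
# of the candidate-selection step above: rows are grouped by their question's
# row index, and only the candidate with the highest combined start/end score
# survives.
import numpy as np

_rows = [("0", 0.3, 0.2), ("0", 0.6, 0.5), ("1", 0.1, 0.9)]
_by_question = {}
for _row in _rows:
    _by_question.setdefault(_row[0], []).append(_row)
_best = [
    cands[int(np.argmax([r[1] + r[2] for r in cands]))]
    for cands in _by_question.values()
]
assert sorted(_best, key=lambda x: int(x[0])) == [("0", 0.6, 0.5), ("1", 0.1, 0.9)]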
def test_compute_roc_auc(self) -> None:
    metrics = compute_classification_metrics(PREDICTIONS2, LABEL_NAMES2, loss=5.0)
    self.assertAlmostEqual(metrics.roc_auc, 1.0 / 6)
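# For context, a minimal sketch of what roc_auc measures, using
# scikit-learn's reference implementation on toy data (not PREDICTIONS2,
# which is defined elsewhere in this test module).
from sklearn.metrics import roc_auc_score

_y_true = [0, 0, 1, 1]
_y_score = [0.1, 0.4, 0.35, 0.8]
assert abs(roc_auc_score(_y_true, _y_score) - 0.75) < 1e-9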