def calculate_metric(self):
    scores_list: List[float] = []
    preds_list: List[int] = []
    targets_list: List[int] = []
    for (scores, preds, targets) in zip(
        self.all_scores, self.all_preds, self.all_targets
    ):
        # Drop padded positions before accumulating per-token predictions.
        non_pad_idxs = [
            idx for (idx, target) in enumerate(targets) if target != self.pad_index
        ]
        scores = [scores[idx] for idx in non_pad_idxs]
        preds = [preds[idx] for idx in non_pad_idxs]
        targets = [targets[idx] for idx in non_pad_idxs]
        assert len(scores) == len(preds) == len(targets)
        scores_list.extend(scores)
        preds_list.extend(preds)
        targets_list.extend(targets)
    label_predictions: List[LabelPrediction] = [
        LabelPrediction(scores, pred, target)
        for (scores, pred, target) in zip(scores_list, preds_list, targets_list)
    ]
    calibration_metrics = compute_calibration(label_predictions)
    return calibration_metrics
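# Standalone sketch (not the reporter's code): shows how padded positions are
# filtered out before LabelPrediction triples are built for calibration. The
# toy batch and the PAD_INDEX value below are made up for illustration.
PAD_INDEX = -1

batch_scores = [[[0.9, 0.1], [0.2, 0.8], [0.5, 0.5]]]
batch_preds = [[0, 1, 0]]
batch_targets = [[0, 1, PAD_INDEX]]  # last position is padding

flat = []
for scores, preds, targets in zip(batch_scores, batch_preds, batch_targets):
    keep = [idx for idx, target in enumerate(targets) if target != PAD_INDEX]
    flat.extend((scores[idx], preds[idx], targets[idx]) for idx in keep)

# flat == [([0.9, 0.1], 0, 0), ([0.2, 0.8], 1, 1)]; these triples are what get
# wrapped in LabelPrediction and passed to compute_calibration above.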
def calculate_metric(self):
    return compute_classification_metrics(
        [
            LabelPrediction(scores, pred, expect)
            for scores, pred, expect in zip(
                self.all_scores, self.all_preds, self.all_targets
            )
        ],
        self.label_names,
        self.calculate_loss(),
    )
def calculate_metric(self):
    return compute_classification_metrics(
        list(
            itertools.chain.from_iterable(
                (LabelPrediction(s, p, e) for s, p, e in zip(scores, pred, expect))
                for scores, pred, expect in zip(
                    self.all_scores, self.all_preds, self.all_targets
                )
            )
        ),
        self.label_names,
        self.calculate_loss(),
    )
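# Standalone sketch: each element of all_preds/all_targets in the reporter
# above is a per-token sequence, so chain.from_iterable flattens the nested
# generators into one stream of (scores, pred, target) triples. The toy
# values below are made up for illustration.
import itertools

all_preds = [[0, 1], [1]]          # two examples, per-token predictions
all_targets = [[0, 1], [0]]
all_scores = [[[0.9, 0.1], [0.2, 0.8]], [[0.4, 0.6]]]

flat = list(
    itertools.chain.from_iterable(
        zip(scores, preds, targets)
        for scores, preds, targets in zip(all_scores, all_preds, all_targets)
    )
)
# flat == [([0.9, 0.1], 0, 0), ([0.2, 0.8], 1, 1), ([0.4, 0.6], 1, 0)]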
def calculate_metric(self):
    list_score_pred_expect = []
    for label_idx, _ in enumerate(self.label_names):
        list_score_pred_expect.append(
            list(
                itertools.chain.from_iterable(
                    (
                        LabelPrediction(s, p, e)
                        for s, p, e in zip(
                            scores[label_idx], pred[label_idx], expect[label_idx]
                        )
                        if e != self.pad_idx[label_idx]
                    )
                    for scores, pred, expect in zip(
                        self.all_scores, self.all_preds, self.all_targets
                    )
                )
            )
        )
    metrics = compute_multi_label_multi_class_soft_metrics(
        list_score_pred_expect, self.label_names, self.label_vocabs
    )
    return metrics
def calculate_metric(self):
    # If we are running in memory efficient mode, then scores in
    # LabelPrediction should be an empty list
    label_predictions = [
        LabelPrediction(scores, pred, expect)
        for scores, pred, expect in zip_longest(
            self.all_scores, self.all_preds, self.all_targets, fillvalue=[]
        )
    ]
    return compute_classification_metrics(
        label_predictions,
        self.label_names,
        self.calculate_loss(),
        # Compute soft-metrics only if self.is_memory_efficient is False
        average_precisions=(not self.is_memory_efficient),
        recall_at_precision_thresholds=self.recall_at_precision_thresholds,
    )
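# Standalone sketch: in memory-efficient mode the reporter keeps no scores, so
# all_scores is empty and zip_longest pads each prediction with an empty score
# list. The toy values below are made up for illustration.
from itertools import zip_longest

all_scores = []          # not collected in memory-efficient mode
all_preds = [1, 0, 1]
all_targets = [1, 1, 1]

rows = list(zip_longest(all_scores, all_preds, all_targets, fillvalue=[]))
# rows == [([], 1, 1), ([], 0, 1), ([], 1, 1)] -- every scores slot is []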
def calculate_metric(self):
    list_score_pred_expect = []
    for label_idx in range(len(self.label_names)):
        list_score_pred_expect.append(
            list(
                itertools.chain.from_iterable(
                    (
                        LabelPrediction(s, p, e)
                        for s, p, e in zip(scores, pred, expect)
                        if e != self.pad_idx[label_idx]
                    )
                    for scores, pred, expect in zip(
                        self.all_scores[label_idx],
                        self.all_preds[label_idx],
                        self.all_targets[label_idx],
                    )
                )
            )
        )
    metrics = compute_multi_label_multi_class_soft_metrics(
        list_score_pred_expect,
        self.label_names,
        self.label_vocabs,
        self.calculate_loss(),
    )
    return metrics
def compute_length_metrics(
    all_target_lens: List[int],
    all_target_length_preds: List[List[int]],
    select_length_beam,
    log_per_label_metrics: bool = True,
):
    length_metrics = {}
    length_report = {}
    if all_target_length_preds:
        all_length_pred_agg = {}
        beam = len(all_target_length_preds[0])
        for i in range(beam):
            all_length_pred_agg[i] = []
        for label, preds in zip(all_target_lens, all_target_length_preds):
            for l in range(beam):
                # Count the prediction as correct at beam position l if the
                # gold length appears in the top-(l + 1) candidates; otherwise
                # fall back to the top candidate.
                if label in preds[0 : l + 1]:
                    all_length_pred_agg[l].append(label)
                else:
                    all_length_pred_agg[l].append(preds[0])
        for i in range(beam):
            length_metrics[i] = accuracy_score(all_target_lens, all_length_pred_agg[i])
        max_len = max(all_target_lens + all_length_pred_agg[select_length_beam])
        all_pairs = [
            LabelPrediction(
                [1 if idx == pred else 0 for idx in range(max_len + 1)], pred, expect
            )
            for pred, expect in zip(
                all_length_pred_agg[select_length_beam], all_target_lens
            )
        ]
        length_report = compute_classification_metrics(
            all_pairs,
            [str(l) for l in range(max_len + 1)],
            0.0,  # Placeholder loss
            log_per_label_metrics=log_per_label_metrics,
        )
    return length_metrics, length_report
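# Standalone sketch of the beam aggregation above: at beam position l a
# prediction counts as correct if the gold length appears anywhere in the
# top-(l + 1) candidates; otherwise the top candidate is used. The toy values
# below are made up for illustration.
all_target_lens = [3, 5]
all_target_length_preds = [[3, 4], [4, 5]]   # beam size 2

beam = len(all_target_length_preds[0])
agg = {l: [] for l in range(beam)}
for label, preds in zip(all_target_lens, all_target_length_preds):
    for l in range(beam):
        agg[l].append(label if label in preds[: l + 1] else preds[0])

# agg == {0: [3, 4], 1: [3, 5]} -- beam 0 accuracy is 0.5, beam 1 accuracy is 1.0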
def calculate_metric(self):
    all_rows = zip(
        self.all_context[self.ROW_INDEX],
        self.all_context[self.ANSWERS_COLUMN],
        self.all_context[self.QUES_COLUMN],
        self.all_context[self.DOC_COLUMN],
        self.all_pred_answers,
        self.all_start_pos_preds,
        self.all_end_pos_preds,
        self.all_has_answer_preds,
        self.all_start_pos_targets,
        self.all_end_pos_targets,
        self.all_has_answer_targets,
        self.all_start_pos_scores,
        self.all_end_pos_scores,
        self.all_has_answer_scores,
    )
    # Group rows by row index: a long document is split into multiple
    # passages, so one question can produce several candidate rows.
    all_rows_dict = {}
    for row in all_rows:
        try:
            all_rows_dict[row[0]].append(row)
        except KeyError:
            all_rows_dict[row[0]] = [row]
    # Keep only the candidate with the highest combined start/end position
    # score for each question, then restore the original row order.
    all_rows = []
    for rows in all_rows_dict.values():
        argmax = np.argmax([row[11] + row[12] for row in rows])
        all_rows.append(rows[argmax])
    all_rows = sorted(all_rows, key=lambda x: int(x[0]))
    (
        self.all_context[self.ROW_INDEX],
        self.all_context[self.ANSWERS_COLUMN],
        self.all_context[self.QUES_COLUMN],
        self.all_context[self.DOC_COLUMN],
        self.all_pred_answers,
        self.all_start_pos_preds,
        self.all_end_pos_preds,
        self.all_has_answer_preds,
        self.all_start_pos_targets,
        self.all_end_pos_targets,
        self.all_has_answer_targets,
        self.all_start_pos_scores,
        self.all_end_pos_scores,
        self.all_has_answer_scores,
    ) = zip(*all_rows)
    exact_matches = self._compute_exact_matches(
        self.all_pred_answers,
        self.all_context[self.ANSWERS_COLUMN],
        self.all_has_answer_preds,
        self.all_has_answer_targets,
    )
    f1_score = self._compute_f1_score(
        self.all_pred_answers,
        self.all_context[self.ANSWERS_COLUMN],
        self.all_has_answer_preds,
        self.all_has_answer_targets,
    )
    count = len(self.all_has_answer_preds)
    self.all_preds = (
        self.all_pred_answers,
        self.all_start_pos_preds,
        self.all_end_pos_preds,
        self.all_has_answer_preds,
    )
    self.all_targets = (
        self.all_context[self.ANSWERS_COLUMN],
        self.all_start_pos_targets,
        self.all_end_pos_targets,
        self.all_has_answer_targets,
    )
    self.all_scores = (
        self.all_start_pos_scores,
        self.all_end_pos_scores,
        self.all_has_answer_scores,
    )
    label_predictions = None
    if not self.ignore_impossible:
        label_predictions = [
            LabelPrediction(scores, pred, expect)
            for scores, pred, expect in zip_longest(
                self.all_has_answer_scores,
                self.all_has_answer_preds,
                self.all_has_answer_targets,
                fillvalue=[],
            )
        ]
    metrics = SquadMetrics(
        exact_matches=100.0 * exact_matches / count,
        f1_score=100.0 * f1_score / count,
        num_examples=count,
        classification_metrics=compute_classification_metrics(
            label_predictions,
            self.has_answer_labels,
            self.calculate_loss(),
        )
        if label_predictions
        else None,
    )
    return metrics
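# Standalone sketch of the passage de-duplication above: rows sharing a row
# index come from different passages of the same question, and the row with
# the highest combined start/end score wins. The indices and scores below are
# made up for illustration; only (row_index, start_score, end_score) is shown.
import numpy as np

rows = [
    ("0", 0.2, 0.3),   # question 0, passage A
    ("0", 0.6, 0.5),   # question 0, passage B (higher combined score)
    ("1", 0.4, 0.4),   # question 1, single passage
]

by_index = {}
for row in rows:
    by_index.setdefault(row[0], []).append(row)

best = []
for candidates in by_index.values():
    argmax = np.argmax([r[1] + r[2] for r in candidates])
    best.append(candidates[argmax])
best = sorted(best, key=lambda r: int(r[0]))

# best == [("0", 0.6, 0.5), ("1", 0.4, 0.4)]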
from pytext.metrics import (
    ClassificationMetrics,
    LabelPrediction,
    MacroPRF1Metrics,
    MacroPRF1Scores,
    PRF1Scores,
    SoftClassificationMetrics,
    compute_classification_metrics,
    compute_soft_metrics,
)
from pytext.metrics.tests.metrics_test_base import MetricsTestBase


LABEL_NAMES1 = ["label1", "label2", "label3"]
PREDICTIONS1 = [
    LabelPrediction(scores, predicted, expected)
    for scores, predicted, expected in [
        ([0.5, 0.3, 0.2], 0, 0),
        ([0.1, 0.8, 0.1], 1, 0),
        ([0.3, 0.6, 0.1], 1, 1),
        ([0.2, 0.1, 0.7], 2, 1),
    ]
]

LABEL_NAMES2 = ["label1", "label2"]
PREDICTIONS2 = [
    LabelPrediction(scores, predicted, expected)
    for scores, predicted, expected in [
        ([0.4, 0.6], 1, 0),
        ([0.3, 0.2], 0, 0),
        ([0.4, 0.8], 1, 1),