def calculate_metric(self):
    scores_list: List[float] = []
    preds_list: List[int] = []
    targets_list: List[int] = []

    for (scores, preds, targets) in zip(
        self.all_scores, self.all_preds, self.all_targets
    ):
        # Drop positions whose target is the padding index, keeping
        # scores, predictions, and targets aligned.
        non_pad_idxs = [
            idx for (idx, target) in enumerate(targets) if target != self.pad_index
        ]

        scores = [scores[idx] for idx in non_pad_idxs]
        preds = [preds[idx] for idx in non_pad_idxs]
        targets = [targets[idx] for idx in non_pad_idxs]

        assert len(scores) == len(preds) == len(targets)

        scores_list.extend(scores)
        preds_list.extend(preds)
        targets_list.extend(targets)

    label_predictions: List[LabelPrediction] = [
        LabelPrediction(scores, pred, target)
        for (scores, pred, target) in zip(scores_list, preds_list, targets_list)
    ]

    calibration_metrics = compute_calibration(label_predictions)

    return calibration_metrics
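
For intuition, here is a small self-contained sketch of the padding filter used above, run on made-up values (the pad index, scores, and targets below are illustrative only and do not come from the original module):

# Hypothetical toy inputs for one example in the batch.
pad_index = -1
targets = [2, -1, 0, -1]                      # two padded positions
preds = [2, 1, 0, 0]
scores = [[0.1, 0.2, 0.7], [0.5, 0.3, 0.2],
          [0.8, 0.1, 0.1], [0.6, 0.3, 0.1]]

# Same filtering step as in calculate_metric: build one index list and use it
# for scores, preds, and targets so the three lists stay aligned.
non_pad_idxs = [idx for idx, target in enumerate(targets) if target != pad_index]
assert non_pad_idxs == [0, 2]
assert [targets[idx] for idx in non_pad_idxs] == [2, 0]
assert [scores[idx] for idx in non_pad_idxs] == [[0.1, 0.2, 0.7], [0.8, 0.1, 0.1]]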
Example #2
def calculate_metric(self):
    return compute_classification_metrics(
        [
            LabelPrediction(scores, pred, expect)
            for scores, pred, expect in zip(
                self.all_scores, self.all_preds, self.all_targets
            )
        ],
        self.label_names,
        self.calculate_loss(),
    )
Example #3
def calculate_metric(self):
    return compute_classification_metrics(
        list(
            itertools.chain.from_iterable(
                (LabelPrediction(s, p, e) for s, p, e in zip(scores, pred, expect))
                for scores, pred, expect in zip(
                    self.all_scores, self.all_preds, self.all_targets
                )
            )
        ),
        self.label_names,
        self.calculate_loss(),
    )
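
The nested generator above flattens per-sequence, per-token predictions into a single flat list. A minimal standalone sketch of the same `itertools.chain.from_iterable` pattern on dummy nested data (plain tuples stand in for `LabelPrediction` to keep the sketch self-contained):

import itertools

# Two "sequences", each with per-token scores, predictions, and targets.
all_scores = [[[0.9, 0.1], [0.2, 0.8]], [[0.6, 0.4]]]
all_preds = [[0, 1], [0]]
all_targets = [[0, 1], [1]]

flat = list(
    itertools.chain.from_iterable(
        ((s, p, e) for s, p, e in zip(scores, pred, expect))
        for scores, pred, expect in zip(all_scores, all_preds, all_targets)
    )
)
# Three token-level tuples, in order: two from the first sequence, one from the second.
assert len(flat) == 3
assert flat[2] == ([0.6, 0.4], 0, 1)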
Example #4
def calculate_metric(self):
    list_score_pred_expect = []
    for label_idx, _ in enumerate(self.label_names):
        list_score_pred_expect.append(
            list(
                itertools.chain.from_iterable(
                    (
                        LabelPrediction(s, p, e)
                        for s, p, e in zip(
                            scores[label_idx], pred[label_idx], expect[label_idx]
                        )
                        if e != self.pad_idx[label_idx]
                    )
                    for scores, pred, expect in zip(
                        self.all_scores, self.all_preds, self.all_targets
                    )
                )
            )
        )
    metrics = compute_multi_label_multi_class_soft_metrics(
        list_score_pred_expect, self.label_names, self.label_vocabs
    )
    return metrics
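
This variant builds one flat prediction list per label: the outer loop walks labels, while the inner flattening walks batch elements and tokens, skipping positions equal to that label's pad index. A rough sketch of the data layout it assumes, using only predictions and targets (shapes and values are invented for illustration; the real reporter's tensors may differ):

# Illustrative layout for two labels and one batch element.
label_names = ["slot", "intent"]
pad_idx = [0, -1]                       # one pad index per label

# Each batch element carries one entry per label; each entry is a
# per-token list of predictions / targets for that label.
all_preds = [
    [[1, 2, 0], [3]],                   # batch 0: slot preds, intent preds
]
all_targets = [
    [[1, 0, 0], [3]],                   # slot targets contain two pad (0) positions
]

# For label 0 ("slot"), positions with target == pad_idx[0] are skipped,
# so only the first token survives.
kept_slot = [
    (p, e)
    for pred, expect in zip(all_preds, all_targets)
    for p, e in zip(pred[0], expect[0])
    if e != pad_idx[0]
]
assert kept_slot == [(1, 1)]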
Example #5
def calculate_metric(self):
    # If we are running in memory efficient mode, then scores in
    # LabelPrediction should be an empty list
    label_predictions = [
        LabelPrediction(scores, pred, expect)
        for scores, pred, expect in zip_longest(
            self.all_scores, self.all_preds, self.all_targets, fillvalue=[]
        )
    ]
    return compute_classification_metrics(
        label_predictions,
        self.label_names,
        self.calculate_loss(),
        # Compute soft-metrics only if self.is_memory_efficient is False
        average_precisions=(not self.is_memory_efficient),
        recall_at_precision_thresholds=self.recall_at_precision_thresholds,
    )
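
The `zip_longest(..., fillvalue=[])` call is what makes the memory-efficient mode work: when `self.all_scores` is empty, every prediction still gets an empty score list instead of the zip stopping early. A standalone sketch of that behaviour with dummy lists:

from itertools import zip_longest

# In memory-efficient mode no scores are accumulated.
all_scores = []
all_preds = [1, 0]
all_targets = [1, 1]

rows = list(zip_longest(all_scores, all_preds, all_targets, fillvalue=[]))
# Plain zip() would yield nothing here; zip_longest pads the missing
# scores with [] so every (scores, pred, expect) triple is still emitted.
assert rows == [([], 1, 1), ([], 0, 1)]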
Example #6
def calculate_metric(self):
    list_score_pred_expect = []
    for label_idx in range(len(self.label_names)):
        list_score_pred_expect.append(
            list(
                itertools.chain.from_iterable(
                    (
                        LabelPrediction(s, p, e)
                        for s, p, e in zip(scores, pred, expect)
                        if e != self.pad_idx[label_idx]
                    )
                    for scores, pred, expect in zip(
                        self.all_scores[label_idx],
                        self.all_preds[label_idx],
                        self.all_targets[label_idx],
                    )
                )
            )
        )
    metrics = compute_multi_label_multi_class_soft_metrics(
        list_score_pred_expect,
        self.label_names,
        self.label_vocabs,
        self.calculate_loss(),
    )
    return metrics
Example #7
def compute_length_metrics(
    all_target_lens: List[int],
    all_target_length_preds: List[List[int]],
    select_length_beam,
    log_per_label_metrics: bool = True,
):
    length_metrics = {}
    length_report = {}
    if all_target_length_preds:
        all_length_pred_agg = {}
        beam = len(all_target_length_preds[0])
        for i in range(beam):
            all_length_pred_agg[i] = []
        for label, preds in zip(all_target_lens, all_target_length_preds):
            for l in range(beam):
                if label in preds[0 : l + 1]:
                    all_length_pred_agg[l].append(label)
                else:
                    all_length_pred_agg[l].append(preds[0])
        for i in range(beam):
            length_metrics[i] = accuracy_score(all_target_lens, all_length_pred_agg[i])

        max_len = max(all_target_lens + all_length_pred_agg[select_length_beam])
        all_pairs = [
            LabelPrediction(
                [1 if idx == pred else 0 for idx in range(max_len + 1)], pred, expect
            )
            for pred, expect in zip(
                all_length_pred_agg[select_length_beam], all_target_lens
            )
        ]

        length_report = compute_classification_metrics(
            all_pairs,
            [str(l) for l in range(max_len + 1)],
            0.0,  # Placeholder loss
            log_per_label_metrics=log_per_label_metrics,
        )

    return length_metrics, length_report
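
Two steps above are worth unpacking: for each beam position l the target length counts as correct if it appears anywhere in the top l + 1 length predictions (otherwise the top-1 prediction is charged as the answer), and the selected beam's predictions are turned into one-hot score vectors before being passed to compute_classification_metrics. A toy sketch of both steps (values invented for illustration):

# One example: true length 3, beam of length predictions [2, 3, 5].
target_len = 3
preds = [2, 3, 5]
beam = len(preds)

agg = {}
for l in range(beam):
    # Credit the target if it shows up within the top (l + 1) predictions,
    # otherwise fall back to the top-1 prediction.
    agg[l] = target_len if target_len in preds[: l + 1] else preds[0]
assert agg == {0: 2, 1: 3, 2: 3}

# One-hot "scores" for a selected prediction, as used for the length report.
pred = agg[1]
max_len = max(target_len, pred)
one_hot = [1 if idx == pred else 0 for idx in range(max_len + 1)]
assert one_hot == [0, 0, 0, 1]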
Example #8
    def calculate_metric(self):
        all_rows = zip(
            self.all_context[self.ROW_INDEX],
            self.all_context[self.ANSWERS_COLUMN],
            self.all_context[self.QUES_COLUMN],
            self.all_context[self.DOC_COLUMN],
            self.all_pred_answers,
            self.all_start_pos_preds,
            self.all_end_pos_preds,
            self.all_has_answer_preds,
            self.all_start_pos_targets,
            self.all_end_pos_targets,
            self.all_has_answer_targets,
            self.all_start_pos_scores,
            self.all_end_pos_scores,
            self.all_has_answer_scores,
        )

        all_rows_dict = {}
        # Group candidate rows by their example row index (column 0).
        for row in all_rows:
            try:
                all_rows_dict[row[0]].append(row)
            except KeyError:
                all_rows_dict[row[0]] = [row]

        all_rows = []
        for rows in all_rows_dict.values():
            # Keep, for each row index, the candidate with the highest combined
            # start + end position score (columns 11 and 12 of the zipped row).
            argmax = np.argmax([row[11] + row[12] for row in rows])
            all_rows.append(rows[argmax])

        # sorted() returns a new list and its result was being discarded;
        # sort in place to restore row order before unzipping.
        all_rows.sort(key=lambda x: int(x[0]))

        (
            self.all_context[self.ROW_INDEX],
            self.all_context[self.ANSWERS_COLUMN],
            self.all_context[self.QUES_COLUMN],
            self.all_context[self.DOC_COLUMN],
            self.all_pred_answers,
            self.all_start_pos_preds,
            self.all_end_pos_preds,
            self.all_has_answer_preds,
            self.all_start_pos_targets,
            self.all_end_pos_targets,
            self.all_has_answer_targets,
            self.all_start_pos_scores,
            self.all_end_pos_scores,
            self.all_has_answer_scores,
        ) = zip(*all_rows)

        exact_matches = self._compute_exact_matches(
            self.all_pred_answers,
            self.all_context[self.ANSWERS_COLUMN],
            self.all_has_answer_preds,
            self.all_has_answer_targets,
        )
        f1_score = self._compute_f1_score(
            self.all_pred_answers,
            self.all_context[self.ANSWERS_COLUMN],
            self.all_has_answer_preds,
            self.all_has_answer_targets,
        )
        count = len(self.all_has_answer_preds)
        self.all_preds = (
            self.all_pred_answers,
            self.all_start_pos_preds,
            self.all_end_pos_preds,
            self.all_has_answer_preds,
        )
        self.all_targets = (
            self.all_context[self.ANSWERS_COLUMN],
            self.all_start_pos_targets,
            self.all_end_pos_targets,
            self.all_has_answer_targets,
        )
        self.all_scores = (
            self.all_start_pos_scores,
            self.all_end_pos_scores,
            self.all_has_answer_scores,
        )
        label_predictions = None
        if not self.ignore_impossible:
            label_predictions = [
                LabelPrediction(scores, pred, expect)
                for scores, pred, expect in zip_longest(
                    self.all_has_answer_scores,
                    self.all_has_answer_preds,
                    self.all_has_answer_targets,
                    fillvalue=[],
                )
            ]

        metrics = SquadMetrics(
            exact_matches=100.0 * exact_matches / count,
            f1_score=100.0 * f1_score / count,
            num_examples=count,
            classification_metrics=compute_classification_metrics(
                label_predictions,
                self.has_answer_labels,
                self.calculate_loss(),
            ) if label_predictions else None,
        )
        return metrics
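
The grouping-then-argmax step above deduplicates multiple candidate rows per question. A compact standalone sketch of the same idea, using collections.defaultdict instead of the try/except pattern (row layout and scores are invented; only columns 0, 11, and 12 matter here):

from collections import defaultdict

import numpy as np

# Minimal fake rows: (row_index, ..., start_score, end_score, has_answer_score);
# the ten middle columns are placeholders since only index 0 and the two
# position scores are used.
rows = [
    ("1",) + ("x",) * 10 + (0.2, 0.3, 0.9),
    ("1",) + ("x",) * 10 + (0.7, 0.6, 0.1),
    ("2",) + ("x",) * 10 + (0.5, 0.4, 0.2),
]

grouped = defaultdict(list)
for row in rows:
    grouped[row[0]].append(row)

best = []
for candidates in grouped.values():
    # Pick the candidate with the highest start + end position score.
    argmax = np.argmax([row[11] + row[12] for row in candidates])
    best.append(candidates[argmax])

best.sort(key=lambda x: int(x[0]))
# The higher-scoring candidate is kept for question "1".
assert best[0][11] == 0.7 and best[1][0] == "2"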
Example #9
from pytext.metrics import (
    ClassificationMetrics,
    LabelPrediction,
    MacroPRF1Metrics,
    MacroPRF1Scores,
    PRF1Scores,
    SoftClassificationMetrics,
    compute_classification_metrics,
    compute_soft_metrics,
)
from pytext.metrics.tests.metrics_test_base import MetricsTestBase

LABEL_NAMES1 = ["label1", "label2", "label3"]
PREDICTIONS1 = [
    LabelPrediction(scores, predicted, expected)
    for scores, predicted, expected in [
        ([0.5, 0.3, 0.2], 0, 0),
        ([0.1, 0.8, 0.1], 1, 0),
        ([0.3, 0.6, 0.1], 1, 1),
        ([0.2, 0.1, 0.7], 2, 1),
    ]
]

LABEL_NAMES2 = ["label1", "label2"]
PREDICTIONS2 = [
    LabelPrediction(scores, predicted, expected)
    for scores, predicted, expected in [
        ([0.4, 0.6], 1, 0),
        ([0.3, 0.2], 0, 0),
        ([0.4, 0.8], 1, 1),