Example #1
    def test_abstain_labels(self) -> None:
        # We abstain on the last example by convention (label=-1)
        golds = np.array([1, 0, 1, 0, -1])
        preds = np.array([1, 0, 1, 1, 0])
        probs = np.array([0.8, 0.6, 0.9, 0.7, 0.4])

        # Test no abstain
        scorer = Scorer(metrics=["accuracy"], abstain_label=None)
        results = scorer.score(golds, preds, probs)
        results_expected = dict(accuracy=0.6)
        self.assertEqual(results, results_expected)

        # Test abstain=-1 for gold: the last point (gold=-1) is filtered out, leaving 3/4 correct
        scorer = Scorer(metrics=["accuracy"], abstain_label=-1)
        results = scorer.score(golds, preds, probs)
        results_expected = dict(accuracy=0.75)
        self.assertEqual(results, results_expected)

        # Test abstain=-1 for preds and gold: indices 0, 1, and 4 are filtered out, leaving 1/2 correct
        abstain_preds = np.array([-1, -1, 1, 1, 0])
        results = scorer.score(golds, abstain_preds)
        results_expected = dict(accuracy=0.5)
        self.assertEqual(results, results_expected)

        # Test abstain set to a value absent from the labels: nothing is filtered, so accuracy stays 0.6
        scorer = Scorer(metrics=["accuracy"], abstain_label=10)
        results = scorer.score(golds, preds, probs)
        results_expected = dict(accuracy=0.6)
        self.assertEqual(results, results_expected)
Example #2
    def test_dict_metric(self) -> None:
        def dict_metric(golds, preds, probs):
            return dict(a=1, b=2)

        scorer = Scorer(custom_metric_funcs=dict(dict_metric=dict_metric))
        results = scorer.score(*self._get_labels())
        results_expected = dict(a=1, b=2)
        self.assertEqual(results, results_expected)
Example #3
    def test_scorer(self) -> None:
        def pred_sum(golds, preds, probs):
            return np.sum(preds)

        scorer = Scorer(metrics=["accuracy", "f1"],
                        custom_metric_funcs=dict(pred_sum=pred_sum))

        results = scorer.score(*self._get_labels())
        results_expected = dict(accuracy=0.6, f1=2 / 3, pred_sum=3)
        self.assertEqual(results, results_expected)
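
Examples #2, #3, and #6 through #8 call a `self._get_labels()` helper that is not shown in these snippets. A minimal hypothetical version of that fixture, chosen only so that Example #3's expected values (accuracy 0.6, f1 2/3, pred_sum 3) hold, might look like the sketch below; the exact arrays in the real test suite may differ.

    def _get_labels(self):
        # Hypothetical fixture: 3 of 5 predictions match the golds (accuracy 0.6),
        # TP=2 / FP=1 / FN=1 (f1 = 2/3), and the predictions sum to 3 (pred_sum = 3).
        golds = np.array([1, 0, 1, 0, 1])
        preds = np.array([1, 1, 1, 0, 0])
        probs = np.array([0.9, 0.6, 0.8, 0.4, 0.3])
        return golds, preds, probs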
Example #4
    def test_score_slices(self):
        DATA = [5, 10, 19, 22, 25]

        @slicing_function()
        def sf(x):
            return x.num < 20

        # We expect 3/5 correct -> 0.6 accuracy
        golds = np.array([0, 1, 0, 1, 0])
        preds = np.array([0, 0, 0, 0, 0])
        probs = preds_to_probs(preds, 2)

        # In the slice, we expect the last 2 elements to be masked
        # We expect 2/3 correct -> 0.666 accuracy
        data = [SimpleNamespace(num=x) for x in DATA]
        S = SFApplier([sf]).apply(data)
        scorer = Scorer(metrics=["accuracy"])

        # Test normal score
        metrics = scorer.score(golds=golds, preds=preds, probs=probs)
        self.assertEqual(metrics["accuracy"], 0.6)

        # Test score_slices
        slice_metrics = scorer.score_slices(S=S,
                                            golds=golds,
                                            preds=preds,
                                            probs=probs)
        self.assertEqual(slice_metrics["overall"]["accuracy"], 0.6)
        self.assertEqual(slice_metrics["sf"]["accuracy"], 2.0 / 3.0)

        # Test as_dataframe=True
        metrics_df = scorer.score_slices(S=S,
                                         golds=golds,
                                         preds=preds,
                                         probs=probs,
                                         as_dataframe=True)
        self.assertTrue(isinstance(metrics_df, pd.DataFrame))
        self.assertEqual(metrics_df["accuracy"]["overall"], 0.6)
        self.assertEqual(metrics_df["accuracy"]["sf"], 2.0 / 3.0)

        # Test wrong shapes
        with self.assertRaisesRegex(ValueError,
                                    "must have the same number of elements"):
            scorer.score_slices(S=S,
                                golds=golds[:1],
                                preds=preds,
                                probs=probs,
                                as_dataframe=True)
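
For context on the `probs` used above: as I understand `preds_to_probs` (presumably imported from `snorkel.utils`), it one-hot encodes hard predictions into an [n, cardinality] probability matrix, so every row is fully peaked on the predicted class. A quick sketch of that assumption:

    import numpy as np
    from snorkel.utils import preds_to_probs  # assumed import location

    probs = preds_to_probs(np.array([0, 0, 0, 0, 0]), 2)
    # Expected shape (5, 2) with every row [1.0, 0.0]: full confidence in class 0,
    # matching the constant preds used in test_score_slices above.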
Example #5
    def score(
        self,
        L: np.ndarray,
        Y: np.ndarray,
        metrics: Optional[List[str]] = ["accuracy"],
        tie_break_policy: str = "abstain",
    ) -> Dict[str, float]:
        """Calculate one or more scores from user-specified and/or user-defined metrics.

        Parameters
        ----------
        L
            An [n,m] matrix with values in {-1,0,1,...,k-1}
        Y
            Gold labels associated with data points in L
        metrics
            A list of metric names
        tie_break_policy
            Policy to break ties when converting probabilistic labels to predictions


        Returns
        -------
        Dict[str, float]
            A dictionary mapping metric names to metric scores

        Example
        -------
        >>> L = np.array([[1, 1, -1], [0, 0, -1], [1, 1, -1]])
        >>> label_model = LabelModel(verbose=False)
        >>> label_model.fit(L)
        >>> label_model.score(L, Y=np.array([1, 1, 1]))
        {'accuracy': 0.6666666666666666}
        >>> label_model.score(L, Y=np.array([1, 1, 1]), metrics=["f1"])
        {'f1': 0.8}
        """
        if tie_break_policy == "abstain":  # pragma: no cover
            logging.warning(
                "Metrics calculated over data points with non-abstain labels only"
            )

        Y_pred, Y_prob = self.predict(L,
                                      return_probs=True,
                                      tie_break_policy=tie_break_policy)

        scorer = Scorer(metrics=metrics)
        results = scorer.score(Y, Y_pred, Y_prob)
        return results
Example #6
    def test_no_probs(self) -> None:
        scorer = Scorer()
        golds, preds, probs = self._get_labels()
        self.assertEqual(scorer.score(golds, preds),
                         scorer.score(golds, preds, probs))
Example #7
    def test_no_labels(self) -> None:
        scorer = Scorer()
        with self.assertRaisesRegex(ValueError, "Cannot score"):
            scorer.score([], [], [])
Example #8
    def test_no_metrics(self) -> None:
        scorer = Scorer()
        self.assertEqual(scorer.score(*self._get_labels()), {})

    # Define train dataset
    L_train = L_data_local[train_idx]
    Y_train = Y_data_local[train_idx]
    # Define test dataset
    L_test = L_data_local[test_idx]
    Y_test = Y_data_local[test_idx]

    # Evaluate a dependency-informed Snorkel model
    l_model = LabelModel(cardinality=2, verbose=False)
    l_model.fit(L_train, n_epochs=n_epochs, lr=lr)

    try:
        if abstain_rate < 0:
            Y_pred = l_model.predict(L_test, tie_break_policy="abstain")
        else:
            Y_prob = l_model.predict_proba(L_test)
            Y_pred = predict_at_abstain_rate(Y_prob, abstain_rate)

        scores = scorer.score(Y_test, preds=Y_pred)
        all_scores.append(scores)
    except Exception as e:
        print("Iter {}: {}".format(i + 1, e))
        continue

    # Logging
    print("Iteration {}: {}".format(i + 1, scores))

print("-- SUMMARY --")
print("accuracy: AVG {:.3f}, STD {:.3f}".format(np.mean([s["accuracy"] for s in all_scores]), np.std([s["accuracy"] for s in all_scores])))
print("f1: AVG {:.3f}, STD {:.3f}".format(np.mean([s["f1"] for s in all_scores]), np.std([s["f1"] for s in all_scores])))
print("abstain rate: AVG {:.3f}, STD {:.3f}".format(np.mean([s["abstain rate"] for s in all_scores]), np.std([s["abstain rate"] for s in all_scores])))