def test_abstain_labels(self) -> None:
    # We abstain on the last example by convention (label=-1)
    golds = np.array([1, 0, 1, 0, -1])
    preds = np.array([1, 0, 1, 1, 0])
    probs = np.array([0.8, 0.6, 0.9, 0.7, 0.4])

    # Test no abstain
    scorer = Scorer(metrics=["accuracy"], abstain_label=None)
    results = scorer.score(golds, preds, probs)
    results_expected = dict(accuracy=0.6)
    self.assertEqual(results, results_expected)

    # Test abstain=-1 for gold
    scorer = Scorer(metrics=["accuracy"], abstain_label=-1)
    results = scorer.score(golds, preds, probs)
    results_expected = dict(accuracy=0.75)
    self.assertEqual(results, results_expected)

    # Test abstain=-1 for preds and gold
    abstain_preds = np.array([-1, -1, 1, 1, 0])
    results = scorer.score(golds, abstain_preds)
    results_expected = dict(accuracy=0.5)
    self.assertEqual(results, results_expected)

    # Test abstain set to different value
    scorer = Scorer(metrics=["accuracy"], abstain_label=10)
    results = scorer.score(golds, preds, probs)
    results_expected = dict(accuracy=0.6)
    self.assertEqual(results, results_expected)
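# NOTE: illustrative sketch (an assumption about behavior implied by the test
# above, not the library's implementation): with abstain_label=-1 the scorer
# effectively drops any example whose gold or predicted label equals -1 before
# computing metrics, which is why accuracy moves from 0.6 to 0.75 to 0.5 above.
# The helper name is hypothetical.
def _filtered_accuracy(golds, preds, abstain_label=-1):
    keep = (golds != abstain_label) & (preds != abstain_label)
    return np.mean(golds[keep] == preds[keep])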
def test_dict_metric(self) -> None:
    def dict_metric(golds, preds, probs):
        return dict(a=1, b=2)

    scorer = Scorer(custom_metric_funcs=dict(dict_metric=dict_metric))
    results = scorer.score(*self._get_labels())
    results_expected = dict(a=1, b=2)
    self.assertEqual(results, results_expected)
def test_scorer(self) -> None:
    def pred_sum(golds, preds, probs):
        return np.sum(preds)

    scorer = Scorer(
        metrics=["accuracy", "f1"], custom_metric_funcs=dict(pred_sum=pred_sum)
    )
    results = scorer.score(*self._get_labels())
    results_expected = dict(accuracy=0.6, f1=2 / 3, pred_sum=3)
    self.assertEqual(results, results_expected)
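# NOTE: several tests here call self._get_labels(), a fixture defined elsewhere
# in the test class and not shown in this excerpt. A hypothetical version that
# is consistent with the expected values used above (accuracy=0.6, f1=2/3,
# pred_sum=3 over five examples) could look like the sketch below; the actual
# fixture in the repository may differ.
def _get_labels(self):
    golds = np.array([1, 1, 1, 0, 0])
    preds = np.array([1, 1, 0, 1, 0])
    probs = np.array([0.8, 0.7, 0.4, 0.6, 0.1])
    return golds, preds, probs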
def test_score_slices(self):
    DATA = [5, 10, 19, 22, 25]

    @slicing_function()
    def sf(x):
        return x.num < 20

    # We expect 3/5 correct -> 0.6 accuracy
    golds = np.array([0, 1, 0, 1, 0])
    preds = np.array([0, 0, 0, 0, 0])
    probs = preds_to_probs(preds, 2)

    # In the slice, we expect the last 2 elements to be masked,
    # leaving 2/3 correct -> 0.666 accuracy
    data = [SimpleNamespace(num=x) for x in DATA]
    S = SFApplier([sf]).apply(data)

    scorer = Scorer(metrics=["accuracy"])

    # Test normal score
    metrics = scorer.score(golds=golds, preds=preds, probs=probs)
    self.assertEqual(metrics["accuracy"], 0.6)

    # Test score_slices
    slice_metrics = scorer.score_slices(S=S, golds=golds, preds=preds, probs=probs)
    self.assertEqual(slice_metrics["overall"]["accuracy"], 0.6)
    self.assertEqual(slice_metrics["sf"]["accuracy"], 2.0 / 3.0)

    # Test as_dataframe=True
    metrics_df = scorer.score_slices(
        S=S, golds=golds, preds=preds, probs=probs, as_dataframe=True
    )
    self.assertTrue(isinstance(metrics_df, pd.DataFrame))
    self.assertEqual(metrics_df["accuracy"]["overall"], 0.6)
    self.assertEqual(metrics_df["accuracy"]["sf"], 2.0 / 3.0)

    # Test wrong shapes
    with self.assertRaisesRegex(ValueError, "must have the same number of elements"):
        scorer.score_slices(
            S=S, golds=golds[:1], preds=preds, probs=probs, as_dataframe=True
        )
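# NOTE: preds_to_probs (from snorkel.utils) converts hard predictions into an
# [n, cardinality] probability matrix. A rough, hypothetical equivalent
# (sketch, assuming integer labels in {0, ..., cardinality-1}):
def _one_hot_probs(preds, cardinality):
    probs = np.zeros((len(preds), cardinality))
    probs[np.arange(len(preds)), preds] = 1.0
    return probs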
def score(
    self,
    L: np.ndarray,
    Y: np.ndarray,
    metrics: Optional[List[str]] = ["accuracy"],
    tie_break_policy: str = "abstain",
) -> Dict[str, float]:
    """Calculate one or more scores from user-specified and/or user-defined metrics.

    Parameters
    ----------
    L
        An [n,m] matrix with values in {-1,0,1,...,k-1}
    Y
        Gold labels associated with data points in L
    metrics
        A list of metric names
    tie_break_policy
        Policy to break ties when converting probabilistic labels to predictions

    Returns
    -------
    Dict[str, float]
        A dictionary mapping metric names to metric scores

    Example
    -------
    >>> L = np.array([[1, 1, -1], [0, 0, -1], [1, 1, -1]])
    >>> label_model = LabelModel(verbose=False)
    >>> label_model.fit(L)
    >>> label_model.score(L, Y=np.array([1, 1, 1]))
    {'accuracy': 0.6666666666666666}
    >>> label_model.score(L, Y=np.array([1, 1, 1]), metrics=["f1"])
    {'f1': 0.8}
    """
    if tie_break_policy == "abstain":  # pragma: no cover
        logging.warning(
            "Metrics calculated over data points with non-abstain labels only"
        )

    Y_pred, Y_prob = self.predict(
        L, return_probs=True, tie_break_policy=tie_break_policy
    )

    scorer = Scorer(metrics=metrics)
    results = scorer.score(Y, Y_pred, Y_prob)
    return results
def test_no_probs(self) -> None:
    scorer = Scorer()
    golds, preds, probs = self._get_labels()
    self.assertEqual(scorer.score(golds, preds), scorer.score(golds, preds, probs))
def test_no_labels(self) -> None:
    scorer = Scorer()
    with self.assertRaisesRegex(ValueError, "Cannot score"):
        scorer.score([], [], [])
def test_no_metrics(self) -> None:
    scorer = Scorer()
    self.assertEqual(scorer.score(*self._get_labels()), {})
# Define train dataset
L_train = L_data_local[train_idx]
Y_train = Y_data_local[train_idx]

# Define test dataset
L_test = L_data_local[test_idx]
Y_test = Y_data_local[test_idx]

# Evaluate a dependency-informed Snorkel model
l_model = LabelModel(cardinality=2, verbose=False)
l_model.fit(L_train, n_epochs=n_epochs, lr=lr)

try:
    if abstain_rate < 0:
        Y_pred = l_model.predict(L_test, tie_break_policy="abstain")
    else:
        Y_prob = l_model.predict_proba(L_test)
        Y_pred = predict_at_abstain_rate(Y_prob, abstain_rate)
    scores = scorer.score(Y_test, preds=Y_pred)
    all_scores.append(scores)
except Exception as e:
    print("Iter {}: {}".format(i + 1, e))
    continue

# Logging
print("Iteration " + str(i + 1) + ":", scores)

print("-- SUMMARY --")
print("accuracy: AVG {:.3f}, STD {:.3f}".format(
    np.mean([s["accuracy"] for s in all_scores]),
    np.std([s["accuracy"] for s in all_scores]),
))
print("f1: AVG {:.3f}, STD {:.3f}".format(
    np.mean([s["f1"] for s in all_scores]),
    np.std([s["f1"] for s in all_scores]),
))
print("abstain rate: AVG {:.3f}, STD {:.3f}".format(
    np.mean([s["abstain rate"] for s in all_scores]),
    np.std([s["abstain rate"] for s in all_scores]),
))
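# NOTE: predict_at_abstain_rate and scorer are used above but defined elsewhere
# in this script and are not part of the excerpt. Hypothetical sketches that are
# consistent with their usage (abstain on the least-confident fraction of
# examples; report accuracy, f1, and an "abstain rate" custom metric) might look
# like the following; the actual definitions may differ.
def predict_at_abstain_rate(Y_prob, abstain_rate):
    # Abstain (label -1) on the abstain_rate fraction with the lowest confidence.
    confidence = Y_prob.max(axis=1)
    preds = Y_prob.argmax(axis=1)
    n_abstain = int(round(abstain_rate * len(confidence)))
    if n_abstain > 0:
        preds[np.argsort(confidence)[:n_abstain]] = -1
    return preds


def abstain_rate_metric(golds, preds, probs):
    # Fraction of examples on which the model abstained.
    return float(np.mean(preds == -1))


scorer = Scorer(
    metrics=["accuracy", "f1"],
    custom_metric_funcs={"abstain rate": abstain_rate_metric},
)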