def test_lf_applier(self) -> None:
    """Apply ``f`` and ``g`` to ``DATA`` and compare against ``L_EXPECTED``.

    The comparison is made twice: once with the progress bar disabled and
    once with it enabled.
    """
    points = [SimpleNamespace(num=value) for value in DATA]
    applier = LFApplier([f, g])

    # Same expectation regardless of whether the progress bar is shown.
    for show_progress in (False, True):
        label_matrix = applier.apply(points, progress_bar=show_progress)
        np.testing.assert_equal(label_matrix, L_EXPECTED)
def test_lf_applier(self) -> None:
    """Apply ``f`` and ``g`` to ``DATA`` and compare against ``L_EXPECTED``.

    The label matrix is checked with the progress bar disabled and enabled,
    and once more with ``return_meta=True``, in which case the returned
    metadata must equal ``ApplierMetadata`` built from an empty dict.
    """
    points = [SimpleNamespace(num=value) for value in DATA]
    applier = LFApplier([f, g])

    # Same expectation regardless of whether the progress bar is shown.
    for show_progress in (False, True):
        label_matrix = applier.apply(points, progress_bar=show_progress)
        np.testing.assert_equal(label_matrix, L_EXPECTED)

    # With return_meta=True the call yields a (label matrix, metadata) pair.
    label_matrix, meta = applier.apply(points, return_meta=True)
    np.testing.assert_equal(label_matrix, L_EXPECTED)
    self.assertEqual(meta, ApplierMetadata({}))
def test_lf_applier_fault(self) -> None:
    """Exercise ``LFApplier.apply`` with the LFs ``f`` and ``f_bad``.

    Without ``fault_tolerant`` the call is expected to raise
    ``AttributeError``. With ``fault_tolerant=True`` the label matrix must
    equal ``L_EXPECTED_BAD`` and, when metadata is requested, the metadata
    must equal ``ApplierMetadata`` built from ``{"f_bad": 5}``.
    """
    points = [SimpleNamespace(num=value) for value in DATA]
    applier = LFApplier([f, f_bad])

    # Default (non fault-tolerant) mode propagates the error.
    self.assertRaises(AttributeError, applier.apply, points, progress_bar=False)

    # Fault-tolerant mode returns a label matrix instead of raising.
    label_matrix = applier.apply(points, progress_bar=False, fault_tolerant=True)
    np.testing.assert_equal(label_matrix, L_EXPECTED_BAD)

    # Same call, additionally asking for the applier metadata.
    label_matrix, meta = applier.apply(
        points,
        progress_bar=False,
        fault_tolerant=True,
        return_meta=True,
    )
    np.testing.assert_equal(label_matrix, L_EXPECTED_BAD)
    expected_meta = ApplierMetadata({"f_bad": 5})
    self.assertEqual(meta, expected_meta)
def main():
    """Label ``src`` with three labeling functions and print each stage.

    Prints, in order: the label matrix, the ``LFAnalysis`` summary, the
    label buckets for the first two label-matrix columns, and the
    predictions of a ``LabelModel`` fitted on the label matrix.
    """
    labeling_functions = [lf_contains_link, lf_contains_co, lf_contains_sub]

    label_matrix = LFApplier(labeling_functions).apply(src)
    print(label_matrix)

    summary = LFAnalysis(label_matrix, labeling_functions).lf_summary()
    print(summary)

    # Bucket data points by the outputs of the first two labeling functions.
    first_column = label_matrix[:, 0]
    second_column = label_matrix[:, 1]
    print(get_label_buckets(first_column, second_column))

    model = LabelModel(cardinality=2, verbose=True)
    model.fit(label_matrix, n_epochs=500, log_freq=50, seed=123)
    predictions = model.predict(L=label_matrix, tie_break_policy="abstain")
    print(predictions)
def test_lf_applier_preprocessor_memoized(self) -> None:
    """Apply an LF whose preprocessor is declared with ``memoize=True``.

    The preprocessor routes every squaring through a ``SquareHitTracker``.
    The test checks the label matrix against ``L_PREPROCESS_EXPECTED`` and
    that the tracker recorded exactly 4 hits.
    """
    tracker = SquareHitTracker()

    @preprocessor(memoize=True)
    def square_memoize(x: DataPoint) -> DataPoint:
        # Store the tracked square on the data point for the LF below.
        x.num_squared = tracker(x.num)
        return x

    @labeling_function(pre=[square_memoize])
    def fp_memoized(x: DataPoint) -> int:
        if x.num_squared > 42:
            return 0
        return -1

    points = [SimpleNamespace(num=value) for value in DATA]
    label_matrix = LFApplier([f, fp_memoized]).apply(points, progress_bar=False)

    np.testing.assert_equal(label_matrix, L_PREPROCESS_EXPECTED)
    self.assertEqual(tracker.n_hits, 4)
def _create_model(self, ctx):
    """Build keyword labeling functions from labelled example texts.

    A TF-IDF (1- to 3-gram) + logistic-regression pipeline is fitted on the
    texts in ``ctx``. For every label the three highest-weighted vocabulary
    terms are taken as candidate keywords, and candidates that also appear
    for another label are discarded. The remaining keywords are passed to
    ``get_lfs``.

    Args:
        ctx: Mapping from label to an iterable of ``(text, original_text)``
            pairs. Only the first element of each pair is used for fitting.

    Side effects:
        Sets ``self.lfs`` and ``self.idx_label_map`` from ``get_lfs`` and
        ``self.applier`` to an ``LFApplier`` over ``self.lfs``.
    """
    # Flatten ctx into parallel sample/target lists. Labels and pairs are
    # visited in sorted order so the training order does not depend on the
    # iteration order of ctx.
    samples, targets = [], []
    for label in sorted(ctx):
        for text, _original_text in sorted(ctx[label]):
            samples.append(text)
            targets.append(label)

    encoder = LabelEncoder()
    target_idx = encoder.fit_transform(targets)

    model = make_pipeline(
        TfidfVectorizer(
            ngram_range=(1, 3),
            max_features=None,
            # Texts are split on whitespace only and kept case-sensitive.
            tokenizer=lambda s: s.split(),
            lowercase=False,
        ),
        LogisticRegression(),
    )
    model.fit(samples, target_idx)
    vectorizer = model.steps[0][1]
    classifier = model.steps[1][1]

    # Invert the vocabulary so feature indices map back to their terms.
    index_to_term = {
        idx: term for term, idx in vectorizer.vocabulary_.items()
    }

    # For a binary problem scikit-learn stores a single coefficient row that
    # describes classes_[1]; its negation describes classes_[0]. Expand it
    # so that each class is paired with its own weights below instead of
    # classes_[0] silently receiving the weights of classes_[1].
    coef = classifier.coef_
    if coef.shape[0] == 1 and len(encoder.classes_) == 2:
        coef = np.vstack((-coef[0], coef[0]))

    top_k = 3
    candidates = {}
    for label, weights in zip(encoder.classes_, coef):
        # argsort is ascending, so the last top_k indices carry the
        # largest weights for this label.
        top_indices = np.argsort(weights)[-top_k:]
        candidates[label] = [index_to_term[i] for i in top_indices]

    # Keep only keywords that are not a candidate for any other label.
    # A label only gets an entry once it has at least one such keyword.
    unique_keywords = defaultdict(list)
    for label, terms in candidates.items():
        claimed_elsewhere = {
            term
            for other_label, other_terms in candidates.items()
            if other_label != label
            for term in other_terms
        }
        for term in terms:
            if term not in claimed_elsewhere:
                unique_keywords[label].append(term)

    self.lfs, self.idx_label_map = get_lfs(unique_keywords)
    self.applier = LFApplier(self.lfs)
def test_lf_applier_numpy(self) -> None:
    """Apply ``f_np`` and ``g_np`` to a NumPy array built from ``DATA``.

    The input stacks ``DATA`` twice and transposes the result; the label
    matrix must equal ``L_EXPECTED``.
    """
    stacked = np.vstack((DATA, DATA))
    X = stacked.T

    label_matrix = LFApplier([f_np, g_np]).apply(X, progress_bar=False)
    np.testing.assert_equal(label_matrix, L_EXPECTED)
def test_lf_applier_no_labels(self) -> None:
    """Apply the single LF ``h`` and expect every label-matrix entry to be -1."""
    points = [SimpleNamespace(num=value) for value in DATA]
    label_matrix = LFApplier([h]).apply(points, progress_bar=False)

    # Comparing against the scalar -1 checks every entry of the matrix.
    np.testing.assert_equal(label_matrix, -1)