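# --- Assumed test fixtures --------------------------------------------------
# The module-level fixtures used by the tests below (DATA, the labeling
# functions f/g/h/f_bad/f_np/g_np, the expected label matrices, and
# SquareHitTracker) are not part of this excerpt. This is a minimal sketch
# reconstructed from how the tests use them; treat every definition here as
# an assumption, not the original source. The test methods themselves are
# written as methods of a unittest.TestCase subclass that is also not shown.
from types import SimpleNamespace

import numpy as np

from snorkel.labeling import LFApplier, labeling_function
from snorkel.labeling.apply.core import ApplierMetadata  # import path assumed
from snorkel.preprocess import preprocessor
from snorkel.types import DataPoint

DATA = [3, 43, 12, 9, 3]  # assumed inputs; note the repeated 3 (see memoize test)


@labeling_function()
def f(x: DataPoint) -> int:
    return 0 if x.num > 42 else -1


@labeling_function()
def g(x: DataPoint) -> int:
    return 0 if x.num > 10 else -1


@labeling_function()
def h(x: DataPoint) -> int:
    return -1  # always abstains


@labeling_function()
def f_bad(x: DataPoint) -> int:
    return 0 if x.mum > 42 else -1  # deliberate typo: raises AttributeError


@labeling_function()
def f_np(x: DataPoint) -> int:
    return 0 if x[0] > 42 else -1  # operates on a NumPy row


@labeling_function()
def g_np(x: DataPoint) -> int:
    return 0 if x[1] > 10 else -1


# Expected label matrices, derived from DATA and the LFs above.
L_EXPECTED = np.array([[-1, -1], [0, 0], [-1, 0], [-1, -1], [-1, -1]])
L_EXPECTED_BAD = np.array([[-1, -1], [0, -1], [-1, -1], [-1, -1], [-1, -1]])
L_PREPROCESS_EXPECTED = np.array([[-1, -1], [0, 0], [-1, 0], [-1, 0], [-1, -1]])


class SquareHitTracker:
    """Callable that squares its input and counts how often it is invoked."""

    def __init__(self) -> None:
        self.n_hits = 0

    def __call__(self, n: int) -> int:
        self.n_hits += 1
        return n ** 2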
def test_lf_applier(self) -> None:
    data_points = [SimpleNamespace(num=num) for num in DATA]
    applier = LFApplier([f, g])
    L = applier.apply(data_points, progress_bar=False)
    np.testing.assert_equal(L, L_EXPECTED)
    L = applier.apply(data_points, progress_bar=True)
    np.testing.assert_equal(L, L_EXPECTED)
    # With return_meta=True, apply also returns per-LF fault counts
    # (empty here, since no LF raised).
    L, meta = applier.apply(data_points, return_meta=True)
    np.testing.assert_equal(L, L_EXPECTED)
    self.assertEqual(meta, ApplierMetadata(dict()))
from snorkel.analysis import get_label_buckets
from snorkel.labeling import LFApplier, LFAnalysis
from snorkel.labeling.model import LabelModel


def main():
    # Apply the labeling functions to the raw corpus and inspect coverage/conflicts.
    lfs = [lf_contains_link, lf_contains_co, lf_contains_sub]
    base_applier = LFApplier(lfs)
    labels = base_applier.apply(src)
    print(labels)
    print(LFAnalysis(labels, lfs).lf_summary())

    # Bucket data points by the label pair assigned by the first two LFs,
    # which is useful for error analysis of disagreements.
    buckets = get_label_buckets(labels[:, 0], labels[:, 1])
    print(buckets)

    # Fit a generative label model over the LF outputs and produce
    # probabilistic training labels, abstaining on ties.
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(labels, n_epochs=500, log_freq=50, seed=123)
    pred_labels = label_model.predict(L=labels, tie_break_policy="abstain")
    print(pred_labels)
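# The labeling functions and corpus used by main() above (lf_contains_link,
# lf_contains_co, lf_contains_sub, src) are defined elsewhere. Below is a
# minimal sketch of plausible definitions, assuming each data point exposes a
# .text attribute; the names match the snippet, but the keyword semantics and
# the sample corpus are assumptions for illustration only.
from types import SimpleNamespace

from snorkel.labeling import labeling_function

ABSTAIN = -1
POSITIVE = 1


@labeling_function()
def lf_contains_link(x):
    return POSITIVE if "http" in x.text.lower() else ABSTAIN


@labeling_function()
def lf_contains_co(x):
    return POSITIVE if " co" in x.text.lower() else ABSTAIN


@labeling_function()
def lf_contains_sub(x):
    return POSITIVE if "sub" in x.text.lower() else ABSTAIN


src = [SimpleNamespace(text=t) for t in [
    "Visit http://example.com for details",
    "Acme Co announced a merger",
    "Please subscribe to our newsletter",
]]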
def test_lf_applier_preprocessor_memoized(self) -> None:
    data_points = [SimpleNamespace(num=num) for num in DATA]
    square_hit_tracker = SquareHitTracker()

    @preprocessor(memoize=True)
    def square_memoize(x: DataPoint) -> DataPoint:
        x.num_squared = square_hit_tracker(x.num)
        return x

    @labeling_function(pre=[square_memoize])
    def fp_memoized(x: DataPoint) -> int:
        return 0 if x.num_squared > 42 else -1

    applier = LFApplier([f, fp_memoized])
    L = applier.apply(data_points, progress_bar=False)
    np.testing.assert_equal(L, L_PREPROCESS_EXPECTED)
    # Memoization caches the preprocessor output per data point value, so the
    # squaring helper runs once per unique value: 4 hits for 5 points, since
    # DATA contains one repeated value.
    self.assertEqual(square_hit_tracker.n_hits, 4)
def test_lf_applier_fault(self) -> None:
    data_points = [SimpleNamespace(num=num) for num in DATA]
    applier = LFApplier([f, f_bad])
    # Without fault tolerance, a broken LF propagates its exception.
    with self.assertRaises(AttributeError):
        applier.apply(data_points, progress_bar=False)
    # With fault_tolerant=True, a failing LF abstains (-1) instead of raising.
    L = applier.apply(data_points, progress_bar=False, fault_tolerant=True)
    np.testing.assert_equal(L, L_EXPECTED_BAD)
    L, meta = applier.apply(
        data_points, progress_bar=False, fault_tolerant=True, return_meta=True
    )
    np.testing.assert_equal(L, L_EXPECTED_BAD)
    # The metadata records how many data points each LF faulted on.
    self.assertEqual(meta, ApplierMetadata(dict(f_bad=5)))
def test_lf_applier_numpy(self) -> None:
    # Data points can also be rows of a NumPy array rather than objects.
    X = np.vstack((DATA, DATA)).T
    applier = LFApplier([f_np, g_np])
    L = applier.apply(X, progress_bar=False)
    np.testing.assert_equal(L, L_EXPECTED)
def test_lf_applier_no_labels(self) -> None:
    data_points = [SimpleNamespace(num=num) for num in DATA]
    applier = LFApplier([h])
    # An LF set that never fires yields an all-abstain (-1) label matrix.
    L = applier.apply(data_points, progress_bar=False)
    np.testing.assert_equal(L, -1)
import random
from collections import defaultdict

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

from snorkel.labeling import LFApplier, LFAnalysis

# Factor, nlp_spacy (a loaded spaCy pipeline), get_lfs, and the module-level
# match() helper are assumed to come from the surrounding project.


class ContextSentenceFactor(Factor):
    def _get_ctx(self, doc, r):
        # Collect the tokens within ctx_size characters of region r, tagging
        # each one as before ([B]), inside ([I]), or after ([A]) the region.
        toks = []
        for tok in doc:
            if tok.is_space or tok.is_punct or tok.is_quote:
                continue
            tok_end = tok.idx + len(tok.text)
            if tok_end < r['start'] - self.ctx_size or tok.idx > r['end'] + self.ctx_size:
                continue
            if tok_end >= r['start'] - self.ctx_size and tok.idx < r['start']:
                toks.append((tok, '[B]'))
            elif r['start'] <= tok.idx <= r['end']:
                toks.append((tok, '[I]'))
            else:
                toks.append((tok, '[A]'))
        return toks

    def __init__(self, tasks, results):
        self.spans = []
        ctx = defaultdict(list)
        self.ctx_size = 100
        self.docs = []
        for task, result in zip(tasks, results):
            doc = nlp_spacy(task)
            self.docs.append(doc)
            # Record sentence spans for later span matching.
            spans = []
            for sent in doc.sents:
                spans.append({
                    'start': sent.start_char,
                    'end': sent.end_char,
                    'text': sent.text,
                })
            self.spans.append(spans)
            # For each labeled region, build a position-marked context string
            # alongside the plain-text version.
            for r in result:
                toks = self._get_ctx(doc, r)
                text = []
                text_orig = []
                for tok, prefix in toks:
                    text.append(prefix + tok.text)
                    text_orig.append(tok.text)
                text = ' '.join(text)
                text_orig = ' '.join(text_orig)
                label = r['labels'][0]
                ctx[label].append((text, text_orig))
        self._create_model(ctx)

    def _create_model(self, ctx):
        X, y = [], []
        texts = []
        n = None  # max_features: unlimited
        for label in sorted(ctx):
            for text, text_orig in sorted(ctx[label]):
                X.append(text)
                texts.append(text_orig)
                y.append(label)
        le = LabelEncoder()
        y_idx = le.fit_transform(y)
        # Fit a bag-of-ngrams classifier whose weights surface label-specific keywords.
        m = make_pipeline(
            TfidfVectorizer(ngram_range=(1, 3), max_features=n,
                            tokenizer=lambda s: s.split(), lowercase=False),
            LogisticRegression())
        m.fit(X, y_idx)
        voc = {idx: word for word, idx in m.steps[0][1].vocabulary_.items()}
        # Take the top_k highest-weighted n-grams per label as candidate keywords.
        top_k = 3
        keywords = {}
        for label, weights in zip(le.classes_, m.steps[1][1].coef_):
            top_idx = np.argsort(weights)[-top_k:]
            keywords[label] = [voc[i] for i in top_idx]
        # Drop keywords shared across labels so each LF stays label-specific.
        unique_keywords = defaultdict(list)
        for label, kws in keywords.items():
            other_kws = set(
                sum((v for l, v in keywords.items() if l != label), []))
            for kw in kws:
                if kw in other_kws:
                    continue
                unique_keywords[label].append(kw)
        keywords = unique_keywords
        self.lfs, self.idx_label_map = get_lfs(keywords)
        self.applier = LFApplier(self.lfs)

    def apply(self, tasks):
        print('Create regions...')
        random.shuffle(tasks)
        regions = self.create_regions(tasks[:100])
        print(f'Num regions: {len(regions)}')
        L_train = self.applier.apply(regions)
        # Report per-LF conflict and coverage, sorted by conflict.
        lfa = LFAnalysis(L=L_train, lfs=self.lfs)
        confl = lfa.lf_conflicts()
        cov = lfa.lf_coverages()
        order = np.argsort(confl)
        lfs_sorted = [self.lfs[i] for i in order]
        out = []
        for lf, cf, cv in zip(lfs_sorted, confl[order], cov[order]):
            print(lf.name, cf, cv)
            out.append({'lop': lf.name, 'conflict': cf, 'coverage': cv})
        return out

    def create_regions(self, tasks):
        # Turn each sentence into a region dict keyed by context position
        # (B/I/A), skipping oversized documents and sentences with no inside tokens.
        regions = []
        nskipped = 0
        for task in tqdm(tasks):
            if len(task) > 10000:
                nskipped += 1
                continue
            doc = nlp_spacy(task)
            for sent in doc.sents:
                ctx = self._get_ctx(doc, {
                    'start': sent.start_char,
                    'end': sent.end_char,
                })
                r = defaultdict(list)
                for tok, where in ctx:
                    r[where.lstrip('[').rstrip(']')].append(tok.text)
                region = {where: ' '.join(r[where]) for where in r}
                if 'I' not in region:
                    continue
                regions.append(region)
        print(f'Num skipped = {nskipped}')
        return regions

    def match(self, results):
        # Score predicted regions against the stored sentence spans using the
        # module-level match() helper, then average the scores per label.
        out = defaultdict(list)
        for spans, result in zip(self.spans, results):
            scores, argmaxes = match(result, spans)
            for r, s in zip(result, scores):
                out[r['labels'][0]].append(s)
        return {label: float(np.mean(v)) for label, v in out.items()}
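# Hypothetical end-to-end usage of ContextSentenceFactor, assuming the task
# and result shapes implied above: each task is a raw document string, each
# result a list of {'start', 'end', 'labels'} character-offset regions. The
# sample data below is invented for illustration, and running it still
# requires the project's external helpers (Factor, nlp_spacy, get_lfs, match).
if __name__ == '__main__':
    tasks = ["Acme Corp announced a new product line today. Alice Smith will lead it."]
    results = [[
        {'start': 0, 'end': 9, 'labels': ['ORG']},
        {'start': 47, 'end': 58, 'labels': ['PER']},
    ]]
    factor = ContextSentenceFactor(tasks, results)
    report = factor.apply(tasks)  # per-LF conflict/coverage diagnostics
    print(report)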