示例#1
0
 def test_lf_applier(self) -> None:
     """Apply ``f`` and ``g`` to the ``DATA`` points and compare with ``L_EXPECTED``.

     The application is run once without and once with the progress bar; both
     runs are expected to produce the same label matrix.
     """
     points = [SimpleNamespace(num=value) for value in DATA]
     lf_applier = LFApplier([f, g])
     # Same check for both progress-bar settings, silent run first.
     for show_progress in (False, True):
         label_matrix = lf_applier.apply(points, progress_bar=show_progress)
         np.testing.assert_equal(label_matrix, L_EXPECTED)
 def test_lf_applier(self) -> None:
     """Apply ``f`` and ``g`` to the ``DATA`` points and compare with ``L_EXPECTED``.

     Covers three calls: progress bar off, progress bar on, and
     ``return_meta=True``, where the returned metadata is expected to equal an
     ``ApplierMetadata`` built from an empty dict.
     """
     points = [SimpleNamespace(num=value) for value in DATA]
     lf_applier = LFApplier([f, g])

     # Same check for both progress-bar settings, silent run first.
     for show_progress in (False, True):
         label_matrix = lf_applier.apply(points, progress_bar=show_progress)
         np.testing.assert_equal(label_matrix, L_EXPECTED)

     label_matrix, metadata = lf_applier.apply(points, return_meta=True)
     np.testing.assert_equal(label_matrix, L_EXPECTED)
     self.assertEqual(metadata, ApplierMetadata({}))
示例#3
0
def main():
    """Label ``src`` with three labeling functions and fit a label model.

    Prints, in order: the label matrix, the ``LFAnalysis`` summary, the label
    buckets of the first two label-matrix columns, and the label-model
    predictions (ties resolved with the ``"abstain"`` policy).
    """
    labeling_functions = [lf_contains_link, lf_contains_co, lf_contains_sub]
    label_matrix = LFApplier(labeling_functions).apply(src)
    print(label_matrix)

    summary = LFAnalysis(label_matrix, labeling_functions).lf_summary()
    print(summary)

    # Compare the outputs of the first two labeling functions.
    first_column, second_column = label_matrix[:, 0], label_matrix[:, 1]
    print(get_label_buckets(first_column, second_column))

    model = LabelModel(cardinality=2, verbose=True)
    model.fit(label_matrix, n_epochs=500, log_freq=50, seed=123)
    predictions = model.predict(L=label_matrix, tie_break_policy="abstain")
    print(predictions)
    def test_lf_applier_preprocessor_memoized(self) -> None:
        """Check a labeling function whose preprocessor is declared with ``memoize=True``.

        The preprocessor routes every squaring through a ``SquareHitTracker``;
        after applying the labeling functions the tracker is expected to have
        recorded exactly 4 hits and the label matrix to equal
        ``L_PREPROCESS_EXPECTED``.
        """
        hit_tracker = SquareHitTracker()

        @preprocessor(memoize=True)
        def square_memoize(x: DataPoint) -> DataPoint:
            x.num_squared = hit_tracker(x.num)
            return x

        @labeling_function(pre=[square_memoize])
        def fp_memoized(x: DataPoint) -> int:
            if x.num_squared > 42:
                return 0
            return -1

        points = [SimpleNamespace(num=value) for value in DATA]
        label_matrix = LFApplier([f, fp_memoized]).apply(points, progress_bar=False)
        np.testing.assert_equal(label_matrix, L_PREPROCESS_EXPECTED)
        self.assertEqual(hit_tracker.n_hits, 4)
示例#5
0
    def _create_model(self, ctx):
        """Derive per-label keywords from a TF-IDF + logistic-regression fit and build LFs.

        Args:
            ctx: Mapping from label to a list of ``(text, text_orig)`` pairs.
                Only ``text`` is used for fitting.

        Side effects:
            Sets ``self.lfs`` and ``self.idx_label_map`` from ``get_lfs`` and
            ``self.applier`` to an ``LFApplier`` over ``self.lfs``.
        """
        top_k = 3  # number of highest-weighted n-grams kept per label

        samples, targets = [], []
        # Sorted iteration keeps the training order deterministic.
        for label in sorted(ctx):
            for text, _text_orig in sorted(ctx[label]):
                samples.append(text)
                targets.append(label)

        encoder = LabelEncoder()
        target_idx = encoder.fit_transform(targets)
        pipeline = make_pipeline(
            TfidfVectorizer(ngram_range=(1, 3),
                            max_features=None,
                            tokenizer=lambda s: s.split(),
                            lowercase=False), LogisticRegression())
        pipeline.fit(samples, target_idx)

        # Invert the vectorizer vocabulary: feature index -> n-gram.
        index_to_term = {
            idx: term
            for term, idx in pipeline.steps[0][1].vocabulary_.items()
        }

        coef = pipeline.steps[1][1].coef_
        if coef.shape[0] == 1 and len(encoder.classes_) == 2:
            # Binary problems expose a single coefficient row that describes
            # classes_[1]; its negation describes classes_[0]. Without this,
            # zip() below would hand class 1's keywords to class 0 and give
            # class 1 none.
            coef = np.vstack([-coef[0], coef[0]])

        keywords = {}
        for label, weights in zip(encoder.classes_, coef):
            top_indices = np.argsort(weights)[-top_k:]
            keywords[label] = [index_to_term[i] for i in top_indices]

        # Keep only keywords that are not shared with any other label. Labels
        # left without a keyword do not get an entry at all.
        unique_keywords = defaultdict(list)
        for label, kws in keywords.items():
            taken = {
                kw
                for other, other_kws in keywords.items() if other != label
                for kw in other_kws
            }
            for kw in kws:
                if kw not in taken:
                    unique_keywords[label].append(kw)

        self.lfs, self.idx_label_map = get_lfs(unique_keywords)
        self.applier = LFApplier(self.lfs)
 def test_lf_applier_fault(self) -> None:
     """Check ``LFApplier`` behaviour when applying ``f`` together with ``f_bad``.

     Without ``fault_tolerant`` the application is expected to raise
     ``AttributeError``. With ``fault_tolerant=True`` it is expected to return
     ``L_EXPECTED_BAD`` and, when metadata is requested, an ``ApplierMetadata``
     built from ``{"f_bad": 5}``.
     """
     points = [SimpleNamespace(num=value) for value in DATA]
     lf_applier = LFApplier([f, f_bad])

     with self.assertRaises(AttributeError):
         lf_applier.apply(points, progress_bar=False)

     label_matrix = lf_applier.apply(
         points, progress_bar=False, fault_tolerant=True
     )
     np.testing.assert_equal(label_matrix, L_EXPECTED_BAD)

     label_matrix, metadata = lf_applier.apply(
         points, progress_bar=False, fault_tolerant=True, return_meta=True
     )
     np.testing.assert_equal(label_matrix, L_EXPECTED_BAD)
     self.assertEqual(metadata, ApplierMetadata({"f_bad": 5}))
 def test_lf_applier_numpy(self) -> None:
     """Apply ``f_np`` and ``g_np`` to a NumPy array and compare with ``L_EXPECTED``.

     The input is ``DATA`` stacked twice and transposed, so each row repeats
     one ``DATA`` value in two columns.
     """
     features = np.vstack((DATA, DATA)).T
     label_matrix = LFApplier([f_np, g_np]).apply(features, progress_bar=False)
     np.testing.assert_equal(label_matrix, L_EXPECTED)
 def test_lf_applier_no_labels(self) -> None:
     """Apply only ``h`` and expect every entry of the label matrix to be -1."""
     points = [SimpleNamespace(num=value) for value in DATA]
     label_matrix = LFApplier([h]).apply(points, progress_bar=False)
     np.testing.assert_equal(label_matrix, -1)
示例#9
0
class ContextSentenceFactor(Factor):
    """Keyword labeling functions learned from the text around annotated regions.

    On construction, every annotated region is turned into a string of
    position-tagged tokens (``[B]`` before, ``[I]`` inside, ``[A]`` after the
    region). A TF-IDF + logistic-regression model is fit on those strings and
    its top-weighted n-grams per label are handed to ``get_lfs`` to build the
    labeling functions stored in ``self.lfs``.
    """

    def _get_ctx(self, doc, r):
        """Collect the tokens of ``doc`` in and around region ``r``.

        Whitespace, punctuation and quote tokens are dropped, as are tokens
        lying more than ``self.ctx_size`` offsets outside the region.

        Args:
            doc: Iterable of tokens exposing ``idx``, ``text``, ``is_space``,
                ``is_punct`` and ``is_quote``.
            r: Mapping with ``'start'`` and ``'end'`` offsets, compared
                against ``tok.idx``.

        Returns:
            List of ``(token, tag)`` pairs in document order, where ``tag`` is
            ``'[B]'``, ``'[I]'`` or ``'[A]'``.
        """
        window_start = r['start'] - self.ctx_size
        window_end = r['end'] + self.ctx_size
        toks = []
        for tok in doc:
            if tok.is_space or tok.is_punct or tok.is_quote:
                continue
            tok_end = tok.idx + len(tok.text)
            if tok_end < window_start or tok.idx > window_end:
                continue
            # The token is inside the window here, so only its position
            # relative to the region decides the tag.
            if tok.idx < r['start']:
                tag = '[B]'
            elif tok.idx <= r['end']:
                tag = '[I]'
            else:
                tag = '[A]'
            toks.append((tok, tag))
        return toks

    def __init__(self, tasks, results):
        """Parse the tasks, gather tagged region contexts and build the LFs.

        Args:
            tasks: Iterable of texts; each one is passed to ``nlp_spacy``.
            results: Iterable parallel to ``tasks``. Each item is an iterable
                of regions: mappings with ``'start'``, ``'end'`` and a
                ``'labels'`` sequence whose first element is used as the
                region's label.
        """
        self.spans = []  # per task: list of sentence spans (start/end/text)
        self.docs = []  # per task: the parsed document
        self.ctx_size = 100  # context window, in the same units as tok.idx
        ctx = defaultdict(list)
        for task, result in zip(tasks, results):
            doc = nlp_spacy(task)
            self.docs.append(doc)
            self.spans.append([{
                'start': sent.start_char,
                'end': sent.end_char,
                'text': sent.text
            } for sent in doc.sents])

            for r in result:
                toks = self._get_ctx(doc, r)
                # Tagged text feeds the model; the untagged text is kept
                # alongside it in the same tuple.
                text = ' '.join(prefix + tok.text for tok, prefix in toks)
                text_orig = ' '.join(tok.text for tok, _prefix in toks)
                ctx[r['labels'][0]].append((text, text_orig))
        self._create_model(ctx)

    def _create_model(self, ctx):
        """Derive per-label keywords from a TF-IDF + logistic-regression fit and build LFs.

        Args:
            ctx: Mapping from label to a list of ``(text, text_orig)`` pairs.
                Only ``text`` is used for fitting.

        Side effects:
            Sets ``self.lfs`` and ``self.idx_label_map`` from ``get_lfs`` and
            ``self.applier`` to an ``LFApplier`` over ``self.lfs``.
        """
        top_k = 3  # number of highest-weighted n-grams kept per label

        samples, targets = [], []
        # Sorted iteration keeps the training order deterministic.
        for label in sorted(ctx):
            for text, _text_orig in sorted(ctx[label]):
                samples.append(text)
                targets.append(label)

        encoder = LabelEncoder()
        target_idx = encoder.fit_transform(targets)
        pipeline = make_pipeline(
            TfidfVectorizer(ngram_range=(1, 3),
                            max_features=None,
                            tokenizer=lambda s: s.split(),
                            lowercase=False), LogisticRegression())
        pipeline.fit(samples, target_idx)

        # Invert the vectorizer vocabulary: feature index -> n-gram.
        index_to_term = {
            idx: term
            for term, idx in pipeline.steps[0][1].vocabulary_.items()
        }

        coef = pipeline.steps[1][1].coef_
        if coef.shape[0] == 1 and len(encoder.classes_) == 2:
            # Binary problems expose a single coefficient row that describes
            # classes_[1]; its negation describes classes_[0]. Without this,
            # zip() below would hand class 1's keywords to class 0 and give
            # class 1 none.
            coef = np.vstack([-coef[0], coef[0]])

        keywords = {}
        for label, weights in zip(encoder.classes_, coef):
            top_indices = np.argsort(weights)[-top_k:]
            keywords[label] = [index_to_term[i] for i in top_indices]

        # Keep only keywords that are not shared with any other label. Labels
        # left without a keyword do not get an entry at all.
        unique_keywords = defaultdict(list)
        for label, kws in keywords.items():
            taken = {
                kw
                for other, other_kws in keywords.items() if other != label
                for kw in other_kws
            }
            for kw in kws:
                if kw not in taken:
                    unique_keywords[label].append(kw)

        self.lfs, self.idx_label_map = get_lfs(unique_keywords)
        self.applier = LFApplier(self.lfs)

    def apply(self, tasks):
        """Run the labeling functions on a random sample of tasks and report LF statistics.

        At most 100 randomly chosen tasks are converted to regions. The
        caller's ``tasks`` list is not modified.

        Args:
            tasks: Sequence of texts.

        Returns:
            List of dicts with keys ``'lop'`` (labeling-function name),
            ``'conflict'`` and ``'coverage'``, ordered by ascending conflict.
        """
        print('Create regions...')
        # Shuffle a copy: shuffling in place would reorder the caller's list.
        shuffled = list(tasks)
        random.shuffle(shuffled)
        regions = self.create_regions(shuffled[:100])
        print(f'Num regions: {len(regions)}')
        L_train = self.applier.apply(regions)
        lfa = LFAnalysis(L=L_train, lfs=self.lfs)
        confl = lfa.lf_conflicts()
        cov = lfa.lf_coverages()
        out = []
        for i in np.argsort(confl):
            lf, cf, cv = self.lfs[i], confl[i], cov[i]
            print(lf.name, cf, cv)
            out.append({'lop': lf.name, 'conflict': cf, 'coverage': cv})
        return out

    def create_regions(self, tasks):
        """Build one region per sentence, grouping context tokens by position.

        Tasks longer than 10000 characters are skipped and counted. Sentences
        whose context contains no inside (``'I'``) token are dropped.

        Args:
            tasks: Iterable of texts.

        Returns:
            List of dicts mapping a position key (``'B'``, ``'I'``, ``'A'``;
            only those present) to the space-joined token texts at that
            position.
        """
        regions = []
        nskipped = 0
        for task in tqdm(tasks):
            if len(task) > 10000:
                nskipped += 1
                continue
            doc = nlp_spacy(task)
            for sent in doc.sents:
                ctx = self._get_ctx(doc, {
                    'start': sent.start_char,
                    'end': sent.end_char
                })
                grouped = defaultdict(list)
                for tok, where in ctx:
                    # '[B]' -> 'B', and likewise for the other tags.
                    grouped[where.lstrip('[').rstrip(']')].append(tok.text)
                if 'I' not in grouped:
                    continue
                regions.append({
                    where: ' '.join(words)
                    for where, words in grouped.items()
                })
        print(f'Num skipped = {nskipped}')
        return regions

    def match(self, results):
        """Average, per label, the scores of matching regions against sentence spans.

        Args:
            results: Iterable parallel to the tasks given at construction;
                each item is a sequence of regions with a ``'labels'`` entry.

        Returns:
            Dict mapping each label to the mean score (as ``float``) over all
            of its regions.
        """
        out = defaultdict(list)
        for spans, result in zip(self.spans, results):
            # Inside the method body, ``match`` resolves to the module-level
            # function, not to this method.
            scores, _argmaxes = match(result, spans)
            for r, s in zip(result, scores):
                out[r['labels'][0]].append(s)
        return {label: float(np.mean(v)) for label, v in out.items()}