def train_model_random_lfs(randomly_sampled_lfs, train_matrix, dev_matrix,
                           dev_labels, test_matrix, regularization_grid):
    hyper_grid_results = defaultdict(dict)
    train_grid_results = defaultdict(dict)
    dev_grid_results = defaultdict(dict)
    test_grid_results = defaultdict(dict)
    models = defaultdict(dict)

    for lf_sample in tqdm_notebook(enumerate(randomly_sampled_lfs)):
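        # lf_sample is (sample index, column indices of the sampled LFs)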
        for param in regularization_grid:

            label_model = LabelModel(cardinality=2)
            label_model.fit(
                train_matrix[:, lf_sample[1]],
                n_epochs=1000,
                seed=100,
                lr=0.01,
                l2=param,
            )

            # Compute the dev-set ROC curve for each regularization value
            hyper_grid_results[str(param)] = roc_curve(
                dev_labels,
                label_model.predict_proba(dev_matrix[:, lf_sample[1]])[:, 1])

        # Convert the ROC curves into AUROCs
        hyper_grid_results = {
            param: auc(hyper_grid_results[param][0],
                       hyper_grid_results[param][1])
            for param in hyper_grid_results
        }

        # Select the parameter with the highest AUROC
        best_param = float(
            max(hyper_grid_results.items(), key=operator.itemgetter(1))[0])

        # Re-fit the model
        label_model.fit(
            train_matrix[:, lf_sample[1]],
            n_epochs=1000,
            seed=100,
            lr=0.01,
            l2=best_param,
        )

        # Save marginals for output
        key = f'{lf_sample[0]}:{",".join(map(str, lf_sample[1]))}'
        train_grid_results[key] = label_model.predict_proba(
            train_matrix[:, lf_sample[1]])
        dev_grid_results[key] = label_model.predict_proba(
            dev_matrix[:, lf_sample[1]])
        test_grid_results[key] = label_model.predict_proba(
            test_matrix[:, lf_sample[1]])
        models[key] = label_model

    return train_grid_results, dev_grid_results, test_grid_results, models
Example #2
def snorkel_process(keylist, dataframe, allweaklabf):
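    # Keep only each row's largest value; zero out all other entries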
    def func(x):
        idx = (-x).argsort()[1:]
        x[idx] = 0
        return x

    cardinalitynu = len(keylist)
    applier = PandasLFApplier(lfs=allweaklabf)
    all_train_l = applier.apply(df=dataframe)
    report = LFAnalysis(L=all_train_l, lfs=allweaklabf).lf_summary()
    print(report)
    label_model = LabelModel(cardinality=cardinalitynu, verbose=False)
    label_model.fit(all_train_l)
    predt = label_model.predict(all_train_l)
    predt1 = label_model.predict_proba(all_train_l)
    keylist1 = keylist.copy()
    #keylist1.append('Not_relevent')
    predt2 = pd.DataFrame(predt1, columns=keylist1)
    dataframe['L_label'] = predt
    dataframe1 = dataframe.join(predt2, how='outer')
    dataframe1 = dataframe1[dataframe1.L_label >= 0]

    train, test = train_test_split(dataframe1, test_size=0.2)

    trainsent = train.sent.values
    trainlabel = train[keylist].values
    trainlabe2 = trainlabel.copy()
    trainlabe2 = np.apply_along_axis(func, 1, trainlabe2)  # returns a new array
    trainlabe2 = np.where(trainlabe2 > 0, 1, 0)
    testsent = test.sent.values
    testlabel = test[keylist].values
    testlabe2 = testlabel.copy()
    testlabe2 = np.apply_along_axis(func, 1, testlabe2)  # returns a new array
    testlabe2 = np.where(testlabe2 > 0, 1, 0)
    return trainsent, trainlabe2, testsent, testlabe2, keylist, report
Example #3
def main(data_path, output_path):
    # Read data
    logging.info(f"Reading data from {data_path}")
    sc = SparkContext()
    sql = SQLContext(sc)
    data = sql.read.parquet(data_path)

    # Build label matrix
    logging.info("Applying LFs")
    lfs = [article_mentions_person, body_contains_fortune, person_in_db]
    applier = SparkLFApplier(lfs)
    L = applier.apply(data.rdd)

    # Train label model
    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(L)

    # Generate training labels
    logging.info("Generating probabilistic labels")
    y_prob = label_model.predict_proba(L)[:, 1]
    y_prob_sql_array = F.array([F.lit(y) for y in y_prob])
    data_labeled = data.withColumn("y_prob", y_prob_sql_array)
    data_labeled.write.mode("overwrite").parquet(output_path)
    logging.info(f"Labels saved to {output_path}")
Example #4
def predict_documents(documents: pd.DataFrame, trigger_label_model: LabelModel,
                      role_label_model: LabelModel):
    if 'event_triggers' not in documents and 'event_roles' not in documents:
        documents = documents.apply(pipeline.add_default_events, axis=1)

    # 1. Get trigger probabilities
    df_predict_triggers, _ = pipeline.build_event_trigger_examples(documents)
    trigger_lf_applier = PandasLFApplier(pipeline.get_trigger_list_lfs())
    L_predict_triggers = trigger_lf_applier.apply(df_predict_triggers)
    event_trigger_probs = trigger_label_model.predict_proba(L_predict_triggers)

    merged_event_trigger_examples = pipeline.merge_event_trigger_examples(
        df_predict_triggers,
        utils.zero_out_abstains(event_trigger_probs, L_predict_triggers))

    # 2. Get role probabilities
    df_predict_roles, _ = pipeline.build_event_role_examples(documents)
    role_lf_applier = PandasLFApplier(pipeline.get_role_list_lfs())
    L_predict_roles = role_lf_applier.apply(df_predict_roles)
    event_roles_probs = role_label_model.predict_proba(L_predict_roles)

    merged_event_role_examples = pipeline.merge_event_role_examples(
        df_predict_roles,
        utils.zero_out_abstains(event_roles_probs, L_predict_roles))

    # 3. Update documents with trigger & role probabilities
    labeled_documents: pd.DataFrame = documents.copy()
    # Make sure to remove event_triggers and roles that were built per default.
    # iterrows() yields copies, so assign fresh columns instead of mutating rows.
    labeled_documents['event_triggers'] = [[] for _ in range(len(labeled_documents))]
    labeled_documents['event_roles'] = [[] for _ in range(len(labeled_documents))]
    if 'id' in labeled_documents:
        labeled_documents.set_index('id', inplace=True)

    triggers = merged_event_trigger_examples[['event_triggers']]
    roles = merged_event_role_examples[['event_roles']]

    labeled_documents.update(triggers)
    labeled_documents.update(roles)

    labeled_documents.reset_index(level=0, inplace=True)

    # 4. Add ACE events
    labeled_documents = ace_formatter.snorkel_to_ace_format(labeled_documents)
    return labeled_documents
Example #5
def get_snorkel_labels(train_df, lfs, labels):
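    # Wrap the raw functions as Snorkel LFs, fit a LabelModel on their votes,
    # and keep only the rows that at least one LF labeled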
    applier = PandasLFApplier(
        [labeling_function(name=lf.__name__)(lf) for lf in lfs])
    label_model = LabelModel(cardinality=len(labels), verbose=True)
    L_train = applier.apply(df=train_df)
    label_model.fit(L_train, n_epochs=500, lr=0.001, log_freq=100, seed=123)
    L_probs = label_model.predict_proba(L=L_train)

    df_filtered, probs_filtered = filter_unlabeled_dataframe(X=train_df,
                                                             y=L_probs,
                                                             L=L_train)
    return df_filtered, probs_filtered
Example #6
    def test_label_model(self) -> None:
        """Test the LabelModel's estimate of P and Y."""
        np.random.seed(123)
        P, Y, L = generate_simple_label_matrix(self.n, self.m,
                                               self.cardinality)

        # Train LabelModel
        label_model = LabelModel(cardinality=self.cardinality, verbose=False)
        label_model.fit(L, n_epochs=200, lr=0.01, seed=123)

        # Test estimated LF conditional probabilities
        P_lm = label_model._get_conditional_probs().reshape(
            (self.m, self.cardinality + 1, -1))
        np.testing.assert_array_almost_equal(P, P_lm, decimal=2)

        # Test predicted labels
        Y_lm = label_model.predict_proba(L).argmax(axis=1)
        err = np.where(Y != Y_lm, 1, 0).sum() / self.n
        self.assertLess(err, 0.1)
Example #7
    def test_labeling_convergence(self) -> None:
        """Test convergence of end to end labeling pipeline."""
        # Apply LFs
        labeling_functions = (
            [f]
            + [get_positive_labeling_function(divisor) for divisor in range(2, 9)]
            + [get_negative_labeling_function(divisor) for divisor in range(2, 9)]
        )
        applier = PandasLFApplier(labeling_functions)
        L_train = applier.apply(self.df_train, progress_bar=False)

        self.assertEqual(L_train.shape, (self.N_TRAIN, len(labeling_functions)))

        # Train LabelModel
        label_model = LabelModel(cardinality=self.cardinality, verbose=False)
        label_model.fit(L_train, n_epochs=100, lr=0.01, l2=0.0)
        Y_lm = label_model.predict_proba(L_train).argmax(axis=1)
        Y = self.df_train.y
        err = np.where(Y != Y_lm, 1, 0).sum() / self.N_TRAIN
        self.assertLess(err, 0.05)
Example #8
def main(data_path, output_path):
    # Read data
    logging.info(f"Reading data from {data_path}")
    data = dd.read_parquet(data_path)
    data = data.repartition(npartitions=2)

    # Build label matrix
    logging.info("Applying LFs")
    lfs = [article_mentions_person, body_contains_fortune, person_in_db]
    applier = DaskLFApplier(lfs)
    L = applier.apply(data)

    # Train label model
    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(L)

    # Generate training labels
    logging.info("Generating probabilistic labels")
    y_prob = label_model.predict_proba(L)[:, 1]
    data = data.reset_index().set_index("index")
    data_labeled = data.assign(y_prob=dd.from_array(y_prob))
    dd.to_parquet(data_labeled, output_path)
    logging.info(f"Labels saved to {output_path}")
Example #9
def test_e2e():
    """Run an end-to-end test on documents of the hardware domain."""
    PARALLEL = 4

    max_docs = 12

    fonduer.init_logging(
        log_dir="log_folder",
        format="[%(asctime)s][%(levelname)s] %(name)s:%(lineno)s - %(message)s",
        level=logging.INFO,
    )

    session = fonduer.Meta.init(CONN_STRING).Session()

    docs_path = "tests/data/html/"
    pdf_path = "tests/data/pdf/"

    doc_preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)

    corpus_parser = Parser(
        session,
        parallelism=PARALLEL,
        structural=True,
        lingual=True,
        visual=True,
        pdf_path=pdf_path,
    )
    corpus_parser.apply(doc_preprocessor)
    assert session.query(Document).count() == max_docs

    num_docs = session.query(Document).count()
    logger.info(f"Docs: {num_docs}")
    assert num_docs == max_docs

    num_sentences = session.query(Sentence).count()
    logger.info(f"Sentences: {num_sentences}")

    # Divide into test and train
    docs = sorted(corpus_parser.get_documents())
    last_docs = sorted(corpus_parser.get_last_documents())

    ld = len(docs)
    assert ld == len(last_docs)
    assert len(docs[0].sentences) == len(last_docs[0].sentences)

    assert len(docs[0].sentences) == 799
    assert len(docs[1].sentences) == 663
    assert len(docs[2].sentences) == 784
    assert len(docs[3].sentences) == 661
    assert len(docs[4].sentences) == 513
    assert len(docs[5].sentences) == 700
    assert len(docs[6].sentences) == 528
    assert len(docs[7].sentences) == 161
    assert len(docs[8].sentences) == 228
    assert len(docs[9].sentences) == 511
    assert len(docs[10].sentences) == 331
    assert len(docs[11].sentences) == 528

    # Check table numbers
    assert len(docs[0].tables) == 9
    assert len(docs[1].tables) == 9
    assert len(docs[2].tables) == 14
    assert len(docs[3].tables) == 11
    assert len(docs[4].tables) == 11
    assert len(docs[5].tables) == 10
    assert len(docs[6].tables) == 10
    assert len(docs[7].tables) == 2
    assert len(docs[8].tables) == 7
    assert len(docs[9].tables) == 10
    assert len(docs[10].tables) == 6
    assert len(docs[11].tables) == 9

    # Check figure numbers
    assert len(docs[0].figures) == 32
    assert len(docs[1].figures) == 11
    assert len(docs[2].figures) == 38
    assert len(docs[3].figures) == 31
    assert len(docs[4].figures) == 7
    assert len(docs[5].figures) == 38
    assert len(docs[6].figures) == 10
    assert len(docs[7].figures) == 31
    assert len(docs[8].figures) == 4
    assert len(docs[9].figures) == 27
    assert len(docs[10].figures) == 5
    assert len(docs[11].figures) == 27

    # Check caption numbers
    assert len(docs[0].captions) == 0
    assert len(docs[1].captions) == 0
    assert len(docs[2].captions) == 0
    assert len(docs[3].captions) == 0
    assert len(docs[4].captions) == 0
    assert len(docs[5].captions) == 0
    assert len(docs[6].captions) == 0
    assert len(docs[7].captions) == 0
    assert len(docs[8].captions) == 0
    assert len(docs[9].captions) == 0
    assert len(docs[10].captions) == 0
    assert len(docs[11].captions) == 0

    train_docs = set()
    dev_docs = set()
    test_docs = set()
    splits = (0.5, 0.75)
    data = [(doc.name, doc) for doc in docs]
    data.sort(key=lambda x: x[0])
    for i, (doc_name, doc) in enumerate(data):
        if i < splits[0] * ld:
            train_docs.add(doc)
        elif i < splits[1] * ld:
            dev_docs.add(doc)
        else:
            test_docs.add(doc)
    logger.info([x.name for x in train_docs])

    # NOTE: With multi-relation support, return values of getting candidates,
    # mentions, or sparse matrices are formatted as a list of lists. This means
    # that with a single relation, we need to index into the list of lists to
    # get the candidates/mentions/sparse matrix for a particular relation or
    # mention.

    # Mention Extraction
    part_ngrams = MentionNgramsPart(parts_by_doc=None, n_max=3)
    temp_ngrams = MentionNgramsTemp(n_max=2)
    volt_ngrams = MentionNgramsVolt(n_max=1)

    Part = mention_subclass("Part")
    Temp = mention_subclass("Temp")
    Volt = mention_subclass("Volt")

    mention_extractor = MentionExtractor(
        session,
        [Part, Temp, Volt],
        [part_ngrams, temp_ngrams, volt_ngrams],
        [part_matcher, temp_matcher, volt_matcher],
    )

    mention_extractor.apply(docs, parallelism=PARALLEL)

    assert session.query(Part).count() == 299
    assert session.query(Temp).count() == 138
    assert session.query(Volt).count() == 140
    assert len(mention_extractor.get_mentions()) == 3
    assert len(mention_extractor.get_mentions()[0]) == 299
    assert (len(
        mention_extractor.get_mentions(docs=[
            session.query(Document).filter(Document.name == "112823").first()
        ])[0]) == 70)

    # Candidate Extraction
    PartTemp = candidate_subclass("PartTemp", [Part, Temp])
    PartVolt = candidate_subclass("PartVolt", [Part, Volt])

    candidate_extractor = CandidateExtractor(
        session, [PartTemp, PartVolt],
        throttlers=[temp_throttler, volt_throttler])

    for i, docs in enumerate([train_docs, dev_docs, test_docs]):
        candidate_extractor.apply(docs, split=i, parallelism=PARALLEL)

    assert session.query(PartTemp).filter(PartTemp.split == 0).count() == 3493
    assert session.query(PartTemp).filter(PartTemp.split == 1).count() == 61
    assert session.query(PartTemp).filter(PartTemp.split == 2).count() == 416
    assert session.query(PartVolt).count() == 4282

    # Grab candidate lists
    train_cands = candidate_extractor.get_candidates(split=0, sort=True)
    dev_cands = candidate_extractor.get_candidates(split=1, sort=True)
    test_cands = candidate_extractor.get_candidates(split=2, sort=True)
    assert len(train_cands) == 2
    assert len(train_cands[0]) == 3493
    assert (len(
        candidate_extractor.get_candidates(docs=[
            session.query(Document).filter(Document.name == "112823").first()
        ])[0]) == 1432)

    # Featurization
    featurizer = Featurizer(session, [PartTemp, PartVolt])

    # Test that FeatureKey is properly reset
    featurizer.apply(split=1, train=True, parallelism=PARALLEL)
    assert session.query(Feature).count() == 214
    assert session.query(FeatureKey).count() == 1260

    # Test Dropping FeatureKey
    # Should force a row deletion
    featurizer.drop_keys(["DDL_e1_W_LEFT_POS_3_[NNP NN IN]"])
    assert session.query(FeatureKey).count() == 1259

    # Should only remove the part_volt as a relation and leave part_temp
    assert set(
        session.query(FeatureKey).filter(
            FeatureKey.name ==
            "DDL_e1_LEMMA_SEQ_[bc182]").one().candidate_classes) == {
                "part_temp", "part_volt"
            }
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"],
                         candidate_classes=[PartVolt])
    assert session.query(FeatureKey).filter(
        FeatureKey.name ==
        "DDL_e1_LEMMA_SEQ_[bc182]").one().candidate_classes == ["part_temp"]
    assert session.query(FeatureKey).count() == 1259

    # Inserting the removed key
    featurizer.upsert_keys(["DDL_e1_LEMMA_SEQ_[bc182]"],
                           candidate_classes=[PartTemp, PartVolt])
    assert set(
        session.query(FeatureKey).filter(
            FeatureKey.name ==
            "DDL_e1_LEMMA_SEQ_[bc182]").one().candidate_classes) == {
                "part_temp", "part_volt"
            }
    assert session.query(FeatureKey).count() == 1259
    # Removing the key again
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"],
                         candidate_classes=[PartVolt])

    # Removing the last relation from a key should delete the row
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"],
                         candidate_classes=[PartTemp])
    assert session.query(FeatureKey).count() == 1258
    session.query(Feature).delete(synchronize_session="fetch")
    session.query(FeatureKey).delete(synchronize_session="fetch")

    featurizer.apply(split=0, train=True, parallelism=PARALLEL)
    assert session.query(Feature).count() == 6478
    assert session.query(FeatureKey).count() == 4538
    F_train = featurizer.get_feature_matrices(train_cands)
    assert F_train[0].shape == (3493, 4538)
    assert F_train[1].shape == (2985, 4538)
    assert len(featurizer.get_keys()) == 4538

    featurizer.apply(split=1, parallelism=PARALLEL)
    assert session.query(Feature).count() == 6692
    assert session.query(FeatureKey).count() == 4538
    F_dev = featurizer.get_feature_matrices(dev_cands)
    assert F_dev[0].shape == (61, 4538)
    assert F_dev[1].shape == (153, 4538)

    featurizer.apply(split=2, parallelism=PARALLEL)
    assert session.query(Feature).count() == 8252
    assert session.query(FeatureKey).count() == 4538
    F_test = featurizer.get_feature_matrices(test_cands)
    assert F_test[0].shape == (416, 4538)
    assert F_test[1].shape == (1144, 4538)

    gold_file = "tests/data/hardware_tutorial_gold.csv"

    labeler = Labeler(session, [PartTemp, PartVolt])

    labeler.apply(
        docs=last_docs,
        lfs=[[gold], [gold]],
        table=GoldLabel,
        train=True,
        parallelism=PARALLEL,
    )
    assert session.query(GoldLabel).count() == 8252

    stg_temp_lfs = [
        LF_storage_row,
        LF_operating_row,
        LF_temperature_row,
        LF_tstg_row,
        LF_to_left,
        LF_negative_number_left,
    ]

    ce_v_max_lfs = [
        LF_bad_keywords_in_row,
        LF_current_in_row,
        LF_non_ce_voltages_in_row,
    ]

    with pytest.raises(ValueError):
        labeler.apply(split=0,
                      lfs=stg_temp_lfs,
                      train=True,
                      parallelism=PARALLEL)

    labeler.apply(
        docs=train_docs,
        lfs=[stg_temp_lfs, ce_v_max_lfs],
        train=True,
        parallelism=PARALLEL,
    )
    assert session.query(Label).count() == 6478
    assert session.query(LabelKey).count() == 9
    L_train = labeler.get_label_matrices(train_cands)
    assert L_train[0].shape == (3493, 9)
    assert L_train[1].shape == (2985, 9)
    assert len(labeler.get_keys()) == 9

    # Test Dropping LabelerKey
    labeler.drop_keys(["LF_storage_row"])
    assert len(labeler.get_keys()) == 8

    # Test Upserting LabelerKey
    labeler.upsert_keys(["LF_storage_row"])
    assert "LF_storage_row" in [label.name for label in labeler.get_keys()]

    L_train_gold = labeler.get_gold_labels(train_cands)
    assert L_train_gold[0].shape == (3493, 1)

    L_train_gold = labeler.get_gold_labels(train_cands, annotator="gold")
    assert L_train_gold[0].shape == (3493, 1)

    gen_model = LabelModel()
    gen_model.fit(L_train=L_train[0], n_epochs=500, log_freq=100)

    train_marginals = gen_model.predict_proba(L_train[0])

    # Collect word counter
    word_counter = collect_word_counter(train_cands)

    emmental.init(fonduer.Meta.log_path)

    # Training config
    config = {
        "meta_config": {
            "verbose": False
        },
        "model_config": {
            "model_path": None,
            "device": 0,
            "dataparallel": False
        },
        "learner_config": {
            "n_epochs": 5,
            "optimizer_config": {
                "lr": 0.001,
                "l2": 0.0
            },
            "task_scheduler": "round_robin",
        },
        "logging_config": {
            "evaluation_freq": 1,
            "counter_unit": "epoch",
            "checkpointing": False,
            "checkpointer_config": {
                "checkpoint_metric": {
                    f"{ATTRIBUTE}/{ATTRIBUTE}/train/loss": "min"
                },
                "checkpoint_freq": 1,
                "checkpoint_runway": 2,
                "clear_intermediate_checkpoints": True,
                "clear_all_checkpoints": True,
            },
        },
    }
    emmental.Meta.update_config(config=config)

    # Generate word embedding module
    arity = 2
    # Generate special tokens
    specials = []
    for i in range(arity):
        specials += [f"~~[[{i}", f"{i}]]~~"]

    emb_layer = EmbeddingModule(word_counter=word_counter,
                                word_dim=300,
                                specials=specials)

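    # Train only on candidates whose marginals are non-uniform (i.e., rows
    # where at least one LF fired)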
    diffs = train_marginals.max(axis=1) - train_marginals.min(axis=1)
    train_idxs = np.where(diffs > 1e-6)[0]

    train_dataloader = EmmentalDataLoader(
        task_to_label_dict={ATTRIBUTE: "labels"},
        dataset=FonduerDataset(
            ATTRIBUTE,
            train_cands[0],
            F_train[0],
            emb_layer.word2id,
            train_marginals,
            train_idxs,
        ),
        split="train",
        batch_size=100,
        shuffle=True,
    )

    tasks = create_task(ATTRIBUTE,
                        2,
                        F_train[0].shape[1],
                        2,
                        emb_layer,
                        model="LogisticRegression")

    model = EmmentalModel(name=f"{ATTRIBUTE}_task")

    for task in tasks:
        model.add_task(task)

    emmental_learner = EmmentalLearner()
    emmental_learner.learn(model, [train_dataloader])

    test_dataloader = EmmentalDataLoader(
        task_to_label_dict={ATTRIBUTE: "labels"},
        dataset=FonduerDataset(ATTRIBUTE, test_cands[0], F_test[0],
                               emb_layer.word2id, 2),
        split="test",
        batch_size=100,
        shuffle=False,
    )

    test_preds = model.predict(test_dataloader, return_preds=True)
    positive = np.where(
        np.array(test_preds["probs"][ATTRIBUTE])[:, TRUE] > 0.6)
    true_pred = [test_cands[0][_] for _ in positive[0]]

    pickle_file = "tests/data/parts_by_doc_dict.pkl"
    with open(pickle_file, "rb") as f:
        parts_by_doc = pickle.load(f)

    (TP, FP, FN) = entity_level_f1(true_pred,
                                   gold_file,
                                   ATTRIBUTE,
                                   test_docs,
                                   parts_by_doc=parts_by_doc)

    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")

    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")

    assert f1 < 0.7 and f1 > 0.3

    stg_temp_lfs_2 = [
        LF_to_left,
        LF_test_condition_aligned,
        LF_collector_aligned,
        LF_current_aligned,
        LF_voltage_row_temp,
        LF_voltage_row_part,
        LF_typ_row,
        LF_complement_left_row,
        LF_too_many_numbers_row,
        LF_temp_on_high_page_num,
        LF_temp_outside_table,
        LF_not_temp_relevant,
    ]
    labeler.update(split=0,
                   lfs=[stg_temp_lfs_2, ce_v_max_lfs],
                   parallelism=PARALLEL)
    assert session.query(Label).count() == 6478
    assert session.query(LabelKey).count() == 16
    L_train = labeler.get_label_matrices(train_cands)
    assert L_train[0].shape == (3493, 16)

    gen_model = LabelModel()
    gen_model.fit(L_train=L_train[0], n_epochs=500, log_freq=100)

    train_marginals = gen_model.predict_proba(L_train[0])

    diffs = train_marginals.max(axis=1) - train_marginals.min(axis=1)
    train_idxs = np.where(diffs > 1e-6)[0]

    train_dataloader = EmmentalDataLoader(
        task_to_label_dict={ATTRIBUTE: "labels"},
        dataset=FonduerDataset(
            ATTRIBUTE,
            train_cands[0],
            F_train[0],
            emb_layer.word2id,
            train_marginals,
            train_idxs,
        ),
        split="train",
        batch_size=100,
        shuffle=True,
    )

    emmental.Meta.reset()
    emmental.init(fonduer.Meta.log_path)
    emmental.Meta.update_config(config=config)

    tasks = create_task(ATTRIBUTE,
                        2,
                        F_train[0].shape[1],
                        2,
                        emb_layer,
                        model="LogisticRegression")

    model = EmmentalModel(name=f"{ATTRIBUTE}_task")

    for task in tasks:
        model.add_task(task)

    emmental_learner = EmmentalLearner()
    emmental_learner.learn(model, [train_dataloader])

    test_preds = model.predict(test_dataloader, return_preds=True)
    positive = np.where(
        np.array(test_preds["probs"][ATTRIBUTE])[:, TRUE] > 0.7)
    true_pred = [test_cands[0][_] for _ in positive[0]]

    (TP, FP, FN) = entity_level_f1(true_pred,
                                   gold_file,
                                   ATTRIBUTE,
                                   test_docs,
                                   parts_by_doc=parts_by_doc)

    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")

    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")

    assert f1 > 0.7

    # Testing LSTM
    emmental.Meta.reset()
    emmental.init(fonduer.Meta.log_path)
    emmental.Meta.update_config(config=config)

    tasks = create_task(ATTRIBUTE,
                        2,
                        F_train[0].shape[1],
                        2,
                        emb_layer,
                        model="LSTM")

    model = EmmentalModel(name=f"{ATTRIBUTE}_task")

    for task in tasks:
        model.add_task(task)

    emmental_learner = EmmentalLearner()
    emmental_learner.learn(model, [train_dataloader])

    test_preds = model.predict(test_dataloader, return_preds=True)
    positive = np.where(
        np.array(test_preds["probs"][ATTRIBUTE])[:, TRUE] > 0.7)
    true_pred = [test_cands[0][_] for _ in positive[0]]

    (TP, FP, FN) = entity_level_f1(true_pred,
                                   gold_file,
                                   ATTRIBUTE,
                                   test_docs,
                                   parts_by_doc=parts_by_doc)

    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")

    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")

    assert f1 > 0.7
Example #10
    def train_f_on_d_U(self, datafeeder, num_epochs, loss_type):
        sess = self.hls.sess

        total_batch = datafeeder.get_batches_per_epoch(f_d_U)
        batch_size = datafeeder.get_batch_size(f_d_U)

        if loss_type == 'pure-likelihood':
            train_op = self.hls.f_d_U_pure_likelihood_op
            loss_op = self.hls.f_d_U_pure_likelihood_loss
        elif loss_type == 'implication':
            train_op = self.hls.f_d_U_implication_op
            loss_op = self.hls.f_d_U_implication_loss
        elif loss_type == 'pr_loss':
            train_op = self.hls.pr_train_op
            loss_op = self.hls.pr_loss
        elif loss_type == 'gcross':
            train_op = self.hls.gcross_train_op
            loss_op = self.hls.gcross_loss
        elif loss_type == 'gcross_snorkel':
            train_op = self.hls.snork_gcross_train_op
            loss_op = self.hls.snork_gcross_loss
        elif loss_type == 'learn2reweight':
            train_op = self.hls.l2r_train_op
            loss_op = self.hls.l2r_loss
        elif loss_type == 'label_snorkel':
            train_op = self.hls.label_snorkel_train_op
            loss_op = self.hls.label_snorkel_loss
        elif loss_type == 'pure_snorkel':
            train_op = self.hls.pure_snorkel_train_op
            loss_op = self.hls.pure_snorkel_loss
        else:
            raise ValueError('Invalid loss type %s' % loss_type)

        best_saver_f_d_U = self.hls.best_savers.get_best_saver(f_d_U)
        metrics_dict = {}  #{'config': self.config}

        if 'label_snorkel' == self.config.mode or 'pure_snorkel' == self.config.mode or 'gcross_snorkel' == self.config.mode:
            label_model = LabelModel(cardinality=self.hls.num_classes,
                                     verbose=True)
            if os.path.isfile(
                    os.path.join(self.config.data_dir, "saved_label_model")):
                label_model = label_model.load(
                    os.path.join(self.config.data_dir, "saved_label_model"))
            else:
                print("LABEL MODEL NOT SAVED")
                exit()
        if 'gcross' in self.config.mode or 'learn2reweight' in self.config.mode:
            majority_model = MajorityLabelVoter(
                cardinality=self.hls.num_classes)

        with sess.as_default():
            print("Optimization started for f_d_U with %s loss!" % loss_type)
            print("Batch size: %d!" % batch_size)
            print("Batches per epoch : %d!" % total_batch)
            print("Number of epochs: %d!" % num_epochs)
            # Training cycle
            iteration = 0
            global_step = 0
            patience = 0
            for epoch in range(num_epochs):
                avg_epoch_cost = 0.

                for i in range(total_batch):
                    batch_x, batch_l, batch_m, batch_L, batch_d, batch_r =\
                            datafeeder.get_f_d_U_next_batch()

                    feed_dict = {
                        self.hls.f_d_U_adam_lr: self.config.f_d_U_adam_lr,
                        self.hls.f_d_U_x: batch_x,
                        self.hls.f_d_U_l: batch_l,
                        self.hls.f_d_U_m: batch_m,
                        self.hls.f_d_U_L: batch_L,
                        self.hls.f_d_U_d: batch_d,
                        self.hls.f_d_U_r: batch_r
                    }

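                    # Convert LF outputs to Snorkel's label-matrix format
                    # (per the helper's name)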
                    batch_lsnork = conv_l_to_lsnork(batch_l, batch_m)

                    if 'label_snorkel' == self.config.mode or 'pure_snorkel' == self.config.mode or 'gcross_snorkel' == self.config.mode:
                        batch_snork_L = label_model.predict_proba(
                            L=batch_lsnork)  #snorkel_probs
                        feed_dict[self.hls.f_d_U_snork_L] = batch_snork_L

                    if 'gcross' == self.config.mode or 'learn2reweight' == self.config.mode:
                        batch_snork_L = majority_model.predict(
                            L=batch_lsnork)  #majority votes
                        batch_snork_L = np.eye(
                            self.hls.num_classes)[batch_snork_L]  #one hot rep
                        feed_dict[self.hls.f_d_U_snork_L] = batch_snork_L

                    merge_dict_a_into_b(self.hls.dropout_train_dict, feed_dict)
                    # Run optimization op (backprop) and cost op (to get loss value)
                    _, cost, num_d, f_d_U_global_step = sess.run(
                        [
                            train_op, loss_op, self.hls.f_d_U_num_d,
                            self.hls.f_d_U_global_step
                        ],
                        feed_dict=feed_dict)

                    global_epoch = f_d_U_global_step / total_batch
                    # This assertion is valid only if true U labels are available but not being used such as for
                    # synthetic data.
                    assert np.all(batch_L <= self.hls.num_classes)

                    avg_epoch_cost += cost / total_batch
                    cost1 = (avg_epoch_cost * total_batch) / (i + 1)
                    global_step += 1

                # Compute and report metrics, update checkpoints after each epoch
                print("\n========== epoch : {} ============\n".format(epoch))
                print("cost: {}\n".format(cost1))
                print("patience: {}\n".format(patience))
                precision, recall, f1_score, support = self.hls.test.test_f(
                    datafeeder)
                self.compute_f_d_metrics(metrics_dict, precision, recall,
                                         f1_score, support, global_epoch,
                                         f_d_U_global_step)
                print("\nmetrics_dict: ", metrics_dict)
                print()
                self.report_f_d_perfs_to_tensorboard(cost1, metrics_dict,
                                                     global_step)
                did_improve = self.maybe_save_metrics_dict(f_d_U, metrics_dict)
                if did_improve:
                    patience = 0  # reset patience if the primary metric improved
                else:
                    patience += 1
                    if patience > self.config.early_stopping_p:
                        print("bye! stopping early!......")
                        break
                # Save checkpoint
                print()
                self.hls.mru_saver.save(global_step)
                print()
                best_saver_f_d_U.save_if_best(
                    metrics_dict[self.config.f_d_primary_metric])
                print()
                global_step += 1
            print("Optimization Finished for f_d_U!")
Example #11
    Y_data = df.bm25_relevant.values
    print(df.shape)

    lfs = [
        lf.has_type_diap_medd_or_bhvr, lf.is_doctor_reply, lf.has_votes,
        lf.enity_overlap_jacc, lf.same_author, lf.number_relations_total,
        lf.entity_types
    ]

    applier = PandasLFApplier(lfs)
    L_data = applier.apply(df=df)

    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.load("trained_model_ehf.lbm")

    valid_probabilities = label_model.predict_proba(L=L_data)

    if 'predicted_prob' in df:
        del df['predicted_prob']
    df['predicted_prob'] = valid_probabilities[:, 1]

    PROBABILITY_CUTOFF = 0.5
    df['predicted_label'] = df['predicted_prob'] >= PROBABILITY_CUTOFF
    df_out = df[df['predicted_label'] == int(RELEVANT)][[
        'query_id', 'document_id'
    ]]

    with open(qrels_path, 'a+', encoding='utf8') as output_file:
        for index, row in df_out.iterrows():
            # Assumed TREC qrels line format: query_id 0 document_id relevance
            output_file.write(
                str(row['query_id']) + '\t0\t' + str(row['document_id']) +
                '\t1\n')
Example #12
def main(train_path, output_dir, label_dir):
    # Get all data
    df = pd.read_csv(train_path)

    # Get human labels
    human_labels = read_human_labels(label_dir)

    # df_test and lab_test: the set of all human-labeled notes, and their labels
    df_test = df.merge(human_labels, on=['record_number'])
    lab_test = df_test.human_label
    del df_test['human_label']

    # df_train: formed by removing all patients from df with a human-labeled note
    df_train = df.merge(df_test.mr, indicator=True, how='left', on = ['mr'])
    df_train = df_train.query('_merge=="left_only"').drop('_merge', axis=1)

    # Generate label matrix
    L_train = PandasLFApplier(lfs=lfs).apply(df=df_train)
    L_test = PandasLFApplier(lfs=lfs).apply(df=df_test)

    # Summarize LFs
    output_train = LFAnalysis(L=L_train, lfs=lfs).lf_summary()
    #print(output_train)
    output_test  = LFAnalysis(L=L_test, lfs=lfs).lf_summary(Y = lab_test.values)
    #print(output_test)

    # Save LF analysis
    path = os.path.join(output_dir, 'LF_analysis_train.csv')
    output_train.to_csv(path, index = True)
    path = os.path.join(output_dir, 'LF_analysis_test.csv')
    output_test.to_csv(path, index = True)

    # Create label model
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123, class_balance = [0.3, 0.7])

    # Evaluate the label model using labeled test set
    for metric in ['recall', 'precision', 'f1', 'accuracy']:
        label_model_acc = label_model.score(L=L_test, Y=lab_test, metrics=[metric], tie_break_policy="random")[metric]
        print("%-15s %.2f%%" % (metric+":", label_model_acc * 100))

    null_f1 = f1_score(lab_test.values, np.ones((df_test.shape[0],)))
    print("%-15s %.2f%%" % ("null f1:", null_f1 * 100))
    print("%-15s %.2f%%" % ("null accuracy:", np.maximum(1-np.mean(lab_test), np.mean(lab_test)) * 100))

    # Save error analysis
    preds = label_model.predict_proba(L_test)
    error_analysis(df_test, L_test, lfs, preds[:,1], lab_test, output_dir)

    # Get labels on train
    probs_train = label_model.predict_proba(L_train)

    # Filter out unlabeled data points
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(X=df_train, y=probs_train, L=L_train)

    # Save filtered training set
    df_train_filtered['prob'] = probs_train_filtered[:,1]
    path = os.path.join(output_dir, 'df_train_filtered.csv')
    df_train_filtered.to_csv(path, index = False)

    # Save label probs
    path = os.path.join(output_dir, 'probs_train_filtered')
    np.save(path, probs_train_filtered[:,1])

    # Save training data set and labels
    assert len(df_test) == len(lab_test)
    df_test['human_label'] = lab_test
    path = os.path.join(output_dir, 'df_test.csv')
    df_test.to_csv(path, index = False)
    path = os.path.join(output_dir, 'lab_test')
    np.save(path, lab_test)
Example #13
def label_user(inp_path, prefix=""):
    df_train = pd.read_pickle(inp_path)

    ########## threshold on word similarity
    take_first = 100
    overall_first = 10000
    global thresh_by_value, overall_thresh
    df_train['root_value'] = df_train['value'].swifter.set_dask_threshold(
        dask_threshold=0.001).allow_dask_on_strings().apply(
            lambda x: syn_to_hob[x])
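    # Per root_value, use the take_first-th largest lexicon count as that
    # value's threshold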
    thresh_by_value = df_train.groupby(
        ["root_value"]).apply(lambda x: np.partition(
            x['lexicon_counts'], max(len(x['lexicon_counts']) - take_first, 0)
        )[max(len(x['lexicon_counts']) - take_first, 0)]).to_dict()
    overall_thresh = np.partition(df_train["lexicon_counts"].to_numpy(),
                                  max(len(df_train) - overall_first, 0))[max(
                                      len(df_train) - overall_first, 0)]
    print(overall_thresh)
    #############################

    # separately loose - strict, pos - neg, period - without
    names_pool = [
        "context:2_count_pos", "context:3_count_pos", "context:100_count_pos",
        "context:2_period_count_pos", "context:3_period_count_pos",
        "context:100_period_count_pos", "context:2_count_neg",
        "context:3_count_neg", "context:100_count_neg",
        "context:2_period_count_neg", "context:3_period_count_neg",
        "context:100_period_count_neg"
    ]
    for f_name in names_pool:
        curr_cols = [x for x in df_train.columns if f_name in x]
        df_train['total_' + f_name] = df_train[curr_cols].swifter.apply(sum,
                                                                        axis=1)
        df_train = df_train.drop(curr_cols, axis=1)
    for p in ["pos", "neg"]:
        df_train["new_total_context:100_count_" + p] = df_train[[
            "total_context:100_count_" + p, "total_context:3_count_" + p
        ]].swifter.apply(lambda x: max(
            0, x["total_context:100_count_" + p] - x["total_context:3_count_" +
                                                     p]),
                         axis=1)
        df_train["new_total_context:3_count_" + p] = df_train[[
            "total_context:3_count_" + p, "total_context:2_count_" + p
        ]].swifter.apply(lambda x: max(
            0, x["total_context:3_count_" + p] - x["total_context:2_count_" + p
                                                   ]),
                         axis=1)
        df_train["new_total_context:100_period_count_" + p] = df_train[[
            "total_context:3_period_count_" + p,
            "total_context:100_period_count_" + p
        ]].swifter.apply(lambda x: max(
            0, x["total_context:100_period_count_" + p] - x[
                "total_context:3_period_count_" + p]),
                         axis=1)
        df_train["new_total_context:3_period_count_" + p] = df_train[[
            "total_context:3_period_count_" + p,
            "total_context:2_period_count_" + p
        ]].swifter.apply(lambda x: max(
            0, x["total_context:3_period_count_" + p] - x[
                "total_context:2_period_count_" + p]),
                         axis=1)
        df_train["new_total_context:2_count_" + p] = df_train[[
            "total_context:100_period_count_" + p, "total_context:2_count_" + p
        ]].swifter.apply(lambda x: max(
            0, x["total_context:2_count_" + p] - x[
                "total_context:100_period_count_" + p]),
                         axis=1)

    df_train = df_train.drop(
        ["total_" + x for x in names_pool if "2_period_count" not in x],
        axis=1)

    lfs = [val_in_name, positive_lexicon_overall, positive_lexicon_pervalue]
    num_of_thresholds = 3
    step = 100 // num_of_thresholds

    for col in df_train:
        if col not in ["author", "value", "idd", "root_value"]:
            if col not in [
                    "pos_prob_mean", "neg_prob_mean", "num_good_posts"
            ]:  # , "lexicon_counts", "subreddit_counts", "name_in_subr_count"]:
                thresholds = [0]
                if "lexicon" in col and "unique" not in col:
                    continue
                if True:  # col in ["lexicon_counts", "unique_lexicon_counts"]:
                    vals = df_train[col].to_numpy()
                    thresholds = np.percentile(
                        vals, list(range(0 + step, 99 + step,
                                         step))).astype(int)
                    thresholds = sorted(list(set(thresholds)))
                    if len(thresholds) > 1:
                        thresholds = thresholds[:-1]
                    if "lexicon" in col:
                        thresholds = [3]
                    # max_val = max(vals)
                    # thresholds = list(range(0, int(max_val), int(max_val/5) + 1))
                # elif col == "pos_prob_mean":
                #    thresholds = [0.5 + 0.1 * x for x in range(5)]
                for i in range(len(thresholds)):
                    thresh = thresholds[i]
                    next_threshold = sys.maxsize if i == len(
                        thresholds) - 1 else thresholds[i + 1]
                    previous_threshold = -sys.maxsize if i == 0 else thresholds[
                        i - 1]
                    if "lexicon_counts" not in col:
                        lfs.append(
                            make_thresold_lf(thresh=thresh,
                                             col_name=col,
                                             next_threshold=next_threshold))
                    else:
                        lfs.append(
                            make_lexicon_lf(
                                thresh=thresh,
                                pref=col,
                                previous_threshold=previous_threshold))

    num_annotators = 0
    if num_annotators > 0:
        for i in range(1, num_annotators + 1):
            lfs.append(make_annotator_lf(worker_index=i))

    lfs = [
        x for x in lfs
        if any(y in str(x) for y in ["less", "context:2", "worker", "lexicon"])
    ]
    print("created lfs their number", len(lfs))
    print("\n".join(str(x) for x in lfs))

    #### validation #####
    do_val = False
    if do_val:
        df_golden = pd.read_csv(
            "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/gold_dev.csv"
        )
        name_val = list(df_golden["auth_val"])
        # df_train['root_value'] = df_train['value'].swifter.apply(lambda x: syn_to_hob[x])
        df_train["auth_val"] = df_train[["author", "value"]].swifter.apply(
            lambda x: x["author"] + "+++" + x["value"], axis=1)
        df_val = df_train[df_train.auth_val.isin(name_val)]
        df_dev = df_train[~df_train.auth_val.isin(name_val)]
        print("Number val", df_val.shape)
        print("Number dev", df_dev.shape)
        df_val = df_val.merge(df_golden, on="auth_val")
        y_val = np.array(df_val["final"])
        df_val = df_val.drop(labels="final", axis=1)
        # create test set as well

        with TQDMDaskProgressBar(desc="Dask Apply"):
            applier = PandasParallelLFApplier(lfs=lfs)
            L_val = applier.apply(df=df_val, n_parallel=num_cpu)
            L_dev = applier.apply(df=df_dev, n_parallel=num_cpu)

        dev_analysis = LFAnalysis(L=L_dev, lfs=lfs).lf_summary()
        analysis = LFAnalysis(L=L_val, lfs=lfs).lf_summary(y_val)
        analysis.to_csv("/home/tigunova/val_analysis.csv")
        dev_analysis.to_csv("/home/tigunova/dev_analysis.csv")
        print(analysis)
        label_model = LabelModel(cardinality=2, verbose=True)
        label_model.fit(L_dev)  #, Y_dev=y_val)
        model_stat = label_model.score(L=L_val, Y=y_val)
        print(model_stat)
        exit(0)
    ###########

    #### picking threshold #####
    do_threshold = False
    if do_threshold:
        df_golden = pd.read_csv(
            "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/gold_validation.csv"
        )
        name_val = list(df_golden["auth_val"])
        # df_train['root_value'] = df_train['value'].swifter.apply(lambda x: syn_to_hob[x])
        df_train["auth_val"] = df_train[["author", "value"]].swifter.apply(
            lambda x: x["author"] + "+++" + x["value"], axis=1)
        df_val = df_train[df_train.auth_val.isin(name_val)]
        df_dev = df_train[~df_train.auth_val.isin(name_val)]
        pop_size = df_dev.shape[0]
        print("Number val", df_val.shape)
        print("Number dev", df_dev.shape)
        applier = PandasParallelLFApplier(lfs=lfs)
        df_val = df_val.merge(df_golden, on="auth_val")
        L_val = applier.apply(df=df_val, n_parallel=num_cpu)
        val_thresholds = [0.01 * x for x in range(100)]
        label_model = LabelModel(cardinality=2, verbose=True)
        with TQDMDaskProgressBar(desc="Dask Apply"):
            L_dev = applier.apply(df=df_dev, n_parallel=num_cpu)
            label_model.fit(L_dev, class_balance=[0.5, 0.5])  # , Y_dev=y_val)
            wghts = label_model.get_weights()
            print("\n".join(str(x) for x in zip(lfs, wghts)))
            probs_val = label_model.predict_proba(L=L_val)
            probs_df = pd.DataFrame(probs_val,
                                    columns=["neg_prob", "pos_prob"])
            df_val = pd.concat([df_val.reset_index(), probs_df], axis=1)
            probs_dev = label_model.predict_proba(L=L_dev)
            probs_df = pd.DataFrame(probs_dev,
                                    columns=["neg_prob", "pos_prob"])
            df_dev = pd.concat([df_dev.reset_index(), probs_df], axis=1)
            y_true = np.array(df_val["final"])
        for th in val_thresholds:
            y_pred = np.array(
                df_val["pos_prob"].apply(lambda x: 1 if x > th else 0))
            #print("true negatives")
            #print(df_val[df_val["final"] == 1][df_val["pos_prob"] <= th][["auth_val", "text"]])
            prec = precision_score(y_true, y_pred)

            pred_labels = y_pred
            true_labels = y_true
            # True Positive (TP): we predict a label of 1 (positive), and the true label is 1.
            TP = np.sum(np.logical_and(pred_labels == 1, true_labels == 1))

            # True Negative (TN): we predict a label of 0 (negative), and the true label is 0.
            TN = np.sum(np.logical_and(pred_labels == 0, true_labels == 0))

            # False Positive (FP): we predict a label of 1 (positive), but the true label is 0.
            FP = np.sum(np.logical_and(pred_labels == 1, true_labels == 0))

            # False Negative (FN): we predict a label of 0 (negative), but the true label is 1.
            FN = np.sum(np.logical_and(pred_labels == 0, true_labels == 1))

            print('TP: %i, FP: %i, TN: %i, FN: %i' % (TP, FP, TN, FN))

            # print(list(zip(label_model.predict(L=L_val_curr), y_val_curr)))
            # print("******************************")
            print("threshold %s, proportion population %.4f, precision %s" %
                  (str(th), df_dev[df_dev["pos_prob"] > th].shape[0] /
                   pop_size, str(prec)))
        exit(0)
    ###########

    with TQDMDaskProgressBar(desc="Dask Apply"):
        applier = PandasParallelLFApplier(lfs=lfs)
        L_train = applier.apply(df=df_train, n_parallel=num_cpu)

    analysis = LFAnalysis(L=L_train, lfs=lfs).lf_summary()
    print(analysis)

    df_l_train = pd.DataFrame(
        L_train, columns=["llf_" + str(x).split(",")[0] for x in lfs])
    print(df_train.shape)
    print(df_l_train.shape)
    df_train = pd.concat([df_train.reset_index(), df_l_train], axis=1)
    print(df_train.shape)
    print("********************************************")

    t4 = time.time()
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train=L_train,
                    n_epochs=1000,
                    lr=0.001,
                    log_freq=100,
                    seed=123,
                    class_balance=[0.3, 0.7])

    probs_train = label_model.predict_proba(L=L_train)
    print("labeling model work ", (time.time() - t4) / 60)

    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=df_train, y=probs_train, L=L_train)

    probs_df = pd.DataFrame(probs_train_filtered,
                            columns=["neg_prob", "pos_prob"])
    print(df_train_filtered.shape)
    print(probs_df.shape)
    result_filtered = pd.concat([
        df_train_filtered[['author', 'value', 'idd']].reset_index(), probs_df
    ],
                                axis=1)
    print(result_filtered.shape)
    print("****************************************************")

    result_filtered.to_csv("/home/tigunova/some_result_" + prefix + ".csv")

    print(df_train_filtered.shape)
    print(probs_df.shape)
    df_train_filtered = pd.concat([df_train_filtered.reset_index(), probs_df],
                                  axis=1)
    df_train_filtered = df_train_filtered.drop(["index"], axis=1)
    print(df_train_filtered.shape)
    df_train_filtered.to_pickle(
        "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/user_" +
        prefix + ".pkl")
    df_train_filtered.to_csv(
        "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/user_" +
        prefix + ".csv")

    # df_train.iloc[L_train[:, 1] == POS].to_csv("/home/tigunova/PycharmProjects/snorkel_labels/data/user_" + prefix + ".csv")

    ### write dict
    output_threshold = 0.63
    output_dict = defaultdict(list)
    auth_hobby_dict = defaultdict(list)
    for index, row in result_filtered.iterrows():
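        # x == x is False for NaN, so this skips rows with missing author/value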
        if row.value == row.value and row.author == row.author:
            auth_hobby_dict[row.author].append([row.value, row.pos_prob])

    allowed_labels = []
    for index, row in df_train_filtered.iterrows():
        if row.value == row.value and row.author == row.author:
            if row.pos_prob > output_threshold:
                output_dict[row.author].append([row.value] + row.idd +
                                               [row.pos_prob])
                allowed_labels.append(syn_to_hob[row.value])
    print("\n".join([
        str(y) for y in sorted(dict(Counter(allowed_labels)).items(),
                               key=lambda x: x[1])
    ]))
    print(
        "After cropping",
        sum([
            x if x < 500 else 500
            for x in dict(Counter(allowed_labels)).values()
        ]))
    print("users in total", len(output_dict))
    for auth, stuffs in output_dict.items():
        prof = ":::".join(set([x[0] for x in stuffs]))
        prob = ":::".join([str(x[-1]) for x in stuffs])
        msgs = set([x for l in stuffs for x in l[1:-1]])
        output_dict[auth] = [prof] + list(msgs) + [prob]

    with open(
            "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/sources/final_author_dict_"
            + prefix + ".txt", "w") as f_out:
        f_out.write(repr(dict(auth_hobby_dict)))
    with open("/home/tigunova/users_profession1.txt", "w") as f_out:
        f_out.write(repr(dict(output_dict)))
Example #14
def label_post(inp_path, prefix = ""):

    #lfs = [job_inpost, check_subreddit, check_iama]
    lfs = [job_inpost, check_iama]

    context_lens = [100, 3, 2]
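    # Build one keyword LF per (keyword, context length, with/without period)
    # combination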
    for with_per in [True, False]:
        for clen in context_lens:
            for kw in patterns:
                lfs.append(make_keyword_lf(keyword=kw, context_len=clen, with_period=with_per))

    print("created lfs, their count", len(lfs))

    df_train = pd.read_pickle(inp_path)

    df_train['texts'] = df_train['text'].swifter.apply(lambda x: [y.lower() for y in tokenize.sent_tokenize(x)])
    df_train['root_value'] = df_train['value'].swifter.apply(lambda x: syn_to_hob[x])
    #df_train['containing_sentences'] = df_train[['texts', 'value']].swifter.apply(lambda y: find_val(y['texts'], y['value']), axis=1)

    print("loaded dataset")

    t1 = time.time()
    with TQDMDaskProgressBar(desc="Dask Apply"):
        applier = PandasParallelLFApplier(lfs=lfs)
        L_train = applier.apply(df=df_train, n_parallel=num_cpu)
    print("time mins ", (time.time() - t1) / 60)

    print(LFAnalysis(L=L_train, lfs=lfs).lf_summary())

    df_l_train = pd.DataFrame(L_train, columns=[str(x).split(",")[0] for x in lfs])
    print(df_train.shape)
    print(df_l_train.shape)
    df_train = pd.concat([df_train.reset_index(), df_l_train], axis=1)
    print(df_train.shape)
    print("*************************************************")
    df_train = df_train.drop(["index"], axis=1)

    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train=L_train, n_epochs=1000, lr=0.001, log_freq=100, seed=123)
    probs_train = label_model.predict_proba(L=L_train)

    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=df_train, y=probs_train, L=L_train
    )
    print("the length of unfiltered posts", len(set(df_train['author'] + "+++++" + df_train['value'])))
    print("the length of filtered posts", len(set(df_train_filtered['author'] + "+++++" + df_train_filtered['value'])))

    probs_df = pd.DataFrame(probs_train_filtered, columns=["neg_prob", "pos_prob"])
    print(df_train_filtered.shape)
    print(probs_df.shape)
    df_train_filtered = pd.concat([df_train_filtered.reset_index(), probs_df], axis=1)
    print(df_train_filtered.shape)

    df_train_filtered.to_pickle("/home/tigunova/PycharmProjects/snorkel_labels/data/profession/train_post_" + prefix + ".pkl")
    df_train_filtered.to_csv("/home/tigunova/PycharmProjects/snorkel_labels/data/profession/train_post_" + prefix + ".csv")

    #df_train.iloc[L_train[:, 1] != ABSTAIN].to_csv("/home/tigunova/PycharmProjects/snorkel_labels/data/profession/intr_train_post_tmp.csv")

    verbose = True
    if verbose:
        for i in range(len(lfs)):
            ppath = "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/interesting_datasets/" + str(lfs[i]).split(",")[0] + ".csv"
            df_train.iloc[L_train[:, i] != ABSTAIN].to_csv(ppath)


    auth_hobby_dict = defaultdict(set)
    for index, row in df_train.iterrows():
        # x == x is False for NaN, so this skips rows with a missing author or value
        if row.value == row.value and row.author == row.author:
            auth_hobby_dict[row.author].add(row.value)

    with open("/home/tigunova/PycharmProjects/snorkel_labels/data/profession/sources/author_profession_dict_" + prefix + ".txt", "w") as f_out:
        f_out.write(repr(dict(auth_hobby_dict)))
Example #15
class Modeler:
    def __init__(self,
                 df_train,
                 df_dev,
                 df_valid,
                 df_test,
                 df_heldout,
                 lfs=None,
                 label_model=None):
        df_train["seen"] = 0
        self.df_train = df_train.reset_index()
        self.df_dev = df_dev
        self.df_valid = df_valid
        self.df_test = df_test
        self.df_heldout = df_heldout
        #self.Y_train = df_train.label.values
        self.Y_dev = df_dev.label.values
        self.Y_valid = df_valid.label.values
        self.Y_test = df_test.label.values
        self.Y_heldout = df_heldout.label.values

        self.lfs = lfs if lfs is not None else {}  # avoid a shared mutable default

        self.L_train = None
        self.L_dev = None
        self.L_valid = None
        self.L_heldout = None
        cardinality = len(df_valid.label.unique())

        # for DEMOing purposes
        self.first_text_indices = [
            1262,  #"check out" "youtube"
            1892,  # I love
            1117,  # url concept
            1706,  # emoji concept
            952,  # "nice"
            971,  # positive concept
            958,  # actually use emoji concept
        ]

        self.count = 0

        if label_model is None:
            self.label_model = LabelModel(cardinality=cardinality,
                                          verbose=True)
        else:
            self.label_model = label_model

        self.vectorizer = CountVectorizer(ngram_range=(1, 2))
        self.vectorizer.fit(df_train.text.tolist())

    def get_lfs(self):
        return list(self.lfs.values())

    def add_lfs(self, new_lfs: dict):
        self.lfs.update(new_lfs)

    def remove_lfs(self, old_lf_ids: list):
        for lf_id in old_lf_ids:
            del self.lfs[lf_id]
        return len(self.lfs)

    def apply_lfs(self):
        applier = PandasLFApplier(lfs=self.get_lfs())
        self.L_train = applier.apply(df=self.df_train)
        self.L_dev = applier.apply(df=self.df_dev)
        self.L_heldout = applier.apply(df=self.df_heldout)
        #self.L_valid = applier.apply(df=self.df_valid)

    def find_duplicate_signature(self):
        label_matrix = np.vstack([self.L_train, self.L_dev])
        seen_signatures = {}
        dupes = {}
        lfs = self.get_lfs()
        signatures = [
            # tobytes() replaces the deprecated ndarray.tostring()
            hash(label_matrix[:, i].tobytes()) for i in range(len(lfs))
        ]
        for i, s in enumerate(signatures):
            lf = lfs[i]
            if s in seen_signatures:
                dupes[lf.name] = seen_signatures[s]
            else:
                seen_signatures[s] = lf.name
        return dupes

    def lf_examples(self, lf_id, n=5):
        lf = self.lfs[lf_id]
        applier = PandasLFApplier(lfs=[lf])
        # squeeze() flattens the (n, 1) label matrix so it can mask the DataFrame
        L_train = applier.apply(df=self.df_train).squeeze()
        labeled_examples = self.df_train[L_train != -1]
        samples = labeled_examples.sample(min(n, len(labeled_examples)),
                                          random_state=13)
        return [{"text": t} for t in samples["text"].values]

    def lf_mistakes(self, lf_id, n=5):
        lf = self.lfs[lf_id]
        applier = PandasLFApplier(lfs=[lf])
        L_dev = applier.apply(df=self.df_dev).squeeze()
        labeled_examples = self.df_dev[(L_dev != -1)
                                       & (L_dev != self.df_dev["label"])]
        samples = labeled_examples.sample(min(n, len(labeled_examples)),
                                          random_state=13)
        return [{"text": t} for t in samples["text"].values]

    def fit_label_model(self):
        assert self.L_train is not None

        self.label_model.fit(L_train=self.L_train,
                             n_epochs=1000,
                             lr=0.001,
                             log_freq=100,
                             seed=123)

    def analyze_lfs(self):
        if len(self.lfs) > 0:
            df = LFAnalysis(L=self.L_train, lfs=self.get_lfs()).lf_summary()
            dev_df = LFAnalysis(L=self.L_dev,
                                lfs=self.get_lfs()).lf_summary(Y=self.Y_dev)
            df = df.merge(dev_df,
                          how="outer",
                          suffixes=(" Training", " Dev."),
                          left_index=True,
                          right_index=True)
            df["Weight"] = self.label_model.get_weights()
            df["Duplicate"] = None
            for dupe, OG in self.find_duplicate_signature().items():
                print("Duplicate labeling signature detected")
                print(dupe, OG)
                df.at[dupe, "Duplicate"] = OG

            return df
        return None

    def get_label_model_stats(self):
        result = self.label_model.score(L=self.L_dev,
                                        Y=self.Y_dev,
                                        metrics=["f1", "precision", "recall"])

        probs_train = self.label_model.predict_proba(L=self.L_train)
        df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
            X=self.df_train, y=probs_train, L=self.L_train)
        result["training_label_coverage"] = len(probs_train_filtered) / len(
            probs_train)
        result["class_0_ratio"] = (probs_train_filtered[:, 0] >
                                   0.5).sum() / len(probs_train_filtered)
        if len(probs_train_filtered) == 0:
            result["class_0_ratio"] = 0

        return result

    def get_heldout_stats(self):
        if self.L_heldout is not None:
            return self.label_model.score(
                L=self.L_heldout,
                Y=self.Y_heldout,
                metrics=["f1", "precision", "recall"])
        return {}

    def train(self):
        probs_train = self.label_model.predict_proba(L=self.L_train)

        df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
            X=self.df_train, y=probs_train, L=self.L_train)

        if len(df_train_filtered) == 0:
            print("Labeling functions cover none of the training examples!",
                  file=sys.stderr)
            return {"micro_f1": 0}

        #from tensorflow.keras.utils import to_categorical
        #df_train_filtered, probs_train_filtered = self.df_dev, to_categorical(self.df_dev["label"].values)

        vectorizer = self.vectorizer
        X_train = vectorizer.transform(df_train_filtered.text.tolist())

        X_dev = vectorizer.transform(self.df_dev.text.tolist())
        X_valid = vectorizer.transform(self.df_valid.text.tolist())
        X_test = vectorizer.transform(self.df_test.text.tolist())

        self.keras_model = get_keras_logreg(input_dim=X_train.shape[1])

        self.keras_model.fit(
            x=X_train,
            y=probs_train_filtered,
            validation_data=(X_valid, preds_to_probs(self.Y_valid, 2)),
            callbacks=[get_keras_early_stopping()],
            epochs=20,
            verbose=0,
        )

        preds_test = self.keras_model.predict(x=X_test).argmax(axis=1)

        #return preds_test
        return self.get_stats(self.Y_test, preds_test)

    def get_heldout_lr_stats(self):
        X_heldout = self.vectorizer.transform(self.df_heldout.text.tolist())
        preds_test = self.keras_model.predict(x=X_heldout).argmax(axis=1)
        return self.get_stats(self.Y_heldout, preds_test)

    def get_stats(self, Y_test, preds_test):
        # Derive the classes from the labels actually being scored
        label_classes = np.unique(Y_test)
        accuracy = metrics.accuracy_score(Y_test, preds_test)
        precision_0, precision_1 = metrics.precision_score(
            Y_test, preds_test, labels=label_classes, average=None)
        recall_0, recall_1 = metrics.recall_score(Y_test,
                                                  preds_test,
                                                  labels=label_classes,
                                                  average=None)
        # Micro-averaged to match the "micro_f1" key returned below
        test_f1 = metrics.f1_score(
            Y_test, preds_test, labels=label_classes, average="micro")

        #recall_0, recall_1 = metrics.precision_recall_fscore_support(self.Y_test, preds_test, labels=label_classes)["recall"]
        return {
            "micro_f1": test_f1,
            "recall_0": recall_0,
            "precision_0": precision_0,
            "accuracy": accuracy,
            "recall_1": recall_1,
            "precision_1": precision_1
        }

    def entropy(self, prob_dist):
        #return(-(L_row_i==-1).sum())
        # Shannon entropy; skip zeros, since x * log(x) -> 0 as x -> 0
        # e.g. entropy([0.5, 0.5]) == log(2)
        return -sum(x * log(x) for x in prob_dist if x > 0)

    def save(self, dir_name):
        self.label_model.save(os.path.join(dir_name, 'label_model.pkl'))
        with open(os.path.join(dir_name, 'model_lfs.pkl'), "wb+") as file:
            pickle.dump(self.lfs, file)

    def load(self, dir_name):
        with open(os.path.join(dir_name, 'model_lfs.pkl'), "rb") as file:
            lfs = pickle.load(file)
            label_model = LabelModel.load(
                os.path.join(dir_name, 'label_model.pkl'))
            self.lfs = lfs
            self.label_model = label_model
Example #16
def run_snorkel_labelling_classification(labeling_functions, file, l_train,
                                         l_valid):
    lfs = labeling_functions
    # lfs = [lf.is_same_thread, lf.has_entities, lf.enity_overlap_jacc, lf.entity_type_overlap_jacc]
    # lfs = [is_same_thread, enity_overlap, entity_types, entity_type_overlap]

    # lfs = [is_long, has_votes, is_doctor_reply, is_same_thread, enity_overlap, has_type_dsyn, has_type_patf, has_type_sosy,
    #        has_type_dora, has_type_fndg, has_type_menp, has_type_chem, has_type_orch, has_type_horm, has_type_phsu,
    #        has_type_medd, has_type_bhvr, has_type_diap, has_type_bacs, has_type_enzy, has_type_inpo, has_type_elii]
    # lfs = [has_votes, is_doctor_reply, is_same_thread, enity_overlap]
    # lfs = [is_same_thread, enity_overlap, is_doctor_reply]

    # analysis = LFAnalysis(L=l_train, lfs=lfs).lf_summary(Y=Y_train)
    # print(analysis)
    # print(analysis['Conflicts'])
    # print(analysis['Overlaps'])

    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train=l_train,
                    n_epochs=20000,
                    lr=0.0001,
                    log_freq=10,
                    seed=2345)
    # label_model.fit(L_train=L_train, n_epochs=20, lr=0.0001, log_freq=10, seed=81794)

    print("Model weights: " + str(label_model.get_weights()))

    valid_probabilities = label_model.predict_proba(L=l_valid)
    if 'predicted_prob' in df_valid:
        # df_valid.drop(columns=['predicted_prob'], axis=1)
        del df_valid['predicted_prob']
    df_valid.insert(50, 'predicted_prob', valid_probabilities[:, 1])

    # df_valid.to_csv("/container/filip/json/ehealthforum/trac/validation_df2.txt", sep="\t", header=True)
    # df_valid = pd.read_csv("/filip/json/ehealthforum/trac/validation_df.txt", sep="\t")

    def compute_precision_at_k(l, k):
        l = l[:k]
        return sum(l) / k
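        # e.g. compute_precision_at_k([1, 0, 1, 1], 3) == 2/3; dividing by k
        # rather than len(l) treats rankings shorter than k as padded misses.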

    PROBABILITY_CUTOFF = 0.5

    df_valid['predicted_label'] = df_valid['predicted_prob'] >= PROBABILITY_CUTOFF

    # Ratio of gold-relevant to predicted-relevant rows (not a true-positive rate)
    true_positive_ratio = df_valid[df_valid.bm25_relevant == 1].count()['bm25_relevant'] / \
                          df_valid[df_valid.predicted_label == 1].count()['predicted_label']

    print("Number of True relevant: " +
          str(df_valid[df_valid.bm25_relevant == 1].count()['bm25_relevant']))
    print("Number of Predicted relevant: " + str(df_valid[
        df_valid.predicted_label == 1].count()['predicted_label']) + '\n')
    print('True positive ratio: ' + str(true_positive_ratio) + '\n')

    df_tru = df_valid.groupby(['query_thread']).head(10)['bm25_relevant']

    df_pred = df_valid.groupby(['query_thread']).head(10)['predicted_label']

    overall_precision = []

    for query, group in df_valid.groupby(['query_thread']):
        precision = compute_precision_at_k(
            group['predicted_label'].head(10).tolist(), 10)
        overall_precision.append(precision)

    print('Overall precision: ' +
          str(sum(overall_precision) / len(overall_precision)))
    print("Accuracy: " + str(accuracy_score(df_tru, df_pred)))

    label_model_acc = label_model.score(L=l_valid, Y=Y_valid)["accuracy"]
    print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")
Example #17

# %% [markdown] {"tags": ["md-exclude"]}
# Let's briefly confirm that the labels the `LabelModel` produces are probabilistic in nature.
# The following histogram shows the confidences we have that each data point has the label SPAM.
# The points we are least certain about will have labels close to 0.5.

# %% {"tags": ["md-exclude"]}
def plot_probabilities_histogram(Y):
    plt.hist(Y, bins=10)
    plt.xlabel("Probability of SPAM")
    plt.ylabel("Number of data points")
    plt.show()


probs_train = label_model.predict_proba(L=L_train)
plot_probabilities_histogram(probs_train[:, SPAM])

# %% [markdown]
# ### Filtering out unlabeled data points

# %% [markdown]
# As we saw earlier, some of the data points in our `train` set received no labels from any of our LFs.
# These data points convey no supervision signal and tend to hurt performance, so we filter them out before training using a
# [built-in utility](https://snorkel.readthedocs.io/en/master/packages/_autosummary/labeling/snorkel.labeling.filter_unlabeled_dataframe.html#snorkel.labeling.filter_unlabeled_dataframe).

# %%
from snorkel.labeling import filter_unlabeled_dataframe

df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=df_train, y=probs_train, L=L_train
)
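
# %% [markdown]
# (Illustrative continuation, not part of the original excerpt:) if a downstream
# model needs discrete targets rather than probabilities, the filtered label
# distribution can be collapsed to hard labels with `probs_to_preds`.

# %%
from snorkel.utils import probs_to_preds

preds_train_filtered = probs_to_preds(probs_train_filtered)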
Example #18
        return ABSTAIN


if __name__ == "__main__":
    warnings.filterwarnings("ignore")
    ((df_dev, Y_dev), df_train, (df_test, Y_test)) = load_data()
    lfs = [lf_husband_wife, lf_husband_wife_left_window, lf_same_last_name,
           lf_married, lf_familial_relationship, lf_family_left_window,
           lf_other_relationship, lf_distant_supervision, lf_distant_supervision_last_names]
    applier = PandasLFApplier(lfs)
    L_dev = applier.apply(df_dev)
    L_train = applier.apply(df_train)
    print(LFAnalysis(L_dev, lfs).lf_summary(Y_dev))
    label_model = LabelModel(cardinality=2, verbose=True)
    # Y_dev is used only to estimate the class balance
    label_model.fit(L_train, Y_dev, n_epochs=5000, log_freq=500, seed=12345)
    probs_dev = label_model.predict_proba(L_dev)
    preds_dev = probs_to_preds(probs_dev)
    print("Label model F1: {f}".format(f=metric_score(Y_dev, preds_dev, probs=probs_dev, metric='f1')))
    print("Label model AUC: {f}".format(f=metric_score(Y_dev, preds_dev, probs=probs_dev, metric='roc_auc')))
    probs_train = label_model.predict_proba(L_train)
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(X=df_train, y=probs_train, L=L_train)
    X_train = get_feature_arrays(df_train_filtered)
    model = get_model()
    batch_size = 64
    model.fit(X_train, probs_train_filtered, batch_size=batch_size, epochs=100)
    X_test = get_feature_arrays(df_test)
    probs_test = model.predict(X_test)
    preds_test = probs_to_preds(probs_test)
    print("Label model F1: {f}".format(f=metric_score(Y_test, preds_test, probs=probs_test, metric='f1')))
    print("Label model AUC: {f}".format(f=metric_score(Y_test, preds_test, probs=probs_test, metric='roc_auc')))
Example #19
def test_e2e():
    """Run an end-to-end test on documents of the hardware domain."""
    PARALLEL = 4

    max_docs = 12

    fonduer.init_logging(
        log_dir="log_folder",
        format="[%(asctime)s][%(levelname)s] %(name)s:%(lineno)s - %(message)s",
        level=logging.INFO,
    )

    session = fonduer.Meta.init(CONN_STRING).Session()

    docs_path = "tests/data/html/"
    pdf_path = "tests/data/pdf/"

    doc_preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)

    corpus_parser = Parser(
        session,
        parallelism=PARALLEL,
        structural=True,
        lingual=True,
        visual=True,
        pdf_path=pdf_path,
    )
    corpus_parser.apply(doc_preprocessor)
    assert session.query(Document).count() == max_docs

    num_docs = session.query(Document).count()
    logger.info(f"Docs: {num_docs}")
    assert num_docs == max_docs

    num_sentences = session.query(Sentence).count()
    logger.info(f"Sentences: {num_sentences}")

    # Divide into test and train
    docs = sorted(corpus_parser.get_documents())
    last_docs = sorted(corpus_parser.get_last_documents())

    ld = len(docs)
    assert ld == len(last_docs)
    assert len(docs[0].sentences) == len(last_docs[0].sentences)

    assert len(docs[0].sentences) == 799
    assert len(docs[1].sentences) == 663
    assert len(docs[2].sentences) == 784
    assert len(docs[3].sentences) == 661
    assert len(docs[4].sentences) == 513
    assert len(docs[5].sentences) == 700
    assert len(docs[6].sentences) == 528
    assert len(docs[7].sentences) == 161
    assert len(docs[8].sentences) == 228
    assert len(docs[9].sentences) == 511
    assert len(docs[10].sentences) == 331
    assert len(docs[11].sentences) == 528

    # Check table numbers
    assert len(docs[0].tables) == 9
    assert len(docs[1].tables) == 9
    assert len(docs[2].tables) == 14
    assert len(docs[3].tables) == 11
    assert len(docs[4].tables) == 11
    assert len(docs[5].tables) == 10
    assert len(docs[6].tables) == 10
    assert len(docs[7].tables) == 2
    assert len(docs[8].tables) == 7
    assert len(docs[9].tables) == 10
    assert len(docs[10].tables) == 6
    assert len(docs[11].tables) == 9

    # Check figure numbers
    assert len(docs[0].figures) == 32
    assert len(docs[1].figures) == 11
    assert len(docs[2].figures) == 38
    assert len(docs[3].figures) == 31
    assert len(docs[4].figures) == 7
    assert len(docs[5].figures) == 38
    assert len(docs[6].figures) == 10
    assert len(docs[7].figures) == 31
    assert len(docs[8].figures) == 4
    assert len(docs[9].figures) == 27
    assert len(docs[10].figures) == 5
    assert len(docs[11].figures) == 27

    # Check caption numbers
    assert len(docs[0].captions) == 0
    assert len(docs[1].captions) == 0
    assert len(docs[2].captions) == 0
    assert len(docs[3].captions) == 0
    assert len(docs[4].captions) == 0
    assert len(docs[5].captions) == 0
    assert len(docs[6].captions) == 0
    assert len(docs[7].captions) == 0
    assert len(docs[8].captions) == 0
    assert len(docs[9].captions) == 0
    assert len(docs[10].captions) == 0
    assert len(docs[11].captions) == 0

    train_docs = set()
    dev_docs = set()
    test_docs = set()
    splits = (0.5, 0.75)
    data = [(doc.name, doc) for doc in docs]
    data.sort(key=lambda x: x[0])
    for i, (doc_name, doc) in enumerate(data):
        if i < splits[0] * ld:
            train_docs.add(doc)
        elif i < splits[1] * ld:
            dev_docs.add(doc)
        else:
            test_docs.add(doc)
    logger.info([x.name for x in train_docs])

    # NOTE: With multi-relation support, return values of getting candidates,
    # mentions, or sparse matrices are formatted as a list of lists. This means
    # that with a single relation, we need to index into the list of lists to
    # get the candidates/mentions/sparse matrix for a particular relation or
    # mention.
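    # Illustratively: with the two relations defined below, get_candidates(split=0)
    # returns [PartTemp candidates, PartVolt candidates], so train_cands[0]
    # selects the PartTemp list and train_cands[1] the PartVolt list.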

    # Mention Extraction
    part_ngrams = MentionNgramsPart(parts_by_doc=None, n_max=3)
    temp_ngrams = MentionNgramsTemp(n_max=2)
    volt_ngrams = MentionNgramsVolt(n_max=1)

    Part = mention_subclass("Part")
    Temp = mention_subclass("Temp")
    Volt = mention_subclass("Volt")

    mention_extractor = MentionExtractor(
        session,
        [Part, Temp, Volt],
        [part_ngrams, temp_ngrams, volt_ngrams],
        [part_matcher, temp_matcher, volt_matcher],
    )

    mention_extractor.apply(docs, parallelism=PARALLEL)

    assert session.query(Part).count() == 299
    assert session.query(Temp).count() == 138
    assert session.query(Volt).count() == 140
    assert len(mention_extractor.get_mentions()) == 3
    assert len(mention_extractor.get_mentions()[0]) == 299
    assert (
        len(
            mention_extractor.get_mentions(
                docs=[session.query(Document).filter(Document.name == "112823").first()]
            )[0]
        )
        == 70
    )

    # Candidate Extraction
    PartTemp = candidate_subclass("PartTemp", [Part, Temp])
    PartVolt = candidate_subclass("PartVolt", [Part, Volt])

    candidate_extractor = CandidateExtractor(
        session, [PartTemp, PartVolt], throttlers=[temp_throttler, volt_throttler]
    )

    for i, docs in enumerate([train_docs, dev_docs, test_docs]):
        candidate_extractor.apply(docs, split=i, parallelism=PARALLEL)

    assert session.query(PartTemp).filter(PartTemp.split == 0).count() == 3493
    assert session.query(PartTemp).filter(PartTemp.split == 1).count() == 61
    assert session.query(PartTemp).filter(PartTemp.split == 2).count() == 416
    assert session.query(PartVolt).count() == 4282

    # Grab candidate lists
    train_cands = candidate_extractor.get_candidates(split=0, sort=True)
    dev_cands = candidate_extractor.get_candidates(split=1, sort=True)
    test_cands = candidate_extractor.get_candidates(split=2, sort=True)
    assert len(train_cands) == 2
    assert len(train_cands[0]) == 3493
    assert (
        len(
            candidate_extractor.get_candidates(
                docs=[session.query(Document).filter(Document.name == "112823").first()]
            )[0]
        )
        == 1432
    )

    # Featurization
    featurizer = Featurizer(session, [PartTemp, PartVolt])

    # Test that FeatureKey is properly reset
    featurizer.apply(split=1, train=True, parallelism=PARALLEL)
    assert session.query(Feature).count() == 214
    assert session.query(FeatureKey).count() == 1260

    # Test Dropping FeatureKey
    # Should force a row deletion
    featurizer.drop_keys(["DDL_e1_W_LEFT_POS_3_[NNP NN IN]"])
    assert session.query(FeatureKey).count() == 1259

    # Should only remove the part_volt as a relation and leave part_temp
    assert set(
        session.query(FeatureKey)
        .filter(FeatureKey.name == "DDL_e1_LEMMA_SEQ_[bc182]")
        .one()
        .candidate_classes
    ) == {"part_temp", "part_volt"}
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"], candidate_classes=[PartVolt])
    assert session.query(FeatureKey).filter(
        FeatureKey.name == "DDL_e1_LEMMA_SEQ_[bc182]"
    ).one().candidate_classes == ["part_temp"]
    assert session.query(FeatureKey).count() == 1259

    # Inserting the removed key
    featurizer.upsert_keys(
        ["DDL_e1_LEMMA_SEQ_[bc182]"], candidate_classes=[PartTemp, PartVolt]
    )
    assert set(
        session.query(FeatureKey)
        .filter(FeatureKey.name == "DDL_e1_LEMMA_SEQ_[bc182]")
        .one()
        .candidate_classes
    ) == {"part_temp", "part_volt"}
    assert session.query(FeatureKey).count() == 1259
    # Removing the key again
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"], candidate_classes=[PartVolt])

    # Removing the last relation from a key should delete the row
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"], candidate_classes=[PartTemp])
    assert session.query(FeatureKey).count() == 1258
    session.query(Feature).delete(synchronize_session="fetch")
    session.query(FeatureKey).delete(synchronize_session="fetch")

    featurizer.apply(split=0, train=True, parallelism=PARALLEL)
    assert session.query(Feature).count() == 6478
    assert session.query(FeatureKey).count() == 4538
    F_train = featurizer.get_feature_matrices(train_cands)
    assert F_train[0].shape == (3493, 4538)
    assert F_train[1].shape == (2985, 4538)
    assert len(featurizer.get_keys()) == 4538

    featurizer.apply(split=1, parallelism=PARALLEL)
    assert session.query(Feature).count() == 6692
    assert session.query(FeatureKey).count() == 4538
    F_dev = featurizer.get_feature_matrices(dev_cands)
    assert F_dev[0].shape == (61, 4538)
    assert F_dev[1].shape == (153, 4538)

    featurizer.apply(split=2, parallelism=PARALLEL)
    assert session.query(Feature).count() == 8252
    assert session.query(FeatureKey).count() == 4538
    F_test = featurizer.get_feature_matrices(test_cands)
    assert F_test[0].shape == (416, 4538)
    assert F_test[1].shape == (1144, 4538)

    gold_file = "tests/data/hardware_tutorial_gold.csv"

    labeler = Labeler(session, [PartTemp, PartVolt])

    labeler.apply(
        docs=last_docs,
        lfs=[[gold], [gold]],
        table=GoldLabel,
        train=True,
        parallelism=PARALLEL,
    )
    assert session.query(GoldLabel).count() == 8252

    stg_temp_lfs = [
        LF_storage_row,
        LF_operating_row,
        LF_temperature_row,
        LF_tstg_row,
        LF_to_left,
        LF_negative_number_left,
    ]

    ce_v_max_lfs = [
        LF_bad_keywords_in_row,
        LF_current_in_row,
        LF_non_ce_voltages_in_row,
    ]

    with pytest.raises(ValueError):
        labeler.apply(split=0, lfs=stg_temp_lfs, train=True, parallelism=PARALLEL)

    labeler.apply(
        docs=train_docs,
        lfs=[stg_temp_lfs, ce_v_max_lfs],
        train=True,
        parallelism=PARALLEL,
    )
    assert session.query(Label).count() == 6478
    assert session.query(LabelKey).count() == 9
    L_train = labeler.get_label_matrices(train_cands)
    assert L_train[0].shape == (3493, 9)
    assert L_train[1].shape == (2985, 9)
    assert len(labeler.get_keys()) == 9

    # Test Dropping LabelerKey
    labeler.drop_keys(["LF_storage_row"])
    assert len(labeler.get_keys()) == 8

    # Test Upserting LabelerKey
    labeler.upsert_keys(["LF_storage_row"])
    assert "LF_storage_row" in [label.name for label in labeler.get_keys()]

    L_train_gold = labeler.get_gold_labels(train_cands)
    assert L_train_gold[0].shape == (3493, 1)

    L_train_gold = labeler.get_gold_labels(train_cands, annotator="gold")
    assert L_train_gold[0].shape == (3493, 1)

    gen_model = LabelModel()
    gen_model.fit(L_train=L_train[0], n_epochs=500, log_freq=100)

    train_marginals = gen_model.predict_proba(L_train[0])

    disc_model = LogisticRegression()
    disc_model.train(
        (train_cands[0], F_train[0]),
        train_marginals,
        X_dev=(train_cands[0], F_train[0]),
        Y_dev=L_train_gold[0].reshape(-1),
        b=0.6,
        pos_label=TRUE,
        n_epochs=5,
        lr=0.001,
    )

    test_score = disc_model.predict((test_cands[0], F_test[0]), b=0.6, pos_label=TRUE)
    true_pred = [test_cands[0][_] for _ in np.nditer(np.where(test_score == TRUE))]

    pickle_file = "tests/data/parts_by_doc_dict.pkl"
    with open(pickle_file, "rb") as f:
        parts_by_doc = pickle.load(f)

    (TP, FP, FN) = entity_level_f1(
        true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc
    )

    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")

    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")

    assert f1 < 0.7 and f1 > 0.3

    stg_temp_lfs_2 = [
        LF_to_left,
        LF_test_condition_aligned,
        LF_collector_aligned,
        LF_current_aligned,
        LF_voltage_row_temp,
        LF_voltage_row_part,
        LF_typ_row,
        LF_complement_left_row,
        LF_too_many_numbers_row,
        LF_temp_on_high_page_num,
        LF_temp_outside_table,
        LF_not_temp_relevant,
    ]
    labeler.update(split=0, lfs=[stg_temp_lfs_2, ce_v_max_lfs], parallelism=PARALLEL)
    assert session.query(Label).count() == 6478
    assert session.query(LabelKey).count() == 16
    L_train = labeler.get_label_matrices(train_cands)
    assert L_train[0].shape == (3493, 16)

    gen_model = LabelModel()
    gen_model.fit(L_train=L_train[0], n_epochs=500, log_freq=100)

    train_marginals = gen_model.predict_proba(L_train[0])

    disc_model = LogisticRegression()
    disc_model.train(
        (train_cands[0], F_train[0]), train_marginals, n_epochs=5, lr=0.001
    )

    test_score = disc_model.predict((test_cands[0], F_test[0]), b=0.6, pos_label=TRUE)
    true_pred = [test_cands[0][_] for _ in np.nditer(np.where(test_score == TRUE))]

    (TP, FP, FN) = entity_level_f1(
        true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc
    )

    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")

    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")

    assert f1 > 0.7

    # Testing LSTM
    disc_model = LSTM()
    disc_model.train(
        (train_cands[0], F_train[0]), train_marginals, n_epochs=5, lr=0.001
    )

    test_score = disc_model.predict((test_cands[0], F_test[0]), b=0.6, pos_label=TRUE)
    true_pred = [test_cands[0][_] for _ in np.nditer(np.where(test_score == TRUE))]

    (TP, FP, FN) = entity_level_f1(
        true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc
    )

    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")

    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")

    assert f1 > 0.7

    # Testing Sparse Logistic Regression
    disc_model = SparseLogisticRegression()
    disc_model.train(
        (train_cands[0], F_train[0]), train_marginals, n_epochs=5, lr=0.001
    )

    test_score = disc_model.predict((test_cands[0], F_test[0]), b=0.6, pos_label=TRUE)
    true_pred = [test_cands[0][_] for _ in np.nditer(np.where(test_score == TRUE))]

    (TP, FP, FN) = entity_level_f1(
        true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc
    )

    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")

    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")

    assert f1 > 0.7

    # Testing Sparse LSTM
    disc_model = SparseLSTM()
    disc_model.train(
        (train_cands[0], F_train[0]), train_marginals, n_epochs=5, lr=0.001
    )

    test_score = disc_model.predict((test_cands[0], F_test[0]), b=0.6, pos_label=TRUE)
    true_pred = [test_cands[0][_] for _ in np.nditer(np.where(test_score == TRUE))]

    (TP, FP, FN) = entity_level_f1(
        true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc
    )

    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")

    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")

    assert f1 > 0.7

    # Evaluate mention level scores
    L_test_gold = labeler.get_gold_labels(test_cands, annotator="gold")
    Y_test = L_test_gold[0].reshape(-1)

    scores = disc_model.score((test_cands[0], F_test[0]), Y_test, b=0.6, pos_label=TRUE)

    logger.info(scores)

    assert scores["f1"] > 0.6
Example #20
X_gold_sent, X_gold_shortest_path, X_gold_src, X_gold_tgt, X_gold_src_txt, X_gold_tgt_txt, y_gold = data_handler.get_test_data()

X_val_sent, X_val_shortest_path, X_val_src, X_val_tgt, X_val_src_txt, X_val_tgt_txt, y_val = data_handler.get_validation_data()

applier = PandasLFApplier(label_functions.lfs)

df_train = pd.DataFrame(list(zip(*data_handler.get_training_data())), columns=['shortest_path', 'sent', 'src', 'tgt', 'src_txt', 'tgt_txt'])

L_train = applier.apply(df_train)

label_model = LabelModel(cardinality=len(rel_names.rels_txt_to_int), verbose=True)
label_model.fit(L_train, n_epochs=1000, lr=0.01, log_freq=100, seed=123)

label_model.save('./models/LabelModel.model')

train_probs = label_model.predict_proba(L_train)
train_preds = probs_to_preds(train_probs, tie_break_policy='abstain')

df_train = df_train.join(pd.DataFrame({'preds': train_preds, 'probs': list(map(max, train_probs))}))

# Map label-model abstains (-1) to the otherwiseRelated class
df_train.loc[df_train.preds == -1, 'preds'] = rel_names.rels_txt_to_int['otherwiseRelated']

# Downsample otherwiseRelated to roughly the mean per-class count
dropNum = len(df_train[df_train.preds == rel_names.rels_txt_to_int['otherwiseRelated']]) - int(df_train['preds'].value_counts().mean())
df_train = df_train.drop(df_train[df_train.preds == rel_names.rels_txt_to_int['otherwiseRelated']].sample(dropNum).index)

cnts = {}
for x in df_train['preds']:
    name = rel_names.rels_int_to_text[x]
    if name not in cnts: