Example #1
    def test_L_form(self):
        label_model = LabelModel(cardinality=2, verbose=False)
        L = np.array([[0, 1, 0], [0, 1, 0], [1, 0, 0], [0, 1, 0]])
        label_model._set_constants(L)
        self.assertEqual(label_model.n, 4)
        self.assertEqual(label_model.m, 3)

        L = np.array([[0, 1, 2], [0, 1, 2], [1, 0, 2], [0, 1, 0]])
        with self.assertRaisesRegex(ValueError, "L_train has cardinality"):
            label_model.fit(L, n_epochs=1)

        L = np.array([[0], [1], [-1]])
        with self.assertRaisesRegex(ValueError,
                                    "L_train should have at least 3"):
            label_model.fit(L, n_epochs=1)

    def test_optimizer_init(self):
        L = np.array([[0, -1, 0], [0, 1, 0]])
        label_model = LabelModel()

        label_model.fit(L, optimizer="sgd", n_epochs=1)
        self.assertIsInstance(label_model.optimizer, optim.SGD)

        label_model.fit(L, optimizer="adam", n_epochs=1)
        self.assertIsInstance(label_model.optimizer, optim.Adam)

        label_model.fit(L, optimizer="adamax", n_epochs=1)
        self.assertIsInstance(label_model.optimizer, optim.Adamax)

        with self.assertRaisesRegex(ValueError, "Unrecognized optimizer"):
            label_model.fit(L, optimizer="bad_optimizer", n_epochs=1)
Example #3
    def test_mv_default(self):
        # fewer than 2 LFs have overlaps
        label_model = LabelModel(cardinality=2, verbose=False)
        L = np.array([[-1, -1, 1], [-1, 1, -1], [0, -1, -1]])
        label_model.fit(L, n_epochs=100)
        np.testing.assert_array_almost_equal(
            label_model.predict(L), np.array([1, 1, 0])
        )

        # fewer than 2 LFs have conflicts
        L = np.array([[-1, -1, 1], [-1, 1, 1], [1, 1, 1]])
        label_model.fit(L, n_epochs=100)
        np.testing.assert_array_almost_equal(
            label_model.predict(L), np.array([1, 1, 1])
        )
Example #4
    def test_warmup(self):
        L = np.array([[0, -1, 0], [0, 1, 0]])
        label_model = LabelModel()

        lr_scheduler_config = {"warmup_steps": 3, "warmup_unit": "epochs"}
        label_model.fit(L, lr_scheduler_config=lr_scheduler_config, n_epochs=5)
        self.assertEqual(label_model.warmup_steps, 3)

        lr_scheduler_config = {"warmup_percentage": 3 / 5}
        label_model.fit(L, lr_scheduler_config=lr_scheduler_config, n_epochs=5)
        self.assertEqual(label_model.warmup_steps, 3)

        with self.assertRaisesRegex(ValueError, "LabelModel does not support"):
            lr_scheduler_config = {"warmup_steps": 1, "warmup_unit": "batches"}
            label_model.fit(L, lr_scheduler_config=lr_scheduler_config)

    def test_label_model_basic(self) -> None:
        """Test the LabelModel's estimate of P and Y on a simple synthetic dataset."""
        np.random.seed(123)
        P, Y, L = generate_simple_label_matrix(self.n, self.m, self.cardinality)

        # Train LabelModel
        label_model = LabelModel(cardinality=self.cardinality, verbose=False)
        label_model.fit(L, n_epochs=200, lr=0.01, seed=123)

        # Test estimated LF conditional probabilities
        P_lm = label_model.get_conditional_probs()
        np.testing.assert_array_almost_equal(P, P_lm, decimal=2)

        # Test predicted labels
        score = label_model.score(L, Y)
        self.assertGreaterEqual(score["accuracy"], 0.9)

    def test_scheduler_init(self):
        L = np.array([[0, -1, 0], [0, 1, 0]])
        label_model = LabelModel()

        label_model.fit(L, lr_scheduler="constant", n_epochs=1)
        self.assertIsNone(label_model.lr_scheduler)

        label_model.fit(L, lr_scheduler="linear", n_epochs=1)
        self.assertIsInstance(label_model.lr_scheduler, optim.lr_scheduler.LambdaLR)

        label_model.fit(L, lr_scheduler="exponential", n_epochs=1)
        self.assertIsInstance(
            label_model.lr_scheduler, optim.lr_scheduler.ExponentialLR
        )

        label_model.fit(L, lr_scheduler="step", n_epochs=1)
        self.assertIsInstance(label_model.lr_scheduler, optim.lr_scheduler.StepLR)
Example #7
    def test_label_model(self) -> None:
        """Test the LabelModel's estimate of P and Y."""
        np.random.seed(123)
        P, Y, L = generate_simple_label_matrix(self.n, self.m,
                                               self.cardinality)

        # Train LabelModel
        label_model = LabelModel(cardinality=self.cardinality, verbose=False)
        label_model.fit(L, n_epochs=200, lr=0.01, seed=123)

        # Test estimated LF conditional probabilities
        P_lm = label_model._get_conditional_probs().reshape(
            (self.m, self.cardinality + 1, -1))
        np.testing.assert_array_almost_equal(P, P_lm, decimal=2)

        # Test predicted labels
        Y_lm = label_model.predict_proba(L).argmax(axis=1)
        err = np.where(Y != Y_lm, 1, 0).sum() / self.n
        self.assertLess(err, 0.1)

    def test_class_balance(self):
        label_model = LabelModel(cardinality=2, verbose=False)
        # Test class balance
        Y_dev = np.array([0, 0, 1, 1, 0, 0, 0, 0, 1, 1])
        label_model._set_class_balance(class_balance=None, Y_dev=Y_dev)
        np.testing.assert_array_almost_equal(label_model.p, np.array([0.6, 0.4]))

        class_balance = np.array([0.0, 1.0])
        with self.assertRaisesRegex(ValueError, "Class balance prior is 0"):
            label_model._set_class_balance(class_balance=class_balance, Y_dev=Y_dev)

        class_balance = np.array([0.0])
        with self.assertRaisesRegex(ValueError, "class_balance has 1 entries."):
            label_model._set_class_balance(class_balance=class_balance, Y_dev=Y_dev)

        Y_dev_one_class = np.array([0, 0, 0])
        with self.assertRaisesRegex(
            ValueError, "Does not match LabelModel cardinality"
        ):
            label_model._set_class_balance(class_balance=None, Y_dev=Y_dev_one_class)

    def test_labeling_convergence(self) -> None:
        """Test convergence of end to end labeling pipeline."""
        # Apply LFs
        labeling_functions = (
            [f]
            + [get_positive_labeling_function(divisor) for divisor in range(2, 9)]
            + [get_negative_labeling_function(divisor) for divisor in range(2, 9)]
        )
        applier = PandasLFApplier(labeling_functions)
        L_train = applier.apply(self.df_train, progress_bar=False)

        self.assertEqual(L_train.shape, (self.N_TRAIN, len(labeling_functions)))

        # Train LabelModel
        label_model = LabelModel(cardinality=self.cardinality, verbose=False)
        label_model.fit(L_train, n_epochs=100, lr=0.01, l2=0.0)
        Y_lm = label_model.predict_proba(L_train).argmax(axis=1)
        Y = self.df_train.y
        err = np.where(Y != Y_lm, 1, 0).sum() / self.N_TRAIN
        self.assertLess(err, 0.05)

    def test_predict(self):
        # 3 LFs that always disagree/abstain lead to all abstains
        L = np.array([[-1, 1, 0], [0, -1, 1], [1, 0, -1]])
        label_model = LabelModel(cardinality=2, verbose=False)
        label_model.fit(L, n_epochs=100)
        np.testing.assert_array_almost_equal(label_model.predict(L),
                                             np.array([-1, -1, -1]))

        L = np.array([[0, 1, 0], [0, 1, 0]])
        label_model = self._set_up_model(L)

        label_model.mu = nn.Parameter(label_model.mu_init.clone().clamp(
            0.01, 0.99))
        preds = label_model.predict(L)

        true_preds = np.array([0, 0])
        np.testing.assert_array_equal(preds, true_preds)

        preds, probs = label_model.predict(L, return_probs=True)
        true_probs = np.array([[0.99, 0.01], [0.99, 0.01]])
        np.testing.assert_array_almost_equal(probs, true_probs)

    def test_get_weight(self):
        # set up L matrix
        true_accs = [0.95, 0.6, 0.7, 0.55, 0.8]
        coverage = [1.0, 0.8, 1.0, 1.0, 1.0]
        L = -1 * np.ones((1000, len(true_accs)))
        Y = np.zeros(1000)

        for i in range(1000):
            Y[i] = 1 if np.random.rand() <= 0.5 else 0
            for j in range(5):
                if np.random.rand() <= coverage[j]:
                    L[i, j] = (Y[i] if np.random.rand() <= true_accs[j] else
                               np.abs(Y[i] - 1))

        label_model = LabelModel(cardinality=2)
        label_model.fit(L, n_epochs=1000, seed=123)

        accs = label_model.get_weights()
        for i in range(len(accs)):
            true_acc = true_accs[i]
            self.assertAlmostEqual(accs[i], true_acc, delta=0.1)

def curate_twitter(save_name='../../pandafied_data/curated_twitter.csv'):
    df_train = pd.read_csv('../../pandafied_data/pandafied_twitter.csv')
    #from utils import load_unlabeled_spam_dataset
    #df_train = load_unlabeled_spam_dataset()

    # Define the set of labeling functions (LFs)
    #lfs = [lf_keyword_wateroverlast,lf_keyword_voertuig,lf_keyword_aanrijding,lf_keyword_te_water,lf_keyword_persoon,lf_keyword_brand,lf_keyword_mps,lf_keyword_kps,lf_keyword_luchtdr]

    #lfs = [lf_keyword_keywords]

    lfs = [lf_keyword_wateroverlast]

    # Apply the LFs to the unlabeled training data
    applier = PandasLFApplier(lfs)
    L_train = applier.apply(df_train)

    # Train the label model and compute the training labels
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
    df_train["label"] = label_model.predict(L=L_train,
                                            tie_break_policy="abstain")
    #tie_break_policy="true-random"
    #tie_break_policy="abstain"
    counter = 0
    for i in range(len(df_train["label"])):
        if df_train["label"][i] == WATER:
            print()
            print(df_train["text"][i])
            print(df_train["label"][i])
            print()
            counter += 1

    print("num entries total: " + str(len(df_train["label"])))
    print("num entries water: " + str(counter))

    #df_train = df_train[df_train.label != ABSTAIN]

    twitter_curated = df_train[df_train.label == WATER]
    twitter_curated = twitter_curated.drop(columns='label')
    twitter_curated.to_csv(save_name, index=False)
Example #13
def main(data_path, output_path):
    # Read data
    logging.info(f"Reading data from {data_path}")
    data = dd.read_parquet(data_path)
    data = data.repartition(npartitions=2)

    # Build label matrix
    logging.info("Applying LFs")
    lfs = [article_mentions_person, body_contains_fortune, person_in_db]
    applier = DaskLFApplier(lfs)
    L = applier.apply(data)

    # Train label model
    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(L)

    # Generate training labels
    logging.info("Generating probabilistic labels")
    y_prob = label_model.predict_proba(L)[:, 1]
    data = data.reset_index().set_index("index")
    data_labeled = data.assign(y_prob=dd.from_array(y_prob))
    dd.to_parquet(data_labeled, output_path)
    logging.info(f"Labels saved to {output_path}")
Example #14
    def test_score(self):
        L = np.array([[1, 1, 0], [-1, -1, -1], [1, 0, 1]])
        Y = np.array([1, 0, 1])
        label_model = LabelModel(cardinality=2, verbose=False)
        label_model.fit(L, n_epochs=100)
        results = label_model.score(L, Y, metrics=["accuracy", "coverage"])
        np.testing.assert_array_almost_equal(
            label_model.predict(L), np.array([1, -1, 1])
        )

        results_expected = dict(accuracy=1.0, coverage=2 / 3)
        self.assertEqual(results, results_expected)

        L = np.array([[1, 0, 1], [1, 0, 1]])
        label_model = self._set_up_model(L)
        label_model.mu = nn.Parameter(label_model.mu_init.clone().clamp(0.01, 0.99))

        results = label_model.score(L, Y=np.array([0, 1]))
        results_expected = dict(accuracy=0.5)
        self.assertEqual(results, results_expected)

        results = label_model.score(L=L, Y=np.array([1, 0]), metrics=["accuracy", "f1"])
        results_expected = dict(accuracy=0.5, f1=2 / 3)
        self.assertEqual(results, results_expected)
Example #15
test_unfired_idx = [i for i, item in enumerate(test_m) if sum(item) == 0]
targets_test = test_L[test_fired_idx]
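
# compute_accuracy (used below) is a project-specific helper defined elsewhere;
# a plausible minimal implementation, assuming it computes micro-averaged
# accuracy from the per-class support and recall arrays returned by
# sklearn's precision_recall_fscore_support, is sketched here:
import numpy as np

def compute_accuracy(support, recall):
    # recall_i = correct_i / support_i for each class, so support_i * recall_i
    # recovers the number of correct predictions in class i; summing over
    # classes and dividing by the total support gives overall accuracy.
    support = np.asarray(support, dtype=float)
    recall = np.asarray(recall, dtype=float)
    return float(np.sum(support * recall) / np.sum(support))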

# Majority voting using Snorkel's majority-vote model
maj_preds_test = majority_model.predict(L=test_lsnork[test_fired_idx])
maj_precision_test, maj_recall_test, maj_f1_score_test, maj_support_test = precision_recall_fscore_support(targets_test, maj_preds_test)
maj_accuracy_test = compute_accuracy(maj_support_test, maj_recall_test)

print("precision on *** RULE COVERD TEST SET ***   of MAJORITY VOTING: {}".format(maj_precision_test))
print("recall on *** RULE COVERED TEST SET ***  of MAJORITY VOTING: {}".format(maj_recall_test))
print("f1_score on *** RULE COVERED TEST SET *** of MAJORITY VOTING: {}".format(maj_f1_score_test))
print("support on *** RULE COVERED TEST SET ***  of MAJORITY VOTING: {}".format(maj_support_test))
print("accuracy on *** RULE COVERED TEST SET ***   of MAJORITY VOTING: {}".format(maj_accuracy_test))


# Now train Snorkel's LabelModel
print("Training Snorkel's LabelModel")
label_model = LabelModel(cardinality=num_classes, verbose=True)
label_model.fit(L_train=U_lsnork, n_epochs=1000, lr=0.001, log_freq=100, seed=123)
label_model.save(os.path.join(path_dir, "saved_label_model"))

snork_preds_test = label_model.predict(L=test_lsnork[test_fired_idx])
snork_precision_test, snork_recall_test, snork_f1_score_test, snork_support_test = precision_recall_fscore_support(targets_test, snork_preds_test)
snork_accuracy_test = compute_accuracy(snork_support_test, snork_recall_test)
print("precision on *** RULE COVERED TEST SET *** of SNORKEL VOTING: {}".format(snork_precision_test))
print("recall on *** RULE COVERED TEST SET *** of SNORKEL VOTING: {}".format(snork_recall_test))
print("f1_score on *** RULE COVERED TEST SET *** of SNORKEL VOTING: {}".format(snork_f1_score_test))
print("support on *** RULE COVERED TEST SET *** of SNORKEL VOTING: {}".format(snork_support_test))
print("accuracy on *** RULE COVERED TEST SET *** of SNORKEL VOTING: {}".format(snork_accuracy_test))
Example #16
# %% [markdown]
# However, as we can clearly see by looking at the summary statistics of our LFs in the previous section, they are not all equally accurate, and should not be treated identically. In addition to having varied accuracies and coverages, LFs may be correlated, resulting in certain signals being overrepresented in a majority-vote-based model. To handle these issues appropriately, we will instead use a more sophisticated Snorkel `LabelModel` to combine the outputs of the LFs.
#
# This model will ultimately produce a single set of noise-aware training labels, which are probabilistic or confidence-weighted labels. We will then use these labels to train a classifier for our task. For more technical details of this overall approach, see our [NeurIPS 2016](https://arxiv.org/abs/1605.07723) and [AAAI 2019](https://arxiv.org/abs/1810.02840) papers. For more info on the API, see the [`LabelModel` documentation](https://snorkel.readthedocs.io/en/master/packages/_autosummary/labeling/snorkel.labeling.LabelModel.html#snorkel.labeling.LabelModel).
#
# Note that no gold labels are used during the training process.
# The only information we need is the label matrix, which contains the output of the LFs on our training set.
# The `LabelModel` is able to learn weights for the labeling functions using only the label matrix as input.
# We also specify the `cardinality`, or number of classes.
# The `LabelModel` trains much more quickly than typical discriminative models since we only need the label matrix as input.

# %% {"tags": ["md-exclude-output"]}
from snorkel.labeling import LabelModel

label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, lr=0.001, log_freq=100, seed=123)

# %%
majority_acc = majority_model.score(L=L_valid, Y=Y_valid)["accuracy"]
print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")

label_model_acc = label_model.score(L=L_valid, Y=Y_valid)["accuracy"]
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")

# %% [markdown]
# So our `LabelModel` improves over the majority vote baseline!
# However, it is typically **not suitable as an inference-time model** to make predictions for unseen data points, due to (among other things) some data points having all abstain labels.
# In the next section, we will use the output of the label model as training labels to train a
# discriminative classifier to see if we can improve performance further.
# This classifier will only need the text of the comment to make predictions, making it much more suitable
# for inference over unseen data points.
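
# %% [markdown]
# As a minimal sketch of that next step (not the tutorial's exact end model), assuming the fitted
# `label_model`, `L_train`, and `df_train` from above, and assuming the comment text lives in a `text` column:

# %% {"tags": ["md-exclude-output"]}
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

from snorkel.labeling import filter_unlabeled_dataframe
from snorkel.utils import probs_to_preds

# Drop data points on which every LF abstained, then convert the probabilistic
# labels into hard labels for a standard classifier.
probs_train = label_model.predict_proba(L_train)
df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=df_train, y=probs_train, L=L_train
)
preds_train_filtered = probs_to_preds(probs_train_filtered)

vectorizer = CountVectorizer(ngram_range=(1, 2))
X_train = vectorizer.fit_transform(df_train_filtered.text.tolist())
sklearn_model = LogisticRegression(solver="liblinear")
sklearn_model.fit(X=X_train, y=preds_train_filtered)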
Example #17
def label_post(inp_path, prefix = ""):

    #lfs = [job_inpost, check_subreddit, check_iama]
    lfs = [job_inpost, check_iama]

    context_lens = [100, 3, 2]
    for with_per in [True, False]:
        for clen in context_lens:
            for kw in patterns:
                lfs.append(make_keyword_lf(keyword=kw, context_len=clen, with_period=with_per))

    print("created lfs, their count", len(lfs))

    df_train = pd.read_pickle(inp_path)

    df_train['texts'] = df_train['text'].swifter.apply(lambda x: [y.lower() for y in tokenize.sent_tokenize(x)])
    df_train['root_value'] = df_train['value'].swifter.apply(lambda x: syn_to_hob[x])
    #df_train['containing_sentences'] = df_train[['texts', 'value']].swifter.apply(lambda y: find_val(y['texts'], y['value']), axis=1)

    print("loaded dataset")

    t1 = time.time()
    with TQDMDaskProgressBar(desc="Dask Apply"):
        applier = PandasParallelLFApplier(lfs=lfs)
        L_train = applier.apply(df=df_train, n_parallel=num_cpu)
    print("time mins ", (time.time() - t1) / 60)

    print(LFAnalysis(L=L_train, lfs=lfs).lf_summary())

    df_l_train = pd.DataFrame(L_train, columns=[str(x).split(",")[0] for x in lfs])
    print(df_train.shape)
    print(df_l_train.shape)
    df_train = pd.concat([df_train.reset_index(), df_l_train], axis=1)
    print(df_train.shape)
    print("*************************************************")
    df_train = df_train.drop(["index"], axis=1)

    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train=L_train, n_epochs=1000, lr=0.001, log_freq=100, seed=123)
    probs_train = label_model.predict_proba(L=L_train)

    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=df_train, y=probs_train, L=L_train
    )
    print("the length of unfiltered posts", len(set(df_train['author'] + "+++++" + df_train['value'])))
    print("the length of filtered posts", len(set(df_train_filtered['author'] + "+++++" + df_train_filtered['value'])))

    probs_df = pd.DataFrame(probs_train_filtered, columns=["neg_prob", "pos_prob"])
    print(df_train_filtered.shape)
    print(probs_df.shape)
    df_train_filtered = pd.concat([df_train_filtered.reset_index(), probs_df], axis=1)
    print(df_train_filtered.shape)

    df_train_filtered.to_pickle("/home/tigunova/PycharmProjects/snorkel_labels/data/profession/train_post_" + prefix + ".pkl")
    df_train_filtered.to_csv("/home/tigunova/PycharmProjects/snorkel_labels/data/profession/train_post_" + prefix + ".csv")

    #df_train.iloc[L_train[:, 1] != ABSTAIN].to_csv("/home/tigunova/PycharmProjects/snorkel_labels/data/profession/intr_train_post_tmp.csv")

    verbose = True
    if verbose:
        for i in range(len(lfs)):
            ppath = "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/interesting_datasets/" + str(lfs[i]).split(",")[0] + ".csv"
            df_train.iloc[L_train[:, i] != ABSTAIN].to_csv(ppath)


    auth_hobby_dict = defaultdict(set)
    for index, row in df_train.iterrows():
        if row.value == row.value and row.author == row.author:
            auth_hobby_dict[row.author].add(row.value)

    with open("/home/tigunova/PycharmProjects/snorkel_labels/data/profession/sources/author_profession_dict_" + prefix + ".txt", "w") as f_out:
        f_out.write(repr(dict(auth_hobby_dict)))
Example #18
def label_user(inp_path, prefix=""):
    df_train = pd.read_pickle(inp_path)

    ########## threshold on word similarity
    take_first = 100
    overall_first = 10000
    global thresh_by_value, overall_thresh
    df_train['root_value'] = df_train['value'].swifter.set_dask_threshold(
        dask_threshold=0.001).allow_dask_on_strings().apply(
            lambda x: syn_to_hob[x])
    thresh_by_value = df_train.groupby(
        ["root_value"]).apply(lambda x: np.partition(
            x['lexicon_counts'], max(len(x['lexicon_counts']) - take_first, 0)
        )[max(len(x['lexicon_counts']) - take_first, 0)]).to_dict()
    overall_thresh = np.partition(df_train["lexicon_counts"].to_numpy(),
                                  max(len(df_train) - overall_first, 0))[max(
                                      len(df_train) - overall_first, 0)]
    print(overall_thresh)
    #############################

    # separately loose - strict, pos - neg, period - without
    names_pool = [
        "context:2_count_pos", "context:3_count_pos", "context:100_count_pos",
        "context:2_period_count_pos", "context:3_period_count_pos",
        "context:100_period_count_pos", "context:2_count_neg",
        "context:3_count_neg", "context:100_count_neg",
        "context:2_period_count_neg", "context:3_period_count_neg",
        "context:100_period_count_neg"
    ]
    for f_name in names_pool:
        curr_cols = [x for x in df_train.columns if f_name in x]
        df_train['total_' + f_name] = df_train[curr_cols].swifter.apply(sum,
                                                                        axis=1)
        df_train = df_train.drop(curr_cols, axis=1)
    for p in ["pos", "neg"]:
        df_train["new_total_context:100_count_" + p] = df_train[[
            "total_context:100_count_" + p, "total_context:3_count_" + p
        ]].swifter.apply(lambda x: max(
            0, x["total_context:100_count_" + p] - x["total_context:3_count_" +
                                                     p]),
                         axis=1)
        df_train["new_total_context:3_count_" + p] = df_train[[
            "total_context:3_count_" + p, "total_context:2_count_" + p
        ]].swifter.apply(lambda x: max(
            0, x["total_context:3_count_" + p] - x["total_context:2_count_" + p
                                                   ]),
                         axis=1)
        df_train["new_total_context:100_period_count_" + p] = df_train[[
            "total_context:3_period_count_" + p,
            "total_context:100_period_count_" + p
        ]].swifter.apply(lambda x: max(
            0, x["total_context:100_period_count_" + p] - x[
                "total_context:3_period_count_" + p]),
                         axis=1)
        df_train["new_total_context:3_period_count_" + p] = df_train[[
            "total_context:3_period_count_" + p,
            "total_context:2_period_count_" + p
        ]].swifter.apply(lambda x: max(
            0, x["total_context:3_period_count_" + p] - x[
                "total_context:2_period_count_" + p]),
                         axis=1)
        df_train["new_total_context:2_count_" + p] = df_train[[
            "total_context:100_period_count_" + p, "total_context:2_count_" + p
        ]].swifter.apply(lambda x: max(
            0, x["total_context:2_count_" + p] - x[
                "total_context:100_period_count_" + p]),
                         axis=1)

    df_train = df_train.drop(
        ["total_" + x for x in names_pool if "2_period_count" not in x],
        axis=1)

    lfs = [val_in_name, positive_lexicon_overall, positive_lexicon_pervalue]
    num_of_thresholds = 3
    step = 100 // num_of_thresholds

    for col in df_train:
        if col not in ["author", "value", "idd", "root_value"]:
            if col not in [
                    "pos_prob_mean", "neg_prob_mean", "num_good_posts"
            ]:  # , "lexicon_counts", "subreddit_counts", "name_in_subr_count"]:
                thresholds = [0]
                if "lexicon" in col and "unique" not in col:
                    continue
                if True:  # col in ["lexicon_counts", "unique_lexicon_counts"]:
                    vals = df_train[col].to_numpy()
                    thresholds = np.percentile(
                        vals, list(range(0 + step, 99 + step,
                                         step))).astype(int)
                    thresholds = sorted(list(set(thresholds)))
                    if len(thresholds) > 1:
                        thresholds = thresholds[:-1]
                    if "lexicon" in col:
                        thresholds = [3]
                    # max_val = max(vals)
                    # thresholds = list(range(0, int(max_val), int(max_val/5) + 1))
                # elif col == "pos_prob_mean":
                #    thresholds = [0.5 + 0.1 * x for x in range(5)]
                for i in range(len(thresholds)):
                    thresh = thresholds[i]
                    next_threshold = sys.maxsize if i == len(
                        thresholds) - 1 else thresholds[i + 1]
                    previous_threshold = -sys.maxsize if i == 0 else thresholds[
                        i - 1]
                    if "lexicon_counts" not in col:
                        lfs.append(
                            make_thresold_lf(thresh=thresh,
                                             col_name=col,
                                             next_threshold=next_threshold))
                    else:
                        lfs.append(
                            make_lexicon_lf(
                                thresh=thresh,
                                pref=col,
                                previous_threshold=previous_threshold))

    num_annotators = 0
    if num_annotators > 0:
        for i in range(1, num_annotators + 1):
            lfs.append(make_annotator_lf(worker_index=i))

    lfs = [
        x for x in lfs
        if any(y in str(x) for y in ["less", "context:2", "worker", "lexicon"])
    ]
    print("created lfs their number", len(lfs))
    print("\n".join(str(x) for x in lfs))

    #### validation #####
    do_val = False
    if do_val:
        df_golden = pd.read_csv(
            "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/gold_dev.csv"
        )
        name_val = list(df_golden["auth_val"])
        # df_train['root_value'] = df_train['value'].swifter.apply(lambda x: syn_to_hob[x])
        df_train["auth_val"] = df_train[["author", "value"]].swifter.apply(
            lambda x: x["author"] + "+++" + x["value"], axis=1)
        df_val = df_train[df_train.auth_val.isin(name_val)]
        df_dev = df_train[~df_train.auth_val.isin(name_val)]
        print("Number val", df_val.shape)
        print("Number dev", df_dev.shape)
        df_val = df_val.merge(df_golden, on="auth_val")
        y_val = np.array(df_val["final"])
        df_val = df_val.drop(labels="final", axis=1)
        # create test set as well

        with TQDMDaskProgressBar(desc="Dask Apply"):
            applier = PandasParallelLFApplier(lfs=lfs)
            L_val = applier.apply(df=df_val, n_parallel=num_cpu)
            L_dev = applier.apply(df=df_dev, n_parallel=num_cpu)

        dev_analysis = LFAnalysis(L=L_dev, lfs=lfs).lf_summary()
        analysis = LFAnalysis(L=L_val, lfs=lfs).lf_summary(y_val)
        analysis.to_csv("/home/tigunova/val_analysis.csv")
        dev_analysis.to_csv("/home/tigunova/dev_analysis.csv")
        print(analysis)
        label_model = LabelModel(cardinality=2, verbose=True)
        label_model.fit(L_dev)  #, Y_dev=y_val)
        model_stat = label_model.score(L=L_val, Y=y_val)
        print(model_stat)
        exit(0)
    ###########

    #### picking threshold #####
    do_threshold = False
    if do_threshold:
        df_golden = pd.read_csv(
            "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/gold_validation.csv"
        )
        name_val = list(df_golden["auth_val"])
        # df_train['root_value'] = df_train['value'].swifter.apply(lambda x: syn_to_hob[x])
        df_train["auth_val"] = df_train[["author", "value"]].swifter.apply(
            lambda x: x["author"] + "+++" + x["value"], axis=1)
        df_val = df_train[df_train.auth_val.isin(name_val)]
        df_dev = df_train[~df_train.auth_val.isin(name_val)]
        pop_size = df_dev.shape[0]
        print("Number val", df_val.shape)
        print("Number dev", df_dev.shape)
        applier = PandasParallelLFApplier(lfs=lfs)
        df_val = df_val.merge(df_golden, on="auth_val")
        L_val = applier.apply(df=df_val, n_parallel=num_cpu)
        val_thresholds = [0.01 * x for x in range(100)]
        label_model = LabelModel(cardinality=2, verbose=True)
        with TQDMDaskProgressBar(desc="Dask Apply"):
            L_dev = applier.apply(df=df_dev, n_parallel=num_cpu)
            label_model.fit(L_dev, class_balance=[0.5, 0.5])  # , Y_dev=y_val)
            wghts = label_model.get_weights()
            print("\n".join(str(x) for x in zip(lfs, wghts)))
            probs_val = label_model.predict_proba(L=L_val)
            probs_df = pd.DataFrame(probs_val,
                                    columns=["neg_prob", "pos_prob"])
            df_val = pd.concat([df_val.reset_index(), probs_df], axis=1)
            probs_dev = label_model.predict_proba(L=L_dev)
            probs_df = pd.DataFrame(probs_dev,
                                    columns=["neg_prob", "pos_prob"])
            df_dev = pd.concat([df_dev.reset_index(), probs_df], axis=1)
            y_true = np.array(df_val["final"])
        for th in val_thresholds:
            y_pred = np.array(
                df_val["pos_prob"].apply(lambda x: 1 if x > th else 0))
            #print("true negatives")
            #print(df_val[df_val["final"] == 1][df_val["pos_prob"] <= th][["auth_val", "text"]])
            prec = precision_score(y_true, y_pred)

            pred_labels = y_pred
            true_labels = y_true
            # True Positive (TP): we predict a label of 1 (positive), and the true label is 1.
            TP = np.sum(np.logical_and(pred_labels == 1, true_labels == 1))

            # True Negative (TN): we predict a label of 0 (negative), and the true label is 0.
            TN = np.sum(np.logical_and(pred_labels == 0, true_labels == 0))

            # False Positive (FP): we predict a label of 1 (positive), but the true label is 0.
            FP = np.sum(np.logical_and(pred_labels == 1, true_labels == 0))

            # False Negative (FN): we predict a label of 0 (negative), but the true label is 1.
            FN = np.sum(np.logical_and(pred_labels == 0, true_labels == 1))

            print('TP: %i, FP: %i, TN: %i, FN: %i' % (TP, FP, TN, FN))

            # print(list(zip(label_model.predict(L=L_val_curr), y_val_curr)))
            # print("******************************")
            print("threshold %s, proportion population %.4f, precision %s" %
                  (str(th), df_dev[df_dev["pos_prob"] > th].shape[0] /
                   pop_size, str(prec)))
        exit(0)
    ###########

    with TQDMDaskProgressBar(desc="Dask Apply"):
        applier = PandasParallelLFApplier(lfs=lfs)
        L_train = applier.apply(df=df_train, n_parallel=num_cpu)

    analysis = LFAnalysis(L=L_train, lfs=lfs).lf_summary()
    print(analysis)

    df_l_train = pd.DataFrame(
        L_train, columns=["llf_" + str(x).split(",")[0] for x in lfs])
    print(df_train.shape)
    print(df_l_train.shape)
    df_train = pd.concat([df_train.reset_index(), df_l_train], axis=1)
    print(df_train.shape)
    print("********************************************")

    t4 = time.time()
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train=L_train,
                    n_epochs=1000,
                    lr=0.001,
                    log_freq=100,
                    seed=123,
                    class_balance=[0.3, 0.7])

    probs_train = label_model.predict_proba(L=L_train)
    print("labeling model work ", (time.time() - t4) / 60)

    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=df_train, y=probs_train, L=L_train)

    probs_df = pd.DataFrame(probs_train_filtered,
                            columns=["neg_prob", "pos_prob"])
    print(df_train_filtered.shape)
    print(probs_df.shape)
    result_filtered = pd.concat([
        df_train_filtered[['author', 'value', 'idd']].reset_index(), probs_df
    ],
                                axis=1)
    print(result_filtered.shape)
    print("****************************************************")

    result_filtered.to_csv("/home/tigunova/some_result_" + prefix + ".csv")

    print(df_train_filtered.shape)
    print(probs_df.shape)
    df_train_filtered = pd.concat([df_train_filtered.reset_index(), probs_df],
                                  axis=1)
    df_train_filtered = df_train_filtered.drop(["index"], axis=1)
    print(df_train_filtered.shape)
    df_train_filtered.to_pickle(
        "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/user_" +
        prefix + ".pkl")
    df_train_filtered.to_csv(
        "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/user_" +
        prefix + ".csv")

    # df_train.iloc[L_train[:, 1] == POS].to_csv("/home/tigunova/PycharmProjects/snorkel_labels/data/user_" + prefix + ".csv")

    ### write dict
    output_threshold = 0.63
    output_dict = defaultdict(list)
    auth_hobby_dict = defaultdict(list)
    for index, row in result_filtered.iterrows():
        if row.value == row.value and row.author == row.author:
            auth_hobby_dict[row.author].append([row.value, row.pos_prob])

    allowed_labels = []
    for index, row in df_train_filtered.iterrows():
        if row.value == row.value and row.author == row.author:
            if row.pos_prob > output_threshold:
                output_dict[row.author].append([row.value] + row.idd +
                                               [row.pos_prob])
                allowed_labels.append(syn_to_hob[row.value])
    print("\n".join([
        str(y) for y in sorted(dict(Counter(allowed_labels)).items(),
                               key=lambda x: x[1])
    ]))
    print(
        "After cropping",
        sum([
            x if x < 500 else 500
            for x in dict(Counter(allowed_labels)).values()
        ]))
    print("users in total", len(output_dict))
    for auth, stuffs in output_dict.items():
        prof = ":::".join(set([x[0] for x in stuffs]))
        prob = ":::".join([str(x[-1]) for x in stuffs])
        msgs = set([x for l in stuffs for x in l[1:-1]])
        output_dict[auth] = [prof] + list(msgs) + [prob]

    with open(
            "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/sources/final_author_dict_"
            + prefix + ".txt", "w") as f_out:
        f_out.write(repr(dict(auth_hobby_dict)))
    with open("/home/tigunova/users_profession1.txt", "w") as f_out:
        f_out.write(repr(dict(output_dict)))
Example #19
def main(train_path, output_dir, label_dir):
    # Get all data
    df = pd.read_csv(train_path)

    # Get human labels
    human_labels = read_human_labels(label_dir)

    # df_test and lab_test: the set of all human-labeled notes, and their labels
    df_test = df.merge(human_labels, on=['record_number'])
    lab_test = df_test.human_label
    del df_test['human_label']

    # df_train: formed by removing all patients from df with a human-labeled note
    df_train = df.merge(df_test.mr, indicator=True, how='left', on = ['mr'])
    df_train = df_train.query('_merge=="left_only"').drop('_merge', axis=1)

    # Generate label matrix
    L_train = PandasLFApplier(lfs=lfs).apply(df=df_train)
    L_test = PandasLFApplier(lfs=lfs).apply(df=df_test)

    # Summarize LFs
    output_train = LFAnalysis(L=L_train, lfs=lfs).lf_summary()
    #print(output_train)
    output_test  = LFAnalysis(L=L_test, lfs=lfs).lf_summary(Y = lab_test.values)
    #print(output_test)

    # Save LF analysis
    path = os.path.join(output_dir, 'LF_analysis_train.csv')
    output_train.to_csv(path, index = True)
    path = os.path.join(output_dir, 'LF_analysis_test.csv')
    output_test.to_csv(path, index = True)

    # Create label model
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123, class_balance = [0.3, 0.7])

    # Evaluate the label model using labeled test set
    for metric in ['recall', 'precision', 'f1', 'accuracy']:
        label_model_acc = label_model.score(L=L_test, Y=lab_test, metrics=[metric], tie_break_policy="random")[metric]
        print("%-15s %.2f%%" % (metric+":", label_model_acc * 100))

    null_f1 = f1_score(lab_test.values, np.ones((df_test.shape[0],)))
    print("%-15s %.2f%%" % ("null f1:", null_f1 * 100))
    print("%-15s %.2f%%" % ("null accuracy:", np.maximum(1-np.mean(lab_test), np.mean(lab_test)) * 100))

    # Save error analysis
    preds = label_model.predict_proba(L_test)
    error_analysis(df_test, L_test, lfs, preds[:,1], lab_test, output_dir)

    # Get labels on train
    probs_train = label_model.predict_proba(L_train)

    # Filter out unlabeled data points
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(X=df_train, y=probs_train, L=L_train)

    # Save filtered training set
    df_train_filtered['prob'] = probs_train_filtered[:,1]
    path = os.path.join(output_dir, 'df_train_filtered.csv')
    df_train_filtered.to_csv(path, index = False)

    # Save label probs
    path = os.path.join(output_dir, 'probs_train_filtered')
    np.save(path, probs_train_filtered[:,1])

    # Save training data set and labels
    assert len(df_test) == len(lab_test)
    df_test['human_label'] = lab_test
    path = os.path.join(output_dir, 'df_test.csv')
    df_test.to_csv(path, index = False)
    path = os.path.join(output_dir, 'lab_test')
    np.save(path, lab_test)
Example #20
def run_snorkel_labelling_classification(labeling_functions, file, l_train,
                                         l_valid):
    lfs = labeling_functions
    # lfs = [lf.is_same_thread, lf.has_entities, lf.enity_overlap_jacc, lf.entity_type_overlap_jacc]
    # lfs = [is_same_thread, enity_overlap, entity_types, entity_type_overlap]

    # lfs = [is_long, has_votes, is_doctor_reply, is_same_thread, enity_overlap, has_type_dsyn, has_type_patf, has_type_sosy,
    #        has_type_dora, has_type_fndg, has_type_menp, has_type_chem, has_type_orch, has_type_horm, has_type_phsu,
    #        has_type_medd, has_type_bhvr, has_type_diap, has_type_bacs, has_type_enzy, has_type_inpo, has_type_elii]
    # lfs = [has_votes, is_doctor_reply, is_same_thread, enity_overlap]
    # lfs = [is_same_thread, enity_overlap, is_doctor_reply]

    # analysis = LFAnalysis(L=l_train, lfs=lfs).lf_summary(Y=Y_train)
    # print(analysis)
    # print(analysis['Conflicts'])
    # print(analysis['Overlaps'])

    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train=l_train,
                    n_epochs=20000,
                    lr=0.0001,
                    log_freq=10,
                    seed=2345)
    # label_model.fit(L_train=L_train, n_epochs=20, lr=0.0001, log_freq=10, seed=81794)

    print("Model weights: " + str(label_model.get_weights()))

    valid_probabilities = label_model.predict_proba(L=l_valid)
    if 'predicted_prob' in df_valid:
        # df_valid.drop(columns=['predicted_prob'], axis=1)
        del df_valid['predicted_prob']
    df_valid.insert(50, 'predicted_prob', valid_probabilities[:, 1])

    # df_valid.to_csv("/container/filip/json/ehealthforum/trac/validation_df2.txt", sep="\t", header=True)
    # df_valid = pd.read_csv("/filip/json/ehealthforum/trac/validation_df.txt", sep="\t")

    def compute_precision_at_k(l, k):
        l = l[:k]
        return sum(l) / k

    PROBABILITY_CUTOFF = 0.5

    df_valid['predicted_label'] = df_valid['predicted_prob'] >= PROBABILITY_CUTOFF

    true_positive_ratio = df_valid[df_valid.bm25_relevant == 1].count()['bm25_relevant'] / \
                          df_valid[df_valid.predicted_label == 1].count()['predicted_label']

    print("Number of True relevant: " +
          str(df_valid[df_valid.bm25_relevant == 1].count()['bm25_relevant']))
    print("Number of Predicted relevant: " + str(df_valid[
        df_valid.predicted_label == 1].count()['predicted_label']) + '\n')
    print('True positive ratio: ' + str(true_positive_ratio) + '\n')

    df_tru = df_valid.groupby(['query_thread']).head(10)['bm25_relevant']

    df_pred = df_valid.groupby(['query_thread']).head(10)['predicted_label']

    overall_precision = []

    for query, group in df_valid.groupby(['query_thread']):
        precision = compute_precision_at_k(
            group['predicted_label'].head(10).tolist(), 10)
        overall_precision.append(precision)

    print('Overall precision: ' +
          str(sum(overall_precision) / len(overall_precision)))
    print("Accuracy: " + str(accuracy_score(df_tru, df_pred)))

    label_model_acc = label_model.score(L=l_valid, Y=Y_valid)["accuracy"]
    print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")
Example #21
File: pipeline.py  Project: DFKI-NLP/wsee
def get_role_probs(lf_train: pd.DataFrame,
                   filter_abstains: bool = False,
                   lfs: Optional[List[labeling_function]] = None,
                   lf_dev: pd.DataFrame = None,
                   seed: Optional[int] = None,
                   tmp_path: Union[str, Path] = None,
                   use_majority_label_voter=False) -> pd.DataFrame:
    """
    Takes "raw" data frame, builds argument role examples, (trains LabelModel), calculates event_argument_probs
    and returns merged argument role examples with event_argument_probs.
    :param use_majority_label_voter: Whether to use a majority label voter instead of the snorkel label model
    :param seed: Seed for use in label model (mu initialization)
    :param filter_abstains: Filters rows where all labeling functions abstained
    :param lf_train: Training dataset which will be labeled using Snorkel
    :param lfs: List of labeling functions
    :param lf_dev: Optional development dataset that can be used to set a prior for the class balance
    :param tmp_path: Path to temporarily store variables that are shared during random repeats
    :return: Argument role examples merged with event_argument_probs
    """
    df_train, L_train = None, None
    df_dev, Y_dev, L_dev = None, None, None
    tmp_train_path, tmp_dev_path = None, None

    # For random repeats try to load pickled variables from first run as they are shared
    if tmp_path:
        tmp_train_path = Path(tmp_path).joinpath("role_train.pkl")
        os.makedirs(os.path.dirname(tmp_train_path), exist_ok=True)
        if tmp_train_path.exists():
            with open(tmp_train_path, 'rb') as pickled_train:
                df_train, L_train = pickle.load(pickled_train)
        if lf_dev is not None:
            tmp_dev_path = Path(tmp_path).joinpath("role_dev.pkl")
            os.makedirs(os.path.dirname(tmp_dev_path), exist_ok=True)
            if tmp_dev_path.exists():
                with open(tmp_dev_path, 'rb') as pickled_dev:
                    df_dev, Y_dev, L_dev = pickle.load(pickled_dev)

    if lfs is None:
        lfs = get_role_list_lfs()
    applier = PandasLFApplier(lfs)

    if L_train is None or df_train is None:
        df_train, _ = build_event_role_examples(lf_train)
        logger.info("Running Event Role Labeling Function Applier")
        L_train = applier.apply(df_train)
        if tmp_path:
            with open(tmp_train_path, 'wb') as pickled_train:
                pickle.dump((df_train, L_train), pickled_train)
    if lf_dev is not None and any(element is None
                                  for element in [df_dev, Y_dev, L_dev]):
        df_dev, Y_dev = build_event_role_examples(lf_dev)
        logger.info("Running Event Role Labeling Function Applier on dev set")
        L_dev = applier.apply(df_dev)
        if tmp_path:
            with open(tmp_dev_path, 'wb') as pickled_dev:
                pickle.dump((df_dev, Y_dev, L_dev), pickled_dev)

    if use_majority_label_voter:
        logger.info(
            "Using MajorityLabelVoter to calculate role class probabilities")
        label_model = MajorityLabelVoter(cardinality=11)
    else:
        label_model = LabelModel(cardinality=11, verbose=True)
        logger.info(
            "Fitting LabelModel on the data and predicting role class probabilities"
        )
        if seed:
            label_model.fit(L_train=L_train,
                            n_epochs=5000,
                            log_freq=500,
                            seed=seed,
                            Y_dev=Y_dev)
        else:
            label_model.fit(L_train=L_train,
                            n_epochs=5000,
                            log_freq=500,
                            Y_dev=Y_dev)

    # Evaluate label model on development data
    if df_dev is not None and Y_dev is not None:
        metrics = ["accuracy", "f1_micro", "f1_macro"]
        logger.info("Evaluate on the dev set")
        label_model_metrics = label_model.score(L=L_dev,
                                                Y=Y_dev,
                                                tie_break_policy="random",
                                                metrics=metrics)
        if use_majority_label_voter:
            logger.info('Role Majority Label Voter Metrics')
        else:
            logger.info('Role Label Model Metrics')
        logger.info(
            f"{'Accuracy:':<25} {label_model_metrics['accuracy'] * 100:.1f}%")
        logger.info(
            f"{'F1 (micro averaged):':<25} {label_model_metrics['f1_micro'] * 100:.1f}%"
        )
        logger.info(
            f"{'F1 (macro averaged):':<25} {label_model_metrics['f1_macro'] * 100:.1f}%"
        )

    event_role_probs = label_model.predict_proba(L_train)

    if filter_abstains:
        df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=event_role_probs, L=L_train)

        merged_event_role_examples = merge_event_role_examples(
            df_train_filtered, probs_train_filtered)
    else:
        # Multiplies probabilities of abstains with zero so that the example is treated as padding in the end model
        merged_event_role_examples = merge_event_role_examples(
            df_train, utils.zero_out_abstains(event_role_probs, L_train))
    return merged_event_role_examples
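
# A hypothetical call site for get_role_probs; train_df and dev_df are
# illustrative names for the raw data frames this pipeline expects.
merged_train = get_role_probs(
    lf_train=train_df,          # raw training data to be labeled by the LFs
    filter_abstains=True,       # drop rows on which every LF abstained
    lf_dev=dev_df,              # optional dev set used as a class-balance prior
    seed=123,
    tmp_path="tmp/role_cache",  # cache L_train/L_dev across random repeats
)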
Example #22
def test_e2e():
    """Run an end-to-end test on documents of the hardware domain."""
    PARALLEL = 4

    max_docs = 12

    fonduer.init_logging(
        log_dir="log_folder",
        format="[%(asctime)s][%(levelname)s] %(name)s:%(lineno)s - %(message)s",
        level=logging.INFO,
    )

    session = fonduer.Meta.init(CONN_STRING).Session()

    docs_path = "tests/data/html/"
    pdf_path = "tests/data/pdf/"

    doc_preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)

    corpus_parser = Parser(
        session,
        parallelism=PARALLEL,
        structural=True,
        lingual=True,
        visual=True,
        pdf_path=pdf_path,
    )
    corpus_parser.apply(doc_preprocessor)
    assert session.query(Document).count() == max_docs

    num_docs = session.query(Document).count()
    logger.info(f"Docs: {num_docs}")
    assert num_docs == max_docs

    num_sentences = session.query(Sentence).count()
    logger.info(f"Sentences: {num_sentences}")

    # Divide into test and train
    docs = sorted(corpus_parser.get_documents())
    last_docs = sorted(corpus_parser.get_last_documents())

    ld = len(docs)
    assert ld == len(last_docs)
    assert len(docs[0].sentences) == len(last_docs[0].sentences)

    assert len(docs[0].sentences) == 799
    assert len(docs[1].sentences) == 663
    assert len(docs[2].sentences) == 784
    assert len(docs[3].sentences) == 661
    assert len(docs[4].sentences) == 513
    assert len(docs[5].sentences) == 700
    assert len(docs[6].sentences) == 528
    assert len(docs[7].sentences) == 161
    assert len(docs[8].sentences) == 228
    assert len(docs[9].sentences) == 511
    assert len(docs[10].sentences) == 331
    assert len(docs[11].sentences) == 528

    # Check table numbers
    assert len(docs[0].tables) == 9
    assert len(docs[1].tables) == 9
    assert len(docs[2].tables) == 14
    assert len(docs[3].tables) == 11
    assert len(docs[4].tables) == 11
    assert len(docs[5].tables) == 10
    assert len(docs[6].tables) == 10
    assert len(docs[7].tables) == 2
    assert len(docs[8].tables) == 7
    assert len(docs[9].tables) == 10
    assert len(docs[10].tables) == 6
    assert len(docs[11].tables) == 9

    # Check figure numbers
    assert len(docs[0].figures) == 32
    assert len(docs[1].figures) == 11
    assert len(docs[2].figures) == 38
    assert len(docs[3].figures) == 31
    assert len(docs[4].figures) == 7
    assert len(docs[5].figures) == 38
    assert len(docs[6].figures) == 10
    assert len(docs[7].figures) == 31
    assert len(docs[8].figures) == 4
    assert len(docs[9].figures) == 27
    assert len(docs[10].figures) == 5
    assert len(docs[11].figures) == 27

    # Check caption numbers
    assert len(docs[0].captions) == 0
    assert len(docs[1].captions) == 0
    assert len(docs[2].captions) == 0
    assert len(docs[3].captions) == 0
    assert len(docs[4].captions) == 0
    assert len(docs[5].captions) == 0
    assert len(docs[6].captions) == 0
    assert len(docs[7].captions) == 0
    assert len(docs[8].captions) == 0
    assert len(docs[9].captions) == 0
    assert len(docs[10].captions) == 0
    assert len(docs[11].captions) == 0

    train_docs = set()
    dev_docs = set()
    test_docs = set()
    splits = (0.5, 0.75)
    data = [(doc.name, doc) for doc in docs]
    data.sort(key=lambda x: x[0])
    for i, (doc_name, doc) in enumerate(data):
        if i < splits[0] * ld:
            train_docs.add(doc)
        elif i < splits[1] * ld:
            dev_docs.add(doc)
        else:
            test_docs.add(doc)
    logger.info([x.name for x in train_docs])

    # NOTE: With multi-relation support, return values of getting candidates,
    # mentions, or sparse matrices are formatted as a list of lists. This means
    # that with a single relation, we need to index into the list of lists to
    # get the candidates/mentions/sparse matrix for a particular relation or
    # mention.

    # Mention Extraction
    part_ngrams = MentionNgramsPart(parts_by_doc=None, n_max=3)
    temp_ngrams = MentionNgramsTemp(n_max=2)
    volt_ngrams = MentionNgramsVolt(n_max=1)

    Part = mention_subclass("Part")
    Temp = mention_subclass("Temp")
    Volt = mention_subclass("Volt")

    mention_extractor = MentionExtractor(
        session,
        [Part, Temp, Volt],
        [part_ngrams, temp_ngrams, volt_ngrams],
        [part_matcher, temp_matcher, volt_matcher],
    )

    mention_extractor.apply(docs, parallelism=PARALLEL)

    assert session.query(Part).count() == 299
    assert session.query(Temp).count() == 138
    assert session.query(Volt).count() == 140
    assert len(mention_extractor.get_mentions()) == 3
    assert len(mention_extractor.get_mentions()[0]) == 299
    assert (len(
        mention_extractor.get_mentions(docs=[
            session.query(Document).filter(Document.name == "112823").first()
        ])[0]) == 70)

    # Candidate Extraction
    PartTemp = candidate_subclass("PartTemp", [Part, Temp])
    PartVolt = candidate_subclass("PartVolt", [Part, Volt])

    candidate_extractor = CandidateExtractor(
        session, [PartTemp, PartVolt],
        throttlers=[temp_throttler, volt_throttler])

    for i, docs in enumerate([train_docs, dev_docs, test_docs]):
        candidate_extractor.apply(docs, split=i, parallelism=PARALLEL)

    assert session.query(PartTemp).filter(PartTemp.split == 0).count() == 3493
    assert session.query(PartTemp).filter(PartTemp.split == 1).count() == 61
    assert session.query(PartTemp).filter(PartTemp.split == 2).count() == 416
    assert session.query(PartVolt).count() == 4282

    # Grab candidate lists
    train_cands = candidate_extractor.get_candidates(split=0, sort=True)
    dev_cands = candidate_extractor.get_candidates(split=1, sort=True)
    test_cands = candidate_extractor.get_candidates(split=2, sort=True)
    assert len(train_cands) == 2
    assert len(train_cands[0]) == 3493
    assert (len(
        candidate_extractor.get_candidates(docs=[
            session.query(Document).filter(Document.name == "112823").first()
        ])[0]) == 1432)

    # Featurization
    featurizer = Featurizer(session, [PartTemp, PartVolt])

    # Test that FeatureKey is properly reset
    featurizer.apply(split=1, train=True, parallelism=PARALLEL)
    assert session.query(Feature).count() == 214
    assert session.query(FeatureKey).count() == 1260

    # Test Dropping FeatureKey
    # Should force a row deletion
    featurizer.drop_keys(["DDL_e1_W_LEFT_POS_3_[NNP NN IN]"])
    assert session.query(FeatureKey).count() == 1259

    # Should only remove the part_volt as a relation and leave part_temp
    assert set(
        session.query(FeatureKey).filter(
            FeatureKey.name ==
            "DDL_e1_LEMMA_SEQ_[bc182]").one().candidate_classes) == {
                "part_temp", "part_volt"
            }
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"],
                         candidate_classes=[PartVolt])
    assert session.query(FeatureKey).filter(
        FeatureKey.name ==
        "DDL_e1_LEMMA_SEQ_[bc182]").one().candidate_classes == ["part_temp"]
    assert session.query(FeatureKey).count() == 1259

    # Inserting the removed key
    featurizer.upsert_keys(["DDL_e1_LEMMA_SEQ_[bc182]"],
                           candidate_classes=[PartTemp, PartVolt])
    assert set(
        session.query(FeatureKey).filter(
            FeatureKey.name ==
            "DDL_e1_LEMMA_SEQ_[bc182]").one().candidate_classes) == {
                "part_temp", "part_volt"
            }
    assert session.query(FeatureKey).count() == 1259
    # Removing the key again
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"],
                         candidate_classes=[PartVolt])

    # Removing the last relation from a key should delete the row
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"],
                         candidate_classes=[PartTemp])
    assert session.query(FeatureKey).count() == 1258
    session.query(Feature).delete(synchronize_session="fetch")
    session.query(FeatureKey).delete(synchronize_session="fetch")

    featurizer.apply(split=0, train=True, parallelism=PARALLEL)
    assert session.query(Feature).count() == 6478
    assert session.query(FeatureKey).count() == 4538
    F_train = featurizer.get_feature_matrices(train_cands)
    assert F_train[0].shape == (3493, 4538)
    assert F_train[1].shape == (2985, 4538)
    assert len(featurizer.get_keys()) == 4538

    featurizer.apply(split=1, parallelism=PARALLEL)
    assert session.query(Feature).count() == 6692
    assert session.query(FeatureKey).count() == 4538
    F_dev = featurizer.get_feature_matrices(dev_cands)
    assert F_dev[0].shape == (61, 4538)
    assert F_dev[1].shape == (153, 4538)

    featurizer.apply(split=2, parallelism=PARALLEL)
    assert session.query(Feature).count() == 8252
    assert session.query(FeatureKey).count() == 4538
    F_test = featurizer.get_feature_matrices(test_cands)
    assert F_test[0].shape == (416, 4538)
    assert F_test[1].shape == (1144, 4538)

    gold_file = "tests/data/hardware_tutorial_gold.csv"

    labeler = Labeler(session, [PartTemp, PartVolt])

    labeler.apply(
        docs=last_docs,
        lfs=[[gold], [gold]],
        table=GoldLabel,
        train=True,
        parallelism=PARALLEL,
    )
    assert session.query(GoldLabel).count() == 8252

    stg_temp_lfs = [
        LF_storage_row,
        LF_operating_row,
        LF_temperature_row,
        LF_tstg_row,
        LF_to_left,
        LF_negative_number_left,
    ]

    ce_v_max_lfs = [
        LF_bad_keywords_in_row,
        LF_current_in_row,
        LF_non_ce_voltages_in_row,
    ]

    with pytest.raises(ValueError):
        labeler.apply(split=0,
                      lfs=stg_temp_lfs,
                      train=True,
                      parallelism=PARALLEL)

    labeler.apply(
        docs=train_docs,
        lfs=[stg_temp_lfs, ce_v_max_lfs],
        train=True,
        parallelism=PARALLEL,
    )
    assert session.query(Label).count() == 6478
    assert session.query(LabelKey).count() == 9
    L_train = labeler.get_label_matrices(train_cands)
    assert L_train[0].shape == (3493, 9)
    assert L_train[1].shape == (2985, 9)
    assert len(labeler.get_keys()) == 9

    # Test Dropping LabelerKey
    labeler.drop_keys(["LF_storage_row"])
    assert len(labeler.get_keys()) == 8

    # Test Upserting LabelerKey
    labeler.upsert_keys(["LF_storage_row"])
    assert "LF_storage_row" in [label.name for label in labeler.get_keys()]

    L_train_gold = labeler.get_gold_labels(train_cands)
    assert L_train_gold[0].shape == (3493, 1)

    L_train_gold = labeler.get_gold_labels(train_cands, annotator="gold")
    assert L_train_gold[0].shape == (3493, 1)

    gen_model = LabelModel()
    gen_model.fit(L_train=L_train[0], n_epochs=500, log_freq=100)

    train_marginals = gen_model.predict_proba(L_train[0])

    # Collect word counter
    word_counter = collect_word_counter(train_cands)

    emmental.init(fonduer.Meta.log_path)

    # Training config
    config = {
        "meta_config": {
            "verbose": False
        },
        "model_config": {
            "model_path": None,
            "device": 0,
            "dataparallel": False
        },
        "learner_config": {
            "n_epochs": 5,
            "optimizer_config": {
                "lr": 0.001,
                "l2": 0.0
            },
            "task_scheduler": "round_robin",
        },
        "logging_config": {
            "evaluation_freq": 1,
            "counter_unit": "epoch",
            "checkpointing": False,
            "checkpointer_config": {
                "checkpoint_metric": {
                    f"{ATTRIBUTE}/{ATTRIBUTE}/train/loss": "min"
                },
                "checkpoint_freq": 1,
                "checkpoint_runway": 2,
                "clear_intermediate_checkpoints": True,
                "clear_all_checkpoints": True,
            },
        },
    }
    emmental.Meta.update_config(config=config)

    # Generate word embedding module
    arity = 2
    # Generate special tokens
    specials = []
    for i in range(arity):
        specials += [f"~~[[{i}", f"{i}]]~~"]

    emb_layer = EmbeddingModule(word_counter=word_counter,
                                word_dim=300,
                                specials=specials)

    diffs = train_marginals.max(axis=1) - train_marginals.min(axis=1)
    train_idxs = np.where(diffs > 1e-6)[0]
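    # (Candidates with near-uniform marginals are dropped here, since they
    # carry no supervision signal for the end model.)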

    train_dataloader = EmmentalDataLoader(
        task_to_label_dict={ATTRIBUTE: "labels"},
        dataset=FonduerDataset(
            ATTRIBUTE,
            train_cands[0],
            F_train[0],
            emb_layer.word2id,
            train_marginals,
            train_idxs,
        ),
        split="train",
        batch_size=100,
        shuffle=True,
    )

    tasks = create_task(ATTRIBUTE,
                        2,
                        F_train[0].shape[1],
                        2,
                        emb_layer,
                        model="LogisticRegression")

    model = EmmentalModel(name=f"{ATTRIBUTE}_task")

    for task in tasks:
        model.add_task(task)

    emmental_learner = EmmentalLearner()
    emmental_learner.learn(model, [train_dataloader])

    test_dataloader = EmmentalDataLoader(
        task_to_label_dict={ATTRIBUTE: "labels"},
        dataset=FonduerDataset(ATTRIBUTE, test_cands[0], F_test[0],
                               emb_layer.word2id, 2),
        split="test",
        batch_size=100,
        shuffle=False,
    )

    test_preds = model.predict(test_dataloader, return_preds=True)
    positive = np.where(
        np.array(test_preds["probs"][ATTRIBUTE])[:, TRUE] > 0.6)
    true_pred = [test_cands[0][_] for _ in positive[0]]
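    # Candidates whose predicted TRUE-class probability exceeds 0.6 are treated
    # as positive predictions for the entity-level scoring below.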

    pickle_file = "tests/data/parts_by_doc_dict.pkl"
    with open(pickle_file, "rb") as f:
        parts_by_doc = pickle.load(f)

    (TP, FP, FN) = entity_level_f1(true_pred,
                                   gold_file,
                                   ATTRIBUTE,
                                   test_docs,
                                   parts_by_doc=parts_by_doc)

    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")

    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")

    assert 0.3 < f1 < 0.7

    stg_temp_lfs_2 = [
        LF_to_left,
        LF_test_condition_aligned,
        LF_collector_aligned,
        LF_current_aligned,
        LF_voltage_row_temp,
        LF_voltage_row_part,
        LF_typ_row,
        LF_complement_left_row,
        LF_too_many_numbers_row,
        LF_temp_on_high_page_num,
        LF_temp_outside_table,
        LF_not_temp_relevant,
    ]
    labeler.update(split=0,
                   lfs=[stg_temp_lfs_2, ce_v_max_lfs],
                   parallelism=PARALLEL)
    assert session.query(Label).count() == 6478
    assert session.query(LabelKey).count() == 16
    L_train = labeler.get_label_matrices(train_cands)
    assert L_train[0].shape == (3493, 16)

    gen_model = LabelModel()
    gen_model.fit(L_train=L_train[0], n_epochs=500, log_freq=100)

    train_marginals = gen_model.predict_proba(L_train[0])

    diffs = train_marginals.max(axis=1) - train_marginals.min(axis=1)
    train_idxs = np.where(diffs > 1e-6)[0]

    train_dataloader = EmmentalDataLoader(
        task_to_label_dict={ATTRIBUTE: "labels"},
        dataset=FonduerDataset(
            ATTRIBUTE,
            train_cands[0],
            F_train[0],
            emb_layer.word2id,
            train_marginals,
            train_idxs,
        ),
        split="train",
        batch_size=100,
        shuffle=True,
    )

    emmental.Meta.reset()
    emmental.init(fonduer.Meta.log_path)
    emmental.Meta.update_config(config=config)

    tasks = create_task(ATTRIBUTE,
                        2,
                        F_train[0].shape[1],
                        2,
                        emb_layer,
                        model="LogisticRegression")

    model = EmmentalModel(name=f"{ATTRIBUTE}_task")

    for task in tasks:
        model.add_task(task)

    emmental_learner = EmmentalLearner()
    emmental_learner.learn(model, [train_dataloader])

    test_preds = model.predict(test_dataloader, return_preds=True)
    positive = np.where(
        np.array(test_preds["probs"][ATTRIBUTE])[:, TRUE] > 0.7)
    true_pred = [test_cands[0][_] for _ in positive[0]]

    (TP, FP, FN) = entity_level_f1(true_pred,
                                   gold_file,
                                   ATTRIBUTE,
                                   test_docs,
                                   parts_by_doc=parts_by_doc)

    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")

    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")

    assert f1 > 0.7

    # Testing LSTM
    emmental.Meta.reset()
    emmental.init(fonduer.Meta.log_path)
    emmental.Meta.update_config(config=config)

    tasks = create_task(ATTRIBUTE,
                        2,
                        F_train[0].shape[1],
                        2,
                        emb_layer,
                        model="LSTM")

    model = EmmentalModel(name=f"{ATTRIBUTE}_task")

    for task in tasks:
        model.add_task(task)

    emmental_learner = EmmentalLearner()
    emmental_learner.learn(model, [train_dataloader])

    test_preds = model.predict(test_dataloader, return_preds=True)
    positive = np.where(
        np.array(test_preds["probs"][ATTRIBUTE])[:, TRUE] > 0.7)
    true_pred = [test_cands[0][_] for _ in positive[0]]

    (TP, FP, FN) = entity_level_f1(true_pred,
                                   gold_file,
                                   ATTRIBUTE,
                                   test_docs,
                                   parts_by_doc=parts_by_doc)

    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")

    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")

    assert f1 > 0.7
Example #23
    def train_f_on_d_U(self, datafeeder, num_epochs, loss_type):
        sess = self.hls.sess

        total_batch = datafeeder.get_batches_per_epoch(f_d_U)
        batch_size = datafeeder.get_batch_size(f_d_U)

        if loss_type == 'pure-likelihood':
            train_op = self.hls.f_d_U_pure_likelihood_op
            loss_op = self.hls.f_d_U_pure_likelihood_loss
        elif loss_type == 'implication':
            train_op = self.hls.f_d_U_implication_op
            loss_op = self.hls.f_d_U_implication_loss
        elif loss_type == 'pr_loss':
            train_op = self.hls.pr_train_op
            loss_op = self.hls.pr_loss
        elif loss_type == 'gcross':
            train_op = self.hls.gcross_train_op
            loss_op = self.hls.gcross_loss
        elif loss_type == 'gcross_snorkel':
            train_op = self.hls.snork_gcross_train_op
            loss_op = self.hls.snork_gcross_loss
        elif loss_type == 'learn2reweight':
            train_op = self.hls.l2r_train_op
            loss_op = self.hls.l2r_loss
        elif loss_type == 'label_snorkel':
            train_op = self.hls.label_snorkel_train_op
            loss_op = self.hls.label_snorkel_loss
        elif loss_type == 'pure_snorkel':
            train_op = self.hls.pure_snorkel_train_op
            loss_op = self.hls.pure_snorkel_loss
        else:
            raise ValueError('Invalid loss type %s' % loss_type)

        best_saver_f_d_U = self.hls.best_savers.get_best_saver(f_d_U)
        metrics_dict = {}  #{'config': self.config}

        if self.config.mode in ('label_snorkel', 'pure_snorkel', 'gcross_snorkel'):
            label_model = LabelModel(cardinality=self.hls.num_classes,
                                     verbose=True)
            if os.path.isfile(
                    os.path.join(self.config.data_dir, "saved_label_model")):
                label_model = label_model.load(
                    os.path.join(self.config.data_dir, "saved_label_model"))
            else:
                print("LABEL MODEL NOT SAVED")
                exit()
        if 'gcross' in self.config.mode or 'learn2reweight' in self.config.mode:
            majority_model = MajorityLabelVoter(
                cardinality=self.hls.num_classes)
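            # MajorityLabelVoter needs no fitting; it predicts by majority vote
            # over the non-abstaining labeling functions.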

        with sess.as_default():
            print("Optimization started for f_d_U with %s loss!" % loss_type)
            print("Batch size: %d!" % batch_size)
            print("Batches per epoch: %d!" % total_batch)
            print("Number of epochs: %d!" % num_epochs)
            # Training cycle
            iteration = 0
            global_step = 0
            patience = 0
            for epoch in range(num_epochs):
                avg_epoch_cost = 0.

                for i in range(total_batch):
                    batch_x, batch_l, batch_m, batch_L, batch_d, batch_r =\
                            datafeeder.get_f_d_U_next_batch()

                    feed_dict = {
                        self.hls.f_d_U_adam_lr: self.config.f_d_U_adam_lr,
                        self.hls.f_d_U_x: batch_x,
                        self.hls.f_d_U_l: batch_l,
                        self.hls.f_d_U_m: batch_m,
                        self.hls.f_d_U_L: batch_L,
                        self.hls.f_d_U_d: batch_d,
                        self.hls.f_d_U_r: batch_r
                    }

                    batch_lsnork = conv_l_to_lsnork(batch_l, batch_m)
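                    # (conv_l_to_lsnork presumably reshapes the rule firings
                    # batch_l and coverage mask batch_m into Snorkel's label
                    # matrix format, with -1 marking abstains.)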

                    if self.config.mode in ('label_snorkel', 'pure_snorkel',
                                            'gcross_snorkel'):
                        batch_snork_L = label_model.predict_proba(
                            L=batch_lsnork)  # Snorkel label-model probabilities
                        feed_dict[self.hls.f_d_U_snork_L] = batch_snork_L

                    if self.config.mode in ('gcross', 'learn2reweight'):
                        batch_snork_L = majority_model.predict(
                            L=batch_lsnork)  # majority votes
                        batch_snork_L = np.eye(
                            self.hls.num_classes)[batch_snork_L]  # one-hot representation
                        feed_dict[self.hls.f_d_U_snork_L] = batch_snork_L

                    merge_dict_a_into_b(self.hls.dropout_train_dict, feed_dict)
                    # Run optimization op (backprop) and cost op (to get loss value)
                    _, cost, num_d, f_d_U_global_step = sess.run(
                        [
                            train_op, loss_op, self.hls.f_d_U_num_d,
                            self.hls.f_d_U_global_step
                        ],
                        feed_dict=feed_dict)

                    global_epoch = f_d_U_global_step / total_batch
                    # This assertion is valid only when the true labels of U
                    # are available (even though they are not used for
                    # training), e.g. for synthetic data.
                    assert np.all(batch_L <= self.hls.num_classes)

                    avg_epoch_cost += cost / total_batch
                    cost1 = (avg_epoch_cost * total_batch) / (i + 1)
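                    # cost1 is the running mean of the batch losses seen so far
                    # in the current epoch.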
                    global_step += 1

                # Compute and report metrics, update checkpoints after each epoch
                print("\n========== epoch : {} ============\n".format(epoch))
                print("cost: {}\n".format(cost1))
                print("patience: {}\n".format(patience))
                precision, recall, f1_score, support = self.hls.test.test_f(
                    datafeeder)
                self.compute_f_d_metrics(metrics_dict, precision, recall,
                                         f1_score, support, global_epoch,
                                         f_d_U_global_step)
                print("\nmetrics_dict: ", metrics_dict)
                print()
                self.report_f_d_perfs_to_tensorboard(cost1, metrics_dict,
                                                     global_step)
                did_improve = self.maybe_save_metrics_dict(f_d_U, metrics_dict)
                if did_improve:
                    patience = 0  # reset patience if the primary metric improved
                else:
                    patience += 1
                    if patience > self.config.early_stopping_p:
                        print("bye! stopping early!......")
                        break
                # Save checkpoint
                print()
                self.hls.mru_saver.save(global_step)
                print()
                best_saver_f_d_U.save_if_best(
                    metrics_dict[self.config.f_d_primary_metric])
                print()
                global_step += 1
            print("Optimization Finished for f_d_U!")
Example #24
def test_e2e():
    """Run an end-to-end test on documents of the hardware domain."""
    PARALLEL = 4

    max_docs = 12

    fonduer.init_logging(
        log_dir="log_folder",
        format="[%(asctime)s][%(levelname)s] %(name)s:%(lineno)s - %(message)s",
        level=logging.INFO,
    )

    session = fonduer.Meta.init(CONN_STRING).Session()

    docs_path = "tests/data/html/"
    pdf_path = "tests/data/pdf/"

    doc_preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)

    corpus_parser = Parser(
        session,
        parallelism=PARALLEL,
        structural=True,
        lingual=True,
        visual=True,
        pdf_path=pdf_path,
    )
    corpus_parser.apply(doc_preprocessor)
    assert session.query(Document).count() == max_docs

    num_docs = session.query(Document).count()
    logger.info(f"Docs: {num_docs}")
    assert num_docs == max_docs

    num_sentences = session.query(Sentence).count()
    logger.info(f"Sentences: {num_sentences}")

    # Divide into test and train
    docs = sorted(corpus_parser.get_documents())
    last_docs = sorted(corpus_parser.get_last_documents())

    ld = len(docs)
    assert ld == len(last_docs)
    assert len(docs[0].sentences) == len(last_docs[0].sentences)

    assert len(docs[0].sentences) == 799
    assert len(docs[1].sentences) == 663
    assert len(docs[2].sentences) == 784
    assert len(docs[3].sentences) == 661
    assert len(docs[4].sentences) == 513
    assert len(docs[5].sentences) == 700
    assert len(docs[6].sentences) == 528
    assert len(docs[7].sentences) == 161
    assert len(docs[8].sentences) == 228
    assert len(docs[9].sentences) == 511
    assert len(docs[10].sentences) == 331
    assert len(docs[11].sentences) == 528

    # Check table numbers
    assert len(docs[0].tables) == 9
    assert len(docs[1].tables) == 9
    assert len(docs[2].tables) == 14
    assert len(docs[3].tables) == 11
    assert len(docs[4].tables) == 11
    assert len(docs[5].tables) == 10
    assert len(docs[6].tables) == 10
    assert len(docs[7].tables) == 2
    assert len(docs[8].tables) == 7
    assert len(docs[9].tables) == 10
    assert len(docs[10].tables) == 6
    assert len(docs[11].tables) == 9

    # Check figure numbers
    assert len(docs[0].figures) == 32
    assert len(docs[1].figures) == 11
    assert len(docs[2].figures) == 38
    assert len(docs[3].figures) == 31
    assert len(docs[4].figures) == 7
    assert len(docs[5].figures) == 38
    assert len(docs[6].figures) == 10
    assert len(docs[7].figures) == 31
    assert len(docs[8].figures) == 4
    assert len(docs[9].figures) == 27
    assert len(docs[10].figures) == 5
    assert len(docs[11].figures) == 27

    # Check caption numbers
    assert len(docs[0].captions) == 0
    assert len(docs[1].captions) == 0
    assert len(docs[2].captions) == 0
    assert len(docs[3].captions) == 0
    assert len(docs[4].captions) == 0
    assert len(docs[5].captions) == 0
    assert len(docs[6].captions) == 0
    assert len(docs[7].captions) == 0
    assert len(docs[8].captions) == 0
    assert len(docs[9].captions) == 0
    assert len(docs[10].captions) == 0
    assert len(docs[11].captions) == 0

    train_docs = set()
    dev_docs = set()
    test_docs = set()
    splits = (0.5, 0.75)
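    # i.e. roughly the first 50% of documents (sorted by name) go to train,
    # the next 25% to dev, and the remaining 25% to test.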
    data = [(doc.name, doc) for doc in docs]
    data.sort(key=lambda x: x[0])
    for i, (doc_name, doc) in enumerate(data):
        if i < splits[0] * ld:
            train_docs.add(doc)
        elif i < splits[1] * ld:
            dev_docs.add(doc)
        else:
            test_docs.add(doc)
    logger.info([x.name for x in train_docs])

    # NOTE: With multi-relation support, return values of getting candidates,
    # mentions, or sparse matrices are formatted as a list of lists. This means
    # that with a single relation, we need to index into the list of lists to
    # get the candidates/mentions/sparse matrix for a particular relation or
    # mention.
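    # Illustrative indexing only (names refer to the relations defined below):
    #   cands = candidate_extractor.get_candidates(split=0)
    #   cands[0]  -> list of PartTemp candidates
    #   cands[1]  -> list of PartVolt candidates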

    # Mention Extraction
    part_ngrams = MentionNgramsPart(parts_by_doc=None, n_max=3)
    temp_ngrams = MentionNgramsTemp(n_max=2)
    volt_ngrams = MentionNgramsVolt(n_max=1)

    Part = mention_subclass("Part")
    Temp = mention_subclass("Temp")
    Volt = mention_subclass("Volt")

    mention_extractor = MentionExtractor(
        session,
        [Part, Temp, Volt],
        [part_ngrams, temp_ngrams, volt_ngrams],
        [part_matcher, temp_matcher, volt_matcher],
    )

    mention_extractor.apply(docs, parallelism=PARALLEL)

    assert session.query(Part).count() == 299
    assert session.query(Temp).count() == 138
    assert session.query(Volt).count() == 140
    assert len(mention_extractor.get_mentions()) == 3
    assert len(mention_extractor.get_mentions()[0]) == 299
    assert (
        len(
            mention_extractor.get_mentions(
                docs=[session.query(Document).filter(Document.name == "112823").first()]
            )[0]
        )
        == 70
    )

    # Candidate Extraction
    PartTemp = candidate_subclass("PartTemp", [Part, Temp])
    PartVolt = candidate_subclass("PartVolt", [Part, Volt])

    candidate_extractor = CandidateExtractor(
        session, [PartTemp, PartVolt], throttlers=[temp_throttler, volt_throttler]
    )

    for i, docs in enumerate([train_docs, dev_docs, test_docs]):
        candidate_extractor.apply(docs, split=i, parallelism=PARALLEL)

    assert session.query(PartTemp).filter(PartTemp.split == 0).count() == 3493
    assert session.query(PartTemp).filter(PartTemp.split == 1).count() == 61
    assert session.query(PartTemp).filter(PartTemp.split == 2).count() == 416
    assert session.query(PartVolt).count() == 4282

    # Grab candidate lists
    train_cands = candidate_extractor.get_candidates(split=0, sort=True)
    dev_cands = candidate_extractor.get_candidates(split=1, sort=True)
    test_cands = candidate_extractor.get_candidates(split=2, sort=True)
    assert len(train_cands) == 2
    assert len(train_cands[0]) == 3493
    assert (
        len(
            candidate_extractor.get_candidates(
                docs=[session.query(Document).filter(Document.name == "112823").first()]
            )[0]
        )
        == 1432
    )

    # Featurization
    featurizer = Featurizer(session, [PartTemp, PartVolt])

    # Test that FeatureKey is properly reset
    featurizer.apply(split=1, train=True, parallelism=PARALLEL)
    assert session.query(Feature).count() == 214
    assert session.query(FeatureKey).count() == 1260

    # Test Dropping FeatureKey
    # Should force a row deletion
    featurizer.drop_keys(["DDL_e1_W_LEFT_POS_3_[NNP NN IN]"])
    assert session.query(FeatureKey).count() == 1259

    # Should only remove the part_volt as a relation and leave part_temp
    assert set(
        session.query(FeatureKey)
        .filter(FeatureKey.name == "DDL_e1_LEMMA_SEQ_[bc182]")
        .one()
        .candidate_classes
    ) == {"part_temp", "part_volt"}
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"], candidate_classes=[PartVolt])
    assert session.query(FeatureKey).filter(
        FeatureKey.name == "DDL_e1_LEMMA_SEQ_[bc182]"
    ).one().candidate_classes == ["part_temp"]
    assert session.query(FeatureKey).count() == 1259

    # Re-insert the removed key
    featurizer.upsert_keys(
        ["DDL_e1_LEMMA_SEQ_[bc182]"], candidate_classes=[PartTemp, PartVolt]
    )
    assert set(
        session.query(FeatureKey)
        .filter(FeatureKey.name == "DDL_e1_LEMMA_SEQ_[bc182]")
        .one()
        .candidate_classes
    ) == {"part_temp", "part_volt"}
    assert session.query(FeatureKey).count() == 1259
    # Removing the key again
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"], candidate_classes=[PartVolt])

    # Removing the last relation from a key should delete the row
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"], candidate_classes=[PartTemp])
    assert session.query(FeatureKey).count() == 1258
    session.query(Feature).delete(synchronize_session="fetch")
    session.query(FeatureKey).delete(synchronize_session="fetch")

    featurizer.apply(split=0, train=True, parallelism=PARALLEL)
    assert session.query(Feature).count() == 6478
    assert session.query(FeatureKey).count() == 4538
    F_train = featurizer.get_feature_matrices(train_cands)
    assert F_train[0].shape == (3493, 4538)
    assert F_train[1].shape == (2985, 4538)
    assert len(featurizer.get_keys()) == 4538

    featurizer.apply(split=1, parallelism=PARALLEL)
    assert session.query(Feature).count() == 6692
    assert session.query(FeatureKey).count() == 4538
    F_dev = featurizer.get_feature_matrices(dev_cands)
    assert F_dev[0].shape == (61, 4538)
    assert F_dev[1].shape == (153, 4538)

    featurizer.apply(split=2, parallelism=PARALLEL)
    assert session.query(Feature).count() == 8252
    assert session.query(FeatureKey).count() == 4538
    F_test = featurizer.get_feature_matrices(test_cands)
    assert F_test[0].shape == (416, 4538)
    assert F_test[1].shape == (1144, 4538)

    gold_file = "tests/data/hardware_tutorial_gold.csv"

    labeler = Labeler(session, [PartTemp, PartVolt])

    labeler.apply(
        docs=last_docs,
        lfs=[[gold], [gold]],
        table=GoldLabel,
        train=True,
        parallelism=PARALLEL,
    )
    assert session.query(GoldLabel).count() == 8252

    stg_temp_lfs = [
        LF_storage_row,
        LF_operating_row,
        LF_temperature_row,
        LF_tstg_row,
        LF_to_left,
        LF_negative_number_left,
    ]

    ce_v_max_lfs = [
        LF_bad_keywords_in_row,
        LF_current_in_row,
        LF_non_ce_voltages_in_row,
    ]

    with pytest.raises(ValueError):
        labeler.apply(split=0, lfs=stg_temp_lfs, train=True, parallelism=PARALLEL)

    labeler.apply(
        docs=train_docs,
        lfs=[stg_temp_lfs, ce_v_max_lfs],
        train=True,
        parallelism=PARALLEL,
    )
    assert session.query(Label).count() == 6478
    assert session.query(LabelKey).count() == 9
    L_train = labeler.get_label_matrices(train_cands)
    assert L_train[0].shape == (3493, 9)
    assert L_train[1].shape == (2985, 9)
    assert len(labeler.get_keys()) == 9

    # Test Dropping LabelerKey
    labeler.drop_keys(["LF_storage_row"])
    assert len(labeler.get_keys()) == 8

    # Test Upserting LabelerKey
    labeler.upsert_keys(["LF_storage_row"])
    assert "LF_storage_row" in [label.name for label in labeler.get_keys()]

    L_train_gold = labeler.get_gold_labels(train_cands)
    assert L_train_gold[0].shape == (3493, 1)

    L_train_gold = labeler.get_gold_labels(train_cands, annotator="gold")
    assert L_train_gold[0].shape == (3493, 1)

    gen_model = LabelModel()
    gen_model.fit(L_train=L_train[0], n_epochs=500, log_freq=100)

    train_marginals = gen_model.predict_proba(L_train[0])

    disc_model = LogisticRegression()
    disc_model.train(
        (train_cands[0], F_train[0]),
        train_marginals,
        X_dev=(train_cands[0], F_train[0]),
        Y_dev=L_train_gold[0].reshape(-1),
        b=0.6,
        pos_label=TRUE,
        n_epochs=5,
        lr=0.001,
    )

    test_score = disc_model.predict((test_cands[0], F_test[0]), b=0.6, pos_label=TRUE)
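    # b=0.6 is presumably the decision threshold on the positive-class
    # probability, so only fairly confident predictions are marked TRUE.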
    true_pred = [test_cands[0][_] for _ in np.nditer(np.where(test_score == TRUE))]

    pickle_file = "tests/data/parts_by_doc_dict.pkl"
    with open(pickle_file, "rb") as f:
        parts_by_doc = pickle.load(f)

    (TP, FP, FN) = entity_level_f1(
        true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc
    )

    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")

    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")

    assert 0.3 < f1 < 0.7

    stg_temp_lfs_2 = [
        LF_to_left,
        LF_test_condition_aligned,
        LF_collector_aligned,
        LF_current_aligned,
        LF_voltage_row_temp,
        LF_voltage_row_part,
        LF_typ_row,
        LF_complement_left_row,
        LF_too_many_numbers_row,
        LF_temp_on_high_page_num,
        LF_temp_outside_table,
        LF_not_temp_relevant,
    ]
    labeler.update(split=0, lfs=[stg_temp_lfs_2, ce_v_max_lfs], parallelism=PARALLEL)
    assert session.query(Label).count() == 6478
    assert session.query(LabelKey).count() == 16
    L_train = labeler.get_label_matrices(train_cands)
    assert L_train[0].shape == (3493, 16)

    gen_model = LabelModel()
    gen_model.fit(L_train=L_train[0], n_epochs=500, log_freq=100)

    train_marginals = gen_model.predict_proba(L_train[0])

    disc_model = LogisticRegression()
    disc_model.train(
        (train_cands[0], F_train[0]), train_marginals, n_epochs=5, lr=0.001
    )

    test_score = disc_model.predict((test_cands[0], F_test[0]), b=0.6, pos_label=TRUE)
    true_pred = [test_cands[0][_] for _ in np.nditer(np.where(test_score == TRUE))]

    (TP, FP, FN) = entity_level_f1(
        true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc
    )

    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")

    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")

    assert f1 > 0.7

    # Testing LSTM
    disc_model = LSTM()
    disc_model.train(
        (train_cands[0], F_train[0]), train_marginals, n_epochs=5, lr=0.001
    )

    test_score = disc_model.predict((test_cands[0], F_test[0]), b=0.6, pos_label=TRUE)
    true_pred = [test_cands[0][_] for _ in np.nditer(np.where(test_score == TRUE))]

    (TP, FP, FN) = entity_level_f1(
        true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc
    )

    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")

    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")

    assert f1 > 0.7

    # Testing Sparse Logistic Regression
    disc_model = SparseLogisticRegression()
    disc_model.train(
        (train_cands[0], F_train[0]), train_marginals, n_epochs=5, lr=0.001
    )

    test_score = disc_model.predict((test_cands[0], F_test[0]), b=0.6, pos_label=TRUE)
    true_pred = [test_cands[0][_] for _ in np.nditer(np.where(test_score == TRUE))]

    (TP, FP, FN) = entity_level_f1(
        true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc
    )

    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")

    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")

    assert f1 > 0.7

    # Testing Sparse LSTM
    disc_model = SparseLSTM()
    disc_model.train(
        (train_cands[0], F_train[0]), train_marginals, n_epochs=5, lr=0.001
    )

    test_score = disc_model.predict((test_cands[0], F_test[0]), b=0.6, pos_label=TRUE)
    true_pred = [test_cands[0][_] for _ in np.nditer(np.where(test_score == TRUE))]

    (TP, FP, FN) = entity_level_f1(
        true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc
    )

    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")

    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")

    assert f1 > 0.7

    # Evaluate mention level scores
    L_test_gold = labeler.get_gold_labels(test_cands, annotator="gold")
    Y_test = L_test_gold[0].reshape(-1)

    scores = disc_model.score((test_cands[0], F_test[0]), Y_test, b=0.6, pos_label=TRUE)

    logger.info(scores)

    assert scores["f1"] > 0.6
data_handler = dh.DataHandler()

cui2vec = Cui2Vec().cui2vec

(X_gold_sent, X_gold_shortest_path, X_gold_src, X_gold_tgt,
 X_gold_src_txt, X_gold_tgt_txt, y_gold) = data_handler.get_test_data()

(X_val_sent, X_val_shortest_path, X_val_src, X_val_tgt,
 X_val_src_txt, X_val_tgt_txt, y_val) = data_handler.get_validation_data()

applier = PandasLFApplier(label_functions.lfs)

df_train = pd.DataFrame(
    list(zip(*data_handler.get_training_data())),
    columns=['shortest_path', 'sent', 'src', 'tgt', 'src_txt', 'tgt_txt'])

L_train = applier.apply(df_train)
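# L_train is an (n_examples, n_labeling_functions) matrix of integer votes,
# with -1 denoting that a labeling function abstained on that row.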

label_model = LabelModel(cardinality=len(rel_names.rels_txt_to_int), verbose=True)
label_model.fit(L_train, n_epochs=1000, lr=0.01, log_freq=100, seed=123)

label_model.save('./models/LabelModel.model')
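# The fitted model can later be reloaded with label_model.load('./models/LabelModel.model')
# instead of refitting.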

train_probs = label_model.predict_proba(L_train)
train_preds = probs_to_preds(train_probs, tie_break_policy='abstain')
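# With tie_break_policy='abstain', rows whose class probabilities are tied get
# a -1 prediction; these are remapped to otherwiseRelated below.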

df_train = df_train.join(
    pd.DataFrame({'preds': train_preds, 'probs': list(map(max, train_probs))}))

# Map abstains (-1) to the otherwiseRelated catch-all class
df_train.loc[df_train.preds == -1, 'preds'] = rel_names.rels_txt_to_int['otherwiseRelated']

# Downsample otherwiseRelated so its count roughly matches the mean class count
dropNum = (len(df_train[df_train.preds == rel_names.rels_txt_to_int['otherwiseRelated']])
           - int(df_train['preds'].value_counts().mean()))
df_train = df_train.drop(
    df_train[df_train.preds == rel_names.rels_txt_to_int['otherwiseRelated']]
    .sample(dropNum).index)
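
# Hedged sketch (not in the original script): a minimal sanity check of the
# fitted label model on the validation split loaded above. It assumes the
# labeling functions expect the same column order as df_train and that y_val
# holds integer labels compatible with rel_names.rels_txt_to_int.
df_val = pd.DataFrame(
    list(zip(X_val_shortest_path, X_val_sent, X_val_src, X_val_tgt,
             X_val_src_txt, X_val_tgt_txt)),
    columns=['shortest_path', 'sent', 'src', 'tgt', 'src_txt', 'tgt_txt'])
L_val = applier.apply(df_val)
val_metrics = label_model.score(L=L_val, Y=y_val)
print('Label model validation accuracy:', val_metrics['accuracy'])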