Example #1
    def train(self, dataset):
        # Apply labeling functions to the training set
        lfs_applier = PandasLFApplier(lfs=self.lfs)
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore')
            lfs_train = lfs_applier.apply(df=dataset)

        # Build probabilistic label model
        label_model = LabelModel(cardinality=3, verbose=True)
        label_model.fit(L_train=lfs_train, n_epochs=500, log_freq=100, seed=42)
        label_probs = label_model.predict_proba(lfs_train)

        # Filter unlabeled data points
        df_filtered, probs_filtered = filter_unlabeled_dataframe(X=dataset,
                                                                 y=label_probs,
                                                                 L=lfs_train)

        # Featurize data using scikit-learn
        self.vectorizer = CountVectorizer(ngram_range=(1, 5))
        dataset_train = self.vectorizer.fit_transform(
            df_filtered.sentence.tolist())

        # Replace probabilistic labels with most likely label
        preds_filtered = probs_to_preds(probs=probs_filtered)

        # Train scikit model
        self.model = LogisticRegression(C=1e3,
                                        solver="liblinear",
                                        multi_class='auto')
        self.model.fit(X=dataset_train, y=preds_filtered)
Example #2
def main(data_path, output_path):
    # Read data
    logging.info(f"Reading data from {data_path}")
    sc = SparkContext()
    sql = SQLContext(sc)
    data = sql.read.parquet(data_path)

    # Build label matrix
    logging.info("Applying LFs")
    lfs = [article_mentions_person, body_contains_fortune, person_in_db]
    applier = SparkLFApplier(lfs)
    L = applier.apply(data.rdd)

    # Train label model
    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(L)

    # Generate training labels
    logging.info("Generating probabilistic labels")
    y_prob = label_model.predict_proba(L)[:, 1]
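    # Build a literal array column of the positive-class probabilities and attach it to the DataFrame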
    y_prob_sql_array = F.array([F.lit(y) for y in y_prob])
    data_labeled = data.withColumn("y_prob", y_prob_sql_array)
    data_labeled.write.mode("overwrite").parquet(output_path)
    logging.info(f"Labels saved to {output_path}")
Example #3
def generative_model(L_train, n_epochs=500, print_every=100):
    model = LabelModel(cardinality=2)

    logger.info("Training generative model...")
    model.fit(L_train=L_train,
              n_epochs=n_epochs,
              seed=1234,
              log_freq=print_every)
    logger.info("Done.")

    marginals = model.predict_proba(L_train)

    return marginals
Example #4
    def test_labeling_convergence(self) -> None:
        """Test convergence of end to end labeling pipeline."""
        # Apply LFs
        labeling_functions = ([f] + [
            get_positive_labeling_function(divisor) for divisor in range(2, 9)
        ] + [
            get_negative_labeling_function(divisor) for divisor in range(2, 9)
        ])
        applier = PandasLFApplier(labeling_functions)
        L_train = applier.apply(self.df_train, progress_bar=False)

        self.assertEqual(L_train.shape,
                         (self.N_TRAIN, len(labeling_functions)))

        # Train LabelModel
        label_model = LabelModel(cardinality=self.cardinality, verbose=False)
        label_model.fit(L_train, n_epochs=100, lr=0.01, l2=0.0)
        Y_lm = label_model.predict_proba(L_train).argmax(axis=1)
        Y = self.df_train.y
        err = np.where(Y != Y_lm, 1, 0).sum() / self.N_TRAIN
        self.assertLess(err, 0.05)
Example #5
def label_model_creator(df_dev, Y_dev, df_train, df_test, Y_test):

    # Accumulate all the labeling_functions for supply
    supply_lfs = [
        lf_supply, lf_customer, lf_sales_to, lf_our_customer, lf_acquisition,
        lf_people, lf_sold, lf_relation, lf_competition
    ]

    # Create an applier for the labeling functions over Pandas DataFrames
    applier = PandasLFApplier(supply_lfs)

    # Apply the labeling functions to the development, train, and test sets
    L_dev = applier.apply(df_dev)
    L_train = applier.apply(df_train)
    L_test = applier.apply(df_test)

    # cardinality: 2 (True and False)
    label_model = LabelModel(cardinality=2, verbose=True)

    # Fit the label_model
    label_model.fit(L_train, Y_dev, n_epochs=5000, log_freq=500)

    # accuracy for the label model using the test set
    label_model_acc = label_model.score(L=L_test,
                                        Y=Y_test,
                                        tie_break_policy="random")["accuracy"]
    print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")

    # Check the F1 and ROC-AUC scores
    probs_dev = label_model.predict_proba(L_dev)
    preds_dev = probs_to_preds(probs_dev)
    print(
        f"Label model f1 score: {metric_score(Y_dev, preds_dev, probs=probs_dev, metric='f1')}"
    )
    print(
        f"Label model roc-auc: {metric_score(Y_dev, preds_dev, probs=probs_dev, metric='roc_auc')}"
    )

    return label_model, L_train
Example #6
def labeling_evaluation(df_train, df_test, label_model):
    lfs = [
        LabelingFunction.lf_ind_keyword, LabelingFunction.lf_short,
        LabelingFunction.lf_cmp_re, LabelingFunction.lf_industry_keyword,
        LabelingFunction.lf_surname_re, LabelingFunction.industry_cls
    ]

    applier = PandasLFApplier(lfs=lfs)
    L_train = applier.apply(df=df_train)
    L_test = applier.apply(df=df_test)
    Y_test = df_test.label.values
    analysis = LFAnalysis(L=L_train, lfs=lfs).lf_summary()

    if label_model == "majority":
        majority_model = MajorityLabelVoter()
        preds_train = majority_model.predict(L=L_train)
        majority_acc = majority_model.score(
            L=L_test, Y=Y_test, tie_break_policy="random")["accuracy"]
        print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")

        df_train_filtered, preds_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=preds_train, L=L_train)
        return df_train_filtered, preds_train_filtered, analysis

    if label_model == "weighted":
        label_model = LabelModel(cardinality=len(
            [c for c in dir(Polarity) if not c.startswith("__")]),
                                 verbose=True)
        label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)
        probs_train = label_model.predict_proba(L_train)
        label_model_acc = label_model.score(
            L=L_test, Y=Y_test, tie_break_policy="random")["accuracy"]
        print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")

        df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=probs_train, L=L_train)
        preds_train_filtered = probs_to_preds(probs_train_filtered)
        return df_train_filtered, probs_train_filtered, preds_train_filtered, analysis
Example #7
    def train(self):
        '''
        Train the logistic regression discriminative model
        '''
        # We pull out the label vectors for ease of use later
        Y_test = self.df_test.label.values

        applier = PandasLFApplier(lfs=self.lfs)
        L_train = applier.apply(df=self.df_train)

        # Use the LabelModel to combine the LF outputs into probabilistic labels
        label_model = LabelModel(cardinality=2, verbose=True)
        label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)

        # Make predictions
        probs_train = label_model.predict_proba(L=L_train)

        # Filter abstained inputs
        df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
            X=self.df_train, y=probs_train, L=L_train)

        # Represent each data point as a bag-of-ngrams count vector
        vectorizer = CountVectorizer(ngram_range=(1, 5))
        X_train = vectorizer.fit_transform(df_train_filtered.text.tolist())
        X_test = vectorizer.transform(self.df_test.text.tolist())

        # Turn probs into preds
        preds_train_filtered = probs_to_preds(probs=probs_train_filtered)

        # Train logistic regression model
        sklearn_model = LogisticRegression(C=1e3, solver="liblinear")
        sklearn_model.fit(X=X_train, y=preds_train_filtered)

        print(
            f"Test Accuracy: {sklearn_model.score(X=X_test, y=Y_test) * 100:.1f}%"
        )
        dump(sklearn_model, 'sklearn_model.joblib')
        dump(vectorizer, 'vectorizer.joblib')
Example #8
def main(data_path, output_path):
    # Read data
    logging.info(f"Reading data from {data_path}")
    data = dd.read_parquet(data_path)
    data = data.repartition(npartitions=2)

    # Build label matrix
    logging.info("Applying LFs")
    lfs = [article_mentions_person, body_contains_fortune, person_in_db]
    applier = DaskLFApplier(lfs)
    L = applier.apply(data)

    # Train label model
    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(L)

    # Generate training labels
    logging.info("Generating probabilistic labels")
    y_prob = label_model.predict_proba(L)[:, 1]
    data = data.reset_index().set_index("index")
    data_labeled = data.assign(y_prob=dd.from_array(y_prob))
    dd.to_parquet(data_labeled, output_path)
    logging.info(f"Labels saved to {output_path}")
Example #9
def train_end_classifier(Xtrain,
                         Xtest,
                         iwssession,
                         lfsets,
                         device='cuda',
                         gap=20,
                         class_balance=None,
                         modelparams=None,
                         uniform=False,
                         verbose=False):
    """
    Function to fit label model, train downstream classifier, and return test set predictions

    Parameters
    ----------
    Xtrain : ndarray of shape (n training samples,d features)
        Features for training data
    Xtest : ndarray of shape (n test samples,d features)
        Features for test data
    iwssession : Object
        An instance of InteractiveWeakSupervision class
    lfsets : dict
        A dictionary containing, for each run of IWS, the final LF indices for each iteration
            {runindex: {iteration: LF_indices}}
    device : str, default = 'cuda'
        String passed to torch to identify which device to use, e.g. "cpu" or 'cuda:0'
    gap : int, default = 20
        Provide downstream results every "gap" iterations
    class_balance : tuple, default = None
        Class balance tuple (negative class fraction, positive class fraction)
        passed to graphical model, e.g. class_balance = (0.5,0.5)
    modelparams : dict, default = None
        Dictionary containing sizes of hidden layers and activation functions of the downstream MLP
    uniform : bool, default = False
        Use uniform weighted LFs to obtain label instead of fitting a graphical model to learn weights.
    verbose : bool, default = False
        Print iteration info if true.

    Returns
    -------
    results: dict
        A dictionary containing the probabilistic test set predictions for each iws run,
        and each internal iteration
        {runindex: {iteration_idx: test_predictions}}
    """
    if modelparams is None:
        modelparams = {
            'h_sizes': [Xtrain.shape[1], 20, 20],
            'activations': [torch.nn.ReLU(), torch.nn.ReLU()]
        }

    results = {}
    # for each run of IWS
    for key, iterdict in lfsets.items():
        results[key] = {}
        # establish which IWS iterations to obtain results for
        itermax = len(iterdict.keys())
        finaliter = itermax - 1
        iters_to_run = list(range(0, itermax, gap))
        if finaliter not in iters_to_run:
            # always obtain results for final iteration
            iters_to_run.append(finaliter)

        for iteration_idx in iters_to_run:
            if verbose:
                print('IWS run: %d' % key, ' iteration: %d' % iteration_idx)
            trainidxs = iterdict[iteration_idx]
            # get selected LFs

            if uniform:
                LFStmp = np.asarray(iwssession.LFs_csc[:, trainidxs].todense())
                n, m = LFStmp.shape
                weights = np.ones(m)
                rowsums = np.asarray((LFStmp != 0).sum(1)).flatten()
                filteridx = rowsums != 0

                posevidence = ((LFStmp == 1).astype(np.float32)).dot(weights)
                negevidence = ((LFStmp == -1).astype(np.float32)).dot(weights)

                posevidence = np.clip(posevidence, 0.0, 700.0)
                negevidence = np.clip(negevidence, 0.0, 700.0)

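                # Two-way softmax over the accumulated evidence gives the positive-class posterior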
                bin_posterior = np.exp(posevidence) / (np.exp(posevidence) +
                                                       np.exp(negevidence))
                bin_posterior = bin_posterior.astype(np.float32)
            else:
                Lambdas = np.asarray(iwssession.LFs_csc[:,
                                                        trainidxs].todense())
                # convert to Snorkel's LF convention: 0 (abstain) -> -1, -1 (negative) -> 0
                rowsums = (Lambdas != 0).sum(1)
                filteridx = rowsums != 0
                Lambda_snorkel = np.copy(Lambdas)
                Lambda_snorkel[Lambda_snorkel == 0] = -10
                Lambda_snorkel[Lambda_snorkel == -1] = 0
                Lambda_snorkel[Lambda_snorkel == -10] = -1

                # create variable to filter out samples with 0 LF votes

                # train label model
                if 'cuda' in device:
                    torch.cuda.empty_cache()
                    label_model = LabelModel(cardinality=2,
                                             verbose=True,
                                             device=device)
                    label_model.fit(Lambda_snorkel[filteridx],
                                    class_balance=class_balance)
                    torch.cuda.empty_cache()
                else:
                    label_model = LabelModel(cardinality=2, verbose=True)
                    label_model.fit(Lambda_snorkel[filteridx],
                                    class_balance=class_balance)

                # get label estimate
                posterior = label_model.predict_proba(Lambda_snorkel)
                bin_posterior = posterior[:, 1].astype(np.float32)

            tmpindicator = np.isnan(bin_posterior)
            if tmpindicator.sum() > 0:
                bin_posterior[tmpindicator] = np.median(
                    bin_posterior[~tmpindicator])

            # train classifier on label estimate and get test set prediction
            Xtrain_filtered = Xtrain[filteridx]
            probs_train_filtered = bin_posterior[filteridx]
            torch.cuda.empty_cache()
            model = TorchMLP(h_sizes=modelparams['h_sizes'],
                             activations=modelparams['activations'],
                             optimizer='Adam',
                             nepochs=250)

            if 'cuda' in device:
                tdevice = torch.device(device)
                model.model.to(tdevice)
                model.fit(Xtrain_filtered,
                          probs_train_filtered,
                          device=tdevice)
                test_predictions = model.predict_proba(Xtest, device=tdevice)
            else:
                model.fit(Xtrain_filtered, probs_train_filtered)
                test_predictions = model.predict_proba(Xtest)
            results[key][iteration_idx] = test_predictions
    return results
Example #10
def get_probabilistic_labels(iwssession,
                             lfsets,
                             device='cuda',
                             gap=20,
                             class_balance=None,
                             uniform=False,
                             verbose=False):
    """
    Function to fit the label model and return probabilistic training labels

    Parameters
    ----------
    iwssession : Object
        An instance of InteractiveWeakSupervision class
    lfsets : dict
        A dictionary containing, for each run of IWS, the final LF indices for each iteration
            {runindex: {iteration: LF_indices}}
    device : str, default = 'cuda'
        String passed to torch to identify which device to use, e.g. "cpu" or 'cuda:0'
    gap : int, default = 20
        Provide downstream results every "gap" iterations
    class_balance : tuple, default = None
        Class balance tuple (negative class fraction, positive class fraction)
        passed to graphical model, e.g. class_balance = (0.5,0.5)
    uniform : bool, default = False
        Use uniform weighted LFs to obtain label instead of fitting a graphical model to learn weights.
    verbose : bool, default = False
        Print iteration info if true.

    Returns
    -------
    results: dict
        A dictionary containing the probabilistic train labels and a boolean filter index variable for each iws run,
        and each internal iteration. The filter index variable is True for every sample where we have at least one
        non-abstain vote.
        {runindex: {iteration_idx: (prob_labels, filteridx)}}
    """

    results = {}
    # for each run of IWS
    for key, iterdict in lfsets.items():
        results[key] = {}
        # establish which IWS iterations to obtain results for
        itermax = len(iterdict.keys())
        finaliter = itermax - 1
        iters_to_run = list(range(0, itermax, gap))
        if finaliter not in iters_to_run:
            # always obtain results for final iteration
            iters_to_run.append(finaliter)

        for iteration_idx in iters_to_run:
            if verbose:
                print('IWS run: %d' % key, ' iteration: %d' % iteration_idx)
            trainidxs = iterdict[iteration_idx]
            # get selected LFs

            if uniform:
                LFStmp = iwssession.LFs_csc[:, trainidxs].copy()
                n, m = LFStmp.shape
                weights = np.ones(m)
                rowsums = np.asarray((LFStmp != 0).sum(1)).flatten()
                filteridx = rowsums != 0

                posevidence = ((LFStmp == 1).astype(np.float32)).dot(weights)
                negevidence = ((LFStmp == -1).astype(np.float32)).dot(weights)
                posevidence = np.asarray(posevidence).flatten()
                negevidence = np.asarray(negevidence).flatten()

                posevidence = np.clip(posevidence, 0.0, 700.0)
                negevidence = np.clip(negevidence, 0.0, 700.0)

                bin_posterior = np.exp(posevidence) / (np.exp(posevidence) +
                                                       np.exp(negevidence))
                bin_posterior = bin_posterior.astype(np.float32)
            else:
                Lambdas = np.asarray(iwssession.LFs_csc[:,
                                                        trainidxs].todense())
                # convert to Snorkel's LF convention: 0 (abstain) -> -1, -1 (negative) -> 0
                rowsums = (Lambdas != 0).sum(1)
                filteridx = rowsums != 0
                Lambda_snorkel = np.copy(Lambdas)
                Lambda_snorkel[Lambda_snorkel == 0] = -10
                Lambda_snorkel[Lambda_snorkel == -1] = 0
                Lambda_snorkel[Lambda_snorkel == -10] = -1

                # create variable to filter out samples with 0 LF votes

                # train label model
                if 'cuda' in device:
                    torch.cuda.empty_cache()
                    label_model = LabelModel(cardinality=2,
                                             verbose=True,
                                             device=device)
                    label_model.fit(Lambda_snorkel[filteridx],
                                    class_balance=class_balance)
                    torch.cuda.empty_cache()
                else:
                    label_model = LabelModel(cardinality=2, verbose=True)
                    label_model.fit(Lambda_snorkel[filteridx],
                                    class_balance=class_balance)

                # get label estimate
                posterior = label_model.predict_proba(Lambda_snorkel)
                bin_posterior = posterior[:, 1].astype(np.float32)

            tmpindicator = np.isnan(bin_posterior)
            if tmpindicator.sum() > 0:
                bin_posterior[tmpindicator] = np.median(
                    bin_posterior[~tmpindicator])

            results[key][iteration_idx] = (bin_posterior, filteridx)
    return results
Example #11
    # Define train dataset
    L_train = L_data_local[train_idx]
    Y_train = Y_data_local[train_idx]
    # Define test dataset
    L_test = L_data_local[test_idx]
    Y_test = Y_data_local[test_idx]

    # Evaluate a dependency-informed Snorkel model
    l_model = LabelModel(cardinality=2, verbose=False)
    l_model.fit(L_train, n_epochs=n_epochs, lr=lr)

    try:
        if abstain_rate < 0:
            Y_pred = l_model.predict(L_test, tie_break_policy="abstain")
        else:
            Y_prob = l_model.predict_proba(L_test)
            Y_pred = predict_at_abstain_rate(Y_prob, abstain_rate)

        scores = scorer.score(Y_test, preds=Y_pred)
        all_scores.append(scores)
    except Exception as e:
        print("Iter {}: {}".format(i+1,e))
        continue
    
    # Logging
    print("Iteration " + str(i+1) + ":", scores)

print("-- SUMMARY --")
print("accuracy: AVG {:.3f}, STD {:.3f}".format(np.mean([s["accuracy"] for s in all_scores]), np.std([s["accuracy"] for s in all_scores])))
print("f1: AVG {:.3f}, STD {:.3f}".format(np.mean([s["f1"] for s in all_scores]), np.std([s["f1"] for s in all_scores])))
print("abstain rate: AVG {:.3f}, STD {:.3f}".format(np.mean([s["abstain rate"] for s in all_scores]), np.std([s["abstain rate"] for s in all_scores])))
Example #12
labeler = Labeler(session, candidate_classes)
labeler.apply(docs=train_docs, lfs=[[gold]], table=GoldLabel, train=True)

from fonduer_lfs import president_name_pob_lfs

labeler.apply(split=0, lfs=[president_name_pob_lfs], train=True, parallelism=PARALLEL)
L_train = labeler.get_label_matrices(train_cands)

L_gold_train = labeler.get_gold_labels(train_cands, annotator="gold")

from snorkel.labeling.model import LabelModel

label_model = LabelModel(verbose=False)
label_model.fit(L_train[0], n_epochs=500)

train_marginals = label_model.predict_proba(L_train[0])

ATTRIBUTE = "wiki"

import numpy as np
import emmental
from emmental.data import EmmentalDataLoader
from emmental.learner import EmmentalLearner
from emmental.model import EmmentalModel
from emmental.modules.embedding_module import EmbeddingModule
from fonduer.learning.dataset import FonduerDataset
from fonduer.learning.task import create_task
from fonduer.learning.utils import collect_word_counter
# Collect word counter
word_counter = collect_word_counter(train_cands)
Example #13
def test_e2e():
    """Run an end-to-end test on documents of the hardware domain."""
    # GitHub Actions gives 2 cores
    # help.github.com/en/actions/reference/virtual-environments-for-github-hosted-runners
    PARALLEL = 2

    max_docs = 12

    fonduer.init_logging(
        format="[%(asctime)s][%(levelname)s] %(name)s:%(lineno)s - %(message)s",
        level=logging.INFO,
    )

    session = fonduer.Meta.init(CONN_STRING).Session()

    docs_path = "tests/data/html/"
    pdf_path = "tests/data/pdf/"

    doc_preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)

    corpus_parser = Parser(
        session,
        parallelism=PARALLEL,
        structural=True,
        lingual=True,
        visual=True,
        pdf_path=pdf_path,
    )
    corpus_parser.apply(doc_preprocessor)
    assert session.query(Document).count() == max_docs

    num_docs = session.query(Document).count()
    logger.info(f"Docs: {num_docs}")
    assert num_docs == max_docs

    num_sentences = session.query(Sentence).count()
    logger.info(f"Sentences: {num_sentences}")

    # Divide into test and train
    docs = sorted(corpus_parser.get_documents())
    last_docs = sorted(corpus_parser.get_last_documents())

    ld = len(docs)
    assert ld == len(last_docs)
    assert len(docs[0].sentences) == len(last_docs[0].sentences)

    assert len(docs[0].sentences) == 799
    assert len(docs[1].sentences) == 663
    assert len(docs[2].sentences) == 784
    assert len(docs[3].sentences) == 661
    assert len(docs[4].sentences) == 513
    assert len(docs[5].sentences) == 700
    assert len(docs[6].sentences) == 528
    assert len(docs[7].sentences) == 161
    assert len(docs[8].sentences) == 228
    assert len(docs[9].sentences) == 511
    assert len(docs[10].sentences) == 331
    assert len(docs[11].sentences) == 528

    # Check table numbers
    assert len(docs[0].tables) == 9
    assert len(docs[1].tables) == 9
    assert len(docs[2].tables) == 14
    assert len(docs[3].tables) == 11
    assert len(docs[4].tables) == 11
    assert len(docs[5].tables) == 10
    assert len(docs[6].tables) == 10
    assert len(docs[7].tables) == 2
    assert len(docs[8].tables) == 7
    assert len(docs[9].tables) == 10
    assert len(docs[10].tables) == 6
    assert len(docs[11].tables) == 9

    # Check figure numbers
    assert len(docs[0].figures) == 32
    assert len(docs[1].figures) == 11
    assert len(docs[2].figures) == 38
    assert len(docs[3].figures) == 31
    assert len(docs[4].figures) == 7
    assert len(docs[5].figures) == 38
    assert len(docs[6].figures) == 10
    assert len(docs[7].figures) == 31
    assert len(docs[8].figures) == 4
    assert len(docs[9].figures) == 27
    assert len(docs[10].figures) == 5
    assert len(docs[11].figures) == 27

    # Check caption numbers
    assert len(docs[0].captions) == 0
    assert len(docs[1].captions) == 0
    assert len(docs[2].captions) == 0
    assert len(docs[3].captions) == 0
    assert len(docs[4].captions) == 0
    assert len(docs[5].captions) == 0
    assert len(docs[6].captions) == 0
    assert len(docs[7].captions) == 0
    assert len(docs[8].captions) == 0
    assert len(docs[9].captions) == 0
    assert len(docs[10].captions) == 0
    assert len(docs[11].captions) == 0

    train_docs = set()
    dev_docs = set()
    test_docs = set()
    splits = (0.5, 0.75)
    data = [(doc.name, doc) for doc in docs]
    data.sort(key=lambda x: x[0])
    for i, (doc_name, doc) in enumerate(data):
        if i < splits[0] * ld:
            train_docs.add(doc)
        elif i < splits[1] * ld:
            dev_docs.add(doc)
        else:
            test_docs.add(doc)
    logger.info([x.name for x in train_docs])

    # NOTE: With multi-relation support, return values of getting candidates,
    # mentions, or sparse matrices are formatted as a list of lists. This means
    # that with a single relation, we need to index into the list of lists to
    # get the candidates/mentions/sparse matrix for a particular relation or
    # mention.

    # Mention Extraction
    part_ngrams = MentionNgramsPart(parts_by_doc=None, n_max=3)
    temp_ngrams = MentionNgramsTemp(n_max=2)
    volt_ngrams = MentionNgramsVolt(n_max=1)

    Part = mention_subclass("Part")
    Temp = mention_subclass("Temp")
    Volt = mention_subclass("Volt")

    mention_extractor = MentionExtractor(
        session,
        [Part, Temp, Volt],
        [part_ngrams, temp_ngrams, volt_ngrams],
        [part_matcher, temp_matcher, volt_matcher],
    )

    mention_extractor.apply(docs, parallelism=PARALLEL)

    assert session.query(Part).count() == 299
    assert session.query(Temp).count() == 138
    assert session.query(Volt).count() == 140
    assert len(mention_extractor.get_mentions()) == 3
    assert len(mention_extractor.get_mentions()[0]) == 299
    assert (len(
        mention_extractor.get_mentions(docs=[
            session.query(Document).filter(Document.name == "112823").first()
        ])[0]) == 70)

    # Candidate Extraction
    PartTemp = candidate_subclass("PartTemp", [Part, Temp])
    PartVolt = candidate_subclass("PartVolt", [Part, Volt])

    candidate_extractor = CandidateExtractor(
        session, [PartTemp, PartVolt],
        throttlers=[temp_throttler, volt_throttler])

    for i, docs in enumerate([train_docs, dev_docs, test_docs]):
        candidate_extractor.apply(docs, split=i, parallelism=PARALLEL)

    assert session.query(PartTemp).filter(PartTemp.split == 0).count() == 3493
    assert session.query(PartTemp).filter(PartTemp.split == 1).count() == 61
    assert session.query(PartTemp).filter(PartTemp.split == 2).count() == 416
    assert session.query(PartVolt).count() == 4282

    # Grab candidate lists
    train_cands = candidate_extractor.get_candidates(split=0, sort=True)
    dev_cands = candidate_extractor.get_candidates(split=1, sort=True)
    test_cands = candidate_extractor.get_candidates(split=2, sort=True)
    assert len(train_cands) == 2
    assert len(train_cands[0]) == 3493
    assert (len(
        candidate_extractor.get_candidates(docs=[
            session.query(Document).filter(Document.name == "112823").first()
        ])[0]) == 1432)

    # Featurization
    featurizer = Featurizer(session, [PartTemp, PartVolt])

    # Test that FeatureKey is properly reset
    featurizer.apply(split=1, train=True, parallelism=PARALLEL)
    assert session.query(Feature).count() == 214
    assert session.query(FeatureKey).count() == 1260

    # Test Dropping FeatureKey
    # Should force a row deletion
    featurizer.drop_keys(["DDL_e1_W_LEFT_POS_3_[NNP NN IN]"])
    assert session.query(FeatureKey).count() == 1259

    # Should only remove the part_volt as a relation and leave part_temp
    assert set(
        session.query(FeatureKey).filter(
            FeatureKey.name ==
            "DDL_e1_LEMMA_SEQ_[bc182]").one().candidate_classes) == {
                "part_temp", "part_volt"
            }
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"],
                         candidate_classes=[PartVolt])
    assert session.query(FeatureKey).filter(
        FeatureKey.name ==
        "DDL_e1_LEMMA_SEQ_[bc182]").one().candidate_classes == ["part_temp"]
    assert session.query(FeatureKey).count() == 1259

    # Inserting the removed key
    featurizer.upsert_keys(["DDL_e1_LEMMA_SEQ_[bc182]"],
                           candidate_classes=[PartTemp, PartVolt])
    assert set(
        session.query(FeatureKey).filter(
            FeatureKey.name ==
            "DDL_e1_LEMMA_SEQ_[bc182]").one().candidate_classes) == {
                "part_temp", "part_volt"
            }
    assert session.query(FeatureKey).count() == 1259
    # Removing the key again
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"],
                         candidate_classes=[PartVolt])

    # Removing the last relation from a key should delete the row
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"],
                         candidate_classes=[PartTemp])
    assert session.query(FeatureKey).count() == 1258
    session.query(Feature).delete(synchronize_session="fetch")
    session.query(FeatureKey).delete(synchronize_session="fetch")

    featurizer.apply(split=0, train=True, parallelism=PARALLEL)
    assert session.query(Feature).count() == 6478
    assert session.query(FeatureKey).count() == 4538
    F_train = featurizer.get_feature_matrices(train_cands)
    assert F_train[0].shape == (3493, 4538)
    assert F_train[1].shape == (2985, 4538)
    assert len(featurizer.get_keys()) == 4538

    featurizer.apply(split=1, parallelism=PARALLEL)
    assert session.query(Feature).count() == 6692
    assert session.query(FeatureKey).count() == 4538
    F_dev = featurizer.get_feature_matrices(dev_cands)
    assert F_dev[0].shape == (61, 4538)
    assert F_dev[1].shape == (153, 4538)

    featurizer.apply(split=2, parallelism=PARALLEL)
    assert session.query(Feature).count() == 8252
    assert session.query(FeatureKey).count() == 4538
    F_test = featurizer.get_feature_matrices(test_cands)
    assert F_test[0].shape == (416, 4538)
    assert F_test[1].shape == (1144, 4538)

    gold_file = "tests/data/hardware_tutorial_gold.csv"

    labeler = Labeler(session, [PartTemp, PartVolt])

    labeler.apply(
        docs=last_docs,
        lfs=[[gold], [gold]],
        table=GoldLabel,
        train=True,
        parallelism=PARALLEL,
    )
    assert session.query(GoldLabel).count() == 8252

    stg_temp_lfs = [
        LF_storage_row,
        LF_operating_row,
        LF_temperature_row,
        LF_tstg_row,
        LF_to_left,
        LF_negative_number_left,
    ]

    ce_v_max_lfs = [
        LF_bad_keywords_in_row,
        LF_current_in_row,
        LF_non_ce_voltages_in_row,
    ]

    with pytest.raises(ValueError):
        labeler.apply(split=0,
                      lfs=stg_temp_lfs,
                      train=True,
                      parallelism=PARALLEL)

    labeler.apply(
        docs=train_docs,
        lfs=[stg_temp_lfs, ce_v_max_lfs],
        train=True,
        parallelism=PARALLEL,
    )
    assert session.query(Label).count() == 6478
    assert session.query(LabelKey).count() == 9
    L_train = labeler.get_label_matrices(train_cands)
    assert L_train[0].shape == (3493, 9)
    assert L_train[1].shape == (2985, 9)
    assert len(labeler.get_keys()) == 9

    # Test Dropping LabelerKey
    labeler.drop_keys(["LF_storage_row"])
    assert len(labeler.get_keys()) == 8

    # Test Upserting LabelerKey
    labeler.upsert_keys(["LF_storage_row"])
    assert "LF_storage_row" in [label.name for label in labeler.get_keys()]

    L_train_gold = labeler.get_gold_labels(train_cands)
    assert L_train_gold[0].shape == (3493, 1)

    L_train_gold = labeler.get_gold_labels(train_cands, annotator="gold")
    assert L_train_gold[0].shape == (3493, 1)

    label_model = LabelModel()
    label_model.fit(L_train=L_train[0], n_epochs=500, log_freq=100)

    train_marginals = label_model.predict_proba(L_train[0])

    # Collect word counter
    word_counter = collect_word_counter(train_cands)

    emmental.init(fonduer.Meta.log_path)

    # Training config
    config = {
        "meta_config": {
            "verbose": False
        },
        "model_config": {
            "model_path": None,
            "device": 0,
            "dataparallel": False
        },
        "learner_config": {
            "n_epochs": 5,
            "optimizer_config": {
                "lr": 0.001,
                "l2": 0.0
            },
            "task_scheduler": "round_robin",
        },
        "logging_config": {
            "evaluation_freq": 1,
            "counter_unit": "epoch",
            "checkpointing": False,
            "checkpointer_config": {
                "checkpoint_metric": {
                    f"{ATTRIBUTE}/{ATTRIBUTE}/train/loss": "min"
                },
                "checkpoint_freq": 1,
                "checkpoint_runway": 2,
                "clear_intermediate_checkpoints": True,
                "clear_all_checkpoints": True,
            },
        },
    }
    emmental.Meta.update_config(config=config)

    # Generate word embedding module
    arity = 2
    # Generate special tokens
    specials = []
    for i in range(arity):
        specials += [f"~~[[{i}", f"{i}]]~~"]

    emb_layer = EmbeddingModule(word_counter=word_counter,
                                word_dim=300,
                                specials=specials)

    diffs = train_marginals.max(axis=1) - train_marginals.min(axis=1)
    train_idxs = np.where(diffs > 1e-6)[0]

    train_dataloader = EmmentalDataLoader(
        task_to_label_dict={ATTRIBUTE: "labels"},
        dataset=FonduerDataset(
            ATTRIBUTE,
            train_cands[0],
            F_train[0],
            emb_layer.word2id,
            train_marginals,
            train_idxs,
        ),
        split="train",
        batch_size=100,
        shuffle=True,
    )

    tasks = create_task(ATTRIBUTE,
                        2,
                        F_train[0].shape[1],
                        2,
                        emb_layer,
                        model="LogisticRegression")

    model = EmmentalModel(name=f"{ATTRIBUTE}_task")

    for task in tasks:
        model.add_task(task)

    emmental_learner = EmmentalLearner()
    emmental_learner.learn(model, [train_dataloader])

    test_dataloader = EmmentalDataLoader(
        task_to_label_dict={ATTRIBUTE: "labels"},
        dataset=FonduerDataset(ATTRIBUTE, test_cands[0], F_test[0],
                               emb_layer.word2id, 2),
        split="test",
        batch_size=100,
        shuffle=False,
    )

    test_preds = model.predict(test_dataloader, return_preds=True)
    positive = np.where(
        np.array(test_preds["probs"][ATTRIBUTE])[:, TRUE] > 0.6)
    true_pred = [test_cands[0][_] for _ in positive[0]]

    pickle_file = "tests/data/parts_by_doc_dict.pkl"
    with open(pickle_file, "rb") as f:
        parts_by_doc = pickle.load(f)

    (TP, FP, FN) = entity_level_f1(true_pred,
                                   gold_file,
                                   ATTRIBUTE,
                                   test_docs,
                                   parts_by_doc=parts_by_doc)

    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")

    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")

    assert f1 < 0.7 and f1 > 0.3

    stg_temp_lfs_2 = [
        LF_to_left,
        LF_test_condition_aligned,
        LF_collector_aligned,
        LF_current_aligned,
        LF_voltage_row_temp,
        LF_voltage_row_part,
        LF_typ_row,
        LF_complement_left_row,
        LF_too_many_numbers_row,
        LF_temp_on_high_page_num,
        LF_temp_outside_table,
        LF_not_temp_relevant,
    ]
    labeler.update(split=0,
                   lfs=[stg_temp_lfs_2, ce_v_max_lfs],
                   parallelism=PARALLEL)
    assert session.query(Label).count() == 6478
    assert session.query(LabelKey).count() == 16
    L_train = labeler.get_label_matrices(train_cands)
    assert L_train[0].shape == (3493, 16)

    label_model = LabelModel()
    label_model.fit(L_train=L_train[0], n_epochs=500, log_freq=100)

    train_marginals = label_model.predict_proba(L_train[0])

    diffs = train_marginals.max(axis=1) - train_marginals.min(axis=1)
    train_idxs = np.where(diffs > 1e-6)[0]

    train_dataloader = EmmentalDataLoader(
        task_to_label_dict={ATTRIBUTE: "labels"},
        dataset=FonduerDataset(
            ATTRIBUTE,
            train_cands[0],
            F_train[0],
            emb_layer.word2id,
            train_marginals,
            train_idxs,
        ),
        split="train",
        batch_size=100,
        shuffle=True,
    )

    valid_dataloader = EmmentalDataLoader(
        task_to_label_dict={ATTRIBUTE: "labels"},
        dataset=FonduerDataset(
            ATTRIBUTE,
            train_cands[0],
            F_train[0],
            emb_layer.word2id,
            np.argmax(train_marginals, axis=1),
            train_idxs,
        ),
        split="valid",
        batch_size=100,
        shuffle=False,
    )

    emmental.Meta.reset()
    emmental.init(fonduer.Meta.log_path)
    emmental.Meta.update_config(config=config)

    tasks = create_task(ATTRIBUTE,
                        2,
                        F_train[0].shape[1],
                        2,
                        emb_layer,
                        model="LogisticRegression")

    model = EmmentalModel(name=f"{ATTRIBUTE}_task")

    for task in tasks:
        model.add_task(task)

    emmental_learner = EmmentalLearner()
    emmental_learner.learn(model, [train_dataloader, valid_dataloader])

    test_preds = model.predict(test_dataloader, return_preds=True)
    positive = np.where(
        np.array(test_preds["probs"][ATTRIBUTE])[:, TRUE] > 0.7)
    true_pred = [test_cands[0][_] for _ in positive[0]]

    (TP, FP, FN) = entity_level_f1(true_pred,
                                   gold_file,
                                   ATTRIBUTE,
                                   test_docs,
                                   parts_by_doc=parts_by_doc)

    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")

    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")

    assert f1 > 0.7

    # Testing LSTM
    emmental.Meta.reset()
    emmental.init(fonduer.Meta.log_path)
    emmental.Meta.update_config(config=config)

    tasks = create_task(ATTRIBUTE,
                        2,
                        F_train[0].shape[1],
                        2,
                        emb_layer,
                        model="LSTM")

    model = EmmentalModel(name=f"{ATTRIBUTE}_task")

    for task in tasks:
        model.add_task(task)

    emmental_learner = EmmentalLearner()
    emmental_learner.learn(model, [train_dataloader])

    test_preds = model.predict(test_dataloader, return_preds=True)
    positive = np.where(
        np.array(test_preds["probs"][ATTRIBUTE])[:, TRUE] > 0.7)
    true_pred = [test_cands[0][_] for _ in positive[0]]

    (TP, FP, FN) = entity_level_f1(true_pred,
                                   gold_file,
                                   ATTRIBUTE,
                                   test_docs,
                                   parts_by_doc=parts_by_doc)

    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")

    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")

    assert f1 > 0.7
Example #14
    def run_labeling_functions(cands):
        ABSTAIN = -1
        FALSE = 0
        TRUE = 1
        # Extract candidates
        train_cands = cands[0]
        dev_cands = cands[1]
        test_cands = cands[2] 

        @labeling_function()
        def LF_other_station_table(c):
            station_span = c.station.context.get_span().lower()
            neighbour_cells = get_neighbor_cell_ngrams_own(c.price, dist=100, directions=True, n_max = 4, absolute = True)
            up_cells = [x for x in neighbour_cells if len(x) > 1 and x[1] == 'DOWN' and x[0] in stations_list]
            # No station name in upper cells
            if (len(up_cells) == 0):
                return ABSTAIN
            # Check if the next upper aligned station-span corresponds to the candidate span (or equivalents)
            closest_header = up_cells[len(up_cells)-1]
            return TRUE if closest_header[0] in stations_mapping_dict[station_span] else FALSE

        @labeling_function()
        def LF_station_non_meta_tag(c):
            html_tags = get_ancestor_tag_names(c.station)
            return FALSE if ('head' in html_tags and 'title' in html_tags) else ABSTAIN

        # Basic constraint for the price LFs to be true -> no wrong station (increase accuracy)
        def base(c):
            return (
                LF_station_non_meta_tag(c) != 0 and 
                LF_other_station_table(c) != 0 and 
                LF_off_peak_head(c) != 0 and
                LF_purchases(c)
            )

        # 2.) Create labeling functions 
        @labeling_function()
        def LF_on_peak_head(c):
            return TRUE if 'on peak' in get_aligned_ngrams(c.price, n_min=2, n_max=2)  and base(c) else ABSTAIN

        @labeling_function()
        def LF_off_peak_head(c):
            return FALSE if 'off peak' in get_aligned_ngrams(c.price, n_min=2, n_max=2) else ABSTAIN

        @labeling_function()
        def LF_price_range(c):
            price = float(c.price.context.get_span())
            return TRUE if price > 0 and price < 1000 and base(c) else FALSE

        @labeling_function()
        def LF_price_head(c):
            return TRUE if 'price' in get_aligned_ngrams(c.price) and base(c) else ABSTAIN

        @labeling_function()
        def LF_firm_head(c):
            return TRUE if 'firm' in get_aligned_ngrams(c.price) and base(c) else ABSTAIN

        @labeling_function()
        def LF_dollar_to_left(c):
            return TRUE if '$' in get_left_ngrams(c.price, window=2) and base(c) else ABSTAIN

        @labeling_function()
        def LF_purchases(c):
            return FALSE if 'purchases' in get_aligned_ngrams(c.price, n_min=1) else ABSTAIN

        station_price_lfs = [
            LF_other_station_table,
            LF_station_non_meta_tag,

            # indicator
            LF_price_range,

            # negative indicators
            LF_off_peak_head,
            LF_purchases,

            # positive indicators
            LF_on_peak_head,    
            LF_price_head,
            LF_firm_head,
            LF_dollar_to_left,
        ]

        # 3.) Apply the LFs on the training set
        labeler = Labeler(session, [StationPrice])
        labeler.apply(split=0, lfs=[station_price_lfs], train=True, clear=True, parallelism=PARALLEL)
        L_train = labeler.get_label_matrices(train_cands)

        # Check that LFs are all applied (avoid crash)
        applied_lfs = L_train[0].shape[1]
        has_non_applied = applied_lfs != len(station_price_lfs)
        print(f"Labeling functions on train_cands not ABSTAIN: {applied_lfs} (/{len(station_price_lfs)})")

        if (has_non_applied):
            applied_lfs = get_applied_lfs(session)
            non_applied_lfs = [l.name for l in station_price_lfs if l.name not in applied_lfs]
            print(f"Labling functions {non_applied_lfs} are not applied.")
            station_price_lfs = [l for l in station_price_lfs if l.name in applied_lfs]

        # 4.) Evaluate their accuracy
        L_gold_train = labeler.get_gold_labels(train_cands, annotator='gold')
        # Sort LFs for LFAnalysis because LFAnalysis does not sort LFs,
        # while columns of L_train are sorted alphabetically already.
        sorted_lfs = sorted(station_price_lfs, key=lambda lf: lf.name)
        LFAnalysis(L=L_train[0], lfs=sorted_lfs).lf_summary(Y=L_gold_train[0].reshape(-1))

        # 5.) Build generative model
        gen_model = LabelModel(cardinality=2)
        gen_model.fit(L_train[0], n_epochs=500, log_freq=100)

        train_marginals_lfs = gen_model.predict_proba(L_train[0])

        # Apply on dev-set
        labeler.apply(split=1, lfs=[station_price_lfs], clear=True, parallelism=PARALLEL)
        L_dev = labeler.get_label_matrices(dev_cands)

        L_gold_dev = labeler.get_gold_labels(dev_cands, annotator='gold')
        LFAnalysis(L=L_dev[0], lfs=sorted_lfs).lf_summary(Y=L_gold_dev[0].reshape(-1))
        return (gen_model, train_marginals_lfs)
Example #15
# We perform a simple random hyperparameter search over learning rate and L2 regularization, using our small labeled development set to choose the best model.

# %%
from snorkel.labeling.model import LabelModel
from snorkel.utils import probs_to_preds
from snorkel.analysis import metric_score

label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train,
                log_freq=10,
                lr=0.05,
                class_balance=[0.7, 0.3],
                n_epochs=100)

# %%
Y_probs_valid = label_model.predict_proba(L_valid)
Y_preds_valid = probs_to_preds(Y_probs_valid)
metric_score(Y_valid, Y_preds_valid, probs=None, metric="f1")

# %% [markdown]
# **Majority Vote**

# %%
from snorkel.labeling.model import MajorityLabelVoter

mv_model = MajorityLabelVoter()
Y_probs_valid = mv_model.predict_proba(L_valid)
Y_preds_valid = probs_to_preds(Y_probs_valid)
metric_score(Y_valid, Y_preds_valid, probs=None, metric="f1")

# %%
Example #16
                                    Y=y_dev,
                                    tie_break_policy="random")["accuracy"]
print(f'label model acc: {label_model_acc}')

print('fitting Majority Label Voter model')
majority_model = MajorityLabelVoter(cardinality=config['cardinality'])
# preds_train = majority_model.predict(L=L_train)
majority_acc = majority_model.score(L=L_dev,
                                    Y=np.array(y_dev).reshape(-1, 1),
                                    tie_break_policy="random")["accuracy"]
print(f'majority_label_acc: {majority_acc}')

log_metric('majority_label_acc', majority_acc)
log_metric('label_model_acc', label_model_acc)

probs_train = label_model.predict_proba(L=L_train)
df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=X_train, y=probs_train, L=L_train)

print('setting up features for the downstream classifier')
stop_words = config['stop_words']
custom_stop_words = text.ENGLISH_STOP_WORDS.union(stop_words)
# vectorizer = CountVectorizer(ngram_range=(1, 5))
vectorizer = TfidfVectorizer(stop_words=custom_stop_words).fit(
    X_train.text.tolist())
X_train_vectorized = vectorizer.transform(X_train.text.tolist())
X_train_filtered_vectorized = vectorizer.transform(
    df_train_filtered.text.tolist())
preds_train_filtered = probs_to_preds(
    probs=probs_train_filtered
)  # using weak labels generated by Label Model to train downstream classifier
Example #17
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)

    L_test = applier.apply(test_df)
    # to_numerical = lambda x: x=='leave'
    # Y_test = [to_numerical(item) for item in test_df.label]
    Y_test = []
    for item in test_df.label:
        if item == 'stay':
            Y_test.append(STAY)
        else:
            Y_test.append(LEAVE)

    Y_test = np.asarray(Y_test)
    label_model_performance = label_model.score(L=L_test, Y=Y_test, tie_break_policy="random",
                                                metrics=['accuracy', 'precision', 'recall', 'f1'])
    print(f"Label Model Accuracy: {label_model_performance['accuracy'] * 100:.1f}%")
    predict_probs = label_model.predict_proba(L_unlabeled)
    preds = probs_to_preds(predict_probs)
    pred_labels = []
    for i in range(len(preds)):
        if preds[i]:
            pred_labels.append('leave')
        else:
            pred_labels.append('stay')
    unlabeled_data['label'] = pred_labels
    unlabeled_data.to_csv(os.path.join(data_dir, 'snorkel_labeled_data.csv'), sep=',', index=False)


Example #18
def model_analysis(label_model: LabelModel,
                   training_set: pd.DataFrame,
                   L_train: np.ndarray,
                   L_test: np.ndarray,
                   Y_test: np.ndarray,
                   lfs: list,
                   output_file="output") -> None:
    # TODO: consider using **kwargs instead of this painful list of arguments
    """Output analysis for the label model to a file

    :param label_model: The current label model which we want to output analysis for
    :type label_model: LabelModel
    :param training_set: A dataframe containing the training dataset
    :type training_set: pd.DataFrame
    :param L_train: The matrix of labels generated by the labeling functions on the training data
    :type L_train: np.ndarray
    :param L_test: The matrix of labels generated by the labeling functions on the testing data
    :type L_test: np.ndarray
    :param Y_test: Gold labels associated with data points in L_test
    :type Y_test: np.ndarray
    :param lfs: List of labeling functions
    :type lfs: list
    :param output_file: A path where the output file should be written to, defaults to `PROJECT_ROOT/output`
    :type output_file: str, optional
    """
    Y_train = label_model.predict_proba(L=L_train)
    Y_pred = label_model.predict(L=L_test, tie_break_policy="abstain")
    lf_analysis_train = LFAnalysis(L=L_train, lfs=lfs).lf_summary()

    # TODO: Write this df to a output file. Ask Jennifer about how to handle this
    print(lf_analysis_train)

    # build majority label voter model
    majority_model = MajorityLabelVoter()
    majority_acc = majority_model.score(L=L_test,
                                        Y=Y_test,
                                        tie_break_policy="abstain",
                                        metrics=["f1", "accuracy"])
    label_model_acc = label_model.score(L=L_test,
                                        Y=Y_test,
                                        tie_break_policy="abstain",
                                        metrics=["f1", "accuracy"])

    # get precision and recall scores
    p_score = precision_score(y_true=Y_test, y_pred=Y_pred, average='weighted')
    r_score = recall_score(y_true=Y_test,
                           y_pred=Y_pred,
                           average='weighted',
                           labels=np.unique(Y_pred))

    # how many documents abstained
    probs_train = majority_model.predict_proba(L=L_train)
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=training_set, y=probs_train, L=L_train)

    # get number of false positives
    buckets = get_label_buckets(Y_test, Y_pred)
    true_positives, false_positives, true_negatives, false_negatives = (
        buckets.get((1, 1)), buckets.get((1, 0)), buckets.get(
            (0, 0)), buckets.get((0, 1)))
    # write analysis to file
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    with open(f"{'../output/logs/'}{output_file}_run_{timestamp}.txt",
              "w") as output_file:
        output_file.write(
            f"{'Majority Vote Accuracy:':<25} {majority_acc['accuracy'] * 100:.2f}%"
        )
        output_file.write(
            f"\n{'Majority Vote F1 Score:':<25} {majority_acc['f1'] * 100:.2f}%"
        )
        output_file.write(
            f"\n{'Label Model Accuracy:':<25} {label_model_acc['accuracy'] * 100:.2f}%"
        )
        output_file.write(
            f"\n{'Label Model F1 Score:':<25} {label_model_acc['f1'] * 100:.2f}%"
        )
        output_file.write(f"\n{'Precision Score:':<25} {p_score * 100:.2f}%")
        output_file.write(f"\n{'Recall Score:':<25} {r_score * 100:.2f}%")
        output_file.write(
            f"\n{'Abstained Data Points:':<25} {len(df_train_filtered)}")
        output_file.write(
            f"\n{'True Positives:':<25} {len(true_positives) if true_positives is not None else 0}"
        )
        output_file.write(
            f"\n{'False Positives:':<25} {len(false_positives) if false_positives is not None else 0}"
        )
        output_file.write(
            f"\n{'False Negatives:':<25} {len(false_negatives) if false_negatives is not None else 0}"
        )
        output_file.write(
            f"\n{'True Negatives:':<25} {len(true_negatives) if true_negatives is not None else 0}"
        )
        output_file.write(
            f"\n{'Abstained Positives:':<25} {len(buckets[(1, -1)])}")
        output_file.write(
            f"\n{'Abstained Negatives:':<25} {len(buckets[(0, -1)])}")
Example #19
# %% {"tags": ["md-exclude-output"]}
from snorkel.labeling.model import LabelModel

label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, Y_dev, n_epochs=5000, log_freq=500, seed=12345)

# %% [markdown]
# ### Label Model Metrics
# Since our dataset is highly unbalanced (91% of the labels are negative), even a trivial baseline that always outputs negative can get a high accuracy. So we evaluate the label model using the F1 score and ROC-AUC rather than accuracy.

# %%
from snorkel.analysis import metric_score
from snorkel.utils import probs_to_preds

probs_dev = label_model.predict_proba(L_dev)
preds_dev = probs_to_preds(probs_dev)
print(
    f"Label model f1 score: {metric_score(Y_dev, preds_dev, probs=probs_dev, metric='f1')}"
)
print(
    f"Label model roc-auc: {metric_score(Y_dev, preds_dev, probs=probs_dev, metric='roc_auc')}"
)

# %% [markdown]
# ### Part 4: Training our End Extraction Model
#
# In this final section of the tutorial, we'll use our noisy training labels to train our end machine learning model. We start by filtering out training data points which did not receive a label from any LF, as these data points contain no signal.
#
# %%
from snorkel.labeling import filter_unlabeled_dataframe
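
# %% [markdown]
# A minimal sketch of the filtering step described above; `df_train` is assumed to be the
# training DataFrame, and `probs_train` the label model's probabilistic labels for `L_train`,
# following the naming used elsewhere in this tutorial.

# %%
probs_train = label_model.predict_proba(L_train)
df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=df_train, y=probs_train, L=L_train
)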