예제 #1
0
def main(data_path, output_path):
    """Label a parquet dataset with a Snorkel label model, using Spark.

    Reads the parquet data at ``data_path``, applies the labeling functions
    to the underlying RDD, fits a two-class ``LabelModel`` on the resulting
    label matrix and writes the data, extended with a ``y_prob`` column, as
    parquet to ``output_path`` (overwriting any existing output).
    """
    logging.info(f"Reading data from {data_path}")
    spark_context = SparkContext()
    sql_context = SQLContext(spark_context)
    data = sql_context.read.parquet(data_path)

    logging.info("Applying LFs")
    labeling_functions = [
        article_mentions_person,
        body_contains_fortune,
        person_in_db,
    ]
    L = SparkLFApplier(labeling_functions).apply(data.rdd)

    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(L)

    logging.info("Generating probabilistic labels")
    # Column 1 of the predicted probabilities is kept as the label score.
    positive_probs = label_model.predict_proba(L)[:, 1]
    # NOTE(review): this builds one array expression from all probabilities
    # and attaches it as the column value - confirm that a per-row alignment
    # is not what was intended.
    prob_literals = [F.lit(prob) for prob in positive_probs]
    data_labeled = data.withColumn("y_prob", F.array(prob_literals))
    data_labeled.write.mode("overwrite").parquet(output_path)
    logging.info(f"Labels saved to {output_path}")
def generate_labels_with_snorkel(dataframe):
    """
    Label the full data set with a Snorkel label model.

    :param dataframe: Pandas dataframe containing all data; a ``label``
        column is written onto it in place
    :return: the rows of the dataframe whose label is not ``ABSTAIN``
    """
    # Labeling functions (LFs) that vote on every row.
    labeling_functions = [
        lf_ubo_is_company,
        lf_troika_company,
        lf_uk_blacklisted_company,
        lf_non_uk_blacklisted_company,
    ]

    # Build the label matrix from the unlabeled data.
    L_train = PandasLFApplier(labeling_functions).apply(dataframe)

    # Fit the label model, then predict one label per row; ties abstain.
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
    predicted = label_model.predict(L=L_train, tie_break_policy="abstain")
    dataframe["label"] = predicted

    # Keep only the rows that received an actual label.
    labeled_rows = dataframe.label != ABSTAIN
    return dataframe[labeled_rows]
예제 #3
0
 def test_class_balance(self):
     """The class balance is derived from the dev labels when no prior is given."""
     # Six zeros and four ones.
     dev_labels = np.array([0, 0, 1, 1, 0, 0, 0, 0, 1, 1])
     expected_balance = np.array([0.6, 0.4])

     model = LabelModel(cardinality=2, verbose=False)
     model._set_class_balance(class_balance=None, Y_dev=dev_labels)

     np.testing.assert_array_almost_equal(model.p, expected_balance)
 def test_optimizer(self):
     """Known optimizer names train without error; an unknown name raises."""
     votes = np.array([[0, -1, 0], [0, 1, 0]])
     model = LabelModel(cardinality=2, verbose=False)

     for optimizer_name in ("sgd", "adam", "adamax"):
         model.fit(votes, n_epochs=1, optimizer=optimizer_name)

     with self.assertRaisesRegex(ValueError, "Unrecognized optimizer option"):
         model.fit(votes, n_epochs=1, optimizer="bad_opt")
    def test_set_mu_eps(self):
        """A conditional probability that would be 0.0 is floored at ``mu_eps``."""
        floor = 0.0123

        # Every LF votes 1 on every row, so P(\lambda_1 = 0 | Y) = 0.0 and the
        # estimate is expected to land on the mu_eps floor.
        votes = np.array([[1, 1, 1], [1, 1, 1]])
        model = LabelModel(verbose=False)
        model.fit(votes, mu_eps=floor)

        conditional_probs = model.get_conditional_probs()
        self.assertAlmostEqual(conditional_probs[0, 1, 0], floor)
def train_model_random_lfs(randomly_sampled_lfs, train_matrix, dev_matrix,
                           dev_labels, test_matrix, regularization_grid):
    """Fit one label model per LF sample, tuning ``l2`` by dev-set AUROC.

    For every entry of ``randomly_sampled_lfs`` (used as a column selection
    into the label matrices) a model is fitted for each value in
    ``regularization_grid``. The value with the highest dev AUROC is then
    used to re-fit the most recently created model.

    Returns four dicts keyed by ``"<sample position>:<comma-joined columns>"``:
    predicted probabilities on the train, dev and test matrices, and the
    fitted models.
    """
    grid_scores = defaultdict(dict)
    train_grid_results = defaultdict(dict)
    dev_grid_results = defaultdict(dict)
    test_grid_results = defaultdict(dict)
    models = defaultdict(dict)

    for sample_idx, lf_columns in tqdm_notebook(enumerate(randomly_sampled_lfs)):
        for l2_strength in regularization_grid:
            label_model = LabelModel(cardinality=2)
            label_model.fit(train_matrix[:, lf_columns],
                            n_epochs=1000,
                            seed=100,
                            lr=0.01,
                            l2=l2_strength)

            # ROC curve of the class-1 marginals on the dev set.
            dev_positive = label_model.predict_proba(
                dev_matrix[:, lf_columns])[:, 1]
            grid_scores[str(l2_strength)] = roc_curve(dev_labels, dev_positive)

        # Collapse each stored curve (first two components) into its AUROC.
        grid_scores = {
            name: auc(curve[0], curve[1])
            for name, curve in grid_scores.items()
        }

        # The grid value with the highest AUROC wins.
        best_name, _ = max(grid_scores.items(), key=operator.itemgetter(1))
        best_param = float(best_name)

        # Re-fit the last model created above with the winning value.
        label_model.fit(train_matrix[:, lf_columns],
                        n_epochs=1000,
                        seed=100,
                        lr=0.01,
                        l2=best_param)

        # Record the marginals and the model under a per-sample key.
        key = f'{sample_idx}:{",".join(map(str, lf_columns))}'
        train_grid_results[key] = label_model.predict_proba(
            train_matrix[:, lf_columns])
        dev_grid_results[key] = label_model.predict_proba(
            dev_matrix[:, lf_columns])
        test_grid_results[key] = label_model.predict_proba(
            test_matrix[:, lf_columns])
        models[key] = label_model

    return train_grid_results, dev_grid_results, test_grid_results, models
    def test_score(self):
        """``score`` returns the requested metrics as a plain dict."""
        votes = np.array([[1, 1, 0], [-1, -1, -1], [1, 0, 1]])
        gold = np.array([1, 0, 1])
        label_model = LabelModel(cardinality=2, verbose=False)
        label_model.fit(votes, n_epochs=100)
        results = label_model.score(votes, gold, metrics=["accuracy", "coverage"])

        # The row consisting only of -1 votes is expected to be predicted as -1.
        np.testing.assert_array_almost_equal(
            label_model.predict(votes), np.array([1, -1, 1])
        )
        self.assertEqual(results, {"accuracy": 1.0, "coverage": 2 / 3})

        # Second scenario: a model set up by the test helper, with mu replaced
        # by its initial value clamped into [0.01, 0.99].
        votes = np.array([[1, 0, 1], [1, 0, 1]])
        label_model = self._set_up_model(votes)
        clamped_mu = label_model.mu_init.clone().clamp(0.01, 0.99)
        label_model.mu = nn.Parameter(clamped_mu)

        results = label_model.score(votes, Y=np.array([0, 1]))
        self.assertEqual(results, {"accuracy": 0.5})

        results = label_model.score(
            L=votes, Y=np.array([1, 0]), metrics=["accuracy", "f1"]
        )
        self.assertEqual(results, {"accuracy": 0.5, "f1": 2 / 3})
예제 #8
0
def load_snorkel_ee_components(save_path: Union[str, Path]) \
        -> Tuple[LabelModel, LabelModel]:
    """Load the saved trigger and role label models from ``save_path``.

    :param save_path: directory holding ``trigger_lm.pt`` and ``role_lm.pt``
    :return: ``(trigger_label_model, role_label_model)``
    :raises AssertionError: if ``save_path`` does not exist
    """
    save_dir = Path(save_path)
    assert save_dir.exists(), f"Save path does not exist: {save_dir}"

    # Build each model and load its state, trigger model first.
    loaded_models = []
    for file_name in ('trigger_lm.pt', 'role_lm.pt'):
        model: LabelModel = LabelModel()
        model.load(save_dir.joinpath(file_name))
        loaded_models.append(model)

    trigger_label_model, role_label_model = loaded_models
    return trigger_label_model, role_label_model
예제 #9
0
def get_snorkel_labels(train_df, lfs, labels):
    """Fit a label model on ``train_df`` and return the rows it could label.

    Each callable in ``lfs`` is wrapped as a labeling function named after
    the callable. The model cardinality is ``len(labels)``.

    :return: ``(df_filtered, probs_filtered)`` as produced by
        ``filter_unlabeled_dataframe`` from the data, the predicted
        probabilities and the label matrix
    """
    wrapped_lfs = [labeling_function(name=lf.__name__)(lf) for lf in lfs]
    applier = PandasLFApplier(wrapped_lfs)
    label_model = LabelModel(cardinality=len(labels), verbose=True)

    L_train = applier.apply(df=train_df)
    label_model.fit(L_train, n_epochs=500, lr=0.001, log_freq=100, seed=123)
    L_probs = label_model.predict_proba(L=L_train)

    df_filtered, probs_filtered = filter_unlabeled_dataframe(
        X=train_df, y=L_probs, L=L_train)
    return df_filtered, probs_filtered
    def test_model_loss(self):
        """Longer training does not increase the loss; a huge lr yields NaN."""
        votes = np.array([[0, -1, 0], [0, 1, 0]])
        model = LabelModel(cardinality=2, verbose=False)

        def loss_after(epochs):
            # Fit for the given number of epochs and report the mu loss.
            model.fit(votes, n_epochs=epochs)
            return model._loss_mu().item()

        init_loss = loss_after(1)
        next_loss = loss_after(10)
        self.assertLessEqual(next_loss, init_loss)

        with self.assertRaisesRegex(Exception, "Loss is NaN."):
            model.fit(votes, n_epochs=10, lr=1e8)
예제 #11
0
파일: wrapper.py 프로젝트: laranea/snorkel
    def __init__(
        self,
        cardinality: int = 2,
        verbose: bool = True,
        device: str = "cpu",
        metric: str = "accuracy",
        tie_break_policy: str = "abstain",
        n_epochs: int = 100,
        lr: float = 0.01,
        l2: float = 0.0,
        optimizer: str = "sgd",
        optimizer_config: Optional[OptimizerConfig] = None,
        lr_scheduler: str = "constant",
        lr_scheduler_config: Optional[LRSchedulerConfig] = None,
        prec_init: float = 0.7,
        seed: int = np.random.randint(1e6),
        log_freq: int = 10,
        mu_eps: Optional[float] = None,
        class_balance: Optional[List[float]] = None,
        **kwargs: Any,
    ) -> None:
        """Store the settings on the instance and build the wrapped ``LabelModel``.

        Every named argument is kept as an attribute of the same name. A
        missing ``optimizer_config`` / ``lr_scheduler_config`` is replaced by
        a default-constructed config object. The wrapped model is created
        from ``cardinality``, ``verbose`` and ``device`` only.

        NOTE(review): the ``seed`` default is evaluated once, when the
        function is defined, so every instance created without an explicit
        seed in the same process shares that one value. ``**kwargs`` is
        accepted but not used in this constructor.
        """
        self.cardinality = cardinality
        self.verbose = verbose
        self.device = device
        self.metric = metric
        self.tie_break_policy = tie_break_policy
        self.n_epochs = n_epochs
        self.lr = lr
        self.l2 = l2
        self.optimizer = optimizer

        # Fall back to a default config object when none was supplied.
        if optimizer_config is None:
            optimizer_config = OptimizerConfig()  # type: ignore
        self.optimizer_config = optimizer_config

        self.lr_scheduler = lr_scheduler

        if lr_scheduler_config is None:
            lr_scheduler_config = LRSchedulerConfig()  # type: ignore
        self.lr_scheduler_config = lr_scheduler_config

        self.prec_init = prec_init
        self.seed = seed
        self.log_freq = log_freq
        self.mu_eps = mu_eps
        self.class_balance = class_balance

        self.label_model = LabelModel(
            cardinality=self.cardinality,
            verbose=self.verbose,
            device=self.device,
        )
예제 #12
0
    def __init__(self,
                 df_train,
                 df_dev,
                 df_valid,
                 df_test,
                 df_heldout,
                 lfs=None,
                 label_model=None):
        """Store the data splits, labeling functions and label model.

        :param df_train: training dataframe; a ``seen`` column set to 0 is
            written onto it in place, and ``self.df_train`` holds the result
            of ``reset_index()``. Its ``text`` column is used to fit the
            count vectorizer.
        :param df_dev: dev dataframe with a ``label`` column
        :param df_valid: validation dataframe with a ``label`` column; the
            number of distinct labels in it is the label model cardinality
        :param df_test: test dataframe with a ``label`` column
        :param df_heldout: held-out dataframe with a ``label`` column
        :param lfs: labeling functions to start with; when omitted each
            instance gets its own new empty dict
        :param label_model: an existing label model to use; when omitted a
            new ``LabelModel`` is created
        """
        df_train["seen"] = 0
        self.df_train = df_train.reset_index()
        self.df_dev = df_dev
        self.df_valid = df_valid
        self.df_test = df_test
        self.df_heldout = df_heldout
        self.Y_dev = df_dev.label.values
        self.Y_valid = df_valid.label.values
        self.Y_test = df_test.label.values
        self.Y_heldout = df_heldout.label.values

        # A ``{}`` default in the signature would be one dict shared by every
        # instance created without ``lfs``; build a fresh one per instance.
        self.lfs = {} if lfs is None else lfs

        # Label matrices are not computed here.
        self.L_train = None
        self.L_dev = None
        self.L_valid = None
        self.L_heldout = None
        cardinality = len(df_valid.label.unique())

        # for DEMOing purposes
        self.first_text_indices = [
            1262,  # "check out" "youtube"
            1892,  # I love
            1117,  # url concept
            1706,  # emoji concept
            952,  # "nice"
            971,  # positive concept
            958,  # actually use emoji concept
        ]

        self.count = 0

        if label_model is None:
            self.label_model = LabelModel(cardinality=cardinality,
                                          verbose=True)
        else:
            self.label_model = label_model

        # Unigram + bigram counts, fitted on the training texts.
        self.vectorizer = CountVectorizer(ngram_range=(1, 2))
        self.vectorizer.fit(df_train.text.tolist())
    def test_optimizer_init(self):
        """``fit`` stores an optimizer instance matching the requested name."""
        votes = np.array([[0, -1, 0], [0, 1, 0]])
        model = LabelModel()

        expected_classes = (
            ("sgd", optim.SGD),
            ("adam", optim.Adam),
            ("adamax", optim.Adamax),
        )
        for optimizer_name, optimizer_cls in expected_classes:
            model.fit(votes, optimizer=optimizer_name, n_epochs=1)
            self.assertIsInstance(model.optimizer, optimizer_cls)

        with self.assertRaisesRegex(ValueError, "Unrecognized optimizer"):
            model.fit(votes, optimizer="bad_optimizer", n_epochs=1)
예제 #14
0
 def load(self, dir_name):
     """Restore the labeling functions and the label model from ``dir_name``.

     Reads ``model_lfs.pkl`` and ``label_model.pkl`` from the directory and
     stores the results on ``self.lfs`` and ``self.label_model``.
     """
     lfs_path = os.path.join(dir_name, 'model_lfs.pkl')
     model_path = os.path.join(dir_name, 'label_model.pkl')

     # NOTE(review): unpickling can run arbitrary code - only point this at
     # directories whose contents are trusted.
     with open(lfs_path, "rb") as lfs_file:
         loaded_lfs = pickle.load(lfs_file)
         # NOTE(review): ``load`` is called on the class and its return value
         # is used as the model - confirm the installed LabelModel supports
         # that call form.
         loaded_model = LabelModel.load(model_path)

     self.lfs, self.label_model = loaded_lfs, loaded_model
 def test_save_and_load(self):
     """A fitted label model can be saved to a file and loaded back.

     The model file is written inside a temporary directory, which is
     removed afterwards even when saving or loading fails.
     """
     import os  # local import: only needed to build the file path

     L = np.array([[0, -1, 0], [0, 1, 0]])
     label_model = LabelModel(cardinality=2, verbose=False)
     label_model.fit(L, n_epochs=1)

     dir_path = tempfile.mkdtemp()
     try:
         # Join with a path separator so the file lands inside the temporary
         # directory; plain concatenation creates a sibling file that the
         # cleanup below would never remove.
         save_path = os.path.join(dir_path, "label_model")
         label_model.save(save_path)
         label_model.load(save_path)
     finally:
         shutil.rmtree(dir_path)
    def test_scheduler_init(self):
        """``fit`` stores the learning-rate scheduler matching the requested name."""
        votes = np.array([[0, -1, 0], [0, 1, 0]])
        model = LabelModel()

        # The constant schedule is expected to leave the scheduler unset.
        model.fit(votes, lr_scheduler="constant", n_epochs=1)
        self.assertIsNone(model.lr_scheduler)

        expected_classes = (
            ("linear", optim.lr_scheduler.LambdaLR),
            ("exponential", optim.lr_scheduler.ExponentialLR),
            ("step", optim.lr_scheduler.StepLR),
        )
        for scheduler_name, scheduler_cls in expected_classes:
            model.fit(votes, lr_scheduler=scheduler_name, n_epochs=1)
            self.assertIsInstance(model.lr_scheduler, scheduler_cls)
    def test_augmented_L_construction(self):
        """Check the shape and entries of the augmented label matrix.

        Builds the augmented matrix for a 3 x 5 label matrix with two classes
        and verifies its shape, its total, the per-vote indicator entries
        and the entries addressed through the first two clique-tree nodes.
        """
        # 3 data points, 5 LFs, 2 classes
        n = 3
        m = 5
        k = 2
        L = np.array([[0, 0, 0, 1, 0], [0, 1, 1, 0, -1], [0, 0, 0, 0, -1]])
        # Shift the votes by one so that an abstain (-1) becomes 0.
        L_shift = L + 1
        lm = LabelModel(cardinality=k, verbose=False)
        lm._set_constants(L_shift)
        lm._create_tree()
        L_aug = lm._get_augmented_label_matrix(L_shift, higher_order=True)

        # Should have 10 columns:
        # - 5 * 2 = 10 for the sources
        self.assertEqual(L_aug.shape, (3, 10))

        # 13 total nonzero entries
        self.assertEqual(L_aug.sum(), 13)

        # Next, check the singleton entries: every non-abstain vote sets the
        # indicator in its LF's group of k columns.
        for i in range(n):
            for j in range(m):
                if L_shift[i, j] > 0:
                    self.assertEqual(L_aug[i, j * k + L_shift[i, j] - 1], 1)

        # Finally, check the clique entries.
        # The ``nodes`` view is used instead of the ``node`` alias, which
        # NetworkX removed from graphs in 2.4 (assumes ``c_tree`` is a
        # NetworkX graph - TODO confirm).
        # Singleton clique 1
        clique_1 = lm.c_tree.nodes[1]
        self.assertEqual(len(clique_1["members"]), 1)
        j = clique_1["start_index"]
        self.assertEqual(L_aug[0, j], 1)

        # Singleton clique 2
        clique_2 = lm.c_tree.nodes[2]
        self.assertEqual(len(clique_2["members"]), 1)
        j = clique_2["start_index"]
        self.assertEqual(L_aug[0, j + 1], 0)
예제 #18
0
def predict_documents(documents: pd.DataFrame, trigger_label_model: LabelModel,
                      role_label_model: LabelModel):
    """Label documents with event triggers and roles and convert them to ACE format.

    :param documents: documents to label; default events are added first
        when neither the ``event_triggers`` nor the ``event_roles`` column
        is present
    :param trigger_label_model: label model used for the trigger examples
    :param role_label_model: label model used for the role examples
    :return: the result of ``ace_formatter.snorkel_to_ace_format`` applied to
        the documents updated with the merged trigger and role examples
    """
    # NOTE(review): defaults are only added when BOTH columns are missing -
    # confirm that a frame carrying just one of them is not expected here.
    if 'event_triggers' not in documents and 'event_roles' not in documents:
        documents = documents.apply(pipeline.add_default_events, axis=1)

    # 1. Get trigger probabilities
    df_predict_triggers, _ = pipeline.build_event_trigger_examples(documents)
    trigger_lf_applier = PandasLFApplier(pipeline.get_trigger_list_lfs())
    L_predict_triggers = trigger_lf_applier.apply(df_predict_triggers)
    event_trigger_probs = trigger_label_model.predict_proba(L_predict_triggers)

    merged_event_trigger_examples = pipeline.merge_event_trigger_examples(
        df_predict_triggers,
        utils.zero_out_abstains(event_trigger_probs, L_predict_triggers))

    # 2. Get role probabilities
    df_predict_roles, _ = pipeline.build_event_role_examples(documents)
    role_lf_applier = PandasLFApplier(pipeline.get_role_list_lfs())
    L_predict_roles = role_lf_applier.apply(df_predict_roles)
    event_roles_probs = role_label_model.predict_proba(L_predict_roles)

    merged_event_role_examples = pipeline.merge_event_role_examples(
        df_predict_roles,
        utils.zero_out_abstains(event_roles_probs, L_predict_roles))

    # 3. Update documents with trigger & role probabilities
    labeled_documents: pd.DataFrame = documents.copy()
    # Make sure to remove event_triggers and roles that were built per default.
    # The reset is done column-wise: writing to the rows yielded by
    # iterrows() is not guaranteed to reach the frame, so it could leave
    # the default values in place. Each row gets its own new empty list.
    for column in ('event_triggers', 'event_roles'):
        if column in labeled_documents:
            labeled_documents[column] = [
                [] for _ in range(len(labeled_documents))
            ]
    if 'id' in labeled_documents:
        labeled_documents.set_index('id', inplace=True)

    triggers = merged_event_trigger_examples[['event_triggers']]
    roles = merged_event_role_examples[['event_roles']]

    # update() aligns on the index and overwrites with the merged values.
    labeled_documents.update(triggers)
    labeled_documents.update(roles)

    labeled_documents.reset_index(level=0, inplace=True)

    # 4. Add ACE events
    labeled_documents = ace_formatter.snorkel_to_ace_format(labeled_documents)
    return labeled_documents
예제 #19
0
    def test_loss(self):
        """Check the l2 and mu losses after shifting ``mu`` by a fixed offset."""
        votes = np.array([[0, -1, 0], [0, 1, -1]])
        model = LabelModel(cardinality=2, verbose=False)
        model.fit(votes, n_epochs=1)

        offset = 0.05
        model.mu = nn.Parameter(model.mu_init.clone() + offset)

        # Every entry of mu now differs from mu_init by 0.05, i.e. adds
        # 0.05 ** 2 to the squared distance. The expected 0.03 corresponds to
        # 12 such entries - presumably mu has M * K rows and K columns
        # (3 * 2 * 2); TODO confirm. A scalar weight and a vector of ones
        # must give the same value.
        for l2_weight in (1.0, np.ones(6)):
            self.assertAlmostEqual(model._loss_l2(l2=l2_weight).item(), 0.03)

        # mu_loss = ||O - \mu^T P \mu||_2 + ||\mu^T P - diag(O)||_2
        self.assertAlmostEqual(model._loss_mu().item(), 0.675, 3)
    def test_class_balance(self):
        """Class balance is estimated from dev labels; invalid inputs raise."""
        model = LabelModel(cardinality=2, verbose=False)

        # Six zeros and four ones.
        dev_labels = np.array([0, 0, 1, 1, 0, 0, 0, 0, 1, 1])
        model._set_class_balance(class_balance=None, Y_dev=dev_labels)
        np.testing.assert_array_almost_equal(model.p, np.array([0.6, 0.4]))

        # Explicit priors that must be rejected, with the expected message.
        rejected_priors = (
            (np.array([0.0, 1.0]), "Class balance prior is 0"),
            (np.array([0.0]), "class_balance has 1 entries."),
        )
        for prior, message in rejected_priors:
            with self.assertRaisesRegex(ValueError, message):
                model._set_class_balance(class_balance=prior, Y_dev=dev_labels)

        # Dev labels that contain a single class only.
        single_class_labels = np.array([0, 0, 0])
        with self.assertRaisesRegex(
            ValueError, "Does not match LabelModel cardinality"
        ):
            model._set_class_balance(
                class_balance=None, Y_dev=single_class_labels
            )
    def test_mv_default(self):
        """Check predictions on matrices with few overlaps or conflicts."""
        model = LabelModel(cardinality=2, verbose=False)

        scenarios = (
            # less than 2 LFs have overlaps
            (np.array([[-1, -1, 1], [-1, 1, -1], [0, -1, -1]]),
             np.array([1, 1, 0])),
            # less than 2 LFs have conflicts
            (np.array([[-1, -1, 1], [-1, 1, 1], [1, 1, 1]]),
             np.array([1, 1, 1])),
        )
        for votes, expected in scenarios:
            model.fit(votes, n_epochs=100)
            np.testing.assert_array_almost_equal(model.predict(votes), expected)
    def test_labeling_convergence(self) -> None:
        """Test convergence of end to end labeling pipeline."""
        # One base LF plus a positive and a negative LF per divisor 2..8.
        divisors = range(2, 9)
        labeling_functions = [f]
        labeling_functions.extend(
            get_positive_labeling_function(divisor) for divisor in divisors
        )
        labeling_functions.extend(
            get_negative_labeling_function(divisor) for divisor in divisors
        )

        L_train = PandasLFApplier(labeling_functions).apply(
            self.df_train, progress_bar=False
        )
        expected_shape = (self.N_TRAIN, len(labeling_functions))
        self.assertEqual(L_train.shape, expected_shape)

        # Train the label model and take the most probable class per row.
        label_model = LabelModel(cardinality=self.cardinality, verbose=False)
        label_model.fit(L_train, n_epochs=100, lr=0.01, l2=0.0)
        predicted = label_model.predict_proba(L_train).argmax(axis=1)

        # Share of rows whose prediction differs from the true label.
        truth = self.df_train.y
        mismatches = np.where(truth != predicted, 1, 0).sum()
        err = mismatches / self.N_TRAIN
        self.assertLess(err, 0.05)
    def test_get_weight(self):
        """Learned LF weights are within 0.1 of the accuracies used to simulate votes."""
        # Per-LF accuracy and coverage used to simulate the label matrix.
        true_accs = [0.95, 0.6, 0.7, 0.55, 0.8]
        coverage = [1.0, 0.8, 1.0, 1.0, 1.0]
        n_points = 1000
        L = -1 * np.ones((n_points, len(true_accs)))
        Y = np.zeros(n_points)

        # NOTE(review): the draws come from the unseeded global NumPy RNG, so
        # the simulated data differs from run to run.
        for row in range(n_points):
            Y[row] = 1 if np.random.rand() <= 0.5 else 0
            for col in range(5):
                # The LF abstains (keeps -1) when it does not cover the row.
                if np.random.rand() > coverage[col]:
                    continue
                if np.random.rand() <= true_accs[col]:
                    L[row, col] = Y[row]
                else:
                    # Wrong vote: the opposite of the true binary label.
                    L[row, col] = np.abs(Y[row] - 1)

        label_model = LabelModel(cardinality=2)
        label_model.fit(L, n_epochs=1000, seed=123)

        accs = label_model.get_weights()
        for col, estimated_acc in enumerate(accs):
            self.assertAlmostEqual(estimated_acc, true_accs[col], delta=0.1)
예제 #24
0
def get_majority_vote_label(train_df, lfs, labels):
    """Label ``train_df`` by majority vote and keep the rows that got a label.

    Each callable in ``lfs`` is wrapped as a labeling function named after
    the callable. The voter cardinality is ``len(labels)``.

    :return: ``(df_filtered, probs_filtered)`` - the rows whose majority-vote
        prediction is non-negative, and those predictions. Despite its name,
        the second element holds predicted labels, not probabilities.
    """
    applier = PandasLFApplier(
        [labeling_function(name=lf.__name__)(lf) for lf in lfs])
    L_train = applier.apply(df=train_df)

    # Only the majority voter is needed; the LabelModel that used to be
    # constructed here was never used.
    majority_model = MajorityLabelVoter(cardinality=len(labels))
    preds_train = majority_model.predict(L=L_train)

    # Negative predictions are treated as "no label" and dropped.
    non_abstain_idxs = np.argwhere(preds_train >= 0).flatten()
    df_filtered = train_df.iloc[non_abstain_idxs]
    probs_filtered = preds_train[non_abstain_idxs]
    return df_filtered, probs_filtered
    def test_label_model_sparse(self) -> None:
        """Test the LabelModel's estimate of P and Y on a sparse synthetic dataset.

        This covers the common setting where LFs abstain most of the time,
        which can cause problems, for example when parameter clamping is set
        too high (see Issue #1422).
        """
        np.random.seed(123)
        P, Y, L = generate_simple_label_matrix(
            self.n, self.m, self.cardinality, abstain_multiplier=1000.0
        )

        label_model = LabelModel(cardinality=self.cardinality, verbose=False)
        label_model.fit(L, n_epochs=1000, lr=0.01, seed=123)

        # The estimated LF conditional probabilities must match the true ones.
        estimated_P = label_model.get_conditional_probs()
        np.testing.assert_array_almost_equal(P, estimated_P, decimal=2)

        # Accuracy is measured *only on non-abstained data points*.
        Y_pred = label_model.predict(L, tie_break_policy="abstain")
        (labeled_idx,) = np.where(Y_pred != -1)
        n_correct = np.where(Y_pred[labeled_idx] == Y[labeled_idx], 1, 0).sum()
        acc = n_correct / len(labeled_idx)
        self.assertGreaterEqual(acc, 0.65)

        # Make sure that we don't output abstain when an LF votes, per issue
        # #1422: rows whose shifted votes do not sum to zero must be labeled.
        n_rows_with_votes = np.where((L + 1).sum(axis=1) != 0, 1, 0).sum()
        self.assertEqual(len(labeled_idx), n_rows_with_votes)
def snorkel_process(keylist, dataframe, allweaklabf):
    """Weakly label ``dataframe`` and return one-hot train/test targets.

    Applies the labeling functions, prints the LF summary, fits a label model
    with one class per entry of ``keylist`` and writes the predicted label to
    the ``L_label`` column of ``dataframe`` (in place). Rows with a negative
    label are dropped and the rest is split 80/20.

    :return: ``(trainsent, trainlabe2, testsent, testlabe2, keylist, report)``
        where the label arrays are 0/1 matrices built from the predicted
        probabilities.
    """
    def keep_top_only(row):
        # Zero every entry except the one ranked first by descending value.
        losers = (-row).argsort()[1:]
        row[losers] = 0
        return row

    def to_one_hot(frame):
        # Work on a copy of the probability columns. The return value of
        # apply_along_axis is discarded, so this relies on keep_top_only
        # modifying the rows of the copy in place.
        probs = frame[keylist].values.copy()
        np.apply_along_axis(keep_top_only, 1, probs)
        return np.where(probs > 0, 1, 0)

    cardinalitynu = len(keylist)
    all_train_l = PandasLFApplier(lfs=allweaklabf).apply(df=dataframe)

    report = LFAnalysis(L=all_train_l, lfs=allweaklabf).lf_summary()
    print(report)

    label_model = LabelModel(cardinality=cardinalitynu, verbose=False)
    label_model.fit(all_train_l)
    hard_labels = label_model.predict(all_train_l)
    class_probs = label_model.predict_proba(all_train_l)

    # One probability column per key, joined onto the labeled data.
    prob_frame = pd.DataFrame(class_probs, columns=keylist.copy())
    dataframe['L_label'] = hard_labels
    joined = dataframe.join(prob_frame, how='outer')
    joined = joined[joined.L_label >= 0]

    train, test = train_test_split(joined, test_size=0.2)

    trainsent = train.sent.values
    trainlabe2 = to_one_hot(train)
    testsent = test.sent.values
    testlabe2 = to_one_hot(test)
    return trainsent, trainlabe2, testsent, testlabe2, keylist, report
def curate_twitter(save_name='../../pandafied_data/curated_twitter.csv'):
    """Keep the tweets labeled ``WATER`` and write them to ``save_name``.

    Reads the pandafied twitter CSV, labels it with a label model fitted on a
    single labeling function, prints every tweet labeled ``WATER`` together
    with summary counts, and saves those tweets (without the label column)
    as CSV.
    """
    df_train = pd.read_csv('../../pandafied_data/pandafied_twitter.csv')

    # Only the water-nuisance keyword LF is currently active.
    lfs = [lf_keyword_wateroverlast]

    # Build the label matrix from the unlabeled data.
    L_train = PandasLFApplier(lfs).apply(df_train)

    # Fit the label model and predict one label per row; ties abstain.
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
    df_train["label"] = label_model.predict(L=L_train,
                                            tie_break_policy="abstain")

    # Print each tweet labeled WATER; positions equal index labels here
    # because read_csv assigns a default integer index.
    counter = 0
    for i, label in enumerate(df_train["label"]):
        if label != WATER:
            continue
        print()
        print(df_train["text"][i])
        print(label)
        print()
        counter += 1

    print(f"num entries total: {len(df_train['label'])}")
    print(f"num entries water: {counter}")

    water_rows = df_train[df_train.label == WATER]
    twitter_curated = water_rows.drop(columns='label')
    twitter_curated.to_csv(save_name, index=False)
    def test_L_form(self):
        """Matrix dimensions are recorded and malformed label matrices are rejected."""
        model = LabelModel(cardinality=2, verbose=False)

        # A 4 x 3 matrix sets n and m to its dimensions.
        valid_votes = np.array(
            [[-1, 1, -1], [-1, 1, -1], [1, -1, -1], [-1, 1, -1]]
        )
        model._set_constants(valid_votes)
        self.assertEqual(model.n, 4)
        self.assertEqual(model.m, 3)

        rejected = (
            # Contains the label 2, too many values for a cardinality-2 model.
            (np.array([[-1, 0, 1], [-1, 0, 2], [0, -1, 2], [-1, 0, -1]]),
             "L_train has cardinality"),
            # Only two columns.
            (np.array([[0, 1], [1, 1], [0, 1]]),
             "L_train should have at least 3"),
        )
        for votes, message in rejected:
            with self.assertRaisesRegex(ValueError, message):
                model.fit(votes, n_epochs=1)
    def test_warmup(self):
        """Warmup given in epochs or as a percentage resolves to the same step count."""
        votes = np.array([[0, -1, 0], [0, 1, 0]])
        model = LabelModel()

        # Both configurations describe 3 warmup steps out of 5 epochs.
        equivalent_configs = (
            {"warmup_steps": 3, "warmup_unit": "epochs"},
            {"warmup_percentage": 3 / 5},
        )
        for scheduler_config in equivalent_configs:
            model.fit(votes, lr_scheduler_config=scheduler_config, n_epochs=5)
            self.assertEqual(model.warmup_steps, 3)

        # A warmup unit of batches must be rejected.
        batch_config = {"warmup_steps": 1, "warmup_unit": "batches"}
        with self.assertRaisesRegex(ValueError, "LabelModel does not support"):
            model.fit(votes, lr_scheduler_config=batch_config)
예제 #30
0
def main(data_path, output_path):
    """Label a parquet dataset with a Snorkel label model, using Dask.

    Reads the parquet data at ``data_path`` into two partitions, applies the
    labeling functions, fits a two-class ``LabelModel`` on the resulting label
    matrix and writes the data, extended with a ``y_prob`` column, as parquet
    to ``output_path``.
    """
    logging.info(f"Reading data from {data_path}")
    data = dd.read_parquet(data_path).repartition(npartitions=2)

    logging.info("Applying LFs")
    labeling_functions = [
        article_mentions_person,
        body_contains_fortune,
        person_in_db,
    ]
    L = DaskLFApplier(labeling_functions).apply(data)

    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(L)

    logging.info("Generating probabilistic labels")
    # Column 1 of the predicted probabilities is kept as the label score.
    positive_probs = label_model.predict_proba(L)[:, 1]
    # Re-index the data on the "index" column before attaching the scores.
    indexed = data.reset_index().set_index("index")
    data_labeled = indexed.assign(y_prob=dd.from_array(positive_probs))
    dd.to_parquet(data_labeled, output_path)
    logging.info(f"Labels saved to {output_path}")