Exemplo n.º 1
def main(data_path, output_path):
    """Label a Parquet dataset with a Snorkel label model on Spark.

    Reads the data at ``data_path``, applies the labeling functions to its
    RDD, fits a binary ``LabelModel`` on the resulting label matrix and writes
    the data, extended with a ``y_prob`` column, to ``output_path``.

    :param data_path: location of the input Parquet data
    :param output_path: location the labeled Parquet data is written to
    """
    # Read data
    logging.info(f"Reading data from {data_path}")
    sc = SparkContext()
    sql = SQLContext(sc)
    data = sql.read.parquet(data_path)

    # Build label matrix
    logging.info("Applying LFs")
    lfs = [article_mentions_person, body_contains_fortune, person_in_db]
    applier = SparkLFApplier(lfs)
    L = applier.apply(data.rdd)

    # Train label model (the model must be fitted before it can predict)
    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(L)

    # Generate training labels: keep the probability of class 1 only
    logging.info("Generating probabilistic labels")
    y_prob = label_model.predict_proba(L)[:, 1]
    # NOTE(review): F.array over literals yields one array-valued column, so
    # each row appears to receive the whole vector -- confirm this is intended.
    y_prob_sql_array = F.array([F.lit(y) for y in y_prob])
    data_labeled = data.withColumn("y_prob", y_prob_sql_array)
    # Persist the result so the log message below is actually true
    data_labeled.write.mode("overwrite").parquet(output_path)
    logging.info(f"Labels saved to {output_path}")
def generate_labels_with_snorkel(dataframe):
    """
    Labels the full data using Snorkel

    The ``label`` column is written onto the frame that was passed in; the
    returned frame keeps only the rows whose label is not ABSTAIN.

    :param dataframe: Pandas dataframe containing all data
    :return: dataframe extended with a label column
    """

    # Define the set of labeling functions (LFs)
    # NOTE(review): the listing ended mid-list; further LFs may have followed.
    lfs = [
        lf_ubo_is_company, lf_troika_company, lf_uk_blacklisted_company,
    ]

    # Apply the LFs to the unlabeled training data
    applier = PandasLFApplier(lfs)
    L_train = applier.apply(dataframe)

    # Train the label model and compute the training labels
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
    # Ties are mapped to ABSTAIN so that the filter below can drop them
    dataframe["label"] = label_model.predict(L=L_train,
                                             tie_break_policy="abstain")

    # Filter out the abstain data points
    dataframe = dataframe[dataframe.label != ABSTAIN]

    return dataframe
Exemplo n.º 3
 def test_class_balance(self):
     """The class balance derived from ``Y_dev`` matches its label frequencies."""
     label_model = LabelModel(cardinality=2, verbose=False)
     # Test class balance: six 0s and four 1s out of ten labels
     Y_dev = np.array([0, 0, 1, 1, 0, 0, 0, 0, 1, 1])
     label_model._set_class_balance(class_balance=None, Y_dev=Y_dev)
     np.testing.assert_array_almost_equal(label_model.p,
                                          np.array([0.6, 0.4]))
 def test_optimizer(self):
     """``fit`` runs with sgd, adam and adamax and rejects an unknown optimizer name."""
     votes = np.array([[0, -1, 0], [0, 1, 0]])
     model = LabelModel(cardinality=2, verbose=False)

     # Each supported optimizer must get through a single epoch
     for optimizer_name in ("sgd", "adam", "adamax"):
         model.fit(votes, n_epochs=1, optimizer=optimizer_name)

     # An unsupported name has to surface as a ValueError
     with self.assertRaisesRegex(ValueError, "Unrecognized optimizer option"):
         model.fit(votes, n_epochs=1, optimizer="bad_opt")
    def test_set_mu_eps(self):
        """A conditional probability that would be 0 is reported as ``mu_eps``."""
        floor = 0.0123

        # Every LF votes 1 on every row; per the original note this makes
        # P(\lambda_1 = 0 | Y) = 0.0, so the estimate lands on the mu_eps floor
        votes = np.array([[1, 1, 1], [1, 1, 1]])
        model = LabelModel(verbose=False)
        model.fit(votes, mu_eps=floor)

        cond_probs = model.get_conditional_probs()
        self.assertAlmostEqual(cond_probs[0, 1, 0], floor)
# For every sampled LF subset: scan regularization_grid, pick the best-scoring
# value, and store the label model's predicted probabilities on the train, dev
# and test matrices (restricted to that subset's columns) plus the model.
# NOTE(review): this listing is truncated in several places (flagged inline)
# and does not parse as shown -- restore the missing lines from the upstream
# source before relying on it.
def train_model_random_lfs(randomly_sampled_lfs, train_matrix, dev_matrix,
                           dev_labels, test_matrix, regularization_grid):
    # hyper_grid_results is scratch space for the grid scan; the other four
    # dictionaries are what the function returns.
    hyper_grid_results = defaultdict(dict)
    train_grid_results = defaultdict(dict)
    dev_grid_results = defaultdict(dict)
    test_grid_results = defaultdict(dict)
    models = defaultdict(dict)

    # enumerate() makes each lf_sample an (index, columns) pair: lf_sample[1]
    # selects label-matrix columns, lf_sample[0] is used in the result key.
    for lf_sample in tqdm_notebook(enumerate(randomly_sampled_lfs)):
        for param in regularization_grid:

            label_model = LabelModel(cardinality=2)
            # NOTE(review): orphaned argument -- the call it belongs to
            # (presumably label_model.fit(...)) is missing. TODO confirm.
                train_matrix[:, lf_sample[1]],

            # Get marginals for each parameter
            # NOTE(review): only the predicted scores are passed to roc_curve
            # here and dev_labels is never used in the visible code --
            # presumably an argument line was dropped.
            hyper_grid_results[str(param)] = roc_curve(
                label_model.predict_proba(dev_matrix[:, lf_sample[1]])[:, 1])

        # Convert marginals into AUROCs
        # NOTE(review): neither the auc(...) call nor the dict literal below
        # is closed.
        hyper_grid_results = {
            param: auc(hyper_grid_results[param][0],
            for param in hyper_grid_results

        # Select the parameter with the highest AUROC
        # (keys were stored as str(param), hence the float() conversion)
        best_param = float(
            max(hyper_grid_results.items(), key=operator.itemgetter(1))[0])

        # Re-fit the model
        # NOTE(review): orphaned argument -- the re-fit call that should use
        # best_param is missing.
            train_matrix[:, lf_sample[1]],

        # Save marginals for output
        # key looks like "<sample index>:<comma-separated column indices>"
        key = f'{lf_sample[0]}:{",".join(map(str, lf_sample[1]))}'
        train_grid_results[key] = label_model.predict_proba(
            train_matrix[:, lf_sample[1]])
        dev_grid_results[key] = label_model.predict_proba(
            dev_matrix[:, lf_sample[1]])
        test_grid_results[key] = label_model.predict_proba(
            test_matrix[:, lf_sample[1]])
        models[key] = label_model

    return train_grid_results, dev_grid_results, test_grid_results, models
    def test_score(self):
        """Check ``LabelModel.score`` for default and explicitly requested metrics."""
        L = np.array([[1, 1, 0], [-1, -1, -1], [1, 0, 1]])
        Y = np.array([1, 0, 1])
        label_model = LabelModel(cardinality=2, verbose=False)
        label_model.fit(L, n_epochs=100)
        results = label_model.score(L, Y, metrics=["accuracy", "coverage"])
        # The all-abstain middle row is expected to be predicted as -1, which
        # is why only 2 of the 3 rows count towards coverage below.
        np.testing.assert_array_almost_equal(label_model.predict(L),
                                             np.array([1, -1, 1]))

        results_expected = dict(accuracy=1.0, coverage=2 / 3)
        self.assertEqual(results, results_expected)

        # Two identical rows: whatever the model predicts, exactly one of the
        # two differing gold labels can match, hence accuracy 0.5.
        L = np.array([[1, 0, 1], [1, 0, 1]])
        label_model = self._set_up_model(L)
        label_model.mu = nn.Parameter(label_model.mu_init.clone().clamp(
            0.01, 0.99))

        # Without a metrics argument only accuracy is reported
        results = label_model.score(L, Y=np.array([0, 1]))
        results_expected = dict(accuracy=0.5)
        self.assertEqual(results, results_expected)

        results = label_model.score(L=L,
                                    Y=np.array([1, 0]),
                                    metrics=["accuracy", "f1"])
        results_expected = dict(accuracy=0.5, f1=2 / 3)
        self.assertEqual(results, results_expected)
Exemplo n.º 8
def load_snorkel_ee_components(save_path: Union[str, Path]) \
        -> Tuple[LabelModel, LabelModel]:
    """Return a ``(trigger, role)`` pair of label models for ``save_path``.

    The path is only checked for existence; both models are created with
    default arguments.

    NOTE(review): nothing is read from ``save_path`` in the visible code --
    the statements that load the saved weights appear to be missing.

    :param save_path: directory or file that must already exist
    :return: tuple of (trigger label model, role label model)
    """
    root = Path(save_path)
    assert root.exists(), f"Save path does not exist: {root}"

    # Trigger model first, role model second
    components: Tuple[LabelModel, LabelModel] = (LabelModel(), LabelModel())
    return components
Exemplo n.º 9
def get_snorkel_labels(train_df, lfs, labels):
    """Fit a label model on ``train_df`` and return the rows it could label.

    :param train_df: Pandas dataframe to label
    :param lfs: plain functions; each is wrapped as a labeling function named
        after the function itself
    :param labels: the label set; only its length is used (model cardinality)
    :return: ``(df_filtered, probs_filtered)`` as produced by
        ``filter_unlabeled_dataframe`` from the frame, the predicted
        probabilities and the label matrix
    """
    applier = PandasLFApplier(
        [labeling_function(name=lf.__name__)(lf) for lf in lfs])
    label_model = LabelModel(cardinality=len(labels), verbose=True)
    L_train = applier.apply(df=train_df)
    label_model.fit(L_train, n_epochs=500, lr=0.001, log_freq=100, seed=123)
    L_probs = label_model.predict_proba(L=L_train)

    # The filter needs the probabilities and the label matrix next to the frame
    df_filtered, probs_filtered = filter_unlabeled_dataframe(X=train_df,
                                                             y=L_probs,
                                                             L=L_train)
    return df_filtered, probs_filtered
    def test_model_loss(self):
        """More epochs must not increase the mu loss; a huge lr must raise."""
        votes = np.array([[0, -1, 0], [0, 1, 0]])
        model = LabelModel(cardinality=2, verbose=False)

        def loss_after(epochs):
            # Fit from scratch for the given number of epochs, report the loss
            model.fit(votes, n_epochs=epochs)
            return model._loss_mu().item()

        init_loss = loss_after(1)
        next_loss = loss_after(10)
        self.assertLessEqual(next_loss, init_loss)

        # An absurd learning rate is expected to be reported as a NaN loss
        with self.assertRaisesRegex(Exception, "Loss is NaN."):
            model.fit(votes, n_epochs=10, lr=1e8)
Exemplo n.º 11
    # Stores every constructor argument on the instance under the same name
    # and then starts building a LabelModel from the stored cardinality.
    # NOTE(review): this listing is damaged -- the parameter list has no
    # `self` although the body assigns to it, two parenthesised expressions
    # are never closed, and the final LabelModel(...) call is cut off.
    def __init__(
        cardinality: int = 2,
        verbose: bool = True,
        device: str = "cpu",
        metric: str = "accuracy",
        tie_break_policy: str = "abstain",
        n_epochs: int = 100,
        lr: float = 0.01,
        l2: float = 0.0,
        optimizer: str = "sgd",
        optimizer_config: Optional[OptimizerConfig] = None,
        lr_scheduler: str = "constant",
        lr_scheduler_config: Optional[LRSchedulerConfig] = None,
        prec_init: float = 0.7,
        # The default below is evaluated once, when the function is defined,
        # so every call that omits `seed` shares the same value.
        seed: int = np.random.randint(1e6),
        log_freq: int = 10,
        mu_eps: Optional[float] = None,
        class_balance: Optional[List[float]] = None,
        # kwargs is accepted but not used in the visible part of the body
        **kwargs: Any,
    ) -> None:

        self.cardinality = cardinality
        self.verbose = verbose
        self.device = device
        self.metric = metric
        self.tie_break_policy = tie_break_policy
        self.n_epochs = n_epochs
        self.lr = lr
        self.l2 = l2
        self.optimizer = optimizer
        # Fall back to a default-constructed config when none is supplied
        # NOTE(review): closing ")" missing after the next expression
        self.optimizer_config = (
            optimizer_config if optimizer_config is not None else
            OptimizerConfig()  # type: ignore
        self.lr_scheduler = lr_scheduler
        # Same fallback pattern for the scheduler config
        # NOTE(review): closing ")" missing after the next expression
        self.lr_scheduler_config = (
            lr_scheduler_config if lr_scheduler_config is not None else
            LRSchedulerConfig()  # type: ignore
        self.prec_init = prec_init
        self.seed = seed
        self.log_freq = log_freq
        self.mu_eps = mu_eps
        self.class_balance = class_balance

        # NOTE(review): call truncated here; remaining arguments unknown
        self.label_model = LabelModel(cardinality=self.cardinality,
Exemplo n.º 12
    # Keeps the data splits, their label arrays, the labeling functions and a
    # label model on the instance.
    # NOTE(review): this listing is damaged -- the parameter list is cut off
    # after `self,` (the body reads df_train, df_dev, df_valid, df_test,
    # df_heldout, lfs and label_model), the index list is never closed and the
    # `else:` branch of the final `if` is missing.
    def __init__(self,
        # Adds a "seen" column, initialised to 0, to the frame that was passed in
        df_train["seen"] = 0
        self.df_train = df_train.reset_index()
        self.df_dev = df_dev
        self.df_valid = df_valid
        self.df_test = df_test
        self.df_heldout = df_heldout
        # Label arrays are kept for every split except train (disabled below)
        #self.Y_train = df_train.label.values
        self.Y_dev = df_dev.label.values
        self.Y_valid = df_valid.label.values
        self.Y_test = df_test.label.values
        self.Y_heldout = df_heldout.label.values

        self.lfs = lfs

        # Placeholders; nothing is assigned to them in the visible code
        self.L_train = None
        self.L_dev = None
        self.L_valid = None
        self.L_heldout = None
        # Number of distinct labels in the validation split
        cardinality = len(df_valid.label.unique())

        # for DEMOing purposes
        # NOTE(review): the closing "]" of this list is missing
        self.first_text_indices = [
            1262,  #"check out" "youtube"
            1892,  # I love
            1117,  # url concept
            1706,  # emoji concept
            952,  # "nice"
            971,  # positive concept
            958,  # actually use emoji concept

        self.count = 0

        # Build a fresh model only when the caller did not supply one
        # NOTE(review): the LabelModel(...) call is truncated and the `else:`
        # line before the second assignment is missing
        if label_model is None:
            self.label_model = LabelModel(cardinality=cardinality,
            self.label_model = label_model

        # Counts unigrams and bigrams
        self.vectorizer = CountVectorizer(ngram_range=(1, 2))
    def test_optimizer_init(self):
        """``fit`` instantiates the torch optimizer class matching the given name."""
        votes = np.array([[0, -1, 0], [0, 1, 0]])
        model = LabelModel()

        expected_classes = (
            ("sgd", optim.SGD),
            ("adam", optim.Adam),
            ("adamax", optim.Adamax),
        )
        for optimizer_name, optimizer_cls in expected_classes:
            model.fit(votes, optimizer=optimizer_name, n_epochs=1)
            self.assertIsInstance(model.optimizer, optimizer_cls)

        # Anything outside the supported names must be rejected
        with self.assertRaisesRegex(ValueError, "Unrecognized optimizer"):
            model.fit(votes, optimizer="bad_optimizer", n_epochs=1)
Exemplo n.º 14
 def load(self, dir_name):
     """Restore the labeling functions and the label model from ``dir_name``.

     Reads ``model_lfs.pkl`` and ``label_model.pkl`` from the directory and
     assigns ``self.lfs`` and ``self.label_model`` only after both reads
     have succeeded.

     :param dir_name: directory containing the two saved files
     """
     lfs_path = os.path.join(dir_name, 'model_lfs.pkl')
     model_path = os.path.join(dir_name, 'label_model.pkl')

     # NOTE(review): unpickling runs code embedded in the file -- only point
     # this at directories you trust.
     with open(lfs_path, "rb") as handle:
         restored_lfs = pickle.load(handle)
         restored_model = LabelModel.load(model_path)
         self.lfs, self.label_model = restored_lfs, restored_model
Exemplo n.º 15
 def test_save_and_load(self):
     """A saved and re-loaded model predicts the same labels as the original.

     NOTE(review): the listing stopped right after building the save path;
     the save/load round trip below is reconstructed -- confirm it against
     the upstream test.
     """
     import os

     L = np.array([[0, -1, 0], [0, 1, 0]])
     label_model = LabelModel(cardinality=2, verbose=False)
     label_model.fit(L, n_epochs=1)
     original_preds = label_model.predict(L)

     # TemporaryDirectory cleans up even when an assertion fails; mkdtemp()
     # left the directory behind.
     with tempfile.TemporaryDirectory() as dir_path:
         # os.path.join puts the file inside the directory; plain "+"
         # produced a sibling path next to it.
         save_path = os.path.join(dir_path, "label_model")
         label_model.save(save_path)

         label_model_new = LabelModel(cardinality=2, verbose=False)
         label_model_new.load(save_path)
         loaded_preds = label_model_new.predict(L)

     np.testing.assert_array_equal(loaded_preds, original_preds)
Exemplo n.º 16
    def test_scheduler_init(self):
        """``fit`` builds the learning-rate scheduler matching the given name."""
        L = np.array([[0, -1, 0], [0, 1, 0]])
        label_model = LabelModel()

        # NOTE(review): the check for the constant schedule was missing from
        # the listing; "no scheduler object" is the reconstructed expectation
        # -- confirm against the LabelModel implementation.
        label_model.fit(L, lr_scheduler="constant", n_epochs=1)
        self.assertIsNone(label_model.lr_scheduler)

        label_model.fit(L, lr_scheduler="linear", n_epochs=1)
        self.assertIsInstance(label_model.lr_scheduler, optim.lr_scheduler.LambdaLR)

        label_model.fit(L, lr_scheduler="exponential", n_epochs=1)
        self.assertIsInstance(
            label_model.lr_scheduler, optim.lr_scheduler.ExponentialLR
        )

        label_model.fit(L, lr_scheduler="step", n_epochs=1)
        self.assertIsInstance(label_model.lr_scheduler, optim.lr_scheduler.StepLR)
    def test_augmented_L_construction(self):
        """Check shape, total and individual entries of the augmented label matrix."""
        # 3 data points, 5 LFs, 2 classes
        n_points, n_lfs, cardinality = 3, 5, 2
        votes = np.array([[0, 0, 0, 1, 0], [0, 1, 1, 0, -1], [0, 0, 0, 0, -1]])
        # Shift by one so that abstains (-1) become 0 and votes become 1..k
        shifted = votes + 1
        lm = LabelModel(cardinality=cardinality, verbose=False)
        L_aug = lm._get_augmented_label_matrix(shifted, higher_order=True)

        # Should have 10 columns:
        # - 5 * 2 = 10 for the sources
        self.assertEqual(L_aug.shape, (3, 10))

        # 13 total nonzero entries
        self.assertEqual(L_aug.sum(), 13)

        # Next, check the singleton entries: every non-abstain vote sets the
        # column belonging to its (LF, class) pair
        for row in range(n_points):
            for col in range(n_lfs):
                vote = shifted[row, col]
                if vote <= 0:
                    continue
                self.assertEqual(L_aug[row, col * cardinality + vote - 1], 1)

        # Finally, check the clique entries: singleton cliques 1 and 2, each
        # probed in row 0 at an offset from its start index
        for clique_id, offset, expected in ((1, 0, 1), (2, 1, 0)):
            clique = lm.c_tree.node[clique_id]
            self.assertEqual(len(clique["members"]), 1)
            start = clique["start_index"]
            self.assertEqual(L_aug[0, start + offset], expected)
Exemplo n.º 18
def predict_documents(documents: pd.DataFrame, trigger_label_model: LabelModel,
                      role_label_model: LabelModel):
    """Annotate documents with predicted event triggers and roles.

    :param documents: dataframe of documents; default events are added when
        neither ``event_triggers`` nor ``event_roles`` is present
    :param trigger_label_model: label model used for the trigger examples
    :param role_label_model: label model used for the role examples
    :return: a copy of ``documents`` carrying the merged predictions, passed
        through ``ace_formatter.snorkel_to_ace_format``
    """
    if 'event_triggers' not in documents and 'event_roles' not in documents:
        documents = documents.apply(pipeline.add_default_events, axis=1)

    # 1. Get trigger probabilities
    df_predict_triggers, _ = pipeline.build_event_trigger_examples(documents)
    trigger_lf_applier = PandasLFApplier(pipeline.get_trigger_list_lfs())
    L_predict_triggers = trigger_lf_applier.apply(df_predict_triggers)
    event_trigger_probs = trigger_label_model.predict_proba(L_predict_triggers)

    # NOTE(review): df_predict_triggers is never handed to the merge step --
    # an argument may have been lost from this call; confirm its signature.
    merged_event_trigger_examples = pipeline.merge_event_trigger_examples(
        utils.zero_out_abstains(event_trigger_probs, L_predict_triggers))

    # 2. Get role probabilities
    df_predict_roles, _ = pipeline.build_event_role_examples(documents)
    role_lf_applier = PandasLFApplier(pipeline.get_role_list_lfs())
    L_predict_roles = role_lf_applier.apply(df_predict_roles)
    event_roles_probs = role_label_model.predict_proba(L_predict_roles)

    # NOTE(review): same remark as above for df_predict_roles.
    merged_event_role_examples = pipeline.merge_event_role_examples(
        utils.zero_out_abstains(event_roles_probs, L_predict_roles))

    # 3. Update documents with trigger & role probabilities
    labeled_documents: pd.DataFrame = documents.copy()
    # Make sure to remove event_triggers and roles that were built per default.
    # Assign whole columns: writing to the rows yielded by iterrows() only
    # changes temporary copies and leaves the frame untouched. Each row gets
    # its own list object.
    n_docs = len(labeled_documents)
    labeled_documents['event_triggers'] = [[] for _ in range(n_docs)]
    labeled_documents['event_roles'] = [[] for _ in range(n_docs)]
    if 'id' in labeled_documents:
        labeled_documents.set_index('id', inplace=True)

    triggers = merged_event_trigger_examples[['event_triggers']]
    roles = merged_event_role_examples[['event_roles']]

    # Copy the merged predictions over, aligned on the index.
    # NOTE(review): these two statements were missing from the listing
    # (triggers/roles were computed but unused) -- confirm against upstream.
    labeled_documents.update(triggers)
    labeled_documents.update(roles)

    labeled_documents.reset_index(level=0, inplace=True)

    # 4. Add ACE events
    labeled_documents = ace_formatter.snorkel_to_ace_format(labeled_documents)
    return labeled_documents
Exemplo n.º 19
    def test_loss(self):
        """Check the l2 and mu loss values after shifting ``mu`` by 0.05."""
        votes = np.array([[0, -1, 0], [0, 1, -1]])
        model = LabelModel(cardinality=2, verbose=False)
        model.fit(votes, n_epochs=1)
        # Move every parameter 0.05 away from its initial value
        model.mu = nn.Parameter(model.mu_init.clone() + 0.05)

        # l2_loss = l2*M*K*||mu - mu_init||_2 = 3*2*(0.05^2) = 0.03
        # A scalar strength and a vector of ones must give the same value
        for strength in (1.0, np.ones(6)):
            self.assertAlmostEqual(model._loss_l2(l2=strength).item(), 0.03)

        # mu_loss = ||O - \mu^T P \mu||_2 + ||\mu^T P - diag(O)||_2
        mu_loss = model._loss_mu().item()
        self.assertAlmostEqual(mu_loss, 0.675, 3)
Exemplo n.º 20
    def test_class_balance(self):
        """Class balance is derived from ``Y_dev`` and invalid priors are rejected."""
        label_model = LabelModel(cardinality=2, verbose=False)
        # Test class balance: six 0s and four 1s out of ten labels
        Y_dev = np.array([0, 0, 1, 1, 0, 0, 0, 0, 1, 1])
        label_model._set_class_balance(class_balance=None, Y_dev=Y_dev)
        np.testing.assert_array_almost_equal(label_model.p, np.array([0.6, 0.4]))

        # A prior of exactly 0 for a class is not allowed
        class_balance = np.array([0.0, 1.0])
        with self.assertRaisesRegex(ValueError, "Class balance prior is 0"):
            label_model._set_class_balance(class_balance=class_balance, Y_dev=Y_dev)

        # The prior needs one entry per class
        class_balance = np.array([0.0])
        with self.assertRaisesRegex(ValueError, "class_balance has 1 entries."):
            label_model._set_class_balance(class_balance=class_balance, Y_dev=Y_dev)

        # A dev set containing a single class cannot match cardinality 2
        Y_dev_one_class = np.array([0, 0, 0])
        with self.assertRaisesRegex(
            ValueError, "Does not match LabelModel cardinality"
        ):
            label_model._set_class_balance(class_balance=None, Y_dev=Y_dev_one_class)
    def test_mv_default(self):
        """Predictions follow the single vote per row when LFs barely interact."""
        # less than 2 LFs have overlaps
        label_model = LabelModel(cardinality=2, verbose=False)
        L = np.array([[-1, -1, 1], [-1, 1, -1], [0, -1, -1]])
        label_model.fit(L, n_epochs=100)
        np.testing.assert_array_almost_equal(label_model.predict(L),
                                             np.array([1, 1, 0]))

        # less than 2 LFs have conflicts
        L = np.array([[-1, -1, 1], [-1, 1, 1], [1, 1, 1]])
        label_model.fit(L, n_epochs=100)
        np.testing.assert_array_almost_equal(label_model.predict(L),
                                             np.array([1, 1, 1]))
    def test_labeling_convergence(self) -> None:
        """Test convergence of end to end labeling pipeline."""
        # Apply LFs
        # NOTE(review): the listing started this expression with a bare "+",
        # so a leading operand (another list of labeling functions) was
        # probably lost -- restore it from upstream if so.
        labeling_functions = (
            [get_positive_labeling_function(divisor) for divisor in range(2, 9)]
            + [get_negative_labeling_function(divisor) for divisor in range(2, 9)]
        )
        applier = PandasLFApplier(labeling_functions)
        L_train = applier.apply(self.df_train, progress_bar=False)

        # One column per labeling function, one row per training point
        self.assertEqual(L_train.shape, (self.N_TRAIN, len(labeling_functions)))

        # Train LabelModel
        label_model = LabelModel(cardinality=self.cardinality, verbose=False)
        label_model.fit(L_train, n_epochs=100, lr=0.01, l2=0.0)
        Y_lm = label_model.predict_proba(L_train).argmax(axis=1)
        Y = self.df_train.y
        # Fraction of training points whose most likely label is wrong
        err = np.where(Y != Y_lm, 1, 0).sum() / self.N_TRAIN
        self.assertLess(err, 0.05)
    def test_get_weight(self):
        """Learned LF weights land within 0.1 of the accuracies used to simulate votes."""
        # set up L matrix
        true_accs = [0.95, 0.6, 0.7, 0.55, 0.8]
        coverage = [1.0, 0.8, 1.0, 1.0, 1.0]
        n_points = 1000
        L = -1 * np.ones((n_points, len(true_accs)))
        Y = np.zeros(n_points)

        for row in range(n_points):
            # Balanced coin flip for the gold label
            Y[row] = 1 if np.random.rand() <= 0.5 else 0
            for col in range(5):
                if np.random.rand() > coverage[col]:
                    # LF abstains: the entry stays at -1
                    continue
                if np.random.rand() <= true_accs[col]:
                    L[row, col] = Y[row]
                else:
                    # Wrong vote: the other of the two classes
                    L[row, col] = np.abs(Y[row] - 1)

        model = LabelModel(cardinality=2)
        model.fit(L, n_epochs=1000, seed=123)

        for lf_idx, estimated in enumerate(model.get_weights()):
            self.assertAlmostEqual(estimated, true_accs[lf_idx], delta=0.1)
Exemplo n.º 24
def get_majority_vote_label(train_df, lfs, labels):
    """Label ``train_df`` by majority vote and keep the rows that got a label.

    :param train_df: Pandas dataframe to label
    :param lfs: plain functions; each is wrapped as a labeling function named
        after the function itself
    :param labels: the label set; only its length is used (voter cardinality)
    :return: ``(df_filtered, probs_filtered)`` -- the rows whose prediction is
        >= 0 and those predictions. Despite the name, ``probs_filtered`` holds
        the voter's predicted labels, not probabilities.
    """
    applier = PandasLFApplier(
        [labeling_function(name=lf.__name__)(lf) for lf in lfs])
    L_train = applier.apply(df=train_df)

    # Only the majority voter is needed; the LabelModel that used to be
    # constructed here was never fitted or used.
    majority_model = MajorityLabelVoter(cardinality=len(labels))
    preds_train = majority_model.predict(L=L_train)

    # Negative predictions mark rows without a label
    non_abstain_idxs = np.flatnonzero(preds_train >= 0)
    df_filtered = train_df.iloc[non_abstain_idxs]
    probs_filtered = preds_train[non_abstain_idxs]
    return df_filtered, probs_filtered
    def test_label_model_sparse(self) -> None:
        """Test the LabelModel's estimate of P and Y on a sparse synthetic dataset.

        This tests the common setting where LFs abstain most of the time, which can
        cause issues for example if parameter clamping set too high (e.g. see Issue
        #1422).
        """
        # NOTE(review): everything after ``self.n`` was missing from this call
        # in the listing; the remaining arguments are reconstructed -- confirm
        # them against the upstream test.
        P, Y, L = generate_simple_label_matrix(
            self.n, self.m, self.cardinality, abstain_multiplier=1000.0
        )

        # Train LabelModel
        label_model = LabelModel(cardinality=self.cardinality, verbose=False)
        label_model.fit(L, n_epochs=1000, lr=0.01, seed=123)

        # Test estimated LF conditional probabilities
        P_lm = label_model.get_conditional_probs()
        np.testing.assert_array_almost_equal(P, P_lm, decimal=2)

        # Test predicted labels *only on non-abstained data points*
        Y_pred = label_model.predict(L, tie_break_policy="abstain")
        idx, = np.where(Y_pred != -1)
        acc = np.where(Y_pred[idx] == Y[idx], 1, 0).sum() / len(idx)
        self.assertGreaterEqual(acc, 0.65)

        # Make sure that we don't output abstain when an LF votes, per issue #1422
        # (a row sums to 0 after the +1 shift only when every LF abstained)
        self.assertEqual(len(idx),
                         np.where((L + 1).sum(axis=1) != 0, 1, 0).sum())
def snorkel_process(keylist, dataframe, allweaklabf):
    """Weakly label ``dataframe`` and return one-hot train/test targets.

    A ``L_label`` column is written onto the frame that was passed in. Rows
    with a negative ``L_label`` are dropped before an 80/20 split.

    :param keylist: class names; used as column names for the predicted
        probabilities, and its length is the model cardinality
    :param dataframe: Pandas dataframe with a ``sent`` column
    :param allweaklabf: labeling functions to apply
    :return: ``(trainsent, trainlabe2, testsent, testlabe2, keylist, report)``
        where the label arrays hold a 1 at each row's largest probability
        (if it is > 0) and 0 elsewhere, and ``report`` is the LF summary
    """
    def func(x):
        # Zero every entry except the largest one (first in descending order)
        idx = (-x).argsort()[1:]
        x[idx] = 0
        return x

    def one_hot_of_max(frame):
        # Work on a copy and use the returned array rather than relying on
        # apply_along_axis mutating its input in place.
        probs = frame[keylist].values.copy()
        top_only = np.apply_along_axis(func, 1, probs)
        return np.where(top_only > 0, 1, 0)

    cardinalitynu = len(keylist)
    applier = PandasLFApplier(lfs=allweaklabf)
    all_train_l = applier.apply(df=dataframe)
    report = LFAnalysis(L=all_train_l, lfs=allweaklabf).lf_summary()
    label_model = LabelModel(cardinality=cardinalitynu, verbose=False)
    # The model has to be fitted before it can predict.
    # NOTE(review): the listing had no fit call; default training settings
    # are used here -- confirm the intended epochs/seed.
    label_model.fit(all_train_l)
    predt = label_model.predict(all_train_l)
    predt1 = label_model.predict_proba(all_train_l)
    predt2 = pd.DataFrame(predt1, columns=keylist.copy())
    dataframe['L_label'] = predt
    dataframe1 = dataframe.join(predt2, how='outer')
    dataframe1 = dataframe1[dataframe1.L_label >= 0]

    train, test = train_test_split(dataframe1, test_size=0.2)

    trainsent = train.sent.values
    trainlabe2 = one_hot_of_max(train)
    testsent = test.sent.values
    testlabe2 = one_hot_of_max(test)
    return trainsent, trainlabe2, testsent, testlabe2, keylist, report
Exemplo n.º 27
def curate_twitter(save_name='../../pandafied_data/curated_twitter.csv'):
    """Keep the tweets labeled WATER and write them to ``save_name``.

    Reads ``../../pandafied_data/pandafied_twitter.csv``, labels it with a
    label model built on the water-nuisance keyword LF, prints the total and
    WATER row counts, and saves the WATER rows without the ``label`` column.

    :param save_name: path of the CSV file to write
    """
    df_train = pd.read_csv('../../pandafied_data/pandafied_twitter.csv')

    # Define the set of labeling functions (LFs); only the water-nuisance
    # keyword LF is active
    lfs = [lf_keyword_wateroverlast]

    # Apply the LFs to the unlabeled training data
    applier = PandasLFApplier(lfs)
    L_train = applier.apply(df_train)

    # Train the label model and compute the training labels
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
    # NOTE(review): the call was cut off after ``L=L_train,``; the tie-break
    # argument is reconstructed -- confirm.
    df_train["label"] = label_model.predict(L=L_train,
                                            tie_break_policy="abstain")

    is_water = df_train["label"] == WATER
    counter = int(is_water.sum())

    print("num entries total: " + str(len(df_train["label"])))
    print("num entries water: " + str(counter))

    twitter_curated = df_train[is_water]
    twitter_curated = twitter_curated.drop(columns='label')
    twitter_curated.to_csv(save_name, index=False)
Exemplo n.º 28
    def test_L_form(self):
        """Label-matrix dimensions are recorded and malformed matrices rejected."""
        label_model = LabelModel(cardinality=2, verbose=False)
        L = np.array([[-1, 1, -1], [-1, 1, -1], [1, -1, -1], [-1, 1, -1]])
        # The model only knows n and m after it has seen the matrix.
        # NOTE(review): this call was missing from the listing and is
        # reconstructed -- confirm the helper name against LabelModel.
        label_model._set_constants(L)
        self.assertEqual(label_model.n, 4)
        self.assertEqual(label_model.m, 3)

        # Contains the value 2, which a cardinality-2 model must reject
        L = np.array([[-1, 0, 1], [-1, 0, 2], [0, -1, 2], [-1, 0, -1]])
        with self.assertRaisesRegex(ValueError, "L_train has cardinality"):
            label_model.fit(L, n_epochs=1)

        # Only two labeling functions (columns)
        L = np.array([[0, 1], [1, 1], [0, 1]])
        with self.assertRaisesRegex(ValueError, "L_train should have at least 3"):
            label_model.fit(L, n_epochs=1)
    def test_warmup(self):
        """Warmup given in epochs or as a percentage yields 3 steps; batches are rejected."""
        votes = np.array([[0, -1, 0], [0, 1, 0]])
        model = LabelModel()

        # 3 epochs directly, or 3/5 of the 5 training epochs
        equivalent_configs = (
            {"warmup_steps": 3, "warmup_unit": "epochs"},
            {"warmup_percentage": 3 / 5},
        )
        for config in equivalent_configs:
            model.fit(votes, lr_scheduler_config=config, n_epochs=5)
            self.assertEqual(model.warmup_steps, 3)

        with self.assertRaisesRegex(ValueError, "LabelModel does not support"):
            batch_config = {"warmup_steps": 1, "warmup_unit": "batches"}
            model.fit(votes, lr_scheduler_config=batch_config)
Exemplo n.º 30
def main(data_path, output_path):
    """Label a Parquet dataset with a Snorkel label model on Dask.

    Reads the data at ``data_path`` into two partitions, applies the labeling
    functions, fits a binary ``LabelModel`` on the resulting label matrix and
    writes the data, extended with a ``y_prob`` column, to ``output_path``.

    :param data_path: location of the input Parquet data
    :param output_path: location the labeled Parquet data is written to
    """
    # Read data
    logging.info(f"Reading data from {data_path}")
    data = dd.read_parquet(data_path)
    data = data.repartition(npartitions=2)

    # Build label matrix
    logging.info("Applying LFs")
    lfs = [article_mentions_person, body_contains_fortune, person_in_db]
    applier = DaskLFApplier(lfs)
    L = applier.apply(data)

    # Train label model (the model must be fitted before it can predict)
    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(L)

    # Generate training labels: keep the probability of class 1 only
    logging.info("Generating probabilistic labels")
    y_prob = label_model.predict_proba(L)[:, 1]
    # Rebuild the index before attaching the probabilities as a new column
    data = data.reset_index().set_index("index")
    data_labeled = data.assign(y_prob=dd.from_array(y_prob))
    dd.to_parquet(data_labeled, output_path)
    logging.info(f"Labels saved to {output_path}")