Example #1
    def test_majority_label_vote(self):
        L = np.array([[0, 1, 0], [0, 1, 0], [1, 0, 0], [-1, -1, 1]])
        ml_voter = MajorityLabelVoter()
        Y_p = ml_voter.predict_proba(L)

        Y_p_true = np.array([[1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0]])
        np.testing.assert_array_almost_equal(Y_p, Y_p_true)
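The test exercises Snorkel's MajorityLabelVoter: -1 marks an abstaining labeling function, and predict_proba spreads probability mass over the labels cast by the non-abstaining LFs. A minimal standalone sketch of the same behavior (binary cardinality, as in the test):

import numpy as np
from snorkel.labeling.model import MajorityLabelVoter

# Row 4 has two abstentions (-1) and a single vote for class 1,
# so the majority vote resolves to [0.0, 1.0].
L = np.array([[0, 1, 0], [0, 1, 0], [1, 0, 0], [-1, -1, 1]])
print(MajorityLabelVoter().predict_proba(L))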
Example #2
def labeling_evaluation(df_train, df_test, label_model):
    lfs = [
        LabelingFunction.lf_ind_keyword, LabelingFunction.lf_short,
        LabelingFunction.lf_cmp_re, LabelingFunction.lf_industry_keyword,
        LabelingFunction.lf_surname_re, LabelingFunction.industry_cls
    ]

    applier = PandasLFApplier(lfs=lfs)
    L_train = applier.apply(df=df_train)
    L_test = applier.apply(df=df_test)
    Y_test = df_test.label.values
    analysis = LFAnalysis(L=L_train, lfs=lfs).lf_summary()

    if label_model == "majority":
        majority_model = MajorityLabelVoter()
        preds_train = majority_model.predict(L=L_train)
        majority_acc = majority_model.score(
            L=L_test, Y=Y_test, tie_break_policy="random")["accuracy"]
        print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")

        df_train_filtered, preds_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=preds_train, L=L_train)
        return df_train_filtered, preds_train_filtered, analysis

    if label_model == "weighted":
        label_model = LabelModel(cardinality=len(
            [c for c in dir(Polarity) if not c.startswith("__")]),
                                 verbose=True)
        label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)
        probs_train = label_model.predict_proba(L_train)
        label_model_acc = label_model.score(
            L=L_test, Y=Y_test, tie_break_policy="random")["accuracy"]
        print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")

        df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=probs_train, L=L_train)
        preds_train_filtered = probs_to_preds(probs_train_filtered)
        return df_train_filtered, probs_train_filtered, preds_train_filtered, analysis
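A hypothetical invocation of labeling_evaluation (df_train and df_test are placeholder dataframes; per the code above, df_test must carry a label column). Note that the "majority" branch returns a 3-tuple and the "weighted" branch a 4-tuple:

# Hypothetical dataframes; the names are illustrative only.
df_filtered, preds_filtered, lf_summary = labeling_evaluation(
    df_train, df_test, label_model="majority")
print(lf_summary)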
Example #3
def weak_supervisor(dataframe, model_type):
    labeling_functions = [positive_labeling_function, positive1_labeling_function, negative_labeling_function,
                          negative1_labeling_function]
    applier = PandasLFApplier(lfs=labeling_functions)
    label_training_matrix = applier.apply(df=dataframe)

    if model_type == "label_model":
        # constructing a probabilistic label model
        label_model = LabelModel(cardinality=2, verbose=True)
        label_model.fit(L_train=label_training_matrix, n_epochs=300, log_freq=50, seed=123)
        dataframe["weak_labels"] = label_model.predict(L=label_training_matrix)
        print("dataframe shape: ", dataframe.shape)
        dataframe = dataframe[dataframe["weak_labels"] != -1]
        print("dataframe shape after filtering: ", dataframe.shape)
        return dataframe

    else:
        majority_voter = MajorityLabelVoter()
        dataframe["weak_labels"] = majority_voter.predict(L=label_training_matrix)
        print("dataframe shape: ", dataframe.shape)
        dataframe = dataframe[dataframe["weak_labels"] != -1]
        print("dataframe shape after filtering: ", dataframe.shape)
        return dataframe
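A hypothetical call to weak_supervisor (reviews_df is a placeholder; the dataframe only needs whatever fields the four labeling functions read):

labeled_df = weak_supervisor(reviews_df, model_type="label_model")
print(labeled_df["weak_labels"].value_counts())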
Example #4
def model_analysis(label_model: LabelModel,
                   training_set: pd.DataFrame,
                   L_train: np.ndarray,
                   L_test: np.ndarray,
                   Y_test: np.ndarray,
                   lfs: list,
                   output_file="output") -> None:
    # TODO: consider using **kwargs instead of this painful list of arguments
    """Output analysis for the label model to a file

    :param label_model: The current label model which we want to output analysis for
    :type label_model: LabelModel
    :param training_set: A dataframe containing the training dataset
    :type training_set: pd.DataFrame
    :param L_train: The matrix of labels generated by the labeling functions on the training data
    :type L_train: np.ndarray
    :param L_test: The matrix of labels generated by the labeling functions on the testing data
    :type L_test: np.ndarray
    :param Y_test: Gold labels associated with data points in L_test
    :type Y_test: np.ndarray
    :param lfs: List of labeling functions
    :type lfs: list
    :param output_file: A path where the output file should be written, defaults to `PROJECT_ROOT/output`
    :type output_file: str, optional
    """
    Y_train = label_model.predict_proba(L=L_train)
    Y_pred = label_model.predict(L=L_test, tie_break_policy="abstain")
    lf_analysis_train = LFAnalysis(L=L_train, lfs=lfs).lf_summary()

    # TODO: Write this df to a output file. Ask Jennifer about how to handle this
    print(lf_analysis_train)

    # build majority label voter model
    majority_model = MajorityLabelVoter()
    majority_acc = majority_model.score(L=L_test,
                                        Y=Y_test,
                                        tie_break_policy="abstain",
                                        metrics=["f1", "accuracy"])
    label_model_acc = label_model.score(L=L_test,
                                        Y=Y_test,
                                        tie_break_policy="abstain",
                                        metrics=["f1", "accuracy"])

    # get precision and recall scores
    p_score = precision_score(y_true=Y_test, y_pred=Y_pred, average='weighted')
    r_score = recall_score(y_true=Y_test,
                           y_pred=Y_pred,
                           average='weighted',
                           labels=np.unique(Y_pred))

    # keep only documents where at least one LF voted; rows of L_train that
    # are all -1 (full abstention) are dropped by filter_unlabeled_dataframe
    probs_train = majority_model.predict_proba(L=L_train)
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=training_set, y=probs_train, L=L_train)

    # confusion-matrix buckets; get_label_buckets keys are (y_true, y_pred),
    # so (0, 1) holds false positives and (1, 0) false negatives
    buckets = get_label_buckets(Y_test, Y_pred)
    true_positives = buckets.get((1, 1))
    false_positives = buckets.get((0, 1))
    true_negatives = buckets.get((0, 0))
    false_negatives = buckets.get((1, 0))
    # write analysis to file
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    with open(f"{'../output/logs/'}{output_file}_run_{timestamp}.txt",
              "w") as output_file:
        output_file.write(
            f"{'Majority Vote Accuracy:':<25} {majority_acc['accuracy'] * 100:.2f}%"
        )
        output_file.write(
            f"\n{'Majority Vote F1 Score:':<25} {majority_acc['f1'] * 100:.2f}%"
        )
        output_file.write(
            f"\n{'Label Model Accuracy:':<25} {label_model_acc['accuracy'] * 100:.2f}%"
        )
        output_file.write(
            f"\n{'Label Model F1 Score:':<25} {label_model_acc['f1'] * 100:.2f}%"
        )
        output_file.write(f"\n{'Precision Score:':<25} {p_score * 100:.2f}%")
        output_file.write(f"\n{'Recall Score:':<25} {r_score * 100:.2f}%")
        output_file.write(
            f"\n{'Abstained Data Points:':<25} {len(training_set) - len(df_train_filtered)}")
        output_file.write(
            f"\n{'True Positives:':<25} {len(true_positives) if true_positives is not None else 0}"
        )
        output_file.write(
            f"\n{'False Positives:':<25} {len(false_positives) if false_positives is not None else 0}"
        )
        output_file.write(
            f"\n{'False Negatives:':<25} {len(false_negatives) if false_negatives is not None else 0}"
        )
        output_file.write(
            f"\n{'True Negatives:':<25} {len(true_negatives) if true_negatives is not None else 0}"
        )
        output_file.write(
            f"\n{'Abstained Positives:':<25} {len(buckets.get((1, -1), []))}")
        output_file.write(
            f"\n{'Abstained Negatives:':<25} {len(buckets.get((0, -1), []))}")
Example #5
print('applying labelling functions to data...')
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=X_train)
L_dev = applier.apply(df=X_dev)

print('fitting Label Model')
label_model = LabelModel(cardinality=config['cardinality'], verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)
label_model_acc = label_model.score(L=L_dev,
                                    Y=y_dev,
                                    tie_break_policy="random")["accuracy"]
print(f'label model acc: {label_model_acc}')

print('fitting Majority Label Voter model')
majority_model = MajorityLabelVoter(cardinality=config['cardinality'])
# preds_train = majority_model.predict(L=L_train)
majority_acc = majority_model.score(L=L_dev,
                                    Y=y_dev,
                                    tie_break_policy="random")["accuracy"]
print(f'majority_label_acc: {majority_acc}')

log_metric('majority_label_acc', majority_acc)
log_metric('label_model_acc', label_model_acc)

probs_train = label_model.predict_proba(L=L_train)
df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=X_train, y=probs_train, L=L_train)

print('setting up Label Model')
stop_words = config['stop_words']
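For reference, filter_unlabeled_dataframe (used in several examples on this page) drops rows whose LFs all abstained, i.e. rows of L that are entirely -1. A minimal sketch:

import numpy as np
import pandas as pd
from snorkel.labeling import filter_unlabeled_dataframe

X = pd.DataFrame({"text": ["covered", "uncovered"]})
y = np.array([[0.9, 0.1], [0.5, 0.5]])
L = np.array([[0, -1], [-1, -1]])  # second row: every LF abstained
X_f, y_f = filter_unlabeled_dataframe(X=X, y=y, L=L)
print(len(X_f))  # 1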
Example #6
    keyword_verb
]

# apply label functions
applier = PandasLFApplier(lfs=lfs)
# create a label matrix for the training set
L_train = applier.apply(df=data_train)
# create a label matrix for the test set
L_test = applier.apply(df=data_test)

# summary statistics for the LFs
lf_summary = LFAnalysis(L=L_train, lfs=lfs).lf_summary()
print(lf_summary)

# take the majority vote on a per-data point basis
majority_model = MajorityLabelVoter()
preds_train = majority_model.predict(L=L_train)

# use LabelModel to produce training labels
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)

# result using majority-vote model
Y_test = data_test.label.values
majority_acc = majority_model.score(L=L_test,
                                    Y=Y_test,
                                    tie_break_policy="random")["accuracy"]
print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")

# results using label model
label_model_acc = label_model.score(L=L_test,
                                    Y=Y_test,
                                    tie_break_policy="random")["accuracy"]
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")
Example #7
lfs += allKeywordLFGemeentenBE
lfs += allKeywordLFGemeentenNL

lfs += allKeywordNamedEntBE
lfs += allKeywordNamedEntNL

applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df_train)

result = LFAnalysis(L=L_train, lfs=lfs).lf_summary()
print(result)

from snorkel.labeling.model import MajorityLabelVoter

majority_model = MajorityLabelVoter(cardinality=2)
preds_train_majority = majority_model.predict(L=L_train)

from snorkel.labeling.model import LabelModel
label_model = LabelModel(cardinality=2, verbose=True, device='cuda')
# according to location data, BE tweets are 10-15% of the sample
label_model.fit(L_train=L_train,
                n_epochs=500,
                class_balance=[0.15, 0.85],
                log_freq=100,
                seed=82)
preds_train_label = label_model.predict(L=L_train)

L_dev = applier.apply(df=df_dev)
mapping = {'BE': 0, 'NL': 1}
Y_dev = np.array([mapping[i] for i in df_dev['label']])
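# A plausible next step, not in the original snippet: score both models on
# the freshly labeled dev split (same score API as the other examples here).
majority_acc = majority_model.score(
    L=L_dev, Y=Y_dev, tie_break_policy="random")["accuracy"]
label_model_acc = label_model.score(
    L=L_dev, Y=Y_dev, tie_break_policy="random")["accuracy"]
print(f"majority: {majority_acc:.3f}, label model: {label_model_acc:.3f}")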
Example #8
                        lr=0.05,
                        class_balance=[0.7, 0.3],
                        n_epochs=100)

# %%
Y_probs_valid = label_model.predict_proba(L_valid)
Y_preds_valid = probs_to_preds(Y_probs_valid)
metric_score(Y_valid, Y_preds_valid, probs=None, metric="f1")

# %% [markdown]
# **Majority Vote**

# %%
from snorkel.labeling.model import MajorityLabelVoter

mv_model = MajorityLabelVoter()
Y_probs_valid = mv_model.predict_proba(L_valid)
Y_preds_valid = probs_to_preds(Y_probs_valid)
metric_score(Y_valid, Y_preds_valid, probs=None, metric="f1")
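# %%
# For reference (the notebook export omits its imports): probs_to_preds lives
# in snorkel.utils and metric_score in snorkel.analysis. A minimal,
# self-contained sketch of the probs -> preds -> metric pipeline:
import numpy as np
from snorkel.analysis import metric_score
from snorkel.utils import probs_to_preds

probs = np.array([[0.8, 0.2], [0.3, 0.7]])  # (n, cardinality) probabilities
preds = probs_to_preds(probs)               # hard labels: array([0, 1])
print(metric_score(np.array([0, 1]), preds, probs=None, metric="f1"))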

# %%
# from metal.tuners import RandomSearchTuner

# # Creating search space
# search_space = {
#     "l2": {"range": [0.0001, 0.1], "scale": "log"},  # linear range
#     "lr": {"range": [0.0001, 0.1], "scale": "log"},  # log range
# }

# searcher = RandomSearchTuner(LabelModel, log_dir="./run_logs", log_writer_class=None)
Example #9
def majority_acc(L: np.ndarray, label_series: Series) -> float:
    majority_model = MajorityLabelVoter()
    maj_model_train_acc = majority_model.score(
        L=L, Y=label_series.values, tie_break_policy="random")["accuracy"]
    return maj_model_train_acc
Example #10
def majority_acc(L: np.ndarray, df: pd.DataFrame) -> float:
    majority_model = MajorityLabelVoter()
    maj_model_train_acc = majority_model.score(
        L=L, Y=df.label.values, tie_break_policy="random")["accuracy"]
    return maj_model_train_acc
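Examples #9 and #10 differ only in how the gold labels are supplied (a pandas Series vs. a dataframe's label column). A hypothetical call, assuming L_dev comes from a PandasLFApplier and df_dev has a label column as in the earlier examples:

acc = majority_acc(L_dev, df_dev)
print(f"majority-vote accuracy: {acc:.3f}")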