Пример #1
0
def labeling_evaluation(df_train, df_test, label_model):
    lfs = [
        LabelingFunction.lf_ind_keyword, LabelingFunction.lf_short,
        LabelingFunction.lf_cmp_re, LabelingFunction.lf_industry_keyword,
        LabelingFunction.lf_surname_re, LabelingFunction.industry_cls
    ]

    applier = PandasLFApplier(lfs=lfs)
    L_train = applier.apply(df=df_train)
    L_test = applier.apply(df=df_test)
    Y_test = df_test.label.values
    analysis = LFAnalysis(L=L_train, lfs=lfs).lf_summary()

    if label_model == "majority":
        majority_model = MajorityLabelVoter()
        preds_train = majority_model.predict(L=L_train)
        majority_acc = majority_model.score(
            L=L_test, Y=Y_test, tie_break_policy="random")["accuracy"]
        print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")

        df_train_filtered, preds_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=preds_train, L=L_train)
        return df_train_filtered, preds_train_filtered, analysis

    if label_model == "weighted":
        label_model = LabelModel(cardinality=len(
            [c for c in dir(Polarity) if not c.startswith("__")]),
                                 verbose=True)
        label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)
        probs_train = label_model.predict_proba(L_train)
        label_model_acc = label_model.score(
            L=L_test, Y=Y_test, tie_break_policy="random")["accuracy"]
        print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")

        df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=probs_train, L=L_train)
        preds_train_filtered = probs_to_preds(probs_train_filtered)
        return df_train_filtered, probs_train_filtered, preds_train_filtered, analysis
def weak_supervisor(dataframe, model_type):
    labeling_functions = [positive_labeling_function, positive1_labeling_function, negative_labeling_function,
                          negative1_labeling_function]
    pandasApplier = PandasLFApplier(lfs=labeling_functions)
    label_training_matrix = pandasApplier.apply(df=dataframe)

    if model_type == "label_model":
        # constructing a probabilistic label model
        label_model = LabelModel(cardinality=2, verbose=True)
        label_model.fit(L_train=label_training_matrix, n_epochs=300, log_freq=50, seed=123)
        dataframe["weak_labels"] = label_model.predict(L=label_training_matrix)
        print("dataframe shape: ", dataframe.shape)
        dataframe = dataframe[dataframe["weak_labels"] != -1]
        print("dataframe shape after filtering: ", dataframe.shape)
        return dataframe

    else:
        majorityLabelVoter = MajorityLabelVoter()
        dataframe["weak_labels"] = majorityLabelVoter.predict(L=label_training_matrix)
        print("dataframe shape: ", dataframe.shape)
        dataframe = dataframe[dataframe["weak_labels"] != -1]
        print("dataframe shape after filtering: ", dataframe.shape)
        return dataframe
Пример #3
0
]

# apply label functions
applier = PandasLFApplier(lfs=lfs)
# create a label matrix for the training set
L_train = applier.apply(df=data_train)
# create a label matrix for the test det
L_test = applier.apply(df=data_test)

# summary statistics for the LFs
lf_summary = LFAnalysis(L=L_train, lfs=lfs).lf_summary()
print(lf_summary)

# take the majority vote on a per-data point basis
majority_model = MajorityLabelVoter()
preds_train = majority_model.predict(L=L_train)

# use LabelModel to produce training labels
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)

# result using majority-vote model
Y_test = data_test.label.values
majority_acc = majority_model.score(L=L_test,
                                    Y=Y_test,
                                    tie_break_policy="random")["accuracy"]
print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")

# results using label model
label_model_acc = label_model.score(L=L_test,
                                    Y=Y_test,