Example #1
    def train_model(self,
                    df_train: pd.DataFrame,
                    application_area_lfs: list,
                    analysis_path: str = "output",
                    label_output_path: str = "labels.jsonl",
                    save_model_path: str = None):
        """Using our labeling functions, we can train a probabilistic model which is able to generate weak labels for our data points

        :param df_train: The training data for the model
        :type df_train: pd.DataFrame
        :param application_area_lfs: A list of labeling functions to use in training the Label Model
        :type application_area_lfs: list
        :param analysis_path: File-name prefix for the LF-analysis CSV written under `PROJECT_ROOT/output`, defaults to "output"
        :type analysis_path: str, optional
        :param label_output_path: Path to file where probabilistic labels generated by the model should be stored, defaults to "labels.jsonl"
        :type label_output_path: str, optional
        :param save_model_path: A path where the Label Model should be saved. If no path is provided, the model is not saved
        :type save_model_path: str, optional
        """
        file_name_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        applier = PandasLFApplier(lfs=application_area_lfs)
        L_train = applier.apply(df=df_train)

        model = LabelModel(cardinality=2, verbose=True)
        model.fit(L_train=L_train, n_epochs=800, log_freq=100)
        if save_model_path is not None:
            model.save(save_model_path)

        int_labels, prob_labels = model.predict(L=L_train,
                                                return_probs=True,
                                                tie_break_policy="abstain")
        probs_df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=prob_labels, L=L_train)

        int_df_train_filtered, int_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=int_labels, L=L_train)
        # write out both labels. In the probability outputs, p_rel is the second probability listed
        assert list(probs_df_train_filtered["paperid"]) == list(
            int_df_train_filtered["paperid"])
        with open(f"{label_output_path}", mode="w") as out:
            for idx, paper_id in enumerate(probs_df_train_filtered["paperid"]):
                out.write(
                    json.dumps({
                        "id": paper_id,
                        # cast to int and float to get rid of nonserializable numpy types
                        "is_rel": int(int_train_filtered[idx]),
                        "p_rel": float(probs_train_filtered[idx][1])
                    }) + "\n")

        # output LF analysis to csv file sorted by coverage
        lf_analysis = LFAnalysis(L=L_train,
                                 lfs=application_area_lfs).lf_summary()
        with open(
                f"{self.PROJECT_ROOT}/output/{analysis_path}_{file_name_timestamp}.csv",
                "w") as outfile:
            lf_analysis = lf_analysis.sort_values("Coverage")
            lf_analysis.to_csv(outfile, encoding="utf-8", index=True)
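A minimal calling sketch for the method above. This is hedged: `pipeline` stands for an instance of whatever class defines `train_model`, and the keyword labeling functions and toy DataFrame are purely illustrative (Snorkel's LabelModel needs at least three LFs to fit).

import pandas as pd
from snorkel.labeling import labeling_function

ABSTAIN, NOT_RELEVANT, RELEVANT = -1, 0, 1

def make_abstract_lf(keyword, label):
    # Illustrative factory: label a paper by a keyword match in its abstract.
    @labeling_function(name=f"lf_{keyword.replace(' ', '_')}")
    def lf(row):
        return label if keyword in row.abstract.lower() else ABSTAIN
    return lf

lfs = [make_abstract_lf("machine learning", RELEVANT),
       make_abstract_lf("neural network", RELEVANT),
       make_abstract_lf("survey", NOT_RELEVANT)]

df_train = pd.DataFrame({
    "paperid": ["p1", "p2"],
    "abstract": ["A neural network approach to parsing",
                 "A survey of storage hardware"],
})

# `pipeline` is an assumed instance of the class that defines train_model.
pipeline.train_model(df_train,
                     application_area_lfs=lfs,
                     label_output_path="labels.jsonl")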
Example #2
def run_analysis(
    applied_lf_matrix: np.ndarray,
    lfs: List[LabelingFunction],
    save_csv_to: AbsolutePath,
    save_json_to: AbsolutePath,
    label_series: Optional[Series] = None,
) -> None:
    lf_analysis_summary = LFAnalysis(applied_lf_matrix, lfs).lf_summary(
        Y=label_series.values if label_series is not None else None)
    lf_analysis_summary.to_csv(save_csv_to)
    analysis_dict = lf_analysis_summary.to_dict()
    del analysis_dict["j"]
    with open(save_json_to, "w") as f:
        json.dump(analysis_dict, f, indent=4, sort_keys=True, cls=NumpyEncoder)
Example #3
def print_analysis(l_train, lfs):
    """
    Prints the LFs' coverage and summary statistics to data/analysis.txt
    """
    coverage_masechet_then_parans, coverage_perek_then_parans, \
    coverage_daf_in_parntes, coverage_no_double_parans, coverage_no_mishna = (
                l_train != ABSTAIN).mean(axis=0)
    txt_file = open(r"data/analysis.txt", "a+")
    txt_file.write("\n\n")
    txt_file.write('Analysis for date ['+str(datetime.datetime.now())+']: \n')
    txt_file.write('[SAMPLE_SIZE: ' + str(utility.SAMPLE_SIZE) + '] \n')
    txt_file.write('[TRANSFORMATION_FACTOR: ' + str(TRANSFORMATION_FACTOR) + '] \n')
    txt_file.write("\n\n")
    txt_file.write(":::::::::::::::::::::::::::|LFs Coverage|::::::::::::::::::::::::::::::::\n")
    txt_file.write(f"coverage_masechet_then_parans: {coverage_masechet_then_parans * 100:.1f}%\n")
    txt_file.write(f"coverage_perek_then_parans: {coverage_perek_then_parans * 100:.1f}%\n")
    txt_file.write(f"coverage_daf_in_parntes: {coverage_daf_in_parntes * 100:.1f}%\n")
    txt_file.write(f"coverage_no_double_parans: {coverage_no_double_parans * 100:.1f}%\n")
    txt_file.write(f"coverage_no_mishna: {coverage_no_mishna * 100:.1f}%\n")
    txt_file.write(":::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n")
    txt_file.write(":::::::::::::::::::::::|LFs Summary - l_train|:::::::::::::::::::::::::::\n")
    txt_file.write(LFAnalysis(L=l_train, lfs=lfs).lf_summary().to_string())
    txt_file.write("\n")
    txt_file.write(":::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n")
    # txt_file.write("::::::::::::::::::::::::|LFs Summary - l_dev|::::::::::::::::::::::::::::\n")
    # txt_file.write(LFAnalysis(L=l_dev, lfs=lfs).lf_summary(Y=Y_dev).to_string())
    # txt_file.write("\n")
    # txt_file.write(":::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n")
    txt_file.close()
def snorkel_process(keylist, dataframe, allweaklabf):
    """Apply the weak labeling functions to the dataframe, fit a LabelModel,
    and return train/test sentences with one-hot label arrays for the classes
    in keylist, plus the LF analysis report."""
    def func(x):
        # Keep only the highest-probability class in each row; zero out the rest.
        idx = (-x).argsort()[1:]
        x[idx] = 0
        return x

    cardinalitynu = len(keylist)
    applier = PandasLFApplier(lfs=allweaklabf)
    all_train_l = applier.apply(df=dataframe)
    report = LFAnalysis(L=all_train_l, lfs=allweaklabf).lf_summary()
    print(report)
    label_model = LabelModel(cardinality=cardinalitynu, verbose=False)
    label_model.fit(all_train_l)
    predt = label_model.predict(all_train_l)
    predt1 = label_model.predict_proba(all_train_l)
    keylist1 = keylist.copy()
    #keylist1.append('Not_relevent')
    predt2 = pd.DataFrame(predt1, columns=keylist1)
    dataframe['L_label'] = predt
    dataframe1 = dataframe.join(predt2, how='outer')
    dataframe1 = dataframe1[dataframe1.L_label >= 0]

    train, test = train_test_split(dataframe1, test_size=0.2)

    trainsent = train.sent.values
    trainlabel = train[keylist].values
    trainlabe2 = trainlabel.copy()
    np.apply_along_axis(func, 1, trainlabe2)
    trainlabe2 = np.where(trainlabe2 > 0, 1, 0)
    testsent = test.sent.values
    testlabel = test[keylist].values
    testlabe2 = testlabel.copy()
    np.apply_along_axis(func, 1, testlabe2)
    testlabe2 = np.where(testlabe2 > 0, 1, 0)
    return trainsent, trainlabe2, testsent, testlabe2, keylist, report
Example #5
    def apply(self, tasks):
        print('Create regions...')
        random.shuffle(tasks)
        regions = self.create_regions(tasks[:100])
        print(f'Num regions: {len(regions)}')
        L_train = self.applier.apply(regions)
        lfa = LFAnalysis(L=L_train, lfs=self.lfs)
        confl = lfa.lf_conflicts()
        cov = lfa.lf_coverages()
        confli = np.argsort(confl)
        lfs_sorted = [self.lfs[i] for i in confli]
        out = []
        for lf, cf, cv in zip(lfs_sorted, confl[confli], cov[confli]):
            print(lf.name, cf, cv)
            out.append({'lop': lf.name, 'conflict': cf, 'coverage': cv})
        return out
Example #6
def startSnorkelLabeling(df, keyword_groups={}, label=IRRELEVANT, l_type='SnorkelFilter'):
    '''
    Function: Filter or categorise tweets for the user with Snorkel keyword labeling functions
    Inputs:
        - df: tweets DataFrame (columns: [id, text])
        - keyword_groups: Keyword group and its relevant keywords
          E.g. {'usps': ['postal service', 'usps'], 'invest': ['invest','portfolio','stock']}
    Outputs:
        - a_df: Categorised Data (e.g. columns = ['id', 'tweets', 'Refund', 'COVID'])
        - analysis: Snorkel Labeling Function statistics
    '''

    lfs = []
    for name, keywords in keyword_groups.items():
        lfs.append(make_keyword_lf(lf_name=name, keywords=keywords, label=label))

    applier = PandasLFApplier(lfs=lfs)
    L_train = applier.apply(df=df)

    if l_type == 'SnorkelFilter': # For spam detection (Step 2)
        L_final = get_L_final_filter(L_train)
        df['relevance'] = L_final

    elif l_type == 'SnorkelCategorise': # For categorising tweets (Step 3)
        L_final = get_L_final_categorise(L_train)

        L_final_with_names = dict(zip(keyword_groups.keys(), L_final))
        for name, L_values in L_final_with_names.items():
            df[name] = L_values

    analysis = LFAnalysis(L=L_train, lfs=lfs).lf_summary()

    #return L_train, L_final, df, analysis
    return df, analysis
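A hedged usage sketch for the function above, based only on its docstring: the toy tweets and keyword groups are illustrative, and make_keyword_lf, get_L_final_filter, and get_L_final_categorise are assumed to be defined elsewhere in the same module.

import pandas as pd

# Toy tweets with the columns the docstring expects: [id, text].
tweets = pd.DataFrame({
    "id": [1, 2, 3],
    "text": ["usps lost my package again",
             "time to rebalance my stock portfolio",
             "nice weather today"],
})

keyword_groups = {
    "usps": ["postal service", "usps"],
    "invest": ["invest", "portfolio", "stock"],
}

# Step 3 in the docstring's terms: add one label column per keyword group.
categorised_df, lf_stats = startSnorkelLabeling(
    tweets, keyword_groups=keyword_groups, l_type="SnorkelCategorise")
print(lf_stats)  # LFAnalysis summary, one row per keyword LF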
Example #7
    def test_lf_summary(self) -> None:
        df = self.lfa.lf_summary(self.Y, est_weights=None)
        df_expected = pd.DataFrame(
            {
                "Polarity": [[1, 2], [], [0, 2], [2], [0, 1], [0]],
                "Coverage": [3 / 6, 0, 3 / 6, 2 / 6, 2 / 6, 4 / 6],
                "Overlaps": [3 / 6, 0, 3 / 6, 1 / 6, 2 / 6, 4 / 6],
                "Conflicts": [3 / 6, 0, 2 / 6, 1 / 6, 2 / 6, 3 / 6],
                "Correct": [1, 0, 1, 1, 1, 2],
                "Incorrect": [2, 0, 2, 1, 1, 2],
                "Emp. Acc.": [1 / 3, 0, 1 / 3, 1 / 2, 1 / 2, 2 / 4],
            }
        )
        pd.testing.assert_frame_equal(df.round(6), df_expected.round(6))

        df = self.lfa.lf_summary(Y=None, est_weights=None)
        df_expected = pd.DataFrame(
            {
                "Polarity": [[1, 2], [], [0, 2], [2], [0, 1], [0]],
                "Coverage": [3 / 6, 0, 3 / 6, 2 / 6, 2 / 6, 4 / 6],
                "Overlaps": [3 / 6, 0, 3 / 6, 1 / 6, 2 / 6, 4 / 6],
                "Conflicts": [3 / 6, 0, 2 / 6, 1 / 6, 2 / 6, 3 / 6],
            }
        )
        pd.testing.assert_frame_equal(df.round(6), df_expected.round(6))

        est_weights = [1, 0, 1, 1, 1, 0.5]
        names = list("abcdef")
        lfs = [LabelingFunction(s, f) for s in names]
        lfa = LFAnalysis(np.array(L), lfs)
        df = lfa.lf_summary(self.Y, est_weights=est_weights)
        df_expected = pd.DataFrame(
            {
                "j": [0, 1, 2, 3, 4, 5],
                "Polarity": [[1, 2], [], [0, 2], [2], [0, 1], [0]],
                "Coverage": [3 / 6, 0, 3 / 6, 2 / 6, 2 / 6, 4 / 6],
                "Overlaps": [3 / 6, 0, 3 / 6, 1 / 6, 2 / 6, 4 / 6],
                "Conflicts": [3 / 6, 0, 2 / 6, 1 / 6, 2 / 6, 3 / 6],
                "Correct": [1, 0, 1, 1, 1, 2],
                "Incorrect": [2, 0, 2, 1, 1, 2],
                "Emp. Acc.": [1 / 3, 0, 1 / 3, 1 / 2, 1 / 2, 2 / 4],
                "Learned Weight": [1, 0, 1, 1, 1, 0.5],
            }
        ).set_index(pd.Index(names))
        pd.testing.assert_frame_equal(df.round(6), df_expected.round(6))
Example #8
    def analyze_lfs(self):
        if len(self.lfs) > 0:
            df = LFAnalysis(L=self.L_train, lfs=self.get_lfs()).lf_summary()
            dev_df = LFAnalysis(L=self.L_dev,
                                lfs=self.get_lfs()).lf_summary(Y=self.Y_dev)
            df = df.merge(dev_df,
                          how="outer",
                          suffixes=(" Training", " Dev."),
                          left_index=True,
                          right_index=True)
            df["Weight"] = self.label_model.get_weights()
            df["Duplicate"] = None
            for dupe, OG in self.find_duplicate_signature().items():
                print("Duplicate labeling signature detected")
                print(dupe, OG)
                df.at[dupe, "Duplicate"] = OG

            return df
        return None
Example #9
def createAnalysis(final_df, category_names):

    L_final = []
    for name in category_names:
        category_if = [-1 if i == 0 else i for i in final_df[name].tolist()]
        L_final.append(category_if)

    L_train = [list(x) for x in list(zip(*L_final))]
    lfs = [LabelingFunction(name=name, f=None) for name in category_names]

    return LFAnalysis(L=np.array(L_train), lfs=lfs).lf_summary()
Example #10
def main():
    lfs = [lf_contains_link, lf_contains_co, lf_contains_sub]
    baseApp = LFApplier(lfs)
    labels = baseApp.apply(src)
    print(labels)
    print(LFAnalysis(labels, lfs).lf_summary())
    buckets = get_label_buckets(labels[:, 0], labels[:, 1])
    print(buckets)

    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(labels, n_epochs=500, log_freq=50, seed=123)
    pred_labels = label_model.predict(L=labels, tie_break_policy="abstain")
    print(pred_labels)
Example #11
    def _test_generate_L(self, k: int, decimal: Optional[int] = 2) -> None:
        """Test generated label matrix L for consistency with P, Y.

        This tests for consistency between the true conditional LF probabilities, P,
        and the empirical ones computed from L and Y, where P, L, and Y are generated
        by the generate_simple_label_matrix function.

        Parameters
        ----------
        k
            Cardinality
        decimal
            Number of decimals to check element-wise error, err < 1.5 * 10**(-decimal)
        """
        np.random.seed(123)
        P, Y, L = generate_simple_label_matrix(self.n, self.m, k)
        P_emp = LFAnalysis(L).lf_empirical_probs(Y, k=k)
        np.testing.assert_array_almost_equal(P, P_emp, decimal=decimal)
Example #12
def labeling_evaluation(df_train, df_test, label_model):
    lfs = [
        LabelingFunction.lf_ind_keyword, LabelingFunction.lf_short,
        LabelingFunction.lf_cmp_re, LabelingFunction.lf_industry_keyword,
        LabelingFunction.lf_surname_re, LabelingFunction.industry_cls
    ]

    applier = PandasLFApplier(lfs=lfs)
    L_train = applier.apply(df=df_train)
    L_test = applier.apply(df=df_test)
    Y_test = df_test.label.values
    analysis = LFAnalysis(L=L_train, lfs=lfs).lf_summary()

    if label_model == "majority":
        majority_model = MajorityLabelVoter()
        preds_train = majority_model.predict(L=L_train)
        majority_acc = majority_model.score(
            L=L_test, Y=Y_test, tie_break_policy="random")["accuracy"]
        print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")

        df_train_filtered, preds_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=preds_train, L=L_train)
        return df_train_filtered, preds_train_filtered, analysis

    if label_model == "weighted":
        label_model = LabelModel(cardinality=len(
            [c for c in dir(Polarity) if not c.startswith("__")]),
                                 verbose=True)
        label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)
        probs_train = label_model.predict_proba(L_train)
        label_model_acc = label_model.score(
            L=L_test, Y=Y_test, tie_break_policy="random")["accuracy"]
        print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")

        df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=probs_train, L=L_train)
        preds_train_filtered = probs_to_preds(probs_train_filtered)
        return df_train_filtered, probs_train_filtered, preds_train_filtered, analysis
class TestAnalysis(unittest.TestCase):
    def setUp(self) -> None:
        self.lfa = LFAnalysis(np.array(L))
        self.lfa_wo_abstain = LFAnalysis(np.array(L_wo_abstain))
        self.Y = np.array(Y)

    def test_label_coverage(self) -> None:
        self.assertEqual(self.lfa.label_coverage(), 5 / 6)

    def test_label_overlap(self) -> None:
        self.assertEqual(self.lfa.label_overlap(), 4 / 6)

    def test_label_conflict(self) -> None:
        self.assertEqual(self.lfa.label_conflict(), 3 / 6)

    def test_lf_polarities(self) -> None:
        polarities = self.lfa.lf_polarities()
        self.assertEqual(polarities, [[1, 2], [], [0, 2], [2], [0, 1], [0]])

    def test_lf_coverages(self) -> None:
        coverages = self.lfa.lf_coverages()
        coverages_expected = [3 / 6, 0, 3 / 6, 2 / 6, 2 / 6, 4 / 6]
        np.testing.assert_array_almost_equal(coverages,
                                             np.array(coverages_expected))

    def test_lf_overlaps(self) -> None:
        overlaps = self.lfa.lf_overlaps(normalize_by_coverage=False)
        overlaps_expected = [3 / 6, 0, 3 / 6, 1 / 6, 2 / 6, 4 / 6]
        np.testing.assert_array_almost_equal(overlaps,
                                             np.array(overlaps_expected))

        overlaps = self.lfa.lf_overlaps(normalize_by_coverage=True)
        overlaps_expected = [1, 0, 1, 1 / 2, 1, 1]
        np.testing.assert_array_almost_equal(overlaps,
                                             np.array(overlaps_expected))

    def test_lf_conflicts(self) -> None:
        conflicts = self.lfa.lf_conflicts(normalize_by_overlaps=False)
        conflicts_expected = [3 / 6, 0, 2 / 6, 1 / 6, 2 / 6, 3 / 6]
        np.testing.assert_array_almost_equal(conflicts,
                                             np.array(conflicts_expected))

        conflicts = self.lfa.lf_conflicts(normalize_by_overlaps=True)
        conflicts_expected = [1, 0, 2 / 3, 1, 1, 3 / 4]
        np.testing.assert_array_almost_equal(conflicts,
                                             np.array(conflicts_expected))

    def test_lf_empirical_accuracies(self) -> None:
        accs = self.lfa.lf_empirical_accuracies(self.Y)
        accs_expected = [1 / 3, 0, 1 / 3, 1 / 2, 1 / 2, 2 / 4]
        np.testing.assert_array_almost_equal(accs, np.array(accs_expected))

    def test_lf_empirical_probs(self) -> None:
        P_emp = self.lfa.lf_empirical_probs(self.Y, 3)
        P = np.array([
            [[1 / 2, 1, 0], [0, 0, 0], [1 / 2, 0, 1 / 2], [0, 0, 1 / 2]],
            [[1, 1, 1], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
            [[0, 1, 1 / 2], [1 / 2, 0, 1 / 2], [0, 0, 0], [1 / 2, 0, 0]],
            [[1, 1 / 2, 1 / 2], [0, 0, 0], [0, 0, 0], [0, 1 / 2, 1 / 2]],
            [[1 / 2, 1, 1 / 2], [1 / 2, 0, 0], [0, 0, 1 / 2], [0, 0, 0]],
            [[0, 1, 0], [1, 0, 1], [0, 0, 0], [0, 0, 0]],
        ])
        np.testing.assert_array_almost_equal(P, P_emp)

    def test_lf_summary(self) -> None:
        df = self.lfa.lf_summary(self.Y, est_weights=None)
        df_expected = pd.DataFrame({
            "Polarity": [[1, 2], [], [0, 2], [2], [0, 1], [0]],
            "Coverage": [3 / 6, 0, 3 / 6, 2 / 6, 2 / 6, 4 / 6],
            "Overlaps": [3 / 6, 0, 3 / 6, 1 / 6, 2 / 6, 4 / 6],
            "Conflicts": [3 / 6, 0, 2 / 6, 1 / 6, 2 / 6, 3 / 6],
            "Correct": [1, 0, 1, 1, 1, 2],
            "Incorrect": [2, 0, 2, 1, 1, 2],
            "Emp. Acc.": [1 / 3, 0, 1 / 3, 1 / 2, 1 / 2, 2 / 4],
        })
        pd.testing.assert_frame_equal(df.round(6), df_expected.round(6))

        df = self.lfa.lf_summary(Y=None, est_weights=None)
        df_expected = pd.DataFrame({
            "Polarity": [[1, 2], [], [0, 2], [2], [0, 1], [0]],
            "Coverage": [3 / 6, 0, 3 / 6, 2 / 6, 2 / 6, 4 / 6],
            "Overlaps": [3 / 6, 0, 3 / 6, 1 / 6, 2 / 6, 4 / 6],
            "Conflicts": [3 / 6, 0, 2 / 6, 1 / 6, 2 / 6, 3 / 6],
        })
        pd.testing.assert_frame_equal(df.round(6), df_expected.round(6))

        est_weights = [1, 0, 1, 1, 1, 0.5]
        names = list("abcdef")
        lfs = [LabelingFunction(s, f) for s in names]
        lfa = LFAnalysis(np.array(L), lfs)
        df = lfa.lf_summary(self.Y, est_weights=est_weights)
        df_expected = pd.DataFrame({
            "j": [0, 1, 2, 3, 4, 5],
            "Polarity": [[1, 2], [], [0, 2], [2], [0, 1], [0]],
            "Coverage": [3 / 6, 0, 3 / 6, 2 / 6, 2 / 6, 4 / 6],
            "Overlaps": [3 / 6, 0, 3 / 6, 1 / 6, 2 / 6, 4 / 6],
            "Conflicts": [3 / 6, 0, 2 / 6, 1 / 6, 2 / 6, 3 / 6],
            "Correct": [1, 0, 1, 1, 1, 2],
            "Incorrect": [2, 0, 2, 1, 1, 2],
            "Emp. Acc.": [1 / 3, 0, 1 / 3, 1 / 2, 1 / 2, 2 / 4],
            "Learned Weight": [1, 0, 1, 1, 1, 0.5],
        }).set_index(pd.Index(names))
        pd.testing.assert_frame_equal(df.round(6), df_expected.round(6))

    def test_wrong_number_of_lfs(self) -> None:
        with self.assertRaisesRegex(ValueError, "Number of LFs"):
            LFAnalysis(np.array(L), [LabelingFunction(s, f) for s in "ab"])

    def test_lf_summary_without_abstain(self) -> None:
        df = self.lfa_wo_abstain.lf_summary(self.Y + 4, est_weights=None)
        df_expected = pd.DataFrame({
            "Polarity": [[3, 4, 5], [3, 4], [3, 4, 5], [4, 5], [3, 4, 5], [3]],
            "Coverage": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
            "Overlaps": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
            "Conflicts": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
            "Correct": [1, 1, 1, 3, 1, 0],
            "Incorrect": [5, 5, 5, 3, 5, 6],
            "Emp. Acc.": [1 / 6, 1 / 6, 1 / 6, 3 / 6, 1 / 6, 0],
        })
        pd.testing.assert_frame_equal(df.round(6), df_expected.round(6))
        pred_train_mv = majority_model.predict(L=L_train)
        pred_train_lm = label_model.predict(L=L_train)

        # Calculate accuracy
        majority_acc = majority_model.score(
            L_valid, Y_valid, tie_break_policy="random")["accuracy"]
        print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")
        # majorityvote_summary = majority_model.score(L_valid,Y_valid,tie_break_policy="random", metrics=["accuracy"])

        labelmodel_acc = label_model.score(
            L_valid, Y_valid, tie_break_policy="random")["accuracy"]
        print(f"{'Label Model Accuracy:':<25} {labelmodel_acc * 100:.1f}%")
        # labelmodel_summary = [label_model.score(L_valid,Y_valid,tie_break_policy="random", metrics=["accuracy"])]

        # Store values in list
        acc_mv = majority_acc
        lst_acc_mv.append(acc_mv)
        acc_lm = labelmodel_acc
        lst_acc_lm.append(acc_lm)

        # Labeling function accuracy
        lf_accuracies = LFAnalysis(L=L_valid,
                                   lfs=lfs).lf_empirical_accuracies(Y_valid)
        print(lf_accuracies)
        lf_acc.append(lf_accuracies)

        i = i + 1

    mean_acc_mv.append(statistics.mean(lst_acc_mv))
    mean_acc_lm.append(statistics.mean(lst_acc_lm))
    keywords_neg=["bad", "worst", "horrible", "awful", "terrible", "crap", "shit", "garbage", "rubbish", "waste"])
keyword_actor = make_keyword_lf(name="keyword_actor", 
    keywords_pos=["beautiful", "handsome", "talented"], 
    keywords_neg=[])
keyword_finish = make_keyword_lf(name="keyword_finish", 
    keywords_pos=[], 
    keywords_neg=["fast forward", "n t finish"])
keyword_plot = make_keyword_lf(name="keyword_plot", 
    keywords_pos=["well written", "absorbing", "attractive", "innovative", "instructive", "interesting", "touching", "moving"], 
    keywords_neg=["to sleep", "fell asleep", "boring", "dull", "plain"])
keyword_compare = make_keyword_lf(name="keyword_compare", 
    keywords_pos=[], 
    keywords_neg=[" than this", " than the film", " than the movie"])

lfs = [
    expression_nexttime,
    expression_recommend,
    expression_value,
    keyword_compare,
    keyword_general,
    keyword_actor,
    keyword_finish,
    keyword_plot
]

applier = PandasLFApplier(lfs=lfs)
#L_train = applier.apply(df=df_train)    
L_dev = applier.apply(df=df_dev)
print("LF_analysis")
print(LFAnalysis(L=L_dev, lfs=lfs).lf_summary(Y=Y_dev))
Example #18
def label_user(inp_path, prefix=""):
    df_train = pd.read_pickle(inp_path)

    ########## threshold on word similarity
    take_first = 100
    overall_first = 10000
    global thresh_by_value, overall_thresh
    df_train['root_value'] = df_train['value'].swifter.set_dask_threshold(
        dask_threshold=0.001).allow_dask_on_strings().apply(
            lambda x: syn_to_hob[x])
    thresh_by_value = df_train.groupby(
        ["root_value"]).apply(lambda x: np.partition(
            x['lexicon_counts'], max(len(x['lexicon_counts']) - take_first, 0)
        )[max(len(x['lexicon_counts']) - take_first, 0)]).to_dict()
    overall_thresh = np.partition(df_train["lexicon_counts"].to_numpy(),
                                  max(len(df_train) - overall_first, 0))[max(
                                      len(df_train) - overall_first, 0)]
    print(overall_thresh)
    #############################

    # separately loose - strict, pos - neg, period - without
    names_pool = [
        "context:2_count_pos", "context:3_count_pos", "context:100_count_pos",
        "context:2_period_count_pos", "context:3_period_count_pos",
        "context:100_period_count_pos", "context:2_count_neg",
        "context:3_count_neg", "context:100_count_neg",
        "context:2_period_count_neg", "context:3_period_count_neg",
        "context:100_period_count_neg"
    ]
    for f_name in names_pool:
        curr_cols = [x for x in df_train.columns if f_name in x]
        df_train['total_' + f_name] = df_train[curr_cols].swifter.apply(sum,
                                                                        axis=1)
        df_train = df_train.drop(curr_cols, axis=1)
    for p in ["pos", "neg"]:
        df_train["new_total_context:100_count_" + p] = df_train[[
            "total_context:100_count_" + p, "total_context:3_count_" + p
        ]].swifter.apply(lambda x: max(
            0, x["total_context:100_count_" + p] - x["total_context:3_count_" +
                                                     p]),
                         axis=1)
        df_train["new_total_context:3_count_" + p] = df_train[[
            "total_context:3_count_" + p, "total_context:2_count_" + p
        ]].swifter.apply(lambda x: max(
            0, x["total_context:3_count_" + p] - x["total_context:2_count_" + p
                                                   ]),
                         axis=1)
        df_train["new_total_context:100_period_count_" + p] = df_train[[
            "total_context:3_period_count_" + p,
            "total_context:100_period_count_" + p
        ]].swifter.apply(lambda x: max(
            0, x["total_context:100_period_count_" + p] - x[
                "total_context:3_period_count_" + p]),
                         axis=1)
        df_train["new_total_context:3_period_count_" + p] = df_train[[
            "total_context:3_period_count_" + p,
            "total_context:2_period_count_" + p
        ]].swifter.apply(lambda x: max(
            0, x["total_context:3_period_count_" + p] - x[
                "total_context:2_period_count_" + p]),
                         axis=1)
        df_train["new_total_context:2_count_" + p] = df_train[[
            "total_context:100_period_count_" + p, "total_context:2_count_" + p
        ]].swifter.apply(lambda x: max(
            0, x["total_context:2_count_" + p] - x[
                "total_context:100_period_count_" + p]),
                         axis=1)

    df_train = df_train.drop(
        ["total_" + x for x in names_pool if "2_period_count" not in x],
        axis=1)

    lfs = [val_in_name, positive_lexicon_overall, positive_lexicon_pervalue]
    num_of_thresholds = 3
    step = 100 // num_of_thresholds

    for col in df_train:
        if col not in ["author", "value", "idd", "root_value"]:
            if col not in [
                    "pos_prob_mean", "neg_prob_mean", "num_good_posts"
            ]:  # , "lexicon_counts", "subreddit_counts", "name_in_subr_count"]:
                thresholds = [0]
                if "lexicon" in col and "unique" not in col:
                    continue
                if True:  # col in ["lexicon_counts", "unique_lexicon_counts"]:
                    vals = df_train[col].to_numpy()
                    thresholds = np.percentile(
                        vals, list(range(0 + step, 99 + step,
                                         step))).astype(int)
                    thresholds = sorted(list(set(thresholds)))
                    if len(thresholds) > 1:
                        thresholds = thresholds[:-1]
                    if "lexicon" in col:
                        thresholds = [3]
                    # max_val = max(vals)
                    # thresholds = list(range(0, int(max_val), int(max_val/5) + 1))
                # elif col == "pos_prob_mean":
                #    thresholds = [0.5 + 0.1 * x for x in range(5)]
                for i in range(len(thresholds)):
                    thresh = thresholds[i]
                    next_threshold = sys.maxsize if i == len(
                        thresholds) - 1 else thresholds[i + 1]
                    previous_threshold = -sys.maxsize if i == 0 else thresholds[
                        i - 1]
                    if "lexicon_counts" not in col:
                        lfs.append(
                            make_thresold_lf(thresh=thresh,
                                             col_name=col,
                                             next_threshold=next_threshold))
                    else:
                        lfs.append(
                            make_lexicon_lf(
                                thresh=thresh,
                                pref=col,
                                previous_threshold=previous_threshold))

    num_annotators = 0
    if num_annotators > 0:
        for i in range(1, num_annotators + 1):
            lfs.append(make_annotator_lf(worker_index=i))

    lfs = [
        x for x in lfs
        if any(y in str(x) for y in ["less", "context:2", "worker", "lexicon"])
    ]
    print("created lfs their number", len(lfs))
    print("\n".join(str(x) for x in lfs))

    #### validation #####
    do_val = False
    if do_val:
        df_golden = pd.read_csv(
            "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/gold_dev.csv"
        )
        name_val = list(df_golden["auth_val"])
        # df_train['root_value'] = df_train['value'].swifter.apply(lambda x: syn_to_hob[x])
        df_train["auth_val"] = df_train[["author", "value"]].swifter.apply(
            lambda x: x["author"] + "+++" + x["value"], axis=1)
        df_val = df_train[df_train.auth_val.isin(name_val)]
        df_dev = df_train[~df_train.auth_val.isin(name_val)]
        print("Number val", df_val.shape)
        print("Number dev", df_dev.shape)
        df_val = df_val.merge(df_golden, on="auth_val")
        y_val = np.array(df_val["final"])
        df_val = df_val.drop(labels="final", axis=1)
        # create test set as well

        with TQDMDaskProgressBar(desc="Dask Apply"):
            applier = PandasParallelLFApplier(lfs=lfs)
            L_val = applier.apply(df=df_val, n_parallel=num_cpu)
            L_dev = applier.apply(df=df_dev, n_parallel=num_cpu)

        dev_analysis = LFAnalysis(L=L_dev, lfs=lfs).lf_summary()
        analysis = LFAnalysis(L=L_val, lfs=lfs).lf_summary(y_val)
        analysis.to_csv("/home/tigunova/val_analysis.csv")
        dev_analysis.to_csv("/home/tigunova/dev_analysis.csv")
        print(analysis)
        label_model = LabelModel(cardinality=2, verbose=True)
        label_model.fit(L_dev)  #, Y_dev=y_val)
        model_stat = label_model.score(L=L_val, Y=y_val)
        print(model_stat)
        exit(0)
    ###########

    #### picking threshold #####
    do_threshold = False
    if do_threshold:
        df_golden = pd.read_csv(
            "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/gold_validation.csv"
        )
        name_val = list(df_golden["auth_val"])
        # df_train['root_value'] = df_train['value'].swifter.apply(lambda x: syn_to_hob[x])
        df_train["auth_val"] = df_train[["author", "value"]].swifter.apply(
            lambda x: x["author"] + "+++" + x["value"], axis=1)
        df_val = df_train[df_train.auth_val.isin(name_val)]
        df_dev = df_train[~df_train.auth_val.isin(name_val)]
        pop_size = df_dev.shape[0]
        print("Number val", df_val.shape)
        print("Number dev", df_dev.shape)
        applier = PandasParallelLFApplier(lfs=lfs)
        df_val = df_val.merge(df_golden, on="auth_val")
        L_val = applier.apply(df=df_val, n_parallel=num_cpu)
        val_thresholds = [0.01 * x for x in range(100)]
        label_model = LabelModel(cardinality=2, verbose=True)
        with TQDMDaskProgressBar(desc="Dask Apply"):
            L_dev = applier.apply(df=df_dev, n_parallel=num_cpu)
            label_model.fit(L_dev, class_balance=[0.5, 0.5])  # , Y_dev=y_val)
            wghts = label_model.get_weights()
            print("\n".join(str(x) for x in zip(lfs, wghts)))
            probs_val = label_model.predict_proba(L=L_val)
            probs_df = pd.DataFrame(probs_val,
                                    columns=["neg_prob", "pos_prob"])
            df_val = pd.concat([df_val.reset_index(), probs_df], axis=1)
            probs_dev = label_model.predict_proba(L=L_dev)
            probs_df = pd.DataFrame(probs_dev,
                                    columns=["neg_prob", "pos_prob"])
            df_dev = pd.concat([df_dev.reset_index(), probs_df], axis=1)
            y_true = np.array(df_val["final"])
        for th in val_thresholds:
            y_pred = np.array(
                df_val["pos_prob"].apply(lambda x: 1 if x > th else 0))
            #print("true negatives")
            #print(df_val[df_val["final"] == 1][df_val["pos_prob"] <= th][["auth_val", "text"]])
            prec = precision_score(y_true, y_pred)

            pred_labels = y_pred
            true_labels = y_true
            # True Positive (TP): we predict a label of 1 (positive), and the true label is 1.
            TP = np.sum(np.logical_and(pred_labels == 1, true_labels == 1))

            # True Negative (TN): we predict a label of 0 (negative), and the true label is 0.
            TN = np.sum(np.logical_and(pred_labels == 0, true_labels == 0))

            # False Positive (FP): we predict a label of 1 (positive), but the true label is 0.
            FP = np.sum(np.logical_and(pred_labels == 1, true_labels == 0))

            # False Negative (FN): we predict a label of 0 (negative), but the true label is 1.
            FN = np.sum(np.logical_and(pred_labels == 0, true_labels == 1))

            print('TP: %i, FP: %i, TN: %i, FN: %i' % (TP, FP, TN, FN))

            # print(list(zip(label_model.predict(L=L_val_curr), y_val_curr)))
            # print("******************************")
            print("threshold %s, proportion population %.4f, precision %s" %
                  (str(th), df_dev[df_dev["pos_prob"] > th].shape[0] /
                   pop_size, str(prec)))
        exit(0)
    ###########

    with TQDMDaskProgressBar(desc="Dask Apply"):
        applier = PandasParallelLFApplier(lfs=lfs)
        L_train = applier.apply(df=df_train, n_parallel=num_cpu)

    analysis = LFAnalysis(L=L_train, lfs=lfs).lf_summary()
    print(analysis)

    df_l_train = pd.DataFrame(
        L_train, columns=["llf_" + str(x).split(",")[0] for x in lfs])
    print(df_train.shape)
    print(df_l_train.shape)
    df_train = pd.concat([df_train.reset_index(), df_l_train], axis=1)
    print(df_train.shape)
    print("********************************************")

    t4 = time.time()
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train=L_train,
                    n_epochs=1000,
                    lr=0.001,
                    log_freq=100,
                    seed=123,
                    class_balance=[0.3, 0.7])

    probs_train = label_model.predict_proba(L=L_train)
    print("labeling model work ", (time.time() - t4) / 60)

    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=df_train, y=probs_train, L=L_train)

    probs_df = pd.DataFrame(probs_train_filtered,
                            columns=["neg_prob", "pos_prob"])
    print(df_train_filtered.shape)
    print(probs_df.shape)
    result_filtered = pd.concat([
        df_train_filtered[['author', 'value', 'idd']].reset_index(), probs_df
    ],
                                axis=1)
    print(result_filtered.shape)
    print("****************************************************")

    result_filtered.to_csv("/home/tigunova/some_result_" + prefix + ".csv")

    print(df_train_filtered.shape)
    print(probs_df.shape)
    df_train_filtered = pd.concat([df_train_filtered.reset_index(), probs_df],
                                  axis=1)
    df_train_filtered = df_train_filtered.drop(["index"], axis=1)
    print(df_train_filtered.shape)
    df_train_filtered.to_pickle(
        "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/user_" +
        prefix + ".pkl")
    df_train_filtered.to_csv(
        "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/user_" +
        prefix + ".csv")

    # df_train.iloc[L_train[:, 1] == POS].to_csv("/home/tigunova/PycharmProjects/snorkel_labels/data/user_" + prefix + ".csv")

    ### write dict
    output_threshold = 0.63
    output_dict = defaultdict(list)
    auth_hobby_dict = defaultdict(list)
    for index, row in result_filtered.iterrows():
        if row.value == row.value and row.author == row.author:
            auth_hobby_dict[row.author].append([row.value, row.pos_prob])

    allowed_labels = []
    for index, row in df_train_filtered.iterrows():
        if row.value == row.value and row.author == row.author:
            if row.pos_prob > output_threshold:
                output_dict[row.author].append([row.value] + row.idd +
                                               [row.pos_prob])
                allowed_labels.append(syn_to_hob[row.value])
    print("\n".join([
        str(y) for y in sorted(dict(Counter(allowed_labels)).items(),
                               key=lambda x: x[1])
    ]))
    print(
        "After cropping",
        sum([
            x if x < 500 else 500
            for x in dict(Counter(allowed_labels)).values()
        ]))
    print("users in total", len(output_dict))
    for auth, stuffs in output_dict.items():
        prof = ":::".join(set([x[0] for x in stuffs]))
        prob = ":::".join([str(x[-1]) for x in stuffs])
        msgs = set([x for l in stuffs for x in l[1:-1]])
        output_dict[auth] = [prof] + list(msgs) + [prob]

    with open(
            "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/sources/final_author_dict_"
            + prefix + ".txt", "w") as f_out:
        f_out.write(repr(dict(auth_hobby_dict)))
    with open("/home/tigunova/users_profession1.txt", "w") as f_out:
        f_out.write(repr(dict(output_dict)))
Example #19
# %% {"tags": ["md-exclude-output"]}
from snorkel.labeling import PandasLFApplier, LFAnalysis

lfs = [
    stars_in_review,
    shared_first_author,
    polarity_positive,
    subjectivity_positive,
    polarity_negative,
]

applier = PandasLFApplier(lfs)
L_dev = applier.apply(df_dev)

# %%
LFAnalysis(L_dev, lfs).lf_summary(df_dev.rating.values)

# %% [markdown]
# ### Applying labeling functions to the training set
#
# We apply the labeling functions to the training set, and then filter out data points unlabeled by any LF to form our final training set.

# %% {"tags": ["md-exclude-output"]}
from snorkel.labeling.model.label_model import LabelModel

L_train = applier.apply(df_train)
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, n_epochs=5000, seed=123, log_freq=20, lr=0.01)
preds_train = label_model.predict(L_train)
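
# %% [markdown]
# A hedged sketch of the filtering step described above (not part of the original
# snippet): assuming `df_train`, `L_train`, and `preds_train` from the previous cell,
# drop the data points on which every LF abstained before training downstream models.

# %%
from snorkel.labeling import filter_unlabeled_dataframe

df_train_filtered, preds_train_filtered = filter_unlabeled_dataframe(
    X=df_train, y=preds_train, L=L_train
)
print(f"Kept {len(df_train_filtered)} of {len(df_train)} training points")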

# %% {"tags": ["md-exclude-output"]}
    lf_carry_subject,
    lf_not_person,
    lf_ydist,
    lf_dist,
    lf_area,
]

applier = PandasLFApplier(lfs)
L_train = applier.apply(df_train)
L_valid = applier.apply(df_valid)

# %%
from snorkel.labeling import LFAnalysis

Y_valid = df_valid.label.values
LFAnalysis(L_valid, lfs).lf_summary(Y_valid)

# %% [markdown]
# ## 3. Train Label Model
# We now train a multi-class `LabelModel` to assign training labels to the unlabeled training set.

# %%
from snorkel.labeling.model import LabelModel

label_model = LabelModel(cardinality=3, verbose=True)
label_model.fit(L_train, seed=123, lr=0.01, log_freq=10, n_epochs=100)

# %% [markdown]
# We use [F1](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html) Micro average for the multiclass setting, which calculates metrics globally across classes, by counting the total true positives, false negatives and false positives.

# %%
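# A hedged sketch of the micro-averaged F1 evaluation described above (not part of
# the original snippet): assumes `label_model`, `L_valid`, and `Y_valid` from the
# earlier cells; ties are broken randomly so no abstain (-1) predictions remain.
from sklearn.metrics import f1_score

preds_valid = label_model.predict(L_valid, tie_break_policy="random")
print("Label model micro F1:", f1_score(Y_valid, preds_valid, average="micro"))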
Example #21
    return LEAVE if vote_found or stay_found else ABSTAIN


if __name__ == "__main__":
    data_dir = 'data'
    data_file = 'train_test_dataset.csv'
    file = os.path.join(data_dir, data_file)
    data = pd.read_csv(file, usecols=[0, 1])

    train_df, test_df = train_test_split(data, test_size=0.2, shuffle=True, random_state=42)

    lfs = [usual_hashtags_stay, usual_texts_stay, usual_texts_leave, usual_hashtags_leave]
    applier = PandasLFApplier(lfs=lfs)
    L_train = applier.apply(train_df)
    print(LFAnalysis(L_train, lfs=lfs).lf_summary())

    save_file = os.path.join(data_dir, 'ground_truth_matrix')
    np.save(save_file, L_train, allow_pickle=True)

    # golden_labels = []
    # for y in train_df.label:
    #     if y == 'leave':
    #         golden_labels.append(LEAVE)
    #     if y == 'stay':
    #         golden_labels.append(STAY)
    #
    # golden_labels = np.asarray(golden_labels)
    # save_file = os.path.join(data_dir, 'ground_truth')
    # np.save(save_file, golden_labels, allow_pickle=True)
Example #22

lfs = [
    ai_positive_lf, proceedings_lf, generative_lf, synthesis_lf, gaussian_lf,
    has_company_lf, any_ai_lf, has_comparison_lf
]

###############################################################
## APPLYING LABELLING FUNCTIONS TO TRAIN & DEV SETS
###############################################################

applier = PandasLFApplier(lfs=lfs)
processed_train_data = applier.apply(data)
processed_dev_data = applier.apply(data)
logging.info("applied labelling functions to scraped data")
print(LFAnalysis(L=processed_train_data, lfs=lfs).lf_summary())

###############################################################
## FITTING THE GENERATIVE MODEL AND PREDICTING
###############################################################
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=processed_train_data,
                n_epochs=500,
                log_freq=100,
                seed=123)

pred_LM_train = label_model.predict(processed_dev_data)
logging.info("generated noisy labels")
logging.info("writing to DataFrame")
Example #23
# * **Polarity**: The set of unique labels this LF outputs (excluding abstains)
# * **Coverage**: The fraction of the dataset the LF labels
# * **Overlaps**: The fraction of the dataset where this LF and at least one other LF label
# * **Conflicts**: The fraction of the dataset where this LF and at least one other LF label and disagree
# * **Correct**: The number of data points this LF labels correctly (if gold labels are provided)
# * **Incorrect**: The number of data points this LF labels incorrectly (if gold labels are provided)
# * **Empirical Accuracy**: The empirical accuracy of this LF (if gold labels are provided)
#
# For *Correct*, *Incorrect*, and *Empirical Accuracy*, we don't want to penalize the LF for data points where it abstained.
# We calculate these statistics only over those data points where the LF output a label.
# Since we have labels for the `dev` set but not the `train` set, we'll compute these statistics for the `dev` set only by supplying `Y_dev`.

# %%
from snorkel.labeling import LFAnalysis

LFAnalysis(L=L_train, lfs=lfs).lf_summary()

# %%
LFAnalysis(L=L_dev, lfs=lfs).lf_summary(Y=Y_dev)

# %% [markdown]
# So even these very simple rules do quite well!
# We might want to pick the `check` rule, since both have high precision and `check` has higher coverage.
# But let's look at our data to be sure.
#
# The helper method `get_label_buckets(...)` groups data points by their predicted label and true label.
# For example, we can find the indices of data points that the LF labeled `SPAM` that actually belong to class `HAM`.
# This may give ideas for where the LF could be made more specific.

# %%
from snorkel.analysis import get_label_buckets
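
# A hedged sketch of the bucket inspection described above (not part of the original
# snippet): assumes `L_dev`, `Y_dev`, `df_dev`, and the SPAM/HAM label constants from
# this tutorial's earlier cells.
buckets = get_label_buckets(Y_dev, L_dev[:, 0])
# Data points the first LF labeled SPAM that are actually labeled HAM in the dev set:
df_dev.iloc[buckets[(HAM, SPAM)]].head()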
    def run_labeling_functions(cands):
        ABSTAIN = -1
        FALSE = 0
        TRUE = 1
        # Extract candidates
        train_cands = cands[0]
        dev_cands = cands[1]
        test_cands = cands[2] 

        @labeling_function()
        def LF_other_station_table(c):
            station_span = c.station.context.get_span().lower()
            neighbour_cells = get_neighbor_cell_ngrams_own(c.price, dist=100, directions=True, n_max = 4, absolute = True)
            up_cells = [x for x in neighbour_cells if len(x) > 1 and x[1] == 'DOWN' and x[0] in stations_list]
            # No station name in upper cells
            if (len(up_cells) == 0):
                return ABSTAIN
            # Check if the next upper aligned station-span corresponds to the candidate span (or equivalents)
            closest_header = up_cells[len(up_cells)-1]
            return TRUE if closest_header[0] in stations_mapping_dict[station_span] else FALSE

        @labeling_function()
        def LF_station_non_meta_tag(c):
            html_tags = get_ancestor_tag_names(c.station)
            return FALSE if ('head' in html_tags and 'title' in html_tags) else ABSTAIN

        # Basic constraint for the price LFs to be true -> no wrong station (increase accuracy)
        def base(c):
            return (
                LF_station_non_meta_tag(c) != 0 and 
                LF_other_station_table(c) != 0 and 
                LF_off_peak_head(c) != 0 and
                LF_purchases(c)
            )

        # 2.) Create labeling functions 
        @labeling_function()
        def LF_on_peak_head(c):
            return TRUE if 'on peak' in get_aligned_ngrams(c.price, n_min=2, n_max=2)  and base(c) else ABSTAIN

        @labeling_function()
        def LF_off_peak_head(c):
            return FALSE if 'off peak' in get_aligned_ngrams(c.price, n_min=2, n_max=2) else ABSTAIN

        @labeling_function()
        def LF_price_range(c):
            price = float(c.price.context.get_span())
            return TRUE if price > 0 and price < 1000 and base(c) else FALSE

        @labeling_function()
        def LF_price_head(c):
            return TRUE if 'price' in get_aligned_ngrams(c.price) and base(c) else ABSTAIN

        @labeling_function()
        def LF_firm_head(c):
            return TRUE if 'firm' in get_aligned_ngrams(c.price) and base(c) else ABSTAIN

        @labeling_function()
        def LF_dollar_to_left(c):
            return TRUE if '$' in get_left_ngrams(c.price, window=2) and base(c) else ABSTAIN

        @labeling_function()
        def LF_purchases(c):
            return FALSE if 'purchases' in get_aligned_ngrams(c.price, n_min=1) else ABSTAIN

        station_price_lfs = [
            LF_other_station_table,
            LF_station_non_meta_tag,

            # indicator
            LF_price_range,

            # negative indicators
            LF_off_peak_head,
            LF_purchases,

            # positive indicators
            LF_on_peak_head,    
            LF_price_head,
            LF_firm_head,
            LF_dollar_to_left,
        ]

        # 3.) Apply the LFs on the training set
        labeler = Labeler(session, [StationPrice])
        labeler.apply(split=0, lfs=[station_price_lfs], train=True, clear=True, parallelism=PARALLEL)
        L_train = labeler.get_label_matrices(train_cands)

        # Check that LFs are all applied (avoid crash)
        applied_lfs = L_train[0].shape[1]
        has_non_applied = applied_lfs != len(station_price_lfs)
        print(f"Labeling functions on train_cands not ABSTAIN: {applied_lfs} (/{len(station_price_lfs)})")

        if (has_non_applied):
            applied_lfs = get_applied_lfs(session)
            non_applied_lfs = [l.name for l in station_price_lfs if l.name not in applied_lfs]
            print(f"Labling functions {non_applied_lfs} are not applied.")
            station_price_lfs = [l for l in station_price_lfs if l.name in applied_lfs]

        # 4.) Evaluate their accuracy
        L_gold_train = labeler.get_gold_labels(train_cands, annotator='gold')
        # Sort LFs for LFAnalysis because LFAnalysis does not sort LFs,
        # while columns of L_train are sorted alphabetically already.
        sorted_lfs = sorted(station_price_lfs, key=lambda lf: lf.name)
        LFAnalysis(L=L_train[0], lfs=sorted_lfs).lf_summary(Y=L_gold_train[0].reshape(-1))

        # 5.) Build generative model
        gen_model = LabelModel(cardinality=2)
        gen_model.fit(L_train[0], n_epochs=500, log_freq=100)

        train_marginals_lfs = gen_model.predict_proba(L_train[0])

        # Apply on dev-set
        labeler.apply(split=1, lfs=[station_price_lfs], clear=True, parallelism=PARALLEL)
        L_dev = labeler.get_label_matrices(dev_cands)

        L_gold_dev = labeler.get_gold_labels(dev_cands, annotator='gold')
        LFAnalysis(L=L_dev[0], lfs=sorted_lfs).lf_summary(Y=L_gold_dev[0].reshape(-1))
        return (gen_model, train_marginals_lfs)
Example #25
def main(train_path, output_dir, label_dir):
    # Get all data
    df = pd.read_csv(train_path)

    # Get human labels
    human_labels = read_human_labels(label_dir)

    # df_test and lab_test: the set of all human-labeled notes, and their labels
    df_test = df.merge(human_labels, on=['record_number'])
    lab_test = df_test.human_label
    del df_test['human_label']

    # df_train: formed by removing all patients from df with a human-labeled note
    df_train = df.merge(df_test.mr, indicator=True, how='left', on = ['mr'])
    df_train = df_train.query('_merge=="left_only"').drop('_merge', axis=1)

    # Generate label matrix
    L_train = PandasLFApplier(lfs=lfs).apply(df=df_train)
    L_test = PandasLFApplier(lfs=lfs).apply(df=df_test)

    # Summarize LFs
    output_train = LFAnalysis(L=L_train, lfs=lfs).lf_summary()
    #print(output_train)
    output_test  = LFAnalysis(L=L_test, lfs=lfs).lf_summary(Y = lab_test.values)
    #print(output_test)

    # Save LF analysis
    path = os.path.join(output_dir, 'LF_analysis_train.csv')
    output_train.to_csv(path, index = True)
    path = os.path.join(output_dir, 'LF_analysis_test.csv')
    output_test.to_csv(path, index = True)

    # Create label model
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123, class_balance = [0.3, 0.7])

    # Evaluate the label model using labeled test set
    for metric in ['recall', 'precision', 'f1', 'accuracy']:
        label_model_acc = label_model.score(L=L_test, Y=lab_test, metrics=[metric], tie_break_policy="random")[metric]
        print("%-15s %.2f%%" % (metric+":", label_model_acc * 100))

    null_f1 = f1_score(lab_test.values, np.ones((df_test.shape[0],)))
    print("%-15s %.2f%%" % ("null f1:", null_f1 * 100))
    print("%-15s %.2f%%" % ("null accuracy:", np.maximum(1-np.mean(lab_test), np.mean(lab_test)) * 100))

    # Save error analysis
    preds = label_model.predict_proba(L_test)
    error_analysis(df_test, L_test, lfs, preds[:,1], lab_test, output_dir)

    # Get labels on train
    probs_train = label_model.predict_proba(L_train)

    # Filter out unlabeled data points
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(X=df_train, y=probs_train, L=L_train)

    # Save filtered training set
    df_train_filtered['prob'] = probs_train_filtered[:,1]
    path = os.path.join(output_dir, 'df_train_filtered.csv')
    df_train_filtered.to_csv(path, index = False)

    # Save label probs
    path = os.path.join(output_dir, 'probs_train_filtered')
    np.save(path, probs_train_filtered[:,1])

    # Save training data set and labels
    assert len(df_test) == len(lab_test)
    df_test['human_label'] = lab_test
    path = os.path.join(output_dir, 'df_test.csv')
    df_test.to_csv(path, index = False)
    path = os.path.join(output_dir, 'lab_test')
    np.save(path, lab_test)
Example #26
# Combining Labeling Function Outputs
lfs = [
    question_mark, question_word, keyword_w, keyword_k, keyword_sein,
    keyword_verb
]

# apply label functions
applier = PandasLFApplier(lfs=lfs)
# create a label matrix for the training set
L_train = applier.apply(df=data_train)
# create a label matrix for the test set
L_test = applier.apply(df=data_test)

# summary statistics for the LFs
lf_summary = LFAnalysis(L=L_train, lfs=lfs).lf_summary()
print(lf_summary)

# take the majority vote on a per-data point basis
majority_model = MajorityLabelVoter()
preds_train = majority_model.predict(L=L_train)

# use LabelModel to produce training labels
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)

# result using majority-vote model
Y_test = data_test.label.values
majority_acc = majority_model.score(L=L_test,
                                    Y=Y_test,
                                    tie_break_policy="random")["accuracy"]
Example #27
    if (p1_last != p2_last) and ((p1_last, p2_last) in last_names or (p2_last, p1_last) in last_names):
        return POSITIVE
    else:
        return ABSTAIN


if __name__ == "__main__":
    warnings.filterwarnings("ignore")
    ((df_dev, Y_dev), df_train, (df_test, Y_test)) = load_data()
    lfs = [lf_husband_wife, lf_husband_wife_left_window, lf_same_last_name,
           lf_married, lf_familial_relationship, lf_family_left_window,
           lf_other_relationship, lf_distant_supervision, lf_distant_supervision_last_names]
    applier = PandasLFApplier(lfs)
    L_dev = applier.apply(df_dev)
    L_train = applier.apply(df_train)
    print(LFAnalysis(L_dev, lfs).lf_summary(Y_dev))
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train, Y_dev, n_epochs=5000, log_freq=500, seed=12345)
    probs_dev = label_model.predict_proba(L_dev)
    preds_dev = probs_to_preds(probs_dev)
    print("Label model F1: {f}".format(f=metric_score(Y_dev, preds_dev, probs=probs_dev, metric='f1')))
    print("Label model AUC: {f}".format(f=metric_score(Y_dev, preds_dev, probs=probs_dev, metric='roc_auc')))
    probs_train = label_model.predict_proba(L_train)
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(X=df_train, y=probs_train, L=L_train)
    X_train = get_feature_arrays(df_train_filtered)
    model = get_model()
    batch_size = 64
    model.fit(X_train, probs_train_filtered, batch_size=batch_size, epochs=100)
    X_test = get_feature_arrays(df_test)
    probs_test = model.predict(X_test)
    preds_test = probs_to_preds(probs_test)
Example #28
def label_post(inp_path, prefix = ""):

    #lfs = [job_inpost, check_subreddit, check_iama]
    lfs = [job_inpost, check_iama]

    context_lens = [100, 3, 2]
    for with_per in [True, False]:
        for clen in context_lens:
            for kw in patterns:
                lfs.append(make_keyword_lf(keyword=kw, context_len=clen, with_period=with_per))

    print("created lfs, their count", len(lfs))

    df_train = pd.read_pickle(inp_path)

    df_train['texts'] = df_train['text'].swifter.apply(lambda x: [y.lower() for y in tokenize.sent_tokenize(x)])
    df_train['root_value'] = df_train['value'].swifter.apply(lambda x: syn_to_hob[x])
    #df_train['containing_sentences'] = df_train[['texts', 'value']].swifter.apply(lambda y: find_val(y['texts'], y['value']), axis=1)

    print("loaded dataset")

    t1 = time.time()
    with TQDMDaskProgressBar(desc="Dask Apply"):
        applier = PandasParallelLFApplier(lfs=lfs)
        L_train = applier.apply(df=df_train, n_parallel=num_cpu)
    print("time mins ", (time.time() - t1) / 60)

    print(LFAnalysis(L=L_train, lfs=lfs).lf_summary())

    df_l_train = pd.DataFrame(L_train, columns=[str(x).split(",")[0] for x in lfs])
    print(df_train.shape)
    print(df_l_train.shape)
    df_train = pd.concat([df_train.reset_index(), df_l_train], axis=1)
    print(df_train.shape)
    print("*************************************************")
    df_train = df_train.drop(["index"], axis=1)

    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train=L_train, n_epochs=1000, lr=0.001, log_freq=100, seed=123)
    probs_train = label_model.predict_proba(L=L_train)

    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=df_train, y=probs_train, L=L_train
    )
    print("the length of unfiltered posts", len(set(df_train['author'] + "+++++" + df_train['value'])))
    print("the length of filtered posts", len(set(df_train_filtered['author'] + "+++++" + df_train_filtered['value'])))

    probs_df = pd.DataFrame(probs_train_filtered, columns=["neg_prob", "pos_prob"])
    print(df_train_filtered.shape)
    print(probs_df.shape)
    df_train_filtered = pd.concat([df_train_filtered.reset_index(), probs_df], axis=1)
    print(df_train_filtered.shape)

    df_train_filtered.to_pickle("/home/tigunova/PycharmProjects/snorkel_labels/data/profession/train_post_" + prefix + ".pkl")
    df_train_filtered.to_csv("/home/tigunova/PycharmProjects/snorkel_labels/data/profession/train_post_" + prefix + ".csv")

    #df_train.iloc[L_train[:, 1] != ABSTAIN].to_csv("/home/tigunova/PycharmProjects/snorkel_labels/data/profession/intr_train_post_tmp.csv")

    verbose = True
    if verbose:
        for i in range(len(lfs)):
            ppath = "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/interesting_datasets/" + str(lfs[i]).split(",")[0] + ".csv"
            df_train.iloc[L_train[:, i] != ABSTAIN].to_csv(ppath)


    auth_hobby_dict = defaultdict(set)
    for index, row in df_train.iterrows():
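        # NaN != NaN, so these self-comparisons skip rows with a missing author or value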
        if row.value == row.value and row.author == row.author:
            auth_hobby_dict[row.author].add(row.value)

    with open("/home/tigunova/PycharmProjects/snorkel_labels/data/profession/sources/author_profession_dict_" + prefix + ".txt", "w") as f_out:
        f_out.write(repr(dict(auth_hobby_dict)))
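
The example above relies on a make_keyword_lf factory that is not shown. The following is a rough, hedged sketch of such a factory using snorkel's LabelingFunction; the real one also takes context_len and with_period arguments whose behavior is not visible here, and the label constants and row attributes are assumptions:

from snorkel.labeling import LabelingFunction

ABSTAIN, POSITIVE = -1, 1  # assumed label constants

def keyword_lookup(x, keyword):
    # label POSITIVE if the keyword occurs in any (lowercased) sentence of the post, else abstain
    return POSITIVE if any(keyword in sent for sent in x.texts) else ABSTAIN

def make_keyword_lf(keyword, context_len=None, with_period=False):
    # context_len / with_period are accepted only for interface compatibility in this sketch
    return LabelingFunction(
        name=f"keyword_{keyword}_{context_len}_{with_period}",
        f=keyword_lookup,
        resources=dict(keyword=keyword),
    )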
Example #29
0
def model_analysis(label_model: LabelModel,
                   training_set: pd.DataFrame,
                   L_train: np.ndarray,
                   L_test: np.ndarray,
                   Y_test: np.ndarray,
                   lfs: list,
                   output_file="output") -> None:
    # TODO: consider using **kwargs instead of this painful list of arguments
    """Output analysis for the label model to a file

    :param label_model: The current label model which we want to output analysis for
    :type label_model: LabelModel
    :param training_set: A dataframe containing the training dataset
    :type training_set: pd.DataFrame
    :param L_train: The matrix of labels generated by the labeling functions on the training data
    :type L_train: np.ndarray
    :param L_test: The matrix of labels generated by the labeling functions on the testing data
    :type L_test: np.ndarray
    :param Y_test: Gold labels associated with data points in L_test
    :type Y_test: np.ndarray
    :param lfs: List of labeling functions
    :type lfs: list
    :param output_file: Name prefix for the analysis file, which is written to `../output/logs/`, defaults to "output"
    :type output_file: str, optional
    """
    Y_train = label_model.predict_proba(L=L_train)
    Y_pred = label_model.predict(L=L_test, tie_break_policy="abstain")
    lf_analysis_train = LFAnalysis(L=L_train, lfs=lfs).lf_summary()

    # TODO: Write this df to an output file. Ask Jennifer about how to handle this
    print(lf_analysis_train)

    # build majority label voter model
    majority_model = MajorityLabelVoter()
    majority_acc = majority_model.score(L=L_test,
                                        Y=Y_test,
                                        tie_break_policy="abstain",
                                        metrics=["f1", "accuracy"])
    label_model_acc = label_model.score(L=L_test,
                                        Y=Y_test,
                                        tie_break_policy="abstain",
                                        metrics=["f1", "accuracy"])

    # get precision and recall scores
    p_score = precision_score(y_true=Y_test, y_pred=Y_pred, average='weighted')
    r_score = recall_score(y_true=Y_test,
                           y_pred=Y_pred,
                           average='weighted',
                           labels=np.unique(Y_pred))

    # count training documents on which every labeling function abstained
    probs_train = majority_model.predict_proba(L=L_train)
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=training_set, y=probs_train, L=L_train)
    num_abstained = len(training_set) - len(df_train_filtered)

    # bucket test points by (gold, predicted) label pairs; with gold listed first,
    # (0, 1) is a false positive and (1, 0) is a false negative
    buckets = get_label_buckets(Y_test, Y_pred)
    true_positives, false_positives, true_negatives, false_negatives = (
        buckets.get((1, 1)), buckets.get((0, 1)),
        buckets.get((0, 0)), buckets.get((1, 0)))
    # write analysis to file
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    with open(f"{'../output/logs/'}{output_file}_run_{timestamp}.txt",
              "w") as output_file:
        output_file.write(
            f"{'Majority Vote Accuracy:':<25} {majority_acc['accuracy'] * 100:.2f}%"
        )
        output_file.write(
            f"\n{'Majority Vote F1 Score:':<25} {majority_acc['f1'] * 100:.2f}%"
        )
        output_file.write(
            f"\n{'Label Model Accuracy:':<25} {label_model_acc['accuracy'] * 100:.2f}%"
        )
        output_file.write(
            f"\n{'Label Model F1 Score:':<25} {label_model_acc['f1'] * 100:.2f}%"
        )
        output_file.write(f"\n{'Precision Score:':<25} {p_score * 100:.2f}%")
        output_file.write(f"\n{'Recall Score:':<25} {r_score * 100:.2f}%")
        output_file.write(
            f"\n{'Abstained Data Points:':<25} {num_abstained}")
        output_file.write(
            f"\n{'True Positives:':<25} {len(true_positives) if true_positives is not None else 0}"
        )
        output_file.write(
            f"\n{'False Positives:':<25} {len(false_positives) if false_positives is not None else 0}"
        )
        output_file.write(
            f"\n{'False Negatives:':<25} {len(false_negatives) if false_negatives is not None else 0}"
        )
        output_file.write(
            f"\n{'True Negatives:':<25} {len(true_negatives) if true_negatives is not None else 0}"
        )
        output_file.write(
            f"\n{'Abstained Positives:':<25} {len(buckets.get((1, -1), []))}")
        output_file.write(
            f"\n{'Abstained Negatives:':<25} {len(buckets.get((0, -1), []))}")
Example #30
0
# %% {"tags": ["md-exclude-output"]}
from snorkel.labeling import PandasLFApplier

applier = PandasLFApplier(worker_lfs)
L_train = applier.apply(df_train)
L_dev = applier.apply(df_dev)

# %% [markdown]
# Note that because our dev set is so small and our LFs are relatively sparse, many LFs will appear to have zero coverage.
# Fortunately, our label model learns weights for LFs based on their outputs on the training set, which is generally much larger.
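
# %% [markdown]
# (Editorial sketch, not part of the original notebook.) Concretely, fitting the label model on
# the larger training matrix might look like this; the hyperparameters below are assumptions.

# %%
from snorkel.labeling.model import LabelModel

label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, n_epochs=100, seed=123, log_freq=20)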

# %%
from snorkel.labeling import LFAnalysis

LFAnalysis(L_dev, worker_lfs).lf_summary(Y_dev).sample(5)

# %% [markdown]
# So the crowd labels in general are quite good! But how much of our dev and training
# sets do they cover?

# %%
print(
    f"Training set coverage: {100 * LFAnalysis(L_train).label_coverage(): 0.1f}%"
)
print(f"Dev set coverage: {100 * LFAnalysis(L_dev).label_coverage(): 0.1f}%")

# %% [markdown]
# ### Additional labeling functions
#
# To improve coverage of the training set, we can mix the crowdworker labeling functions with labeling