def train_model(self,
                df_train: pd.DataFrame,
                application_area_lfs: list,
                analysis_path: str = "output",
                label_output_path: str = "labels.jsonl",
                save_model_path: str = None):
    """Using our labeling functions, train a probabilistic label model that generates weak labels for our data points.

    :param df_train: The training data for the model
    :type df_train: pd.DataFrame
    :param application_area_lfs: A list of labeling functions to use in training the Label Model
    :type application_area_lfs: list
    :param analysis_path: File-name prefix for the LF-analysis CSV written under `PROJECT_ROOT/output`, defaults to "output"
    :type analysis_path: str, optional
    :param label_output_path: Path to the file where probabilistic labels generated by the model are stored, defaults to "labels.jsonl"
    :type label_output_path: str, optional
    :param save_model_path: Path where the Label Model should be saved. If no path is provided, the model is not saved
    :type save_model_path: str, optional
    """
    file_name_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    applier = PandasLFApplier(lfs=application_area_lfs)
    L_train = applier.apply(df=df_train)

    model = LabelModel(cardinality=2, verbose=True)
    model.fit(L_train=L_train, n_epochs=800, log_freq=100)
    if save_model_path is not None:
        model.save(save_model_path)

    int_labels, prob_labels = model.predict(L=L_train,
                                            return_probs=True,
                                            tie_break_policy="abstain")
    probs_df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=df_train, y=prob_labels, L=L_train)
    int_df_train_filtered, int_train_filtered = filter_unlabeled_dataframe(
        X=df_train, y=int_labels, L=L_train)

    # Write out both label sets. In the probability outputs, p_rel is the second probability listed.
    assert list(probs_df_train_filtered["paperid"]) == list(
        int_df_train_filtered["paperid"])
    with open(f"{label_output_path}", mode="w") as out:
        for idx, paper_id in enumerate(probs_df_train_filtered["paperid"]):
            out.write(
                json.dumps({
                    "id": paper_id,
                    # cast to int and float to get rid of non-serializable numpy types
                    "is_rel": int(int_train_filtered[idx]),
                    "p_rel": float(probs_train_filtered[idx][1])
                }) + "\n")

    # Output LF analysis to a CSV file sorted by coverage.
    lf_analysis = LFAnalysis(L=L_train, lfs=application_area_lfs).lf_summary()
    with open(
            f"{self.PROJECT_ROOT}/output/{analysis_path}_{file_name_timestamp}.csv",
            "w") as outfile:
        lf_analysis = lf_analysis.sort_values("Coverage")
        lf_analysis.to_csv(outfile, encoding="utf-8", index=True)
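# Illustrative usage only (not from the original project): this sketch assumes train_model
# is a method on a trainer class that defines PROJECT_ROOT, and that df_train carries a
# "paperid" column plus an "abstract" column for the hypothetical labeling function below.
from snorkel.labeling import labeling_function

ABSTAIN, RELEVANT = -1, 1

@labeling_function()
def lf_mentions_deep_learning(row):
    # Hypothetical keyword rule over the assumed "abstract" column.
    return RELEVANT if "deep learning" in str(row.abstract).lower() else ABSTAIN

# trainer.train_model(
#     df_train=papers_df,                          # DataFrame with "paperid" and "abstract"
#     application_area_lfs=[lf_mentions_deep_learning],
#     analysis_path="application_area",
#     label_output_path="labels.jsonl",
#     save_model_path="label_model.pkl",
# )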
def run_analysis(
    applied_lf_matrix: np.ndarray,
    lfs: List[LabelingFunction],
    save_csv_to: AbsolutePath,
    save_json_to: AbsolutePath,
    label_series: Optional[Series] = None,
) -> None:
    lf_analysis_summary = LFAnalysis(applied_lf_matrix, lfs).lf_summary(
        Y=label_series.values if label_series is not None else None)
    lf_analysis_summary.to_csv(save_csv_to)
    analysis_dict = lf_analysis_summary.to_dict()
    del analysis_dict["j"]
    with open(save_json_to, "w") as f:
        json.dump(analysis_dict, f, indent=4, sort_keys=True, cls=NumpyEncoder)
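# The json.dump call above relies on a NumpyEncoder that is not shown in this snippet.
# A minimal sketch of what such an encoder usually looks like (an assumption, not
# necessarily this project's exact implementation):
import json

import numpy as np


class NumpyEncoder(json.JSONEncoder):
    """Convert NumPy scalars and arrays into JSON-serializable Python types."""

    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)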
def print_analysis(l_train,lfs): """ Prints LF's coverage and statistics """ coverage_masechet_then_parans, coverage_perek_then_parans, \ coverage_daf_in_parntes, coverage_no_double_parans, coverage_no_mishna = ( l_train != ABSTAIN).mean(axis=0) txt_file = open(r"data/analysis.txt", "a+") txt_file.write("\n\n") txt_file.write('Analysis for date ['+str(datetime.datetime.now())+']: \n') txt_file.write('[SAMPLE_SIZE: ' + str(utility.SAMPLE_SIZE) + '] \n') txt_file.write('[TRANSFORMATION_FACTOR: ' + str(TRANSFORMATION_FACTOR) + '] \n') txt_file.write("\n\n") txt_file.write(":::::::::::::::::::::::::::|LFs Coverage|::::::::::::::::::::::::::::::::\n") txt_file.write(f"coverage_masechet_then_parans: {coverage_masechet_then_parans * 100:.1f}%\n") txt_file.write(f"coverage_perek_then_parans: {coverage_perek_then_parans * 100:.1f}%\n") txt_file.write(f"coverage_daf_in_parntes: {coverage_daf_in_parntes * 100:.1f}%\n") txt_file.write(f"coverage_no_double_parans: {coverage_no_double_parans * 100:.1f}%\n") txt_file.write(f"coverage_no_mishna: {coverage_no_mishna * 100:.1f}%\n") txt_file.write(":::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n") txt_file.write(":::::::::::::::::::::::|LFs Summary - l_train|:::::::::::::::::::::::::::\n") txt_file.write(LFAnalysis(L=l_train, lfs=lfs).lf_summary().to_string()) txt_file.write("\n") txt_file.write(":::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n") # txt_file.write("::::::::::::::::::::::::|LFs Summary - l_dev|::::::::::::::::::::::::::::\n") # txt_file.write(LFAnalysis(L=l_dev, lfs=lfs).lf_summary(Y=Y_dev).to_string()) # txt_file.write("\n") # txt_file.write(":::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n") txt_file.close()
def snorkel_process(keylist, dataframe, allweaklabf): def func(x): idx = (-x).argsort()[1:] x[idx] = 0 return x cardinalitynu = len(keylist) applier = PandasLFApplier(lfs=allweaklabf) all_train_l = applier.apply(df=dataframe) report = LFAnalysis(L=all_train_l, lfs=allweaklabf).lf_summary() print(report) label_model = LabelModel(cardinality=cardinalitynu, verbose=False) label_model.fit(all_train_l) predt = label_model.predict(all_train_l) predt1 = label_model.predict_proba(all_train_l) keylist1 = keylist.copy() #keylist1.append('Not_relevent') predt2 = pd.DataFrame(predt1, columns=keylist1) dataframe['L_label'] = predt dataframe1 = dataframe.join(predt2, how='outer') dataframe1 = dataframe1[dataframe1.L_label >= 0] train, test = train_test_split(dataframe1, test_size=0.2) trainsent = train.sent.values trainlabel = train[keylist].values trainlabe2 = trainlabel.copy() np.apply_along_axis(func, 1, trainlabe2) trainlabe2 = np.where(trainlabe2 > 0, 1, 0) testsent = test.sent.values testlabel = test[keylist].values testlabe2 = testlabel.copy() np.apply_along_axis(func, 1, testlabe2) testlabe2 = np.where(testlabe2 > 0, 1, 0) return trainsent, trainlabe2, testsent, testlabe2, keylist, report
def apply(self, tasks):
    print('Create regions...')
    random.shuffle(tasks)
    regions = self.create_regions(tasks[:100])
    print(f'Num regions: {len(regions)}')
    L_train = self.applier.apply(regions)
    lfa = LFAnalysis(L=L_train, lfs=self.lfs)
    confl = lfa.lf_conflicts()
    cov = lfa.lf_coverages()
    confli = np.argsort(confl)
    lfs_sorted = [self.lfs[i] for i in confli]
    out = []
    for lf, cf, cv in zip(lfs_sorted, confl[confli], cov[confli]):
        print(lf.name, cf, cv)
        out.append({'lop': lf.name, 'conflict': cf, 'coverage': cv})
    return out
def startSnorkelLabeling(df, keyword_groups={}, label=IRRELEVANT, l_type='SnorkelFilter'):
    '''
    Function: Label tweets with keyword-based labeling functions (filter or categorise)
    Inputs:
    - df: tweets DataFrame (columns: [id, text])
    - keyword_groups: Keyword group and its relevant keywords
        E.g. {'usps': ['postal service', 'usps'], 'invest': ['invest', 'portfolio', 'stock']}
    Outputs:
    - df: Categorised data (e.g. columns = ['id', 'text', 'Refund', 'COVID'])
    - analysis: Snorkel Labeling Function statistics
    '''
    lfs = []
    for name, keywords in keyword_groups.items():
        lfs.append(make_keyword_lf(lf_name=name, keywords=keywords, label=label))

    applier = PandasLFApplier(lfs=lfs)
    L_train = applier.apply(df=df)

    if l_type == 'SnorkelFilter':  # For spam detection (Step 2)
        L_final = get_L_final_filter(L_train)
        df['relevance'] = L_final
    elif l_type == 'SnorkelCategorise':  # For categorising tweets (Step 3)
        L_final = get_L_final_categorise(L_train)
        L_final_with_names = dict(zip(keyword_groups.keys(), L_final))
        for name, L_values in L_final_with_names.items():
            df[name] = L_values

    analysis = LFAnalysis(L=L_train, lfs=lfs).lf_summary()
    # return L_train, L_final, df, analysis
    return df, analysis
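# Illustrative call (hypothetical data; make_keyword_lf and the get_L_final_* helpers are
# assumed to be defined elsewhere in this module, as referenced above):
# tweets = pd.DataFrame({
#     "id": [1, 2, 3],
#     "text": ["usps lost my refund check",
#              "added a new stock to my portfolio",
#              "nice weather today"],
# })
# groups = {"usps": ["postal service", "usps"], "invest": ["invest", "portfolio", "stock"]}
# labeled_df, lf_stats = startSnorkelLabeling(tweets, keyword_groups=groups,
#                                             l_type="SnorkelCategorise")
# print(lf_stats)  # per-group coverage/overlap/conflict statistics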
def test_lf_summary(self) -> None: df = self.lfa.lf_summary(self.Y, est_weights=None) df_expected = pd.DataFrame( { "Polarity": [[1, 2], [], [0, 2], [2], [0, 1], [0]], "Coverage": [3 / 6, 0, 3 / 6, 2 / 6, 2 / 6, 4 / 6], "Overlaps": [3 / 6, 0, 3 / 6, 1 / 6, 2 / 6, 4 / 6], "Conflicts": [3 / 6, 0, 2 / 6, 1 / 6, 2 / 6, 3 / 6], "Correct": [1, 0, 1, 1, 1, 2], "Incorrect": [2, 0, 2, 1, 1, 2], "Emp. Acc.": [1 / 3, 0, 1 / 3, 1 / 2, 1 / 2, 2 / 4], } ) pd.testing.assert_frame_equal(df.round(6), df_expected.round(6)) df = self.lfa.lf_summary(Y=None, est_weights=None) df_expected = pd.DataFrame( { "Polarity": [[1, 2], [], [0, 2], [2], [0, 1], [0]], "Coverage": [3 / 6, 0, 3 / 6, 2 / 6, 2 / 6, 4 / 6], "Overlaps": [3 / 6, 0, 3 / 6, 1 / 6, 2 / 6, 4 / 6], "Conflicts": [3 / 6, 0, 2 / 6, 1 / 6, 2 / 6, 3 / 6], } ) pd.testing.assert_frame_equal(df.round(6), df_expected.round(6)) est_weights = [1, 0, 1, 1, 1, 0.5] names = list("abcdef") lfs = [LabelingFunction(s, f) for s in names] lfa = LFAnalysis(np.array(L), lfs) df = lfa.lf_summary(self.Y, est_weights=est_weights) df_expected = pd.DataFrame( { "j": [0, 1, 2, 3, 4, 5], "Polarity": [[1, 2], [], [0, 2], [2], [0, 1], [0]], "Coverage": [3 / 6, 0, 3 / 6, 2 / 6, 2 / 6, 4 / 6], "Overlaps": [3 / 6, 0, 3 / 6, 1 / 6, 2 / 6, 4 / 6], "Conflicts": [3 / 6, 0, 2 / 6, 1 / 6, 2 / 6, 3 / 6], "Correct": [1, 0, 1, 1, 1, 2], "Incorrect": [2, 0, 2, 1, 1, 2], "Emp. Acc.": [1 / 3, 0, 1 / 3, 1 / 2, 1 / 2, 2 / 4], "Learned Weight": [1, 0, 1, 1, 1, 0.5], } ).set_index(pd.Index(names)) pd.testing.assert_frame_equal(df.round(6), df_expected.round(6))
def analyze_lfs(self):
    if len(self.lfs) > 0:
        df = LFAnalysis(L=self.L_train, lfs=self.get_lfs()).lf_summary()
        dev_df = LFAnalysis(L=self.L_dev, lfs=self.get_lfs()).lf_summary(Y=self.Y_dev)
        df = df.merge(dev_df, how="outer", suffixes=(" Training", " Dev."),
                      left_index=True, right_index=True)
        df["Weight"] = self.label_model.get_weights()
        df["Duplicate"] = None
        for dupe, OG in self.find_duplicate_signature().items():
            print("Duplicate labeling signature detected")
            print(dupe, OG)
            df.at[dupe, "Duplicate"] = OG
        return df
    return None
def createAnalysis(final_df, category_names):
    L_final = []
    for name in category_names:
        # Treat 0 as an abstain (-1); keep any other value as the label.
        category_if = [-1 if i == 0 else i for i in final_df[name].tolist()]
        L_final.append(category_if)
    # Transpose into an (n_datapoints, n_lfs) label matrix.
    L_train = [list(x) for x in list(zip(*L_final))]
    lfs = [LabelingFunction(name=name, f=None) for name in category_names]
    return LFAnalysis(L=np.array(L_train), lfs=lfs).lf_summary()
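# A small, self-contained illustration of the expected input (an assumption about the
# calling code: one 0/1 indicator column per category, where 0 means "abstain" and is
# mapped to -1 above). The imports also cover everything createAnalysis itself needs.
import numpy as np
import pandas as pd
from snorkel.labeling import LabelingFunction, LFAnalysis

if __name__ == "__main__":
    demo_df = pd.DataFrame({
        "Refund": [1, 0, 1, 0],
        "COVID": [0, 1, 1, 0],
    })
    print(createAnalysis(demo_df, ["Refund", "COVID"]))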
def main():
    lfs = [lf_contains_link, lf_contains_co, lf_contains_sub]
    baseApp = LFApplier(lfs)
    labels = baseApp.apply(src)
    print(labels)
    print(LFAnalysis(labels, lfs).lf_summary())
    buckets = get_label_buckets(labels[:, 0], labels[:, 1])
    print(buckets)

    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(labels, n_epochs=500, log_freq=50, seed=123)
    pred_labels = label_model.predict(L=labels, tie_break_policy="abstain")
    print(pred_labels)
def _test_generate_L(self, k: int, decimal: Optional[int] = 2) -> None:
    """Test generated label matrix L for consistency with P, Y.

    This tests for consistency between the true conditional LF probabilities, P,
    and the empirical ones computed from L and Y, where P, L, and Y are generated
    by the generate_simple_label_matrix function.

    Parameters
    ----------
    k
        Cardinality
    decimal
        Number of decimals to check element-wise error, err < 1.5 * 10**(-decimal)
    """
    np.random.seed(123)
    P, Y, L = generate_simple_label_matrix(self.n, self.m, k)
    P_emp = LFAnalysis(L).lf_empirical_probs(Y, k=k)
    np.testing.assert_array_almost_equal(P, P_emp, decimal=decimal)
def labeling_evaluation(df_train, df_test, label_model): lfs = [ LabelingFunction.lf_ind_keyword, LabelingFunction.lf_short, LabelingFunction.lf_cmp_re, LabelingFunction.lf_industry_keyword, LabelingFunction.lf_surname_re, LabelingFunction.industry_cls ] applier = PandasLFApplier(lfs=lfs) L_train = applier.apply(df=df_train) L_test = applier.apply(df=df_test) Y_test = df_test.label.values analysis = LFAnalysis(L=L_train, lfs=lfs).lf_summary() if label_model == "majority": majority_model = MajorityLabelVoter() preds_train = majority_model.predict(L=L_train) majority_acc = majority_model.score( L=L_test, Y=Y_test, tie_break_policy="random")["accuracy"] print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%") df_train_filtered, preds_train_filtered = filter_unlabeled_dataframe( X=df_train, y=preds_train, L=L_train) return df_train_filtered, preds_train_filtered, analysis if label_model == "weighted": label_model = LabelModel(cardinality=len( [c for c in dir(Polarity) if not c.startswith("__")]), verbose=True) label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123) probs_train = label_model.predict_proba(L_train) label_model_acc = label_model.score( L=L_test, Y=Y_test, tie_break_policy="random")["accuracy"] print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%") df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe( X=df_train, y=probs_train, L=L_train) preds_train_filtered = probs_to_preds(probs_train_filtered) return df_train_filtered, probs_train_filtered, preds_train_filtered, analysis
def setUp(self) -> None:
    self.lfa = LFAnalysis(np.array(L))
    self.lfa_wo_abstain = LFAnalysis(np.array(L_wo_abstain))
    self.Y = np.array(Y)
class TestAnalysis(unittest.TestCase): def setUp(self) -> None: self.lfa = LFAnalysis(np.array(L)) self.lfa_wo_abstain = LFAnalysis(np.array(L_wo_abstain)) self.Y = np.array(Y) def test_label_coverage(self) -> None: self.assertEqual(self.lfa.label_coverage(), 5 / 6) def test_label_overlap(self) -> None: self.assertEqual(self.lfa.label_overlap(), 4 / 6) def test_label_conflict(self) -> None: self.assertEqual(self.lfa.label_conflict(), 3 / 6) def test_lf_polarities(self) -> None: polarities = self.lfa.lf_polarities() self.assertEqual(polarities, [[1, 2], [], [0, 2], [2], [0, 1], [0]]) def test_lf_coverages(self) -> None: coverages = self.lfa.lf_coverages() coverages_expected = [3 / 6, 0, 3 / 6, 2 / 6, 2 / 6, 4 / 6] np.testing.assert_array_almost_equal(coverages, np.array(coverages_expected)) def test_lf_overlaps(self) -> None: overlaps = self.lfa.lf_overlaps(normalize_by_coverage=False) overlaps_expected = [3 / 6, 0, 3 / 6, 1 / 6, 2 / 6, 4 / 6] np.testing.assert_array_almost_equal(overlaps, np.array(overlaps_expected)) overlaps = self.lfa.lf_overlaps(normalize_by_coverage=True) overlaps_expected = [1, 0, 1, 1 / 2, 1, 1] np.testing.assert_array_almost_equal(overlaps, np.array(overlaps_expected)) def test_lf_conflicts(self) -> None: conflicts = self.lfa.lf_conflicts(normalize_by_overlaps=False) conflicts_expected = [3 / 6, 0, 2 / 6, 1 / 6, 2 / 6, 3 / 6] np.testing.assert_array_almost_equal(conflicts, np.array(conflicts_expected)) conflicts = self.lfa.lf_conflicts(normalize_by_overlaps=True) conflicts_expected = [1, 0, 2 / 3, 1, 1, 3 / 4] np.testing.assert_array_almost_equal(conflicts, np.array(conflicts_expected)) def test_lf_empirical_accuracies(self) -> None: accs = self.lfa.lf_empirical_accuracies(self.Y) accs_expected = [1 / 3, 0, 1 / 3, 1 / 2, 1 / 2, 2 / 4] np.testing.assert_array_almost_equal(accs, np.array(accs_expected)) def test_lf_empirical_probs(self) -> None: P_emp = self.lfa.lf_empirical_probs(self.Y, 3) P = np.array([ [[1 / 2, 1, 0], [0, 0, 0], [1 / 2, 0, 1 / 2], [0, 0, 1 / 2]], [[1, 1, 1], [0, 0, 0], [0, 0, 0], [0, 0, 0]], [[0, 1, 1 / 2], [1 / 2, 0, 1 / 2], [0, 0, 0], [1 / 2, 0, 0]], [[1, 1 / 2, 1 / 2], [0, 0, 0], [0, 0, 0], [0, 1 / 2, 1 / 2]], [[1 / 2, 1, 1 / 2], [1 / 2, 0, 0], [0, 0, 1 / 2], [0, 0, 0]], [[0, 1, 0], [1, 0, 1], [0, 0, 0], [0, 0, 0]], ]) np.testing.assert_array_almost_equal(P, P_emp) def test_lf_summary(self) -> None: df = self.lfa.lf_summary(self.Y, est_weights=None) df_expected = pd.DataFrame({ "Polarity": [[1, 2], [], [0, 2], [2], [0, 1], [0]], "Coverage": [3 / 6, 0, 3 / 6, 2 / 6, 2 / 6, 4 / 6], "Overlaps": [3 / 6, 0, 3 / 6, 1 / 6, 2 / 6, 4 / 6], "Conflicts": [3 / 6, 0, 2 / 6, 1 / 6, 2 / 6, 3 / 6], "Correct": [1, 0, 1, 1, 1, 2], "Incorrect": [2, 0, 2, 1, 1, 2], "Emp. 
Acc.": [1 / 3, 0, 1 / 3, 1 / 2, 1 / 2, 2 / 4], }) pd.testing.assert_frame_equal(df.round(6), df_expected.round(6)) df = self.lfa.lf_summary(Y=None, est_weights=None) df_expected = pd.DataFrame({ "Polarity": [[1, 2], [], [0, 2], [2], [0, 1], [0]], "Coverage": [3 / 6, 0, 3 / 6, 2 / 6, 2 / 6, 4 / 6], "Overlaps": [3 / 6, 0, 3 / 6, 1 / 6, 2 / 6, 4 / 6], "Conflicts": [3 / 6, 0, 2 / 6, 1 / 6, 2 / 6, 3 / 6], }) pd.testing.assert_frame_equal(df.round(6), df_expected.round(6)) est_weights = [1, 0, 1, 1, 1, 0.5] names = list("abcdef") lfs = [LabelingFunction(s, f) for s in names] lfa = LFAnalysis(np.array(L), lfs) df = lfa.lf_summary(self.Y, est_weights=est_weights) df_expected = pd.DataFrame({ "j": [0, 1, 2, 3, 4, 5], "Polarity": [[1, 2], [], [0, 2], [2], [0, 1], [0]], "Coverage": [3 / 6, 0, 3 / 6, 2 / 6, 2 / 6, 4 / 6], "Overlaps": [3 / 6, 0, 3 / 6, 1 / 6, 2 / 6, 4 / 6], "Conflicts": [3 / 6, 0, 2 / 6, 1 / 6, 2 / 6, 3 / 6], "Correct": [1, 0, 1, 1, 1, 2], "Incorrect": [2, 0, 2, 1, 1, 2], "Emp. Acc.": [1 / 3, 0, 1 / 3, 1 / 2, 1 / 2, 2 / 4], "Learned Weight": [1, 0, 1, 1, 1, 0.5], }).set_index(pd.Index(names)) pd.testing.assert_frame_equal(df.round(6), df_expected.round(6)) def test_wrong_number_of_lfs(self) -> None: with self.assertRaisesRegex(ValueError, "Number of LFs"): LFAnalysis(np.array(L), [LabelingFunction(s, f) for s in "ab"]) def test_lf_summary_without_abstain(self) -> None: df = self.lfa_wo_abstain.lf_summary(self.Y + 4, est_weights=None) df_expected = pd.DataFrame({ "Polarity": [[3, 4, 5], [3, 4], [3, 4, 5], [4, 5], [3, 4, 5], [3]], "Coverage": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], "Overlaps": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], "Conflicts": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], "Correct": [1, 1, 1, 3, 1, 0], "Incorrect": [5, 5, 5, 3, 5, 6], "Emp. Acc.": [1 / 6, 1 / 6, 1 / 6, 3 / 6, 1 / 6, 0], }) pd.testing.assert_frame_equal(df.round(6), df_expected.round(6))
def test_wrong_number_of_lfs(self) -> None:
    with self.assertRaisesRegex(ValueError, "Number of LFs"):
        LFAnalysis(np.array(L), [LabelingFunction(s, f) for s in "ab"])
pred_train_mv = majority_model.predict(L=L_train) pred_train_lm = label_model.predict(L=L_train) # Calculate accuracy majority_acc = majority_model.score( L_valid, Y_valid, tie_break_policy="random")["accuracy"] print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%") # majorityvote_summary = majority_model.score(L_valid,Y_valid,tie_break_policy="random", metrics=["accuracy"]) labelmodel_acc = label_model.score( L_valid, Y_valid, tie_break_policy="random")["accuracy"] print(f"{'Label Model Accuracy:':<25} {labelmodel_acc * 100:.1f}%") # labelmodel_summary = [label_model.score(L_valid,Y_valid,tie_break_policy="random", metrics=["accuracy"])] # Store values in list acc_mv = majority_acc lst_acc_mv.append(acc_mv) acc_lm = labelmodel_acc lst_acc_lm.append(acc_lm) # Labeling function accuracy lf_accuracies = LFAnalysis(L=L_valid, lfs=lfs).lf_empirical_accuracies(Y_valid) print(lf_accuracies) lf_acc.append(lf_accuracies) i = i + 1 mean_acc_mv.append(statistics.mean(lst_acc_mv)) mean_acc_lm.append(statistics.mean(lst_acc_lm))
keywords_neg=["bad", "worst", "horrible", "awful", "terrible", "crap", "shit", "garbage", "rubbish", "waste"]) keyword_actor = make_keyword_lf(name="keyword_actor", keywords_pos=["beautiful", "handsome", "talented"], keywords_neg=[]) keyword_finish = make_keyword_lf(name="keyword_finish", keywords_pos=[], keywords_neg=["fast forward", "n t finish"]) keyword_plot = make_keyword_lf(name="keyword_plot", keywords_pos=["well written", "absorbing", "attractive", "innovative", "instructive", "interesting", "touching", "moving"], keywords_neg=["to sleep", "fell asleep", "boring", "dull", "plain"]) keyword_compare = make_keyword_lf(name="keyword_compare", keywords_pos=[], keywords_neg=[" than this", " than the film", " than the movie"]) lfs = [ expression_nexttime, expression_recommend, expression_value, keyword_compare, keyword_general, keyword_actor, keyword_finish, keyword_plot ] applier = PandasLFApplier(lfs=lfs) #L_train = applier.apply(df=df_train) L_dev = applier.apply(df=df_dev) print("LF_analysis") print(LFAnalysis(L=L_dev, lfs=lfs).lf_summary(Y=Y_dev))
def label_user(inp_path, prefix=""): df_train = pd.read_pickle(inp_path) ########## threshold on word similarity take_first = 100 overall_first = 10000 global thresh_by_value, overall_thresh df_train['root_value'] = df_train['value'].swifter.set_dask_threshold( dask_threshold=0.001).allow_dask_on_strings().apply( lambda x: syn_to_hob[x]) thresh_by_value = df_train.groupby( ["root_value"]).apply(lambda x: np.partition( x['lexicon_counts'], max(len(x['lexicon_counts']) - take_first, 0) )[max(len(x['lexicon_counts']) - take_first, 0)]).to_dict() overall_thresh = np.partition(df_train["lexicon_counts"].to_numpy(), max(len(df_train) - overall_first, 0))[max( len(df_train) - overall_first, 0)] print(overall_thresh) ############################# # separately loose - strict, pos - neg, period - without names_pool = [ "context:2_count_pos", "context:3_count_pos", "context:100_count_pos", "context:2_period_count_pos", "context:3_period_count_pos", "context:100_period_count_pos", "context:2_count_neg", "context:3_count_neg", "context:100_count_neg", "context:2_period_count_neg", "context:3_period_count_neg", "context:100_period_count_neg" ] for f_name in names_pool: curr_cols = [x for x in df_train.columns if f_name in x] df_train['total_' + f_name] = df_train[curr_cols].swifter.apply(sum, axis=1) df_train = df_train.drop(curr_cols, axis=1) for p in ["pos", "neg"]: df_train["new_total_context:100_count_" + p] = df_train[[ "total_context:100_count_" + p, "total_context:3_count_" + p ]].swifter.apply(lambda x: max( 0, x["total_context:100_count_" + p] - x["total_context:3_count_" + p]), axis=1) df_train["new_total_context:3_count_" + p] = df_train[[ "total_context:3_count_" + p, "total_context:2_count_" + p ]].swifter.apply(lambda x: max( 0, x["total_context:3_count_" + p] - x["total_context:2_count_" + p ]), axis=1) df_train["new_total_context:100_period_count_" + p] = df_train[[ "total_context:3_period_count_" + p, "total_context:100_period_count_" + p ]].swifter.apply(lambda x: max( 0, x["total_context:100_period_count_" + p] - x[ "total_context:3_period_count_" + p]), axis=1) df_train["new_total_context:3_period_count_" + p] = df_train[[ "total_context:3_period_count_" + p, "total_context:2_period_count_" + p ]].swifter.apply(lambda x: max( 0, x["total_context:3_period_count_" + p] - x[ "total_context:2_period_count_" + p]), axis=1) df_train["new_total_context:2_count_" + p] = df_train[[ "total_context:100_period_count_" + p, "total_context:2_count_" + p ]].swifter.apply(lambda x: max( 0, x["total_context:2_count_" + p] - x[ "total_context:100_period_count_" + p]), axis=1) df_train = df_train.drop( ["total_" + x for x in names_pool if "2_period_count" not in x], axis=1) lfs = [val_in_name, positive_lexicon_overall, positive_lexicon_pervalue] num_of_thesholds = 3 step = 100 // num_of_thesholds for col in df_train: if col not in ["author", "value", "idd", "root_value"]: if col not in [ "pos_prob_mean", "neg_prob_mean", "num_good_posts" ]: # , "lexicon_counts", "subreddit_counts", "name_in_subr_count"]: thresholds = [0] if "lexicon" in col and "unique" not in col: continue if True: # col in ["lexicon_counts", "unique_lexicon_counts"]: vals = df_train[col].to_numpy() thresholds = np.percentile( vals, list(range(0 + step, 99 + step, step))).astype(int) thresholds = sorted(list(set(thresholds))) if len(thresholds) > 1: thresholds = thresholds[:-1] if "lexicon" in col: thresholds = [3] # max_val = max(vals) # thresholds = list(range(0, int(max_val), int(max_val/5) + 1)) # elif col == "pos_prob_mean": # 
thresholds = [0.5 + 0.1 * x for x in range(5)] for i in range(len(thresholds)): thresh = thresholds[i] next_threshold = sys.maxsize if i == len( thresholds) - 1 else thresholds[i + 1] previous_threshold = -sys.maxsize if i == 0 else thresholds[ i - 1] if "lexicon_counts" not in col: lfs.append( make_thresold_lf(thresh=thresh, col_name=col, next_threshold=next_threshold)) else: lfs.append( make_lexicon_lf( thresh=thresh, pref=col, previous_threshold=previous_threshold)) num_annotators = 0 if num_annotators > 0: for i in range(1, num_annotators + 1): lfs.append(make_annotator_lf(worker_index=i)) lfs = [ x for x in lfs if any(y in str(x) for y in ["less", "context:2", "worker", "lexicon"]) ] print("created lfs their number", len(lfs)) print("\n".join(str(x) for x in lfs)) #### validation ##### do_val = False if do_val: df_golden = pd.read_csv( "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/gold_dev.csv" ) name_val = list(df_golden["auth_val"]) # df_train['root_value'] = df_train['value'].swifter.apply(lambda x: syn_to_hob[x]) df_train["auth_val"] = df_train[["author", "value"]].swifter.apply( lambda x: x["author"] + "+++" + x["value"], axis=1) df_val = df_train[df_train.auth_val.isin(name_val)] df_dev = df_train[~df_train.auth_val.isin(name_val)] print("Number val", df_val.shape) print("Number dev", df_dev.shape) df_val = df_val.merge(df_golden, on="auth_val") y_val = np.array(df_val["final"]) df_val = df_val.drop(labels="final", axis=1) # create test set as well with TQDMDaskProgressBar(desc="Dask Apply"): applier = PandasParallelLFApplier(lfs=lfs) L_val = applier.apply(df=df_val, n_parallel=num_cpu) L_dev = applier.apply(df=df_dev, n_parallel=num_cpu) dev_analysis = LFAnalysis(L=L_dev, lfs=lfs).lf_summary() analysis = LFAnalysis(L=L_val, lfs=lfs).lf_summary(y_val) analysis.to_csv("/home/tigunova/val_analysis.csv") dev_analysis.to_csv("/home/tigunova/dev_analysis.csv") print(analysis) label_model = LabelModel(cardinality=2, verbose=True) label_model.fit(L_dev) #, Y_dev=y_val) model_stat = label_model.score(L=L_val, Y=y_val) print(model_stat) exit(0) ########### #### picking threshold ##### do_threshold = False if do_threshold: df_golden = pd.read_csv( "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/gold_validation.csv" ) name_val = list(df_golden["auth_val"]) # df_train['root_value'] = df_train['value'].swifter.apply(lambda x: syn_to_hob[x]) df_train["auth_val"] = df_train[["author", "value"]].swifter.apply( lambda x: x["author"] + "+++" + x["value"], axis=1) df_val = df_train[df_train.auth_val.isin(name_val)] df_dev = df_train[~df_train.auth_val.isin(name_val)] pop_size = df_dev.shape[0] print("Number val", df_val.shape) print("Number dev", df_dev.shape) applier = PandasParallelLFApplier(lfs=lfs) df_val = df_val.merge(df_golden, on="auth_val") L_val = applier.apply(df=df_val, n_parallel=num_cpu) val_thresholds = [0.01 * x for x in range(100)] label_model = LabelModel(cardinality=2, verbose=True) with TQDMDaskProgressBar(desc="Dask Apply"): L_dev = applier.apply(df=df_dev, n_parallel=num_cpu) label_model.fit(L_dev, class_balance=[0.5, 0.5]) # , Y_dev=y_val) wghts = label_model.get_weights() print("\n".join(str(x) for x in zip(lfs, wghts))) probs_val = label_model.predict_proba(L=L_val) probs_df = pd.DataFrame(probs_val, columns=["neg_prob", "pos_prob"]) df_val = pd.concat([df_val.reset_index(), probs_df], axis=1) probs_dev = label_model.predict_proba(L=L_dev) probs_df = pd.DataFrame(probs_dev, columns=["neg_prob", "pos_prob"]) df_dev = 
pd.concat([df_dev.reset_index(), probs_df], axis=1) y_true = np.array(df_val["final"]) for th in val_thresholds: y_pred = np.array( df_val["pos_prob"].apply(lambda x: 1 if x > th else 0)) #print("true negatives") #print(df_val[df_val["final"] == 1][df_val["pos_prob"] <= th][["auth_val", "text"]]) prec = precision_score(y_true, y_pred) pred_labels = y_pred true_labels = y_true # True Positive (TP): we predict a label of 1 (positive), and the true label is 1. TP = np.sum(np.logical_and(pred_labels == 1, true_labels == 1)) # True Negative (TN): we predict a label of 0 (negative), and the true label is 0. TN = np.sum(np.logical_and(pred_labels == 0, true_labels == 0)) # False Positive (FP): we predict a label of 1 (positive), but the true label is 0. FP = np.sum(np.logical_and(pred_labels == 1, true_labels == 0)) # False Negative (FN): we predict a label of 0 (negative), but the true label is 1. FN = np.sum(np.logical_and(pred_labels == 0, true_labels == 1)) print('TP: %i, FP: %i, TN: %i, FN: %i' % (TP, FP, TN, FN)) # print(list(zip(label_model.predict(L=L_val_curr), y_val_curr))) # print("******************************") print("threshold %s, proportion population %.4f, precision %s" % (str(th), df_dev[df_dev["pos_prob"] > th].shape[0] / pop_size, str(prec))) exit(0) ########### with TQDMDaskProgressBar(desc="Dask Apply"): applier = PandasParallelLFApplier(lfs=lfs) L_train = applier.apply(df=df_train, n_parallel=num_cpu) analysis = LFAnalysis(L=L_train, lfs=lfs).lf_summary() print(analysis) df_l_train = pd.DataFrame( L_train, columns=["llf_" + str(x).split(",")[0] for x in lfs]) print(df_train.shape) print(df_l_train.shape) df_train = pd.concat([df_train.reset_index(), df_l_train], axis=1) print(df_train.shape) print("********************************************") t4 = time.time() label_model = LabelModel(cardinality=2, verbose=True) label_model.fit(L_train=L_train, n_epochs=1000, lr=0.001, log_freq=100, seed=123, class_balance=[0.3, 0.7]) probs_train = label_model.predict_proba(L=L_train) print("labeling model work ", (time.time() - t4) / 60) df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe( X=df_train, y=probs_train, L=L_train) probs_df = pd.DataFrame(probs_train_filtered, columns=["neg_prob", "pos_prob"]) print(df_train_filtered.shape) print(probs_df.shape) result_filtered = pd.concat([ df_train_filtered[['author', 'value', 'idd']].reset_index(), probs_df ], axis=1) print(result_filtered.shape) print("****************************************************") result_filtered.to_csv("/home/tigunova/some_result_" + prefix + ".csv") print(df_train_filtered.shape) print(probs_df.shape) df_train_filtered = pd.concat([df_train_filtered.reset_index(), probs_df], axis=1) df_train_filtered = df_train_filtered.drop(["index"], axis=1) print(df_train_filtered.shape) df_train_filtered.to_pickle( "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/user_" + prefix + ".pkl") df_train_filtered.to_csv( "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/user_" + prefix + ".csv") # df_train.iloc[L_train[:, 1] == POS].to_csv("/home/tigunova/PycharmProjects/snorkel_labels/data/user_" + prefix + ".csv") ### write dict output_threshold = 0.63 output_dict = defaultdict(list) auth_hobby_dict = defaultdict(list) for index, row in result_filtered.iterrows(): if row.value == row.value and row.author == row.author: auth_hobby_dict[row.author].append([row.value, row.pos_prob]) allowed_labels = [] for index, row in df_train_filtered.iterrows(): if row.value == row.value and 
row.author == row.author: if row.pos_prob > output_threshold: output_dict[row.author].append([row.value] + row.idd + [row.pos_prob]) allowed_labels.append(syn_to_hob[row.value]) print("\n".join([ str(y) for y in sorted(dict(Counter(allowed_labels)).items(), key=lambda x: x[1]) ])) print( "After cropping", sum([ x if x < 500 else 500 for x in dict(Counter(allowed_labels)).values() ])) print("users in total", len(output_dict)) for auth, stuffs in output_dict.items(): prof = ":::".join(set([x[0] for x in stuffs])) prob = ":::".join([str(x[-1]) for x in stuffs]) msgs = set([x for l in stuffs for x in l[1:-1]]) output_dict[auth] = [prof] + list(msgs) + [prob] with open( "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/sources/final_author_dict_" + prefix + ".txt", "w") as f_out: f_out.write(repr(dict(auth_hobby_dict))) with open("/home/tigunova/users_profession1.txt", "w") as f_out: f_out.write(repr(dict(output_dict)))
# %% {"tags": ["md-exclude-output"]} from snorkel.labeling import PandasLFApplier, LFAnalysis lfs = [ stars_in_review, shared_first_author, polarity_positive, subjectivity_positive, polarity_negative, ] applier = PandasLFApplier(lfs) L_dev = applier.apply(df_dev) # %% LFAnalysis(L_dev, lfs).lf_summary(df_dev.rating.values) # %% [markdown] # ### Applying labeling functions to the training set # # We apply the labeling functions to the training set, and then filter out data points unlabeled by any LF to form our final training set. # %% {"tags": ["md-exclude-output"]} from snorkel.labeling.model.label_model import LabelModel L_train = applier.apply(df_train) label_model = LabelModel(cardinality=2, verbose=True) label_model.fit(L_train, n_epochs=5000, seed=123, log_freq=20, lr=0.01) preds_train = label_model.predict(L_train) # %% {"tags": ["md-exclude-output"]}
    lf_carry_subject,
    lf_not_person,
    lf_ydist,
    lf_dist,
    lf_area,
]

applier = PandasLFApplier(lfs)
L_train = applier.apply(df_train)
L_valid = applier.apply(df_valid)

# %%
from snorkel.labeling import LFAnalysis

Y_valid = df_valid.label.values
LFAnalysis(L_valid, lfs).lf_summary(Y_valid)

# %% [markdown]
# ## 3. Train Label Model
# We now train a multi-class `LabelModel` to assign training labels to the unlabeled training set.

# %%
from snorkel.labeling.model import LabelModel

label_model = LabelModel(cardinality=3, verbose=True)
label_model.fit(L_train, seed=123, lr=0.01, log_freq=10, n_epochs=100)

# %% [markdown]
# We use [F1](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html) Micro average
# for the multiclass setting, which calculates metrics globally across classes by counting
# the total true positives, false negatives, and false positives.

# %%
    return LEAVE if vote_found or stay_found else ABSTAIN


if __name__ == "__main__":
    data_dir = 'data'
    data_file = 'train_test_dataset.csv'
    file = os.path.join(data_dir, data_file)
    data = pd.read_csv(file, usecols=[0, 1])
    train_df, test_df = train_test_split(data, test_size=0.2, shuffle=True, random_state=42)

    lfs = [usual_hashtags_stay, usual_texts_stay, usual_texts_leave, usual_hashtags_leave]
    applier = PandasLFApplier(lfs=lfs)
    L_train = applier.apply(train_df)
    print(LFAnalysis(L_train, lfs=lfs).lf_summary())

    save_file = os.path.join(data_dir, 'ground_truth_matrix')
    np.save(save_file, L_train, allow_pickle=True)

    # golden_labels = []
    # for y in train_df.label:
    #     if y == 'leave':
    #         golden_labels.append(LEAVE)
    #     if y == 'stay':
    #         golden_labels.append(STAY)
    #
    # golden_labels = np.asarray(golden_labels)
    # save_file = os.path.join(data_dir, 'ground_truth')
    # np.save(save_file, golden_labels, allow_pickle=True)
lfs = [
    ai_positive_lf, proceedings_lf, generative_lf, synthesis_lf,
    gaussian_lf, has_company_lf, any_ai_lf, has_comparison_lf
]

###############################################################
## APPLYING LABELLING FUNCTIONS TO TRAIN & DEV SETS
###############################################################
applier = PandasLFApplier(lfs=lfs)
processed_train_data = applier.apply(data)
processed_dev_data = applier.apply(data)
logging.info("applied labelling functions to scraped data")

print(LFAnalysis(L=processed_train_data, lfs=lfs).lf_summary())

###############################################################
## FITTING THE GENERATIVE MODEL AND PREDICTING
###############################################################
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=processed_train_data, n_epochs=500, log_freq=100, seed=123)
pred_LM_train = label_model.predict(processed_dev_data)
logging.info("generated noisy labels")
logging.info("writing to DataFrame")
# * **Polarity**: The set of unique labels this LF outputs (excluding abstains)
# * **Coverage**: The fraction of the dataset the LF labels
# * **Overlaps**: The fraction of the dataset where this LF and at least one other LF label
# * **Conflicts**: The fraction of the dataset where this LF and at least one other LF label and disagree
# * **Correct**: The number of data points this LF labels correctly (if gold labels are provided)
# * **Incorrect**: The number of data points this LF labels incorrectly (if gold labels are provided)
# * **Empirical Accuracy**: The empirical accuracy of this LF (if gold labels are provided)
#
# For *Correct*, *Incorrect*, and *Empirical Accuracy*, we don't want to penalize the LF for
# data points where it abstained, so we calculate these statistics only over the data points
# where the LF output a label.
# Since we have labels for the `dev` set but not the `train` set, we compute these statistics
# for the `dev` set only by supplying `Y_dev`.

# %%
from snorkel.labeling import LFAnalysis

LFAnalysis(L=L_train, lfs=lfs).lf_summary()

# %%
LFAnalysis(L=L_dev, lfs=lfs).lf_summary(Y=Y_dev)

# %% [markdown]
# So even these very simple rules do quite well!
# We might want to pick the `check` rule, since both have high precision and `check` has higher coverage.
# But let's look at our data to be sure.
#
# The helper method `get_label_buckets(...)` groups data points by their predicted label and true label.
# For example, we can find the indices of data points that the LF labeled `SPAM` but that actually belong to class `HAM`.
# This may give ideas for where the LF could be made more specific.

# %%
from snorkel.analysis import get_label_buckets
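# %% [markdown]
# As a quick illustration of how these statistics are computed (a toy example, not part of
# the tutorial's dataset): with three data points and two LFs, coverage is the fraction of
# rows where an LF did not abstain (-1), overlap is the fraction where both LFs labeled,
# and conflict is the fraction where both labeled and disagreed.

# %%
import numpy as np
from snorkel.labeling import LFAnalysis

L_toy = np.array([
    [ 1,  1],   # both label SPAM: overlap, no conflict
    [ 0,  1],   # both label but disagree: overlap and conflict
    [-1,  0],   # only the second LF labels
])
LFAnalysis(L_toy).lf_summary()
# Coverage: [2/3, 1], Overlaps: [2/3, 2/3], Conflicts: [1/3, 1/3]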
def run_labeling_functions(cands): ABSTAIN = -1 FALSE = 0 TRUE = 1 # Extract candidates train_cands = cands[0] dev_cands = cands[1] test_cands = cands[2] @labeling_function() def LF_other_station_table(c): station_span = c.station.context.get_span().lower() neighbour_cells = get_neighbor_cell_ngrams_own(c.price, dist=100, directions=True, n_max = 4, absolute = True) up_cells = [x for x in neighbour_cells if len(x) > 1 and x[1] == 'DOWN' and x[0] in stations_list] # No station name in upper cells if (len(up_cells) == 0): return ABSTAIN # Check if the next upper aligned station-span corresponds to the candidate span (or equivalents) closest_header = up_cells[len(up_cells)-1] return TRUE if closest_header[0] in stations_mapping_dict[station_span] else FALSE @labeling_function() def LF_station_non_meta_tag(c): html_tags = get_ancestor_tag_names(c.station) return FALSE if ('head' in html_tags and 'title' in html_tags) else ABSTAIN # Basic constraint for the price LFs to be true -> no wrong station (increase accuracy) def base(c): return ( LF_station_non_meta_tag(c) != 0 and LF_other_station_table(c) != 0 and LF_off_peak_head(c) != 0 and LF_purchases(c) ) # 2.) Create labeling functions @labeling_function() def LF_on_peak_head(c): return TRUE if 'on peak' in get_aligned_ngrams(c.price, n_min=2, n_max=2) and base(c) else ABSTAIN @labeling_function() def LF_off_peak_head(c): return FALSE if 'off peak' in get_aligned_ngrams(c.price, n_min=2, n_max=2) else ABSTAIN @labeling_function() def LF_price_range(c): price = float(c.price.context.get_span()) return TRUE if price > 0 and price < 1000 and base(c) else FALSE @labeling_function() def LF_price_head(c): return TRUE if 'price' in get_aligned_ngrams(c.price) and base(c) else ABSTAIN @labeling_function() def LF_firm_head(c): return TRUE if 'firm' in get_aligned_ngrams(c.price)and base(c) else ABSTAIN @labeling_function() def LF_dollar_to_left(c): return TRUE if '$' in get_left_ngrams(c.price, window=2) and base(c) else ABSTAIN @labeling_function() def LF_purchases(c): return FALSE if 'purchases' in get_aligned_ngrams(c.price, n_min=1) else ABSTAIN station_price_lfs = [ LF_other_station_table, LF_station_non_meta_tag, # indicator LF_price_range, # negative indicators LF_off_peak_head, LF_purchases, # positive indicators LF_on_peak_head, LF_price_head, LF_firm_head, LF_dollar_to_left, ] # 3.) Apply the LFs on the training set labeler = Labeler(session, [StationPrice]) labeler.apply(split=0, lfs=[station_price_lfs], train=True, clear=True, parallelism=PARALLEL) L_train = labeler.get_label_matrices(train_cands) # Check that LFs are all applied (avoid crash) applied_lfs = L_train[0].shape[1] has_non_applied = applied_lfs != len(station_price_lfs) print(f"Labeling functions on train_cands not ABSTAIN: {applied_lfs} (/{len(station_price_lfs)})") if (has_non_applied): applied_lfs = get_applied_lfs(session) non_applied_lfs = [l.name for l in station_price_lfs if l.name not in applied_lfs] print(f"Labling functions {non_applied_lfs} are not applied.") station_price_lfs = [l for l in station_price_lfs if l.name in applied_lfs] # 4.) Evaluate their accuracy L_gold_train = labeler.get_gold_labels(train_cands, annotator='gold') # Sort LFs for LFAnalysis because LFAnalysis does not sort LFs, # while columns of L_train are sorted alphabetically already. sorted_lfs = sorted(station_price_lfs, key=lambda lf: lf.name) LFAnalysis(L=L_train[0], lfs=sorted_lfs).lf_summary(Y=L_gold_train[0].reshape(-1)) # 5.) 
# Build the generative label model and compute training marginals
gen_model = LabelModel(cardinality=2)
gen_model.fit(L_train[0], n_epochs=500, log_freq=100)
train_marginals_lfs = gen_model.predict_proba(L_train[0])

# Apply on dev-set
labeler.apply(split=1, lfs=[station_price_lfs], clear=True, parallelism=PARALLEL)
L_dev = labeler.get_label_matrices(dev_cands)
L_gold_dev = labeler.get_gold_labels(dev_cands, annotator='gold')
LFAnalysis(L=L_dev[0], lfs=sorted_lfs).lf_summary(Y=L_gold_dev[0].reshape(-1))

return (gen_model, train_marginals_lfs)
def main(train_path, output_dir, label_dir): # Get all data df = pd.read_csv(train_path) # Get human labels human_labels = read_human_labels(label_dir) # df_test and lab_test: the set of all human-labeled notes, and their labels df_test = df.merge(human_labels, on=['record_number']) lab_test = df_test.human_label del df_test['human_label'] # df_train: formed by removing all patients from df with a human-labeled note df_train = df.merge(df_test.mr, indicator=True, how='left', on = ['mr']) df_train = df_train.query('_merge=="left_only"').drop('_merge', axis=1) # Generate label matrix L_train = PandasLFApplier(lfs=lfs).apply(df=df_train) L_test = PandasLFApplier(lfs=lfs).apply(df=df_test) # Summarize LFs output_train = LFAnalysis(L=L_train, lfs=lfs).lf_summary() #print(output_train) output_test = LFAnalysis(L=L_test, lfs=lfs).lf_summary(Y = lab_test.values) #print(output_test) # Save LF analysis path = os.path.join(output_dir, 'LF_analysis_train.csv') output_train.to_csv(path, index = True) path = os.path.join(output_dir, 'LF_analysis_test.csv') output_test.to_csv(path, index = True) # Create label model label_model = LabelModel(cardinality=2, verbose=True) label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123, class_balance = [0.3, 0.7]) # Evaluate the label model using labeled test set for metric in ['recall', 'precision', 'f1', 'accuracy']: label_model_acc = label_model.score(L=L_test, Y=lab_test, metrics=[metric], tie_break_policy="random")[metric] print("%-15s %.2f%%" % (metric+":", label_model_acc * 100)) null_f1 = f1_score(lab_test.values, np.ones((df_test.shape[0],))) print("%-15s %.2f%%" % ("null f1:", null_f1 * 100)) print("%-15s %.2f%%" % ("null accuracy:", np.maximum(1-np.mean(lab_test), np.mean(lab_test)) * 100)) # Save error analysis preds = label_model.predict_proba(L_test) error_analysis(df_test, L_test, lfs, preds[:,1], lab_test, output_dir) # Get labels on train probs_train = label_model.predict_proba(L_train) # Filter out unlabeled data points df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(X=df_train, y=probs_train, L=L_train) # Save filtered training set df_train_filtered['prob'] = probs_train_filtered[:,1] path = os.path.join(output_dir, 'df_train_filtered.csv') df_train_filtered.to_csv(path, index = False) # Save label probs path = os.path.join(output_dir, 'probs_train_filtered') np.save(path, probs_train_filtered[:,1]) # Save training data set and labels assert len(df_test) == len(lab_test) df_test['human_label'] = lab_test path = os.path.join(output_dir, 'df_test.csv') df_test.to_csv(path, index = False) path = os.path.join(output_dir, 'lab_test') np.save(path, lab_test)
# Combining Labeling Function Outputs
lfs = [
    question_mark, question_word, keyword_w, keyword_k, keyword_sein, keyword_verb
]

# apply label functions
applier = PandasLFApplier(lfs=lfs)
# create a label matrix for the training set
L_train = applier.apply(df=data_train)
# create a label matrix for the test set
L_test = applier.apply(df=data_test)

# summary statistics for the LFs
lf_summary = LFAnalysis(L=L_train, lfs=lfs).lf_summary()
print(lf_summary)

# take the majority vote on a per-data point basis
majority_model = MajorityLabelVoter()
preds_train = majority_model.predict(L=L_train)

# use LabelModel to produce training labels
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)

# result using majority-vote model
Y_test = data_test.label.values
majority_acc = majority_model.score(L=L_test, Y=Y_test, tie_break_policy="random")["accuracy"]
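# For comparison, the LabelModel trained above can be scored the same way (a sketch, not
# necessarily how this script reports its results):
# label_model_acc = label_model.score(L=L_test, Y=Y_test, tie_break_policy="random")["accuracy"]
# print(f"Majority vote: {majority_acc * 100:.1f}%  |  Label model: {label_model_acc * 100:.1f}%")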
if (p1_last != p2_last) and ((p1_last, p2_last) in last_names or (p2_last, p1_last) in last_names): return POSITIVE else: return ABSTAIN if __name__ == "__main__": warnings.filterwarnings("ignore") ((df_dev, Y_dev), df_train, (df_test, Y_test)) = load_data() lfs = [lf_husband_wife, lf_husband_wife_left_window, lf_same_last_name, lf_married, lf_familial_relationship, lf_family_left_window, lf_other_relationship, lf_distant_supervision, lf_distant_supervision_last_names] applier = PandasLFApplier(lfs) L_dev = applier.apply(df_dev) L_train = applier.apply(df_train) print(LFAnalysis(L_dev, lfs).lf_summary(Y_dev)) label_model = LabelModel(cardinality=2, verbose=True) label_model.fit(L_train, Y_dev, n_epochs=5000, log_freq=500, seed=12345) probs_dev = label_model.predict_proba(L_dev) preds_dev = probs_to_preds(probs_dev) print("Label model F1: {f}".format(f=metric_score(Y_dev, preds_dev, probs=probs_dev, metric='f1'))) print("Label model AUC: {f}".format(f=metric_score(Y_dev, preds_dev, probs=probs_dev, metric='roc_auc'))) probs_train = label_model.predict_proba(L_train) df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(X=df_train, y=probs_train, L=L_train) X_train = get_feature_arrays(df_train_filtered) model = get_model() batch_size = 64 model.fit(X_train, probs_train_filtered, batch_size=batch_size, epochs=100) X_test = get_feature_arrays(df_test) probs_test = model.predict(X_test) preds_test = probs_to_preds(probs_test)
def label_post(inp_path, prefix = ""): #lfs = [job_inpost, check_subreddit, check_iama] lfs = [job_inpost, check_iama] context_lens = [100, 3, 2] for with_per in [True, False]: for clen in context_lens: for kw in patterns: lfs.append(make_keyword_lf(keyword=kw, context_len=clen, with_period=with_per)) print("created lfs, their count", len(lfs)) df_train = pd.read_pickle(inp_path) df_train['texts'] = df_train['text'].swifter.apply(lambda x: [y.lower() for y in tokenize.sent_tokenize(x)]) df_train['root_value'] = df_train['value'].swifter.apply(lambda x: syn_to_hob[x]) #df_train['containing_sentences'] = df_train[['texts', 'value']].swifter.apply(lambda y: find_val(y['texts'], y['value']), axis=1) print("loaded dataset") t1 = time.time() with TQDMDaskProgressBar(desc="Dask Apply"): applier = PandasParallelLFApplier(lfs=lfs) L_train = applier.apply(df=df_train, n_parallel=num_cpu) print("time mins ", (time.time() - t1) / 60) print(LFAnalysis(L=L_train, lfs=lfs).lf_summary()) df_l_train = pd.DataFrame(L_train, columns=[str(x).split(",")[0] for x in lfs]) print(df_train.shape) print(df_l_train.shape) df_train = pd.concat([df_train.reset_index(), df_l_train], axis=1) print(df_train.shape) print("*************************************************") df_train = df_train.drop(["index"], axis=1) label_model = LabelModel(cardinality=2, verbose=True) label_model.fit(L_train=L_train, n_epochs=1000, lr=0.001, log_freq=100, seed=123) probs_train = label_model.predict_proba(L=L_train) df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe( X=df_train, y=probs_train, L=L_train ) print("the length of unfiltered posts", len(set(df_train['author'] + "+++++" + df_train['value']))) print("the length of filtered posts", len(set(df_train_filtered['author'] + "+++++" + df_train_filtered['value']))) probs_df = pd.DataFrame(probs_train_filtered, columns=["neg_prob", "pos_prob"]) print(df_train_filtered.shape) print(probs_df.shape) df_train_filtered = pd.concat([df_train_filtered.reset_index(), probs_df], axis=1) print(df_train_filtered.shape) df_train_filtered.to_pickle("/home/tigunova/PycharmProjects/snorkel_labels/data/profession/train_post_" + prefix + ".pkl") df_train_filtered.to_csv("/home/tigunova/PycharmProjects/snorkel_labels/data/profession/train_post_" + prefix + ".csv") #df_train.iloc[L_train[:, 1] != ABSTAIN].to_csv("/home/tigunova/PycharmProjects/snorkel_labels/data/profession/intr_train_post_tmp.csv") verbose = True if verbose: for i in range(len(lfs)): ppath = "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/interesting_datasets/" + str(lfs[i]).split(",")[0] + ".csv" df_train.iloc[L_train[:, i] != ABSTAIN].to_csv(ppath) auth_hobby_dict = defaultdict(set) for index, row in df_train.iterrows(): if row.value == row.value and row.author == row.author: auth_hobby_dict[row.author].add(row.value) with open("/home/tigunova/PycharmProjects/snorkel_labels/data/profession/sources/author_profession_dict_" + prefix + ".txt", "w") as f_out: f_out.write(repr(dict(auth_hobby_dict)))
def model_analysis(label_model: LabelModel,
                   training_set: pd.DataFrame,
                   L_train: np.ndarray,
                   L_test: np.ndarray,
                   Y_test: np.ndarray,
                   lfs: list,
                   output_file="output") -> None:
    # TODO: consider using **kwargs instead of this painful list of arguments
    """Output analysis for the label model to a file

    :param label_model: The current label model which we want to output analysis for
    :type label_model: LabelModel
    :param training_set: A dataframe containing the training dataset
    :type training_set: pd.DataFrame
    :param L_train: The matrix of labels generated by the labeling functions on the training data
    :type L_train: np.ndarray
    :param L_test: The matrix of labels generated by the labeling functions on the testing data
    :type L_test: np.ndarray
    :param Y_test: Gold labels associated with data points in L_test
    :type Y_test: np.ndarray
    :param lfs: List of labeling functions
    :type lfs: list
    :param output_file: File-name prefix for the analysis log written under `../output/logs/`, defaults to "output"
    :type output_file: str, optional
    """
    Y_train = label_model.predict_proba(L=L_train)
    Y_pred = label_model.predict(L=L_test, tie_break_policy="abstain")

    lf_analysis_train = LFAnalysis(L=L_train, lfs=lfs).lf_summary()
    # TODO: Write this df to an output file. Ask Jennifer about how to handle this
    print(lf_analysis_train)

    # build majority label voter model
    majority_model = MajorityLabelVoter()
    majority_acc = majority_model.score(L=L_test, Y=Y_test, tie_break_policy="abstain",
                                        metrics=["f1", "accuracy"])
    label_model_acc = label_model.score(L=L_test, Y=Y_test, tie_break_policy="abstain",
                                        metrics=["f1", "accuracy"])

    # get precision and recall scores
    p_score = precision_score(y_true=Y_test, y_pred=Y_pred, average='weighted')
    r_score = recall_score(y_true=Y_test, y_pred=Y_pred, average='weighted',
                           labels=np.unique(Y_pred))

    # how many documents abstained
    probs_train = majority_model.predict_proba(L=L_train)
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=training_set, y=probs_train, L=L_train)

    # bucket data points by (gold label, predicted label)
    buckets = get_label_buckets(Y_test, Y_pred)
    true_positives, false_positives, true_negatives, false_negatives = (
        buckets.get((1, 1)), buckets.get((1, 0)), buckets.get((0, 0)), buckets.get((0, 1)))
    abstained_positives = buckets.get((1, -1))
    abstained_negatives = buckets.get((0, -1))

    # write analysis to file
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    with open(f"{'../output/logs/'}{output_file}_run_{timestamp}.txt", "w") as output_file:
        output_file.write(
            f"{'Majority Vote Accuracy:':<25} {majority_acc['accuracy'] * 100:.2f}%")
        output_file.write(
            f"\n{'Majority Vote F1 Score:':<25} {majority_acc['f1'] * 100:.2f}%")
        output_file.write(
            f"\n{'Label Model Accuracy:':<25} {label_model_acc['accuracy'] * 100:.2f}%")
        output_file.write(
            f"\n{'Label Model F1 Score:':<25} {label_model_acc['f1'] * 100:.2f}%")
        output_file.write(f"\n{'Precision Score:':<25} {p_score * 100:.2f}%")
        output_file.write(f"\n{'Recall Score:':<25} {r_score * 100:.2f}%")
        output_file.write(
            f"\n{'Abstained Data Points:':<25} {len(df_train_filtered)}")
        output_file.write(
            f"\n{'True Positives:':<25} {len(true_positives) if true_positives is not None else 0}")
        output_file.write(
            f"\n{'False Positives:':<25} {len(false_positives) if false_positives is not None else 0}")
        output_file.write(
            f"\n{'False Negatives:':<25} {len(false_negatives) if false_negatives is not None else 0}")
        output_file.write(
            f"\n{'True Negatives:':<25} {len(true_negatives) if true_negatives is not None else 0}")
        output_file.write(
            f"\n{'Abstained Positives:':<25} {len(abstained_positives) if abstained_positives is not None else 0}")
        output_file.write(
            f"\n{'Abstained Negatives:':<25} {len(abstained_negatives) if abstained_negatives is not None else 0}")
# %% {"tags": ["md-exclude-output"]} from snorkel.labeling import PandasLFApplier applier = PandasLFApplier(worker_lfs) L_train = applier.apply(df_train) L_dev = applier.apply(df_dev) # %% [markdown] # Note that because our dev set is so small and our LFs are relatively sparse, many LFs will appear to have zero coverage. # Fortunately, our label model learns weights for LFs based on their outputs on the training set, which is generally much larger. # %% from snorkel.labeling import LFAnalysis LFAnalysis(L_dev, worker_lfs).lf_summary(Y_dev).sample(5) # %% [markdown] # So the crowd labels in general are quite good! But how much of our dev and training # sets do they cover? # %% print( f"Training set coverage: {100 * LFAnalysis(L_train).label_coverage(): 0.1f}%" ) print(f"Dev set coverage: {100 * LFAnalysis(L_dev).label_coverage(): 0.1f}%") # %% [markdown] # ### Additional labeling functions # # To improve coverage of the training set, we can mix the crowdworker labeling functions with labeling