def test_L_form(self):
    """Validate matrix-shape bookkeeping and input validation on fit."""
    model = LabelModel(cardinality=2, verbose=False)

    # A 4x3 matrix -> n=4 data points, m=3 labeling functions.
    label_matrix = np.array([[0, 1, 0], [0, 1, 0], [1, 0, 0], [0, 1, 0]])
    model._set_constants(label_matrix)
    self.assertEqual(model.n, 4)
    self.assertEqual(model.m, 3)

    # Label values exceeding the configured cardinality are rejected.
    label_matrix = np.array([[0, 1, 2], [0, 1, 2], [1, 0, 2], [0, 1, 0]])
    with self.assertRaisesRegex(ValueError, "L_train has cardinality"):
        model.fit(label_matrix, n_epochs=1)

    # Fewer than three labeling functions are rejected.
    label_matrix = np.array([[0], [1], [-1]])
    with self.assertRaisesRegex(ValueError, "L_train should have at least 3"):
        model.fit(label_matrix, n_epochs=1)
def test_optimizer_init(self):
    """Each optimizer name should instantiate the matching torch optimizer."""
    votes = np.array([[0, -1, 0], [0, 1, 0]])
    model = LabelModel()

    # Table of (config name, expected optimizer class), checked in order.
    expected = [
        ("sgd", optim.SGD),
        ("adam", optim.Adam),
        ("adamax", optim.Adamax),
    ]
    for name, optimizer_cls in expected:
        model.fit(votes, optimizer=name, n_epochs=1)
        self.assertIsInstance(model.optimizer, optimizer_cls)

    with self.assertRaisesRegex(ValueError, "Unrecognized optimizer"):
        model.fit(votes, optimizer="bad_optimizer", n_epochs=1)
def test_mv_default(self):
    """Predictions with too little LF overlap/conflict to learn weights."""
    model = LabelModel(cardinality=2, verbose=False)

    # less than 2 LFs have overlaps
    votes = np.array([[-1, -1, 1], [-1, 1, -1], [0, -1, -1]])
    model.fit(votes, n_epochs=100)
    np.testing.assert_array_almost_equal(
        model.predict(votes), np.array([1, 1, 0])
    )

    # less than 2 LFs have conflicts
    votes = np.array([[-1, -1, 1], [-1, 1, 1], [1, 1, 1]])
    model.fit(votes, n_epochs=100)
    np.testing.assert_array_almost_equal(
        model.predict(votes), np.array([1, 1, 1])
    )
def test_warmup(self):
    """Warmup given in epochs or as a percentage; batch units are unsupported."""
    votes = np.array([[0, -1, 0], [0, 1, 0]])
    model = LabelModel()

    # Explicit step count in epoch units.
    scheduler_cfg = {"warmup_steps": 3, "warmup_unit": "epochs"}
    model.fit(votes, lr_scheduler_config=scheduler_cfg, n_epochs=5)
    self.assertEqual(model.warmup_steps, 3)

    # Percentage of total epochs: 3/5 of 5 epochs -> 3 steps.
    scheduler_cfg = {"warmup_percentage": 3 / 5}
    model.fit(votes, lr_scheduler_config=scheduler_cfg, n_epochs=5)
    self.assertEqual(model.warmup_steps, 3)

    with self.assertRaisesRegex(ValueError, "LabelModel does not support"):
        scheduler_cfg = {"warmup_steps": 1, "warmup_unit": "batches"}
        model.fit(votes, lr_scheduler_config=scheduler_cfg)
def test_label_model_basic(self) -> None:
    """Test the LabelModel's estimate of P and Y on a simple synthetic dataset."""
    np.random.seed(123)
    true_probs, gold, votes = generate_simple_label_matrix(
        self.n, self.m, self.cardinality
    )

    # Fit the label model on the synthetic vote matrix.
    model = LabelModel(cardinality=self.cardinality, verbose=False)
    model.fit(votes, n_epochs=200, lr=0.01, seed=123)

    # Learned LF conditional probabilities should match the generating ones.
    learned_probs = model.get_conditional_probs()
    np.testing.assert_array_almost_equal(true_probs, learned_probs, decimal=2)

    # Hard predictions should be highly accurate on the training votes.
    metrics = model.score(votes, gold)
    self.assertGreaterEqual(metrics["accuracy"], 0.9)
def test_scheduler_init(self):
    """Each scheduler name should map to the expected torch LR scheduler."""
    votes = np.array([[0, -1, 0], [0, 1, 0]])
    model = LabelModel()

    # "constant" means no scheduler object at all.
    model.fit(votes, lr_scheduler="constant", n_epochs=1)
    self.assertIsNone(model.lr_scheduler)

    # Remaining names map one-to-one onto torch scheduler classes.
    expected = [
        ("linear", optim.lr_scheduler.LambdaLR),
        ("exponential", optim.lr_scheduler.ExponentialLR),
        ("step", optim.lr_scheduler.StepLR),
    ]
    for name, scheduler_cls in expected:
        model.fit(votes, lr_scheduler=name, n_epochs=1)
        self.assertIsInstance(model.lr_scheduler, scheduler_cls)
def test_label_model(self) -> None:
    """Test the LabelModel's estimate of P and Y."""
    np.random.seed(123)
    true_probs, gold, votes = generate_simple_label_matrix(
        self.n, self.m, self.cardinality
    )

    model = LabelModel(cardinality=self.cardinality, verbose=False)
    model.fit(votes, n_epochs=200, lr=0.01, seed=123)

    # Reshape the learned conditional probabilities to (m, cardinality + 1, -1)
    # so they line up with the generating distribution P.
    learned_probs = model._get_conditional_probs().reshape(
        (self.m, self.cardinality + 1, -1)
    )
    np.testing.assert_array_almost_equal(true_probs, learned_probs, decimal=2)

    # Error rate of the argmax predictions should stay below 10%.
    predicted = model.predict_proba(votes).argmax(axis=1)
    err = np.where(gold != predicted, 1, 0).sum() / self.n
    self.assertLess(err, 0.1)
def test_class_balance(self):
    """Class balance estimation from Y_dev plus validation of bad priors."""
    model = LabelModel(cardinality=2, verbose=False)

    # Estimated from the dev labels: 6 zeros / 4 ones -> [0.6, 0.4].
    Y_dev = np.array([0, 0, 1, 1, 0, 0, 0, 0, 1, 1])
    model._set_class_balance(class_balance=None, Y_dev=Y_dev)
    np.testing.assert_array_almost_equal(model.p, np.array([0.6, 0.4]))

    # A zero prior for any class is rejected.
    with self.assertRaisesRegex(ValueError, "Class balance prior is 0"):
        model._set_class_balance(class_balance=np.array([0.0, 1.0]), Y_dev=Y_dev)

    # A prior whose length differs from the cardinality is rejected.
    with self.assertRaisesRegex(ValueError, "class_balance has 1 entries."):
        model._set_class_balance(class_balance=np.array([0.0]), Y_dev=Y_dev)

    # A dev set missing some class is rejected.
    Y_dev_one_class = np.array([0, 0, 0])
    with self.assertRaisesRegex(
        ValueError, "Does not match LabelModel cardinality"
    ):
        model._set_class_balance(class_balance=None, Y_dev=Y_dev_one_class)
def test_labeling_convergence(self) -> None:
    """Test convergence of end to end labeling pipeline."""
    # Assemble LFs: the base LF plus positive/negative divisor LFs for 2..8.
    labeling_functions = [f]
    for divisor in range(2, 9):
        labeling_functions.append(get_positive_labeling_function(divisor))
    for divisor in range(2, 9):
        labeling_functions.append(get_negative_labeling_function(divisor))

    applier = PandasLFApplier(labeling_functions)
    L_train = applier.apply(self.df_train, progress_bar=False)
    self.assertEqual(L_train.shape, (self.N_TRAIN, len(labeling_functions)))

    # Train the label model and check the training-set error rate is low.
    model = LabelModel(cardinality=self.cardinality, verbose=False)
    model.fit(L_train, n_epochs=100, lr=0.01, l2=0.0)
    predicted = model.predict_proba(L_train).argmax(axis=1)
    gold = self.df_train.y
    err = np.where(gold != predicted, 1, 0).sum() / self.N_TRAIN
    self.assertLess(err, 0.05)
def test_predict(self):
    """Hard predictions, including the all-abstain case and return_probs."""
    # 3 LFs that always disagree/abstain leads to all abstains.
    votes = np.array([[-1, 1, 0], [0, -1, 1], [1, 0, -1]])
    model = LabelModel(cardinality=2, verbose=False)
    model.fit(votes, n_epochs=100)
    np.testing.assert_array_almost_equal(
        model.predict(votes), np.array([-1, -1, -1])
    )

    # Clamp mu away from 0/1 so the resulting probabilities are well-defined.
    votes = np.array([[0, 1, 0], [0, 1, 0]])
    model = self._set_up_model(votes)
    model.mu = nn.Parameter(model.mu_init.clone().clamp(0.01, 0.99))

    np.testing.assert_array_equal(model.predict(votes), np.array([0, 0]))

    # return_probs=True also yields the per-class probabilities.
    predictions, probabilities = model.predict(votes, return_probs=True)
    np.testing.assert_array_almost_equal(
        probabilities, np.array([[0.99, 0.01], [0.99, 0.01]])
    )
def test_get_weight(self):
    """Learned LF weights should approximate the true synthetic accuracies.

    Builds a 1000 x 5 label matrix whose columns vote with known
    accuracy/coverage, fits the model, and checks each learned weight
    against the true accuracy within a 0.1 tolerance.
    """
    # Seed NumPy so the synthetic matrix (and hence the test) is
    # deterministic; previously only the model fit was seeded, leaving the
    # generated data — and therefore the test outcome — random.
    np.random.seed(123)

    # set up L matrix
    true_accs = [0.95, 0.6, 0.7, 0.55, 0.8]
    coverage = [1.0, 0.8, 1.0, 1.0, 1.0]
    L = -1 * np.ones((1000, len(true_accs)))
    Y = np.zeros(1000)
    for i in range(1000):
        Y[i] = 1 if np.random.rand() <= 0.5 else 0
        for j in range(5):
            # Each LF fires with its coverage probability and, when it
            # fires, votes correctly with its true accuracy.
            if np.random.rand() <= coverage[j]:
                L[i, j] = (
                    Y[i] if np.random.rand() <= true_accs[j] else np.abs(Y[i] - 1)
                )

    label_model = LabelModel(cardinality=2)
    label_model.fit(L, n_epochs=1000, seed=123)

    accs = label_model.get_weights()
    for learned_acc, true_acc in zip(accs, true_accs):
        self.assertAlmostEqual(learned_acc, true_acc, delta=0.1)
def curate_twitter(save_name='../../pandafied_data/curated_twitter.csv'):
    """Label tweets with the water-nuisance LF and save the WATER subset as CSV."""
    tweets = pd.read_csv('../../pandafied_data/pandafied_twitter.csv')

    # Single labeling function, applied to the unlabeled tweets.
    lfs = [lf_keyword_wateroverlast]
    applier = PandasLFApplier(lfs)
    L_train = applier.apply(tweets)

    # Fit the label model and attach hard predictions (abstain on ties).
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
    tweets["label"] = label_model.predict(L=L_train, tie_break_policy="abstain")

    # Print every tweet predicted as WATER and count them.
    water_count = 0
    for idx in range(len(tweets["label"])):
        if tweets["label"][idx] == WATER:
            print()
            print(tweets["text"][idx])
            print(tweets["label"][idx])
            print()
            water_count += 1
    print("num entries total: " + str(len(tweets["label"])))
    print("num entries water: " + str(water_count))

    # Keep only the WATER rows, drop the helper column, and persist.
    twitter_curated = tweets[tweets.label == WATER]
    twitter_curated = twitter_curated.drop(columns='label')
    twitter_curated.to_csv(save_name, index=False)
def main(data_path, output_path):
    """Read parquet data, apply LFs, fit a label model, and save probabilistic labels."""
    logging.info(f"Reading data from {data_path}")
    frame = dd.read_parquet(data_path)
    frame = frame.repartition(npartitions=2)

    # Build the label matrix with the Dask applier.
    logging.info("Applying LFs")
    lfs = [article_mentions_person, body_contains_fortune, person_in_db]
    applier = DaskLFApplier(lfs)
    label_matrix = applier.apply(frame)

    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(label_matrix)

    # Attach the positive-class probability as a new column and persist.
    logging.info("Generating probabilistic labels")
    positive_probs = label_model.predict_proba(label_matrix)[:, 1]
    frame = frame.reset_index().set_index("index")
    labeled_frame = frame.assign(y_prob=dd.from_array(positive_probs))
    dd.to_parquet(labeled_frame, output_path)
    logging.info(f"Labels saved to {output_path}")
def test_score(self):
    """Scoring with default and custom metric lists, including coverage."""
    votes = np.array([[1, 1, 0], [-1, -1, -1], [1, 0, 1]])
    gold = np.array([1, 0, 1])
    model = LabelModel(cardinality=2, verbose=False)
    model.fit(votes, n_epochs=100)

    results = model.score(votes, gold, metrics=["accuracy", "coverage"])
    # The all-abstain row yields an abstain prediction (-1), so coverage is 2/3.
    np.testing.assert_array_almost_equal(
        model.predict(votes), np.array([1, -1, 1])
    )
    self.assertEqual(results, dict(accuracy=1.0, coverage=2 / 3))

    # Deterministic model: clamp mu away from 0/1 for defined probabilities.
    votes = np.array([[1, 0, 1], [1, 0, 1]])
    model = self._set_up_model(votes)
    model.mu = nn.Parameter(model.mu_init.clone().clamp(0.01, 0.99))

    results = model.score(votes, Y=np.array([0, 1]))
    self.assertEqual(results, dict(accuracy=0.5))

    results = model.score(L=votes, Y=np.array([1, 0]), metrics=["accuracy", "f1"])
    self.assertEqual(results, dict(accuracy=0.5, f1=2 / 3))
# Evaluate majority voting vs. Snorkel's LabelModel on the rule-covered test set.
# Relies on names defined earlier in the file: test_m, test_L, test_fired_idx,
# test_lsnork, majority_model, num_classes, U_lsnork, path_dir, compute_accuracy.
# NOTE(review): test_unfired_idx is computed but never used below — confirm it
# is needed (test_fired_idx, used below, is defined elsewhere).
test_unfired_idx = [i for i,item in enumerate(test_m) if sum(item)==0]
# Gold labels restricted to rows where at least one rule fired.
targets_test = test_L[test_fired_idx]
#majority voting using snorkel's majority voting model
maj_preds_test = majority_model.predict(L=test_lsnork[test_fired_idx])
# Per-class precision/recall/F1/support from sklearn.
maj_precision_test, maj_recall_test, maj_f1_score_test, maj_support_test = precision_recall_fscore_support(targets_test, maj_preds_test)
maj_accuracy_test = compute_accuracy(maj_support_test, maj_recall_test)
# NOTE(review): "COVERD" below is a typo in the output string; left as-is here
# since this edit only adds comments.
print("precision on *** RULE COVERD TEST SET *** of MAJORITY VOTING: {}".format(maj_precision_test))
print("recall on *** RULE COVERED TEST SET *** of MAJORITY VOTING: {}".format(maj_recall_test))
print("f1_score on *** RULE COVERED TEST SET *** of MAJORITY VOTING: {}".format(maj_f1_score_test))
print("support on *** RULE COVERED TEST SET *** of MAJORITY VOTING: {}".format(maj_support_test))
print("accuracy on *** RULE COVERED TEST SET *** of MAJORITY VOTING: {}".format(maj_accuracy_test))
#Now train snorkel's label model on the unlabeled set U_lsnork.
print("Training Snorkel's LabelModel")
label_model = LabelModel(cardinality=num_classes, verbose=True)
label_model.fit(L_train=U_lsnork, n_epochs=1000, lr=0.001, log_freq=100, seed=123)
label_model.save(os.path.join(path_dir,"saved_label_model"))
# Evaluate the trained label model on the same rule-covered test rows.
snork_preds_test = label_model.predict(L=test_lsnork[test_fired_idx])
snork_precision_test, snork_recall_test, snork_f1_score_test, snork_support_test = precision_recall_fscore_support(targets_test, snork_preds_test)
snork_accuracy_test = compute_accuracy(snork_support_test, snork_recall_test)
print("precision on *** RULE COVERED TEST SET *** of SNORKEL VOTING: {}".format(snork_precision_test))
print("recall on *** RULE COVERED TEST SET *** of SNORKEL VOTING: {}".format(snork_recall_test))
print("f1_score on *** RULE COVERED TEST SET *** of SNORKEL VOTING: {}".format(snork_f1_score_test))
print("support on *** RULE COVERED TEST SET *** of SNORKEL VOTING: {}".format(snork_support_test))
print("accuracy on *** RULE COVERED TEST SET *** of SNORKEL VOTING: {}".format(snork_accuracy_test))
# %% [markdown]
# However, as we can clearly see by looking at the summary statistics of our LFs in the previous section, they are not all equally accurate, and they should not be treated identically. In addition to having varied accuracies and coverages, LFs may be correlated, resulting in certain signals being overrepresented in a majority-vote-based model. To handle these issues appropriately, we will instead use a more sophisticated Snorkel `LabelModel` to combine the outputs of the LFs.
#
# This model will ultimately produce a single set of noise-aware training labels, which are probabilistic or confidence-weighted labels. We will then use these labels to train a classifier for our task. For more technical details of this overall approach, see our [NeurIPS 2016](https://arxiv.org/abs/1605.07723) and [AAAI 2019](https://arxiv.org/abs/1810.02840) papers. For more info on the API, see the [`LabelModel` documentation](https://snorkel.readthedocs.io/en/master/packages/_autosummary/labeling/snorkel.labeling.LabelModel.html#snorkel.labeling.LabelModel).
#
# Note that no gold labels are used during the training process.
# The only information we need is the label matrix, which contains the output of the LFs on our training set.
# The `LabelModel` is able to learn weights for the labeling functions using only the label matrix as input.
# We also specify the `cardinality`, or number of classes.
# The `LabelModel` trains much more quickly than typical discriminative models since we only need the label matrix as input.
# %% {"tags": ["md-exclude-output"]} from snorkel.labeling import LabelModel label_model = LabelModel(cardinality=2, verbose=True) label_model.fit(L_train=L_train, n_epochs=500, lr=0.001, log_freq=100, seed=123) # %% majority_acc = majority_model.score(L=L_valid, Y=Y_valid)["accuracy"] print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%") label_model_acc = label_model.score(L=L_valid, Y=Y_valid)["accuracy"] print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%") # %% [markdown] # So our `LabelModel` improves over the majority vote baseline! # However, it is typically **not suitable as an inference-time model** to make predictions for unseen data points, due to (among other things) some data points having all abstain labels. # In the next section, we will use the output of the label model as training labels to train a # discriminative classifier to see if we can improve performance further. # This classifier will only need the text of the comment to make predictions, making it much more suitable
def label_post(inp_path, prefix = ""):
    """Label posts with keyword/context LFs and persist the filtered results.

    Reads a pickled DataFrame from ``inp_path``, applies the LFs in parallel,
    fits a Snorkel LabelModel, drops rows no LF covered, and writes the
    labeled data (pickle + csv) and an author->values dict, using ``prefix``
    in the output file names.
    """
    #lfs = [job_inpost, check_subreddit, check_iama]
    lfs = [job_inpost, check_iama]
    # One keyword LF per (with_period, context_len, keyword) combination.
    context_lens = [100, 3, 2]
    for with_per in [True, False]:
        for clen in context_lens:
            for kw in patterns:
                lfs.append(make_keyword_lf(keyword=kw, context_len=clen, with_period=with_per))
    print("created lfs, their count", len(lfs))
    df_train = pd.read_pickle(inp_path)
    # Sentence-tokenize and lowercase each post; map each value to its root synonym.
    df_train['texts'] = df_train['text'].swifter.apply(lambda x: [y.lower() for y in tokenize.sent_tokenize(x)])
    df_train['root_value'] = df_train['value'].swifter.apply(lambda x: syn_to_hob[x])
    #df_train['containing_sentences'] = df_train[['texts', 'value']].swifter.apply(lambda y: find_val(y['texts'], y['value']), axis=1)
    print("loaded dataset")
    t1 = time.time()
    with TQDMDaskProgressBar(desc="Dask Apply"):
        applier = PandasParallelLFApplier(lfs=lfs)
        L_train = applier.apply(df=df_train, n_parallel=num_cpu)
    print("time mins ", (time.time() - t1) / 60)
    print(LFAnalysis(L=L_train, lfs=lfs).lf_summary())
    # Attach each LF's votes as named columns next to the original data.
    df_l_train = pd.DataFrame(L_train, columns=[str(x).split(",")[0] for x in lfs])
    print(df_train.shape)
    print(df_l_train.shape)
    df_train = pd.concat([df_train.reset_index(), df_l_train], axis=1)
    print(df_train.shape)
    print("*************************************************")
    df_train = df_train.drop(["index"], axis=1)
    # Fit the generative label model and compute probabilistic labels.
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train=L_train, n_epochs=1000, lr=0.001, log_freq=100, seed=123)
    probs_train = label_model.predict_proba(L=L_train)
    # Drop rows on which every LF abstained.
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=df_train, y=probs_train, L=L_train
    )
    print("the length of unfiltered posts", len(set(df_train['author'] + "+++++" + df_train['value'])))
    print("the length of filtered posts", len(set(df_train_filtered['author'] + "+++++" + df_train_filtered['value'])))
    probs_df = pd.DataFrame(probs_train_filtered, columns=["neg_prob", "pos_prob"])
    print(df_train_filtered.shape)
    print(probs_df.shape)
    df_train_filtered = pd.concat([df_train_filtered.reset_index(), probs_df], axis=1)
    print(df_train_filtered.shape)
    df_train_filtered.to_pickle("/home/tigunova/PycharmProjects/snorkel_labels/data/profession/train_post_" + prefix + ".pkl")
    df_train_filtered.to_csv("/home/tigunova/PycharmProjects/snorkel_labels/data/profession/train_post_" + prefix + ".csv")
    #df_train.iloc[L_train[:, 1] != ABSTAIN].to_csv("/home/tigunova/PycharmProjects/snorkel_labels/data/profession/intr_train_post_tmp.csv")
    verbose = True
    if verbose:
        # Dump, per LF, the rows it did not abstain on (for manual inspection).
        for i in range(len(lfs)):
            ppath = "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/interesting_datasets/" + str(lfs[i]).split(",")[0] + ".csv"
            df_train.iloc[L_train[:, i] != ABSTAIN].to_csv(ppath)
    # Collect author -> set of values; `x == x` is a NaN check (NaN != NaN).
    auth_hobby_dict = defaultdict(set)
    for index, row in df_train.iterrows():
        if row.value == row.value and row.author == row.author:
            auth_hobby_dict[row.author].add(row.value)
    with open("/home/tigunova/PycharmProjects/snorkel_labels/data/profession/sources/author_profession_dict_" + prefix + ".txt", "w") as f_out:
        f_out.write(repr(dict(auth_hobby_dict)))
def label_user(inp_path, prefix=""):
    """Label (author, value) pairs with threshold/lexicon LFs and persist results.

    Reads a pickled, feature-aggregated DataFrame from ``inp_path``, derives
    per-column threshold LFs, fits a Snorkel LabelModel, and writes the
    probabilistically labeled users (pickle/csv plus dict dumps), using
    ``prefix`` in output names. Contains two dormant branches (``do_val``,
    ``do_threshold``) for validation and threshold picking that exit early.
    """
    df_train = pd.read_pickle(inp_path)
    ########## threshold on word similarity
    # Keep the top `take_first` lexicon counts per root value and the top
    # `overall_first` counts overall; exported as module-level globals used
    # by LFs defined elsewhere.
    take_first = 100
    overall_first = 10000
    global thresh_by_value, overall_thresh
    df_train['root_value'] = df_train['value'].swifter.set_dask_threshold(
        dask_threshold=0.001).allow_dask_on_strings().apply(
            lambda x: syn_to_hob[x])
    # Per-root-value threshold: the take_first-th largest lexicon_counts value.
    thresh_by_value = df_train.groupby(
        ["root_value"]).apply(lambda x: np.partition(
            x['lexicon_counts'], max(len(x['lexicon_counts']) - take_first, 0)
        )[max(len(x['lexicon_counts']) - take_first, 0)]).to_dict()
    overall_thresh = np.partition(df_train["lexicon_counts"].to_numpy(),
                                  max(len(df_train) - overall_first, 0))[max(
                                      len(df_train) - overall_first, 0)]
    print(overall_thresh)
    #############################
    # separately loose - strict, pos - neg, period - without
    # Sum up the per-pattern count columns into one total per feature family,
    # then drop the originals.
    names_pool = [
        "context:2_count_pos", "context:3_count_pos", "context:100_count_pos",
        "context:2_period_count_pos", "context:3_period_count_pos",
        "context:100_period_count_pos", "context:2_count_neg",
        "context:3_count_neg", "context:100_count_neg",
        "context:2_period_count_neg", "context:3_period_count_neg",
        "context:100_period_count_neg"
    ]
    for f_name in names_pool:
        curr_cols = [x for x in df_train.columns if f_name in x]
        df_train['total_' + f_name] = df_train[curr_cols].swifter.apply(sum, axis=1)
        df_train = df_train.drop(curr_cols, axis=1)
    # Make the context windows disjoint: subtract the stricter (smaller)
    # window's total from the looser one, clamped at zero.
    for p in ["pos", "neg"]:
        df_train["new_total_context:100_count_" + p] = df_train[[
            "total_context:100_count_" + p, "total_context:3_count_" + p
        ]].swifter.apply(lambda x: max(
            0, x["total_context:100_count_" + p] - x["total_context:3_count_" + p]),
                         axis=1)
        df_train["new_total_context:3_count_" + p] = df_train[[
            "total_context:3_count_" + p, "total_context:2_count_" + p
        ]].swifter.apply(lambda x: max(
            0, x["total_context:3_count_" + p] - x["total_context:2_count_" + p
                                                   ]), axis=1)
        df_train["new_total_context:100_period_count_" + p] = df_train[[
            "total_context:3_period_count_" + p,
            "total_context:100_period_count_" + p
        ]].swifter.apply(lambda x: max(
            0, x["total_context:100_period_count_" + p] - x[
                "total_context:3_period_count_" + p]), axis=1)
        df_train["new_total_context:3_period_count_" + p] = df_train[[
            "total_context:3_period_count_" + p,
            "total_context:2_period_count_" + p
        ]].swifter.apply(lambda x: max(
            0, x["total_context:3_period_count_" + p] - x[
                "total_context:2_period_count_" + p]), axis=1)
        df_train["new_total_context:2_count_" + p] = df_train[[
            "total_context:100_period_count_" + p, "total_context:2_count_" + p
        ]].swifter.apply(lambda x: max(
            0, x["total_context:2_count_" + p] - x[
                "total_context:100_period_count_" + p]), axis=1)
    df_train = df_train.drop(
        ["total_" + x for x in names_pool if "2_period_count" not in x], axis=1)
    # Base LFs plus generated per-column threshold LFs.
    lfs = [val_in_name, positive_lexicon_overall, positive_lexicon_pervalue]
    num_of_thesholds = 3
    step = 100 // num_of_thesholds
    for col in df_train:
        if col not in ["author", "value", "idd", "root_value"]:
            if col not in [
                    "pos_prob_mean", "neg_prob_mean", "num_good_posts"
            ]:  # , "lexicon_counts", "subreddit_counts", "name_in_subr_count"]:
                thresholds = [0]
                if "lexicon" in col and "unique" not in col:
                    continue
                if True:  # col in ["lexicon_counts", "unique_lexicon_counts"]:
                    # Percentile-based thresholds over the column's values;
                    # dedupe, sort, and drop the largest one.
                    vals = df_train[col].to_numpy()
                    thresholds = np.percentile(
                        vals, list(range(0 + step, 99 + step, step))).astype(int)
                    thresholds = sorted(list(set(thresholds)))
                    if len(thresholds) > 1:
                        thresholds = thresholds[:-1]
                if "lexicon" in col:
                    thresholds = [3]
                # max_val = max(vals)
                # thresholds = list(range(0, int(max_val), int(max_val/5) + 1))
                # elif col == "pos_prob_mean":
                #     thresholds = [0.5 + 0.1 * x for x in range(5)]
                for i in range(len(thresholds)):
                    thresh = thresholds[i]
                    next_threshold = sys.maxsize if i == len(
                        thresholds) - 1 else thresholds[i + 1]
                    previous_threshold = -sys.maxsize if i == 0 else thresholds[
                        i - 1]
                    if "lexicon_counts" not in col:
                        lfs.append(
                            make_thresold_lf(thresh=thresh, col_name=col,
                                             next_threshold=next_threshold))
                    else:
                        lfs.append(
                            make_lexicon_lf(
                                thresh=thresh, pref=col,
                                previous_threshold=previous_threshold))
    num_annotators = 0
    if num_annotators > 0:
        for i in range(1, num_annotators + 1):
            lfs.append(make_annotator_lf(worker_index=i))
    # Keep only LF families of interest (by their repr).
    lfs = [
        x for x in lfs
        if any(y in str(x) for y in ["less", "context:2", "worker", "lexicon"])
    ]
    print("created lfs their number", len(lfs))
    print("\n".join(str(x) for x in lfs))
    #### validation #####
    # Dormant branch: evaluate against a gold dev set, then exit.
    do_val = False
    if do_val:
        df_golden = pd.read_csv(
            "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/gold_dev.csv"
        )
        name_val = list(df_golden["auth_val"])
        # df_train['root_value'] = df_train['value'].swifter.apply(lambda x: syn_to_hob[x])
        df_train["auth_val"] = df_train[["author", "value"]].swifter.apply(
            lambda x: x["author"] + "+++" + x["value"], axis=1)
        df_val = df_train[df_train.auth_val.isin(name_val)]
        df_dev = df_train[~df_train.auth_val.isin(name_val)]
        print("Number val", df_val.shape)
        print("Number dev", df_dev.shape)
        df_val = df_val.merge(df_golden, on="auth_val")
        y_val = np.array(df_val["final"])
        df_val = df_val.drop(labels="final", axis=1)
        # create test set as well
        with TQDMDaskProgressBar(desc="Dask Apply"):
            applier = PandasParallelLFApplier(lfs=lfs)
            L_val = applier.apply(df=df_val, n_parallel=num_cpu)
            L_dev = applier.apply(df=df_dev, n_parallel=num_cpu)
        dev_analysis = LFAnalysis(L=L_dev, lfs=lfs).lf_summary()
        analysis = LFAnalysis(L=L_val, lfs=lfs).lf_summary(y_val)
        analysis.to_csv("/home/tigunova/val_analysis.csv")
        dev_analysis.to_csv("/home/tigunova/dev_analysis.csv")
        print(analysis)
        label_model = LabelModel(cardinality=2, verbose=True)
        label_model.fit(L_dev)  #, Y_dev=y_val)
        model_stat = label_model.score(L=L_val, Y=y_val)
        print(model_stat)
        exit(0)
    ###########
    #### picking threshold #####
    # Dormant branch: sweep probability cutoffs against a gold set, then exit.
    do_threshold = False
    if do_threshold:
        df_golden = pd.read_csv(
            "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/gold_validation.csv"
        )
        name_val = list(df_golden["auth_val"])
        # df_train['root_value'] = df_train['value'].swifter.apply(lambda x: syn_to_hob[x])
        df_train["auth_val"] = df_train[["author", "value"]].swifter.apply(
            lambda x: x["author"] + "+++" + x["value"], axis=1)
        df_val = df_train[df_train.auth_val.isin(name_val)]
        df_dev = df_train[~df_train.auth_val.isin(name_val)]
        pop_size = df_dev.shape[0]
        print("Number val", df_val.shape)
        print("Number dev", df_dev.shape)
        applier = PandasParallelLFApplier(lfs=lfs)
        df_val = df_val.merge(df_golden, on="auth_val")
        L_val = applier.apply(df=df_val, n_parallel=num_cpu)
        val_thresholds = [0.01 * x for x in range(100)]
        label_model = LabelModel(cardinality=2, verbose=True)
        with TQDMDaskProgressBar(desc="Dask Apply"):
            L_dev = applier.apply(df=df_dev, n_parallel=num_cpu)
        label_model.fit(L_dev, class_balance=[0.5, 0.5])  # , Y_dev=y_val)
        wghts = label_model.get_weights()
        print("\n".join(str(x) for x in zip(lfs, wghts)))
        probs_val = label_model.predict_proba(L=L_val)
        probs_df = pd.DataFrame(probs_val, columns=["neg_prob", "pos_prob"])
        df_val = pd.concat([df_val.reset_index(), probs_df], axis=1)
        probs_dev = label_model.predict_proba(L=L_dev)
        probs_df = pd.DataFrame(probs_dev, columns=["neg_prob", "pos_prob"])
        df_dev = pd.concat([df_dev.reset_index(), probs_df], axis=1)
        y_true = np.array(df_val["final"])
        for th in val_thresholds:
            y_pred = np.array(
                df_val["pos_prob"].apply(lambda x: 1 if x > th else 0))
            #print("true negatives")
            #print(df_val[df_val["final"] == 1][df_val["pos_prob"] <= th][["auth_val", "text"]])
            prec = precision_score(y_true, y_pred)
            pred_labels = y_pred
            true_labels = y_true
            # True Positive (TP): we predict a label of 1 (positive), and the true label is 1.
            TP = np.sum(np.logical_and(pred_labels == 1, true_labels == 1))
            # True Negative (TN): we predict a label of 0 (negative), and the true label is 0.
            TN = np.sum(np.logical_and(pred_labels == 0, true_labels == 0))
            # False Positive (FP): we predict a label of 1 (positive), but the true label is 0.
            FP = np.sum(np.logical_and(pred_labels == 1, true_labels == 0))
            # False Negative (FN): we predict a label of 0 (negative), but the true label is 1.
            FN = np.sum(np.logical_and(pred_labels == 0, true_labels == 1))
            print('TP: %i, FP: %i, TN: %i, FN: %i' % (TP, FP, TN, FN))
            # print(list(zip(label_model.predict(L=L_val_curr), y_val_curr)))
            # print("******************************")
            print("threshold %s, proportion population %.4f, precision %s" %
                  (str(th), df_dev[df_dev["pos_prob"] > th].shape[0] / pop_size,
                   str(prec)))
        exit(0)
    ###########
    # Main path: apply all LFs to the full training frame.
    with TQDMDaskProgressBar(desc="Dask Apply"):
        applier = PandasParallelLFApplier(lfs=lfs)
        L_train = applier.apply(df=df_train, n_parallel=num_cpu)
    analysis = LFAnalysis(L=L_train, lfs=lfs).lf_summary()
    print(analysis)
    # Attach LF votes as "llf_*" columns next to the original features.
    df_l_train = pd.DataFrame(
        L_train, columns=["llf_" + str(x).split(",")[0] for x in lfs])
    print(df_train.shape)
    print(df_l_train.shape)
    df_train = pd.concat([df_train.reset_index(), df_l_train], axis=1)
    print(df_train.shape)
    print("********************************************")
    t4 = time.time()
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train=L_train, n_epochs=1000, lr=0.001, log_freq=100,
                    seed=123, class_balance=[0.3, 0.7])
    probs_train = label_model.predict_proba(L=L_train)
    print("labeling model work ", (time.time() - t4) / 60)
    # Drop rows on which every LF abstained.
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=df_train, y=probs_train, L=L_train)
    probs_df = pd.DataFrame(probs_train_filtered,
                            columns=["neg_prob", "pos_prob"])
    print(df_train_filtered.shape)
    print(probs_df.shape)
    result_filtered = pd.concat([
        df_train_filtered[['author', 'value', 'idd']].reset_index(), probs_df
    ], axis=1)
    print(result_filtered.shape)
    print("****************************************************")
    result_filtered.to_csv("/home/tigunova/some_result_" + prefix + ".csv")
    print(df_train_filtered.shape)
    print(probs_df.shape)
    df_train_filtered = pd.concat([df_train_filtered.reset_index(), probs_df],
                                  axis=1)
    df_train_filtered = df_train_filtered.drop(["index"], axis=1)
    print(df_train_filtered.shape)
    df_train_filtered.to_pickle(
        "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/user_" +
        prefix + ".pkl")
    df_train_filtered.to_csv(
        "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/user_" +
        prefix + ".csv")
    # df_train.iloc[L_train[:, 1] == POS].to_csv("/home/tigunova/PycharmProjects/snorkel_labels/data/user_" + prefix + ".csv")
    ### write dict
    # Keep only pairs whose positive probability clears the output threshold;
    # `x == x` below is a NaN check (NaN != NaN).
    output_threshold = 0.63
    output_dict = defaultdict(list)
    auth_hobby_dict = defaultdict(list)
    for index, row in result_filtered.iterrows():
        if row.value == row.value and row.author == row.author:
            auth_hobby_dict[row.author].append([row.value, row.pos_prob])
    allowed_labels = []
    for index, row in df_train_filtered.iterrows():
        if row.value == row.value and row.author == row.author:
            if row.pos_prob > output_threshold:
                output_dict[row.author].append([row.value] + row.idd +
                                               [row.pos_prob])
                allowed_labels.append(syn_to_hob[row.value])
    print("\n".join([
        str(y)
        for y in sorted(dict(Counter(allowed_labels)).items(),
                        key=lambda x: x[1])
    ]))
    print(
        "After cropping",
        sum([
            x if x < 500 else 500
            for x in dict(Counter(allowed_labels)).values()
        ]))
    print("users in total", len(output_dict))
    # Collapse each author's entries to [professions, message ids..., probs].
    for auth, stuffs in output_dict.items():
        prof = ":::".join(set([x[0] for x in stuffs]))
        prob = ":::".join([str(x[-1]) for x in stuffs])
        msgs = set([x for l in stuffs for x in l[1:-1]])
        output_dict[auth] = [prof] + list(msgs) + [prob]
    with open(
            "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/sources/final_author_dict_"
            + prefix + ".txt", "w") as f_out:
        f_out.write(repr(dict(auth_hobby_dict)))
    with open("/home/tigunova/users_profession1.txt", "w") as f_out:
        f_out.write(repr(dict(output_dict)))
def main(train_path, output_dir, label_dir):
    """Train a LabelModel on unlabeled notes and evaluate on human-labeled ones.

    Reads the full dataset from ``train_path`` and human labels from
    ``label_dir``, splits into a human-labeled test set and an unlabeled
    training set, fits a Snorkel LabelModel, reports metrics, and writes
    analyses, filtered training data, and label probabilities to ``output_dir``.
    """
    # Get all data
    df = pd.read_csv(train_path)
    # Get human labels
    human_labels = read_human_labels(label_dir)
    # df_test and lab_test: the set of all human-labeled notes, and their labels
    df_test = df.merge(human_labels, on=['record_number'])
    lab_test = df_test.human_label
    del df_test['human_label']
    # df_train: formed by removing all patients from df with a human-labeled note
    # (left merge on patient id `mr`, then keep only left-only rows).
    df_train = df.merge(df_test.mr, indicator=True, how='left', on = ['mr'])
    df_train = df_train.query('_merge=="left_only"').drop('_merge', axis=1)
    # Generate label matrix
    L_train = PandasLFApplier(lfs=lfs).apply(df=df_train)
    L_test = PandasLFApplier(lfs=lfs).apply(df=df_test)
    # Summarize LFs
    output_train = LFAnalysis(L=L_train, lfs=lfs).lf_summary()
    #print(output_train)
    output_test = LFAnalysis(L=L_test, lfs=lfs).lf_summary(Y = lab_test.values)
    #print(output_test)
    # Save LF analysis
    path = os.path.join(output_dir, 'LF_analysis_train.csv')
    output_train.to_csv(path, index = True)
    path = os.path.join(output_dir, 'LF_analysis_test.csv')
    output_test.to_csv(path, index = True)
    # Create label model
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123, class_balance = [0.3, 0.7])
    # Evaluate the label model using labeled test set
    for metric in ['recall', 'precision', 'f1', 'accuracy']:
        label_model_acc = label_model.score(L=L_test, Y=lab_test, metrics=[metric], tie_break_policy="random")[metric]
        print("%-15s %.2f%%" % (metric+":", label_model_acc * 100))
    # Baselines: F1 of the all-positive predictor and majority-class accuracy.
    null_f1 = f1_score(lab_test.values, np.ones((df_test.shape[0],)))
    print("%-15s %.2f%%" % ("null f1:", null_f1 * 100))
    print("%-15s %.2f%%" % ("null accuracy:", np.maximum(1-np.mean(lab_test), np.mean(lab_test)) * 100))
    # Save error analysis
    preds = label_model.predict_proba(L_test)
    error_analysis(df_test, L_test, lfs, preds[:,1], lab_test, output_dir)
    # Get labels on train
    probs_train = label_model.predict_proba(L_train)
    # Filter out unlabeled data points
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(X=df_train, y=probs_train, L=L_train)
    # Save filtered training set
    df_train_filtered['prob'] = probs_train_filtered[:,1]
    path = os.path.join(output_dir, 'df_train_filtered.csv')
    df_train_filtered.to_csv(path, index = False)
    # Save label probs
    path = os.path.join(output_dir, 'probs_train_filtered')
    np.save(path, probs_train_filtered[:,1])
    # Save training data set and labels
    assert len(df_test) == len(lab_test)
    df_test['human_label'] = lab_test
    path = os.path.join(output_dir, 'df_test.csv')
    df_test.to_csv(path, index = False)
    path = os.path.join(output_dir, 'lab_test')
    np.save(path, lab_test)
def run_snorkel_labelling_classification(labeling_functions, file, l_train, l_valid):
    """Fit a snorkel LabelModel on ``l_train`` and report validation metrics.

    Predicted probabilities are written into the module-level ``df_valid``
    frame, then thresholded into labels; precision@10 per query thread,
    accuracy, and the LabelModel's own accuracy are printed.

    :param labeling_functions: List of snorkel labeling functions (assigned to
        ``lfs`` but only used by the commented-out analysis below).
    :param file: Unused parameter — kept for caller compatibility.
    :param l_train: Label matrix for the training split.
    :param l_valid: Label matrix for the validation split.

    NOTE(review): this function reads the globals ``df_valid`` and ``Y_valid``
    (and the commented-out code references ``Y_train``); it will fail if the
    calling module has not defined them — confirm at the call site.
    """
    lfs = labeling_functions
    # lfs = [lf.is_same_thread, lf.has_entities, lf.enity_overlap_jacc, lf.entity_type_overlap_jacc]
    # lfs = [is_same_thread, enity_overlap, entity_types, entity_type_overlap]
    # lfs = [is_long, has_votes, is_doctor_reply, is_same_thread, enity_overlap, has_type_dsyn, has_type_patf, has_type_sosy,
    #        has_type_dora, has_type_fndg, has_type_menp, has_type_chem, has_type_orch, has_type_horm, has_type_phsu,
    #        has_type_medd, has_type_bhvr, has_type_diap, has_type_bacs, has_type_enzy, has_type_inpo, has_type_elii]
    # lfs = [has_votes, is_doctor_reply, is_same_thread, enity_overlap]
    # lfs = [is_same_thread, enity_overlap, is_doctor_reply]
    # analysis = LFAnalysis(L=l_train, lfs=lfs).lf_summary(Y=Y_train)
    # print(analysis)
    # print(analysis['Conflicts'])
    # print(analysis['Overlaps'])
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train=l_train, n_epochs=20000, lr=0.0001, log_freq=10, seed=2345)
    # label_model.fit(L_train=L_train, n_epochs=20, lr=0.0001, log_freq=10, seed=81794)
    print("Model weights: " + str(label_model.get_weights()))
    valid_probabilities = label_model.predict_proba(L=l_valid)
    if 'predicted_prob' in df_valid:
        # df_valid.drop(columns=['predicted_prob'], axis=1)
        del df_valid['predicted_prob']
    # Positive-class probability inserted as column 50 of df_valid.
    df_valid.insert(50, 'predicted_prob', valid_probabilities[:, 1])
    # df_valid.to_csv("/container/filip/json/ehealthforum/trac/validation_df2.txt", sep="\t", header=True)
    # df_valid = pd.read_csv("/filip/json/ehealthforum/trac/validation_df.txt", sep="\t")

    def compute_precision_at_k(l, k):
        # Fraction of the first k entries that are truthy (divides by k even
        # if fewer than k entries are available).
        l = l[:k]
        return sum(l) / k

    PROBABILITY_CUTOFF = 0.5
    df_valid[
        'predicted_label'] = df_valid['predicted_prob'] >= PROBABILITY_CUTOFF
    # NOTE(review): despite its name, this is (# gold-relevant) / (# predicted
    # relevant) — the numerator is NOT the intersection of gold and predicted,
    # so this is not a true-positive rate. Confirm intent before relying on it.
    true_positive_ratio = df_valid[df_valid.bm25_relevant == 1].count()['bm25_relevant'] / \
        df_valid[df_valid.predicted_label == 1].count()['predicted_label']
    print("Number of True relevant: " + str(df_valid[df_valid.bm25_relevant ==
                                                     1].count()['bm25_relevant']))
    print("Number of Predicted relevant: " + str(df_valid[
        df_valid.predicted_label == 1].count()['predicted_label']) + '\n')
    print('True positive ratio: ' + str(true_positive_ratio) + '\n')
    # Top-10 rows per query thread, relying on the frame's existing row order.
    df_tru = df_valid.groupby(['query_thread']).head(10)['bm25_relevant']
    df_pred = df_valid.groupby(['query_thread']).head(10)['predicted_label']
    overall_precision = []
    for query, group in df_valid.groupby(['query_thread']):
        precision = compute_precision_at_k(
            group['predicted_label'].head(10).tolist(), 10)
        overall_precision.append(precision)
    print('Overall precision: ' +
          str(sum(overall_precision) / len(overall_precision)))
    print("Accuracy: " + str(accuracy_score(df_tru, df_pred)))
    label_model_acc = label_model.score(L=l_valid, Y=Y_valid)["accuracy"]
    print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")
def get_role_probs(lf_train: pd.DataFrame,
                   filter_abstains: bool = False,
                   lfs: Optional[List[labeling_function]] = None,
                   lf_dev: pd.DataFrame = None,
                   seed: Optional[int] = None,
                   tmp_path: Union[str, Path] = None,
                   use_majority_label_voter=False) -> pd.DataFrame:
    """
    Takes "raw" data frame, builds argument role examples, (trains LabelModel),
    calculates event_argument_probs and returns merged argument role examples
    with event_argument_probs.

    :param use_majority_label_voter: Whether to use a majority label voter
        instead of the snorkel label model
    :param seed: Seed for use in label model (mu initialization)
    :param filter_abstains: Filters rows where all labeling functions abstained
    :param lf_train: Training dataset which will be labeled using Snorkel
    :param lfs: List of labeling functions
    :param lf_dev: Optional development dataset that can be used to set a prior
        for the class balance
    :param tmp_path: Path to temporarily store variables that are shared during
        random repeats
    :return: Argument role examples merged with their event_argument_probs
        (a single DataFrame).
    """
    df_train, L_train = None, None
    df_dev, Y_dev, L_dev = None, None, None
    tmp_train_path, tmp_dev_path = None, None
    # For random repeats try to load pickled variables from first run as they
    # are shared across repeats.
    if tmp_path:
        tmp_train_path = Path(tmp_path).joinpath("role_train.pkl")
        os.makedirs(os.path.dirname(tmp_train_path), exist_ok=True)
        if tmp_train_path.exists():
            with open(tmp_train_path, 'rb') as pickled_train:
                df_train, L_train = pickle.load(pickled_train)
        if lf_dev is not None:
            tmp_dev_path = Path(tmp_path).joinpath("role_dev.pkl")
            os.makedirs(os.path.dirname(tmp_dev_path), exist_ok=True)
            if tmp_dev_path.exists():
                with open(tmp_dev_path, 'rb') as pickled_dev:
                    df_dev, Y_dev, L_dev = pickle.load(pickled_dev)
    if lfs is None:
        lfs = get_role_list_lfs()
    applier = PandasLFApplier(lfs)
    # (Re)build + apply on the train set only when the cache did not supply it.
    if L_train is None or df_train is None:
        df_train, _ = build_event_role_examples(lf_train)
        logger.info("Running Event Role Labeling Function Applier")
        L_train = applier.apply(df_train)
        if tmp_path:
            with open(tmp_train_path, 'wb') as pickled_train:
                pickle.dump((df_train, L_train), pickled_train)
    # Same for the dev set: rebuild if any cached piece is missing.
    if lf_dev is not None and any(element is None
                                  for element in [df_dev, Y_dev, L_dev]):
        df_dev, Y_dev = build_event_role_examples(lf_dev)
        logger.info("Running Event Role Labeling Function Applier on dev set")
        L_dev = applier.apply(df_dev)
        if tmp_path:
            with open(tmp_dev_path, 'wb') as pickled_dev:
                pickle.dump((df_dev, Y_dev, L_dev), pickled_dev)
    if use_majority_label_voter:
        logger.info(
            "Using MajorityLabelVoter to calculate role class probabilities")
        label_model = MajorityLabelVoter(cardinality=11)
    else:
        label_model = LabelModel(cardinality=11, verbose=True)
        logger.info(
            "Fitting LabelModel on the data and predicting role class probabilities"
        )
        # BUGFIX: was `if seed:` — a legitimate seed of 0 is falsy and was
        # silently dropped, making runs with seed=0 non-reproducible.
        if seed is not None:
            label_model.fit(L_train=L_train,
                            n_epochs=5000,
                            log_freq=500,
                            seed=seed,
                            Y_dev=Y_dev)
        else:
            label_model.fit(L_train=L_train,
                            n_epochs=5000,
                            log_freq=500,
                            Y_dev=Y_dev)
    # Evaluate label model on development data
    if df_dev is not None and Y_dev is not None:
        metrics = ["accuracy", "f1_micro", "f1_macro"]
        logger.info("Evaluate on the dev set")
        label_model_metrics = label_model.score(L=L_dev,
                                                Y=Y_dev,
                                                tie_break_policy="random",
                                                metrics=metrics)
        if use_majority_label_voter:
            logger.info('Role Majority Label Voter Metrics')
        else:
            logger.info('Role Label Model Metrics')
        logger.info(
            f"{'Accuracy:':<25} {label_model_metrics['accuracy'] * 100:.1f}%")
        logger.info(
            f"{'F1 (micro averaged):':<25} {label_model_metrics['f1_micro'] * 100:.1f}%"
        )
        logger.info(
            f"{'F1 (macro averaged):':<25} {label_model_metrics['f1_macro'] * 100:.1f}%"
        )
    event_role_probs = label_model.predict_proba(L_train)
    if filter_abstains:
        df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=event_role_probs, L=L_train)
        merged_event_role_examples = merge_event_role_examples(
            df_train_filtered, probs_train_filtered)
    else:
        # Multiplies probabilities of abstains with zero so that the example is
        # treated as padding in the end model
        merged_event_role_examples = merge_event_role_examples(
            df_train, utils.zero_out_abstains(event_role_probs, L_train))
    return merged_event_role_examples
def test_e2e(): """Run an end-to-end test on documents of the hardware domain.""" PARALLEL = 4 max_docs = 12 fonduer.init_logging( log_dir="log_folder", format="[%(asctime)s][%(levelname)s] %(name)s:%(lineno)s - %(message)s", level=logging.INFO, ) session = fonduer.Meta.init(CONN_STRING).Session() docs_path = "tests/data/html/" pdf_path = "tests/data/pdf/" doc_preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs) corpus_parser = Parser( session, parallelism=PARALLEL, structural=True, lingual=True, visual=True, pdf_path=pdf_path, ) corpus_parser.apply(doc_preprocessor) assert session.query(Document).count() == max_docs num_docs = session.query(Document).count() logger.info(f"Docs: {num_docs}") assert num_docs == max_docs num_sentences = session.query(Sentence).count() logger.info(f"Sentences: {num_sentences}") # Divide into test and train docs = sorted(corpus_parser.get_documents()) last_docs = sorted(corpus_parser.get_last_documents()) ld = len(docs) assert ld == len(last_docs) assert len(docs[0].sentences) == len(last_docs[0].sentences) assert len(docs[0].sentences) == 799 assert len(docs[1].sentences) == 663 assert len(docs[2].sentences) == 784 assert len(docs[3].sentences) == 661 assert len(docs[4].sentences) == 513 assert len(docs[5].sentences) == 700 assert len(docs[6].sentences) == 528 assert len(docs[7].sentences) == 161 assert len(docs[8].sentences) == 228 assert len(docs[9].sentences) == 511 assert len(docs[10].sentences) == 331 assert len(docs[11].sentences) == 528 # Check table numbers assert len(docs[0].tables) == 9 assert len(docs[1].tables) == 9 assert len(docs[2].tables) == 14 assert len(docs[3].tables) == 11 assert len(docs[4].tables) == 11 assert len(docs[5].tables) == 10 assert len(docs[6].tables) == 10 assert len(docs[7].tables) == 2 assert len(docs[8].tables) == 7 assert len(docs[9].tables) == 10 assert len(docs[10].tables) == 6 assert len(docs[11].tables) == 9 # Check figure numbers assert len(docs[0].figures) == 32 assert 
len(docs[1].figures) == 11 assert len(docs[2].figures) == 38 assert len(docs[3].figures) == 31 assert len(docs[4].figures) == 7 assert len(docs[5].figures) == 38 assert len(docs[6].figures) == 10 assert len(docs[7].figures) == 31 assert len(docs[8].figures) == 4 assert len(docs[9].figures) == 27 assert len(docs[10].figures) == 5 assert len(docs[11].figures) == 27 # Check caption numbers assert len(docs[0].captions) == 0 assert len(docs[1].captions) == 0 assert len(docs[2].captions) == 0 assert len(docs[3].captions) == 0 assert len(docs[4].captions) == 0 assert len(docs[5].captions) == 0 assert len(docs[6].captions) == 0 assert len(docs[7].captions) == 0 assert len(docs[8].captions) == 0 assert len(docs[9].captions) == 0 assert len(docs[10].captions) == 0 assert len(docs[11].captions) == 0 train_docs = set() dev_docs = set() test_docs = set() splits = (0.5, 0.75) data = [(doc.name, doc) for doc in docs] data.sort(key=lambda x: x[0]) for i, (doc_name, doc) in enumerate(data): if i < splits[0] * ld: train_docs.add(doc) elif i < splits[1] * ld: dev_docs.add(doc) else: test_docs.add(doc) logger.info([x.name for x in train_docs]) # NOTE: With multi-relation support, return values of getting candidates, # mentions, or sparse matrices are formatted as a list of lists. This means # that with a single relation, we need to index into the list of lists to # get the candidates/mentions/sparse matrix for a particular relation or # mention. 
# Mention Extraction part_ngrams = MentionNgramsPart(parts_by_doc=None, n_max=3) temp_ngrams = MentionNgramsTemp(n_max=2) volt_ngrams = MentionNgramsVolt(n_max=1) Part = mention_subclass("Part") Temp = mention_subclass("Temp") Volt = mention_subclass("Volt") mention_extractor = MentionExtractor( session, [Part, Temp, Volt], [part_ngrams, temp_ngrams, volt_ngrams], [part_matcher, temp_matcher, volt_matcher], ) mention_extractor.apply(docs, parallelism=PARALLEL) assert session.query(Part).count() == 299 assert session.query(Temp).count() == 138 assert session.query(Volt).count() == 140 assert len(mention_extractor.get_mentions()) == 3 assert len(mention_extractor.get_mentions()[0]) == 299 assert (len( mention_extractor.get_mentions(docs=[ session.query(Document).filter(Document.name == "112823").first() ])[0]) == 70) # Candidate Extraction PartTemp = candidate_subclass("PartTemp", [Part, Temp]) PartVolt = candidate_subclass("PartVolt", [Part, Volt]) candidate_extractor = CandidateExtractor( session, [PartTemp, PartVolt], throttlers=[temp_throttler, volt_throttler]) for i, docs in enumerate([train_docs, dev_docs, test_docs]): candidate_extractor.apply(docs, split=i, parallelism=PARALLEL) assert session.query(PartTemp).filter(PartTemp.split == 0).count() == 3493 assert session.query(PartTemp).filter(PartTemp.split == 1).count() == 61 assert session.query(PartTemp).filter(PartTemp.split == 2).count() == 416 assert session.query(PartVolt).count() == 4282 # Grab candidate lists train_cands = candidate_extractor.get_candidates(split=0, sort=True) dev_cands = candidate_extractor.get_candidates(split=1, sort=True) test_cands = candidate_extractor.get_candidates(split=2, sort=True) assert len(train_cands) == 2 assert len(train_cands[0]) == 3493 assert (len( candidate_extractor.get_candidates(docs=[ session.query(Document).filter(Document.name == "112823").first() ])[0]) == 1432) # Featurization featurizer = Featurizer(session, [PartTemp, PartVolt]) # Test that FeatureKey is 
properly reset featurizer.apply(split=1, train=True, parallelism=PARALLEL) assert session.query(Feature).count() == 214 assert session.query(FeatureKey).count() == 1260 # Test Dropping FeatureKey # Should force a row deletion featurizer.drop_keys(["DDL_e1_W_LEFT_POS_3_[NNP NN IN]"]) assert session.query(FeatureKey).count() == 1259 # Should only remove the part_volt as a relation and leave part_temp assert set( session.query(FeatureKey).filter( FeatureKey.name == "DDL_e1_LEMMA_SEQ_[bc182]").one().candidate_classes) == { "part_temp", "part_volt" } featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"], candidate_classes=[PartVolt]) assert session.query(FeatureKey).filter( FeatureKey.name == "DDL_e1_LEMMA_SEQ_[bc182]").one().candidate_classes == ["part_temp"] assert session.query(FeatureKey).count() == 1259 # Inserting the removed key featurizer.upsert_keys(["DDL_e1_LEMMA_SEQ_[bc182]"], candidate_classes=[PartTemp, PartVolt]) assert set( session.query(FeatureKey).filter( FeatureKey.name == "DDL_e1_LEMMA_SEQ_[bc182]").one().candidate_classes) == { "part_temp", "part_volt" } assert session.query(FeatureKey).count() == 1259 # Removing the key again featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"], candidate_classes=[PartVolt]) # Removing the last relation from a key should delete the row featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"], candidate_classes=[PartTemp]) assert session.query(FeatureKey).count() == 1258 session.query(Feature).delete(synchronize_session="fetch") session.query(FeatureKey).delete(synchronize_session="fetch") featurizer.apply(split=0, train=True, parallelism=PARALLEL) assert session.query(Feature).count() == 6478 assert session.query(FeatureKey).count() == 4538 F_train = featurizer.get_feature_matrices(train_cands) assert F_train[0].shape == (3493, 4538) assert F_train[1].shape == (2985, 4538) assert len(featurizer.get_keys()) == 4538 featurizer.apply(split=1, parallelism=PARALLEL) assert session.query(Feature).count() == 6692 assert 
session.query(FeatureKey).count() == 4538 F_dev = featurizer.get_feature_matrices(dev_cands) assert F_dev[0].shape == (61, 4538) assert F_dev[1].shape == (153, 4538) featurizer.apply(split=2, parallelism=PARALLEL) assert session.query(Feature).count() == 8252 assert session.query(FeatureKey).count() == 4538 F_test = featurizer.get_feature_matrices(test_cands) assert F_test[0].shape == (416, 4538) assert F_test[1].shape == (1144, 4538) gold_file = "tests/data/hardware_tutorial_gold.csv" labeler = Labeler(session, [PartTemp, PartVolt]) labeler.apply( docs=last_docs, lfs=[[gold], [gold]], table=GoldLabel, train=True, parallelism=PARALLEL, ) assert session.query(GoldLabel).count() == 8252 stg_temp_lfs = [ LF_storage_row, LF_operating_row, LF_temperature_row, LF_tstg_row, LF_to_left, LF_negative_number_left, ] ce_v_max_lfs = [ LF_bad_keywords_in_row, LF_current_in_row, LF_non_ce_voltages_in_row, ] with pytest.raises(ValueError): labeler.apply(split=0, lfs=stg_temp_lfs, train=True, parallelism=PARALLEL) labeler.apply( docs=train_docs, lfs=[stg_temp_lfs, ce_v_max_lfs], train=True, parallelism=PARALLEL, ) assert session.query(Label).count() == 6478 assert session.query(LabelKey).count() == 9 L_train = labeler.get_label_matrices(train_cands) assert L_train[0].shape == (3493, 9) assert L_train[1].shape == (2985, 9) assert len(labeler.get_keys()) == 9 # Test Dropping LabelerKey labeler.drop_keys(["LF_storage_row"]) assert len(labeler.get_keys()) == 8 # Test Upserting LabelerKey labeler.upsert_keys(["LF_storage_row"]) assert "LF_storage_row" in [label.name for label in labeler.get_keys()] L_train_gold = labeler.get_gold_labels(train_cands) assert L_train_gold[0].shape == (3493, 1) L_train_gold = labeler.get_gold_labels(train_cands, annotator="gold") assert L_train_gold[0].shape == (3493, 1) gen_model = LabelModel() gen_model.fit(L_train=L_train[0], n_epochs=500, log_freq=100) train_marginals = gen_model.predict_proba(L_train[0]) # Collect word counter word_counter = 
collect_word_counter(train_cands) emmental.init(fonduer.Meta.log_path) # Training config config = { "meta_config": { "verbose": False }, "model_config": { "model_path": None, "device": 0, "dataparallel": False }, "learner_config": { "n_epochs": 5, "optimizer_config": { "lr": 0.001, "l2": 0.0 }, "task_scheduler": "round_robin", }, "logging_config": { "evaluation_freq": 1, "counter_unit": "epoch", "checkpointing": False, "checkpointer_config": { "checkpoint_metric": { f"{ATTRIBUTE}/{ATTRIBUTE}/train/loss": "min" }, "checkpoint_freq": 1, "checkpoint_runway": 2, "clear_intermediate_checkpoints": True, "clear_all_checkpoints": True, }, }, } emmental.Meta.update_config(config=config) # Generate word embedding module arity = 2 # Geneate special tokens specials = [] for i in range(arity): specials += [f"~~[[{i}", f"{i}]]~~"] emb_layer = EmbeddingModule(word_counter=word_counter, word_dim=300, specials=specials) diffs = train_marginals.max(axis=1) - train_marginals.min(axis=1) train_idxs = np.where(diffs > 1e-6)[0] train_dataloader = EmmentalDataLoader( task_to_label_dict={ATTRIBUTE: "labels"}, dataset=FonduerDataset( ATTRIBUTE, train_cands[0], F_train[0], emb_layer.word2id, train_marginals, train_idxs, ), split="train", batch_size=100, shuffle=True, ) tasks = create_task(ATTRIBUTE, 2, F_train[0].shape[1], 2, emb_layer, model="LogisticRegression") model = EmmentalModel(name=f"{ATTRIBUTE}_task") for task in tasks: model.add_task(task) emmental_learner = EmmentalLearner() emmental_learner.learn(model, [train_dataloader]) test_dataloader = EmmentalDataLoader( task_to_label_dict={ATTRIBUTE: "labels"}, dataset=FonduerDataset(ATTRIBUTE, test_cands[0], F_test[0], emb_layer.word2id, 2), split="test", batch_size=100, shuffle=False, ) test_preds = model.predict(test_dataloader, return_preds=True) positive = np.where( np.array(test_preds["probs"][ATTRIBUTE])[:, TRUE] > 0.6) true_pred = [test_cands[0][_] for _ in positive[0]] pickle_file = "tests/data/parts_by_doc_dict.pkl" with 
open(pickle_file, "rb") as f: parts_by_doc = pickle.load(f) (TP, FP, FN) = entity_level_f1(true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc) tp_len = len(TP) fp_len = len(FP) fn_len = len(FN) prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan") rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan") f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan") logger.info(f"prec: {prec}") logger.info(f"rec: {rec}") logger.info(f"f1: {f1}") assert f1 < 0.7 and f1 > 0.3 stg_temp_lfs_2 = [ LF_to_left, LF_test_condition_aligned, LF_collector_aligned, LF_current_aligned, LF_voltage_row_temp, LF_voltage_row_part, LF_typ_row, LF_complement_left_row, LF_too_many_numbers_row, LF_temp_on_high_page_num, LF_temp_outside_table, LF_not_temp_relevant, ] labeler.update(split=0, lfs=[stg_temp_lfs_2, ce_v_max_lfs], parallelism=PARALLEL) assert session.query(Label).count() == 6478 assert session.query(LabelKey).count() == 16 L_train = labeler.get_label_matrices(train_cands) assert L_train[0].shape == (3493, 16) gen_model = LabelModel() gen_model.fit(L_train=L_train[0], n_epochs=500, log_freq=100) train_marginals = gen_model.predict_proba(L_train[0]) diffs = train_marginals.max(axis=1) - train_marginals.min(axis=1) train_idxs = np.where(diffs > 1e-6)[0] train_dataloader = EmmentalDataLoader( task_to_label_dict={ATTRIBUTE: "labels"}, dataset=FonduerDataset( ATTRIBUTE, train_cands[0], F_train[0], emb_layer.word2id, train_marginals, train_idxs, ), split="train", batch_size=100, shuffle=True, ) emmental.Meta.reset() emmental.init(fonduer.Meta.log_path) emmental.Meta.update_config(config=config) tasks = create_task(ATTRIBUTE, 2, F_train[0].shape[1], 2, emb_layer, model="LogisticRegression") model = EmmentalModel(name=f"{ATTRIBUTE}_task") for task in tasks: model.add_task(task) emmental_learner = EmmentalLearner() emmental_learner.learn(model, [train_dataloader]) test_preds = model.predict(test_dataloader, 
return_preds=True) positive = np.where( np.array(test_preds["probs"][ATTRIBUTE])[:, TRUE] > 0.7) true_pred = [test_cands[0][_] for _ in positive[0]] (TP, FP, FN) = entity_level_f1(true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc) tp_len = len(TP) fp_len = len(FP) fn_len = len(FN) prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan") rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan") f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan") logger.info(f"prec: {prec}") logger.info(f"rec: {rec}") logger.info(f"f1: {f1}") assert f1 > 0.7 # Testing LSTM emmental.Meta.reset() emmental.init(fonduer.Meta.log_path) emmental.Meta.update_config(config=config) tasks = create_task(ATTRIBUTE, 2, F_train[0].shape[1], 2, emb_layer, model="LSTM") model = EmmentalModel(name=f"{ATTRIBUTE}_task") for task in tasks: model.add_task(task) emmental_learner = EmmentalLearner() emmental_learner.learn(model, [train_dataloader]) test_preds = model.predict(test_dataloader, return_preds=True) positive = np.where( np.array(test_preds["probs"][ATTRIBUTE])[:, TRUE] > 0.7) true_pred = [test_cands[0][_] for _ in positive[0]] (TP, FP, FN) = entity_level_f1(true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc) tp_len = len(TP) fp_len = len(FP) fn_len = len(FN) prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan") rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan") f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan") logger.info(f"prec: {prec}") logger.info(f"rec: {rec}") logger.info(f"f1: {f1}") assert f1 > 0.7
def train_f_on_d_U(self, datafeeder, num_epochs, loss_type):
    """Train the f network jointly on labeled (d) and unlabeled (U) data.

    Selects the train/loss ops matching ``loss_type``, optionally loads a
    pre-trained snorkel LabelModel (or builds a MajorityLabelVoter) for the
    snorkel/gcross modes, then runs the epoch/batch loop with per-epoch
    evaluation, TensorBoard reporting, checkpointing, and early stopping on
    ``self.config.f_d_primary_metric``.

    :param datafeeder: Supplies f_d_U batches and batch/epoch bookkeeping.
    :param num_epochs: Maximum number of training epochs.
    :param loss_type: One of 'pure-likelihood', 'implication', 'pr_loss',
        'gcross', 'gcross_snorkel', 'learn2reweight', 'label_snorkel',
        'pure_snorkel'.
    :raises ValueError: If ``loss_type`` is not one of the above.

    NOTE(review): ``f_d_U`` is a module-level constant identifying this
    training phase — confirm it is in scope at import time.
    """
    sess = self.hls.sess
    total_batch = datafeeder.get_batches_per_epoch(f_d_U)
    batch_size = datafeeder.get_batch_size(f_d_U)
    # Dispatch: pick the optimizer op and loss tensor for the requested loss.
    if loss_type == 'pure-likelihood':
        train_op = self.hls.f_d_U_pure_likelihood_op
        loss_op = self.hls.f_d_U_pure_likelihood_loss
    elif loss_type == 'implication':
        train_op = self.hls.f_d_U_implication_op
        loss_op = self.hls.f_d_U_implication_loss
    elif loss_type == 'pr_loss':
        train_op = self.hls.pr_train_op
        loss_op = self.hls.pr_loss
    elif loss_type == 'gcross':
        train_op = self.hls.gcross_train_op
        loss_op = self.hls.gcross_loss
    elif loss_type == 'gcross_snorkel':
        train_op = self.hls.snork_gcross_train_op
        loss_op = self.hls.snork_gcross_loss
    elif loss_type == 'learn2reweight':
        train_op = self.hls.l2r_train_op
        loss_op = self.hls.l2r_loss
    elif loss_type == 'label_snorkel':
        train_op = self.hls.label_snorkel_train_op
        loss_op = self.hls.label_snorkel_loss
    elif loss_type == 'pure_snorkel':
        train_op = self.hls.pure_snorkel_train_op
        loss_op = self.hls.pure_snorkel_loss
    else:
        raise ValueError('Invalid loss type %s' % loss_type)
    best_saver_f_d_U = self.hls.best_savers.get_best_saver(f_d_U)
    metrics_dict = {}  #{'config': self.config}
    # Snorkel modes require a LabelModel that was trained and saved in a
    # previous step; terminate hard if it is missing.
    if 'label_snorkel' == self.config.mode or 'pure_snorkel' == self.config.mode or 'gcross_snorkel' == self.config.mode:
        label_model = LabelModel(cardinality=self.hls.num_classes,
                                 verbose=True)
        if os.path.isfile(
                os.path.join(self.config.data_dir, "saved_label_model")):
            label_model = label_model.load(
                os.path.join(self.config.data_dir, "saved_label_model"))
        else:
            print("LABEL MODEL NOT SAVED")
            # NOTE(review): exit() kills the whole process; an exception would
            # be friendlier to callers, but behavior is kept as-is.
            exit()
    if 'gcross' in self.config.mode or 'learn2reweight' in self.config.mode:
        majority_model = MajorityLabelVoter(
            cardinality=self.hls.num_classes)
    with sess.as_default():
        print("Optimization started for f_d_U with %s loss!" % loss_type)
        print("Batch size: %d!" % batch_size)
        print("Batches per epoch : %d!" % total_batch)
        print("Number of epochs: %d!" % num_epochs)
        # Training cycle
        iteration = 0  # NOTE(review): never incremented/read below.
        global_step = 0
        patience = 0
        for epoch in range(num_epochs):
            avg_epoch_cost = 0.
            for i in range(total_batch):
                batch_x, batch_l, batch_m, batch_L, batch_d, batch_r =\
                    datafeeder.get_f_d_U_next_batch()
                feed_dict = {
                    self.hls.f_d_U_adam_lr: self.config.f_d_U_adam_lr,
                    self.hls.f_d_U_x: batch_x,
                    self.hls.f_d_U_l: batch_l,
                    self.hls.f_d_U_m: batch_m,
                    self.hls.f_d_U_L: batch_L,
                    self.hls.f_d_U_d: batch_d,
                    self.hls.f_d_U_r: batch_r
                }
                # Convert LF outputs to snorkel's label-matrix convention.
                batch_lsnork = conv_l_to_lsnork(batch_l, batch_m)
                if 'label_snorkel' == self.config.mode or 'pure_snorkel' == self.config.mode or 'gcross_snorkel' == self.config.mode:
                    batch_snork_L = label_model.predict_proba(
                        L=batch_lsnork)  #snorkel_probs
                    feed_dict[self.hls.f_d_U_snork_L] = batch_snork_L
                if 'gcross' == self.config.mode or 'learn2reweight' == self.config.mode:
                    batch_snork_L = majority_model.predict(
                        L=batch_lsnork)  #majority votes
                    batch_snork_L = np.eye(
                        self.hls.num_classes)[batch_snork_L]  #one hot rep
                    feed_dict[self.hls.f_d_U_snork_L] = batch_snork_L
                merge_dict_a_into_b(self.hls.dropout_train_dict, feed_dict)
                # Run optimization op (backprop) and cost op (to get loss value)
                _, cost, num_d, f_d_U_global_step = sess.run(
                    [
                        train_op, loss_op, self.hls.f_d_U_num_d,
                        self.hls.f_d_U_global_step
                    ],
                    feed_dict=feed_dict)
                global_epoch = f_d_U_global_step / total_batch
                # This assertion is valid only if true U labels are available
                # but not being used such as for synthetic data.
                assert np.all(batch_L <= self.hls.num_classes)
                avg_epoch_cost += cost / total_batch
                # Running mean of the cost over the batches seen so far.
                cost1 = (avg_epoch_cost * total_batch) / (i + 1)
                global_step += 1
            # Compute and report metrics, update checkpoints after each epoch
            print("\n========== epoch : {} ============\n".format(epoch))
            print("cost: {}\n".format(cost1))
            print("patience: {}\n".format(patience))
            precision, recall, f1_score, support = self.hls.test.test_f(
                datafeeder)
            self.compute_f_d_metrics(metrics_dict, precision, recall,
                                     f1_score, support, global_epoch,
                                     f_d_U_global_step)
            print("\nmetrics_dict: ", metrics_dict)
            print()
            self.report_f_d_perfs_to_tensorboard(cost1, metrics_dict,
                                                 global_step)
            # Early stopping on the primary metric.
            did_improve = self.maybe_save_metrics_dict(f_d_U, metrics_dict)
            if did_improve:
                patience = 0  #rest patience if primary metric improved
            else:
                patience += 1
            if patience > self.config.early_stopping_p:
                print("bye! stopping early!......")
                break
            # Save checkpoint
            print()
            self.hls.mru_saver.save(global_step)
            print()
            best_saver_f_d_U.save_if_best(
                metrics_dict[self.config.f_d_primary_metric])
            print()
            global_step += 1
        print("Optimization Finished for f_d_U!")
def test_e2e():
    """Run an end-to-end test on documents of the hardware domain.

    Pipeline exercised, in order: parsing (HTML + PDF visual info), mention
    extraction, candidate extraction, featurization (incl. FeatureKey
    drop/upsert bookkeeping), gold + weak labeling, generative LabelModel
    fitting, and several discriminative models (LogisticRegression, LSTM,
    sparse variants), each checked against entity-level F1 thresholds.
    All hard-coded counts/shapes below pin the expected DB state for the
    12 fixture documents.
    """
    PARALLEL = 4
    max_docs = 12
    fonduer.init_logging(
        log_dir="log_folder",
        format="[%(asctime)s][%(levelname)s] %(name)s:%(lineno)s - %(message)s",
        level=logging.INFO,
    )
    session = fonduer.Meta.init(CONN_STRING).Session()
    docs_path = "tests/data/html/"
    pdf_path = "tests/data/pdf/"
    doc_preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)

    # Parse with structural, lingual, and visual (PDF-coordinate) features.
    corpus_parser = Parser(
        session,
        parallelism=PARALLEL,
        structural=True,
        lingual=True,
        visual=True,
        pdf_path=pdf_path,
    )
    corpus_parser.apply(doc_preprocessor)
    assert session.query(Document).count() == max_docs
    num_docs = session.query(Document).count()
    logger.info(f"Docs: {num_docs}")
    assert num_docs == max_docs
    num_sentences = session.query(Sentence).count()
    logger.info(f"Sentences: {num_sentences}")

    # Divide into test and train
    docs = sorted(corpus_parser.get_documents())
    last_docs = sorted(corpus_parser.get_last_documents())
    ld = len(docs)
    assert ld == len(last_docs)
    assert len(docs[0].sentences) == len(last_docs[0].sentences)

    # Per-document sentence counts for the 12 fixture docs.
    assert len(docs[0].sentences) == 799
    assert len(docs[1].sentences) == 663
    assert len(docs[2].sentences) == 784
    assert len(docs[3].sentences) == 661
    assert len(docs[4].sentences) == 513
    assert len(docs[5].sentences) == 700
    assert len(docs[6].sentences) == 528
    assert len(docs[7].sentences) == 161
    assert len(docs[8].sentences) == 228
    assert len(docs[9].sentences) == 511
    assert len(docs[10].sentences) == 331
    assert len(docs[11].sentences) == 528

    # Check table numbers
    assert len(docs[0].tables) == 9
    assert len(docs[1].tables) == 9
    assert len(docs[2].tables) == 14
    assert len(docs[3].tables) == 11
    assert len(docs[4].tables) == 11
    assert len(docs[5].tables) == 10
    assert len(docs[6].tables) == 10
    assert len(docs[7].tables) == 2
    assert len(docs[8].tables) == 7
    assert len(docs[9].tables) == 10
    assert len(docs[10].tables) == 6
    assert len(docs[11].tables) == 9

    # Check figure numbers
    assert len(docs[0].figures) == 32
    assert len(docs[1].figures) == 11
    assert len(docs[2].figures) == 38
    assert len(docs[3].figures) == 31
    assert len(docs[4].figures) == 7
    assert len(docs[5].figures) == 38
    assert len(docs[6].figures) == 10
    assert len(docs[7].figures) == 31
    assert len(docs[8].figures) == 4
    assert len(docs[9].figures) == 27
    assert len(docs[10].figures) == 5
    assert len(docs[11].figures) == 27

    # Check caption numbers
    assert len(docs[0].captions) == 0
    assert len(docs[1].captions) == 0
    assert len(docs[2].captions) == 0
    assert len(docs[3].captions) == 0
    assert len(docs[4].captions) == 0
    assert len(docs[5].captions) == 0
    assert len(docs[6].captions) == 0
    assert len(docs[7].captions) == 0
    assert len(docs[8].captions) == 0
    assert len(docs[9].captions) == 0
    assert len(docs[10].captions) == 0
    assert len(docs[11].captions) == 0

    # Split documents 50% / 25% / 25% into train / dev / test by sorted name.
    train_docs = set()
    dev_docs = set()
    test_docs = set()
    splits = (0.5, 0.75)
    data = [(doc.name, doc) for doc in docs]
    data.sort(key=lambda x: x[0])
    for i, (doc_name, doc) in enumerate(data):
        if i < splits[0] * ld:
            train_docs.add(doc)
        elif i < splits[1] * ld:
            dev_docs.add(doc)
        else:
            test_docs.add(doc)
    logger.info([x.name for x in train_docs])

    # NOTE: With multi-relation support, return values of getting candidates,
    # mentions, or sparse matrices are formatted as a list of lists. This means
    # that with a single relation, we need to index into the list of lists to
    # get the candidates/mentions/sparse matrix for a particular relation or
    # mention.

    # Mention Extraction
    part_ngrams = MentionNgramsPart(parts_by_doc=None, n_max=3)
    temp_ngrams = MentionNgramsTemp(n_max=2)
    volt_ngrams = MentionNgramsVolt(n_max=1)
    Part = mention_subclass("Part")
    Temp = mention_subclass("Temp")
    Volt = mention_subclass("Volt")
    mention_extractor = MentionExtractor(
        session,
        [Part, Temp, Volt],
        [part_ngrams, temp_ngrams, volt_ngrams],
        [part_matcher, temp_matcher, volt_matcher],
    )
    mention_extractor.apply(docs, parallelism=PARALLEL)
    assert session.query(Part).count() == 299
    assert session.query(Temp).count() == 138
    assert session.query(Volt).count() == 140
    assert len(mention_extractor.get_mentions()) == 3
    assert len(mention_extractor.get_mentions()[0]) == 299
    # Restricting to a single document should return only its mentions.
    assert (
        len(
            mention_extractor.get_mentions(
                docs=[session.query(Document).filter(Document.name == "112823").first()]
            )[0]
        )
        == 70
    )

    # Candidate Extraction
    PartTemp = candidate_subclass("PartTemp", [Part, Temp])
    PartVolt = candidate_subclass("PartVolt", [Part, Volt])
    candidate_extractor = CandidateExtractor(
        session, [PartTemp, PartVolt], throttlers=[temp_throttler, volt_throttler]
    )
    # NOTE(review): this loop variable shadows the outer `docs` list; after the
    # loop `docs` refers to `test_docs`. Later code only uses `last_docs` /
    # the split sets, so this appears harmless — but confirm before reusing
    # `docs` below this point.
    for i, docs in enumerate([train_docs, dev_docs, test_docs]):
        candidate_extractor.apply(docs, split=i, parallelism=PARALLEL)
    assert session.query(PartTemp).filter(PartTemp.split == 0).count() == 3493
    assert session.query(PartTemp).filter(PartTemp.split == 1).count() == 61
    assert session.query(PartTemp).filter(PartTemp.split == 2).count() == 416
    assert session.query(PartVolt).count() == 4282

    # Grab candidate lists
    train_cands = candidate_extractor.get_candidates(split=0, sort=True)
    dev_cands = candidate_extractor.get_candidates(split=1, sort=True)
    test_cands = candidate_extractor.get_candidates(split=2, sort=True)
    assert len(train_cands) == 2
    assert len(train_cands[0]) == 3493
    assert (
        len(
            candidate_extractor.get_candidates(
                docs=[session.query(Document).filter(Document.name == "112823").first()]
            )[0]
        )
        == 1432
    )

    # Featurization
    featurizer = Featurizer(session, [PartTemp, PartVolt])

    # Test that FeatureKey is properly reset
    featurizer.apply(split=1, train=True, parallelism=PARALLEL)
    assert session.query(Feature).count() == 214
    assert session.query(FeatureKey).count() == 1260

    # Test Dropping FeatureKey
    # Should force a row deletion
    featurizer.drop_keys(["DDL_e1_W_LEFT_POS_3_[NNP NN IN]"])
    assert session.query(FeatureKey).count() == 1259

    # Should only remove the part_volt as a relation and leave part_temp
    assert set(
        session.query(FeatureKey)
        .filter(FeatureKey.name == "DDL_e1_LEMMA_SEQ_[bc182]")
        .one()
        .candidate_classes
    ) == {"part_temp", "part_volt"}
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"], candidate_classes=[PartVolt])
    assert session.query(FeatureKey).filter(
        FeatureKey.name == "DDL_e1_LEMMA_SEQ_[bc182]"
    ).one().candidate_classes == ["part_temp"]
    assert session.query(FeatureKey).count() == 1259

    # Inserting the removed key
    featurizer.upsert_keys(
        ["DDL_e1_LEMMA_SEQ_[bc182]"], candidate_classes=[PartTemp, PartVolt]
    )
    assert set(
        session.query(FeatureKey)
        .filter(FeatureKey.name == "DDL_e1_LEMMA_SEQ_[bc182]")
        .one()
        .candidate_classes
    ) == {"part_temp", "part_volt"}
    assert session.query(FeatureKey).count() == 1259
    # Removing the key again
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"], candidate_classes=[PartVolt])
    # Removing the last relation from a key should delete the row
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"], candidate_classes=[PartTemp])
    assert session.query(FeatureKey).count() == 1258

    # Wipe the split-1 features and re-featurize from the train split.
    session.query(Feature).delete(synchronize_session="fetch")
    session.query(FeatureKey).delete(synchronize_session="fetch")
    featurizer.apply(split=0, train=True, parallelism=PARALLEL)
    assert session.query(Feature).count() == 6478
    assert session.query(FeatureKey).count() == 4538
    F_train = featurizer.get_feature_matrices(train_cands)
    assert F_train[0].shape == (3493, 4538)
    assert F_train[1].shape == (2985, 4538)
    assert len(featurizer.get_keys()) == 4538

    # Featurize dev/test with train=False: key count must stay fixed.
    featurizer.apply(split=1, parallelism=PARALLEL)
    assert session.query(Feature).count() == 6692
    assert session.query(FeatureKey).count() == 4538
    F_dev = featurizer.get_feature_matrices(dev_cands)
    assert F_dev[0].shape == (61, 4538)
    assert F_dev[1].shape == (153, 4538)

    featurizer.apply(split=2, parallelism=PARALLEL)
    assert session.query(Feature).count() == 8252
    assert session.query(FeatureKey).count() == 4538
    F_test = featurizer.get_feature_matrices(test_cands)
    assert F_test[0].shape == (416, 4538)
    assert F_test[1].shape == (1144, 4538)

    gold_file = "tests/data/hardware_tutorial_gold.csv"

    # Gold labeling over every document for both relations.
    labeler = Labeler(session, [PartTemp, PartVolt])
    labeler.apply(
        docs=last_docs,
        lfs=[[gold], [gold]],
        table=GoldLabel,
        train=True,
        parallelism=PARALLEL,
    )
    assert session.query(GoldLabel).count() == 8252

    # Weak labeling functions for the two relations.
    stg_temp_lfs = [
        LF_storage_row,
        LF_operating_row,
        LF_temperature_row,
        LF_tstg_row,
        LF_to_left,
        LF_negative_number_left,
    ]
    ce_v_max_lfs = [
        LF_bad_keywords_in_row,
        LF_current_in_row,
        LF_non_ce_voltages_in_row,
    ]
    # A flat LF list (not list-of-lists) must be rejected for multi-relation.
    with pytest.raises(ValueError):
        labeler.apply(split=0, lfs=stg_temp_lfs, train=True, parallelism=PARALLEL)

    labeler.apply(
        docs=train_docs,
        lfs=[stg_temp_lfs, ce_v_max_lfs],
        train=True,
        parallelism=PARALLEL,
    )
    assert session.query(Label).count() == 6478
    assert session.query(LabelKey).count() == 9
    L_train = labeler.get_label_matrices(train_cands)
    assert L_train[0].shape == (3493, 9)
    assert L_train[1].shape == (2985, 9)
    assert len(labeler.get_keys()) == 9

    # Test Dropping LabelerKey
    labeler.drop_keys(["LF_storage_row"])
    assert len(labeler.get_keys()) == 8

    # Test Upserting LabelerKey
    labeler.upsert_keys(["LF_storage_row"])
    assert "LF_storage_row" in [label.name for label in labeler.get_keys()]

    L_train_gold = labeler.get_gold_labels(train_cands)
    assert L_train_gold[0].shape == (3493, 1)
    L_train_gold = labeler.get_gold_labels(train_cands, annotator="gold")
    assert L_train_gold[0].shape == (3493, 1)

    # Fit the generative label model on the LF votes, then train a
    # discriminative model on its probabilistic labels.
    gen_model = LabelModel()
    gen_model.fit(L_train=L_train[0], n_epochs=500, log_freq=100)
    train_marginals = gen_model.predict_proba(L_train[0])

    disc_model = LogisticRegression()
    disc_model.train(
        (train_cands[0], F_train[0]),
        train_marginals,
        X_dev=(train_cands[0], F_train[0]),
        Y_dev=L_train_gold[0].reshape(-1),
        b=0.6,
        pos_label=TRUE,
        n_epochs=5,
        lr=0.001,
    )
    test_score = disc_model.predict((test_cands[0], F_test[0]), b=0.6, pos_label=TRUE)
    true_pred = [test_cands[0][_] for _ in np.nditer(np.where(test_score == TRUE))]

    pickle_file = "tests/data/parts_by_doc_dict.pkl"
    with open(pickle_file, "rb") as f:
        parts_by_doc = pickle.load(f)

    # Entity-level scoring against the gold CSV.
    (TP, FP, FN) = entity_level_f1(
        true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc
    )
    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")
    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")
    # With the weak initial LF set, F1 should land in a mediocre band.
    assert f1 < 0.7 and f1 > 0.3

    # A richer LF set for the temperature relation; updating labels in place
    # should improve F1 past 0.7.
    stg_temp_lfs_2 = [
        LF_to_left,
        LF_test_condition_aligned,
        LF_collector_aligned,
        LF_current_aligned,
        LF_voltage_row_temp,
        LF_voltage_row_part,
        LF_typ_row,
        LF_complement_left_row,
        LF_too_many_numbers_row,
        LF_temp_on_high_page_num,
        LF_temp_outside_table,
        LF_not_temp_relevant,
    ]
    labeler.update(split=0, lfs=[stg_temp_lfs_2, ce_v_max_lfs], parallelism=PARALLEL)
    assert session.query(Label).count() == 6478
    assert session.query(LabelKey).count() == 16
    L_train = labeler.get_label_matrices(train_cands)
    assert L_train[0].shape == (3493, 16)

    gen_model = LabelModel()
    gen_model.fit(L_train=L_train[0], n_epochs=500, log_freq=100)
    train_marginals = gen_model.predict_proba(L_train[0])

    disc_model = LogisticRegression()
    disc_model.train(
        (train_cands[0], F_train[0]), train_marginals, n_epochs=5, lr=0.001
    )
    test_score = disc_model.predict((test_cands[0], F_test[0]), b=0.6, pos_label=TRUE)
    true_pred = [test_cands[0][_] for _ in np.nditer(np.where(test_score == TRUE))]
    (TP, FP, FN) = entity_level_f1(
        true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc
    )
    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")
    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")
    assert f1 > 0.7

    # Testing LSTM
    disc_model = LSTM()
    disc_model.train(
        (train_cands[0], F_train[0]), train_marginals, n_epochs=5, lr=0.001
    )
    test_score = disc_model.predict((test_cands[0], F_test[0]), b=0.6, pos_label=TRUE)
    true_pred = [test_cands[0][_] for _ in np.nditer(np.where(test_score == TRUE))]
    (TP, FP, FN) = entity_level_f1(
        true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc
    )
    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")
    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")
    assert f1 > 0.7

    # Testing Sparse Logistic Regression
    disc_model = SparseLogisticRegression()
    disc_model.train(
        (train_cands[0], F_train[0]), train_marginals, n_epochs=5, lr=0.001
    )
    test_score = disc_model.predict((test_cands[0], F_test[0]), b=0.6, pos_label=TRUE)
    true_pred = [test_cands[0][_] for _ in np.nditer(np.where(test_score == TRUE))]
    (TP, FP, FN) = entity_level_f1(
        true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc
    )
    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")
    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")
    assert f1 > 0.7

    # Testing Sparse LSTM
    disc_model = SparseLSTM()
    disc_model.train(
        (train_cands[0], F_train[0]), train_marginals, n_epochs=5, lr=0.001
    )
    test_score = disc_model.predict((test_cands[0], F_test[0]), b=0.6, pos_label=TRUE)
    true_pred = [test_cands[0][_] for _ in np.nditer(np.where(test_score == TRUE))]
    (TP, FP, FN) = entity_level_f1(
        true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc
    )
    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")
    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")
    assert f1 > 0.7

    # Evaluate mention level scores
    L_test_gold = labeler.get_gold_labels(test_cands, annotator="gold")
    Y_test = L_test_gold[0].reshape(-1)
    scores = disc_model.score((test_cands[0], F_test[0]), Y_test, b=0.6, pos_label=TRUE)
    logger.info(scores)
    assert scores["f1"] > 0.6
# Build a weakly-supervised training set: apply labeling functions to the raw
# training pairs, fit a Snorkel LabelModel over their votes, convert its
# probabilistic output to hard labels, and downsample the dominant
# "otherwiseRelated" class.
data_handler = dh.DataHandler()
cui2vec = Cui2Vec().cui2vec

# Held-out splits. Each getter returns parallel sequences:
# (sentence, shortest dependency path, src/tgt concepts, src/tgt surface
# text, gold labels) — presumably CUIs for src/tgt; verify against DataHandler.
X_gold_sent, X_gold_shortest_path, X_gold_src, X_gold_tgt, X_gold_src_txt, X_gold_tgt_txt, y_gold = data_handler.get_test_data()
X_val_sent, X_val_shortest_path, X_val_src, X_val_tgt, X_val_src_txt, X_val_tgt_txt, y_val = data_handler.get_validation_data()

# Apply every labeling function to the training pairs to get the vote matrix.
applier = PandasLFApplier(label_functions.lfs)
df_train = pd.DataFrame(
    list(zip(*data_handler.get_training_data())),
    columns=['shortest_path', 'sent', 'src', 'tgt', 'src_txt', 'tgt_txt'],
)
L_train = applier.apply(df_train)

# Fit the generative label model over the LF votes and persist it.
label_model = LabelModel(cardinality=len(rel_names.rels_txt_to_int), verbose=True)
label_model.fit(L_train, n_epochs=1000, lr=0.01, log_freq=100, seed=123)
label_model.save('./models/LabelModel.model')

# Hard predictions plus the winning probability per example.
train_probs = label_model.predict_proba(L_train)
train_preds = probs_to_preds(train_probs, tie_break_policy='abstain')
df_train = df_train.join(
    pd.DataFrame({'preds': train_preds, 'probs': list(map(max, train_probs))})
)

# -1 to otherwiseRelated
df_train.loc[df_train.preds == -1, 'preds'] = rel_names.rels_txt_to_int['otherwiseRelated']

# Downsample otherwiseRelated to roughly the mean class size.
# Clamp at 0: if the class is already no larger than the mean,
# a negative n would make DataFrame.sample raise ValueError.
otherwise_mask = df_train.preds == rel_names.rels_txt_to_int['otherwiseRelated']
dropNum = max(
    0, int(otherwise_mask.sum()) - int(df_train['preds'].value_counts().mean())
)
df_train = df_train.drop(df_train[otherwise_mask].sample(dropNum).index)