def snorkel_process(keylist, dataframe, allweaklabf):
    # Keep only the highest-probability class per row; zero out the rest.
    def func(x):
        idx = (-x).argsort()[1:]
        x[idx] = 0
        return x

    cardinalitynu = len(keylist)
    applier = PandasLFApplier(lfs=allweaklabf)
    all_train_l = applier.apply(df=dataframe)
    report = LFAnalysis(L=all_train_l, lfs=allweaklabf).lf_summary()
    print(report)

    label_model = LabelModel(cardinality=cardinalitynu, verbose=False)
    label_model.fit(all_train_l)
    predt = label_model.predict(all_train_l)
    predt1 = label_model.predict_proba(all_train_l)

    keylist1 = keylist.copy()
    # keylist1.append('Not_relevent')
    predt2 = pd.DataFrame(predt1, columns=keylist1)
    dataframe['L_label'] = predt
    dataframe1 = dataframe.join(predt2, how='outer')
    dataframe1 = dataframe1[dataframe1.L_label >= 0]

    train, test = train_test_split(dataframe1, test_size=0.2)
    trainsent = train.sent.values
    trainlabel = train[keylist].values
    # Assign the result: relying on in-place mutation inside apply_along_axis is fragile.
    trainlabe2 = np.apply_along_axis(func, 1, trainlabel.copy())
    trainlabe2 = np.where(trainlabe2 > 0, 1, 0)
    testsent = test.sent.values
    testlabel = test[keylist].values
    testlabe2 = np.apply_along_axis(func, 1, testlabel.copy())
    testlabe2 = np.where(testlabe2 > 0, 1, 0)
    return trainsent, trainlabe2, testsent, testlabe2, keylist, report
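# Usage sketch for snorkel_process above. This is a hedged example, not from the
# original code: the LF names, keylist entries, and toy sentences are assumptions.
import pandas as pd
from snorkel.labeling import labeling_function

@labeling_function()
def lf_sports(x):
    # Vote for class 0 ("sports") when a sports keyword appears; else abstain.
    return 0 if "match" in x.sent else -1

@labeling_function()
def lf_politics(x):
    # Vote for class 1 ("politics") when a politics keyword appears; else abstain.
    return 1 if "tax" in x.sent else -1

toy_df = pd.DataFrame({"sent": ["big match tonight", "new tax bill passed"] * 20})
trainsent, trainlab, testsent, testlab, keys, report = snorkel_process(
    ["sports", "politics"], toy_df, [lf_sports, lf_politics]
)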
def test_score(self):
    L = np.array([[1, 1, 0], [-1, -1, -1], [1, 0, 1]])
    Y = np.array([1, 0, 1])
    label_model = LabelModel(cardinality=2, verbose=False)
    label_model.fit(L, n_epochs=100)
    results = label_model.score(L, Y, metrics=["accuracy", "coverage"])
    np.testing.assert_array_almost_equal(
        label_model.predict(L), np.array([1, -1, 1])
    )
    results_expected = dict(accuracy=1.0, coverage=2 / 3)
    self.assertEqual(results, results_expected)

    L = np.array([[1, 0, 1], [1, 0, 1]])
    label_model = self._set_up_model(L)
    label_model.mu = nn.Parameter(label_model.mu_init.clone().clamp(0.01, 0.99))
    results = label_model.score(L, Y=np.array([0, 1]))
    results_expected = dict(accuracy=0.5)
    self.assertEqual(results, results_expected)

    results = label_model.score(L=L, Y=np.array([1, 0]), metrics=["accuracy", "f1"])
    results_expected = dict(accuracy=0.5, f1=2 / 3)
    self.assertEqual(results, results_expected)
def test_label_model_sparse(self) -> None:
    """Test the LabelModel's estimate of P and Y on a sparse synthetic dataset.

    This tests the common setting where LFs abstain most of the time, which can
    cause issues, for example, if parameter clamping is set too high
    (e.g. see Issue #1422).
    """
    np.random.seed(123)
    P, Y, L = generate_simple_label_matrix(
        self.n, self.m, self.cardinality, abstain_multiplier=1000.0
    )

    # Train LabelModel
    label_model = LabelModel(cardinality=self.cardinality, verbose=False)
    label_model.fit(L, n_epochs=1000, lr=0.01, seed=123)

    # Test estimated LF conditional probabilities
    P_lm = label_model.get_conditional_probs()
    np.testing.assert_array_almost_equal(P, P_lm, decimal=2)

    # Test predicted labels *only on non-abstained data points*
    Y_pred = label_model.predict(L, tie_break_policy="abstain")
    (idx,) = np.where(Y_pred != -1)
    acc = np.where(Y_pred[idx] == Y[idx], 1, 0).sum() / len(idx)
    self.assertGreaterEqual(acc, 0.65)

    # Make sure that we don't output abstain when an LF votes, per Issue #1422
    self.assertEqual(len(idx), np.where((L + 1).sum(axis=1) != 0, 1, 0).sum())
def generate_labels_with_snorkel(dataframe):
    """
    Labels the full data using Snorkel.

    :param dataframe: Pandas dataframe containing all data
    :return: dataframe extended with a label column
    """
    # Define the set of labeling functions (LFs)
    lfs = [
        lf_ubo_is_company,
        lf_troika_company,
        lf_uk_blacklisted_company,
        lf_non_uk_blacklisted_company,
    ]

    # Apply the LFs to the unlabeled training data
    applier = PandasLFApplier(lfs)
    L_train = applier.apply(dataframe)

    # Train the label model and compute the training labels
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
    dataframe["label"] = label_model.predict(L=L_train, tie_break_policy="abstain")

    # Filter out the abstain data points
    dataframe = dataframe[dataframe.label != ABSTAIN]
    return dataframe
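# Hedged sketch of what one of the LFs above might look like. The real
# lf_ubo_is_company is defined elsewhere; the `ubo_type` column name and the
# label constants here are assumptions for illustration only.
from snorkel.labeling import labeling_function

ABSTAIN, NOT_SUSPICIOUS, SUSPICIOUS = -1, 0, 1

@labeling_function()
def lf_ubo_is_company_sketch(x):
    # Flag rows whose ultimate beneficial owner is itself a company.
    return SUSPICIOUS if x.ubo_type == "company" else ABSTAIN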
def main(data_path, output_path):
    # Read data
    logging.info(f"Reading data from {data_path}")
    sc = SparkContext()
    sql = SQLContext(sc)
    data = sql.read.parquet(data_path)

    # Build label matrix
    logging.info("Applying LFs")
    lfs = [article_mentions_person, body_contains_fortune, person_in_db]
    applier = SparkLFApplier(lfs)
    L = applier.apply(data.rdd)

    # Train label model
    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(L)

    # Generate training labels
    logging.info("Generating probabilistic labels")
    y_prob = label_model.predict_proba(L)[:, 1]
    y_prob_sql_array = F.array([F.lit(y) for y in y_prob])
    data_labeled = data.withColumn("y_prob", y_prob_sql_array)
    data_labeled.write.mode("overwrite").parquet(output_path)
    logging.info(f"Labels saved to {output_path}")
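# Hedged sketch of an LF compatible with the SparkLFApplier call above: the
# applier maps LFs over an RDD of pyspark Rows, so each LF receives a Row.
# The `body` field mirrors body_contains_fortune and is an assumption.
from snorkel.labeling import labeling_function

@labeling_function()
def body_contains_fortune_sketch(x):
    return 1 if "fortune" in (x.body or "").lower() else -1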
def test_save_and_load(self):
    L = np.array([[0, -1, 0], [0, 1, 0]])
    label_model = LabelModel(cardinality=2, verbose=False)
    label_model.fit(L, n_epochs=1)
    dir_path = tempfile.mkdtemp()
    # Join properly so the saved file lives inside the temp dir and gets cleaned up.
    save_path = os.path.join(dir_path, "label_model")
    label_model.save(save_path)
    label_model.load(save_path)
    shutil.rmtree(dir_path)
def test_set_mu_eps(self):
    mu_eps = 0.0123

    # Construct a label matrix such that P(\lambda_1 = 0 | Y) = 0.0, so it will
    # hit the mu_eps floor
    L = np.array([[1, 1, 1], [1, 1, 1]])

    label_model = LabelModel(verbose=False)
    label_model.fit(L, mu_eps=mu_eps)
    self.assertAlmostEqual(label_model.get_conditional_probs()[0, 1, 0], mu_eps)
def train_model_random_lfs(randomly_sampled_lfs, train_matrix, dev_matrix,
                           dev_labels, test_matrix, regularization_grid):
    hyper_grid_results = defaultdict(dict)
    train_grid_results = defaultdict(dict)
    dev_grid_results = defaultdict(dict)
    test_grid_results = defaultdict(dict)
    models = defaultdict(dict)

    for lf_sample in tqdm_notebook(enumerate(randomly_sampled_lfs)):
        for param in regularization_grid:
            label_model = LabelModel(cardinality=2)
            label_model.fit(
                train_matrix[:, lf_sample[1]],
                n_epochs=1000,
                seed=100,
                lr=0.01,
                l2=param,
            )

            # Get marginals for each parameter
            hyper_grid_results[str(param)] = roc_curve(
                dev_labels,
                label_model.predict_proba(dev_matrix[:, lf_sample[1]])[:, 1],
            )

        # Convert marginals into AUROCs
        hyper_grid_results = {
            param: auc(hyper_grid_results[param][0], hyper_grid_results[param][1])
            for param in hyper_grid_results
        }

        # Select the parameter with the highest AUROC
        best_param = float(
            max(hyper_grid_results.items(), key=operator.itemgetter(1))[0]
        )

        # Re-fit the model
        label_model.fit(
            train_matrix[:, lf_sample[1]],
            n_epochs=1000,
            seed=100,
            lr=0.01,
            l2=best_param,
        )

        # Save marginals for output
        key = f'{lf_sample[0]}:{",".join(map(str, lf_sample[1]))}'
        train_grid_results[key] = label_model.predict_proba(train_matrix[:, lf_sample[1]])
        dev_grid_results[key] = label_model.predict_proba(dev_matrix[:, lf_sample[1]])
        test_grid_results[key] = label_model.predict_proba(test_matrix[:, lf_sample[1]])
        models[key] = label_model

    return train_grid_results, dev_grid_results, test_grid_results, models
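# Hypothetical call for train_model_random_lfs: sample a few random subsets of
# LF column indices and sweep a small l2 grid. All names and values below are
# illustrative assumptions; train_matrix, dev_matrix, dev_labels, and
# test_matrix are assumed to exist as numpy label matrices.
import numpy as np

n_lfs = train_matrix.shape[1]
randomly_sampled_lfs = [
    np.random.choice(n_lfs, size=4, replace=False) for _ in range(5)
]
regularization_grid = [0.01, 0.1, 1.0]
train_res, dev_res, test_res, models = train_model_random_lfs(
    randomly_sampled_lfs, train_matrix, dev_matrix, dev_labels,
    test_matrix, regularization_grid,
)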
def test_L_form(self):
    label_model = LabelModel(cardinality=2, verbose=False)

    L = np.array([[0, 1, 0], [0, 1, 0], [1, 0, 0], [0, 1, 0]])
    label_model._set_constants(L)
    self.assertEqual(label_model.n, 4)
    self.assertEqual(label_model.m, 3)

    L = np.array([[0, 1, 2], [0, 1, 2], [1, 0, 2], [0, 1, 0]])
    with self.assertRaisesRegex(ValueError, "L_train has cardinality"):
        label_model.fit(L, n_epochs=1)
def test_loss(self):
    L = np.array([[0, -1, 0], [0, 1, -1]])
    label_model = LabelModel(cardinality=2, verbose=False)
    label_model.fit(L, n_epochs=1)
    label_model.mu = nn.Parameter(label_model.mu_init.clone() + 0.05)

    # l2_loss = l2 * ||mu - mu_init||_2^2; mu has m*k*k = 3*2*2 = 12 entries,
    # each shifted by 0.05, so the loss is 12 * 0.05^2 = 0.03
    self.assertAlmostEqual(label_model._loss_l2(l2=1.0).item(), 0.03)
    self.assertAlmostEqual(label_model._loss_l2(l2=np.ones(6)).item(), 0.03)

    # mu_loss = ||O - \mu P \mu^T||_2^2 + ||sum(\mu P, 1) - diag(O)||_2^2
    self.assertAlmostEqual(label_model._loss_mu().item(), 0.675, 3)
def get_snorkel_labels(train_df, lfs, labels):
    applier = PandasLFApplier(
        [labeling_function(name=lf.__name__)(lf) for lf in lfs]
    )
    label_model = LabelModel(cardinality=len(labels), verbose=True)
    L_train = applier.apply(df=train_df)
    label_model.fit(L_train, n_epochs=500, lr=0.001, log_freq=100, seed=123)
    L_probs = label_model.predict_proba(L=L_train)
    df_filtered, probs_filtered = filter_unlabeled_dataframe(
        X=train_df, y=L_probs, L=L_train
    )
    return df_filtered, probs_filtered
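# Hedged usage note: the filtered probabilistic labels returned above can be
# converted to hard labels with snorkel's probs_to_preds utility before
# training a downstream classifier (train_df, lfs, and labels are assumed).
from snorkel.utils import probs_to_preds

df_filtered, probs_filtered = get_snorkel_labels(train_df, lfs, labels)
preds_filtered = probs_to_preds(probs=probs_filtered)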
def test_mv_default(self):
    # fewer than 2 LFs have overlaps
    label_model = LabelModel(cardinality=2, verbose=False)
    L = np.array([[-1, -1, 1], [-1, 1, -1], [0, -1, -1]])
    label_model.fit(L, n_epochs=100)
    np.testing.assert_array_almost_equal(label_model.predict(L), np.array([1, 1, 0]))

    # fewer than 2 LFs have conflicts
    L = np.array([[-1, -1, 1], [-1, 1, 1], [1, 1, 1]])
    label_model.fit(L, n_epochs=100)
    np.testing.assert_array_almost_equal(label_model.predict(L), np.array([1, 1, 1]))
def test_optimizer(self):
    L = np.array([[0, -1, 0], [0, 1, 0]])
    label_model = LabelModel(cardinality=2, verbose=False)
    label_model.fit(L, n_epochs=1, optimizer="sgd")
    label_model.fit(L, n_epochs=1, optimizer="adam")
    label_model.fit(L, n_epochs=1, optimizer="adamax")
    with self.assertRaisesRegex(ValueError, "Unrecognized optimizer option"):
        label_model.fit(L, n_epochs=1, optimizer="bad_opt")
def test_label_model_basic(self) -> None:
    """Test the LabelModel's estimate of P and Y on a simple synthetic dataset."""
    np.random.seed(123)
    P, Y, L = generate_simple_label_matrix(self.n, self.m, self.cardinality)

    # Train LabelModel
    label_model = LabelModel(cardinality=self.cardinality, verbose=False)
    label_model.fit(L, n_epochs=200, lr=0.01, seed=123)

    # Test estimated LF conditional probabilities
    P_lm = label_model.get_conditional_probs()
    np.testing.assert_array_almost_equal(P, P_lm, decimal=2)

    # Test predicted labels
    score = label_model.score(L, Y)
    self.assertGreaterEqual(score["accuracy"], 0.9)
def test_save_and_load(self):
    L = np.array([[0, -1, 0], [0, 1, 1]])
    label_model = LabelModel(cardinality=2, verbose=False)
    label_model.fit(L, n_epochs=1)
    original_preds = label_model.predict(L)

    dir_path = tempfile.mkdtemp()
    # Join properly so the saved file lives inside the temp dir and gets cleaned up.
    save_path = os.path.join(dir_path, "label_model.pkl")
    label_model.save(save_path)

    label_model_new = LabelModel(cardinality=2, verbose=False)
    label_model_new.load(save_path)
    loaded_preds = label_model_new.predict(L)
    shutil.rmtree(dir_path)

    np.testing.assert_array_equal(loaded_preds, original_preds)
def test_label_model(self) -> None:
    """Test the LabelModel's estimate of P and Y."""
    np.random.seed(123)
    P, Y, L = generate_simple_label_matrix(self.n, self.m, self.cardinality)

    # Train LabelModel
    label_model = LabelModel(cardinality=self.cardinality, verbose=False)
    label_model.fit(L, n_epochs=200, lr=0.01, seed=123)

    # Test estimated LF conditional probabilities
    P_lm = label_model._get_conditional_probs().reshape(
        (self.m, self.cardinality + 1, -1)
    )
    np.testing.assert_array_almost_equal(P, P_lm, decimal=2)

    # Test predicted labels
    Y_lm = label_model.predict_proba(L).argmax(axis=1)
    err = np.where(Y != Y_lm, 1, 0).sum() / self.n
    self.assertLess(err, 0.1)
def test_labeling_convergence(self) -> None:
    """Test convergence of end to end labeling pipeline."""
    # Apply LFs
    labeling_functions = (
        [f]
        + [get_positive_labeling_function(divisor) for divisor in range(2, 9)]
        + [get_negative_labeling_function(divisor) for divisor in range(2, 9)]
    )
    applier = PandasLFApplier(labeling_functions)
    L_train = applier.apply(self.df_train, progress_bar=False)
    self.assertEqual(L_train.shape, (self.N_TRAIN, len(labeling_functions)))

    # Train LabelModel
    label_model = LabelModel(cardinality=self.cardinality, verbose=False)
    label_model.fit(L_train, n_epochs=100, lr=0.01, l2=0.0)
    Y_lm = label_model.predict_proba(L_train).argmax(axis=1)
    Y = self.df_train.y
    err = np.where(Y != Y_lm, 1, 0).sum() / self.N_TRAIN
    self.assertLess(err, 0.05)
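# Hedged sketch of the LF factory pattern the test above relies on. The real
# get_positive_labeling_function lives in the test fixtures; the `num`
# attribute and the class index returned here are assumptions.
from snorkel.labeling import LabelingFunction

def get_positive_labeling_function_sketch(divisor):
    # Label multiples of `divisor` as the (assumed) positive class, else abstain.
    return LabelingFunction(
        name=f"lf_positive_{divisor}",
        f=lambda x, d=divisor: 1 if x.num % d == 0 else -1,
    )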
def test_predict(self):
    # 3 LFs that always disagree/abstain leads to all abstains
    L = np.array([[-1, 1, 0], [0, -1, 1], [1, 0, -1]])
    label_model = LabelModel(cardinality=2, verbose=False)
    label_model.fit(L, n_epochs=100)
    np.testing.assert_array_almost_equal(
        label_model.predict(L), np.array([-1, -1, -1])
    )

    L = np.array([[0, 1, 0], [0, 1, 0]])
    label_model = self._set_up_model(L)
    label_model.mu = nn.Parameter(label_model.mu_init.clone().clamp(0.01, 0.99))
    preds = label_model.predict(L)
    true_preds = np.array([0, 0])
    np.testing.assert_array_equal(preds, true_preds)

    preds, probs = label_model.predict(L, return_probs=True)
    true_probs = np.array([[0.99, 0.01], [0.99, 0.01]])
    np.testing.assert_array_almost_equal(probs, true_probs)
def test_get_weight(self):
    # Set up an L matrix with known LF accuracies and coverages
    true_accs = [0.95, 0.6, 0.7, 0.55, 0.8]
    coverage = [1.0, 0.8, 1.0, 1.0, 1.0]
    L = -1 * np.ones((1000, len(true_accs)))
    Y = np.zeros(1000)

    for i in range(1000):
        Y[i] = 1 if np.random.rand() <= 0.5 else 0
        for j in range(5):
            if np.random.rand() <= coverage[j]:
                L[i, j] = Y[i] if np.random.rand() <= true_accs[j] else np.abs(Y[i] - 1)

    label_model = LabelModel(cardinality=2)
    label_model.fit(L, n_epochs=1000, seed=123)

    accs = label_model.get_weights()
    for i in range(len(accs)):
        true_acc = true_accs[i]
        self.assertAlmostEqual(accs[i], true_acc, delta=0.1)
def curate_twitter(save_name='../../pandafied_data/curated_twitter.csv'):
    df_train = pd.read_csv('../../pandafied_data/pandafied_twitter.csv')
    # from utils import load_unlabeled_spam_dataset
    # df_train = load_unlabeled_spam_dataset()

    # Define the set of labeling functions (LFs)
    # lfs = [lf_keyword_wateroverlast, lf_keyword_voertuig, lf_keyword_aanrijding,
    #        lf_keyword_te_water, lf_keyword_persoon, lf_keyword_brand,
    #        lf_keyword_mps, lf_keyword_kps, lf_keyword_luchtdr]
    # lfs = [lf_keyword_keywords]
    lfs = [lf_keyword_wateroverlast]

    # Apply the LFs to the unlabeled training data
    applier = PandasLFApplier(lfs)
    L_train = applier.apply(df_train)

    # Train the label model and compute the training labels
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
    # tie_break_policy can also be "true-random"
    df_train["label"] = label_model.predict(L=L_train, tie_break_policy="abstain")

    counter = 0
    for i in range(len(df_train["label"])):
        if df_train["label"][i] == WATER:
            print()
            print(df_train["text"][i])
            print(df_train["label"][i])
            print()
            counter += 1

    print("num entries total: " + str(len(df_train["label"])))
    print("num entries water: " + str(counter))

    # df_train = df_train[df_train.label != ABSTAIN]
    twitter_curated = df_train[df_train.label == WATER]
    twitter_curated = twitter_curated.drop(columns='label')
    twitter_curated.to_csv(save_name, index=False)
def test_optimizer_init(self):
    L = np.array([[0, -1, 0], [0, 1, 0]])

    label_model = LabelModel()
    label_model.fit(L, optimizer="sgd", n_epochs=1)
    self.assertIsInstance(label_model.optimizer, optim.SGD)

    label_model.fit(L, optimizer="adam", n_epochs=1)
    self.assertIsInstance(label_model.optimizer, optim.Adam)

    label_model.fit(L, optimizer="adamax", n_epochs=1)
    self.assertIsInstance(label_model.optimizer, optim.Adamax)

    with self.assertRaisesRegex(ValueError, "Unrecognized optimizer"):
        label_model.fit(L, optimizer="bad_optimizer", n_epochs=1)
def main(data_path, output_path):
    # Read data
    logging.info(f"Reading data from {data_path}")
    data = dd.read_parquet(data_path)
    data = data.repartition(npartitions=2)

    # Build label matrix
    logging.info("Applying LFs")
    lfs = [article_mentions_person, body_contains_fortune, person_in_db]
    applier = DaskLFApplier(lfs)
    L = applier.apply(data)

    # Train label model
    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(L)

    # Generate training labels
    logging.info("Generating probabilistic labels")
    y_prob = label_model.predict_proba(L)[:, 1]
    data = data.reset_index().set_index("index")
    data_labeled = data.assign(y_prob=dd.from_array(y_prob))
    dd.to_parquet(data_labeled, output_path)
    logging.info(f"Labels saved to {output_path}")
def test_scheduler_init(self):
    L = np.array([[0, -1, 0], [0, 1, 0]])

    label_model = LabelModel()
    label_model.fit(L, lr_scheduler="constant", n_epochs=1)
    self.assertIsNone(label_model.lr_scheduler)

    label_model.fit(L, lr_scheduler="linear", n_epochs=1)
    self.assertIsInstance(label_model.lr_scheduler, optim.lr_scheduler.LambdaLR)

    label_model.fit(L, lr_scheduler="exponential", n_epochs=1)
    self.assertIsInstance(label_model.lr_scheduler, optim.lr_scheduler.ExponentialLR)

    label_model.fit(L, lr_scheduler="step", n_epochs=1)
    self.assertIsInstance(label_model.lr_scheduler, optim.lr_scheduler.StepLR)
def test_model_loss(self):
    L = np.array([[0, -1, 0], [0, 1, 0]])

    label_model = LabelModel(cardinality=2, verbose=False)
    label_model.fit(L, n_epochs=1)
    init_loss = label_model._loss_mu().item()

    label_model.fit(L, n_epochs=10)
    next_loss = label_model._loss_mu().item()
    self.assertLessEqual(next_loss, init_loss)

    with self.assertRaisesRegex(Exception, "Loss is NaN."):
        label_model.fit(L, n_epochs=10, lr=1e8)
def test_warmup(self):
    L = np.array([[0, -1, 0], [0, 1, 0]])

    label_model = LabelModel()
    lr_scheduler_config = {"warmup_steps": 3, "warmup_unit": "epochs"}
    label_model.fit(L, lr_scheduler_config=lr_scheduler_config, n_epochs=5)
    self.assertEqual(label_model.warmup_steps, 3)

    lr_scheduler_config = {"warmup_percentage": 3 / 5}
    label_model.fit(L, lr_scheduler_config=lr_scheduler_config, n_epochs=5)
    self.assertEqual(label_model.warmup_steps, 3)

    with self.assertRaisesRegex(ValueError, "LabelModel does not support"):
        lr_scheduler_config = {"warmup_steps": 1, "warmup_unit": "batches"}
        label_model.fit(L, lr_scheduler_config=lr_scheduler_config)
def test_e2e():
    """Run an end-to-end test on documents of the hardware domain."""
    PARALLEL = 4
    max_docs = 12

    fonduer.init_logging(
        log_dir="log_folder",
        format="[%(asctime)s][%(levelname)s] %(name)s:%(lineno)s - %(message)s",
        level=logging.INFO,
    )

    session = fonduer.Meta.init(CONN_STRING).Session()

    docs_path = "tests/data/html/"
    pdf_path = "tests/data/pdf/"

    doc_preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)
    corpus_parser = Parser(
        session,
        parallelism=PARALLEL,
        structural=True,
        lingual=True,
        visual=True,
        pdf_path=pdf_path,
    )
    corpus_parser.apply(doc_preprocessor)
    assert session.query(Document).count() == max_docs

    num_docs = session.query(Document).count()
    logger.info(f"Docs: {num_docs}")
    assert num_docs == max_docs

    num_sentences = session.query(Sentence).count()
    logger.info(f"Sentences: {num_sentences}")

    # Divide into test and train
    docs = sorted(corpus_parser.get_documents())
    last_docs = sorted(corpus_parser.get_last_documents())

    ld = len(docs)
    assert ld == len(last_docs)
    assert len(docs[0].sentences) == len(last_docs[0].sentences)

    # Check sentence numbers
    assert len(docs[0].sentences) == 799
    assert len(docs[1].sentences) == 663
    assert len(docs[2].sentences) == 784
    assert len(docs[3].sentences) == 661
    assert len(docs[4].sentences) == 513
    assert len(docs[5].sentences) == 700
    assert len(docs[6].sentences) == 528
    assert len(docs[7].sentences) == 161
    assert len(docs[8].sentences) == 228
    assert len(docs[9].sentences) == 511
    assert len(docs[10].sentences) == 331
    assert len(docs[11].sentences) == 528

    # Check table numbers
    assert len(docs[0].tables) == 9
    assert len(docs[1].tables) == 9
    assert len(docs[2].tables) == 14
    assert len(docs[3].tables) == 11
    assert len(docs[4].tables) == 11
    assert len(docs[5].tables) == 10
    assert len(docs[6].tables) == 10
    assert len(docs[7].tables) == 2
    assert len(docs[8].tables) == 7
    assert len(docs[9].tables) == 10
    assert len(docs[10].tables) == 6
    assert len(docs[11].tables) == 9

    # Check figure numbers
    assert len(docs[0].figures) == 32
    assert len(docs[1].figures) == 11
    assert len(docs[2].figures) == 38
    assert len(docs[3].figures) == 31
    assert len(docs[4].figures) == 7
    assert len(docs[5].figures) == 38
    assert len(docs[6].figures) == 10
    assert len(docs[7].figures) == 31
    assert len(docs[8].figures) == 4
    assert len(docs[9].figures) == 27
    assert len(docs[10].figures) == 5
    assert len(docs[11].figures) == 27

    # Check caption numbers
    assert len(docs[0].captions) == 0
    assert len(docs[1].captions) == 0
    assert len(docs[2].captions) == 0
    assert len(docs[3].captions) == 0
    assert len(docs[4].captions) == 0
    assert len(docs[5].captions) == 0
    assert len(docs[6].captions) == 0
    assert len(docs[7].captions) == 0
    assert len(docs[8].captions) == 0
    assert len(docs[9].captions) == 0
    assert len(docs[10].captions) == 0
    assert len(docs[11].captions) == 0

    train_docs = set()
    dev_docs = set()
    test_docs = set()
    splits = (0.5, 0.75)
    data = [(doc.name, doc) for doc in docs]
    data.sort(key=lambda x: x[0])
    for i, (doc_name, doc) in enumerate(data):
        if i < splits[0] * ld:
            train_docs.add(doc)
        elif i < splits[1] * ld:
            dev_docs.add(doc)
        else:
            test_docs.add(doc)
    logger.info([x.name for x in train_docs])

    # NOTE: With multi-relation support, return values of getting candidates,
    # mentions, or sparse matrices are formatted as a list of lists. This means
    # that with a single relation, we need to index into the list of lists to
    # get the candidates/mentions/sparse matrix for a particular relation or
    # mention.
    # Mention Extraction
    part_ngrams = MentionNgramsPart(parts_by_doc=None, n_max=3)
    temp_ngrams = MentionNgramsTemp(n_max=2)
    volt_ngrams = MentionNgramsVolt(n_max=1)

    Part = mention_subclass("Part")
    Temp = mention_subclass("Temp")
    Volt = mention_subclass("Volt")
    mention_extractor = MentionExtractor(
        session,
        [Part, Temp, Volt],
        [part_ngrams, temp_ngrams, volt_ngrams],
        [part_matcher, temp_matcher, volt_matcher],
    )
    mention_extractor.apply(docs, parallelism=PARALLEL)

    assert session.query(Part).count() == 299
    assert session.query(Temp).count() == 138
    assert session.query(Volt).count() == 140
    assert len(mention_extractor.get_mentions()) == 3
    assert len(mention_extractor.get_mentions()[0]) == 299
    assert (
        len(
            mention_extractor.get_mentions(
                docs=[session.query(Document).filter(Document.name == "112823").first()]
            )[0]
        )
        == 70
    )

    # Candidate Extraction
    PartTemp = candidate_subclass("PartTemp", [Part, Temp])
    PartVolt = candidate_subclass("PartVolt", [Part, Volt])

    candidate_extractor = CandidateExtractor(
        session, [PartTemp, PartVolt], throttlers=[temp_throttler, volt_throttler]
    )

    for i, docs in enumerate([train_docs, dev_docs, test_docs]):
        candidate_extractor.apply(docs, split=i, parallelism=PARALLEL)

    assert session.query(PartTemp).filter(PartTemp.split == 0).count() == 3493
    assert session.query(PartTemp).filter(PartTemp.split == 1).count() == 61
    assert session.query(PartTemp).filter(PartTemp.split == 2).count() == 416
    assert session.query(PartVolt).count() == 4282

    # Grab candidate lists
    train_cands = candidate_extractor.get_candidates(split=0, sort=True)
    dev_cands = candidate_extractor.get_candidates(split=1, sort=True)
    test_cands = candidate_extractor.get_candidates(split=2, sort=True)
    assert len(train_cands) == 2
    assert len(train_cands[0]) == 3493
    assert (
        len(
            candidate_extractor.get_candidates(
                docs=[session.query(Document).filter(Document.name == "112823").first()]
            )[0]
        )
        == 1432
    )

    # Featurization
    featurizer = Featurizer(session, [PartTemp, PartVolt])

    # Test that FeatureKey is properly reset
    featurizer.apply(split=1, train=True, parallelism=PARALLEL)
    assert session.query(Feature).count() == 214
    assert session.query(FeatureKey).count() == 1260

    # Test dropping a FeatureKey; should force a row deletion
    featurizer.drop_keys(["DDL_e1_W_LEFT_POS_3_[NNP NN IN]"])
    assert session.query(FeatureKey).count() == 1259

    # Should only remove the part_volt as a relation and leave part_temp
    assert set(
        session.query(FeatureKey)
        .filter(FeatureKey.name == "DDL_e1_LEMMA_SEQ_[bc182]")
        .one()
        .candidate_classes
    ) == {"part_temp", "part_volt"}
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"], candidate_classes=[PartVolt])
    assert session.query(FeatureKey).filter(
        FeatureKey.name == "DDL_e1_LEMMA_SEQ_[bc182]"
    ).one().candidate_classes == ["part_temp"]
    assert session.query(FeatureKey).count() == 1259

    # Re-insert the removed key
    featurizer.upsert_keys(
        ["DDL_e1_LEMMA_SEQ_[bc182]"], candidate_classes=[PartTemp, PartVolt]
    )
    assert set(
        session.query(FeatureKey)
        .filter(FeatureKey.name == "DDL_e1_LEMMA_SEQ_[bc182]")
        .one()
        .candidate_classes
    ) == {"part_temp", "part_volt"}
    assert session.query(FeatureKey).count() == 1259

    # Remove the key again
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"], candidate_classes=[PartVolt])

    # Removing the last relation from a key should delete the row
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"], candidate_classes=[PartTemp])
    assert session.query(FeatureKey).count() == 1258

    session.query(Feature).delete(synchronize_session="fetch")
    session.query(FeatureKey).delete(synchronize_session="fetch")

    featurizer.apply(split=0, train=True, parallelism=PARALLEL)
    assert session.query(Feature).count() == 6478
    assert session.query(FeatureKey).count() == 4538
    F_train = featurizer.get_feature_matrices(train_cands)
    assert F_train[0].shape == (3493, 4538)
    assert F_train[1].shape == (2985, 4538)
    assert len(featurizer.get_keys()) == 4538

    featurizer.apply(split=1, parallelism=PARALLEL)
    assert session.query(Feature).count() == 6692
    assert session.query(FeatureKey).count() == 4538
    F_dev = featurizer.get_feature_matrices(dev_cands)
    assert F_dev[0].shape == (61, 4538)
    assert F_dev[1].shape == (153, 4538)

    featurizer.apply(split=2, parallelism=PARALLEL)
    assert session.query(Feature).count() == 8252
    assert session.query(FeatureKey).count() == 4538
    F_test = featurizer.get_feature_matrices(test_cands)
    assert F_test[0].shape == (416, 4538)
    assert F_test[1].shape == (1144, 4538)

    gold_file = "tests/data/hardware_tutorial_gold.csv"

    labeler = Labeler(session, [PartTemp, PartVolt])

    labeler.apply(
        docs=last_docs,
        lfs=[[gold], [gold]],
        table=GoldLabel,
        train=True,
        parallelism=PARALLEL,
    )
    assert session.query(GoldLabel).count() == 8252

    stg_temp_lfs = [
        LF_storage_row,
        LF_operating_row,
        LF_temperature_row,
        LF_tstg_row,
        LF_to_left,
        LF_negative_number_left,
    ]
    ce_v_max_lfs = [
        LF_bad_keywords_in_row,
        LF_current_in_row,
        LF_non_ce_voltages_in_row,
    ]

    with pytest.raises(ValueError):
        labeler.apply(split=0, lfs=stg_temp_lfs, train=True, parallelism=PARALLEL)

    labeler.apply(
        docs=train_docs,
        lfs=[stg_temp_lfs, ce_v_max_lfs],
        train=True,
        parallelism=PARALLEL,
    )
    assert session.query(Label).count() == 6478
    assert session.query(LabelKey).count() == 9
    L_train = labeler.get_label_matrices(train_cands)
    assert L_train[0].shape == (3493, 9)
    assert L_train[1].shape == (2985, 9)
    assert len(labeler.get_keys()) == 9

    # Test dropping a LabelerKey
    labeler.drop_keys(["LF_storage_row"])
    assert len(labeler.get_keys()) == 8

    # Test upserting a LabelerKey
    labeler.upsert_keys(["LF_storage_row"])
    assert "LF_storage_row" in [label.name for label in labeler.get_keys()]

    L_train_gold = labeler.get_gold_labels(train_cands)
    assert L_train_gold[0].shape == (3493, 1)

    L_train_gold = labeler.get_gold_labels(train_cands, annotator="gold")
    assert L_train_gold[0].shape == (3493, 1)

    gen_model = LabelModel()
    gen_model.fit(L_train=L_train[0], n_epochs=500, log_freq=100)

    train_marginals = gen_model.predict_proba(L_train[0])

    disc_model = LogisticRegression()
    disc_model.train(
        (train_cands[0], F_train[0]),
        train_marginals,
        X_dev=(train_cands[0], F_train[0]),
        Y_dev=L_train_gold[0].reshape(-1),
        b=0.6,
        pos_label=TRUE,
        n_epochs=5,
        lr=0.001,
    )

    test_score = disc_model.predict((test_cands[0], F_test[0]), b=0.6, pos_label=TRUE)
    true_pred = [test_cands[0][_] for _ in np.nditer(np.where(test_score == TRUE))]

    pickle_file = "tests/data/parts_by_doc_dict.pkl"
    with open(pickle_file, "rb") as f:
        parts_by_doc = pickle.load(f)

    (TP, FP, FN) = entity_level_f1(
        true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc
    )
    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")
    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")
    assert f1 < 0.7 and f1 > 0.3

    stg_temp_lfs_2 = [
        LF_to_left,
        LF_test_condition_aligned,
        LF_collector_aligned,
        LF_current_aligned,
        LF_voltage_row_temp,
        LF_voltage_row_part,
        LF_typ_row,
        LF_complement_left_row,
        LF_too_many_numbers_row,
        LF_temp_on_high_page_num,
        LF_temp_outside_table,
        LF_not_temp_relevant,
    ]
    labeler.update(split=0, lfs=[stg_temp_lfs_2, ce_v_max_lfs], parallelism=PARALLEL)
    assert session.query(Label).count() == 6478
    assert session.query(LabelKey).count() == 16
    L_train = labeler.get_label_matrices(train_cands)
    assert L_train[0].shape == (3493, 16)

    gen_model = LabelModel()
    gen_model.fit(L_train=L_train[0], n_epochs=500, log_freq=100)

    train_marginals = gen_model.predict_proba(L_train[0])

    disc_model = LogisticRegression()
    disc_model.train(
        (train_cands[0], F_train[0]), train_marginals, n_epochs=5, lr=0.001
    )

    test_score = disc_model.predict((test_cands[0], F_test[0]), b=0.6, pos_label=TRUE)
    true_pred = [test_cands[0][_] for _ in np.nditer(np.where(test_score == TRUE))]

    (TP, FP, FN) = entity_level_f1(
        true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc
    )
    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")
    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")
    assert f1 > 0.7

    # Testing LSTM
    disc_model = LSTM()
    disc_model.train(
        (train_cands[0], F_train[0]), train_marginals, n_epochs=5, lr=0.001
    )
    test_score = disc_model.predict((test_cands[0], F_test[0]), b=0.6, pos_label=TRUE)
    true_pred = [test_cands[0][_] for _ in np.nditer(np.where(test_score == TRUE))]
    (TP, FP, FN) = entity_level_f1(
        true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc
    )
    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")
    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")
    assert f1 > 0.7

    # Testing Sparse Logistic Regression
    disc_model = SparseLogisticRegression()
    disc_model.train(
        (train_cands[0], F_train[0]), train_marginals, n_epochs=5, lr=0.001
    )
    test_score = disc_model.predict((test_cands[0], F_test[0]), b=0.6, pos_label=TRUE)
    true_pred = [test_cands[0][_] for _ in np.nditer(np.where(test_score == TRUE))]
    (TP, FP, FN) = entity_level_f1(
        true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc
    )
    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")
    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")
    assert f1 > 0.7

    # Testing Sparse LSTM
    disc_model = SparseLSTM()
    disc_model.train(
        (train_cands[0], F_train[0]), train_marginals, n_epochs=5, lr=0.001
    )
    test_score = disc_model.predict((test_cands[0], F_test[0]), b=0.6, pos_label=TRUE)
    true_pred = [test_cands[0][_] for _ in np.nditer(np.where(test_score == TRUE))]
    (TP, FP, FN) = entity_level_f1(
        true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc
    )
    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")
logger.info(f"prec: {prec}") logger.info(f"rec: {rec}") logger.info(f"f1: {f1}") assert f1 > 0.7 # Evaluate mention level scores L_test_gold = labeler.get_gold_labels(test_cands, annotator="gold") Y_test = L_test_gold[0].reshape(-1) scores = disc_model.score((test_cands[0], F_test[0]), Y_test, b=0.6, pos_label=TRUE) logger.info(scores) assert scores["f1"] > 0.6
test_unfired_idx = [i for i, item in enumerate(test_m) if sum(item) == 0]
targets_test = test_L[test_fired_idx]

# Majority voting using snorkel's majority voting model
maj_preds_test = majority_model.predict(L=test_lsnork[test_fired_idx])
maj_precision_test, maj_recall_test, maj_f1_score_test, maj_support_test = \
    precision_recall_fscore_support(targets_test, maj_preds_test)
maj_accuracy_test = compute_accuracy(maj_support_test, maj_recall_test)
print("precision on *** RULE COVERED TEST SET *** of MAJORITY VOTING: {}".format(maj_precision_test))
print("recall on *** RULE COVERED TEST SET *** of MAJORITY VOTING: {}".format(maj_recall_test))
print("f1_score on *** RULE COVERED TEST SET *** of MAJORITY VOTING: {}".format(maj_f1_score_test))
print("support on *** RULE COVERED TEST SET *** of MAJORITY VOTING: {}".format(maj_support_test))
print("accuracy on *** RULE COVERED TEST SET *** of MAJORITY VOTING: {}".format(maj_accuracy_test))

# Now train snorkel's label model
print("Training Snorkel's LabelModel")
label_model = LabelModel(cardinality=num_classes, verbose=True)
label_model.fit(L_train=U_lsnork, n_epochs=1000, lr=0.001, log_freq=100, seed=123)
label_model.save(os.path.join(path_dir, "saved_label_model"))

snork_preds_test = label_model.predict(L=test_lsnork[test_fired_idx])
snork_precision_test, snork_recall_test, snork_f1_score_test, snork_support_test = \
    precision_recall_fscore_support(targets_test, snork_preds_test)
snork_accuracy_test = compute_accuracy(snork_support_test, snork_recall_test)
print("precision on *** RULE COVERED TEST SET *** of SNORKEL VOTING: {}".format(snork_precision_test))
print("recall on *** RULE COVERED TEST SET *** of SNORKEL VOTING: {}".format(snork_recall_test))
print("f1_score on *** RULE COVERED TEST SET *** of SNORKEL VOTING: {}".format(snork_f1_score_test))
print("support on *** RULE COVERED TEST SET *** of SNORKEL VOTING: {}".format(snork_support_test))
print("accuracy on *** RULE COVERED TEST SET *** of SNORKEL VOTING: {}".format(snork_accuracy_test))
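# Hedged sketch of the compute_accuracy helper used above (an assumption about
# its behavior: overall accuracy can be recovered as the support-weighted mean
# of per-class recall, since sum_c(recall_c * support_c) equals the total
# number of correct predictions).
import numpy as np

def compute_accuracy(support, recall):
    support = np.asarray(support, dtype=float)
    return float((np.asarray(recall) * support).sum() / support.sum())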
# %% [markdown]
# However, as we can clearly see from the summary statistics of our LFs in the previous section, they are not all equally accurate, so they should not be treated identically. In addition to having varied accuracies and coverages, LFs may be correlated, resulting in certain signals being overrepresented in a majority-vote-based model. To handle these issues appropriately, we will instead use a more sophisticated Snorkel `LabelModel` to combine the outputs of the LFs.
#
# This model will ultimately produce a single set of noise-aware training labels, which are probabilistic or confidence-weighted labels. We will then use these labels to train a classifier for our task. For more technical details of this overall approach, see our [NeurIPS 2016](https://arxiv.org/abs/1605.07723) and [AAAI 2019](https://arxiv.org/abs/1810.02840) papers. For more info on the API, see the [`LabelModel` documentation](https://snorkel.readthedocs.io/en/master/packages/_autosummary/labeling/snorkel.labeling.LabelModel.html#snorkel.labeling.LabelModel).
#
# Note that no gold labels are used during the training process.
# The only information we need is the label matrix, which contains the output of the LFs on our training set.
# The `LabelModel` is able to learn weights for the labeling functions using only the label matrix as input.
# We also specify the `cardinality`, or number of classes.
# The `LabelModel` trains much more quickly than typical discriminative models since it only needs the label matrix as input.

# %% {"tags": ["md-exclude-output"]}
from snorkel.labeling import LabelModel

label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, lr=0.001, log_freq=100, seed=123)

# %%
majority_acc = majority_model.score(L=L_valid, Y=Y_valid)["accuracy"]
print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")

label_model_acc = label_model.score(L=L_valid, Y=Y_valid)["accuracy"]
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")

# %% [markdown]
# So our `LabelModel` improves over the majority vote baseline!
# However, it is typically **not suitable as an inference-time model** to make predictions for unseen data points, due to (among other things) some data points having all abstain labels.
# In the next section, we will use the output of the label model as training labels to train a
# discriminative classifier to see if we can improve performance further.
# This classifier will only need the text of the comment to make predictions, making it much more suitable
# for inference over unseen comments.
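# %% [markdown]
# As a brief, hedged illustration of the abstain issue mentioned above: before
# training the downstream classifier, the data points that received no LF votes
# can be dropped with Snorkel's `filter_unlabeled_dataframe` utility (this
# sketch assumes `df_train` and `L_train` from earlier cells).

# %%
from snorkel.labeling import filter_unlabeled_dataframe

probs_train = label_model.predict_proba(L=L_train)
df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=df_train, y=probs_train, L=L_train
)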
def test_lr_scheduler(self):
    L = np.array([[0, -1, 0], [0, 1, 0]])
    label_model = LabelModel(cardinality=2, verbose=False)
    label_model.fit(L, n_epochs=1)
    label_model.fit(L, n_epochs=1, lr_scheduler="constant")
    label_model.fit(L, n_epochs=1, lr_scheduler="linear")
    label_model.fit(L, n_epochs=1, lr_scheduler="exponential")
    label_model.fit(L, n_epochs=1, lr_scheduler="step")
    with self.assertRaisesRegex(ValueError, "Unrecognized lr scheduler option"):
        label_model.fit(L, n_epochs=1, lr_scheduler="bad_scheduler")
def run_snorkel_labelling_classification(labeling_functions, file, l_train, l_valid):
    lfs = labeling_functions
    # Alternative LF sets tried during development:
    # lfs = [lf.is_same_thread, lf.has_entities, lf.enity_overlap_jacc, lf.entity_type_overlap_jacc]
    # lfs = [is_same_thread, enity_overlap, entity_types, entity_type_overlap]
    # lfs = [is_long, has_votes, is_doctor_reply, is_same_thread, enity_overlap,
    #        has_type_dsyn, has_type_patf, has_type_sosy, has_type_dora, has_type_fndg,
    #        has_type_menp, has_type_chem, has_type_orch, has_type_horm, has_type_phsu,
    #        has_type_medd, has_type_bhvr, has_type_diap, has_type_bacs, has_type_enzy,
    #        has_type_inpo, has_type_elii]
    # lfs = [has_votes, is_doctor_reply, is_same_thread, enity_overlap]
    # lfs = [is_same_thread, enity_overlap, is_doctor_reply]

    # analysis = LFAnalysis(L=l_train, lfs=lfs).lf_summary(Y=Y_train)
    # print(analysis)
    # print(analysis['Conflicts'])
    # print(analysis['Overlaps'])

    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train=l_train, n_epochs=20000, lr=0.0001, log_freq=10, seed=2345)
    # label_model.fit(L_train=L_train, n_epochs=20, lr=0.0001, log_freq=10, seed=81794)
    print("Model weights: " + str(label_model.get_weights()))

    valid_probabilities = label_model.predict_proba(L=l_valid)

    if 'predicted_prob' in df_valid:
        del df_valid['predicted_prob']
    df_valid.insert(50, 'predicted_prob', valid_probabilities[:, 1])
    # df_valid.to_csv("/container/filip/json/ehealthforum/trac/validation_df2.txt", sep="\t", header=True)
    # df_valid = pd.read_csv("/filip/json/ehealthforum/trac/validation_df.txt", sep="\t")

    def compute_precision_at_k(l, k):
        l = l[:k]
        return sum(l) / k

    PROBABILITY_CUTOFF = 0.5
    df_valid['predicted_label'] = df_valid['predicted_prob'] >= PROBABILITY_CUTOFF

    true_positive_ratio = (
        df_valid[df_valid.bm25_relevant == 1].count()['bm25_relevant']
        / df_valid[df_valid.predicted_label == 1].count()['predicted_label']
    )
    print("Number of True relevant: " + str(df_valid[df_valid.bm25_relevant == 1].count()['bm25_relevant']))
    print("Number of Predicted relevant: " + str(df_valid[df_valid.predicted_label == 1].count()['predicted_label']) + '\n')
    print('True positive ratio: ' + str(true_positive_ratio) + '\n')

    df_tru = df_valid.groupby(['query_thread']).head(10)['bm25_relevant']
    df_pred = df_valid.groupby(['query_thread']).head(10)['predicted_label']

    overall_precision = []
    for query, group in df_valid.groupby(['query_thread']):
        precision = compute_precision_at_k(group['predicted_label'].head(10).tolist(), 10)
        overall_precision.append(precision)
    print('Overall precision: ' + str(sum(overall_precision) / len(overall_precision)))

    print("Accuracy: " + str(accuracy_score(df_tru, df_pred)))

    label_model_acc = label_model.score(L=l_valid, Y=Y_valid)["accuracy"]
    print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")