def train(self, dataset):
    # Apply labeling functions to the training set
    lfs_applier = PandasLFApplier(lfs=self.lfs)
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')
        lfs_train = lfs_applier.apply(df=dataset)

    # Build probabilistic label model
    label_model = LabelModel(cardinality=3, verbose=True)
    label_model.fit(L_train=lfs_train, n_epochs=500, log_freq=100, seed=42)
    label_probs = label_model.predict_proba(lfs_train)

    # Filter out unlabeled data points
    df_filtered, probs_filtered = filter_unlabeled_dataframe(
        X=dataset, y=label_probs, L=lfs_train)

    # Featurize data using scikit-learn
    self.vectorizer = CountVectorizer(ngram_range=(1, 5))
    dataset_train = self.vectorizer.fit_transform(
        df_filtered.sentence.tolist())

    # Replace probabilistic labels with the most likely label
    preds_filtered = probs_to_preds(probs=probs_filtered)

    # Train scikit-learn model
    self.model = LogisticRegression(C=1e3, solver="liblinear",
                                    multi_class='auto')
    self.model.fit(X=dataset_train, y=preds_filtered)
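# For intuition on the probs_to_preds() step above: it collapses the label
# model's probabilistic labels into hard labels by taking the most likely
# class (ties broken randomly by default). A tiny self-contained illustration:
import numpy as np
from snorkel.utils import probs_to_preds

probs = np.array([[0.1, 0.7, 0.2],   # class 1 is most likely
                  [0.6, 0.3, 0.1]])  # class 0 is most likely
print(probs_to_preds(probs))  # -> [1 0]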
def main(data_path, output_path):
    # Read data
    logging.info(f"Reading data from {data_path}")
    sc = SparkContext()
    sql = SQLContext(sc)
    data = sql.read.parquet(data_path)

    # Build label matrix
    logging.info("Applying LFs")
    lfs = [article_mentions_person, body_contains_fortune, person_in_db]
    applier = SparkLFApplier(lfs)
    L = applier.apply(data.rdd)

    # Train label model
    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(L)

    # Generate training labels
    logging.info("Generating probabilistic labels")
    y_prob = label_model.predict_proba(L)[:, 1]
    y_prob_sql_array = F.array([F.lit(y) for y in y_prob])
    data_labeled = data.withColumn("y_prob", y_prob_sql_array)
    data_labeled.write.mode("overwrite").parquet(output_path)
    logging.info(f"Labels saved to {output_path}")
def generative_model(L_train, n_epochs=500, print_every=100):
    model = LabelModel(cardinality=2)

    logger.info("Training generative model...")
    model.fit(L_train=L_train, n_epochs=n_epochs, seed=1234,
              log_freq=print_every)
    logger.info("Done.")

    marginals = model.predict_proba(L_train)
    return marginals
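# A minimal usage sketch for generative_model(); the module-level logger and
# the toy label matrix (values in {-1, 0, 1}, with -1 = abstain) are
# assumptions for illustration only.
import logging
import numpy as np

logger = logging.getLogger(__name__)

# Toy label matrix: 4 data points x 3 labeling functions
L_toy = np.array([[1, 1, -1],
                  [0, -1, 0],
                  [-1, 1, 1],
                  [0, 0, -1]])
marginals = generative_model(L_toy, n_epochs=100)
print(marginals.shape)  # (4, 2): P(y=0) and P(y=1) per data point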
def test_labeling_convergence(self) -> None:
    """Test convergence of the end-to-end labeling pipeline."""
    # Apply LFs
    labeling_functions = ([f] + [
        get_positive_labeling_function(divisor) for divisor in range(2, 9)
    ] + [
        get_negative_labeling_function(divisor) for divisor in range(2, 9)
    ])
    applier = PandasLFApplier(labeling_functions)
    L_train = applier.apply(self.df_train, progress_bar=False)
    self.assertEqual(L_train.shape, (self.N_TRAIN, len(labeling_functions)))

    # Train LabelModel
    label_model = LabelModel(cardinality=self.cardinality, verbose=False)
    label_model.fit(L_train, n_epochs=100, lr=0.01, l2=0.0)
    Y_lm = label_model.predict_proba(L_train).argmax(axis=1)
    Y = self.df_train.y
    err = np.where(Y != Y_lm, 1, 0).sum() / self.N_TRAIN
    self.assertLess(err, 0.05)
def label_model_creator(df_dev, Y_dev, df_train, df_test, Y_test):
    # Accumulate all the labeling functions for the supply relation
    supply_lfs = [
        lf_supply, lf_customer, lf_sales_to, lf_our_customer, lf_acquisition,
        lf_people, lf_sold, lf_relation, lf_competition
    ]

    # Apply the labeling functions to the data (Pandas DataFrames)
    applier = PandasLFApplier(supply_lfs)

    # Apply the labeling functions to the development, train, and test sets
    L_dev = applier.apply(df_dev)
    L_train = applier.apply(df_train)
    L_test = applier.apply(df_test)

    # cardinality: 2 (True and False)
    label_model = LabelModel(cardinality=2, verbose=True)

    # Fit the label model; Y_dev is used to estimate the class balance
    label_model.fit(L_train, Y_dev, n_epochs=5000, log_freq=500)

    # Accuracy of the label model on the test set
    label_model_acc = label_model.score(
        L=L_test, Y=Y_test, tie_break_policy="random")["accuracy"]
    print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")

    # Check the F1 and ROC-AUC scores on the dev set
    probs_dev = label_model.predict_proba(L_dev)
    preds_dev = probs_to_preds(probs_dev)
    print(
        f"Label model f1 score: {metric_score(Y_dev, preds_dev, probs=probs_dev, metric='f1')}"
    )
    print(
        f"Label model roc-auc: {metric_score(Y_dev, preds_dev, probs=probs_dev, metric='roc_auc')}"
    )
    return label_model, L_train
def labeling_evaluation(df_train, df_test, label_model):
    lfs = [
        LabelingFunction.lf_ind_keyword, LabelingFunction.lf_short,
        LabelingFunction.lf_cmp_re, LabelingFunction.lf_industry_keyword,
        LabelingFunction.lf_surname_re, LabelingFunction.industry_cls
    ]
    applier = PandasLFApplier(lfs=lfs)
    L_train = applier.apply(df=df_train)
    L_test = applier.apply(df=df_test)
    Y_test = df_test.label.values
    analysis = LFAnalysis(L=L_train, lfs=lfs).lf_summary()

    if label_model == "majority":
        majority_model = MajorityLabelVoter()
        preds_train = majority_model.predict(L=L_train)
        majority_acc = majority_model.score(
            L=L_test, Y=Y_test, tie_break_policy="random")["accuracy"]
        print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")
        df_train_filtered, preds_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=preds_train, L=L_train)
        return df_train_filtered, preds_train_filtered, analysis

    if label_model == "weighted":
        label_model = LabelModel(
            cardinality=len(
                [c for c in dir(Polarity) if not c.startswith("__")]),
            verbose=True)
        label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)
        probs_train = label_model.predict_proba(L_train)
        label_model_acc = label_model.score(
            L=L_test, Y=Y_test, tie_break_policy="random")["accuracy"]
        print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")
        df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=probs_train, L=L_train)
        preds_train_filtered = probs_to_preds(probs_train_filtered)
        return (df_train_filtered, probs_train_filtered,
                preds_train_filtered, analysis)
def train(self):
    '''Train the logistic regression discriminative model'''
    # Pull out the label vectors for ease of use later
    Y_test = self.df_test.label.values

    applier = PandasLFApplier(lfs=self.lfs)
    L_train = applier.apply(df=self.df_train)

    # Use the label model to combine the LF outputs
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)

    # Make predictions
    probs_train = label_model.predict_proba(L=L_train)

    # Filter abstained inputs
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=self.df_train, y=probs_train, L=L_train)

    # Represent each data point as a bag-of-ngrams count vector
    vectorizer = CountVectorizer(ngram_range=(1, 5))
    X_train = vectorizer.fit_transform(df_train_filtered.text.tolist())
    X_test = vectorizer.transform(self.df_test.text.tolist())

    # Turn probs into preds
    preds_train_filtered = probs_to_preds(probs=probs_train_filtered)

    # Train logistic regression model
    sklearn_model = LogisticRegression(C=1e3, solver="liblinear")
    sklearn_model.fit(X=X_train, y=preds_train_filtered)
    print(
        f"Test Accuracy: {sklearn_model.score(X=X_test, y=Y_test) * 100:.1f}%"
    )
    dump(sklearn_model, 'sklearn_model.joblib')
    dump(vectorizer, 'vectorizer.joblib')
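# A hedged usage sketch for the artifacts persisted above (file names match
# the dump() calls; the input text is illustrative):
from joblib import load

sklearn_model = load('sklearn_model.joblib')
vectorizer = load('vectorizer.joblib')
X_new = vectorizer.transform(["an example document to score"])
print(sklearn_model.predict(X_new))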
def main(data_path, output_path):
    # Read data
    logging.info(f"Reading data from {data_path}")
    data = dd.read_parquet(data_path)
    data = data.repartition(npartitions=2)

    # Build label matrix
    logging.info("Applying LFs")
    lfs = [article_mentions_person, body_contains_fortune, person_in_db]
    applier = DaskLFApplier(lfs)
    L = applier.apply(data)

    # Train label model
    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(L)

    # Generate training labels
    logging.info("Generating probabilistic labels")
    y_prob = label_model.predict_proba(L)[:, 1]
    data = data.reset_index().set_index("index")
    data_labeled = data.assign(y_prob=dd.from_array(y_prob))
    dd.to_parquet(data_labeled, output_path)
    logging.info(f"Labels saved to {output_path}")
def train_end_classifier(Xtrain,
                         Xtest,
                         iwssession,
                         lfsets,
                         device='cuda',
                         gap=20,
                         class_balance=None,
                         modelparams=None,
                         uniform=False,
                         verbose=False):
    """Fit a label model, train a downstream classifier, and return test set
    predictions.

    Parameters
    ----------
    Xtrain : ndarray of shape (n training samples, d features)
        Features for training data
    Xtest : ndarray of shape (n test samples, d features)
        Features for test data
    iwssession : object
        An instance of the InteractiveWeakSupervision class
    lfsets : dict
        A dictionary containing, for each run of IWS, the final LF indices
        for each iteration: {runindex: {iteration: LF_indices}}
    device : str, default = 'cuda'
        String passed to torch to identify which device to use,
        e.g. 'cpu' or 'cuda:0'
    gap : int, default = 20
        Provide downstream results every `gap` iterations
    class_balance : tuple, default = None
        Class balance tuple (negative class fraction, positive class fraction)
        passed to the graphical model, e.g. class_balance = (0.5, 0.5)
    modelparams : dict, default = None
        Dictionary containing sizes of hidden layers and activation functions
        of the downstream MLP
    uniform : bool, default = False
        Use uniformly weighted LFs to obtain labels instead of fitting a
        graphical model to learn weights.
    verbose : bool, default = False
        Print iteration info if True.

    Returns
    -------
    results : dict
        A dictionary containing the probabilistic test set predictions for
        each IWS run and each internal iteration:
        {runindex: {iteration_idx: test_predictions}}
    """
    if modelparams is None:
        modelparams = {
            'h_sizes': [Xtrain.shape[1], 20, 20],
            'activations': [torch.nn.ReLU(), torch.nn.ReLU()]
        }
    results = {}
    # For each run of IWS
    for key, iterdict in lfsets.items():
        results[key] = {}
        # Establish which IWS iterations to obtain results for
        itermax = len(iterdict.keys())
        finaliter = itermax - 1
        iters_to_run = list(range(0, itermax, gap))
        if finaliter not in iters_to_run:
            # Always obtain results for the final iteration
            iters_to_run.append(finaliter)
        for iteration_idx in iters_to_run:
            if verbose:
                print('IWS run: %d' % key, ' iteration: %d' % iteration_idx)
            trainidxs = iterdict[iteration_idx]  # get selected LFs
            if uniform:
                LFStmp = np.asarray(iwssession.LFs_csc[:, trainidxs].todense())
                n, m = LFStmp.shape
                weights = np.ones(m)
                rowsums = np.asarray((LFStmp != 0).sum(1)).flatten()
                filteridx = rowsums != 0
                posevidence = ((LFStmp == 1).astype(np.float32)).dot(weights)
                negevidence = ((LFStmp == -1).astype(np.float32)).dot(weights)
                posevidence = np.clip(posevidence, 0.0, 700.0)
                negevidence = np.clip(negevidence, 0.0, 700.0)
                bin_posterior = np.exp(posevidence) / (np.exp(posevidence) +
                                                       np.exp(negevidence))
                bin_posterior = bin_posterior.astype(np.float32)
            else:
                Lambdas = np.asarray(
                    iwssession.LFs_csc[:, trainidxs].todense())
                # Convert to the Snorkel LF format (abstain 0 -> -1,
                # negative -1 -> 0) and create a variable to filter out
                # samples with 0 LF votes
                rowsums = (Lambdas != 0).sum(1)
                filteridx = rowsums != 0
                Lambda_snorkel = np.copy(Lambdas)
                Lambda_snorkel[Lambda_snorkel == 0] = -10
                Lambda_snorkel[Lambda_snorkel == -1] = 0
                Lambda_snorkel[Lambda_snorkel == -10] = -1
                # Train label model
                if 'cuda' in device:
                    torch.cuda.empty_cache()
                    label_model = LabelModel(cardinality=2, verbose=True,
                                             device=device)
                    label_model.fit(Lambda_snorkel[filteridx],
                                    class_balance=class_balance)
                    torch.cuda.empty_cache()
                else:
                    label_model = LabelModel(cardinality=2, verbose=True)
                    label_model.fit(Lambda_snorkel[filteridx],
                                    class_balance=class_balance)
                # Get label estimate
                posterior = label_model.predict_proba(Lambda_snorkel)
                bin_posterior = posterior[:, 1].astype(np.float32)
                tmpindicator = np.isnan(bin_posterior)
                if tmpindicator.sum() > 0:
                    bin_posterior[tmpindicator] = np.median(
                        bin_posterior[~tmpindicator])
            # Train classifier on label estimate and get test set predictions
            Xtrain_filtered = Xtrain[filteridx]
            probs_train_filtered = bin_posterior[filteridx]
            torch.cuda.empty_cache()
            model = TorchMLP(h_sizes=modelparams['h_sizes'],
                             activations=modelparams['activations'],
                             optimizer='Adam',
                             nepochs=250)
            if 'cuda' in device:
                tdevice = torch.device(device)
                model.model.to(tdevice)
                model.fit(Xtrain_filtered, probs_train_filtered,
                          device=tdevice)
                test_predictions = model.predict_proba(Xtest, device=tdevice)
            else:
                model.fit(Xtrain_filtered, probs_train_filtered)
                test_predictions = model.predict_proba(Xtest)
            results[key][iteration_idx] = test_predictions
    return results
def get_probabilistic_labels(iwssession,
                             lfsets,
                             device='cuda',
                             gap=20,
                             class_balance=None,
                             uniform=False,
                             verbose=False):
    """Fit a label model and return probabilistic training labels.

    Parameters
    ----------
    iwssession : object
        An instance of the InteractiveWeakSupervision class
    lfsets : dict
        A dictionary containing, for each run of IWS, the final LF indices
        for each iteration: {runindex: {iteration: LF_indices}}
    device : str, default = 'cuda'
        String passed to torch to identify which device to use,
        e.g. 'cpu' or 'cuda:0'
    gap : int, default = 20
        Provide downstream results every `gap` iterations
    class_balance : tuple, default = None
        Class balance tuple (negative class fraction, positive class fraction)
        passed to the graphical model, e.g. class_balance = (0.5, 0.5)
    uniform : bool, default = False
        Use uniformly weighted LFs to obtain labels instead of fitting a
        graphical model to learn weights.
    verbose : bool, default = False
        Print iteration info if True.

    Returns
    -------
    results : dict
        A dictionary containing the probabilistic train labels and a boolean
        filter index variable for each IWS run and each internal iteration.
        The filter index variable is True for every sample with at least one
        non-abstain vote: {runindex: {iteration_idx: (prob_labels, filteridx)}}
    """
    results = {}
    # For each run of IWS
    for key, iterdict in lfsets.items():
        results[key] = {}
        # Establish which IWS iterations to obtain results for
        itermax = len(iterdict.keys())
        finaliter = itermax - 1
        iters_to_run = list(range(0, itermax, gap))
        if finaliter not in iters_to_run:
            # Always obtain results for the final iteration
            iters_to_run.append(finaliter)
        for iteration_idx in iters_to_run:
            if verbose:
                print('IWS run: %d' % key, ' iteration: %d' % iteration_idx)
            trainidxs = iterdict[iteration_idx]  # get selected LFs
            if uniform:
                LFStmp = iwssession.LFs_csc[:, trainidxs].copy()
                n, m = LFStmp.shape
                weights = np.ones(m)
                rowsums = np.asarray((LFStmp != 0).sum(1)).flatten()
                filteridx = rowsums != 0
                posevidence = ((LFStmp == 1).astype(np.float32)).dot(weights)
                negevidence = ((LFStmp == -1).astype(np.float32)).dot(weights)
                posevidence = np.asarray(posevidence).flatten()
                negevidence = np.asarray(negevidence).flatten()
                posevidence = np.clip(posevidence, 0.0, 700.0)
                negevidence = np.clip(negevidence, 0.0, 700.0)
                bin_posterior = np.exp(posevidence) / (np.exp(posevidence) +
                                                       np.exp(negevidence))
                bin_posterior = bin_posterior.astype(np.float32)
            else:
                Lambdas = np.asarray(
                    iwssession.LFs_csc[:, trainidxs].todense())
                # Convert to the Snorkel LF format (abstain 0 -> -1,
                # negative -1 -> 0) and create a variable to filter out
                # samples with 0 LF votes
                rowsums = (Lambdas != 0).sum(1)
                filteridx = rowsums != 0
                Lambda_snorkel = np.copy(Lambdas)
                Lambda_snorkel[Lambda_snorkel == 0] = -10
                Lambda_snorkel[Lambda_snorkel == -1] = 0
                Lambda_snorkel[Lambda_snorkel == -10] = -1
                # Train label model
                if 'cuda' in device:
                    torch.cuda.empty_cache()
                    label_model = LabelModel(cardinality=2, verbose=True,
                                             device=device)
                    label_model.fit(Lambda_snorkel[filteridx],
                                    class_balance=class_balance)
                    torch.cuda.empty_cache()
                else:
                    label_model = LabelModel(cardinality=2, verbose=True)
                    label_model.fit(Lambda_snorkel[filteridx],
                                    class_balance=class_balance)
                # Get label estimate
                posterior = label_model.predict_proba(Lambda_snorkel)
                bin_posterior = posterior[:, 1].astype(np.float32)
                tmpindicator = np.isnan(bin_posterior)
                if tmpindicator.sum() > 0:
                    bin_posterior[tmpindicator] = np.median(
                        bin_posterior[~tmpindicator])
            results[key][iteration_idx] = (bin_posterior, filteridx)
    return results
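# Illustration of the expected lfsets structure and a call; the run index,
# iteration indices, and LF column indices below are made-up values, and
# iwssession is assumed to be an existing InteractiveWeakSupervision instance.
lfsets = {0: {0: [3, 17], 1: [3, 17, 42]}}
results = get_probabilistic_labels(iwssession, lfsets, device='cpu',
                                   gap=1, class_balance=(0.5, 0.5))
prob_labels, filteridx = results[0][1]  # labels from run 0, iteration 1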
# (This fragment runs inside a loop over random train/test splits indexed by
# i; all_scores accumulates the per-iteration results.)

# Define train dataset
L_train = L_data_local[train_idx]
Y_train = Y_data_local[train_idx]

# Define test dataset
L_test = L_data_local[test_idx]
Y_test = Y_data_local[test_idx]

# Evaluate a dependency-informed Snorkel model
l_model = LabelModel(cardinality=2, verbose=False)
l_model.fit(L_train, n_epochs=n_epochs, lr=lr)

try:
    if abstain_rate < 0:
        Y_pred = l_model.predict(L_test, tie_break_policy="abstain")
    else:
        Y_prob = l_model.predict_proba(L_test)
        Y_pred = predict_at_abstain_rate(Y_prob, abstain_rate)
    scores = scorer.score(Y_test, preds=Y_pred)
    all_scores.append(scores)
except Exception as e:
    print("Iter {}: {}".format(i + 1, e))
    continue

# Logging
print("Iteration " + str(i + 1) + ":", scores)

print("-- SUMMARY --")
print("accuracy: AVG {:.3f}, STD {:.3f}".format(
    np.mean([s["accuracy"] for s in all_scores]),
    np.std([s["accuracy"] for s in all_scores])))
print("f1: AVG {:.3f}, STD {:.3f}".format(
    np.mean([s["f1"] for s in all_scores]),
    np.std([s["f1"] for s in all_scores])))
print("abstain rate: AVG {:.3f}, STD {:.3f}".format(
    np.mean([s["abstain rate"] for s in all_scores]),
    np.std([s["abstain rate"] for s in all_scores])))
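# `predict_at_abstain_rate` is defined elsewhere in this project; a plausible
# self-contained sketch, assuming it abstains (-1) on the least-confident
# fraction of predictions:
import numpy as np

def predict_at_abstain_rate(Y_prob, abstain_rate):
    """Return hard predictions, abstaining on the least-confident fraction."""
    conf = Y_prob.max(axis=1)      # confidence = max class probability
    preds = Y_prob.argmax(axis=1)  # most likely class per data point
    n_abstain = int(round(abstain_rate * len(conf)))
    if n_abstain > 0:
        least_confident = np.argsort(conf)[:n_abstain]
        preds[least_confident] = -1  # Snorkel's abstain label
    return preds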
labeler = Labeler(session, candidate_classes)
labeler.apply(docs=train_docs, lfs=[[gold]], table=GoldLabel, train=True)

from fonduer_lfs import president_name_pob_lfs

labeler.apply(split=0,
              lfs=[president_name_pob_lfs],
              train=True,
              parallelism=PARALLEL)
L_train = labeler.get_label_matrices(train_cands)
L_gold_train = labeler.get_gold_labels(train_cands, annotator="gold")

from snorkel.labeling.model import LabelModel

label_model = LabelModel(verbose=False)
label_model.fit(L_train[0], n_epochs=500)
train_marginals = label_model.predict_proba(L_train[0])

ATTRIBUTE = "wiki"

import numpy as np
import emmental
from emmental.data import EmmentalDataLoader
from emmental.learner import EmmentalLearner
from emmental.model import EmmentalModel
from emmental.modules.embedding_module import EmbeddingModule
from fonduer.learning.dataset import FonduerDataset
from fonduer.learning.task import create_task
from fonduer.learning.utils import collect_word_counter

# Collect word counter
word_counter = collect_word_counter(train_cands)
def test_e2e():
    """Run an end-to-end test on documents of the hardware domain."""
    # GitHub Actions gives 2 cores
    # help.github.com/en/actions/reference/virtual-environments-for-github-hosted-runners
    PARALLEL = 2
    max_docs = 12

    fonduer.init_logging(
        format="[%(asctime)s][%(levelname)s] %(name)s:%(lineno)s - %(message)s",
        level=logging.INFO,
    )

    session = fonduer.Meta.init(CONN_STRING).Session()

    docs_path = "tests/data/html/"
    pdf_path = "tests/data/pdf/"

    doc_preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)

    corpus_parser = Parser(
        session,
        parallelism=PARALLEL,
        structural=True,
        lingual=True,
        visual=True,
        pdf_path=pdf_path,
    )
    corpus_parser.apply(doc_preprocessor)
    assert session.query(Document).count() == max_docs

    num_docs = session.query(Document).count()
    logger.info(f"Docs: {num_docs}")
    assert num_docs == max_docs

    num_sentences = session.query(Sentence).count()
    logger.info(f"Sentences: {num_sentences}")

    # Divide into test and train
    docs = sorted(corpus_parser.get_documents())
    last_docs = sorted(corpus_parser.get_last_documents())

    ld = len(docs)
    assert ld == len(last_docs)
    assert len(docs[0].sentences) == len(last_docs[0].sentences)

    assert len(docs[0].sentences) == 799
    assert len(docs[1].sentences) == 663
    assert len(docs[2].sentences) == 784
    assert len(docs[3].sentences) == 661
    assert len(docs[4].sentences) == 513
    assert len(docs[5].sentences) == 700
    assert len(docs[6].sentences) == 528
    assert len(docs[7].sentences) == 161
    assert len(docs[8].sentences) == 228
    assert len(docs[9].sentences) == 511
    assert len(docs[10].sentences) == 331
    assert len(docs[11].sentences) == 528

    # Check table numbers
    assert len(docs[0].tables) == 9
    assert len(docs[1].tables) == 9
    assert len(docs[2].tables) == 14
    assert len(docs[3].tables) == 11
    assert len(docs[4].tables) == 11
    assert len(docs[5].tables) == 10
    assert len(docs[6].tables) == 10
    assert len(docs[7].tables) == 2
    assert len(docs[8].tables) == 7
    assert len(docs[9].tables) == 10
    assert len(docs[10].tables) == 6
    assert len(docs[11].tables) == 9

    # Check figure numbers
    assert len(docs[0].figures) == 32
    assert len(docs[1].figures) == 11
    assert len(docs[2].figures) == 38
    assert len(docs[3].figures) == 31
    assert len(docs[4].figures) == 7
    assert len(docs[5].figures) == 38
    assert len(docs[6].figures) == 10
    assert len(docs[7].figures) == 31
    assert len(docs[8].figures) == 4
    assert len(docs[9].figures) == 27
    assert len(docs[10].figures) == 5
    assert len(docs[11].figures) == 27

    # Check caption numbers
    assert len(docs[0].captions) == 0
    assert len(docs[1].captions) == 0
    assert len(docs[2].captions) == 0
    assert len(docs[3].captions) == 0
    assert len(docs[4].captions) == 0
    assert len(docs[5].captions) == 0
    assert len(docs[6].captions) == 0
    assert len(docs[7].captions) == 0
    assert len(docs[8].captions) == 0
    assert len(docs[9].captions) == 0
    assert len(docs[10].captions) == 0
    assert len(docs[11].captions) == 0

    train_docs = set()
    dev_docs = set()
    test_docs = set()
    splits = (0.5, 0.75)
    data = [(doc.name, doc) for doc in docs]
    data.sort(key=lambda x: x[0])
    for i, (doc_name, doc) in enumerate(data):
        if i < splits[0] * ld:
            train_docs.add(doc)
        elif i < splits[1] * ld:
            dev_docs.add(doc)
        else:
            test_docs.add(doc)
    logger.info([x.name for x in train_docs])

    # NOTE: With multi-relation support, return values of getting candidates,
    # mentions, or sparse matrices are formatted as a list of lists. This
    # means that with a single relation, we need to index into the list of
    # lists to get the candidates/mentions/sparse matrix for a particular
    # relation or mention.
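    # Illustrative example of that indexing convention (not in the original):
    # with the two relations defined below, get_candidates(split=0) returns
    # [[PartTemp candidates], [PartVolt candidates]], so train_cands[0]
    # selects the PartTemp list and train_cands[0][0] a single candidate.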
    # Mention Extraction
    part_ngrams = MentionNgramsPart(parts_by_doc=None, n_max=3)
    temp_ngrams = MentionNgramsTemp(n_max=2)
    volt_ngrams = MentionNgramsVolt(n_max=1)

    Part = mention_subclass("Part")
    Temp = mention_subclass("Temp")
    Volt = mention_subclass("Volt")

    mention_extractor = MentionExtractor(
        session,
        [Part, Temp, Volt],
        [part_ngrams, temp_ngrams, volt_ngrams],
        [part_matcher, temp_matcher, volt_matcher],
    )

    mention_extractor.apply(docs, parallelism=PARALLEL)

    assert session.query(Part).count() == 299
    assert session.query(Temp).count() == 138
    assert session.query(Volt).count() == 140
    assert len(mention_extractor.get_mentions()) == 3
    assert len(mention_extractor.get_mentions()[0]) == 299
    assert (len(
        mention_extractor.get_mentions(docs=[
            session.query(Document).filter(Document.name == "112823").first()
        ])[0]) == 70)

    # Candidate Extraction
    PartTemp = candidate_subclass("PartTemp", [Part, Temp])
    PartVolt = candidate_subclass("PartVolt", [Part, Volt])

    candidate_extractor = CandidateExtractor(
        session, [PartTemp, PartVolt],
        throttlers=[temp_throttler, volt_throttler])

    for i, docs in enumerate([train_docs, dev_docs, test_docs]):
        candidate_extractor.apply(docs, split=i, parallelism=PARALLEL)

    assert session.query(PartTemp).filter(PartTemp.split == 0).count() == 3493
    assert session.query(PartTemp).filter(PartTemp.split == 1).count() == 61
    assert session.query(PartTemp).filter(PartTemp.split == 2).count() == 416
    assert session.query(PartVolt).count() == 4282

    # Grab candidate lists
    train_cands = candidate_extractor.get_candidates(split=0, sort=True)
    dev_cands = candidate_extractor.get_candidates(split=1, sort=True)
    test_cands = candidate_extractor.get_candidates(split=2, sort=True)
    assert len(train_cands) == 2
    assert len(train_cands[0]) == 3493
    assert (len(
        candidate_extractor.get_candidates(docs=[
            session.query(Document).filter(Document.name == "112823").first()
        ])[0]) == 1432)

    # Featurization
    featurizer = Featurizer(session, [PartTemp, PartVolt])

    # Test that FeatureKey is properly reset
    featurizer.apply(split=1, train=True, parallelism=PARALLEL)
    assert session.query(Feature).count() == 214
    assert session.query(FeatureKey).count() == 1260

    # Test dropping a FeatureKey; should force a row deletion
    featurizer.drop_keys(["DDL_e1_W_LEFT_POS_3_[NNP NN IN]"])
    assert session.query(FeatureKey).count() == 1259

    # Should only remove the part_volt as a relation and leave part_temp
    assert set(
        session.query(FeatureKey).filter(
            FeatureKey.name ==
            "DDL_e1_LEMMA_SEQ_[bc182]").one().candidate_classes) == {
                "part_temp", "part_volt"
            }
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"],
                         candidate_classes=[PartVolt])
    assert session.query(FeatureKey).filter(
        FeatureKey.name ==
        "DDL_e1_LEMMA_SEQ_[bc182]").one().candidate_classes == ["part_temp"]
    assert session.query(FeatureKey).count() == 1259

    # Re-insert the removed key
    featurizer.upsert_keys(["DDL_e1_LEMMA_SEQ_[bc182]"],
                           candidate_classes=[PartTemp, PartVolt])
    assert set(
        session.query(FeatureKey).filter(
            FeatureKey.name ==
            "DDL_e1_LEMMA_SEQ_[bc182]").one().candidate_classes) == {
                "part_temp", "part_volt"
            }
    assert session.query(FeatureKey).count() == 1259

    # Remove the key again
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"],
                         candidate_classes=[PartVolt])

    # Removing the last relation from a key should delete the row
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"],
                         candidate_classes=[PartTemp])
    assert session.query(FeatureKey).count() == 1258

    session.query(Feature).delete(synchronize_session="fetch")
    session.query(FeatureKey).delete(synchronize_session="fetch")
    featurizer.apply(split=0, train=True, parallelism=PARALLEL)
    assert session.query(Feature).count() == 6478
    assert session.query(FeatureKey).count() == 4538
    F_train = featurizer.get_feature_matrices(train_cands)
    assert F_train[0].shape == (3493, 4538)
    assert F_train[1].shape == (2985, 4538)
    assert len(featurizer.get_keys()) == 4538

    featurizer.apply(split=1, parallelism=PARALLEL)
    assert session.query(Feature).count() == 6692
    assert session.query(FeatureKey).count() == 4538
    F_dev = featurizer.get_feature_matrices(dev_cands)
    assert F_dev[0].shape == (61, 4538)
    assert F_dev[1].shape == (153, 4538)

    featurizer.apply(split=2, parallelism=PARALLEL)
    assert session.query(Feature).count() == 8252
    assert session.query(FeatureKey).count() == 4538
    F_test = featurizer.get_feature_matrices(test_cands)
    assert F_test[0].shape == (416, 4538)
    assert F_test[1].shape == (1144, 4538)

    gold_file = "tests/data/hardware_tutorial_gold.csv"

    labeler = Labeler(session, [PartTemp, PartVolt])

    labeler.apply(
        docs=last_docs,
        lfs=[[gold], [gold]],
        table=GoldLabel,
        train=True,
        parallelism=PARALLEL,
    )
    assert session.query(GoldLabel).count() == 8252

    stg_temp_lfs = [
        LF_storage_row,
        LF_operating_row,
        LF_temperature_row,
        LF_tstg_row,
        LF_to_left,
        LF_negative_number_left,
    ]
    ce_v_max_lfs = [
        LF_bad_keywords_in_row,
        LF_current_in_row,
        LF_non_ce_voltages_in_row,
    ]

    with pytest.raises(ValueError):
        labeler.apply(split=0, lfs=stg_temp_lfs, train=True,
                      parallelism=PARALLEL)

    labeler.apply(
        docs=train_docs,
        lfs=[stg_temp_lfs, ce_v_max_lfs],
        train=True,
        parallelism=PARALLEL,
    )
    assert session.query(Label).count() == 6478
    assert session.query(LabelKey).count() == 9
    L_train = labeler.get_label_matrices(train_cands)
    assert L_train[0].shape == (3493, 9)
    assert L_train[1].shape == (2985, 9)
    assert len(labeler.get_keys()) == 9

    # Test dropping a LabelerKey
    labeler.drop_keys(["LF_storage_row"])
    assert len(labeler.get_keys()) == 8

    # Test upserting a LabelerKey
    labeler.upsert_keys(["LF_storage_row"])
    assert "LF_storage_row" in [label.name for label in labeler.get_keys()]

    L_train_gold = labeler.get_gold_labels(train_cands)
    assert L_train_gold[0].shape == (3493, 1)

    L_train_gold = labeler.get_gold_labels(train_cands, annotator="gold")
    assert L_train_gold[0].shape == (3493, 1)

    label_model = LabelModel()
    label_model.fit(L_train=L_train[0], n_epochs=500, log_freq=100)

    train_marginals = label_model.predict_proba(L_train[0])

    # Collect word counter
    word_counter = collect_word_counter(train_cands)

    emmental.init(fonduer.Meta.log_path)

    # Training config
    config = {
        "meta_config": {"verbose": False},
        "model_config": {
            "model_path": None,
            "device": 0,
            "dataparallel": False,
        },
        "learner_config": {
            "n_epochs": 5,
            "optimizer_config": {"lr": 0.001, "l2": 0.0},
            "task_scheduler": "round_robin",
        },
        "logging_config": {
            "evaluation_freq": 1,
            "counter_unit": "epoch",
            "checkpointing": False,
            "checkpointer_config": {
                "checkpoint_metric": {
                    f"{ATTRIBUTE}/{ATTRIBUTE}/train/loss": "min"
                },
                "checkpoint_freq": 1,
                "checkpoint_runway": 2,
                "clear_intermediate_checkpoints": True,
                "clear_all_checkpoints": True,
            },
        },
    }
    emmental.Meta.update_config(config=config)

    # Generate word embedding module
    arity = 2
    # Generate special tokens
    specials = []
    for i in range(arity):
        specials += [f"~~[[{i}", f"{i}]]~~"]

    emb_layer = EmbeddingModule(word_counter=word_counter,
                                word_dim=300,
                                specials=specials)

    diffs = train_marginals.max(axis=1) - train_marginals.min(axis=1)
    train_idxs = np.where(diffs > 1e-6)[0]

    train_dataloader = EmmentalDataLoader(
        task_to_label_dict={ATTRIBUTE: "labels"},
        dataset=FonduerDataset(
            ATTRIBUTE,
            train_cands[0],
            F_train[0],
            emb_layer.word2id,
            train_marginals,
            train_idxs,
        ),
        split="train",
        batch_size=100,
        shuffle=True,
    )

    tasks = create_task(ATTRIBUTE, 2, F_train[0].shape[1], 2, emb_layer,
                        model="LogisticRegression")

    model = EmmentalModel(name=f"{ATTRIBUTE}_task")

    for task in tasks:
        model.add_task(task)

    emmental_learner = EmmentalLearner()
    emmental_learner.learn(model, [train_dataloader])

    test_dataloader = EmmentalDataLoader(
        task_to_label_dict={ATTRIBUTE: "labels"},
        dataset=FonduerDataset(ATTRIBUTE, test_cands[0], F_test[0],
                               emb_layer.word2id, 2),
        split="test",
        batch_size=100,
        shuffle=False,
    )

    test_preds = model.predict(test_dataloader, return_preds=True)
    positive = np.where(
        np.array(test_preds["probs"][ATTRIBUTE])[:, TRUE] > 0.6)
    true_pred = [test_cands[0][_] for _ in positive[0]]

    pickle_file = "tests/data/parts_by_doc_dict.pkl"
    with open(pickle_file, "rb") as f:
        parts_by_doc = pickle.load(f)

    (TP, FP, FN) = entity_level_f1(true_pred,
                                   gold_file,
                                   ATTRIBUTE,
                                   test_docs,
                                   parts_by_doc=parts_by_doc)

    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")

    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")
    assert f1 < 0.7 and f1 > 0.3

    stg_temp_lfs_2 = [
        LF_to_left,
        LF_test_condition_aligned,
        LF_collector_aligned,
        LF_current_aligned,
        LF_voltage_row_temp,
        LF_voltage_row_part,
        LF_typ_row,
        LF_complement_left_row,
        LF_too_many_numbers_row,
        LF_temp_on_high_page_num,
        LF_temp_outside_table,
        LF_not_temp_relevant,
    ]
    labeler.update(split=0,
                   lfs=[stg_temp_lfs_2, ce_v_max_lfs],
                   parallelism=PARALLEL)
    assert session.query(Label).count() == 6478
    assert session.query(LabelKey).count() == 16
    L_train = labeler.get_label_matrices(train_cands)
    assert L_train[0].shape == (3493, 16)

    label_model = LabelModel()
    label_model.fit(L_train=L_train[0], n_epochs=500, log_freq=100)

    train_marginals = label_model.predict_proba(L_train[0])
    diffs = train_marginals.max(axis=1) - train_marginals.min(axis=1)
    train_idxs = np.where(diffs > 1e-6)[0]

    train_dataloader = EmmentalDataLoader(
        task_to_label_dict={ATTRIBUTE: "labels"},
        dataset=FonduerDataset(
            ATTRIBUTE,
            train_cands[0],
            F_train[0],
            emb_layer.word2id,
            train_marginals,
            train_idxs,
        ),
        split="train",
        batch_size=100,
        shuffle=True,
    )

    valid_dataloader = EmmentalDataLoader(
        task_to_label_dict={ATTRIBUTE: "labels"},
        dataset=FonduerDataset(
            ATTRIBUTE,
            train_cands[0],
            F_train[0],
            emb_layer.word2id,
            np.argmax(train_marginals, axis=1),
            train_idxs,
        ),
        split="valid",
        batch_size=100,
        shuffle=False,
    )

    emmental.Meta.reset()
    emmental.init(fonduer.Meta.log_path)
    emmental.Meta.update_config(config=config)

    tasks = create_task(ATTRIBUTE, 2, F_train[0].shape[1], 2, emb_layer,
                        model="LogisticRegression")

    model = EmmentalModel(name=f"{ATTRIBUTE}_task")

    for task in tasks:
        model.add_task(task)

    emmental_learner = EmmentalLearner()
    emmental_learner.learn(model, [train_dataloader, valid_dataloader])

    test_preds = model.predict(test_dataloader, return_preds=True)
    positive = np.where(
        np.array(test_preds["probs"][ATTRIBUTE])[:, TRUE] > 0.7)
    true_pred = [test_cands[0][_] for _ in positive[0]]

    (TP, FP, FN) = entity_level_f1(true_pred,
                                   gold_file,
                                   ATTRIBUTE,
                                   test_docs,
                                   parts_by_doc=parts_by_doc)

    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")

    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")
    assert f1 > 0.7

    # Testing LSTM
    emmental.Meta.reset()
    emmental.init(fonduer.Meta.log_path)
    emmental.Meta.update_config(config=config)

    tasks = create_task(ATTRIBUTE, 2, F_train[0].shape[1], 2, emb_layer,
                        model="LSTM")

    model = EmmentalModel(name=f"{ATTRIBUTE}_task")

    for task in tasks:
        model.add_task(task)

    emmental_learner = EmmentalLearner()
    emmental_learner.learn(model, [train_dataloader])

    test_preds = model.predict(test_dataloader, return_preds=True)
    positive = np.where(
        np.array(test_preds["probs"][ATTRIBUTE])[:, TRUE] > 0.7)
    true_pred = [test_cands[0][_] for _ in positive[0]]

    (TP, FP, FN) = entity_level_f1(true_pred,
                                   gold_file,
                                   ATTRIBUTE,
                                   test_docs,
                                   parts_by_doc=parts_by_doc)

    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")

    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")
    assert f1 > 0.7
def run_labeling_functions(cands):
    ABSTAIN = -1
    FALSE = 0
    TRUE = 1

    # Extract candidates
    train_cands = cands[0]
    dev_cands = cands[1]
    test_cands = cands[2]

    @labeling_function()
    def LF_other_station_table(c):
        station_span = c.station.context.get_span().lower()
        neighbour_cells = get_neighbor_cell_ngrams_own(c.price,
                                                       dist=100,
                                                       directions=True,
                                                       n_max=4,
                                                       absolute=True)
        up_cells = [
            x for x in neighbour_cells
            if len(x) > 1 and x[1] == 'DOWN' and x[0] in stations_list
        ]
        # No station name in upper cells
        if len(up_cells) == 0:
            return ABSTAIN
        # Check if the next upper aligned station span corresponds to the
        # candidate span (or equivalents)
        closest_header = up_cells[len(up_cells) - 1]
        return (TRUE
                if closest_header[0] in stations_mapping_dict[station_span]
                else FALSE)

    @labeling_function()
    def LF_station_non_meta_tag(c):
        html_tags = get_ancestor_tag_names(c.station)
        return (FALSE if 'head' in html_tags and 'title' in html_tags
                else ABSTAIN)

    # Basic constraint for the price LFs to be true -> no wrong station
    # (increases accuracy)
    def base(c):
        return (LF_station_non_meta_tag(c) != FALSE
                and LF_other_station_table(c) != FALSE
                and LF_off_peak_head(c) != FALSE
                and LF_purchases(c) != FALSE)

    # 2.) Create labeling functions
    @labeling_function()
    def LF_on_peak_head(c):
        return (TRUE
                if 'on peak' in get_aligned_ngrams(c.price, n_min=2, n_max=2)
                and base(c) else ABSTAIN)

    @labeling_function()
    def LF_off_peak_head(c):
        return (FALSE
                if 'off peak' in get_aligned_ngrams(c.price, n_min=2, n_max=2)
                else ABSTAIN)

    @labeling_function()
    def LF_price_range(c):
        price = float(c.price.context.get_span())
        return TRUE if price > 0 and price < 1000 and base(c) else FALSE

    @labeling_function()
    def LF_price_head(c):
        return (TRUE if 'price' in get_aligned_ngrams(c.price) and base(c)
                else ABSTAIN)

    @labeling_function()
    def LF_firm_head(c):
        return (TRUE if 'firm' in get_aligned_ngrams(c.price) and base(c)
                else ABSTAIN)

    @labeling_function()
    def LF_dollar_to_left(c):
        return (TRUE if '$' in get_left_ngrams(c.price, window=2) and base(c)
                else ABSTAIN)

    @labeling_function()
    def LF_purchases(c):
        return (FALSE if 'purchases' in get_aligned_ngrams(c.price, n_min=1)
                else ABSTAIN)

    station_price_lfs = [
        LF_other_station_table,
        LF_station_non_meta_tag,
        # indicator
        LF_price_range,
        # negative indicators
        LF_off_peak_head,
        LF_purchases,
        # positive indicators
        LF_on_peak_head,
        LF_price_head,
        LF_firm_head,
        LF_dollar_to_left,
    ]

    # 3.) Apply the LFs on the training set
    labeler = Labeler(session, [StationPrice])
    labeler.apply(split=0,
                  lfs=[station_price_lfs],
                  train=True,
                  clear=True,
                  parallelism=PARALLEL)
    L_train = labeler.get_label_matrices(train_cands)

    # Check that all LFs were applied (avoid a crash)
    applied_lfs = L_train[0].shape[1]
    has_non_applied = applied_lfs != len(station_price_lfs)
    print(f"Labeling functions on train_cands not ABSTAIN: "
          f"{applied_lfs} (/{len(station_price_lfs)})")
    if has_non_applied:
        applied_lfs = get_applied_lfs(session)
        non_applied_lfs = [
            l.name for l in station_price_lfs if l.name not in applied_lfs
        ]
        print(f"Labeling functions {non_applied_lfs} are not applied.")
        station_price_lfs = [
            l for l in station_price_lfs if l.name in applied_lfs
        ]

    # 4.) Evaluate their accuracy
    L_gold_train = labeler.get_gold_labels(train_cands, annotator='gold')
    # Sort LFs for LFAnalysis because LFAnalysis does not sort LFs,
    # while the columns of L_train are already sorted alphabetically.
    sorted_lfs = sorted(station_price_lfs, key=lambda lf: lf.name)
    LFAnalysis(L=L_train[0], lfs=sorted_lfs).lf_summary(
        Y=L_gold_train[0].reshape(-1))

    # 5.) Build generative model
    gen_model = LabelModel(cardinality=2)
    gen_model.fit(L_train[0], n_epochs=500, log_freq=100)
    train_marginals_lfs = gen_model.predict_proba(L_train[0])

    # Apply on dev set
    labeler.apply(split=1,
                  lfs=[station_price_lfs],
                  clear=True,
                  parallelism=PARALLEL)
    L_dev = labeler.get_label_matrices(dev_cands)
    L_gold_dev = labeler.get_gold_labels(dev_cands, annotator='gold')
    LFAnalysis(L=L_dev[0], lfs=sorted_lfs).lf_summary(
        Y=L_gold_dev[0].reshape(-1))

    return (gen_model, train_marginals_lfs)
# We perform a simple random hyperparameter search over learning rate and L2
# regularization, using our small labeled development set to choose the best
# model.

# %%
from snorkel.labeling.model import LabelModel
from snorkel.utils import probs_to_preds
from snorkel.analysis import metric_score

label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train,
                log_freq=10,
                lr=0.05,
                class_balance=[0.7, 0.3],
                n_epochs=100)

# %%
Y_probs_valid = label_model.predict_proba(L_valid)
Y_preds_valid = probs_to_preds(Y_probs_valid)
metric_score(Y_valid, Y_preds_valid, probs=None, metric="f1")

# %% [markdown]
# **Majority Vote**

# %%
from snorkel.labeling.model import MajorityLabelVoter

mv_model = MajorityLabelVoter()
Y_probs_valid = mv_model.predict_proba(L_valid)
Y_preds_valid = probs_to_preds(Y_probs_valid)
metric_score(Y_valid, Y_preds_valid, probs=None, metric="f1")

# %%
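# %% [markdown]
# The cell above fits only a single configuration; a minimal sketch of the
# random search described in the text might look like the following (the
# search ranges, number of trials, and seed are assumptions, not from the
# original notebook).

# %%
import numpy as np

rng = np.random.RandomState(123)
best_f1, best_model = -1.0, None
for _ in range(10):
    # Sample learning rate and L2 regularization on a log scale
    lr = 10 ** rng.uniform(-3, -1)
    l2 = 10 ** rng.uniform(-4, -1)
    candidate = LabelModel(cardinality=2, verbose=False)
    candidate.fit(L_train, lr=lr, l2=l2,
                  class_balance=[0.7, 0.3], n_epochs=100)
    preds = probs_to_preds(candidate.predict_proba(L_valid))
    f1 = metric_score(Y_valid, preds, probs=None, metric="f1")
    if f1 > best_f1:
        best_f1, best_model = f1, candidate
print(f"Best dev F1: {best_f1:.3f}")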
label_model_acc = label_model.score(L=L_dev,
                                    Y=y_dev,
                                    tie_break_policy="random")["accuracy"]
print(f'label model acc: {label_model_acc}')

print('fitting Majority Label Voter model')
majority_model = MajorityLabelVoter(cardinality=config['cardinality'])
# preds_train = majority_model.predict(L=L_train)
majority_acc = majority_model.score(L=L_dev,
                                    Y=np.array(y_dev).reshape(-1, 1),
                                    tie_break_policy="random")["accuracy"]
print(f'majority_label_acc: {majority_acc}')

log_metric('majority_label_acc', majority_acc)
log_metric('label_model_acc', label_model_acc)

probs_train = label_model.predict_proba(L=L_train)
df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=X_train, y=probs_train, L=L_train)

print('setting up Label Model')
stop_words = config['stop_words']
custom_stop_words = text.ENGLISH_STOP_WORDS.union(stop_words)
# vectorizer = CountVectorizer(ngram_range=(1, 5))
vectorizer = TfidfVectorizer(stop_words=custom_stop_words).fit(
    X_train.text.tolist())
X_train_vectorized = vectorizer.transform(X_train.text.tolist())
X_train_filtered_vectorized = vectorizer.transform(
    df_train_filtered.text.tolist())

# Use the weak labels generated by the label model to train a downstream
# classifier
preds_train_filtered = probs_to_preds(probs=probs_train_filtered)
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)

L_test = applier.apply(test_df)
# to_numerical = lambda x: x == 'leave'
# Y_test = [to_numerical(item) for item in test_df.label]
Y_test = []
for item in test_df.label:
    if item == 'stay':
        Y_test.append(STAY)
    else:
        Y_test.append(LEAVE)
Y_test = np.asarray(Y_test)

label_model_performance = label_model.score(
    L=L_test,
    Y=Y_test,
    tie_break_policy="random",
    metrics=['accuracy', 'precision', 'recall', 'f1'])
print(f"Label Model Accuracy: {label_model_performance['accuracy'] * 100:.1f}%")

predict_probs = label_model.predict_proba(L_unlabeled)
preds = probs_to_preds(predict_probs)
pred_labels = []
for i in range(len(preds)):
    if preds[i]:
        pred_labels.append('leave')
    else:
        pred_labels.append('stay')
unlabeled_data['label'] = pred_labels
unlabeled_data.to_csv(os.path.join(data_dir, 'snorkel_labeled_data.csv'),
                      sep=',',
                      index=False)
def model_analysis(label_model: LabelModel,
                   training_set: pd.DataFrame,
                   L_train: np.ndarray,
                   L_test: np.ndarray,
                   Y_test: np.ndarray,
                   lfs: list,
                   output_file="output") -> None:
    # TODO: consider using **kwargs instead of this painful list of arguments
    """Output analysis for the label model to a file

    :param label_model: The current label model which we want to output
        analysis for
    :type label_model: LabelModel
    :param training_set: A dataframe containing the training dataset
    :type training_set: pd.DataFrame
    :param L_train: The matrix of labels generated by the labeling functions
        on the training data
    :type L_train: np.ndarray
    :param L_test: The matrix of labels generated by the labeling functions
        on the testing data
    :type L_test: np.ndarray
    :param Y_test: Gold labels associated with data points in L_test
    :type Y_test: np.ndarray
    :param lfs: List of labeling functions
    :type lfs: list
    :param output_file: A path where the output file should be written to,
        defaults to `PROJECT_ROOT/output`
    :type output_file: str, optional
    """
    Y_train = label_model.predict_proba(L=L_train)
    Y_pred = label_model.predict(L=L_test, tie_break_policy="abstain")
    lf_analysis_train = LFAnalysis(L=L_train, lfs=lfs).lf_summary()
    # TODO: Write this df to an output file. Ask Jennifer about how to
    # handle this
    print(lf_analysis_train)

    # Build majority label voter model
    majority_model = MajorityLabelVoter()
    majority_acc = majority_model.score(L=L_test,
                                        Y=Y_test,
                                        tie_break_policy="abstain",
                                        metrics=["f1", "accuracy"])
    label_model_acc = label_model.score(L=L_test,
                                        Y=Y_test,
                                        tie_break_policy="abstain",
                                        metrics=["f1", "accuracy"])

    # Get precision and recall scores
    p_score = precision_score(y_true=Y_test, y_pred=Y_pred,
                              average='weighted')
    r_score = recall_score(y_true=Y_test,
                           y_pred=Y_pred,
                           average='weighted',
                           labels=np.unique(Y_pred))

    # How many documents abstained
    probs_train = majority_model.predict_proba(L=L_train)
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=training_set, y=probs_train, L=L_train)

    # Get the confusion-matrix buckets (some may be absent, hence .get)
    buckets = get_label_buckets(Y_test, Y_pred)
    true_positives, false_positives, true_negatives, false_negatives = (
        buckets.get((1, 1)), buckets.get((1, 0)), buckets.get((0, 0)),
        buckets.get((0, 1)))

    # Write analysis to file
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    with open(f"{'../output/logs/'}{output_file}_run_{timestamp}.txt",
              "w") as output_file:
        output_file.write(
            f"{'Majority Vote Accuracy:':<25} {majority_acc['accuracy'] * 100:.2f}%"
        )
        output_file.write(
            f"\n{'Majority Vote F1 Score:':<25} {majority_acc['f1'] * 100:.2f}%"
        )
        output_file.write(
            f"\n{'Label Model Accuracy:':<25} {label_model_acc['accuracy'] * 100:.2f}%"
        )
        output_file.write(
            f"\n{'Label Model F1 Score:':<25} {label_model_acc['f1'] * 100:.2f}%"
        )
        output_file.write(f"\n{'Precision Score:':<25} {p_score * 100:.2f}%")
        output_file.write(f"\n{'Recall Score:':<25} {r_score * 100:.2f}%")
        output_file.write(
            f"\n{'Abstained Data Points:':<25} {len(df_train_filtered)}")
        output_file.write(
            f"\n{'True Positives:':<25} {len(true_positives) if true_positives is not None else 0}"
        )
        output_file.write(
            f"\n{'False Positives:':<25} {len(false_positives) if false_positives is not None else 0}"
        )
        output_file.write(
            f"\n{'False Negatives:':<25} {len(false_negatives) if false_negatives is not None else 0}"
        )
        output_file.write(
            f"\n{'True Negatives:':<25} {len(true_negatives) if true_negatives is not None else 0}"
        )
        output_file.write(
            f"\n{'Abstained Positives:':<25} {len(buckets.get((1, -1), []))}")
        output_file.write(
            f"\n{'Abstained Negatives:':<25} {len(buckets.get((0, -1), []))}")
# %% {"tags": ["md-exclude-output"]} from snorkel.labeling.model import LabelModel label_model = LabelModel(cardinality=2, verbose=True) label_model.fit(L_train, Y_dev, n_epochs=5000, log_freq=500, seed=12345) # %% [markdown] # ### Label Model Metrics # Since our dataset is highly unbalanced (91% of the labels are negative), even a trivial baseline that always outputs negative can get a high accuracy. So we evaluate the label model using the F1 score and ROC-AUC rather than accuracy. # %% from snorkel.analysis import metric_score from snorkel.utils import probs_to_preds probs_dev = label_model.predict_proba(L_dev) preds_dev = probs_to_preds(probs_dev) print( f"Label model f1 score: {metric_score(Y_dev, preds_dev, probs=probs_dev, metric='f1')}" ) print( f"Label model roc-auc: {metric_score(Y_dev, preds_dev, probs=probs_dev, metric='roc_auc')}" ) # %% [markdown] # ### Part 4: Training our End Extraction Model # # In this final section of the tutorial, we'll use our noisy training labels to train our end machine learning model. We start by filtering out training data points which did not recieve a label from any LF, as these data points contain no signal. # # %% from snorkel.labeling import filter_unlabeled_dataframe