def train_model_random_lfs(randomly_sampled_lfs, train_matrix, dev_matrix,
                           dev_labels, test_matrix, regularization_grid):
    train_grid_results = defaultdict(dict)
    dev_grid_results = defaultdict(dict)
    test_grid_results = defaultdict(dict)

    for sample_idx, lf_indices in tqdm_notebook(enumerate(randomly_sampled_lfs)):
        # Grid search over the regularization strength, scored on the dev set.
        hyper_grid_results = {}
        for param in regularization_grid:
            label_model = LabelModel(k=2)
            label_model.train_model(
                train_matrix[:, lf_indices],
                n_epochs=1000,
                log_train_every=200,
                seed=100,
                lr=0.01,
                l2=param,
                verbose=False
            )
            # Score the positive-class marginals against the dev labels.
            # ROC AUC (sklearn.metrics.roc_auc_score) is an assumed selection
            # metric; column 0 is assumed to hold the positive-class probability.
            dev_marginals = label_model.predict_proba(dev_matrix[:, lf_indices])
            hyper_grid_results[param] = roc_auc_score(dev_labels, dev_marginals[:, 0])

        # Pick the regularization strength with the best dev performance.
        best_param = max(hyper_grid_results, key=hyper_grid_results.get)

        # Retrain with the selected regularization strength.
        label_model.train_model(
            train_matrix[:, lf_indices],
            n_epochs=1000,
            log_train_every=200,
            seed=50,
            lr=0.01,
            l2=best_param,
            verbose=False
        )

        key = f'{sample_idx}:{",".join(map(str, lf_indices))}'
        train_grid_results[key] = label_model.predict_proba(train_matrix[:, lf_indices])
        dev_grid_results[key] = label_model.predict_proba(dev_matrix[:, lf_indices])
        test_grid_results[key] = label_model.predict_proba(test_matrix[:, lf_indices])

    return train_grid_results, dev_grid_results, test_grid_results
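# A minimal sketch of how this helper might be invoked. The label matrices,
# dev labels, and sampling scheme below are illustrative placeholders, not
# part of the original pipeline.
import numpy as np

n_lfs = L_train.shape[1]
rng = np.random.RandomState(100)

# e.g. five random subsets of four LF columns each
randomly_sampled_lfs = [sorted(rng.choice(n_lfs, size=4, replace=False))
                        for _ in range(5)]
regularization_grid = np.round(np.linspace(0.01, 5, num=15), 2)

train_res, dev_res, test_res = train_model_random_lfs(
    randomly_sampled_lfs, L_train, L_dev, dev_labels, L_test,
    regularization_grid)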
def generative_model(L_train, n_epochs=500, print_every=100):
    model = LabelModel(k=2)

    logger.info("Training generative model...")
    model.train_model(L_train, n_epochs=n_epochs, print_every=print_every)
    logger.info("Done.")

    marginals = model.predict_proba(L_train)
    return marginals
def generative_model(L_train, n_epochs=500, print_every=100):
    model = LabelModel(k=2)

    logger.info("Training generative model...")
    model.train_model(L_train, n_epochs=n_epochs, print_every=print_every)
    logger.info("Done.")

    marginals = model.predict_proba(L_train)

    # Plot the distribution of positive-class marginals for inspection.
    plt.hist(marginals[:, TRUE - 1], bins=20)
    plt.savefig(os.path.join(os.path.dirname(__file__), "opamps_marginals.pdf"))

    return marginals
def apply_labelling_functions(featurizer_output):
    session = featurizer_output['session']
    cands = featurizer_output['candidate_variable']

    # Apply the labeling functions to the training candidates.
    labeler = Labeler(session, cands)
    labeler.apply(lfs=[lfs], train=True, parallelism=config.PARALLEL)

    train_cands = [session.query(featurizer_output['candidate_variable'][0]).all()]
    L_train = labeler.get_label_matrices(train_cands)

    # Train the generative label model and attach its marginals to the output.
    gen_model = LabelModel(k=2)
    gen_model.train_model(L_train[0], n_epochs=300, print_every=100)
    train_marginals = gen_model.predict_proba(L_train[0])

    featurizer_output['train_marginals'] = train_marginals
    return featurizer_output
def test_gpustorage(self):
    # Running basics tutorial problem
    with open("tutorials/data/basics_tutorial.pkl", "rb") as f:
        X, Y, L, D = pickle.load(f)

    Xs, Ys, Ls, Ds = split_data(
        X, Y, L, D, splits=[0.8, 0.1, 0.1], stratify_by=Y, seed=123
    )

    label_model = LabelModel(k=2, seed=123)
    label_model.train_model(Ls[0], Y_dev=Ys[1], n_epochs=500, log_train_every=25)
    Y_train_ps = label_model.predict_proba(Ls[0])

    # Creating a really large end model to use lots of memory
    end_model = EndModel([1000, 100000, 2], seed=123, device="cuda")

    # Getting initial GPU storage use
    initial_gpu_mem = GPUtil.getGPUs()[0].memoryUsed

    # Training model
    end_model.train_model(
        (Xs[0], Y_train_ps),
        valid_data=(Xs[1], Ys[1]),
        l2=0.1,
        batch_size=256,
        n_epochs=3,
        log_train_every=1,
        validation_metric="f1",
    )

    # Final GPU storage use
    final_gpu_mem = GPUtil.getGPUs()[0].memoryUsed

    # On a Titan X, this model uses ~ 3 GB of memory
    gpu_mem_difference = final_gpu_mem - initial_gpu_mem
    self.assertGreater(gpu_mem_difference, 1000)
def train_baseline_model(
    train_matrix, dev_matrix, dev_labels, test_matrix,
    lf_indices, regularization_grid, train_marginal_dir,
    write_file=False
):
    grid_results = {}
    dev_grid_results = {}
    test_grid_results = {}

    # Grid search over the regularization strength, scored on the dev set.
    for param in regularization_grid:
        label_model = LabelModel(k=2)
        label_model.train_model(
            train_matrix[:, lf_indices],
            n_epochs=1000,
            log_train_every=200,
            seed=100,
            lr=0.01,
            l2=param,
            verbose=False,
            #Y_dev=dev_labels
        )
        # ROC AUC of the positive-class marginals against the dev labels
        # (assumed selection metric; column 0 assumed to be the positive class).
        dev_marginals = label_model.predict_proba(dev_matrix[:, lf_indices])
        grid_results[param] = roc_auc_score(dev_labels, dev_marginals[:, 0])

    # Retrain with the regularization strength that scored best on dev.
    best_param = max(grid_results, key=grid_results.get)
    label_model.train_model(
        train_matrix[:, lf_indices],
        n_epochs=1000,
        log_train_every=200,
        seed=50,
        lr=0.01,
        l2=best_param,
        verbose=False,
        #Y_dev=dev_labels
    )

    if write_file:
        (
            pd.DataFrame(
                label_model.predict_proba(train_matrix[:, lf_indices]),
                columns=["pos_class_marginals", "neg_class_marginals"]
            )
            .to_csv(
                f"{train_marginal_dir}baseline_marginals.tsv.xz",
                compression="xz", index=False, sep="\t"
            )
        )

    dev_grid_results[best_param] = label_model.predict_proba(dev_matrix[:, lf_indices])
    test_grid_results[best_param] = label_model.predict_proba(test_matrix[:, lf_indices])

    return dev_grid_results, test_grid_results
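# Hypothetical call for the baseline model: the matrices, label vector,
# LF index list, and output directory are placeholders for whatever the
# surrounding pipeline defines (regularization_grid as defined elsewhere
# in these snippets).
baseline_lf_indices = list(range(7))  # e.g. the distant-supervision LFs
dev_res, test_res = train_baseline_model(
    L_train, L_dev, dev_labels, L_test,
    baseline_lf_indices, regularization_grid,
    "data/train_marginals/", write_file=True)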
def getTrainedModel1(self):
    # We build a matrix of LF votes for each comment ticket
    LF_matrix = self.make_Ls_matrix(self.LF_set['comments'], self.LFs)

    # Get true labels for LF set
    Y_LF_set = np.array(self.LF_set['resolution'])

    display(lf_summary(sparse.csr_matrix(LF_matrix),
                       Y=Y_LF_set,
                       lf_names=self.LF_names.values()))

    print(f"label coverage: {label_coverage(LF_matrix)}")

    mv = MajorityLabelVoter()
    Y_train_majority_votes = mv.predict(LF_matrix)
    print("classification report:\n" +
          classification_report(Y_LF_set, Y_train_majority_votes))

    Ls_train = self.make_Ls_matrix(self.train, self.LFs)

    # You can tune the learning rate and class balance.
    model = LabelModel(k=2, seed=123)
    model.train_model(Ls_train, n_epochs=2000, print_every=1000, lr=0.0001,
                      class_balance=np.array([0.2, 0.8]))
    Y_train = model.predict(Ls_train)

    print('Trained Label Model Metrics:')
    # Evaluate against the hand-labeled LF set, which is the split with gold labels.
    scores = model.score((sparse.csr_matrix(LF_matrix), Y_LF_set),
                         metric=['accuracy', 'precision', 'recall', 'f1'])
    print(scores)

    return model, Y_train
L_train = labeler.get_label_matrices(train_cands)
L_gold_train = labeler.get_gold_labels(train_cands, annotator="gold")

from metal import analysis

analysis.lf_summary(
    L_train[0],
    lf_names=labeler.get_keys(),
    Y=L_gold_train[0].todense().reshape(-1).tolist()[0],
)

from metal.label_model import LabelModel

gen_model = LabelModel(k=2)
gen_model.train_model(L_train[0], n_epochs=500, print_every=100)
train_marginals = gen_model.predict_proba(L_train[0])

from fonduer.learning import LogisticRegression

disc_model = LogisticRegression()
disc_model.train((train_cands[0], F_train[0]), train_marginals,
                 n_epochs=10, lr=0.001)

from my_fonduer_model import MyFonduerModel

model = MyFonduerModel()

import fonduer_model

fonduer_model.save_model(
    fonduer_model=model,
    model_path="fonduer_model",
def train_model(args): #global args #args = parser.parse_args() hidden_size = 128 num_classes = 2 encode_dim = 1000 # using get_frm_output_size() L, Y = load_labels(args) # Label Model # labelling functions analysis print(lf_summary(L["dev"], Y=Y["dev"])) # training label model label_model = LabelModel(k=num_classes, seed=123) label_model.train_model(L["train"], Y["dev"], n_epochs=2000, log_train_every=100) # evaluating label model print('Trained Label Model Metrics:') label_model.score((L["dev"], Y["dev"]), metric=['accuracy', 'precision', 'recall', 'f1']) # comparison with majority vote of LFs mv = MajorityLabelVoter(seed=123) print('Majority Label Voter Metrics:') mv.score((L["dev"], Y["dev"]), metric=['accuracy', 'precision', 'recall', 'f1']) Ytrain_p = label_model.predict_proba(L["train"]) #print(Ytrain_ps.shape) #(377*50,2) #Ydev_p = label_model.predict_proba(L["dev"]) # test models #label_model.score((Ltest,Ytest), metric=['accuracy','precision', 'recall', 'f1']) # End Model # Create datasets and dataloaders train, dev, test = load_dataset(args, Ytrain_p, Y["dev"], Y["test"]) data_loader = get_data_loader(train, dev, test, args.batch_size, args.num_workers) #print(len(data_loader["train"])) # 18850 / batch_size #print(len(data_loader["dev"])) # 1500 / batch_size #print(len(data_loader["test"])) # 1000 / batch_size #import ipdb; ipdb.set_trace() # Define input encoder #cnn_encoder = FrameEncoderOC cnn_encoder = FrameEncoderOCDense if (torch.cuda.is_available()): device = 'cuda' else: device = 'cpu' #import ipdb; ipdb.set_trace() # Define LSTM module lstm_module = LSTMModule( encode_dim, hidden_size, bidirectional=False, verbose=False, lstm_reduction=args.lstm_reduction, encoder_class=cnn_encoder, encoder_kwargs={"requires_grad": args.requires_grad}) ''' # Define end model end_model = EndModel( input_module=lstm_module, layer_out_dims=[hidden_size, num_classes], optimizer="adam", #use_cuda=cuda, batchnorm=False, seed=args.seed, verbose=False, device = device, ) ''' init_kwargs = { "layer_out_dims": [hidden_size, num_classes], "input_module": lstm_module, "optimizer": "adam", "verbose": False, "input_batchnorm": False, "use_cuda": cuda, 'seed': args.seed, 'device': device } end_model = EndModel(**init_kwargs) if not os.path.exists(args.checkpoint_dir): os.mkdir(args.checkpoint_dir) with open(args.checkpoint_dir + '/init_kwargs.pickle', "wb") as f: pickle.dump(init_kwargs, f, protocol=pickle.HIGHEST_PROTOCOL) dropout = 0.4 # Train end model end_model.train_model( train_data=data_loader["train"], valid_data=data_loader["dev"], l2=args.weight_decay, lr=args.lr, n_epochs=args.n_epochs, log_train_every=1, verbose=True, progress_bar=True, loss_weights=[0.55, 0.45], input_dropout=0.1, middle_dropout=dropout, checkpoint_dir=args.checkpoint_dir, #writer = "json", #writer_config = { #"log_dir": args.log_dir, #"run_dir": args.run_dir, #"run_name": args.run_name, #"writer_metrics": ['accuracy','precision', 'recall', 'f1','roc-auc','ndcg'] #}, #validation_metric='f1', ) # evaluate end model print("Dev Set Performance") end_model.score( data_loader["dev"], verbose=True, metric=['accuracy', 'precision', 'recall', 'f1', 'roc-auc', 'ndcg']) print("Test Set Performance") end_model.score( data_loader["test"], verbose=True, metric=['accuracy', 'precision', 'recall', 'f1', 'roc-auc', 'ndcg'])
def train_model(args): #global args #args = parser.parse_args() hidden_size = 128 num_classes = 2 encode_dim = 1000 # using get_frm_output_size() L,Y = load_labels(args) # Label Model # labelling functions analysis print(lf_summary(L["dev"], Y = Y["dev"])) # training label model label_model = LabelModel(k=num_classes, seed=123) label_model.train_model(L["train"], Y["dev"], n_epochs = 500, log_train_every = 50) # evaluating label model print('Trained Label Model Metrics:') label_model.score((L["dev"], Y["dev"]), metric=['accuracy','precision', 'recall', 'f1']) # comparison with majority vote of LFs mv = MajorityLabelVoter(seed=123) print('Majority Label Voter Metrics:') mv.score((L["dev"], Y["dev"]), metric=['accuracy','precision', 'recall', 'f1']) Ytrain_p = label_model.predict_proba(L["train"]) #print(Ytrain_ps.shape) #(377*50,2) #Ydev_p = label_model.predict_proba(L["dev"]) # test models #label_model.score((Ltest,Ytest), metric=['accuracy','precision', 'recall', 'f1']) # End Model # Create datasets and dataloaders train, dev, test = load_dataset(args, Ytrain_p, Y["dev"], Y["test"]) data_loader = get_data_loader(train, dev, test, args.batch_size, args.num_workers) #print(len(data_loader["train"])) # 18850 / batch_size #print(len(data_loader["dev"])) # 1500 / batch_size #print(len(data_loader["test"])) # 1000 / batch_size #import ipdb; ipdb.set_trace() # Define input encoder cnn_encoder = FrameEncoderOC if(torch.cuda.is_available()): device = 'cuda' else: device = 'cpu' #import ipdb; ipdb.set_trace() # Define LSTM module lstm_module = LSTMModule( encode_dim, hidden_size, bidirectional=False, verbose=False, lstm_reduction="attention", encoder_class=cnn_encoder, ) # Define end model end_model = EndModel( input_module=lstm_module, layer_out_dims=[hidden_size, num_classes], optimizer="adam", #use_cuda=cuda, batchnorm=True, seed=123, verbose=False, device = device, ) #print('Training model') #tic = time.time() dropout = 0.4 # Train end model end_model.train_model( train_data=data_loader["train"], valid_data=data_loader["dev"], l2=args.weight_decay, lr=args.lr, n_epochs=args.n_epochs, log_train_every=1, verbose=True, progress_bar = True, loss_weights = [0.45,0.55], batchnorm = 'True', input_dropout = dropout, middle_dropout = dropout, #validation_metric='f1', ) #print('Time taken for training:') #print(time.time() - tic) # evaluate end model end_model.score(data_loader["dev"], verbose=True, metric=['accuracy','precision', 'recall', 'f1'])
    zip([L[:, :7], L[:, :24], L], [L_dev[:, :7], L_dev[:, :24], L_dev]))
test_data = list(
    zip([L[:, :7], L[:, :24], L], [L_test[:, :7], L_test[:, :24], L_test]))
model_labels = ["Distant Supervision (DS)", "DS+User Defined Rules", "All"]

# In[15]:

model_grid_search = {}
for model_data, model_label in zip(validation_data, model_labels):
    label_model = LabelModel(k=2, seed=100)
    grid_results = {}
    for param in regularization_grid:
        label_model.train_model(model_data[0], n_epochs=1000, verbose=False,
                                lr=0.01, l2=param)
        grid_results[str(param)] = label_model.predict_proba(model_data[1])[:, 0]
    model_grid_search[model_label] = pd.DataFrame.from_dict(grid_results)

# In[16]:

model_grid_aucs = {}
for model in model_grid_search:
    model_grid_aucs[model] = plot_curve(model_grid_search[model],
                                        candidate_dfs['dev'].curated_dsh,
                                        figsize=(16, 6),
                                        model_type='scatterplot',
def test_e2e(caplog): """Run an end-to-end test on documents of the hardware domain.""" caplog.set_level(logging.INFO) PARALLEL = 4 max_docs = 12 session = Meta.init("postgresql://localhost:5432/" + DB).Session() docs_path = "tests/data/html/" pdf_path = "tests/data/pdf/" doc_preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs) corpus_parser = Parser( session, parallelism=PARALLEL, structural=True, lingual=True, visual=True, pdf_path=pdf_path, ) corpus_parser.apply(doc_preprocessor) assert session.query(Document).count() == max_docs num_docs = session.query(Document).count() logger.info(f"Docs: {num_docs}") assert num_docs == max_docs num_sentences = session.query(Sentence).count() logger.info(f"Sentences: {num_sentences}") # Divide into test and train docs = sorted(corpus_parser.get_documents()) last_docs = sorted(corpus_parser.get_last_documents()) ld = len(docs) assert ld == len(last_docs) assert len(docs[0].sentences) == len(last_docs[0].sentences) assert len(docs[0].sentences) == 799 assert len(docs[1].sentences) == 663 assert len(docs[2].sentences) == 784 assert len(docs[3].sentences) == 661 assert len(docs[4].sentences) == 513 assert len(docs[5].sentences) == 700 assert len(docs[6].sentences) == 528 assert len(docs[7].sentences) == 161 assert len(docs[8].sentences) == 228 assert len(docs[9].sentences) == 511 assert len(docs[10].sentences) == 331 assert len(docs[11].sentences) == 528 # Check table numbers assert len(docs[0].tables) == 9 assert len(docs[1].tables) == 9 assert len(docs[2].tables) == 14 assert len(docs[3].tables) == 11 assert len(docs[4].tables) == 11 assert len(docs[5].tables) == 10 assert len(docs[6].tables) == 10 assert len(docs[7].tables) == 2 assert len(docs[8].tables) == 7 assert len(docs[9].tables) == 10 assert len(docs[10].tables) == 6 assert len(docs[11].tables) == 9 # Check figure numbers assert len(docs[0].figures) == 32 assert len(docs[1].figures) == 11 assert len(docs[2].figures) == 38 assert len(docs[3].figures) == 31 assert len(docs[4].figures) == 7 assert len(docs[5].figures) == 38 assert len(docs[6].figures) == 10 assert len(docs[7].figures) == 31 assert len(docs[8].figures) == 4 assert len(docs[9].figures) == 27 assert len(docs[10].figures) == 5 assert len(docs[11].figures) == 27 # Check caption numbers assert len(docs[0].captions) == 0 assert len(docs[1].captions) == 0 assert len(docs[2].captions) == 0 assert len(docs[3].captions) == 0 assert len(docs[4].captions) == 0 assert len(docs[5].captions) == 0 assert len(docs[6].captions) == 0 assert len(docs[7].captions) == 0 assert len(docs[8].captions) == 0 assert len(docs[9].captions) == 0 assert len(docs[10].captions) == 0 assert len(docs[11].captions) == 0 train_docs = set() dev_docs = set() test_docs = set() splits = (0.5, 0.75) data = [(doc.name, doc) for doc in docs] data.sort(key=lambda x: x[0]) for i, (doc_name, doc) in enumerate(data): if i < splits[0] * ld: train_docs.add(doc) elif i < splits[1] * ld: dev_docs.add(doc) else: test_docs.add(doc) logger.info([x.name for x in train_docs]) # NOTE: With multi-relation support, return values of getting candidates, # mentions, or sparse matrices are formatted as a list of lists. This means # that with a single relation, we need to index into the list of lists to # get the candidates/mentions/sparse matrix for a particular relation or # mention. 
# Mention Extraction part_ngrams = MentionNgramsPart(parts_by_doc=None, n_max=3) temp_ngrams = MentionNgramsTemp(n_max=2) volt_ngrams = MentionNgramsVolt(n_max=1) Part = mention_subclass("Part") Temp = mention_subclass("Temp") Volt = mention_subclass("Volt") mention_extractor = MentionExtractor( session, [Part, Temp, Volt], [part_ngrams, temp_ngrams, volt_ngrams], [part_matcher, temp_matcher, volt_matcher], ) mention_extractor.apply(docs, parallelism=PARALLEL) assert session.query(Part).count() == 299 assert session.query(Temp).count() == 147 assert session.query(Volt).count() == 140 assert len(mention_extractor.get_mentions()) == 3 assert len(mention_extractor.get_mentions()[0]) == 299 assert ( len( mention_extractor.get_mentions( docs=[session.query(Document).filter(Document.name == "112823").first()] )[0] ) == 70 ) # Candidate Extraction PartTemp = candidate_subclass("PartTemp", [Part, Temp]) PartVolt = candidate_subclass("PartVolt", [Part, Volt]) candidate_extractor = CandidateExtractor( session, [PartTemp, PartVolt], throttlers=[temp_throttler, volt_throttler] ) for i, docs in enumerate([train_docs, dev_docs, test_docs]): candidate_extractor.apply(docs, split=i, parallelism=PARALLEL) assert session.query(PartTemp).filter(PartTemp.split == 0).count() == 3684 assert session.query(PartTemp).filter(PartTemp.split == 1).count() == 72 assert session.query(PartTemp).filter(PartTemp.split == 2).count() == 448 assert session.query(PartVolt).count() == 4282 # Grab candidate lists train_cands = candidate_extractor.get_candidates(split=0, sort=True) dev_cands = candidate_extractor.get_candidates(split=1, sort=True) test_cands = candidate_extractor.get_candidates(split=2, sort=True) assert len(train_cands) == 2 assert len(train_cands[0]) == 3684 assert ( len( candidate_extractor.get_candidates( docs=[session.query(Document).filter(Document.name == "112823").first()] )[0] ) == 1496 ) # Featurization featurizer = Featurizer(session, [PartTemp, PartVolt]) # Test that FeatureKey is properly reset featurizer.apply(split=1, train=True, parallelism=PARALLEL) assert session.query(Feature).count() == 225 assert session.query(FeatureKey).count() == 1179 # Test Dropping FeatureKey # Should force a row deletion featurizer.drop_keys(["DDL_e1_W_LEFT_POS_3_[NFP NN NFP]"]) assert session.query(FeatureKey).count() == 1178 # Should only remove the part_volt as a relation and leave part_temp assert set( session.query(FeatureKey) .filter(FeatureKey.name == "DDL_e1_LEMMA_SEQ_[bc182]") .one() .candidate_classes ) == {"part_temp", "part_volt"} featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"], candidate_classes=[PartVolt]) assert session.query(FeatureKey).filter( FeatureKey.name == "DDL_e1_LEMMA_SEQ_[bc182]" ).one().candidate_classes == ["part_temp"] assert session.query(FeatureKey).count() == 1178 # Removing the last relation from a key should delete the row featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"], candidate_classes=[PartTemp]) assert session.query(FeatureKey).count() == 1177 session.query(Feature).delete() session.query(FeatureKey).delete() featurizer.apply(split=0, train=True, parallelism=PARALLEL) assert session.query(Feature).count() == 6669 assert session.query(FeatureKey).count() == 4161 F_train = featurizer.get_feature_matrices(train_cands) assert F_train[0].shape == (3684, 4161) assert F_train[1].shape == (2985, 4161) assert len(featurizer.get_keys()) == 4161 featurizer.apply(split=1, parallelism=PARALLEL) assert session.query(Feature).count() == 6894 assert session.query(FeatureKey).count() == 4161 
F_dev = featurizer.get_feature_matrices(dev_cands) assert F_dev[0].shape == (72, 4161) assert F_dev[1].shape == (153, 4161) featurizer.apply(split=2, parallelism=PARALLEL) assert session.query(Feature).count() == 8486 assert session.query(FeatureKey).count() == 4161 F_test = featurizer.get_feature_matrices(test_cands) assert F_test[0].shape == (448, 4161) assert F_test[1].shape == (1144, 4161) gold_file = "tests/data/hardware_tutorial_gold.csv" load_hardware_labels(session, PartTemp, gold_file, ATTRIBUTE, annotator_name="gold") assert session.query(GoldLabel).count() == 4204 load_hardware_labels(session, PartVolt, gold_file, ATTRIBUTE, annotator_name="gold") assert session.query(GoldLabel).count() == 8486 stg_temp_lfs = [ LF_storage_row, LF_operating_row, LF_temperature_row, LF_tstg_row, LF_to_left, LF_negative_number_left, ] ce_v_max_lfs = [ LF_bad_keywords_in_row, LF_current_in_row, LF_non_ce_voltages_in_row, ] labeler = Labeler(session, [PartTemp, PartVolt]) with pytest.raises(ValueError): labeler.apply(split=0, lfs=stg_temp_lfs, train=True, parallelism=PARALLEL) labeler.apply( split=0, lfs=[stg_temp_lfs, ce_v_max_lfs], train=True, parallelism=PARALLEL ) assert session.query(Label).count() == 6669 assert session.query(LabelKey).count() == 9 L_train = labeler.get_label_matrices(train_cands) assert L_train[0].shape == (3684, 9) assert L_train[1].shape == (2985, 9) assert len(labeler.get_keys()) == 9 L_train_gold = labeler.get_gold_labels(train_cands) assert L_train_gold[0].shape == (3684, 1) L_train_gold = labeler.get_gold_labels(train_cands, annotator="gold") assert L_train_gold[0].shape == (3684, 1) gen_model = LabelModel(k=2) gen_model.train_model(L_train[0], n_epochs=500, print_every=100) train_marginals = gen_model.predict_proba(L_train[0])[:, 1] disc_model = LogisticRegression() disc_model.train( (train_cands[0], F_train[0]), train_marginals, n_epochs=20, lr=0.001 ) test_score = disc_model.predictions((test_cands[0], F_test[0]), b=0.6) true_pred = [test_cands[0][_] for _ in np.nditer(np.where(test_score > 0))] pickle_file = "tests/data/parts_by_doc_dict.pkl" with open(pickle_file, "rb") as f: parts_by_doc = pickle.load(f) (TP, FP, FN) = entity_level_f1( true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc ) tp_len = len(TP) fp_len = len(FP) fn_len = len(FN) prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan") rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan") f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan") logger.info(f"prec: {prec}") logger.info(f"rec: {rec}") logger.info(f"f1: {f1}") assert f1 < 0.7 and f1 > 0.3 stg_temp_lfs_2 = [ LF_to_left, LF_test_condition_aligned, LF_collector_aligned, LF_current_aligned, LF_voltage_row_temp, LF_voltage_row_part, LF_typ_row, LF_complement_left_row, LF_too_many_numbers_row, LF_temp_on_high_page_num, LF_temp_outside_table, LF_not_temp_relevant, ] labeler.update(split=0, lfs=[stg_temp_lfs_2, ce_v_max_lfs], parallelism=PARALLEL) assert session.query(Label).count() == 6669 assert session.query(LabelKey).count() == 16 L_train = labeler.get_label_matrices(train_cands) assert L_train[0].shape == (3684, 16) gen_model = LabelModel(k=2) gen_model.train_model(L_train[0], n_epochs=500, print_every=100) train_marginals = gen_model.predict_proba(L_train[0])[:, 1] disc_model = LogisticRegression() disc_model.train( (train_cands[0], F_train[0]), train_marginals, n_epochs=20, lr=0.001 ) test_score = disc_model.predictions((test_cands[0], F_test[0]), b=0.6) true_pred = 
[test_cands[0][_] for _ in np.nditer(np.where(test_score > 0))] (TP, FP, FN) = entity_level_f1( true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc ) tp_len = len(TP) fp_len = len(FP) fn_len = len(FN) prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan") rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan") f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan") logger.info(f"prec: {prec}") logger.info(f"rec: {rec}") logger.info(f"f1: {f1}") assert f1 > 0.7 # Testing LSTM disc_model = LSTM() disc_model.train( (train_cands[0], F_train[0]), train_marginals, n_epochs=5, lr=0.001 ) test_score = disc_model.predictions((test_cands[0], F_test[0]), b=0.6) true_pred = [test_cands[0][_] for _ in np.nditer(np.where(test_score > 0))] (TP, FP, FN) = entity_level_f1( true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc ) tp_len = len(TP) fp_len = len(FP) fn_len = len(FN) prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan") rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan") f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan") logger.info(f"prec: {prec}") logger.info(f"rec: {rec}") logger.info(f"f1: {f1}") assert f1 > 0.7 # Testing Sparse Logistic Regression disc_model = SparseLogisticRegression() disc_model.train( (train_cands[0], F_train[0]), train_marginals, n_epochs=20, lr=0.001 ) test_score = disc_model.predictions((test_cands[0], F_test[0]), b=0.6) true_pred = [test_cands[0][_] for _ in np.nditer(np.where(test_score > 0))] (TP, FP, FN) = entity_level_f1( true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc ) tp_len = len(TP) fp_len = len(FP) fn_len = len(FN) prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan") rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan") f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan") logger.info(f"prec: {prec}") logger.info(f"rec: {rec}") logger.info(f"f1: {f1}") assert f1 > 0.7 # Testing Sparse LSTM disc_model = SparseLSTM() disc_model.train( (train_cands[0], F_train[0]), train_marginals, n_epochs=5, lr=0.001 ) test_score = disc_model.predictions((test_cands[0], F_test[0]), b=0.6) true_pred = [test_cands[0][_] for _ in np.nditer(np.where(test_score > 0))] (TP, FP, FN) = entity_level_f1( true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc ) tp_len = len(TP) fp_len = len(FP) fn_len = len(FN) prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan") rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan") f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan") logger.info(f"prec: {prec}") logger.info(f"rec: {rec}") logger.info(f"f1: {f1}") assert f1 > 0.7
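# The precision/recall/F1 arithmetic above is repeated verbatim for each
# discriminative model. A small helper like the following (a sketch, not part
# of the original test) could compute it once from the entity_level_f1 output:
def entity_prf1(TP, FP, FN):
    """Entity-level precision, recall, and F1 from TP/FP/FN candidate sets."""
    tp_len, fp_len, fn_len = len(TP), len(FP), len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")
    return prec, rec, f1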
# Here in this section we are using the distant supervision paradigm to label our candidate sentences.

# ## Grid Search

# In[18]:

regularization_grid = pd.np.round(pd.np.linspace(0.01, 5, num=15), 2)

# In[19]:

grid_results = {}
label_model = LabelModel(k=2)
for param in tqdm_notebook(regularization_grid):
    label_model.train_model(correct_L[:, 0:7], n_epochs=1000, print_every=200,
                            seed=100, lr=0.01, l2=param)
    grid_results[str(param)] = label_model.predict_proba(correct_L_train[:, 0:7])

# In[20]:

acc_results = defaultdict(list)
for key in grid_results:
    acc_results[key].append(
        accuracy_score(
            candidate_dfs['train']['curated_dsh'].fillna(0),
            list(map(lambda x: 1 if x > 0.5 else 0, grid_results[key][:, 0]))))

acc_df = pd.DataFrame(acc_results)
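# One way to read off the winning L2 value from the accuracy grid above
# (a sketch, not from the original notebook): acc_df holds a single row of
# accuracies with one column per regularization value.
best_param = float(acc_df.loc[0].idxmax())
print(f"best l2: {best_param}, accuracy: {acc_df.loc[0].max():.3f}")

# Refit the label model with the selected regularization strength.
label_model.train_model(correct_L[:, 0:7], n_epochs=1000, print_every=200,
                        seed=100, lr=0.01, l2=best_param)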
def train_model(args): #global args #args = parser.parse_args() hidden_size = 128 num_classes = 2 encode_dim = 108 # using get_frm_output_size() if (torch.cuda.is_available()): device = torch.device('cuda:0') #device = 'cuda' else: device = 'cpu' #print(device) L, Y = load_labels(args) # Label Model # labelling functions analysis print(lf_summary(L["dev"], Y=Y["dev"])) # majority vote of LFs mv = MajorityLabelVoter(seed=123) print('Majority Label Voter Metrics:') mv.score((L["dev"], Y["dev"]), metric=['accuracy', 'precision', 'recall', 'f1']) # training label model - no temporal modelling label_model = LabelModel(k=num_classes, seed=123) label_model.train_model(L["train"], Y["dev"], n_epochs=500, log_train_every=50) # evaluating label model print('Trained Label Model Metrics:') label_model.score((L["dev"], Y["dev"]), metric=['accuracy', 'precision', 'recall', 'f1']) # training label model without temporal modelling # naive model #print(L["train"].todense().shape) # (18850,5) #print(L["dev"].todense().shape) # (1500,5) #print(Y["dev"].shape) # (1500,) m_per_task = L["train"].todense().shape[1] # 5 MRI_data_naive = { 'Li_train': torch.FloatTensor(np.array(L["train"].todense().astype('int_'))), 'Li_dev': torch.FloatTensor(np.array(L["dev"].todense())), 'R_dev': Y["dev"] } MRI_data_naive['class_balance'] = torch.FloatTensor([0.5, 0.5]).to(device) # training naive model naive_model = DPLabelModel( m=m_per_task, T=1, edges=[], coverage_sets=[[ 0, ]] * m_per_task, mu_sharing=[[ i, ] for i in range(m_per_task)], phi_sharing=[], device=device, #class_balance=MRI_data_naive['class_balance'], seed=0) optimize(naive_model, L_hat=MRI_data_naive['Li_train'], num_iter=300, lr=1e-3, momentum=0.8, clamp=True, seed=0) # evaluating naive model R_pred = naive_model.predict(MRI_data_naive['Li_dev']).data.numpy() R_pred = 2 - R_pred #print(R_pred) #print(MRI_data_naive['R_dev']) for metric in ['accuracy', 'f1', 'recall', 'precision']: score = metric_score(MRI_data_naive['R_dev'], R_pred, metric) print(f"{metric.capitalize()}: {score:.3f}") # training label model with temporal modelling # reshaping dataset num_frames = 50 n_patients_train = round(L["train"].todense().shape[0] / num_frames) #(377) n_patients_dev = round(L["dev"].todense().shape[0] / num_frames) #(30) Ltrain = np.reshape(np.array(L["train"].todense()), (n_patients_train, num_frames, -1)) Ldev = np.reshape(np.array(L["dev"].todense()), (n_patients_dev, num_frames, -1)) Ydev = np.reshape(Y["dev"], (n_patients_dev, num_frames)) # print(Ltrain.shape) # (377,50,5) #print(Ldev.shape) # (30,50,5) #print(Ydev.shape) # (30,50) # subsampling # selecting frames 3,13,23,33,43 indices = np.linspace(2, 42, 5).astype(int) m_per_task = 5 T = 5 Ltrain_small = Ltrain[:, indices, :] # shape (377,5,5) Ldev_small = Ldev[:, indices, :] # shape (30,5,5) Ydev_small = Ydev[:, indices] # shape (30,5) Ltrain_small = np.reshape( Ltrain_small, ((n_patients_train * T), m_per_task)) # shape (1885,5) Ldev_small = np.reshape( Ldev_small, ((n_patients_dev * T), m_per_task)) # shape (150,5) Ydev_small = np.reshape(Ydev_small, ((n_patients_dev * T), )) # shape (150,) MRI_data_temporal = { 'Li_train': torch.LongTensor(Ltrain_small).view(n_patients_train, (m_per_task * T)), 'Li_dev': torch.LongTensor(Ldev_small).view(n_patients_dev, (m_per_task * T)), 'R_dev': torch.LongTensor(Ydev_small)[::T] * (2**T - 1), 'm': m_per_task * T, 'T': T } MRI_data_temporal['class_balance'] = normalize( (MRI_data_temporal['R_dev'].unsqueeze(1) == torch.arange( 2**T, 
device=device).unsqueeze(0)).sum(0).float(), dim=0, p=1) max_seed = 10 temporal_models = [ None, ] * max_seed for seed in range(max_seed): markov_model = DPLabelModel( m=m_per_task * T, T=T, edges=[(i, i + m_per_task) for i in range((T - 1) * m_per_task)], coverage_sets=[[ t, ] for t in range(T) for _ in range(m_per_task)], mu_sharing=[[t * m_per_task + i for t in range(T)] for i in range(m_per_task)], phi_sharing=[[(t * m_per_task + i, (t + 1) * m_per_task + i) for t in range(T - 1)] for i in range(m_per_task)], device=device, class_balance=MRI_data_temporal['class_balance'], seed=seed) optimize(markov_model, L_hat=MRI_data_temporal['Li_train'], num_iter=1000, lr=1e-5, momentum=0.8, clamp=True, verbose=False, seed=seed) temporal_models[seed] = markov_model for seed, model in enumerate(temporal_models): R_pred = model.predict(MRI_data_temporal['Li_dev'].cpu()) F1 = metric_score(MRI_data_temporal['R_dev'].cpu() > 0, R_pred.cpu() > 0, 'f1') accuracy = metric_score(MRI_data_temporal['R_dev'].cpu(), R_pred.cpu(), 'accuracy') print(f"seed={seed} accuracy={accuracy:.3f} F1={F1:.3f}")
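# A possible follow-up (not in the original): keep the seed whose Markov model
# scores best on the dev split, reusing the same metric_score call as above.
dev_f1s = [
    metric_score(MRI_data_temporal['R_dev'].cpu() > 0,
                 model.predict(MRI_data_temporal['Li_dev'].cpu()).cpu() > 0,
                 'f1')
    for model in temporal_models
]
best_seed = int(np.argmax(dev_f1s))
best_markov_model = temporal_models[best_seed]
print(f"best seed={best_seed} F1={dev_f1s[best_seed]:.3f}")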
print(lf_summary(Ls[1], Y=Ys[1]))

balance = sorted(Counter(Y_test).items())
balance2 = Counter(Y_test).values()
new_balance = []
for elem in balance:
    new_balance.append(elem[1] / sum(balance2))

print(sorted(Counter(Y_test).items()))
print(balance)
print(new_balance)

label_model = LabelModel(k=2, seed=123)
label_model.train_model(Ls[0], class_balance=new_balance, n_epochs=500,
                        log_train_every=50)
score = label_model.score((Ls[1], Ys[1]))

print('Trained Label Model Metrics:')
scores = label_model.score((Ls[1], Ys[1]),
                           metric=['accuracy', 'precision', 'recall', 'f1'])

mv = MajorityLabelVoter(seed=123)
print('Majority Label Voter Metrics:')
scores = mv.score((Ls[1], Ys[1]),
                  metric=['accuracy', 'precision', 'recall', 'f1'])

Y_train_ps = label_model.predict_proba(Ls[0])
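# If a downstream model needs hard labels rather than marginals, one option
# (assuming MeTaL's 1..k label convention, so column 0 <-> class 1) is:
Y_train_hard = Y_train_ps.argmax(axis=1) + 1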
L_train = labeler.get_label_matrices(train_cands)
L_gold_train = labeler.get_gold_labels(train_cands, annotator="gold")

from metal import analysis

analysis.lf_summary(
    L_train[0],
    lf_names=labeler.get_keys(),
    Y=L_gold_train[0].todense().reshape(-1).tolist()[0],
)

from metal.label_model import LabelModel

gen_model = LabelModel(k=2)
gen_model.train_model(L_train[0], n_epochs=500, verbose=False)
train_marginals = gen_model.predict_proba(L_train[0])

from fonduer.learning import LogisticRegression

disc_model = LogisticRegression()
disc_model.train((train_cands[0], F_train[0]), train_marginals,
                 n_epochs=10, lr=0.001)

from my_fonduer_model import MyFonduerModel

model = MyFonduerModel()
code_paths = [
    return transformed_data


train_ground = remap_labels(loader.train_ground)
val_ground = remap_labels(loader.val_ground)

L_train_sparse = sparse.csc_matrix(
    (remap_labels(L_train_sparse.data),
     L_train_sparse.indices,
     L_train_sparse.indptr)).T
L_val_sparse = sparse.csc_matrix(
    (remap_labels(L_val_sparse.data),
     L_val_sparse.indices,
     L_val_sparse.indptr)).T

print('\n\n####### Running METAL Label Model ########')
label_model = LabelModel()
label_model.train_model(L_train_sparse, n_epochs=200, print_every=50,
                        seed=123, verbose=False)
train_marginals = label_model.predict_proba(L_train_sparse)
label_model.score((L_train_sparse, train_ground), metric=metrics)

####### METAL with Exact Class Balance ########
print('\n\n####### Running METAL Label Model with exact class balance ########')
train_class_balance = np.array([
    np.sum(train_ground == 1) / loader.train_num,
    np.sum(train_ground == 2) / loader.train_num
])
val_class_balance = np.array([
    np.sum(val_ground == 1) / loader.val_num,
    np.sum(val_ground == 2) / loader.val_num
class SnorkeMeTalCollator(Collator): def __init__( self, positive_label: str, class_cardinality: int = 2, num_epochs: int = 500, log_train_every: int = 50, seed: int = 123, ): self.positive_label = positive_label self.class_cardinality = class_cardinality self.num_epochs = num_epochs self.log_train_every = log_train_every self.seed = seed self.label_model = LabelModel(k=self.class_cardinality, seed=seed) @classmethod def get_snorkel_index(cls, tag: str) -> int: if is_positive(tag): return 2 elif is_negative(tag): return 1 else: return 0 def get_tag(self, index: int) -> str: if index == 1: return self.positive_label else: return NEGATIVE_LABEL def get_index(self, prob: np.ndarray) -> str: assert prob.shape == (2, ) return prob.argmax() def collate_np(self, annotations) -> Tuple[np.ndarray, List[str], List[int]]: output_arrs: List[np.ndarray] = [] words_list: List[str] = [] id_to_labels: Dict[int, Tuple[int, int]] = {} num_funcs = len(annotations) for i, ann_inst in tqdm(enumerate(zip(*annotations))): ids = [inst['id'] for inst in ann_inst] inputs = [inst['input'] for inst in ann_inst] outputs = [inst['output'] for inst in ann_inst] input_len = len(inputs[0]) entry_id = ids[0] # output arr = (sentence x num_labels) output_arr = np.zeros((input_len, num_funcs)) for i, output in enumerate(outputs): for j, out_j in enumerate(output): output_arr[j, i] = SnorkeMeTalCollator.get_snorkel_index( out_j) label_start = len(words_list) for word_i, word in enumerate(inputs[0]): words_list.append(word) output_arrs.append(output_arr) label_end = len(words_list) id_to_labels[entry_id] = (label_start, label_end) output_res = np.concatenate(output_arrs, axis=0) return output_res, words_list, id_to_labels def train_label_model( self, collated_labels: np.ndarray, descriptions: Optional[List[str]], train_data_np: Optional[np.ndarray], ): sparse_labels = sparse.csr_matrix(collated_labels) if descriptions is not None: descriptions = [(i, desc) for i, desc in enumerate(descriptions)] logger.warn(f'labeling function order: {descriptions}') logger.warn(lf_summary(sparse_labels)) self.label_model.train_model( sparse_labels, n_epochs=self.num_epochs, log_train_every=self.log_train_every, Y_dev=train_data_np, ) def get_probabilistic_labels(self, collated_labels: np.ndarray) -> np.ndarray: sparse_labels = sparse.csr_matrix(collated_labels) return self.label_model.predict_proba(sparse_labels) def convert_to_tags( self, train_probs: np.ndarray, word_list: List[str], id_to_labels: Dict[int, Tuple[int, int]], ) -> List[AnnotatedDataType]: output = [] for entry_id, (label_start, label_end) in id_to_labels.items(): words = word_list[label_start:label_end] prob_labels = train_probs[label_start:label_end] label_ids = prob_labels.argmax(axis=1) labels = [self.get_tag(i) for i in label_ids] output.append({ 'id': entry_id, 'input': words, 'output': labels, }) return output def collate( self, annotations: List[AnnotatedDataType], should_verify: bool = False, descriptions: Optional[List[str]] = None, train_data: Optional[AnnotatedDataType] = None ) -> AnnotatedDataType: ''' args: ``annotations``: List[AnnotatedDataType] given a series of annotations, collate them into a single series of annotations per instance ''' if should_verify: # make sure the annotations are in the # proper format Collator.verify_annotations(annotations) train_data_np = None if train_data: # if train data specified, will be used by Snorkel to estimate class balanc train_data_np, word_lists, id_to_labels = self.collate_np( [train_data]) train_data_np = 
train_data_np.astype(int) train_data_np = train_data_np.reshape(-1) collate_np, word_lists, id_to_labels = self.collate_np(annotations) self.train_label_model(collated_labels=collate_np, descriptions=descriptions, train_data_np=train_data_np) y_train_probs = self.get_probabilistic_labels( collated_labels=collate_np, ) tags = self.convert_to_tags(y_train_probs, word_list=word_lists, id_to_labels=id_to_labels) return tags
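# Illustrative usage of the collator defined above; the positive label, LF
# descriptions, and `annotations` (one list of {'id', 'input', 'output'}
# dicts per labeling function, all over the same sentences) are placeholders.
collator = SnorkeMeTalCollator(positive_label="PER")
denoised = collator.collate(annotations,
                            descriptions=["lf_gazetteer", "lf_regex"])
for entry in denoised[:2]:
    print(entry['id'], list(zip(entry['input'], entry['output'])))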
def train_model(args): #global args #args = parser.parse_args() hidden_size = 128 num_classes = 2 encode_dim = 1000 # using get_frm_output_size() L,Y = load_labels(args) # Label Model # labelling functions analysis print(lf_summary(L["dev"], Y = Y["dev"])) # training label model label_model = LabelModel(k=num_classes, seed=123) label_model.train_model(L["train"], Y["dev"], n_epochs = 500, log_train_every = 50) # evaluating label model print('Trained Label Model Metrics:') label_model.score((L["dev"], Y["dev"]), metric=['accuracy','precision', 'recall', 'f1']) # comparison with majority vote of LFs mv = MajorityLabelVoter(seed=123) print('Majority Label Voter Metrics:') mv.score((L["dev"], Y["dev"]), metric=['accuracy','precision', 'recall', 'f1']) Ytrain_p = label_model.predict_proba(L["train"]) #print(Ytrain_ps.shape) #(377*50,2) #Ydev_p = label_model.predict_proba(L["dev"]) # test models #label_model.score((Ltest,Ytest), metric=['accuracy','precision', 'recall', 'f1']) # End Model # Create datasets and dataloaders train, dev, test = load_dataset(args, Ytrain_p, Y["dev"], Y["test"]) data_loader = get_data_loader(train, dev, test, args.batch_size, args.num_workers) #print(len(data_loader["train"])) # 18850 / batch_size #print(len(data_loader["dev"])) # 1500 / batch_size #print(len(data_loader["test"])) # 1000 / batch_size #import ipdb; ipdb.set_trace() # Define input encoder cnn_encoder = FrameEncoderOC if(torch.cuda.is_available()): device = 'cuda' else: device = 'cpu' #import ipdb; ipdb.set_trace() # Define LSTM module lstm_module = LSTMModule( encode_dim, hidden_size, bidirectional=False, verbose=False, lstm_reduction="attention", encoder_class=cnn_encoder, ) train_args = [data_loader["train"]] train_kwargs = { 'seed':args.seed, 'progress_bar':True, 'log_train_every':1} init_args = [ [hidden_size, num_classes] ] init_kwargs = { "input_module": lstm_module, "optimizer": "adam", "verbose": False, "input_batchnorm": True, "use_cuda":torch.cuda.is_available(), 'checkpoint_dir':args.checkpoint_dir, 'seed':args.seed, 'device':device} search_space = { 'n_epochs':[10], 'batchnorm':[True], 'dropout': [0.1,0.25,0.4], 'lr':{'range': [1e-3, 1e-2], 'scale': 'log'}, 'l2':{'range': [1e-5, 1e-4], 'scale': 'log'},#[ 1.21*1e-5], #'checkpoint_metric':['f1'], } log_config = { "log_dir": "./run_logs", "run_name": 'cnn_lstm_oc' } max_search = 5 tuner_config = {"max_search": max_search } validation_metric = 'accuracy' # Set up logger and searcher tuner = RandomSearchTuner(EndModel, **log_config, log_writer_class=TensorBoardWriter, validation_metric=validation_metric, seed=1701) disc_model = tuner.search( search_space, valid_data = data_loader["dev"], train_args=train_args, init_args=init_args, init_kwargs=init_kwargs, train_kwargs=train_kwargs, max_search=tuner_config["max_search"], clean_up=False, ) # evaluate end model disc_model.score(data_loader["dev"], verbose=True, metric=['accuracy','precision', 'recall', 'f1'])
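# A likely follow-up (assumed, mirroring the earlier train_model variants):
# score the tuned model on the held-out test loader as well.
disc_model.score(data_loader["test"],
                 verbose=True,
                 metric=['accuracy', 'precision', 'recall', 'f1'])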