def load_ltrain(predicate_resume, session):
    """Load the stored training label matrix for a predicate.

    :param predicate_resume: mapping describing the predicate; its
        ``"label_group"`` entry selects the key group to load
    :param session: database session handed to the candidate-id query helper
        and to ``LabelAnnotator.load_matrix``
    :return: whatever ``load_matrix`` returns for the training candidate ids
    """
    # Read the key group first so a missing entry fails before any other work.
    group = predicate_resume["label_group"]
    annotator = LabelAnnotator(lfs=get_labelling_functions(predicate_resume))
    train_ids = get_train_cids_with_span(predicate_resume, session)
    return annotator.load_matrix(session,
                                 cids_query=train_ids,
                                 key_group=group)
def get_L_train(LFs, parallelism=2, split=0):
    """Apply labelling functions to one split and return the label matrix.

    NumPy's global RNG is seeded with 1701 before labelling. A timestamp is
    printed before and after the run, followed by the type and the shape of
    the resulting matrix.

    :param LFs: labelling functions given to ``LabelAnnotator``
    :param parallelism: accepted by the signature but not forwarded to
        ``apply`` in this function
    :param split: split passed to ``LabelAnnotator.apply``
    :return: the matrix produced by ``apply``
    """
    np.random.seed(1701)
    annotator = LabelAnnotator(lfs=LFs)

    print(datetime.datetime.now())
    label_matrix = annotator.apply(split=split)
    # ,cids_query=session.query(Candidate.id).filter(Candidate.get_parent().id %10==1))
    print(datetime.datetime.now())

    print(type(label_matrix))
    print(label_matrix.shape)
    # print("**Total non_overlapping_coverage on L_train (percentage of labelled over all)** "+str(L_train.non_overlapping_coverage()))
    return label_matrix
def apply_LF(lf_file):
    """
    Load labeling functions and apply them to the candidates of the train set.

    Every plain function found among the members of the module named by
    ``lf_file`` is used as a labeling function. The LF statistics of the
    resulting matrix are appended, as CSV text, to the module-level ``report``
    list; the module-level ``session`` is used to compute them.

    :param lf_file: importable name of the labeling functions python module;
        dotted names (``package.module``) resolve to the named module
    :return: L_train
    """
    # __import__("pkg.mod") hands back the top-level package "pkg"; walk the
    # remaining dotted parts so the functions are read from the module that
    # was actually requested. Undotted names behave exactly as before.
    labeling_func = __import__(lf_file)
    for part in lf_file.split('.')[1:]:
        labeling_func = getattr(labeling_func, part)

    LF_list = [member for _, member in getmembers(labeling_func, isfunction)]
    labeler = LabelAnnotator(lfs=LF_list)
    np.random.seed(1701)
    L_train = labeler.apply(split=0)
    # The original also called L_train.todense() and discarded the result;
    # that only allocated a throw-away dense copy, so it is not done here.
    report.append('\n#LF Stats\n')
    report.append(L_train.lf_stats(session).to_csv(sep=' ', index=False, header=True))
    return L_train
def predicate_candidate_labelling(predicate_resume, parallelism=1, limit=None, replace_key_set=False):
    """Apply the predicate's labelling functions to its split-0 candidates.

    Opens its own ``SnorkelSession``, labels every candidate of
    ``predicate_resume["candidate_subclass"]`` whose ``split`` is 0 under the
    key group ``predicate_resume["label_group"]``, then prints and logs the
    LF statistics. The session is closed before returning, and the
    "Finished labeling" message is logged whether or not labelling succeeded.

    :param predicate_resume: mapping; ``"candidate_subclass"`` and
        ``"label_group"`` are read here and the whole mapping is passed to
        ``get_labelling_functions``
    :param parallelism: forwarded to ``LabelAnnotator.apply``
    :param limit: currently unused (the code that honoured it is disabled)
    :param replace_key_set: currently unused; ``apply`` is always called with
        ``clear=True, replace_key_set=True``
    :return: None
    """
    logging.info("Starting labeling ")
    session = SnorkelSession()
    try:
        candidate_subclass = predicate_resume["candidate_subclass"]
        key_group = predicate_resume["label_group"]

        cids_query = session.query(
            candidate_subclass.id).filter(candidate_subclass.split == 0)

        ##skip cands already extracted
        #alreadyExistsGroup=session.query(LabelKey).filter(LabelKey.group==key_group).count()>0
        #if alreadyExistsGroup:
        #    cids_query= get_train_cids_not_labeled(predicate_resume,session)
        #if limit !=None:
        #    cids_query=cids_query.filter(candidate_subclass.id<limit)

        LFs = get_labelling_functions(predicate_resume)
        labeler = LabelAnnotator(lfs=LFs)
        np.random.seed(1701)

        ##if first run or adding a new labeling functionS is needed to set replace key set to True
        #if not replace_key_set:
        #    replace_key_set=not alreadyExistsGroup
        L_train = labeler.apply(parallelism=parallelism,
                                cids_query=cids_query,
                                key_group=key_group,
                                clear=True,
                                replace_key_set=True)

        # Compute the statistics once and reuse them for both outputs.
        lf_stats = L_train.lf_stats(session)
        print(lf_stats)
        logging.info(lf_stats)
    finally:
        logging.info("Finished labeling ")
        # The session is created here and never handed out, so release it.
        session.close()
def score_lfs(predicate_resume, L_gold_test, session, date_time, parallelism=8):
    """Label the test candidates, fit a generative model on them, dump reports.

    Reports are written under ``./results/`` as
    ``<prefix><predicate_name><date_time>.csv`` with these prefixes:

    * ``lfs_1_``      - LF statistics of the test label matrix
    * ``test_gen_1_`` - precision / recall / F1 of the generative model
    * ``gen_2_``      - ``gen_model.learned_lf_stats()``
    * ``gen_3_``      - TP / FP / TN / FN counts from ``error_analysis``
    * ``gen_4_``      - LF statistics computed against the gold labels

    :param predicate_resume: mapping; ``"predicate_name"`` and
        ``"label_group"`` are read here
    :param L_gold_test: gold labels passed to ``score``, ``error_analysis``
        and ``lf_stats``
    :param session: database session used for the queries and statistics
    :param date_time: string inserted into every report file name
    :param parallelism: forwarded to ``LabelAnnotator.apply``
    :return: None
    """

    def _result_csv(prefix):
        # All report paths share the same layout and differ only in prefix.
        return ("./results/" + prefix + predicate_resume["predicate_name"] +
                date_time + ".csv")

    lf_stats_path = _result_csv("lfs_1_")
    group = predicate_resume["label_group"]

    annotator = LabelAnnotator(lfs=get_labelling_functions(predicate_resume))
    test_ids = get_test_cids_with_span(predicate_resume, session)
    L_test = annotator.apply(parallelism=parallelism,
                             cids_query=test_ids,
                             key_group=group,
                             clear=True,
                             replace_key_set=False)

    lf_stats = L_test.lf_stats(session)
    print(lf_stats)
    logging.info(lf_stats)
    lf_stats.to_csv(lf_stats_path)

    gen_model = GenerativeModel()
    gen_model.train(L_test,
                    epochs=100,
                    decay=0.95,
                    step_size=0.1 / L_test.shape[0],
                    reg_param=1e-6)

    p, r, f1 = gen_model.score(L_test, L_gold_test)
    score_line = "Prec: {0:.3f}, Recall: {1:.3f}, F1 Score: {2:.3f}".format(
        p, r, f1)
    print(score_line)
    logging.info(score_line)

    scores_path = _result_csv("test_gen_1_")
    # NOTE(review): binary mode with csv.writer is the Python 2 idiom; under
    # Python 3 csv.writer expects a text-mode file - confirm the target
    # interpreter before changing it.
    with open(scores_path, 'w+b') as out:
        rows = csv.writer(out, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        rows.writerow(["Precision", "Recall", "F1"])
        rows.writerow(
            ["{0:.3f}".format(p), "{0:.3f}".format(r), "{0:.3f}".format(f1)])

    # Only consumed by the disabled plotting code below.
    test_marginals = gen_model.marginals(L_test)
    dump_file_path2 = _result_csv("plt_1_")
    #plt.hist(test_marginals, bins=20)
    #plt.savefig(dump_file_path2)
    #plt.show()

    learned_path = _result_csv("gen_2_")
    learned_stats = gen_model.learned_lf_stats()
    learned_stats.to_csv(learned_path)

    counts_path = _result_csv("gen_3_")
    tp, fp, tn, fn = gen_model.error_analysis(session, L_test, L_gold_test)
    with open(counts_path, 'w+b') as out:
        rows = csv.writer(out, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        rows.writerow(["TP", "FP", "TN", "FN"])
        rows.writerow(
            [str(len(tp)), str(len(fp)), str(len(tn)), str(len(fn))])

    gold_stats_path = _result_csv("gen_4_")
    gold_stats = L_test.lf_stats(session, L_gold_test,
                                 gen_model.learned_lf_stats()['Accuracy'])
    gold_stats.to_csv(gold_stats_path)
def LF_distant_supervision(c):
    """Return 1 when the candidate's (virus span, host span) pair is in ``known_pairs``, else 0."""
    span_pair = (c.virus.get_span(), c.host.get_span())
    if span_pair in known_pairs:
        return 1
    return 0


# list of all LFs
LFs = [
    LF_detect,
    LF_infect,
    LF_isolate,
    LF_positive,
    LF_positive2,
    LF_misc,
    LF_v_cause_h,
    LF_v_h,
    LF_h_v,
    LF_other_verbs,
    LF_far_v_h,
    LF_far_h_v,
    LF_neg_h,
    LF_neg_assertions,
    LF_distant_supervision,
]

# set up the label annotator class
labeler = LabelAnnotator(lfs=LFs)

# -------------------------------------------
# START CROSS VALIDATION SPLIT in a loop:
# Make an array of indexes (should equal number of documents 88). In a loop,
# split the index array into train, test, and dev arrays. The sentences get
# added to the respective t,t,d sets and the candidates are extracted.
index_array = np.arange(88)

# for roc
tprs, aucs = [], []
mean_fpr = np.linspace(0, 1, num=100)

# for recording prec, rec, f1 scores
# Fetch a single split-0 DiseaseGene candidate and run the debug labelling
# function on it.
candidates = session.query(DiseaseGene).filter(
    DiseaseGene.split == 0).limit(1).all()
LF_DEBUG(candidates[0])

# In[ ]:

LFs = get_lfs()

# # Label The Candidates
# Label each candidate based on the provided labels above. This code runs with
# relative ease, but optimization is definitely needed as the number of label
# functions grows.

# In[ ]:

labeler = LabelAnnotator(lfs=LFs)

# Split 0: labelled with `apply`. The statement runs through the IPython
# `%time` magic, so L_train is bound in the notebook namespace.
cids = session.query(Candidate.id).filter(Candidate.split == 0)
get_ipython().magic(
    u'time L_train = labeler.apply(split=0, cids_query=cids, parallelism=5)')

# Split 1: labelled with `apply_existing` and clear=False.
cids = session.query(Candidate.id).filter(Candidate.split == 1)
get_ipython().magic(
    u'time L_dev = labeler.apply_existing(split=1, cids_query=cids, parallelism=5, clear=False)'
)

# Split 2: same call shape as split 1.
cids = session.query(Candidate.id).filter(Candidate.split == 2)
get_ipython().magic(
    u'time L_test = labeler.apply_existing(split=2, cids_query=cids, parallelism=5, clear=False)'
)
from utils.label_functions.compound_disease_lf import CD_LFS #from utils.gene_gene_lf import GG_LFS # # Label The Candidates # Label each candidate based on the provided labels above. This code runs with realtive ease, but optimization is definitely needed when the number of label functions increases linearly. # In[ ]: label_functions = list(CG_LFS["CbG_DB"].values()) + list(CG_LFS["CbG_TEXT"].values()) + list(DG_LFS["DaG_TEXT"].values()) labeler = LabelAnnotator(lfs=label_functions) # # Quickly Relabel Candidates # Use this block here to re-label candidates that have already been labled from the above process. # In[ ]: train_df = pd.read_excel(spreadsheet_names['train']) train_cids = train_df.candidate_id.astype(int).tolist() train_df.head(2) # In[ ]:
# Load every document (ordered by name) and every sentence from the database.
docs = session.query(Document).order_by(Document.name).all()
sentences = session.query(Sentence).all()
#print(sentences)

# Gather the sentences of all documents into one set; `sentences` above and
# the enumerate index `i` are not used by the statements in this excerpt.
sents=set();
for i,doc in enumerate(docs):
    for s in doc.sentences:
        sents.add(s)

# Run candidate extraction over the collected sentences.
cand_extractor.apply(sents)
print("Number of candidates:", session.query(pairs).count())

# Apply the labelling functions (no split argument is passed) and show the
# per-LF statistics.
labeler = LabelAnnotator(lfs=LFs)
L_train = labeler.apply()
print(L_train.lf_stats(session))

# generative model, training_marginals are probabilistic training labels
gen_model = GenerativeModel()
# The step size is scaled by the number of rows of the label matrix.
gen_model.train(L_train, epochs=100, decay=0.95, step_size=0.1 / L_train.shape[0], reg_param=1e-6)
print(gen_model.weights.lf_accuracy)
train_marginals = gen_model.marginals(L_train)
candidate_dfs = { key: load_candidate_dataframes(spreadsheet_names[key]) for key in spreadsheet_names } for key in candidate_dfs: print("Size of {} set: {}".format(key, candidate_dfs[key].shape[0])) # In[8]: label_functions = (list(DG_LFS["DaG_DB"].values()) + list(DG_LFS["DaG_TEXT"].values())) if quick_load: labeler = LabelAnnotator(lfs=[]) label_matricies = { key: labeler.load_matrix(session, cids_query=make_cids_query(session, candidate_dfs[key])) for key in candidate_dfs } else: labeler = LabelAnnotator(lfs=label_functions) label_matricies = { key: label_candidates(labeler,
LF_common_1000, LF_common_2000 ] LFs_BD = [ LF_colon, LF_known_abs, LF_single_letter, LF_roman_numeral, LF_common_2000, LF_same_thing_BD ] LFs_BM = [ LF_distance_far, LF_colon, LF_known_abs, LF_single_letter, LF_roman_numeral, LF_common_2000, LF_same_thing ] LFs_BT = [ LF_colon, LF_known_abs, LF_single_letter, LF_roman_numeral, LF_common_2000, LF_same_thing ] labeler_BC = LabelAnnotator(lfs=LFs_BC) labeler_BD = LabelAnnotator(lfs=LFs_BD) labeler_BM = LabelAnnotator(lfs=LFs_BM) labeler_BT = LabelAnnotator(lfs=LFs_BT) # Training L_train_BC = labeler_BC.apply(split=0) L_train_BD = labeler_BD.apply(split=0) L_train_BM = labeler_BM.apply(split=0) L_train_BT = labeler_BT.apply(split=0) L_train_BC L_train_BD L_train_BM L_train_BT # Labeling Function Performance - Coverage, Overlaps, Conflicts
def _get_labeler(predicate_resume):
    """Build a ``LabelAnnotator`` over the predicate's labelling functions.

    :param predicate_resume: predicate description passed to
        ``get_labelling_functions``
    :return: the new ``LabelAnnotator`` instance
    """
    return LabelAnnotator(lfs=get_labelling_functions(predicate_resume))
"Number of candidates:", session.query(candidate_class).filter(candidate_class.split == 0).count()) print("==============================") # Split to pull eval candidates from eval_split = 0 # Executing query for eval candidates eval_cands = session.query(candidate_class).filter( candidate_class.split == eval_split).order_by(candidate_class.id).all() print(f'Loaded {len(eval_cands)} candidates...') # Applying LFs print("Applying LFs...") from snorkel.annotations import LabelAnnotator labeler = LabelAnnotator(lfs=LFs) L_eval = labeler.apply(split=eval_split, parallelism=parallelism) # defining model from snorkel.learning import GenerativeModel # Creating generative model gen_model = GenerativeModel() # defining saved weights directory and name model_name = 'Price_Gen_20K' # this was provided when the model was saved! save_dir = '/dfs/scratch0/jdunnmon/data/memex-data/extractor_checkpoints/Price_Gen_20K' # this was provided when the model was saved! # loading print("Loading generative model...") gen_model.load(model_name=model_name, save_dir=save_dir, verbose=True)
BiomarkerCondition).filter(BiomarkerCondition.split == 1).count() session.commit() # In[ ]: from LF import * LFs_BC = [ LF_markerDatabase, LF_keyword, LF_distance, LF_abstract_titleWord, LF_single_letter, LF_auxpass, LF_known_abs, LF_same_thing_BC, LF_common_1000, LF_common_2000 ] # In[ ]: from snorkel.annotations import LabelAnnotator BC_labeler = LabelAnnotator(lfs=LFs_BC) # In[ ]: np.random.seed(1701) get_ipython().magic(u'time L_train_BC = BC_labeler.apply(split=0)') L_train_BC # In[ ]: get_ipython().magic( u'time L_train_BC = BC_labeler.load_matrix(session, split=0)') L_train_BC # In[ ]:
#candidates = [session.query(DiseaseGene).filter(DiseaseGene.id == ids).one() for ids in [19817,19818,19830,19862,19980,20001,20004]] for c in candidates: if c[0].get_parent().id != 14264: continue print c print get_tagged_text(c) print c[1].sentence.entity_cids[c[1].get_word_start()] # # Label The Candidates # This block of code will run through the label functions and label each candidate in the training and development groups. # In[ ]: labeler = LabelAnnotator(f=LFs) get_ipython().magic(u'time L_train = labeler.apply(split=0)') get_ipython().magic(u'time L_dev = labeler.apply_existing(split=1)') get_ipython().magic(u'time L_test = labeler.apply_existing(split=2)') # In[ ]: featurizer = FeatureAnnotator() get_ipython().magic(u'time F_train = featurizer.apply(split=0)') get_ipython().magic(u'time F_dev = featurizer.apply_existing(split=1)') get_ipython().magic(u'time F_test = featurizer.apply_existing(split=2)') # # Generate Coverage Stats
GeneGene = candidate_subclass('GeneGene', ['Gene1', 'Gene2']) elif edge_type == "cg": CompoundGene = candidate_subclass('CompoundGene', ['Compound', 'Gene']) elif edge_type == "cd": CompoundDisease = candidate_subclass('CompoundDisease', ['Compound', 'Disease']) else: print("Please pick a valid edge type") # # Load the data # Here is where we load the test dataset in conjunction with the previously trained disc models. Each algorithm will output a probability of a candidate being a true candidate. # In[6]: labeler = LabelAnnotator(lfs=[]) # In[7]: get_ipython().run_cell_magic(u'time', u'', u'L_test = labeler.load_matrix(session,split=2)') # In[8]: L_test.shape # In[9]: marginal_files = [ "stratified_data/lstm_disease_gene_holdout/LR_data/LR_test_marginals.csv", "stratified_data/lstm_disease_gene_holdout/lstm_one_test_marginals.csv",
# In[ ]: database_str = "sqlite:///" + os.environ['WORKINGPATH'] + "/Database/epilepsy.db" os.environ['SNORKELDB'] = database_str session = SnorkelSession() # # Load preprocessed data # To save time, this code will automatically load our labels that were generated in the previous file. # In[ ]: labeler = LabelAnnotator(f=None) L_train = labeler.load_matrix(session,split=0) L_dev = labeler.load_matrix(session,split=1) L_test = labeler.load_matrix(session,split=2) # In[ ]: print "Total Data Shape:" print L_train.shape print L_dev.shape print L_test.shape print print "The number of positive candiadtes (in KB) for each division:"
# NOTE(review): the three statements below may belong to an enclosing loop
# that starts outside this excerpt - confirm their indentation against the
# full file.
tweet = Tweet(tweet=raw_text, split=split)
session.add(tweet)
session.commit()

print("Commit to snorkel database done...")

#writing label generator
def worker_label_generator(t):
    """Yield ``(worker_id, value)`` pairs stored in ``cand_dict`` for a candidate.

    The entries are looked up under the candidate's ``t.tweet.stable_id``.
    """
    for worker_id in cand_dict[t.tweet.stable_id]:
        yield worker_id, cand_dict[t.tweet.stable_id][worker_id]

# Seed NumPy's global RNG, then build the split-0 label matrix from the
# generator above instead of a list of labelling functions.
np.random.seed(1701)
labeler = LabelAnnotator(label_generator=worker_label_generator)
L_train = labeler.apply(split=0)

print(L_train.lf_stats(session))

print("Creat training data done...")
print(" -train data shape", (L_train.shape))

print("Start to train a generative model")
gen_model = GenerativeModel(lf_propensity=True)
gen_model.train(L_train, reg_type=2, reg_param=0.1, epochs=30)

#doing statistics
print(gen_model.learned_lf_stats())

print("Train a genetive model done...!")
def LF_police_at_location_left(c): if len(police_tags.intersection(get_right_tokens(c[1], window=7))) > 0: return 1 else: return 0 LFs = [ LF_crime_detect, LF_location_left_window, LF_police_at_location_left, LF_location_left_per_vic_window ] from snorkel.annotations import LabelAnnotator labeler1 = LabelAnnotator(f=LFs) np.random.seed(1701) get_ipython().magic(u'time L_train = labeler1.apply(split=0)') L_train L_train = labeler1.load_matrix(session, split=0) L_train for docno in session.query(Document).all(): print(docno) for i in range(L_train.shape[0]): if (L_train[i, :].toarray()[0][0] == 1.0): if (session.query(LocationPer).filter(LocationPer.split == 0) [i].get_parent().get_parent() == docno): print(