def extract_binary_candidates(predicate_resume, clear=False, parallelism=8,
                              split=None, documents_titles=None, limit=None,
                              page_size=10000):
    """Extract binary candidates for one predicate, one page of sentences at a time.

    Args:
        predicate_resume: mapping read for the keys 'subject_ne', 'object_ne'
            and 'candidate_subclass'; it is also forwarded unchanged to the
            sentence-selection helpers.
        clear: forwarded to ``CandidateExtractor.apply`` for the first page
            only; it is forced to False for every later page.
        parallelism: forwarded to ``CandidateExtractor.apply``.
        split: split passed to the extractor. ``None`` is logged as 'train'
            and extracted into split 0.
        documents_titles: when given, candidates are first updated through
            ``update_candidates_by_page_titles`` and only sentences selected
            by ``get_sentences_ids_by_title_not_extracted`` are processed.
        limit: optional cap on the sentence-id query; ignored when
            ``documents_titles`` is given.
        page_size: maximum number of sentences fetched per page.
    """
    logging.info("Starting candidates extraction ")
    subject_ne = predicate_resume['subject_ne']
    object_ne = predicate_resume['object_ne']

    session = SnorkelSession()
    CandidateSubclass = predicate_resume["candidate_subclass"]

    # Both arguments of the relation are drawn from the same n-gram space.
    ngrams = Ngrams(n_max=7)
    subject_matcher = get_matcher(subject_ne)
    object_matcher = get_matcher(object_ne)
    cand_extractor = CandidateExtractor(CandidateSubclass,
                                        [ngrams, ngrams],
                                        [subject_matcher, object_matcher])

    # Decide which sentences still have to be processed.
    logging.info("Count candidates")
    sents_query_id = session.query(Sentence.id)
    candidates_count = session.query(CandidateSubclass).count()
    # NOTE(review): the threshold is ">1", not ">0" -- confirm that a single
    # existing candidate should not trigger the "skip extracted" path.
    if documents_titles is None and candidates_count > 1 and not clear:
        sents_query_id = get_sentences_ids_not_extracted(predicate_resume,
                                                         session)
    elif documents_titles is not None:
        # delete candidates for test and dev
        logging.info("Deleting candidates")
        update_candidates_by_page_titles(predicate_resume, documents_titles,
                                         split)
        sents_query_id = get_sentences_ids_by_title_not_extracted(
            predicate_resume, session, documents_titles)

    if limit is not None and documents_titles is None:
        sents_query_id = sents_query_id.limit(limit)

    sents_query = session.query(Sentence).filter(
        Sentence.id.in_(sents_query_id))
    logging.info("Counting sentences")
    sents_count = sents_query.count()
    logging.info("Sents count" + str(sents_count))
    print("Sents count" + str(sents_count))

    # A page never needs to be larger than the number of selected sentences.
    page = min(page_size, sents_count)

    # These do not change between pages, so compute them once.
    if split is None:
        set_name, split2 = "train", 0
    else:
        set_name, split2 = str(split), split
    ordered_sents = sents_query.order_by(Sentence.id)

    i = 1
    while True:
        start, stop = page * (i - 1), page * i
        logging.info('\tQuering sentences from %s to %s, in set \'%s\'',
                     start, stop, set_name)
        sents = ordered_sents.slice(start, stop).all()
        if not sents:
            # An empty page means every selected sentence has been handled.
            break
        logging.info("Extracting")
        cand_extractor.apply(sents, split=split2, clear=clear,
                             progress_bar=False, parallelism=parallelism)
        logging.info('\t\tcandidates extracted for %s',
                     CandidateSubclass.__name__)
        i += 1
        # Only the first page may clear; later pages must keep its output.
        clear = False
    logging.info("Finished candidates extraction ")
# This section embeds all candidate sentences. For each sentence, we place tags around each mention, tokenized the sentence and then matched each token to their corresponding word index. Any words missing from our vocab receive a index of 1. Lastly, the embedded sentences are exported as a sparse dataframe. # In[ ]: word_dict_df = pd.read_table("output/gene_interacts_gene_word_dict.tsv") word_dict = {word[0]: word[1] for word in word_dict_df.values.tolist()} fixed_word_dict = {word: word_dict[word] + 2 for word in word_dict} # In[ ]: limit = 1000000 total_candidate_count = total_candidates_df.shape[0] for offset in list(range(0, total_candidate_count, limit)): candidates = (session.query(GeneGene).filter( GeneGene.id.in_(total_candidates_df.candidate_id.astype( int).tolist())).offset(offset).limit(limit).all()) max_length = total_candidates_df.sen_length.max() # if first iteration create the file if offset == 0: (generate_embedded_df(candidates, fixed_word_dict, max_length=max_length).to_csv( "output/all_embedded_gg_sentences.tsv", index=False, sep="\t", mode="w")) # else append don't overwrite
from snorkel.parser.spacy_parser import Spacy

# Parse the pre-processed documents with a spaCy-backed corpus parser.
corpus_parser = CorpusParser(parser=Spacy())
corpus_parser.apply(
    list(doc_preprocessor),
    parallelism=parallelism,
    verbose=False,
)

# Check how many documents and sentences are now stored in the database.

# In[ ]:

from snorkel.models import Document, Sentence

for label, model in (("Documents:", Document), ("Sentences:", Sentence)):
    print(label, session.query(model).count())

# Separate the documents into train, dev and test sets.

# In[11]:

from dataset_utils import create_test_train_splits

# All parsed documents, ordered by name.
docs = session.query(Document).order_by(Document.name).all()

# Document-level and sentence-level splits, in the order returned by the helper.
(train_docs, dev_docs, test_docs,
 train_sents, dev_sents, test_sents) = create_test_train_splits(
     docs,
     'location',
     gold_dict=None,
     dev_frac=0.1,
     test_frac=0.1,
)
corpus_parser.apply(doc_preprocessor) Sensitive = candidate_subclass('Sensitive', ['sensitive'], values=[ 'person', 'job', 'event', 'place', 'date', 'time', 'product', 'email', 'phone', 'quantity', 'address', 'url', 'org', 'file', 'password', False ]) # generating candidates. ngrams = Ngrams(n_max=6) ngramMatcher = NgramMatcher(longest_match_only=False) cand_extractor = CandidateExtractor(Sensitive, [ngrams], [ngramMatcher], symmetric_relations=False) sents = session.query(Sentence).all() cand_extractor.apply(sents, split=0) train_cands = session.query(Sensitive).filter(Sensitive.split == 0).all() finder = FinderAcora() def find(array, word): return [i for i, each in enumerate(array) if each == word] def LF_product(c): if len(c.sensitive.get_attrib_tokens("words")) == len( find(c.sensitive.get_attrib_tokens("ner_tags"), "PRODUCT")): print "PRODUCT:" + c.sensitive.get_span() return "product"
# Pattern handed to the preprocessor as ``term``; the commented line is an
# earlier alternative kept for reference.
#term = r'(\$?\d\d\d?.*?per|\$?\d\d\d?.*?hours?|\$?\d\d\d?.*?half|\$?\d\d\d?.*?minutes?)'
term = r'([Ll]ocation:.{0,100}|[cC]ity:.{0,100}|\d\dyo\W|\d\d.{0,10}\Wyo\W|\d\d.{0,10}\Wold\W|\d\d.{0,10}\Wyoung\W|\Wage\W.{0,10}\d\d)'

# Doc length in characters; None means no maximum.
max_doc_length = None

# Build the document preprocessor.
print(f'Preprocessing folder: {data_loc}')
preprocessor_options = dict(
    max_docs=max_docs,
    verbose=False,
    clean_docs=False,
    content_fields=['raw_content', 'url'],
    term=term,
    max_doc_length=max_doc_length,
)
doc_preprocessor = set_preprocessor(data_source, data_loc,
                                    **preprocessor_options)

# Tokenize on the '<|>' delimiter and load the corpus into the database.
parser = SimpleTokenizer(delim='<|>')
corpus_parser = CorpusParser(parser=parser)
corpus_parser.apply(
    list(doc_preprocessor),
    parallelism=parallelism,
    verbose=False,
)

# Summary of what was stored.
document_total = session.query(Document).count()
sentence_total = session.query(Sentence).count()
print("==============================")
print(f"DB creation results for {postgres_db_name}:")
print("Documents:", document_total)
print("Sentences:", sentence_total)
print("==============================")
label_candidates(session, (total_candidates_df.query( "split==0&disease_mention_count==1&gene_mention_count==1"). candidate_id.values.tolist()), lfs, lf_names, num_threads=10, batch_size=50000, multitask=False) } # In[12]: if not quick_load: # Check to make sure the label functions match up with the candidate objects ids = label_matricies['train'].candidate_id.head(5).tolist() candidate_list = session.query(DiseaseGene).filter( DiseaseGene.id.in_(ids)).all() for candidate in tqdm_notebook(candidate_list): correct_output = list(map(lambda fn: fn(candidate), lfs)) test_output = label_matricies['train'].query( "[email protected]").fillna(0).values.tolist()[0] for pair in zip(correct_output, test_output[:-1]): assert pair[0] == pair[1] # In[13]: if not quick_load: label_matricies.update({ key: label_candidates(session, candidate_dfs[key]['candidate_id'].values.tolist(), lfs,
# In[ ]: category_list = np.random.choice([0, 1, 2], total_sentences, p=[0.7, 0.2, 0.1]) # In[ ]: # Divide the sentences into train, dev and test sets #Grab the sentences!!! train_sens = set() dev_sens = set() test_sens = set() offset = 0 category_index = 0 sql_query = session.query(Document).limit(chunk_size) #divde and insert into the database while True: documents = list(sql_query.offset(offset).all()) if not documents: break for doc in tqdm.tqdm(documents): for s in doc.sentences: # Stratify the data into train, dev, test category = category_list[category_index] category_index = category_index + 1
cutoff = 300 total_candidates_df = ( pd.read_table("../dataset_statistics/results/all_ctd_map.tsv.xz").query( "sen_length < 300")) total_candidates_df.head(2) # # Train Word Vectors # This section trains the word vectors using the specifications described above. # In[10]: words_to_embed = [] candidates = (session.query(CompoundDisease).filter( CompoundDisease.id.in_( total_candidates_df.candidate_id.astype(int).tolist())).all()) # In[11]: for cand in tqdm_notebook(candidates): args = [(cand[0].get_word_start(), cand[0].get_word_end(), 1), (cand[1].get_word_start(), cand[1].get_word_end(), 2)] words_to_embed.append(mark_sentence(candidate_to_tokens(cand), args)) # In[12]: model = FastText(words_to_embed, window=2, negative=10, iter=50,
# Build one candidate set per corpus name, keeping only sentences for which
# number_of_people(sentence) is below 5.
# NOTE(review): this cell was recovered from a whitespace-collapsed export;
# the nesting below (session.add inside the loop, commit after it) is inferred.
for corpus_name in ['Emails Development', 'Emails Test']:
    # NOTE(review): the per-corpus lookup is commented out, so `corpus` must
    # already be bound before the loop and is the same object on every
    # iteration -- confirm this is intended.
    #corpus = session.query(Corpus).filter(Corpus.name == corpus_name).one()
    sentences = set()
    for document in corpus:
        for sentence in document.sentences:
            if number_of_people(sentence) < 5:
                sentences.add(sentence)

    # IPython %time magic: runs the extraction and reports how long it took.
    %time c = ce.extract(sentences, corpus_name + ' Candidates', session)
    session.add(c)
session.commit()

from snorkel.models import CandidateSet

# Look up previously stored candidate sets by name.
train = session.query(CandidateSet).filter(CandidateSet.name == 'Emails Training Candidates').one()
dev = session.query(CandidateSet).filter(CandidateSet.name == 'Emails Development Candidates').one()

from snorkel.annotations import FeatureManager

feature_manager = FeatureManager()

# NOTE(review): the features named 'Train Features' are created from `c`
# (the candidate set left over from the last loop iteration), not from
# `train` -- confirm which set is intended.
%time F_train = feature_manager.create(session, c, 'Train Features')
#To load existing use ..
#%time F_train = feature_manager.load(session, train, 'Train Features')

from snorkel.annotations import LabelManager

label_manager = LabelManager()
cand_extractor = CandidateExtractor(GenePhenoPair, [gene_ngrams, pheno_ngrams], [GM, PM], symmetric_relations=True) print "Splitting Docs..." pathname = 'small_data/' if os.environ[ 'AGP_DATA_SIZE'] == 'small-data' else 'data/' with open(pathname + 'pmcids_400.pkl', 'rb') as f: sent_dicts = cPickle.load(f) train_ids, dev_ids, test_ids = set(sent_dicts['train']), set( sent_dicts['dev']), set(sent_dicts['test']) all_ids = train_ids.union(dev_ids).union(test_ids) # 40, 10, 10 train_sents, dev_sents, test_sents, all_sents = set(), set(), set(), set() train_docs, dev_docs, test_docs = set(), set(), set() docs = session.query(Document).order_by(Document.name).all() doc_sents = dict() for doc_num, doc in enumerate(docs): if len(train_docs) >= 40 and len(dev_docs) >= 10 and len(test_docs) >= 10: break doc_sents[doc_num] = set() for s in doc.sentences: all_sents.add(s) doc_sents[doc_num].add(s) name = doc.name.split('-')[0] if name in train_ids: train_docs.add(name) train_sents.add(s) elif name in dev_ids: dev_docs.add(name) dev_sents.add(s)
def run(candidate1, candidate2, pairing_name, cand1_ngrams, cand2_ngrams, cand1Matcher, cand2Matcher, model_name, output_file_name, corpus_parser): print "Started" session = SnorkelSession() # The following line is for testing only. Feel free to ignore it. candidate_pair = candidate_subclass(pairing_name, [candidate1, candidate2]) sentences = set() docs = session.query(Document).order_by(Document.name).all() for doc in docs: for s in doc.sentences: sentences.add(s) cand_1_ngrams = Ngrams(n_max=cand1_ngrams) # condition_ngrams = Ngrams(n_max=7) cand_2_ngrams = Ngrams(n_max=cand2_ngrams) # medium_ngrams = Ngrams(n_max=5) # type_ngrams = Ngrams(n_max=5) # <--- Q: should we cut these down? # # level_ngrams = Ngrams(n_max=1) # unit_ngrams = Ngrams(n_max=1) # Construct our Matchers # cMatcher = matchers.getConditionMatcher() # mMatcher = matchers.getMediumMatcher() # tMatcher = matchers.getTypeMatcher() # lMatcher = matchers.getLevelMatcher() # uMatcher = matchers.getUnitMatcher() # Building the CandidateExtractors # candidate_extractor_BC = CandidateExtractor(BiomarkerCondition, [biomarker_ngrams, condition_ngrams], [bMatcher, cMatcher]) candidate_extractor = CandidateExtractor(candidate_pair, [cand_1_ngrams, cand_2_ngrams], [cand1Matcher, cand2Matcher]) # candidate_extractor_BM = CandidateExtractor(BiomarkerMedium, [biomarker_ngrams, medium_ngrams], [bMatcher, mMatcher]) # candidate_extractor_BT = CandidateExtractor(BiomarkerType, [biomarker_ngrams, type_ngrams], [bMatcher, tMatcher]) # candidate_extractor_BLU = CandidateExtractor(BiomarkerLevelUnit, [biomarker_ngrams, level_ngrams, unit_ngrams], [bMatcher, lMatcher, uMatcher]) # List of Candidate Sets for each relation type: [train, dev, test] candidate_extractor.apply(sentences, split=4, clear=True) cands = session.query(candidate_pair).filter( candidate_pair.split == 4).order_by(candidate_pair.id).all() session.commit() # cands_BD = grabCandidates(candidate_extractor_BD, BiomarkerDrug) # cands_BM = 
grabCandidates(candidate_extractor_BM, BiomarkerMedium) # cands_BT = grabCandidates(candidate_extractor_BT, BiomarkerType) # cands_BLU = grabCandidates(candidate_extractor_BLU, BiomarkerLevelUnit) if (len(cands)) == 0: print "No Candidates Found" return if (pairing_name == 'BiomarkerCondition'): # session.rollback() # print "Number of dev BC candidates without adj. boosting: ", len(cands_BC[1]) add_adj_candidate_BC(session, candidate_pair, cands, 4) # fix_specificity(session, BiomarkerCondition, cands_BC[1]) # print "Number of dev BC candidates with adj. boosting: ", session.query(BiomarkerCondition).filter(BiomarkerCondition.split == 4).count() session.commit() lstm = reRNN(seed=1701, n_threads=None) lstm.load(model_name) predictions = lstm.predictions(cands) output_file = open(output_file_name, 'wb') import csv csvWriter = csv.writer(output_file) csvWriter.writerow( ['doc_id', 'sentence', candidate1, candidate2, 'prediction']) for i in range(len(cands)): doc_string = 'PMC' + str(cands[i].get_parent().get_parent())[9:] sentence_string = cands[i].get_parent().text cand_1_string = cands[i].get_contexts()[0].get_span() cand_2_string = cands[i].get_contexts()[1].get_span() prediction = predictions[i] csvWriter.writerow([ unidecode(doc_string), unidecode(sentence_string), unidecode(cand_1_string), unidecode(cand_2_string), prediction ])
searcher.fit(F_dev, L_dev, n_epochs=50, rebalance=0.5, print_freq=25) disc_models.append(disc_model) w = disc_model.save_dict['w'] f = w.read_value() values = f.eval(session = disc_model.session) weights.append(values) test_marginals.append(disc_model.marginals(F_test)) # # Generate Statistics After Model Training # ## Grab the feature weights # In[ ]: features = session.query(FeatureKey).all() feat_data = [] for feat, w0, w1 in zip(features,weights[0],weights[1]): feat_data.append([feat.name, w0[0], w1[0]]) feat_frame = pd.DataFrame(feat_data, columns= ["Feature", "Model_KB", "Model_KB_CONTEXT"]) # ## Grab the class probabilities # In[ ]: test_marginals[0].shape cand_probs = [] for candidate_id in L_test.candidate_index: cand = session.query(Candidate).filter(Candidate.id == candidate_id).one() index = L_test.candidate_index[candidate_id]
# Attach candidate information to the dev (split 10) and test (split 11)
# entity pairs that have at least one sentence.
pair_keys = ["drugbank_id", "doid_id"]
dev_set_df = entity_level_df.query("split==10&has_sentence==1").merge(
    dev_candidate_df, on=pair_keys)
test_set_df = entity_level_df.query("split==11&has_sentence==1").merge(
    test_candidate_df, on=pair_keys)

# In[16]:

# One table for all three splits, with a single row per candidate id.
total_candidates_df = training_set_df.append(dev_set_df)
total_candidates_df = total_candidates_df.append(test_set_df)
total_candidates_df = total_candidates_df.drop_duplicates("candidate_id")

# In[17]:

# Fixed-seed sample of 10,000 dev candidates.
dev_sample_ids = (total_candidates_df
                  .query("split==10")
                  .sample(10000, random_state=100)
                  .candidate_id
                  .tolist())
dev_candidates = session.query(CompoundDisease).filter(
    CompoundDisease.id.in_(dev_sample_ids)).all()
dev_df = make_sentence_df(dev_candidates)
dev_df.head(2)

# In[18]:

# Fixed-seed sample of 10,000 test candidates.
# Ethanol, alcohol and alcohol dependence are blacklisted because sampling
# otherwise returns too many sentences with those entities.
test_sample_ids = (total_candidates_df
                   .query("split==11")
                   .query("drugbank_id!='DB00898'&doid_id!='DOID:0050741'")
                   .sample(10000, random_state=120)
                   .candidate_id
                   .tolist())
test_candidates = session.query(CompoundDisease).filter(
    CompoundDisease.id.in_(test_sample_ids)).all()
test_df = make_sentence_df(test_candidates)
doc_preprocessor = TSVDocPreprocessor('tutorials/intro/data/articles.tsv', max_docs=max_docs) # In[5]: from snorkel.parser import CorpusParser corpus_parser = CorpusParser() get_ipython().magic(u'time corpus_parser.apply(doc_preprocessor)') # In[6]: from snorkel.models import Document, Sentence print "Documents:", session.query(Document).count() print "Sentences:", session.query(Sentence).count() dict_final = {} crimetype_murder = [ 'killed', 'kill', 'kills', 'killing', 'murder', 'shot', 'shooting', 'convicted', 'murdered' ] crimetype_rape = [ 'rape', 'raped', 'gangraped', 'molested', 'molestation', 'molesting', 'harassment', 'raping' ] crimetype_attack = [ 'hurt', 'rioting', 'injured', 'attack', 'beating up', 'attacked' ]
help="the pair file [default: %default]") parser.add_option("-s", "--saved-dir", dest="saved_dir", help="directory to save the rank scores [default: %default]") parser.set_defaults( input_pair= "/Users/datienguyen/Desktop/coding/data-search/exp-data/dataSEARCH/pair-store/train.csv" #3_signals.top20doc.csv" # , saved_dir="../../data-search/exp-data/dataSearch/pair-store/") opts, args = parser.parse_args(sys.argv) input_pair = opts.input_pair session.query(Context).delete() session.query(Candidate).delete() values = ['positive', 'negative'] Tweet = candidate_subclass('Tweet', ['tweet'], values=values) #item_id,worker_id,query_id,doc1,doc2,annotation cand_dict = {} with open(input_pair, "r") as myFile: reader = csv.reader(myFile) for row in reader: #print(row) item_id = row[0] worker_id = row[1] anno = row[5]
# In[ ]:

DiseaseGene = candidate_subclass('DiseaseGene', ['Disease', 'Gene'])

# # Look at potential Candidates
# Use this to look at loaded candidates from a given set. The constants are
# the split indices of the training, development and testing sets.

# In[ ]:

TRAIN, DEV, TEST = 0, 1, 2

train_query = session.query(DiseaseGene).filter(DiseaseGene.split == TRAIN)
candidates = train_query.all()
sv = SentenceNgramViewer(candidates, session)

# In[ ]:

sv

# # Label Functions
# This is the fundamental part of the project. The label functions below give
# a candidate a label of 1, 0 or -1, meaning correct relation, not sure and
# incorrect relation respectively. The goal is to develop functions that can
# label as many candidates as possible.

# In[ ]:

gene_list = pd.read_csv('epilepsy-genes.tsv', sep="\t")

# In[ ]:
n_docs = 500 doc_preprocessor = TSVDocPreprocessor( 'pdfs_big.tsv', max_docs=n_docs) # new files (88 papers) corpus_parser = CorpusParser(parser=Spacy()) corpus_parser.apply(doc_preprocessor, count=n_docs) VirusHost = candidate_subclass('VirusHost', ['virus', 'host']) ngrams = Ngrams(n_max=10) virus_matcher = DictionaryMatch(d=virus_list) animals_matcher = DictionaryMatch(d=animals_list) cand_extractor = CandidateExtractor(VirusHost, [ngrams, ngrams], [ virus_matcher, animals_matcher], nested_relations=True) docs = session.query(Document).order_by(Document.name).all() # Text Pattern based labeling functions, which look for certain keywords # List to parenthetical def ltp(x): return '(' + '|'.join(x) + ')' # -------------------------------- # Positive LFs: detect = {'detect', 'detects', 'detected',
# Parse the training corpus; %time is an IPython magic that reports how long
# the statement took.
cp = CorpusParser(doc_parser, sent_parser)
%time corpus = cp.parse_corpus(session, 'Protein Training')

# Re-point the document parser at each remaining file and parse it under its
# own corpus name.
# NOTE(review): recovered from a whitespace-collapsed export; whether the
# commit sits inside or after the loop is inferred -- confirm.
for name, path in [('Protein Development', 'data/protein_dev.tsv'), ('Protein Test', 'data/protein_test.tsv')]:
    doc_parser.path=path
    %time corpus = cp.parse_corpus(session, name)
    session.commit()

from snorkel import SnorkelSession
# A fresh session replaces the one used for parsing above.
session = SnorkelSession()

from snorkel.models import Corpus

# Reload the training corpus by name; the bare `corpus` line displays it in
# the notebook.
corpus = session.query(Corpus).filter(Corpus.name == 'Protein Training').one()
corpus

# Gather every sentence of every document in the training corpus.
sentences = set()
for document in corpus:
    for sentence in document.sentences:
        sentences.add(sentence)

from snorkel.candidates import Ngrams
from snorkel.models import candidate_subclass
#entity = candidate_subclass('entity', ['entity1', 'entity2'])

import pandas as pd
ROOT = 'data/dicts/'
# Protein names: column 1 of a header-less CSV indexed by column 0, with rows
# containing missing values dropped, collected into a set.
proteins = set(pd.read_csv(ROOT + 'protein_names.csv', header=None, index_col=0, encoding='utf-8').dropna()[1])
# Single-token candidate spans only.
ngrams = Ngrams(n_max=1)
from snorkel.matchers import DictionaryMatch
if print_stats: self.print_report(speaker, apologize, ask_agency, give_agency, \ gratitude, please) return { 'apologize': apologize, 'ask_agency': ask_agency, 'give_agency': give_agency, 'gratitude': gratitude, 'please': please } if __name__ == '__main__': session = SnorkelSession() docs = session.query(ReconDocument) i = 0 p = PolitenessExtractor() ''' Stupid code because I don't know how to get the length of the array of documents returned by the query. i.e. for i in range(len(docs)): does not work because len(docs) is not a thing... ''' while True: try: p.compute_score(docs[i], print_stats=False) i += 1 except:
for line in open(filename, "r", errors='ignore').readlines(): if line.split("\t")[0] in docID or len(line.split("\t")) != 2: continue docID.add(line.split("\t")[0]) fout.write(line.replace("\n", " ").strip() + "\n") print("total docID count", len(docID)) doc_preprocessor = TSVDocPreprocessor(newfile, encoding="utf-8", max_docs=n_docs) from snorkel.parser.spacy_parser import Spacy from snorkel.parser import CorpusParser from snorkel.models import Document, Sentence # defined in context.py file if session.query(Document).count() == 0: corpus_parser = CorpusParser(parser=Spacy()) corpus_parser.apply(doc_preprocessor, count=n_docs) # ,parallelism=5) print("Documents:", session.query(Document).count()) from snorkel import SnorkelSession from snorkel.parser.spacy_parser import Spacy from snorkel.parser import CorpusParser from snorkel.models import Document, Sentence from collections import defaultdict import numpy as np session = SnorkelSession() docs = session.query(Document).all() sents = session.query(Sentence).all() # get all sentences from snorkel.db
def get_gold_labels(session: SnorkelSession,
                    annotator_name: str = "gold") -> List[dict]:
    """Return every GoldLabel whose key is the key named ``annotator_name``.

    The first GoldLabelKey with a matching name is looked up (this is None
    when no such key exists) and then used to filter the GoldLabel table.
    """
    key_query = session.query(GoldLabelKey).filter(
        GoldLabelKey.name == annotator_name)
    annotator_key = key_query.first()

    label_query = session.query(GoldLabel).filter(
        GoldLabel.key == annotator_key)
    return label_query.all()
# Labeling functions to evaluate.
LFs = [
    lf_preceding_half_missing_quantity,
    lf_preceding_half,
]

# Extraction type -- should be a subfield in your data source extractions field!
from dataset_utils import create_candidate_class

extraction_type = 'price'

# Candidate class (and its name) for this extraction type.
candidate_class, candidate_class_name = create_candidate_class(extraction_type)

# Report how many candidates are stored in split 0.
from snorkel.models import Document, Sentence

split_zero_query = session.query(candidate_class).filter(
    candidate_class.split == 0)
print("==============================")
print(f"DB contents for {postgres_db_name}:")
print("Number of candidates:", split_zero_query.count())
print("==============================")

# Split to pull eval candidates from
eval_split = 0

# Load the evaluation candidates ordered by id.
eval_query = session.query(candidate_class).filter(
    candidate_class.split == eval_split)
eval_cands = eval_query.order_by(candidate_class.id).all()
print(f'Loaded {len(eval_cands)} candidates...')

# Apply the labeling functions to the evaluation split.
print("Applying LFs...")
from snorkel.annotations import LabelAnnotator

labeler = LabelAnnotator(lfs=LFs)
L_eval = labeler.apply(split=eval_split, parallelism=parallelism)
session = SnorkelSession()

# Load and parse the TSV corpus with the spaCy-backed parser.
doc_preprocessor = TSVDocPreprocessor(path)
corpus_parser = CorpusParser(parser=Spacy())
corpus_parser.apply(doc_preprocessor)

# Unary candidate; the matcher's ".*" pattern places no restriction on the
# text of a span.
pairs = candidate_subclass('pairs1', ['queryPair'])
regexpmatch = RegexMatchSpan(rgx=".*")
cs = queryCandidate()
cand_extractor = CandidateExtractor(pairs, [cs], [regexpmatch])

docs = session.query(Document).order_by(Document.name).all()
sentences = session.query(Sentence).all()
#print(sentences)

# Gather the sentences of every document and extract candidates from them.
sents = set()
for doc in docs:
    sents.update(doc.sentences)

cand_extractor.apply(sents)
print("Number of candidates:", session.query(pairs).count())

labeler = LabelAnnotator(lfs=LFs)
def main(argv):
    """Split the stored sentences and extract "General" and "Background" segments.

    Command-line options (read by argparse from the process arguments; the
    ``argv`` parameter itself is not consulted):
        --dbPath: path of the snorkel SQLite database.
        --lfPath: path of the module that defines the labelling functions.
    """
    cwd_prefix = os.getcwd() + os.sep
    parser = argparse.ArgumentParser(description='Process some arguments.')
    parser.add_argument(
        '--dbPath',
        type=str,
        default=cwd_prefix + 'snorkel.db',
        help='the path of snorkel database')
    parser.add_argument(
        '--lfPath',
        type=str,
        default=cwd_prefix + 'util_default.py',
        help='the path of util.py file where labelling functions were defined')
    args = parser.parse_args()

    # Load the labelling-function module from the given path.
    util_module = imp.load_source("module.name", args.lfPath)

    # maps doc_id into a dict of
    # ["Background", "Purpose", "Mechanism", "Method", "Finding"]
    train_doc_breakdown_map = dict()
    test_doc_breakdown_map = dict()

    # Connect to db, and get session
    SnorkelSession = create_session_with_conn("sqlite:///" + args.dbPath)
    session = SnorkelSession()

    print("Documents:", session.query(Document).count())
    print("Sentences:", session.query(Sentence).count())

    # The n-gram space is sized to the longest sentence in the corpus.
    sents = session.query(Sentence).all()
    n_max_corpus = max((len(sent.words) for sent in sents), default=0)
    print("The longest sentence has " + str(n_max_corpus) + " tokens.")
    ngrams = Ngrams(n_max=n_max_corpus)

    # from util import number_of_people
    docs = session.query(Document).all()
    train_sents = set()
    dev_sents = set()
    test_sents = set()
    for i, doc in enumerate(docs):
        for s in doc.sentences:
            # Documents named "cscw18..." replace the earlier 10% test
            # documents (cscw'18 annotation guideline 10 examples). Of the
            # rest, every document at position 8 mod 10 goes to dev and the
            # others go to train.
            if doc.name[:6] == "cscw18":
                test_sents.add(s)
            elif i % 10 == 8:
                dev_sents.add(s)
            else:
                train_sents.add(s)

    General, general_extractor = util_module.get_segment_class_and_matcher(
        "General", ngrams)
    general_cands = extract_and_display(
        train_sents,
        dev_sents,
        test_sents,
        session,
        general_extractor,
        General,
        "General",
        train_doc_breakdown_map=train_doc_breakdown_map,
        test_doc_breakdown_map=test_doc_breakdown_map)
    # Wait for the user before moving on to the next segment type.
    input("Finished general ")

    # load segment_candidate_class and corresponding_matcher, e.g.
    # (Background, non_comma_dict_background_matcher)
    Background, background_matcher = util_module.get_segment_class_and_matcher(
        "Background", ngrams)
    background_cands = extract_and_display(
        train_sents,
        dev_sents,
        test_sents,
        session,
        background_matcher,
        Background,
        "Background",
        train_doc_breakdown_map=train_doc_breakdown_map,
        test_doc_breakdown_map=test_doc_breakdown_map)
    debug_sess_eval(session, Background, background_matcher)
# Training spreadsheet: every candidate id is used.
train_df = pd.read_excel(spreadsheet_names['train'])
train_cids = [int(cid) for cid in train_df.candidate_id.astype(int)]
train_df.head(2)

# In[ ]:

# Development spreadsheet: keep only rows with a curated_dsh value.
dev_df = pd.read_excel(spreadsheet_names['dev'])
dev_df = dev_df[dev_df.curated_dsh.notnull()]
dev_cids = [int(cid) for cid in dev_df.candidate_id.values]
dev_df.head(2)

# In[ ]:

# Hand-labelled training spreadsheet: ids are taken only from rows with a
# curated_dsh value, while the frame itself is left unfiltered.
train_hand_df = pd.read_excel(spreadsheet_names['train_hand_label'])
curated_train_hand_df = train_hand_df[train_hand_df.curated_dsh.notnull()]
train_hand_cids = curated_train_hand_df.candidate_id.astype(int).tolist()
train_hand_df.head(2)

# In[ ]:

# Apply the labeler to each id list in turn, timing every run. The magic
# string refers to `cids` and `labeler` by name, so those names must stay.
for cid_list in (train_cids, train_hand_cids, dev_cids):
    cids = session.query(CompoundGene.id).filter(
        CompoundGene.id.in_(cid_list))
    get_ipython().magic(u'time labeler.apply(cids_query=cids, parallelism=5)')
# Parsing
corpus_parser = CorpusParser()

# Note: Parallelism can be run with a Postgres DBMS, but not SQLite
# The first call uses the default `clear`; the later ones pass clear=False.
corpus_parser.apply(list(train_preprocessor))
for later_preprocessor in (dev_preprocessor, test_preprocessor):
    corpus_parser.apply(list(later_preprocessor), clear=False)

# Retrieving Stable IDs for each of the candidate sentences
with open('articles/doc_ids.pkl', 'rb') as f:
    train_ids, dev_ids, test_ids = (set(ids) for ids in load(f))

train_sents, dev_sents, test_sents = set(), set(), set()
docs = session.query(Document).order_by(Document.name).all()

# Assigning each sentence to {train,dev,test}-set based on Stable ID.
# The first id set containing the document name wins.
split_targets = (
    (train_ids, train_sents),
    (dev_ids, dev_sents),
    (test_ids, test_sents),
)
for doc in docs:
    for s in doc.sentences:
        for id_set, sent_set in split_targets:
            if doc.name in id_set:
                sent_set.add(s)
                break
        else:
            raise Exception('ID <{0}> not found in any id set'.format(
                doc.name))

#----------------------
) as candidate_sen inner join sentence on candidate_sen.sentence_id=sentence.id ''' candidate_sentence_df = pd.read_sql(sql, database_str).astype( {"entrez_gene_id": int}) candidate_sentence_df.head(2) # In[10]: total_candidates_df = (edge_level_df.merge(candidate_sentence_df, on=["doid_id", "entrez_gene_id"])) total_candidates_df.head(2) # In[11]: dev_candidates = (session.query(DiseaseGene).filter( DiseaseGene.id.in_( total_candidates_df.query("split==1").sample( 10000, random_state=100).candidate_id.tolist())).all()) dev_df = make_sentence_df(dev_candidates) dev_df.head(2) # In[12]: test_candidates = (session.query(DiseaseGene).filter( DiseaseGene.id.in_( total_candidates_df.query("split==2").sample( 10000, random_state=120).candidate_id.tolist())).all()) test_df = make_sentence_df(test_candidates) test_df.head(2) # In[13]:
# Attach candidate information to the dev (split 7) and test (split 8)
# entity pairs that have at least one sentence. The gene id column is cast
# to int before merging.
pair_keys = ["drugbank_id", "entrez_gene_id"]
gene_id_as_int = {"entrez_gene_id": int}
dev_set_df = entity_level_df.query("split==7&has_sentence==1").merge(
    dev_candidate_df.astype(gene_id_as_int), on=pair_keys)
test_set_df = entity_level_df.query("split==8&has_sentence==1").merge(
    test_candidate_df.astype(gene_id_as_int), on=pair_keys)

# In[16]:

# One table covering the training, dev and test splits.
total_candidates_df = training_set_df.append(dev_set_df)
total_candidates_df = total_candidates_df.append(test_set_df)

# In[17]:

# Fixed-seed sample of 10,000 dev candidates.
dev_sample_ids = (total_candidates_df
                  .query("split==7")
                  .sample(10000, random_state=100)
                  .candidate_id
                  .tolist())
dev_candidates = session.query(CompoundGene).filter(
    CompoundGene.id.in_(dev_sample_ids)).all()
dev_df = make_sentence_df(dev_candidates)
dev_df.head(2)

# In[18]:

# Fixed-seed sample of 10,000 test candidates.
test_sample_ids = (total_candidates_df
                   .query("split==8")
                   .sample(10000, random_state=100)
                   .candidate_id
                   .tolist())
test_candidates = session.query(CompoundGene).filter(
    CompoundGene.id.in_(test_sample_ids)).all()
test_df = make_sentence_df(test_candidates)
test_df.head(2)

# In[ ]: