from pathlib import Path
from typing import Union

import ujson
from snorkel import SnorkelSession
# Location of reload_annotator_labels may vary across Snorkel versions
from snorkel.db_helpers import reload_annotator_labels
from snorkel.models import StableLabel


def reload_external_labels(session: SnorkelSession, input_file: Union[str, Path], annotator_name: str = "gold"):
    # get_candidate_class() is assumed to be defined elsewhere in this project
    Education = get_candidate_class()
    with open(str(input_file), "r") as f:
        lbls = ujson.load(f)
    for lbl in lbls:
        # We check if the label already exists, in case this cell was already executed
        context_stable_ids = "~~".join((lbl['person'], lbl['organization']))
        query = session.query(StableLabel).filter(
            StableLabel.context_stable_ids == context_stable_ids)
        query = query.filter(StableLabel.annotator_name == annotator_name)
        if query.count() == 0:
            session.add(StableLabel(
                context_stable_ids=context_stable_ids,
                annotator_name=annotator_name,
                value=lbl['value']))

    # Commit session
    session.commit()

    # Reload annotator labels for the dev (1) and test (2) splits
    reload_annotator_labels(session, Education, annotator_name,
                            split=1, filter_label_split=False)
    reload_annotator_labels(session, Education, annotator_name,
                            split=2, filter_label_split=False)
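
# Usage sketch (illustrative): assumes a populated Snorkel database with
# Education candidates on splits 1 and 2, and a JSON file of records shaped
# like {"person": ..., "organization": ..., "value": ...}, matching the fields
# read above. The file name is hypothetical.
if __name__ == "__main__":
    session = SnorkelSession()
    reload_external_labels(session, "gold_labels.json", annotator_name="gold")
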
import random

import numpy as np
from snorkel.annotations import LabelAnnotator

# cand_list, cand_dict, train_cand_list, n, session, RawText, and Tweet are
# assumed to be defined earlier in this script.
random.shuffle(cand_list)
# Take 40% from here for the generative-model training split
train_cand_list += cand_list[0:int(len(cand_list) * 0.4)]

print(" - number of pairs:", len(cand_dict))
print(" - number of signals:", n)
print(" - number of pairs to train GEN model:", len(train_cand_list))

# Split 0 holds the training candidates; everything else goes to split 1
for cand in cand_list:
    split = 0 if cand in train_cand_list else 1
    raw_text = RawText(stable_id=cand, name=cand, text=cand)
    tweet = Tweet(tweet=raw_text, split=split)
    session.add(tweet)
session.commit()
print("Commit to snorkel database done...")


# Worker label generator: yields a (worker_id, label) pair per annotation,
# treating each crowd worker as a labeling function
def worker_label_generator(t):
    for worker_id in cand_dict[t.tweet.stable_id]:
        yield worker_id, cand_dict[t.tweet.stable_id][worker_id]


np.random.seed(1701)
labeler = LabelAnnotator(label_generator=worker_label_generator)
L_train = labeler.apply(split=0)
print(L_train.lf_stats(session))
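
# Next-step sketch: fit Snorkel's generative model to the worker label matrix
# to estimate worker accuracies and produce probabilistic training labels,
# following the standard Snorkel crowdsourcing recipe. The hyperparameters
# below are illustrative, not tuned values from this project.
from snorkel.learning import GenerativeModel

gen_model = GenerativeModel(lf_propensity=True)
gen_model.train(L_train, reg_type=2, reg_param=0.1, epochs=30)
train_marginals = gen_model.marginals(L_train)
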
import csv

from snorkel import SnorkelSession
from snorkel.candidates import CandidateExtractor, Ngrams
# reRNN lives here in Snorkel v0.7 (snorkel.learning in some earlier releases)
from snorkel.learning.disc_models.rnn import reRNN
from snorkel.models import Document, candidate_subclass
from unidecode import unidecode


def run(candidate1, candidate2, pairing_name, cand1_ngrams, cand2_ngrams,
        cand1Matcher, cand2Matcher, model_name, output_file_name, corpus_parser):
    # corpus_parser is accepted for API compatibility but not used here
    print("Started")
    session = SnorkelSession()

    # Define the candidate pair subclass for this pairing
    candidate_pair = candidate_subclass(pairing_name, [candidate1, candidate2])

    # Collect all sentences from the parsed documents
    sentences = set()
    docs = session.query(Document).order_by(Document.name).all()
    for doc in docs:
        for s in doc.sentences:
            sentences.add(s)

    cand_1_ngrams = Ngrams(n_max=cand1_ngrams)
    cand_2_ngrams = Ngrams(n_max=cand2_ngrams)

    # Build the candidate extractor for the two spans
    candidate_extractor = CandidateExtractor(
        candidate_pair, [cand_1_ngrams, cand_2_ngrams],
        [cand1Matcher, cand2Matcher])

    # Extract candidates into split 4
    candidate_extractor.apply(sentences, split=4, clear=True)
    cands = session.query(candidate_pair).filter(
        candidate_pair.split == 4).order_by(candidate_pair.id).all()
    session.commit()

    if len(cands) == 0:
        print("No Candidates Found")
        return

    if pairing_name == 'BiomarkerCondition':
        # Adjective boosting for BiomarkerCondition candidates;
        # add_adj_candidate_BC is assumed to be defined elsewhere in this project
        add_adj_candidate_BC(session, candidate_pair, cands, 4)
        session.commit()

    # Load the trained LSTM and predict over the extracted candidates
    lstm = reRNN(seed=1701, n_threads=None)
    lstm.load(model_name)
    predictions = lstm.predictions(cands)

    # Write one row per candidate pair with its prediction
    with open(output_file_name, 'w', newline='') as output_file:
        csv_writer = csv.writer(output_file)
        csv_writer.writerow(
            ['doc_id', 'sentence', candidate1, candidate2, 'prediction'])
        for i in range(len(cands)):
            # Document repr is "Document <name>"; strip the prefix to recover the PMC id
            doc_string = 'PMC' + str(cands[i].get_parent().get_parent())[9:]
            sentence_string = cands[i].get_parent().text
            cand_1_string = cands[i].get_contexts()[0].get_span()
            cand_2_string = cands[i].get_contexts()[1].get_span()
            prediction = predictions[i]
            csv_writer.writerow([
                unidecode(doc_string), unidecode(sentence_string),
                unidecode(cand_1_string), unidecode(cand_2_string), prediction
            ])
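
# Example invocation (hypothetical): the matcher getters mirror the helper
# names in this project's matchers module, and the model/output paths are
# placeholders only.
#
#     run('Biomarker', 'Condition', 'BiomarkerCondition', 5, 7,
#         matchers.getBiomarkerMatcher(), matchers.getConditionMatcher(),
#         'biomarker_condition_lstm', 'predictions.csv', corpus_parser)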