def run_quickumls_server(opts):
    matcher = QuickUMLS(
        quickumls_fp=opts.quickumls_fp,
        threshold=opts.threshold,
        similarity_name=opts.similarity_name,
        window=opts.window,
        min_match_length=opts.min_match_length,
        verbose=opts.verbose,
    )
    run_server(matcher, host=opts.host, port=opts.port, buffersize=4096)
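# Hedged usage sketch (not part of the original source): it assumes the options
# object simply exposes the attributes read above, e.g. an argparse.Namespace,
# and that "/path/to/quickumls-install" is a placeholder for an existing
# QuickUMLS data directory. The values below are illustrative only.
import argparse

example_opts = argparse.Namespace(
    quickumls_fp="/path/to/quickumls-install",  # placeholder install path
    threshold=0.7,
    similarity_name="jaccard",
    window=5,
    min_match_length=3,
    verbose=False,
    host="localhost",
    port=4645,  # placeholder port
)
# run_quickumls_server(example_opts)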
# NOTE: the original def line is missing here; the header below is a reconstruction
# and the name word_mover_distance is not from the source.
def word_mover_distance(term1, term2):
    dist = model.wmdistance(term1.lower().split(), term2.lower().split())
    return dist


def get_umls_tagging(text, matcher):
    info = matcher.match(text, best_match=True, ignore_syntax=False)
    taggings = []
    if len(info) == 0:
        return None
    for one_c in info:
        one_c = one_c[0]
        print(one_c)
        result = {"cui": one_c["cui"], "term": one_c["term"]}
        taggings.append(result)
    return taggings


from QuickUMLS.quickumls import QuickUMLS

matcher = QuickUMLS("/home/tk2624/tools/QuickUMLS", threshold=0.8)
text = "tension-free hernioplasty"
print(get_umls_tagging(text, matcher))

# def main():
#     print("hrmmo world")
#
#
# if __name__ == "__main__":
#     main()
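# Hedged sketch only (not from the original file): the wmdistance() call above
# assumes a gensim word-embedding model bound to the module-level name `model`.
# One way to provide it; the embedding path is a placeholder.
# from gensim.models import KeyedVectors
# model = KeyedVectors.load_word2vec_format("/path/to/embeddings.bin", binary=True)
# print(word_mover_distance("tension-free hernioplasty", "hernia repair"))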
def main():
    # setting for output error list
    if not os.path.exists(FLAGS.output_dir):
        try:
            createdir = "mkdir " + FLAGS.output_dir
            os.system(createdir)
        except:
            print("DIR ERROR! Unable to create this directory!")
    exception_dir = os.path.join(FLAGS.output_dir + "/exceptionlist.txt")
    except_out = codecs.open(exception_dir, "w")

    # setting for bert model
    # tf.logging.set_verbosity(tf.logging.INFO)
    processors = {
        "bc5cdr": BC5CDRProcessor,
    }
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    task_name = "bc5cdr"  # FLAGS.task_name.lower()
    processor = processors[task_name]()
    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    # setting for TPU ---
    tpu_cluster_resolver = None
    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

    # setting for quickUMLS
    matcher = None
    if config.use_UMLS > 0:
        # QuickUMLS matcher
        from QuickUMLS.quickumls import QuickUMLS
        matcher = QuickUMLS(parser_config.QuickUMLS_dir, threshold=0.8)

    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=config.bluebert_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None

    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list) + 1,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)

    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    with tf.gfile.Open(os.path.join(config.bluebert_dir, 'label2id.pkl'), 'rb') as rf:
        label2id = pickle.load(rf)
        id2label = {value: key for key, value in label2id.items()}

    print("Start parsing PICO elements.")

    # start reading each file (abstract) and predict
    count = 1
    for f in os.listdir(FLAGS.data_dir):
        if not f.endswith(".txt"):
            continue
        input_file = os.path.join(FLAGS.data_dir, f)
        predict_examples = processor.get_pred_examples(input_file)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        filed_based_convert_examples_to_features(
            predict_examples, label_list, FLAGS.max_seq_length,
            tokenizer, predict_file, mode="test")

        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        # ======= prediction ===========
        result = estimator.predict(input_fn=predict_input_fn)
        result = list(result)
        pmid = re.sub(r"\.txt", "", f)
        # output_predict_file = os.path.join(FLAGS.output_dir, pmid + "_output_predict_file")
        # output_err_file = os.path.join(FLAGS.output_dir, pmid + "_output_err_file")
        # result_to_pair(predict_examples, result, id2label, output_predict_file, output_err_file)
        words, preds = result_to_pair_for_return(predict_examples, result, id2label)
        pmid = re.sub(r"\.txt", "", f)
        outfile_dir = codecs.open(os.path.join(FLAGS.output_dir, pmid + ".json"), "w")
        try:
            json_out = formalization.generate_json(
                words, preds, matcher, pmid,
                sent_tags=[],
                entity_tags=["Participant", "Intervention", "Outcome"],
                attribute_tags=["measure", "modifier"],
                relation_tags=[])
            outfile_dir.write(json_out)
            if count % 50 == 0:
                print("processing the", count, "th abstracts...")
            count += 1
        except:
            except_out.write(f + "\n")

    print("Saved all parsing results in " + FLAGS.output_dir)
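# Hedged configuration sketch (not from the original source): main() above reads
# its settings from TF 1.x-style FLAGS plus separate `config`/`parser_config`
# objects. The flag names mirror the ones referenced in the code; all paths and
# values below are placeholders.
# tf.flags.DEFINE_string("data_dir", "/path/to/abstracts", "Folder of *.txt abstracts to parse.")
# tf.flags.DEFINE_string("output_dir", "/path/to/output", "Folder for *.json parsing results.")
# tf.flags.DEFINE_string("vocab_file", "/path/to/vocab.txt", "BERT vocabulary file.")
# tf.flags.DEFINE_string("bert_config_file", "/path/to/bert_config.json", "BERT model config.")
# tf.flags.DEFINE_integer("max_seq_length", 128, "Maximum WordPiece sequence length.")
# FLAGS = tf.flags.FLAGS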
def get_qmatcher():
    return QuickUMLS(QUICKUMLS_LOCATION_WIDGET.value,
                     accepted_semtypes=constants.ACCEPTED_CATEGORIES,
                     overlapping_criteria='score')
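# Hedged usage sketch (not from the original source): QUICKUMLS_LOCATION_WIDGET is
# assumed to be a notebook widget whose .value holds the QuickUMLS install path.
# qmatcher = get_qmatcher()
# for candidates in qmatcher.match("tension-free hernioplasty", best_match=True, ignore_syntax=False):
#     print(candidates[0]["cui"], candidates[0]["term"], candidates[0]["similarity"])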
def do_quickumls(containers, note_offset=0):
    ### TO-DO: have this take in the path to quickumls, don't iterate directly here (do spacy style folder thing)
    ### ideally would alter this to handle other cuis
    from QuickUMLS.quickumls import QuickUMLS
    quickumls_fp = '/usr/local/lib/python2.7/dist-packages/QuickUMLS/quickUMLS-install'
    matcher = QuickUMLS(quickumls_fp=quickumls_fp, overlapping_criteria='length',
                        threshold=.7, similarity_name='cosine')

    if type(containers) is not list:
        containers = [containers]

    file_range = 21
    fileNums = range(20, file_range)
    for number in fileNums:
        # if j != 21:
        #     j += 1
        #     continue
        # j += 1
        # if number != 27: continue  ### visualize interesting doc

        # folderPath = '/home/john/Desktop/brat-v1.3_Crunchy_Frog/data/rapid-annotated/'
        folderPath = '/home/john/Desktop/nlp_work/test-annotations/'
        fullInputPath = folderPath + str(number) + "-classified.txt"
        thisFile = list(csv.reader(open(fullInputPath, 'rU'), delimiter=','))
        file_text = thisFile[0][-1]
        file_labels = thisFile[0][:-1]
        file_label_string = ','.join(file_labels)
        annotation_offset = len(file_label_string) + 2  # +2 accounts for " and \n after labels
        print 'label string'
        print file_label_string
        print 'file length'
        print len(thisFile[0])

        ### 1. RUN QUICK UMLS ON FILE ####
        found_entities, parsed_doc = matcher.match(text=file_text, best_match=True, ignore_syntax=False)
        # for entity in found_entities:
        #     print entity

        subject_entities = []
        negation_entities = []
        qualifier_entities = []
        for cui_container in containers:
            subject_cuis = cui_container.subject_cuis
            qualifier_cuis = cui_container.qualifier_cuis
            negation_cuis = cui_container.negation_cuis
            for findings in found_entities:
                while type(findings) == list:
                    findings = findings[0]
                ### 2. Compare CUIs to CUI tree ####
                ## TO-DO: add in complication and such (custom job)
                if findings['cui'] in subject_cuis:
                    subject_entities.append(findings)
                elif findings['cui'] in negation_cuis and findings['similarity'] > .9:
                    ## need higher similarity to avoid lots of false positives
                    ## (e.g. inflammatory as negation for noninflammatory)
                    negation_entities.append(findings)
                    print 'FOUND NEG'
                    print negation_entities
                elif findings['cui'] in qualifier_cuis:
                    qualifier_entities.append(findings)

            #### TO-DO: Figure out more graceful way to handle this
            if cui_container.cui_to_morphological_variants_dic:
                regex_subject_findings = regex_matcher(
                    parsed_doc=parsed_doc,
                    cui_to_morphological_variants_dic=cui_container.cui_to_morphological_variants_dic,
                    annotation_offset=288)
            else:
                regex_subject_findings = None
            if regex_subject_findings:
                subject_entities += regex_subject_findings
            # regex_surgery_findings = regex_matcher(
            #     parsed_doc=parsed_doc,
            #     cui_to_morphological_variants_dic=cui_container.cui_to_morphological_variants_dic,
            #     annotation_offset=288)
            ## TO-DO: FIX annotation offset hardcoding

        ### TO-DO ##
        ### need to repeat regex steps with simple surgery type things

        #### MAKE KEY VALUES HERE ####
        #### def make_key_value_objects ####
        ### FIND ANATOMY, ACTIVITY, OR COMPLICATION TERMS (BY THEMSELVES OR COMBINED)
        ### FIND NEGATION AND QUALIFIERS
        #### MAKE FEATURE ANNOTATIONS HERE ####
        for finding in subject_entities:
            finding['concept_type'] = 'subject'
            finding['priority'] = 1
        for finding in negation_entities:
            finding['concept_type'] = 'negation'
            finding['priority'] = 2
        for finding in qualifier_entities:
            finding['concept_type'] = 'qualifier'
            finding['priority'] = 2

        from operator import itemgetter
        all_findings = subject_entities + negation_entities + qualifier_entities
        all_findings = removeOverlappingPhrases(all_findings)
        relevant_findings = []
        all_findings.sort(key=itemgetter('priority', 'sentence'))
        count = 1

        #### WEED OUT UNNECESSARY TERMS. MAKE KEY VALUE OBJECTS ####
        from keyValue import keyValue
        key_value_dic = {}
        relevant_sentences = []
        for finding in all_findings:
            count += 1
            if finding['priority'] == 1:
                relevant_findings.append(finding)
                this_sentence = finding['sentence']
                relevant_sentences.append(this_sentence)
                this_kv = keyValue(concept=finding['cui'],
                                   concept_related_terms=[finding['ngram'],
                                                          finding['start'] + note_offset,
                                                          finding['end'] + note_offset],
                                   sentence=this_sentence)
                if this_sentence in key_value_dic:
                    current_kvs = key_value_dic[this_sentence]
                    current_kvs.append(this_kv)
                    key_value_dic[this_sentence] = current_kvs
                else:
                    key_value_dic[this_sentence] = [this_kv]
            ### currently done crudely (add negation/qualifier to ALL subjects. Could leverage spacy dependencies) ###
            elif finding['priority'] != 1:
                this_sentence = finding['sentence']
                if this_sentence in relevant_sentences:
                    relevant_findings.append(finding)
                    if finding['concept_type'] == 'qualifier':
                        print 'ACTIVATED'
                        print finding
                        for kv in key_value_dic[this_sentence]:
                            kv.modifiers = [finding['cui'], finding['ngram'],
                                            finding['start'] + note_offset,
                                            finding['end'] + note_offset]
                    elif finding['concept_type'] == 'negation':
                        for kv in key_value_dic[this_sentence]:
                            kv.negation = [finding['cui'], finding['ngram'],
                                           finding['start'] + note_offset,
                                           finding['end'] + note_offset]

        ##############################################################
        for kvs in key_value_dic:
            for kv in key_value_dic[kvs]:
                print kv.sentence
                print kv.concept
                print kv.concept_related_terms
                print kv.negation
                print kv.modifiers

    return all_findings
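# Hedged usage sketch (not from the original source): the container below is a
# hypothetical stand-in exposing the attributes do_quickumls() reads
# (subject_cuis, qualifier_cuis, negation_cuis, cui_to_morphological_variants_dic);
# the real container/keyValue classes live elsewhere in this project, and the
# CUI strings are placeholders.
# class ExampleCuiContainer(object):
#     subject_cuis = {'C0000001'}               # placeholder subject CUI
#     qualifier_cuis = {'C0000002'}             # placeholder qualifier CUI
#     negation_cuis = {'C0000003'}              # placeholder negation CUI
#     cui_to_morphological_variants_dic = None  # skip the regex_matcher pass
#
# findings = do_quickumls(ExampleCuiContainer(), note_offset=0)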
if not sys.warnoptions:
    warnings.simplefilter("ignore")

time0 = time.time()
config = Config()
model = NERModel(config)
model.build()
model.restore_session(config.dir_model)
time1 = time.time()

parser_config = parser_Config()
matcher = None
if parser_config.use_UMLS > 0:
    # QuickUMLS matcher
    from QuickUMLS.quickumls import QuickUMLS
    matcher = QuickUMLS(parser_config.QuickUMLS_dir, threshold=0.8)
print("\nloading model...", time1 - time0)


def main():
    # predict
    input = parser_config.infile_dir
    time2 = time.time()
    from parser import text_tokenize
    tokenizer = text_tokenize.mytokenizer()
    infile = codecs.open(input, "r")  # assume each line is one abstract: pmid||abstracttext
    outdir = parser_config.outjson_dir
    exception_dir = os.path.join(outdir + "/exceptionlist.txt")
    except_out = codecs.open(exception_dir, "w")