def run_quickumls_server(opts):
    matcher = QuickUMLS(quickumls_fp=opts.quickumls_fp,
                        threshold=opts.threshold,
                        similarity_name=opts.similarity_name,
                        window=opts.window,
                        min_match_length=opts.min_match_length,
                        verbose=opts.verbose)

    run_server(matcher, host=opts.host, port=opts.port, buffersize=4096)
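

# Illustrative only: one way to build the `opts` namespace that
# run_quickumls_server expects. The flag names and defaults below are
# assumptions mirroring the QuickUMLS constructor arguments; they are not
# part of the original script.
import argparse

def parse_server_args():
    parser = argparse.ArgumentParser(
        description="Serve a QuickUMLS matcher over a socket")
    parser.add_argument("quickumls_fp",
                        help="path to the QuickUMLS installation/data directory")
    parser.add_argument("--threshold", type=float, default=0.7)
    parser.add_argument("--similarity_name", default="jaccard")
    parser.add_argument("--window", type=int, default=5)
    parser.add_argument("--min_match_length", type=int, default=3)
    parser.add_argument("--verbose", action="store_true")
    parser.add_argument("--host", default="localhost")
    parser.add_argument("--port", type=int, default=4645)
    return parser.parse_args()

# e.g.: run_quickumls_server(parse_server_args())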


def get_word_mover_distance(model, term1, term2):
    # Word Mover's Distance between two multi-word terms, computed with a
    # gensim word-embedding model (e.g. KeyedVectors.wmdistance).
    dist = model.wmdistance(term1.lower().split(), term2.lower().split())
    return dist


def get_umls_tagging(text, matcher):
    info = matcher.match(text, best_match=True, ignore_syntax=False)
    taggings = []
    if len(info) == 0:
        return None
    for one_c in info:
        one_c = one_c[0]  # keep only the top-ranked candidate for each matched span
        print(one_c)
        result = {"cui": one_c["cui"], "term": one_c["term"]}
        taggings.append(result)
    return taggings


from QuickUMLS.quickumls import QuickUMLS

matcher = QuickUMLS("/home/tk2624/tools/QuickUMLS", threshold=0.8)

text = "tension-free hernioplasty"
print(get_umls_tagging(text, matcher))
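# Illustrative output shape only; the actual CUI depends on the installed UMLS
# release, so it is left as a placeholder here:
# [{'cui': 'C…', 'term': 'tension-free hernioplasty'}]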

# def main():
#     print("hello world")
#
#
# if __name__ == "__main__":
#     main()
Example #3
def main():

    # setting for output error list
    if not os.path.exists(FLAGS.output_dir):
        try:
            os.makedirs(FLAGS.output_dir)
        except OSError:
            print("DIR ERROR! Unable to create this directory!")

    exception_dir = os.path.join(FLAGS.output_dir, "exceptionlist.txt")
    except_out = codecs.open(exception_dir, "w")
    
    # setting for bert model
    #tf.logging.set_verbosity(tf.logging.INFO)
    processors = {
        "bc5cdr": BC5CDRProcessor,
    }

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    task_name = "bc5cdr"  # FLAGS.task_name.lower()
    processor = processors[task_name]()
    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
    
    # setting for TPU ---
    tpu_cluster_resolver = None
    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    
    # setting for quickUMLS
    matcher = None
    if config.use_UMLS > 0:
        # QuickUMLS matcher
        from QuickUMLS.quickumls import QuickUMLS
        matcher = QuickUMLS(parser_config.QuickUMLS_dir, threshold=0.8)


    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=config.bluebert_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))
    
    train_examples = None
    num_train_steps = None
    num_warmup_steps = None

    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list) + 1,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)

    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)


    with tf.gfile.Open(os.path.join(config.bluebert_dir, 'label2id.pkl'), 'rb') as rf:
        label2id = pickle.load(rf)
        id2label = {value: key for key, value in label2id.items()}
    print ("Start parsing PICO elements.") 
    # start reading each file (abstract) and predict
    count = 1
    for f in os.listdir(FLAGS.data_dir):
        
            if not f.endswith(".txt"):
                continue 
            input_file = os.path.join(FLAGS.data_dir, f)
            predict_examples = processor.get_pred_examples(input_file)
            predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")

            filed_based_convert_examples_to_features(
                predict_examples, label_list, FLAGS.max_seq_length, tokenizer,
                predict_file, mode="test")


            predict_drop_remainder = True if FLAGS.use_tpu else False
            predict_input_fn = file_based_input_fn_builder(
                input_file=predict_file, 
                seq_length=FLAGS.max_seq_length,
                is_training=False,
                drop_remainder=predict_drop_remainder)

            #======= prediction  ===========#
            result = estimator.predict(input_fn=predict_input_fn)        
            result = list(result)
            pmid = re.sub("\.txt","",f)            
            #output_predict_file= os.path.join(FLAGS.output_dir,pmid+"_output_predict_file")
            #output_err_file = os.path.join(FLAGS.output_dir,pmid+"_output_err_file")
            #result_to_pair(predict_examples, result, id2label, output_predict_file, output_err_file)
     
            words, preds = result_to_pair_for_return(predict_examples, result, id2label)
            pmid = re.sub("\.txt","",f)
            outfile_dir= codecs.open(os.path.join(FLAGS.output_dir,pmid+".json"),"w")
        try: 
                json_out = formalization.generate_json(
                    words, preds, matcher, pmid, sent_tags=[],
                    entity_tags=["Participant", "Intervention", "Outcome"],
                    attribute_tags=["measure", "modifier"], relation_tags=[])
                outfile_dir.write(json_out)
                if count % 50 == 0:
                    print("processed", count, "abstracts...")
                count += 1
            except Exception:
                except_out.write(f + "\n")
    print("Saved all parsing results in " + FLAGS.output_dir)
Example #4
def get_qmatcher():
    return QuickUMLS(QUICKUMLS_LOCATION_WIDGET.value,
                     accepted_semtypes=constants.ACCEPTED_CATEGORIES,
                     overlapping_criteria='score')
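
# Usage sketch (assumptions: QUICKUMLS_LOCATION_WIDGET is a notebook text
# widget holding the QuickUMLS installation path, and `constants` provides
# ACCEPTED_CATEGORIES; neither is shown in the original snippet):
# import ipywidgets as widgets
# QUICKUMLS_LOCATION_WIDGET = widgets.Text(value='/path/to/QuickUMLS-install')
# matcher = get_qmatcher()
# matches = matcher.match("acute chest pain", best_match=True, ignore_syntax=False)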
Example #5
def do_quickumls(containers, note_offset=0):
    ### TO-DO: have this take in the path to quickumls, don't iterate directly here (do spaCy-style folder handling)
    ### ideally would alter this to handle other CUIs
    from QuickUMLS.quickumls import QuickUMLS

    quickumls_fp = '/usr/local/lib/python2.7/dist-packages/QuickUMLS/quickUMLS-install'
    matcher = QuickUMLS(quickumls_fp=quickumls_fp, overlapping_criteria='length', threshold=.7,
                        similarity_name='cosine')
    if not isinstance(containers, list):
        containers = [containers]
    file_range=21
    fileNums = range(20,file_range)
    for number in fileNums:
        # if j !=21:
        #     j+=1
        #     continue
        # j += 1
        # if number != 27: continue ### visualize interesting doc
        # folderPath='/home/john/Desktop/brat-v1.3_Crunchy_Frog/data/rapid-annotated/'
        folderPath = '/home/john/Desktop/nlp_work/test-annotations/'
        fullInputPath = folderPath + str(number) + "-classified.txt"
        thisFile = list(csv.reader(open(fullInputPath, 'rU'), delimiter=','))
        file_text = thisFile[0][-1]
        file_labels=thisFile[0][:-1]
        file_label_string=','.join(file_labels)
        annotation_offset= len(file_label_string)+2 #+2 accounts for " and \n after labels
        print('label string')
        print(file_label_string)
        print('file length')
        print(len(thisFile[0]))

        ### 1. RUN QUICK UMLS ON FILE ####
        found_entities,parsed_doc = matcher.match(text=file_text, best_match=True, ignore_syntax=False)

        # for entity in found_entities:
        #     print entity
        subject_entities = []
        negation_entities = []
        qualifier_entities = []
        for cui_container in containers:
            subject_cuis = cui_container.subject_cuis
            qualifier_cuis = cui_container.qualifier_cuis
            negation_cuis = cui_container.negation_cuis
            for findings in found_entities:
                while isinstance(findings, list):
                    findings = findings[0]

            ### 2. Compare CUIs to CUI tree ####
                ## TO - DO - add in complication and such (custom job)

                if findings['cui'] in subject_cuis:
                    subject_entities.append(findings)
                elif findings['cui'] in negation_cuis and findings['similarity'] > .9: ## need higher similarity to avoid lots of false positives (e.g. inflammatory as negation for noninflammatory)
                    negation_entities.append(findings)
                    print('FOUND NEG')
                    print(negation_entities)
                elif findings['cui'] in qualifier_cuis:
                    qualifier_entities.append(findings)



            #### TO-DO: Figure out more graceful way to handle this
            if cui_container.cui_to_morphological_variants_dic:
                regex_subject_findings = regex_matcher(
                    parsed_doc=parsed_doc,
                    cui_to_morphological_variants_dic=cui_container.cui_to_morphological_variants_dic,
                    annotation_offset=288)
            else:
                regex_subject_findings = None
            if regex_subject_findings:
                subject_entities += regex_subject_findings
            # regex_surgery_findings=regex_matcher(parsed_doc=parsed_doc,cui_to_morphological_variants_dic=cui_container.cui_to_morphological_variants_dic,
            #               annotation_offset=288)
            ## TO-DO: FIX annotation offset hardcoding



            ### TO-DO ##
                ### need to repeat regex steps with simple surgery type things

            #### MAKE KEY VALUES HERE ####
              ####def make_key_value_objects####
                ### FIND ANATOMY, ACTIVITY, OR COMPLICATION TERMS (BY THEMSELVES OR COMBINED)
                ### FIND NEGATION AND QUALIFIERS
            #### MAKE FEATURE ANNOTATIONS HERE ####
            ####
        for finding in subject_entities:
            finding['concept_type'] = 'subject'
            finding['priority'] = 1
        for finding in negation_entities:
            finding['concept_type'] = 'negation'
            finding['priority'] = 2
        for finding in qualifier_entities:
            finding['concept_type'] = 'qualifier'
            finding['priority'] = 2
        from operator import itemgetter
        all_findings = subject_entities + negation_entities + qualifier_entities

        all_findings = removeOverlappingPhrases(all_findings)
        relevant_findings=[]
        all_findings.sort(key=itemgetter('priority', 'sentence'))
        count=1

        #### WEED OUT UNNECESSARY TERMS. MAKE KEY VALUE OBJECTS ####
        from keyValue import keyValue
        key_value_dic={}
        relevant_sentences = []
        for finding in all_findings:
            count+=1
            if finding['priority']==1:
                relevant_findings.append(finding)
                this_sentence=finding['sentence']
                relevant_sentences.append(this_sentence)
                this_kv = keyValue(
                    concept=finding['cui'],
                    concept_related_terms=[finding['ngram'],
                                           finding['start'] + note_offset,
                                           finding['end'] + note_offset],
                    sentence=this_sentence)
                if this_sentence in key_value_dic:
                    current_kvs=key_value_dic[this_sentence]
                    current_kvs.append(this_kv)
                    key_value_dic[this_sentence]=current_kvs
                else:
                    key_value_dic[this_sentence]=[this_kv]
            ### currently done crudely (add negation/qualifier to ALL subjects; could leverage spaCy dependencies) ###
            elif finding['priority'] !=1:
                this_sentence = finding['sentence']
                if this_sentence in relevant_sentences:
                    relevant_findings.append(finding)
                    if finding['concept_type'] == 'qualifier':
                        print('ACTIVATED')
                        print(finding)
                        for kv in key_value_dic[this_sentence]:
                            kv.modifiers=[finding['cui'],finding['ngram'],finding['start']+note_offset,finding['end']+note_offset]

                    elif finding['concept_type'] == 'negation':
                        for kv in key_value_dic[this_sentence]:
                            kv.negation=[finding['cui'],finding['ngram'],finding['start']+note_offset,finding['end']+note_offset]
        ##############################################################
        for kvs in key_value_dic:
            for kv in key_value_dic[kvs]:
                print(kv.sentence)
                print(kv.concept)
                print(kv.concept_related_terms)
                print(kv.negation)
                print(kv.modifiers)
        return all_findings
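

# Illustrative call (assumption: a minimal container exposing the CUI sets and
# morphological-variants dict that do_quickumls reads; CUI values are placeholders):
# class CuiContainer(object):
#     subject_cuis = {"C…"}          # CUIs of the findings of interest
#     qualifier_cuis = set()
#     negation_cuis = set()
#     cui_to_morphological_variants_dic = None
#
# all_findings = do_quickumls(CuiContainer(), note_offset=0)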
if not sys.warnoptions:
    warnings.simplefilter("ignore")

time0 = time.time()
config = Config()
model = NERModel(config)
model.build()
model.restore_session(config.dir_model)
time1 = time.time()
parser_config = parser_Config()

matcher = None
if parser_config.use_UMLS > 0:
    #QuickUMLS matcher
    from QuickUMLS.quickumls import QuickUMLS
    matcher = QuickUMLS(parser_config.QuickUMLS_dir, threshold=0.8)

print("\nloading model...", time1 - time0)


def main():
    #predict
    input_path = parser_config.infile_dir
    time2 = time.time()
    from parser import text_tokenize
    tokenizer = text_tokenize.mytokenizer()
    infile = codecs.open(
        input_path, "r")  # assume each line is one abstract: pmid||abstracttext
    outdir = parser_config.outjson_dir
    exception_dir = os.path.join(outdir, "exceptionlist.txt")
    except_out = codecs.open(exception_dir, "w")