Example #1
import threading
import time
from datetime import datetime

# Project-level BERN helpers, assumed importable from the repository:
#   pubtator2dict_list, biobert_recognize


def ceshi_run(stm_dict):
    # output_tmvar2 = 'tmVarJava/output/ceshi.txt.PubTator'
    output_tmvar2 = 'tmVarJava/output/29848cb18c2db29141bae9c5f7cc97b1d5175f4960eed341cca78cd9.PubTator.PubTator'
    dict_list = pubtator2dict_list(output_tmvar2, is_raw_text=True)
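    # pubtator2dict_list parses the tmVar output file into a list of
    # document dicts (or an error message string on failure; not checked here)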
    is_raw_text = True
    cur_thread_name = threading.current_thread().name
    ner_start_time = time.time()
    tagged_docs, num_entities = biobert_recognize(stm_dict, dict_list,
                                                  is_raw_text, cur_thread_name)
    ner_time = time.time() - ner_start_time
    print(
        datetime.now().strftime(stm_dict['time_format']),
        '[%s] NER %.3f sec, #entities: %d' %
        (cur_thread_name, ner_time, num_entities))
    return (tagged_docs, num_entities)
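
A minimal driver sketch, assuming an stm_dict that carries at least the 'time_format' key used by the log line; the real BERN stm_dict also holds model sessions and host/port settings consumed by biobert_recognize:

if __name__ == '__main__':
    # Hypothetical minimal stm_dict; extend with whatever biobert_recognize
    # actually requires in your setup
    stm_dict = {'time_format': '[%d/%b/%Y %H:%M:%S.%f]'}
    tagged_docs, num_entities = ceshi_run(stm_dict)
    print(num_entities)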
Example #2
    def tag_entities(self, text, cur_thread_name, is_raw_text, reuse=False):
        assert self.stm_dict is not None

        # Count ASCII letters to detect non-English input
        n_ascii_letters = sum(c in string.ascii_letters for c in text)

        if n_ascii_letters == 0:
            text = 'No ascii letters. Please enter your text in English.'

        text_hash = hashlib.sha224(text.encode('utf-8')).hexdigest()
        print(datetime.now().strftime(self.stm_dict['time_format']),
              '[{}] text_hash: {}'.format(cur_thread_name, text_hash))

        bern_output_path = './output/bern_demo_{}.json'.format(text_hash)

        if reuse and os.path.exists(bern_output_path):
            print(datetime.now().strftime(self.stm_dict['time_format']),
                  '[{}] Found prev. output'.format(cur_thread_name))
            with open(bern_output_path, 'r', encoding='utf-8') as f_out:
                return json.load(f_out)

        home_gnormplus = self.stm_dict['gnormplus_home']
        input_gnormplus = os.path.join(home_gnormplus, 'input',
                                       '{}.PubTator'.format(text_hash))
        output_gnormplus = os.path.join(home_gnormplus, 'output',
                                        '{}.PubTator'.format(text_hash))

        home_tmvar2 = self.stm_dict['tmvar2_home']
        input_dir_tmvar2 = os.path.join(home_tmvar2, 'input')
        input_tmvar2 = os.path.join(input_dir_tmvar2,
                                    '{}.PubTator'.format(text_hash))
        output_tmvar2 = os.path.join(home_tmvar2, 'output',
                                     '{}.PubTator.PubTator'.format(text_hash))

        # Write input str to a .PubTator format file
        with open(input_gnormplus, 'w', encoding='utf-8') as f:
            # Empty title record; the whole text goes into the abstract field
            f.write(text_hash + '|t|')
            f.write('\n')
            f.write(text_hash + '|a|' + text + '\n\n')
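        # The resulting file follows the PubTator layout (hash abbreviated):
        #   29848cb18c...|t|
        #   29848cb18c...|a|<the raw input text>
        # i.e. an empty title record plus the whole text as the abstract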

        # Run GNormPlus
        gnormplus_start_time = time.time()
        tell_inputfile(self.stm_dict['gnormplus_host'],
                       self.stm_dict['gnormplus_port'],
                       '{}.PubTator'.format(text_hash))
        print(
            datetime.now().strftime(self.stm_dict['time_format']),
            '[{}] GNormPlus {:.3f} sec'.format(
                cur_thread_name,
                time.time() - gnormplus_start_time))

        # Move a GNormPlus output file to the tmVar2 input directory
        shutil.move(output_gnormplus, input_tmvar2)

        # Run tmVar 2.0
        tmvar2_start_time = time.time()
        tell_inputfile(self.stm_dict['tmvar2_host'],
                       self.stm_dict['tmvar2_port'],
                       '{}.PubTator'.format(text_hash))
        print(
            datetime.now().strftime(self.stm_dict['time_format']),
            '[{}] tmVar 2.0 {:.3f} sec'.format(cur_thread_name,
                                               time.time() -
                                               tmvar2_start_time))

        # Convert tmVar 2.0 outputs (?.PubTator.PubTator) to python dict
        dict_list = pubtator2dict_list(output_tmvar2, is_raw_text=True)

        # Delete temp files
        os.remove(input_gnormplus)
        os.remove(input_tmvar2)
        os.remove(output_tmvar2)

        # pubtator2dict_list returns an error message string on failure
        if isinstance(dict_list, str):
            print(dict_list)
            return None

        # Run BioBERT of Lee et al., 2019
        start_time = time.time()
        tagged_docs, num_entities = \
            self.biobert_recognize(dict_list, is_raw_text, cur_thread_name)
        if tagged_docs is None:
            return None

        assert len(tagged_docs) == 1
        print(
            datetime.now().strftime(self.stm_dict['time_format']),
            '[%s] NER %.3f sec, #entities: %d' %
            (cur_thread_name, time.time() - start_time, num_entities))

        # Normalization models
        if num_entities > 0:
            # print(datetime.now().strftime(time_format),
            #       '[{}] Normalization models..'.format(cur_thread_name))
            tagged_docs = self.normalizer.normalize(text_hash,
                                                    tagged_docs,
                                                    cur_thread_name,
                                                    is_raw_text=is_raw_text)

        # Convert to PubAnnotation JSON
        tagged_docs[0] = get_pub_annotation(tagged_docs[0],
                                            is_raw_text=is_raw_text)

        # Save a BERN result
        with open(bern_output_path, 'w', encoding='utf-8') as f_out:
            json.dump(tagged_docs[0], f_out)

        return tagged_docs[0]
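
The returned value follows the PubAnnotation convention; a rough sketch of its shape (the exact keys depend on get_pub_annotation, and the span/type values below are placeholders):

pub_annotation_sketch = {
    'sourceid': '<text_hash>',
    'text': '<the input text>',
    'denotations': [
        # one entry per recognized entity
        {'id': 'T1', 'span': {'begin': 0, 'end': 4}, 'obj': '<entity type>'},
    ],
}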
Example #3
    def tag_entities(self, cur_thread_name, is_raw_text, reuse=False):
        assert self.stm_dict is not None
        get_start_t = time.time()
        elapsed_time_dict = dict()

        # In this batch variant the paths below are whole directories,
        # not per-document files
        home_gnormplus = self.stm_dict['gnormplus_home']
        input_gnormplus = os.path.join(home_gnormplus, 'input')
        output_gnormplus = os.path.join(home_gnormplus, 'output')

        home_tmvar2 = self.stm_dict['tmvar2_home']
        input_tmvar2 = os.path.join(home_tmvar2, 'input')
        output_tmvar2 = os.path.join(home_tmvar2, 'output')

        # Run GNormPlus
        gnormplus_start_time = time.time()
        # This will definitely need to be modified: the input/output paths
        # are hard-coded in the command below
        shell_script = '''cd GNormPlusJava;java -Xmx12G -Xms12G -jar GNormPlus.jar input output setup.txt;cd -;'''  # % (input_gnormplus, output_gnormplus)
        print(shell_script)
        os.system(shell_script)
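        # os.system blocks until the Java process exits, so the timing
        # below covers the entire GNormPlus batch run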
        gnormplus_time = time.time() - gnormplus_start_time
        elapsed_time_dict['gnormplus'] = round(gnormplus_time, 3)
        print(
            datetime.now().strftime(self.stm_dict['time_format']),
            '[{}] GNormPlus {:.3f} sec'.format(cur_thread_name,
                                               gnormplus_time))

        # GNormPlus output feeds tmVar as input; the commands above and
        # below both hard-code these paths anyway
        input_tmvar2 = output_gnormplus

        # Run tmVar 2.0
        tmvar2_start_time = time.time()
        # This will definitely need to be modified as well: paths are
        # hard-coded in the command below
        shell_script = '''cd tmVarJava; java -Xmx12G -Xms12G -jar tmVar.jar ../GNormPlusJava/input output; cd -;'''  # % (input_tmvar2, output_tmvar2)
        os.system(shell_script)
        tmvar2_time = time.time() - tmvar2_start_time
        elapsed_time_dict['tmvar2'] = round(tmvar2_time, 3)
        print(datetime.now().strftime(self.stm_dict['time_format']),
              '[{}] tmVar 2.0 {:.3f} sec'.format(cur_thread_name, tmvar2_time))

        # Convert tmVar 2.0 outputs (?.PubTator.PubTator) to python dict
        file_list = glob.glob(output_tmvar2 + "/*.PubTator.PubTator")
        dict_list = [
            pubtator2dict_list(i, is_raw_text=True) for i in file_list
        ]

        # At this point all results are in dict_list, one entry (itself a
        # list of document dicts) per .PubTator.PubTator file

        # Run BioBERT of Lee et al., 2019
        ner_start_time = time.time()
        # Setting this to False raises an error downstream, so force it to
        # True here (overriding the is_raw_text parameter)
        is_raw_text = True
        tagged_docs_list = []
        for dict_l in dict_list:
            tagged_docs, num_entities = \
                biobert_recognize(self.stm_dict, dict_l, is_raw_text,
                                  cur_thread_name)
            tagged_docs_list.append((tagged_docs, num_entities))
        # Bail out only if BioBERT produced nothing for any document
        if all(docs is None for docs, _ in tagged_docs_list):
            return None

        ner_time = time.time() - ner_start_time
        elapsed_time_dict['ner'] = round(ner_time, 3)
        # Report the total across all documents, not just the last one
        total_entities = sum(n for docs, n in tagged_docs_list
                             if docs is not None)
        print(
            datetime.now().strftime(self.stm_dict['time_format']),
            '[%s] NER %.3f sec, #entities: %d' %
            (cur_thread_name, ner_time, total_entities))

        # Normalization models
        # All three Python scripts and both jars launched by load_dicts.sh
        # must be up for normalization to work
        os.system('sh load_dicts.sh')
        # The services need time to start, so wait a minute before continuing
        time.sleep(60)
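        # Note: the fixed 60 s sleep is an assumption about startup time;
        # polling the normalizer services until they accept connections
        # would be more robust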
        normalization_time = 0.
        new_tagged_docs_list = []
        for tagged_docs, num_entities in tagged_docs_list:
            if tagged_docs is None:
                continue
            text_hash = tagged_docs[0]['pmid']
            if num_entities > 0:
                normalization_start_time = time.time()
                tagged_docs = self.normalizer.normalize(
                    text_hash,
                    tagged_docs,
                    cur_thread_name,
                    is_raw_text=is_raw_text)
                normalization_time = time.time() - normalization_start_time
            elapsed_time_dict['normalization'] = round(normalization_time, 3)
            # Convert to PubAnnotation JSON
            elapsed_time_dict['total'] = round(time.time() - get_start_t, 3)
            tagged_docs[0] = get_pub_annotation(
                tagged_docs[0],
                is_raw_text=is_raw_text,
                elapsed_time_dict=elapsed_time_dict)
            new_tagged_docs_list.append(tagged_docs[0])
            # Save a BERN result, or reuse a previously cached one without
            # aborting the rest of the batch
            bern_output_path = './output/bern_demo_{}.json'.format(text_hash)
            if reuse and os.path.exists(bern_output_path):
                print(datetime.now().strftime(self.stm_dict['time_format']),
                      '[{}] Found prev. output'.format(cur_thread_name))
                with open(bern_output_path, 'r', encoding='utf-8') as f_out:
                    new_tagged_docs_list[-1] = json.load(f_out)
            else:
                with open(bern_output_path, 'w', encoding='utf-8') as f_out:
                    json.dump(tagged_docs[0], f_out, sort_keys=True)

        return new_tagged_docs_list
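
Each document's result is also written to ./output/bern_demo_<hash>.json, so a caller can sweep the files up afterwards; a small sketch, assuming the PubAnnotation output keeps its entity spans under 'denotations':

import glob
import json

for path in glob.glob('./output/bern_demo_*.json'):
    with open(path, encoding='utf-8') as f:
        doc = json.load(f)
    # count the annotated entity spans in each cached result
    print(path, len(doc.get('denotations', [])))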