Example #1
    def genia_tokenizer(self):
        '''Tokenize each example's sentence text with the GENIA tagger.'''
        tagger = GeniaTagger('./tools/geniatagger-3.0.2/geniatagger')
        with open('./chemprot_test_gs/new_testing_examples.json', 'r') as f:
            training_examples = json.load(f)
            # print(len(training_examples))
            for i in training_examples:
                tokenized_tuple = tagger.parse(i['sentence'])
                token_list = []
                # Each GENIA output row is a (word, base form, POS, chunk, NE) tuple.
                for output in tokenized_tuple:
                    # Drop bare punctuation tokens.
                    if output[0] in string.punctuation:
                        continue
                    pos = output[2]
                    # Skip tokens whose surface form equals their POS tag.
                    if output[0] == pos:
                        continue
                    # Strip a trailing '..' left over from tokenization.
                    if output[0].endswith('..'):
                        token = output[0][:-2]
                    # Collapse cardinal numbers (POS 'CD') into a 'NUM' placeholder.
                    elif pos == 'CD':
                        token = 'NUM'
                    else:
                        token = output[0]
                    token_list.append(token)
                i['sentence'] = ' '.join(token_list)
        # with open('./chemprot_training/train_tokenized.json', 'w+') as j:
        with open('./chemprot_test_gs/testing_tokenized.json', 'w+') as j:
            json.dump(training_examples, j, indent=4)
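The examples above and below index into the tagger's output tuples by position. A minimal sketch of what `tagger.parse()` returns, assuming the same `geniatagger` wrapper and binary path used in the snippet above:

from geniatagger import GeniaTagger

# Each parsed token is a (word, base form, POS tag, chunk tag, NE tag) tuple,
# which is why the snippets read output[2] for the POS and column 4 for the entity label.
tagger = GeniaTagger('./tools/geniatagger-3.0.2/geniatagger')
for word, base, pos, chunk, ne in tagger.parse('Inhibition of NF-kappaB activation.'):
    print(word, pos, ne)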
import os

from nltk.tokenize import PunktSentenceTokenizer
from geniatagger import GeniaTagger

# expanduser so the '~' home-directory shortcut resolves to a real path
tagger = GeniaTagger(os.path.expanduser('~/qwerty/shashank/geniatagger-3.0.2/geniatagger'))
print(tagger.parse('This is a pen.'))
#print(tagger.parse('tis is  pen'))

#print(data)
med_tokenizer = PunktSentenceTokenizer(train_data)
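`train_data` is left undefined in this snippet; a minimal, self-contained sketch of one way to supply it and feed the resulting sentence splits to the GENIA tagger (the corpus file name is an assumption):

import os

from geniatagger import GeniaTagger
from nltk.tokenize import PunktSentenceTokenizer

# Hypothetical raw-text corpus of abstracts used to train the Punkt model.
with open('medline_abstracts.txt') as f:
    train_data = f.read()

tagger = GeniaTagger(os.path.expanduser('~/qwerty/shashank/geniatagger-3.0.2/geniatagger'))
med_tokenizer = PunktSentenceTokenizer(train_data)
for sentence in med_tokenizer.tokenize(train_data):
    print(tagger.parse(sentence))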

    corpusReader = PlaintextCorpusReader(corpusRoot, ".*.txt", encoding=codec)

    outFile = open(corpusRoot + "genia_and_backoff.txt", "w")

    for journal in corpusReader.fileids():
        print("******* start " + journal)
        sentList = corpusReader.sents(journal)

        for sent in sentList:

            taggedList = t2.tag(sent)

            for tag in taggedList:
                # Fall back to the GENIA tagger for tokens t2 tagged as "UNK".
                if tag[1] == "UNK":
                    genia_tag_list = tagger.parse(tag[0])
                    for genia_tag in genia_tag_list:
                        # Column 4 is the named-entity tag; "O" (outside any
                        # entity) falls back to the POS tag in column 2.
                        if genia_tag[4] == "O":
                            outFile.write(genia_tag[0] + "/" + genia_tag[2] +
                                          "  ")
                        else:
                            # Keep the entity type after the "B-"/"I-" prefix.
                            new_tag = genia_tag[4].split("-")[1]
                            outFile.write(genia_tag[0] + "/" + new_tag + "  ")

                else:
                    outFile.write(tag[0] + "/" + tag[1] + " ")
            outFile.write("\n\n")

        print("##### end " + journal)

    outFile.close()
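`t2`, `corpusRoot`, and `codec` are defined elsewhere in the source file. A minimal sketch of a backoff tagger chain that would produce the "UNK" tags the loop above checks for; the Treebank training data is an assumption:

import nltk

# Backoff chain whose final fallback labels unseen words "UNK",
# which is what triggers the GENIA branch in the loop above.
train_sents = nltk.corpus.treebank.tagged_sents()  # hypothetical training corpus
t0 = nltk.DefaultTagger('UNK')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)

print(t2.tag(['Phosphorylation', 'of', 'STAT3', 'was', 'reduced']))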
Example #5
def annotate_text(tagger=''):
    genia = GeniaTagger('../genia-tagger/geniatagger-3.0.2/geniatagger')
    medpost = spacy.load(os.path.abspath('trained_tagger'))
    stanford = StanfordCoreNLP('http://localhost:9000')
    main_dir = 'corrected_outcomes'
    data_dir = os.path.abspath(os.path.join(main_dir, 'aggregated'))
    create_storage_dirs([data_dir])

    sub_dir = os.path.abspath(os.path.join(data_dir, 'test'))
    if not os.path.exists(os.path.dirname(sub_dir)):
        os.makedirs(os.path.dirname(sub_dir))

    turker, ebm_extract = e.read_anns('hierarchical_labels', 'outcomes', \
                                      ann_type='aggregated', model_phase='train')

    seq_dir = os.path.abspath(os.path.join(os.path.curdir, 'corrected_outcomes', 'test'))
    create_storage_dirs([seq_dir])
    ebm_csv = []

    start = time.time()

    with open(os.path.join(seq_dir, 'test_medpost.bmes'), 'w') as f:
        for pmid, doc in ebm_extract.items():
            abstract = ' '.join(i for i in doc.tokens)
            #pprint(abstract)
            u = doc.anns['AGGREGATED']
            v = doc.tokens
            o = []
            corr_outcomes = []
            temp, temp_2 = [], []
            t = 0
            m = 0
            o_come = e.print_labeled_spans_2(doc)[0] #extract outcomes from the abstract being examined, [(Outcome-type, Outcome), (Outcome-type, Outcome2)]

            #store the annotations and the index of the annotations for each abstract
            for x in range(len(u)):
                if x == t:
                    if u[x] != 0:
                        for ff in o_come:
                            for j in range(len(u)):
                                if j < len(ff[1].split()):
                                    o.append((t, u[x]))
                                    t += 1
                            break
                        o_come.pop(0)

                        txt_toks = [v[i[0]] for i in o]
                        text_wrds = ' '.join(i for i in txt_toks)

                        corr = correcting_spans.correct_text()
                        text_wrds = corr.statTerm_keyWord_punct_remove(text_wrds)

                        if tagger.lower() == 'genia':
                            tagged = genia.parse(text_wrds)
                            pos = [i[2] for i in tagged]  # GENIA column 2 is the POS tag
                        elif tagger.lower() == 'medpost':
                            tagged = medpost(text_wrds)
                            pos = [i.tag_ for i in tagged]
                        elif tagger.lower() == 'stanford':
                            pos = []
                            for elem in word_tokenize(text_wrds):
                                stan = stanford.annotate(elem, properties={'annotators': 'pos', 'outputFormat': 'json'})
                                pos.append(stan['sentences'][0]['tokens'][0]['pos'])

                        text_pos = ' '.join(i for i in pos)

                        label = core_outcome[u[x]]

                        corrected_spans = corr.pos_co_occurrence_cleaning(text_wrds, text_pos, label)

                        if len(corrected_spans) == 0:
                            v[o[0][0]:(o[-1][0] + 1)] = txt_toks
                            u[o[0][0]:(o[-1][0] + 1)] = [0 for i in range(len(txt_toks))]
                        elif len(corrected_spans) < 2:
                            span = corrected_spans[0]
                            s = [i for i in span[1].split()]
                            ll = [o[0][1] if i in s else 0 for i in txt_toks]
                            v[o[0][0]:(o[-1][0] + 1)] = txt_toks
                            u[o[0][0]:(o[-1][0] + 1)] = ll
                        else:
                            s = [i for j in corrected_spans for i in j[1].split()]
                            ll = [o[0][1] if i in s else 0 for i in txt_toks]
                            v[o[0][0]:(o[-1][0] + 1)] = txt_toks
                            u[o[0][0]:(o[-1][0] + 1)] = ll

                        p = [i for i in corrected_spans]
                        if len(p) > 0:
                            for i in p:
                                corr_outcomes.append(i)
                        o.clear()

                    else:
                        t += 1
            if corr_outcomes:
                temp_2 = build_sequence_model(v, u, core_outcome, corr_outcomes)
                qq = 1
                for i in temp_2:
                    print(qq, i)
                    f.write('{}\n'.format(i))
                    qq += 1
                f.write('\n')
                for k in corr_outcomes:
                    ebm_csv.append(k)
        ebm_csv_df = pd.DataFrame(ebm_csv, columns=['Label','Outcome'])
        ebm_csv_df.to_csv(os.path.join(os.path.abspath(os.path.curdir), 'corrected_outcomes/test/labels_outcomes_medpost.csv'))
    print("Duration {}".format(time.time() - start))