Exemplo n.º 1
0
def txt2matrix_fortrain(ann_dir, mytrain, tag_included, filename, curpath):
    """Write one tab-separated feature row per token of every annotated text.

    Every ``.txt`` file listed by ``readfromdir.get_file_list(ann_dir, ['txt'])``
    is read together with its companion ``<name>_new.ann`` file.  The tagged
    text is split into sentences, converted to token/tag/index lists by
    ``t2c.txt2conll`` and, sentence by sentence, written to ``mytrain``.

    Each row holds, in order: the term, ``st.stem(term)``, the entry from
    ``POS.pos_tagging``, the type label from ``umls_identify.label_umls_cui``,
    the value from ``BrownClustering.bc_indexing``, the ``index_list`` entry
    and the ``tag_list`` entry.  An empty line is written after each sentence.

    Args:
        ann_dir: directory handed to ``readfromdir.get_file_list``.
        mytrain: writable file-like object that receives the rows.
        tag_included: forwarded unchanged to ``labeling.ann_tagging``.
        filename: forwarded unchanged to ``umls_identify.formating_for_metamap``.
        curpath: forwarded unchanged to ``umls_identify.formating_for_metamap``.

    Relies on the module-level names ``st`` and ``bc_index``.
    """
    txt_files = readfromdir.get_file_list(ann_dir, ['txt'])
    print("there's " + str(len(txt_files)) + " in total!")

    for file_count, txt_file in enumerate(txt_files, 1):
        # Read the raw text; the context manager closes the handle promptly.
        with codecs.open(txt_file) as raw_handle:
            myraw = raw_handle.read()

        name = re.search(r'^(.*)\.txt', txt_file).group(1)
        ann_file = name + '_new.ann'
        print("reading file from %s %s ..." % (txt_file, ann_file))

        with codecs.open(ann_file, "r") as myann:
            text_tagged = labeling.ann_tagging(myann, myraw, tag_included)

        # str.split() treats its argument as a literal separator, so the
        # previous text_tagged.split(r'[;\n]') never split anything;
        # re.split() applies the character class as intended.
        lines = " ".join(re.split(r'[;\n]', text_tagged))
        lines = " ### ".join(nltk.sent_tokenize(lines))

        # The second argument 1 marks annotated training text (0 = raw text).
        term_list, tag_list, index_list = t2c.txt2conll(lines, 1)
        sents = " ".join(term_list).split("###")

        # j is a cursor into term_list / tag_list / index_list, which cover
        # the whole document including the "###" sentence separators.
        j = 0
        for sent in sents:
            if j >= len(term_list):
                break

            # Extract UMLS concepts for this sentence.
            metamap_output = umls_identify.formating_for_metamap(
                curpath, sent, filename)
            one_sent_term, type_list = umls_identify.label_umls_cui(
                metamap_output, sent)
            pos_list = POS.pos_tagging(one_sent_term)
            # One extra trailing entry per list; presumably padding for a
            # final token missing from the tagger output -- TODO confirm.
            pos_list.append(".")
            type_list.append("O")

            for sent_id, _ in enumerate(sent.split()):
                if term_list[j] == "###":
                    j += 1
                term = term_list[j]
                lemma = st.stem(term)
                bc = BrownClustering.bc_indexing(term.lower(), bc_index)
                row = [term, lemma, pos_list[sent_id], type_list[sent_id],
                       bc, index_list[j], tag_list[j]]
                mytrain.write("\t".join(row) + "\n")
                j += 1

            # Blank line terminates the sentence.
            mytrain.write("\n")

        # Progress report every fifth file (previously dedented out of the
        # loop, so it ran at most once after all files were processed).
        if file_count % 5 == 0:
            print(str(file_count) + " files finished")
def txt2matrix_fortrain(ann_dir, mytrain, tag_included, filename, curpath):
    """Write one tab-separated feature row per token of every annotated text.

    Every ``.txt`` file listed by ``readfromdir.get_file_list(ann_dir, ['txt'])``
    is read together with its companion ``<name>_new.ann`` file.  The tagged
    text is split into sentences, converted to token/tag/index lists by
    ``t2c.txt2conll`` and, sentence by sentence, written to ``mytrain``.

    Each row holds, in order: the term, ``st.stem(term)``, the entry from
    ``POS.pos_tagging``, the type label from ``umls_identify.label_umls_cui``,
    the value from ``BrownClustering.bc_indexing``, the ``index_list`` entry
    and the ``tag_list`` entry.  An empty line is written after each sentence.

    Args:
        ann_dir: directory handed to ``readfromdir.get_file_list``.
        mytrain: writable file-like object that receives the rows.
        tag_included: forwarded unchanged to ``labeling.ann_tagging``.
        filename: forwarded unchanged to ``umls_identify.formating_for_metamap``.
        curpath: forwarded unchanged to ``umls_identify.formating_for_metamap``.

    Relies on the module-level names ``st`` and ``bc_index``.
    """
    txt_files = readfromdir.get_file_list(ann_dir, ['txt'])
    print("there's " + str(len(txt_files)) + " in total!")

    for file_count, txt_file in enumerate(txt_files, 1):
        # Read the raw text; the context manager closes the handle promptly.
        with codecs.open(txt_file) as raw_handle:
            myraw = raw_handle.read()

        name = re.search(r'^(.*)\.txt', txt_file).group(1)
        ann_file = name + '_new.ann'
        print("reading file from %s %s ..." % (txt_file, ann_file))

        with codecs.open(ann_file, "r") as myann:
            text_tagged = labeling.ann_tagging(myann, myraw, tag_included)

        # str.split() treats its argument as a literal separator, so the
        # previous text_tagged.split(r'[;\n]') never split anything;
        # re.split() applies the character class as intended.
        lines = " ".join(re.split(r'[;\n]', text_tagged))
        lines = " ### ".join(nltk.sent_tokenize(lines))

        # The second argument 1 marks annotated training text (0 = raw text).
        term_list, tag_list, index_list = t2c.txt2conll(lines, 1)
        sents = " ".join(term_list).split("###")

        # j is a cursor into term_list / tag_list / index_list, which cover
        # the whole document including the "###" sentence separators.
        j = 0
        for sent in sents:
            if j >= len(term_list):
                break

            # Extract UMLS concepts for this sentence.
            metamap_output = umls_identify.formating_for_metamap(
                curpath, sent, filename)
            one_sent_term, type_list = umls_identify.label_umls_cui(
                metamap_output, sent)
            pos_list = POS.pos_tagging(one_sent_term)
            # One extra trailing entry per list; presumably padding for a
            # final token missing from the tagger output -- TODO confirm.
            pos_list.append(".")
            type_list.append("O")

            for sent_id, _ in enumerate(sent.split()):
                if term_list[j] == "###":
                    j += 1
                term = term_list[j]
                lemma = st.stem(term)
                bc = BrownClustering.bc_indexing(term.lower(), bc_index)
                row = [term, lemma, pos_list[sent_id], type_list[sent_id],
                       bc, index_list[j], tag_list[j]]
                mytrain.write("\t".join(row) + "\n")
                j += 1

            # Blank line terminates the sentence.
            mytrain.write("\n")

        # Progress report every fifth file (previously dedented out of the
        # loop, so it ran at most once after all files were processed).
        if file_count % 5 == 0:
            print(str(file_count) + " files finished")