示例#1
0
def parseContents(file):
    tree=ET.parse(file)
    root=tree.getroot()
    contentsDict=removeTags(root)
    contentsDict.pop("author",None)
    contentsDict.pop("jauthor",None)
    contentsDict.pop("keywords",None)
    contentsDict.pop("jkeywords",None)
    for content in contentsDict.values():
        if content==None or content=="":
            continue
        if isEngPaper(content):
            return None
    return contentsDict.values() #各セクションからなるlist 
示例#2
0
def main():
    filename = sys.argv[1]
    if os.path.exists(filename[:-4] + "_feature_" + f_type + ".txt"):
        sys.exit()
    tree = ET.parse(filename)
    root = tree.getroot()
    texts = removeTags(root)  #texts=dict{section title:body text}
    process_text_list, keywords = get_partof_text_list(
        texts, [1, 1, 1, 0, 0, 0])  #[title,abst,keywords,intro,conclusion,etc]
    tmp_fulltext, tmp_kw = get_partof_text_list(texts, [1, 1, 0, 1, 1, 1])
    fulltext = "".join(tmp_fulltext)
    #processJ(filename,process_text_list,process_text_list[0],process_text_list[1],keywords,fulltext) #通常

    #関係抽出素性用
    term_dic = processJ(filename, process_text_list, texts["title"],
                        texts["abstract"], keywords, fulltext, True)
    features = createRelTrainData(filename, process_text_list, term_dic)
    writeFileJson(filename[:-4] + "_feature_rel.json", features)
示例#3
0
def main():
    files = glob.glob("./data/papers/NLP_LATEX_CORPUS/*/*.xml",
                      recursive=False)
    for j, file in enumerate(files):
        print(j + 1, " / ", len(files), " : ", file)
        tree = ET.parse(file)
        root = tree.getroot()
        contentsDict = removeTags(root)
        contentsDict.pop("author", None)
        contentsDict.pop("jauthor", None)
        contentsDict.pop("keywords", None)
        contentsDict.pop("jkeywords", None)
        if not isJapPaper(contentsDict):
            #print(list(contentsDict.keys())[:2])
            continue

        text_list = getTextList(contentsDict)
        term_list = []
        juman_results = []

        #各文に対して処理
        for sec_i, text in enumerate(text_list):
            term_list_s = []
            #print(text)
            try:
                juman_result = juman2mecab(execJuman(text))
            except:
                continue
            juman_results.append(juman_result)
            #キーワード抽出
            partof_term = ""  #複合名詞抽出用tmp
            partof_termex = ""  #用語的表現抽出用tmp
            tmp_partof_termex = ""  #「○○の△△」の△△の部分を保存しておき、次の○○にする
            now_pos = 0  #用語的表現抽出用 現在の場所 0:空 1:○○中 2:"の"済 3:△△中
            word_head_pos = 0  #複合語の頭の位置
            word_head_posex = 0  #用語的表現の頭の位置
            tmp_word_head_posex = 0  #tmp_partof_termexと同じく
            nowread_head_pos = 0  #現在処理している形態素の頭の位置
            tmp_i_term = 0
            tmp_i_termex = 0
            tmp_tmp_i_termex = 0
            containsNorm = False
            tailIsSahen = False
            for i, morpheme in enumerate(juman_result):
                #print(morpheme[0])
                #キーワード抽出
                if morpheme[0] not in ["EOS", ""]:
                    midasi, yomi, genkei, hinsi, bunrui, katuyou1, katuyou2, imis, repname = morpheme
                    if isTargetMorphemeUni(midasi, hinsi, bunrui):
                        if len(partof_term) == 0:
                            word_head_pos = nowread_head_pos
                            tmp_i_term = i
                        partof_term += midasi
                        containsNorm = True
                        if bunrui == "サ変名詞":
                            tailIsSahen = True
                        else:
                            tailIsSahen = False
                    elif isTargetMorphemeNotUni(midasi, hinsi, bunrui,
                                                katuyou2):
                        if len(partof_term) == 0:
                            word_head_pos = nowread_head_pos
                            tmp_i_term = i
                        partof_term += midasi
                        tailIsSahen = False
                    else:  # キーワードを構成し得ない形態素の場合
                        if len(partof_term) > 0 and containsNorm:
                            term_list_s.append(partof_term)
                            if len(
                                    partof_termex
                            ) > 0 and tailIsSahen:  #「○○の」が住んでいて次にサ変名詞で終わるtermが来た場合
                                tmp_partof_termex = partof_term
                                tmp_word_head_posex = word_head_pos
                                tmp_tmp_i_termex = tmp_i_term
                                partof_termex += partof_term
                                term_list_s.append(partof_termex)
                                partof_termex = ""
                                word_head_posex = 0
                            elif len(
                                    partof_termex
                            ) > 0 and not tailIsSahen:  #「○○の」が済んていて次にサ変名詞で終わらないtermが来た場合、それを次の「○○の」候補に
                                tmp_partof_termex = partof_term
                                tmp_word_head_posex = word_head_pos
                                tmp_tmp_i_termex = tmp_i_term
                            if midasi in ["の", "を"] and hinsi == "助詞":
                                if len(tmp_partof_termex
                                       ) > 0:  # この前にも「○○の△△」が来た場合
                                    partof_termex = (tmp_partof_termex + "の")
                                    word_head_posex = tmp_word_head_posex
                                    tmp_i_termex = tmp_tmp_i_termex
                                    tmp_partof_termex = ""
                                    tmp_word_head_posex = 0
                                    partof_term = ""
                                    nowread_head_pos += len(midasi)
                                    continue
                                if len(partof_termex) == 0:  # 初めて「○○の」を作る場合
                                    partof_termex += (partof_term + "の")
                                    word_head_posex = word_head_pos
                                    tmp_i_termex = tmp_i_term
                                    tailIsSahen = False
                                    partof_term = ""
                                    word_head_pos = 0
                                    tmp_partof_termex = ""
                                    tmp_word_head_posex = 0
                                    nowread_head_pos += len(midasi)
                                    continue

                        partof_term = ""
                        partof_termex = ""
                        tmp_partof_termex = ""
                        now_pos = 0
                        word_head_pos = 0
                        word_head_posex = 0
                        tmp_word_head_posex = 0
                        containsNorm = False
                        tailIsSahen = False
                    if i + 1 == len(juman_result):
                        if len(partof_term) > 0 and containsNorm:
                            term_list_s.append(partof_term)
                            if len(partof_termex) > 0 and tailIsSahen:
                                tmp = partof_term
                                partof_termex += partof_term
                                term_list_s.append(partof_termex)
                nowread_head_pos += len(midasi)
            term_list.append(set(term_list_s))

        #for l in term_list:
        #    print(l)

        term_dic = {}
        term_files = glob.glob("./data/xml/Kyoki/*.txt", recursive=False)
        for i, term_list_s in enumerate(term_list):
            for term in term_list_s:
                if term not in term_dic:
                    term_dic[term] = {}
                for term2 in term_list_s:
                    if term == term2:
                        continue
                    if term2 not in term_dic[term]:
                        term_dic[term][term2] = 1
                    else:
                        term_dic[term][term2] += 1

        for term, term_dic_inner in term_dic.items():
            term_tmp = ""
            if "<SLASH>" in term_tmp:
                term_tmp = term.replace("<SLASH>", "/")
            if "<BSLASHA>" in term_tmp:
                term_tmp = term.replace("<BSLASHA>", "\a")
            if "<BSLASHB>" in term_tmp:
                term_tmp = term.replace("<BSLASHB>", "\b")
            if "<BSLASHF>" in term_tmp:
                term_tmp = term.replace("<BSLASHF>", "\f")
            if "<BSLASHT>" in term_tmp:
                term_tmp = term.replace("<BSLASHT>", "\t")
            if "<BSLASHR>" in term_tmp:
                term_tmp = term.replace("<BSLASHR>", "\r")
            if "<BSLASHN>" in term_tmp:
                term_tmp = term.replace("<BSLASHN>", "\n")
            if "<BSLASHV>" in term_tmp:
                term_tmp = term.replace("<BSLASHV>", "\v")
            if "<BSLASHU>" in term_tmp:
                term_tmp = term.replace("<BSLASHU>", "\\u")
            if "<BSLASHUU>" in term_tmp:
                term_tmp = term.replace("<BSLASHUU>", "\\U")
            if "<BSLASHX>" in term_tmp:
                term_tmp = term.replace("<BSLASHX>", "\\x")
            if "<BSLASHO>" in term_tmp:
                term_tmp = term.replace("<BSLASHO>", "\\o")
            if "<BSLASH0>" in term_tmp:
                term_tmp = term.replace("<BSLASH0>", "\0")
            if not os.path.exists("./data/xml/Kyoki/" + term_tmp + ".txt"):
                continue
            with open("./data/xml/Kyoki/" + term + ".txt", "r") as f:
                for line in f.readlines():
                    tl = ""
                    if "<SLASH>" in line:
                        tl = line.replace("<SLASH>", "/")
                    if "<BSLASHA>" in line:
                        tl = line.replace("<BSLASHA>", "\a")
                    if "<BSLASHB>" in line:
                        tl = line.replace("<BSLASHB>", "\b")
                    if "<BSLASHF>" in line:
                        tl = line.replace("<BSLASHF>", "\f")
                    if "<BSLASHT>" in line:
                        tl = line.replace("<BSLASHT>", "\t")
                    if "<BSLASHR>" in line:
                        tl = line.replace("<BSLASHR>", "\r")
                    if "<BSLASHN>" in line:
                        tl = line.replace("<BSLASHN>", "\n")
                    if "<BSLASHV>" in line:
                        tl = line.replace("<BSLASHV>", "\v")
                    if "<BSLASHU>" in line:
                        tl = line.replace("<BSLASHU>", "\\u")
                    if "<BSLASHUU>" in line:
                        tl = line.replace("<BSLASHUU>", "\\U")
                    if "<BSLASHX>" in line:
                        tl = line.replace("<BSLASHX>", "\\x")
                    if "<BSLASHO>" in line:
                        tl = line.replace("<BSLASHO>", "\\o")
                    if "<BSLASH0>" in line:
                        tl = line.replace("<BSLASH0>", "\0")
                    if tl == "":
                        t2, c = line.strip().split("\t")
                    else:
                        t2, c = tl.strip().split("\t")
                    if t2 in term_dic_inner:
                        term_dic[term_tmp][t2] += int(c)
                    else:
                        term_dic[term_tmp][t2] = int(c)

        for term, term_dic_inner in term_dic.items():
            if "/" in term:
                term = term.replace("/", "<SLASH>")
            if "\a" in term:
                term = term.replace("\a", "<BSLASHA>")
            if "\b" in term:
                term = term.replace("\b", "<BSLASHB>")
            if "\f" in term:
                term = term.replace("\f", "<BSLASHF>")
            if "\t" in term:
                term = term.replace("\t", "<BSLASHT>")
            if "\r" in term:
                term = term.replace("\r", "<BSLASHR>")
            if "\n" in term:
                term = term.replace("\n", "<BSLASHN>")
            if "\v" in term:
                term = term.replace("\v", "<BSLASHV>")
            if "\\u" in term:
                term = term.replace("\\u", "<BSLASHU>")
            if "\\U" in term:
                term = term.replace("\\U", "<BSLASHUU>")
            if "\\x" in term:
                term = term.replace("\\x", "<BSLASHX>")
            if "\\o" in term:
                term = term.replace("\\o", "<BSLASHO>")
            if "\0" in term:
                term = term.replace("\0", "<BSLASH0>")
            with open("./data/xml/Kyoki/" + term + ".txt", "w") as f:
                for term2, count in term_dic_inner.items():
                    term2t = term2
                    if "/" in term2:
                        term2t = term2.replace("/", "<SLASH>")
                    if "\a" in term2:
                        term2t = term2.replace("\a", "<BSLASHA>")
                    if "\b" in term2:
                        term2t = term2.replace("\b", "<BSLASHB>")
                    if "\f" in term2:
                        term2t = term2.replace("\f", "<BSLASHF>")
                    if "\t" in term2:
                        term2t = term2.replace("\t", "<BSLASHT>")
                    if "\r" in term2:
                        term2t = term2.replace("\r", "<BSLASHR>")
                    if "\n" in term2:
                        term2t = term2.replace("\n", "<BSLASHN>")
                    if "\v" in term2:
                        term2t = term2.replace("\v", "<BSLASHV>")
                    if "\\u" in term2:
                        term2t = term2.replace("\\u", "<BSLASHU>")
                    if "\\U" in term2:
                        term2t = term2.replace("\\U", "<BSLASHUU>")
                    if "\\x" in term2:
                        term2t = term2.replace("\\x", "<BSLASHX>")
                    if "\\o" in term2:
                        term2t = term2.replace("\\o", "<BSLASHO>")
                    if "\0" in term2:
                        term2t = term2.replace("\0", "<BSLASH0>")
                    f.write(term2t + "\t" + str(count) + "\n")
示例#4
0
import re
from utils import get_files
from xmlAnalyzer import removeTags
import xml.etree.ElementTree as ET

BG=0
MET=1
RES=2

files=get_files("./data/xml/")

for i,filename in enumerate(files):
    print(i,"/",len(files)," ",filename,end="")
    tree=ET.parse(filename)
    root=tree.getroot()
    texts=removeTags(root) #dict
    absts=re.split("[.。]",texts["abstract"])

    background=-1
    method=-1
    result=-1
    now=BG

    pos=0
    for sent in absts:
        if now==BG:
            if re.search("本稿で|本論文で|本研究で|提案",sent)!=None:
                method=pos
                now=MET
        if now==MET:
            if re.search("実験|結果|評価|精度が|精度に",sent)!=None: