Example No. 1
def tester_1():
    # Simple
    knp = pyknp.KNP(option="-tab -dpnd",
                    rcfile='/usr/local/etc/knprc',
                    jumanrcfile='/usr/local/etc/jumanrc')
    test = "昨日ノーベル物理学賞について学んだ"
    tagged_test = extract_ne(test, knp, detail_flag=False)
    print(swap_ne_tag_with_only_tag(tagged_test[0], "ARTIFACT", "PRIZE"))
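
The helpers extract_ne and swap_ne_tag_with_only_tag come from the same module and are not shown. A minimal sketch of extract_ne, assuming it wraps each named entity with the <NE:TYPE:surface> feature that KNP exposes in tag.fstring (the technique Example No. 11 uses); the real helper returns a tuple with more elements than reconstructed here:

import re
import pyknp

def extract_ne(line, knp, detail_flag=False):
    # Sketch only: wrap each NE surface form with its <NE:TYPE:surface> tag.
    tagged = line
    result = knp.parse(line)
    for tag in result.tag_list():
        m = re.search("<NE:(.*?):(.*?)>", tag.fstring)
        if m:
            tagged = tagged.replace(m.group(2), m.group(0))
    return (tagged,)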
Example No. 2
def prompt():
    knp = pyknp.KNP(option="-tab -dpnd",
                    rcfile='/usr/local/etc/knprc',
                    jumanrcfile='/usr/local/etc/jumanrc')
    while True:
        print(">>> ", end="")
        line = input()
        tagged_line = ner_func.extract_ne(line, knp, detail_flag=False)
        print(tagged_line[0] + "\n")
Example No. 3
 def __init__(self, logger=None, verbose=False, jumanpp=False):
     self._knp = pyknp.KNP(jumanpp=jumanpp)
     self.verbose = verbose
     if logger:
         self.logger = logger
     else:
         self.logger = logging.getLogger(__name__)
         logging.basicConfig(level=logging.WARNING)
         logging.captureWarnings(True)
Example No. 4
def tester_2():
    # Swap with ne
    knp = pyknp.KNP(option="-tab -dpnd",
                    rcfile='/usr/local/etc/knprc',
                    jumanrcfile='/usr/local/etc/jumanrc')
    test = "昨日ノーベル物理学賞について学んだ"
    test1 = "昨日英語の教科書を買った"
    tagged_test = extract_ne(test, knp, detail_flag=False)
    tagged_test1 = extract_ne(test1, knp, detail_flag=False)
    print(swap_ne_tag_with_ne_and_tag(tagged_test[0], "ノーベル物理学賞", "PRIZE", tagged_test[2]))
    print(swap_ne_tag_with_ne_and_tag(tagged_test1[0], "教科書", "EDUCATION", tagged_test1[2]))
    print(tagged_test[3])
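
The swap helpers are likewise project code that is not shown. From the calls above, swap_ne_tag_with_only_tag only rewrites the NE class of an already tagged string; a plausible sketch, assuming the <NE:TYPE:surface> tag format from Example No. 11:

def swap_ne_tag_with_only_tag(tagged_text, old_tag, new_tag):
    # Hypothetical: turn every <NE:ARTIFACT:...> into <NE:PRIZE:...> etc.
    return tagged_text.replace("<NE:" + old_tag + ":", "<NE:" + new_tag + ":")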
Example No. 5
def _knp(sentence):
    if sentence == "":
        return None

    knp = pyknp.KNP()
    try:
        result = knp.parse(sentence)
    except Exception:
        return None

    # Store the parse results in our own classes
    phrases = OrderedDict()  # dictionary of Phrase objects, keyed by bunsetsu id
    for bnst in result.bnst_list():
        ph = mynlp.Phrase()
        ph.parent_id = bnst.parent_id
        ph.dpndtype = bnst.dpndtype

        # Store the word-level information contained in this bunsetsu
        for mrph in bnst.mrph_list():  # mrph_list: morphemes within the bunsetsu
            word = mynlp.Word()
            word.surface = mrph.midasi  # surface form
            word.base = mrph.genkei  # base form
            word.yomi = mrph.yomi  # reading

            # Detailed part-of-speech information
            pos_info = mrph.spec().split(" ")  # or .new_spec()
            # Format: surface reading lemma POS POS-id sub-POS sub-POS-id
            # conjugation-type type-id conjugation-form form-id semantic-info
            word.pos = pos_info[3]  # part of speech
            word.pos_detail = pos_info[5]  # POS subcategory

            # Semantic information
            imis = mrph.imis.split()  # representative notation, kanji reading, category, etc.
            for imi in imis:
                if "代表表記" in imi:  # representative notation
                    word.descriptions = imi.split(":", 1)[-1]
                elif "カテゴリ" in imi:  # category
                    word.category = imi.split(":", 1)[-1]
                elif "ドメイン" in imi:  # domain
                    word.domain = imi.split(":", 1)[-1]
                elif ("人名:" in imi) or ("地名:" in imi):  # proper noun (person/place name)
                    word.proper_noun = imi.split(":", 1)[-1]
                else:
                    word.another = word.another + imi + " "

            ph.words.append(word)

        phrases[bnst.bnst_id] = ph

    for ph_i, ph in phrases.items():
        if ph.parent_id != -1:
            phrases[ph.parent_id].children.append(ph_i)

    return phrases
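
A usage sketch for _knp, assuming mynlp.Phrase exposes the parent_id, children, and words attributes populated above:

phrases = _knp("昨日ノーベル物理学賞について学んだ")
if phrases is not None:
    for ph_id, ph in phrases.items():
        surface = "".join(w.surface for w in ph.words)
        print(ph_id, surface, "parent:", ph.parent_id, "children:", ph.children)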
Example No. 6
def main():
    print("label:x or o")
    with open('./label_file') as fp:
        label = deque(fp.readlines())
    # Build the parser once instead of once per matching line
    knp = kp.KNP(option='-tab -anaphora')
    knp_parser = KNP_Parser()
    with open('./all_text.csv') as f:
        for line in f.readlines():
            flag, string = line.split(',')
            if flag == 'x':
                print(string)
                knp_parser.parse_knp(knp, string, label=label)
Example No. 7
    def __init__(
        self,
        knp_kwargs: Optional[Dict[str, str]] = None,
        preprocessor: Optional[Callable[[str], str]] = None,
    ):
        import pyknp

        cmd = get_juman_command()
        assert cmd
        knp_kwargs = knp_kwargs or {}
        knp_kwargs.setdefault("jumancommand", cmd)

        self.knp = pyknp.KNP(**knp_kwargs)
        self.knp_kwargs = knp_kwargs
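
This __init__ belongs to a wrapper class whose name is not shown here. A hypothetical usage, calling the class KnpTokenizer and assuming get_juman_command() finds an installed juman/jumanpp binary:

tokenizer = KnpTokenizer(knp_kwargs={"option": "-tab"})
result = tokenizer.knp.parse("太郎は花子に会いに行った")
print([mrph.midasi for mrph in result.mrph_list()])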
Example No. 8
def get_ave_vec(line):
    # Average the word vectors of content words (nouns, adjectives, verbs).
    hinsi_list = ['名詞', '形容詞', '動詞']
    import pyknp
    knp = pyknp.KNP()
    decompose = True
    total_vec = []
    result = knp.parse(line)
    for mrph in result.mrph_list():
        if mrph.hinsi in hinsi_list:
            if decompose:
                # mrph2decomposed_str and model are defined elsewhere
                midasi, ending = mrph2decomposed_str(mrph)
                if midasi in model.vocab:
                    total_vec.append(model[midasi])
            else:
                midasi = mrph2str(mrph)
    if not total_vec:
        return False
    return sum(total_vec) / len(total_vec)
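
get_ave_vec depends on two module-level names that are not shown: mrph2decomposed_str and model; model.vocab suggests a pre-4.0 gensim word-vector model. A hypothetical setup (the file path is a placeholder):

from gensim.models import KeyedVectors

model = KeyedVectors.load_word2vec_format("ja_word_vectors.bin", binary=True)
vec = get_ave_vec("昨日ノーベル物理学賞について学んだ")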
Example No. 9
def tester():
    knp = pyknp.KNP(option="-tab -dpnd",
                    rcfile='/usr/local/etc/knprc',
                    jumanrcfile='/usr/local/etc/jumanrc')
    line1 = "今年の人工知能学会は2016年6月6日~9日まで北九州国際会議場で開催されます"
    line2 = "昨夜,太郎は夜9時に花子へ会いに行った"
    line3 = "佐藤は昨夜,国会議事堂まで個人情報保護法についての議論を見に行った"
    line4 = "藤本太郎喜左衛門将時能という名前の人がいるらしい"
    tagged_line1 = ner_func.extract_ne(line1, knp, detail_flag=True)
    tagged_line2 = ner_func.extract_ne(line2, knp, detail_flag=True)
    tagged_line3 = ner_func.extract_ne(line3, knp, detail_flag=True)
    tagged_line4 = ner_func.extract_ne(line4, knp, detail_flag=True)
    print()
    print(line1)
    print(tagged_line1[0] + "\n")
    print(line2)
    print(tagged_line2[0] + "\n")
    print(line3)
    print(tagged_line3[0] + "\n")
    print(line4)
    print(tagged_line4[0] + "\n")
Example No. 10
def __parse_bnst(line):
    """
    Case analysis of bunsetsu (phrase) dependencies
    :param line: input string
    :return: bnst_list from pyknp
    """
    import pyknp
    # Detailed analysis output, with anaphora resolution enabled
    knp = pyknp.KNP(option="-tab -anaphora")
    # Remove spaces and tabs
    line2 = "".join(line.split())
    # Stopgap below: truncate the text so KNP can handle it.
    # TODO: support parsing the full text.
    # Replace full-width periods with kuten, then split from the right
    line2_list = line2.replace("．", "。").rsplit("。", 0)
    n = 1
    while len(line2_list[0]) >= 218:
        # Drop one more sentence from the end until the head chunk fits
        line2_list = line2.replace("．", "。").rsplit("。", n)
        n += 1
    results = knp.parse(line2_list[0])
    return results.bnst_list()
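
A usage sketch for __parse_bnst; bnst_id, parent_id, and mrph_list() are standard pyknp bunsetsu attributes:

for bnst in __parse_bnst("佐藤は昨夜,国会議事堂まで個人情報保護法についての議論を見に行った"):
    surface = "".join(mrph.midasi for mrph in bnst.mrph_list())
    print(bnst.bnst_id, surface, "->", bnst.parent_id)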
Example No. 11
    def ner(self, line):
        """
        $ python -m sagas.ja.knp_procs ner "太郎は5月18日の朝9時に花子に会いに行った."
        :param line:
        :return:
        """
        import re

        # KNP preparation:
        # option (str) – KNP parsing options (-tab, which outputs the detailed
        # analysis, is required; -anaphora adds ellipsis/anaphora resolution,
        # -dpnd does dependency parsing only, without case analysis, etc.)
        knp = pyknp.KNP(option="-tab -dpnd", jumanpp=False)

        def make_np_tagged_text(src_text: str):
            tagged_text = src_text  # copy
            result = knp.parse(src_text)  # tagging

            for tag in result.tag_list():
                if "NE:" in tag.fstring:  # if fstring has NE phrase
                    span = result.get_tag_span(tag.tag_id)
                    print('..', span, tag.fstring)
                    # extract the NE phrase; non-greedy so we stop at the
                    # closing ">" of the NE feature itself
                    search_r = re.search("<NE:(.*?):(.*?)>", tag.fstring)
                    tagged_ne_phrase = search_r.group(0)
                    ne_phrase = search_r.group(2)

                    # overwrite to src text
                    tagged_text = tagged_text.replace(ne_phrase,
                                                      tagged_ne_phrase)

            return tagged_text

        tc.emp('green', line)
        tc.emp('yellow', make_np_tagged_text(line))
Example No. 12
import re
import csv
import os
import sys
import codecs
import json
import pyknp

title = sys.argv[1]

#ff = codecs.open("fuman_a201508.json","r","utf-8")
ff = codecs.open(title + ".json", "r", "utf-8")
topic_json = json.loads(ff.read())
ff.close()

stop_words = [u"の", u"こと"]  # temporary stop-word list

knp = pyknp.KNP()

extracted_result = []  # {topicID, topicParameter, sentence, extracted, tokenized}
for topics in topic_json:
    ## Build a match score for each topic word
    word_match_score = {}
    for tt in range(0, len(topics[u"wordsInTopic"])):
        if topics[u"wordsInTopic"][tt] not in stop_words:
            word_match_score[topics[u"wordsInTopic"][tt]] = len(
                topics[u"wordsInTopic"]) - tt
    ##
    for sentences in topics[u"sentences"]:
        for ss in re.split("\n|。|!|?|!|\?", sentences):
            try:
                kres = knp.parse(ss.replace(' ', ''))
Example No. 13
 def __init__(self):
     self.knp = pyknp.KNP()
     self.ok_type = ['形容詞', '名詞', '動詞']
     self.swapwords = self.__get_stopwords()
Example No. 14
def createRelTrainData(filename, text_list, term_dic, n=3):
    juman_results = []
    # Process each sentence
    for sec_i, text in enumerate(text_list):
        #print(text)
        juman_result = juman2mecab(execJuman(text))
        juman_results.append(juman_result)
    mecab_result_list = list(
        filter(lambda x: x not in ["EOS", ""], juman_results))
    head_poses = []  # character offset of the head of each abstract sentence
    p = 0
    for sentence in replaceDpoint(text_list[1].replace("．", ".")).split("."):
        head_poses.append(p)
        p += (len(sentence.replace("<dpoint>", ".")) + 1)
    # Split the terms by the sentence they appear in
    print("process term list")
    title_terms = []  # list of (term, pos) tuples
    abst_terms = {}  # dict mapping sentence head offset -> list of (term, pos) tuples
    for p in head_poses:
        abst_terms[p] = []  # initialize with empty lists
    for term, poses in term_dic.items():
        for pos in poses:
            if pos[0] == 0:
                title_terms.append((term, pos))
            else:
                abst_terms[getPosIndex(pos[2], head_poses)].append((term, pos))
                #print(term,pos[2],getPosIndex(pos[2],head_poses))
    terms_list = list(abst_terms.values())
    #print(abst_terms.keys())
    #for k,v in abst_terms.items():
    #    print("key:",k," value:",v)
    #print("len(terms_list)= ",len(terms_list))
    print("process KNP")
    knp = pyknp.KNP(command='knp',
                    option='-tab -anaphora',
                    jumancommand='jumanpp',
                    jumanpp=True)
    knp_results = [[], []]  # [title results, abstract results]
    knp_results[0].append(knp.parse(text_list[0]))
    for sentence in re.split(r"\.|。",
                             replaceDpoint(text_list[1].replace("．", "."))):
        #for sentence in replaceDpoint(titleabst_str[1].replace(".",".")).split("."):
        #print(sentence.replace("<dpoint>","."))
        knp_results[1].append(
            knp.parse(sentence.replace("<dpoint>", ".") + "."))
        #print(sentence.replace("<dpoint>",".")+".")
    head_morph_ids = [0]  # morpheme index of the head of each abstract sentence
    #print(re.split(r"\.|。",replaceDpoint(titleabst_str[1].replace(".",".")).replace("<dpoint>",".")))
    #sys.exit()
    now = 0
    for rslt in knp_results[1]:
        now = now + len(rslt.mrph_list())
        head_morph_ids.append(now)
        #print(len(rslt.mrph_list()))
        #for m in rslt.mrph_list():
        #    print(m.midasi+" ",end="")
        #print()
    head_morph_ids.pop()
    feature_datas = []
    done_mrph_num = 0
    done_mrph_num_next = 0
    print("process each term")
    for i, terms in enumerate(terms_list):
        print(terms)
        #print(i," ",terms)
        if i > 0:
            done_mrph_num_next += len(knp_results[1][i - 1].mrph_list())
        for termL in terms:  # left term of each ordered pair (L -> R)
            #print("same sentence")
            #print(termL)
            s_posL = termL[1][1] - done_mrph_num
            e_posL = termL[1][1] - done_mrph_num
            sec_numL = termL[1][0]
            # Fetch the KNP result that contains this term
            if sec_numL == 0:
                knprslt = knp_results[0][0]
                s_idL = termL[1][1]
            else:
                #print(term,e_pos,":")
                for h_i, head in enumerate(head_morph_ids):
                    if e_posL < head:
                        #print(e_pos,head)
                        knprslt = knp_results[1][h_i - 1]
                        h = head_morph_ids[h_i - 1]
                        break
                s_idL = e_posL - h  # morpheme id within its sentence
                #print(term,knprslt.mrph_list()[s_id].midasi,s_id,h)

            e_idL = s_idL
            tmp_term_len = len(
                mecab_result_list[sec_numL][e_posL][0])  # find the end pos
            while tmp_term_len != len(termL[0]):
                #print(e_posL," ",knp_results[i-1].mrph_list()[e_posL+1].midasi)
                tmp_term_len += len(mecab_result_list[sec_numL][e_posL + 1][0])
                e_posL += 1
                e_idL += 1
            kihonL_b, kihonL_f, hinshiL_b, hinshiL_f = getBehindFrontNMorphenesByKNP(
                knprslt, s_idL, e_idL, n)  # still work in progress here
            kihon_kakariL_f, hinshi_kakariL_f = getBehindFrontNMorphenesKakariByKNP(
                knprslt, s_idL, e_idL, n)
            #print("start:",termL[1][1]," end:",e_posL)
            #print(kihonL,hinshiL)
            for termR in terms:
                if isSameTerm(termL, termR):
                    continue
                print(termL[0], "->", termR[0])
                s_posR = termR[1][1] - done_mrph_num
                e_posR = termR[1][1] - done_mrph_num
                sec_numR = termR[1][0]
                # Fetch the KNP result that contains this term
                if sec_numR == 0:
                    knprslt = knp_results[0][0]
                    s_idR = termR[1][1]
                else:
                    #print(term,e_pos,":")
                    for h_i, head in enumerate(head_morph_ids):
                        if e_posR < head:
                            #print(e_pos,head)
                            knprslt = knp_results[1][h_i - 1]
                            h = head_morph_ids[h_i - 1]
                            break
                    s_idR = e_posR - h  # morpheme id within its sentence
                    #print(term,knprslt.mrph_list()[s_id].midasi,s_id,h)
                e_idR = s_idR
                tmp_term_len = len(
                    mecab_result_list[sec_numR][e_posR][0])  # find the end pos
                while tmp_term_len != len(termR[0]):
                    tmp_term_len += len(mecab_result_list[sec_numR][e_posR +
                                                                    1][0])
                    e_posR += 1
                    e_idR += 1
                #print(termL[0]," ",termR[0]," ",knpresult.mrph_list()[s_posR].midasi," ",knp_results[i-1].mrph_list()[e_posR].midasi)
                kihonR_b, kihonR_f, hinshiR_b, hinshiR_f = getBehindFrontNMorphenesByKNP(
                    knprslt, s_idR, e_idR, n)  # still work in progress here
                kihon_kakariR_f, hinshi_kakariR_f = getBehindFrontNMorphenesKakariByKNP(
                    knprslt, s_idR, e_idR, n)
                tmpdata = {
                    "termL":
                    termL[0],
                    "termR":
                    termR[0],
                    "termLpos":
                    str(termL[1][0]) + "," + str(termL[1][1]) + "," +
                    str(termL[1][2]),
                    "termRpos":
                    str(termR[1][0]) + "," + str(termR[1][1]) + "," +
                    str(termR[1][2])
                }
                """
                tmpdata["kihonL_before_appear"]=(kihonL_b)
                tmpdata["kihonL_front_appear"]=(kihonL_f)
                tmpdata["hinshiL_before_appear"]=(hinshiL_b)
                tmpdata["hinshiL_front_appear"]=(hinshiL_f)
                tmpdata["kihonL_kakari_front_appear"]=(kihon_kakariL_f)
                tmpdata["hinshiL_kakari_front_appear"]=(hinshi_kakariL_f)
                tmpdata["kihonR_before_appear"]=(kihonR_b)
                tmpdata["kihonR_front_appear"]=(kihonR_f)
                tmpdata["hinshiR_before_appear"]=(hinshiR_b)
                tmpdata["hinshiR_front_appear"]=(hinshiR_f)
                tmpdata["kihonR_kakari_front_appear"]=(kihon_kakariR_f)
                tmpdata["hinshiR_kakari_front_appear"]=(hinshi_kakariR_f)
                """
                #extend_feature_vector_rel(tmpdata,termL[0],kihonL_b,kihonL_f,hinshiL_b,hinshiL_f,kihon_kakariL_f,hinshi_kakariL_f,termR[0],kihonR_b,kihonR_f,hinshiR_b,hinshiR_f,kihon_kakariR_f,hinshi_kakariR_f)  # work in progress
                #extend_feature_vector_kaku_rel(tmpdata,termL[1][1],e_posL,knp_results[i-1])  # case type, left term
                #extend_feature_vector_kaku_rel(tmpdata,termR[1][1],e_posR,knp_results[i-1])  # right term
                #extend_feature_vector_kakarinum_rel(tmpdata,termL[0],termL[1][1],e_posL,termR[0],termR[1][1],e_posR,knp_results[i-1])  # number of dependencies
                #extend_feature_vector_kakaritype_rel(tmpdata,termR[1][1],e_posR,knp_results[i-1])  # dependency type
                feature_datas.append(tmpdata)
            if i + 1 != len(terms_list):  # pair with terms in the next sentence
                #print("next sentence")
                #print(terms_list[i+1])
                for termR in terms_list[i + 1]:
                    if isSameTerm(termL, termR):
                        continue
                    print(termL[0], "->", termR[0])
                    # offsets into the next sentence use done_mrph_num_next
                    s_posR = termR[1][1] - done_mrph_num_next
                    e_posR = termR[1][1] - done_mrph_num_next
                    sec_numR = termR[1][0]
                    # Fetch the KNP result that contains this term
                    if sec_numR == 0:
                        knprslt = knp_results[0][0]
                        s_idR = termR[1][1]
                    else:
                        #print(term,e_pos,":")
                        for h_i, head in enumerate(head_morph_ids):
                            if e_posR < head:
                                #print(e_pos,head)
                                knprslt = knp_results[1][h_i - 1]
                                h = head_morph_ids[h_i - 1]
                                break
                        s_idR = e_posR - h  # morpheme id within its sentence
                        #print(term,knprslt.mrph_list()[s_id].midasi,s_id,h)
                    e_idR = s_idR
                    tmp_term_len = len(
                        mecab_result_list[sec_numR][e_posR][0])  # find the end pos
                    while tmp_term_len != len(termR[0]):
                        #print(termR[0])
                        #print(" ",knp_results[i].mrph_list()[e_posR].midasi)
                        tmp_term_len += len(
                            mecab_result_list[sec_numR][e_posR + 1][0])
                        e_posR += 1
                        e_idR += 1
                    #print(termR[1][1])
                    #print(termL[0]," ",termR[0]," ",knpresult.mrph_list()[s_posR].midasi," ",knp_results[i].mrph_list()[e_posR].midasi)
                    kihonR_b, kihonR_f, hinshiR_b, hinshiR_f = getBehindFrontNMorphenesByKNP(
                        knprslt, s_idR, e_idR, n)  # still work in progress here
                    kihon_kakariR_f, hinshi_kakariR_f = getBehindFrontNMorphenesKakariByKNP(
                        knprslt, s_idR, e_idR, n)
                    tmpdata = {
                        "termL":
                        termL[0],
                        "termR":
                        termR[0],
                        "termLpos":
                        str(termL[1][0]) + "," + str(termL[1][1]) + "," +
                        str(termL[1][2]),
                        "termRpos":
                        str(termR[1][0]) + "," + str(termR[1][1]) + "," +
                        str(termR[1][2])
                    }
                    """
                    tmpdata["kihonL_before_appear"]=(kihonL_b)
                    tmpdata["kihonL_front_appear"]=(kihonL_f)
                    tmpdata["hinshiL_before_appear"]=(hinshiL_b)
                    tmpdata["hinshiL_front_appear"]=(hinshiL_f)
                    tmpdata["kihonL_kakari_front_appear"]=(kihon_kakariL_f)
                    tmpdata["hinshiL_kakari_front_appear"]=(hinshi_kakariL_f)
                    tmpdata["kihonR_before_appear"]=(kihonR_b)
                    tmpdata["kihonR_front_appear"]=(kihonR_f)
                    tmpdata["hinshiR_before_appear"]=(hinshiR_b)
                    tmpdata["hinshiR_front_appear"]=(hinshiR_f)
                    tmpdata["kihonR_kakari_front_appear"]=(kihon_kakariR_f)
                    tmpdata["hinshiR_kakari_front_appear"]=(hinshi_kakariR_f)
                    """
                    #extend_feature_vector_rel(tmpdata,termL[0],kihonL_b,kihonL_f,hinshiL_b,hinshiL_f,kihon_kakariL_f,hinshi_kakariL_f,termR[0],kihonR_b,kihonR_f,hinshiR_b,hinshiR_f,kihon_kakariR_f,hinshi_kakariR_f)  # work in progress
                    #extend_feature_vector_joshi_rel(tmpdata,s_posL,e_posL,knp_results[i-1])  # surrounding particles, left term
                    #extend_feature_vector_joshi_rel(tmpdata,s_posR,e_posR,knp_results[i])  # right term
                    # TODO: case type
                    # TODO: number of dependencies
                    # TODO: dependency type
                    feature_datas.append(tmpdata)
        if i > 0:
            done_mrph_num += len(knp_results[1][i - 1].mrph_list())
    return feature_datas
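
getBehindFrontNMorphenesByKNP is not shown; its call sites unpack the base forms and parts of speech of the n morphemes on either side of a term span. A minimal sketch under that assumption (genkei and hinsi are the standard pyknp morpheme attributes):

def getBehindFrontNMorphenesByKNP(knprslt, s_id, e_id, n):
    # Hypothetical reconstruction: n morphemes before s_id and after e_id.
    mrphs = knprslt.mrph_list()
    before = mrphs[max(0, s_id - n):s_id]
    after = mrphs[e_id + 1:e_id + 1 + n]
    return ([m.genkei for m in before], [m.genkei for m in after],
            [m.hinsi for m in before], [m.hinsi for m in after])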
Example No. 15
 def __init__(self):
     self.knp = pyknp.KNP()
Example No. 16
def knp_multithread():
    return pyknp.KNP(multithreading=True)
Example No. 17
def knp():
    return pyknp.KNP()
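
Examples No. 16 and No. 17 read like pytest fixtures whose decorators were lost in extraction. A hedged reconstruction of how a test might consume them:

import pytest
import pyknp

@pytest.fixture
def knp():
    return pyknp.KNP()

def test_parse(knp):
    result = knp.parse("太郎は花子に会いに行った")
    assert len(result.mrph_list()) > 0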
Example No. 18
    def __init__(self,
                 data_dir,
                 svg_dir_url,
                 keyword='water',
                 data_date='20200120',
                 threshold='0.6',
                 category_file=None):
        # If keyword is a super-category, put its sub-categories in
        # keyword_list; if it is a sub-category, use it as keyword_list
        if category_file is not None:
            with open(category_file) as f:
                categ_lines = f.readlines()
            categ_dic = {}
            for line in categ_lines:
                categ = line.strip().split()
                super_categ = categ[2]
                sub_categ = categ[3]
                categ_dic[super_categ] = categ_dic.get(super_categ,
                                                       []) + [sub_categ]
            if keyword in categ_dic:
                self.keyword_list = categ_dic[keyword]
            else:
                self.keyword_list = [keyword]
        else:
            self.keyword_list = [keyword]

        self.data_dir = data_dir
        self.svg_dir_url = svg_dir_url
        self.threshold = str(float(threshold))
        self.data_date = data_date
        self.negative_pattern_list = ['不満だ/ふまんだ', '嫌だ/いやだ', '困る/こまる']

        self.knp = pyknp.KNP(
            jumancommand='/mnt/violet/share/tool/juman++v2/bin/jumanpp')
        self.rep2cluster = {}
        self.event_json = {}
        self.cluster_vec_dict = {}
        self.url_format_str = {}
        self.rep2events = {}

        for keyword in self.keyword_list:
            clustering_file = os.path.join(
                self.data_dir,
                'clustering/{}/{}_{}.json'.format(data_date, keyword,
                                                  threshold))
            cluster_vec_file = os.path.join(
                self.data_dir, 'clustering/{}/{}_{}.vec.pickle'.format(
                    data_date, keyword, threshold))
            event_path = os.path.join(
                self.data_dir, 'event_pairs/{}/{}.event_pairs.json'.format(
                    data_date, keyword))
            self.url_format_str[keyword] = os.path.join(
                self.svg_dir_url, '{}/{}/{{}}.svg'.format(data_date, keyword))

            self.rep2cluster[keyword] = {}
            self.event_json[keyword] = []
            self.cluster_vec_dict[keyword] = {}
            self.rep2events[keyword] = defaultdict(list)

            with open(clustering_file, 'r') as f:
                for key, value in json.load(f).items():
                    for v in value:
                        self.rep2cluster[keyword][v] = key

            with open(event_path, 'r') as f:
                self.event_json[keyword] = json.load(f)

            with open(cluster_vec_file, 'rb') as f:
                self.cluster_vec_dict[keyword] = pickle.load(f)

            for event in self.event_json[keyword]:
                self.rep2events[keyword][event["modifier_reps"]].append(
                    (event, "modifier"))
                self.rep2events[keyword][event["head_reps"]].append(
                    (event, "head"))
Example No. 19
 def __init__(self):
     self.original_knp = pyknp.KNP()
     self.knp_dict = {}
     if os.path.exists('knp.pickle'):
         self.load()
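
The load() method is not shown; a minimal sketch consistent with the knp.pickle cache checked above:

 def load(self):
     # Hypothetical: restore the parse cache saved as knp.pickle.
     import pickle
     with open('knp.pickle', 'rb') as f:
         self.knp_dict = pickle.load(f)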
Example No. 20
 async def on_delete(self, req, resp):
     self.knp = pyknp.KNP()
Example No. 21
def processEachTerm(term_num,
                    term_dic,
                    mecab_result_list,
                    n,
                    titleabst_str=[],
                    keywords=[],
                    fulltext=""):  # n: range of n-grams used as features
    """
    Feature extraction for each term.
    Feature data: [target term, position pos, frequency, single-character flag,
    appears-in-title, appears-in-abstract, appears-in-keywords,
    surrounding n-morpheme base forms, surrounding n-morpheme POS,
    base-form vector, POS vector]
    """
    knp = pyknp.KNP(command='knp',
                    option='-tab -anaphora',
                    jumancommand='jumanpp',
                    jumanpp=True)
    knp_results = [[], []]  # [title results, abstract results]
    knp_results[0].append(knp.parse(titleabst_str[0]))
    for sentence in re.split(r"\.|。",
                             replaceDpoint(titleabst_str[1].replace("．", "."))):
        #for sentence in replaceDpoint(titleabst_str[1].replace(".",".")).split("."):
        #print(sentence.replace("<dpoint>","."))
        knp_results[1].append(
            knp.parse(sentence.replace("<dpoint>", ".") + "."))
        #print(sentence.replace("<dpoint>",".")+".")
    head_morph_ids = [0]  # morpheme index of the head of each abstract sentence
    #print(re.split(r"\.|。",replaceDpoint(titleabst_str[1].replace(".",".")).replace("<dpoint>",".")))
    #sys.exit()
    now = 0
    for rslt in knp_results[1]:
        now = now + len(rslt.mrph_list())
        head_morph_ids.append(now)
        #print(len(rslt.mrph_list()))
        #for m in rslt.mrph_list():
        #    print(m.midasi+" ",end="")
        #print()
    head_morph_ids.pop()

    #for rslt in knp_results:
    #    for r in rslt:
    #        for m in r.mrph_list():
    #            print(m.midasi)

    #num=0
    #for rslt in knp_results[1]:
    #    print(rslt.mrph_list()[0].midasi,num)
    #    num+=len(rslt.mrph_list())
    #print(head_morph_ids)
    #sys.exit()
    outputdata = []  # list of feature dicts; returned at the end
    freq_list = getFreqList(term_dic, fulltext)
    for term, pos_list in term_dic.items():
        in_title = "0.0"
        in_abst = "0.0"
        in_kw = "0.0"
        if term in titleabst_str[0]:
            in_title = "1.0"
        if term in titleabst_str[1]:
            in_abst = "1.0"
        if term in keywords:
            in_kw = "1.0"
        #freq=str(calcFreqFeature(freq_list,len(pos_list),10))  # use this or just the raw frequency?
        freq = str(calcTfidfFeature(term, len(pos_list), term_num))
        is_uni = "0.0"
        if len(term) == 1:
            is_uni = "1.0"
        digit_rate = str(digit_num_per_term(term))
        alpha_rate = str(alpha_num_per_term(term))
        #tmpdata=[term,freq,is_uni,digit_rate,alpha_rate,in_title,in_abst,in_kw]
        tmpdata = {
            "term": term,
            "freq": freq,
            "is_uni": is_uni,
            "digit_rate": digit_rate,
            "alpha_rate": alpha_rate,
            "in_title": in_title,
            "in_abst": in_abst,
            "in_kw": in_kw
        }
        #print("term : ",term)
        for pos in pos_list:
            #print("  pos : ",pos)
            e_pos = pos[1]
            sec_num = pos[0]
            # Fetch the KNP result that contains this term
            if sec_num == 0:
                knprslt = knp_results[0][0]
                s_id = pos[1]
            else:
                #print(term,e_pos,":")
                for h_i, head in enumerate(head_morph_ids):
                    if e_pos < head:
                        #print(e_pos,head)
                        knprslt = knp_results[1][h_i - 1]
                        h = head_morph_ids[h_i - 1]
                        break
                s_id = e_pos - h  # morpheme id within its sentence
                #print(term,knprslt.mrph_list()[s_id].midasi,s_id,h)

            e_id = s_id
            tmp_term_len = len(mecab_result_list[sec_num][e_pos][0])
            #print(term)
            while tmp_term_len != len(term):  # find the term's end pos
                #print(" ",mecab_result_list[sec_num][e_pos+1][0])
                tmp_term_len += len(mecab_result_list[sec_num][e_pos + 1][0])
                e_pos += 1
                e_id += 1
            #print(term,s_id,e_id,knprslt)
            #kihon_b,kihon_f,hinshi_b,hinshi_f=getBehindFrontNMorphenes(mecab_result_list[sec_num],pos[1],e_pos,n)
            kihon_b, kihon_f, hinshi_b, hinshi_f = getBehindFrontNMorphenesByKNP(
                knprslt, s_id, e_id, n)
            #print(kihon_b,term,kihon_f)
            kihon_kakari_f, hinshi_kakari_f = getBehindFrontNMorphenesKakariByKNP(
                knprslt, s_id, e_id, n)
            #print(term,kihon_kakari_f)

            #print("".join(kihon[:4]),term,"".join(kihon[4:]))
            tmpdata["pos"] = pos
            tmpdata["kihon_before_appear"] = (kihon_b)
            tmpdata["kihon_front_appear"] = (kihon_f)
            tmpdata["hinshi_before_appear"] = (hinshi_b)
            tmpdata["hinshi_front_appear"] = (hinshi_f)
            tmpdata["kihon_kakari_front_appear"] = (kihon_kakari_f)
            tmpdata["hinshi_kakari_front_appear"] = (hinshi_kakari_f)
            extend_feature_vector(tmpdata, term, kihon_b, kihon_f, hinshi_b,
                                  hinshi_f, kihon_kakari_f, hinshi_kakari_f)
            outputdata.append(tmpdata)
            tmpdata = {
                "term": term,
                "freq": freq,
                "is_uni": is_uni,
                "digit_rate": digit_rate,
                "alpha_rate": alpha_rate,
                "in_title": in_title,
                "in_abst": in_abst,
                "in_kw": in_kw
            }
    #sys.exit()
    return outputdata
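
A hypothetical call, assuming term_dic maps each term to (section, morpheme-index, char-offset) triples and mecab_result_list holds the per-section morpheme tuples the position arithmetic above relies on:

features = processEachTerm(term_num=len(term_dic),
                           term_dic=term_dic,
                           mecab_result_list=mecab_result_list,
                           n=3,
                           titleabst_str=[title, abstract],
                           keywords=keywords,
                           fulltext=title + abstract)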