Пример #1
0
def extract_cp(phs):
    """Collect noun / case-particle / predicate records from parsed phrases.

    Within every phrase that has a usable parent, each pair of adjacent
    words is inspected. When a word whose ``pos`` is "名詞" is immediately
    followed by a word whose ``pos_detail`` is "格助詞", a record is built
    from that noun, the particle, and the word returned by
    ``_extract_self_sufficient_word`` for the parent phrase.

    Args:
        phs: Mapping of phrase id to phrase object. Each phrase exposes
            ``parent_id`` and ``words``; ``parent_id`` is used as a key
            into ``phs`` to find the parent phrase.

    Returns:
        A list of dicts with the keys "noun", "particle", "predicate" and
        "cateogory". An empty list is returned when ``phs`` is falsy.
    """
    if not phs:
        return []

    cps = []
    for ph in phs.values():
        # Phrases with a falsy or negative parent_id have no parent phrase
        # to take a predicate from.
        if not ph.parent_id or ph.parent_id < 0:
            continue

        # Pair every word with the word that actually precedes it. Comparing
        # against a never-updated placeholder would make the noun test below
        # impossible to satisfy.
        for previous_w, word in zip(ph.words, ph.words[1:]):
            if previous_w.pos_detail == "形式名詞":
                continue
            if word.pos_detail != "格助詞" or previous_w.pos != "名詞":
                continue

            parent = phs[ph.parent_id]
            parent_self_w = _extract_self_sufficient_word(parent.words)
            if not parent_self_w:
                continue

            cps.append({
                "noun": previous_w.base,
                "particle": word.particle,
                "predicate": parent_self_w.base,
                # NOTE(review): the key is misspelled ("cateogory"); it is
                # kept as-is because consumers may already read it.
                "cateogory": word.category,
            })
    return cps
Пример #2
0
def _knp(sentence):
    if sentence == "":
        return None

    knp = pyknp.KNP()
    try:
        result = knp.parse(sentence)
    except:
        return None

    # 情報をクラスに格納
    phrases = OrderedDict()  # 文節クラスのディクショナリ
    for bnst in result.bnst_list():
        ph = mynlp.PhraseClass(parent_id=bnst.parent_id,
                               dpndtype=bnst.dpndtype)

        # この文節にふくまれる単語情報を格納
        for mrph in bnst.mrph_list():  # mrph_list:文節内の形態素リスト
            word = mynlp.WordClass(surface=mrph.midasi,
                                   base=mrph.genkei,
                                   yomi=mrph.yomi)

            # 品詞関連詳細情報
            pos_info = mrph.spec().split(" ")  # or .new_spec()
            # 表層形 読み 見出し語 品詞大分類 品詞大分類ID 品詞細分類 品詞細分類ID 活用型 活用型ID 活用形 活用形ID 意味情報
            word.pos = pos_info[3]  # 品詞
            word.pos_detail = pos_info[5]  # 品詞細分類

            # 意味情報関連
            imis = mrph.imis.split()  # 代表表記,漢字読み,カテゴリなど
            for imi in imis:
                if "代表表記" in imi:
                    word.descriptions = imi.split(":", 1)[-1]
                elif "カテゴリ" in imi:
                    word.category = imi.split(":", 1)[-1]
                elif "ドメイン" in imi:
                    word.domain = imi.split(":", 1)[-1]
                elif ("人名:" in imi) or ("地名:" in imi):  # 固有名詞
                    word.proper_noun = imi.split(":", 1)[-1]
                else:
                    word.another = word.another + imi + " "

            ph.words.append(word)

        phrases[bnst.bnst_id] = ph

    for ph_i, ph in phrases.items():
        if ph.parent_id != -1:
            phrases[ph.parent_id].children.append(ph_i)

    return phrases
Пример #3
0
def _embed_to_class(bnsts):
    phs = OrderedDict()
    if not bnsts:
        return None
    for bnst_i, bnst in enumerate(bnsts):
        ph = mynlp.PhraseClass(parent_id=bnst["parent_id"], parent=bnst["parent"],
                               children=bnst["children"], dpndtype=bnst["dpndtype"])

        for mrph in bnst["words"]:
            word = mynlp.WordClass(mrph["surface"],mrph["base"],mrph["yomi"],
                                   mrph["pos"],mrph["pos_detail"],mrph["descriptions"],
                                   mrph["category"],mrph["domain"],mrph["another"],
                                   mrph["proper_noun"])
            ph.words.append(word)
        phs[bnst_i] = ph
    return phs
Пример #4
0
def _cross_word(w1, w2):
    """Fuse two adjacent words into one new compound word.

    The compound's surface and reading are the concatenation of both
    words; its base form is ``w1.surface + w2.base`` and its part of
    speech is taken from ``w2``. Neither input word is modified.

    Args:
        w1: Leading word.
        w2: Trailing word.

    Returns:
        A new ``mynlp.WordClass`` describing the compound.
    """
    nw = mynlp.WordClass()
    nw.base = w1.surface + w2.base
    nw.surface = w1.surface + w2.surface
    nw.yomi = w1.yomi + w2.yomi

    nw.pos = w2.pos
    nw.pos_detail = w2.pos_detail

    # Work on a copy: extending w1.original_words in place would silently
    # change w1 and make it share one list object with the new word.
    originals = list(w1.original_words)
    if w2.original_words:
        originals.extend(w2.original_words)
    else:
        originals.append(w2.base)
    nw.original_words = originals

    try:
        nw.descriptions = w1.descriptions + "," + w2.descriptions
    except (TypeError, AttributeError):
        # One side is missing or not concatenable; keep w1's value if any.
        if w1.descriptions:
            nw.descriptions = w1.descriptions

    # list.extend() returns None and mutates its receiver, so lists are
    # combined into a fresh list instead. For every other value type the
    # trailing word's value is used when it is set.
    for attr in ("category", "domain", "another", "proper_noun"):
        first = getattr(w1, attr, None)
        second = getattr(w2, attr, None)
        if isinstance(first, list) and isinstance(second, list):
            merged = first + second
        else:
            merged = second
        if merged:
            setattr(nw, attr, merged)
    return nw