Example #1
from pyknp import KNP


def select_dependency_structure(line):
    """Extract the dependency structure of a sentence.
    """

    # Initialize KNP (tab output with anaphora resolution)
    print("called select_dependency_structure()")
    knp = KNP(option='-tab -anaphora')

    # Parse the sentence
    result = knp.parse(line)

    # Bunsetsu (phrase) list
    bnst_list = result.bnst_list()

    # Index the bunsetsu list by bnst_id
    bnst_dic = dict((x.bnst_id, x) for x in bnst_list)

    tuples = []
    for bnst in bnst_list:
        if bnst.parent_id != -1:
            # (from, to): normalized notations of the bunsetsu and its head
            tuples.append((select_normalization_representative_notation(bnst.fstring),
                           select_normalization_representative_notation(bnst_dic[bnst.parent_id].fstring)))

    return tuples
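A minimal usage sketch for the function above, assuming KNP/JUMAN are installed. select_normalization_representative_notation is defined elsewhere in the source project; a hypothetical stand-in that pulls the 正規化代表表記 feature out of the KNP feature string is substituted here:

import re

def select_normalization_representative_notation(fstring):
    # Hypothetical stand-in: extract the normalized representative notation
    # (正規化代表表記) from the KNP feature string; fall back to the raw string.
    m = re.search(r"<正規化代表表記:([^>]+)>", fstring)
    return m.group(1) if m else fstring

for pair in select_dependency_structure("先生は自転車で学校に行った。"):
    print(pair)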
Example #2
# Python 2 code: raw_input() and str.decode() are used throughout
import sys

from pyknp import Juman, KNP


class Solver(object):
    def __init__(self):
        self.juman = Juman()
        self.knp = KNP()

    def Q61(self):
        u"""61. Read a sentence from stdin and print it tokenized into words (insert a space between morphemes).
        """

        input_sentence = raw_input()
        result = self.juman.analysis(input_sentence.decode("utf8"))
        for mrph in result.mrph_list():
            sys.stdout.write("{} ".format(mrph.midasi.encode("utf8")))
        sys.stdout.write("\n")
        return

    def Q62(self):
        u"""62. Read morphological analysis results and print only the nouns.

        Hint: check whether mrph.hinsi equals the string u"名詞"
        """
        data = u""
        for line in iter(sys.stdin.readline, ""):  # read input line by line
            data += line.decode("utf8")
            if line.strip() == "EOS":  # parse once a full sentence has been read
                result = self.juman.result(data)
                s = ",".join(mrph.midasi for mrph in result.mrph_list() if mrph.hinsi == u"名詞")  # nouns only
                if len(s) > 0:
                    print(s)
                data = u""

    def Q63(self):
        u"""63. Read morphological analysis results and print only the verbs (as base forms).

        Hint: check whether mrph.hinsi equals the string u"動詞"
        """
        data = u""
        for line in iter(sys.stdin.readline, ""):  # read input line by line
            data += line.decode("utf8")
            if line.strip() == "EOS":  # parse once a full sentence has been read
                result = self.juman.result(data)
                s = ",".join(mrph.genkei for mrph in result.mrph_list() if mrph.hinsi == u"動詞")  # verbs only
                if len(s) > 0:
                    print(s)
                data = u""

    def Q64(self):
        u"""64. Read morphological analysis results and sort the morpheme base forms by frequency.

        Hint: use a dictionary and the sorted function
        """
        data = u""
        hist = {}
        for line in iter(sys.stdin.readline, ""):  # read input line by line
            data += line.decode("utf8")
            if line.strip() == "EOS":  # parse once a full sentence has been read
                result = self.juman.result(data)
                for mrph in result.mrph_list():
                    hist[mrph.genkei] = hist.get(mrph.genkei, 0) + 1
                data = u""
        for key, val in sorted(hist.items(), key=lambda t: t[1], reverse=True):
            print("{},{}".format(key.encode("utf8"), val))

    def Q65(self):
        u"""65. Read morphological analysis results and compute the ratio of predicates to the total number of morphemes.

        Here, predicates are verbs, i-adjectives (keiyoushi), and na-adjectives (keiyoudoushi).
        """

        data = u""
        num = 0
        denom = 0
        for line in iter(sys.stdin.readline, ""):  # read input line by line
            data += line.decode("utf8")
            if line.strip() == "EOS":  # parse once a full sentence has been read
                result = self.juman.result(data)
                for mrph in result.mrph_list():
                    denom += 1
                    if mrph.hinsi == u"動詞":
                        num += 1
                        continue
                    # JUMAN tags both i- and na-adjectives with hinsi 形容詞;
                    # the inflection type (katuyou1) distinguishes the two
                    if mrph.hinsi == u"形容詞" and mrph.katuyou1.startswith(u"イ形容詞"):
                        num += 1
                        continue
                    if mrph.hinsi == u"形容詞" and mrph.katuyou1.startswith(u"ナ形容詞"):
                        num += 1
                        continue
                data = u""

        print("{}/{}={}".format(num, denom, float(num) / denom))

    def Q66(self):
        u"""66. Read morphological analysis results and extract and print every "sahen noun + suru/dekiru" pattern.
        """

        data = u""
        extract = set()
        for line in iter(sys.stdin.readline, ""):  # read input line by line
            data += line.decode("utf8")
            if line.strip() == "EOS":  # parse once a full sentence has been read
                result = self.juman.result(data)
                buff = None
                for mrph in result.mrph_list():
                    if mrph.genkei == u"できる" or mrph.genkei == u"する":
                        if buff is not None:
                            extract.add((buff.genkei.encode("utf8"), mrph.genkei.encode("utf8")))

                    if mrph.bunrui == u"サ変名詞":
                        buff = mrph
                    else:
                        buff = None
                data = u""
        for t in extract:
            print("{}+{}".format(t[0], t[1]))

    def Q67(self):
        u"""67. Read morphological analysis results and print every "A の B" expression (A and B are single noun morphemes).
        """

        data = u""
        extract = set()
        for line in iter(sys.stdin.readline, ""):  # read input line by line
            data += line.decode("utf8")
            if line.strip() == "EOS":  # parse once a full sentence has been read
                result = self.juman.result(data)
                buff = []
                for mrph in result.mrph_list():
                    if mrph.genkei == u"の" and len(buff) == 1:
                        buff.append(u"の")
                        continue
                    if mrph.hinsi == u"名詞":
                        if len(buff) == 0:
                            buff.append(mrph.genkei)
                            continue
                        if len(buff) == 2:
                            extract.add((buff[0], mrph.genkei))
                    buff = []
                data = u""
        for t in extract:
            print("{}の{}".format(t[0].encode("utf8"), t[1].encode("utf8")))

    def Q68(self):
        u"""68. Read a sentence from stdin and print it tokenized into bunsetsu (insert a space between bunsetsu).
        """

        input_sentence = raw_input()
        result = self.knp.parse(input_sentence.decode("utf8"))
        for bnst in result.bnst_list():
            sys.stdout.write("{} ".format("".join(mrph.midasi.encode("utf8") for mrph in bnst.mrph_list())))
        sys.stdout.write("\n")
        return

    def Q69(self):
        u"""69. Read parsing results and print the bunsetsu that contain a prefix (接頭辞).
        """

        data = u""
        extract = set()
        for line in iter(sys.stdin.readline, ""):
            data += line.decode("utf8")
            if line.strip() == "EOS":
                result = self.knp.result(data)
                for bnst in result.bnst_list():
                    if len(filter(lambda x: x.hinsi == u"接頭辞", bnst.mrph_list())) < 1:
                        continue
                    extract.add("{} ".format("".join(mrph.midasi.encode("utf8") for mrph in bnst.mrph_list())))
                data = u""
        for bnst in extract:
            if len(bnst) > 0:
                print(bnst)
        return

    def Q70(self):
        u"""70. Read parsing results and print the bunsetsu that contain two or more nouns.
        """

        data = u""
        extract = set()
        for line in iter(sys.stdin.readline, ""):
            data += line.decode("utf8")
            if line.strip() == "EOS":
                result = self.knp.result(data)
                for bnst in result.bnst_list():
                    if len(filter(lambda x: x.hinsi == u"名詞", bnst.mrph_list())) < 2:
                        continue
                    extract.add("{} ".format("".join(mrph.midasi.encode("utf8") for mrph in bnst.mrph_list())))
                data = u""
        for bnst in extract:
            if len(bnst) > 0:
                print(bnst)

        return
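A hypothetical driver for the class above (Python 2). The stdin-based questions expect JUMAN/KNP analysis results terminated by EOS lines, so input would be piped in, e.g. echo "吾輩は猫である。" | juman | python solver.py:

if __name__ == "__main__":
    # Hypothetical entry point: pick whichever question should run
    solver = Solver()
    solver.Q62()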
Example #3
# coding: utf-8
from pyknp import KNP

sent = "先生は自転車で学校に行った。"
knp = KNP()
result = knp.parse(sent)

# Bunsetsu (phrase) level
for bnst in result.bnst_list():
    midasi = "".join(mrph.midasi for mrph in bnst.mrph_list())
    print(bnst.bnst_id, midasi, bnst.dpndtype, bnst.parent_id, bnst.fstring)

# Tag (basic phrase) level
print("-----------------------------------")
for tag in result.tag_list():
    midasi = "".join(mrph.midasi for mrph in tag.mrph_list())
    print(tag.tag_id, midasi, tag.dpndtype, tag.parent_id, tag.fstring)

# Morpheme level
print("-----------------------------------")
for mrph in result.mrph_list():
    print(
        mrph.midasi,
        mrph.yomi,
        mrph.genkei,
        mrph.hinsi,
        mrph.bunrui,
        mrph.katuyou1,
        mrph.katuyou2,
        mrph.imis,
    )
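As a small follow-up sketch (reusing `result` from above), the parent_id links on the bunsetsu list are enough to print the dependency edges, mirroring the (from, to) tuples of Example #1:

# Walk the dependency edges: each bunsetsu points at its head via parent_id
# (-1 marks the root).
bnst_dic = {b.bnst_id: b for b in result.bnst_list()}
for bnst in result.bnst_list():
    if bnst.parent_id != -1:
        child = "".join(m.midasi for m in bnst.mrph_list())
        parent = "".join(m.midasi for m in bnst_dic[bnst.parent_id].mrph_list())
        print(child, "->", parent)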
Example #4
import argparse
import io
import sys

from pyknp import KNP

# MrphSeqMatch is defined elsewhere in the source project.

if __name__ == "__main__":
    sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

    parser = argparse.ArgumentParser()
    parser.add_argument("--rule_file",
                        dest='rule_file',
                        type=str,
                        action='store',
                        required=True)
    parser.add_argument("--sentence",
                        dest='sentence',
                        type=str,
                        action='store',
                        required=True)
    parser.add_argument("--juman_command",
                        dest='sentence',
                        type=str,
                        action='store',
                        default="/mnt/violet/share/tool/juman++v2/bin/jumanpp")
    args = parser.parse_args()

    mrph_seq_match = MrphSeqMatch(args.rule_file)

    knp = KNP(jumancommand=args.juman_command, option="-tab -dpnd")
    result = knp.parse(args.sentence)

    flag = mrph_seq_match.mrph_seq_match(result)
    print(flag)
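MrphSeqMatch itself is not shown in this example. As a rough, hypothetical stand-in, assuming each rule is one whitespace-separated sequence of base forms per line, a matcher with the same interface could look like this:

class MrphSeqMatch(object):
    """Hypothetical stand-in: True if any rule occurs as a contiguous
    run of base forms (genkei) in the parsed sentence."""

    def __init__(self, rule_file):
        with io.open(rule_file, encoding="utf-8") as f:
            # One whitespace-separated morpheme sequence per line
            self.rules = [line.split() for line in f if line.strip()]

    def mrph_seq_match(self, result):
        genkei = [m.genkei for m in result.mrph_list()]
        for rule in self.rules:
            for i in range(len(genkei) - len(rule) + 1):
                if genkei[i:i + len(rule)] == rule:
                    return True
        return False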
Example #5
import re
from collections import Counter
from typing import Dict, List

import numpy
from janome.tokenizer import Tokenizer  # assumed: matches part_of_speech/infl_form usage below
from pyknp import Juman, KNP

# Chunk, Tree, DefaultOptions, util, and wordnet come from the source
# project (trf) and are assumed to be importable here.


class Analyser:
    """Class for syntactic Analysis
    """

    def __init__(self, text: str, delimiter: str='\n'):
        self.text = text
        self.delimiter = delimiter
        self.sentences = util.split_text(self.text, delimiter)
        self.n_sentences = len(self.sentences)
        self.knp = KNP(option=DefaultOptions.KNP, jumanpp=False)
        self.trees = self._trees()
        self.juman = Juman(jumanpp=False)
        self.rs_pos = self.calc_rs_pos()
        self.n_mrphs = self.calc_n_mrphs()
        self.n_chunks = self.calc_n_chunks()
        self.n_types = self.calc_n_types()
        self.mean_n_mrphs = None \
            if self.n_sentences == 0 \
            else self.n_mrphs / self.n_sentences
        self.rs_modality = self.calc_rs_modality()
        self.r_conditional = None \
            if self.n_sentences == 0 \
            else self.calc_n_conditionals() / self.n_sentences
        self.mean_tree_depths = self.calc_mean_tree_depths()

    def _trees(self) -> List[Tree]:
        """Analyse dependency structure using KNP
        Returns:
            list(trf.Tree)
        """

        results = []

        for sentence in self.sentences:
            chunks = []
            parse_result = self.knp.parse(sentence)
            for bnst in parse_result.bnst_list():
                chunk = Chunk(chunk_id=bnst.bnst_id,
                              link=bnst.parent_id,
                              description=bnst.fstring)
                chunks.append(chunk)
            surfaces = [m.midasi for m in parse_result.mrph_list()]
            results.append(Tree(sentence, chunks, surfaces))

        return results

    def calc_rs_pos(self) -> Dict[str, float]:
        """Calculate the ratio of each POS among the words in the input text
        Returns:
            dict(str, float): POS name -> ratio
        """
        pos = []
        # TODO: this may take a long time when the number of sentences is large
        for sentence in self.sentences:
            juman_result = self.juman.analysis(sentence)
            pos += [mrph.hinsi for mrph in juman_result.mrph_list()]
        pos_counter = Counter(pos)
        total = sum(pos_counter.values())
        return {name: float(num) / total for name, num in pos_counter.items()}

    def calc_mean_tree_depths(self) -> float:
        """Calculate the mean depth of dependency tree
        Returns:
            float: The mean depth of trees
        """
        return numpy.mean([tree.depth for tree in self.trees])

    def calc_mean_sentence_length(self) -> float:
        """Calculate the mean length (# of morphs) of sentences
        Returns:
            float: the mean length of sentences
        """
        result = 0
        for sentence in self.sentences:
            juman_result = self.juman.analysis(sentence)
            result += len(juman_result.mrph_list())
        return result / self.n_sentences

    def calc_n_sentences(self) -> int:
        """Calculate the number of sentences of input text
        Returns:
            int: the number of sentences of the input text, split by the delimiter (default: '\n')
        """
        return self.n_sentences

    def calc_n_types(self) -> int:
        """Calculate the number of types of input text
        Returns:
            int: the number of types of input text
        """
        surfaces = []
        for sentence in self.sentences:
            juman_result = self.juman.analysis(sentence)
            surfaces += [mrph.midasi for mrph in juman_result.mrph_list()]
        word_type_counter = Counter(surfaces)
        return len(word_type_counter)

    def calc_n_mrphs(self) -> int:
        """Calculate the number of morphemes of input text
        Returns:
            int: the number of morphemes of input text
        """
        n_mrphs = 0
        for sentence in self.sentences:
            juman_result = self.juman.analysis(sentence)
            n_mrphs += len(juman_result.mrph_list())
        return n_mrphs

    def calc_n_chunks(self) -> int:
        # TODO: share the parsing result with _trees instead of re-parsing
        return sum([len(self.knp.parse(s).bnst_list())
                    for s in self.sentences])

    def calc_rs_modality(self) -> Dict[str, float]:
        """Calculate the ratio of each modality marker to the number of sentences
        Returns:
            dict(str, float)
        """
        modality_counter = Counter()
        for s in self.sentences:
            chunks = []
            for bnst in self.knp.parse(s).bnst_list():
                chunk = Chunk(chunk_id=bnst.bnst_id,
                              link=bnst.parent_id,
                              description=bnst.fstring)
                chunks.append(chunk)

            fstrings = "".join(chunk.description for chunk in chunks)
            ms = set(re.findall("<モダリティ-(.+?)>", fstrings))
            modality_counter += Counter(ms)

        return {k: float(c) / self.n_sentences
                for k, c in modality_counter.items()}

    def calc_n_conditionals(self) -> int:
        """
        Returns:
            int: the number of sentences that contains one or more conditional clauses
        """
        result = 0

        tokenizer = Tokenizer()
        for s in self.sentences:
            for token in tokenizer.tokenize(s):
                if token.infl_form == '仮定形':
                    result += 1
                    break

        return result

    def calc_mean_thesaurus_depths(self) -> float:
        # TODO: Share the parsing result
        surfaces = []
        tokenizer = Tokenizer()
        for s in self.sentences:
            for token in tokenizer.tokenize(s):
                pos, pos1, _, _ = token.part_of_speech.split(',')
                if pos == '名詞' and pos1 == '一般':
                    surfaces.append(token.surface)

        return wordnet.calc_mean_thesaurus_depths(surfaces)
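A minimal usage sketch for the class above, assuming JUMAN and KNP are installed and the trf package is on the path (the printed values are illustrative):

text = "先生は自転車で学校に行った。\n太郎は花子に本を渡した。"
analyser = Analyser(text, delimiter='\n')
print(analyser.n_sentences, analyser.n_mrphs, analyser.n_chunks)
print(analyser.rs_pos)       # POS -> ratio, e.g. {'名詞': 0.33, ...}
print(analyser.rs_modality)  # modality marker -> ratio per sentence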
Example #6
File: parser.py  Project: hashimom/Kasuga
from pyknp import KNP


class Parser:
    def __init__(self):
        """
        Parser class
        """
        self.knp = KNP()

    def __call__(self, text):
        """
        Parse a text with KNP
        :param text: input text
        :return: analysis result (dict), or None if parsing fails
        """
        chunks = []
        links = []

        try:
            result = self.knp.parse(text)
        except Exception as e:
            print("text \"" + text + "\" " + str(e))
            return None

        # Build the word arrays
        for bnst in result.bnst_list():
            chunk = {"Independent": [], "Ancillary": [], "Link": None}

            for mrph in bnst.mrph_list():
                tmp = {"surface": mrph.midasi,
                       "original": mrph.genkei,
                       "read": mrph.yomi,
                       "position": [mrph.hinsi, mrph.bunrui],
                       "conjugate": [mrph.katuyou1, mrph.katuyou2],
                       }

                # Independent (content) words
                if tmp["position"][0] != "助詞" and \
                        tmp["position"][0] != "助動詞" and \
                        tmp["position"][0] != "判定詞" and \
                        tmp["position"][0] != "特殊":
                    chunk["Independent"].append(tmp)

                # Ancillary (function) words
                else:
                    chunk["Ancillary"].append(tmp)

            # Register the chunk and its dependency target
            chunks.append(chunk)
            links.append(bnst.parent_id)

        # Attach dependency information (parent_id of -1 marks the root)
        for chunk_id, link_id in enumerate(links):
            if link_id != -1:
                chunks[chunk_id]["Link"] = chunks[link_id]["Independent"]

        return {"Body": text, "Chunks": chunks}

    @classmethod
    def display(cls, info):
        """
        Display parsed information
        :param info: analysis result returned by __call__
        :return:
        """
        for parse in info["Chunks"]:
            print("Chunk: ")

            surface, read, original = cls._join_tokens(parse["Independent"])
            print(" Independent: " + surface + "/" + read + " (" + original + ")")

            surface, read, original = cls._join_tokens(parse["Ancillary"])
            print(" Ancillary: " + surface + "/" + read + " (" + original + ")")

            if parse["Link"]:
                surface, read, original = cls._join_tokens(parse["Link"])
                print(" Link: " + surface + "/" + read + " (" + original + ")")

    @staticmethod
    def _join_tokens(tokens):
        # Concatenate the surface forms, readings, and base forms of a token list.
        surface = "".join(t["surface"] for t in tokens)
        read = "".join(t["read"] for t in tokens)
        original = "".join(t["original"] for t in tokens)
        return surface, read, original