Пример #1
0
class KNP(object):
    """
    KNP を用いて構文解析を行うモジュールである.
    """

    def __init__(self, command='knp', option='-tab', rcfile='',
                 server=None, port=31000, timeout=30,
                 pattern=r'(?:^|\n)EOS($|\n)',
                 jumanrcfile='', juman_option='-e2 -B', juman_port=32000,
                 juman_command='juman', jumanpp=False):

        self.use_jumanpp = (juman_command == "jumanpp") or jumanpp
        assert 'EOS' in pattern
        self.pattern = pattern
        self.EOS = 'EOS'
        # tab形式しかパースしない
        assert '-tab' in option

        if rcfile and not os.path.isfile(os.path.expanduser(rcfile)):
            sys.stderr.write("Can't read rcfile (%s)!\n" % rcfile)
            quit(1)

        # Setup Juman(++)
        assert port != juman_port
        juman_args = {'option': juman_option, 'rcfile': jumanrcfile,
                      'server':server, 'port':juman_port}
        if self.use_jumanpp:
            self.juman = Jumanpp(**juman_args)
        else:
            self.juman = Juman(**juman_args)
        # Setup KNP
        if server is not None:
            self.socket = Socket(server, port, option=option, timeout=timeout)
            self.query = partial(self.socket.query, pattern=pattern)
        else:
            if rcfile:
                option += " -r {}".format(rcfile)
            self.subprocess = Subprocess(command, option=option)
            self.query = partial(self.subprocess.query, pattern=pattern)

    def parse_sentence(self, sentence):
        assert isinstance(sentence, str)
        juman_lines = self.juman.juman_lines(sentence)
        if self.EOS not in juman_lines:
            juman_lines += self.EOS
        return self.query(juman_lines)

    def knp(self, sentence):
        assert isinstance(sentence, str)
        result = BList(self.parse_sentence(sentence))
        return result

    def parse(self, sentence):
        """
        文字列 sentence を対象として構文解析を行い,構文解析結果オブジェクトを返す.
        """
        return self.knp(sentence)

    def result(self, input_str):
        return BList(input_str, self.EOS)
Пример #2
0
class KNP(object):
    """ KNPを用いて構文解析を行う/KNPの解析結果を読み取るモジュール

    Args:
        command (str): KNPコマンド
        option (str): KNP解析オプション 
                        (詳細解析結果を出力する-tabは必須。 
                        省略・照応解析を行う -anaphora, 格解析を行わず構文解析のみを行う -dpnd など)
        rcfile (str): KNP設定ファイルへのパス
        pattern (str): KNP出力の終端記号
        jumancommand (str): JUMANコマンド
        jumanrcfile (str): JUMAN設定ファイルへのパス
        jumanpp (bool): JUMAN++を用いるかJUMANを用いるか
    """
    def __init__(self,
                 command='knp',
                 server=None,
                 port=31000,
                 timeout=60,
                 option='-tab',
                 rcfile='',
                 pattern=r'EOS',
                 jumancommand='jumanpp',
                 jumanrcfile='',
                 jumanoption='',
                 jumanpp=True):
        self.command = command
        self.server = server
        self.port = port
        self.timeout = timeout
        self.options = option.split()
        self.rcfile = rcfile
        self.pattern = pattern
        self.socket = None
        self.subprocess = None
        self.jumanpp = jumanpp

        if self.rcfile and not os.path.isfile(os.path.expanduser(self.rcfile)):
            raise Exception("Can't read rcfile (%s)!" % self.rcfile)
        if distutils.spawn.find_executable(self.command) is None:
            raise Exception("Can't find KNP command: %s" % self.command)

        self.juman = Juman(command=jumancommand,
                           rcfile=jumanrcfile,
                           option=jumanoption,
                           jumanpp=self.jumanpp)

    def knp(self, sentence):
        """ parse関数と同じ """
        self.parse(sentence)

    def parse(self, sentence, juman_format=JUMAN_FORMAT.DEFAULT):
        """
        入力された文字列に対して形態素解析と構文解析を行い、文節列オブジェクトを返す

        Args:
            sentence (str): 文を表す文字列
            juman_format (JUMAN_FORMAT): Jumanのlattice出力形式

        Returns:
            BList: 文節列オブジェクト
        """
        assert (isinstance(sentence, six.text_type))
        juman_lines = self.juman.juman_lines(sentence)
        juman_str = "%s%s" % (juman_lines, self.pattern)
        return self.parse_juman_result(juman_str, juman_format)

    def parse_juman_result(self, juman_str, juman_format=JUMAN_FORMAT.DEFAULT):
        """
        JUMAN出力結果に対して構文解析を行い、文節列オブジェクトを返す

        Args:
            juman_str (str): ある文に関するJUMANの出力結果
            juman_format (JUMAN_FORMAT): Jumanのlattice出力形式

        Returns:
            BList: 文節列オブジェクト
        """
        if not self.socket and not self.subprocess:
            if self.server is not None:
                self.socket = Socket(self.server, self.port,
                                     "RUN -tab -normal\n")
            else:
                command = [self.command] + self.options
                if self.rcfile:
                    command.extend(['-r', self.rcfile])
                self.subprocess = Subprocess(command)

        if self.socket:
            knp_lines = self.socket.query(juman_str,
                                          pattern=r'^%s$' % self.pattern)
        else:
            knp_lines = self.subprocess.query(juman_str,
                                              pattern=r'^%s$' % self.pattern)
        return BList(knp_lines, self.pattern, juman_format)

    def reparse_knp_result(self, knp_str, juman_format=JUMAN_FORMAT.DEFAULT):
        """
        KNP出力結果に対してもう一度構文解析を行い、文節列オブジェクトを返す。
        KNPのfeatureを再付与する場合などに用いる。中身はparse_juman_result関数と同じ。

        Args:
            knp_str (str): ある文に関するKNPの出力結果
            juman_format (JUMAN_FORMAT): Jumanのlattice出力形式

        Returns:
            BList: 文節列オブジェクト
        """
        return self.parse_juman_result(knp_str, juman_format=juman_format)

    def result(self, input_str, juman_format=JUMAN_FORMAT.DEFAULT):
        """
        ある文に関するKNP解析結果を文節列オブジェクトに変換する

        Args:
            input_str (str): ある文に関するKNPの出力結果
            juman_format (JUMAN_FORMAT): Jumanのlattice出力形式

        Returns:
            BList: 文節列オブジェクト
        """
        return BList(input_str, self.pattern, juman_format)
Пример #3
0
class Tokenizer(SerializationMixin):
    """Juman tokenizer

    Note:
        `spacy.Token._.fstring` is set. The Juman's output is stored into it during tokenizing.
    """

    serialization_fields = ["preprocessor", "juman_kwargs"]
    key_fstring = KEY_FSTRING

    @classmethod
    def install_extensions(cls):
        """See https://github.com/explosion/spacy-pytorch-transformers#extension-attributes."""
        Token.set_extension(cls.key_fstring, default=None, force=True)

    def __init__(
        self,
        cls: Type["Defaults"],
        nlp: Optional[Language] = None,
        juman_kwargs: Optional[Dict[str, str]] = None,
        preprocessor: Optional[Callable[[str], str]] = han_to_zen_normalize,
    ):
        """

        Args:
            juman_kwargs: passed to `pyknp.Juman.__init__`
            preprocessor: applied to text before tokenizing. `mojimoji.han_to_zen` is often used.
        """
        from pyknp import Juman

        juman_kwargs = juman_kwargs or {}
        default_command = get_juman_command()
        assert default_command
        juman_kwargs.setdefault("command", default_command)

        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
        self.tokenizer = Juman(**juman_kwargs) if juman_kwargs else Juman()
        self.juman_kwargs = juman_kwargs
        self.preprocessor = preprocessor

    def reset_tokenizer(self):
        from pyknp import Juman

        self.tokenizer = Juman(
            **self.juman_kwargs) if self.juman_kwargs else Juman()

    def __call__(self, text: str) -> Doc:
        """Make doc from text. Juman's `fstring` is stored in `Token._.fstring`"""
        if self.preprocessor:
            text = self.preprocessor(text)
        juman_lines = self._juman_string(text)
        dtokens = self._detailed_tokens(juman_lines)
        doc = self._dtokens_to_doc(dtokens)
        doc.user_data[JUMAN_LINES] = juman_lines
        return doc

    def _juman_string(self, text: str) -> str:
        try:
            texts = _split_text_for_juman(text)
            lines: str = "".join(
                itertools.chain.from_iterable(
                    self.tokenizer.juman_lines(text) for text in texts))
        except BrokenPipeError:
            # Juman is sometimes broken due to its subprocess management.
            self.reset_tokenizer()
            lines = self.tokenizer.juman_lines(text)
        return lines

    def _dtokens_to_doc(self, dtokens: List[ShortUnitWord]) -> Doc:
        words = [x.surface for x in dtokens]
        spaces = [x.space for x in dtokens]
        doc = Doc(self.vocab, words=words, spaces=spaces)
        for token, dtoken in zip(doc, dtokens):
            token.lemma_ = dtoken.lemma
            token.tag_ = dtoken.pos
            token._.set(self.key_fstring, dtoken.fstring)
        doc.is_tagged = True
        return doc

    def _detailed_tokens(self, juman_lines: str) -> List[ShortUnitWord]:
        """Tokenize text with Juman and format the outputs for further processing"""
        from pyknp import MList

        ml = MList(juman_lines).mrph_list()
        words: List[ShortUnitWord] = []
        for m in ml:
            surface = m.midasi
            pos = m.hinsi + "," + m.bunrui
            lemma = m.genkei or surface
            words.append(ShortUnitWord(surface, lemma, pos, m.fstring, False))
        return words
Пример #4
0
from pyknp import KNP, Juman

line = "猫が好き。犬も好き"

jumanpp = Juman()
knp = KNP()

juman_line = jumanpp.juman_lines(line)
result = knp.parse(line)

print("基本句")
for tag in result.tag_list():  # 各基本句へのアクセス
    print("\t見出し:%s, 素性:%s" %
          ("".join(mrph.midasi for mrph in tag.mrph_list()), tag.fstring))
Пример #5
0
class KNP(object):
    """
    KNP を用いて構文解析を行うモジュールである.
    """
    def __init__(self,
                 command='knp',
                 server=None,
                 port=31000,
                 timeout=60,
                 option='-tab',
                 rcfile='',
                 pattern=r'EOS',
                 jumancommand='juman',
                 jumanrcfile='',
                 jumanpp=False):
        self.command = command
        self.server = server
        self.port = port
        self.timeout = timeout
        self.option = option
        self.rcfile = rcfile
        self.pattern = pattern
        self.socket = None
        self.subprocess = None
        self.jumanpp = (jumancommand == "jumanpp") or jumanpp

        if self.rcfile and not os.path.isfile(os.path.expanduser(self.rcfile)):
            sys.stderr.write("Can't read rcfile (%s)!\n" % self.rcfile)
            quit(1)

        if (self.jumanpp):
            self.juman = Jumanpp()
        else:
            self.juman = Juman(command=jumancommand, rcfile=jumanrcfile)

    def knp(self, sentence):
        self.parse(sentence)

    def parse(self, sentence):
        """
        文字列 sentence を対象として構文解析を行い,構文解析結果オブジェクトを返す.
        """
        assert (isinstance(sentence, six.text_type))
        juman_lines = self.juman.juman_lines(sentence)
        juman_str = "%s%s" % (juman_lines, self.pattern)
        if not self.socket and not self.subprocess:
            if self.server is not None:
                self.socket = Socket(self.server, self.port,
                                     "RUN -tab -normal\n")
            else:
                command = "%s %s" % (self.command, self.option)
                if self.rcfile:
                    command += " -r %s" % self.rcfile
                self.subprocess = Subprocess(command)

        if self.socket:
            knp_lines = self.socket.query(juman_str, pattern=self.pattern)
        else:
            knp_lines = self.subprocess.query(juman_str, pattern=self.pattern)
        return BList(knp_lines, self.pattern)

    def result(self, input_str):
        return BList(input_str, self.pattern)
Пример #6
0
class KNP(object):
    """ KNPを用いて構文解析を行う/KNPの解析結果を読み取るモジュール

    Args:
        command (str): KNPコマンド
        option (str): KNP解析オプション 
                        (詳細解析結果を出力する-tabは必須。 
                        省略・照応解析を行う -anaphora, 格解析を行わず構文解析のみを行う -dpnd など)
        rcfile (str): KNP設定ファイルへのパス
        pattern (str): KNP出力の終端記号
        jumancommand (str): JUMANコマンド
        jumanrcfile (str): JUMAN設定ファイルへのパス
        jumanpp (bool): JUMAN++を用いるかJUMANを用いるか
    """
    def __init__(self,
                 command='knp',
                 server=None,
                 port=31000,
                 timeout=60,
                 option='-tab',
                 rcfile='',
                 pattern=r'EOS',
                 jumancommand='jumanpp',
                 jumanrcfile='',
                 jumanpp=True):
        self.command = command
        self.server = server
        self.port = port
        self.timeout = timeout
        self.option = option
        self.rcfile = rcfile
        self.pattern = pattern
        self.socket = None
        self.subprocess = None
        self.jumanpp = jumanpp

        if self.rcfile and not os.path.isfile(os.path.expanduser(self.rcfile)):
            raise Exception("Can't read rcfile (%s)!" % self.rcfile)
        if distutils.spawn.find_executable(command) is None:
            raise Exception("Can't find KNP command: %s" % command)

        self.juman = Juman(command=jumancommand,
                           rcfile=jumanrcfile,
                           jumanpp=self.jumanpp)

    def knp(self, sentence):
        """ parse関数と同じ """
        self.parse(sentence)

    def parse(self, sentence):
        """
        文字列を入力として構文解析を行い、文節列オブジェクトを返す

        Args:
            sentence (str): 文を表す文字列

        Returns:
            BList: 文節列オブジェクト
        """
        assert (isinstance(sentence, six.text_type))
        juman_lines = self.juman.juman_lines(sentence)
        juman_str = "%s%s" % (juman_lines, self.pattern)
        if not self.socket and not self.subprocess:
            if self.server is not None:
                self.socket = Socket(self.server, self.port,
                                     "RUN -tab -normal\n")
            else:
                command = "%s %s" % (self.command, self.option)
                if self.rcfile:
                    command += " -r %s" % self.rcfile
                self.subprocess = Subprocess(command)

        if self.socket:
            knp_lines = self.socket.query(juman_str,
                                          pattern=r'^%s$' % (self.pattern))
        else:
            knp_lines = self.subprocess.query(juman_str,
                                              pattern=r'^%s$' % (self.pattern))
        return BList(knp_lines, self.pattern)

    def result(self, input_str):
        """
        ある文に関するKNP解析結果を文節列オブジェクトに変換する

        Args:
            input_str (str): ある文に関するKNPの出力結果

        Returns:
            BList: 文節列オブジェクト
        """
        return BList(input_str, self.pattern)
Пример #7
0
class KNP(object):
    """ KNPを用いて構文解析を行う/KNPの解析結果を読み取るモジュール

    Args:
        command (str): KNPコマンド
        option (str): KNP解析オプション 
                        (詳細解析結果を出力する-tabは必須。 
                        省略・照応解析を行う -anaphora, 格解析を行わず構文解析のみを行う -dpnd など)
        rcfile (str): KNP設定ファイルへのパス
        pattern (str): KNP出力の終端記号
        jumancommand (str): JUMANコマンド
        jumanrcfile (str): JUMAN設定ファイルへのパス
        jumanpp (bool): JUMAN++を用いるかJUMANを用いるか
    """

    def __init__(self, command='knp', server=None, port=31000, timeout=60,
                 option='-tab', rcfile='', pattern=r'EOS',
                 jumancommand='jumanpp', jumanrcfile='', jumanpp=True):
        self.command = command
        self.server = server
        self.port = port
        self.timeout = timeout
        self.option = option.split()
        self.rcfile = rcfile
        self.pattern = pattern
        self.socket = None
        self.subprocess = None
        self.jumanpp = jumanpp

        if self.rcfile and not os.path.isfile(os.path.expanduser(self.rcfile)):
            raise Exception("Can't read rcfile (%s)!" % self.rcfile)
        if distutils.spawn.find_executable(self.command) is None:
            raise Exception("Can't find KNP command: %s" % self.command)

        self.juman = Juman(command=jumancommand, rcfile=jumanrcfile, jumanpp=self.jumanpp)

    def knp(self, sentence):
        """ parse関数と同じ """
        self.parse(sentence)

    def parse(self, sentence, juman_format=JUMAN_FORMAT.DEFAULT):
        """
        文字列を入力として構文解析を行い、文節列オブジェクトを返す

        Args:
            sentence (str): 文を表す文字列
            juman_format (JUMAN_FORMAT): Jumanのlattice出力形式

        Returns:
            BList: 文節列オブジェクト
        """
        assert(isinstance(sentence, six.text_type))
        juman_lines = self.juman.juman_lines(sentence)
        juman_str = "%s%s" % (juman_lines, self.pattern)
        if not self.socket and not self.subprocess:
            if self.server is not None:
                self.socket = Socket(
                    self.server, self.port, "RUN -tab -normal\n")
            else:
                command = [self.command] + self.option
                if self.rcfile:
                    command.extend(['-r', self.rcfile])
                self.subprocess = Subprocess(command)

        if self.socket:
            knp_lines = self.socket.query(juman_str, pattern=r'^%s$'%(self.pattern))
        else:
            knp_lines = self.subprocess.query(juman_str, pattern=r'^%s$'%(self.pattern))
        return BList(knp_lines, self.pattern, juman_format)

    def result(self, input_str, juman_format=JUMAN_FORMAT.DEFAULT):
        """
        ある文に関するKNP解析結果を文節列オブジェクトに変換する

        Args:
            input_str (str): ある文に関するKNPの出力結果
            juman_format (JUMAN_FORMAT): Jumanのlattice出力形式

        Returns:
            BList: 文節列オブジェクト
        """
        return BList(input_str, self.pattern, juman_format)
Пример #8
0
class Tokenizer(SerializationMixin):
    """Juman tokenizer

    Note:
        `spacy.Token._.fstring` is set. The Juman's output is stored into it during tokenizing.
    """

    serialization_fields = ["preprocessor", "juman_kwargs"]
    KEY_FSTRING = "juman_fstring"

    @classmethod
    def get_juman_fstring(cls, e: UserDataProto) -> str:
        if cls.KEY_FSTRING not in e.user_data:
            raise ValueError(f"{cls.KEY_FSTRING} is not set in {e}")
        return e.user_data[cls.KEY_FSTRING]

    @classmethod
    def set_juman_fstring(cls, e: UserDataProto, fstring: str):
        e.user_data[cls.KEY_FSTRING] = fstring

    def __init__(
        self,
        juman_kwargs: Optional[Dict[str, Any]] = None,
        preprocessor: Optional[Callable[[str], str]] = han_to_zen_normalize,
    ):
        """

        Args:
            juman_kwargs: passed to `pyknp.Juman.__init__`
            preprocessor: applied to text before tokenizing. `mojimoji.han_to_zen` is often used.
        """

        juman_kwargs = juman_kwargs or {}
        default_command = get_juman_command()
        assert default_command
        juman_kwargs.setdefault("command", default_command)

        self.juman_kwargs = juman_kwargs
        self.preprocessor = preprocessor
        self.set_tokenizer()

    def set_tokenizer(self):
        from pyknp import Juman

        self.tokenizer = Juman(
            **self.juman_kwargs) if self.juman_kwargs else Juman()

    def __call__(self, text: str) -> Doc:
        """Make doc from text. Juman's `fstring` is stored in `Token._.fstring`"""
        if self.preprocessor:
            text = self.preprocessor(text)
        juman_lines = self._juman_parse(text)
        dtokens = self._detailed_tokens(juman_lines)
        doc = self._dtokens_to_doc(dtokens)
        self.set_juman_fstring(doc, juman_lines)
        return doc

    def _juman_parse(self, text: str) -> str:
        texts = _split_text_for_juman(text)
        while True:
            try:
                lines: str = "".join(
                    itertools.chain.from_iterable(
                        self.tokenizer.juman_lines(text)
                        for text in texts  # type: ignore
                    ))
                break
            except BrokenPipeError:
                # Juman is sometimes broken due to its subprocess management.
                self.set_tokenizer()
        return lines

    def _dtokens_to_doc(self, dtokens: List[ShortUnitWord]) -> Doc:
        words = [x.surface + x.space for x in dtokens]
        doc = Doc.from_words(words)
        for token, dtoken in zip(doc, dtokens):
            token.tag_ = dtoken.pos
            token.lemma_ = dtoken.lemma
            self.set_juman_fstring(token, dtoken.fstring)
        return doc

    def _detailed_tokens(self, juman_lines: str) -> List[ShortUnitWord]:
        """Tokenize text with Juman and format the outputs for further processing"""
        from pyknp import MList, Morpheme  # type: ignore

        ml: List[Morpheme] = MList(juman_lines).mrph_list()
        words: List[ShortUnitWord] = []
        for m in ml:
            surface: str = m.midasi  # type: ignore
            pos: str = m.hinsi + "," + m.bunrui  # type: ignore
            lemma: str = m.genkei or surface  # type: ignore
            words.append(ShortUnitWord(surface, lemma, pos, m.fstring,
                                       ""))  # type: ignore
        return words