def parse_juman_result(self, juman_str, juman_format=JUMAN_FORMAT.DEFAULT):
    """Run KNP on JUMAN output and return the resulting bunsetsu list.

    The KNP backend (socket or subprocess) is created on first use and
    kept on the instance for subsequent calls.

    Args:
        juman_str (str): JUMAN output for one sentence.
        juman_format (JUMAN_FORMAT): JUMAN lattice output format.

    Returns:
        BList: bunsetsu list object built from the KNP output.
    """
    if not (self.socket or self.subprocess):
        # Nothing is running yet: a configured server takes priority over
        # launching a local KNP process.
        if self.server is None:
            argv = [self.command] + self.options
            if self.rcfile:
                argv += ['-r', self.rcfile]
            self.subprocess = Subprocess(argv)
        else:
            self.socket = Socket(self.server, self.port, "RUN -tab -normal\n")

    backend = self.socket if self.socket else self.subprocess
    terminator = r'^%s$' % self.pattern
    knp_lines = backend.query(juman_str, pattern=terminator)
    return BList(knp_lines, self.pattern, juman_format)
def parse(self, sentence):
    """Parse a sentence string and return the resulting bunsetsu list.

    Args:
        sentence (str): text of the sentence; must be ``six.text_type``.

    Returns:
        BList: bunsetsu list object.
    """
    assert isinstance(sentence, six.text_type)
    # Morphological analysis first; the end marker is appended so KNP
    # receives a terminated sentence.
    juman_str = "%s%s" % (self.juman.juman_lines(sentence), self.pattern)

    if not (self.socket or self.subprocess):
        # Start the KNP backend lazily on the first call.
        if self.server is None:
            argv = [self.command] + self.option
            if self.rcfile:
                argv += ['-r', self.rcfile]
            self.subprocess = Subprocess(argv)
        else:
            self.socket = Socket(self.server, self.port, "RUN -tab -normal\n")

    backend = self.socket if self.socket else self.subprocess
    knp_lines = backend.query(juman_str, pattern=r'^%s$' % (self.pattern))
    return BList(knp_lines, self.pattern)
def __init__(self, command='knp', option='-tab', rcfile='', server=None, port=31000, timeout=30,
             pattern=r'(?:^|\n)EOS($|\n)', jumanrcfile='', juman_option='-e2 -B', juman_port=32000,
             juman_command='juman', jumanpp=False):
    """Set up the morphological analyzer and the KNP backend.

    Args:
        command (str): KNP command, handed to ``Subprocess`` when no server is given.
        option (str): KNP options; must contain ``-tab``.
        rcfile (str): path to a KNP rc file; appended to the options as ``-r``
            when running as a subprocess.
        server: KNP server; when not None a ``Socket`` is used instead of a subprocess.
            The same value is also passed to the JUMAN wrapper.
        port (int): KNP server port; must differ from ``juman_port``.
        timeout (int): timeout passed to ``Socket``.
        pattern (str): end-of-output pattern; must contain ``EOS``.
        jumanrcfile (str): rc file passed to the JUMAN wrapper.
        juman_option (str): options passed to the JUMAN wrapper.
        juman_port (int): port passed to the JUMAN wrapper.
        juman_command (str): only compared against ``"jumanpp"`` to choose the wrapper class.
        jumanpp (bool): force use of ``Jumanpp``.

    Raises:
        AssertionError: if ``pattern`` lacks ``EOS``, ``option`` lacks ``-tab``,
            or ``port == juman_port``.
        SystemExit: with code 1 if ``rcfile`` is given but is not an existing file.
    """
    self.use_jumanpp = (juman_command == "jumanpp") or jumanpp
    assert 'EOS' in pattern
    self.pattern = pattern
    self.EOS = 'EOS'
    # Only the tab format can be parsed.
    assert '-tab' in option
    if rcfile and not os.path.isfile(os.path.expanduser(rcfile)):
        sys.stderr.write("Can't read rcfile (%s)!\n" % rcfile)
        # sys.exit rather than the interactive-only ``quit`` helper, which
        # is missing under ``python -S`` and closes stdin as a side effect.
        sys.exit(1)

    # Setup Juman(++)
    assert port != juman_port
    juman_args = {'option': juman_option, 'rcfile': jumanrcfile,
                  'server': server, 'port': juman_port}
    if self.use_jumanpp:
        self.juman = Jumanpp(**juman_args)
    else:
        self.juman = Juman(**juman_args)

    # Setup KNP
    if server is not None:
        self.socket = Socket(server, port, option=option, timeout=timeout)
        self.query = partial(self.socket.query, pattern=pattern)
    else:
        if rcfile:
            option += " -r {}".format(rcfile)
        self.subprocess = Subprocess(command, option=option)
        self.query = partial(self.subprocess.query, pattern=pattern)
def parse(self, sentence, juman_format=JUMAN_FORMAT.DEFAULT):
    """Parse a sentence string and return the resulting bunsetsu list.

    Args:
        sentence (str): text of the sentence; must be ``six.text_type``.
        juman_format (JUMAN_FORMAT): JUMAN lattice output format.

    Returns:
        BList: bunsetsu list object.
    """
    assert isinstance(sentence, six.text_type)
    morph_output = self.juman.juman_lines(sentence)
    juman_str = "%s%s" % (morph_output, self.pattern)

    backend_ready = self.socket or self.subprocess
    if not backend_ready:
        # First call: connect to the server if one is configured,
        # otherwise spawn a local KNP process.
        if self.server is None:
            argv = [self.command] + self.option
            if self.rcfile:
                argv += ['-r', self.rcfile]
            self.subprocess = Subprocess(argv)
        else:
            self.socket = Socket(self.server, self.port, "RUN -tab -normal\n")

    channel = self.socket if self.socket else self.subprocess
    knp_lines = channel.query(juman_str, pattern=r'^%s$' % (self.pattern))
    return BList(knp_lines, self.pattern, juman_format)
def parse(self, sentence):
    """Parse the string ``sentence`` and return a parse result object.

    Args:
        sentence (str): text to analyze; must be ``six.text_type``.

    Returns:
        BList: built from the KNP output and ``self.pattern``.
    """
    assert isinstance(sentence, six.text_type)
    juman_str = "%s%s" % (self.juman.juman_lines(sentence), self.pattern)

    if not (self.socket or self.subprocess):
        # Start the backend lazily on first use.
        if self.server is None:
            # This variant builds the command as a single string.
            cmdline = "%s %s" % (self.command, self.option)
            if self.rcfile:
                cmdline = cmdline + " -r %s" % self.rcfile
            self.subprocess = Subprocess(cmdline)
        else:
            self.socket = Socket(self.server, self.port, "RUN -tab -normal\n")

    channel = self.socket if self.socket else self.subprocess
    knp_lines = channel.query(juman_str, pattern=self.pattern)
    return BList(knp_lines, self.pattern)
def juman_lines(self, input_str):
    """Run morphological analysis on a string and return the raw output.

    Newlines in the input are removed before analysis (a notice is
    printed to stderr). The backend is created on first use.

    Args:
        input_str (str): text of the sentence.

    Returns:
        str: output returned by the backend's ``query``.
    """
    if '\n' in input_str:
        print('Analysis is done ignoring "\\n".', file=sys.stderr)
        input_str = input_str.replace('\n', '')

    if not (self.socket or self.subprocess):
        if self.server is None:
            argv = [self.command] + self.option
            # The rc file is not passed along when the command is jumanpp.
            if 'jumanpp' not in self.command and self.rcfile:
                argv += ['-r', self.rcfile]
            self.subprocess = Subprocess(argv)
        else:
            self.socket = Socket(self.server, self.port, "RUN -e2\n")

    backend = self.socket if self.socket else self.subprocess
    return backend.query(input_str, pattern=self.pattern)
def __init__(self, command='jumanpp', option='-e2 -B', rcfile='', server=None, port=32000, timeout=30,
             pattern=r'(?:^|\n)EOS($|\n)'):
    """Create the analyzer backend and bind ``self.query`` to it.

    Args:
        command (str): command handed to ``Subprocess`` when no server is given.
        option (str): analyzer options, passed to ``Socket`` or ``Subprocess``.
        rcfile (str): path to an rc file; appended to the options as ``-r``
            when running as a subprocess.
        server: server to connect to; when not None a ``Socket`` is used.
        port (int): server port.
        timeout (int): timeout passed to ``Socket``.
        pattern (str): end-of-output pattern bound into ``self.query``.

    Raises:
        SystemExit: with code 1 if ``rcfile`` is given but is not an existing file.
    """
    if rcfile and not os.path.isfile(os.path.expanduser(rcfile)):
        sys.stderr.write("Can't read rcfile (%s)!\n" % rcfile)
        # sys.exit rather than the interactive-only ``quit`` helper, which
        # is missing under ``python -S`` and closes stdin as a side effect.
        sys.exit(1)

    if server is not None:
        self.socket = Socket(server, port, option=option, timeout=timeout)
        self.query = partial(self.socket.query, pattern=pattern)
    else:
        if rcfile:
            # lstrip drops the leading space left when ``option`` is empty.
            option = "{} -r {}".format(option, rcfile).lstrip()
        self.subprocess = Subprocess(command, option=option)
        self.query = partial(self.subprocess.query, pattern=pattern)
class KNP(object):
    """Run KNP for syntactic parsing / read KNP analysis results.

    Args:
        command (str): KNP command.
        option (str): KNP analysis options (``-tab``, which outputs detailed
            results, is required; e.g. ``-anaphora`` for ellipsis/anaphora
            resolution, ``-dpnd`` for dependency parsing without case analysis).
        rcfile (str): path to the KNP configuration file.
        pattern (str): terminator symbol of the KNP output.
        jumancommand (str): JUMAN command.
        jumanrcfile (str): path to the JUMAN configuration file.
        jumanpp (bool): whether to use JUMAN++ or JUMAN.
    """

    def __init__(self, command='knp', server=None, port=31000, timeout=60, option='-tab', rcfile='',
                 pattern=r'EOS', jumancommand='jumanpp', jumanrcfile='', jumanoption='', jumanpp=True):
        self.command = command
        self.server = server
        self.port = port
        self.timeout = timeout
        self.options = option.split()
        self.rcfile = rcfile
        self.pattern = pattern
        # Backends are created lazily by parse_juman_result.
        self.socket = None
        self.subprocess = None
        self.jumanpp = jumanpp

        if self.rcfile and not os.path.isfile(os.path.expanduser(self.rcfile)):
            raise Exception("Can't read rcfile (%s)!" % self.rcfile)
        if distutils.spawn.find_executable(self.command) is None:
            raise Exception("Can't find KNP command: %s" % self.command)

        self.juman = Juman(command=jumancommand, rcfile=jumanrcfile,
                           option=jumanoption, jumanpp=self.jumanpp)

    def knp(self, sentence):
        """Same as :meth:`parse`.

        Returns:
            BList: the value returned by ``parse`` (previously discarded,
            so this method always returned None).
        """
        return self.parse(sentence)

    def parse(self, sentence, juman_format=JUMAN_FORMAT.DEFAULT):
        """Run morphological analysis and parsing on a string.

        Args:
            sentence (str): text of the sentence; must be ``six.text_type``.
            juman_format (JUMAN_FORMAT): JUMAN lattice output format.

        Returns:
            BList: bunsetsu list object.
        """
        assert (isinstance(sentence, six.text_type))
        juman_lines = self.juman.juman_lines(sentence)
        juman_str = "%s%s" % (juman_lines, self.pattern)
        return self.parse_juman_result(juman_str, juman_format)

    def parse_juman_result(self, juman_str, juman_format=JUMAN_FORMAT.DEFAULT):
        """Run KNP on JUMAN output and return the bunsetsu list.

        Args:
            juman_str (str): JUMAN output for one sentence.
            juman_format (JUMAN_FORMAT): JUMAN lattice output format.

        Returns:
            BList: bunsetsu list object.
        """
        if not self.socket and not self.subprocess:
            # Start the backend on first use.
            if self.server is not None:
                self.socket = Socket(self.server, self.port, "RUN -tab -normal\n")
            else:
                command = [self.command] + self.options
                if self.rcfile:
                    command.extend(['-r', self.rcfile])
                self.subprocess = Subprocess(command)

        if self.socket:
            knp_lines = self.socket.query(juman_str, pattern=r'^%s$' % self.pattern)
        else:
            knp_lines = self.subprocess.query(juman_str, pattern=r'^%s$' % self.pattern)
        return BList(knp_lines, self.pattern, juman_format)

    def reparse_knp_result(self, knp_str, juman_format=JUMAN_FORMAT.DEFAULT):
        """Run KNP again on KNP output and return the bunsetsu list.

        Used, for example, to re-assign KNP features. The implementation is
        the same as :meth:`parse_juman_result`.

        Args:
            knp_str (str): KNP output for one sentence.
            juman_format (JUMAN_FORMAT): JUMAN lattice output format.

        Returns:
            BList: bunsetsu list object.
        """
        return self.parse_juman_result(knp_str, juman_format=juman_format)

    def result(self, input_str, juman_format=JUMAN_FORMAT.DEFAULT):
        """Convert the KNP analysis result of a sentence into a bunsetsu list.

        Args:
            input_str (str): KNP output for one sentence.
            juman_format (JUMAN_FORMAT): JUMAN lattice output format.

        Returns:
            BList: bunsetsu list object.
        """
        return BList(input_str, self.pattern, juman_format)
class Juman(object):
    """Wrapper for using the morphological analyzer JUMAN from Python.

    Args:
        command (str): JUMAN executable command.
        option (str): JUMAN analysis options (lattice format ``-s``,
            beam width ``--beam <int>``).
        rcfile (str): path to the JUMAN configuration file.
        pattern (str): terminator symbol of the JUMAN output.
        jumanpp (bool): whether to use JUMAN++ or JUMAN. Ignored when
            ``command`` is specified.
    """

    def __init__(self, command='jumanpp', server=None, port=32000, timeout=30,
                 option='', rcfile='', ignorepattern='', pattern=r'^EOS$', jumanpp=True):
        option_list = option.split()
        if not jumanpp and command == 'jumanpp':
            # JUMAN++ was declined and no other command was chosen:
            # fall back to plain juman with its default flags.
            self.command = 'juman'
            self.option = option_list + ['-e2', '-B']
        else:
            self.command = command
            self.option = option_list

        self.server = server
        self.port = port
        self.timeout = timeout
        self.rcfile = rcfile
        self.ignorepattern = ignorepattern
        self.pattern = pattern
        # Backends are created lazily by juman_lines.
        self.socket = None
        self.subprocess = None

        rcfile_missing = bool(self.rcfile) and not os.path.isfile(os.path.expanduser(self.rcfile))
        if rcfile_missing:
            raise Exception("Can't read rcfile (%s)!" % self.rcfile)
        if distutils.spawn.find_executable(self.command) is None:
            raise Exception("Can't find JUMAN command: %s" % self.command)

    def juman_lines(self, input_str):
        """Run morphological analysis on a string and return the raw output.

        Newlines in the input are removed before analysis (a notice is
        printed to stderr).

        Args:
            input_str (str): text of the sentence.

        Returns:
            str: JUMAN output.
        """
        if '\n' in input_str:
            print('Analysis is done ignoring "\\n".', file=sys.stderr)
            input_str = input_str.replace('\n', '')

        if not (self.socket or self.subprocess):
            if self.server is None:
                argv = [self.command] + self.option
                # The rc file is not passed along when the command is jumanpp.
                if 'jumanpp' not in self.command and self.rcfile:
                    argv += ['-r', self.rcfile]
                self.subprocess = Subprocess(argv)
            else:
                self.socket = Socket(self.server, self.port, "RUN -e2\n")

        backend = self.socket if self.socket else self.subprocess
        return backend.query(input_str, pattern=self.pattern)

    def juman(self, input_str, juman_format=JUMAN_FORMAT.DEFAULT):
        """Same as :meth:`analysis`."""
        assert isinstance(input_str, six.text_type)
        return MList(self.juman_lines(input_str), juman_format)

    def analysis(self, input_str, juman_format=JUMAN_FORMAT.DEFAULT):
        """Analyze a string and return the result as an MList.

        Args:
            input_str (str): text of the sentence.
            juman_format (JUMAN_FORMAT): JUMAN lattice output format.

        Returns:
            MList: morpheme list object.
        """
        return self.juman(input_str, juman_format)

    def result(self, input_str, juman_format=JUMAN_FORMAT.DEFAULT):
        """Wrap existing JUMAN output in an MList.

        Args:
            input_str (str): JUMAN output.
            juman_format (JUMAN_FORMAT): JUMAN lattice output format.

        Returns:
            MList: morpheme list object.
        """
        return MList(input_str, juman_format)
class KNP(object):
    """Module that performs syntactic parsing using KNP."""

    def __init__(self, command='knp', server=None, port=31000, timeout=60,
                 option='-tab', rcfile='', pattern=r'EOS',
                 jumancommand='juman', jumanrcfile='', jumanpp=False):
        """Store the configuration and create the morphological analyzer.

        Raises:
            SystemExit: with code 1 if ``rcfile`` is given but is not an existing file.
        """
        self.command = command
        self.server = server
        self.port = port
        self.timeout = timeout
        self.option = option
        self.rcfile = rcfile
        self.pattern = pattern
        # Backends are created lazily by parse.
        self.socket = None
        self.subprocess = None
        self.jumanpp = (jumancommand == "jumanpp") or jumanpp

        if self.rcfile and not os.path.isfile(os.path.expanduser(self.rcfile)):
            sys.stderr.write("Can't read rcfile (%s)!\n" % self.rcfile)
            # sys.exit rather than the interactive-only ``quit`` helper.
            sys.exit(1)

        if self.jumanpp:
            self.juman = Jumanpp()
        else:
            self.juman = Juman(command=jumancommand, rcfile=jumanrcfile)

    def knp(self, sentence):
        """Alias for :meth:`parse`; returns its result.

        Previously the result of ``parse`` was discarded and None returned.
        """
        return self.parse(sentence)

    def parse(self, sentence):
        """Parse the string ``sentence`` and return a parse result object.

        Args:
            sentence (str): text to analyze; must be ``six.text_type``.

        Returns:
            BList: built from the KNP output and ``self.pattern``.
        """
        assert (isinstance(sentence, six.text_type))
        juman_lines = self.juman.juman_lines(sentence)
        juman_str = "%s%s" % (juman_lines, self.pattern)

        if not self.socket and not self.subprocess:
            # Start the backend on first use.
            if self.server is not None:
                self.socket = Socket(self.server, self.port, "RUN -tab -normal\n")
            else:
                command = "%s %s" % (self.command, self.option)
                if self.rcfile:
                    command += " -r %s" % self.rcfile
                self.subprocess = Subprocess(command)

        if self.socket:
            knp_lines = self.socket.query(juman_str, pattern=self.pattern)
        else:
            knp_lines = self.subprocess.query(juman_str, pattern=self.pattern)
        return BList(knp_lines, self.pattern)

    def result(self, input_str):
        """Build a BList from existing KNP output ``input_str``."""
        return BList(input_str, self.pattern)
class KNP(object):
    """Run KNP for syntactic parsing / read KNP analysis results.

    Args:
        command (str): KNP command.
        option (str): KNP analysis options (``-tab``, which outputs detailed
            results, is required; e.g. ``-anaphora`` for ellipsis/anaphora
            resolution, ``-dpnd`` for dependency parsing without case analysis).
        rcfile (str): path to the KNP configuration file.
        pattern (str): terminator symbol of the KNP output.
        jumancommand (str): JUMAN command.
        jumanrcfile (str): path to the JUMAN configuration file.
        jumanpp (bool): whether to use JUMAN++ or JUMAN.
    """

    def __init__(self, command='knp', server=None, port=31000, timeout=60, option='-tab', rcfile='',
                 pattern=r'EOS', jumancommand='jumanpp', jumanrcfile='', jumanpp=True):
        self.command = command
        self.server = server
        self.port = port
        self.timeout = timeout
        self.option = option
        self.rcfile = rcfile
        self.pattern = pattern
        # Backends are created lazily by parse.
        self.socket = None
        self.subprocess = None
        self.jumanpp = jumanpp

        if self.rcfile and not os.path.isfile(os.path.expanduser(self.rcfile)):
            raise Exception("Can't read rcfile (%s)!" % self.rcfile)
        if distutils.spawn.find_executable(command) is None:
            raise Exception("Can't find KNP command: %s" % command)

        self.juman = Juman(command=jumancommand, rcfile=jumanrcfile, jumanpp=self.jumanpp)

    def knp(self, sentence):
        """Same as :meth:`parse`.

        Returns:
            BList: the value returned by ``parse`` (previously discarded,
            so this method always returned None).
        """
        return self.parse(sentence)

    def parse(self, sentence):
        """Parse a sentence string and return the bunsetsu list.

        Args:
            sentence (str): text of the sentence; must be ``six.text_type``.

        Returns:
            BList: bunsetsu list object.
        """
        assert (isinstance(sentence, six.text_type))
        juman_lines = self.juman.juman_lines(sentence)
        juman_str = "%s%s" % (juman_lines, self.pattern)

        if not self.socket and not self.subprocess:
            # Start the backend on first use.
            if self.server is not None:
                self.socket = Socket(self.server, self.port, "RUN -tab -normal\n")
            else:
                command = "%s %s" % (self.command, self.option)
                if self.rcfile:
                    command += " -r %s" % self.rcfile
                self.subprocess = Subprocess(command)

        if self.socket:
            knp_lines = self.socket.query(juman_str, pattern=r'^%s$' % (self.pattern))
        else:
            knp_lines = self.subprocess.query(juman_str, pattern=r'^%s$' % (self.pattern))
        return BList(knp_lines, self.pattern)

    def result(self, input_str):
        """Convert the KNP analysis result of a sentence into a bunsetsu list.

        Args:
            input_str (str): KNP output for one sentence.

        Returns:
            BList: bunsetsu list object.
        """
        return BList(input_str, self.pattern)
class KNP(object):
    """Run KNP for syntactic parsing / read KNP analysis results.

    Args:
        command (str): KNP command.
        option (str): KNP analysis options (``-tab``, which outputs detailed
            results, is required; e.g. ``-anaphora`` for ellipsis/anaphora
            resolution, ``-dpnd`` for dependency parsing without case analysis).
        rcfile (str): path to the KNP configuration file.
        pattern (str): terminator symbol of the KNP output.
        jumancommand (str): JUMAN command.
        jumanrcfile (str): path to the JUMAN configuration file.
        jumanpp (bool): whether to use JUMAN++ or JUMAN.
    """

    def __init__(self, command='knp', server=None, port=31000, timeout=60, option='-tab', rcfile='',
                 pattern=r'EOS', jumancommand='jumanpp', jumanrcfile='', jumanpp=True):
        self.command = command
        self.server = server
        self.port = port
        self.timeout = timeout
        self.option = option.split()
        self.rcfile = rcfile
        self.pattern = pattern
        # Backends are created lazily by parse.
        self.socket = None
        self.subprocess = None
        self.jumanpp = jumanpp

        if self.rcfile and not os.path.isfile(os.path.expanduser(self.rcfile)):
            raise Exception("Can't read rcfile (%s)!" % self.rcfile)
        if distutils.spawn.find_executable(self.command) is None:
            raise Exception("Can't find KNP command: %s" % self.command)

        self.juman = Juman(command=jumancommand, rcfile=jumanrcfile, jumanpp=self.jumanpp)

    def knp(self, sentence):
        """Same as :meth:`parse`.

        Returns:
            BList: the value returned by ``parse`` (previously discarded,
            so this method always returned None).
        """
        return self.parse(sentence)

    def parse(self, sentence, juman_format=JUMAN_FORMAT.DEFAULT):
        """Parse a sentence string and return the bunsetsu list.

        Args:
            sentence (str): text of the sentence; must be ``six.text_type``.
            juman_format (JUMAN_FORMAT): JUMAN lattice output format.

        Returns:
            BList: bunsetsu list object.
        """
        assert (isinstance(sentence, six.text_type))
        juman_lines = self.juman.juman_lines(sentence)
        juman_str = "%s%s" % (juman_lines, self.pattern)

        if not self.socket and not self.subprocess:
            # Start the backend on first use.
            if self.server is not None:
                self.socket = Socket(self.server, self.port, "RUN -tab -normal\n")
            else:
                command = [self.command] + self.option
                if self.rcfile:
                    command.extend(['-r', self.rcfile])
                self.subprocess = Subprocess(command)

        if self.socket:
            knp_lines = self.socket.query(juman_str, pattern=r'^%s$' % (self.pattern))
        else:
            knp_lines = self.subprocess.query(juman_str, pattern=r'^%s$' % (self.pattern))
        return BList(knp_lines, self.pattern, juman_format)

    def result(self, input_str, juman_format=JUMAN_FORMAT.DEFAULT):
        """Convert the KNP analysis result of a sentence into a bunsetsu list.

        Args:
            input_str (str): KNP output for one sentence.
            juman_format (JUMAN_FORMAT): JUMAN lattice output format.

        Returns:
            BList: bunsetsu list object.
        """
        return BList(input_str, self.pattern, juman_format)