def __init__(self, spec, bnst_id=0, newstyle=False):
    """Initialise a bunsetsu from its spec line.

    Args:
        spec: The bunsetsu header line. Either a bare ``*``, a line of the
            form ``* <parent_id><dpndtype> <features>``, or (when
            ``newstyle`` is true) a tab-separated record whose fields 2, 3,
            6 and 17 supply the parent id, dependency type, repname and
            feature string.
        bnst_id: Identifier stored on the instance as ``bnst_id``.
        newstyle: Parse ``spec`` as the tab-separated format.

    Raises:
        SystemExit: With status 1, after writing a message to stderr, when
            ``spec`` matches none of the accepted forms.
    """
    self._mrph_list = MList()
    self._tag_list = TList()
    self.parent_id = -1
    self.parent = None
    self.children = []
    self.dpndtype = ''
    self.fstring = ''
    self._pstring = ''
    self.bnst_id = bnst_id
    # Defined up front so the attribute exists on every path, including a
    # bare '*' spec combined with newstyle=True.
    self.repname = ''

    spec = spec.strip()
    if spec == '*':
        pass
    elif newstyle:
        items = spec.split(u"\t")
        self.parent_id = int(items[2])
        self.dpndtype = items[3]
        self.fstring = items[17]
        self.repname = items[6]
    else:
        # Match once and reuse the result instead of running the regex twice.
        match = re.match(r'\* (-?\d+)([DPIA])(.*)$', spec)
        if match is None:
            sys.stderr.write("Illegal bunsetsu spec: %s\n" % spec)
            # sys.exit instead of quit(): quit is a helper installed by the
            # `site` module for interactive use and is not always available.
            sys.exit(1)
        self.parent_id = int(match.group(1))
        self.dpndtype = match.group(2)
        self.fstring = match.group(3).strip()

    # Extract the normalized representative notation from the feature string.
    if not newstyle:
        match = re.search(r"<正規化代表表記:([^\"\s]+?)>", self.fstring)
        if match:
            self.repname = match.group(1)
def call_juman_interface(self, input_str):
    # type: (six.text_type)->MList
    """Tokenize ``input_str`` with the backend held in ``self.jumanpp_obj``.

    The backend is dispatched on its type: a ``Jumanpp`` object is asked to
    analyse the text directly, a ``JumanppHnadler`` is queried (and restarted
    once if the query raises ``UnicodeDecodeError``), and a ``JumanppClient``
    is queried with ``self.eos_pattern``.

    Returns:
        The ``MList`` produced by the backend.

    Raises:
        Exception: If ``self.jumanpp_obj`` is none of the supported types.
    """
    backend = self.jumanpp_obj

    if isinstance(backend, Jumanpp):
        return backend.analysis(input_str=input_str)

    if isinstance(backend, JumanppHnadler):
        try:
            raw_output = backend.query(input_string=input_str)
        except UnicodeDecodeError:
            logger.warning(msg="Process is down by some reason. It restarts process automatically.")
            backend.restart_process()
            # Single retry; a second failure propagates to the caller.
            raw_output = backend.query(input_string=input_str)
        return MList(raw_output)

    if isinstance(backend, JumanppClient):
        return MList(backend.query(sentence=input_str, pattern=self.eos_pattern))

    raise Exception('Not defined')
def __init__(self, spec, tag_id=0, juman_format=JUMAN_FORMAT.DEFAULT):
    """Initialise a tag from its spec line.

    Args:
        spec: The tag header line. Either a bare ``+``, a line of the form
            ``+ <parent_id><dpndtype> <features>``, or (for a non-default
            ``juman_format``) a tab-separated record whose fields 2, 3, 6
            and 17 supply the parent id, dependency type, repname and
            feature string.
        tag_id: Identifier stored on the instance as ``tag_id``.
        juman_format: Output format selector; anything other than
            ``JUMAN_FORMAT.DEFAULT`` selects the tab-separated parser.

    Raises:
        Exception: If ``spec`` matches none of the accepted forms.
    """
    self._mrph_list = MList()
    self.midasi = ''
    self.parent_id = -1
    self.parent = None
    self.children = []
    self.dpndtype = ''
    self.fstring = ''
    self.features = None
    self._pstring = ''
    self.tag_id = tag_id
    self.pas = None
    self.synnodes = []
    # Defined up front so these attributes exist on every path, not only
    # when the default format is parsed.
    self.repname = ''
    self.normalized_repname = ''
    self.head_repname = ''
    self.head_prime_repname = ''
    self.pred_repname = ''
    self.disambiguated_pred_repname = ''

    spec = spec.strip()
    if spec == '+':
        pass
    elif juman_format != JUMAN_FORMAT.DEFAULT:
        items = spec.split("\t")
        self.parent_id = int(items[2])
        self.dpndtype = items[3]
        self.fstring = items[17]
        self.repname = items[6]
        self.features = Features(self.fstring, "|", False)
        self.features._tag = self
    else:
        # Match once and reuse the result instead of running the regex twice.
        match = re.match(r'\+ (-?\d+)(\w)(.*)$', spec)
        if match is None:
            raise Exception("Illegal tag spec: %s" % spec)
        self.parent_id = int(match.group(1))
        self.dpndtype = match.group(2)
        self.fstring = match.group(3).strip()

    # Extract the representative notations from the parsed features.
    if juman_format == JUMAN_FORMAT.DEFAULT:
        self.features = Features(self.fstring)
        self.features._tag = self

        normalized_repname = self.features.get("正規化代表表記")
        if normalized_repname is not None:
            self.repname = normalized_repname
            self.normalized_repname = normalized_repname

        head_repname = self.features.get("主辞代表表記")
        if head_repname is not None:
            self.head_repname = head_repname

        head_prime_repname = self.features.get("主辞’代表表記")
        if head_prime_repname:
            self.head_prime_repname = head_prime_repname

        pred_repname = self.features.get("用言代表表記")
        if pred_repname is not None:
            self.pred_repname = pred_repname

        disambiguated_pred_repname = self.features.get("標準用言代表表記")
        if disambiguated_pred_repname is not None:
            self.disambiguated_pred_repname = disambiguated_pred_repname
def call_juman_interface(self, input_str):
    # type: (text_type) -> MList
    """Tokenize ``input_str`` with the backend held in ``self.jumanpp_obj``.

    A ``Jumanpp`` object analyses the text directly. A ``JumanppHnadler`` is
    queried; if the query raises ``ProcessDownException`` or
    ``UnicodeDecodeError`` the process is restarted, ``self.dummy_text`` is
    sent once, and the original query is retried. A ``JumanppClient`` is
    queried with ``self.eos_pattern``.

    Returns:
        The ``MList`` produced by the backend.

    Raises:
        Exception: If ``self.jumanpp_obj`` is none of the supported types.
    """
    backend = self.jumanpp_obj

    if isinstance(backend, Jumanpp):
        return backend.analysis(input_str=input_str)

    if isinstance(backend, JumanppHnadler):

        def restart_and_retry():
            # Restart, send the dummy text, then repeat the real query.
            backend.restart_process()
            backend.query(self.dummy_text)
            return backend.query(input_string=input_str)

        try:
            raw_output = backend.query(input_string=input_str)
        except ProcessDownException:
            logger.warning("Re-starting unix process because it takes longer time than {} seconds...".format(backend.timeout_second))
            raw_output = restart_and_retry()
        except UnicodeDecodeError:
            logger.warning(msg="Process is down by some reason. It restarts process automatically.")
            raw_output = restart_and_retry()
        return MList(raw_output)

    if isinstance(backend, JumanppClient):
        return MList(backend.query(sentence=input_str, pattern=self.eos_pattern))

    raise Exception('Not defined')
class Tag(object):
    """Holds the information of one tag (basic phrase), the unit of case analysis."""

    def __init__(self, spec, tag_id=0, newstyle=False):
        """Initialise a tag from its spec line.

        Args:
            spec: The tag header line. Either a bare ``+``, a line of the
                form ``+ <parent_id><dpndtype> <features>``, or (when
                ``newstyle`` is true) a tab-separated record whose fields 2,
                3, 6 and 17 supply the parent id, dependency type, repname
                and feature string.
            tag_id: Identifier stored on the instance as ``tag_id``.
            newstyle: Parse ``spec`` as the tab-separated format.

        Raises:
            SystemExit: With status 1, after writing a message to stderr,
                when ``spec`` matches none of the accepted forms.
        """
        self._mrph_list = MList()
        self.parent_id = -1
        self.parent = None
        self.children = []
        self.dpndtype = ''
        self.fstring = ''
        self.features = None
        self._pstring = ''
        self.tag_id = tag_id
        self.synnodes = []
        # Defined up front so the attribute exists on every path, including
        # a bare '+' spec combined with newstyle=True.
        self.repname = ''

        spec = spec.strip()
        if spec == '+':
            pass
        elif newstyle:
            items = spec.split(u"\t")
            self.parent_id = int(items[2])
            self.dpndtype = items[3]
            self.fstring = items[17]
            self.repname = items[6]
            self.features = Features(self.fstring, u"|", False)
        else:
            # Match once and reuse the result instead of running the regex twice.
            match = re.match(r'\+ (-?\d+)(\w)(.*)$', spec)
            if match is None:
                sys.stderr.write("Illegal tag spec: %s\n" % spec)
                # sys.exit instead of quit(): quit is a helper installed by
                # the `site` module for interactive use.
                sys.exit(1)
            self.parent_id = int(match.group(1))
            self.dpndtype = match.group(2)
            self.fstring = match.group(3).strip()

        # Extract the normalized representative notation from the features.
        if not newstyle:
            self.features = Features(self.fstring)
            rep = self.features.get(u"正規化代表表記")
            if rep is not None:
                self.repname = rep

    def push_mrph(self, mrph):
        """Append a morpheme to this tag's morpheme list."""
        self._mrph_list.push_mrph(mrph)

    def spec(self):
        """Return the spec text for this tag followed by its morphemes' spec."""
        return "+ %d%s %s\n%s" % (self.parent_id, self.dpndtype, self.fstring, self._mrph_list.spec())

    def mrph_list(self):
        """Return the morpheme list belonging to this tag."""
        return self._mrph_list

    def pstring(self, string=None):
        """Set the stored pstring when ``string`` is truthy, otherwise return it."""
        if string:
            self._pstring = string
        else:
            return self._pstring

    def get_surface(self):
        """Return the concatenated ``midasi`` of every morpheme in this tag."""
        return ''.join(mrph.midasi for mrph in self.mrph_list())
def __init__(self, spec, bnst_id=0, juman_format=JUMAN_FORMAT.DEFAULT):
    """Initialise a bunsetsu from its spec line.

    Args:
        spec: The bunsetsu header line. Either a bare ``*``, a line of the
            form ``* <parent_id><dpndtype> <features>``, or (for a
            non-default ``juman_format``) a tab-separated record whose
            fields 2, 3, 6 and 17 supply the parent id, dependency type,
            repname and feature string.
        bnst_id: Identifier stored on the instance as ``bnst_id``.
        juman_format: Output format selector; anything other than
            ``JUMAN_FORMAT.DEFAULT`` selects the tab-separated parser.

    Raises:
        Exception: If ``spec`` matches none of the accepted forms.
    """
    self._mrph_list = MList()
    self._tag_list = TList()
    self.midasi = ''
    self.parent_id = -1
    self.parent = None
    self.children = []
    self.prev = None
    self.next = None
    self.dpndtype = ''
    self.fstring = ''
    self._pstring = ''
    self.bnst_id = bnst_id
    # Defined up front so these attributes exist on every path, not only
    # when the default format is parsed.
    self.repname = ''
    self.normalized_repname = ''
    self.head_repname = ''
    self.head_prime_repname = ''

    spec = spec.strip()
    if spec == '*':
        pass
    elif juman_format != JUMAN_FORMAT.DEFAULT:  # TODO
        items = spec.split("\t")
        self.parent_id = int(items[2])
        self.dpndtype = items[3]
        self.fstring = items[17]
        self.repname = items[6]
    else:
        # Match once and reuse the result instead of running the regex twice.
        match = re.match(r'\* (-?\d+)([DPIA])(.*)$', spec)
        if match is None:
            raise Exception("Illegal bunsetsu spec: %s" % spec)
        self.parent_id = int(match.group(1))
        self.dpndtype = match.group(2)
        self.fstring = match.group(3).strip()

    self.features = Features(self.fstring)

    # Extract the representative notations from the parsed features.
    if juman_format == JUMAN_FORMAT.DEFAULT:
        normalized_repname = self.features.get("正規化代表表記")
        if normalized_repname:
            self.repname = normalized_repname
            self.normalized_repname = normalized_repname

        head_repname = self.features.get("主辞代表表記")
        if head_repname:
            self.head_repname = head_repname

        head_prime_repname = self.features.get("主辞’代表表記")
        if head_prime_repname:
            self.head_prime_repname = head_prime_repname
def loads_tags(text: str) -> Tags:
    """Rebuild a ``Tags`` object, with features attached, from serialized text.

    Lines are read until a line equal to ``EOS``. A line starting with
    ``_TAG_PREFIX`` carries a JSON payload that becomes the ``Features`` of a
    tag beginning at the next morpheme. Every other line is
    ``<morpheme text>\\t<JSON features>``.

    Args:
        text: The serialized representation, one record per line.

    Returns:
        The reconstructed ``Tags`` whose morphemes and tags carry ``fs``.

    Raises:
        ValueError: If a morpheme line contains no tab separator.
    """
    tags_fss: List[Features] = []
    mrph_fss: List[Features] = []
    indices = []
    mrph_lines: List[str] = []

    for line in text.split("\n"):
        if line == "EOS":
            break
        if line.startswith(_TAG_PREFIX):
            # A tag starts at the index of the next morpheme to be read.
            indices.append(len(mrph_fss))
            info = json.loads(line[len(_TAG_PREFIX):])
            tags_fss.append(Features(info))
        else:
            mtext, mfs = line.rsplit("\t", 1)
            mrph_lines.append(mtext + "\n")
            mrph_fss.append(Features(json.loads(mfs)))

    # Join once instead of growing a string inside the loop.
    mlist = MList("".join(mrph_lines))
    for mrph, mrph_fs in zip(mlist, mrph_fss):
        mrph.fs = mrph_fs

    tags = Tags(mlist, indices)
    assert len(tags) == len(tags_fss)
    for tag, tag_fs in zip(tags, tags_fss):
        tag.fs = tag_fs
    return tags
def __juman_parse(self, text: str, jumanpp: Optional[bool] = None) -> str:
    """Return the Juman output for ``text`` with ``self.knp.pattern`` appended.

    Raises:
        MorphTooLongError: If the number of morphemes is not below
            ``self.MAX_JUMAN_MORPHLENGTH``.
    """
    juman_output = self.__juman_lines(text, jumanpp)
    morph_count = len(MList(juman_output))

    # Guard clause: reject over-long inputs before building the result.
    if not morph_count < self.MAX_JUMAN_MORPHLENGTH:
        raise MorphTooLongError(
            f"{morph_count} morphs > {self.MAX_JUMAN_MORPHLENGTH}")

    return "%s%s" % (juman_output, self.knp.pattern)
def result(self, input_str):
    """Wrap Juman output text in an ``MList``.

    Args:
        input_str (str): Juman output.

    Returns:
        MList: The morpheme list object built from ``input_str``.
    """
    morphemes = MList(input_str)
    return morphemes
def result(self, input_str, juman_format=JUMAN_FORMAT.DEFAULT):
    """Wrap Juman output text in an ``MList``.

    Args:
        input_str (str): Juman output.
        juman_format (JUMAN_FORMAT): Juman lattice output format, passed
            through to ``MList``.

    Returns:
        MList: The morpheme list object built from ``input_str``.
    """
    morphemes = MList(input_str, juman_format)
    return morphemes
def _detailed_tokens(self, juman_lines: str) -> List[ShortUnitWord]:
    """Convert Juman output lines into a list of ``ShortUnitWord``.

    Each morpheme contributes its surface (``midasi``), a part-of-speech
    string ``"<hinsi>,<bunrui>"``, its lemma (``genkei``, falling back to the
    surface when empty) and its ``fstring``.
    """
    from pyknp import MList

    def to_word(morpheme):
        surface = morpheme.midasi
        pos = morpheme.hinsi + "," + morpheme.bunrui
        lemma = morpheme.genkei or surface
        return ShortUnitWord(surface, lemma, pos, morpheme.fstring, False)

    return [to_word(morpheme) for morpheme in MList(juman_lines).mrph_list()]
def _detailed_tokens(self, juman_lines: str) -> List[ShortUnitWord]:
    """Convert Juman output lines into a list of ``ShortUnitWord``.

    Each morpheme contributes its surface (``midasi``), a part-of-speech
    string ``"<hinsi>,<bunrui>"``, its lemma (``genkei``, falling back to the
    surface when empty) and its ``fstring``.
    """
    from pyknp import MList, Morpheme  # type: ignore

    def to_word(morpheme: Morpheme) -> ShortUnitWord:
        surface: str = morpheme.midasi  # type: ignore
        pos: str = morpheme.hinsi + "," + morpheme.bunrui  # type: ignore
        lemma: str = morpheme.genkei or surface  # type: ignore
        return ShortUnitWord(surface, lemma, pos, morpheme.fstring, "")  # type: ignore

    morphemes: List[Morpheme] = MList(juman_lines).mrph_list()
    return [to_word(morpheme) for morpheme in morphemes]
def get_mlist(inf) -> Iterator[MList]:
    """Yield one ``MList`` per ``EOS``-terminated group of lines in ``inf``.

    Lines are buffered until a line equal to ``"EOS\\n"`` is seen; the
    buffered lines are then joined and wrapped in an ``MList``. Lines after
    the last ``EOS`` are not yielded.

    Raises:
        ValueError: If a buffered line that does not start with ``@``
            contains no space.
    """
    pending: List[str] = []
    # NOTE(review): the collected surfaces are not read anywhere in the
    # visible code; kept because the slice below can raise.
    surfaces: List[str] = []

    for line in inf:
        if line != "EOS\n":
            pending.append(line)
            if not line.startswith("@"):
                space_at = line.index(" ")
                surfaces.append(line[:space_at])
            continue

        yield MList("".join(pending))
        pending = []
        surfaces = []
def __init__(self, spec, tag_id=0, newstyle=False):
    """Initialise a tag from its spec line.

    Args:
        spec: The tag header line. Either a bare ``+``, a line of the form
            ``+ <parent_id><dpndtype> <features>``, or (when ``newstyle`` is
            true) a tab-separated record whose fields 2, 3, 6 and 17 supply
            the parent id, dependency type, repname and feature string.
        tag_id: Identifier stored on the instance as ``tag_id``.
        newstyle: Parse ``spec`` as the tab-separated format.

    Raises:
        SystemExit: With status 1, after writing a message to stderr, when
            ``spec`` matches none of the accepted forms.
    """
    self._mrph_list = MList()
    self.parent_id = -1
    self.parent = None
    self.children = []
    self.dpndtype = ''
    self.fstring = ''
    self.features = None
    self._pstring = ''
    self.tag_id = tag_id
    self.synnodes = []
    # Defined up front so the attribute exists on every path, including a
    # bare '+' spec combined with newstyle=True.
    self.repname = ''

    spec = spec.strip()
    if spec == '+':
        pass
    elif newstyle:
        items = spec.split("\t")
        self.parent_id = int(items[2])
        self.dpndtype = items[3]
        self.fstring = items[17]
        self.repname = items[6]
        self.features = Features(self.fstring, "|", False)
    else:
        # Match once and reuse the result instead of running the regex twice.
        match = re.match(r'\+ (-?\d+)(\w)(.*)$', spec)
        if match is None:
            sys.stderr.write("Illegal tag spec: %s\n" % spec)
            # sys.exit instead of quit(): quit is a helper installed by the
            # `site` module for interactive use.
            sys.exit(1)
        self.parent_id = int(match.group(1))
        self.dpndtype = match.group(2)
        self.fstring = match.group(3).strip()

    # Extract the normalized representative notation from the features.
    if not newstyle:
        self.features = Features(self.fstring)
        rep = self.features.get("正規化代表表記")
        if rep is not None:
            self.repname = rep
def initialize(mlist: MList):
    """Attach a ``Features`` object (``fs``) to every morpheme of ``mlist``.

    The work is done once per list: a ``fs_inited`` marker is set on
    ``mlist`` and later calls return immediately. Each space-separated entry
    of a morpheme's ``imis`` is added to its ``fs``, except entries starting
    with ``代表表記``; morphemes whose ``imis`` is empty or ``"NIL"`` get an
    empty ``fs``. The first morpheme is additionally marked ``文頭`` and the
    last ``文末``.

    Args:
        mlist: The morpheme list to annotate in place. An empty list is
            only marked as initialised.
    """
    if hasattr(mlist, "fs_inited"):
        return
    mlist.fs_inited = True

    for m in mlist:
        m.fs = Features()
        if m.imis == "NIL" or len(m.imis) == 0:
            continue
        for f in m.imis.split(" "):
            if f.startswith("代表表記"):
                continue
            m.fs.add(f)

    # Guard the boundary markers: indexing an empty list would raise IndexError.
    if len(mlist) > 0:
        mlist[0].fs.add("文頭")
        mlist[-1].fs.add("文末")
def __init__(self, spec, bnst_id=0, juman_format=JUMAN_FORMAT.DEFAULT):
    """Initialise a bunsetsu from its spec line.

    Args:
        spec: The bunsetsu header line. Either a bare ``*``, a line of the
            form ``* <parent_id><dpndtype> <features>``, or (for a
            non-default ``juman_format``) a tab-separated record whose
            fields 2, 3, 6 and 17 supply the parent id, dependency type,
            repname and feature string.
        bnst_id: Identifier stored on the instance as ``bnst_id``.
        juman_format: Output format selector; anything other than
            ``JUMAN_FORMAT.DEFAULT`` selects the tab-separated parser.

    Raises:
        Exception: If ``spec`` matches none of the accepted forms.
    """
    self._mrph_list = MList()
    self._tag_list = TList()
    self.midasi = ''
    self.parent_id = -1
    self.parent = None
    self.children = []
    self.dpndtype = ''
    self.fstring = ''
    self._pstring = ''
    self.bnst_id = bnst_id
    # Defined up front so these attributes exist on every path, not only
    # when the default format is parsed.
    self.repname = ''
    self.normalized_repname = ''
    self.head_repname = ''
    self.head_prime_repname = ''

    spec = spec.strip()
    if spec == '*':
        pass
    elif juman_format != JUMAN_FORMAT.DEFAULT:  # TODO
        items = spec.split("\t")
        self.parent_id = int(items[2])
        self.dpndtype = items[3]
        self.fstring = items[17]
        self.repname = items[6]
    else:
        # Match once and reuse the result instead of running the regex twice.
        match = re.match(r'\* (-?\d+)([DPIA])(.*)$', spec)
        if match is None:
            raise Exception("Illegal bunsetsu spec: %s" % spec)
        self.parent_id = int(match.group(1))
        self.dpndtype = match.group(2)
        self.fstring = match.group(3).strip()

    self.features = Features(self.fstring)

    # Extract the representative notations from the parsed features.
    if juman_format == JUMAN_FORMAT.DEFAULT:
        normalized_repname = self.features.get("正規化代表表記")
        if normalized_repname:
            self.repname = normalized_repname
            self.normalized_repname = normalized_repname

        head_repname = self.features.get("主辞代表表記")
        if head_repname:
            self.head_repname = head_repname

        head_prime_repname = self.features.get("主辞’代表表記")
        if head_prime_repname:
            self.head_prime_repname = head_prime_repname
def call_juman_interface(self, input_str):
    # type: (str) -> MList
    """Tokenize ``input_str`` with the backend held in ``self.jumanpp_obj``.

    A ``Jumanpp`` object analyses the text directly; a ``JumanppClient`` is
    queried with ``self.eos_pattern`` and its response wrapped in ``MList``.

    Returns:
        The ``MList`` produced by the backend.

    Raises:
        Exception: If ``self.jumanpp_obj`` is neither supported type.
    """
    backend = self.jumanpp_obj

    if isinstance(backend, Jumanpp):
        return backend.analysis(input_str=input_str)

    if isinstance(backend, JumanppClient):
        response = backend.query(sentence=input_str, pattern=self.eos_pattern)
        return MList(response)

    raise Exception('Not defined')
def call_juman_interface(self, input_str):
    # type: (text_type)->MList
    """Tokenize ``input_str`` with the backend held in ``self.juman``.

    A ``pyknp.Juman`` object analyses the text directly. A ``JumanppHnadler``
    is queried and, if the query raises ``UnicodeDecodeError``, restarted and
    queried once more; its output is wrapped in ``MList``.

    Raises:
        Exception: If ``self.juman`` is neither supported type.
    """
    backend = self.juman

    if isinstance(backend, pyknp.Juman):
        return backend.analysis(input_str)

    if not isinstance(backend, JumanppHnadler):
        raise Exception('Not defined.')

    try:
        raw_output = backend.query(input_str)
    except UnicodeDecodeError:
        logger.warning(
            msg="Process is down by some reason. It restarts process automatically."
        )
        backend.restart_process()
        # Single retry; a second failure propagates to the caller.
        raw_output = backend.query(input_string=input_str)
    return MList(raw_output)
def get(self, sentence: str) -> Optional[MList]:
    """Return the ``MList`` stored for ``sentence``, or ``None`` if absent.

    The lookup goes through ``self.surf2parsed``; a missing or falsy entry
    yields ``None``.
    """
    stored = self.surf2parsed.get(sentence)
    return MList(stored) if stored else None
def juman_parse(self, text: str, jumanpp: Optional[bool] = None) -> MList:
    """Pre-normalize ``text``, run it through Juman and return the ``MList``."""
    juman_output = self.__juman_lines(self.prenormalize(text), jumanpp)
    return MList(juman_output)
class Bunsetsu(object):
    """Holds the information of one bunsetsu, the unit of KNP dependency analysis."""

    def __init__(self, spec, bnst_id=0, newstyle=False):
        """Initialise a bunsetsu from its spec line.

        Args:
            spec: The bunsetsu header line. Either a bare ``*``, a line of
                the form ``* <parent_id><dpndtype> <features>``, or (when
                ``newstyle`` is true) a tab-separated record whose fields 2,
                3, 6 and 17 supply the parent id, dependency type, repname
                and feature string.
            bnst_id: Identifier stored on the instance as ``bnst_id``.
            newstyle: Parse ``spec`` as the tab-separated format.

        Raises:
            SystemExit: With status 1, after writing a message to stderr,
                when ``spec`` matches none of the accepted forms.
        """
        self._mrph_list = MList()
        self._tag_list = TList()
        self.parent_id = -1
        self.parent = None
        self.children = []
        self.dpndtype = ''
        self.fstring = ''
        self._pstring = ''
        self.bnst_id = bnst_id
        # Defined up front so the attribute exists on every path, including
        # a bare '*' spec combined with newstyle=True.
        self.repname = ''

        spec = spec.strip()
        if spec == '*':
            pass
        elif newstyle:
            items = spec.split(u"\t")
            self.parent_id = int(items[2])
            self.dpndtype = items[3]
            self.fstring = items[17]
            self.repname = items[6]
        else:
            # Match once and reuse the result instead of running the regex twice.
            match = re.match(r'\* (-?\d+)([DPIA])(.*)$', spec)
            if match is None:
                sys.stderr.write("Illegal bunsetsu spec: %s\n" % spec)
                # sys.exit instead of quit(): quit is a helper installed by
                # the `site` module for interactive use.
                sys.exit(1)
            self.parent_id = int(match.group(1))
            self.dpndtype = match.group(2)
            self.fstring = match.group(3).strip()

        # Extract the normalized representative notation from the feature string.
        if not newstyle:
            match = re.search(r"<正規化代表表記:([^\"\s]+?)>", self.fstring)
            if match:
                self.repname = match.group(1)

    def push_mrph(self, mrph):
        """Append a morpheme to this bunsetsu and to its most recent tag, if any."""
        if len(self._tag_list) > 0:
            self._tag_list[-1].push_mrph(mrph)
        self._mrph_list.push_mrph(mrph)

    def push_tag(self, tag):
        """Append a tag; exits with status 1 if morphemes were added before any tag."""
        if len(self._tag_list) == 0 and len(self._mrph_list) > 0:
            sys.stderr.write("Unsafe addition of tags!\n")
            sys.exit(1)
        self._tag_list.push_tag(tag)

    def spec(self):
        """Return the spec text for this bunsetsu followed by its tags' spec."""
        return "* %d%s %s\n%s" % (self.parent_id, self.dpndtype, self.fstring, self._tag_list.spec())

    def mrph_list(self):
        """Return the morpheme list belonging to this bunsetsu."""
        return self._mrph_list

    def tag_list(self):
        """Return the tag list belonging to this bunsetsu."""
        return self._tag_list

    def pstring(self, string=None):
        """Set the stored pstring when ``string`` is truthy, otherwise return it."""
        if string:
            self._pstring = string
        else:
            return self._pstring
def result(self, input_str):
    """Wrap ``input_str`` in an ``MList`` and return it."""
    morphemes = MList(input_str)
    return morphemes
class Tag(object):
    """Holds the information of one tag (basic phrase) in a KNP analysis.

    Args:
        spec (str): KNP output line for the tag.
        tag_id (int): Tag ID.
        juman_format (JUMAN_FORMAT): Juman lattice output format.

    Attributes:
        tag_id (int): Tag ID.
        midasi (str): Surface form.
        parent (Tag): Parent tag object.
        parent_id (int): Parent tag ID.
        children (list): Child tag objects.
        dpndtype (str): Dependency type.
        fstring (str): Feature string.
        repname (str): Normalized representative notation (same as
            ``normalized_repname`` in the default format).
        normalized_repname (str): Normalized representative notation.
        head_repname (str): Head representative notation.
        head_prime_repname (str): Head-prime representative notation.
        pred_repname (str): Predicate representative notation.
        disambiguated_pred_repname (str): Standard predicate representative
            notation.
        features (Features): Features object for this tag.
        pas (Pas): Initialised to None in the constructor; per the original
            documentation, holds argument information (a Pas object) when
            the tag is a predicate.
    """

    def __init__(self, spec, tag_id=0, juman_format=JUMAN_FORMAT.DEFAULT):
        """Parse ``spec`` and populate the tag's attributes.

        Raises:
            Exception: If ``spec`` is not ``+``, not parsed as the
                tab-separated format, and does not match the
                ``+ <parent_id><dpndtype> <features>`` form.
        """
        self._mrph_list = MList()
        self.midasi = ''
        self.parent_id = -1
        self.parent = None
        self.children = []
        self.dpndtype = ''
        self.fstring = ''
        self.features = None
        self._pstring = ''
        self.tag_id = tag_id
        self.pas = None
        self.synnodes = []
        # Defined up front so the documented attributes exist on every
        # path, not only when the default format is parsed.
        self.repname = ''
        self.normalized_repname = ''
        self.head_repname = ''
        self.head_prime_repname = ''
        self.pred_repname = ''
        self.disambiguated_pred_repname = ''

        spec = spec.strip()
        if spec == '+':
            pass
        elif juman_format != JUMAN_FORMAT.DEFAULT:
            items = spec.split("\t")
            self.parent_id = int(items[2])
            self.dpndtype = items[3]
            self.fstring = items[17]
            self.repname = items[6]
            self.features = Features(self.fstring, "|", False)
            self.features._tag = self
        else:
            # Match once and reuse the result instead of running the regex twice.
            match = re.match(r'\+ (-?\d+)(\w)(.*)$', spec)
            if match is None:
                raise Exception("Illegal tag spec: %s" % spec)
            self.parent_id = int(match.group(1))
            self.dpndtype = match.group(2)
            self.fstring = match.group(3).strip()

        # Extract the representative notations from the parsed features.
        if juman_format == JUMAN_FORMAT.DEFAULT:
            self.features = Features(self.fstring)
            self.features._tag = self

            normalized_repname = self.features.get("正規化代表表記")
            if normalized_repname is not None:
                self.repname = normalized_repname
                self.normalized_repname = normalized_repname

            head_repname = self.features.get("主辞代表表記")
            if head_repname is not None:
                self.head_repname = head_repname

            head_prime_repname = self.features.get("主辞’代表表記")
            if head_prime_repname:
                self.head_prime_repname = head_prime_repname

            pred_repname = self.features.get("用言代表表記")
            if pred_repname is not None:
                self.pred_repname = pred_repname

            disambiguated_pred_repname = self.features.get("標準用言代表表記")
            if disambiguated_pred_repname is not None:
                self.disambiguated_pred_repname = disambiguated_pred_repname

    def push_mrph(self, mrph):
        """Append a new morpheme object to this tag."""
        self._mrph_list.push_mrph(mrph)

    def set_midasi(self):
        """Set ``midasi`` from the current morphemes."""
        self.midasi = self.get_surface()

    def spec(self):
        """Return the KNP output corresponding to this tag."""
        return "+ %d%s %s\n%s" % (self.parent_id, self.dpndtype, self.fstring, self._mrph_list.spec())

    def mrph_list(self):
        """Return all morpheme objects that make up this tag.

        Returns:
            list: Morpheme objects.
        """
        return self._mrph_list

    def pstring(self, string=None):
        """Set the string shown on the right by draw_tree, or return it.

        A truthy ``string`` is stored; otherwise the stored value is returned.
        """
        if string:
            self._pstring = string
        else:
            return self._pstring

    def get_surface(self):
        """Return the surface form of this tag.

        Returns:
            str: Concatenated ``midasi`` of all morphemes.
        """
        return ''.join(mrph.midasi for mrph in self.mrph_list())
class Bunsetsu(object):
    """Holds the information of one bunsetsu, the unit of KNP dependency analysis.

    Args:
        spec (str): The part of the KNP output that describes the bunsetsu.
        bnst_id (int): Bunsetsu ID.
        newstyle (bool): KNP format type (False for the public release of KNP).

    Attributes:
        bnst_id (int): Bunsetsu ID.
        midasi (str): Surface form.
        parent (Bunsetsu): Parent bunsetsu object.
        parent_id (int): Parent bunsetsu ID.
        children (list): Child bunsetsu objects.
        repname (str): Normalized representative notation (same as
            ``normalized_repname`` in the old-style format).
        normalized_repname (str): Normalized representative notation.
        head_repname (str): Head representative notation.
        head_prime_repname (str): Head-prime representative notation.
        fstring (str): Feature string.
    """

    def __init__(self, spec, bnst_id=0, newstyle=False):
        """Parse ``spec`` and populate the bunsetsu's attributes.

        Raises:
            Exception: If ``spec`` is not ``*``, ``newstyle`` is false, and
                ``spec`` does not match the
                ``* <parent_id><dpndtype> <features>`` form.
        """
        self._mrph_list = MList()
        self._tag_list = TList()
        self.midasi = ''
        self.parent_id = -1
        self.parent = None
        self.children = []
        self.dpndtype = ''
        self.fstring = ''
        self._pstring = ''
        self.bnst_id = bnst_id
        # Defined up front so the documented attributes exist on every
        # path, not only when the old-style format is parsed.
        self.repname = ''
        self.normalized_repname = ''
        self.head_repname = ''
        self.head_prime_repname = ''

        spec = spec.strip()
        if spec == '*':
            pass
        elif newstyle:
            items = spec.split("\t")
            self.parent_id = int(items[2])
            self.dpndtype = items[3]
            self.fstring = items[17]
            self.repname = items[6]
        else:
            # Match once and reuse the result instead of running the regex twice.
            match = re.match(r'\* (-?\d+)([DPIA])(.*)$', spec)
            if match is None:
                raise Exception("Illegal bunsetsu spec: %s" % spec)
            self.parent_id = int(match.group(1))
            self.dpndtype = match.group(2)
            self.fstring = match.group(3).strip()

        self.features = Features(self.fstring)

        # Extract the representative notations from the parsed features.
        if not newstyle:
            normalized_repname = self.features.get("正規化代表表記")
            if normalized_repname:
                self.repname = normalized_repname
                self.normalized_repname = normalized_repname

            head_repname = self.features.get("主辞代表表記")
            if head_repname:
                self.head_repname = head_repname

            head_prime_repname = self.features.get("主辞’代表表記")
            if head_prime_repname:
                self.head_prime_repname = head_prime_repname

    def push_mrph(self, mrph):
        """Append a new morpheme object to this bunsetsu and its latest tag."""
        if len(self._tag_list) > 0:
            self._tag_list[-1].push_mrph(mrph)
        self._mrph_list.push_mrph(mrph)

    def push_tag(self, tag):
        """Append a new tag object.

        Raises:
            Exception: If morphemes were already added before any tag.
        """
        if len(self._tag_list) == 0 and len(self._mrph_list) > 0:
            raise Exception("Unsafe addition of tags!")
        self._tag_list.push_tag(tag)

    def set_midasi(self):
        """Set ``midasi`` on every tag and on this bunsetsu."""
        for i in range(len(self._tag_list)):
            self._tag_list[i].set_midasi()
        self.midasi = ''.join(mrph.midasi for mrph in self.mrph_list())

    def spec(self):
        """Return the KNP output corresponding to this bunsetsu."""
        return "* %d%s %s\n%s" % (self.parent_id, self.dpndtype, self.fstring, self._tag_list.spec())

    def mrph_list(self):
        """Return all morpheme objects that make up this bunsetsu.

        Returns:
            list: Morpheme objects.
        """
        return self._mrph_list

    def tag_list(self):
        """Return all tag objects that make up this bunsetsu.

        Returns:
            list: Tag objects.
        """
        return self._tag_list

    def pstring(self, string=None):
        """Set the string shown on the right by draw_tree, or return it.

        A truthy ``string`` is stored; otherwise the stored value is returned.
        """
        if string:
            self._pstring = string
        else:
            return self._pstring
def juman(self, input_str):
    """Run Juman on ``input_str`` (a text string) and return the ``MList``."""
    assert (isinstance(input_str, six.text_type))
    juman_output = self.juman_lines(input_str)
    return MList(juman_output)
def jumanpp(self, input_str):
    """Run Juman++ on ``input_str`` (a ``str``) and return the ``MList``."""
    assert isinstance(input_str, str)
    jumanpp_output = self.jumanpp_lines(input_str)
    return MList(jumanpp_output)
def juman(self, input_str, juman_format=JUMAN_FORMAT.DEFAULT):
    """Run Juman on ``input_str`` and return the ``MList``.

    Same as the analysis function. ``juman_format`` is passed through to
    ``MList``.
    """
    assert(isinstance(input_str, six.text_type))
    juman_output = self.juman_lines(input_str)
    return MList(juman_output, juman_format)
class Bunsetsu(object):
    """Holds the information of one bunsetsu, the unit of KNP dependency analysis.

    Args:
        spec (str): The part of the KNP output that describes the bunsetsu.
        bnst_id (int): Bunsetsu ID.
        juman_format (JUMAN_FORMAT): Juman lattice output format.

    Attributes:
        bnst_id (int): Bunsetsu ID.
        midasi (str): Surface form.
        parent (Bunsetsu): Parent bunsetsu object.
        parent_id (int): Parent bunsetsu ID.
        children (list): Child bunsetsu objects.
        repname (str): Normalized representative notation (same as
            ``normalized_repname`` in the default format).
        normalized_repname (str): Normalized representative notation.
        head_repname (str): Head representative notation.
        head_prime_repname (str): Head-prime representative notation.
        fstring (str): Feature string.
    """

    def __init__(self, spec, bnst_id=0, juman_format=JUMAN_FORMAT.DEFAULT):
        """Parse ``spec`` and populate the bunsetsu's attributes.

        Raises:
            Exception: If ``spec`` is not ``*``, the default format is
                selected, and ``spec`` does not match the
                ``* <parent_id><dpndtype> <features>`` form.
        """
        self._mrph_list = MList()
        self._tag_list = TList()
        self.midasi = ''
        self.parent_id = -1
        self.parent = None
        self.children = []
        self.dpndtype = ''
        self.fstring = ''
        self._pstring = ''
        self.bnst_id = bnst_id
        # Defined up front so the documented attributes exist on every
        # path, not only when the default format is parsed.
        self.repname = ''
        self.normalized_repname = ''
        self.head_repname = ''
        self.head_prime_repname = ''

        spec = spec.strip()
        if spec == '*':
            pass
        elif juman_format != JUMAN_FORMAT.DEFAULT:  # TODO
            items = spec.split("\t")
            self.parent_id = int(items[2])
            self.dpndtype = items[3]
            self.fstring = items[17]
            self.repname = items[6]
        else:
            # Match once and reuse the result instead of running the regex twice.
            match = re.match(r'\* (-?\d+)([DPIA])(.*)$', spec)
            if match is None:
                raise Exception("Illegal bunsetsu spec: %s" % spec)
            self.parent_id = int(match.group(1))
            self.dpndtype = match.group(2)
            self.fstring = match.group(3).strip()

        self.features = Features(self.fstring)

        # Extract the representative notations from the parsed features.
        if juman_format == JUMAN_FORMAT.DEFAULT:
            normalized_repname = self.features.get("正規化代表表記")
            if normalized_repname:
                self.repname = normalized_repname
                self.normalized_repname = normalized_repname

            head_repname = self.features.get("主辞代表表記")
            if head_repname:
                self.head_repname = head_repname

            head_prime_repname = self.features.get("主辞’代表表記")
            if head_prime_repname:
                self.head_prime_repname = head_prime_repname

    def push_mrph(self, mrph):
        """Append a new morpheme object to this bunsetsu and its latest tag."""
        if len(self._tag_list) > 0:
            self._tag_list[-1].push_mrph(mrph)
        self._mrph_list.push_mrph(mrph)

    def push_tag(self, tag):
        """Append a new tag object.

        Raises:
            Exception: If morphemes were already added before any tag.
        """
        if len(self._tag_list) == 0 and len(self._mrph_list) > 0:
            raise Exception("Unsafe addition of tags!")
        self._tag_list.push_tag(tag)

    def set_midasi(self):
        """Set ``midasi`` on every tag and on this bunsetsu."""
        for i in range(len(self._tag_list)):
            self._tag_list[i].set_midasi()
        self.midasi = ''.join(mrph.midasi for mrph in self.mrph_list())

    def spec(self):
        """Return the KNP output corresponding to this bunsetsu."""
        return "* %d%s %s\n%s" % (self.parent_id, self.dpndtype, self.fstring, self._tag_list.spec())

    def mrph_list(self):
        """Return all morpheme objects that make up this bunsetsu.

        Returns:
            list: Morpheme objects.
        """
        return self._mrph_list

    def tag_list(self):
        """Return all tag objects that make up this bunsetsu.

        Returns:
            list: Tag objects.
        """
        return self._tag_list

    def pstring(self, string=None):
        """Set the string shown on the right by draw_tree, or return it.

        A truthy ``string`` is stored; otherwise the stored value is returned.
        """
        if string:
            self._pstring = string
        else:
            return self._pstring
class Bunsetsu(object):
    """Holds the information of one bunsetsu, the unit of KNP dependency analysis."""

    def __init__(self, spec, bnst_id=0, newstyle=False):
        """Initialise a bunsetsu from its spec line.

        Args:
            spec: The bunsetsu header line. Either a bare ``*``, a line of
                the form ``* <parent_id><dpndtype> <features>``, or (when
                ``newstyle`` is true) a tab-separated record whose fields 2,
                3, 6 and 17 supply the parent id, dependency type, repname
                and feature string.
            bnst_id: Identifier stored on the instance as ``bnst_id``.
            newstyle: Parse ``spec`` as the tab-separated format.

        Raises:
            SystemExit: With status 1, after writing a message to stderr,
                when ``spec`` matches none of the accepted forms.
        """
        self._mrph_list = MList()
        self._tag_list = TList()
        self.parent_id = -1
        self.parent = None
        self.children = []
        self.dpndtype = ''
        self.fstring = ''
        self._pstring = ''
        self.bnst_id = bnst_id
        # Defined up front so these attributes exist on every path,
        # including newstyle=True where they were previously left unset.
        self.repname = ''
        self.hrepname = ''
        self.hprepname = ''

        spec = spec.strip()
        if spec == '*':
            pass
        elif newstyle:
            items = spec.split("\t")
            self.parent_id = int(items[2])
            self.dpndtype = items[3]
            self.fstring = items[17]
            self.repname = items[6]
        else:
            # Match once and reuse the result instead of running the regex twice.
            match = re.match(r'\* (-?\d+)([DPIA])(.*)$', spec)
            if match is None:
                sys.stderr.write("Illegal bunsetsu spec: %s\n" % spec)
                # sys.exit instead of quit(): quit is a helper installed by
                # the `site` module for interactive use.
                sys.exit(1)
            self.parent_id = int(match.group(1))
            self.dpndtype = match.group(2)
            self.fstring = match.group(3).strip()

        # Extract the representative notations from the feature string.
        if not newstyle:
            match = re.search(r"<正規化代表表記:([^\"\s]+?)>", self.fstring)
            if match:
                self.repname = match.group(1)
            match = re.search(r"<主辞代表表記:([^\"\s]+?)>", self.fstring)
            if match:
                self.hrepname = match.group(1)
            match = re.search(r"<主辞’代表表記:([^\"\s]+?)>", self.fstring)
            if match:
                self.hprepname = match.group(1)

    def push_mrph(self, mrph):
        """Append a morpheme to this bunsetsu and to its most recent tag, if any."""
        if self._tag_list:
            self._tag_list[-1].push_mrph(mrph)
        self._mrph_list.push_mrph(mrph)

    def push_tag(self, tag):
        """Append a tag; exits with status 1 if morphemes were added before any tag."""
        if not self._tag_list and self._mrph_list:
            sys.stderr.write("Unsafe addition of tags!\n")
            sys.exit(1)
        self._tag_list.push_tag(tag)

    def spec(self):
        """Return the spec text for this bunsetsu followed by its tags' spec."""
        return "* %d%s %s\n%s" % (self.parent_id, self.dpndtype, self.fstring, self._tag_list.spec())

    def mrph_list(self):
        """Return the morpheme list belonging to this bunsetsu."""
        return self._mrph_list

    def tag_list(self):
        """Return the tag list belonging to this bunsetsu."""
        return self._tag_list

    def pstring(self, string=None):
        """Set the stored pstring when ``string`` is truthy, otherwise return it."""
        if string:
            self._pstring = string
        else:
            return self._pstring

    def bnst_head(self):
        """Return the head tag of this bunsetsu.

        A single tag is returned as is; otherwise the first tag whose
        features do not contain ``文節内`` is returned. Returns None when no
        tag qualifies.
        """
        if len(self.tag_list()) == 1:
            return self.tag_list()[0]
        for tag in self.tag_list():
            if '文節内' not in tag.features:
                return tag

    def recursive_children(self):
        """Return all descendants of this bunsetsu, each child after its own descendants."""
        def _collect(bnst, collected):
            # A bunsetsu listed among its own children would recurse forever.
            assert bnst not in bnst.children
            for child in bnst.children:
                _collect(child, collected)
                collected.append(child)
            return collected
        return _collect(self, [])

    def recursive_adnominals(self):
        """Return every child whose fstring contains ``<連体修飾>``, preceded by its descendants."""
        modifiers = []
        for c in self.children:
            if '<連体修飾>' in c.fstring:
                m_children = c.recursive_children()
                m_children.append(c)
                modifiers.extend(m_children)
        return modifiers