예제 #1
0
    def __init__(self, spec, bnst_id=0, newstyle=False):
        """Build a bunsetsu from one KNP bunsetsu header line.

        Args:
            spec: Header text. A bare ``*`` leaves every parsed field at its
                default. With ``newstyle`` it is read as tab-separated
                columns (columns 2, 3, 6 and 17 are used); otherwise it must
                look like ``* <parent_id><dpndtype><features>``.
            bnst_id: Identifier stored on the instance as ``bnst_id``.
            newstyle: Selects the tab-separated parser when true.

        Writes a message to stderr and exits the process with status 1 when
        ``spec`` matches none of the accepted forms.
        """
        self._mrph_list = MList()
        self._tag_list = TList()
        self.parent_id = -1
        self.parent = None
        self.children = []
        self.dpndtype = ''
        self.fstring = ''
        # Defined up front so the attribute also exists for a bare '*' spec
        # in newstyle mode, where no branch below assigns it.
        self.repname = ''
        self._pstring = ''
        self.bnst_id = bnst_id

        spec = spec.strip()
        if spec == '*':
            pass
        elif newstyle:
            items = spec.split(u"\t")
            self.parent_id = int(items[2])
            self.dpndtype = items[3]
            self.fstring = items[17]
            self.repname = items[6]
        else:
            # Match once and reuse the result instead of matching twice.
            match = re.match(r'\* (-?\d+)([DPIA])(.*)$', spec)
            if match is None:
                sys.stderr.write("Illegal bunsetsu spec: %s\n" % spec)
                # sys.exit instead of quit(): quit is injected by the site
                # module and is absent under ``python -S`` or frozen builds.
                sys.exit(1)
            self.parent_id = int(match.group(1))
            self.dpndtype = match.group(2)
            self.fstring = match.group(3).strip()

        # Pull the normalized representative form out of the feature string.
        if not newstyle:
            match = re.search(r"<正規化代表表記:([^\"\s]+?)>", self.fstring)
            if match:
                self.repname = match.group(1)
    def call_juman_interface(self, input_str):
        # type: (six.text_type)->MList
        """Tokenize ``input_str`` through the configured Juman++ backend.

        The backend is ``self.jumanpp_obj``; three kinds are supported and
        checked in this order: ``Jumanpp``, ``JumanppHnadler`` and
        ``JumanppClient``.

        Returns:
            The value of ``analysis()`` for a ``Jumanpp`` backend, otherwise
            an ``MList`` built from the backend's query response.

        Raises:
            Exception: If the backend is none of the supported kinds.
        """
        engine = self.jumanpp_obj

        if isinstance(engine, Jumanpp):
            return engine.analysis(input_str=input_str)

        if isinstance(engine, JumanppHnadler):
            try:
                raw_output = engine.query(input_string=input_str)
            except UnicodeDecodeError:
                # Restart the backend process and retry the query once.
                logger.warning(msg="Process is down by some reason. It restarts process automatically.")
                engine.restart_process()
                raw_output = engine.query(input_string=input_str)
            return MList(raw_output)

        if isinstance(engine, JumanppClient):
            reply = engine.query(sentence=input_str, pattern=self.eos_pattern)
            return MList(reply)

        raise Exception('Not defined')
예제 #3
0
    def __init__(self, spec, tag_id=0, juman_format=JUMAN_FORMAT.DEFAULT):
        """Build a tag (basic phrase) from one KNP tag header line.

        Args:
            spec: Header text. A bare ``+`` leaves the parsed fields at
                their defaults. With a non-default ``juman_format`` it is
                read as tab-separated columns (columns 2, 3, 6 and 17 are
                used); otherwise it must look like
                ``+ <parent_id><dpndtype><features>``.
            tag_id: Identifier stored on the instance as ``tag_id``.
            juman_format: Anything other than ``JUMAN_FORMAT.DEFAULT``
                selects the tab-separated parser.

        Raises:
            Exception: If ``spec`` matches none of the accepted forms.
        """
        self._mrph_list = MList()
        self.midasi = ''
        self.parent_id = -1
        self.parent = None
        self.children = []
        self.dpndtype = ''
        self.fstring = ''
        self.features = None
        self._pstring = ''
        self.tag_id = tag_id
        self.pas = None
        self.synnodes = []
        # Give every representative-notation attribute a default so it
        # exists no matter which parsing branch runs below.
        self.repname = ''
        self.normalized_repname = ''
        self.head_repname = ''
        self.head_prime_repname = ''
        self.pred_repname = ''
        self.disambiguated_pred_repname = ''

        spec = spec.strip()
        if spec == '+':
            pass
        elif juman_format != JUMAN_FORMAT.DEFAULT:
            items = spec.split("\t")
            self.parent_id = int(items[2])
            self.dpndtype = items[3]
            self.fstring = items[17]
            self.repname = items[6]
            self.features = Features(self.fstring, "|", False)
            self.features._tag = self
        else:
            # Match once and reuse the result instead of matching twice.
            match = re.match(r'\+ (-?\d+)(\w)(.*)$', spec)
            if match is None:
                raise Exception("Illegal tag spec: %s" % spec)
            self.parent_id = int(match.group(1))
            self.dpndtype = match.group(2)
            self.fstring = match.group(3).strip()

        # In the default format the representative notations are read from
        # the parsed feature string.
        if juman_format == JUMAN_FORMAT.DEFAULT:
            self.features = Features(self.fstring)
            self.features._tag = self

            normalized_repname = self.features.get("正規化代表表記")
            if normalized_repname is not None:
                self.repname = normalized_repname
                self.normalized_repname = normalized_repname
            head_repname = self.features.get("主辞代表表記")
            if head_repname is not None:
                self.head_repname = head_repname
            head_prime_repname = self.features.get("主辞’代表表記")
            if head_prime_repname:
                self.head_prime_repname = head_prime_repname
            pred_repname = self.features.get("用言代表表記")
            if pred_repname is not None:
                self.pred_repname = pred_repname
            disambiguated_pred_repname = self.features.get("標準用言代表表記")
            if disambiguated_pred_repname is not None:
                self.disambiguated_pred_repname = disambiguated_pred_repname
    def call_juman_interface(self, input_str):
        # type: (text_type) -> MList
        """Tokenize ``input_str`` through the configured Juman++ backend.

        The backend is ``self.jumanpp_obj``. For a ``JumanppHnadler`` backend
        a failed query (``ProcessDownException`` or ``UnicodeDecodeError``)
        triggers a process restart, a warm-up query with ``self.dummy_text``
        and a single retry.

        Returns:
            The value of ``analysis()`` for a ``Jumanpp`` backend, otherwise
            an ``MList`` built from the backend's query response.

        Raises:
            Exception: If the backend is none of the supported kinds.
        """
        engine = self.jumanpp_obj

        if isinstance(engine, Jumanpp):
            return engine.analysis(input_str=input_str)

        if isinstance(engine, JumanppHnadler):
            try:
                raw_output = engine.query(input_string=input_str)
            except (ProcessDownException, UnicodeDecodeError) as failure:
                # The two failure modes differ only in the message logged.
                if isinstance(failure, ProcessDownException):
                    logger.warning("Re-starting unix process because it takes longer time than {} seconds...".format(engine.timeout_second))
                else:
                    logger.warning(msg="Process is down by some reason. It restarts process automatically.")
                engine.restart_process()
                engine.query(self.dummy_text)
                raw_output = engine.query(input_string=input_str)
            return MList(raw_output)

        if isinstance(engine, JumanppClient):
            reply = engine.query(sentence=input_str, pattern=self.eos_pattern)
            return MList(reply)

        raise Exception('Not defined')
예제 #5
0
class Tag(object):
    """
    Holds the information of one tag (basic phrase), the unit of case analysis.
    """
    def __init__(self, spec, tag_id=0, newstyle=False):
        """Build a tag from one KNP tag header line.

        Args:
            spec: Header text. A bare ``+`` leaves the parsed fields at
                their defaults. With ``newstyle`` it is read as
                tab-separated columns (columns 2, 3, 6 and 17 are used);
                otherwise it must look like
                ``+ <parent_id><dpndtype><features>``.
            tag_id: Identifier stored on the instance as ``tag_id``.
            newstyle: Selects the tab-separated parser when true.

        Writes a message to stderr and exits the process with status 1 when
        ``spec`` matches none of the accepted forms.
        """
        self._mrph_list = MList()
        self.parent_id = -1
        self.parent = None
        self.children = []
        self.dpndtype = ''
        self.fstring = ''
        self.features = None
        # Defined up front so the attribute also exists for a bare '+' spec
        # in newstyle mode, where no branch below assigns it.
        self.repname = ''
        self._pstring = ''
        self.tag_id = tag_id
        self.synnodes = []

        spec = spec.strip()
        if spec == '+':
            pass
        elif newstyle:
            items = spec.split(u"\t")
            self.parent_id = int(items[2])
            self.dpndtype = items[3]
            self.fstring = items[17]
            self.repname = items[6]
            self.features = Features(self.fstring, u"|", False)
        else:
            # Match once and reuse the result instead of matching twice.
            match = re.match(r'\+ (-?\d+)(\w)(.*)$', spec)
            if match is None:
                sys.stderr.write("Illegal tag spec: %s\n" % spec)
                # sys.exit instead of quit(): quit is injected by the site
                # module and is absent under ``python -S`` or frozen builds.
                sys.exit(1)
            self.parent_id = int(match.group(1))
            self.dpndtype = match.group(2)
            self.fstring = match.group(3).strip()

        # Read the normalized representative form from the parsed features.
        if not newstyle:
            self.features = Features(self.fstring)
            rep = self.features.get(u"正規化代表表記")
            if rep is not None:
                self.repname = rep

    def push_mrph(self, mrph):
        """Append a morpheme to this tag's morpheme list."""
        self._mrph_list.push_mrph(mrph)

    def spec(self):
        """Return the tag header line followed by the morpheme list's spec."""
        return "+ %d%s %s\n%s" % (self.parent_id, self.dpndtype, self.fstring,
                                  self._mrph_list.spec())

    def mrph_list(self):
        """Return the morpheme list held by this tag."""
        return self._mrph_list

    def pstring(self, string=None):
        """Set the stored pstring when ``string`` is truthy, else return it.

        Setting returns ``None``. A falsy ``string`` (including ``''``) is
        treated as a read.
        """
        if string:
            self._pstring = string
        else:
            return self._pstring

    def get_surface(self):
        """Return the concatenated ``midasi`` of every morpheme in the tag."""
        return ''.join(mrph.midasi for mrph in self.mrph_list())
예제 #6
0
    def __init__(self, spec, bnst_id=0, juman_format=JUMAN_FORMAT.DEFAULT):
        """Build a bunsetsu from one KNP bunsetsu header line.

        Args:
            spec: Header text. A bare ``*`` leaves the parsed fields at
                their defaults. With a non-default ``juman_format`` it is
                read as tab-separated columns (columns 2, 3, 6 and 17 are
                used); otherwise it must look like
                ``* <parent_id><dpndtype><features>``.
            bnst_id: Identifier stored on the instance as ``bnst_id``.
            juman_format: Anything other than ``JUMAN_FORMAT.DEFAULT``
                selects the tab-separated parser.

        Raises:
            Exception: If ``spec`` matches none of the accepted forms.
        """
        self._mrph_list = MList()
        self._tag_list = TList()
        self.midasi = ''
        self.parent_id = -1
        self.parent = None
        self.children = []
        self.prev = None
        self.next = None
        self.dpndtype = ''
        self.fstring = ''
        self._pstring = ''
        self.bnst_id = bnst_id
        # Give every representative-notation attribute a default so it
        # exists no matter which parsing branch runs below.
        self.repname = ''
        self.normalized_repname = ''
        self.head_repname = ''
        self.head_prime_repname = ''

        spec = spec.strip()
        if spec == '*':
            pass
        elif juman_format != JUMAN_FORMAT.DEFAULT:  # TODO
            items = spec.split("\t")
            self.parent_id = int(items[2])
            self.dpndtype = items[3]
            self.fstring = items[17]
            self.repname = items[6]
        else:
            # Match once and reuse the result instead of matching twice.
            match = re.match(r'\* (-?\d+)([DPIA])(.*)$', spec)
            if match is None:
                raise Exception("Illegal bunsetsu spec: %s" % spec)
            self.parent_id = int(match.group(1))
            self.dpndtype = match.group(2)
            self.fstring = match.group(3).strip()
        self.features = Features(self.fstring)

        # In the default format the representative notations are read from
        # the parsed feature string.
        if juman_format == JUMAN_FORMAT.DEFAULT:
            normalized_repname = self.features.get("正規化代表表記")
            if normalized_repname:
                self.repname = normalized_repname
                self.normalized_repname = normalized_repname
            head_repname = self.features.get("主辞代表表記")
            if head_repname:
                self.head_repname = head_repname
            head_prime_repname = self.features.get("主辞’代表表記")
            if head_prime_repname:
                self.head_prime_repname = head_prime_repname
예제 #7
0
파일: tag.py 프로젝트: shirayu/pyknp
    def __init__(self, spec, tag_id=0, juman_format=JUMAN_FORMAT.DEFAULT):
        """Build a tag (basic phrase) from one KNP tag header line.

        Args:
            spec: Header text. A bare ``+`` leaves the parsed fields at
                their defaults. With a non-default ``juman_format`` it is
                read as tab-separated columns (columns 2, 3, 6 and 17 are
                used); otherwise it must look like
                ``+ <parent_id><dpndtype><features>``.
            tag_id: Identifier stored on the instance as ``tag_id``.
            juman_format: Anything other than ``JUMAN_FORMAT.DEFAULT``
                selects the tab-separated parser.

        Raises:
            Exception: If ``spec`` matches none of the accepted forms.
        """
        self._mrph_list = MList()
        self.midasi = ''
        self.parent_id = -1
        self.parent = None
        self.children = []
        self.dpndtype = ''
        self.fstring = ''
        self.features = None
        self._pstring = ''
        self.tag_id = tag_id
        self.pas = None
        self.synnodes = []
        # Defaults for the representative-notation attributes, so reading
        # them never fails for a non-default format or a bare '+' spec.
        self.repname = ''
        self.normalized_repname = ''
        self.head_repname = ''
        self.head_prime_repname = ''
        self.pred_repname = ''
        self.disambiguated_pred_repname = ''

        spec = spec.strip()
        if spec == '+':
            pass
        elif juman_format != JUMAN_FORMAT.DEFAULT:
            items = spec.split("\t")
            self.parent_id = int(items[2])
            self.dpndtype = items[3]
            self.fstring = items[17]
            self.repname = items[6]
            self.features = Features(self.fstring, "|", False)
            self.features._tag = self
        else:
            # A single match call serves as both the test and the parse.
            match = re.match(r'\+ (-?\d+)(\w)(.*)$', spec)
            if match is None:
                raise Exception("Illegal tag spec: %s" % spec)
            self.parent_id = int(match.group(1))
            self.dpndtype = match.group(2)
            self.fstring = match.group(3).strip()

        # Default format: look the representative notations up in the
        # parsed feature string.
        if juman_format == JUMAN_FORMAT.DEFAULT:
            self.features = Features(self.fstring)
            self.features._tag = self

            normalized_repname = self.features.get("正規化代表表記")
            if normalized_repname is not None:
                self.repname = normalized_repname
                self.normalized_repname = normalized_repname
            head_repname = self.features.get("主辞代表表記")
            if head_repname is not None:
                self.head_repname = head_repname
            head_prime_repname = self.features.get("主辞’代表表記")
            if head_prime_repname:
                self.head_prime_repname = head_prime_repname
            pred_repname = self.features.get("用言代表表記")
            if pred_repname is not None:
                self.pred_repname = pred_repname
            disambiguated_pred_repname = self.features.get("標準用言代表表記")
            if disambiguated_pred_repname is not None:
                self.disambiguated_pred_repname = disambiguated_pred_repname
예제 #8
0
def loads_tags(text: str) -> Tags:
    """Rebuild a ``Tags`` object from its serialized text form.

    Lines are read until an ``EOS`` line. A line starting with
    ``_TAG_PREFIX`` carries JSON features for a tag that begins at the next
    morpheme; every other line is a morpheme line whose last tab-separated
    field is the morpheme's JSON features.
    """
    tag_features: List[Features] = []
    morph_features: List[Features] = []
    tag_starts = []
    morph_lines = []

    for line in text.split("\n"):
        if line == "EOS":
            break
        if not line.startswith(_TAG_PREFIX):
            morph_text, feature_json = line.rsplit("\t", 1)
            morph_lines.append(morph_text + "\n")
            morph_features.append(Features(json.loads(feature_json)))
            continue
        # A tag starts at the index of the next morpheme to be read.
        tag_starts.append(len(morph_lines))
        payload = json.loads(line[len(_TAG_PREFIX):])
        tag_features.append(Features(payload))

    mlist = MList("".join(morph_lines))
    for morph, features in zip(mlist, morph_features):
        morph.fs = features

    tags = Tags(mlist, tag_starts)
    assert len(tags) == len(tag_features)
    for tag, features in zip(tags, tag_features):
        tag.fs = features
    return tags
예제 #9
0
 def __juman_parse(self, text: str, jumanpp: Optional[bool] = None) -> str:
     """Run Juman on ``text`` and return its output followed by the KNP pattern.

     Raises:
         MorphTooLongError: If the number of morphemes is not below
             ``self.MAX_JUMAN_MORPHLENGTH``.
     """
     juman_str = self.__juman_lines(text, jumanpp)
     m_length = len(MList(juman_str))
     # Guard clause: reject over-long input before building the result.
     if m_length >= self.MAX_JUMAN_MORPHLENGTH:
         raise MorphTooLongError(
             f"{m_length} morphs > {self.MAX_JUMAN_MORPHLENGTH}")
     return "%s%s" % (juman_str, self.knp.pattern)
예제 #10
0
    def result(self, input_str):
        """ Wrap raw Juman output in an MList object.

        Args:
            input_str (str): Juman output text

        Returns:
            MList: morpheme list object built from ``input_str``
        """
        morphemes = MList(input_str)
        return morphemes
예제 #11
0
파일: juman.py 프로젝트: yasu-shiba/pyknp
    def result(self, input_str, juman_format=JUMAN_FORMAT.DEFAULT):
        """ Wrap raw Juman output in an MList object.

        Args:
            input_str (str): Juman output text
            juman_format (JUMAN_FORMAT): Juman lattice output format

        Returns:
            MList: morpheme list object built from ``input_str``
        """
        morphemes = MList(input_str, juman_format)
        return morphemes
예제 #12
0
    def _detailed_tokens(self, juman_lines: str) -> List[ShortUnitWord]:
        """Convert Juman output into a list of ``ShortUnitWord`` records.

        Each morpheme contributes its surface, its lemma (falling back to the
        surface when ``genkei`` is empty), a ``hinsi,bunrui`` part-of-speech
        string, its feature string and ``False``.
        """
        from pyknp import MList

        return [
            ShortUnitWord(
                morph.midasi,
                morph.genkei or morph.midasi,
                morph.hinsi + "," + morph.bunrui,
                morph.fstring,
                False,
            )
            for morph in MList(juman_lines).mrph_list()
        ]
예제 #13
0
    def _detailed_tokens(self, juman_lines: str) -> List[ShortUnitWord]:
        """Convert Juman output into a list of ``ShortUnitWord`` records.

        Each morpheme contributes its surface, its lemma (falling back to the
        surface when ``genkei`` is empty), a ``hinsi,bunrui`` part-of-speech
        string, its feature string and an empty string.
        """
        from pyknp import MList, Morpheme  # type: ignore

        morphemes: List[Morpheme] = MList(juman_lines).mrph_list()
        return [
            ShortUnitWord(
                morph.midasi,  # type: ignore
                morph.genkei or morph.midasi,  # type: ignore
                morph.hinsi + "," + morph.bunrui,  # type: ignore
                morph.fstring,
                "",
            )  # type: ignore
            for morph in morphemes
        ]
예제 #14
0
파일: util.py 프로젝트: megagonlabs/desuwa
def get_mlist(inf) -> Iterator[MList]:
    """Yield one ``MList`` per sentence read from ``inf``.

    ``inf`` is iterated line by line. Lines are buffered until an ``EOS``
    line is seen; the buffered text (without the EOS line) is then passed to
    ``MList`` and yielded. Lines after the last ``EOS`` are not yielded.

    The ``EOS`` marker is recognised with or without a trailing line
    terminator, so a final ``EOS`` at end of input without a newline still
    closes its sentence.
    """
    buf: List[str] = []
    for line in inf:
        if line.rstrip("\r\n") == "EOS":
            yield MList("".join(buf))
            buf = []
            continue
        buf.append(line)
예제 #15
0
파일: tag.py 프로젝트: kzinmr/pyknp-extend
    def __init__(self, spec, tag_id=0, newstyle=False):
        """Build a tag (basic phrase) from one KNP tag header line.

        Args:
            spec: Header text. A bare ``+`` leaves the parsed fields at
                their defaults. With ``newstyle`` it is read as
                tab-separated columns (columns 2, 3, 6 and 17 are used);
                otherwise it must look like
                ``+ <parent_id><dpndtype><features>``.
            tag_id: Identifier stored on the instance as ``tag_id``.
            newstyle: Selects the tab-separated parser when true.

        Writes a message to stderr and exits the process with status 1 when
        ``spec`` matches none of the accepted forms.
        """
        self._mrph_list = MList()
        self.parent_id = -1
        self.parent = None
        self.children = []
        self.dpndtype = ''
        self.fstring = ''
        self.features = None
        # Defined up front so the attribute also exists for a bare '+' spec
        # in newstyle mode, where no branch below assigns it.
        self.repname = ''
        self._pstring = ''
        self.tag_id = tag_id
        self.synnodes = []

        spec = spec.strip()
        if spec == '+':
            pass
        elif newstyle:
            items = spec.split("\t")
            self.parent_id = int(items[2])
            self.dpndtype = items[3]
            self.fstring = items[17]
            self.repname = items[6]
            self.features = Features(self.fstring, "|", False)
        else:
            # Match once and reuse the result instead of matching twice.
            match = re.match(r'\+ (-?\d+)(\w)(.*)$', spec)
            if match is None:
                sys.stderr.write("Illegal tag spec: %s\n" % spec)
                # sys.exit instead of quit(): quit is injected by the site
                # module and is absent under ``python -S`` or frozen builds.
                sys.exit(1)
            self.parent_id = int(match.group(1))
            self.dpndtype = match.group(2)
            self.fstring = match.group(3).strip()

        # Read the normalized representative form from the parsed features.
        if not newstyle:
            self.features = Features(self.fstring)
            rep = self.features.get("正規化代表表記")
            if rep is not None:
                self.repname = rep
예제 #16
0
    def initialize(mlist: MList):
        """Attach a ``Features`` set (``fs``) to every morpheme of ``mlist``.

        The work is done once per list: ``mlist.fs_inited`` is set on first
        use and later calls return immediately. Each space-separated item of
        a morpheme's ``imis`` is added to its ``fs``, except items starting
        with ``代表表記``; an ``imis`` of ``"NIL"`` or ``""`` adds nothing.
        The first and last morphemes additionally receive ``文頭`` and
        ``文末``. An empty list is marked as initialized and left untouched.
        """
        if hasattr(mlist, "fs_inited"):
            return
        mlist.fs_inited = True

        for m in mlist:
            m.fs = Features()
            if m.imis == "NIL" or len(m.imis) == 0:
                continue
            for f in m.imis.split(" "):
                if f.startswith("代表表記"):
                    continue
                m.fs.add(f)

        # An empty list has no first or last morpheme to mark.
        if len(mlist) == 0:
            return
        mlist[0].fs.add("文頭")
        mlist[-1].fs.add("文末")
예제 #17
0
파일: bunsetsu.py 프로젝트: shirayu/pyknp
    def __init__(self, spec, bnst_id=0, juman_format=JUMAN_FORMAT.DEFAULT):
        """Build a bunsetsu from one KNP bunsetsu header line.

        Args:
            spec: Header text. A bare ``*`` leaves the parsed fields at
                their defaults. With a non-default ``juman_format`` it is
                read as tab-separated columns (columns 2, 3, 6 and 17 are
                used); otherwise it must look like
                ``* <parent_id><dpndtype><features>``.
            bnst_id: Identifier stored on the instance as ``bnst_id``.
            juman_format: Anything other than ``JUMAN_FORMAT.DEFAULT``
                selects the tab-separated parser.

        Raises:
            Exception: If ``spec`` matches none of the accepted forms.
        """
        self._mrph_list = MList()
        self._tag_list = TList()
        self.midasi = ''
        self.parent_id = -1
        self.parent = None
        self.children = []
        self.dpndtype = ''
        self.fstring = ''
        self._pstring = ''
        self.bnst_id = bnst_id
        # Defaults for the representative-notation attributes, so reading
        # them never fails for a non-default format or a bare '*' spec.
        self.repname = ''
        self.normalized_repname = ''
        self.head_repname = ''
        self.head_prime_repname = ''

        spec = spec.strip()
        if spec == '*':
            pass
        elif juman_format != JUMAN_FORMAT.DEFAULT: # TODO
            items = spec.split("\t")
            self.parent_id = int(items[2])
            self.dpndtype = items[3]
            self.fstring = items[17]
            self.repname = items[6]
        else:
            # A single match call serves as both the test and the parse.
            match = re.match(r'\* (-?\d+)([DPIA])(.*)$', spec)
            if match is None:
                raise Exception("Illegal bunsetsu spec: %s" % spec)
            self.parent_id = int(match.group(1))
            self.dpndtype = match.group(2)
            self.fstring = match.group(3).strip()
        self.features = Features(self.fstring)

        # Default format: look the representative notations up in the
        # parsed feature string.
        if juman_format == JUMAN_FORMAT.DEFAULT:
            normalized_repname = self.features.get("正規化代表表記")
            if normalized_repname:
                self.repname = normalized_repname
                self.normalized_repname = normalized_repname
            head_repname = self.features.get("主辞代表表記")
            if head_repname:
                self.head_repname = head_repname
            head_prime_repname = self.features.get("主辞’代表表記")
            if head_prime_repname:
                self.head_prime_repname = head_prime_repname
    def call_juman_interface(self, input_str):
        # type: (str) -> MList
        """Tokenize ``input_str`` through the configured Juman++ backend.

        The backend is ``self.jumanpp_obj``: a ``Jumanpp`` instance is asked
        to analyse the text directly, a ``JumanppClient`` is queried and its
        response wrapped in an ``MList``.

        Raises:
            Exception: If the backend is neither of the supported kinds.
        """
        engine = self.jumanpp_obj

        if isinstance(engine, Jumanpp):
            return engine.analysis(input_str=input_str)

        if isinstance(engine, JumanppClient):
            reply = engine.query(sentence=input_str, pattern=self.eos_pattern)
            return MList(reply)

        raise Exception('Not defined')
예제 #19
0
 def call_juman_interface(self, input_str):
     # type: (text_type)->MList
     """Tokenize ``input_str`` through the configured Juman backend.

     A ``pyknp.Juman`` backend analyses the text directly. A
     ``JumanppHnadler`` backend is queried; on ``UnicodeDecodeError`` its
     process is restarted and the query retried once.

     Raises:
         Exception: If ``self.juman`` is neither of the supported kinds.
     """
     engine = self.juman

     if isinstance(engine, pyknp.Juman):
         return engine.analysis(input_str)

     if not isinstance(engine, JumanppHnadler):
         raise Exception('Not defined.')

     try:
         raw_output = engine.query(input_str)
     except UnicodeDecodeError:
         logger.warning(
             msg=
             "Process is down by some reason. It restarts process automatically."
         )
         engine.restart_process()
         raw_output = engine.query(input_string=input_str)
     return MList(raw_output)
예제 #20
0
파일: util.py 프로젝트: megagonlabs/desuwa
 def get(self, sentence: str) -> Optional[MList]:
     """Return the stored parse of ``sentence`` as an ``MList``.

     Returns ``None`` when ``self.surf2parsed`` has no entry for
     ``sentence`` or the stored entry is falsy.
     """
     cached = self.surf2parsed.get(sentence)
     return MList(cached) if cached else None
예제 #21
0
 def juman_parse(self, text: str, jumanpp: Optional[bool] = None) -> MList:
     """Pre-normalize ``text``, run Juman on it and wrap the output in an ``MList``."""
     juman_output = self.__juman_lines(self.prenormalize(text), jumanpp)
     return MList(juman_output)
예제 #22
0
class Bunsetsu(object):
    """
    Holds the information of one bunsetsu, the unit of KNP dependency parsing.
    """

    def __init__(self, spec, bnst_id=0, newstyle=False):
        """Build a bunsetsu from one KNP bunsetsu header line.

        Args:
            spec: Header text. A bare ``*`` leaves the parsed fields at
                their defaults. With ``newstyle`` it is read as
                tab-separated columns (columns 2, 3, 6 and 17 are used);
                otherwise it must look like
                ``* <parent_id><dpndtype><features>``.
            bnst_id: Identifier stored on the instance as ``bnst_id``.
            newstyle: Selects the tab-separated parser when true.

        Writes a message to stderr and exits the process with status 1 when
        ``spec`` matches none of the accepted forms.
        """
        self._mrph_list = MList()
        self._tag_list = TList()
        self.parent_id = -1
        self.parent = None
        self.children = []
        self.dpndtype = ''
        self.fstring = ''
        # Defined up front so the attribute also exists for a bare '*' spec
        # in newstyle mode, where no branch below assigns it.
        self.repname = ''
        self._pstring = ''
        self.bnst_id = bnst_id

        spec = spec.strip()
        if spec == '*':
            pass
        elif newstyle:
            items = spec.split(u"\t")
            self.parent_id = int(items[2])
            self.dpndtype = items[3]
            self.fstring = items[17]
            self.repname = items[6]
        else:
            # Match once and reuse the result instead of matching twice.
            match = re.match(r'\* (-?\d+)([DPIA])(.*)$', spec)
            if match is None:
                sys.stderr.write("Illegal bunsetsu spec: %s\n" % spec)
                # sys.exit instead of quit(): quit is injected by the site
                # module and is absent under ``python -S`` or frozen builds.
                sys.exit(1)
            self.parent_id = int(match.group(1))
            self.dpndtype = match.group(2)
            self.fstring = match.group(3).strip()

        # Pull the normalized representative form out of the feature string.
        if not newstyle:
            match = re.search(r"<正規化代表表記:([^\"\s]+?)>", self.fstring)
            if match:
                self.repname = match.group(1)

    def push_mrph(self, mrph):
        """Append a morpheme to the bunsetsu and to its last tag, if any."""
        if len(self._tag_list) > 0:
            self._tag_list[-1].push_mrph(mrph)
        self._mrph_list.push_mrph(mrph)

    def push_tag(self, tag):
        """Append a tag to the bunsetsu.

        Exits the process with status 1 when morphemes were already added
        before the first tag.
        """
        if len(self._tag_list) == 0 and len(self._mrph_list) > 0:
            sys.stderr.write("Unsafe addition of tags!\n")
            # sys.exit instead of quit(): see the note in __init__.
            sys.exit(1)
        self._tag_list.push_tag(tag)

    def spec(self):
        """Return the bunsetsu header line followed by the tag list's spec."""
        return "* %d%s %s\n%s" % (self.parent_id, self.dpndtype,
                                  self.fstring, self._tag_list.spec())

    def mrph_list(self):
        """Return the morpheme list held by this bunsetsu."""
        return self._mrph_list

    def tag_list(self):
        """Return the tag list held by this bunsetsu."""
        return self._tag_list

    def pstring(self, string=None):
        """Set the stored pstring when ``string`` is truthy, else return it.

        Setting returns ``None``. A falsy ``string`` (including ``''``) is
        treated as a read.
        """
        if string:
            self._pstring = string
        else:
            return self._pstring
예제 #23
0
 def result(self, input_str):
     """Wrap ``input_str`` in an ``MList`` and return it."""
     morphemes = MList(input_str)
     return morphemes
예제 #24
0
파일: tag.py 프로젝트: shirayu/pyknp
class Tag(object):
    """
    A basic phrase (tag) taken from KNP output.

    Args:
        spec (str): KNP output
        tag_id (int): basic phrase ID
        juman_format (JUMAN_FORMAT): Juman lattice output format

    Attributes:
        tag_id (int): basic phrase ID
        midasi (str): surface form
        parent (Tag): parent basic phrase object
        parent_id (int): ID of the parent basic phrase
        children (list): list of child basic phrase objects
        dpndtype (str): dependency type
        fstring (str): feature string
        repname (str): normalized representative notation (same as normalized_repname)
        normalized_repname (str): normalized representative notation
        head_repname (str): head representative notation
        head_prime_repname (str): head-prime representative notation
        pred_repname (str): predicate representative notation
        disambiguated_pred_repname (str): standard predicate representative notation
        features (Features): Features object describing the basic phrase
        pas (Pas): argument information (Pas object) when the phrase is a predicate, otherwise None
    """

    def __init__(self, spec, tag_id=0, juman_format=JUMAN_FORMAT.DEFAULT):
        """Parse ``spec`` and populate the attributes listed on the class.

        A bare ``+`` leaves the parsed fields at their defaults. With a
        non-default ``juman_format`` the spec is read as tab-separated
        columns (columns 2, 3, 6 and 17 are used); otherwise it must look
        like ``+ <parent_id><dpndtype><features>``.

        Raises:
            Exception: If ``spec`` matches none of the accepted forms.
        """
        self._mrph_list = MList()
        self.midasi = ''
        self.parent_id = -1
        self.parent = None
        self.children = []
        self.dpndtype = ''
        self.fstring = ''
        self.features = None
        self._pstring = ''
        self.tag_id = tag_id
        self.pas = None
        self.synnodes = []
        # Every documented representative-notation attribute gets a default
        # here so it exists no matter which parsing branch runs below.
        self.repname = ''
        self.normalized_repname = ''
        self.head_repname = ''
        self.head_prime_repname = ''
        self.pred_repname = ''
        self.disambiguated_pred_repname = ''

        spec = spec.strip()
        if spec == '+':
            pass
        elif juman_format != JUMAN_FORMAT.DEFAULT:
            items = spec.split("\t")
            self.parent_id = int(items[2])
            self.dpndtype = items[3]
            self.fstring = items[17]
            self.repname = items[6]
            self.features = Features(self.fstring, "|", False)
            self.features._tag = self
        else:
            # Match once and reuse the result instead of matching twice.
            match = re.match(r'\+ (-?\d+)(\w)(.*)$', spec)
            if match is None:
                raise Exception("Illegal tag spec: %s" % spec)
            self.parent_id = int(match.group(1))
            self.dpndtype = match.group(2)
            self.fstring = match.group(3).strip()

        # In the default format the representative notations are read from
        # the parsed feature string.
        if juman_format == JUMAN_FORMAT.DEFAULT:
            self.features = Features(self.fstring)
            self.features._tag = self

            normalized_repname = self.features.get("正規化代表表記")
            if normalized_repname is not None:
                self.repname = normalized_repname
                self.normalized_repname = normalized_repname
            head_repname = self.features.get("主辞代表表記")
            if head_repname is not None:
                self.head_repname = head_repname
            head_prime_repname = self.features.get("主辞’代表表記")
            if head_prime_repname:
                self.head_prime_repname = head_prime_repname
            pred_repname = self.features.get("用言代表表記")
            if pred_repname is not None:
                self.pred_repname = pred_repname
            disambiguated_pred_repname = self.features.get("標準用言代表表記")
            if disambiguated_pred_repname is not None:
                self.disambiguated_pred_repname = disambiguated_pred_repname

    def push_mrph(self, mrph):
        """ Append a new morpheme object """
        self._mrph_list.push_mrph(mrph)

    def set_midasi(self):
        """ Set midasi from the current morphemes """
        self.midasi = self.get_surface()

    def spec(self):
        """ KNP output corresponding to this basic phrase """
        return "+ %d%s %s\n%s" % (self.parent_id, self.dpndtype, self.fstring,
                                  self._mrph_list.spec())

    def mrph_list(self):
        """ Return all morpheme objects that make up the basic phrase

        Returns:
            list: list of Morpheme objects
        """
        return self._mrph_list

    def pstring(self, string=None):
        """ Return the string printed on the right-hand side by draw_tree

        When ``string`` is truthy it is stored instead and None is returned.
        """
        if string:
            self._pstring = string
        else:
            return self._pstring

    def get_surface(self):
        """ Return the surface form of the basic phrase

        Returns:
            str: surface form of the basic phrase
        """
        return ''.join(mrph.midasi for mrph in self.mrph_list())
예제 #25
0
class Bunsetsu(object):
    """
    Holds the information of one bunsetsu, the unit of KNP dependency parsing.

    Args:
        spec (str): the part of the KNP output that describes the bunsetsu
        bnst_id (int): bunsetsu ID
        newstyle (bool): KNP format variant (False for the public release of KNP)

    Attributes:
        bnst_id (int): bunsetsu ID
        midasi (str): surface form
        parent (Bunsetsu): parent bunsetsu object
        parent_id (int): ID of the parent bunsetsu
        children (list): list of child bunsetsu objects
        repname (str): normalized representative notation (same as normalized_repname)
        normalized_repname (str): normalized representative notation
        head_repname (str): head representative notation
        head_prime_repname (str): head-prime representative notation
        fstring (str): feature string
    """
    def __init__(self, spec, bnst_id=0, newstyle=False):
        """Parse ``spec`` and populate the attributes listed on the class.

        A bare ``*`` leaves the parsed fields at their defaults. With
        ``newstyle`` the spec is read as tab-separated columns (columns 2,
        3, 6 and 17 are used); otherwise it must look like
        ``* <parent_id><dpndtype><features>``.

        Raises:
            Exception: If ``spec`` matches none of the accepted forms.
        """
        self._mrph_list = MList()
        self._tag_list = TList()
        self.midasi = ''
        self.parent_id = -1
        self.parent = None
        self.children = []
        self.dpndtype = ''
        self.fstring = ''
        self._pstring = ''
        self.bnst_id = bnst_id
        # Every documented representative-notation attribute gets a default
        # here so it exists no matter which parsing branch runs below.
        self.repname = ''
        self.normalized_repname = ''
        self.head_repname = ''
        self.head_prime_repname = ''

        spec = spec.strip()
        if spec == '*':
            pass
        elif newstyle:
            items = spec.split("\t")
            self.parent_id = int(items[2])
            self.dpndtype = items[3]
            self.fstring = items[17]
            self.repname = items[6]
        else:
            # Match once and reuse the result instead of matching twice.
            match = re.match(r'\* (-?\d+)([DPIA])(.*)$', spec)
            if match is None:
                raise Exception("Illegal bunsetsu spec: %s" % spec)
            self.parent_id = int(match.group(1))
            self.dpndtype = match.group(2)
            self.fstring = match.group(3).strip()
        self.features = Features(self.fstring)

        # In the old-style format the representative notations are read
        # from the parsed feature string.
        if not newstyle:
            normalized_repname = self.features.get("正規化代表表記")
            if normalized_repname:
                self.repname = normalized_repname
                self.normalized_repname = normalized_repname
            head_repname = self.features.get("主辞代表表記")
            if head_repname:
                self.head_repname = head_repname
            head_prime_repname = self.features.get("主辞’代表表記")
            if head_prime_repname:
                self.head_prime_repname = head_prime_repname

    def push_mrph(self, mrph):
        """ Append a new morpheme object (also to the last tag, if any) """
        if len(self._tag_list) > 0:
            self._tag_list[-1].push_mrph(mrph)
        self._mrph_list.push_mrph(mrph)

    def push_tag(self, tag):
        """ Append a new basic phrase (tag) object

        Raises:
            Exception: If morphemes were already added before the first tag.
        """
        if len(self._tag_list) == 0 and len(self._mrph_list) > 0:
            raise Exception("Unsafe addition of tags!")
        self._tag_list.push_tag(tag)

    def set_midasi(self):
        """ Set midasi on every tag and on the bunsetsu itself """
        for i in range(len(self._tag_list)):
            self._tag_list[i].set_midasi()
        self.midasi = ''.join(mrph.midasi for mrph in self.mrph_list())

    def spec(self):
        """ KNP output corresponding to this bunsetsu """
        return "* %d%s %s\n%s" % (self.parent_id, self.dpndtype, self.fstring,
                                  self._tag_list.spec())

    def mrph_list(self):
        """ Return all morpheme objects that make up the bunsetsu

        Returns:
            list: list of Morpheme objects
        """
        return self._mrph_list

    def tag_list(self):
        """ Return all basic phrase objects that make up the bunsetsu

        Returns:
            list: list of Tag objects
        """
        return self._tag_list

    def pstring(self, string=None):
        """ Return the string printed on the right-hand side by draw_tree

        When ``string`` is truthy it is stored instead and None is returned.
        """
        if string:
            self._pstring = string
        else:
            return self._pstring
예제 #26
0
 def juman(self, input_str):
     """Run Juman on ``input_str`` (a text string) and return the result as an ``MList``."""
     assert isinstance(input_str, six.text_type)
     juman_output = self.juman_lines(input_str)
     return MList(juman_output)
예제 #27
0
 def jumanpp(self, input_str):
     """Run Juman++ on ``input_str`` (a ``str``) and return the result as an ``MList``."""
     assert isinstance(input_str, str)
     jumanpp_output = self.jumanpp_lines(input_str)
     return MList(jumanpp_output)
예제 #28
0
파일: juman.py 프로젝트: yasu-shiba/pyknp
 def juman(self, input_str, juman_format=JUMAN_FORMAT.DEFAULT):
     """ Same as the analysis function """
     assert isinstance(input_str, six.text_type)
     juman_output = self.juman_lines(input_str)
     return MList(juman_output, juman_format)
예제 #29
0
파일: bunsetsu.py 프로젝트: shirayu/pyknp
class Bunsetsu(object):
    """
    Holds the information about one bunsetsu (phrase), the unit of
    dependency parsing performed by KNP.

    Args:
        spec (str): the part of the KNP output that describes this bunsetsu
        bnst_id (int): bunsetsu ID
        juman_format (JUMAN_FORMAT): Juman lattice output format

    Attributes:
        bnst_id (int): bunsetsu ID
        midasi (str): surface string
        parent (Bunsetsu): parent bunsetsu object
        parent_id (int): parent bunsetsu ID
        children (list): list of child bunsetsu objects
        dpndtype (str): dependency type
        repname (str): normalized representative form (same as
            normalized_repname for the default format)
        normalized_repname (str): normalized representative form
        head_repname (str): head representative form
        head_prime_repname (str): head-prime representative form
        fstring (str): feature string
        features (Features): Features object built from fstring
    """

    def __init__(self, spec, bnst_id=0, juman_format=JUMAN_FORMAT.DEFAULT):
        self._mrph_list = MList()
        self._tag_list = TList()
        self.midasi = ''
        self.parent_id = -1
        self.parent = None
        self.children = []
        self.dpndtype = ''
        self.fstring = ''
        self._pstring = ''
        self.bnst_id = bnst_id
        # Initialise every representative-form attribute up front so that
        # they exist on all code paths (previously they were missing for
        # non-default formats, and reading them raised AttributeError).
        self.repname = ''
        self.normalized_repname = ''
        self.head_repname = ''
        self.head_prime_repname = ''

        spec = spec.strip()
        if spec == '*':
            pass
        elif juman_format != JUMAN_FORMAT.DEFAULT:  # TODO
            # Tab-separated layout: fixed column positions.
            items = spec.split("\t")
            self.parent_id = int(items[2])
            self.dpndtype = items[3]
            self.fstring = items[17]
            self.repname = items[6]
        else:
            # Default layout: "* <parent_id><dpndtype> <features>".
            match = re.match(r'\* (-?\d+)([DPIA])(.*)$', spec)
            if match is None:
                raise Exception("Illegal bunsetsu spec: %s" % spec)
            self.parent_id = int(match.group(1))
            self.dpndtype = match.group(2)
            self.fstring = match.group(3).strip()
        self.features = Features(self.fstring)

        # Extract the representative forms from the features (default
        # format only; empty/missing values leave the '' defaults in place).
        if juman_format == JUMAN_FORMAT.DEFAULT:
            normalized_repname = self.features.get("正規化代表表記")
            if normalized_repname:
                self.repname = normalized_repname
                self.normalized_repname = normalized_repname
            head_repname = self.features.get("主辞代表表記")
            if head_repname:
                self.head_repname = head_repname
            head_prime_repname = self.features.get("主辞’代表表記")
            if head_prime_repname:
                self.head_prime_repname = head_prime_repname

    def push_mrph(self, mrph):
        """Register a new morpheme object.

        The morpheme is also forwarded to the most recently added tag when
        the tag list is non-empty.
        """
        if len(self._tag_list) > 0:
            self._tag_list[-1].push_mrph(mrph)
        self._mrph_list.push_mrph(mrph)

    def push_tag(self, tag):
        """Register a new basic-phrase (tag) object.

        Raises:
            Exception: if morphemes were pushed before any tag existed.
        """
        if len(self._tag_list) == 0 and len(self._mrph_list) > 0:
            raise Exception("Unsafe addition of tags!")
        self._tag_list.push_tag(tag)

    def set_midasi(self):
        """Set ``midasi`` on every tag and then on this bunsetsu."""
        for tag in self._tag_list:
            tag.set_midasi()
        self.midasi = ''.join(mrph.midasi for mrph in self.mrph_list())

    def spec(self):
        """Return the KNP output text corresponding to this bunsetsu."""
        return "* %d%s %s\n%s" % (self.parent_id, self.dpndtype,
                                  self.fstring, self._tag_list.spec())

    def mrph_list(self):
        """Return all morpheme objects that make up this bunsetsu.

        Returns:
            The internal morpheme container (``self._mrph_list``).
        """
        return self._mrph_list

    def tag_list(self):
        """Return all basic-phrase (tag) objects that make up this bunsetsu.

        Returns:
            The internal tag container (``self._tag_list``).
        """
        return self._tag_list

    def pstring(self, string=None):
        """Get or set the string printed on the right side by draw_tree.

        A truthy ``string`` is stored (and None is returned); otherwise the
        currently stored string is returned.
        """
        if string:
            self._pstring = string
        else:
            return self._pstring
예제 #30
0
class Tag(object):
    """
    Holds the information about one basic phrase (tag) parsed from a line
    of KNP output.

    Args:
        spec (str): KNP output
        tag_id (int): basic-phrase ID
        juman_format (JUMAN_FORMAT): Juman lattice output format

    Attributes:
        tag_id (int): basic-phrase ID
        midasi (str): surface string
        parent (Tag): parent basic-phrase object
        parent_id (int): parent basic-phrase ID
        children (list): list of child basic-phrase objects
        dpndtype (str): dependency type
        fstring (str): feature string
        repname (str): normalized representative form (same as
            normalized_repname for the default format)
        normalized_repname (str): normalized representative form
        head_repname (str): head representative form
        head_prime_repname (str): head-prime representative form
        pred_repname (str): predicate representative form
        disambiguated_pred_repname (str): standard predicate
            representative form
        features (Features): Features object for this basic phrase
        pas (Pas): initialised to None here; per the original notes it
            holds the argument information (a Pas object) when the basic
            phrase is a predicate
    """
    def __init__(self, spec, tag_id=0, juman_format=JUMAN_FORMAT.DEFAULT):
        self._mrph_list = MList()
        self.midasi = ''
        self.parent_id = -1
        self.parent = None
        self.children = []
        self.dpndtype = ''
        self.fstring = ''
        self.features = None
        self._pstring = ''
        self.tag_id = tag_id
        self.pas = None
        self.synnodes = []
        # Initialise every representative-form attribute up front so that
        # they exist on all code paths (previously they were missing for
        # non-default formats, and reading them raised AttributeError).
        self.repname = ''
        self.normalized_repname = ''
        self.head_repname = ''
        self.head_prime_repname = ''
        self.pred_repname = ''
        self.disambiguated_pred_repname = ''

        spec = spec.strip()
        if spec == '+':
            pass
        elif juman_format != JUMAN_FORMAT.DEFAULT:
            # Tab-separated layout: fixed column positions.
            items = spec.split("\t")
            self.parent_id = int(items[2])
            self.dpndtype = items[3]
            self.fstring = items[17]
            self.repname = items[6]
            self.features = Features(self.fstring, "|", False)
            self.features._tag = self
        else:
            # Default layout: "+ <parent_id><dpndtype> <features>".
            match = re.match(r'\+ (-?\d+)(\w)(.*)$', spec)
            if match is None:
                raise Exception("Illegal tag spec: %s" % spec)
            self.parent_id = int(match.group(1))
            self.dpndtype = match.group(2)
            self.fstring = match.group(3).strip()

        # Extract the representative forms from the features (default
        # format only; missing values leave the '' defaults in place).
        if juman_format == JUMAN_FORMAT.DEFAULT:
            self.features = Features(self.fstring)
            self.features._tag = self

            normalized_repname = self.features.get("正規化代表表記")
            if normalized_repname is not None:
                self.repname = normalized_repname
                self.normalized_repname = normalized_repname
            head_repname = self.features.get("主辞代表表記")
            if head_repname is not None:
                self.head_repname = head_repname
            head_prime_repname = self.features.get("主辞’代表表記")
            if head_prime_repname:
                self.head_prime_repname = head_prime_repname
            pred_repname = self.features.get("用言代表表記")
            if pred_repname is not None:
                self.pred_repname = pred_repname
            disambiguated_pred_repname = self.features.get("標準用言代表表記")
            if disambiguated_pred_repname is not None:
                self.disambiguated_pred_repname = disambiguated_pred_repname

    def push_mrph(self, mrph):
        """Register a new morpheme object."""
        self._mrph_list.push_mrph(mrph)

    def set_midasi(self):
        """Set ``midasi`` to the surface string from ``get_surface()``."""
        self.midasi = self.get_surface()

    def spec(self):
        """Return the KNP output text corresponding to this basic phrase."""
        return "+ %d%s %s\n%s" % (self.parent_id, self.dpndtype, self.fstring,
                                  self._mrph_list.spec())

    def mrph_list(self):
        """Return all morpheme objects that make up this basic phrase.

        Returns:
            The internal morpheme container (``self._mrph_list``).
        """
        return self._mrph_list

    def pstring(self, string=None):
        """Get or set the string printed on the right side by draw_tree.

        A truthy ``string`` is stored (and None is returned); otherwise the
        currently stored string is returned.
        """
        if string:
            self._pstring = string
        else:
            return self._pstring

    def get_surface(self):
        """Return the surface string of this basic phrase.

        Returns:
            str: concatenation of ``midasi`` over all morphemes
        """
        return ''.join(mrph.midasi for mrph in self.mrph_list())
예제 #31
0
class Bunsetsu(object):
    """
    Holds the information about one bunsetsu (phrase), the unit of
    dependency parsing performed by KNP.

    Args:
        spec (str): the part of the KNP output that describes this bunsetsu
        bnst_id (int): bunsetsu ID
        newstyle (bool): when true, ``spec`` is parsed as tab-separated
            columns instead of the "* <id><type> <features>" layout

    Attributes:
        repname (str): normalized representative form ('' if absent)
        hrepname (str): head representative form ('' if absent)
        hprepname (str): head-prime representative form ('' if absent)
    """
    def __init__(self, spec, bnst_id=0, newstyle=False):
        self._mrph_list = MList()
        self._tag_list = TList()
        self.parent_id = -1
        self.parent = None
        self.children = []
        self.dpndtype = ''
        self.fstring = ''
        self._pstring = ''
        self.bnst_id = bnst_id
        # Initialise the representative-form attributes up front so they
        # exist on every code path (previously missing for newstyle input).
        self.repname = ''
        self.hrepname = ''
        self.hprepname = ''

        spec = spec.strip()
        if spec == '*':
            pass
        elif newstyle:
            # Tab-separated layout: fixed column positions.
            items = spec.split("\t")
            self.parent_id = int(items[2])
            self.dpndtype = items[3]
            self.fstring = items[17]
            self.repname = items[6]
        else:
            match = re.match(r'\* (-?\d+)([DPIA])(.*)$', spec)
            if match is None:
                sys.stderr.write("Illegal bunsetsu spec: %s\n" % spec)
                # sys.exit instead of quit(): quit is injected by the site
                # module and is not guaranteed to exist; both raise
                # SystemExit(1).
                sys.exit(1)
            self.parent_id = int(match.group(1))
            self.dpndtype = match.group(2)
            self.fstring = match.group(3).strip()

        # Extract the representative forms from the feature string.
        if not newstyle:
            for attr, pattern in (
                ('repname', r"<正規化代表表記:([^\"\s]+?)>"),
                ('hrepname', r"<主辞代表表記:([^\"\s]+?)>"),
                ('hprepname', r"<主辞’代表表記:([^\"\s]+?)>"),
            ):
                found = re.search(pattern, self.fstring)
                if found:
                    setattr(self, attr, found.group(1))

    def push_mrph(self, mrph):
        """Register a morpheme, forwarding it to the latest tag if any."""
        if self._tag_list:
            self._tag_list[-1].push_mrph(mrph)
        self._mrph_list.push_mrph(mrph)

    def push_tag(self, tag):
        """Register a tag; exits the process (status 1) if morphemes were
        pushed before any tag existed."""
        if not self._tag_list and self._mrph_list:
            sys.stderr.write("Unsafe addition of tags!\n")
            sys.exit(1)
        self._tag_list.push_tag(tag)

    def spec(self):
        """Return the KNP output text corresponding to this bunsetsu."""
        return "* %d%s %s\n%s" % (self.parent_id, self.dpndtype, self.fstring,
                                  self._tag_list.spec())

    def mrph_list(self):
        """Return the internal morpheme container."""
        return self._mrph_list

    def tag_list(self):
        """Return the internal tag container."""
        return self._tag_list

    def pstring(self, string=None):
        """Get or set the stored display string.

        A truthy ``string`` is stored (and None is returned); otherwise the
        currently stored string is returned.
        """
        if string:
            self._pstring = string
        else:
            return self._pstring

    def bnst_head(self):
        """Return the head tag of this bunsetsu.

        If there is exactly one tag it is returned; otherwise the first tag
        whose features do not contain '文節内' is returned, or None when no
        such tag exists.
        """
        tags = self.tag_list()
        if len(tags) == 1:
            return tags[0]
        for tag in tags:
            if '文節内' not in tag.features:
                return tag
        return None

    def recursive_children(self):
        """Return all descendants in post-order (each child's descendants
        precede the child itself)."""
        descendants = []

        def _collect(bnst):
            # Guard against a bunsetsu listed as its own child, which
            # would recurse forever.
            assert bnst not in bnst.children
            for child in bnst.children:
                _collect(child)
                descendants.append(child)

        _collect(self)
        return descendants

    def recursive_adnominals(self):
        """Return every direct child whose fstring contains '<連体修飾>',
        each preceded by all of its own descendants."""
        modifiers = []
        for c in self.children:
            if '<連体修飾>' in c.fstring:
                modifiers.extend(c.recursive_children())
                modifiers.append(c)
        return modifiers