示例#1
0
    def parse(self, spec):
        """Read KNP output and populate this object from it.

        The text is handled one line at a time. Blank lines are skipped and
        the leading characters of every other line select the action:

        * ``#`` followed by a tab: when the second tab-separated field is
          ``PAS``, the third field is appended to ``self._pinfos``.
        * any other ``#`` line: appended to ``self.comment`` (lines joined
          with a newline). An ``S-ID`` header sets ``self.sid``; a ``KNP++``
          marker without ``output:KNP`` switches ``self.juman_format`` to
          ``JUMAN_FORMAT.LATTICE_TOP_ONE``.
        * a line matching ``self.pattern``: parsing stops.
        * ``;;``: reported as an error.
        * ``*``: a new ``Bunsetsu`` is started.
        * ``+``: a ``Tag`` is pushed onto the latest ``Bunsetsu``; when the
          format is not ``JUMAN_FORMAT.DEFAULT`` a new ``Bunsetsu`` is first
          started from the same line.
        * ``!!`` / ``!``: a ``SynNodes`` / ``SynNode`` is pushed onto the
          latest ``Bunsetsu``'s tag list. Lines starting with ``! ! !`` are
          not treated as ``SynNode`` lines and fall through to the morpheme
          case.
        * ``EOS``: ignored.
        * anything else: parsed as a ``Morpheme`` and pushed onto the latest
          ``Bunsetsu`` (a placeholder ``Bunsetsu("*")`` is created when none
          exists yet).

        Args:
            spec (str): KNP output.

        Raises:
            Exception: If a line starts with ``;;``.
        """
        for string in spec.split('\n'):
            if string.strip() == "":
                continue
            if string.startswith('#\t'):
                items = string.split("\t")
                if len(items) >= 3 and items[1] == "PAS":
                    self._pinfos.append(items[2])
            elif string.startswith('#'):
                if self.comment:
                    self.comment += "\n"
                self.comment += string
                # Match the current line rather than the accumulated comment:
                # re.match anchors at the start of its subject, so matching
                # the whole comment only ever recognised an S-ID header that
                # was the very first comment line.
                match = re.match(r'# S-ID: ?(\S*)( .+)?$', string)
                if match:
                    self.sid = match.group(1)
                if 'KNP++' in string and 'output:KNP' not in string:
                    self.juman_format = JUMAN_FORMAT.LATTICE_TOP_ONE  # TODO
            elif re.match(self.pattern, string):
                break
            elif string.startswith(';;'):
                raise Exception("Error: %s" % string)
            elif string.startswith('*'):
                bnst = Bunsetsu(string, len(self._bnst))
                self._bnst.append(bnst)
            elif string.startswith('+'):
                if self.juman_format != JUMAN_FORMAT.DEFAULT:  # TODO
                    # In this format the '+' line also opens the Bunsetsu.
                    bnst = Bunsetsu(string, len(self._bnst), self.juman_format)
                    self._bnst.append(bnst)
                self._bnst[-1].push_tag(
                    Tag(string, len(self.tag_list()), self.juman_format))
            elif string.startswith('!!'):
                synnodes = SynNodes(string)
                self._bnst[-1].tag_list().push_synnodes(synnodes)
            elif string.startswith('!') and not string.startswith('! ! !'):
                synnode = SynNode(string)
                self._bnst[-1].tag_list().push_synnode(synnode)
            elif string.startswith('EOS'):
                pass
            else:
                mrph = Morpheme(string, len(self.mrph_list()),
                                self.juman_format)
                if not self._bnst:
                    # Morpheme seen before any '*' line: create a placeholder
                    # Bunsetsu so it has somewhere to live.
                    bnst = Bunsetsu("*", len(self._bnst))
                    self._bnst.append(bnst)
                self._bnst[-1].push_mrph(mrph)
示例#2
0
 def _parse_spec(self, spec):
     """Read KNP output and populate this object from it.

     The text is handled one line at a time. Blank lines are skipped and
     the leading characters of every other line select the action:

     * ``#`` followed by a tab: when the second tab-separated field is
       ``PAS``, the third field is appended to ``self._pinfos``.
     * any other ``#`` line: appended to ``self.comment`` together with a
       trailing newline. An ``S-ID`` header sets ``self.sid``; a ``KNP++``
       marker sets ``self.newstyle`` to ``True``.
     * a line matching ``self.pattern``: parsing stops.
     * ``;;``: the line is written to stderr and the process exits.
     * ``*``: a new ``Bunsetsu`` is started.
     * ``+``: a ``Tag`` is pushed onto the latest ``Bunsetsu``; when
       ``self.newstyle`` is set a new ``Bunsetsu`` is first started from
       the same line.
     * ``!!`` / ``!``: a ``SynNodes`` / ``SynNode`` is pushed onto the
       latest ``Bunsetsu``'s tag list. Lines starting with ``! ! !`` are
       not treated as ``SynNode`` lines and fall through to the morpheme
       case.
     * ``EOS``: ignored.
     * anything else: parsed as a ``Morpheme`` and pushed onto the latest
       ``Bunsetsu`` (a placeholder ``Bunsetsu("*")`` is created when none
       exists yet).

     Args:
         spec (str): KNP output.

     Raises:
         SystemExit: With exit status 1 if a line starts with ``;;``.
     """
     for string in spec.split('\n'):
         if string.strip() == "":
             continue
         if string.startswith('#\t'):
             items = string.split("\t")
             if len(items) >= 3 and items[1] == "PAS":
                 self._pinfos.append(items[2])
         elif string.startswith('#'):
             self.comment += string
             self.comment += "\n"
             # Match the current line (plus the newline terminator the
             # pattern expects) rather than the accumulated comment:
             # re.match anchors at the start of its subject, so matching the
             # whole comment only ever recognised an S-ID header that was the
             # very first comment line.
             match = re.match(r'# S-ID:(.*?)[ $\n]', string + "\n")
             if match:
                 self.sid = match.group(1)
             if 'KNP++' in string:
                 self.newstyle = True
         elif re.match(self.pattern, string):
             break
         elif string.startswith(';;'):
             sys.stderr.write("Error: %s\n" % string)
             # sys.exit instead of quit(): quit is injected by the site
             # module for interactive use and is not guaranteed to exist.
             sys.exit(1)
         elif string.startswith('*'):
             bnst = Bunsetsu(string, len(self._bnst))
             self._bnst.append(bnst)
         elif string.startswith('+'):
             if self.newstyle:
                 # In the new style the '+' line also opens the Bunsetsu.
                 bnst = Bunsetsu(string, len(self._bnst), self.newstyle)
                 self._bnst.append(bnst)
             self._bnst[-1].push_tag(
                 Tag(string, len(self.tag_list()), self.newstyle))
         elif string.startswith('!!'):
             synnodes = SynNodes(string)
             self._bnst[-1].tag_list().push_synnodes(synnodes)
         elif string.startswith('!') and not string.startswith('! ! !'):
             synnode = SynNode(string)
             self._bnst[-1].tag_list().push_synnode(synnode)
         elif string.startswith('EOS'):
             pass
         else:
             mrph = Morpheme(string, len(self.mrph_list()), self.newstyle)
             if not self._bnst:
                 # Morpheme seen before any '*' line: create a placeholder
                 # Bunsetsu so it has somewhere to live.
                 bnst = Bunsetsu("*", len(self._bnst))
                 self._bnst.append(bnst)
             self._bnst[-1].push_mrph(mrph)
示例#3
0
    def get_bnst_span(self, bnst: Bunsetsu) -> Tuple[int, int]:
        """Return the character span in the original sentence covered by a bunsetsu.

        The span of every tag in ``bnst.tag_list()`` is looked up through
        ``self.get_tag_span`` and the results are merged into one span.

        Example: in '日本経済新聞の記者', the span of bnst('日本経済新聞の')
        is (0, 6).

        Args:
            bnst: The bunsetsu whose tags are inspected.

        Returns:
            The single tag span unchanged when there is only one tag;
            otherwise the start of the first tag span paired with the end of
            the last one (assumes the tag spans are ordered and contiguous;
            confirm against ``get_tag_span``).
        """
        tag_spans = [self.get_tag_span(tag.tag_id) for tag in bnst.tag_list()]
        if len(tag_spans) < 2:
            # Nothing to merge: hand back the only span as-is.
            return tag_spans[0]
        span_start = tag_spans[0][0]
        span_end = tag_spans[-1][1]
        return span_start, span_end