Пример #1
0
 def test(self):
     """Check TList length and per-tag morpheme counts as items are pushed.

     A tag that already holds a morpheme is pushed first, then an empty
     tag, and finally a morpheme is pushed onto the list itself; the
     assertions expect that morpheme to end up in the second tag.
     """
     tags = TList()
     head_tag = Tag(
         u"+ 1D <BGH:構文/こうぶん><文節内><係:文節内><文頭>"
         u"<体言><名詞項候補><先行詞候補><正規化代表表記:構文/こうぶん>")
     head_mrph = Morpheme(
         u"構文 こうぶん 構文 名詞 6 普通名詞 1 * 0 * 0 \""
         u"代表表記:構文/こうぶん カテゴリ:抽象物\" "
         u"<代表表記:構文/こうぶん>")
     tail_tag = Tag(
         u"+ -1D <BGH:解析/かいせき><文末><体言><用言:判>"
         u"<体言止><レベル:C>")
     tail_mrph = Morpheme(
         u"解析 かいせき 解析 名詞 6 サ変名詞 2 * 0 * 0 \""
         u"代表表記:解析/かいせき カテゴリ:抽象物 ドメイン:教育・学習;"
         u"科学・技術\" <代表表記:解析/かいせき>")

     def mrph_count(index):
         # Number of morphemes held by the tag at the given position.
         return len(tags[index].mrph_list())

     # A tag that already carries its morpheme.
     head_tag.push_mrph(head_mrph)
     tags.push_tag(head_tag)
     self.assertEqual(len(tags), 1)
     self.assertEqual(mrph_count(0), 1)

     # A tag pushed before it has any morpheme.
     tags.push_tag(tail_tag)
     self.assertEqual(len(tags), 2)
     self.assertEqual(mrph_count(1), 0)

     # A morpheme pushed on the list is expected to land in the second tag.
     tags.push_mrph(tail_mrph)
     self.assertEqual(len(tags), 2)
     self.assertEqual(mrph_count(0), 1)
     self.assertEqual(mrph_count(1), 1)
Пример #2
0
 def __init__(self, spec="", juman_format=JUMAN_FORMAT.DEFAULT):
     """Build the morpheme list, optionally parsing it from ``spec``.

     ``spec`` is split on newlines and each line is handled by prefix:

     * blank lines are skipped;
     * lines starting with ``#`` are appended to ``self.comment``
       (no separator is inserted between them);
     * lines starting with ``@`` but not ``@ @`` are parsed, minus their
       first two characters, and attached to the previous morpheme via
       ``push_doukei``;
     * every other line is parsed as a morpheme and pushed.

     When ``juman_format`` is ``JUMAN_FORMAT.LATTICE_TOP_ONE``, morphemes
     whose ``ranks`` lack ``1`` are dropped, and a morpheme sharing
     ``mrph_id`` with the previous one is attached to it via
     ``push_doukei`` instead of being pushed.

     Args:
         spec (str): Text to parse; the empty string yields an empty list.
         juman_format: Format selector passed on to ``Morpheme``.
     """
     self._mrph = []
     self._readonly = False
     self.comment = ""
     if spec == "":
         return

     next_id = 1
     for row in spec.split("\n"):
         if not row.strip():
             continue
         if row.startswith('#'):
             self.comment += row
             continue
         if row.startswith('@') and not row.startswith('@ @'):
             previous = self._mrph[-1]
             previous.push_doukei(Morpheme(row[2:], next_id, juman_format))
             next_id += 1
             continue

         candidate = Morpheme(row, next_id, juman_format)
         if juman_format == JUMAN_FORMAT.LATTICE_TOP_ONE:
             if 1 not in candidate.ranks:
                 # Not a top-ranked lattice node: ignore it entirely.
                 continue
             if self._mrph and self._mrph[-1].mrph_id == candidate.mrph_id:
                 # Same id as the previous morpheme: keep it as a variant
                 # of that morpheme (the id counter is not advanced).
                 self._mrph[-1].push_doukei(candidate)
                 continue
         self.push_mrph(candidate)
         next_id += 1
Пример #3
0
 def test_mrph(self):
     """Check that pushed morphemes accumulate in the bunsetsu in order."""
     bunsetsu = Bunsetsu(self.bunsetsu_str)
     mrph_specs = (self.mrph1_str, self.mrph2_str)
     for expected_count, mrph_spec in enumerate(mrph_specs, 1):
         morpheme = Morpheme(mrph_spec)
         bunsetsu.push_mrph(morpheme)
         self.assertEqual(len(bunsetsu.mrph_list()), expected_count)

     # The concatenated surface forms must read as the full phrase.
     surface = ''.join([m.midasi for m in bunsetsu.mrph_list()])
     self.assertEqual(surface, '構文解析')
Пример #4
0
 def test_spec(self):
     """Check that a bunsetsu rebuilt from its parts yields the fixture spec."""
     bunsetsu = Bunsetsu(self.bunsetsu_str)
     parts = (
         (self.tag1_str, self.mrph1_str),
         (self.tag2_str, self.mrph2_str),
     )
     for tag_spec, mrph_spec in parts:
         tag = Tag(tag_spec)
         morpheme = Morpheme(mrph_spec)
         tag.push_mrph(morpheme)
         bunsetsu.push_tag(tag)
     self.assertEqual(bunsetsu.spec(), self.spec)
Пример #5
0
 def test(self):
     """Check Tag header parsing and morpheme accumulation."""
     header = "+ 1D <BGH:構文/こうぶん><文節内><係:文節内><文頭><体言><名詞項候補><先行詞候補><正規化代表表記:構文/こうぶん>"
     tag = Tag(header, 2)

     # Fields taken from the constructor argument and the header line.
     self.assertEqual(tag.tag_id, 2)
     self.assertEqual(tag.dpndtype, 'D')
     self.assertEqual(tag.parent_id, 1)
     self.assertEqual(len(tag.mrph_list()), 0)

     morphemes = [
         Morpheme("構文 こうぶん 構文 名詞 6 普通名詞 1 * 0 * 0 \"代表表記:構文/こうぶん カテゴリ:抽象物\" <代表表記:構文/こうぶん>"),
         Morpheme("解析 かいせき 解析 名詞 6 サ変名詞 2 * 0 * 0 \"代表表記:解析/かいせき カテゴリ:抽象物 ドメイン:教育・学習;科学・技術\" <代表表記:解析/かいせき>"),
     ]
     # Each push must grow the tag's morpheme list by exactly one.
     for expected_count, morpheme in enumerate(morphemes, 1):
         tag.push_mrph(morpheme)
         self.assertEqual(len(tag.mrph_list()), expected_count)
     self.assertEqual(tag.get_surface(), '構文解析')
Пример #6
0
 def __init__(self, spec=""):
     """Build the morpheme list, optionally parsing it from ``spec``.

     ``spec`` is split on newlines and each line is handled by prefix:
     blank lines are skipped; ``#`` lines are appended to
     ``self.comment`` (no separator is inserted); lines starting with
     ``@`` but not ``@ @`` are parsed, minus their first two characters,
     and attached to the previous morpheme via ``push_doukei``; any other
     line is parsed as a morpheme and pushed.

     Args:
         spec (str): Text to parse; the empty string yields an empty list.
     """
     self._mrph = []
     self._readonly = False
     self.comment = ""
     if spec == "":
         return

     next_id = 1
     for row in spec.split("\n"):
         if not row.strip():
             continue
         if row.startswith('#'):
             self.comment += row
             continue
         if row.startswith('@') and not row.startswith('@ @'):
             # Alternative reading of the previous morpheme.
             previous = self._mrph[-1]
             previous.push_doukei(Morpheme(row[2:], next_id))
         else:
             self.push_mrph(Morpheme(row, next_id))
         next_id += 1
Пример #7
0
    def parse(self, spec):
        """Read KNP output and populate this object from it.

        ``spec`` is split on newlines and every non-blank line is
        dispatched on its prefix, in this order:

        * ``#<TAB>``: tab-separated; when the second field is ``PAS`` the
          third field is appended to ``self._pinfos``.
        * ``#``: appended to ``self.comment`` (newline-joined); the
          accumulated comment is matched for an ``S-ID`` to set
          ``self.sid``, and a ``KNP++`` marker without ``output:KNP``
          switches ``self.juman_format`` to ``LATTICE_TOP_ONE``.
        * a line matching ``self.pattern``: stops parsing.
        * ``;;``: raises.
        * ``*``: starts a new bunsetsu.
        * ``+``: adds a tag to the last bunsetsu (a new bunsetsu is
          created first when the format is not ``DEFAULT``).
        * ``!!`` / ``!`` (but not ``! ! !``): pushed as synnodes / synnode
          on the last bunsetsu's tag list.
        * ``EOS``: ignored.
        * anything else: parsed as a morpheme and pushed onto the last
          bunsetsu, creating a ``"*"`` bunsetsu if none exists yet.

        Args:
            spec (str): KNP output.

        Raises:
            Exception: If a line starts with ``;;``.
        """
        for line in spec.split('\n'):
            if not line.strip():
                continue
            if line.startswith('#\t'):
                fields = line.split("\t")
                if len(fields) >= 3 and fields[1] == "PAS":
                    self._pinfos.append(fields[2])
            elif line.startswith('#'):
                # Comment lines are joined with newlines, none trailing.
                separator = "\n" if self.comment else ""
                self.comment += separator + line
                sid_match = re.match(r'# S-ID: ?(\S*)( .+)?$', self.comment)
                if sid_match:
                    self.sid = sid_match.group(1)
                if 'KNP++' in line and 'output:KNP' not in line:
                    self.juman_format = JUMAN_FORMAT.LATTICE_TOP_ONE  # TODO
            elif re.match(self.pattern, line):
                break
            elif line.startswith(';;'):
                raise Exception("Error: %s" % line)
            elif line.startswith('*'):
                self._bnst.append(Bunsetsu(line, len(self._bnst)))
            elif line.startswith('+'):
                if self.juman_format != JUMAN_FORMAT.DEFAULT:  # TODO
                    self._bnst.append(
                        Bunsetsu(line, len(self._bnst), self.juman_format))
                current = self._bnst[-1]
                current.push_tag(
                    Tag(line, len(self.tag_list()), self.juman_format))
            elif line.startswith('!!'):
                nodes = SynNodes(line)
                self._bnst[-1].tag_list().push_synnodes(nodes)
            elif line.startswith('!') and not line.startswith('! ! !'):
                node = SynNode(line)
                self._bnst[-1].tag_list().push_synnode(node)
            elif line.startswith('EOS'):
                pass
            else:
                morpheme = Morpheme(line, len(self.mrph_list()),
                                    self.juman_format)
                if not self._bnst:
                    # Morpheme seen before any bunsetsu header: open one.
                    self._bnst.append(Bunsetsu("*", len(self._bnst)))
                self._bnst[-1].push_mrph(morpheme)
Пример #8
0
 def _parse_spec(self, spec):
     """Populate this object from a multi-line analyzer output string.

     ``spec`` is split on newlines and every non-blank line is dispatched
     on its prefix, in this order:

     * ``#<TAB>``: tab-separated; when the second field is ``PAS`` the
       third field is appended to ``self._pinfos``.
     * ``#``: appended, newline-terminated, to ``self.comment``; the
       accumulated comment is matched for an ``S-ID`` to set ``self.sid``
       and a ``KNP++`` marker sets ``self.newstyle``.
     * a line matching ``self.pattern``: stops parsing.
     * ``;;``: reported on stderr, then the process exits with status 1.
     * ``*``: starts a new bunsetsu.
     * ``+``: adds a tag to the last bunsetsu (a new bunsetsu is created
       first when ``self.newstyle`` is set).
     * ``!!`` / ``!`` (but not ``! ! !``): pushed as synnodes / synnode on
       the last bunsetsu's tag list.
     * ``EOS``: ignored.
     * anything else: parsed as a morpheme and pushed onto the last
       bunsetsu, creating a ``"*"`` bunsetsu if none exists yet.

     Args:
         spec (str): Text to parse.

     Raises:
         SystemExit: With status 1 if a line starts with ``;;``.
     """
     for string in spec.split('\n'):
         if string.strip() == "":
             continue
         if string.startswith('#\t'):
             items = string.split("\t")
             if len(items) >= 3 and items[1] == "PAS":
                 self._pinfos.append(items[2])
         elif string.startswith('#'):
             self.comment += string
             self.comment += "\n"
             # The ID ends at the first space or at the end of the line.
             # The newline appended above marks the end of line; a '$'
             # inside a character class would be a literal dollar sign,
             # not an anchor, so it must not be listed here.
             match = re.match(r'# S-ID:(.*?)[ \n]', self.comment)
             if match:
                 self.sid = match.group(1)
             if 'KNP++' in string:
                 self.newstyle = True
         elif re.match(self.pattern, string):
             break
         elif string.startswith(';;'):
             sys.stderr.write("Error: %s\n" % string)
             # sys.exit raises the same SystemExit as quit() but does not
             # depend on the interactive helpers installed by `site`.
             sys.exit(1)
         elif string.startswith('*'):
             bnst = Bunsetsu(string, len(self._bnst))
             self._bnst.append(bnst)
         elif string.startswith('+'):
             if self.newstyle:
                 # In the new style every tag line opens its own bunsetsu.
                 bnst = Bunsetsu(string, len(self._bnst), self.newstyle)
                 self._bnst.append(bnst)
             self._bnst[-1].push_tag(
                 Tag(string, len(self.tag_list()), self.newstyle))
         elif string.startswith('!!'):
             synnodes = SynNodes(string)
             self._bnst[-1].tag_list().push_synnodes(synnodes)
         elif string.startswith('!') and not string.startswith('! ! !'):
             synnode = SynNode(string)
             self._bnst[-1].tag_list().push_synnode(synnode)
         elif string.startswith('EOS'):
             pass
         else:
             mrph = Morpheme(string, len(self.mrph_list()), self.newstyle)
             if not self._bnst:
                 # Morpheme seen before any bunsetsu header: open one.
                 bnst = Bunsetsu("*", len(self._bnst))
                 self._bnst.append(bnst)
             self._bnst[-1].push_mrph(mrph)