def test(self):
    """Exercise TList.push_tag / TList.push_mrph and the resulting counts.

    The test expects that a tag keeps the morphemes pushed onto it before
    it is added to the list, that a freshly added tag starts without
    morphemes, and that a morpheme pushed onto the list ends up in the
    most recently added tag only.
    """
    first_tag_spec = (u"+ 1D <BGH:構文/こうぶん><文節内><係:文節内><文頭>"
                      u"<体言><名詞項候補><先行詞候補><正規化代表表記:構文/こうぶん>")
    first_mrph_spec = (u"構文 こうぶん 構文 名詞 6 普通名詞 1 * 0 * 0 \""
                       u"代表表記:構文/こうぶん カテゴリ:抽象物\" "
                       u"<代表表記:構文/こうぶん>")
    second_tag_spec = (u"+ -1D <BGH:解析/かいせき><文末><体言><用言:判>"
                       u"<体言止><レベル:C>")
    second_mrph_spec = (u"解析 かいせき 解析 名詞 6 サ変名詞 2 * 0 * 0 \""
                        u"代表表記:解析/かいせき カテゴリ:抽象物 ドメイン:教育・学習;"
                        u"科学・技術\" <代表表記:解析/かいせき>")

    tags = TList()
    first_tag = Tag(first_tag_spec)
    first_mrph = Morpheme(first_mrph_spec)
    second_tag = Tag(second_tag_spec)
    second_mrph = Morpheme(second_mrph_spec)

    # A tag that already carries a morpheme keeps it once it is in the list.
    first_tag.push_mrph(first_mrph)
    tags.push_tag(first_tag)
    self.assertEqual(len(tags), 1)
    self.assertEqual(len(tags[0].mrph_list()), 1)

    # A tag added without morphemes starts out empty.
    tags.push_tag(second_tag)
    self.assertEqual(len(tags), 2)
    self.assertEqual(len(tags[1].mrph_list()), 0)

    # Pushing a morpheme onto the list must only affect the second tag.
    tags.push_mrph(second_mrph)
    self.assertEqual(len(tags), 2)
    self.assertEqual(len(tags[0].mrph_list()), 1)
    self.assertEqual(len(tags[1].mrph_list()), 1)
def __init__(self, spec="", juman_format=JUMAN_FORMAT.DEFAULT):
    """Build the morpheme list, optionally parsing it from ``spec``.

    ``spec`` is split on newlines and each non-blank line is handled by
    its prefix:

    * ``#``  - the line is appended to ``self.comment`` (no separator).
    * ``@`` (but not ``@ @``) - the text after the first two characters
      is parsed as a Morpheme and attached to the previously stored
      morpheme through ``push_doukei``.
    * anything else - parsed as a Morpheme and added via ``push_mrph``.

    When ``juman_format`` is ``JUMAN_FORMAT.LATTICE_TOP_ONE``, a morpheme
    whose ``ranks`` does not contain 1 is dropped, and a morpheme whose
    ``mrph_id`` equals that of the last stored morpheme is attached to it
    with ``push_doukei`` instead of being added as a new entry.

    Args:
        spec (str): Analyzer output to parse; an empty string yields an
            empty list.
        juman_format: Format selector forwarded to every Morpheme.
    """
    self._mrph = []
    self._readonly = False
    self.comment = ""
    if spec == "":
        return

    top_one_only = juman_format == JUMAN_FORMAT.LATTICE_TOP_ONE
    next_id = 1
    for raw in spec.split("\n"):
        if not raw.strip():
            continue
        if raw.startswith('#'):
            self.comment += raw
            continue
        if raw.startswith('@') and not raw.startswith('@ @'):
            # Attached to the previous morpheme; still consumes an id.
            self._mrph[-1].push_doukei(
                Morpheme(raw[2:], next_id, juman_format))
            next_id += 1
            continue

        candidate = Morpheme(raw, next_id, juman_format)
        if top_one_only:
            # Dropped or merged candidates do not consume an id.
            if 1 not in candidate.ranks:
                continue
            if self._mrph and self._mrph[-1].mrph_id == candidate.mrph_id:
                self._mrph[-1].push_doukei(candidate)
                continue
        self.push_mrph(candidate)
        next_id += 1
def test_mrph(self):
    """Push two morphemes onto a Bunsetsu and check count and surface order."""
    bunsetsu = Bunsetsu(self.bunsetsu_str)
    fixtures = (self.mrph1_str, self.mrph2_str)
    for expected_count, mrph_spec in enumerate(fixtures, start=1):
        bunsetsu.push_mrph(Morpheme(mrph_spec))
        self.assertEqual(len(bunsetsu.mrph_list()), expected_count)

    # The concatenated surface forms must come back in push order.
    surface = ''.join(m.midasi for m in bunsetsu.mrph_list())
    self.assertEqual(surface, '構文解析')
def test_spec(self):
    """Rebuild a Bunsetsu from two tag/morpheme pairs and compare spec()."""
    bunsetsu = Bunsetsu(self.bunsetsu_str)
    fixture_pairs = (
        (self.tag1_str, self.mrph1_str),
        (self.tag2_str, self.mrph2_str),
    )
    for tag_spec, mrph_spec in fixture_pairs:
        tag = Tag(tag_spec)
        # Each tag receives its morpheme before it joins the bunsetsu.
        tag.push_mrph(Morpheme(mrph_spec))
        bunsetsu.push_tag(tag)

    self.assertEqual(bunsetsu.spec(), self.spec)
def test(self):
    """Check Tag header parsing and morpheme accumulation.

    A Tag built with id 2 from a ``+ 1D ...`` line is expected to report
    tag_id 2, dependency type 'D', parent id 1 and no morphemes; pushing
    two morphemes must grow the list and yield the joined surface.
    """
    tag = Tag(
        "+ 1D <BGH:構文/こうぶん><文節内><係:文節内><文頭><体言><名詞項候補><先行詞候補><正規化代表表記:構文/こうぶん>",
        2,
    )
    self.assertEqual(tag.tag_id, 2)
    self.assertEqual(tag.dpndtype, 'D')
    self.assertEqual(tag.parent_id, 1)
    self.assertEqual(len(tag.mrph_list()), 0)

    # Both morphemes are constructed before either one is pushed.
    morphemes = [
        Morpheme("構文 こうぶん 構文 名詞 6 普通名詞 1 * 0 * 0 \"代表表記:構文/こうぶん カテゴリ:抽象物\" <代表表記:構文/こうぶん>"),
        Morpheme("解析 かいせき 解析 名詞 6 サ変名詞 2 * 0 * 0 \"代表表記:解析/かいせき カテゴリ:抽象物 ドメイン:教育・学習;科学・技術\" <代表表記:解析/かいせき>"),
    ]
    for expected_count, morpheme in enumerate(morphemes, start=1):
        tag.push_mrph(morpheme)
        self.assertEqual(len(tag.mrph_list()), expected_count)

    self.assertEqual(tag.get_surface(), '構文解析')
def __init__(self, spec=""):
    """Build the morpheme list, optionally parsing it from ``spec``.

    ``spec`` is split on newlines. Blank lines are ignored, lines that
    start with ``#`` are appended to ``self.comment`` (no separator), and
    every remaining line is parsed as a Morpheme numbered from 1 upwards.
    A line starting with ``@`` (but not ``@ @``) has its first two
    characters removed and is attached to the previously stored morpheme
    through ``push_doukei``; any other line is added with ``push_mrph``.

    Args:
        spec (str): Analyzer output to parse; an empty string yields an
            empty list.
    """
    self._mrph = []
    self._readonly = False
    self.comment = ""
    if spec == "":
        return

    next_id = 1
    for entry in spec.split("\n"):
        if not entry.strip():
            continue
        if entry.startswith('#'):
            self.comment += entry
            continue

        attaches_to_previous = (
            entry.startswith('@') and not entry.startswith('@ @'))
        if attaches_to_previous:
            self._mrph[-1].push_doukei(Morpheme(entry[2:], next_id))
        else:
            self.push_mrph(Morpheme(entry, next_id))
        # Both kinds of morpheme line consume one id.
        next_id += 1
def parse(self, spec):
    """
    Read KNP output and populate this object from it.

    Each non-blank line of ``spec`` is dispatched on its prefix:

    * ``#<TAB>``  - tab-separated info line; when the second field is
      ``PAS`` the third field is appended to ``self._pinfos``.
    * ``#``       - comment line, appended to ``self.comment`` (lines are
      joined with a newline). An ``# S-ID:`` line sets ``self.sid``.
      A line containing ``KNP++`` but not ``output:KNP`` switches
      ``self.juman_format`` to ``JUMAN_FORMAT.LATTICE_TOP_ONE``.
    * a line matching ``self.pattern`` - stops parsing.
    * ``;;``      - raises an exception carrying the line.
    * ``*``       - starts a new Bunsetsu.
    * ``+``       - adds a Tag to the last Bunsetsu (in a non-default
      format a new Bunsetsu is created for it first).
    * ``!!`` / ``!`` (but not ``! ! !``) - SynNodes / SynNode pushed to
      the tag list of the last Bunsetsu.
    * ``EOS``     - ignored.
    * anything else - parsed as a Morpheme and pushed onto the last
      Bunsetsu; a placeholder ``*`` Bunsetsu is created if none exists.

    Args:
        spec (str): KNP output.

    Raises:
        Exception: If a line starts with ``;;``; the message is
            ``"Error: <line>"``.
    """
    for string in spec.split('\n'):
        if string.strip() == "":
            continue
        if string.startswith('#\t'):
            items = string.split("\t")
            if len(items) >= 3 and items[1] == "PAS":
                self._pinfos.append(items[2])
        elif string.startswith('#'):
            if self.comment:
                self.comment += "\n"
            self.comment += string
            # Match the current line, not the accumulated comment: the
            # pattern is anchored at the start and `$` (without
            # re.MULTILINE) only matches at the very end, so matching
            # against self.comment missed any S-ID line that was not the
            # first comment line.
            match = re.match(r'# S-ID: ?(\S*)( .+)?$', string)
            if match:
                self.sid = match.group(1)
            if 'KNP++' in string and 'output:KNP' not in string:
                self.juman_format = JUMAN_FORMAT.LATTICE_TOP_ONE  # TODO
        elif re.match(self.pattern, string):
            break
        elif string.startswith(';;'):
            raise Exception("Error: %s" % string)
        elif string.startswith('*'):
            bnst = Bunsetsu(string, len(self._bnst))
            self._bnst.append(bnst)
        elif string.startswith('+'):
            if self.juman_format != JUMAN_FORMAT.DEFAULT:  # TODO
                # In a non-default format every '+' line gets its own
                # Bunsetsu, built from the same line.
                bnst = Bunsetsu(string, len(self._bnst), self.juman_format)
                self._bnst.append(bnst)
            self._bnst[-1].push_tag(
                Tag(string, len(self.tag_list()), self.juman_format))
        elif string.startswith('!!'):
            synnodes = SynNodes(string)
            self._bnst[-1].tag_list().push_synnodes(synnodes)
        elif string.startswith('!') and not string.startswith('! ! !'):
            synnode = SynNode(string)
            self._bnst[-1].tag_list().push_synnode(synnode)
        elif string.startswith('EOS'):
            pass
        else:
            mrph = Morpheme(string, len(self.mrph_list()), self.juman_format)
            if not self._bnst:
                # A morpheme with no preceding '*' line: create a
                # placeholder Bunsetsu to hold it.
                bnst = Bunsetsu("*", len(self._bnst))
                self._bnst.append(bnst)
            self._bnst[-1].push_mrph(mrph)
def _parse_spec(self, spec):
    """Parse KNP-style output in ``spec`` and populate this object.

    Each non-blank line is dispatched on its prefix:

    * ``#<TAB>``  - tab-separated info line; when the second field is
      ``PAS`` the third field is appended to ``self._pinfos``.
    * ``#``       - comment line, appended to ``self.comment`` followed
      by a newline. An ``# S-ID:`` line sets ``self.sid``; a line
      containing ``KNP++`` sets ``self.newstyle``.
    * a line matching ``self.pattern`` - stops parsing.
    * ``;;``      - the line is reported on stderr and the process exits
      with status 1.
    * ``*``       - starts a new Bunsetsu.
    * ``+``       - adds a Tag to the last Bunsetsu (when
      ``self.newstyle`` is set, a new Bunsetsu is created for it first).
    * ``!!`` / ``!`` (but not ``! ! !``) - SynNodes / SynNode pushed to
      the tag list of the last Bunsetsu.
    * ``EOS``     - ignored.
    * anything else - parsed as a Morpheme and pushed onto the last
      Bunsetsu; a placeholder ``*`` Bunsetsu is created if none exists.

    Args:
        spec (str): Analyzer output, one record per line.

    Raises:
        SystemExit: With code 1 if a line starts with ``;;``.
    """
    for string in spec.split('\n'):
        if string.strip() == "":
            continue
        if string.startswith('#\t'):
            items = string.split("\t")
            if len(items) >= 3 and items[1] == "PAS":
                self._pinfos.append(items[2])
        elif string.startswith('#'):
            self.comment += string
            self.comment += "\n"
            # Match the current line (plus the newline terminator the
            # pattern relies on) rather than the accumulated comment:
            # re.match is anchored at the start, so matching against
            # self.comment only ever saw the first comment line.
            # NOTE: `$` inside the character class is a literal dollar
            # sign, not an end-of-line anchor; pattern kept unchanged.
            match = re.match(r'# S-ID:(.*?)[ $\n]', string + "\n")
            if match:
                self.sid = match.group(1)
            if 'KNP++' in string:
                self.newstyle = True
        elif re.match(self.pattern, string):
            break
        elif string.startswith(';;'):
            sys.stderr.write("Error: %s\n" % string)
            # sys.exit instead of quit(): quit is injected by the `site`
            # module for interactive use and is missing under `python -S`.
            # Both raise SystemExit(1).
            sys.exit(1)
        elif string.startswith('*'):
            bnst = Bunsetsu(string, len(self._bnst))
            self._bnst.append(bnst)
        elif string.startswith('+'):
            if self.newstyle:
                # In the new style every '+' line gets its own Bunsetsu,
                # built from the same line.
                bnst = Bunsetsu(string, len(self._bnst), self.newstyle)
                self._bnst.append(bnst)
            self._bnst[-1].push_tag(
                Tag(string, len(self.tag_list()), self.newstyle))
        elif string.startswith('!!'):
            synnodes = SynNodes(string)
            self._bnst[-1].tag_list().push_synnodes(synnodes)
        elif string.startswith('!') and not string.startswith('! ! !'):
            synnode = SynNode(string)
            self._bnst[-1].tag_list().push_synnode(synnode)
        elif string.startswith('EOS'):
            pass
        else:
            mrph = Morpheme(string, len(self.mrph_list()), self.newstyle)
            if not self._bnst:
                # A morpheme with no preceding '*' line: create a
                # placeholder Bunsetsu to hold it.
                bnst = Bunsetsu("*", len(self._bnst))
                self._bnst.append(bnst)
            self._bnst[-1].push_mrph(mrph)