def test_parse_args(self):
    '''Check that parse raises MeCabError for each kind of bad argument.'''
    # One entry per case: (MeCab constructor args, parse args, parse kwargs).
    # A fresh MeCab instance is created for every case.
    bad_calls = [
        # None text
        ((), (None,), {}),
        # text must be str
        ((), (99,), {}),
        # boundary_constraints must be re or str
        ((), ('foo',), {'boundary_constraints': 99.99}),
        # feature_constraints must be tuple
        ((), ('foo',), {'feature_constraints': []}),
        # -p / --partial, text must end with a newline
        (('--partial',), ('foo',), {}),
    ]
    for ctor_args, parse_args, parse_kwargs in bad_calls:
        with mecab.MeCab(*ctor_args) as nm:
            with self.assertRaises(api.MeCabError):
                nm.parse(*parse_args, **parse_kwargs)
def test_parse_tostr_default(self):
    '''Compare default string parsing with the _mecab_parse helper output.'''
    with mecab.MeCab() as nm:
        parsed = nm.parse(self.text).strip()
        # Swap every '\n' for the platform line separator; presumably the
        # helper output uses os.linesep -- TODO confirm.
        expected = os.linesep.join(parsed.split('\n'))
        raw = self._mecab_parse('')
        actual = self._2bytes(raw)
        self.assertEqual(expected, actual)
def test_sysdic(self):
    '''Test dictionary interface on system dictionary.

    Inspects the first entry of ``nm.dicts`` and checks its charset,
    file path suffix, type and version.
    '''
    with mecab.MeCab() as nm:
        sysdic = nm.dicts[0]
        cs = sysdic.charset.lower()
        self.assertIn(cs, self.CHARSETS)
        # The dot is escaped so that only a literal "sys.dic" suffix
        # matches; an unescaped '.' would accept any character there.
        self.assertIsNotNone(re.search(r'sys\.dic$', sysdic.filepath))
        self.assertEqual(sysdic.type, 0)
        self.assertEqual(sysdic.version, 102)
def test_parse_tostr_partial(self):
    '''Check -p / --partial string output line by line against fixture text10.'''
    with mecab.MeCab('-p') as nm:
        fixture = self.yaml.get('text10')
        text = self._u2str(fixture.get('text'))
        lines = nm.parse(text).split('\n')
        raw_prefixes = fixture.get('expected').get('str').split(',')
        prefixes = [self._u2str(p) for p in raw_prefixes]
        # Every output line must begin with the prefix at the same index.
        for idx, line in enumerate(lines):
            self.assertTrue(line.startswith(prefixes[idx]))
def test_parse_mecab_options_marginal(self):
    '''Option parsing for marginal: short flag, long flag and dict input.'''
    with mecab.MeCab() as nm:
        for variant in ('-m', '--marginal', {'marginal': True}):
            parsed = nm._MeCab__parse_mecab_options(variant)
            self.assertDictEqual(parsed, {'marginal': True})
def test_parse_mecab_options_allocatesentence(self):
    '''Option parsing for allocate-sentence: short, long and dict input.'''
    with mecab.MeCab() as nm:
        variants = ('-C', '--allocate-sentence', {'allocate_sentence': True})
        for variant in variants:
            parsed = nm._MeCab__parse_mecab_options(variant)
            self.assertDictEqual(parsed, {'allocate_sentence': True})
def test_parse_mecab_options_allmorphs(self):
    '''Option parsing for all-morphs: short flag, long flag and dict input.'''
    with mecab.MeCab() as nm:
        for variant in ('-a', '--all-morphs', {'all_morphs': True}):
            parsed = nm._MeCab__parse_mecab_options(variant)
            self.assertDictEqual(parsed, {'all_morphs': True})
def test_parse_unicodeRstr(self):
    '''Test parse: unicode input (Python 2) and bytes input (Python 3).

    In both cases the value handed to parse is not the native ``str``
    type, and parse is expected to raise MeCabError.
    '''
    s = '日本語だよ、これが。'
    with mecab.MeCab() as nm:
        # Compare the numeric major version: the string test
        # ``sys.version < '3'`` is lexicographic and would misclassify
        # any major version with two or more digits.
        if sys.version_info[0] < 3:
            b = s.decode('utf-8')
        else:
            b = s.encode('utf-8')
        with self.assertRaises(api.MeCabError):
            nm.parse(b)
def test_parse_tostr_feature(self):
    '''Feature-constrained parsing to string (output format does NOT apply).'''
    with mecab.MeCab(r'-F%m,%f[0],%s\n') as nm:
        fixture = self.yaml.get('text11')
        text = self._u2str(fixture.get('text'))
        # A one-element tuple holding the constraint, itself a tuple
        # built from the comma-separated fixture value.
        constraint = tuple(self._u2str(fixture.get('feature')).split(','))
        constraints = (constraint, )
        expected = [self._u2str(e) for e in fixture.get('expected')]
        lines = nm.parse(text, feature_constraints=constraints).split('\n')
        for idx, line in enumerate(lines):
            self.assertEqual(line, expected[idx])
def test_parse_override_node_format(self):
    '''Node-format given on the command line overrides the rcfile default.'''
    options = r'-r {} -O "" -F%m!\n'.format(self.testrc)
    with mecab.MeCab(options) as nm:
        nodes = nm.parse(self.text, as_nodes=True)
        # Keep only the features of nodes whose stat is 0.
        expected = [node.feature for node in nodes if node.stat == 0]

        argf = ['-r', self.testrc, '-O', '', '-F%m!\\n']
        output = self._2bytes(self._mecab_parse(argf))
        actual = [tok for tok in output.split()
                  if not tok.startswith('EOS')]

        for idx, tok in enumerate(actual):
            self.assertEqual(tok, expected[idx])
def test_parse_tonode_outputformat_errors(self):
    '''Test node parsing with output formatting errors:
    1. unknown node has no pronunciation value
    2. format missing leading [
    3. format missing ending ]

    Each format is expected to make node parsing raise MeCabError once
    the returned nodes are consumed.
    '''
    s = '私はブルザエモンです。'
    formats = ['-F%f[8]', '-F%f1]', '-F%f[1']
    for argf in formats:
        with mecab.MeCab(argf) as nm:
            with self.assertRaises(api.MeCabError):
                # list() consumes the result of parse inside the
                # assertRaises block; the sample text is defined once
                # above instead of being repeated here.
                list(nm.parse(s, as_nodes=True))
def test_parse_tostr(self):
    '''String parsing matches the _mecab_parse helper for several formats.'''
    option_sets = (
        '',
        '-Owakati',
        '-Oyomi',
        '-Ochasen2',
        '-N2',
        r'-F%m\t%h\t%f[0]\n',
    )
    for argf in option_sets:
        with mecab.MeCab(argf) as nm:
            parsed = nm.parse(self.text)
            # Use the platform line separator before comparing.
            expected = parsed.replace('\n', os.linesep)
            actual = self._2bytes(self._mecab_parse(argf))
            self.assertEqual(expected, actual)
def test_init_libunset(self):
    '''Test for load error when MeCab lib is not found.

    Points the environment variable named by ``MeCab.MECAB_PATH`` at a
    bogus path, expects instantiation to raise MeCabError, and checks the
    error message. The environment is restored afterwards.
    '''
    env_key = mecab.MeCab.MECAB_PATH
    orig_env = os.getenv(env_key)
    try:
        os.environ[env_key] = '/foo/bar'

        with self.assertRaises(api.MeCabError) as cm:
            with mecab.MeCab():
                pass

        # This check must sit after the assertRaises block: cm.exception
        # is only available once the block has exited, and code nested
        # under mecab.MeCab() is never reached when instantiation raises.
        # NOTE(review): the optional quote tolerates a loader that quotes
        # the path in its message -- confirm against the actual error text.
        self.assertIsNotNone(
            re.search("cannot load library '?/foo/bar", str(cm.exception)))
    finally:
        # os.environ only accepts strings, so an originally-unset
        # variable is removed rather than assigned None.
        if orig_env is None:
            os.environ.pop(env_key, None)
        else:
            os.environ[env_key] = orig_env
def test_parse_mecab_options_userdic(self):
    '''Option parsing for userdic: attached, spaced, long and dict input.'''
    variants = (
        '-u/baz/qux.dic',
        '-u /baz/qux.dic',
        '--userdic=/baz/qux.dic',
        {'userdic': '/baz/qux.dic'},
    )
    with mecab.MeCab() as nm:
        for variant in variants:
            parsed = nm._MeCab__parse_mecab_options(variant)
            self.assertDictEqual(parsed, {'userdic': '/baz/qux.dic'})
def test_parse_mecab_options_unkfeature(self):
    '''Option parsing for unk-feature: attached, spaced, long and dict.'''
    variants = (
        '-x!!!\\n',
        '-x !!!\\n',
        '--unk-feature=!!!\\n',
        {'unk_feature': '!!!\\n'},
    )
    with mecab.MeCab() as nm:
        for variant in variants:
            parsed = nm._MeCab__parse_mecab_options(variant)
            self.assertDictEqual(parsed, {'unk_feature': '!!!\\n'})
def test_parse_mecab_options_eonformat(self):
    '''Option parsing for eon-format: attached, spaced, long and dict.'''
    variants = (
        '-S___\\n',
        '-S ___\\n',
        '--eon-format=___\\n',
        {'eon_format': '___\\n'},
    )
    with mecab.MeCab() as nm:
        for variant in variants:
            parsed = nm._MeCab__parse_mecab_options(variant)
            self.assertDictEqual(parsed, {'eon_format': '___\\n'})
def test_parse_mecab_options_bosformat(self):
    '''Option parsing for bos-format: attached, spaced, long and dict.'''
    variants = (
        '-B>>>\\n',
        '-B >>>\\n',
        '--bos-format=>>>\\n',
        {'bos_format': '>>>\\n'},
    )
    with mecab.MeCab() as nm:
        for variant in variants:
            parsed = nm._MeCab__parse_mecab_options(variant)
            self.assertDictEqual(parsed, {'bos_format': '>>>\\n'})
def test_parse_mecab_options_unkformat(self):
    '''Option parsing for unk-format: attached, spaced, long and dict.'''
    variants = (
        '-U???\\n',
        '-U ???\\n',
        '--unk-format=???\\n',
        {'unk_format': '???\\n'},
    )
    with mecab.MeCab() as nm:
        for variant in variants:
            parsed = nm._MeCab__parse_mecab_options(variant)
            self.assertDictEqual(parsed, {'unk_format': '???\\n'})
def test_parse_mecab_options_nodeformat(self):
    '''Option parsing for node-format: attached, spaced, long and dict.'''
    variants = (
        '-F%m\\n',
        '-F %m\\n',
        '--node-format=%m\\n',
        {'node_format': '%m\\n'},
    )
    with mecab.MeCab() as nm:
        for variant in variants:
            parsed = nm._MeCab__parse_mecab_options(variant)
            self.assertDictEqual(parsed, {'node_format': '%m\\n'})
def test_parse_mecab_options_dicdir(self):
    '''Option parsing for dicdir: attached, spaced, long and dict input.'''
    variants = (
        '-d/foo/bar',
        '-d /foo/bar',
        '--dicdir=/foo/bar',
        {'dicdir': '/foo/bar'},
    )
    with mecab.MeCab() as nm:
        for variant in variants:
            parsed = nm._MeCab__parse_mecab_options(variant)
            self.assertDictEqual(parsed, {'dicdir': '/foo/bar'})
def test_parse_tonode_default(self):
    '''Node parsing compared with helper output, ignoring BOS/EOS nodes.'''
    for argf in ('', '-N2'):
        with mecab.MeCab(argf) as nm:
            nodes = nm.parse(self.text, as_nodes=True)
            # Nodes with stat == 0 are kept; others are skipped.
            expected = [node for node in nodes if node.stat == 0]

            output = self._2bytes(self._mecab_parse(argf))
            rows = [row for row in output.split(os.linesep) if row != 'EOS']

            for idx, row in enumerate(rows):
                # Each row must split into exactly two whitespace-separated
                # fields: surface then feature.
                surface, feature = row.split()
                node = expected[idx]
                self.assertEqual(node.surface, surface)
                self.assertEqual(node.feature, feature)
def test_parse_mecab_options_outputformattype(self):
    '''Option parsing for output-format-type: all accepted spellings.'''
    variants = (
        '-Owakati',
        '-O wakati',
        '--output-format-type=wakati',
        {'output_format_type': 'wakati'},
    )
    with mecab.MeCab() as nm:
        for variant in variants:
            parsed = nm._MeCab__parse_mecab_options(variant)
            self.assertDictEqual(parsed, {'output_format_type': 'wakati'})
def test_parse_mecab_options_theta(self):
    '''Option parsing for theta, including rejection of a non-float value.'''
    variants = ('-t0.777', '-t 0.777', '--theta=0.777', {'theta': 0.777})
    with mecab.MeCab() as nm:
        for variant in variants:
            parsed = nm._MeCab__parse_mecab_options(variant)
            self.assertDictEqual(parsed, {'theta': 0.777})

        # A theta that is not a float raises ValueError whose message
        # mentions the option.
        with self.assertRaises(ValueError) as ctx:
            nm._MeCab__parse_mecab_options('--theta=XXX')

        message = str(ctx.exception)
        self.assertIsNotNone(re.search('--theta', message))
def test_parse_mecab_options_nbest(self):
    '''Option parsing for nbest, including rejection of a non-int value.'''
    variants = ('-N2', '-N 2', '--nbest=2', {'nbest': 2})
    with mecab.MeCab() as nm:
        for variant in variants:
            parsed = nm._MeCab__parse_mecab_options(variant)
            self.assertDictEqual(parsed, {'nbest': 2})

        # An nbest that is not an int raises ValueError whose message
        # mentions the option.
        with self.assertRaises(ValueError) as ctx:
            nm._MeCab__parse_mecab_options('-N0.99')

        message = str(ctx.exception)
        self.assertIsNotNone(re.search('--nbest', message))
def test_parse_mecab_options_costfactor(self):
    '''Option parsing for cost-factor, including rejection of a non-int.'''
    variants = (
        '-c666',
        '-c 666',
        '--cost-factor=666',
        {'cost_factor': 666},
    )
    with mecab.MeCab() as nm:
        for variant in variants:
            parsed = nm._MeCab__parse_mecab_options(variant)
            self.assertDictEqual(parsed, {'cost_factor': 666})

        # A cost_factor that is not an int raises ValueError whose
        # message mentions the option.
        with self.assertRaises(ValueError) as ctx:
            nm._MeCab__parse_mecab_options('-c0.99')

        message = str(ctx.exception)
        self.assertIsNotNone(re.search('--cost-factor', message))
def test_parse_mecab_options_maxgroupingsize(self):
    '''Option parsing for max-grouping-size, including a non-int value.'''
    variants = (
        '-M99',
        '-M 99',
        '--max-grouping-size=99',
        {'max_grouping_size': 99},
    )
    with mecab.MeCab() as nm:
        for variant in variants:
            parsed = nm._MeCab__parse_mecab_options(variant)
            self.assertDictEqual(parsed, {'max_grouping_size': 99})

        # A max_grouping_size that is not an int raises ValueError whose
        # message mentions the option.
        with self.assertRaises(ValueError) as ctx:
            nm._MeCab__parse_mecab_options('-M0.99')

        message = str(ctx.exception)
        self.assertIsNotNone(re.search('--max-grouping-size', message))
def test_parse_mecab_options_inputbuffersize(self):
    '''Option parsing for input-buffer-size, including a non-int value.'''
    variants = (
        '-b8888',
        '-b 8888',
        '--input-buffer-size=8888',
        {'input_buffer_size': 8888},
    )
    with mecab.MeCab() as nm:
        for variant in variants:
            parsed = nm._MeCab__parse_mecab_options(variant)
            self.assertDictEqual(parsed, {'input_buffer_size': 8888})

        # An input_buffer_size that is not an int raises ValueError whose
        # message mentions the option.
        with self.assertRaises(ValueError) as ctx:
            nm._MeCab__parse_mecab_options('-b0.99')

        message = str(ctx.exception)
        self.assertIsNotNone(re.search('--input-buffer-size', message))
def test_build_options_str(self):
    '''Every known option appears in the built string; unknown keys do not.'''
    option_dict = {
        'dicdir': '/foo',
        'userdic': '/bar',
        'lattice_level': 444,
        'output_format_type': 'yomi',
        'all_morphs': True,
        'nbest': 555,
        'partial': True,
        'marginal': True,
        'max_grouping_size': 666,
        'node_format': 'node\\n',
        'unk_format': 'unk\\n',
        'bos_format': 'bos\\n',
        'eos_format': 'eos\\n',
        'eon_format': 'eon\\n',
        'unk_feature': 'unkf\\n',
        'input_buffer_size': 777,
        'allocate_sentence': True,
        'theta': 0.999,
        'cost_factor': 888,
        'unknown': 1000
    }
    # Each entry is used as a regular expression, which is why the
    # backslashes in the format options are doubled again.
    wanted_patterns = [
        '--dicdir=/foo',
        '--userdic=/bar',
        '--lattice-level=444',
        '--output-format-type=yomi',
        '--all-morphs',
        '--nbest=555',
        '--partial',
        '--marginal',
        '--max-grouping-size=666',
        '--node-format=node\\\\n',
        '--unk-format=unk\\\\n',
        '--bos-format=bos\\\\n',
        '--eos-format=eos\\\\n',
        '--eon-format=eon\\\\n',
        '--unk-feature=unkf\\\\n',
        '--input-buffer-size=777',
        '--allocate-sentence',
        '--theta=0.999',
        '--cost-factor=888'
    ]
    with mecab.MeCab() as nm:
        built = self._2bytes(nm._MeCab__build_options_str(option_dict))

        for pattern in wanted_patterns:
            self.assertIsNotNone(re.search(pattern, built))

        # The 'unknown' key must not leak into the option string.
        self.assertIsNone(re.search('--unknown', built))
def test_parse_mecab_options_latticelevel(self):
    '''Setting lattice-level is parsed and writes a warning to stderr.'''
    with mecab.MeCab() as nm:
        # sys.stderr is swapped for an in-memory buffer per variant and
        # always restored afterwards.
        saved_stderr = sys.stderr
        try:
            variants = [
                '-l777',
                '-l 777',
                '--lattice-level=777',
                {'lattice_level': 777},
            ]
            for variant in variants:
                captured = StringIO()
                sys.stderr = captured

                parsed = nm._MeCab__parse_mecab_options(variant)
                self.assertDictEqual(parsed, {'lattice_level': 777})

                warning_text = captured.getvalue().strip()
                match = re.search(nm._WARN_LATTICE_LEVEL, warning_text)
                self.assertIsNotNone(match)
        finally:
            sys.stderr = saved_stderr
def test_init_unknownoption(self):
    '''Test instantiation of MeCab with unrecognized option.

    Instantiation is expected to raise MeCabError, and the error message
    is expected to mention the offending option.
    '''
    with self.assertRaises(api.MeCabError) as ctx:
        with mecab.MeCab('--unknown'):
            pass

    # Checked after the assertRaises block: ctx.exception is only
    # available once the block has exited, and code nested under
    # mecab.MeCab('--unknown') is never reached when instantiation raises.
    self.assertIsNotNone(re.search('--unknown', str(ctx.exception)))