def test_property_types(self):
    """Verify the exact Python types of the fields in dictionary entries.

    Covers entries returned by the system dictionary, its unknown-word
    table, the mmap-backed system dictionary and a user dictionary built
    from 'tests/user_ipadic.csv'. Types are compared with ``type(x) is T``
    so subclasses are deliberately not accepted.
    """
    def check_fields(record, str_positions, int_positions):
        # String positions are checked first, then the integer ones.
        for pos in str_positions:
            self.assertTrue(type(record[pos]) is str)
        for pos in int_positions:
            self.assertTrue(type(record[pos]) is int)

    def check_system_entry(dictionary):
        # Look up a known word, then fetch the extra fields via the
        # entry's first element.
        found = dictionary.lookup('すもも'.encode('utf8'))[0]
        check_fields(found, (1,), (0, 2, 3, 4))
        extra = dictionary.lookup_extra(found[0])
        check_fields(extra, range(6), ())

    # entry in the system dictionary
    sys_dic = SystemDictionary(
        all_fstdata(), entries(), connections, chardef.DATA, unknowns.DATA)
    check_system_entry(sys_dic)

    # unknown entry
    unknown_entry = sys_dic.unknowns.get(u'HIRAGANA')[0]
    check_fields(unknown_entry, (3,), (0, 1, 2))

    # mmap dict entry
    mmap_dic = MMapSystemDictionary(
        all_fstdata(), mmap_entries(), connections, chardef.DATA,
        unknowns.DATA)
    check_system_entry(mmap_dic)

    # entry in the user defined dictionary
    user_csv = os.path.join(parent_dir, 'tests/user_ipadic.csv')
    user_dic = UserDictionary(
        user_dict=user_csv, enc='utf8', type='ipadic',
        connections=connections)
    user_entry = user_dic.lookup('東京スカイツリー'.encode('utf8'))[0]
    check_fields(user_entry, (1,), (0, 2, 3, 4))
def test_simplified_user_dictionary_with_progress(self):
    """Build a 'simpledic' user dictionary with a progress handler.

    Captures the records logged to ``p_logger`` while the dictionary is
    built and checks that they report, in order, the CSV reading phase and
    the ``create_minimum_transducer`` phase, each followed by a completion
    record. Finally checks that lookup still finds the expected entry.
    """
    indicator = SimpleProgressIndicator(update_frequency=1.0)
    csv_path = os.path.join(parent_dir, 'tests/user_simpledic.csv')

    with self.assertLogs(logger=p_logger) as captured:
        # create user dictionary
        progress_dic = UserDictionary(
            user_dict=csv_path, enc='utf8', type='simpledic',
            connections=connections, progress_handler=indicator)

        entry_count = len(progress_dic.entries)
        records = captured.output

        # Two phases, each logging once per entry plus once on completion.
        self.assertEqual((entry_count + 1) * 2, len(records))
        # value is reset after complete
        self.assertIsNone(indicator.value)

        completed = f'{entry_count}/{entry_count}'

        # progress for reading csv
        for pos in range(entry_count):
            self.assertIn('Reading user dictionary from CSV', records[pos])
            self.assertIn(f'{pos + 1}/{entry_count}', records[pos])
        # on complete loading csv
        self.assertIn(completed, records[entry_count])

        # progress for create_minimum_transducer
        for step in range(1, entry_count + 1):
            record = records[entry_count + step]
            self.assertIn('Running create_minimum_transducer', record)
            self.assertIn(f'{step}/{entry_count}', record)
        # on complete create_minimum_transducer
        self.assertIn(completed, records[entry_count * 2 + 1])

    # same result as without progress indicator
    hits = progress_dic.lookup('東京スカイツリー'.encode('utf8'))
    self.assertEqual(1, len(hits))
def test_simplified_user_dictionary(self):
    """Build, save and reload a user dictionary in 'simpledic' format.

    The dictionary is created from 'tests/user_simpledic.csv', saved under
    'tests/userdic_simple', and loaded back as a CompiledUserDictionary;
    a lookup is checked before saving and after reloading.
    """
    # create user dictionary from csv
    csv_path = os.path.join(parent_dir, 'tests/user_simpledic.csv')
    user_dic = UserDictionary(
        user_dict=csv_path, enc='utf8', type='simpledic',
        connections=connections)
    hits = user_dic.lookup('東京スカイツリー'.encode('utf8'))
    self.assertEqual(1, len(hits))

    # save compiled dictionary
    dic_dir = os.path.join(parent_dir, 'tests/userdic_simple')
    user_dic.save(to_dir=dic_dir)
    for data_file in (FILE_USER_FST_DATA, FILE_USER_ENTRIES_DATA):
        self.assertTrue(os.path.exists(os.path.join(dic_dir, data_file)))

    # load compiled dictionary
    reloaded = CompiledUserDictionary(dic_dir, connections=connections)
    reloaded_hits = reloaded.lookup('とうきょうスカイツリー駅'.encode('utf8'))
    self.assertEqual(1, len(reloaded_hits))
# -*- coding: utf-8 -*-
# Compile the two sample user dictionaries (MeCab IPADIC format and the
# simplified format), then tokenize a sample sentence with each of them.
from janome.tokenizer import Tokenizer
from janome.dic import UserDictionary
import sysdic

# Sentence tokenized with every compiled dictionary.
SAMPLE_TEXT = u'東京スカイツリーへのお越しは、東武スカイツリーライン「とうきょうスカイツリー駅」が便 利です。'

# (banner, source CSV, dictionary format, output directory)
JOBS = (
    ('Compile user dictionary (MeCab IPADIC format)',
     "user_ipadic.csv", "ipadic", "/tmp/userdic"),
    ('Compile user dictionary (simplified format)',
     "user_simpledic.csv", "simpledic", "/tmp/userdic_simple"),
)

for job_no, (banner, csv_file, dic_format, out_dir) in enumerate(JOBS):
    if job_no:
        # Blank line separating the output of consecutive runs.
        print('')
    print(banner)
    user_dict = UserDictionary(csv_file, "utf8", dic_format, sysdic.connections)
    user_dict.save(out_dir)
    t = Tokenizer(out_dir)
    for token in t.tokenize(SAMPLE_TEXT):
        print(token)
# Compile 'neologd.csv' (IPADIC format, UTF-8) into a user dictionary and
# save the result to the 'neologd' directory.
from janome.dic import UserDictionary
from janome import sysdic

SOURCE_CSV = 'neologd.csv'
SOURCE_ENCODING = 'utf8'
DICTIONARY_FORMAT = 'ipadic'
OUTPUT_DIR = 'neologd'

user_dict = UserDictionary(
    SOURCE_CSV, SOURCE_ENCODING, DICTIONARY_FORMAT, sysdic.connections)
user_dict.save(OUTPUT_DIR)
# -*- coding: utf-8 -*-
# Compile the two sample user dictionaries (MeCab IPADIC format and the
# simplified format), then tokenize a sample sentence with each of them.
from janome.tokenizer import Tokenizer
from janome.dic import UserDictionary
from janome import sysdic

# Sentence tokenized with every compiled dictionary.
SAMPLE_TEXT = u'東京スカイツリーへのお越しは、東武スカイツリーライン「とうきょうスカイツリー駅」が便 利です。'


def compile_and_tokenize(banner, csv_file, dic_format, out_dir):
    """Compile *csv_file* into *out_dir* and print the sample's tokens."""
    print(banner)
    user_dict = UserDictionary(csv_file, "utf8", dic_format, sysdic.connections)
    user_dict.save(out_dir)
    tokenizer = Tokenizer(out_dir)
    for token in tokenizer.tokenize(SAMPLE_TEXT):
        print(token)


compile_and_tokenize('Compile user dictionary (MeCab IPADIC format)',
                     "user_ipadic.csv", "ipadic", "/tmp/userdic")
# Blank line separating the two runs.
print('')
compile_and_tokenize('Compile user dictionary (simplified format)',
                     "user_simpledic.csv", "simpledic", "/tmp/userdic_simple")