def test_build(self):
    """Build a dictionary from the fixture lexicon and verify what was written.

    Writes a header followed by the builder output to ``output.txt`` in
    ``self.test_dir``, reads the file back through
    ``self.read_system_dictionary`` and checks the header, grammar and
    lexicon entries.
    """
    out_path = os.path.join(self.test_dir, 'output.txt')
    lexicon_paths = [self.input_path]
    header = DictionaryHeader(SYSTEM_DICT_VERSION, int(time.time()), 'test')

    # Context managers close both streams even when the build raises,
    # so a failing build cannot leak file handles into later tests.
    with open(out_path, 'wb') as out_stream, \
            open(self.matrix_path, 'r', encoding='utf-8') as matrix_input_stream:
        out_stream.write(header.to_bytes())
        builder = DictionaryBuilder(logger=self.logger)
        builder.build(lexicon_paths, matrix_input_stream, out_stream)

    _, header, grammar, lexicon_set = self.read_system_dictionary(out_path)
    lexicon = lexicon_set.lexicons[0]

    # header
    self.assertEqual(SYSTEM_DICT_VERSION, header.version)
    self.assertEqual('test', header.description)

    # grammar
    self.assertEqual(2, grammar.get_part_of_speech_size())
    self.assertEqual(["名詞", "固有名詞", "地名", "一般", "*", "*"],
                     grammar.get_part_of_speech_string(0))
    self.assertEqual(["名詞", "普通名詞", "一般", "*", "*", "*"],
                     grammar.get_part_of_speech_string(1))
    self.assertEqual(200, grammar.get_connect_cost(0, 0))

    # lexicon: first entry
    self.assertEqual(3, lexicon.size())
    self.assertEqual(0, lexicon.get_cost(0))
    wi = lexicon.get_word_info(0)
    self.assertEqual('東京都', wi.surface)
    self.assertEqual('東京都', wi.normalized_form)
    self.assertEqual(-1, wi.dictionary_form_word_id)
    self.assertEqual('ヒガシキョウト', wi.reading_form)
    self.assertEqual(0, wi.pos_id)
    self.assertEqual([1, 2], wi.a_unit_split)
    self.assertEqual([], wi.b_unit_split)
    # The lookup must yield exactly one (word_id, byte_length) match.
    lst = lexicon.lookup('東京都'.encode('utf-8'), 0)
    self.assertEqual((0, len('東京都'.encode('utf-8'))), next(lst))
    with self.assertRaises(StopIteration):
        next(lst)

    # lexicon: second entry
    self.assertEqual(-1, lexicon.get_left_id(1))
    self.assertEqual(0, lexicon.get_cost(1))
    wi = lexicon.get_word_info(1)
    self.assertEqual('東', wi.surface)
    self.assertEqual('ひがし', wi.normalized_form)
    self.assertEqual(-1, wi.dictionary_form_word_id)
    self.assertEqual('ヒガシ', wi.reading_form)
    self.assertEqual(1, wi.pos_id)
    self.assertEqual([], wi.a_unit_split)
    self.assertEqual([], wi.b_unit_split)
    # The lookup of this surface is expected to yield no matches at all.
    lst = lexicon.lookup('東'.encode('utf-8'), 0)
    with self.assertRaises(StopIteration):
        next(lst)
def setUp(self):
    """Parse the header of the ``system.dic`` test resource into ``self.header``.

    The loading steps follow
    sudachipy.dictionary.Dictionary.read_system_dictionary.
    """
    this_dir = os.path.dirname(os.path.abspath(__file__))
    dic_path = os.path.join(this_dir, os.pardir, 'resources', 'system.dic')
    with open(dic_path, 'rb') as dic_file:
        mapped = mmap.mmap(dic_file.fileno(), 0, access=mmap.ACCESS_READ)
    # The header is read from the very beginning of the mapped file.
    self.header = DictionaryHeader.from_bytes(mapped, 0)
def _build_dictionary(self, input_txt_path, lex_lines, dictionary_name):
    """Write a lexicon file and build a dictionary from it.

    Args:
        input_txt_path: Path of the lexicon text file to create.
        lex_lines: Lexicon lines; written joined by newlines.
        dictionary_name: File name of the dictionary, created inside
            ``self.resource_dir``.

    Returns:
        Path of the dictionary file that was written.
    """
    with open(input_txt_path, 'w', encoding='utf-8') as wf:
        wf.write("\n".join(lex_lines))

    out_path = os.path.join(self.resource_dir, dictionary_name)
    lexicon_paths = [input_txt_path]
    header = DictionaryHeader(SYSTEM_DICT_VERSION_2, int(time.time()), 'test')

    # Context managers close both streams even when the build raises.
    with open(out_path, 'wb') as out_stream, \
            open(self.matrix_path, 'r', encoding='utf-8') as matrix_input_stream:
        out_stream.write(header.to_bytes())
        builder = DictionaryBuilder(logger=self.logger)
        builder.build(lexicon_paths, matrix_input_stream, out_stream)
    return out_path
def setUp(self):
    """Load the lexicon of the ``system.dic`` test resource into ``self.lexicon``.

    The loading steps follow
    sudachipy.dictionary.Dictionary.read_system_dictionary.

    Raises:
        Exception: If the header version is not ``SYSTEM_DICT_VERSION``.
    """
    test_resources_dir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        os.pardir,
        'resources')
    filename = os.path.join(test_resources_dir, 'system.dic')
    # The mapping is read-only, so read-only file access is sufficient;
    # 'r+b' would needlessly require write permission on the resource.
    with open(filename, 'rb') as system_dic:
        bytes_ = mmap.mmap(system_dic.fileno(), 0, access=mmap.ACCESS_READ)
    header = DictionaryHeader.from_bytes(bytes_, 0)
    if header.version != SYSTEM_DICT_VERSION:
        raise Exception('invalid system dictionary')
    # NOTE(review): 470 is presumably the byte size of the grammar section
    # that sits between the header and the lexicon in this fixture - confirm.
    self.lexicon = DoubleArrayLexicon(bytes_, header.storage_size() + 470)
def test_build(self):
    """Build a user dictionary from a two-line lexicon and verify its contents.

    Writes the lexicon to ``input.txt`` and the built dictionary to
    ``output.txt`` (both in ``self.test_dir``), using the grammar and
    lexicon set read from ``self.dict_filename``. The output is then read
    back and its header and both word entries are checked.
    """
    out_path = os.path.join(self.test_dir, 'output.txt')
    in_path = os.path.join(self.test_dir, 'input.txt')

    with open(in_path, 'w', encoding='utf-8') as wf:
        wf.write(
            "東京都市,0,0,0,東京都市,名詞,固有名詞,地名,一般,*,*,ヒガシキョウトシ,東京都市,*,B,\"東,名詞,普通名詞,一般,*,*,*,ヒガシ/3/U1\",*,\"4/3/市,名詞,普通名詞,一般,*,*,*,シ\"\n"
        )
        wf.write('市,-1,-1,0,市,名詞,普通名詞,一般,*,*,*,シ,市,*,A,*,*,*\n')

    _, _, grammar, lexicon_set = TestDictionaryBuilder.read_system_dictionary(
        self.dict_filename)
    header = DictionaryHeader(SYSTEM_DICT_VERSION, int(time.time()), 'test')
    builder = UserDictionaryBuilder(grammar, lexicon_set, logger=self.logger)
    lexicon_paths = [in_path]

    # The context manager closes the output even when the build raises.
    with open(out_path, 'wb') as out_stream:
        out_stream.write(header.to_bytes())
        # No matrix stream is passed when building the user dictionary.
        builder.build(lexicon_paths, None, out_stream)

    _, header, grammar, lexicon_set = TestDictionaryBuilder.read_system_dictionary(
        out_path)
    lexicon = lexicon_set.lexicons[0]

    # header
    self.assertEqual(SYSTEM_DICT_VERSION, header.version)
    self.assertEqual('test', header.description)

    # lexicon: first entry
    self.assertEqual(0, lexicon.get_left_id(0))
    self.assertEqual(0, lexicon.get_cost(0))
    wi = lexicon.get_word_info(0)
    self.assertEqual('東京都市', wi.surface)
    self.assertEqual('東京都市', wi.normalized_form)
    self.assertEqual(-1, wi.dictionary_form_word_id)
    self.assertEqual('ヒガシキョウトシ', wi.reading_form)
    self.assertEqual(3, wi.pos_id)
    self.assertEqual([4, 3, 1 | (1 << 28)], wi.a_unit_split)
    self.assertEqual([], wi.b_unit_split)
    self.assertEqual([4, 3, 1 | (1 << 28)], wi.word_structure)
    # The lookup must yield exactly one (word_id, byte_length) match.
    lst = lexicon.lookup('東京都市'.encode('utf-8'), 0)
    self.assertEqual((0, len('東京都市'.encode('utf-8'))), next(lst))
    with self.assertRaises(StopIteration):
        next(lst)

    # lexicon: second entry
    self.assertEqual(-1, lexicon.get_left_id(1))
    self.assertEqual(0, lexicon.get_cost(1))
    wi = lexicon.get_word_info(1)
    self.assertEqual('市', wi.surface)
    self.assertEqual('市', wi.normalized_form)
    self.assertEqual(-1, wi.dictionary_form_word_id)
    self.assertEqual('シ', wi.reading_form)
    self.assertEqual(4, wi.pos_id)
    self.assertEqual([], wi.a_unit_split)
    self.assertEqual([], wi.b_unit_split)
    # The lookup of this surface is expected to yield no matches at all.
    lst = lexicon.lookup('東'.encode('utf-8'), 0)
    with self.assertRaises(StopIteration):
        next(lst)