def test_build(self):
    """Build a dictionary from the test lexicon and verify its contents.

    The header and the builder output are written to ``output.txt`` under
    ``self.test_dir``; the file is then read back with
    ``self.read_system_dictionary`` and its header, grammar and lexicon
    are checked against the expected values.
    """
    out_path = os.path.join(self.test_dir, 'output.txt')
    lexicon_paths = [self.input_path]

    # Context managers guarantee both streams are closed even when the
    # build raises, so a failing test does not leak file handles.
    with open(out_path, 'wb') as out_stream, \
            open(self.matrix_path, 'r', encoding='utf-8') as matrix_input_stream:
        header = DictionaryHeader(SYSTEM_DICT_VERSION, int(time.time()), 'test')
        out_stream.write(header.to_bytes())
        builder = DictionaryBuilder(logger=self.logger)
        builder.build(lexicon_paths, matrix_input_stream, out_stream)

    buffers, header, grammar, lexicon_set = self.read_system_dictionary(
        out_path)
    lexicon = lexicon_set.lexicons[0]

    # header
    self.assertEqual(SYSTEM_DICT_VERSION, header.version)
    self.assertEqual('test', header.description)

    # grammar
    self.assertEqual(2, grammar.get_part_of_speech_size())
    self.assertEqual(["名詞", "固有名詞", "地名", "一般", "*", "*"],
                     grammar.get_part_of_speech_string(0))
    self.assertEqual(["名詞", "普通名詞", "一般", "*", "*", "*"],
                     grammar.get_part_of_speech_string(1))
    self.assertEqual(200, grammar.get_connect_cost(0, 0))

    # lexicon
    self.assertEqual(3, lexicon.size())

    # word 0
    self.assertEqual(0, lexicon.get_cost(0))
    wi = lexicon.get_word_info(0)
    self.assertEqual('東京都', wi.surface)
    self.assertEqual('東京都', wi.normalized_form)
    self.assertEqual(-1, wi.dictionary_form_word_id)
    self.assertEqual('ヒガシキョウト', wi.reading_form)
    self.assertEqual(0, wi.pos_id)
    self.assertEqual([1, 2], wi.a_unit_split)
    self.assertEqual([], wi.b_unit_split)
    # The lookup must yield exactly one (word id, byte length) pair.
    lst = lexicon.lookup('東京都'.encode('utf-8'), 0)
    self.assertEqual((0, len('東京都'.encode('utf-8'))), next(lst))
    with self.assertRaises(StopIteration):
        next(lst)

    # word 1
    # NOTE(review): a left id of -1 presumably marks the entry as excluded
    # from lookup, which would explain the empty result below -- confirm.
    self.assertEqual(-1, lexicon.get_left_id(1))
    self.assertEqual(0, lexicon.get_cost(1))
    wi = lexicon.get_word_info(1)
    self.assertEqual('東', wi.surface)
    self.assertEqual('ひがし', wi.normalized_form)
    self.assertEqual(-1, wi.dictionary_form_word_id)
    self.assertEqual('ヒガシ', wi.reading_form)
    self.assertEqual(1, wi.pos_id)
    self.assertEqual([], wi.a_unit_split)
    self.assertEqual([], wi.b_unit_split)
    # Looking up this surface is expected to yield no entries at all.
    lst = lexicon.lookup('東'.encode('utf-8'), 0)
    with self.assertRaises(StopIteration):
        next(lst)
def _build_dictionary(self, input_txt_path, lex_lines, dictionary_name):
    """Write lexicon lines to a text file and build a dictionary from it.

    Args:
        input_txt_path: Path of the lexicon text file to create or
            overwrite (written as UTF-8).
        lex_lines: Iterable of lexicon lines; they are joined with
            newline characters, without a trailing newline.
        dictionary_name: File name of the dictionary to build; it is
            created under ``self.resource_dir``.

    Returns:
        The path of the built dictionary file.
    """
    with open(input_txt_path, 'w', encoding='utf-8') as wf:
        wf.write("\n".join(lex_lines))

    out_path = os.path.join(self.resource_dir, dictionary_name)
    lexicon_paths = [input_txt_path]

    # Context managers guarantee both streams are closed even when the
    # build raises, instead of relying on trailing close() calls.
    with open(out_path, 'wb') as out_stream, \
            open(self.matrix_path, 'r', encoding='utf-8') as matrix_input_stream:
        header = DictionaryHeader(SYSTEM_DICT_VERSION_2, int(time.time()), 'test')
        out_stream.write(header.to_bytes())
        builder = DictionaryBuilder(logger=self.logger)
        builder.build(lexicon_paths, matrix_input_stream, out_stream)

    return out_path