def test_build(self):
        """Build a dictionary file and verify its contents after reading it back.

        A header is written to ``output.txt`` under ``self.test_dir``, then
        ``DictionaryBuilder.build`` appends the data built from
        ``self.input_path`` and ``self.matrix_path``.  The file is re-read
        with ``self.read_system_dictionary`` and its header, grammar and
        first lexicon are checked against the expected fixture values.
        """
        out_path = os.path.join(self.test_dir, 'output.txt')
        lexicon_paths = [self.input_path]

        # Context managers close both streams even when the build raises,
        # and guarantee the output is flushed to disk before it is re-read.
        with open(out_path, 'wb') as out_stream, \
                open(self.matrix_path, 'r',
                     encoding='utf-8') as matrix_input_stream:
            header = DictionaryHeader(SYSTEM_DICT_VERSION, int(time.time()),
                                      'test')
            out_stream.write(header.to_bytes())
            builder = DictionaryBuilder(logger=self.logger)
            builder.build(lexicon_paths, matrix_input_stream, out_stream)

        # ``header`` is rebound here to the header parsed from the file.
        buffers, header, grammar, lexicon_set = self.read_system_dictionary(
            out_path)
        lexicon = lexicon_set.lexicons[0]

        # header
        self.assertEqual(SYSTEM_DICT_VERSION, header.version)
        self.assertEqual('test', header.description)

        # grammar
        self.assertEqual(2, grammar.get_part_of_speech_size())
        self.assertEqual(["名詞", "固有名詞", "地名", "一般", "*", "*"],
                         grammar.get_part_of_speech_string(0))
        self.assertEqual(["名詞", "普通名詞", "一般", "*", "*", "*"],
                         grammar.get_part_of_speech_string(1))
        self.assertEqual(200, grammar.get_connect_cost(0, 0))

        # lexicon: entry 0
        self.assertEqual(3, lexicon.size())
        self.assertEqual(0, lexicon.get_cost(0))
        wi = lexicon.get_word_info(0)
        self.assertEqual('東京都', wi.surface)
        self.assertEqual('東京都', wi.normalized_form)
        self.assertEqual(-1, wi.dictionary_form_word_id)
        self.assertEqual('ヒガシキョウト', wi.reading_form)
        self.assertEqual(0, wi.pos_id)
        self.assertEqual([1, 2], wi.a_unit_split)
        self.assertEqual([], wi.b_unit_split)
        # The lookup must yield exactly one (word id, byte length) pair.
        lst = lexicon.lookup('東京都'.encode('utf-8'), 0)
        self.assertEqual((0, len('東京都'.encode('utf-8'))), next(lst))
        with self.assertRaises(StopIteration):
            next(lst)

        # lexicon: entry 1
        self.assertEqual(-1, lexicon.get_left_id(1))
        self.assertEqual(0, lexicon.get_cost(1))
        wi = lexicon.get_word_info(1)
        self.assertEqual('東', wi.surface)
        self.assertEqual('ひがし', wi.normalized_form)
        self.assertEqual(-1, wi.dictionary_form_word_id)
        self.assertEqual('ヒガシ', wi.reading_form)
        self.assertEqual(1, wi.pos_id)
        self.assertEqual([], wi.a_unit_split)
        self.assertEqual([], wi.b_unit_split)
        # Looking up this surface is expected to yield no results at all.
        lst = lexicon.lookup('東'.encode('utf-8'), 0)
        with self.assertRaises(StopIteration):
            next(lst)
示例#2
0
    def _build_dictionary(self, input_txt_path, lex_lines, dictionary_name):
        """Write a lexicon source file and build a dictionary file from it.

        Args:
            input_txt_path: Path of the lexicon text file to create; it is
                overwritten with ``lex_lines`` joined by newlines.
            lex_lines: Iterable of strings, one per lexicon line.
            dictionary_name: File name of the dictionary to create inside
                ``self.resource_dir``.

        Returns:
            The path of the dictionary file that was written.
        """
        with open(input_txt_path, 'w', encoding='utf-8') as wf:
            wf.write("\n".join(lex_lines))

        out_path = os.path.join(self.resource_dir, dictionary_name)
        lexicon_paths = [input_txt_path]

        # Context managers close both streams even when the build raises,
        # so a failing build cannot leak open file handles.
        with open(out_path, 'wb') as out_stream, \
                open(self.matrix_path, 'r',
                     encoding='utf-8') as matrix_input_stream:
            header = DictionaryHeader(SYSTEM_DICT_VERSION_2, int(time.time()),
                                      'test')
            out_stream.write(header.to_bytes())
            builder = DictionaryBuilder(logger=self.logger)
            builder.build(lexicon_paths, matrix_input_stream, out_stream)

        return out_path