    def test_build(self):
        out_path = os.path.join(self.test_dir, 'output.txt')

        out_stream = open(out_path, 'wb')
        lexicon_paths = [self.input_path]
        matrix_input_stream = open(self.matrix_path, 'r', encoding='utf-8')

        header = DictionaryHeader(SYSTEM_DICT_VERSION, int(time.time()),
                                  'test')
        out_stream.write(header.to_bytes())
        builder = DictionaryBuilder(logger=self.logger)
        builder.build(lexicon_paths, matrix_input_stream, out_stream)
        out_stream.close()
        matrix_input_stream.close()

        buffers, header, grammar, lexicon_set = self.read_system_dictionary(
            out_path)
        lexicon = lexicon_set.lexicons[0]

        # header
        self.assertEqual(SYSTEM_DICT_VERSION, header.version)
        self.assertEqual('test', header.description)

        # grammar
        self.assertEqual(2, grammar.get_part_of_speech_size())
        self.assertEqual(["名詞", "固有名詞", "地名", "一般", "*", "*"],
                         grammar.get_part_of_speech_string(0))
        self.assertEqual(["名詞", "普通名詞", "一般", "*", "*", "*"],
                         grammar.get_part_of_speech_string(1))
        self.assertEqual(200, grammar.get_connect_cost(0, 0))

        # lexicon
        self.assertEqual(3, lexicon.size())
        self.assertEqual(0, lexicon.get_cost(0))
        wi = lexicon.get_word_info(0)
        self.assertEqual('東京都', wi.surface)
        self.assertEqual('東京都', wi.normalized_form)
        self.assertEqual(-1, wi.dictionary_form_word_id)
        self.assertEqual('ヒガシキョウト', wi.reading_form)
        self.assertEqual(0, wi.pos_id)
        self.assertEqual([1, 2], wi.a_unit_split)
        self.assertEqual([], wi.b_unit_split)
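        # lookup() yields (word_id, end_byte_offset) pairs for entries that match
        # the input starting at the given byte offset.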
        lst = lexicon.lookup('東京都'.encode('utf-8'), 0)
        self.assertEqual((0, len('東京都'.encode('utf-8'))), next(lst))
        with self.assertRaises(StopIteration):
            next(lst)

        self.assertEqual(-1, lexicon.get_left_id(1))
        self.assertEqual(0, lexicon.get_cost(1))
        wi = lexicon.get_word_info(1)
        self.assertEqual('東', wi.surface)
        self.assertEqual('ひがし', wi.normalized_form)
        self.assertEqual(-1, wi.dictionary_form_word_id)
        self.assertEqual('ヒガシ', wi.reading_form)
        self.assertEqual(1, wi.pos_id)
        self.assertEqual([], wi.a_unit_split)
        self.assertEqual([], wi.b_unit_split)
        lst = lexicon.lookup('東'.encode('utf-8'), 0)
        with self.assertRaises(StopIteration):
            next(lst)
Example #2
    def setUp(self):
        # Copied from sudachipy.dictionary.Dictionary.read_system_dictionary
        test_resources_dir = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), os.pardir, 'resources')
        filename = os.path.join(test_resources_dir, 'system.dic')
        with open(filename, 'rb') as system_dic:
            bytes_ = mmap.mmap(system_dic.fileno(), 0, access=mmap.ACCESS_READ)
        offset = 0
        self.header = DictionaryHeader.from_bytes(bytes_, offset)
Example #3
    def _build_dictionary(self, input_txt_path, lex_lines, dictionary_name):
        with open(input_txt_path, 'w', encoding='utf-8') as wf:
            wf.write("\n".join(lex_lines))

        out_path = os.path.join(self.resource_dir, dictionary_name)
        out_stream = open(out_path, 'wb')
        lexicon_paths = [input_txt_path]
        matrix_input_stream = open(self.matrix_path, 'r', encoding='utf-8')

        header = DictionaryHeader(SYSTEM_DICT_VERSION_2, int(time.time()),
                                  'test')
        out_stream.write(header.to_bytes())
        builder = DictionaryBuilder(logger=self.logger)
        builder.build(lexicon_paths, matrix_input_stream, out_stream)
        out_stream.close()
        matrix_input_stream.close()

        return out_path
    def setUp(self):
        # Copied from sudachipy.dictionary.Dictionary.read_system_dictionary
        test_resources_dir = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), os.pardir, 'resources')
        filename = os.path.join(test_resources_dir, 'system.dic')
        with open(filename, 'rb') as system_dic:
            bytes_ = mmap.mmap(system_dic.fileno(), 0, access=mmap.ACCESS_READ)
        header = DictionaryHeader.from_bytes(bytes_, 0)
        if header.version != SYSTEM_DICT_VERSION:
            raise Exception('invalid system dictionary')
        self.lexicon = DoubleArrayLexicon(bytes_, header.storage_size() + 470)
    def test_build(self):
        out_path = os.path.join(self.test_dir, 'output.txt')
        in_path = os.path.join(self.test_dir, 'input.txt')

        out_stream = open(out_path, 'wb')
        # lexicon_paths = [self.input_path]
        # matrix_input_stream = open(self.matrix_path, 'r')
        with open(in_path, 'w', encoding='utf-8') as wf:
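            # Two user-lexicon CSV rows. In the quoted split fields, each '/'-separated
            # reference is assumed to be either a "surface,POS1..POS6,reading" spec,
            # a bare system-lexicon word id, or a "U"-prefixed user-dictionary word id.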
            wf.write(
                "東京都市,0,0,0,東京都市,名詞,固有名詞,地名,一般,*,*,ヒガシキョウトシ,東京都市,*,B,\"東,名詞,普通名詞,一般,*,*,*,ヒガシ/3/U1\",*,\"4/3/市,名詞,普通名詞,一般,*,*,*,シ\"\n"
            )
            wf.write('市,-1,-1,0,市,名詞,普通名詞,一般,*,*,*,シ,市,*,A,*,*,*\n')

        _, _, grammar, lexicon_set = TestDictionaryBuilder.read_system_dictionary(
            self.dict_filename)
        header = DictionaryHeader(SYSTEM_DICT_VERSION, int(time.time()),
                                  'test')
        out_stream.write(header.to_bytes())
        builder = UserDictionaryBuilder(grammar,
                                        lexicon_set,
                                        logger=self.logger)
        lexicon_paths = [in_path]
        builder.build(lexicon_paths, None, out_stream)
        out_stream.close()

        buffers, header, grammar, lexicon_set = TestDictionaryBuilder.read_system_dictionary(
            out_path)
        lexicon = lexicon_set.lexicons[0]

        # header
        self.assertEqual(SYSTEM_DICT_VERSION, header.version)
        self.assertEqual('test', header.description)

        # lexicon
        self.assertEqual(0, lexicon.get_left_id(0))
        self.assertEqual(0, lexicon.get_cost(0))
        wi = lexicon.get_word_info(0)
        self.assertEqual('東京都市', wi.surface)
        self.assertEqual('東京都市', wi.normalized_form)
        self.assertEqual(-1, wi.dictionary_form_word_id)
        self.assertEqual('ヒガシキョウトシ', wi.reading_form)
        self.assertEqual(3, wi.pos_id)
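        # Split and word-structure ids are assumed to carry the dictionary id in the
        # upper bits: 1 | (1 << 28) points at word 1 of this user dictionary, while
        # 4 and 3 refer to system-dictionary (dictionary 0) entries.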
        self.assertEqual([4, 3, 1 | (1 << 28)], wi.a_unit_split)
        self.assertEqual([], wi.b_unit_split)
        self.assertEqual([4, 3, 1 | (1 << 28)], wi.word_structure)
        lst = lexicon.lookup('東京都市'.encode('utf-8'), 0)
        self.assertEqual((0, len('東京都市'.encode('utf-8'))), next(lst))
        with self.assertRaises(StopIteration):
            next(lst)

        self.assertEqual(-1, lexicon.get_left_id(1))
        self.assertEqual(0, lexicon.get_cost(1))
        wi = lexicon.get_word_info(1)
        self.assertEqual('市', wi.surface)
        self.assertEqual('市', wi.normalized_form)
        self.assertEqual(-1, wi.dictionary_form_word_id)
        self.assertEqual('シ', wi.reading_form)
        self.assertEqual(4, wi.pos_id)
        self.assertEqual([], wi.a_unit_split)
        self.assertEqual([], wi.b_unit_split)
        lst = lexicon.lookup('東'.encode('utf-8'), 0)
        with self.assertRaises(StopIteration):
            next(lst)