示例#1
0
 def test_parse_line_toolong_headword(self):
     """An over-long headword makes ``parse_line`` raise ``ValueError``.

     The headword is 32768 characters long and the expected error
     message is ``'string is too long'``.
     """
     dict_builder = DictionaryBuilder(logger=self.logger)
     # 32768 characters: max value of short in Java (32767) + 1
     oversized_headword = 'a' * 32768
     line = oversized_headword + ',6,6,5293,京都,名詞,固有名詞,地名,一般,*,*,キョウト,京都,*,A,*,*,*'
     columns = line.split(',')
     with self.assertRaises(ValueError) as raised:
         dict_builder.parse_line(columns)
     self.assertEqual('string is too long', raised.exception.args[0])
 def test_parse_line_empty_headword(self):
     """An empty first column makes ``parse_line`` raise ``ValueError``.

     The expected error message is ``'headword is empty'``.
     """
     dict_builder = DictionaryBuilder(logger=self.logger)
     # The line starts with a comma, so the headword column is ''.
     columns = ',6,6,5293,京都,名詞,固有名詞,地名,一般,*,*,キョウト,京都,*,A,*,*,*,*'.split(',')
     with self.assertRaises(ValueError) as raised:
         dict_builder.parse_line(columns)
     self.assertEqual('headword is empty', raised.exception.args[0])
示例#3
0
 def test_convert_matrix(self):
     """Check the bytes written by ``convert_matrix`` for a 2x3 matrix.

     The two sizes from the first input line are expected as 16-bit
     little-endian values at the start of the builder's byte buffer;
     the returned stream is expected to hold the costs as 16-bit
     little-endian values.
     """

     def read_u16(raw, offset):
         # Decode two bytes starting at ``offset`` as a little-endian int.
         return int.from_bytes(raw[offset:offset + 2], byteorder='little')

     matrix_text = StringIO('2 3\n0 0 0\n0 1 1\n0 2 2\n\n1 0 3\n1 1 4\n1 2 5\n')
     dict_builder = DictionaryBuilder(logger=self.logger)
     matrix = dict_builder.convert_matrix(matrix_text)

     size_bytes = dict_builder.byte_buffer.getvalue()
     self.assertEqual(2, read_u16(size_bytes, 0))
     self.assertEqual(3, read_u16(size_bytes, 2))

     cost_bytes = matrix.getvalue()
     self.assertEqual(0, read_u16(cost_bytes, 0))
     # Byte offset (2 + 1) * 2 == 6, i.e. the fourth 16-bit cell, holds 4.
     self.assertEqual(4, read_u16(cost_bytes, (2 + 1) * 2))
示例#4
0
    def test_build(self):
        """Build a system dictionary file and verify what is read back.

        Writes a header followed by the builder output to ``output.txt``
        under ``self.test_dir``, using ``self.input_path`` as the only
        lexicon and ``self.matrix_path`` as the connection matrix, then
        reloads it with ``self.read_system_dictionary`` and checks the
        header, the grammar and the first two lexicon entries.
        """
        out_path = os.path.join(self.test_dir, 'output.txt')
        lexicon_paths = [self.input_path]
        header = DictionaryHeader(SYSTEM_DICT_VERSION, int(time.time()), 'test')
        builder = DictionaryBuilder(logger=self.logger)

        # Context managers close both files even when the build fails, and
        # guarantee the output is flushed before it is read back below.
        with open(out_path, 'wb') as out_stream, \
                open(self.matrix_path, 'r', encoding='utf-8') as matrix_input_stream:
            out_stream.write(header.to_bytes())
            builder.build(lexicon_paths, matrix_input_stream, out_stream)

        _, header, grammar, lexicon_set = self.read_system_dictionary(out_path)
        lexicon = lexicon_set.lexicons[0]

        # header
        self.assertEqual(SYSTEM_DICT_VERSION, header.version)
        self.assertEqual('test', header.description)

        # grammar
        self.assertEqual(2, grammar.get_part_of_speech_size())
        self.assertEqual(["名詞", "固有名詞", "地名", "一般", "*", "*"], grammar.get_part_of_speech_string(0))
        self.assertEqual(["名詞", "普通名詞", "一般", "*", "*", "*"], grammar.get_part_of_speech_string(1))
        self.assertEqual(200, grammar.get_connect_cost(0, 0))

        # lexicon: word 0
        self.assertEqual(3, lexicon.size())
        self.assertEqual(0, lexicon.get_cost(0))
        wi = lexicon.get_word_info(0)
        self.assertEqual('東京都', wi.surface)
        self.assertEqual('東京都', wi.normalized_form)
        self.assertEqual(-1, wi.dictionary_form_word_id)
        self.assertEqual('ヒガシキョウト', wi.reading_form)
        self.assertEqual(0, wi.pos_id)
        self.assertEqual([1, 2], wi.a_unit_split)
        self.assertEqual([], wi.b_unit_split)
        # The lookup must yield exactly one (word id, byte length) pair.
        lst = lexicon.lookup('東京都'.encode('utf-8'), 0)
        self.assertEqual((0, len('東京都'.encode('utf-8'))), next(lst))
        with self.assertRaises(StopIteration):
            next(lst)

        # lexicon: word 1
        self.assertEqual(-1, lexicon.get_left_id(1))
        self.assertEqual(0, lexicon.get_cost(1))
        wi = lexicon.get_word_info(1)
        self.assertEqual('東', wi.surface)
        self.assertEqual('ひがし', wi.normalized_form)
        self.assertEqual(-1, wi.dictionary_form_word_id)
        self.assertEqual('ヒガシ', wi.reading_form)
        self.assertEqual(1, wi.pos_id)
        self.assertEqual([], wi.a_unit_split)
        self.assertEqual([], wi.b_unit_split)
        # Looking up this surface is expected to yield nothing at all.
        lst = lexicon.lookup('東'.encode('utf-8'), 0)
        with self.assertRaises(StopIteration):
            next(lst)
    def test_parseline(self):
        """Check the entry returned by ``parse_line`` for two valid lines.

        The first line carries real connection parameters; the second
        uses ``-1,-1,0`` and is expected to produce no headword.
        """
        dict_builder = DictionaryBuilder(logger=self.logger)

        regular_columns = '京都,6,6,5293,京都,名詞,固有名詞,地名,一般,*,*,キョウト,京都,*,A,*,*,*'.split(',')
        regular_entry = dict_builder.parse_line(regular_columns)
        self.assertEqual('京都', regular_entry.headword)
        self.assertEqual([6, 6, 5293], regular_entry.parameters)
        self.assertEqual(0, regular_entry.wordinfo.pos_id)
        self.assertEqual('*', regular_entry.aunit_split_string)
        self.assertEqual('*', regular_entry.bunit_split_string)

        # Same part of speech as above, so the POS id is expected to be 0 again.
        unindexed_columns = '京都,-1,-1,0,京都,名詞,固有名詞,地名,一般,*,*,キョウト,京都,*,A,*,*,*'.split(',')
        unindexed_entry = dict_builder.parse_line(unindexed_columns)
        self.assertIsNone(unindexed_entry.headword)
        self.assertEqual(0, unindexed_entry.wordinfo.pos_id)
    def test_write_string(self):
        """Check the byte layout produced by ``write_string``.

        Three cases are exercised: an empty string, a string containing a
        character that needs a UTF-16 surrogate pair, and a string long
        enough that its length is written in two bytes.
        """
        builder = DictionaryBuilder(logger=self.logger)

        # Empty string: one length byte with value 0 and nothing else.
        start = builder.byte_buffer.tell()
        builder.write_string('')
        self.assertEqual(0, builder.byte_buffer.getvalue()[start])
        self.assertEqual(start + 1, builder.byte_buffer.tell())

        # Two characters, three UTF-16 code units: the length byte is 3.
        start = builder.byte_buffer.tell()
        builder.write_string('あ𠮟')
        written = builder.byte_buffer.getvalue()
        self.assertEqual(3, written[start])
        self.assertEqual('あ', written[start + 1:start + 3].decode('utf-16-le'))
        high_surrogate = int.from_bytes(written[start + 3:start + 5],
                                        byteorder='little')
        low_surrogate = int.from_bytes(written[start + 5:start + 7],
                                       byteorder='little')
        self.assertEqual(55362, high_surrogate)  # 0xD842
        self.assertEqual(57247, low_surrogate)  # 0xDF9F

        # Long string: the first length byte has its high bit set and the
        # payload starts after two length bytes.
        start = builder.byte_buffer.tell()
        long_str = '0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789'
        char_count = len(long_str)
        builder.write_string(long_str)
        written = builder.byte_buffer.getvalue()
        self.assertEqual((char_count >> 8) | 0x80, written[start])
        self.assertEqual(char_count & 0xff, written[start + 1])
        self.assertEqual(start + 2 + 2 * char_count,
                         builder.byte_buffer.tell())
示例#7
0
    def _build_dictionary(self, input_txt_path, lex_lines, dictionary_name):
        """Write a lexicon file and build a dictionary from it.

        Args:
            input_txt_path: Path the lexicon text is written to; it is
                then used as the only lexicon input of the build.
            lex_lines: Iterable of lexicon lines, joined with newlines.
            dictionary_name: File name of the dictionary to create under
                ``self.resource_dir``.

        Returns:
            The path of the dictionary file that was written.
        """
        with open(input_txt_path, 'w', encoding='utf-8') as wf:
            wf.write("\n".join(lex_lines))

        out_path = os.path.join(self.resource_dir, dictionary_name)
        lexicon_paths = [input_txt_path]
        header = DictionaryHeader(SYSTEM_DICT_VERSION_2, int(time.time()),
                                  'test')
        builder = DictionaryBuilder(logger=self.logger)

        # Context managers close both files even if the build raises, so a
        # failing build does not leak handles into later tests.
        with open(out_path, 'wb') as out_stream, \
                open(self.matrix_path, 'r', encoding='utf-8') as matrix_input_stream:
            out_stream.write(header.to_bytes())
            builder.build(lexicon_paths, matrix_input_stream, out_stream)

        return out_path
 def test_add_to_trie(self):
     """Word ids added under the same key are all kept in ``trie_keys``.

     Ids 0 and 1 are both added for ``'abc'`` and must both be found
     under its UTF-8 encoded key.
     """
     builder = DictionaryBuilder(logger=self.logger)
     builder.add_to_trie('abc', 0)
     builder.add_to_trie('abc', 1)
     builder.add_to_trie('abcd', 2)
     # assertIn reports the container contents on failure, unlike
     # assertTrue(x in y) which only reports "False is not true".
     word_ids = builder.trie_keys['abc'.encode('utf-8')]
     self.assertIn(0, word_ids)
     self.assertIn(1, word_ids)
 def test_write_intarray(self):
     """Check the byte layout produced by ``write_intarray``.

     An empty list is expected to write a single zero byte; a list of
     three ints is expected to write the count 3 followed by each value
     as a 4-byte signed little-endian integer.
     """
     builder = DictionaryBuilder(logger=self.logger)
     start = builder.byte_buffer.tell()

     builder.write_intarray([])
     self.assertEqual(0, builder.byte_buffer.getvalue()[start])

     builder.write_intarray([1, 2, 3])
     written = builder.byte_buffer.getvalue()
     # The second array begins right after the empty array's single byte.
     self.assertEqual(3, written[start + 1])
     for index, expected in enumerate([1, 2, 3]):
         offset = start + 2 + 4 * index
         actual = int.from_bytes(written[offset:offset + 4],
                                 byteorder='little',
                                 signed=True)
         self.assertEqual(expected, actual)
示例#10
0
    def test_parse_splitinfo(self):
        """Check ``parse_splitinfo`` on both builder classes.

        With ``DictionaryBuilder`` a ``U`` prefixed id is expected to
        parse to the plain number. With ``UserDictionaryBuilder`` the same
        id is expected to come back with bit 28 set.
        """
        system_builder = DictionaryBuilder(logger=self.logger)
        # Four placeholder entries so that ids 1..3 refer to existing slots.
        system_builder.entries.extend([None] * 4)
        self.assertEqual([], system_builder.parse_splitinfo('*'))
        self.assertEqual([1, 2, 3], system_builder.parse_splitinfo('1/2/3'))
        self.assertEqual(2, system_builder.parse_splitinfo('1/U2/3')[1])

        # Stand-in system lexicon that reports a size of 4.
        system_lexicon = mock.Mock(spec=Lexicon)
        system_lexicon.size.return_value = 4
        user_builder = UserDictionaryBuilder(None, system_lexicon)
        user_builder.entries += [None, None, None]
        expected_ids = [1, 2 | (1 << 28), 3]
        self.assertEqual(expected_ids, user_builder.parse_splitinfo("1/U2/3"))
 def test_parse_line_same_readingform(self):
     """The reading form is kept when it equals the headword.

     Every surface-like column of the line is ``'〒'``; the parsed word
     info must report ``'〒'`` as its reading form.
     """
     dict_builder = DictionaryBuilder(logger=self.logger)
     columns = '〒,6,6,5293,〒,名詞,普通名詞,一般,*,*,*,〒,〒,*,A,*,*,*'.split(',')
     parsed = dict_builder.parse_line(columns)
     self.assertEqual('〒', parsed.wordinfo.reading_form)
 def test_parse_line_invalid_columns(self):
     """A line with too few columns makes ``parse_line`` raise ``ValueError``.

     The expected error message is ``'invalid format'``.
     """
     dict_builder = DictionaryBuilder(logger=self.logger)
     # One column shorter than the lines accepted by the other tests.
     columns = '京都,6,6,5293,京都,名詞,固有名詞,地名,一般,*,*,キョウト,京都,*,A,*,*'.split(',')
     with self.assertRaises(ValueError) as raised:
         dict_builder.parse_line(columns)
     self.assertEqual('invalid format', raised.exception.args[0])
 def test_parse_splitinfo_invalid_wordid(self):
     """Split ids are rejected on a builder with no entries added.

     ``parse_splitinfo`` must raise ``ValueError`` with the message
     ``'invalid word ID'``.
     """
     # No entries are added to the builder before parsing the ids.
     fresh_builder = DictionaryBuilder(logger=self.logger)
     with self.assertRaises(ValueError) as raised:
         fresh_builder.parse_splitinfo('1/2/3')
     self.assertEqual('invalid word ID', raised.exception.args[0])
 def test_decode(self):
     """``decode`` expands ``\\uXXXX`` and ``\\u{...}`` escapes to characters."""
     dict_builder = DictionaryBuilder(logger=self.logger)
     # (escaped input, expected decoded text)
     cases = [
         ('a\\u002cc', 'a,c'),
         ('a\\u{002c}c', 'a,c'),
         ('a\\u{20b9f}c', 'a𠮟c'),
     ]
     for escaped, expected in cases:
         self.assertEqual(expected, dict_builder.decode(escaped))
 def test_convert_postable(self):
     """``convert_postable`` advances the byte buffer by the expected size.

     Two part-of-speech lines of six one-character fields each are
     converted; the buffer position afterwards must be ``2 + 3 * 12``.
     """
     dict_builder = DictionaryBuilder(logger=self.logger)
     pos_lines = ['a,b,c,d,e,f', 'g,h,i,j,k,l']
     dict_builder.convert_postable(pos_lines)
     # NOTE(review): presumably a 2-byte count plus 3 bytes for each of
     # the 12 one-character strings -- confirm against convert_postable.
     expected_position = 2 + 3 * 12
     self.assertEqual(expected_position, dict_builder.byte_buffer.tell())
示例#16
0
 def test_parse_line_toomany_split(self):
     """A very long split column makes ``parse_line`` raise ``ValueError``.

     The expected error message is ``'too many units'``.
     """
     dict_builder = DictionaryBuilder(logger=self.logger)
     # The split column lists a long run of slash-separated word ids.
     line = '京都,6,6,5293,京都,名詞,固有名詞,地名,一般,*,*,キョウト,京都,*,B,0/1/2/3/4/5/6/7/8/9/0/0/1/2/3/4/5/6/7/8/9/0/0/1/2/3/4/5/6/7/8/9/0/0/1/2/3/4/5/6/7/8/9/0/0/1/2/3/4/5/6/7/8/9/0/0/1/2/3/4/5/6/7/8/9/0/0/1/2/3/4/5/6/7/8/9/0/0/1/2/3/4/5/6/7/8/9/0/0/1/2/3/4/5/6/7/8/9/0/0/1/2/3/4/5/6/7/8/9/0/0/1/2/3/4/5/6/7/8/9/0/0/1/2/3/4/5/6/7/8/9/0/0/1/2/3/4/5/6/7/8/9/0/0/1/2/3/4/5/6/7/8/9/0/0/1/2/3/4/5/6/7/8/9/0/0/1/2/3/4/5/6/7/8/9/0/0/1/2/3/4/5/6/7/8/9/0/0/1/2/3/4/5/6/7/8/9/0/0/1/2/3/4/5/6/7/8/9/0/0/1/2/3/4/5/6/7/8/9/0/0/1/2/3/4/5/6/7/8/9/0/0/1/2/3/4/5/6/7/8/9/0/0/1/2/3/4/5/6/7/8/9/0,*,*'
     columns = line.split(',')
     with self.assertRaises(ValueError) as raised:
         dict_builder.parse_line(columns)
     self.assertEqual('too many units', raised.exception.args[0])