Пример #1
0
    def loadDict(self, filename, language):
        """Parse a CC-CEDICT file and keep its entries on the instance.

        The file is read with the cedict parser
        (https://github.com/marcanuy/cedict_utils).

        Args:
            filename: path of the CC-CEDICT file handed to the parser.
            language: value stored unchanged in ``self._dictlang``.
        """
        self._dictentries = CedictParser(file_path=filename).parse()
        self._dictlang = language
Пример #2
0
    def loadDict(self, filename):
        """Load a CC-CEDICT file into ``self._dict`` as a list of plain dicts.

        Each parsed entry becomes a dict with the keys ``traditional``,
        ``simplified``, ``pinyin`` and ``meaning``. Only the first meaning of
        an entry is kept, and "Bsp.:" example fragments are stripped from it.

        Args:
            filename: path of the CC-CEDICT file handed to the parser.
        """
        entries = CedictParser(file_path=filename).parse()

        # Both patterns are applied to every entry, so compile them once.
        # A "Bsp.: ..." fragment introduced by ";" or "/", up to the next
        # ";" or "/" (or the end of the string).
        separated_example = re.compile(r'([\;\/][\s]?Bsp.:[^\;\/]*)')
        # A parenthesised "(Bsp.: ...)" fragment.
        bracketed_example = re.compile(r'\(Bsp.:[^\)]*\)')

        self._dict = []

        for e in entries:
            # An entry without meanings yields an empty meaning instead of
            # raising IndexError.
            first_meaning = e.meanings[0] if e.meanings else ''

            meaning = separated_example.sub('', first_meaning)
            meaning = bracketed_example.sub('', meaning)

            self._dict.append({
                'traditional': e.traditional,
                'simplified': e.simplified,
                'pinyin': e.pinyin,
                'meaning': meaning,
            })
Пример #3
0
    def test_parse_entry(self):
        """Parsing a single dictionary line yields a ``CedictEntry``."""
        lines = [
            'K書 K书 [K shu1] /to cram (Taiwan,khè su 齧書, lit./ 啃書|啃书[ken3 shu1]/\n'
        ]
        parser = CedictParser(lines=lines)

        entries = parser.parse()

        # assertTrue(x, msg) takes its second argument as the failure message
        # and passes for any truthy x, so the type has to be checked with
        # assertIsInstance.
        self.assertIsInstance(entries[0], CedictEntry)
Пример #4
0
def create_dicts():
    """Build a lookup from traditional headword to pinyin.

    Parses the CC-CEDICT file ``cedict_1_0_ts_utf-8_mdbg.txt``; the path is
    passed to the parser as given (presumably resolved against the working
    directory -- confirm against the caller).

    Returns:
        dict mapping each entry's ``traditional`` form to its ``pinyin``.
        When several entries share a traditional form, the last one parsed
        wins.
    """
    from cedict_utils.cedict import CedictParser

    parser = CedictParser()
    parser.read_file("cedict_1_0_ts_utf-8_mdbg.txt")
    return {entry.traditional: entry.pinyin for entry in parser.parse()}
Пример #5
0
    def cleanDict(self):
        """Strip example fragments from all entries and re-parse them.

        Every entry's ``raw_line`` has its "Bsp.:" example fragments removed
        (a fragment is only removed when another ";" or "/" follows it). The
        cleaned lines are then parsed again and the result replaces
        ``self._dictentries``.
        """
        example_fragment = re.compile(r'([\;\/][\s]?Bsp.:[^\;\/]*)(?=[\;\/])')

        cleaned_lines = []
        for entry in self._dictentries:
            # The existing entry object is updated too, not just the new list.
            entry.raw_line = example_fragment.sub('', entry.raw_line)
            cleaned_lines.append(entry.raw_line)

        self._dictentries = CedictParser(lines=cleaned_lines).parse()
Пример #6
0
    def test_remove_lines_with_comments(self):
        """``_filter_comments`` drops the ``#`` lines and keeps the entry."""
        entry_line = 'K書 K书 [K shu1] /to cram (Taiwan,khè su 齧書, lit./ 啃書|啃书[ken3 shu1]/\n'
        header_lines = [
            '# CC-CEDICT\n',
            '# Community Chinese-English dictionary.\n',
            '# \n',
            '# Creative Commons Attribution-Share Alike 3.0\n',
            '#! publisher=MDBG\n',
        ]
        parser = CedictParser(lines=header_lines + [entry_line])

        parser._filter_comments()

        # Only the dictionary entry is left, unchanged.
        self.assertEqual(len(parser.lines), 1)
        self.assertEqual(entry_line, parser.lines[0])
Пример #7
0
    def test_sanitize_lines(self):
        """Check that ``_sanitize`` applies all filters."""
        entry = 'K書 K书 [K shu1] /to cram (Taiwan,khè su 齧書, lit./ 啃書|啃书[ken3 shu1]/'
        raw_lines = [' ', '', '# \n', '#! publisher=MDBG\n', '\n', entry + '\n']
        parser = CedictParser(lines=raw_lines)

        parser._sanitize()

        # Only the entry remains, without its trailing newline.
        self.assertCountEqual(parser.lines, [entry])
Пример #8
0
    def test_remove_empty_lines(self):
        """``_filter_empty_entries`` drops blank lines but keeps comments."""
        entry_line = 'K書 K书 [K shu1] /to cram (Taiwan,khè su 齧書, lit./ 啃書|啃书[ken3 shu1]/\n'
        parser = CedictParser(
            lines=[' ', '', '# \n', '#! publisher=MDBG\n', '\n', entry_line]
        )

        parser._filter_empty_entries()

        # The space-only, empty and newline-only lines are gone; the rest
        # is left as it was.
        remaining = ['# \n', '#! publisher=MDBG\n', entry_line]
        self.assertCountEqual(parser.lines, remaining)
Пример #9
0
    def test_remove_new_lines(self):
        """``_filter_new_lines`` strips the trailing newline of every line."""
        without_newline = [
            '# CC-CEDICT',
            '# Community Chinese-English dictionary.',
            '# ',
            '#! publisher=MDBG',
            'K書 K书 [K shu1] /to cram (Taiwan,khè su 齧書, lit./ 啃書|啃书[ken3 shu1]/',
        ]
        # The input is the expected output with a newline appended to each line.
        parser = CedictParser(lines=[line + '\n' for line in without_newline])

        parser._filter_new_lines()

        self.assertCountEqual(parser.lines, without_newline)
Пример #10
0
def add_pinyin(predictions):
    """Look up one pinyin syllable for each item of *predictions*.

    For every item (each character when *predictions* is a string) the
    entries of ``cedict_1_0_ts_utf-8_mdbg.txt`` are scanned in parse order.
    The first entry whose traditional form contains the item supplies the
    result: the whitespace-separated pinyin token at the item's position in
    that traditional form. Items found in no entry are skipped, so the result
    can be shorter than the input.

    NOTE(review): picking the token by character position assumes the pinyin
    has exactly one token per character of the traditional form -- confirm
    this holds for the dictionary in use.

    Args:
        predictions: string, or other iterable of strings, to look up.

    Returns:
        list of pinyin tokens in input order.

    Raises:
        IndexError: if the matching entry has fewer pinyin tokens than the
            item's position requires.
    """
    from cedict_utils.cedict import CedictParser

    parser = CedictParser()
    parser.read_file("cedict_1_0_ts_utf-8_mdbg.txt")
    entries = parser.parse()

    pinyin_data = []
    for term in predictions:
        for entry in entries:
            position = entry.traditional.find(term)
            if position == -1:
                continue
            pinyin_data.append(entry.pinyin.split()[position])
            # Only the first matching entry is used for each item.
            break
    return pinyin_data