def loadDict(self, filename, language):
    """Load a CC-CEDICT dictionary file and keep its parsed entries.

    Parsing is delegated to the cedict-parser
    (https://github.com/marcanuy/cedict_utils); the resulting entry
    list and the dictionary language are stored on the instance.
    """
    self._dictlang = language
    self._dictentries = CedictParser(file_path=filename).parse()
def loadDict(self, filename):
    """Load a CC-CEDICT dictionary with the cedict parser and harmonize it.

    Each parsed entry is converted to a plain dict with the keys
    'traditional', 'simplified', 'pinyin' and 'meaning'; German example
    sentences ("Bsp.: ...") are stripped from the meaning text.  The
    harmonized list is stored in ``self._dict``.

    Args:
        filename: path of the CC-CEDICT file to parse.
    """
    entries = CedictParser(file_path=filename).parse()
    self._dict = []
    for e in entries:
        parsed = {
            'traditional': e.traditional,
            'simplified': e.simplified,
            'pinyin': e.pinyin,
            'meaning': '',
        }
        # Guard: skip meaning extraction for entries without any meaning
        # (the original indexed e.meanings[0] unconditionally and would
        # raise IndexError on such entries).
        if e.meanings:
            # Strip German examples ("; Bsp.: ..." / "/ Bsp.: ...") and
            # parenthesised short examples "(Bsp.: ...)".  The dots in
            # "Bsp." are escaped so they only match the literal string.
            meaning = re.sub(r'([\;\/][\s]?Bsp\.:[^\;\/]*)', '', e.meanings[0])
            meaning = re.sub(r'\(Bsp\.:[^\)]*\)', '', meaning)
            parsed['meaning'] = meaning
        self._dict.append(parsed)
def test_parse_entry(self):
    """A raw CC-CEDICT line should parse into a CedictEntry instance."""
    lines = [
        'K書 K书 [K shu1] /to cram (Taiwan,khè su 齧書, lit./ 啃書|啃书[ken3 shu1]/\n'
    ]
    parser = CedictParser(lines=lines)
    entries = parser.parse()
    # Bug fix: assertTrue(type(x), T) always passed because the second
    # argument is only the failure message; assertIsInstance performs
    # the intended type check.
    self.assertIsInstance(entries[0], CedictEntry)
def create_dicts():
    """Build a lookup from traditional characters to their pinyin.

    Parses the bundled CC-CEDICT file ("cedict_1_0_ts_utf-8_mdbg.txt")
    and returns a dict mapping each entry's traditional form to its
    pinyin transcription.

    Returns:
        dict: traditional form -> pinyin string.
    """
    from cedict_utils.cedict import CedictParser

    parser = CedictParser()
    parser.read_file("cedict_1_0_ts_utf-8_mdbg.txt")
    # The unused `simplified` dict and its commented-out fill were removed;
    # only the traditional -> pinyin mapping was ever returned.
    return {entry.traditional: entry.pinyin for entry in parser.parse()}
def cleanDict(self):
    """Strip German example sentences from raw lines and re-parse.

    Rewrites each entry's ``raw_line`` with "; Bsp.: ..." / "/ Bsp.: ..."
    examples removed, then reloads the cleaned lines through a fresh
    parser into ``self._dictentries``.
    """
    cleaned = []
    for e in self._dictentries:
        # Remove German examples.  The dots in "Bsp." are escaped so the
        # pattern matches only the literal marker; the lookahead keeps
        # the trailing ';' or '/' separator in place.
        e.raw_line = re.sub(r'([\;\/][\s]?Bsp\.:[^\;\/]*)(?=[\;\/])', '',
                            e.raw_line)
        cleaned.append(e.raw_line)
    parser = CedictParser(lines=cleaned)
    self._dictentries = parser.parse()
def test_remove_lines_with_comments(self):
    """_filter_comments drops '#'-prefixed lines and keeps real entries."""
    entry_line = 'K書 K书 [K shu1] /to cram (Taiwan,khè su 齧書, lit./ 啃書|啃书[ken3 shu1]/\n'
    comment_lines = [
        '# CC-CEDICT\n',
        '# Community Chinese-English dictionary.\n',
        '# \n',
        '# Creative Commons Attribution-Share Alike 3.0\n',
        '#! publisher=MDBG\n',
    ]
    parser = CedictParser(lines=comment_lines + [entry_line])
    parser._filter_comments()
    self.assertEqual(len(parser.lines), 1)
    self.assertEqual(entry_line, parser.lines[0])
def test_sanitize_lines(self):
    """Test if all filters are being applied by _sanitize.

    Blank, comment and newline noise must disappear, leaving only the
    stripped dictionary entry.
    """
    entry = 'K書 K书 [K shu1] /to cram (Taiwan,khè su 齧書, lit./ 啃書|啃书[ken3 shu1]/'
    noisy_input = [
        ' ',
        '',
        '# \n',
        '#! publisher=MDBG\n',
        '\n',
        entry + '\n',
    ]
    parser = CedictParser(lines=noisy_input)
    parser._sanitize()
    self.assertCountEqual(parser.lines, [entry])
def test_remove_empty_lines(self):
    """_filter_empty_entries drops blank/whitespace-only lines only."""
    entry = 'K書 K书 [K shu1] /to cram (Taiwan,khè su 齧書, lit./ 啃書|啃书[ken3 shu1]/\n'
    noisy_input = [' ', '', '# \n', '#! publisher=MDBG\n', '\n', entry]
    # Comment lines survive this filter; only empty entries are removed.
    survivors = ['# \n', '#! publisher=MDBG\n', entry]
    parser = CedictParser(lines=noisy_input)
    parser._filter_empty_entries()
    self.assertCountEqual(parser.lines, survivors)
def test_remove_new_lines(self):
    """_filter_new_lines strips the trailing newline from every line."""
    raw_lines = [
        '# CC-CEDICT\n',
        '# Community Chinese-English dictionary.\n',
        '# \n',
        '#! publisher=MDBG\n',
        'K書 K书 [K shu1] /to cram (Taiwan,khè su 齧書, lit./ 啃書|啃书[ken3 shu1]/\n',
    ]
    # Expected output is the same lines minus the terminating '\n'.
    without_newlines = [line.rstrip('\n') for line in raw_lines]
    parser = CedictParser(lines=raw_lines)
    parser._filter_new_lines()
    self.assertCountEqual(parser.lines, without_newlines)
def add_pinyin(predictions, entries=None):
    """Look up the pinyin syllable for each character in *predictions*.

    For every character, the first dictionary entry whose traditional
    form contains it is used; the syllable at the character's position
    within that entry's traditional form supplies the pinyin.

    Args:
        predictions: iterable of single characters to look up.
        entries: optional pre-parsed CC-CEDICT entries (objects with
            ``traditional`` and ``pinyin`` attributes).  When omitted,
            the bundled dictionary file is parsed, as before.

    Returns:
        list[str]: one pinyin syllable per matched character.
    """
    if entries is None:
        # Prepare the Cedict (Chinese dictionary) parser lazily so the
        # file is only read when the caller did not supply entries.
        from cedict_utils.cedict import CedictParser
        parser = CedictParser()
        parser.read_file("cedict_1_0_ts_utf-8_mdbg.txt")
        entries = parser.parse()

    pinyin_data = []
    for term in predictions:
        # Loop through the dictionary and take the pinyin of the first
        # entry containing this character.
        for entry in entries:
            characters = entry.traditional
            if term in characters:
                position = characters.index(term)
                syllables = entry.pinyin.split()
                # Guard: some entries carry fewer syllables than
                # characters; skip instead of raising IndexError.
                if position < len(syllables):
                    pinyin_data.append(syllables[position])
                break
    # NOTE: the original called search_term.replace(term, '') here and
    # discarded the result (strings are immutable) — a no-op, removed.
    return pinyin_data