def __merge_dict(_merged_dict, key, val, mode=0):
    """Merge one entry ``key -> {type: [values]}`` into ``_merged_dict``.

    The value dict is noise-filtered first; entries left empty are ignored.
    When ``mode`` is non-zero, already-present 'translation' lists are kept
    as-is instead of being extended. Returns ``_merged_dict``.
    """
    # drop noisy values first; bail out when nothing useful remains
    val = __filter_noise(val)
    if not __check_has_val(val):
        return _merged_dict

    # normalize the key the same way values are normalized
    key = process(key, pipeline)

    # unseen key: adopt the whole (filtered) value dict as-is
    if key not in _merged_dict:
        _merged_dict[key] = val
        return _merged_dict

    # existing key: merge per value type, de-duplicating items
    entry = _merged_dict[key]
    for value_type, values in val.items():
        existing = entry.get(value_type)
        if not existing:
            # no usable list yet for this type: take the new one wholesale
            entry[value_type] = values
            continue
        if mode != 0 and value_type == 'translation':
            # non-zero mode: preserve the translations already recorded
            continue
        for item in values:
            if item not in existing:
                existing.append(item)
    return _merged_dict
def __filter_noise(val):
    """Normalize every value list in ``val`` in place and drop noise.

    Each item is run through ``process(·, pipeline)`` and falsy results are
    removed. 'translation' lists with at least 5 candidates are additionally
    pruned: first entries containing '.', then — if at least 3 candidates of
    length >= 5 remain eligible — entries shorter than 5 characters. A key
    whose list would become empty keeps its original values. Returns ``val``.
    """
    for field, raw_values in val.items():
        cleaned = [process(x, pipeline) for x in raw_values]
        cleaned = [x for x in cleaned if x]
        if field == 'translation' and len(cleaned) >= 5:
            # plenty of candidates: drop those containing a dot
            cleaned = [x for x in cleaned if '.' not in x]
        if field == 'translation' and len(cleaned) >= 5:
            long_ones = [x for x in cleaned if len(x) >= 5]
            if len(long_ones) >= 3:
                # enough long candidates: keep only those
                cleaned = long_ones
        # keep the original (unfiltered) list when filtering removed everything
        if cleaned:
            val[field] = cleaned
    return val
nltk_dir = os.path.join(dictionary_dir, 'nltk') if not os.path.exists(nltk_dir): os.mkdir(nltk_dir) write_dict_path = os.path.join(nltk_dir, 'en_zh_dict_nltk.json') dictionary = dict() for word in wn.words(): word_details = {} syns = wn.synsets(word) src_synonyms = set() for x in syns: src_synonym = x.name().split(".")[0] if src_synonym != word: src_synonym = utils.process(src_synonym, utils.weak_pl) src_synonyms.add(src_synonym) src_meanings = list( filter( None, list( map( lambda x: utils.process(x.definition(), utils.weak_pl) if (word == x.name().split(".")[0]) else None, syns)))) pos = list( set( filter( None, list( map(
keep_bracket_content_pl = copy.deepcopy(utils.pipeline) keep_bracket_content_pl[-2] = utils.remove_bracket print('\ntraversing data ...\n') length = len(data) for i, v in enumerate(data): # show progress if i % 1000 == 0: progress = float(i + 1) / length * 100. print('\rprogress: %.2f%% ' % progress, end='') en_word = utils.process(v['en_word'], utils.pipeline) # en_meanings = utils.process(v['en_meanings'], utils.pipeline) en_meanings = utils.process(v['en_meanings'], keep_bracket_content_pl) zh_translation = utils.process(v['zh_translation'], utils.pipeline) pos = utils.process(v['pos'], utils.pipeline) if '.' in en_word: continue # if i % 1000 == 0: # print(f'\n{en_word:30s} | {en_meanings:20s} | {str(v["en_meanings"]).strip().lower():20s} | {zh_translation:40s} | {pos:20s} |') pos_list = [] if en_meanings: en_meanings = __reg_enter.sub(r' \1', en_meanings)
_dict[k] = {'translation': []} _dict[k]['translation'].append(v) for file_name in os.listdir(facebook_dir): file_path = os.path.join(facebook_dir, file_name) print(f'reading from {file_path} ...') suffix = os.path.splitext(file_name)[1].lower() if suffix == '.json': data = load_json(file_path) if file_name[:2].lower() == 'zh': for zh, val in data.items(): zh = process(zh, pipeline) ens = val['translation'] for en in ens: en = process(en, pipeline) __add_dict(zh_en_dict, zh, en) __add_dict(en_zh_dict, en, zh) else: for en, val in data.items(): en = process(en, pipeline) zhs = val['translation'] for zh in zhs: zh = process(zh, pipeline)
# Load the Facebook en-ro / ro-en bilingual word lists and merge every pair
# into both direction dictionaries (ro_en_dict and en_ro_dict).
for file_name in os.listdir(facebook_dir):
    if file_name not in ['en-ro.txt', 'ro-en.txt']:
        continue

    file_path = os.path.join(facebook_dir, file_name)
    print(f'reading from {file_path} ...')

    # read data: one tab-separated token pair per line
    lines = utils.read_lines(file_path)
    lines = [x.strip().split('\t') for x in lines]
    lines = [x for x in lines if x and len(x) == 2]

    # the file-name prefix tells which language is in the first column
    ro_first = file_name[:2].lower() == 'ro'
    for first, second in lines:
        ro, en = (first, second) if ro_first else (second, first)
        ro = process(ro, pipeline)
        en = process(en, pipeline)
        __add_dict(ro_en_dict, ro, en)
        __add_dict(en_ro_dict, en, ro)

print('filtering duplicate ...')
# Build the zh->en and en->zh dictionaries from a tab-separated file.
# Rows may contain slash-separated alternatives on both sides; those are
# aligned pairwise when the counts match, otherwise the row is dropped.
with open(file_name[0], encoding="utf-8") as f:
    print("reading file for ", file_name[0])
    tsv_reader = csv.reader(f, delimiter='\t')
    zh_en_dict = {}
    en_zh_dict = {}
    for record in tsv_reader:
        # only well-formed two-column rows are usable
        if len(record) != 2:
            continue
        zh_token, en_token = record
        # identical source/target strings carry no information
        if zh_token == en_token:
            continue
        zh_token = process(zh_token, pipeline)
        en_token = process(en_token, pipeline)
        if '/' in zh_token or '/' in en_token:
            # slash-separated alternatives: align them pairwise when the
            # counts on both sides agree; skip the row otherwise
            zh_parts = zh_token.split('/')
            en_parts = en_token.split('/')
            if len(zh_parts) == len(en_parts):
                for zh_part, en_part in zip(zh_parts, en_parts):
                    __add_dict(zh_en_dict, zh_part, en_part)
                    __add_dict(en_zh_dict, en_part, zh_part)
            continue
        __add_dict(zh_en_dict, zh_token, en_token)
        __add_dict(en_zh_dict, en_token, zh_token)