Example #1
File: merge.py Project: SamuelLAN/DLM
def __merge_dict(_merged_dict, key, val, mode=0):
    # filter the noise and check if it still has values after filtering
    val = __filter_noise(val)
    if not __check_has_val(val):
        return _merged_dict

    # preprocess key
    key = process(key, pipeline)

    # if the dictionary does not contain this key
    if key not in _merged_dict:
        _merged_dict[key] = val
        return _merged_dict

    # if the dictionary contains this key
    for _type, l in val.items():
        if _type not in _merged_dict[key] or not _merged_dict[key][_type]:
            _merged_dict[key][_type] = l
            continue

        # in non-zero mode, don't merge additional translations once some exist
        if mode != 0 and _type == 'translation' and _merged_dict[key][_type]:
            continue

        # append only the values that are not already present (dedup merge)
        for i in l:
            if i not in _merged_dict[key][_type]:
                _merged_dict[key][_type].append(i)
        # _merged_dict[key][_type] += l

    return _merged_dict
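
A minimal driver for __merge_dict above (hypothetical sample data; __filter_noise, __check_has_val, process, and pipeline are assumed to pass these values through unchanged):

merged_dict = {}
merged_dict = __merge_dict(merged_dict, 'dog',
                           {'translation': ['狗', '犬'], 'pos': ['n']})
# merging the same key again only appends values that are not present yet
merged_dict = __merge_dict(merged_dict, 'dog', {'translation': ['犬', '小狗']})
# merged_dict['dog']['translation'] -> ['狗', '犬', '小狗']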
Example #2
File: merge.py Project: SamuelLAN/DLM
def __filter_noise(val):
    for k, l in val.items():
        # normalize every value through the processing pipeline, then drop empties
        l = list(map(lambda x: process(x, pipeline), l))
        l = list(filter(lambda x: x, l))

        # with many translation candidates, drop those containing '.'
        if k == 'translation' and len(l) >= 5:
            l = list(filter(lambda x: '.' not in x, l))

        # with many candidates left and at least 3 long ones, keep only the long ones
        if k == 'translation' and len(l) >= 5:
            long_vals = list(filter(lambda x: len(x) >= 5, l))
            if len(long_vals) >= 3:
                l = long_vals

        # if filtering removed everything, keep the original values untouched
        if not l:
            continue

        val[k] = l
    return val
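
A quick trace of the filtering rules (hypothetical data; process and pipeline are assumed to leave these strings unchanged):

noisy = {'translation': ['a.b', 'hi', 'longword1', 'longword2',
                         'longword3', 'longword4']}
# 6 candidates -> the '.'-containing one is dropped; of the remaining 5,
# 4 are at least 5 characters long, so only those survive
print(__filter_noise(noisy))
# {'translation': ['longword1', 'longword2', 'longword3', 'longword4']}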
Example #3
import os
import utils  # project-local helpers: process, weak_pl
from nltk.corpus import wordnet as wn

# dictionary_dir is assumed to be defined earlier in the script
nltk_dir = os.path.join(dictionary_dir, 'nltk')
if not os.path.exists(nltk_dir):
    os.mkdir(nltk_dir)
write_dict_path = os.path.join(nltk_dir, 'en_zh_dict_nltk.json')

dictionary = dict()
for word in wn.words():

    word_details = {}
    # synonyms: head words of the word's synsets, excluding the word itself
    syns = wn.synsets(word)
    src_synonyms = set()
    for x in syns:
        src_synonym = x.name().split(".")[0]
        if src_synonym != word:
            src_synonym = utils.process(src_synonym, utils.weak_pl)
            src_synonyms.add(src_synonym)

    # meanings: definitions of the synsets whose head word is this word
    src_meanings = [
        utils.process(x.definition(), utils.weak_pl)
        for x in syns if word == x.name().split(".")[0]
    ]
    src_meanings = list(filter(None, src_meanings))
    # the snippet is truncated below; a plausible completion, mirroring the
    # expression above (assumption: the POS tag is taken from x.pos()):
    pos = list(
        set(
            filter(
                None,
                map(lambda x: x.pos()
                    if word == x.name().split(".")[0] else None, syns))))
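
For reference, the WordNet calls used above return values like the following (requires nltk with the wordnet corpus downloaded):

from nltk.corpus import wordnet as wn

syns = wn.synsets('dog')
print(syns[0].name())        # 'dog.n.01'
print(syns[0].pos())         # 'n'
print(syns[0].definition())  # 'a member of the genus Canis ...'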
Example #4
File: ecdict.py Project: SamuelLAN/DLM

import copy
import utils  # project-local helpers: process, pipeline, remove_bracket

# copy the standard pipeline, swapping the bracket-handling step so that
# bracket content is kept (only the brackets themselves are removed)
keep_bracket_content_pl = copy.deepcopy(utils.pipeline)
keep_bracket_content_pl[-2] = utils.remove_bracket

print('\ntraversing data ...\n')

# data is the list of dictionary entries loaded earlier in the script
length = len(data)

for i, v in enumerate(data):
    # show progress
    if i % 1000 == 0:
        progress = float(i + 1) / length * 100.
        print('\rprogress: %.2f%% ' % progress, end='')

    en_word = utils.process(v['en_word'], utils.pipeline)
    # en_meanings = utils.process(v['en_meanings'], utils.pipeline)
    en_meanings = utils.process(v['en_meanings'], keep_bracket_content_pl)
    zh_translation = utils.process(v['zh_translation'], utils.pipeline)
    pos = utils.process(v['pos'], utils.pipeline)

    if '.' in en_word:
        continue

    # if i % 1000 == 0:
    #     print(f'\n{en_word:30s} | {en_meanings:20s} | {str(v["en_meanings"]).strip().lower():20s} | {zh_translation:40s} | {pos:20s} |')

    pos_list = []

    if en_meanings:
        # __reg_enter is a compiled regex defined earlier in the script
        en_meanings = __reg_enter.sub(r' \1', en_meanings)
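
process and pipeline come from the project's utils module, whose code is not shown here. A minimal sketch of the pattern the call sites suggest (an assumption, not the project's actual implementation): a pipeline is a list of string-transforming steps applied in order, which is consistent with this example replacing a pipeline step by index.

def process(text, pipeline):
    # apply each pipeline step to the string in order
    for step in pipeline:
        text = step(text)
    return text

pipeline = [str.strip, str.lower]
print(process('  Hello World ', pipeline))  # 'hello world'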
Example #5
# the head of this helper is truncated in the snippet; reconstructed from the
# visible body and the call sites below
def __add_dict(_dict, k, v):
    if k not in _dict:
        _dict[k] = {'translation': []}
    _dict[k]['translation'].append(v)


for file_name in os.listdir(facebook_dir):
    file_path = os.path.join(facebook_dir, file_name)
    print(f'reading from {file_path} ...')

    suffix = os.path.splitext(file_name)[1].lower()
    if suffix == '.json':
        data = load_json(file_path)

        # key direction depends on the file name's language prefix
        if file_name[:2].lower() == 'zh':

            for zh, val in data.items():
                zh = process(zh, pipeline)

                ens = val['translation']
                for en in ens:
                    en = process(en, pipeline)

                    __add_dict(zh_en_dict, zh, en)
                    __add_dict(en_zh_dict, en, zh)

        else:
            for en, val in data.items():
                en = process(en, pipeline)

                zhs = val['translation']
                for zh in zhs:
                    zh = process(zh, pipeline)

                    # truncated in the source snippet; completed by symmetry
                    # with the zh-prefixed branch above
                    __add_dict(zh_en_dict, zh, en)
                    __add_dict(en_zh_dict, en, zh)
Example #6
for file_name in os.listdir(facebook_dir):
    if file_name not in ['en-ro.txt', 'ro-en.txt']:
        continue

    file_path = os.path.join(facebook_dir, file_name)
    print(f'reading from {file_path} ...')

    # read data
    lines = utils.read_lines(file_path)
    lines = list(map(lambda x: x.strip().split('\t'), lines))
    lines = list(filter(lambda x: x and len(x) == 2, lines))

    # column order depends on the file name's language prefix
    if file_name[:2].lower() == 'ro':
        for ro, en in lines:
            ro = process(ro, pipeline)
            en = process(en, pipeline)

            __add_dict(ro_en_dict, ro, en)
            __add_dict(en_ro_dict, en, ro)

    else:
        for en, ro in lines:
            ro = process(ro, pipeline)
            en = process(en, pipeline)

            __add_dict(ro_en_dict, ro, en)
            __add_dict(en_ro_dict, en, ro)

print('filtering duplicate ...')
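
The strip/split/filter chain above keeps only well-formed two-column rows; a short trace:

lines = ['bună\thello\n', 'malformed line\n', '\n']
lines = list(map(lambda x: x.strip().split('\t'), lines))
lines = list(filter(lambda x: x and len(x) == 2, lines))
print(lines)  # [['bună', 'hello']]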
Example #7
    # file_name comes from an enclosing loop that is cut off in this snippet;
    # csv is the standard-library csv module
    with open(file_name[0], encoding="utf-8") as f:
        print("reading file for ", file_name[0])
        read = csv.reader(f, delimiter='\t')  # tab-separated zh/en pairs

        zh_en_dict = {}
        en_zh_dict = {}

        for row in read:
            if len(row) != 2:
                continue

            zh_token, en_token = row
            if zh_token == en_token:
                continue

            zh_token = process(zh_token, pipeline)
            en_token = process(en_token, pipeline)

            # tokens containing '/' list parallel alternatives; align them
            # pair-wise only when both sides have the same number of parts
            if '/' in zh_token or '/' in en_token:
                if len(zh_token.split('/')) == len(en_token.split('/')):
                    zh_tokens = zh_token.split('/')
                    en_tokens = en_token.split('/')

                    for j, _zh_token in enumerate(zh_tokens):
                        _en_token = en_tokens[j]
                        __add_dict(zh_en_dict, _zh_token, _en_token)
                        __add_dict(en_zh_dict, _en_token, _zh_token)
                continue

            __add_dict(zh_en_dict, zh_token, en_token)
            __add_dict(en_zh_dict, en_token, zh_token)
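
Given the __add_dict reconstruction in Example #5, the loop above builds nested dicts of this shape (hypothetical tokens):

zh_en_dict = {}
__add_dict(zh_en_dict, '狗', 'dog')
__add_dict(zh_en_dict, '狗', 'hound')
print(zh_en_dict)  # {'狗': {'translation': ['dog', 'hound']}}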