def creating_alphabets(alphabet_path, alphabet_data_paths,
                       wiki_alphabet_data_paths, word_dict):
    """Build the main and the wiki alphabet dictionaries and log their sizes.

    Args:
        alphabet_path: Forwarded as the first argument of
            ``prepare_data.create_alphabets`` for the main alphabets.
        alphabet_data_paths: Mapping that must contain a ``'train'`` entry;
            the values of all other entries are passed as ``extra_paths``.
        wiki_alphabet_data_paths: Same layout as ``alphabet_data_paths``,
            used for the wiki alphabets.
        word_dict: Forwarded as ``embedd_dict`` to both
            ``prepare_data.create_alphabets`` calls.

    Returns:
        A tuple ``(alphabet_dict, wiki_alphabet_dict)``. Each dict holds the
        created alphabets under ``'alphabets'`` plus one
        ``'num_<prefix>'`` entry per alphabet, where ``<prefix>`` is the part
        of the alphabet's key before the first underscore and the value is
        that alphabet's ``size()``.

    Raises:
        KeyError: If either data-path mapping has no ``'train'`` entry.
    """
    train_paths = alphabet_data_paths['train']
    wiki_train_paths = wiki_alphabet_data_paths['train']  # [rram]
    extra_paths = [v for k, v in alphabet_data_paths.items() if k != 'train']
    # The wiki extra paths must come from the wiki mapping; reading them from
    # ``alphabet_data_paths`` would just duplicate ``extra_paths``.
    wiki_extra_paths = [
        v for k, v in wiki_alphabet_data_paths.items() if k != 'train'
    ]  # [rram]
    alphabet_dict = {}
    wiki_alphabet_dict = {}  # [rram]
    alphabet_dict['alphabets'] = prepare_data.create_alphabets(
        alphabet_path,
        train_paths,
        extra_paths=extra_paths,
        max_vocabulary_size=100000,
        embedd_dict=word_dict)
    # NOTE(review): ``wiki_alphabet_paths`` is neither a parameter nor a local
    # of this function, so it has to exist at module level or this call
    # raises NameError -- confirm where it is meant to come from.
    wiki_alphabet_dict['alphabets'] = prepare_data.create_alphabets(
        wiki_alphabet_paths,
        wiki_train_paths,
        extra_paths=wiki_extra_paths,
        max_vocabulary_size=100000,
        embedd_dict=word_dict)  # [rram]

    # Record and log the sizes of the main alphabets first, then the wiki
    # ones, so the log output keeps its original order.
    for target in (alphabet_dict, wiki_alphabet_dict):
        for k, v in target['alphabets'].items():
            num_key = 'num_' + k.split('_')[0]
            target[num_key] = v.size()
            logger.info("%s : %d" % (num_key, target[num_key]))

    # used to be alphabet_dict only # [rram]
    return alphabet_dict, wiki_alphabet_dict
# Example #2
0
def creating_alphabets(alphabet_path, alphabet_data_paths, word_dict):
    """Build the alphabet dictionary, dump it for debugging and log its sizes.

    Besides building the result, this prints the created alphabets to stdout
    and writes one ``(key, content)`` line per alphabet to
    ``debug_alphabets.txt`` in the current working directory.

    Args:
        alphabet_path: Forwarded as the first argument of
            ``prepare_data.create_alphabets``.
        alphabet_data_paths: Mapping that must contain a ``'train'`` entry;
            the values of all other entries are passed as ``extra_paths``.
        word_dict: Forwarded as ``embedd_dict`` to
            ``prepare_data.create_alphabets``.

    Returns:
        A dict holding the created alphabets under ``'alphabets'`` plus one
        ``'num_<prefix>'`` entry per alphabet, where ``<prefix>`` is the part
        of the alphabet's key before the first underscore and the value is
        that alphabet's ``size()``.

    Raises:
        KeyError: If ``alphabet_data_paths`` has no ``'train'`` entry.
    """
    train_paths = alphabet_data_paths['train']
    extra_paths = [v for k, v in alphabet_data_paths.items() if k != 'train']
    alphabet_dict = {}
    alphabet_dict['alphabets'] = prepare_data.create_alphabets(
        alphabet_path,
        train_paths,
        extra_paths=extra_paths,
        max_vocabulary_size=300000,  # rram 100K -> 300K or 400K
        embedd_dict=word_dict)
    print(alphabet_dict['alphabets'])  # rram - debugging

    # The context manager closes the debug file even if an alphabet raises
    # while being dumped; UTF-8 keeps the dump independent of the platform's
    # default encoding.
    with open('debug_alphabets.txt', 'w',
              encoding='utf-8') as fout:  # rram - debugging
        print('WRITING alphabets TO DEBUG_ALPHABETS.txt')  # rram - debugging

        for k, v in alphabet_dict['alphabets'].items():
            fout.write(f'({k}, {v.get_content()})\n')  # rram - debugging

            num_key = 'num_' + k.split('_')[0]
            alphabet_dict[num_key] = v.size()
            logger.info("%s : %d" % (num_key, alphabet_dict[num_key]))

    return alphabet_dict
# Example #3
0
def creating_alphabets(alphabet_path, alphabet_data_paths, word_dict):
    """Create the alphabets and collect their sizes in a single dictionary.

    Args:
        alphabet_path: Forwarded as the first argument of
            ``prepare_data.create_alphabets``.
        alphabet_data_paths: Mapping that must contain a ``'train'`` entry;
            the values of all other entries are passed as ``extra_paths``.
        word_dict: Forwarded as ``embedd_dict`` to
            ``prepare_data.create_alphabets``.

    Returns:
        A dict holding the created alphabets under ``'alphabets'`` plus one
        ``'num_<prefix>'`` entry per alphabet, where ``<prefix>`` is the part
        of the alphabet's key before the first underscore and the value is
        that alphabet's ``size()``.
    """
    train_paths = alphabet_data_paths['train']
    extra_paths = [
        paths
        for split_name, paths in alphabet_data_paths.items()
        if split_name != 'train'
    ]

    alphabets = prepare_data.create_alphabets(
        alphabet_path,
        train_paths,
        extra_paths=extra_paths,
        max_vocabulary_size=100000,
        embedd_dict=word_dict,
    )

    alphabet_dict = {'alphabets': alphabets}
    for name, alphabet in alphabets.items():
        # Only the part of the key before the first underscore names the size.
        prefix, _, _ = name.partition('_')
        num_key = 'num_' + prefix
        size = alphabet.size()
        alphabet_dict[num_key] = size
        logger.info("%s : %d" % (num_key, size))
    return alphabet_dict