Exemplo n.º 1
0
 def __init__(self, fname, exp_root):
     """Load the dataset and the pickled vocabulary artefacts.

     Args:
         fname: Path handed to ``read_json``; the result is stored in
             ``self.data``.
         exp_root: Directory that contains ``vocab.pkl``, ``word2idx.pkl``
             and ``idx2word.pkl``.

     Raises:
         OSError: If any of the pickle files cannot be opened.
     """
     self.data = read_json(fname)
     self.exp_root = exp_root
     # NOTE(review): pickle.load executes arbitrary code from the file;
     # only point exp_root at directories whose contents are trusted.
     # Each file is opened in a ``with`` block so the handle is closed
     # promptly instead of being left for the garbage collector.
     with open(os.path.join(exp_root, 'vocab.pkl'), 'rb') as fin:
         self.vocab = pickle.load(fin)
     with open(os.path.join(exp_root, 'word2idx.pkl'), 'rb') as fin:
         self.word2idx = pickle.load(fin)
     with open(os.path.join(exp_root, 'idx2word.pkl'), 'rb') as fin:
         self.idx2word = pickle.load(fin)
     self.get_longest()
Exemplo n.º 2
0
        left_part = title[:offset]
        right_part = title[offset + length:]
        left_part = [i.lower() for i in replace_symbols(left_part).split()]
        right_part = [i.lower() for i in replace_symbols(right_part).split()]

        avg_l += len(left_part)
        avg_r += len(right_part)

        # Filter the samples here? Not for now.
        # number, context, magnitude, idx for number
        instance = (number_str, left_part + right_part, magnitude,
                    len(left_part))

        if float(number_str) < 0:
            print(magnitude)

        samples.append(instance)

    return samples


if __name__ == '__main__':
    # Convert the raw article-title JSON into numeral-prediction samples
    # and store them as a pickle.
    filename = 'data/Numeracy_600K_article_title.json'
    # Not referenced in the visible part of this script; kept in case code
    # outside this view reads it as a module-level name.
    exp_dir = 'title_all'
    data = read_json(filename)
    numeral_prediction_data = to_numeral_prediction(data)
    # The context manager guarantees the output is flushed and the handle
    # closed even if pickle.dump raises part-way through.
    with open('numeral_prediction/data.pkl', 'wb') as fout:
        pickle.dump(numeral_prediction_data, fout)
import sys
sys.path.append('../')

from data_utils import read_json
from tqdm import tqdm
import pickle

if __name__ == '__main__':
    # Dump every training title into a plain-text corpus, one title per line.
    jsons = read_json('exps/title_all_correct/preproc.train.json')
    with open('train_embed/corpus.txt', 'w', encoding='utf-8') as fout:
        for js in tqdm(jsons):
            sent = js['title']
            # writelines() adds no line separators, so the original ran all
            # titles together into one unbroken line. Build the same text
            # writelines() would have emitted (a str as-is, a sequence of
            # strings concatenated) and terminate it with a newline.
            text = sent if isinstance(sent, str) else ''.join(sent)
            if not text.endswith('\n'):
                text += '\n'
            fout.write(text)