# Preprocessing pipeline: parse the raw train/test files to JSON, build and
# prune vocabularies from the training split, numericalize both splits, and
# persist the vocabs (pickle) plus numericalized data (JSON) to disk.
nlp = English()

# Parse raw files into JSON-serializable records.
# FIX: json.dump writes str, so these files must be opened in text mode
# ('w'); the original 'wb' raises TypeError under Python 3.
with open('train.json', 'w') as f:
    json.dump(parse_file(train_file, nlp), f, indent=2)
with open('test.json', 'w') as f:
    json.dump(parse_file(test_file, nlp), f, indent=2)

logging.info('starting numericalization')

word_vocab = SennaVocab()
rel_vocab = Vocab()

with open('train.json') as f:
    train = json.load(f)
with open('test.json') as f:
    test = json.load(f)

# First pass over TRAIN ONLY with add=True: populates both vocabularies.
numericalize(train, word_vocab, rel_vocab, add=True)
# Drop words seen fewer than 2 times, then order both vocabs by frequency.
word_vocab = word_vocab.prune_rares(cutoff=2)
word_vocab = word_vocab.sort_by_decreasing_count()
rel_vocab = rel_vocab.sort_by_decreasing_count()

# Second pass with add=False: map tokens to indices using the frozen vocabs
# (unknown test-set tokens are handled by the vocab, not added).
train = numericalize(train, word_vocab, rel_vocab, add=False)
test = numericalize(test, word_vocab, rel_vocab, add=False)

# Pickle requires binary mode; JSON requires text mode.
with open('vocab.pkl', 'wb') as f:
    pkl.dump({'word': word_vocab, 'rel': rel_vocab}, f)
with open('trainXY.json', 'w') as f:
    json.dump(train, f, indent=2)
with open('testXY.json', 'w') as f:
    json.dump(test, f, indent=2)