def test_eq(vocab):
    v = Vocab('unk')
    v.update('zero one two two three three three'.split())
    assert v == vocab
    v.add('zero', count=10)
    assert v == vocab  # equality doesn't depend on count
    v.add('four')
    assert v != vocab
def test_words2indices(self):
    v = Vocab('unk')
    words = ['i', 'like', 'pie']
    v.update(words)
    v = v.freeze()
    assert v.words2indices(words) == [1, 2, 3]
    assert v.words2indices(['i', 'said']) == [1, 0]
test_file = os.path.join(mydir, 'SemEval2010_task8_testing_keys', 'TEST_FILE_FULL.TXT')

logging.basicConfig(level=logging.INFO)
logging.info('starting preprocessing')

if os.path.isfile('train.json') and os.path.isfile('test.json') and not args['--force']:
    logging.info('train.json and test.json already exist. Skipping preprocessing.')
else:
    nlp = English()
    # json.dump writes text, so the files must be opened in text mode, not 'wb'.
    with open('train.json', 'w') as f:
        json.dump(parse_file(train_file, nlp), f, indent=2)
    with open('test.json', 'w') as f:
        json.dump(parse_file(test_file, nlp), f, indent=2)

logging.info('starting numericalization')
word_vocab = SennaVocab()
rel_vocab = Vocab()

with open('train.json') as f:
    train = json.load(f)
with open('test.json') as f:
    test = json.load(f)

# First pass: populate the vocabularies from the training set.
numericalize(train, word_vocab, rel_vocab, add=True)
word_vocab = word_vocab.prune_rares(cutoff=2)
word_vocab = word_vocab.sort_by_decreasing_count()
rel_vocab = rel_vocab.sort_by_decreasing_count()

# Second pass: map words and relations to their final indices.
train = numericalize(train, word_vocab, rel_vocab, add=False)
test = numericalize(test, word_vocab, rel_vocab, add=False)

with open('vocab.pkl', 'wb') as f:
    pkl.dump({'word': word_vocab, 'rel': rel_vocab}, f)
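# The two-pass call to numericalize above (first with add=True to grow the
# vocabularies, then with add=False after pruning and sorting to produce the
# final indices) could be implemented roughly as sketched below. This is a
# hedged illustration only: the example dict keys ('tokens', 'relation'), the
# function name, and the exact Vocab calls are assumptions, not the repo's
# actual helper.
def numericalize_sketch(examples, word_vocab, rel_vocab, add=False):
    out = []
    for ex in examples:
        if add:
            # Growing pass: count every token and register the relation label.
            word_vocab.update(ex['tokens'])
            rel_vocab.add(ex['relation'])
        out.append({
            'tokens': word_vocab.words2indices(ex['tokens']),
            'relation': rel_vocab.words2indices([ex['relation']])[0],
        })
    return out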
def test_indices2words(self):
    v = Vocab(unk='unk')
    v.update(['i', 'like', 'pie'])
    words = v.indices2words([1, 2, 3, 0])
    assert words == ['i', 'like', 'pie', 'unk']
@pytest.fixture
def vocab():
    v = Vocab('unk')
    v.update('zero one two two three three three'.split())
    return v