示例#1
0
def test_eq(vocab):
    """Check that Vocab equality follows the set of words, not their counts.

    Args:
        vocab: the reference vocabulary to compare against (presumably
            provided by a ``vocab`` fixture holding the same words --
            confirm against the test module).
    """
    tokens = 'zero one two two three three three'.split()
    other = Vocab('unk')
    other.update(tokens)
    # Built from the same token stream: expected to compare equal.
    assert other == vocab

    # Raising the count of an already-known word must not affect equality.
    other.add('zero', count=10)
    assert other == vocab  # equality doesn't depend on count

    # Introducing a word the reference lacks must break equality.
    other.add('four')
    assert other != vocab
示例#2
0
 def test_words2indices(self):
     """Check word -> index lookup on a frozen vocabulary.

     The three added words are expected at indices 1..3, and a word that
     was never added ('said') is expected to map to index 0.
     """
     tokens = ['i', 'like', 'pie']
     builder = Vocab('unk')
     builder.update(tokens)
     frozen = builder.freeze()

     known_indices = frozen.words2indices(tokens)
     assert known_indices == [1, 2, 3]

     mixed_indices = frozen.words2indices(['i', 'said'])
     assert mixed_indices == [1, 0]
示例#3
0
def test_eq(vocab):
    """Verify that two vocabularies compare by their words, ignoring counts.

    Args:
        vocab: the reference vocabulary to compare against (presumably
            provided by a ``vocab`` fixture holding the same words --
            confirm against the test module).
    """
    words = 'zero one two two three three three'.split()
    candidate = Vocab('unk')
    candidate.update(words)
    # Same words as the reference: the two should be equal.
    assert candidate == vocab

    # Only the count of 'zero' changes here, so equality must still hold.
    candidate.add('zero', count=10)
    assert candidate == vocab  # equality doesn't depend on count

    # 'four' is not in the reference, so the vocabularies now differ.
    candidate.add('four')
    assert candidate != vocab
示例#4
0
 def test_words2indices(self):
     """Verify that a frozen vocabulary converts words to indices.

     Added words are expected at indices 1..3; the never-added word
     'said' is expected to come back as index 0.
     """
     sentence = ['i', 'like', 'pie']
     vocabulary = Vocab('unk')
     vocabulary.update(sentence)
     vocabulary = vocabulary.freeze()

     expected_known = [1, 2, 3]
     assert vocabulary.words2indices(sentence) == expected_known

     expected_mixed = [1, 0]
     assert vocabulary.words2indices(['i', 'said']) == expected_mixed
示例#5
0
    # Path of the test split: <mydir>/SemEval2010_task8_testing_keys/TEST_FILE_FULL.TXT.
    test_file = os.path.join(mydir, 'SemEval2010_task8_testing_keys', 'TEST_FILE_FULL.TXT')
    logging.basicConfig(level=logging.INFO)

    # --- Stage 1: parse the raw files into train.json / test.json ---
    logging.info('starting preprocessing')
    # Reuse the JSON files from a previous run unless args['--force'] is set.
    if os.path.isfile('train.json') and os.path.isfile('test.json') and not args['--force']:
        logging.info('train.json and test.json already exists. Skipping proprocessing.')
    else:
        # A single English() instance is shared by both parse_file calls.
        nlp = English()
        # NOTE(review): the files are opened in binary mode ('wb'). On Python 3
        # json.dump writes str and would raise TypeError on a binary file --
        # confirm the target Python version.
        # train_file is not defined in this excerpt; presumably set earlier in
        # the enclosing function.
        with open('train.json', 'wb') as f:
            json.dump(parse_file(train_file, nlp), f, indent=2)
        with open('test.json', 'wb') as f:
            json.dump(parse_file(test_file, nlp), f, indent=2)

    # --- Stage 2: convert the parsed examples using the vocabularies ---
    logging.info('starting numericalization')
    word_vocab = SennaVocab()
    rel_vocab = Vocab()

    # Always read back from disk, whether the JSON was just written or reused.
    with open('train.json') as f:
        train = json.load(f)
    with open('test.json') as f:
        test = json.load(f)

    # First pass over the training data with add=True; the return value is
    # discarded, so this call is presumably made only to populate the two
    # vocabularies -- confirm against numericalize().
    numericalize(train, word_vocab, rel_vocab, add=True)
    # Prune and reorder the vocabularies before the passes whose results are kept.
    word_vocab = word_vocab.prune_rares(cutoff=2)
    word_vocab = word_vocab.sort_by_decreasing_count()
    rel_vocab = rel_vocab.sort_by_decreasing_count()
    # Second pass with add=False; train and test are rebound to the results.
    train = numericalize(train, word_vocab, rel_vocab, add=False)
    test = numericalize(test, word_vocab, rel_vocab, add=False)

    # Persist both vocabularies together in one pickle under the keys 'word' and 'rel'.
    with open('vocab.pkl', 'wb') as f:
        pkl.dump({'word': word_vocab, 'rel': rel_vocab}, f)
示例#6
0
 def test_indices2words(self):
     """Check index -> word lookup, including index 0.

     Indices 1..3 are expected to give back the added words in insertion
     order, and index 0 is expected to give back the 'unk' token.
     """
     lexicon = Vocab(unk='unk')
     lexicon.update(['i', 'like', 'pie'])
     expected = ['i', 'like', 'pie', 'unk']
     assert lexicon.indices2words([1, 2, 3, 0]) == expected
示例#7
0
def vocab():
    """Build the reference vocabulary shared by the tests.

    Returns:
        A ``Vocab`` created with 'unk' and updated with the tokens
        zero, one, two (x2) and three (x3).
    """
    tokens = 'zero one two two three three three'.split()
    reference = Vocab('unk')
    reference.update(tokens)
    return reference
示例#8
0
 def test_indices2words(self):
     """Verify that indices convert back to words, with 0 giving 'unk'.

     The words added via update() are expected at indices 1..3 in
     insertion order; index 0 is expected to yield the 'unk' token.
     """
     indices = [1, 2, 3, 0]
     vocabulary = Vocab(unk='unk')
     vocabulary.update(['i', 'like', 'pie'])
     recovered = vocabulary.indices2words(indices)
     assert recovered == ['i', 'like', 'pie', 'unk']
示例#9
0
def vocab():
    """Create a small ``Vocab`` with repeated words for use in tests.

    Returns:
        A ``Vocab`` created with 'unk' and fed the token stream
        'zero one two two three three three'.
    """
    built = Vocab('unk')
    word_stream = 'zero one two two three three three'.split()
    built.update(word_stream)
    return built