Example No. 1
def test_corpus_info_class():
    vi = vocabulary.VocabInfo()

    # Test base functionality
    try:
        assert vi['word1'] == 0
        assert vi['word2'] == 1
        assert vi['word3'] == 2
    except Exception:
        pytest.fail('Unexpected error')

    # Test proper data structure
    assert type(vi.dictionary) is dict

    # Test incrementing term frequency
    vi.increment_term_frequency('word1')
    assert vi.term_frequency('word1') == 1
    vi.increment_term_frequency('word1')
    assert vi.term_frequency('word1') == 2

    # Test incrementing document frequency
    vi.increment_doc_frequency('word1')
    assert vi.doc_frequency('word1') == 1
    vi.increment_doc_frequency('word1')
    assert vi.doc_frequency('word1') == 2

    # Test incrementing term frequency on unknown word
    vi.increment_term_frequency('word4')
    assert vi.term_frequency('word4') == 1

    # Test incrementing document frequency on unknown word
    vi.increment_doc_frequency('word5')
    assert vi.doc_frequency('word5') == 1

    # Test non-string keys
    with pytest.raises(TypeError):
        vi[int()]

    with pytest.raises(TypeError):
        vi[tuple((0, 0))]

    with pytest.raises(TypeError):
        vi[dict()]

    with pytest.raises(TypeError):
        vi[list()]
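
The VocabInfo class itself is not shown on this page, but the assertions above pin down most of its interface: unseen string keys are lazily assigned the next token id, each entry tracks a term frequency and a document frequency, and non-string keys raise TypeError. Below is a minimal sketch consistent with those assertions; the value layout [id, term frequency, doc frequency] is an assumption, and the real preprocess.vocabulary implementation may differ.

class VocabInfoSketch:
    """Hypothetical stand-in for vocabulary.VocabInfo, inferred from the test above."""

    ID, TERM_FREQ, DOC_FREQ = 0, 1, 2

    def __init__(self):
        # word -> [token id, term frequency, document frequency]
        self.dictionary = {}

    def __getitem__(self, word):
        if not isinstance(word, str):
            raise TypeError('VocabInfo keys must be strings')
        # Lazily assign the next token id to unseen words
        if word not in self.dictionary:
            self.dictionary[word] = [len(self.dictionary), 0, 0]
        return self.dictionary[word][self.ID]

    def increment_term_frequency(self, word):
        self[word]  # ensure the word has an entry
        self.dictionary[word][self.TERM_FREQ] += 1

    def term_frequency(self, word):
        return self.dictionary[word][self.TERM_FREQ]

    def increment_doc_frequency(self, word):
        self[word]
        self.dictionary[word][self.DOC_FREQ] += 1

    def doc_frequency(self, word):
        return self.dictionary[word][self.DOC_FREQ]
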
Example No. 2
def test_extract_documents():
    ed = importer.extract_documents

    subset = ['word1 word2 word3', 'word4 word5 word6']
    testable = False

    vinfo = v.VocabInfo()
    vinfo.dictionary = {word : [int(word[-1]), 1, 1] for doc in subset for word in doc.split()}
    reject = lambda x : x.endswith('6')
    corpus = 'testamazon'

    # This tests the reject function
    expected_documents = [[0, 1, 2], [3, 4]]
    expected_total_tokens = 5
    # The words keep their original ids in the full vocabulary, but the documents
    # are rewritten with new, contiguous token ids
    expected_small_vocab = {1:0, 2:1, 3:2, 4:3, 5:4}

    documents, total_tokens, small_vocab = ed(subset, testable, vinfo, reject, corpus)
    possible_targets = {0, 1}

    # Test correct documents and targets
    for i, doc in enumerate(documents):
        for j, token in enumerate(doc.tokens):
            assert token.token == expected_documents[i][j]
        assert doc.metadata['target'] in possible_targets

    # Test correct number of total tokens
    assert total_tokens == expected_total_tokens

    # Test correct small vocabulary
    assert expected_small_vocab == small_vocab

    s = "{\"reviewText\": \"word1 word2 word3\", \"overall\": 5.0}"
    subset = [s, s]
    testable = True

    # Never reject anything
    reject = lambda x : False

    expected_documents = [[0, 1, 2], [0, 1, 2]]
    expected_total_tokens = 6
    expected_small_vocab = {1:0, 2:1, 3:2}
    expected_target = [5, 5]

    # Test when testable is true
    documents, total_tokens, small_vocab = ed(subset, testable, vinfo, reject, corpus)
    for i, doc in enumerate(documents):
        for j, token in enumerate(doc.tokens):
            assert token.token == expected_documents[i][j]
        assert doc.metadata['target'] == expected_target[i]

    # Test rejecting everything
    s = "{\"reviewText\": \"word1 word2 word3\", \"overall\": 5.0}"
    subset = [s, s]
    testable = True

    # Reject everything
    reject = lambda x : True

    expected_documents = [[0, 1, 2], [0, 1, 2]]
    expected_total_tokens = 6
    expected_small_vocab = {1:0, 2:1, 3:2}
    expected_target = [5, 5]

    # Test when testable is true
    documents, total_tokens, small_vocab = ed(subset, testable, vinfo, reject, corpus)
    assert not documents
    assert not total_tokens
    assert not small_vocab

    # Test with stopword filtering
    s = '{"reviewText": "the and of", "overall": 5.0}'
    subset = [s, s]
    testable = True

    stop = v.generate_stopword_list()
    reject = lambda x : x in stop

    documents, total_tokens, small_vocab = ed(subset, testable, vinfo, reject, corpus)
    assert not documents
    assert not total_tokens
    assert not small_vocab

    # Make sure it works for reddit
    corpus = 'reddit'
    s = "{\"body\": \"word1 word2 word3\", \"score\": 5.0}"
    subset = [s, s]
    testable = True

    reject = lambda x : False

    expected_documents = [[0, 1, 2], [0, 1, 2]]
    expected_total_tokens = 6
    expected_small_vocab = {1:0, 2:1, 3:2}
    expected_target = [1, 1]

    documents, total_tokens, small_vocab = ed(subset, testable, vinfo, reject, corpus)
    for i, doc in enumerate(documents):
        for j, token in enumerate(doc.tokens):
            assert token.token == expected_documents[i][j]
        assert doc.metadata['target'] == expected_target[i]

    # Test with reddit and a score of 1.0 to ensure the target maps to 0
    s = '{"body": "word1 word2 word3", "score": 1.0}'
    subset = [s, s]
    testable = True

    reject = lambda x : False

    expected_documents = [[0, 1, 2], [0, 1, 2]]
    expected_total_tokens = 6
    expected_small_vocab = {1:0, 2:1, 3:2}
    expected_target = [0, 0]

    documents, total_tokens, small_vocab = ed(subset, testable, vinfo, reject, corpus)
    for i, doc in enumerate(documents):
        for j, token in enumerate(doc.tokens):
            assert token.token == expected_documents[i][j]
        assert doc.metadata['target'] == expected_target[i]

    # Test correct number of total tokens
    assert expected_total_tokens == total_tokens

    # Test correct small vocabulary
    assert small_vocab == expected_small_vocab

    # Test non-lists for subset parameter
    with pytest.raises(TypeError):
        ed(int(), testable, vinfo, reject, corpus)

    with pytest.raises(TypeError):
        ed(dict(), testable, vinfo, reject, corpus)

    with pytest.raises(TypeError):
        ed(set(), testable, vinfo, reject, corpus)

    with pytest.raises(TypeError):
        ed(tuple((0, 0)), testable, vinfo, reject, corpus)

    with pytest.raises(TypeError):
        ed(str(), testable, vinfo, reject, corpus)

    # Test empty list for subset
    with pytest.raises(AttributeError):
        ed([], testable, vinfo, reject, corpus)

    # Test non-booleans for parameter testable
    with pytest.raises(TypeError):
        ed(subset, int(), vinfo, reject, corpus)

    with pytest.raises(TypeError):
        ed(subset, dict(), vinfo, reject, corpus)

    with pytest.raises(TypeError):
        ed(subset, set(), vinfo, reject, corpus)

    with pytest.raises(TypeError):
        ed(subset, tuple((0, 0)), vinfo, reject, corpus)

    with pytest.raises(TypeError):
        ed(subset, str(), vinfo, reject, corpus)

    # Test non-VocabInfo values for the vocabulary parameter
    with pytest.raises(TypeError):
        ed(subset, testable, int(), reject, corpus)

    with pytest.raises(TypeError):
        ed(subset, testable, dict(), reject, corpus)

    with pytest.raises(TypeError):
        ed(subset, testable, set(), reject, corpus)

    with pytest.raises(TypeError):
        ed(subset, testable, tuple((0, 0)), reject, corpus)

    with pytest.raises(TypeError):
        ed(subset, testable, str(), reject, corpus)

    # Test non-string values for corpus
    with pytest.raises(TypeError):
        ed(subset, testable, vinfo, reject, int())

    with pytest.raises(TypeError):
        ed(subset, testable, vinfo, reject, dict())

    with pytest.raises(TypeError):
        ed(subset, testable, vinfo, reject, set())

    with pytest.raises(TypeError):
        ed(subset, testable, vinfo, reject, tuple((0, 0)))

    # Test empty string for corpus
    with pytest.raises(AttributeError):
        ed(subset, testable, vinfo, reject, '')
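
extract_documents is not listed here, but the expected_small_vocab values above reveal its re-mapping step: words that survive the reject filter keep their entry in the full vocabulary, while each document is rewritten with new, contiguous token ids. A rough sketch of that re-mapping follows, assuming vinfo.dictionary maps each word to [token id, term frequency, doc frequency] as in the setup above; it returns plain lists rather than the Document objects with metadata the real function builds.

def remap_tokens(texts, vinfo, reject):
    """Hypothetical sketch of the id re-mapping implied by expected_small_vocab above."""
    small_vocab = {}       # original token id -> new contiguous id
    documents = []
    total_tokens = 0
    for text in texts:
        tokens = []
        for word in text.split():
            if reject(word):
                continue
            original_id = vinfo.dictionary[word][0]
            if original_id not in small_vocab:
                small_vocab[original_id] = len(small_vocab)
            tokens.append(small_vocab[original_id])
            total_tokens += 1
        documents.append(tokens)
    return documents, total_tokens, small_vocab

Applied to the first subset above with the endswith('6') reject function, this yields [[0, 1, 2], [3, 4]], 5 total tokens, and {1: 0, 2: 1, 3: 2, 4: 3, 5: 4}, matching the expected values.
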
Example No. 3
def test_import_corpus_function():
    import pickle

    ic = importer.import_corpus

    # Corpus Setup
    notacorpus_dir = os.path.join(os.getenv('HOME'), '.preprocess/corpora/notacorpus')
    notacorpus_filepath = os.path.join(notacorpus_dir, 'notacorpus.txt.gz')

    assert not os.path.isdir(notacorpus_dir)
    assert not os.path.isfile(notacorpus_filepath)

    s = "word1 word2 word3 word4 word5"
    os.mkdir(notacorpus_dir)
    with gzip.open(notacorpus_filepath, 'wb') as f:
        f.write(f'{s}\n{s}'.encode('utf-8'))

    # Vocab Setup
    vocab_dir = os.path.join(os.getenv('HOME'), '.preprocess/vocabulary')
    notacorpus_vocab_dir = os.path.join(vocab_dir, 'notacorpus')
    notacorpus_vocab_fp = os.path.join(notacorpus_vocab_dir, 'notacorpus_vocabulary.pickle')

    assert not os.path.isdir(notacorpus_vocab_dir)
    assert not os.path.isfile(notacorpus_vocab_fp)

    vocab_dict = {word : [int(word[-1]), 2, int(word[-1])] for word in s.split()}
    vocab_info = v.VocabInfo()
    vocab_info.dictionary = vocab_dict

    os.mkdir(vocab_dir)
    os.mkdir(notacorpus_vocab_dir)
    with open(notacorpus_vocab_fp, 'wb') as f:
        pickle.dump(vocab_info, f)

    from preprocess import argmanager as arg
    import_corpus_dir = importer.import_corpus_dir
    assert not os.path.isdir(import_corpus_dir)
    for c in arg.valid_corpora:
        assert not os.path.isdir(os.path.join(import_corpus_dir, c))

    corpus = 'notacorpus'
    methods = []
    train_size = 1
    seed = 0
    test_size = 1

    imported_corpus_filename = importer.create_imported_corpus_filename(corpus, methods, train_size, seed)
    assert not os.path.isfile(imported_corpus_filename)

    # Test with no filtering
    train, test = ic(corpus, methods, train_size, seed, test_size=test_size)

    # Token ids are one lower than in the vocabulary because tokens are
    # re-assigned contiguous ids starting at 0
    expected_train = [[0, 1, 2, 3, 4]]
    expected_test = [[0, 1, 2, 3, 4]]
    expected_vocabulary = {1, 2, 3, 4, 5}
    expected_metadata = { 'total_tokens' : 5 }

    # Test correct pickle creation
    assert os.path.isfile(imported_corpus_filename)

    # Test correct directory creation
    assert os.path.isdir(import_corpus_dir)
    for c in arg.valid_corpora:
        assert os.path.isdir(os.path.join(import_corpus_dir, c))

    # Test correct train set
    assert train.vocabulary == expected_vocabulary
    assert train.metadata == expected_metadata
    for i, doc in enumerate(train.documents):
        assert not doc.metadata['target']
        for j, token in enumerate(doc.tokens):
            assert token.token == expected_train[i][j]

    # Test correct test set
    assert not test.vocabulary
    assert not test.metadata
    for i, doc in enumerate(test.documents):
        for j, token in enumerate(doc.tokens):
            assert token.token == expected_test[i][j]

    os.remove(imported_corpus_filename)
    assert not os.path.isfile(imported_corpus_filename)

    # Test with some filtering
    os.remove(notacorpus_filepath)
    assert not os.path.isfile(notacorpus_filepath)

    # Setup corpus
    notacorpus_filepath = os.path.join(notacorpus_dir, 'notacorpus_r3.txt.gz')
    with gzip.open(notacorpus_filepath, 'wb') as f:
        f.write(f'{s}\n{s}'.encode('utf-8'))

    # Setup vocab
    os.remove(notacorpus_vocab_fp)
    assert not os.path.isfile(notacorpus_vocab_fp)
    notacorpus_vocab_fp = os.path.join(notacorpus_vocab_dir, 'notacorpus_r3_vocabulary.pickle')
    with open(notacorpus_vocab_fp, 'wb') as f:
        pickle.dump(vocab_info, f)

    methods = ['r3']
    train, test = ic(corpus, methods, train_size, seed, test_size=test_size)

    imported_corpus_filename = importer.create_imported_corpus_filename(corpus, methods, train_size, seed)

    expected_train = [[0, 1, 2]]
    expected_test = [[0, 1, 2]]
    expected_vocabulary = {3, 4, 5}
    expected_metadata = {'total_tokens' : 3}

    assert os.path.isfile(imported_corpus_filename)
    os.remove(imported_corpus_filename)
    assert not os.path.isfile(imported_corpus_filename)

    # Test correct train set
    assert train.vocabulary == expected_vocabulary
    assert train.metadata == expected_metadata
    for i, doc in enumerate(train.documents):
        assert not doc.metadata['target']
        for j, token in enumerate(doc.tokens):
            assert token.token == expected_train[i][j]

    # Test correct test set
    assert not test.vocabulary
    assert not test.metadata
    for i, doc in enumerate(test.documents):
        for j, token in enumerate(doc.tokens):
            assert token.token == expected_test[i][j]

    # Test to ensure that words unique to test set get filtered
    tr = "word0 word1 word2 word3 word4 word5"

    # word6, word7, and word8 should get filtered because the vocabulary is built from the training text only
    te = "word3 word4 word5 word6 word7 word8"
    os.remove(notacorpus_filepath)
    assert not os.path.isfile(notacorpus_filepath)

    # Setup corpus
    notacorpus_filepath = os.path.join(notacorpus_dir, 'notacorpus.txt.gz')
    with gzip.open(notacorpus_filepath, 'wb') as f:
        f.write(f'{tr}\n{te}'.encode('utf-8'))

    # Setup vocab
    vocab_dict = {word : [int(word[-1]), 2, int(word[-1])] for word in tr.split()}
    vocab_info = v.VocabInfo()
    vocab_info.dictionary = vocab_dict

    os.remove(notacorpus_vocab_fp)
    assert not os.path.isfile(notacorpus_vocab_fp)
    notacorpus_vocab_fp = os.path.join(notacorpus_vocab_dir, 'notacorpus_vocabulary.pickle')
    with open(notacorpus_vocab_fp, 'wb') as f:
        pickle.dump(vocab_info, f)

    methods = []

    # Change the seed so that the documents keep their original order after shuffling
    seed = 1
    train, test = ic(corpus, methods, train_size, seed, test_size=test_size)

    imported_corpus_filename = importer.create_imported_corpus_filename(corpus, methods, train_size, seed)
    expected_train = [[0, 1, 2, 3, 4]]
    expected_test = [[2, 3, 4]]

    # We expect word0 (token 0) to be filtered because its frequency is 0
    expected_vocabulary = {1, 2, 3, 4, 5}
    expected_metadata = {'total_tokens' : 5}

    assert os.path.isfile(imported_corpus_filename)
    os.remove(imported_corpus_filename)
    assert not os.path.isfile(imported_corpus_filename)

    # Test correct train set
    assert train.vocabulary == expected_vocabulary
    assert train.metadata == expected_metadata
    for i, doc in enumerate(train.documents):
        assert not doc.metadata['target']
        for j, token in enumerate(doc.tokens):
            assert token.token == expected_train[i][j]

    # Test correct test set
    assert not test.vocabulary
    assert not test.metadata
    for i, doc in enumerate(test.documents):
        for j, token in enumerate(doc.tokens):
            assert token.token == expected_test[i][j]

    # Clean up
    for c in arg.valid_corpora:
        os.rmdir(os.path.join(import_corpus_dir, c))
        assert not os.path.isdir(os.path.join(import_corpus_dir, c))
    os.rmdir(import_corpus_dir)
    assert not os.path.isdir(import_corpus_dir)

    os.remove(notacorpus_filepath)
    os.rmdir(notacorpus_dir)
    assert not os.path.isfile(notacorpus_filepath)
    assert not os.path.isdir(notacorpus_dir)

    os.remove(notacorpus_vocab_fp)
    os.rmdir(notacorpus_vocab_dir)
    assert not os.path.isfile(notacorpus_vocab_fp)
    assert not os.path.isdir(notacorpus_vocab_dir)

    os.rmdir(vocab_dir)
    assert not os.path.isdir(vocab_dir)
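
The setup above implies the on-disk layout that import_corpus consumes: a gzipped text file with one document per line, and a pickled VocabInfo alongside it under ~/.preprocess. The following is a minimal reader for that layout only, not a sketch of the real import_corpus logic; the helper name is hypothetical.

import gzip
import pickle

def load_corpus_and_vocab(corpus_filepath, vocab_filepath):
    """Hypothetical helper that reads the file layout the test above creates."""
    # The corpus is a gzipped text file with one document per line
    with gzip.open(corpus_filepath, 'rt', encoding='utf-8') as f:
        lines = [line.strip() for line in f if line.strip()]
    # The vocabulary is a pickled VocabInfo object
    with open(vocab_filepath, 'rb') as f:
        vocab_info = pickle.load(f)
    return lines, vocab_info
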
Example No. 4
def test_sample_function():
    import pickle
    import numpy as np
    from preprocess import argmanager

    s = sample.sample
    sample_dir = os.path.join(os.getenv('HOME'), '.preprocess/samples')
    vocab_dir = os.path.join(os.getenv('HOME'), '.preprocess/vocabulary')
    notacorpus_vocab_dir = os.path.join(vocab_dir, 'notacorpus')
    assert not os.path.isdir(sample_dir)
    assert not os.path.isdir(vocab_dir)
    for c in argmanager.valid_corpora:
        assert not os.path.isdir(os.path.join(sample_dir, c))

    try:
        corpus = 'notacorpus'
        methods = []
        sample_filename = sample.create_sample_filename(corpus, methods)

        v = { 'word1' : [0, 2, 1], 'word2' : [1, 2, 1] }
        vocab = vocabulary.VocabInfo()
        vocab.dictionary = v

        # fake seed of 0
        os.mkdir(vocab_dir)
        os.mkdir(notacorpus_vocab_dir)
        vocab_filename = vocabulary.create_vocabulary_filename(corpus, methods, 0)
        with open(vocab_filename, 'wb') as f:
            pickle.dump(vocab, f)

        # Test directory creation
        assert not os.path.isfile(sample_filename)

        one_sample = s(corpus, methods)
        assert os.path.isdir(sample_dir)
        for c in argmanager.valid_corpora:
            assert os.path.isdir(os.path.join(sample_dir, c))

        assert os.path.isfile(sample_filename)

        # Test correct object shape
        assert len(one_sample.shape) == 2

        # Test pickle retrieval
        test_sample = np.asarray([[1, 2, 3, 4], [1, 2, 3, 4]])
        with open(sample_filename, 'wb') as f:
            pickle.dump(test_sample, f)

        one_sample = s(corpus, methods)

        assert test_sample.shape == one_sample.shape
        for i, val in enumerate(one_sample):
            assert set(val) == set(test_sample[i])

    except Exception:
        pytest.fail('Unexpected exception')

    finally:
        os.remove(sample_filename)
        os.remove(vocab_filename)
        for c in argmanager.valid_corpora:
            os.rmdir(os.path.join(sample_dir, c))
        os.rmdir(sample_dir)
        os.rmdir(notacorpus_vocab_dir)
        os.rmdir(vocab_dir)

        assert not os.path.isfile(sample_filename)
        assert not os.path.isfile(vocab_filename)
        for c in argmanager.valid_corpora:
            assert not os.path.isdir(os.path.join(sample_dir, c))
        assert not os.path.isdir(sample_dir)
        assert not os.path.isdir(vocab_dir)
        assert not os.path.isdir(notacorpus_vocab_dir)
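
The pickle-retrieval block above shows that sample() caches its result: once a pickle exists at the sample filename, it is loaded and returned instead of being recomputed. A sketch of that caching pattern follows; the helper and its arguments are assumptions, not the real preprocess.sample API.

import os
import pickle

def cached_sample(sample_filename, compute_sample):
    """Hypothetical caching pattern suggested by the pickle-retrieval test above."""
    if os.path.isfile(sample_filename):
        # A previously pickled sample (e.g. test_sample above) is returned as-is
        with open(sample_filename, 'rb') as f:
            return pickle.load(f)
    os.makedirs(os.path.dirname(sample_filename), exist_ok=True)
    result = compute_sample()
    with open(sample_filename, 'wb') as f:
        pickle.dump(result, f)
    return result
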
Example No. 5
def test_sample_from_vocabulary():
    from preprocess import sample

    sfv = sample.sample_from_vocabulary
    type_index = sample.TYPE_INDEX
    token_index = sample.TOKEN_INDEX
    types_of_measurement = sample.TYPES_OF_MEASUREMENT
    step_size = sample.DEFAULT_STEP_SIZE

    vocab = { 'word1' : [0, 2, 1], 'word2' : [1, 2, 1] }
    v = vocabulary.VocabInfo()
    v.dictionary = vocab
    num_samples = 5
    sample_size = 2000
    s = sfv(v, num_samples, sample_size)
    num_measurements = (sample_size // step_size) + 1

    # Test correct shape
    assert s.shape == (num_measurements, types_of_measurement)

    # Test correct number of tokens sampled
    assert s[-1][token_index] == sample_size + 1

    # Test correct number of types
    assert s[-1][type_index] == len(vocab)

    # Test that stopwords are not removed by default
    stopword_list = list(vocabulary.generate_stopword_list())
    vocab = { stopword_list[0] : [0, 2, 1], 'word2' : [1, 2, 1] }
    v = vocabulary.VocabInfo()
    v.dictionary = vocab
    s = sfv(v, num_samples, sample_size)

    # Test correct number of types without stopword filtering
    assert s[-1][type_index] == len(vocab)

    s = sfv(v, num_samples, sample_size, stop_filter=True)

    # Test correct number of types with stopword filtering
    assert s[-1][type_index] == (len(vocab) - 1)

    vocab = { 'word1' : [0, 2, 1], 'word2' : [1, 2, 2] }
    v = vocabulary.VocabInfo()
    v.dictionary = vocab
    s = sfv(v, num_samples, sample_size, rare_filter=2)

    # Test correct number of types with raised rare word filter
    assert s[-1][type_index] == (len(vocab) - 1)

    vocab = { 'word1' : [0, 2, 1], 'word2' : [1, 2, 1] }
    v = vocabulary.VocabInfo()
    v.dictionary = vocab
    s = sfv(v, num_samples, sample_size, rare_filter=2)

    # Test all words filtered when sufficiently high rare word filter
    assert not s[-1][type_index] and s[-1][token_index]

    # Test non-VocabInfo types
    with pytest.raises(TypeError):
        sfv(int(), num_samples, sample_size, rare_filter=1)

    with pytest.raises(TypeError):
        sfv(dict(), num_samples, sample_size, rare_filter=1)

    with pytest.raises(TypeError):
        sfv(set(), num_samples, sample_size, rare_filter=1)

    with pytest.raises(TypeError):
        sfv(tuple((0, 0)), num_samples, sample_size, rare_filter=1)

    with pytest.raises(TypeError):
        sfv(list(), num_samples, sample_size, rare_filter=1)

    # Test non-int type for num_samples
    with pytest.raises(TypeError):
        sfv(v, set(), sample_size)

    with pytest.raises(TypeError):
        sfv(v, list(), sample_size)

    with pytest.raises(TypeError):
        sfv(v, dict(), sample_size)

    with pytest.raises(TypeError):
        sfv(v, tuple((0, 0)), sample_size)

    # Test non-int type for sample_size
    with pytest.raises(TypeError):
        sfv(v, num_samples, list())

    with pytest.raises(TypeError):
        sfv(v, num_samples, set())

    with pytest.raises(TypeError):
        sfv(v, num_samples, dict())

    with pytest.raises(TypeError):
        sfv(v, num_samples, tuple((0, 0)))

    # Test non-bool type for stop_filter
    with pytest.raises(TypeError):
        sfv(v, num_samples, sample_size, stop_filter=int())

    with pytest.raises(TypeError):
        sfv(v, num_samples, sample_size, stop_filter=dict())

    with pytest.raises(TypeError):
        sfv(v, num_samples, sample_size, stop_filter=list())

    with pytest.raises(TypeError):
        sfv(v, num_samples, sample_size, stop_filter=tuple((0, 0)))

    with pytest.raises(TypeError):
        sfv(v, num_samples, sample_size, stop_filter=set())

    # Test non-int value for rare_filter
    with pytest.raises(TypeError):
        sfv(v, num_samples, sample_size, rare_filter=list())

    with pytest.raises(TypeError):
        sfv(v, num_samples, sample_size, rare_filter=set())

    with pytest.raises(TypeError):
        sfv(v, num_samples, sample_size, rare_filter=dict())

    with pytest.raises(TypeError):
        sfv(v, num_samples, sample_size, rare_filter=tuple((0, 0)))

    with pytest.raises(TypeError):
        sfv(v, num_samples, sample_size, rare_filter=bool())

    # Test 0 value for rare filter
    with pytest.raises(AttributeError):
        sfv(v, num_samples, sample_size, rare_filter=0)

    # Test negative value for rare filter
    with pytest.raises(AttributeError):
        sfv(v, num_samples, sample_size, rare_filter=-1)

    with pytest.raises(AttributeError):
        sfv(v, num_samples, sample_size, rare_filter=-100000)

    # Test negative value for num_samples
    with pytest.raises(AttributeError):
        sfv(v, -1, sample_size)

    with pytest.raises(AttributeError):
        sfv(v, -10000, sample_size)

    # Test negative values for sample_size
    with pytest.raises(AttributeError):
        sfv(v, num_samples, -1)

    with pytest.raises(AttributeError):
        sfv(v, num_samples, -10000)
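
The shape assertions above suggest that sample_from_vocabulary draws tokens from the vocabulary's term-frequency distribution and records the number of distinct types seen every DEFAULT_STEP_SIZE tokens, with the final row reporting sample_size + 1 tokens. Below is a heavily simplified sketch of that base case only, ignoring num_samples averaging and the stop/rare filters; the constant values are assumptions rather than the library's actual constants.

import numpy as np

TYPE_INDEX, TOKEN_INDEX = 0, 1     # assumed values of sample.TYPE_INDEX / sample.TOKEN_INDEX
DEFAULT_STEP_SIZE = 1000           # assumed value of sample.DEFAULT_STEP_SIZE

def growth_curve(vocab_info, sample_size, step_size=DEFAULT_STEP_SIZE):
    """Sketch of the type/token growth curve implied by the shape assertions above."""
    words = list(vocab_info.dictionary)
    freqs = np.array([entry[1] for entry in vocab_info.dictionary.values()], dtype=float)
    draws = np.random.choice(len(words), size=sample_size + 1, p=freqs / freqs.sum())

    rows = (sample_size // step_size) + 1
    measurements = np.zeros((rows, 2))
    for r in range(rows):
        tokens_drawn = r * step_size + 1
        measurements[r, TYPE_INDEX] = len(set(draws[:tokens_drawn]))
        measurements[r, TOKEN_INDEX] = tokens_drawn
    return measurements
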
Example No. 6
def test_categorical_creation():

    cc = sample.create_categorical

    # Test base functionality
    vocab = { 'word1' : [0, 2, 1], 'word2' : [1, 2, 1] }
    v = vocabulary.VocabInfo()
    v.dictionary = vocab
    expected_p_values = [.5, .5]
    expected_sample_values = ['word1', 'word2']

    sample_values, p_values = cc(v)
    assert expected_sample_values == sample_values
    for i, val in enumerate(expected_p_values):
        assert val == p_values[i]

    # Test different ratios
    vocab = { 'word1' : [0, 1, 1], 'word2' : [1, 3, 1] }
    v.dictionary = vocab

    expected_p_values = [.25, .75]
    sample_values, p_values = cc(v)

    assert sample_values == expected_sample_values
    for i, val in enumerate(expected_p_values):
        assert val == p_values[i]

    # Test more token types
    vocab = { 'word1' : [0, 1, 1], 'word2' : [1, 3, 1], 'word3' : [2, 4, 1] }
    v.dictionary = vocab

    expected_p_values = [.125, .375, .5]
    expected_sample_values = ['word1', 'word2', 'word3']
    sample_values, p_values = cc(v)

    assert sample_values == expected_sample_values
    for i, val in enumerate(expected_p_values):
        assert val == p_values[i]

    # Test with HashVocabInfo
    v = vocabulary.HashVocabInfo(6)
    v.dictionary = vocab

    expected_p_values = [.125, .375, .5]
    expected_sample_values = ['word1', 'word2', 'word3']
    sample_values, p_values = cc(v)

    assert sample_values == expected_sample_values
    for i, val in enumerate(expected_p_values):
        assert val == p_values[i]

    # Test non-VocabInfo types
    with pytest.raises(TypeError):
        cc(str())

    with pytest.raises(TypeError):
        cc(list())

    with pytest.raises(TypeError):
        cc(dict())

    with pytest.raises(TypeError):
        cc(set())

    with pytest.raises(TypeError):
        cc(tuple((0, 0)))

    with pytest.raises(TypeError):
        cc(int())
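
The expected p_values in this last example follow directly from the term frequencies stored at index 1 of each dictionary entry, normalized to sum to 1, with words ordered by their token id at index 0. A sketch consistent with those expectations; the real sample.create_categorical may differ in its details.

def create_categorical_sketch(vocab_info):
    """Hypothetical version of create_categorical matching the expected values above."""
    # Order words by their token id (index 0 of each dictionary entry)
    words = sorted(vocab_info.dictionary, key=lambda w: vocab_info.dictionary[w][0])
    # Normalize the term frequencies (index 1) into categorical probabilities
    total = sum(vocab_info.dictionary[w][1] for w in words)
    p_values = [vocab_info.dictionary[w][1] / total for w in words]
    return words, p_values

For {'word1': [0, 1, 1], 'word2': [1, 3, 1], 'word3': [2, 4, 1]} this gives ['word1', 'word2', 'word3'] and [0.125, 0.375, 0.5], matching the expected values above.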