def test_corpus_info_class():
    vi = vocabulary.VocabInfo()

    # Test base functionality
    try:
        assert vi['word1'] == 0
        assert vi['word2'] == 1
        assert vi['word3'] == 2
    except Exception:
        pytest.fail('Unexpected error')

    # Test proper data structure
    assert type(vi.dictionary) is dict

    # Test incrementing term frequency
    vi.increment_term_frequency('word1')
    assert vi.term_frequency('word1') == 1
    vi.increment_term_frequency('word1')
    assert vi.term_frequency('word1') == 2

    # Test incrementing document frequency
    vi.increment_doc_frequency('word1')
    assert vi.doc_frequency('word1') == 1
    vi.increment_doc_frequency('word1')
    assert vi.doc_frequency('word1') == 2

    # Test incrementing term frequency on an unknown word
    vi.increment_term_frequency('word4')
    assert vi.term_frequency('word4') == 1

    # Test incrementing document frequency on an unknown word
    vi.increment_doc_frequency('word5')
    assert vi.doc_frequency('word5') == 1

    # Test non-string keys
    with pytest.raises(TypeError):
        vi[int()]
    with pytest.raises(TypeError):
        vi[tuple((0, 0))]
    with pytest.raises(TypeError):
        vi[dict()]
    with pytest.raises(TypeError):
        vi[list()]
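# For reference, a hypothetical sketch of the VocabInfo interface that
# test_corpus_info_class exercises. This is NOT the real vocabulary.VocabInfo,
# only a reconstruction inferred from the assertions above: __getitem__
# assigns sequential type ids to new string keys and raises TypeError for
# non-strings, and the frequency helpers create an entry for unknown words
# before incrementing.
class _VocabInfoSketch:
    _TYPE_ID, _TERM_FREQ, _DOC_FREQ = 0, 1, 2

    def __init__(self):
        # word -> [type id, term frequency, document frequency]
        self.dictionary = {}

    def __getitem__(self, word):
        if not isinstance(word, str):
            raise TypeError('vocabulary keys must be strings')
        if word not in self.dictionary:
            self.dictionary[word] = [len(self.dictionary), 0, 0]
        return self.dictionary[word][self._TYPE_ID]

    def increment_term_frequency(self, word):
        self[word]  # ensure the word has an entry
        self.dictionary[word][self._TERM_FREQ] += 1

    def increment_doc_frequency(self, word):
        self[word]  # ensure the word has an entry
        self.dictionary[word][self._DOC_FREQ] += 1

    def term_frequency(self, word):
        return self.dictionary[word][self._TERM_FREQ]

    def doc_frequency(self, word):
        return self.dictionary[word][self._DOC_FREQ]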
def test_extract_documents():
    ed = importer.extract_documents
    subset = ['word1 word2 word3', 'word4 word5 word6']
    testable = False
    vinfo = v.VocabInfo()
    vinfo.dictionary = {word: [int(word[-1]), 1, 1] for doc in subset for word in doc.split()}
    # This reject function exercises the rejection path: every token ending
    # in '6' is dropped.
    reject = lambda x: x.endswith('6')
    corpus = 'testamazon'
    expected_documents = [[0, 1, 2], [3, 4]]
    expected_total_tokens = 5
    # We keep the original token numbers but assign them different numbers
    # in the documents
    expected_small_vocab = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4}
    documents, total_tokens, small_vocab = ed(subset, testable, vinfo, reject, corpus)
    possible_targets = {0, 1}

    # Test correct documents and targets
    for i, doc in enumerate(documents):
        for j, token in enumerate(doc.tokens):
            assert token.token == expected_documents[i][j]
        assert doc.metadata['target'] in possible_targets

    # Test correct number of total tokens
    assert total_tokens == expected_total_tokens

    # Test correct small vocabulary
    assert expected_small_vocab == small_vocab

    s = '{"reviewText": "word1 word2 word3", "overall": 5.0}'
    subset = [s, s]
    testable = True
    # Always returns False
    reject = lambda x: type(total_tokens) is not int
    expected_documents = [[0, 1, 2], [0, 1, 2]]
    expected_total_tokens = 6
    expected_small_vocab = {1: 0, 2: 1, 3: 2}
    expected_target = [5, 5]

    # Test when testable is True
    documents, total_tokens, small_vocab = ed(subset, testable, vinfo, reject, corpus)
    for i, doc in enumerate(documents):
        for j, token in enumerate(doc.tokens):
            assert token.token == expected_documents[i][j]
        assert doc.metadata['target'] == expected_target[i]

    # Test rejecting everything
    s = '{"reviewText": "word1 word2 word3", "overall": 5.0}'
    subset = [s, s]
    testable = True
    # Always returns True
    reject = lambda x: type(total_tokens) is int
    expected_documents = [[0, 1, 2], [0, 1, 2]]
    expected_total_tokens = 6
    expected_small_vocab = {1: 0, 2: 1, 3: 2}
    expected_target = [5, 5]
    documents, total_tokens, small_vocab = ed(subset, testable, vinfo, reject, corpus)
    assert not documents
    assert not total_tokens
    assert not small_vocab

    # Test with stopword filtering
    s = '{"reviewText": "the and of", "overall": 5.0}'
    subset = [s, s]
    testable = True
    stop = v.generate_stopword_list()
    reject = lambda x: x in stop
    documents, total_tokens, small_vocab = ed(subset, testable, vinfo, reject, corpus)
    assert not documents
    assert not total_tokens
    assert not small_vocab

    # Make sure it works for the reddit corpus
    corpus = 'reddit'
    s = '{"body": "word1 word2 word3", "score": 5.0}'
    subset = [s, s]
    testable = True
    reject = lambda x: type(total_tokens) is not int
    expected_documents = [[0, 1, 2], [0, 1, 2]]
    expected_total_tokens = 6
    expected_small_vocab = {1: 0, 2: 1, 3: 2}
    expected_target = [1, 1]
    documents, total_tokens, small_vocab = ed(subset, testable, vinfo, reject, corpus)
    for i, doc in enumerate(documents):
        for j, token in enumerate(doc.tokens):
            assert token.token == expected_documents[i][j]
        assert doc.metadata['target'] == expected_target[i]

    # Test with reddit and a score of 1 to ensure the correct target
    s = '{"body": "word1 word2 word3", "score": 1.0}'
    subset = [s, s]
    testable = True
    reject = lambda x: type(total_tokens) is not int
    expected_documents = [[0, 1, 2], [0, 1, 2]]
    expected_total_tokens = 6
    expected_small_vocab = {1: 0, 2: 1, 3: 2}
    expected_target = [0, 0]
    documents, total_tokens, small_vocab = ed(subset, testable, vinfo, reject, corpus)
    for i, doc in enumerate(documents):
        for j, token in enumerate(doc.tokens):
            assert token.token == expected_documents[i][j]
        assert doc.metadata['target'] == expected_target[i]

    # Test correct number of total tokens
    assert expected_total_tokens == total_tokens

    # Test correct small vocabulary
    assert small_vocab == expected_small_vocab

    # Test non-list values for the subset parameter
    with pytest.raises(TypeError):
        ed(int(), testable, vinfo, reject, corpus)
    with pytest.raises(TypeError):
        ed(dict(), testable, vinfo, reject, corpus)
    with pytest.raises(TypeError):
        ed(set(), testable, vinfo, reject, corpus)
    with pytest.raises(TypeError):
        ed(tuple((0, 0)), testable, vinfo, reject, corpus)
    with pytest.raises(TypeError):
        ed(str(), testable, vinfo, reject, corpus)

    # Test an empty list for subset
    with pytest.raises(AttributeError):
        ed([], testable, vinfo, reject, corpus)

    # Test non-boolean values for the testable parameter
    with pytest.raises(TypeError):
        ed(subset, int(), vinfo, reject, corpus)
    with pytest.raises(TypeError):
        ed(subset, dict(), vinfo, reject, corpus)
    with pytest.raises(TypeError):
        ed(subset, set(), vinfo, reject, corpus)
    with pytest.raises(TypeError):
        ed(subset, tuple((0, 0)), vinfo, reject, corpus)
    with pytest.raises(TypeError):
        ed(subset, str(), vinfo, reject, corpus)

    # Test non-VocabInfo values for the vocabulary parameter
    with pytest.raises(TypeError):
        ed(subset, testable, int(), reject, corpus)
    with pytest.raises(TypeError):
        ed(subset, testable, dict(), reject, corpus)
    with pytest.raises(TypeError):
        ed(subset, testable, set(), reject, corpus)
    with pytest.raises(TypeError):
        ed(subset, testable, tuple((0, 0)), reject, corpus)
    with pytest.raises(TypeError):
        ed(subset, testable, str(), reject, corpus)

    # Test non-string values for corpus
    with pytest.raises(TypeError):
        ed(subset, testable, vinfo, reject, int())
    with pytest.raises(TypeError):
        ed(subset, testable, vinfo, reject, dict())
    with pytest.raises(TypeError):
        ed(subset, testable, vinfo, reject, set())
    with pytest.raises(TypeError):
        ed(subset, testable, vinfo, reject, tuple((0, 0)))

    # Test an empty string for corpus
    with pytest.raises(AttributeError):
        ed(subset, testable, vinfo, reject, '')
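# The reject argument to extract_documents is a plain word -> bool predicate:
# any token for which it returns True is dropped. A hedged sketch of how a
# realistic predicate might be composed (generate_stopword_list is the real
# helper used in the tests above; _make_reject and the threshold of 5 are
# purely illustrative and assume every word already has a vocabulary entry):
def _make_reject(vinfo, rare_threshold=5):
    stopwords = v.generate_stopword_list()

    def reject(word):
        # Drop stopwords, plus words seen fewer than rare_threshold times.
        return word in stopwords or vinfo.term_frequency(word) < rare_threshold

    return reject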
def test_import_corpus_function():
    import pickle
    ic = importer.import_corpus

    # Corpus setup
    notacorpus_dir = os.path.join(os.getenv('HOME'), '.preprocess/corpora/notacorpus')
    notacorpus_filepath = os.path.join(notacorpus_dir, 'notacorpus.txt.gz')
    assert not os.path.isdir(notacorpus_dir)
    assert not os.path.isfile(notacorpus_filepath)
    s = "word1 word2 word3 word4 word5"
    os.mkdir(notacorpus_dir)
    with gzip.open(notacorpus_filepath, 'wb') as f:
        f.write(f'{s}\n{s}'.encode('utf-8'))

    # Vocabulary setup
    vocab_dir = os.path.join(os.getenv('HOME'), '.preprocess/vocabulary')
    notacorpus_vocab_dir = os.path.join(vocab_dir, 'notacorpus')
    notacorpus_vocab_fp = os.path.join(notacorpus_vocab_dir, 'notacorpus_vocabulary.pickle')
    assert not os.path.isdir(notacorpus_vocab_dir)
    assert not os.path.isfile(notacorpus_vocab_fp)
    vocab_dict = {word: [int(word[-1]), 2, int(word[-1])] for word in s.split()}
    vocab_info = v.VocabInfo()
    vocab_info.dictionary = vocab_dict
    os.mkdir(vocab_dir)
    os.mkdir(notacorpus_vocab_dir)
    with open(notacorpus_vocab_fp, 'wb') as f:
        pickle.dump(vocab_info, f)

    from preprocess import argmanager as arg
    import_corpus_dir = importer.import_corpus_dir
    assert not os.path.isdir(import_corpus_dir)
    for c in arg.valid_corpora:
        assert not os.path.isdir(os.path.join(import_corpus_dir, c))

    corpus = 'notacorpus'
    methods = []
    train_size = 1
    seed = 0
    test_size = 1
    imported_corpus_filename = importer.create_imported_corpus_filename(corpus, methods, train_size, seed)
    assert not os.path.isfile(imported_corpus_filename)

    # Test with no filtering
    train, test = ic(corpus, methods, train_size, seed, test_size=test_size)
    # All tokens are reduced by 1 because of how token values are re-assigned
    expected_train = [[0, 1, 2, 3, 4]]
    expected_test = [[0, 1, 2, 3, 4]]
    expected_vocabulary = {1, 2, 3, 4, 5}
    expected_metadata = {'total_tokens': 5}

    # Test correct pickle creation
    assert os.path.isfile(imported_corpus_filename)

    # Test correct directory creation
    assert os.path.isdir(import_corpus_dir)
    for c in arg.valid_corpora:
        assert os.path.isdir(os.path.join(import_corpus_dir, c))

    # Test correct train set
    assert train.vocabulary == expected_vocabulary
    assert train.metadata == expected_metadata
    for i, doc in enumerate(train.documents):
        assert not doc.metadata['target']
        for j, token in enumerate(doc.tokens):
            assert token.token == expected_train[i][j]

    # Test correct test set
    assert not test.vocabulary
    assert not test.metadata
    for i, doc in enumerate(test.documents):
        for j, token in enumerate(doc.tokens):
            assert token.token == expected_test[i][j]

    os.remove(imported_corpus_filename)
    assert not os.path.isfile(imported_corpus_filename)

    # Test with some filtering
    os.remove(notacorpus_filepath)
    assert not os.path.isfile(notacorpus_filepath)

    # Set up the corpus
    notacorpus_filepath = os.path.join(notacorpus_dir, 'notacorpus_r3.txt.gz')
    with gzip.open(notacorpus_filepath, 'wb') as f:
        f.write(f'{s}\n{s}'.encode('utf-8'))

    # Set up the vocabulary
    os.remove(notacorpus_vocab_fp)
    assert not os.path.isfile(notacorpus_vocab_fp)
    notacorpus_vocab_fp = os.path.join(notacorpus_vocab_dir, 'notacorpus_r3_vocabulary.pickle')
    with open(notacorpus_vocab_fp, 'wb') as f:
        pickle.dump(vocab_info, f)

    methods = ['r3']
    train, test = ic(corpus, methods, train_size, seed, test_size=test_size)
    imported_corpus_filename = importer.create_imported_corpus_filename(corpus, methods, train_size, seed)
    expected_train = [[0, 1, 2]]
    expected_test = [[0, 1, 2]]
    expected_vocabulary = {3, 4, 5}
    expected_metadata = {'total_tokens': 3}
    assert os.path.isfile(imported_corpus_filename)
    os.remove(imported_corpus_filename)
    assert not os.path.isfile(imported_corpus_filename)

    # Test correct train set
    assert train.vocabulary == expected_vocabulary
    assert train.metadata == expected_metadata
    for i, doc in enumerate(train.documents):
        assert not doc.metadata['target']
        for j, token in enumerate(doc.tokens):
            assert token.token == expected_train[i][j]

    # Test correct test set
    assert not test.vocabulary
    assert not test.metadata
    for i, doc in enumerate(test.documents):
        for j, token in enumerate(doc.tokens):
            assert token.token == expected_test[i][j]

    # Test to ensure that words unique to the test set get filtered
    tr = "word0 word1 word2 word3 word4 word5"
    # word6, word7, and word8 should get filtered because the vocabulary
    # below is constructed from tr only
    te = "word3 word4 word5 word6 word7 word8"
    os.remove(notacorpus_filepath)
    assert not os.path.isfile(notacorpus_filepath)

    # Set up the corpus
    notacorpus_filepath = os.path.join(notacorpus_dir, 'notacorpus.txt.gz')
    with gzip.open(notacorpus_filepath, 'wb') as f:
        f.write(f'{tr}\n{te}'.encode('utf-8'))

    # Set up the vocabulary
    vocab_dict = {word: [int(word[-1]), 2, int(word[-1])] for word in tr.split()}
    vocab_info = v.VocabInfo()
    vocab_info.dictionary = vocab_dict
    os.remove(notacorpus_vocab_fp)
    assert not os.path.isfile(notacorpus_vocab_fp)
    notacorpus_vocab_fp = os.path.join(notacorpus_vocab_dir, 'notacorpus_vocabulary.pickle')
    with open(notacorpus_vocab_fp, 'wb') as f:
        pickle.dump(vocab_info, f)

    methods = []
    # Change the seed so that the shuffled documents keep the same order
    seed = 1
    train, test = ic(corpus, methods, train_size, seed, test_size=test_size)
    imported_corpus_filename = importer.create_imported_corpus_filename(corpus, methods, train_size, seed)
    expected_train = [[0, 1, 2, 3, 4]]
    expected_test = [[2, 3, 4]]
    # We expect word0 to be filtered because it has a frequency of 0
    expected_vocabulary = {1, 2, 3, 4, 5}
    expected_metadata = {'total_tokens': 5}
    assert os.path.isfile(imported_corpus_filename)
    os.remove(imported_corpus_filename)
    assert not os.path.isfile(imported_corpus_filename)

    # Test correct train set
    assert train.vocabulary == expected_vocabulary
    assert train.metadata == expected_metadata
    for i, doc in enumerate(train.documents):
        assert not doc.metadata['target']
        for j, token in enumerate(doc.tokens):
            assert token.token == expected_train[i][j]

    # Test correct test set
    assert not test.vocabulary
    assert not test.metadata
    for i, doc in enumerate(test.documents):
        for j, token in enumerate(doc.tokens):
            assert token.token == expected_test[i][j]

    # Clean up
    for c in arg.valid_corpora:
        os.rmdir(os.path.join(import_corpus_dir, c))
        assert not os.path.isdir(os.path.join(import_corpus_dir, c))
    os.rmdir(import_corpus_dir)
    assert not os.path.isdir(import_corpus_dir)
    os.remove(notacorpus_filepath)
    os.rmdir(notacorpus_dir)
    assert not os.path.isfile(notacorpus_filepath)
    assert not os.path.isdir(notacorpus_dir)
    os.remove(notacorpus_vocab_fp)
    os.rmdir(notacorpus_vocab_dir)
    assert not os.path.isfile(notacorpus_vocab_fp)
    assert not os.path.isdir(notacorpus_vocab_dir)
    os.rmdir(vocab_dir)
    assert not os.path.isdir(vocab_dir)
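# test_import_corpus_function hand-rolls the on-disk layout that
# import_corpus reads from under $HOME/.preprocess. A hedged helper sketch of
# that setup (the directory and file naming comes straight from the test
# above; the helper itself is hypothetical, not part of the package):
def _write_fake_corpus(name, lines, vocab_info, suffix=''):
    import pickle
    home = os.getenv('HOME')
    corpus_dir = os.path.join(home, '.preprocess/corpora', name)
    vocab_dir = os.path.join(home, '.preprocess/vocabulary', name)
    os.makedirs(corpus_dir, exist_ok=True)
    os.makedirs(vocab_dir, exist_ok=True)
    # The corpus is a single gzipped text file, one document per line.
    corpus_fp = os.path.join(corpus_dir, f'{name}{suffix}.txt.gz')
    with gzip.open(corpus_fp, 'wb') as f:
        f.write('\n'.join(lines).encode('utf-8'))
    # The vocabulary is a pickled VocabInfo stored alongside it.
    vocab_fp = os.path.join(vocab_dir, f'{name}{suffix}_vocabulary.pickle')
    with open(vocab_fp, 'wb') as f:
        pickle.dump(vocab_info, f)
    return corpus_fp, vocab_fp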
def test_sample_function():
    import pickle
    import numpy as np
    from preprocess import argmanager
    s = sample.sample

    sample_dir = os.path.join(os.getenv('HOME'), '.preprocess/samples')
    vocab_dir = os.path.join(os.getenv('HOME'), '.preprocess/vocabulary')
    notacorpus_vocab_dir = os.path.join(vocab_dir, 'notacorpus')
    assert not os.path.isdir(sample_dir)
    assert not os.path.isdir(vocab_dir)
    for c in argmanager.valid_corpora:
        assert not os.path.isdir(os.path.join(sample_dir, c))

    try:
        corpus = 'notacorpus'
        methods = []
        sample_filename = sample.create_sample_filename(corpus, methods)
        v = {
            'word1': [0, 2, 1],
            'word2': [1, 2, 1],
        }
        vocab = vocabulary.VocabInfo()
        vocab.dictionary = v
        os.mkdir(vocab_dir)
        os.mkdir(notacorpus_vocab_dir)
        # Fake seed of 0
        vocab_filename = vocabulary.create_vocabulary_filename(corpus, methods, 0)
        with open(vocab_filename, 'wb') as f:
            pickle.dump(vocab, f)

        # Test directory creation
        assert not os.path.isfile(sample_filename)
        one_sample = s(corpus, methods)
        assert os.path.isdir(sample_dir)
        for c in argmanager.valid_corpora:
            assert os.path.isdir(os.path.join(sample_dir, c))
        assert os.path.isfile(sample_filename)

        # Test correct object shape
        assert len(one_sample.shape) == 2

        # Test pickle retrieval
        test_sample = np.asarray([[1, 2, 3, 4], [1, 2, 3, 4]])
        with open(sample_filename, 'wb') as f:
            pickle.dump(test_sample, f)
        one_sample = s(corpus, methods)
        assert test_sample.shape == one_sample.shape
        for i, val in enumerate(one_sample):
            assert set(val) == set(test_sample[i])
    except Exception:
        pytest.fail('Unexpected exception')
    finally:
        os.remove(sample_filename)
        os.remove(vocab_filename)
        for c in argmanager.valid_corpora:
            os.rmdir(os.path.join(sample_dir, c))
        os.rmdir(sample_dir)
        os.rmdir(notacorpus_vocab_dir)
        os.rmdir(vocab_dir)
        assert not os.path.isfile(sample_filename)
        assert not os.path.isfile(vocab_filename)
        for c in argmanager.valid_corpora:
            assert not os.path.isdir(os.path.join(sample_dir, c))
        assert not os.path.isdir(sample_dir)
        assert not os.path.isdir(vocab_dir)
        assert not os.path.isdir(notacorpus_vocab_dir)
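# test_sample_function depends on sample.sample() caching its result: the
# first call computes a 2-D array and pickles it to the sample filename, and
# later calls reload that pickle (which is why pre-writing test_sample above
# changes what s(corpus, methods) returns). A minimal sketch of that
# read-through-cache pattern, assuming nothing about the real internals:
def _cached_sample(filename, compute):
    import pickle
    if os.path.isfile(filename):
        # Cache hit: reuse the stored array.
        with open(filename, 'rb') as f:
            return pickle.load(f)
    # Cache miss: compute once and store for next time.
    result = compute()
    with open(filename, 'wb') as f:
        pickle.dump(result, f)
    return result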
def test_sample_from_vocabulary():
    from preprocess import sample
    sfv = sample.sample_from_vocabulary
    type_index = sample.TYPE_INDEX
    token_index = sample.TOKEN_INDEX
    types_of_measurement = sample.TYPES_OF_MEASUREMENT
    step_size = sample.DEFAULT_STEP_SIZE
    vocab = {
        'word1': [0, 2, 1],
        'word2': [1, 2, 1],
    }
    v = vocabulary.VocabInfo()
    v.dictionary = vocab
    num_samples = 5
    sample_size = 2000
    s = sfv(v, num_samples, sample_size)
    num_measurements = (sample_size // step_size) + 1

    # Test correct shape
    assert s.shape == (num_measurements, types_of_measurement)

    # Test correct number of tokens sampled
    assert s[-1][token_index] == sample_size + 1

    # Test correct number of types
    assert s[-1][type_index] == len(vocab)

    # Test that stopwords are not removed by default
    stopword_list = list(vocabulary.generate_stopword_list())
    vocab = {
        stopword_list[0]: [0, 2, 1],
        'word2': [1, 2, 1],
    }
    v = vocabulary.VocabInfo()
    v.dictionary = vocab
    s = sfv(v, num_samples, sample_size)
    # Test correct number of types without stopword filtering
    assert s[-1][type_index] == len(vocab)

    s = sfv(v, num_samples, sample_size, stop_filter=True)
    # Test correct number of types with stopword filtering
    assert s[-1][type_index] == (len(vocab) - 1)

    vocab = {
        'word1': [0, 2, 1],
        'word2': [1, 2, 2],
    }
    v = vocabulary.VocabInfo()
    v.dictionary = vocab
    s = sfv(v, num_samples, sample_size, rare_filter=2)
    # Test correct number of types with a raised rare word filter
    assert s[-1][type_index] == (len(vocab) - 1)

    vocab = {
        'word1': [0, 2, 1],
        'word2': [1, 2, 1],
    }
    v = vocabulary.VocabInfo()
    v.dictionary = vocab
    s = sfv(v, num_samples, sample_size, rare_filter=2)
    # Test that all words are filtered when the rare word filter is
    # sufficiently high
    assert not s[-1][type_index] and s[-1][token_index]

    # Test non-VocabInfo types
    with pytest.raises(TypeError):
        sfv(int(), num_samples, sample_size, rare_filter=1)
    with pytest.raises(TypeError):
        sfv(dict(), num_samples, sample_size, rare_filter=1)
    with pytest.raises(TypeError):
        sfv(set(), num_samples, sample_size, rare_filter=1)
    with pytest.raises(TypeError):
        sfv(tuple((0, 0)), num_samples, sample_size, rare_filter=1)
    with pytest.raises(TypeError):
        sfv(list(), num_samples, sample_size, rare_filter=1)

    # Test non-int types for num_samples
    with pytest.raises(TypeError):
        sfv(v, set(), sample_size)
    with pytest.raises(TypeError):
        sfv(v, list(), sample_size)
    with pytest.raises(TypeError):
        sfv(v, dict(), sample_size)
    with pytest.raises(TypeError):
        sfv(v, tuple((0, 0)), sample_size)

    # Test non-int types for sample_size
    with pytest.raises(TypeError):
        sfv(v, num_samples, list())
    with pytest.raises(TypeError):
        sfv(v, num_samples, set())
    with pytest.raises(TypeError):
        sfv(v, num_samples, dict())
    with pytest.raises(TypeError):
        sfv(v, num_samples, tuple((0, 0)))

    # Test non-bool types for stop_filter
    with pytest.raises(TypeError):
        sfv(v, num_samples, sample_size, stop_filter=int())
    with pytest.raises(TypeError):
        sfv(v, num_samples, sample_size, stop_filter=dict())
    with pytest.raises(TypeError):
        sfv(v, num_samples, sample_size, stop_filter=list())
    with pytest.raises(TypeError):
        sfv(v, num_samples, sample_size, stop_filter=tuple((0, 0)))
    with pytest.raises(TypeError):
        sfv(v, num_samples, sample_size, stop_filter=set())

    # Test non-int values for rare_filter
    with pytest.raises(TypeError):
        sfv(v, num_samples, sample_size, rare_filter=list())
    with pytest.raises(TypeError):
        sfv(v, num_samples, sample_size, rare_filter=set())
    with pytest.raises(TypeError):
        sfv(v, num_samples, sample_size, rare_filter=dict())
    with pytest.raises(TypeError):
        sfv(v, num_samples, sample_size, rare_filter=tuple((0, 0)))
    with pytest.raises(TypeError):
        sfv(v, num_samples, sample_size, rare_filter=bool())

    # Test a zero value for rare_filter
    with pytest.raises(AttributeError):
        sfv(v, num_samples, sample_size, rare_filter=0)

    # Test negative values for rare_filter
    with pytest.raises(AttributeError):
        sfv(v, num_samples, sample_size, rare_filter=-1)
    with pytest.raises(AttributeError):
        sfv(v, num_samples, sample_size, rare_filter=-100000)

    # Test negative values for num_samples
    with pytest.raises(AttributeError):
        sfv(v, -1, sample_size)
    with pytest.raises(AttributeError):
        sfv(v, -10000, sample_size)

    # Test negative values for sample_size
    with pytest.raises(AttributeError):
        sfv(v, num_samples, -1)
    with pytest.raises(AttributeError):
        sfv(v, num_samples, -10000)
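# The shape assertions above reduce to simple arithmetic: one measurement row
# is recorded every DEFAULT_STEP_SIZE tokens, plus one initial row. With an
# illustrative step size of 100 (the real value lives in
# sample.DEFAULT_STEP_SIZE) and sample_size = 2000, that gives
# (2000 // 100) + 1 = 21 rows of TYPES_OF_MEASUREMENT columns.
def _expected_sample_shape(sample_size, step_size, types_of_measurement):
    # One row per step boundary, plus the initial measurement.
    return ((sample_size // step_size) + 1, types_of_measurement)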
def test_categorical_creation():
    cc = sample.create_categorical

    # Test base functionality
    vocab = {
        'word1': [0, 2, 1],
        'word2': [1, 2, 1],
    }
    v = vocabulary.VocabInfo()
    v.dictionary = vocab
    expected_p_values = [.5, .5]
    expected_sample_values = ['word1', 'word2']
    sample_values, p_values = cc(v)
    assert expected_sample_values == sample_values
    for i, val in enumerate(expected_p_values):
        assert val == p_values[i]

    # Test different ratios
    vocab = {
        'word1': [0, 1, 1],
        'word2': [1, 3, 1],
    }
    v.dictionary = vocab
    expected_p_values = [.25, .75]
    sample_values, p_values = cc(v)
    assert sample_values == expected_sample_values
    for i, val in enumerate(expected_p_values):
        assert val == p_values[i]

    # Test more token types
    vocab = {
        'word1': [0, 1, 1],
        'word2': [1, 3, 1],
        'word3': [2, 4, 1],
    }
    v.dictionary = vocab
    expected_p_values = [.125, .375, .5]
    expected_sample_values = ['word1', 'word2', 'word3']
    sample_values, p_values = cc(v)
    assert sample_values == expected_sample_values
    for i, val in enumerate(expected_p_values):
        assert val == p_values[i]

    # Test with HashVocabInfo
    v = vocabulary.HashVocabInfo(6)
    v.dictionary = vocab
    expected_p_values = [.125, .375, .5]
    expected_sample_values = ['word1', 'word2', 'word3']
    sample_values, p_values = cc(v)
    assert sample_values == expected_sample_values
    for i, val in enumerate(expected_p_values):
        assert val == p_values[i]

    # Test non-VocabInfo types
    with pytest.raises(TypeError):
        cc(str())
    with pytest.raises(TypeError):
        cc(list())
    with pytest.raises(TypeError):
        cc(dict())
    with pytest.raises(TypeError):
        cc(set())
    with pytest.raises(TypeError):
        cc(tuple((0, 0)))
    with pytest.raises(TypeError):
        cc(int())
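# The expected p_values in test_categorical_creation are just normalized term
# frequencies (the middle element of each dictionary entry). A hedged sketch
# of that computation, assuming entries are ordered by their type id
# (element 0), which matches the expected_sample_values ordering above:
def _categorical_from_dict(dictionary):
    words = sorted(dictionary, key=lambda w: dictionary[w][0])
    total = sum(dictionary[w][1] for w in words)
    p_values = [dictionary[w][1] / total for w in words]
    return words, p_values

# For the "more token types" case, term frequencies 1, 3, and 4 sum to 8,
# giving p_values of 1/8 = .125, 3/8 = .375, and 4/8 = .5, as asserted above.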