def test_basic_anti_stemming(self):
    """Test the basic anti_stemming functionality.

    Given a small set of possible stems, make sure that it generates
    the right ones.
    """
    num_rows = 1000
    gen = TextGenerator((StringIO("Running is trusted"), ))
    observed_words = {'RUNNING': 0, 'IS': 0, 'TRUSTED': 0}
    observed_stems = {'RUN': 0, 'TRUST': 0}
    for _ in xrange(num_rows):
        (stem, generated) = gen.generate_antistem(7, 0, 1)
        self.assertIn(generated, observed_words)
        self.assertIn(stem, observed_stems)
        observed_words[generated] += 1
        observed_stems[stem] += 1
    # Note: because of the way we weight words based on sentence position,
    # we expect a 2-to-1 ratio between 'run' and 'trust'
    lower = num_rows * 0.3
    upper = num_rows * 0.7
    for word in ('RUNNING', 'TRUSTED'):
        self.assertLess(observed_words[word], upper, observed_words)
        self.assertGreater(observed_words[word], lower, observed_words)
    for stem in ('RUN', 'TRUST'):
        self.assertLess(observed_stems[stem], upper, observed_stems)
        self.assertGreater(observed_stems[stem], lower, observed_stems)
    # 'is' has no stem entry, so it should never be produced here.
    self.assertEqual(observed_words['IS'], 0)
def test_basic_generation2(self):
    """Given 'this is this. this is that.' a 4 token sequence would have
    to be either 'this is this.' or 'this is that.'"""
    gen = TextGenerator((StringIO('This is this. This is that.'), ))
    gened_text = gen.generate(13)
    # Note: leading spaces are expected (due to the last-three-words
    # mechanism).  Two outcomes are possible; pick the goal matching the
    # word actually generated at position 5.
    goals = {
        'this': ([' ', 'This', ' ', 'is', ' ', 'this', '.'],
                 [None, 'THI', None, 'IS', None, 'THI', None],
                 [' ', 'THIS', ' ', 'IS', ' ', 'THIS', '.']),
        'that': ([' ', 'This', ' ', 'is', ' ', 'that', '.'],
                 [None, 'THI', None, 'IS', None, 'THAT', None],
                 [' ', 'THIS', ' ', 'IS', ' ', 'THAT', '.']),
    }
    key = 'this' if gened_text.word_list[5] == 'this' else 'that'
    (goal_words, goal_stems, goal_uppers) = goals[key]
    self.assertListEqual(gened_text.word_list, goal_words, self.seed_msg)
    self.assertListEqual(gened_text.stem_list, goal_stems, self.seed_msg)
    self.assertListEqual(gened_text.upper_word_list, goal_uppers,
                         self.seed_msg)
def test_multi_sentence(self):
    """Asking for more characters than one sentence holds should repeat
    the sentence."""
    sentence = 'This is it'
    gen = TextGenerator((StringIO(sentence), ))
    # Enough room for the sentence twice, plus periods and a separator.
    gened_text = gen.generate(2 * len(sentence) + 3)
    goal_words = ['This', ' ', 'is', ' ', 'it', '.',
                  ' ', 'This', ' ', 'is', ' ', 'it', '.']
    # Upper-casing leaves the space/period tokens unchanged.
    goal_uppers = [w.upper() for w in goal_words]
    goal_stems = ['THI', None, 'IS', None, 'IT', None,
                  None, 'THI', None, 'IS', None, 'IT', None]
    self.assertListEqual(gened_text.word_list, goal_words, self.seed_msg)
    self.assertListEqual(gened_text.stem_list, goal_stems, self.seed_msg)
    self.assertListEqual(gened_text.upper_word_list, goal_uppers,
                         self.seed_msg)
def test_to_string(self):
    """Given a very small sample of text that can only generate a single
    known sequence make sure that is in fact what we get."""
    sentence = 'This is it'
    gen = TextGenerator((StringIO(sentence), ))
    generated = gen.generate(len(sentence) + 1)
    # str() of the generated text should round-trip to the source
    # sentence (with the expected leading space and final period).
    self.assertEqual(str(generated), ' This is it.', self.seed_msg)
def test_basic_word_cardinality(self):
    """Word Cardinality: Given a small sample of text that has a few
    words of different lengths, check to make sure that is what we get
    and associated cardinality functions work."""
    gen = TextGenerator((StringIO("Yoda run is"), ))
    # Three distinct words overall.
    self.assertEqual(gen.word_cardinality(), 3, self.seed_msg)
    # Exactly one word each of lengths 2, 3, and 4; none of length 1.
    for (length, count) in ((1, 0), (2, 1), (3, 1), (4, 1)):
        self.assertEqual(gen.word_cardinality(length), count,
                         self.seed_msg)
def test_trigram(self):
    """Given a very small sample of text that can only generate a single
    known sequence, make sure we can only generate the correct
    trigram."""
    corpus = ('Running is discouraged. Running is discouraged. '
              'Running is encouraged.')
    gen = TextGenerator((StringIO(corpus), ))
    generated = gen.generate_trigram(0, 0.33)
    self.assertEqual(generated, "Running is encouraged", self.seed_msg)
def test_basic_stem_generation(self):
    """Stem Generation: Given a small sample of text that can only
    generate a single stem, check to make sure that is what we get and
    associated cardinality functions work."""
    gen = TextGenerator((StringIO("Runs running run"), ))
    observed = {'RUN': 0}
    # All three corpus words share the stem 'RUN', so every draw must
    # produce it (a KeyError here would mean a different stem appeared).
    for _ in xrange(100):
        observed[gen.generate_stem(3, 0, 1)] += 1
    self.assertEqual(observed['RUN'], 100, self.seed_msg)
def test_long_generation(self):
    """Test various properties about longer sentences to make sure they
    seem correct."""
    f = StringIO('This is the sentence one.\n\nThis is the second '
                 'sentence. This is the third sentence!')
    gen = TextGenerator((f, ))
    # Since this will generate several possible sentences we'll generate
    # lots of sentences and make sure only valid ones are produced.
    # We'll also check that in a run of 1000 all of the valid sentences
    # are produced at least once as that should pretty much always
    # happen.
    generated_sentences = set()
    for _ in xrange(1000):
        generated_sentences.add(str(gen.generate(60)))
    # Here are all the valid sentences one could produce given the above:
    valid_sentences = set(('This is the sentence one.',
                           'This is the second sentence.',
                           'This is the third sentence!'))
    # Since we are generating more than one sentence, we need to check
    # valid sentences against prefixes of generated text.
    for s in valid_sentences:
        self.assertTrue(
            any(g.startswith(s) for g in generated_sentences),
            'The following sentence was not observed:\n' + s + '\n' +
            self.seed_msg)
    for s in generated_sentences:
        self.assertTrue(
            any(s.startswith(v) for v in valid_sentences),
            'The following invalid sentence was observed:\n' + s + '\n' +
            self.seed_msg)
def test_corner_word_generation(self):
    """Given a standard input, see corner case behavior on word
    generation given a len not previously seen."""
    gen = TextGenerator((StringIO('Runs, running, run'), ))
    # Lengths that never occur in the corpus should raise KeyError.
    for unseen_length in (0, -1):
        self.assertRaises(KeyError, gen.generate_word, unseen_length,
                          0, 1)
def test_basic_stemming(self):
    """Given a very small sample of text that can only generate a single
    known sequence, make sure we get the expected stems."""
    sentence = 'Running is discouraged'
    gen = TextGenerator((StringIO(sentence), ))
    gened_text = gen.generate(len(sentence) + 1)
    goal_words = [' ', 'Running', ' ', 'is', ' ', 'discouraged', '.']
    # Upper-casing leaves the space/period tokens unchanged.
    goal_uppers = [w.upper() for w in goal_words]
    goal_stems = [None, 'RUN', None, 'IS', None, 'DISCOURAG', None]
    self.assertListEqual(gened_text.word_list, goal_words, self.seed_msg)
    self.assertListEqual(gened_text.stem_list, goal_stems, self.seed_msg)
    self.assertListEqual(gened_text.upper_word_list, goal_uppers,
                         self.seed_msg)
def test_basic_generation(self):
    """Given a very small sample of text that can only generate a single
    known sequence make sure that is in fact what we get."""
    sentence = 'This is it'
    gen = TextGenerator((StringIO(sentence), ))
    gened_text = gen.generate(len(sentence) + 1)
    # Note: leading space is intentional -- due to the way we add the
    # last three words, there will always be a space before the
    # third-to-last word.
    goal_words = [' ', 'This', ' ', 'is', ' ', 'it', '.']
    goal_stems = [None, 'THI', None, 'IS', None, 'IT', None]
    goal_uppers = [' ', 'THIS', ' ', 'IS', ' ', 'IT', '.']
    self.assertListEqual(gened_text.word_list, goal_words, self.seed_msg)
    self.assertListEqual(gened_text.stem_list, goal_stems, self.seed_msg)
    self.assertListEqual(gened_text.upper_word_list, goal_uppers,
                         self.seed_msg)
def test_basic_generation_range(self):
    """Check that we get a string in the range we expect."""
    gen = TextGenerator((StringIO('This is it'),))
    dist = TextDistribution(gen, 11, 22)
    # NOTE(review): only the upper bound (22) is asserted; the lower
    # bound (11) is never checked -- confirm whether the distribution
    # guarantees the minimum length.
    for _ in xrange(20):
        generated_str = str(dist.generate())
        self.assertLessEqual(len(generated_str), 22, self.seed_msg)
def test_basic_generation(self):
    """Given a very small sample of text that can only generate a single
    known sequence make sure we get what we expect."""
    gen = TextGenerator((StringIO('This is it'),))
    # Fixed-length distribution: min and max are both 11.
    dist = TextDistribution(gen, 11, 11)
    gened_text = dist.generate()
    goal_words = [' ', 'This', ' ', 'is', ' ', 'it', '.']
    goal_stems = [None, 'THI', None, 'IS', None, 'IT', None]
    goal_uppers = [' ', 'THIS', ' ', 'IS', ' ', 'IT', '.']
    self.assertListEqual(gened_text.word_list, goal_words, self.seed_msg)
    self.assertListEqual(gened_text.stem_list, goal_stems, self.seed_msg)
    self.assertListEqual(gened_text.upper_word_list, goal_uppers,
                         self.seed_msg)
def test_respect_word_boundaries(self):
    """Check that we only get entire words."""
    gen = TextGenerator((StringIO('this is it'),))
    dist = TextDistribution(gen, 200, 300)
    valid_words = set(['this', 'is', 'it', ' ', "."])
    # A long-enough generation must use every token, and nothing but
    # whole tokens from the corpus.
    for _ in xrange(20):
        seen = set(dist.generate().word_list)
        self.assertSetEqual(seen, valid_words, self.seed_msg)
def test_basic_word_generation(self):
    """Word Generation: Given a small sample of text that can only
    generate a small set of words, check to make sure that we have the
    right distributions.

    Fix: removed a leftover debug ``print observed`` statement that
    spammed test output.
    """
    f = StringIO('This is it. This was it.')
    gen = TextGenerator((f, ))
    observed = {'THIS': 0, 'IS': 0, 'IT': 0, 'WAS': 0}
    for _ in xrange(600):
        # Draw one word of each length present in the corpus.
        for length in (2, 3, 4):
            observed[gen.generate_word(length, 0, 1)] += 1
    # 'IS' and 'IT' are the only 2-letter words, so each should get
    # roughly half of the 600 length-2 draws.
    self.assertGreater(observed['IS'], 250, self.seed_msg)
    self.assertLess(observed['IS'], 350, self.seed_msg)
    self.assertGreater(observed['IT'], 250, self.seed_msg)
    self.assertLess(observed['IT'], 350, self.seed_msg)
    # 'WAS' is the only 3-letter word and 'THIS' the only 4-letter word,
    # so each is produced on every one of the 600 iterations.
    self.assertEqual(observed['THIS'], 600, self.seed_msg)
    self.assertEqual(observed['WAS'], 600, self.seed_msg)
def test_generate_alarm_words(self):
    """Check the alarm-word distributions of generated text.

    Verifies (1) the number of alarm words per generated text matches
    the constants for AlarmWordsDistribution.dist1, and (2) the distance
    between consecutive alarm words roughly matches
    AlarmWordsDistribution.dist3.

    Fixes: removed an unused local that shadowed the builtin ``str``;
    forced float division when computing proportions (plain ``/`` on two
    ints truncates to 0 under Python 2 unless ``from __future__ import
    division`` is in effect -- TODO confirm at file top); replaced the
    Python-2-only tuple-parameter lambda with portable code.
    """
    f = StringIO('This is the file. It has tokens in it')
    gen = TextGenerator((f,))
    dist = TextDistribution(gen, 1000, 2000, add_alarmwords=True)
    self.assertTrue(dist.alarmwords_enabled())
    alarmwords = dist.alarmwords()
    self.assertGreaterEqual(len(alarmwords), 2)
    # Will count the number of generated texts containing 0 alarmwords,
    # 1 alarmword, etc.
    num_alarmwords_counts = collections.Counter()
    # Will count the number of generated texts that have 2 alarmwords
    # with 0 characters between them, 2 alarmwords with distance 1
    # between them, etc.
    distance_counter = collections.Counter()
    # For each alarmword, count the number of generated-texts which
    # (1) have an alarmword, and (2) have that alarmword *first*
    first_alarmwords = collections.Counter()
    num_iterations = 10000
    for _ in xrange(num_iterations):
        generated_text = dist.generate()
        generated_str = generated_text.str()
        num_alarmwords = 0
        # update num_alarmwords_counts
        alarm_words_list = []
        for alarmword in alarmwords:
            if alarmword in generated_str:
                num_alarmwords += 1
                alarm_words_list.append(alarmword)
        # Don't double-count an alarmword that is a proper substring of
        # another observed alarmword.
        for alarmword in alarm_words_list:
            if any([alarmword in word and alarmword != word
                    for word in alarm_words_list]):
                num_alarmwords -= 1
        num_alarmwords_counts[num_alarmwords] += 1
        if num_alarmwords >= 1:
            indices_and_words = [(generated_str.find(aw), aw)
                                 for aw in alarmwords]
            # Keep only alarmwords actually present (find() returns -1
            # for absent words).
            indices_and_words = [pair for pair in indices_and_words
                                 if pair[0] >= 0]
            indices_and_words.sort()  # will sort by first component
                                      # of the contained tuples
            # update first_alarmwords
            (_, first_alarmword) = indices_and_words[0]
            first_alarmwords[first_alarmword] += 1
            if num_alarmwords >= 2:
                # update distance_counter
                for n in range(len(indices_and_words) - 1):
                    (first_index, alarmword) = indices_and_words[n]
                    (second_index, _) = indices_and_words[n + 1]
                    distance = second_index - first_index - len(alarmword)
                    distance_counter[distance] += 1
    # Okay, now check the distributions we just measured.
    # The values in observed_alarmword_counts should correspond
    # to the constants for AlarmWordsDistribution.dist1
    observed_alarmword_counts = set(num_alarmwords_counts.keys())
    self.assertSetEqual(observed_alarmword_counts, set([0, 1, 2]))
    for (num_alarmwords, expected_proportion) in [(0, 0.8), (1, 0.1),
                                                  (2, 0.1)]:
        observed_proportion = \
            float(num_alarmwords_counts[num_alarmwords]) / num_iterations
        self.assertGreater(observed_proportion, expected_proportion * 0.5)
        self.assertLess(observed_proportion, expected_proportion * 1.5)
    ## This is commented out because the distribution is no longer uniform
    ## which means as a result there is no easy way to compare the alarm
    ## word set which is now over 400 words
    """
    # The values in first_alarmwords should match the distribution of
    # AlarmWordsDistribution.dist2 (currently a uniform distribution
    observed_first_alarmwords = set(first_alarmwords.keys())
    self.assertSetEqual(observed_first_alarmwords, set(dist.alarmwords()))
    alarmword_row_count = sum([num_alarmwords_counts[1],
                               num_alarmwords_counts[2]])
    for alarmword in dist.alarmwords():
        expected_proportion = 1 / len(dist.alarmwords())
        observed_proportion = \
            first_alarmwords[alarmword] / alarmword_row_count
        self.assertLess(observed_proportion, expected_proportion * 1.5)
        self.assertGreater(observed_proportion, expected_proportion * 0.5)
    """
    # The values in distance_counter should correspond (roughly) to the
    # constants for AlarmWordsDistribution.dist3
    for (max_distance, expected_proportion) in [(25, 0.25), (50, 0.5),
                                                (100, .75), (200, 1)]:
        under_max_distance = 0
        for n in range(max_distance + 1):
            under_max_distance += distance_counter[n]
        observed_proportion = \
            float(under_max_distance) / num_alarmwords_counts[2]
        self.assertLess(observed_proportion, expected_proportion * 1.5)
        self.assertGreater(observed_proportion, expected_proportion * 0.5)
def test_dont_generate_alarm_words(self):
    """Alarm words must be disabled unless explicitly requested.

    Fix: removed an unused local (``str = '...' * 1000``) that shadowed
    the builtin ``str``.
    """
    f = StringIO('This is the file. It has tokens in it')
    gen = TextGenerator((f,))
    dist = TextDistribution(gen, 1000, 2000)
    self.assertFalse(dist.alarmwords_enabled())
def test_construction_multiple_files(self):
    """Same as the above but with multiple files."""
    # Smoke test: constructing from two files must not crash.
    sources = (StringIO('This is the file. It has tokens in it'),
               StringIO('This is the 2nd file.'))
    gen = TextGenerator(sources)
def test_construction_single_file(self):
    """Very simple test that just makes sure constructing the object
    with a single file doesn't crash."""
    source = StringIO('This is the file. It has tokens in it')
    gen = TextGenerator((source, ))