def test_options_case_sensitive(self):
    """Verify counts when only ``is_case_sensitive`` is switched off.

    Every other option keeps its default value. The same text is profiled
    once as a single row and once split across two rows; both layouts are
    expected to produce identical word and character counts.
    """
    opts = TextProfilerOptions()
    opts.is_case_sensitive = False

    # "test" and "Test" are expected under the single key "test", while the
    # character counts still keep "T" and "t" apart.
    word_count_expected = {"sentence": 1, "test": 2}
    vocab_expected = {
        "s": 5,
        " ": 5,
        "e": 5,
        "t": 4,
        "!": 3,
        "T": 2,
        "i": 2,
        "n": 2,
        "h": 1,
        ",": 1,
        "a": 1,
        "c": 1,
        ".": 1,
    }

    datasets = (
        ["This is test, a Test sentence.!!!"],  # input with one sample
        ["This is test,", " a Test sentence.!!!"],  # input with two samples
    )
    for rows in datasets:
        profiler = TextProfiler("Name", options=opts)
        profiler.update(pd.Series(rows))
        self.assertDictEqual(word_count_expected, profiler.word_count)
        self.assertDictEqual(vocab_expected, profiler.vocab_count)
def test_options_case_sensitive(self):
    """Check word and character counts with ``is_case_sensitive = False``.

    All remaining options are left at their defaults. The text is supplied
    first as one sample and then as two samples; the expected counts are the
    same for both.
    """
    options = TextProfilerOptions()
    options.is_case_sensitive = False

    def assert_counts(rows):
        """Profile ``rows`` with a fresh profiler and check both count tables."""
        fresh_profile = TextProfiler("Name", options=options)
        fresh_profile.update(pd.Series(rows))
        self.assertDictEqual(
            {'sentence': 1, 'test': 2},
            fresh_profile.word_count,
        )
        self.assertDictEqual(
            {
                's': 5, ' ': 5, 'e': 5, 't': 4, '!': 3, 'T': 2, 'i': 2,
                'n': 2, 'h': 1, ',': 1, 'a': 1, 'c': 1, '.': 1,
            },
            fresh_profile.vocab_count,
        )

    # input with one sample
    assert_counts(["This is test, a Test sentence.!!!"])
    # input with two samples
    assert_counts(["This is test,", " a Test sentence.!!!"])
def test_report(self):
    """Exercise ``TextProfiler.report`` in four scenarios.

    1. vocab and words both disabled, ``remove_disabled_flag=True``:
       neither key appears in the report.
    2. no options object, ``remove_disabled_flag=True``: both keys appear.
    3. no options object, default ``remove_disabled_flag``: both keys appear.
    4. vocab enabled, words disabled, ``remove_disabled_flag=True``:
       only "vocab" appears.
    """
    rows = ["This is test, a Test sentence.!!!"]

    def keys_of(text_profiler, **report_kwargs):
        """Return the report's keys as a list."""
        return list(text_profiler.report(**report_kwargs).keys())

    # 1) both features disabled and disabled entries removed
    both_off = TextProfilerOptions()
    both_off.vocab.is_enabled = False
    both_off.words.is_enabled = False
    profiler = TextProfiler("Name", both_off)
    profiler.update(pd.Series(rows))
    keys = keys_of(profiler, remove_disabled_flag=True)
    self.assertNotIn("vocab", keys)
    self.assertNotIn("words", keys)

    # 2) without TextProfilerOptions, remove_disabled_flag == True
    profiler = TextProfiler("Name")
    keys = keys_of(profiler, remove_disabled_flag=True)
    self.assertIn("vocab", keys)
    self.assertIn("words", keys)

    # 3) without TextProfilerOptions, remove_disabled_flag left at its default
    profiler = TextProfiler("Name")
    keys = keys_of(profiler)
    self.assertIn("vocab", keys)
    self.assertIn("words", keys)

    # 4) vocab enabled, words disabled, disabled entries removed
    words_off = TextProfilerOptions()
    words_off.vocab.is_enabled = True
    words_off.words.is_enabled = False
    profiler = TextProfiler("Name", words_off)
    profiler.update(pd.Series(rows))
    keys = keys_of(profiler, remove_disabled_flag=True)
    self.assertIn("vocab", keys)
    self.assertNotIn("words", keys)
def test_options_most_common_words_count(self):
    """Check how ``top_k_words`` limits the ``word_count`` entry of the profile.

    Stop words are set to an empty list so every word of the input is
    counted, which makes the expected tables easy to inspect. Scenarios:
    ``None`` (every word expected), 3, 2, and 10 (more than the number of
    distinct words, so every word is expected again).
    """
    rows = ["this is test,", " this is a test sentence", "this is", "this"]
    every_word = {'this': 4, 'is': 3, 'test': 2, 'a': 1, 'sentence': 1}
    opts = TextProfilerOptions()

    def word_count_for(series):
        """Profile ``series`` with the current options; return its word counts."""
        profiler = TextProfiler("Name", options=opts)
        profiler.update(series)
        return profiler.profile["word_count"]

    # None value for number of common words
    opts.top_k_words = None
    opts.stop_words = []
    sample = pd.Series(rows)
    self.assertDictEqual(every_word, word_count_for(sample))

    # set number of common words to 3
    opts.top_k_words = 3
    opts.stop_words = []
    sample = pd.Series(rows)
    self.assertDictEqual(
        {'this': 4, 'is': 3, 'test': 2},
        word_count_for(sample),
    )

    # change number of common words (the previous sample is reused)
    opts.top_k_words = 2
    self.assertDictEqual({'this': 4, 'is': 3}, word_count_for(sample))

    # number of common words greater than the number of distinct words
    opts.top_k_words = 10
    self.assertDictEqual(every_word, word_count_for(sample))
def test_options_default(self):
    """Check word and character counts with an untouched options object.

    The text is profiled as a single sample and again as two samples; both
    are expected to yield the same counts, with "Test" and "test" counted
    as separate words.
    """
    options = TextProfilerOptions()

    def verify(rows):
        """Profile ``rows`` with a new profiler and compare both count tables."""
        text_profile = TextProfiler("Name", options=options)
        text_profile.update(pd.Series(rows))
        self.assertDictEqual(
            {'sentence': 1, 'Test': 1, 'test': 1},
            text_profile.word_count,
        )
        self.assertDictEqual(
            {
                's': 5, ' ': 5, 'e': 5, 't': 4, '!': 3, 'T': 2, 'i': 2,
                'n': 2, 'h': 1, ',': 1, 'a': 1, 'c': 1, '.': 1,
            },
            text_profile.vocab_count,
        )

    # input with one sample
    verify(["This is test, a Test sentence.!!!"])
    # input with two samples
    verify(["This is test,", " a Test sentence.!!!"])
def test_options_default(self):
    """Verify default-option profiling of one text in two row layouts.

    With a freshly constructed ``TextProfilerOptions`` the expected word
    table keeps "Test" and "test" as distinct entries; the character table
    is the same whether the text arrives as one row or two.
    """
    default_opts = TextProfilerOptions()

    words_expected = {"sentence": 1, "Test": 1, "test": 1}
    chars_expected = {
        "s": 5,
        " ": 5,
        "e": 5,
        "t": 4,
        "!": 3,
        "T": 2,
        "i": 2,
        "n": 2,
        "h": 1,
        ",": 1,
        "a": 1,
        "c": 1,
        ".": 1,
    }

    layouts = (
        ["This is test, a Test sentence.!!!"],  # input with one sample
        ["This is test,", " a Test sentence.!!!"],  # input with two samples
    )
    for rows in layouts:
        profiler = TextProfiler("Name", options=default_opts)
        profiler.update(pd.Series(rows))
        self.assertDictEqual(words_expected, profiler.word_count)
        self.assertDictEqual(chars_expected, profiler.vocab_count)
def test_options_vocab_update(self):
    """Verify that disabling vocab leaves ``vocab_count`` empty.

    Only ``vocab.is_enabled`` is changed; the other options keep their
    defaults. Word counts are still expected, for both the one-sample and
    the two-sample form of the input.
    """
    opts = TextProfilerOptions()
    opts.vocab.is_enabled = False

    words_expected = {'sentence': 1, 'Test': 1, 'test': 1}
    vocab_expected = {}  # nothing is expected to be counted while vocab is off

    layouts = (
        ["This is test, a Test sentence.!!!"],  # input with one sample
        ["This is test,", " a Test sentence.!!!"],  # input with two samples
    )
    for rows in layouts:
        profiler = TextProfiler("Name", options=opts)
        profiler.update(pd.Series(rows))
        self.assertDictEqual(words_expected, profiler.word_count)
        self.assertDictEqual(vocab_expected, profiler.vocab_count)
def test_options_most_common_chars_count(self):
    """Check how ``top_k_chars`` limits the ``vocab_count`` entry of the profile.

    Scenarios: ``None`` (every character expected), 3, 2, and 300 (more than
    the number of distinct characters, so every character is expected again).
    """
    rows = ["this is test,", " this is a test sentence", "this is", "this"]
    every_char = {
        's': 10, 't': 9, ' ': 8, 'i': 7, 'e': 5,
        'h': 4, 'n': 2, ',': 1, 'a': 1, 'c': 1,
    }
    opts = TextProfilerOptions()

    def vocab_count_for(top_k, series):
        """Set ``top_k_chars`` and return the profiled character counts."""
        opts.top_k_chars = top_k
        profiler = TextProfiler("Name", options=opts)
        profiler.update(series)
        return profiler.profile["vocab_count"]

    # None value for number of common chars
    sample = pd.Series(rows)
    self.assertDictEqual(every_char, vocab_count_for(None, sample))

    # set number of common chars to 3
    sample = pd.Series(rows)
    self.assertDictEqual(
        {'s': 10, 't': 9, ' ': 8},
        vocab_count_for(3, sample),
    )

    # change number of common chars (the previous sample is reused)
    self.assertDictEqual({'s': 10, 't': 9}, vocab_count_for(2, sample))

    # number of common chars greater than the number of distinct characters
    self.assertDictEqual(every_char, vocab_count_for(300, sample))
def test_options_stop_words(self):
    """Verify word counts under two explicit ``stop_words`` settings.

    Only ``stop_words`` is changed; other options keep their defaults.
    First a list of stop words is supplied, then an empty list. Each setting
    is checked against the one-sample and the two-sample form of the text;
    the character counts are expected to be the same in every case.
    """
    chars_expected = {
        's': 5, ' ': 5, 'e': 5, 't': 4, '!': 3, 'T': 2, 'i': 2,
        'n': 2, 'h': 1, ',': 1, 'a': 1, 'c': 1, '.': 1,
    }
    layouts = (
        ["This is test, a Test sentence.!!!"],  # input with one sample
        ["This is test,", " a Test sentence.!!!"],  # input with two samples
    )
    configurations = (
        # with a list of stopwords
        (
            ['hello', 'sentence', 'is', 'a'],
            {'This': 1, 'Test': 1, 'test': 1},
        ),
        # with an empty list
        (
            [],
            {
                'This': 1, 'is': 1, 'test': 1,
                'a': 1, 'Test': 1, 'sentence': 1,
            },
        ),
    )

    for stop_words, words_expected in configurations:
        opts = TextProfilerOptions()
        opts.stop_words = stop_words
        for rows in layouts:
            profiler = TextProfiler("Name", options=opts)
            profiler.update(pd.Series(rows))
            self.assertDictEqual(words_expected, profiler.word_count)
            self.assertDictEqual(chars_expected, profiler.vocab_count)
def test_diff_profiles(self):
    """Compare two text profiles with ``diff`` in two scenarios.

    First both profilers use default options. Then the second profiler is
    built with ``is_case_sensitive = False``; the expected character-level
    sections stay the same while the expected word-level sections use
    lower-cased words.
    """
    first_rows = ["Hello my name is: Grant.!!!"]
    second_rows = ["Bob and \"grant\", 'are' friends Grant Grant"]

    # character-level sections are expected to be identical in both scenarios
    char_sections = {
        'vocab': [
            ['H', 'l', 'm', 'y', ':', '.', '!'],
            ['e', 'o', ' ', 'n', 'a', 'i', 's', 'G', 'r', 't'],
            ['B', 'b', 'd', '"', 'g', ',', "'", 'f'],
        ],
        'vocab_count': [
            {'!': 3, 'l': 2, 'm': 2, 'H': 1, 'y': 1, ':': 1, '.': 1},
            {
                ' ': -2,
                'e': 'unchanged',
                'n': -3,
                'a': -3,
                'o': 'unchanged',
                'i': 'unchanged',
                's': 'unchanged',
                'G': -1,
                'r': -4,
                't': -2,
            },
            {
                'd': 2, '"': 2, "'": 2, 'B': 1,
                'b': 1, 'g': 1, ',': 1, 'f': 1,
            },
        ],
    }

    # both profilers with default options
    left = TextProfiler("Name")
    left.update(pd.Series(first_rows))
    right = TextProfiler("Name")
    right.update(pd.Series(second_rows))
    diff_expected = dict(
        char_sections,
        words=[['Hello', 'name'], ['Grant'], ['Bob', 'grant', 'friends']],
        word_count=[
            {'Hello': 1, 'name': 1},
            {'Grant': -1},
            {'Bob': 1, 'grant': 1, 'friends': 1},
        ],
    )
    self.assertDictEqual(diff_expected, left.diff(right))

    # Test when one profiler is not case sensitive
    left = TextProfiler("Name")
    left.update(pd.Series(first_rows))
    insensitive = TextProfilerOptions()
    insensitive.is_case_sensitive = False
    right = TextProfiler("Name", options=insensitive)
    right.update(pd.Series(second_rows))
    diff_expected = dict(
        char_sections,
        words=[['hello', 'name'], ['grant'], ['bob', 'friends']],
        word_count=[
            {'hello': 1, 'name': 1},
            {'grant': -2},
            {'bob': 1, 'friends': 1},
        ],
    )
    self.assertDictEqual(diff_expected, left.diff(right))
def test_options_most_common_chars_count(self):
    """Verify the effect of ``top_k_chars`` on the profile's ``vocab_count``.

    The cases run in order on one shared options object: ``None``, 3, 2 and
    300. ``None`` and 300 (larger than the number of distinct characters)
    are both expected to report every character.
    """
    rows = ["this is test,", " this is a test sentence", "this is", "this"]
    all_chars = {
        "s": 10,
        "t": 9,
        " ": 8,
        "i": 7,
        "e": 5,
        "h": 4,
        "n": 2,
        ",": 1,
        "a": 1,
        "c": 1,
    }
    cases = (
        (None, all_chars),  # None value for number of common chars
        (3, {"s": 10, "t": 9, " ": 8}),  # set number of common chars to 3
        (2, {"s": 10, "t": 9}),  # change number of common chars
        (300, all_chars),  # more than the number of distinct characters
    )

    opts = TextProfilerOptions()
    sample = None
    for position, (top_k, vocab_expected) in enumerate(cases):
        opts.top_k_chars = top_k
        profiler = TextProfiler("Name", options=opts)
        if position < 2:
            # the first two cases each build their own Series; the
            # remaining cases reuse the most recent one
            sample = pd.Series(rows)
        profiler.update(sample)
        report = profiler.profile
        self.assertDictEqual(vocab_expected, report["vocab_count"])
def test_options_stop_words(self):
    """Check word counts for a populated and an empty ``stop_words`` list.

    Other options stay at their defaults. For each setting the text is
    profiled as one sample and as two samples; the expected character
    counts never change.
    """
    single_row = ["This is test, a Test sentence.!!!"]
    split_rows = ["This is test,", " a Test sentence.!!!"]

    def check(opts, rows, words_expected):
        """Profile ``rows`` under ``opts`` and compare both count tables."""
        text_profile = TextProfiler("Name", options=opts)
        text_profile.update(pd.Series(rows))
        self.assertDictEqual(words_expected, text_profile.word_count)
        self.assertDictEqual(
            {
                "s": 5, " ": 5, "e": 5, "t": 4, "!": 3, "T": 2, "i": 2,
                "n": 2, "h": 1, ",": 1, "a": 1, "c": 1, ".": 1,
            },
            text_profile.vocab_count,
        )

    # with a list of stopwords
    listed = TextProfilerOptions()
    listed.stop_words = ["hello", "sentence", "is", "a"]
    remaining_words = {"This": 1, "Test": 1, "test": 1}
    check(listed, single_row, remaining_words)  # input with one sample
    check(listed, split_rows, remaining_words)  # input with two samples

    # with an empty list
    unfiltered = TextProfilerOptions()
    unfiltered.stop_words = []
    all_words = {
        "This": 1,
        "is": 1,
        "test": 1,
        "a": 1,
        "Test": 1,
        "sentence": 1,
    }
    check(unfiltered, single_row, all_words)  # input with one sample
    check(unfiltered, split_rows, all_words)  # input with two samples
def test_diff_profiles(self):
    """Check ``TextProfiler.diff`` between two differently-fed profilers.

    Scenario one uses default options on both sides. Scenario two rebuilds
    the first profiler and gives the second one ``is_case_sensitive = False``;
    only the expected word-level sections differ between the scenarios.
    """

    def char_level_sections():
        """Return fresh expected "vocab" and "vocab_count" sections."""
        return {
            "vocab": [
                ["H", "l", "m", "y", ":", ".", "!"],
                ["e", "o", " ", "n", "a", "i", "s", "G", "r", "t"],
                ["B", "b", "d", '"', "g", ",", "'", "f"],
            ],
            "vocab_count": [
                {"!": 3, "l": 2, "m": 2, "H": 1, "y": 1, ":": 1, ".": 1},
                {
                    " ": -2,
                    "e": "unchanged",
                    "n": -3,
                    "a": -3,
                    "o": "unchanged",
                    "i": "unchanged",
                    "s": "unchanged",
                    "G": -1,
                    "r": -4,
                    "t": -2,
                },
                {
                    "d": 2, '"': 2, "'": 2, "B": 1,
                    "b": 1, "g": 1, ",": 1, "f": 1,
                },
            ],
        }

    # scenario one: both profilers with default options
    base_profile = TextProfiler("Name")
    rows = pd.Series(["Hello my name is: Grant.!!!"])
    base_profile.update(rows)

    other_profile = TextProfiler("Name")
    rows = pd.Series(["Bob and \"grant\", 'are' friends Grant Grant"])
    other_profile.update(rows)

    wanted = char_level_sections()
    wanted["words"] = [["Hello", "name"], ["Grant"], ["Bob", "grant", "friends"]]
    wanted["word_count"] = [
        {"Hello": 1, "name": 1},
        {"Grant": -1},
        {"Bob": 1, "grant": 1, "friends": 1},
    ]
    self.assertDictEqual(wanted, base_profile.diff(other_profile))

    # Test when one profiler is not case sensitive
    base_profile = TextProfiler("Name")
    rows = pd.Series(["Hello my name is: Grant.!!!"])
    base_profile.update(rows)

    relaxed = TextProfilerOptions()
    relaxed.is_case_sensitive = False
    other_profile = TextProfiler("Name", options=relaxed)
    rows = pd.Series(["Bob and \"grant\", 'are' friends Grant Grant"])
    other_profile.update(rows)

    wanted = char_level_sections()
    wanted["words"] = [["hello", "name"], ["grant"], ["bob", "friends"]]
    wanted["word_count"] = [
        {"hello": 1, "name": 1},
        {"grant": -2},
        {"bob": 1, "friends": 1},
    ]
    self.assertDictEqual(wanted, base_profile.diff(other_profile))