def test_simple_word_simplifier_minmax_length(self):
    """Check WORD_SIMPLE output when min_length=4 and max_length=8 are set.

    Every token in the expected sentences is lowercase and between 4 and 8
    characters long.
    """
    expected = [
        'many words this sentence',
        'handle u.s.',
        'handle missing spaces',
        'also right',
        'twitter mighta',
    ]
    simplifier = build_simplifier(
        SimplificationStrategy.WORD_SIMPLE, min_length=4, max_length=8)
    actual = simplifier.simplify_dataset(texts)
    self.assertEqual(expected, actual)
def test_vocab_simplifier_tf_2(self):
    """Check WORD_VOCAB output when built from `texts` with min_term_frequency=2.

    Casing is preserved in the expected sentences ('How' stays capitalized).
    """
    expected = [
        'How in ?',
        'How do you handle in ?',
        ', do you handle ? ?',
        ', ?',
        '@ @ , ?',
    ]
    simplifier = build_simplifier(
        SimplificationStrategy.WORD_VOCAB,
        dataset=texts,
        min_term_frequency=2)
    actual = simplifier.simplify_dataset(texts)
    self.assertEqual(expected, actual)
def test_vocab_simplifier_tf2_df_05(self):
    """Check WORD_VOCAB output with term- and document-frequency limits.

    The simplifier is built from `texts` with min_term_frequency=2,
    max_document_frequency=0.5 and lowercase=True. One of the expected
    sentences is the empty string.
    """
    expected = ['how in', 'how do handle in', 'do handle', '', '@ @']
    simplifier = build_simplifier(
        SimplificationStrategy.WORD_VOCAB,
        dataset=texts,
        min_term_frequency=2,
        max_document_frequency=0.5,
        lowercase=True)
    actual = simplifier.simplify_dataset(texts)
    self.assertEqual(expected, actual)
def test_vocab_simplifier_tf_1(self):
    """Check WORD_VOCAB output when only the dataset is supplied.

    No frequency limits are passed to `build_simplifier`; the expected
    sentences keep their original casing.
    """
    expected = [
        'How many words are in this sentence ?',
        'How do you handle compound-words in the U.S. ?',
        'Ehm , do you handle missing spaces ? No ?',
        'You can also distinguish uppercase and lowercase , right ?',
        '@ username213 : Twitter vocabulary of @ otheruser12 miGhta bedifferent , hu ? xD < 3',
    ]
    simplifier = build_simplifier(
        SimplificationStrategy.WORD_VOCAB, dataset=texts)
    actual = simplifier.simplify_dataset(texts)
    self.assertEqual(expected, actual)
def test_simple_word_simplifier(self):
    """Check WORD_SIMPLE output when no extra options are passed.

    All expected sentences are entirely lowercase.
    """
    expected = [
        'how many words are in this sentence ?',
        'how do you handle compound-words in the u.s. ?',
        'ehm , do you handle missing spaces ? no ?',
        'you can also distinguish uppercase and lowercase , right ?',
        '@ username213 : twitter vocabulary of @ otheruser12 mighta bedifferent , hu ? xd < 3',
    ]
    simplifier = build_simplifier(SimplificationStrategy.WORD_SIMPLE)
    actual = simplifier.simplify_dataset(texts)
    self.assertEqual(expected, actual)
def test_simplifier_no_casing(self):
    """Check SYMBOL_SIMPLE output when no extra options are passed.

    All expected sentences are entirely lowercase; punctuation is retained.
    """
    expected = [
        'how many words are in this sentence?',
        'how do you handle compound-words in the u.s.?',
        'ehm,do you handle missing spaces?no?',
        '@username213: twitter vocabulary of @otheruser12 mighta bedifferent,hu? xd <3',
        "why don't you handle clitics with apostrophes differently?",
    ]
    simplifier = build_simplifier(SimplificationStrategy.SYMBOL_SIMPLE)
    actual = simplifier.simplify_dataset(texts)
    self.assertEqual(expected, actual)
def test_vocab_simplifier_df_025(self):
    """Check WORD_VOCAB output when built from `texts` with max_document_frequency=0.25.

    Casing is preserved in the expected sentences.
    """
    expected = [
        'many words are this sentence',
        'compound-words the U.S.',
        'Ehm missing spaces No',
        'You can also distinguish uppercase and lowercase right',
        'username213 : Twitter vocabulary of otheruser12 miGhta bedifferent hu xD < 3',
    ]
    simplifier = build_simplifier(
        SimplificationStrategy.WORD_VOCAB,
        dataset=texts,
        max_document_frequency=0.25)
    actual = simplifier.simplify_dataset(texts)
    self.assertEqual(expected, actual)
def test_simplifier_alpha(self):
    """Check SYMBOL_SIMPLE_ALPHA_NOCASING output when no extra options are passed.

    The expected sentences consist solely of lowercase letters and single
    spaces.
    """
    expected = [
        'how many words are in this sentence',
        'how do you handle compound words in the u s',
        'ehm do you handle missing spaces no',
        'username twitter vocabulary of otheruser mighta bedifferent hu xd',
        'why don t you handle clitics with apostrophes differently',
    ]
    simplifier = build_simplifier(
        SimplificationStrategy.SYMBOL_SIMPLE_ALPHA_NOCASING)
    actual = simplifier.simplify_dataset(texts)
    self.assertEqual(expected, actual)
def test_simplifier_casing(self):
    """Check SYMBOL_SIMPLE output when lowercase=False is passed.

    The expected sentences keep upper- and lowercase letters as well as
    punctuation.
    """
    expected = [
        'How many words are in this sentence?',
        'How do you handle compound-words in the U.S.?',
        'Ehm,do you handle missing spaces?No?',
        '@username213: Twitter vocabulary of @otheruser12 miGhta bedifferent,hu? xD <3',
        "Why don't you handle clitics with apostrophes differently?",
    ]
    # only duplicate whitespace is removed
    simplifier = build_simplifier(
        SimplificationStrategy.SYMBOL_SIMPLE, lowercase=False)
    actual = simplifier.simplify_dataset(texts)
    self.assertEqual(expected, actual)
def test_postinit_vocab_simplifier(self):
    """Check SYMBOL_VOCAB_NOCASING output when the dataset is loaded after construction.

    The simplifier is built with min_term_frequency=4 and no dataset, then
    receives `texts` through `load_parameters` before simplifying.
    """
    expected = [
        'how many words are in this sentence?',
        'how do you handle compound words in the u s ?',
        'ehm do you handle missin spaces?no?',
        'username twitter oca ulary of otheruser mi hta edifferent hu? d',
        'why don t you handle clitics with apostrophes differently?',
    ]
    simplifier = build_simplifier(
        SimplificationStrategy.SYMBOL_VOCAB_NOCASING, min_term_frequency=4)
    simplifier.load_parameters(dataset=texts)
    actual = simplifier.simplify_dataset(texts)
    self.assertEqual(expected, actual)
def test_uninitialized_vocab_simplifier(self):
    """Check SYMBOL_VOCAB_NOCASING output when no dataset was ever supplied.

    The simplifier is built with min_term_frequency=4 only; the single input
    sentence is expected to come back as an empty string.
    """
    expected = ['']
    simplifier = build_simplifier(
        SimplificationStrategy.SYMBOL_VOCAB_NOCASING, min_term_frequency=4)
    actual = simplifier.simplify_dataset(['No initialization - all gone'])
    self.assertEqual(expected, actual)