Пример #1
0
 def test_simple_word_simplifier_minmax_length(self):
     """Check WORD_SIMPLE output with min_length=4 and max_length=8.

     The expected sentences retain only part of the words of ``texts``
     (presumably the tokens whose length lies within the given bounds --
     TODO confirm against the simplifier implementation).
     """
     simplifier = build_simplifier(
         SimplificationStrategy.WORD_SIMPLE, min_length=4, max_length=8)
     expected = [
         'many words this sentence',
         'handle u.s.',
         'handle missing spaces',
         'also right',
         'twitter mighta',
     ]
     simplified = simplifier.simplify_dataset(texts)
     self.assertEqual(expected, simplified)
Пример #2
0
 def test_vocab_simplifier_tf_2(self):
     """Check WORD_VOCAB output when min_term_frequency=2.

     The simplifier is built from ``texts`` and then applied to the same
     data; only a subset of tokens is expected to survive (presumably
     those occurring at least twice -- TODO confirm).
     """
     simplifier = build_simplifier(
         SimplificationStrategy.WORD_VOCAB,
         dataset=texts,
         min_term_frequency=2)
     expected = [
         'How in ?',
         'How do you handle in ?',
         ', do you handle ? ?',
         ', ?',
         '@ @ , ?',
     ]
     simplified = simplifier.simplify_dataset(texts)
     self.assertEqual(expected, simplified)
Пример #3
0
 def test_vocab_simplifier_tf2_df_05(self):
     """Check WORD_VOCAB output with term- and document-frequency limits.

     Uses min_term_frequency=2, max_document_frequency=0.5 and
     lowercase=True on a vocabulary built from ``texts``. One of the
     expected results is the empty string.
     """
     simplifier = build_simplifier(
         SimplificationStrategy.WORD_VOCAB,
         dataset=texts,
         min_term_frequency=2,
         max_document_frequency=0.5,
         lowercase=True)
     expected = [
         'how in',
         'how do handle in',
         'do handle',
         '',
         '@ @',
     ]
     simplified = simplifier.simplify_dataset(texts)
     self.assertEqual(expected, simplified)
Пример #4
0
 def test_vocab_simplifier_tf_1(self):
     """Check WORD_VOCAB output when no frequency options are passed.

     The vocabulary is built from ``texts`` with the builder's defaults;
     the expected sentences are space-separated tokens with the original
     casing kept.
     """
     simplifier = build_simplifier(
         SimplificationStrategy.WORD_VOCAB, dataset=texts)
     expected = [
         'How many words are in this sentence ?',
         'How do you handle compound-words in the U.S. ?',
         'Ehm , do you handle missing spaces ? No ?',
         'You can also distinguish uppercase and lowercase , right ?',
         '@ username213 : Twitter vocabulary of @ otheruser12 miGhta bedifferent , hu ? xD < 3',
     ]
     simplified = simplifier.simplify_dataset(texts)
     self.assertEqual(expected, simplified)
Пример #5
0
 def test_simple_word_simplifier(self):
     """Check WORD_SIMPLE output with the builder's default options.

     The expected sentences are lower-cased, space-separated tokens of
     ``texts``.
     """
     simplifier = build_simplifier(SimplificationStrategy.WORD_SIMPLE)
     expected = [
         'how many words are in this sentence ?',
         'how do you handle compound-words in the u.s. ?',
         'ehm , do you handle missing spaces ? no ?',
         'you can also distinguish uppercase and lowercase , right ?',
         '@ username213 : twitter vocabulary of @ otheruser12 mighta bedifferent , hu ? xd < 3',
     ]
     simplified = simplifier.simplify_dataset(texts)
     self.assertEqual(expected, simplified)
Пример #6
0
 def test_simplifier_no_casing(self):
     """Check SYMBOL_SIMPLE output with the builder's default options.

     All expected sentences are lower-case while punctuation is kept.
     """
     simplifier = build_simplifier(SimplificationStrategy.SYMBOL_SIMPLE)
     expected = [
         'how many words are in this sentence?',
         'how do you handle compound-words in the u.s.?',
         'ehm,do you handle missing spaces?no?',
         '@username213: twitter vocabulary of @otheruser12 mighta bedifferent,hu? xd <3',
         "why don't you handle clitics with apostrophes differently?",
     ]
     simplified = simplifier.simplify_dataset(texts)
     self.assertEqual(expected, simplified)
Пример #7
0
 def test_vocab_simplifier_df_025(self):
     """Check WORD_VOCAB output when max_document_frequency=0.25.

     The vocabulary is built from ``texts``; some tokens are expected to
     be dropped (presumably those appearing in more than a quarter of the
     documents -- TODO confirm).
     """
     simplifier = build_simplifier(
         SimplificationStrategy.WORD_VOCAB,
         dataset=texts,
         max_document_frequency=0.25)
     expected = [
         'many words are this sentence',
         'compound-words the U.S.',
         'Ehm missing spaces No',
         'You can also distinguish uppercase and lowercase right',
         'username213 : Twitter vocabulary of otheruser12 miGhta bedifferent hu xD < 3',
     ]
     simplified = simplifier.simplify_dataset(texts)
     self.assertEqual(expected, simplified)
Пример #8
0
 def test_simplifier_alpha(self):
     """Check SYMBOL_SIMPLE_ALPHA_NOCASING output.

     Every expected sentence consists solely of lower-case letters and
     single spaces.
     """
     simplifier = build_simplifier(
         SimplificationStrategy.SYMBOL_SIMPLE_ALPHA_NOCASING)
     expected = [
         'how many words are in this sentence',
         'how do you handle compound words in the u s',
         'ehm do you handle missing spaces no',
         'username twitter vocabulary of otheruser mighta bedifferent hu xd',
         'why don t you handle clitics with apostrophes differently',
     ]
     simplified = simplifier.simplify_dataset(texts)
     self.assertEqual(expected, simplified)
Пример #9
0
 def test_simplifier_casing(self):
     """Check SYMBOL_SIMPLE output when lowercase=False.

     Casing and punctuation appear unchanged in the expected sentences.
     """
     # With lower-casing switched off, the only normalisation expected
     # is the removal of duplicate whitespace.
     simplifier = build_simplifier(
         SimplificationStrategy.SYMBOL_SIMPLE, lowercase=False)
     expected = [
         'How many words are in this sentence?',
         'How do you handle compound-words in the U.S.?',
         'Ehm,do you handle missing spaces?No?',
         '@username213: Twitter vocabulary of @otheruser12 miGhta bedifferent,hu? xD <3',
         "Why don't you handle clitics with apostrophes differently?",
     ]
     simplified = simplifier.simplify_dataset(texts)
     self.assertEqual(expected, simplified)
Пример #10
0
 def test_postinit_vocab_simplifier(self):
     """Check SYMBOL_VOCAB_NOCASING when the dataset is loaded afterwards.

     The simplifier is created with min_term_frequency=4 and no dataset;
     ``texts`` is supplied later through ``load_parameters``. Some
     characters are missing from the expected sentences (presumably the
     symbols seen fewer than four times -- TODO confirm).
     """
     simplifier = build_simplifier(
         SimplificationStrategy.SYMBOL_VOCAB_NOCASING,
         min_term_frequency=4)
     simplifier.load_parameters(dataset=texts)
     expected = [
         'how many words are in this sentence?',
         'how do you handle compound words in the u s ?',
         'ehm do you handle missin spaces?no?',
         'username twitter oca ulary of otheruser mi hta edifferent hu? d',
         'why don t you handle clitics with apostrophes differently?',
     ]
     simplified = simplifier.simplify_dataset(texts)
     self.assertEqual(expected, simplified)
Пример #11
0
 def test_uninitialized_vocab_simplifier(self):
     """Check SYMBOL_VOCAB_NOCASING when no dataset was ever provided.

     The single input sentence is expected to come back as an empty
     string.
     """
     simplifier = build_simplifier(
         SimplificationStrategy.SYMBOL_VOCAB_NOCASING,
         min_term_frequency=4)
     expected = ['']
     simplified = simplifier.simplify_dataset(
         ['No initialization - all gone'])
     self.assertEqual(expected, simplified)