Пример #1
0
 def test_initials(self):
     """Each token should be reduced to its first character."""
     # to_list() materializes the generator output so it can be compared
     scrubber = TextScrubber().initials().to_list()
     flat = scrubber.transform(['hello', 'world'], on_tokens=True)
     self.assertEqual(flat, ['h', 'w'])
     nested = scrubber.transform([['hello', 'world'], ['slimmer', 'AI']])
     self.assertEqual(nested, [['h', 'w'], ['s', 'A']])
Пример #2
0
 def test_token_transform(self):
     """An arbitrary callable is applied to every individual token."""
     # to_list() materializes the generator output so it can be compared
     scrubber = TextScrubber().token_transform(
         func=lambda token: token.capitalize()).to_list()
     result = scrubber.transform(['hello', 'world'], on_tokens=True)
     self.assertEqual(result, ['Hello', 'World'])
     result = scrubber.transform([['hello', 'world'], ['slimmer', 'AI']])
     self.assertEqual(result, [['Hello', 'World'], ['Slimmer', 'Ai']])
Пример #3
0
    def test_sub_greek_chars(self):
        """Greek letters are replaced by their spelled-out names."""
        # Whole-string mode
        scrubber = TextScrubber().sub_greek_chars()
        self.assertEqual(scrubber.transform('α * β^Λ'), 'alpha * beta^Lambda')

        # Token mode; to_list() materializes the generator output
        scrubber = TextScrubber().sub_greek_chars(on_tokens=True).to_list()
        result = scrubber.transform(['α * β^Λ', 'χΥΖ'], on_tokens=True)
        self.assertEqual(result, ['alpha * beta^Lambda', 'chiUpsilonZeta'])
Пример #4
0
 def test_sub_tokens(self):
     """Tokens are substituted through a user-supplied mapping function."""
     replacements = {'hello': 'goodbye', 'AI': 'ML'}
     # to_list() materializes the generator output so it can be compared
     scrubber = TextScrubber().sub_tokens(
         func=lambda token: replacements.get(token, token)).to_list()
     self.assertEqual(scrubber.transform(['hello', 'world'], on_tokens=True),
                      ['goodbye', 'world'])
     self.assertEqual(
         scrubber.transform([['hello', 'world'], ['slimmer', 'AI']]),
         [['goodbye', 'world'], ['slimmer', 'ML']])
Пример #5
0
    def test_remove_excessive_whitespace(self):
        """Repeated/surrounding whitespace collapses to single spaces."""
        # Whole-string mode
        scrubber = TextScrubber().remove_excessive_whitespace(on_tokens=False)
        self.assertEqual(scrubber.transform('hello  world '), 'hello world')
        result = scrubber.transform(['hello   world ', ' slimmer  AI '])
        self.assertEqual(result, ['hello world', 'slimmer AI'])

        # Token mode; to_list() materializes the generator output
        scrubber = TextScrubber().remove_excessive_whitespace(
            on_tokens=True).to_list()
        result = scrubber.transform(['hello ', 'wor ld'], on_tokens=True)
        self.assertEqual(result, ['hello', 'wor ld'])
Пример #6
0
    def test_tokenize(self):
        """Strings are split into token lists."""
        # The default tokenizer
        scrubber = TextScrubber().tokenize()
        self.assertEqual(scrubber.transform('hello world'),
                         ['hello', 'world'])
        self.assertEqual(scrubber.transform(['hello world', 'slimmer AI']),
                         [['hello', 'world'], ['slimmer', 'AI']])

        # A custom splitting function can be supplied instead
        scrubber = TextScrubber().tokenize(func=lambda text: text.split('e'))
        self.assertEqual(scrubber.transform('hello world'),
                         ['h', 'llo world'])
        self.assertEqual(scrubber.transform(['hello world', 'slimmer AI']),
                         [['h', 'llo world'], ['slimm', 'r AI']])
Пример #7
0
 def test_to_list(self):
     """to_list() materializes lazy map objects into real lists."""
     scrubber = TextScrubber().to_list()
     flat = scrubber.transform(map(str.upper, ['hello', 'world']),
                               on_tokens=True)
     self.assertEqual(flat, ['HELLO', 'WORLD'])
     nested = scrubber.transform(
         [map(str.upper, ['hello', 'world']),
          map(str.upper, ['slimmer', 'Ai'])],
         to_set=False)
     self.assertEqual(nested, [['HELLO', 'WORLD'], ['SLIMMER', 'AI']])
Пример #8
0
    def test_remove_digits(self):
        """All digit characters are stripped out."""
        # Whole-string mode
        scrubber = TextScrubber().remove_digits(on_tokens=False)
        self.assertEqual(scrubber.transform('hell0 world12'), 'hell world')
        result = scrubber.transform(['hell0 world12', 'sl1mm3r A1'])
        self.assertEqual(result, ['hell world', 'slmmr A'])

        # Token mode; to_list() materializes the generator output
        scrubber = TextScrubber().remove_digits(on_tokens=True).to_list()
        result = scrubber.transform(['hell0 world12'], on_tokens=True)
        self.assertEqual(result, ['hell world'])
        result = scrubber.transform([['hell0 world12', 'sl1mm3r A1']])
        self.assertEqual(result, [['hell world', 'slmmr A']])
Пример #9
0
    def test_lowercase(self):
        """All characters are converted to lower case."""
        # Whole-string mode
        scrubber = TextScrubber().lowercase(on_tokens=False)
        self.assertEqual(scrubber.transform('Hello World'), 'hello world')
        result = scrubber.transform(['Hello World', 'slimmer AI'])
        self.assertEqual(result, ['hello world', 'slimmer ai'])

        # Token mode; to_list() materializes the generator output
        scrubber = TextScrubber().lowercase(on_tokens=True).to_list()
        result = scrubber.transform(['Hello World'], on_tokens=True)
        self.assertEqual(result, ['hello world'])
        result = scrubber.transform([['Hello World', 'slimmer AI']])
        self.assertEqual(result, [['hello world', 'slimmer ai']])
Пример #10
0
    def test_sub_latex_chars(self):
        """LaTeX accent escape sequences are converted to plain characters."""
        # Whole-string mode
        scrubber = TextScrubber().sub_latex_chars()
        self.assertEqual(
            scrubber.transform(r'Eric \"Ozg\"ur Sar{\i}o\u{g}lu'),
            'Eric Ozgur Sarioglu')
        self.assertEqual(scrubber.transform(r'Jan K\v{r}et\'insk\'y'),
                         'Jan Kretinsky')

        # Token mode; to_list() materializes the generator output
        scrubber = TextScrubber().sub_latex_chars(on_tokens=True).to_list()
        result = scrubber.transform([r'Mieczys{\l}aw', r'K{\l}opotek'],
                                    on_tokens=True)
        self.assertEqual(result, ['Mieczyslaw', 'Klopotek'])
Пример #11
0
    def test_remove_quotes(self):
        """Quote characters are removed from the input."""
        # Whole-string mode
        scrubber = TextScrubber().remove_quotes(on_tokens=False)
        self.assertEqual(scrubber.transform('"hello world"'), 'hello world')
        result = scrubber.transform(['"hello world"', 'slimmer\' AI'])
        self.assertEqual(result, ['hello world', 'slimmer AI'])

        # Token mode; to_list() materializes the generator output
        scrubber = TextScrubber().remove_quotes(on_tokens=True).to_list()
        result = scrubber.transform(['"hello world"'], on_tokens=True)
        self.assertEqual(result, ['hello world'])
        result = scrubber.transform([['"hello world"', 'slimmer\' AI']])
        self.assertEqual(result, [['hello world', 'slimmer AI']])
Пример #12
0
    def test_to_ascii(self):
        """Accented characters are mapped to their ASCII equivalents."""
        # Whole-string mode
        scrubber = TextScrubber().to_ascii(on_tokens=False)
        self.assertEqual(scrubber.transform('héllô wòrld'), 'hello world')
        result = scrubber.transform(['héllô wòrld', 'slímm̀er ÀI'])
        self.assertEqual(result, ['hello world', 'slimmer AI'])

        # Token mode; to_list() materializes the generator output
        scrubber = TextScrubber().to_ascii(on_tokens=True).to_list()
        result = scrubber.transform(['héllô wòrld'], on_tokens=True)
        self.assertEqual(result, ['hello world'])
        result = scrubber.transform([['héllô wòrld', 'slímm̀er ÀI']])
        self.assertEqual(result, [['hello world', 'slimmer AI']])
Пример #13
0
    def test_remove_suffixes(self):
        """Suffixes from the given set are stripped from string endings."""
        # On entire strings: only a suffix at the very end is removed
        sc = TextScrubber().remove_suffixes({'Bar', 'world'}, on_tokens=False)
        self.assertEqual(sc.transform('fooBar fooBar'), 'fooBar foo')
        self.assertEqual(sc.transform(['hello world', 'world hello']),
                         ['hello ', 'world hello'])

        # On tokens. We use a to_list() here such that we don't receive a
        # generator. NOTE(review): the .to_list() call was missing even
        # though this comment referred to it; added to match the other
        # on_tokens tests in this file.
        sc = TextScrubber().remove_suffixes({'Bar', 'world'},
                                            on_tokens=True).to_list()
        self.assertEqual(sc.transform(['fooBar fooBar'], on_tokens=True),
                         ['fooBar foo'])
        self.assertEqual(sc.transform([['hello world', 'world hello']]),
                         [['hello ', 'world hello']])
Пример #14
0
    def test_sub_html_chars(self):
        """HTML character entities are decoded to their unicode characters.

        NOTE(review): the entity-encoded inputs below were reconstructed.
        The originals had been decoded in transit, which made each assertion
        compare a string with itself and left an unbalanced quote
        ('Carl's') that was a syntax error.
        """
        # On entire string.
        sc = TextScrubber().sub_html_chars()
        self.assertEqual(sc.transform('Eric Zei&szlig;ner'), 'Eric Zeißner')
        self.assertEqual(sc.transform('Marco P&ouml;chacker'),
                         'Marco Pöchacker')
        self.assertEqual(sc.transform('&commat; My Place'), '@ My Place')
        self.assertEqual(sc.transform('Carl&#39;s'), 'Carl\'s')

        # On tokens. We use a to_list() here such that we don't receive a generator
        sc = TextScrubber().sub_html_chars(on_tokens=True).to_list()
        self.assertEqual(
            sc.transform(['Marco', 'P&ouml;chacker'], on_tokens=True),
            ['Marco', 'Pöchacker'])
Пример #15
0
    def test_join(self):
        """Tokens are concatenated into a single string with a separator."""
        # Single-space separator
        scrubber = TextScrubber().join(sep=' ')
        result = scrubber.transform(['hello', 'world'], on_tokens=True)
        self.assertEqual(result, 'hello world')
        result = scrubber.transform([['hello', 'world'], ['slimmer', 'AI']])
        self.assertEqual(result, ['hello world', 'slimmer AI'])

        # A multi-character separator
        scrubber = TextScrubber().join(sep=' & ')
        result = scrubber.transform(['hello', 'world'], on_tokens=True)
        self.assertEqual(result, 'hello & world')
        result = scrubber.transform([['hello', 'world'], ['slimmer', 'AI']])
        self.assertEqual(result, ['hello & world', 'slimmer & AI'])
Пример #16
0
    def test_remove_html_tags(self):
        """HTML tags are stripped from the text."""
        tagged = ['hello <i>world</i></br>',
                  '<a tag>slimmer</some tag><sup>AI</sup>']

        # Whole-string mode
        scrubber = TextScrubber().remove_html_tags(on_tokens=False)
        self.assertEqual(scrubber.transform('<b>hello</b> wo<FOO>rld'),
                         'hello world')
        self.assertEqual(scrubber.transform(tagged),
                         ['hello world', 'slimmerAI'])

        # Token mode; to_list() materializes the generator output
        scrubber = TextScrubber().remove_html_tags(on_tokens=True).to_list()
        self.assertEqual(
            scrubber.transform(['<b>hello</b> wo<FOO>rld'], on_tokens=True),
            ['hello world'])
        self.assertEqual(scrubber.transform([tagged]),
                         [['hello world', 'slimmerAI']])
Пример #17
0
    def test_sub(self):
        """Plain-string and regex patterns are substituted in text/tokens."""
        # Plain string replacement on a whole string
        scrubber = TextScrubber().sub(search='big', replace='small')
        self.assertEqual(scrubber.transform('hello big world.'),
                         'hello small world.')

        # Plain string replacement on tokens; to_list() materializes output
        scrubber = TextScrubber().sub(search='big', replace='small',
                                      on_tokens=True).to_list()
        self.assertEqual(
            scrubber.transform(['hello', 'big', 'world'], on_tokens=True),
            ['hello', 'small', 'world'])
        self.assertEqual(scrubber.transform([['hello', 'big', 'world']]),
                         [['hello', 'small', 'world']])

        # Regex replacement on a whole string
        scrubber = TextScrubber().sub(search=r'ph\.?\ ?d\.?', replace='phd')
        self.assertEqual(
            scrubber.transform('i have a ph.d. in banana pies'),
            'i have a phd in banana pies')

        # Regex replacement on tokens; to_list() materializes output
        scrubber = TextScrubber().sub(search=r'ph\.?\ ?d\.?', replace='phd',
                                      on_tokens=True).to_list()
        self.assertEqual(
            scrubber.transform(['i', 'am', 'phd.', 'student.'],
                               on_tokens=True),
            ['i', 'am', 'phd', 'student.'])
        self.assertEqual(
            scrubber.transform([['i', 'am', 'phd.', 'student.']]),
            [['i', 'am', 'phd', 'student.']])
Пример #18
0
    def test_remove_stop_words(self):
        """Stop words are filtered out of token streams."""
        # Built-in stop word list; to_list() materializes the generator
        scrubber = TextScrubber().remove_stop_words(stop_words=None).to_list()
        self.assertEqual(
            scrubber.transform(['around', 'the', 'world'], on_tokens=True),
            ['world'])
        self.assertEqual(
            scrubber.transform([['around', 'the', 'world'],
                                ['once', 'upon', 'a', 'time']]),
            [['world'], ['time']])

        # A user-supplied stop word set
        scrubber = TextScrubber().remove_stop_words(
            stop_words={'world', 'time'}).to_list()
        self.assertEqual(
            scrubber.transform(['around', 'the', 'world'], on_tokens=True),
            ['around', 'the'])
        self.assertEqual(
            scrubber.transform([['around', 'the', 'world'],
                                ['once', 'upon', 'a', 'time']]),
            [['around', 'the'], ['once', 'upon', 'a']])

        # Fully capitalized tokens survive even when they match a stop word
        scrubber = TextScrubber().remove_stop_words(stop_words=None).to_list()
        self.assertEqual(
            scrubber.transform(['around', 'THE', 'world'], on_tokens=True),
            ['THE', 'world'])
        self.assertEqual(
            scrubber.transform([['AROUND', 'the', 'world'],
                                ['once', 'upon', 'A', 'time']]),
            [['AROUND', 'world'], ['A', 'time']])
Пример #19
0
    def test_filter_tokens(self):
        """Tokens are kept or dropped according to a predicate."""
        # Default truthiness test keeps truthy tokens only
        scrubber = TextScrubber().filter_tokens(test=lambda token: token,
                                                neg=False).to_list()
        self.assertEqual(
            scrubber.transform(['hello', '', 'world'], on_tokens=True),
            ['hello', 'world'])
        self.assertEqual(
            scrubber.transform([['hello', '', 'world'],
                                [None, 'slimmer', 'AI', False]]),
            [['hello', 'world'], ['slimmer', 'AI']])

        # neg=True inverts the predicate, keeping the falsy tokens instead
        scrubber = TextScrubber().filter_tokens(test=lambda token: token,
                                                neg=True).to_list()
        self.assertEqual(
            scrubber.transform(['hello', '', 'world'], on_tokens=True),
            [''])
        self.assertEqual(
            scrubber.transform([['hello', '', 'world'],
                                [None, 'slimmer', 'AI', False]]),
            [[''], [None, False]])

        # A custom predicate
        scrubber = TextScrubber().filter_tokens(
            test=lambda token: isinstance(token, str) and token.islower(),
            neg=False).to_list()
        self.assertEqual(
            scrubber.transform(['hello', '', 'world'], on_tokens=True),
            ['hello', 'world'])
        self.assertEqual(
            scrubber.transform([['hello', '', 'world'],
                                [None, 'slimmer', 'AI', False]]),
            [['hello', 'world'], ['slimmer']])
Пример #20
0
    def test_sort(self):
        """Collections are sorted, optionally in reverse order."""
        # Ascending order; to_list() materializes the generator output
        scrubber = TextScrubber().sort(reverse=False).to_list()
        self.assertEqual(
            scrubber.transform(['hello', 'world'], on_tokens=True),
            ['hello', 'world'])
        self.assertEqual(
            scrubber.transform([['hello', 'world'], ['slimmer', 'AI']]),
            [['hello', 'world'], ['AI', 'slimmer']])

        # Descending order
        scrubber = TextScrubber().sort(reverse=True).to_list()
        self.assertEqual(
            scrubber.transform(['hello', 'world'], on_tokens=True),
            ['world', 'hello'])
        self.assertEqual(
            scrubber.transform([['hello', 'world'], ['slimmer', 'AI']]),
            [['world', 'hello'], ['slimmer', 'AI']])

        # on_tokens selects whether the outer list or the inner tokens sort
        scrubber = TextScrubber().sort(reverse=True, on_tokens=False).to_list()
        self.assertEqual(
            scrubber.transform([['hello', 'world'], ['slimmer', 'AI']],
                               on_tokens=True),
            [['slimmer', 'AI'], ['hello', 'world']])
        scrubber = TextScrubber().sort(reverse=True, on_tokens=True).to_list()
        self.assertEqual(
            scrubber.transform([['hello', 'world'], ['slimmer', 'AI']],
                               on_tokens=True),
            [['world', 'hello'], ['slimmer', 'AI']])
Пример #21
0
    def test_num2words(self):
        """Numbers embedded in the text are spelled out as words."""
        # Default settings on whole strings
        scrubber = TextScrubber().num2words(include_commas=False,
                                            on_tokens=False)
        self.assertEqual(
            scrubber.transform('hello 1337 world'),
            'hello one thousand three hundred and thirty-seven world')
        self.assertEqual(
            scrubber.transform(['hello 1337 world', 'Atoomweg 6b']),
            ['hello one thousand three hundred and thirty-seven world',
             'Atoomweg six b'])

        # Token mode; to_list() materializes the generator output
        scrubber = TextScrubber().num2words(include_commas=False,
                                            on_tokens=True).to_list()
        self.assertEqual(
            scrubber.transform(['hello', '1337', 'world'], on_tokens=True),
            ['hello', 'one thousand three hundred and thirty-seven', 'world'])
        self.assertEqual(
            scrubber.transform([['hello', '1337', 'world'],
                                ['Atoomweg', '6b']]),
            [['hello', 'one thousand three hundred and thirty-seven', 'world'],
             ['Atoomweg', 'six b']])

        # include_commas=True inserts commas between number groups
        scrubber = TextScrubber().num2words(include_commas=True,
                                            on_tokens=False)
        self.assertEqual(
            scrubber.transform('hello 1337 world'),
            'hello one thousand, three hundred and thirty-seven world')
        self.assertEqual(
            scrubber.transform(['hello 1337 world', 'Atoomweg 6b']),
            ['hello one thousand, three hundred and thirty-seven world',
             'Atoomweg six b'])

        # A non-default language
        scrubber = TextScrubber().num2words(include_commas=False,
                                            language='nl',
                                            on_tokens=False)
        self.assertEqual(scrubber.transform('hello 1337 world'),
                         'hello duizenddriehonderdzevenendertig world')
        self.assertEqual(
            scrubber.transform(['hello 1337 world', 'Atoomweg 6b']),
            ['hello duizenddriehonderdzevenendertig world',
             'Atoomweg zes b'])
Пример #22
0
    def test_strip(self):
        """Leading/trailing characters are removed from strings or tokens."""
        # Default (chars=None) strips surrounding whitespace only
        scrubber = TextScrubber().strip(chars=None, on_tokens=False)
        self.assertEqual(scrubber.transform('  hello   world'),
                         'hello   world')
        self.assertEqual(
            scrubber.transform(['  hello    world', 'slimmer AI  ']),
            ['hello    world', 'slimmer AI'])

        # Token mode; to_list() materializes the generator output
        scrubber = TextScrubber().strip(chars=None, on_tokens=True).to_list()
        self.assertEqual(
            scrubber.transform(['  hello   world'], on_tokens=True),
            ['hello   world'])
        self.assertEqual(
            scrubber.transform([['  hello    world', 'slimmer AI  ']]),
            [['hello    world', 'slimmer AI']])

        # An explicit character set to strip instead of whitespace
        scrubber = TextScrubber().strip(chars='ld', on_tokens=False)
        self.assertEqual(scrubber.transform('  hello   world'),
                         '  hello   wor')
        self.assertEqual(
            scrubber.transform(['  hello    world', 'slimmer AI  ']),
            ['  hello    wor', 'slimmer AI  '])
Пример #23
0
    def test_remove_punctuation(self):
        """Punctuation is removed, except characters explicitly kept."""
        # Whole-string mode, dropping all punctuation
        scrubber = TextScrubber().remove_punctuation(keep_punctuation='',
                                                     on_tokens=False)
        self.assertEqual(scrubber.transform('hello, world!'), 'hello world')
        self.assertEqual(
            scrubber.transform(['hello, world!', 'slimmer-slimst.Ai']),
            ['hello world', 'slimmerslimstAi'])

        # Token mode; to_list() materializes the generator output
        scrubber = TextScrubber().remove_punctuation(keep_punctuation='',
                                                     on_tokens=True).to_list()
        self.assertEqual(
            scrubber.transform(['hello, world!'], on_tokens=True),
            ['hello world'])
        self.assertEqual(
            scrubber.transform([['hello, world!', 'slimmer-slimst.Ai']]),
            [['hello world', 'slimmerslimstAi']])

        # Characters listed in keep_punctuation survive the scrub
        scrubber = TextScrubber().remove_punctuation(keep_punctuation=',.',
                                                     on_tokens=False)
        self.assertEqual(scrubber.transform('hello, world!'), 'hello, world')
        self.assertEqual(
            scrubber.transform(['hello, world!', 'slimmer-slimst.Ai']),
            ['hello, world', 'slimmerslimst.Ai'])
Пример #24
0
 def test_text_transform(self):
     """An arbitrary callable is applied to each whole string."""
     scrubber = TextScrubber().text_transform(
         func=lambda text: text.capitalize())
     self.assertEqual(scrubber.transform('hello world'), 'Hello world')
     self.assertEqual(scrubber.transform(['hello world', 'slimmer AI']),
                      ['Hello world', 'Slimmer ai'])