Example #1
    def test_initials(self):
        # We use a to_list() here such that we don't receive a generator
        sc = TextScrubber().initials().to_list()
        self.assertEqual(sc.transform(['hello', 'world'], on_tokens=True),
                         ['h', 'w'])
        self.assertEqual(sc.transform([['hello', 'world'], ['slimmer', 'AI']]),
                         [['h', 'w'], ['s', 'A']])
Example #2
    def test_token_transform(self):
        # We use a to_list() here such that we don't receive a generator
        sc = TextScrubber().token_transform(
            func=lambda t: t.capitalize()).to_list()
        self.assertEqual(sc.transform(['hello', 'world'], on_tokens=True),
                         ['Hello', 'World'])
        self.assertEqual(sc.transform([['hello', 'world'], ['slimmer', 'AI']]),
                         [['Hello', 'World'], ['Slimmer', 'Ai']])
Example #3
    def test_sub_tokens(self):
        # We use a to_list() here such that we don't receive a generator
        sc = TextScrubber().sub_tokens(func=lambda t: {
            'hello': 'goodbye',
            'AI': 'ML'
        }.get(t, t)).to_list()
        self.assertEqual(sc.transform(['hello', 'world'], on_tokens=True),
                         ['goodbye', 'world'])
        self.assertEqual(sc.transform([['hello', 'world'], ['slimmer', 'AI']]),
                         [['goodbye', 'world'], ['slimmer', 'ML']])
Example #4
    def test_to_list(self):
        # The map objects will be materialized by the to_list() function
        sc = TextScrubber().to_list()
        self.assertEqual(
            sc.transform(map(str.upper, ['hello', 'world']), on_tokens=True),
            ['HELLO', 'WORLD'])
        self.assertEqual(
            sc.transform([
                map(str.upper, ['hello', 'world']),
                map(str.upper, ['slimmer', 'Ai'])
            ], to_set=False),
            [['HELLO', 'WORLD'], ['SLIMMER', 'AI']])
Example #5
    def test_transform(self):
        # The following will work with tokenizing
        sc = (TextScrubber().tokenize().initials().join(''))
        self.assertEqual(
            sc.transform('hello world', on_tokens=False, to_set=False), 'hw')
        self.assertEqual(
            sc.transform(['hello', 'world'], on_tokens=False, to_set=False),
            ['h', 'w'])
        self.assertEqual(
            sc.transform(['hello', 'world'], on_tokens=False, to_set=True),
            {'h', 'w'})
        self.assertEqual(
            sc.transform(['hello world', 'slimmer AI'],
                         on_tokens=False,
                         to_set=True), {'hw', 'sA'})

        # Will fail because we're dealing with tokens and we can't tokenize lists
        with self.assertRaises(TypeError):
            sc.transform(['hello world', 'slimmer AI'],
                         on_tokens=True,
                         to_set=True)
        with self.assertRaises(TypeError):
            sc.transform([['hello', 'world'], ['slimmer', 'AI']],
                         on_tokens=True,
                         to_set=False)
        with self.assertRaises(TypeError):
            sc.transform([['hello', 'world'], ['slimmer', 'AI']],
                         on_tokens=False,
                         to_set=False)

        # The following will work without tokenizing
        sc = (TextScrubber().initials().join(''))
        self.assertEqual(
            sc.transform(['hello', 'world'], on_tokens=False, to_set=False),
            ['hello', 'world'])
        self.assertEqual(
            sc.transform(['hello', 'world'], on_tokens=True, to_set=False),
            'hw')
        self.assertEqual(
            sc.transform([['hello', 'world'], ['slimmer', 'AI']],
                         on_tokens=False,
                         to_set=False), ['hw', 'sA'])
        self.assertEqual(
            sc.transform([['hello', 'world'], ['slimmer', 'AI']],
                         on_tokens=False,
                         to_set=True), {'hw', 'sA'})
        self.assertEqual(
            sc.transform([['hello', 'world'], ['slimmer', 'AI']],
                         on_tokens=True,
                         to_set=False), 'helloslimmer')
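Taken together, the assertions above pin down the two transform flags: on_tokens declares that the input is already a collection of tokens rather than raw text (so a document-level step such as tokenize() raises a TypeError), while to_set materializes the result as a set instead of a list. A minimal sketch of the happy path, assuming the package-level import from the library's README and the default flag values (on_tokens=False, to_set=False) that the other examples here rely on implicitly:

from text_scrubber import TextScrubber

# One pipeline, exercised with the same inputs and expected values as the test above
sc = TextScrubber().tokenize().initials().join('')
print(sc.transform('hello world'))                               # 'hw'
print(sc.transform(['hello world', 'slimmer AI'], to_set=True))  # {'hw', 'sA'}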
Example #6
    def test_sub_html_chars(self):
        # On entire string. The inputs contain raw HTML character entities
        sc = TextScrubber().sub_html_chars()
        self.assertEqual(sc.transform('Eric Zei&szlig;ner'), 'Eric Zeißner')
        self.assertEqual(sc.transform('Marco P&ouml;chacker'),
                         'Marco Pöchacker')
        self.assertEqual(sc.transform('&commat; My Place'), '@ My Place')
        self.assertEqual(sc.transform('Carl&#39;s'), 'Carl\'s')

        # On tokens. We use a to_list() here such that we don't receive a generator
        sc = TextScrubber().sub_html_chars(on_tokens=True).to_list()
        self.assertEqual(
            sc.transform(['Marco', 'P&ouml;chacker'], on_tokens=True),
            ['Marco', 'Pöchacker'])
Example #7
    def test_tokenize(self):
        # Using the default tokenizer
        sc = TextScrubber().tokenize()
        self.assertEqual(sc.transform('hello world'), ['hello', 'world'])
        self.assertEqual(sc.transform(['hello world', 'slimmer AI']),
                         [['hello', 'world'], ['slimmer', 'AI']])

        # Using a custom one
        sc = TextScrubber().tokenize(func=lambda s: s.split('e'))
        self.assertEqual(sc.transform('hello world'), ['h', 'llo world'])
        self.assertEqual(sc.transform(['hello world', 'slimmer AI']),
                         [['h', 'llo world'], ['slimm', 'r AI']])
Example #8
    def test_remove_digits(self):
        # On entire strings
        sc = TextScrubber().remove_digits(on_tokens=False)
        self.assertEqual(sc.transform('hell0 world12'), 'hell world')
        self.assertEqual(sc.transform(['hell0 world12', 'sl1mm3r A1']),
                         ['hell world', 'slmmr A'])

        # On tokens. We use a to_list() here such that we don't receive a generator
        sc = TextScrubber().remove_digits(on_tokens=True).to_list()
        self.assertEqual(sc.transform(['hell0 world12'], on_tokens=True),
                         ['hell world'])
        self.assertEqual(sc.transform([['hell0 world12', 'sl1mm3r A1']]),
                         [['hell world', 'slmmr A']])
Example #9
    def test_lowercase(self):
        # On entire strings
        sc = TextScrubber().lowercase(on_tokens=False)
        self.assertEqual(sc.transform('Hello World'), 'hello world')
        self.assertEqual(sc.transform(['Hello World', 'slimmer AI']),
                         ['hello world', 'slimmer ai'])

        # On tokens. We use a to_list() here such that we don't receive a generator
        sc = TextScrubber().lowercase(on_tokens=True).to_list()
        self.assertEqual(sc.transform(['Hello World'], on_tokens=True),
                         ['hello world'])
        self.assertEqual(sc.transform([['Hello World', 'slimmer AI']]),
                         [['hello world', 'slimmer ai']])
Example #10
    def test_remove_quotes(self):
        # On entire strings
        sc = TextScrubber().remove_quotes(on_tokens=False)
        self.assertEqual(sc.transform('"hello world"'), 'hello world')
        self.assertEqual(sc.transform(['"hello world"', 'slimmer\' AI']),
                         ['hello world', 'slimmer AI'])

        # On tokens. We use a to_list() here such that we don't receive a generator
        sc = TextScrubber().remove_quotes(on_tokens=True).to_list()
        self.assertEqual(sc.transform(['"hello world"'], on_tokens=True),
                         ['hello world'])
        self.assertEqual(sc.transform([['"hello world"', 'slimmer\' AI']]),
                         [['hello world', 'slimmer AI']])
Example #11
    def test_remove_suffixes(self):
        # On entire strings
        sc = TextScrubber().remove_suffixes({'Bar', 'world'}, on_tokens=False)
        self.assertEqual(sc.transform('fooBar fooBar'), 'fooBar foo')
        self.assertEqual(sc.transform(['hello world', 'world hello']),
                         ['hello ', 'world hello'])

        # On tokens. We use a to_list() here such that we don't receive a generator
        sc = TextScrubber().remove_suffixes({'Bar', 'world'},
                                            on_tokens=True).to_list()
        self.assertEqual(sc.transform(['fooBar fooBar'], on_tokens=True),
                         ['fooBar foo'])
        self.assertEqual(sc.transform([['hello world', 'world hello']]),
                         [['hello ', 'world hello']])
Example #12
    def test_to_ascii(self):
        # On entire strings
        sc = TextScrubber().to_ascii(on_tokens=False)
        self.assertEqual(sc.transform('héllô wòrld'), 'hello world')
        self.assertEqual(sc.transform(['héllô wòrld', 'slímm̀er ÀI']),
                         ['hello world', 'slimmer AI'])

        # On tokens. We use a to_list() here such that we don't receive a generator
        sc = TextScrubber().to_ascii(on_tokens=True).to_list()
        self.assertEqual(sc.transform(['héllô wòrld'], on_tokens=True),
                         ['hello world'])
        self.assertEqual(sc.transform([['héllô wòrld', 'slímm̀er ÀI']]),
                         [['hello world', 'slimmer AI']])
Example #13
    def test_join(self):
        # Default separator
        sc = TextScrubber().join(sep=' ')
        self.assertEqual(sc.transform(['hello', 'world'], on_tokens=True),
                         'hello world')
        self.assertEqual(sc.transform([['hello', 'world'], ['slimmer', 'AI']]),
                         ['hello world', 'slimmer AI'])

        # Custom separator
        sc = TextScrubber().join(sep=' & ')
        self.assertEqual(sc.transform(['hello', 'world'], on_tokens=True),
                         'hello & world')
        self.assertEqual(sc.transform([['hello', 'world'], ['slimmer', 'AI']]),
                         ['hello & world', 'slimmer & AI'])
Example #14
    def test_remove_excessive_whitespace(self):
        # On entire strings
        sc = TextScrubber().remove_excessive_whitespace(on_tokens=False)
        self.assertEqual(sc.transform('hello  world '), 'hello world')
        self.assertEqual(sc.transform(['hello   world ', ' slimmer  AI ']),
                         ['hello world', 'slimmer AI'])

        # On tokens. We use a to_list() here such that we don't receive a generator
        sc = TextScrubber().remove_excessive_whitespace(
            on_tokens=True).to_list()
        self.assertEqual(sc.transform(['hello ', 'wor ld'], on_tokens=True),
                         ['hello', 'wor ld'])
Example #15
    def test_sub_greek_chars(self):
        # On entire string.
        sc = TextScrubber().sub_greek_chars()
        self.assertEqual(sc.transform('α * β^Λ'), 'alpha * beta^Lambda')

        # On tokens. We use a to_list() here such that we don't receive a generator
        sc = TextScrubber().sub_greek_chars(on_tokens=True).to_list()
        self.assertEqual(sc.transform(['α * β^Λ', 'χΥΖ'], on_tokens=True),
                         ['alpha * beta^Lambda', 'chiUpsilonZeta'])
Example #16
    def test_sub_latex_chars(self):
        # On entire string.
        sc = TextScrubber().sub_latex_chars()
        self.assertEqual(sc.transform(r'Eric \"Ozg\"ur Sar{\i}o\u{g}lu'),
                         'Eric Ozgur Sarioglu')
        self.assertEqual(sc.transform(r'Jan K\v{r}et\'insk\'y'),
                         'Jan Kretinsky')

        # On tokens. We use a to_list() here such that we don't receive a generator
        sc = TextScrubber().sub_latex_chars(on_tokens=True).to_list()
        self.assertEqual(
            sc.transform([r'Mieczys{\l}aw', r'K{\l}opotek'], on_tokens=True),
            ['Mieczyslaw', 'Klopotek'])
Example #17
    def test_remove_html_tags(self):
        # On entire strings
        sc = TextScrubber().remove_html_tags(on_tokens=False)
        self.assertEqual(sc.transform('<b>hello</b> wo<FOO>rld'),
                         'hello world')
        self.assertEqual(
            sc.transform([
                'hello <i>world</i></br>',
                '<a tag>slimmer</some tag><sup>AI</sup>'
            ]), ['hello world', 'slimmerAI'])

        # On tokens. We use a to_list() here such that we don't receive a generator
        sc = TextScrubber().remove_html_tags(on_tokens=True).to_list()
        self.assertEqual(
            sc.transform(['<b>hello</b> wo<FOO>rld'], on_tokens=True),
            ['hello world'])
        self.assertEqual(
            sc.transform([[
                'hello <i>world</i></br>',
                '<a tag>slimmer</some tag><sup>AI</sup>'
            ]]), [['hello world', 'slimmerAI']])
Example #18
    def test_transform_generator(self):
        sc = (TextScrubber().tokenize().initials().join(''))
        gen = sc.transform_generator(['hello world', 'slimmer AI'])
        self.assertTrue(isinstance(gen, types.GeneratorType))
        self.assertEqual(list(gen), ['hw', 'sA'])
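Since transform_generator returns a plain Python generator, each input is scrubbed lazily, only when the consumer asks for it, which is useful when streaming large collections. A short sketch under the same assumption about the import path:

from text_scrubber import TextScrubber

sc = TextScrubber().tokenize().initials().join('')
gen = sc.transform_generator(['hello world', 'slimmer AI'])
print(next(gen))  # 'hw', produced on demand
print(list(gen))  # ['sA'], the remaining items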
Example #19
    def test_text_transform(self):
        sc = TextScrubber().text_transform(func=lambda s: s.capitalize())
        self.assertEqual(sc.transform('hello world'), 'Hello world')
        self.assertEqual(sc.transform(['hello world', 'slimmer AI']),
                         ['Hello world', 'Slimmer ai'])
Example #20
    def test_num2words(self):
        # Default settings. On strings
        sc = TextScrubber().num2words(include_commas=False, on_tokens=False)
        self.assertEqual(
            sc.transform('hello 1337 world'),
            'hello one thousand three hundred and thirty-seven world')
        self.assertEqual(sc.transform(['hello 1337 world', 'Atoomweg 6b']), [
            'hello one thousand three hundred and thirty-seven world',
            'Atoomweg six b'
        ])

        # On tokens. We use a to_list() here such that we don't receive a generator
        sc = TextScrubber().num2words(include_commas=False,
                                      on_tokens=True).to_list()
        self.assertEqual(
            sc.transform(['hello', '1337', 'world'], on_tokens=True),
            ['hello', 'one thousand three hundred and thirty-seven', 'world'])
        self.assertEqual(
            sc.transform([['hello', '1337', 'world'], ['Atoomweg', '6b']]),
            [['hello', 'one thousand three hundred and thirty-seven', 'world'],
             ['Atoomweg', 'six b']])

        # Including commas
        sc = TextScrubber().num2words(include_commas=True, on_tokens=False)
        self.assertEqual(
            sc.transform('hello 1337 world'),
            'hello one thousand, three hundred and thirty-seven world')
        self.assertEqual(sc.transform(['hello 1337 world', 'Atoomweg 6b']), [
            'hello one thousand, three hundred and thirty-seven world',
            'Atoomweg six b'
        ])

        # Different language
        sc = TextScrubber().num2words(include_commas=False,
                                      language='nl',
                                      on_tokens=False)
        self.assertEqual(sc.transform('hello 1337 world'),
                         'hello duizenddriehonderdzevenendertig world')
        self.assertEqual(
            sc.transform(['hello 1337 world', 'Atoomweg 6b']),
            ['hello duizenddriehonderdzevenendertig world', 'Atoomweg zes b'])
Example #21
    def test_remove_punctuation(self):
        # On entire strings
        sc = TextScrubber().remove_punctuation(keep_punctuation='',
                                               on_tokens=False)
        self.assertEqual(sc.transform('hello, world!'), 'hello world')
        self.assertEqual(sc.transform(['hello, world!', 'slimmer-slimst.Ai']),
                         ['hello world', 'slimmerslimstAi'])

        # On tokens. We use a to_list() here such that we don't receive a generator
        sc = TextScrubber().remove_punctuation(keep_punctuation='',
                                               on_tokens=True).to_list()
        self.assertEqual(sc.transform(['hello, world!'], on_tokens=True),
                         ['hello world'])
        self.assertEqual(
            sc.transform([['hello, world!', 'slimmer-slimst.Ai']]),
            [['hello world', 'slimmerslimstAi']])

        # With a custom list of punctuation to keep
        sc = TextScrubber().remove_punctuation(keep_punctuation=',.',
                                               on_tokens=False)
        self.assertEqual(sc.transform('hello, world!'), 'hello, world')
        self.assertEqual(sc.transform(['hello, world!', 'slimmer-slimst.Ai']),
                         ['hello, world', 'slimmerslimst.Ai'])
Example #22
    def test_sub(self):
        # Substitute with string on entire string.
        sc = TextScrubber().sub(search='big', replace='small')
        self.assertEqual(sc.transform('hello big world.'),
                         'hello small world.')

        # Substitute with string on tokens. We use a to_list() here such that we don't receive a generator
        sc = TextScrubber().sub(search='big', replace='small',
                                on_tokens=True).to_list()
        self.assertEqual(
            sc.transform(['hello', 'big', 'world'], on_tokens=True),
            ['hello', 'small', 'world'])
        self.assertEqual(sc.transform([['hello', 'big', 'world']]),
                         [['hello', 'small', 'world']])

        # Substitute with regex on entire string.
        sc = TextScrubber().sub(search=r'ph\.?\ ?d\.?', replace='phd')
        self.assertEqual(sc.transform('i have a ph.d. in banana pies'),
                         'i have a phd in banana pies')

        # Substitute with regex on tokens. We use a to_list() here such that we don't receive a generator
        sc = TextScrubber().sub(search=r'ph\.?\ ?d\.?',
                                replace='phd',
                                on_tokens=True).to_list()
        self.assertEqual(
            sc.transform(['i', 'am', 'phd.', 'student.'], on_tokens=True),
            ['i', 'am', 'phd', 'student.'])
        self.assertEqual(sc.transform([['i', 'am', 'phd.', 'student.']]),
                         [['i', 'am', 'phd', 'student.']])
Example #23
    def test_strip(self):
        # On entire strings
        sc = TextScrubber().strip(chars=None, on_tokens=False)
        self.assertEqual(sc.transform('  hello   world'), 'hello   world')
        self.assertEqual(sc.transform(['  hello    world', 'slimmer AI  ']),
                         ['hello    world', 'slimmer AI'])

        # On tokens. We use a to_list() here such that we don't receive a generator
        sc = TextScrubber().strip(chars=None, on_tokens=True).to_list()
        self.assertEqual(sc.transform(['  hello   world'], on_tokens=True),
                         ['hello   world'])
        self.assertEqual(sc.transform([['  hello    world', 'slimmer AI  ']]),
                         [['hello    world', 'slimmer AI']])

        # With custom chars
        sc = TextScrubber().strip(chars='ld', on_tokens=False)
        self.assertEqual(sc.transform('  hello   world'), '  hello   wor')
        self.assertEqual(sc.transform(['  hello    world', 'slimmer AI  ']),
                         ['  hello    wor', 'slimmer AI  '])
Example #24
    def test_filter_tokens(self):
        # Default test
        sc = TextScrubber().filter_tokens(test=lambda t: t,
                                          neg=False).to_list()
        self.assertEqual(sc.transform(['hello', '', 'world'], on_tokens=True),
                         ['hello', 'world'])
        self.assertEqual(
            sc.transform([['hello', '', 'world'],
                          [None, 'slimmer', 'AI', False]]),
            [['hello', 'world'], ['slimmer', 'AI']])

        # Default test using negative results
        sc = TextScrubber().filter_tokens(test=lambda t: t, neg=True).to_list()
        self.assertEqual(sc.transform(['hello', '', 'world'], on_tokens=True),
                         [''])
        self.assertEqual(
            sc.transform([['hello', '', 'world'],
                          [None, 'slimmer', 'AI', False]]),
            [[''], [None, False]])

        # Custom test
        sc = TextScrubber().filter_tokens(
            test=lambda t: isinstance(t, str) and t.islower(),
            neg=False).to_list()
        self.assertEqual(sc.transform(['hello', '', 'world'], on_tokens=True),
                         ['hello', 'world'])
        self.assertEqual(
            sc.transform([['hello', '', 'world'],
                          [None, 'slimmer', 'AI', False]]),
            [['hello', 'world'], ['slimmer']])
Example #25
    def test_remove_stop_words(self):
        # Default stop words. We use a to_list() here such that we don't receive a generator
        sc = TextScrubber().remove_stop_words(stop_words=None).to_list()
        self.assertEqual(
            sc.transform(['around', 'the', 'world'], on_tokens=True),
            ['world'])
        self.assertEqual(
            sc.transform([['around', 'the', 'world'],
                          ['once', 'upon', 'a', 'time']]),
            [['world'], ['time']])

        # Custom stop words. We use a to_list() here such that we don't receive a generator
        sc = TextScrubber().remove_stop_words(
            stop_words={'world', 'time'}).to_list()
        self.assertEqual(
            sc.transform(['around', 'the', 'world'], on_tokens=True),
            ['around', 'the'])
        self.assertEqual(
            sc.transform([['around', 'the', 'world'],
                          ['once', 'upon', 'a', 'time']]),
            [['around', 'the'], ['once', 'upon', 'a']])

        # All caps words shouldn't be removed
        sc = TextScrubber().remove_stop_words(stop_words=None).to_list()
        self.assertEqual(
            sc.transform(['around', 'THE', 'world'], on_tokens=True),
            ['THE', 'world'])
        self.assertEqual(
            sc.transform([['AROUND', 'the', 'world'],
                          ['once', 'upon', 'A', 'time']]),
            [['AROUND', 'world'], ['A', 'time']])
Example #26
# Maps geo-name abbreviations to canonical tokens (excerpt)
_GEO_TOKEN_MAP = {
    'island': 'islands',
    'monteneg': 'montenegro',
    'neth': 'netherlands',
    'rep': 'republic',
    'republ': 'republic',
    'republik': 'republic',
    'sint': 'saint',
    'st': 'saint'
}

# We define the scrubber once so the regex objects will be compiled only once
_GEO_STRING_SCRUBBER = (
    TextScrubber()
    .to_ascii()
    .lowercase()
    .remove_digits()
    .sub(r'-|/|&|,', ' ')
    .remove_punctuation()
    .remove_suffixes({' si', ' ri', ' dong'})  # Set of formal city suffixes
    .tokenize()
    .filter_tokens()
    .sub_tokens(lambda token: _GEO_TOKEN_MAP.get(token, token))
    .remove_stop_words({
        'a', 'an', 'and', 'cedex', 'da', 'der', 'di', 'do', 'e', 'email',
        'im', 'le', 'mail', 'of', 'the'
    })
    .join())


def _clean_geo_string(string: str) -> str:
    """
    Cleans a string with geographical information (e.g., countries/states/cities).

    :param string: Input string to clean.
    :return: Cleaned string.
    """
    return _GEO_STRING_SCRUBBER.transform(string)
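
For illustration, a hypothetical call to the helper above. Both the input string and the expected output are assumptions worked out from the pipeline and the token-map excerpt ('st' -> 'saint', 'neth' -> 'netherlands'); they are not taken from the library itself:

# Hypothetical example: punctuation is stripped, abbreviations are expanded
print(_clean_geo_string('St. Maarten (Neth.)'))  # expected: 'saint maarten netherlands'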

Example #27
    def test_sort(self):
        # Default setting. We use a to_list() here such that we don't receive a generator
        sc = TextScrubber().sort(reverse=False).to_list()
        self.assertEqual(sc.transform(['hello', 'world'], on_tokens=True),
                         ['hello', 'world'])
        self.assertEqual(sc.transform([['hello', 'world'], ['slimmer', 'AI']]),
                         [['hello', 'world'], ['AI', 'slimmer']])

        # Reverse order. We use a to_list() here such that we don't receive a generator
        sc = TextScrubber().sort(reverse=True).to_list()
        self.assertEqual(sc.transform(['hello', 'world'], on_tokens=True),
                         ['world', 'hello'])
        self.assertEqual(sc.transform([['hello', 'world'], ['slimmer', 'AI']]),
                         [['world', 'hello'], ['slimmer', 'AI']])

        # Sorting the documents themselves vs. the tokens within each document
        sc = TextScrubber().sort(reverse=True, on_tokens=False).to_list()
        self.assertEqual(
            sc.transform([['hello', 'world'], ['slimmer', 'AI']],
                         on_tokens=True),
            [['slimmer', 'AI'], ['hello', 'world']])
        sc = TextScrubber().sort(reverse=True, on_tokens=True).to_list()
        self.assertEqual(
            sc.transform([['hello', 'world'], ['slimmer', 'AI']],
                         on_tokens=True),
            [['world', 'hello'], ['slimmer', 'AI']])