    def test_original_words_to_tokens(self):
        t1 = Tokenizer(**neagative_initalization)
        text = "I'm 29 years old and I don't live in Petach Tikva"
        res1 = t1.word_tokenize(text)
        expected = {"I'm": ['i', "'m"], '29': ['29'], 'years': ['years'], 'old': ['old'], 'and': ['and'], 'I': ['i'],
                    "don't": ['do', "n't"], 'live': ['live'], 'in': ['in'], 'Petach': ['petach'], 'Tikva': ['tikva']}
        assert res1['originalWordsToTokens'] == expected
    def test_remove_new_lines(self):
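        # remove_new_lines=True should strip newline/carriage-return characters (and surrounding whitespace) from the output.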
        args = deepcopy(neagative_initalization)
        args['remove_new_lines'] = True
        t1 = Tokenizer(**args)
        text = \
            """
I have 3 dogs
 I lied
"""
        res1 = t1.word_tokenize(text)
        assert res1['tokenizedText'] == text.lower().replace('\n', '').replace('\r', '').strip()
    def test_hash_seed(self):
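        # With a hash_seed set, every token is expected to be hashed (djb2) and exposed
        # under 'hashedTokenizedText'; without a seed the key should be absent entirely.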
        def hash_djb2(s, seed=5381):
            """
             Hash string with djb2 hash function

             :type s: ``str``
             :param s: The input string to hash

             :type seed: ``int``
             :param seed: The seed for the hash function (default is 5381)

             :return: The hashed value
             :rtype: ``int``
            """
            hash_name = seed
            for x in s:
                hash_name = ((hash_name << 5) + hash_name) + ord(x)

            return hash_name & 0xFFFFFFFF

        args = deepcopy(neagative_initalization)
        args['hash_seed'] = 5
        t1 = Tokenizer(**args)
        text = 'hello world'
        res1 = t1.word_tokenize(text)
        assert res1['hashedTokenizedText'] == ' '.join(
            str(hash_djb2(word, 5)) for word in text.split())

        args['hash_seed'] = None
        t1 = Tokenizer(**args)
        text = 'hello world'
        res1 = t1.word_tokenize(text)
        assert 'hashedTokenizedText' not in res1
    def test_remove_stop_words(self):
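        # remove_stop_words should drop common English stop words ('it', 'be'), leaving only 'let'.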
        args = deepcopy(neagative_initalization)
        args['remove_stop_words'] = False
        t1 = Tokenizer(**args)
        text = 'let it be'
        res1 = t1.word_tokenize(text)
        assert res1['tokenizedText'] == text

        args['remove_stop_words'] = True
        t1 = Tokenizer(**args)
        res1 = t1.word_tokenize(text)
        assert res1['tokenizedText'] == 'let'
    def test_max_text_length(self):
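        # Texts longer than max_text_length are expected to tokenize to an empty string;
        # word_tokenize should also accept a list of texts and return one result per item.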
        text = 'example sentence'
        list_text = [text] * 2
        args = deepcopy(neagative_initalization)
        t1 = Tokenizer(**args)
        t1.max_text_length = len(text) + 1
        res1 = t1.word_tokenize(list_text)
        assert all(res1[i]['tokenizedText'] == text for i in range(len(list_text)))

        t1.max_text_length = len(text) - 1
        res1 = t1.word_tokenize(list_text)
        assert all(res1[i]['tokenizedText'] == '' for i in range(len(list_text)))
    def test_replace_urls(self):
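        # replace_urls should substitute URLs with the tokenizer's url_pattern placeholder.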
        tested_arg = 'replace_urls'
        args = deepcopy(neagative_initalization)
        args[tested_arg] = False
        t1 = Tokenizer(**args)
        text = 'my url is www.google.com'
        res1 = t1.word_tokenize(text)
        assert res1['tokenizedText'] == text

        args[tested_arg] = True
        t1 = Tokenizer(**args)
        res1 = t1.word_tokenize(text)
        assert res1['tokenizedText'] == 'my url is {}'.format(t1.url_pattern)
    def test_remove_non_alpha(self):
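        # remove_non_alpha should drop tokens containing non-alphabetic characters ('s00n').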
        tested_arg = 'remove_non_alpha'
        args = deepcopy(neagative_initalization)
        args[tested_arg] = False
        t1 = Tokenizer(**args)
        text = 'see you s00n'
        res1 = t1.word_tokenize(text)
        assert res1['tokenizedText'] == text

        args[tested_arg] = True
        t1 = Tokenizer(**args)
        res1 = t1.word_tokenize(text)
        assert res1['tokenizedText'] == 'see you'
    def test_lemma(self):
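        # lemma=True should reduce tokens to their lemmas (is -> be, exceeding -> exceed, expectations -> expectation).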
        tested_arg = 'lemma'
        args = deepcopy(neagative_initalization)
        args[tested_arg] = False
        t1 = Tokenizer(**args)
        text = 'this tokenization method is exceeding my expectations'
        res1 = t1.word_tokenize(text)
        assert res1['tokenizedText'] == text

        args[tested_arg] = True
        t1 = Tokenizer(**args)
        res1 = t1.word_tokenize(text)
        assert res1['tokenizedText'] == 'this tokenization method be exceed my expectation'
    def test_replace_numbers(self):
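        # replace_numbers should substitute numeric tokens with the tokenizer's number_pattern placeholder.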
        tested_arg = 'replace_numbers'
        args = deepcopy(neagative_initalization)
        args[tested_arg] = False
        t1 = Tokenizer(**args)
        text = 'i am 3 years old'
        res1 = t1.word_tokenize(text)
        assert res1['tokenizedText'] == text

        args[tested_arg] = True
        t1 = Tokenizer(**args)
        res1 = t1.word_tokenize(text)
        assert res1['tokenizedText'] == 'i am {} years old'.format(t1.number_pattern)
    def test_number_pattern(self):
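        # Same switch as above, but checks that number_pattern itself is what lands in the lowercased output.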
        args = deepcopy(neagative_initalization)
        args['replace_numbers'] = True
        t1 = Tokenizer(**args)
        text = "I have 3 dogs"
        res1 = t1.word_tokenize(text)
        assert res1['tokenizedText'] == "I have {} dogs".lower().format(t1.number_pattern)

        args['replace_numbers'] = False
        t1 = Tokenizer(**args)
        text = "I have 3 dogs"
        res1 = t1.word_tokenize(text)
        assert res1['tokenizedText'] == text.lower()
    def test_replace_emails(self):
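        # replace_emails should substitute email addresses with the tokenizer's email_pattern placeholder.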
        tested_arg = 'replace_emails'
        args = deepcopy(neagative_initalization)
        args[tested_arg] = False
        t1 = Tokenizer(**args)
        text = 'my email is [email protected]'  # placeholder address; the original value was masked by email obfuscation
        res1 = t1.word_tokenize(text)
        assert res1['tokenizedText'] == text

        args[tested_arg] = True
        t1 = Tokenizer(**args)
        res1 = t1.word_tokenize(text)
        assert res1['tokenizedText'] == 'my email is {}'.format(
            t1.email_pattern)
    def test_remove_punct(self):
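        # With remove_punct=False each punctuation mark becomes its own token;
        # with remove_punct=True punctuation is stripped entirely.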
        args = deepcopy(neagative_initalization)
        args['remove_punct'] = False
        t1 = Tokenizer(**args)
        text = 'let, it. be!'
        res1 = t1.word_tokenize(text)
        expected_result = text
        for punct in string.punctuation:
            expected_result = expected_result.replace(punct, ' ' + punct)
        assert res1['tokenizedText'] == expected_result

        args['remove_punct'] = True
        t1 = Tokenizer(**args)
        res1 = t1.word_tokenize(text)
        assert res1['tokenizedText'] == 'let it be'
    def test_tokenization_method(self):
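        # With an unsupported language the tokenizer should fall back to the requested method:
        # 'byWords' keeps whole words (punctuation removed), 'byLetters' emits one token per character.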
        tokenization_method = 'byWords'
        language = 'fake language'
        args = deepcopy(neagative_initalization)
        args['tokenization_method'] = tokenization_method
        args['language'] = language
        t1 = Tokenizer(**args)
        text = 'example sentence.'
        res1 = t1.word_tokenize(text)
        assert res1['tokenizedText'] == ''.join(c for c in text if c not in string.punctuation)

        tokenization_method = 'byLetters'
        args['tokenization_method'] = tokenization_method
        t1 = Tokenizer(**args)
        text = 'example sentence'
        res1 = t1.word_tokenize(text)
        assert res1['tokenizedText'] == ' '.join(c for c in text if c != ' ')
    def test_clean_html_tokenizer(self):
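        # clean_html=True should strip the markup and keep only the visible text;
        # with clean_html=False the raw (lowercased) markup is preserved apart from whitespace.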
        args = deepcopy(neagative_initalization)
        args['clean_html'] = True
        t1 = Tokenizer(**args)
        text = """
            <!DOCTYPE html>
        <html>
        <body>
        <h1>My First Heading</h1>
        <p>My first paragraph</p>
        </body>
        </html>
        """
        res1 = t1.word_tokenize(text)
        assert res1['tokenizedText'] == 'My First Heading My first paragraph'.lower()

        args['clean_html'] = False
        t2 = Tokenizer(**args)
        res2 = t2.word_tokenize(text)
        assert re.sub(r"\s+", "", res2['tokenizedText']) == re.sub(r"\s+", "", text.lower())