예제 #1
0
def test_remove_accents():
    """Check ``preprocess.remove_accents`` for both supported methods.

    The "unicode" and "ascii" methods must both yield the same
    accent-free sentence, and an unrecognised method must raise.
    """
    accented = "El niño se asustó -- qué miedo!"
    expected = "El nino se asusto -- que miedo!"

    # Same expectation for both methods, checked in the original order.
    for method in ("unicode", "ascii"):
        assert preprocess.remove_accents(accented, method=method) == expected

    # An unknown method name is rejected with an exception.
    with pytest.raises(Exception):
        preprocess.remove_accents(accented, method="foo")
예제 #2
0
 def test_remove_accents(self):
     # Both supported methods must strip the accents from the same
     # sentence and give the same result; an unknown method must raise.
     accented = "El niño se asustó -- qué miedo!"
     expected = "El nino se asusto -- que miedo!"

     for method in ('unicode', 'ascii'):
         actual = preprocess.remove_accents(accented, method=method)
         self.assertEqual(actual, expected)

     # Unsupported method name: the call itself is expected to fail.
     with self.assertRaises(Exception):
         preprocess.remove_accents(accented, method='foo')
예제 #3
0
    def clean_tweet(self, text):
        """Normalise a raw tweet into plain, whitespace-separated words.

        The steps run in a fixed order: unicode repair, HTML stripping,
        contraction unpacking, replacement of URLs / e-mails / phone
        numbers / numbers / currency symbols, accent removal, smiley and
        emoji conversion, splitting of attached and underscore-joined
        words, punctuation and digit removal, removal of words shorter
        than three characters, and whitespace normalisation.

        :param text: raw tweet text as str
        :return: the cleaned text, as returned by
            ``preprocess.normalize_whitespace``
        """
        # Repair mis-encoded unicode before anything else touches the text.
        text = preprocess.fix_bad_unicode(text)

        # Keep only the text content of any HTML markup.
        text = BeautifulSoup(text, features='lxml').getText()

        # Unpack contractions.
        text = preprocess.unpack_contractions(text)

        # Replace URLs, e-mail addresses, phone numbers, numbers and
        # currency symbols. No replacement argument is passed, so each
        # helper substitutes its own default.
        text = preprocess.replace_urls(text)
        text = preprocess.replace_emails(text)
        text = preprocess.replace_phone_numbers(text)
        text = preprocess.replace_numbers(text)
        text = preprocess.replace_currency_symbols(text)

        # Remove accents.
        text = preprocess.remove_accents(text)

        # Convert smileys and emojis to text: whitespace-delimited tokens
        # found in self.SMILEY are swapped for their mapped value, then
        # emoji.demojize is applied and every ':' becomes a space
        # (presumably to drop the delimiters around emoji names -- verify
        # against the emoji package's defaults).
        words = text.split()
        reformed = [
            self.SMILEY[word] if word in self.SMILEY else word
            for word in words
        ]
        text = " ".join(reformed)
        text = emoji.demojize(text)
        text = text.replace(":", " ")
        text = ' '.join(text.split())

        # Split attached words by starting a new chunk at every ASCII
        # capital letter. The leading `[^A-Z]+` alternative keeps whatever
        # precedes the first capital; matching `[A-Z][^A-Z]*` alone threw
        # that prefix away and reduced text without any capital to ''.
        # From the first capital onwards the chunks are unchanged.
        text = ' '.join(re.findall(r'[^A-Z]+|[A-Z][^A-Z]*', text))

        # Split underscore-joined words.
        text = text.replace('_', ' ')

        # Remove punctuation.
        text = preprocess.remove_punct(text)

        # Remove any remaining digits.
        text = re.sub(r'\d', '', text)

        # Remove words of fewer than three characters.
        text = re.sub(r'\b\w{1,2}\b', '', text)

        # Normalise whitespace left behind by the removals above.
        text = preprocess.normalize_whitespace(text)

        return text
예제 #4
0
def preprocess(line):
    """
    Convert one input line into a normalised, newline-terminated string.

    Lines whose length is not below ``args.linelength`` are rejected.
    Otherwise the line is optionally cleaned, tokenised (lemmas from
    ``nlp`` when ``args.lemmatize`` is set, else a whitespace split) and
    passed through the transformations and filters enabled on ``args``,
    in this order: stem, decapitalize, umlaute, accents, numbers,
    punctuation, stopwords, forbidden.

    :param line: line as str
    :return: the remaining tokens joined by single spaces and followed by
        a newline, or an empty string if the line was rejected or three
        or fewer tokens remain
    """
    # Guard: overly long lines are dropped outright.
    if len(line) >= args.linelength:
        return ''

    if args.clean:
        line = clean_text(line)

    # Tokenise: lemmas when requested, plain whitespace split otherwise.
    words = (
        [tok.lemma_ for tok in nlp(line)] if args.lemmatize else line.split()
    )

    # Per-token transformations.
    if args.stem:
        words = [stemmer.stem(word) for word in words]
    if args.decapitalize:
        words = [word.lower() for word in words]
    if args.umlaute:
        words = [replace_umlaute(word) for word in words]
    if args.accents:
        words = [pp.remove_accents(word) for word in words]
    if args.numbers:
        words = [
            pp.replace_numbers(word, replace_with='*NUMMER*')
            for word in words
        ]

    # Token filters.
    if args.punctuation:
        words = [word for word in words if word not in punctuation_tokens]
    if args.stopwords:
        words = [word for word in words if word.lower() not in stop_words]
    if args.forbidden:
        # Keep a token only if no forbidden keyword occurs inside it
        # (case-insensitive on the token side).
        words = [
            word for word in words
            if all(kw not in word.lower() for kw in forbidden_keywords)
        ]

    # Only lines with more than three surviving tokens are emitted.
    if len(words) <= 3:
        return ''
    return ' '.join(words) + "\n"
예제 #5
0
 def test_remove_accents(self):
     # The 'unicode' and 'ascii' methods must both produce the same
     # accent-free sentence; any other method name must raise.
     source = "El niño se asustó -- qué miedo!"
     stripped = "El nino se asusto -- que miedo!"

     unicode_result = preprocess.remove_accents(source, method='unicode')
     self.assertEqual(unicode_result, stripped)

     ascii_result = preprocess.remove_accents(source, method='ascii')
     self.assertEqual(ascii_result, stripped)

     # Unsupported method name: the call itself is expected to fail.
     with self.assertRaises(Exception):
         preprocess.remove_accents(source, method='foo')