def test_remove_accents():
    text = "El niño se asustó -- qué miedo!"
    proc_text = "El nino se asusto -- que miedo!"
    assert preprocess.remove_accents(text, method="unicode") == proc_text
    assert preprocess.remove_accents(text, method="ascii") == proc_text
    with pytest.raises(Exception):
        _ = preprocess.remove_accents(text, method="foo")
def test_remove_accents(self):
    text = "El niño se asustó -- qué miedo!"
    proc_text = "El nino se asusto -- que miedo!"
    self.assertEqual(preprocess.remove_accents(text, method='unicode'), proc_text)
    self.assertEqual(preprocess.remove_accents(text, method='ascii'), proc_text)
    self.assertRaises(Exception, preprocess.remove_accents, text, method='foo')
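The two tests above exercise the same behavior, once in pytest style and once in unittest style. For reference, here is a minimal standard-library sketch of the accent stripping those tests expect; it mirrors the expected behavior, not textacy's actual implementation, and the helper name is made up.

import unicodedata

def strip_accents_sketch(text):
    # Hypothetical helper, not part of textacy: decompose each character
    # (NFKD), then drop the combining marks.
    decomposed = unicodedata.normalize("NFKD", text)
    return "".join(ch for ch in decomposed if not unicodedata.combining(ch))

assert strip_accents_sketch("El niño se asustó -- qué miedo!") == (
    "El nino se asusto -- que miedo!"
)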
def clean_tweet(self, text):
    # FIX BAD UNICODE
    text = preprocess.fix_bad_unicode(text)
    # GET TEXT ONLY FROM HTML
    text = BeautifulSoup(text, features='lxml').getText()
    # UNPACK CONTRACTIONS
    text = preprocess.unpack_contractions(text)
    # REPLACE URLS WITH A PLACEHOLDER
    text = preprocess.replace_urls(text)
    # REPLACE EMAILS
    text = preprocess.replace_emails(text)
    # REPLACE PHONE NUMBERS
    text = preprocess.replace_phone_numbers(text)
    # REPLACE NUMBERS
    text = preprocess.replace_numbers(text)
    # REPLACE CURRENCY SYMBOLS
    text = preprocess.replace_currency_symbols(text)
    # REMOVE ACCENTS
    text = preprocess.remove_accents(text)
    # CONVERT EMOTICONS AND EMOJIS TO TEXT
    words = text.split()
    reformed = [
        self.SMILEY[word] if word in self.SMILEY else word for word in words
    ]
    text = " ".join(reformed)
    text = emoji.demojize(text)
    text = text.replace(":", " ")
    text = ' '.join(text.split())
    # SPLIT ATTACHED (CamelCase) WORDS
    text = ' '.join(re.findall('[A-Z][^A-Z]*', text))
    # SPLIT UNDERSCORE-JOINED WORDS
    text = text.replace('_', ' ')
    # REMOVE PUNCTUATION
    text = preprocess.remove_punct(text)
    # REMOVE REMAINING DIGITS
    text = re.sub(r'\d', '', text)
    # REMOVE WORDS SHORTER THAN 3 CHARACTERS
    text = re.sub(r'\b\w{1,2}\b', '', text)
    # NORMALIZE WHITESPACE
    text = preprocess.normalize_whitespace(text)
    return text
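clean_tweet is a method and relies on names defined elsewhere in its class and module: self.SMILEY, emoji, re, BeautifulSoup, and an older textacy release that still ships textacy.preprocess. A hedged harness showing how it might be wired up; the TweetCleaner class and its SMILEY table are assumptions made for illustration, not taken from the original repository.

import re

import emoji
from bs4 import BeautifulSoup
from textacy import preprocess  # legacy module; newer releases use textacy.preprocessing

class TweetCleaner:
    # Minimal emoticon-to-word map (hypothetical); the real table is
    # presumably much larger.
    SMILEY = {":)": "smile", ":(": "sad", ":D": "laugh"}

# Attach the function defined above as a method (assumes clean_tweet is
# in scope at module level).
TweetCleaner.clean_tweet = clean_tweet

cleaner = TweetCleaner()
print(cleaner.clean_tweet("SoProud of this team :) übercool!!!"))

Note that the CamelCase split (re.findall('[A-Z][^A-Z]*', ...)) silently drops any leading run of lowercase text, so this pipeline implicitly assumes the cleaned text starts with a capital letter.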
def preprocess(line):
    """
    Preprocess the given line.

    :param line: line as str
    :return: preprocessed sentence(s)
    """
    result = ''
    if len(line) < args.linelength:
        if args.clean:
            line = clean_text(line)
        if args.lemmatize:
            doc = nlp(line)
            tokens = [token.lemma_ for token in doc]
        else:
            tokens = line.split()
        if args.stem:
            tokens = [stemmer.stem(t) for t in tokens]
        if args.decapitalize:
            tokens = [t.lower() for t in tokens]
        if args.umlaute:
            tokens = [replace_umlaute(t) for t in tokens]
        if args.accents:
            tokens = [pp.remove_accents(t) for t in tokens]
        if args.numbers:
            tokens = [
                pp.replace_numbers(t, replace_with='*NUMMER*') for t in tokens
            ]
        if args.punctuation:
            tokens = [t for t in tokens if t not in punctuation_tokens]
        if args.stopwords:
            tokens = [t for t in tokens if t.lower() not in stop_words]
        if args.forbidden:
            tokens = [
                t for t in tokens
                if not any(kw in t.lower() for kw in forbidden_keywords)
            ]
        if len(tokens) > 3:
            result = "{}\n".format(' '.join(tokens))
    return result
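This function depends on module-level state from the surrounding script: args, nlp, stemmer, pp, clean_text, replace_umlaute, punctuation_tokens, stop_words, and forbidden_keywords. A hedged harness that exercises it with the spaCy- and stemmer-dependent flags switched off; every stand-in value below is an assumption for illustration, not the original script's configuration.

import argparse
from string import punctuation

import textacy.preprocess as pp  # legacy module path, as in the snippet above

# Hypothetical flag values; flags that would need external helpers
# (nlp, stemmer, clean_text, replace_umlaute) are left off so the
# harness stays self-contained.
args = argparse.Namespace(
    linelength=500, clean=False, lemmatize=False, stem=False,
    decapitalize=True, umlaute=False, accents=True, numbers=True,
    punctuation=True, stopwords=True, forbidden=False,
)
punctuation_tokens = set(punctuation)
stop_words = {"der", "die", "das", "und"}  # placeholder German stop words
forbidden_keywords = []

# Assumes preprocess() above is defined in the same module.
print(preprocess("Am 3. Tag zahlte er 100 Euro für die Reparatur."))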