def test_replace_currency_symbols(self):
    """Currency symbols become currency codes by default, or a custom token."""
    sample = '$1.00 equals £0.67 equals €0.91.'
    with_codes = 'USD1.00 equals GBP0.67 equals EUR0.91.'
    with_token = '*CUR* 1.00 equals *CUR* 0.67 equals *CUR* 0.91.'
    # replace_with=None is expected to yield the three-letter codes; a string
    # is expected to be substituted verbatim for every symbol.
    for replacement, expected in ((None, with_codes), ('*CUR* ', with_token)):
        self.assertEqual(
            preprocess.replace_currency_symbols(sample, replace_with=replacement),
            expected)
def test_replace_currency_symbols(self):
    """Table-driven check of ``replace_currency_symbols`` in both modes.

    Each case is ``(input, expected with replace_with=None,
    expected with replace_with='*CUR* ')``.
    """
    cases = [
        (
            '$1.00 equals £0.67 equals €0.91.',
            'USD1.00 equals GBP0.67 equals EUR0.91.',
            '*CUR* 1.00 equals *CUR* 0.67 equals *CUR* 0.91.',
        ),
        (
            'this zebra costs $100.',
            'this zebra costs USD100.',
            'this zebra costs *CUR* 100.',
        ),
    ]
    for raw, expected_codes, expected_token in cases:
        actual_codes = preprocess.replace_currency_symbols(raw, replace_with=None)
        self.assertEqual(actual_codes, expected_codes)
        actual_token = preprocess.replace_currency_symbols(raw, replace_with='*CUR* ')
        self.assertEqual(actual_token, expected_token)
def preprocess_unicode(raw_text):
    """Lower-case and transliterate ``raw_text``, then blank out noisy tokens.

    After transliteration, the text is passed in turn through the URL,
    e-mail, phone-number, number and currency-symbol replacers, each with an
    empty replacement string.

    Args:
        raw_text: The text to clean.

    Returns:
        The cleaned text.
    """
    cleaned = preprocess.transliterate_unicode(raw_text.lower())
    # Applied in this exact order; every step substitutes the empty string.
    scrubbers = (
        preprocess.replace_urls,
        preprocess.replace_emails,
        preprocess.replace_phone_numbers,
        preprocess.replace_numbers,
        preprocess.replace_currency_symbols,
    )
    for scrub in scrubbers:
        cleaned = scrub(cleaned, replace_with=u'')
    return cleaned
def test_replace_currency_symbols():
    """Check ``replace_currency_symbols`` with the default and a custom token.

    Each case is ``(input, expected with replace_with=None,
    expected with replace_with="*CUR* ")``.
    """
    cases = (
        (
            "$1.00 equals £0.67 equals €0.91.",
            "USD1.00 equals GBP0.67 equals EUR0.91.",
            "*CUR* 1.00 equals *CUR* 0.67 equals *CUR* 0.91.",
        ),
        (
            "this zebra costs $100.",
            "this zebra costs USD100.",
            "this zebra costs *CUR* 100.",
        ),
    )
    for raw, want_codes, want_token in cases:
        got_codes = preprocess.replace_currency_symbols(raw, replace_with=None)
        assert got_codes == want_codes
        got_token = preprocess.replace_currency_symbols(raw, replace_with="*CUR* ")
        assert got_token == want_token
def clean_tweet(self, text):
    """Normalize a raw tweet into plain, whitespace-separated words.

    The pipeline repairs broken unicode, strips HTML markup, expands
    contractions, substitutes URLs / e-mails / phone numbers / numbers /
    currency symbols (each helper is called with its default replacement),
    removes accents, swaps tokens found in ``self.SMILEY`` for their mapped
    value, converts emojis to their textual names, splits attached and
    underscore-joined words, and finally drops punctuation, digits and
    words of one or two characters.

    Args:
        text: Raw tweet text, possibly containing HTML.

    Returns:
        The cleaned text with normalized whitespace.
    """
    # Repair mis-encoded unicode before any other processing.
    text = preprocess.fix_bad_unicode(text)
    # Keep only the textual content of any HTML markup.
    text = BeautifulSoup(text, features='lxml').getText()
    text = preprocess.unpack_contractions(text)

    # Substitute noisy tokens using each helper's default replacement.
    text = preprocess.replace_urls(text)
    text = preprocess.replace_emails(text)
    text = preprocess.replace_phone_numbers(text)
    text = preprocess.replace_numbers(text)
    text = preprocess.replace_currency_symbols(text)
    text = preprocess.remove_accents(text)

    # Convert whitespace-delimited tokens listed in SMILEY, then emojis, to text.
    text = " ".join(self.SMILEY.get(word, word) for word in text.split())
    text = emoji.demojize(text)
    # demojize wraps names in colons (":name:"); turn them into separators.
    text = text.replace(":", " ")
    text = ' '.join(text.split())

    # Split attached words by starting a new chunk at every uppercase letter.
    # The second alternative keeps any text that precedes the first uppercase
    # letter; without it that prefix (or an entirely lowercase tweet) would be
    # silently discarded.
    text = ' '.join(re.findall(r'[A-Z][^A-Z]*|[^A-Z]+', text))
    # Split underscore-joined words.
    text = text.replace('_', ' ')

    text = preprocess.remove_punct(text)
    # Drop any remaining digits.
    text = re.sub(r'\d', '', text)
    # Drop words that are only one or two characters long.
    text = re.sub(r'\b\w{1,2}\b', '', text)
    # Collapse the whitespace left behind by the removals above.
    text = preprocess.normalize_whitespace(text)
    return text
def clean_text(self, raw_text):
    """Strip markup and noisy tokens from ``raw_text`` and return plain text.

    Steps, in order: strip tags, lower-case, transliterate unicode, remove
    URLs, e-mail addresses, phone numbers, numbers and currency symbols
    (each replaced by the empty string), and finally remove punctuation.

    Punctuation is removed last on purpose. The URL, e-mail and phone-number
    matchers depend on characters such as ``://``, ``.``, ``@`` and ``-``;
    stripping punctuation first would leave those patterns unrecognizable
    and the replacements would not fire.

    Args:
        raw_text: The text to clean; may contain markup handled by
            ``self.strip_tags``.

    Returns:
        The cleaned, lower-cased text.
    """
    text = self.strip_tags(raw_text).lower()
    text = preprocess.transliterate_unicode(text)
    # Pattern-based replacements must see the original punctuation.
    scrubbers = (
        preprocess.replace_urls,
        preprocess.replace_emails,
        preprocess.replace_phone_numbers,
        preprocess.replace_numbers,
        preprocess.replace_currency_symbols,
    )
    for scrub in scrubbers:
        text = scrub(text, replace_with='')
    # Only now is it safe to drop the remaining punctuation.
    return preprocess.remove_punct(text)
def test_replace_currency_symbols(self):
    """Verify both replacement modes of ``replace_currency_symbols``."""
    source = '$1.00 equals £0.67 equals €0.91.'

    # With replace_with=None the symbols are expected to become currency codes.
    coded = preprocess.replace_currency_symbols(source, replace_with=None)
    self.assertEqual(coded, 'USD1.00 equals GBP0.67 equals EUR0.91.')

    # With an explicit string, every symbol is expected to become that string.
    tokenized = preprocess.replace_currency_symbols(source, replace_with='*CUR* ')
    self.assertEqual(tokenized, '*CUR* 1.00 equals *CUR* 0.67 equals *CUR* 0.91.')