def test_emails(self):
    """Check ``replace_email("1", text)`` against a table of expected outputs.

    Each case runs in its own ``subTest`` (keyed by its index ``i``) so a
    failing case is reported individually without stopping the others.
    """
    # (input text, expected text after substitution with the token "1")
    cases = (
        (u"Has a [email protected]", u"Has a 1"),
        (u"[email protected] is a nice site", u"1 is a nice site"),
        (u"super cool [email protected] site", u"super cool 1 site"),
    )
    for index, case in enumerate(cases):
        source, expected = case
        with self.subTest(i=index):
            self.assertEqual(replace_email("1", source), expected)
def word_count(text, lang=None):
    """Return the number of words in ``text``.

    Tags are stripped first via ``remove_tags``; the text is then passed
    through the ``replace_*`` helpers with a fixed placeholder each
    (presumably so that URLs, emails, dates, phone numbers and money amounts
    are not broken into several words -- confirm against those helpers).

    Args:
        text: The text to count words in.
        lang: Optional language identifier. When it is given and is not in
            ``asian_languages``, the result is simply
            ``word_count_aux`` of the cleaned text.

    Returns:
        The word count. When ``lang`` is ``None`` or is in
        ``asian_languages``, the result is the sum of ``is_asian(c)`` over
        every character of the cleaned text plus ``word_count_aux`` of the
        text rebuilt from ``filter_jchars(c)`` for each character.
    """
    # remove_tags returns a pair; only the first element (the text) is used.
    cleaned, _ = remove_tags(text)

    # Apply each replacement in the same fixed order, feeding the output of
    # one step into the next.
    substitutions = (
        (replace_url, u'url'),
        (replace_email, u'email'),
        (replace_date, u'date'),
        (replace_phone, u'phone'),
        (replace_money, u'money'),
    )
    for substitute, placeholder in substitutions:
        cleaned = substitute(placeholder, cleaned)

    # Unknown language or a listed Asian language: tally characters flagged
    # by is_asian separately and word-count only what filter_jchars keeps.
    if lang is None or lang in asian_languages:
        asian_total = sum(is_asian(char) for char in cleaned)
        remainder = "".join(filter_jchars(char) for char in cleaned)
        return word_count_aux(remainder) + asian_total

    return word_count_aux(cleaned)