def refine_facebook_post(text):
    """Clean up the text of a Facebook post.

    Steps, in order:
      1. strip ftp/http/https URLs,
      2. replace every ``#hashtag`` and then every ``@username`` with the
         result of ``refine_entities`` (or drop it when that returns None),
      3. trim leading/trailing whitespace,
      4. tokenize with ``word_tokenize`` and keep only the tokens for which
         ``is_english`` is truthy, joined by single spaces.

    :param text: raw post text, or None.
    :return: the cleaned text, or None when ``text`` is None.
    """
    if text is None:
        return None

    # removes URLs
    new_text = re.sub(
        r'(ftp|http[s]?)://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        '',
        text,
    )

    def _replace_entity(match):
        # A callback keeps the replacement literal (no backslash/group
        # expansion) and rewrites exactly the matched entity, so a short
        # tag such as '#foo' can no longer clobber the prefix of '#foobar'.
        sub = refine_entities(match.group(0))
        return '' if sub is None else sub

    # refine entities: hashtags first, then usernames (same order as before)
    new_text = re.sub(r'#\w+', _replace_entity, new_text)
    new_text = re.sub(r'@\w+', _replace_entity, new_text)

    # remove starting and trailing whitespaces
    new_text = new_text.strip()

    # tokenize the string and filter out non-English language usage
    tokens = word_tokenize(new_text)
    return " ".join(token for token in tokens if is_english(token))
def get_twitter_trends(twitter_client, woe_id):
    """Fetch the trends for a place and refine each trend name.

    Calls ``twitter_client.client.trends_place(woe_id)`` and passes the
    ``'name'`` of every entry in ``response[0]['trends']`` through
    ``refine_entities``.

    :param twitter_client: object exposing a ``client`` attribute with a
        ``trends_place`` method.
    :param woe_id: place identifier forwarded to ``trends_place``
        (presumably a Yahoo WOEID -- confirm against the caller).
    :return: list of refined trend names, or None when either argument
        is None.
    """
    # The previous guard tested ``not (twitter_client, woe_id)``; a
    # two-element tuple is always truthy, so it never triggered.
    if twitter_client is None or woe_id is None:
        return None

    response_data = twitter_client.client.trends_place(woe_id)
    return [
        refine_entities(content['name'])
        for content in response_data[0]['trends']
    ]
def test_username_filter():
    """refine_entities on an '@'-prefixed handle yields space-separated words."""
    raw_handle = '@12345_icloudPrivacyLeak'
    refined = refine_entities(raw_handle)
    assert refined == '12345 icloud Privacy Leak'
def test_hashtag_sample_three():
    """refine_entities on a hashtag with an underscore yields two words."""
    raw_tag = '#12345_icloud'
    refined = refine_entities(raw_tag)
    assert refined == '12345 icloud'
def test_hashtag_sample_two():
    """refine_entities splits a CamelCase hashtag into separate words."""
    raw_tag = '#BonjourTomCruise'
    refined = refine_entities(raw_tag)
    assert refined == 'Bonjour Tom Cruise'
def test_hashtag_sample_one():
    """refine_entities on a mixed-script hashtag keeps only the digits and English words."""
    raw_tag = '#12345こんにちはJapanWelcomeTo日本'
    refined = refine_entities(raw_tag)
    assert refined == '12345 Japan Welcome To'
def test_non_english_language_detection_two():
    """refine_entities on un-prefixed mixed-script text keeps only the digits and English words."""
    raw_text = '12345こんにちはJapanWelcomeTo日本'
    refined = refine_entities(raw_text)
    assert refined == '12345 Japan Welcome To'