예제 #1
0
def refine_facebook_post(text):
    """Clean up the text of a Facebook post.

    The steps are, in order: strip URLs, replace every ``#hashtag`` and
    ``@username`` with the result of ``refine_entities`` (or remove it when
    that result is ``None``), trim surrounding whitespace, tokenize with
    ``word_tokenize`` and keep only the tokens accepted by ``is_english``.

    Args:
        text: The raw post text, or ``None``.

    Returns:
        The kept tokens joined by single spaces, or ``None`` when ``text``
        is ``None``.
    """
    if text is None:
        return None

    # removes URLs
    new_text = re.sub(r'(ftp|http[s]?)://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

    def _refined(match):
        # Replace exactly the matched entity.  Substituting per match (rather
        # than re-searching the text for the entity string) keeps a short tag
        # such as "#foo" from rewriting the prefix of a longer "#foobar", and
        # a callable replacement is inserted literally, so backslashes in the
        # refined text are not interpreted as regex template escapes.
        sub = refine_entities(match.group(0))
        return '' if sub is None else sub

    # refine entities: hashtags first, then usernames
    new_text = re.sub(r'#\w+', _refined, new_text)
    new_text = re.sub(r'@\w+', _refined, new_text)

    # remove starting and trailing whitespaces
    new_text = new_text.strip()

    # tokenize the string and filter out non-English language usage
    tokens = word_tokenize(new_text)
    return " ".join(token for token in tokens if is_english(token))
예제 #2
0
def get_twitter_trends(twitter_client, woe_id):
    """Fetch the trends for a place and refine each trend name.

    Args:
        twitter_client: Object whose ``client`` attribute provides
            ``trends_place(woe_id)``.
        woe_id: Identifier of the place passed through to ``trends_place``.

    Returns:
        A list with ``refine_entities(name)`` for every entry in the
        ``'trends'`` list of the first response element, or ``None`` when
        either argument is ``None``.
    """
    # The previous guard tested the tuple ``(twitter_client, woe_id)``, which
    # is never empty and therefore always truthy; check each argument instead.
    if twitter_client is None or woe_id is None:
        return None

    response_data = twitter_client.client.trends_place(woe_id)

    return [refine_entities(content['name'])
            for content in response_data[0]['trends']]
def test_username_filter():
    """An @-handle is expected to lose the '@' and be split into words."""
    refined = refine_entities('@12345_icloudPrivacyLeak')

    assert refined == '12345 icloud Privacy Leak'
def test_hashtag_sample_three():
    """A hashtag with an underscore is expected to split at the underscore."""
    hashtag, wanted = '#12345_icloud', '12345 icloud'

    assert refine_entities(hashtag) == wanted
def test_hashtag_sample_two():
    """A CamelCase hashtag is expected to be split into its words."""
    actual = refine_entities('#BonjourTomCruise')

    assert actual == 'Bonjour Tom Cruise'
def test_hashtag_sample_one():
    """A hashtag mixing Japanese and English is expected to keep only the
    digits and the English words."""
    mixed_hashtag = '#12345こんにちはJapanWelcomeTo日本'
    result = refine_entities(mixed_hashtag)

    assert result == '12345 Japan Welcome To'
def test_non_english_language_detection_two():
    """Plain text mixing Japanese and English is expected to keep only the
    digits and the English words."""
    wanted = '12345 Japan Welcome To'
    got = refine_entities('12345こんにちはJapanWelcomeTo日本')

    assert got == wanted