import twitter_text

def get_entities(tweet):
    extractor = twitter_text.Extractor(tweet['text'])

    # Note: the production Twitter API contains a few additional fields in
    # the entities hash that would require additional API calls to resolve.
    # See API resources that offer the include_entities parameter for details.
    entities = {}

    entities['user_mentions'] = []
    for um in extractor.extract_mentioned_screen_names_with_indices():
        entities['user_mentions'].append(um)

    entities['hashtags'] = []
    for ht in extractor.extract_hashtags_with_indices():
        # Massage field name to match production twitter api
        ht['text'] = ht['hashtag']
        del ht['hashtag']
        entities['hashtags'].append(ht)

    entities['urls'] = []
    for url in extractor.extract_urls_with_indices():
        entities['urls'].append(url)

    return entities
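
# A minimal usage sketch for get_entities above. The sample tweet dict is an
# assumption (only the 'text' field is required by the function); the output
# shape mirrors what the *_with_indices helpers return:
sample_tweet = {'text': "RT @user check #python at http://example.com"}
entities = get_entities(sample_tweet)
print(entities['user_mentions'])  # e.g. [{'screen_name': 'user', 'indices': (3, 8)}]
print(entities['hashtags'])       # e.g. [{'text': 'python', 'indices': (15, 22)}]
print(entities['urls'])           # e.g. [{'url': 'http://example.com', 'indices': (26, 44)}]
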
def get_entities(self, text):
    """Extract entities from tweet as text and return an entity dict.

    Function modified from:
    https://github.com/ptwobrussell/Mining-the-Social-Web/blob/master/python_code/the_tweet__extract_tweet_entities.py
    """
    extractor = twitter_text.Extractor(text)

    entities = {}
    entities['user_mentions'] = []
    for um in extractor.extract_mentioned_screen_names_with_indices():
        entities['user_mentions'].append(um)

    entities['hashtags'] = []
    for ht in extractor.extract_hashtags_with_indices():
        # massage field name to match production twitter api
        ht['text'] = ht['hashtag']
        del ht['hashtag']
        entities['hashtags'].append(ht)

    entities['urls'] = []
    for url in extractor.extract_urls_with_indices():
        entities['urls'].append(url)

    return entities
import twitter_text

def getEntities(tweet):
    # Now extract various entities from it and build up a familiar structure
    extractor = twitter_text.Extractor(tweet['text'])

    # Note that the production Twitter API contains a few additional fields in
    # the entities hash that would require additional API calls to resolve
    entities = {}

    entities['user_mentions'] = []
    for um in extractor.extract_mentioned_screen_names_with_indices():
        entities['user_mentions'].append(um)

    entities['hashtags'] = []
    for ht in extractor.extract_hashtags_with_indices():
        # massage field name to match production twitter api
        ht['text'] = ht['hashtag']
        del ht['hashtag']
        entities['hashtags'].append(ht)

    entities['urls'] = []
    for url in extractor.extract_urls_with_indices():
        entities['urls'].append(url)

    return entities
import twitter_text

def get_entities(tweet):
    extractor = twitter_text.Extractor(tweet['text'])

    entities = {}
    entities['user_mentions_indices'] = []
    for um in extractor.extract_mentioned_screen_names_with_indices():
        entities['user_mentions_indices'].append(um)

    entities['hashtags_indices'] = []
    for hts in extractor.extract_hashtags_with_indices():
        entities['hashtags_indices'].append(hts)

    entities['urls_indices'] = []
    for url in extractor.extract_urls_with_indices():
        entities['urls_indices'].append(url)

    # entities['user_name'] = []
    # for un in extractor.extract_mentioned_screen_names():
    #     entities['user_name'].append(un)

    # entities['hashtag'] = []
    # for ht in extractor.extract_hashtags():
    #     entities['hashtag'].append(ht)

    # entities['url'] = []
    # for ur in extractor.extract_urls():
    #     entities['url'].append(ur)

    return entities
import twitter_text

def get_mentions(tweet):
    '''
    A regular expression is used to identify mentions.
    '''
    extractor = twitter_text.Extractor(tweet)
    entities = []
    for um in extractor.extract_mentioned_screen_names_with_indices():
        entities.append(um['screen_name'])
    return entities
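
# A minimal usage sketch for get_mentions (the sample text below is an
# assumption, not from the original source):
if __name__ == '__main__':
    sample = "Thanks @foo and @bar for the #python tips!"
    print(get_mentions(sample))  # expected: ['foo', 'bar']
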
def main(): print("hi") import sys import bz2 print("Python version") print(sys.version) print("Version info.") extractor = twitter_text.Extractor() entities = extractor.extract_mentioned_screennames_with_indices( "fooo @jack @biz fooo") print(entities) print(dir(entities[0])) print(entities[0].value)
def uniqueMentionsAndAuthorsMapper(doc):
    # Backfill the entities field for tweets that lack it
    if not doc.get('entities'):
        import twitter_text

        extractor = twitter_text.Extractor(doc['text'])
        doc['entities'] = {
            'user_mentions':
                [um for um in extractor.extract_mentioned_screen_names_with_indices()]
        }

    # Emit the @mentions
    if doc['entities'].get('user_mentions'):
        for um in doc['entities']['user_mentions']:
            yield [um['screen_name'].lower(), None]

    # Emit the tweet author
    yield [doc['from_user'].lower(), None]
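
# A minimal sketch of how this mapper yields unique-name keys when driven
# locally over a few docs (the sample docs are assumptions, not from the
# source; in practice the function would be registered as a map function):
if __name__ == '__main__':
    docs = [
        {'from_user': 'Alice', 'text': 'hello @Bob!'},
        {'from_user': 'bob', 'text': 'hi back @alice'},
    ]
    keys = set()
    for d in docs:
        for key, _ in uniqueMentionsAndAuthorsMapper(d):
            keys.add(key)
    print(sorted(keys))  # expected: ['alice', 'bob']
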
import twitter_text

def get_entities(tweet):
    extractor = twitter_text.Extractor(tweet['text'])

    entities = {}
    entities['user_mentions'] = []
    for um in extractor.extract_mentioned_screen_names_with_indices():
        entities['user_mentions'].append(um)

    entities['hashtags'] = []
    for ht in extractor.extract_hashtags_with_indices():
        # Rename 'hashtag' to 'text' to match the production Twitter API
        ht['text'] = ht['hashtag']
        del ht['hashtag']
        entities['hashtags'].append(ht)

    entities['urls'] = []
    for url in extractor.extract_urls_with_indices():
        entities['urls'].append(url)

    return entities
print get_user_profile(twitter_api, screen_names=["SocialWebMining",
                                                  "ptwobrussell"])

#print get_user_profile(twitter_api, user_ids=[132373965])


### Example 18. Extracting tweet entities from arbitrary text

# In[ ]:

import twitter_text

# Sample usage
txt = "RT @SocialWebMining Mining 1M+ Tweets About #Syria http://wp.me/p3QiJd-1I"

ex = twitter_text.Extractor(txt)

print "Screen Names:", ex.extract_mentioned_screen_names_with_indices()
print "URLs:", ex.extract_urls_with_indices()
print "Hashtags:", ex.extract_hashtags_with_indices()


### Example 19. Getting all friends or followers for a user

# In[ ]:

from functools import partial
from sys import maxint

def get_friends_followers_ids(twitter_api, screen_name=None,
import twitter_text

def extract_urls(text):
    extractor = twitter_text.Extractor(text)
    urls = []
    for url in extractor.extract_urls_with_indices():
        urls.append(url)
    return urls
def test_extractor_ctor():
    extractor = twitter_text.Extractor()
    assert extractor is not None
import yaml
import twitter_text

def test_yaml():
    extractor = twitter_text.Extractor()
    with open(r"rust/conformance/tests/extract.yml") as file:
        testmap = yaml.load(file, Loader=yaml.FullLoader)

    assert len(testmap["tests"]["mentions"]) > 0
    for test in testmap["tests"]["mentions"]:
        mentions = extractor.extract_mentioned_screennames(test["text"])
        for index, s in enumerate(mentions):
            assert s == test["expected"][index]

    assert len(testmap["tests"]["mentions_with_indices"]) > 0
    for test in testmap["tests"]["mentions_with_indices"]:
        entities = extractor.extract_mentioned_screennames_with_indices(
            test["text"])
        assert len(test["expected"]) == len(entities)
        for index, entity in enumerate(entities):
            assert entity.value == test["expected"][index]["screen_name"]
            assert entity.start == test["expected"][index]["indices"][0]
            assert entity.end == test["expected"][index]["indices"][1]

    assert len(testmap["tests"]["mentions_or_lists_with_indices"]) > 0
    for test in testmap["tests"]["mentions_or_lists_with_indices"]:
        entities = extractor.extract_mentions_or_lists_with_indices(
            test["text"])
        assert len(test["expected"]) == len(entities)
        for index, entity in enumerate(entities):
            assert entity.value == test["expected"][index]["screen_name"]
            assert entity.list_slug == test["expected"][index]["list_slug"]
            assert entity.start == test["expected"][index]["indices"][0]
            assert entity.end == test["expected"][index]["indices"][1]

    assert len(testmap["tests"]["replies"]) > 0
    for test in testmap["tests"]["replies"]:
        entity = extractor.extract_reply_screenname(test["text"])
        if entity is not None:
            assert entity.value == test["expected"]
        else:
            assert test["expected"] is None

    assert len(testmap["tests"]["urls"]) > 0
    for test in testmap["tests"]["urls"]:
        urls = extractor.extract_urls(test["text"])
        for index, s in enumerate(urls):
            assert s == test["expected"][index]

    assert len(testmap["tests"]["urls_with_indices"]) > 0
    for test in testmap["tests"]["urls_with_indices"]:
        entities = extractor.extract_urls_with_indices(test["text"])
        assert len(test["expected"]) == len(entities)
        for index, entity in enumerate(entities):
            assert entity.value == test["expected"][index]["url"]
            assert entity.start == test["expected"][index]["indices"][0]
            assert entity.end == test["expected"][index]["indices"][1]

    assert len(testmap["tests"]["urls_with_directional_markers"]) > 0
    for test in testmap["tests"]["urls_with_directional_markers"]:
        entities = extractor.extract_urls_with_indices(test["text"])
        assert len(test["expected"]) == len(entities)
        for index, entity in enumerate(entities):
            assert entity.value == test["expected"][index]["url"]
            assert entity.start == test["expected"][index]["indices"][0]
            assert entity.end == test["expected"][index]["indices"][1]

    assert len(testmap["tests"]["tco_urls_with_params"]) > 0
    for test in testmap["tests"]["tco_urls_with_params"]:
        entities = extractor.extract_urls_with_indices(test["text"])
        assert len(test["expected"]) == len(entities)
        for index, entity in enumerate(entities):
            assert entity.value == test["expected"][index]

    assert len(testmap["tests"]["hashtags"]) > 0
    for test in testmap["tests"]["hashtags"]:
        hashtags = extractor.extract_hashtags(test["text"])
        assert len(test["expected"]) == len(hashtags)
        for index, hashtag in enumerate(hashtags):
            assert hashtag == test["expected"][index]

    assert len(testmap["tests"]["hashtags_from_astral"]) > 0
    for test in testmap["tests"]["hashtags_from_astral"]:
        hashtags = extractor.extract_hashtags(test["text"])
        assert len(test["expected"]) == len(hashtags)
        for index, hashtag in enumerate(hashtags):
            assert hashtag == test["expected"][index]

    assert len(testmap["tests"]["hashtags_with_indices"]) > 0
    for test in testmap["tests"]["hashtags_with_indices"]:
        entities = extractor.extract_hashtags_with_indices(test["text"])
        assert len(test["expected"]) == len(entities)
        for index, entity in enumerate(entities):
            assert entity.value == test["expected"][index]["hashtag"]
            assert entity.start == test["expected"][index]["indices"][0]
            assert entity.end == test["expected"][index]["indices"][1]

    assert len(testmap["tests"]["cashtags"]) > 0
    for test in testmap["tests"]["cashtags"]:
        cashtags = extractor.extract_cashtags(test["text"])
        assert len(test["expected"]) == len(cashtags)
        for index, cashtag in enumerate(cashtags):
            assert cashtag == test["expected"][index]

    assert len(testmap["tests"]["cashtags_with_indices"]) > 0
    for test in testmap["tests"]["cashtags_with_indices"]:
        entities = extractor.extract_cashtags_with_indices(test["text"])
        assert len(test["expected"]) == len(entities)
        for index, entity in enumerate(entities):
            assert entity.value == test["expected"][index]["cashtag"]
            assert entity.start == test["expected"][index]["indices"][0]
            assert entity.end == test["expected"][index]["indices"][1]
def test_accessors():
    extractor = twitter_text.Extractor()
    assert extractor.get_extract_url_without_protocol() == True
    extractor.set_extract_url_without_protocol(False)
    assert extractor.get_extract_url_without_protocol() == False
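
# A short sketch of what the flag toggled above controls, assuming the
# behavior its name suggests (extraction of protocol-less URLs); the exact
# outputs are not verified against the library:
import twitter_text

extractor = twitter_text.Extractor()
extractor.set_extract_url_without_protocol(True)
print(extractor.extract_urls("visit example.com today"))  # likely ["example.com"]
extractor.set_extract_url_without_protocol(False)
print(extractor.extract_urls("visit example.com today"))  # likely []
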
def extractor_tests(tests, passed, failed):
    print u'Running Extractor tests'

    correct_mentioned_screen_names = [u'foo', u'monkeybat', u'bar']
    correct_mentioned_screen_names_with_indices = [
        {'indices': (0, 4), 'screen_name': u'foo'},
        {'indices': (32, 42), 'screen_name': u'monkeybat'},
        {'indices': (47, 51), 'screen_name': u'bar'},
    ]
    correct_reply_screen_name = 'foo'
    correct_urls = [u'http://dryan.net/xxxxx?param=true#hash']
    correct_urls_with_indices = [
        {'url': u'http://dryan.net/xxxxx?param=true#hash', 'indices': (52, 90)},
    ]
    correct_hashtags = [u'comedy', u'url']
    correct_hashtags_with_indices = [
        {'indices': (91, 98), 'hashtag': u'comedy'},
        {'indices': (99, 103), 'hashtag': u'url'},
    ]

    extractor = twitter_text.Extractor(text)

    if tt.extractor.extract_mentioned_screen_names() == correct_mentioned_screen_names:
        print u'\033[92m Attached extract_mentioned_screen_names passed\033[0m'
        passed += 1
    else:
        print u'\033[91m Attached extract_mentioned_screen_names failed:\033[0m'
        print u' Expected: %s' % force_unicode(correct_mentioned_screen_names)
        print u' Returned: %s' % force_unicode(tt.extractor.extract_mentioned_screen_names())
        failed += 1
    tests += 1

    if extractor.extract_mentioned_screen_names() == correct_mentioned_screen_names:
        print u'\033[92m Stand alone extract_mentioned_screen_names passed\033[0m'
        passed += 1
    else:
        print u'\033[91m Stand alone extract_mentioned_screen_names failed:\033[0m'
        print u' Expected: %s' % force_unicode(correct_mentioned_screen_names)
        print u' Returned: %s' % force_unicode(extractor.extract_mentioned_screen_names())
        failed += 1
    tests += 1

    if tt.extractor.extract_mentioned_screen_names_with_indices() == correct_mentioned_screen_names_with_indices:
        print u'\033[92m Attached extract_mentioned_screen_names_with_indices passed\033[0m'
        passed += 1
    else:
        print u'\033[91m Attached extract_mentioned_screen_names_with_indices failed:\033[0m'
        print u' Expected: %s' % force_unicode(correct_mentioned_screen_names_with_indices)
        print u' Returned: %s' % force_unicode(tt.extractor.extract_mentioned_screen_names_with_indices())
        failed += 1
    tests += 1

    if extractor.extract_mentioned_screen_names_with_indices() == correct_mentioned_screen_names_with_indices:
        print u'\033[92m Stand alone extract_mentioned_screen_names_with_indices passed\033[0m'
        passed += 1
    else:
        print u'\033[91m Stand alone extract_mentioned_screen_names_with_indices failed:\033[0m'
        print u' Expected: %s' % force_unicode(correct_mentioned_screen_names_with_indices)
        print u' Returned: %s' % force_unicode(extractor.extract_mentioned_screen_names_with_indices())
        failed += 1
    tests += 1

    if tt.extractor.extract_reply_screen_name() == correct_reply_screen_name:
        print u'\033[92m Attached extract_reply_screen_name passed\033[0m'
        passed += 1
    else:
        print u'\033[91m Attached extract_reply_screen_name failed:\033[0m'
        print u' Expected: %s' % force_unicode(correct_reply_screen_name)
        print u' Returned: %s' % force_unicode(tt.extractor.extract_reply_screen_name())
        failed += 1
    tests += 1

    if extractor.extract_reply_screen_name() == correct_reply_screen_name:
        print u'\033[92m Stand alone extract_reply_screen_name passed\033[0m'
        passed += 1
    else:
        print u'\033[91m Stand alone extract_reply_screen_name failed:\033[0m'
        print u' Expected: %s' % force_unicode(correct_reply_screen_name)
        print u' Returned: %s' % force_unicode(extractor.extract_reply_screen_name())
        failed += 1
    tests += 1

    if tt.extractor.extract_urls() == correct_urls:
        print u'\033[92m Attached extract_urls passed\033[0m'
        passed += 1
    else:
        print u'\033[91m Attached extract_urls failed:\033[0m'
        print u' Expected: %s' % force_unicode(correct_urls)
        print u' Returned: %s' % force_unicode(tt.extractor.extract_urls())
        failed += 1
    tests += 1

    if extractor.extract_urls() == correct_urls:
        print u'\033[92m Stand alone extract_urls passed\033[0m'
        passed += 1
    else:
        print u'\033[91m Stand alone extract_urls failed:\033[0m'
        print u' Expected: %s' % force_unicode(correct_urls)
        print u' Returned: %s' % force_unicode(extractor.extract_urls())
        failed += 1
    tests += 1

    if tt.extractor.extract_urls_with_indices() == correct_urls_with_indices:
        print u'\033[92m Attached extract_urls_with_indices passed\033[0m'
        passed += 1
    else:
        print u'\033[91m Attached extract_urls_with_indices failed:\033[0m'
        print u' Expected: %s' % force_unicode(correct_urls_with_indices)
        print u' Returned: %s' % force_unicode(tt.extractor.extract_urls_with_indices())
        failed += 1
    tests += 1

    if extractor.extract_urls_with_indices() == correct_urls_with_indices:
        print u'\033[92m Stand alone extract_urls_with_indices passed\033[0m'
        passed += 1
    else:
        print u'\033[91m Stand alone extract_urls_with_indices failed:\033[0m'
        print u' Expected: %s' % force_unicode(correct_urls_with_indices)
        print u' Returned: %s' % force_unicode(extractor.extract_urls_with_indices())
        failed += 1
    tests += 1

    if tt.extractor.extract_hashtags() == correct_hashtags:
        print u'\033[92m Attached extract_hashtags passed\033[0m'
        passed += 1
    else:
        print u'\033[91m Attached extract_hashtags failed:\033[0m'
        print u' Expected: %s' % force_unicode(correct_hashtags)
        print u' Returned: %s' % force_unicode(tt.extractor.extract_hashtags())
        failed += 1
    tests += 1

    if extractor.extract_hashtags() == correct_hashtags:
        print u'\033[92m Stand alone extract_hashtags passed\033[0m'
        passed += 1
    else:
        print u'\033[91m Stand alone extract_hashtags failed:\033[0m'
        print u' Expected: %s' % force_unicode(correct_hashtags)
        print u' Returned: %s' % force_unicode(extractor.extract_hashtags())
        failed += 1
    tests += 1

    if tt.extractor.extract_hashtags_with_indices() == correct_hashtags_with_indices:
        print u'\033[92m Attached extract_hashtags_with_indices passed\033[0m'
        passed += 1
    else:
        print u'\033[91m Attached extract_hashtags_with_indices failed:\033[0m'
        print u' Expected: %s' % force_unicode(correct_hashtags_with_indices)
        print u' Returned: %s' % force_unicode(tt.extractor.extract_hashtags_with_indices())
        failed += 1
    tests += 1

    if extractor.extract_hashtags_with_indices() == correct_hashtags_with_indices:
        print u'\033[92m Stand alone extract_hashtags_with_indices passed\033[0m'
        passed += 1
    else:
        print u'\033[91m Stand alone extract_hashtags_with_indices failed:\033[0m'
        print u' Expected: %s' % force_unicode(correct_hashtags_with_indices)
        print u' Returned: %s' % force_unicode(extractor.extract_hashtags_with_indices())
        failed += 1
    tests += 1

    return tests, passed, failed
# results is a list of dictionary items obtained from twitter
# these functions assume that the text of each tweet
# is written to a separate line in the output text file

#item_count = 0  # initialize count of objects dumped to file
#with open(partial_text_filename_u, 'w') as outfile:
#    for dict_item in results_u:
#        outfile.write(json.dumps(dict_item['text']))
#        item_count = item_count + 1
#        if item_count < len(results_u):
#            outfile.write(line_termination)  # new line between text items


# In[11]:

# Using extract
# Citation: Mining the Social Web, 2nd Edition 9.2 Tweet Entities from Arbitrary Text

users_extract = twitter_text.Extractor(results_u)
ue = ("Screen Names:", users_extract.extract_mentioned_screen_names())
ue_list = list(ue)
type(ue_list)

competition_extract = twitter_text.Extractor(results_twt)
ce = ("Screen Names:", competition_extract.extract_mentioned_screen_names())
ce_list = list(ce)
type(ce_list)

# FAIL: Looking to identify common values between lists
# Citation: https://stackoverflow.com/questions/28061223/python-how-to-find-common-values-in-three-lists
# set(ce_list).intersection(ue_list)

print(ue_list)
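
# The commented-out intersection above fails because ue_list and ce_list are
# two-element lists of the form ["Screen Names:", [...names...]], not flat
# name lists. A minimal sketch of the intended comparison, assuming results_u
# and results_twt are lists of tweet dicts as described at the top of this
# cell (joining the 'text' fields first, since Extractor expects a string):
u_text = " ".join(t['text'] for t in results_u)
twt_text = " ".join(t['text'] for t in results_twt)
ue_names = twitter_text.Extractor(u_text).extract_mentioned_screen_names()
ce_names = twitter_text.Extractor(twt_text).extract_mentioned_screen_names()
common_names = set(ce_names).intersection(ue_names)
print(common_names)
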
    marks = ['RT @', '@', '"', '#', '\n', '\t', ' ']
    for k in marks:
        tweet = tweet.replace(k, '')
    return tweet


# In[32]:

import twitter_text

tweet = '''RT @AnonKitsu: ALERT!!!!!!!!!!COPS ARE KETTLING PROTESTERS IN PARK W HELICOPTERS AND PADDYWAGONS!!!! #OCCUPYWALLSTREET #OWS #OCCUPYNY PLEASE @chengjun @mili http://computational-communication.com http://ccc.nju.edu.cn RT !!HELP!!!!'''

ex = twitter_text.Extractor(tweet)
at_names = ex.extract_mentioned_screen_names()
urls = ex.extract_urls()
hashtags = ex.extract_hashtags()
rt_user = extract_rt_user(tweet)
#tweet_text = extract_tweet_text(tweet, at_names, urls)

print(at_names, urls, hashtags, rt_user, '-------->')  #, tweet_text)


# In[33]:

import csv

lines = csv.reader(chunk, delimiter=',', quotechar='"')
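
# extract_rt_user is called above but not defined in this excerpt. A minimal
# sketch of one plausible implementation, matching the "RT @name" / "via
# @name" prefix with a regex (an assumption, not the original author's code):
import re

def extract_rt_user(tweet):
    # Capture the screen name that follows an RT or via marker
    rt_patterns = re.compile(r"(RT|via)((?:\b\W*@\w+)+)", re.IGNORECASE)
    rt_user_name = rt_patterns.findall(tweet)
    if rt_user_name:
        return rt_user_name[0][1].strip(' @')
    return None
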