import twitter_text

def get_entities(tweet):
    extractor = twitter_text.Extractor(tweet['text'])

    # Note: the production Twitter API contains a few additional fields in
    # the entities hash that would require additional API calls to resolve.
    # See API resources that offer the include_entities parameter for details.
    entities = {}

    entities['user_mentions'] = []
    for um in extractor.extract_mentioned_screen_names_with_indices():
        entities['user_mentions'].append(um)

    entities['hashtags'] = []
    for ht in extractor.extract_hashtags_with_indices():
        # Massage field name to match production twitter api
        ht['text'] = ht['hashtag']
        del ht['hashtag']
        entities['hashtags'].append(ht)

    entities['urls'] = []
    for url in extractor.extract_urls_with_indices():
        entities['urls'].append(url)

    return entities
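
# A minimal usage sketch for get_entities above. The sample tweet dict is an
# assumption (only the 'text' field is required by the function); the output
# shape mirrors what the *_with_indices helpers return:
sample_tweet = {'text': "RT @user check #python at http://example.com"}
entities = get_entities(sample_tweet)
print(entities['user_mentions'])  # e.g. [{'screen_name': 'user', 'indices': (3, 8)}]
print(entities['hashtags'])       # e.g. [{'text': 'python', 'indices': (15, 22)}]
print(entities['urls'])           # e.g. [{'url': 'http://example.com', 'indices': (26, 44)}]
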
def get_entities(self, text):
    """Extract entities from tweet as text and return an entity dict.

    Function modified from:
    https://github.com/ptwobrussell/Mining-the-Social-Web/blob/master/python_code/the_tweet__extract_tweet_entities.py
    """
    extractor = twitter_text.Extractor(text)

    entities = {}
    entities['user_mentions'] = []
    for um in extractor.extract_mentioned_screen_names_with_indices():
        entities['user_mentions'].append(um)

    entities['hashtags'] = []
    for ht in extractor.extract_hashtags_with_indices():
        # massage field name to match production twitter api
        ht['text'] = ht['hashtag']
        del ht['hashtag']
        entities['hashtags'].append(ht)

    entities['urls'] = []
    for url in extractor.extract_urls_with_indices():
        entities['urls'].append(url)

    return entities
import twitter_text

def getEntities(tweet):
    # Now extract various entities from it and build up a familiar structure
    extractor = twitter_text.Extractor(tweet['text'])

    # Note that the production Twitter API contains a few additional fields in
    # the entities hash that would require additional API calls to resolve
    entities = {}

    entities['user_mentions'] = []
    for um in extractor.extract_mentioned_screen_names_with_indices():
        entities['user_mentions'].append(um)

    entities['hashtags'] = []
    for ht in extractor.extract_hashtags_with_indices():
        # massage field name to match production twitter api
        ht['text'] = ht['hashtag']
        del ht['hashtag']
        entities['hashtags'].append(ht)

    entities['urls'] = []
    for url in extractor.extract_urls_with_indices():
        entities['urls'].append(url)

    return entities
import twitter_text

def get_entities(tweet):
    extractor = twitter_text.Extractor(tweet['text'])

    entities = {}
    entities['user_mentions_indices'] = []
    for um in extractor.extract_mentioned_screen_names_with_indices():
        entities['user_mentions_indices'].append(um)

    entities['hashtags_indices'] = []
    for hts in extractor.extract_hashtags_with_indices():
        entities['hashtags_indices'].append(hts)

    entities['urls_indices'] = []
    for url in extractor.extract_urls_with_indices():
        entities['urls_indices'].append(url)

    # entities['user_name'] = []
    # for un in extractor.extract_mentioned_screen_names():
    #     entities['user_name'].append(un)

    # entities['hashtag'] = []
    # for ht in extractor.extract_hashtags():
    #     entities['hashtag'].append(ht)

    # entities['url'] = []
    # for ur in extractor.extract_urls():
    #     entities['url'].append(ur)

    return entities
import twitter_text

def get_mentions(tweet):
    '''
    A regular expression is used to identify mentions.
    '''
    extractor = twitter_text.Extractor(tweet)
    entities = []
    for um in extractor.extract_mentioned_screen_names_with_indices():
        entities.append(um['screen_name'])
    return entities
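
# A minimal usage sketch for get_mentions (the sample text below is an
# assumption, not from the original source):
if __name__ == '__main__':
    sample = "Thanks @foo and @bar for the #python tips!"
    print(get_mentions(sample))  # expected: ['foo', 'bar']
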
def main(): print("hi") import sys import bz2 print("Python version") print(sys.version) print("Version info.") extractor = twitter_text.Extractor() entities = extractor.extract_mentioned_screennames_with_indices( "fooo @jack @biz fooo") print(entities) print(dir(entities[0])) print(entities[0].value)
def uniqueMentionsAndAuthorsMapper(doc):
    # Backfill the entities field for tweets that lack it
    if not doc.get('entities'):
        import twitter_text

        extractor = twitter_text.Extractor(doc['text'])
        doc['entities'] = {
            'user_mentions':
                [um for um in extractor.extract_mentioned_screen_names_with_indices()]
        }

    # Emit the @mentions
    if doc['entities'].get('user_mentions'):
        for um in doc['entities']['user_mentions']:
            yield [um['screen_name'].lower(), None]

    # Emit the tweet author
    yield [doc['from_user'].lower(), None]
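
# A minimal sketch of how this mapper yields unique-name keys when driven
# locally over a few docs (the sample docs are assumptions, not from the
# source; in practice the function would be registered as a map function):
if __name__ == '__main__':
    docs = [
        {'from_user': 'Alice', 'text': 'hello @Bob!'},
        {'from_user': 'bob', 'text': 'hi back @alice'},
    ]
    keys = set()
    for d in docs:
        for key, _ in uniqueMentionsAndAuthorsMapper(d):
            keys.add(key)
    print(sorted(keys))  # expected: ['alice', 'bob']
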
import twitter_text

def get_entities(tweet):
    extractor = twitter_text.Extractor(tweet['text'])

    entities = {}
    entities['user_mentions'] = []
    for um in extractor.extract_mentioned_screen_names_with_indices():
        entities['user_mentions'].append(um)

    entities['hashtags'] = []
    for ht in extractor.extract_hashtags_with_indices():
        # Rename 'hashtag' to 'text' to match the production Twitter API
        ht['text'] = ht['hashtag']
        del ht['hashtag']
        entities['hashtags'].append(ht)

    entities['urls'] = []
    for url in extractor.extract_urls_with_indices():
        entities['urls'].append(url)

    return entities
print get_user_profile(twitter_api, screen_names=["SocialWebMining",
                                                  "ptwobrussell"])

#print get_user_profile(twitter_api, user_ids=[132373965])


### Example 18. Extracting tweet entities from arbitrary text

# In[ ]:

import twitter_text

# Sample usage
txt = "RT @SocialWebMining Mining 1M+ Tweets About #Syria http://wp.me/p3QiJd-1I"

ex = twitter_text.Extractor(txt)

print "Screen Names:", ex.extract_mentioned_screen_names_with_indices()
print "URLs:", ex.extract_urls_with_indices()
print "Hashtags:", ex.extract_hashtags_with_indices()


### Example 19. Getting all friends or followers for a user

# In[ ]:

from functools import partial
from sys import maxint

def get_friends_followers_ids(twitter_api, screen_name=None,
import twitter_text

def extract_urls(text):
    extractor = twitter_text.Extractor(text)
    urls = []
    for url in extractor.extract_urls_with_indices():
        urls.append(url)
    return urls
def test_extractor_ctor():
    extractor = twitter_text.Extractor()
    assert extractor is not None
import yaml
import twitter_text

def test_yaml():
    extractor = twitter_text.Extractor()
    with open(r"rust/conformance/tests/extract.yml") as file:
        testmap = yaml.load(file, Loader=yaml.FullLoader)

    assert len(testmap["tests"]["mentions"]) > 0
    for test in testmap["tests"]["mentions"]:
        mentions = extractor.extract_mentioned_screennames(test["text"])
        for index, s in enumerate(mentions):
            assert s == test["expected"][index]

    assert len(testmap["tests"]["mentions_with_indices"]) > 0
    for test in testmap["tests"]["mentions_with_indices"]:
        entities = extractor.extract_mentioned_screennames_with_indices(
            test["text"])
        assert len(test["expected"]) == len(entities)
        for index, entity in enumerate(entities):
            assert entity.value == test["expected"][index]["screen_name"]
            assert entity.start == test["expected"][index]["indices"][0]
            assert entity.end == test["expected"][index]["indices"][1]

    assert len(testmap["tests"]["mentions_or_lists_with_indices"]) > 0
    for test in testmap["tests"]["mentions_or_lists_with_indices"]:
        entities = extractor.extract_mentions_or_lists_with_indices(
            test["text"])
        assert len(test["expected"]) == len(entities)
        for index, entity in enumerate(entities):
            assert entity.value == test["expected"][index]["screen_name"]
            assert entity.list_slug == test["expected"][index]["list_slug"]
            assert entity.start == test["expected"][index]["indices"][0]
            assert entity.end == test["expected"][index]["indices"][1]

    assert len(testmap["tests"]["replies"]) > 0
    for test in testmap["tests"]["replies"]:
        entity = extractor.extract_reply_screenname(test["text"])
        if entity is not None:
            assert entity.value == test["expected"]
        else:
            assert test["expected"] is None

    assert len(testmap["tests"]["urls"]) > 0
    for test in testmap["tests"]["urls"]:
        urls = extractor.extract_urls(test["text"])
        for index, s in enumerate(urls):
            assert s == test["expected"][index]

    assert len(testmap["tests"]["urls_with_indices"]) > 0
    for test in testmap["tests"]["urls_with_indices"]:
        entities = extractor.extract_urls_with_indices(test["text"])
        assert len(test["expected"]) == len(entities)
        for index, entity in enumerate(entities):
            assert entity.value == test["expected"][index]["url"]
            assert entity.start == test["expected"][index]["indices"][0]
            assert entity.end == test["expected"][index]["indices"][1]

    assert len(testmap["tests"]["urls_with_directional_markers"]) > 0
    for test in testmap["tests"]["urls_with_directional_markers"]:
        entities = extractor.extract_urls_with_indices(test["text"])
        assert len(test["expected"]) == len(entities)
        for index, entity in enumerate(entities):
            assert entity.value == test["expected"][index]["url"]
            assert entity.start == test["expected"][index]["indices"][0]
            assert entity.end == test["expected"][index]["indices"][1]

    assert len(testmap["tests"]["tco_urls_with_params"]) > 0
    for test in testmap["tests"]["tco_urls_with_params"]:
        entities = extractor.extract_urls_with_indices(test["text"])
        assert len(test["expected"]) == len(entities)
        for index, entity in enumerate(entities):
            assert entity.value == test["expected"][index]

    assert len(testmap["tests"]["hashtags"]) > 0
    for test in testmap["tests"]["hashtags"]:
        hashtags = extractor.extract_hashtags(test["text"])
        assert len(test["expected"]) == len(hashtags)
        for index, hashtag in enumerate(hashtags):
            assert hashtag == test["expected"][index]

    assert len(testmap["tests"]["hashtags_from_astral"]) > 0
    for test in testmap["tests"]["hashtags_from_astral"]:
        hashtags = extractor.extract_hashtags(test["text"])
        assert len(test["expected"]) == len(hashtags)
        for index, hashtag in enumerate(hashtags):
            assert hashtag == test["expected"][index]

    assert len(testmap["tests"]["hashtags_with_indices"]) > 0
    for test in testmap["tests"]["hashtags_with_indices"]:
        entities = extractor.extract_hashtags_with_indices(test["text"])
        assert len(test["expected"]) == len(entities)
        for index, entity in enumerate(entities):
            assert entity.value == test["expected"][index]["hashtag"]
            assert entity.start == test["expected"][index]["indices"][0]
            assert entity.end == test["expected"][index]["indices"][1]

    assert len(testmap["tests"]["cashtags"]) > 0
    for test in testmap["tests"]["cashtags"]:
        cashtags = extractor.extract_cashtags(test["text"])
        assert len(test["expected"]) == len(cashtags)
        for index, cashtag in enumerate(cashtags):
            assert cashtag == test["expected"][index]

    assert len(testmap["tests"]["cashtags_with_indices"]) > 0
    for test in testmap["tests"]["cashtags_with_indices"]:
        entities = extractor.extract_cashtags_with_indices(test["text"])
        assert len(test["expected"]) == len(entities)
        for index, entity in enumerate(entities):
            assert entity.value == test["expected"][index]["cashtag"]
            assert entity.start == test["expected"][index]["indices"][0]
            assert entity.end == test["expected"][index]["indices"][1]
def test_accessors():
    extractor = twitter_text.Extractor()
    assert extractor.get_extract_url_without_protocol() == True
    extractor.set_extract_url_without_protocol(False)
    assert extractor.get_extract_url_without_protocol() == False
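
# A short sketch of what the flag toggled above controls, assuming the
# behavior its name suggests (extraction of protocol-less URLs); the exact
# outputs are not verified against the library:
import twitter_text

extractor = twitter_text.Extractor()
extractor.set_extract_url_without_protocol(True)
print(extractor.extract_urls("visit example.com today"))  # likely ["example.com"]
extractor.set_extract_url_without_protocol(False)
print(extractor.extract_urls("visit example.com today"))  # likely []
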
def extractor_tests(tests, passed, failed):
    print u'Running Extractor tests'

    correct_mentioned_screen_names = [u'foo', u'monkeybat', u'bar']
    correct_mentioned_screen_names_with_indices = [
        {'indices': (0, 4), 'screen_name': u'foo'},
        {'indices': (32, 42), 'screen_name': u'monkeybat'},
        {'indices': (47, 51), 'screen_name': u'bar'},
    ]
    correct_reply_screen_name = 'foo'
    correct_urls = [u'http://dryan.net/xxxxx?param=true#hash']
    correct_urls_with_indices = [
        {'url': u'http://dryan.net/xxxxx?param=true#hash', 'indices': (52, 90)},
    ]
    correct_hashtags = [u'comedy', u'url']
    correct_hashtags_with_indices = [
        {'indices': (91, 98), 'hashtag': u'comedy'},
        {'indices': (99, 103), 'hashtag': u'url'},
    ]

    extractor = twitter_text.Extractor(text)

    if tt.extractor.extract_mentioned_screen_names() == correct_mentioned_screen_names:
        print u'\033[92m Attached extract_mentioned_screen_names passed\033[0m'
        passed += 1
    else:
        print u'\033[91m Attached extract_mentioned_screen_names failed:\033[0m'
        print u' Expected: %s' % force_unicode(correct_mentioned_screen_names)
        print u' Returned: %s' % force_unicode(tt.extractor.extract_mentioned_screen_names())
        failed += 1
    tests += 1

    if extractor.extract_mentioned_screen_names() == correct_mentioned_screen_names:
        print u'\033[92m Stand alone extract_mentioned_screen_names passed\033[0m'
        passed += 1
    else:
        print u'\033[91m Stand alone extract_mentioned_screen_names failed:\033[0m'
        print u' Expected: %s' % force_unicode(correct_mentioned_screen_names)
        print u' Returned: %s' % force_unicode(extractor.extract_mentioned_screen_names())
        failed += 1
    tests += 1

    if tt.extractor.extract_mentioned_screen_names_with_indices() == correct_mentioned_screen_names_with_indices:
        print u'\033[92m Attached extract_mentioned_screen_names_with_indices passed\033[0m'
        passed += 1
    else:
        print u'\033[91m Attached extract_mentioned_screen_names_with_indices failed:\033[0m'
        print u' Expected: %s' % force_unicode(correct_mentioned_screen_names_with_indices)
        print u' Returned: %s' % force_unicode(tt.extractor.extract_mentioned_screen_names_with_indices())
        failed += 1
    tests += 1

    if extractor.extract_mentioned_screen_names_with_indices() == correct_mentioned_screen_names_with_indices:
        print u'\033[92m Stand alone extract_mentioned_screen_names_with_indices passed\033[0m'
        passed += 1
    else:
        print u'\033[91m Stand alone extract_mentioned_screen_names_with_indices failed:\033[0m'
        print u' Expected: %s' % force_unicode(correct_mentioned_screen_names_with_indices)
        print u' Returned: %s' % force_unicode(extractor.extract_mentioned_screen_names_with_indices())
        failed += 1
    tests += 1

    if tt.extractor.extract_reply_screen_name() == correct_reply_screen_name:
        print u'\033[92m Attached extract_reply_screen_name passed\033[0m'
        passed += 1
    else:
        print u'\033[91m Attached extract_reply_screen_name failed:\033[0m'
        print u' Expected: %s' % force_unicode(correct_reply_screen_name)
        print u' Returned: %s' % force_unicode(tt.extractor.extract_reply_screen_name())
        failed += 1
    tests += 1

    if extractor.extract_reply_screen_name() == correct_reply_screen_name:
        print u'\033[92m Stand alone extract_reply_screen_name passed\033[0m'
        passed += 1
    else:
        print u'\033[91m Stand alone extract_reply_screen_name failed:\033[0m'
        print u' Expected: %s' % force_unicode(correct_reply_screen_name)
        print u' Returned: %s' % force_unicode(extractor.extract_reply_screen_name())
        failed += 1
    tests += 1

    if tt.extractor.extract_urls() == correct_urls:
        print u'\033[92m Attached extract_urls passed\033[0m'
        passed += 1
    else:
        print u'\033[91m Attached extract_urls failed:\033[0m'
        print u' Expected: %s' % force_unicode(correct_urls)
        print u' Returned: %s' % force_unicode(tt.extractor.extract_urls())
        failed += 1
    tests += 1

    if extractor.extract_urls() == correct_urls:
        print u'\033[92m Stand alone extract_urls passed\033[0m'
        passed += 1
    else:
        print u'\033[91m Stand alone extract_urls failed:\033[0m'
        print u' Expected: %s' % force_unicode(correct_urls)
        print u' Returned: %s' % force_unicode(extractor.extract_urls())
        failed += 1
    tests += 1

    if tt.extractor.extract_urls_with_indices() == correct_urls_with_indices:
        print u'\033[92m Attached extract_urls_with_indices passed\033[0m'
        passed += 1
    else:
        print u'\033[91m Attached extract_urls_with_indices failed:\033[0m'
        print u' Expected: %s' % force_unicode(correct_urls_with_indices)
        print u' Returned: %s' % force_unicode(tt.extractor.extract_urls_with_indices())
        failed += 1
    tests += 1

    if extractor.extract_urls_with_indices() == correct_urls_with_indices:
        print u'\033[92m Stand alone extract_urls_with_indices passed\033[0m'
        passed += 1
    else:
        print u'\033[91m Stand alone extract_urls_with_indices failed:\033[0m'
        print u' Expected: %s' % force_unicode(correct_urls_with_indices)
        print u' Returned: %s' % force_unicode(extractor.extract_urls_with_indices())
        failed += 1
    tests += 1

    if tt.extractor.extract_hashtags() == correct_hashtags:
        print u'\033[92m Attached extract_hashtags passed\033[0m'
        passed += 1
    else:
        print u'\033[91m Attached extract_hashtags failed:\033[0m'
        print u' Expected: %s' % force_unicode(correct_hashtags)
        print u' Returned: %s' % force_unicode(tt.extractor.extract_hashtags())
        failed += 1
    tests += 1

    if extractor.extract_hashtags() == correct_hashtags:
        print u'\033[92m Stand alone extract_hashtags passed\033[0m'
        passed += 1
    else:
        print u'\033[91m Stand alone extract_hashtags failed:\033[0m'
        print u' Expected: %s' % force_unicode(correct_hashtags)
        print u' Returned: %s' % force_unicode(extractor.extract_hashtags())
        failed += 1
    tests += 1

    if tt.extractor.extract_hashtags_with_indices() == correct_hashtags_with_indices:
        print u'\033[92m Attached extract_hashtags_with_indices passed\033[0m'
        passed += 1
    else:
        print u'\033[91m Attached extract_hashtags_with_indices failed:\033[0m'
        print u' Expected: %s' % force_unicode(correct_hashtags_with_indices)
        print u' Returned: %s' % force_unicode(tt.extractor.extract_hashtags_with_indices())
        failed += 1
    tests += 1

    if extractor.extract_hashtags_with_indices() == correct_hashtags_with_indices:
        print u'\033[92m Stand alone extract_hashtags_with_indices passed\033[0m'
        passed += 1
    else:
        print u'\033[91m Stand alone extract_hashtags_with_indices failed:\033[0m'
        print u' Expected: %s' % force_unicode(correct_hashtags_with_indices)
        print u' Returned: %s' % force_unicode(extractor.extract_hashtags_with_indices())
        failed += 1
    tests += 1

    return tests, passed, failed
# results is a list of dictionary items obtained from twitter
# these functions assume that the text of each tweet
# is written to a separate line in the output text file

#item_count = 0  # initialize count of objects dumped to file
#with open(partial_text_filename_u, 'w') as outfile:
#    for dict_item in results_u:
#        outfile.write(json.dumps(dict_item['text']))
#        item_count = item_count + 1
#        if item_count < len(results_u):
#            outfile.write(line_termination)  # new line between text items


# In[11]:

# Using extract
# Citation: Mining the Social Web, 2nd Edition 9.2 Tweet Entities from Arbitrary Text

users_extract = twitter_text.Extractor(results_u)
ue = ("Screen Names:", users_extract.extract_mentioned_screen_names())
ue_list = list(ue)
type(ue_list)

competition_extract = twitter_text.Extractor(results_twt)
ce = ("Screen Names:", competition_extract.extract_mentioned_screen_names())
ce_list = list(ce)
type(ce_list)

# FAIL: Looking to identify common values between lists
# Citation: https://stackoverflow.com/questions/28061223/python-how-to-find-common-values-in-three-lists
# set(ce_list).intersection(ue_list)

print(ue_list)
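
# The commented-out intersection above fails because ue_list and ce_list are
# two-element lists of the form ["Screen Names:", [...names...]], not flat
# name lists. A minimal sketch of the intended comparison, assuming results_u
# and results_twt are lists of tweet dicts as described at the top of this
# cell (joining the 'text' fields first, since Extractor expects a string):
u_text = " ".join(t['text'] for t in results_u)
twt_text = " ".join(t['text'] for t in results_twt)
ue_names = twitter_text.Extractor(u_text).extract_mentioned_screen_names()
ce_names = twitter_text.Extractor(twt_text).extract_mentioned_screen_names()
common_names = set(ce_names).intersection(ue_names)
print(common_names)
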
    marks = ['RT @', '@', '"', '#', '\n', '\t', ' ']
    for k in marks:
        tweet = tweet.replace(k, '')
    return tweet


# In[32]:

import twitter_text

tweet = '''RT @AnonKitsu: ALERT!!!!!!!!!!COPS ARE KETTLING PROTESTERS IN PARK W HELICOPTERS AND PADDYWAGONS!!!! #OCCUPYWALLSTREET #OWS #OCCUPYNY PLEASE @chengjun @mili http://computational-communication.com http://ccc.nju.edu.cn RT !!HELP!!!!'''

ex = twitter_text.Extractor(tweet)
at_names = ex.extract_mentioned_screen_names()
urls = ex.extract_urls()
hashtags = ex.extract_hashtags()
rt_user = extract_rt_user(tweet)
#tweet_text = extract_tweet_text(tweet, at_names, urls)

print(at_names, urls, hashtags, rt_user, '-------->')  #, tweet_text)


# In[33]:

import csv

lines = csv.reader(chunk, delimiter=',', quotechar='"')
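
# extract_rt_user is called above but not defined in this excerpt. A minimal
# sketch of one plausible implementation, matching the "RT @name" / "via
# @name" prefix with a regex (an assumption, not the original author's code):
import re

def extract_rt_user(tweet):
    # Capture the screen name that follows an RT or via marker
    rt_patterns = re.compile(r"(RT|via)((?:\b\W*@\w+)+)", re.IGNORECASE)
    rt_user_name = rt_patterns.findall(tweet)
    if rt_user_name:
        return rt_user_name[0][1].strip(' @')
    return None
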