def process_tweet(tweet_in):
    """Annotate a decoded tweet with entity summaries and expanded URLs.

    The tweet is modified in place and the very same object is returned.
    When it has an "entities" section, these keys are added:

    * ``counts`` -- how many URLs, hashtags and user mentions it carries.
    * ``hashtags`` -- sorted, lower-cased hashtag texts.
    * ``mentions`` -- sorted, lower-cased screen names of mentioned users.

    Every URL entity is also merged (``dict.update``) with whatever
    ``Expand_Url.check_cache`` returns for its URL.  Keyword tracking and
    date conversion are not performed by this variant.

    Args:
        tweet_in: The tweet as a dict-like object (already decoded).

    Returns:
        ``tweet_in`` itself; a tweet without "entities" comes back untouched.
    """
    expander = Expand_Url(db_name="url_test")
    tweet = tweet_in
    if "entities" not in tweet:
        return tweet

    entities = tweet["entities"]

    # Insert counts
    tweet["counts"] = {
        "urls": len(entities["urls"]),
        "hashtags": len(entities["hashtags"]),
        "user_mentions": len(entities["user_mentions"]),
    }

    # Insert sorted lists of hashtags and mentions
    tweet["hashtags"] = sorted(
        hashtag["text"].lower() for hashtag in entities["hashtags"]
    )
    tweet["mentions"] = sorted(
        mention["screen_name"].lower() for mention in entities["user_mentions"]
    )

    # URL expansion
    for url_entity in entities["urls"]:
        ourl = url_entity.get("expanded_url")
        # If the expanded_url field is missing or None, try the 'url' field.
        if ourl is None:
            ourl = url_entity.get("url")
        if not ourl:
            # Nothing usable to expand for this entity.
            continue
        try:
            url_entity.update(expander.check_cache(ourl))
        except KeyError:
            # These errors seem to emanate from unicode problems; this should
            # be checked on occasion to ensure it really is a unicode error.
            url_entity["expansion_error"] = "Possible Unicode Error"

    return tweet
def process_tweet(tweet_in):
    """Decode a JSON tweet, annotate it and return it re-encoded as JSON.

    For a tweet with an "entities" section the following keys are added
    before it is serialized again:

    * ``counts`` -- how many URLs, hashtags and user mentions it carries.
    * ``hashtags`` / ``mentions`` -- sorted, lower-cased hashtag texts and
      mentioned screen names.
    * ``track_kw`` -- which tracked keywords occur among the hashtags, the
      mentions and the words of the tweet text.

    Every URL entity is also merged (``dict.update``) with whatever
    ``Expand_Url.check_cache`` returns for its URL.  The expander is built
    with ``config_info.cache_db`` as its database name.

    Args:
        tweet_in: JSON text of a tweet, or of an "info" message.

    Returns:
        * The annotated tweet as a JSON string, or
        * ``None`` for an "info" message (its activity count is printed) and
          for a tweet without "entities", or
        * the string ``"<error>, <input>"`` when a ``ValueError`` is raised
          while processing (typically because the input is not valid JSON).
    """
    track_set = set([
        'boston', 'marathon', 'bomb', 'blast',
        'explosion', 'watertown', 'mit', 'mitshooting',
    ])
    punct = re.escape('!"$%&\'()*+,-./:;<=>?@[\\]^`{|}~')
    expander = Expand_Url(db_name=config_info.cache_db)

    try:
        tweet = simplejson.loads(tweet_in)

        if "info" in tweet:
            print(" [x] processed %s tweets" % tweet['info']['activity_count'])
            return None

        # NOTE(review): everything below is assumed to belong to the
        # "entities" branch, because keyword tracking reads tweet['hashtags']
        # and tweet['mentions'], which are only created here -- confirm.
        if "entities" not in tweet:
            return None

        entities = tweet['entities']

        # Insert counts
        tweet['counts'] = {
            'urls': len(entities['urls']),
            'hashtags': len(entities['hashtags']),
            'user_mentions': len(entities['user_mentions']),
        }

        # Insert sorted lists of hashtags and mentions
        tweet['hashtags'] = sorted(
            hashtag['text'].lower() for hashtag in entities['hashtags']
        )
        tweet['mentions'] = sorted(
            mention['screen_name'].lower()
            for mention in entities['user_mentions']
        )

        # URL expansion
        for url_entity in entities['urls']:
            ourl = url_entity.get('expanded_url')
            # If the expanded_url field is missing or None, try 'url' instead.
            if ourl is None:
                ourl = url_entity.get('url')
            if not ourl:
                # Never hand an empty URL to the expander.
                continue
            try:
                url_entity.update(expander.check_cache(ourl))
            except KeyError:
                # These errors seem to emanate from unicode problems; this
                # should be checked on occasion to ensure it really is a
                # unicode error.
                url_entity['error'] = "Possible Unicode Error"

        # Track rule matches: punctuation becomes whitespace so that words
        # glued to it still match the tracked keywords.
        words = re.sub('[%s]' % punct, ' ', tweet['text']).lower().split()
        tweet['track_kw'] = {
            'hashtags': list(set(tweet['hashtags']).intersection(track_set)),
            'mentions': list(set(tweet['mentions']).intersection(track_set)),
            'text': list(set(words).intersection(track_set)),
        }

        result = simplejson.dumps(tweet)
        print(" [x] processed tweet ID %s" % tweet['id'])
        return result
    except ValueError as e:
        print(' [x] %s, %s' % (e, tweet_in))
        return '%s, %s' % (e, tweet_in)
from expand_url import Expand_Url

# Manual check of the URL expansion cache: look up a handful of sample URLs
# and print whatever the cache returns for each of them.
URLs = [
    'http://www.ebay.com',
    'http://somelab.net/foo',
    'http://uw.edu/foo',
    'http://seattle.somelab.net/test.txt',
    'http://somelab.net',
]

test = Expand_Url(db_name='url_test')
for x in URLs:
    # A single parenthesized argument prints identically on Python 2 and 3.
    print(test.check_cache(x))
tweet['hashtags'].sort() tweet['mentions'].sort() # begin url expansion for index in range(len(tweet['entities']['urls'])): ourl = tweet['entities']['urls'][index]['expanded_url'] # if the expanded_url field is empty, try expanding the 'url' field instead if ourl is None: ourl = tweet['entities']['urls'][index]['url'] print ourl if ourl: print ourl try: expanded = expander.check_cache(ourl) tweet['entities']['urls'][index].update(expanded) # Catch any exceptions related to URL or expanding errors # and make sure we record why #except (URLError, APIError, UnicodeWarning, UnicodeError) as e: # tweet['entities']['urls'][index]['expansion_error'] = e.msg; # this catches errors which seem to emanate from unicode errors # this should be checked on occasion to ensure it really is a unicode error except KeyError as e: tweet['entities']['urls'][index]['expansion_error'] = "Possible Unicode Error"; # end url expansion # Track rule matches tweet['track_kw'] = {} tweet['track_kw']['hashtags'] = list(set(tweet['hashtags']).intersection(track_set))
def process_tweet(tweet_in):
    """Decode a JSON tweet, annotate it and return it re-encoded as JSON.

    For a tweet with an "entities" section the following keys are added
    before it is serialized again:

    * ``counts`` -- how many URLs, hashtags and user mentions it carries.
    * ``hashtags`` / ``mentions`` -- sorted, lower-cased hashtag texts and
      mentioned screen names.
    * ``track_kw`` -- which tracked keywords occur among the hashtags, the
      mentions and the words of the tweet text.

    Every URL entity is also merged (``dict.update``) with whatever
    ``Expand_Url.check_cache`` returns for its URL.

    Args:
        tweet_in: JSON text of a tweet, or of an "info" message.

    Returns:
        * The annotated tweet as a JSON string, or
        * ``None`` for an "info" message (its activity count is printed) and
          for a tweet without "entities", or
        * the string ``"<error>, <input>"`` when a ``ValueError`` is raised
          while processing (typically because the input is not valid JSON).
    """
    track_set = set([
        "boston", "marathon", "bomb", "blast",
        "explosion", "watertown", "mit", "mitshooting",
    ])
    punct = re.escape("!\"$%&'()*+,-./:;<=>?@[\\]^`{|}~")
    expander = Expand_Url(db_name="url_cache")

    try:
        tweet = simplejson.loads(tweet_in)

        if "info" in tweet:
            print(" [x] processed %s tweets" % tweet["info"]["activity_count"])
            return None

        # NOTE(review): everything below is assumed to belong to the
        # "entities" branch, because keyword tracking reads tweet["hashtags"]
        # and tweet["mentions"], which are only created here -- confirm.
        if "entities" not in tweet:
            return None

        entities = tweet["entities"]

        # Insert counts
        tweet["counts"] = {
            "urls": len(entities["urls"]),
            "hashtags": len(entities["hashtags"]),
            "user_mentions": len(entities["user_mentions"]),
        }

        # Insert sorted lists of hashtags and mentions
        tweet["hashtags"] = sorted(
            hashtag["text"].lower() for hashtag in entities["hashtags"]
        )
        tweet["mentions"] = sorted(
            mention["screen_name"].lower()
            for mention in entities["user_mentions"]
        )

        # URL expansion
        for url_entity in entities["urls"]:
            ourl = url_entity.get("expanded_url")
            # If the expanded_url field is missing or None, try 'url' instead.
            if ourl is None:
                ourl = url_entity.get("url")
            if not ourl:
                # Nothing usable to expand for this entity.
                continue
            try:
                url_entity.update(expander.check_cache(ourl))
            except KeyError:
                # These errors seem to emanate from unicode problems; this
                # should be checked on occasion to ensure it really is a
                # unicode error.
                url_entity["expansion_error"] = "Possible Unicode Error"

        # Track rule matches: punctuation becomes whitespace so that words
        # glued to it still match the tracked keywords.
        words = re.sub("[%s]" % punct, " ", tweet["text"]).lower().split()
        tweet["track_kw"] = {
            "hashtags": list(set(tweet["hashtags"]).intersection(track_set)),
            "mentions": list(set(tweet["mentions"]).intersection(track_set)),
            "text": list(set(words).intersection(track_set)),
        }

        return simplejson.dumps(tweet)
    except ValueError as e:
        print(" [x] %s, %s" % (e, tweet_in))
        return "%s, %s" % (e, tweet_in)