Example #1
def process_tweet(tweet_in):
    punct = re.escape("!\"$%&'()*+,-./:;<=>?@[\\]^`{|}~")
    expander = Expand_Url(db_name="url_test")
    tweet = tweet_in
    if "entities" in tweet:

        # Insert Counts
        tweet["counts"] = {
            "urls": len(tweet["entities"]["urls"]),
            "hashtags": len(tweet["entities"]["hashtags"]),
            "user_mentions": len(tweet["entities"]["user_mentions"]),
        }

        tweet["hashtags"] = []
        tweet["mentions"] = []

        # Insert list of hashtags and mentions
        for index in range(len(tweet["entities"]["hashtags"])):
            tweet["hashtags"].append(tweet["entities"]["hashtags"][index]["text"].lower())
        for index in range(len(tweet["entities"]["user_mentions"])):
            tweet["mentions"].append(tweet["entities"]["user_mentions"][index]["screen_name"].lower())

        tweet["hashtags"].sort()
        tweet["mentions"].sort()

        # begin url expansion
        for index in range(len(tweet["entities"]["urls"])):
            ourl = tweet["entities"]["urls"][index]["expanded_url"]

            # if the expanded_url field is empty, try expanding the 'url' field instead
            if ourl is None:
                ourl = tweet["entities"]["urls"][index]["url"]

            if ourl:
                try:
                    expanded = expander.check_cache(ourl)
                    tweet["entities"]["urls"][index].update(expanded)
                # Catch any exceptions related to URL or expanding errors
                # and make sure we record why
                # except (URLError, APIError, UnicodeWarning, UnicodeError) as e:
                #     tweet['entities']['urls'][index]['expansion_error'] = e.msg
                # this catches errors which seem to emanate from unicode errors
                # this should be checked on occasion to ensure it really is a unicode error
                except KeyError as e:
                    tweet["entities"]["urls"][index]["expansion_error"] = "Possible Unicode Error"
        # end url expansion

        # Track rule matches
        # tweet['track_kw'] = {}
        # tweet['track_kw']['hashtags'] = list(set(tweet['hashtags']).intersection(track_set))
        # tweet['track_kw']['mentions'] = list(set(tweet['mentions']).intersection(track_set))
        tweet_text = re.sub("[%s]" % punct, " ", tweet["text"])
        tweet_text = tweet_text.lower().split()
        # tweet['track_kw']['text'] = list(set(tweet_text).intersection(track_set))

        # Convert dates
        # tweet['created_ts'] = to_datetime(tweet['created_at'])
        # tweet['user']['created_ts'] = to_datetime(tweet['user']['created_at'])

        # Print tweet as JSON to stdout
        # print tweet['text'],tweet['entities']['urls']
    # result = simplejson.dumps(tweet)
    return tweet
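
Unlike the variants below, this version takes an already-parsed tweet dict rather than a raw JSON string. A minimal driving sketch, assuming re and Expand_Url are imported as in Example #4 and the 'url_test' cache database is reachable; the sample tweet below is hypothetical:

sample = {
    "text": "Testing #Foo with @Bar http://t.co/abc",
    "entities": {
        "urls": [{"url": "http://t.co/abc", "expanded_url": "http://somelab.net/foo"}],
        "hashtags": [{"text": "Foo"}],
        "user_mentions": [{"screen_name": "Bar"}],
    },
}

processed = process_tweet(sample)
print(processed["counts"])    # {'urls': 1, 'hashtags': 1, 'user_mentions': 1}
print(processed["hashtags"])  # ['foo']
print(processed["mentions"])  # ['bar']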
Example #2
def process_tweet(tweet_in):
    track_list = ['boston','marathon','bomb','blast','explosion','watertown','mit','mitshooting']
    # Turn it into a set
    track_set = set(track_list)
    punct = re.escape('!"$%&\'()*+,-./:;<=>?@[\\]^`{|}~')
    expander = Expand_Url(db_name=config_info.cache_db)
    try:
        tweet = simplejson.loads(tweet_in)
        if "info" not in tweet:
            #print " [x] accepted tweet ID %s" % tweet['id']
            if "entities" in tweet:

                # Insert Counts
                tweet['counts'] = {
                    'urls': len(tweet['entities']['urls']),
                    'hashtags': len(tweet['entities']['hashtags']),
                    'user_mentions': len(tweet['entities']['user_mentions'])
                }

                tweet['hashtags'] = []
                tweet['mentions'] = []

                # Insert list of hashtags and mentions
                for index in range(len(tweet['entities']['hashtags'])):
                    tweet['hashtags'].append(tweet['entities']['hashtags'][index]['text'].lower())
                for index in range(len(tweet['entities']['user_mentions'])):
                    tweet['mentions'].append(tweet['entities']['user_mentions'][index]['screen_name'].lower())

                tweet['hashtags'].sort()
                tweet['mentions'].sort()

                # begin url expansion
                for index in range(len(tweet['entities']['urls'])):
                    ourl = tweet['entities']['urls'][index]['expanded_url']
                    # if the expanded_url field is empty, try expanding the 'url' field instead
                    if ourl is None:
                        ourl = tweet['entities']['urls'][index]['url']
                    try:
                        expanded = expander.check_cache(ourl)
                        tweet['entities']['urls'][index].update(expanded)
                    # Catch any exceptions related to URL or expanding errors
                    # and make sure we record why
                    #except (URLError, APIError, UnicodeWarning, UnicodeError) as e:
                    #    tweet['entities']['urls'][index]['expansion_error'] = e.msg
                    # this catches errors which seem to emanate from unicode errors
                    # this should be checked on occasion to ensure it really is a unicode error
                    except KeyError as e:
                        tweet['entities']['urls'][index]['error'] = "Possible Unicode Error"
                # end url expansion

                # Track rule matches
                tweet['track_kw'] = {}
                tweet['track_kw']['hashtags'] = list(set(tweet['hashtags']).intersection(track_set))
                tweet['track_kw']['mentions'] = list(set(tweet['mentions']).intersection(track_set))
                tweet_text = re.sub('[%s]' % punct, ' ', tweet['text'])
                tweet_text = tweet_text.lower().split()
                tweet['track_kw']['text'] = list(set(tweet_text).intersection(track_set))

                # Convert dates

                # Print tweet as JSON to stdout
                #print tweet['text'],tweet['entities']['urls']
            result = simplejson.dumps(tweet)
            print " [x] processed tweet ID %s" % tweet['id']
            return result
        else:
            print " [x] processed %s tweets" % tweet['info']['activity_count']

    except ValueError as e:
        print ' [x] %s, %s' % (e,tweet_in)
        return '%s, %s' % (e,tweet_in)
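
Besides tweets, this variant also handles the bookkeeping messages interleaved with the stream and reports JSON parse failures. A quick sketch of those two paths with hypothetical payloads (assumes simplejson, re, and Expand_Url are imported, and that a config_info object exposing cache_db exists, as the function requires):

# an 'info' message: prints the activity count and returns None
process_tweet(simplejson.dumps({'info': {'activity_count': 42}}))

# malformed input: the ValueError handler logs it and returns the error string
err = process_tweet('{not valid json')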
Example #3
from expand_url import Expand_Url

URLs = ['http://www.ebay.com', 'http://somelab.net/foo', 'http://uw.edu/foo', 'http://seattle.somelab.net/test.txt', 'http://somelab.net']

test = Expand_Url(db_name='url_test')

for x in URLs:
    print test.check_cache(x)
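
These examples only exercise Expand_Url's interface: check_cache(url) returns a dict of fields that gets merged into the tweet's url entity. Below is a minimal in-memory stand-in under that assumption, handy for testing without the real cache database; the internals and the 'long_url' field name are guesses, not the actual implementation:

import urllib2

class Expand_Url(object):
    def __init__(self, db_name=None):
        self.db_name = db_name  # the real class presumably opens a cache database here
        self.cache = {}         # in-memory stand-in: short url -> result dict

    def check_cache(self, url):
        # on a cache miss, follow redirects to the final url and memoize it
        if url not in self.cache:
            final = urllib2.urlopen(url).geturl()
            self.cache[url] = {'long_url': final}  # hypothetical field name
        return self.cache[url]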
Example #4
import sys
sys.path.append('.')

import simplejson
import re
import time
from datetime import datetime, timedelta
from email.utils import parsedate_tz
#from some_url_expander import URLError
#from some_url_expander import APIError
from expand_url import Expand_Url
from urlparse import urlsplit

# the object that handles url expansion on our server
expander = Expand_Url(db_name='url_test')

# Punctuation to remove from tweet text for track keyword matching
punct = re.escape('!"$%&\'()*+,-./:;<=>?@[\\]^`{|}~')

# List of words we are tracking
track_list = ["15o","15oct","99percent","acampadamataro","acampvalladolid","acampvalladolid","frankietease","ioccupy","ioccupyoccupyashland","k8_revolution","lakajo97","occopywmpt","occuponsmontrea","occupy","occupyaarhus","occupyabilene","occupyadelaide","occupyafrica","occupyafrica1","occupyakron","occupyalbany","occupyalbanyny1","occupyallentown","occupyamsterdam","occupyanchorage","occupyannarbor","occupyappleton","occupyarcata","occupyarizona","occupyarkansas","occupyashland","occupyashlandky","occupyaspen","occupyastoria","occupyathens","occupyathensga","occupyatl","occupyatlanta","occupyatlanticcity","occupyatlcity","occupyauburn","occupyaugusta","occupyaurora","occupyaustin","occupyb0ulder","occupybaltimore","occupybhgrove","occupybkny","occupyboise","occupyboulder","occupyboulderco","occupybrisbane","occupybrussels","occupybucharest","occupybuffalo","occupycarsoncty","occupycc","occupycha","occupychi","occupychicago","occupychucktown","occupycincinnati","occupycincy","occupyclarksvil","occupycleveland","occupycolumbia","occupycosprings","occupycu","occupycville","occupydallas","occupydc","occupydelaware","occupydenhaag","occupydenmark","occupyearth","occupyeugene","occupyflorida","occupyfm","occupyfortmyers","occupyftcollins","occupygtown","occupyhardford","occupyhartford","occupyhouston","occupyhsv","occupyhumboldt","occupyindy","occupyisu","occupyitaly","occupyjax","occupykeene","occupykelowna","occupykingston","occupyla","occupylansing","occupylasvegas","occupylausd","occupylondon","occupylsx","occupymadison99","occupymartnsbrg","occupymemphis","occupymia","occupymilwaukee","occupymn","occupymontrea","occupynashville","occupynewportor","occupynj","occupyns","occupyobise","occupyokc","occupyomaha","occupyorlando","occupyorlandofl","occupyottawa","occupypei","occupyphoenix","occupyportland","occupyprov","occupyquebec","occupyraleigh","occupyredlands","occupyrichmond","occupyroanokeva","occupyrockford","occupysacto","occupysalem","occupysananto","occupysanjose","occupysantacruz","occupysarasota","occupysarasotaoccupysanjose","occupysaskatoon","occupysb","occupysd","occupyseattle","occupysenhaag","occupyslc","occupysr","occupystaugust","occupystl","occupytampa","occupythemedia","occupytoronto","occupyueg","occupyukiah","occupyvermont","occupyvictoria","occupywallst","occupywallstnyc","occupywallstreet","occupywinnipeg","occupywmpt","occupywv","occupyyakima","occupyyeg","occupyyork","occupy_albanyny","occupy_okc","occupy_ottawa","occypyftcollins","ows","owslosangeles","owsspacecoast","perversmas","quimbanda","storydoula","tokumtorgin","nov5","5nov","bofa","cabincr3w","nov2","2nov","generalstrike","oct29","29oct","nov17","17nov","occupypics","usdor","occupydenver","needsoftheoccupiers","wearethe99","occupyoakland","occupyboston","occupy_boston","oo","53percent","1percent","banktransferday","moveyourmoney","louderthanwords","rebuilddream","acorn","n17","17n","d21","12d","occupyarrests","n30","30n","nov30","strike","occupytheport"]
# Turn it into a set
track_set = set(track_list)

# Parse Twitter's created_at datestring and turn it into a datetime object (normalized to UTC)
def to_datetime(datestring):
    time_tuple = parsedate_tz(datestring.strip())
    dt = datetime(*time_tuple[:6])
    # subtract the utc offset reported by parsedate_tz (this is why timedelta is imported)
    return dt - timedelta(seconds=time_tuple[-1])
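
For instance, applied to a Twitter-style created_at string (a hypothetical value):

print(to_datetime('Wed Oct 10 20:19:24 +0000 2018'))
# -> 2018-10-10 20:19:24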
Example #5
def process_tweet(tweet_in):
    track_list = ["boston", "marathon", "bomb", "blast", "explosion", "watertown", "mit", "mitshooting"]
    # Turn it into a set
    track_set = set(track_list)
    punct = re.escape("!\"$%&'()*+,-./:;<=>?@[\\]^`{|}~")
    expander = Expand_Url(db_name="url_cache")
    try:
        tweet = simplejson.loads(tweet_in)
        if "info" not in tweet:
            # print " [x] accepted tweet ID %s" % tweet['id']
            if "entities" in tweet:

                # Insert Counts
                tweet["counts"] = {
                    "urls": len(tweet["entities"]["urls"]),
                    "hashtags": len(tweet["entities"]["hashtags"]),
                    "user_mentions": len(tweet["entities"]["user_mentions"]),
                }

                tweet["hashtags"] = []
                tweet["mentions"] = []

                # Insert list of hashtags and mentions
                for index in range(len(tweet["entities"]["hashtags"])):
                    tweet["hashtags"].append(tweet["entities"]["hashtags"][index]["text"].lower())
                for index in range(len(tweet["entities"]["user_mentions"])):
                    tweet["mentions"].append(tweet["entities"]["user_mentions"][index]["screen_name"].lower())

                tweet["hashtags"].sort()
                tweet["mentions"].sort()

                # begin url expansion
                for index in range(len(tweet["entities"]["urls"])):
                    ourl = tweet["entities"]["urls"][index]["expanded_url"]

                    # if the expanded_url field is empty, try expanding the 'url' field instead
                    if ourl is None:
                        ourl = tweet["entities"]["urls"][index]["url"]

                    if ourl:
                        try:
                            expanded = expander.check_cache(ourl)
                            tweet["entities"]["urls"][index].update(expanded)
                        # Catch any exceptions related to URL or expanding errors
                        # and make sure we record why
                        # except (URLError, APIError, UnicodeWarning, UnicodeError) as e:
                        #     tweet['entities']['urls'][index]['expansion_error'] = e.msg
                        # this catches errors which seem to emanate from unicode errors
                        # this should be checked on occasion to ensure it really is a unicode error
                        except KeyError as e:
                            tweet["entities"]["urls"][index]["expansion_error"] = "Possible Unicode Error"
                # end url expansion

                # Track rule matches
                tweet["track_kw"] = {}
                tweet["track_kw"]["hashtags"] = list(set(tweet["hashtags"]).intersection(track_set))
                tweet["track_kw"]["mentions"] = list(set(tweet["mentions"]).intersection(track_set))
                tweet_text = re.sub("[%s]" % punct, " ", tweet["text"])
                tweet_text = tweet_text.lower().split()
                tweet["track_kw"]["text"] = list(set(tweet_text).intersection(track_set))

                # Convert dates

                # Print tweet as JSON to stdout
                # print tweet['text'],tweet['entities']['urls']
            result = simplejson.dumps(tweet)
            # print " [x] processed tweet ID %s" % tweet['id']
            return result
        else:
            print " [x] processed %s tweets" % tweet["info"]["activity_count"]

    except ValueError as e:
        print " [x] %s, %s" % (e, tweet_in)
        return "%s, %s" % (e, tweet_in)