Пример #1
0
 def getLatLngFromTweet(tweet):
     """Return the tweet's coordinates as a ``(latitude, longitude)`` tuple.

     The tweet's ``Constants.COORDINATES`` entry is looked up, and the same
     key is looked up again inside that entry to get the coordinate pair.
     The pair is read as ``[longitude, latitude]`` and returned swapped.

     Returns ``None`` when the tweet has no coordinates entry or when the
     pair is missing, too short, or not convertible to ``float``.
     """
     coordinates = lookup(tweet, Constants.COORDINATES)
     if not coordinates:
         return None
     coordinateList = lookup(coordinates, Constants.COORDINATES)
     try:
         longitude = float(coordinateList[0])
         latitude = float(coordinateList[1])
     except (TypeError, ValueError, LookupError, OverflowError):
         # Only data problems are treated as "no coordinates"; a bare
         # ``except`` would also hide KeyboardInterrupt / SystemExit.
         return None
     return (latitude, longitude)
Пример #2
0
 def getLatLngFromTweet(tweet):
     """Return the tweet's coordinates as a ``(latitude, longitude)`` tuple.

     The tweet's ``Constants.COORDINATES`` entry is looked up, and the same
     key is looked up again inside that entry to get the coordinate pair.
     The pair is read as ``[longitude, latitude]`` and returned swapped.

     Returns ``None`` when the tweet has no coordinates entry or when the
     pair is missing, too short, or not convertible to ``float``.
     """
     coordinates = lookup(tweet, Constants.COORDINATES)
     if not coordinates:
         return None
     coordinateList = lookup(coordinates, Constants.COORDINATES)
     try:
         longitude = float(coordinateList[0])
         latitude = float(coordinateList[1])
     except (TypeError, ValueError, LookupError, OverflowError):
         # Only data problems are treated as "no coordinates"; a bare
         # ``except`` would also hide KeyboardInterrupt / SystemExit.
         return None
     return (latitude, longitude)
    def resolveLocationUsingPlace(self, tweet):
        """Resolve a location from the place attached to ``tweet``.

        The place comes from ``self.getPlaceFromTweet`` and is dispatched on
        its ``place_type``:

        * ``city``: the place ``name`` is the city.  For the United States
          the state is also extracted from ``full_name`` using
          ``self.statePattern`` and ``self.stateAbbreviationToFullName``.
        * ``admin``: the place ``name`` is used as the state.
        * ``country``: only the country is used.
        * ``neighborhood`` / ``poi``: the city is the second comma-separated
          component of ``full_name``, if there is one.

        Returns:
            The result of ``self.getLocationForPlace(...)``, or ``None`` when
            the tweet has no place, the place lacks a country or a required
            ``full_name``, or the place type is missing or unrecognised (a
            warning is logged in the last three cases).
        """
        place = self.getPlaceFromTweet(tweet)
        if place is None:
            return None

        url = lookup(place, 'url')
        place_id = lookup(place, 'id')
        country = lookup(place, 'country')
        if country is None:
            logger.warn("Found place with no country: {}".format(place))
            return None

        # Normalise the country name.  The key must be the lower-cased
        # string; the bound method ``country.lower`` never matches a key.
        country_key = country.lower()
        if country_key in self.placeNameToNormalizedPlaceName:
            country = self.placeNameToNormalizedPlaceName[country_key]

        placeType = lookup(place, 'place_type')
        # A missing place_type falls through to the "unknown" branch below
        # instead of raising AttributeError on ``None.lower()``.
        kind = (placeType or '').lower()

        if kind == 'city':
            city = lookup(place, 'name')
            if country.lower() != 'united states':
                return self.getLocationForPlace(country, None, None, city,
                                                url, place_id)

            fullName = lookup(place, 'full_name')
            if not fullName:
                logger.warn(
                    "Found place with no full_name: {}".format(place))
                return None

            state = None
            match = re.search(self.statePattern, fullName)
            if match:
                # assumes statePattern matches at least two whitespace-
                # separated tokens, the second being the state -- TODO confirm
                matchedString = match.group().lower().split()[1].strip()
                if matchedString in self.stateAbbreviationToFullName:
                    state = self.stateAbbreviationToFullName[matchedString]
                else:
                    # Otherwise accept the token if it already is one of
                    # the known full state names.
                    state = next(
                        (st for st in
                         self.stateAbbreviationToFullName.values()
                         if st == matchedString),
                        None)
            return self.getLocationForPlace(country, state, None, city,
                                            url, place_id)

        if kind == 'admin':
            state = lookup(place, 'name')
            return self.getLocationForPlace(country, state, None, None, url,
                                            place_id)

        if kind == 'country':
            return self.getLocationForPlace(country, None, None, None, url,
                                            place_id)

        if kind in ('neighborhood', 'poi'):
            fullName = lookup(place, 'full_name')
            if not fullName:
                logger.warn("Found place with no full_name: {}".format(place))
                return None
            splitFullName = fullName.split(',')
            city = None
            if len(splitFullName) > 1:
                # Drop the space that follows the comma in "name, city".
                city = splitFullName[1].strip()
            return self.getLocationForPlace(country, None, None, city, url,
                                            place_id)

        logger.warn("Unknown place type: {}".format(placeType))
        return None
Пример #4
0
 def getLocationFromTweet(tweet):
     """Return the location stored on the tweet's user, or ``None``.

     ``None`` is returned when ``Utils.getUserFromTweet`` yields nothing for
     the tweet, or when the user's ``Constants.TWEET_USER_LOCATION`` entry
     is missing or empty.
     """
     author = Utils.getUserFromTweet(tweet)
     if not author:
         return None
     place = lookup(author, Constants.TWEET_USER_LOCATION)
     usable = place and len(place) > 0
     return place if usable else None
Пример #5
0
 def getLocationFromTweet(tweet):
     """Return the ``'location'`` value of the tweet's user, or ``None``.

     ``None`` is returned when ``getUserFromTweet`` yields nothing for the
     tweet, or when the user's location is missing or empty.
     """
     author = getUserFromTweet(tweet)
     if not author:
         return None
     # Any falsy location (missing, empty string) collapses to None.
     return lookup(author, 'location') or None
def create_features(vocab, docs):
    """Build feature vectors and binary labels for a collection of documents.

    Args:
        vocab: Vocabulary passed through to ``features_to_vector``.
        docs: Mapping of document id to document record.  Each record's
            ``'BODY'`` is fetched with ``lookup`` and its ``'label'`` is
            read with item access.

    Returns:
        A ``(Y, X, docs_lst)`` tuple (labels come first):
        ``Y`` holds 1 or 0 per kept document depending on the truthiness
        of its ``'label'``, ``X`` holds the matching feature vectors, and
        ``docs_lst`` holds ``[doc_id, lower-cased text]`` pairs.

    Documents whose body is missing or empty are skipped.
    """
    X = []
    Y = []
    docs_lst = []
    # ``items()`` works on both Python 2 and 3 mappings; ``iteritems()``
    # exists only on Python 2.
    for doc_id, doc in docs.items():
        # Guard before lower-casing: a missing body would otherwise raise
        # AttributeError before the emptiness check could skip it.
        text = (lookup(doc, 'BODY') or '').lower()
        if not text:
            continue

        X.append(features_to_vector(vocab, extract_features(text)))
        Y.append(1 if doc['label'] else 0)
        docs_lst.append([doc_id, text])

    return Y, X, docs_lst
Пример #7
0
 def getLocationFromTweet(tweet):
     """Return the ``'location'`` value of the tweet's user, or ``None``.

     ``None`` is returned when ``getUserFromTweet`` yields nothing for the
     tweet, or when the user's location is missing or empty.
     """
     author = getUserFromTweet(tweet)
     if not author:
         return None
     # Any falsy location (missing, empty string) collapses to None.
     return lookup(author, 'location') or None
Пример #8
0
 def getLocationFromTweet(tweet):
     """Return the location stored on the tweet's user, or ``None``.

     ``None`` is returned when ``Utils.getUserFromTweet`` yields nothing for
     the tweet, or when the user's ``Constants.TWEET_USER_LOCATION`` entry
     is missing or empty.
     """
     author = Utils.getUserFromTweet(tweet)
     if not author:
         return None
     place = lookup(author, Constants.TWEET_USER_LOCATION)
     usable = place and len(place) > 0
     return place if usable else None
Пример #9
0
def ext_latlong(tweet):
    """Extract a ``(lat, lon)`` pair from a tweet.

    The tweet's ``geo`` entry is used when it is a ``'Point'``; otherwise
    the user's free-text ``user.location`` is searched for a
    ``"<decimal>, <decimal>"`` pair.

    Returns:
        ``((lat, lon), loc_type)`` with both coordinates as floats, where
        ``loc_type`` is ``'OFFICIAL'`` for a geo point and ``'REGEX'`` for
        a match in the user location.  When neither source yields
        coordinates, ``(None, None)`` is returned.
    """
    # Signed decimal with 1-3 integer digits and at least 3 fraction digits.
    OneCoord = r'([-+]?\d{1,3}\.\d{3,})'
    Separator = r', ?'
    LatLong = re.compile(OneCoord + Separator + OneCoord, re.U)

    geo = lookup(tweet, 'geo')
    if geo and geo['type'] == 'Point':
        lat, lon = geo['coordinates']
        loc_type = 'OFFICIAL'
    else:
        # A missing user location is treated like an empty one.
        loc = (lookup(tweet, 'user.location') or '').strip()
        m = LatLong.search(loc.encode('utf8')) if loc else None
        if not m:
            # Nothing to convert: previously lat/lon/loc_type were left
            # unbound here and the function raised UnboundLocalError.
            return None, None
        lat, lon = m.groups()
        loc_type = 'REGEX'

    return (float(lat), float(lon)), loc_type
Пример #10
0
 def resolveLocationUsingPlace(self, tweet):
     """Resolve a location from the place attached to ``tweet``.

     The place comes from ``self.getPlaceFromTweet`` and is dispatched on
     its ``place_type``:

     * ``city``: the place ``name`` is the city.  For the United States the
       state is also extracted from ``full_name`` using
       ``self.statePattern`` and ``self.stateAbbreviationToFullName``.
     * ``admin``: the place ``name`` is used as the state.
     * ``country``: only the country is used.
     * ``neighborhood`` / ``poi``: the city is the second comma-separated
       component of ``full_name``, if there is one.

     Returns:
         The result of ``self.getLocationForPlace(...)``, or ``None`` when
         the tweet has no place, the place lacks a country or a required
         ``full_name``, or the place type is missing or unrecognised (a
         warning is logged in the last three cases).
     """
     place = self.getPlaceFromTweet(tweet)
     if place is None:
         return None

     url = lookup(place, 'url')
     place_id = lookup(place, 'id')
     country = lookup(place, 'country')
     if country is None:
         logger.warn("Found place with no country: {}".format(place))
         return None

     # Normalise the country name.  The key must be the lower-cased string;
     # the bound method ``country.lower`` never matches a key.
     country_key = country.lower()
     if country_key in self.placeNameToNormalizedPlaceName:
         country = self.placeNameToNormalizedPlaceName[country_key]

     placeType = lookup(place, 'place_type')
     # A missing place_type falls through to the "unknown" branch below
     # instead of raising AttributeError on ``None.lower()``.
     kind = (placeType or '').lower()

     if kind == 'city':
         city = lookup(place, 'name')
         if country.lower() != 'united states':
             return self.getLocationForPlace(country, None, None, city, url, place_id)

         fullName = lookup(place, 'full_name')
         if not fullName:
             logger.warn("Found place with no full_name: {}".format(place))
             return None

         state = None
         match = re.search(self.statePattern, fullName)
         if match:
             # assumes statePattern matches at least two whitespace-
             # separated tokens, the second being the state -- TODO confirm
             matchedString = match.group().lower().split()[1].strip()
             if matchedString in self.stateAbbreviationToFullName:
                 state = self.stateAbbreviationToFullName[matchedString]
             else:
                 # Otherwise accept the token if it already is one of the
                 # known full state names.
                 state = next(
                     (st for st in self.stateAbbreviationToFullName.values()
                      if st == matchedString),
                     None)
         return self.getLocationForPlace(country, state, None, city, url, place_id)

     if kind == 'admin':
         state = lookup(place, 'name')
         return self.getLocationForPlace(country, state, None, None, url, place_id)

     if kind == 'country':
         return self.getLocationForPlace(country, None, None, None, url, place_id)

     if kind in ('neighborhood', 'poi'):
         fullName = lookup(place, 'full_name')
         if not fullName:
             logger.warn("Found place with no full_name: {}".format(place))
             return None
         splitFullName = fullName.split(',')
         city = None
         if len(splitFullName) > 1:
             # Drop the space that follows the comma in "name, city".
             city = splitFullName[1].strip()
         return self.getLocationForPlace(country, None, None, city, url, place_id)

     logger.warn("Unknown place type: {}".format(placeType))
     return None
Пример #11
0
 def geo_check_tweet(tweet):
     """Return a validated ``(lat, lon)`` pair for ``tweet``, or ``None``.

     The tweet's ``geo`` entry is used when it is a ``'Point'``; otherwise
     the user's ``user.location`` text is searched with the module-level
     ``LatLong`` pattern.

     Returns:
         ``(lat, lon)`` as floats, or ``None`` when the tweet has no usable
         location, the location text contains no coordinate pair, or the
         coordinates are ``(0, 0)`` or outside the valid latitude /
         longitude ranges.
     """
     geo = lookup(tweet, 'geo')
     if geo and geo['type'] == 'Point':
         lat, lon = geo['coordinates']
     else:
         # A missing user location is treated like an empty one rather
         # than raising AttributeError on ``None.strip()``.
         loc = (lookup(tweet, 'user.location') or '').strip()
         if not loc:
             return None
         m = LatLong.search(loc.encode('utf8'))
         if not m:
             return None
         lat, lon = m.groups()

     lat = float(lat)
     lon = float(lon)
     # (0, 0) is treated as junk.  The chained comparisons also reject NaN.
     if (lat, lon) == (0, 0):
         return None
     if not (-90 <= lat <= 90 and -180 <= lon <= 180):
         return None
     return (lat, lon)
def geo_check_tweet(tweet):
    """Return a validated ``(lat, lon)`` pair for ``tweet``, or ``None``.

    The tweet's ``geo`` entry is used when it is a ``'Point'``; otherwise
    the user's ``user.location`` text is searched with the module-level
    ``LatLong`` pattern.

    Returns:
        ``(lat, lon)`` as floats, or ``None`` when the tweet has no usable
        location, the location text contains no coordinate pair, or the
        coordinates are ``(0, 0)`` or outside the valid latitude /
        longitude ranges.
    """
    geo = lookup(tweet, 'geo')
    if geo and geo['type'] == 'Point':
        lat, lon = geo['coordinates']
    else:
        # A missing user location is treated like an empty one rather
        # than raising AttributeError on ``None.strip()``.
        loc = (lookup(tweet, 'user.location') or '').strip()
        if not loc:
            return None
        m = LatLong.search(loc.encode('utf8'))
        if not m:
            return None
        lat, lon = m.groups()

    lat = float(lat)
    lon = float(lon)
    # (0, 0) is treated as junk.  The chained comparisons also reject NaN.
    if (lat, lon) == (0, 0):
        return None
    if not (-90 <= lat <= 90 and -180 <= lon <= 180):
        return None
    return (lat, lon)
Пример #13
0
 def getPlaceFromTweet(tweet):
     """Return the tweet's ``Constants.PLACE`` entry, or ``None`` if absent."""
     # ``in`` replaces ``dict.has_key``, which no longer exists on Python 3.
     if Constants.PLACE in tweet:
         return lookup(tweet, Constants.PLACE)
     return None
Пример #14
0
 def getUserFromTweet(tweet):
     """Return the tweet's ``Constants.TWEET_USER`` entry, or ``None`` if absent."""
     # ``in`` replaces ``dict.has_key``, which no longer exists on Python 3.
     if Constants.TWEET_USER in tweet:
         return lookup(tweet, Constants.TWEET_USER)
     return None
Пример #15
0
 def getPlaceFromTweet(tweet):
     """Return the tweet's ``Constants.PLACE`` entry, or ``None`` if absent."""
     # ``in`` replaces ``dict.has_key``, which no longer exists on Python 3.
     if Constants.PLACE in tweet:
         return lookup(tweet, Constants.PLACE)
     return None
Пример #16
0
Brendan O'Connor (brenocon.com)
"""
import sys, os, re
# sys.path.insert(0,'/h/brendano/proc')
# sys.path.insert(0,'/mal1/brendano/twi/twproc/proc')
# sys.path.insert(0, os.path.join(os.path.dirname(__file__),'../nlp'))
import twokenize
from hose_util import lookup, iterate, json

# Pattern for a "lat, lon" pair written as text: two signed decimals, each
# with 1-3 integer digits and at least three fractional digits, separated by
# a comma and an optional space.
OneCoord = r'([-+]?\d{1,3}\.\d{3,})'
Separator = r', ?'
LatLong = re.compile(OneCoord + Separator + OneCoord, re.U)

# Filter the tweets yielded by iterate(); every `continue` drops one tweet.
for raw, tweet in iterate(raw=True):
    # Drop tweets whose source string mentions "Buoy".
    source = lookup(tweet, 'source')
    if "Buoy" in source:
        # print "REJECT BUOY\t" + json.dumps(tweet)
        continue

    # Missing counts are treated as 0.  Only users with fewer than 1000
    # followers AND fewer than 1000 friends are kept.
    n_fol = lookup(tweet, 'user.followers_count') or 0
    n_fri = lookup(tweet, 'user.friends_count') or 0
    if not (n_fol < 1000 and n_fri < 1000):
        # print "REJECT FOLLOWERS\t" + json.dumps(lookup(tweet,'user'))
        continue

    # Drop tweets whose text is empty or whitespace only.
    text = lookup(tweet, 'text')
    if not text.strip():
        # print "REJECT NO TEXT\t" + json.dumps(record)
        continue
Пример #17
0
 def getUserFromTweet(tweet):
     """Return the tweet's ``Constants.TWEET_USER`` entry, or ``None`` if absent."""
     # ``in`` replaces ``dict.has_key``, which no longer exists on Python 3.
     if Constants.TWEET_USER in tweet:
         return lookup(tweet, Constants.TWEET_USER)
     return None
Пример #18
0
def Geo_C(intput):
    """Return the coordinates of the first acceptable geotagged tweet.

    Tweets are read with ``iterate(raw=True, inputList=intput)`` and
    filtered in order; the first tweet that passes every filter decides
    the result.  A tweet is skipped when:

    * its ``source`` contains ``"Buoy"``;
    * the user has 1000 or more followers, or 1000 or more friends
      (missing counts are treated as 0);
    * its text is missing, empty or whitespace only;
    * it has neither a ``geo`` point nor a ``"lat, lon"`` pair in
      ``user.location``;
    * the coordinates are ``(0, 0)`` or outside the valid latitude /
      longitude ranges;
    * it is a retweet (has ``retweeted_status`` or contains the token
      ``'RT'``);
    * the user's screen name contains a tab (reported on stderr).

    Args:
        intput: Passed to ``iterate`` as ``inputList``.  The misspelled
            name is kept so existing keyword callers keep working.

    Returns:
        The string ``"<lon>+<lat>"`` for the first accepted tweet, or
        ``None`` when no tweet is accepted.
    """
    # Signed decimal with 1-3 integer digits and at least 3 fraction digits.
    OneCoord = r'([-+]?\d{1,3}\.\d{3,})'
    Separator = r', ?'
    LatLong = re.compile(OneCoord + Separator + OneCoord, re.U)

    for raw, tweet in iterate(raw=True, inputList=intput):
        # Missing string fields are treated as empty so that one
        # incomplete tweet is skipped instead of aborting the whole scan.
        source = lookup(tweet, 'source') or ''
        if "Buoy" in source:
            continue

        n_fol = lookup(tweet, 'user.followers_count') or 0
        n_fri = lookup(tweet, 'user.friends_count') or 0
        if not (n_fol < 1000 and n_fri < 1000):
            continue

        text = lookup(tweet, 'text') or ''
        if not text.strip():
            continue

        geo = lookup(tweet, 'geo')
        if geo and geo['type'] == 'Point':
            lat, lon = geo['coordinates']
        else:
            loc = (lookup(tweet, 'user.location') or '').strip()
            if not loc:
                continue
            m = LatLong.search(loc.encode('utf8'))
            if not m:
                continue
            lat, lon = m.groups()

        lat = float(lat)
        lon = float(lon)
        # (0, 0) is treated as junk.  The chained comparisons also
        # reject NaN.
        if (lat, lon) == (0, 0):
            continue
        if not (-90 <= lat <= 90 and -180 <= lon <= 180):
            continue

        # For our applications we usually want to kill retweets.
        if lookup(tweet, 'retweeted_status'):
            continue
        toks = twokenize.tokenize(text)
        if any(tok == 'RT' for tok in toks):
            continue

        # Build a "SmallTweet" format record.  Only its coordinates are
        # returned; the full record is used for the diagnostic below.
        record = {
            'id': lookup(tweet, 'id'),
            'user': lookup(tweet, 'user.screen_name'),
            'date': tweet['created_at_datetime'].strftime("%Y-%m-%dT%H:%M:%S"),
            'text': lookup(tweet, 'text')
        }
        record['lonlat'] = [lon, lat]

        if '\t' in record['user']:
            # ``sys.stderr.write`` works on Python 2 and 3;
            # ``print >> sys.stderr`` is Python-2-only syntax.
            sys.stderr.write("WTF\t" + json.dumps(record) + "\n")
            continue

        return str(record['lonlat'][0]) + '+' + str(record['lonlat'][1])

    return None
Пример #19
0
Brendan O'Connor (brenocon.com)
"""
import sys, os, re
# sys.path.insert(0,'/h/brendano/proc')
# sys.path.insert(0,'/mal1/brendano/twi/twproc/proc')
# sys.path.insert(0, os.path.join(os.path.dirname(__file__),'../nlp'))
import twokenize
from hose_util import lookup, iterate, json

# Pattern for a "lat, lon" pair written as text: two signed decimals, each
# with 1-3 integer digits and at least three fractional digits, separated by
# a comma and an optional space.
OneCoord = r'([-+]?\d{1,3}\.\d{3,})'
Separator = r', ?'
LatLong = re.compile(OneCoord + Separator + OneCoord, re.U)

# Filter the tweets yielded by iterate(); every `continue` drops one tweet.
for raw, tweet in iterate(raw=True):
  # Drop tweets whose source string mentions "Buoy".
  source = lookup(tweet, 'source')
  if "Buoy" in source:
    # print "REJECT BUOY\t" + json.dumps(tweet)
    continue

  # Missing counts are treated as 0.  Only users with fewer than 1000
  # followers AND fewer than 1000 friends are kept.
  n_fol = lookup(tweet, 'user.followers_count') or 0
  n_fri = lookup(tweet, 'user.friends_count') or 0
  if not (n_fol < 1000 and n_fri < 1000):
    # print "REJECT FOLLOWERS\t" + json.dumps(lookup(tweet,'user'))
    continue

  # Drop tweets whose text is empty or whitespace only.
  text = lookup(tweet, 'text')
  if not text.strip():
    # print "REJECT NO TEXT\t" + json.dumps(record)
    continue