def getLatLngFromTweet(tweet):
    """Return the tweet's coordinates as a ``(latitude, longitude)`` tuple.

    The value stored under ``Constants.COORDINATES`` on the tweet is looked
    up, then the same key is looked up again inside it to get the
    coordinate list.  Index 0 of that list is read as the longitude and
    index 1 as the latitude; the returned tuple swaps them.

    Returns ``None`` when the tweet carries no coordinates entry or when
    the coordinate list is missing, too short, or not numeric.
    """
    coordinates = lookup(tweet, Constants.COORDINATES)
    if not coordinates:
        return None

    coordinateList = lookup(coordinates, Constants.COORDINATES)
    try:
        longitude = float(coordinateList[0])
        latitude = float(coordinateList[1])
    except (TypeError, ValueError, IndexError, KeyError):
        # Only malformed data is treated as "no coordinates".  A bare
        # ``except:`` would also swallow KeyboardInterrupt / SystemExit.
        return None

    return (latitude, longitude)
def resolveLocationUsingPlace(self, tweet):
    """Resolve a location from the tweet's place object.

    Reads ``url``, ``id``, ``country``, ``place_type``, ``name`` and
    ``full_name`` from the place returned by ``self.getPlaceFromTweet``
    and forwards them to ``self.getLocationForPlace``.  Which arguments
    are filled depends on the place type:

    * ``city``: country and city; for the United States the state is also
      extracted from ``full_name`` using ``self.statePattern``.
    * ``admin``: country and state (the place ``name``).
    * ``country``: country only.
    * ``neighborhood`` / ``poi``: country and the second comma-separated
      component of ``full_name`` as the city (if present).

    Returns the result of ``getLocationForPlace``, or ``None`` when the
    tweet has no place, the place has no country, a required ``full_name``
    is missing, or the place type is missing or unrecognised.
    """
    place = self.getPlaceFromTweet(tweet)
    if place is None:
        return None

    url = lookup(place, 'url')
    place_id = lookup(place, 'id')
    country = lookup(place, 'country')
    if country is None:
        logger.warn("Found place with no country: {}".format(place))
        return None

    # The key must be the lowercased *string*.  Using the bound method
    # ``country.lower`` (without calling it) never matches any key, and
    # the mapping lives on ``self``.
    countryKey = country.lower()
    if countryKey in self.placeNameToNormalizedPlaceName:
        country = self.placeNameToNormalizedPlaceName[countryKey]

    rawPlaceType = lookup(place, 'place_type')
    # A missing place type falls through to the "unknown" branch below
    # instead of raising AttributeError on ``None.lower()``.
    placeType = rawPlaceType.lower() if rawPlaceType else ''

    if placeType == 'city':
        city = lookup(place, 'name')
        if country.lower() != 'united states':
            return self.getLocationForPlace(
                country, None, None, city, url, place_id)

        fullName = lookup(place, 'full_name')
        if not fullName:
            logger.warn("Found place with no full_name: {}".format(place))
            return None

        state = None
        match = re.search(self.statePattern, fullName)
        if match:
            # NOTE(review): takes the second whitespace-separated token of
            # the match; this relies on the shape of self.statePattern,
            # which is not visible here.
            matchedString = match.group().lower().split()[1].strip()
            if matchedString in self.stateAbbreviationToFullName:
                state = self.stateAbbreviationToFullName[matchedString]
            else:
                # Fall back to a full state name that equals the token.
                state = next(
                    (st for st in self.stateAbbreviationToFullName.values()
                     if st == matchedString),
                    None)
        return self.getLocationForPlace(
            country, state, None, city, url, place_id)

    if placeType == 'admin':
        state = lookup(place, 'name')
        return self.getLocationForPlace(
            country, state, None, None, url, place_id)

    if placeType == 'country':
        return self.getLocationForPlace(
            country, None, None, None, url, place_id)

    if placeType in ('neighborhood', 'poi'):
        fullName = lookup(place, 'full_name')
        if not fullName:
            logger.warn("Found place with no full_name: {}".format(place))
            return None
        splitFullName = fullName.split(',')
        # NOTE(review): the component is passed on un-stripped, so it keeps
        # any leading space that followed the comma.
        city = splitFullName[1] if len(splitFullName) > 1 else None
        return self.getLocationForPlace(
            country, None, None, city, url, place_id)

    logger.warn("Unknown place type: {}".format(rawPlaceType))
    return None
def getLocationFromTweet(tweet):
    """Return the location value stored on the tweet's user, or ``None``.

    The user is obtained through ``Utils.getUserFromTweet`` and the value
    is read from the ``Constants.TWEET_USER_LOCATION`` key.  ``None`` is
    returned when there is no user or the value is empty.
    """
    user = Utils.getUserFromTweet(tweet)
    if not user:
        return None

    location = lookup(user, Constants.TWEET_USER_LOCATION)
    if not location or len(location) <= 0:
        return None
    return location
def getLocationFromTweet(tweet):
    """Return the ``'location'`` value of the tweet's user, or ``None``.

    ``None`` is returned when ``getUserFromTweet`` yields no user or when
    the looked-up value is falsy.
    """
    user = getUserFromTweet(tweet)
    if not user:
        return None
    # A falsy lookup result (missing / empty) is reported as None.
    return lookup(user, 'location') or None
def create_features(vocab, docs):
    """Build feature vectors and binary labels for every document with a body.

    Args:
        vocab: passed unchanged to ``features_to_vector``.
        docs: mapping of document id -> document.  Each document is read
            through ``lookup(doc, 'BODY')`` and ``doc['label']``.

    Returns:
        A tuple ``(Y, X, docs_lst)`` (labels first) where, for every
        document with a non-empty body, ``X`` holds its feature vector,
        ``Y`` holds ``1`` for a truthy label and ``0`` otherwise, and
        ``docs_lst`` holds ``[doc_id, lowercased_text]``.  The three lists
        are index-aligned.
    """
    X = []
    Y = []
    docs_lst = []

    for doc_id, doc in docs.iteritems():
        body = lookup(doc, 'BODY')
        # Check before lowercasing so a missing (None) body is skipped
        # like an empty one instead of raising AttributeError.
        if not body:
            continue
        text = body.lower()

        X.append(features_to_vector(vocab, extract_features(text)))
        Y.append(1 if doc['label'] else 0)
        docs_lst.append([doc_id, text])

    return Y, X, docs_lst
def getLocationFromTweet(tweet):
    """Look up ``'location'`` on the tweet's user.

    Returns the looked-up value when both the user and the value are
    truthy; otherwise returns ``None``.
    """
    profile = getUserFromTweet(tweet)
    place_text = lookup(profile, 'location') if profile else None
    return place_text if place_text else None
def ext_latlong(tweet):
    """Extract a coordinate pair from a tweet.

    The tweet's ``geo`` entry is used when it is a ``Point``; otherwise the
    user's profile location string is searched for two decimal numbers
    (1-3 integer digits, at least 3 fractional digits) separated by a comma
    and an optional space.

    Returns:
        ``((lat, lon), loc_type)`` where ``loc_type`` is ``'OFFICIAL'`` for
        a ``geo`` point and ``'REGEX'`` for a pair parsed from the profile
        location.  The two numbers are returned in the order they appear
        in the source data.  When neither source yields a pair the result
        is ``(None, None)``, so callers can still unpack two values.
    """
    OneCoord = r'([-+]?\d{1,3}\.\d{3,})'
    Separator = r', ?'
    LatLong = re.compile(OneCoord + Separator + OneCoord, re.U)

    geo = lookup(tweet, 'geo')
    if geo and geo['type'] == 'Point':
        lat, lon = geo['coordinates']
        loc_type = 'OFFICIAL'
    else:
        # Treat a missing location the same as an empty one.
        loc = (lookup(tweet, 'user.location') or '').strip()
        m = LatLong.search(loc.encode('utf8')) if loc else None
        if not m:
            # Without this guard, lat / lon / loc_type would be unbound
            # below and the function would die with UnboundLocalError.
            return None, None
        lat, lon = m.groups()
        loc_type = 'REGEX'

    return (float(lat), float(lon)), loc_type
def resolveLocationUsingPlace(self, tweet):
    """Resolve a location from the tweet's place object.

    Reads ``url``, ``id``, ``country``, ``place_type``, ``name`` and
    ``full_name`` from the place returned by ``self.getPlaceFromTweet``
    and forwards them to ``self.getLocationForPlace``.  Which arguments
    are filled depends on the place type:

    * ``city``: country and city; for the United States the state is also
      extracted from ``full_name`` using ``self.statePattern``.
    * ``admin``: country and state (the place ``name``).
    * ``country``: country only.
    * ``neighborhood`` / ``poi``: country and the second comma-separated
      component of ``full_name`` as the city (if present).

    Returns the result of ``getLocationForPlace``, or ``None`` when the
    tweet has no place, the place has no country, a required ``full_name``
    is missing, or the place type is missing or unrecognised.
    """
    place = self.getPlaceFromTweet(tweet)
    if place is None:
        return None

    url = lookup(place, 'url')
    place_id = lookup(place, 'id')
    country = lookup(place, 'country')
    if country is None:
        logger.warn("Found place with no country: {}".format(place))
        return None

    # The key must be the lowercased *string*.  Using the bound method
    # ``country.lower`` (without calling it) never matches any key, and
    # the mapping lives on ``self``.
    countryKey = country.lower()
    if countryKey in self.placeNameToNormalizedPlaceName:
        country = self.placeNameToNormalizedPlaceName[countryKey]

    rawPlaceType = lookup(place, 'place_type')
    # A missing place type falls through to the "unknown" branch below
    # instead of raising AttributeError on ``None.lower()``.
    placeType = rawPlaceType.lower() if rawPlaceType else ''

    if placeType == 'city':
        city = lookup(place, 'name')
        if country.lower() != 'united states':
            return self.getLocationForPlace(
                country, None, None, city, url, place_id)

        fullName = lookup(place, 'full_name')
        if not fullName:
            logger.warn("Found place with no full_name: {}".format(place))
            return None

        state = None
        match = re.search(self.statePattern, fullName)
        if match:
            # NOTE(review): takes the second whitespace-separated token of
            # the match; this relies on the shape of self.statePattern,
            # which is not visible here.
            matchedString = match.group().lower().split()[1].strip()
            if matchedString in self.stateAbbreviationToFullName:
                state = self.stateAbbreviationToFullName[matchedString]
            else:
                # Fall back to a full state name that equals the token.
                state = next(
                    (st for st in self.stateAbbreviationToFullName.values()
                     if st == matchedString),
                    None)
        return self.getLocationForPlace(
            country, state, None, city, url, place_id)

    if placeType == 'admin':
        state = lookup(place, 'name')
        return self.getLocationForPlace(
            country, state, None, None, url, place_id)

    if placeType == 'country':
        return self.getLocationForPlace(
            country, None, None, None, url, place_id)

    if placeType in ('neighborhood', 'poi'):
        fullName = lookup(place, 'full_name')
        if not fullName:
            logger.warn("Found place with no full_name: {}".format(place))
            return None
        splitFullName = fullName.split(',')
        # NOTE(review): the component is passed on un-stripped, so it keeps
        # any leading space that followed the comma.
        city = splitFullName[1] if len(splitFullName) > 1 else None
        return self.getLocationForPlace(
            country, None, None, city, url, place_id)

    logger.warn("Unknown place type: {}".format(rawPlaceType))
    return None
def geo_check_tweet(tweet):
    """Return a validated ``(lat, lon)`` float pair for the tweet, or ``None``.

    A ``geo`` entry of type ``Point`` supplies the pair directly; otherwise
    the user's profile location is searched with the module-level
    ``LatLong`` pattern.  ``None`` is returned when no pair is found, when
    the pair is exactly ``(0, 0)``, or when either value falls outside
    the -90..90 / -180..180 ranges.
    """
    geo = lookup(tweet, 'geo')
    if geo and geo['type'] == 'Point':
        raw_lat, raw_lon = geo['coordinates']
    else:
        user_loc = lookup(tweet, 'user.location').strip()
        if not user_loc:
            # No profile location to parse.
            return None
        found = LatLong.search(user_loc.encode('utf8'))
        if not found:
            # Profile location holds no coordinate pair.
            return None
        raw_lat, raw_lon = found.groups()

    lat = float(raw_lat)
    lon = float(raw_lon)

    is_origin = (lat, lon) == (0, 0)
    off_globe = lat < -90 or lat > 90 or lon < -180 or lon > 180
    if is_origin or off_globe:
        # Junk coordinates.
        return None
    return (lat, lon)
def getPlaceFromTweet(tweet):
    """Return the tweet's ``Constants.PLACE`` entry, or ``None`` if absent.

    Uses the ``in`` operator rather than ``dict.has_key``, which exists
    only on Python 2 dictionaries.
    """
    if Constants.PLACE in tweet:
        return lookup(tweet, Constants.PLACE)
    return None
def getUserFromTweet(tweet):
    """Return the tweet's ``Constants.TWEET_USER`` entry, or ``None`` if absent.

    Uses the ``in`` operator rather than ``dict.has_key``, which exists
    only on Python 2 dictionaries.
    """
    if Constants.TWEET_USER in tweet:
        return lookup(tweet, Constants.TWEET_USER)
    return None
Brendan O'Connor (brenocon.com) """ import sys, os, re # sys.path.insert(0,'/h/brendano/proc') # sys.path.insert(0,'/mal1/brendano/twi/twproc/proc') # sys.path.insert(0, os.path.join(os.path.dirname(__file__),'../nlp')) import twokenize from hose_util import lookup, iterate, json OneCoord = r'([-+]?\d{1,3}\.\d{3,})' Separator = r', ?' LatLong = re.compile(OneCoord + Separator + OneCoord, re.U) for raw, tweet in iterate(raw=True): source = lookup(tweet, 'source') if "Buoy" in source: # print "REJECT BUOY\t" + json.dumps(tweet) continue n_fol = lookup(tweet, 'user.followers_count') or 0 n_fri = lookup(tweet, 'user.friends_count') or 0 if not (n_fol < 1000 and n_fri < 1000): # print "REJECT FOLLOWERS\t" + json.dumps(lookup(tweet,'user')) continue text = lookup(tweet, 'text') if not text.strip(): # print "REJECT NO TEXT\t" + json.dumps(record) continue
def Geo_C(intput):
    """Filter tweets and return a ``"<lon>+<lat>"`` string for an accepted one.

    Tweets are read via ``iterate(raw=True, inputList=intput)``.  A tweet is
    skipped when its source contains "Buoy", when the user has 1000 or more
    followers or friends, when the text is blank, when no valid coordinate
    pair can be found (``geo`` Point, else a regex match on the profile
    location), when it is a retweet, or when the screen name contains a tab.

    NOTE(review): the original layout of this function was lost, so the
    nesting of the final ``return`` is reconstructed.  It is assumed to sit
    inside the loop (replacing the commented-out ``print``), meaning the
    first accepted tweet is returned and ``None`` results when no tweet is
    accepted -- confirm against the original file.
    """
    OneCoord = r'([-+]?\d{1,3}\.\d{3,})'
    Separator = r', ?'
    LatLong = re.compile(OneCoord + Separator + OneCoord, re.U)
    for raw, tweet in iterate(raw=True, inputList=intput):
        source = lookup(tweet, 'source')
        if "Buoy" in source:
            # print "REJECT BUOY\t" + json.dumps(tweet)
            continue
        n_fol = lookup(tweet, 'user.followers_count') or 0
        n_fri = lookup(tweet, 'user.friends_count') or 0
        # Keep only accounts with fewer than 1000 followers AND friends.
        if not (n_fol < 1000 and n_fri < 1000):
            # print "REJECT FOLLOWERS\t" + json.dumps(lookup(tweet,'user'))
            continue
        text = lookup(tweet, 'text')
        if not text.strip():
            # print "REJECT NO TEXT\t" + json.dumps(record)
            continue
        lat = None
        lon = None
        orig_str = ""
        loc_type = None
        geo = lookup(tweet, 'geo')
        if geo and geo['type'] == 'Point':
            lat, lon = geo['coordinates']
            loc_type = 'OFFICIAL'
        else:
            # Fall back to a coordinate pair typed into the profile location.
            loc = lookup(tweet, 'user.location').strip()
            if not loc:
                # print "REJECT NO USERLOC\t" + json.dumps(record)
                continue
            m = LatLong.search(loc.encode('utf8'))
            if not m:
                # print "REJECT NO GEO REGEX\t" + json.dumps(record)
                continue
            lat, lon = m.groups()
            loc_type = 'REGEX'
        lat = float(lat); lon = float(lon)
        # Reject the (0, 0) placeholder and anything outside valid ranges.
        if (lat, lon) == (0, 0) or lat < -90 or lat > 90 or lon < -180 or lon > 180:
            # print "REJECT JUNK GEO\t" + json.dumps([lat,lon]) + "\t" + json.dumps(record)
            continue
        # For our applications we usually want to kill retweets
        if lookup(tweet, 'retweeted_status'):
            # print "REJECT OFFICIAL RT\t" + json.dumps(text)
            continue
        toks = twokenize.tokenize(text)
        if any(tok == 'RT' for tok in toks):
            # print "REJECT TEXT RT\t" + json.dumps(text)
            continue
        # Build a "SmallTweet" format record
        record = {
            'id': lookup(tweet, 'id'),
            'user': lookup(tweet, 'user.screen_name'),
            'date': tweet['created_at_datetime'].strftime("%Y-%m-%dT%H:%M:%S"),
            'text': lookup(tweet, 'text')
        }
        # Stored longitude-first.
        record['lonlat'] = [lon, lat]
        # A tab in the screen name would corrupt tab-separated output.
        if '\t' in record['user']:
            print >> sys.stderr, "WTF\t" + json.dumps(record)
            continue
        out = [
            # 'GEO ' + loc_type,
            # str(record['id']),
            # record['user'].encode('utf-8'),
            # record['date'].encode('utf-8'),
            str(record['lonlat'][0]) + '+' + str(record['lonlat'][1])
            # record['text'].encode('utf-8')
            # json.dumps(lookup(tweet, 'user.location')),
            # json.dumps(lookup(tweet, 'source')),
            # json.dumps(record),
        ]
        # TempVar = record
        # print '\t'.join(out)
        return '\t'.join(out)