def lookup_contacts(contact_uids, mdists, env):
    """Look up Twitter profiles for contacts or leafs and merge them in.

    Peeks at the first uid to determine which mod group this input belongs
    to, loads the set of already-stored uids for that group (from the saved
    snapshot if present, otherwise from the database), then fetches the
    missing users from Twitter in chunks of 100, geocodes each profile's
    free-text location, and merges the result into the database.

    Args:
        contact_uids: iterable of Twitter user ids to look up.
        mdists: median-distance data passed through to the gisgraphy
            resource for geocoding.
        env: job environment used to locate/load the saved-users snapshot.

    Yields:
        int: the number of users returned by each Twitter lookup call.
    """
    twit = twitter.TwitterResource()
    gis = gisgraphy.GisgraphyResource()
    gis.set_mdists(mdists)
    # FIXME: we need a better way to know which file we are on.
    # FIXME: use the new input_paths thing
    first, contact_uids = utils.peek(contact_uids)
    group = User.mod_id(first)
    logging.info('lookup old uids for %s', group)
    save_name = 'saved_users.%s' % group
    if env.name_exists(save_name):
        stored = set(env.load(save_name))
    else:
        stored = User.mod_id_set(int(group))
    logging.info('loaded mod_group %s of %d users', group, len(stored))
    # `uid` (was `id`) avoids shadowing the builtin id()
    missing = (uid for uid in contact_uids if uid not in stored)
    chunks = utils.grouper(100, missing, dontfill=True)
    for chunk in chunks:
        users = twit.user_lookup(user_ids=list(chunk))
        for amigo in filter(None, users):
            # sanity check: every looked-up user must belong to this group
            assert User.mod_id(amigo._id) == group
            amigo.geonames_place = gis.twitter_loc(amigo.location)
            amigo.merge()
        yield len(users)
def at_tuples(geo_at):
    """Expand one (creator, mentions) record into per-mention pairs.

    Given a (tweet creator uid, list of mentioned uids) tuple, emit one
    (mod group of mentioned user, (mentioned uid, creator uid)) pair per
    mention, so downstream steps can shard on the mentioned user's id.
    """
    creator, mentioned_uids = geo_at
    for target in mentioned_uids:
        shard = User.mod_id(target)
        yield shard, (target, creator)
def geo_ats():
    """Fetch every stored at-mention from the database.

    Emits (mod group of the tweet owner, (owner uid, list of mentioned
    uids)) for each tweet document that has a non-empty `ats` field.
    """
    for tweet_doc in Tweets.find({}, fields=['ats']):
        if not tweet_doc.ats:
            continue
        owner = tweet_doc._id
        yield User.mod_id(owner), (owner, tweet_doc.ats)
def saved_users():
    """Enumerate the ids of users already stored in the database.

    Builds the set of known ids so lookup_contacts can skip them; querying
    the database per-user inside lookup_contacts is too slow. Returns a
    generator of (mod group, user id) pairs. The query runs with
    timeout=False because the full scan can outlive the default cursor
    timeout.
    """
    cursor = User.database.User.find({}, fields=[], timeout=False)
    return ((User.mod_id(rec['_id']), rec['_id']) for rec in cursor)
def pick_nebrs(mloc_uid):
    """Pick the 25 located contacts for one target user and persist them.

    Loads the target user, selects their neighbors via _pick_neighbors,
    saves the user, and returns (mod group, neighbor id) pairs for the
    chosen neighbors.
    """
    # reads predict.prep.mloc_uids, requires lookup_contacts, but don't read it.
    target = User.get_id(mloc_uid)
    nebrs = _pick_neighbors(target)
    target.neighbors = nebrs
    target.save()
    return ((User.mod_id(nebr), nebr) for nebr in nebrs)
def parse_geotweets(tweets):
    """Read tweets from Twitter's streaming API and save users and their
    tweets.

    USAGE: gunzip -c ~/may/*/*.gz | ./gb.py -s parse_geotweets

    For each geotagged tweet, emits the tweet author's profile (once per
    author) and an (author uid, [lon, lat]) coordinate pair, each keyed by
    the author's mod group. Non-tweet stream records (those without an
    'id' field) and tweets without coordinates are skipped.
    """
    # We save users and locations intermingled because this data is too big to
    # fit in memory, and we do not want to do two passes.
    users = set()
    for i, t in enumerate(tweets):
        if i % 10000 == 0:
            # lazy %-args match the logging style used elsewhere in this
            # module and avoid formatting when the record is filtered out
            logging.info("read %d tweets", i)
        if 'id' not in t:
            continue  # this is not a tweet
        uid = t['user']['id']
        if not t.get('coordinates'):
            continue
        if uid not in users:
            yield User.mod_id(uid), t['user']
            users.add(uid)
        yield User.mod_id(uid), (uid, t['coordinates']['coordinates'])
    logging.info("sending up to %d users", len(users))
def _my_contacts(user):
    """Yield a (mod group, contact id) pair for each of the user's contacts."""
    for contact_id in user.contacts:
        yield User.mod_id(contact_id), contact_id