def lookup_contacts(contact_uids, mdists, env):
    """ lookup user profiles for contacts or leafs """
    twit = twitter.TwitterResource()
    gis = gisgraphy.GisgraphyResource()
    gis.set_mdists(mdists)

    # FIXME: we need a better way to know which file we are on.
    # FIXME: use the new input_paths thing
    first, contact_uids = utils.peek(contact_uids)
    group = User.mod_id(first)
    logging.info('lookup old uids for %s', group)
    save_name = 'saved_users.%s' % group
    if env.name_exists(save_name):
        stored = set(env.load(save_name))
    else:
        stored = User.mod_id_set(int(group))
    logging.info('loaded mod_group %s of %d users', group, len(stored))
    missing = (id for id in contact_uids if id not in stored)

    chunks = utils.grouper(100, missing, dontfill=True)
    for chunk in chunks:
        users = twit.user_lookup(user_ids=list(chunk))
        for amigo in filter(None, users):
            assert User.mod_id(amigo._id) == group
            amigo.geonames_place = gis.twitter_loc(amigo.location)
            amigo.merge()
        yield len(users)
def cheap_locals(nebr_ids, mloc_uids, cutoff=20):
    """ local contact ratio based on 20 leafs """
    seen = set()
    # There can be duplicates because nebr_ids is created by clumping
    # nebr_split.
    for nebr_id in nebr_ids:
        if nebr_id in seen:
            continue
        seen.add(nebr_id)

        user = User.get_id(nebr_id)
        user_loc = user.geonames_place.to_d()
        cids = [
            cid
            for key in User.NEBR_KEYS
            for cid in (getattr(user, key) or [])
            if cid not in mloc_uids
        ]
        if not cids:
            continue
        random.shuffle(cids)
        leafs = User.find(User._id.is_in(cids[:cutoff]), fields=['gnp'])
        dists = [
            coord_in_miles(user_loc, leaf.geonames_place.to_d())
            for leaf in leafs
            if leaf.has_place()
        ]
        if dists:
            blur = sum(1.0 for d in dists if d < 25) / len(dists)
            yield user._id, blur
def pick_nebrs(mloc_uid):
    """ For each target user, pick the 25 located contacts. """
    # reads predict.prep.mloc_uids, requires lookup_contacts, but doesn't
    # read its output.
    user = User.get_id(mloc_uid)
    user.neighbors = _pick_neighbors(user)
    user.save()
    return ((User.mod_id(n), n) for n in user.neighbors)
def test_fix_mloc_mdists(self):
    self.FS["mdists"] = [dict(other=2)]
    self.FS["mloc_uids.03"] = [3, 103]
    User(_id=3, location="Texas").save()
    User(_id=103, location="Bryan, TX").save()
    with _patch_gisgraphy():
        self.gob.run_job("fix_mloc_mdists")
    u3 = User.get_id(3)
    u103 = User.get_id(103)
    self.assertEqual(u3.geonames_place.mdist, 2000)
    self.assertEqual(u103.geonames_place.mdist, 2)
def _paged_users(uids, **find_kwargs):
    # save some round trips by asking for 100 at a time
    groups = utils.grouper(100, uids, dontfill=True)
    return chain.from_iterable(
        User.find(User._id.is_in(list(group)), **find_kwargs)
        for group in groups
    )
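# A minimal sketch of the batching above, assuming utils.grouper behaves
# like the itertools grouper recipe (the uids here are hypothetical):
#
#   for user in _paged_users(xrange(250), fields=['gnp']):
#       ...
#
# grouper yields tuples of at most 100 uids, so this issues three
# User.find() queries (100+100+50 ids) chained into a single iterator.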
def find_contacts(user_ds):
    """ for each target user, fetch edges and tweets, pick 100 located
    contact ids """
    gis = gisgraphy.GisgraphyResource()
    twit = twitter.TwitterResource()
    for user_d in itertools.islice(user_ds, 2600):
        user = User.get_id(user_d['id'])
        if user:
            logging.warn("not revisiting %d", user._id)
        else:
            user = User(user_d)
            user.geonames_place = gis.twitter_loc(user.location)
            _save_user_contacts(twit, user, _pick_random_contacts, limit=100)
        for mod_nebr in _my_contacts(user):
            yield mod_nebr
def rfr_triads(user_d):
    """ find a target user with a social triangle and a recip friend not in
    that triangle. Return info about all four users. """
    # We are looking for this structure in the social graph:
    # my  you---our
    #   \  |  /
    #     me
    # me is a target user, the other users are contacts, and the edges are
    # all reciprocal.
    me = User(user_d)
    me_rfr = set(me.rfriends or []).intersection(me.neighbors or [])
    if len(me_rfr) < 3:
        return []

    for you_id in me_rfr:
        you_ed = Edges.get_id(you_id)
        if not you_ed:
            continue  # There are no edges for this neighbor.
        ours = me_rfr.intersection(you_ed.friends, you_ed.followers)
        mine = me_rfr.difference(you_ed.friends, you_ed.followers)
        if ours and mine:
            d = dict(
                me=dict(_id=me._id, loc=me.median_loc),
                you=dict(_id=you_id),
                my=dict(_id=random.choice(list(mine))),
                our=dict(_id=random.choice(list(ours))),
            )
            for k, v in d.iteritems():
                if k == 'me':
                    continue
                gnp = User.get_id(v['_id'], fields=['gnp']).geonames_place.to_d()
                gnp.pop('zipcode', None)
                v['loc'] = gnp
            return [d]
    return []
def test_find_contacts(self):
    self._find_contacts_6()
    results = self.FS["find_contacts.06"]
    s_res = sorted(list(r[1])[0] for r in results)
    self.assertEqual(s_res, [0, 1, 2, 3, 7, 12, 18, 24, 30])

    flor = User.get_id(6)
    self.assertEqual(flor.just_mentioned, [7])
    self.assertEqual(sorted(flor.just_friends), [12, 18, 24, 30])
def saved_users():
    """ Create set of ids already in the database so that lookup_contacts
    can skip these users. Talking to the database in lookup_contacts to
    check if users are in the database is too slow. """
    users = User.database.User.find({}, fields=[], timeout=False)
    return ((User.mod_id(u['_id']), u['_id']) for u in users)
def geo_ats():
    """ fetch all at mentions from database """
    for tweets in Tweets.find({}, fields=['ats']):
        if tweets.ats:
            uid = tweets._id
            yield User.mod_id(uid), (uid, tweets.ats)
def find_leafs(uid):
    """ for each contact, fetch edges and tweets, pick 100 leaf ids """
    twit = twitter.TwitterResource()
    user = User.get_id(uid)
    _save_user_contacts(twit, user, _pick_random_contacts, limit=100)
    return _my_contacts(user)
def at_tuples(geo_at):
    """ create (mentioned user, tweet creator) pairs from geo_ats, and
    split based on user id of mentioned user """
    uid, ats = geo_at
    for at in ats:
        yield User.mod_id(at), (at, uid)
def pred_users(uids):
    """ fetch target users from database """
    for g in utils.grouper(100, uids, dontfill=True):
        ids_group = tuple(g)
        for u in User.find(User._id.is_in(ids_group)):
            yield u.to_d()
def parse_geotweets(tweets):
    """ read tweets from Twitter's streaming API and save users and their
    tweets

    USAGE: gunzip -c ~/may/*/*.gz | ./gb.py -s parse_geotweets
    """
    # We save users and locations intermingled because this data is too big
    # to fit in memory, and we do not want to do two passes.
    users = set()
    for i, t in enumerate(tweets):
        if i % 10000 == 0:
            logging.info("read %d tweets", i)
        if 'id' not in t:
            continue  # this is not a tweet
        uid = t['user']['id']
        if not t.get('coordinates'):
            continue
        if uid not in users:
            yield User.mod_id(uid), t['user']
            users.add(uid)
        yield User.mod_id(uid), (uid, t['coordinates']['coordinates'])
    logging.info("sending up to %d users", len(users))
def nebr_dists(mloc_tile):
    """ find the distances from target users to their contacts """
    nebrs = User.find(User._id.is_in(mloc_tile['nebrs']), fields=['gnp'])
    for nebr in nebrs:
        dist = coord_in_miles(mloc_tile['mloc'], nebr.geonames_place.to_d())
        # add a one at the end to make the output format identical to
        # stranger_dists.
        yield dist, 1
def mloc_tile(mloc_uids):
    """ split the target users into tiles based on their home location """
    users = User.find(User._id.is_in(tuple(mloc_uids)),
                      fields=['mloc', 'nebrs'])
    for user in users:
        if not user.neighbors:
            continue
        lng, lat = user.median_loc
        yield _tile(lat), user.to_d()
def test_lookup_contacts(self):
    self.FS["mdists"] = [dict(other=2.5)]
    self.FS["contact_split.04"] = [4, 404]
    User.database.User = mock.MagicMock()
    User.database.User.find.return_value = [
        # MockTwitterResource will throw a 404 if you lookup user 404.
        # This lets us know the user was skipped.
        dict(_id=404)
    ]
    with _patch_twitter():
        with _patch_gisgraphy():
            self.gob.run_job("lookup_contacts")

    beryl = User.get_id(4)
    self.assertEqual(beryl.screen_name, "user_4")
    self.assertEqual(beryl.geonames_place.feature_code, "PPLA2")
    self.assertEqual(beryl.geonames_place.mdist, 3)
    missing = User.get_id(404)
    self.assertEqual(missing, None)
def mloc_uids(user_ds):
    """ pick 2500 target users who have locations and good contacts """
    retrieved = [u['id'] for u in itertools.islice(user_ds, 2600)]
    users = User.find(User._id.is_in(retrieved))
    good_ = {u._id for u in users if any(getattr(u, k) for k in NEBR_KEYS)}
    good = [uid for uid in retrieved if uid in good_]
    logging.info("found %d of %d", len(good), len(retrieved))
    # throw away accounts that didn't work to get down to the 2500 good users
    return good[:2500]
def test_find_contacts_errors(self):
    self.FS["mloc_users.04"] = [dict(id=404)]
    self.FS["mloc_users.03"] = [dict(id=503)]
    with _patch_twitter():
        self.gob.run_job("find_contacts")
    for uid in (404, 503):
        missing = User.get_id(uid)
        self.assertEqual(missing.error_status, uid)
        self.assertEqual(missing.neighbors, None)
        self.assertEqual(missing.rfriends, None)
        self.assertEqual(Edges.get_id(uid), None)
        self.assertEqual(Tweets.get_id(uid), None)
def nebrs_d(user_d, mloc_blur):
    """ create dict with lots of information about a target user's located
    contacts """
    mb = MlocBlur(*mloc_blur)
    user = User(user_d)
    nebrs = User.find(User._id.is_in(user_d['nebrs']))
    tweets = Tweets.get_id(user_d['_id'], fields=['ats'])
    res = make_nebrs_d(user, nebrs, tweets.ats)
    res['mloc'] = user_d['mloc']
    res['gnp'] = _blur_gnp(mb, user_d)
    return [res]
def _fetch_profiles(uids, twit, gis):
    users = list(User.find(User._id.is_in(uids)))
    existing_ids = {u._id for u in users}
    missing_ids = [uid for uid in uids if uid not in existing_ids]

    chunks = utils.grouper(100, missing_ids, dontfill=True)
    for chunk in chunks:
        found = twit.user_lookup(user_ids=list(chunk))
        for amigo in filter(None, found):
            amigo.geonames_place = gis.twitter_loc(amigo.location)
            amigo.merge()
            users.append(amigo)
    return users
def predict(self, user_d, steps=0):
    """ Attempt to locate a Twitter user.

    user_d should be a Twitter-style user dictionary
    steps is the number of steps on the social graph to crawl. It should
        be 0, 1, or 2. If 0, predict makes no Twitter API calls, 1 uses 4
        calls, and 2 uses around 80 API calls.

    returns (longitude, latitude) or None if no location can be found
    """
    user = User(user_d)
    if steps == 0 and not user.location:
        return None
    gnp = self.gis.twitter_loc(user.location)
    if steps == 0:
        return gnp.to_tup() if gnp else None
    if gnp and gnp.mdist < MAX_GNP_MDIST:
        user.geonames_place = gnp
        return gnp.to_tup()
    _crawl_pred_one(user, self.twit, self.gis, self.pred, fast=(steps == 1))
    return user.pred_loc
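# A minimal usage sketch for predict() (hypothetical wiring; it assumes an
# instance of the enclosing class is constructed elsewhere with its gis,
# twit, and pred resources already set up):
#
#   predictor = ...  # an instance of the class that defines predict()
#   user_d = dict(id=42, location='Austin, TX')  # Twitter-style user dict
#   loc = predictor.predict(user_d, steps=0)     # no API calls at steps=0
#   if loc is not None:
#       lng, lat = loc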
def trash_extra_mloc(mloc_uids):
    "remove the mloc_users that mloc_uids skipped over"
    # This scares me a bit, but it's too late to go back and fix
    # find_contacts. I really wish I had limited find_contacts to stop
    # after 2500 good users.
    db = User.database
    mloc_uids = set(mloc_uids)
    group_ = set(uid % 100 for uid in mloc_uids)
    assert len(group_) == 1
    group = next(iter(group_))
    stored = User.mod_id_set(group)
    trash = list(stored - mloc_uids)
    logging.info("trashing %d users", len(trash))
    logging.debug("full list: %r", trash)
    db.Edges.remove({'_id': {'$in': trash}})
    db.Tweets.remove({'_id': {'$in': trash}})
    db.User.remove({'_id': {'$in': trash}})
def _pick_neighbors(user):
    nebrs = {}
    for key in NEBR_KEYS:
        cids = getattr(user, key)
        if not cids:
            continue
        # this is slowish
        contacts = User.find(User._id.is_in(cids), fields=['gnp'])
        nebrs[key] = set(u._id for u in contacts if u.has_place())

    # izip_longest interleaves the contact types so that contacts are taken
    # round-robin from each type; filter(None, ...) drops the padding.
    picked_ = filter(None, itertools.chain.from_iterable(
        itertools.izip_longest(*nebrs.values())))
    picked = picked_[:25]
    logging.info('picked %d of %d contacts', len(picked), len(user.contacts))
    return picked
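# A worked example of the round-robin interleave in _pick_neighbors, using
# hypothetical contact ids:
#
#   izip_longest([1, 2], [3], [4, 5]) -> (1, 3, 4), (2, None, 5)
#
# chain.from_iterable then yields 1, 3, 4, 2, None, 5, and filter(None, ...)
# drops the None padding, leaving [1, 3, 4, 2, 5]. This takes contacts
# evenly from each contact type until 25 are picked.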
def fix_mloc_mdists(mloc_uids, mdists):
    """ Add the median location error to profiles of contacts and target
    users. """
    gis = gisgraphy.GisgraphyResource()
    gis.set_mdists(mdists)
    # We didn't have mdists at the time the mloc users were saved. This
    # function could be avoided by running the mdist calculation before
    # running find_contacts.
    fixed = 0
    users = User.find(User._id.is_in(tuple(mloc_uids)))
    for user in users:
        user.geonames_place = gis.twitter_loc(user.location)
        user.save()
        if user.geonames_place:
            fixed += 1
    logging.info("fixed %d mdists", fixed)
    return [fixed]
def total_contacts(user_ds):
    """ count the total number of contacts (to include in the paper) """
    for user_d in itertools.islice(user_ds, 2600):
        user = User.get_id(user_d['id'])
        if not user:
            yield "no user"
        elif user.error_status:
            yield str(user.error_status)
        else:
            edges = Edges.get_id(user._id)
            tweets = Tweets.get_id(user._id)
            if not edges or not tweets:
                yield "no contacts"
            else:
                sets = _contact_sets(tweets, edges)
                yield [len(sets[k]) for k in User.NEBR_KEYS]
def edges_d(user_d, geo_ats):
    """ create one dict per target user with information about one selected
    contact for each of the four types of contact """
    me = User(user_d)
    if not me.neighbors:
        return []
    nebrs = set(me.neighbors)
    me_usa = _in_usa(me.median_loc[0], me.median_loc[1])

    keys = {'just_followers': 'jfol',
            'just_friends': 'jfrd',
            'rfriends': 'rfrd',
            'just_mentioned': 'jat'}
    rels = dict(_id=me._id, mloc=me.median_loc)
    for long, short in keys.iteritems():
        amigos = [a for a in getattr(me, long) if a in nebrs]
        if not amigos:
            continue
        amigo = User.get_id(amigos[0])
        gnp = amigo.geonames_place.to_d()
        if gnp['mdist'] > 1000:
            continue
        rels[short] = dict(
            folc=amigo.followers_count,
            frdc=amigo.friends_count,
            lofrd=amigo.local_friends,
            lofol=amigo.local_followers,
            prot=amigo.protected,
            lat=gnp['lat'],
            lng=gnp['lng'],
            mdist=gnp['mdist'],
            _id=amigo._id,
            i_at=_ated(geo_ats, me._id, amigo._id),
            u_at=_ated(geo_ats, amigo._id, me._id),
            usa=me_usa and _in_usa(gnp['lng'], gnp['lat']),
        )
    return [rels]
def _my_contacts(user):
    return ((User.mod_id(c), c) for c in user.contacts)