Example #1
def lookup_contacts(contact_uids,mdists,env):
    """
    lookup user profiles for contacts or leafs
    """
    twit = twitter.TwitterResource()
    gis = gisgraphy.GisgraphyResource()
    gis.set_mdists(mdists)

    # FIXME: we need a better way to know which file we are on.
    # FIXME: use the new input_paths thing
    first, contact_uids = utils.peek(contact_uids)
    group = User.mod_id(first)
    logging.info('lookup old uids for %s',group)
    save_name = 'saved_users.%s'%group
    if env.name_exists(save_name):
        stored = set(env.load(save_name))
    else:
        stored = User.mod_id_set(int(group))
    logging.info('loaded mod_group %s of %d users',group,len(stored))
    missing = (id for id in contact_uids if id not in stored)

    chunks = utils.grouper(100, missing, dontfill=True)
    for chunk in chunks:
        users = twit.user_lookup(user_ids=list(chunk))
        for amigo in filter(None,users):
            assert User.mod_id(amigo._id)==group
            amigo.geonames_place = gis.twitter_loc(amigo.location)
            amigo.merge()
        yield len(users)
Example #2
def cheap_locals(nebr_ids,mloc_uids,cutoff=20):
    """
    local contact ratio based on 20 leafs
    """
    seen = set()
    # There can be duplicates because nebr_ids is created by clumping nebr_split
    for nebr_id in nebr_ids:
        if nebr_id in seen:
            continue
        seen.add(nebr_id)

        user = User.get_id(nebr_id)
        user_loc = user.geonames_place.to_d()

        cids = [
            cid
            for key in User.NEBR_KEYS
            for cid in (getattr(user,key) or [])
            if cid not in mloc_uids
            ]
        if not cids:
            continue
        random.shuffle(cids)
        leafs = User.find(User._id.is_in(cids[:cutoff]), fields=['gnp'])

        dists = [
            coord_in_miles(user_loc,leaf.geonames_place.to_d())
            for leaf in leafs
            if leaf.has_place()
        ]
        if dists:
            blur = sum(1.0 for d in dists if d<25)/len(dists)
            yield user._id,blur
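A tiny worked example of the ratio computed above, with made-up distances in miles:

dists = [3.0, 10.0, 40.0, 5.0]                        # distances to four sampled leafs (made-up values)
blur = sum(1.0 for d in dists if d<25)/len(dists)
print(blur)                                           # 0.75: three of the four leafs are within 25 miles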
Example #3
def pick_nebrs(mloc_uid):
    """
    For each target user, pick the 25 located contacts.
    """
    # reads predict.prep.mloc_uids; requires lookup_contacts, but does not read it.
    user = User.get_id(mloc_uid)
    user.neighbors = _pick_neighbors(user)
    user.save()
    return ((User.mod_id(n),n) for n in user.neighbors)
Example #4
    def test_fix_mloc_mdists(self):
        self.FS["mdists"] = [dict(other=2)]
        self.FS["mloc_uids.03"] = [3, 103]
        User(_id=3, location="Texas").save()
        User(_id=103, location="Bryan, TX").save()
        with _patch_gisgraphy():
            self.gob.run_job("fix_mloc_mdists")
        u3 = User.get_id(3)
        u103 = User.get_id(103)
        self.assertEqual(u3.geonames_place.mdist, 2000)
        self.assertEqual(u103.geonames_place.mdist, 2)
Example #5
def _paged_users(uids, **find_kwargs):
    # save some round trips by asking for 100 at a time
    groups = utils.grouper(100, uids, dontfill=True)
    return chain.from_iterable(
        User.find(User._id.is_in(list(group)), **find_kwargs)
        for group in groups
    )
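The same batching idea can be sketched with only the standard library; chunked() below mimics utils.grouper(100, uids, dontfill=True), and fetch_batch() is a hypothetical stand-in for a per-batch database query, not part of this project:

import itertools

def chunked(iterable, size):
    # yield lists of up to `size` items, like utils.grouper(size, it, dontfill=True)
    it = iter(iterable)
    while True:
        chunk = list(itertools.islice(it, size))
        if not chunk:
            return
        yield chunk

def fetch_batch(ids):
    # hypothetical stand-in for one database round trip per batch
    return [dict(_id=i) for i in ids]

def paged_fetch(uids, batch_size=100):
    # one query per batch of ids instead of one query per id
    return itertools.chain.from_iterable(
        fetch_batch(group) for group in chunked(uids, batch_size)
    )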
Example #6
def find_contacts(user_ds):
    """
    for each target user, fetch edges and tweets, pick 100 located contact ids
    """
    gis = gisgraphy.GisgraphyResource()
    twit = twitter.TwitterResource()
    for user_d in itertools.islice(user_ds,2600):
        user = User.get_id(user_d['id'])
        if user:
            logging.warn("not revisiting %d",user._id)
        else:
            user = User(user_d)
            user.geonames_place = gis.twitter_loc(user.location)
            _save_user_contacts(twit, user, _pick_random_contacts, limit=100)
        for mod_nebr in _my_contacts(user):
            yield mod_nebr
Example #7
def rfr_triads(user_d):
    """
    find a target user with a social triangle and a recip friend not in that
    triangle. Return info about all four users.
    """
    # We are looking for this structure in the social graph:
    # my  you---our
    #   \  |  /
    #      me
    # me is a target user, the other users are contacts, and the edges are all
    # reciprocal.
    me = User(user_d)
    me_rfr = set(me.rfriends or []).intersection(me.neighbors or [])
    if len(me_rfr)<3:
        return []
    for you_id in me_rfr:
        you_ed = Edges.get_id(you_id)
        if not you_ed:
            continue #There are no edges for this neighbor.
        ours = me_rfr.intersection(you_ed.friends,you_ed.followers)
        mine = me_rfr.difference(you_ed.friends,you_ed.followers)
        if ours and mine:
            d = dict(
                me = dict(_id=me._id,loc=me.median_loc),
                you = dict(_id=you_id),
                my = dict(_id=random.choice(list(mine))),
                our = dict(_id=random.choice(list(ours))),
                )
            for k,v in d.iteritems():
                if k=='me': continue
                gnp = User.get_id(v['_id'],fields=['gnp']).geonames_place.to_d()
                gnp.pop('zipcode',None)
                v['loc'] = gnp
            return [d]
    return []
Example #8
    def test_find_contacts(self):
        self._find_contacts_6()
        results = self.FS["find_contacts.06"]
        s_res = sorted(list(r[1])[0] for r in results)
        self.assertEqual(s_res, [0, 1, 2, 3, 7, 12, 18, 24, 30])
        flor = User.get_id(6)
        self.assertEqual(flor.just_mentioned, [7])
        self.assertEqual(sorted(flor.just_friends), [12, 18, 24, 30])
Example #9
def saved_users():
    """
    Create a set of ids already in the database so that lookup_contacts
    can skip these users.  Talking to the database in lookup_contacts to check
    if users are in the database is too slow.
    """
    users = User.database.User.find({},fields=[],timeout=False)
    return ((User.mod_id(u['_id']),u['_id']) for u in users)
Example #10
def geo_ats():
    """
    fetch all at-mentions from the database
    """
    for tweets in Tweets.find({},fields=['ats']):
        if tweets.ats:
            uid = tweets._id
            yield User.mod_id(uid), (uid,tweets.ats)
Example #11
def find_leafs(uid):
    """
    for each contact, fetch edges and tweets, pick 100 leaf ids
    """
    twit = twitter.TwitterResource()
    user = User.get_id(uid)
    _save_user_contacts(twit, user, _pick_random_contacts, limit=100)
    return _my_contacts(user)
Example #12
def at_tuples(geo_at):
    """
    create (mentioned user, tweet creator) pairs from geo_ats, and split based
    on the id of the mentioned user
    """
    uid,ats = geo_at
    for at in ats:
        yield User.mod_id(at), (at,uid)
Example #13
def pred_users(uids):
    """
    fetch target users from database
    """
    for g in utils.grouper(100,uids,dontfill=True):
        ids_group = tuple(g)
        for u in User.find(User._id.is_in(ids_group)):
            yield u.to_d()
Example #14
def parse_geotweets(tweets):
    """
    read tweets from Twitter's streaming API and save users and their tweets
    USAGE: gunzip -c ~/may/*/*.gz | ./gb.py -s parse_geotweets
    """
    # We save users and locations intermingled because this data is too big to
    # fit in memory, and we do not want to do two passes.
    users = set()
    for i,t in enumerate(tweets):
        if i%10000==0:
            logging.info("read %d tweets"%i)
        if 'id' not in t: continue # this is not a tweet
        uid = t['user']['id']
        if not t.get('coordinates'): continue
        if uid not in users:
            yield User.mod_id(uid),t['user']
            users.add(uid)
        yield User.mod_id(uid),(uid,t['coordinates']['coordinates'])
    logging.info("sending up to %d users"%len(users))
Example #15
def nebr_dists(mloc_tile):
    """
    find the distances from target users to their contacts
    """
    nebrs = User.find(User._id.is_in(mloc_tile['nebrs']),fields=['gnp'])
    for nebr in nebrs:
        dist = coord_in_miles(mloc_tile['mloc'], nebr.geonames_place.to_d())
        # add a one at the end to make the output format identical to
        # stranger_dists.
        yield dist,1
Example #16
def mloc_tile(mloc_uids):
    """
    split the target users into tiles based on their home location
    """
    users = User.find(User._id.is_in(tuple(mloc_uids)),fields=['mloc','nebrs'])
    for user in users:
        if not user.neighbors:
            continue
        lng,lat = user.median_loc
        yield _tile(lat),user.to_d()
Example #17
    def test_lookup_contacts(self):
        self.FS["mdists"] = [dict(other=2.5)]
        self.FS["contact_split.04"] = [4, 404]
        User.database.User = mock.MagicMock()
        User.database.User.find.return_value = [
            # MockTwitterResource will throw a 404 if you lookup user 404.
            # This lets us know the user was skipped.
            dict(_id=404)
        ]

        with _patch_twitter():
            with _patch_gisgraphy():
                self.gob.run_job("lookup_contacts")

        beryl = User.get_id(4)
        self.assertEqual(beryl.screen_name, "user_4")
        self.assertEqual(beryl.geonames_place.feature_code, "PPLA2")
        self.assertEqual(beryl.geonames_place.mdist, 3)
        missing = User.get_id(404)
        self.assertEqual(missing, None)
Example #18
def mloc_uids(user_ds):
    """
    pick 2500 target users who have locations and good contacts
    """
    retrieved = [u['id'] for u in itertools.islice(user_ds,2600)]
    users = User.find(User._id.is_in(retrieved))
    good_ = { u._id for u in users if any(getattr(u,k) for k in NEBR_KEYS)}
    good = [uid for uid in retrieved if uid in good_]
    logging.info("found %d of %d",len(good),len(retrieved))
    # throw away accounts that didn't work to get down to the 2500 good users
    return good[:2500]
Example #19
    def test_find_contacts_errors(self):
        self.FS["mloc_users.04"] = [dict(id=404)]
        self.FS["mloc_users.03"] = [dict(id=503)]
        with _patch_twitter():
            self.gob.run_job("find_contacts")
        for uid in (404, 503):
            missing = User.get_id(uid)
            self.assertEqual(missing.error_status, uid)
            self.assertEqual(missing.neighbors, None)
            self.assertEqual(missing.rfriends, None)
            self.assertEqual(Edges.get_id(uid), None)
            self.assertEqual(Tweets.get_id(uid), None)
Example #20
def nebrs_d(user_d,mloc_blur):
    """
    create dict with lots of information about a target user's located contacts
    """
    mb = MlocBlur(*mloc_blur)
    user = User(user_d)
    nebrs = User.find(User._id.is_in(user_d['nebrs']))
    tweets = Tweets.get_id(user_d['_id'],fields=['ats'])

    res = make_nebrs_d(user,nebrs,tweets.ats)
    res['mloc'] = user_d['mloc']
    res['gnp'] = _blur_gnp(mb, user_d)
    return [res]
Example #21
def _fetch_profiles(uids,twit,gis):
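    # load the profiles already stored in the database, then fetch the rest
    # from the Twitter API in batches of 100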
    users = list(User.find(User._id.is_in(uids)))
    existing_ids = {u._id for u in users}
    missing_ids = [uid for uid in uids if uid not in existing_ids]

    chunks = utils.grouper(100, missing_ids, dontfill=True)
    for chunk in chunks:
        found = twit.user_lookup(user_ids=list(chunk))
        for amigo in filter(None,found):
            amigo.geonames_place = gis.twitter_loc(amigo.location)
            amigo.merge()
            users.append(amigo)
    return users
Example #22
    def predict(self, user_d, steps=0):
        """
        Attempt to locate a Twitter user.
            user_d should be a Twitter-style user dictionary
            steps is the number of steps on the social graph to crawl. It should
                be 0, 1, or 2. If 0, predict makes no Twitter API calls, 1 uses
                4 calls, and 2 uses around 80 API calls.
        returns (longitude, latitude) or None if no location can be found
        """
        user = User(user_d)
        if steps == 0 and not user.location:
            return None

        gnp = self.gis.twitter_loc(user.location)

        if steps == 0:
            return gnp.to_tup() if gnp else None

        if gnp and gnp.mdist < MAX_GNP_MDIST:
            user.geonames_place = gnp
            return gnp.to_tup()

        _crawl_pred_one(user, self.twit, self.gis, self.pred, fast=(steps == 1))
        return user.pred_loc
Example #23
def trash_extra_mloc(mloc_uids):
    "remove the mloc_users that mloc_uids skipped over"
    # This scares me a bit, but it's too late to go back and fix find_contacts.
    # I really wish I had limited find_contacts to stop after 2500 good users.
    db = User.database
    mloc_uids = set(mloc_uids)
    group_ = set(uid%100 for uid in mloc_uids)
    assert len(group_)==1
    group = next(iter(group_))
    stored = User.mod_id_set(group)
    trash = list(stored - mloc_uids)
    logging.info("trashing %d users",len(trash))
    logging.debug("full list: %r",trash)
    db.Edges.remove({'_id':{'$in':trash}})
    db.Tweets.remove({'_id':{'$in':trash}})
    db.User.remove({'_id':{'$in':trash}})
Example #24
def _pick_neighbors(user):
    nebrs = {}
    for key in NEBR_KEYS:
        cids = getattr(user,key)
        if not cids:
            continue

        # this is slowish
        contacts = User.find(User._id.is_in(cids), fields=['gnp'])
        nebrs[key] = set(u._id for u in contacts if u.has_place())

    picked_ = filter(None,
                itertools.chain.from_iterable(
                    itertools.izip_longest(*nebrs.values())))
    picked = picked_[:25]
    logging.info('picked %d of %d contacts',len(picked),len(user.contacts))
    return picked
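For reference, the interleave-then-truncate step above behaves like this Python 3 sketch (the ids are made up; the original uses Python 2's izip_longest and filter):

from itertools import chain, zip_longest

by_key = {
    'rfriends': [1, 2, 3],
    'just_friends': [10, 20],
    'just_followers': [100],
}
# take one id from each contact type in turn, drop the None padding, keep the first 25
interleaved = [cid for cid in chain.from_iterable(zip_longest(*by_key.values()))
               if cid is not None]
print(interleaved[:25])    # [1, 10, 100, 2, 20, 3]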
Example #25
def fix_mloc_mdists(mloc_uids,mdists):
    """
    Add the median location error to profiles of contacts and target users.
    """
    gis = gisgraphy.GisgraphyResource()
    gis.set_mdists(mdists)
    # We didn't have mdists at the time the mloc users were saved. This
    # function could be avoided by running the mdist calculation before
    # running find_contacts.
    fixed = 0
    users = User.find(User._id.is_in(tuple(mloc_uids)))
    for user in users:
        user.geonames_place = gis.twitter_loc(user.location)
        user.save()
        if user.geonames_place:
            fixed+=1
    logging.info("fixed %d mdists",fixed)
    return [fixed]
Example #26
def total_contacts(user_ds):
    """
    count the total number of contacts (to include in the paper)
    """
    for user_d in itertools.islice(user_ds,2600):
        user = User.get_id(user_d['id'])

        if not user:
            yield "no user"
        elif user.error_status:
            yield str(user.error_status)
        else:
            edges = Edges.get_id(user._id)
            tweets = Tweets.get_id(user._id)
            if not edges or not tweets:
                yield "no contacts"
            else:
                sets = _contact_sets(tweets,edges)
                yield [len(sets[k]) for k in User.NEBR_KEYS]
Example #27
def edges_d(user_d, geo_ats):
    """
    create one dict per target user with information about one selected contact
    for each of the four types of contact
    """
    me = User(user_d)
    if not me.neighbors:
        return []
    nebrs = set(me.neighbors)
    me_usa = _in_usa(me.median_loc[0],me.median_loc[1])

    keys = {'just_followers':'jfol',
            'just_friends':'jfrd',
            'rfriends':'rfrd',
            'just_mentioned':'jat'}
    rels = dict(_id = me._id, mloc = me.median_loc)
    for long,short in keys.iteritems():
        amigos = [a for a in getattr(me,long) if a in nebrs]
        if not amigos:
            continue
        amigo = User.get_id(amigos[0])
        gnp = amigo.geonames_place.to_d()
        if gnp['mdist']>1000:
            continue
        rels[short] = dict(
                folc=amigo.followers_count,
                frdc=amigo.friends_count,
                lofrd=amigo.local_friends,
                lofol=amigo.local_followers,
                prot=amigo.protected,
                lat=gnp['lat'],
                lng=gnp['lng'],
                mdist=gnp['mdist'],
                _id=amigo._id,
                i_at=_ated(geo_ats,me._id,amigo._id),
                u_at=_ated(geo_ats,amigo._id,me._id),
                usa = me_usa and _in_usa(gnp['lng'],gnp['lat']),
                )

    return [rels]
Example #28
def _my_contacts(user):
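    # pair each contact id with its mod group so downstream jobs can split the output by group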
    return ((User.mod_id(c),c) for c in user.contacts)