def near_edges(daily_ats, user_locs, in_paths):
    """
    Yield a NearEdge for each edge in daily_ats whose two users live within
    25 miles of each other.

    daily_ats -- iterable of (kind, frm, to) triples
    user_locs -- mapping from user to a location; index 0 is passed to the
                 distance helper as the longitude and index 1 as the latitude
    in_paths -- sequence of paths; the part of the first path after its last
                '.' is parsed as a %Y-%m-%d date

    Edges where either user is missing from user_locs are dropped.  Each
    NearEdge carries frm, to, the distance, the number of days between the
    parsed date and 2012-08-01, and the counts of 'at' and 'rt' edges seen
    for that (frm, to) pair.
    """
    # FIXME: I'm really starting to dislike in_paths
    date_label = in_paths[0].split('.')[-1]
    parsed = datetime.datetime.strptime(date_label, "%Y-%m-%d")
    since_start = parsed.date() - datetime.date(2012, 8, 1)

    # edges[(frm, to)][kind] counts how often each kind of edge was seen
    edges = collections.defaultdict(lambda: collections.defaultdict(int))
    for kind, frm, to in daily_ats:
        if frm not in user_locs or to not in user_locs:
            continue
        edges[frm, to][kind] += 1

    # one fixed ordering of the pairs, shared by the arrays and the loop below
    pairs = list(edges)

    def _coords(endpoint, axis):
        return np.array([user_locs[pair[endpoint]][axis] for pair in pairs])

    from_lngs = _coords(0, 0)
    from_lats = _coords(0, 1)
    to_lngs = _coords(1, 0)
    to_lats = _coords(1, 1)
    dists = utils.np_haversine(from_lngs, to_lngs, from_lats, to_lats)

    for dist, (frm, to) in izip(dists, pairs):
        if dist < 25:
            kinds = edges[frm, to]
            yield NearEdge(
                frm,
                to,
                dist,
                since_start.days,
                kinds.get('at', 0),
                kinds.get('rt', 0),
            )
def predict(self, nebrs_d, vect_fit):
    """
    Return the index of the neighbor nearest to the median neighbor location.

    nebrs_d -- dict whose 'nebrs' entry is a sequence of records with 'lat'
               and 'lng' keys
    vect_fit -- unused by this predictor

    The median latitude and median longitude are taken independently, then
    the neighbor with the smallest distance to that point is chosen.
    """
    nebrs = nebrs_d['nebrs']
    lats = [nebr['lat'] for nebr in nebrs]
    lngs = [nebr['lng'] for nebr in nebrs]
    center_lat = np.median(lats)
    center_lng = np.median(lngs)
    return np.argmin(utils.np_haversine(center_lng, lngs, center_lat, lats))
def graph_example_probs(vect_fit, in_paths):
    """
    create an example of maximum likelihood estimation for four friends

    vect_fit -- iterable of (vers, cutoff, fit) triples; only the fits whose
                vers is 'leaf' are used
    in_paths -- sequence of paths; nothing happens (None is returned) unless
                the first path ends with the character '0'

    Sums the log of peek.contact_curve over a latitude/longitude grid for
    four fixed spots and saves the result as the greyscale image
    'example_probs.png'.  The cell with the largest summed log-probability
    maps to pixel value 0 and the smallest to 255.
    """
    if in_paths[0][-1] != '0':
        return

    leaf_fits = [fit for vers, cutoff, fit in vect_fit if vers == 'leaf']

    lat_range = np.linspace(27.01, 32.99, 5*60)
    lng_range = np.linspace(-100.99, -93.01, 5*80)
    lat_grid, lng_grid = np.meshgrid(lat_range, lng_range)
    print("%s %s" % (lat_range, lng_range))

    # each spot is (lng, lat, index into leaf_fits)
    spots = (
        (-95.31, 29.73, 0),  # Houston
        (-96.37, 30.67, 1),  # Bryan, TX
        (-99.25, 31.25, 5),  # Texas
        (-97.74, 30.27, 3),  # Austin
    )

    log_probs = np.zeros_like(lat_grid)
    for lng, lat, fit_index in spots:
        dists = utils.np_haversine(lng, lng_grid, lat, lat_grid)
        log_probs += np.log(peek.contact_curve(dists, *(leaf_fits[fit_index])))

    # rescale so the values span [0, 255.999] before truncating to uint8
    clipped = 255.999*(np.max(log_probs)-log_probs)/np.ptp(log_probs)
    pixels = np.require(np.transpose(clipped), np.uint8, ['C_CONTIGUOUS'])
    img = PIL.Image.frombuffer('L', (clipped.shape), pixels)
    img.save('example_probs.png')
def _calc_dists(nebrs_d):
    """
    Return the distances between each neighbor and every candidate point.

    nebrs_d -- dict with a 'nebrs' sequence of records holding 'lat' and
               'lng', and a 'gnp' entry that is either falsy or a record
               with 'lat' and 'lng'

    The candidate points are the neighbors themselves, followed by the gnp
    location when gnp is truthy.  The coordinate grids passed to
    utils.np_haversine have one row per neighbor and one column per
    candidate point.
    """
    gnp = nebrs_d['gnp']
    nebrs = nebrs_d['nebrs']
    lats = [nebr['lat'] for nebr in nebrs]
    lngs = [nebr['lng'] for nebr in nebrs]

    if gnp:
        all_lats = lats + [gnp['lat']]
        all_lngs = lngs + [gnp['lng']]
    else:
        all_lats = lats
        all_lngs = lngs

    lat_cols, lat_rows = np.meshgrid(all_lats, lats)
    lng_cols, lng_rows = np.meshgrid(all_lngs, lngs)
    return utils.np_haversine(lng_cols, lng_rows, lat_cols, lat_rows)
def exact_strange_bins(uids, mlocs):
    """
    find the distance between every contact and every target user

    uids -- ids of the contacts; duplicates are removed before lookup
    mlocs -- sequence of (lng, lat) pairs for the target users

    Returns an enumerate over the number of contact/target distances that
    fell into each bin of utils.dist_bins(120).
    """
    target_lngs, target_lats = np.transpose(mlocs)
    bins = utils.dist_bins(120)
    counts = np.zeros(len(bins)-1)

    for contact in _paged_users(set(uids), fields=['gnp']):
        contact_lat = contact.geonames_place.lat
        contact_lng = contact.geonames_place.lng
        dists = utils.np_haversine(
            contact_lng, target_lngs, contact_lat, target_lats)
        # only the per-bin counts are needed, not the returned bin edges
        counts += np.histogram(dists, bins)[0]

    return enumerate(counts)
def _dists_for_lat(lat):
    """
    Return distances from one reference point to every cell of a grid.

    The grid has 1801 longitudes from .05 to 180.05 and 1800 latitudes from
    -89.95 to 89.95 (0.1 degree steps).  The reference point sits at
    longitude .05 and latitude .05 + .1*_tile(lat).
    """
    centered_lat = .05 + .1*_tile(lat)

    lat_range = np.linspace(-89.95, 89.95, 1800)
    lng_range = np.linspace(.05, 180.05, 1801)
    lat_grid, lng_grid = np.meshgrid(lat_range, lng_range)

    # constant arrays holding the reference point, shaped like the grid
    ref_lats = np.empty_like(lat_grid)
    ref_lats[...] = centered_lat
    ref_lngs = np.empty_like(lat_grid)
    ref_lngs[...] = .05

    return utils.np_haversine(ref_lngs, lng_grid, ref_lats, lat_grid)
def mdist_real(nebrs_d):
    """
    compare median location error to the actual location error for the
    target users after adding noise to home location in mloc_blur

    nebrs_d -- iterable of dicts with a 'gnp' record (holding 'lat', 'lng'
               and 'mdist') and an 'mloc' pair of (lng, lat); entries whose
               'gnp' is falsy are skipped

    Returns an iterator of (mdist, distance between mloc and gnp) pairs.
    """
    gnp_lats, gnp_lngs = [], []
    mloc_lngs, mloc_lats = [], []
    mdists = []

    for nebr_d in nebrs_d:
        gnp = nebr_d['gnp']
        if not gnp:
            continue
        gnp_lats.append(gnp['lat'])
        gnp_lngs.append(gnp['lng'])
        mloc = nebr_d['mloc']
        mloc_lngs.append(mloc[0])
        mloc_lats.append(mloc[1])
        mdists.append(gnp['mdist'])

    dists = utils.np_haversine(mloc_lngs, gnp_lngs, mloc_lats, gnp_lats)
    return itertools.izip(mdists, dists)
def stranger_prob(lat_tile, contact_count):
    """
    Calculate pStrangers for every longitude tile at a specific latitude.

    pStrangers is the probability that a user lives at a location given the
    locations of people they are not connected to.  This step of
    FriendlyLocation took about 2 weeks on a machine with 8 cores.

    lat_tile should be between -900 and 900 and represents a latitude
    contact_count is a matrix of the locations of the contacts

    Yields ((lng_tile, lat_tile), prob) for lng_tile from -1800 to 1799,
    where prob is the sum over the grid of
    contact_count * log(1 - utils.contact_prob(distance)).
    """
    lat_range = np.linspace(-89.95, 89.95, 1800)
    lng_range = np.linspace(.05, 359.95, 3600)
    lat_grid, lng_grid = np.meshgrid(lat_range, lng_range)

    tile_lat = .1*lat_tile+.05
    dists = utils.np_haversine(.05, lng_grid, tile_lat, lat_grid)

    # FIXME: the name of a slurped command-line argument should not have to
    # match the file name

    # Row 0 is longitude .05 and column lat_tile+900 is the tile's own
    # latitude, so this is the cell holding the reference point itself; its
    # distance is overwritten with 2.
    dists[0, lat_tile+900] = 2

    # NOTE(review): if utils.contact_prob is purely element-wise, the log
    # could be computed once and the result rolled instead of dists --
    # confirm before changing.
    for lng_tile in xrange(-1800, 1800):
        log_not_contact = np.log(1-utils.contact_prob(dists))
        total = np.sum(contact_count*log_not_contact)
        yield (lng_tile, lat_tile), total
        # shift the distance grid one longitude step for the next tile
        dists = np.roll(dists, 1, 0)