def predict_probability_area(self, upper_bound, lower_bound, estimated_loc):
    '''
    Return a confidence range for a prediction relative to a bounding box.

    The distance from the estimated location to each of the four sides of
    the box is measured with haversine (along the estimate's own meridian
    for the top/bottom sides and along its own parallel for the left/right
    sides). The nearest and farthest of those distances are normalized by
    the estimate's dispersion and dispersion standard deviation and passed
    to self.lookup.

    Args:
        upper_bound (geoCoord): bounding box top right geoCoord
        lower_bound (geoCoord): bounding box bottom left geoCoord
        estimated_loc (LocEstimate): the estimated location; its geo_coord,
            dispersion and dispersion_std_dev attributes are read

    Returns:
        Tuple(float, float): (self.lookup result for the nearest side,
            self.lookup result for the farthest side), documented by the
            original author as (min probability, max probability)
    '''
    center = estimated_loc.geo_coord

    # One (lat, lon) point per side, in the order: top, bottom, right, left.
    side_points = (
        (upper_bound.lat, center.lon),
        (lower_bound.lat, center.lon),
        (center.lat, upper_bound.lon),
        (center.lat, lower_bound.lon),
    )
    side_dists = [haversine(center, GeoCoord(lat, lon)) for lat, lon in side_points]

    def normalized(dist):
        # Offset by the dispersion and scale by its standard deviation.
        # NOTE(review): there is no guard for dispersion_std_dev == 0 here;
        # confirm callers never pass such an estimate to this method.
        return (dist - estimated_loc.dispersion) / estimated_loc.dispersion_std_dev

    nearest_prob = self.lookup(normalized(min(side_dists)))
    farthest_prob = self.lookup(normalized(max(side_dists)))
    return (nearest_prob, farthest_prob)
def load_from_rdds(locs_known, edges, desired_samples=1000, dispersion_threshold=150, neighbor_threshold=3):
    '''
    Creates an EstimatorCurve from known locations and the network edges.

    For each node with a known location and enough neighbors that also have
    known locations, a location is estimated with median() over the
    neighbors' locations (weighted by edge weight) and compared against the
    node's known location. Two curves are built from those comparisons: one
    from the normalized error, (distance - dispersion) / dispersion_std_dev,
    and one from the raw distance for estimates whose dispersion_std_dev
    is zero.

    Args:
        locs_known (rdd of LocEstimate): RDD of locations that are known,
            keyed by node id (it is joined against edge ids)
        edges (rdd of (src_id (dest_id, weight)): RDD of edges in the network
        desired_samples (int): Limit the curve to just a sample of data;
            passed through to EstimatorCurve.build_curve
        dispersion_threshold (number): only estimates whose dispersion is
            strictly below this value contribute to the curves
        neighbor_threshold (int): minimum number of neighbors with known
            locations a node needs before it is estimated

    Returns:
        EstimatorCurve: A new EstimatorCurve representing the known input data
    '''
    # The helpers below unpack their single argument inside the body rather
    # than in the parameter list: tuple parameters were removed from the
    # language by PEP 3113, and this form runs on both Python 2 and 3.

    def edge_destination(edge):
        _src_id, (dst_id, _weight) = edge
        return dst_id

    def keep_if_destination_known(keyed_edge):
        _dst_id, (edge, loc_known) = keyed_edge
        return [edge] if loc_known is not None else []

    def source_loc_by_destination(joined):
        _src_id, ((dst_id, weight), src_loc) = joined
        return (dst_id, (src_loc, weight))

    def has_enough_neighbors(grouped):
        _node_id, neighbors = grouped
        return len(neighbors) >= neighbor_threshold

    def estimate_from_neighbors(neighbors):
        return median(haversine,
                      [loc for loc, _w in neighbors],
                      [w for _loc, w in neighbors])

    def with_error_distance(loc_pair):
        found_loc, known_loc = loc_pair
        return (found_loc, known_loc,
                haversine(known_loc.geo_coord, found_loc.geo_coord))

    def below_dispersion_threshold(record):
        _node_id, (found_loc, _known_loc, _dist) = record
        return found_loc.dispersion < dispersion_threshold

    def has_zero_std_dev(record):
        _node_id, (found_loc, _known_loc, _dist) = record
        return found_loc.dispersion_std_dev == 0

    def normalized_error(result):
        found_loc, _known_loc, dist = result
        if found_loc.dispersion_std_dev != 0:
            return (dist - found_loc.dispersion) / found_loc.dispersion_std_dev
        # Zero-std-dev estimates still contribute a 0 to the main curve.
        return 0

    def raw_distance(result):
        _found_loc, _known_loc, dist = result
        return dist

    # Keep only edges whose destination has a known location, so every
    # estimate produced below can be checked against ground truth.
    known_edges = edges.keyBy(edge_destination)\
                       .leftOuterJoin(locs_known)\
                       .flatMap(keep_if_destination_known)

    # Attach each source's known location, regroup by destination, estimate
    # the destination from its neighbors, then measure the estimate's
    # distance from the destination's known location.
    medians = known_edges.join(locs_known)\
                         .map(source_loc_by_destination)\
                         .groupByKey()\
                         .filter(has_enough_neighbors)\
                         .mapValues(estimate_from_neighbors)\
                         .join(locs_known)\
                         .mapValues(with_error_distance)\
                         .filter(below_dispersion_threshold)

    # Some medians might have std_devs of zero; those get their own curve
    # built from the raw distance.
    close_locs = medians.filter(has_zero_std_dev)

    values = medians.values().map(normalized_error)
    values_wo_stdev = close_locs.values().map(raw_distance)

    return EstimatorCurve(EstimatorCurve.build_curve(values, desired_samples),
                          EstimatorCurve.build_curve(values_wo_stdev, desired_samples))
def load_from_rdds(locs_known, edges, desired_samples=1000, dispersion_threshold=150, neighbor_threshold=3):
    '''
    Creates an EstimatorCurve from known locations and the network edges.

    For each node with a known location and enough neighbors that also have
    known locations, a location is estimated with median() over the
    neighbors' locations (weighted by edge weight) and compared against the
    node's known location. Two curves are built from those comparisons: one
    from the normalized error, (distance - dispersion) / dispersion_std_dev,
    and one from the raw distance for estimates whose dispersion_std_dev
    is zero.

    Args:
        locs_known (rdd of LocEstimate): RDD of locations that are known,
            keyed by node id (it is joined against edge ids)
        edges (rdd of (src_id (dest_id, weight)): RDD of edges in the network
        desired_samples (int): Limit the curve to just a sample of data;
            passed through to EstimatorCurve.build_curve
        dispersion_threshold (number): only estimates whose dispersion is
            strictly below this value contribute to the curves
        neighbor_threshold (int): minimum number of neighbors with known
            locations a node needs before it is estimated

    Returns:
        EstimatorCurve: A new EstimatorCurve representing the known input data
    '''
    # The helpers below unpack their single argument inside the body rather
    # than in the parameter list: tuple parameters were removed from the
    # language by PEP 3113, and this form runs on both Python 2 and 3.

    def edge_destination(edge):
        _src_id, (dst_id, _weight) = edge
        return dst_id

    def keep_if_destination_known(keyed_edge):
        _dst_id, (edge, loc_known) = keyed_edge
        return [edge] if loc_known is not None else []

    def source_loc_by_destination(joined):
        _src_id, ((dst_id, weight), src_loc) = joined
        return (dst_id, (src_loc, weight))

    def has_enough_neighbors(grouped):
        _node_id, neighbors = grouped
        return len(neighbors) >= neighbor_threshold

    def estimate_from_neighbors(neighbors):
        return median(haversine,
                      [loc for loc, _w in neighbors],
                      [w for _loc, w in neighbors])

    def with_error_distance(loc_pair):
        found_loc, known_loc = loc_pair
        return (found_loc, known_loc,
                haversine(known_loc.geo_coord, found_loc.geo_coord))

    def below_dispersion_threshold(record):
        _node_id, (found_loc, _known_loc, _dist) = record
        return found_loc.dispersion < dispersion_threshold

    def has_zero_std_dev(record):
        _node_id, (found_loc, _known_loc, _dist) = record
        return found_loc.dispersion_std_dev == 0

    def normalized_error(result):
        found_loc, _known_loc, dist = result
        if found_loc.dispersion_std_dev != 0:
            return (dist - found_loc.dispersion) / found_loc.dispersion_std_dev
        # Zero-std-dev estimates still contribute a 0 to the main curve.
        return 0

    def raw_distance(result):
        _found_loc, _known_loc, dist = result
        return dist

    # Keep only edges whose destination has a known location, so every
    # estimate produced below can be checked against ground truth.
    known_edges = edges.keyBy(edge_destination)\
                       .leftOuterJoin(locs_known)\
                       .flatMap(keep_if_destination_known)

    # Attach each source's known location, regroup by destination, estimate
    # the destination from its neighbors, then measure the estimate's
    # distance from the destination's known location.
    medians = known_edges.join(locs_known)\
                         .map(source_loc_by_destination)\
                         .groupByKey()\
                         .filter(has_enough_neighbors)\
                         .mapValues(estimate_from_neighbors)\
                         .join(locs_known)\
                         .mapValues(with_error_distance)\
                         .filter(below_dispersion_threshold)

    # Some medians might have std_devs of zero; those get their own curve
    # built from the raw distance.
    close_locs = medians.filter(has_zero_std_dev)

    values = medians.values().map(normalized_error)
    values_wo_stdev = close_locs.values().map(raw_distance)

    return EstimatorCurve(EstimatorCurve.build_curve(values, desired_samples),
                          EstimatorCurve.build_curve(values_wo_stdev, desired_samples))