def region(self, lat, lon):
    """
    Return a region code matching the provided position.

    Candidates are narrowed in three passes: the RTree of buffered
    region envelopes, the buffered polygon shapes and finally the
    precise polygon shapes. If that still leaves a tie, the distance
    between the position and the boundary vertices of each candidate
    region decides.

    If the position is not found inside any region return None.
    """
    def boundary_coords(code):
        # Return the boundary vertices of a region, flattening
        # multi-part boundaries into a single list.
        boundary = self._shapes[code].boundary
        if isinstance(boundary, geometry.base.BaseMultipartGeometry):
            flattened = []
            for part in boundary.geoms:
                flattened.extend(part.coords)
            return flattened
        return boundary.coords

    def codes_by_vertex_distance(candidates):
        # Map the distance of every boundary vertex to its region code.
        # Vertices are (lon, lat) pairs. Equal distances overwrite each
        # other, so the candidate processed last wins.
        by_distance = {}
        for code in candidates:
            for vertex in boundary_coords(code):
                by_distance[geocalc.distance(
                    vertex[1], vertex[0], lat, lon)] = code
        return by_distance

    # Look up point in RTree of buffered region envelopes.
    # This is a coarse-grained but very fast match.
    point = geometry.Point(lon, lat)
    codes = [self._tree_ids[id_]
             for id_ in self._tree.intersection(point.bounds)]
    if not codes:
        return None

    # Match point against the buffered polygon shapes.
    buffered_codes = [code for code in codes
                      if self._buffered_shapes[code].contains(point)]
    if not buffered_codes:
        return None
    if len(buffered_codes) == 1:
        return buffered_codes[0]

    # Match point against the precise polygon shapes.
    precise_codes = [code for code in buffered_codes
                     if self._prepared_shapes[code].contains(point)]
    if len(precise_codes) == 1:
        return precise_codes[0]

    if not precise_codes:
        # Point wasn't in any precise region, pick the buffered region
        # owning the closest boundary vertex.
        distances = codes_by_vertex_distance(buffered_codes)
        return distances[min(distances)]

    # Point was in multiple overlapping regions, pick the region owning
    # the boundary vertex that is farthest away from the point.
    # NOTE(review): this selects by the single farthest vertex, not by
    # the largest minimum distance to the border -- confirm intended.
    distances = codes_by_vertex_distance(precise_codes)
    return distances[max(distances)]
def cluster_wifis(networks):
    """
    Group WiFi networks into clusters based on their pairwise distance.

    Only clusters with at least MIN_WIFIS_IN_CLUSTER networks are
    returned. Otherwise someone could use a combination of one real
    network and one fake and therefore not found network to get the
    position of the real network.

    Returns a list of network arrays, which may be empty.
    """
    count = len(networks)
    if count < MIN_WIFIS_IN_CLUSTER:
        # Not enough WiFis to form a valid cluster.
        return []

    positions = networks[['lat', 'lon']]

    if count == 2:
        first, second = positions[0], positions[1]
        apart = distance(first[0], first[1], second[0], second[1])
        if apart <= MAX_WIFI_CLUSTER_METERS:
            # Only two WiFis and they agree, so cluster them.
            return [networks]
        # Or they disagree forming two clusters of size one,
        # neither of which is large enough to be returned.
        return []

    # Calculate the condensed distance matrix based on distance in meters.
    # This avoids calculating the square form, which would calculate
    # each value twice and avoids calculating the diagonal of zeros.
    # The special cases for fewer than two networks are avoided by the
    # checks above.
    # See scipy.spatial.distance.squareform and
    # https://stackoverflow.com/questions/13079563
    condensed = numpy.zeros(count * (count - 1) // 2, dtype=numpy.double)
    pairs = itertools.combinations(positions, 2)
    for index, (left, right) in enumerate(pairs):
        condensed[index] = distance(left[0], left[1], right[0], right[1])

    linkage = hierarchy.linkage(condensed, method='complete')
    labels = hierarchy.fcluster(
        linkage, MAX_WIFI_CLUSTER_METERS, criterion='distance', depth=2)

    grouped = defaultdict(list)
    for label, network in zip(labels, networks):
        grouped[label].append(network)

    return [numpy.array(members, dtype=NETWORK_DTYPE)
            for members in grouped.values()
            if len(members) >= MIN_WIFIS_IN_CLUSTER]
def aggregate_cell_position(networks, min_accuracy, max_accuracy):
    """
    Calculate the aggregate position of the user inside the given
    cluster of networks.

    A single network is returned as is, with its radius as accuracy.
    Otherwise the position is the mean of all network positions,
    weighted by score / signal ** 2.

    Return a (lat, lon, accuracy, score) tuple of floats. The accuracy
    is bounded by the min_accuracy and max_accuracy.
    """
    def bounded(value):
        # Apply the lower bound first, so the upper bound wins if the
        # two bounds contradict each other.
        return min(max(value, min_accuracy), max_accuracy)

    if len(networks) == 1:
        only = networks[0]
        lat = only['lat']
        lon = only['lon']
        radius = bounded(only['radius'])
        score = only['score']
        return (float(lat), float(lon), float(radius), float(score))

    coordinates = numpy.array(
        [(net['lat'], net['lon']) for net in networks],
        dtype=numpy.double)
    weights = numpy.array(
        [net['score'] / math.pow(net['signal'], 2) for net in networks],
        dtype=numpy.double)

    lat, lon = numpy.average(coordinates, axis=0, weights=weights)
    score = networks['score'].sum()

    # Guess the accuracy as the 95th percentile of the distances
    # from the lat/lon to the positions of all networks.
    spread = numpy.array(
        [distance(lat, lon, net['lat'], net['lon']) for net in networks],
        dtype=numpy.double)
    accuracy = bounded(numpy.percentile(spread, 95))

    return (float(lat), float(lon), float(accuracy), float(score))
def test_circle_radius(self):
    """Check position and radius aggregated from two 100.0 radius circles."""
    first = (1.0, 1.0, 100.0)
    second = (1.001, 1.001, 100.0)
    circles = numpy.array([first, second], dtype=numpy.double)

    lat, lon, radius = aggregate_position(circles, 10.0)

    # The position is expected exactly half-way between both circles.
    self.assertEqual((lat, lon), (1.0005, 1.0005))
    # The radius has to reach a circle center plus that circle's radius.
    expected_radius = distance(lat, lon, 1.0, 1.0) + 100.0
    self.assertAlmostEqual(expected_radius, radius, places=7)
def best_cluster(self):
    """
    Return the best cluster from this collection.

    Collections with at most one result are returned unchanged.
    Otherwise each result seeds a cluster, which also contains every
    less accurate result close enough to it. The cluster with the
    highest cumulative score is returned as a list of results.
    """
    if len(self) <= 1:
        return self

    ordered = sorted(self, key=operator.attrgetter('accuracy'))

    clusters = {}
    for index, anchor in enumerate(ordered):
        members = [anchor]
        # Allow a 50% buffer zone around each result.
        anchor_radius = anchor.accuracy * 1.5
        # Only look at later results, i.e. the upper triangle.
        for candidate in ordered[index + 1:]:
            candidate_radius = candidate.accuracy * 1.5
            reach = max(anchor_radius, candidate_radius)
            apart = distance(anchor.lat, anchor.lon,
                             candidate.lat, candidate.lon)
            if apart <= reach:
                members.append(candidate)
        clusters[index] = members

    def ranking(members):
        # Rank by highest cumulative score,
        # break tie by highest individual score.
        scores = [member.score for member in members]
        return (sum(scores), max(scores))

    # max() returns the first of several equally ranked clusters, which
    # matches taking the head of a stable descending sort.
    return max(clusters.values(), key=ranking)
def aggregate_obs(self):
    """
    Aggregate the observations into one weighted position.

    Return None if the bounding box around all observations has a
    diagonal longer than MAX_DIST_METERS. Otherwise return a dict
    with the raw positions and weights, the weighted average
    position, the bounding box, radius, region and the bounded
    sample count and weight.
    """
    observations = self.observations
    positions = numpy.array(
        [(obs.lat, obs.lon) for obs in observations],
        dtype=numpy.double)

    # Bounding box around all observed positions.
    max_lat, max_lon = positions.max(axis=0)
    min_lat, min_lon = positions.min(axis=0)

    if distance(min_lat, min_lon, max_lat, max_lon) > self.MAX_DIST_METERS:
        return None

    weights = numpy.array(
        [obs.weight for obs in observations], dtype=numpy.double)

    avg_lat, avg_lon = numpy.average(positions, axis=0, weights=weights)
    lat = float(avg_lat)
    lon = float(avg_lon)

    radius = circle_radius(lat, lon, max_lat, max_lon, min_lat, min_lon)
    region = GEOCODER.region(lat, lon)

    samples, weight = self.bounded_samples_weight(
        len(observations), float(weights.sum()))

    aggregate = {
        'positions': positions,
        'weights': weights,
        'lat': lat,
        'lon': lon,
        'max_lat': float(max_lat),
        'min_lat': float(min_lat),
        'max_lon': float(max_lon),
        'min_lon': float(min_lon),
        'radius': radius,
        'region': region,
        'samples': samples,
        'weight': weight,
    }
    return aggregate
def aggregate_cell_position(networks, min_accuracy, max_accuracy):
    """
    Calculate the aggregate position of the user inside the given
    cluster of networks.

    A single network is returned as is, with its radius as accuracy.
    Otherwise the position is the mean of all network positions,
    weighted by score / signal ** 2.

    Return a (lat, lon, accuracy, score) tuple of floats. The accuracy
    is bounded by the min_accuracy and max_accuracy.
    """
    def bounded(value):
        # Apply the lower bound first, so the upper bound wins if the
        # two bounds contradict each other.
        return min(max(value, min_accuracy), max_accuracy)

    if len(networks) == 1:
        only = networks[0]
        lat = only['lat']
        lon = only['lon']
        radius = bounded(only['radius'])
        score = only['score']
        return (float(lat), float(lon), float(radius), float(score))

    coordinates = numpy.array(
        [(net['lat'], net['lon']) for net in networks],
        dtype=numpy.double)
    weights = numpy.array(
        [net['score'] / math.pow(net['signal'], 2) for net in networks],
        dtype=numpy.double)

    lat, lon = numpy.average(coordinates, axis=0, weights=weights)
    score = networks['score'].sum()

    # Guess the accuracy as the 95th percentile of the distances
    # from the lat/lon to the positions of all networks.
    spread = numpy.array(
        [distance(lat, lon, net['lat'], net['lon']) for net in networks],
        dtype=numpy.double)
    accuracy = bounded(numpy.percentile(spread, 95))

    return (float(lat), float(lon), float(accuracy), float(score))
def cluster_wifis(networks):
    """
    Group WiFi networks into clusters based on their pairwise distance.

    Only clusters with at least MIN_WIFIS_IN_CLUSTER networks are
    returned. Otherwise someone could use a combination of one real
    network and one fake and therefore not found network to get the
    position of the real network.

    Returns a list of network arrays, which may be empty.
    """
    count = len(networks)
    if count < MIN_WIFIS_IN_CLUSTER:
        # Not enough WiFis to form a valid cluster.
        return []

    positions = networks[['lat', 'lon']]

    if count == 2:
        first, second = positions[0], positions[1]
        apart = distance(first[0], first[1], second[0], second[1])
        if apart <= MAX_WIFI_CLUSTER_METERS:
            # Only two WiFis and they agree, so cluster them.
            return [networks]
        # Or they disagree forming two clusters of size one,
        # neither of which is large enough to be returned.
        return []

    # Calculate the condensed distance matrix based on distance in meters.
    # This avoids calculating the square form, which would calculate
    # each value twice and avoids calculating the diagonal of zeros.
    # The special cases for fewer than two networks are avoided by the
    # checks above.
    # See scipy.spatial.distance.squareform and
    # https://stackoverflow.com/questions/13079563
    condensed = numpy.zeros(count * (count - 1) // 2, dtype=numpy.double)
    pairs = itertools.combinations(positions, 2)
    for index, (left, right) in enumerate(pairs):
        condensed[index] = distance(left[0], left[1], right[0], right[1])

    linkage = hierarchy.linkage(condensed, method='complete')
    labels = hierarchy.fcluster(
        linkage, MAX_WIFI_CLUSTER_METERS, criterion='distance', depth=2)

    grouped = defaultdict(list)
    for label, network in zip(labels, networks):
        grouped[label].append(network)

    return [numpy.array(members, dtype=NETWORK_DTYPE)
            for members in grouped.values()
            if len(members) >= MIN_WIFIS_IN_CLUSTER]
def test_simple_distance(self):
    """Check the distance between two points close to each other."""
    start = (44.0337065, -79.4908184)
    end = (44.0347065, -79.4918184)
    delta = distance(start[0], start[1], end[0], end[1])
    self.assertAlmostEqual(delta, 136.9483, places=4)
def test_simple_distance(self):
    """Check the distance between two points close to each other."""
    start = (44.0337065, -79.4908184)
    end = (44.0347065, -79.4918184)
    delta = distance(start[0], start[1], end[0], end[1])
    # Compare the value rounded to four decimal places.
    self.assertEqual('%0.4f' % delta, '0.1369')
def test_simple_distance(self):
    """Check the distance between two points close to each other."""
    start = (44.0337065, -79.4908184)
    end = (44.0347065, -79.4918184)
    delta = distance(start[0], start[1], end[0], end[1])
    # Compare the value rounded to four decimal places.
    self.assertEqual('%0.4f' % delta, '0.1369')
def _get_clusters(self, wifi_signals, queried_wifis):
    """
    Cluster the queried WiFi networks by their distance to each other.

    Filter out BSSIDs that are numerically very similar, assuming they're
    multiple interfaces on the same base station or such. The remaining
    networks are sorted by the signal strength found in wifi_signals
    (highest value first) and clustered using MAX_WIFI_CLUSTER_KM.

    :param wifi_signals: A mapping of network key to signal strength.
    :param queried_wifis: Objects with key, lat, lon and range attributes.
    :returns: The clusters with at least MIN_WIFIS_IN_CLUSTER networks.
    """
    dissimilar_keys = set(self._filter_bssids_by_similarity(
        [w.key for w in queried_wifis]))

    too_similar = len(queried_wifis) - len(dissimilar_keys)
    if too_similar > 0:
        self.stat_time('wifi.provided_too_similar', too_similar)

    wifi_networks = [
        Network(w.key, w.lat, w.lon, w.range)
        for w in queried_wifis if w.key in dissimilar_keys]

    if len(wifi_networks) < MIN_WIFIS_IN_QUERY:
        # We didn't get enough matches. This is only counted,
        # clustering is still attempted below.
        self.stat_count('wifi.found_too_few')

    # Sort networks by signal strengths in query, strongest first.
    # A key function replaces the former cmp function, which list.sort
    # only accepts on Python 2. Both sorts are stable, so networks with
    # equal signal strength keep their relative order.
    wifi_networks.sort(
        key=lambda network: wifi_signals[network.key], reverse=True)

    clusters = self._cluster_elements(
        wifi_networks,
        lambda a, b: distance(a.lat, a.lon, b.lat, b.lon),
        MAX_WIFI_CLUSTER_KM)

    # The second loop selects a cluster and estimates the position of that
    # cluster. The selected cluster is the one with the most points, larger
    # than MIN_WIFIS_IN_CLUSTER; its position is estimated taking up-to
    # MAX_WIFIS_IN_CLUSTER worth of points from the cluster, which is
    # pre-sorted in signal-strength order due to the way we built the
    # clusters.
    #
    # The reasoning here is that if we have >1 cluster at all, we probably
    # have some bad data -- likely an AP or set of APs associated with a
    # single antenna that moved -- since a user shouldn't be able to hear
    # multiple groups 500m apart.
    #
    # So we're trying to select a cluster that's most-likely good data,
    # which we assume to be the one with the most points in it.
    #
    # The reason we take a subset of those points when estimating location
    # is that we're doing a (non-weighted) centroid calculation, which is
    # itself unbalanced by distant elements. Even if we did a weighted
    # centroid here, using radio intensity as a proxy for distance has an
    # error that increases significantly with distance, so we'd have to
    # underweight pretty heavily.
    return [c for c in clusters if len(c) >= MIN_WIFIS_IN_CLUSTER]
def test_out_of_range(self):
    """Check that distance() copes with out of range coordinates."""
    # We don't always sanitize the incoming data and thus have to deal
    # with some invalid coordinates. Make sure the distance function
    # doesn't error out on us.
    invalid = (-100.0, -186.0)
    origin = (0.0, 0.0)
    delta = distance(invalid[0], invalid[1], origin[0], origin[1])
    self.assertEqual('%0.4f' % delta, '8901.7476')
def test_out_of_range(self):
    """Check that distance() copes with out of range coordinates."""
    # We don't always sanitize the incoming data and thus have to deal
    # with some invalid coordinates. Make sure the distance function
    # doesn't error out on us.
    invalid = (-100.0, -186.0)
    origin = (0.0, 0.0)
    delta = distance(invalid[0], invalid[1], origin[0], origin[1])
    self.assertEqual('%0.4f' % delta, '8901.7476')
def _get_clusters(self, wifi_signals, queried_wifis):
    """
    Cluster the queried WiFi networks by their distance to each other.

    Filter out BSSIDs that are numerically very similar, assuming they're
    multiple interfaces on the same base station or such. The remaining
    networks are sorted by the signal strength found in wifi_signals
    (highest value first) and clustered using MAX_WIFI_CLUSTER_KM.

    :param wifi_signals: A mapping of network key to signal strength.
    :param queried_wifis: Objects with key, lat, lon and range attributes.
    :returns: The clusters with at least MIN_WIFIS_IN_CLUSTER networks.
    """
    dissimilar_keys = set(
        self._filter_bssids_by_similarity([w.key for w in queried_wifis]))

    too_similar = len(queried_wifis) - len(dissimilar_keys)
    if too_similar > 0:
        self.stat_time('wifi.provided_too_similar', too_similar)

    wifi_networks = [
        Network(w.key, w.lat, w.lon, w.range) for w in queried_wifis
        if w.key in dissimilar_keys
    ]

    if len(wifi_networks) < MIN_WIFIS_IN_QUERY:
        # We didn't get enough matches. This is only counted,
        # clustering is still attempted below.
        self.stat_count('wifi.found_too_few')

    # Sort networks by signal strengths in query, strongest first.
    # A key function replaces the former cmp function, which list.sort
    # only accepts on Python 2. Both sorts are stable, so networks with
    # equal signal strength keep their relative order.
    wifi_networks.sort(
        key=lambda network: wifi_signals[network.key], reverse=True)

    clusters = self._cluster_elements(
        wifi_networks, lambda a, b: distance(a.lat, a.lon, b.lat, b.lon),
        MAX_WIFI_CLUSTER_KM)

    # The second loop selects a cluster and estimates the position of that
    # cluster. The selected cluster is the one with the most points, larger
    # than MIN_WIFIS_IN_CLUSTER; its position is estimated taking up-to
    # MAX_WIFIS_IN_CLUSTER worth of points from the cluster, which is
    # pre-sorted in signal-strength order due to the way we built the
    # clusters.
    #
    # The reasoning here is that if we have >1 cluster at all, we probably
    # have some bad data -- likely an AP or set of APs associated with a
    # single antenna that moved -- since a user shouldn't be able to hear
    # multiple groups 500m apart.
    #
    # So we're trying to select a cluster that's most-likely good data,
    # which we assume to be the one with the most points in it.
    #
    # The reason we take a subset of those points when estimating location
    # is that we're doing a (non-weighted) centroid calculation, which is
    # itself unbalanced by distant elements. Even if we did a weighted
    # centroid here, using radio intensity as a proxy for distance has an
    # error that increases significantly with distance, so we'd have to
    # underweight pretty heavily.
    return [c for c in clusters if len(c) >= MIN_WIFIS_IN_CLUSTER]
def test_antipodal(self):
    """Check the distance between two antipodal points."""
    # Antipodal points (opposite sides of the planet) have a round off
    # error with the standard haversine calculation which is extremely
    # old and assumes we are using fixed precision math instead of IEEE
    # floats.
    north = (90.0, 0.0)
    south = (-90.0, 0)
    delta = distance(north[0], north[1], south[0], south[1])
    self.assertEqual("%0.4f" % delta, '20015.0868')
def test_antipodal(self):
    """Check the distance between two antipodal points."""
    # Antipodal points (opposite sides of the planet) have a round off
    # error with the standard haversine calculation which is extremely
    # old and assumes we are using fixed precision math instead of IEEE
    # floats.
    north = (90.0, 0.0)
    south = (-90.0, 0)
    delta = distance(north[0], north[1], south[0], south[1])
    self.assertEqual('%0.4f' % delta, '20015.0868')
def confirm_station_obs(self):
    """
    Return True if all observations confirm the station position.

    A station without a position is never confirmed. A station with a
    position is confirmed unless at least one observation is more than
    MAX_DIST_METERS away from it.
    """
    if not self.has_position():
        return False

    station = self.station
    limit = self.MAX_DIST_METERS
    # any() stops at the first observation which is too far away.
    too_far = any(
        distance(obs.lat, obs.lon, station.lat, station.lon) > limit
        for obs in self.observations)
    return not too_far
def estimate_accuracy(lat, lon, points, minimum):
    """
    Estimate the accuracy of a position from the points it is based on.

    A single point contributes its own range. For several points the
    largest distance between the position and any of the points is
    used, multiplied by 1000. The result is never less than minimum.
    """
    if len(points) == 1:
        return max(points[0].range, minimum)

    # Terrible approximation, but hopefully better
    # than the old approximation, "worst-case range":
    # this one takes the maximum distance from location
    # to any of the provided points.
    scaled = []
    for p in points:
        apart = distance(to_degrees(lat), to_degrees(lon),
                         to_degrees(p.lat), to_degrees(p.lon))
        scaled.append(apart * 1000)
    return max(max(scaled), minimum)
def _nearest_tower(missing_lat, missing_lon, centroids): """ We just need the closest cell, so we can approximate using the haversine formula. """ lat1 = missing_lat lon1 = missing_lon min_dist = None for pt in centroids: lat2 = pt['lat'] lon2 = pt['lon'] dist = distance(lat1, lon1, lat2, lon2) if min_dist is None or min_dist['dist'] > dist: min_dist = {'dist': dist, 'pt': pt} if min_dist['dist'] <= NEAREST_DISTANCE: return min_dist
def _estimate_accuracy(self, lat, lon, points, minimum): """ Return the maximum range between a position (lat/lon) and a list of secondary positions (points). But at least use the specified minimum value. """ if len(points) == 1: accuracy = points[0].range else: # Terrible approximation, but hopefully better # than the old approximation, "worst-case range": # this one takes the maximum distance from location # to any of the provided points. accuracy = max([distance(lat, lon, p.lat, p.lon) * 1000 for p in points]) if accuracy is not None: accuracy = float(accuracy) return max(accuracy, minimum)
def _nearest_tower(missing_lat, missing_lon, centroids):
    """
    Find the centroid closest to the given position.

    We just need the closest cell, so we can approximate
    using the haversine formula. All coordinates are passed
    through to_degrees before the distance is calculated.

    :param missing_lat: Latitude of the position to look up.
    :param missing_lon: Longitude of the position to look up.
    :param centroids: An iterable of mappings with lat and lon keys.
    :returns: A dict with the keys dist and pt for the closest
              centroid, or None if there are no centroids or the
              closest one is farther away than NEAREST_DISTANCE.
    """
    lat1 = to_degrees(missing_lat)
    lon1 = to_degrees(missing_lon)

    nearest = None
    for pt in centroids:
        dist = distance(
            lat1, lon1, to_degrees(pt['lat']), to_degrees(pt['lon']))
        if nearest is None or nearest['dist'] > dist:
            nearest = {'dist': dist, 'pt': pt}

    # Without any centroid there is no closest one; don't try to
    # subscript None in that case.
    if nearest is not None and nearest['dist'] <= NEAREST_DISTANCE:
        return nearest
    return None
def _nearest_tower(missing_lat, missing_lon, centroids): """ We just need the closest cell, so we can approximate using the haversine formula. """ FLOAT_CONST = 10000000.0 lat1 = missing_lat / FLOAT_CONST lon1 = missing_lon / FLOAT_CONST min_dist = None for pt in centroids: lat2 = float(pt['lat']) / FLOAT_CONST lon2 = float(pt['lon']) / FLOAT_CONST dist = distance(lat1, lon1, lat2, lon2) if min_dist is None or min_dist['dist'] > dist: min_dist = {'dist': dist, 'pt': pt} if min_dist['dist'] <= NEAREST_DISTANCE: return min_dist
def _estimate_accuracy(self, lat, lon, points, minimum): """ Return the maximum range between a position (lat/lon) and a list of secondary positions (points). But at least use the specified minimum value. """ if len(points) == 1: accuracy = points[0].range else: # Terrible approximation, but hopefully better # than the old approximation, "worst-case range": # this one takes the maximum distance from location # to any of the provided points. accuracy = max( [distance(lat, lon, p.lat, p.lon) * 1000 for p in points]) if accuracy is not None: accuracy = float(accuracy) return max(accuracy, minimum)
def aggregate_mac_position(networks, minimum_accuracy):
    """
    Calculate the aggregate position for a cluster of networks.

    The weighted mean of all network positions is used as the initial
    guess for a least squares fit. If the fit doesn't report a
    solution, the initial guess is used instead.

    Return a (lat, lon, accuracy) tuple of floats. The accuracy is at
    least minimum_accuracy.
    """
    # Idea based on https://gis.stackexchange.com/questions/40660

    def age_factor(net):
        # Capped at 1.0, starts to shrink once the age exceeds 2000.
        return min(math.sqrt(2000.0 / net['age']), 1.0)

    def func(point, points):
        # Residuals of the fit: the distance of each network to the
        # candidate point, scaled down by age and signal strength.
        return numpy.array([
            distance(p['lat'], p['lon'], point[0], point[1]) *
            age_factor(p) /
            math.pow(p['signalStrength'], 2)
            for p in points])

    # Guess initial position as the weighted mean over all networks.
    coordinates = numpy.array(
        [(net['lat'], net['lon']) for net in networks],
        dtype=numpy.double)
    weights = numpy.array([
        net['score'] *
        age_factor(net) /
        math.pow(net['signalStrength'], 2)
        for net in networks],
        dtype=numpy.double)
    initial = numpy.average(coordinates, axis=0, weights=weights)

    solution, _cov_x, _info, _mesg, ier = leastsq(
        func, initial, args=networks, full_output=True)
    lat, lon = solution

    if ier not in (1, 2, 3, 4):  # pragma: no cover
        # No solution found, use initial estimate.
        lat, lon = initial

    # Guess the accuracy as the 95th percentile of the distances
    # from the lat/lon to the positions of all networks.
    spread = numpy.array(
        [distance(lat, lon, net['lat'], net['lon']) for net in networks],
        dtype=numpy.double)
    accuracy = max(numpy.percentile(spread, 95), minimum_accuracy)

    return (float(lat), float(lon), float(accuracy))
def aggregate_mac_position(networks, minimum_accuracy):
    """
    Calculate the aggregate position for a cluster of networks.

    The weighted mean of all network positions is used as the initial
    guess for a least squares fit. If the fit doesn't report a
    solution, the initial guess is used instead.

    Return a (lat, lon, accuracy) tuple of floats. The accuracy is at
    least minimum_accuracy.
    """
    # Idea based on https://gis.stackexchange.com/questions/40660

    def age_factor(net):
        # Capped at 1.0, starts to shrink once the age exceeds 2000.
        return min(math.sqrt(2000.0 / net['age']), 1.0)

    def func(point, points):
        # Residuals of the fit: the distance of each network to the
        # candidate point, scaled down by age and signal strength.
        return numpy.array([
            distance(p['lat'], p['lon'], point[0], point[1]) *
            age_factor(p) /
            math.pow(p['signalStrength'], 2)
            for p in points
        ])

    # Guess initial position as the weighted mean over all networks.
    coordinates = numpy.array(
        [(net['lat'], net['lon']) for net in networks],
        dtype=numpy.double)
    weights = numpy.array([
        net['score'] *
        age_factor(net) /
        math.pow(net['signalStrength'], 2)
        for net in networks
    ], dtype=numpy.double)
    initial = numpy.average(coordinates, axis=0, weights=weights)

    solution, _cov_x, _info, _mesg, ier = leastsq(
        func, initial, args=networks, full_output=True)
    lat, lon = solution

    if ier not in (1, 2, 3, 4):  # pragma: no cover
        # No solution found, use initial estimate.
        lat, lon = initial

    # Guess the accuracy as the 95th percentile of the distances
    # from the lat/lon to the positions of all networks.
    spread = numpy.array(
        [distance(lat, lon, net['lat'], net['lon']) for net in networks],
        dtype=numpy.double)
    accuracy = max(numpy.percentile(spread, 95), minimum_accuracy)

    return (float(lat), float(lon), float(accuracy))
def aggregate_obs(self):
    """
    Aggregate the observations into one weighted position.

    Return None if the bounding box around all observations has a
    diagonal longer than MAX_DIST_METERS. Otherwise return a dict
    with the raw positions and weights, the weighted average
    position, the bounding box, radius, region and the bounded
    sample count and weight.
    """
    observations = self.observations
    positions = numpy.array(
        [(obs.lat, obs.lon) for obs in observations],
        dtype=numpy.double)

    # Bounding box around all observed positions.
    max_lat, max_lon = positions.max(axis=0)
    min_lat, min_lon = positions.min(axis=0)

    if distance(min_lat, min_lon, max_lat, max_lon) > self.MAX_DIST_METERS:
        return None

    weights = numpy.array(
        [obs.weight for obs in observations], dtype=numpy.double)

    avg_lat, avg_lon = numpy.average(positions, axis=0, weights=weights)
    lat = float(avg_lat)
    lon = float(avg_lon)

    radius = circle_radius(lat, lon, max_lat, max_lon, min_lat, min_lon)
    region = GEOCODER.region(lat, lon)

    samples, weight = self.bounded_samples_weight(
        len(observations), float(weights.sum()))

    aggregate = {
        'positions': positions,
        'weights': weights,
        'lat': lat,
        'lon': lon,
        'max_lat': float(max_lat),
        'min_lat': float(min_lat),
        'max_lon': float(max_lon),
        'min_lon': float(min_lon),
        'radius': radius,
        'region': region,
        'samples': samples,
        'weight': weight,
    }
    return aggregate
def func(point, points):
    """
    Return one residual per entry in points as a numpy array.

    Each residual is the distance between the entry and the given
    (lat, lon) point, divided by the squared signal of the entry.
    """
    lat, lon = point[0], point[1]
    residuals = []
    for p in points:
        apart = distance(p['lat'], p['lon'], lat, lon)
        residuals.append(apart / math.pow(p['signal'], 2))
    return numpy.array(residuals)
def test_circle_radius(self):
    """Check position and radius aggregated from two 100.0 radius circles."""
    first = (1.0, 1.0, 100.0)
    second = (1.001, 1.001, 100.0)
    circles = numpy.array([first, second], dtype=numpy.double)

    lat, lon, radius = aggregate_position(circles, 10.0)

    # The position is expected exactly half-way between both circles.
    self.assertEqual((lat, lon), (1.0005, 1.0005))
    # The radius has to reach a circle center plus that circle's radius.
    expected_radius = distance(lat, lon, 1.0, 1.0) + 100.0
    self.assertAlmostEqual(expected_radius, radius, places=7)
def wifi_distance(one, two):
    """Return the distance between two objects with lat/lon attributes."""
    start = (one.lat, one.lon)
    end = (two.lat, two.lon)
    return distance(start[0], start[1], end[0], end[1])
def search_all_sources(session, api_name, data,
                       client_addr=None, geoip_db=None):
    """
    Common code-path for all lookup APIs, using
    WiFi, cell, cell-lac and GeoIP data sources.

    :param session: A database session for queries.
    :param api_name: A string to use in metrics (for example "geolocate").
    :param data: A dict conforming to the search API.
    :param client_addr: The IP address the request came from.
    :param geoip_db: The geoip database.
    :returns: A dict with rounded lat, lon and accuracy values or
              None if no source produced a result.
    """
    stats_client = get_stats_client()
    heka_client = get_heka_client()

    result = None
    result_metric = None

    validated = {
        # Wifi data is passed through as is.
        'wifi': data.get('wifi', []),
        'cell': [],
        'cell_lac': set(),
        'cell_network': [],
        'cell_lac_network': [],
    }

    # Pre-process cell data
    radio = RADIO_TYPE.get(data.get('radio', ''), -1)
    for cell in data.get('cell', ()):
        cell = normalized_cell_dict(cell, default_radio=radio)
        if not cell:
            continue
        cell_key = to_cellkey(cell)
        validated['cell'].append(cell_key)
        validated['cell_lac'].add(cell_key._replace(cid=CELLID_LAC))

    # Merge all possible cell and lac keys into one list
    all_cell_keys = list(validated['cell'])
    all_cell_keys.extend(validated['cell_lac'])

    # Do a single query for all cells and lacs at the same time
    try:
        all_networks = query_cell_networks(session, all_cell_keys)
    except Exception:
        heka_client.raven(RAVEN_ERROR)
        all_networks = []

    for network in all_networks:
        if network.key == CELLID_LAC:
            bucket = 'cell_lac_network'
        else:
            bucket = 'cell_network'
        validated[bucket].append(network)

    # Always do a GeoIP lookup because it is cheap and we want to
    # report geoip vs. other data mismatches. We may also use
    # the full GeoIP City-level estimate as well, if all else fails.
    (geoip_res, countries) = geoip_and_best_guess_country_codes(
        validated['cell'], api_name, client_addr, geoip_db)

    # First we attempt a "zoom-in" from cell-lac, to cell
    # to wifi, tightening our estimate each step only so
    # long as it doesn't contradict the existing best-estimate
    # nor the possible countries of origin.
    search_steps = [
        ('cell_lac', 'cell_lac_network', 'cell_lac', search_cell_lac),
        ('cell', 'cell_network', 'cell', search_cell),
        ('wifi', 'wifi', 'wifi', search_wifi),
    ]

    for (data_field, object_field, metric_name, search_fn) in search_steps:
        if not validated[data_field]:
            continue

        r = None
        try:
            r = search_fn(session, validated[object_field])
        except Exception:
            heka_client.raven(RAVEN_ERROR)
            stats_client.incr('%s.%s_error' % (api_name, metric_name))

        if r is None:
            stats_client.incr('%s.no_%s_found' % (api_name, metric_name))
            continue

        lat = float(r['lat'])
        lon = float(r['lon'])

        stats_client.incr('%s.%s_found' % (api_name, metric_name))

        # Check whether the hit matches any of the possible countries.
        # NOTE(review): a mismatch is only counted as an anomaly, the
        # hit itself is not skipped -- confirm this is intended.
        country_match = any(
            location_is_in_country(lat, lon, country, 1)
            for country in countries)
        if countries and not country_match:
            stats_client.incr('%s.anomaly.%s_country_mismatch' %
                              (api_name, metric_name))

        if result is None:
            # Always accept the first result we get.
            result = r
            result_metric = metric_name
        elif (distance(float(result['lat']),
                       float(result['lon']),
                       lat, lon) * 1000 <= result['accuracy']):
            # Or any result that appears to be an improvement over the
            # existing best guess.
            result = r
            result_metric = metric_name
        else:
            stats_client.incr('%s.anomaly.%s_%s_mismatch' %
                              (api_name, metric_name, result_metric))

    # Fall back to GeoIP if nothing has worked yet. We do not
    # include this in the "zoom-in" loop because GeoIP is
    # frequently _wrong_ at the city level; we only want to
    # accept that estimate if we got nothing better from cell
    # or wifi.
    if not result and geoip_res:
        result = geoip_res
        result_metric = 'geoip'

    if not result:
        stats_client.incr('%s.miss' % api_name)
        return None

    rounded_result = {
        'lat': round(result['lat'], DEGREE_DECIMAL_PLACES),
        'lon': round(result['lon'], DEGREE_DECIMAL_PLACES),
        'accuracy': round(result['accuracy'], DEGREE_DECIMAL_PLACES),
    }

    stats_client.incr('%s.%s_hit' % (api_name, result_metric))
    stats_client.timing('%s.accuracy.%s' % (api_name, result_metric),
                        rounded_result['accuracy'])

    return rounded_result
def calculate_new_position(station, measures, moving_stations,
                           max_dist_km, backfill=True):
    """
    Update the position, extremes and range of a station in place.

    :param station: The station to update. It is modified in place.
    :param measures: A non-empty sequence of (lat, lon) pairs.
    :param moving_stations: A set, the station gets added to it if the
                            bounding box around its old position and
                            the measures exceeds max_dist_km.
    :param max_dist_km: The maximum diagonal of that bounding box.
    :param backfill: If true, we work on older measures for which
                     the new/total counters were never updated.
    :returns: None. A station found to be moving is left untouched.
    """
    length = len(measures)
    latitudes = [w[0] for w in measures]
    longitudes = [w[1] for w in measures]
    new_lat = sum(latitudes) // length
    new_lon = sum(longitudes) // length

    # Compare against None instead of relying on truthiness: a
    # coordinate of exactly 0 is a valid position and must not make an
    # existing station look like a new one.
    existing_station = station.lat is not None and station.lon is not None
    if existing_station:
        latitudes.append(station.lat)
        longitudes.append(station.lon)
    else:
        station.lat = new_lat
        station.lon = new_lon

    # calculate extremes of measures, existing location estimate
    # and existing extreme values
    def extreme(vals, attr, function):
        new = function(vals)
        old = getattr(station, attr, None)
        if old is not None:
            return function(new, old)
        return new

    min_lat = extreme(latitudes, 'min_lat', min)
    min_lon = extreme(longitudes, 'min_lon', min)
    max_lat = extreme(latitudes, 'max_lat', max)
    max_lon = extreme(longitudes, 'max_lon', max)

    # calculate sphere-distance from opposite corners of
    # bounding box containing current location estimate
    # and new measurements; if too big, station is moving
    box_dist = distance(to_degrees(min_lat), to_degrees(min_lon),
                        to_degrees(max_lat), to_degrees(max_lon))

    if existing_station:

        if box_dist > max_dist_km:
            # add to moving list, return early without updating
            # station since it will be deleted by caller momentarily
            moving_stations.add(station)
            return

        if backfill:
            new_total = station.total_measures + length
            old_length = station.total_measures
            # update total to account for new measures
            # new counter never got updated to include the measures
            station.total_measures = new_total
        else:
            new_total = station.total_measures
            old_length = new_total - length

        # Average of old and new position, weighted by measure counts.
        station.lat = ((station.lat * old_length) +
                       (new_lat * length)) // new_total
        station.lon = ((station.lon * old_length) +
                       (new_lon * length)) // new_total

    if not backfill:
        # decrease new counter, total is already correct
        # in the backfill case new counter was never increased
        station.new_measures = station.new_measures - length

    # update max/min lat/lon columns
    station.min_lat = min_lat
    station.min_lon = min_lon
    station.max_lat = max_lat
    station.max_lon = max_lon

    # give radio-range estimate between extreme values and centroid
    ctr = (to_degrees(station.lat), to_degrees(station.lon))
    points = [(to_degrees(min_lat), to_degrees(min_lon)),
              (to_degrees(min_lat), to_degrees(max_lon)),
              (to_degrees(max_lat), to_degrees(min_lon)),
              (to_degrees(max_lat), to_degrees(max_lon))]

    station.range = range_to_points(ctr, points) * 1000.0
def test_out_of_range(self):
    """Check that distance() copes with out of range coordinates."""
    invalid = (-100.0, -186.0)
    origin = (0.0, 0.0)
    delta = distance(invalid[0], invalid[1], origin[0], origin[1])
    self.assertAlmostEqual(delta, 8901747.5973, places=4)
def new_station_values(self, station, station_key,
                       first_blocked, observations):
    """
    Compute the column values for a station from a batch of observations.

    :param station: The existing station row, or None if there is none.
    :param station_key: Object whose ``__dict__`` is copied into the values.
    :param first_blocked: Falsy, or the value to use as creation date for
                          a new station.
    :param observations: Sequence of objects with ``lat`` and ``lon``
                         (and ``psc`` for cell stations).
    :returns: A 3-tuple ``(moving, new_values, updated_values)``.
    """
    # This function returns a 3-tuple, the first element is True,
    # if the station was found to be moving.
    # The second element is either None or a dict of values,
    # if the station is new and should result in a table insert
    # The third element is either None or a dict of values
    # if the station did exist and should be updated

    obs_length = len(observations)
    obs_positions = numpy.array(
        [(obs.lat, obs.lon) for obs in observations],
        dtype=numpy.double)
    obs_lat, obs_lon = centroid(obs_positions)

    values = {
        'modified': self.utcnow,
    }
    values.update(station_key.__dict__)
    if self.station_type == 'cell':
        # pass on extra psc column which is not actually part
        # of the stations hash key
        values['psc'] = observations[-1].psc

    created = self.utcnow
    if station is None:
        if first_blocked:
            # if the station did previously exist, retain at least the
            # time it was first put on a blocklist as the creation date
            created = first_blocked
        values.update({
            'created': created,
            'range': 0,
            'total_measures': 0,
        })

    if (station is not None and
            station.lat is not None and station.lon is not None):
        # Add the old position and the old bounding box corners to the
        # positions; missing bounds become NaN and are ignored by the
        # nanmax/nanmin calls below.
        obs_positions = numpy.append(obs_positions, [
            (station.lat, station.lon),
            (numpy.nan if station.max_lat is None else station.max_lat,
             numpy.nan if station.max_lon is None else station.max_lon),
            (numpy.nan if station.min_lat is None else station.min_lat,
             numpy.nan if station.min_lon is None else station.min_lon),
        ], axis=0)
        existing_station = True
    else:
        # No usable old position: start from the observation centroid.
        values['lat'] = obs_lat
        values['lon'] = obs_lon
        existing_station = False

    max_lat, max_lon = numpy.nanmax(obs_positions, axis=0)
    min_lat, min_lon = numpy.nanmin(obs_positions, axis=0)

    # calculate sphere-distance from opposite corners of
    # bounding box containing current location estimate
    # and new observations; if too big, station is moving
    box_dist = distance(min_lat, min_lon, max_lat, max_lon)

    # TODO: If we get a too large box_dist, we should not create
    # a new station record with the impossibly big distance,
    # so consider moving the box_dist > self.max_dist_meters check
    # here, outside of the existing_station condition.

    if existing_station:
        if box_dist > self.max_dist_meters:
            # Signal a moving station and return early without updating
            # the station since it will be deleted by caller momentarily
            return (True, None, None)
        # limit the maximum weight of the old station estimate
        old_weight = min(station.total_measures,
                         self.MAX_OLD_OBSERVATIONS)
        new_weight = old_weight + obs_length

        # weighted average of old position and observation centroid
        values['lat'] = ((station.lat * old_weight) +
                         (obs_lat * obs_length)) / new_weight
        values['lon'] = ((station.lon * old_weight) +
                         (obs_lon * obs_length)) / new_weight

    # increase total counter
    if station is not None:
        values['total_measures'] = station.total_measures + obs_length
    else:
        values['total_measures'] = obs_length

    # update max/min lat/lon columns
    values['min_lat'] = float(min_lat)
    values['min_lon'] = float(min_lon)
    values['max_lat'] = float(max_lat)
    values['max_lon'] = float(max_lon)

    # give radio-range estimate between extreme values and centroid
    values['range'] = circle_radius(
        values['lat'], values['lon'],
        max_lat, max_lon, min_lat, min_lon)

    if station is None:
        # return new values
        return (False, values, None)
    else:
        # return updated values, remove station from session
        self.session.expunge(station)
        return (False, None, values)
def search_wifi(session, data):
    """
    Estimate a position from the wifi networks contained in a query.

    :param session: A database session for queries.
    :param data: A dict with a ``wifi`` list; each entry has a ``key``
                 and optionally a ``signal``.
    :returns: None if no cluster of at least three nearby known networks
              is found, otherwise a dict with ``lat``, ``lon`` and
              ``accuracy`` for the centroid of the first such cluster.
    """
    # estimate signal strength at -100 dBm if none is provided,
    # which is worse than the 99th percentile of wifi dBms we
    # see in practice (-98).
    def signal_strength(w):
        if 'signal' in w:
            return int(w['signal'])
        return -100

    wifi_signals = dict((normalize_wifi_key(w['key']), signal_strength(w))
                        for w in data['wifi'])
    wifi_keys = set(wifi_signals.keys())

    if not any(wifi_keys):
        # no valid normalized keys
        return None
    if len(wifi_keys) < 3:
        # we didn't even get three keys, bail out
        return None

    sql_null = None  # avoid pep8 warning
    query = session.query(Wifi.key, Wifi.lat, Wifi.lon).filter(
        Wifi.key.in_(wifi_keys)).filter(
        Wifi.lat != sql_null).filter(
        Wifi.lon != sql_null)
    wifis = query.all()
    if len(wifis) < 3:
        # we got fewer than three actual matches
        return None

    wifis = [Network(normalize_wifi_key(w[0]), w[1], w[2]) for w in wifis]

    # Sort networks by signal strength in the query, strongest first.
    # A key function replaces the Python 2 only cmp-style comparator;
    # reverse=True keeps the sort stable, so ties stay in query order.
    wifis.sort(key=lambda net: wifi_signals[net.key], reverse=True)

    def is_near(a, b):
        return distance(quantize(a.lat), quantize(a.lon),
                        quantize(b.lat), quantize(b.lon)) <= MAX_DIST

    clusters = []

    for w in wifis:
        # try to assign w to a cluster (but at most one)
        for c in clusters:
            if any(is_near(n, w) for n in c):
                c.append(w)
                if len(c) >= 3:
                    # as soon as a cluster has at least 3
                    # networks in it, return its centroid.
                    length = len(c)
                    avg_lat = sum([n.lat for n in c]) / length
                    avg_lon = sum([n.lon for n in c]) / length
                    return {
                        'lat': quantize(avg_lat),
                        'lon': quantize(avg_lon),
                        'accuracy': 500,
                    }
                break
        else:
            # if w didn't adhere to any cluster, make a new one
            clusters.append([w])

    # if we didn't get any clusters with at least 3 networks,
    # the query is a bunch of outliers; give up and
    # let the next location method try.
    return None
def region(self, lat, lon):
    """
    Look up the region code for a position.

    Candidates are narrowed down in three steps: the RTree of buffered
    region envelopes, the buffered polygon shapes and finally the precise
    polygon shapes. Remaining ties are broken by the distance between the
    position and the boundary coordinates of the candidate regions.

    Returns None if the position is not inside any buffered region.
    """
    point = geometry.Point(lon, lat)

    # Coarse-grained but very fast match against the envelopes.
    candidates = [self._tree_ids[id_]
                  for id_ in self._tree.intersection(point.bounds)]
    if not candidates:
        return None

    # Match against the buffered polygon shapes.
    in_buffer = [code for code in candidates
                 if self._buffered_shapes[code].contains(point)]
    if not in_buffer:
        return None
    if len(in_buffer) == 1:
        return in_buffer[0]

    # Match against the precise polygon shapes.
    exact = [code for code in in_buffer
             if self._prepared_shapes[code].contains(point)]
    if len(exact) == 1:
        return exact[0]

    def boundary_coords(code):
        # All coordinates of a region's boundary, flattened across the
        # parts of a multi-part boundary.
        boundary = self._shapes[code].boundary
        if isinstance(boundary, geometry.base.BaseMultipartGeometry):
            collected = []
            for geom in boundary.geoms:
                collected.extend(geom.coords)
            return collected
        return boundary.coords

    if exact:
        # Inside multiple overlapping regions: pick the region owning the
        # boundary coordinate with the largest distance to the position.
        contenders, pick = exact, max
    else:
        # Inside no precise region: pick the buffered region owning the
        # boundary coordinate closest to the position.
        contenders, pick = in_buffer, min

    code_by_distance = {}
    for code in contenders:
        for coord in boundary_coords(code):
            code_by_distance[
                geocalc.distance(coord[1], coord[0], lat, lon)] = code
    return code_by_distance[pick(code_by_distance.keys())]
def test_non_float(self):
    # Integer arguments are accepted alongside floats.
    mixed = distance(1.0, 1.0, 1, 1.1)
    self.assertAlmostEqual(mixed, 11117.7991, 4)
    # None and string arguments are rejected with a TypeError.
    self.assertRaises(TypeError, distance, None, '0.1', 1, 1.1)
def test_out_of_max_bounds(self):
    # A latitude of -100 and a longitude of -186 must still produce the
    # expected distance to (0, 0), compared to 4 decimal places.
    start = (-100.0, -186.0)
    end = (0.0, 0.0)
    measured = distance(start[0], start[1], end[0], end[1])
    self.assertAlmostEqual(measured, 8901747.5973, 4)
def test_antipodal(self):
    # Points on opposite sides of the planet (antipodal points) suffer
    # from a round-off error in the standard haversine calculation, which
    # is extremely old and assumes fixed precision math instead of IEEE
    # floats. Check the pole-to-pole distance to 4 decimal places.
    pole_to_pole = distance(90.0, 0.0, -90.0, 0)
    self.assertAlmostEqual(pole_to_pole, 20015086.796, 4)
def get(self, query):
    """
    Get a cached result for the query.

    All cached values found for the query's cache keys are grouped into
    ~100x100 meter grid cells. A result is only returned if all values
    fall into the same cell, or if all of them are "not found" markers.

    :param query: The query for which to look for a cached value.
    :type query: :class:`ichnaea.api.locate.query.Query`

    :returns: The cache result or None.
    :rtype: :class:`~ichnaea.api.locate.fallback.ExternalResult`
    """
    fallback_name = query.api_key.fallback_name

    if not self._should_cache(query):
        self._stat_count(fallback_name, 'bypassed')
        return None

    cache_keys = self._cache_keys(query)
    # dict of (lat, lon, fallback) tuples to ExternalResult list
    # lat/lon clustered into ~100x100 meter grid cells
    clustered_results = defaultdict(list)
    not_found_cluster = (None, None, None)
    try:
        for value in self.redis_client.mget(cache_keys):
            if not value:
                continue

            value = simplejson.loads(value)
            if value == LOCATION_NOT_FOUND:
                value = ExternalResult(None, None, None, None)
                clustered_results[not_found_cluster] = [value]
            else:
                value = ExternalResult(**value)
                # ~100x100m clusters; the key must contain both the
                # rounded latitude and the rounded longitude, otherwise
                # results on the same latitude but far apart would be
                # treated as agreeing with each other.
                clustered_results[(round(value.lat, 3),
                                   round(value.lon, 3),
                                   value.fallback)].append(value)
    except (simplejson.JSONDecodeError, RedisError):
        self.raven_client.captureException()
        self._stat_count(fallback_name, 'failure')
        return None

    if not clustered_results:
        self._stat_count(fallback_name, 'miss')
        return None

    if list(clustered_results.keys()) == [not_found_cluster]:
        # the only match was for not found results
        self._stat_count(fallback_name, 'hit')
        return clustered_results[not_found_cluster][0]

    if len(clustered_results) == 1:
        # all the cached values agree with each other
        self._stat_count(fallback_name, 'hit')
        results = list(clustered_results.values())[0]

        circles = numpy.array(
            [(res.lat, res.lon, res.accuracy) for res in results],
            dtype=numpy.double)
        # first two columns are the positions, the third the accuracies
        points = numpy.hsplit(circles, [2])[0]
        lat, lon = points.mean(axis=0)
        lat = float(lat)
        lon = float(lon)

        # smallest radius around the mean position which covers
        # every cached circle completely
        radius = 0.0
        for circle in circles:
            p_dist = distance(lat, lon, circle[0], circle[1]) + circle[2]
            radius = max(radius, p_dist)

        return ExternalResult(
            lat=lat,
            lon=lon,
            accuracy=float(radius),
            fallback=results[0].fallback,
        )

    # inconsistent results
    self._stat_count(fallback_name, 'inconsistent')
    return None
def cluster_networks(models, lookups,
                     min_radius=None, min_signal=None, max_distance=None):
    """
    Group database models into clusters of nearby networks.

    Takes a list of database models and the matching lookups and returns
    a list of arrays of dtype NETWORK_DTYPE, one per cluster. Clusters
    with fewer than two networks are dropped.
    """
    now = util.utcnow()

    # Signal strength per mac, falling back to min_signal.
    signals = dict((lookup.mac, lookup.signal or min_signal)
                   for lookup in lookups)

    rows = [(model.lat, model.lon, model.radius or min_radius,
             signals[model.mac], model.score(now))
            for model in models]
    networks = numpy.array(rows, dtype=NETWORK_DTYPE)

    # A cluster needs at least 2 found networks. Otherwise someone could
    # combine one real network with one fake (and therefore not found)
    # network to get the position of the real network.
    length = len(networks)
    if length < 2:
        return []

    positions = networks[['lat', 'lon']]
    if length == 2:
        # Exactly two networks: either they agree and form one cluster,
        # or they form two clusters of size one, which are too small.
        first, second = positions[0], positions[1]
        apart = distance(first[0], first[1], second[0], second[1])
        return [networks] if apart <= max_distance else []

    # Condensed distance matrix in meters: one entry per pair, which
    # avoids the duplicated values and the zero diagonal of the square
    # form. See scipy.spatial.distance.squareform and
    # https://stackoverflow.com/questions/13079563
    dist_matrix = numpy.zeros(length * (length - 1) // 2,
                              dtype=numpy.double)
    pairs = itertools.combinations(positions, 2)
    for index, pair in enumerate(pairs):
        left, right = pair
        dist_matrix[index] = distance(left[0], left[1], right[0], right[1])

    link_matrix = hierarchy.linkage(dist_matrix, method='complete')
    assignments = hierarchy.fcluster(
        link_matrix, max_distance, criterion='distance', depth=2)

    grouped = defaultdict(list)
    for cluster_id, network in zip(assignments, networks):
        grouped[cluster_id].append(network)

    return [numpy.array(members, dtype=NETWORK_DTYPE)
            for members in grouped.values()
            if len(members) >= 2]
def station_values(self, station_key, shard_station, observations):
    """
    Return two-tuple of status, value dict where status is one of:
    `new`, `new_moving`, `moving`, `changed`.

    :param station_key: Stored under the ``mac`` key of the value dict.
    :param shard_station: The existing station row or None.
    :param observations: Sequence of objects with ``lat`` and ``lon``.
    """
    # cases:
    # we always get a station key and observations
    # 0. observations disagree
    # 0.a. no shard station, return new_moving
    # 0.b. shard station, return moving
    # 1. no shard station
    # 1.a. obs agree -> return new
    # 2. shard station
    # 2.a. obs disagree -> return moving
    # 2.b. obs agree -> return changed
    created = self.utcnow
    values = {
        'mac': station_key,
        'modified': self.utcnow,
    }

    # centroid and bounding box of the new observations alone
    obs_length = len(observations)
    obs_positions = numpy.array(
        [(obs.lat, obs.lon) for obs in observations],
        dtype=numpy.double)
    obs_new_lat, obs_new_lon = centroid(obs_positions)
    obs_max_lat, obs_max_lon = numpy.nanmax(obs_positions, axis=0)
    obs_min_lat, obs_min_lon = numpy.nanmin(obs_positions, axis=0)
    obs_box_dist = distance(obs_min_lat, obs_min_lon,
                            obs_max_lat, obs_max_lon)

    if obs_box_dist > self.max_dist_meters:
        # the new observations are already too far apart
        if not shard_station:
            values.update({
                'created': created,
                'block_first': self.today,
                'block_last': self.today,
                'block_count': 1,
            })
            return ('new_moving', values)
        else:
            # clear the position data and bump the block counter
            block_count = shard_station.block_count or 0
            values.update({
                'lat': None,
                'lon': None,
                'max_lat': None,
                'min_lat': None,
                'max_lon': None,
                'min_lon': None,
                'country': shard_station.country,
                'radius': None,
                'samples': None,
                'source': None,
                'block_last': self.today,
                'block_count': block_count + 1,
            })
            return ('moving', values)

    if shard_station is None:
        # totally new station, only agreeing observations
        radius = circle_radius(
            obs_new_lat, obs_new_lon,
            obs_max_lat, obs_max_lon, obs_min_lat, obs_min_lon)
        values.update({
            'created': created,
            'lat': obs_new_lat,
            'lon': obs_new_lon,
            'max_lat': float(obs_max_lat),
            'min_lat': float(obs_min_lat),
            'max_lon': float(obs_max_lon),
            'min_lon': float(obs_min_lon),
            'country': country_for_location(obs_new_lat, obs_new_lon),
            'radius': radius,
            'samples': obs_length,
            'source': None,
        })
        return ('new', values)
    else:
        # shard_station + new observations
        # Missing station values become NaN, which the nanmax/nanmin
        # calls below ignore.
        positions = numpy.append(obs_positions, [
            (numpy.nan if shard_station.lat is None
                else shard_station.lat,
             numpy.nan if shard_station.lon is None
                else shard_station.lon),
            (numpy.nan if shard_station.max_lat is None
                else shard_station.max_lat,
             numpy.nan if shard_station.max_lon is None
                else shard_station.max_lon),
            (numpy.nan if shard_station.min_lat is None
                else shard_station.min_lat,
             numpy.nan if shard_station.min_lon is None
                else shard_station.min_lon),
        ], axis=0)
        max_lat, max_lon = numpy.nanmax(positions, axis=0)
        min_lat, min_lon = numpy.nanmin(positions, axis=0)
        box_dist = distance(min_lat, min_lon, max_lat, max_lon)
        if box_dist > self.max_dist_meters:
            # shard_station + disagreeing observations
            block_count = shard_station.block_count or 0
            values.update({
                'lat': None,
                'lon': None,
                'max_lat': None,
                'min_lat': None,
                'max_lon': None,
                'min_lon': None,
                'country': shard_station.country,
                'radius': None,
                'samples': None,
                'source': None,
                'block_last': self.today,
                'block_count': block_count + 1,
            })
            return ('moving', values)
        else:
            # shard_station + agreeing observations
            if shard_station.lat is None or shard_station.lon is None:
                # no old position, so it gets no weight in the average
                old_weight = 0
            else:
                old_weight = min((shard_station.samples or 0),
                                 self.MAX_OLD_OBSERVATIONS)
            # weighted average of observation centroid and old position
            new_lat = ((obs_new_lat * obs_length +
                        (shard_station.lat or 0.0) * old_weight) /
                       (obs_length + old_weight))
            new_lon = ((obs_new_lon * obs_length +
                        (shard_station.lon or 0.0) * old_weight) /
                       (obs_length + old_weight))
            samples = (shard_station.samples or 0) + obs_length
            radius = circle_radius(
                new_lat, new_lon, max_lat, max_lon, min_lat, min_lon)
            country = shard_station.country
            if (country and not country_matches_location(
                    new_lat, new_lon, country)):
                # reset country if it no longer matches
                country = None
            if not country:
                country = country_for_location(new_lat, new_lon)
            values.update({
                'lat': new_lat,
                'lon': new_lon,
                'max_lat': float(max_lat),
                'min_lat': float(min_lat),
                'max_lon': float(max_lon),
                'min_lon': float(min_lon),
                'country': country,
                'radius': radius,
                'samples': samples,
                'source': None,
                # use the exact same keys as in the moving case
                'block_last': shard_station.block_last,
                'block_count': shard_station.block_count,
            })
            return ('changed', values)

    # every branch above returns, so this line is never reached
    return (None, None)  # pragma: no cover
def search_all_sources(session, api_name, data,
                       client_addr=None, geoip_db=None,
                       api_key_log=False, api_key_name=None):
    """
    Common code-path for all lookup APIs, using
    WiFi, cell, cell-lac and GeoIP data sources.

    :param session: A database session for queries.
    :param api_name: A string to use in metrics (for example "geolocate").
    :param data: A dict conforming to the search API.
    :param client_addr: The IP address the request came from.
    :param geoip_db: The geoip database.
    :param api_key_log: If true (and api_key_name is set), emit an extra
                        per-api-key hit/miss metric.
    :param api_key_name: The name used in the per-api-key metric.
    :returns: None if no source produced a result, otherwise a dict with
              rounded ``lat``, ``lon`` and ``accuracy`` values.
    """

    stats_client = get_stats_client()
    heka_client = get_heka_client()

    result = None
    result_metric = None

    validated = {
        'wifi': [],
        'cell': [],
        'cell_lac': set(),
        'cell_network': [],
        'cell_lac_network': [],
    }

    # Pre-process wifi data
    for wifi in data.get('wifi', ()):
        wifi = normalized_wifi_dict(wifi)
        if wifi:
            validated['wifi'].append(wifi)

    # Pre-process cell data
    radio = RADIO_TYPE.get(data.get('radio', ''), -1)
    for cell in data.get('cell', ()):
        cell = normalized_cell_dict(cell, default_radio=radio)
        if cell:
            cell_key = to_cellkey(cell)
            validated['cell'].append(cell_key)
            # the same key with the cid replaced stands for the cell's lac
            validated['cell_lac'].add(cell_key._replace(cid=CELLID_LAC))

    # Merge all possible cell and lac keys into one list
    all_cell_keys = []
    all_cell_keys.extend(validated['cell'])
    for key in validated['cell_lac']:
        all_cell_keys.append(key)

    # Do a single query for all cells and lacs at the same time
    try:
        all_networks = query_cell_networks(session, all_cell_keys)
    except Exception:
        # report the error and continue without cell data
        heka_client.raven(RAVEN_ERROR)
        all_networks = []
    for network in all_networks:
        if network.key == CELLID_LAC:
            validated['cell_lac_network'].append(network)
        else:
            validated['cell_network'].append(network)

    # Always do a GeoIP lookup because it is cheap and we want to
    # report geoip vs. other data mismatches. We may also use
    # the full GeoIP City-level estimate as well, if all else fails.
    (geoip_res, countries) = geoip_and_best_guess_country_codes(
        validated['cell'], api_name, client_addr, geoip_db)

    # First we attempt a "zoom-in" from cell-lac, to cell
    # to wifi, tightening our estimate each step only so
    # long as it doesn't contradict the existing best-estimate
    # nor the possible countries of origin.

    for (data_field, object_field, metric_name, search_fn) in [
            ('cell_lac', 'cell_lac_network', 'cell_lac', search_cell_lac),
            ('cell', 'cell_network', 'cell', search_cell),
            ('wifi', 'wifi', 'wifi', search_wifi)]:

        if validated[data_field]:
            r = None
            try:
                r = search_fn(session, validated[object_field])
            except Exception:
                heka_client.raven(RAVEN_ERROR)
                stats_client.incr('%s.%s_error' %
                                  (api_name, metric_name))

            if r is None:
                stats_client.incr('%s.no_%s_found' %
                                  (api_name, metric_name))

            else:
                lat = float(r['lat'])
                lon = float(r['lon'])

                stats_client.incr('%s.%s_found' %
                                  (api_name, metric_name))

                # Check whether the hit matches any of the possible
                # countries. A mismatch is only counted as an anomaly,
                # the hit itself is still considered below.
                country_match = False
                for country in countries:
                    if location_is_in_country(lat, lon, country, 1):
                        country_match = True
                        break

                if countries and not country_match:
                    stats_client.incr('%s.anomaly.%s_country_mismatch' %
                                      (api_name, metric_name))

                # Always accept the first result we get.
                if result is None:
                    result = r
                    result_metric = metric_name

                # Or any result that appears to be an improvement over the
                # existing best guess.
                elif (distance(float(result['lat']),
                               float(result['lon']), lat, lon) * 1000
                      <= result['accuracy']):
                    result = r
                    result_metric = metric_name

                else:
                    stats_client.incr('%s.anomaly.%s_%s_mismatch' %
                                      (api_name, metric_name,
                                       result_metric))

    # Fall back to GeoIP if nothing has worked yet. We do not
    # include this in the "zoom-in" loop because GeoIP is
    # frequently _wrong_ at the city level; we only want to
    # accept that estimate if we got nothing better from cell
    # or wifi.
    if not result and geoip_res:
        result = geoip_res
        result_metric = 'geoip'

    # Do detailed logging for some api keys
    if api_key_log and api_key_name:
        api_log_metric = None
        wifi_keys = set([w['key'] for w in validated['wifi']])
        if wifi_keys and \
           len(filter_bssids_by_similarity(wifi_keys)) >= \
                MIN_WIFIS_IN_QUERY:
            # Only count requests as WiFi-based if they contain enough
            # distinct WiFi networks to pass our filters
            if result_metric == 'wifi':
                api_log_metric = 'wifi_hit'
            else:
                api_log_metric = 'wifi_miss'
        elif validated['cell']:
            if result_metric == 'cell':
                api_log_metric = 'cell_hit'
            elif result_metric == 'cell_lac':
                api_log_metric = 'cell_lac_hit'
            else:
                api_log_metric = 'cell_miss'
        else:
            if geoip_res:
                api_log_metric = 'geoip_hit'
            else:
                api_log_metric = 'geoip_miss'
        if api_log_metric:
            stats_client.incr('%s.api_log.%s.%s' % (
                api_name, api_key_name, api_log_metric))

    if not result:
        stats_client.incr('%s.miss' % api_name)
        return None

    # note: the accuracy is rounded to DEGREE_DECIMAL_PLACES as well
    rounded_result = {
        'lat': round(result['lat'], DEGREE_DECIMAL_PLACES),
        'lon': round(result['lon'], DEGREE_DECIMAL_PLACES),
        'accuracy': round(result['accuracy'], DEGREE_DECIMAL_PLACES),
    }

    stats_client.incr('%s.%s_hit' % (api_name, result_metric))
    stats_client.timing('%s.accuracy.%s' % (api_name, result_metric),
                        rounded_result['accuracy'])

    return rounded_result
def search_wifi(session, data):
    """
    Estimate a position from the wifi networks contained in a query.

    :param session: A database session for queries.
    :param data: A dict with a ``wifi`` list; each entry has a ``key``
                 and optionally a ``signal``.
    :returns: None if there are too few keys, too few known networks or
              no sufficiently large cluster, otherwise a dict with
              ``lat``, ``lon`` and ``accuracy``.
    """
    # Estimate signal strength at -100 dBm if none is provided,
    # which is worse than the 99th percentile of wifi dBms we
    # see in practice (-98).
    def signal_strength(w):
        if 'signal' in w:
            return int(w['signal'])
        return -100

    wifi_signals = dict((normalized_wifi_key(w['key']), signal_strength(w))
                        for w in data['wifi'])
    wifi_keys = set(wifi_signals.keys())

    if not any(wifi_keys):
        # No valid normalized keys.
        return None
    if len(wifi_keys) < MIN_WIFIS_IN_QUERY:
        # We didn't get enough keys.
        return None

    query = session.query(Wifi.key, Wifi.lat, Wifi.lon, Wifi.range).filter(
        Wifi.key.in_(wifi_keys)).filter(
        Wifi.lat.isnot(None)).filter(
        Wifi.lon.isnot(None))
    wifis = query.all()
    if len(wifis) < MIN_WIFIS_IN_QUERY:
        # We didn't get enough matches.
        return None

    wifis = [Network(normalized_wifi_key(w[0]), w[1], w[2], w[3])
             for w in wifis]

    # Sort networks by signal strength in the query, strongest first.
    # A key function replaces the Python 2 only cmp-style comparator;
    # reverse=True keeps the sort stable, so ties stay in query order.
    wifis.sort(key=lambda net: wifi_signals[net.key], reverse=True)

    def is_near(a, b):
        return distance(quantize(a.lat), quantize(a.lon),
                        quantize(b.lat),
                        quantize(b.lon)) <= MAX_WIFI_CLUSTER_KM

    clusters = []

    # The first loop forms a set of clusters by distance,
    # preferring the cluster with the stronger signal strength
    # if there's a tie.
    for w in wifis:
        # Try to assign w to a cluster (but at most one).
        for c in clusters:
            if any(is_near(n, w) for n in c):
                c.append(w)
                break
        else:
            # If w didn't adhere to any cluster, make a new one.
            clusters.append([w])

    # The second step selects a cluster and estimates the position of that
    # cluster. The selected cluster is the one with the most points, larger
    # than MIN_WIFIS_IN_CLUSTER; its position is estimated taking up-to
    # MAX_WIFIS_IN_CLUSTER worth of points from the cluster, which is
    # pre-sorted in signal-strength order due to the way we built the
    # clusters.
    #
    # The reasoning here is that if we have >1 cluster at all, we probably
    # have some bad data -- likely an AP or set of APs associated with a
    # single antenna that moved -- since a user shouldn't be able to hear
    # multiple groups 500m apart.
    #
    # So we're trying to select a cluster that's most-likely good data,
    # which we assume to be the one with the most points in it.
    #
    # The reason we take a subset of those points when estimating location
    # is that we're doing a (non-weighted) centroid calculation, which is
    # itself unbalanced by distant elements. Even if we did a weighted
    # centroid here, using radio intensity as a proxy for distance has an
    # error that increases significantly with distance, so we'd have to
    # underweight pretty heavily.

    clusters = [c for c in clusters if len(c) > MIN_WIFIS_IN_CLUSTER]

    if not clusters:
        return None

    # max() returns the first of several equally large clusters, which is
    # the same cluster a stable descending sort by size would put first.
    cluster = max(clusters, key=len)
    sample = cluster[:min(len(cluster), MAX_WIFIS_IN_CLUSTER)]
    length = len(sample)
    avg_lat = sum([n.lat for n in sample]) / length
    avg_lon = sum([n.lon for n in sample]) / length
    return {
        'lat': quantize(avg_lat),
        'lon': quantize(avg_lon),
        'accuracy': estimate_accuracy(avg_lat, avg_lon,
                                      sample, WIFI_MIN_ACCURACY),
    }
def agrees_with(self, result):
    """
    Return whether this position is consistent with ``result``.

    True if the distance between both positions, multiplied by 1000,
    does not exceed ``result.accuracy``.
    """
    # NOTE(review): the factor 1000 presumably converts kilometers to
    # the unit of the accuracy value -- confirm against distance().
    separation = distance(result.lat, result.lon, self.lat, self.lon)
    scaled = separation * 1000
    return scaled <= result.accuracy
def agrees_with(self, other):
    """
    Return whether this position is consistent with ``other``.

    True if the distance between both positions, multiplied by 1000,
    does not exceed ``other.accuracy``.
    """
    # NOTE(review): the factor 1000 presumably converts kilometers to
    # the unit of the accuracy value -- confirm against distance().
    separation = distance(other.lat, other.lon, self.lat, self.lon)
    scaled = separation * 1000
    return scaled <= other.accuracy
def calculate_new_position(self, station, observations):
    """
    Fold new observations into a station's position, bounds and range.

    The station object is updated in place.

    :param station: Object whose ``lat``/``lon``, ``min_*``/``max_*``
                    bounds, ``range``, ``modified`` and measure counters
                    are read and written.
    :param observations: Non-empty sequence of objects with ``lat``
                         and ``lon``.
    :returns: True if the station was found to be moving (in which case
              it is left untouched), otherwise None.
    """
    length = len(observations)
    latitudes = [obs.lat for obs in observations]
    longitudes = [obs.lon for obs in observations]
    new_lat = sum(latitudes) / length
    new_lon = sum(longitudes) / length

    # Compare against None explicitly: a coordinate of 0 is a valid
    # position and must not be mistaken for "station has no position yet".
    if station.lat is not None and station.lon is not None:
        latitudes.append(station.lat)
        longitudes.append(station.lon)
        existing_station = True
    else:
        station.lat = new_lat
        station.lon = new_lon
        existing_station = False

    # calculate extremes of observations, existing location estimate
    # and existing extreme values
    def extreme(vals, attr, function):
        new = function(vals)
        old = getattr(station, attr, None)
        if old is None:
            return new
        return function(new, old)

    min_lat = extreme(latitudes, 'min_lat', min)
    min_lon = extreme(longitudes, 'min_lon', min)
    max_lat = extreme(latitudes, 'max_lat', max)
    max_lon = extreme(longitudes, 'max_lon', max)

    # calculate sphere-distance from opposite corners of
    # bounding box containing current location estimate
    # and new observations; if too big, station is moving
    box_dist = distance(min_lat, min_lon, max_lat, max_lon)

    if existing_station:
        if box_dist > self.max_dist_km:
            # Signal a moving station and return early without updating
            # the station since it will be deleted by caller momentarily
            return True
        # limit the maximum weight of the old station estimate
        old_weight = min(station.total_measures - length,
                         self.MAX_OLD_OBSERVATIONS)
        new_weight = old_weight + length

        # weighted average of the old estimate and the new observations
        station.lat = ((station.lat * old_weight) +
                       (new_lat * length)) / new_weight
        station.lon = ((station.lon * old_weight) +
                       (new_lon * length)) / new_weight

    # decrease new counter, total is already correct
    station.new_measures = station.new_measures - length

    # update max/min lat/lon columns
    station.min_lat = min_lat
    station.min_lon = min_lon
    station.max_lat = max_lat
    station.max_lon = max_lon

    # give radio-range estimate between extreme values and centroid
    ctr = (station.lat, station.lon)
    points = [(min_lat, min_lon),
              (min_lat, max_lon),
              (max_lat, min_lon),
              (max_lat, max_lon)]

    station.range = range_to_points(ctr, points) * 1000.0
    station.modified = util.utcnow()
def station_values(self, station_key, shard_station, observations):
    """
    Return two-tuple of status, value dict where status is one of:
    `new`, `new_moving`, `moving`, `changed`.

    :param station_key: Passed on to ``self._base_station_values``.
    :param shard_station: The existing station row or None.
    :param observations: Sequence of objects with ``lat``, ``lon``
                         and ``weight``.
    """
    # cases:
    # we always get a station key and observations
    # 0. observations disagree
    # 0.a. no shard station, return new_moving
    # 0.b. shard station, return moving
    # 1. no shard station
    # 1.a. obs agree -> return new
    # 2. shard station
    # 2.a. obs disagree -> return moving
    # 2.b. obs agree -> return changed
    created = self.utcnow
    values = self._base_station_values(station_key, observations)

    obs_positions = numpy.array(
        [(obs.lat, obs.lon) for obs in observations],
        dtype=numpy.double)
    obs_length = len(observations)

    obs_weights = numpy.array(
        [obs.weight for obs in observations],
        dtype=numpy.double)
    obs_weight = float(obs_weights.sum())

    # weighted mean position of the new observations
    obs_new_lat, obs_new_lon = numpy.average(
        obs_positions, axis=0, weights=obs_weights)
    obs_new_lat = float(obs_new_lat)
    obs_new_lon = float(obs_new_lon)

    # bounding box of the new observations alone
    obs_max_lat, obs_max_lon = obs_positions.max(axis=0)
    obs_min_lat, obs_min_lon = obs_positions.min(axis=0)
    obs_box_dist = distance(obs_min_lat, obs_min_lon,
                            obs_max_lat, obs_max_lon)

    if obs_box_dist > self.max_dist_meters:
        # the new observations are already too far apart
        if not shard_station:
            values.update({
                'created': created,
                'block_first': self.today,
                'block_last': self.today,
                'block_count': 1,
            })
            return ('new_moving', values)
        else:
            # clear the position data and bump the block counter
            block_count = shard_station.block_count or 0
            values.update({
                'lat': None,
                'lon': None,
                'max_lat': None,
                'min_lat': None,
                'max_lon': None,
                'min_lon': None,
                'radius': None,
                'region': shard_station.region,
                'samples': None,
                'source': None,
                'weight': None,
                'block_first': shard_station.block_first or self.today,
                'block_last': self.today,
                'block_count': block_count + 1,
            })
            return ('moving', values)

    if shard_station is None:
        # totally new station, only agreeing observations
        radius = circle_radius(
            obs_new_lat, obs_new_lon,
            obs_max_lat, obs_max_lon, obs_min_lat, obs_min_lon)
        values.update({
            'created': created,
            'lat': obs_new_lat,
            'lon': obs_new_lon,
            'max_lat': float(obs_max_lat),
            'min_lat': float(obs_min_lat),
            'max_lon': float(obs_max_lon),
            'min_lon': float(obs_min_lon),
            'radius': radius,
            'region': GEOCODER.region(obs_new_lat, obs_new_lon),
            'samples': obs_length,
            'source': None,
            'weight': obs_weight,
        })
        return ('new', values)
    else:
        # shard_station + new observations
        # Missing station values become NaN, which the nanmax/nanmin
        # calls below ignore.
        positions = numpy.append(obs_positions, [
            (numpy.nan if shard_station.lat is None
                else shard_station.lat,
             numpy.nan if shard_station.lon is None
                else shard_station.lon),
            (numpy.nan if shard_station.max_lat is None
                else shard_station.max_lat,
             numpy.nan if shard_station.max_lon is None
                else shard_station.max_lon),
            (numpy.nan if shard_station.min_lat is None
                else shard_station.min_lat,
             numpy.nan if shard_station.min_lon is None
                else shard_station.min_lon),
        ], axis=0)
        max_lat, max_lon = numpy.nanmax(positions, axis=0)
        min_lat, min_lon = numpy.nanmin(positions, axis=0)
        box_dist = distance(min_lat, min_lon, max_lat, max_lon)
        if box_dist > self.max_dist_meters:
            # shard_station + disagreeing observations
            block_count = shard_station.block_count or 0
            values.update({
                'lat': None,
                'lon': None,
                'max_lat': None,
                'min_lat': None,
                'max_lon': None,
                'min_lon': None,
                'radius': None,
                'region': shard_station.region,
                'samples': None,
                'source': None,
                'weight': None,
                'block_first': shard_station.block_first or self.today,
                'block_last': self.today,
                'block_count': block_count + 1,
            })
            return ('moving', values)
        else:
            # shard_station + agreeing observations
            if shard_station.lat is None or shard_station.lon is None:
                # no old position, so it gets no weight in the average
                old_weight = 0
            else:
                old_weight = min((shard_station.weight or 0.0),
                                 self.MAX_OLD_WEIGHT)
            # weighted average of observation mean and old position
            new_lat = ((obs_new_lat * obs_weight +
                        (shard_station.lat or 0.0) * old_weight) /
                       (obs_weight + old_weight))
            new_lon = ((obs_new_lon * obs_weight +
                        (shard_station.lon or 0.0) * old_weight) /
                       (obs_weight + old_weight))

            # put in maximum value to avoid overflow of DB column
            # (4294967295 is 2**32 - 1)
            samples = min((shard_station.samples or 0) + obs_length,
                          4294967295)
            weight = min((shard_station.weight or 0.0) + obs_weight,
                         1000000000.0)

            radius = circle_radius(
                new_lat, new_lon, max_lat, max_lon, min_lat, min_lon)
            region = shard_station.region
            if (region and not GEOCODER.in_region(
                    new_lat, new_lon, region)):
                # reset region if it no longer matches
                region = None
            if not region:
                region = GEOCODER.region(new_lat, new_lon)
            values.update({
                'lat': new_lat,
                'lon': new_lon,
                'max_lat': float(max_lat),
                'min_lat': float(min_lat),
                'max_lon': float(max_lon),
                'min_lon': float(min_lon),
                'radius': radius,
                'region': region,
                'samples': samples,
                'source': None,
                'weight': weight,
                # use the exact same keys as in the moving case
                'block_first': shard_station.block_first,
                'block_last': shard_station.block_last,
                'block_count': shard_station.block_count,
            })
            return ('changed', values)

    # every branch above returns, so this line is never reached
    return (None, None)  # pragma: no cover
def get(self, query):
    """
    Get a cached result for the query.

    Every cache entry found for the query's cache keys is grouped by
    its rounded (lat, lon) position and fallback name. A result is
    only returned if all found entries end up in one single group.

    :param query: The query for which to look for a cached value.
    :type query: :class:`ichnaea.api.locate.query.Query`

    :returns: The cache result or None. None is returned if caching
              is bypassed for the query, if reading or decoding the
              cache fails, if nothing is cached or if the cached
              entries disagree with each other.
    :rtype: :class:`~ichnaea.api.locate.fallback.ExternalResult`
    """
    if not self._should_cache(query):
        self._stat_count('cache', tags=['status:bypassed'])
        return None

    cache_keys = self._cache_keys(query)
    # dict of (lat, lon, fallback) tuples to ExternalResult list
    # lat/lon clustered into ~100x100 meter grid cells
    clustered_results = defaultdict(list)
    not_found_cluster = (None, None, None)
    try:
        for value in self.redis_client.mget(cache_keys):
            if not value:
                # no entry stored under this cache key
                continue

            value = simplejson.loads(value)
            if value == LOCATION_NOT_FOUND:
                # all not-found markers collapse into one entry
                value = ExternalResult(None, None, None, None)
                clustered_results[not_found_cluster] = [value]
            else:
                value = ExternalResult(**value)
                # ~100x100m clusters; the key has to use both the
                # latitude and the longitude, otherwise results far
                # apart in longitude would be treated as agreeing
                cluster_key = (round(value.lat, 3),
                               round(value.lon, 3),
                               value.fallback)
                clustered_results[cluster_key].append(value)
    except (simplejson.JSONDecodeError, RedisError):
        self.raven_client.captureException()
        self._stat_count('cache', tags=['status:failure'])
        return None

    if not clustered_results:
        self._stat_count('cache', tags=['status:miss'])
        return None

    if list(clustered_results.keys()) == [not_found_cluster]:
        # the only match was for not found results
        self._stat_count('cache', tags=['status:hit'])
        return clustered_results[not_found_cluster][0]

    if len(clustered_results) == 1:
        # all the cached values agree with each other
        self._stat_count('cache', tags=['status:hit'])
        results = list(clustered_results.values())[0]

        # one (lat, lon, accuracy) row per cached result
        circles = numpy.array(
            [(res.lat, res.lon, res.accuracy) for res in results],
            dtype=numpy.double)
        # the combined position is the mean of all cached positions
        lat, lon = circles[:, :2].mean(axis=0)
        lat = float(lat)
        lon = float(lon)

        # the combined accuracy has to cover every cached circle
        radius = 0.0
        for circle in circles:
            p_dist = distance(lat, lon, circle[0], circle[1]) + circle[2]
            radius = max(radius, p_dist)

        return ExternalResult(
            lat=lat,
            lon=lon,
            accuracy=float(radius),
            fallback=results[0].fallback,
        )

    # inconsistent results
    self._stat_count('cache', tags=['status:inconsistent'])
    return None
def search_wifi(session, wifis, stats_client, api_name):
    """
    Estimate a position from the WiFi networks given in a query.

    :param session: A database session, used to look up the networks.
    :param wifis: An iterable of dicts, each with a 'key' and a
                  'signal' entry.
    :param stats_client: A stats client, its `incr` and `timing`
                         methods are called to report metrics.
    :param api_name: A string used as the prefix of all metric names.

    :returns: A dict with 'lat', 'lon' and 'accuracy' keys, or None if
              there were too few provided networks, too few matching
              networks or no large enough cluster.
    """
    # Estimate signal strength at -100 dBm if none is provided,
    # which is worse than the 99th percentile of wifi dBms we
    # see in practice (-98).
    def signal_strength(w):
        signal = w['signal']
        if signal == 0:
            return -100
        return signal

    wifi_signals = {w['key']: signal_strength(w) for w in wifis}
    wifi_keys = set(wifi_signals)

    if len(wifi_keys) < MIN_WIFIS_IN_QUERY:
        # We didn't get enough keys.
        if len(wifi_keys) >= 1:
            stats_client.incr('%s.wifi.provided_too_few' % api_name)
        return None
    stats_client.timing('%s.wifi.provided' % api_name, len(wifi_keys))

    # Only networks with a known position are of any use.
    query = session.query(Wifi.key, Wifi.lat, Wifi.lon, Wifi.range).filter(
        Wifi.key.in_(wifi_keys)).filter(
        Wifi.lat.isnot(None)).filter(
        Wifi.lon.isnot(None))
    wifis = query.all()

    if len(wifis) < len(wifi_keys):
        stats_client.incr('%s.wifi.partial_match' % api_name)
        stats_client.timing('%s.wifi.provided_not_known' % api_name,
                            len(wifi_keys) - len(wifis))

    # Filter out BSSIDs that are numerically very similar, assuming they're
    # multiple interfaces on the same base station or such.
    dissimilar_keys = set(filter_bssids_by_similarity([w.key for w in wifis]))

    if len(dissimilar_keys) < len(wifis):
        stats_client.timing('%s.wifi.provided_too_similar' % api_name,
                            len(wifis) - len(dissimilar_keys))

    wifis = [
        Network(w.key, w.lat, w.lon, w.range)
        for w in wifis if w.key in dissimilar_keys
    ]

    if len(wifis) < MIN_WIFIS_IN_QUERY:
        # We didn't get enough matches.
        stats_client.incr('%s.wifi.found_too_few' % api_name)
        return None

    # Sort networks by signal strengths in query, strongest first.
    # A key function is used instead of a cmp-style comparator, as the
    # latter is not supported by Python 3. The sort is stable, so
    # networks with equal signal strength keep their relative order.
    wifis.sort(key=lambda net: wifi_signals[net.key], reverse=True)

    clusters = cluster_elements(
        wifis,
        lambda a, b: distance(a.lat, a.lon, b.lat, b.lon),
        MAX_WIFI_CLUSTER_KM)

    # The next step selects a cluster and estimates the position of that
    # cluster. The selected cluster is the one with the most points, larger
    # than MIN_WIFIS_IN_CLUSTER; its position is estimated taking up-to
    # MAX_WIFIS_IN_CLUSTER worth of points from the cluster, which is
    # pre-sorted in signal-strength order due to the way we built the
    # clusters.
    #
    # The reasoning here is that if we have >1 cluster at all, we probably
    # have some bad data -- likely an AP or set of APs associated with a
    # single antenna that moved -- since a user shouldn't be able to hear
    # multiple groups 500m apart.
    #
    # So we're trying to select a cluster that's most-likely good data,
    # which we assume to be the one with the most points in it.
    #
    # The reason we take a subset of those points when estimating location
    # is that we're doing a (non-weighted) centroid calculation, which is
    # itself unbalanced by distant elements. Even if we did a weighted
    # centroid here, using radio intensity as a proxy for distance has an
    # error that increases significantly with distance, so we'd have to
    # underweight pretty heavily.

    clusters = [c for c in clusters if len(c) >= MIN_WIFIS_IN_CLUSTER]

    if len(clusters) == 0:
        stats_client.incr('%s.wifi.found_no_cluster' % api_name)
        return None

    # Largest cluster first; ties keep their existing order.
    clusters.sort(key=len, reverse=True)
    cluster = clusters[0]
    sample = cluster[:min(len(cluster), MAX_WIFIS_IN_CLUSTER)]
    length = len(sample)
    avg_lat = sum([n.lat for n in sample]) / length
    avg_lon = sum([n.lon for n in sample]) / length
    return {
        'lat': avg_lat,
        'lon': avg_lon,
        'accuracy': estimate_accuracy(avg_lat, avg_lon,
                                      sample, WIFI_MIN_ACCURACY),
    }
def search_wifi(session, wifis):
    """
    Estimate a position from the WiFi networks given in a query.

    :param session: A database session, used to look up the networks.
    :param wifis: An iterable of dicts, each with a 'key' and a
                  'signal' entry.

    :returns: A dict with 'lat', 'lon' and 'accuracy' keys, or None if
              there were too few provided networks, too few matching
              networks or no large enough cluster.
    """
    # Estimate signal strength at -100 dBm if none is provided,
    # which is worse than the 99th percentile of wifi dBms we
    # see in practice (-98).
    def signal_strength(w):
        signal = w['signal']
        if signal == 0:
            return -100
        return signal

    wifi_signals = {w['key']: signal_strength(w) for w in wifis}
    wifi_keys = set(wifi_signals)

    if len(wifi_keys) < MIN_WIFIS_IN_QUERY:
        # We didn't get enough keys.
        return None

    # Only networks with a known position are of any use.
    query = session.query(Wifi.key, Wifi.lat, Wifi.lon, Wifi.range).filter(
        Wifi.key.in_(wifi_keys)).filter(
        Wifi.lat.isnot(None)).filter(
        Wifi.lon.isnot(None))
    wifis = query.all()

    # Filter out BSSIDs that are numerically very similar, assuming they're
    # multiple interfaces on the same base station or such.
    dissimilar_keys = set(filter_bssids_by_similarity([w.key for w in wifis]))

    wifis = [Network(w.key, w.lat, w.lon, w.range)
             for w in wifis if w.key in dissimilar_keys]

    if len(wifis) < MIN_WIFIS_IN_QUERY:
        # We didn't get enough matches.
        return None

    # Sort networks by signal strengths in query, strongest first.
    # A key function is used instead of a cmp-style comparator, as the
    # latter is not supported by Python 3. The sort is stable, so
    # networks with equal signal strength keep their relative order.
    wifis.sort(key=lambda net: wifi_signals[net.key], reverse=True)

    clusters = cluster_elements(
        wifis,
        lambda a, b: distance(a.lat, a.lon, b.lat, b.lon),
        MAX_WIFI_CLUSTER_KM)

    # The next step selects a cluster and estimates the position of that
    # cluster. The selected cluster is the one with the most points, larger
    # than MIN_WIFIS_IN_CLUSTER; its position is estimated taking up-to
    # MAX_WIFIS_IN_CLUSTER worth of points from the cluster, which is
    # pre-sorted in signal-strength order due to the way we built the
    # clusters.
    #
    # The reasoning here is that if we have >1 cluster at all, we probably
    # have some bad data -- likely an AP or set of APs associated with a
    # single antenna that moved -- since a user shouldn't be able to hear
    # multiple groups 500m apart.
    #
    # So we're trying to select a cluster that's most-likely good data,
    # which we assume to be the one with the most points in it.
    #
    # The reason we take a subset of those points when estimating location
    # is that we're doing a (non-weighted) centroid calculation, which is
    # itself unbalanced by distant elements. Even if we did a weighted
    # centroid here, using radio intensity as a proxy for distance has an
    # error that increases significantly with distance, so we'd have to
    # underweight pretty heavily.

    clusters = [c for c in clusters if len(c) >= MIN_WIFIS_IN_CLUSTER]

    if len(clusters) == 0:
        return None

    # Largest cluster first; ties keep their existing order.
    clusters.sort(key=len, reverse=True)
    cluster = clusters[0]
    sample = cluster[:min(len(cluster), MAX_WIFIS_IN_CLUSTER)]
    length = len(sample)
    avg_lat = sum([n.lat for n in sample]) / length
    avg_lon = sum([n.lon for n in sample]) / length
    return {
        'lat': avg_lat,
        'lon': avg_lon,
        'accuracy': estimate_accuracy(avg_lat, avg_lon,
                                      sample, WIFI_MIN_ACCURACY),
    }
def station_values(self, station_key, shard_station, observations):
    """
    Combine new observations with an optional existing station.

    Return two-tuple of status, value dict where status is one of:
    `new`, `new_moving`, `moving`, `changed`.
    """
    # A station key and observations are always provided. Outcomes:
    #   the observations disagree with each other
    #       no shard station                -> new_moving
    #       shard station                   -> moving
    #   the observations agree with each other
    #       no shard station                -> new
    #       shard station, combined bounding box too large -> moving
    #       shard station, combined bounding box fits      -> changed
    timestamp = self.utcnow
    values = self._base_station_values(station_key, observations)

    def nan_if_none(number):
        # missing station coordinates are ignored by nanmax / nanmin
        return numpy.nan if number is None else number

    def blocked_values():
        # position is wiped and one more block is recorded,
        # used for both `moving` outcomes
        previous_blocks = shard_station.block_count or 0
        return {
            'lat': None,
            'lon': None,
            'max_lat': None,
            'min_lat': None,
            'max_lon': None,
            'min_lon': None,
            'radius': None,
            'region': shard_station.region,
            'samples': None,
            'source': None,
            'block_first': shard_station.block_first or self.today,
            'block_last': self.today,
            'block_count': previous_blocks + 1,
        }

    obs_length = len(observations)
    obs_positions = numpy.array(
        [(obs.lat, obs.lon) for obs in observations],
        dtype=numpy.double)
    obs_new_lat, obs_new_lon = centroid(obs_positions)
    obs_max_lat, obs_max_lon = numpy.nanmax(obs_positions, axis=0)
    obs_min_lat, obs_min_lon = numpy.nanmin(obs_positions, axis=0)

    # diagonal of the bounding box around the new observations
    obs_diagonal = distance(
        obs_min_lat, obs_min_lon, obs_max_lat, obs_max_lon)

    if obs_diagonal > self.max_dist_meters:
        # the new observations are already too far apart
        if shard_station:
            values.update(blocked_values())
            return ('moving', values)

        values.update({
            'created': timestamp,
            'block_first': self.today,
            'block_last': self.today,
            'block_count': 1,
        })
        return ('new_moving', values)

    if shard_station is None:
        # totally new station, only agreeing observations
        radius = circle_radius(
            obs_new_lat, obs_new_lon,
            obs_max_lat, obs_max_lon, obs_min_lat, obs_min_lon)
        values.update({
            'created': timestamp,
            'lat': obs_new_lat,
            'lon': obs_new_lon,
            'max_lat': float(obs_max_lat),
            'min_lat': float(obs_min_lat),
            'max_lon': float(obs_max_lon),
            'min_lon': float(obs_min_lon),
            'radius': radius,
            'region': GEOCODER.region(obs_new_lat, obs_new_lon),
            'samples': obs_length,
            'source': None,
        })
        return ('new', values)

    # shard_station + new observations: extend the observation
    # positions by the station position and its bounding box corners
    station_points = [
        (nan_if_none(shard_station.lat),
         nan_if_none(shard_station.lon)),
        (nan_if_none(shard_station.max_lat),
         nan_if_none(shard_station.max_lon)),
        (nan_if_none(shard_station.min_lat),
         nan_if_none(shard_station.min_lon)),
    ]
    positions = numpy.append(obs_positions, station_points, axis=0)
    max_lat, max_lon = numpy.nanmax(positions, axis=0)
    min_lat, min_lon = numpy.nanmin(positions, axis=0)

    combined_diagonal = distance(min_lat, min_lon, max_lat, max_lon)
    if combined_diagonal > self.max_dist_meters:
        # shard_station + disagreeing observations
        values.update(blocked_values())
        return ('moving', values)

    # shard_station + agreeing observations
    has_position = (shard_station.lat is not None and
                    shard_station.lon is not None)
    if has_position:
        old_weight = min((shard_station.samples or 0),
                         self.MAX_OLD_OBSERVATIONS)
    else:
        old_weight = 0

    # weighted average of the new centroid and the old position
    total_weight = obs_length + old_weight
    new_lat = ((obs_new_lat * obs_length +
                (shard_station.lat or 0.0) * old_weight) / total_weight)
    new_lon = ((obs_new_lon * obs_length +
                (shard_station.lon or 0.0) * old_weight) / total_weight)
    samples = (shard_station.samples or 0) + obs_length
    radius = circle_radius(
        new_lat, new_lon, max_lat, max_lon, min_lat, min_lon)

    # keep the stored region only while it still matches the new
    # position, otherwise determine it again
    region = shard_station.region
    if not region or not GEOCODER.in_region(new_lat, new_lon, region):
        region = GEOCODER.region(new_lat, new_lon)

    values.update({
        'lat': new_lat,
        'lon': new_lon,
        'max_lat': float(max_lat),
        'min_lat': float(min_lat),
        'max_lon': float(max_lon),
        'min_lon': float(min_lon),
        'radius': radius,
        'region': region,
        'samples': samples,
        'source': None,
        # use the exact same keys as in the moving case
        'block_first': shard_station.block_first,
        'block_last': shard_station.block_last,
        'block_count': shard_station.block_count,
    })
    return ('changed', values)
def search_all_sources(session, api_name, data,
                       client_addr=None, geoip_db=None,
                       api_key_log=False, api_key_name=None,
                       result_type='position'):
    """
    Common code-path for all lookup APIs, using
    WiFi, cell, cell-lac and GeoIP data sources.

    :param session: A database session for queries.
    :param api_name: A string to use in metrics (for example "geolocate").
    :param data: A dict conforming to the search API.
    :param client_addr: The IP address the request came from.
    :param geoip_db: The geoip database.
    :param api_key_log: Enable additional api key specific logging?
    :param api_key_name: The metric friendly api key name.
    :param result_type: What kind of result to return, either a lat/lon
                        position or a country estimate.

    :returns: None if no source produced a result. For the `position`
              result type a dict with rounded 'lat', 'lon' and
              'accuracy' values. For the `country` result type a dict
              with 'country_name' and 'country_code', or None if no
              candidate country is known.
    :raises ValueError: If result_type is neither `country`
                        nor `position`.
    """

    if result_type not in ('country', 'position'):
        raise ValueError('Invalid result_type, must be one of '
                         'position or country')

    stats_client = get_stats_client()
    heka_client = get_heka_client()

    # The best result so far and the name of the source it came from.
    result = None
    result_metric = None

    validated = {
        'wifi': [],
        'cell': [],
        'cell_lac': set(),
        'cell_network': [],
        'cell_lac_network': [],
    }

    # Pre-process wifi data
    for wifi in data.get('wifi', ()):
        wifi = normalized_wifi_dict(wifi)
        if wifi:
            validated['wifi'].append(wifi)

    # Pre-process cell data
    radio = RADIO_TYPE.get(data.get('radio', ''), -1)
    for cell in data.get('cell', ()):
        cell = normalized_cell_dict(cell, default_radio=radio)
        if cell:
            cell_key = to_cellkey(cell)
            validated['cell'].append(cell_key)
            validated['cell_lac'].add(cell_key)

    found_cells = []

    # Query cells, OCID cells and cell areas
    for model in Cell, OCIDCell, CellArea:
        cell_filter = []
        for key in validated['cell']:
            # create a list of 'and' criteria for cell keys
            criterion = join_cellkey(model, key)
            cell_filter.append(and_(*criterion))

        if cell_filter:
            # only do a query if we have cell results, or this will match
            # all rows in the table
            load_fields = ('radio', 'mcc', 'mnc', 'lac',
                           'lat', 'lon', 'range')
            query = (session.query(model).options(
                load_only(*load_fields)).filter(or_(*cell_filter)).filter(
                model.lat.isnot(None)).filter(model.lon.isnot(None)))

            try:
                found_cells.extend(query.all())
            except Exception:
                # A failed query is reported, the lookup goes on with
                # whatever has been found so far.
                heka_client.raven(RAVEN_ERROR)

    if found_cells:
        # Group all found_cells by location area
        lacs = defaultdict(list)
        for cell in found_cells:
            cellarea_key = (cell.radio, cell.mcc, cell.mnc, cell.lac)
            lacs[cellarea_key].append(cell)

        def sort_lac(v):
            # use the lac with the most values,
            # or the one with the smallest range
            return (len(v), -min([e.range for e in v]))

        # If we get data from multiple location areas, use the one with the
        # most data points in it. That way a lac with a cell hit will
        # have two entries and win over a lac with only the lac entry.
        lac = sorted(lacs.values(), key=sort_lac, reverse=True)

        for cell in lac[0]:
            # The first entry is the key,
            # used only to distinguish cell from lac
            network = Network(key=None, lat=cell.lat, lon=cell.lon,
                              range=cell.range)
            if type(cell) is CellArea:
                validated['cell_lac_network'].append(network)
            else:
                validated['cell_network'].append(network)

    # Always do a GeoIP lookup because it is cheap and we want to
    # report geoip vs. other data mismatches. We may also use
    # the full GeoIP City-level estimate as well, if all else fails.
    (geoip_res, countries) = geoip_and_best_guess_country_codes(
        validated['cell'], api_name, client_addr, geoip_db, stats_client)

    # First we attempt a "zoom-in" from cell-lac, to cell
    # to wifi, tightening our estimate each step only so
    # long as it doesn't contradict the existing best-estimate
    # nor the possible countries of origin.
    #
    # Each entry names the validated field deciding if the source is
    # tried at all, the validated field passed to the search function,
    # the name used in metrics and the search function itself.

    for (data_field, object_field, metric_name, search_fn) in [
            ('cell_lac', 'cell_lac_network', 'cell_lac', search_cell_lac),
            ('cell', 'cell_network', 'cell', search_cell),
            ('wifi', 'wifi', 'wifi', search_wifi)
    ]:

        if validated[data_field]:
            r = None
            try:
                r = search_fn(session, validated[object_field],
                              stats_client, api_name)
            except Exception:
                # A failing source is reported and then treated
                # like a source without a result.
                heka_client.raven(RAVEN_ERROR)
                stats_client.incr('%s.%s_error' %
                                  (api_name, metric_name))

            if r is None:
                stats_client.incr('%s.no_%s_found' %
                                  (api_name, metric_name))

            else:
                lat = float(r['lat'])
                lon = float(r['lon'])

                stats_client.incr('%s.%s_found' %
                                  (api_name, metric_name))

                # Check if the hit matches any of the possible countries.
                # NOTE(review): a mismatch is only counted as an anomaly
                # below, the hit itself is not skipped.
                country_match = False
                for country in countries:
                    if location_is_in_country(lat, lon, country, 1):
                        country_match = True
                        break

                if countries and not country_match:
                    stats_client.incr('%s.anomaly.%s_country_mismatch' %
                                      (api_name, metric_name))

                # Always accept the first result we get.
                if result is None:
                    result = r
                    result_metric = metric_name

                # Or any result that appears to be an improvement over the
                # existing best guess.
                # NOTE(review): the factor 1000 presumably converts a
                # distance in km into the unit of accuracy - confirm.
                elif (distance(float(result['lat']),
                               float(result['lon']), lat, lon) * 1000
                      <= result['accuracy']):
                    result = r
                    result_metric = metric_name

                else:
                    stats_client.incr('%s.anomaly.%s_%s_mismatch' %
                                      (api_name, metric_name, result_metric))

    # Fall back to GeoIP if nothing has worked yet. We do not
    # include this in the "zoom-in" loop because GeoIP is
    # frequently _wrong_ at the city level; we only want to
    # accept that estimate if we got nothing better from cell
    # or wifi.
    if not result and geoip_res:
        result = geoip_res
        result_metric = 'geoip'

    # Do detailed logging for some api keys
    if api_key_log and api_key_name:
        api_log_metric = None
        wifi_keys = set([w['key'] for w in validated['wifi']])
        if wifi_keys and \
           len(filter_bssids_by_similarity(wifi_keys)) >= MIN_WIFIS_IN_QUERY:
            # Only count requests as WiFi-based if they contain enough
            # distinct WiFi networks to pass our filters
            if result_metric == 'wifi':
                api_log_metric = 'wifi_hit'
            else:
                api_log_metric = 'wifi_miss'
        elif validated['cell']:
            if result_metric == 'cell':
                api_log_metric = 'cell_hit'
            elif result_metric == 'cell_lac':
                api_log_metric = 'cell_lac_hit'
            else:
                api_log_metric = 'cell_miss'
        else:
            if geoip_res:
                api_log_metric = 'geoip_hit'
            else:
                api_log_metric = 'geoip_miss'
        if api_log_metric:
            stats_client.incr('%s.api_log.%s.%s' % (
                api_name, api_key_name, api_log_metric))

    if not result:
        stats_client.incr('%s.miss' % api_name)
        return None

    stats_client.incr('%s.%s_hit' % (api_name, result_metric))

    if result_type == 'position':
        rounded_result = {
            'lat': round(result['lat'], DEGREE_DECIMAL_PLACES),
            'lon': round(result['lon'], DEGREE_DECIMAL_PLACES),
            'accuracy': round(result['accuracy'], DEGREE_DECIMAL_PLACES),
        }
        stats_client.timing('%s.accuracy.%s' % (api_name, result_metric),
                            rounded_result['accuracy'])
        return rounded_result
    elif result_type == 'country':
        # Without a candidate country nothing is returned explicitly,
        # so the caller gets None.
        if countries:
            country = iso3166.countries.get(countries[0])
            return {
                'country_name': country.name,
                'country_code': country.alpha2
            }