def process_batch(request, data, errors):
    nickname = request.headers.get('X-Nickname', u'')
    upload_items = flatten_items(data)
    errors = process_upload(nickname, upload_items)
    if errors is SENTINEL:
        return HTTPServiceUnavailable()
    if errors:
        get_stats_client().incr('geosubmit.upload.errors', len(errors))
    result = HTTPOk()
    result.content_type = 'application/json'
    result.body = '{}'
    return result
def process_single(request):
    stats_client = get_stats_client()
    locate_data, locate_errors = preprocess_request(
        request,
        schema=GeoLocateSchema(),
        extra_checks=(geolocate_validator, ),
        response=JSONParseError,
        accept_empty=True,
    )
    data, errors = preprocess_request(
        request,
        schema=GeoSubmitSchema(),
        extra_checks=(geosubmit_validator, ),
        response=None,
    )
    data = {'items': [data]}
    nickname = request.headers.get('X-Nickname', u'')
    email = request.headers.get('X-Email', u'')
    upload_items = flatten_items(data)
    errors = process_upload(nickname, email, upload_items)
    if errors is not SENTINEL and errors:  # pragma: no cover
        stats_client.incr('geosubmit.upload.errors', len(errors))

    first_item = data['items'][0]
    if first_item['latitude'] == -255 or first_item['longitude'] == -255:
        data = map_data(data['items'][0])
        session = request.db_slave_session
        result = search_all_sources(
            session, 'geosubmit', data,
            client_addr=request.client_addr,
            geoip_db=request.registry.geoip_db,
            api_key_log=getattr(request, 'api_key_log', False),
            api_key_name=getattr(request, 'api_key_name', None))
    else:
        result = {
            'lat': first_item['latitude'],
            'lon': first_item['longitude'],
            'accuracy': first_item['accuracy']
        }

    if result is None:
        stats_client.incr('geosubmit.miss')
        result = HTTPNotFound()
        result.content_type = 'application/json'
        result.body = NOT_FOUND
        return result

    return {
        "location": {
            "lat": result['lat'],
            "lng": result['lon'],
        },
        "accuracy": float(result['accuracy']),
    }
def process_batch(request, data, errors):
    nickname = request.headers.get('X-Nickname', u'')
    email = request.headers.get('X-Email', u'')
    upload_items = flatten_items(data)
    errors = process_upload(nickname, email, upload_items)
    if errors is SENTINEL:  # pragma: no cover
        return HTTPServiceUnavailable()
    if errors:  # pragma: no cover
        get_stats_client().incr('geosubmit.upload.errors', len(errors))
    result = HTTPOk()
    result.content_type = 'application/json'
    result.body = '{}'
    return result
def geoip_and_best_guess_country_codes(cell_keys, api_name,
                                       client_addr, geoip_db):
    """
    Return (geoip, alpha2) where geoip is the result of a GeoIP lookup
    and alpha2 is a best-guess ISO 3166 alpha2 country code. The country
    code guess uses both GeoIP and cell MCCs, preferring GeoIP. Return
    None for either field if no data is available.
    """
    stats_client = get_stats_client()
    geoip = None

    if client_addr and geoip_db is not None:
        geoip = geoip_db.geoip_lookup(client_addr)

    cell_countries = []
    cell_mccs = set()
    for cell_key in cell_keys:
        for c in mobile_codes.mcc(str(cell_key.mcc)):
            cell_countries.append(c.alpha2)
            cell_mccs.add(cell_key.mcc)

    if len(cell_mccs) > 1:
        stats_client.incr('%s.anomaly.multiple_mccs' % api_name)

    if geoip:
        # GeoIP always wins if we have it.
        accuracy, city = radius_from_geoip(geoip)
        if city:
            stats_client.incr('%s.geoip_city_found' % api_name)
        else:
            stats_client.incr('%s.geoip_country_found' % api_name)

        if geoip['country_code'] not in cell_countries:
            if cell_countries:
                stats_client.incr('%s.anomaly.geoip_mcc_mismatch' % api_name)
            # Only use the GeoIP country as an additional possible match,
            # but retain the cell countries as a likely match as well.
            cell_countries.append(geoip['country_code'])

        stats_client.incr('%s.country_from_geoip' % api_name)
        geoip_res = {
            'lat': geoip['latitude'],
            'lon': geoip['longitude'],
            'accuracy': accuracy
        }
        return (geoip_res, most_common_elements(cell_countries))
    else:
        stats_client.incr('%s.no_geoip_found' % api_name)

    # Pick the most-commonly-occurring country codes if we got any
    cc = most_common_elements(cell_countries)
    if cc:
        stats_client.incr('%s.country_from_mcc' % api_name)
        return (None, cc)

    stats_client.incr('%s.no_country' % api_name)
    return (None, [])
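# Hedged usage sketch (added for illustration, not part of the original
# module): shows the (geoip_res, countries) return shape documented in the
# docstring above. The 'geolocate' api_name and the request attributes are
# assumptions borrowed from the surrounding snippets.
def _example_country_guess(request, cell_keys):  # hypothetical helper
    geoip_res, countries = geoip_and_best_guess_country_codes(
        cell_keys, 'geolocate',
        request.client_addr, request.registry.geoip_db)
    if geoip_res is not None:
        # City- or country-level GeoIP estimate, usable as a fallback.
        return geoip_res['lat'], geoip_res['lon'], geoip_res['accuracy']
    # No GeoIP hit; countries may still hold MCC-derived alpha2 guesses.
    return countries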
def process_single(request):
    stats_client = get_stats_client()
    locate_data, locate_errors = preprocess_request(
        request,
        schema=GeoLocateSchema(),
        extra_checks=(geolocate_validator, ),
        response=JSONParseError,
        accept_empty=True,
    )
    data, errors = preprocess_request(
        request,
        schema=GeoSubmitSchema(),
        extra_checks=(geosubmit_validator,),
        response=None,
    )
    data = {'items': [data]}
    nickname = request.headers.get('X-Nickname', u'')
    email = request.headers.get('X-Email', u'')
    upload_items = flatten_items(data)
    errors = process_upload(nickname, email, upload_items)
    if errors is not SENTINEL and errors:  # pragma: no cover
        stats_client.incr('geosubmit.upload.errors', len(errors))

    first_item = data['items'][0]
    if first_item['latitude'] == -255 or first_item['longitude'] == -255:
        data = map_data(data['items'][0])
        session = request.db_slave_session
        result = search_all_sources(
            session, 'geosubmit', data,
            client_addr=request.client_addr,
            geoip_db=request.registry.geoip_db,
            api_key_log=getattr(request, 'api_key_log', False),
            api_key_name=getattr(request, 'api_key_name', None))
    else:
        result = {'lat': first_item['latitude'],
                  'lon': first_item['longitude'],
                  'accuracy': first_item['accuracy']}

    if result is None:
        stats_client.incr('geosubmit.miss')
        result = HTTPNotFound()
        result.content_type = 'application/json'
        result.body = NOT_FOUND
        return result

    return {
        "location": {
            "lat": result['lat'],
            "lng": result['lon'],
        },
        "accuracy": float(result['accuracy']),
    }
def blacklist_and_remove_moving_stations(session, blacklist_model,
                                         station_type, to_key, join_key,
                                         moving_stations, remove_station):
    moving_keys = []
    utcnow = util.utcnow()
    for station in moving_stations:
        key = to_key(station)
        query = session.query(blacklist_model).filter(
            *join_key(blacklist_model, key))
        b = query.first()
        d = key._asdict()
        moving_keys.append(d)
        if b:
            b.time = utcnow
            b.count += 1
        else:
            b = blacklist_model(**d)
            session.add(b)
    if moving_keys:
        get_stats_client().incr("items.blacklisted.%s_moving" % station_type,
                                len(moving_keys))
        remove_station.delay(moving_keys)
def blacklist_and_remove_moving_stations(session, blacklist_model,
                                         station_type, moving_stations,
                                         remove_station):
    moving_keys = []
    utcnow = util.utcnow()
    for station in moving_stations:
        station_key = blacklist_model.to_hashkey(station)
        query = session.query(blacklist_model).filter(
            *blacklist_model.joinkey(station_key))
        blacklisted_station = query.first()
        moving_keys.append(station_key)
        if blacklisted_station:
            blacklisted_station.time = utcnow
            blacklisted_station.count += 1
        else:
            blacklisted_station = blacklist_model(
                time=utcnow,
                count=1,
                **station_key.__dict__)
            session.add(blacklisted_station)
    if moving_keys:
        get_stats_client().incr("items.blacklisted.%s_moving" % station_type,
                                len(moving_keys))
        remove_station.delay(moving_keys)
def __init__(self, api_key_name, api_key_log, api_name):
    """
    A StatsLogger sends counted and timed named statistics to
    a statistic aggregator client.

    :param api_key_name: Human readable API key name
        (for example 'test_1')
    :type api_key_name: str
    :param api_key_log: Gather additional API key specific stats?
    :type api_key_log: bool
    :param api_name: Name of the API, used as stats prefix
        (for example 'geolocate')
    :type api_name: str
    """
    self.api_key_name = api_key_name
    self.api_key_log = api_key_log
    self.api_name = api_name
    self.raven_client = get_raven_client()
    self.stats_client = get_stats_client()
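# Hedged usage sketch (added for illustration): constructing the StatsLogger
# described by the docstring above and emitting one counter through it. The
# argument values are examples only.
def _example_stats_logger():  # hypothetical helper
    logger = StatsLogger(
        api_key_name='test_1',    # human readable API key name
        api_key_log=True,         # gather additional per-key stats
        api_name='geolocate')     # used as the stats prefix
    logger.stats_client.incr('%s.miss' % logger.api_name)
    return logger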
def process_station_measures(session, entries, station_type,
                             station_model, measure_model, blacklist_model,
                             create_measure, create_key, join_key,
                             userid=None, max_measures_per_station=11000,
                             utcnow=None):
    all_measures = []
    dropped_blacklisted = 0
    dropped_malformed = 0
    dropped_overflow = 0
    stats_client = get_stats_client()
    new_stations = 0
    if utcnow is None:
        utcnow = util.utcnow()
    elif isinstance(utcnow, basestring):
        utcnow = decode_datetime(utcnow)

    # Process entries and group by validated station key
    station_measures = defaultdict(list)
    for entry in entries:
        measure = create_measure(utcnow, entry)
        if not measure:
            dropped_malformed += 1
            continue
        station_measures[create_key(measure)].append(measure)

    # Process measures one station at a time
    for key, measures in station_measures.items():
        incomplete = False
        is_new_station = False

        # Figure out how much space is left for this station.
        free = available_station_space(session, key, station_model,
                                       join_key, max_measures_per_station)
        if free is None:
            is_new_station = True
            free = max_measures_per_station

        if is_new_station:
            # Drop measures for blacklisted stations.
            if blacklisted_station(session, key, blacklist_model,
                                   join_key, utcnow):
                dropped_blacklisted += len(measures)
                continue

            incomplete = incomplete_measure(key)
            if not incomplete:
                # We discovered an actual new complete station.
                new_stations += 1

        # Accept measures up to input-throttling limit, then drop.
        num = 0
        for measure in measures:
            if free <= 0:
                dropped_overflow += 1
                continue
            all_measures.append(measure)
            free -= 1
            num += 1

        # Accept incomplete measures, just don't make stations for them.
        # (station creation is a side effect of count-updating)
        if not incomplete and num > 0:
            create_or_update_station(session, key, station_model,
                                     join_key, utcnow, num)

    # Credit the user with discovering any new stations.
    if userid is not None and new_stations > 0:
        process_score(userid, new_stations, session,
                      key='new_' + station_type)

    if dropped_blacklisted != 0:
        stats_client.incr(
            "items.dropped.%s_ingress_blacklisted" % station_type,
            count=dropped_blacklisted)

    if dropped_malformed != 0:
        stats_client.incr(
            "items.dropped.%s_ingress_malformed" % station_type,
            count=dropped_malformed)

    if dropped_overflow != 0:
        stats_client.incr(
            "items.dropped.%s_ingress_overflow" % station_type,
            count=dropped_overflow)

    stats_client.incr(
        "items.inserted.%s_measures" % station_type,
        count=len(all_measures))

    session.add_all(all_measures)
    return all_measures
def stats_client(self):
    return get_stats_client()
def search_all_sources(session, api_name, data,
                       client_addr=None, geoip_db=None,
                       api_key_log=False, api_key_name=None,
                       result_type='position'):
    """
    Common code-path for all lookup APIs, using
    WiFi, cell, cell-lac and GeoIP data sources.

    :param session: A database session for queries.
    :param api_name: A string to use in metrics (for example "geolocate").
    :param data: A dict conforming to the search API.
    :param client_addr: The IP address the request came from.
    :param geoip_db: The geoip database.
    :param api_key_log: Enable additional api key specific logging?
    :param api_key_name: The metric friendly api key name.
    :param result_type: What kind of result to return, either a lat/lon
        position or a country estimate.
    """
    if result_type not in ('country', 'position'):
        raise ValueError('Invalid result_type, must be one of '
                         'position or country')

    stats_client = get_stats_client()
    heka_client = get_heka_client()

    result = None
    result_metric = None

    validated = {
        'wifi': [],
        'cell': [],
        'cell_lac': set(),
        'cell_network': [],
        'cell_lac_network': [],
    }

    # Pre-process wifi data
    for wifi in data.get('wifi', ()):
        wifi = normalized_wifi_dict(wifi)
        if wifi:
            validated['wifi'].append(wifi)

    # Pre-process cell data
    radio = RADIO_TYPE.get(data.get('radio', ''), -1)
    for cell in data.get('cell', ()):
        cell = normalized_cell_dict(cell, default_radio=radio)
        if cell:
            cell_key = to_cellkey(cell)
            validated['cell'].append(cell_key)
            validated['cell_lac'].add(cell_key)

    found_cells = []

    # Query all cells and OCID cells
    for model in Cell, OCIDCell, CellArea:
        cell_filter = []
        for key in validated['cell']:
            # create a list of 'and' criteria for cell keys
            criterion = join_cellkey(model, key)
            cell_filter.append(and_(*criterion))

        if cell_filter:
            # only do a query if we have cell results, or this will match
            # all rows in the table
            load_fields = ('radio', 'mcc', 'mnc', 'lac',
                           'lat', 'lon', 'range')
            query = (session.query(model)
                            .options(load_only(*load_fields))
                            .filter(or_(*cell_filter))
                            .filter(model.lat.isnot(None))
                            .filter(model.lon.isnot(None)))

            try:
                found_cells.extend(query.all())
            except Exception:
                heka_client.raven(RAVEN_ERROR)

    if found_cells:
        # Group all found_cells by location area
        lacs = defaultdict(list)
        for cell in found_cells:
            cellarea_key = (cell.radio, cell.mcc, cell.mnc, cell.lac)
            lacs[cellarea_key].append(cell)

        def sort_lac(v):
            # use the lac with the most values,
            # or the one with the smallest range
            return (len(v), -min([e.range for e in v]))

        # If we get data from multiple location areas, use the one with the
        # most data points in it. That way a lac with a cell hit will
        # have two entries and win over a lac with only the lac entry.
        lac = sorted(lacs.values(), key=sort_lac, reverse=True)

        for cell in lac[0]:
            # The first entry is the key,
            # used only to distinguish cell from lac
            network = Network(
                key=None,
                lat=cell.lat,
                lon=cell.lon,
                range=cell.range)
            if type(cell) is CellArea:
                validated['cell_lac_network'].append(network)
            else:
                validated['cell_network'].append(network)

    # Always do a GeoIP lookup because it is cheap and we want to
    # report geoip vs. other data mismatches. We may also use
    # the full GeoIP City-level estimate as well, if all else fails.
    (geoip_res, countries) = geoip_and_best_guess_country_codes(
        validated['cell'], api_name, client_addr, geoip_db, stats_client)

    # First we attempt a "zoom-in" from cell-lac, to cell
    # to wifi, tightening our estimate each step only so
    # long as it doesn't contradict the existing best-estimate
    # nor the possible countries of origin.
    for (data_field, object_field, metric_name, search_fn) in [
            ('cell_lac', 'cell_lac_network', 'cell_lac', search_cell_lac),
            ('cell', 'cell_network', 'cell', search_cell),
            ('wifi', 'wifi', 'wifi', search_wifi)]:
        if validated[data_field]:
            r = None
            try:
                r = search_fn(session, validated[object_field],
                              stats_client, api_name)
            except Exception:
                heka_client.raven(RAVEN_ERROR)
                stats_client.incr('%s.%s_error' %
                                  (api_name, metric_name))

            if r is None:
                stats_client.incr('%s.no_%s_found' %
                                  (api_name, metric_name))
            else:
                lat = float(r['lat'])
                lon = float(r['lon'])

                stats_client.incr('%s.%s_found' %
                                  (api_name, metric_name))

                # Skip any hit that matches none of the possible countries.
                country_match = False
                for country in countries:
                    if location_is_in_country(lat, lon, country, 1):
                        country_match = True
                        break

                if countries and not country_match:
                    stats_client.incr('%s.anomaly.%s_country_mismatch' %
                                      (api_name, metric_name))

                # Always accept the first result we get.
                if result is None:
                    result = r
                    result_metric = metric_name

                # Or any result that appears to be an improvement over the
                # existing best guess.
                elif (distance(float(result['lat']),
                               float(result['lon']), lat, lon) * 1000
                      <= result['accuracy']):
                    result = r
                    result_metric = metric_name

                else:
                    stats_client.incr('%s.anomaly.%s_%s_mismatch' %
                                      (api_name, metric_name, result_metric))

    # Fall back to GeoIP if nothing has worked yet. We do not
    # include this in the "zoom-in" loop because GeoIP is
    # frequently _wrong_ at the city level; we only want to
    # accept that estimate if we got nothing better from cell
    # or wifi.
    if not result and geoip_res:
        result = geoip_res
        result_metric = 'geoip'

    # Do detailed logging for some api keys
    if api_key_log and api_key_name:
        api_log_metric = None
        wifi_keys = set([w['key'] for w in validated['wifi']])
        if wifi_keys and \
           len(filter_bssids_by_similarity(wifi_keys)) >= MIN_WIFIS_IN_QUERY:
            # Only count requests as WiFi-based if they contain enough
            # distinct WiFi networks to pass our filters
            if result_metric == 'wifi':
                api_log_metric = 'wifi_hit'
            else:
                api_log_metric = 'wifi_miss'
        elif validated['cell']:
            if result_metric == 'cell':
                api_log_metric = 'cell_hit'
            elif result_metric == 'cell_lac':
                api_log_metric = 'cell_lac_hit'
            else:
                api_log_metric = 'cell_miss'
        else:
            if geoip_res:
                api_log_metric = 'geoip_hit'
            else:
                api_log_metric = 'geoip_miss'
        if api_log_metric:
            stats_client.incr('%s.api_log.%s.%s' % (
                api_name, api_key_name, api_log_metric))

    if not result:
        stats_client.incr('%s.miss' % api_name)
        return None

    stats_client.incr('%s.%s_hit' % (api_name, result_metric))

    if result_type == 'position':
        rounded_result = {
            'lat': round(result['lat'], DEGREE_DECIMAL_PLACES),
            'lon': round(result['lon'], DEGREE_DECIMAL_PLACES),
            'accuracy': round(result['accuracy'], DEGREE_DECIMAL_PLACES),
        }
        stats_client.timing('%s.accuracy.%s' % (api_name, result_metric),
                            rounded_result['accuracy'])
        return rounded_result
    elif result_type == 'country':
        if countries:
            country = iso3166.countries.get(countries[0])
            return {
                'country_name': country.name,
                'country_code': country.alpha2
            }
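# Hedged usage sketch (added for illustration): how a lookup view might call
# search_all_sources for both result types. The api_name values and the
# request attributes mirror the surrounding snippets but are assumptions
# here, not part of the original module.
def _example_lookup(request, data):  # hypothetical helper
    session = request.db_slave_session
    position = search_all_sources(
        session, 'geosubmit', data,
        client_addr=request.client_addr,
        geoip_db=request.registry.geoip_db,
        result_type='position')
    # position is None on a miss, otherwise
    # {'lat': ..., 'lon': ..., 'accuracy': ...}
    country = search_all_sources(
        session, 'country', data,
        client_addr=request.client_addr,
        geoip_db=request.registry.geoip_db,
        result_type='country')
    # country is {'country_name': ..., 'country_code': ...} when a
    # best-guess country could be determined.
    return position, country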
def process_observations(observations, session,
                         userid=None,
                         api_key_log=False,
                         api_key_name=None):
    stats_client = get_stats_client()
    positions = []
    cell_observations = []
    wifi_observations = []
    for i, obs in enumerate(observations):
        obs['report_id'] = uuid.uuid1()
        cell, wifi = process_observation(obs, session)
        cell_observations.extend(cell)
        wifi_observations.extend(wifi)
        if cell or wifi:
            positions.append({
                'lat': obs['lat'],
                'lon': obs['lon'],
            })

    if cell_observations:
        # group by and create task per cell key
        stats_client.incr('items.uploaded.cell_observations',
                          len(cell_observations))
        if api_key_log:
            stats_client.incr(
                'items.api_log.%s.uploaded.cell_observations' % api_key_name,
                len(cell_observations))

        cells = defaultdict(list)
        for obs in cell_observations:
            cells[CellObservation.to_hashkey(obs)].append(obs)

        # Create a task per group of 5 cell keys at a time.
        # Grouping them helps in avoiding per-task overhead.
        cells = list(cells.values())
        batch_size = 5
        countdown = 0
        for i in range(0, len(cells), batch_size):
            values = []
            for observations in cells[i:i + batch_size]:
                values.extend(observations)
            # insert observations, expire the task if it wasn't processed
            # after six hours to avoid queue overload, also delay
            # each task by one second more, to get a more even workload
            # and avoid parallel updates of the same underlying stations
            insert_measures_cell.apply_async(
                args=[values],
                kwargs={'userid': userid},
                expires=21600,
                countdown=countdown)
            countdown += 1

    if wifi_observations:
        # group by WiFi key
        stats_client.incr('items.uploaded.wifi_observations',
                          len(wifi_observations))
        if api_key_log:
            stats_client.incr(
                'items.api_log.%s.uploaded.wifi_observations' % api_key_name,
                len(wifi_observations))

        wifis = defaultdict(list)
        for obs in wifi_observations:
            wifis[WifiObservation.to_hashkey(obs)].append(obs)

        # Create a task per group of 20 WiFi keys at a time.
        # We tend to get a huge number of unique WiFi networks per
        # batch upload, with one to very few observations per WiFi.
        # Grouping them helps in avoiding per-task overhead.
        wifis = list(wifis.values())
        batch_size = 20
        countdown = 0
        for i in range(0, len(wifis), batch_size):
            values = []
            for observations in wifis[i:i + batch_size]:
                values.extend(observations)
            # insert observations, expire the task if it wasn't processed
            # after six hours to avoid queue overload, also delay
            # each task by one second more, to get a more even workload
            # and avoid parallel updates of the same underlying stations
            insert_measures_wifi.apply_async(
                args=[values],
                kwargs={'userid': userid},
                expires=21600,
                countdown=countdown)
            countdown += 1

    if userid is not None:
        process_score(userid, len(positions), session)
    if positions:
        process_mapstat(session, positions)
def search_all_sources(session, api_name, data,
                       client_addr=None, geoip_db=None):
    """
    Common code-path for all lookup APIs, using
    WiFi, cell, cell-lac and GeoIP data sources.

    :param session: A database session for queries.
    :param api_name: A string to use in metrics (for example "geolocate").
    :param data: A dict conforming to the search API.
    :param client_addr: The IP address the request came from.
    :param geoip_db: The geoip database.
    """
    stats_client = get_stats_client()
    heka_client = get_heka_client()

    result = None
    result_metric = None

    validated = {
        'wifi': [],
        'cell': [],
        'cell_lac': set(),
        'cell_network': [],
        'cell_lac_network': [],
    }

    # Pass-through wifi data
    validated['wifi'] = data.get('wifi', [])

    # Pre-process cell data
    radio = RADIO_TYPE.get(data.get('radio', ''), -1)
    for cell in data.get('cell', ()):
        cell = normalized_cell_dict(cell, default_radio=radio)
        if cell:
            cell_key = to_cellkey(cell)
            validated['cell'].append(cell_key)
            validated['cell_lac'].add(cell_key._replace(cid=CELLID_LAC))

    # Merge all possible cell and lac keys into one list
    all_cell_keys = []
    all_cell_keys.extend(validated['cell'])
    for key in validated['cell_lac']:
        all_cell_keys.append(key)

    # Do a single query for all cells and lacs at the same time
    try:
        all_networks = query_cell_networks(session, all_cell_keys)
    except Exception:
        heka_client.raven(RAVEN_ERROR)
        all_networks = []
    for network in all_networks:
        if network.key == CELLID_LAC:
            validated['cell_lac_network'].append(network)
        else:
            validated['cell_network'].append(network)

    # Always do a GeoIP lookup because it is cheap and we want to
    # report geoip vs. other data mismatches. We may also use
    # the full GeoIP City-level estimate as well, if all else fails.
    (geoip_res, countries) = geoip_and_best_guess_country_codes(
        validated['cell'], api_name, client_addr, geoip_db)

    # First we attempt a "zoom-in" from cell-lac, to cell
    # to wifi, tightening our estimate each step only so
    # long as it doesn't contradict the existing best-estimate
    # nor the possible countries of origin.
    for (data_field, object_field, metric_name, search_fn) in [
            ('cell_lac', 'cell_lac_network', 'cell_lac', search_cell_lac),
            ('cell', 'cell_network', 'cell', search_cell),
            ('wifi', 'wifi', 'wifi', search_wifi)]:
        if validated[data_field]:
            r = None
            try:
                r = search_fn(session, validated[object_field])
            except Exception:
                heka_client.raven(RAVEN_ERROR)
                stats_client.incr('%s.%s_error' %
                                  (api_name, metric_name))

            if r is None:
                stats_client.incr('%s.no_%s_found' %
                                  (api_name, metric_name))
            else:
                lat = float(r['lat'])
                lon = float(r['lon'])

                stats_client.incr('%s.%s_found' %
                                  (api_name, metric_name))

                # Skip any hit that matches none of the possible countries.
                country_match = False
                for country in countries:
                    if location_is_in_country(lat, lon, country, 1):
                        country_match = True
                        break

                if countries and not country_match:
                    stats_client.incr('%s.anomaly.%s_country_mismatch' %
                                      (api_name, metric_name))

                # Always accept the first result we get.
                if result is None:
                    result = r
                    result_metric = metric_name

                # Or any result that appears to be an improvement over the
                # existing best guess.
                elif (distance(float(result['lat']),
                               float(result['lon']), lat, lon) * 1000
                      <= result['accuracy']):
                    result = r
                    result_metric = metric_name

                else:
                    stats_client.incr('%s.anomaly.%s_%s_mismatch' %
                                      (api_name, metric_name, result_metric))

    # Fall back to GeoIP if nothing has worked yet. We do not
    # include this in the "zoom-in" loop because GeoIP is
    # frequently _wrong_ at the city level; we only want to
    # accept that estimate if we got nothing better from cell
    # or wifi.
    if not result and geoip_res:
        result = geoip_res
        result_metric = 'geoip'

    if not result:
        stats_client.incr('%s.miss' % api_name)
        return None

    rounded_result = {
        'lat': round(result['lat'], DEGREE_DECIMAL_PLACES),
        'lon': round(result['lon'], DEGREE_DECIMAL_PLACES),
        'accuracy': round(result['accuracy'], DEGREE_DECIMAL_PLACES),
    }

    stats_client.incr('%s.%s_hit' % (api_name, result_metric))
    stats_client.timing('%s.accuracy.%s' % (api_name, result_metric),
                        rounded_result['accuracy'])
    return rounded_result
def process_station_observations(session, entries, station_type,
                                 station_model, observation_model,
                                 blacklist_model, userid=None,
                                 max_observations_per_station=11000,
                                 utcnow=None):
    all_observations = []
    dropped_blacklisted = 0
    dropped_malformed = 0
    dropped_overflow = 0
    stats_client = get_stats_client()
    new_stations = 0
    if utcnow is None:
        utcnow = util.utcnow()

    # Process entries and group by validated station key
    station_observations = defaultdict(list)
    for entry in entries:
        entry['created'] = utcnow
        obs = observation_model.create(entry)
        if not obs:
            dropped_malformed += 1
            continue
        station_observations[obs.hashkey()].append(obs)

    # Process observations one station at a time
    for key, observations in station_observations.items():
        first_blacklisted = None
        incomplete = False
        is_new_station = False

        # Figure out how much space is left for this station.
        free = available_station_space(session, key, station_model,
                                       max_observations_per_station)
        if free is None:
            is_new_station = True
            free = max_observations_per_station

        if is_new_station:
            # Drop observations for blacklisted stations.
            blacklisted, first_blacklisted = blacklisted_station(
                session, key, blacklist_model, utcnow)
            if blacklisted:
                dropped_blacklisted += len(observations)
                continue

            incomplete = incomplete_observation(station_type, key)
            if not incomplete:
                # We discovered an actual new complete station.
                new_stations += 1

        # Accept observations up to input-throttling limit, then drop.
        num = 0
        for obs in observations:
            if free <= 0:
                dropped_overflow += 1
                continue
            all_observations.append(obs)
            free -= 1
            num += 1

        # Accept incomplete observations, just don't make stations for them.
        # (station creation is a side effect of count-updating)
        if not incomplete and num > 0:
            create_or_update_station(session, key, station_model,
                                     utcnow, num, first_blacklisted)

    # Credit the user with discovering any new stations.
    if userid is not None and new_stations > 0:
        process_score(userid, new_stations, session,
                      key='new_' + station_type)

    if dropped_blacklisted != 0:
        stats_client.incr(
            'items.dropped.%s_ingress_blacklisted' % station_type,
            count=dropped_blacklisted)

    if dropped_malformed != 0:
        stats_client.incr(
            'items.dropped.%s_ingress_malformed' % station_type,
            count=dropped_malformed)

    if dropped_overflow != 0:
        stats_client.incr(
            'items.dropped.%s_ingress_overflow' % station_type,
            count=dropped_overflow)

    stats_client.incr(
        'items.inserted.%s_observations' % station_type,
        count=len(all_observations))

    session.add_all(all_observations)
    return all_observations
def search_all_sources(session, api_name, data,
                       client_addr=None, geoip_db=None,
                       api_key_log=False, api_key_name=None):
    """
    Common code-path for all lookup APIs, using
    WiFi, cell, cell-lac and GeoIP data sources.

    :param session: A database session for queries.
    :param api_name: A string to use in metrics (for example "geolocate").
    :param data: A dict conforming to the search API.
    :param client_addr: The IP address the request came from.
    :param geoip_db: The geoip database.
    """
    stats_client = get_stats_client()
    heka_client = get_heka_client()

    result = None
    result_metric = None

    validated = {
        'wifi': [],
        'cell': [],
        'cell_lac': set(),
        'cell_network': [],
        'cell_lac_network': [],
    }

    # Pre-process wifi data
    for wifi in data.get('wifi', ()):
        wifi = normalized_wifi_dict(wifi)
        if wifi:
            validated['wifi'].append(wifi)

    # Pre-process cell data
    radio = RADIO_TYPE.get(data.get('radio', ''), -1)
    for cell in data.get('cell', ()):
        cell = normalized_cell_dict(cell, default_radio=radio)
        if cell:
            cell_key = to_cellkey(cell)
            validated['cell'].append(cell_key)
            validated['cell_lac'].add(cell_key._replace(cid=CELLID_LAC))

    # Merge all possible cell and lac keys into one list
    all_cell_keys = []
    all_cell_keys.extend(validated['cell'])
    for key in validated['cell_lac']:
        all_cell_keys.append(key)

    # Do a single query for all cells and lacs at the same time
    try:
        all_networks = query_cell_networks(session, all_cell_keys)
    except Exception:
        heka_client.raven(RAVEN_ERROR)
        all_networks = []
    for network in all_networks:
        if network.key == CELLID_LAC:
            validated['cell_lac_network'].append(network)
        else:
            validated['cell_network'].append(network)

    # Always do a GeoIP lookup because it is cheap and we want to
    # report geoip vs. other data mismatches. We may also use
    # the full GeoIP City-level estimate as well, if all else fails.
    (geoip_res, countries) = geoip_and_best_guess_country_codes(
        validated['cell'], api_name, client_addr, geoip_db)

    # First we attempt a "zoom-in" from cell-lac, to cell
    # to wifi, tightening our estimate each step only so
    # long as it doesn't contradict the existing best-estimate
    # nor the possible countries of origin.
    for (data_field, object_field, metric_name, search_fn) in [
            ('cell_lac', 'cell_lac_network', 'cell_lac', search_cell_lac),
            ('cell', 'cell_network', 'cell', search_cell),
            ('wifi', 'wifi', 'wifi', search_wifi)]:
        if validated[data_field]:
            r = None
            try:
                r = search_fn(session, validated[object_field])
            except Exception:
                heka_client.raven(RAVEN_ERROR)
                stats_client.incr('%s.%s_error' %
                                  (api_name, metric_name))

            if r is None:
                stats_client.incr('%s.no_%s_found' %
                                  (api_name, metric_name))
            else:
                lat = float(r['lat'])
                lon = float(r['lon'])

                stats_client.incr('%s.%s_found' %
                                  (api_name, metric_name))

                # Skip any hit that matches none of the possible countries.
                country_match = False
                for country in countries:
                    if location_is_in_country(lat, lon, country, 1):
                        country_match = True
                        break

                if countries and not country_match:
                    stats_client.incr('%s.anomaly.%s_country_mismatch' %
                                      (api_name, metric_name))

                # Always accept the first result we get.
                if result is None:
                    result = r
                    result_metric = metric_name

                # Or any result that appears to be an improvement over the
                # existing best guess.
                elif (distance(float(result['lat']),
                               float(result['lon']), lat, lon) * 1000
                      <= result['accuracy']):
                    result = r
                    result_metric = metric_name

                else:
                    stats_client.incr('%s.anomaly.%s_%s_mismatch' %
                                      (api_name, metric_name, result_metric))

    # Fall back to GeoIP if nothing has worked yet. We do not
    # include this in the "zoom-in" loop because GeoIP is
    # frequently _wrong_ at the city level; we only want to
    # accept that estimate if we got nothing better from cell
    # or wifi.
    if not result and geoip_res:
        result = geoip_res
        result_metric = 'geoip'

    # Do detailed logging for some api keys
    if api_key_log and api_key_name:
        api_log_metric = None
        wifi_keys = set([w['key'] for w in validated['wifi']])
        if wifi_keys and \
           len(filter_bssids_by_similarity(wifi_keys)) >= MIN_WIFIS_IN_QUERY:
            # Only count requests as WiFi-based if they contain enough
            # distinct WiFi networks to pass our filters
            if result_metric == 'wifi':
                api_log_metric = 'wifi_hit'
            else:
                api_log_metric = 'wifi_miss'
        elif validated['cell']:
            if result_metric == 'cell':
                api_log_metric = 'cell_hit'
            elif result_metric == 'cell_lac':
                api_log_metric = 'cell_lac_hit'
            else:
                api_log_metric = 'cell_miss'
        else:
            if geoip_res:
                api_log_metric = 'geoip_hit'
            else:
                api_log_metric = 'geoip_miss'
        if api_log_metric:
            stats_client.incr('%s.api_log.%s.%s' % (
                api_name, api_key_name, api_log_metric))

    if not result:
        stats_client.incr('%s.miss' % api_name)
        return None

    rounded_result = {
        'lat': round(result['lat'], DEGREE_DECIMAL_PLACES),
        'lon': round(result['lon'], DEGREE_DECIMAL_PLACES),
        'accuracy': round(result['accuracy'], DEGREE_DECIMAL_PLACES),
    }

    stats_client.incr('%s.%s_hit' % (api_name, result_metric))
    stats_client.timing('%s.accuracy.%s' % (api_name, result_metric),
                        rounded_result['accuracy'])
    return rounded_result
def process_measures(items, session, userid=None):
    stats_client = get_stats_client()
    positions = []
    cell_measures = []
    wifi_measures = []
    for i, item in enumerate(items):
        item['report_id'] = uuid.uuid1().hex
        cell, wifi = process_measure(item, session)
        cell_measures.extend(cell)
        wifi_measures.extend(wifi)
        if cell or wifi:
            positions.append({
                'lat': item['lat'],
                'lon': item['lon'],
            })

    if cell_measures:
        # group by and create task per cell key
        stats_client.incr("items.uploaded.cell_measures",
                          len(cell_measures))
        cells = defaultdict(list)
        for measure in cell_measures:
            cells[to_cellkey_psc(measure)].append(measure)

        # Create a task per group of 5 cell keys at a time.
        # Grouping them helps in avoiding per-task overhead.
        cells = list(cells.values())
        batch_size = 5
        countdown = 0
        for i in range(0, len(cells), batch_size):
            values = []
            for measures in cells[i:i + batch_size]:
                values.extend(measures)
            # insert measures, expire the task if it wasn't processed
            # after six hours to avoid queue overload, also delay
            # each task by one second more, to get a more even workload
            # and avoid parallel updates of the same underlying stations
            insert_measures_cell.apply_async(
                args=[values],
                kwargs={'userid': userid},
                expires=21600,
                countdown=countdown)
            countdown += 1

    if wifi_measures:
        # group by WiFi key
        stats_client.incr("items.uploaded.wifi_measures",
                          len(wifi_measures))
        wifis = defaultdict(list)
        for measure in wifi_measures:
            wifis[measure['key']].append(measure)

        # Create a task per group of 20 WiFi keys at a time.
        # We tend to get a huge number of unique WiFi networks per
        # batch upload, with one to very few measures per WiFi.
        # Grouping them helps in avoiding per-task overhead.
        wifis = list(wifis.values())
        batch_size = 20
        countdown = 0
        for i in range(0, len(wifis), batch_size):
            values = []
            for measures in wifis[i:i + batch_size]:
                values.extend(measures)
            # insert measures, expire the task if it wasn't processed
            # after six hours to avoid queue overload, also delay
            # each task by one second more, to get a more even workload
            # and avoid parallel updates of the same underlying stations
            insert_measures_wifi.apply_async(
                args=[values],
                kwargs={'userid': userid},
                expires=21600,
                countdown=countdown)
            countdown += 1

    if userid is not None:
        process_score(userid, len(positions), session)
    if positions:
        process_mapstat(session, positions)
def process_station_measures(session, entries, station_type,
                             station_model, measure_model, blacklist_model,
                             create_measure, create_key, join_key,
                             userid=None, max_measures_per_station=11000,
                             utcnow=None):
    all_measures = []
    dropped_blacklisted = 0
    dropped_malformed = 0
    dropped_overflow = 0
    stats_client = get_stats_client()
    new_stations = 0
    if utcnow is None:
        utcnow = util.utcnow()
    elif isinstance(utcnow, basestring):
        utcnow = decode_datetime(utcnow)

    # Process entries and group by validated station key
    station_measures = defaultdict(list)
    for entry in entries:
        measure = create_measure(utcnow, entry)
        if not measure:
            dropped_malformed += 1
            continue
        station_measures[create_key(measure)].append(measure)

    # Process measures one station at a time
    for key, measures in station_measures.items():
        incomplete = False
        is_new_station = False

        # Figure out how much space is left for this station.
        free = available_station_space(session, key, station_model,
                                       join_key, max_measures_per_station)
        if free is None:
            is_new_station = True
            free = max_measures_per_station

        if is_new_station:
            # Drop measures for blacklisted stations.
            if blacklisted_station(session, key, blacklist_model,
                                   join_key, utcnow):
                dropped_blacklisted += len(measures)
                continue

            incomplete = incomplete_measure(key)
            if not incomplete:
                # We discovered an actual new complete station.
                new_stations += 1

        # Accept measures up to input-throttling limit, then drop.
        num = 0
        for measure in measures:
            if free <= 0:
                dropped_overflow += 1
                continue
            all_measures.append(measure)
            free -= 1
            num += 1

        # Accept incomplete measures, just don't make stations for them.
        # (station creation is a side effect of count-updating)
        if not incomplete and num > 0:
            create_or_update_station(session, key, station_model,
                                     join_key, utcnow, num)

    # Credit the user with discovering any new stations.
    if userid is not None and new_stations > 0:
        process_score(userid, new_stations, session,
                      key='new_' + station_type)

    if dropped_blacklisted != 0:
        stats_client.incr(
            "items.dropped.%s_ingress_blacklisted" % station_type,
            count=dropped_blacklisted)

    if dropped_malformed != 0:
        stats_client.incr(
            "items.dropped.%s_ingress_malformed" % station_type,
            count=dropped_malformed)

    if dropped_overflow != 0:
        stats_client.incr(
            "items.dropped.%s_ingress_overflow" % station_type,
            count=dropped_overflow)

    stats_client.incr(
        "items.inserted.%s_measures" % station_type,
        count=len(all_measures))

    session.add_all(all_measures)
    return all_measures