def search_cell(session, data):
    """Estimate a position as the average of all matching cell towers.

    :param session: A database session.
    :param data: A search API dict with a 'radio' fallback and a
        'cell' list of per-cell dicts (mcc/mnc/lac/cid, optional radio).
    :returns: A dict with lat/lon/accuracy, or None if nothing matched.
    """
    sql_null = None  # avoid pep8 warning
    # Request-level fallback radio type; a cell may carry its own.
    default_radio = RADIO_TYPE.get(data['radio'], -1)
    cells = []
    for cell in data['cell']:
        # BUG FIX: compute the radio per cell instead of overwriting the
        # shared fallback -- previously a cell specifying its own 'radio'
        # leaked that value into every later cell that omitted it.
        if cell.get('radio'):
            radio = RADIO_TYPE.get(cell['radio'], -1)
        else:
            radio = default_radio
        query = session.query(Cell.lat, Cell.lon).filter(
            Cell.radio == radio).filter(
            Cell.mcc == cell['mcc']).filter(
            Cell.mnc == cell['mnc']).filter(
            Cell.lac == cell['lac']).filter(
            Cell.cid == cell['cid']).filter(
            Cell.lat != sql_null).filter(
            Cell.lon != sql_null
        )
        result = query.first()
        if result is not None:
            cells.append(result)

    if not cells:
        return

    length = len(cells)
    avg_lat = sum([c[0] for c in cells]) / length
    avg_lon = sum([c[1] for c in cells]) / length
    return {
        'lat': quantize(avg_lat),
        'lon': quantize(avg_lon),
        'accuracy': 35000,
    }
def search_cell(session, data):
    """Estimate a position as the average of all matching cell towers.

    :param session: A database session.
    :param data: A search API dict with a 'radio' fallback and a
        'cell' list of per-cell dicts (mcc/mnc/lac/cid, optional radio).
    :returns: A dict with lat/lon/accuracy, or None if nothing matched.
    """
    sql_null = None  # avoid pep8 warning
    # Request-level fallback radio type; a cell may carry its own.
    default_radio = RADIO_TYPE.get(data['radio'], -1)
    cells = []
    for cell in data['cell']:
        # BUG FIX: compute the radio per cell instead of overwriting the
        # shared fallback -- previously a cell specifying its own 'radio'
        # leaked that value into every later cell that omitted it.
        if cell.get('radio'):
            radio = RADIO_TYPE.get(cell['radio'], -1)
        else:
            radio = default_radio
        query = session.query(Cell.lat, Cell.lon).filter(
            Cell.radio == radio).filter(Cell.mcc == cell['mcc']).filter(
            Cell.mnc == cell['mnc']).filter(
            Cell.lac == cell['lac']).filter(
            Cell.cid == cell['cid']).filter(
            Cell.lat != sql_null).filter(Cell.lon != sql_null)
        result = query.first()
        if result is not None:
            cells.append(result)

    if not cells:
        return

    length = len(cells)
    avg_lat = sum([c[0] for c in cells]) / length
    avg_lon = sum([c[1] for c in cells]) / length
    return {
        'lat': quantize(avg_lat),
        'lon': quantize(avg_lon),
        'accuracy': 35000,
    }
def search_cell(session, data):
    """Look up the position of the first cell in the request.

    Only the first entry of data['cell'] is considered.  Returns a dict
    with lat/lon/accuracy or None when no matching tower is known.
    """
    radio = RADIO_TYPE.get(data['radio'], -1)
    cell = data['cell'][0]
    # A per-cell radio value takes precedence over the request default.
    if cell.get('radio'):
        radio = RADIO_TYPE.get(cell['radio'], -1)

    query = session.query(Cell).filter(
        Cell.radio == radio).filter(
        Cell.mcc == cell['mcc']).filter(
        Cell.mnc == cell['mnc']).filter(
        Cell.cid == cell['cid'])
    # Negative lac means "unknown"; only filter on real values.
    if cell['lac'] >= 0:
        query = query.filter(Cell.lac == cell['lac'])

    result = query.first()
    if result is None:
        return

    return {
        'lat': quantize(result.lat),
        'lon': quantize(result.lon),
        'accuracy': 35000,
    }
def search_cell_lac(session, data):
    """Estimate a position from the smallest matching location area.

    Normalizes each cell, rewrites its cid to the LAC marker and looks
    up the LAC pseudo-cell.  Returns lat/lon/accuracy or None.
    """
    radio = RADIO_TYPE.get(data['radio'], -1)
    lacs = []
    for raw in data['cell']:
        cell = normalized_cell_dict(raw, default_radio=radio)
        if not cell:
            continue
        cell['cid'] = CELLID_LAC
        key = to_cellkey(cell)
        row = session.query(Cell.lat, Cell.lon, Cell.range).filter(
            *join_cellkey(Cell, key)).filter(
            Cell.lat.isnot(None)).filter(
            Cell.lon.isnot(None)).first()
        if row is not None:
            lacs.append(Network(key, *row))

    if not lacs:
        return

    # Take the smallest LAC of any the user is inside.
    lac = min(lacs, key=operator.attrgetter('range'))
    return {
        'lat': quantize(lac.lat),
        'lon': quantize(lac.lon),
        'accuracy': max(LAC_MIN_ACCURACY, lac.range),
    }
def search_cell(session, data):
    """Average the positions of all matching cells into one estimate.

    Returns a dict with lat/lon and an accuracy derived from the cell
    ranges, or None when no cell matched.
    """
    radio = RADIO_TYPE.get(data['radio'], -1)
    cells = []
    for raw in data['cell']:
        cell = normalized_cell_dict(raw, default_radio=radio)
        if not cell:
            continue
        key = to_cellkey(cell)
        row = session.query(Cell.lat, Cell.lon, Cell.range).filter(
            *join_cellkey(Cell, key)).filter(
            Cell.lat.isnot(None)).filter(
            Cell.lon.isnot(None)).first()
        if row is not None:
            cells.append(Network(key, *row))

    if not cells:
        return

    length = len(cells)
    avg_lat = sum([c.lat for c in cells]) / length
    avg_lon = sum([c.lon for c in cells]) / length
    return {
        'lat': quantize(avg_lat),
        'lon': quantize(avg_lon),
        'accuracy': estimate_accuracy(avg_lat, avg_lon,
                                      cells, CELL_MIN_ACCURACY),
    }
def deserialize(self, data, default_radio=None):
    """Normalize the radio/cid fields before schema deserialization.

    Maps string radio names to their numeric codes, falls back to
    ``default_radio`` where no usable radio was given, reclassifies
    large-cid GSM entries as UMTS and drops the cid=65535 sentinel
    when no valid lac accompanies it.
    """
    if data:
        if 'radio' in data:
            if isinstance(data['radio'], basestring):
                data['radio'] = RADIO_TYPE.get(
                    data['radio'], self.fields['radio'].missing)

            # If a default radio was set, and we don't know,
            # use it as fallback.
            if (self.is_missing(data, 'radio')
                    and default_radio is not None):
                data['radio'] = default_radio

            # A cell id >= 65536 only exists on umts towers.
            if (data.get('cid', 0) >= 65536
                    and data['radio'] == RADIO_TYPE['gsm']):
                data['radio'] = RADIO_TYPE['umts']
        else:
            data['radio'] = default_radio

        # Treat cid=65535 without a valid lac as an unspecified value.
        if (self.is_missing(data, 'lac')
                and data.get('cid', None) == 65535):
            data['cid'] = self.fields['cid'].missing

    return super(ValidCellBaseSchema, self).deserialize(data)
def process_cell_measure(session, measure_data, entries, userid=None):
    """Store cell measures, update per-cell counts and user scores.

    :param session: A database session.
    :param measure_data: The base measure dict (provides fallback radio
        and the 'created' timestamp).
    :param entries: Per-cell measurement dicts.
    :param userid: Optional user id credited for newly seen cells.
    :returns: The list of created cell measure objects.
    """
    cell_count = defaultdict(int)
    cell_measures = []
    created = decode_datetime(measure_data.get('created', ''))

    for entry in entries:
        cell_measure = create_cell_measure(measure_data, entry)
        # Use the more specific per-entry cell type, or fall back
        # to the less precise measure-level radio.
        if entry.get('radio'):
            cell_measure.radio = RADIO_TYPE.get(entry['radio'], -1)
        else:
            cell_measure.radio = measure_data['radio']
        cell_measures.append(cell_measure)
        # Group per unique cell.
        cell_count[CellKey(cell_measure.radio, cell_measure.mcc,
                           cell_measure.mnc, cell_measure.lac,
                           cell_measure.cid)] += 1

    # Update new/total measure counts.
    new_cells = 0
    for cell_key, count in cell_count.items():
        new_cells += update_cell_measure_count(
            cell_key, count, created, session)

    # Credit the user for any cells seen for the first time.
    if userid is not None and new_cells > 0:
        process_score(userid, new_cells, session, key='new_cell')

    session.add_all(cell_measures)
    return cell_measures
def make_cell_import_dict(row):
    """Convert one raw CSV import row into a normalized cell dict.

    Missing/empty fields get sentinel defaults (-1 / -255 / 0) before
    the dict is run through normalized_cell_dict; returns None when the
    row fails normalization.
    """
    def val(key, default):
        # Treat missing, empty-string and None values as absent.
        if key in row and row[key] != '' and row[key] is not None:
            return row[key]
        else:
            return default

    d = dict()
    # BUG FIX: the import timestamps are unix epoch seconds, which are
    # UTC by definition.  datetime.fromtimestamp() interprets them in
    # the server's *local* timezone, and the subsequent
    # .replace(tzinfo=UTC) then mislabeled that local wall time as UTC.
    # utcfromtimestamp() performs the correct conversion.
    d['created'] = datetime.utcfromtimestamp(
        int(val('created', 0))).replace(tzinfo=UTC)
    d['modified'] = datetime.utcfromtimestamp(
        int(val('updated', 0))).replace(tzinfo=UTC)
    d['lat'] = float(val('lat', -255))
    d['lon'] = float(val('lon', -255))
    # NOTE(review): 'radio' bypasses val() and will raise KeyError when
    # absent -- presumably the import format guarantees it; verify.
    d['radio'] = RADIO_TYPE.get(row['radio'].lower(), -1)
    for k in ['mcc', 'mnc', 'lac', 'cid', 'psc']:
        d[k] = int(val(k, -1))
    d['range'] = int(float(val('range', 0)))
    d['total_measures'] = int(val('samples', -1))
    d['changeable'] = bool(val('changeable', True))
    return normalized_cell_dict(d)
def process_measure(data, utcnow, session):
    """Build a Measure row plus its associated cell/wifi objects.

    Returns a (measure, session_objects) tuple; session_objects holds
    the cell measure rows to be added by the caller.
    """
    session_objects = []

    measure = Measure()
    measure.created = utcnow
    measure.time = data['time']
    measure.lat = to_precise_int(data['lat'])
    measure.lon = to_precise_int(data['lon'])
    measure.accuracy = data['accuracy']
    measure.altitude = data['altitude']
    measure.altitude_accuracy = data['altitude_accuracy']
    measure.radio = RADIO_TYPE.get(data['radio'], -1)

    # Flush so measure.id gets assigned before we reference it.
    session.add(measure)
    session.flush()

    if data.get('cell'):
        cells, cell_data = process_cell(data['cell'], measure)
        measure.cell = dumps(cell_data)
        session_objects.extend(cells)

    if data.get('wifi'):
        # Filter out submissions still using old-style sha1 hashed keys.
        too_long_keys = False
        for w in data['wifi']:
            w['key'] = key = normalize_wifi_key(w['key'])
            if len(key) > 12:
                too_long_keys = True
                break
        if not too_long_keys:
            process_wifi(data['wifi'], measure)
            measure.wifi = dumps(data['wifi'])

    return (measure, session_objects)
def process_measure(report_id, data, session):
    """Flatten one report into deduplicated cell and wifi measure dicts.

    Duplicate cells (same key) keep the better measurement: smaller ta,
    stronger non-zero signal or larger asu.  Duplicate wifis keep the
    stronger non-zero signal.  Returns (cell_measures, wifi_measures).
    """
    def add_missing_dict_entries(dst, src):
        # x.update(y) overwrites entries in x with those in y;
        # we only want to add those not already present.
        for (k, v) in src.items():
            if k not in dst:
                dst[k] = v

    cell_measures = {}
    wifi_measures = {}
    measure_data = dict(
        report_id=report_id,
        lat=data['lat'],
        lon=data['lon'],
        heading=data.get('heading', -1.0),
        speed=data.get('speed', -1.0),
        time=encode_datetime(data['time']),
        accuracy=data.get('accuracy', 0),
        altitude=data.get('altitude', 0),
        altitude_accuracy=data.get('altitude_accuracy', 0),
    )
    measure_radio = RADIO_TYPE.get(data['radio'], -1)

    if data.get('cell'):
        # Flatten measure / cell data into a single dict.
        for c in data['cell']:
            add_missing_dict_entries(c, measure_data)
            c = normalized_cell_measure_dict(c, measure_radio)
            if c is None:
                continue
            key = to_cellkey_psc(c)
            existing = cell_measures.get(key)
            if existing is None:
                cell_measures[key] = c
            elif (existing['ta'] > c['ta'] or
                  (existing['signal'] != 0 and
                   existing['signal'] < c['signal']) or
                  existing['asu'] < c['asu']):
                cell_measures[key] = c
    cell_measures = cell_measures.values()

    # Flatten measure / wifi data into a single dict.
    if data.get('wifi'):
        for w in data['wifi']:
            add_missing_dict_entries(w, measure_data)
            w = normalized_wifi_measure_dict(w)
            if w is None:
                continue
            key = w['key']
            existing = wifi_measures.get(key)
            if existing is None:
                wifi_measures[key] = w
            elif (existing['signal'] != 0 and
                  existing['signal'] < w['signal']):
                wifi_measures[key] = w
    wifi_measures = wifi_measures.values()

    return (cell_measures, wifi_measures)
def normalized_cell_dict(d, default_radio=-1):
    """
    Return a normalized copy of the provided cell dict ``d``,
    or None if the dict was invalid.
    """
    if not isinstance(d, dict):  # pragma: no cover
        return None

    d = d.copy()
    if 'radio' in d and isinstance(d['radio'], basestring):
        d['radio'] = RADIO_TYPE.get(d['radio'], -1)

    d = normalized_dict(
        d, dict(radio=(MIN_RADIO_TYPE, MAX_RADIO_TYPE, default_radio),
                mcc=(1, 999, REQUIRED),
                mnc=(0, 32767, REQUIRED),
                lac=(1, 65535, -1),
                cid=(1, 268435455, -1),
                psc=(0, 512, -1)))

    if d is None:
        return None

    # Reject mobile country codes not on the known-valid list.
    if d['mcc'] not in ALL_VALID_MCCS:
        return None

    # If a default radio was set, and we don't know, use it as fallback.
    if d['radio'] == -1 and default_radio != -1:
        d['radio'] = default_radio

    # Skip CDMA towers missing lac or cid (no psc exists on CDMA
    # to backfill using inference).
    if d['radio'] == RADIO_TYPE['cdma'] and (d['lac'] < 0 or d['cid'] < 0):
        return None

    # Skip GSM/LTE/UMTS towers with an invalid (more than 3 digit) MNC.
    if (d['radio'] in (RADIO_TYPE['gsm'], RADIO_TYPE['umts'],
                       RADIO_TYPE['lte']) and d['mnc'] > 999):
        return None

    # Treat cid=65535 without a valid lac as an unspecified value.
    if d['lac'] == -1 and d['cid'] == 65535:
        d['cid'] = -1

    # Must have (lac and cid) or psc (psc-only to use in backfill).
    if (d['lac'] == -1 or d['cid'] == -1) and d['psc'] == -1:
        return None

    # A cell id >= 65536 only exists on umts towers.
    if d['cid'] >= 65536 and d['radio'] == RADIO_TYPE['gsm']:
        d['radio'] = RADIO_TYPE['umts']

    return d
def process_measure(data, session):
    """Normalize one submission into deduplicated cell/wifi measures.

    Returns (cell_measures, wifi_measures); both are empty when the
    base measure itself fails normalization.
    """
    def add_missing_dict_entries(dst, src):
        # x.update(y) overwrites entries in x with those in y;
        # we only want to add those not already present, and only the
        # top-level scalar measure fields -- never 'radio' nor nested
        # cell/wifi structures.
        for (k, v) in src.items():
            if k != 'radio' and k not in dst \
                    and not isinstance(v, (tuple, list, dict)):
                dst[k] = v

    measure_data = normalized_measure_dict(data)
    if measure_data is None:
        return ([], [])

    cell_measures = {}
    wifi_measures = {}
    measure_radio = RADIO_TYPE.get(data['radio'], -1)

    if data.get('cell'):
        # Flatten measure / cell data into a single dict.
        for c in data['cell']:
            add_missing_dict_entries(c, measure_data)
            c = normalized_cell_measure_dict(c, measure_radio)
            if c is None:  # pragma: no cover
                continue
            key = to_cellkey_psc(c)
            if key in cell_measures:  # pragma: no cover
                existing = cell_measures[key]
                # Keep the better duplicate: smaller ta, stronger
                # non-zero signal or larger asu.
                if (existing['ta'] > c['ta'] or
                        (existing['signal'] != 0 and
                         existing['signal'] < c['signal']) or
                        existing['asu'] < c['asu']):
                    cell_measures[key] = c
            else:
                cell_measures[key] = c
    cell_measures = cell_measures.values()

    # Flatten measure / wifi data into a single dict.
    if data.get('wifi'):
        for w in data['wifi']:
            add_missing_dict_entries(w, measure_data)
            w = normalized_wifi_measure_dict(w)
            if w is None:
                continue
            key = w['key']
            if key in wifi_measures:  # pragma: no cover
                existing = wifi_measures[key]
                if (existing['signal'] != 0 and
                        existing['signal'] < w['signal']):
                    wifi_measures[key] = w
            else:
                wifi_measures[key] = w
    wifi_measures = wifi_measures.values()

    return (cell_measures, wifi_measures)
def normalized_cell_dict(d, default_radio=-1):
    """
    Return a normalized copy of the provided cell dict ``d``,
    or None if the dict was invalid.
    """
    if not isinstance(d, dict):  # pragma: no cover
        return None

    d = d.copy()
    if 'radio' in d and isinstance(d['radio'], basestring):
        d['radio'] = RADIO_TYPE.get(d['radio'], -1)

    d = normalized_dict(
        d, dict(radio=(MIN_RADIO_TYPE, MAX_RADIO_TYPE, default_radio),
                mcc=(1, 999, REQUIRED),
                mnc=(0, 32767, REQUIRED),
                lac=(1, 65535, -1),
                cid=(1, 268435455, -1),
                psc=(0, 512, -1)))

    if d is None:
        return None

    # Reject mobile country codes not on the known-valid list.
    if d['mcc'] not in ALL_VALID_MCCS:
        return None

    # If a default radio was set, and we don't know, use it as fallback.
    if d['radio'] == -1 and default_radio != -1:
        d['radio'] = default_radio

    # Skip CDMA towers missing lac or cid (no psc exists on CDMA
    # to backfill using inference).
    if d['radio'] == RADIO_TYPE['cdma'] and (d['lac'] < 0 or d['cid'] < 0):
        return None

    # Skip GSM/LTE/UMTS towers with an invalid (more than 3 digit) MNC.
    if (d['radio'] in (
            RADIO_TYPE['gsm'], RADIO_TYPE['umts'], RADIO_TYPE['lte'])
            and d['mnc'] > 999):
        return None

    # Treat cid=65535 without a valid lac as an unspecified value.
    if d['lac'] == -1 and d['cid'] == 65535:
        d['cid'] = -1

    # Must have (lac and cid) or psc (psc-only to use in backfill).
    if (d['lac'] == -1 or d['cid'] == -1) and d['psc'] == -1:
        return None

    # A cell id >= 65536 only exists on umts towers.
    if d['cid'] >= 65536 and d['radio'] == RADIO_TYPE['gsm']:
        d['radio'] = RADIO_TYPE['umts']

    return d
def clean_cell_keys(self, data):
    """Pre-process cell data into a list of validated hash keys."""
    # Request-level radio fallback, passed into per-cell validation.
    fallback_radio = RADIO_TYPE.get(data.get('radio', ''), -1)
    cell_keys = []
    for entry in data.get(self.data_field, ()):
        validated = CellKeyMixin.validate(entry, default_radio=fallback_radio)
        if validated:
            cell_keys.append(CellKeyMixin.to_hashkey(validated))
    return cell_keys
def process_measure(measure_id, data, session):
    """Flatten a submission into cell and wifi measure dict lists.

    Wifi entries are dropped wholesale if any key fails the pattern
    check (old-style sha1 hashes).  Returns (cell_measures,
    wifi_measures).
    """
    cell_measures = []
    wifi_measures = []
    measure_data = dict(
        measure_id=measure_id,
        lat=to_precise_int(data['lat']),
        lon=to_precise_int(data['lon']),
        time=encode_datetime(data['time']),
        accuracy=data['accuracy'],
        altitude=data['altitude'],
        altitude_accuracy=data['altitude_accuracy'],
    )
    measure_radio = RADIO_TYPE.get(data['radio'], -1)

    if data.get('cell'):
        # Flatten measure / cell data into a single dict.
        for c in data['cell']:
            c.update(measure_data)
            # Use the more specific per-cell type, or fall back to the
            # less precise measure-level radio.
            # NOTE(review): assumes every cell dict carries a 'radio'
            # key (would raise KeyError otherwise) -- presumably
            # guaranteed by upstream validation; verify.
            if c['radio'] != '':
                c['radio'] = RADIO_TYPE.get(c['radio'], -1)
            else:
                c['radio'] = measure_radio
        cell_measures = data['cell']

    if data.get('wifi'):
        # Filter out old-style sha1 hashes.
        invalid_wifi_key = False
        for w in data['wifi']:
            w['key'] = key = normalize_wifi_key(w['key'])
            if not valid_wifi_pattern(key):
                invalid_wifi_key = True
                break
        if not invalid_wifi_key:
            # Flatten measure / wifi data into a single dict.
            for w in data['wifi']:
                w.update(measure_data)
            wifi_measures = data['wifi']

    return (cell_measures, wifi_measures)
def search_cell_lac(session, data):
    """Estimate a position from the smallest matching location area.

    :param session: A database session.
    :param data: A search API dict with a 'radio' fallback and a
        'cell' list of per-cell dicts.
    :returns: A dict with lat/lon/accuracy, or None if nothing matched.
    """
    sql_null = None  # avoid pep8 warning
    # Request-level fallback radio type; a cell may carry its own.
    default_radio = RADIO_TYPE.get(data['radio'], -1)
    lacs = []
    for cell in data['cell']:
        if cell['mcc'] < 1 or cell['mnc'] < 0 or \
           cell['lac'] < 0 or cell['cid'] < 0:
            # Skip over invalid values
            continue
        # BUG FIX: compute the radio per cell instead of overwriting the
        # shared fallback -- previously a cell specifying its own
        # 'radio' leaked that value into every later cell without one.
        if cell.get('radio'):
            radio = RADIO_TYPE.get(cell['radio'], -1)
        else:
            radio = default_radio
        query = session.query(Cell).filter(
            Cell.radio == radio).filter(
            Cell.mcc == cell['mcc']).filter(
            Cell.mnc == cell['mnc']).filter(
            Cell.lac == cell['lac']).filter(
            Cell.cid == CELLID_LAC).filter(
            Cell.lat != sql_null).filter(
            Cell.lon != sql_null
        )
        result = query.first()
        if result is not None:
            lacs.append(result)

    if not lacs:
        return None

    # take the smallest LAC of any the user is inside
    lac = sorted(lacs, key=operator.attrgetter('range'))[0]
    return {
        'lat': quantize(lac.lat),
        'lon': quantize(lac.lon),
        'accuracy': lac.range,
    }
def search_cell(session, data):
    """Estimate a position as the average of all matching cell towers.

    :param session: A database session.
    :param data: A search API dict with a 'radio' fallback and a
        'cell' list of per-cell dicts.
    :returns: A dict with lat/lon/accuracy, or None if nothing matched.
    """
    # Request-level fallback radio type; a cell may carry its own.
    default_radio = RADIO_TYPE.get(data['radio'], -1)
    cells = []
    for cell in data['cell']:
        if cell['mcc'] < 1 or cell['mnc'] < 0 or \
           cell['lac'] < 0 or cell['cid'] < 0:
            # Skip over invalid values
            continue
        # BUG FIX: compute the radio per cell instead of overwriting the
        # shared fallback -- previously a cell specifying its own
        # 'radio' leaked that value into every later cell without one.
        if cell.get('radio'):
            radio = RADIO_TYPE.get(cell['radio'], -1)
        else:
            radio = default_radio
        query = session.query(Cell.lat, Cell.lon).filter(
            Cell.radio == radio).filter(
            Cell.mcc == cell['mcc']).filter(
            Cell.mnc == cell['mnc']).filter(
            Cell.lac == cell['lac']).filter(
            Cell.cid == cell['cid']).filter(
            Cell.lat.isnot(None)).filter(
            Cell.lon.isnot(None)
        )
        result = query.first()
        if result is not None:
            cells.append(result)

    if not cells:
        return

    length = len(cells)
    avg_lat = sum([c[0] for c in cells]) / length
    avg_lon = sum([c[1] for c in cells]) / length
    return {
        'lat': quantize(avg_lat),
        'lon': quantize(avg_lon),
        'accuracy': 35000,
    }
def process_cell_measure(session, measure_data, entries, userid=None):
    """Create and store one cell measure per entry.

    Each entry's own radio type wins over the measure-level fallback.
    Returns the list of created cell measure objects.
    """
    cell_measures = []
    # TODO group by unique cell
    for entry in entries:
        cell = create_cell_measure(measure_data, entry)
        # Use the more specific per-entry cell type, or fall back
        # to the less precise measure-level radio.
        if entry.get('radio'):
            cell.radio = RADIO_TYPE.get(entry['radio'], -1)
        else:
            cell.radio = measure_data['radio']
        update_cell_measure_count(cell, session, userid=userid)
        cell_measures.append(cell)
    session.add_all(cell_measures)
    return cell_measures
def countries(session):
    """Return per-country cell statistics, sorted by country name."""
    # We group by radio, mcc to take advantage of the index and
    # explicitly specify a small list of all valid radio values
    # to get mysql to actually use the index.
    radios = [v for v in RADIO_TYPE.values() if v >= 0]
    rows = session.query(Cell.radio, Cell.mcc, func.count()).filter(
        Cell.radio.in_(radios)).group_by(Cell.radio, Cell.mcc).all()

    # Reverse the grouping: mcc -> {radio: count}.
    codes = defaultdict(dict)
    for radio, mcc_code, count in rows:
        codes[mcc_code][radio] = count

    countries = {}
    for code, item in codes.items():
        names = [(c.name, c.alpha3) for c in mcc(str(code))]
        multiple = bool(len(names) > 1)
        for name, alpha3 in names:
            country = {
                'code': alpha3,
                'name': name,
                'order': transliterate(name[:10].lower()),
                'multiple': multiple,
                'total': 0,
                'gsm': 0,
                'cdma': 0,
                'umts': 0,
                'lte': 0,
            }
            for t, v in item.items():
                country[RADIO_TYPE_INVERSE[t]] = int(v)
            country['total'] = int(sum(item.values()))
            if alpha3 not in countries:
                countries[alpha3] = country
            else:
                # Some countries like the US have multiple mcc codes;
                # merge their numeric fields here.
                for k, v in country.items():
                    if isinstance(v, int):
                        countries[alpha3][k] += v

    return sorted(countries.values(), key=itemgetter('name'))
def process_cell(entries, measure):
    """Build CellMeasure rows for each entry of a measure.

    Returns (cells, result): the ORM objects and the raw entry dicts.
    """
    result = []
    cells = []
    for entry in entries:
        cell = CellMeasure(
            measure_id=measure.id,
            created=measure.created,
            lat=measure.lat,
            lon=measure.lon,
            time=measure.time,
            accuracy=measure.accuracy,
            altitude=measure.altitude,
            altitude_accuracy=measure.altitude_accuracy,
            mcc=entry['mcc'],
            mnc=entry['mnc'],
            lac=entry['lac'],
            cid=entry['cid'],
            psc=entry['psc'],
            asu=entry['asu'],
            signal=entry['signal'],
            ta=entry['ta'],
        )
        # Use the more specific per-entry cell type, or fall back
        # to the less precise measure-level radio.
        if entry['radio']:
            cell.radio = RADIO_TYPE.get(entry['radio'], -1)
        else:
            cell.radio = measure.radio
        cells.append(cell)
        result.append(entry)
    return (cells, result)
def countries(session):
    """Return per-country cell statistics, sorted by country name."""
    # We group by radio, mcc to take advantage of the index and
    # explicitly specify a small list of all valid radio values
    # to get mysql to actually use the index.
    radios = [v for v in RADIO_TYPE.values() if v >= 0]
    rows = session.query(
        Cell.radio, Cell.mcc, func.count(Cell.id)).filter(
        Cell.radio.in_(radios)).group_by(Cell.radio, Cell.mcc).all()

    # Reverse the grouping: mcc -> {radio: count}.
    codes = defaultdict(dict)
    for radio, mcc_code, count in rows:
        codes[mcc_code][radio] = count

    countries = {}
    for code, item in codes.items():
        names = [(c.name, c.alpha3) for c in mcc(str(code))]
        multiple = bool(len(names) > 1)
        for name, alpha3 in names:
            country = {
                'code': alpha3,
                'name': name,
                'order': transliterate(name[:10].lower()),
                'multiple': multiple,
                'total': 0,
                'gsm': 0,
                'cdma': 0,
                'umts': 0,
                'lte': 0,
            }
            for t, v in item.items():
                country[RADIO_TYPE_INVERSE[t]] = int(v)
            country['total'] = int(sum(item.values()))
            if alpha3 not in countries:
                countries[alpha3] = country
            else:
                # Some countries like the US have multiple mcc codes;
                # merge their numeric fields here.
                for k, v in country.items():
                    if isinstance(v, int):
                        countries[alpha3][k] += v

    return sorted(countries.values(), key=itemgetter('name'))
def insert_cell_measure(measure_data, entries, userid=None):
    """Task: insert cell measures inside their own database session.

    Returns the number of inserted measures, 0 on integrity errors;
    any other failure triggers a task retry.
    """
    cell_measures = []
    try:
        with insert_cell_measure.db_session() as session:
            for entry in entries:
                cell = create_cell_measure(measure_data, entry)
                # Use the more specific per-entry cell type, or fall
                # back to the less precise measure-level radio.
                if entry.get('radio'):
                    cell.radio = RADIO_TYPE.get(entry['radio'], -1)
                else:
                    cell.radio = measure_data['radio']
                update_cell_measure_count(cell, session, userid=userid)
                cell_measures.append(cell)
            session.add_all(cell_measures)
            session.commit()
        return len(cell_measures)
    except IntegrityError as exc:  # pragma: no cover
        # TODO log error
        return 0
    except Exception as exc:  # pragma: no cover
        raise insert_cell_measure.retry(exc=exc)
def process_measure(data, utcnow, session, userid=None):
    """Store a Measure row and queue async cell/wifi insert tasks."""
    measure = Measure()
    measure.created = utcnow
    measure.time = data["time"]
    measure.lat = to_precise_int(data["lat"])
    measure.lon = to_precise_int(data["lon"])
    measure.accuracy = data["accuracy"]
    measure.altitude = data["altitude"]
    measure.altitude_accuracy = data["altitude_accuracy"]
    measure.radio = RADIO_TYPE.get(data["radio"], -1)

    # Flush so measure.id gets assigned before we reference it.
    session.add(measure)
    session.flush()

    measure_data = dict(
        id=measure.id,
        created=encode_datetime(measure.created),
        lat=measure.lat,
        lon=measure.lon,
        time=encode_datetime(measure.time),
        accuracy=measure.accuracy,
        altitude=measure.altitude,
        altitude_accuracy=measure.altitude_accuracy,
        radio=measure.radio,
    )

    if data.get("cell"):
        insert_cell_measure.delay(measure_data, data["cell"],
                                  userid=userid)
        measure.cell = dumps(data["cell"])

    if data.get("wifi"):
        # Filter out submissions still using old-style sha1 hashed keys.
        too_long_keys = False
        for w in data["wifi"]:
            w["key"] = key = normalize_wifi_key(w["key"])
            if len(key) > 12:
                too_long_keys = True
                break
        if not too_long_keys:
            insert_wifi_measure.delay(measure_data, data["wifi"],
                                      userid=userid)
            measure.wifi = dumps(data["wifi"])

    return measure
def search_all_sources(session, api_name, data,
                       client_addr=None, geoip_db=None,
                       api_key_log=False, api_key_name=None):
    """
    Common code-path for all lookup APIs, using WiFi, cell, cell-lac
    and GeoIP data sources.

    :param session: A database session for queries.
    :param api_name: A string to use in metrics (for example "geolocate").
    :param data: A dict conforming to the search API.
    :param client_addr: The IP address the request came from.
    :param geoip_db: The geoip database.
    :param api_key_log: Enable additional api key specific logging?
    :param api_key_name: The metric friendly api key name.
    """
    stats_client = get_stats_client()
    heka_client = get_heka_client()

    result = None
    result_metric = None

    validated = {
        'wifi': [],
        'cell': [],
        'cell_lac': set(),
        'cell_network': [],
        'cell_lac_network': [],
    }

    # Pre-process wifi data.
    for wifi in data.get('wifi', ()):
        wifi = normalized_wifi_dict(wifi)
        if wifi:
            validated['wifi'].append(wifi)

    # Pre-process cell data.
    radio = RADIO_TYPE.get(data.get('radio', ''), -1)
    for cell in data.get('cell', ()):
        cell = normalized_cell_dict(cell, default_radio=radio)
        if cell:
            cell_key = to_cellkey(cell)
            validated['cell'].append(cell_key)
            validated['cell_lac'].add(cell_key._replace(cid=CELLID_LAC))

    # Merge all possible cell and lac keys into one list.
    all_cell_keys = []
    all_cell_keys.extend(validated['cell'])
    for key in validated['cell_lac']:
        all_cell_keys.append(key)

    # Do a single query for all cells and lacs at the same time.
    try:
        all_networks = query_cell_networks(session, all_cell_keys)
    except Exception:
        heka_client.raven(RAVEN_ERROR)
        all_networks = []
    for network in all_networks:
        if network.key == CELLID_LAC:
            validated['cell_lac_network'].append(network)
        else:
            validated['cell_network'].append(network)

    # Always do a GeoIP lookup because it is cheap and we want to
    # report geoip vs. other data mismatches. We may also use
    # the full GeoIP City-level estimate as well, if all else fails.
    (geoip_res, countries) = geoip_and_best_guess_country_codes(
        validated['cell'], api_name, client_addr, geoip_db)

    # First we attempt a "zoom-in" from cell-lac, to cell
    # to wifi, tightening our estimate each step only so
    # long as it doesn't contradict the existing best-estimate
    # nor the possible countries of origin.
    for (data_field, object_field, metric_name, search_fn) in [
            ('cell_lac', 'cell_lac_network', 'cell_lac', search_cell_lac),
            ('cell', 'cell_network', 'cell', search_cell),
            ('wifi', 'wifi', 'wifi', search_wifi)]:
        if not validated[data_field]:
            continue

        r = None
        try:
            r = search_fn(session, validated[object_field])
        except Exception:
            heka_client.raven(RAVEN_ERROR)
            stats_client.incr('%s.%s_error' % (api_name, metric_name))

        if r is None:
            stats_client.incr('%s.no_%s_found' % (api_name, metric_name))
            continue

        lat = float(r['lat'])
        lon = float(r['lon'])
        stats_client.incr('%s.%s_found' % (api_name, metric_name))

        # Skip any hit that matches none of the possible countries.
        country_match = False
        for country in countries:
            if location_is_in_country(lat, lon, country, 1):
                country_match = True
                break
        if countries and not country_match:
            stats_client.incr('%s.anomaly.%s_country_mismatch' %
                              (api_name, metric_name))

        if result is None:
            # Always accept the first result we get.
            result = r
            result_metric = metric_name
        elif (distance(float(result['lat']), float(result['lon']),
                       lat, lon) * 1000 <= result['accuracy']):
            # Or any result that appears to be an improvement over
            # the existing best guess.
            result = r
            result_metric = metric_name
        else:
            stats_client.incr('%s.anomaly.%s_%s_mismatch' %
                              (api_name, metric_name, result_metric))

    # Fall back to GeoIP if nothing has worked yet. We do not
    # include this in the "zoom-in" loop because GeoIP is
    # frequently _wrong_ at the city level; we only want to
    # accept that estimate if we got nothing better from cell
    # or wifi.
    if not result and geoip_res:
        result = geoip_res
        result_metric = 'geoip'

    # Do detailed logging for some api keys.
    if api_key_log and api_key_name:
        api_log_metric = None
        wifi_keys = set([w['key'] for w in validated['wifi']])
        if wifi_keys and \
           len(filter_bssids_by_similarity(wifi_keys)) >= \
           MIN_WIFIS_IN_QUERY:
            # Only count requests as WiFi-based if they contain enough
            # distinct WiFi networks to pass our filters.
            if result_metric == 'wifi':
                api_log_metric = 'wifi_hit'
            else:
                api_log_metric = 'wifi_miss'
        elif validated['cell']:
            if result_metric == 'cell':
                api_log_metric = 'cell_hit'
            elif result_metric == 'cell_lac':
                api_log_metric = 'cell_lac_hit'
            else:
                api_log_metric = 'cell_miss'
        else:
            if geoip_res:
                api_log_metric = 'geoip_hit'
            else:
                api_log_metric = 'geoip_miss'
        if api_log_metric:
            stats_client.incr('%s.api_log.%s.%s' % (
                api_name, api_key_name, api_log_metric))

    if not result:
        stats_client.incr('%s.miss' % api_name)
        return None

    rounded_result = {
        'lat': round(result['lat'], DEGREE_DECIMAL_PLACES),
        'lon': round(result['lon'], DEGREE_DECIMAL_PLACES),
        'accuracy': round(result['accuracy'], DEGREE_DECIMAL_PLACES),
    }
    stats_client.incr('%s.%s_hit' % (api_name, result_metric))
    stats_client.timing('%s.accuracy.%s' % (api_name, result_metric),
                        rounded_result['accuracy'])
    return rounded_result
def search_all_sources(session, api_name, data,
                       client_addr=None, geoip_db=None,
                       api_key_log=False, api_key_name=None,
                       result_type='position'):
    """
    Common code-path for all lookup APIs, using WiFi, cell, cell-lac
    and GeoIP data sources.

    :param session: A database session for queries.
    :param api_name: A string to use in metrics (for example "geolocate").
    :param data: A dict conforming to the search API.
    :param client_addr: The IP address the request came from.
    :param geoip_db: The geoip database.
    :param api_key_log: Enable additional api key specific logging?
    :param api_key_name: The metric friendly api key name.
    :param result_type: What kind of result to return, either a lat/lon
        position or a country estimate.
    """
    if result_type not in ('country', 'position'):
        raise ValueError('Invalid result_type, must be one of '
                         'position or country')

    stats_client = get_stats_client()
    heka_client = get_heka_client()

    result = None
    result_metric = None

    validated = {
        'wifi': [],
        'cell': [],
        'cell_lac': set(),
        'cell_network': [],
        'cell_lac_network': [],
    }

    # Pre-process wifi data.
    for wifi in data.get('wifi', ()):
        wifi = normalized_wifi_dict(wifi)
        if wifi:
            validated['wifi'].append(wifi)

    # Pre-process cell data.
    radio = RADIO_TYPE.get(data.get('radio', ''), -1)
    for cell in data.get('cell', ()):
        cell = normalized_cell_dict(cell, default_radio=radio)
        if cell:
            cell_key = to_cellkey(cell)
            validated['cell'].append(cell_key)
            validated['cell_lac'].add(cell_key)

    found_cells = []

    # Query all cells and OCID cells.
    for model in Cell, OCIDCell, CellArea:
        cell_filter = []
        for key in validated['cell']:
            # Create a list of 'and' criteria for each cell key.
            criterion = join_cellkey(model, key)
            cell_filter.append(and_(*criterion))

        if cell_filter:
            # Only query if we have cell keys, otherwise the 'or' of an
            # empty filter list would match every row in the table.
            load_fields = ('radio', 'mcc', 'mnc', 'lac',
                           'lat', 'lon', 'range')
            query = (session.query(model)
                            .options(load_only(*load_fields))
                            .filter(or_(*cell_filter))
                            .filter(model.lat.isnot(None))
                            .filter(model.lon.isnot(None)))
            try:
                found_cells.extend(query.all())
            except Exception:
                heka_client.raven(RAVEN_ERROR)

    if found_cells:
        # Group all found cells by location area.
        lacs = defaultdict(list)
        for cell in found_cells:
            cellarea_key = (cell.radio, cell.mcc, cell.mnc, cell.lac)
            lacs[cellarea_key].append(cell)

        def sort_lac(v):
            # Use the lac with the most values,
            # or the one with the smallest range.
            return (len(v), -min([e.range for e in v]))

        # If we get data from multiple location areas, use the one with
        # the most data points in it. That way a lac with a cell hit
        # will have two entries and win over a lac with only the lac
        # entry.
        lac = sorted(lacs.values(), key=sort_lac, reverse=True)

        for cell in lac[0]:
            # The first entry is the key, used only to distinguish
            # cell from lac.
            network = Network(key=None, lat=cell.lat,
                              lon=cell.lon, range=cell.range)
            if type(cell) is CellArea:
                validated['cell_lac_network'].append(network)
            else:
                validated['cell_network'].append(network)

    # Always do a GeoIP lookup because it is cheap and we want to
    # report geoip vs. other data mismatches. We may also use
    # the full GeoIP City-level estimate as well, if all else fails.
    (geoip_res, countries) = geoip_and_best_guess_country_codes(
        validated['cell'], api_name, client_addr, geoip_db, stats_client)

    # First we attempt a "zoom-in" from cell-lac, to cell
    # to wifi, tightening our estimate each step only so
    # long as it doesn't contradict the existing best-estimate
    # nor the possible countries of origin.
    for (data_field, object_field, metric_name, search_fn) in [
            ('cell_lac', 'cell_lac_network', 'cell_lac', search_cell_lac),
            ('cell', 'cell_network', 'cell', search_cell),
            ('wifi', 'wifi', 'wifi', search_wifi)]:
        if not validated[data_field]:
            continue

        r = None
        try:
            r = search_fn(session, validated[object_field],
                          stats_client, api_name)
        except Exception:
            heka_client.raven(RAVEN_ERROR)
            stats_client.incr('%s.%s_error' % (api_name, metric_name))

        if r is None:
            stats_client.incr('%s.no_%s_found' % (api_name, metric_name))
            continue

        lat = float(r['lat'])
        lon = float(r['lon'])
        stats_client.incr('%s.%s_found' % (api_name, metric_name))

        # Skip any hit that matches none of the possible countries.
        country_match = False
        for country in countries:
            if location_is_in_country(lat, lon, country, 1):
                country_match = True
                break
        if countries and not country_match:
            stats_client.incr('%s.anomaly.%s_country_mismatch' %
                              (api_name, metric_name))

        if result is None:
            # Always accept the first result we get.
            result = r
            result_metric = metric_name
        elif (distance(float(result['lat']), float(result['lon']),
                       lat, lon) * 1000 <= result['accuracy']):
            # Or any result that appears to be an improvement over
            # the existing best guess.
            result = r
            result_metric = metric_name
        else:
            stats_client.incr('%s.anomaly.%s_%s_mismatch' %
                              (api_name, metric_name, result_metric))

    # Fall back to GeoIP if nothing has worked yet. We do not
    # include this in the "zoom-in" loop because GeoIP is
    # frequently _wrong_ at the city level; we only want to
    # accept that estimate if we got nothing better from cell
    # or wifi.
    if not result and geoip_res:
        result = geoip_res
        result_metric = 'geoip'

    # Do detailed logging for some api keys.
    if api_key_log and api_key_name:
        api_log_metric = None
        wifi_keys = set([w['key'] for w in validated['wifi']])
        if wifi_keys and \
           len(filter_bssids_by_similarity(wifi_keys)) >= \
           MIN_WIFIS_IN_QUERY:
            # Only count requests as WiFi-based if they contain enough
            # distinct WiFi networks to pass our filters.
            if result_metric == 'wifi':
                api_log_metric = 'wifi_hit'
            else:
                api_log_metric = 'wifi_miss'
        elif validated['cell']:
            if result_metric == 'cell':
                api_log_metric = 'cell_hit'
            elif result_metric == 'cell_lac':
                api_log_metric = 'cell_lac_hit'
            else:
                api_log_metric = 'cell_miss'
        else:
            if geoip_res:
                api_log_metric = 'geoip_hit'
            else:
                api_log_metric = 'geoip_miss'
        if api_log_metric:
            stats_client.incr('%s.api_log.%s.%s' % (
                api_name, api_key_name, api_log_metric))

    if not result:
        stats_client.incr('%s.miss' % api_name)
        return None

    stats_client.incr('%s.%s_hit' % (api_name, result_metric))

    if result_type == 'position':
        rounded_result = {
            'lat': round(result['lat'], DEGREE_DECIMAL_PLACES),
            'lon': round(result['lon'], DEGREE_DECIMAL_PLACES),
            'accuracy': round(result['accuracy'], DEGREE_DECIMAL_PLACES),
        }
        stats_client.timing('%s.accuracy.%s' % (api_name, result_metric),
                            rounded_result['accuracy'])
        return rounded_result
    elif result_type == 'country':
        if countries:
            country = iso3166.countries.get(countries[0])
            return {
                'country_name': country.name,
                'country_code': country.alpha2,
            }
def search_all_sources(session, api_name, data,
                       client_addr=None, geoip_db=None):
    """
    Common code-path for all lookup APIs, using WiFi, cell, cell-lac and
    GeoIP data sources.

    :param session: A database session for queries.
    :param api_name: A string to use in metrics (for example "geolocate").
    :param data: A dict conforming to the search API.
    :param client_addr: The IP address the request came from.
    :param geoip_db: The geoip database.

    Returns a dict with 'lat', 'lon' and 'accuracy' keys (all rounded to
    DEGREE_DECIMAL_PLACES), or None when no data source yields a result.
    """
    stats_client = get_stats_client()
    heka_client = get_heka_client()

    # Best estimate found so far and the name of the source that produced
    # it ('cell_lac', 'cell', 'wifi' or 'geoip') — used for metric names.
    result = None
    result_metric = None

    validated = {
        'wifi': [],
        'cell': [],
        'cell_lac': set(),
        'cell_network': [],
        'cell_lac_network': [],
    }

    # Pass-through wifi data
    validated['wifi'] = data.get('wifi', [])

    # Pre-process cell data: normalize each cell dict, keeping only valid
    # ones, and derive the set of enclosing LAC keys alongside.
    radio = RADIO_TYPE.get(data.get('radio', ''), -1)
    for cell in data.get('cell', ()):
        cell = normalized_cell_dict(cell, default_radio=radio)
        if cell:
            cell_key = to_cellkey(cell)
            validated['cell'].append(cell_key)
            # Replace the cell id with the CELLID_LAC marker to form the
            # LAC-level key; the set deduplicates cells sharing a LAC.
            validated['cell_lac'].add(cell_key._replace(cid=CELLID_LAC))

    # Merge all possible cell and lac keys into one list
    all_cell_keys = []
    all_cell_keys.extend(validated['cell'])
    for key in validated['cell_lac']:
        all_cell_keys.append(key)

    # Do a single query for all cells and lacs at the same time
    try:
        all_networks = query_cell_networks(session, all_cell_keys)
    except Exception:
        # Best-effort: report to heka and carry on with no cell networks.
        heka_client.raven(RAVEN_ERROR)
        all_networks = []
    # Split the combined query result back into LAC vs. cell matches.
    for network in all_networks:
        if network.key == CELLID_LAC:
            validated['cell_lac_network'].append(network)
        else:
            validated['cell_network'].append(network)

    # Always do a GeoIP lookup because it is cheap and we want to
    # report geoip vs. other data mismatches. We may also use
    # the full GeoIP City-level estimate as well, if all else fails.
    (geoip_res, countries) = geoip_and_best_guess_country_codes(
        validated['cell'], api_name, client_addr, geoip_db)

    # First we attempt a "zoom-in" from cell-lac, to cell
    # to wifi, tightening our estimate each step only so
    # long as it doesn't contradict the existing best-estimate
    # nor the possible countries of origin.
    for (data_field, object_field, metric_name, search_fn) in [
            ('cell_lac', 'cell_lac_network', 'cell_lac', search_cell_lac),
            ('cell', 'cell_network', 'cell', search_cell),
            ('wifi', 'wifi', 'wifi', search_wifi)]:
        if validated[data_field]:
            r = None
            try:
                r = search_fn(session, validated[object_field])
            except Exception:
                # A failing source is counted but never aborts the search.
                heka_client.raven(RAVEN_ERROR)
                stats_client.incr('%s.%s_error' %
                                  (api_name, metric_name))
            if r is None:
                stats_client.incr('%s.no_%s_found' %
                                  (api_name, metric_name))
            else:
                lat = float(r['lat'])
                lon = float(r['lon'])
                stats_client.incr('%s.%s_found' %
                                  (api_name, metric_name))

                # Skip any hit that matches none of the possible countries.
                country_match = False
                for country in countries:
                    if location_is_in_country(lat, lon, country, 1):
                        country_match = True
                        break
                if countries and not country_match:
                    stats_client.incr('%s.anomaly.%s_country_mismatch' %
                                      (api_name, metric_name))
                # NOTE(review): despite the "Skip" wording above, a
                # country mismatch only records an anomaly metric — the
                # hit still falls through and may be accepted below (no
                # continue). Confirm whether that is intentional.

                # Always accept the first result we get.
                if result is None:
                    result = r
                    result_metric = metric_name

                # Or any result that appears to be an improvement over the
                # existing best guess.
                elif (distance(float(result['lat']),
                               float(result['lon']),
                               lat, lon) * 1000 <= result['accuracy']):
                    # distance() presumably returns kilometres (scaled by
                    # 1000 to metres before comparing against accuracy) —
                    # TODO confirm against the helper's definition.
                    result = r
                    result_metric = metric_name

                else:
                    # New hit lies outside the current estimate's accuracy
                    # radius: keep the old result, count the disagreement.
                    stats_client.incr('%s.anomaly.%s_%s_mismatch' %
                                      (api_name, metric_name,
                                       result_metric))

    # Fall back to GeoIP if nothing has worked yet. We do not
    # include this in the "zoom-in" loop because GeoIP is
    # frequently _wrong_ at the city level; we only want to
    # accept that estimate if we got nothing better from cell
    # or wifi.
    if not result and geoip_res:
        result = geoip_res
        result_metric = 'geoip'

    if not result:
        stats_client.incr('%s.miss' % api_name)
        return None

    rounded_result = {
        'lat': round(result['lat'], DEGREE_DECIMAL_PLACES),
        'lon': round(result['lon'], DEGREE_DECIMAL_PLACES),
        'accuracy': round(result['accuracy'], DEGREE_DECIMAL_PLACES),
    }
    stats_client.incr('%s.%s_hit' % (api_name, result_metric))
    stats_client.timing('%s.accuracy.%s' % (api_name, result_metric),
                        rounded_result['accuracy'])
    return rounded_result