Example #1
def search_cell(session, data):
    radio = RADIO_TYPE.get(data['radio'], -1)
    cells = []
    for cell in data['cell']:
        cell = normalized_cell_dict(cell, default_radio=radio)
        if not cell:
            continue

        key = to_cellkey(cell)

        query = session.query(Cell.lat, Cell.lon, Cell.range).filter(
            *join_cellkey(Cell, key)).filter(
            Cell.lat.isnot(None)).filter(
            Cell.lon.isnot(None)
        )
        result = query.first()
        if result is not None:
            cells.append(Network(key, *result))

    if not cells:
        return

    length = len(cells)
    avg_lat = sum([c.lat for c in cells]) / length
    avg_lon = sum([c.lon for c in cells]) / length
    return {
        'lat': quantize(avg_lat),
        'lon': quantize(avg_lon),
        'accuracy': estimate_accuracy(avg_lat, avg_lon,
                                      cells, CELL_MIN_ACCURACY),
    }
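
The estimate above is simply the unweighted centroid of every matched cell, with an accuracy derived from their spread and clamped to a minimum. Below is a minimal, self-contained sketch of that averaging step; the 1000 m floor and the haversine-based spread stand in for the project's CELL_MIN_ACCURACY, quantize and estimate_accuracy helpers and are assumptions, not code from the snippet.

from collections import namedtuple
from math import radians, sin, cos, asin, sqrt

Network = namedtuple('Network', 'key lat lon range')
CELL_MIN_ACCURACY = 1000  # assumed minimum accuracy in meters

def distance_m(lat1, lon1, lat2, lon2):
    # great-circle distance in meters (haversine)
    lat1, lon1, lat2, lon2 = map(radians, (lat1, lon1, lat2, lon2))
    a = (sin((lat2 - lat1) / 2) ** 2
         + cos(lat1) * cos(lat2) * sin((lon2 - lon1) / 2) ** 2)
    return 2 * 6371000 * asin(sqrt(a))

def centroid_estimate(cells):
    # unweighted centroid of all matched cells
    avg_lat = sum(c.lat for c in cells) / len(cells)
    avg_lon = sum(c.lon for c in cells) / len(cells)
    # accuracy: distance from the centroid to the edge of the
    # furthest matched cell, clamped to the minimum
    spread = max(distance_m(avg_lat, avg_lon, c.lat, c.lon) + c.range
                 for c in cells)
    return {'lat': avg_lat, 'lon': avg_lon,
            'accuracy': max(CELL_MIN_ACCURACY, spread)}

print(centroid_estimate([Network(None, 51.50, -0.12, 3500),
                         Network(None, 51.51, -0.10, 2000)]))
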
Example #2
def search_cell_lac(session, data):
    radio = RADIO_TYPE.get(data['radio'], -1)
    lacs = []
    for cell in data['cell']:
        cell = normalized_cell_dict(cell, default_radio=radio)
        if not cell:
            continue

        cell['cid'] = CELLID_LAC
        key = to_cellkey(cell)

        query = session.query(Cell.lat, Cell.lon, Cell.range).filter(
            *join_cellkey(Cell, key)).filter(
            Cell.lat.isnot(None)).filter(
            Cell.lon.isnot(None)
        )
        result = query.first()
        if result is not None:
            lacs.append(Network(key, *result))

    if not lacs:
        return

    # take the smallest LAC of any that the user is inside
    lac = sorted(lacs, key=operator.attrgetter('range'))[0]

    return {
        'lat': quantize(lac.lat),
        'lon': quantize(lac.lon),
        'accuracy': max(LAC_MIN_ACCURACY, lac.range),
    }
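
search_cell_lac answers with the location area that has the smallest coverage range among those the user appears to be inside; sorting and taking the first element is just a min() over the range attribute. A tiny sketch of that selection, with an assumed 20 km floor standing in for LAC_MIN_ACCURACY:

import operator
from collections import namedtuple

Network = namedtuple('Network', 'key lat lon range')
LAC_MIN_ACCURACY = 20000  # assumed floor in meters

def pick_smallest_lac(lacs):
    # the tightest location area the user could be inside
    lac = min(lacs, key=operator.attrgetter('range'))
    return {'lat': lac.lat, 'lon': lac.lon,
            'accuracy': max(LAC_MIN_ACCURACY, lac.range)}

print(pick_smallest_lac([Network(None, 51.5, -0.1, 35000),
                         Network(None, 51.6, -0.2, 15000)]))
# -> {'lat': 51.6, 'lon': -0.2, 'accuracy': 20000}
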
Example #3
def mark_moving_cells(session, moving_cells):
    moving_keys = []
    blacklist = set()
    for cell in moving_cells:
        query = session.query(CellBlacklist).filter(
            *join_cellkey(CellBlacklist, cell))
        b = query.first()
        if b is None:
            key = to_cellkey(cell)._asdict()
            blacklist.add(CellBlacklist(**key))
            moving_keys.append(key)

    get_heka_client().incr("items.blacklisted.cell_moving", len(moving_keys))
    session.add_all(blacklist)
    remove_cell.delay(moving_keys)
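
The function above only creates a CellBlacklist row for cells that are not blacklisted yet, counts how many were added, and queues the same keys for deletion via remove_cell.delay. Stripped of the ORM and the task queue, the skip-if-already-present bookkeeping looks roughly like this (tuple keys are an assumption made for the sketch):

def mark_moving(blacklist, moving_cells):
    # blacklist: set of cell keys already known to be moving
    # moving_cells: cell keys newly detected as moving
    new_keys = [key for key in moving_cells if key not in blacklist]
    blacklist.update(new_keys)
    # in the real task these keys would also be handed to remove_cell.delay
    return new_keys

blacklist = {('gsm', 234, 30, 5, 6789)}
moving = [('gsm', 234, 30, 5, 6789), ('gsm', 234, 30, 7, 1111)]
print(mark_moving(blacklist, moving))  # only the second key is new
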
Example #4
def remove_cell(self, cell_keys):
    cells_removed = 0
    try:
        with self.db_session() as session:
            for k in cell_keys:
                key = to_cellkey(k)
                query = session.query(Cell).filter(*join_cellkey(Cell, key))
                cells_removed += query.delete(synchronize_session=False)
            session.commit()
        return cells_removed
    except IntegrityError as exc:  # pragma: no cover
        self.heka_client.raven('error')
        return 0
    except Exception as exc:  # pragma: no cover
        raise self.retry(exc=exc)
Example #5
def remove_cell(self, cell_keys):
    cells_removed = 0
    try:
        with self.db_session() as session:
            for k in cell_keys:
                key = to_cellkey(k)
                query = session.query(Cell).filter(*join_cellkey(Cell, key))
                cells_removed += query.delete(synchronize_session=False)
            session.commit()
        return cells_removed
    except IntegrityError as exc:  # pragma: no cover
        self.heka_client.raven('error')
        return 0
    except Exception as exc:  # pragma: no cover
        raise self.retry(exc=exc)
Example #6
def mark_moving_cells(session, moving_cells):
    moving_keys = []
    blacklist = set()
    for cell in moving_cells:
        query = session.query(CellBlacklist).filter(
            *join_cellkey(CellBlacklist, cell))
        b = query.first()
        if b is None:
            key = to_cellkey(cell)._asdict()
            blacklist.add(CellBlacklist(**key))
            moving_keys.append(key)

    get_heka_client().incr("items.blacklisted.cell_moving",
                           len(moving_keys))
    session.add_all(blacklist)
    remove_cell.delay(moving_keys)
Example #7
def remove_cell(self, cell_keys):
    cells_removed = 0
    try:
        with self.db_session() as session:
            changed_lacs = set()

            for k in cell_keys:
                key = to_cellkey(k)
                query = session.query(Cell).filter(*join_cellkey(Cell, key))
                cells_removed += query.delete()
                changed_lacs.add(key._replace(cid=CELLID_LAC))

            for key in changed_lacs:
                # Either schedule an update to the enclosing LAC or, if
                # we just removed the last cell in the LAC, remove the LAC
                # entirely.
                query = session.query(Cell).filter(
                    Cell.radio == key.radio,
                    Cell.mcc == key.mcc,
                    Cell.mnc == key.mnc,
                    Cell.lac == key.lac,
                    Cell.cid != CELLID_LAC)
                n = query.count()

                query = session.query(Cell).filter(
                    Cell.radio == key.radio,
                    Cell.mcc == key.mcc,
                    Cell.mnc == key.mnc,
                    Cell.lac == key.lac,
                    Cell.cid == CELLID_LAC)
                if n < 1:
                    query.delete()
                else:
                    lac = query.first()
                    if lac is not None:
                        lac.new_measures += 1

            session.commit()
        return cells_removed
    except Exception as exc:  # pragma: no cover
        self.heka_client.raven('error')
        raise self.retry(exc=exc)
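
This variant, like Examples 8 and 11 below, keeps the synthetic per-LAC row (cid == CELLID_LAC) consistent after real cells are deleted: if no real cell is left in the location area the summary row is removed as well, otherwise it is flagged for a position re-estimate. A minimal in-memory sketch of that decision; the dict layout and the CELLID_LAC sentinel value are assumptions:

CELLID_LAC = -2  # assumed sentinel cid marking the per-LAC summary row

def maintain_lac(cells, changed_lac):
    # cells: dict mapping (radio, mcc, mnc, lac, cid) -> mutable record;
    # real cells and the per-LAC summary row share the same table
    remaining = sum(1 for key in cells
                    if key[:4] == changed_lac and key[4] != CELLID_LAC)
    lac_key = changed_lac + (CELLID_LAC,)
    if remaining < 1:
        # the last real cell in the area is gone: drop the summary row too
        cells.pop(lac_key, None)
    elif lac_key in cells:
        # otherwise flag the summary row for a position update
        cells[lac_key]['new_measures'] += 1

cells = {
    (0, 234, 30, 5, 101): {'new_measures': 0},
    (0, 234, 30, 5, CELLID_LAC): {'new_measures': 0},
}
maintain_lac(cells, (0, 234, 30, 5))
print(cells[(0, 234, 30, 5, CELLID_LAC)])  # {'new_measures': 1}
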
Example #8
def remove_cell(self, cell_keys):
    cells_removed = 0
    try:
        with self.db_session() as session:
            for k in cell_keys:
                key = to_cellkey(k)
                query = session.query(Cell).filter(*join_cellkey(Cell, key))
                cells_removed += query.delete()

                # Either schedule an update to the enclosing LAC or, if
                # we just removed the last cell in the LAC, remove the LAC
                # entirely.
                query = session.query(func.count(Cell.id)).filter(
                    Cell.radio == key.radio,
                    Cell.mcc == key.mcc,
                    Cell.mnc == key.mnc,
                    Cell.lac == key.lac,
                    Cell.cid != CELLID_LAC)

                c = query.first()
                assert c is not None
                n = int(c[0])
                query = session.query(Cell).filter(
                    Cell.radio == key.radio,
                    Cell.mcc == key.mcc,
                    Cell.mnc == key.mnc,
                    Cell.lac == key.lac,
                    Cell.cid == CELLID_LAC)
                if n < 1:
                    query.delete()
                else:
                    query.update({'new_measures': '1'})

            session.commit()
        return cells_removed
    except IntegrityError as exc:  # pragma: no cover
        self.heka_client.raven('error')
        return 0
    except Exception as exc:  # pragma: no cover
        raise self.retry(exc=exc)
Example #9
def remove_cell(self, cell_keys):
    try:
        cells_removed = 0
        redis_client = self.app.redis_client
        with self.db_session() as session:
            changed_lacs = set()

            for k in cell_keys:
                key = to_cellkey(k)
                query = session.query(Cell).filter(*join_cellkey(Cell, key))
                cells_removed += query.delete()
                changed_lacs.add(key._replace(cid=CELLID_LAC))

            if changed_lacs:
                session.on_post_commit(enqueue_lacs,
                                       redis_client, changed_lacs)

            session.commit()
        return cells_removed
    except Exception as exc:  # pragma: no cover
        self.heka_client.raven('error')
        raise self.retry(exc=exc)
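
Examples 9 and 10 replace the inline LAC bookkeeping with a deferred callback: the affected location areas are collected and enqueue_lacs runs only after the session commit succeeds. The callback mechanics can be mimicked with a toy session wrapper; on_post_commit and enqueue_lacs are project helpers, so everything below is a simplified stand-in rather than the real implementation.

class FakeSession:
    # toy stand-in showing the post-commit callback pattern
    def __init__(self):
        self._callbacks = []

    def on_post_commit(self, func, *args):
        # remember work that must only happen after a successful commit
        self._callbacks.append((func, args))

    def commit(self):
        # ... flush and commit database changes here ...
        for func, args in self._callbacks:
            func(*args)
        self._callbacks = []

def enqueue_lacs(redis_client, lacs):
    print('enqueue for update:', sorted(lacs))

session = FakeSession()
changed_lacs = {(0, 234, 30, 5), (0, 234, 30, 7)}
session.on_post_commit(enqueue_lacs, None, changed_lacs)
session.commit()
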
Example #10
def remove_cell(self, cell_keys):
    try:
        cells_removed = 0
        redis_client = self.app.redis_client
        with self.db_session() as session:
            changed_lacs = set()

            for k in cell_keys:
                key = to_cellkey(k)
                query = session.query(Cell).filter(*join_cellkey(Cell, key))
                cells_removed += query.delete()
                changed_lacs.add(key._replace(cid=CELLID_LAC))

            if changed_lacs:
                session.on_post_commit(enqueue_lacs, redis_client,
                                       changed_lacs)

            session.commit()
        return cells_removed
    except Exception as exc:  # pragma: no cover
        self.heka_client.raven('error')
        raise self.retry(exc=exc)
Example #11
def remove_cell(self, cell_keys):
    cells_removed = 0
    try:
        with self.db_session() as session:
            for k in cell_keys:
                key = to_cellkey(k)
                query = session.query(Cell).filter(*join_cellkey(Cell, key))
                cells_removed += query.delete()

                # Either schedule an update to the enclosing LAC or, if
                # we just removed the last cell in the LAC, remove the LAC
                # entirely.
                query = session.query(func.count(Cell.id)).filter(
                    Cell.radio == key.radio, Cell.mcc == key.mcc,
                    Cell.mnc == key.mnc, Cell.lac == key.lac,
                    Cell.cid != CELLID_LAC)

                c = query.first()
                assert c is not None
                n = int(c[0])
                query = session.query(Cell).filter(Cell.radio == key.radio,
                                                   Cell.mcc == key.mcc,
                                                   Cell.mnc == key.mnc,
                                                   Cell.lac == key.lac,
                                                   Cell.cid == CELLID_LAC)
                if n < 1:
                    query.delete()
                else:
                    query.update({'new_measures': '1'})

            session.commit()
        return cells_removed
    except IntegrityError as exc:  # pragma: no cover
        self.heka_client.raven('error')
        return 0
    except Exception as exc:  # pragma: no cover
        raise self.retry(exc=exc)
Example #12
    def test_blacklist_moving_cells(self):
        now = util.utcnow()
        long_ago = now - timedelta(days=40)
        session = self.db_master_session

        k1 = dict(radio=1, mcc=1, mnc=2, lac=3, cid=4)
        k2 = dict(radio=1, mcc=1, mnc=2, lac=6, cid=8)
        k3 = dict(radio=1, mcc=1, mnc=2, lac=9, cid=12)
        k4 = dict(radio=1, mcc=1, mnc=2, lac=12, cid=16)
        k5 = dict(radio=1, mcc=1, mnc=2, lac=15, cid=20)
        k6 = dict(radio=1, mcc=1, mnc=2, lac=18, cid=24)

        keys = set([CellKey(**k) for k in [k1, k2, k3, k4, k5, k6]])

        # keys k2, k3 and k4 are expected to be detected as moving
        data = [
            # a cell with an entry but no prior position
            Cell(new_measures=3, total_measures=0, **k1),
            CellMeasure(lat=1.001, lon=1.001, **k1),
            CellMeasure(lat=1.002, lon=1.005, **k1),
            CellMeasure(lat=1.003, lon=1.009, **k1),
            # a cell with a prior known position
            Cell(lat=2.0, lon=2.0,
                 new_measures=2, total_measures=1, **k2),
            CellMeasure(lat=2.0, lon=2.0, **k2),
            CellMeasure(lat=4.0, lon=2.0, **k2),
            # a cell with a very different prior position
            Cell(lat=1.0, lon=1.0,
                 new_measures=2, total_measures=1, **k3),
            CellMeasure(lat=3.0, lon=3.0, **k3),
            CellMeasure(lat=-3.0, lon=3.0, **k3),
            # another cell with a prior known position (and negative lat)
            Cell(lat=-4.0, lon=4.0,
                 new_measures=2, total_measures=1, **k4),
            CellMeasure(lat=-4.0, lon=4.0, **k4),
            CellMeasure(lat=-6.0, lon=4.0, **k4),
            # an already blacklisted cell
            CellBlacklist(**k5),
            CellMeasure(lat=5.0, lon=5.0, **k5),
            CellMeasure(lat=8.0, lon=5.0, **k5),
            # a cell with an old record at a different position that we
            # ignore; the position estimate has been updated since then
            Cell(lat=6.0, lon=6.0,
                 new_measures=2, total_measures=1, **k6),
            CellMeasure(lat=6.9, lon=6.9, time=long_ago, **k6),
            CellMeasure(lat=6.0, lon=6.0, **k6),
            CellMeasure(lat=6.001, lon=6, **k6),
        ]
        session.add_all(data)
        session.commit()

        result = location_update_cell.delay(min_new=1)
        self.assertEqual(result.get(), (5, 3))

        black = session.query(CellBlacklist).all()
        self.assertEqual(set([to_cellkey(b) for b in black]),
                         set([CellKey(**k) for k in [k2, k3, k4, k5]]))

        measures = session.query(CellMeasure).all()
        self.assertEqual(len(measures), 14)
        self.assertEqual(set([to_cellkey(m) for m in measures]), keys)

        # test duplicate call
        result = location_update_cell.delay(min_new=1)
        self.assertEqual(result.get(), 0)

        self.check_stats(
            total=6,
            timer=[
                # We made duplicate calls
                ('task.data.location_update_cell', 2),
                # One of those would've scheduled a remove_cell task
                ('task.data.remove_cell', 1)
            ],
            gauge=[
                ('task.data.location_update_cell.new_measures_1_100', 2),
            ])
Example #13
    def test_blacklist_moving_cells(self):
        now = util.utcnow()
        long_ago = now - timedelta(days=40)
        session = self.db_master_session

        k1 = dict(radio=1, mcc=1, mnc=2, lac=3, cid=4)
        k2 = dict(radio=1, mcc=1, mnc=2, lac=6, cid=8)
        k3 = dict(radio=1, mcc=1, mnc=2, lac=9, cid=12)
        k4 = dict(radio=1, mcc=1, mnc=2, lac=12, cid=16)
        k5 = dict(radio=1, mcc=1, mnc=2, lac=15, cid=20)
        k6 = dict(radio=1, mcc=1, mnc=2, lac=18, cid=24)

        keys = set([CellKey(**k) for k in [k1, k2, k3, k4, k5, k6]])

        # keys k2, k3 and k4 are expected to be detected as moving
        data = [
            # a cell with an entry but no prior position
            Cell(new_measures=3, total_measures=0, **k1),
            CellMeasure(lat=1.001, lon=1.001, **k1),
            CellMeasure(lat=1.002, lon=1.005, **k1),
            CellMeasure(lat=1.003, lon=1.009, **k1),
            # a cell with a prior known position
            Cell(lat=2.0, lon=2.0, new_measures=2, total_measures=1, **k2),
            CellMeasure(lat=2.0, lon=2.0, **k2),
            CellMeasure(lat=4.0, lon=2.0, **k2),
            # a cell with a very different prior position
            Cell(lat=1.0, lon=1.0, new_measures=2, total_measures=1, **k3),
            CellMeasure(lat=3.0, lon=3.0, **k3),
            CellMeasure(lat=-3.0, lon=3.0, **k3),
            # another cell with a prior known position (and negative lat)
            Cell(lat=-4.0, lon=4.0, new_measures=2, total_measures=1, **k4),
            CellMeasure(lat=-4.0, lon=4.0, **k4),
            CellMeasure(lat=-6.0, lon=4.0, **k4),
            # an already blacklisted cell
            CellBlacklist(**k5),
            CellMeasure(lat=5.0, lon=5.0, **k5),
            CellMeasure(lat=8.0, lon=5.0, **k5),
            # a cell with an old record at a different position that we
            # ignore; the position estimate has been updated since then
            Cell(lat=6.0, lon=6.0, new_measures=2, total_measures=1, **k6),
            CellMeasure(lat=6.9, lon=6.9, time=long_ago, **k6),
            CellMeasure(lat=6.0, lon=6.0, **k6),
            CellMeasure(lat=6.001, lon=6, **k6),
        ]
        session.add_all(data)
        session.commit()

        result = location_update_cell.delay(min_new=1)
        self.assertEqual(result.get(), (5, 3))

        black = session.query(CellBlacklist).all()
        self.assertEqual(set([to_cellkey(b) for b in black]),
                         set([CellKey(**k) for k in [k2, k3, k4, k5]]))

        measures = session.query(CellMeasure).all()
        self.assertEqual(len(measures), 14)
        self.assertEqual(set([to_cellkey(m) for m in measures]), keys)

        # test duplicate call
        result = location_update_cell.delay(min_new=1)
        self.assertEqual(result.get(), 0)

        self.check_stats(
            total=6,
            timer=[
                # We made duplicate calls
                ('task.data.location_update_cell', 2),
                # One of those would've scheduled a remove_cell task
                ('task.data.remove_cell', 1)
            ],
            gauge=[
                ('task.data.location_update_cell.new_measures_1_100', 2),
            ])
Example #14
def search_all_sources(session,
                       api_name,
                       data,
                       client_addr=None,
                       geoip_db=None,
                       api_key_log=False,
                       api_key_name=None,
                       result_type='position'):
    """
    Common code-path for all lookup APIs, using
    WiFi, cell, cell-lac and GeoIP data sources.

    :param session: A database session for queries.
    :param api_name: A string to use in metrics (for example "geolocate").
    :param data: A dict conforming to the search API.
    :param client_addr: The IP address the request came from.
    :param geoip_db: The geoip database.
    :param api_key_log: Enable additional api key specific logging?
    :param api_key_name: The metric friendly api key name.
    :param result_type: What kind of result to return, either a lat/lon
                        position or a country estimate.
    """

    if result_type not in ('country', 'position'):
        raise ValueError('Invalid result_type, must be one of '
                         'position or country')

    stats_client = get_stats_client()
    heka_client = get_heka_client()

    result = None
    result_metric = None

    validated = {
        'wifi': [],
        'cell': [],
        'cell_lac': set(),
        'cell_network': [],
        'cell_lac_network': [],
    }

    # Pre-process wifi data
    for wifi in data.get('wifi', ()):
        wifi = normalized_wifi_dict(wifi)
        if wifi:
            validated['wifi'].append(wifi)

    # Pre-process cell data
    radio = RADIO_TYPE.get(data.get('radio', ''), -1)
    for cell in data.get('cell', ()):
        cell = normalized_cell_dict(cell, default_radio=radio)
        if cell:
            cell_key = to_cellkey(cell)
            validated['cell'].append(cell_key)
            validated['cell_lac'].add(cell_key)

    found_cells = []

    # Query all cells and OCID cells
    for model in Cell, OCIDCell, CellArea:
        cell_filter = []
        for key in validated['cell']:
            # create a list of 'and' criteria for cell keys
            criterion = join_cellkey(model, key)
            cell_filter.append(and_(*criterion))

        if cell_filter:
            # only do a query if we have cell results, or this will match
            # all rows in the table
            load_fields = ('radio', 'mcc', 'mnc', 'lac', 'lat', 'lon', 'range')
            query = (session.query(model).options(
                load_only(*load_fields)).filter(or_(*cell_filter)).filter(
                    model.lat.isnot(None)).filter(model.lon.isnot(None)))

            try:
                found_cells.extend(query.all())
            except Exception:
                heka_client.raven(RAVEN_ERROR)

    if found_cells:
        # Group all found_cells by location area
        lacs = defaultdict(list)
        for cell in found_cells:
            cellarea_key = (cell.radio, cell.mcc, cell.mnc, cell.lac)
            lacs[cellarea_key].append(cell)

        def sort_lac(v):
            # use the lac with the most values,
            # or the one with the smallest range
            return (len(v), -min([e.range for e in v]))

        # If we get data from multiple location areas, use the one with the
        # most data points in it. That way a lac with a cell hit will
        # have two entries and win over a lac with only the lac entry.
        lac = sorted(lacs.values(), key=sort_lac, reverse=True)

        for cell in lac[0]:
            # The first entry is the key,
            # used only to distinguish cell from lac
            network = Network(key=None,
                              lat=cell.lat,
                              lon=cell.lon,
                              range=cell.range)
            if type(cell) is CellArea:
                validated['cell_lac_network'].append(network)
            else:
                validated['cell_network'].append(network)

    # Always do a GeoIP lookup because it is cheap and we want to
    # report geoip vs. other data mismatches. We may also use
    # the full GeoIP City-level estimate as well, if all else fails.
    (geoip_res,
     countries) = geoip_and_best_guess_country_codes(validated['cell'],
                                                     api_name, client_addr,
                                                     geoip_db, stats_client)

    # First we attempt a "zoom-in" from cell-lac, to cell
    # to wifi, tightening our estimate each step only so
    # long as it doesn't contradict the existing best-estimate
    # nor the possible countries of origin.

    for (data_field, object_field, metric_name, search_fn) in [
        ('cell_lac', 'cell_lac_network', 'cell_lac', search_cell_lac),
        ('cell', 'cell_network', 'cell', search_cell),
        ('wifi', 'wifi', 'wifi', search_wifi)
    ]:

        if validated[data_field]:
            r = None
            try:
                r = search_fn(session, validated[object_field], stats_client,
                              api_name)
            except Exception:
                heka_client.raven(RAVEN_ERROR)
                stats_client.incr('%s.%s_error' % (api_name, metric_name))

            if r is None:
                stats_client.incr('%s.no_%s_found' % (api_name, metric_name))

            else:
                lat = float(r['lat'])
                lon = float(r['lon'])

                stats_client.incr('%s.%s_found' % (api_name, metric_name))

                # Skip any hit that matches none of the possible countries.
                country_match = False
                for country in countries:
                    if location_is_in_country(lat, lon, country, 1):
                        country_match = True
                        break

                if countries and not country_match:
                    stats_client.incr('%s.anomaly.%s_country_mismatch' %
                                      (api_name, metric_name))

                # Always accept the first result we get.
                if result is None:
                    result = r
                    result_metric = metric_name

                # Or any result that appears to be an improvement over the
                # existing best guess.
                elif (distance(float(result['lat']), float(result['lon']), lat,
                               lon) * 1000 <= result['accuracy']):
                    result = r
                    result_metric = metric_name

                else:
                    stats_client.incr('%s.anomaly.%s_%s_mismatch' %
                                      (api_name, metric_name, result_metric))

    # Fall back to GeoIP if nothing has worked yet. We do not
    # include this in the "zoom-in" loop because GeoIP is
    # frequently _wrong_ at the city level; we only want to
    # accept that estimate if we got nothing better from cell
    # or wifi.
    if not result and geoip_res:
        result = geoip_res
        result_metric = 'geoip'

    # Do detailed logging for some api keys
    if api_key_log and api_key_name:
        api_log_metric = None
        wifi_keys = set([w['key'] for w in validated['wifi']])
        if wifi_keys and \
           len(filter_bssids_by_similarity(wifi_keys)) >= MIN_WIFIS_IN_QUERY:
            # Only count requests as WiFi-based if they contain enough
            # distinct WiFi networks to pass our filters
            if result_metric == 'wifi':
                api_log_metric = 'wifi_hit'
            else:
                api_log_metric = 'wifi_miss'
        elif validated['cell']:
            if result_metric == 'cell':
                api_log_metric = 'cell_hit'
            elif result_metric == 'cell_lac':
                api_log_metric = 'cell_lac_hit'
            else:
                api_log_metric = 'cell_miss'
        else:
            if geoip_res:
                api_log_metric = 'geoip_hit'
            else:
                api_log_metric = 'geoip_miss'
        if api_log_metric:
            stats_client.incr('%s.api_log.%s.%s' %
                              (api_name, api_key_name, api_log_metric))

    if not result:
        stats_client.incr('%s.miss' % api_name)
        return None

    stats_client.incr('%s.%s_hit' % (api_name, result_metric))

    if result_type == 'position':
        rounded_result = {
            'lat': round(result['lat'], DEGREE_DECIMAL_PLACES),
            'lon': round(result['lon'], DEGREE_DECIMAL_PLACES),
            'accuracy': round(result['accuracy'], DEGREE_DECIMAL_PLACES),
        }
        stats_client.timing('%s.accuracy.%s' % (api_name, result_metric),
                            rounded_result['accuracy'])
        return rounded_result
    elif result_type == 'country':
        if countries:
            country = iso3166.countries.get(countries[0])
            return {
                'country_name': country.name,
                'country_code': country.alpha2
            }
Example #15
def search_all_sources(session, api_name, data,
                       client_addr=None, geoip_db=None):
    """
    Common code-path for all lookup APIs, using
    WiFi, cell, cell-lac and GeoIP data sources.

    :param session: A database session for queries.
    :param api_name: A string to use in metrics (for example "geolocate").
    :param data: A dict conforming to the search API.
    :param client_addr: The IP address the request came from.
    :param geoip_db: The geoip database.
    """

    stats_client = get_stats_client()
    heka_client = get_heka_client()

    result = None
    result_metric = None

    validated = {
        'wifi': [],
        'cell': [],
        'cell_lac': set(),
        'cell_network': [],
        'cell_lac_network': [],
    }

    # Pass-through wifi data
    validated['wifi'] = data.get('wifi', [])

    # Pre-process cell data
    radio = RADIO_TYPE.get(data.get('radio', ''), -1)
    for cell in data.get('cell', ()):
        cell = normalized_cell_dict(cell, default_radio=radio)
        if cell:
            cell_key = to_cellkey(cell)
            validated['cell'].append(cell_key)
            validated['cell_lac'].add(cell_key._replace(cid=CELLID_LAC))

    # Merge all possible cell and lac keys into one list
    all_cell_keys = []
    all_cell_keys.extend(validated['cell'])
    for key in validated['cell_lac']:
        all_cell_keys.append(key)

    # Do a single query for all cells and lacs at the same time
    try:
        all_networks = query_cell_networks(session, all_cell_keys)
    except Exception:
        heka_client.raven(RAVEN_ERROR)
        all_networks = []
    for network in all_networks:
        if network.key == CELLID_LAC:
            validated['cell_lac_network'].append(network)
        else:
            validated['cell_network'].append(network)

    # Always do a GeoIP lookup because it is cheap and we want to
    # report geoip vs. other data mismatches. We may also use
    # the full GeoIP City-level estimate as well, if all else fails.
    (geoip_res, countries) = geoip_and_best_guess_country_codes(
        validated['cell'], api_name, client_addr, geoip_db)

    # First we attempt a "zoom-in" from cell-lac, to cell
    # to wifi, tightening our estimate each step only so
    # long as it doesn't contradict the existing best-estimate
    # nor the possible countries of origin.

    for (data_field, object_field, metric_name, search_fn) in [
            ('cell_lac', 'cell_lac_network', 'cell_lac', search_cell_lac),
            ('cell', 'cell_network', 'cell', search_cell),
            ('wifi', 'wifi', 'wifi', search_wifi)]:

        if validated[data_field]:
            r = None
            try:
                r = search_fn(session, validated[object_field])
            except Exception:
                heka_client.raven(RAVEN_ERROR)
                stats_client.incr('%s.%s_error' %
                                  (api_name, metric_name))

            if r is None:
                stats_client.incr('%s.no_%s_found' %
                                  (api_name, metric_name))

            else:
                lat = float(r['lat'])
                lon = float(r['lon'])

                stats_client.incr('%s.%s_found' %
                                  (api_name, metric_name))

                # Skip any hit that matches none of the possible countries.
                country_match = False
                for country in countries:
                    if location_is_in_country(lat, lon, country, 1):
                        country_match = True
                        break

                if countries and not country_match:
                    stats_client.incr('%s.anomaly.%s_country_mismatch' %
                                      (api_name, metric_name))

                # Always accept the first result we get.
                if result is None:
                    result = r
                    result_metric = metric_name

                # Or any result that appears to be an improvement over the
                # existing best guess.
                elif (distance(float(result['lat']),
                               float(result['lon']), lat, lon) * 1000
                      <= result['accuracy']):
                    result = r
                    result_metric = metric_name

                else:
                    stats_client.incr('%s.anomaly.%s_%s_mismatch' %
                                      (api_name, metric_name, result_metric))

    # Fall back to GeoIP if nothing has worked yet. We do not
    # include this in the "zoom-in" loop because GeoIP is
    # frequently _wrong_ at the city level; we only want to
    # accept that estimate if we got nothing better from cell
    # or wifi.
    if not result and geoip_res:
        result = geoip_res
        result_metric = 'geoip'

    if not result:
        stats_client.incr('%s.miss' % api_name)
        return None

    rounded_result = {
        'lat': round(result['lat'], DEGREE_DECIMAL_PLACES),
        'lon': round(result['lon'], DEGREE_DECIMAL_PLACES),
        'accuracy': round(result['accuracy'], DEGREE_DECIMAL_PLACES),
    }

    stats_client.incr('%s.%s_hit' % (api_name, result_metric))
    stats_client.timing('%s.accuracy.%s' % (api_name, result_metric),
                        rounded_result['accuracy'])

    return rounded_result
Example #16
def search_all_sources(session,
                       api_name,
                       data,
                       client_addr=None,
                       geoip_db=None,
                       api_key_log=False,
                       api_key_name=None):
    """
    Common code-path for all lookup APIs, using
    WiFi, cell, cell-lac and GeoIP data sources.

    :param session: A database session for queries.
    :param api_name: A string to use in metrics (for example "geolocate").
    :param data: A dict conforming to the search API.
    :param client_addr: The IP address the request came from.
    :param geoip_db: The geoip database.
    """

    stats_client = get_stats_client()
    heka_client = get_heka_client()

    result = None
    result_metric = None

    validated = {
        'wifi': [],
        'cell': [],
        'cell_lac': set(),
        'cell_network': [],
        'cell_lac_network': [],
    }

    # Pre-process wifi data
    for wifi in data.get('wifi', ()):
        wifi = normalized_wifi_dict(wifi)
        if wifi:
            validated['wifi'].append(wifi)

    # Pre-process cell data
    radio = RADIO_TYPE.get(data.get('radio', ''), -1)
    for cell in data.get('cell', ()):
        cell = normalized_cell_dict(cell, default_radio=radio)
        if cell:
            cell_key = to_cellkey(cell)
            validated['cell'].append(cell_key)
            validated['cell_lac'].add(cell_key._replace(cid=CELLID_LAC))

    # Merge all possible cell and lac keys into one list
    all_cell_keys = []
    all_cell_keys.extend(validated['cell'])
    for key in validated['cell_lac']:
        all_cell_keys.append(key)

    # Do a single query for all cells and lacs at the same time
    try:
        all_networks = query_cell_networks(session, all_cell_keys)
    except Exception:
        heka_client.raven(RAVEN_ERROR)
        all_networks = []
    for network in all_networks:
        if network.key == CELLID_LAC:
            validated['cell_lac_network'].append(network)
        else:
            validated['cell_network'].append(network)

    # Always do a GeoIP lookup because it is cheap and we want to
    # report geoip vs. other data mismatches. We may also use
    # the full GeoIP City-level estimate as well, if all else fails.
    (geoip_res,
     countries) = geoip_and_best_guess_country_codes(validated['cell'],
                                                     api_name, client_addr,
                                                     geoip_db)

    # First we attempt a "zoom-in" from cell-lac, to cell
    # to wifi, tightening our estimate each step only so
    # long as it doesn't contradict the existing best-estimate
    # nor the possible countries of origin.

    for (data_field, object_field, metric_name, search_fn) in [
        ('cell_lac', 'cell_lac_network', 'cell_lac', search_cell_lac),
        ('cell', 'cell_network', 'cell', search_cell),
        ('wifi', 'wifi', 'wifi', search_wifi)
    ]:

        if validated[data_field]:
            r = None
            try:
                r = search_fn(session, validated[object_field])
            except Exception:
                heka_client.raven(RAVEN_ERROR)
                stats_client.incr('%s.%s_error' % (api_name, metric_name))

            if r is None:
                stats_client.incr('%s.no_%s_found' % (api_name, metric_name))

            else:
                lat = float(r['lat'])
                lon = float(r['lon'])

                stats_client.incr('%s.%s_found' % (api_name, metric_name))

                # Skip any hit that matches none of the possible countries.
                country_match = False
                for country in countries:
                    if location_is_in_country(lat, lon, country, 1):
                        country_match = True
                        break

                if countries and not country_match:
                    stats_client.incr('%s.anomaly.%s_country_mismatch' %
                                      (api_name, metric_name))

                # Always accept the first result we get.
                if result is None:
                    result = r
                    result_metric = metric_name

                # Or any result that appears to be an improvement over the
                # existing best guess.
                elif (distance(float(result['lat']), float(result['lon']), lat,
                               lon) * 1000 <= result['accuracy']):
                    result = r
                    result_metric = metric_name

                else:
                    stats_client.incr('%s.anomaly.%s_%s_mismatch' %
                                      (api_name, metric_name, result_metric))

    # Fall back to GeoIP if nothing has worked yet. We do not
    # include this in the "zoom-in" loop because GeoIP is
    # frequently _wrong_ at the city level; we only want to
    # accept that estimate if we got nothing better from cell
    # or wifi.
    if not result and geoip_res:
        result = geoip_res
        result_metric = 'geoip'

    # Do detailed logging for some api keys
    if api_key_log and api_key_name:
        api_log_metric = None
        wifi_keys = set([w['key'] for w in validated['wifi']])
        if wifi_keys and \
           len(filter_bssids_by_similarity(wifi_keys)) >= MIN_WIFIS_IN_QUERY:
            # Only count requests as WiFi-based if they contain enough
            # distinct WiFi networks to pass our filters
            if result_metric == 'wifi':
                api_log_metric = 'wifi_hit'
            else:
                api_log_metric = 'wifi_miss'
        elif validated['cell']:
            if result_metric == 'cell':
                api_log_metric = 'cell_hit'
            elif result_metric == 'cell_lac':
                api_log_metric = 'cell_lac_hit'
            else:
                api_log_metric = 'cell_miss'
        else:
            if geoip_res:
                api_log_metric = 'geoip_hit'
            else:
                api_log_metric = 'geoip_miss'
        if api_log_metric:
            stats_client.incr('%s.api_log.%s.%s' %
                              (api_name, api_key_name, api_log_metric))

    if not result:
        stats_client.incr('%s.miss' % api_name)
        return None

    rounded_result = {
        'lat': round(result['lat'], DEGREE_DECIMAL_PLACES),
        'lon': round(result['lon'], DEGREE_DECIMAL_PLACES),
        'accuracy': round(result['accuracy'], DEGREE_DECIMAL_PLACES),
    }

    stats_client.incr('%s.%s_hit' % (api_name, result_metric))
    stats_client.timing('%s.accuracy.%s' % (api_name, result_metric),
                        rounded_result['accuracy'])

    return rounded_result
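
All three search_all_sources variants (Examples 14, 15 and 16) share the same acceptance rule during the cell-lac, cell, wifi "zoom-in": the first candidate is always taken, and a finer candidate replaces the current best guess only if it lies within the current guess's accuracy radius; otherwise the current guess is kept and a mismatch counter would be bumped. A stripped-down sketch of that rule, assuming a haversine distance helper that returns kilometres to mirror the * 1000 conversion above:

from math import radians, sin, cos, asin, sqrt

def distance_km(lat1, lon1, lat2, lon2):
    # great-circle distance in kilometres (haversine)
    lat1, lon1, lat2, lon2 = map(radians, (lat1, lon1, lat2, lon2))
    a = (sin((lat2 - lat1) / 2) ** 2
         + cos(lat1) * cos(lat2) * sin((lon2 - lon1) / 2) ** 2)
    return 2 * 6371.0 * asin(sqrt(a))

def zoom_in(candidates):
    # candidates: list of (metric_name, result dict) ordered coarse -> fine,
    # e.g. cell_lac, then cell, then wifi; each result has lat/lon/accuracy
    result, result_metric = None, None
    for metric_name, r in candidates:
        if r is None:
            continue
        if result is None:
            # always accept the first result we get
            result, result_metric = r, metric_name
        elif distance_km(result['lat'], result['lon'],
                         r['lat'], r['lon']) * 1000 <= result['accuracy']:
            # the finer result agrees with the current guess: tighten to it
            result, result_metric = r, metric_name
        # otherwise keep the existing guess (a mismatch would be logged)
    return result, result_metric

print(zoom_in([
    ('cell_lac', {'lat': 51.50, 'lon': -0.12, 'accuracy': 20000}),
    ('cell', {'lat': 51.51, 'lon': -0.11, 'accuracy': 1500}),
    ('wifi', {'lat': 51.512, 'lon': -0.105, 'accuracy': 100}),
]))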