def test_score(self):
    """Check station_score across ages, sample counts and modify gaps."""
    now = util.utcnow()
    # Each row: (created, modified, last two Dummy args, expected rounded score).
    cases = [
        (now, now, 0, 1, 0.05),
        (now - timedelta(days=1), now, 10, 2, 0.1),
        (now - timedelta(days=5), now, 10, 2, 0.5),
        (now - timedelta(days=10), now, 10, 2, 1.0),
        (now - timedelta(days=10), now, 10, 64, 6.0),
        (now - timedelta(days=10), now, 10, 1024, 10.0),
        (now - timedelta(days=10), now, 0, 1024, 0.5),
        (now - timedelta(days=70), now - timedelta(days=40), 10, 1024, 7.07),
        (now - timedelta(days=190), now - timedelta(days=180), 10, 1024, 3.78),
        (now - timedelta(days=190), now - timedelta(days=180), 10, 64, 2.27),
    ]
    for created, modified, arg3, arg4, expected in cases:
        score = station_score(Dummy(created, modified, arg3, arg4), now)
        assert round(score, 2) == expected
def test_blue(self, geoip_db, http_session, session, source, stats):
    """A Bluetooth-only query resolves the region and sums station scores."""
    now = util.utcnow()
    region = GEOCODER.regions_for_mcc(235, metadata=True)[0]
    # Two stored stations plus one built (region 'DE') station.
    stored = [BlueShardFactory(samples=10), BlueShardFactory(samples=20)]
    foreign = BlueShardFactory.build(region='DE', samples=100)
    session.flush()
    query = self.model_query(
        geoip_db, http_session, session, stats,
        blues=stored + [foreign])
    results = source.search(query)
    self.check_model_results(results, [region])
    best_result = results.best()
    assert best_result.region_code == region.code
    # Only the two stored stations contribute to the score.
    expected_score = station_score(stored[0], now) + station_score(stored[1], now)
    assert best_result.score == expected_score
    stats.check(counter=[
        (self.api_type + '.source',
         ['key:test', 'region:none', 'source:internal',
          'accuracy:low', 'status:hit']),
    ])
def test_blue(self, geoip_db, http_session, session, source, metricsmock):
    """Bluetooth stations drive the region result, score and metrics."""
    now = util.utcnow()
    region = GEOCODER.regions_for_mcc(235, metadata=True)[0]
    local_blues = [BlueShardFactory(samples=10), BlueShardFactory(samples=20)]
    foreign_blue = BlueShardFactory.build(region="DE", samples=100)
    session.flush()
    query = self.model_query(
        geoip_db, http_session, session, blues=local_blues + [foreign_blue]
    )
    results = source.search(query)
    self.check_model_results(results, [region])
    best_result = results.best()
    assert best_result.region_code == region.code
    # Only the two stored stations are counted into the score.
    expected = station_score(local_blues[0], now) + station_score(
        local_blues[1], now
    )
    assert best_result.score == expected
    assert metricsmock.has_record(
        "incr",
        self.api_type + ".source",
        value=1,
        tags=[
            "key:test",
            "region:none",
            "source:internal",
            "accuracy:low",
            "status:hit",
        ],
    )
def test_multiple_cells(self, geoip_db, http_session, session, source):
    """Two cells of the same area yield one combined, summed-score result."""
    now = util.utcnow()
    heavy = CellShardFactory(samples=100)
    # Second cell in the same area, one cid over and offset by one degree.
    light = CellShardFactory(
        radio=heavy.radio,
        mcc=heavy.mcc,
        mnc=heavy.mnc,
        lac=heavy.lac,
        cid=heavy.cid + 1,
        lat=heavy.lat + 1.0,
        lon=heavy.lon + 1.0,
        samples=10,
    )
    session.flush()
    query = self.model_query(geoip_db, http_session, session, cells=[heavy, light])
    results = source.search(query)
    # Expected combined position offset and the maximum cell accuracy.
    self.check_model_results(
        results,
        [heavy],
        lat=heavy.lat + 0.3333333,
        lon=heavy.lon + 0.3333333,
        accuracy=CELL_MAX_ACCURACY,
    )
    expected_score = station_score(heavy, now) + station_score(light, now)
    assert results.best().score == expected_score
def test_cluster_score_over_size(self, geoip_db, http_session, session, source):
    """The best cluster is chosen by total score, not by network count."""
    now = util.utcnow()
    yesterday = now - timedelta(days=1)
    last_week = now - timedelta(days=7)
    three_months = now - timedelta(days=90)
    four_months = now - timedelta(days=120)
    # First cluster: three nearby networks with short, recent histories.
    wifi11 = WifiShardFactory(samples=20, created=last_week, modified=yesterday)
    wifi12 = WifiShardFactory(
        lat=wifi11.lat + 0.00003,
        lon=wifi11.lon,
        samples=30,
        created=yesterday,
        modified=now,
    )
    wifi13 = WifiShardFactory(
        lat=wifi11.lat - 0.00003,
        lon=wifi11.lon,
        samples=10,
        created=yesterday,
        modified=now,
    )
    # Second cluster: only two networks, ~1 degree away, with longer
    # histories and more samples.
    wifi21 = WifiShardFactory(
        lat=wifi11.lat + 1.0,
        lon=wifi11.lon + 1.0,
        samples=40,
        created=four_months,
        modified=three_months,
    )
    wifi22 = WifiShardFactory(
        lat=wifi21.lat,
        lon=wifi21.lon,
        samples=50,
        created=three_months,
        modified=last_week,
    )
    session.flush()
    query = self.model_query(
        geoip_db,
        http_session,
        session,
        wifis=[wifi11, wifi12, wifi13, wifi21, wifi22],
    )
    results = source.search(query)
    assert len(results) == 2
    # The smaller (two-network) cluster wins on combined score.
    best_result = results.best()
    assert round(best_result.lat, 7) == round(wifi21.lat, 7)
    assert round(best_result.lon, 7) == round(wifi21.lon, 7)
    assert round(best_result.accuracy, 2) == 10.0
    assert round(best_result.score, 2) == round(
        station_score(wifi21, now) + station_score(wifi22, now), 2
    )
    # The lower-scoring result sits at the first cluster's position.
    other_result = [res for res in results if res.score < best_result.score][0]
    assert round(other_result.lat, 4) == round(wifi11.lat, 4)
    assert round(other_result.lon, 4) == round(wifi11.lon, 4)
def test_top_results_in_noisy_cluster(self, geoip_db, http_session, session, source):
    """More networks than MAX_WIFIS_IN_CLUSTER still score and locate."""
    now = util.utcnow()
    template = WifiShardFactory.build()
    # All these should wind up in the same cluster since the WiFis are
    # spaced in increments of (+0.1m, +0.12m).
    wifis = [
        WifiShardFactory(
            lat=template.lat + i * 0.000001,
            lon=template.lon + i * 0.0000012,
            samples=100 - i,
        )
        for i in range(MAX_WIFIS_IN_CLUSTER + 10)
    ]
    session.flush()
    # Calculate the expected combined score up front.
    expected_score = sum(station_score(wifi, now) for wifi in wifis)
    query = self.model_query(geoip_db, http_session, session, wifis=wifis)
    # Assign strictly decreasing signal strengths in query order.
    for offset, entry in enumerate(query.wifi):
        entry.signalStrength = -50 - offset
    results = source.search(query)
    result = results.best()
    assert round(result.lat, 4) == round(template.lat, 4)
    assert round(result.lon, 4) == round(template.lon, 4)
    assert round(result.score, 4) == round(expected_score, 4)
def cluster_cells(cells, lookups, min_age=0):
    """
    Cluster cells by area.

    :param cells: Cell model instances loaded from the database.
    :param lookups: The cell observations from the query, providing
        per-cell age and signal strength.
    :param min_age: Fallback age for lookups that did not report one.
    :returns: A list of numpy arrays (dtype ``NETWORK_DTYPE``),
        one per cell area.
    """
    now = util.utcnow()
    today = now.date()

    # Create a dict of cell ids mapped to their age and signal strength.
    obs_data = {}
    for lookup in lookups:
        obs_data[decode_cellid(lookup.cellid)] = (
            # Clamp the age to at least one second (in milliseconds).
            max(abs(lookup.age or min_age), 1000),
            lookup.signalStrength or MIN_CELL_SIGNAL[lookup.radioType])

    areas = defaultdict(list)
    for cell in cells:
        areas[area_id(cell)].append(cell)

    clusters = []
    for area_cells in areas.values():
        clusters.append(
            numpy.array(
                [(cell.lat, cell.lon, cell.radius,
                  obs_data[cell.cellid][0],
                  obs_data[cell.cellid][1],
                  station_score(cell, now),
                  encode_cellid(*cell.cellid),
                  # Guard against a missing last_seen date; comparing
                  # None >= today would raise a TypeError.
                  bool(cell.last_seen is not None and
                       cell.last_seen >= today))
                 for cell in area_cells],
                dtype=NETWORK_DTYPE))
    return clusters
def test_block_last(self):
    """Score a station that carries a block date between its timestamps."""
    now = util.utcnow()
    created = now - timedelta(days=70)
    modified = now - timedelta(days=60)
    block_last = (now - timedelta(days=65)).date()
    station = Dummy(created, modified, 10, 64, block_last)
    assert round(station_score(station, now), 2) == 1.73
def cluster_cells(cells, lookups, min_age=0):
    """
    Cluster cells by area.

    :param cells: Cell model instances loaded from the database.
    :param lookups: The cell observations from the query, providing
        per-cell age and signal strength.
    :param min_age: Fallback age for lookups that did not report one.
    :returns: A list of numpy arrays (dtype ``NETWORK_DTYPE``),
        one per cell area.
    """
    now = util.utcnow()
    today = now.date()

    # Create a dict of cell ids mapped to their age and signal strength.
    obs_data = {}
    for lookup in lookups:
        obs_data[decode_cellid(lookup.cellid)] = (
            # Clamp the age to at least one second (in milliseconds).
            max(abs(lookup.age or min_age), 1000),
            lookup.signalStrength or MIN_CELL_SIGNAL[lookup.radioType])

    areas = defaultdict(list)
    for cell in cells:
        areas[area_id(cell)].append(cell)

    clusters = []
    for area_cells in areas.values():
        clusters.append(numpy.array([(
            cell.lat, cell.lon, cell.radius,
            obs_data[cell.cellid][0],
            obs_data[cell.cellid][1],
            station_score(cell, now),
            encode_cellid(*cell.cellid),
            # A cell may never have been seen; only compare when set,
            # otherwise None >= today raises a TypeError.
            bool(cell.last_seen is not None and cell.last_seen >= today))
            for cell in area_cells],
            dtype=NETWORK_DTYPE))
    return clusters
def test_last_seen(self):
    """Score a station with both a block date and a last_seen date."""
    now = util.utcnow()
    created = now - timedelta(days=70)
    modified = now - timedelta(days=60)
    block_last = (now - timedelta(days=65)).date()
    last_seen = (now - timedelta(days=58)).date()
    station = Dummy(created, modified, 10, 64, block_last, last_seen)
    assert round(station_score(station, now), 2) == 2.42
def test_cell(self, geoip_db, http_session, session, source):
    """A single stored cell is found and carries its own score."""
    now = util.utcnow()
    cell = CellShardFactory(samples=10)
    session.flush()
    query = self.model_query(geoip_db, http_session, session, cells=[cell])
    results = source.search(query)
    self.check_model_results(results, [cell])
    best = results.best()
    assert best.score == station_score(cell, now)
def test_cell(self, geoip_db, http_session, session, source, stats):
    """A single stored cell is found and carries its own score."""
    now = util.utcnow()
    cell = CellShardFactory(samples=10)
    session.flush()
    query = self.model_query(
        geoip_db, http_session, session, stats, cells=[cell])
    results = source.search(query)
    self.check_model_results(results, [cell])
    best = results.best()
    assert best.score == station_score(cell, now)
def test_multiple_cells(self, geoip_db, http_session, session, source, stats):
    """Two cells of the same area yield one combined, summed-score result."""
    now = util.utcnow()
    cell = CellShardFactory(samples=100)
    # A second cell in the same area, one cid over, offset by one degree.
    neighbor = CellShardFactory(
        radio=cell.radio, mcc=cell.mcc, mnc=cell.mnc,
        lac=cell.lac, cid=cell.cid + 1,
        lat=cell.lat + 1.0, lon=cell.lon + 1.0,
        samples=10)
    session.flush()
    query = self.model_query(
        geoip_db, http_session, session, stats,
        cells=[cell, neighbor])
    results = source.search(query)
    # Expected combined position offset and the maximum cell accuracy.
    self.check_model_results(
        results, [cell],
        lat=cell.lat + 0.3333333,
        lon=cell.lon + 0.3333333,
        accuracy=CELL_MAX_ACCURACY)
    expected_score = station_score(cell, now) + station_score(neighbor, now)
    assert results.best().score == expected_score
def test_wifi(self, geoip_db, http_session, session, source, metricsmock):
    """WiFi networks determine the region result, score and metrics."""
    now = util.utcnow()
    region = GEOCODER.regions_for_mcc(235, metadata=True)[0]
    stored = [WifiShardFactory(samples=10), WifiShardFactory(samples=20)]
    foreign = WifiShardFactory.build(region="DE", samples=100)
    session.flush()
    query = self.model_query(
        geoip_db, http_session, session, wifis=stored + [foreign])
    results = source.search(query)
    self.check_model_results(results, [region])
    best_result = results.best()
    assert best_result.region_code == region.code
    # Only the two stored networks are counted into the score.
    assert best_result.score == (
        station_score(stored[0], now) + station_score(stored[1], now)
    )
    metricsmock.assert_incr_once(
        self.api_type + ".source",
        tags=["key:test", "source:internal", "accuracy:low", "status:hit"],
    )
def test_blue(self, geoip_db, http_session, session, source, stats):
    """A Bluetooth-only query resolves the region and sums station scores."""
    now = util.utcnow()
    region = GEOCODER.regions_for_mcc(235, metadata=True)[0]
    blue1 = BlueShardFactory(samples=10)
    blue2 = BlueShardFactory(samples=20)
    # blue3 is only built (not saved) and belongs to a different region.
    blue3 = BlueShardFactory.build(region='DE', samples=100)
    session.flush()
    query = self.model_query(
        geoip_db, http_session, session, stats,
        blues=[blue1, blue2, blue3])
    results = source.search(query)
    self.check_model_results(results, [region])
    best_result = results.best()
    assert best_result.region_code == region.code
    # Only the two stored stations contribute to the result score.
    assert (best_result.score ==
            station_score(blue1, now) +
            station_score(blue2, now))
    stats.check(counter=[
        (self.api_type + '.source',
            ['key:test', 'region:none', 'source:internal',
             'accuracy:low', 'status:hit']),
    ])
def search_wifi(self, query):
    """Build per-region results from the query's WiFi networks.

    Each matched network adds its station score to its region's total.
    """
    results = self.result_list()
    now = util.utcnow()
    scores_by_region = defaultdict(int)
    for wifi in query_macs(query, query.wifi, self.raven_client, WifiShard):
        scores_by_region[wifi.region] += station_score(wifi, now)
    # Emit one result per region code we can resolve.
    for code, score in scores_by_region.items():
        region = GEOCODER.region_for_code(code)
        if not region:
            continue
        results.add(self.result_type(
            region_code=code,
            region_name=region.name,
            accuracy=region.radius,
            score=score))
    return results
def search_blue(self, query):
    """Build per-region results from the query's Bluetooth networks.

    Each matched network adds its station score to its region's total.
    """
    results = self.result_list()
    now = util.utcnow()
    scores_by_region = defaultdict(int)
    for blue in query_macs(query, query.blue, self.raven_client, BlueShard):
        scores_by_region[blue.region] += station_score(blue, now)
    # Emit one result per region code we can resolve.
    for code, score in scores_by_region.items():
        region = GEOCODER.region_for_code(code)
        if not region:
            continue
        results.add(self.result_type(
            region_code=code,
            region_name=region.name,
            accuracy=region.radius,
            score=score))
    return results
def search_wifi(self, query):
    """Build per-region results from the query's WiFi networks.

    Each matched network adds its station score to its region's total.
    """
    results = self.result_list()
    now = util.utcnow()
    regions = defaultdict(int)
    wifis = query_macs(query, query.wifi, self.raven_client, WifiShard)
    for wifi in wifis:
        regions[wifi.region] += station_score(wifi, now)
    # One result per region code that can be resolved.
    for code, score in regions.items():
        region = GEOCODER.region_for_code(code)
        if region:
            results.add(
                self.result_type(region_code=code,
                                 region_name=region.name,
                                 accuracy=region.radius,
                                 score=score))
    return results
def search_blue(self, query):
    """Build per-region results from the query's Bluetooth networks.

    Each matched network adds its station score to its region's total.
    """
    results = self.result_list()
    now = util.utcnow()
    regions = defaultdict(int)
    blues = query_macs(query, query.blue, self.raven_client, BlueShard)
    for blue in blues:
        regions[blue.region] += station_score(blue, now)
    # One result per region code that can be resolved.
    for code, score in regions.items():
        region = GEOCODER.region_for_code(code)
        if region:
            results.add(self.result_type(
                region_code=code,
                region_name=region.name,
                accuracy=region.radius,
                score=score))
    return results
def cluster_networks(models, lookups,
                     min_age=0, min_radius=None, min_signal=None,
                     max_distance=None):
    """
    Given a list of database models and lookups, return a list of
    clusters of nearby networks.

    :param models: Station models (lat/lon/radius/mac/last_seen).
    :param lookups: The query observations, used for age and signal.
    :param min_age: Fallback age for lookups that did not report one.
    :param min_radius: Fallback radius for models without one.
    :param min_signal: Fallback signal strength for lookups without one.
    :param max_distance: Maximum distance in meters between members
        of one cluster.
    :returns: A list of numpy arrays (dtype ``NETWORK_DTYPE``), each
        containing at least two networks.
    """
    now = util.utcnow()
    today = now.date()

    # Create a dict of macs mapped to their age and signal strength.
    obs_data = {}
    for lookup in lookups:
        obs_data[decode_mac(lookup.mac)] = (
            # Clamp the age to at least one second (in milliseconds).
            max(abs(lookup.age or min_age), 1000),
            lookup.signalStrength or min_signal)

    networks = numpy.array([(
        model.lat, model.lon,
        model.radius or min_radius,
        obs_data[model.mac][0],
        obs_data[model.mac][1],
        station_score(model, now),
        encode_mac(model.mac),
        # last_seen may be missing; comparing None >= today would raise
        # a TypeError, so treat it as "not seen today".
        bool(model.last_seen is not None and model.last_seen >= today))
        for model in models], dtype=NETWORK_DTYPE)

    # Only consider clusters that have at least 2 found networks
    # inside them. Otherwise someone could use a combination of
    # one real network and one fake and therefor not found network to
    # get the position of the real network.
    length = len(networks)
    if length < 2:
        # Not enough networks to form a valid cluster.
        return []

    positions = networks[['lat', 'lon']]
    if length == 2:
        one = positions[0]
        two = positions[1]
        if distance(one[0], one[1], two[0], two[1]) <= max_distance:
            # Only two networks and they agree, so cluster them.
            return [networks]
        else:
            # Or they disagree forming two clusters of size one,
            # neither of which is large enough to be returned.
            return []

    # Calculate the condensed distance matrix based on distance in meters.
    # This avoids calculating the square form, which would calculate
    # each value twice and avoids calculating the diagonal of zeros.
    # We avoid the special cases for length < 2 with the above checks.
    # See scipy.spatial.distance.squareform and
    # https://stackoverflow.com/questions/13079563
    dist_matrix = numpy.zeros(length * (length - 1) // 2, dtype=numpy.double)
    for i, (a, b) in enumerate(itertools.combinations(positions, 2)):
        dist_matrix[i] = distance(a[0], a[1], b[0], b[1])

    link_matrix = hierarchy.linkage(dist_matrix, method='complete')
    assignments = hierarchy.fcluster(
        link_matrix, max_distance, criterion='distance', depth=2)

    indexed_clusters = defaultdict(list)
    for i, net in zip(assignments, networks):
        indexed_clusters[i].append(net)

    clusters = []
    for values in indexed_clusters.values():
        # Drop single-network clusters, see the comment above.
        if len(values) >= 2:
            clusters.append(numpy.array(values, dtype=NETWORK_DTYPE))

    return clusters
def cluster_networks(models, lookups, min_age=0, min_radius=None, min_signal=None, max_distance=None):
    """
    Given a list of database models and lookups, return a list of
    clusters of nearby networks.

    :param models: Station models (lat/lon/radius/mac/last_seen).
    :param lookups: The query observations, used for age and signal.
    :param min_age: Fallback age for lookups that did not report one.
    :param min_radius: Fallback radius for models without one.
    :param min_signal: Fallback signal strength for lookups without one.
    :param max_distance: Maximum distance in meters between members
        of one cluster.
    :returns: A list of numpy arrays (dtype ``NETWORK_DTYPE``), each
        containing at least two networks.
    """
    now = util.utcnow()
    today = now.date()

    # Create a dict of macs mapped to their age and signal strength.
    obs_data = {}
    for lookup in lookups:
        obs_data[decode_mac(lookup.mac)] = (
            # Clamp the age to at least one second (in milliseconds).
            max(abs(lookup.age or min_age), 1000),
            lookup.signalStrength or min_signal,
        )

    networks = numpy.array(
        [(
            model.lat,
            model.lon,
            model.radius or min_radius,
            obs_data[model.mac][0],
            obs_data[model.mac][1],
            station_score(model, now),
            encode_mac(model.mac, codec="base64"),
            # last_seen may be missing; treat that as "not seen today".
            bool(model.last_seen is not None and model.last_seen >= today),
        ) for model in models],
        dtype=NETWORK_DTYPE,
    )

    # Only consider clusters that have at least 2 found networks
    # inside them. Otherwise someone could use a combination of
    # one real network and one fake and therefor not found network to
    # get the position of the real network.
    length = len(networks)
    if length < 2:
        # Not enough networks to form a valid cluster.
        return []

    positions = networks[["lat", "lon"]]
    if length == 2:
        one = positions[0]
        two = positions[1]
        if distance(one[0], one[1], two[0], two[1]) <= max_distance:
            # Only two networks and they agree, so cluster them.
            return [networks]
        else:
            # Or they disagree forming two clusters of size one,
            # neither of which is large enough to be returned.
            return []

    # Calculate the condensed distance matrix based on distance in meters.
    # This avoids calculating the square form, which would calculate
    # each value twice and avoids calculating the diagonal of zeros.
    # We avoid the special cases for length < 2 with the above checks.
    # See scipy.spatial.distance.squareform and
    # https://stackoverflow.com/questions/13079563
    dist_matrix = numpy.zeros(length * (length - 1) // 2, dtype=numpy.double)
    for i, (a, b) in enumerate(itertools.combinations(positions, 2)):
        dist_matrix[i] = distance(a[0], a[1], b[0], b[1])

    link_matrix = hierarchy.linkage(dist_matrix, method="complete")
    assignments = hierarchy.fcluster(link_matrix, max_distance, criterion="distance", depth=2)

    indexed_clusters = defaultdict(list)
    for i, net in zip(assignments, networks):
        indexed_clusters[i].append(net)

    clusters = []
    for values in indexed_clusters.values():
        # Drop single-network clusters, see the comment above.
        if len(values) >= 2:
            clusters.append(numpy.array(values, dtype=NETWORK_DTYPE))

    return clusters