def cluster_cells(cells, lookups, min_age=0):
    """
    Cluster cells by area.

    :param cells: cell station records carrying lat/lon/radius/cellid
        and last_seen attributes.
    :param lookups: query lookups carrying cellid, age and
        signalStrength.
    :param min_age: fallback age used when a lookup reports no age.
    :return: a list of numpy arrays (dtype NETWORK_DTYPE), one per area.
    """
    now = util.utcnow()
    today = now.date()

    # Map each decoded cell id to its observed (age, signal) pair.
    # Age is clamped to at least 1000; a missing signal strength falls
    # back to the per-radio-type minimum.
    observations = {}
    for lookup in lookups:
        age = max(abs(lookup.age or min_age), 1000)
        signal = lookup.signalStrength or MIN_CELL_SIGNAL[lookup.radioType]
        observations[decode_cellid(lookup.cellid)] = (age, signal)

    # Bucket the cells by their containing area.
    by_area = defaultdict(list)
    for cell in cells:
        by_area[area_id(cell)].append(cell)

    def _row(cell):
        # One structured-array record for a single cell.
        age, signal = observations[cell.cellid]
        return (
            cell.lat,
            cell.lon,
            cell.radius,
            age,
            signal,
            station_score(cell, now),
            encode_cellid(*cell.cellid),
            bool(cell.last_seen >= today),
        )

    return [
        numpy.array([_row(cell) for cell in area_cells], dtype=NETWORK_DTYPE)
        for area_cells in by_area.values()
    ]
def test_region_all_none(self, celery, session):
    """If all cell regions are None, the area region is None."""
    # Sardinia, in the Mediterranean, not identified as part of Italy.
    cell = self.cell_factory(
        radio=Radio.wcdma, mcc=204, mnc=4, lac=35051, cid=1018429,
        lat=40.18, lon=9.59, radius=10, region=None,
    )
    assert cell.region is None

    # A second nearby cell in the same area, also without a region.
    sibling = self.cell_factory(
        radio=cell.radio, mcc=cell.mcc, mnc=cell.mnc, lac=cell.lac,
        cid=cell.cid + 1, lat=cell.lat + 0.1, lon=cell.lon + 0.1,
        radius=10, region=None,
    )
    assert sibling.region is None

    session.flush()
    self.area_queue(celery).enqueue([area_id(cell)])
    self.task.delay().get()

    area = session.query(self.area_model).one()
    assert area.region is None
def cluster_cells(cells, lookups, min_age=0):
    """
    Cluster cells by area.

    :param cells: cell station records with lat/lon/radius/cellid and
        last_seen attributes.
    :param lookups: query lookups with cellid, age and signalStrength.
    :param min_age: fallback age used when a lookup reports no age.
    :return: a list of numpy arrays (dtype NETWORK_DTYPE), one per area.
    """
    now = util.utcnow()
    today = now.date()
    # Create a dict of cell ids mapped to their age and signal strength.
    # The age is clamped to at least 1000; a missing signal strength
    # falls back to the per-radio-type minimum.
    obs_data = {}
    for lookup in lookups:
        obs_data[decode_cellid(lookup.cellid)] = (max(
            abs(lookup.age or min_age), 1000),
            lookup.signalStrength or MIN_CELL_SIGNAL[lookup.radioType])
    # Group the cells by their area id.
    areas = defaultdict(list)
    for cell in cells:
        areas[area_id(cell)].append(cell)
    # One structured numpy array per area; each record combines the
    # station data with the matching observation and a freshness flag.
    clusters = []
    for area_cells in areas.values():
        clusters.append(
            numpy.array(
                [(cell.lat, cell.lon, cell.radius,
                  obs_data[cell.cellid][0],
                  obs_data[cell.cellid][1],
                  station_score(cell, now),
                  encode_cellid(*cell.cellid),
                  bool(cell.last_seen >= today))
                 for cell in area_cells],
                dtype=NETWORK_DTYPE))
    return clusters
def test_region_outside_tie(self, celery, session):
    """Two tied cells that both report region "PR" yield area region "PR"."""
    shared = dict(radio=Radio.gsm, mcc=310, mnc=1, lac=1)
    cell = self.cell_factory(
        cid=1, lat=18.33, lon=-64.9, radius=10000, region="PR", **shared
    )
    self.cell_factory(
        cid=2, lat=18.34, lon=-64.9, radius=10000, region="PR", **shared
    )
    session.flush()

    self.area_queue(celery).enqueue([area_id(cell)])
    self.task.delay().get()

    area = session.query(self.area_model).one()
    assert area.region == "PR"
def test_region(self, celery, session):
    """An area with one XW cell and one IL cell resolves its region to IL."""
    base = dict(radio=Radio.gsm, mcc=425, mnc=1, lac=1, radius=10000)
    cell = self.cell_factory(cid=1, lat=32.2, lon=35.0, region="XW", **base)
    self.cell_factory(cid=2, lat=32.2, lon=34.9, region="IL", **base)
    session.flush()

    self.area_queue(celery).enqueue([area_id(cell)])
    self.task.delay().get()

    area = session.query(self.area_model).one()
    assert area.region == "IL"
def test_region_outside_tie(self, celery, session):
    # Both cells report the same region, so the tie resolves to it.
    cell = self.cell_factory(
        radio=Radio.gsm, mcc=310, mnc=1, lac=1, cid=1,
        lat=18.33, lon=-64.9, radius=10000, region='PR')
    self.cell_factory(
        radio=cell.radio, mcc=cell.mcc, mnc=cell.mnc, lac=cell.lac,
        cid=2, lat=18.34, lon=-64.9, radius=10000, region='PR')
    session.flush()

    queue = self.area_queue(celery)
    queue.enqueue([area_id(cell)])
    self.task.delay().get()

    result = session.query(self.area_model).one()
    assert result.region == 'PR'
def test_region(self, celery, session):
    # One cell in region XW, one in IL; the area resolves to IL.
    cell = self.cell_factory(
        radio=Radio.gsm, mcc=425, mnc=1, lac=1, cid=1,
        lat=32.2, lon=35.0, radius=10000, region='XW')
    sibling_kwargs = dict(
        radio=cell.radio, mcc=cell.mcc, mnc=cell.mnc, lac=cell.lac,
        cid=2, lat=32.2, lon=34.9, radius=10000, region='IL')
    self.cell_factory(**sibling_kwargs)
    session.flush()

    self.area_queue(celery).enqueue([area_id(cell)])
    self.task.delay().get()

    found = session.query(self.area_model).one()
    assert found.region == 'IL'
def test_region_null_tied(self, celery, session):
    """If an equal number of cells have region=None, the area is None."""
    # Bornholm, an island in the Baltic sea, not identified as part of
    # Denmark.
    null_cell = self.cell_factory(
        radio=Radio.wcdma, mcc=204, mnc=175, lac=1515, cid=13241603,
        lat=55.115, lon=14.88, radius=10, region=None,
    )
    assert null_cell.region is None

    # Reeuwijk, Netherlands.
    self.cell_factory(
        radio=Radio.wcdma, mcc=null_cell.mcc, mnc=null_cell.mnc,
        lac=null_cell.lac, cid=null_cell.cid + 2,
        lat=52.056, lon=4.733, radius=10, region="NL",
    )
    session.flush()

    self.area_queue(celery).enqueue([area_id(null_cell)])
    self.task.delay().get()

    area = session.query(self.area_model).one()
    assert area.region is None
def read_stations_from_csv(session, file_handle, redis_client, cellarea_queue):
    """
    Read stations from a public cell export CSV.

    :arg session: a database session
    :arg file_handle: an open file handle for the CSV data
    :arg redis_client: a Redis client
    :arg cellarea_queue: the DataQueue for updating cellarea IDs

    Inserts new stations, updates existing stations when the CSV row is
    newer, and enqueues the affected cell area IDs for recalculation.
    Raises InvalidCSV for an unknown radio type or when the first data
    row fails validation.
    """
    # Avoid circular imports
    from ichnaea.data.tasks import update_cellarea, update_statregion

    # peekable allows inspecting the first row without consuming it.
    csv_content = peekable(reader(file_handle))
    # UMTS was the original name for WCDMA stations
    radio_type = {"UMTS": "wcdma", "GSM": "gsm", "LTE": "lte", "": "Unknown"}
    counts = defaultdict(Counter)  # per-radio tallies of new/updated/found
    areas = set()  # cell area IDs pending enqueueing
    areas_total = 0
    total = 0

    if not csv_content:
        LOGGER.warning("Nothing to process.")
        return

    first_row = csv_content.peek()
    if first_row == _FIELD_NAMES:
        # Skip the first row because it's a header row
        next(csv_content)
    else:
        LOGGER.warning("Expected header row, got data: %s", first_row)

    for row in csv_content:
        try:
            radio = radio_type[row[0]]
        except KeyError:
            raise InvalidCSV("Unknown radio type in row: %s" % row)

        if radio == "Unknown":
            # Blank radio column maps to "Unknown"; skip, don't fail.
            LOGGER.warning("Skipping unknown radio: %s", row)
            continue

        try:
            data = {
                "radio": radio,
                "mcc": int(row[1]),
                "mnc": int(row[2]),
                "lac": int(row[3]),
                "cid": int(row[4]),
                "psc": int(row[5]) if row[5] else 0,
                "lon": float(row[6]),
                "lat": float(row[7]),
                # Some exported radiuses exceed the max and fail validation
                "radius": min(int(row[8]), CELL_MAX_RADIUS),
                "samples": int(row[9]),
                # row[10] is "changable", always 1 and not imported
                "created": datetime.fromtimestamp(int(row[11]), UTC),
                "modified": datetime.fromtimestamp(int(row[12]), UTC),
            }
            shard = CellShard.create(_raise_invalid=True, **data)
        except (colander.Invalid, ValueError) as e:
            if total == 0:
                # If the first row is invalid, it's likely the rest of the
                # file is, too--drop out here.
                raise InvalidCSV("first row %s is invalid: %s" % (row, e))
            else:
                LOGGER.warning("row %s is invalid: %s", row, e)
                continue

        # Is this station in the database?
        shard_type = shard.__class__
        existing = (session.query(shard_type).filter(
            shard_type.cellid == shard.cellid).options(
                load_only("modified")).one_or_none())
        if existing:
            if existing.modified < data["modified"]:
                # Update existing station with new data
                operation = "updated"
                existing.psc = shard.psc
                existing.lon = shard.lon
                existing.lat = shard.lat
                existing.radius = shard.radius
                existing.samples = shard.samples
                existing.created = shard.created
                existing.modified = shard.modified
            else:
                # Do nothing to existing station record
                operation = "found"
        else:
            # Add a new station record; the bounding box starts as a
            # single point at the station position.
            operation = "new"
            shard.min_lat = shard.lat
            shard.max_lat = shard.lat
            shard.min_lon = shard.lon
            shard.max_lon = shard.lon
            session.add(shard)
        counts[data["radio"]][operation] += 1

        # Process the cell area?
        if operation in {"new", "updated"}:
            areas.add(area_id(shard))

        # Process a chunk of stations, report on progress
        total += 1
        if total % 1000 == 0:
            session.commit()
            LOGGER.info("Processed %d stations", total)

        # Periodically flush the accumulated area IDs to the queue and
        # kick off the cell area update task.
        if areas and (len(areas) % 1000 == 0):
            session.commit()
            areas_total += len(areas)
            LOGGER.info("Processed %d station areas", areas_total)
            with redis_pipeline(redis_client) as pipe:
                cellarea_queue.enqueue(list(areas), pipe=pipe)
            update_cellarea.delay()
            areas = set()

    # Commit remaining station data
    session.commit()

    # Update the remaining cell areas
    if areas:
        areas_total += len(areas)
        with redis_pipeline(redis_client) as pipe:
            cellarea_queue.enqueue(list(areas), pipe=pipe)
        update_cellarea.delay()

    # Now that we've updated all the cell areas, we need to update the
    # statregion
    update_statregion.delay()

    # Summarize results
    LOGGER.info("Complete, processed %d station%s:", total,
                "" if total == 1 else "s")
    for radio_type, op_counts in sorted(counts.items()):
        LOGGER.info(
            " %s: %d new, %d updated, %d already loaded",
            radio_type,
            op_counts["new"],
            op_counts["updated"],
            op_counts["found"],
        )
    if areas_total:
        LOGGER.info(" %d station area%s updated", areas_total,
                    "" if areas_total == 1 else "s")