def process_report(self, data):

    def add_missing_dict_entries(dst, src):
        # x.update(y) overwrites entries in x with those in y;
        # We want to only add those not already present.
        # We also only want to copy the top-level base report data
        # and not any nested values like cell or wifi.
        for (key, value) in src.items():
            if key != 'radio' and key not in dst \
               and not isinstance(value, (tuple, list, dict)):
                dst[key] = value

    report_data = Report.validate(data)
    if report_data is None:
        return ([], [])

    cell_observations = {}
    wifi_observations = {}

    if data.get('cell'):
        # flatten report / cell data into a single dict
        for cell in data['cell']:
            # only validate the additional fields
            cell = CellReport.validate(cell)
            if cell is None:
                continue
            add_missing_dict_entries(cell, report_data)

            cell_key = CellObservation.to_hashkey(cell)
            if cell_key in cell_observations:
                existing = cell_observations[cell_key]
                if existing['ta'] > cell['ta'] or \
                   (existing['signal'] != 0 and
                    existing['signal'] < cell['signal']) or \
                   existing['asu'] < cell['asu']:
                    cell_observations[cell_key] = cell
            else:
                cell_observations[cell_key] = cell
    cell_observations = cell_observations.values()

    # flatten report / wifi data into a single dict
    if data.get('wifi'):
        for wifi in data['wifi']:
            # only validate the additional fields
            wifi = WifiReport.validate(wifi)
            if wifi is None:
                continue
            add_missing_dict_entries(wifi, report_data)

            wifi_key = WifiObservation.to_hashkey(wifi)
            if wifi_key in wifi_observations:
                existing = wifi_observations[wifi_key]
                if existing['signal'] != 0 and \
                   existing['signal'] < wifi['signal']:
                    wifi_observations[wifi_key] = wifi
            else:
                wifi_observations[wifi_key] = wifi
    wifi_observations = wifi_observations.values()

    return (cell_observations, wifi_observations)
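A quick self-contained check of the copy rules in add_missing_dict_entries; the report and cell values below are made up for illustration:

def add_missing_dict_entries(dst, src):
    # Copy only scalar, top-level entries that dst doesn't already have;
    # 'radio' and nested values such as cell/wifi lists are skipped.
    for (key, value) in src.items():
        if key != 'radio' and key not in dst \
           and not isinstance(value, (tuple, list, dict)):
            dst[key] = value

report = {'lat': 1.0, 'lon': 1.0, 'radio': 'gsm', 'cell': [{}]}
cell = {'lat': 5.0, 'mcc': 262}
add_missing_dict_entries(cell, report)
assert cell == {'lat': 5.0, 'lon': 1.0, 'mcc': 262}
# 'lat' was not overwritten; 'radio' and the nested 'cell' list
# were never copied.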
def process_reports(self, reports, userid=None):
    positions = []
    cell_observations = []
    wifi_observations = []
    for report in reports:
        report['report_id'] = uuid.uuid1()
        cell, wifi = self.process_report(report)
        cell_observations.extend(cell)
        wifi_observations.extend(wifi)
        if cell or wifi:
            positions.append({
                'lat': report['lat'],
                'lon': report['lon'],
            })

    if cell_observations:
        # group observations by cell key and create one task per batch
        self.stats_client.incr('items.uploaded.cell_observations',
                               len(cell_observations))
        if self.api_key_log:
            self.stats_client.incr(
                'items.api_log.%s.uploaded.'
                'cell_observations' % self.api_key_name,
                len(cell_observations))

        cells = defaultdict(list)
        for obs in cell_observations:
            cells[CellObservation.to_hashkey(obs)].append(obs)

        # Create a task per group of 5 cell keys at a time;
        # grouping them helps to avoid per-task overhead.
        cells = list(cells.values())
        batch_size = 5
        countdown = 0
        for i in range(0, len(cells), batch_size):
            values = []
            for observations in cells[i:i + batch_size]:
                values.extend([encode_radio_dict(o) for o in observations])
            # Insert observations; expire the task if it isn't processed
            # within six hours, to avoid queue overload. Also delay each
            # successive task by one more second, for a more even workload
            # and to avoid parallel updates of the same underlying stations.
            self.insert_cell_task.apply_async(
                args=[values],
                kwargs={'userid': userid},
                expires=21600,
                countdown=countdown)
            countdown += 1

    if wifi_observations:
        # group observations by WiFi key and create one task per batch
        self.stats_client.incr('items.uploaded.wifi_observations',
                               len(wifi_observations))
        if self.api_key_log:
            self.stats_client.incr(
                'items.api_log.%s.uploaded.'
                'wifi_observations' % self.api_key_name,
                len(wifi_observations))

        wifis = defaultdict(list)
        for obs in wifi_observations:
            wifis[WifiObservation.to_hashkey(obs)].append(obs)

        # Create a task per group of 20 WiFi keys at a time.
        # We tend to get a huge number of unique WiFi networks per
        # batch upload, with one to very few observations per WiFi;
        # grouping them helps to avoid per-task overhead.
        wifis = list(wifis.values())
        batch_size = 20
        countdown = 0
        for i in range(0, len(wifis), batch_size):
            values = []
            for observations in wifis[i:i + batch_size]:
                values.extend(observations)
            # Insert observations; expire the task if it isn't processed
            # within six hours, to avoid queue overload. Also delay each
            # successive task by one more second, for a more even workload
            # and to avoid parallel updates of the same underlying stations.
            self.insert_wifi_task.apply_async(
                args=[values],
                kwargs={'userid': userid},
                expires=21600,
                countdown=countdown)
            countdown += 1

    if userid is not None:
        scorekey = Score.to_hashkey(
            userid=userid,
            key=ScoreKey.location,
            time=util.utcnow().date())
        Score.incr(self.session, scorekey, len(positions))
    if positions:
        self.process_mapstat(positions)
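The chunk-and-stagger dispatch above is independent of the Celery specifics. A minimal standalone sketch of the same pattern, where batched_dispatch and the dispatch callable (standing in for apply_async) are illustrative names, not part of the original code:

def batched_dispatch(groups, batch_size, dispatch):
    # groups: list of observation lists, one list per station key.
    # dispatch: stand-in for task.apply_async, called with the
    # flattened values and a countdown in seconds.
    countdown = 0
    for i in range(0, len(groups), batch_size):
        values = []
        for observations in groups[i:i + batch_size]:
            values.extend(observations)
        # Each batch starts one second later than the previous one, so
        # tasks touching the same stations are less likely to overlap.
        dispatch(values, countdown)
        countdown += 1

For example, batched_dispatch([[1, 2], [3], [4, 5]], 2, print) emits [1, 2, 3] with countdown 0 and [4, 5] with countdown 1.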
def test_blacklist_moving_cells(self):
    now = util.utcnow()
    long_ago = now - timedelta(days=40)
    session = self.session

    k1 = dict(radio=Radio.cdma, mcc=1, mnc=2, lac=3, cid=4)
    k2 = dict(radio=Radio.cdma, mcc=1, mnc=2, lac=6, cid=8)
    k3 = dict(radio=Radio.cdma, mcc=1, mnc=2, lac=9, cid=12)
    k4 = dict(radio=Radio.cdma, mcc=1, mnc=2, lac=12, cid=16)
    k5 = dict(radio=Radio.cdma, mcc=1, mnc=2, lac=15, cid=20)
    k6 = dict(radio=Radio.cdma, mcc=1, mnc=2, lac=18, cid=24)

    # keys k2, k3 and k4 are expected to be detected as moving
    data = [
        # a cell with an entry but no prior position
        Cell(new_measures=3, total_measures=0, **k1),
        CellObservation(lat=1.001, lon=1.001, **k1),
        CellObservation(lat=1.002, lon=1.005, **k1),
        CellObservation(lat=1.003, lon=1.009, **k1),
        # a cell with a prior known position
        Cell(lat=2.0, lon=2.0, new_measures=2, total_measures=1, **k2),
        CellObservation(lat=2.0, lon=2.0, **k2),
        CellObservation(lat=4.0, lon=2.0, **k2),
        # a cell with a very different prior position
        Cell(lat=1.0, lon=1.0, new_measures=2, total_measures=1, **k3),
        CellObservation(lat=3.0, lon=3.0, **k3),
        CellObservation(lat=-3.0, lon=3.0, **k3),
        # another cell with a prior known position (and negative lat)
        Cell(lat=-4.0, lon=4.0, new_measures=2, total_measures=1, **k4),
        CellObservation(lat=-4.0, lon=4.0, **k4),
        CellObservation(lat=-6.0, lon=4.0, **k4),
        # an already blacklisted cell
        CellBlacklist(time=now, count=1, **k5),
        CellObservation(lat=5.0, lon=5.0, **k5),
        CellObservation(lat=8.0, lon=5.0, **k5),
        # a cell with an old different record we ignore, position
        # estimate has been updated since
        Cell(lat=6.0, lon=6.0, new_measures=2, total_measures=1, **k6),
        CellObservation(lat=6.9, lon=6.9, time=long_ago, **k6),
        CellObservation(lat=6.0, lon=6.0, **k6),
        CellObservation(lat=6.001, lon=6, **k6),
    ]
    session.add_all(data)
    session.commit()

    result = location_update_cell.delay(min_new=1)
    self.assertEqual(result.get(), (5, 3))

    moving = [k2, k3, k4, k5]
    black = session.query(CellBlacklist).all()
    self.assertEqual(set([b.hashkey() for b in black]),
                     set([CellBlacklist.to_hashkey(k) for k in moving]))

    keys = [k1, k2, k3, k4, k5, k6]
    observations = session.query(CellObservation).all()
    self.assertEqual(len(observations), 14)
    self.assertEqual(set([obs.hashkey() for obs in observations]),
                     set([CellObservation.to_hashkey(k) for k in keys]))

    # test duplicate call
    result = location_update_cell.delay(min_new=1)
    self.assertEqual(result.get(), (0, 0))

    self.check_stats(
        total=6,
        timer=[
            # We made duplicate calls
            ('task.data.location_update_cell', 2),
            # One of those would've scheduled a remove_cell task
            ('task.data.remove_cell', 1),
        ],
        gauge=[
            ('task.data.location_update_cell.new_measures_1_100', 2),
        ])
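The fixtures above encode the blacklisting rule under test: a cell is treated as moving when new observations land too far from its previously estimated position. A rough standalone sketch of such a distance check; the equirectangular approximation and the 150 km threshold are illustrative assumptions, not the task's actual implementation:

import math

def looks_moving(prior, observations, max_dist_km=150.0):
    # prior: (lat, lon) of the existing position estimate, or None.
    # observations: iterable of (lat, lon) pairs from new measures.
    if prior is None:
        return False
    lat0, lon0 = prior
    for lat, lon in observations:
        # Equirectangular approximation, adequate at these distances;
        # 111.32 km is roughly one degree of latitude.
        dx = (lon - lon0) * math.cos(math.radians((lat + lat0) / 2))
        dy = lat - lat0
        if math.hypot(dx, dy) * 111.32 > max_dist_km:
            return True
    return False

Under these assumptions k2 is flagged (its prior is at latitude 2.0 and one observation sits at 4.0, roughly 222 km away), while k1, which has no prior position, is not.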
def process_reports(self, reports, userid=None):
    positions = set()
    cell_observations = []
    wifi_observations = []
    for report in reports:
        report['report_id'] = uuid.uuid1()
        cell, wifi = self.process_report(report)
        cell_observations.extend(cell)
        wifi_observations.extend(wifi)
        if (cell or wifi) and report.get('lat') and report.get('lon'):
            positions.add((report['lat'], report['lon']))

    if cell_observations:
        # group observations by cell key and create one task per batch
        self.stats_client.incr('items.uploaded.cell_observations',
                               len(cell_observations))
        if self.api_key and self.api_key.log:
            self.stats_client.incr(
                'items.api_log.%s.uploaded.'
                'cell_observations' % self.api_key.name,
                len(cell_observations))

        cells = defaultdict(list)
        for obs in cell_observations:
            cells[CellObservation.to_hashkey(obs)].append(obs)

        # Create a task per group of 100 cell keys at a time;
        # grouping them helps to avoid per-task overhead.
        cells = list(cells.values())
        batch_size = 100
        countdown = 0
        for i in range(0, len(cells), batch_size):
            values = []
            for observations in cells[i:i + batch_size]:
                values.extend([encode_radio_dict(o) for o in observations])
            # Insert observations; expire the task if it isn't processed
            # within six hours, to avoid queue overload. Also delay each
            # successive task by one more second, for a more even workload
            # and to avoid parallel updates of the same underlying stations.
            self.insert_cell_task.apply_async(
                args=[values],
                kwargs={'userid': userid},
                expires=21600,
                countdown=countdown)
            countdown += 1

    if wifi_observations:
        # group observations by WiFi key and create one task per batch
        self.stats_client.incr('items.uploaded.wifi_observations',
                               len(wifi_observations))
        if self.api_key and self.api_key.log:
            self.stats_client.incr(
                'items.api_log.%s.uploaded.'
                'wifi_observations' % self.api_key.name,
                len(wifi_observations))

        wifis = defaultdict(list)
        for obs in wifi_observations:
            wifis[WifiObservation.to_hashkey(obs)].append(obs)

        # Create a task per group of 100 WiFi keys at a time.
        # We tend to get a huge number of unique WiFi networks per
        # batch upload, with one to very few observations per WiFi;
        # grouping them helps to avoid per-task overhead.
        wifis = list(wifis.values())
        batch_size = 100
        countdown = 0
        for i in range(0, len(wifis), batch_size):
            values = []
            for observations in wifis[i:i + batch_size]:
                values.extend(observations)
            # Insert observations; expire the task if it isn't processed
            # within six hours, to avoid queue overload. Also delay each
            # successive task by one more second, for a more even workload
            # and to avoid parallel updates of the same underlying stations.
            self.insert_wifi_task.apply_async(
                args=[values],
                kwargs={'userid': userid},
                expires=21600,
                countdown=countdown)
            countdown += 1

    self.process_mapstat(positions)
    self.process_score(userid, positions)
def process_report(self, data):

    def add_missing_dict_entries(dst, src):
        # x.update(y) overwrites entries in x with those in y;
        # We want to only add those not already present.
        # We also only want to copy the top-level base report data
        # and not any nested values like cell or wifi.
        for (key, value) in src.items():
            if key != 'radio' and key not in dst \
               and not isinstance(value, (tuple, list, dict)):
                dst[key] = value

    def better_cell_obs(new, old):
        comparators = [
            ('ta', operator.lt),
            ('signal', operator.gt),
            ('asu', operator.gt),
        ]
        for field, better in comparators:
            if (None not in (old[field], new[field]) and
                    better(new[field], old[field])):
                return True
        return False

    def better_wifi_obs(new, old):
        if (None not in (old['signal'], new['signal']) and
                new['signal'] > old['signal']):
            return True
        return False

    report_data = Report.validate(data)
    if report_data is None:
        return ([], [])

    cell_observations = {}
    wifi_observations = {}

    if data.get('cell'):
        # flatten report / cell data into a single dict
        for cell in data['cell']:
            # only validate the additional fields
            cell = CellReport.validate(cell)
            if cell is None:
                continue
            add_missing_dict_entries(cell, report_data)

            cell_key = CellObservation.to_hashkey(cell)
            if cell_key in cell_observations:
                existing = cell_observations[cell_key]
                if better_cell_obs(cell, existing):
                    cell_observations[cell_key] = cell
            else:
                cell_observations[cell_key] = cell
    cell_observations = cell_observations.values()

    # flatten report / wifi data into a single dict
    if data.get('wifi'):
        for wifi in data['wifi']:
            # only validate the additional fields
            wifi = WifiReport.validate(wifi)
            if wifi is None:
                continue
            add_missing_dict_entries(wifi, report_data)

            wifi_key = WifiObservation.to_hashkey(wifi)
            if wifi_key in wifi_observations:
                existing = wifi_observations[wifi_key]
                if better_wifi_obs(wifi, existing):
                    wifi_observations[wifi_key] = wifi
            else:
                wifi_observations[wifi_key] = wifi
    wifi_observations = wifi_observations.values()

    return (cell_observations, wifi_observations)
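better_cell_obs returns True as soon as the new observation improves on any one field where both sides have a value: a lower timing advance, a stronger signal, or a higher ASU. A small standalone demonstration of that comparator; the observation dicts are made-up examples:

import operator

def better_cell_obs(new, old):
    comparators = [
        ('ta', operator.lt),      # lower timing advance is better
        ('signal', operator.gt),  # stronger signal is better
        ('asu', operator.gt),     # higher ASU is better
    ]
    for field, better in comparators:
        if (None not in (old[field], new[field]) and
                better(new[field], old[field])):
            return True
    return False

old = {'ta': 8, 'signal': -95, 'asu': None}
new = {'ta': 8, 'signal': -80, 'asu': 10}
assert better_cell_obs(new, old)      # stronger signal wins
assert not better_cell_obs(old, new)  # the asu pair is skipped (None)

Note that an observation counts as better when it improves on any single field, even if another field is worse.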
def process_observations(observations, session, userid=None,
                         api_key_log=False, api_key_name=None):
    stats_client = get_stats_client()
    positions = []
    cell_observations = []
    wifi_observations = []
    for obs in observations:
        obs['report_id'] = uuid.uuid1()
        cell, wifi = process_observation(obs, session)
        cell_observations.extend(cell)
        wifi_observations.extend(wifi)
        if cell or wifi:
            positions.append({
                'lat': obs['lat'],
                'lon': obs['lon'],
            })

    if cell_observations:
        # group observations by cell key and create one task per batch
        stats_client.incr('items.uploaded.cell_observations',
                          len(cell_observations))
        if api_key_log:
            stats_client.incr(
                'items.api_log.%s.uploaded.cell_observations' % api_key_name,
                len(cell_observations))

        cells = defaultdict(list)
        for obs in cell_observations:
            cells[CellObservation.to_hashkey(obs)].append(obs)

        # Create a task per group of 5 cell keys at a time;
        # grouping them helps to avoid per-task overhead.
        cells = list(cells.values())
        batch_size = 5
        countdown = 0
        for i in range(0, len(cells), batch_size):
            values = []
            for observations in cells[i:i + batch_size]:
                values.extend(observations)
            # Insert observations; expire the task if it isn't processed
            # within six hours, to avoid queue overload. Also delay each
            # successive task by one more second, for a more even workload
            # and to avoid parallel updates of the same underlying stations.
            insert_measures_cell.apply_async(
                args=[values],
                kwargs={'userid': userid},
                expires=21600,
                countdown=countdown)
            countdown += 1

    if wifi_observations:
        # group observations by WiFi key and create one task per batch
        stats_client.incr('items.uploaded.wifi_observations',
                          len(wifi_observations))
        if api_key_log:
            stats_client.incr(
                'items.api_log.%s.uploaded.wifi_observations' % api_key_name,
                len(wifi_observations))

        wifis = defaultdict(list)
        for obs in wifi_observations:
            wifis[WifiObservation.to_hashkey(obs)].append(obs)

        # Create a task per group of 20 WiFi keys at a time.
        # We tend to get a huge number of unique WiFi networks per
        # batch upload, with one to very few observations per WiFi;
        # grouping them helps to avoid per-task overhead.
        wifis = list(wifis.values())
        batch_size = 20
        countdown = 0
        for i in range(0, len(wifis), batch_size):
            values = []
            for observations in wifis[i:i + batch_size]:
                values.extend(observations)
            # Insert observations; expire the task if it isn't processed
            # within six hours, to avoid queue overload. Also delay each
            # successive task by one more second, for a more even workload
            # and to avoid parallel updates of the same underlying stations.
            insert_measures_wifi.apply_async(
                args=[values],
                kwargs={'userid': userid},
                expires=21600,
                countdown=countdown)
            countdown += 1

    if userid is not None:
        process_score(userid, len(positions), session)
    if positions:
        process_mapstat(session, positions)