def search_wifi(session, data):
    """Estimate a position from the wifi networks reported in ``data``.

    Normalizes the reported keys, looks them up in the Wifi table and
    averages the known positions. Returns a dict with quantized lat/lon
    and a fixed 500m accuracy, or None when fewer than two networks
    with known positions are available.
    """
    keys = {normalize_wifi_key(entry['key']) for entry in data['wifi']}
    if not any(keys):
        # none of the keys normalized to anything usable
        return None
    if len(keys) < 2:
        # a single network is not enough, bail out
        return None
    sql_null = None  # avoid pep8 warning
    rows = session.query(Wifi.lat, Wifi.lon).filter(
        Wifi.key.in_(keys)).filter(Wifi.lat != sql_null).filter(
        Wifi.lon != sql_null).all()
    total = len(rows)
    if total < 2:
        # we matched fewer than two networks with known positions
        return None
    lat_sum = sum(row[0] for row in rows)
    lon_sum = sum(row[1] for row in rows)
    return {
        'lat': quantize(lat_sum / total),
        'lon': quantize(lon_sum / total),
        'accuracy': 500,
    }
def search_wifi(session, data):
    """Average the known positions of the reported wifi networks.

    Returns a dict with quantized lat/lon and a fixed 500m accuracy,
    or None when fewer than two distinct networks match rows with a
    non-null position.
    """
    wifi_keys = set()
    for wifi in data['wifi']:
        wifi_keys.add(normalize_wifi_key(wifi['key']))
    if not any(wifi_keys):
        return None  # no valid normalized keys
    if len(wifi_keys) < 2:
        return None  # we didn't even get two keys, bail out
    sql_null = None  # avoid pep8 warning
    query = session.query(Wifi.lat, Wifi.lon)
    query = query.filter(Wifi.key.in_(wifi_keys))
    query = query.filter(Wifi.lat != sql_null)
    query = query.filter(Wifi.lon != sql_null)
    matches = query.all()
    count = len(matches)
    if count < 2:
        return None  # we got fewer than two actual matches
    avg_lat = sum(m[0] for m in matches) / count
    avg_lon = sum(m[1] for m in matches) / count
    return {
        'lat': quantize(avg_lat),
        'lon': quantize(avg_lon),
        'accuracy': 500,
    }
def process_measure(data, utcnow, session):
    """Create a Measure row from ``data`` plus derived cell/wifi rows.

    Flushes the session so the new measure gets its id assigned, then
    attaches serialized cell/wifi payloads. Returns a tuple of
    (measure, session_objects) where session_objects holds the extra
    cell objects that still need to be added to the session.
    """
    extra_objects = []
    measure = Measure()
    measure.created = utcnow
    measure.time = data['time']
    measure.lat = to_precise_int(data['lat'])
    measure.lon = to_precise_int(data['lon'])
    measure.accuracy = data['accuracy']
    measure.altitude = data['altitude']
    measure.altitude_accuracy = data['altitude_accuracy']
    measure.radio = RADIO_TYPE.get(data['radio'], -1)
    # flush so measure.id gets assigned
    session.add(measure)
    session.flush()
    cell = data.get('cell')
    if cell:
        cells, cell_data = process_cell(cell, measure)
        measure.cell = dumps(cell_data)
        extra_objects.extend(cells)
    wifi = data.get('wifi')
    if wifi:
        # filter out old-style sha1 hashes
        has_long_key = False
        for entry in wifi:
            entry['key'] = key = normalize_wifi_key(entry['key'])
            if len(key) > 12:
                has_long_key = True
                break
        if not has_long_key:
            process_wifi(wifi, measure)
            measure.wifi = dumps(wifi)
    return (measure, extra_objects)
def load_file(session, source_file, batch_size=10000):
    """Import wifi measures from a tab separated ``source_file``.

    Rows that fail to parse are silently skipped. The session is
    flushed every ``batch_size`` records and once more at the end.
    Returns the number of records processed.
    """
    utcnow = datetime.datetime.utcnow().replace(tzinfo=iso8601.UTC)
    # rows with an unknown (zero) timestamp get a date 120 days ago
    utcmin = utcnow - datetime.timedelta(120)
    counter = 0
    with open(source_file, 'r') as fd:
        for fields in csv.reader(fd, delimiter='\t', quotechar=None):
            try:
                timestamp = int(fields[0])
                if timestamp == 0:  # pragma: no cover
                    # unknown time gets an old date
                    time = utcmin
                else:
                    # convert from unixtime to utc
                    time = datetime.datetime.utcfromtimestamp(timestamp)
                key = normalize_wifi_key(str(fields[5]))
                if key == '000000000000':  # pragma: no cover
                    continue
                record = dict(
                    lat=fields[1],
                    lon=fields[2],
                    time=time,
                    accuracy=0,
                    altitude=0,
                    altitude_accuracy=0,
                    radio='',
                    cell=(),
                    wifi=[dict(
                        key=key,
                        signal=int(fields[3]),
                        channel=int(fields[4]),
                    )],
                )
            except (ValueError, IndexError):
                continue
            # side effect, schedules async tasks
            process_measure(record, utcnow, session)
            counter += 1
            # flush every batch_size new records
            if counter % batch_size == 0:
                session.flush()
                print('Added %s records.' % counter)
    # add the rest
    session.flush()
    return counter
def process_measure(measure_id, data, session):
    """Flatten ``data`` into per-cell and per-wifi measure dicts.

    The shared measure fields are merged into each cell/wifi entry in
    place. ``session`` is accepted for interface compatibility but not
    used here. Returns a (cell_measures, wifi_measures) tuple of lists;
    wifi entries are dropped entirely if any key fails validation.
    """
    shared = dict(
        measure_id=measure_id,
        lat=to_precise_int(data['lat']),
        lon=to_precise_int(data['lon']),
        time=encode_datetime(data['time']),
        accuracy=data['accuracy'],
        altitude=data['altitude'],
        altitude_accuracy=data['altitude_accuracy'],
    )
    fallback_radio = RADIO_TYPE.get(data['radio'], -1)
    cell_measures = []
    wifi_measures = []
    if data.get('cell'):
        # merge the shared measure fields into each cell entry
        for cell in data['cell']:
            cell.update(shared)
            if cell['radio'] != '':
                # use the more specific per-cell radio type
                cell['radio'] = RADIO_TYPE.get(cell['radio'], -1)
            else:
                # fall back to the less precise measure radio
                cell['radio'] = fallback_radio
        cell_measures = data['cell']
    if data.get('wifi'):
        # filter out old-style sha1 hashes
        all_keys_valid = True
        for wifi in data['wifi']:
            wifi['key'] = key = normalize_wifi_key(wifi['key'])
            if not valid_wifi_pattern(key):
                all_keys_valid = False
                break
        if all_keys_valid:
            # merge the shared measure fields into each wifi entry
            for wifi in data['wifi']:
                wifi.update(shared)
            wifi_measures = data['wifi']
    return (cell_measures, wifi_measures)
def process_measure(data, utcnow, session, userid=None):
    """Persist a Measure row and fan out async insert tasks.

    Flushes the session so the new row gets its id, builds a plain-dict
    snapshot of the measure and schedules the cell/wifi insert tasks
    with it. Returns the measure instance.
    """
    measure = Measure()
    measure.created = utcnow
    measure.time = data["time"]
    measure.lat = to_precise_int(data["lat"])
    measure.lon = to_precise_int(data["lon"])
    measure.accuracy = data["accuracy"]
    measure.altitude = data["altitude"]
    measure.altitude_accuracy = data["altitude_accuracy"]
    measure.radio = RADIO_TYPE.get(data["radio"], -1)
    session.add(measure)
    # flush so measure.id gets assigned
    session.flush()
    measure_data = dict(
        id=measure.id,
        created=encode_datetime(measure.created),
        lat=measure.lat,
        lon=measure.lon,
        time=encode_datetime(measure.time),
        accuracy=measure.accuracy,
        altitude=measure.altitude,
        altitude_accuracy=measure.altitude_accuracy,
        radio=measure.radio,
    )
    cell = data.get("cell")
    if cell:
        insert_cell_measure.delay(measure_data, cell, userid=userid)
        measure.cell = dumps(cell)
    wifi = data.get("wifi")
    if wifi:
        # filter out old-style sha1 hashes
        found_long_key = False
        for entry in wifi:
            entry["key"] = key = normalize_wifi_key(entry["key"])
            if len(key) > 12:
                found_long_key = True
                break
        if not found_long_key:
            insert_wifi_measure.delay(measure_data, wifi, userid=userid)
            measure.wifi = dumps(wifi)
    return measure
def search_wifi(session, data):
    """Locate a position from at least three known wifi networks.

    Requires three or more matched networks whose latitude and
    longitude spreads both stay below MAX_DIFF; returns the quantized
    centroid with a fixed 500m accuracy, or None.
    """
    wifi_keys = {normalize_wifi_key(w['key']) for w in data['wifi']}
    if not any(wifi_keys):
        return None  # no valid normalized keys
    if len(wifi_keys) < 3:
        return None  # we didn't even get three keys, bail out
    sql_null = None  # avoid pep8 warning
    matches = session.query(Wifi.lat, Wifi.lon).filter(
        Wifi.key.in_(wifi_keys)).filter(
        Wifi.lat != sql_null).filter(
        Wifi.lon != sql_null).all()
    if len(matches) < 3:
        return None  # we got fewer than three actual matches
    latitudes = [m[0] for m in matches]
    longitudes = [m[1] for m in matches]
    # check to make sure all wifi AP's are close by
    # we might later relax this to allow some outliers
    if abs(max(latitudes) - min(latitudes)) >= MAX_DIFF:
        return None
    if abs(max(longitudes) - min(longitudes)) >= MAX_DIFF:
        return None
    count = len(matches)
    return {
        'lat': quantize(sum(latitudes) / count),
        'lon': quantize(sum(longitudes) / count),
        'accuracy': 500,
    }
def load_file(session, source_file, batch_size=100, userid=None):
    """Import wifi measures from a tab separated ``source_file``.

    Parsed records are buffered and handed to process_measures in
    chunks of ``batch_size``; unparsable rows are skipped. Returns the
    number of records processed.
    """
    utcnow = datetime.datetime.utcnow().replace(tzinfo=iso8601.UTC)
    # rows with an unknown (zero) timestamp get a date 120 days ago
    utcmin = utcnow - datetime.timedelta(120)
    counter = 0
    items = []
    with open(source_file, 'r') as fd:
        for fields in csv.reader(fd, delimiter='\t', quotechar=None):
            try:
                timestamp = int(fields[0])
                if timestamp == 0:  # pragma: no cover
                    # unknown time gets an old date
                    time = utcmin
                else:
                    # convert from unixtime to utc
                    time = datetime.datetime.utcfromtimestamp(timestamp)
                key = normalize_wifi_key(str(fields[1]))
                if not valid_wifi_pattern(key):  # pragma: no cover
                    continue
                data = dict(
                    lat=fields[2],
                    lon=fields[3],
                    time=time,
                    accuracy=int(fields[4]),
                    altitude=int(fields[5]),
                    altitude_accuracy=int(fields[6]),
                    radio='',
                    cell=(),
                    wifi=[dict(
                        key=key,
                        channel=int(fields[7]),
                        signal=int(fields[8]),
                    )],
                )
            except (ValueError, IndexError):
                continue
            items.append(data)
            counter += 1
            # flush every batch_size records
            if counter % batch_size == 0:
                process_measures(items, session, userid=userid)
                items = []
                session.flush()
                print('Added %s records.' % counter)
    # process the remaining items
    process_measures(items, session, userid=userid)
    print('Added %s records.' % counter)
    session.flush()
    return counter
def search_wifi(session, data):
    """Estimate a position by clustering the reported wifi networks.

    Matches the normalized keys against the Wifi table, sorts the
    matched networks by reported signal strength (strongest first) and
    greedily clusters them by distance. Returns the quantized centroid
    of the first cluster that reaches three networks, with a fixed
    500m accuracy, or None when no such cluster exists so the next
    location method can try.
    """

    # estimate signal strength at -100 dBm if none is provided,
    # which is worse than the 99th percentile of wifi dBms we
    # see in practice (-98).
    def signal_strength(w):
        if 'signal' in w:
            return int(w['signal'])
        else:
            return -100

    wifi_signals = {normalize_wifi_key(w['key']): signal_strength(w)
                    for w in data['wifi']}
    wifi_keys = set(wifi_signals.keys())
    if not any(wifi_keys):
        # no valid normalized keys
        return None
    if len(wifi_keys) < 3:
        # we didn't even get three keys, bail out
        return None
    sql_null = None  # avoid pep8 warning
    query = session.query(Wifi.key, Wifi.lat, Wifi.lon).filter(
        Wifi.key.in_(wifi_keys)).filter(
        Wifi.lat != sql_null).filter(
        Wifi.lon != sql_null)
    wifis = query.all()
    if len(wifis) < 3:
        # we got fewer than three actual matches
        return None
    wifis = [Network(normalize_wifi_key(w[0]), w[1], w[2]) for w in wifis]
    # sort networks by signal strengths in query, strongest first;
    # the key= form replaces the python-2-only cmp function and is
    # equally stable for equal signal strengths
    wifis.sort(key=lambda n: wifi_signals[n.key], reverse=True)
    clusters = []
    for w in wifis:
        # try to assign w to a cluster (but at most one)
        for c in clusters:
            for n in c:
                if distance(quantize(n.lat), quantize(n.lon),
                            quantize(w.lat), quantize(w.lon)) <= MAX_DIST:
                    c.append(w)
                    w = None
                    break
            if len(c) >= 3:
                # if we have a cluster with at least three
                # networks in it, return its centroid.
                length = len(c)
                avg_lat = sum([n.lat for n in c]) / length
                avg_lon = sum([n.lon for n in c]) / length
                return {
                    'lat': quantize(avg_lat),
                    'lon': quantize(avg_lon),
                    'accuracy': 500,
                }
            if w is None:
                break
        # if w didn't adhere to any cluster, make a new one
        if w is not None:
            clusters.append([w])
    # if we didn't get any clusters with >= 3 networks,
    # the query is a bunch of outliers; give up and
    # let the next location method try.
    return None