Example #1
def search_wifi(session, data):
    wifi_data = data['wifi']
    wifi_keys = {normalize_wifi_key(w['key']) for w in wifi_data}
    if not any(wifi_keys):
        # no valid normalized keys
        return None
    if len(wifi_keys) < 2:
        # we didn't even get two keys, bail out
        return None
    sql_null = None  # avoid pep8 warning
    query = session.query(Wifi.lat, Wifi.lon).filter(
        Wifi.key.in_(wifi_keys)).filter(Wifi.lat != sql_null).filter(
            Wifi.lon != sql_null)
    wifis = query.all()
    if len(wifis) < 2:
        # we got fewer than two actual matches
        return None
    length = len(wifis)
    avg_lat = sum([w[0] for w in wifis]) / length
    avg_lon = sum([w[1] for w in wifis]) / length
    return {
        'lat': quantize(avg_lat),
        'lon': quantize(avg_lon),
        'accuracy': 500,
    }
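
The names above (Wifi, normalize_wifi_key, quantize) come from the surrounding project. A minimal driver sketch, assuming a live SQLAlchemy session bound to the database that holds the Wifi table; the keys are invented:

# Hypothetical usage of search_wifi; `session` is assumed to exist.
payload = {'wifi': [{'key': '01:23:45:67:89:ab'},
                    {'key': '01:23:45:67:89:cd'},
                    {'key': '01:23:45:67:89:ef'}]}
result = search_wifi(session, payload)
if result is None:
    print('not enough matching networks for a wifi estimate')
else:
    print('lat=%(lat)s lon=%(lon)s accuracy=%(accuracy)s' % result)
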
Example #2
def process_measure(data, utcnow, session):
    session_objects = []
    measure = Measure()
    measure.created = utcnow
    measure.time = data['time']
    measure.lat = to_precise_int(data['lat'])
    measure.lon = to_precise_int(data['lon'])
    measure.accuracy = data['accuracy']
    measure.altitude = data['altitude']
    measure.altitude_accuracy = data['altitude_accuracy']
    measure.radio = RADIO_TYPE.get(data['radio'], -1)
    # get measure.id set
    session.add(measure)
    session.flush()
    if data.get('cell'):
        cells, cell_data = process_cell(data['cell'], measure)
        measure.cell = dumps(cell_data)
        session_objects.extend(cells)
    if data.get('wifi'):
        # filter out old-style sha1 hashes
        too_long_keys = False
        for w in data['wifi']:
            w['key'] = key = normalize_wifi_key(w['key'])
            if len(key) > 12:
                too_long_keys = True
                break
        if not too_long_keys:
            process_wifi(data['wifi'], measure)
            measure.wifi = dumps(data['wifi'])
    return (measure, session_objects)
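
The length check above works because normalized wifi keys are 12 hex characters, while the old-style keys it filters out were 40-character sha1 digests. A self-contained illustration of the filter, with a stand-in normalizer (the real normalize_wifi_key lives in the project):

import hashlib

def fake_normalize(key):
    # stand-in for normalize_wifi_key: drop separators, lowercase
    return key.replace(':', '').replace('-', '').lower()

mac = fake_normalize('01:23:45:67:89:AB')               # 12 chars
sha1 = hashlib.sha1(b'01:23:45:67:89:ab').hexdigest()   # 40 chars
for key in (mac, sha1):
    print(key, 'rejected' if len(key) > 12 else 'accepted')
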
Example #3
def load_file(session, source_file, batch_size=10000):
    utcnow = datetime.datetime.utcnow().replace(tzinfo=iso8601.UTC)
    utcmin = utcnow - datetime.timedelta(120)

    with open(source_file, 'r') as fd:
        reader = csv.reader(fd, delimiter='\t', quotechar=None)
        counter = 0

        for fields in reader:
            try:
                time = int(fields[0])
                if time == 0:  # pragma: no cover
                    # unknown time gets an old date
                    time = utcmin
                else:
                    # convert from unixtime to utc
                    time = datetime.datetime.utcfromtimestamp(time)

                key = normalize_wifi_key(str(fields[5]))
                if key == '000000000000':  # pragma: no cover
                    continue

                lat = fields[1]
                lon = fields[2]
                signal = int(fields[3])
                channel = int(fields[4])
                wifi = dict(
                    key=key,
                    channel=channel,
                    signal=signal,
                )
                data = dict(
                    lat=lat,
                    lon=lon,
                    time=time,
                    accuracy=0,
                    altitude=0,
                    altitude_accuracy=0,
                    radio='',
                    cell=(),
                    wifi=[wifi],
                )
            except (ValueError, IndexError):
                continue
            # side effect, schedules async tasks
            process_measure(data, utcnow, session)

            # flush every batch_size new records
            counter += 1
            if counter % batch_size == 0:
                session.flush()
                print('Added %s records.' % counter)

    # add the rest
    session.flush()
    return counter
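
The loader expects tab-separated rows in the field order: unixtime, lat, lon, signal, channel, wifi key. A made-up fixture for a smoke test (the values are invented, and the commented load_file call assumes a live session):

rows = [
    # time        lat        lon        signal  channel  key
    ('1371913200', '51.5074', '-0.1278', '-65',  '6',     '0123456789ab'),
    ('1371913260', '48.8566', '2.3522',  '-80',  '11',    'ba9876543210'),
]
with open('wifi_sample.tsv', 'w') as fd:
    for row in rows:
        fd.write('\t'.join(row) + '\n')
# counter = load_file(session, 'wifi_sample.tsv')
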
Example #4
def process_measure(measure_id, data, session):
    cell_measures = []
    wifi_measures = []
    measure_data = dict(
        measure_id=measure_id,
        lat=to_precise_int(data['lat']),
        lon=to_precise_int(data['lon']),
        time=encode_datetime(data['time']),
        accuracy=data['accuracy'],
        altitude=data['altitude'],
        altitude_accuracy=data['altitude_accuracy'],
    )
    measure_radio = RADIO_TYPE.get(data['radio'], -1)
    if data.get('cell'):
        # flatten measure / cell data into a single dict
        for c in data['cell']:
            c.update(measure_data)
            # use more specific cell type or
            # fall back to less precise measure
            if c['radio'] != '':
                c['radio'] = RADIO_TYPE.get(c['radio'], -1)
            else:
                c['radio'] = measure_radio
        cell_measures = data['cell']
    if data.get('wifi'):
        # filter out old-style sha1 hashes
        invalid_wifi_key = False
        for w in data['wifi']:
            w['key'] = key = normalize_wifi_key(w['key'])
            if not valid_wifi_pattern(key):
                invalid_wifi_key = True
                break

        if not invalid_wifi_key:
            # flatten measure / wifi data into a single dict
            for w in data['wifi']:
                w.update(measure_data)
            wifi_measures = data['wifi']
    return (cell_measures, wifi_measures)
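
The flattening step is a plain dict merge: every per-network dict absorbs the shared per-measure fields, so each row can later be inserted on its own. In isolation, with invented values:

measure_data = {'measure_id': 17, 'accuracy': 10}
wifi = [{'key': '0123456789ab', 'signal': -65},
        {'key': 'ba9876543210', 'signal': -80}]
for w in wifi:
    w.update(measure_data)  # shared fields copied into each network dict
print(wifi[0])
# {'key': '0123456789ab', 'signal': -65, 'measure_id': 17, 'accuracy': 10}
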
Example #5
def process_measure(data, utcnow, session, userid=None):
    measure = Measure()
    measure.created = utcnow
    measure.time = data["time"]
    measure.lat = to_precise_int(data["lat"])
    measure.lon = to_precise_int(data["lon"])
    measure.accuracy = data["accuracy"]
    measure.altitude = data["altitude"]
    measure.altitude_accuracy = data["altitude_accuracy"]
    measure.radio = RADIO_TYPE.get(data["radio"], -1)
    # get measure.id set
    session.add(measure)
    session.flush()
    measure_data = dict(
        id=measure.id,
        created=encode_datetime(measure.created),
        lat=measure.lat,
        lon=measure.lon,
        time=encode_datetime(measure.time),
        accuracy=measure.accuracy,
        altitude=measure.altitude,
        altitude_accuracy=measure.altitude_accuracy,
        radio=measure.radio,
    )
    if data.get("cell"):
        insert_cell_measure.delay(measure_data, data["cell"], userid=userid)
        measure.cell = dumps(data["cell"])
    if data.get("wifi"):
        # filter out old-style sha1 hashes
        too_long_keys = False
        for w in data["wifi"]:
            w["key"] = key = normalize_wifi_key(w["key"])
            if len(key) > 12:
                too_long_keys = True
                break
        if not too_long_keys:
            insert_wifi_measure.delay(measure_data, data["wifi"], userid=userid)
            measure.wifi = dumps(data["wifi"])
    return measure
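
Here insert_cell_measure and insert_wifi_measure are Celery tasks in the project, and .delay() only enqueues them; the actual inserts run in a worker. A minimal sketch of what the consuming side could look like, assuming a standard Celery app (the task body here is invented, not the project's):

from celery import Celery

app = Celery('measure_sketch', broker='memory://')

@app.task
def insert_wifi_measure(measure_data, entries, userid=None):
    # hypothetical body: the real task persists WifiMeasure rows
    print('would insert %d wifi entries for measure %s'
          % (len(entries), measure_data['id']))
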
Example #6
def search_wifi(session, data):
    wifi_data = data['wifi']
    wifi_keys = {normalize_wifi_key(w['key']) for w in wifi_data}
    if not any(wifi_keys):
        # no valid normalized keys
        return None
    if len(wifi_keys) < 3:
        # we didn't even get three keys, bail out
        return None
    sql_null = None  # avoid pep8 warning
    query = session.query(Wifi.lat, Wifi.lon).filter(
        Wifi.key.in_(wifi_keys)).filter(
        Wifi.lat != sql_null).filter(
        Wifi.lon != sql_null)
    wifis = query.all()
    if len(wifis) < 3:
        # we got fewer than three actual matches
        return None
    length = len(wifis)
    avg_lat = sum([w[0] for w in wifis]) / length
    avg_lon = sum([w[1] for w in wifis]) / length

    # check that all wifi APs are close to each other
    # we might later relax this to allow some outliers
    latitudes = [w[0] for w in wifis]
    longitudes = [w[1] for w in wifis]
    lat_diff = abs(max(latitudes) - min(latitudes))
    lon_diff = abs(max(longitudes) - min(longitudes))
    if lat_diff >= MAX_DIFF or lon_diff >= MAX_DIFF:
        return None

    return {
        'lat': quantize(avg_lat),
        'lon': quantize(avg_lon),
        'accuracy': 500,
    }
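
The extra guard rejects estimates where the matched networks disagree: if the latitude or longitude spread reaches MAX_DIFF (a project constant, in degrees), the matches are treated as outliers. A standalone illustration with an assumed threshold:

MAX_DIFF = 0.005  # assumed value; the real constant is defined by the project

points = [(51.5010, -0.1280), (51.5012, -0.1276), (51.5011, -0.1279)]
lats = [p[0] for p in points]
lons = [p[1] for p in points]
if max(lats) - min(lats) < MAX_DIFF and max(lons) - min(lons) < MAX_DIFF:
    print('networks agree; centroid is usable')
else:
    print('networks too spread out; fall back to another method')
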
Example #7
def load_file(session, source_file, batch_size=100, userid=None):
    utcnow = datetime.datetime.utcnow().replace(tzinfo=iso8601.UTC)
    utcmin = utcnow - datetime.timedelta(120)

    with open(source_file, 'r') as fd:
        reader = csv.reader(fd, delimiter='\t', quotechar=None)

        counter = 0
        items = []
        for fields in reader:
            try:
                time = int(fields[0])
                if time == 0:  # pragma: no cover
                    # unknown time gets an old date
                    time = utcmin
                else:
                    # convert from unixtime to utc
                    time = datetime.datetime.utcfromtimestamp(time)

                key = normalize_wifi_key(str(fields[1]))
                if not valid_wifi_pattern(key):  # pragma: no cover
                    continue

                lat = fields[2]
                lon = fields[3]
                accuracy = int(fields[4])
                altitude = int(fields[5])
                altitude_accuracy = int(fields[6])
                channel = int(fields[7])
                signal = int(fields[8])

                wifi = dict(
                    key=key,
                    channel=channel,
                    signal=signal,
                )
                data = dict(
                    lat=lat,
                    lon=lon,
                    time=time,
                    accuracy=accuracy,
                    altitude=altitude,
                    altitude_accuracy=altitude_accuracy,
                    radio='',
                    cell=(),
                    wifi=[wifi],
                )
            except (ValueError, IndexError):
                continue

            items.append(data)
            counter += 1

            # flush every batch_size records
            if counter % batch_size == 0:
                process_measures(items, session, userid=userid)
                items = []
                session.flush()
                print('Added %s records.' % counter)

    # process the remaining items
    process_measures(items, session, userid=userid)
    print('Added %s records.' % counter)

    session.flush()
    return counter
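
Note the column order differs from the earlier loader: here it is unixtime, key, lat, lon, accuracy, altitude, altitude_accuracy, channel, signal. A made-up one-row fixture matching that layout (the commented load_file call assumes a live session):

row = ('1371913200', '0123456789ab', '51.5074', '-0.1278',
       '10', '35', '5', '6', '-65')
with open('wifi_sample_v2.tsv', 'w') as fd:
    fd.write('\t'.join(row) + '\n')
# load_file(session, 'wifi_sample_v2.tsv', batch_size=100)
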
Example #8
def search_wifi(session, data):

    # estimate signal strength at -100 dBm if none is provided,
    # which is worse than the 99th percentile of wifi dBms we
    # see in practice (-98).
    def signal_strength(w):
        if 'signal' in w:
            return int(w['signal'])
        else:
            return -100

    wifi_signals = {normalize_wifi_key(w['key']): signal_strength(w)
                    for w in data['wifi']}
    wifi_keys = set(wifi_signals.keys())

    if not any(wifi_keys):
        # no valid normalized keys
        return None
    if len(wifi_keys) < 3:
        # we didn't even get three keys, bail out
        return None
    sql_null = None  # avoid pep8 warning
    query = session.query(Wifi.key, Wifi.lat, Wifi.lon).filter(
        Wifi.key.in_(wifi_keys)).filter(
        Wifi.lat != sql_null).filter(
        Wifi.lon != sql_null)
    wifis = query.all()
    if len(wifis) < 3:
        # we got fewer than three actual matches
        return None

    wifis = [Network(normalize_wifi_key(w[0]), w[1], w[2]) for w in wifis]

    # sort networks by signal strength, strongest first
    wifis.sort(key=lambda w: wifi_signals[w.key], reverse=True)

    clusters = []

    for w in wifis:
        # try to assign w to a cluster (but at most one)
        for c in clusters:
            for n in c:
                if distance(quantize(n.lat), quantize(n.lon),
                            quantize(w.lat), quantize(w.lon)) <= MAX_DIST:
                    c.append(w)
                    w = None
                    break

            if len(c) >= 3:
                # if we have a cluster of three or more
                # networks in it, return its centroid.
                length = len(c)
                avg_lat = sum([n.lat for n in c]) / length
                avg_lon = sum([n.lon for n in c]) / length
                return {
                    'lat': quantize(avg_lat),
                    'lon': quantize(avg_lon),
                    'accuracy': 500,
                }

            if w is None:
                break

        # if w didn't adhere to any cluster, make a new one
        if w is not None:
            clusters.append([w])

    # if we didn't get a cluster of three or more networks,
    # the query is a bunch of outliers; give up and
    # let the next location method try.
    return None
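
The clustering is greedy: networks are visited strongest-signal first, and each joins the first cluster containing any member within MAX_DIST; once a cluster reaches three members its centroid wins. A self-contained toy version, with Euclidean distance standing in for the project's distance helper:

def toy_cluster(points, max_dist=1.0):
    # points are (signal, x, y) tuples; visit strongest signal first
    clusters = []
    for p in sorted(points, key=lambda p: p[0], reverse=True):
        for c in clusters:
            if any(((p[1] - q[1]) ** 2 + (p[2] - q[2]) ** 2) ** 0.5 <= max_dist
                   for q in c):
                c.append(p)
                break
        else:
            # p was close to no existing cluster; start a new one
            clusters.append([p])
    return clusters

print(toy_cluster([(-60, 0, 0), (-70, 0.5, 0), (-90, 10, 10)]))
# -> [[(-60, 0, 0), (-70, 0.5, 0)], [(-90, 10, 10)]]
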