def __init__(self, regions_file=REGIONS_FILE, buffer_file=REGIONS_BUFFER_FILE):
    self._buffered_shapes = {}
    self._prepared_shapes = {}
    self._shapes = {}
    self._tree_ids = {}
    self._radii = {}

    with util.gzip_open(regions_file, 'r') as fd:
        regions_data = simplejson.load(fd)

    genc_regions = frozenset([rec.alpha2 for rec in genc.REGIONS])
    for feature in regions_data['features']:
        code = feature['properties']['alpha2']
        if code in genc_regions:
            shape = geometry.shape(feature['geometry'])
            self._shapes[code] = shape
            self._prepared_shapes[code] = prepared.prep(shape)
            self._radii[code] = feature['properties']['radius']

    with util.gzip_open(buffer_file, 'r') as fd:
        buffer_data = simplejson.load(fd)

    i = 0
    envelopes = []
    for feature in buffer_data['features']:
        code = feature['properties']['alpha2']
        if code in genc_regions:
            shape = geometry.shape(feature['geometry'])
            self._buffered_shapes[code] = prepared.prep(shape)
            # Collect rtree index entries, and maintain a separate id to
            # code mapping. We don't use index object support as it
            # requires un/pickling the object entries on each lookup.
            if isinstance(shape, geometry.base.BaseMultipartGeometry):
                # Index bounding box of individual polygons instead of
                # the multipolygon, to avoid issues with regions crossing
                # the -180.0/+180.0 longitude boundary.
                for geom in shape.geoms:
                    envelopes.append((i, geom.envelope.bounds, None))
                    self._tree_ids[i] = code
                    i += 1
            else:
                envelopes.append((i, shape.envelope.bounds, None))
                self._tree_ids[i] = code
                i += 1

    props = index.Property()
    props.fill_factor = 0.9
    props.leaf_capacity = 20
    # Passing the envelopes to the constructor bulk-loads them into the
    # index, so no additional insert() calls are needed afterwards.
    self._tree = index.Index(envelopes, interleaved=True, properties=props)
    self._valid_regions = frozenset(self._shapes.keys())
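The lookup side is not part of this example; the following is a minimal sketch, not from the original source, of how the rtree envelopes and prepared shapes built above would typically be combined to resolve a point to region codes. The method name is an assumption.

def region_codes_for_point(self, lat, lon):
    # Hedged sketch: assumes the attributes built in __init__ above
    # (_tree, _tree_ids, _buffered_shapes) and shapely's geometry.Point.
    point = geometry.Point(lon, lat)
    codes = set()
    # The rtree query only narrows candidates by bounding box; the
    # prepared buffered shapes decide actual containment.
    for tree_id in self._tree.intersection((lon, lat, lon, lat)):
        code = self._tree_ids[tree_id]
        if self._buffered_shapes[code].contains(point):
            codes.add(code)
    return sorted(codes)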
def test_files(self, session):
    today = util.utcnow().date()
    rows = [
        dict(time=today, lat=12.345, lon=12.345),
        dict(time=today, lat=0, lon=12.345),
        dict(time=today, lat=-10.000, lon=-11.000),
    ]
    for row in rows:
        lat, lon = DataMap.scale(row["lat"], row["lon"])
        data = DataMap.shard_model(lat, lon)(
            grid=(lat, lon), created=row["time"], modified=row["time"]
        )
        session.add(data)
    session.flush()

    lines = []
    rows = 0
    with util.selfdestruct_tempdir() as temp_dir:
        quaddir = os.path.join(temp_dir, "quadtrees")
        os.mkdir(quaddir)
        shapes = os.path.join(temp_dir, "shapes")
        tiles = os.path.join(temp_dir, "tiles")

        for shard_id, shard in DataMap.shards().items():
            filename = "map_%s.csv.gz" % shard_id
            filepath = os.path.join(temp_dir, filename)
            result = export_file(filepath, shard.__tablename__, _session=session)

            if not result:
                assert not os.path.isfile(filepath)
                continue

            rows += result
            with util.gzip_open(filepath, "r") as fd:
                written = fd.read()
            lines.extend([line.split(",") for line in written.split()])

            encode_file(filename, temp_dir, quaddir)

            quadfolder = os.path.join(quaddir, "map_" + shard_id)
            assert os.path.isdir(quadfolder)
            self._check_quadtree(quadfolder)

        merge_files(quaddir, shapes)
        self._check_quadtree(shapes)

        render_tiles(shapes, tiles, 1, 2)
        assert sorted(os.listdir(tiles)) == ["0", "1", "2"]
        assert sorted(os.listdir(os.path.join(tiles, "0", "0"))) == [
            "0.png",
            "0@2x.png",
        ]

    assert rows == 18
    assert len(lines) == 18

    lats = [round(float(line[0]), 2) for line in lines]
    longs = [round(float(line[1]), 2) for line in lines]
    assert set(lats) == set([-10.0, 0.0, 12.35])
    assert set(longs) == set([-11.0, 12.35])
def import_stations(self, session, pipe, filename):
    today = util.utcnow().date()

    on_duplicate = (
        '`modified` = values(`modified`)'
        ', `lat` = values(`lat`)'
        ', `lon` = values(`lon`)'
        ', `psc` = values(`psc`)'
        ', `max_lat` = values(`max_lat`)'
        ', `min_lat` = values(`min_lat`)'
        ', `max_lon` = values(`max_lon`)'
        ', `min_lon` = values(`min_lon`)'
        ', `radius` = values(`radius`)'
        ', `samples` = values(`samples`)'
    )
    table_insert = self.cell_model.__table__.insert(
        mysql_on_duplicate=on_duplicate)

    def commit_batch(rows):
        result = session.execute(table_insert, rows)
        count = result.rowcount
        # apply trick to avoid querying for existing rows,
        # MySQL claims 1 row for an inserted row, 2 for an updated row
        inserted_rows = 2 * len(rows) - count
        changed_rows = count - len(rows)
        assert inserted_rows + changed_rows == len(rows)
        StatCounter(self.stat_key, today).incr(pipe, inserted_rows)

    areaids = set()
    with util.gzip_open(filename, 'r') as gzip_wrapper:
        with gzip_wrapper as gzip_file:
            csv_reader = csv.reader(gzip_file)
            parse_row = partial(self.make_import_dict,
                                self.cell_model.validate,
                                self.import_spec)
            rows = []
            for row in csv_reader:
                # skip any header row
                if (csv_reader.line_num == 1 and
                        row[0] == 'radio'):  # pragma: no cover
                    continue

                data = parse_row(row)
                if data is not None:
                    rows.append(data)
                    areaids.add((int(data['radio']), data['mcc'],
                                 data['mnc'], data['lac']))

                if len(rows) == self.batch_size:  # pragma: no cover
                    commit_batch(rows)
                    session.flush()
                    rows = []

            if rows:
                commit_batch(rows)

    self.area_queue.enqueue(
        [encode_cellarea(*id_) for id_ in areaids], json=False)
def get_csv(self, lo=1, hi=10, time=1408604686):
    cell = self.cell
    line_template = ('UMTS,{mcc},{mnc},{lac},{cid},{psc},{lon:.7f},'
                     '{lat:.7f},1,1,1,{time},{time},')
    lines = [line_template.format(
        mcc=cell.mcc, mnc=cell.mnc, lac=cell.lac, cid=i * 1010, psc='',
        lon=cell.lon + i * 0.002, lat=cell.lat + i * 0.001, time=time)
        for i in range(lo, hi)]

    # add bad lines
    lines.append(line_template.format(
        mcc=cell.mcc, mnc=cell.mnc,
        lac='', cid='', psc=12,
        lon=cell.lon, lat=cell.lat, time=time,
    ))
    lines.append(line_template.format(
        mcc=cell.mcc, mnc=cell.mnc,
        lac='', cid='', psc='',
        lon=cell.lon, lat=cell.lat, time=time,
    ))
    txt = '\n'.join(lines)

    with util.selfdestruct_tempdir() as temp_dir:
        path = os.path.join(temp_dir, 'import.csv.gz')
        with util.gzip_open(path, 'w') as gzip_wrapper:
            with gzip_wrapper as gzip_file:
                gzip_file.write(txt)
        yield path
def main(argv, _db=None):
    parser = argparse.ArgumentParser(
        prog=argv[0],
        description=(
            "Import from public cell data into a local dev environment. "
            "See https://location.services.mozilla.com/downloads"),
    )
    parser.add_argument("filename", help="Path to the csv.gz import file.")
    args = parser.parse_args(argv[1:])

    if not settings("local_dev_env"):
        print("This script can only be run in a local dev environment.")
        print("Set LOCAL_DEV_ENV=True in your environment.")
        return 1

    filename = os.path.abspath(os.path.expanduser(args.filename))
    if not os.path.isfile(filename):
        print("File %s not found." % filename)
        return 1

    configure_logging()
    celery_app = get_eager_celery_app()
    init_worker(celery_app)
    cellarea_queue = celery_app.data_queues["update_cellarea"]

    with db_worker_session(celery_app.db, commit=False) as session:
        with gzip_open(filename, "r") as file_handle:
            read_stations_from_csv(
                session, file_handle, celery_app.redis_client, cellarea_queue)

    return 0
def test_files(self, db, session):  # pragma: no cover
    today = util.utcnow().date()
    rows = [
        dict(time=today, lat=12.345, lon=12.345),
        dict(time=today, lat=0, lon=12.345),
        dict(time=today, lat=-10.000, lon=-11.000),
    ]
    for row in rows:
        lat, lon = DataMap.scale(row['lat'], row['lon'])
        data = DataMap.shard_model(lat, lon)(grid=(lat, lon),
                                             created=row['time'],
                                             modified=row['time'])
        session.add(data)
    session.flush()

    lines = []
    rows = 0
    with util.selfdestruct_tempdir() as temp_dir:
        quaddir = os.path.join(temp_dir, 'quadtrees')
        os.mkdir(quaddir)
        shapes = os.path.join(temp_dir, 'shapes')
        tiles = os.path.join(temp_dir, 'tiles')

        for shard_id, shard in DataMap.shards().items():
            filename = 'map_%s.csv.gz' % shard_id
            filepath = os.path.join(temp_dir, filename)
            result = export_file(filepath, shard.__tablename__,
                                 _session=session)

            if not result:
                assert not os.path.isfile(filepath)
                continue

            rows += result
            with util.gzip_open(filepath, 'r') as fd:
                written = fd.read()
            lines.extend([line.split(',') for line in written.split()])

            encode_file(filename, temp_dir, quaddir)

            quadfolder = os.path.join(quaddir, 'map_' + shard_id)
            assert os.path.isdir(quadfolder)
            self._check_quadtree(quadfolder)

        merge_files(quaddir, shapes)
        self._check_quadtree(shapes)

        render_tiles(shapes, tiles, 1, 2)
        assert (sorted(os.listdir(tiles)) == ['0', '1', '2'])
        assert (sorted(os.listdir(os.path.join(
            tiles, '0', '0'))) == ['0.png', '0@2x.png'])

    assert rows == 18
    assert len(lines) == 18
    assert (set([round(float(l[0]), 2) for l in lines]) ==
            set([-10.0, 0.0, 12.35]))
    assert (set([round(float(l[1]), 2) for l in lines]) ==
            set([-11.0, 12.35]))
def test_files(self, db_rw, session):
    today = util.utcnow().date()
    rows = [
        dict(time=today, lat=12.345, lon=12.345),
        dict(time=today, lat=0, lon=12.345),
        dict(time=today, lat=-10.000, lon=-11.000),
    ]
    for row in rows:
        lat, lon = DataMap.scale(row['lat'], row['lon'])
        data = DataMap.shard_model(lat, lon)(
            grid=(lat, lon), created=row['time'], modified=row['time'])
        session.add(data)
    session.flush()

    lines = []
    rows = 0
    db_url = str(db_rw.engine.url)
    with util.selfdestruct_tempdir() as temp_dir:
        quaddir = os.path.join(temp_dir, 'quadtrees')
        os.mkdir(quaddir)
        shapes = os.path.join(temp_dir, 'shapes')
        tiles = os.path.join(temp_dir, 'tiles')

        for shard_id, shard in DataMap.shards().items():
            filename = 'map_%s.csv.gz' % shard_id
            filepath = os.path.join(temp_dir, filename)
            result = export_file(
                db_url, filepath, shard.__tablename__, _session=session)

            if not result:
                assert not os.path.isfile(filepath)
                continue

            rows += result
            with util.gzip_open(filepath, 'r') as fd:
                written = fd.read()
            lines.extend([line.split(',') for line in written.split()])

            encode_file(filename, temp_dir, quaddir, DATAMAPS_DIR)

            quadfolder = os.path.join(quaddir, 'map_' + shard_id)
            assert os.path.isdir(quadfolder)
            self._check_quadtree(quadfolder)

        merge_files(quaddir, shapes, DATAMAPS_DIR)
        self._check_quadtree(shapes)

        render_tiles(shapes, tiles, 1, 2, DATAMAPS_DIR, PNGQUANT)
        assert (sorted(os.listdir(tiles)) == ['0', '1', '2'])
        assert (sorted(os.listdir(os.path.join(tiles, '0', '0'))) ==
                ['0.png', '0@2x.png'])

    assert rows == 36
    assert len(lines) == 36
    assert (set([round(float(l[0]), 2) for l in lines]) ==
            set([-10.0, 0.0, 12.35]))
    assert (set([round(float(l[1]), 2) for l in lines]) ==
            set([-11.0, 12.35]))
def test_local_export(self, celery, session): now = util.utcnow() today = now.date() long_ago = now - timedelta(days=367) cell_fixture_fields = ("radio", "cid", "lat", "lon", "mnc", "mcc", "lac") base_cell = CellShardFactory.build(radio=Radio.wcdma) cell_key = { "radio": Radio.wcdma, "mcc": base_cell.mcc, "mnc": base_cell.mnc, "lac": base_cell.lac, } cells = set() for cid in range(190, 200): cell = dict(cid=cid, lat=base_cell.lat, lon=base_cell.lon, **cell_key) CellShardFactory(**cell) cell["lat"] = "%.7f" % cell["lat"] cell["lon"] = "%.7f" % cell["lon"] cell["radio"] = "UMTS" cell_strings = [(field, str(value)) for (field, value) in cell.items()] cell_tuple = tuple(sorted(cell_strings)) cells.add(cell_tuple) # add one incomplete / unprocessed cell CellShardFactory(cid=210, lat=None, lon=None, **cell_key) # add one really old cell CellShardFactory( cid=220, created=long_ago, modified=long_ago, last_seen=long_ago.date(), **cell_key, ) session.commit() with util.selfdestruct_tempdir() as temp_dir: path = os.path.join(temp_dir, "export.csv.gz") write_stations_to_csv(session, path, today) with util.gzip_open(path, "r") as gzip_wrapper: with gzip_wrapper as gzip_file: reader = csv.DictReader(gzip_file, CELL_FIELDS) header = next(reader) assert "area" in header.values() exported_cells = set() for exported_cell in reader: exported_cell_filtered = [ (field, value) for (field, value) in exported_cell.items() if field in cell_fixture_fields ] exported_cell = tuple(sorted(exported_cell_filtered)) exported_cells.add(exported_cell) assert cells == exported_cells
def dump_file(datatype, session, filename, lat=None, lon=None, radius=None):
    model = {
        'blue': BlueShard,
        'cell': CellShard,
        'wifi': WifiShard,
    }
    where = where_area(lat, lon, radius)
    with util.gzip_open(filename, 'w') as fd:
        dump_model(model[datatype], session, fd, where=where)
    return 0
def __init__(self, json_file=JSON_FILE):
    self._buffered_shapes = {}
    self._prepared_shapes = {}
    self._shapes = {}
    self._tree_ids = {}
    self._radii = {}

    with util.gzip_open(json_file, 'r') as fd:
        data = simplejson.load(fd)

    genc_regions = frozenset([rec.alpha2 for rec in genc.REGIONS])
    for feature in data['features']:
        code = feature['properties']['alpha2']
        if code in genc_regions:
            shape = geometry.shape(feature['geometry'])
            self._shapes[code] = shape
            self._prepared_shapes[code] = prepared.prep(shape)
            self._radii[code] = feature['properties']['radius']

    i = 0
    envelopes = []
    for code, shape in self._shapes.items():
        # Build up region buffers, to create shapes that include all of
        # the coastal areas and boundaries of the regions and anywhere
        # a cell signal could still be recorded. The value is in decimal
        # degrees (1.0 == ~100km) but calculations don't take projection
        # / WGS84 into account.
        # After buffering, remove any parts that cross the -180.0/+180.0
        # longitude boundary to the east or west.
        buffered = (shape.buffer(0.5)
                    .difference(DATELINE_EAST)
                    .difference(DATELINE_WEST))
        self._buffered_shapes[code] = prepared.prep(buffered)
        # Collect rtree index entries, and maintain a separate id to
        # code mapping. We don't use index object support as it
        # requires un/pickling the object entries on each lookup.
        if isinstance(buffered, geometry.base.BaseMultipartGeometry):
            # Index bounding box of individual polygons instead of
            # the multipolygon, to avoid issues with regions crossing
            # the -180.0/+180.0 longitude boundary.
            for geom in buffered.geoms:
                envelopes.append((i, geom.envelope.bounds, None))
                self._tree_ids[i] = code
                i += 1
        else:
            envelopes.append((i, buffered.envelope.bounds, None))
            self._tree_ids[i] = code
            i += 1

    props = index.Property()
    props.fill_factor = 0.9
    props.leaf_capacity = 20
    self._tree = index.Index(envelopes, interleaved=True, properties=props)
    self._valid_regions = frozenset(self._shapes.keys())
def test_local_export(self):
    cell_fixture_fields = ('radio', 'cid', 'lat', 'lon', 'mnc', 'mcc', 'lac')
    base_cell = CellShardFactory.build(radio=Radio.wcdma)
    cell_key = {
        'radio': Radio.wcdma,
        'mcc': base_cell.mcc,
        'mnc': base_cell.mnc,
        'lac': base_cell.lac
    }
    cells = set()

    for cid in range(190, 200):
        cell = dict(cid=cid, lat=base_cell.lat, lon=base_cell.lon, **cell_key)
        CellShardFactory(**cell)
        cell['lat'] = '%.7f' % cell['lat']
        cell['lon'] = '%.7f' % cell['lon']
        cell['radio'] = 'UMTS'
        cell_strings = [(field, str(value))
                        for (field, value) in cell.items()]
        cell_tuple = tuple(sorted(cell_strings))
        cells.add(cell_tuple)

    # add one incomplete / unprocessed cell
    CellShardFactory(cid=210, lat=None, lon=None, **cell_key)
    self.session.commit()

    with util.selfdestruct_tempdir() as temp_dir:
        path = os.path.join(temp_dir, 'export.csv.gz')
        write_stations_to_csv(self.session, path)

        with util.gzip_open(path, 'r') as gzip_wrapper:
            with gzip_wrapper as gzip_file:
                reader = csv.DictReader(gzip_file, CELL_FIELDS)

                header = six.next(reader)
                self.assertTrue('area' in header.values())

                exported_cells = set()
                for exported_cell in reader:
                    exported_cell_filtered = [
                        (field, value)
                        for (field, value) in exported_cell.items()
                        if field in cell_fixture_fields
                    ]
                    exported_cell = tuple(sorted(exported_cell_filtered))
                    exported_cells.add(exported_cell)

                self.assertEqual(cells, exported_cells)
def test_local_export(self, celery, session):
    now = util.utcnow()
    today = now.date()
    long_ago = now - timedelta(days=367)
    cell_fixture_fields = (
        'radio', 'cid', 'lat', 'lon', 'mnc', 'mcc', 'lac')
    base_cell = CellShardFactory.build(radio=Radio.wcdma)
    cell_key = {'radio': Radio.wcdma, 'mcc': base_cell.mcc,
                'mnc': base_cell.mnc, 'lac': base_cell.lac}
    cells = set()

    for cid in range(190, 200):
        cell = dict(cid=cid, lat=base_cell.lat, lon=base_cell.lon, **cell_key)
        CellShardFactory(**cell)
        cell['lat'] = '%.7f' % cell['lat']
        cell['lon'] = '%.7f' % cell['lon']
        cell['radio'] = 'UMTS'
        cell_strings = [
            (field, str(value)) for (field, value) in cell.items()]
        cell_tuple = tuple(sorted(cell_strings))
        cells.add(cell_tuple)

    # add one incomplete / unprocessed cell
    CellShardFactory(cid=210, lat=None, lon=None, **cell_key)
    # add one really old cell
    CellShardFactory(cid=220, created=long_ago, modified=long_ago,
                     last_seen=long_ago.date(), **cell_key)
    session.commit()

    with util.selfdestruct_tempdir() as temp_dir:
        path = os.path.join(temp_dir, 'export.csv.gz')
        write_stations_to_csv(session, path, today)

        with util.gzip_open(path, 'r') as gzip_wrapper:
            with gzip_wrapper as gzip_file:
                reader = csv.DictReader(gzip_file, CELL_FIELDS)

                header = next(reader)
                assert 'area' in header.values()

                exported_cells = set()
                for exported_cell in reader:
                    exported_cell_filtered = [
                        (field, value)
                        for (field, value) in exported_cell.items()
                        if field in cell_fixture_fields]
                    exported_cell = tuple(sorted(exported_cell_filtered))
                    exported_cells.add(exported_cell)

                assert cells == exported_cells
def import_stations(session, pipe, filename, fields):
    today = util.utcnow().date()

    def commit_batch(ins, rows, commit=True):
        result = session.execute(ins, rows)
        count = result.rowcount
        # apply trick to avoid querying for existing rows,
        # MySQL claims 1 row for an inserted row, 2 for an updated row
        inserted_rows = 2 * len(rows) - count
        changed_rows = count - len(rows)
        assert inserted_rows + changed_rows == len(rows)
        StatCounter(StatKey.unique_ocid_cell, today).incr(pipe, inserted_rows)
        if commit:
            session.commit()
        else:  # pragma: no cover
            session.flush()

    with util.gzip_open(filename, 'r') as gzip_file:
        csv_reader = csv.DictReader(gzip_file, fields)
        batch = 10000
        rows = []
        area_keys = set()
        ins = OCIDCell.__table__.insert(
            on_duplicate=((
                'changeable = values(changeable), '
                'modified = values(modified), '
                'total_measures = values(total_measures), '
                'lat = values(lat), '
                'lon = values(lon), '
                'psc = values(psc), '
                '`range` = values(`range`)')))

        for row in csv_reader:
            # skip any header row
            if csv_reader.line_num == 1 and \
               'radio' in row.values():  # pragma: no cover
                continue

            data = make_ocid_cell_import_dict(row)
            if data is not None:
                rows.append(data)
                area_keys.add(CellArea.to_hashkey(data))

            if len(rows) == batch:  # pragma: no cover
                commit_batch(ins, rows, commit=False)
                rows = []

        if rows:
            commit_batch(ins, rows)

        for area_key in area_keys:
            update_area.delay(area_key, cell_type='ocid')
def test_files(self):
    today = util.utcnow().date()
    rows = [
        dict(time=today, lat=12.345, lon=12.345),
        dict(time=today, lat=0, lon=12.345),
        dict(time=today, lat=-10.000, lon=-11.000),
    ]
    for row in rows:
        lat, lon = DataMap.scale(row["lat"], row["lon"])
        data = DataMap.shard_model(lat, lon)(grid=(lat, lon),
                                             created=row["time"],
                                             modified=row["time"])
        self.session.add(data)
    self.session.flush()

    lines = []
    rows = 0
    with util.selfdestruct_tempdir() as temp_dir:
        quaddir = os.path.join(temp_dir, "quadtrees")
        os.mkdir(quaddir)
        shapes = os.path.join(temp_dir, "shapes")
        tiles = os.path.join(temp_dir, "tiles")

        for shard_id, shard in DATAMAP_SHARDS.items():
            filename = "map_%s.csv.gz" % shard_id
            filepath = os.path.join(temp_dir, filename)
            result = export_file(None, filepath, shard.__tablename__,
                                 _db_rw=_make_db(), _session=self.session)

            if not result:
                self.assertFalse(os.path.isfile(filepath))
                continue

            rows += result
            with util.gzip_open(filepath, "r") as fd:
                written = fd.read()
            lines.extend([line.split(",") for line in written.split()])

            encode_file(filename, temp_dir, quaddir, DATAMAPS_DIR)

            quadfolder = os.path.join(quaddir, "map_" + shard_id)
            self.assertTrue(os.path.isdir(quadfolder))
            self._check_quadtree(quadfolder)

        merge_files(quaddir, shapes, DATAMAPS_DIR)
        self._check_quadtree(shapes)

        render_tiles(shapes, tiles, 1, 2, DATAMAPS_DIR, PNGQUANT)
        self.assertEqual(sorted(os.listdir(tiles)), ["0", "1", "2"])
        self.assertEqual(sorted(os.listdir(os.path.join(tiles, "0", "0"))),
                         ["0.png", "0@2x.png"])

    self.assertEqual(rows, 36)
    self.assertEqual(len(lines), 36)
    self.assertEqual(set([round(float(l[0]), 2) for l in lines]),
                     set([-10.0, 0.0, 12.35]))
    self.assertEqual(set([round(float(l[1]), 2) for l in lines]),
                     set([-11.0, 12.35]))
def test_local_export(self, celery, session):
    now = util.utcnow()
    today = now.date()
    long_ago = now - timedelta(days=367)
    cell_fixture_fields = (
        'radio', 'cid', 'lat', 'lon', 'mnc', 'mcc', 'lac')
    base_cell = CellShardFactory.build(radio=Radio.wcdma)
    cell_key = {'radio': Radio.wcdma, 'mcc': base_cell.mcc,
                'mnc': base_cell.mnc, 'lac': base_cell.lac}
    cells = set()

    for cid in range(190, 200):
        cell = dict(cid=cid, lat=base_cell.lat, lon=base_cell.lon, **cell_key)
        CellShardFactory(**cell)
        cell['lat'] = '%.7f' % cell['lat']
        cell['lon'] = '%.7f' % cell['lon']
        cell['radio'] = 'UMTS'
        cell_strings = [
            (field, str(value)) for (field, value) in cell.items()]
        cell_tuple = tuple(sorted(cell_strings))
        cells.add(cell_tuple)

    # add one incomplete / unprocessed cell
    CellShardFactory(cid=210, lat=None, lon=None, **cell_key)
    # add one really old cell
    CellShardFactory(cid=220, created=long_ago, modified=long_ago,
                     last_seen=long_ago.date(), **cell_key)
    session.commit()

    with util.selfdestruct_tempdir() as temp_dir:
        path = os.path.join(temp_dir, 'export.csv.gz')
        write_stations_to_csv(session, path, today)

        with util.gzip_open(path, 'r') as gzip_wrapper:
            with gzip_wrapper as gzip_file:
                reader = csv.DictReader(gzip_file, CELL_FIELDS)

                header = six.next(reader)
                assert 'area' in header.values()

                exported_cells = set()
                for exported_cell in reader:
                    exported_cell_filtered = [
                        (field, value)
                        for (field, value) in exported_cell.items()
                        if field in cell_fixture_fields]
                    exported_cell = tuple(sorted(exported_cell_filtered))
                    exported_cells.add(exported_cell)

                assert cells == exported_cells
def main(argv):  # pragma: no cover
    os.system('ogr2ogr -f GeoJSON '
              '-select "%s" -segmentize 0.1 data/temp.geojson '
              'data/ne_50m_admin_0_map_subunits.dbf' % ', '.join(PROPERTIES))

    with open('data/temp.geojson', 'r') as fd:
        jsondata = fd.read()
    os.remove('data/temp.geojson')

    data = json.loads(jsondata)
    simplified = simplify(data['features'])
    output = to_geojson(simplified)

    with util.gzip_open('ichnaea/regions.geojson.gz',
                        'w', compresslevel=7) as fd:
        fd.write(output)
def main(argv): parser = argparse.ArgumentParser( prog=argv[0], description="Create region GeoJSON files.") # implicitly parse and react to -h/--help parser.parse_args(argv[1:]) os.system("ogr2ogr -f GeoJSON " '-select "%s" -segmentize 0.1 data/temp.geojson ' "data/ne_50m_admin_0_map_subunits.dbf" % ", ".join(PROPERTIES)) with open("data/temp.geojson", "r") as fd: jsondata = fd.read() os.remove("data/temp.geojson") data = json.loads(jsondata) simplified = simplify(data["features"]) region_collection, buffer_collection = to_geojson(simplified) with util.gzip_open(geocode.REGIONS_FILE, "w", compresslevel=7) as fd: fd.write(region_collection) with util.gzip_open(geocode.REGIONS_BUFFER_FILE, "w", compresslevel=7) as fd: fd.write(buffer_collection)
def main(argv):  # pragma: no cover
    parser = argparse.ArgumentParser(
        prog=argv[0], description='Create region GeoJSON files.')
    # implicitly parse and react to -h/--help
    parser.parse_args(argv[1:])

    os.system('ogr2ogr -f GeoJSON '
              '-select "%s" -segmentize 0.1 data/temp.geojson '
              'data/ne_50m_admin_0_map_subunits.dbf' % ', '.join(PROPERTIES))

    with open('data/temp.geojson', 'r') as fd:
        jsondata = fd.read()
    os.remove('data/temp.geojson')

    data = json.loads(jsondata)
    simplified = simplify(data['features'])
    region_collection, buffer_collection = to_geojson(simplified)

    with util.gzip_open(geocode.REGIONS_FILE, 'w', compresslevel=7) as fd:
        fd.write(region_collection)

    with util.gzip_open(geocode.REGIONS_BUFFER_FILE, 'w',
                        compresslevel=7) as fd:
        fd.write(buffer_collection)
def test_local_export(self):
    cell_fixture_fields = (
        'radio', 'cid', 'lat', 'lon', 'mnc', 'mcc', 'lac')
    base_cell = CellFactory.build(radio=Radio.wcdma)
    cell_key = {'radio': Radio.wcdma, 'mcc': base_cell.mcc,
                'mnc': base_cell.mnc, 'lac': base_cell.lac}
    cells = set()

    for cid in range(190, 200):
        cell = dict(cid=cid, lat=base_cell.lat, lon=base_cell.lon, **cell_key)
        CellFactory(**cell)
        cell['lat'] = '%.7f' % cell['lat']
        cell['lon'] = '%.7f' % cell['lon']
        cell['radio'] = 'UMTS'
        cell_strings = [
            (field, str(value)) for (field, value) in cell.items()]
        cell_tuple = tuple(sorted(cell_strings))
        cells.add(cell_tuple)

    # add one incomplete / unprocessed cell
    CellFactory(cid=210, lat=None, lon=None, **cell_key)
    self.session.commit()

    with util.selfdestruct_tempdir() as temp_dir:
        path = os.path.join(temp_dir, 'export.csv.gz')
        write_stations_to_csv(self.session, path)

        with util.gzip_open(path, 'r') as gzip_wrapper:
            with gzip_wrapper as gzip_file:
                reader = csv.DictReader(gzip_file, CELL_FIELDS)

                header = six.next(reader)
                self.assertTrue('area' in header.values())
                self.assertEqual(header, CELL_HEADER_DICT)

                exported_cells = set()
                for exported_cell in reader:
                    exported_cell_filtered = [
                        (field, value)
                        for (field, value) in exported_cell.items()
                        if field in cell_fixture_fields]
                    exported_cell = tuple(sorted(exported_cell_filtered))
                    exported_cells.add(exported_cell)

                self.assertEqual(cells, exported_cells)
def export_file(filename, tablename, _db=None, _session=None):
    today = util.utcnow().date()
    one_year_ago = today - timedelta(days=365)
    one_year_ago = one_year_ago.strftime('%Y-%m-%d')
    # this is executed in a worker process
    stmt = text('''\
SELECT `grid`,
CAST(ROUND(DATEDIFF(CURDATE(), `modified`) / 30) AS UNSIGNED) as `num`
FROM {tablename}
WHERE modified >= '{modified}'
LIMIT :limit OFFSET :offset
'''.format(tablename=tablename, modified=one_year_ago).replace('\n', ' '))
    db = configure_db('ro', _db=_db)

    offset = 0
    limit = 200000
    result_rows = 0
    with util.gzip_open(filename, 'w', compresslevel=2) as fd:
        with db_worker_session(db, commit=False) as session:
            if _session is not None:
                # testing hook
                session = _session
            while True:
                result = session.execute(
                    stmt.bindparams(limit=limit, offset=offset))
                rows = result.fetchall()
                result.close()
                if not rows:
                    break

                lines = []
                extend = lines.extend
                for row in rows:
                    lat, lon = decode_datamap_grid(row.grid)
                    extend(random_points(lat, lon, row.num))

                fd.writelines(lines)
                result_rows += len(lines)
                offset += limit

    if not result_rows:
        os.remove(filename)

    db.close()
    return result_rows
def get_csv(self, lo=1, hi=10, time=1408604686):
    cell = self.cell
    line_template = ('UMTS,{mcc},{mnc},{lac},{cid},{psc},{lon:.7f},'
                     '{lat:.7f},1,1,1,{time},{time},')
    lines = [
        line_template.format(mcc=cell.mcc, mnc=cell.mnc, lac=cell.lac,
                             cid=i * 1010, psc='',
                             lon=cell.lon + i * 0.002,
                             lat=cell.lat + i * 0.001,
                             time=time)
        for i in range(lo, hi)
    ]

    # add bad lines
    lines.append(
        line_template.format(
            mcc=cell.mcc, mnc=cell.mnc,
            lac='', cid='', psc=12,
            lon=cell.lon, lat=cell.lat, time=time,
        ))
    lines.append(
        line_template.format(
            mcc=cell.mcc, mnc=cell.mnc,
            lac='', cid='', psc='',
            lon=cell.lon, lat=cell.lat, time=time,
        ))
    txt = '\n'.join(lines)

    with util.selfdestruct_tempdir() as temp_dir:
        path = os.path.join(temp_dir, 'import.csv.gz')
        with util.gzip_open(path, 'w') as gzip_wrapper:
            with gzip_wrapper as gzip_file:
                gzip_file.write(txt)
        yield path
def export_file(filename, tablename, _db=None, _session=None):
    # this is executed in a worker process
    stmt = text('''\
SELECT `grid`,
CAST(ROUND(DATEDIFF(CURDATE(), `modified`) / 30) AS UNSIGNED) as `num`
FROM {tablename}
WHERE `grid` > :grid
ORDER BY `grid`
LIMIT :limit
'''.format(tablename=tablename).replace('\n', ' '))
    db = configure_db('ro', transport='sync', _db=_db)

    min_grid = b''
    limit = 200000
    result_rows = 0
    with util.gzip_open(filename, 'w', compresslevel=2) as fd:
        with db_worker_session(db, commit=False) as session:
            if _session is not None:
                # testing hook
                session = _session
            while True:
                result = session.execute(
                    stmt.bindparams(limit=limit, grid=min_grid))
                rows = result.fetchall()
                result.close()
                if not rows:
                    break

                lines = []
                extend = lines.extend
                for row in rows:
                    lat, lon = decode_datamap_grid(row.grid)
                    extend(random_points(lat, lon, row.num))

                fd.writelines(lines)
                result_rows += len(lines)
                min_grid = rows[-1].grid

    if not result_rows:
        os.remove(filename)

    db.close()
    return result_rows
def export_file(filename, tablename, _db=None, _session=None): # this is executed in a worker process stmt = text("""\ SELECT `grid`, CAST(ROUND(DATEDIFF(CURDATE(), `modified`) / 30) AS UNSIGNED) as `num` FROM {tablename} WHERE `grid` > :grid ORDER BY `grid` LIMIT :limit """.format(tablename=tablename).replace("\n", " ")) db = configure_db("ro", _db=_db, pool=False) min_grid = b"" limit = 200000 result_rows = 0 with util.gzip_open(filename, "w", compresslevel=2) as fd: with db_worker_session(db, commit=False) as session: if _session is not None: # testing hook session = _session while True: result = session.execute( stmt.bindparams(limit=limit, grid=min_grid)) rows = result.fetchall() result.close() if not rows: break lines = [] extend = lines.extend for row in rows: lat, lon = decode_datamap_grid(row.grid) extend(random_points(lat, lon, row.num)) fd.writelines(lines) result_rows += len(lines) min_grid = rows[-1].grid if not result_rows: os.remove(filename) db.close() return result_rows
def _export(self, session, datatype, expected_keys, restrict=False):
    with util.selfdestruct_tempdir() as temp_dir:
        path = os.path.join(temp_dir, datatype + ".tar.gz")
        if restrict:
            dump.dump_file(datatype, session, path,
                           lat=GB_LAT, lon=GB_LON, radius=25000)
        else:
            dump.dump_file(datatype, session, path)

        assert os.path.isfile(path)
        with util.gzip_open(path, "r") as fd:
            lines = fd.readlines()
            assert len(lines) == len(expected_keys) + 1
            for key in expected_keys:
                assert [True for line in lines if key in line] == [True]
def write_stations_to_csv(session, table, columns, cond,
                          path, make_dict, fields):
    with util.gzip_open(path, 'w') as gzip_file:
        writer = csv.DictWriter(gzip_file, fields, extrasaction='ignore')
        limit = 10000
        offset = 0
        # Write header row
        writer.writerow(CELL_HEADER_DICT)
        while True:
            query = (select(columns=columns).where(cond)
                     .limit(limit)
                     .offset(offset)
                     .order_by(table.c.created))
            rows = session.execute(query).fetchall()
            if rows:
                writer.writerows([make_dict(row) for row in rows])
                offset += limit
            else:
                break
def export_file(db_url, filename, tablename, _db_rw=None, _session=None):
    # this is executed in a worker process
    stmt = text('''\
SELECT `grid`,
CAST(ROUND(DATEDIFF(CURDATE(), `modified`) / 30) AS UNSIGNED) as `num`
FROM {tablename}
LIMIT :limit OFFSET :offset
'''.format(tablename=tablename).replace('\n', ' '))
    db = configure_db(db_url, _db=_db_rw)

    offset = 0
    limit = 200000
    result_rows = 0
    with util.gzip_open(filename, 'w', compresslevel=2) as fd:
        with db_worker_session(db, commit=False) as session:
            if _session is not None:
                # testing hook
                session = _session
            while True:
                result = session.execute(
                    stmt.bindparams(limit=limit, offset=offset))
                rows = result.fetchall()
                result.close()
                if not rows:
                    break

                lines = []
                extend = lines.extend
                for row in rows:
                    lat, lon = decode_datamap_grid(row.grid)
                    extend(random_points(lat, lon, row.num))

                fd.writelines(lines)
                result_rows += len(lines)
                offset += limit

    if not result_rows:
        os.remove(filename)

    db.engine.pool.dispose()
    return result_rows
def dump_file(datatype, session, filename, lat=None, lon=None, radius=None): model = {"blue": BlueShard, "cell": CellShard, "wifi": WifiShard} where = where_area(lat, lon, radius) with util.gzip_open(filename, "w") as fd: dump_model(model[datatype], session, fd, where=where) return 0
def import_stations(self, session, pipe, filename):
    today = util.utcnow().date()
    area_keys = set()

    def commit_batch(ins, rows, commit=True):
        result = session.execute(ins, rows)
        count = result.rowcount
        # apply trick to avoid querying for existing rows,
        # MySQL claims 1 row for an inserted row, 2 for an updated row
        inserted_rows = 2 * len(rows) - count
        changed_rows = count - len(rows)
        assert inserted_rows + changed_rows == len(rows)
        StatCounter(self.stat_key, today).incr(pipe, inserted_rows)
        if commit:
            session.commit()
        else:  # pragma: no cover
            session.flush()

    with util.gzip_open(filename, "r") as gzip_wrapper:
        with gzip_wrapper as gzip_file:
            csv_reader = csv.DictReader(gzip_file, CELL_FIELDS)
            rows = []
            on_duplicate = (
                "modified = values(modified), "
                "total_measures = values(total_measures), "
                "lat = values(lat), "
                "lon = values(lon), "
                "psc = values(psc), "
                "`range` = values(`range`)"
            )
            if self.cell_type == "ocid":
                on_duplicate += ", changeable = values(changeable)"
            elif self.cell_type == "cell":  # pragma: no cover
                on_duplicate += (
                    ", max_lat = values(max_lat)"
                    ", min_lat = values(min_lat)"
                    ", max_lon = values(max_lon)"
                    ", min_lon = values(min_lon)"
                )
            ins = self.cell_model.__table__.insert(
                mysql_on_duplicate=on_duplicate)

            for row in csv_reader:
                # skip any header row
                if csv_reader.line_num == 1 and \
                   "radio" in row.values():  # pragma: no cover
                    continue

                data = self.make_import_dict(row)
                if data is not None:
                    rows.append(data)
                    area_keys.add(self.area_model.to_hashkey(data))

                if len(rows) == self.batch_size:  # pragma: no cover
                    commit_batch(ins, rows, commit=False)
                    rows = []

            if rows:
                commit_batch(ins, rows)

    area_keys = list(area_keys)
    for i in range(0, len(area_keys), self.area_batch_size):
        area_batch = area_keys[i:i + self.area_batch_size]
        self.update_area_task.delay(area_batch, cell_type=self.cell_type)
def write_stations_to_csv(session, path, today,
                          start_time=None, end_time=None):
    where = 'radio != 1 AND lat IS NOT NULL AND lon IS NOT NULL'
    if start_time is not None and end_time is not None:
        where = where + ' AND modified >= "%s" AND modified < "%s"'
        fmt = '%Y-%m-%d %H:%M:%S'
        where = where % (start_time.strftime(fmt), end_time.strftime(fmt))
    else:
        # limit to cells modified in the last 12 months
        one_year = today - timedelta(days=365)
        where = where + ' AND modified >= "%s"' % one_year.strftime('%Y-%m-%d')

    header_row = [
        'radio', 'mcc', 'net', 'area', 'cell', 'unit',
        'lon', 'lat', 'range', 'samples', 'changeable',
        'created', 'updated', 'averageSignal',
    ]
    header_row = ','.join(header_row) + '\n'

    tables = [shard.__tablename__ for shard in CellShard.shards().values()]
    stmt = '''SELECT
    CONCAT_WS(",",
        CASE radio WHEN 0 THEN "GSM" WHEN 2 THEN "UMTS"
                   WHEN 3 THEN "LTE" ELSE "" END,
        `mcc`, `mnc`, `lac`, `cid`, COALESCE(`psc`, ""),
        ROUND(`lon`, 7), ROUND(`lat`, 7),
        COALESCE(`radius`, "0"), COALESCE(`samples`, "0"), "1",
        COALESCE(UNIX_TIMESTAMP(`created`), ""),
        COALESCE(UNIX_TIMESTAMP(`modified`), ""),
        ""
    ) AS `cell_value`
FROM %s
WHERE %s
ORDER BY `cellid`
LIMIT :l OFFSET :o
'''

    with util.gzip_open(path, 'w', compresslevel=5) as gzip_wrapper:
        with gzip_wrapper as gzip_file:
            gzip_file.write(header_row)
            for table in tables:
                table_stmt = text(stmt % (table, where))
                offset = 0
                limit = 25000
                while True:
                    rows = session.execute(
                        table_stmt.bindparams(o=offset, l=limit)).fetchall()
                    if rows:
                        buf = '\r\n'.join([row.cell_value for row in rows])
                        if buf:
                            buf += '\r\n'
                        gzip_file.write(buf)
                        offset += limit
                    else:
                        break
def write_stations_to_csv(session, path, start_time=None, end_time=None):
    where = 'radio != 1 AND lat IS NOT NULL AND lon IS NOT NULL'
    if None not in (start_time, end_time):
        where = where + ' AND modified >= "%s" AND modified < "%s"'
        fmt = '%Y-%m-%d %H:%M:%S'
        where = where % (start_time.strftime(fmt), end_time.strftime(fmt))

    header_row = [
        'radio', 'mcc', 'net', 'area', 'cell', 'unit',
        'lon', 'lat', 'range', 'samples', 'changeable',
        'created', 'updated', 'averageSignal',
    ]
    header_row = ','.join(header_row) + '\n'

    tables = [shard.__tablename__ for shard in CellShard.shards().values()]
    stmt = '''SELECT
    CONCAT_WS(",",
        CASE radio WHEN 0 THEN "GSM" WHEN 2 THEN "UMTS"
                   WHEN 3 THEN "LTE" ELSE "" END,
        `mcc`, `mnc`, `lac`, `cid`, COALESCE(`psc`, ""),
        ROUND(`lon`, 7), ROUND(`lat`, 7),
        COALESCE(`radius`, "0"), COALESCE(`samples`, "0"), "1",
        COALESCE(UNIX_TIMESTAMP(`created`), ""),
        COALESCE(UNIX_TIMESTAMP(`modified`), ""),
        ""
    ) AS `cell_value`
FROM %s
WHERE %s
ORDER BY `radio`, `mcc`, `mnc`, `lac`, `cid`
LIMIT :l OFFSET :o
'''

    limit = 10000
    offset = 0
    with util.gzip_open(path, 'w', compresslevel=5) as gzip_wrapper:
        with gzip_wrapper as gzip_file:
            gzip_file.write(header_row)
            for table in tables:
                table_stmt = text(stmt % (table, where))
                while True:
                    rows = session.execute(
                        table_stmt.bindparams(o=offset, l=limit)).fetchall()
                    if rows:
                        buf = '\r\n'.join([row.cell_value for row in rows])
                        if buf:
                            buf += '\r\n'
                        gzip_file.write(buf)
                        offset += limit
                    else:
                        break
def import_stations(self, session, pipe, filename):
    today = util.utcnow().date()

    shards = self.cell_model.shards()
    on_duplicate = ('`modified` = values(`modified`)'
                    ', `lat` = values(`lat`)'
                    ', `lon` = values(`lon`)'
                    ', `psc` = values(`psc`)'
                    ', `max_lat` = values(`max_lat`)'
                    ', `min_lat` = values(`min_lat`)'
                    ', `max_lon` = values(`max_lon`)'
                    ', `min_lon` = values(`min_lon`)'
                    ', `radius` = values(`radius`)'
                    ', `samples` = values(`samples`)')

    def commit_batch(rows):
        all_inserted_rows = 0
        for shard_id, shard_rows in rows.items():
            table_insert = shards[shard_id].__table__.insert(
                mysql_on_duplicate=on_duplicate)
            result = session.execute(table_insert, shard_rows)
            count = result.rowcount
            # apply trick to avoid querying for existing rows,
            # MySQL claims 1 row for an inserted row, 2 for an updated row
            inserted_rows = 2 * len(shard_rows) - count
            changed_rows = count - len(shard_rows)
            assert inserted_rows + changed_rows == len(shard_rows)
            all_inserted_rows += inserted_rows

        StatCounter(self.stat_key, today).incr(pipe, all_inserted_rows)

    areaids = set()
    with util.gzip_open(filename, 'r') as gzip_wrapper:
        with gzip_wrapper as gzip_file:
            cell_model = self.cell_model
            csv_reader = csv.reader(gzip_file)
            parse_row = partial(self.make_import_dict,
                                self.cell_model.validate,
                                self.import_spec)
            rows = defaultdict(list)
            row_count = 0
            for row in csv_reader:
                # skip any header row
                if (csv_reader.line_num == 1 and
                        row[0] == 'radio'):  # pragma: no cover
                    continue

                data = parse_row(row)
                if data is not None:
                    rows[cell_model.shard_id(data['radio'])].append(data)
                    row_count += 1
                    areaids.add((int(data['radio']), data['mcc'],
                                 data['mnc'], data['lac']))

                if row_count == self.batch_size:  # pragma: no cover
                    commit_batch(rows)
                    session.flush()
                    rows = defaultdict(list)
                    row_count = 0

            if rows:
                commit_batch(rows)

    self.area_queue.enqueue([encode_cellarea(*id_) for id_ in areaids],
                            json=False)
def write_stations_to_csv(session, path, start_time=None, end_time=None):
    where = 'radio != 1 AND lat IS NOT NULL AND lon IS NOT NULL'
    if None not in (start_time, end_time):
        where = where + ' AND modified >= "%s" AND modified < "%s"'
        fmt = '%Y-%m-%d %H:%M:%S'
        where = where % (start_time.strftime(fmt), end_time.strftime(fmt))

    header_row = [
        'radio', 'mcc', 'net', 'area', 'cell', 'unit',
        'lon', 'lat', 'range', 'samples', 'changeable',
        'created', 'updated', 'averageSignal',
    ]
    header_row = ','.join(header_row) + '\n'

    table = Cell.__tablename__
    stmt = '''SELECT
    CONCAT_WS(",",
        CASE radio WHEN 0 THEN "GSM" WHEN 2 THEN "UMTS"
                   WHEN 3 THEN "LTE" ELSE "" END,
        `mcc`, `mnc`, `lac`, `cid`, COALESCE(`psc`, ""),
        ROUND(`lon`, 7), ROUND(`lat`, 7),
        COALESCE(`radius`, "0"), COALESCE(`samples`, "0"), "1",
        COALESCE(UNIX_TIMESTAMP(`created`), ""),
        COALESCE(UNIX_TIMESTAMP(`modified`), ""),
        ""
    ) AS `cell_value`
FROM %s
WHERE %s
ORDER BY `radio`, `mcc`, `mnc`, `lac`, `cid`
LIMIT :l OFFSET :o
''' % (table, where)
    stmt = text(stmt)

    limit = 10000
    offset = 0
    with util.gzip_open(path, 'w', compresslevel=5) as gzip_wrapper:
        with gzip_wrapper as gzip_file:
            gzip_file.write(header_row)
            while True:
                rows = session.execute(
                    stmt.bindparams(o=offset, l=limit)).fetchall()
                if rows:
                    buf = '\r\n'.join([row.cell_value for row in rows])
                    if buf:
                        buf += '\r\n'
                    gzip_file.write(buf)
                    offset += limit
                else:
                    break
def write_stations_to_csv(session, path, today,
                          start_time=None, end_time=None):
    linesep = "\r\n"
    where = "lat IS NOT NULL AND lon IS NOT NULL"
    if start_time is not None and end_time is not None:
        where = where + ' AND modified >= "%s" AND modified < "%s"'
        fmt = "%Y-%m-%d %H:%M:%S"
        where = where % (start_time.strftime(fmt), end_time.strftime(fmt))
    else:
        # limit to cells modified in the last 12 months
        one_year = today - timedelta(days=365)
        where = where + ' AND modified >= "%s"' % one_year.strftime("%Y-%m-%d")

    header_row = ",".join(_FIELD_NAMES) + linesep

    tables = [shard.__tablename__ for shard in CellShard.shards().values()]
    stmt = """SELECT `cellid`,
    CONCAT_WS(",",
        CASE radio WHEN 0 THEN "GSM" WHEN 2 THEN "UMTS"
                   WHEN 3 THEN "LTE" ELSE "" END,
        `mcc`, `mnc`, `lac`, `cid`, COALESCE(`psc`, ""),
        ROUND(`lon`, 7), ROUND(`lat`, 7),
        COALESCE(`radius`, "0"), COALESCE(`samples`, "0"), "1",
        COALESCE(UNIX_TIMESTAMP(`created`), ""),
        COALESCE(UNIX_TIMESTAMP(`modified`), ""),
        ""
    ) AS `cell_value`
FROM %s
WHERE %s AND `cellid` > :cellid
ORDER BY `cellid`
LIMIT :limit
"""

    with util.gzip_open(path, "w", compresslevel=5) as gzip_wrapper:
        with gzip_wrapper as gzip_file:
            gzip_file.write(header_row)
            for table in tables:
                table_stmt = text(stmt % (table, where))
                min_cellid = ""
                limit = 25000
                while True:
                    rows = session.execute(
                        table_stmt.bindparams(
                            limit=limit, cellid=min_cellid)).fetchall()
                    if rows:
                        buf = "".join(row.cell_value + linesep for row in rows)
                        gzip_file.write(buf)
                        min_cellid = rows[-1].cellid
                    else:
                        break
def write_stations_to_csv(session, path, start_time=None, end_time=None): where = "radio != 1 AND lat IS NOT NULL AND lon IS NOT NULL" if None not in (start_time, end_time): where = where + ' AND modified >= "%s" AND modified < "%s"' fmt = "%Y-%m-%d %H:%M:%S" where = where % (start_time.strftime(fmt), end_time.strftime(fmt)) header_row = [ "radio", "mcc", "net", "area", "cell", "unit", "lon", "lat", "range", "samples", "changeable", "created", "updated", "averageSignal", ] header_row = ",".join(header_row) + "\n" table = Cell.__tablename__ stmt = """SELECT CONCAT_WS(",", CASE radio WHEN 0 THEN "GSM" WHEN 2 THEN "UMTS" WHEN 3 THEN "LTE" ELSE "" END, `mcc`, `mnc`, `lac`, `cid`, COALESCE(`psc`, ""), ROUND(`lon`, 7), ROUND(`lat`, 7), COALESCE(`range`, "0"), COALESCE(`total_measures`, "0"), "1", COALESCE(UNIX_TIMESTAMP(`created`), ""), COALESCE(UNIX_TIMESTAMP(`modified`), ""), "" ) AS `cell_value` FROM %s WHERE %s ORDER BY `radio`, `mcc`, `mnc`, `lac`, `cid` LIMIT :l OFFSET :o """ % ( table, where, ) stmt = text(stmt) limit = 10000 offset = 0 with util.gzip_open(path, "w", compresslevel=5) as gzip_wrapper: with gzip_wrapper as gzip_file: gzip_file.write(header_row) while True: rows = session.execute(stmt.bindparams(o=offset, l=limit)).fetchall() if rows: buf = "\r\n".join([row.cell_value for row in rows]) if buf: buf += "\r\n" gzip_file.write(buf) offset += limit else: break
def write_stations_to_csv(session, path, today,
                          start_time=None, end_time=None):
    where = 'lat IS NOT NULL AND lon IS NOT NULL'
    if start_time is not None and end_time is not None:
        where = where + ' AND modified >= "%s" AND modified < "%s"'
        fmt = '%Y-%m-%d %H:%M:%S'
        where = where % (start_time.strftime(fmt), end_time.strftime(fmt))
    else:
        # limit to cells modified in the last 12 months
        one_year = today - timedelta(days=365)
        where = where + ' AND modified >= "%s"' % one_year.strftime('%Y-%m-%d')

    header_row = [
        'radio', 'mcc', 'net', 'area', 'cell', 'unit',
        'lon', 'lat', 'range', 'samples', 'changeable',
        'created', 'updated', 'averageSignal',
    ]
    header_row = ','.join(header_row) + '\n'

    tables = [shard.__tablename__ for shard in CellShard.shards().values()]
    stmt = '''SELECT `cellid`,
    CONCAT_WS(",",
        CASE radio WHEN 0 THEN "GSM" WHEN 2 THEN "UMTS"
                   WHEN 3 THEN "LTE" ELSE "" END,
        `mcc`, `mnc`, `lac`, `cid`, COALESCE(`psc`, ""),
        ROUND(`lon`, 7), ROUND(`lat`, 7),
        COALESCE(`radius`, "0"), COALESCE(`samples`, "0"), "1",
        COALESCE(UNIX_TIMESTAMP(`created`), ""),
        COALESCE(UNIX_TIMESTAMP(`modified`), ""),
        ""
    ) AS `cell_value`
FROM %s
WHERE %s AND `cellid` > :cellid
ORDER BY `cellid`
LIMIT :limit
'''

    with util.gzip_open(path, 'w', compresslevel=5) as gzip_wrapper:
        with gzip_wrapper as gzip_file:
            gzip_file.write(header_row)
            for table in tables:
                table_stmt = text(stmt % (table, where))
                min_cellid = ''
                limit = 25000
                while True:
                    rows = session.execute(
                        table_stmt.bindparams(
                            limit=limit, cellid=min_cellid)).fetchall()
                    if rows:
                        buf = '\r\n'.join([row.cell_value for row in rows])
                        if buf:
                            buf += '\r\n'
                        gzip_file.write(buf)
                        min_cellid = rows[-1].cellid
                    else:
                        break