def test_multiple(self):
    """Queued duplicate/nearby observations collapse into unique grid rows."""
    self._add([
        (1.0, 2.0, self.yesterday),
        (-10.0, 40.0, self.yesterday),
    ])
    self._queue([
        (1.0, 2.0),
        (1.0, 2.0),
        (40.0011, 3.0011),
        (40.0012, 3.0012),
        (40.0013, 3.0013),
        (0.0, 0.0),
        (1.0, 2.0),
        (1.00001, 2.00001),
    ])
    for shard_id in DataMap.shards():
        update_datamap.delay(batch=2, shard_id=shard_id).get()

    rows = []
    for shard in DataMap.shards().values():
        rows.extend(self.session.query(shard).all())
    self.assertEqual(len(rows), 4)

    # Collect the distinct dates and scaled-back positions in one pass each.
    created = set(row.created for row in rows)
    modified = set(row.modified for row in rows)
    positions = set(
        (row.grid[0] / 1000.0, row.grid[1] / 1000.0) for row in rows)
    self.assertEqual(created, set([self.today, self.yesterday]))
    self.assertEqual(modified, set([self.today, self.yesterday]))
    self.assertEqual(
        positions,
        set([(1.0, 2.0), (-10.0, 40.0), (0.0, 0.0), (40.001, 3.001)]))
def test_files(self, session):
    """End-to-end pipeline: DB rows -> gzip CSV export -> quadtrees -> merged shapes -> tiles."""
    today = util.utcnow().date()
    rows = [
        dict(time=today, lat=12.345, lon=12.345),
        dict(time=today, lat=0, lon=12.345),
        dict(time=today, lat=-10.000, lon=-11.000),
    ]
    for row in rows:
        lat, lon = DataMap.scale(row["lat"], row["lon"])
        data = DataMap.shard_model(lat, lon)(
            grid=(lat, lon), created=row["time"], modified=row["time"]
        )
        session.add(data)
    session.flush()

    lines = []
    rows = 0
    with util.selfdestruct_tempdir() as temp_dir:
        quaddir = os.path.join(temp_dir, "quadtrees")
        os.mkdir(quaddir)
        shapes = os.path.join(temp_dir, "shapes")
        tiles = os.path.join(temp_dir, "tiles")

        for shard_id, shard in DataMap.shards().items():
            filename = "map_%s.csv.gz" % shard_id
            filepath = os.path.join(temp_dir, filename)
            result = export_file(filepath, shard.__tablename__, _session=session)

            # Empty shards must not leave a file behind.
            if not result:
                assert not os.path.isfile(filepath)
                continue

            rows += result
            with util.gzip_open(filepath, "r") as fd:
                written = fd.read()
            lines.extend([line.split(",") for line in written.split()])

            encode_file(filename, temp_dir, quaddir)

            quadfolder = os.path.join(quaddir, "map_" + shard_id)
            assert os.path.isdir(quadfolder)
            self._check_quadtree(quadfolder)

        merge_files(quaddir, shapes)
        self._check_quadtree(shapes)

        render_tiles(shapes, tiles, 1, 2)
        assert sorted(os.listdir(tiles)) == ["0", "1", "2"]
        # Zoom level 0 renders a standard tile and a high-DPI ("@2x") tile.
        # (Restored literal; the previous value was a scrubbed placeholder.)
        assert sorted(os.listdir(os.path.join(tiles, "0", "0"))) == [
            "0.png",
            "0@2x.png",
        ]

    assert rows == 18
    assert len(lines) == 18
    lats = [round(float(line[0]), 2) for line in lines]
    longs = [round(float(line[1]), 2) for line in lines]
    assert set(lats) == set([-10.0, 0.0, 12.35])
    assert set(longs) == set([-11.0, 12.35])
def test_files(self, db_rw, session):
    """End-to-end pipeline via external binaries: export -> encode -> merge -> render."""
    today = util.utcnow().date()
    rows = [
        dict(time=today, lat=12.345, lon=12.345),
        dict(time=today, lat=0, lon=12.345),
        dict(time=today, lat=-10.000, lon=-11.000),
    ]
    for row in rows:
        lat, lon = DataMap.scale(row['lat'], row['lon'])
        data = DataMap.shard_model(lat, lon)(
            grid=(lat, lon), created=row['time'], modified=row['time'])
        session.add(data)
    session.flush()

    lines = []
    rows = 0
    db_url = str(db_rw.engine.url)
    with util.selfdestruct_tempdir() as temp_dir:
        quaddir = os.path.join(temp_dir, 'quadtrees')
        os.mkdir(quaddir)
        shapes = os.path.join(temp_dir, 'shapes')
        tiles = os.path.join(temp_dir, 'tiles')

        for shard_id, shard in DataMap.shards().items():
            filename = 'map_%s.csv.gz' % shard_id
            filepath = os.path.join(temp_dir, filename)
            result = export_file(
                db_url, filepath, shard.__tablename__, _session=session)

            # Empty shards must not leave a file behind.
            if not result:
                assert not os.path.isfile(filepath)
                continue

            rows += result
            with util.gzip_open(filepath, 'r') as fd:
                written = fd.read()
            lines.extend([line.split(',') for line in written.split()])

            encode_file(filename, temp_dir, quaddir, DATAMAPS_DIR)

            quadfolder = os.path.join(quaddir, 'map_' + shard_id)
            assert os.path.isdir(quadfolder)
            self._check_quadtree(quadfolder)

        merge_files(quaddir, shapes, DATAMAPS_DIR)
        self._check_quadtree(shapes)

        render_tiles(shapes, tiles, 1, 2, DATAMAPS_DIR, PNGQUANT)
        assert (sorted(os.listdir(tiles)) == ['0', '1', '2'])
        # Zoom level 0 renders a standard tile and a high-DPI ('@2x') tile.
        # (Restored literal; the previous value was a scrubbed placeholder.)
        assert (sorted(os.listdir(os.path.join(tiles, '0', '0'))) ==
                ['0.png', '0@2x.png'])

    assert rows == 36
    assert len(lines) == 36
    # Use a descriptive name instead of the ambiguous `l`.
    assert (set([round(float(line[0]), 2) for line in lines]) ==
            set([-10.0, 0.0, 12.35]))
    assert (set([round(float(line[1]), 2) for line in lines]) ==
            set([-11.0, 12.35]))
def test_files(self, db, session):  # pragma: no cover
    """End-to-end pipeline: DB rows -> gzip CSV export -> quadtrees -> shapes -> tiles."""
    today = util.utcnow().date()
    rows = [
        dict(time=today, lat=12.345, lon=12.345),
        dict(time=today, lat=0, lon=12.345),
        dict(time=today, lat=-10.000, lon=-11.000),
    ]
    for row in rows:
        lat, lon = DataMap.scale(row['lat'], row['lon'])
        data = DataMap.shard_model(lat, lon)(grid=(lat, lon),
                                             created=row['time'],
                                             modified=row['time'])
        session.add(data)
    session.flush()

    lines = []
    rows = 0
    with util.selfdestruct_tempdir() as temp_dir:
        quaddir = os.path.join(temp_dir, 'quadtrees')
        os.mkdir(quaddir)
        shapes = os.path.join(temp_dir, 'shapes')
        tiles = os.path.join(temp_dir, 'tiles')

        for shard_id, shard in DataMap.shards().items():
            filename = 'map_%s.csv.gz' % shard_id
            filepath = os.path.join(temp_dir, filename)
            result = export_file(filepath, shard.__tablename__,
                                 _session=session)

            # Empty shards must not leave a file behind.
            if not result:
                assert not os.path.isfile(filepath)
                continue

            rows += result
            with util.gzip_open(filepath, 'r') as fd:
                written = fd.read()
            lines.extend([line.split(',') for line in written.split()])

            encode_file(filename, temp_dir, quaddir)

            quadfolder = os.path.join(quaddir, 'map_' + shard_id)
            assert os.path.isdir(quadfolder)
            self._check_quadtree(quadfolder)

        merge_files(quaddir, shapes)
        self._check_quadtree(shapes)

        render_tiles(shapes, tiles, 1, 2)
        assert (sorted(os.listdir(tiles)) == ['0', '1', '2'])
        # Zoom level 0 renders a standard tile and a high-DPI ('@2x') tile.
        # (Restored literal; the previous value was a scrubbed placeholder.)
        assert (sorted(os.listdir(os.path.join(
            tiles, '0', '0'))) == ['0.png', '0@2x.png'])

    assert rows == 18
    assert len(lines) == 18
    # Use a descriptive name instead of the ambiguous `l`.
    assert (set([round(float(line[0]), 2) for line in lines]) ==
            set([-10.0, 0.0, 12.35]))
    assert (set([round(float(line[1]), 2) for line in lines]) ==
            set([-11.0, 12.35]))
def test_multiple(self, celery, session):
    """Duplicate and nearby observations collapse into unique grid cells."""
    seeded = [
        (0.0, 1.0, self.today),
        (1.0, 2.0, self.yesterday),
        (-10.0, 40.0, self.yesterday),
    ]
    queued = [
        (0.0, 1.0),
        (1.0, 2.0),
        (1.0, 2.0),
        (40.0011, 3.0011),
        (40.0012, 3.0012),
        (40.0013, 3.0013),
        (0.0, 0.0),
        (1.0, 2.0),
        (1.00001, 2.00001),
    ]
    self._add(session, seeded)
    self._queue(celery, queued)
    for shard_id in DataMap.shards():
        update_datamap.delay(shard_id=shard_id).get()

    rows = []
    for shard in DataMap.shards().values():
        rows.extend(session.query(shard).all())
    assert len(rows) == 5

    # Gather the distinct dates and scaled-back positions.
    created = {row.created for row in rows}
    modified = {row.modified for row in rows}
    positions = {(row.grid[0] / 1000.0, row.grid[1] / 1000.0) for row in rows}
    assert created == {self.today, self.yesterday}
    assert modified == {self.today, self.yesterday}
    assert positions == {
        (0.0, 0.0),
        (0.0, 1.0),
        (1.0, 2.0),
        (-10.0, 40.0),
        (40.001, 3.001),
    }
def test_one(self):
    """A single queued observation creates exactly one grid row in its shard."""
    lat, lon = 1.234567, 2.345678
    shard_id = DataMap.shard_id(*DataMap.scale(lat, lon))

    self._queue([(lat, lon)])
    update_datamap.delay(shard_id=shard_id).get()

    grids = self.session.query(DataMap.shards()[shard_id]).all()
    self.assertEqual(len(grids), 1)
    self._check_position(grids[0], 1.235, 2.346)
    self.assertEqual(grids[0].created, self.today)
    self.assertEqual(grids[0].modified, self.today)
def test_one(self, celery, session):
    """A single queued observation creates exactly one grid row in its shard."""
    lat, lon = 1.234567, 2.345678
    shard_id = DataMap.shard_id(*DataMap.scale(lat, lon))

    self._queue(celery, [(lat, lon)])
    update_datamap.delay(shard_id=shard_id).get()

    grid_rows = session.query(DataMap.shards()[shard_id]).all()
    assert len(grid_rows) == 1
    self._check_position(grid_rows[0], 1.235, 2.346)
    assert grid_rows[0].created == self.today
    assert grid_rows[0].modified == self.today
def test_files(self, temp_dir, mock_db_worker_session):
    """End-to-end pipeline: CSV export -> quadtrees -> merged shapes -> tiles."""
    lines = []
    rows = 0
    csvdir = os.path.join(temp_dir, "csv")
    os.mkdir(csvdir)
    quaddir = os.path.join(temp_dir, "quadtrees")
    os.mkdir(quaddir)
    shapes = os.path.join(temp_dir, "shapes")
    tiles = os.path.join(temp_dir, "tiles")
    # Expected (row_count, file_count) per shard for the seeded fixture data.
    expected = {"ne": (0, 0), "nw": (0, 0), "se": (12, 1), "sw": (6, 1)}

    for shard_id, shard in DataMap.shards().items():
        filename = f"map_{shard_id}.csv"
        filepath = os.path.join(csvdir, filename)
        row_count, file_count = export_to_csv(filename, csvdir, shard.__tablename__)
        assert row_count == expected[shard_id][0]
        assert file_count == expected[shard_id][1]

        # Empty shards must not leave a file behind.
        if not row_count:
            assert not os.path.isfile(filepath)
            continue

        rows += row_count
        with open(filepath, "r") as fd:
            written = fd.read()
        lines.extend([line.split(",") for line in written.split()])

        csv_to_quadtree(filename, csvdir, quaddir)

        quadfolder = os.path.join(quaddir, "map_" + shard_id)
        assert os.path.isdir(quadfolder)
        self._check_quadtree(quadfolder)

    assert rows

    merge_quadtrees(quaddir, shapes)
    self._check_quadtree(shapes)

    with Pool() as pool:
        render_tiles(pool, shapes, tiles, max_zoom=2)
    assert sorted(os.listdir(tiles)) == ["0", "1", "2"]
    # Zoom level 0 renders a standard tile and a high-DPI ("@2x") tile.
    # (Restored literal; the previous value was a scrubbed placeholder.)
    assert sorted(os.listdir(os.path.join(tiles, "0", "0"))) == [
        "0.png",
        "0@2x.png",
    ]

    assert rows == 18
    assert len(lines) == 18
    lats = [round(float(line[0]), 2) for line in lines]
    longs = [round(float(line[1]), 2) for line in lines]
    assert set(lats) == set([-10.0, 0.0, 12.35])
    assert set(longs) == set([-11.0, 12.35])
def test_update(self):
    """A re-seen position keeps its created date but refreshes modified."""
    lat, lon = 1.0, 2.0
    shard_id = DataMap.shard_id(*DataMap.scale(lat, lon))

    self._add([(lat, lon, self.yesterday)])
    self._queue([(lat, lon)])
    update_datamap.delay(shard_id=shard_id).get()

    grids = self.session.query(DataMap.shards()[shard_id]).all()
    self.assertEqual(len(grids), 1)
    self._check_position(grids[0], 1.0, 2.0)
    self.assertEqual(grids[0].created, self.yesterday)
    self.assertEqual(grids[0].modified, self.today)
def test_update(self, celery, session):
    """A re-seen position keeps its created date but refreshes modified."""
    lat, lon = 1.0, 2.0
    shard_id = DataMap.shard_id(*DataMap.scale(lat, lon))

    self._add(session, [(lat, lon, self.yesterday)])
    self._queue(celery, [(lat, lon)])
    update_datamap.delay(shard_id=shard_id).get()

    grid_rows = session.query(DataMap.shards()[shard_id]).all()
    assert len(grid_rows) == 1
    self._check_position(grid_rows[0], 1.0, 2.0)
    assert grid_rows[0].created == self.yesterday
    assert grid_rows[0].modified == self.today
def export_files(pool, db_url, csvdir):  # pragma: no cover
    """Submit one export job per shard and return the total exported rows.

    :param pool: A multiprocessing pool.
    :param db_url: Database connection URL passed to each worker.
    :param csvdir: Directory receiving the per-shard ``map_*.csv.gz`` files.
    :return: Sum of the row counts reported by all export jobs.
    """
    # Sorting the shard ids queues the northern shards first; they hold
    # more data points than the southern ones.
    async_results = [
        pool.apply_async(
            export_file,
            (db_url,
             os.path.join(csvdir, 'map_%s.csv.gz' % shard_id),
             shard.__tablename__))
        for shard_id, shard in sorted(DataMap.shards().items())
    ]
    # All jobs are submitted before any result is collected.
    return sum(job.get() for job in async_results)
def test_multiple(self, celery, session):
    """Duplicate and nearby observations collapse into unique grid cells."""
    self._add(session, [
        (0.0, 1.0, self.today),
        (1.0, 2.0, self.yesterday),
        (-10.0, 40.0, self.yesterday),
    ])
    self._queue(celery, [
        (0.0, 1.0),
        (1.0, 2.0),
        (1.0, 2.0),
        (40.0011, 3.0011),
        (40.0012, 3.0012),
        (40.0013, 3.0013),
        (0.0, 0.0),
        (1.0, 2.0),
        (1.00001, 2.00001),
    ])
    for shard_id in DataMap.shards():
        update_datamap.delay(shard_id=shard_id).get()

    rows = []
    for shard in DataMap.shards().values():
        rows.extend(session.query(shard).all())
    assert len(rows) == 5

    # Gather the distinct dates and scaled-back positions.
    created = {row.created for row in rows}
    modified = {row.modified for row in rows}
    positions = {(row.grid[0] / 1000.0, row.grid[1] / 1000.0) for row in rows}
    assert created == {self.today, self.yesterday}
    assert modified == {self.today, self.yesterday}
    assert positions == {
        (0.0, 0.0), (0.0, 1.0), (1.0, 2.0), (-10.0, 40.0), (40.001, 3.001)}
def test_cleanup(self, celery, session):
    """Rows older than a year are purged; each shard keeps its recent row."""
    stale = self.today - timedelta(days=366)
    session.add_all([
        self._one(37.0, 6.0, self.today),
        self._one(37.0, 6.1, stale),
        self._one(37.0, 4.0, self.today),
        self._one(37.0, 4.1, stale),
        self._one(10.0, 6.0, self.today),
        self._one(10.0, 6.1, stale),
        self._one(10.0, 4.0, self.today),
        self._one(10.0, 4.1, stale),
    ])
    session.flush()

    for shard_id, shard in DataMap.shards().items():
        cleanup_datamap.delay(shard_id=shard_id).get()
        assert session.query(shard).count() == 1
def export_to_csvs(pool, csv_dir):
    """
    Export from database tables to CSV.

    For small database tables, there will be one CSV created, such as
    "map_ne.csv" for the datamap_ne (northeast) table. For large database
    tables, there will be multiple CSVs created, such as "submap_ne_0001.csv".

    :param pool: A multiprocessing pool
    :param csv_dir: The directory to write CSV output files
    :return: A tuple of counts (rows, CSVs)
    """
    jobs = []
    result_rows = 0
    result_csvs = 0
    for shard_id, shard in sorted(DataMap.shards().items()):
        # sorting the shards prefers the north which contains more
        # data points than the south
        filename = f"map_{shard_id}.csv"
        jobs.append(
            pool.apply_async(export_to_csv, (filename, csv_dir, shard.__tablename__))
        )

    # Run export jobs to completion
    def on_success(result):
        # Fold one finished shard's (row count, csv count) into the totals.
        nonlocal result_rows, result_csvs
        rows, csvs = result
        result_rows += rows
        result_csvs += csvs

    def on_progress(tables_complete, table_percent):
        # Periodic progress logging; the totals are updated by on_success.
        nonlocal result_rows
        LOG.debug(
            f" Exported {result_rows:,} row{_s(result_rows)}"
            f" from {tables_complete:,} table{_s(tables_complete)}"
            f" to {result_csvs:,} CSV file{_s(result_csvs)}"
            f" ({table_percent:0.1%})"
        )

    watch_jobs(jobs, on_success=on_success, on_progress=on_progress)
    return result_rows, result_csvs
def test_multiple_csv(self, temp_dir, raven, mock_db_worker_session):
    """export_to_csv creates multiple CSVs at the file_limit."""
    # Expected (row_count, file_count) per shard for the seeded fixture data.
    expected = {"ne": (0, 0), "nw": (0, 0), "se": (12, 2), "sw": (6, 1)}
    csv_dir = os.path.join(temp_dir, "csv")
    os.mkdir(csv_dir)

    for shard_id, shard in DataMap.shards().items():
        filename = f"map_{shard_id}.csv"
        filepath = os.path.join(csv_dir, filename)
        row_count, file_count = export_to_csv(
            filename, csv_dir, shard.__tablename__, file_limit=1
        )
        exp_rows, exp_files = expected[shard_id]
        assert row_count == exp_rows
        assert file_count == exp_files
        if not row_count:
            assert not os.path.isfile(filepath)
        elif file_count == 1:
            assert os.path.isfile(filepath)
        else:
            # Split exports replace the single-file name with numbered
            # submap files.
            assert not os.path.isfile(filepath)
            for num in range(1, file_count + 1):
                part = os.path.join(csv_dir, f"submap_{shard_id}_{num:04}.csv")
                assert os.path.isfile(part)

    quad_dir = os.path.join(temp_dir, "quadtrees")
    os.mkdir(quad_dir)
    with Pool() as pool:
        csv_count, intermediate_quad_count, final_quad_count = csv_to_quadtrees(
            pool, csv_dir, quad_dir
        )
    assert csv_count == 3
    assert intermediate_quad_count == 2
    assert final_quad_count == 2
def __init__(self, task, session, pipe, shard_id=None):
    """Bind this data task to a single datamap shard.

    :param task: The task being executed.
    :param session: A database session, passed through to DataTask.
    :param pipe: A pipeline object, stored for later use.
    :param shard_id: Optional datamap shard identifier.
    """
    DataTask.__init__(self, task, session)
    self.pipe = pipe
    self.shard_id = shard_id
    # ``get`` tolerates unknown/None ids, leaving ``self.shard`` as None.
    self.shard = DataMap.shards().get(shard_id)
def test_empty(self):
    """Running the update task with no queued data creates no rows."""
    for shard_id, shard in DataMap.shards().items():
        update_datamap.delay(shard_id=shard_id).get()
        self.assertEqual(self.session.query(shard).count(), 0)
def __init__(self, task, shard_id=None):
    """Bind this task to a single datamap shard.

    :param task: The task being executed.
    :param shard_id: Optional datamap shard identifier.
    """
    self.task = task
    self.shard_id = shard_id
    # ``get`` tolerates unknown/None ids, leaving ``self.shard`` as None.
    self.shard = DataMap.shards().get(shard_id)
def test_empty(self, celery, session):
    """Running the update task with no queued data creates no rows."""
    for shard_id, shard in DataMap.shards().items():
        update_datamap.delay(shard_id=shard_id).get()
        assert session.query(shard).count() == 0