def timeseries_all(cls, table_names, agg_unit, start, end, geom=None):
    # For each table in table_names, generate a query to be unioned
    selects = []
    for name in table_names:
        table = cls.get_by_dataset_name(name)
        ts_select = table.timeseries(agg_unit, start, end, geom)
        selects.append(ts_select)

    # Union the time series selects to get a panel
    panel_query = sa.union(*selects)\
        .order_by('dataset_name')\
        .order_by('time_bucket')

    panel_vals = session.execute(panel_query)

    panel = []
    for dataset_name, ts in groupby(panel_vals, lambda row: row.dataset_name):
        # ts gets closed after it's been iterated over once,
        # so we need to store the rows somewhere to iterate over them twice.
        rows = [row for row in ts]
        # If no records were found, don't include this dataset
        if all([row.count == 0 for row in rows]):
            continue

        ts_dict = {'dataset_name': dataset_name,
                   'items': []}
        for row in rows:
            ts_dict['items'].append({
                'datetime': row.time_bucket.date(),  # UTC time
                'count': row.count
            })
        panel.append(ts_dict)

    return panel
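# A minimal usage sketch, not from the source: it assumes MetaTable is the
# class defining timeseries_all above, and the dataset names and date range
# are invented purely for illustration.
def _example_timeseries_all():
    from datetime import datetime

    panel = MetaTable.timeseries_all(
        table_names=['crimes', 'potholes'],  # hypothetical dataset names
        agg_unit='week',                     # time bucket unit
        start=datetime(2014, 1, 1),
        end=datetime(2014, 6, 1))

    # Each element of `panel` has the shape:
    # {'dataset_name': 'crimes',
    #  'items': [{'datetime': date(2014, 1, 6), 'count': 42}, ...]}
    return panel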
def make_grid(self, resolution, geom=None, conditions=None):
    """
    :param resolution: length of side of grid square in meters
    :type resolution: int
    :param geom: string representation of geojson fragment
    :type geom: str
    :param conditions: conditions on columns to filter on
    :type conditions: list of SQLAlchemy binary operations
                      (e.g. col > value)
    :return: grid: result proxy with all result rows
             size_x, size_y: the horizontal and vertical size
                             of the grid squares in degrees
    """
    if conditions is None:
        conditions = []

    # We need to convert resolution (given in meters) to degrees
    # - which is the unit of measure for EPSG 4326 -
    # in order to generate our grid.
    center = self.get_bbox_center()
    # center is [lon, lat], so center[1] is the latitude,
    # which determines the meters-per-degree conversion.
    size_x, size_y = get_size_in_degrees(resolution, center[1])

    # Generate a count for each resolution by resolution square
    t = self.point_table
    q = session.query(func.count(t.c.hash),
                      func.ST_SnapToGrid(t.c.geom, size_x, size_y)
                          .label('squares'))\
        .filter(*conditions)\
        .group_by('squares')

    if geom:
        q = q.filter(t.c.geom.ST_Within(func.ST_GeomFromGeoJSON(geom)))

    return session.execute(q), size_x, size_y
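# A minimal usage sketch, not from the source: `point_dataset` stands in for
# an instance of the class defining make_grid, with an ingested point table;
# 500 meters is an arbitrary square size.
def _example_make_grid(point_dataset):
    grid, size_x, size_y = point_dataset.make_grid(500)
    for count, square in grid:
        # Each result row pairs a point count with the grid cell
        # (size_x by size_y degrees) the points snapped to.
        print(count, square)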
def test_insert_data(self):
    etl = PlenarioETL(self.existing_meta, source_path=self.dog_path)
    etl.update()

    existing = self.existing_table
    all_rows = session.execute(existing.select()).fetchall()
    self.assertEqual(len(all_rows), 5)
def _get_num_shapes(self):
    table = self.shape_table
    # Arbitrarily select the first column of the table to count against
    count_query = select([func.count(table.c.geom)])

    # Should return only one row.
    # And we want the 0th and only attribute of that row (the count).
    return session.execute(count_query).fetchone()[0]
def test_staging_new_table(self):
    # For the entry in MetaTable without a table, create a staging table.
    # We'll need to read from a fixture csv.
    with Staging(self.unloaded_meta, source_path=self.radio_path) as s_table:
        all_rows = session.execute(s_table.table.select()).fetchall()
    self.assertEqual(len(all_rows), 5)
def timeseries_one(self, agg_unit, start, end, geom=None, column_filters=None):
    ts_select = self.timeseries(agg_unit, start, end, geom, column_filters)
    rows = session.execute(ts_select.order_by('time_bucket'))

    header = [['count', 'datetime']]
    # Discard the name attribute.
    rows = [[count, time_bucket.date()] for _, time_bucket, count in rows]
    return header + rows
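# A minimal usage sketch, not from the source: `dataset` stands in for an
# instance of the class defining timeseries_one. The return value is a
# header row followed by one [count, date] pair per time bucket, e.g.
# [['count', 'datetime'], [12, date(2014, 1, 6)], [7, date(2014, 1, 13)]].
def _example_timeseries_one(dataset, start, end):
    return dataset.timeseries_one('week', start, end)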
def get_metadata_with_etl_result(cls, table_name):
    # NB: table_name is interpolated directly into the SQL,
    # so it must come from trusted, validated input.
    query = '''
        SELECT meta.*, celery.status, celery.traceback, celery.date_done
        FROM meta_shape as meta
        LEFT JOIN celery_taskmeta as celery
        ON celery.task_id = meta.celery_task_id
        WHERE meta.dataset_name = '{}';
    '''.format(table_name)

    metadata = session.execute(query).first()
    return metadata
def test_update_with_delete(self):
    etl = PlenarioETL(self.existing_meta, source_path=self.dog_path)
    etl.update()

    # The same source CSV, but with one less record
    deleted_path = os.path.join(fixtures_path, 'dog_park_permits_deleted.csv')
    etl = PlenarioETL(self.existing_meta, source_path=deleted_path)
    etl.update()

    all_rows = session.execute(self.existing_table.select()).fetchall()
    self.assertEqual(len(all_rows), 4)
def get_all_with_etl_status(cls):
    """
    :return: Every row of meta_shape joined with celery task status.
    """
    shape_query = '''
        SELECT meta.*, celery.status
        FROM meta_shape as meta
        LEFT JOIN celery_taskmeta as celery
        ON celery.task_id = meta.celery_task_id
        WHERE meta.approved_status = TRUE;
    '''
    return list(session.execute(shape_query))
def test_location_col_add(self):
    drop_if_exists(self.opera_meta.dataset_name)

    etl = PlenarioETL(self.opera_meta, source_path=self.opera_path)
    new_table = etl.add()
    all_rows = session.execute(new_table.select()).fetchall()
    self.assertEqual(len(all_rows), 5)

    session.close()
    new_table.drop(app_engine, checkfirst=True)

    # Did we add a bbox?
    bbox = MetaTable.get_by_dataset_name('public_opera_performances').bbox
    self.assertIsNotNone(bbox)
def test_new_table(self):
    drop_if_exists(self.unloaded_meta.dataset_name)

    etl = PlenarioETL(self.unloaded_meta, source_path=self.radio_path)
    new_table = etl.add()
    all_rows = session.execute(new_table.select()).fetchall()
    self.assertEqual(len(all_rows), 5)

    session.close()
    new_table.drop(app_engine, checkfirst=True)

    # Did we add a bbox?
    bbox = MetaTable.get_by_dataset_name('community_radio_events').bbox
    self.assertIsNotNone(bbox)
def add_intersections_to_index(listing, geom):
    # For each dataset_name in the listing,
    # get a count of intersections
    # and replace num_geoms
    for row in listing:
        name = row['dataset_name']
        num_intersections_query = '''
            SELECT count(g.geom) as num_geoms
            FROM "{dataset_name}" as g
            WHERE ST_Intersects(g.geom,
                                ST_GeomFromGeoJSON('{geojson_fragment}'))
        '''.format(dataset_name=name, geojson_fragment=geom)

        num_intersections = session.execute(num_intersections_query)\
            .first().num_geoms
        row['num_shapes'] = num_intersections

    intersecting_rows = [row for row in listing if row['num_shapes'] > 0]
    return intersecting_rows
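# A minimal usage sketch, not from the source. The listing row and GeoJSON
# fragment are invented; because dataset names and the fragment are
# interpolated directly into SQL, both must come from trusted, validated
# input.
def _example_add_intersections():
    listing = [{'dataset_name': 'neighborhoods', 'num_shapes': 0}]
    geojson_fragment = ('{"type": "Polygon", "coordinates":'
                        ' [[[-87.7, 41.8], [-87.7, 41.9], [-87.6, 41.9],'
                        ' [-87.6, 41.8], [-87.7, 41.8]]]}')
    return add_intersections_to_index(listing, geojson_fragment)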
def timeseries_all(cls, table_names, agg_unit, start, end,
                   geom=None, ctrees=None):
    # For each table in table_names, generate a query to be unioned
    selects = []
    for name in table_names:
        # If we have condition trees specified, apply them.
        # .get will return None for those datasets who don't have filters
        ctree = ctrees.get(name) if ctrees else None
        table = cls.get_by_dataset_name(name)
        ts_select = table.timeseries(agg_unit, start, end, geom, ctree)
        selects.append(ts_select)

    # Union the time series selects to get a panel
    panel_query = sa.union(*selects)\
        .order_by('dataset_name')\
        .order_by('time_bucket')

    panel_vals = session.execute(panel_query)

    panel = []
    for dataset_name, ts in groupby(panel_vals, lambda row: row.dataset_name):
        # ts gets closed after it's been iterated over once,
        # so we need to store the rows somewhere to iterate over them twice.
        rows = [row for row in ts]
        # If no records were found, don't include this dataset
        if all([row.count == 0 for row in rows]):
            continue

        ts_dict = {'dataset_name': dataset_name,
                   'items': []}
        for row in rows:
            ts_dict['items'].append({
                'datetime': row.time_bucket.date(),  # UTC time
                'count': row.count
            })

        # Aggregate top-level count across all time slices.
        ts_dict['count'] = sum([i['count'] for i in ts_dict['items']])
        panel.append(ts_dict)

    return panel
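# A minimal usage sketch, not from the source, showing how this variant
# differs from the one above: `ctrees` maps dataset names to the condition
# trees consumed by each table's timeseries(); datasets without an entry get
# no extra filtering. The names and the tree's layout are hypothetical.
def _example_timeseries_all_filtered(start, end):
    ctrees = {'crimes': {'op': 'eq', 'col': 'primary_type', 'val': 'THEFT'}}
    return MetaTable.timeseries_all(
        table_names=['crimes', 'potholes'], agg_unit='day',
        start=start, end=end, ctrees=ctrees)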
def test_staging_existing_table(self):
    # With a fixture CSV whose columns match the existing dataset,
    # create a staging table.
    with Staging(self.existing_meta, source_path=self.dog_path) as s_table:
        all_rows = session.execute(s_table.table.select()).fetchall()
    self.assertEqual(len(all_rows), 5)
def _make_bbox(self):
    bbox_query = 'SELECT ST_Envelope(ST_Union(geom)) FROM {};'.\
        format(self.dataset_name)

    box = session.execute(bbox_query).first().st_envelope
    return box
def remove_table(self):
    if self.is_ingested:
        drop = "DROP TABLE {};".format(self.dataset_name)
        session.execute(drop)
    session.delete(self)
def get_bbox_center(self):
    sel = select([func.ST_AsGeoJSON(func.ST_centroid(self.bbox))])
    result = session.execute(sel)
    # returns [lon, lat]
    return json.loads(result.first()[0])['coordinates']
def drop_tables(table_names):
    drop_template = 'DROP TABLE IF EXISTS {};'
    command = ''.join([drop_template.format(table_name)
                       for table_name in table_names])
    session.execute(command)
    session.commit()
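# A minimal usage sketch, not from the source. Because each name is
# interpolated directly into a DROP statement, this helper should only ever
# receive trusted, internally generated table names (these are invented).
def _example_drop_tables():
    drop_tables(['staging_crimes', 'staging_potholes'])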