def meta():
    status_code = 200
    resp = {
        'meta': {
            'status': 'ok',
            'message': '',
        },
        'objects': []
    }
    dataset_name = request.args.get('dataset_name')
    q = '''
        SELECT  m.obs_from, m.location, m.latitude, m.last_update,
                m.source_url_hash, m.attribution, m.description,
                m.source_url, m.obs_to, m.date_added, m.business_key,
                m.result_ids, m.longitude, m.observed_date, m.human_name,
                m.dataset_name, m.update_freq,
                ST_AsGeoJSON(m.bbox) as bbox
        FROM meta_master AS m
        LEFT JOIN celery_taskmeta AS c
            ON c.id = (
                SELECT id FROM celery_taskmeta
                WHERE task_id = ANY(m.result_ids)
                ORDER BY date_done DESC
                LIMIT 1
            )
        WHERE m.approved_status = 'true'
            AND c.status = 'SUCCESS'
    '''
    if dataset_name:
        # We need to get the bbox separately so we can request it as json
        q = text('{0} AND m.dataset_name = :dataset_name'.format(q))
        with engine.begin() as c:
            metas = list(c.execute(q, dataset_name=dataset_name))
    else:
        with engine.begin() as c:
            metas = list(c.execute(q))
    for m in metas:
        keys = dict(zip(m.keys(), m.values()))
        # If we have bounding box data, add it
        if m['bbox'] is not None:
            keys['bbox'] = json.loads(m.bbox)
        resp['objects'].append(keys)
    resp['meta']['total'] = len(resp['objects'])
    resp = make_response(json.dumps(resp, default=dthandler), status_code)
    resp.headers['Content-Type'] = 'application/json'
    return resp

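# The LEFT JOIN above uses a correlated subquery to attach only the most
# recent Celery task per dataset. For illustration only (not part of the
# app), a hedged Postgres-specific equivalent using DISTINCT ON:
LATEST_TASK_SQL = '''
    SELECT DISTINCT ON (m.dataset_name)
           m.dataset_name, c.status, c.date_done
    FROM meta_master AS m
    LEFT JOIN celery_taskmeta AS c
        ON c.task_id = ANY(m.result_ids)
    ORDER BY m.dataset_name, c.date_done DESC
'''
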
def view_datasets():
    datasets_pending = session.query(MetaTable)\
        .filter(MetaTable.approved_status != 'true')\
        .all()
    counts = {
        'master_row_count': table_row_estimate('dat_master'),
        'weather_daily_row_count':
            table_row_estimate('dat_weather_observations_daily'),
        'weather_hourly_row_count':
            table_row_estimate('dat_weather_observations_hourly'),
        'census_block_row_count': table_row_estimate('census_blocks'),
    }
    try:
        # Reflecting the table raises NoSuchTableError if the Celery
        # results table hasn't been created yet.
        celery_table = Table('celery_taskmeta', Base.metadata,
                             autoload=True, autoload_with=engine)
        q = text('''
            SELECT m.*, c.status, c.task_id
            FROM meta_master AS m
            LEFT JOIN celery_taskmeta AS c
                ON c.id = (
                    SELECT id FROM celery_taskmeta
                    WHERE task_id = ANY(m.result_ids)
                    ORDER BY date_done DESC
                    LIMIT 1
                )
            WHERE m.approved_status = 'true'
        ''')
        datasets = []
        with engine.begin() as c:
            datasets = list(c.execute(q))
    except NoSuchTableError:
        datasets = session.query(MetaTable)\
            .filter(MetaTable.approved_status == 'true')\
            .all()

def table_row_estimate(table_name):
    try:
        # reltuples is the planner's row-count estimate -- far cheaper
        # than COUNT(*) on large tables.
        q = text('''
            SELECT reltuples::bigint AS estimate
            FROM pg_class
            WHERE relname = :table_name
        ''')
        with engine.begin() as c:
            return list(c.execute(q, table_name=table_name))[0][0]
    except NoSuchTableError:
        print "Table %s doesn't exist" % table_name

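# table_row_estimate() reads pg_class.reltuples, which Postgres only
# refreshes on VACUUM or ANALYZE, so the figure can lag the true count.
# A hedged sketch (reusing the same global `engine`) that re-analyzes a
# table first when a fresher estimate matters; refresh_row_estimate is
# a hypothetical helper, not part of the app:
def refresh_row_estimate(table_name):
    # ANALYZE can't take a bound parameter for the table name, so only
    # call this with trusted, hard-coded names -- never user input.
    with engine.begin() as c:
        c.execute('ANALYZE "%s"' % table_name)
    return table_row_estimate(table_name)
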
def meta():
    status_code = 200
    dataset_name = request.args.get('dataset_name')
    q = '''
        SELECT  m.obs_from, m.location, m.latitude, m.last_update,
                m.source_url_hash, m.attribution, m.description,
                m.source_url, m.obs_to, m.date_added, m.view_url,
                m.longitude, m.observed_date, m.human_name,
                m.dataset_name, m.update_freq,
                ST_AsGeoJSON(m.bbox) as bbox
        FROM meta_master AS m
        WHERE m.date_added IS NOT NULL
    '''
    if dataset_name:
        response_query = {'dataset_name': dataset_name}
        # We need to get the bbox separately so we can request it as json
        q = text('{0} AND m.dataset_name = :dataset_name'.format(q))
        with engine.begin() as c:
            metas = list(c.execute(q, dataset_name=dataset_name))
    else:
        response_query = None
        with engine.begin() as c:
            metas = list(c.execute(q))
    resp = json_response_base(None, [], query=response_query)
    for m in metas:
        keys = dict(zip(m.keys(), m.values()))
        # If we have bounding box data, add it
        if m['bbox'] is not None:
            keys['bbox'] = json.loads(m.bbox)
        resp['objects'].append(keys)
    resp['meta']['total'] = len(resp['objects'])
    resp = make_response(json.dumps(resp, default=dthandler), status_code)
    resp.headers['Content-Type'] = 'application/json'
    return resp

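# json_response_base() isn't defined in this file. Judging from how this
# revision of meta() uses it, it builds the {'meta': ..., 'objects': ...}
# skeleton the older revision constructed inline. A hedged sketch of such
# a helper -- the parameter names are guesses, not the real API:
def json_response_base(validator, objects, query=None):
    meta = {'status': 'ok', 'message': ''}
    if query is not None:
        meta['query'] = query
    return {'meta': meta, 'objects': objects}
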
def dataset_status():
    source_url_hash = request.args.get("source_url_hash")
    # Reflect the Celery results table (raises NoSuchTableError if it
    # doesn't exist yet).
    celery_table = Table('celery_taskmeta', Base.metadata,
                         autoload=True, autoload_with=engine)
    results = []
    q = '''
        SELECT m.human_name, m.source_url_hash,
               c.status, c.date_done, c.traceback, c.task_id
        FROM meta_master AS m,
             UNNEST(m.result_ids) AS ids
        LEFT JOIN celery_taskmeta AS c
            ON c.task_id = ids
        WHERE c.date_done IS NOT NULL
    '''
    if source_url_hash:
        q = q + " AND m.source_url_hash = :source_url_hash"
    q = q + " ORDER BY c.id DESC"
    with engine.begin() as c:
        results = list(c.execute(text(q), source_url_hash=source_url_hash))
    r = []
    for result in results:
        tb = None
        if result.traceback:
            tb = result.traceback\
                .replace('\r\n', '<br />')\
                .replace('\n\r', '<br />')\
                .replace('\n', '<br />')\
                .replace('\r', '<br />')
        d = {
            'human_name': result.human_name,
            'source_url_hash': result.source_url_hash,
            'status': result.status,
            'task_id': result.task_id,
            'traceback': tb,
            'date_done': None,
        }
        if result.date_done:
            # No trailing comma here: it would turn the formatted string
            # into a one-element tuple.
            d['date_done'] = result.date_done.strftime('%B %d, %Y %H:%M')
        r.append(d)
    return render_template('admin/dataset-status.html', results=r)

def dataset_status():
    source_url_hash = request.args.get("source_url_hash")
    q = '''
        SELECT m.human_name, m.source_url_hash,
               c.status, c.date_done, c.traceback, c.task_id
        FROM meta_master AS m,
             UNNEST(m.result_ids) AS ids
        LEFT JOIN celery_taskmeta AS c
            ON c.task_id = ids
        WHERE c.date_done IS NOT NULL
    '''
    if source_url_hash:
        name = session.query(MetaTable).get(source_url_hash).dataset_name
        q = q + " AND m.source_url_hash = :source_url_hash"
    else:
        name = None
    q = q + " ORDER BY c.id DESC"
    with engine.begin() as c:
        results = list(c.execute(text(q), source_url_hash=source_url_hash))
    r = []
    for result in results:
        tb = None
        if result.traceback:
            tb = result.traceback\
                .replace('\r\n', '<br />')\
                .replace('\n\r', '<br />')\
                .replace('\n', '<br />')\
                .replace('\r', '<br />')
        d = {
            'human_name': result.human_name,
            'source_url_hash': result.source_url_hash,
            'status': result.status,
            'task_id': result.task_id,
            'traceback': tb,
            'date_done': None,
        }
        if result.date_done:
            # No trailing comma here: it would turn the formatted string
            # into a one-element tuple.
            d['date_done'] = result.date_done.strftime('%B %d, %Y %H:%M')
        r.append(d)
    return render_template('admin/dataset-status.html', results=r, name=name)

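# The chained .replace() calls above normalize every newline flavor in a
# traceback to <br />. A hedged stdlib alternative doing the same in one
# pass; newlines_to_br is illustrative, not part of the app:
import re

def newlines_to_br(traceback_text):
    # The two-character sequences must come first in the alternation so
    # a CRLF pair becomes a single <br />, not two.
    return re.sub(r'\r\n|\n\r|\r|\n', '<br />', traceback_text)
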
def update_dataset(self, source_url_hash):
    md = session.query(MetaTable).get(source_url_hash)
    # Append this task's id to the dataset's list of Celery result ids.
    if md.result_ids:
        ids = md.result_ids
        ids.append(self.request.id)
    else:
        ids = [self.request.id]
    with engine.begin() as c:
        c.execute(MetaTable.__table__.update()
                  .where(MetaTable.source_url_hash == source_url_hash)
                  .values(result_ids=ids))
    etl = PlenarioETL(md)
    etl.update()
    return 'Finished updating {0} ({1})'.format(md.human_name,
                                                md.source_url_hash)

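# The read-modify-write on result_ids above can drop an id if two update
# tasks for the same dataset race. A hedged alternative (assuming the
# Postgres ARRAY column this code implies) that appends atomically in a
# single UPDATE; append_result_id is illustrative, not part of the app:
from sqlalchemy import func

def append_result_id(source_url_hash, task_id):
    # array_append(NULL, x) yields {x} in Postgres, so no separate
    # empty-list branch is needed.
    with engine.begin() as c:
        c.execute(MetaTable.__table__.update()
                  .where(MetaTable.source_url_hash == source_url_hash)
                  .values(result_ids=func.array_append(
                      MetaTable.result_ids, task_id)))
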
def view_datasets():
    datasets_pending = session.query(MetaTable).\
        filter(MetaTable.approved_status != True).\
        all()
    shapes_pending = session.query(ShapeMetadata).\
        filter(ShapeMetadata.approved_status != True).\
        all()
    try:
        q = text('''
            SELECT m.*, c.status, c.task_id
            FROM meta_master AS m
            LEFT JOIN celery_taskmeta AS c
                ON c.id = (
                    SELECT id FROM celery_taskmeta
                    WHERE task_id = ANY(m.result_ids)
                    ORDER BY date_done DESC
                    LIMIT 1
                )
            WHERE m.approved_status = 'true'
        ''')
        with engine.begin() as c:
            datasets = list(c.execute(q))
    except NoSuchTableError:
        datasets = session.query(MetaTable)\
            .filter(MetaTable.approved_status == True)\
            .all()
    try:
        shape_datasets = ShapeMetadata.get_all_with_etl_status()
    except NoSuchTableError:
        # If we can't find shape metadata, soldier on.
        shape_datasets = None
    return render_template('admin/view-datasets.html',
                           datasets_pending=datasets_pending,
                           shapes_pending=shapes_pending,
                           datasets=datasets,
                           shape_datasets=shape_datasets)

def dataset():
    raw_query_params = request.args.copy()

    # set default value for temporal aggregation
    agg = raw_query_params.get('agg')
    if not agg:
        agg = 'day'
    else:
        del raw_query_params['agg']

    # if no obs_date given, default to >= 90 days ago
    if not raw_query_params.get('obs_date__ge'):
        ninety_days_ago = datetime.now() - timedelta(days=90)
        raw_query_params['obs_date__ge'] = ninety_days_ago.strftime('%Y-%m-%d')
    if not raw_query_params.get('obs_date__le'):
        raw_query_params['obs_date__le'] = datetime.now().strftime('%Y-%m-%d')

    # set datatype
    datatype = 'json'
    if raw_query_params.get('data_type'):
        datatype = raw_query_params['data_type']
        del raw_query_params['data_type']

    # restrict to approved datasets whose most recent ETL task succeeded
    q = '''
        SELECT m.dataset_name
        FROM meta_master AS m
        LEFT JOIN celery_taskmeta AS c
            ON c.id = (
                SELECT id FROM celery_taskmeta
                WHERE task_id = ANY(m.result_ids)
                ORDER BY date_done DESC
                LIMIT 1
            )
        WHERE m.approved_status = 'true'
            AND c.status = 'SUCCESS'
    '''
    with engine.begin() as c:
        dataset_names = [d[0] for d in c.execute(q)]
    raw_query_params['dataset_name__in'] = ','.join(dataset_names)

    mt = MasterTable.__table__
    valid_query, query_clauses, resp, status_code = make_query(
        mt, raw_query_params)

    # check for valid output format
    if datatype not in VALID_DATA_TYPE:
        valid_query = False
        resp['meta']['status'] = 'error'
        resp['meta']['message'] = "'%s' is an invalid output format" % datatype
        resp = make_response(json.dumps(resp, default=dthandler), 400)
        resp.headers['Content-Type'] = 'application/json'

    # check for valid temporal aggregate
    if agg not in VALID_AGG:
        valid_query = False
        resp['meta']['status'] = 'error'
        resp['meta']['message'] = "'%s' is an invalid temporal aggregation" % agg
        resp = make_response(json.dumps(resp, default=dthandler), 400)
        resp.headers['Content-Type'] = 'application/json'

    if valid_query:
        time_agg = func.date_trunc(agg, mt.c['obs_date'])
        base_query = session.query(time_agg,
                                   func.count(mt.c['obs_date']),
                                   mt.c['dataset_name'])
        base_query = base_query.filter(mt.c['current_flag'] == True)
        for clause in query_clauses:
            base_query = base_query.filter(clause)
        base_query = base_query.group_by(mt.c['dataset_name'])\
            .group_by(time_agg)\
            .order_by(time_agg)
        values = [o for o in base_query.all()]

        # init from and to dates as python datetimes
        from_date = truncate(parse(raw_query_params['obs_date__ge']), agg)
        if 'obs_date__le' in raw_query_params.keys():
            to_date = parse(raw_query_params['obs_date__le'])
        else:
            to_date = datetime.now()

        # build the response
        results = sorted(values, key=itemgetter(2))
        for k, g in groupby(results, key=itemgetter(2)):
            d = {'dataset_name': k}
            items = []
            dense_matrix = []
            cursor = from_date
            v_index = 0
            dataset_values = list(g)
            # walk the whole date range, filling a zero count for any
            # aggregation bucket the query returned no row for
            while cursor <= to_date:
                if v_index < len(dataset_values) and \
                        dataset_values[v_index][0].replace(tzinfo=None) == cursor:
                    dense_matrix.append((cursor, dataset_values[v_index][1]))
                    v_index += 1
                else:
                    dense_matrix.append((cursor, 0))
                cursor = increment_datetime_aggregate(cursor, agg)
            dense_matrix = OrderedDict(dense_matrix)
            for k in dense_matrix:
                i = {
                    'datetime': k,
                    'count': dense_matrix[k],
                }
                items.append(i)
            d['items'] = items
            resp['objects'].append(d)

        resp['meta']['query'] = raw_query_params
        loc = resp['meta']['query'].get('location_geom__within')
        if loc:
            resp['meta']['query']['location_geom__within'] = json.loads(loc)
        resp['meta']['query']['agg'] = agg
        resp['meta']['status'] = 'ok'

    if datatype == 'json':
        resp = make_response(json.dumps(resp, default=dthandler), status_code)
        resp.headers['Content-Type'] = 'application/json'
    elif datatype == 'csv':
        # response format:
        #   temporal_group,dataset_name_1,dataset_name_2
        #   2014-02-24 00:00:00,235,653
        #   2014-03-03 00:00:00,156,624
        fields = ['temporal_group']
        for o in resp['objects']:
            fields.append(o['dataset_name'])
        csv_resp = []
        i = 0
        for k, g in groupby(resp['objects'], key=itemgetter('dataset_name')):
            l_g = list(g)[0]
            j = 0
            for row in l_g['items']:
                # first iteration, populate the first column with temporal_groups
                if i == 0:
                    csv_resp.append([row['datetime']])
                csv_resp[j].append(row['count'])
                j += 1
            i += 1
        csv_resp.insert(0, fields)
        csv_resp = make_csv(csv_resp)
        resp = make_response(csv_resp, 200)
        resp.headers['Content-Type'] = 'text/csv'
        filedate = datetime.now().strftime('%Y-%m-%d')
        resp.headers['Content-Disposition'] = 'attachment; filename=%s.csv' % filedate
    return resp

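# increment_datetime_aggregate() isn't shown in this file. A hedged
# sketch of what it plausibly does -- advance the cursor by one unit of
# the temporal aggregation -- assuming VALID_AGG covers the usual
# date_trunc units; this is a guess at the helper, not its source:
from datetime import timedelta

def increment_datetime_aggregate(cursor, agg):
    if agg == 'hour':
        return cursor + timedelta(hours=1)
    elif agg == 'day':
        return cursor + timedelta(days=1)
    elif agg == 'week':
        return cursor + timedelta(weeks=1)
    elif agg == 'month':
        # month lengths vary, so roll the calendar fields by hand
        if cursor.month == 12:
            return cursor.replace(year=cursor.year + 1, month=1)
        return cursor.replace(month=cursor.month + 1)
    elif agg == 'year':
        return cursor.replace(year=cursor.year + 1)
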
def dataset():
    raw_query_params = request.args.copy()

    # set default value for temporal aggregation
    agg = raw_query_params.get('agg')
    if not agg:
        agg = 'day'
    else:
        del raw_query_params['agg']

    # if no obs_date given, default to >= 90 days ago
    if not raw_query_params.get('obs_date__ge'):
        ninety_days_ago = datetime.now() - timedelta(days=90)
        raw_query_params['obs_date__ge'] = ninety_days_ago.strftime('%Y-%m-%d')
    if not raw_query_params.get('obs_date__le'):
        raw_query_params['obs_date__le'] = datetime.now().strftime('%Y-%m-%d')

    # set datatype
    datatype = 'json'
    if raw_query_params.get('data_type'):
        datatype = raw_query_params['data_type']
        del raw_query_params['data_type']

    # unless the caller picked specific datasets, restrict to approved
    # datasets whose most recent ETL task succeeded
    if not raw_query_params.get('dataset_name__in'):
        q = '''
            SELECT m.dataset_name
            FROM meta_master AS m
            LEFT JOIN celery_taskmeta AS c
                ON c.id = (
                    SELECT id FROM celery_taskmeta
                    WHERE task_id = ANY(m.result_ids)
                    ORDER BY date_done DESC
                    LIMIT 1
                )
            WHERE m.approved_status = 'true'
                AND c.status = 'SUCCESS'
        '''
        with engine.begin() as c:
            dataset_names = [d[0] for d in c.execute(q)]
        raw_query_params['dataset_name__in'] = ','.join(dataset_names)

    mt = MasterTable.__table__
    valid_query, query_clauses, resp, status_code = make_query(
        mt, raw_query_params)

    # check for valid output format
    if datatype not in VALID_DATA_TYPE:
        valid_query = False
        resp['meta']['status'] = 'error'
        resp['meta']['message'] = "'%s' is an invalid output format" % datatype
        resp = make_response(json.dumps(resp, default=dthandler), 400)
        resp.headers['Content-Type'] = 'application/json'

    # check for valid temporal aggregate
    if agg not in VALID_AGG:
        valid_query = False
        resp['meta']['status'] = 'error'
        resp['meta']['message'] = "'%s' is an invalid temporal aggregation" % agg
        resp = make_response(json.dumps(resp, default=dthandler), 400)
        resp.headers['Content-Type'] = 'application/json'

    if valid_query:
        time_agg = func.date_trunc(agg, mt.c['obs_date'])
        base_query = session.query(time_agg,
                                   func.count(mt.c['obs_date']),
                                   mt.c['dataset_name'])
        base_query = base_query.filter(mt.c['current_flag'] == True)
        for clause in query_clauses:
            base_query = base_query.filter(clause)
        base_query = base_query.group_by(mt.c['dataset_name'])\
            .group_by(time_agg)\
            .order_by(time_agg)
        values = [o for o in base_query.all()]

        # init from and to dates with python datetimes
        from_date = truncate(parse(raw_query_params['obs_date__ge']), agg)
        if 'obs_date__le' in raw_query_params.keys():
            to_date = parse(raw_query_params['obs_date__le'])
        else:
            to_date = datetime.now()

        # build the response
        results = sorted(values, key=itemgetter(2))
        for k, g in groupby(results, key=itemgetter(2)):
            d = {'dataset_name': k}
            items = []
            dense_matrix = []
            cursor = from_date
            v_index = 0
            dataset_values = list(g)
            # walk the whole date range, filling a zero count for any
            # aggregation bucket the query returned no row for
            while cursor <= to_date:
                if v_index < len(dataset_values) and \
                        dataset_values[v_index][0].replace(tzinfo=None) == cursor:
                    dense_matrix.append((cursor, dataset_values[v_index][1]))
                    v_index += 1
                else:
                    dense_matrix.append((cursor, 0))
                cursor = increment_datetime_aggregate(cursor, agg)
            dense_matrix = OrderedDict(dense_matrix)
            for k in dense_matrix:
                i = {
                    'datetime': k,
                    'count': dense_matrix[k],
                }
                items.append(i)
            d['items'] = items
            resp['objects'].append(d)

        resp['meta']['query'] = raw_query_params
        loc = resp['meta']['query'].get('location_geom__within')
        if loc:
            resp['meta']['query']['location_geom__within'] = json.loads(loc)
        resp['meta']['query']['agg'] = agg
        resp['meta']['status'] = 'ok'

    if datatype == 'json':
        resp = make_response(json.dumps(resp, default=dthandler), status_code)
        resp.headers['Content-Type'] = 'application/json'
    elif datatype == 'csv':
        # response format:
        #   temporal_group,dataset_name_1,dataset_name_2
        #   2014-02-24 00:00:00,235,653
        #   2014-03-03 00:00:00,156,624
        fields = ['temporal_group']
        for o in resp['objects']:
            fields.append(o['dataset_name'])
        csv_resp = []
        i = 0
        for k, g in groupby(resp['objects'], key=itemgetter('dataset_name')):
            l_g = list(g)[0]
            j = 0
            for row in l_g['items']:
                # first iteration, populate the first column with temporal_groups
                if i == 0:
                    csv_resp.append([row['datetime']])
                csv_resp[j].append(row['count'])
                j += 1
            i += 1
        csv_resp.insert(0, fields)
        csv_resp = make_csv(csv_resp)
        resp = make_response(csv_resp, 200)
        resp.headers['Content-Type'] = 'text/csv'
        filedate = datetime.now().strftime('%Y-%m-%d')
        resp.headers['Content-Disposition'] = 'attachment; filename=%s.csv' % filedate
    return resp