def to_sql(df: DataFrame) -> None:
    """
    Insert ``df`` into the main data table in chunks, updating ``payload``
    on conflict of ``(hash, partition_id)``.

    Parameters
    ----------
    df : DataFrame
        Rows to persist; must contain the columns of ``MainData``.

    Returns
    -------
    None
        (The original annotation ``NoReturn`` was wrong: ``NoReturn`` means
        the function never returns, but this one returns ``None`` normally.)
    """
    if df.size == 0:
        return None

    df_size = df.shape[0]
    # Chunk so that no single INSERT exceeds the driver/server row limit.
    n_chunks = ceil(df_size / DB_INSERT_MAX_ROWS)

    session = Session()
    connection = session.connection()

    try:
        for chunk in df.pipe(array_split, n_chunks):
            records = chunk.to_dict(orient="records")
            insert_stmt = insert(MainData.__table__).values(records)
            # Upsert: replace the payload of rows that already exist for
            # this (hash, partition_id) pair.
            stmt = insert_stmt.on_conflict_do_update(
                index_elements=[MainData.hash, MainData.partition_id],
                set_={MainData.payload.name: insert_stmt.excluded.payload}
            )
            connection.execute(stmt)
            session.flush()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()

    return None
def create_asset(release_date: str):
    """
    Build the latest vaccinations GeoJSON asset for ``release_date`` and
    store it in the downloads container.
    """
    date_token = datetime.fromisoformat(release_date).strftime("%Y_%-m_%-d")
    sql = QUERY.format(date=date_token)

    session = Session()
    conn = session.connection()
    try:
        rows = conn.execute(text(sql)).fetchall()
    except Exception as err:
        session.rollback()
        raise err
    finally:
        session.close()

    frame = DataFrame(rows, columns=["properties", "geometry"])

    # Each row becomes one GeoJSON feature; the frame index supplies the id.
    features = (
        frame
        .assign(type="Feature")
        .reset_index()
        .rename(columns={"index": "id"})
        .to_dict("records")
    )

    geo_data = {
        "type": "FeatureCollection",
        "features": features
    }

    return store_data(geo_data, "downloads", "maps/vax-data_latest.geojson")
def derive_publish_date_percentages(timestamp, area_type):
    """
    Calculate publish-date percentage metrics for ``area_type`` on the
    partition of ``timestamp`` and persist them via ``to_sql``.
    """
    sql = PERCENTAGE_DATA.format(
        date=f"{timestamp:%Y_%-m_%-d}",
        area_type=area_type
    )

    session = Session()
    conn = session.connection()
    try:
        rows = conn.execute(text(sql)).fetchall()
    except Exception as err:
        session.rollback()
        raise err
    finally:
        session.close()

    column_names = [
        'area_id', 'partition_id', 'metric_id', 'metric', 'area_type',
        'area_code', 'release_id', 'date', 'payload'
    ]

    frame = DataFrame(rows, columns=column_names)
    frame = frame.assign(hash=generate_row_hash(frame, hash_only=True))

    to_sql(
        frame.loc[:, ["metric_id", "area_id", "partition_id",
                      "release_id", "hash", "date", "payload"]]
    )

    return True
def confirm_or_create_area(area_type: str, area_code: str, area_name: str):
    """
    Ensure an area reference row exists for ``(area_type, area_code)``.

    Uses ``ON CONFLICT DO NOTHING`` so a pre-existing row is left untouched.

    Parameters
    ----------
    area_type : str
    area_code : str
    area_name : str

    Returns
    -------
    None
    """
    stmt = (
        insert(AreaReference.__table__)
        .values(
            area_type=area_type,
            area_code=area_code,
            area_name=area_name,
            unique_ref=f"{area_type}|{area_code}"
        )
        .on_conflict_do_nothing(
            index_elements=[
                AreaReference.area_type,
                AreaReference.area_code
            ]
        )
        .compile(dialect=postgres())
    )

    session = Session()
    try:
        session.connection().execute(stmt)
        session.flush()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()

    return None
def to_sql(df: DataFrame):
    """
    Deduplicate ``df`` on the natural key and upsert it into the main data
    table in chunks, updating ``payload`` on (hash, partition_id) conflicts.

    Parameters
    ----------
    df : DataFrame
        Rows to persist. NOTE: deduplicated in place.

    Returns
    -------
    None
    """
    if df.size == 0:
        return None

    df_size = df.shape[0]
    # Ceiling division. The previous `df_size // DB_INSERT_MAX_ROWS + 1`
    # produced one extra, EMPTY chunk whenever df_size was an exact multiple
    # of DB_INSERT_MAX_ROWS; `.values([])` on that empty chunk breaks the
    # INSERT. This also matches the sibling `to_sql` implementation.
    n_chunks = ceil(df_size / DB_INSERT_MAX_ROWS)

    # Keep only the first occurrence of each (release, area, metric, date).
    df.drop_duplicates(
        ["release_id", "area_id", "metric_id", "date"],
        keep="first",
        inplace=True
    )

    session = Session()
    connection = session.connection()

    try:
        for chunk in df.pipe(array_split, n_chunks):
            records = chunk.to_dict(orient="records")
            insert_stmt = insert(MainData.__table__).values(records)
            stmt = insert_stmt.on_conflict_do_update(
                index_elements=[MainData.hash, MainData.partition_id],
                set_={MainData.payload.name: insert_stmt.excluded.payload}
            )
            connection.execute(stmt)
            session.flush()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()

    return None
def get_release(timestamp):
    """
    Upsert a release reference for ``timestamp`` and return its id.

    The ``ON CONFLICT DO UPDATE`` clause (a no-op overwrite of the same
    timestamp) forces ``RETURNING id`` to yield the id of the existing row
    when the timestamp is already registered.

    Parameters
    ----------
    timestamp
        Release timestamp to register or look up.

    Returns
    -------
    int
        Primary key of the release reference row.
    """
    insert_stmt = (
        insert(ReleaseReference.__table__)
        .values(timestamp=timestamp)
    )

    stmt = (
        insert_stmt
        .on_conflict_do_update(
            index_elements=[ReleaseReference.timestamp],
            set_={ReleaseReference.timestamp.name: insert_stmt.excluded.timestamp}
        )
        .returning(ReleaseReference.id)
    )

    session = Session()
    try:
        result = session.execute(stmt).fetchone()[0]
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()

    return result
def register_release(category: str, release_id: int):
    """
    Associate ``release_id`` with a process ``category``, ignoring the pair
    if it is already registered.

    Parameters
    ----------
    category : str
        Process name to file the release under.
    release_id : int
        Primary key of the release reference row.

    Returns
    -------
    None
    """
    insert_stmt = (
        insert(ReleaseCategory.__table__)
        .values(
            release_id=release_id,
            process_name=category
        )
        .on_conflict_do_nothing(
            index_elements=[
                ReleaseCategory.release_id,
                ReleaseCategory.process_name,
            ]
        )
    )

    session = Session()
    try:
        # BUG FIX: the original did `session.add(insert_stmt)`, but
        # `Session.add` only accepts ORM instances — passing a Core INSERT
        # construct raises at runtime. Execute the statement instead, the
        # way the sibling helpers do.
        session.connection().execute(insert_stmt)
        session.flush()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()

    return None
def get_timeseries(date: str, metric: str):
    """
    Render and store the timeseries thumbnail for ``metric`` on ``date``,
    including the latest change figure.
    """
    ts = datetime.strptime(date, "%Y-%m-%d")
    date_token = f"{ts:%Y_%-m_%-d}"
    partition = f"{date_token}_other"
    partition_id = f"{date_token}|other"

    values_query = queries.TIMESRIES_QUERY.format(partition=partition)
    change_query = queries.LATEST_CHANGE_QUERY.format(partition=partition)

    session = Session()
    conn = session.connection()
    try:
        values = conn.execute(
            text(values_query),
            partition_id=partition_id,
            metric=metric
        ).fetchall()

        change = conn.execute(
            text(change_query),
            partition_id=partition_id,
            datestamp=ts,
            metric=metric + "Change"
        ).fetchone()
    except Exception as err:
        session.rollback()
        raise err
    finally:
        session.close()

    store_data(date, metric, plot_thumbnail(values, metric_name=metric, change=change))

    return True
def derive_publish_date_metrics(timestamp, area_type):
    """
    Derive publish-date metrics for ``area_type`` by comparing today's
    partition with the previous publication date, then persist them.
    """
    sql = PUBLISH_DATE_CALCULATION.format(
        today=f"{timestamp:%Y_%-m_%-d}",
        yesterday=get_previous_partition_date(),
        area_type=area_type
    )

    session = Session()
    conn = session.connection()
    try:
        rows = conn.execute(text(sql)).fetchall()
    except Exception as err:
        session.rollback()
        raise err
    finally:
        session.close()

    column_names = [
        'partition_id', 'area_id', 'area_type', 'area_code',
        'metric_id', 'release_id', 'date', 'payload'
    ]

    frame = DataFrame(rows, columns=column_names)
    frame = frame.assign(hash=generate_row_hash(frame, hash_only=True))

    to_sql(
        frame.loc[:, ["metric_id", "area_id", "partition_id",
                      "release_id", "hash", "date", "payload"]]
    )

    return True
def get_vaccinations(date):
    """
    Plot and store a vaccinations graphic for every area returned for
    ``date``.
    """
    ts = datetime.strptime(date, "%Y-%m-%d")
    vax_query = queries.VACCINATIONS_QUERY.format(
        partition_date=f"{ts:%Y_%-m_%-d}"
    )

    session = Session()
    conn = session.connection()
    try:
        values = conn.execute(text(vax_query), datestamp=ts).fetchall()
    except Exception as err:
        session.rollback()
        raise err
    finally:
        session.close()

    for row in values:
        store_data(
            date,
            "vaccinations",
            plot_vaccinations(row),
            area_type=row["area_type"],
            area_code=row["area_code"]
        )

    return True
def create_partition(area_type, release):
    """
    Ensure a partition of ``covid19.time_series`` exists for ``area_type``
    and the ``release`` datestamp.

    Parameters
    ----------
    area_type
        Area type; lower-cased before use.
    release
        Release timestamp of the data.

    Returns
    -------
    str
        The partition id for (area_type, release).
    """
    area_type = area_type.lower()
    partition_id = get_partition_id(area_type, release)

    # Trust/LA/MSOA area types each get a dedicated partition; everything
    # else shares the "other" partition for the release date.
    if area_type in ["nhstrust", "utla", "ltla", "msoa"]:
        area_partition = f"{release:%Y_%-m_%-d}_{area_type}"
    else:
        area_partition = f"{release:%Y_%-m_%-d}_other"

    session = Session()
    try:
        session.execute(
            f"""
            CREATE TABLE IF NOT EXISTS covid19.time_series_p{area_partition}
            PARTITION OF covid19.time_series ( partition_id )
            FOR VALUES IN ('{partition_id}');
            """
        )
        session.flush()
    except ProgrammingError:
        # Tolerated: e.g. a concurrent process created the partition first.
        session.rollback()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()

    return partition_id
def get_release_id(datestamp: datetime, process_name: str) -> Tuple[int, datetime]:
    """
    Return ``(id, timestamp)`` of the release matching ``datestamp`` (by
    calendar date) and ``process_name``, creating the release and its
    category association first when no match exists.

    Parameters
    ----------
    datestamp : datetime
        Timestamp whose DATE component identifies the release.
    process_name : str
        Category / process name the release must be registered under.

    Returns
    -------
    Tuple[int, datetime]
        Release id and its stored timestamp.

    Raises
    ------
    Exception
        Re-raised after rollback on any database failure.
    """
    query = (
        select([
            ReleaseReference.id,
            ReleaseReference.timestamp
        ])
        .select_from(
            join(
                ReleaseReference, ReleaseCategory,
                ReleaseReference.id == ReleaseCategory.release_id
            )
        )
        .where(
            and_(
                # Compare on the DATE portion only, not the full timestamp.
                func.DATE(ReleaseReference.timestamp) == func.DATE(datestamp.isoformat()),
                ReleaseCategory.process_name == process_name
            )
        )
    )

    session = Session()
    try:
        response = session.execute(query)
        result = response.fetchone()
        # Fast path: release already registered for this process.
        if result is not None:
            return result
    except Exception as err:
        session.rollback()
        raise err
    finally:
        session.close()

    # Not found: create the release, then its category association.
    # NOTE(review): reuses the session after `close()` — SQLAlchemy sessions
    # are reusable after close, but this relies on that behaviour.
    try:
        release = ReleaseReference(timestamp=datestamp)
        session.begin()
        session.add(release)
        session.commit()

        category = ReleaseCategory(release_id=release.id, process_name=process_name)
        session.begin()
        session.add(category)
        session.commit()
    except Exception as err:
        session.rollback()
        raise err
    finally:
        session.close()

    # Recurse once to return the freshly created row via the fast path.
    return get_release_id(datestamp, process_name)
def update_permissions():
    """
    Apply the database permissions defined by ``PERMISSIONS_QUERY``.
    """
    session = Session()
    try:
        session.connection().execute(text(PERMISSIONS_QUERY))
        session.flush()
    except Exception as err:
        session.rollback()
        raise err
    finally:
        session.close()

    return None
def get_previous_partition_date():
    """
    Return the previous publication date as a partition token
    (``YYYY_M_D`` with non-padded month/day).
    """
    session = Session()
    conn = session.connection()
    try:
        row = conn.execute(text(PREVIOUS_PUBLICATION_DATE)).fetchone()
    except Exception as err:
        session.rollback()
        raise err
    finally:
        session.close()

    return f"{row[0]:%Y_%-m_%-d}"
def create_asset(data_type):
    """
    Run the query registered for ``data_type`` and store the result in the
    configured container/path.
    """
    params = PARAMETERS[data_type]

    session = Session()
    conn = session.connection()
    try:
        rows = conn.execute(
            text(params["query"]),
            process_name=params["process_name"]
        ).fetchall()
    except Exception as err:
        session.rollback()
        raise err
    finally:
        session.close()

    store_data(dict(rows), params['container'], params['path'])
def get_area_data():
    """
    Load the area reference table as a DataFrame indexed by
    ``(area_type, area_code)``.
    """
    session = Session()
    try:
        results = read_sql(
            f"""\
            SELECT c.id AS "area_id", c.area_type, c.area_code
            FROM covid19.area_reference AS c
            """,
            con=session.connection(),
            index_col=["area_type", "area_code"]
        )
    except Exception as err:
        session.rollback()
        raise err
    finally:
        session.close()

    return results
def get_data(category: str, timestamp):
    """
    Return release timestamps for ``category``, converted to minutes past
    midnight as integers.
    """
    session = Session()
    conn = session.connection()
    try:
        rows = conn.execute(
            text(CATEGORY_TIMESTAMPS),
            category=category,
            timestamp=timestamp
        ).fetchall()
    except Exception as err:
        session.rollback()
        raise err
    finally:
        session.close()

    frame = DataFrame(rows, columns=['timestamp'])
    frame.timestamp = frame.timestamp.map(to_mins).astype({"timestamp": int})

    return frame
def get_metrics():
    """
    Load the metric reference table as a DataFrame indexed by metric name.
    """
    session = Session()
    try:
        metrics = read_sql(
            f"""\
            SELECT ref.id AS "metric_id", ref.metric
            FROM covid19.metric_reference AS ref;
            """,
            con=session.connection(),
            index_col=["metric"]
        )
    except Exception as err:
        session.rollback()
        raise err
    finally:
        session.close()

    return metrics
def update_stats(date, category):
    """
    Refresh statistics for every partition of ``category`` released on
    ``date``.
    """
    # Postgres array literal: '{id1,id2,...}'.
    partitions = ",".join(get_partition_ids(date, category))
    sql = STATS_QUERY.format(
        datestamp=date,
        partitions=f"{{{partitions}}}"
    )

    session = Session()
    connection = session.connection()
    try:
        connection.execute(text(sql))
        session.flush()
    except Exception as err:
        session.rollback()
        raise err
    finally:
        session.close()

    return None
def generate_html(record_date: datetime):
    """
    Render the summary HTML page for ``record_date`` and store it under the
    record's slug id.

    Parameters
    ----------
    record_date : datetime
        Date of the record whose page is generated.
    """
    slug_id = get_record_id(record_date)

    session = Session()
    conn = session.connection()
    try:
        resp = conn.execute(text(OUTPUT_DATA), slug_id=slug_id)
        raw_data = resp.fetchall()
    except Exception as err:
        session.rollback()
        raise err
    finally:
        session.close()

    df = DataFrame(raw_data, columns=["area_name", "metric", "date", "value"])
    df.value = df.value.map(format_number)

    area_names = [
        "United Kingdom", "England", "Northern Ireland", "Scotland", "Wales"
    ]

    # One processed row per section of the page template.
    result = [
        [process_section(df, item, area_names) for item in section]
        for section in structure
    ]

    template = env.get_template("base.html")
    html_data = template.render(
        results={
            "area_names": area_names,
            "data": result
        },
        timestamp=datetime.now().strftime(r"%A, %d %b %Y at %H:%M:%S GMT")
    )

    store_html(html_data, slug_id=slug_id)
def get_latest_scale_records(payload):
    """
    Return rate records for the payload's area type and date, together with
    the rate distribution percentiles used for the colour scale.
    """
    date = datetime.fromisoformat(payload["timestamp"])
    area_type = payload["area_type"]

    sql = RATES.format(
        area_type=AREA_TYPE_PARTITION[area_type],
        date=f"{date:%Y_%-m_%-d}"
    )

    # MSOA rates live under a different metric/attribute pair.
    if area_type == "msoa":
        metric, attr = "newCasesBySpecimenDate", "rollingRate"
    else:
        metric, attr = "newCasesBySpecimenDateRollingRate", "value"

    session = Session()
    conn = session.connection()
    try:
        rows = conn.execute(
            text(sql),
            metric=metric,
            attr=attr,
            area_type=area_type
        ).fetchall()
    except Exception as err:
        session.rollback()
        raise err
    finally:
        session.close()

    data = DataFrame(rows, columns=["area_type", "area_code", "rate"])
    rate = data.loc[:, "rate"]

    return {
        "records": data.to_dict(orient="records"),
        "percentiles": {
            "min": rate.min(),
            "0.1": rate.quantile(0.1).round(1),
            "0.4": rate.quantile(0.4).round(1),
            "median": rate.median().round(1),
            "0.6": rate.quantile(0.6).round(1),
            "0.9": rate.quantile(0.9).round(1),
            "max": rate.max(),
        },
        "timestamp": payload["timestamp"]
    }
def register_file(filepath: str, timestamp: datetime, instance_id: str, release_id=None) -> bool:
    """
    Record ``filepath`` as processed by process ``instance_id``.

    If the insert hits a uniqueness constraint but the (filepath,
    instance_id) pair is already present, the duplicate is treated as
    success; otherwise the ``IntegrityError`` is re-raised.

    Parameters
    ----------
    filepath : str
        Path of the processed file.
    timestamp : datetime
        Processing timestamp.
    instance_id : str
        Identifier of the processing run.
    release_id
        Optional release to associate the file with.

    Returns
    -------
    bool
        ``True`` when the file is (or already was) registered.

    Raises
    ------
    IntegrityError
        When the constraint violation is not the expected duplicate.
    """
    parsed_filepath = parse_filepath(filepath)

    processed_file = ProcessedFile(file_path=filepath,
                                   type=category_label(parsed_filepath),
                                   timestamp=timestamp,
                                   release_id=release_id,
                                   process_id=instance_id)

    session = Session()
    try:
        session.add(processed_file)
        session.flush()
    except IntegrityError as err:
        # Constraint hit — check whether this exact (file, process) pair
        # already exists, which counts as success.
        session.rollback()
        query = session.execute(
            select([
                ProcessedFile.id,
            ]).where(
                and_(ProcessedFile.file_path == filepath,
                     ProcessedFile.process_id == instance_id)))
        result = query.fetchone()

        if result is not None:
            return True

        logging.info("Record already exists.")
        # Unexpected integrity failure: surface it to the caller.
        raise err
    except Exception as err:
        session.rollback()
        raise err
    finally:
        session.close()

    return True
def retrieve_data(timestamp: datetime):
    """
    Execute the templated query for the release date of ``timestamp`` and
    return the result as a key/value DataFrame.
    """
    with open(query_path) as fp:
        template = fp.read()

    sql = Template(template).substitute(release_date=f"{timestamp:%Y_%-m_%-d}")

    session = Session()
    conn = session.connection()
    try:
        rows = conn.execute(text(sql)).fetchall()
    except Exception as err:
        session.rollback()
        raise err
    finally:
        session.close()

    return DataFrame(rows, columns=["key", "value"])
def get_data(record_date: datetime) -> DataFrame:
    """
    Fetch the main metric values for ``record_date``, tagged with the
    record's slug id.
    """
    sql = MAIN_QUERY.format(partition_date=f"{record_date:%Y_%-m_%-d}")

    session = Session()
    conn = session.connection()
    try:
        rows = conn.execute(text(sql), metrics=METRICS).fetchall()
    except Exception as err:
        session.rollback()
        raise err
    finally:
        session.close()

    frame = DataFrame(
        rows,
        columns=["release_id", "area_id", "metric", "date", "value"]
    )

    return frame.assign(slug_id=get_record_id(record_date))
def get_metric_id(metric: str):
    """
    Return the primary key of ``metric`` from the metric reference table.
    """
    stmt = select([MetricReference.id]).where(MetricReference.metric == metric)

    session = Session()
    try:
        row = session.execute(stmt).fetchone()
        return row[0]
    except Exception as err:
        session.rollback()
        raise err
    finally:
        session.close()
def create_partition(area_type: str, release: datetime):
    """
    Create a database partition for the ``time_series`` table — if one does
    not already exist — based on ``area_type`` and the ``release`` datestamp.

    Parameters
    ----------
    area_type : str
        Area type, as defined in the `area_reference` table.

    release : datetime
        Release timestamp of the data.

    Returns
    -------
    NoReturn
    """
    partition_id = get_partition_id(area_type, release)

    # Trust/LA/MSOA types get their own partition; the rest share "other".
    if area_type in ["nhsTrust", "utla", "ltla", "msoa"]:
        suffix = area_type.lower()
    else:
        suffix = "other"
    area_partition = f"{release:%Y_%-m_%-d}_{suffix}"

    session = Session()
    try:
        session.execute(
            f"""
            CREATE TABLE IF NOT EXISTS covid19.time_series_p{area_partition}
            PARTITION OF covid19.time_series ( partition_id )
            FOR VALUES IN ('{partition_id}');
            """
        )
        session.flush()
    except ProgrammingError as e:
        session.rollback()
    except Exception as err:
        session.rollback()
        raise err
    finally:
        session.close()
def get_data(query, metric):
    """
    Return the latest date and comma-formatted value for ``metric``.
    """
    session = Session()
    conn = session.connection()
    try:
        row = conn.execute(text(query), metric=metric).fetchone()
    except Exception as err:
        session.rollback()
        raise err
    finally:
        session.close()

    return {
        f'{metric}Date': f'{row[1]:%-d %b}',
        metric: format(int(row[2]), ",d")
    }
def get_area_codes(area_type: str):
    """
    Return area ids and codes for ``area_type`` as a DataFrame.
    """
    stmt = (
        select([
            AreaReference.id.label("area_id"),
            AreaReference.area_code
        ])
        .where(AreaReference.area_type == area_type)
    )

    session = Session()
    try:
        frame = read_sql(stmt, con=session.connection())
    except Exception as err:
        session.rollback()
        raise err
    finally:
        session.close()

    return frame
def add_metric(metric):
    """
    Register ``metric`` in the metric reference table, ignoring it if it is
    already present.

    Parameters
    ----------
    metric
        Metric name to register.

    Returns
    -------
    None
    """
    stmt = (
        insert(MetricReference.__table__)
        .values(metric=metric)
        .on_conflict_do_nothing(index_elements=[MetricReference.metric])
        .compile(dialect=postgres())
    )

    session = Session()
    try:
        session.connection().execute(stmt)
        session.flush()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()

    return None
def get_data(timestamp: datetime):
    """
    Return rolling-rate case data for the partition of ``timestamp``.

    Parameters
    ----------
    timestamp : datetime
        Date identifying the partition and the datestamp parameter.

    Returns
    -------
    DataFrame
        Columns: ``areaType``, ``areaCode`` and the rolling-rate metric.
    """
    query = MAIN.format(date=f"{timestamp:%Y_%-m_%-d}")
    metric = "newCasesBySpecimenDateRollingRate"

    session = Session()
    conn = session.connection()
    try:
        resp = conn.execute(
            text(query),
            metric=metric,
            # BUG FIX: the original used `f'{timestamp::%Y-%m-%d}'` — the
            # doubled colon made the format spec ":%Y-%m-%d", so the bound
            # parameter carried a spurious leading ":" (e.g. ":2021-01-01").
            datestamp=f'{timestamp:%Y-%m-%d}'
        )
        raw_data = resp.fetchall()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()

    # NOTE: the leftover debug `print(len(raw_data))` has been removed.
    return DataFrame(raw_data, columns=["areaType", "areaCode", metric])