def find_last_change(val, vec_vals: pd.Series, vec_date) -> Tuple[int, datetime]:
    """Find the most recent historical entry whose value differs from *val*.

    *vec_vals* and *vec_date* are parallel, newest-first series; dates are
    YYYYMMDD integers (or strings).  Returns the differing value together
    with its date localized to eastern time, or (0, None) when every
    historical entry equals *val* (i.e. the value never changed).
    """
    for entry, raw_date in zip(vec_vals.values, vec_date.values):
        if entry == val:
            continue
        # parse the YYYYMMDD date of the first differing entry
        text = str(raw_date)
        naive = datetime(int(text[:4]), int(text[4:6]), int(text[6:8]))
        return entry, udatetime.naivedatetime_as_eastern(naive)
    return 0, None
def increasing_values(row, df: pd.DataFrame, log: ResultLog, config: QCConfig = None) -> bool:
    """Check that new values more than previous values

    df contains the historical values (newest first).  offset controls how many days to look back.

    consolidate lines if everything changed

    return False if it looks like we have no new data for this source so we can bypass other tests
    """

    if not config: config = QCConfig()

    # restrict history to rows strictly before the run's target date
    df = df[df.date < row.targetDate]

    dict_row = row._asdict()

    # local time is an editable field that it supposed to be the last time the data changed.
    # last_updated is the same value but adjusted to eastern TZ
    if "localTime" in dict_row:
        local_time = row.localTime
        # encode as a comparable YYYYMMDD integer
        d_local = local_time.year * 10000 + local_time.month * 100 + local_time.day
    else:
        local_time = None
        d_local = 0

    last_updated = row.lastUpdateEt
    d_updated = last_updated.year * 10000 + last_updated.month * 100 + last_updated.day

    # target date of run
    s_target = str(row.targetDate)
    d_target = datetime(int(s_target[0:4]), int(s_target[4:6]), int(s_target[6:8]))
    d_target = udatetime.naivedatetime_as_eastern(d_target)

    # running maximum of the most recent change date seen across all checked fields
    d_last_change = udatetime.naivedatetime_as_eastern(datetime(2020, 1, 1))

    debug = config.enable_debug

    if debug: logger.debug(f"check {row.state}")

    # columns to check, and the shorter labels used in consolidated messages
    fieldList = ["positive", "negative", "death", "hospitalizedCumulative", "inIcuCumulative", "onVentilatorCumulative"]
    displayList = ["positive", "negative", "death", "hospitalized", "icu", "ventilator"]

    source_messages = []
    # consolidate stays True only while every stale field stalled on the same day
    has_issues, consolidate, n_days, n_days_prev = False, True, -1, 0

    for c in fieldList:
        val = dict_row.get(c)
        if val is None:
            log.internal(row.state, f"{c} missing column")
            has_issues, consolidate = True, False
            if debug: logger.debug(f" {c} missing column")
            continue
        if not c in df.columns:
            log.internal(row.state, f"{c} missing history column")
            has_issues, consolidate = True, False
            if debug: logger.debug(f" {c} missing history column")
            continue

        vec = df[c].values
        # newest historical value/date (history is newest-first); 0 when no history
        prev_val = vec[0] if vec.size > 0 else 0
        prev_date = df["date"].iloc[0] if vec.size > 0 else 0

        if val < prev_val and (val > 0 and prev_val != 0 ): # negative values indicate blank/errors
            # format the YYYYMMDD int as MM/DD for the message
            sd = str(prev_date)[4:] if prev_date > 0 else "-"
            sd = sd[0:2] + "/" + sd[2:4]
            log.data_quality(row.state, f"{c} ({val:,}) decreased from {prev_val:,} as-of {sd}")
            has_issues, consolidate = True, False
            if debug: logger.debug(f" {c} ({val:,}) decreased from {prev_val:,} as-of {sd}")
            continue

        # allow value to be the same if below a threshold, default to 10
        t = IGNORE_THRESHOLDS.get(c)
        if t == None: t = 10
        if val < t:
            if debug: logger.debug(f" {c} ({val:,}) is below threshold -> ignore 'same' check")
            continue

        #phase = row.phase
        #checked_at = row.lastCheckEt.to_pydatetime()
        #is_check_field_set = checked_at > START_OF_TIME

        # -1000 is a sentinel for a value that failed numeric conversion upstream
        # (presumably set by the loader -- TODO confirm)
        if val == -1000:
            log.data_entry(row.state, f"{c} value cannot be converted to a number")
            has_issues, consolidate = True, False
            if debug: logger.debug(f" {c} was not a number in source data")
            continue

        if val == prev_val:
            # value is unchanged from yesterday: find when it last moved
            # NOTE(review): find_last_change returns (0, None) when the value never
            # changed; the subtraction below would then raise TypeError -- verify
            # history always contains at least one differing entry
            changed_val, changed_date = find_last_change(val, df[c], df["date"])
            n_days = int((d_target - changed_date).total_seconds() // (60 * 60 * 24))
            if n_days >= 0:
                d_last_change = max(d_last_change, changed_date)
                source_messages.append(f"{c} ({val:,}) hasn't changed since {changed_date.month}/{changed_date.day} ({n_days} days)")

                # check if we can still consolidate results
                if n_days_prev == 0:
                    n_days_prev = n_days
                    if debug: logger.debug(f" {c} ({val:,}) hasn't changed since {changed_date.month}/{changed_date.day} ({n_days} days)")
                elif n_days_prev == n_days:
                    if debug: logger.debug(f" {c} ({val:,}) also hasn't changed since {changed_date.month}/{changed_date.day}")
                else:
                    # fields stalled on different days -> report them individually
                    consolidate = False
                    if debug: logger.debug(f" {c} ({val:,}) hasn't changed since {changed_date.month}/{changed_date.day} ({n_days} days ago) -> force individual lines ")
            else:
                # NOTE(review): df["date"].values[-1] is a YYYYMMDD int while
                # d_last_change is a datetime; max() on mixed types raises
                # TypeError in Python 3 -- confirm this branch is reachable/tested
                d_last_change = max(d_last_change, df["date"].values[-1])
                has_issues, consolidate = True, False
                log.data_source(row.state, f"{c} ({val:,}) constant for all time")
                if debug: logger.debug(f" {c} ({val:,}) constant -> force individual lines ")
        else:
            # value genuinely moved -> this source has fresh data, no consolidation
            consolidate = False
            if debug: logger.debug(f" {c} ({val:,}) changed from {prev_val:,} on {prev_date}")

    if len(source_messages) == 0:
        if debug: logger.debug(f" no source messages -> has_issues={has_issues}")
        return has_issues

    # alert if local time appears to updated incorrectly
    if d_local != 0 and d_local != d_updated:
        # NOTE(review): this slicing assumes str(d_last_change) is YYYYMMDD..., but
        # d_last_change is a datetime whose str() is ISO-formatted -- the resulting
        # MM/DD text looks wrong; confirm intended representation
        sd = str(d_last_change)
        sd = sd[4:6] + "/" + sd[6:8]
        sd_local = f"{local_time.month}/{local_time.day} {local_time.hour:02}:{local_time.minute:02}"
        checker = row.checker
        if checker == "": checker = "??"
        log.data_entry(row.state, f"checker {checker} set local time (column V) to {sd_local} but values haven't changed since {sd} ({n_days:.0f} days ago)")
        #has_issues = True
        if debug: logger.debug(f" checker {checker} set local time (column V) to {sd_local} but values haven't changed since {sd} ({n_days:.0f} days ago)")

    if consolidate:
        # all stale fields stalled on the same day -> emit a single combined line
        names = "/".join(displayList)
        log.data_source(row.state, f"cumulative values ({names}) haven't changed since {d_last_change.month}/{d_last_change.day} ({n_days:.0f} days)")
        if debug: logger.debug(f" cumulative values ({names}) haven't changed since {d_last_change.month}/{d_last_change.day} ({n_days:.0f} days)")
    else:
        # otherwise log each field's staleness message separately
        for m in source_messages:
            log.data_source(row.state, m)
        if debug: logger.debug(f" {row.state}: record {len(source_messages)} source issue(s) to log")
    return has_issues
from datetime import datetime from loguru import logger import pandas as pd import numpy as np from typing import Tuple from app.util import udatetime from .qc_config import QCConfig from .log.result_log import ResultLog from .modeling.forecast import Forecast from .modeling.forecast_plot import plot_to_file from .modeling.forecast_io import save_forecast_hd5, load_forecast_hd5 START_OF_TIME = udatetime.naivedatetime_as_eastern(datetime(2020, 1, 2)) def current_time_and_phase() -> Tuple[datetime, str]: "get the current time (ET) and phase of the process given the hour" target_time = udatetime.now_as_eastern() hour = target_time.hour # note -- these are all just guesses on a mental model of 1 update per day. Josh phase = "" if hour < 10: phase = "inactive" elif hour < 12 + 2: phase = "prepare" # preparing for run