Пример #1
0
def total_tests(row, log: ResultLog):
    """Check that positive and negative sum to the reported totalTestResults.

    Args:
        row: record exposing ``state``, ``positive``, ``negative`` and
            ``totalTestResults`` attributes.
        log: result log; ``log.error(state, message)`` is called when the
            three numbers do not add up.
    """

    # note -- I don't know where this field is in the sheet so this test is not used right now - Josh

    n_pos, n_neg, n_tests = row.positive, row.negative, row.totalTestResults

    # a non-zero delta means the sheet formula (positive + negative) is broken
    n_diff = n_tests - (n_pos + n_neg)
    if n_diff != 0:
        log.error(row.state, f"Formula broken -> Positive ({n_pos}) + Negative ({n_neg}) != Total Tests ({n_tests}), delta = {n_diff}")
Пример #2
0
def death_rate(row, log: ResultLog):
    """Check that deaths are a small share of test results.

    Deaths are compared against positive + negative results.  The limit is
    5% when there are more than 100 results and 10% otherwise.

    Args:
        row: record exposing ``state``, ``positive``, ``negative`` and
            ``death`` attributes.
        log: result log; ``log.error(state, message)`` is called when the
            limit is exceeded.
    """

    n_pos, n_neg, n_deaths = row.positive, row.negative, row.death
    n_tot = n_pos + n_neg

    # with no results at all the rate is treated as 0% rather than dividing by zero
    percent_deaths = 100.0 * n_deaths / n_tot if n_tot > 0 else 0.0

    # small samples are noisy, so they get the looser limit
    max_percent = 5.0 if n_tot > 100 else 10.0
    if percent_deaths > max_percent:
        log.error(row.state, f"Too many deaths {percent_deaths:.0f}% (deaths={n_deaths:,}, total={n_tot:,})")
Пример #3
0
def positives_rate(row, log: ResultLog):
    """Check that positives are not too large a share of test results.

    Positives are compared against positive + negative results.  The limit
    is 40% when there are more than 100 results and 80% otherwise.  Rows
    with 20 or fewer positives are never flagged.

    Args:
        row: record exposing ``state``, ``positive`` and ``negative``
            attributes.
        log: result log; ``log.error(state, message)`` is called when the
            limit is exceeded.
    """

    n_pos, n_neg = row.positive, row.negative
    n_tot = n_pos + n_neg

    # with no results at all the rate is treated as 0% rather than dividing by zero
    percent_pos = 100.0 * n_pos / n_tot if n_tot > 0 else 0.0

    # small samples are noisy, so they get the looser limit
    max_percent = 40.0 if n_tot > 100 else 80.0
    if percent_pos > max_percent and n_pos > 20:
        log.error(row.state, f"Too many positive {percent_pos:.0f}% (positive={n_pos:,}, total={n_tot:,})")
Пример #4
0
def _closest_value(values, target):
    """Return the entry of the Series *values* nearest to *target* (NaN entries are skipped)."""
    # reset the index so idxmin() yields a position even when labels repeat
    diffs = (values - target).abs().reset_index(drop=True)
    return values.iloc[diffs.idxmin()]


def counties_rollup_to_state(row, counties: pd.DataFrame, log: ResultLog):
    """
    Check that county totals from NYT, CSBS, CDS datasets are
    about equal to the reported state totals. Metrics compared are:
        - positive cases
        - patient deaths

    Each row of ``counties`` holds one aggregate in its ``cases`` and
    ``deaths`` columns; the one nearest to the state value is used.
    A metric is only checked when the state value is large enough
    (more than 100 positives, more than 20 deaths).  The relative error
    is compared against ``COUNTY_ERROR_THRESHOLDS``.

    The value reported in the message is the actual nearest county
    aggregate, whether it is above or below the state total.
    """
    if row.positive > 100:
        cases = counties["cases"]
        pos_error = (cases - row.positive).abs().min() / row.positive
        if pos_error > COUNTY_ERROR_THRESHOLDS["positive"]:
            # look the value up directly: adding the absolute error to the state
            # total would be wrong when the county aggregate is the smaller one
            closest_pos = int(round(_closest_value(cases, row.positive)))
            log.error(row.state, f"county aggregate for positive tests does not match state totals (state: {row.positive}, county: {closest_pos})")

    if row.death > 20:
        deaths = counties["deaths"]
        death_error = (deaths - row.death).abs().min() / row.death
        if death_error > COUNTY_ERROR_THRESHOLDS["death"]:
            closest_death = int(round(_closest_value(deaths, row.death)))
            log.error(row.state, f"county aggregate for patient deaths does not match state totals (state: {row.death}, county: {closest_death})")
Пример #5
0
def expected_positive_increase( current: pd.DataFrame, history: pd.DataFrame,
                                log: ResultLog, context: str, config: QCConfig=None):
    """
    Compare the current positive count against a range projected from history.

    A Forecast is fitted on the historical rows and projected onto the
    current one, yielding the actual value plus a linear and an
    exponential expectation.  The scaled linear expectation is the lower
    bound and the scaled exponential expectation is the upper bound
    (scale factors come from FIT_THRESHOLDS).  A value outside that
    range is logged as an error.

    Depending on config, the forecast is either saved to the results
    directory or plotted under "<images_dir>/<context>".

    TODO: Eventually these curves will NOT be exp (perhaps logistic?)
          Useful to know which curves have been "leveled" but from a
          data quality perspective, this check would become annoying
    """

    if not config:
        config = QCConfig()

    # leave out history rows dated the same day as the current row's last update
    today_key = current.lastUpdateEt.to_pydatetime().strftime('%Y%m%d')
    is_other_day = history["date"].astype(str) != today_key
    history = history.loc[is_other_day]

    model = Forecast()
    model.fit(history)
    model.project(current)

    # saving takes precedence over plotting
    if config.save_results:
        save_forecast_hd5(model, config.results_dir)
    elif config.plot_models:
        plot_to_file(model, f"{config.images_dir}/{context}", FIT_THRESHOLDS)

    state, date = model.state, model.date
    actual_value, expected_linear, expected_exp = model.results

    lower_scale, upper_scale = FIT_THRESHOLDS[0], FIT_THRESHOLDS[1]
    min_value = int(lower_scale * expected_linear)
    max_value = int(upper_scale * expected_exp)

    if min_value <= actual_value <= max_value:
        return

    direction = "drop" if actual_value < expected_linear else "increase"
    log.error(state, f"unexpected {direction} in positive cases ({actual_value:,}) for {date}, expected between {min_value:,} and {max_value:,}")
Пример #6
0
def checkers_initials(row, log: ResultLog):
    """Confirm that checker and double-checker initials are recorded.

    Nothing is reported for rows in the "inactive" phase or rows whose
    last-check time is not after START_OF_TIME.  Missing initials are
    errors in the "publish" and "update" phases, and info messages
    otherwise.  A missing checker is always an error when the check time
    falls less than 5 hours before the target date.
    """

    if row.phase == "inactive":
        return

    target_date = row.targetDateEt.to_pydatetime()
    checked_at = row.lastCheckEt.to_pydatetime()
    if checked_at <= START_OF_TIME:
        return

    near_release = row.phase in ("publish", "update")

    primary = row.checker.strip()
    secondary = row.doubleChecker.strip()

    # hours between the last check and the target date
    hours_to_target = (target_date - checked_at).total_seconds() / 3600.0

    if not primary:
        if 0 < hours_to_target < 5:
            stamp = checked_at.strftime('%m/%d %H:%M')
            log.error(row.state, f"missing checker initials but checked date set recently (at {stamp})")
        elif near_release:
            log.error(row.state, "missing checker initials")
        else:
            log.info(row.state, "missing checker initials")
    elif not secondary:
        if near_release:
            log.error(row.state, "missing double-checker initials")
        else:
            log.info(row.state, "Missing double-checker initials")
Пример #7
0
def increasing_values(row, df: pd.DataFrame, log: ResultLog):
    """Check that new values are more than the previous values.

    df contains the historical values (newest first).  Only rows dated
    before row.targetDate are used, and the first of those is taken as
    the previous value (0 when there is none).

    For each of positive, negative, death and total:
      - a value below the previous one is an error;
      - values below IGNORE_THRESHOLDS are not checked further;
      - an unchanged value is an error or a warning, depending on the
        size of the value, the phase and whether the check field is set;
      - otherwise the percent increase is compared against
        EXPECTED_PERCENT_THRESHOLDS and a warning is logged when it is
        out of range.
    """

    df = df[df.date < row.targetDate]

    dict_row = row._asdict()

    for c in ["positive", "negative", "death", "total"]:
        val = dict_row[c]
        vec = df[c].values
        prev_val = vec[0] if vec.size > 0 else 0

        if val < prev_val:
            log.error(row.state, f"{c} value ({val:,}) is less than prior value ({prev_val:,})")

        # allow value to be the same if below a threshold
        if val < IGNORE_THRESHOLDS[c]: continue

        if val == prev_val:
            # only this branch needs the phase and the check time
            phase = row.phase
            checked_at = row.lastCheckEt.to_pydatetime()
            is_check_field_set = checked_at > START_OF_TIME

            n_days, d = days_since_change(val, df[c], df["date"])
            if n_days >= 0:
                # d is a yyyymmdd value; show it as mm/dd
                d = str(d)
                d = d[4:6] + "/" + d[6:8]

                if prev_val >= 20 and (is_check_field_set or phase in ["publish", "update"]):
                    log.error(row.state, f"{c} value ({val:,}) has not changed since {d} ({n_days} days)")
                else:
                    log.warning(row.state, f"{c} value ({val:,}) has not changed since {d} ({n_days} days)")
            else:
                log.error(row.state, f"{c} value ({val:,}) constant for all time")
            continue

        if prev_val == 0:
            # no usable baseline (empty history or a prior value of 0): a percent
            # increase is undefined, so report it instead of dividing by zero
            log.warning(row.state, f"{c} value ({val:,}) has no non-zero prior value to compare against")
            continue

        p_observed = 100.0 * val / prev_val - 100.0

        #TODO: estimate expected increase from recent history
        p_min, p_max = EXPECTED_PERCENT_THRESHOLDS[c]
        if p_observed < p_min or p_observed > p_max:
            log.warning(row.state, f"{c} value ({val:,}) is a {p_observed:.0f}% increase, expected: {p_min:.0f} to {p_max:.0f}%")
Пример #8
0
def less_recovered_than_positive(row, log: ResultLog):
    """Report an error when the recovered count is larger than the positive count."""

    n_recovered, n_positive = row.recovered, row.positive

    # written as "not (a > b)" so an incomparable value such as NaN stays silent
    if not (n_recovered > n_positive):
        return

    log.error(row.state, f"More recovered than positive (recovered={n_recovered:,}, positive={n_positive:,})")
Пример #9
0
def check_working(ds: DataSource, config: QCConfig) -> ResultLog:
    """
    Check unpublished results in the working google sheet
    https://docs.google.com/spreadsheets/d/1MvvbHfnjF67GnYUDJJiNYUmGco5KQ9PW0ZRnEP9ndlU/edit#gid=1777138528

    Every row of ``ds.working`` is run through the row-level checks, the
    history-based checks and, when county data exists for the state, the
    county rollup check.  An exception raised while checking a row is
    logged and recorded as an error for that state, and the remaining
    rows are still processed.

    Args:
        ds: data source providing ``working``, ``history`` and
            ``county_rollup`` frames.
        config: controls forecast saving/plotting (``save_results``,
            ``plot_models``, ``results_dir``, ``images_dir``).

    Returns:
        The ResultLog holding every message produced by the checks.
    """

    logger.info("check working")
    log = ResultLog()

    # targetDate is the date that the dev sheet is currently working on.
    # phase is what part of their process they are in.
    # targetDateEt is the time that should be used on any 'staleness' checks

    d, phase = checks.current_time_and_phase()

    ds._target_date = d

    df = ds.working
    # targetDate is stored as a yyyymmdd integer
    df["targetDate"] = d.year * 10000 + d.month * 100 + d.day
    df["targetDateEt"] = d
    df["phase"] = phase

    logger.info(f"Running with target date = {d} and phase = {phase}")

    # *** WHEN YOU CHANGE A CHECK THAT IMPACTS WORKING, MAKE SURE TO UPDATE THE EXCEL TRACKING DOCUMENT ***

    for row in df.itertuples():
        try:

            checks.total(row, log)
            #checks.total_tests(row, log)
            checks.last_update(row, log)
            checks.last_checked(row, log)
            checks.checkers_initials(row, log)
            checks.positives_rate(row, log)
            checks.death_rate(row, log)
            checks.less_recovered_than_positive(row, log)
            checks.pendings_rate(row, log)

            df_history = ds.history[ds.history.state == row.state]
            checks.increasing_values(row, df_history, log)
            checks.expected_positive_increase(row, df_history, log, "working",
                                              config)

            df_county_rollup = ds.county_rollup[ds.county_rollup.state ==
                                                row.state]
            if not df_county_rollup.empty:
                checks.counties_rollup_to_state(row, df_county_rollup, log)

        except Exception as ex:
            # one bad row must not stop the remaining states from being checked
            logger.exception(ex)
            log.error(row.state, f"{ex}")

    # plot from the saved forecasts after all rows are processed, instead of during the run
    if config.plot_models and config.save_results:
        for row in df.itertuples():
            try:
                forecast = load_forecast_hd5(config.results_dir, row.state,
                                             row.targetDate)
                if forecast is None:
                    logger.warning(
                        f"Could not load forecast for {row.state}/{row.targetDate}"
                    )
                else:
                    plot_to_file(forecast, f"{config.images_dir}/working",
                                 checks.FIT_THRESHOLDS)
            except Exception as ex:
                logger.exception(ex)
                log.error(row.state, f"{ex}")

    return log