# Imports reconstructed from usage. The stdlib and third-party imports are
# standard; the project-internal import paths below (pipeline, pyseir,
# CommonFields, InferRtConstants, utils, RunArtifact) are assumptions inferred
# from how the names are used in this module.
from datetime import timedelta
from typing import List, Optional, Tuple

import pandas as pd
import structlog
from matplotlib import pyplot as plt

import pipeline
import pyseir.utils
from covidactnow.datapublic.common_fields import CommonFields
from pyseir.rt import utils
from pyseir.rt.constants import InferRtConstants
from pyseir.utils import RunArtifact


def _merge_field(
    datasource_series: List[Tuple[str, pd.Series]], new_index, log: structlog.BoundLoggerBase
):
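    """Merge one field from multiple data sources into a single series.

    `datasource_series` is assumed to be ordered lowest-priority first; the loop
    iterates in reverse so the highest-priority source is applied first. Returns
    the merged series together with a provenance series naming the data source
    that supplied each value.
    """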
    log.info("Working field")
    field_out = None
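    # Tracks, for each row of the output index, which data source supplied the value.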
    field_provenance = pd.Series(index=new_index, dtype="object")
    # Go through the data sources, starting with the highest priority.
    for datasource_name, datasource_field_in in reversed(datasource_series):
        log_datasource = log.bind(dataset_name=datasource_name)
        if field_out is None:
            # Copy all values from the highest priority input to the output
            field_provenance.loc[pd.notna(datasource_field_in)] = datasource_name
            field_out = datasource_field_in
        else:
            field_out_has_ts = field_out.groupby(level=[CommonFields.FIPS], sort=False).transform(
                lambda x: x.notna().any()
            )
            # Copy from datasource_field_in only on rows where all rows of
            # field_out with that FIPS are NaN.
            copy_field_in = (~field_out_has_ts) & pd.notna(datasource_field_in)
            field_provenance.loc[copy_field_in] = datasource_name
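            # where() keeps the existing field_out values and takes values from
            # datasource_field_in only where copy_field_in is True.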
            field_out = field_out.where(~copy_field_in, datasource_field_in)
        dups = field_out.index.duplicated(keep=False)
        if dups.any():
            log_datasource.error("Found duplicates in index")
            # This is bad: somehow the input still has duplicates.
            raise ValueError("Input index contains duplicate entries after merge")
    return field_out, field_provenance
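

# A minimal usage sketch (illustrative, not part of the original module). It
# assumes the input series carry a (fips, date) MultiIndex and that sources are
# listed lowest-priority first:
#
#   idx = pd.MultiIndex.from_product(
#       [["06", "36"], pd.date_range("2020-04-01", periods=3)],
#       names=[CommonFields.FIPS, CommonFields.DATE],
#   )
#   merged, provenance = _merge_field(
#       [("backup_source", backup_series), ("preferred_source", preferred_series)],
#       new_index=idx,
#       log=structlog.get_logger(),
#   )
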
def filter_and_smooth_input_data(
    df: pd.DataFrame,
    region: pipeline.Region,
    include_deaths: bool,
    figure_collector: Optional[dict],
    log: structlog.BoundLoggerBase,
) -> pd.DataFrame:
    """Do Filtering Here Before it Gets to the Inference Engine"""
    MIN_CUMULATIVE_COUNTS = dict(cases=20, deaths=10)
    MIN_INCIDENT_COUNTS = dict(cases=5, deaths=5)

    dates = df.index
    # Apply Business Logic To Filter Raw Data
    for column in ["cases", "deaths"]:
        requirements = [  # All Must Be True
            df[column].count() > InferRtConstants.MIN_TIMESERIES_LENGTH,
            df[column].sum() > MIN_CUMULATIVE_COUNTS[column],
            df[column].max() > MIN_INCIDENT_COUNTS[column],
        ]
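        # More requirements are appended below; all are evaluated together at the end.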
        # Now Apply Input Outlier Detection and Smoothing

        filtered = utils.replace_outliers(df[column], log=log.bind(column=column))
        # TODO find way to indicate which points filtered in figure below

        assert len(filtered) == len(df[column])
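        # Centered, Gaussian-weighted rolling mean; window size and kernel std
        # come from InferRtConstants. Note that min_periods reuses the
        # kernel-std constant rather than a dedicated minimum-count setting.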
        smoothed = filtered.rolling(
            InferRtConstants.COUNT_SMOOTHING_WINDOW_SIZE,
            win_type="gaussian",
            min_periods=InferRtConstants.COUNT_SMOOTHING_KERNEL_STD,
            center=True,
        ).mean(std=InferRtConstants.COUNT_SMOOTHING_KERNEL_STD)
        # TODO: Only start once non-zero to maintain backwards compatibility?

        # Check That the Smoothed Series Still Meets the Requirements
        requirements.append(smoothed.max() > MIN_INCIDENT_COUNTS[column])

        # Check include_deaths Flag: deaths are kept only when explicitly requested
        requirements.append(column != "deaths" or include_deaths)

        if all(requirements):
            if column == "cases":
                fig = plt.figure(figsize=(10, 6))
                ax = fig.add_subplot(111)  # plt.axes
                ax.set_yscale("log")
                chart_min = max(0.1, smoothed.min())
                ax.set_ylim((chart_min, df[column].max()))
                plt.scatter(
                    dates[-len(df[column]) :],
                    df[column],
                    alpha=0.3,
                    label=f"Smoothing of: {column}",
                )
                plt.plot(dates[-len(df[column]) :], smoothed)
                plt.grid(True, which="both")
                plt.xticks(rotation=30)
                plt.xlim(min(dates[-len(df[column]) :]), max(dates) + timedelta(days=2))

                if not figure_collector:
                    plot_path = pyseir.utils.get_run_artifact_path(
                        region, RunArtifact.RT_SMOOTHING_REPORT
                    )
                    plt.savefig(plot_path, bbox_inches="tight")
                    plt.close(fig)
                else:
                    figure_collector["1_smoothed_cases"] = fig

            df[column] = smoothed
        else:
            df = df.drop(columns=column)
            log.info("Dropping column", column=column, requirements=requirements)

    return df
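

# A minimal usage sketch (illustrative; pipeline.Region.from_fips and the
# expected DataFrame shape are assumptions): df is indexed by date and carries
# incident "cases" and "deaths" columns, and figures are collected in a dict
# instead of being written to disk.
#
#   figures = {}
#   smoothed = filter_and_smooth_input_data(
#       df=input_df,
#       region=pipeline.Region.from_fips("36"),
#       include_deaths=False,
#       figure_collector=figures,
#       log=structlog.get_logger(),
#   )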