def _merge_field(
    datasource_series: List[Tuple[str, pd.Series]],
    new_index,
    log: structlog.BoundLoggerBase,
):
    log.info("Working field")
    field_out = None
    field_provenance = pd.Series(index=new_index, dtype="object")
    # Go through the data sources, starting with the highest priority.
    for datasource_name, datasource_field_in in reversed(datasource_series):
        log_datasource = log.bind(dataset_name=datasource_name)
        if field_out is None:
            # Copy all values from the highest priority input to the output
            field_provenance.loc[pd.notna(datasource_field_in)] = datasource_name
            field_out = datasource_field_in
        else:
            field_out_has_ts = field_out.groupby(
                level=[CommonFields.FIPS], sort=False
            ).transform(lambda x: x.notna().any())
            copy_field_in = (~field_out_has_ts) & pd.notna(datasource_field_in)
            # Copy from datasource_field_in only on rows where all rows of field_out
            # with that FIPS are NaN.
            field_provenance.loc[copy_field_in] = datasource_name
            field_out = field_out.where(~copy_field_in, datasource_field_in)
        dups = field_out.index.duplicated(keep=False)
        if dups.any():
            log_datasource.error("Found duplicates in index")
            raise ValueError()  # This is bad, somehow the input /still/ has duplicates
    return field_out, field_provenance


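# Illustrative usage sketch (an editor's addition, not part of the original module):
# it shows how _merge_field might combine two prioritized series sharing a
# (fips, date) MultiIndex. The datasource names and values are hypothetical, and it
# assumes CommonFields.FIPS names the FIPS level of the index as used below.
def _example_merge_field(log: structlog.BoundLoggerBase) -> Tuple[pd.Series, pd.Series]:
    idx = pd.MultiIndex.from_product(
        [["06", "36"], pd.to_datetime(["2020-04-01", "2020-04-02"])],
        names=[CommonFields.FIPS, "date"],
    )
    high = pd.Series([1.0, 2.0, None, None], index=idx)  # preferred source; no data for fips 36
    low = pd.Series([9.0, 9.0, 3.0, 4.0], index=idx)  # fallback source
    # Later entries take priority because _merge_field iterates the list in reverse.
    merged, provenance = _merge_field(
        [("low_priority", low), ("high_priority", high)], idx, log
    )
    # merged keeps the high_priority values for fips 06 and falls back to low_priority
    # for fips 36; provenance records which datasource supplied each row.
    return merged, provenance

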
def filter_and_smooth_input_data(
    df: pd.DataFrame,
    region: pipeline.Region,
    include_deaths: bool,
    figure_collector: Optional[dict],
    log: structlog.BoundLoggerBase,
) -> pd.DataFrame:
    """Filter and smooth raw input data before it reaches the inference engine."""
    MIN_CUMULATIVE_COUNTS = dict(cases=20, deaths=10)
    MIN_INCIDENT_COUNTS = dict(cases=5, deaths=5)
    dates = df.index

    # Apply Business Logic To Filter Raw Data
    for column in ["cases", "deaths"]:
        requirements = [  # All Must Be True
            df[column].count() > InferRtConstants.MIN_TIMESERIES_LENGTH,
            df[column].sum() > MIN_CUMULATIVE_COUNTS[column],
            df[column].max() > MIN_INCIDENT_COUNTS[column],
        ]
        # Now Apply Input Outlier Detection and Smoothing
        filtered = utils.replace_outliers(df[column], log=rt_log.new(column=column))
        # TODO: find a way to indicate which points were filtered in the figure below
        assert len(filtered) == len(df[column])
        smoothed = filtered.rolling(
            InferRtConstants.COUNT_SMOOTHING_WINDOW_SIZE,
            win_type="gaussian",
            min_periods=InferRtConstants.COUNT_SMOOTHING_KERNEL_STD,
            center=True,
        ).mean(std=InferRtConstants.COUNT_SMOOTHING_KERNEL_STD)
        # TODO: Only start once non-zero to maintain backwards compatibility?

        # Check if the Post Smoothed Series Meets the Requirements
        requirements.append(smoothed.max() > MIN_INCIDENT_COUNTS[column])

        # Check include_deaths Flag
        if column == "deaths" and not include_deaths:
            requirements.append(False)
        else:
            requirements.append(True)

        if all(requirements):
            if column == "cases":
                fig = plt.figure(figsize=(10, 6))
                ax = fig.add_subplot(111)  # plt.axes
                ax.set_yscale("log")
                chart_min = max(0.1, smoothed.min())
                ax.set_ylim((chart_min, df[column].max()))
                plt.scatter(
                    dates[-len(df[column]) :],
                    df[column],
                    alpha=0.3,
                    label=f"Smoothing of: {column}",
                )
                plt.plot(dates[-len(df[column]) :], smoothed)
                plt.grid(True, which="both")
                plt.xticks(rotation=30)
                plt.xlim(min(dates[-len(df[column]) :]), max(dates) + timedelta(days=2))
                if not figure_collector:
                    plot_path = pyseir.utils.get_run_artifact_path(
                        region, RunArtifact.RT_SMOOTHING_REPORT
                    )
                    plt.savefig(plot_path, bbox_inches="tight")
                    plt.close(fig)
                else:
                    figure_collector["1_smoothed_cases"] = fig
            df[column] = smoothed
        else:
            df = df.drop(columns=column, inplace=False)
            log.info("Dropping:", columns=column, requirements=requirements)
    return df
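

# Illustrative call sketch (an editor's addition, not part of the original module):
# it assumes df is a date-indexed frame with raw "cases" and "deaths" columns for one
# region. Passing a non-empty dict as figure_collector makes the function store the
# smoothing figure in it instead of writing the RT_SMOOTHING_REPORT artifact to disk.
def _example_filter_and_smooth(df: pd.DataFrame, region: pipeline.Region) -> pd.DataFrame:
    figures: dict = {"placeholder": None}  # non-empty so figures are collected, not saved
    smoothed_df = filter_and_smooth_input_data(
        df,
        region,
        include_deaths=False,  # deaths column is dropped unless explicitly included
        figure_collector=figures,
        log=structlog.get_logger(),
    )
    return smoothed_df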