    def from_region_and_model_output(
        region: pipeline.Region,
        combined_data: MultiRegionTimeseriesDataset,
        rt_data: MultiRegionTimeseriesDataset,
        icu_data: MultiRegionTimeseriesDataset,
    ) -> "RegionalInput":
        one_region_data = combined_data.get_one_region(region)

        # Not all regions have Rt or ICU data due to various filters in pyseir code.
        try:
            rt_data = rt_data.get_one_region(region)
        except timeseries.RegionLatestNotFound:
            rt_data = None

        try:
            icu_data = icu_data.get_one_region(region)
        except timeseries.RegionLatestNotFound:
            icu_data = None

        return RegionalInput(
            region=region,
            _combined_data=one_region_data,
            rt_data=rt_data,
            icu_data=icu_data,
        )
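For orientation, a minimal sketch of how this constructor could be driven, reusing only calls that appear elsewhere in these examples. The FIPS code and `model_output_dir` are illustrative, the Rt/ICU CSVs are assumed to have been written by the pyseir pipeline, and `pipeline.Region` is assumed to be the same `Region` class used in the tests below:

region = pipeline.Region.from_fips("16001")  # illustrative FIPS
combined_data = combined_datasets.load_us_timeseries_dataset()
rt_data = MultiRegionTimeseriesDataset.from_csv(model_output_dir / SummaryArtifact.RT_METRIC_COMBINED.value)
icu_data = MultiRegionTimeseriesDataset.from_csv(model_output_dir / SummaryArtifact.ICU_METRIC_COMBINED.value)
regional_input = RegionalInput.from_region_and_model_output(region, combined_data, rt_data, icu_data)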
Example #2
    def from_region_and_intervention(
        region: pipeline.Region,
        intervention: Intervention,
        rt_data: MultiRegionTimeseriesDataset,
        icu_data: MultiRegionTimeseriesDataset,
    ) -> "RegionalInput":
        combined_data = combined_datasets.RegionalData.from_region(region)

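        # Not all regions have Rt or ICU data due to various filters in pyseir code.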
        try:
            rt_data = rt_data.get_one_region(region)
        except timeseries.RegionLatestNotFound:
            rt_data = None

        try:
            icu_data = icu_data.get_one_region(region)
        except timeseries.RegionLatestNotFound:
            icu_data = None

        return RegionalInput(
            region=region,
            model_output=None,
            intervention=intervention,
            _combined_data=combined_data,
            rt_data=rt_data,
            icu_data=icu_data,
        )
Example #3
def generate_api_v2(model_output_dir, output, aggregation_level, state, fips):
    """The entry function for invocation"""

    # Caching load of us timeseries dataset
    combined_datasets.load_us_timeseries_dataset()

    active_states = [state.abbr for state in us.STATES]
    active_states = active_states + ["PR", "MP"]

    # Load all API Regions
    regions = combined_datasets.get_subset_regions(
        aggregation_level=aggregation_level,
        exclude_county_999=True,
        state=state,
        fips=fips,
        states=active_states,
    )
    _logger.info(f"Loading all regional inputs.")

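    # Index the Rt and ICU datasets by region so the per-region lookups below can use
    # dict.get(), which simply yields None for regions that have no Rt or ICU data.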
    icu_data_path = model_output_dir / SummaryArtifact.ICU_METRIC_COMBINED.value
    icu_data = MultiRegionTimeseriesDataset.from_csv(icu_data_path)
    icu_data_map = dict(icu_data.iter_one_regions())

    rt_data_path = model_output_dir / SummaryArtifact.RT_METRIC_COMBINED.value
    rt_data = MultiRegionTimeseriesDataset.from_csv(rt_data_path)
    rt_data_map = dict(rt_data.iter_one_regions())

    regions_data = combined_datasets.load_us_timeseries_dataset().get_regions_subset(regions)

    regional_inputs = [
        api_v2_pipeline.RegionalInput.from_one_regions(
            region,
            regional_data,
            icu_data=icu_data_map.get(region),
            rt_data=rt_data_map.get(region),
        ) for region, regional_data in regions_data.iter_one_regions()
    ]

    _logger.info(f"Finished loading all regional inputs.")

    # Build all region timeseries API Output objects.
    _logger.info("Generating all API Timeseries")
    all_timeseries = api_v2_pipeline.run_on_regions(regional_inputs)

    api_v2_pipeline.deploy_single_level(all_timeseries,
                                        AggregationLevel.COUNTY, output)
    api_v2_pipeline.deploy_single_level(all_timeseries, AggregationLevel.STATE,
                                        output)

    _logger.info("Finished API generation.")
Example #4
def generate_api(input_dir, output, summary_output, aggregation_level, state,
                 fips):
    """The entry function for invocation"""

    # Caching load of us timeseries dataset
    combined_datasets.load_us_timeseries_dataset()

    active_states = [state.abbr for state in us.STATES]
    active_states = active_states + ["PR", "MP"]
    regions = combined_datasets.get_subset_regions(
        aggregation_level=aggregation_level,
        exclude_county_999=True,
        state=state,
        fips=fips,
        states=active_states,
    )

    icu_data_path = input_dir / SummaryArtifact.ICU_METRIC_COMBINED.value
    icu_data = MultiRegionTimeseriesDataset.from_csv(icu_data_path)
    rt_data_path = input_dir / SummaryArtifact.RT_METRIC_COMBINED.value
    rt_data = MultiRegionTimeseriesDataset.from_csv(rt_data_path)

    for intervention in list(Intervention):
        _logger.info(f"Running intervention {intervention.name}")

        _load_input = functools.partial(
            api_pipeline.RegionalInput.from_region_and_intervention,
            intervention=intervention,
            rt_data=rt_data,
            icu_data=icu_data,
        )
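        # Load each region's input in a worker process; maxtasksperchild=1 recycles workers
        # after every task (presumably to keep per-process memory bounded).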
        with multiprocessing.Pool(maxtasksperchild=1) as pool:
            regional_inputs = pool.map(_load_input, regions)

        _logger.info(f"Loaded {len(regional_inputs)} regions.")
        all_timeseries = api_pipeline.run_on_all_regional_inputs_for_intervention(
            regional_inputs)
        county_timeseries = [
            output for output in all_timeseries
            if output.aggregate_level is AggregationLevel.COUNTY
        ]
        api_pipeline.deploy_single_level(intervention, county_timeseries,
                                         summary_output, output)
        state_timeseries = [
            output for output in all_timeseries
            if output.aggregate_level is AggregationLevel.STATE
        ]
        api_pipeline.deploy_single_level(intervention, state_timeseries,
                                         summary_output, output)
Example #5
def test_pyseir_end_to_end_idaho(tmp_path):
    # This covers a lot of edge cases.
    with unittest.mock.patch("pyseir.utils.OUTPUT_DIR", str(tmp_path)):
        fips = "16001"
        region = Region.from_fips(fips)
        pipelines = cli._build_all_for_states(states=["ID"], fips=fips)
        cli._write_pipeline_output(pipelines, tmp_path)

        icu_data_path = tmp_path / SummaryArtifact.ICU_METRIC_COMBINED.value
        icu_data = MultiRegionTimeseriesDataset.from_csv(icu_data_path)
        assert icu_data.get_one_region(region)

        rt_data_path = tmp_path / SummaryArtifact.RT_METRIC_COMBINED.value
        rt_data = MultiRegionTimeseriesDataset.from_csv(rt_data_path)
        assert rt_data.get_one_region(region)
Example #6
def _fips_csv_to_one_region(csv_str: str,
                            region: Region) -> OneRegionTimeseriesDataset:
    # Make a Timeseries first because it can have a FIPS column without location_id
    ts = TimeseriesDataset.load_csv(io.StringIO(csv_str))
    # from_timeseries_and_latest adds the location_id column needed by get_one_region
    return MultiRegionTimeseriesDataset.from_timeseries_and_latest(
        ts, ts.latest_values_object()).get_one_region(region)
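A hypothetical call to this helper, with a CSV shaped like the ones used in the aggregation tests below (column names and values are illustrative):

one_region = _fips_csv_to_one_region(
    "fips,state,aggregate_level,county,m1,date\n"
    "97111,ZZ,county,Bar County,1,2020-04-01\n"
    "97111,ZZ,county,Bar County,2,2020-04-02\n",
    Region.from_fips("97111"),
)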
Example #7
def test_inference_ok_with_5_days_cases_changed():
    # 5 days with cases data isn't enough to make inference_ok; 6 days are
    # needed so that there are 5 days with a *delta* relative to a previous day.
    csv_string_io = io.StringIO(
        "location_id,country,state,county,aggregate_level,date,cases,deaths\n"
        "iso1:us#fips:97111,US,ZZ,Bar County,county,2020-04-01,100,1\n"
        "iso1:us#fips:97111,US,ZZ,Bar County,county,2020-04-02,200,2\n"
        "iso1:us#fips:97111,US,ZZ,Bar County,county,2020-04-03,300,3\n"
        "iso1:us#fips:97111,US,ZZ,Bar County,county,2020-04-04,400,4\n"
        "iso1:us#fips:97111,US,ZZ,Bar County,county,2020-04-05,500,5\n"
        "iso1:us#fips:97222,US,ZZ,Foo County,county,2020-04-01,100,1\n"
        "iso1:us#fips:97222,US,ZZ,Foo County,county,2020-04-02,200,2\n"
        "iso1:us#fips:97222,US,ZZ,Foo County,county,2020-04-03,300,3\n"
        "iso1:us#fips:97222,US,ZZ,Foo County,county,2020-04-04,400,4\n"
        "iso1:us#fips:97222,US,ZZ,Foo County,county,2020-04-05,500,5\n"
        "iso1:us#fips:97222,US,ZZ,Foo County,county,2020-04-06,600,6\n"
        "iso1:us#fips:97111,US,ZZ,Bar County,county,,500,5\n"
        "iso1:us#fips:97222,US,ZZ,Foo County,county,,100,1\n")
    input_dataset = MultiRegionTimeseriesDataset.from_csv(csv_string_io)

    df = WhitelistGenerator().generate_whitelist(input_dataset)

    assert to_dict(["fips"], df) == {
        "97111": {
            "state": "ZZ",
            "county": "Bar County",
            "inference_ok": False
        },
        "97222": {
            "state": "ZZ",
            "county": "Foo County",
            "inference_ok": True
        },
    }
Example #8
def test_update_and_load(tmp_path: pathlib.Path, nyc_fips, nyc_region):
    us_combined_df = combined_datasets.load_us_timeseries_dataset().combined_df

    # restricting the datasets being persisted to one county to speed up tests a bit.
    nyc_combined_df = us_combined_df.loc[us_combined_df[CommonFields.FIPS] ==
                                         nyc_fips, :]
    multiregion_timeseries_nyc = MultiRegionTimeseriesDataset.from_combined_dataframe(
        nyc_combined_df)
    latest_nyc = LatestValuesDataset(
        multiregion_timeseries_nyc.latest_data_with_fips.reset_index())
    latest_nyc_record = latest_nyc.get_record_for_fips(nyc_fips)
    assert latest_nyc_record[CommonFields.POPULATION] > 1_000_000
    assert latest_nyc_record[CommonFields.LOCATION_ID]

    combined_dataset_utils.update_data_public_head(
        tmp_path,
        latest_dataset=latest_nyc,
        timeseries_dataset=multiregion_timeseries_nyc,
    )

    timeseries_loaded = combined_datasets.load_us_timeseries_dataset(
        pointer_directory=tmp_path)
    latest_loaded = combined_datasets.load_us_latest_dataset(
        pointer_directory=tmp_path)
    assert latest_loaded.get_record_for_fips(nyc_fips) == latest_nyc_record
    assert_combined_like(timeseries_loaded, multiregion_timeseries_nyc)
Example #9
def test_aggregate():
    ts = TimeseriesDataset.load_csv(
        io.StringIO("fips,state,aggregate_level,county,m1,date,foo\n"
                    "55005,ZZ,county,North County,1,2020-05-01,ab\n"
                    "55005,ZZ,county,North County,2,2020-05-02,cd\n"
                    "55005,ZZ,county,North County,3,2020-05-03,ef\n"
                    "55006,ZZ,county,South County,3,2020-05-03,ef\n"
                    "55006,ZZ,county,South County,4,2020-05-04,gh\n"
                    "55,ZZ,state,Grand State,41,2020-05-01,ij\n"
                    "55,ZZ,state,Grand State,43,2020-05-03,kl\n"))
    ts_in = MultiRegionTimeseriesDataset.from_timeseries_and_latest(
        ts, ts.latest_values_object())
    agg = statistical_areas.CountyToCBSAAggregator(
        county_map={
            "55005": "10001",
            "55006": "10001"
        },
        cbsa_title_map={"10001": "Stat Area 1"})
    ts_out = agg.aggregate(ts_in)

    assert ts_out.groupby_region().ngroups == 1

    ts_cbsa = ts_out.get_one_region(Region.from_cbsa_code("10001"))
    assert ts_cbsa.date_indexed["m1"].to_dict() == {
        pd.to_datetime("2020-05-01"): 1,
        pd.to_datetime("2020-05-02"): 2,
        pd.to_datetime("2020-05-03"): 6,
        pd.to_datetime("2020-05-04"): 4,
    }
Example #10
    def generate_whitelist(
            self, timeseries: MultiRegionTimeseriesDataset) -> pd.DataFrame:
        """
        Generate a county whitelist based on the cuts above.

        Returns
        -------
        df: pd.DataFrame
            One row per county with fips, state, county and an `inference_ok` flag.
        """
        logging.info("Generating county level whitelist...")

        df_candidates = (
            timeseries.get_counties().groupby_region()
            # Use pandarallel. It doesn't support the `name` attribute so leave FIPS as a regular
            # column so it can be read in the applied function.
            .parallel_apply(_whitelist_candidates_per_fips).reset_index(
                drop=True))

        df_candidates["inference_ok"] = (
            (df_candidates.nonzero_case_datapoints >=
             self.nonzero_case_datapoints)
            & (df_candidates.nonzero_death_datapoints >=
               self.nonzero_death_datapoints)
            & (df_candidates.total_cases >= self.total_cases)
            & (df_candidates.total_deaths >= self.total_deaths))
        whitelist_df = df_candidates[[
            "fips", "state", "county", "inference_ok"
        ]]

        return whitelist_df
Example #11
def _write_pipeline_output(
    pipelines: List[Union[SubStatePipeline, StatePipeline]],
    output_dir: str,
    output_interval_days: int = 4,
    write_webui_output: bool = False,
):
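    # Combine the per-region pipeline results into the Rt and ICU CSV artifacts that the
    # API generation entry points above load via the SummaryArtifact paths.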

    infection_rate_metric_df = pd.concat((p.infer_df for p in pipelines),
                                         ignore_index=True)
    # TODO: Use constructors in MultiRegionTimeseriesDataset
    timeseries_dataset = TimeseriesDataset(infection_rate_metric_df)
    latest = timeseries_dataset.latest_values_object()
    multiregion_rt = MultiRegionTimeseriesDataset.from_timeseries_and_latest(
        timeseries_dataset, latest)
    output_path = pathlib.Path(
        output_dir) / pyseir.utils.SummaryArtifact.RT_METRIC_COMBINED.value
    multiregion_rt.to_csv(output_path)
    root.info(f"Saving Rt results to {output_path}")

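    # Not every pipeline produced ICU data; only concatenate the ones that did.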
    icu_df = pd.concat((p.icu_data.data for p in pipelines if p.icu_data),
                       ignore_index=True)
    timeseries_dataset = TimeseriesDataset(icu_df)
    latest = timeseries_dataset.latest_values_object().data.set_index(
        CommonFields.LOCATION_ID)
    multiregion_icu = MultiRegionTimeseriesDataset(icu_df, latest)

    output_path = pathlib.Path(
        output_dir) / pyseir.utils.SummaryArtifact.ICU_METRIC_COMBINED.value
    multiregion_icu.to_csv(output_path)
    root.info(f"Saving ICU results to {output_path}")

    if write_webui_output:
        # does not parallelize well, because web_ui mapper doesn't serialize efficiently
        # TODO: Remove intermediate artifacts and parallelize artifacts creation better
        # Approximately 40% of the processing time is taken on this step
        web_ui_mapper = WebUIDataAdaptorV1(
            output_interval_days=output_interval_days,
            output_dir=output_dir,
        )
        webui_inputs = [
            webui_data_adaptor_v1.RegionalInput.from_results(
                p.fitter, p.ensemble, p.infer_df) for p in pipelines
            if p.fitter
        ]

        with Pool(maxtasksperchild=1) as p:
            p.map(web_ui_mapper.write_region_safely, webui_inputs)
Example #12
def update(summary_filename, wide_dates_filename, aggregate_to_msas: bool):
    """Updates latest and timeseries datasets to the current checked out covid data public commit"""
    path_prefix = dataset_utils.DATA_DIRECTORY.relative_to(
        dataset_utils.REPO_ROOT)

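    # Collect every DataSource class referenced by the field definitions, instantiating each
    # class exactly once even if it feeds both the latest and timeseries datasets.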
    data_source_classes = set(
        chain(
            chain.from_iterable(ALL_FIELDS_FEATURE_DEFINITION.values()),
            chain.from_iterable(ALL_TIMESERIES_FEATURE_DEFINITION.values()),
        ))
    data_sources = {
        data_source_cls.SOURCE_NAME: data_source_cls.local()
        for data_source_cls in data_source_classes
    }
    timeseries_dataset: TimeseriesDataset = combined_datasets.build_from_sources(
        TimeseriesDataset,
        data_sources,
        ALL_TIMESERIES_FEATURE_DEFINITION,
        filter=US_STATES_FILTER)
    latest_dataset: LatestValuesDataset = combined_datasets.build_from_sources(
        LatestValuesDataset,
        data_sources,
        ALL_FIELDS_FEATURE_DEFINITION,
        filter=US_STATES_FILTER,
    )
    multiregion_dataset = MultiRegionTimeseriesDataset.from_timeseries_and_latest(
        timeseries_dataset, latest_dataset)
    multiregion_dataset = add_new_cases(multiregion_dataset)
    if aggregate_to_msas:
        aggregator = statistical_areas.CountyToCBSAAggregator.from_local_public_data()
        multiregion_dataset = multiregion_dataset.append_regions(
            aggregator.aggregate(multiregion_dataset))

    _, multiregion_pointer = combined_dataset_utils.update_data_public_head(
        path_prefix,
        latest_dataset,
        multiregion_dataset,
    )

    # Write DataSource objects that have provenance information, which is only set when significant
    # processing of the source data is done in this repo before it is combined. The output is not
    # used downstream; it is for debugging only.
    for data_source in data_sources.values():
        if data_source.provenance is not None:
            wide_dates_df.write_csv(
                data_source.timeseries().get_date_columns(),
                path_prefix / f"{data_source.SOURCE_NAME}-wide-dates.csv",
            )

    if wide_dates_filename:
        wide_dates_df.write_csv(
            timeseries_dataset.get_date_columns(),
            multiregion_pointer.path.with_name(wide_dates_filename),
        )

    if summary_filename:
        _save_field_summary(multiregion_dataset,
                            path_prefix / summary_filename)
Example #13
    def aggregate(self, dataset_in: MultiRegionTimeseriesDataset) -> MultiRegionTimeseriesDataset:
        """Returns a dataset of CBSA regions, created by aggregating counties in the input data."""
        return MultiRegionTimeseriesDataset.from_combined_dataframe(
            pd.concat(
                [
                    self._aggregate_fips_df(dataset_in.data_with_fips, groupby_date=True),
                    # No need to reset latest_data_with_fips LOCATION_ID index because FIPS is used.
                    self._aggregate_fips_df(dataset_in.latest_data_with_fips, groupby_date=False),
                ],
                ignore_index=True,
            )
        )
Example #14
def test_load_from_local_public_data():
    agg = statistical_areas.CountyToCBSAAggregator.from_local_public_data()

    assert agg.cbsa_title_map["43580"] == "Sioux City, IA-NE-SD"
    assert agg.county_map["48187"] == "41700"

    ts = TimeseriesDataset.load_csv(
        io.StringIO("fips,state,aggregate_level,county,m1,date,foo\n"
                    "48059,ZZ,county,North County,3,2020-05-03,ef\n"
                    "48253,ZZ,county,South County,4,2020-05-03,ef\n"))
    ts_in = MultiRegionTimeseriesDataset.from_timeseries_and_latest(
        ts, ts.latest_values_object())
    ts_out = agg.aggregate(ts_in)
    ts_cbsa = ts_out.get_one_region(Region.from_cbsa_code("10180"))
    assert ts_cbsa.date_indexed["m1"].to_dict() == {
        pd.to_datetime("2020-05-03"): 7,
    }
Example #15
def test_skip_gaps_in_cases_and_deaths_metrics():
    csv_string_io = io.StringIO(
        "location_id,country,state,county,aggregate_level,date,cases,deaths\n"
        "iso1:us#fips:97111,US,ZZ,Bar County,county,2020-04-01,10,1\n"
        "iso1:us#fips:97111,US,ZZ,Bar County,county,2020-04-02,,2\n"
        "iso1:us#fips:97111,US,ZZ,Bar County,county,2020-04-03,30,\n"
        "iso1:us#fips:97111,US,ZZ,Bar County,county,2020-04-04,40,4\n"
        "iso1:us#fips:97111,US,ZZ,Bar County,county,,40,4\n")
    input_dataset = MultiRegionTimeseriesDataset.from_csv(csv_string_io)

    df = WhitelistGenerator().generate_whitelist(input_dataset)

    assert to_dict(["fips"], df) == {
        "97111": {
            "state": "ZZ",
            "county": "Bar County",
            "inference_ok": False
        },
    }
Example #16
    def run(
        metrics_in: MultiRegionTimeseriesDataset,
        methods: Sequence[Method] = TEST_POSITIVITY_METHODS,
        diff_days: int = 7,
        recent_days: int = 14,
    ) -> "AllMethods":
        ts_value_cols = list(
            set(
                chain.from_iterable((method.numerator, method.denominator)
                                    for method in methods)))
        missing_columns = set(ts_value_cols) - set(metrics_in.data.columns)
        if missing_columns:
            raise AssertionError(
                f"Data missing for test positivity: {missing_columns}")

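        # Reshape to a long Series of values indexed by (variable, location_id, date).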
        input_long = metrics_in.timeseries_long(ts_value_cols).set_index(
            [PdFields.VARIABLE, CommonFields.LOCATION_ID,
             CommonFields.DATE])[PdFields.VALUE]
        dates = input_long.index.get_level_values(CommonFields.DATE)
        start_date = dates.min()
        end_date = dates.max()
        input_date_range = pd.date_range(start=start_date, end=end_date)
        recent_date_range = pd.date_range(
            end=end_date, periods=recent_days).intersection(input_date_range)
        input_wide = (input_long.unstack(CommonFields.DATE).reindex(
            columns=input_date_range).rename_axis(columns=CommonFields.DATE))
        # This calculates the difference only when the cumulative values `diff_days` apart are both real values.
        # It looks like our input data has few or no holes so this works well enough.
        diff_df = input_wide.diff(periods=diff_days, axis=1)

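        # Apply every method to the differenced data and stack the results into one wide
        # frame keyed by (location_id, method name).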
        all_wide = (
            pd.concat(
                {method.name: method.calculate(diff_df)
                 for method in methods},
                names=[PdFields.VARIABLE],
            ).reorder_levels([CommonFields.LOCATION_ID, PdFields.VARIABLE])
            # Drop empty timeseries
            .dropna("index", "all").sort_index())

        method_cat_type = pd.CategoricalDtype(
            categories=[method.name for method in methods], ordered=True)

        has_recent_data = all_wide.loc[:,
                                       recent_date_range].notna().any(axis=1)
        all_recent_data = all_wide.loc[has_recent_data, :].reset_index()
        all_recent_data[PdFields.VARIABLE] = all_recent_data[
            PdFields.VARIABLE].astype(method_cat_type)
        first = all_recent_data.groupby(CommonFields.LOCATION_ID).first()
        provenance = first[PdFields.VARIABLE].astype(str).rename(
            PdFields.PROVENANCE)
        provenance.index = pd.MultiIndex.from_product(
            [provenance.index, [CommonFields.TEST_POSITIVITY]],
            names=[CommonFields.LOCATION_ID, PdFields.VARIABLE],
        )
        positivity = first.drop(columns=[PdFields.VARIABLE])

        test_positivity = MultiRegionTimeseriesDataset.from_timeseries_df(
            positivity.stack().rename(
                CommonFields.TEST_POSITIVITY).reset_index(),
            provenance=provenance,
        )

        return AllMethods(all_methods_timeseries=all_wide,
                          test_positivity=test_positivity)
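The cumulative-to-delta step above leans on pandas' column-wise diff; a self-contained sketch of the same idea on toy data (all names and values here are illustrative, not part of the codebase):

import pandas as pd

# One row per location, one column per date, cumulative counts.
dates = pd.date_range("2020-04-01", periods=10)
wide = pd.DataFrame([list(range(0, 100, 10))], index=["iso1:us#fips:97111"], columns=dates)

# Value minus the value 7 days earlier; NaN wherever either endpoint is missing.
weekly_delta = wide.diff(periods=7, axis=1)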