Example #1
def get_timeseries_for_state(
        state: str,
        columns: Optional[List[str]] = None,
        min_range_with_some_value: bool = False) -> TimeseriesDataset:
    """Gets timeseries for a specific state abbreviation.

    Args:
        state: 2-letter state code
        columns: List of columns, apart from `TimeseriesDataset.INDEX_FIELDS`, to include.
        min_range_with_some_value: If True, removes NaNs that pad values at beginning and end of
            timeseries. Only applicable when columns are specified.

    Returns: Timeseries for state
    """

    state_ts = load_us_timeseries_dataset().get_subset(AggregationLevel.STATE,
                                                       state=state)
    if columns:
        subset = state_ts.data.loc[:, TimeseriesDataset.INDEX_FIELDS +
                                   columns].reset_index(drop=True)

        if min_range_with_some_value:
            subset = _remove_padded_nans(subset, columns)

        state_ts = TimeseriesDataset(subset)

    return state_ts
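A minimal usage sketch; the column constants are assumed to come from the surrounding codebase (`CommonFields.DEATHS` is not shown in these snippets) and "MA" is an arbitrary state:

# Hypothetical call: Massachusetts timeseries restricted to two columns,
# with the all-NaN rows padding the start and end trimmed away.
ma_ts = get_timeseries_for_state(
    "MA",
    columns=[CommonFields.CASES, CommonFields.DEATHS],
    min_range_with_some_value=True,
)
print(ma_ts.data.head())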
Example #2
def get_timeseries_for_fips(
        fips: str,
        columns: Optional[List[str]] = None,
        min_range_with_some_value: bool = False) -> TimeseriesDataset:
    """Gets timeseries for a specific FIPS code.

    Args:
        fips: FIPS code.  Can be county (5 character) or state (2 character) code.
        columns: List of columns, apart from `TimeseriesDataset.INDEX_FIELDS`, to include.
        min_range_with_some_value: If True, removes NaNs that pad values at beginning and end of
            timeseries. Only applicable when columns are specified.

    Returns: Timeseries for fips
    """

    fips_ts = load_us_timeseries_dataset().get_subset(None, fips=fips)
    if columns:
        subset = fips_ts.data.loc[:, TimeseriesDataset.INDEX_FIELDS +
                                  columns].reset_index(drop=True)

        if min_range_with_some_value:
            subset = _remove_padded_nans(subset, columns)

        fips_ts = TimeseriesDataset(subset)

    return fips_ts
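The `_remove_padded_nans` helper is referenced above but not shown; a plausible sketch of its behavior (an assumption, not the project's actual implementation):

def _remove_padded_nans(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
    # Keep rows from the first through the last row that has at least one
    # non-NaN value in `columns`; interior gaps are preserved. Assumes a
    # default RangeIndex, as produced by reset_index(drop=True) above.
    has_value = df[columns].notna().any(axis=1)
    first, last = has_value.idxmax(), has_value[::-1].idxmax()
    return df.loc[first:last].reset_index(drop=True)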
Example #3
def _write_pipeline_output(
    pipelines: List[Union[SubStatePipeline, StatePipeline]],
    output_dir: str,
    output_interval_days: int = 4,
    write_webui_output: bool = False,
):
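    """Writes the combined Rt and ICU metric artifacts for `pipelines` to
    `output_dir`, optionally also writing the web UI output."""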

    infection_rate_metric_df = pd.concat((p.infer_df for p in pipelines),
                                         ignore_index=True)
    # TODO: Use constructors in MultiRegionTimeseriesDataset
    timeseries_dataset = TimeseriesDataset(infection_rate_metric_df)
    latest = timeseries_dataset.latest_values_object()
    multiregion_rt = MultiRegionTimeseriesDataset.from_timeseries_and_latest(
        timeseries_dataset, latest)
    output_path = pathlib.Path(
        output_dir) / pyseir.utils.SummaryArtifact.RT_METRIC_COMBINED.value
    multiregion_rt.to_csv(output_path)
    root.info(f"Saving Rt results to {output_path}")

    icu_df = pd.concat((p.icu_data.data for p in pipelines if p.icu_data),
                       ignore_index=True)
    timeseries_dataset = TimeseriesDataset(icu_df)
    latest = timeseries_dataset.latest_values_object().data.set_index(
        CommonFields.LOCATION_ID)
    multiregion_icu = MultiRegionTimeseriesDataset(icu_df, latest)

    output_path = pathlib.Path(
        output_dir) / pyseir.utils.SummaryArtifact.ICU_METRIC_COMBINED.value
    multiregion_icu.to_csv(output_path)
    root.info(f"Saving ICU results to {output_path}")

    if write_webui_output:
        # does not parallelize well, because web_ui mapper doesn't serialize efficiently
        # TODO: Remove intermediate artifacts and parallelize artifact creation better
        # Approximately 40% of the processing time is taken on this step
        web_ui_mapper = WebUIDataAdaptorV1(
            output_interval_days=output_interval_days,
            output_dir=output_dir,
        )
        webui_inputs = [
            webui_data_adaptor_v1.RegionalInput.from_results(
                p.fitter, p.ensemble, p.infer_df) for p in pipelines
            if p.fitter
        ]

        with Pool(maxtasksperchild=1) as p:
            p.map(web_ui_mapper.write_region_safely, webui_inputs)
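A hedged invocation sketch; the pipeline objects would come from the upstream pyseir model runs (hypothetical variable and output path):

# `pipelines` is assumed to be built by earlier state/sub-state stages.
_write_pipeline_output(
    pipelines=pipelines,
    output_dir="output/pyseir",
    output_interval_days=4,
    write_webui_output=False,
)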
Example #4
def load_combined_timeseries(
        sources: Dict[str, TimeseriesDataset],
        timeseries: TimeseriesDataset) -> TimeseriesDataset:
    timeseries_data = timeseries.data.copy()
    timeseries_data["source"] = "Combined Data"

    combined_timeseries = TimeseriesDataset(
        pd.concat([timeseries_data] +
                  [source.data for source in sources.values()]))
    return combined_timeseries
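Usage sketch; the source datasets and their labels are hypothetical:

# Hypothetical inputs: each value is a TimeseriesDataset from one upstream
# source; only the rows from `merged_dataset` get source == "Combined Data".
combined = load_combined_timeseries(
    sources={"JHU": jhu_dataset, "CDS": cds_dataset},
    timeseries=merged_dataset,
)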
Example #5
def get_hospitalization_data():
    data = combined_datasets.build_us_timeseries_with_all_fields().data
    # Since this data is used for hospitalizations only, return only rows that
    # have hospitalization values. As the use cases of this data source expand,
    # we may not want to drop rows. For context, as of 4/8, 607 of 1821 rows
    # contained hospitalization data.
    has_current_hospital = data[
        TimeseriesDataset.Fields.CURRENT_HOSPITALIZED].notnull()
    has_cumulative_hospital = data[
        TimeseriesDataset.Fields.CUMULATIVE_HOSPITALIZED].notnull()
    return TimeseriesDataset(data[has_current_hospital
                                  | has_cumulative_hospital])
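Usage sketch:

hosp = get_hospitalization_data()
# Only rows with a current or cumulative hospitalization value remain.
print(len(hosp.data))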
Example #6
def test_make_latest_from_timeseries_simple():
    data = read_csv_and_index_fips_date(
        "fips,county,state,country,date,aggregate_level,m1,m2\n"
        "97123,Smith County,ZZ,USA,2020-04-01,county,1,\n"
        "97123,Smith County,ZZ,USA,2020-04-02,county,,2\n").reset_index()
    ts = TimeseriesDataset(data)
    assert to_dict(["fips"],
                   ts.latest_values()[["fips", "m1", "m2"]]) == {
                       "97123": {
                           "m1": 1,
                           "m2": 2
                       }
                   }
Example #7
def test_persist_and_load_dataset(tmp_path, nyc_fips):
    region = Region.from_fips(nyc_fips)
    dataset = combined_datasets.load_us_timeseries_dataset()
    timeseries_nyc = TimeseriesDataset(dataset.get_one_region(region).data)

    pointer = combined_dataset_utils.persist_dataset(timeseries_nyc, tmp_path)

    downloaded_dataset = pointer.load_dataset()
    differ_l = DatasetDiff.make(downloaded_dataset.data)
    differ_r = DatasetDiff.make(timeseries_nyc.data)
    differ_l.compare(differ_r)

    assert not len(differ_l.my_ts)
Example #8
def test_get_subset():
    # CSV with a unique FIPS value for every region, even countries. In production countries are removed before
    # TimeseriesDataset is created. A future change may replace FIPS with a more general identifier.
    input_df = pd.read_csv(
        StringIO(
            "city,county,state,fips,country,aggregate_level,date,metric\n"
            "Smithville,,ZZ,97123,USA,city,2020-03-23,smithville-march23\n"
            "New York City,,ZZ,97324,USA,city,2020-03-22,march22-nyc\n"
            "New York City,,ZZ,97324,USA,city,2020-03-24,march24-nyc\n"
            ",North County,ZZ,97001,USA,county,2020-03-23,county-metric\n"
            ",,ZZ,97,USA,state,2020-03-23,mystate\n"
            ",,XY,96,USA,state,2020-03-23,other-state\n"
            ",,,iso2:uk,UK,country,2020-03-23,you-kee\n"
            ",,,iso2:us,US,country,2020-03-23,you-ess-hey\n"))
    ts = TimeseriesDataset(input_df)

    assert set(ts.get_subset(AggregationLevel.COUNTRY).data["metric"]) == {
        "you-kee", "you-ess-hey"
    }
    assert set(
        ts.get_subset(AggregationLevel.COUNTRY,
                      country="UK").data["country"]) == {"UK"}
    assert set(ts.get_subset(
        AggregationLevel.STATE).data["metric"]) == {"mystate", "other-state"}
    assert set(ts.get_subset(
        state="ZZ", after="2020-03-23").data["metric"]) == {"march24-nyc"}
    assert set(ts.get_subset(state="ZZ",
                             after="2020-03-22").data["metric"]) == {
                                 "smithville-march23",
                                 "county-metric",
                                 "mystate",
                                 "march24-nyc",
                             }
    assert set(
        ts.get_subset(AggregationLevel.STATE,
                      states=["ZZ", "XY"]).data["metric"]) == {
                          "mystate",
                          "other-state",
                      }
    assert set(ts.get_subset(states=["ZZ"],
                             on="2020-03-23").data["metric"]) == {
                                 "smithville-march23",
                                 "county-metric",
                                 "mystate",
                             }
    assert set(
        ts.get_subset(states=["ZZ"],
                      before="2020-03-23").data["metric"]) == {"march22-nyc"}
Example #9
def get_hospitalization_data():
    """
    Since we're using this data for hospitalized data only, only returning
    values with hospitalization data.  I think as the use cases of this data source
    expand, we may not want to drop. For context, as of 4/8 607/1821 rows contained
    hospitalization data.
    Returns
    -------
    TimeseriesDataset
    """
    data = combined_datasets.load_us_timeseries_dataset().data
    has_current_hospital = data[CommonFields.CURRENT_HOSPITALIZED].notnull()
    has_cumulative_hospital = data[
        CommonFields.CUMULATIVE_HOSPITALIZED].notnull()
    return TimeseriesDataset(data[has_current_hospital
                                  | has_cumulative_hospital])
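The two masks keep any row with at least one hospitalization field set; a more compact filter with the same semantics (a sketch, not the project's code):

hospital_fields = [CommonFields.CURRENT_HOSPITALIZED, CommonFields.CUMULATIVE_HOSPITALIZED]
# dropna(how="all") drops rows where every listed field is NaN, i.e. the
# complement of `has_current_hospital | has_cumulative_hospital`.
filtered = data.dropna(subset=hospital_fields, how="all")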
Example #10
def test_get_subset_and_get_data():
    input_df = pd.read_csv(
        StringIO(
            "city,county,state,fips,country,aggregate_level,date,metric\n"
            "Smithville,,ZZ,97123,USA,city,2020-03-23,smithville-march23\n"
            "New York City,,ZZ,97324,USA,city,2020-03-22,march22-nyc\n"
            "New York City,,ZZ,97324,USA,city,2020-03-24,march24-nyc\n"
            ",North County,ZZ,97001,USA,county,2020-03-23,county-metric\n"
            ",,ZZ,97001,USA,state,2020-03-23,mystate\n"
            ",,XY,96001,USA,state,2020-03-23,other-state\n"
            ",,,,UK,country,2020-03-23,you-kee\n"
            ",,,,US,country,2020-03-23,you-ess-hey\n"))
    ts = TimeseriesDataset(input_df)

    assert set(ts.get_subset(AggregationLevel.COUNTRY).data["metric"]) == {
        "you-kee", "you-ess-hey"
    }
    assert set(
        ts.get_subset(AggregationLevel.COUNTRY,
                      country="UK").data["country"]) == {"UK"}
    assert set(ts.get_subset(
        AggregationLevel.STATE).data["metric"]) == {"mystate", "other-state"}
    assert set(ts.get_data(None, state="ZZ",
                           after="2020-03-23")["metric"]) == {"march24-nyc"}
    assert set(ts.get_data(None, state="ZZ",
                           after="2020-03-22")["metric"]) == {
                               "smithville-march23",
                               "county-metric",
                               "mystate",
                               "march24-nyc",
                           }
    assert set(
        ts.get_data(AggregationLevel.STATE, states=["ZZ",
                                                    "XY"])["metric"]) == {
                                                        "mystate",
                                                        "other-state",
                                                    }
    assert set(ts.get_data(None, states=["ZZ"],
                           on="2020-03-23")["metric"]) == {
                               "smithville-march23",
                               "county-metric",
                               "mystate",
                           }
    assert set(
        ts.get_data(None, states=["ZZ"],
                    before="2020-03-23")["metric"]) == {"march22-nyc"}
Example #11
def test_write_csv():
    df = pd.DataFrame({
        CommonFields.DATE:
        pd.to_datetime(["2020-04-01", "2020-04-02"]),
        CommonFields.FIPS: ["06045", "45123"],
        CommonFields.CASES: [234, 456],
    })
    ts = TimeseriesDataset(df)

    expected_csv = """,,summary,summary,summary,summary,summary,summary,summary,summary,summary,value,value
date,,has_value,min_date,max_date,max_value,min_value,latest_value,num_observations,largest_delta,largest_delta_date,2020-04-01 00:00:00,2020-04-02 00:00:00
fips,variable,,,,,,,,,,,
06045,cases,True,2020-04-01,2020-04-01,234,234,234,1,,,234,
45123,cases,True,2020-04-02,2020-04-02,456,456,456,1,,,,456
"""
    # Call common_df.write_csv with index set to ["fips", "date"], the expected normal index.
    with temppathlib.NamedTemporaryFile("w+") as tmp:
        wide_dates_df.write_csv(ts.get_date_columns(), tmp.path)
        assert expected_csv == tmp.file.read()
Example #12
def test_wide_dates():
    input_df = read_csv_and_index_fips_date(
        "fips,county,aggregate_level,date,m1,m2\n"
        "97111,Bar County,county,2020-04-01,1,\n"
        "97111,Bar County,county,2020-04-02,2,\n"
        "97222,Foo County,county,2020-04-01,,10\n"
        "97222,Foo County,county,2020-04-03,3,30\n")
    provenance = provenance_wide_metrics_to_series(
        read_csv_and_index_fips_date("fips,date,m1,m2\n"
                                     "97111,2020-04-01,src11,\n"
                                     "97111,2020-04-02,src11,\n"
                                     "97222,2020-04-01,,src22\n"
                                     "97222,2020-04-03,src21,src22\n"),
        structlog.get_logger(),
    )

    ts = TimeseriesDataset(input_df.reset_index(), provenance=provenance)
    date_columns = ts.get_date_columns()
    assert to_dict(["fips", "variable"], date_columns["value"]) == {
        ("97111", "m1"): {
            pd.to_datetime("2020-04-01"): 1.0,
            pd.to_datetime("2020-04-02"): 2.0
        },
        ("97222", "m1"): {
            pd.to_datetime("2020-04-03"): 3.0
        },
        ("97222", "m2"): {
            pd.to_datetime("2020-04-01"): 10.0,
            pd.to_datetime("2020-04-03"): 30.0
        },
    }
    assert to_dict(["fips", "variable"], date_columns["provenance"]) == {
        ("97111", "m1"): {
            "value": "src11"
        },
        ("97222", "m1"): {
            "value": "src21"
        },
        ("97222", "m2"): {
            "value": "src22"
        },
    }
Example #13
def test_make_latest_from_timeseries_dont_touch_county():
    data = read_csv_and_index_fips_date(
        "fips,county,state,country,date,aggregate_level,m1,m2\n"
        "95123,Smith Countyy,YY,USA,2020-04-01,county,1,\n"
        "97123,Smith Countzz,ZZ,USA,2020-04-01,county,2,\n"
        "97,,ZZ,USA,2020-04-01,state,3,\n").reset_index()
    ts = TimeseriesDataset(data)
    assert to_dict(["fips"],
                   ts.latest_values()[["fips", "county", "m1", "m2"]]) == {
                       "95123": {
                           "m1": 1,
                           "county": "Smith Countyy"
                       },
                       "97123": {
                           "m1": 2,
                           "county": "Smith Countzz"
                       },
                       "97": {
                           "m1": 3
                       },
                   }