def test_fips_metadata(nyc_fips):
    """Check the metadata and data type the fitter exposes for the NYC fips."""
    # Build the combined timeseries up front; its return value is not needed here.
    combined_datasets.build_us_timeseries_with_all_fields()

    nyc_fitter = initial_conditions_fitter.InitialConditionsFitter(nyc_fips)

    assert nyc_fitter.state == "NY"
    assert nyc_fitter.county == "New York County"
    expected_start = pd.Timestamp("2020-03-01")
    assert nyc_fitter.data_start_date == expected_start
    # y is expected to be a numpy array, not a pandas DataFrame.
    assert isinstance(nyc_fitter.y, numpy.ndarray)
Exemplo n.º 2
0
def _cache_global_datasets():
    """Warm the combined-dataset caches and lazily load the NYT/CDS module globals.

    The combined builders are called only for their caching effect: populating
    the cache before forking makes it available to subprocesses, so the return
    values are discarded.
    """
    global nyt_dataset, cds_dataset

    combined_datasets.build_us_latest_with_all_fields()
    combined_datasets.build_us_timeseries_with_all_fields()

    # Only load each local dataset the first time through.
    if cds_dataset is None:
        cds_dataset = CDSDataset.local()
    if nyt_dataset is None:
        nyt_dataset = NYTimesDataset.local()
Exemplo n.º 3
0
def get_current_hospitalized(fips, t0, category: HospitalizationCategory):
    """
    Look up the latest hospitalization estimate for a state or county.

    A fips of length 2 is treated as a state; any other length is treated as a
    county fips.

    Parameters
    ----------
    fips: str
        US fips to lookup.
    t0: datetime
        Datetime to offset by.
    category: HospitalizationCategory
        'icu' for just ICU or 'hospitalized' for all ICU + Acute.

    Returns
    -------
    time: float
        Days since t0 for the hospitalization data.
    current estimate: float
        The most recent provided value for the current occupied in the requested category.
    """
    is_state_fips = len(fips) == 2
    if is_state_fips:
        lookup = {
            "aggregation_level": AggregationLevel.STATE,
            "country": "USA",
            "state": us.states.lookup(fips).abbr,
        }
    else:
        lookup = {
            "aggregation_level": AggregationLevel.COUNTY,
            "country": "USA",
            "fips": fips,
        }

    us_timeseries = combined_datasets.build_us_timeseries_with_all_fields()
    location_df = us_timeseries.get_data(**lookup)
    return _get_current_hospitalized(location_df, t0, category)
Exemplo n.º 4
0
def generate_api(input_dir, output, summary_output, aggregation_level, state,
                 fips):
    """The entry function for invocation.

    Subsets the combined latest and timeseries datasets, then for every
    intervention runs the API pipeline and deploys the county-level results
    followed by the state-level results.
    """
    state_abbrevs = [us_state.abbr for us_state in us.STATES]
    subset_filters = dict(state=state, fips=fips, states=state_abbrevs)

    latest_all = combined_datasets.build_us_latest_with_all_fields()
    us_latest = latest_all.get_subset(aggregation_level, **subset_filters)
    timeseries_all = combined_datasets.build_us_timeseries_with_all_fields()
    us_timeseries = timeseries_all.get_subset(aggregation_level,
                                              **subset_filters)

    for intervention in Intervention:
        _logger.info(f"Running intervention {intervention.name}")
        all_timeseries = api_pipeline.run_on_all_fips_for_intervention(
            us_latest, us_timeseries, intervention, input_dir)

        # Counties are deployed first, then states.
        for level in (AggregationLevel.COUNTY, AggregationLevel.STATE):
            level_timeseries = [
                area for area in all_timeseries
                if area.aggregate_level is level
            ]
            api_pipeline.deploy_single_level(intervention, level_timeseries,
                                             summary_output, output)
Exemplo n.º 5
0
def generate_top_counties(disable_validation, input_dir, output, state, fips):
    """The entry function for invocation.

    Runs the selected intervention over the county subset, keeping at most 100
    results ordered by the negated peak hospital bed shortfall, and deploys
    them as "counties_top_100.json".
    """
    intervention = Intervention.SELECTED_INTERVENTION
    state_abbrevs = [us_state.abbr for us_state in us.STATES]
    county_filters = dict(states=state_abbrevs, state=state, fips=fips)

    latest_all = combined_datasets.build_us_latest_with_all_fields()
    us_latest = latest_all.get_subset(AggregationLevel.COUNTY,
                                      **county_filters)
    timeseries_all = combined_datasets.build_us_timeseries_with_all_fields()
    us_timeseries = timeseries_all.get_subset(AggregationLevel.COUNTY,
                                              **county_filters)

    def negated_peak_shortfall(output: CovidActNowAreaTimeseries):
        # Negated so that an ascending sort puts the largest shortfall first.
        return -output.projections.totalHospitalBeds.peakShortfall

    top_timeseries = api_pipeline.run_on_all_fips_for_intervention(
        us_latest,
        us_timeseries,
        intervention,
        input_dir,
        sort_func=negated_peak_shortfall,
        limit=100,
    )
    bulk_timeseries = CovidActNowBulkTimeseries(__root__=top_timeseries)

    api_pipeline.deploy_json_api_output(
        intervention,
        bulk_timeseries,
        output,
        filename_override="counties_top_100.json",
    )
Exemplo n.º 6
0
def test_build_api_output_for_intervention(nyc_fips, nyc_model_output_path,
                                           tmp_path):
    """Deploy the NYC strong-intervention output and check the files written."""
    strong = Intervention.STRONG_INTERVENTION
    county_output = tmp_path / "county"

    us_latest = combined_datasets.build_us_latest_with_all_fields()
    us_timeseries = combined_datasets.build_us_timeseries_with_all_fields()
    nyc_latest = us_latest.get_subset(None, fips=nyc_fips)
    nyc_timeseries = us_timeseries.get_subset(None, fips=nyc_fips)

    all_timeseries_api = api_pipeline.run_on_all_fips_for_intervention(
        nyc_latest, nyc_timeseries, strong, nyc_model_output_path.parent)
    api_pipeline.deploy_single_level(strong, all_timeseries_api, tmp_path,
                                     county_output)

    expected_outputs = [
        "counties.STRONG_INTERVENTION.timeseries.json",
        "counties.STRONG_INTERVENTION.csv",
        "counties.STRONG_INTERVENTION.timeseries.csv",
        "counties.STRONG_INTERVENTION.json",
        "county/36061.STRONG_INTERVENTION.json",
        "county/36061.STRONG_INTERVENTION.timeseries.json",
    ]

    # Collect every regular file under tmp_path, relative to tmp_path.
    written_files = []
    for path in tmp_path.glob("**/*"):
        if path.is_dir():
            continue
        written_files.append(str(path.relative_to(tmp_path)))

    assert sorted(written_files) == sorted(expected_outputs)
Exemplo n.º 7
0
def update_data_public_head(
    data_directory: pathlib.Path,
    latest_dataset: latest_values_dataset.LatestValuesDataset = None,
    timeseries_dataset: timeseries.TimeseriesDataset = None,
) -> Tuple[DatasetPointer, DatasetPointer]:
    """Persists US latest and timeseries dataset and saves dataset pointers for Latest tag.

    Args:
        data_directory: Directory to save dataset and pointer.
        latest_dataset: Optionally specify a LatestValuesDataset to persist instead of building
            from head.  Generally used in testing to sidestep building entire dataset.
        timeseries_dataset: Optionally specify a TimeseriesDataset to persist instead of building
            from head.  Generally used in testing to sidestep building entire dataset.

    Returns: Tuple of DatasetPointers to latest and timeseries datasets.
    """
    # Compare against None explicitly: a truthiness test would silently replace
    # a caller-supplied dataset that happens to evaluate as falsy.
    if latest_dataset is None:
        latest_dataset = combined_datasets.build_us_latest_with_all_fields(skip_cache=True)
    latest_pointer = persist_dataset(latest_dataset, data_directory)

    if timeseries_dataset is None:
        timeseries_dataset = combined_datasets.build_us_timeseries_with_all_fields(skip_cache=True)
    timeseries_pointer = persist_dataset(timeseries_dataset, data_directory)
    return latest_pointer, timeseries_pointer
Exemplo n.º 8
0
def save_combined_csv(csv_path_format, output_dir):
    """Write the combined US timeseries DataFrame to a CSV for easier comparisons.

    The destination path is produced by form_path_name from csv_path_format
    and output_dir.
    """
    destination = form_path_name(csv_path_format, output_dir)
    combined_df = combined_datasets.build_us_timeseries_with_all_fields().data
    log = structlog.get_logger()
    common_df.write_csv(combined_df, destination, log)
Exemplo n.º 9
0
def get_hospitalization_data():
    """Return a TimeseriesDataset restricted to rows that carry hospitalization data.

    A row is kept when either its current or its cumulative hospitalized value
    is non-null.
    """
    data = combined_datasets.build_us_timeseries_with_all_fields().data
    fields = TimeseriesDataset.Fields

    # This data is used for hospitalizations only, so rows without any
    # hospitalization value are dropped. If the use cases of this data source
    # expand we may not want to drop. For context, as of 4/8 607/1821 rows
    # contained hospitalization data.
    current_present = data[fields.CURRENT_HOSPITALIZED].notnull()
    cumulative_present = data[fields.CUMULATIVE_HOSPITALIZED].notnull()
    rows_with_hospital_data = data[current_present | cumulative_present]
    return TimeseriesDataset(rows_with_hospital_data)
Exemplo n.º 10
0
def test_combined_county_has_some_timeseries_data(fips):
    """Check that a county has positive values on 2020-05-01 in the combined timeseries."""
    us_timeseries = combined_datasets.build_us_timeseries_with_all_fields()
    county_timeseries = us_timeseries.get_subset(AggregationLevel.COUNTY,
                                                 fips=fips)
    by_date = county_timeseries.data.set_index(CommonFields.DATE)
    check_date = "2020-05-01"

    assert by_date.loc[check_date, CommonFields.CASES] > 0
    assert by_date.loc[check_date, CommonFields.DEATHS] > 0

    # TODO(tom): Remove this condition when we have county data in TX too.
    if not fips.startswith("06"):
        return

    assert by_date.loc[check_date, CommonFields.POSITIVE_TESTS] > 0
    assert by_date.loc[check_date, CommonFields.NEGATIVE_TESTS] > 0
    assert by_date.loc[check_date, CommonFields.CURRENT_ICU] > 0
Exemplo n.º 11
0
def get_testing_timeseries_by_fips(fips):
    """Build the testing timeseries for one fips, keyed by fips and date.

    Called by generate_api.

    The frame is fetched from the combined US timeseries (restricted to
    CDSDataset.COMMON_TEST_FIELDS), gains a total-tested column, loses the
    negative-tests column, has its columns renamed through the CDSDataset
    index/common field maps, and has its dates formatted as "%m/%d/%y" strings.

    Returns the frame indexed by (CDSDataset.Fields.FIPS, CDSDataset.Fields.DATE).
    """
    testing_df = build_us_timeseries_with_all_fields().get_data(
        None, fips=fips, columns_slice=CDSDataset.COMMON_TEST_FIELDS
    )
    # Total tested is the sum of negative and positive tests.
    testing_df[CDSDataset.Fields.TESTED] = (
        testing_df[CommonFields.NEGATIVE_TESTS] + testing_df[CommonFields.POSITIVE_TESTS]
    )
    # NOTE(review): the in-place operations below mutate whatever frame get_data
    # returned -- confirm get_data hands back an independent copy.
    testing_df.drop(columns=[CommonFields.NEGATIVE_TESTS], inplace=True)
    # Merge the two rename maps; dict(**a, **b) raises TypeError on duplicate keys.
    all_fields = dict(**CDSDataset.INDEX_FIELD_MAP, **CDSDataset.COMMON_FIELD_MAP)
    testing_df.rename(columns=all_fields, inplace=True)
    # Assumes a "date" column exists after the rename -- TODO confirm against the field maps.
    testing_df["date"] = testing_df.date.apply(lambda x: x.strftime("%m/%d/%y"))
    testing_df.set_index([CDSDataset.Fields.FIPS, CDSDataset.Fields.DATE], inplace=True)
    return testing_df
Exemplo n.º 12
0
def get_testing_timeseries_by_state(state):
    """Build the state-level testing timeseries for `state`.

    Selects the negative-tests, positive-tests and date columns from the
    combined US timeseries at state aggregation level, renames the two test
    columns to their CovidTrackingDataSource field names, and formats the
    dates as "%m/%d/%y" strings.

    Returns the resulting DataFrame.
    """
    testing_df = (
        build_us_timeseries_with_all_fields()
        .get_data(aggregation_level=AggregationLevel.STATE, state=state)
        .loc[:, (CommonFields.NEGATIVE_TESTS, CommonFields.POSITIVE_TESTS, CommonFields.DATE)]
    )
    testing_df.rename(
        columns={
            CommonFields.POSITIVE_TESTS: CovidTrackingDataSource.Fields.POSITIVE_TESTS,
            CommonFields.NEGATIVE_TESTS: CovidTrackingDataSource.Fields.NEGATIVE_TESTS,
        },
        inplace=True,
    )
    # Presumably CommonFields.DATE names the "date" column read here -- verify.
    testing_df["date"] = testing_df.date.apply(lambda x: x.strftime("%m/%d/%y"))
    return testing_df
Exemplo n.º 13
0
def generate_state_timeseries(
    projection_row, intervention, input_dir
) -> CovidActNowStateTimeseries:
    """Assemble the CovidActNowStateTimeseries for one state projection row.

    Combines the raw model output for the state with its testing data, the
    projections derived from `projection_row`, and the latest and historical
    actuals from the combined datasets.

    Raises:
        Exception: if the merged model/testing timeseries has no rows.
    """
    state_name = projection_row[rc.STATE_FULL_NAME]
    state = US_STATE_ABBREV[state_name]
    fips = projection_row[rc.FIPS]
    raw_dataseries = get_can_projection.get_can_raw_data(
        input_dir, state, fips, AggregationLevel.STATE, intervention
    )

    # Join the state testing data onto the model timeseries by the
    # '%m/%d/%y' date string. A left join gracefully handles states with
    # missing testing data (i.e. NE).
    testing_df = get_testing_timeseries_by_state(state)
    model_df = pd.DataFrame(raw_dataseries)
    merged_rows = model_df.merge(testing_df, on="date", how="left").to_dict(orient="records")

    timeseries = [_generate_state_timeseries_row(row) for row in merged_rows]

    projections = _generate_api_for_projections(projection_row)
    if not timeseries:
        raise Exception(f"State time series empty for {intervention.name}")

    state_intervention = get_can_projection.get_intervention_for_state(state)
    actuals_ts = combined_datasets.build_us_timeseries_with_all_fields()
    actual_latest = combined_datasets.build_us_latest_with_all_fields()
    state_latest = actual_latest.get_record_for_state(state)

    return CovidActNowStateTimeseries(
        population=state_latest[CommonFields.POPULATION],
        lat=projection_row[rc.LATITUDE],
        long=projection_row[rc.LONGITUDE],
        actuals=_generate_actuals(state_latest, state_intervention),
        stateName=state_name,
        fips=fips,
        lastUpdatedDate=_format_date(projection_row[rc.LAST_UPDATED]),
        projections=projections,
        timeseries=timeseries,
        actuals_timeseries=_generate_actuals_timeseries(
            actuals_ts.get_records_for_state(state), state_intervention
        ),
    )
Exemplo n.º 14
0
def generate_county_timeseries(projection_row, intervention, input_dir):
    """Assemble the CovidActNowCountyTimeseries for one county projection row.

    Combines the raw model output for the county with its testing data, the
    projections derived from `projection_row`, and the latest and historical
    actuals from the combined datasets.

    Raises:
        Exception: if the merged model/testing timeseries has no rows.
    """
    state_name = projection_row[rc.STATE_FULL_NAME]
    state_abbrev = US_STATE_ABBREV[state_name]
    fips = projection_row[rc.FIPS]

    raw_dataseries = get_can_projection.get_can_raw_data(
        input_dir, state_abbrev, fips, AggregationLevel.COUNTY, intervention
    )

    # Left join so that model rows without testing data are kept.
    testing_df = get_testing_timeseries_by_fips(fips)
    model_df = pd.DataFrame(raw_dataseries)
    merged_rows = model_df.merge(testing_df, on="date", how="left").to_dict(orient="records")

    timeseries = [_generate_county_timeseries_row(row) for row in merged_rows]
    if not timeseries:
        raise Exception(f"County time series empty for {intervention.name}")

    projections = _generate_api_for_projections(projection_row)
    state_intervention = get_can_projection.get_intervention_for_state(state_abbrev)
    actuals_ts = combined_datasets.build_us_timeseries_with_all_fields()
    actual_latest = combined_datasets.build_us_latest_with_all_fields()
    fips_latest = actual_latest.get_record_for_fips(fips)

    return CovidActNowCountyTimeseries(
        population=fips_latest[CommonFields.POPULATION],
        lat=projection_row[rc.LATITUDE],
        long=projection_row[rc.LONGITUDE],
        actuals=_generate_actuals(fips_latest, state_intervention),
        stateName=state_name,
        countyName=projection_row[rc.COUNTY],
        fips=fips,
        lastUpdatedDate=_format_date(projection_row[rc.LAST_UPDATED]),
        projections=projections,
        timeseries=timeseries,
        actuals_timeseries=_generate_actuals_timeseries(
            actuals_ts.get_records_for_fips(fips), state_intervention
        ),
    )
Exemplo n.º 15
0
def test_generate_timeseries_for_fips(include_projections,
                                      nyc_model_output_path, nyc_fips):
    """Check the NYC area timeseries embeds the matching summary and serializes without NaN."""
    us_latest = combined_datasets.build_us_latest_with_all_fields()
    us_timeseries = combined_datasets.build_us_timeseries_with_all_fields()
    nyc_latest = us_latest.get_record_for_fips(nyc_fips)
    nyc_timeseries = us_timeseries.get_subset(None, fips=nyc_fips)

    intervention = Intervention.OBSERVED_INTERVENTION
    model_output = CANPyseirLocationOutput.load_from_path(
        nyc_model_output_path)

    area_summary = generate_api.generate_area_summary(nyc_latest, model_output)
    area_timeseries = generate_api.generate_area_timeseries(
        area_summary, nyc_timeseries, model_output)

    # A summary generated independently from the same inputs must match the
    # one embedded in the timeseries.
    fresh_summary = generate_api.generate_area_summary(nyc_latest,
                                                       model_output)
    assert fresh_summary.dict() == area_timeseries.area_summary.dict()

    # Double checking that serialized json does not contain NaNs, all values
    # should be serialized using the simplejson wrapper.
    serialized = area_timeseries.json()
    assert "NaN" not in serialized
Exemplo n.º 16
0
def test_build_timeseries_and_summary_outputs(nyc_model_output_path, nyc_fips,
                                              intervention):
    """Check which parts of the NYC output are populated for each intervention."""
    us_latest = combined_datasets.build_us_latest_with_all_fields()
    us_timeseries = combined_datasets.build_us_timeseries_with_all_fields()

    result = api_pipeline.build_timeseries_for_fips(
        intervention, us_latest, us_timeseries, nyc_model_output_path.parent,
        nyc_fips)

    if intervention is Intervention.NO_INTERVENTION:
        # Test data does not contain no intervention model, should not output any results.
        assert not result
        return

    assert result

    if intervention is Intervention.STRONG_INTERVENTION:
        assert result.projections
        assert result.timeseries
        return

    if intervention is Intervention.OBSERVED_INTERVENTION:
        assert not result.projections
        assert not result.timeseries
Exemplo n.º 17
0
def test_unique_index_values_us_timeseries():
    """Check that no two rows of the US timeseries share the same index fields."""
    us_timeseries = combined_datasets.build_us_timeseries_with_all_fields()
    indexed = us_timeseries.data.set_index(us_timeseries.INDEX_FIELDS)
    duplicate_count = sum(indexed.index.duplicated())
    assert not duplicate_count
Exemplo n.º 18
0
def load_new_test_data_by_fips(fips, t0, smoothing_tau=5, correction_threshold=5):
    """
    Return a timeseries of new tests for a geography. Note that due to reporting
    discrepancies county to county, and state-to-state, these often do not go
    back as far as case data.

    Parameters
    ----------
    fips: str
        State or county fips code. A length-2 fips is looked up as a state,
        anything else as a county.
    t0: datetime
        Reference datetime to use.
    smoothing_tau: int
        expected_positives_from_test_increase is smoothed based on an
        exponentially weighted moving average of decay factor specified here.
    correction_threshold: int
        Do not apply a correction if the incident cases per day is lower than
        this value. There can be instability if case counts are very low.

    Returns
    -------
    df: pd.DataFrame
        DataFrame containing columns:
        - 'date',
        - 'new_tests': Number of total tests performed that day
        - 'increase_in_new_tests': Increase in tests performed that day vs
          previous day
        - 'positivity_rate':
            Test positivity rate
        - 'expected_positives_from_test_increase':
            Number of positive detections expected just from increased test
            capacity.
        - 'new_positive': Day-over-day increase in positive tests.
        - times: days since t0 for this observation.
    """
    us_timeseries = combined_datasets.build_us_timeseries_with_all_fields()

    if len(fips) == 2:
        df = us_timeseries.get_data(AggregationLevel.STATE, state=us.states.lookup(fips).abbr)
    else:
        df = us_timeseries.get_data(AggregationLevel.COUNTY, fips=fips)

    # Keep only rows where both test counts are present and at least one test
    # was recorded. Copy so that the derived columns below are written to an
    # independent frame rather than to a slice of the source data.
    has_tests = (
        df[CommonFields.POSITIVE_TESTS].notnull()
        & df[CommonFields.NEGATIVE_TESTS].notnull()
        & ((df[CommonFields.POSITIVE_TESTS] + df[CommonFields.NEGATIVE_TESTS]) > 0)
    )
    df = df[has_tests].copy()

    total_tests = df[CommonFields.POSITIVE_TESTS] + df[CommonFields.NEGATIVE_TESTS]
    df["positivity_rate"] = df[CommonFields.POSITIVE_TESTS] / total_tests
    df["new_positive"] = np.append([0], np.diff(df[CommonFields.POSITIVE_TESTS]))

    # The first derivative gets us new instead of cumulative tests while the
    # second derivative gives us the change in new test rate.
    df["new_tests"] = np.append([0], np.diff(total_tests))
    df["increase_in_new_tests"] = np.append([0], np.diff(df["new_tests"]))

    # dPositive / dTotal = 0.65 * positivity_rate was empirically determined by looking at
    # the increase in positives day-over-day relative to the increase in total tests across all 50 states.
    df["expected_positives_from_test_increase"] = (
        df["increase_in_new_tests"] * 0.65 * df["positivity_rate"]
    )
    df = df[
        [
            "date",
            "new_tests",
            "increase_in_new_tests",
            "positivity_rate",
            "expected_positives_from_test_increase",
            "new_positive",
        ]
    ]

    # Copy again after filtering so the assignments below target this frame.
    df = df[df.increase_in_new_tests.notnull() & df.positivity_rate.notnull()].copy()
    df["expected_positives_from_test_increase"] = ewma_smoothing(
        df["expected_positives_from_test_increase"], smoothing_tau
    )
    # Zero the correction where daily new positives are below the threshold.
    # A single .loc assignment is used instead of chained indexing, which may
    # write to a temporary copy and leave df unchanged.
    df.loc[
        df["new_positive"] < correction_threshold, "expected_positives_from_test_increase"
    ] = 0

    df["times"] = [
        int((date - t0).days) for date in pd.to_datetime(df["date"].values).to_pydatetime()
    ]

    return df
Exemplo n.º 19
0
def _cache_global_datasets():
    """Warm the caches for the combined latest and timeseries datasets.

    Populating the cache before forking makes it available to subprocesses.
    The builders are called purely for that effect, so their return values are
    discarded.
    """
    cache_warmers = (
        combined_datasets.build_us_latest_with_all_fields,
        combined_datasets.build_us_timeseries_with_all_fields,
    )
    for warm_cache in cache_warmers:
        warm_cache()
Exemplo n.º 20
0
def load_hospitalization_data_by_state(
    state: str,
    t0: datetime,
    category: HospitalizationCategory = HospitalizationCategory.HOSPITALIZED,
):
    """
    Obtain hospitalization data. We clip because there are sometimes negatives
    either due to data reporting or corrections in case count. These are always
    tiny so we just make downstream easier to work with by clipping.

    Current hospitalizations are preferred; cumulative hospitalizations are
    used only when no positive current value exists.

    Parameters
    ----------
    state: str
        State to lookup.
    t0: datetime
        Datetime to offset by.
    category: HospitalizationCategory
        'icu' for just ICU or 'hospitalized' for all ICU + Acute.

    Returns
    -------
    times: array(float) or NoneType
        List of float days since t0 for the hospitalization data.
    observed_hospitalizations: array(int) or NoneType
        Hospitalization values for each returned time, clipped to be
        non-negative. For cumulative data, single-step decreases are also
        smoothed out (see below).
    type: HospitalizationDataType
        Specifies cumulative or current hospitalizations.

    All three values are None when there is no data for the state or no
    positive value in either column.
    """
    abbr = us.states.lookup(state).abbr
    hospitalization_data = combined_datasets.build_us_timeseries_with_all_fields(
    ).get_data(AggregationLevel.STATE, country="USA", state=abbr)

    if len(hospitalization_data) == 0:
        return None, None, None

    current_column = f"current_{category}"
    cumulative_column = f"cumulative_{category}"

    if (hospitalization_data[current_column] > 0).any():
        hospitalization_data = hospitalization_data[
            hospitalization_data[current_column].notnull()]
        times_new = (hospitalization_data["date"].dt.date -
                     t0.date()).dt.days.values
        return (
            times_new,
            hospitalization_data[current_column].values.clip(min=0),
            HospitalizationDataType.CURRENT_HOSPITALIZATIONS,
        )

    if (hospitalization_data[cumulative_column] > 0).any():
        hospitalization_data = hospitalization_data[
            hospitalization_data[cumulative_column].notnull()]
        times_new = (hospitalization_data["date"].dt.date -
                     t0.date()).dt.days.values
        cumulative = hospitalization_data[cumulative_column].values.clip(min=0)
        # Some minor glitches for a few states: where a value exceeds its
        # successor, pull it down to the successor's value.
        for i in range(len(cumulative) - 1):
            if cumulative[i] > cumulative[i + 1]:
                cumulative[i] = cumulative[i + 1]
        # Return the repaired array. Re-reading the column here would discard
        # the glitch fix applied above.
        return (
            times_new,
            cumulative,
            HospitalizationDataType.CUMULATIVE_HOSPITALIZATIONS,
        )

    return None, None, None
Exemplo n.º 21
0
def get_usa_by_states_df(input_dir: str, intervention: Intervention):
    """Build the per-state results DataFrame.

    County rows are aggregated per state, then joined with each state's
    maximum positive/negative test counts, the interventions table and the
    state projections. Columns are remapped to the result-data names and
    restricted to RESULT_DATA_COLUMNS_STATES.

    Returns the final frame with its index named "OBJECTID". Asserts that each
    "Combined Key" value occurs at most once.
    """
    county_rows = _get_usa_by_county_df()
    interventions_df = _get_interventions_df()
    projections_df = get_state_projections_df(input_dir, intervention.value, interventions_df)

    # Per-state maximum of the positive and negative test columns.
    state_tests = build_us_timeseries_with_all_fields().get_data(
        aggregation_level=AggregationLevel.STATE,
        columns_slice=[
            CommonFields.STATE,
            CommonFields.POSITIVE_TESTS,
            CommonFields.NEGATIVE_TESTS,
        ],
    )
    tests_by_state = state_tests.groupby(CommonFields.STATE, as_index=False)
    max_tests = tests_by_state[[CommonFields.POSITIVE_TESTS, CommonFields.NEGATIVE_TESTS]].max()
    test_max_df = max_tests.rename(
        columns={
            CommonFields.POSITIVE_TESTS: CovidTrackingDataSource.Fields.POSITIVE_TESTS,
            CommonFields.NEGATIVE_TESTS: CovidTrackingDataSource.Fields.NEGATIVE_TESTS,
        }
    )

    # Roll the county rows up to one row per state.
    # People tested is currently null, so 'People Tested': 'sum' is left out.
    county_aggregations = {
        "Last Update": "max",
        "Confirmed": "sum",
        "Recovered": "sum",
        "Deaths": "sum",
        "Active": "sum",
        "Country/Region": "first",
        "Latitude": "first",
        "Longitude": "first",
    }
    states_agg = county_rows.groupby([CommonFields.STATE]).aggregate(county_aggregations)

    with_tests = states_agg.merge(test_max_df, on=CommonFields.STATE, how="left")
    with_interventions = with_tests.merge(
        interventions_df, on=CommonFields.STATE, how="inner", suffixes=["", "_dropcol"],
    )
    states_abbrev = with_interventions.merge(projections_df, on=CommonFields.STATE, how="left")

    column_remap = {
        CovidTrackingDataSource.Fields.POSITIVE_TESTS: CUMULATIVE_POSITIVE_TESTS,
        CovidTrackingDataSource.Fields.NEGATIVE_TESTS: CUMULATIVE_NEGATIVE_TESTS,
        **OUTPUT_COLUMN_REMAP_TO_RESULT_DATA,
    }
    states_remapped = states_abbrev.rename(columns=column_remap)
    full_names = states_remapped[CommonFields.STATE].map(abbrev_us_state)
    states_remapped[CommonFields.STATE_FULL_NAME] = full_names

    states_final = pd.DataFrame(states_remapped, columns=RESULT_DATA_COLUMNS_STATES)

    # Keep nulls as nulls
    states_final = states_final.fillna(NULL_VALUE)
    states_final["Combined Key"] = states_final[CommonFields.STATE_FULL_NAME]
    states_final[CommonFields.FIPS] = states_final[CommonFields.STATE_FULL_NAME].map(us_fips)

    states_final.index.name = "OBJECTID"

    assert states_final["Combined Key"].value_counts().max() == 1
    return states_final
Exemplo n.º 22
0
def load_hospitalization_data_by_state(state,
                                       t0,
                                       convert_cumulative_to_current=False,
                                       category="hospitalized"):
    """
    Load hospitalization observations for a state.

    Values are clipped at zero because there are sometimes negatives, either
    due to data reporting or corrections in case count. These are always tiny,
    so clipping just makes the data easier to work with downstream.

    Parameters
    ----------
    state: str
        State to lookup.
    t0: datetime
        Datetime to offset by.
    convert_cumulative_to_current: bool
        If True, and only cumulative hospitalizations are available, convert
        the cumulative hospitalizations into an estimate of current
        hospitalizations.
    category: str
        'icu' for just ICU or 'hospitalized' for all ICU + Acute.

    Returns
    -------
    times: array(float) or NoneType
        List of float days since t0 for the hospitalization data.
    observed_hospitalizations: array(int) or NoneType
        Array of new cases observed each day.
    type: HospitalizationDataType
        Specifies cumulative or current hospitalizations.

    Raises
    ------
    ValueError
        If `category` is not 'icu' or 'hospitalized'.
    """
    abbr = us.states.lookup(state).abbr
    us_timeseries = combined_datasets.build_us_timeseries_with_all_fields()
    state_subset = us_timeseries.get_subset(AggregationLevel.STATE,
                                            country="USA",
                                            state=abbr)
    hospitalization_data = state_subset.get_data(country="USA", state=abbr)

    categories = ["icu", "hospitalized"]
    if category not in categories:
        raise ValueError(
            f"Hospitalization category {category} is not in {categories}")

    if len(hospitalization_data) == 0:
        return None, None, None

    # Prefer current hospitalizations whenever any positive value exists.
    current_column = f"current_{category}"
    if (hospitalization_data[current_column] > 0).any():
        reported = hospitalization_data[
            hospitalization_data[current_column].notnull()]
        times_new = (reported["date"].dt.date - t0.date()).dt.days.values
        return (
            times_new,
            reported[current_column].values.clip(min=0),
            HospitalizationDataType.CURRENT_HOSPITALIZATIONS,
        )

    cumulative_column = f"cumulative_{category}"
    if not (hospitalization_data[cumulative_column] > 0).any():
        return None, None, None

    reported = hospitalization_data[
        hospitalization_data[cumulative_column].notnull()]
    times_new = (reported["date"].dt.date - t0.date()).dt.days.values
    cumulative = reported[cumulative_column].values.clip(min=0)
    # Some minor glitches for a few states: where a value exceeds its
    # successor, pull it down to the successor's value.
    for idx in range(len(cumulative) - 1):
        if cumulative[idx] > cumulative[idx + 1]:
            cumulative[idx] = cumulative[idx + 1]

    if not convert_cumulative_to_current:
        return times_new, cumulative, HospitalizationDataType.CUMULATIVE_HOSPITALIZATIONS

    # Must be here to avoid circular import. This is required to convert
    # cumulative hosps to current hosps. We also just use a dummy fips and t_list.
    from pyseir.parameters.parameter_ensemble_generator import ParameterEnsembleGenerator

    params = ParameterEnsembleGenerator(
        fips="06", t_list=[], N_samples=1).get_average_seir_parameters()

    ventilated_fraction = params["fraction_icu_requiring_ventilator"]
    icu_stay = params["hospitalization_length_of_stay_icu"]
    ventilated_stay = params["hospitalization_length_of_stay_icu_and_ventilator"]
    if category == "hospitalized":
        general_rate = params["hospitalization_rate_general"]
        icu_rate = params["hospitalization_rate_icu"]
        # Rate-weighted mean stay over general, ICU and ventilated ICU patients.
        weighted_stay = (
            general_rate * params["hospitalization_length_of_stay_general"] +
            icu_rate * (1 - ventilated_fraction) * icu_stay +
            icu_rate * ventilated_fraction * ventilated_stay
        )
        average_length_of_stay = weighted_stay / (general_rate + icu_rate)
    else:
        average_length_of_stay = (
            (1 - ventilated_fraction) * icu_stay +
            ventilated_fraction * ventilated_stay
        )

    # Now compute a cumulative sum, but at each day, subtract the discharges
    # from the previous count.
    new_hospitalizations = np.append([0], np.diff(cumulative))
    current = [0]
    for admitted in new_hospitalizations[1:]:
        occupied = current[-1]
        current.append(occupied + admitted -
                       occupied / average_length_of_stay)
    return times_new, current, HospitalizationDataType.CURRENT_HOSPITALIZATIONS