Exemplo n.º 1
0
def run_module(params):
    """
    Generate ground truth HHS hospitalization data.

    Parameters
    ----------
    params
        Dictionary containing indicator configuration. Expected to have the following structure:
        - "common":
            - "export_dir": str, directory to write output
            - "log_filename" (optional): str, name of file to write logs
    """
    start_time = time.time()
    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))
    mapper = GeoMapper()
    request_all_states = ",".join(mapper.get_geo_values("state_id"))
    today = date.today()
    past_reference_day = date(year=2020, month=1,
                              day=1)  # first available date in DB
    date_range = generate_date_ranges(past_reference_day, today)
    dfs = []
    for r in date_range:
        response = Epidata.covid_hosp(request_all_states, r)
        # The last date range might only have recent days that don't have any data, so don't error.
        if response["result"] != 1 and r != date_range[-1]:
            raise Exception(f"Bad result from Epidata: {response['message']}")
        if response["result"] == -2 and r == date_range[
                -1]:  # -2 code means no results
            continue
        dfs.append(pd.DataFrame(response['epidata']))
    all_columns = pd.concat(dfs)

    geo_mapper = GeoMapper()

    for sig in SIGNALS:
        state = geo_mapper.add_geocode(make_signal(all_columns, sig),
                                       "state_id",
                                       "state_code",
                                       from_col="state")
        for geo in GEOS:
            create_export_csv(make_geo(state, geo, geo_mapper),
                              params["common"]["export_dir"], geo, sig)

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds)
Exemplo n.º 2
0
    def test_good_file(self):
        df = pull_nchs_mortality_data(TOKEN, "test_data.csv")

        # Test columns
        assert (df.columns.values == [
            'covid_19_deaths', 'total_deaths', 'percent_of_expected_deaths',
            'pneumonia_deaths', 'pneumonia_and_covid_19_deaths',
            'influenza_deaths', 'pneumonia_influenza_or_covid_19_deaths',
            "timestamp", "geo_id", "population"
        ]).all()

        # Test aggregation for NYC and NY
        raw_df = pd.read_csv("./test_data/test_data.csv",
                             parse_dates=["start_week"])
        raw_df = standardize_columns(raw_df)
        for metric in METRICS:
            ny_list = raw_df.loc[(raw_df["state"] == "New York")
                                 & (raw_df[metric].isnull()),
                                 "timestamp"].values
            nyc_list = raw_df.loc[(raw_df["state"] == "New York City")
                                  & (raw_df[metric].isnull()),
                                  "timestamp"].values
            final_list = df.loc[(df["geo_id"] == "ny")
                                & (df[metric].isnull()), "timestamp"].values
            assert set(final_list) == set(ny_list).intersection(set(nyc_list))

        # Test missing value
        gmpr = GeoMapper()
        state_ids = pd.DataFrame(list(gmpr.get_geo_values("state_id")))
        state_names = gmpr.replace_geocode(state_ids,
                                           "state_id",
                                           "state_name",
                                           from_col=0,
                                           date_col=None)
        for state, geo_id in zip(state_names, state_ids):
            if state in set(["New York", "New York City"]):
                continue
            for metric in METRICS:
                test_list = raw_df.loc[(raw_df["state"] == state)
                                       & (raw_df[metric].isnull()),
                                       "timestamp"].values
                final_list = df.loc[(df["geo_id"] == geo_id)
                                    & (df[metric].isnull()),
                                    "timestamp"].values
                assert set(final_list) == set(test_list)
Exemplo n.º 3
0
def run_module():
    """Generate ground truth HHS hospitalization data."""
    params = read_params()
    mapper = GeoMapper()
    request_all_states = ",".join(mapper.get_geo_values("state_id"))

    today = date.today()
    past_reference_day = date(year=2020, month=1,
                              day=1)  # first available date in DB
    date_range = generate_date_ranges(past_reference_day, today)
    dfs = []
    for r in date_range:
        response = Epidata.covid_hosp(request_all_states, r)
        if response['result'] != 1:
            raise Exception(f"Bad result from Epidata: {response['message']}")
        dfs.append(pd.DataFrame(response['epidata']))
    all_columns = pd.concat(dfs)

    for sig in SIGNALS:
        create_export_csv(make_signal(all_columns, sig), params["export_dir"],
                          "state", sig)