Example #1
    def to_csv(self, path: pathlib.Path):
        """Persists timeseries to CSV.

        Args:
            path: Path to write to.
        """
        common_df.write_csv(self.data, path, structlog.get_logger())
Example #2
def save_combined_csv(csv_path_format, output_dir):
    """Save the combined datasets DataFrame, cleaned up for easier comparisons."""
    csv_path = form_path_name(csv_path_format, output_dir)

    timeseries = combined_datasets.build_us_timeseries_with_all_fields()
    timeseries_data = timeseries.data

    common_df.write_csv(timeseries_data, csv_path, structlog.get_logger())
Example #3

def main(fetch: bool):
    common_init.configure_logging()
    log = structlog.get_logger()

    ccd_dataset = ccd_helpers.CovidCountyDataset.load(fetch=fetch)
    all_df = transform(ccd_dataset)

    common_df.write_csv(all_df, OUTPUT_PATH, log)
Example #4

def main(fetch: bool):
    common_init.configure_logging()
    log = structlog.get_logger()
    updater = CovidCareMapUpdater()
    if fetch:
        updater.update()

    df = updater.transform()
    common_df.write_csv(df, STATIC_CSV_PATH, log, [CommonFields.FIPS])
Example #5
def test_write_csv_empty():
    df = pd.DataFrame([], columns=[CommonFields.DATE, CommonFields.FIPS, CommonFields.CASES])
    with temppathlib.NamedTemporaryFile("w+") as tmp, structlog.testing.capture_logs() as logs:
        common_df.write_csv(df, tmp.path, structlog.get_logger())
        assert "fips,date,cases\n" == tmp.file.read()
    assert [l["event"] for l in logs] == ["Fixing DataFrame index", "Writing DataFrame"]
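The log assertions in these tests lean on structlog.testing.capture_logs, a real structlog helper that collects log calls as plain dicts instead of rendering them. A minimal standalone illustration of how the captured entries look:

import structlog
from structlog.testing import capture_logs

with capture_logs() as logs:
    structlog.get_logger().info("Writing DataFrame", rows=0)

# Each captured entry is a dict holding the event name, the bound key/values,
# and the log level, which is what lets the tests assert on logs[i]["event"].
assert logs == [{"event": "Writing DataFrame", "rows": 0, "log_level": "info"}]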
Example #6

def main(replace_local_mirror: bool, generate_common_csv: bool):
    common_init.configure_logging()

    if replace_local_mirror:
        update_datasets()

    if generate_common_csv:
        common_df.write_csv(transform_cms_datasets(), TIMESERIES_CSV_PATH, _logger)
Example #7

def main(fetch: bool):
    common_init.configure_logging()
    log = structlog.get_logger(updater="CovidDataScraperTransformer")
    local_path = DATA_ROOT / "cases-cds" / "timeseries-common.csv"

    transformer = CovidDataScraperTransformer.make_with_data_root(DATA_ROOT, log)
    if fetch:
        transformer.fetch()
    common_df.write_csv(transformer.transform(), local_path, log)
Example #8
def test_remove_index_column():
    df = pd.DataFrame(
        [("99", "2020-04-01", "a", 123)], columns=["fips", "date", "index", "cases"]
    ).set_index(COMMON_FIELDS_TIMESERIES_KEYS)

    with temppathlib.NamedTemporaryFile("w+") as tmp, structlog.testing.capture_logs() as logs:
        common_df.write_csv(df, tmp.path, structlog.get_logger())
        assert "fips,date,cases\n99,2020-04-01,123\n" == tmp.file.read()

    assert [l["event"] for l in logs] == ["Dropping column named 'index'", "Writing DataFrame"]
Example #9

    def to_csv(self, path: pathlib.Path):
        """Persists timeseries to CSV.

        Args:
            path: Path to write to.
        """
        common_df.write_csv(self.data, path, structlog.get_logger(),
                            self.COMMON_INDEX_FIELDS)
        if self.provenance is not None:
            provenance_path = str(path).replace(".csv", "-provenance.csv")
            self.provenance.sort_index().to_csv(provenance_path)
Example #10
def test_write_csv_columns_are_sorted_in_output_with_extras():
    df = pd.DataFrame(
        [], columns=[CommonFields.DATE, CommonFields.FIPS, "extra2", CommonFields.CASES, "extra1"]
    )
    df = df.set_index(COMMON_FIELDS_TIMESERIES_KEYS)
    with temppathlib.NamedTemporaryFile("w+") as tmp, structlog.testing.capture_logs() as logs:
        log = structlog.get_logger()
        common_df.write_csv(df, tmp.path, log)
        assert "fips,date,cases,extra1,extra2\n" == tmp.file.read()
    assert [l["event"] for l in logs] == [
        "Writing DataFrame",
    ]
Example #11
def test_write_csv_extra_columns_dropped():
    df = pd.DataFrame(
        [], columns=[CommonFields.DATE, CommonFields.FIPS, "extra1", CommonFields.CASES, "extra2"]
    )
    df = df.set_index(COMMON_FIELDS_TIMESERIES_KEYS)
    with temppathlib.NamedTemporaryFile("w+") as tmp, structlog.testing.capture_logs() as logs:
        log = structlog.get_logger()
        common_df.write_csv(only_common_columns(df, log), tmp.path, log)
        assert "fips,date,cases\n" == tmp.file.read()
    assert [l["event"] for l in logs] == [
        "Dropping columns not in CommonFields",
        "Writing DataFrame",
    ]
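only_common_columns is exercised here but its body is not shown; judging by the log event, it narrows the frame to fields defined in the CommonFields enum before writing. A hypothetical sketch (the enum membership test and the warning level are assumptions):

import pandas as pd

def only_common_columns(df: pd.DataFrame, log) -> pd.DataFrame:
    common = {str(field) for field in CommonFields}  # assumes CommonFields is a str enum
    extra = [col for col in df.columns if str(col) not in common]
    if extra:
        log.warning("Dropping columns not in CommonFields", columns=extra)
    return df.drop(columns=extra)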
Example #12
    def update(self, *, fetch: bool, log):
        if fetch:
            todays_filename = self.date_today.isoformat() + ".csv"
            todays_file_path = self.gsheets_copy_directory / todays_filename
            todays_file_path.write_bytes(requests.get(self.source_url).content)

        result = pd.DataFrame.from_records(
            self.yield_dict_per_state_date(),
            columns=["fips", "state", "date", "contact_tracers_count"],
        ).sort_values(["fips", "date"])
        result[CommonFields.COUNTRY] = "USA"
        result[CommonFields.AGGREGATE_LEVEL] = "state"
        common_df.write_csv(result, self.state_timeseries_path, log)
Example #13
    def to_csv(self, path: pathlib.Path):
        """Persists timeseries to CSV.

        Args:
            path: Path to write to.
        """
        combined = self.combined_df
        assert combined[CommonFields.LOCATION_ID].notna().all()
        common_df.write_csv(combined, path, structlog.get_logger(),
                            [CommonFields.LOCATION_ID, CommonFields.DATE])
        if self.provenance is not None:
            provenance_path = str(path).replace(".csv", "-provenance.csv")
            self.provenance.sort_index().to_csv(provenance_path)
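One caveat in the two to_csv methods above: str(path).replace(".csv", "-provenance.csv") rewrites every occurrence of ".csv" in the path, not just the final suffix. A pathlib-based variant (a hypothetical helper, not part of the library) only touches the file name:

import pathlib

def provenance_path_for(path: pathlib.Path) -> pathlib.Path:
    # Replace only the final suffix, so a parent directory named "foo.csv"
    # elsewhere in the path is left alone.
    return path.with_name(path.stem + "-provenance.csv")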
Example #14
def main(fetch: bool):
    common_init.configure_logging()
    connection = zoltpy.util.authenticate()
    transformer = ForecastHubUpdater.make_with_data_root(
        ForecastModel.ENSEMBLE, connection, DATA_ROOT
    )
    if fetch:
        _logger.info("Fetching new data.")
        transformer.update_source_data()

    data = transformer.load_source_data()
    data = transformer.transform(data)
    common_df.write_csv(data, transformer.timeseries_output_path, _logger)
Example #15

def main(replace_local_mirror: bool, generate_common_csv: bool):
    logging.basicConfig(level=logging.INFO)
    common_init.configure_logging()

    if replace_local_mirror:
        update_local_json()

    CovidTrackingDataUpdater().update()

    if generate_common_csv:
        common_df.write_csv(
            transform(load_local_json()),
            TIMESERIES_CSV_PATH,
            structlog.get_logger(),
        )
Example #16
def test_float_na_formatting():
    df = pd.DataFrame(
        [("99", "2020-04-01", 1.0, 2, 3), ("99", "2020-04-02", pd.NA, pd.NA, None)],
        columns="fips date metric_a metric_b metric_c".split(),
    ).set_index(COMMON_FIELDS_TIMESERIES_KEYS)

    expected_csv = """fips,date,metric_a,metric_b,metric_c
99,2020-04-01,1,2,3
99,2020-04-02,,,
"""

    with temppathlib.NamedTemporaryFile("w+") as tmp, structlog.testing.capture_logs() as logs:
        common_df.write_csv(df, tmp.path, structlog.get_logger())
        assert expected_csv == tmp.file.read()

    assert [l["event"] for l in logs] == ["Writing DataFrame"]
Example #17

def main(check_for_new_data: bool, fetch: bool):
    common_init.configure_logging()
    transformer = NYTimesUpdater.make_with_data_root(DATA_ROOT)

    if check_for_new_data:
        if not transformer.is_new_data_available():
            raise Exception("No new data available")
        _logger.info("New data available")
        return

    if fetch:
        _logger.info("Fetching new data.")
        transformer.update_source_data()

    data = transformer.load_state_and_county_data()
    data = transformer.transform(data)
    common_df.write_csv(data, transformer.timeseries_output_path, _logger)
Example #18

def main(replace_local_mirror: bool, generate_common_csv: bool):
    common_init.configure_logging()

    if replace_local_mirror:
        update_dataset_csv()

    if generate_common_csv:
        dataset = pd.read_csv(
            DATASET_CSV_PATH,
            parse_dates=[Fields.DATE],
            dtype={Fields.STATE_FIPS: str},
            low_memory=False,
        )

        common_df.write_csv(
            transform(dataset),
            TIMESERIES_CSV_PATH,
            _logger,
        )
Example #19
def test_write_csv_without_date():
    df = pd.DataFrame(
        {
            CommonFields.FIPS: ["06045", "45123"],
            "extra_index": ["idx_1", "idx_2"],
            CommonFields.CASES: [234, 456],
            "extra_column": ["extra_data", None],
        }
    )
    expected_csv = """fips,extra_index,cases,extra_column
06045,idx_1,234,extra_data
45123,idx_2,456,
"""
    with temppathlib.NamedTemporaryFile("w+") as tmp, structlog.testing.capture_logs() as logs:
        common_df.write_csv(
            df, tmp.path, structlog.get_logger(), index_names=[CommonFields.FIPS, "extra_index"]
        )
        assert expected_csv == tmp.file.read()
    assert [l["event"] for l in logs] == ["Fixing DataFrame index", "Writing DataFrame"]
Example #20
def test_float_formatting():
    input_csv = """fips,date,col_1,col_2,col_3,col_4,col_5,col_6
99123,2020-04-01,1,2.0000000,3,0.0004,0.00005,6000000000
99123,2020-04-02,,,,,,
99123,2020-04-03,1,2,3.1234567,4,5,6.0
"""
    input_df = read_csv_to_indexed_df(StringIO(input_csv))

    expected_csv = """fips,date,col_1,col_2,col_3,col_4,col_5,col_6
99123,2020-04-01,1,2,3,0.0004,5e-05,6000000000
99123,2020-04-02,,,,,,
99123,2020-04-03,1,2,3.1234567,4,5,6
"""

    with temppathlib.NamedTemporaryFile("w+") as tmp, structlog.testing.capture_logs() as logs:
        common_df.write_csv(input_df, tmp.path, structlog.get_logger())
        assert expected_csv == tmp.file.read()

    assert [l["event"] for l in logs] == ["Writing DataFrame"]
Example #21
def test_write_csv():
    df = pd.DataFrame({
        CommonFields.DATE: ["2020-04-01", "2020-04-02"],
        CommonFields.FIPS: ["06045", "45123"],
        CommonFields.CASES: [234, 456],
    })
    df_original = df.copy()
    expected_csv = """fips,date,cases
06045,2020-04-01,234
45123,2020-04-02,456
"""
    # Call common_df.write_csv with index set to ["fips", "date"], the expected normal index.
    with temppathlib.NamedTemporaryFile("w+") as tmp, structlog.testing.capture_logs() as logs:
        common_df.write_csv(df.set_index(["fips", "date"]), tmp.path, structlog.get_logger())
        assert expected_csv == tmp.file.read()
    assert [l["event"] for l in logs] == ["Writing DataFrame"]

    # Pass df with another index that will be changed. Check that the same output is
    # written to the file.
    with temppathlib.NamedTemporaryFile("w+") as tmp, structlog.testing.capture_logs() as logs:
        common_df.write_csv(df, tmp.path, structlog.get_logger())
        assert expected_csv == tmp.file.read()
    assert [l["event"] for l in logs] == ["Fixing DataFrame index", "Writing DataFrame"]

    with temppathlib.NamedTemporaryFile("w+") as tmp, structlog.testing.capture_logs() as logs:
        common_df.write_csv(df.set_index(["date", "cases"]), tmp.path, structlog.get_logger())
        assert expected_csv == tmp.file.read()
    assert [l["event"] for l in logs] == ["Fixing DataFrame index", "Writing DataFrame"]

    with temppathlib.NamedTemporaryFile("w+") as tmp, structlog.testing.capture_logs() as logs:
        common_df.write_csv(df.set_index(["date", "fips"]), tmp.path, structlog.get_logger())
        assert expected_csv == tmp.file.read()
    assert [l["event"] for l in logs] == ["Fixing DataFrame index", "Writing DataFrame"]

    # The input frame must not be mutated by any of the calls above.
    assert repr(df) == repr(df_original)
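Taken together, the events these tests capture ("Fixing DataFrame index", "Dropping column named 'index'", "Writing DataFrame") pin down the write path well enough to sketch it. The following re-implementation is inferred from the test expectations only, not from the common_df source; the ["fips", "date"] default key and the "%.12g" precision are assumptions:

import pathlib
from typing import List, Optional

import pandas as pd

FIPS_DATE_KEYS = ["fips", "date"]  # assumed default index, per the tests above

def write_csv_sketch(df: pd.DataFrame, path: pathlib.Path, log, index_names: Optional[List[str]] = None) -> None:
    index_names = list(index_names or FIPS_DATE_KEYS)
    if list(df.index.names) != index_names:
        log.info("Fixing DataFrame index")
        if df.index.names != [None]:  # a real index is set; move it back into columns first
            df = df.reset_index()
        df = df.set_index(index_names)
    if "index" in df.columns:  # leftover from an earlier reset_index()
        log.info("Dropping column named 'index'")
        df = df.drop(columns="index")
    # Data columns come out sorted by name (see the extra-columns test above).
    df = df.reindex(columns=sorted(df.columns, key=str))
    log.info("Writing DataFrame")
    df.to_csv(path, float_format="%.12g")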
Example #22

def main():
    common_init.configure_logging()
    log = structlog.get_logger()
    df = load_dataset()
    common_df.write_csv(df, CSV_PATH, log, index_names=[CommonFields.FIPS])
Example #23

        if bad_rows.any():
            self.log.warning("Dropping rows with null in important columns",
                             bad_rows=str(df.loc[bad_rows]))
            df = df.loc[~bad_rows]

        # Removing a string of misleading FL current_icu values.
        is_incorrect_fl_icu_dates = df[CommonFields.DATE].between(
            "2020-05-14", "2020-05-20")
        is_fl_state = df[CommonFields.FIPS] == "12"
        df.loc[is_fl_state & is_incorrect_fl_icu_dates,
               CommonFields.CURRENT_ICU] = None

        df = df.set_index(COMMON_FIELDS_TIMESERIES_KEYS, verify_integrity=True)

        return df


if __name__ == "__main__":
    common_init.configure_logging()
    log = structlog.get_logger()
    transformer = CovidCountyDataTransformer.make_with_data_root(
        DATA_ROOT,
        os.environ.get("CMDC_API_KEY", None),
        log,
    )
    common_df.write_csv(
        common_df.only_common_columns(transformer.transform(), log),
        DATA_ROOT / "cases-covid-county-data" / "timeseries-common.csv",
        log,
    )
Example #24

    # Merge counties and states back together.
    out_df = pd.concat([counties_df, states_df])

    # Add country metadata.
    out_df[CommonFields.COUNTRY] = "USA"

    out_df = filter_early_data(out_df)

    return out_df


def filter_early_data(df):
    keep_rows = df[CommonFields.DATE.value] >= pd.to_datetime(DEFAULT_START_DATE)
    df = df.loc[keep_rows]

    for (fips, start_date) in CUSTOM_START_DATES.items():
        keep_rows = (df[Fields.FIPS] != fips) | (
            df[CommonFields.DATE] >= pd.to_datetime(start_date)
        )
        df = df.loc[keep_rows]

    return df


if __name__ == "__main__":

    common_init.configure_logging()
    all_df = update(DATA_URL)
    common_df.write_csv(all_df, OUTPUT_PATH, _logger)
Example #25
                        value=value,
                    )
                    del row[key]

            row[CommonFields.COUNTY] = county_name
            row[CommonFields.FIPS] = county_to_fips[county_name]
            yield row

    def transform(self):
        county_to_fips = (
            helpers.load_county_fips_data(self.county_fips_csv)
            .loc[lambda x: x[CommonFields.STATE] == "NV"]
            .set_index([CommonFields.COUNTY])
            .loc[:, CommonFields.FIPS]
            .to_dict()
        )

        df = pd.DataFrame.from_records(self._yield_rows(county_to_fips))
        return df


if __name__ == "__main__":
    common_init.configure_logging()

    log = structlog.get_logger()
    common_df.write_csv(
        CsvCopy.make_with_data_root(DATA_ROOT).transform(),
        DATA_ROOT / "states" / "nv" / "nha_hospitalization_county.csv",
        log,
    )
Example #26

    class Config:
        arbitrary_types_allowed = True

    @staticmethod
    def make_with_data_root(data_root: pathlib.Path,) -> "TexasFipsHospitalizationsUpdater":
        return TexasFipsHospitalizationsUpdater(
            hospitalizations_by_tsa_csv=data_root / "states" / "tx" / "tx_tsa_hospitalizations.csv",
            county_fips_csv=data_root / "misc" / "fips_population.csv",
            tsa_to_fips_csv=data_root / "states" / "tx" / "tx_tsa_region_fips_map.csv",
        )

    def update(self):
        hosp_by_tsa_date = pd.read_csv(self.hospitalizations_by_tsa_csv, dtype={Fields.FIPS: str})
        census_data = census_data_helpers.load_county_fips_data(self.county_fips_csv)
        tsa_to_fips = pd.read_csv(self.tsa_to_fips_csv, dtype={Fields.FIPS: str})
        output = build_hospitalizations_spread_by_population(
            hosp_by_tsa_date, census_data.data, tsa_to_fips
        )
        output[CommonFields.AGGREGATE_LEVEL] = "county"
        output[CommonFields.COUNTRY] = "USA"
        return output


if __name__ == "__main__":
    common_init.configure_logging()
    log = structlog.get_logger()
    updater = TexasFipsHospitalizationsUpdater.make_with_data_root(DATA_ROOT)
    data = updater.update()
    output_csv = DATA_ROOT / "states" / "tx" / "tx_fips_hospitalizations.csv"
    common_df.write_csv(data, output_csv, log)