def to_csv(self, path: pathlib.Path):
    """Write this timeseries' DataFrame to a CSV file.

    Args:
        path: Destination CSV path.
    """
    logger = structlog.get_logger()
    common_df.write_csv(self.data, path, logger)
def save_combined_csv(csv_path_format, output_dir):
    """Save the combined datasets DataFrame, cleaned up for easier comparisons."""
    output_path = form_path_name(csv_path_format, output_dir)
    dataset = combined_datasets.build_us_timeseries_with_all_fields()
    common_df.write_csv(dataset.data, output_path, structlog.get_logger())
def main(fetch: bool):
    """Entry point: optionally fetch the CovidCountyDataset, transform it, write the common CSV."""
    common_init.configure_logging()
    logger = structlog.get_logger()
    dataset = ccd_helpers.CovidCountyDataset.load(fetch=fetch)
    transformed = transform(dataset)
    common_df.write_csv(transformed, OUTPUT_PATH, logger)
def main(fetch: bool):
    """Entry point: optionally refresh Covid Care Map source data, then write the static CSV."""
    common_init.configure_logging()
    logger = structlog.get_logger()
    updater = CovidCareMapUpdater()
    if fetch:
        updater.update()
    transformed = updater.transform()
    # Static (non-timeseries) data: the index is FIPS only, no date column.
    common_df.write_csv(transformed, STATIC_CSV_PATH, logger, [CommonFields.FIPS])
def test_write_csv_empty():
    """An empty DataFrame still produces a header row, after the index fix-up."""
    empty_df = pd.DataFrame(
        [], columns=[CommonFields.DATE, CommonFields.FIPS, CommonFields.CASES]
    )
    with temppathlib.NamedTemporaryFile("w+") as tmp, structlog.testing.capture_logs() as logs:
        common_df.write_csv(empty_df, tmp.path, structlog.get_logger())
        assert tmp.file.read() == "fips,date,cases\n"
        assert [entry["event"] for entry in logs] == [
            "Fixing DataFrame index",
            "Writing DataFrame",
        ]
def main(replace_local_mirror: bool, generate_common_csv: bool):
    """Entry point: optionally refresh the local mirror and/or emit the common timeseries CSV."""
    common_init.configure_logging()
    if replace_local_mirror:
        update_datasets()
    if generate_common_csv:
        transformed = transform_cms_datasets()
        common_df.write_csv(transformed, TIMESERIES_CSV_PATH, _logger)
def main(fetch: bool):
    """Entry point: optionally fetch CDS source data, then transform and write the common CSV."""
    common_init.configure_logging()
    logger = structlog.get_logger(updater="CovidDataScraperTransformer")
    output_path = DATA_ROOT / "cases-cds" / "timeseries-common.csv"
    transformer = CovidDataScraperTransformer.make_with_data_root(DATA_ROOT, logger)
    if fetch:
        transformer.fetch()
    common_df.write_csv(transformer.transform(), output_path, logger)
def test_remove_index_column():
    """A column literally named 'index' is dropped before writing."""
    frame = pd.DataFrame(
        [("99", "2020-04-01", "a", 123)],
        columns=["fips", "date", "index", "cases"],
    ).set_index(COMMON_FIELDS_TIMESERIES_KEYS)
    with temppathlib.NamedTemporaryFile("w+") as tmp, structlog.testing.capture_logs() as logs:
        common_df.write_csv(frame, tmp.path, structlog.get_logger())
        assert tmp.file.read() == "fips,date,cases\n99,2020-04-01,123\n"
        assert [entry["event"] for entry in logs] == [
            "Dropping column named 'index'",
            "Writing DataFrame",
        ]
def to_csv(self, path: pathlib.Path):
    """Persists timeseries to CSV, plus a provenance CSV when provenance exists.

    Args:
        path: Path to write to; provenance (if any) is written alongside it
            with a ``-provenance.csv`` suffix.
    """
    common_df.write_csv(self.data, path, structlog.get_logger(), self.COMMON_INDEX_FIELDS)
    if self.provenance is not None:
        # Derive the sibling path with pathlib instead of str.replace: the old
        # replace(".csv", ...) would corrupt a path containing ".csv" anywhere
        # other than the suffix, and silently reuse `path` if the suffix differed.
        provenance_path = path.with_name(path.stem + "-provenance.csv")
        self.provenance.sort_index().to_csv(provenance_path)
def test_write_csv_columns_are_sorted_in_output_with_extras():
    """Non-index columns come out sorted even when extras are interleaved."""
    frame = pd.DataFrame(
        [],
        columns=[CommonFields.DATE, CommonFields.FIPS, "extra2", CommonFields.CASES, "extra1"],
    ).set_index(COMMON_FIELDS_TIMESERIES_KEYS)
    with temppathlib.NamedTemporaryFile("w+") as tmp, structlog.testing.capture_logs() as logs:
        common_df.write_csv(frame, tmp.path, structlog.get_logger())
        assert tmp.file.read() == "fips,date,cases,extra1,extra2\n"
        assert [entry["event"] for entry in logs] == ["Writing DataFrame"]
def test_write_csv_extra_columns_dropped():
    """only_common_columns() filters extras so only CommonFields are written."""
    frame = pd.DataFrame(
        [],
        columns=[CommonFields.DATE, CommonFields.FIPS, "extra1", CommonFields.CASES, "extra2"],
    ).set_index(COMMON_FIELDS_TIMESERIES_KEYS)
    with temppathlib.NamedTemporaryFile("w+") as tmp, structlog.testing.capture_logs() as logs:
        logger = structlog.get_logger()
        common_df.write_csv(only_common_columns(frame, logger), tmp.path, logger)
        assert tmp.file.read() == "fips,date,cases\n"
        assert [entry["event"] for entry in logs] == [
            "Dropping columns not in CommonFields",
            "Writing DataFrame",
        ]
def update(self, *, fetch: bool, log):
    """Optionally snapshot today's source sheet, then rebuild and write the state timeseries."""
    if fetch:
        # Keep a dated copy of the raw sheet so each day's input is reproducible.
        snapshot_name = self.date_today.isoformat() + ".csv"
        snapshot_path = self.gsheets_copy_directory / snapshot_name
        snapshot_path.write_bytes(requests.get(self.source_url).content)
    records = pd.DataFrame.from_records(
        self.yield_dict_per_state_date(),
        columns=["fips", "state", "date", "contact_tracers_count"],
    )
    result = records.sort_values(["fips", "date"])
    result[CommonFields.COUNTRY] = "USA"
    result[CommonFields.AGGREGATE_LEVEL] = "state"
    common_df.write_csv(result, self.state_timeseries_path, log)
def to_csv(self, path: pathlib.Path):
    """Persists the combined timeseries to CSV, plus a provenance CSV when present.

    Args:
        path: Path to write to; provenance (if any) is written alongside it
            with a ``-provenance.csv`` suffix.
    """
    combined = self.combined_df
    # Every row must carry a location id before writing.
    assert combined[CommonFields.LOCATION_ID].notna().all()
    common_df.write_csv(
        combined, path, structlog.get_logger(), [CommonFields.LOCATION_ID, CommonFields.DATE]
    )
    if self.provenance is not None:
        # pathlib derivation avoids the old str.replace(".csv", ...), which
        # mangled paths containing ".csv" anywhere other than the suffix and
        # silently reused `path` when the suffix differed.
        provenance_path = path.with_name(path.stem + "-provenance.csv")
        self.provenance.sort_index().to_csv(provenance_path)
def main(fetch: bool):
    """Entry point: pull the ensemble forecast via Zoltar, transform it, write the timeseries CSV."""
    common_init.configure_logging()
    connection = zoltpy.util.authenticate()
    updater = ForecastHubUpdater.make_with_data_root(
        ForecastModel.ENSEMBLE, connection, DATA_ROOT
    )
    if fetch:
        _logger.info("Fetching new data.")
        updater.update_source_data()
    transformed = updater.transform(updater.load_source_data())
    common_df.write_csv(transformed, updater.timeseries_output_path, _logger)
def main(replace_local_mirror: bool, generate_common_csv: bool):
    """Entry point: optionally refresh the local JSON mirror and/or write the common timeseries CSV."""
    logging.basicConfig(level=logging.INFO)
    common_init.configure_logging()
    if replace_local_mirror:
        update_local_json()
        CovidTrackingDataUpdater().update()
    if generate_common_csv:
        transformed = transform(load_local_json())
        common_df.write_csv(transformed, TIMESERIES_CSV_PATH, structlog.get_logger())
def test_float_na_formatting():
    """Whole floats are written without a decimal point; NA/None become empty fields."""
    frame = pd.DataFrame(
        [("99", "2020-04-01", 1.0, 2, 3), ("99", "2020-04-02", pd.NA, pd.NA, None)],
        columns="fips date metric_a metric_b metric_c".split(),
    ).set_index(COMMON_FIELDS_TIMESERIES_KEYS)
    expected = (
        "fips,date,metric_a,metric_b,metric_c\n"
        "99,2020-04-01,1,2,3\n"
        "99,2020-04-02,,,\n"
    )
    with temppathlib.NamedTemporaryFile("w+") as tmp, structlog.testing.capture_logs() as logs:
        common_df.write_csv(frame, tmp.path, structlog.get_logger())
        assert tmp.file.read() == expected
        assert [entry["event"] for entry in logs] == ["Writing DataFrame"]
def main(check_for_new_data: bool, fetch: bool):
    """Entry point: check-only mode, or fetch/transform/write the NYTimes timeseries CSV."""
    common_init.configure_logging()
    updater = NYTimesUpdater.make_with_data_root(DATA_ROOT)
    if check_for_new_data:
        # Check-only mode: report availability and exit without writing anything.
        if not updater.is_new_data_available():
            raise Exception("No new data available")
        _logger.info("New data available")
        return
    if fetch:
        _logger.info("Fetching new data.")
        updater.update_source_data()
    transformed = updater.transform(updater.load_state_and_county_data())
    common_df.write_csv(transformed, updater.timeseries_output_path, _logger)
def main(replace_local_mirror: bool, generate_common_csv: bool):
    """Entry point: optionally refresh the mirrored CSV and/or write the common timeseries CSV."""
    common_init.configure_logging()
    if replace_local_mirror:
        update_dataset_csv()
    if generate_common_csv:
        raw = pd.read_csv(
            DATASET_CSV_PATH,
            parse_dates=[Fields.DATE],
            dtype={Fields.STATE_FIPS: str},  # keep leading zeros in FIPS codes
            low_memory=False,
        )
        common_df.write_csv(transform(raw), TIMESERIES_CSV_PATH, _logger)
def test_write_csv_without_date():
    """write_csv honors a caller-supplied index_names list that lacks DATE."""
    frame = pd.DataFrame(
        {
            CommonFields.FIPS: ["06045", "45123"],
            "extra_index": ["idx_1", "idx_2"],
            CommonFields.CASES: [234, 456],
            "extra_column": ["extra_data", None],
        }
    )
    expected = (
        "fips,extra_index,cases,extra_column\n"
        "06045,idx_1,234,extra_data\n"
        "45123,idx_2,456,\n"
    )
    with temppathlib.NamedTemporaryFile("w+") as tmp, structlog.testing.capture_logs() as logs:
        common_df.write_csv(
            frame, tmp.path, structlog.get_logger(), index_names=[CommonFields.FIPS, "extra_index"]
        )
        assert tmp.file.read() == expected
        assert [entry["event"] for entry in logs] == [
            "Fixing DataFrame index",
            "Writing DataFrame",
        ]
def test_float_formatting():
    """Floats round-trip with minimal formatting: integral values drop the decimal part."""
    source_csv = (
        "fips,date,col_1,col_2,col_3,col_4,col_5,col_6\n"
        "99123,2020-04-01,1,2.0000000,3,0.0004,0.00005,6000000000\n"
        "99123,2020-04-02,,,,,,\n"
        "99123,2020-04-03,1,2,3.1234567,4,5,6.0\n"
    )
    frame = read_csv_to_indexed_df(StringIO(source_csv))
    expected = (
        "fips,date,col_1,col_2,col_3,col_4,col_5,col_6\n"
        "99123,2020-04-01,1,2,3,0.0004,5e-05,6000000000\n"
        "99123,2020-04-02,,,,,,\n"
        "99123,2020-04-03,1,2,3.1234567,4,5,6\n"
    )
    with temppathlib.NamedTemporaryFile("w+") as tmp, structlog.testing.capture_logs() as logs:
        common_df.write_csv(frame, tmp.path, structlog.get_logger())
        assert tmp.file.read() == expected
        assert [entry["event"] for entry in logs] == ["Writing DataFrame"]
def test_write_csv():
    """write_csv emits canonical fips,date ordering regardless of the input index."""
    frame = pd.DataFrame(
        {
            CommonFields.DATE: ["2020-04-01", "2020-04-02"],
            CommonFields.FIPS: ["06045", "45123"],
            CommonFields.CASES: [234, 456],
        }
    )
    snapshot = frame.copy()
    expected = "fips,date,cases\n06045,2020-04-01,234\n45123,2020-04-02,456\n"

    def _write_and_check(df, expected_events):
        # Shared stanza: write `df`, then verify both the CSV bytes and log events.
        with temppathlib.NamedTemporaryFile("w+") as tmp, structlog.testing.capture_logs() as logs:
            common_df.write_csv(df, tmp.path, structlog.get_logger())
            assert tmp.file.read() == expected
            assert [entry["event"] for entry in logs] == expected_events

    # Already indexed by the expected ["fips", "date"] keys: no fix-up needed.
    _write_and_check(frame.set_index(["fips", "date"]), ["Writing DataFrame"])
    # Any other index (or none) is rebuilt before writing, producing identical output.
    fix_events = ["Fixing DataFrame index", "Writing DataFrame"]
    _write_and_check(frame, fix_events)
    _write_and_check(frame.set_index(["date", "cases"]), fix_events)
    _write_and_check(frame.set_index(["date", "fips"]), fix_events)
    # The input DataFrame itself is never mutated by write_csv.
    assert repr(frame) == repr(snapshot)
def main():
    """Entry point: load the static dataset and write it to CSV keyed by FIPS only."""
    common_init.configure_logging()
    logger = structlog.get_logger()
    dataset = load_dataset()
    common_df.write_csv(dataset, CSV_PATH, logger, index_names=[CommonFields.FIPS])
# NOTE(review): this snippet begins mid-method (inside a transform of
# CovidCountyDataTransformer, presumably) — the enclosing `def` is not visible here.
if bad_rows.any():
    # Drop rows missing values in key columns, logging what was discarded.
    self.log.warning(
        "Dropping rows with null in important columns", bad_rows=str(df.loc[bad_rows])
    )
    df = df.loc[~bad_rows]
# Removing a string of misleading FL current_icu values.
is_incorrect_fl_icu_dates = df[CommonFields.DATE].between("2020-05-14", "2020-05-20")
is_fl_state = df[CommonFields.FIPS] == "12"
df.loc[is_fl_state & is_incorrect_fl_icu_dates, CommonFields.CURRENT_ICU] = None
# verify_integrity guards against duplicate (fips, date) rows.
df = df.set_index(COMMON_FIELDS_TIMESERIES_KEYS, verify_integrity=True)
return df

if __name__ == "__main__":
    common_init.configure_logging()
    log = structlog.get_logger()
    transformer = CovidCountyDataTransformer.make_with_data_root(
        DATA_ROOT, os.environ.get("CMDC_API_KEY", None), log,
    )
    # Filter to common columns, then write the common-format timeseries CSV.
    common_df.write_csv(
        common_df.only_common_columns(transformer.transform(), log),
        DATA_ROOT / "cases-covid-county-data" / "timeseries-common.csv",
        log,
    )
# Merge counties and states back together. out_df = pd.concat([counties_df, states_df]) # Add country metadata. out_df[CommonFields.COUNTRY] = "USA" out_df = filter_early_data(out_df) return out_df def filter_early_data(df): keep_rows = df[CommonFields.DATE.value] >= pd.to_datetime(DEFAULT_START_DATE) df = df.loc[keep_rows] for (fips, start_date) in CUSTOM_START_DATES.items(): keep_rows = (df[Fields.FIPS] != fips) | ( df[CommonFields.DATE] >= pd.to_datetime(start_date) ) df = df.loc[keep_rows] return df if __name__ == "__main__": common_init.configure_logging() all_df = update(DATA_URL) common_df.write_csv(all_df, OUTPUT_PATH, _logger)
# NOTE(review): this snippet begins mid-call inside a row-yielding method — the
# enclosing `def` and the surrounding call are not visible here.
value=value,
)
del row[key]
row[CommonFields.COUNTY] = county_name
row[CommonFields.FIPS] = county_to_fips[county_name]
yield row

def transform(self):
    # Build a county-name -> FIPS mapping restricted to Nevada, then assemble rows.
    county_to_fips = (
        helpers.load_county_fips_data(self.county_fips_csv)
        .loc[lambda x: x[CommonFields.STATE] == "NV"]
        .set_index([CommonFields.COUNTY])
        .loc[:, CommonFields.FIPS]
        .to_dict()
    )
    df = pd.DataFrame.from_records(self._yield_rows(county_to_fips))
    return df

if __name__ == "__main__":
    common_init.configure_logging()
    log = structlog.get_logger()
    common_df.write_csv(
        CsvCopy.make_with_data_root(DATA_ROOT).transform(),
        DATA_ROOT / "states" / "nv" / "nha_hospitalization_county.csv",
        log,
    )
# NOTE(review): this snippet starts inside an enclosing model class (the nested
# Config with arbitrary_types_allowed looks pydantic-style — TODO confirm);
# Config and the two defs below are members of that outer class.
class Config:
    arbitrary_types_allowed = True

@staticmethod
def make_with_data_root(data_root: pathlib.Path,) -> "TexasFipsHospitalizationsUpdater":
    # Alternate constructor wiring the standard TX file layout under data_root.
    return TexasFipsHospitalizationsUpdater(
        hospitalizations_by_tsa_csv=data_root / "states" / "tx" / "tx_tsa_hospitalizations.csv",
        county_fips_csv=data_root / "misc" / "fips_population.csv",
        tsa_to_fips_csv=data_root / "states" / "tx" / "tx_tsa_region_fips_map.csv",
    )

def update(self):
    # Read TSA-level hospitalizations plus mapping tables, then spread counts
    # to counties via build_hospitalizations_spread_by_population.
    hosp_by_tsa_date = pd.read_csv(self.hospitalizations_by_tsa_csv, dtype={Fields.FIPS: str})
    census_data = census_data_helpers.load_county_fips_data(self.county_fips_csv)
    tsa_to_fips = pd.read_csv(self.tsa_to_fips_csv, dtype={Fields.FIPS: str})
    output = build_hospitalizations_spread_by_population(
        hosp_by_tsa_date, census_data.data, tsa_to_fips
    )
    output[CommonFields.AGGREGATE_LEVEL] = "county"
    output[CommonFields.COUNTRY] = "USA"
    return output

if __name__ == "__main__":
    common_init.configure_logging()
    log = structlog.get_logger()
    updater = TexasFipsHospitalizationsUpdater.make_with_data_root(DATA_ROOT)
    data = updater.update()
    output_csv = DATA_ROOT / "states" / "tx" / "tx_fips_hospitalizations.csv"
    common_df.write_csv(data, output_csv, log)