def test_update_sensor(self):
    """Tests that the sensors are properly updated."""
    for geo in ["state", "hrr"]:
        td = TemporaryDirectory()
        su_inst = CHCSensorUpdator(
            "02-01-2020",
            "06-01-2020",
            "06-12-2020",
            geo,
            self.parallel,
            self.weekday,
            self.numtype,
            self.se
        )
        with mock_s3():
            # Create the fake bucket we will be using
            params = read_params()
            aws_credentials = params["aws_credentials"]
            s3_client = Session(**aws_credentials).client("s3")
            s3_client.create_bucket(Bucket=params["bucket_name"])
            su_inst.update_sensor(self.small_test_data, td.name)

        assert len(os.listdir(td.name)) == len(su_inst.sensor_dates), \
            f"failed {geo} update sensor test"
        td.cleanup()
def run_as_module(date):
    # Clean directories
    for fname in listdir("receiving"):
        if ".csv" in fname:
            remove(join("receiving", fname))

    for fname in listdir("cache"):
        if ".csv" in fname:
            remove(join("cache", fname))

    for fname in listdir("daily_cache"):
        if ".csv" in fname:
            remove(join("daily_cache", fname))

    # Simulate the cache already being partially populated
    copy("test_data/weekly_202025_state_wip_deaths_covid_incidence_prop.csv", "daily_cache")

    for fname in listdir("daily_receiving"):
        if ".csv" in fname:
            remove(join("daily_receiving", fname))

    with mock_s3():
        with freeze_time(date):
            # Create the fake bucket we will be using
            params = read_params()
            aws_credentials = params["aws_credentials"]
            s3_client = Session(**aws_credentials).client("s3")
            s3_client.create_bucket(Bucket=params["bucket_name"])

            run_module()
def test_match_old_smoothed_output(self, run_as_module,
                                   wip_signal=read_params()["wip_signal"]):
    """Tests that smoothed output files don't change over time."""
    if wip_signal:
        files = [
            "20200419_hrr_wip_smoothed_search.csv",
            "20200419_msa_wip_smoothed_search.csv",
            "20200419_state_wip_smoothed_search.csv",
            "20200419_dma_wip_smoothed_search.csv",
        ]
    else:
        files = [
            "20200419_hrr_smoothed_search.csv",
            "20200419_msa_smoothed_search.csv",
            "20200419_state_smoothed_search.csv",
            "20200419_dma_smoothed_search.csv",
        ]

    for fname in files:
        test_df = pd.read_csv(join("receiving_test", fname))
        new_df = pd.read_csv(join("receiving", fname))
        assert_frame_equal(test_df, new_df)
def run_module():
    """Run the validator as a module."""
    parent_params = read_params()
    params = parent_params['validation']
    validator = Validator(params)
    validator.validate(parent_params["export_dir"]).print_and_exit()
def test_class(self, run_as_module, wip_signal=read_params()["wip_signal"]):
    """Tests output file existence."""
    if wip_signal:
        assert exists(join("receiving", "20200419_hrr_wip_raw_search.csv"))
        assert exists(join("receiving", "20200419_msa_wip_raw_search.csv"))
        assert exists(join("receiving", "20200419_state_wip_raw_search.csv"))
        assert exists(join("receiving", "20200419_dma_wip_raw_search.csv"))
        assert exists(join("receiving", "20200315_hrr_wip_raw_search.csv"))
        assert exists(join("receiving", "20200315_msa_wip_raw_search.csv"))
        assert exists(join("receiving", "20200315_state_wip_raw_search.csv"))
        assert exists(join("receiving", "20200315_dma_wip_raw_search.csv"))
    else:
        assert exists(join("receiving", "20200419_hrr_raw_search.csv"))
        assert exists(join("receiving", "20200419_msa_raw_search.csv"))
        assert exists(join("receiving", "20200419_state_raw_search.csv"))
        assert exists(join("receiving", "20200419_dma_raw_search.csv"))
        assert exists(join("receiving", "20200315_hrr_raw_search.csv"))
        assert exists(join("receiving", "20200315_msa_raw_search.csv"))
        assert exists(join("receiving", "20200315_state_raw_search.csv"))
        assert exists(join("receiving", "20200315_dma_raw_search.csv"))
def test_pull_quidel_data(self):
    params = read_params()
    dfs, _ = pull_quidel_data(params)

    # For covid_ag
    df = dfs["covid_ag"]
    first_date = df["timestamp"].min().date()
    last_date = df["timestamp"].max().date()
    assert [first_date.month, first_date.day] == [7, 2]
    assert [last_date.month, last_date.day] == [7, 23]
    assert (df.columns == [
        'timestamp', 'zip', 'totalTest', 'numUniqueDevices', 'positiveTest'
    ]).all()

    # For flu_ag
    df = dfs["flu_ag"]
    first_date = df["timestamp"].min().date()
    last_date = df["timestamp"].max().date()
    assert [first_date.month, first_date.day] == [6, 22]
    assert [last_date.month, last_date.day] == [8, 17]
    assert (df.columns == [
        'timestamp', 'zip', 'totalTest', 'numUniqueDevices', 'positiveTest'
    ]).all()
def update_sensor(
        state_files: List[str],
        mmwr_info: pd.DataFrame,
        output_path: str,
        start_date: datetime,
        end_date: datetime) -> pd.DataFrame:
    """
    Generate sensor values, and write to csv format.

    Args:
        state_files: List of JSON files representing COVID-NET hospitalization data for each state
        mmwr_info: Mappings from MMWR week to actual dates, as a pd.DataFrame
        output_path: Path to write the csvs to
        start_date: First sensor date (datetime.datetime)
        end_date: Last sensor date (datetime.datetime)

    Returns:
        The overall pd.DataFrame after all processing
    """
    assert start_date < end_date, "start_date >= end_date"

    # Combine and format hospitalizations dataframe
    hosp_df = CovidNet.read_all_hosp_data(state_files)
    hosp_df = hosp_df.merge(mmwr_info, how="left",
                            left_on=["mmwr-year", "mmwr-week"],
                            right_on=["year", "weeknumber"])

    # Select relevant columns and standardize naming
    hosp_df = hosp_df.loc[:, APIConfig.HOSP_RENAME_COLS.keys()]\
        .rename(columns=APIConfig.HOSP_RENAME_COLS)

    # Restrict to start and end date
    hosp_df = hosp_df[
        (hosp_df["date"] >= start_date) & (hosp_df["date"] < end_date)
    ]

    # Set state id to two-letter abbreviation
    gmpr = GeoMapper()
    hosp_df = gmpr.add_geocode(hosp_df,
                               from_col=APIConfig.STATE_COL,
                               from_code="state_name",
                               new_code="state_id",
                               dropna=False)
    # To use the original column name, reassign original column and drop new one
    hosp_df[APIConfig.STATE_COL] = hosp_df["state_id"]
    hosp_df.drop("state_id", axis=1, inplace=True)

    assert not hosp_df.duplicated(["date", "geo_id"]).any(), "Non-unique (date, geo_id) pairs"
    hosp_df.set_index(["date", "geo_id"], inplace=True)

    # Fill in remaining expected columns
    hosp_df["se"] = np.nan
    hosp_df["sample_size"] = np.nan

    # Write results
    signals = add_prefix(SIGNALS, wip_signal=read_params()["wip_signal"], prefix="wip_")
    for signal in signals:
        write_to_csv(hosp_df, signal, output_path)

    return hosp_df
def run_module():
    """
    Calls the method for handling the wip signals.

    Returns
    -------
    prints the updated signal names
    """
    params = read_params()
    wip_signal = params["wip_signal"]
    signal_names = add_prefix(SIGNALS, wip_signal, prefix="wip_")
    print(signal_names)
def run_module() -> None:
    """Run entire hhs_facilities indicator."""
    params = read_params()
    raw_df = pull_data()
    gmpr = GeoMapper()
    filled_fips_df = fill_missing_fips(raw_df, gmpr)
    for geo, (sig_name, sig_cols, sig_func, sig_offset) in product(GEO_RESOLUTIONS, SIGNALS):
        mapped_df = convert_geo(filled_fips_df, geo, gmpr)
        output_df = generate_signal(mapped_df, sig_cols, sig_func, sig_offset)
        create_export_csv(output_df, params["export_dir"], geo, sig_name)
def run_as_module():
    # Clean receiving directory
    for fname in listdir("receiving"):
        if fname != ".gitignore":
            remove(join("receiving", fname))

    with mock_s3():
        # Create the fake bucket we will be using
        params = read_params()
        aws_credentials = params["aws_credentials"]
        s3_client = Session(**aws_credentials).client("s3")
        s3_client.create_bucket(Bucket=params["bucket_name"])

        run_module()
def run_module():
    params = read_params()
    qparams = params['qualtrics']
    qparams['qualtrics_dir'] = params['input_dir']
    if not os.path.exists(qparams['qualtrics_dir']):
        os.makedirs(qparams['qualtrics_dir'])
    if not qparams['token']:
        print("\nDRY-RUN MODE\n")
    fetch, post = make_fetchers(qparams)
    get(fetch, post, qparams)
def test_output_files(self, run_as_module):
    """Tests that the output files contain the correct results of the run."""
    params = read_params()

    # Test output exists
    csv_files = listdir("receiving")

    dates_for_covid_ag = [
        "20200702", "20200703", "20200704", "20200705",
        "20200706", "20200707", "20200708", "20200709"
    ]
    dates_for_flu_ag = [
        "20200623", "20200624", "20200625", "20200626", "20200627", "20200628",
        "20200629", "20200630", "20200701", "20200702", "20200703"
    ]
    geos = GEO_RESOLUTIONS.copy()
    sensors = add_prefix(list(SENSORS.keys()),
                         wip_signal=params["wip_signal"],
                         prefix="wip_")

    expected_files = []
    for geo in geos:
        for sensor in sensors:
            if "covid_ag" in sensor:
                for date in dates_for_covid_ag:
                    expected_files += [date + "_" + geo + "_" + sensor + ".csv"]
            else:
                for date in dates_for_flu_ag:
                    expected_files += [date + "_" + geo + "_" + sensor + ".csv"]

    assert set(expected_files).issubset(set(csv_files))

    # Test output format
    df = pd.read_csv(
        join("./receiving", "20200709_state_covid_ag_raw_pct_positive.csv"))
    assert (df.columns.values == ["geo_id", "val", "se", "sample_size"]).all()

    # test_intermediate_file
    flag = None
    for fname in listdir("./cache"):
        if ".csv" in fname:
            flag = 1
    assert flag is not None
def test_pull_quidel_covidtest(self):
    params = read_params()
    df, _ = pull_quidel_covidtest(params)

    first_date = df["timestamp"].min().date()
    last_date = df["timestamp"].max().date()

    assert [first_date.month, first_date.day] == [7, 2]
    assert [last_date.month, last_date.day] == [7, 23]
    assert (df.columns == [
        'timestamp', 'zip', 'totalTest', 'numUniqueDevices', 'positiveTest'
    ]).all()
def __init__(self, startdate, enddate, dropdate, geo, parallel,
             weekday, numtype, se):
    """Init Sensor Updator.

    Args:
        startdate: first sensor date (YYYY-mm-dd)
        enddate: last sensor date (YYYY-mm-dd)
        dropdate: data drop date (YYYY-mm-dd)
        geo: geographic resolution, one of ["county", "state", "msa", "hrr", "hhs", "nation"]
        parallel: boolean to run the sensor update in parallel
        weekday: boolean to adjust for weekday effects
        numtype: type of count data used, one of ["covid", "cli"]
        se: boolean to write out standard errors; if true, use an obfuscated name
    """
    self.startdate, self.enddate, self.dropdate = [
        pd.to_datetime(t) for t in (startdate, enddate, dropdate)]

    # handle dates
    assert (self.startdate > (Config.FIRST_DATA_DATE + Config.BURN_IN_PERIOD)
            ), f"not enough data to produce estimates starting {self.startdate}"
    assert self.startdate < self.enddate, "start date >= end date"
    assert self.enddate <= self.dropdate, "end date > drop date"

    self.geo, self.parallel, self.weekday, self.numtype, self.se = geo.lower(), parallel, \
        weekday, numtype, se

    # output file naming
    if self.numtype == "covid":
        signals = [SMOOTHED_ADJ if self.weekday else SMOOTHED]
    elif self.numtype == "cli":
        signals = [SMOOTHED_ADJ_CLI if self.weekday else SMOOTHED_CLI]
    signal_names = add_prefix(
        signals, wip_signal=read_params()["wip_signal"])
    self.updated_signal_names = signal_names

    # initialize members set in shift_dates()
    self.burnindate = None
    self.fit_dates = None
    self.burn_in_dates = None
    self.sensor_dates = None
def configure(variants):
    """Validate params file and set date range."""
    params = read_params()
    params['export_start_date'] = date(*params['export_start_date'])
    yesterday = date.today() - timedelta(days=1)
    if params['date_range'] == 'new':
        # only create combined file for the newest update
        # (usually for yesterday, but check just in case)
        params['date_range'] = [
            min(
                yesterday,
                next_missing_day(params["source"],
                                 set(signal[-1] for signal in variants))),
            yesterday
        ]
    elif params['date_range'] == 'all':
        # create combined files for all of the historical reports
        params['date_range'] = [params['export_start_date'], yesterday]
    else:
        match_res = re.findall(re.compile(r'^\d{8}-\d{8}$'), params['date_range'])
        if len(match_res) == 0:
            raise ValueError(
                "Invalid date_range parameter. Please choose from (new, all, yyyymmdd-yyyymmdd)."
            )
        try:
            date1 = datetime.strptime(params['date_range'][:8], '%Y%m%d').date()
        except ValueError as error:
            raise ValueError(
                "Invalid date_range parameter. Please check the first date."
            ) from error
        try:
            date2 = datetime.strptime(params['date_range'][-8:], '%Y%m%d').date()
        except ValueError as error:
            raise ValueError(
                "Invalid date_range parameter. Please check the second date."
            ) from error

        # Clamp to the valid start date
        if date1 < params['export_start_date']:
            date1 = params['export_start_date']
        params['date_range'] = [date1, date2]
    return params
def run_module():
    """Run Google Symptoms module."""
    params = read_params()
    export_start_date = datetime.strptime(params["export_start_date"], "%Y-%m-%d")
    export_dir = params["export_dir"]
    base_url = params["base_url"]

    # Pull GS data
    dfs = pull_gs_data(base_url)
    gmpr = geomap.GeoMapper()

    for geo_res in GEO_RESOLUTIONS:
        if geo_res == "state":
            df_pull = dfs["state"]
        elif geo_res in ["hhs", "nation"]:
            df_pull = gmpr.replace_geocode(dfs["county"], "fips", geo_res,
                                           from_col="geo_id", date_col="timestamp")
            df_pull.rename(columns={geo_res: "geo_id"}, inplace=True)
        else:
            df_pull = geo_map(dfs["county"], geo_res)

        for metric, smoother in product(METRICS + [COMBINED_METRIC], SMOOTHERS):
            print(geo_res, metric, smoother)
            df = df_pull.set_index(["timestamp", "geo_id"])
            df["val"] = df[metric].groupby(level=1).transform(
                SMOOTHERS_MAP[smoother][0])
            df["se"] = np.nan
            df["sample_size"] = np.nan
            # Drop early entries where data insufficient for smoothing
            df = df.loc[~df["val"].isnull(), :]
            df = df.reset_index()
            sensor_name = "_".join([smoother, "search"])
            create_export_csv(
                df,
                export_dir=export_dir,
                start_date=SMOOTHERS_MAP[smoother][1](export_start_date),
                metric=metric.lower(),
                geo_res=geo_res,
                sensor=sensor_name)
def run_module():
    """Parse parameters and generate csv files for the COVID-NET sensor."""
    params = read_params()

    logging.basicConfig(level=logging.DEBUG)

    start_date = datetime.strptime(params["start_date"], "%Y-%m-%d")

    # If no end_date is specified, assume it is the current date
    if params["end_date"] == "":
        end_date = datetime.now()
    else:
        end_date = datetime.strptime(params["end_date"], "%Y-%m-%d")

    logging.info("start date:\t%s", start_date.date())
    logging.info("end date:\t%s", end_date.date())

    logging.info("outpath:\t%s", params["export_dir"])
    logging.info("parallel:\t%s", params["parallel"])

    # Only geo is state, and no weekday adjustment for now
    # COVID-NET data is by weeks anyway, not daily
    logging.info("starting state, no adj")

    # Download latest COVID-NET files into the cache directory first
    mappings_file = join(params["cache_dir"], "init.json")
    CovidNet.download_mappings(outfile=mappings_file)
    _, mmwr_info, _ = CovidNet.read_mappings(mappings_file)
    state_files = CovidNet.download_all_hosp_data(
        mappings_file, params["cache_dir"],
        parallel=params["parallel"])

    update_sensor(
        state_files,
        mmwr_info,
        params["export_dir"],
        start_date,
        end_date)

    # Cleanup cache dir
    remove(mappings_file)
    for state_file in state_files:
        remove(state_file)

    logging.info("finished all")
def run_module():
    start_time = time.time()
    params = read_params()
    meta = covidcast.metadata()
    slack_notifier = None
    if "channel" in params and "slack_token" in params:
        slack_notifier = SlackNotifier(params["channel"], params["slack_token"])

    complaints = []
    for data_source in params["sources"].keys():
        complaints.extend(
            check_source(data_source, meta,
                         params["sources"], params.get("grace", 0), LOGGER))

    if len(complaints) > 0:
        report_complaints(complaints, slack_notifier)

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    LOGGER.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds)
def run_module():
    """Generate ground truth HHS hospitalization data."""
    params = read_params()
    mapper = GeoMapper()
    request_all_states = ",".join(mapper.get_geo_values("state_id"))

    today = date.today()
    past_reference_day = date(year=2020, month=1, day=1)  # first available date in DB
    date_range = generate_date_ranges(past_reference_day, today)

    dfs = []
    for r in date_range:
        response = Epidata.covid_hosp(request_all_states, r)
        if response['result'] != 1:
            raise Exception(f"Bad result from Epidata: {response['message']}")
        dfs.append(pd.DataFrame(response['epidata']))
    all_columns = pd.concat(dfs)

    for sig in SIGNALS:
        create_export_csv(make_signal(all_columns, sig),
                          params["export_dir"],
                          "state",
                          sig)
def test_output_files(self, run_as_module):
    """Tests that the proper files are output."""
    # Test output exists
    csv_files = listdir("receiving")
    dates = [
        "20200702", "20200703", "20200704", "20200705",
        "20200706", "20200707", "20200708", "20200709"
    ]
    geos = GEO_RESOLUTIONS.copy()
    sensors = add_prefix(SENSORS,
                         wip_signal=read_params()["wip_signal"],
                         prefix="wip_")

    expected_files = []
    for date in dates:
        for geo in geos:
            for sensor in sensors:
                expected_files += [date + "_" + geo + "_" + sensor + ".csv"]

    assert set(expected_files).issubset(set(csv_files))

    # Test output format
    df = pd.read_csv(
        join("./receiving", "20200709_state_covid_ag_raw_pct_positive.csv"))
    assert (df.columns.values == ["geo_id", "val", "se", "sample_size"]).all()

    # test_intermediate_file
    flag = None
    for fname in listdir("./cache"):
        if ".csv" in fname:
            flag = 1
    assert flag is not None
def run_module():
    params = read_params()
    meta = covidcast.metadata()

    complaints = []
    for data_source in params["sources"].keys():
        complaints.extend(
            check_source(data_source, meta,
                         params["sources"], params.get("grace", 0), LOGGER))

    if len(complaints) > 0:
        for complaint in complaints:
            LOGGER.critical(
                event="signal out of SLA",
                message=complaint.message,
                data_source=complaint.data_source,
                signal=complaint.signal,
                geo_types=complaint.geo_types,
                last_updated=complaint.last_updated.strftime("%Y-%m-%d"))

        report_complaints(complaints, params)
        sys.exit(1)
def run_module():
    """Run the delphi_changehc module."""
    params = read_params()

    logging.basicConfig(level=logging.DEBUG)

    make_asserts(params)

    if params["drop_date"] is None:
        # files are dropped about 4pm the day after the issue date
        dropdate_dt = (datetime.now() - timedelta(days=1, hours=16))
        dropdate_dt = dropdate_dt.replace(hour=0, minute=0, second=0, microsecond=0)
    else:
        dropdate_dt = datetime.strptime(params["drop_date"], "%Y-%m-%d")
    filedate = dropdate_dt.strftime("%Y%m%d")

    file_dict = retrieve_files(params, filedate)

    dropdate = str(dropdate_dt.date())

    # range of estimates to produce
    n_backfill_days = params["n_backfill_days"]  # produce estimates for n_backfill_days
    n_waiting_days = params["n_waiting_days"]  # most recent n_waiting_days won't be est
    enddate_dt = dropdate_dt - timedelta(days=n_waiting_days)
    startdate_dt = enddate_dt - timedelta(days=n_backfill_days)
    enddate = str(enddate_dt.date())
    startdate = str(startdate_dt.date())

    # now allow manual overrides
    if params["end_date"] is not None:
        enddate = params["end_date"]
    if params["start_date"] is not None:
        startdate = params["start_date"]

    logging.info("first sensor date:\t%s", startdate)
    logging.info("last sensor date:\t%s", enddate)
    logging.info("drop date:\t\t%s", dropdate)
    logging.info("n_backfill_days:\t%s", n_backfill_days)
    logging.info("n_waiting_days:\t%s", n_waiting_days)

    ## print out other vars
    logging.info("geos:\t\t\t%s", params["geos"])
    logging.info("outpath:\t\t%s", params["export_dir"])
    logging.info("parallel:\t\t%s", params["parallel"])
    logging.info("weekday:\t\t%s", params["weekday"])
    logging.info("types:\t\t%s", params["types"])
    logging.info("se:\t\t\t%s", params["se"])

    ## start generating
    for geo in params["geos"]:
        for numtype in params["types"]:
            for weekday in params["weekday"]:
                if weekday:
                    logging.info("starting %s, %s, weekday adj", geo, numtype)
                else:
                    logging.info("starting %s, %s, no adj", geo, numtype)
                su_inst = CHCSensorUpdator(
                    startdate, enddate, dropdate, geo,
                    params["parallel"], weekday, numtype,
                    params["se"]
                )
                if numtype == "covid":
                    data = load_combined_data(file_dict["denom"],
                                              file_dict["covid"], dropdate_dt, "fips")
                elif numtype == "cli":
                    data = load_cli_data(file_dict["denom"], file_dict["flu"],
                                         file_dict["mixed"], file_dict["flu_like"],
                                         file_dict["covid_like"], dropdate_dt, "fips")
                su_inst.update_sensor(data, params["export_dir"])
        logging.info("finished %s", geo)

    logging.info("finished all")
def get_logger():
    params = read_params()
    return get_structured_logger(
        __name__,
        filename=params.get("log_filename"),
        log_exceptions=params.get("log_exceptions", True))
def test_return_params(self):
    params = read_params()
    assert params["test"] == "yes"
def test_copy_template(self):
    os.remove("params.json")
    params = read_params()
    assert params["test"] == "yes"
# -*- coding: utf-8 -*-
"""Call the function run_module when executed.

This file indicates that calling the module (`python -m MODULE_NAME`) will
call the function `run_module` found within the run.py file. There should be
no need to change this template.
"""

from delphi_utils import read_params
from .run import run_module  # pragma: no cover

run_module(read_params())  # pragma: no cover
def run_module():
    """Run the usafacts indicator."""
    params = read_params()
    export_start_date = params["export_start_date"]
    if export_start_date == "latest":
        export_start_date = datetime.combine(date.today(), time(0, 0)) - timedelta(days=1)
    else:
        export_start_date = datetime.strptime(export_start_date, "%Y-%m-%d")
    export_dir = params["export_dir"]
    base_url = params["base_url"]
    cache_dir = params["cache_dir"]

    arch_diff = S3ArchiveDiffer(
        cache_dir, export_dir,
        params["bucket_name"], "usafacts",
        params["aws_credentials"])
    arch_diff.update_cache()

    geo_mapper = GeoMapper()

    dfs = {metric: pull_usafacts_data(base_url, metric, geo_mapper) for metric in METRICS}
    for metric, geo_res, sensor, smoother in product(
            METRICS, GEO_RESOLUTIONS, SENSORS, SMOOTHERS):
        print(geo_res, metric, sensor, smoother)
        df = dfs[metric]
        # Aggregate to appropriate geographic resolution
        df = geo_map(df, geo_res, sensor)
        df["val"] = SMOOTHERS_MAP[smoother][0].smooth(df[sensor].values)
        df["se"] = np.nan
        df["sample_size"] = np.nan
        # Drop early entries where data insufficient for smoothing
        df = df.loc[~df["val"].isnull(), :]
        sensor_name = SENSOR_NAME_MAP[sensor][0]
        # if (SENSOR_NAME_MAP[sensor][1] or SMOOTHERS_MAP[smoother][2]):
        #     metric = f"wip_{metric}"
        #     sensor_name = WIP_SENSOR_NAME_MAP[sensor][0]
        sensor_name = SMOOTHERS_MAP[smoother][1] + sensor_name
        create_export_csv(
            df,
            export_dir=export_dir,
            start_date=SMOOTHERS_MAP[smoother][3](export_start_date),
            metric=metric,
            geo_res=geo_res,
            sensor=sensor_name,
        )

    # Diff exports, and make incremental versions
    _, common_diffs, new_files = arch_diff.diff_exports()

    # Archive changed and new files only
    to_archive = [f for f, diff in common_diffs.items() if diff is not None]
    to_archive += new_files
    _, fails = arch_diff.archive_exports(to_archive)

    # Filter existing exports to exclude those that failed to archive
    succ_common_diffs = {f: diff for f, diff in common_diffs.items() if f not in fails}
    arch_diff.filter_exports(succ_common_diffs)

    # Report failures: someone should probably look at them
    for exported_file in fails:
        print(f"Failed to archive '{exported_file}'")
def get_logger():
    params = read_params()
    return get_structured_logger(
        __name__, filename=params.get("log_filename"))
from os.path import join, exists
from tempfile import TemporaryDirectory

# third party
from delphi_utils import read_params
import numpy as np
import pandas as pd
import pytest

# first party
from delphi_claims_hosp.config import Config, GeoConstants
from delphi_claims_hosp.update_indicator import ClaimsHospIndicatorUpdater

CONFIG = Config()
CONSTANTS = GeoConstants()
PARAMS = read_params()
DATA_FILEPATH = PARAMS["input_file"]
DROP_DATE = pd.to_datetime(PARAMS["drop_date"])
OUTPATH = "test_data/"


class TestClaimsHospIndicatorUpdater:
    geo = "hrr"
    parallel = False
    weekday = False
    write_se = False
    prefix = "foo"
    small_test_data = pd.DataFrame({
        "num": [0, 100, 200, 300, 400, 500, 600, 100, 200, 300, 400, 500, 600],
        "hrr": [1.0] * 7 + [2.0] * 6,
        "den": [1000] * 7 + [2000] * 6,
def run_module():
    """Run the JHU indicator module."""
    params = read_params()
    export_start_date = params["export_start_date"]
    export_dir = params["export_dir"]
    base_url = params["base_url"]
    cache_dir = params["cache_dir"]
    logger = get_structured_logger(__name__, filename=params.get("log_filename"))

    if len(params["bucket_name"]) > 0:
        arch_diff = S3ArchiveDiffer(
            cache_dir, export_dir,
            params["bucket_name"], "jhu",
            params["aws_credentials"],
        )
        arch_diff.update_cache()
    else:
        arch_diff = None

    gmpr = GeoMapper()
    dfs = {metric: pull_jhu_data(base_url, metric, gmpr) for metric in METRICS}
    for metric, geo_res, sensor, smoother in product(
            METRICS, GEO_RESOLUTIONS, SENSORS, SMOOTHERS):
        print(metric, geo_res, sensor, smoother)
        logger.info(
            event="generating signal and exporting to CSV",
            metric=metric,
            geo_res=geo_res,
            sensor=sensor,
            smoother=smoother)
        df = dfs[metric]
        # Aggregate to appropriate geographic resolution
        df = geo_map(df, geo_res)
        df.set_index(["timestamp", "geo_id"], inplace=True)
        df["val"] = df[sensor].groupby(level=1).transform(SMOOTHERS_MAP[smoother][0])
        df["se"] = np.nan
        df["sample_size"] = np.nan
        # Drop early entries where data insufficient for smoothing
        df = df[~df["val"].isnull()]
        df = df.reset_index()
        sensor_name = SENSOR_NAME_MAP[sensor][0]
        # if (SENSOR_NAME_MAP[sensor][1] or SMOOTHERS_MAP[smoother][2]):
        #     metric = f"wip_{metric}"
        #     sensor_name = WIP_SENSOR_NAME_MAP[sensor][0]
        sensor_name = SMOOTHERS_MAP[smoother][1] + sensor_name
        create_export_csv(
            df,
            export_dir=export_dir,
            start_date=datetime.strptime(export_start_date, "%Y-%m-%d"),
            metric=metric,
            geo_res=geo_res,
            sensor=sensor_name,
        )

    if arch_diff is not None:
        # Diff exports, and make incremental versions
        _, common_diffs, new_files = arch_diff.diff_exports()

        # Archive changed and new files only
        to_archive = [f for f, diff in common_diffs.items() if diff is not None]
        to_archive += new_files
        _, fails = arch_diff.archive_exports(to_archive)

        # Filter existing exports to exclude those that failed to archive
        succ_common_diffs = {f: diff for f, diff in common_diffs.items() if f not in fails}
        arch_diff.filter_exports(succ_common_diffs)

        # Report failures: someone should probably look at them
        for exported_file in fails:
            print(f"Failed to archive '{exported_file}'")