def run_module(): """Produce a combined cases and deaths signal using data from JHU and USA Facts.""" variants = [ tuple((metric, geo_res) + sensor_signal(metric, sensor, smoother)) for (metric, geo_res, sensor, smoother ) in product(METRICS, GEO_RESOLUTIONS, SENSORS, SMOOTH_TYPES) ] params = configure(variants) for metric, geo_res, sensor_name, signal in variants: df = combine_usafacts_and_jhu( signal, geo_res, extend_raw_date_range(params, sensor_name)) df["timestamp"] = pd.to_datetime(df["timestamp"]) start_date = pd.to_datetime(params['export_start_date']) export_dir = params["export_dir"] dates = pd.Series(df[df["timestamp"] >= start_date] ["timestamp"].unique()).sort_values() signal_name = add_prefix([signal], wip_signal=params["wip_signal"], prefix="wip_") for date_ in dates: export_fn = f'{date_.strftime("%Y%m%d")}_{geo_res}_{signal_name[0]}.csv' df[df["timestamp"] == date_][[ "geo_id", "val", "se", "sample_size", ]].to_csv(f"{export_dir}/{export_fn}", index=False, na_rep="NA")
def update_sensor(
        state_files: List[str],
        mmwr_info: pd.DataFrame,
        output_path: str,
        start_date: datetime,
        end_date: datetime) -> pd.DataFrame:
    """
    Generate sensor values, and write to csv format.

    Args:
        state_files: List of JSON files representing COVID-NET hospitalization data for each state
        mmwr_info: Mappings from MMWR week to actual dates, as a pd.DataFrame
        output_path: Path to write the csvs to
        start_date: First sensor date (datetime.datetime)
        end_date: Last sensor date (datetime.datetime)

    Returns:
        The overall pd.DataFrame after all processing
    """
    assert start_date < end_date, "start_date >= end_date"

    # Combine and format hospitalizations dataframe
    hosp_df = CovidNet.read_all_hosp_data(state_files)
    hosp_df = hosp_df.merge(mmwr_info, how="left",
                            left_on=["mmwr-year", "mmwr-week"],
                            right_on=["year", "weeknumber"])

    # Select relevant columns and standardize naming
    hosp_df = hosp_df.loc[:, APIConfig.HOSP_RENAME_COLS.keys()]\
        .rename(columns=APIConfig.HOSP_RENAME_COLS)

    # Restrict to start and end date
    hosp_df = hosp_df[
        (hosp_df["date"] >= start_date) & (hosp_df["date"] < end_date)
    ]

    # Set state id to two-letter abbreviation
    gmpr = GeoMapper()
    hosp_df = gmpr.add_geocode(hosp_df,
                               from_col=APIConfig.STATE_COL,
                               from_code="state_name",
                               new_code="state_id",
                               dropna=False)
    # To use the original column name, reassign original column and drop new one
    hosp_df[APIConfig.STATE_COL] = hosp_df["state_id"]
    hosp_df.drop("state_id", axis=1, inplace=True)

    assert not hosp_df.duplicated(["date", "geo_id"]).any(), "Non-unique (date, geo_id) pairs"
    hosp_df.set_index(["date", "geo_id"], inplace=True)

    # Fill in remaining expected columns
    hosp_df["se"] = np.nan
    hosp_df["sample_size"] = np.nan

    # Write results
    signals = add_prefix(SIGNALS,
                         wip_signal=read_params()["wip_signal"],
                         prefix="wip_")
    for signal in signals:
        write_to_csv(hosp_df, signal, output_path)

    return hosp_df
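# Illustrative only: a sketch of calling update_sensor with hypothetical inputs. The file
# paths are placeholders and the MMWR mapping frame is a toy example; the "date" column is
# an assumption about what the mapping contributes after the merge and rename above.
from datetime import datetime  # repeated here so the sketch stands alone
import pandas as pd

example_state_files = ["./input_cache/COVID-NET_CA.json",
                       "./input_cache/COVID-NET_NY.json"]  # hypothetical cached COVID-NET files
example_mmwr_info = pd.DataFrame({
    "year": [2020, 2020],
    "weeknumber": [23, 24],
    "date": [datetime(2020, 6, 6), datetime(2020, 6, 13)],  # assumed week-to-date mapping column
})
# hosp_df = update_sensor(
#     state_files=example_state_files,
#     mmwr_info=example_mmwr_info,
#     output_path="./receiving",
#     start_date=datetime(2020, 6, 1),
#     end_date=datetime(2020, 7, 1),
# )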
def run_module(params):
    """
    Produce a combined cases and deaths signal using data from JHU and USA Facts.

    Parameters
    ----------
    params
        Dictionary containing indicator configuration. Expected to have the following structure:
        - "common":
            - "export_dir": str, directory to write output.
            - "log_exceptions" (optional): bool, whether to log exceptions to file.
            - "log_filename" (optional): str, name of file to write logs
        - "indicator":
            - "export_start_date": list of ints, [year, month, day] format, first day to begin
                data exports from.
            - "date_range": str, YYYYMMDD-YYYYMMDD format, range of dates to generate data for.
            - "source": str, name of combo indicator in metadata.
            - "wip_signal": list of str or bool, to be passed to delphi_utils.add_prefix.
    """
    start_time = time.time()
    variants = [tuple((metric, geo_res) + sensor_signal(metric, sensor, smoother))
                for (metric, geo_res, sensor, smoother) in
                product(METRICS, GEO_RESOLUTIONS, SENSORS, SMOOTH_TYPES)]
    params = configure(variants, params)
    logger = get_structured_logger(
        __name__, filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))

    for metric, geo_res, sensor_name, signal in variants:
        df = combine_usafacts_and_jhu(signal, geo_res,
                                      extend_raw_date_range(params, sensor_name),
                                      params['indicator']['issue_range'])
        df["timestamp"] = pd.to_datetime(df["timestamp"])
        start_date = pd.to_datetime(params['indicator']['export_start_date'])
        export_dir = params["common"]["export_dir"]
        dates = pd.Series(
            df[df["timestamp"] >= start_date]["timestamp"].unique()
        ).sort_values()

        signal_name = add_prefix([signal],
                                 wip_signal=params['indicator']["wip_signal"],
                                 prefix="wip_")
        for date_ in dates:
            export_fn = f'{date_.strftime("%Y%m%d")}_{geo_res}_{signal_name[0]}.csv'
            df[df["timestamp"] == date_][[
                "geo_id", "val", "se", "sample_size",
            ]].to_csv(f"{export_dir}/{export_fn}", index=False, na_rep="NA")

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds)
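# Illustrative only: a minimal params dictionary matching the structure documented in the
# docstring above. All values (dates, paths, source name) are hypothetical placeholders,
# and "issue_range" is included only because the code reads it; its format is assumed.
example_combo_params = {
    "common": {
        "export_dir": "./receiving",
        "log_exceptions": True,
    },
    "indicator": {
        "export_start_date": [2020, 4, 1],
        "date_range": "20200401-20200430",
        "source": "indicator-combination",
        "wip_signal": True,
        "issue_range": None,  # placeholder; consumed by combine_usafacts_and_jhu
    },
}
# run_module(example_combo_params)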
def test_output_files(self, run_as_module):
    """Tests that the output files contain the correct results of the run."""
    params = read_params()

    # Test output exists
    csv_files = listdir("receiving")

    dates_for_covid_ag = [
        "20200702", "20200703", "20200704", "20200705",
        "20200706", "20200707", "20200708", "20200709"
    ]
    dates_for_flu_ag = [
        "20200623", "20200624", "20200625", "20200626", "20200627", "20200628",
        "20200629", "20200630", "20200701", "20200702", "20200703"
    ]

    geos = GEO_RESOLUTIONS.copy()
    sensors = add_prefix(list(SENSORS.keys()),
                         wip_signal=params["wip_signal"],
                         prefix="wip_")

    expected_files = []
    for geo in geos:
        for sensor in sensors:
            if "covid_ag" in sensor:
                for date in dates_for_covid_ag:
                    expected_files += [date + "_" + geo + "_" + sensor + ".csv"]
            else:
                for date in dates_for_flu_ag:
                    expected_files += [date + "_" + geo + "_" + sensor + ".csv"]

    assert set(expected_files).issubset(set(csv_files))

    # Test output format
    df = pd.read_csv(
        join("./receiving", "20200709_state_covid_ag_raw_pct_positive.csv")
    )
    assert (df.columns.values == ["geo_id", "val", "se", "sample_size"]).all()

    # test_intermediate_file
    flag = None
    for fname in listdir("./cache"):
        if ".csv" in fname:
            flag = 1
    assert flag is not None
def __init__(self, startdate, enddate, dropdate, geo, parallel, weekday, numtype, se):
    """Init Sensor Updator.

    Args:
        startdate: first sensor date (YYYY-mm-dd)
        enddate: last sensor date (YYYY-mm-dd)
        dropdate: data drop date (YYYY-mm-dd)
        geo: geographic resolution, one of ["county", "state", "msa", "hrr", "hhs", "nation"]
        parallel: boolean to run the sensor update in parallel
        weekday: boolean to adjust for weekday effects
        numtype: type of count data used, one of ["covid", "cli"]
        se: boolean to write out standard errors, if true, use an obfuscated name
    """
    self.startdate, self.enddate, self.dropdate = [
        pd.to_datetime(t) for t in (startdate, enddate, dropdate)]

    # handle dates
    assert (self.startdate > (Config.FIRST_DATA_DATE + Config.BURN_IN_PERIOD)
            ), f"not enough data to produce estimates starting {self.startdate}"
    assert self.startdate < self.enddate, "start date >= end date"
    assert self.enddate <= self.dropdate, "end date > drop date"

    self.geo, self.parallel, self.weekday, self.numtype, self.se = geo.lower(), parallel, \
        weekday, numtype, se

    # output file naming
    if self.numtype == "covid":
        signals = [SMOOTHED_ADJ if self.weekday else SMOOTHED]
    elif self.numtype == "cli":
        signals = [SMOOTHED_ADJ_CLI if self.weekday else SMOOTHED_CLI]
    signal_names = add_prefix(
        signals, wip_signal=read_params()["wip_signal"])
    self.updated_signal_names = signal_names

    # initialize members set in shift_dates().
    self.burnindate = None
    self.fit_dates = None
    self.burn_in_dates = None
    self.sensor_dates = None
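# Illustrative only: constructing the updater with the arguments documented above. The class
# name "SensorUpdator" is a placeholder for whichever class owns this __init__, and the dates
# are hypothetical; real dates must satisfy the burn-in assertion against Config.FIRST_DATA_DATE.
# updator = SensorUpdator(
#     startdate="2020-06-15",
#     enddate="2020-07-15",
#     dropdate="2020-07-17",
#     geo="state",
#     parallel=False,
#     weekday=True,    # selects the weekday-adjusted signal name
#     numtype="covid",
#     se=False,
# )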
def test_output_files(self, run_as_module):
    """Tests that the proper files are output."""
    # Test output exists
    csv_files = listdir("receiving")

    dates = [
        "20200702", "20200703", "20200704", "20200705",
        "20200706", "20200707", "20200708", "20200709"
    ]
    geos = GEO_RESOLUTIONS.copy()
    sensors = add_prefix(SENSORS,
                         wip_signal=read_params()["wip_signal"],
                         prefix="wip_")

    expected_files = []
    for date in dates:
        for geo in geos:
            for sensor in sensors:
                expected_files += [date + "_" + geo + "_" + sensor + ".csv"]

    assert set(expected_files).issubset(set(csv_files))

    # Test output format
    df = pd.read_csv(
        join("./receiving", "20200709_state_covid_ag_raw_pct_positive.csv")
    )
    assert (df.columns.values == ["geo_id", "val", "se", "sample_size"]).all()

    # test_intermediate_file
    flag = None
    for fname in listdir("./cache"):
        if ".csv" in fname:
            flag = 1
    assert flag is not None
def test_output_files(self, clean_receiving_dir):
    """Tests that the proper files are output."""
    run_module(self.PARAMS)
    csv_files = listdir("receiving")

    dates = ["20200718", "20200719", "20200720"]
    geos = GEO_RESOLUTIONS.copy()
    sensors = add_prefix(SENSORS,
                         wip_signal=self.PARAMS["indicator"]["wip_signal"],
                         prefix="wip_")

    expected_files = []
    for date in dates:
        for geo in geos:
            for sensor in sensors:
                expected_files += [date + "_" + geo + "_" + sensor + ".csv"]

    assert set(expected_files).issubset(set(csv_files))
    assert '20200721_state_covid_ag_raw_pct_positive.csv' not in csv_files
    assert '20200722_state_covid_ag_raw_pct_positive.csv' not in csv_files

    # Test output format
    df = pd.read_csv(
        join("./receiving", "20200718_state_covid_ag_smoothed_pct_positive.csv")
    )
    assert (df.columns.values == ["geo_id", "val", "se", "sample_size"]).all()

    # test_intermediate_file
    flag = None
    for fname in listdir("./cache"):
        if ".csv" in fname:
            flag = 1
    assert flag is not None
def run_module(params):
    """
    Main function run when calling the module.

    Inputs parameters from the file 'params.json' and produces output data in
    the directory defined by the `export_dir` (should be "receiving" except for
    testing purposes).

    Parameters
    ----------
    params
        Dictionary containing indicator configuration. Expected to have the following structure:
        - "common":
            - "export_dir": str, directory to write output
            - "log_exceptions" (optional): bool, whether to log exceptions to file
            - "log_filename" (optional): str, name of file to write logs
        - "indicator":
            - "static_file_dir": str, path to DMA mapping files
            - "data_dir": str, location of cached CSVs
            - "start_date": str, YYYY-MM-DD format, first day to generate data for
            - "end_date": str, YYYY-MM-DD format or empty string, last day to generate data for.
            - "ght_key": str, GHT API key
            - "wip_signal": list of str or bool, to be passed to delphi_utils.add_prefix
            - "test_data_dir": str, path to test data
        - "archive" (optional): if provided, output will be archived with S3
            - "aws_credentials": Dict[str, str], AWS login credentials (see S3 documentation)
            - "bucket_name": str, name of S3 bucket to read/write
            - "cache_dir": str, directory of locally cached data
    """
    start_time = time.time()
    csv_export_count = 0
    oldest_final_export_date = None

    # read parameters
    ght_key = params["indicator"]["ght_key"]
    start_date = params["indicator"]["start_date"]
    end_date = params["indicator"]["end_date"]
    static_dir = params["indicator"]["static_file_dir"]
    export_dir = params["common"]["export_dir"]
    data_dir = params["indicator"]["data_dir"]
    wip_signal = params["indicator"]["wip_signal"]

    logger = get_structured_logger(
        __name__, filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))

    if "archive" in params:
        arch_diff = S3ArchiveDiffer(
            params["archive"]["cache_dir"], export_dir,
            params["archive"]["bucket_name"], "ght",
            params["archive"]["aws_credentials"])
        arch_diff.update_cache()

    # if missing start_date, set to today (GMT) minus 5 days
    if start_date == "":
        now = datetime.datetime.now(datetime.timezone.utc)
        start_date = (now - datetime.timedelta(days=4)).strftime("%Y-%m-%d")

    # if missing end_date, set to today (GMT) minus 5 days
    if end_date == "":
        now = datetime.datetime.now(datetime.timezone.utc)
        end_date = (now - datetime.timedelta(days=4)).strftime("%Y-%m-%d")

    # Turn on basic logging messages (level INFO)
    logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO)
    logging.info("Creating data from %s through %s.", start_date, end_date)

    # Dictionary mapping geo resolution to the data corresponding to that resolution.
    df_by_geo_res = {}

    if not params["indicator"]["test_data_dir"]:
        # setup class to handle API calls
        ght = GoogleHealthTrends(ght_key=ght_key)

        # read data frame version of the data
        df_by_geo_res[STATE] = get_counts_states(
            ght, PULL_START_DATE, end_date, static_dir=static_dir, data_dir=data_dir)
        df_by_geo_res[DMA] = get_counts_dma(
            ght, PULL_START_DATE, end_date, static_dir=static_dir, data_dir=data_dir)
    else:
        df_by_geo_res[STATE] = pd.read_csv(
            params["indicator"]["test_data_dir"].format(geo_res="state"))
        df_by_geo_res[DMA] = pd.read_csv(
            params["indicator"]["test_data_dir"].format(geo_res="dma"))

    df_by_geo_res[HRR], df_by_geo_res[MSA] = derived_counts_from_dma(
        df_by_geo_res[DMA], static_dir=static_dir)

    signal_names = add_prefix(SIGNALS, wip_signal, prefix="wip_")

    for signal in signal_names:
        is_smoothed = signal.endswith(SMOOTHED)
        for geo_res, df in df_by_geo_res.items():
            exported_csv_dates = create_export_csv(
                format_for_export(df, is_smoothed),
                geo_res=geo_res,
                sensor=signal,
                start_date=start_date,
                export_dir=export_dir)

            if not exported_csv_dates.empty:
                csv_export_count += exported_csv_dates.size
                if not oldest_final_export_date:
                    oldest_final_export_date = max(exported_csv_dates)
                oldest_final_export_date = min(
                    oldest_final_export_date, max(exported_csv_dates))

    if "archive" in params:
        archive(arch_diff)

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    max_lag_in_days = None
    formatted_oldest_final_export_date = None
    if oldest_final_export_date:
        max_lag_in_days = (datetime.datetime.now() - oldest_final_export_date).days
        formatted_oldest_final_export_date = oldest_final_export_date.strftime("%Y-%m-%d")
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds,
                csv_export_count=csv_export_count,
                max_lag_in_days=max_lag_in_days,
                oldest_final_export_date=formatted_oldest_final_export_date)
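# Illustrative only: a minimal params dictionary for this module, following the docstring above.
# The API key, dates, and paths are hypothetical placeholders; "archive" is omitted, so no S3
# archiving is attempted.
example_ght_params = {
    "common": {"export_dir": "./receiving"},
    "indicator": {
        "static_file_dir": "./static",
        "data_dir": "./cache",
        "start_date": "2020-04-01",
        "end_date": "",            # empty string: default to today (GMT) minus 5 days
        "ght_key": "YOUR-GHT-API-KEY",
        "wip_signal": False,
        "test_data_dir": "",       # empty: pull from the API instead of test files
    },
}
# run_module(example_ght_params)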
def run_module(params: Dict[str, Any]):
    """Run the quidel_covidtest indicator.

    The `params` argument is expected to have the following structure:
    - "common":
        - "export_dir": str, directory to write output
        - "log_exceptions" (optional): bool, whether to log exceptions to file
        - "log_filename" (optional): str, name of file to write logs
    - "indicator":
        - "static_file_dir": str, directory name with population information
        - "input_cache_dir": str, directory in which to cache input data
        - "export_start_date": str, YYYY-MM-DD format of earliest date to create output
        - "export_end_date": str, YYYY-MM-DD format of latest date to create output
           or "" to create through the present
        - "pull_start_date": str, YYYY-MM-DD format of earliest date to pull input
        - "pull_end_date": str, YYYY-MM-DD format of latest date to pull input
           or "" to pull through the present
        - "aws_credentials": Dict[str, str], authentication parameters for AWS S3; see S3
           documentation
        - "bucket_name": str, name of AWS bucket in which to find data
        - "wip_signal": List[str], list of signal names that are works in progress
        - "test_mode": bool, whether we are running in test mode
    """
    start_time = time.time()
    logger = get_structured_logger(
        __name__, filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))
    atexit.register(log_exit, start_time, logger)
    cache_dir = params["indicator"]["input_cache_dir"]
    export_dir = params["common"]["export_dir"]
    export_start_date = params["indicator"]["export_start_date"]
    export_end_date = params["indicator"]["export_end_date"]

    # Pull data and update export date
    df, _end_date = pull_quidel_covidtest(params["indicator"])
    if _end_date is None:
        print("The data is up-to-date. Currently, no new data to be ingested.")
        return
    export_end_date = check_export_end_date(export_end_date, _end_date,
                                            END_FROM_TODAY_MINUS)
    export_start_date = check_export_start_date(export_start_date,
                                                export_end_date, EXPORT_DAY_RANGE)

    first_date, last_date = df["timestamp"].min(), df["timestamp"].max()

    # State Level
    data = df.copy()
    state_groups = geo_map("state", data).groupby("state_id")

    # Add prefix, if required
    sensors = add_prefix(SENSORS,
                         wip_signal=params["indicator"]["wip_signal"],
                         prefix="wip_")
    smoothers = SMOOTHERS.copy()

    for sensor in sensors:
        # For State Level
        print("state", sensor)
        if sensor.endswith(SMOOTHED_POSITIVE):
            smoothers[sensor] = smoothers.pop(SMOOTHED_POSITIVE)
        elif sensor.endswith(RAW_POSITIVE):
            smoothers[sensor] = smoothers.pop(RAW_POSITIVE)
        elif sensor.endswith(SMOOTHED_TEST_PER_DEVICE):
            smoothers[sensor] = smoothers.pop(SMOOTHED_TEST_PER_DEVICE)
        else:
            smoothers[sensor] = smoothers.pop(RAW_TEST_PER_DEVICE)

        state_df = generate_sensor_for_states(
            state_groups, smooth=smoothers[sensor][1],
            device=smoothers[sensor][0], first_date=first_date,
            last_date=last_date)
        create_export_csv(state_df, geo_res="state", sensor=sensor, export_dir=export_dir,
                          start_date=export_start_date, end_date=export_end_date)

    # County/HRR/MSA level
    for geo_res in GEO_RESOLUTIONS:
        geo_data, res_key = geo_map(geo_res, data)
        for sensor in sensors:
            print(geo_res, sensor)
            res_df = generate_sensor_for_other_geores(
                state_groups, geo_data, res_key, smooth=smoothers[sensor][1],
                device=smoothers[sensor][0], first_date=first_date,
                last_date=last_date)
            create_export_csv(res_df, geo_res=geo_res, sensor=sensor, export_dir=export_dir,
                              start_date=export_start_date, end_date=export_end_date,
                              remove_null_samples=True)

    # Export the cache file if the pipeline runs successfully.
    # Otherwise, don't update the cache file
    update_cache_file(df, _end_date, cache_dir)
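# Illustrative only: a minimal params dictionary for the quidel_covidtest indicator, following
# the docstring above. Credentials, bucket name, and dates are hypothetical placeholders.
example_quidel_covidtest_params = {
    "common": {"export_dir": "./receiving"},
    "indicator": {
        "static_file_dir": "./static",
        "input_cache_dir": "./cache",
        "export_start_date": "2020-05-26",
        "export_end_date": "",   # "" means export through the present
        "pull_start_date": "2020-05-26",
        "pull_end_date": "",
        "aws_credentials": {"aws_access_key_id": "...", "aws_secret_access_key": "..."},
        "bucket_name": "your-quidel-bucket",
        "wip_signal": [],
        "test_mode": False,
    },
}
# run_module(example_quidel_covidtest_params)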
def run_module(): """Run the quidel_covidtest indicator.""" params = read_params() cache_dir = params["cache_dir"] export_dir = params["export_dir"] export_start_date = params["export_start_date"] export_end_date = params["export_end_date"] # Pull data and update export date df, _end_date = pull_quidel_covidtest(params) if _end_date is None: print("The data is up-to-date. Currently, no new data to be ingested.") return export_end_date = check_export_end_date(export_end_date, _end_date, END_FROM_TODAY_MINUS) export_start_date = check_export_start_date(export_start_date, export_end_date, EXPORT_DAY_RANGE) first_date, last_date = df["timestamp"].min(), df["timestamp"].max() # State Level data = df.copy() state_groups = geo_map("state", data).groupby("state_id") # Add prefix, if required sensors = add_prefix(SENSORS, wip_signal=read_params()["wip_signal"], prefix="wip_") smoothers = SMOOTHERS.copy() for sensor in sensors: # For State Level print("state", sensor) if sensor.endswith(SMOOTHED_POSITIVE): smoothers[sensor] = smoothers.pop(SMOOTHED_POSITIVE) elif sensor.endswith(RAW_POSITIVE): smoothers[sensor] = smoothers.pop(RAW_POSITIVE) elif sensor.endswith(SMOOTHED_TEST_PER_DEVICE): smoothers[sensor] = smoothers.pop(SMOOTHED_TEST_PER_DEVICE) else: smoothers[sensor] = smoothers.pop(RAW_TEST_PER_DEVICE) state_df = generate_sensor_for_states(state_groups, smooth=smoothers[sensor][1], device=smoothers[sensor][0], first_date=first_date, last_date=last_date) create_export_csv(state_df, geo_res="state", sensor=sensor, export_dir=export_dir, start_date=export_start_date, end_date=export_end_date) # County/HRR/MSA level for geo_res in GEO_RESOLUTIONS: geo_data, res_key = geo_map(geo_res, data) for sensor in sensors: print(geo_res, sensor) res_df = generate_sensor_for_other_geores( state_groups, geo_data, res_key, smooth=smoothers[sensor][1], device=smoothers[sensor][0], first_date=first_date, last_date=last_date) create_export_csv(res_df, geo_res=geo_res, sensor=sensor, export_dir=export_dir, start_date=export_start_date, end_date=export_end_date, remove_null_samples=True) # Export the cache file if the pipeline runs successfully. # Otherwise, don't update the cache file update_cache_file(df, _end_date, cache_dir)
def run_module(): """Main function run when calling the module. Inputs parameters from the file 'params.json' and produces output data in the directory defined by the `export_dir` (should be "receiving" expect for testing purposes). """ # read parameters params = read_params() ght_key = params["ght_key"] start_date = params["start_date"] end_date = params["end_date"] static_dir = params["static_file_dir"] export_dir = params["export_dir"] data_dir = params["data_dir"] wip_signal = params["wip_signal"] cache_dir = params["cache_dir"] arch_diff = S3ArchiveDiffer(cache_dir, export_dir, params["bucket_name"], "ght", params["aws_credentials"]) arch_diff.update_cache() print(arch_diff) # if missing start_date, set to today (GMT) minus 5 days if start_date == "": now = datetime.datetime.now(datetime.timezone.utc) start_date = (now - datetime.timedelta(days=4)).strftime("%Y-%m-%d") # if missing start_date, set to today (GMT) minus 5 days if start_date == "": now = datetime.datetime.now(datetime.timezone.utc) start_date = (now - datetime.timedelta(days=4)).strftime("%Y-%m-%d") # if missing end_date, set to today (GMT) minus 5 days if end_date == "": now = datetime.datetime.now(datetime.timezone.utc) end_date = (now - datetime.timedelta(days=4)).strftime("%Y-%m-%d") # Turn on basic logging messages (level INFO) logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO) logging.info("Creating data from %s through %s.", start_date, end_date) # Dictionary mapping geo resolution to the data corresponding to that resolution. df_by_geo_res = {} if not params["test"]: # setup class to handle API calls ght = GoogleHealthTrends(ght_key=ght_key) # read data frame version of the data df_by_geo_res[STATE] = get_counts_states(ght, PULL_START_DATE, end_date, static_dir=static_dir, data_dir=data_dir) df_by_geo_res[DMA] = get_counts_dma(ght, PULL_START_DATE, end_date, static_dir=static_dir, data_dir=data_dir) else: df_by_geo_res[STATE] = pd.read_csv( params["test_data_dir"].format(geo_res="state")) df_by_geo_res[DMA] = pd.read_csv( params["test_data_dir"].format(geo_res="dma")) df_by_geo_res[HRR], df_by_geo_res[MSA] = derived_counts_from_dma( df_by_geo_res[DMA], static_dir=static_dir) signal_names = add_prefix(SIGNALS, wip_signal, prefix="wip_") for signal in signal_names: is_smoothed = signal.endswith(SMOOTHED) for geo_res, df in df_by_geo_res.items(): create_export_csv(format_for_export(df, is_smoothed), geo_res=geo_res, sensor=signal, start_date=start_date, export_dir=export_dir) if not params["test"]: # Diff exports, and make incremental versions _, common_diffs, new_files = arch_diff.diff_exports() # Archive changed and new files only to_archive = [ f for f, diff in common_diffs.items() if diff is not None ] to_archive += new_files _, fails = arch_diff.archive_exports(to_archive) # Filter existing exports to exclude those that failed to archive succ_common_diffs = { f: diff for f, diff in common_diffs.items() if f not in fails } arch_diff.filter_exports(succ_common_diffs) # Report failures: someone should probably look at them for exported_file in fails: print(f"Failed to archive '{exported_file}'")
def run_module(params: Dict[str, Any]):
    """Run Quidel flu test module.

    The `params` argument is expected to have the following structure:
    - "common":
        - "export_dir": str, directory to write output
        - "log_exceptions" (optional): bool, whether to log exceptions to file
        - "log_filename" (optional): str, name of file to write logs
    - "indicator":
        - "static_file_dir": str, directory name with population information
        - "input_cache_dir": str, directory in which to cache input data
        - "export_start_date": str, YYYY-MM-DD format of earliest date to create output
        - "export_end_date": str, YYYY-MM-DD format of latest date to create output
           or "" to create through the present
        - "pull_start_date": str, YYYY-MM-DD format of earliest date to pull input
        - "pull_end_date": str, YYYY-MM-DD format of latest date to pull input
           or "" to pull through the present
        - "aws_credentials": Dict[str, str], authentication parameters for AWS S3; see S3
           documentation
        - "bucket_name": str, name of AWS bucket in which to find data
        - "wip_signal": List[str], list of signal names that are works in progress
        - "test_mode": bool, whether we are running in test mode
    """
    start_time = time.time()
    logger = get_structured_logger(
        __name__, filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))
    cache_dir = params["indicator"]["input_cache_dir"]
    export_dir = params["common"]["export_dir"]
    static_file_dir = params["indicator"]["static_file_dir"]
    export_start_dates = params["indicator"]["export_start_date"]
    export_end_dates = params["indicator"]["export_end_date"]

    map_df = pd.read_csv(
        join(static_file_dir, "fips_prop_pop.csv"), dtype={"fips": int}
    )

    # Pull data and update export date
    dfs, _end_date = pull_quidel_data(params["indicator"])
    if _end_date is None:
        print("The data is up-to-date. Currently, no new data to be ingested.")
        return
    export_end_dates = check_export_end_date(export_end_dates, _end_date,
                                             END_FROM_TODAY_MINUS)
    export_start_dates = check_export_start_date(export_start_dates,
                                                 export_end_dates,
                                                 EXPORT_DAY_RANGE)

    # Add prefix, if required
    sensors = add_prefix(list(SENSORS.keys()),
                         wip_signal=params["indicator"]["wip_signal"],
                         prefix="wip_")

    for sensor in sensors:
        # Check either covid_ag or flu_ag
        test_type = "covid_ag" if "covid_ag" in sensor else "flu_ag"
        print("state", sensor)
        data = dfs[test_type].copy()
        state_groups = geo_map("state", data, map_df).groupby("state_id")
        first_date, last_date = data["timestamp"].min(), data["timestamp"].max()

        # For State Level
        state_df = generate_sensor_for_states(
            state_groups, smooth=SENSORS[sensor][1],
            device=SENSORS[sensor][0], first_date=first_date,
            last_date=last_date)
        create_export_csv(state_df, geo_res="state", sensor=sensor, export_dir=export_dir,
                          start_date=export_start_dates[test_type],
                          end_date=export_end_dates[test_type])

        # County/HRR/MSA level
        for geo_res in GEO_RESOLUTIONS:
            print(geo_res, sensor)
            data = dfs[test_type].copy()
            data, res_key = geo_map(geo_res, data, map_df)
            res_df = generate_sensor_for_other_geores(
                state_groups, data, res_key, smooth=SENSORS[sensor][1],
                device=SENSORS[sensor][0], first_date=first_date,
                last_date=last_date)
            create_export_csv(res_df, geo_res=geo_res, sensor=sensor, export_dir=export_dir,
                              start_date=export_start_dates[test_type],
                              end_date=export_end_dates[test_type],
                              remove_null_samples=True)

    # Export the cache file if the pipeline runs successfully.
    # Otherwise, don't update the cache file
    update_cache_file(dfs, _end_date, cache_dir)

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds)
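# Illustrative only: a minimal params dictionary for the Quidel flu module, following the
# docstring above. All values are hypothetical placeholders. Note that the code later indexes
# export dates per test type (export_start_dates[test_type]) after check_export_start_date,
# so these fields may alternatively need to be dicts keyed by "covid_ag"/"flu_ag" depending
# on how that helper handles them.
example_quidel_flu_params = {
    "common": {"export_dir": "./receiving"},
    "indicator": {
        "static_file_dir": "./static",
        "input_cache_dir": "./cache",
        "export_start_date": "2020-05-26",
        "export_end_date": "",   # "" means export through the present
        "pull_start_date": "2020-05-26",
        "pull_end_date": "",
        "aws_credentials": {"aws_access_key_id": "...", "aws_secret_access_key": "..."},
        "bucket_name": "your-quidel-bucket",
        "wip_signal": [],
        "test_mode": False,
    },
}
# run_module(example_quidel_flu_params)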