def run_module(params) -> None:
    """
    Run entire hhs_facilities indicator.

    Parameters
    ----------
    params
        Dictionary containing indicator configuration. Expected to have the following structure:
        - "common":
            - "export_dir": str, directory to write output
    """
    start_time = time.time()
    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))
    raw_df = pull_data()
    gmpr = GeoMapper()
    filled_fips_df = fill_missing_fips(raw_df, gmpr)
    for geo, (sig_name, sig_cols, sig_func, sig_offset) in product(GEO_RESOLUTIONS, SIGNALS):
        mapped_df = convert_geo(filled_fips_df, geo, gmpr)
        output_df = generate_signal(mapped_df, sig_cols, sig_func, sig_offset)
        create_export_csv(output_df, params["common"]["export_dir"], geo, sig_name)

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds)
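# A minimal `params` sketch for the run above. The directory and log filename
# values here are hypothetical; only the key structure comes from the
# docstring, with the optional logging keys shown alongside the required one.
example_params = {
    "common": {
        "export_dir": "./receiving",            # required: where CSVs are written
        "log_filename": "hhs_facilities.log",   # optional
        "log_exceptions": True,                 # optional, defaults to True
    }
}
# run_module(example_params)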
def run_module(params):
    """
    Produce a combined cases and deaths signal using data from JHU and USA Facts.

    Parameters
    ----------
    params
        Dictionary containing indicator configuration. Expected to have the following structure:
        - "common":
            - "export_dir": str, directory to write output.
            - "log_exceptions" (optional): bool, whether to log exceptions to file.
            - "log_filename" (optional): str, name of file to write logs
        - "indicator":
            - "export_start_date": list of ints, [year, month, day] format, first day to begin
                data exports from.
            - "date_range": str, YYYYMMDD-YYYYMMDD format, range of dates to generate data for.
            - "source": str, name of combo indicator in metadata.
            - "wip_signal": list of str or bool, to be passed to delphi_utils.add_prefix.
    """
    start_time = time.time()
    variants = [
        tuple((metric, geo_res) + sensor_signal(metric, sensor, smoother))
        for (metric, geo_res, sensor, smoother)
        in product(METRICS, GEO_RESOLUTIONS, SENSORS, SMOOTH_TYPES)
    ]
    params = configure(variants, params)
    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))

    for metric, geo_res, sensor_name, signal in variants:
        df = combine_usafacts_and_jhu(
            signal, geo_res, extend_raw_date_range(params, sensor_name),
            params['indicator']['issue_range'])
        df["timestamp"] = pd.to_datetime(df["timestamp"])
        start_date = pd.to_datetime(params['indicator']['export_start_date'])
        export_dir = params["common"]["export_dir"]
        dates = pd.Series(
            df[df["timestamp"] >= start_date]["timestamp"].unique()
        ).sort_values()

        signal_name = add_prefix([signal],
                                 wip_signal=params['indicator']["wip_signal"],
                                 prefix="wip_")
        for date_ in dates:
            export_fn = f'{date_.strftime("%Y%m%d")}_{geo_res}_{signal_name[0]}.csv'
            df[df["timestamp"] == date_][[
                "geo_id", "val", "se", "sample_size",
            ]].to_csv(f"{export_dir}/{export_fn}", index=False, na_rep="NA")

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds)
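# A toy sketch of how the `variants` list above is built: itertools.product
# enumerates every (metric, geo, sensor, smoother) combination. The small lists
# and the stub _toy_sensor_signal() are hypothetical stand-ins for the module's
# METRICS / GEO_RESOLUTIONS / SENSORS / SMOOTH_TYPES constants and its
# sensor_signal() helper.
from itertools import product

def _toy_sensor_signal(metric, sensor, smoother):
    # Returns (sensor_name, signal), matching how the tuples are unpacked above.
    name = f"{smoother}_{sensor}" if smoother else sensor
    return name, f"{metric}_{name}"

toy_variants = [
    tuple((metric, geo_res) + _toy_sensor_signal(metric, sensor, smoother))
    for (metric, geo_res, sensor, smoother)
    in product(["confirmed"], ["county", "state"], ["incidence"], ["", "7dav"])
]
# -> 4 tuples of (metric, geo_res, sensor_name, signal)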
def run_module(params):
    """
    Run the indicator.

    Arguments
    ---------
    params: Dict[str, Any]
        Nested dictionary of parameters.
    """
    start_time = time.time()
    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))
    mapper = GeoMapper()
    run_stats = []
    ## build the base version of the signal at the most detailed geo level you can get.
    ## compute stuff here or farm out to another function or file
    all_data = pd.DataFrame(columns=["timestamp", "val", "zip", "sample_size", "se"])
    ## aggregate & smooth
    ## TODO: add num/prop variations if needed
    for sensor, smoother, geo in product(SIGNALS, SMOOTHERS, GEOS):
        df = mapper.replace_geocode(
            all_data, "zip", geo,
            new_col="geo_id",
            date_col="timestamp")
        ## TODO: recompute sample_size, se here if not NA
        df["val"] = df[["geo_id", "val"]].groupby("geo_id")["val"].transform(
            smoother[0].smooth)
        sensor_name = sensor + smoother[1]  ## TODO: +num/prop variation if used
        # don't export first 6 days for smoothed signals since they'll be nan.
        start_date = min(df.timestamp) + timedelta(6) if smoother[1] else min(df.timestamp)
        dates = create_export_csv(
            df,
            params["common"]["export_dir"],
            geo,
            sensor_name,
            start_date=start_date)
        if len(dates) > 0:
            run_stats.append((max(dates), len(dates)))

    ## log this indicator run
    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    min_max_date = run_stats and min(s[0] for s in run_stats)
    csv_export_count = sum(s[-1] for s in run_stats)
    max_lag_in_days = min_max_date and (datetime.now() - min_max_date).days
    formatted_min_max_date = min_max_date and min_max_date.strftime("%Y-%m-%d")
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds,
                csv_export_count=csv_export_count,
                max_lag_in_days=max_lag_in_days,
                oldest_final_export_date=formatted_min_max_date)
def run_module(params):
    """
    Generate ground truth HHS hospitalization data.

    Parameters
    ----------
    params
        Dictionary containing indicator configuration. Expected to have the following structure:
        - "common":
            - "export_dir": str, directory to write output
            - "log_filename" (optional): str, name of file to write logs
    """
    start_time = time.time()
    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))
    mapper = GeoMapper()
    request_all_states = ",".join(mapper.get_geo_values("state_id"))

    today = date.today()
    past_reference_day = date(year=2020, month=1, day=1)  # first available date in DB
    date_range = generate_date_ranges(past_reference_day, today)
    dfs = []
    for r in date_range:
        response = Epidata.covid_hosp(request_all_states, r)
        # The last date range might only have recent days that don't have any data, so don't error.
        if response["result"] != 1 and r != date_range[-1]:
            raise Exception(f"Bad result from Epidata: {response['message']}")
        if response["result"] == -2 and r == date_range[-1]:  # -2 code means no results
            continue
        dfs.append(pd.DataFrame(response['epidata']))
    all_columns = pd.concat(dfs)

    geo_mapper = GeoMapper()

    for sig in SIGNALS:
        state = geo_mapper.add_geocode(make_signal(all_columns, sig),
                                       "state_id",
                                       "state_code",
                                       from_col="state")
        for geo in GEOS:
            create_export_csv(make_geo(state, geo, geo_mapper),
                              params["common"]["export_dir"], geo, sig)

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds)
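# A standalone sketch of the Epidata response handling above, using fake
# response dicts instead of live API calls. The result codes mirror the ones
# checked in the loop: 1 means success and -2 means "no results", which is
# tolerated only for the final (most recent) date range.
import pandas as pd

def _collect_frames(responses):
    frames = []
    last = len(responses) - 1
    for i, response in enumerate(responses):
        if response["result"] != 1 and i != last:
            raise Exception(f"Bad result from Epidata: {response['message']}")
        if response["result"] == -2 and i == last:  # no data yet for recent days
            continue
        frames.append(pd.DataFrame(response["epidata"]))
    return pd.concat(frames)

_collect_frames([
    {"result": 1, "epidata": [{"state": "pa", "value": 3}]},
    {"result": -2, "message": "no results"},  # tolerated: last range
])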
def run_module(params: Dict[str, Any]):
    """Run module for processing NCHS mortality data.

    The `params` argument is expected to have the following structure:
    - "common":
        - "daily_export_dir": str, directory to write daily output
        - "weekly_export_dir": str, directory to write weekly output
        - "log_exceptions" (optional): bool, whether to log exceptions to file
        - "log_filename" (optional): str, name of file to write logs
    - "indicator":
        - "export_start_date": str, date from which to export data in YYYY-MM-DD format
        - "static_file_dir": str, directory containing population csv files
        - "test_file" (optional): str, name of file from which to read test data
        - "token": str, authentication for upstream data pull
    - "archive" (optional): if provided, output will be archived with S3
        - "aws_credentials": Dict[str, str], AWS login credentials (see S3 documentation)
        - "bucket_name": str, name of S3 bucket to read/write
        - "daily_cache_dir": str, directory of locally cached daily data
        - "weekly_cache_dir": str, directory of locally cached weekly data
    """
    start_time = time.time()
    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))
    export_start_date = params["indicator"]["export_start_date"]
    if export_start_date == "latest":  # Find the previous Saturday
        export_start_date = date.today() - timedelta(
            days=date.today().weekday() + 2)
        export_start_date = export_start_date.strftime('%Y-%m-%d')
    daily_export_dir = params["common"]["daily_export_dir"]
    token = params["indicator"]["token"]
    test_file = params["indicator"].get("test_file", None)

    if "archive" in params:
        daily_arch_diff = S3ArchiveDiffer(
            params["archive"]["daily_cache_dir"], daily_export_dir,
            params["archive"]["bucket_name"], "nchs_mortality",
            params["archive"]["aws_credentials"])
        daily_arch_diff.update_cache()

    df_pull = pull_nchs_mortality_data(token, test_file)
    for metric in METRICS:
        if metric == 'percent_of_expected_deaths':
            print(metric)
            df = df_pull.copy()
            df["val"] = df[metric]
            df["se"] = np.nan
            df["sample_size"] = np.nan
            df = df[~df["val"].isnull()]
            sensor_name = "_".join([SENSOR_NAME_MAP[metric]])
            export_csv(
                df,
                geo_name=GEO_RES,
                export_dir=daily_export_dir,
                start_date=datetime.strptime(export_start_date, "%Y-%m-%d"),
                sensor=sensor_name,
            )
        else:
            for sensor in SENSORS:
                print(metric, sensor)
                df = df_pull.copy()
                if sensor == "num":
                    df["val"] = df[metric]
                else:
                    df["val"] = df[metric] / df["population"] * INCIDENCE_BASE
                df["se"] = np.nan
                df["sample_size"] = np.nan
                df = df[~df["val"].isnull()]
                sensor_name = "_".join([SENSOR_NAME_MAP[metric], sensor])
                export_csv(
                    df,
                    geo_name=GEO_RES,
                    export_dir=daily_export_dir,
                    start_date=datetime.strptime(export_start_date, "%Y-%m-%d"),
                    sensor=sensor_name,
                )

    # Weekly run of archive utility on Monday
    # - Does not upload to S3, that is handled by daily run of archive utility
    # - Exports issues into receiving for the API
    # Daily run of archiving utility
    # - Uploads changed files to S3
    # - Does not export any issues into receiving
    if "archive" in params:
        arch_diffs(params, daily_arch_diff)

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds)
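# A quick standalone check of the "previous Saturday" arithmetic used above:
# weekday() is 0 for Monday through 6 for Sunday, so subtracting
# (weekday() + 2) days always lands on a Saturday.
from datetime import date, timedelta

for probe in [date(2021, 3, 1) + timedelta(d) for d in range(7)]:  # Mon..Sun
    saturday = probe - timedelta(days=probe.weekday() + 2)
    assert saturday.weekday() == 5  # 5 == Saturday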
def get_logger():
    params = read_params()
    return get_structured_logger(
        __name__,
        filename=params.get("log_filename"),
        log_exceptions=params.get("log_exceptions", True))
def run_module(params):
    """Create the Safegraph indicator.

    The `params` argument is expected to have the following structure:
    - "common":
        - "export_dir": str, directory to write output
        - "log_exceptions" (optional): bool, whether to log exceptions to file
        - "log_filename" (optional): str, name of file to write logs
    - "indicator":
        - "aws_access_key_id": str, ID of access key for AWS S3
        - "aws_secret_access_key": str, access key for AWS S3
        - "aws_default_region": str, name of AWS S3 region
        - "aws_endpoint": str, name of AWS S3 endpoint
        - "n_core": int, number of cores to use for multithreaded processing
        - "raw_data_dir": str, directory from which to read downloaded data from AWS
        - "static_file_dir": str, directory containing brand and population csv files
        - "sync": bool, whether to sync S3 data before running indicator
        - "wip_signal": list of str or bool, list of work-in-progress signals to be passed
            to `delphi_utils.add_prefix()`
    """
    start_time = time.time()
    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))

    # Place to write output files.
    export_dir = params["common"]["export_dir"]
    # Location of input files.
    raw_data_dir = params["indicator"]["raw_data_dir"]
    # Number of cores to use in multiprocessing.
    n_core = params["indicator"]["n_core"]
    # AWS credentials
    aws_access_key_id = params["indicator"]["aws_access_key_id"]
    aws_secret_access_key = params["indicator"]["aws_secret_access_key"]
    aws_default_region = params["indicator"]["aws_default_region"]
    aws_endpoint = params["indicator"]["aws_endpoint"]
    # Whether to sync `raw_data_dir` with an AWS backend.
    # Must be a bool in the JSON file (rather than the string "True" or "False")
    sync = params["indicator"]["sync"]

    # List of work-in-progress signal names.
    wip_signal = params["indicator"]["wip_signal"]

    # Convert `process()` to a single-argument function for use in `pool.map`.
    single_arg_process = functools.partial(
        process,
        signal_names=SIGNALS,
        wip_signal=wip_signal,
        geo_resolutions=GEO_RESOLUTIONS,
        export_dir=export_dir,
    )

    # Update raw data
    # Why call subprocess rather than using a native Python client, e.g. boto3?
    # Because boto3 does not have a simple rsync-like call that can perform
    # the following behavior elegantly.
    if sync:
        subprocess.run(
            f'aws s3 sync s3://sg-c19-response/social-distancing/v2/ '
            f'{raw_data_dir}/social-distancing/ --endpoint {aws_endpoint}',
            env={
                'AWS_ACCESS_KEY_ID': aws_access_key_id,
                'AWS_SECRET_ACCESS_KEY': aws_secret_access_key,
                'AWS_DEFAULT_REGION': aws_default_region,
            },
            shell=True,
            check=True,
        )

    files = get_daily_source_files(
        f'{raw_data_dir}/social-distancing/**/*.csv.gz')

    files_with_previous_weeks = []
    for day in files:
        previous_week = [files[day]]
        for i in range(1, 7):
            if day - timedelta(i) in files:
                previous_week.append(files[day - timedelta(i)])
        files_with_previous_weeks.append(previous_week)

    with mp.Pool(n_core) as pool:
        pool.map(single_arg_process, files_with_previous_weeks)

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds)
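# A standalone sketch of the previous-week grouping above: for each day with a
# source file, collect that file plus the files from the six preceding days
# (when present). A plain dict of dates to filenames stands in for the real
# get_daily_source_files() result.
from datetime import date, timedelta

toy_files = {date(2021, 6, d): f"2021-06-{d:02d}.csv.gz" for d in (1, 2, 3, 5)}
groups = []
for day in toy_files:
    previous_week = [toy_files[day]]
    for i in range(1, 7):
        if day - timedelta(i) in toy_files:
            previous_week.append(toy_files[day - timedelta(i)])
    groups.append(previous_week)
# June 5's group skips the missing June 4 file but still includes June 1-3.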
def run_module(params):
    """
    Main function run when calling the module.

    Inputs parameters from the file 'params.json' and produces output data in
    the directory defined by the `export_dir` (should be "receiving" except for
    testing purposes).

    Parameters
    ----------
    params
        Dictionary containing indicator configuration. Expected to have the following structure:
        - "common":
            - "export_dir": str, directory to write output
            - "log_exceptions" (optional): bool, whether to log exceptions to file
            - "log_filename" (optional): str, name of file to write logs
        - "indicator":
            - "static_file_dir": str, path to DMA mapping files
            - "data_dir": str, location of cached CSVs
            - "start_date": str, YYYY-MM-DD format, first day to generate data for
            - "end_date": str, YYYY-MM-DD format or empty string, last day to generate data for.
            - "ght_key": str, GHT API key
            - "wip_signal": list of str or bool, to be passed to delphi_utils.add_prefix
            - "test_data_dir": str, path to test data
        - "archive" (optional): if provided, output will be archived with S3
            - "aws_credentials": Dict[str, str], AWS login credentials (see S3 documentation)
            - "bucket_name": str, name of S3 bucket to read/write
            - "cache_dir": str, directory of locally cached data
    """
    start_time = time.time()
    csv_export_count = 0
    oldest_final_export_date = None

    # read parameters
    ght_key = params["indicator"]["ght_key"]
    start_date = params["indicator"]["start_date"]
    end_date = params["indicator"]["end_date"]
    static_dir = params["indicator"]["static_file_dir"]
    export_dir = params["common"]["export_dir"]
    data_dir = params["indicator"]["data_dir"]
    wip_signal = params["indicator"]["wip_signal"]

    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))

    if "archive" in params:
        arch_diff = S3ArchiveDiffer(
            params["archive"]["cache_dir"], export_dir,
            params["archive"]["bucket_name"], "ght",
            params["archive"]["aws_credentials"])
        arch_diff.update_cache()

    # if missing start_date, set to today (GMT) minus 4 days
    if start_date == "":
        now = datetime.datetime.now(datetime.timezone.utc)
        start_date = (now - datetime.timedelta(days=4)).strftime("%Y-%m-%d")

    # if missing end_date, set to today (GMT) minus 4 days
    if end_date == "":
        now = datetime.datetime.now(datetime.timezone.utc)
        end_date = (now - datetime.timedelta(days=4)).strftime("%Y-%m-%d")

    # Turn on basic logging messages (level INFO)
    logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO)
    logging.info("Creating data from %s through %s.", start_date, end_date)

    # Dictionary mapping geo resolution to the data corresponding to that resolution.
    df_by_geo_res = {}

    if not params["indicator"]["test_data_dir"]:
        # setup class to handle API calls
        ght = GoogleHealthTrends(ght_key=ght_key)

        # read data frame version of the data
        df_by_geo_res[STATE] = get_counts_states(ght,
                                                 PULL_START_DATE,
                                                 end_date,
                                                 static_dir=static_dir,
                                                 data_dir=data_dir)
        df_by_geo_res[DMA] = get_counts_dma(ght,
                                            PULL_START_DATE,
                                            end_date,
                                            static_dir=static_dir,
                                            data_dir=data_dir)
    else:
        df_by_geo_res[STATE] = pd.read_csv(
            params["indicator"]["test_data_dir"].format(geo_res="state"))
        df_by_geo_res[DMA] = pd.read_csv(
            params["indicator"]["test_data_dir"].format(geo_res="dma"))

    df_by_geo_res[HRR], df_by_geo_res[MSA] = derived_counts_from_dma(
        df_by_geo_res[DMA], static_dir=static_dir)

    signal_names = add_prefix(SIGNALS, wip_signal, prefix="wip_")

    for signal in signal_names:
        is_smoothed = signal.endswith(SMOOTHED)
        for geo_res, df in df_by_geo_res.items():
            exported_csv_dates = create_export_csv(
                format_for_export(df, is_smoothed),
                geo_res=geo_res,
                sensor=signal,
                start_date=start_date,
                export_dir=export_dir)
            if not exported_csv_dates.empty:
                csv_export_count += exported_csv_dates.size
                if not oldest_final_export_date:
                    oldest_final_export_date = max(exported_csv_dates)
                oldest_final_export_date = min(oldest_final_export_date,
                                               max(exported_csv_dates))

    if "archive" in params:
        archive(arch_diff)

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    max_lag_in_days = None
    formatted_oldest_final_export_date = None
    if oldest_final_export_date:
        max_lag_in_days = (datetime.datetime.now() - oldest_final_export_date).days
        formatted_oldest_final_export_date = oldest_final_export_date.strftime("%Y-%m-%d")
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds,
                csv_export_count=csv_export_count,
                max_lag_in_days=max_lag_in_days,
                oldest_final_export_date=formatted_oldest_final_export_date)
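# A standalone sketch of the export bookkeeping above: each export returns a
# Series of dates; the run tallies the total CSV count and tracks the oldest
# "final" export date (the minimum over each signal's newest date). The two
# fabricated Series stand in for create_export_csv() results.
import pandas as pd

csv_export_count = 0
oldest_final_export_date = None
for exported_csv_dates in [pd.Series(pd.to_datetime(["2021-06-01", "2021-06-03"])),
                           pd.Series(pd.to_datetime(["2021-06-02"]))]:
    if not exported_csv_dates.empty:
        csv_export_count += exported_csv_dates.size
        if not oldest_final_export_date:
            oldest_final_export_date = max(exported_csv_dates)
        oldest_final_export_date = min(oldest_final_export_date,
                                       max(exported_csv_dates))
# csv_export_count == 3; oldest_final_export_date == 2021-06-02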
def run_module(params: Dict[str, Dict[str, Any]]):
    """
    Run the delphi_changehc module.

    Parameters
    ----------
    params
        Dictionary containing indicator configuration. Expected to have the following structure:
        - "common":
            - "export_dir": str, directory to write output.
            - "log_exceptions" (optional): bool, whether to log exceptions to file.
            - "log_filename" (optional): str, name of file to write logs
        - "indicator":
            - "input_cache_dir": str, directory to download source files.
            - "input_files": dict of str: str or null, optional filenames to download.
                If null, defaults are set in retrieve_files().
            - "start_date": str, YYYY-MM-DD format, first day to generate data for.
            - "end_date": str or null, YYYY-MM-DD format, last day to generate data for.
                If set to null, end date is derived from drop date and n_waiting_days.
            - "drop_date": str or null, YYYY-MM-DD format, date data is dropped. If set to
                null, current day minus 40 hours is used.
            - "n_backfill_days": int, number of past days to generate estimates for.
            - "n_waiting_days": int, number of most recent days to skip estimates for.
            - "se": bool, whether to write out standard errors.
            - "parallel": bool, whether to update sensor in parallel.
            - "geos": list of str, geographies to generate sensor for.
            - "weekday": list of bool, whether to adjust for weekday effects.
            - "types": list of str, sensor types to generate.
            - "wip_signal": list of str or bool, to be passed to delphi_utils.add_prefix.
        - "ftp_conn": dict, connection information for source FTP.
    """
    start_time = time.time()

    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))

    make_asserts(params)

    if params["indicator"]["drop_date"] is None:
        # files are dropped about 4pm the day after the issue date
        dropdate_dt = (datetime.now() - timedelta(days=1, hours=16))
        dropdate_dt = dropdate_dt.replace(hour=0, minute=0, second=0, microsecond=0)
    else:
        dropdate_dt = datetime.strptime(params["indicator"]["drop_date"], "%Y-%m-%d")
    filedate = dropdate_dt.strftime("%Y%m%d")

    file_dict = retrieve_files(params, filedate, logger)

    dropdate = str(dropdate_dt.date())

    # range of estimates to produce
    n_backfill_days = params["indicator"]["n_backfill_days"]  # produce estimates for n_backfill_days
    n_waiting_days = params["indicator"]["n_waiting_days"]  # most recent n_waiting_days won't be est
    enddate_dt = dropdate_dt - timedelta(days=n_waiting_days)
    startdate_dt = enddate_dt - timedelta(days=n_backfill_days)
    enddate = str(enddate_dt.date())
    startdate = str(startdate_dt.date())

    # now allow manual overrides
    if params["indicator"]["end_date"] is not None:
        enddate = params["indicator"]["end_date"]
    if params["indicator"]["start_date"] is not None:
        startdate = params["indicator"]["start_date"]

    logger.info("generating signal and exporting to CSV",
                first_sensor_date=startdate,
                last_sensor_date=enddate,
                drop_date=dropdate,
                n_backfill_days=n_backfill_days,
                n_waiting_days=n_waiting_days,
                geos=params["indicator"]["geos"],
                export_dir=params["common"]["export_dir"],
                parallel=params["indicator"]["parallel"],
                weekday=params["indicator"]["weekday"],
                types=params["indicator"]["types"],
                se=params["indicator"]["se"])

    ## start generating
    for geo in params["indicator"]["geos"]:
        for numtype in params["indicator"]["types"]:
            for weekday in params["indicator"]["weekday"]:
                if weekday:
                    logger.info("starting weekday adj", geo=geo, numtype=numtype)
                else:
                    logger.info("starting no adj", geo=geo, numtype=numtype)
                su_inst = CHCSensorUpdator(
                    startdate,
                    enddate,
                    dropdate,
                    geo,
                    params["indicator"]["parallel"],
                    weekday,
                    numtype,
                    params["indicator"]["se"],
                    params["indicator"]["wip_signal"])
                if numtype == "covid":
                    data = load_combined_data(file_dict["denom"],
                                              file_dict["covid"], dropdate_dt, "fips")
                elif numtype == "cli":
                    data = load_cli_data(file_dict["denom"], file_dict["flu"],
                                         file_dict["mixed"], file_dict["flu_like"],
                                         file_dict["covid_like"], dropdate_dt, "fips")
                su_inst.update_sensor(data, params["common"]["export_dir"])
        logger.info("finished processing", geo=geo)

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds)
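# A quick standalone check of the default drop-date rule above: "current day
# minus 40 hours" is now minus timedelta(days=1, hours=16), floored to midnight.
# The "now" here is a hypothetical fixed timestamp rather than datetime.now().
from datetime import datetime, timedelta

now = datetime(2021, 6, 10, 9, 30)  # hypothetical "now"
dropdate_dt = (now - timedelta(days=1, hours=16))
dropdate_dt = dropdate_dt.replace(hour=0, minute=0, second=0, microsecond=0)
assert timedelta(days=1, hours=16) == timedelta(hours=40)
# 2021-06-10 09:30 minus 40h is 2021-06-08 17:30, so dropdate_dt is 2021-06-08.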
def get_logger():
    params = read_params()
    return get_structured_logger(__name__, filename=params.get("log_filename"))
def run_module(params):
    """
    Generate updated claims-based hospitalization indicator values.

    Parameters
    ----------
    params
        Dictionary containing indicator configuration. Expected to have the following structure:
        - "common":
            - "export_dir": str, directory to write output.
            - "log_exceptions" (optional): bool, whether to log exceptions to file.
            - "log_filename" (optional): str, name of file to write logs
        - "indicator":
            - "input_file": str, optional filenames to download. If null,
                defaults are set in retrieve_files().
            - "start_date": str, YYYY-MM-DD format, first day to generate data for.
            - "end_date": str or null, YYYY-MM-DD format, last day to generate data for.
                If set to null, end date is derived from drop date and n_waiting_days.
            - "drop_date": str or null, YYYY-MM-DD format, date data is dropped. If set to
                null, current day minus 40 hours is used.
            - "n_backfill_days": int, number of past days to generate estimates for.
            - "n_waiting_days": int, number of most recent days to skip estimates for.
            - "write_se": bool, whether to write out standard errors.
            - "obfuscated_prefix": str, prefix for signal name if write_se is True.
            - "parallel": bool, whether to update sensor in parallel.
            - "geos": list of str, geographies to generate sensor for.
            - "weekday": list of bool, which weekday adjustments to perform. For each value in
                the list, signals will be generated with weekday adjustments (True) or without
                adjustments (False).
    """
    start_time = time.time()
    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))

    # handle range of estimates to produce
    # filename expected to have format: EDI_AGG_INPATIENT_DDMMYYYY_HHMM{timezone}.csv.gz
    if params["indicator"]["drop_date"] is None:
        dropdate_dt = datetime.strptime(
            Path(params["indicator"]["input_file"]).name.split("_")[3], "%d%m%Y")
    else:
        dropdate_dt = datetime.strptime(params["indicator"]["drop_date"], "%Y-%m-%d")

    # produce estimates for n_backfill_days
    # most recent n_waiting_days won't be est
    enddate_dt = dropdate_dt - timedelta(days=params["indicator"]["n_waiting_days"])
    startdate_dt = enddate_dt - timedelta(days=params["indicator"]["n_backfill_days"])
    enddate = str(enddate_dt.date())
    startdate = str(startdate_dt.date())
    dropdate = str(dropdate_dt.date())

    # now allow manual overrides
    if params["indicator"]["end_date"] is not None:
        enddate = params["indicator"]["end_date"]
    if params["indicator"]["start_date"] is not None:
        startdate = params["indicator"]['start_date']

    # print out information
    logger.info("Loaded params",
                startdate=startdate,
                enddate=enddate,
                dropdate=dropdate,
                n_backfill_days=params["indicator"]["n_backfill_days"],
                n_waiting_days=params["indicator"]["n_waiting_days"],
                geos=params["indicator"]["geos"],
                outpath=params["common"]["export_dir"],
                parallel=params["indicator"]["parallel"],
                weekday=params["indicator"]["weekday"],
                write_se=params["indicator"]["write_se"])

    # generate indicator csvs
    for geo in params["indicator"]["geos"]:
        for weekday in params["indicator"]["weekday"]:
            if weekday:
                logger.info("starting weekday adj", geo=geo)
            else:
                logger.info("starting no weekday adj", geo=geo)

            signal_name = Config.signal_weekday_name if weekday else Config.signal_name
            if params["indicator"]["write_se"]:
                assert params["indicator"]["obfuscated_prefix"] is not None, \
                    "supply obfuscated prefix in params.json"
                signal_name = params["indicator"]["obfuscated_prefix"] + "_" + signal_name

            logger.info("Updating signal name", signal_name=signal_name)
            updater = ClaimsHospIndicatorUpdater(
                startdate,
                enddate,
                dropdate,
                geo,
                params["indicator"]["parallel"],
                weekday,
                params["indicator"]["write_se"],
                signal_name)
            updater.update_indicator(params["indicator"]["input_file"],
                                     params["common"]["export_dir"])
        logger.info("finished updating", geo=geo)

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds)
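# A standalone check of the drop-date parsing above: the DDMMYYYY token is the
# fourth underscore-separated field of the input filename. The path here is a
# hypothetical example matching the documented pattern
# EDI_AGG_INPATIENT_DDMMYYYY_HHMM{timezone}.csv.gz.
from datetime import datetime
from pathlib import Path

input_file = "/data/EDI_AGG_INPATIENT_07062021_1630CDT.csv.gz"
dropdate_dt = datetime.strptime(Path(input_file).name.split("_")[3], "%d%m%Y")
assert dropdate_dt == datetime(2021, 6, 7)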
def run_module(params):
    """
    Run Google Symptoms module.

    Parameters
    ----------
    params
        Dictionary containing indicator configuration. Expected to have the following structure:
        - "common":
            - "export_dir": str, directory to write output
            - "log_exceptions" (optional): bool, whether to log exceptions to file
            - "log_filename" (optional): str, name of file to write logs
        - "indicator":
            - "export_start_date": str, YYYY-MM-DD format, date from which to export data
            - "num_export_days": int, number of days before end date (today) to export
            - "bigquery_credentials": str, path to BigQuery API key and service account
                JSON file
    """
    start_time = time.time()
    csv_export_count = 0
    oldest_final_export_date = None

    export_start_date = datetime.strptime(
        params["indicator"]["export_start_date"], "%Y-%m-%d")
    export_dir = params["common"]["export_dir"]
    num_export_days = params["indicator"].get("num_export_days", "all")

    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))

    # Pull GS data
    dfs = pull_gs_data(params["indicator"]["bigquery_credentials"],
                       export_start_date,
                       num_export_days)
    gmpr = geomap.GeoMapper()

    for geo_res in GEO_RESOLUTIONS:
        if geo_res == "state":
            df_pull = dfs["state"]
        elif geo_res in ["hhs", "nation"]:
            df_pull = gmpr.replace_geocode(dfs["county"], "fips", geo_res,
                                           from_col="geo_id", date_col="timestamp")
            df_pull.rename(columns={geo_res: "geo_id"}, inplace=True)
        else:
            df_pull = geo_map(dfs["county"], geo_res)

        if len(df_pull) == 0:
            continue
        for metric, smoother in product(METRICS + [COMBINED_METRIC], SMOOTHERS):
            print(geo_res, metric, smoother)
            df = df_pull.set_index(["timestamp", "geo_id"])
            df["val"] = df[metric].groupby(level=1).transform(
                SMOOTHERS_MAP[smoother][0])
            df["se"] = np.nan
            df["sample_size"] = np.nan
            # Drop early entries where data insufficient for smoothing
            df = df.loc[~df["val"].isnull(), :]
            df = df.reset_index()
            sensor_name = "_".join([smoother, "search"])

            if len(df) == 0:
                continue
            exported_csv_dates = create_export_csv(
                df,
                export_dir=export_dir,
                start_date=SMOOTHERS_MAP[smoother][1](export_start_date),
                metric=metric.lower(),
                geo_res=geo_res,
                sensor=sensor_name)
            if not exported_csv_dates.empty:
                csv_export_count += exported_csv_dates.size
                if not oldest_final_export_date:
                    oldest_final_export_date = max(exported_csv_dates)
                oldest_final_export_date = min(
                    oldest_final_export_date, max(exported_csv_dates))

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    max_lag_in_days = None
    formatted_oldest_final_export_date = None
    if oldest_final_export_date:
        max_lag_in_days = (datetime.now() - oldest_final_export_date).days
        formatted_oldest_final_export_date = oldest_final_export_date.strftime("%Y-%m-%d")
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds,
                csv_export_count=csv_export_count,
                max_lag_in_days=max_lag_in_days,
                oldest_final_export_date=formatted_oldest_final_export_date)
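# A standalone sketch of the smoothing step above: with a (timestamp, geo_id)
# MultiIndex, groupby(level=1) groups each geography's time series, and
# transform() applies a smoother while preserving the index. A 2-day rolling
# mean stands in for the module's SMOOTHERS_MAP entries.
import pandas as pd

df = pd.DataFrame({
    "timestamp": pd.to_datetime(["2021-06-01", "2021-06-02"] * 2),
    "geo_id": ["ca", "ca", "ny", "ny"],
    "metric": [1.0, 3.0, 10.0, 30.0],
}).set_index(["timestamp", "geo_id"])
df["val"] = df["metric"].groupby(level=1).transform(
    lambda s: s.rolling(2, min_periods=2).mean())
# "val" is NaN on each geography's first day, then 2.0 for ca and 20.0 for ny.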
def run_module(params: Dict[str, Any]):
    """Run the quidel_covidtest indicator.

    The `params` argument is expected to have the following structure:
    - "common":
        - "export_dir": str, directory to write output
        - "log_exceptions" (optional): bool, whether to log exceptions to file
        - "log_filename" (optional): str, name of file to write logs
    - "indicator":
        - "static_file_dir": str, directory name with population information
        - "input_cache_dir": str, directory in which to cache input data
        - "export_start_date": str, YYYY-MM-DD format of earliest date to create output
        - "export_end_date": str, YYYY-MM-DD format of latest date to create output
            or "" to create through the present
        - "pull_start_date": str, YYYY-MM-DD format of earliest date to pull input
        - "pull_end_date": str, YYYY-MM-DD format of latest date to create output
            or "" to create through the present
        - "aws_credentials": Dict[str, str], authentication parameters for AWS S3;
            see S3 documentation
        - "bucket_name": str, name of AWS bucket in which to find data
        - "wip_signal": List[str], list of signal names that are works in progress
        - "test_mode": bool, whether we are running in test mode
    """
    start_time = time.time()
    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))
    atexit.register(log_exit, start_time, logger)
    cache_dir = params["indicator"]["input_cache_dir"]
    export_dir = params["common"]["export_dir"]
    export_start_date = params["indicator"]["export_start_date"]
    export_end_date = params["indicator"]["export_end_date"]

    # Pull data and update export date
    df, _end_date = pull_quidel_covidtest(params["indicator"])
    if _end_date is None:
        print("The data is up-to-date. Currently, no new data to be ingested.")
        return
    export_end_date = check_export_end_date(export_end_date, _end_date,
                                            END_FROM_TODAY_MINUS)
    export_start_date = check_export_start_date(export_start_date,
                                                export_end_date,
                                                EXPORT_DAY_RANGE)

    first_date, last_date = df["timestamp"].min(), df["timestamp"].max()

    # State Level
    data = df.copy()
    state_groups = geo_map("state", data).groupby("state_id")

    # Add prefix, if required
    sensors = add_prefix(SENSORS,
                         wip_signal=params["indicator"]["wip_signal"],
                         prefix="wip_")
    smoothers = SMOOTHERS.copy()

    for sensor in sensors:
        # For State Level
        print("state", sensor)
        if sensor.endswith(SMOOTHED_POSITIVE):
            smoothers[sensor] = smoothers.pop(SMOOTHED_POSITIVE)
        elif sensor.endswith(RAW_POSITIVE):
            smoothers[sensor] = smoothers.pop(RAW_POSITIVE)
        elif sensor.endswith(SMOOTHED_TEST_PER_DEVICE):
            smoothers[sensor] = smoothers.pop(SMOOTHED_TEST_PER_DEVICE)
        else:
            smoothers[sensor] = smoothers.pop(RAW_TEST_PER_DEVICE)

        state_df = generate_sensor_for_states(
            state_groups,
            smooth=smoothers[sensor][1],
            device=smoothers[sensor][0],
            first_date=first_date,
            last_date=last_date)
        create_export_csv(state_df,
                          geo_res="state",
                          sensor=sensor,
                          export_dir=export_dir,
                          start_date=export_start_date,
                          end_date=export_end_date)

    # County/HRR/MSA level
    for geo_res in GEO_RESOLUTIONS:
        geo_data, res_key = geo_map(geo_res, data)
        for sensor in sensors:
            print(geo_res, sensor)
            res_df = generate_sensor_for_other_geores(
                state_groups,
                geo_data,
                res_key,
                smooth=smoothers[sensor][1],
                device=smoothers[sensor][0],
                first_date=first_date,
                last_date=last_date)
            create_export_csv(res_df,
                              geo_res=geo_res,
                              sensor=sensor,
                              export_dir=export_dir,
                              start_date=export_start_date,
                              end_date=export_end_date,
                              remove_null_samples=True)

    # Export the cache file if the pipeline runs successfully.
    # Otherwise, don't update the cache file
    update_cache_file(df, _end_date, cache_dir)
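# A standalone sketch of the atexit pattern above: registering a callback with
# the start time and logger means the elapsed-time log line is emitted even if
# the run returns early (e.g. the "no new data" path). This log_exit() is a
# hypothetical stand-in for the module's helper of the same name, and `logger`
# is assumed to be a structured logger accepting keyword fields.
import atexit
import time

def log_exit(start_time, logger):
    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds)

# atexit.register(log_exit, time.time(), logger)  # fires at interpreter exit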
def run_module(params):
    """Run module for Safegraph patterns data.

    The `params` argument is expected to have the following structure:
    - "common":
        - "export_dir": str, directory to write output
        - "log_exceptions" (optional): bool, whether to log exceptions to file
        - "log_filename" (optional): str, name of file to write logs
    - "indicator":
        - "aws_access_key_id": str, ID of access key for AWS S3
        - "aws_secret_access_key": str, access key for AWS S3
        - "aws_default_region": str, name of AWS S3 region
        - "aws_endpoint": str, name of AWS S3 endpoint
        - "n_core": int, number of cores to use for multithreaded processing
        - "raw_data_dir": str, directory from which to read downloaded data from AWS
        - "static_file_dir": str, directory containing brand and population csv files
        - "sync": bool, whether to sync S3 data before running indicator
    """
    start_time = time.time()
    export_dir = params["common"]["export_dir"]
    raw_data_dir = params["indicator"]["raw_data_dir"]
    n_core = params["indicator"]["n_core"]
    aws_endpoint = params["indicator"]["aws_endpoint"]
    static_file_dir = params["indicator"]["static_file_dir"]
    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))
    env_vars = {
        'AWS_ACCESS_KEY_ID': params["indicator"]["aws_access_key_id"],
        'AWS_SECRET_ACCESS_KEY': params["indicator"]["aws_secret_access_key"],
        'AWS_DEFAULT_REGION': params["indicator"]["aws_default_region"],
    }

    for ver in VERSIONS:
        # Update raw data
        # Why call subprocess rather than using a native Python client, e.g. boto3?
        # Because boto3 does not have a simple rsync-like call that can perform
        # the following behavior elegantly.
        if params["indicator"]["sync"]:
            subprocess.run(
                f'aws s3 sync s3://sg-c19-response/{ver[1]}/ '
                f'{raw_data_dir}/{ver[1]}/ --endpoint {aws_endpoint}',
                env=env_vars,
                shell=True,
                check=True)

        brand_df = pd.read_csv(
            join(static_file_dir, f"brand_info/brand_info_{ver[0]}.csv"))

        files = glob.glob(f'{raw_data_dir}/{ver[1]}/{ver[2]}', recursive=True)

        process_file = partial(process,
                               brand_df=brand_df,
                               metrics=METRICS,
                               sensors=SENSORS,
                               geo_resolutions=GEO_RESOLUTIONS,
                               export_dir=export_dir)

        with mp.Pool(n_core) as pool:
            pool.map(process_file, files)

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds)
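# A standalone sketch of the partial() + Pool pattern used above (and in the
# Safegraph social-distancing module): functools.partial pins the shared
# keyword arguments so pool.map can call the worker with just the varying
# filename. toy_process() and the filenames are hypothetical.
import functools
import multiprocessing as mp

def toy_process(filename, export_dir):
    return f"{export_dir}/{filename}.out"

if __name__ == "__main__":
    single_arg_process = functools.partial(toy_process, export_dir="./receiving")
    with mp.Pool(2) as pool:
        results = pool.map(single_arg_process, ["a.csv", "b.csv"])
    # results == ["./receiving/a.csv.out", "./receiving/b.csv.out"]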
def run_module():
    """Run the JHU indicator module."""
    params = read_params()
    export_start_date = params["export_start_date"]
    export_dir = params["export_dir"]
    base_url = params["base_url"]
    cache_dir = params["cache_dir"]
    logger = get_structured_logger(__name__, filename=params.get("log_filename"))

    if len(params["bucket_name"]) > 0:
        arch_diff = S3ArchiveDiffer(
            cache_dir, export_dir,
            params["bucket_name"], "jhu",
            params["aws_credentials"],
        )
        arch_diff.update_cache()
    else:
        arch_diff = None

    gmpr = GeoMapper()
    dfs = {metric: pull_jhu_data(base_url, metric, gmpr) for metric in METRICS}
    for metric, geo_res, sensor, smoother in product(
            METRICS, GEO_RESOLUTIONS, SENSORS, SMOOTHERS):
        print(metric, geo_res, sensor, smoother)
        logger.info(
            event="generating signal and exporting to CSV",
            metric=metric,
            geo_res=geo_res,
            sensor=sensor,
            smoother=smoother)
        df = dfs[metric]
        # Aggregate to appropriate geographic resolution
        df = geo_map(df, geo_res)
        df.set_index(["timestamp", "geo_id"], inplace=True)
        df["val"] = df[sensor].groupby(level=1).transform(SMOOTHERS_MAP[smoother][0])
        df["se"] = np.nan
        df["sample_size"] = np.nan
        # Drop early entries where data insufficient for smoothing
        df = df[~df["val"].isnull()]
        df = df.reset_index()
        sensor_name = SENSOR_NAME_MAP[sensor][0]
        # if (SENSOR_NAME_MAP[sensor][1] or SMOOTHERS_MAP[smoother][2]):
        #     metric = f"wip_{metric}"
        #     sensor_name = WIP_SENSOR_NAME_MAP[sensor][0]
        sensor_name = SMOOTHERS_MAP[smoother][1] + sensor_name
        create_export_csv(
            df,
            export_dir=export_dir,
            start_date=datetime.strptime(export_start_date, "%Y-%m-%d"),
            metric=metric,
            geo_res=geo_res,
            sensor=sensor_name,
        )

    if arch_diff is not None:
        # Diff exports, and make incremental versions
        _, common_diffs, new_files = arch_diff.diff_exports()

        # Archive changed and new files only
        to_archive = [f for f, diff in common_diffs.items() if diff is not None]
        to_archive += new_files
        _, fails = arch_diff.archive_exports(to_archive)

        # Filter existing exports to exclude those that failed to archive
        succ_common_diffs = {
            f: diff for f, diff in common_diffs.items() if f not in fails
        }
        arch_diff.filter_exports(succ_common_diffs)

        # Report failures: someone should probably look at them
        for exported_file in fails:
            print(f"Failed to archive '{exported_file}'")
def run_module(params: Dict[str, Any]):
    """Run the JHU indicator module.

    The `params` argument is expected to have the following structure:
    - "common":
        - "export_dir": str, directory to write output
        - "log_exceptions" (optional): bool, whether to log exceptions to file
        - "log_filename" (optional): str, name of file to write logs
    - "indicator":
        - "base_url": str, URL from which to read upstream data
        - "export_start_date": str, date from which to export data in YYYY-MM-DD format
    - "archive" (optional): if provided, output will be archived with S3
        - "aws_credentials": Dict[str, str], AWS login credentials (see S3 documentation)
        - "bucket_name": str, name of S3 bucket to read/write
        - "cache_dir": str, directory of locally cached data
    """
    start_time = time.time()
    csv_export_count = 0
    oldest_final_export_date = None
    export_start_date = params["indicator"]["export_start_date"]
    export_dir = params["common"]["export_dir"]
    base_url = params["indicator"]["base_url"]
    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))

    if "archive" in params:
        arch_diff = S3ArchiveDiffer(
            params["archive"]["cache_dir"], export_dir,
            params["archive"]["bucket_name"], "jhu",
            params["archive"]["aws_credentials"],
        )
        arch_diff.update_cache()
    else:
        arch_diff = None

    gmpr = GeoMapper()
    dfs = {metric: pull_jhu_data(base_url, metric, gmpr) for metric in METRICS}
    for metric, geo_res, sensor, smoother in product(
            METRICS, GEO_RESOLUTIONS, SENSORS, SMOOTHERS):
        print(metric, geo_res, sensor, smoother)
        logger.info(
            event="generating signal and exporting to CSV",
            metric=metric,
            geo_res=geo_res,
            sensor=sensor,
            smoother=smoother)
        df = dfs[metric]
        # Aggregate to appropriate geographic resolution
        df = geo_map(df, geo_res, sensor)
        df.set_index(["timestamp", "geo_id"], inplace=True)
        df["val"] = df[sensor].groupby(level=1).transform(SMOOTHERS_MAP[smoother][0])
        df["se"] = np.nan
        df["sample_size"] = np.nan
        # Drop early entries where data insufficient for smoothing
        df = df[~df["val"].isnull()]
        df = df.reset_index()
        sensor_name = SENSOR_NAME_MAP[sensor][0]
        # if (SENSOR_NAME_MAP[sensor][1] or SMOOTHERS_MAP[smoother][2]):
        #     metric = f"wip_{metric}"
        #     sensor_name = WIP_SENSOR_NAME_MAP[sensor][0]
        sensor_name = SMOOTHERS_MAP[smoother][1] + sensor_name
        exported_csv_dates = create_export_csv(
            df,
            export_dir=export_dir,
            start_date=datetime.strptime(export_start_date, "%Y-%m-%d"),
            metric=metric,
            geo_res=geo_res,
            sensor=sensor_name,
        )
        if not exported_csv_dates.empty:
            csv_export_count += exported_csv_dates.size
            if not oldest_final_export_date:
                oldest_final_export_date = max(exported_csv_dates)
            oldest_final_export_date = min(oldest_final_export_date,
                                           max(exported_csv_dates))

    if arch_diff is not None:
        # Diff exports, and make incremental versions
        _, common_diffs, new_files = arch_diff.diff_exports()

        # Archive changed and new files only
        to_archive = [f for f, diff in common_diffs.items() if diff is not None]
        to_archive += new_files
        _, fails = arch_diff.archive_exports(to_archive)

        # Filter existing exports to exclude those that failed to archive
        succ_common_diffs = {
            f: diff for f, diff in common_diffs.items() if f not in fails
        }
        arch_diff.filter_exports(succ_common_diffs)

        # Report failures: someone should probably look at them
        for exported_file in fails:
            print(f"Failed to archive '{exported_file}'")

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    max_lag_in_days = None
    formatted_oldest_final_export_date = None
    if oldest_final_export_date:
        max_lag_in_days = (datetime.now() - oldest_final_export_date).days
        formatted_oldest_final_export_date = oldest_final_export_date.strftime("%Y-%m-%d")
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds,
                csv_export_count=csv_export_count,
                max_lag_in_days=max_lag_in_days,
                oldest_final_export_date=formatted_oldest_final_export_date)
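# A minimal `params` sketch for the JHU run above. All values are hypothetical;
# the key structure follows the docstring, with the optional "archive" section
# omitted so arch_diff stays None.
example_params = {
    "common": {
        "export_dir": "./receiving",
    },
    "indicator": {
        "base_url": "https://example.com/jhu/{metric}.csv",  # placeholder URL
        "export_start_date": "2020-02-20",
    },
}
# run_module(example_params)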
def run_module(params: Dict[str, Any]):
    """Run Quidel flu test module.

    The `params` argument is expected to have the following structure:
    - "common":
        - "export_dir": str, directory to write output
        - "log_exceptions" (optional): bool, whether to log exceptions to file
        - "log_filename" (optional): str, name of file to write logs
    - "indicator":
        - "static_file_dir": str, directory name with population information
        - "input_cache_dir": str, directory in which to cache input data
        - "export_start_date": str, YYYY-MM-DD format of earliest date to create output
        - "export_end_date": str, YYYY-MM-DD format of latest date to create output
            or "" to create through the present
        - "pull_start_date": str, YYYY-MM-DD format of earliest date to pull input
        - "pull_end_date": str, YYYY-MM-DD format of latest date to create output
            or "" to create through the present
        - "aws_credentials": Dict[str, str], authentication parameters for AWS S3;
            see S3 documentation
        - "bucket_name": str, name of AWS bucket in which to find data
        - "wip_signal": List[str], list of signal names that are works in progress
        - "test_mode": bool, whether we are running in test mode
    """
    start_time = time.time()
    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))
    cache_dir = params["indicator"]["input_cache_dir"]
    export_dir = params["common"]["export_dir"]
    static_file_dir = params["indicator"]["static_file_dir"]
    export_start_dates = params["indicator"]["export_start_date"]
    export_end_dates = params["indicator"]["export_end_date"]
    map_df = pd.read_csv(join(static_file_dir, "fips_prop_pop.csv"),
                         dtype={"fips": int})

    # Pull data and update export date
    dfs, _end_date = pull_quidel_data(params["indicator"])
    if _end_date is None:
        print("The data is up-to-date. Currently, no new data to be ingested.")
        return
    export_end_dates = check_export_end_date(export_end_dates, _end_date,
                                             END_FROM_TODAY_MINUS)
    export_start_dates = check_export_start_date(export_start_dates,
                                                 export_end_dates,
                                                 EXPORT_DAY_RANGE)

    # Add prefix, if required
    sensors = add_prefix(list(SENSORS.keys()),
                         wip_signal=params["indicator"]["wip_signal"],
                         prefix="wip_")

    for sensor in sensors:
        # Check either covid_ag or flu_ag
        test_type = "covid_ag" if "covid_ag" in sensor else "flu_ag"
        print("state", sensor)
        data = dfs[test_type].copy()
        state_groups = geo_map("state", data, map_df).groupby("state_id")
        first_date, last_date = data["timestamp"].min(), data["timestamp"].max()

        # For State Level
        state_df = generate_sensor_for_states(
            state_groups,
            smooth=SENSORS[sensor][1],
            device=SENSORS[sensor][0],
            first_date=first_date,
            last_date=last_date)
        create_export_csv(state_df,
                          geo_res="state",
                          sensor=sensor,
                          export_dir=export_dir,
                          start_date=export_start_dates[test_type],
                          end_date=export_end_dates[test_type])

        # County/HRR/MSA level
        for geo_res in GEO_RESOLUTIONS:
            print(geo_res, sensor)
            data = dfs[test_type].copy()
            data, res_key = geo_map(geo_res, data, map_df)
            res_df = generate_sensor_for_other_geores(
                state_groups,
                data,
                res_key,
                smooth=SENSORS[sensor][1],
                device=SENSORS[sensor][0],
                first_date=first_date,
                last_date=last_date)
            create_export_csv(res_df,
                              geo_res=geo_res,
                              sensor=sensor,
                              export_dir=export_dir,
                              start_date=export_start_dates[test_type],
                              end_date=export_end_dates[test_type],
                              remove_null_samples=True)

    # Export the cache file if the pipeline runs successfully.
    # Otherwise, don't update the cache file
    update_cache_file(dfs, _end_date, cache_dir)

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds)