def test_export_without_null_removal(self):
    """Test that `remove_null_samples = False` does not remove entries with null samples."""
    _clean_directory(self.TEST_DIR)

    df_with_nulls = self.DF.copy().append(
        {
            "geo_id": "66666",
            "timestamp": datetime(2020, 6, 6),
            "val": 10,
            "se": 0.2,
            "sample_size": pd.NA,
        },
        ignore_index=True)

    create_export_csv(
        df=df_with_nulls,
        export_dir=self.TEST_DIR,
        geo_res="state",
        sensor="test",
        remove_null_samples=False)

    assert _non_ignored_files_set(self.TEST_DIR) == set([
        "20200215_state_test.csv",
        "20200301_state_test.csv",
        "20200315_state_test.csv",
        "20200606_state_test.csv",
    ])
    assert pd.read_csv(join(self.TEST_DIR, "20200606_state_test.csv")).size > 0
def test_export_with_limiting_start_date(self):
    """Test that the `start_date` prevents earlier dates from being exported."""
    # Clean receiving directory
    _clean_directory(self.TEST_DIR)

    create_export_csv(
        df=self.DF,
        start_date=datetime.strptime("2020-02-20", "%Y-%m-%d"),
        export_dir=self.TEST_DIR,
        geo_res="county",
        sensor="test",
    )

    assert _non_ignored_files_set(self.TEST_DIR) == set([
        "20200301_county_test.csv",
        "20200315_county_test.csv",
    ])
def test_export_with_no_dates(self):
    """Test that omitting the `start_date` and `end_date` exports all dates."""
    # Clean receiving directory
    _clean_directory(self.TEST_DIR)

    create_export_csv(
        df=self.DF,
        export_dir=self.TEST_DIR,
        geo_res="state",
        sensor="test",
    )

    assert _non_ignored_files_set(self.TEST_DIR) == set([
        "20200215_state_test.csv",
        "20200301_state_test.csv",
        "20200315_state_test.csv",
    ])
def test_export_without_metric(self):
    """Test that exporting CSVs without the `metric` argument yields the correct files."""
    # Clean receiving directory
    _clean_directory(self.TEST_DIR)

    create_export_csv(
        df=self.DF,
        start_date=datetime.strptime("2020-02-15", "%Y-%m-%d"),
        export_dir=self.TEST_DIR,
        geo_res="county",
        sensor="test",
    )

    assert _non_ignored_files_set(self.TEST_DIR) == set([
        "20200215_county_test.csv",
        "20200301_county_test.csv",
        "20200315_county_test.csv",
    ])
def run_module():
    """Run Google Symptoms module."""
    params = read_params()
    export_start_date = datetime.strptime(params["export_start_date"], "%Y-%m-%d")
    export_dir = params["export_dir"]
    base_url = params["base_url"]

    # Pull GS data
    dfs = pull_gs_data(base_url)
    gmpr = geomap.GeoMapper()

    for geo_res in GEO_RESOLUTIONS:
        if geo_res == "state":
            df_pull = dfs["state"]
        elif geo_res in ["hhs", "nation"]:
            df_pull = gmpr.replace_geocode(dfs["county"], "fips", geo_res,
                                           from_col="geo_id", date_col="timestamp")
            df_pull.rename(columns={geo_res: "geo_id"}, inplace=True)
        else:
            df_pull = geo_map(dfs["county"], geo_res)

        for metric, smoother in product(METRICS + [COMBINED_METRIC], SMOOTHERS):
            print(geo_res, metric, smoother)
            df = df_pull.set_index(["timestamp", "geo_id"])
            df["val"] = df[metric].groupby(level=1).transform(
                SMOOTHERS_MAP[smoother][0])
            df["se"] = np.nan
            df["sample_size"] = np.nan
            # Drop early entries where data insufficient for smoothing
            df = df.loc[~df["val"].isnull(), :]
            df = df.reset_index()
            sensor_name = "_".join([smoother, "search"])
            create_export_csv(
                df,
                export_dir=export_dir,
                start_date=SMOOTHERS_MAP[smoother][1](export_start_date),
                metric=metric.lower(),
                geo_res=geo_res,
                sensor=sensor_name)
def test_export_rounding(self):
    """Test that exported values are rounded to the expected precision."""
    # Clean receiving directory
    _clean_directory(self.TEST_DIR)

    create_export_csv(
        df=self.DF,
        start_date=datetime.strptime("2020-02-15", "%Y-%m-%d"),
        export_dir=self.TEST_DIR,
        metric="deaths",
        geo_res="county",
        sensor="test",
    )

    pd.testing.assert_frame_equal(
        pd.read_csv(join(self.TEST_DIR, "20200215_county_deaths_test.csv")),
        pd.DataFrame({
            "geo_id": [51093, 51175],
            "val": [round(3.12345678910, 7), 2.1],
            "se": [0.15, 0.22],
            "sample_size": [100, 100],
        }))
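# A minimal sketch (not from the original source) of the export filename
# convention that the tests above exercise: create_export_csv appears to write
# one CSV per date named "{YYYYMMDD}_{geo_res}_{metric}_{sensor}.csv", with the
# metric segment omitted when no `metric` argument is passed. The helper below
# is hypothetical and only illustrates that naming scheme.
def _expected_export_filename(day, geo_res, sensor, metric=None):
    """Return the export filename implied by the test expectations above."""
    parts = [day.strftime("%Y%m%d"), geo_res]
    if metric is not None:
        parts.append(metric)
    parts.append(sensor)
    return "_".join(parts) + ".csv"

# Example:
# _expected_export_filename(datetime(2020, 2, 15), "county", "test", metric="deaths")
# == "20200215_county_deaths_test.csv"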
def run_module(params):
    """
    Run the CAN testing metrics indicator.

    Parameters
    ----------
    params
        Dictionary containing indicator configuration. Expected to have the following structure:
        - "common":
            - "export_dir": str, directory to write output
        - "indicator":
            - "parquet_url": str, URL of source file in parquet format
        - "archive" (optional): if provided, output will be archived with S3
            - "cache_dir": str, directory of locally cached data
            - "bucket_name": str, name of S3 bucket to read/write
            - "aws_credentials": Dict[str, str], AWS login credentials (see S3 documentation)
    """
    # Configuration
    export_dir = params["common"]["export_dir"]
    parquet_url = params["indicator"]["parquet_url"]

    # Archive Differ configuration
    if "archive" in params:
        cache_dir = params["archive"]["cache_dir"]
        arch_diff = S3ArchiveDiffer(cache_dir, export_dir,
                                    params["archive"]["bucket_name"], "CAN",
                                    params["archive"]["aws_credentials"])
        arch_diff.update_cache()
    else:
        arch_diff = None

    # Load CAN county-level testing data
    print("Pulling CAN data")
    df_pq = load_data(parquet_url)
    df_county_testing = extract_testing_metrics(df_pq)

    # Perform geo aggregations and export to receiving
    for geo_res in GEO_RESOLUTIONS:
        print(f"Processing {geo_res}")
        df = geo_map(df_county_testing, geo_res)

        # Export 'pcr_specimen_positivity_rate'
        exported_csv_dates = create_export_csv(
            df, export_dir=export_dir, geo_res=geo_res, sensor=SIGNALS[0])

        # Export 'pcr_specimen_total_tests'
        df["val"] = df["sample_size"]
        df["sample_size"] = np.nan
        df["se"] = np.nan
        exported_csv_dates = create_export_csv(
            df, export_dir=export_dir, geo_res=geo_res, sensor=SIGNALS[1])

        earliest, latest = min(exported_csv_dates), max(exported_csv_dates)
        print(f"Exported dates: {earliest} to {latest}")

    # Perform archive differencing
    if arch_diff is not None:
        # Diff exports, and make incremental versions
        _, common_diffs, new_files = arch_diff.diff_exports()

        # Archive changed and new files only
        to_archive = [f for f, diff in common_diffs.items() if diff is not None]
        to_archive += new_files
        _, fails = arch_diff.archive_exports(to_archive)

        # Filter existing exports to exclude those that failed to archive
        succ_common_diffs = {
            f: diff for f, diff in common_diffs.items() if f not in fails
        }
        arch_diff.filter_exports(succ_common_diffs)

        # Report failures: someone should probably look at them
        for exported_file in fails:
            print(f"Failed to archive '{exported_file}'")
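# A hypothetical params dictionary matching the structure documented in the
# docstring above; the URL, bucket name, and credentials are placeholders,
# not values from the original source.
EXAMPLE_CAN_PARAMS = {
    "common": {"export_dir": "./receiving"},
    "indicator": {"parquet_url": "https://example.com/can_data.parquet"},
    "archive": {  # optional; omit this block to skip S3 archiving
        "cache_dir": "./cache",
        "bucket_name": "example-bucket",
        "aws_credentials": {"aws_access_key_id": "...", "aws_secret_access_key": "..."},
    },
}
# run_module(EXAMPLE_CAN_PARAMS)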
def run_module(params: Dict[str, Any]):
    """Run the JHU indicator module.

    The `params` argument is expected to have the following structure:
    - "common":
        - "export_dir": str, directory to write output
        - "log_exceptions" (optional): bool, whether to log exceptions to file
        - "log_filename" (optional): str, name of file to write logs
    - "indicator":
        - "base_url": str, URL from which to read upstream data
        - "export_start_date": str, date from which to export data in YYYY-MM-DD format
    - "archive" (optional): if provided, output will be archived with S3
        - "aws_credentials": Dict[str, str], AWS login credentials (see S3 documentation)
        - "bucket_name": str, name of S3 bucket to read/write
        - "cache_dir": str, directory of locally cached data
    """
    start_time = time.time()
    csv_export_count = 0
    oldest_final_export_date = None
    export_start_date = params["indicator"]["export_start_date"]
    export_dir = params["common"]["export_dir"]
    base_url = params["indicator"]["base_url"]

    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))

    if "archive" in params:
        arch_diff = S3ArchiveDiffer(
            params["archive"]["cache_dir"],
            export_dir,
            params["archive"]["bucket_name"],
            "jhu",
            params["archive"]["aws_credentials"],
        )
        arch_diff.update_cache()
    else:
        arch_diff = None

    gmpr = GeoMapper()
    dfs = {metric: pull_jhu_data(base_url, metric, gmpr) for metric in METRICS}
    for metric, geo_res, sensor, smoother in product(
            METRICS, GEO_RESOLUTIONS, SENSORS, SMOOTHERS):
        print(metric, geo_res, sensor, smoother)
        logger.info(event="generating signal and exporting to CSV",
                    metric=metric,
                    geo_res=geo_res,
                    sensor=sensor,
                    smoother=smoother)
        df = dfs[metric]
        # Aggregate to appropriate geographic resolution
        df = geo_map(df, geo_res, sensor)
        df.set_index(["timestamp", "geo_id"], inplace=True)
        df["val"] = df[sensor].groupby(level=1).transform(
            SMOOTHERS_MAP[smoother][0])
        df["se"] = np.nan
        df["sample_size"] = np.nan
        # Drop early entries where data insufficient for smoothing
        df = df[~df["val"].isnull()]
        df = df.reset_index()
        sensor_name = SENSOR_NAME_MAP[sensor][0]
        # if (SENSOR_NAME_MAP[sensor][1] or SMOOTHERS_MAP[smoother][2]):
        #     metric = f"wip_{metric}"
        #     sensor_name = WIP_SENSOR_NAME_MAP[sensor][0]
        sensor_name = SMOOTHERS_MAP[smoother][1] + sensor_name
        exported_csv_dates = create_export_csv(
            df,
            export_dir=export_dir,
            start_date=datetime.strptime(export_start_date, "%Y-%m-%d"),
            metric=metric,
            geo_res=geo_res,
            sensor=sensor_name,
        )
        if not exported_csv_dates.empty:
            csv_export_count += exported_csv_dates.size
            if not oldest_final_export_date:
                oldest_final_export_date = max(exported_csv_dates)
            oldest_final_export_date = min(
                oldest_final_export_date, max(exported_csv_dates))

    if arch_diff is not None:
        # Diff exports, and make incremental versions
        _, common_diffs, new_files = arch_diff.diff_exports()

        # Archive changed and new files only
        to_archive = [f for f, diff in common_diffs.items() if diff is not None]
        to_archive += new_files
        _, fails = arch_diff.archive_exports(to_archive)

        # Filter existing exports to exclude those that failed to archive
        succ_common_diffs = {
            f: diff for f, diff in common_diffs.items() if f not in fails
        }
        arch_diff.filter_exports(succ_common_diffs)

        # Report failures: someone should probably look at them
        for exported_file in fails:
            print(f"Failed to archive '{exported_file}'")

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    max_lag_in_days = None
    formatted_oldest_final_export_date = None
    if oldest_final_export_date:
        max_lag_in_days = (datetime.now() - oldest_final_export_date).days
        formatted_oldest_final_export_date = oldest_final_export_date.strftime("%Y-%m-%d")

    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds,
                csv_export_count=csv_export_count,
                max_lag_in_days=max_lag_in_days,
                oldest_final_export_date=formatted_oldest_final_export_date)
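# A hypothetical params dictionary matching the structure documented in the
# JHU docstring above; the URL, dates, and optional settings are placeholders,
# not values from the original source.
EXAMPLE_JHU_PARAMS = {
    "common": {
        "export_dir": "./receiving",
        "log_filename": "jhu.log",   # optional
        "log_exceptions": True,      # optional
    },
    "indicator": {
        "base_url": "https://example.com/jhu/{metric}.csv",
        "export_start_date": "2020-02-20",
    },
    # "archive": {...}  # optional S3 archiving block, as in the docstring
}
# run_module(EXAMPLE_JHU_PARAMS)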
def run_module(params):
    """
    Main function run when calling the module.

    Inputs parameters from the file 'params.json' and produces output data in
    the directory defined by `export_dir` (should be "receiving" except for
    testing purposes).

    Parameters
    ----------
    params
        Dictionary containing indicator configuration. Expected to have the following structure:
        - "common":
            - "export_dir": str, directory to write output
            - "log_exceptions" (optional): bool, whether to log exceptions to file
            - "log_filename" (optional): str, name of file to write logs
        - "indicator":
            - "static_file_dir": str, path to DMA mapping files
            - "data_dir": str, location of cached CSVs
            - "start_date": str, YYYY-MM-DD format, first day to generate data for
            - "end_date": str, YYYY-MM-DD format or empty string, last day to generate data for
            - "ght_key": str, GHT API key
            - "wip_signal": list of str or bool, to be passed to delphi_utils.add_prefix
            - "test_data_dir": str, path to test data
        - "archive" (optional): if provided, output will be archived with S3
            - "aws_credentials": Dict[str, str], AWS login credentials (see S3 documentation)
            - "bucket_name": str, name of S3 bucket to read/write
            - "cache_dir": str, directory of locally cached data
    """
    start_time = time.time()
    csv_export_count = 0
    oldest_final_export_date = None

    # read parameters
    ght_key = params["indicator"]["ght_key"]
    start_date = params["indicator"]["start_date"]
    end_date = params["indicator"]["end_date"]
    static_dir = params["indicator"]["static_file_dir"]
    export_dir = params["common"]["export_dir"]
    data_dir = params["indicator"]["data_dir"]
    wip_signal = params["indicator"]["wip_signal"]

    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))

    if "archive" in params:
        arch_diff = S3ArchiveDiffer(params["archive"]["cache_dir"], export_dir,
                                    params["archive"]["bucket_name"], "ght",
                                    params["archive"]["aws_credentials"])
        arch_diff.update_cache()

    # if missing start_date, set to today (GMT) minus 5 days
    if start_date == "":
        now = datetime.datetime.now(datetime.timezone.utc)
        start_date = (now - datetime.timedelta(days=4)).strftime("%Y-%m-%d")

    # if missing end_date, set to today (GMT) minus 5 days
    if end_date == "":
        now = datetime.datetime.now(datetime.timezone.utc)
        end_date = (now - datetime.timedelta(days=4)).strftime("%Y-%m-%d")

    # Turn on basic logging messages (level INFO)
    logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO)
    logging.info("Creating data from %s through %s.", start_date, end_date)

    # Dictionary mapping geo resolution to the data corresponding to that resolution.
    df_by_geo_res = {}

    if not params["indicator"]["test_data_dir"]:
        # setup class to handle API calls
        ght = GoogleHealthTrends(ght_key=ght_key)

        # read data frame version of the data
        df_by_geo_res[STATE] = get_counts_states(ght, PULL_START_DATE, end_date,
                                                 static_dir=static_dir,
                                                 data_dir=data_dir)
        df_by_geo_res[DMA] = get_counts_dma(ght, PULL_START_DATE, end_date,
                                            static_dir=static_dir,
                                            data_dir=data_dir)
    else:
        df_by_geo_res[STATE] = pd.read_csv(
            params["indicator"]["test_data_dir"].format(geo_res="state"))
        df_by_geo_res[DMA] = pd.read_csv(
            params["indicator"]["test_data_dir"].format(geo_res="dma"))

    df_by_geo_res[HRR], df_by_geo_res[MSA] = derived_counts_from_dma(
        df_by_geo_res[DMA], static_dir=static_dir)

    signal_names = add_prefix(SIGNALS, wip_signal, prefix="wip_")

    for signal in signal_names:
        is_smoothed = signal.endswith(SMOOTHED)
        for geo_res, df in df_by_geo_res.items():
            exported_csv_dates = create_export_csv(
                format_for_export(df, is_smoothed),
                geo_res=geo_res,
                sensor=signal,
                start_date=start_date,
                export_dir=export_dir)
            if not exported_csv_dates.empty:
                csv_export_count += exported_csv_dates.size
                if not oldest_final_export_date:
                    oldest_final_export_date = max(exported_csv_dates)
                oldest_final_export_date = min(
                    oldest_final_export_date, max(exported_csv_dates))

    if "archive" in params:
        archive(arch_diff)

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    max_lag_in_days = None
    formatted_oldest_final_export_date = None
    if oldest_final_export_date:
        max_lag_in_days = (datetime.datetime.now() - oldest_final_export_date).days
        formatted_oldest_final_export_date = oldest_final_export_date.strftime("%Y-%m-%d")

    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds,
                csv_export_count=csv_export_count,
                max_lag_in_days=max_lag_in_days,
                oldest_final_export_date=formatted_oldest_final_export_date)
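# A hypothetical params dictionary for the GHT module above; the API key,
# paths, and dates are placeholders, not values from the original source.
# A non-empty "test_data_dir" is a path template formatted with geo_res, e.g.
# "./test_data/{geo_res}_counts.csv", which switches the module to test mode.
EXAMPLE_GHT_PARAMS = {
    "common": {"export_dir": "./receiving"},
    "indicator": {
        "static_file_dir": "./static",
        "data_dir": "./data",
        "start_date": "2020-02-11",   # "" defaults to a recent date (see code above)
        "end_date": "",
        "ght_key": "YOUR_GHT_API_KEY",
        "wip_signal": False,
        "test_data_dir": "",
    },
    # "archive": {...}  # optional S3 archiving block
}
# run_module(EXAMPLE_GHT_PARAMS)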
def process(fname, sensors, metrics, geo_resolutions, export_dir, brand_df):
    """
    Process an input census block group-level CSV and export it.

    Assumes that the input file has _only_ one date of data.

    Parameters
    ----------
    fname: str
        Input filename.
    sensors: List[str]
        List of sensor names.
    metrics: List[Tuple[str, int, bool]]
        List of (metric_name, naics_code, wip).
    geo_resolutions: List[str]
        List of geo resolutions to export the data.
    export_dir: str
        Directory in which to write the exported CSVs.
    brand_df: pd.DataFrame
        Mapping info from naics_code to safegraph_brand_id.

    Returns
    -------
    None
    """
    metric_names, naics_codes, wips = (list(x) for x in zip(*metrics))
    used_cols = [
        "safegraph_brand_ids",
        "visits_by_day",
        "date_range_start",
        "date_range_end",
        "postal_code",
    ]

    if ".csv.gz" in fname:
        df = pd.read_csv(fname,
                         usecols=used_cols,
                         parse_dates=["date_range_start", "date_range_end"])
        dfs = construct_signals(df, metric_names, naics_codes, brand_df)
        print("Finished pulling data from " + fname)
    else:
        files = glob.glob(f'{fname}/**/*.csv.gz', recursive=True)
        dfs_dict = {"bars_visit": [], "restaurants_visit": []}
        for fn in files:
            df = pd.read_csv(fn,
                             usecols=used_cols,
                             parse_dates=["date_range_start", "date_range_end"])
            dfs = construct_signals(df, metric_names, naics_codes, brand_df)
            dfs_dict["bars_visit"].append(dfs["bars_visit"])
            dfs_dict["restaurants_visit"].append(dfs["restaurants_visit"])
        dfs = {}
        dfs["bars_visit"] = pd.concat(dfs_dict["bars_visit"]).groupby(
            ["timestamp", "zip"]).sum().reset_index()
        dfs["restaurants_visit"] = pd.concat(dfs_dict["restaurants_visit"]).groupby(
            ["timestamp", "zip"]).sum().reset_index()
        print("Finished pulling data from " + fname)

    for geo_res, sensor in product(geo_resolutions, sensors):
        for metric, wip in zip(metric_names, wips):
            df_export = aggregate(dfs[metric], metric, geo_res)
            df_export["val"] = df_export["_".join([metric, sensor])]
            df_export["se"] = np.nan
            df_export["sample_size"] = np.nan

            if wip:
                metric = "wip_" + metric
            create_export_csv(
                df_export,
                export_dir=export_dir,
                start_date=df_export["timestamp"].min(),
                metric=metric,
                geo_res=geo_res,
                sensor=sensor,
            )
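# A hypothetical invocation of process(), kept commented out because the paths,
# sensor names, and (metric_name, naics_code, wip) tuples are placeholders, not
# values from the original source. The three-field tuples mirror how the code
# above unpacks each metrics entry into metric_names, naics_codes, and wips.
#
# example_brand_df = pd.read_csv("./static/brand_info.csv")  # naics_code -> safegraph_brand_id
# process(
#     "./raw_data/patterns_parts",     # directory of .csv.gz parts, or a single .csv.gz file
#     sensors=["num", "prop"],
#     metrics=[("bars_visit", 722410, True), ("restaurants_visit", 722511, False)],
#     geo_resolutions=["county", "state"],
#     export_dir="./receiving",
#     brand_df=example_brand_df,
# )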
def run_module():
    """Main function run when calling the module.

    Inputs parameters from the file 'params.json' and produces output data in
    the directory defined by `export_dir` (should be "receiving" except for
    testing purposes).
    """
    # read parameters
    params = read_params()
    ght_key = params["ght_key"]
    start_date = params["start_date"]
    end_date = params["end_date"]
    static_dir = params["static_file_dir"]
    export_dir = params["export_dir"]
    data_dir = params["data_dir"]
    wip_signal = params["wip_signal"]
    cache_dir = params["cache_dir"]

    arch_diff = S3ArchiveDiffer(cache_dir, export_dir,
                                params["bucket_name"], "ght",
                                params["aws_credentials"])
    arch_diff.update_cache()
    print(arch_diff)

    # if missing start_date, set to today (GMT) minus 5 days
    if start_date == "":
        now = datetime.datetime.now(datetime.timezone.utc)
        start_date = (now - datetime.timedelta(days=4)).strftime("%Y-%m-%d")

    # if missing end_date, set to today (GMT) minus 5 days
    if end_date == "":
        now = datetime.datetime.now(datetime.timezone.utc)
        end_date = (now - datetime.timedelta(days=4)).strftime("%Y-%m-%d")

    # Turn on basic logging messages (level INFO)
    logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO)
    logging.info("Creating data from %s through %s.", start_date, end_date)

    # Dictionary mapping geo resolution to the data corresponding to that resolution.
    df_by_geo_res = {}

    if not params["test"]:
        # setup class to handle API calls
        ght = GoogleHealthTrends(ght_key=ght_key)

        # read data frame version of the data
        df_by_geo_res[STATE] = get_counts_states(ght, PULL_START_DATE, end_date,
                                                 static_dir=static_dir,
                                                 data_dir=data_dir)
        df_by_geo_res[DMA] = get_counts_dma(ght, PULL_START_DATE, end_date,
                                            static_dir=static_dir,
                                            data_dir=data_dir)
    else:
        df_by_geo_res[STATE] = pd.read_csv(
            params["test_data_dir"].format(geo_res="state"))
        df_by_geo_res[DMA] = pd.read_csv(
            params["test_data_dir"].format(geo_res="dma"))

    df_by_geo_res[HRR], df_by_geo_res[MSA] = derived_counts_from_dma(
        df_by_geo_res[DMA], static_dir=static_dir)

    signal_names = add_prefix(SIGNALS, wip_signal, prefix="wip_")

    for signal in signal_names:
        is_smoothed = signal.endswith(SMOOTHED)
        for geo_res, df in df_by_geo_res.items():
            create_export_csv(format_for_export(df, is_smoothed),
                              geo_res=geo_res,
                              sensor=signal,
                              start_date=start_date,
                              export_dir=export_dir)

    if not params["test"]:
        # Diff exports, and make incremental versions
        _, common_diffs, new_files = arch_diff.diff_exports()

        # Archive changed and new files only
        to_archive = [f for f, diff in common_diffs.items() if diff is not None]
        to_archive += new_files
        _, fails = arch_diff.archive_exports(to_archive)

        # Filter existing exports to exclude those that failed to archive
        succ_common_diffs = {
            f: diff for f, diff in common_diffs.items() if f not in fails
        }
        arch_diff.filter_exports(succ_common_diffs)

        # Report failures: someone should probably look at them
        for exported_file in fails:
            print(f"Failed to archive '{exported_file}'")
def run_module():
    """Run the JHU indicator module."""
    params = read_params()
    export_start_date = params["export_start_date"]
    export_dir = params["export_dir"]
    base_url = params["base_url"]
    cache_dir = params["cache_dir"]
    logger = get_structured_logger(__name__, filename=params.get("log_filename"))

    if len(params["bucket_name"]) > 0:
        arch_diff = S3ArchiveDiffer(
            cache_dir, export_dir,
            params["bucket_name"], "jhu",
            params["aws_credentials"],
        )
        arch_diff.update_cache()
    else:
        arch_diff = None

    gmpr = GeoMapper()
    dfs = {metric: pull_jhu_data(base_url, metric, gmpr) for metric in METRICS}
    for metric, geo_res, sensor, smoother in product(
            METRICS, GEO_RESOLUTIONS, SENSORS, SMOOTHERS):
        print(metric, geo_res, sensor, smoother)
        logger.info(event="generating signal and exporting to CSV",
                    metric=metric,
                    geo_res=geo_res,
                    sensor=sensor,
                    smoother=smoother)
        df = dfs[metric]
        # Aggregate to appropriate geographic resolution
        df = geo_map(df, geo_res)
        df.set_index(["timestamp", "geo_id"], inplace=True)
        df["val"] = df[sensor].groupby(level=1).transform(
            SMOOTHERS_MAP[smoother][0])
        df["se"] = np.nan
        df["sample_size"] = np.nan
        # Drop early entries where data insufficient for smoothing
        df = df[~df["val"].isnull()]
        df = df.reset_index()
        sensor_name = SENSOR_NAME_MAP[sensor][0]
        # if (SENSOR_NAME_MAP[sensor][1] or SMOOTHERS_MAP[smoother][2]):
        #     metric = f"wip_{metric}"
        #     sensor_name = WIP_SENSOR_NAME_MAP[sensor][0]
        sensor_name = SMOOTHERS_MAP[smoother][1] + sensor_name
        create_export_csv(
            df,
            export_dir=export_dir,
            start_date=datetime.strptime(export_start_date, "%Y-%m-%d"),
            metric=metric,
            geo_res=geo_res,
            sensor=sensor_name,
        )

    if arch_diff is not None:
        # Diff exports, and make incremental versions
        _, common_diffs, new_files = arch_diff.diff_exports()

        # Archive changed and new files only
        to_archive = [f for f, diff in common_diffs.items() if diff is not None]
        to_archive += new_files
        _, fails = arch_diff.archive_exports(to_archive)

        # Filter existing exports to exclude those that failed to archive
        succ_common_diffs = {
            f: diff for f, diff in common_diffs.items() if f not in fails
        }
        arch_diff.filter_exports(succ_common_diffs)

        # Report failures: someone should probably look at them
        for exported_file in fails:
            print(f"Failed to archive '{exported_file}'")
def run_module():
    """Run the quidel_covidtest indicator."""
    params = read_params()
    cache_dir = params["cache_dir"]
    export_dir = params["export_dir"]
    export_start_date = params["export_start_date"]
    export_end_date = params["export_end_date"]

    # Pull data and update export date
    df, _end_date = pull_quidel_covidtest(params)
    if _end_date is None:
        print("The data is up-to-date. Currently, no new data to be ingested.")
        return
    export_end_date = check_export_end_date(export_end_date, _end_date,
                                            END_FROM_TODAY_MINUS)
    export_start_date = check_export_start_date(export_start_date,
                                                export_end_date,
                                                EXPORT_DAY_RANGE)

    first_date, last_date = df["timestamp"].min(), df["timestamp"].max()

    # State Level
    data = df.copy()
    state_groups = geo_map("state", data).groupby("state_id")

    # Add prefix, if required
    sensors = add_prefix(SENSORS,
                         wip_signal=read_params()["wip_signal"],
                         prefix="wip_")
    smoothers = SMOOTHERS.copy()

    for sensor in sensors:
        # For State Level
        print("state", sensor)
        if sensor.endswith(SMOOTHED_POSITIVE):
            smoothers[sensor] = smoothers.pop(SMOOTHED_POSITIVE)
        elif sensor.endswith(RAW_POSITIVE):
            smoothers[sensor] = smoothers.pop(RAW_POSITIVE)
        elif sensor.endswith(SMOOTHED_TEST_PER_DEVICE):
            smoothers[sensor] = smoothers.pop(SMOOTHED_TEST_PER_DEVICE)
        else:
            smoothers[sensor] = smoothers.pop(RAW_TEST_PER_DEVICE)

        state_df = generate_sensor_for_states(state_groups,
                                              smooth=smoothers[sensor][1],
                                              device=smoothers[sensor][0],
                                              first_date=first_date,
                                              last_date=last_date)
        create_export_csv(state_df,
                          geo_res="state",
                          sensor=sensor,
                          export_dir=export_dir,
                          start_date=export_start_date,
                          end_date=export_end_date)

    # County/HRR/MSA level
    for geo_res in GEO_RESOLUTIONS:
        geo_data, res_key = geo_map(geo_res, data)
        for sensor in sensors:
            print(geo_res, sensor)
            res_df = generate_sensor_for_other_geores(state_groups,
                                                      geo_data, res_key,
                                                      smooth=smoothers[sensor][1],
                                                      device=smoothers[sensor][0],
                                                      first_date=first_date,
                                                      last_date=last_date)
            create_export_csv(res_df,
                              geo_res=geo_res,
                              sensor=sensor,
                              export_dir=export_dir,
                              start_date=export_start_date,
                              end_date=export_end_date,
                              remove_null_samples=True)

    # Export the cache file if the pipeline runs successfully.
    # Otherwise, don't update the cache file
    update_cache_file(df, _end_date, cache_dir)
def run_module(params: Dict[str, Any]):
    """Run the quidel_covidtest indicator.

    The `params` argument is expected to have the following structure:
    - "common":
        - "export_dir": str, directory to write output
        - "log_exceptions" (optional): bool, whether to log exceptions to file
        - "log_filename" (optional): str, name of file to write logs
    - "indicator":
        - "static_file_dir": str, directory name with population information
        - "input_cache_dir": str, directory in which to cache input data
        - "export_start_date": str, YYYY-MM-DD format of earliest date to create output
        - "export_end_date": str, YYYY-MM-DD format of latest date to create output
           or "" to create through the present
        - "pull_start_date": str, YYYY-MM-DD format of earliest date to pull input
        - "pull_end_date": str, YYYY-MM-DD format of latest date to create output
           or "" to create through the present
        - "aws_credentials": Dict[str, str], authentication parameters for AWS S3;
           see S3 documentation
        - "bucket_name": str, name of AWS bucket in which to find data
        - "wip_signal": List[str], list of signal names that are works in progress
        - "test_mode": bool, whether we are running in test mode
    """
    start_time = time.time()
    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))
    atexit.register(log_exit, start_time, logger)
    cache_dir = params["indicator"]["input_cache_dir"]
    export_dir = params["common"]["export_dir"]
    export_start_date = params["indicator"]["export_start_date"]
    export_end_date = params["indicator"]["export_end_date"]

    # Pull data and update export date
    df, _end_date = pull_quidel_covidtest(params["indicator"])
    if _end_date is None:
        print("The data is up-to-date. Currently, no new data to be ingested.")
        return
    export_end_date = check_export_end_date(export_end_date, _end_date,
                                            END_FROM_TODAY_MINUS)
    export_start_date = check_export_start_date(export_start_date,
                                                export_end_date,
                                                EXPORT_DAY_RANGE)

    first_date, last_date = df["timestamp"].min(), df["timestamp"].max()

    # State Level
    data = df.copy()
    state_groups = geo_map("state", data).groupby("state_id")

    # Add prefix, if required
    sensors = add_prefix(SENSORS,
                         wip_signal=params["indicator"]["wip_signal"],
                         prefix="wip_")
    smoothers = SMOOTHERS.copy()

    for sensor in sensors:
        # For State Level
        print("state", sensor)
        if sensor.endswith(SMOOTHED_POSITIVE):
            smoothers[sensor] = smoothers.pop(SMOOTHED_POSITIVE)
        elif sensor.endswith(RAW_POSITIVE):
            smoothers[sensor] = smoothers.pop(RAW_POSITIVE)
        elif sensor.endswith(SMOOTHED_TEST_PER_DEVICE):
            smoothers[sensor] = smoothers.pop(SMOOTHED_TEST_PER_DEVICE)
        else:
            smoothers[sensor] = smoothers.pop(RAW_TEST_PER_DEVICE)

        state_df = generate_sensor_for_states(state_groups,
                                              smooth=smoothers[sensor][1],
                                              device=smoothers[sensor][0],
                                              first_date=first_date,
                                              last_date=last_date)
        create_export_csv(state_df,
                          geo_res="state",
                          sensor=sensor,
                          export_dir=export_dir,
                          start_date=export_start_date,
                          end_date=export_end_date)

    # County/HRR/MSA level
    for geo_res in GEO_RESOLUTIONS:
        geo_data, res_key = geo_map(geo_res, data)
        for sensor in sensors:
            print(geo_res, sensor)
            res_df = generate_sensor_for_other_geores(state_groups,
                                                      geo_data, res_key,
                                                      smooth=smoothers[sensor][1],
                                                      device=smoothers[sensor][0],
                                                      first_date=first_date,
                                                      last_date=last_date)
            create_export_csv(res_df,
                              geo_res=geo_res,
                              sensor=sensor,
                              export_dir=export_dir,
                              start_date=export_start_date,
                              end_date=export_end_date,
                              remove_null_samples=True)

    # Export the cache file if the pipeline runs successfully.
    # Otherwise, don't update the cache file
    update_cache_file(df, _end_date, cache_dir)
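# A hypothetical params dictionary matching the structure documented in the
# quidel_covidtest docstring above; dates, bucket name, and credentials are
# placeholders, not values from the original source.
EXAMPLE_QUIDEL_COVIDTEST_PARAMS = {
    "common": {"export_dir": "./receiving"},
    "indicator": {
        "static_file_dir": "./static",
        "input_cache_dir": "./cache",
        "export_start_date": "2020-05-26",
        "export_end_date": "",      # "" exports through the present
        "pull_start_date": "2020-05-26",
        "pull_end_date": "",
        "aws_credentials": {"aws_access_key_id": "...", "aws_secret_access_key": "..."},
        "bucket_name": "example-bucket",
        "wip_signal": [],
        "test_mode": False,
    },
}
# run_module(EXAMPLE_QUIDEL_COVIDTEST_PARAMS)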
def run_module(params: Dict[str, Any]):
    """Run Quidel flu test module.

    The `params` argument is expected to have the following structure:
    - "common":
        - "export_dir": str, directory to write output
        - "log_exceptions" (optional): bool, whether to log exceptions to file
        - "log_filename" (optional): str, name of file to write logs
    - "indicator":
        - "static_file_dir": str, directory name with population information
        - "input_cache_dir": str, directory in which to cache input data
        - "export_start_date": str, YYYY-MM-DD format of earliest date to create output
        - "export_end_date": str, YYYY-MM-DD format of latest date to create output
           or "" to create through the present
        - "pull_start_date": str, YYYY-MM-DD format of earliest date to pull input
        - "pull_end_date": str, YYYY-MM-DD format of latest date to create output
           or "" to create through the present
        - "aws_credentials": Dict[str, str], authentication parameters for AWS S3;
           see S3 documentation
        - "bucket_name": str, name of AWS bucket in which to find data
        - "wip_signal": List[str], list of signal names that are works in progress
        - "test_mode": bool, whether we are running in test mode
    """
    start_time = time.time()
    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))
    cache_dir = params["indicator"]["input_cache_dir"]
    export_dir = params["common"]["export_dir"]
    static_file_dir = params["indicator"]["static_file_dir"]
    export_start_dates = params["indicator"]["export_start_date"]
    export_end_dates = params["indicator"]["export_end_date"]

    map_df = pd.read_csv(join(static_file_dir, "fips_prop_pop.csv"),
                         dtype={"fips": int})

    # Pull data and update export date
    dfs, _end_date = pull_quidel_data(params["indicator"])
    if _end_date is None:
        print("The data is up-to-date. Currently, no new data to be ingested.")
        return
    export_end_dates = check_export_end_date(export_end_dates, _end_date,
                                             END_FROM_TODAY_MINUS)
    export_start_dates = check_export_start_date(export_start_dates,
                                                 export_end_dates,
                                                 EXPORT_DAY_RANGE)

    # Add prefix, if required
    sensors = add_prefix(list(SENSORS.keys()),
                         wip_signal=params["indicator"]["wip_signal"],
                         prefix="wip_")

    for sensor in sensors:
        # Check either covid_ag or flu_ag
        test_type = "covid_ag" if "covid_ag" in sensor else "flu_ag"
        print("state", sensor)
        data = dfs[test_type].copy()
        state_groups = geo_map("state", data, map_df).groupby("state_id")
        first_date, last_date = data["timestamp"].min(), data["timestamp"].max()

        # For State Level
        state_df = generate_sensor_for_states(state_groups,
                                              smooth=SENSORS[sensor][1],
                                              device=SENSORS[sensor][0],
                                              first_date=first_date,
                                              last_date=last_date)
        create_export_csv(state_df,
                          geo_res="state",
                          sensor=sensor,
                          export_dir=export_dir,
                          start_date=export_start_dates[test_type],
                          end_date=export_end_dates[test_type])

        # County/HRR/MSA level
        for geo_res in GEO_RESOLUTIONS:
            print(geo_res, sensor)
            data = dfs[test_type].copy()
            data, res_key = geo_map(geo_res, data, map_df)
            res_df = generate_sensor_for_other_geores(state_groups,
                                                      data, res_key,
                                                      smooth=SENSORS[sensor][1],
                                                      device=SENSORS[sensor][0],
                                                      first_date=first_date,
                                                      last_date=last_date)
            create_export_csv(res_df,
                              geo_res=geo_res,
                              sensor=sensor,
                              export_dir=export_dir,
                              start_date=export_start_dates[test_type],
                              end_date=export_end_dates[test_type],
                              remove_null_samples=True)

    # Export the cache file if the pipeline runs successfully.
    # Otherwise, don't update the cache file
    update_cache_file(dfs, _end_date, cache_dir)

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds)
def run_module():
    """Run the usafacts indicator."""
    params = read_params()
    export_start_date = params["export_start_date"]
    if export_start_date == "latest":
        export_start_date = datetime.combine(date.today(), time(0, 0)) - timedelta(days=1)
    else:
        export_start_date = datetime.strptime(export_start_date, "%Y-%m-%d")
    export_dir = params["export_dir"]
    base_url = params["base_url"]
    cache_dir = params["cache_dir"]

    arch_diff = S3ArchiveDiffer(cache_dir, export_dir,
                                params["bucket_name"], "usafacts",
                                params["aws_credentials"])
    arch_diff.update_cache()

    geo_mapper = GeoMapper()

    dfs = {
        metric: pull_usafacts_data(base_url, metric, geo_mapper)
        for metric in METRICS
    }
    for metric, geo_res, sensor, smoother in product(
            METRICS, GEO_RESOLUTIONS, SENSORS, SMOOTHERS):
        print(geo_res, metric, sensor, smoother)
        df = dfs[metric]
        # Aggregate to appropriate geographic resolution
        df = geo_map(df, geo_res, sensor)
        df["val"] = SMOOTHERS_MAP[smoother][0].smooth(df[sensor].values)
        df["se"] = np.nan
        df["sample_size"] = np.nan
        # Drop early entries where data insufficient for smoothing
        df = df.loc[~df["val"].isnull(), :]
        sensor_name = SENSOR_NAME_MAP[sensor][0]
        # if (SENSOR_NAME_MAP[sensor][1] or SMOOTHERS_MAP[smoother][2]):
        #     metric = f"wip_{metric}"
        #     sensor_name = WIP_SENSOR_NAME_MAP[sensor][0]
        sensor_name = SMOOTHERS_MAP[smoother][1] + sensor_name
        create_export_csv(
            df,
            export_dir=export_dir,
            start_date=SMOOTHERS_MAP[smoother][3](export_start_date),
            metric=metric,
            geo_res=geo_res,
            sensor=sensor_name,
        )

    # Diff exports, and make incremental versions
    _, common_diffs, new_files = arch_diff.diff_exports()

    # Archive changed and new files only
    to_archive = [f for f, diff in common_diffs.items() if diff is not None]
    to_archive += new_files
    _, fails = arch_diff.archive_exports(to_archive)

    # Filter existing exports to exclude those that failed to archive
    succ_common_diffs = {
        f: diff for f, diff in common_diffs.items() if f not in fails
    }
    arch_diff.filter_exports(succ_common_diffs)

    # Report failures: someone should probably look at them
    for exported_file in fails:
        print(f"Failed to archive '{exported_file}'")
def run_module(params):
    """
    Run Google Symptoms module.

    Parameters
    ----------
    params
        Dictionary containing indicator configuration. Expected to have the following structure:
        - "common":
            - "export_dir": str, directory to write output
            - "log_exceptions" (optional): bool, whether to log exceptions to file
            - "log_filename" (optional): str, name of file to write logs
        - "indicator":
            - "export_start_date": str, YYYY-MM-DD format, date from which to export data
            - "num_export_days": int, number of days before end date (today) to export
            - "bigquery_credentials": str, path to BigQuery API key and service account
               JSON file
    """
    start_time = time.time()
    csv_export_count = 0
    oldest_final_export_date = None

    export_start_date = datetime.strptime(
        params["indicator"]["export_start_date"], "%Y-%m-%d")
    export_dir = params["common"]["export_dir"]
    num_export_days = params["indicator"].get("num_export_days", "all")

    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))

    # Pull GS data
    dfs = pull_gs_data(params["indicator"]["bigquery_credentials"],
                       export_start_date,
                       num_export_days)
    gmpr = geomap.GeoMapper()

    for geo_res in GEO_RESOLUTIONS:
        if geo_res == "state":
            df_pull = dfs["state"]
        elif geo_res in ["hhs", "nation"]:
            df_pull = gmpr.replace_geocode(dfs["county"], "fips", geo_res,
                                           from_col="geo_id", date_col="timestamp")
            df_pull.rename(columns={geo_res: "geo_id"}, inplace=True)
        else:
            df_pull = geo_map(dfs["county"], geo_res)

        if len(df_pull) == 0:
            continue
        for metric, smoother in product(METRICS + [COMBINED_METRIC], SMOOTHERS):
            print(geo_res, metric, smoother)
            df = df_pull.set_index(["timestamp", "geo_id"])
            df["val"] = df[metric].groupby(level=1).transform(
                SMOOTHERS_MAP[smoother][0])
            df["se"] = np.nan
            df["sample_size"] = np.nan
            # Drop early entries where data insufficient for smoothing
            df = df.loc[~df["val"].isnull(), :]
            df = df.reset_index()
            sensor_name = "_".join([smoother, "search"])

            if len(df) == 0:
                continue
            exported_csv_dates = create_export_csv(
                df,
                export_dir=export_dir,
                start_date=SMOOTHERS_MAP[smoother][1](export_start_date),
                metric=metric.lower(),
                geo_res=geo_res,
                sensor=sensor_name)
            if not exported_csv_dates.empty:
                csv_export_count += exported_csv_dates.size
                if not oldest_final_export_date:
                    oldest_final_export_date = max(exported_csv_dates)
                oldest_final_export_date = min(
                    oldest_final_export_date, max(exported_csv_dates))

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    max_lag_in_days = None
    formatted_oldest_final_export_date = None
    if oldest_final_export_date:
        max_lag_in_days = (datetime.now() - oldest_final_export_date).days
        formatted_oldest_final_export_date = oldest_final_export_date.strftime("%Y-%m-%d")

    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds,
                csv_export_count=csv_export_count,
                max_lag_in_days=max_lag_in_days,
                oldest_final_export_date=formatted_oldest_final_export_date)
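# A hypothetical params dictionary for the Google Symptoms module above; the
# credentials path and dates are placeholders, not values from the original
# source. "num_export_days" may be omitted, in which case the code above
# defaults it to "all".
EXAMPLE_GOOGLE_SYMPTOMS_PARAMS = {
    "common": {"export_dir": "./receiving"},
    "indicator": {
        "export_start_date": "2020-02-20",
        "num_export_days": 14,
        "bigquery_credentials": "./bigquery_service_account.json",
    },
}
# run_module(EXAMPLE_GOOGLE_SYMPTOMS_PARAMS)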