def run_module(params):
    """
    Runs the indicator.

    Arguments
    ---------
    params: Dict[str, Any]
        Nested dictionary of parameters.
    """
    start_time = time.time()
    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))
    mapper = GeoMapper()
    run_stats = []

    ## Build the base version of the signal at the most detailed geo level you can get.
    ## Compute stuff here or farm out to another function or file.
    all_data = pd.DataFrame(
        columns=["timestamp", "val", "zip", "sample_size", "se"])

    ## Aggregate & smooth.
    ## TODO: add num/prop variations if needed
    for sensor, smoother, geo in product(SIGNALS, SMOOTHERS, GEOS):
        df = mapper.replace_geocode(
            all_data, "zip", geo, new_col="geo_id", date_col="timestamp")
        ## TODO: recompute sample_size, se here if not NA
        df["val"] = df[["geo_id", "val"]].groupby("geo_id")["val"].transform(
            smoother[0].smooth)
        sensor_name = sensor + smoother[1]  ## TODO: +num/prop variation if used
        # Don't export the first 6 days for smoothed signals since they'll be NaN.
        start_date = (min(df.timestamp) + timedelta(6)
                      if smoother[1] else min(df.timestamp))
        dates = create_export_csv(
            df,
            params["common"]["export_dir"],
            geo,
            sensor_name,
            start_date=start_date)
        if len(dates) > 0:
            run_stats.append((max(dates), len(dates)))

    ## Log this indicator run.
    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    min_max_date = run_stats and min(s[0] for s in run_stats)
    csv_export_count = sum(s[-1] for s in run_stats)
    max_lag_in_days = min_max_date and (datetime.now() - min_max_date).days
    formatted_min_max_date = min_max_date and min_max_date.strftime("%Y-%m-%d")
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds,
                csv_export_count=csv_export_count,
                max_lag_in_days=max_lag_in_days,
                oldest_final_export_date=formatted_min_max_date)
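# --- Hedged sketch, not part of the template above: run_module assumes a module
# header roughly like the following. The delphi_utils import path follows the usual
# covidcast-indicators conventions; the SIGNALS/SMOOTHERS/GEOS constants and the
# example params dict are indicator-specific placeholders, not taken from the source.
import time
from datetime import datetime, timedelta
from itertools import product

import pandas as pd
from delphi_utils import (GeoMapper, Smoother, create_export_csv,
                          get_structured_logger)

SIGNALS = ["your_signal"]  # hypothetical signal name, for illustration only
SMOOTHERS = [
    (Smoother("identity", impute_method=None), ""),           # raw signal, no suffix
    (Smoother("moving_average", window_length=7), "_7dav"),   # 7-day average, "_7dav" suffix
]
GEOS = ["nation", "hhs", "state", "hrr", "msa", "county"]

# A minimal invocation, assuming a params dict with the keys run_module reads:
#   run_module({"common": {"export_dir": "./receiving", "log_filename": None}})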
def test_good_file(self):
    df = pull_nchs_mortality_data(TOKEN, "test_data.csv")

    # Test columns
    assert (df.columns.values == [
        'covid_19_deaths', 'total_deaths', 'percent_of_expected_deaths',
        'pneumonia_deaths', 'pneumonia_and_covid_19_deaths',
        'influenza_deaths', 'pneumonia_influenza_or_covid_19_deaths',
        "timestamp", "geo_id", "population"]).all()

    # Test aggregation for NYC and NY
    raw_df = pd.read_csv("./test_data/test_data.csv",
                         parse_dates=["start_week"])
    raw_df = standardize_columns(raw_df)
    for metric in METRICS:
        ny_list = raw_df.loc[(raw_df["state"] == "New York")
                             & (raw_df[metric].isnull()), "timestamp"].values
        nyc_list = raw_df.loc[(raw_df["state"] == "New York City")
                              & (raw_df[metric].isnull()), "timestamp"].values
        final_list = df.loc[(df["geo_id"] == "ny")
                            & (df[metric].isnull()), "timestamp"].values
        assert set(final_list) == set(ny_list).intersection(set(nyc_list))

    # Test missing values for the remaining states
    gmpr = GeoMapper()
    state_ids = pd.DataFrame({"state_id": sorted(gmpr.get_geo_values("state_id"))})
    # Use add_geocode (rather than replace_geocode) so each state_id keeps its
    # state_name on the same row; zipping two separate DataFrames would iterate
    # over column labels, not rows.
    state_names = gmpr.add_geocode(state_ids, "state_id", "state_name")
    for state, geo_id in zip(state_names["state_name"], state_names["state_id"]):
        if state in {"New York", "New York City"}:
            continue
        for metric in METRICS:
            test_list = raw_df.loc[(raw_df["state"] == state)
                                   & (raw_df[metric].isnull()), "timestamp"].values
            final_list = df.loc[(df["geo_id"] == geo_id)
                                & (df[metric].isnull()), "timestamp"].values
            assert set(final_list) == set(test_list)
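# --- Hedged sketch, not part of the test above: a plausible header for this test file.
# The module paths below follow the usual covidcast-indicators layout for the
# nchs_mortality indicator and are assumptions, as is the blank TOKEN (a Socrata app
# token should not be needed when reading the local test_data.csv file).
import pandas as pd
from delphi_utils.geomap import GeoMapper

from delphi_nchs_mortality.pull import pull_nchs_mortality_data, standardize_columns
from delphi_nchs_mortality.constants import METRICS

TOKEN = ""

# Typical invocation from the indicator's tests/ directory (assumed layout):
#   python -m pytest test_pull.py -k test_good_file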