def save_y_star(eps_version, arima_version, years, measure, draws, decay,
                gbd_round_id):
    """Apply the residual random walk/ARIMA and save the output y*."""
    # eps_path, mig_dir, and ystar_out are assumed to be defined at module
    # level from the version arguments.
    ds = open_xr(eps_path).data
    try:
        eps_preds = open_xr(f"{mig_dir}/eps_star.nc").data
    except Exception:
        eps_preds = arima_migration(ds, years, draws, decay)
        epsilon_hat_out = mig_dir / "eps_star.nc"
        save_xr(eps_preds, epsilon_hat_out, metric="rate", space="identity")

    # Cap forecast residuals between -10 and 10. With the current migration
    # forecasts, the populations forecasted to 2100 for Syria, Latvia, and
    # Jamaica decline toward zero; capping keeps the results more reasonable.
    eps_past = eps_preds.sel(year_id=years.past_years)
    eps_preds = eps_preds.sel(year_id=years.forecast_years)
    eps_preds = eps_preds.clip(min=-10, max=10)
    eps_preds = xr.concat([eps_past, eps_preds], dim="year_id")

    pred_path = mig_dir / "mig_hat.nc"
    preds = open_xr(pred_path).data
    preds = preds.sel(year_id=years.years)
    preds = expand_dimensions(preds, draw=range(0, draws))
    y_star = preds + eps_preds
    save_xr(y_star, ystar_out, metric="rate", space="identity")
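# A minimal, self-contained sketch of the capping step above, using synthetic
# data: forecast residuals are clipped to [-10, 10] before being added back to
# the predicted rates to form y* = y_hat + eps_hat. The coordinate values are
# illustrative only and not part of the pipeline.
import numpy as np
import xarray as xr

_eps = xr.DataArray(np.array([[0.5, -12.0, 11.0], [1.0, 2.0, -3.0]]),
                    dims=["location_id", "year_id"],
                    coords={"location_id": [6, 7],
                            "year_id": [2048, 2049, 2050]})
_preds = xr.zeros_like(_eps) + 1.0        # stand-in for the mig_hat.nc contents
_eps_capped = _eps.clip(min=-10, max=10)  # extreme residuals pulled to +/- 10
_y_star = _preds + _eps_capped            # y* = y_hat + eps_hat
print(_y_star.values)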
def load_forecast_pop(gbd_round_id, version, years, draws):
    """
    Load forecast population data. Aggregates if necessary.

    Args:
        gbd_round_id (int): The gbd round ID that the forecast population is
            from.
        version (str): The version of forecast population to read from.
        years (YearRange): The forecasting format years to use.
        draws (int): The number of draws to resample to.

    Returns:
        xarray.DataArray: The forecast population dataarray.
    """
    forecast_pop_dir = FBDPath(f"/{gbd_round_id}/future/population/{version}")
    try:
        forecast_pop_path = forecast_pop_dir / "population_agg.nc"
        forecast_pop_da = open_xr(forecast_pop_path).data
    except FileNotFoundError:
        # Need to make the aggregated version
        forecast_pop_path = forecast_pop_dir / "population.nc"
        forecast_pop_da = open_xr(forecast_pop_path).data
        forecast_pop_da = Aggregator.aggregate_everything(
            forecast_pop_da, gbd_round_id).pop
        forecast_pop_out_path = forecast_pop_dir / "population_agg.nc"
        save_xr(forecast_pop_da, forecast_pop_out_path, metric="number",
                space="identity")

    # Slice to the forecast years and resample to the requested number of draws
    forecast_pop_da = forecast_pop_da.sel(year_id=years.forecast_years)
    forecast_pop_da = resample(forecast_pop_da, draws)
    return forecast_pop_da
def create_age_sex_xarray(gbd_round_id):
    """Create and save the combined age-sex pattern used to split migration."""
    LOGGER.debug("Creating xarray of age-sex patterns for migration")
    # Load patterns
    qatar = pd.read_csv(QATAR_PATTERN)
    eurostat = pd.read_csv(EUROSTAT_PATTERN)
    # Convert to xarrays
    qatar = df_to_xr(qatar, dims=PATTERN_ID_VARS)
    eurostat = df_to_xr(eurostat, dims=PATTERN_ID_VARS)
    # Create superarray to hold all locations
    all_locs_xr_list = []
    # Put the pattern for each location into the list
    for loc in WPP_LOCATION_IDS:
        if loc in QATAR_LOCS:
            data = qatar
        else:
            data = eurostat
        data = expand_dimensions(data, location_id=[loc])
        all_locs_xr_list.append(data)
    # Concat all locations together
    result = xr.concat(all_locs_xr_list, dim="location_id")
    # Save the all-locations pattern
    LOGGER.debug("Saving age-sex pattern xarray")
    pattern_dir = FBDPath(f"/{gbd_round_id}/future/migration/"
                          f"{PATTERN_VERSION}")
    pattern_path = pattern_dir / "combined_age_sex_pattern.nc"
    save_xr(result, pattern_path, metric="percent", space="identity")
    LOGGER.debug("Saved age-sex pattern xarray")
    return result
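# Hedged sketch of the tiling idea above using plain xarray: the same
# (sex, age) pattern is repeated for every location by adding a location_id
# dimension and concatenating. fbd_core's expand_dimensions is assumed to
# behave roughly like xr.DataArray.expand_dims with coordinate values.
import numpy as np
import xarray as xr

_pattern = xr.DataArray(np.full((2, 3), 1.0 / 6),
                        dims=["sex_id", "age_group_id"],
                        coords={"sex_id": [1, 2], "age_group_id": [8, 9, 10]})
_tiles = [_pattern.expand_dims(location_id=[loc]) for loc in [33, 44, 55]]
_all_locs = xr.concat(_tiles, dim="location_id")
print(_all_locs.sizes)  # 3 locations x 2 sexes x 3 age groups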
def output_to_xarray(gbd_round, out, version_out):
    """Convert the adjusted ASFR dataframe to xarray and save it."""
    asfr_path = FBDPath("/{gri}/future/asfr/{version}".format(
        gri=gbd_round, version=version_out))
    dims = ["location_id", "year_id", "scenario", "age_group_id", "sex_id",
            "draw"]
    out_xr = df_to_xr(out, dims=dims)
    save_xr(out_xr, fbdpath=asfr_path / "asfr.nc", metric="rate",
            space="identity", version=version_out,
            model="asfr_adjusted_to_tfr_plus_point1_if_below2")
def save_paf(paf, gbd_round_id, past_or_future, version, acause,
             cluster_risk=None):
    """
    Save mediated PAF at the cause level.

    Args:
        paf (pandas.DataFrame): dataframe of PAF.
        gbd_round_id (int): gbd round id.
        past_or_future (str): 'past' or 'future'.
        version (str): version, dated.
        acause (str): analytical cause.
        cluster_risk (str, optional): cluster risk to save under; if None,
            the PAF is saved for the cause alone.
    """
    if cluster_risk is not None:
        out_fbd_path = (FBDPath(gbd_round_id=gbd_round_id,
                                past_or_future=past_or_future,
                                stage="paf",
                                version=version) /
                        "risk_acause_specific" /
                        "{}_{}.nc".format(acause, cluster_risk))
        LOGGER.info("Saving cause-agg risk paf: {}".format(out_fbd_path))
        save_xr(paf, out_fbd_path, metric="percent", space="identity",
                acause=acause, risk=cluster_risk, version=version,
                gbd_round_id=gbd_round_id)
    else:
        out_fbd_path = (FBDPath(gbd_round_id=gbd_round_id,
                                past_or_future=past_or_future,
                                stage="paf",
                                version=version) / "{}.nc".format(acause))
        LOGGER.info("Saving cause-only paf: {}".format(out_fbd_path))
        save_xr(paf, out_fbd_path, metric="percent", space="identity",
                acause=acause, version=version, gbd_round_id=gbd_round_id)
def main(migration_version, past_pop_version, forecast_pop_version,
         gbd_round_id, draws, years):
    """Load populations and migration rates, then multiply to get counts."""
    # Load migration data
    mig_dir = FBDPath(
        f"/{gbd_round_id}/future/migration/{migration_version}/")
    mig_path = mig_dir / "mig_star.nc"
    mig_da = open_xr(mig_path).data

    # Load pops
    past_pop_da = load_past_pop(gbd_round_id, past_pop_version)
    forecast_pop_da = load_forecast_pop(gbd_round_id, forecast_pop_version,
                                        years, draws)

    # Give past populations dummy draws/scenarios so they can be concatenated
    # with forecast pops
    past_pop_da = expand_dimensions(past_pop_da,
                                    draw=forecast_pop_da["draw"].values)
    past_pop_da = expand_dimensions(
        past_pop_da, scenario=forecast_pop_da["scenario"].values)

    # Subset to coordinates relevant to mig_da
    forecast_pop_da = forecast_pop_da.sel(
        sex_id=3, age_group_id=22,
        location_id=mig_da.location_id.values, scenario=0)
    past_pop_da = past_pop_da.sel(sex_id=3, age_group_id=22,
                                  location_id=mig_da.location_id.values,
                                  scenario=0)

    # Combine past and forecast pop
    pop_da = past_pop_da.combine_first(forecast_pop_da)

    # Multiply rates by pop to get counts
    mig_counts = mig_da * pop_da
    mig_counts = mig_counts / SCALE_FACTOR

    # Save out
    mig_counts_path = mig_dir / "mig_counts.nc"
    save_xr(mig_counts, mig_counts_path, metric="number", space="identity")
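# Minimal sketch of the rate-to-count conversion above: migration rates are
# multiplied by all-age, both-sex population and divided by a scale factor.
# SCALE_FACTOR is assumed here to be 1000 (rates per thousand) purely for
# illustration; the pipeline's actual constant may differ.
import numpy as np
import xarray as xr

_SCALE_FACTOR = 1000  # assumption for this example only
_rate = xr.DataArray(np.array([2.5, -1.0]), dims=["location_id"],
                     coords={"location_id": [71, 72]})  # per 1000 people
_pop = xr.DataArray(np.array([5.0e6, 1.2e7]), dims=["location_id"],
                    coords={"location_id": [71, 72]})
_counts = _rate * _pop / _SCALE_FACTOR
print(_counts.values)  # 12500.0 and -12000.0 net migrants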
def make_eps(mig_version, model_version, model_name, gbd_round_id, years):
    """Compute residuals (eps) between observed and predicted migration rates
    and save eps, the predictions, and the observed rates.

    Note: ``model_path``, ``mig_dir``, and ``eps_path`` are assumed to be
    defined at module level from the version arguments.
    """
    df = pd.read_csv(model_path)
    # Add all-sex and all-age id columns
    df["sex_id"] = 3
    df["age_group_id"] = 22
    # Select the columns we need
    df = df[[
        "location_id", "year_id", "age_group_id", "sex_id",
        "predictions", "migration_rate"
    ]]
    # Set index columns
    index_cols = ["location_id", "year_id", "age_group_id", "sex_id"]
    dataset = df.set_index(index_cols).to_xarray()
    dataset["eps"] = dataset["migration_rate"] - dataset["predictions"]

    save_xr(dataset["eps"].sel(year_id=years.past_years), eps_path,
            metric="rate", space="identity")

    pred_path = mig_dir / "mig_hat.nc"
    save_xr(dataset["predictions"].sel(year_id=years.years), pred_path,
            metric="rate", space="identity")

    mig_path = mig_dir / "wpp_hat.nc"
    save_xr(dataset["migration_rate"].sel(year_id=years.years), mig_path,
            metric="rate", space="identity")
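# Sketch of the residual construction above: a flat dataframe with observed
# and predicted rates is indexed by the demographic id columns, converted to
# an xarray Dataset, and the residual is observed minus predicted. The values
# are made up for illustration.
import pandas as pd

_df = pd.DataFrame({
    "location_id": [33, 33], "year_id": [2010, 2011],
    "age_group_id": [22, 22], "sex_id": [3, 3],
    "predictions": [1.0, 1.2], "migration_rate": [0.8, 1.5],
})
_ds = _df.set_index(["location_id", "year_id", "age_group_id",
                     "sex_id"]).to_xarray()
_ds["eps"] = _ds["migration_rate"] - _ds["predictions"]
print(_ds["eps"].values.ravel())  # observed minus predicted, one value per year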
def main(forecast_pop_version, input_version, output_version, draws,
         gbd_round_id, measure, years, past_pop_version, past_asfr_version,
         past_mortality_version):
    """Compute deaths or births aggregations and save the data.

    Args:
        forecast_pop_version (str): The version name of the population
            forecasts.
        input_version (str): The version name of the input mortality or ASFR.
        output_version (str): The version name of the output deaths or births
            to be saved.
        draws (int): The number of desired draws.
        gbd_round_id (int): The GBD round fed into FBDPath to pull the correct
            version of the inputs.
        measure (str): "death" or "live_births".
        years (YearRange): The past and forecasted years.
        past_pop_version (str): The version name of the past population.
        past_asfr_version (str): The version name of the past ASFR.
        past_mortality_version (str): The version name of the past mortality.

    Returns:
        None.
    """
    forecast_pop = get_pop(forecast_pop_version, gbd_round_id, measure, draws,
                           years, past_pop_version)
    if measure == "live_births":
        forecast_da = get_asfr(input_version, gbd_round_id, draws, years,
                               past_asfr_version)
    else:
        forecast_da = get_mortality(input_version, gbd_round_id, draws, years,
                                    past_mortality_version)

    agg_output = get_agg(forecast_pop, forecast_da, gbd_round_id)

    data_path = FBDPath(f"/{gbd_round_id}/future/{measure}/{output_version}")
    save_xr(agg_output, data_path / f"{measure}.nc", metric="number",
            space="identity")
    LOGGER.info(f"{measure} have been calculated")
def main(migration_version, gbd_round_id):
    """Split all-age, both-sex migration counts into age- and sex-specific
    counts using the combined age-sex pattern."""
    # Load age-sex pattern (loc, draw, age, sex)
    LOGGER.debug("Loading age-sex migration pattern")
    try:
        pattern_dir = FBDPath(f"/{gbd_round_id}/future/migration/"
                              f"{PATTERN_VERSION}")
        pattern_path = pattern_dir / "combined_age_sex_pattern.nc"
        pattern = open_xr(pattern_path).data
    except FileNotFoundError:
        # Data doesn't yet exist
        pattern = create_age_sex_xarray(gbd_round_id)

    # Load migration counts (loc, draw, year)
    LOGGER.debug("Loading migration data")
    mig_dir = FBDPath(
        f"/{gbd_round_id}/future/migration/{migration_version}/")
    mig_path = mig_dir / "mig_counts.nc"
    migration = open_xr(mig_path).data
    migration = migration.squeeze(drop=True)

    # End up with migration counts with age and sex (loc, draw, year, age, sex)
    split_data = migration * pattern

    # Save it!
    LOGGER.debug("Saving age-sex split migration data")
    split_path = mig_dir / "migration_split.nc"
    save_xr(split_data, split_path, metric="number", space="identity")
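# Minimal sketch of the broadcast split above: all-age, both-sex counts
# (location, year) multiplied by an age-sex proportion pattern (location,
# sex, age) broadcast to a (location, year, sex, age) array whose age-sex
# cells sum back to the original totals. Coordinates are illustrative only.
import numpy as np
import xarray as xr

_counts = xr.DataArray(np.array([[100.0, 120.0]]),
                       dims=["location_id", "year_id"],
                       coords={"location_id": [33], "year_id": [2030, 2031]})
_pattern = xr.DataArray(
    np.array([[[0.25, 0.125, 0.125], [0.25, 0.125, 0.125]]]),  # sums to 1
    dims=["location_id", "sex_id", "age_group_id"],
    coords={"location_id": [33], "sex_id": [1, 2], "age_group_id": [8, 9, 10]})
_split = _counts * _pattern
print(float(_split.sel(year_id=2030).sum()))  # 100.0, totals preserved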
def one_draw_main(gbd_round_id, years, draw, forecast_version,
                  output_version):
    """Driver function that handles the education cohort correction for a
    single draw.

    Args:
        gbd_round_id (int): The gbd round id.
        years (YearRange): The past and forecasted years.
        draw (int): The draw number to perform the correction on.
        forecast_version (str): Forecast version of education.
        output_version (str): Cohort corrected version.
    """
    LOGGER.info("Applying cohort correction to draw: {}".format(draw))
    input_dir = FBDPath("".format())  # Path removed for security reasons
    uncorrected_da = open_xr(input_dir / "education.nc").data

    # Subset to national and subnational location ids
    location_table = db.get_location_set(gbd_round_id)
    # Modeling subnational estimates
    modeled_location_ids = list(location_table["location_id"].unique())

    avail_sex_ids = [
        sex for sex in uncorrected_da["sex_id"].values
        if sex in MODELED_SEX_IDS]
    # Age groups 2, 3, 4, and 5 get filtered out here; they are added back
    # later.
    avail_age_group_ids = [
        age for age in uncorrected_da["age_group_id"].values
        if age in MODELED_AGE_GROUP_IDS]

    uncorrected_draw_da = uncorrected_da.sel(
        sex_id=avail_sex_ids,
        age_group_id=avail_age_group_ids,
        location_id=modeled_location_ids
    ).sel(draw=draw, drop=True)

    # Create cohort information from age groups and year ids.
    cohort_age_df = get_cohort_info_from_age_year(avail_age_group_ids, years)
    corrected_da = get_corrected_da(
        uncorrected_draw_da, cohort_age_df, years)

    # Combine with the dropped age groups
    dropped_age_ids = [
        age for age in uncorrected_da["age_group_id"].values
        if age not in MODELED_AGE_GROUP_IDS]
    dropped_age_da = uncorrected_da.sel(
        sex_id=avail_sex_ids,
        age_group_id=dropped_age_ids,
        location_id=modeled_location_ids).sel(draw=draw, drop=True)
    combined_da = xr.concat([dropped_age_da, corrected_da],
                            dim="age_group_id")
    combined_da["draw"] = draw

    op_dir = FBDPath("".format())  # Path removed for security reasons
    save_xr(combined_da, op_dir / "corrected_edu_draw{}.nc".format(draw),
            metric="rate", space="identity")
def main(asfr_version, past_asfr_version, location_id, gbd_round_id, years,
         granularity, iterations, **kwargs):
    """
    1. Read in location-specific draws of period ASFR from the CCF stage.
    2. Add terminal age group ASFRs.
    3. Intercept-shift ASFR while holding CCF50 constant.
    4. Export location-specific intercept-shifted ASFR as .nc.

    Args:
        asfr_version (str): version name of future ccf/asfr.
        past_asfr_version (str): asfr version from past.
        location_id (int): location_id.
        gbd_round_id (int): gbd round id.
        years (YearRange): past_start:forecast_start:forecast_end.
        granularity (int): 1 for single-year ages, otherwise 5-year age groups.
        iterations (int): number of times to intercept-shift.
    """
    ages_df = db.get_ages(gbd_round_id)[[
        "age_group_id", "age_group_years_start", "age_group_years_end"
    ]]

    # Read the location-specific asfr .csv into a dataarray.
    # The raw forecasted ASFR are stored in the CCF stage of the same version.
    ccf_fbd_path = FBDPath(gbd_round_id=gbd_round_id,
                           past_or_future="future",
                           stage="ccf",
                           version=asfr_version)

    if granularity == 1:
        sub_folder = "asfr_single_year"
        ccf_asfr_fbd_path = ccf_fbd_path / sub_folder
        future_asfr = read_to_xr(location_id, ccf_asfr_fbd_path,
                                 dims=list(ASFR_NON_AGE_DIMS + ("age",)))
    else:
        sub_folder = "asfr"
        ccf_asfr_fbd_path = ccf_fbd_path / sub_folder
        future_asfr = read_to_xr(
            location_id, ccf_asfr_fbd_path,
            dims=list(ASFR_NON_AGE_DIMS + ("age_group_id",)))
        # We intercept-shift in 1-year ages, so convert to single years
        future_asfr = _expand_5yr_age_groups_to_1yr_ages(future_asfr, ages_df)

    if "sex_id" in future_asfr.dims:
        raise ValueError("Found sex_id dim in future asfr")

    # Now etl the past asfr data
    past_asfr_fbd_path = FBDPath(gbd_round_id=gbd_round_id,
                                 past_or_future="past",
                                 stage="asfr",
                                 version=past_asfr_version)
    past_asfr = open_xr(
        past_asfr_fbd_path / "asfr.nc").data.sel(location_id=location_id)

    if "sex_id" in past_asfr.dims:
        raise ValueError("Found sex_id dim in past asfr")

    # The past has no scenarios, so expand it for merging
    past_asfr = expand_dimensions(past_asfr,
                                  scenario=future_asfr["scenario"])

    # Past asfr has age group ids 7-15, but future asfr in ccf only has 8-14;
    # we only need age groups 8-14 for the intercept shift.
    past_asfr_1yrs = _expand_5yr_age_groups_to_1yr_ages(
        past_asfr.sel(age_group_id=range(8, 15)), ages_df)

    # Now ready to concat past and future together for the intercept shift
    asfr = xr.concat([
        past_asfr_1yrs.sel(year_id=years.past_years),
        future_asfr.sel(year_id=years.forecast_years)
    ], dim="year_id")

    del past_asfr_1yrs, future_asfr
    gc.collect()

    # The intercept shift should keep ccf50 (the asfr sum) constant
    pre_fix_asfr_sum = asfr.sum()  # sum of all asfr values before the shift

    asfr = ccf50_intercept_shift_lpf(asfr, gbd_round_id, years, iterations)

    post_fix_asfr_sum = asfr.sum()  # asfr sum post-shift should stay the same

    assert np.isclose(post_fix_asfr_sum, pre_fix_asfr_sum, rtol=RTOL), \
        f"The intercept shift changed total asfr sum by more than rtol={RTOL}"

    # Need to keep years.past_end for the cohort-component model
    save_years = [years.past_end] + years.forecast_years.tolist()
    asfr = asfr.sel(year_id=save_years)  # only save forecast (plus past_end)

    # Convert forecasted asfr back to 5-year age groups
    asfr = _convert_ages_to_5_year_age_groups_by_mean(asfr, ages_df)

    # Add 10-15 (7) and 50-55 (15) age groups to the forecasted asfr
    asfr = extrapolate_terminal_asfr_age_groups(past_asfr, asfr,
                                                last_year=years.past_end)
    asfr["location_id"] = location_id
    asfr.name = "value"

    del past_asfr
    gc.collect()

    LOGGER.info("Finished CCF50 intercept-shift")

    asfr_fbd_path = FBDPath(gbd_round_id=gbd_round_id,
                            past_or_future="future",
                            stage="asfr",
                            version=asfr_version)
    save_xr(asfr, asfr_fbd_path / f"{location_id}.nc",
            metric="rate", space="identity", version=asfr_version,
            past_asfr_version=past_asfr_version, iterations=iterations)
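# Hedged sketch of the 5-year-to-single-year expansion this function relies
# on. _expand_5yr_age_groups_to_1yr_ages is internal; the assumption here is
# that each 5-year ASFR value is repeated for its five single-year ages, which
# is consistent with converting back "by mean" afterwards.
import numpy as np
import xarray as xr

_asfr_5yr = xr.DataArray(np.array([0.10, 0.05]), dims=["age_group_id"],
                         coords={"age_group_id": [8, 9]})  # ages 15-19, 20-24
_asfr_1yr = xr.DataArray(np.repeat(_asfr_5yr.values, 5), dims=["age"],
                         coords={"age": np.arange(15, 25)})
# Taking the mean within each 5-year group recovers the original values, and
# the single-year sum (the CCF-style total) is 5x the 5-year values.
print(_asfr_1yr.sel(age=slice(15, 19)).mean().item())  # 0.10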
def compute_paf(acause, rei, version, years, gbd_round_id, draws,
                sev, rrmax, vaccine_sev, vaccine_rrmax, gbd_paf_version,
                **kwargs):
    r"""
    Computes PAF for the given acause-risk pair, and exports said PAF to
    ``/{gbd_round_id}/{past_or_future}/paf/{version}``.

    Args:
        acause (str): analytical cause.
        rei (str): rei, or commonly called risk.
        version (str): FBDPath version to export to.
        years (YearRange): [past_start, forecast_start, forecast_end] years.
        gbd_round_id (int): gbd round id.
        draws (int): number of draws for output file. This means input files
            will be up/down-sampled to meet this criterion.
        sev (str): upstream sev version.
        rrmax (str): upstream rrmax version.
        vaccine_sev (str): upstream vaccine sev version.
        vaccine_rrmax (str): upstream vaccine rrmax version.
        gbd_paf_version (str): gbd_paf version to read from, if not
            downloading from get_draws().
    """
    sev_da = read_sev(rei=rei, sev=sev, vaccine_sev=vaccine_sev,
                      gbd_round_id=gbd_round_id, years=years, draws=draws)
    rrmax_da = read_rrmax(acause=acause, rei=rei, rrmax=rrmax,
                          vaccine_rrmax=vaccine_rrmax,
                          gbd_round_id=gbd_round_id, years=years, draws=draws)

    # Estimated cause-risk-specific paf
    with xr.set_options(arithmetic_join="outer"):
        paf = 1 - 1 / (sev_da * (rrmax_da - 1) + 1)

    location_ids = sev_da["location_id"].values.tolist()
    sex_ids = sev_da["sex_id"].values.tolist()

    del sev_da, rrmax_da
    gc.collect()

    maybe_negative_paf = is_maybe_negative_paf(acause, rei, gbd_round_id)

    # Forecasted PAFs are cleaned first before further processing
    paf = _data_cleaning_for_paf(paf, maybe_negative_paf)

    # Now ping get_draws for gbd paf values
    LOGGER.info("Got estimated paf for {}_{}. Pulling gbd paf...".
                format(acause, rei))

    gbd_round = get_gbd_round(gbd_round_id)

    if gbd_paf_version:  # then we read gbd_paf from this folder
        cache_version = gbd_paf_version
    else:  # default to {gbd_round}_gbd
        cache_version = str(gbd_round) + "_gbd"

    gbd_paf = get_gbd_paf(acause, rei, cache_version, gbd_round_id,
                          sex_ids=sex_ids, location_ids=location_ids,
                          draws=draws)

    LOGGER.info("Pulled gbd paf for {}_{}. Computing adjusted paf...".
                format(acause, rei))

    # Compute correction factor and perform adjustment
    if gbd_paf is not None:
        # First make sure there's no COMPLETE mismatch between paf and
        # gbd_paf. If so, an error should be raised.
        paf.load()
        gbd_paf.load()  # need to force load() because dask is lazy
        if (paf - gbd_paf).size == 0:  # normal arithmetic is inner-join
            error_message = ("Complete mismatch between computed and GBD in "
                             "{}-{} PAF. Are you sure you used the correct "
                             "version of GBD PAF?".format(acause, rei))
            LOGGER.error(error_message)
            raise ValueError(error_message)

        gbd_paf = _data_cleaning_for_paf(gbd_paf, maybe_negative_paf)
        correction_factor = compute_correction_factor(
            paf.sel(year_id=gbd_round), gbd_paf, maybe_negative_paf)

        del gbd_paf
        gc.collect()

        paf = correct_paf(paf, correction_factor, maybe_negative_paf)
        LOGGER.info("Adjusted paf for {}_{}. Now saving...".
                    format(acause, rei))
    else:  # correction factor is 0, and we leave paf as is
        correction_factor = xr.zeros_like(paf)
        LOGGER.info("paf for {}_{} not adjusted because gbd_paf is None".
                    format(acause, rei))

    # If there are still NaNs at this point, then they should indicate age or
    # sex restrictions and should be filled with 0.
    paf = paf.fillna(0)

    # We need to save the results separately in "past" and "future"
    for p_or_f, yrs in {"past": years.past_years,
                        "future": years.forecast_years}.items():
        out = paf.sel(year_id=yrs)
        out_fbd_path = FBDPath(gbd_round_id=gbd_round_id,
                               past_or_future=p_or_f,
                               stage="paf",
                               version=version)

        # First we save cause-risk-specific paf
        outpath = (out_fbd_path / "risk_acause_specific" /
                   (acause + "_" + rei + ".nc"))
        LOGGER.info("Saving {}".format(outpath))
        save_xr(out, outpath, metric="percent", space="identity",
                acause=acause, risk=rei, gbd_round_id=gbd_round_id,
                sev=sev, rrmax=rrmax, vaccine_sev=vaccine_sev,
                vaccine_rrmax=vaccine_rrmax, gbd_paf_version=cache_version)
        del out
        gc.collect()

        # Now save cause-risk-specific correction factor (past only)
        if p_or_f == "past":
            outpath = (out_fbd_path / "risk_acause_specific" /
                       (acause + "_" + rei + "_cf.nc"))
            LOGGER.info("Saving {}".format(outpath))
            save_xr(correction_factor, outpath, metric="percent",
                    space="logit", sev=sev, rrmax=rrmax,
                    vaccine_sev=vaccine_sev, vaccine_rrmax=vaccine_rrmax,
                    gbd_paf_version=cache_version)

    del paf, correction_factor
    gc.collect()
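# Worked example of the PAF formula used above, with made-up numbers:
# paf = 1 - 1 / (sev * (rrmax - 1) + 1). With sev = 0.4 and rrmax = 2.5,
# paf = 1 - 1 / (0.4 * 1.5 + 1) = 1 - 1 / 1.6 = 0.375.
_sev = 0.4
_rrmax = 2.5
_paf = 1 - 1 / (_sev * (_rrmax - 1) + 1)
print(round(_paf, 3))  # 0.375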
def get_gbd_paf(acause, rei, cache_version, gbd_round_id, sex_ids,
                location_ids, draws, measure_id=4, metric_id=2):
    """
    Downloads and transforms the gbd cause-risk-specific PAF. The dataarray
    is then cleaned and saved to an FBDPath cache.

    The gbd paf coming from get_draws::

        >>> df.columns
        Index([u'rei_id', u'modelable_entity_id', u'location_id', u'year_id',
               u'age_group_id', u'sex_id', u'cause_id', u'measure_id',
               u'draw_0', u'draw_1',
               ...
               u'draw_991', u'draw_992', u'draw_993', u'draw_994',
               u'draw_995', u'draw_996', u'draw_997', u'draw_998',
               u'draw_999', u'metric_id'],
              dtype='object', length=1009)

    where we will need to

    1.) use cause_id to slice for the cause-risk pair
    2.) use measure_id (typically 4 for yll) to slice for measure_id
    3.) use metric_id (typically 2 for percent) to slice for metric_id

    Args:
        acause (str): analytical cause.
        rei (str): risk; could also be a vaccine intervention.
        cache_version (str): the FBDPath paf version to save the gbd paf in,
            or to read from.
        gbd_round_id (int): gbd round id.
        sex_ids (list): sexes. Typically [1, 2].
        location_ids (list): locations to get pafs for.
        draws (int): number of draws for output file. This means input files
            will be up/down-sampled to meet this criterion.
        measure_id (int, optional): typically the yll measure id (4). At the
            most detailed level, PAF for yll is equivalent to death, so
            measure_id 4 works the same as measure_id 1 (death). Empirically,
            it seems to pull data faster when called with measure_id=4.
        metric_id (int, optional): typically the percent metric (2).

    Returns:
        (xr.DataArray or None): Dataarray with complete demographic indices,
        sans "scenario".
    """
    if rei in get_vaccine_reis(gbd_round_id):
        # get_draws won't have anything for vaccines
        return None

    cache_file_fbdpath = (
        FBDPath(gbd_round_id=gbd_round_id,
                past_or_future="past",
                stage="paf",
                version=cache_version) / (acause + "_" + rei + ".nc"))

    if cache_file_fbdpath.exists():
        LOGGER.info("{} already exists. Will read from it for gbd paf.".
                    format(cache_file_fbdpath))
        paf_da = open_xr(cache_file_fbdpath).data
        paf_da = paf_da.sel(location_id=location_ids)
        if len(paf_da["draw"]) != draws:
            paf_da = resample(paf_da, draws)
        return paf_da
    else:  # no cache exists, must download & clean
        rei_id = get_rei_id(rei)

        if acause in CAUSES_NOT_IN_GBD_MAP:  # edge case for diarrhea_*
            cause_id = get_cause_id(CAUSES_NOT_IN_GBD_MAP[acause])
        else:
            cause_id = get_cause_id(acause)

        gbd_round = get_gbd_round(gbd_round_id)

        try:
            # We only need it for year_id=gbd_round, but for every other dim
            # we collect everything.
            paf_df = get_draws(gbd_id_type=["cause_id", "rei_id"],
                               gbd_id=[cause_id, rei_id],
                               source="burdenator",
                               year_id=gbd_round,
                               gbd_round_id=gbd_round_id,
                               measure_id=measure_id,
                               metric_id=metric_id)
        except Exception as exc:
            error_message = "Error in get_draws for {}_{}".format(acause, rei)
            LOGGER.error(error_message)
            raise IOError(str(exc))

        paf_df = paf_df.drop(columns=["year_id", "rei_id", "cause_id",
                                      "measure_id", "metric_id"])
        # don't need these anymore

        paf_da = df_to_xr(paf_df,
                          dims=["location_id", "age_group_id", "sex_id"],
                          wide_dim_name="draw",
                          wide_dim_transform=lambda x: int(x.split("_")[1]),
                          fill_value=np.nan)
        paf_da = paf_da.sortby("draw")  # draws don't always come in sorted

        paf_da = _data_cleaning_for_paf(paf_da, acause, rei, "GBD")

        LOGGER.info("Saving downloaded & cleaned {}".
                    format(cache_file_fbdpath))
        save_xr(paf_da, cache_file_fbdpath, metric="percent",
                space="identity", cause_id=cause_id, rei_id=rei_id,
                gbd_round_id=gbd_round_id, year_id=gbd_round,
                measure_id=measure_id, metric_id=metric_id,
                upper_bound=PAF_UPPER_BOUND, lower_bound=PAF_LOWER_BOUND)

        if len(paf_da["draw"]) != draws:
            paf_da = resample(paf_da, draws)

        return paf_da
def arima_and_ystar(acause, agg_version, arima_version, smoothing, years,
                    measure, intercept_shift, gbd_round_id, draws, decay,
                    dryrun=False, no_correction=False, past_version="best",
                    no_arima=False, **kwargs):
    r"""Samples mortality residuals from an ARIMA and forms
    $y^* = \hat{y} + \hat{\epsilon}$.

    :param str acause: name of the target acause to aggregate to.
    :param str agg_version: name of the aggregate version.
    :param str arima_version: name of the arima version.
    :param list[str] smoothing: what dimensions to smooth over during the
        ARIMA step.
    :param fbd_core.argparse.YearRange years: a container for the three years
        which define our forecast.
    :param str measure: the measure being forecasted.
    :param bool intercept_shift: whether to intercept-shift y_star.
    :param int gbd_round_id: gbd round id.
    :param int draws: number of draws to take.
    :param float decay: decay rate used when drawing epsilons.
    :param bool dryrun: dryrun flag. This is a test run if True.
    :param bool no_correction: if False, perform log bias correction.
    :param str past_version: version of past data used for intercept shifts.
    :param bool no_arima: if True, skip the ARIMA step and use y_hat directly.
    """
    logger.debug("Opening: {}".format(FILEPATH))
    y_hat = xr.open_dataarray(str(FILEPATH))

    # GK intercept shift
    y_hat = gis.intercept_shift_at_draw(y_hat, acause, past_version,
                                        gbd_round_id, years, draws)
    save_xr(y_hat, FILEPATH, root_dir="scratch", metric="rate", space="log")

    y_past = _get_y_past(acause, years, measure, gbd_round_id,
                         past_version=past_version)
    past_years = years.past_years

    if not no_arima:
        # ARIMA for everything except NTDs
        logger.info("Computing epsilon_past.")
        epsilon_past_with_scenarios_and_draws = (
            y_past.loc[dict(year_id=past_years)]
            - y_hat.loc[dict(year_id=past_years)])
        epsilon_past = epsilon_past_with_scenarios_and_draws.loc[dict(
            scenario=0)].mean("draw")
        try:
            epsilon_hat = xr.open_dataarray(str(FILEPATH))
        except Exception:
            epsilon_hat = _draw_epsilons(epsilon_past, draws, smoothing,
                                         years, acause, decay,
                                         gbd_round_id=gbd_round_id)
            if not dryrun:
                logger.info("Saving epsilon_hat to {}".format(FILEPATH))
                _save_netcdf(epsilon_hat, FILEPATH)
        y_star = _get_y_star(y_hat, epsilon_hat, years).copy()
    else:
        # No ARIMA for NTDs
        y_star = y_hat
    y_star.name = "value"

    # Intercept shift and bias correction
    if intercept_shift:
        y_star = _intercept_shift(acause, y_star, years, measure,
                                  gbd_round_id, draws=draws,
                                  no_arima=no_arima,
                                  past_version=past_version)
    if not no_correction:
        y_star = xr.ufuncs.log(bias_exp(y_star))

    if not dryrun:
        logger.info("Saving y_star to {}".format(FILEPATH))
        _save_netcdf(y_star, FILEPATH)
def compute_scalar(acause, version, gbd_round_id, no_update_past, **kwargs):
    """
    Computes and saves scalars for acause, given the upstream paf version.

    Args:
        acause (str): cause to compute scalars for.
        version (str): date/version string pointing to the folder to pull
            data from.
        gbd_round_id (int): gbd round id.
        no_update_past (bool): whether to skip recomputing existing past
            scalars.
    """
    risk_table = get_risk_hierarchy(gbd_round_id)
    risk_id_dict = get_risk_id_dict(risk_table)  # {id: risk}
    cause_risks = get_acause_related_risks(acause, gbd_round_id)  # list of risks

    for past_or_future in ["past", "future"]:
        LOGGER.info("Computing {} scalars for {}".format(past_or_future,
                                                         acause))
        outpath_scalar = (
            FBDPath(gbd_round_id=gbd_round_id,
                    past_or_future=past_or_future,
                    stage="scalar",
                    version=version) / ("{}.nc".format(acause)))

        if os.path.exists(str(outpath_scalar)) and no_update_past:
            continue

        # Aggregate PAF for level-1 cluster risks.
        # We don't need to use this PAF for the scalar.
        # Take the risks associated with the cause (cause_risks) and make a
        # dict of all the level 1, 2, 3 parent risks of those risks, with the
        # list of their sub-risks (within cause_risks) as values. The keys may
        # include risks outside of cause_risks, and the values are subsets of
        # cause_risks.
        risk_lst = get_cluster_risks(cause_risks, risk_id_dict, risk_table)
        for key in risk_lst.keys():  # loop over all antecedent risks
            LOGGER.info("Looping over super/parent risks.")
            subrisks = risk_lst[key]
            if len(subrisks) > 0:
                LOGGER.info("Start aggregating cluster risk: {}".format(key))
                aggregate_paf(acause, subrisks, gbd_round_id, past_or_future,
                              version, cluster_risk=key)
            gc.collect()

        # Aggregate PAF for all risks.
        # We need to use this PAF for the scalar.
        paf_mediated = aggregate_paf(acause, cause_risks, gbd_round_id,
                                     past_or_future, version)

        if paf_mediated is None:
            LOGGER.info("No paf_mediated. Early return.")
            return

        scalar = 1.0 / (1.0 - paf_mediated)

        del paf_mediated
        gc.collect()

        LOGGER.debug("Checking data values for {} scalar".format(acause))
        data_value_check(scalar)  # make sure no NaNs or values < 0

        save_xr(scalar, outpath_scalar, metric="number", space="identity",
                acause=acause, version=version, gbd_round_id=gbd_round_id,
                no_update_past=str(no_update_past))
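# Worked example of the scalar formula above, with a made-up mediated PAF.
# In the usual risk-deletion reading, risk-deleted rate = full rate * (1 - PAF),
# so multiplying the risk-deleted rate by the scalar recovers the full rate.
_paf_mediated = 0.2
_scalar = 1.0 / (1.0 - _paf_mediated)
print(_scalar)  # 1.25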
def make_tfr_and_agg(asfr_version, pop_version, gbd_round_id, years, model,
                     hyperparam, **kwargs):
    """
    From asfr and pop, make asfr_agg, tfr, and tfr_agg, and export files for
    pipeline and plotting needs.

    Args:
        asfr_version (str): intercept-shifted asfr version where an "asfr.nc"
            with both past and future is present.
        pop_version (str): future pop version to use for aggregation.
        gbd_round_id (int): gbd round id.
        years (YearRange): past_start:forecast_start:forecast_end.
    """
    pop_fbd_path = FBDPath(gbd_round_id=gbd_round_id,
                           past_or_future="future",
                           stage="population",
                           version=pop_version)
    # Only need females for fertility studies
    pop = open_xr(pop_fbd_path / "population.nc").data.\
        sel(sex_id=2, year_id=years.forecast_years)

    agg = Aggregator(pop)
    locs = db.get_locations_by_max_level(3)
    hierarchy = locs[["location_id", "parent_id"]].\
        set_index("location_id").to_xarray().parent_id

    asfr_fbd_path = FBDPath(gbd_round_id=gbd_round_id,
                            past_or_future="future",
                            stage="asfr",
                            version=asfr_version)
    asfr = open_xr(asfr_fbd_path / "asfr.nc").data.\
        sel(year_id=years.forecast_years)

    asfr_agg = agg.aggregate_locations(hierarchy, data=asfr).rate

    # Calculate TFR
    tfr = calc_tfr_from_asfr(asfr)
    tfr_agg = calc_tfr_from_asfr(asfr_agg)

    # Saving to .nc files
    asfr.name = "value"
    tfr.name = "value"
    asfr_agg.name = "value"
    tfr_agg.name = "value"

    LOGGER.info("saving asfr_agg, tfr, tfr_agg to .nc")

    save_xr(asfr_agg, asfr_fbd_path / "asfr_agg_based_on_preliminary_pop.nc",
            metric="rate", space="identity", asfr_version=asfr_version,
            pop_version=pop_version)

    tfr_fbd_path = FBDPath(gbd_round_id=gbd_round_id,
                           past_or_future="future",
                           stage="tfr",
                           version=asfr_version)
    save_xr(tfr, tfr_fbd_path / "tfr.nc", metric="rate", space="identity",
            asfr_version=asfr_version)
    save_xr(tfr_agg, tfr_fbd_path / "tfr_agg_based_on_preliminary_pop.nc",
            metric="rate", space="identity", asfr_version=asfr_version,
            pop_version=pop_version)

    LOGGER.info("Saving quantiles and means to .csv")

    asfr.mean("draw").to_dataframe().reset_index().\
        to_csv(asfr_fbd_path / "asfr_mean.csv", index=False)
    asfr_quantiles = asfr.quantile([0.025, 0.975], "draw")
    asfr_quantiles.sel(quantile=0.025).to_dataframe().reset_index().\
        to_csv(asfr_fbd_path / "asfr_lower.csv", index=False)
    asfr_quantiles.sel(quantile=0.975).to_dataframe().reset_index().\
        to_csv(asfr_fbd_path / "asfr_upper.csv", index=False)

    asfr_agg.mean("draw").to_dataframe().reset_index().\
        to_csv(asfr_fbd_path / "asfr_agg_based_on_preliminary_pop_mean.csv",
               index=False)
    asfr_agg_quantiles = asfr_agg.quantile([0.025, 0.975], "draw")
    asfr_agg_quantiles.sel(quantile=0.025).to_dataframe().reset_index().\
        to_csv(asfr_fbd_path / "asfr_agg_based_on_preliminary_pop_lower.csv",
               index=False)
    asfr_agg_quantiles.sel(quantile=0.975).to_dataframe().reset_index().\
        to_csv(asfr_fbd_path / "asfr_agg_based_on_preliminary_pop_upper.csv",
               index=False)

    tfr.mean("draw").to_dataframe().reset_index().\
        to_csv(tfr_fbd_path / "tfr_mean.csv", index=False)
    tfr_quantiles = tfr.quantile([0.025, 0.975], "draw")
    tfr_quantiles.sel(quantile=0.025).to_dataframe().reset_index().\
        to_csv(tfr_fbd_path / "tfr_lower.csv", index=False)
    tfr_quantiles.sel(quantile=0.975).to_dataframe().reset_index().\
        to_csv(tfr_fbd_path / "tfr_upper.csv", index=False)

    tfr_agg.mean("draw").to_dataframe().reset_index().\
        to_csv(tfr_fbd_path / "tfr_agg_based_on_preliminary_pop_mean.csv",
               index=False)
    tfr_agg_quantiles = tfr_agg.quantile([0.025, 0.975], "draw")
    tfr_agg_quantiles.sel(quantile=0.025).to_dataframe().reset_index().\
        to_csv(tfr_fbd_path / "tfr_agg_based_on_preliminary_pop_lower.csv",
               index=False)
    tfr_agg_quantiles.sel(quantile=0.975).to_dataframe().reset_index().\
        to_csv(tfr_fbd_path / "tfr_agg_based_on_preliminary_pop_upper.csv",
               index=False)
def forecast_edu_main(transform, past_version, forecast_version, pv_version,
                      weight_strategy, gbd_round_id, years,
                      reference_scenario, diff_over_mean, truncate,
                      truncate_quantiles, replace_with_mean, draws, **kwargs):
    """Forecast education with the ARC method and save the result."""
    LOGGER.debug("weight strategy: {}".format(weight_strategy.__name__))

    pv_path = FBDPath("".format())  # Path removed for security reasons
    rmse = open_xr(pv_path / "education_arc_weight_rmse.nc").data
    weight_exp = weight_strategy(rmse, draws)
    LOGGER.info("omega selected: {}".format(weight_exp))

    LOGGER.debug("Reading in the past")
    past_path = FBDPath("".format())  # Path removed for security reasons
    past = resample(open_xr(past_path / "education.nc").data, draws)
    past = past.sel(year_id=years.past_years)

    if isinstance(weight_exp, (float, int)):
        extra_dim = None
    else:
        if not isinstance(weight_exp, xr.DataArray):
            omega_exp_err_msg = (
                "`omega` must be either a float, an int, or an "
                "xarray.DataArray")
            LOGGER.error(omega_exp_err_msg)
            raise RuntimeError(omega_exp_err_msg)
        elif len(weight_exp.dims) != 1 or "draw" not in weight_exp.dims:
            omega_exp_err_msg = (
                "If `omega` is an xarray.DataArray, then it must have only "
                "1 dim, `draw`")
            LOGGER.error(omega_exp_err_msg)
            raise RuntimeError(omega_exp_err_msg)
        elif not weight_exp["draw"].equals(past["draw"]):
            omega_err_msg = (
                "If `omega` is an xarray.DataArray, then its `draw` dim "
                "must have the same coordinates as `past`")
            LOGGER.error(omega_err_msg)
            raise RuntimeError(omega_err_msg)
        else:
            extra_dim = "draw"

    forecast = arc_forecast_education(
        past, gbd_round_id, transform, weight_exp, years,
        reference_scenario, diff_over_mean, truncate, truncate_quantiles,
        replace_with_mean, extra_dim=extra_dim)

    forecast_path = FBDPath("".format())  # Path removed for security reasons

    if isinstance(weight_exp, xr.DataArray):
        report_omega = float(weight_exp.mean())
    else:
        report_omega = weight_exp

    save_xr(forecast, forecast_path / "education.nc", metric="number",
            space="identity", omega=report_omega,
            omega_strategy=weight_strategy.__name__)
    LOGGER.info("education forecasts have been saved")
def run_against(version, pop_version, asfr_version, lifetable_version,
                migration_version, srb_version, gbd_round_id, location_idx,
                years, location_id, draws, test=False):
    """
    Takes versions for files, finds the files, and computes future
    populations. It then saves those files. This is what you call from the
    pipeline.

    Args:
        version (str): Version name for the output population.
        pop_version (str): Version for past population.
        asfr_version (str): Version for asfr.
        lifetable_version (list[str]): List of versions for lifetable.
        migration_version (list[str]): List of versions for migration.
        srb_version (str): Version for sex ratio at birth.
        gbd_round_id (int): GBD Round ID, 4 is 2016.
        location_idx (int|None): Zero-based index into the list of locations.
        years (YearRange): Years for past and forecast.
        location_id (int|None): A location ID.
        draws (int): Number of draws to use.
        test (bool): Run a reduced subset of locations and draws.

    Returns:
        None
    """
    out_path = FBDPath("/{}/future/population/{}".format(
        gbd_round_id, version))
    try:
        out_path.mkdir(parents=True, exist_ok=True)
    except OSError as ose:
        LOGGER.error("Could not create output directory {}: {}".format(
            out_path, ose))

    asfr_lim, lifetable_lim, pop, migration, srb = \
        agreement_rules(
            *read_datasets(
                asfr_version, gbd_round_id, lifetable_version, pop_version,
                migration_version, years, srb_version, draws),
            years
        )

    ruler = timeline(pop.age_group_id.values, asfr_lim.age_group_id.values)

    locations = pop.location_id.values
    if location_idx is not None:
        try:
            locations = [locations[location_idx]]
            LOGGER.info("Using location_id {} from location_idx {}".format(
                locations, location_idx))
        except IndexError:
            LOGGER.warning("Asked for out-of-bounds location {} of {}".format(
                location_idx, locations.shape[0]))
            exit(0)  # Maybe you ask for 200 jobs but have 195 countries. OK.
    elif location_id is not None:
        locations = [location_id]
    else:
        locations = pop.location_id.values

    for location in locations:
        begin_time = perf_time()
        loc_idx = dict(location_id=location)
        future = one_location(
            pop.loc[loc_idx], asfr_lim.loc[loc_idx],
            lifetable_lim.loc[loc_idx], migration.loc[loc_idx],
            srb.loc[loc_idx], ruler, gbd_round_id, years, test)
        out_name = out_path / "{}.nc".format(location)
        future.coords["location_id"] = location
        summary = summarize_pop(future)
        elapsed = perf_time() - begin_time
        LOGGER.info("Elapsed {}".format(elapsed))
        write_begin = perf_time()
        save_xr(summary, out_name, metric="number", space="identity",
                death=version, pop=pop_version, asfr=asfr_version,
                lifetable=lifetable_version, migration=migration_version,
                srb=srb_version)
        LOGGER.info("Wrote {}".format(out_name))
        LOGGER.info("Write time elapsed {}".format(perf_time() - write_begin))
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("--version", type=str,
                        help="Which version of migration to balance.")
    parser.add_argument(
        "--gbd_round_id", type=int, required=True,
        help="Which gbd_round_id to use in file loading and saving")
    args = parser.parse_args()

    mig_dir = FBDPath(
        f"/{args.gbd_round_id}/future/migration/{args.version}/")

    # Try to load the split data, else combine csvs into a dataarray
    try:
        mig_path = mig_dir / "migration_split.nc"
        mig_da = open_xr(mig_path).data
    except FileNotFoundError:
        # Data doesn't yet exist
        mig_da = combine_and_save_mig(version=args.version)

    balanced_mig_da = balance_migration(mig_da)

    # Save to the forecasting directory
    balanced_path = mig_dir / "migration.nc"
    save_xr(balanced_mig_da, balanced_path, metric="number",
            space="identity")

    great_job.congratulations()  # You did it!
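# Hedged sketch of the property the balancing step is assumed to enforce:
# across all locations, in-migration should offset out-migration, so the
# balanced counts sum to approximately zero for each year (and draw).
# balance_migration itself is internal; this only illustrates the check.
import numpy as np
import xarray as xr

_mig = xr.DataArray(np.array([[10.0, -4.0], [-6.0, 5.0], [-4.0, -1.0]]),
                    dims=["location_id", "year_id"],
                    coords={"location_id": [1, 2, 3],
                            "year_id": [2030, 2031]})
_net_by_year = _mig.sum("location_id")
print(_net_by_year.values)  # [0. 0.], already balanced in this toy example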