def save_y_star(eps_version, arima_version, years, measure, draws, decay,
                gbd_round_id):
    """
    apply random walk and save the output
    """

    # Assumed path setup; these names were module-level in the original
    # source, and the exact version used to build mig_dir is not shown.
    mig_dir = FBDPath(f"/{gbd_round_id}/future/migration/{eps_version}/")
    eps_path = mig_dir / "eps.nc"

    ds = open_xr(eps_path).data
    try:
        eps_preds = open_xr(f"{mig_dir}/eps_star.nc").data
    except Exception:
        eps_preds = arima_migration(ds, years, draws, decay)
        epsilon_hat_out = mig_dir / "eps_star.nc"
        save_xr(eps_preds, epsilon_hat_out, metric="rate", space="identity")

    # Cap residuals between -10 and 10. With the current migration
    # forecasts, population forecasts to 2100 decline toward 0 for Syria,
    # Latvia, and Jamaica; capping makes those trajectories more reasonable.
    eps_past = eps_preds.sel(year_id=years.past_years)
    eps_preds = eps_preds.sel(year_id=years.forecast_years)
    eps_preds = eps_preds.clip(min=-10, max=10)
    eps_preds = xr.concat([eps_past, eps_preds], dim="year_id")

    pred_path = mig_dir / "mig_hat.nc"
    preds = open_xr(pred_path).data
    preds = preds.sel(year_id=years.years)
    preds = expand_dimensions(preds, draw=range(0, draws))
    y_star = preds + eps_preds

    # Assumed output path; Example 6 below reads mig_star.nc from mig_dir.
    ystar_out = mig_dir / "mig_star.nc"
    save_xr(y_star, ystar_out, metric="rate", space="identity")
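
# For context, a minimal sketch of the kind of decay-damped random walk that
# `arima_migration` is assumed to perform over the residuals. The function
# name, decay semantics, and Gaussian steps are illustrative assumptions,
# not the actual implementation.
import numpy as np

def random_walk_residuals(eps_past, n_years, draws, decay, seed=42):
    """Hypothetical stand-in for arima_migration: start each draw at the
    last observed residual and add Gaussian steps whose scale decays."""
    rng = np.random.default_rng(seed)
    sigma = float(np.std(eps_past))
    level = np.full(draws, eps_past[-1], dtype=float)
    paths = np.empty((draws, n_years))
    for t in range(n_years):
        level = level + rng.normal(0.0, sigma * np.exp(-decay * t), draws)
        paths[:, t] = level
    return paths  # shape: (draw, year)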
# Example 2
def load_forecast_pop(gbd_round_id, version, years, draws):
    """
    Load forecast population data. Aggregates if necessary.

    Args:
        gbd_round_id (int):
            The gbd round ID that the forecast population is from.
        version (str):
            The version of forecast population to read from.
        years (YearRange):
            The forecasting format years to use.
        draws (int):
            The number of draws to resample to.

    Returns:
        xarray.DataArray: The forecast population xarray dataarray
    """
    forecast_pop_dir = FBDPath(f"/{gbd_round_id}/future/population/{version}")
    try:
        forecast_pop_path = forecast_pop_dir / "population_agg.nc"
        forecast_pop_da = open_xr(forecast_pop_path).data
    except FileNotFoundError:  # Need to make agg version
        forecast_pop_path = forecast_pop_dir / "population.nc"
        forecast_pop_da = open_xr(forecast_pop_path).data
        forecast_pop_da = Aggregator.aggregate_everything(
            forecast_pop_da, gbd_round_id).pop
        forecast_pop_out_path = forecast_pop_dir / "population_agg.nc"
        save_xr(forecast_pop_da,
                forecast_pop_out_path,
                metric="number",
                space="identity")

    # slice to correct years and number of draws
    forecast_pop_da = forecast_pop_da.sel(year_id=years.forecast_years)
    forecast_pop_da = resample(forecast_pop_da, draws)

    return forecast_pop_da
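
# `resample` is not shown in this snippet; a minimal sketch of what draw
# resampling could look like, assuming a "draw" dimension (the helper name
# and sampling scheme are assumptions).
import numpy as np

def resample_draws(da, draws, seed=0):
    """Hypothetical resample: return `da` with exactly `draws` draws,
    sampling with replacement only when up-sampling."""
    n = da.sizes["draw"]
    if n == draws:
        return da
    rng = np.random.default_rng(seed)
    idx = rng.choice(n, size=draws, replace=draws > n)
    return da.isel(draw=idx).assign_coords(draw=range(draws))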
def create_age_sex_xarray(gbd_round_id):
    """Combine the Qatar and Eurostat age-sex migration patterns into one
    all-location xarray and save it."""
    LOGGER.debug("Creating xarray of age-sex patterns for migration")
    # load patterns
    qatar = pd.read_csv(QATAR_PATTERN)
    eurostat = pd.read_csv(EUROSTAT_PATTERN)
    # convert to xarrays
    qatar = df_to_xr(qatar, dims=PATTERN_ID_VARS)
    eurostat = df_to_xr(eurostat, dims=PATTERN_ID_VARS)
    # Collect a per-location copy of the appropriate pattern
    all_locs_xr_list = []
    for loc in WPP_LOCATION_IDS:
        if loc in QATAR_LOCS:
            data = qatar
        else:
            data = eurostat
        data = expand_dimensions(data, location_id=[loc])
        all_locs_xr_list.append(data)
    # Concat all locations together
    result = xr.concat(all_locs_xr_list, dim='location_id')
    # Save all locs pattern
    LOGGER.debug("Saving age-sex pattern xarray")
    pattern_dir = FBDPath(f'/{gbd_round_id}/future/migration/'
                          f'{PATTERN_VERSION}')
    pattern_path = pattern_dir / "combined_age_sex_pattern.nc"
    save_xr(result, pattern_path, metric="percent", space="identity")
    LOGGER.debug("Saved age-sex pattern xarray")
    return result
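
# `expand_dimensions` is used throughout these examples to broadcast data
# onto new coordinates; a minimal sketch of the behavior assumed here (the
# real helper likely handles more cases).
import xarray as xr

def expand_dims_like(da, **new_dims):
    """Hypothetical expand_dimensions: copy `da` along each new dimension,
    one slice per requested coordinate value."""
    for dim, coords in new_dims.items():
        da = xr.concat([da.assign_coords({dim: c}) for c in coords], dim=dim)
    return da

# e.g. give a location-less pattern a location_id dimension of length 2:
# expand_dims_like(eurostat, location_id=[33, 151])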
# Example 4
def output_to_xarray(gbd_round, out, version_out):
    asfr_path = FBDPath("/{gri}/future/asfr/{version}".format(
        gri=gbd_round, version=version_out))
    dims = ["location_id", "year_id", "scenario", "age_group_id", "sex_id",
            "draw"]
    out_xr = df_to_xr(out, dims=dims)
    save_xr(out_xr,
            fbdpath=asfr_path / "asfr.nc",
            metric="rate",
            space="identity",
            version=version_out,
            model="asfr_adjusted_to_tfr_plus_point1_if_below2")
# Example 5
def save_paf(paf,
             gbd_round_id,
             past_or_future,
             version,
             acause,
             cluster_risk=None):
    """
    Save mediated PAF at cause level.

    Args:
        paf (pandas.DataFrame): dataframe of PAF.
        gbd_round_id (int): gbd round id.
        past_or_future (str): 'past' or 'future'.
        version (str): version, dated.
        acause (str): analytical cause.
        cluster_risk (str, optional): if None, the PAF is saved at the
            individual-risk level rather than under a cluster risk.
    """
    if cluster_risk is not None:
        out_fbd_path = (FBDPath(gbd_round_id=gbd_round_id,
                                past_or_future=past_or_future,
                                stage="paf",
                                version=version) / "risk_acause_specific" /
                        "{}_{}.nc".format(acause, cluster_risk))

        LOGGER.info("Saving cause-agg risk paf: {}".format(out_fbd_path))
        save_xr(paf,
                out_fbd_path,
                metric="percent",
                space="identity",
                acause=acause,
                risk=cluster_risk,
                version=version,
                gbd_round_id=gbd_round_id)
    else:
        out_fbd_path = (FBDPath(gbd_round_id=gbd_round_id,
                                past_or_future=past_or_future,
                                stage="paf",
                                version=version) / "{}.nc".format(acause))

        LOGGER.info("Saving cause-only paf: {}".format(out_fbd_path))
        save_xr(paf,
                out_fbd_path,
                metric="percent",
                space="identity",
                acause=acause,
                version=version,
                gbd_round_id=gbd_round_id)
# Example 6
def main(migration_version, past_pop_version, forecast_pop_version,
         gbd_round_id, draws, years):
    """
    Load pops and migration rate, multiply to get counts
    """
    # Load migration data
    mig_dir = FBDPath(f"/{gbd_round_id}/future/migration/{migration_version}/")
    mig_path = mig_dir / "mig_star.nc"
    mig_da = open_xr(mig_path).data

    # Load pops
    past_pop_da = load_past_pop(gbd_round_id, past_pop_version)
    forecast_pop_da = load_forecast_pop(gbd_round_id, forecast_pop_version,
                                        years, draws)

    # Give past populations dummy draws/scenarios to be concatenated with
    # forecast pops
    past_pop_da = expand_dimensions(past_pop_da,
                                    draw=forecast_pop_da["draw"].values)
    past_pop_da = expand_dimensions(
        past_pop_da, scenario=forecast_pop_da["scenario"].values)

    # Subset to coordinates relevant to mig_da
    forecast_pop_da = forecast_pop_da.sel(
        sex_id=3,
        age_group_id=22,
        location_id=mig_da.location_id.values,
        scenario=0)
    past_pop_da = past_pop_da.sel(sex_id=3,
                                  age_group_id=22,
                                  location_id=mig_da.location_id.values,
                                  scenario=0)

    # Combine past and forecast pop
    pop_da = past_pop_da.combine_first(forecast_pop_da)

    # Multiply rates by pop to get counts
    mig_counts = mig_da * pop_da
    mig_counts = mig_counts / SCALE_FACTOR

    # Save out
    mig_counts_path = mig_dir / "mig_counts.nc"
    save_xr(mig_counts, mig_counts_path, metric="number", space="identity")
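
# `SCALE_FACTOR` is a module-level constant not shown in the snippet. If
# migration rates are stored per 1,000 population (an assumption), the
# arithmetic works out as:
SCALE_FACTOR = 1000               # hypothetical value
rate_per_1000 = 2.5               # net migration rate for one location-year
population = 4_000_000
migrants = rate_per_1000 * population / SCALE_FACTOR  # -> 10,000 people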
def make_eps(mig_version, model_version, model_name, gbd_round_id, years):
    """Compute residuals (eps = migration_rate - predictions) from a model
    csv and save eps, predictions, and observed rates as .nc files."""
    # Assumed path setup; these names were module-level in the original
    # source, and the csv filename construction is a guess.
    mig_dir = FBDPath(f"/{gbd_round_id}/future/migration/{mig_version}/")
    eps_path = mig_dir / "eps.nc"
    model_path = mig_dir / f"{model_name}.csv"
    df = pd.read_csv(model_path)
    # add all-sex and all-age id columns
    df["sex_id"] = 3
    df["age_group_id"] = 22
    # select the columns we need
    df = df[[
        "location_id", "year_id", "age_group_id", "sex_id", "predictions",
        "migration_rate"
    ]]
    # set index columns
    index_cols = ["location_id", "year_id", "age_group_id", "sex_id"]

    dataset = df.set_index(index_cols).to_xarray()
    dataset["eps"] = dataset["migration_rate"] - dataset["predictions"]

    save_xr(dataset["eps"].sel(year_id=years.past_years),
            eps_path,
            metric="rate",
            space="identity")

    pred_path = mig_dir / "mig_hat.nc"
    save_xr(dataset["predictions"].sel(year_id=years.years),
            pred_path,
            metric="rate",
            space="identity")

    mig_path = mig_dir / "wpp_hat.nc"
    save_xr(dataset["migration_rate"].sel(year_id=years.years),
            mig_path,
            metric="rate",
            space="identity")
def main(forecast_pop_version, input_version, output_version, draws,
         gbd_round_id, measure, years, past_pop_version, past_asfr_version,
         past_mortality_version):
    """Compute deaths or births aggregations and save the data

    Args:
        forecast_pop_version (str):
            The version name of the population forecasts.
        input_version (str):
            The version name of the input mortality or ASFR.
        output_version (str):
            The version name of the output deaths or births to be saved.
        draws (int):
            The number of desired draws.
        gbd_round_id (int):
            The GBD round fed into FBDPath to pull the correct version of ASFR.
        measure (str):
            "death" or "live_births".
        years (YearRange):
            The past and forecasted years.
        past_pop_version (str):
            The version name of the past population.
        past_asfr_version (str):
            The version name of the past ASFR.
        past_mortality_version (str):
            The version name of the past mortality.
    Returns: None.
    """
    forecast_pop = get_pop(forecast_pop_version, gbd_round_id, measure, draws,
                           years, past_pop_version)

    if measure == "live_births":
        forecast_da = get_asfr(input_version, gbd_round_id, draws, years,
                               past_asfr_version)
    else:
        forecast_da = get_mortality(input_version, gbd_round_id, draws, years,
                                    past_mortality_version)

    agg_output = get_agg(forecast_pop, forecast_da, gbd_round_id)

    data_path = FBDPath(f"{gbd_round_id}/future/{measure}/{output_version}")
    save_xr(agg_output,
            f"{data_path}/{measure}.nc",
            metric="number",
            space="identity")

    LOGGER.info(f"{measure} have been calculated")
def main(migration_version, gbd_round_id):
    """Split all-age, both-sex migration counts into age- and sex-specific
    counts using the combined age-sex pattern."""
    # load age-sex pattern (loc, draw, age, sex)
    LOGGER.debug("Loading age-sex migration pattern")
    try:
        pattern_dir = FBDPath(f'/{gbd_round_id}/future/migration/'
                              f'{PATTERN_VERSION}')
        pattern_path = pattern_dir / "combined_age_sex_pattern.nc"
        pattern = open_xr(pattern_path).data
    except FileNotFoundError:  # Data doesn't yet exist
        pattern = create_age_sex_xarray(gbd_round_id)
    # load migration counts (loc, draw, year)
    LOGGER.debug("Loading migration data")
    mig_dir = FBDPath(f"/{gbd_round_id}/future/migration/{migration_version}/")
    mig_path = mig_dir / "mig_counts.nc"
    migration = open_xr(mig_path).data
    migration = migration.squeeze(drop=True)
    # end up with migration counts with age and sex (loc, draw, year, age, sex)
    split_data = migration * pattern
    # Save it!
    LOGGER.debug("Saving age-sex split migration data")

    split_path = mig_dir / "migration_split.nc"
    save_xr(split_data, split_path, metric="number", space="identity")
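
# The split relies on xarray broadcasting: counts indexed by
# (location, draw, year) times a percent pattern indexed by
# (location, age, sex) yield a (location, draw, year, age, sex) array whose
# age-sex slices sum back to the original counts. A toy check:
import numpy as np
import xarray as xr

counts = xr.DataArray(np.full((1, 2), 100.0),
                      dims=("location_id", "year_id"),
                      coords={"location_id": [33], "year_id": [2020, 2021]})
pattern_toy = xr.DataArray([[0.6], [0.4]],
                           dims=("sex_id", "age_group_id"),
                           coords={"sex_id": [1, 2], "age_group_id": [22]})
split = counts * pattern_toy
assert float(split.sum(["sex_id", "age_group_id"]).sel(year_id=2020)) == 100.0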
def one_draw_main(gbd_round_id, years, draw, forecast_version, output_version):
    """Driver function that handles the education cohort correction for a
    single draw.

    Args:
        gbd_round_id (int):
            The gbd round id.
        years (YearRange):
            The past and forecasted years.
        draw (int):
            The draw number to perform the correction on.
        forecast_version (str):
            Forecast version of education.
        output_version (str):
            Cohort corrected version.
    """
    LOGGER.info("Applying cohort correction to draw: {}".format(draw))
    input_dir = FBDPath("".format())  # Path removed for security reasons
    uncorrected_da = open_xr(input_dir / "education.nc").data
    # subset to national and subnational location ids
    location_table = db.get_location_set(gbd_round_id)

    # modeling subnational estimates
    modeled_location_ids = list(location_table["location_id"].unique())
    avail_sex_ids = [
        sex for sex in uncorrected_da["sex_id"].values
        if sex in MODELED_SEX_IDS]

    # Age groups 2, 3, 4, and 5 get filtered out here; they are added back
    # later.
    avail_age_group_ids = [
        age for age in uncorrected_da["age_group_id"].values
        if age in MODELED_AGE_GROUP_IDS]

    uncorrected_draw_da = uncorrected_da.sel(
        sex_id=avail_sex_ids,
        age_group_id=avail_age_group_ids,
        location_id=modeled_location_ids
    ).sel(draw=draw, drop=True)

    # Create cohort information from age groups and year ids.

    cohort_age_df = get_cohort_info_from_age_year(avail_age_group_ids, years)

    corrected_da = get_corrected_da(
        uncorrected_draw_da, cohort_age_df, years)

    # Combine with dropped age groups
    dropped_age_ids = [
        age for age in uncorrected_da["age_group_id"].values
        if age not in MODELED_AGE_GROUP_IDS]

    dropped_age_da = uncorrected_da.sel(
        sex_id=avail_sex_ids,
        age_group_id=dropped_age_ids,
        location_id=modeled_location_ids).sel(draw=draw, drop=True)

    combined_da = xr.concat([dropped_age_da, corrected_da], dim='age_group_id')
    combined_da['draw'] = draw
    op_dir = FBDPath("".format())  # Path removed for security reasons

    save_xr(combined_da, op_dir / "corrected_edu_draw{}.nc".format(draw),
            metric="rate", space="identity")
# Example 11
def main(asfr_version, past_asfr_version, location_id, gbd_round_id, years,
         granularity, iterations, **kwargs):
    """
    1. Read in location-specific draws of period ASFR from CCF stage
    2. Add terminal age group ASFR's
    3. Intercept shift asfr by holding CCF50 constant.
    4. Export location-specific intercept-shifted ASFR in .nc

    Args:
        asfr_version (str): version name of future ccf/asfr.
        past_asfr_version (str): asfr version from past.
        location_id (int): location_id.
        gbd_round_id (int): gbd round id.
        years (YearRange): past_start:forecast_start:forecast_end.
        granularity (int): 1 if the input ASFR is in single-year ages,
            otherwise 5-year age groups.
        iterations (int): number of times to intercept-shift.
    """
    ages_df = db.get_ages(gbd_round_id)[[
        "age_group_id", "age_group_years_start", "age_group_years_end"
    ]]

    # read the location-specific asfr .csv into dataarray
    # the raw forecasted ASFR are stored in the CCF stage of the same
    ccf_fbd_path = FBDPath(gbd_round_id=gbd_round_id,
                           past_or_future="future",
                           stage="ccf",
                           version=asfr_version)
    if granularity == 1:
        sub_folder = "asfr_single_year"
        ccf_asfr_fbd_path = ccf_fbd_path / sub_folder
        future_asfr = read_to_xr(location_id,
                                 ccf_asfr_fbd_path,
                                 dims=list(ASFR_NON_AGE_DIMS + ("age", )))
    else:
        sub_folder = "asfr"
        ccf_asfr_fbd_path = ccf_fbd_path / sub_folder
        future_asfr =\
            read_to_xr(location_id, ccf_asfr_fbd_path,
                       dims=list(ASFR_NON_AGE_DIMS + ("age_group_id",)))
        # we intercept-shift in 1-year ages, so convert to single years
        future_asfr = _expand_5yr_age_groups_to_1yr_ages(future_asfr, ages_df)

    if "sex_id" in future_asfr.dims:
        raise ValueError("Found sex_id dim in future asfr")

    # now etl the past asfr data
    past_asfr_fbd_path = FBDPath(gbd_round_id=gbd_round_id,
                                 past_or_future="past",
                                 stage="asfr",
                                 version=past_asfr_version)
    past_asfr =\
        open_xr(past_asfr_fbd_path /
                "asfr.nc").data.sel(location_id=location_id)

    if "sex_id" in past_asfr.dims:
        raise ValueError("Found sex_id dim in past asfr")

    # past has no scenarios, so we need to expand it for merging
    past_asfr = expand_dimensions(past_asfr, scenario=future_asfr["scenario"])

    # past asfr has age group ids 7-15, but future asfr in ccf only has 8-14.
    # we only need age groups 8-14 for intercept shift
    past_asfr_1yrs = _expand_5yr_age_groups_to_1yr_ages(
        past_asfr.sel(age_group_id=range(8, 15)), ages_df)

    # now ready to concat past and future together for intercept shift
    asfr = xr.concat(
        [past_asfr_1yrs.sel(year_id=years.past_years),
         future_asfr.sel(year_id=years.forecast_years)],
        dim="year_id")

    del past_asfr_1yrs, future_asfr
    gc.collect()

    # the intercept-shift should keep ccf50 (asfr sum) constant
    pre_fix_asfr_sum = asfr.sum()  # sum of all asfr values before shift

    asfr = ccf50_intercept_shift_lpf(asfr, gbd_round_id, years, iterations)

    post_fix_asfr_sum = asfr.sum()  # asfr sum post-shift should stay the same

    assert np.isclose(post_fix_asfr_sum, pre_fix_asfr_sum, rtol=RTOL),\
        f"The intercept shift changed total asfr sum by more than rtol={RTOL}"

    # need to save years.past_end for cohort-component model
    save_years = [years.past_end] + years.forecast_years.tolist()
    asfr = asfr.sel(year_id=save_years)  # keep past_end + forecast years
    # convert forecasted asfr back to 5-year age groups
    asfr = _convert_ages_to_5_year_age_groups_by_mean(asfr, ages_df)
    # add 10-15 (7) and 50-55 (15) age groups for forecasted asfr
    asfr = extrapolate_terminal_asfr_age_groups(past_asfr,
                                                asfr,
                                                last_year=years.past_end)
    asfr["location_id"] = location_id
    asfr.name = "value"

    del past_asfr
    gc.collect()

    LOGGER.info("Finished CCF50 intercept-shift")

    asfr_fbd_path = FBDPath(gbd_round_id=gbd_round_id,
                            past_or_future="future",
                            stage="asfr",
                            version=asfr_version)

    save_xr(asfr,
            asfr_fbd_path / f"{location_id}.nc",
            metric="rate",
            space="identity",
            version=asfr_version,
            past_asfr_version=past_asfr_version,
            iterations=iterations)
# Example 12
def compute_paf(acause, rei, version, years, gbd_round_id, draws,
                sev, rrmax, vaccine_sev, vaccine_rrmax, gbd_paf_version,
                **kwargs):
    r"""
    Computes PAF for the given acause-risk pair, and exports said PAF to
    ``/{gbd_round_id}/{past_or_future}/paf/{version}``.

    Args:
        acause (str): analytical cause.
        rei (str): rei, or commonly called risk.
        version (str): FBDPath version to export to.
        years (YearRange): [past_start, forecast_start, forecast_end] years.
        gbd_round_id (int): gbd round id.
        draws (int): number of draws for output file.  This means input files
            will be up/down-sampled to meet this criterion.
        sev (str): upstream sev version
        rrmax (str): upstream rrmax version
        vaccine_sev (str): upstream vaccine sev version
        vaccine_rrmax (str): upstream vaccine rrmax version
        gbd_paf_version (str): gbd_paf version to read from,
            if not downloading from get_draws().
    """
    sev_da = read_sev(rei=rei, sev=sev, vaccine_sev=vaccine_sev,
                      gbd_round_id=gbd_round_id, years=years, draws=draws)
    rrmax_da = read_rrmax(acause=acause, rei=rei, rrmax=rrmax,
                          vaccine_rrmax=vaccine_rrmax,
                          gbd_round_id=gbd_round_id, years=years, draws=draws)

    # estimated cause-risk-specific paf
    with xr.set_options(arithmetic_join="outer"):
        paf = 1 - 1 / (sev_da * (rrmax_da - 1) + 1)

    location_ids = sev_da["location_id"].values.tolist()
    sex_ids = sev_da["sex_id"].values.tolist()

    del sev_da, rrmax_da
    gc.collect()

    maybe_negative_paf = is_maybe_negative_paf(acause, rei, gbd_round_id)

    # Forecasted PAFs are cleaned first before further processing
    paf = _data_cleaning_for_paf(paf, maybe_negative_paf)

    # now ping get_draws for gbd paf values
    LOGGER.info("Got estimated paf for {}_{}.  Pulling gbd paf...".
                format(acause, rei))

    gbd_round = get_gbd_round(gbd_round_id)

    if gbd_paf_version:  # then we read gbd_paf from this folder
        cache_version = gbd_paf_version
    else:  # default to {gbd_round}_gbd
        cache_version = str(gbd_round) + "_gbd"

    gbd_paf = get_gbd_paf(acause, rei, cache_version, gbd_round_id,
                          sex_ids=sex_ids, location_ids=location_ids,
                          draws=draws)

    LOGGER.info("Pulled gbd paf for {}_{}.  Computing adjusted paf...".
                format(acause, rei))

    # compute correction factor and perform adjustment
    if gbd_paf is not None:

        # First make sure there's no COMPLETE mismatch between paf and gbd_paf.
        # If so, an error should be raised
        paf.load()
        gbd_paf.load()  # need to force load() because dask is lazy
        if (paf - gbd_paf).size == 0:  # normal arithmetic is inner-join
            error_message = ("Complete mismatch between computed and GBD in "
                             "{}-{} PAF.  Are you sure you used the correct "
                             "version of GBD PAF?".format(acause, rei))
            LOGGER.error(error_message)
            raise ValueError(error_message)

        gbd_paf = _data_cleaning_for_paf(gbd_paf, maybe_negative_paf)

        correction_factor = compute_correction_factor(
            paf.sel(year_id=gbd_round), gbd_paf, maybe_negative_paf)

        del gbd_paf
        gc.collect()

        paf = correct_paf(paf, correction_factor, maybe_negative_paf)

        LOGGER.info("Adjusted paf for {}_{}.  Now saving...".
                    format(acause, rei))
    else:  # correction factor is 0, and we leave paf as is
        correction_factor = xr.zeros_like(paf)
        LOGGER.info("paf for {}_{} not adjusted because gbd_paf is None".
                    format(acause, rei))

    # If there are still NaNs at this point, then they should indicate age or
    # sex restrictions and should be filled with 0.
    paf = paf.fillna(0)

    # we need to save the results separately in "past" and "future"
    for p_or_f, yrs in {"past": years.past_years,
                        "future": years.forecast_years}.items():

        out = paf.sel(year_id=yrs)
        out_fbd_path = FBDPath(gbd_round_id=gbd_round_id,
                               past_or_future=p_or_f,
                               stage="paf",
                               version=version)

        # first we save cause-risk-specific paf
        outpath = (out_fbd_path / "risk_acause_specific" /
                   (acause + "_" + rei + ".nc"))

        LOGGER.info("Saving {}".format(outpath))
        save_xr(out, outpath, metric="percent", space="identity",
                acause=acause, risk=rei, gbd_round_id=gbd_round_id,
                sev=sev, rrmax=rrmax, vaccine_sev=vaccine_sev,
                vaccine_rrmax=vaccine_rrmax, gbd_paf_version=cache_version)

        del out
        gc.collect()

        # now saving cause-risk-specific correction factor
        if p_or_f == "past":
            outpath = (out_fbd_path / "risk_acause_specific" /
                       (acause + "_" + rei + "_cf.nc"))

            LOGGER.info("Saving {}".format(outpath))
            save_xr(correction_factor, outpath, metric="percent",
                    space="logit", sev=sev, rrmax=rrmax,
                    vaccine_sev=vaccine_sev, vaccine_rrmax=vaccine_rrmax,
                    gbd_paf_version=cache_version)

    del paf, correction_factor
    gc.collect()
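
# To make the PAF formula above concrete: with SEV = 1 (full exposure) and
# RRmax = 2, PAF = 1 - 1 / (1 * (2 - 1) + 1) = 0.5. A quick scalar check:
def paf_from_sev_rrmax(sev, rrmax):
    """PAF = 1 - 1 / (SEV * (RRmax - 1) + 1), as computed above."""
    return 1 - 1 / (sev * (rrmax - 1) + 1)

assert paf_from_sev_rrmax(1.0, 2.0) == 0.5  # full exposure, RRmax of 2
assert paf_from_sev_rrmax(0.0, 2.0) == 0.0  # no exposure, nothing attributed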
# Example 13
def get_gbd_paf(acause, rei, cache_version, gbd_round_id, sex_ids,
                location_ids, draws, measure_id=4, metric_id=2):
    """
    Downloads and transforms gbd cause-risk-specific PAF.  The dataarray
    is then cleaned and saved in a FBDPath.

    The gbd paf coming from get_draws::
        >>> df.columns
        Index([u'rei_id', u'modelable_entity_id', u'location_id', u'year_id',
               u'age_group_id', u'sex_id', u'cause_id', u'measure_id',
               u'draw_0', u'draw_1', ... u'draw_991', u'draw_992', u'draw_993',
               u'draw_994', u'draw_995', u'draw_996', u'draw_997', u'draw_998',
               u'draw_999', u'metric_id'], dtype='object', length=1009)

    where we will need to:

    1. use cause_id to slice for the cause-risk pair
    2. use measure_id (typically 4 for yll) to slice for measure_id
    3. use metric_id (typically 2 for percent) to slice for metric_id

    Args:
        acause (str): analytical cause.
        rei (str): risk, could also be vaccine intervention.
        cache_version (str): the FBDPath paf version to save the gbd paf in,
            or to read from.
        gbd_round_id (int): gbd round id
        sex_ids (list): sexes.  Typically [1, 2].
        location_ids (list): locations to get pafs from.
        draws (int): number of draws for output file.  This means input files
            will be up/down-sampled to meet this criterion.
        measure_id (int, optional): typically the yll measure id (4).  At the
            most detailed PAF level, yll is equivalent to death, so
            measure_id 4 works the same as measure_id 1 (death).
            Empirically, it pulls data faster when called with measure_id=4.
        metric_id (int, optional): typically the percent metric (2).

    Returns:
        (xr.DataArray/None): Dataarray with complete demographic indices,
            sans "scenario".
    """
    if rei in get_vaccine_reis(gbd_round_id):
        # get_draws won't have anything for vaccines
        return None

    cache_file_fbdpath =\
        FBDPath(gbd_round_id=gbd_round_id,
                past_or_future="past",
                stage="paf",
                version=cache_version) / (acause + "_" + rei + ".nc")

    if cache_file_fbdpath.exists():

        LOGGER.info("{} already exists.  Will read from it for gbd paf.".
                    format(cache_file_fbdpath))

        paf_da = open_xr(cache_file_fbdpath).data

        paf_da = paf_da.sel(location_id=location_ids)

        if len(paf_da["draw"]) != draws:
            paf_da = resample(paf_da, draws)

        return paf_da

    else:  # no cache exists, must download & clean
        rei_id = get_rei_id(rei)

        if acause in CAUSES_NOT_IN_GBD_MAP:  # edge case for diarrhea_*
            cause_id = get_cause_id(CAUSES_NOT_IN_GBD_MAP[acause])
        else:
            cause_id = get_cause_id(acause)

        gbd_round = get_gbd_round(gbd_round_id)

        try:
            # we only need it for year_id=gbd_round, but for every other dim
            # we collect everything.
            paf_df = get_draws(gbd_id_type=['cause_id', 'rei_id'],
                               gbd_id=[cause_id, rei_id],
                               source='burdenator',
                               year_id=gbd_round,
                               gbd_round_id=gbd_round_id,
                               measure_id=measure_id,
                               metric_id=metric_id)
        except Exception as exc:
            error_message = "Error in get_draws for {}_{}".format(acause, rei)
            LOGGER.error(error_message)
            raise IOError(str(exc))

        paf_df = paf_df.drop(columns=["year_id",
                                      "rei_id",
                                      "cause_id",
                                      "measure_id",
                                      "metric_id"])  # no longer needed

        paf_da = df_to_xr(paf_df,
                          dims=["location_id", "age_group_id", "sex_id"],
                          wide_dim_name='draw',
                          wide_dim_transform=lambda x: int(x.split('_')[1]),
                          fill_value=np.nan)

        paf_da = paf_da.sortby("draw")  # draws don't always come in sorted

        paf_da = _data_cleaning_for_paf(paf_da, acause, rei, "GBD")

        LOGGER.info("Saving downloaded & cleaned {}".
                    format(cache_file_fbdpath))

        save_xr(paf_da, cache_file_fbdpath, metric="percent", space="identity",
                cause_id=cause_id, rei_id=rei_id, gbd_round_id=gbd_round_id,
                year_id=gbd_round, measure_id=measure_id, metric_id=metric_id,
                upper_bound=PAF_UPPER_BOUND, lower_bound=PAF_LOWER_BOUND)

    if len(paf_da["draw"]) != draws:
        paf_da = resample(paf_da, draws)

    return paf_da
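
# The wide_dim_transform above turns draw_0 ... draw_999 column names into
# integer draw coordinates. The equivalent plain-pandas reshape, for
# illustration:
import pandas as pd

wide = pd.DataFrame({"location_id": [6], "draw_0": [0.1], "draw_1": [0.2]})
long = wide.melt(id_vars=["location_id"], var_name="draw", value_name="paf")
long["draw"] = long["draw"].str.split("_").str[1].astype(int)
# `long` now has one row per (location_id, draw), ready for
# set_index([...]).to_xarray()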
# Example 14
def arima_and_ystar(acause,
                    agg_version,
                    arima_version,
                    smoothing,
                    years,
                    measure,
                    intercept_shift,
                    gbd_round_id,
                    draws,
                    decay,
                    dryrun=False,
                    no_correction=False,
                    past_version="best",
                    no_arima=False,
                    **kwargs):
    r"""Samples mortality residuals from an ARIMA and forms
    $y^* = \hat{y} + \hat{\epsilon}$.

    :param str acause: name of the target acause to aggregate to.
    :param str agg_version: name of the aggregate version.
    :param str arima_version: name of the arima version.
    :param list[str] smoothing: what dimensions to smooth over during the ARIMA
        step.
    :param fbd_core.argparse.YearRange years: a container for the three years
        which define our forecast.
    :param int draws: number of draws to take.
    :param bool dryrun: dryrun flag. This is a test run if True.
    :param bool bias: Perform log bias correction.
    """
    logger.debug("Opening: {}".format(FILEPATH))
    y_hat = xr.open_dataarray(str(FILEPATH))

    # GK intercept shift
    y_hat = gis.intercept_shift_at_draw(y_hat, acause, past_version,
                                        gbd_round_id, years, draws)
    save_xr(y_hat, FILEPATH, root_dir="scratch", metric="rate", space="log")

    y_past = _get_y_past(acause,
                         years,
                         measure,
                         gbd_round_id,
                         past_version=past_version)

    past_years = years.past_years

    if not no_arima:
        # ARIMA for everything except NTDs
        logger.info("Computing epsilon_past.")
        epsilon_past_with_scenarios_and_draws = (
            y_past.loc[dict(year_id=past_years)] -
            y_hat.loc[dict(year_id=past_years)])
        epsilon_past = epsilon_past_with_scenarios_and_draws.loc[dict(
            scenario=0)].mean("draw")

        try:
            epsilon_hat = xr.open_dataarray(str(FILEPATH))
        except Exception:  # epsilon_hat doesn't exist yet; draw it
            epsilon_hat = _draw_epsilons(epsilon_past,
                                         draws,
                                         smoothing,
                                         years,
                                         acause,
                                         decay,
                                         gbd_round_id=gbd_round_id)
        if not dryrun:
            logger.info("Saving epsilon_hat to {}".format(FILEPATH))
            _save_netcdf(epsilon_hat, FILEPATH)
        y_star = _get_y_star(y_hat, epsilon_hat, years).copy()

    else:
        # no arima for ntds
        y_star = y_hat
        y_star.name = "value"

    # intercept shift and bias
    if intercept_shift:
        y_star = _intercept_shift(acause,
                                  y_star,
                                  years,
                                  measure,
                                  gbd_round_id,
                                  draws=draws,
                                  no_arima=no_arima,
                                  past_version=past_version)
    if not no_correction:
        y_star = xr.ufuncs.log(bias_exp(y_star))

    if not dryrun:
        logger.info("Saving y_star to {}".format(FILEPATH))
        _save_netcdf(y_star, FILEPATH)
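
# The composition y* = y_hat + eps_hat happens in log-rate space (y_hat is
# saved with space="log" above), which is why bias_exp/log bracket the bias
# correction. A toy illustration with made-up values:
import numpy as np

y_hat_toy = np.log(np.array([0.010, 0.011]))  # modeled log mortality rates
eps_hat_toy = np.array([0.05, -0.02])         # sampled residuals, log space
y_star_toy = y_hat_toy + eps_hat_toy          # draws of the final forecast
rates_toy = np.exp(y_star_toy)                # back to identity space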
# Example 15
def compute_scalar(acause, version, gbd_round_id, no_update_past, **kwargs):
    """
    Computes and saves scalars for acause, given upstream paf version.

    Args:
        acause (str): cause to compute scalars for
        version (str): date/version string pointing to folder to pull data from
        gbd_round_id (int): gbd round id.
        no_update_past (boolean): whether to overwrite past scalars.
    """
    risk_table = get_risk_hierarchy(gbd_round_id)
    risk_id_dict = get_risk_id_dict(risk_table)  # {id: risk}

    cause_risks = get_acause_related_risks(acause,
                                           gbd_round_id)  # list of risks

    for past_or_future in ['past', 'future']:
        LOGGER.info("OH BOY WE'RE DOING THE: {}".format(past_or_future))
        outpath_scalar =\
            FBDPath(gbd_round_id=gbd_round_id,
                    past_or_future=past_or_future,
                    stage="scalar",
                    version=version) / ("{}.nc".format(acause))

        if os.path.exists(str(outpath_scalar)) and no_update_past:
            continue

        # Aggregate PAF for level-1 cluster risks
        # We don't need to use the PAF for scalar.

        # take the risks associated with the cause (cause_risks),
        # and make a dict of all the level 1, 2, 3 parent risks of these risks,
        # with list of their sub-risks (within cause_risks) as value.
        # So you end up with keys that may have risks outside of cause_risks,
        # and values that are subsets of cause_risks.
        risk_lst = get_cluster_risks(cause_risks, risk_id_dict, risk_table)

        for key, subrisks in risk_lst.items():  # loop over antecedent risks
            LOGGER.info("Looping over super/parent risks.")

            if len(subrisks) > 0:
                LOGGER.info('Start aggregating cluster risk: {}'.format(key))
                aggregate_paf(acause,
                              subrisks,
                              gbd_round_id,
                              past_or_future,
                              version,
                              cluster_risk=key)
                gc.collect()

        # Aggregate PAF for all risks.
        # We need to use the PAF for scalar.
        paf_mediated = aggregate_paf(acause, cause_risks, gbd_round_id,
                                     past_or_future, version)
        if paf_mediated is None:
            LOGGER.info("No paf_mediated. Early return.")
            return

        scalar = 1.0 / (1.0 - paf_mediated)

        del paf_mediated
        gc.collect()

        LOGGER.debug("Checking data value for {} scalar".format(acause))
        data_value_check(scalar)  # make sure no NaNs or <0 in dataarray

        save_xr(scalar,
                outpath_scalar,
                metric="number",
                space="identity",
                acause=acause,
                version=version,
                gbd_round_id=gbd_round_id,
                no_update_past=str(no_update_past))
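
# `aggregate_paf` is not shown here. Mediated PAF aggregation across risks
# is commonly multiplicative (an assumption about this codebase), and the
# scalar then follows directly, as in the function above:
import numpy as np

def aggregate_paf_multiplicative(pafs):
    """Hypothetical aggregation: 1 - prod(1 - paf_i) over risks."""
    return 1 - np.prod(1 - np.asarray(pafs))

paf_all = aggregate_paf_multiplicative([0.2, 0.5])  # 1 - 0.8 * 0.5 = 0.6
scalar_toy = 1.0 / (1.0 - paf_all)                  # = 2.5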
# Example 16
def make_tfr_and_agg(asfr_version, pop_version, gbd_round_id, years, model,
                     hyperparam, **kwargs):
    """
    From asfr and pop, make asfr_agg, tfr, and tfr_agg, and export
    files for pipeline and plotting needs.

    Args:
        asfr_version (str): intercept-shifted asfr version where an "asfr.nc"
            with both past and future is present.
        pop_version (str): future pop version to use for agg.
        gbd_round_id (int): gbd round id.
        years (YearRange): past_start:forecast_start:forecast_end.
        model (str): fertility model identifier (not used directly here).
        hyperparam: model hyperparameter (not used directly here).
    """
    pop_fbd_path = FBDPath(gbd_round_id=gbd_round_id,
                           past_or_future="future",
                           stage="population",
                           version=pop_version)
    # only need females for fertility studies
    pop = open_xr(pop_fbd_path / "population.nc").data.\
        sel(sex_id=2, year_id=years.forecast_years)

    agg = Aggregator(pop)
    locs = db.get_locations_by_max_level(3)
    hierarchy = locs[["location_id", "parent_id"]].\
        set_index("location_id").to_xarray().parent_id

    asfr_fbd_path = FBDPath(gbd_round_id=gbd_round_id,
                            past_or_future="future",
                            stage="asfr",
                            version=asfr_version)
    asfr = open_xr(asfr_fbd_path / "asfr.nc").data.\
        sel(year_id=years.forecast_years)
    asfr_agg = agg.aggregate_locations(hierarchy, data=asfr).rate

    # Calculate TFR
    tfr = calc_tfr_from_asfr(asfr)
    tfr_agg = calc_tfr_from_asfr(asfr_agg)

    # Saving to .nc files
    asfr.name = "value"
    tfr.name = "value"
    asfr_agg.name = "value"
    tfr_agg.name = "value"

    LOGGER.info("saving asfr_agg, tfr, tfr_agg to .nc")
    save_xr(asfr_agg,
            asfr_fbd_path / "asfr_agg_based_on_preliminary_pop.nc",
            metric="rate",
            space="identity",
            asfr_version=asfr_version,
            pop_version=pop_version)

    tfr_fbd_path = FBDPath(gbd_round_id=gbd_round_id,
                           past_or_future="future",
                           stage="tfr",
                           version=asfr_version)
    save_xr(tfr,
            tfr_fbd_path / "tfr.nc",
            metric="rate",
            space="identity",
            asfr_version=asfr_version)

    save_xr(tfr_agg,
            tfr_fbd_path / "tfr_agg_based_on_preliminary_pop.nc",
            metric="rate",
            space="identity",
            asfr_version=asfr_version,
            pop_version=pop_version)

    print("Saving Quantiles and Means to .csv")
    asfr.mean("draw").to_dataframe().reset_index().\
        to_csv(asfr_fbd_path / "asfr_mean.csv", index=False)
    asfr_quantiles = asfr.quantile([0.025, 0.975], "draw")
    asfr_quantiles.sel(quantile=0.025).to_dataframe().reset_index().\
        to_csv(asfr_fbd_path / "asfr_lower.csv", index=False)
    asfr_quantiles.sel(quantile=0.975).to_dataframe().reset_index().\
        to_csv(asfr_fbd_path / "asfr_upper.csv", index=False)

    asfr_agg.mean("draw").to_dataframe().reset_index().\
        to_csv(asfr_fbd_path / "asfr_agg_based_on_preliminary_pop_mean.csv",
               index=False)
    asfr_agg_quantiles = asfr_agg.quantile([0.025, 0.975], "draw")
    asfr_agg_quantiles.sel(quantile=0.025).to_dataframe().reset_index().\
        to_csv(asfr_fbd_path / "asfr_agg_based_on_preliminary_pop_lower.csv",
               index=False)
    asfr_agg_quantiles.sel(quantile=0.975).to_dataframe().reset_index().\
        to_csv(asfr_fbd_path / "asfr_agg_based_on_preliminary_pop_upper.csv",
               index=False)

    tfr.mean("draw").to_dataframe().reset_index().\
        to_csv(tfr_fbd_path / "tfr_mean.csv", index=False)
    tfr_quantiles = tfr.quantile([0.025, 0.975], "draw")
    tfr_quantiles.sel(quantile=0.025).to_dataframe().reset_index().\
        to_csv(tfr_fbd_path / "tfr_lower.csv", index=False)
    tfr_quantiles.sel(quantile=0.975).to_dataframe().reset_index().\
        to_csv(tfr_fbd_path / "tfr_upper.csv", index=False)

    tfr_agg.mean("draw").to_dataframe().reset_index().\
        to_csv(tfr_fbd_path / "tfr_agg_based_on_preliminary_pop_mean.csv",
               index=False)
    tfr_agg_quantiles = tfr_agg.quantile([0.025, 0.975], "draw")
    tfr_agg_quantiles.sel(quantile=0.025).to_dataframe().reset_index().\
        to_csv(tfr_fbd_path / "tfr_agg_based_on_preliminary_pop_lower.csv",
               index=False)
    tfr_agg_quantiles.sel(quantile=0.975).to_dataframe().reset_index().\
        to_csv(tfr_fbd_path / "tfr_agg_based_on_preliminary_pop_upper.csv",
               index=False)
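
# `calc_tfr_from_asfr` is not shown; for 5-year age groups TFR is
# conventionally 5 times the sum of ASFR over reproductive age groups (an
# assumption about the helper's behavior). A toy check:
import numpy as np
import xarray as xr

toy_asfr = xr.DataArray([0.02, 0.10, 0.08],  # rates for three 5-year groups
                        dims="age_group_id",
                        coords={"age_group_id": [8, 9, 10]})
toy_tfr = 5 * toy_asfr.sum("age_group_id")   # 5 * 0.20 = 1.0
assert np.isclose(float(toy_tfr), 1.0)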
def forecast_edu_main(transform, past_version, forecast_version, pv_version,
                      weight_strategy, gbd_round_id, years, reference_scenario,
                      diff_over_mean, truncate, truncate_quantiles,
                      replace_with_mean, draws, **kwargs):
    """Forecast education with the ARC method, using an omega weight chosen
    by `weight_strategy` from predictive-validity RMSE."""
    LOGGER.debug("weight strategy: {}".format(weight_strategy.__name__))
    pv_path = FBDPath("".format())  # Path removed for security reasons
    rmse = open_xr(pv_path / "education_arc_weight_rmse.nc").data
    weight_exp = weight_strategy(rmse, draws)
    LOGGER.info("omega selected: {}".format(weight_exp))

    LOGGER.debug("Reading in the past")
    past_path = FBDPath("".format())  # Path removed for security reasons
    past = resample(open_xr(past_path / "education.nc").data, draws)
    past = past.sel(year_id=years.past_years)

    if isinstance(weight_exp, (float, int)):
        extra_dim = None
    else:
        if not isinstance(weight_exp, xr.DataArray):
            omega_exp_err_msg = (
                "`omega` must be either a float, an int, or an "
                "xarray.DataArray")
            LOGGER.error(omega_exp_err_msg)
            raise RuntimeError(omega_exp_err_msg)
        elif len(weight_exp.dims) != 1 or "draw" not in weight_exp.dims:
            omega_exp_err_msg = (
                "If `omega` is an xarray.DataArray, then it must have only "
                "1 dim, `draw`")
            LOGGER.error(omega_exp_err_msg)
            raise RuntimeError(omega_exp_err_msg)
        elif not weight_exp["draw"].equals(past["draw"]):
            omega_err_msg = (
                "If `omega` is an xarray.DataArray, then its `draw` dim "
                "must have the same coordinates as `past`")
            LOGGER.error(omega_err_msg)
            raise RuntimeError(omega_err_msg)
        else:
            extra_dim = "draw"

    forecast = arc_forecast_education(past,
                                      gbd_round_id,
                                      transform,
                                      weight_exp,
                                      years,
                                      reference_scenario,
                                      diff_over_mean,
                                      truncate,
                                      truncate_quantiles,
                                      replace_with_mean,
                                      extra_dim=extra_dim)

    forecast_path = FBDPath("".format())  # Path removed for security reasons
    if isinstance(weight_exp, xr.DataArray):
        report_omega = float(weight_exp.mean())
    else:
        report_omega = weight_exp
    save_xr(forecast,
            forecast_path / "education.nc",
            metric="number",
            space="identity",
            omega=report_omega,
            omega_strategy=weight_strategy.__name__)
    LOGGER.info("education forecasts have saved")
# Example 18
def run_against(version,
                pop_version,
                asfr_version,
                lifetable_version,
                migration_version,
                srb_version,
                gbd_round_id,
                location_idx,
                years,
                location_id,
                draws,
                test=False):
    """
    Takes versions for files, finds the files, and computes future
    populations. It then saves those files. This is what you call from
    the pipeline.

    Args:
        version (str): Version name for output
        pop_version (str): Version for population
        asfr_version (str): version for asfr
        lifetable_version (list[str]): List of versions for lifetable
        migration_version (list[str]): List of versions for migration
        srb_version (str): Version for sex ratio at birth
        gbd_round_id (int): GBD Round ID, 4 is 2016
        location_idx (int|None): Zero-based index into list of locations.
        years (YearRange): years for past and forecast.
        location_id (int|None): A location ID.
        draws (int): Number of draws to use.
        test (bool): Run a reduced subset of locations and draws.

    Returns:
        None
    """
    out_path = FBDPath("/{}/future/population/{}".format(
        gbd_round_id, version))
    try:
        out_path.mkdir(parents=True, exist_ok=True)
    except OSError as ose:
        LOGGER.error("Could not create output directory {}: {}".format(
            out_path, ose))

    asfr_lim, lifetable_lim, pop, migration, srb =\
        agreement_rules(
            *read_datasets(
                asfr_version, gbd_round_id,
                lifetable_version, pop_version, migration_version, years,
                srb_version, draws),
            years
        )

    ruler = timeline(pop.age_group_id.values, asfr_lim.age_group_id.values)

    locations = pop.location_id.values
    if location_idx is not None:
        try:
            locations = [locations[location_idx]]
            LOGGER.info("Using location_id {} from location_idx {}".format(
                locations, location_idx))
        except IndexError:
            LOGGER.warning("Asked for out-of-bounds location {} of {}".format(
                location_idx, locations.shape[0]))
            exit(0)  # Maybe you ask for 200 jobs but have 195 countries. OK.
    elif location_id is not None:
        locations = [location_id]
    else:
        locations = pop.location_id.values

    for location in locations:
        begin_time = perf_time()
        loc_idx = dict(location_id=location)

        future = one_location(pop.loc[loc_idx], asfr_lim.loc[loc_idx],
                              lifetable_lim.loc[loc_idx],
                              migration.loc[loc_idx], srb.loc[loc_idx], ruler,
                              gbd_round_id, years, test)
        out_name = out_path / "{}.nc".format(location)
        future.coords["location_id"] = location
        summary = summarize_pop(future)
        elapsed = perf_time() - begin_time
        LOGGER.info("Elapsed {}".format(elapsed))
        write_begin = perf_time()
        save_xr(summary,
                out_name,
                metric="number",
                space="identity",
                death=version,
                pop=pop_version,
                asfr=asfr_version,
                lifetable=lifetable_version,
                migration=migration_version,
                srb=srb_version)
        LOGGER.info("Wrote {}".format(out_name))
        LOGGER.info("Write time Elapsed {}".format(perf_time() - write_begin))
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("--version",
                        type=str,
                        help="Which version of migration to balance.")
    parser.add_argument(
        "--gbd_round_id",
        type=int,
        required=True,
        help="Which gbd_round_id to use in file loading and saving")
    args = parser.parse_args()

    mig_dir = FBDPath(
        f"/{args.gbd_round_id}/future/migration/{args.version}/")

    # Try to load data, else combine csvs into dataarray
    try:
        mig_path = mig_dir / "migration_split.nc"
        mig_da = open_xr(mig_path).data
    except FileNotFoundError:  # Data doesn't yet exist
        mig_da = combine_and_save_mig(version=args.version)

    balanced_mig_da = balance_migration(mig_da)

    # Save to forecasting directory
    balanced_path = mig_dir / "migration.nc"
    save_xr(balanced_mig_da, balanced_path, metric="number", space="identity")

    great_job.congratulations()  # You did it!