Example #1
    def get_dws(self):
        dws = self._get_standard_dws()
        dws = dws.append(self._get_custom_dws())
        dws = dws.append(self._get_inj_dws())
        dws = dws.append(self._get_epi_dws())
        dws = dws.append(self._get_mnd_dws())
        dws = dws.append(self._get_autism_dws())
        dws = dws.append(self._get_uro_dws())

        # healthstate_id 799 - Asymptomatic
        asymp_row = {draw_col: 0 for draw_col in self._draw_cols}
        asymp_row['healthstate_id'] = 799
        dws = dws.append(pd.DataFrame([asymp_row]))
        dws = dws.reset_index(drop=True)
        dws = dws[['healthstate_id'] + self._draw_cols]

        # resample
        if len(self._draw_cols) != len(self.dims.data_list()):
            dimensions = deepcopy(self.dims)
            dimensions.index_dim.drop_level("age_group_id")
            dimensions.index_dim.drop_level("year_id")
            dimensions.index_dim.drop_level("location_id")
            gbdizer = gbdize.GBDizeDataFrame(dimensions)
            dws = gbdizer.correlated_percentile_resample(dws)

        self.dws = dws
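The chained DataFrame.append calls above were deprecated in pandas 1.4 and removed in 2.0. A minimal sketch of the same stacking step using pd.concat, assuming each _get_*_dws helper returns a frame keyed by healthstate_id plus draw columns (get_dws_concat is a hypothetical name):

import pandas as pd

def get_dws_concat(parts, draw_cols):
    # parts: iterable of DataFrames, one per disability-weight source.
    # Stack them, then add the all-zero asymptomatic row (healthstate 799).
    asymp_row = {col: 0 for col in draw_cols}
    asymp_row['healthstate_id'] = 799
    dws = pd.concat(list(parts) + [pd.DataFrame([asymp_row])],
                    ignore_index=True)
    return dws[['healthstate_id'] + draw_cols]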
Example #2
    def copy_and_backfill(self):
        prof_id_cret_old = self.me_map["cretinism"]["srcs"]
        old = self.me_dict[prof_id_cret_old].reset_index()

        # Handle year differences between gbd2016 and gbd2017
        old.loc[old.year_id == 2016, 'year_id'] = 2017
        # Handle Saudi Arabia
        loc_meta = get_location_metadata(location_set_id=35, gbd_round_id=4)
        saudia_id = 152
        saudia_sub_nats = loc_meta.loc[loc_meta.parent_id == saudia_id,
                                       'location_id'].tolist()
        saudi_arabia = old.loc[
            old.location_id.isin(saudia_sub_nats), :].copy()
        saudi_arabia.loc[:, 'location_id'] = saudia_id
        saudi_arabia = saudi_arabia.drop_duplicates(keep='first')
        old = pd.concat([old, saudi_arabia], axis=0)

        # Handle other location differences between gbd2016 and gbd2017
        data_cols = self.draw_cols
        data_dct = {'data_cols': data_cols}
        index_cols = list(set(old.columns) - set(data_cols))
        index_cols.remove('location_id')
        demo = get_demographics(gbd_team='epi', gbd_round_id=5)
        index_dct = {
            tuple(index_cols):
            list(set(tuple(x) for x in old[index_cols].values)),
            'location_id': demo['location_id']
        }
        gbdizer = gbdize.GBDizeDataFrame(
            dimensionality.DataFrameDimensions(index_dct, data_dct))
        new = gbdizer.fill_location_from_nearest_parent(old,
                                                        location_set_id=35,
                                                        gbd_round_id=5)
        prof_id_cret_new = self.me_map["cretinism"]["trgs"]
        self.me_dict[prof_id_cret_new] = new
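The Saudi Arabia block above is a relabel-and-deduplicate trick for promoting subnational rows to their parent location. A standalone sketch of that step (promote_subnationals is a hypothetical name; rows are copied verbatim, not aggregated):

import pandas as pd

def promote_subnationals(df, sub_ids, parent_id):
    # Copy the subnational rows, relabel them as the parent location,
    # and drop exact duplicates so each parent row appears once.
    parent = df.loc[df.location_id.isin(sub_ids), :].copy()
    parent['location_id'] = parent_id
    parent = parent.drop_duplicates(keep='first')
    return pd.concat([df, parent], axis=0, ignore_index=True)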
Example #3
    def get_dws(self):
        dws = self._get_standard_dws()
        dws = dws.append(self._get_custom_dws())
        dws = dws.append(self._get_inj_dws())
        dws = dws.append(self._get_epi_dws())
        dws = dws.append(self._get_mnd_dws())
        dws = dws.append(self._get_autism_dws())
        dws = dws.append(self._get_uro_dws())

        # healthstate_id 799 - Asymptomatic
        draw_cols = ["draw_{}".format(i) for i in range(1000)]
        asymp_row = {'draw_%s' % i: 0 for i in range(1000)}
        asymp_row['healthstate_id'] = 799
        dws = dws.append(pd.DataFrame([asymp_row]))
        dws = dws.reset_index(drop=True)
        dws = dws[['healthstate_id'] + draw_cols]

        # resample
        if len(draw_cols) != len(self.dims.data_list()):
            dimensions = deepcopy(self.dims)
            dimensions.index_dim.drop_level("age_group_id")
            dimensions.index_dim.drop_level("year_id")
            dimensions.index_dim.drop_level("location_id")
            gbdizer = gbdize.GBDizeDataFrame(dimensions)
            dws = gbdizer.random_choice_resample(dws)

        self.dws = dws
Example #4
def _fill_0s(df, draw_source, dimensions):
    # make it square
    filters = draw_source.content_kwargs["filters"].copy()
    dimensions = deepcopy(dimensions)
    dimensions.index_dim.replace_level("location_id", filters["location_id"])
    gbdizer = gbdize.GBDizeDataFrame(dimensions)
    df = gbdizer.fill_empty_indices(df, 0)
    return df
Example #5
 def _simulated_aggregate_prevalence(self):
     if self.como_sim is None:
         raise AttributeError(
             "cannot access _simulated_aggregate_prevalence without "
             "setting como_sim")
     df = self.como_sim.agg_causes.copy()
     gbdizer = gbdize.GBDizeDataFrame(self.dimensions)
     df = gbdizer.random_choice_resample(df)
     return df[self.index_cols + self.draw_cols]
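random_choice_resample is internal to gbdize and its implementation is not shown here; a plausible reading of the name is that it samples draw columns with replacement, reusing the same column picks for every row so that correlation across rows survives. A sketch of that interpretation only:

import numpy as np

def random_choice_resample_sketch(draws, n_out, seed=0):
    # draws: (n_rows, n_in) array of draw values. Sample n_out column
    # indices with replacement and apply the same picks to every row.
    rng = np.random.default_rng(seed)
    idx = rng.integers(0, draws.shape[1], size=n_out)
    return draws[:, idx]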
Example #6
    def _import_draws(self):
        gbdizer = gbdize.GBDizeDataFrame(self.dimensions)

        # import draws
        for me_id in self._importers.keys():
            draw_source = self._importers[me_id]
            draws = draw_source.content(filters=self.filters)
            draws = gbdizer.fill_empty_indices(draws, 0)
            self.draws[me_id] = draws.set_index(self.dimensions.index_names)
Example #7
 def get_id_dws(self):
     dws = pd.read_csv("FILEPATH/combined_id_dws.csv")
     dimensions = deepcopy(self.dims)
     dimensions.index_dim.drop_level("year_id")
     dimensions.index_dim.drop_level("location_id")
     gbdizer = gbdize.GBDizeDataFrame(dimensions)
     dws = dws.reset_index(drop=True)
     if len(self._draw_cols) != len(self.dims.data_list()):
         dws = gbdizer.correlated_percentile_resample(dws)
     self.id_dws = dws
Example #8
    def read_single_en_injury(self,
                              modelable_entity_id,
                              model_version_id,
                              measure_id=[
                                  measures.YLD, measures.INCIDENCE,
                                  measures.ST_PREVALENCE,
                                  measures.LT_PREVALENCE
                              ]):
        injury_source = (
            self._ss_factory.get_en_injuries_modelable_entity_source(
                modelable_entity_id, model_version_id))
        dim = self.dimensions.get_simulation_dimensions(measure_id=measure_id,
                                                        at_birth=False)

        # get filters w/ added years if interpolation is needed
        filters = dim.index_dim.to_dict()["levels"]
        req_years = filters["year_id"]
        if not set(req_years).issubset(set(self._estim_years)):
            filters["year_id"] = list(set(req_years + self._estim_years))

        # read data
        df = injury_source.content(filters=filters)
        if df.empty:
            raise Exception(f"No data returned for ME {modelable_entity_id}, "
                            f"model version {model_version_id}.")
        draw_cols = [col for col in df.columns if "draw_" in col]

        # add indices to dimensions object from draw source transforms
        dim.index_dim.add_level("sequela_id", df.sequela_id.unique().tolist())
        dim.index_dim.add_level("cause_id", df.cause_id.unique().tolist())
        dim.index_dim.add_level("healthstate_id",
                                df.healthstate_id.unique().tolist())
        dim.index_dim.add_level("rei_id", df.rei_id.unique().tolist())

        # interpolate missing years
        if not set(df.year_id.unique()).issuperset(set(req_years)):
            interp_df = pchip_interpolate(df=df,
                                          id_cols=dim.index_names,
                                          value_cols=draw_cols,
                                          time_col="year_id",
                                          time_vals=req_years)
            df = df[df.year_id.isin(req_years)]
            df = df.append(interp_df)
        else:
            df = df[df.year_id.isin(req_years)]

        # resample
        if len(dim.data_list()) != len(draw_cols):
            gbdizer = gbdize.GBDizeDataFrame(dim)
            df = gbdizer.correlated_percentile_resample(df)

        return df
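pchip_interpolate comes from a shared internal module; the underlying math is shape-preserving cubic (PCHIP) interpolation, which scipy exposes as PchipInterpolator. A self-contained sketch of year interpolation over grouped draw data (function name and grouping details are assumptions, not the internal implementation):

import numpy as np
import pandas as pd
from scipy.interpolate import PchipInterpolator

def pchip_years_sketch(df, id_cols, value_cols, time_vals):
    # Interpolate every value column over year_id within each id group.
    group_cols = [c for c in id_cols if c != 'year_id']
    out = []
    for _, grp in df.groupby(group_cols):
        grp = grp.sort_values('year_id')
        interp = PchipInterpolator(grp['year_id'].to_numpy(),
                                   grp[value_cols].to_numpy(), axis=0)
        new = grp.iloc[[0] * len(time_vals)].copy()
        new['year_id'] = time_vals
        new[value_cols] = interp(np.asarray(time_vals))
        out.append(new)
    return pd.concat(out, ignore_index=True)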
Example #9
def fill_square(df, col, gbd_round_id):
    '''make data square across a column for a set of index columns'''
    demo = get_demographics(gbd_team='epi', gbd_round_id=gbd_round_id)
    draw_cols = list(df.filter(like='draw_').columns)
    index_cols = list(set(df.columns) - set(draw_cols))
    index_cols.remove(col)
    index_dct = {
        tuple(index_cols): list(set(tuple(x) for x in df[index_cols].values)),
        col: demo[col]
    }
    data_dct = {'draw_cols': draw_cols}
    gbdizer = gbdize.GBDizeDataFrame(
        dimensionality.DataFrameDimensions(index_dct, data_dct))
    return gbdizer.fill_empty_indices(df, 0)
Example #10
    def read_single_en_injury(self,
                              modelable_entity_id,
                              model_version_id,
                              measure_id=[3, 6, 35, 36]):
        injury_source = (
            self._ss_factory.get_en_injuries_modelable_entity_source(
                modelable_entity_id, model_version_id))
        dim = self.dimensions.get_simulation_dimensions(measure_id)

        # get filters w/ added years if interpolation is needed
        filters = dim.index_dim.to_dict()["levels"]
        req_years = filters["year_id"]
        if not set(req_years).issubset(set(self._estim_years)):
            filters["year_id"] = list(set(req_years + self._estim_years))

        # read data
        df = injury_source.content(filters=filters)
        if df.empty:
            raise Exception("No data returned for meid:{} and mvid:{}".format(
                modelable_entity_id, model_version_id))

        # add indices to dimensions object from draw source transforms
        dim.index_dim.add_level("sequela_id", df.sequela_id.unique().tolist())
        dim.index_dim.add_level("cause_id", df.cause_id.unique().tolist())
        dim.index_dim.add_level("healthstate_id",
                                df.healthstate_id.unique().tolist())
        dim.index_dim.add_level("rei_id", df.rei_id.unique().tolist())

        # interpolate missing years
        if not set(df.year_id.unique()).issuperset(set(req_years)):
            interp_df = pchip_interpolate(df=df,
                                          id_cols=dim.index_names,
                                          value_cols=self._draw_cols,
                                          time_col="year_id",
                                          time_vals=req_years)
            df = df[df.year_id.isin(req_years)]
            df = df.append(interp_df)
        else:
            df = df[df.year_id.isin(req_years)]

        # resample if the draw count differs from 1000
        if len(dim.data_list()) != 1000:
            gbdizer = gbdize.GBDizeDataFrame(dim)
            df = gbdizer.correlated_percentile_resample(df)

        return df
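correlated_percentile_resample is likewise internal; the name suggests a rank-preserving resample in which one shared set of percentile points is drawn and evaluated against every row's draw distribution, so rank correlation across rows is kept. A sketch of that interpretation, not the library's actual code:

import numpy as np

def correlated_percentile_resample_sketch(draws, n_out, seed=0):
    # draws: (n_rows, n_in) array. One random set of percentile points,
    # applied identically to every row, preserves cross-row correlation.
    rng = np.random.default_rng(seed)
    q = rng.uniform(0, 100, size=n_out)
    return np.percentile(draws, q, axis=1).T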
Example #11
def backfill(df, norway_id, code_dir, loc_meta):
    # backfill Norway subnationals from the nearest parent location
    data_cols = ['cases', 'effective_sample_size', 'sample_size']
    data_dct = {'data_cols': data_cols}
    index_cols = list(set(df.columns) - set(data_cols))
    index_cols.remove('location_id')
    norway_subs = loc_meta.loc[loc_meta.parent_id == norway_id,
                               'location_id'].tolist()
    index_dct = {
        tuple(index_cols): list(set(tuple(x) for x in df[index_cols].values)),
        'location_id': norway_subs
    }
    gbdizer = gbdize.GBDizeDataFrame(
        dimensionality.DataFrameDimensions(index_dct, data_dct))
    backfilled = gbdizer.fill_location_from_nearest_parent(df,
                                                           location_set_id=35,
                                                           gbd_round_id=5)
    return backfilled
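fill_location_from_nearest_parent presumably walks each requested location up the hierarchy until it reaches an ancestor that has data, then copies that ancestor's rows down. A toy sketch of the per-location lookup (helper and argument names are hypothetical):

def nearest_parent_with_data(loc_id, have_data, parent_of):
    # have_data: set of location_ids present in the data;
    # parent_of: child -> parent mapping from the location hierarchy.
    while loc_id not in have_data:
        loc_id = parent_of[loc_id]
    return loc_id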
Example #12
def fill_square(df, index_cols, square_col, square_col_vals, fill_val=0):
    """make data square across a column for a set of index columns"""
    # get index dimensions
    index_cols = [col for col in index_cols if col != square_col]
    index_dct = {
        tuple(index_cols): list(set(tuple(x) for x in df[index_cols].values)),
        square_col: square_col_vals
    }

    # get data dimensions
    data_dct = {
        "non_draw_cols":
        [col for col in df.columns if col not in index_cols + [square_col]]
    }

    # make it square
    gbdizer = gbdize.GBDizeDataFrame(
        dimensionality.DataFrameDimensions(index_dct, data_dct))
    df = gbdizer.fill_empty_indices(df, fill_val)
    return df
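The squaring step itself can be reproduced with plain pandas: cross the observed index combinations with the full set of values for the square column, left-join the data back on, and fill the holes. A sketch equivalent in spirit to fill_empty_indices (requires pandas >= 1.2 for how='cross'):

import pandas as pd

def fill_square_sketch(df, index_cols, square_col, square_col_vals,
                       fill_val=0):
    # Cross every observed combination of the other index columns with
    # all square_col values, then left-join the data and fill the gaps.
    others = [c for c in index_cols if c != square_col]
    full = df[others].drop_duplicates().merge(
        pd.DataFrame({square_col: square_col_vals}), how='cross')
    out = full.merge(df, on=others + [square_col], how='left')
    return out.fillna(fill_val)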
Example #13
    def get_id_dws(self):
        draw_cols = ["draw_{}".format(i) for i in range(1000)]

        dws = pd.read_csv("filepath/03_custom/" "combined_id_dws.csv")
        dws['age_end'] = dws['age_end'] + 1
        dws['age_end'] = dws.age_end.replace({101: 200})
        dws.rename(columns={d.replace("_", ""): d
                            for d in draw_cols},
                   inplace=True)
        dws = dws[["age_start", "age_end", "healthstate_id"] + draw_cols]
        dimensions = deepcopy(self.dims)
        dimensions.index_dim.drop_level("year_id")
        dimensions.index_dim.drop_level("location_id")
        gbdizer = gbdize.GBDizeDataFrame(dimensions)

        dws = gbdizer.fill_age_from_continuous_range(dws, 12, "age_start",
                                                     "age_end")
        dws = dws.reset_index(drop=True)
        if len(draw_cols) != len(self.dims.data_list()):
            dws = gbdizer.random_choice_resample(dws)
        self.id_dws = dws
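fill_age_from_continuous_range evidently maps rows keyed by a continuous [age_start, age_end) interval onto discrete age groups (the 12 is presumably an age-group-set id; that is an assumption). A generic interval-to-group expansion with hypothetical group bounds supplied by the caller:

import pandas as pd

def expand_age_range_sketch(df, age_groups):
    # age_groups: DataFrame with age_group_id, group_start and group_end
    # columns (hypothetical inputs). Each row of df is replicated onto
    # every age group whose interval overlaps [age_start, age_end).
    out = df.merge(age_groups, how='cross')
    overlap = ((out.group_start < out.age_end) &
               (out.group_end > out.age_start))
    return out.loc[overlap].drop(columns=['group_start', 'group_end'])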
Example #14
    def get_en_matrices(self):
        if self.dimensions is None or self.como_version is None:
            raise AttributeError(
                "cannot get ncodes via this method if como_version is None or"
                " dimensions is None")
        df_list = []
        years = list(
            set(
                cap_val(self.dimensions.index_dim.levels.year_id,
                        [1990, 1995, 2000, 2005, 2010, 2016])))
        parallelism = ["location_id", "sex_id"]
        for slices in self.dimensions.index_slices(parallelism):
            for year in years:
                nemat = pd.read_csv(
                    "FILEPATH/"
                    "NEmatrix_{location_id}_{year_id}_{sex_id}.csv".format(
                        location_id=slices[0], year_id=year, sex_id=slices[1]))
                nemat = nemat.merge(self.como_version.cause_list,
                                    left_on="ecode",
                                    right_on="acause")
                nemat = nemat.merge(self.como_version.injury_dws_by_sequela,
                                    left_on="ncode",
                                    right_on="n_code")
                nemat = nemat[["cause_id", "age_group_id", "sequela_id"] +
                              self.dimensions.data_list()]
                nemat["location_id"] = slices[0]
                nemat["sex_id"] = slices[1]
                nemat["year_id"] = year
                df_list.append(nemat)

        df = pd.concat(df_list)
        dims = deepcopy(self.dimensions)
        dims.index_dim.add_level("sequela_id", df.sequela_id.unique().tolist())
        dims.index_dim.add_level("cause_id", df.cause_id.unique().tolist())
        dims.index_dim.drop_level("measure_id")
        gbdizer = gbdize.GBDizeDataFrame(dims)
        df = gbdizer.random_choice_resample(df)
        df = gbdizer.fill_year_from_nearest_neighbor(df)
        self.en_matrices = df
Example #15
def square_data(df, ds):
    if df.empty:
        return df

    # collect draw source attributes
    filters = ds.content_kwargs["filters"].copy()
    model_version_id = ds.params['model_version_id']
    modelable_entity_id = ds.params['modelable_entity_id']
    dim = deepcopy(ds.params["dimensions"])

    # construct the dimensions object
    for key, val in list(filters.items()):
        dim.index_dim.replace_level(key, val)
    dim.index_dim.add_level("modelable_entity_id", [modelable_entity_id])
    dim.index_dim.add_level("model_version_id", [model_version_id])
    dim.index_dim.replace_level("year_id", df.year_id.unique().tolist())

    # fill in empty
    gbdizer = gbdize.GBDizeDataFrame(dim)
    df = gbdizer.add_missing_index_cols(df)
    df = gbdizer.fill_empty_indices(df, 0)
    return df
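add_missing_index_cols is not shown either; given how it is used here, a reasonable guess is that it adds any index column the frame lacks as a constant taken from the dimension levels. A sketch of that guess only:

def add_missing_index_cols_sketch(df, levels):
    # levels: dict mapping index name -> list of allowed values. Any
    # index column absent from df with exactly one allowed value is
    # added as a constant; everything else is left for fill_empty_indices.
    for name, vals in levels.items():
        if name not in df.columns and len(vals) == 1:
            df = df.assign(**{name: vals[0]})
    return df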
Example #16
    def _get_short_term_EN_annual(self, dim):
        # get non-interpolated values
        annual_sg = SuperGopher(
            {'file_pattern': '{location_id}/ylds_{year_id}_{sex_id}.dta'},
            os.path.join("filepath", "FILEPATH"))
        annual_df = annual_sg.content(
            location_id=dim.index_dim.get_level("location_id"),
            year_id=dim.index_dim.get_level("year_id"),
            sex_id=dim.index_dim.get_level("sex_id"))

        # clean data
        annual_df = annual_df.merge(self.como_version.cause_list,
                                    left_on="ecode",
                                    right_on="acause")
        annual_df = annual_df.merge(self.como_version.ncode_hierarchy,
                                    left_on="ncode",
                                    right_on="rei")
        annual_df["age"] = annual_df["age"].round(2).astype(str)
        ridiculous_am = {
            '0.0': 2,
            '0.01': 3,
            '0.1': 4,
            '1.0': 5,
            '5.0': 6,
            '10.0': 7,
            '15.0': 8,
            '20.0': 9,
            '25.0': 10,
            '30.0': 11,
            '35.0': 12,
            '40.0': 13,
            '45.0': 14,
            '50.0': 15,
            '55.0': 16,
            '60.0': 17,
            '65.0': 18,
            '70.0': 19,
            '75.0': 20,
            '80.0': 30,
            '85.0': 31,
            '90.0': 32,
            '95.0': 235
        }
        annual_df["age"] = annual_df["age"].replace(ridiculous_am).astype(int)
        annual_df.rename(columns={"age": "age_group_id"}, inplace=True)

        # transform to rate
        annual_df = transform_metric(annual_df, 3, 1)

        # collapse inpatient
        annual_df = annual_df.groupby([
            "location_id", "year_id", "age_group_id", "sex_id", "cause_id",
            "rei_id"
        ]).sum().reset_index()

        # fill demographics
        gbdizer = gbdize.GBDizeDataFrame(dim)
        annual_df = gbdizer.add_missing_index_cols(annual_df)
        annual_df = gbdizer.gbdize_any_by_dim(annual_df, "age_group_id")
        annual_df.fillna(0, inplace=True)

        # resample if necessary
        annual_df = self.resample_if_needed(annual_df, dim, gbdizer)
        return annual_df
Example #17
    def _get_short_term_EN_estimation(self, dim):
        # get non-interpolated values
        estim_sg = SuperGopher(
            {'file_pattern': '{location_id}/ylds_{year_id}_{sex_id}.dta'},
            os.path.join("filepath", "03_outputs/01_draws/ylds"))
        years = list(
            set(
                cap_val(dim.index_dim.levels.year_id,
                        [1990, 1995, 2000, 2005, 2010, 2016]) + [2005]))
        estim_df = estim_sg.content(
            location_id=dim.index_dim.get_level("location_id"),
            year_id=years,
            sex_id=dim.index_dim.get_level("sex_id"))

        # clean data
        estim_df = estim_df.merge(self.como_version.cause_list,
                                  left_on="ecode",
                                  right_on="acause")
        estim_df = estim_df.merge(self.como_version.ncode_hierarchy,
                                  left_on="ncode",
                                  right_on="rei")
        estim_df["age"] = estim_df["age"].round(2).astype(str)
        ridiculous_am = {
            '0.0': 2,
            '0.01': 3,
            '0.1': 4,
            '1.0': 5,
            '5.0': 6,
            '10.0': 7,
            '15.0': 8,
            '20.0': 9,
            '25.0': 10,
            '30.0': 11,
            '35.0': 12,
            '40.0': 13,
            '45.0': 14,
            '50.0': 15,
            '55.0': 16,
            '60.0': 17,
            '65.0': 18,
            '70.0': 19,
            '75.0': 20,
            '80.0': 30,
            '85.0': 31,
            '90.0': 32,
            '95.0': 235
        }
        estim_df["age"] = estim_df["age"].replace(ridiculous_am).astype(int)
        estim_df.rename(columns={"age": "age_group_id"}, inplace=True)

        # transform to rate
        estim_df = transform_metric(estim_df, 3, 1)

        # collapse inpatient
        estim_df = estim_df.groupby([
            "location_id", "year_id", "age_group_id", "sex_id", "cause_id",
            "rei_id"
        ]).sum().reset_index()

        # fill demographics
        data_cols = ["draw_{}".format(i) for i in range(1000)]
        gbdizer = gbdize.GBDizeDataFrame(dim)
        estim_df = gbdizer.add_missing_index_cols(estim_df)
        estim_df = gbdizer.gbdize_any_by_dim(estim_df, "age_group_id")
        estim_df.fillna(0, inplace=True)

        if gbdizer.missing_values(estim_df, "year_id"):
            estim_df = gbdizer.fill_year_by_interpolating(
                df=estim_df,
                rank_df=estim_df[estim_df["year_id"] == 2005],
                data_cols=data_cols)
        estim_df = estim_df[estim_df.year_id.isin(
            dim.index_dim.get_level("year_id"))]

        # resample if necessary
        estim_df = self.resample_if_needed(estim_df, dim, gbdizer)
        return estim_df
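Examples #16 and #17 duplicate the same float-age-to-age_group_id map. If the two methods live in one module, the map could be hoisted to a shared constant (a refactoring suggestion, not code from the source repository):

AGE_START_TO_AGE_GROUP_ID = {
    '0.0': 2, '0.01': 3, '0.1': 4, '1.0': 5, '5.0': 6, '10.0': 7,
    '15.0': 8, '20.0': 9, '25.0': 10, '30.0': 11, '35.0': 12, '40.0': 13,
    '45.0': 14, '50.0': 15, '55.0': 16, '60.0': 17, '65.0': 18, '70.0': 19,
    '75.0': 20, '80.0': 30, '85.0': 31, '90.0': 32, '95.0': 235,
}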
Example #18
    def read_inputs(self):
        """get como draws for a single modelable_entity/model_version"""
        print('Reading draws for (meid, mvid): ({}, {})'.format(
            self.meid, self.mvid))
        if self.super_gopher is None:
            self.super_gopher = SuperGopher.auto(self.meid_data_dir)

        all_draws = []
        reference_draws = []
        missing_dim_q = []
        for dimensions in self.dimensions_q:

            gbdizer = gbdize.GBDizeDataFrame(dimensions)
            try:
                draws = self.super_gopher.content(
                    location_id=dimensions.index_dim.get_level("location_id"),
                    year_id=dimensions.index_dim.get_level("year_id"),
                    sex_id=dimensions.index_dim.get_level("sex_id"),
                    measure_id=dimensions.index_dim.get_level("measure_id"),
                    age_group_id=dimensions.index_dim.get_level(
                        "age_group_id"))
            except InvalidFilter:
                draws = pd.DataFrame(columns=dimensions.index_names)

            if not draws.empty:
                # gbdize, i.e. fill in missing dimensions
                draws = self.gbdize_dimensions(draws, gbdizer)
                # keep a copy of all 1000 draws for interpolation
                reference_draws.append(draws)

                # resample
                draws = self.resample_if_needed(draws, dimensions, gbdizer)

            if len(draws) != dimensions.total_cardinality:
                missing = self.missing_dimensions(draws, dimensions)
                missing_dim_q.append(missing)

            all_draws.append(draws)

        # prep for interpolation of missing demographics
        if len(reference_draws) > 0:
            reference_draws = pd.concat(reference_draws)
        else:
            reference_draws = pd.DataFrame(columns=dimensions.index_names)
        missing_dim_q = list(flatten(missing_dim_q))

        for dimensions in missing_dim_q:

            gbdizer = gbdize.GBDizeDataFrame(dimensions)
            interp_draws, rank_df = self.get_interpolation_draws(
                reference_draws, dimensions)

            if not interp_draws.empty:
                # gbdize, i.e. fill in missing dimensions
                interp_draws = self.gbdize_dimensions(interp_draws, gbdizer)
                rank_df = self.gbdize_dimensions(rank_df, gbdizer)

                # years stored as floats break interpolation, so cast to int
                interp_draws['year_id'] = interp_draws['year_id'].astype(int)
                try:
                    data_cols = ["draw_{}".format(i) for i in range(1000)]
                    interp_draws = gbdizer.fill_year_by_interpolating(
                        interp_draws, rank_df, data_cols)
                except MissingGBDemographics:
                    print(
                        "(meid: {meid}, mvid: {mvid}) "
                        " Could not interpolate for years: {years}, "
                        "measure: {meas} "
                        "location_id: {loc} "
                        "sex_id: {sex}".format(
                            meid=self.meid,
                            mvid=self.mvid,
                            years=dimensions.index_dim.get_level("year_id"),
                            meas=dimensions.index_dim.get_level("measure_id"),
                            loc=dimensions.index_dim.get_level("location_id"),
                            sex=dimensions.index_dim.get_level("sex_id")))
                    interp_draws = self.gbdize_dimensions(
                        interp_draws, gbdizer, "year_id")

                # append draws to reference
                reference_draws = reference_draws.append(interp_draws,
                                                         ignore_index=True)

                draws = interp_draws.loc[interp_draws['year_id'].isin(
                    dimensions.index_dim.get_level('year_id'))]

                # resample
                draws = self.resample_if_needed(draws, dimensions, gbdizer)
                all_draws.append(draws)

            # if dimensions overlap, drop duplicates from reference draws
            reference_draws.drop_duplicates(subset=dimensions.index_names,
                                            inplace=True)

        # concatenate all the results
        draws = pd.concat(all_draws)
        # in case dimensions overlap, drop duplicates
        draws.drop_duplicates(inplace=True)
        draws['modelable_entity_id'] = self.meid

        return draws.reset_index(drop=True)