예제 #1
0
def read_cod_draw_files(pool, parent_dir, location_id, years):
    """Pull in all data to be summarized for CoD, by location and filtering by
    years."""
    logger = logging.getLogger('summary.read_cod_draw_files')
    try:
        # Location-aggregated rescaled draws: one file per
        # measure/location/year.
        rescaled_source = DrawSource({
            'draw_dir': os.path.join(parent_dir, 'aggregated/rescaled'),
            'file_pattern': '{measure_id}_{location_id}_{year_id}.h5',
        })
        # measure_id 1 — presumably deaths (matches Measures.Ids.DEATHS
        # usage elsewhere in this codebase); confirm against constants.
        rescaled_draws = rescaled_source.content(
            filters={'location_id': location_id,
                     'year_id': years,
                     'measure_id': 1})

        # Dalynator draws: one file per measure/location.
        daly_source = DrawSource({
            'draw_dir': os.path.join(parent_dir, 'draws'),
            'file_pattern': '{measure_id}_{location_id}.h5',
        })
        dalynator_draws = daly_source.content(
            filters={'location_id': location_id,
                     'year_id': years,
                     'measure_id': 1})
        return rescaled_draws, dalynator_draws
    except Exception as e:
        # NOTE(review): swallows the error after logging, so callers
        # receive None instead of the two-frame tuple on failure.
        logger.exception('Failed to read location: {}'.format(e))
예제 #2
0
 def compute_percent(self):
     """Compute percent estimates using rate-metric draws as the denominator."""
     # Read the cause draws back out of the in-memory mock store.
     source = DrawSource(
         {"draw_dict": self._io_mock, "name": "cause"},
         mem_read_func,
     )
     rate_draws = source.content(filters={"metric_id": metrics.RATE})
     self._compute_percent(denom_df=rate_draws)
예제 #3
0
def read_shocks_draw_files(parent_dir, location_id):
    """Reads in rescaled draw files."""
    # Shock draws live under aggregated/shocks; the measure id is baked
    # into the file pattern as the literal prefix '1'.
    shock_source = DrawSource({
        'draw_dir': os.path.join(parent_dir, 'aggregated/shocks'),
        'file_pattern': '1_{location_id}_{year_id}.h5',
    })
    return shock_source.content(filters={'location_id': location_id})
예제 #4
0
 def estimate_single_component(self, component):
     """Summarize one component's draws into mean point estimates."""
     # Pull the component's draws from the in-memory mock store.
     source = DrawSource(
         {"draw_dict": self.io_mock, "name": component},
         mem_read_func,
     )
     estimates = compute_estimates(source.content(), point_estimate="mean")
     # Downstream consumers expect the point estimate in a 'val' column.
     return estimates.rename(columns={"mean": "val"})
예제 #5
0
def read_aggregated_rescaled(parent_dir, location_id, diag_years):
    """ Read in location aggregates of rescaled draws for deaths only"""
    source = DrawSource({
        'draw_dir': os.path.join(parent_dir, 'draws'),
        'file_pattern': '{measure_id}_{location_id}.h5'
    })
    # measure_id 1 restricts the read to deaths.
    return source.content(filters={'location_id': location_id,
                                   'year_id': diag_years,
                                   'measure_id': 1})
예제 #6
0
    def _load_data_frame(self):
        """Load a draws DataFrame through a cached DrawSource.

        If 'turn_off_null_check' is true then the null check will be skipped.
        Yuk. GBD 2015 como files have nulls caused by "other maternal" issues
        for males.  Generally it is much safer to validate data, this is
        dangerous but historically necessary.

        This will pass-through NoDrawsError exception raised by the underlying
        SuperGopher implementation if it cannot find any files.

        Will raise ValueError if no files exist. ValueError is used to be
        consistent with other DataSource methods
        """

        logger.debug('Super gopher _load_data_frame, kwargs:')
        # Normalize every filter value to a list so DrawSource filtering
        # treats scalars and sequences uniformly.
        for key, value in self.kwargs.items():
            value = list(np.atleast_1d(value))
            self.kwargs[key] = value
            logger.debug("    {} == {}".format(key, value))
        self.kwargs.update({'strict_filter_checking': True})

        try:
            pattern = self.file_naming_conventions['file_pattern']
            draw_dir = self.dir_path
            h5_tablename = self.file_naming_conventions.get(
                'h5_tablename', None)
            params = {'file_pattern': pattern, 'draw_dir': draw_dir}
            if h5_tablename:
                params.update({'h5_tablename': h5_tablename})

            # Lazily construct and cache the DrawSource. BUG FIX: the
            # original read from the local name 'ds', which was only
            # assigned on the first call (while self.ds was unset); every
            # later call raised NameError. Always go through self.ds.
            if not self.ds:
                self.ds = DrawSource(params=params)
            df = self.ds.content(filters=self.kwargs)
            df = self._add_n_draws(df)

        except ex.InvalidFilter:
            logger.info(
                "Super gopher '{}' found no files with file_pattern: {}"
                ", draw_dir: {}, and filters {}. Stopping pipeline"
                "".format(self.name, pattern, draw_dir, self.kwargs))
            raise

        logger.info('Super gopher "{}" got content, shape {}'.format(
            self.name, df.shape))

        logger.debug(
            ('SuperGopher "{}" got and validated data, dir={}, filter='
             '{}'.format(self.name, self.dir_path,
                         self.file_naming_conventions)))
        return df
예제 #7
0
def read_aggregated_rescaled(parent_dir, location_id, diag_years):
    """ Read in location aggregates of rescaled draws for deaths only"""
    ds = DrawSource({
        'draw_dir': os.path.join(parent_dir, FilePaths.DRAWS_DIR),
        'file_pattern': FilePaths.DRAWS_FILE_PATTERN
    })
    # Restrict to deaths for the requested location and diagnostic years.
    return ds.content(filters={
        'location_id': location_id,
        'year_id': diag_years,
        'measure_id': Measures.Ids.DEATHS
    })
예제 #8
0
def read_gbd_draw_files(parent_dir, location_id, years, measure_id):
    """Pull in all data to be summarized for gbd, by location and measure, and
    filtering by years."""
    logger = logging.getLogger('summary.read_gbd_draw_files')
    try:
        source = DrawSource({
            'draw_dir': os.path.join(parent_dir, 'draws'),
            'file_pattern': '{measure_id}_{location_id}.h5'
        })
        draw_filters = {'measure_id': measure_id,
                        'location_id': location_id,
                        'year_id': years}
        return source.content(filters=draw_filters)
    except Exception as e:
        # NOTE(review): logs and swallows, so the caller gets None on
        # failure rather than an exception.
        logger.exception('Failed to read location: {}'.format(e))
예제 #9
0
    def _import_static_draws_by_component(self, component):
        """Copy a component's on-disk draws into the in-memory mock sink."""
        dim = self.dimensions.get_dimension_by_component(
            component, self.measure_id)
        # Draws are stored per-location; this job reads the first (only)
        # location level of the component dimension.
        location_id = dim.index_dim.get_level("location_id")[0]
        draw_dir = os.path.join(
            self.como_version.como_dir, "draws", component, str(location_id))
        disk_source = DrawSource({
            "draw_dir": draw_dir,
            "file_pattern": "{measure_id}_{year_id}_{sex_id}.h5",
        })
        disk_source.add_transform(add_metric)
        mem_sink = DrawSink(
            {"draw_dict": self.io_mock, "name": component}, mem_write_func)

        # Filter to the simulation index, but with the component's own
        # age groups substituted in.
        sim_dim = self.dimensions.get_simulation_dimensions(self.measure_id)
        filters = sim_dim.index_dim.levels.copy()
        filters["age_group_id"] = dim.index_dim.get_level("age_group_id")
        mem_sink.push(disk_source.content(filters=filters))
예제 #10
0
def _get_population(
    version: MachineParameters,
    location_set_id: int = constants.LocationSetId.OUTPUTS,
    agg_loc_sets: Optional[List[int]] = (
        constants.LocationAggregation.Ids.SPECIAL_LOCATIONS +
        [constants.LocationSetId.OUTPUTS])
) -> pd.DataFrame:
    """
    Unpacks arguments from version object to use with get_population
    function. Requests most detailed ages and most detailed sexes because
    age-sex population aggregates are created in the summarize module.
    Dependant on demographics team to upload population for majority of
    aggregate locations but currently uses AggSynchronous to create population
    information for select Norway locations in LocationSetId.OUTPUTS.

    Arguments:
        version (MachineParameters): object containing all the demographic
            and configuration data needed to query population
            estimates.
        location_set_id (int): The id for hierarchy to aggregate up
        agg_loc_sets (list): Additional location sets to create special
                aggregates

    Return:
        pd.DataFrame

    Raises:
        ValueError: if agg_loc_sets contains duplicates or does not end
            with LocationSetId.OUTPUTS.
    """
    pop = get_population(age_group_id=version.most_detailed_age_group_ids,
                         location_id=version.location_ids,
                         year_id=version.year_ids,
                         sex_id=version.sex_ids,
                         run_id=version.population_version_id,
                         decomp_step=version.decomp_step,
                         gbd_round_id=version.gbd_round_id)
    # In-memory source/sink pair shared by the aggregator below.
    io_mock = {}
    source = DrawSource({"draw_dict": io_mock, "name": "tmp"}, mem_read_func)
    sink = DrawSink({"draw_dict": io_mock, "name": "tmp"}, mem_write_func)
    index_cols = constants.Columns.DEMOGRAPHIC_INDEX
    data_cols = [constants.Columns.POPULATION]
    sink.push(pop[index_cols + data_cols])
    # location
    if agg_loc_sets:
        # Validate explicitly instead of assert: asserts are stripped
        # under `python -O`.
        if len(agg_loc_sets) != len(set(agg_loc_sets)):
            raise ValueError(
                'agg_loc_sets must not contain duplicates: {}'.format(
                    agg_loc_sets))
        if agg_loc_sets[-1] != constants.LocationSetId.OUTPUTS:
            raise ValueError(
                'agg_loc_sets must end with LocationSetId.OUTPUTS: {}'.format(
                    agg_loc_sets))

        # Same non-location index is used by both the operator and the
        # aggregator; compute it once.
        non_location_cols = [
            col for col in index_cols
            if col != constants.Columns.LOCATION_ID
        ]
        for set_id in agg_loc_sets:
            loc_tree = dbtrees.loctree(location_set_id=set_id,
                                       gbd_round_id=version.gbd_round_id)
            aggregator = AggSynchronous(
                draw_source=source,
                draw_sink=sink,
                index_cols=non_location_cols,
                aggregate_col=constants.Columns.LOCATION_ID,
                operator=Sum(index_cols=non_location_cols,
                             value_cols=data_cols))
            aggregator.run(loc_tree)
        special_locations = source.content()
    else:
        special_locations = pd.DataFrame()

    if special_locations.empty:
        # BUG FIX: a bare empty frame has no location_id column, so the
        # filter below would raise AttributeError in the original code.
        # reset_index mirrors concat(..., ignore_index=True).
        return pop.reset_index(drop=True)

    # Only append aggregated locations not already present in pop.
    new_locations = special_locations.loc[
        ~special_locations.location_id.isin(pop.location_id.unique())]
    return pd.concat([pop, new_locations], ignore_index=True)
예제 #11
0
    def new_population(self, location_set_id, agg_loc_sets=()):
        """Build population aggregated over location, age and sex and write
        it to {como_dir}/info/population.h5.

        Args:
            location_set_id (int): hierarchy used to pull the base
                population.
            agg_loc_sets (iterable of int): extra location sets to
                aggregate over. FIX: default changed from the mutable []
                to an immutable () — same iteration behavior, no shared
                mutable default.
        """
        dim = self.nonfatal_dimensions.get_simulation_dimensions(
            self.measure_id)
        df = get_population(
            # 164 is appended to the detailed age groups — presumably the
            # birth age group; confirm against GBD age metadata.
            age_group_id=(
                dim.index_dim.get_level("age_group_id") + [164]),
            location_id=dbtrees.loctree(location_set_id=location_set_id,
                                        gbd_round_id=self.gbd_round_id
                                        ).node_ids,
            sex_id=dim.index_dim.get_level("sex_id"),
            year_id=dim.index_dim.get_level("year_id"))
        index_cols = ["location_id", "year_id", "age_group_id", "sex_id"]
        data_cols = ["population"]

        # In-memory source/sink pair shared by all aggregations below.
        io_mock = {}
        source = DrawSource({"draw_dict": io_mock, "name": "tmp"},
                            mem_read_func)
        sink = DrawSink({"draw_dict": io_mock, "name": "tmp"}, mem_write_func)
        sink.push(df[index_cols + data_cols])

        def _aggregate(tree, agg_col):
            # Sum population over agg_col, holding the rest of the index
            # fixed (shared by the location/age/sex stanzas).
            rest = [col for col in index_cols if col != agg_col]
            AggSynchronous(
                draw_source=source,
                draw_sink=sink,
                index_cols=rest,
                aggregate_col=agg_col,
                operator=Sum(index_cols=rest, value_cols=data_cols),
            ).run(tree)

        # location
        for set_id in agg_loc_sets:
            _aggregate(
                dbtrees.loctree(location_set_id=set_id,
                                gbd_round_id=self.gbd_round_id),
                "location_id")

        # age
        for age_group_id in ComoSummaries._gbd_compare_age_group_list:
            _aggregate(dbtrees.agetree(age_group_id), "age_group_id")

        # sex
        _aggregate(dbtrees.sextree(), "sex_id")

        df = source.content()
        df.to_hdf(
            "{}/info/population.h5".format(self.como_dir),
            'draws',
            mode='w',
            format='table',
            data_columns=index_cols)
예제 #12
0
        # Read in all inputs
        logging.info("Reading in all inputs for {}".format(location))
        rescaled_dir = os.path.join(parent_dir, 'aggregated/rescaled')

        shock_dir = os.path.join(parent_dir, 'aggregated/shocks')
        input_file_pattern = '{measure_id}_{location_id}_{year_id}.h5'

        logging.info("Rescaled draws...")
        rescaled_params = {
            'draw_dir': rescaled_dir,
            'file_pattern': input_file_pattern
        }
        rescaled_ds = DrawSource(rescaled_params)
        rescaled = rescaled_ds.content(filters={
            'location_id': location,
            'measure_id': 1
        })

        logging.info("Shock draws...")
        shock_params = {
            'draw_dir': shock_dir,
            'file_pattern': input_file_pattern
        }
        shock_ds = DrawSource(shock_params)
        shocks = shock_ds.content(filters={
            'location_id': location,
            'measure_id': 1
        })

        logging.info("Rescaled YLL draws...")
        rescaled_yll_params = {
예제 #13
0
File: shocks.py  Project: cheth-rowe/ihmexp
def append_shocks(
        parent_dir: str,
        machine_process: str,
        measure_ids: List[int],
        location_id: int,
        most_detailed_location: bool,
        sex_id: int
) -> None:
    """
    Add yll and death shocks (location-aggregated) to re/scaled ylls and
    re/scaled deaths (also location-aggregated).
    Draws are stored broken down by location and sex for parallel execution.

    Arguments:
        parent_dir (str): root directory of this run's draws
        machine_process (str): selects which saver is dispatched at the end
        measure_ids (list): the measure_ids included in this run
        location_id (int): draws location_id
        most_detailed_location (bool): forwarded to _get_input_filepaths
        sex_id (int): draws sex_id

    """
    scaled_dir, shocks_dir = _get_input_filepaths(
        parent_dir,
        machine_process,
        most_detailed_location
    )
    input_file_pattern = FilePaths.APPEND_SHOCKS_FILE_PATTERN.format(
        sex_id=sex_id, location_id=location_id)

    def _read_draws(base_dir: str, measure_dir: str, measure_id: int):
        # Single DrawSource read filtered to this job's location/sex and
        # the given measure. Replaces four copy-pasted stanzas in the
        # original.
        ds = DrawSource({
            'draw_dir': os.path.join(base_dir, measure_dir),
            'file_pattern': input_file_pattern
        })
        return ds.content(
            filters={
                Columns.LOCATION_ID: location_id,
                Columns.SEX_ID: sex_id,
                Columns.MEASURE_ID: measure_id
            }
        )

    # Deaths
    if Measures.Ids.DEATHS in measure_ids:
        scaled = _read_draws(
            scaled_dir, FilePaths.DEATHS_DIR, Measures.Ids.DEATHS)
        shocks = _read_draws(
            shocks_dir, FilePaths.DEATHS_DIR, Measures.Ids.DEATHS)
        new_scaled = _append_shocks(scaled, shocks)
    else:
        new_scaled = None

    # YLLS
    if Measures.Ids.YLLS in measure_ids:
        scaled_ylls = _read_draws(
            scaled_dir, FilePaths.YLLS_DIR, Measures.Ids.YLLS)
        shock_ylls = _read_draws(
            shocks_dir, FilePaths.YLLS_DIR, Measures.Ids.YLLS)
        new_scaled_ylls = _append_shocks(scaled_ylls, shock_ylls)
    else:
        new_scaled_ylls = None

    # Dispatch to the machine-specific saver.
    save_map = {
        GBD.Process.Name.CODCORRECT: _save_all_codcorrect_outputs,
        GBD.Process.Name.FAUXCORRECT: _save_all_fauxcorrect_outputs
    }
    save_map[machine_process](
        parent_dir,
        new_scaled,
        new_scaled_ylls,
        location_id,
        sex_id
    )