Example #1
    def __init__(self,
                 regression_launch_set_id=None,
                 shared_package_id=None,
                 shared_package_name=None,
                 formula=None,
                 run_description=None,
                 data_id=None,
                 data_description=None,
                 test=False):
        # Assert arguments passed in are valid to give flexibility in
        # creating a RegressionLauncher
        if run_description is None:
            raise AssertionError("Argument 'run_description' missing.")

        if data_id is not None and data_description is not None:
            # If both data_id and data_description were supplied,
            # that's an issue! We either use an old data version (data_id
            # supplied) or we give new data description and create new data
            raise AssertionError(
                "Only one of the following arguments may be supplied: "
                "'data_id', 'data_description'. Supply 'data_id' to "
                "use an old data version. Supply 'data_description' to "
                "create a new set of data.")

        if regression_launch_set_id is None:
            # If the RLSID isn't passed in, all necessary arguments must
            # be supplied by user!
            for arg in [shared_package_id, formula]:
                assert arg is not None, "Missing needed argument without an " \
                                        "explicit regression_launch_set_id " \
                                        "('shared_package_id', 'formula')."
            assert data_id is not None or data_description is not None, \
                "Either 'data_id' or 'data_description' must be supplied " \
                "if no RLSID is."

        self._test = test
        if not self._test:
            self.assert_repo_is_committed()
        else:
            print("***TEST*** - Not checking script commit status")

        self._launch_set_dict = {
            "regression_launch_set_id": RegressionLauncher.new_launch_set_id(),
            "username": getpass.getuser(),
            "commit_hash": get_git_commit_hash(),
            "timestamp": cod_timestamp(),
            "run_description": run_description,
            "formula": formula,
            "shared_package_id": shared_package_id,
            "shared_package_name": shared_package_name,
            "data_id": data_id,
            "data_description": data_description
        }

        self.fill_in_launch_set_gaps(regression_launch_set_id)
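
A minimal usage sketch of the constructor above, assuming the class is importable from a hypothetical regression_launcher module and that the repo/database helpers it calls are available; the argument values are illustrative only:

# Hypothetical usage sketch -- module path and argument values are assumptions.
from regression_launcher import RegressionLauncher

launcher = RegressionLauncher(
    shared_package_id=14,                      # assumed example id
    formula="deaths ~ age_group_id + sex_id",  # assumed example formula
    run_description="Rerun with updated covariates",
    data_id=123,                               # reuse an existing data version
    test=True,                                 # skip the git commit check
)

# Passing both data_id and data_description raises an AssertionError,
# because the launcher must either reuse old data or create new data.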
Example #2
    def __init__(self):
        self.cg = Configurator('standard')
        self.cache_dir = self.cg.get_directory('db_cache')
        # set test to True if you do not want to write any output files
        self.test = False
        self.cache_options = {
            'force_rerun': True,
            'block_rerun': False,
            'cache_dir': self.cache_dir
        }
        self.dataset_filters = {
            'data_type_id': [8, 9, 10, 12],
            'location_set_id': 35,
            'is_active': True,
            'year_id': range(1980, 2050)
        }
        self.national_nids = self.cg.get_resource("nid_replacements")

        # resources
        self.completeness = self.cg.get_resource("completeness")
        self.env_meta_df = get_env(env_run_id=self.cg.get_id('env_run'),
                                   **self.cache_options)
        self.location_meta_df = get_current_location_hierarchy(
            location_set_version_id=self.cg.get_id('location_set_version'),
            **self.cache_options)
        self.cod_ages = list(
            get_cod_ages(**self.cache_options)['age_group_id'].unique())

        # identifiers
        self.source_cols = ["source", "nid", "data_type_id"]
        self.geo_cols = ["location_id", "year_id"]
        self.meta_cols = ["nationally_representative", "detail_level_id"]
        self.value_cols = ['deaths']
        self.year_end = self.cg.get_id('year_end')
        self.full_time_series = "full_time_series"

        # directories
        self.current_best_version = "2018_04_03_151739"
        self.out_dir = "FILEPATH"
        self.arch_dir = "{}/_archive".format(self.out_dir)
        self.timestamp = cod_timestamp()
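
A small, self-contained sketch of the keyword-unpacking pattern used with cache_options above; load_table and the cache layout here are hypothetical stand-ins for the real cached getters (get_env, get_current_location_hierarchy, get_cod_ages):

import os
import pandas as pd

def load_table(name, force_rerun=False, block_rerun=False, cache_dir="."):
    """Toy cached getter: rebuild the cache when force_rerun, else read it."""
    cache_path = os.path.join(cache_dir, "{}.csv".format(name))
    if not force_rerun and os.path.exists(cache_path):
        return pd.read_csv(cache_path)
    if block_rerun:
        raise RuntimeError("Cache miss for {} and reruns are blocked".format(name))
    df = pd.DataFrame({"age_group_id": [2, 3, 4]})  # stand-in for a real pull
    df.to_csv(cache_path, index=False)
    return df

cache_options = {"force_rerun": True, "block_rerun": False, "cache_dir": "/tmp"}
cod_ages = list(load_table("cod_ages", **cache_options)["age_group_id"].unique())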
Example #3
def pull_vr_data_for_rdp_reg(reg_spec, location_hierarchy, data_id, small_test=False,
                             vr_pull_timestamp=None):
    """Pull vr used to make redistribution proportions.

    If vr_pull_timestamp is passed, and it does in fact exist, then this will
    just read that. Otherwise, it runs a custom get_claude_data based on
    the passed regression specification.
    """

    shared_package_id = reg_spec['shared_package_id']

    if vr_pull_timestamp is not None:
        timestamp = vr_pull_timestamp
    else:
        timestamp = cod_timestamp()

    outdir = "FILEPATH".format(RDP_REG_DIR, shared_package_id)
    outpath = "FILEPATH".format(outdir, data_id, timestamp)

    if vr_pull_timestamp is not None:
        print_log_message("Reading VR data pulled on {}".format(vr_pull_timestamp))
        if not os.path.exists(outpath):
            raise ValueError(
                "Passed [vr_pull_timestamp={}], but {} does not exist. "
                "Need to either pass a different version that does exist, or"
                " run a new vr pull by passing vr_pull_timestamp=None.".format(
                    vr_pull_timestamp, outpath)
            )
        df = pd.read_csv(outpath)

    else:
        print_log_message(
            "Pulling a fresh version of VR with timestamp {}".format(
                timestamp)
        )
        # regressions only use detailed code systems
        code_system_ids = MODEL_DATA_CODE_SYSTEMS

        # regressions only use national-level data to avoid biasing the sample
        # toward subnational datasets
        country_loc_map = get_country_loc_id_map(location_hierarchy)

        if small_test:
            year_id = [2010, 2011]
            print("Pulling data for year subset: {}".format(year_id))
        else:
            year_id = range(1980, 2018)

        dfs = []
        for code_system_id in code_system_ids:
            print_log_message("Code system id: {}".format(code_system_id))
            garbage_code_ids = get_package_code_ids(reg_spec, code_system_id)
            target_cause_ids = reg_spec['target_cause_ids']
            df = get_claude_data(
                "disaggregation",
                data_type_id=9, code_system_id=code_system_id,
                is_active=True, year_id=year_id, location_set_id=35,
                exec_function=collapse_to_reg_df,
                exec_function_args=[garbage_code_ids, target_cause_ids, country_loc_map],
                attach_launch_set_id=True
            )
            dfs.append(df)
        df = pd.concat(dfs, ignore_index=True)

        df['vr_pull_timestamp'] = timestamp

        df.to_csv(outpath, index=False)

    return df
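
A self-contained sketch of the "read an existing timestamped pull or build a fresh one" pattern above; the file layout and the build_dataframe callable are illustrative assumptions, not the real get_claude_data pull:

import os
from datetime import datetime

import pandas as pd

def load_or_build(outdir, build_dataframe, timestamp=None):
    """Read a previously saved pull if a timestamp is given, else build and save a new one."""
    if timestamp is not None:
        outpath = os.path.join(outdir, "vr_pull_{}.csv".format(timestamp))
        if not os.path.exists(outpath):
            raise ValueError(
                "Passed timestamp {}, but {} does not exist.".format(timestamp, outpath))
        return pd.read_csv(outpath)
    timestamp = datetime.now().strftime("%Y_%m_%d_%H%M%S")
    df = build_dataframe()
    df["vr_pull_timestamp"] = timestamp
    os.makedirs(outdir, exist_ok=True)
    df.to_csv(os.path.join(outdir, "vr_pull_{}.csv".format(timestamp)), index=False)
    return df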
Example #4
def finalize_formatting(df,
                        source,
                        write=False,
                        code_system_id=None,
                        extract_type=None,
                        conn_def='ADDRESS',
                        is_active=False,
                        refresh_cache=True,
                        check_ages=True):
    """Finalize the formatting of the source and optionally write it out.

    Decides whether to map code_id based on whether code_id is already a
        column in the dataset.

    Needs the following information from either the df values or from the
        nid_meta_vals dict:

            data_type_id
            representative_id

        All of the above must have only one value per nid in df.

    Maps site_id to the data based on incoming 'site' column. Will upload
        any sites that are not in the cod.site table already.

    Arguments:
        df, pandas.DataFrame: The dataframe with near-formatted data
        source, str: The source this df represents (should be the whole
            source and nothing but the source). This will break if there is
            no source in FILEPATH with this name; pass the source without a
            leading underscore even if it appears that way in J.
        write, bool: whether to write the outputs
        extract_type, str: The manner in which the nid was extracted. If
            left as None, it will be inferred from the location_type_id of
            the location_id with the maximum level in the dataset. This should
            be overridden in cases like China DSP, where the same locations
            are used in two extraction types - "DSP + VR" and "DSP"; China DSP
            then gets two extraction types: "admin1" and
            "admin1: DSP sites only". (For DSP in particular, the extract
            type is built into this code. Feel free to add other
            source-extract type mappings here to force consistency.)
        check_ages, bool: Whether or not to enforce age group checks such as
            ensuring no overlaps or gaps. This can be turned off because sometimes
            raw data reports overlapping age groups (e.g. Palestine data has Gaza Strip and West
            Bank data with different age groupings).

    Returns:
        Every local value in the function (i.e. locals()).
        Why? There are multiple df outputs, and formatting is an involved
        process, so it is helpful to see everything sometimes.
    """
    # set column groups, and verify that we have everything we need
    NID_META_COLS = [
        'nid', 'parent_nid', 'extract_type_id', 'source', 'data_type_id',
        'code_system_id', 'is_active', 'is_mort_active'
    ]
    NID_LOCATION_YEAR_COLS = [
        'nid', 'extract_type_id', 'location_id', 'year_id', 'representative_id'
    ]
    FORMATTED_ID_COLS = [
        'nid', 'extract_type_id', 'code_id', 'sex_id', 'site_id', 'year_id',
        'age_group_id', 'location_id'
    ]
    if 'code_id' in df.columns:
        code_col = 'code_id'
        map_code_id = False
    elif 'cause' in df.columns:
        code_col = 'cause'
        map_code_id = True
    else:
        raise AssertionError("Need either 'code_id' or 'cause' in columns")
    INCOMING_EXPECTED_ID_COLS = [
        'nid', 'location_id', 'year_id', 'age_group_id', 'sex_id', code_col,
        'site', 'data_type_id', 'representative_id', 'code_system_id'
    ]
    VALUE_COLS = ['deaths']
    FINAL_FORMATTED_COLS = FORMATTED_ID_COLS + VALUE_COLS

    missing_cols = set(INCOMING_EXPECTED_ID_COLS) - set(df.columns)
    assert len(missing_cols) == 0, \
        "Required formatting columns not found in df: \n{}".format(missing_cols)

    # SET FORMATTING TIMESTAMP
    format_timestamp = cod_timestamp()
    print("Finalizing formatting with timestamp {}".format(format_timestamp))

    # ADD SOURCE
    df['source'] = source

    # MAP OR CHECK CODE ID
    code_system_ids = df.code_system_id.unique()
    if map_code_id:
        cs_dfs = []
        for code_system_id in code_system_ids:
            cs_df = df.loc[df['code_system_id'] == code_system_id].copy()
            # map code_id to the data
            cs_df['value'] = cs_df['cause']
            cs_df = add_code_metadata(cs_df, ['code_id'],
                                      code_system_id=code_system_id,
                                      merge_col='value',
                                      force_rerun=True,
                                      cache_dir='standard')
            report_if_merge_fail(cs_df, ['code_id'], ['value'])
            cs_df = cs_df.drop('value', axis=1)
            cs_dfs.append(cs_df)
        df = pd.concat(cs_dfs, ignore_index=True)
    else:
        # CHECK THAT EVERY CODE_ID IS IN THE ENGINE ROOM AND IN THE CODE SYSTEM
        all_codes_q = """
            SELECT code_id
            FROM engine_room.maps_code
            WHERE code_system_id IN ({})
        """.format(",".join([str(c) for c in code_system_ids]))
        all_codes = ezfuncs.query(all_codes_q, conn_def='ADDRESS')
        bad_codes = set(df.code_id) - set(all_codes.code_id)
        assert len(bad_codes) == 0, "Found code ids in data that can't exist in code "\
                                    "systems {}: {}".format(code_system_ids, bad_codes)
    check_vr_raw_causes(df)

    # MAP SITE ID
    df = map_site_id(df, conn_def=conn_def)
    # MAP EXTRACT TYPE ID
    df = map_extract_type_id(df, source, extract_type, conn_def=conn_def)
    # CHANGE SIX MINOR TERRITORIES TO AGGREGATE UNION LOCATIONS
    df = group_six_minor_territories(df, sum_cols=VALUE_COLS)

    # Drop these location/years because the envelope is less than deaths,
    # which creates a negative cc_code; maybe re-run with another envelope?
    df = df.loc[~((df['nid'] == 279644) & (df['year_id'] == 2011))]
    df = df.loc[~(df['nid'].isin([24143, 107307]))]

    # ENSURE NO NEGATIVES
    for val_col in VALUE_COLS:
        assert (df[val_col] >= 0).all(), \
            "there are negative values in {}".format(val_col)

    ################################################
    # keep all 0s now, messing up for NR in non-VR
    # df['val_sum_tmp'] = df[VALUE_COLS].sum(axis=1)
    # all-cause extractions want to keep zeroes
    # keep_zeroes = df['extract_type_id'] == ALL_CAUSE_EXTRACT_ID
    # otherwise, drop them
    # greater_than_zero = df['val_sum_tmp'] > 0
    # df = df[greater_than_zero | keep_zeroes]
    # df = df.drop('val_sum_tmp', axis=1)
    ################################################

    # CHECKS FOR FORMATTED PHASE OUTPUT
    input_df = df[FINAL_FORMATTED_COLS].copy()
    assert not input_df.isnull().values.any(), "null values in df"
    dupped = input_df[input_df.duplicated()]
    if len(dupped) > 0:
        raise AssertionError("duplicate values in df: \n{}".format(dupped))

    # GROUP IF NECESSARY
    if input_df[FORMATTED_ID_COLS].duplicated().any():
        input_df = input_df.groupby(FORMATTED_ID_COLS,
                                    as_index=False)[VALUE_COLS].sum()

    # TESTS FOR CHECKING AGE GROUP IDS
    if check_ages:
        check_age_groups(df)

    # MORE TESTS FOR DEATHS - MAYBE THAT THEY AREN'T MORE THAN 1.25 THE
    # VALUE IN THE ENVELOPE BY LOCATION AGE YEAR SEX?

    # AND THEN WRITE A TABLE OF COMPARISONS OF DEATHS / ENVELOPE BY LOCATION
    # AGE YEAR SEX FOR REVIEW

    # MAKE NID METADATA TABLE
    if 'parent_nid' not in df.columns:
        df['parent_nid'] = np.nan

    if is_active is True:
        warnings.warn(
            "is_active is deprecated: use the update_nid_metadata_status "
            "function to change the status of finalized datasets"
        )

    # Use existing is_active and is_mort_active values, otherwise default to 0
    nid_map = pull_nid_metadata()
    df = df.merge(nid_map,
                  on=[
                      'nid', 'parent_nid', 'extract_type_id', 'source',
                      'data_type_id', 'code_system_id'
                  ],
                  how='left')

    df_na = df[pd.isnull(df['is_active'])]
    df_na = df_na[['nid', 'extract_type_id']].drop_duplicates()

    if df_na.shape[0] > 0:
        print("New rows for the following NID/extract_type_id will be added "
              "with is_active and is_mort_active = 0:\n{}".format(df_na))

    df['is_active'] = df['is_active'].fillna(0)
    df['is_mort_active'] = df['is_mort_active'].fillna(0)

    # CHECK SUBNATIONAL LOCATIONS
    df = check_subnational_locations(df)

    # OVERRIDE REPRESENTATIVE ID FOR NON-VR
    df = adjust_representative_id(df)

    nid_meta_df = df[NID_META_COLS].drop_duplicates()
    nid_meta_df['last_formatted_timestamp'] = format_timestamp

    # MAKE NID LOCATION YEAR TABLE
    nid_locyears = df[NID_LOCATION_YEAR_COLS].drop_duplicates()
    nid_locyears['last_formatted_timestamp'] = format_timestamp
    # check one iso3 per nid
    nid_locyears = add_location_metadata(nid_locyears, 'ihme_loc_id')
    nid_locyears['iso3'] = nid_locyears['ihme_loc_id'].str.slice(0, 3)
    report_duplicates(
        nid_locyears[['nid', 'extract_type_id', 'iso3']].drop_duplicates(),
        ['nid', 'extract_type_id'])
    nid_locyears = nid_locyears.drop(['ihme_loc_id', 'iso3'], axis=1)

    if write:
        # write nid metadata
        write_to_claude_nid_table(nid_meta_df,
                                  'claude_nid_metadata',
                                  replace=True,
                                  conn_def=conn_def)

        # write nid location-year map
        write_to_claude_nid_table(nid_locyears,
                                  'claude_nid_location_year',
                                  replace=True,
                                  conn_def=conn_def)

        # write to cod.source for new sources
        insert_source_id(source)

        nid_extracts = input_df[['nid', 'extract_type_id'
                                 ]].drop_duplicates().to_records(index=False)
        for nid, extract_type_id in nid_extracts:
            nid = int(nid)
            extract_type_id = int(extract_type_id)
            print("Writing nid {}, extract_type_id {}".format(
                nid, extract_type_id))
            idf = input_df.loc[(input_df['nid'] == nid) & (
                input_df['extract_type_id'] == extract_type_id)].copy()
            phase = 'formatted'
            launch_set_id = format_timestamp
            print("\nTotal deaths: {}".format(idf.deaths.sum()))
            write_phase_output(idf, phase, nid, extract_type_id, launch_set_id)

        # now refresh cache files for nid
        if refresh_cache:
            refresh_claude_nid_cache_files()

    return locals()
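
Because the function ends with "return locals()", callers typically unpack the pieces they need from the returned dictionary. A hedged usage sketch, where the import path and argument values are assumptions for illustration:

# Hypothetical usage sketch -- the import path and arguments are illustrative.
from formatting import finalize_formatting

results = finalize_formatting(df, source="Example_VR_Source", write=False)

# Pull specific outputs back out of the returned locals() dict.
formatted_df = results["input_df"]      # de-duplicated, grouped phase output
nid_meta_df = results["nid_meta_df"]    # one row per nid/extract_type_id
nid_locyears = results["nid_locyears"]  # nid/location/year map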
Example #5
def finalize_formatting(df,
                        source,
                        write=False,
                        code_system_id=None,
                        extract_type=None,
                        conn_def='ADDRESS',
                        is_active=True):

    NID_META_COLS = [
        'nid', 'parent_nid', 'extract_type_id', 'source', 'data_type_id',
        'code_system_id', 'is_active'
    ]
    NID_LOCATION_YEAR_COLS = [
        'nid', 'extract_type_id', 'location_id', 'year_id', 'representative_id'
    ]
    FORMATTED_ID_COLS = [
        'nid', 'extract_type_id', 'code_id', 'sex_id', 'site_id', 'year_id',
        'age_group_id', 'location_id'
    ]
    if 'code_id' in df.columns:
        code_col = 'code_id'
        map_code_id = False
    elif 'cause' in df.columns:
        code_col = 'cause'
        map_code_id = True
    else:
        raise AssertionError("Need either 'code_id' or 'cause' in columns")
    INCOMING_EXPECTED_ID_COLS = [
        'nid', 'location_id', 'year_id', 'age_group_id', 'sex_id', code_col,
        'site', 'data_type_id', 'representative_id', 'code_system_id'
    ]
    VALUE_COLS = ['deaths']
    FINAL_FORMATTED_COLS = FORMATTED_ID_COLS + VALUE_COLS

    missing_cols = set(INCOMING_EXPECTED_ID_COLS) - set(df.columns)
    if len(missing_cols) > 0:
        raise AssertionError(
            "These columns are needed for formatting but not found in df: "
            "{}".format(missing_cols))

    # SET FORMATTING TIMESTAMP
    format_timestamp = cod_timestamp()
    print("Finalizing formatting with timestamp {}".format(format_timestamp))

    # ADD SOURCE
    df['source'] = source

    # MAP OR CHECK CODE ID
    code_system_ids = df.code_system_id.unique()
    if map_code_id:
        cs_dfs = []
        for code_system_id in code_system_ids:
            cs_df = df.loc[df['code_system_id'] == code_system_id].copy()
            # map code_id to the data
            cs_df['value'] = cs_df['cause']
            cs_df = add_code_metadata(cs_df, ['code_id'],
                                      code_system_id=code_system_id,
                                      merge_col='value',
                                      force_rerun=True,
                                      cache_dir='standard')
            print(cs_df.loc[cs_df['code_id'].isnull()].value.unique())
            report_if_merge_fail(cs_df, ['code_id'], ['value'])
            cs_df = cs_df.drop('value', axis=1)
            cs_dfs.append(cs_df)
        df = pd.concat(cs_dfs, ignore_index=True)
    else:
        # ADD TEST TO CHECK THAT EVERY CODE_ID IS IN THE ENGINE ROOM AND IN THE
        # CODE SYSTEM
        all_codes_q = """
            SELECT code_id
            FROM ADDRESS
            WHERE code_system_id IN ({})
        """.format(",".join([str(c) for c in code_system_ids]))
        all_codes = ezfuncs.query(all_codes_q, conn_def='engine')
        bad_codes = set(df.code_id) - set(all_codes.code_id)
        if len(bad_codes) > 0:
            print("Found these code ids in data that can't exist in code "
                  "systems {}: {}".format(code_system_ids, bad_codes))

    # MAP SITE ID
    df = map_site_id(df, conn_def=conn_def)
    # MAP EXTRACT TYPE ID
    df = map_extract_type_id(df, source, extract_type, conn_def=conn_def)

    # CHANGE SIX MINOR TERRITORIES TO AGGREGATE UNION LOCATIONS
    df = group_six_minor_territories(df, sum_cols=VALUE_COLS)

    df = df.loc[~((df['nid'] == 279644) & (df['year_id'] == 2011))]
    df = df.loc[~(df['nid'].isin([24143, 107307]))]

    # ENSURE NO NEGATIVES
    for val_col in VALUE_COLS:
        assert (df[val_col] >= 0).all(), \
            "there are negative values in {}".format(val_col)

    input_df = df[FINAL_FORMATTED_COLS].copy()
    assert not input_df.isnull().values.any(), "null values in df"
    dupped = input_df[input_df.duplicated()]
    if len(dupped) > 0:
        raise AssertionError("duplicate values in df: \n{}".format(dupped))

    # GROUP IF NECESSARY
    if input_df[FORMATTED_ID_COLS].duplicated().any():
        input_df = input_df.groupby(FORMATTED_ID_COLS,
                                    as_index=False)[VALUE_COLS].sum()

    # MAKE NID METADATA TABLE
    if 'parent_nid' not in df.columns:
        df['parent_nid'] = np.nan

    df['is_active'] = 1 * is_active

    # CHECK SUBNATIONAL LOCATIONS
    # alters is_active if needed
    df = check_subnational_locations(df)

    nid_meta_df = df[NID_META_COLS].drop_duplicates()
    nid_meta_df['last_updated_timestamp'] = format_timestamp

    # MAKE NID LOCATION YEAR TABLE
    nid_locyears = df[NID_LOCATION_YEAR_COLS].drop_duplicates()
    nid_locyears['last_updated_timestamp'] = format_timestamp
    # check one iso3 per nid
    nid_locyears = add_location_metadata(nid_locyears, 'ihme_loc_id')
    nid_locyears['iso3'] = nid_locyears['ihme_loc_id'].str.slice(0, 3)
    report_duplicates(
        nid_locyears[['nid', 'extract_type_id', 'iso3']].drop_duplicates(),
        ['nid', 'extract_type_id'])
    nid_locyears = nid_locyears.drop(['ihme_loc_id', 'iso3'], axis=1)

    if write:
        # write nid metadata
        write_to_claude_nid_table(nid_meta_df,
                                  'claude_nid_metadata',
                                  replace=True,
                                  conn_def=conn_def)

        # write nid location-year map
        write_to_claude_nid_table(nid_locyears,
                                  'claude_nid_location_year',
                                  replace=True,
                                  conn_def=conn_def)

        insert_source_id(source)

        nid_extracts = input_df[['nid', 'extract_type_id'
                                 ]].drop_duplicates().to_records(index=False)
        for nid, extract_type_id in nid_extracts:
            nid = int(nid)
            extract_type_id = int(extract_type_id)
            print("Writing nid {}, extract_type_id {}".format(
                nid, extract_type_id))
            idf = input_df.loc[(input_df['nid'] == nid) & (
                input_df['extract_type_id'] == extract_type_id)].copy()
            phase = 'formatted'
            launch_set_id = format_timestamp
            print("\nTotal deaths: {}".format(idf.deaths.sum()))
            write_phase_output(idf, phase, nid, extract_type_id, launch_set_id)

        # now refresh cache files for nid
        print("\nRefreshing claude nid metadata cache files")
        force_cache_options = {
            'force_rerun': True,
            'block_rerun': False,
            'cache_dir': "standard",
            'cache_results': True,
            'verbose': True
        }
        get_nid_metadata(**force_cache_options)
        get_nidlocyear_map(**force_cache_options)

    return locals()
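
Both versions of finalize_formatting collapse exact duplicates of the ID columns by summing deaths before writing. A self-contained sketch of that "GROUP IF NECESSARY" step, with toy values standing in for real data:

import pandas as pd

# Toy stand-ins for FORMATTED_ID_COLS and VALUE_COLS.
id_cols = ["nid", "location_id", "year_id", "sex_id", "age_group_id"]
value_cols = ["deaths"]

df = pd.DataFrame({
    "nid": [1, 1], "location_id": [10, 10], "year_id": [2000, 2000],
    "sex_id": [1, 1], "age_group_id": [8, 8], "deaths": [3.0, 2.0],
})

# Collapse duplicated ID rows by summing the value columns,
# exactly as the duplicated()/groupby block does above.
if df[id_cols].duplicated().any():
    df = df.groupby(id_cols, as_index=False)[value_cols].sum()

print(df)  # one row with deaths == 5.0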