def main(nid, extract_type_id, code_system_id, launch_set_id):

    cause_set_version_id = CONF.get_id('cause_set_version')
    location_set_version_id = CONF.get_id('location_set_version')
    pop_run_id = CONF.get_id('pop_run')
    env_run_id = CONF.get_id('env_run')
    distribution_set_version_id = CONF.get_id('distribution_set_version')

    # download data from input database
    df = get_claude_data('formatted',
                         nid=nid,
                         extract_type_id=extract_type_id,
                         location_set_version_id=location_set_version_id)

    assert len(df) != 0, ("Dataframe is empty."
                          " Are you sure this source is in"
                          "the input database?")
    # run the pipeline
    df = run_pipeline(nid,
                      extract_type_id,
                      launch_set_id,
                      df,
                      code_system_id,
                      cause_set_version_id,
                      location_set_version_id,
                      pop_run_id,
                      env_run_id,
                      distribution_set_version_id,
                      diagnostic=False)
    # upload to database
    write_phase_output(df, 'disaggregation', nid, extract_type_id,
                       launch_set_id)
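
A minimal sketch of how a phase script like the one above might be launched from the command line; the positional sys.argv parsing and int casts below are assumptions for illustration, not part of the original source.

if __name__ == "__main__":
    import sys

    # hypothetical CLI wrapper: argument order is assumed
    nid = int(sys.argv[1])
    extract_type_id = int(sys.argv[2])
    code_system_id = int(sys.argv[3])
    launch_set_id = int(sys.argv[4])
    main(nid, extract_type_id, code_system_id, launch_set_id)
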
def main(nid, extract_type_id, remove_decimal, code_map_version_id,
         launch_set_id):
    print_log_message("Reading corrections data")
    df = get_phase_output('corrections',
                          nid=nid,
                          extract_type_id=extract_type_id)

    data_type_id = get_value_from_nid(nid,
                                      'data_type_id',
                                      extract_type_id=extract_type_id)

    cause_set_version_id = CONF.get_id('cause_set_version')
    location_set_version_id = CONF.get_id('location_set_version')
    env_run_id = CONF.get_id('env_run')

    source = get_value_from_nid(nid, 'source', extract_type_id=extract_type_id)

    # run the pipeline
    df = run_phase(df, cause_set_version_id, location_set_version_id,
                   data_type_id, env_run_id, source, nid, extract_type_id,
                   remove_decimal, code_map_version_id)

    # upload to database
    print_log_message(
        "Writing {n} rows of output for launch set {ls}, nid {nid}, extract "
        "{e}".format(n=len(df), ls=launch_set_id, e=extract_type_id, nid=nid))
    df = intify_cols(df)
    write_phase_output(df, 'aggregation', nid, extract_type_id, launch_set_id)
def main(nid, extract_type_id, csvid, lsvid, pop_run_id, cmvid, remove_decimal,
         launch_set_id):

    df = get_phase_output("misdiagnosiscorrection", nid, extract_type_id)
    if has_garbage(df):
        print_log_message("Running redistribution")
        # run the pipeline
        df = run_phase(df, csvid, nid, extract_type_id, lsvid, pop_run_id,
                       cmvid, launch_set_id, remove_decimal)
    else:
        print_log_message("No redistribution to do.")
        # collapse code id
        val_cols = ['deaths']
        group_cols = list(set(df.columns) - set(['code_id'] + val_cols))
        df = df.groupby(group_cols, as_index=False)[val_cols].sum()

    write_phase_output(df, 'redistribution', nid, extract_type_id,
                       launch_set_id)
def main(nid, extract_type_id, csvid, lsvid, pop_run_id, cmvid, remove_decimal,
         launch_set_id):
    """Download data, run phase, and output result."""
    # download data from input database
    df = get_phase_output("misdiagnosiscorrection", nid, extract_type_id)
    # only run redistribution if the data contain garbage-coded deaths
    if has_garbage(df):
        print_log_message("Running redistribution")
        # run the pipeline
        df = run_phase(df, csvid, nid, extract_type_id, lsvid, pop_run_id,
                       cmvid, launch_set_id, remove_decimal)
    else:
        print_log_message("No redistribution to do.")
        # collapse code id
        val_cols = ['deaths']
        group_cols = list(set(df.columns) - set(['code_id'] + val_cols))
        df = df.groupby(group_cols, as_index=False)[val_cols].sum()

    # write it out
    write_phase_output(df, 'redistribution', nid, extract_type_id,
                       launch_set_id)
def main(nid, extract_type_id, launch_set_id):
    """Read the data, run the phase, write the output."""
    print_log_message("Reading {} data".format(PHASE_ANTECEDENT))
    df = get_claude_data(PHASE_ANTECEDENT,
                         nid=nid,
                         extract_type_id=extract_type_id)

    env_run_id = int(CONF.get_id('env_run'))
    pop_run_id = int(CONF.get_id('pop_run'))
    location_set_version_id = int(CONF.get_id('location_set_version'))
    cause_set_version_id = int(CONF.get_id('cause_set_version'))

    df = df.rename(columns={'cf': 'cf_final'})

    df = run_phase(df, nid, extract_type_id, env_run_id, pop_run_id,
                   location_set_version_id, cause_set_version_id)

    print_log_message(
        "Writing {n} rows of output for launch set {ls}, nid {nid}, extract "
        "{e}".format(n=len(df), ls=launch_set_id, nid=nid, e=extract_type_id))
    write_phase_output(df, PHASE_NAME, nid, extract_type_id, launch_set_id)
def main(nid, extract_type_id, code_system_id, code_map_version_id,
         launch_set_id):

    cause_set_version_id = CONF.get_id('cause_set_version')
    location_set_version_id = CONF.get_id('location_set_version')
    pop_run_id = CONF.get_id('pop_run')
    env_run_id = CONF.get_id('env_run')

    # need to use special age/sex distribution for Norway based on National data
    if get_value_from_nid(nid, 'iso3',
                          extract_type_id=extract_type_id) == 'NOR':
        distribution_set_version_id = CONF.get_id(
            'NOR_distribution_set_version')
    else:
        distribution_set_version_id = CONF.get_id('distribution_set_version')

    # download data from input database
    df = get_claude_data('formatted',
                         nid=nid,
                         extract_type_id=extract_type_id,
                         location_set_version_id=location_set_version_id)

    assert len(df) != 0, ("Dataframe is empty."
                          " Are you sure this source is in"
                          "the input database?")
    # run the pipeline
    df = run_pipeline(nid,
                      extract_type_id,
                      launch_set_id,
                      df,
                      code_system_id,
                      code_map_version_id,
                      cause_set_version_id,
                      location_set_version_id,
                      pop_run_id,
                      env_run_id,
                      distribution_set_version_id,
                      diagnostic=False)
    # upload to database
    write_phase_output(df, 'disaggregation', nid, extract_type_id,
                       launch_set_id)
def run_pipeline(nid,
                 extract_type_id,
                 launch_set_id,
                 df,
                 code_system_id,
                 cause_set_version_id,
                 location_set_version_id,
                 pop_run_id,
                 env_run_id,
                 distribution_set_version_id,
                 diagnostic=False):
    """Run the full pipeline"""

    cache_options = {
        'force_rerun': False,
        'block_rerun': True,
        'cache_results': False,
        'cache_dir': CONF.get_directory('FILEPATH'),
        'verbose': False
    }

    location_meta_df = get_current_location_hierarchy(
        location_set_version_id=location_set_version_id, **cache_options)

    code_map = get_cause_map(code_system_id=code_system_id, **cache_options)

    source = get_value_from_nid(nid, "source", extract_type_id)

    print("Overriding causes when necessary")
    df = overrides(df, location_meta_df)

    print("Dropping data out of scope")
    df = drop_data_out_of_scope(df, location_meta_df, source)
    if len(df) > 0:
        # make sure six minor territories are grouped correctly
        assert_no_six_minor_territories(df)

        # run mapping
        print("\nDeaths before MAPPING: {}".format(df.deaths.sum()))
        Mapper = GBDCauseMapper(cause_set_version_id, code_map)
        df = Mapper.get_computed_dataframe(df, code_system_id)
        if diagnostic:
            write_phase_output(df,
                               'mapping',
                               nid,
                               extract_type_id,
                               launch_set_id,
                               sub_dirs='diagnostic')

        print("\nDeaths before AGESEXSPLIT: {}".format(df.deaths.sum()))
        # run age sex splitting
        MySplitter = AgeSexSplitter(cause_set_version_id,
                                    pop_run_id,
                                    distribution_set_version_id,
                                    verbose=True,
                                    collect_diagnostics=False)

        df = MySplitter.get_computed_dataframe(df, location_meta_df)
        if diagnostic:
            diag_df = MySplitter.get_diagnostic_dataframe()
            write_phase_output(diag_df,
                               'agesexsplit',
                               nid,
                               extract_type_id,
                               launch_set_id,
                               sub_dirs='diagnostic')

        print("\nDeaths before CORRECTIONS: {}".format(df.deaths.sum()))
        # run restrictions corrections
        Corrector = RestrictionsCorrector(code_system_id,
                                          cause_set_version_id,
                                          collect_diagnostics=False,
                                          verbose=True)
        df = Corrector.get_computed_dataframe(df)

        # calculate cc_code for some sources
        if source in ['Iran_maternal_surveillance', 'Iran_forensic']:
            env_meta_df = get_env(env_run_id=env_run_id, **cache_options)
            df = calculate_cc_code(df, env_meta_df, code_map)
            print("\nDeaths after adding cc_code: {}".format(df.deaths.sum()))

        # adjust deaths for New Zealand by maori/non-maori ethnicities
        if source in ["NZL_MOH_ICD9", "NZL_MOH_ICD10"]:
            df = correct_maori_non_maori_deaths(df)
            print("\nDeaths after Maori/non-Maori adjustment: {}".format(
                df.deaths.sum()))

        print("\nDeaths at END: {}".format(df.deaths.sum()))

    return df
def finalize_formatting(df,
                        source,
                        write=False,
                        code_system_id=None,
                        extract_type=None,
                        conn_def='ADDRESS',
                        is_active=False,
                        refresh_cache=True,
                        check_ages=True):
    """Finalize the formatting of the source and optionally write it out.

    Decides whether to map code_id based on whether code_id is already a
        column in the dataset.

    Needs the following information from either the df values or from the
        nid_meta_vals dict:

            data_type_id
            representative_id

        All of the above must have only one value per nid in df.

    Maps site_id to the data based on incoming 'site' column. Will upload
        any sites that are not in the cod.site table already.

    Arguments:
        df, pandas.DataFrame: The dataframe with near-formatted data
        source, str: The source this df belongs to (should be the whole
            source and nothing but the source). Will break if there is no
            source in FILEPATH with this name; pass the source without a
            leading underscore even if it appears that way in J
        write, bool: whether to write the outputs
        extract_type, str: The manner in which the nid was extracted. If
            left as None, will be induced by the location_type_id of
            the location_id with the maximum level in the dataset. This should
            be overridden in cases like China DSP, where the same locations
            are used in two extraction types - "DSP + VR" and "DSP"; China DSP
            then gets two extraction types: "admin1" and
            "admin1: DSP sites only" (in the particular instance of DSP,
            extract type is built into this code. Feel free to add other
            source-extract type mappings here to force consistency.)
        check_ages, bool: Whether or not to enforce age group checks such as
            ensuring no overlaps or gaps. This can be turned off because sometimes
            raw data reports overlapping age groups (e.g. Palestine data has Gaza Strip and West
            Bank data with different age groupings).

    Returns:
        Every local variable in the function, via locals().
        Why? There are multiple df outputs, and formatting is an involved
        process, so it is helpful to be able to inspect everything. (See the
        usage sketch after this function.)
    """
    # set column groups, and verify that we have everything we need
    NID_META_COLS = [
        'nid', 'parent_nid', 'extract_type_id', 'source', 'data_type_id',
        'code_system_id', 'is_active', 'is_mort_active'
    ]
    NID_LOCATION_YEAR_COLS = [
        'nid', 'extract_type_id', 'location_id', 'year_id', 'representative_id'
    ]
    FORMATTED_ID_COLS = [
        'nid', 'extract_type_id', 'code_id', 'sex_id', 'site_id', 'year_id',
        'age_group_id', 'location_id'
    ]
    if 'code_id' in df.columns:
        code_col = 'code_id'
        map_code_id = False
    elif 'cause' in df.columns:
        code_col = 'cause'
        map_code_id = True
    else:
        raise AssertionError("Need either 'code_id' or 'cause' in columns")
    INCOMING_EXPECTED_ID_COLS = [
        'nid', 'location_id', 'year_id', 'age_group_id', 'sex_id', code_col,
        'site', 'data_type_id', 'representative_id', 'code_system_id'
    ]
    VALUE_COLS = ['deaths']
    FINAL_FORMATED_COLS = FORMATTED_ID_COLS + VALUE_COLS

    missing_cols = set(INCOMING_EXPECTED_ID_COLS) - set(df.columns)
    assert len(missing_cols) == 0, \
        "Required formatting columns not found in df: \n{}".format(missing_cols)

    # SET FORMATTING TIMESTAMP
    format_timestamp = cod_timestamp()
    print("Finalizing formatting with timestamp {}".format(format_timestamp))

    # ADD SOURCE
    df['source'] = source

    # MAP OR CHECK CODE ID
    code_system_ids = df.code_system_id.unique()
    if map_code_id:
        cs_dfs = []
        for code_system_id in code_system_ids:
            cs_df = df.loc[df['code_system_id'] == code_system_id].copy()
            # map code_id to the data
            cs_df['value'] = cs_df['cause']
            cs_df = add_code_metadata(cs_df, ['code_id'],
                                      code_system_id=code_system_id,
                                      merge_col='value',
                                      force_rerun=True,
                                      cache_dir='standard')
            report_if_merge_fail(cs_df, ['code_id'], ['value'])
            cs_df = cs_df.drop('value', axis=1)
            cs_dfs.append(cs_df)
        df = pd.concat(cs_dfs, ignore_index=True)
    else:
        # CHECK THAT EVERY CODE_ID IS IN THE ENGINE ROOM AND IN THE CODE SYSTEM
        all_codes_q = """
            SELECT code_id
            FROM engine_room.maps_code
            WHERE code_system_id IN ({})
        """.format(",".join([str(c) for c in code_system_ids]))
        all_codes = ezfuncs.query(all_codes_q, conn_def='ADDRESS')
        bad_codes = set(df.code_id) - set(all_codes.code_id)
        assert len(bad_codes) == 0, "Found code ids in data that can't exist in code "\
                                    "systems {}: {}".format(code_system_ids, bad_codes)
    check_vr_raw_causes(df)

    # MAP SITE ID
    df = map_site_id(df, conn_def=conn_def)
    # MAP EXTRACT TYPE ID
    df = map_extract_type_id(df, source, extract_type, conn_def=conn_def)
    # CHANGE SIX MINOR TERRITORIES TO AGGREGATE UNION LOCATIONS
    df = group_six_minor_territories(df, sum_cols=VALUE_COLS)

    # drop these location/years because envelope < deaths creates a negative
    # cc_code; consider re-running with another envelope
    df = df.loc[~((df['nid'] == 279644) & (df['year_id'] == 2011))]
    df = df.loc[~(df['nid'].isin([24143, 107307]))]

    # ENSURE NO NEGATIVES
    for val_col in VALUE_COLS:
        assert (df[val_col] >= 0).all(), \
            "there are negative values in {}".format(val_col)

    ################################################
    # keep all 0s now, messing up for NR in non-VR
    # df['val_sum_tmp'] = df[VALUE_COLS].sum(axis=1)
    # all-cause extractions want to keep zeroes
    # keep_zeroes = df['extract_type_id'] == ALL_CAUSE_EXTRACT_ID
    # otherwise, drop them
    # greater_than_zero = df['val_sum_tmp'] > 0
    # df = df[greater_than_zero | keep_zeroes]
    # df = df.drop('val_sum_tmp', axis=1)
    ################################################

    # CHECKS FOR FORMATTED PHASE OUTPUT
    input_df = df[FINAL_FORMATED_COLS].copy()
    assert not input_df.isnull().values.any(), "null values in df"
    dupped = input_df[input_df.duplicated()]
    if len(dupped) > 0:
        raise AssertionError("duplicate values in df: \n{}".format(dupped))

    # GROUP IF NECESSARY
    if input_df[FORMATTED_ID_COLS].duplicated().any():
        input_df = input_df.groupby(FORMATTED_ID_COLS,
                                    as_index=False)[VALUE_COLS].sum()

    # TESTS FOR CHECKING AGE GROUP IDS
    if check_ages:
        check_age_groups(df)

    # MORE TESTS FOR DEATHS - MAYBE THAT THEY AREN'T MORE THAN 1.25 THE
    # VALUE IN THE ENVELOPE BY LOCATION AGE YEAR SEX?

    # AND THEN WRITE A TABLE OF COMPARISONS OF DEATHS / ENVELOPE BY LOCATION
    # AGE YEAR SEX FOR REVIEW

    # MAKE NID METADATA TABLE
    if 'parent_nid' not in df.columns:
        df['parent_nid'] = np.nan

    if is_active is True:
        warnings.warn(
            "is_active is deprecated: use the update_nid_metadata_status "
            "function to change the status of finalized datasets"
        )

    # Use existing is_active and is_mort_active values, otherwise default to 0
    nid_map = pull_nid_metadata()
    df = df.merge(nid_map,
                  on=[
                      'nid', 'parent_nid', 'extract_type_id', 'source',
                      'data_type_id', 'code_system_id'
                  ],
                  how='left')

    df_na = df[pd.isnull(df['is_active'])]
    df_na = df_na[['nid', 'extract_type_id']].drop_duplicates()

    if df_na.shape[0] > 0:
        print("New rows for the following NID/extract_type_id will be added "
              "with is_active and is_mort_active = 0:\n{}".format(df_na))

    df['is_active'] = df['is_active'].fillna(0)
    df['is_mort_active'] = df['is_mort_active'].fillna(0)

    # CHECK SUBNATIONAL LOCATIONS
    df = check_subnational_locations(df)

    # OVERRIDE REPRESENTATIVE ID FOR NON-VR
    df = adjust_representative_id(df)

    nid_meta_df = df[NID_META_COLS].drop_duplicates()
    nid_meta_df['last_formatted_timestamp'] = format_timestamp

    # MAKE NID LOCATION YEAR TABLE
    nid_locyears = df[NID_LOCATION_YEAR_COLS].drop_duplicates()
    nid_locyears['last_formatted_timestamp'] = format_timestamp
    # check one iso3 per nid
    nid_locyears = add_location_metadata(nid_locyears, 'ihme_loc_id')
    nid_locyears['iso3'] = nid_locyears['ihme_loc_id'].str.slice(0, 3)
    report_duplicates(
        nid_locyears[['nid', 'extract_type_id', 'iso3']].drop_duplicates(),
        ['nid', 'extract_type_id'])
    nid_locyears = nid_locyears.drop(['ihme_loc_id', 'iso3'], axis=1)

    if write:
        # write nid metadata
        write_to_claude_nid_table(nid_meta_df,
                                  'claude_nid_metadata',
                                  replace=True,
                                  conn_def=conn_def)

        # write nid location-year map
        write_to_claude_nid_table(nid_locyears,
                                  'claude_nid_location_year',
                                  replace=True,
                                  conn_def=conn_def)

        # write to cod.source for new sources
        insert_source_id(source)

        nid_extracts = input_df[['nid', 'extract_type_id'
                                 ]].drop_duplicates().to_records(index=False)
        for nid, extract_type_id in nid_extracts:
            nid = int(nid)
            extract_type_id = int(extract_type_id)
            print("Writing nid {}, extract_type_id {}".format(
                nid, extract_type_id))
            idf = input_df.loc[(input_df['nid'] == nid) & (
                input_df['extract_type_id'] == extract_type_id)].copy()
            phase = 'formatted'
            launch_set_id = format_timestamp
            print("\nTotal deaths: {}".format(idf.deaths.sum()))
            write_phase_output(idf, phase, nid, extract_type_id, launch_set_id)

        # now refresh cache files for nid
        if refresh_cache:
            refresh_claude_nid_cache_files()

    return locals()
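
A hedged usage sketch for the function above: since it returns locals(), a caller can pull out whichever intermediate frames it needs. The source name "EXAMPLE_SOURCE" and the pre-built df are placeholders, not taken from the original source.

# df is assumed to be a near-formatted dataframe for a single source
local_vals = finalize_formatting(df, "EXAMPLE_SOURCE", write=False)
formatted_df = local_vals['input_df']      # checked, grouped phase output
nid_meta_df = local_vals['nid_meta_df']    # NID metadata table
nid_locyears = local_vals['nid_locyears']  # NID location-year map
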
def finalize_formatting(df,
                        source,
                        write=False,
                        code_system_id=None,
                        extract_type=None,
                        conn_def='ADDRESS',
                        is_active=True):

    NID_META_COLS = [
        'nid', 'parent_nid', 'extract_type_id', 'source', 'data_type_id',
        'code_system_id', 'is_active'
    ]
    NID_LOCATION_YEAR_COLS = [
        'nid', 'extract_type_id', 'location_id', 'year_id', 'representative_id'
    ]
    FORMATTED_ID_COLS = [
        'nid', 'extract_type_id', 'code_id', 'sex_id', 'site_id', 'year_id',
        'age_group_id', 'location_id'
    ]
    if 'code_id' in df.columns:
        code_col = 'code_id'
        map_code_id = False
    elif 'cause' in df.columns:
        code_col = 'cause'
        map_code_id = True
    else:
        raise AssertionError("Need either 'code_id' or 'cause' in columns")
    INCOMING_EXPECTED_ID_COLS = [
        'nid', 'location_id', 'year_id', 'age_group_id', 'sex_id', code_col,
        'site', 'data_type_id', 'representative_id', 'code_system_id'
    ]
    VALUE_COLS = ['deaths']
    FINAL_FORMATED_COLS = FORMATTED_ID_COLS + VALUE_COLS

    missing_cols = set(INCOMING_EXPECTED_ID_COLS) - set(df.columns)
    if len(missing_cols) > 0:
        raise AssertionError(
            "These columns are needed for formatting but not found in df:"
            "\n{}".format(missing_cols))

    # SET FORMATTING TIMESTAMP
    format_timestamp = cod_timestamp()
    print("Finalizing formatting with timestamp {}".format(format_timestamp))

    # ADD SOURCE
    df['source'] = source

    # MAP OR CHECK CODE ID
    code_system_ids = df.code_system_id.unique()
    if map_code_id:
        cs_dfs = []
        for code_system_id in code_system_ids:
            cs_df = df.loc[df['code_system_id'] == code_system_id].copy()
            # map code_id to the data
            cs_df['value'] = cs_df['cause']
            cs_df = add_code_metadata(cs_df, ['code_id'],
                                      code_system_id=code_system_id,
                                      merge_col='value',
                                      force_rerun=True,
                                      cache_dir='standard')
            print(cs_df.loc[cs_df['code_id'].isnull()].value.unique())
            report_if_merge_fail(cs_df, ['code_id'], ['value'])
            cs_df = cs_df.drop('value', axis=1)
            cs_dfs.append(cs_df)
        df = pd.concat(cs_dfs, ignore_index=True)
    else:
        # ADD TEST TO CHECK THAT EVERY CODE_ID IS IN THE ENGINE ROOM AND IN THE
        # CODE SYSTEM
        all_codes_q = """
            SELECT code_id
            FROM ADDRESS
            WHERE code_system_id IN ({})
        """.format(",".join([str(c) for c in code_system_ids]))
        all_codes = ezfuncs.query(all_codes_q, conn_def='engine')
        bad_codes = set(df.code_id) - set(all_codes.code_id)
        if len(bad_codes) > 0:
            print("Found these code ids in data that can't exist in code "
                  "systems {}: {}".format(code_system_ids, bad_codes))

    # MAP SITE ID
    df = map_site_id(df, conn_def=conn_def)
    # MAP EXTRACT TYPE ID
    df = map_extract_type_id(df, source, extract_type, conn_def=conn_def)

    # CHANGE SIX MINOR TERRITORIES TO AGGREGATE UNION LOCATIONS
    df = group_six_minor_territories(df, sum_cols=VALUE_COLS)

    df = df.loc[~((df['nid'] == 279644) & (df['year_id'] == 2011))]
    df = df.loc[~(df['nid'].isin([24143, 107307]))]

    # ENSURE NO NEGATIVES
    for val_col in VALUE_COLS:
        assert (df[val_col] >= 0).all(), \
            "there are negative values in {}".format(val_col)

    input_df = df[FINAL_FORMATED_COLS].copy()
    assert not input_df.isnull().values.any(), "null values in df"
    dupped = input_df[input_df.duplicated()]
    if len(dupped) > 0:
        raise AssertionError("duplicate values in df: \n{}".format(dupped))

    # GROUP IF NECESSARY
    if input_df[FORMATTED_ID_COLS].duplicated().any():
        input_df = input_df.groupby(FORMATTED_ID_COLS,
                                    as_index=False)[VALUE_COLS].sum()

    # MAKE NID METADATA TABLE
    if 'parent_nid' not in df.columns:
        df['parent_nid'] = np.nan

    df['is_active'] = 1 * is_active

    # CHECK SUBNATIONAL LOCATIONS
    # alters is_active if needed
    df = check_subnational_locations(df)

    nid_meta_df = df[NID_META_COLS].drop_duplicates()
    nid_meta_df['last_updated_timestamp'] = format_timestamp

    # MAKE NID LOCATION YEAR TABLE
    nid_locyears = df[NID_LOCATION_YEAR_COLS].drop_duplicates()
    nid_locyears['last_updated_timestamp'] = format_timestamp
    # check one iso3 per nid
    nid_locyears = add_location_metadata(nid_locyears, 'ihme_loc_id')
    nid_locyears['iso3'] = nid_locyears['ihme_loc_id'].str.slice(0, 3)
    report_duplicates(
        nid_locyears[['nid', 'extract_type_id', 'iso3']].drop_duplicates(),
        ['nid', 'extract_type_id'])
    nid_locyears = nid_locyears.drop(['ihme_loc_id', 'iso3'], axis=1)

    if write:
        # write nid metadata
        write_to_claude_nid_table(nid_meta_df,
                                  'claude_nid_metadata',
                                  replace=True,
                                  conn_def=conn_def)

        # write nid location-year map
        write_to_claude_nid_table(nid_locyears,
                                  'claude_nid_location_year',
                                  replace=True,
                                  conn_def=conn_def)

        insert_source_id(source)

        nid_extracts = input_df[['nid', 'extract_type_id'
                                 ]].drop_duplicates().to_records(index=False)
        for nid, extract_type_id in nid_extracts:
            nid = int(nid)
            extract_type_id = int(extract_type_id)
            print("Writing nid {}, extract_type_id {}".format(
                nid, extract_type_id))
            idf = input_df.loc[(input_df['nid'] == nid) & (
                input_df['extract_type_id'] == extract_type_id)].copy()
            phase = 'formatted'
            launch_set_id = format_timestamp
            print("\nTotal deaths: {}".format(idf.deaths.sum()))
            write_phase_output(idf, phase, nid, extract_type_id, launch_set_id)

        # now refresh cache files for nid
        print("\nRefreshing claude nid metadata cache files")
        force_cache_options = {
            'force_rerun': True,
            'block_rerun': False,
            'cache_dir': "standard",
            'cache_results': True,
            'verbose': True
        }
        get_nid_metadata(**force_cache_options)
        get_nidlocyear_map(**force_cache_options)

    return locals()