def main(dataset_id, data_type_id):
    ''' Maps data with the following steps:
        1) imports input data
        2) runs mapping function
        3) expands icd codes to fill garbage acause(n) where necessary
        4) applies sex and cause restrictions
    '''
    # prep_type_id 3 = mapping
    this_dataset = md.MI_Dataset(dataset_id, 3, data_type_id)
    input_data = this_dataset.load_input().reset_index(drop=True)
    # where the detailed im_frmat is missing but frmat is the "unknown"
    # format 9, mark the im_frmat as unknown as well
    input_data.loc[input_data['im_frmat_id'].isnull() &
                   input_data['frmat_id'].eq(9), 'im_frmat_id'] = 9
    df = md.stdz_col_formats(input_data)
    # Ensure that there is no "all age" data. Remove this line after updating
    #   the preceding steps to stop using the old cancer age formats
    df = df.loc[df['age'] != 1, :]
    # Map data and apply restrictions
    mapped_df = map_data(df, this_dataset)
    restricted_df = cm.restrict_causes(mapped_df,
                                       cause_col='gbd_cause',
                                       data_type_id=data_type_id,
                                       restrict_age=False)
    md.complete_prep_step(restricted_df, this_dataset)
    print("mapping complete.\n")
def split_unknown_age(dataset, wgt_df, uid_cols, metric):
    ''' Redistributes unknown-age data (age == 26) across the known age
        groups in proportion to the supplied weights.
        --- Inputs ---
        dataset : DataFrame
            pandas dataframe containing an unknown-age group
        wgt_df : DataFrame
            must contain a 'wgt' column
        metric : string
            possible values: ['pop', 'cases', 'deaths']
        Returns the dataset with unknown-age totals folded into the known
        ages and frmat_id set to 131.
    '''
    assert 'wgt' in wgt_df.columns, "Error: no wgt column sent in wgt_df"
    # Ensure that 'age' is not listed as a uid
    uids_noAge = [u for u in uid_cols if 'age' != u]
    uid_cols = uids_noAge + ['age']
    dataset = add_missing_ages(dataset, uids_noAge, metric)
    # Split unknown age
    unknown_age = dataset.loc[dataset['age'] == 26,
                              uids_noAge + [metric]].copy()
    unknown_age.rename(columns={metric: 'unknown_age_data'}, inplace=True)
    known_age = dataset.loc[dataset['age'] != 26, :].copy()
    # standardize columns to enable merging
    wgt_df = md.stdz_col_formats(wgt_df)
    wgt_df_uids = [u for u in wgt_df.columns if u not in ('age', 'wgt')]
    wgt_df = add_missing_ages(wgt_df, wgt_df_uids, 'wgt')
    # zero weights cannot contribute to proportions
    wgt_df = wgt_df.loc[~(wgt_df['wgt'].eq(0))]
    known_age['dataset_id'] = 0  # temporary fill so stdz_col_formats() works
    known_age = md.stdz_col_formats(known_age)
    with_weights = known_age.merge(wgt_df, how='left')
    astest.test_weights(known_age, with_weights)
    prop_df = pp.add_proportions(with_weights, uids_noAge)
    # added for more standardized merge
    prop_df = md.stdz_col_formats(
        prop_df, additional_float_stubs=['proportion', 'wgt', 'wgt_tot'])
    unknown_age = md.stdz_col_formats(unknown_age)
    to_distribute = prop_df.merge(unknown_age, how="left", indicator=True)
    # direct assignment instead of chained .loc[...].fillna(inplace=True),
    # which operates on an intermediate object and may silently do nothing
    to_distribute['unknown_age_data'] = \
        to_distribute['unknown_age_data'].fillna(0).astype(float)
    to_distribute['orig_data'] = to_distribute[metric].copy()
    to_distribute['proportion'] = \
        to_distribute['proportion'].astype(float).fillna(0)
    to_distribute.loc[:, metric] += to_distribute['unknown_age_data'].multiply(
        to_distribute['proportion'])
    # .copy() so the frmat_id assignment below does not hit a slice view
    # (SettingWithCopy)
    output = to_distribute[uid_cols + [metric]].copy()
    output.loc[:, 'frmat_id'] = 131
    pt.verify_metric_total(dataset, output, metric, "split unknown age")
    return(output)
def add_cause_rates(dataset, uid_cols, metric):
    ''' Returns expected values used for splitting data in the cancer
        pipeline. Uses two merges: one to detect causes lacking a rate
        (those fall back to the "average_cancer" rate) and one to attach
        the final rate values.
        --- Inputs ---
        dataset : DataFrame
            pandas dataframe
    '''
    print(" loading cause rates...")
    # Make acause long and remove blank acause entries
    merge_df = dataset.copy()
    # Save original acause for later reversion
    merge_df.loc[:, 'orig_acause'] = merge_df['acause']
    # Load maps
    acause_rates = pp.load_cause_rates(3)  # 3 for deaths
    # Merge with rates. If rate is not available, use "average_cancer" rate
    cause_df = md.stdz_col_formats(merge_df)
    acause_rates = md.stdz_col_formats(acause_rates)
    rate_df = cause_df.merge(acause_rates, how='left',
                             on=["age", "acause", "sex_id"], indicator=True)
    # left_only rows had no rate for their acause/sex/age combination
    not_mapped_to_rate = (rate_df['_merge'].isin(['left_only']))
    rate_df.loc[not_mapped_to_rate, 'acause'] = "average_cancer"
    # drop the first merge's rate column, then re-merge so the fallback
    # "average_cancer" rows pick up the average rate
    rate_df.drop(['_merge', 'rate'], axis=1, inplace=True)
    rate_df = rate_df.merge(acause_rates, on=['acause', 'sex_id', 'age'],
                            how='left')
    # set rates to 0 for age 2 (0-4 ages)
    rate_df.loc[rate_df['age'].isin([2]), 'rate'] = 0
    # Set rates to 0 for non-prepped ages
    rate_df.loc[~rate_df['age'].isin(acause_rates['age'].unique()), 'rate'] = 0
    # Reset acause to the input acause
    rate_df.loc[:, 'acause'] = rate_df['orig_acause']
    # Validate that values exist for all rate entries
    rate_df = rate_df[get_uid_columns() + ['rate', 'deaths']]
    rate_df = md.stdz_col_formats(rate_df)
    assert not rate_df['rate'].isnull().any(), \
        "Error in merge. Some observations do not have rates"
    return (rate_df)
def reformat_input(df, ds_instance):
    ''' Collapse and reshape input data from standardize_format output.
        Returns a long-format dataframe with one row per uid/age, metric
        NaNs filled with 0, and a 'uniqid' group id column added.
    '''
    metric_name = ds_instance.metric
    uid_cols = md.get_uid_cols(2)
    wide_uid_cols = [u for u in uid_cols if 'age' not in u]
    uids_noCause = [u for u in uid_cols if 'cause' not in u]
    # where the detailed im_frmat is missing but frmat is the "unknown"
    # format 9, mark the im_frmat as unknown as well
    df.loc[df['im_frmat_id'].isnull() & df['frmat_id'].isin([9]),
           'im_frmat_id'] = 9
    df = md.stdz_col_formats(df)
    df = dft.collapse(df, by_cols=wide_uid_cols, func='sum', stub=metric_name)
    df = dft.wide_to_long(df, stubnames=metric_name,
                          i=wide_uid_cols, j='age')
    df = df.groupby(uid_cols, as_index=False)[metric_name].sum()
    # direct assignment instead of Series.fillna(inplace=True) on a column
    # selection (chained-assignment pattern deprecated by pandas)
    df[metric_name] = df[metric_name].fillna(0)
    df = md.stdz_col_formats(df)
    df = dft.make_group_id_col(df, uids_noCause, id_col='uniqid')
    return(df)
def create_metric_weights(df, uid_cols, metric):
    ''' Returns a dataframe of expected values ("weights") used for splitting
        neonatal data in the cancer pipeline.
        --- Inputs ---
        df : DataFrame
            pandas dataframe; must contain a 'pop' column
    '''
    rate_df = add_cause_rates(df, uid_cols, metric)
    df = md.stdz_col_formats(df)
    # make weights (weights = expected number of events = rate*pop)
    # NOTE(review): the merge keys include 'deaths' in addition to the uid
    # columns -- presumably to align rate rows with their exact source
    # observations; confirm this is intentional
    weight_df = pd.merge(df, rate_df[get_uid_columns() + ['rate', 'deaths']],
                         on=get_uid_columns() + ['deaths'])
    weight_df.loc[:, 'wgt'] = weight_df['rate'] * weight_df['pop']
    weight_df = md.stdz_col_formats(weight_df,
                                    additional_float_stubs=['wgt', 'rate'])
    return (weight_df)
def split_neo_age(dataset, wgt_df, uid_cols, metric):
    ''' Redistributes aggregate 0-4 age data (age == 2) across the finer
        young-age groups in proportion to the supplied weights.
        --- Inputs ---
        dataset : DataFrame
            pandas dataframe
        wgt_df : DataFrame
            must contain a 'wgt' column
        metric : string
            possible values: ['pop', 'cases', 'deaths']
    '''
    assert 'wgt' in wgt_df.columns, "Error: no wgt column sent in wgt_df"
    # Ensure that 'age' is not listed as a uid
    uids_noAge = [u for u in uid_cols if 'age' != u]
    uid_cols = uids_noAge + ['age']
    # Split 0-4 age
    neo_age = dataset.loc[dataset['age'] == 2, uids_noAge + [metric]].copy()
    neo_age.rename(columns={metric: 'unknown_age_data'}, inplace=True)
    known_age = dataset.loc[dataset['age'] != 2, :].copy()
    known_age.fillna(value=0, inplace=True)
    # standardize columns to enable merging
    wgt_df = md.stdz_col_formats(wgt_df)
    with_weights = known_age.merge(wgt_df, how='left')
    at.test_weights(known_age, with_weights)
    prop_df = pp.add_proportions(with_weights, uids_noAge)
    prop_df = md.stdz_col_formats(prop_df)
    neo_age = md.stdz_col_formats(neo_age)
    to_distribute = prop_df.merge(neo_age, on=uids_noAge)
    # direct assignment instead of chained .loc[...].fillna(inplace=True),
    # which operates on an intermediate object and may silently do nothing
    to_distribute['unknown_age_data'] = \
        to_distribute['unknown_age_data'].fillna(0).astype(float)
    to_distribute['orig_data'] = to_distribute[metric].copy()
    to_distribute['proportion'] = to_distribute['proportion'].astype(float)
    to_distribute.loc[:, metric] += to_distribute['unknown_age_data'].multiply(
        to_distribute['proportion'])
    output = to_distribute[uid_cols + [metric]]
    pt.verify_metric_total(dataset, output, metric, "split unknown age")
    return (output)
def check_and_save(df, metric):
    '''Standardizes column formats and verifies the prepped output.
       Returns the standardized dataframe on success; raises AssertionError
       if verification fails.
    '''
    output = md.stdz_col_formats(df)
    dataset_verified = pt.verify_prep_step_output(
        output, get_uid_columns(), metric)
    # guard clause replaces the previous if/else with an unreachable `pass`
    # after the raise
    if not dataset_verified:
        raise AssertionError("ERROR: data could not be verified on output")
    return (output)
def get_age_frmat_map(frmat_type):
    ''' Loads and standardizes the age-format mapping resource for the
        requested format type.
        --- Inputs ---
        frmat_type : string
            either "im_frmat_id" or "frmat_id"
        Raises ValueError for any other value (previously an unrecognized
        value fell through and crashed with UnboundLocalError).
    '''
    if frmat_type == "im_frmat_id":
        resource = pd.read_csv(utils.get_path(
            'im_frmat_map', process="mi_dataset"))
    elif frmat_type == "frmat_id":
        resource = pd.read_csv(utils.get_path(
            'frmat_map', process="mi_dataset"))
    else:
        raise ValueError(
            "unrecognized frmat_type: {}".format(frmat_type))
    resource = md.stdz_col_formats(
        resource, additional_float_stubs=['age_specific', 'age_split'])
    return(resource)
def apply_age_spilt_proportions(input_df, frmat_type, wgt_df, uid_cols, metric): ''' ''' # remove dataset_id if present in dataframe split_input = input_df.copy() if 'dataset_id' in split_input.columns: del split_input['dataset_id'] # merge with the age format map and get an expanded dataframe with the ages # to be split uids_noAge = [u for u in uid_cols if 'age' != u] uid_cols = uids_noAge + ['age'] marked_df = mark_ages_to_be_split(split_input, frmat_type, uid_cols, metric) to_expand = marked_df.loc[marked_df['to_expand'].eq(1), :].copy() if len(to_expand) == 0: return(split_input) # merge with expected values ("weights") to_expand.rename(columns={'age': 'split_age', 'gbd_age': 'age'}, inplace=True) to_expand = md.stdz_col_formats(to_expand) wgt_df = md.stdz_col_formats(wgt_df) weighted_df = pd.merge(to_expand, wgt_df, how='left',indicator=True) astest.test_weights(to_expand, weighted_df) # calculate proportions to_split = pp.add_proportions( weighted_df, uids_noAge+['split_age']) # adjust by proportions to_split.loc[:, 'split_value'] = to_split[metric] to_split.loc[:, metric] = to_split['proportion'] * to_split['split_value'] # collapse, then update format types of split data recombined_df = to_split.append( marked_df.loc[marked_df['to_expand'].eq(0), :].copy()) adjusted_df = dft.collapse(recombined_df, by_cols=uid_cols, func='sum', combine_cols=metric) astest.compare_pre_post_split(split_input, adjusted_df, metric) adjusted_df.loc[:, 'need_split'] = 1 pt.verify_metric_total(split_input, adjusted_df, metric, "apply age proportions") return(adjusted_df[split_input.columns.values])
def validate_mapping(in_df, out_df, metric):
    ''' Tests the mapping output to verify results: every input uid must
        survive, no new uids may appear, and the metric total must be
        preserved.
    '''
    uid_cols = md.get_uid_cols(3)
    test_results = {'missing entries': [], 'new entries': []}
    in_df = md.stdz_col_formats(in_df)
    out_df = md.stdz_col_formats(out_df)
    # outer-merge the uid sets and inspect the merge indicator
    merged = pd.merge(in_df[uid_cols], out_df[uid_cols],
                      on=uid_cols, how='outer', indicator=True)
    only_in_input = merged['_merge'] == "left_only"
    only_in_output = merged['_merge'] == "right_only"
    if only_in_input.any():
        test_results['missing entries'] = \
            merged.loc[only_in_input, uid_cols].to_dict()
    if only_in_output.any():
        test_results['new entries'] = \
            merged.loc[only_in_output, uid_cols].to_dict()
    if len(in_df) != len(out_df):
        test_results['missing or extra uids'] = "fail"
    pt.verify_metric_total(in_df, out_df, metric, "mapping test")
    tests.checkTestResults(test_results, 'validate mapping',
                           displayOnly=False)
def manage_no_split(df, metric_name, uid_cols, this_dataset):
    ''' Finalizes and saves data that require no age/sex splitting. '''
    # uids_noAge retained for the disabled collapse call below
    uids_noAge = [c for c in uid_cols if 'age' not in c]
    # collapse remaining under5 and 80+ ages
    df = md.stdz_col_formats(df)
    # direct assignment instead of Series.fillna(inplace=True) on a column
    # selection (chained-assignment pattern deprecated by pandas)
    df[metric_name] = df[metric_name].fillna(0)
    final_df = df.copy(deep=True)
    # final_df = md.collapse_youngAndOld_ages(df, uids_noAge, metric_name)
    # the two previous branches differed only in is_pop; collapse them
    md.complete_prep_step(final_df, this_dataset,
                          is_pop=(metric_name == "pop"))
    return(None)
def disaggregate_acause(df, ds_instance):
    ''' Description: Returns a dataframe in which metric values have been
            distributed across all associated acauses
        How it Works: Utilizes the create_metric_weights function to reshape
            the df so that acause is in long form. Adds proportions to each
            acause by observation, then applies those proportions to split
            the input metric value across the attributed acauses. Finally,
            collapses to re-combine data to single datapoints by gbd_cause
            and acause.
        NOTE: this process drops the 'cause' and 'cause_name' columns.
    '''
    # Ensure that 'age' is not listed as a uid
    metric = ds_instance.metric
    uids_noAcause = [c for c in md.get_uid_cols(5) if 'acause' not in c]
    acause_cols = [a for a in df.columns if 'acause' in a]
    all_uids = md.get_uid_cols(5)
    needs_split = (df['acause2'].notnull() & ~df['acause2'].isin([""]))
    # .copy() so the in-place rename/drop below operate on real frames
    # rather than views of df (SettingWithCopy)
    to_split = df.loc[needs_split, :].copy()
    no_split = df.loc[~needs_split, :].copy()
    # If no split needed, simply return the dataframe with a renamed acause1
    if len(to_split) == 0:
        df.rename(columns={'acause1': 'acause'}, inplace=True)
        acause_cols.remove('acause1')
        df.drop(labels=acause_cols, axis=1, inplace=True)
        return (df)
    print("disaggregating acause...")
    # create weights used for splitting
    weight_df = pp.create_metric_weights(df, all_uids, ds_instance)
    # calculate proportions based on the weights
    proportions_df = pp.add_proportions(weight_df, uids_noAcause)
    # adjust by proportions
    is_split = to_split.merge(proportions_df)
    is_split['split_value'] = is_split[metric]
    is_split.loc[:, metric] = is_split['proportion'] * is_split['split_value']
    is_split = md.stdz_col_formats(is_split)
    # reduce the un-split data to a single 'acause' column
    no_split.rename(columns={'acause1': 'acause'}, inplace=True)
    acause_cols.remove('acause1')
    no_split.drop(labels=acause_cols, axis=1, inplace=True)
    # recombine and verify that the metric total is preserved
    output = no_split.append(is_split)
    pt.verify_metric_total(df, output, metric, "disaggregate acause")
    return (output.loc[:, no_split.columns.tolist()])
def run_split_pop(this_dataset):
    ''' Loads population data, reshapes it to long format, and either
        finalizes it directly or routes it through age/sex splitting.
    '''
    print(" splitting population...")
    uid_cols = md.get_uid_cols(prep_type_id=4, is_pop=True)
    df = this_dataset.load_pop(prep_type_id=1)
    # some old 00_pop files still have registry_id column instead of
    # registry_index
    if "registry_id" in df.columns:
        df.rename(columns={'registry_id': 'registry_index'}, inplace=True)
    # Exit if no population present
    if len(df) == 0:
        return(None)
    # Temporarily reshape and updated dataframe until input is no longer
    # in STATA
    uids_noAge = [c for c in uid_cols if 'age' not in c]
    df = dft.wide_to_long(df, stubnames='pop', i=uids_noAge, j='age',
                          drop_others=True)
    df = df.loc[df.age != 1, :]  # drop 'all ages'
    pop_cols = [p for p in df.columns.values if "pop" in p]
    # BUG FIX: df.loc[:, pop_cols].fillna(inplace=True) filled a copy and
    # never modified df; assign the filled frame back instead
    df[pop_cols] = df[pop_cols].fillna(0)
    df = md.stdz_col_formats(df)
    if not (pt.has_combinedSex(df) | pt.has_age_unk(df, "pop")
            | pt.has_nonStdAge(df)):
        manage_no_split(df, "pop", uid_cols, this_dataset)
    else:
        # replace missing data with zeroes
        uid_cols += ['location_id', 'country_id', 'year']
        df = md.add_location_ids(df)
        df = modeled_locations.add_country_id(df)
        # data with no country_id have no population estimates.
        # global estimate should be used to generate weights
        df.loc[df['country_id'] == 0, 'location_id'] = 1
        # add mean year to faciliate merge with population weights
        df = gct.add_year_id(df)
        # Split data
        manage_split(df, "pop", uid_cols, this_dataset)
    return(None)
def main(dataset_id, data_type_id): ''' ''' # prep_step 5 = cause_disaggregation this_dataset = md.MI_Dataset(dataset_id, 5, data_type_id) input_data = this_dataset.load_input() metric = this_dataset.metric uid_cols = md.get_uid_cols(5) input_data = input_data.loc[~input_data['age'].isin( [26, 3, 4, 5, 6, 91, 92, 93, 94]), :] # Format and add observation numbers formatted_input = prep_for_disagg(input_data.copy(), uid_cols, metric) # Disaggregate disaggregated_df = core.disaggregate_acause(formatted_input, this_dataset) # update uid columns to account for reshaped acause uid_cols = [u for u in uid_cols if 'acause' not in u] + ['acause'] # kaposi_df = core.redist_kaposi(disaggregated_df, metric, uid_cols) if data_type_id == 2: adjusted_df = core.redist_nmsc_gc(kaposi_df, metric) else: adjusted_df = kaposi_df final_df = core.map_remaining_garbage(adjusted_df, data_type_id) # run test functions and save output pt.verify_metric_total(input_data, adjusted_df, metric, "cause disaggregation module") # collapse to incorperate newly-split data output_uids = md.get_uid_cols(6) final_df = md.stdz_col_formats(final_df) final_df = dft.collapse(final_df, by_cols=output_uids, func='sum', combine_cols=metric ) # save md.complete_prep_step(final_df, this_dataset) print("Acause disaggregated")
def submit_sr(calc_df, this_dataset):
    ''' Splits data based on subtotal-recalculation requirement and submits
        jobs as needed to recalculate subtotals. Then returns a re-combined
        dataset with subtotals recalculated.
        NOTE(review): relies on `dataset_id`, `data_type_id`, and
        `is_resubmission` being defined at an enclosing (module) scope --
        confirm they are set before this function runs.
    '''
    def submission_req(df, uid):
        ''' Returns boolean indicating whether data are to be submitted,
            qualified by whether subtotals are present and whether any
            component codes exist that could enable recalculation
        '''
        uid_test = df[df['uniqid'].eq(uid)]
        meets_requirement = bool(
            has_subtotals(uid_test, 'orig_cause')
            and components_present(uid_test)
        )
        return(meets_requirement)

    def output_file_func(id):
        ''' Returns the split_output file path for the uid in id[0];
            passed to get_results so it can locate each worker's output
        '''
        return(get_sr_file(this_dataset, 'split_output', id[0]))

    # uid columns that define a unique output row
    output_uids = md.get_uid_cols(3)
    metric_name = this_dataset.metric
    job_header = "cnSR_{}_{}".format(dataset_id, data_type_id)
    sr_input_file = get_sr_file(this_dataset, "sr_input")
    worker_script = utils.get_path("subtotal_recalculation_worker",
                                   process="mi_dataset")
    # convert components to string to enable save in hdf file
    uniqid_map = calc_df[output_uids + ['uniqid', 'orig_cause']
                         ].copy().drop_duplicates()
    submitted_data, unsubmitted_data = cup.split_submission_data(
        calc_df, group_id_col='uniqid',
        submission_requirement=submission_req,
        hdf_file=sr_input_file,
        regenerate_hdf=False)
    if len(submitted_data) == 0:
        final_results = unsubmitted_data
    else:
        uid_list = submitted_data['uniqid'].unique().tolist()
        sr_jobs = cup.generate_prep_workers(
            worker_script,
            list_of_uids=uid_list,
            ds_instance=this_dataset,
            job_header=job_header,
            is_resubmission=is_resubmission)
        output_files = cup.get_results(
            sr_jobs, output_file_func,
            parent_process_name="sr",
            noisy_checker=True,
            add_resubmission_argument=is_resubmission,
            wait_time=5)
        # Re-combine compiled results with the set-aside data, before
        # collapsing and testing
        results = pe.read_files(output_files)
        results.rename(columns={'cause': 'orig_cause',
                                'codes_remaining': 'cause'}, inplace=True)
        results = md.stdz_col_formats(results,
                                      additional_float_stubs='uniqid')
        results = results.merge(uniqid_map, how='outer', indicator=True)
        assert results['_merge'].isin(["both", "right_only"]).all(), \
            "Error merging with uids"
        del results['_merge']
        # entries with blank "cause" could not be corrected. replace with the
        #   original aggregate (will be handled by cause recalculation and
        #   rdp).
        results.loc[results['cause'].eq(""), 'cause'] = results['orig_cause']
        # drop causes that were zeroed in subtotal recalculation
        results['total'] = results.groupby(
            output_uids)[metric_name].transform(sum)
        results = results.loc[results['total'].ne(0) &
                              results[metric_name].notnull(), :]
        final_results = results.append(unsubmitted_data)
    # Re-combine with data that were not split
    final_results = dft.collapse(final_results,
                                 by_cols=output_uids,
                                 combine_cols=this_dataset.metric)
    return(final_results)
def run(dataset_id, data_type_id, uid):
    ''' Preps data for recalculation then recalculates as necessary.
        Removes subtotal overlap from the causes in a single uid group and
        writes the recalculated data to the uid's split_output file.
    '''
    this_dataset = md.MI_Dataset(dataset_id, 2, data_type_id)
    dataset_name = this_dataset.name
    metric = this_dataset.metric
    input_file = run_sr.get_sr_file(this_dataset, "sr_input")
    # Exit if output already exists
    output_file = run_sr.get_sr_file(this_dataset, 'split_output', uid)
    print(output_file)
    if os.path.exists(output_file):
        print(" output file found for uid " + str(uid))
        return (None)
    # NOTE(review): negative_data_ok is never read in this function --
    # presumably consumed elsewhere or vestigial; confirm
    negative_data_ok = is_exception(dataset_id, data_type_id)
    error_folder = utils.get_path("mi_input", base_folder='j_temp')
    subcause_issue_file = '{}/subcause_issue/{}_{}_uid_{}.txt'.format(
        error_folder, dataset_name, data_type_id, uid)
    exception_file = '{}/negative_data/{}_{}_uid_{}.csv'.format(
        error_folder, dataset_name, data_type_id, uid)
    for d in [subcause_issue_file, exception_file, error_folder]:
        utils.ensure_dir(d)
    print(" removing subtotals from uid {}...".format(uid))
    # add data for the given uid
    df = pd.read_hdf(input_file, 'split_{}'.format(uid))
    # Create a list of possible codes so that decimal subcauses are only
    #   added if available
    input_cause_list = sorted(df['orig_cause'].unique().tolist())
    # create a dictionary for codes in the selected uid and attach the uid's
    #   data
    uid_subset = {}
    input_data = {}
    # process decimals first and ranges last to ensure that nested causes are
    #   removed
    for c in sorted(df['orig_cause'].unique().tolist()):
        uid_subset[c] = {}
        input_data[c] = {}
        uid_subset[c]['codes'] = []
        uid_subset[c]['subcodes'] = []
        # single codes (no range "-" or list ",") may have decimal subcodes
        if "-" not in c and "," not in c:
            uid_subset[c]['codes'].append(c)
            # add subcodes to 'subcode' key
            # NOTE(review): the next statement's result is discarded --
            # apparent dead code; left in place to preserve behavior
            df.loc[df['orig_cause'].eq(c),
                   'cause'].dropna().unique().tolist()
            for subcode in sorted(df['cause'].where(
                    df['orig_cause'] == c).dropna().unique().tolist()):
                if subcode != c:
                    uid_subset[c]['subcodes'].append(subcode)
            # if none of the subcodes appear in the list, set the cause as a
            #   subcode of itself (prevents the addition of unused decimal
            #   causes)
            if not len(uid_subset[c]['subcodes']):
                uid_subset[c]['subcodes'] = uid_subset[c]['codes']
            elif (not any('{}.'.format(sub[:3]) in check
                          for check in input_cause_list
                          for sub in uid_subset[c]['subcodes'])):
                uid_subset[c]['subcodes'] = uid_subset[c]['codes']
        else:
            # ranges/lists: every member code is both a code and a subcode
            for code in sorted(df['cause'].where(
                    df['orig_cause'].eq(c)).dropna().unique().tolist()):
                uid_subset[c]['codes'].append(code)
                uid_subset[c]['subcodes'].append(code)
        # create other lists associated with the cause and add the metric
        #   data
        uid_subset[c]['subcauses_remaining'] = []
        uid_subset[c]['codes_removed'] = []
        uid_subset[c]['causes_removed'] = []
        uid_subset[c]['data'] = df.loc[df['cause'].eq(c),
                                       ['age', metric]].set_index('age')
        input_data[c]['data'] = uid_subset[c]['data']
        input_data[c]['codes'] = uid_subset[c]['codes']
    # Determine subcauses and highest number of causes remaining (how many
    #   subcauses are contained within each cause)
    uid_set = set_subcauses(uid_subset, subcause_issue_file)
    highest_level = determine_highest_level(uid_set)
    # remove lowest level codes from parent causes
    if highest_level == 0:
        print(' no subcauses present.')
    else:
        # iterate until a pass removes nothing more
        subcauses_removed = True
        while subcauses_removed:
            uid_set, subcauses_removed = remove_subcauses(
                uid_set, uid, exception_file)
            # remove duplicates
            uid_set = remove_duplicates(uid_set)
            # re-set subcauses and num_subcause_remaining
            uid_set, highest_level = set_subcauses(
                uid_set, subcause_issue_file,)
        print(" subcauses removed.")
    # Prepare Output
    print("saving output...")
    output = pd.DataFrame(
        columns=['cause', 'codes_remaining', 'codes_removed', 'age', metric])
    for c in uid_set:
        # format cause information
        cause_data = pd.DataFrame(
            columns=['cause', 'codes_remaining', 'codes_removed'])
        cause_data.loc[0, ['cause']] = c
        # if nothing was removed, or there was only a single cause, or all of
        #   the input codes are still present, set the codes remaining as the
        #   cause
        if (not len(uid_set[c]['codes_removed']) or
                ("-" not in c and "," not in c) or
                set(input_data[c]['codes']) <= set(uid_set[c]['codes'])):
            cause_data.loc[0, ['codes_remaining']] = c
        else:
            cause_data.loc[0, ['codes_remaining']] = ','.join(
                convert_to_range(uid_set[c]['codes']))
        cause_data.loc[0, ['codes_removed']] = ','.join(
            convert_to_range(uid_set[c]['codes_removed']))
        # format output data
        output_data = uid_set[c]['data']
        output_data['age'] = output_data.index
        output_data['cause'] = c
        orig_data = input_data[c]['data']
        orig_data['age'] = orig_data.index
        orig_data = orig_data.rename(columns={metric: 'orig_metric_value'})
        orig_data['cause'] = c
        # combine and add to output
        final = pd.merge(output_data, cause_data, on='cause')
        final = pd.merge(final, orig_data, on=['cause', 'age'])
        output = output.append(final)
    # Create output dataset
    output['uniqid'] = uid
    # Update encoding (bug fix to work around pandas to_stata issue)
    output = md.stdz_col_formats(output, additional_float_stubs='uniqid')
    # Export results
    output.to_csv(output_file, index=False)
    print('\n Done!')
    time.sleep(1)
    return (None)