def add_required_columns(df, uid_cols, data_version_id, db):
    ''' Adds the columns required for upload to the CoD database.

        Args:
            df - DataFrame, prepped cancer mortality data
            uid_cols - list, unique-id columns (accepted for interface
                compatibility; not used in this function)
            data_version_id - int, version id attached to every row
            db - str, database name passed through to map_site_id

        Returns:
            DataFrame with renamed columns, placeholder cf columns,
            constant metadata columns, and site labels attached
    '''
    print('adding required columns...')  # fix: typo "coluuns"
    final_df = df.copy()
    final_df['data_version_id'] = data_version_id
    # rename column names to match the upload table's schema
    final_df.rename(columns={"national": "representative_id",
                             'sex': 'sex_id',
                             'year': 'year_id',
                             }, inplace=True)
    # add temp cf columns (placeholders, populated downstream)
    cf_var_cols = ['cf_final_low_rd', 'cf_final_high_rd', 'cf_final_low_ss',
                   'cf_final_high_ss', 'cf_final_low_total',
                   'cf_final_high_total', 'variance_rd_logit_cf',
                   'variance_rd_log_dr']
    for col in cf_var_cols:
        final_df[col] = 0
    # NOTE(review): the original created a cdb.db_api() link and looked up
    # the current gbd round under an "add cause_id" comment, but never used
    # either value and never added cause_id; the dead locals were removed.
    # Confirm cause_id is attached upstream or downstream of this step.
    # other columns that are constant
    final_df['underlying_nid'] = np.nan
    final_df['source_id'] = 68  # cancer default
    final_df['data_type_id'] = 2  # cancer registry
    # add site labels. upload to site table if new sources are present
    final_df = map_site_id(final_df, db)
    return(final_df)
def add_coverage_metadata(df):
    ''' Returns the dataframe with a boolean indication of whether each
        registry covers its associated location_id.

        Adds two columns, merged on registry_index:
            full_coverage - registry.coverage_of_location_id, renamed
            national_registry - 1 where the registry's location is its own
                country and it has full coverage, else 0

        Raises:
            AssertionError if registry_index is missing or contains nulls,
            or if the merge changes the row count
    '''
    merge_col = ['registry_index']
    metadata_cols = ['full_coverage', 'national_registry']
    assert all(c in df.columns for c in merge_col), \
        "add_coverage_metadata requires {} column(s)".format(merge_col)
    # short-circuit: nothing to do if both columns already exist and are
    # fully populated
    if (all(c in df.columns for c in metadata_cols) and
            df.loc[:, metadata_cols].notnull().all().all()):
        return(df)
    else:
        assert df.loc[:, merge_col].notnull().all().all(), \
            "add_coverage_metadata cannot accept nulls for {} column(s)".format(
                merge_col)
    input_len = len(df)
    # drop any partially-populated metadata columns before re-merging
    df = df.drop(labels=metadata_cols, axis=1, errors='ignore')
    reg_df = cdb.db_api().get_table("registry")
    reg_df.rename(
        columns={'coverage_of_location_id': 'full_coverage'}, inplace=True)
    # a registry is "national" when it fully covers its own country
    reg_df.loc[(reg_df['location_id'] == reg_df['country_id']) &
               reg_df['full_coverage'].isin([1]), 'national_registry'] = 1
    reg_df.loc[reg_df['national_registry'].isnull(), 'national_registry'] = 0
    df = df.merge(reg_df[merge_col + metadata_cols], how='left', on=merge_col)
    # left merge on a unique key should preserve length; verify
    assert len(df) == input_len, "Data dropped while adding coverage metadata"
    return(df)
def get_me_info(parent_meid):
    ''' Returns the modelable_entity_ids (meids) of the "child" (component)
        MEs, meids of the proportions used to generate those children, and
        the me_tag of the survival phase associated with the parent entity.
    '''
    # Pull the cnf_model_entity table from the cancer database
    db_link = cdb.db_api("cancer_db")
    me_table = db_link.get_table('cnf_model_entity')
    # Locate the parent entity; .item() enforces exactly one match
    parent_rows = me_table.loc[
        me_table['modelable_entity_id'].eq(parent_meid), :].sort_values(
        'me_tag').copy().reset_index()
    parent_cause = parent_rows['acause'].item()
    me_tag = parent_rows['me_tag'].item()
    # Active sub-entities of the parent cause: either the same survival
    # phase or the subcause proportions used to split it
    relevant = me_table[
        me_table['is_active'].eq(1)
        & me_table['acause'].str.startswith(parent_cause)
        & me_table['me_tag'].isin([me_tag, "subcause_proportion"])
        & me_table['cancer_model_type'].eq("split_custom_epi")].sort_values(
        ['acause', 'me_tag']).copy().reset_index()
    child_lookup = relevant.loc[relevant['me_tag'].eq(me_tag),
                                'modelable_entity_id'].to_dict(
                                    into=OrderedDict)
    prop_lookup = relevant.loc[relevant['me_tag'].eq("subcause_proportion"),
                               'modelable_entity_id'].to_dict(
                                   into=OrderedDict)
    return (list(child_lookup.values()),
            list(prop_lookup.values()),
            me_tag)
def update_cancer_mortality_version(lsid):
    ''' Updates cod_mortality_version table and inserts CoD's launch_set_id
        for versioning purposes.

        Args:
            lsid - int, CoD launch_set_id to record on the most recent
                cod_mortality_version row
    '''
    # get the id of the most recent mortality version
    db_link = cdb.db_api()
    cod_df = db_link.get_table('cod_mortality_version')
    max_id = cod_df['cod_mortality_version_id'].max()
    # update Cancer's database with CoD's launch_set_id
    # fix: the original query omitted the SET keyword and was invalid SQL
    update_query = '''
        UPDATE cancer_db.{tbl}
        SET cod_launch_set_id = {l_id}
        WHERE cod_mortality_version_id={cod_id}
    '''.format(tbl='cod_mortality_version', l_id=int(lsid), cod_id=max_id)
    # reuse the existing link rather than opening a second one
    db_link.run_query(update_query)
    print('cancers cod_mortality_version table updated!')
    return
def is_exception(dataset_id, data_type_id):
    ''' Determines if dataset is flagged such that negative values are
        accepted
    '''
    exception_tbl = cdb.db_api().get_table("prep_exception")
    # exception type 1 with processing status 2 marks the negative-value
    # exception for this dataset/data_type pair
    flagged = (exception_tbl['dataset_id'].eq(dataset_id)
               & exception_tbl['data_type_id'].eq(data_type_id)
               & exception_tbl['prep_exception_type_id'].eq(1)
               & exception_tbl['processing_status_id'].eq(2))
    return (exception_tbl.loc[flagged, :].any().any())
def get_cod_description():
    ''' Pulls notes from cod_mortality_version table from the cancer
        database
    '''
    version_tbl = cdb.db_api().get_table('cod_mortality_version')
    current_id = version_tbl['cod_mortality_version_id'].max()
    # take the notes of the first row carrying the newest version id
    notes = version_tbl.loc[
        version_tbl['cod_mortality_version_id'].eq(current_id),
        'notes'].reset_index(drop=True)
    return (notes[0])
def procedure_me_id(acause):
    ''' Returns the modelable_entity_id of the active procedure_proportion
        entry for acause, or None when no such entry exists
    '''
    entity_tbl = cdb.db_api('cancer_db').get_table('cnf_model_entity')
    matches = entity_tbl.loc[
        entity_tbl['is_active'].eq(1)
        & entity_tbl['acause'].eq(acause)
        & entity_tbl['me_tag'].eq('procedure_proportion'),
        'modelable_entity_id']
    if len(matches) == 0:
        return (None)
    # .item() enforces that exactly one entry matched
    return (matches.item())
def query_data_version(database, table, launch_set_id):
    ''' Returns the max data_version_id recorded for launch_set_id in the
        given cod table.

        Args:
            database - str, database connection name
            table - str, table name inside the cod schema
            launch_set_id - int, launch set to look up
    '''
    query_string = '''
        SELECT MAX(data_version_id)
        FROM cod.{db_table}
        WHERE launch_set_id={lsid}
    '''.format(db_table=table, lsid=launch_set_id)
    # fix: removed an unused cdb.db_api(database) link; the query runs
    # through the connection string only
    conn = cdb.create_connection_string(database)
    max_id = cdb.execute_query(query_string, conn)
    print('DATA VERSION ID SET TO...{}'.format(max_id))
    return max_id
def query_launch_set_id(db, tb):
    ''' Looks for the current max id in claude_launch_set id per user

        Args:
            db - str, database connection name
            tb - str, table name inside the cod schema

        Returns:
            max launch_set_id belonging to the current user
    '''
    # fix: the query previously hardcoded username='******' while passing
    # an unused usr argument to .format(); restore the {usr} placeholder
    query_string = '''
        SELECT MAX(launch_set_id)
        FROM cod.{db_table}
        WHERE username='{usr}'
    '''.format(db_table=tb, usr=getpass.getuser())
    conn = cdb.create_connection_string(db)
    max_id = cdb.execute_query(query_string, conn)
    print('LAUNCH SET ID SET TO...{}'.format(max_id))
    return max_id
def get_sites(db):
    ''' Fetch metadata for site labels.

        Args:
            db - str, database connection name

        Returns:
            DataFrame of site_id/site_name with nulls for site_id 2
            replaced by an empty string
    '''
    query_string = '''
        SELECT site_id, site_name
        FROM cod.site
    '''
    # fix: removed an unused cdb.db_api(db) link
    conn = cdb.create_connection_string(db)
    site_df = cdb.execute_query(query_string, conn)
    # ensure empty string instead of null values
    site_df.loc[(site_df['site_id'].eq(2)) &
                (site_df['site_name'].isnull()), 'site_name'] = ''
    return site_df
def check_columns(df, table, db):
    ''' Checks that all required columns are present before upload.

        Args:
            df - DataFrame about to be uploaded
            table - str, target table name
            db - str, database connection name

        Raises:
            AssertionError listing any required column missing from df
    '''
    # fix: removed an unused cdb.db_api(db) link
    cod_cols = load_db_cols(table, db)
    # drop the leading id column; it is generated by the database
    cod_cols = list(cod_cols[1:, ])
    # update-managed columns are set separately, so they are not required
    update_cols = get_update_settings()
    for col in update_cols.keys():
        cod_cols.remove(col)
    assert set(cod_cols).difference(set(df.columns)) == set(), \
        "Missing columns in dataset: \n{}".format(
            set(cod_cols).difference(df.columns))
    return(None)
def load_surv_folder(cnf_model_version_id):
    ''' Using the cnf_model_version_id, returns the relative-survival folder
        path with that version's datestamp substituted for "<date>".
    '''
    # fix: the original contained a duplicated assignment
    # (surv_folder = surv_folder = ...)
    surv_folder = utils.get_path("relative_survival", process="nonfatal_model")
    record = nd.get_run_record(cnf_model_version_id)
    rs_version = record.at[0, 'rel_survival_version_id']
    db_link = cdb.db_api()
    this_version = db_link.get_entry(table_name='rel_survival_version',
                                     uniq_col='rel_survival_version_id',
                                     val=rs_version)
    # the datestamp suffix identifies the versioned output folder
    suffix = str(this_version.at[0, 'date_generated'])
    rs_folder = surv_folder.replace("<date>", suffix)
    return(rs_folder)
def load_lambda_file(cnf_model_version_id):
    ''' Using the cnf_lambda_version_id attached to the run record, returns
        the lambda values file path with the version's datestamp substituted
        for "<date>".
    '''
    template_path = utils.get_path(
        "lambda_values", process="nonfatal_model")
    run_record = nd.get_run_record(cnf_model_version_id)
    version_id = run_record.at[0, 'cnf_lambda_version_id']
    version_entry = cdb.db_api().get_entry(
        table_name='cnf_lambda_version',
        uniq_col='cnf_lambda_version_id',
        val=version_id)
    # the datestamp suffix identifies the versioned file
    datestamp = str(version_entry.at[0, 'date_generated'])
    return(template_path.replace("<date>", datestamp))
def run_nonzero_floor():
    ''' enforces cause-, age-, sex-, and year-specific minimums on non-zero
        cause fractions. It allows cause fractions to be 0, but not to be
        arbitrarily small.
    '''
    print('running nonzero floor regression...')
    db_link = cdb.db_api()
    # noise-reduced cause fractions produced by the previous pipeline step
    nr_df = pd.read_csv(utils.get_path(process='cod_mortality',
                                       key='noise_reduced'))
    cause_df = db_link.get_table('cod_model_entity')
    # keep only active causes; used to map acause -> cause_id
    cause_df = cause_df.loc[cause_df['is_active'].eq(1),
                            ['cause_id', 'acause']]
    cause_metadata = nr_df.merge(cause_df, how='left', on='acause')
    cause_hierarchy = get_cause_hierarchy()
    nonzero_floorer = ra.NonZeroFloorer(cause_metadata)
    # apply the floors using population and envelope estimates
    df = nonzero_floorer.get_computed_dataframe(get_pop(), get_env(),
                                                cause_hierarchy)
    # NOTE(review): <FILE PATH> is a redacted placeholder and is not valid
    # Python -- the real output path must be restored before this runs
    df.to_csv(<FILE PATH>)
    return
def get_current_mi_results(cod_mortality_version_id):
    ''' Returns the current/best compiled MI results.

        Args:
            cod_mortality_version_id - int, mortality version whose
                mir_model_version_id should be used

        Returns:
            DataFrame read from the compiled MI results for that version
    '''
    db_link = cdb.db_api()
    mor_config = db_link.get_table('cod_mortality_version')
    # get mir_model_version_id
    # NOTE(review): indexing the filtered series with [len(mor_config) - 1]
    # assumes the matching entry is the table's last row and that the table
    # has a default RangeIndex -- confirm; .iloc[-1] on the filtered result
    # would be more robust
    mir_id = mor_config.loc[mor_config['cod_mortality_version_id'] ==
                            cod_mortality_version_id,
                            'mir_model_version_id'][len(mor_config) - 1]
    print('using mir model version id...{}'.format(mir_id))
    mi_path = utils.get_path(process='mir_model', key='compiled_mir_results')
    # replace suffix with model_version_id
    mi_path = mi_path.replace('<mir_model_version_id>', str(mir_id))
    compiled_mi_df = pd.read_csv(mi_path)
    return compiled_mi_df
def update_data_version(database, table, launch_set_id):
    ''' Marks the rows belonging to launch_set_id as the active data
        version in the given cod table.
    '''
    # deactivate whichever version was previously marked active
    clear_prev_data_version_status(database, table)
    status_query = """
        UPDATE cod.{tbl}
        SET status='1'
        WHERE launch_set_id = {lsid}
    """.format(tbl=table, lsid=launch_set_id)
    cdb.db_api(database).run_query(status_query)
    print('updated data version table!!')
    return()
def update_launch_set(database, table, launch_set_id):
    ''' Records the end timestamp for launch_set_id in the given cod
        table.
    '''
    end_stamp = make_db_datestamp()
    close_query = """
        UPDATE cod.{tbl}
        SET launch_set_end = '{lse}'
        WHERE launch_set_id = {lsid}
    """.format(tbl=table, lse=end_stamp, lsid=launch_set_id)
    cdb.db_api(database).run_query(close_query)
    print('updated launch_set_end to...{}'.format(end_stamp))
    return()
def check_all_entries_uploaded(database, table, data_version_id):
    ''' Compares the number of rows in the upload file against the number
        of rows present in the database for data_version_id, and prints
        the result.
    '''
    upload_file = pd.read_csv(_get_upload_path('data'))
    count_query = '''
        SELECT COUNT(data_version_id)
        FROM cod.{tbl}
        WHERE data_version_id={d_id}
    '''.format(tbl=table, d_id=data_version_id)
    # NOTE(review): db_link is created but unused; the query runs via conn
    db_link = cdb.db_api(database)
    conn = cdb.create_connection_string(database)
    data_count = cdb.execute_query(count_query, conn)
    # NOTE(review): this equality assumes cdb.execute_query returns a
    # scalar count; if it returns a DataFrame the comparison will not
    # behave as intended -- confirm execute_query's return type
    if len(upload_file) == data_count:
        print("ALL DATA POINTS SUCCESSFULLY UPLOADED")
    else:
        print("ERROR: SOME datapoints were not uploaded. Please check what data points \
have not been uploaded successfully.")
    return
def _add_ihme_pop_marker(df):
    ''' Returns the dataframe with an added 'ihme_pop_ok' column indicating
        whether ihme population estimates may be merged with the uid
    '''
    # ensure the inputs this marker depends on are present
    if 'sdi_quintile' not in df.columns:
        df = modeled_locations.add_sdi_quintile(df)
    if 'full_coverage' not in df.columns:
        df = add_coverage_metadata(df)
    dataset_tbl = cdb.db_api().get_table("dataset")
    df.loc[:, 'ihme_pop_ok'] = 0
    # flag datasets whose metadata permits use of IHME population
    for ds_id in df['dataset_id'].unique():
        allowed = dataset_tbl.loc[dataset_tbl['dataset_id'] == ds_id,
                                  'can_use_ihme_pop'].values[0]
        if allowed == 1:
            df.loc[df['dataset_id'] == ds_id, 'ihme_pop_ok'] = allowed
    # highest-sdi locations with fully-covering registries are always ok
    full_cover = (df['sdi_quintile'].isin([5]) &
                  (df['full_coverage'].isin([1])))
    df.loc[full_cover, 'ihme_pop_ok'] = 1
    return(df)
def sequelae_fractions(acause):
    ''' Defines fractions from lit review to be used when splitting sequela '''
    # Proportions of the treated population by their primary
    # disability (sequela)
    pros_incont_frac = 0.18  # pct. who primarily develop incontinence
    pros_impot_frac = 0.55  # pct. who primarily develop impotence
    fractions = {
        'neo_prostate': {
            # Fractions used to calculate the controlled phase
            18781: {'fraction': pros_impot_frac},  # with impotence
            18782: {'fraction': pros_incont_frac},  # with incontinence
            # Fractions used to calculate the metrics of sequela beyond
            # ten years
            18784: {'fraction': pros_impot_frac},
            18785: {'fraction': pros_incont_frac}
        }
    }
    # Attach me_tags (enables later linking of data to modelable_entity_id),
    # dropping any modelable entity that is no longer active
    me_tbl = cdb.db_api().get_table("cnf_model_entity")
    for me_id in list(fractions['neo_prostate'].keys()):
        entity = me_tbl.loc[me_tbl['modelable_entity_id'].eq(me_id), :]
        if entity['is_active'].item() == 0:
            del fractions['neo_prostate'][me_id]
        else:
            fractions['neo_prostate'][me_id]['me_tag'] = \
                entity['me_tag'].item()
    if acause in fractions.keys():
        return (fractions[acause])
    else:
        return (False)
def load_durations(acause):
    ''' Returns the sequela durations for acause, using a proxy cause for
        causes that have no entry of their own in sequelae_durations.

        Args:
            acause - str, cancer cause name

        Returns:
            DataFrame with columns acause, me_tag, sequela_duration

        Raises:
            AssertionError if no durations are found or any are null
    '''
    db_link = cdb.db_api('cancer_db')
    # map causes without their own entries to a proxy cause
    # fix: the original tested acause[:8] == "neo_liver_", an 8-character
    # slice against a 10-character string, which could never match
    if acause.startswith("neo_liver_"):
        sequelae_cause = "neo_liver"
    elif acause == "neo_leukemia_other":
        sequelae_cause = "neo_leukemia_ll_chronic"
    elif acause == "neo_nmsc":
        sequelae_cause = "neo_nmsc_scc"
    elif acause == "neo_other_cancer":
        sequelae_cause = "neo_other"
    else:
        sequelae_cause = acause
    sq_df = db_link.get_table('sequelae_durations')
    # copy before assigning to avoid pandas SettingWithCopy on the slice
    this_sq = sq_df.loc[sq_df['acause'] == sequelae_cause, :].copy()
    this_sq.loc[:, 'acause'] = acause
    # check emptiness first so an empty result gives the clearer message
    assert len(this_sq) > 0, "Error loading sequela durations"
    assert this_sq['sequela_duration'].notnull().all(
    ), "error loading sequela durations"
    return (this_sq[['acause', 'me_tag', 'sequela_duration']])
def add_representativeness(df):
    ''' Adds 'representative' integer indicating whether data are
        representative of their attached location_id.

        Returns:
            the input dataframe with a 'representative' column merged on
            registry_index (then adjusted by update_repness)
    '''
    def _avg_repness(regs, rep_table):
        ''' If all registries are representative, returns 1. Else returns 0

            regs may arrive as a tuple/list of registry_index values or as
            their string representation; any parse failure yields 0.
        '''
        try:
            if not isinstance(regs, tuple) and not isinstance(regs, list):
                try:
                    # string-encoded collection, e.g. "('a', 'b')"
                    regs = list(literal_eval(regs))
                except:
                    regs = list(regs)
            rep = rep_table.loc[rep_table['registry_index'].isin(regs),
                                'representative_of_location_id']
            if len(rep) == 0:
                return(0)
            else:
                # min() == 1 only when every registry is representative
                return(rep.min())
        except:
            return(0)
    print("adding representative status...")
    db_link = cdb.db_api("cancer_db")
    # Add representative status based on the input registries
    rep_status = db_link.get_table(
        "registry")[['registry_index', 'representative_of_location_id']]
    # evaluate each distinct registry_index once, then merge back
    rep_df = pd.DataFrame(
        {'registry_index': df['registry_index'].unique().tolist()})
    get_repness = partial(_avg_repness, rep_table=rep_status)
    rep_df.loc[:, 'representative'] = rep_df['registry_index'].apply(
        get_repness)
    output = df.merge(rep_df, on='registry_index')
    output = update_repness(output)
    assert len(output) == len(
        df), "add_representativeness is adding or deleting data"
    return(output)
def execute_upload(df, table, db):
    ''' Uploads the dataframe to the given cod table.

        Loads the target table's column list, verifies that every required
        column is present, re-orders the dataframe to match the table, then
        writes it to csv and uploads it.

        Args:
            df - DataFrame to upload
            table - str, target table name
            db - str, database connection name
    '''
    print('executing upload...')
    # order columns same as table, dropping the auto-generated id column
    cod_cols = load_db_cols(table, db)
    cod_cols = list(cod_cols[1:, ])
    # update-managed columns are handled separately and are not uploaded
    update_cols = get_update_settings()
    for col in update_cols.keys():
        cod_cols.remove(col)
    # fix: validate before subsetting -- the original ran df[cod_cols]
    # first, so a missing column raised an opaque KeyError instead of
    # this message
    assert set(cod_cols).difference(set(df.columns)) == set(), \
        "Missing columns in dataset: \n{}".format(
            set(cod_cols).difference(df.columns))
    upload_df = df[cod_cols]
    # upload dataframe
    write_table_to_csv(upload_df, table)
    upload_to_db(table, db)
def _check_all_floors_exist(self, nzf_df):
    ''' Check that all expected cancers, ages, and years, are present and
        have nonzero floor values; prints every missing combination.

        Args:
            nzf_df - DataFrame of nonzero floors with acause,
                age_group_id, sex_id, and year_id columns
    '''
    def _remove_ages_less_than(ages, min_age):
        ''' Removes age_group_ids below min_age, keeping the under-5
            groups (2, 3, 4) when min_age is 5
        '''
        for val in ages.copy():
            # fix: the original wrote "b == 5 & val in [2, 3, 4]", which
            # Python parses as a chained comparison on (5 & val) rather
            # than the intended boolean conjunction
            if min_age == 5 and val in [2, 3, 4]:
                continue
            if val < min_age:
                ages.remove(val)
        return ages
    print("CHECKING FOR ALL CAUSES, AGES, and YEARS...")
    # create cause_list from active registry entities of the current round
    db_link = cdb.db_api(db_connection_name='cancer_db')
    gbd_id = utils.get_gbd_parameter('current_gbd_round')
    registry_entity = db_link.get_table('registry_input_entity')
    registry_entity = registry_entity.loc[
        registry_entity['gbd_round_id'].eq(gbd_id) &
        registry_entity['is_active'].eq(1), ]
    cancer_metadata = registry_entity[[
        'acause', 'cause_id', 'yll_age_start', 'yll_age_end'
    ]]
    causes_checklist = registry_entity['acause'].unique().tolist()
    # exceptions for nonzero floors
    for excluded_cause in ['neo_nmsc_bcc', 'neo_ben_intest', 'neo_ben_utr',
                           'neo_ben_other', 'neo_ben_brain', '_gc']:
        causes_checklist.remove(excluded_cause)
    # create year_list
    year_start = utils.get_gbd_parameter('min_year_cod')
    year_end = utils.get_gbd_parameter('max_year')  # + 1 for GBD2020
    # NOTE(review): range() excludes year_end itself -- confirm whether the
    # "+ 1" adjustment mentioned above should be applied here
    year_checklist = list(range(year_start, year_end))
    # sex & age_id checklist
    age_id_checklist = [
        5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 30, 31,
        32, 235, 2, 3, 4
    ]  # age_ids for 0-95 ages
    sex_checklist = [1, 2]
    # print any causes/years/sexes that are expected and missing
    for cancer in causes_checklist:
        print('working on...{}'.format(cancer))
        subset = nzf_df.loc[nzf_df['acause'].eq(cancer), ]
        age_start = int(
            cancer_metadata.loc[cancer_metadata['acause'].eq(cancer),
                                'yll_age_start'])
        age_start = (age_start / 5) + 5  # conversion from age to GBD age_group_id
        if len(subset) == 0:
            # fix: the original string literal contained a raw newline and
            # did not compile; use an escaped newline instead
            print('MISSING CAUSE... \n{} '.format(cancer))
        missing_ages = set(age_id_checklist) - set(
            subset['age_group_id'].unique().tolist())
        missing_ages = list(missing_ages)
        missing_ages = _remove_ages_less_than(missing_ages, age_start)
        if len(missing_ages) > 0:
            print('missing the following ages for {}: {}'.format(
                cancer, missing_ages))
        missing_sexes = set(sex_checklist) - set(
            subset['sex_id'].unique().tolist())
        if len(missing_sexes) > 0:
            print('missing the following sexes for {}: {}'.format(
                cancer, missing_sexes))
        missing_years = set(year_checklist) - set(
            subset['year_id'].unique().tolist())
        if len(missing_years) > 0:
            print('missing the following years for {}: {}'.format(
                cancer, missing_years))
    return