def adjust_data(bundle, cause_name):
    """Wipe and rebuild the epi data for one bundle.

    Uploads a blank bundle_id/seq sheet to delete every existing row,
    re-uploads a recovery file, then runs the hospital, marketscan and
    other-source adjustment passes in order.

    bundle     -- epi bundle id (coerced to int)
    cause_name -- passed through to the per-source adjustment helpers
    """
    bundle = int(bundle)

    # --- delete all data to start over if needed; comment out if not ---
    # export=True saves an excel file of the retrieved data in the bundle's
    # download folder before we blank it out.
    df = run.get_epi_data(bundle, export=True)
    df = df[['bundle_id', 'seq']]
    destination_file = "{FILEPATH}"
    df.to_excel(destination_file, index=False, sheet_name="extraction")
    report = run.upload_epi_data(bundle, destination_file)
    assert report['request_status'].item() == 'Successful'

    # --- grab latest download if needed; comment out if not ---
    # NOTE(review): this commented-out block is the only place `fname` is
    # defined; as written the `print(fname)` below raises NameError.
    # Re-enable this block (or define fname some other way) before running.
    # download_path = "{FILEPATH}"
    # allFiles = glob.glob(os.path.join(download_path, "*.xlsx"))
    # request_list = []
    # for request in allFiles:
    #     underscore_index = request.rfind('_')
    #     file_ext_index = request.rfind('.')
    #     request_list.append(request[underscore_index + 1:file_ext_index])
    # # convert strings to int or the sort will be incorrect
    # request_list = map(int, request_list)
    # request_list.sort(reverse=True)
    # fname = "request_{}.xlsx".format(request_list[0])
    # print(fname)

    # --- grab the recovery file if needed; comment out if not ---
    download_path = "{FILEPATH}"
    print(fname)
    # NOTE(review): older pandas spelled this keyword `sheetname`; current
    # pandas uses `sheet_name` -- confirm `sheet=` works on the pinned version.
    re_up = pd.read_excel(os.path.join(download_path, fname), header=0,
                          sheet="extraction")
    re_up['response_rate'] = None

    # re-upload so we can start from the beginning
    destination_file = "{FILEPATH}"
    print(destination_file)
    re_up.to_excel(destination_file, index=False, sheet_name="extraction")
    report = run.upload_epi_data(bundle, destination_file)
    assert report['request_status'].item() == 'Successful'
    report_file = "{FILEPATH}"
    report.to_csv(report_file, encoding='utf-8')

    # df = pd.read_excel(os.path.join(download_path, fname), header=0,
    #                    sheet="extraction")  # use for testing, then comment out
    df = run.get_epi_data(bundle)
    hospital_data(df, bundle, cause_name)

    df = run.get_epi_data(bundle)
    # no row may be a full duplicate on every column past bundle_id/seq;
    # pass explicit column labels (not a sliced DataFrame) to `subset`
    assert not df.duplicated(subset=list(df.columns[2:]), keep=False).any()
    marketscan_data(df, bundle, cause_name)

    df = run.get_epi_data(bundle)
    other_data(df, bundle, cause_name)
def other_data(df, bundle, cause_name):
    """Outlier 'mtwith' rows for China and its subnationals, then re-upload.

    df         -- current epi data for the bundle
    bundle     -- epi bundle id to upload against
    cause_name -- unused here; kept for a signature parallel to the other
                  per-source adjustment functions
    """
    print("Other data!")
    metadata_set = 35
    round_id = 4
    # use round_id rather than a second hard-coded literal so the two stay in sync
    meta = get_location_metadata(location_set_id=metadata_set,
                                 gbd_round_id=round_id)
    meta = meta[['location_id', 'parent_id']]
    # left-merge on the shared column(s) to attach each row's parent_id
    china_df = df.merge(meta, how='left')
    china_loc_id = 6
    # keep China itself plus its subnationals
    china_df = china_df[(china_df.parent_id == china_loc_id) |
                        (china_df.location_id == china_loc_id)]
    # only the "mtwith" measure gets outliered
    china_df = china_df.loc[(china_df.measure == "mtwith")]
    if not china_df.empty:
        print("china_df not empty!")
        china_df.loc[:, 'is_outlier'] = 1
        china_df.drop('parent_id', axis=1, inplace=True)
        destination_file = "{FILEPATH}"
        print(destination_file)
        china_df.to_excel(destination_file, index=False,
                          sheet_name="extraction")
        report = run.upload_epi_data(bundle, destination_file)
        # assert nothing in the report is wrong
        assert report['request_status'].item() == 'Successful'
def upload(self, modelable_entity_id, destination_file, error_path):
    """Validate destination_file for the ME's bundle, then upload it.

    Asserts that validation passed and the upload succeeded; returns the
    upload status DataFrame.
    """
    target_bundle = me_to_bundle[modelable_entity_id]
    validation = run.validate_input_sheet(target_bundle, destination_file,
                                          error_path)
    assert validation['status'].item() == 'passed'
    status_df = run.upload_epi_data(target_bundle, destination_file)
    assert status_df['request_status'].item() == 'Successful'
    return status_df
def marketscan_data(df, bundle, cause_name):
    """Delete existing MarketScan rows for the bundle and re-upload corrected data.

    Rows are identified by indicator columns whose names contain 'marketscan'.
    The replacement data is read from a prepared excel file, 2000 rows are
    dropped, bundle 610 is fully outliered, Hawaii is outliered, and
    cause-specific tweaks are applied before the upload.
    """
    note = "fixes_{USERNAME}"  # NOTE(review): unused; presumably intended for note_modeler
    # delete marketscan data and reupload with new specifications.
    # all marketscan data should have 'marketscan' in the column names
    searchfor = 'marketscan'
    cols = [c for c in df.columns if searchfor in c]
    sub_list = [df.loc[df[c] == 1, ['bundle_id', 'seq']] for c in cols]
    # guard: pd.concat([]) raises ValueError when there are no marketscan columns
    if sub_list:
        mrkt_delete = pd.concat(sub_list)
    else:
        mrkt_delete = pd.DataFrame(columns=['bundle_id', 'seq'])
    if not mrkt_delete.empty:
        destination_file = "{FILEPATH}"
        mrkt_delete.to_excel(destination_file, index=False,
                             sheet_name="extraction")
        report = run.upload_epi_data(bundle, destination_file)
        # assert nothing in the report is wrong
        assert report['request_status'].item() == 'Successful'

    mrkt_infile = "{FILEPATH}"
    # NOTE(review): '{DATE}' has no matching .format() argument, so this raises
    # KeyError as written -- looks like a scrubbed placeholder; supply DATE
    # before running.
    fname = "ALL_{b}_v3_{DATE}.xlsx".format(b=bundle)
    # NOTE(review): confirm `sheet=` vs `sheet_name=` on the pinned pandas version
    market = pd.read_excel(os.path.join(mrkt_infile, fname), header=0,
                           sheet="extraction")
    # drop everything with year_start 2000
    market = market[market.year_start != 2000]
    if bundle == 610:
        market.loc[:, 'is_outlier'] = 1
    # outlier Hawaii
    market.loc[market.location_name == "Hawaii", 'is_outlier'] = 1
    market = cause_specifics(market, bundle)
    mrkt_destination_file = "{FILEPATH}"
    print(mrkt_destination_file)
    market.to_excel(mrkt_destination_file, index=False,
                    sheet_name="extraction")
    report = run.upload_epi_data(bundle, mrkt_destination_file)
    # assert nothing in the report is wrong
    assert report['request_status'].item() == 'Successful'
def upload_dataset(bundle_id, out_dir, status_dir):
    """Upload the data to the Epi-database for the given bundle ID."""
    # path of the prepared excel file for this bundle
    excel_file = "{out_dir}new_inputs_{bundle_id}.xlsx".format(
        out_dir=out_dir, bundle_id=bundle_id)
    print(bundle_id)
    print(excel_file)

    # perform the upload and echo the returned status frame
    status = upload_epi_data(bundle_id, excel_file)
    print(status)

    message = ("Inputs for {bundle_id} has {result} for the upload to the "
               "Database").format(
                   bundle_id=bundle_id,
                   result=str(status.loc[0, 'request_status']))
    return message
def upload(bundle_id, destination_file, error_path):
    """Validate the sheet, upload it, and return the upload status DataFrame.

    Raises AssertionError if validation does not pass or the upload fails.
    """
    check = validate_input_sheet(bundle_id, destination_file, error_path)
    assert check['status'].item() == 'passed'
    result = upload_epi_data(bundle_id, destination_file)
    assert result['request_status'].item() == 'Successful'
    return result
import sys
from elmo import run
import db_queries

# command-line arguments: processing step, bundle id, working directory
step, bundle, out_dir = sys.argv[1:4]
print(step)
print(bundle)
print(out_dir)

# upload the step's prepared input sheet for this bundle
infile = '%s/step_%s_input_%s.xlsx' % (out_dir, step, bundle)
df = run.upload_epi_data(bundle, infile)
def hospital_data(df, bundle, cause_name):
    """Delete existing hospital rows for the bundle and re-upload corrected data.

    Existing rows flagged cv_hospital == 1 are deleted via a bundle_id/seq
    sheet.  The replacement data is read from a prepared excel file, the
    *_3 corrected means are promoted to mean/lower/upper, covariates and
    outliers are applied, and the result is uploaded.
    """
    note = "fixes_{USERNAME}"  # NOTE(review): unused; presumably intended for note_modeler
    # delete hospital data and reupload with new specifications
    hosp_delete = df.loc[df.cv_hospital == 1, ['bundle_id', 'seq']]
    destination_file = "{FILEPATH}".format(b=bundle, d=date)
    if not hosp_delete.empty:
        print(destination_file)
        hosp_delete.to_excel(destination_file, index=False,
                             sheet_name="extraction")
        report = run.upload_epi_data(bundle, destination_file)
        # assert nothing in the report is wrong
        assert report['request_status'].item() == 'Successful'

    hosp_infile = "{FILEPATH}"
    # NOTE(review): '{DATE}' has no matching .format() argument, so this raises
    # KeyError as written -- looks like a scrubbed placeholder; supply DATE
    # before running.
    fname = "{b}_v6_{DATE}.xlsx".format(b=bundle)
    # NOTE(review): confirm `sheet=` vs `sheet_name=` on the pinned pandas version
    hospital = pd.read_excel(os.path.join(hosp_infile, fname), header=0,
                             sheet="extraction")

    # promote mean_3 (mean_0 * correction_factor_3 where missing) to mean;
    # same for lower/upper
    for var in ['mean', 'lower', 'upper']:
        col3 = '{}_3'.format(var)
        hospital.loc[hospital[col3].isnull(), col3] = (
            hospital['{}_0'.format(var)] * hospital['correction_factor_3'])
        hospital.rename(columns={col3: var}, inplace=True)

    # add covariates; under/over-1 split is on age_start == 0 vs > 0
    hospital['cv_hospital'] = 1
    hospital['cv_hosp_under1'] = 0
    hospital['cv_hosp_over1'] = 0
    hospital.loc[hospital.age_start == 0, 'cv_hosp_under1'] = 1
    hospital.loc[hospital.age_start > 0, 'cv_hosp_over1'] = 1

    # drop the now-redundant *_0 / *_1 / *_2 columns
    drop_cols = [c for c in hospital.columns
                 if ('_0' in c) or ('_1' in c) or ('_2' in c)]
    hospital.drop(drop_cols, axis=1, inplace=True)

    # add note (the literal was split across lines in the original source;
    # rejoined here into one string)
    hospital['note_modeler'] = (
        "Hospital data version 6.3, prepped {DATE} by {USERNAME}. "
        "Used Mean 3: inpatient and outpatient, after correction for "
        "multiple visits, all diagnoses")

    # outlier locations
    out_loc_list_1 = ["Roraima", "Turkey", "Meghalaya", "Philippines"]
    hospital.loc[hospital['location_name'].isin(out_loc_list_1),
                 "is_outlier"] = 1
    if bundle in (602, 604, 620, 624):
        out_loc_list_2 = ["Lithuania", "Poland", "Croatia", "Romania",
                          "Slovakia", "Czech Republic", "Slovenia"]
        hospital.loc[hospital['location_name'].isin(out_loc_list_2),
                     "is_outlier"] = 1

    hospital = cause_specifics(hospital, bundle)
    hosp_destination_file = "{FILEPATH}"
    print(hosp_destination_file)
    hospital.to_excel(hosp_destination_file, index=False,
                      sheet_name="extraction")
    report = run.upload_epi_data(bundle, hosp_destination_file)
    # assert nothing in the report is wrong
    assert report['request_status'].item() == 'Successful'
# set etiology-specific variables; bundle is stored as float, cast to int
bundle = mapping.loc[mapping['me'] == me, 'bundle'].values[0].astype(int)
acause = mapping.loc[mapping['me'] == me, 'acause'].values[0]

## clean data
data = data.loc[data.sample_size != 0]
data.loc[data['mean'].isnull(), 'mean'] = 0
# outlier where mean is less than 1%
data.loc[data['mean'] < 0.01, 'is_outlier'] = 1
# create seq column
data['seq'] = ""
# try to replace source type to see if it will upload
data['source_type'] = "Vital registration - sample"

if delete == 1:
    ## download epi data so the database can be blanked first
    print('deleting {} bundle {} for {} upload'.format(acause, bundle,
                                                       version))
    del_path = "FILEPATH".format(acause, bundle, version)
    epi_data = run.get_epi_data(bundle)
    epi_data = epi_data[['seq']]
    epi_data.to_excel(del_path, index=False, sheet_name='extraction')
    ## upload blank sheet
    upload = run.upload_epi_data(bundle, del_path)

## export data
print('uploading {} bundle {} {}'.format(acause, bundle, version))
upload_path = "FILEPATH".format(acause, bundle, me, version)
data.to_excel(upload_path, index=False, sheet_name='extraction')
upload = run.upload_epi_data(bundle, upload_path)
# In[78]:

## troubleshooting dump of the working frame
collect.to_csv('FILEPATH')

# In[92]:

len(collect)  # notebook echo of the row count; no effect when run as a script

# In[93]:

## upload the prepared inpatient data
ip_file = write_dir + 'ip_upload.xlsx'
run.upload_epi_data(3125, ip_file)

# # outlier/ifd specifics

# In[65]:

outlier = pd.read_csv('FILEPATH')

# In[66]:

outlier.head()  # notebook echo; no effect when run as a script

# In[67]:
import sys
from elmo import run

# command-line arguments: chronic CSMR bundle id, dismod dir, output dir
chronic_csmr_bd, dismod_dir, out_dir = sys.argv[1:4]

# upload the prepared epi input sheet for this bundle
df = run.upload_epi_data(
    chronic_csmr_bd, '%s/epi_input_%s.xlsx' % (out_dir, chronic_csmr_bd))
def upload(self, modelable_entity_id, fname):
    """Upload fname against the ME's mapped bundle; return the status DataFrame."""
    return upload_epi_data(me_to_bundle[modelable_entity_id], fname)
## check the length one more time to make sure it looks correct
len(collect)  # notebook echo; no effect when run as a script

# In[ ]:

## check the age trend too
collect.groupby(['age_group_id', 'sex_id'], as_index=False)['mean'].mean()

# In[ ]:

## upload the finished data -- should be ready!
run.upload_epi_data(3125, write_dir + 'aut.xlsx')

# # Bring in China Hospital data and process

# In[ ]:

chn = pd.read_hdf('FILEPATH', key='df')  # already aggregated into age_groups

# In[ ]:

chn = chn[['location_id', 'year_start', 'year_end']]