def run_local():
    """Parse one year of US mortality microdata on a local (non-cluster) run.

    Resolves the J-drive root depending on whether we are on a Windows box
    (mapped H:/J: drives) or on the cluster filesystem, builds input file
    paths for each year in ``yearvals``, opens a per-year debug log, and
    hands off to ``parse_cod_mortality``.

    Returns:
        tuple: (cod_data, cod_data_raw, cod_missingness) from the last year
        processed in ``yearvals``.
    """
    if os.path.isdir('H:/'):
        # Windows workstation: network drives are mapped as letters.
        j = 'J:'
    elif os.path.isdir('/home/j/'):
        # Cluster: the J drive is mounted under /home/j.
        j = '/home/j'
    else:
        # NOTE(review): j stays undefined past this point, so path building
        # below will raise NameError; kept as best-effort to match callers.
        print_and_log('What am I supposed to do?')

    cod_dict = {}
    cod_raw = {}
    cod_missingness = {}
    # Number of rows of each file to read; make it small for local test runs.
    nrows = None
    map_dir = ('%s/Project/us_counties/mortality/data_prep/counties/'
               '01_clean_microdata/state_map.csv' % j)
    yearvals = [1992]
    for year in yearvals:
        if 1968 <= year < 1989:
            # Pre-1989: fixed filename pattern under DATA/USA/VR, no
            # separate territories (PS) file.
            us_indir = "%s/DATA/USA/VR/%d/USA_VITAL_STATISTICS_%d_MORTALITY.TXT" % (
                j, year, year)
            ps_indir = 'NONE'
        elif 1989 <= year < 1994:
            # 1989-1993: single US file whose name is looked up per year.
            fname = get_filepaths(year)
            us_indir = '%s/LIMITED_USE/PROJECT_FOLDERS/USA/NVSS_MORTALITY/%d/%s' % (
                j, year, fname)
            ps_indir = 'NONE'
        else:
            # 1994+: get_filepaths returns a dict with separate 'US' and
            # 'PS' (territories) entries.
            fname = get_filepaths(year)
            us_indir = '%s/LIMITED_USE/PROJECT_FOLDERS/USA/NVSS_MORTALITY/%d/%s' % (
                j, year, fname['US'])
            ps_indir = '%s/LIMITED_USE/PROJECT_FOLDERS/USA/NVSS_MORTALITY/%d/%s' % (
                j, year, fname['PS'])
        rlog.open(
            '%s/temp/amelia/counties/parse_death_files/debug_parse_%d.log' %
            (j, year))
        rlog.log('Hello, world')
        rlog.log('Initializing')
        cod_data, cod_data_raw, cod_missingness = parse_cod_mortality(
            year, us_indir, ps_indir, map_dir, nrows)
    return cod_data, cod_data_raw, cod_missingness
# Script fragment: pull codcorrect deaths draws for the maternal envelope
# and the late maternal cause.  CLI args (parsed above this chunk) arrive
# as strings; the numeric ones are cast here.
year = int(year)
env_id = int(env_id)
late_id = int(late_id)
cause = [env_id, late_id]

# get list of locations
locations = maternal_fns.get_locations()

# set up columns we want to subset
columns = maternal_fns.filter_cols()
index_cols = [col for col in columns if not col.startswith('draw_')]

# logging
# FIX: the original was rlog.open('FILEPATH.log' % (log_dir, year)).
# 'FILEPATH' is a scrubbed path placeholder with no conversion specifiers,
# so applying '%' with a two-tuple raised TypeError ("not all arguments
# converted").  Restore the two placeholders in the style of the sibling
# scripts ('%s/<name>_%s.log' % (log_dir, year)).
rlog.open('%s/FILEPATH_%s.log' % (log_dir, year))
rlog.log('')
rlog.log('Starting to get late cause fractions')

##############################################
# GET LATE CAUSE FRACTIONS:
##############################################
# Deaths (measure_id 1) for both causes, women only (sex_id 2), this year.
codcorrect_df = draws(gbd_ids={'cause_ids': [env_id, late_id]},
                      source='codcorrect', year_ids=[year], sex_ids=[2],
                      measure_ids=[1])
codcorrect_df['measure_id'] = 1
# Keep maternal age groups only (age_group_ids 7-15).
codcorrect_df = codcorrect_df[codcorrect_df.age_group_id.isin(range(7, 16))]
envelope_df = codcorrect_df[codcorrect_df.cause_id == env_id]
late_df = codcorrect_df[codcorrect_df.cause_id == late_id]
# we only want index_cols and draws as columns
# Script fragment: read late-maternal draws out of per-location dalynator
# HDF files.  All CLI args arrive as strings; numeric ones are cast below.
log_dir, year, dalynator_dir, env_id, late_id, out_dir = sys.argv[1:7]
year = int(year)
env_id = int(env_id)
late_id = int(late_id)
cause = [env_id, late_id]

# get list of locations
locations = maternal_fns.get_locations()

# set up columns we want to subset
columns = maternal_fns.filter_cols()

# logging
rlog.open('%s/dalynator_late_%s.log' % (log_dir, year))
rlog.log('')
rlog.log('Starting to get late cause fractions')

##############################################
# GET LATE CAUSE FRACTIONS:
##############################################
for geo in locations:
    fname = 'draws_%s_%s.h5' % (geo, year)
    # dalynator files are saved as loc/year, with age, sex and cause inside
    try:
        # NOTE(review): '%s' % cause interpolates the LIST [env_id, late_id]
        # into the where-clause, producing "'cause_id'==[x, y]" -- confirm
        # this is valid HDFStore query syntax; it looks like it should be
        # "cause_id in [...]" or two separate reads.
        dalynator_df = pd.read_hdf('%s/%s/%s' % (dalynator_dir, geo, fname),
                                   'data',
                                   where=[("'cause_id'==%s & 'measure_id'==1"
                                           "& 'metric_id'==1 & 'sex_id'==2"
                                           "& 'rei_id'==0") % cause])
        # (fragment is truncated here; the except handler for this try
        # lies outside the visible chunk)
# We do this for subcauses. Next, we proportionately scale the cause # fractions so they sum to one across subcauses. Timing scaling and # interpolation is done in Step 3, after codcorrect. ########################################################################## interp_yearvals = { start_year: start_year + 5 for start_year in range(1990, 2011, 5) } # dismod files: model_vers_id/full/draws/{location_id}_{year_id}_{sex_id}.h5 dismod_dir = '/ihme/epi/panda_cascade/prod' # 'step 1' refers both to interpolation and fraction scaling. print maternal_fns.check_dependencies(1) if maternal_fns.check_dependencies(1): rlog.log("On Step 1") step_df = dep_map.ix[dep_map.step == 1] # make output directories for target_id in pd.unique(step_df.target_id): maternal_fns.check_dir('%s/%s' % (cluster_dir, target_id)) ############################## # INTERPOLATION ############################### rlog.log('Interpolating subcause cause fractions') # set in and out directories for interpolation for index, row in step_df.iterrows(): if row['source_id'] != 9015: # don't run for HIV! dismod_me_id = row['source_id'] dismod_model_vers = maternal_fns.get_model_vers(
print 'Where am I supposed to go?' ############################################## # PREP WORK: # set directories and other preliminary data ############################################## print 'starting job!' log_dir, jobname, dismod_dir, cluster_dir, year = sys.argv[1:6] year = int(year) # logging rlog.open('%s/%s.log' % (log_dir, jobname)) rlog.log('Starting scale fractions step') # get list of locations locations = maternal_fns.get_locations() geo_length = len(locations) # set up database enginer = dbapis.engine_factory() # set up columns we want to subset columns = maternal_fns.filter_cols() # get dependency_map dep_map = pd.read_csv( "dependency_map.csv", header=0).dropna(axis='columns', how='all')
if os.path.isdir('J:/'): j = 'J:' elif os.path.isdir('/home/j/'): j = '/home/j' else: print 'Where am I supposed to go?' log_dir, jobname, source_dir, out_dir, start_year_str, end_year_str = sys.argv[ 1:7] start_year = int(start_year_str) end_year = int(end_year_str) # logging rlog.open('%s/%s.log' % (log_dir, jobname)) rlog.log('source_dir is %s' % source_dir) rlog.log('out_dir is %s' % out_dir) # get list of locations locations = maternal_fns.get_locations() # set up columns we want to subset columns = maternal_fns.filter_cols() for geo_idx, geo in enumerate(locations): rlog.log('interpolating for place %s' % geo) rlog.log('place is number %s of %s' % (geo_idx, len(locations))) rlog.log('getting data') start_dir = '%s/%s_%s_2.h5' % (source_dir, geo, start_year)
def print_and_log(phrase):
    """Echo *phrase* to stdout and also append it to the current rlog file.

    Presumably rlog.open() has been called by the caller beforehand;
    behavior without an open log is up to PyJobTools -- TODO confirm.
    """
    # Imported locally so merely importing this module does not require
    # PyJobTools to be installed.
    from PyJobTools import rlog
    print phrase
    rlog.log(phrase)
# Resolve the J-drive root: Windows mapped drive vs. cluster mount.
if os.path.isdir('J:/'):
    j = 'J:'
elif os.path.isdir('/home/j/'):
    j = '/home/j'
else:
    print 'Where am I supposed to go?'

# CLI args (all strings).
log_dir, jobname, envelope_dir, prop_dir, out_dir = sys.argv[1:6]

# get list of locations
locations = maternal_fns.get_locations()

# logging
rlog.open('%s/%s.log' % (log_dir, jobname))
rlog.log('out_dir is %s' % out_dir)

# set up columns we want to subset
columns = maternal_fns.filter_cols()
columns.append('year_id')
index_cols = ['year_id', 'age_group_id']

# concatenate dalynator draws into one df, if we're doing timings
if "timing" in jobname:
    # Collect every .h5 file anywhere under envelope_dir.
    files = []
    for root, dirnames, filenames in os.walk('%s' % envelope_dir):
        for filename in fnmatch.filter(filenames, '*.h5'):
            files.append(os.path.join(root, filename))

    def read_file(f):
        # (fragment truncated in this view: the read_hdf call's remaining
        # arguments lie outside the visible chunk)
        return pd.read_hdf(f,
from __future__ import division
import sys
from PyJobTools import rlog
import maternal_fns
from transmogrifier.gopher import draws

# CLI args (all strings).
log_dir, jobname, env_model_vers, source_id, target_id, out_dir = sys.argv[1:7]

# get list of locations
locations = maternal_fns.get_locations()

# logging
rlog.open('%s/%s.log' % (log_dir, jobname))
rlog.log('out_dir is %s' % out_dir)

# set up columns we want to subset
columns = maternal_fns.filter_cols()
columns.remove('measure_id')
index_cols = [col for col in columns if not col.startswith('draw_')]

# read maternal disorders envelope
# CAUSES get multiplied by the Late corrected env from codem
# TIMINGS get multiplied by the CoDcorrect env
rlog.log("reading in envelope draws")
if 'timing' in jobname:
    # Timings: codcorrect deaths (measure 1) for cause 366, women only.
    env = draws(gbd_ids={'cause_ids': [366]}, source='codcorrect',
                measure_ids=[1], sex_ids=[2], location_ids=locations)
else:
    # Causes: codem envelope for cause 366.  (Fragment truncated in this
    # view: the call's remaining arguments lie outside the visible chunk.)
    env = draws(gbd_ids={'cause_ids': [366]}, source='codem', sex_ids=[2],
# NOTE(review): this chunk begins MID-STATEMENT -- 'header=0)...' closes a
# pd.read_csv(...) call whose opening lies outside the visible chunk.
                      header=0).dropna(axis='columns', how='all')

# subset dep_map for the step that we're on
if "timing" in jobname:
    # Step 4 rows, excluding those sourced from codcorrect.
    step_df = dep_map[(dep_map.step == 4)
                      & (dep_map.source_id != 'codcorrect')]
    held_constant_me = 376
else:
    # NOTE(review): .ix is long-deprecated in pandas; .loc is the
    # equivalent for this boolean mask.
    step_df = dep_map.ix[dep_map.step == 1]
    held_constant_me = 9015

#######################################################################
# STEP 1: FOR EACH CAUSE, EXTRACT FILES, GET SUM BY GROUP + TOTAL SUM
#######################################################################
print 'getting data'
rlog.log('getting data')

all_data = {}
summed_idx = 0
for index, row in step_df.iterrows():
    target_id = row['target_id']
    try:
        # Preferred path: pull dismod proportion draws (measure 18)
        # directly for this source modelable entity.
        subtype_df = draws(
            gbd_ids={'modelable_entity_ids': [row['source_id']]},
            source='dismod', measure_ids=[18], sex_ids=[2], year_ids=[year])
    except (ValueError, OSError):
        # pull data from where interp saves it
        subtype_df = pd.read_hdf(
            '%s/%s/%s_2.h5' % (cluster_dir, row['source_id'], year), 'draws')
# NOTE(review): this chunk begins inside a drive-detection if/elif whose
# head lies outside the visible chunk; 'j = ...' below is that elif's body.
    j = '/home/j'
else:
    print 'Where am I supposed to go?'

# CLI args (all strings).
log_dir, jobname, envelope_dir, out_dir = sys.argv[1:5]

# do all the prep work
enginer = dbapis.engine_factory()
dep_map = pd.read_csv("dependency_map.csv",
                      header=0).dropna(axis='columns', how='all')
# Step 2 rows of the dependency map.  NOTE(review): .ix is deprecated.
step_df = dep_map.ix[dep_map.step == 2].reset_index()
index_cols = ['location_id', 'year_id', 'age_group_id', 'sex_id']

# logging
rlog.open('%s/%s.log' % (log_dir, jobname))
rlog.log("Correcting for the underreporting of Late Maternal deaths")
rlog.log('out_dir is %s' % out_dir)

# get list of locations
locs = maternal_fns.get_locations()

# get list of location/years that don't need correction
rlog.log('Pulling in adjustment csv')
adjust_df = pd.read_csv('%s/late_maternal_correction.csv' % (os.getcwd()))
adjust_df = adjust_df[['location_id', 'year_id', 'subnationals',
                       'adj_factor']]

# get the adjustment factor for most-detailed level, not just countries
# NOTE(review): only_subnats is a slice of adjust_df; rename(inplace=True)
# on it may raise SettingWithCopyWarning -- confirm intended.
only_subnats = adjust_df[adjust_df.subnationals == 1]
only_subnats.rename(columns={'location_id': 'parent_id'}, inplace=True)
# (fragment truncated in this view: the SQL text and execution of this
# query lie outside the visible chunk)
query = '''SELECT
from __future__ import division
import sys
from PyJobTools import rlog
from transmogrifier.draw_ops import interpolate
import maternal_fns

# CLI args (all strings; the two year bounds are cast to int below).
log_dir, jobname, me_id, out_dir, start_year_str, end_year_str = sys.argv[1:7]
start_year = int(start_year_str)
end_year = int(end_year_str)

# logging
rlog.open('%s/%s.log' % (log_dir, jobname))
rlog.log('out_dir is %s' % out_dir)

# get list of locations
locations = maternal_fns.get_locations()

if start_year == 1990:
    # Back-cast: cover 1980-1989 plus the 1991-1994 gap.  (Python 2:
    # range() returns lists, so '+' concatenates them.)
    yearlist = range(1980, 1990) + range(1991, 1995)
    start_year = 1980
else:
    # Interior years strictly between the two endpoints.
    yearlist = range(start_year + 1, end_year)

# call central function to interpolate
rlog.log("Calling interpolate")
# (fragment truncated in this view: the call's remaining arguments lie
# outside the visible chunk)
interp_df = interpolate(gbd_id_field='modelable_entity_id',
                        gbd_id=int(me_id), source='dismod',