Пример #1
0
def run_local():
    """Parse the hard-coded year(s) of mortality files on this machine.

    Chooses the path prefix ``j`` from whichever known directory exists,
    builds the input paths for every year in ``yearvals``, opens a per-year
    debug log and passes the paths to ``parse_cod_mortality``.

    Returns:
        The three values returned by ``parse_cod_mortality`` for the last
        year in ``yearvals`` (the list currently holds a single year).

    Raises:
        RuntimeError: if neither known directory exists, so no input path
            can be built.
    """
    # NOTE(review): the first probe checks H:/ but the prefix used is J: --
    # confirm that this is intended.
    if os.path.isdir('H:/'):
        j = 'J:'
    elif os.path.isdir('/home/j/'):
        j = '/home/j'
    else:
        message = 'What am I supposed to do?'
        print_and_log(message)
        # Fail here with a clear error; falling through would only crash a
        # few lines later on the unassigned prefix.
        raise RuntimeError(message)

    nrows = None  # number of rows of file to read; make it small for running local jobs
    map_dir = '%s/Project/us_counties/mortality/data_prep/counties/01_clean_microdata/state_map.csv' % j

    yearvals = [1992]

    for year in yearvals:
        if 1968 <= year < 1989:
            us_indir = "%s/DATA/USA/VR/%d/USA_VITAL_STATISTICS_%d_MORTALITY.TXT" % (
                j, year, year)
            ps_indir = 'NONE'
        elif 1989 <= year < 1994:
            # for these years get_filepaths gives back a single file name
            fname = get_filepaths(year)
            us_indir = '%s/LIMITED_USE/PROJECT_FOLDERS/USA/NVSS_MORTALITY/%d/%s' % (
                j, year, fname)
            ps_indir = 'NONE'
        else:
            # otherwise it is indexed by 'US' and 'PS', one file each
            fname = get_filepaths(year)
            us_indir = '%s/LIMITED_USE/PROJECT_FOLDERS/USA/NVSS_MORTALITY/%d/%s' % (
                j, year, fname['US'])
            ps_indir = '%s/LIMITED_USE/PROJECT_FOLDERS/USA/NVSS_MORTALITY/%d/%s' % (
                j, year, fname['PS'])

        rlog.open(
            '%s/temp/amelia/counties/parse_death_files/debug_parse_%d.log' %
            (j, year))
        rlog.log('Hello, world')

        rlog.log('Initializing')

        cod_data, cod_data_raw, cod_missingness = parse_cod_mortality(
            year, us_indir, ps_indir, map_dir, nrows)

    return cod_data, cod_data_raw, cod_missingness
Пример #2
0
# Cast the job arguments to int; env_id and late_id are compared against
# the cause_id column further down.
year = int(year)
env_id = int(env_id)
late_id = int(late_id)
cause = [env_id, late_id]

# get list of locations
locations = maternal_fns.get_locations()

# set up columns we want to subset
columns = maternal_fns.filter_cols()
index_cols = [col for col in columns if not col.startswith('draw_')]

# logging
# The template needs one %s per argument: with no conversion specifier,
# applying % to the two-item tuple raises TypeError.
# NOTE(review): 'FILEPATH' is a placeholder -- confirm the real file name.
rlog.open('%s/FILEPATH_%s.log' % (log_dir, year))
rlog.log('')
rlog.log('Starting to get late cause fractions')

##############################################
# GET LATE CAUSE FRACTIONS:
##############################################
# pull codcorrect draws for both causes in a single call
codcorrect_df = draws(gbd_ids={'cause_ids': [env_id, late_id]},
                      source='codcorrect', year_ids=[year], sex_ids=[2],
                      measure_ids=[1])
codcorrect_df['measure_id'] = 1
# keep age_group_id 7 through 15 only
codcorrect_df = codcorrect_df[codcorrect_df.age_group_id.isin(range(7, 16))]

# split the rows by cause
envelope_df = codcorrect_df[codcorrect_df.cause_id == env_id]
late_df = codcorrect_df[codcorrect_df.cause_id == late_id]

# we only want index_cols and draws as columns
Пример #3
0
# positional command-line arguments
(log_dir, year, dalynator_dir,
 env_id, late_id, out_dir) = sys.argv[1:7]

# the year and the two cause ids are needed as integers
year, env_id, late_id = int(year), int(env_id), int(late_id)
cause = [env_id, late_id]

# locations to loop over
locations = maternal_fns.get_locations()

# columns to keep when subsetting
columns = maternal_fns.filter_cols()

# open the log for this year and write the opening lines
rlog.open('%s/dalynator_late_%s.log' % (log_dir, year))
rlog.log('')
rlog.log('Starting to get late cause fractions')

##############################################
# GET LATE CAUSE FRACTIONS:
##############################################
for geo in locations:
    fname = 'draws_%s_%s.h5' % (geo, year)

    # dalynator files are saved as loc/year, with age, sex and cause inside
    try:
        dalynator_df = pd.read_hdf('%s/%s/%s' % (dalynator_dir, geo, fname),
                                   'data',
                                   where=[("'cause_id'==%s & 'measure_id'==1"
                                           "& 'metric_id'==1 & 'sex_id'==2"
                                           "& 'rei_id'==0") % cause])
Пример #4
0
# We do this for subcauses. Next, we proportionately scale the cause
# fractions so they sum to one across subcauses. Timing scaling and
# interpolation is done in Step 3, after codcorrect.
##########################################################################

# Maps each five-year step from 1990 to 2010 onto the year five years later
# (1990 -> 1995, ..., 2010 -> 2015).
interp_yearvals = dict(
    (first_year, first_year + 5) for first_year in range(1990, 2011, 5))
# dismod files: model_vers_id/full/draws/{location_id}_{year_id}_{sex_id}.h5
dismod_dir = '/ihme/epi/panda_cascade/prod'

# 'step 1' refers both to interpolation and fraction scaling.
print maternal_fns.check_dependencies(1)
if maternal_fns.check_dependencies(1):
    rlog.log("On Step 1")
    step_df = dep_map.ix[dep_map.step == 1]

    # make output directories
    for target_id in pd.unique(step_df.target_id):
        maternal_fns.check_dir('%s/%s' % (cluster_dir, target_id))

    ##############################
    # INTERPOLATION
    ###############################
    rlog.log('Interpolating subcause cause fractions')
    # set in and out directories for interpolation
    for index, row in step_df.iterrows():
        if row['source_id'] != 9015:  # don't run for HIV!
            dismod_me_id = row['source_id']
            dismod_model_vers = maternal_fns.get_model_vers(
Пример #5
0
    print 'Where am I supposed to go?'

##############################################
# PREP WORK:
# set directories and other preliminary data
##############################################

print('starting job!')

# positional command-line arguments; year is needed as an integer
(log_dir, jobname, dismod_dir,
 cluster_dir, year) = sys.argv[1:6]
year = int(year)

# open the log named after this job
rlog.open('%s/%s.log' % (log_dir, jobname))
rlog.log('Starting scale fractions step')

# locations to process, and how many of them there are
locations = maternal_fns.get_locations()
geo_length = len(locations)

# set up database
enginer = dbapis.engine_factory()

# columns to keep when subsetting
columns = maternal_fns.filter_cols()

# read the dependency map, then drop columns that are empty in every row
dep_map = pd.read_csv("dependency_map.csv", header=0)
dep_map = dep_map.dropna(axis='columns', how='all')
Пример #6
0
# Set `j` from the first of the two known directories that exists; when
# neither does, only a message is printed and `j` stays unset.
for probe_dir, drive_root in (('J:/', 'J:'), ('/home/j/', '/home/j')):
    if os.path.isdir(probe_dir):
        j = drive_root
        break
else:
    print('Where am I supposed to go?')

# positional command-line arguments
(log_dir, jobname, source_dir, out_dir,
 start_year_str, end_year_str) = sys.argv[1:7]

# the two years are needed as integers
start_year, end_year = int(start_year_str), int(end_year_str)

# open the log named after this job and record both directories
rlog.open('%s/%s.log' % (log_dir, jobname))
rlog.log('source_dir is %s' % source_dir)
rlog.log('out_dir is %s' % out_dir)

# locations to loop over
locations = maternal_fns.get_locations()

# columns to keep when subsetting
columns = maternal_fns.filter_cols()

for geo_idx, geo in enumerate(locations):

    rlog.log('interpolating for place %s' % geo)
    rlog.log('place is number %s of %s' % (geo_idx, len(locations)))

    rlog.log('getting data')
    start_dir = '%s/%s_%s_2.h5' % (source_dir, geo, start_year)
Пример #7
0
def print_and_log(phrase):
    """Print ``phrase`` and also pass it to ``rlog.log``."""
    # rlog is imported on each call, inside the function
    from PyJobTools import rlog
    print(phrase)
    rlog.log(phrase)
Пример #8
0
# Set `j` from the first of the two known directories that exists; when
# neither does, only a message is printed and `j` stays unset.
for probe_dir, drive_root in (('J:/', 'J:'), ('/home/j/', '/home/j')):
    if os.path.isdir(probe_dir):
        j = drive_root
        break
else:
    print('Where am I supposed to go?')

# positional command-line arguments
(log_dir, jobname, envelope_dir,
 prop_dir, out_dir) = sys.argv[1:6]

# locations to loop over
locations = maternal_fns.get_locations()

# open the log named after this job and record the output directory
rlog.open('%s/%s.log' % (log_dir, jobname))
rlog.log('out_dir is %s' % out_dir)

# columns to subset on: the filter_cols list plus year_id
columns = maternal_fns.filter_cols()
columns.append('year_id')
index_cols = ['year_id', 'age_group_id']

# concatenate dalynator draws into one df, if we're doing timings
if "timing" in jobname:
    files = []
    for root, dirnames, filenames in os.walk('%s' % envelope_dir):
        for filename in fnmatch.filter(filenames, '*.h5'):
            files.append(os.path.join(root, filename))

    def read_file(f):
        return pd.read_hdf(f,
Пример #9
0
from __future__ import division
import sys
from PyJobTools import rlog

import maternal_fns
from transmogrifier.gopher import draws

# positional command-line arguments
(log_dir, jobname, env_model_vers,
 source_id, target_id, out_dir) = sys.argv[1:7]

# locations to pull draws for
locations = maternal_fns.get_locations()

# open the log named after this job and record the output directory
rlog.open('%s/%s.log' % (log_dir, jobname))
rlog.log('out_dir is %s' % out_dir)

# columns to subset on: the filter_cols list without measure_id
columns = maternal_fns.filter_cols()
columns.remove('measure_id')
# index columns are the remaining ones that are not draw columns
index_cols = [col for col in columns if not col.startswith('draw_')]

# read maternal disorders envelope
# CAUSES get multiplied by the Late corrected env from codem
# TIMINGS get multiplied by the CoDcorrect env
rlog.log("reading in envelope draws")
if 'timing' in jobname:
    env = draws(gbd_ids={'cause_ids': [366]}, source='codcorrect',
                measure_ids=[1], sex_ids=[2], location_ids=locations)
else:
    env = draws(gbd_ids={'cause_ids': [366]}, source='codem', sex_ids=[2],
Пример #10
0
                      header=0).dropna(axis='columns', how='all')

# subset dep_map for the step that we're on
# (.loc is used in both branches: .ix is deprecated and gone from pandas 1.0,
# and for a boolean mask the two select the same rows)
if "timing" in jobname:
    # step 4 rows, leaving out those whose source is codcorrect
    step_df = dep_map.loc[(dep_map.step == 4)
                          & (dep_map.source_id != 'codcorrect')]
    held_constant_me = 376
else:
    step_df = dep_map.loc[dep_map.step == 1]
    held_constant_me = 9015

#######################################################################
# STEP 1: FOR EACH CAUSE, EXTRACT FILES, GET SUM BY GROUP + TOTAL SUM
#######################################################################
# announce the step on stdout and in the log
print('getting data')
rlog.log('getting data')
all_data = {}
summed_idx = 0

for index, row in step_df.iterrows():
    target_id = row['target_id']
    try:
        subtype_df = draws(
            gbd_ids={'modelable_entity_ids': [row['source_id']]},
            source='dismod',
            measure_ids=[18],
            sex_ids=[2],
            year_ids=[year])
    except (ValueError, OSError):  # pull data from where interp saves it
        subtype_df = pd.read_hdf(
            '%s/%s/%s_2.h5' % (cluster_dir, row['source_id'], year), 'draws')
Пример #11
0
    j = '/home/j'
else:
    print 'Where am I supposed to go?'

# positional command-line arguments
log_dir, jobname, envelope_dir, out_dir = sys.argv[1:5]

# do all the prep work
enginer = dbapis.engine_factory()
dep_map = pd.read_csv("dependency_map.csv", header=0).dropna(axis='columns',
                                                             how='all')
# .loc replaces the deprecated .ix (removed in pandas 1.0); with a boolean
# mask both select the same rows
step_df = dep_map.loc[dep_map.step == 2].reset_index()
index_cols = ['location_id', 'year_id', 'age_group_id', 'sex_id']

# logging
rlog.open('%s/%s.log' % (log_dir, jobname))
rlog.log("Correcting for the underreporting of Late Maternal deaths")
rlog.log('out_dir is %s' % out_dir)

# get list of locations
locs = maternal_fns.get_locations()

# get list of location/years that don't need correction
# (the csv is read from the current working directory)
rlog.log('Pulling in adjustment csv')
adjust_df = pd.read_csv('%s/late_maternal_correction.csv' % (os.getcwd()))
adjust_df = adjust_df[['location_id', 'year_id', 'subnationals', 'adj_factor']]

# get the adjustment factor for most-detailed level, not just countries
# rename is chained on the filtered rows rather than applied in place to a
# frame that was itself selected out of adjust_df
only_subnats = adjust_df[adjust_df.subnationals == 1].rename(
    columns={'location_id': 'parent_id'})

query = '''SELECT
Пример #12
0
from __future__ import division
import sys

from PyJobTools import rlog
from transmogrifier.draw_ops import interpolate
import maternal_fns

# positional command-line arguments
log_dir, jobname, me_id, out_dir, start_year_str, end_year_str = sys.argv[1:7]

# the two years are needed as integers
start_year = int(start_year_str)
end_year = int(end_year_str)

# logging
rlog.open('%s/%s.log' % (log_dir, jobname))
rlog.log('out_dir is %s' % out_dir)

# get list of locations
locations = maternal_fns.get_locations()

# Build the year list as real lists: `range(...) + range(...)` only works
# where range returns a list (Python 2) and raises TypeError on Python 3.
if start_year == 1990:
    # 1980-1989 plus 1991-1994 (1990 itself is left out), and the start year
    # is moved back to 1980.
    # NOTE(review): this branch does not use end_year -- confirm intended.
    yearlist = list(range(1980, 1990)) + list(range(1991, 1995))
    start_year = 1980
else:
    # every year strictly between the start and end years
    yearlist = list(range(start_year + 1, end_year))

# call central function to interpolate
rlog.log("Calling interpolate")
interp_df = interpolate(gbd_id_field='modelable_entity_id',
                        gbd_id=int(me_id),
                        source='dismod',