def __init__(self, mvid, cv_iter_id):
    self.mvid = mvid
    self.cv_iter_id = cv_iter_id

    try:
        j = job.Job('%s/%s' % (settings['cascade_ode_out_dir'], mvid))
        j.start()
    except IOError as e:
        logging.exception(e)
    except Exception as e:
        logging.exception(e)

    self.cascade = Cascade(mvid, reimport=True, cv_iter=cv_iter_id)
    self.meid = (
        self.cascade.model_version_meta.modelable_entity_id.values[0])
    if self.meid in [9422, 7695, 1175, 10352, 9309]:
        self.project = "proj_tb"
        self.is_tb = True
    else:
        self.project = "proj_dismod"
        self.is_tb = False
    self.logdir = '{}/{}'.format(settings['log_dir'], self.mvid)
    self.finished = False
    self.has_csmr = 'mtspecific' in self.cascade.data.integrand.unique()

    if cv_iter_id == 0:
        self.jt_dir = "{}/".format(self.cascade.root_dir)
    else:
        self.jt_dir = "{}/{}".format(self.cascade.root_dir, cv_iter_id)

    self.rerun_num = self.get_rerun_num()
    ijs = self.incomplete_jobs()

    # Run global once
    self.run_jobtree_global()

    # Check if retry limit has been exceeded
    if self.rerun_num > 3:
        elog_file = '{}/error{}.log'.format(self.cascade.root_dir,
                                            self.cv_iter_id)
        with open(elog_file, 'w') as log:
            log.write('Model is incomplete after three attempted relaunches')
            for ij in ijs:
                log.write(str(ij))
        sys.exit()

    # Submit all jobs and the checking job, making sure varnish waits
    # for the checking job to complete
    varn_jobname = 'dm_%s_varnish' % (mvid)
    varn_job = sge.qstat(pattern=varn_jobname)
    varn_jid = int(varn_job.job_id.values[0])
    if len(ijs) > 0:
        jids = self.submit_cascade_jobs(ijs)
        pjid = self.resubmit_self_check(jids)
        sge.add_holds(varn_jid, pjid)
def main():
    '''Read command line arguments to run dismod for all child location ids
    of the given location id.

    Args:
        mvid (int): model version id
        location_id (int): parent location id
        sex (str): one of 'male'/'female'
        year_id (int): year id
        debug (str, optional): if specified and value == 'debug', run in
            serial instead of in parallel
    '''
    mvid = int(sys.argv[1])
    location_id = int(sys.argv[2])
    sex = sys.argv[3]
    y = int(sys.argv[4])
    cv_iter = int(sys.argv[5])

    setup_logger()
    log = logging.getLogger(__name__)
    log.info(
        "Starting cascade mvid {} loc {} sex {} year {} cv_iter {}".format(
            mvid, location_id, sex, y, cv_iter))

    # The cascade and parent information are shared across all subprocesses.
    # Make them globals to avoid the memory overhead of passing a copy to
    # each process
    global cascade
    global cl_parent

    try:
        if sys.argv[6] == "debug":
            debug = True
        else:
            debug = False
    except IndexError:
        debug = False

    if sex == 'male':
        sex_id = 0.5
    elif sex == 'female':
        sex_id = -0.5

    log.info("Creating cascade")
    cascade = Cascade(mvid, reimport=False, cv_iter=cv_iter)
    log.info("Done with cascade")

    year_split_lvl = cascade.model_version_meta.fix_year.values[0] - 1
    lt = cascade.loctree
    this_lvl = lt.get_nodelvl_by_id(location_id)
    log.info("Generating cascade loc")
    if location_id == 1:
        cl_parent = Cascade_loc(location_id, 0, 2000, cascade,
                                reimport=False)
    else:
        cl_parent = Cascade_loc(location_id, sex_id, y, cascade,
                                reimport=False)
    num_children = len(lt.get_node_by_id(location_id).children)
    log.info("Done generating cascade loc")

    num_cpus = mp.cpu_count()
    num_workers = min(num_cpus, num_children, 10)
    if not debug:
        pool = mp.Pool(num_workers)

    # Run child locations
    arglist = []
    for child_loc in lt.get_node_by_id(location_id).children:
        if this_lvl >= (year_split_lvl - 1):
            full_timespan = False
        else:
            full_timespan = True
        arglist.append((child_loc.id, sex_id, y, full_timespan, debug))

    if debug:
        log.info('..... RUNNING IN SINGLE PROCESS DEBUG MODE .....')
        res = list(map(run_loc, arglist))
    else:
        log.info(
            "Running {} child locations in parallel with {} "
            "processes".format(len(arglist), num_workers))
        res = pool.map(run_loc, arglist)
        pool.close()
        pool.join()

    log.info("Done running")
    errors = ['%s: %s' % (str(r[0]), r[1]) for r in res if r[1] != 0]
    if len(errors) == 0:
        log.info("No errors found")
    else:
        num_errors = len(errors)
        error_msg = "; ".join(errors)
        log.error("Found {} errors for mvid {} loc {} sex {} year {} "
                  "cv_iter {}: {}".format(num_errors, mvid, location_id,
                                          sex, y, cv_iter, error_msg))
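# Usage sketch (not part of the original source): main() is driven entirely by
# positional sys.argv values, so a launcher can build an argv list and call it
# in-process. The helper name `_example_launch` and the default argument
# values below are illustrative assumptions only.
def _example_launch(mvid=12345, location_id=102, sex='male', year=2000,
                    cv_iter=0, debug=True):
    """Build the argv list main() expects and invoke it in-process."""
    argv = [sys.argv[0], str(mvid), str(location_id), sex, str(year),
            str(cv_iter)]
    if debug:
        # A sixth argument of "debug" forces serial execution via map()
        # instead of a multiprocessing pool.
        argv.append('debug')
    sys.argv = argv
    main()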
cv_iter = int(sys.argv[5])
try:
    if sys.argv[6] == "debug":
        debug = True
    else:
        debug = False
except IndexError:
    debug = False

if sex == 'male':
    sex_id = 0.5
elif sex == 'female':
    sex_id = -0.5

c = Cascade(mvid, reimport=False, cv_iter=cv_iter)
try:
    j = job.Job(os.path.normpath(os.path.join(c.root_dir, '..')))
    j.start()
except IOError as e:
    logging.exception(e)
except Exception as e:
    logging.exception(e)

year_split_lvl = c.model_version_meta.fix_year.values[0] - 1
lt = c.loctree
this_lvl = lt.get_nodelvl_by_id(location_id)
if location_id == 1:
    cl_parent = Cascade_loc(location_id, 0, 2000, c, reimport=False)
else:
    varn_jid = sge.qsub(
        finfile,
        varn_jobname,
        project=project,
        holds=jids,
        slots=15,
        memory=30,
        parameters=[mvid],
        conda_env='cascade_ode',
        prepend_to_path='/ihme/code/central_comp/anaconda/bin',
        stderr='%s/%s.error' % (logdir, varn_jobname))
    sys.exit()
else:
    cv_iter = 0

cascade = Cascade(mvid, reimport=False, cv_iter=cv_iter)
has_csmr = 'mtspecific' in cascade.data.integrand.unique()
csmr_cause_id = cascade.model_version_meta.add_csmr_cause.values[0]
if csmr_cause_id is None:
    csmr_cause_id = np.nan
ccvid = cascade.model_version_meta.csmr_cod_output_version_id.values[0]
meid = cascade.model_version_meta.modelable_entity_id.values[0]
if meid in [9422, 7695, 1175]:
    project = "proj_tb"
else:
    project = "proj_dismod"
user = getpass.getuser()

remdf = cascade.model_params.query(
    'parameter_type_id == 1 & measure_id == 7')
if len(remdf) > 0:
    remdf = remdf[[
if __name__ == "__main__":
    mvid = int(sys.argv[1])
    super_id = int(sys.argv[2])
    sex = sys.argv[3]
    y = int(sys.argv[4])

    if sex == 'male':
        sex_id = 0.5
    elif sex == 'female':
        sex_id = -0.5

    cl_worlds = {}
    c = Cascade(mvid, reimport=False)
    lt = c.loctree

    cl_world = Cascade_loc(1, 0, y, c, reimport=False)
    cl_worlds[y] = cl_world

    num_cpus = mp.cpu_count()
    pool = mp.Pool(min(num_cpus, 8))

    cl_world = cl_worlds[y]
    cl_super = Cascade_loc(super_id, sex_id, y, c, parent_loc=cl_world,
                           reimport=False)
    cl_super.run_dismod()
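# The sex -> sex_id mapping above (male -> 0.5, female -> -0.5) recurs in each
# entry point and silently leaves sex_id undefined for any other input. A
# hedged sketch of a shared helper; the name `sex_to_covariate` is an
# assumption, not part of the original code:
def sex_to_covariate(sex):
    """Map the command-line sex string to the +/-0.5 covariate value used
    by the cascade entry points."""
    mapping = {'male': 0.5, 'female': -0.5}
    try:
        return mapping[sex]
    except KeyError:
        raise ValueError(
            "sex must be 'male' or 'female', got {!r}".format(sex))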
def run_jobtree_global(self):
    """Set up and run the global model one or more times, depending on the
    EMR settings (EMR requires adjusted data from one run of the global
    model, which is then fed into a second global model) and on whether this
    is a cross-validation instance.

    cv_iter=0 denotes the 'full' model; all non-zero cv_iters run on a
    randomly selected subset of the data."""
    log = logging.getLogger(__name__)
    log.info("Starting run_jobtree_global")

    csmr_cause_id = (
        self.cascade.model_version_meta.add_csmr_cause.values[0])
    if csmr_cause_id is None:
        csmr_cause_id = np.nan
    ccvid = self.cascade.model_version_meta.csmr_cod_output_version_id
    ccvid = ccvid.values[0]

    remdf = self.cascade.model_params.query(
        'parameter_type_id == 1 & measure_id == 7')
    if len(remdf) > 0:
        remdf = remdf[['parameter_type_id', 'measure_id', 'age_start',
                       'age_end', 'lower', 'mean', 'upper']]
    else:
        remdf = None

    should_run_emr = (self.rerun_num == 0 and
                      self.cv_iter_id == 0 and
                      (not np.isnan(csmr_cause_id) or self.has_csmr) and
                      (not self.is_tb))
    emr_disabled_setting = self.cascade.model_version_meta.get(
        'disable_emr', pd.Series())
    emr_is_disabled = emr_disabled_setting.unique().tolist() == [1]

    if should_run_emr and not emr_is_disabled:
        # Check whether there is a value constraint on EMR (in which case
        # we cannot compute EMR)
        emr_prior = self.cascade.model_params.query(
            'parameter_type_id == 1 & measure_id == 9')
        if len(emr_prior) == 1:
            zero_EMR_prior = (emr_prior.lower.squeeze() == 0 and
                              emr_prior.upper.squeeze() == 0 and
                              emr_prior.age_start.squeeze() == 0 and
                              emr_prior.age_end.squeeze() >= 100)
            if zero_EMR_prior:
                raise InvalidSettings("Cannot set a value prior of 0 for "
                                      "EMR for ages 0-100 while also "
                                      "triggering EMR calculation via "
                                      "cause/remission settings")

        # Set the commit hash here
        upload.update_model_status(self.mvid, upload.RUNNING)
        try:
            commit_hash = sge.get_commit_hash(
                dir='%s/..' % drill.this_path)
        except subprocess.CalledProcessError:
            commit_hash = __version__
        upload.set_commit_hash(self.mvid, commit_hash)

        # Use CSMR data from codcorrect if requested, otherwise
        # use the user-provided data
        if np.isnan(csmr_cause_id):
            csmr_type = "custom"
        else:
            csmr_type = "cod"

        # Run the world once for emr calculation
        log.info("Run world once for emr")
        update_run_time(self.mvid)
        run_world(2000, self.cascade, drop_emr=True)
        try:
            dismod_emr(
                self.mvid,
                envr=settings['env_variables']['ENVIRONMENT_NAME'],
                remission_df=remdf,
                csmr_type=csmr_type)
        except (NoNonZeroValues, InsufficientInputs, NoEMRCalculated) as e:
            log.info(
                "EMR calculation started but did not complete due to "
                "insufficient inputs. Skipping. Error was '{}'".format(e))
        else:
            log.info("Emr done")

        # ... then re-import the cascade and re-run the world
        update_run_time(self.mvid)
        log.info("Rerunning world")
        self.cascade = Cascade(self.mvid, reimport=True,
                               cv_iter=self.cv_iter_id)
        run_world(2000, self.cascade, reimport=True)
        log.info("world done rerunning")
    elif self.rerun_num == 0 and self.cv_iter_id == 0:
        update_run_time(self.mvid)
        upload.update_model_status(self.mvid, upload.RUNNING)
        log.info("Running world")
        run_world(2000, self.cascade)
        log.info("Done Running world")
    elif self.rerun_num == 0:
        update_run_time(self.mvid)
        upload.update_model_status(self.mvid, upload.RUNNING)
        run_world(2000, self.cascade)
    else:
        # Not the first attempt; the global model already ran, so do nothing
        pass
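# The branching in run_jobtree_global can be summarized as a pure decision
# function. This is an illustrative sketch only (the function name and the
# returned labels are assumptions, not part of the original code): the EMR
# double-run happens only on the first attempt of the full (cv_iter 0) model,
# when a CSMR cause or mtspecific data is present, the model is not a TB
# model, and EMR has not been disabled.
def global_run_plan(rerun_num, cv_iter_id, has_csmr_cause_or_data, is_tb,
                    emr_disabled):
    """Return which global-run path run_jobtree_global would take."""
    if rerun_num != 0:
        return 'skip'               # global already ran on a prior attempt
    if (cv_iter_id == 0 and has_csmr_cause_or_data and not is_tb
            and not emr_disabled):
        return 'emr_double_run'     # run world, compute EMR, rerun world
    return 'single_run'             # run world once (full or cv subset)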
def __init__(self, mvid, cv_iter_id):
    '''CascadeJobTree manages a full run of the cascade. It is responsible
    for launching dismod jobs all the way down the location hierarchy.'''
    self.mvid = mvid
    self.cv_iter_id = cv_iter_id
    self.logdir = '{}/{}'.format(settings['log_dir'], self.mvid)

    log = logging.getLogger(__name__)
    log.info(
        "Beginning CascadeJobTree with mvid {} and cv_iter_id {}".format(
            mvid, cv_iter_id))

    log.info("Beginning cascade creation")
    self.cascade = Cascade(mvid, reimport=True, cv_iter=cv_iter_id)
    log.info("Done with cascade creation")

    self.meid = (
        self.cascade.model_version_meta.modelable_entity_id.values[0])
    if self.meid in [9422, 7695, 1175, 10352, 9309]:
        self.project = "proj_tb"
        self.is_tb = True
    else:
        self.project = "proj_dismod"
        self.is_tb = False
    self.finished = False
    self.has_csmr = 'mtspecific' in self.cascade.data.integrand.unique()

    if cv_iter_id == 0:
        self.jt_dir = "{}/".format(self.cascade.root_dir)
    else:
        self.jt_dir = "{}/{}".format(self.cascade.root_dir, cv_iter_id)

    self.rerun_num = self.get_rerun_num()
    ijs = self.incomplete_jobs()

    # Run global once
    self.run_jobtree_global()

    # Check if retry limit has been exceeded
    if self.rerun_num > 3:
        elog_file = '{}/error{}.log'.format(self.cascade.root_dir,
                                            self.cv_iter_id)
        with open(elog_file, 'w') as log_file:
            err = 'Model is incomplete after three attempted relaunches'
            log_file.write(err)
            log.error(err)
            for ij in ijs:
                log_file.write(str(ij))
        sys.exit()

    # Submit all jobs and the checking job, making sure varnish waits
    # for the checking job to complete
    varn_jobname = 'dm_%s_varnish' % (mvid)
    varn_job = sge.qstat_w_retry(pattern=varn_jobname)
    varn_jid = int(varn_job.job_id.values[0])
    if len(ijs) > 0:
        log.info("Submitting {} cascade jobs".format(len(ijs)))
        jids = self.submit_cascade_jobs(ijs)
        log.info("Done submitting cascade jobs")
        pjid = self.resubmit_self_check(jids)
        sge.add_holds(varn_jid, pjid)
    else:
        log.info("No cascade jobs submitted")
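# Illustrative driver sketch (not from the original source; the helper name
# and argument defaults are assumptions). Constructing a CascadeJobTree both
# submits the outstanding dismod jobs and wires the SGE holds, so a launcher
# would only need to instantiate it once per cross-validation iteration:
def launch_jobtrees(mvid, n_cv_iters=0):
    """Build the full-model jobtree (cv_iter 0) plus any cv iterations."""
    for cv_iter_id in range(n_cv_iters + 1):
        CascadeJobTree(mvid, cv_iter_id)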
def run_jobtree_global(self):
    """Set up and run the global model one or more times, depending on the
    EMR settings (EMR requires adjusted data from one run of the global
    model, which is then fed into a second global model) and on whether this
    is a cross-validation instance.

    cv_iter=0 denotes the 'full' model; all non-zero cv_iters run on a
    randomly selected subset of the data."""
    csmr_cause_id = (
        self.cascade.model_version_meta.add_csmr_cause.values[0])
    if csmr_cause_id is None:
        csmr_cause_id = np.nan
    ccvid = self.cascade.model_version_meta.csmr_cod_output_version_id
    ccvid = ccvid.values[0]

    remdf = self.cascade.model_params.query(
        'parameter_type_id == 1 & measure_id == 7')
    if len(remdf) > 0:
        remdf = remdf[[
            'parameter_type_id', 'measure_id', 'age_start', 'age_end',
            'lower', 'mean', 'upper']]
    else:
        remdf = None

    if (self.rerun_num == 0 and
            self.cv_iter_id == 0 and
            (not np.isnan(csmr_cause_id) or self.has_csmr) and
            (not self.is_tb)):
        # Check whether there is a value constraint on EMR (in which case
        # we cannot compute EMR)
        emr_prior = self.cascade.model_params.query(
            'parameter_type_id == 1 & measure_id == 9')
        if len(emr_prior) == 1:
            zero_EMR_prior = (emr_prior.lower.squeeze() == 0 and
                              emr_prior.upper.squeeze() == 0 and
                              emr_prior.age_start.squeeze() == 0 and
                              emr_prior.age_end.squeeze() >= 100)
            if zero_EMR_prior:
                raise InvalidSettings("Cannot set a value prior of 0 for "
                                      "EMR for ages 0-100 while also "
                                      "triggering EMR calculation via "
                                      "cause/remission settings")

        upload.update_model_status(self.mvid, -1)
        commit_hash = sge.get_commit_hash(dir='%s/..' % drill.this_path)
        upload.set_commit_hash(self.mvid, commit_hash)

        # Use CSMR data from codcorrect if requested, otherwise
        # use the user-provided data
        if np.isnan(csmr_cause_id):
            csmr_type = "custom"
        else:
            csmr_type = "cod"

        # Run the world once for emr calculation
        update_run_time(self.mvid)
        run_world(2000, self.cascade, drop_emr=True)
        dismod_emr(self.mvid, envr='prod', remission_df=remdf,
                   csmr_type=csmr_type)

        # ... then re-import the cascade and re-run the world
        update_run_time(self.mvid)
        self.cascade = Cascade(self.mvid, reimport=True,
                               cv_iter=self.cv_iter_id)
        run_world(2000, self.cascade, reimport=True)
    elif self.rerun_num == 0 and self.cv_iter_id == 0:
        update_run_time(self.mvid)
        upload.update_model_status(self.mvid, -1)
        run_world(2000, self.cascade)
    elif self.rerun_num == 0:
        update_run_time(self.mvid)
        upload.update_model_status(self.mvid, -1)
        run_world(2000, self.cascade)