def submit_driver(
        mvid, project, dirs, extra_arguments=None, driver_arguments=None):
    """
    Queue the cascade driver job (``driver.py``) on the cluster.

    Per the original description, the driver job runs jobmon, which
    manages all subsequent cascade jobs.

    Slots and memory for the job come from
    ``sge.cluster_limits('driver', mvm=None)``; the runtime value it also
    returns is not used here.

    Args:
        mvid (str): model version ID; used in the job name
            (``dm_<mvid>_driver``) and passed as the first command-line
            parameter.
        project (str): The name of the proj, eg. ``proj_dismod``
        dirs (dict): Dictionary of directory locations. Only
            ``dirs['logdir']`` is read here, for the stderr file path.
        extra_arguments (List[str]): command-line arguments to add to
            every Jobmon job. ``None`` or empty means no arguments.
        driver_arguments (List[str]): command-line arguments just for the
            driver.py job that gets launched here. ``None`` or empty means
            no arguments.

    Returns:
        None. The result of ``sge.qsub_w_retry`` is not returned.
    """
    log_directory = dirs['logdir']
    driver_script = os.path.join(settings['code_dir'], "driver.py")
    driver_jobname = 'dm_%s_driver' % mvid
    n_slots, mem, _unused_runtime = sge.cluster_limits('driver', mvm=None)

    # A missing or empty argument list means "no additional arguments".
    extra_arguments = extra_arguments or []
    driver_arguments = driver_arguments or []

    # Driver-only arguments come before the ones shared with every job.
    cli_parameters = [mvid] + driver_arguments + extra_arguments

    sge.qsub_w_retry(
        driver_script,
        driver_jobname,
        jobtype='python',
        project=project,
        slots=n_slots,
        memory=mem,
        parameters=cli_parameters,
        conda_env=settings['conda_env'],
        environment_variables=settings['env_variables'],
        prepend_to_path=os.path.join(settings['conda_root'], 'bin'),
        stderr='%s/%s.error' % (log_directory, driver_jobname))
def submit_varnish(self, hold_jids):
    """Submit the job that 'varnishes' this run.

    Per the original description, the varnish job:

        1. Uploads fits
        2. Uploads adjusted data
        3. Computes fit statistics
        4. Uploads fit statistics
        5. Attempts to generate diagnostic plots
        6. Computes finals
        7. Uploads finals
        8. Updates the status of the model to finished

    The job runs ``finfile`` with the model version ID as its only
    parameter, requesting ``slots=35`` and ``memory=180``.

    Args:
        hold_jids: passed as ``holds`` to ``sge.qsub_w_retry``
            (presumably the job IDs this job must wait on; confirm
            against ``sge``).

    Returns:
        Whatever ``sge.qsub_w_retry`` returns for the submitted job.
    """
    jobname = 'dm_%s_varnish' % self.mvid
    return sge.qsub_w_retry(
        finfile,
        jobname,
        project=self.project,
        slots=35,
        memory=180,
        parameters=[self.mvid],
        holds=hold_jids,
        conda_env=settings['conda_env'],
        prepend_to_path=os.path.join(settings['conda_root'], 'bin'),
        environment_variables=settings['env_variables'],
        stderr='%s/%s.error' % (self.logdir, jobname))
def submit_jobtree(self, cv_iter):
    """Submit a jobtree job for one full/cross-validation iteration.

    Per the original description, the jobtree manages a given
    full/cross-validation run from global on down through the cascade.

    The job runs ``gfile`` with ``--submit_stage jt --cv_iter <cv_iter>``
    and is named ``dm_<mvid>_G<cv_iter>``. Both stderr and stdout are
    redirected into ``self.logdir``.

    Args:
        cv_iter: identifier of the iteration; used in the job name and
            passed through as the ``--cv_iter`` value.

    Returns:
        Whatever ``sge.qsub_w_retry`` returns for the submitted job.
    """
    tree_jobname = 'dm_{}_G{}'.format(self.mvid, cv_iter)
    stage_parameters = [
        self.mvid,
        '--submit_stage', 'jt',
        '--cv_iter', cv_iter,
    ]
    return sge.qsub_w_retry(
        gfile,
        tree_jobname,
        project=self.project,
        slots=20,
        memory=40,
        parameters=stage_parameters,
        conda_env=settings['conda_env'],
        prepend_to_path=os.path.join(settings['conda_root'], 'bin'),
        environment_variables=settings['env_variables'],
        stderr='{}/{}.error'.format(self.logdir, tree_jobname),
        stdout='{}/{}.stdout'.format(self.logdir, tree_jobname))
def resubmit_self_check(self, hold_jids):
    """Submit a follow-up jobtree job held on ``hold_jids``.

    Per the original description, the submitted job checks that all child
    location-year-sex groups have run successfully and, if any have
    failed, resubmits the below-global levels of the cascade (i.e. the
    submit_cascade function).

    The job runs ``gfile`` with
    ``--submit_stage jt --cv_iter <self.cv_iter_id>`` and is named
    ``dm_<mvid>_G<cv_iter_id>``. Only stderr is redirected.

    Args:
        hold_jids: passed as ``holds`` to ``sge.qsub_w_retry``
            (presumably the job IDs this job must wait on; confirm
            against ``sge``).

    Returns:
        Whatever ``sge.qsub_w_retry`` returns for the submitted job.
    """
    check_jobname = 'dm_{}_G{}'.format(self.mvid, self.cv_iter_id)
    # Keys are listed in the order the keyword arguments were originally
    # written, so each value is evaluated in the same sequence.
    qsub_options = {
        'project': self.project,
        'slots': 20,
        'memory': 40,
        'holds': hold_jids,
        'parameters': [
            self.mvid, '--submit_stage', 'jt', '--cv_iter', self.cv_iter_id],
        'conda_env': settings['conda_env'],
        'prepend_to_path': os.path.join(settings['conda_root'], 'bin'),
        'environment_variables': settings['env_variables'],
        'stderr': '{}/{}.error'.format(self.logdir, check_jobname),
    }
    return sge.qsub_w_retry(gfile, check_jobname, **qsub_options)
def submit_global(mvid, project, dirs):
    """
    Queue the global dismod_ode job (``run_global.py``).

    Per the original description, this job will attempt to run the entire
    cascade. It is named ``dm_<mvid>_boot`` and requests ``slots=15`` and
    ``memory=30``.

    Args:
        mvid: model version ID; used in the job name and passed as the
            only command-line parameter.
        project: project name forwarded to ``sge.qsub_w_retry``.
        dirs (dict): directory locations. Only ``dirs['logdir']`` is read
            here, for the stderr file path.

    Returns:
        Whatever ``sge.qsub_w_retry`` returns for the submitted job.
    """
    log_directory = dirs['logdir']
    global_script = os.path.join(settings['code_dir'], "run_global.py")
    boot_jobname = 'dm_%s_boot' % mvid
    return sge.qsub_w_retry(
        global_script,
        boot_jobname,
        jobtype='python',
        project=project,
        slots=15,
        memory=30,
        parameters=[mvid],
        conda_env=settings['conda_env'],
        environment_variables=settings['env_variables'],
        prepend_to_path=os.path.join(settings['conda_root'], 'bin'),
        stderr='%s/%s.error' % (log_directory, boot_jobname))
def dependent_submit(location_id, hold_ids):
    """Recursively submit jobs for a location and its descendants.

    For a location with children, one job is submitted per year in
    ``demo.year_ids`` whose ``(location_id, sex, year, cv_iter_id)`` key
    is in ``incomplete_jobs``. Each submitted job holds on ``hold_ids``.
    Its id is recorded in the enclosing ``all_jids`` list, and the ids
    from this location become the holds for every child location.

    Args:
        location_id: id of the location node to process.
        hold_ids: job ids passed as ``holds`` for this location's jobs.

    Returns:
        0 for a location without children (nothing is submitted);
        otherwise None.
    """
    node = self.cascade.loctree.get_node_by_id(location_id)
    child_count = len(node.children)
    if child_count == 0:
        return 0

    # The slot request depends only on the location, not on the year:
    # location 1 always gets 20, others scale with children up to 20.
    if location_id == 1:
        slot_count = 20
    else:
        slot_count = min(20, child_count * 2)

    submitted = []
    for year in demo.year_ids:
        job_name = "dm_%s_%s_%s_%s_%s" % (
            self.mvid, location_id, sex[0], str(year)[2:],
            self.cv_iter_id)
        job_key = (location_id, sex, year, self.cv_iter_id)
        if job_key not in incomplete_jobs:
            continue

        job_params = [self.mvid, location_id, sex, year, self.cv_iter_id]
        new_jid = sge.qsub_w_retry(
            cfile,
            job_name,
            project=self.project,
            holds=hold_ids,
            slots=slot_count,
            memory=int(math.ceil(slot_count * 2.5)),
            parameters=job_params,
            conda_env=settings['conda_env'],
            prepend_to_path=os.path.join(settings['conda_root'], 'bin'),
            environment_variables=settings['env_variables'],
            stderr='%s/%s.error' % (self.logdir, job_name))
        submitted.append(new_jid)
        all_jids.append(new_jid)

    # Children wait on every job submitted for this location.
    for child in node.children:
        dependent_submit(child.id, submitted)
def dependent_submit(location_id, hold_ids):
    """Recursively submit cascade jobs for a location and its descendants.

    For a location with children, a job is submitted when
    ``(location_id, sex, y)`` is in ``run_set`` (``sex``, ``y``, ``mvid``
    and ``run_set`` come from the enclosing scope). The job holds on
    ``hold_ids``. Child locations then hold on that job, or on nothing if
    no job was submitted for this location.

    Args:
        location_id: id of the location node to process.
        hold_ids: job ids passed as ``holds`` for this location's job.

    Returns:
        0 for a location without children (nothing is submitted);
        otherwise None.
    """
    node = loctree.get_node_by_id(location_id)
    child_count = len(node.children)
    if child_count == 0:
        return 0

    # Holds handed to the children: this location's job, if any.
    child_holds = []
    if (location_id, sex, y) in run_set:
        job_name = "casc_%s_%s_%s" % (location_id, sex[0], str(y)[2:])
        slot_count = min(8, child_count)
        child_holds.append(
            sge.qsub_w_retry(
                runfile,
                job_name,
                holds=hold_ids,
                slots=slot_count,
                memory=slot_count * 2,
                parameters=[mvid, location_id, sex, y]))

    for child in node.children:
        dependent_submit(child.id, child_holds)