Example #1
    def __init__(self, mvid, cv_iter_id):
        self.mvid = mvid
        self.cv_iter_id = cv_iter_id
        try:
            j = job.Job('%s/%s' % (settings['cascade_ode_out_dir'], mvid))
            j.start()
        except IOError as e:
            logging.exception(e)
        except Exception as e:
            logging.exception(e)

        self.cascade = Cascade(mvid, reimport=True, cv_iter=cv_iter_id)
        self.meid = (
            self.cascade.model_version_meta.modelable_entity_id.values[0])
        if self.meid in [9422, 7695, 1175, 10352, 9309]:
            self.project = "proj_tb"
            self.is_tb = True
        else:
            self.project = "proj_dismod"
            self.is_tb = False
        self.logdir = '{}/{}'.format(settings['log_dir'], self.mvid)
        self.finished = False
        self.has_csmr = 'mtspecific' in self.cascade.data.integrand.unique()

        if cv_iter_id == 0:
            self.jt_dir = "{}/".format(self.cascade.root_dir)
        else:
            self.jt_dir = "{}/{}".format(self.cascade.root_dir, cv_iter_id)

        self.rerun_num = self.get_rerun_num()
        ijs = self.incomplete_jobs()

        # Run global once
        self.run_jobtree_global()

        # Check if retry limit has been exceeded
        if self.rerun_num > 3:
            elog_file = '{}/error{}.log'.format(self.cascade.root_dir,
                                                self.cv_iter_id)

            with open(elog_file, 'w') as log:
                log.write('Model is incomplete after two attempted relaunches')
                for ij in ijs:
                    log.write(str(ij))
            sys.exit()

        # Submit all jobs and the checking job, making sure the varnish
        # job waits for the checking job to complete
        varn_jobname = 'dm_%s_varnish' % (mvid)
        varn_job = sge.qstat(pattern=varn_jobname)
        varn_jid = int(varn_job.job_id.values[0])
        if len(ijs) > 0:
            jids = self.submit_cascade_jobs(ijs)
            pjid = self.resubmit_self_check(jids)
            sge.add_holds(varn_jid, pjid)
Example #2
def main():
    '''Read command line arguments and run dismod for all child location ids
    of the given location id.

    Args:
        mvid(int): model version id
        location_id(int): parent location id
        sex(str): one of 'male'/'female'
        year_id(int): year id
        debug(str, optional): If specified and value == 'debug', will run
            in serial instead of in parallel
    '''
    mvid = int(sys.argv[1])
    location_id = int(sys.argv[2])
    sex = sys.argv[3]
    y = int(sys.argv[4])
    cv_iter = int(sys.argv[5])

    setup_logger()
    log = logging.getLogger(__name__)
    log.info(
        "Starting cascade mvid {} loc {} sex {} year {} cv_iter {}".format(
            mvid, location_id, sex, y, cv_iter))
    # The cascade and parent information are shared across all subprocesses.
    # Make them globals to avoid the memory overhead of passing a copy to
    # each process.
    global cascade
    global cl_parent

    try:
        if sys.argv[6] == "debug":
            debug = True
        else:
            debug = False
    except IndexError:
        debug = False

    if sex == 'male':
        sex_id = 0.5
    elif sex == 'female':
        sex_id = -0.5

    log.info("Creating cascade")
    cascade = Cascade(mvid, reimport=False, cv_iter=cv_iter)
    log.info("Done with cascade")

    year_split_lvl = cascade.model_version_meta.fix_year.values[0] - 1
    lt = cascade.loctree
    this_lvl = lt.get_nodelvl_by_id(location_id)
    log.info("Generating cascade loc")
    if location_id == 1:
        cl_parent = Cascade_loc(location_id, 0, 2000, cascade, reimport=False)
    else:
        cl_parent = Cascade_loc(location_id,
                                sex_id,
                                y,
                                cascade,
                                reimport=False)
    num_children = len(lt.get_node_by_id(location_id).children)
    log.info("Done generating cascade loc")

    num_cpus = mp.cpu_count()

    num_workers = min(num_cpus, num_children, 10)
    if not debug:
        pool = mp.Pool(num_workers)

    # Run child locations
    arglist = []
    for child_loc in lt.get_node_by_id(location_id).children:
        if this_lvl >= (year_split_lvl - 1):
            full_timespan = False
        else:
            full_timespan = True
        arglist.append((child_loc.id, sex_id, y, full_timespan, debug))

    if debug:
        log.info('..... RUNNING IN SINGLE PROCESS DEBUG MODE .....')
        res = list(map(run_loc, arglist))
    else:
        log.info(
            "Running {} child locations in parallel with {} processes".format(
                len(arglist), num_workers))
        res = pool.map(run_loc, arglist)
        pool.close()
        pool.join()
        log.info("Done running")

    errors = ['%s: %s' % (str(r[0]), r[1]) for r in res if r[1] != 0]

    if len(errors) == 0:
        log.info("No errors found")
    else:
        num_errors = len(errors)
        error_msg = "; ".join(errors)
        log.error("Found {} errors for mvid {} loc {} sex {} year {} "
                  "cv_iter {}: {}".format(num_errors, mvid, location_id,
                                          sex, y, cv_iter, error_msg))
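
The docstring on main() above describes a purely positional command-line interface. The sketch below is illustrative only: the script name and every argument value are hypothetical placeholders, and it simply shows how the five required arguments plus the optional "debug" flag line up with sys.argv.

# Hypothetical driver for the script containing main() above.
# "run_children.py" and all argument values are placeholders.
import subprocess
import sys

mvid, location_id, sex, year_id, cv_iter = 12345, 102, "male", 2000, 0
cmd = [
    sys.executable, "run_children.py",           # placeholder script name
    str(mvid), str(location_id), sex, str(year_id), str(cv_iter),
    "debug",                                      # optional 6th arg: run serially
]
subprocess.check_call(cmd)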
Example #3
    cv_iter = int(sys.argv[5])

    try:
        if sys.argv[6] == "debug":
            debug = True
        else:
            debug = False
    except IndexError:
        debug = False

    if sex == 'male':
        sex_id = 0.5
    elif sex == 'female':
        sex_id = -0.5

    c = Cascade(mvid, reimport=False, cv_iter=cv_iter)

    try:
        j = job.Job(os.path.normpath(os.path.join(c.root_dir, '..')))
        j.start()
    except IOError as e:
        logging.exception(e)
    except Exception as e:
        logging.exception(e)

    year_split_lvl = c.model_version_meta.fix_year.values[0] - 1
    lt = c.loctree
    this_lvl = lt.get_nodelvl_by_id(location_id)
    if location_id == 1:
        cl_parent = Cascade_loc(location_id, 0, 2000, c, reimport=False)
    else:
Example #4
            varn_jid = sge.qsub(
                finfile,
                varn_jobname,
                project=project,
                holds=jids,
                slots=15,
                memory=30,
                parameters=[mvid],
                conda_env='cascade_ode',
                prepend_to_path='/ihme/code/central_comp/anaconda/bin',
                stderr='%s/%s.error' % (logdir, varn_jobname))
            sys.exit()
        else:
            cv_iter = 0

    cascade = Cascade(mvid, reimport=False, cv_iter=cv_iter)
    has_csmr = 'mtspecific' in cascade.data.integrand.unique()
    csmr_cause_id = cascade.model_version_meta.add_csmr_cause.values[0]
    if csmr_cause_id is None:
        csmr_cause_id = np.nan
    ccvid = cascade.model_version_meta.csmr_cod_output_version_id.values[0]
    meid = cascade.model_version_meta.modelable_entity_id.values[0]
    if meid in [9422, 7695, 1175]:
        project = "proj_tb"
    else:
        project = "proj_dismod"
    user = getpass.getuser()
    remdf = cascade.model_params.query(
        'parameter_type_id == 1 & measure_id == 7')
    if len(remdf) > 0:
        remdf = remdf[[
Example #5

if __name__ == "__main__":

    mvid = int(sys.argv[1])
    super_id = int(sys.argv[2])
    sex = sys.argv[3]
    y = int(sys.argv[4])

    if sex == 'male':
        sex_id = 0.5
    elif sex == 'female':
        sex_id = -0.5

    cl_worlds = {}
    c = Cascade(mvid, reimport=False)
    lt = c.loctree
    cl_world = Cascade_loc(1, 0, y, c, reimport=False)
    cl_worlds[y] = cl_world

    num_cpus = mp.cpu_count()
    pool = mp.Pool(min(num_cpus, 8))

    cl_world = cl_worlds[y]
    cl_super = Cascade_loc(super_id,
                           sex_id,
                           y,
                           c,
                           parent_loc=cl_world,
                           reimport=False)
    cl_super.run_dismod()
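
Examples #2, #3, and #5 all repeat the same sex-to-sex_id mapping, and none of them handles an unexpected value (sex_id is simply left undefined). A tiny helper like the one below is not part of the original code, just a sketch of how the convention could be made explicit.

def sex_to_sex_id(sex):
    """Map 'male'/'female' to the 0.5 / -0.5 values used in these examples."""
    try:
        return {'male': 0.5, 'female': -0.5}[sex]
    except KeyError:
        raise ValueError("sex must be 'male' or 'female', got %r" % sex)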
Example #6
    def run_jobtree_global(self):
        """Sets up and runs the global 1 or more times, depending on the
        settings for EMR (which requires adjusted data from one run of the
        global model to calculate EMR to be fed into a second global model) and
        whether this is a cross validation instance. We will use cv_iter=0 to
        denote a 'full' model, whereas all non-zero cv_iters will be run on a
        randomly selected subset of the data"""

        log = logging.getLogger(__name__)
        log.info("Starting run_jobtree_global")

        csmr_cause_id = (
            self.cascade.model_version_meta.add_csmr_cause.values[0])
        if csmr_cause_id is None:
            csmr_cause_id = np.nan
        ccvid = self.cascade.model_version_meta.csmr_cod_output_version_id
        ccvid = ccvid.values[0]
        remdf = self.cascade.model_params.query(
            'parameter_type_id == 1 & measure_id == 7')
        if len(remdf) > 0:
            remdf = remdf[['parameter_type_id', 'measure_id', 'age_start',
                           'age_end', 'lower', 'mean', 'upper']]
        else:
            remdf = None

        should_run_emr = (self.rerun_num == 0 and self.cv_iter_id == 0
                          and (not np.isnan(csmr_cause_id) or self.has_csmr)
                          and (not self.is_tb))
        emr_disabled_setting = self.cascade.model_version_meta.get(
                'disable_emr', pd.Series())
        emr_is_disabled = emr_disabled_setting.unique().tolist() == [1]

        if should_run_emr and not emr_is_disabled:

            # Check whether there is a value constraint on EMR (in which case
            # we cannot compute EMR)
            emr_prior = self.cascade.model_params.query(
                'parameter_type_id == 1 & measure_id == 9')
            if len(emr_prior) == 1:
                zero_EMR_prior = (emr_prior.lower.squeeze() == 0 and
                                  emr_prior.upper.squeeze() == 0 and
                                  emr_prior.age_start.squeeze() == 0 and
                                  emr_prior.age_end.squeeze() >= 100)
                if zero_EMR_prior:
                    raise InvalidSettings("Cannot set a value prior of 0 for "
                                          "EMR for ages 0-100 while also "
                                          "triggering EMR calculation via "
                                          "cause/remission settings")

            # Set the commit hash here
            upload.update_model_status(self.mvid, upload.RUNNING)
            try:
                commit_hash = sge.get_commit_hash(
                    dir='%s/..' % drill.this_path)
            except subprocess.CalledProcessError:
                commit_hash = __version__

            upload.set_commit_hash(self.mvid, commit_hash)

            # Use CSMR data from codcorrect if requested, otherwise
            # use the user-provided data
            if np.isnan(csmr_cause_id):
                csmr_type = "custom"
            else:
                csmr_type = "cod"

            # Run the world once for emr calculation
            log.info("Run world once for emr")
            update_run_time(self.mvid)
            run_world(2000, self.cascade, drop_emr=True)
            try:
                dismod_emr(self.mvid,
                           envr=settings['env_variables']['ENVIRONMENT_NAME'],
                           remission_df=remdf, csmr_type=csmr_type)
            except (NoNonZeroValues, InsufficientInputs, NoEMRCalculated) as e:
                log.info((
                    "EMR calculation started but did not complete due to "
                    "insufficient inputs. Skipping. Error was '{}'".format(e)))
            else:
                log.info("Emr done")

            # ... then re-import the cascade and re-run the world
            update_run_time(self.mvid)
            log.info("Rerunning world")
            self.cascade = Cascade(self.mvid,
                                   reimport=True,
                                   cv_iter=self.cv_iter_id)
            run_world(2000, self.cascade, reimport=True)
            log.info("world done rerunning")

        elif self.rerun_num == 0 and self.cv_iter_id == 0:
            update_run_time(self.mvid)
            upload.update_model_status(self.mvid, upload.RUNNING)
            log.info("Running world")
            run_world(2000, self.cascade)
            log.info("Done Running world")

        elif self.rerun_num == 0:
            update_run_time(self.mvid)
            upload.update_model_status(self.mvid, upload.RUNNING)
            run_world(2000, self.cascade)
        else:
            # not first run, do nothing
            pass
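
For orientation, the branching in run_jobtree_global boils down to the sketch below. It is only a condensed restatement of the logic above; the function name and return strings are illustrative, and the real method also updates model status, records a commit hash, and re-imports the cascade between runs.

import numpy as np

def global_run_plan(rerun_num, cv_iter_id, csmr_cause_id, has_csmr, is_tb,
                    emr_disabled=False):
    """Summarize which global run(s) run_jobtree_global would perform."""
    # csmr_cause_id is expected as a float (np.nan when no cause is attached).
    if rerun_num != 0:
        return "relaunch: the global model is not rerun"
    wants_emr = (not np.isnan(csmr_cause_id)) or has_csmr
    if cv_iter_id == 0 and wants_emr and not is_tb and not emr_disabled:
        return "two global runs: one to derive EMR, one with EMR folded back in"
    return "one global run"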
Example #7
    def __init__(self, mvid, cv_iter_id):
        ''' CascadeJobTree manages a full run of the cascade. It's responsible
        for launching dismod jobs all the way down the location hierarchy.'''
        self.mvid = mvid
        self.cv_iter_id = cv_iter_id

        self.logdir = '{}/{}'.format(settings['log_dir'], self.mvid)
        log = logging.getLogger(__name__)
        log.info(
            "Beginning CascadeJobTree with mvid {} and cv_iter_id {}".format(
                mvid, cv_iter_id))

        log.info("Beginning cascade creation")
        self.cascade = Cascade(mvid, reimport=True, cv_iter=cv_iter_id)
        log.info("Done with cascade creation")

        self.meid = (
            self.cascade.model_version_meta.modelable_entity_id.values[0])
        if self.meid in [9422, 7695, 1175, 10352, 9309]:
            self.project = "proj_tb"
            self.is_tb = True
        else:
            self.project = "proj_dismod"
            self.is_tb = False
        self.finished = False
        self.has_csmr = 'mtspecific' in self.cascade.data.integrand.unique()

        if cv_iter_id == 0:
            self.jt_dir = "{}/".format(self.cascade.root_dir)
        else:
            self.jt_dir = "{}/{}".format(self.cascade.root_dir, cv_iter_id)

        self.rerun_num = self.get_rerun_num()
        ijs = self.incomplete_jobs()

        # Run global once
        self.run_jobtree_global()

        # Check if retry limit has been exceeded
        if self.rerun_num > 3:
            elog_file = '{}/error{}.log'.format(self.cascade.root_dir,
                                                self.cv_iter_id)

            with open(elog_file, 'w') as log_file:
                err = 'Model is incomplete after two attempted relaunches'
                log_file.write(err)
                log.error(err)
                for ij in ijs:
                    log_file.write(str(ij))
            sys.exit()

        # Submit all jobs and the checking job, making sure the varnish
        # job waits for the checking job to complete
        varn_jobname = 'dm_%s_varnish' % (mvid)
        varn_job = sge.qstat_w_retry(pattern=varn_jobname)
        varn_jid = int(varn_job.job_id.values[0])
        if len(ijs) > 0:
            log.info("Submitting {} cascade jobs".format(len(ijs)))
            jids = self.submit_cascade_jobs(ijs)
            log.info("Done submitting cascade jobs")
            pjid = self.resubmit_self_check(jids)
            sge.add_holds(varn_jid, pjid)
        else:
            log.info("No cascade jobs submitted")
Example #8
    def run_jobtree_global(self):
        """Sets up and runs the global 1 or more times, depending on the
        settings for EMR (which requires adjusted data from one run of the
        global model to calculate EMR to be fed into a second global model) and
        whether this is a cross validation instance. We will use cv_iter=0 to
        denote a 'full' model, whereas all non-zero cv_iters will be run on a
        randomly selected subset of the data"""

        csmr_cause_id = (
            self.cascade.model_version_meta.add_csmr_cause.values[0])
        if csmr_cause_id is None:
            csmr_cause_id = np.nan
        ccvid = self.cascade.model_version_meta.csmr_cod_output_version_id
        ccvid = ccvid.values[0]
        remdf = self.cascade.model_params.query(
            'parameter_type_id == 1 & measure_id == 7')
        if len(remdf) > 0:
            remdf = remdf[[
                'parameter_type_id', 'measure_id', 'age_start', 'age_end',
                'lower', 'mean', 'upper'
            ]]
        else:
            remdf = None
        if (self.rerun_num == 0 and self.cv_iter_id == 0
                and (not np.isnan(csmr_cause_id) or self.has_csmr)
                and (not self.is_tb)):

            # Check whether there is a value constraint on EMR (in which case
            # we cannot compute EMR)
            emr_prior = self.cascade.model_params.query(
                'parameter_type_id == 1 & measure_id == 9')
            if len(emr_prior) == 1:
                zero_EMR_prior = (emr_prior.lower.squeeze() == 0
                                  and emr_prior.upper.squeeze() == 0
                                  and emr_prior.age_start.squeeze() == 0
                                  and emr_prior.age_end.squeeze() >= 100)
                if zero_EMR_prior:
                    raise InvalidSettings("Cannot set a value prior of 0 for "
                                          "EMR for ages 0-100 while also "
                                          "triggering EMR calculation via "
                                          "cause/remission settings")

            upload.update_model_status(self.mvid, -1)
            commit_hash = sge.get_commit_hash(dir='%s/..' % drill.this_path)
            upload.set_commit_hash(self.mvid, commit_hash)

            # Use CSMR data from codcorrect if requested, otherwise
            # use the user-provided data
            if np.isnan(csmr_cause_id):
                csmr_type = "custom"
            else:
                csmr_type = "cod"

            # Run the world once for emr calculation
            update_run_time(self.mvid)
            run_world(2000, self.cascade, drop_emr=True)
            dismod_emr(self.mvid,
                       envr='prod',
                       remission_df=remdf,
                       csmr_type=csmr_type)

            # ... then re-import the cascade and re-run the world
            update_run_time(self.mvid)
            self.cascade = Cascade(self.mvid,
                                   reimport=True,
                                   cv_iter=self.cv_iter_id)
            run_world(2000, self.cascade, reimport=True)

        elif self.rerun_num == 0 and self.cv_iter_id == 0:
            update_run_time(self.mvid)
            upload.update_model_status(self.mvid, -1)
            run_world(2000, self.cascade)

        elif self.rerun_num == 0:
            update_run_time(self.mvid)
            upload.update_model_status(self.mvid, -1)
            run_world(2000, self.cascade)