Example #1
0
 def update_metadata(self, step_name, step_meta):
     """
     Store the metadata reported by a step and propagate any
     pipeline-level metadata it contains to the global pipeline metadata.

     :param step_name: name of the step the metadata belongs to
     :param step_meta: metadata dict reported by the step; may contain a
                       'pipeline' key holding pipeline-wide entries
     """
     self.meta['steps'][step_name] = step_meta
     if 'pipeline' in step_meta:
         # merge the step's pipeline-level entries into the global metadata
         ut.dict_update(self.meta['pipeline'], step_meta['pipeline'])
         # deep-copy so later in-memory mutations do not leak into the db layer
         self.db.update_pipeline_metadata(copy.deepcopy(self.meta['pipeline']))
     self.db.update_step_metadata(step_name, copy.deepcopy(self.meta['steps'][step_name]['step']))
Example #2
0
 def update_metadata(self, step_name, step_meta):
     """
     Store the metadata reported by a step and propagate any
     pipeline-level metadata it contains to the global pipeline metadata.

     :param step_name: name of the step the metadata belongs to
     :param step_meta: metadata dict reported by the step; may contain a
                       'pipeline' key holding pipeline-wide entries
     """
     self.meta['steps'][step_name] = step_meta
     if 'pipeline' in step_meta:
         # merge the step's pipeline-level entries into the global metadata
         ut.dict_update(self.meta['pipeline'], step_meta['pipeline'])
         # deep-copy so later in-memory mutations do not leak into the db
         self.db.update_pipeline_metadata(
             copy.deepcopy(self.meta['pipeline']))
     self.db.update_step_metadata(
         step_name, copy.deepcopy(self.meta['steps'][step_name]['step']))
Example #3
0
File: step.py  Project: fronga/pypers
    def __init__(self):
        """
        Initialize the step from its class-level ``spec``: derive the step
        name, pick a scheduler, expose declared parameters as attributes and
        apply resource requirements.
        """
        self.bootstrap = STARTUP_CYCLE
        self.status = JOB_STATUS.QUEUED
        self.meta = {'pipeline': {}, 'step': {}, 'job': {}}
        self.requirements = {'memory': '1', 'cpus': '1'}
        self.output_dir = '.'
        self.jobs = OrderedDict()
        self.cmd_count = 0

        logger.set_stdout_level(logger.DEBUG)
        self.log = logger.get_log()

        # derive the step name from the module path (last dotted component)
        self.spec["name"] = self.__module__.replace('nespipe.steps.',
                                                    '').split('.')[-1]
        self.name = self.spec["name"]
        self.__version__ = self.spec['version']

        # steps marked 'local' bypass the module-level scheduler
        self.local_step = self.spec.get('local', False)
        global scheduler
        if self.local_step:
            self.scheduler = get_scheduler("SCHED_LOCAL")
        else:
            self.scheduler = scheduler

        # expose every declared parameter as an instance attribute
        for _, params in self.spec["args"].iteritems():
            for param in params:
                if param.get('name', None):
                    setattr(self, param['name'], param.get('value', []))

        # spec-level requirements override the defaults set above (the old
        # code redundantly passed a copy of the defaults as the fallback)
        ut.dict_update(self.requirements, self.spec.get('requirements', {}))
        for k, v in self.requirements.iteritems():
            setattr(self, k, int(v))

        # reserve 90% of the memory budget for the jvm, clamped to at least 1
        if 'memory' in self.requirements:
            self.jvm_memory = int(int(self.requirements['memory']) * 0.9)
            if not self.jvm_memory:
                self.jvm_memory = 1
Example #4
0
    def __init__(self):
        """
        Initialize the step from its class-level ``spec``: derive the step
        name, pick a scheduler, expose declared parameters as attributes and
        apply resource requirements.
        """
        self.bootstrap = STARTUP_CYCLE
        self.status = JOB_STATUS.QUEUED
        self.meta = { 'pipeline':{}, 'step':{}, 'job':{}}
        self.requirements = {'memory' : '1', 'cpus' : '1'}
        self.output_dir = '.'
        self.jobs = OrderedDict()
        self.cmd_count = 0

        logger.set_stdout_level(logger.DEBUG)
        self.log = logger.get_log()

        # derive the step name from the module path (last dotted component)
        self.spec["name"] = self.__module__.replace('nespipe.steps.','').split('.')[-1]
        self.name = self.spec["name"]
        self.__version__ = self.spec['version']

        # steps marked 'local' bypass the module-level scheduler
        self.local_step = self.spec.get('local', False)
        global scheduler
        if self.local_step:
            self.scheduler = get_scheduler("SCHED_LOCAL")
        else:
            self.scheduler = scheduler

        # expose every declared parameter as an instance attribute
        for _, params in self.spec["args"].iteritems():
            for param in params:
                if param.get('name', None):
                    setattr(self, param['name'], param.get('value', []))

        # spec-level requirements override the defaults set above (the old
        # code redundantly passed a copy of the defaults as the fallback)
        ut.dict_update(self.requirements, self.spec.get('requirements', {}))
        for k, v in self.requirements.iteritems():
            setattr(self, k, int(v))

        # reserve 90% of the memory budget for the jvm, clamped to at least 1
        if 'memory' in self.requirements:
            self.jvm_memory = int(int(self.requirements['memory']) * 0.9)
            if not self.jvm_memory:
                self.jvm_memory = 1
Example #5
0
    def run_step(self, step_name):
        """
        Configure and run a job for the given step.

        :param step_name: name of the dag node to launch
        """

        #skip the input step: no job to run, just record its outputs
        if step_name == 'inputs':
            self.completed.append(step_name)
            self.outputs[step_name] = self.cfg['config']['steps'].get(step_name, {})
            self.outputs[step_name]['output_dir'] = ''
            self.db.update_step_status(step_name, JOB_STATUS.RUNNING)
            self.db.update_step_status(step_name, JOB_STATUS.SUCCEEDED)
            self.db.set_step_outputs(step_name, self.outputs[step_name])
        else:
            if self.one_step:
                step_config = self.cfg
                step_config['sys_path'] = self.sys_path
                step_config['output_dir'] = self.output_dir
                # bug fix: the skeleton was wrongly nested under an extra
                # 'meta' key, making the dict_update below raise KeyError
                step_config['meta'] = { 'pipeline':{}, 'step':{}, 'job':{} }
                ut.dict_update(step_config['meta']['pipeline'], self.meta['pipeline'])
            elif step_name == FINAL_STEP:
                step_config = { 'meta' : { 'pipeline':{}, 'step':{}, 'job':{} } }
                ut.dict_update(step_config['meta']['pipeline'], self.meta['pipeline'])
                step_config['name'] = FINAL_STEP
                step_config['step_class'] = self.dag.node[step_name]['class_name']
                step_config['target_dir'] = self.output_dir
                step_config['source_dir'] = self.work_dir
                step_config['output_dir'] = os.path.join(self.work_dir, step_name)
                self.configure_finalstep(step_config)
            else:
                step_config = { 'meta' : { 'pipeline':{}, 'step':{}, 'job':{} } }
                ut.dict_update(step_config['meta']['pipeline'], self.meta['pipeline'])
                step_class = self.dag.node[step_name]['class_name']
                step_config['name'] = step_name
                step_config['sys_path'] = self.sys_path
                step_config['step_class'] = step_class
                step_config['output_dir'] = os.path.join(self.work_dir, step_name)

                def _merge_output(to_key, out):
                    # extend an existing key (promoting a bare string to a
                    # single-element list first), otherwise set it
                    if to_key in step_config:
                        if isinstance(step_config[to_key], basestring):
                            step_config[to_key] = [step_config[to_key]]
                        step_config[to_key].extend(out)
                    else:
                        step_config[to_key] = out

                # 1. Form input keys
                # Remember: edges are labelled by 'from' keys
                for pred in self.dag.predecessors(step_name):
                    edge = self.dag[pred][step_name]
                    # Not an actual loop: just get key/value
                    for bind_to, bind_from in edge.get('bindings', {}).iteritems():
                        to_key = bind_to.split('.')[1]
                        if hasattr(bind_from, '__iter__'):
                            # list of 'from' keys: merge each one in turn
                            for from_key in bind_from:
                                _merge_output(to_key, self.outputs[pred][from_key.split('.')[1]])
                        else:
                            # single 'from' key (a plain string)
                            _merge_output(to_key, self.outputs[pred][bind_from.split('.')[1]])

                    # Transfer metadata of previous step to next step
                    for key in self.meta['steps'].get(pred, {}):
                        step_config['meta'][key] = self.meta['steps'][pred][key]

            # 2. Form step config: overlay user-provided step settings
            if not self.one_step:
                ut.dict_update(step_config, self.cfg['config']['steps'].get(step_name, {}), replace=False)
                if step_name == FINAL_STEP:
                    # final step: pass full pipeline metadata
                    step_config['meta'].update(self.meta)
                else:
                    self.update_metadata(step_name, step_config[KEY_META])

            # 3. Submit step
            self.log.info('Executing step %s' % str(step_name))
            self.log.debug('  step configuration:\n %s' % ut.format_dict(step_config, indent=4))
            self.log.info('  step %s queued ' % str(step_name))

            self.running[step_name] = Step.load_step(step_config)
            job_counter = self.running[step_name].distribute()
            self.db.start_step(step_name, step_config, job_counter)
Example #6
0
    def load_cfg(cls, cfg):
        """
        Return the json cfg as a dictionary.
        Is expecting as input one between a file, a json text or a dictionary.

        :param cfg: configuration source (dict, file path, or json string)
        :raises Exception: if the configuration cannot be loaded or a
                           referenced pipeline/refgenome cannot be resolved
        """

        cfg_load = None
        try:
            if type(cfg) == dict:
                cfg_load = copy.deepcopy(cfg)
            elif isinstance(cfg, basestring):
                if os.path.exists(cfg):
                    with open(cfg) as fh:
                        cfg_load = json.load(fh)
                        if 'sys_path' not in cfg_load:
                            cfg_load['sys_path'] = os.path.dirname(os.path.realpath(cfg))
                else:
                    # bug fix: cfg is a json *string* here, not a file
                    # object, so it must be parsed with loads(), not load()
                    cfg_load = json.loads(cfg)
        except Exception as e:
            raise Exception("Unable to load config file %s: %s" % (cfg, e))
        else:
            # overlay the loaded config on a minimal skeleton so that all
            # expected keys exist downstream
            cfg_data = { 'config' : {'steps': {}, 'pipeline' : {'project_name' : '', 'description' : '', 'output_dir': ''}}}
            ut.dict_update(cfg_data, cfg_load)

            if 'sys_path' in cfg_data:
                sys.path.insert(0, cfg_data['sys_path'])

            # 'dag.load' may name (or point at) another pipeline spec to
            # bootstrap from
            pipeline_to_load = cfg_data['dag'].pop("load") if "load" in cfg_data['dag'] else None
            if pipeline_to_load:
                # (the previous try/except around this section only re-raised,
                # so it has been dropped)
                if os.path.exists(pipeline_to_load):
                    spec_file = pipeline_to_load
                elif pipeline_to_load in pipeline_names:
                    spec_file = pipeline_names[pipeline_to_load]
                else:
                    raise Exception("Pipeline %s not found in list of pipelines: [%s]"
                                    % (pipeline_to_load, ','.join(pipeline_names)))

                with open(spec_file) as fh:
                    ut.pretty_print("Loading pipeline spec from %s" % spec_file)
                    spec = json.load(fh)
                    stepobjs = Pipeline.create_steps(spec)
                    # collect the defaults each step class declares
                    steps_defaults = {}
                    for step in stepobjs:
                        step_default = stepobjs[step].keys_values(['params', 'requirements'])
                        if step_default:
                            steps_defaults[step] = step_default

                    spec.setdefault('config', {})
                    spec['config'].setdefault('pipeline', {})
                    spec['config'].setdefault('steps', {})
                    ut.dict_update(spec['config']['steps'], steps_defaults, replace=False)
                    # bug fix: fallback default must be a dict, not ''
                    ut.dict_update(spec['config'], cfg_data.get('config', {}))
                    cfg_data = spec

            if cfg_data.get('config', {}).get('pipeline', {}).get('refgenome',{}):
                key_refgenome = cfg_data['config']['pipeline'].pop('refgenome')
                ref_genomes = Pipeline.get_refgenomes(cfg_data)
                if key_refgenome in ref_genomes:
                    # set refgenome parameters in each step (update config if already exists)
                    for step in ref_genomes[key_refgenome]:
                        if step in cfg_data['config']['steps']:
                            cfg_data['config']['steps'][step].update(ref_genomes[key_refgenome][step])
                        else:
                            cfg_data['config']['steps'][step] = ref_genomes[key_refgenome][step]
                else:
                    raise Exception("unable to load ref genome paths for %s " % key_refgenome)

            if 'sys_path' in cfg_data:
                # undo the sys.path.insert done above
                del sys.path[0]

            return cfg_data
Example #7
0
    def run_step(self, step_name):
        """
        Configure and run a job for the given step.

        :param step_name: name of the dag node to launch
        """

        #skip the input step: no job to run, just record its outputs
        if step_name == 'inputs':
            self.completed.append(step_name)
            self.outputs[step_name] = self.cfg['config']['steps'].get(
                step_name, {})
            self.outputs[step_name]['output_dir'] = ''
            self.db.update_step_status(step_name, JOB_STATUS.RUNNING)
            self.db.update_step_status(step_name, JOB_STATUS.SUCCEEDED)
            self.db.set_step_outputs(step_name, self.outputs[step_name])
        else:
            if self.one_step:
                step_config = self.cfg
                step_config['sys_path'] = self.sys_path
                step_config['output_dir'] = self.output_dir
                # bug fix: the skeleton was wrongly nested under an extra
                # 'meta' key, making the dict_update below raise KeyError
                step_config['meta'] = {'pipeline': {}, 'step': {}, 'job': {}}
                ut.dict_update(step_config['meta']['pipeline'],
                               self.meta['pipeline'])
            elif step_name == FINAL_STEP:
                step_config = {'meta': {'pipeline': {}, 'step': {}, 'job': {}}}
                ut.dict_update(step_config['meta']['pipeline'],
                               self.meta['pipeline'])
                step_config['name'] = FINAL_STEP
                step_config['step_class'] = self.dag.node[step_name][
                    'class_name']
                step_config['target_dir'] = self.output_dir
                step_config['source_dir'] = self.work_dir
                step_config['output_dir'] = os.path.join(
                    self.work_dir, step_name)
                self.configure_finalstep(step_config)
            else:
                step_config = {'meta': {'pipeline': {}, 'step': {}, 'job': {}}}
                ut.dict_update(step_config['meta']['pipeline'],
                               self.meta['pipeline'])
                step_class = self.dag.node[step_name]['class_name']
                step_config['name'] = step_name
                step_config['sys_path'] = self.sys_path
                step_config['step_class'] = step_class
                step_config['output_dir'] = os.path.join(
                    self.work_dir, step_name)

                def _merge_output(to_key, out):
                    # extend an existing key (promoting a bare string to a
                    # single-element list first), otherwise set it
                    if to_key in step_config:
                        if isinstance(step_config[to_key], basestring):
                            step_config[to_key] = [step_config[to_key]]
                        step_config[to_key].extend(out)
                    else:
                        step_config[to_key] = out

                # 1. Form input keys
                # Remember: edges are labelled by 'from' keys
                for pred in self.dag.predecessors(step_name):
                    edge = self.dag[pred][step_name]
                    # Not an actual loop: just get key/value
                    for bind_to, bind_from in edge.get('bindings',
                                                       {}).iteritems():
                        to_key = bind_to.split('.')[1]
                        if hasattr(bind_from, '__iter__'):
                            # list of 'from' keys: merge each one in turn
                            for from_key in bind_from:
                                _merge_output(
                                    to_key,
                                    self.outputs[pred][from_key.split('.')[1]])
                        else:
                            # single 'from' key (a plain string)
                            _merge_output(
                                to_key,
                                self.outputs[pred][bind_from.split('.')[1]])

                    # Transfer metadata of previous step to next step
                    for key in self.meta['steps'].get(pred, {}):
                        step_config['meta'][key] = self.meta['steps'][pred][
                            key]

            # 2. Form step config: overlay user-provided step settings
            if not self.one_step:
                ut.dict_update(step_config,
                               self.cfg['config']['steps'].get(step_name, {}),
                               replace=False)
                if step_name == FINAL_STEP:
                    # final step: pass full pipeline metadata
                    step_config['meta'].update(self.meta)
                else:
                    self.update_metadata(step_name, step_config[KEY_META])

            # 3. Submit step
            self.log.info('Executing step %s' % str(step_name))
            self.log.debug('  step configuration:\n %s' %
                           ut.format_dict(step_config, indent=4))
            self.log.info('  step %s queued ' % str(step_name))

            self.running[step_name] = Step.load_step(step_config)
            job_counter = self.running[step_name].distribute()
            self.db.start_step(step_name, step_config, job_counter)
Example #8
0
    def load_cfg(cls, cfg):
        """
        Return the json cfg as a dictionary.
        Is expecting as input one between a file, a json text or a dictionary.

        :param cfg: configuration source (dict, file path, or json string)
        :raises Exception: if the configuration cannot be loaded or a
                           referenced pipeline/refgenome cannot be resolved
        """

        cfg_load = None
        try:
            if type(cfg) == dict:
                cfg_load = copy.deepcopy(cfg)
            elif isinstance(cfg, basestring):
                if os.path.exists(cfg):
                    with open(cfg) as fh:
                        cfg_load = json.load(fh)
                        if 'sys_path' not in cfg_load:
                            cfg_load['sys_path'] = os.path.dirname(
                                os.path.realpath(cfg))
                else:
                    # bug fix: cfg is a json *string* here, not a file
                    # object, so it must be parsed with loads(), not load()
                    cfg_load = json.loads(cfg)
        except Exception as e:
            raise Exception("Unable to load config file %s: %s" % (cfg, e))
        else:
            # overlay the loaded config on a minimal skeleton so that all
            # expected keys exist downstream
            cfg_data = {
                'config': {
                    'steps': {},
                    'pipeline': {
                        'project_name': '',
                        'description': '',
                        'output_dir': ''
                    }
                }
            }
            ut.dict_update(cfg_data, cfg_load)

            if 'sys_path' in cfg_data:
                sys.path.insert(0, cfg_data['sys_path'])

            # 'dag.load' may name (or point at) another pipeline spec to
            # bootstrap from
            pipeline_to_load = cfg_data['dag'].pop(
                "load") if "load" in cfg_data['dag'] else None
            if pipeline_to_load:
                # (the previous try/except around this section only re-raised,
                # so it has been dropped)
                if os.path.exists(pipeline_to_load):
                    spec_file = pipeline_to_load
                elif pipeline_to_load in pipeline_names:
                    spec_file = pipeline_names[pipeline_to_load]
                else:
                    raise Exception(
                        "Pipeline %s not found in list of pipelines: [%s]"
                        % (pipeline_to_load, ','.join(pipeline_names)))

                with open(spec_file) as fh:
                    ut.pretty_print("Loading pipeline spec from %s" %
                                    spec_file)
                    spec = json.load(fh)
                    stepobjs = Pipeline.create_steps(spec)
                    # collect the defaults each step class declares
                    steps_defaults = {}
                    for step in stepobjs:
                        step_default = stepobjs[step].keys_values(
                            ['params', 'requirements'])
                        if step_default:
                            steps_defaults[step] = step_default

                    spec.setdefault('config', {})
                    spec['config'].setdefault('pipeline', {})
                    spec['config'].setdefault('steps', {})
                    ut.dict_update(spec['config']['steps'],
                                   steps_defaults,
                                   replace=False)
                    # bug fix: fallback default must be a dict, not ''
                    ut.dict_update(spec['config'],
                                   cfg_data.get('config', {}))
                    cfg_data = spec

            if cfg_data.get('config', {}).get('pipeline',
                                              {}).get('refgenome', {}):
                key_refgenome = cfg_data['config']['pipeline'].pop('refgenome')
                ref_genomes = Pipeline.get_refgenomes(cfg_data)
                if key_refgenome in ref_genomes:
                    # set refgenome parameters in each step (update config if already exists)
                    for step in ref_genomes[key_refgenome]:
                        if step in cfg_data['config']['steps']:
                            cfg_data['config']['steps'][step].update(
                                ref_genomes[key_refgenome][step])
                        else:
                            cfg_data['config']['steps'][
                                step] = ref_genomes[key_refgenome][step]
                else:
                    raise Exception(
                        "unable to load ref genome paths for %s " %
                        key_refgenome)

            if 'sys_path' in cfg_data:
                # undo the sys.path.insert done above
                del sys.path[0]

            return cfg_data