def __init__(self, cfg, user='******', db=True, schedname="SCHED_CONDOR"):
    """ Read in the pipeline graph and load the configuration. """
    self.all_ok = True
    self.user = user
    self.status = JOB_STATUS.QUEUED
    self.lock = ''
    self.completed = []
    self.running = {}
    self.outputs = {}
    self.schedname = schedname
    db_model_name = "MONGO_DB" if db else "STUB_DB"

    # Load the configuration: try it as a full pipeline first, then fall
    # back to a single-step config
    self.one_step = False
    try:
        self.cfg = Pipeline.load_cfg(cfg)
    except Exception as e1:
        print('Failed to load config as pipeline (error=%s). Trying as step'
              % e1)
        try:
            self.cfg = Step.load_cfg(cfg)
            self.step = Step.load_step(self.cfg)
            self.one_step = True
        except Exception as e2:
            raise Exception("Unable to load config file %s:\n"
                            "pipeline load: %s\n"
                            "step load: %s" % (cfg, e1, e2))

    # Set all additional information
    self.run_id = self.cfg.get('run_id')
    if self.one_step:
        self.name = self.step.name
        self.label = self.step.name
        self.project_name = self.cfg.get('project_name', '')
        self.description = self.cfg.get('description', '')
        self.output_dir = self.step.output_dir
        self.ordered = [self.step.name]
    else:
        self.name = self.cfg['name']
        self.label = self.cfg['label']
        self.project_name = self.cfg['config']['pipeline'].get(
            'project_name', '')
        self.description = self.cfg['config']['pipeline'].get(
            'description', '')
        self.output_dir = self.cfg['config']['pipeline']['output_dir']
        if not self.output_dir.startswith('/scratch'):
            # TODO: Make it work for one_step as well
            self.cfg['dag']['nodes'][FINAL_STEP] = 'utils.Finalize'
        self.ordered = Pipeline.ordered_steps(self.cfg)

    self.sys_path = self.cfg.get('sys_path')
    if self.sys_path:
        sys.path.insert(0, self.sys_path)
    self.dag = self.create_dag(self.cfg, one_step=self.one_step)
    self.meta = {
        'pipeline': {
            'label': self.label,
            'project_name': self.project_name,
            'descr': self.description,
            'run_id': self.run_id
        },
        'steps': {},
        'job': {}
    }
    self.db = db_models[db_model_name](self.name,
                                       self.cfg,
                                       self.ordered,
                                       self.user,
                                       output_dir=self.output_dir)
    if hasattr(self.db, 'run_id'):
        self.run_id = self.db.run_id
        self.cfg['run_id'] = self.run_id

    # Define the output directories
    if not os.path.exists(self.output_dir):
        os.makedirs(self.output_dir, 0775)
    # Use the default work area under /scratch/cgi/nespipe (WORK_DIR,
    # symlinked into the output directory as 'work_area') if: a) this run
    # is using the db (so we have a run ID); b) it is not a demultiplexing
    # run; and c) the output directory is not already under /scratch, in
    # which case work happens in place.
    if self.run_id and not (self.name == 'demultiplexing'):
        dirname = '%s_%d' % (self.name, self.db.run_id)
        self.output_dir = os.path.join(self.output_dir, dirname)
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir, 0775)
        if self.output_dir.startswith('/scratch'):
            self.work_dir = self.output_dir
        else:
            self.work_dir = os.path.join(WORK_DIR, self.user, dirname)
            if not os.path.exists(self.work_dir):
                os.makedirs(self.work_dir, 0775)
            symlink = os.path.join(self.output_dir, 'work_area')
            if not os.path.exists(symlink):
                os.symlink(self.work_dir, symlink)
    else:
        self.work_dir = self.output_dir
    ut.pretty_print('Output directories: output_dir=%s, work_dir=%s' %
                    (self.output_dir, self.work_dir))
    self.db.update_pipeline(self.run_id, {
        'output_dir': self.output_dir,
        'work_dir': self.work_dir
    })
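
# Usage sketch (illustrative only, not part of the module): how this
# constructor is typically driven. The config path 'pipeline.cfg' and the
# driver loop below are assumptions about the surrounding code, not a
# documented entry point. self.ordered holds the topologically sorted step
# names, so running them in order guarantees every predecessor's outputs
# exist before run_step() binds them:
#
#     pipeline = Pipeline('pipeline.cfg', user='jsmith', db=False)
#     for step_name in pipeline.ordered:
#         pipeline.run_step(step_name)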
def run_step(self, step_name):
    """ Configure and run a job for the given step """
    # The input step has nothing to execute: record its configured outputs
    # and mark it as completed straight away
    if step_name == 'inputs':
        self.completed.append(step_name)
        self.outputs[step_name] = self.cfg['config']['steps'].get(
            step_name, {})
        self.outputs[step_name]['output_dir'] = ''
        self.db.update_step_status(step_name, JOB_STATUS.RUNNING)
        self.db.update_step_status(step_name, JOB_STATUS.SUCCEEDED)
        self.db.set_step_outputs(step_name, self.outputs[step_name])
    else:
        if self.one_step:
            step_config = self.cfg
            step_config['sys_path'] = self.sys_path
            step_config['output_dir'] = self.output_dir
            step_config['meta'] = {'pipeline': {}, 'step': {}, 'job': {}}
            ut.dict_update(step_config['meta']['pipeline'],
                           self.meta['pipeline'])
        elif step_name == FINAL_STEP:
            step_config = {'meta': {'pipeline': {}, 'step': {}, 'job': {}}}
            ut.dict_update(step_config['meta']['pipeline'],
                           self.meta['pipeline'])
            step_config['name'] = FINAL_STEP
            step_config['step_class'] = self.dag.node[step_name]['class_name']
            step_config['target_dir'] = self.output_dir
            step_config['source_dir'] = self.work_dir
            step_config['output_dir'] = os.path.join(self.work_dir, step_name)
            self.configure_finalstep(step_config)
        else:
            step_config = {'meta': {'pipeline': {}, 'step': {}, 'job': {}}}
            ut.dict_update(step_config['meta']['pipeline'],
                           self.meta['pipeline'])
            step_config['name'] = step_name
            step_config['sys_path'] = self.sys_path
            step_config['step_class'] = self.dag.node[step_name]['class_name']
            step_config['output_dir'] = os.path.join(self.work_dir, step_name)
        # 1. Form input keys
        # Remember: edges are labelled by 'from' keys
        for pred in self.dag.predecessors(step_name):
            edge = self.dag[pred][step_name]
            # Each binding maps 'to_step.key' to either a single
            # 'from_step.key' or a list of them
            for bind_to, bind_from in edge.get('bindings', {}).iteritems():
                to_key = bind_to.split('.')[1]
                sources = bind_from if hasattr(bind_from, '__iter__') \
                    else [bind_from]
                for from_key in sources:
                    out = self.outputs[pred][from_key.split('.')[1]]
                    if to_key in step_config:
                        if isinstance(step_config[to_key], basestring):
                            step_config[to_key] = [step_config[to_key]]
                        step_config[to_key].extend(out)
                    else:
                        step_config[to_key] = out
            # Transfer metadata of the previous step to the next step
            for key in self.meta['steps'].get(pred, {}):
                step_config['meta'][key] = self.meta['steps'][pred][key]
        # 2. Form step config.
        if not self.one_step:
            ut.dict_update(step_config,
                           self.cfg['config']['steps'].get(step_name, {}),
                           replace=False)
            if step_name == FINAL_STEP:
                # Final step: pass the full pipeline metadata
                step_config['meta'].update(self.meta)
            else:
                self.update_metadata(step_name, step_config[KEY_META])
        # 3. Submit step
        self.log.info('Executing step %s' % str(step_name))
        self.log.debug('  step configuration:\n%s' %
                       ut.format_dict(step_config, indent=4))
        self.log.info('  step %s queued' % str(step_name))
        self.running[step_name] = Step.load_step(step_config)
        job_counter = self.running[step_name].distribute()
        self.db.start_step(step_name, step_config, job_counter)
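
# A minimal, self-contained sketch of the binding convention assumed above:
# edge bindings map 'to_step.key' -> 'from_step.key' (or a list of source
# keys), and each predecessor output is a list of values. The helper name
# and its flat input/output shapes are hypothetical, for illustration only.
def _resolve_bindings_example(bindings, pred_outputs):
    """Return {to_key: [values]} for one edge's bindings dict."""
    resolved = {}
    for bind_to, bind_from in bindings.items():
        to_key = bind_to.split('.')[1]
        # A single 'step.key' string or a list of them
        sources = bind_from if isinstance(bind_from, list) else [bind_from]
        for src in sources:
            values = pred_outputs[src.split('.')[1]]
            resolved.setdefault(to_key, []).extend(values)
    return resolved

# e.g. _resolve_bindings_example({'sort.input_files': 'align.bam'},
#                                {'bam': ['/tmp/a.bam']})
# -> {'input_files': ['/tmp/a.bam']}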
def post(self, run_id):
    """ Push the selected files into iRODS """
    data = request.get_json(force=True)
    runmeta = data.get('meta')
    selection = data.get('selection')
    user = auth_get_username(request.authorization, data.get('user'))
    npdis = dbmodel.get_npdi_projects()
    npdi = runmeta.get('Project NPDI ID', '')
    study_nickname = runmeta.get('Study nickname', 'Required field missing')
    if (npdi + study_nickname) not in npdis:
        return {
            'pipeline': {
                'Project': '%s (%s)' % (npdi, study_nickname)
            }
        }, 400
    run = db.pipelines.find_one({'run_id': run_id}, {'meta': 1, 'run_id': 1})
    step_names = selection.keys()
    steps = list(
        db.steps.find(
            {
                "run_id": run_id,
                "name": {"$in": step_names},
                "jobs": {"$elemMatch": {"outputs": {"$exists": True}}}
            }, {
                "name": 1,
                "jobs": 1,
                "outputs.output_dir": 1,
                "step_config": 1
            }))
    # Collect per-file metadata for every selected step's output files
    outputs = {}
    for step in steps:
        if step.get('step_config', {}):
            s = Step.load_step(step['step_config'])
            output_files = {}
            for job_id, job in enumerate(step['jobs']):
                for key in job['outputs']:
                    if key not in s.keys(key_groups='outputs',
                                         key_filter={'type': 'file'}):
                        continue
                    for i, filename in enumerate(job['outputs'][key]):
                        filemeta = {'step': step['name'], 'job_id': job_id}
                        ext = os.path.splitext(filename)[1][1:].upper()
                        for meta_key in job.get('meta', {}):
                            meta = job['meta'][meta_key]
                            if meta_key == 'sample_id':
                                okey = 'Operational sample accession'
                            else:
                                okey = meta_key
                            if isinstance(meta, list):
                                filemeta[okey] = meta[i]
                            else:
                                filemeta[okey] = meta
                        filemeta['File type'] = 'Processed data file'
                        filemeta['File format'] = ext
                        output_files[filename] = filemeta
            if output_files:
                outputs[step['name']] = output_files
    input_files = []
    meta_data = []
    for step_name, step_selection in selection.iteritems():
        for filepath in step_selection:
            input_files.append(filepath)
            filemeta = outputs[step_name][filepath]
            filemeta.update(runmeta)
            meta_data.append(filemeta)
    # Build and submit the archiving pipeline
    cfg = Pipeline.load_cfg(pipeline_specs['irods_lz'])
    cfg['config']['steps']['irods_mvtolz'] = {
        'input_files': input_files,
        'meta_data': meta_data
    }
    cfg['config']['steps']['irods_monitorlz'] = {'prun_id': run['run_id']}
    cfg['config']['pipeline']['project_name'] = run['meta']['project_name']
    cfg['config']['pipeline']['description'] = \
        'Archive data for run %s' % run['run_id']
    cfg['config']['pipeline']['output_dir'] = '/scratch/cgi/irods'
    # Get id from DB
    db_info = dbmodel.PipelineDb(cfg['name'], cfg,
                                 Pipeline.ordered_steps(cfg), user)
    cfg['run_id'] = db_info.run_id
    ut.pretty_print("Submitting pipeline %s (ID %d) for user %s" %
                    (cfg['label'], cfg['run_id'], user))
    return pm.add_pipeline(cfg, user)
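
# Expected request payload for the POST above, reconstructed from the reads
# in this handler (the field values are hypothetical; this is not a
# documented schema):
#
#     {
#         "user": "jsmith",
#         "meta": {
#             "Project NPDI ID": "NPDI-0001",
#             "Study nickname": "demo-study"
#         },
#         "selection": {
#             "alignment": ["/work/alignment/sample1.bam"]
#         }
#     }
#
# The (npdi + study_nickname) concatenation must match an entry returned by
# dbmodel.get_npdi_projects(), otherwise the handler responds with HTTP 400.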
def get(self, run_id):
    """ Return the output files of each step of the given run,
        with their archive status """
    pipeline = db.pipelines.find_one({'run_id': run_id}, {
        'config': 1,
        'file_registry': 1
    })
    file_registry = pipeline.get('file_registry', [])
    if file_registry:
        file_registry = json.loads(file_registry)
    config = Pipeline.load_cfg(json.loads(pipeline['config']))
    result_steps = config.get('config', {}).get('pipeline',
                                                {}).get('results', [])
    # Steps whose outputs are deleted after the run are not reported, and
    # neither are the bookkeeping steps 'finalize' and 'inputs'
    delete_steps = config.get('config', {}).get('pipeline',
                                                {}).get('delete', [])
    delete_steps.append('finalize')
    delete_steps.append('inputs')
    steps = list(
        db.steps.find(
            {
                "run_id": run_id,
                "name": {"$nin": delete_steps},
                "jobs": {"$elemMatch": {"outputs": {"$exists": True}}}
            }, {
                "name": 1,
                "jobs": 1,
                "outputs.output_dir": 1,
                "step_config": 1
            }))
    outputs = {}
    for step in steps:
        if step.get('step_config', {}):
            s = Step.load_step(step['step_config'])
            output_files = []
            for job in step['jobs']:
                for key in job['outputs']:
                    if key not in s.keys(key_groups='outputs',
                                         key_filter={'type': 'file'}):
                        continue
                    for filename in job['outputs'][key]:
                        output = {'path': filename}
                        if not isinstance(filename, list):
                            output['archived'] = filename in file_registry
                        else:
                            output['archived'] = False
                        output_files.append(output)
            if output_files:
                outputs[step['name']] = {
                    'archive': step['name'] in result_steps,
                    'dir': step.get('outputs', {}).get('output_dir'),
                    'files': output_files
                }
    return outputs
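
# Shape of the dict returned by get() above, inferred from its assignments
# (an illustration, not a documented contract; the paths are hypothetical):
#
#     {
#         "alignment": {
#             "archive": True,      # step listed under pipeline 'results'
#             "dir": "/work/alignment",
#             "files": [
#                 {"path": "/work/alignment/sample1.bam", "archived": False}
#             ]
#         }
#     }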