def test_workflow_merge(self):
    job1 = wfclient.Job(['ls', '-l', '/tmp'], name='job1')
    job2 = wfclient.Job(['ls', '-l', '/tmp'], name='job2')
    job3 = wfclient.Job(['ls', '-l', '/tmp'], name='job3')
    group = wfclient.Group([job1, job2, job3], name='group1')
    workflow1 = wfclient.Workflow([job1, job2, job3],
                                  dependencies=[(job1, job2), (job1, job3)],
                                  root_group=[group])
    job4 = wfclient.Job(['ls', '-l', '/tmp'], name='job4')
    job5 = wfclient.Job(['ls', '-l', '/tmp'], name='job5')
    job6 = wfclient.Job(['ls', '-l', '/tmp'], name='job6')
    workflow2 = wfclient.Workflow([job4, job5, job6], name='workflow2',
                                  dependencies=[(job4, job6), (job5, job6)])
    group2 = workflow1.add_workflow(workflow2, as_group='group2')
    self.assertTrue(len(workflow1.jobs) == 6)
    self.assertTrue(len(workflow1.root_group) == 2)
    self.assertTrue(len(workflow1.groups) == 2)
    self.assertTrue(len(workflow1.dependencies) == 4)
    # make both wf sequential
    workflow1.add_dependencies([(group, group2)])
    # 4 hub barrier jobs have been added
    self.assertTrue(len(workflow1.jobs) == 10)
    self.assertTrue(len(workflow1.groups) == 2)
    self.assertTrue(len(workflow1.dependencies) == 17)

    # do it again using sets for deps
    workflow3 = wfclient.Workflow([job1, job2, job3],
                                  dependencies=set([(job1, job2),
                                                    (job1, job3)]),
                                  root_group=[group])
    workflow4 = wfclient.Workflow([job4, job5, job6], name='workflow4',
                                  dependencies=set([(job4, job6),
                                                    (job5, job6)]))
    group3 = workflow3.add_workflow(workflow4, as_group='group3')
    self.assertTrue(len(workflow3.jobs) == 6)
    self.assertTrue(len(workflow3.root_group) == 2)
    self.assertTrue(len(workflow3.groups) == 2)
    self.assertTrue(len(workflow3.dependencies) == 4)
    # make both wf sequential
    workflow3.add_dependencies([(group, group3)])
    # 4 hub barrier jobs have been added
    self.assertTrue(len(workflow3.jobs) == 10)
    self.assertTrue(len(workflow3.groups) == 2)
    self.assertTrue(len(workflow3.dependencies) == 17)
def test_workflow_simple(self):
    job1 = wfclient.Job(['ls', '-l', '/tmp'])
    job2 = wfclient.Job(['ls', '-l', '/tmp'])
    job3 = wfclient.Job(['ls', '-l', '/tmp'])
    workflow = wfclient.Workflow([job1, job2, job3],
                                 dependencies=[(job1, job2), (job1, job3)])
    self.assertTrue(len(workflow.jobs) == 3)
    self.assertTrue(len(workflow.dependencies) == 2)
def workflow_from_pipeline(pipeline, study_config=None, disabled_nodes=None,
                           jobs_priority=0, create_directories=True):
    """ Create a soma-workflow workflow from a Capsul Pipeline

    Parameters
    ----------
    pipeline: Pipeline (mandatory)
        a CAPSUL pipeline
    study_config: StudyConfig (optional), or dict
        holds information about file transfers and shared resource paths.
        If not specified, it will be accessed through the pipeline.
    disabled_nodes: sequence of pipeline nodes (Node instances) (optional)
        such nodes will be disabled on-the-fly in the pipeline, file
        transfers will be adapted accordingly (outputs may become inputs in
        the resulting workflow), and temporary files will be checked. If a
        disabled node was to produce a temporary file which is still used in
        an enabled node, then a ValueError exception will be raised.
        If disabled_nodes is not passed, they will possibly be taken from the
        pipeline (if available) using disabled steps:
        see Pipeline.define_steps()
    jobs_priority: int (optional, default: 0)
        set this priority on soma-workflow jobs.
    create_directories: bool (optional, default: True)
        if set, needed output directories (which will contain output files)
        will be created in a first job, which all other ones depend on.

    Returns
    -------
    workflow: Workflow
        a soma-workflow workflow
    """

    class TempFile(str):
        # class needed temporarily to identify temporary paths in the
        # pipeline. Must inherit a string type since it is used as a trait
        # value.
        def __init__(self, string=''):
            # in python3 super(..).__init__() cannot take an argument;
            # moreover the str value is assigned anyway.
            super(TempFile, self).__init__()
            if isinstance(string, TempFile):
                self.pattern = string.pattern
                self.value = string.value
                self.ref = string.ref if string.ref else string
            else:
                self.pattern = '%s'
                self.value = string
                self.ref = None

        def referent(self):
            return self.ref if self.ref else self

        def get_value(self):
            return self.referent().value

        def __add__(self, other):
            res = TempFile(str(self) + str(other))
            res.pattern = self.pattern + str(other)
            res.value = self.value
            res.ref = self.referent()
            return res

        def __radd__(self, other):
            res = TempFile(str(other) + str(self))
            res.pattern = str(other) + self.pattern
            res.value = self.value
            res.ref = self.referent()
            return res

        def __iadd__(self, other):
            # str is immutable, so "+=" has to produce a new TempFile which
            # keeps the same referent, exactly like __add__ does.
            res = TempFile(str(self) + str(other))
            res.pattern = self.pattern + str(other)
            res.value = self.value
            res.ref = self.referent()
            return res

        def __str__(self):
            return self.pattern % self.get_value()

        def __hash__(self):
            if self.ref:
                return self.referent().__hash__()
            return super(TempFile, self).__hash__()

        def __eq__(self, other):
            if not isinstance(other, TempFile):
                return False
            return self.referent() is other.referent()

    def _files_group(path, merged_formats):
        bname = os.path.basename(path)
        l0 = len(path) - len(bname)
        p0 = 0
        paths = [path]
        while True:
            p = bname.find('.', p0)
            if p < 0:
                break
            ext = bname[p:]
            p0 = p + 1
            format_def = merged_formats.get(ext)
            if format_def:
                path0 = path[:l0 + p]
                paths += [path0 + e[0] for e in format_def]
                break
        paths.append(path + '.minf')
        return paths

    def _translated_path(path, shared_map, shared_paths, trait=None):
        if path is None or path is Undefined \
                or not shared_paths \
                or (trait is not None
                    and not isinstance(trait.trait_type, File)
                    and not isinstance(trait.trait_type, Directory)):
            return None  # not a path
        item = shared_map.get(path)
        if item is not None:
            # already in map
            return item
        for base_dir, (namespace, uuid) in six.iteritems(shared_paths):
            if path.startswith(base_dir + os.sep):
                rel_path = path[len(base_dir)+1:]
                #uuid = path
                item = swclient.SharedResourcePath(
                    rel_path, namespace, uuid=uuid)
                shared_map[path] = item
                return item
        return None
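    # Illustrative note (comments only, not executed): TempFile keeps a
    # separate ``pattern`` and ``value`` so that string concatenations done
    # while building command lines do not lose the identity of the temporary
    # path. A minimal sketch, using only the class above:
    #
    #     t = TempFile('12')          # value '12', pattern '%s'
    #     t2 = '/tmp/' + t + '.nii'   # pattern '/tmp/%s.nii', same referent
    #     str(t2)                     # -> '/tmp/12.nii'
    #     t2 == t                     # True: both resolve to the referent t
    #
    # This is what lets _replace_in_list() in build_job() below swap the
    # value for a soma-workflow TemporaryPath while keeping the surrounding
    # pattern (prefix/suffix) intact.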
    def build_job(process, temp_map={}, shared_map={}, transfers=[{}, {}],
                  shared_paths={}, forbidden_temp=set(), name='', priority=0,
                  step_name=''):
        """ Create a soma-workflow Job from a Capsul Process

        Parameters
        ----------
        process: Process (mandatory)
            a CAPSUL process instance
        temp_map: dict (optional)
            temporary paths map.
        shared_map: dict (optional)
            file shared translated paths, global pipeline dict.
            This dict is updated when needed during the process.
        transfers: list of 2 dicts (optional)
            file transfers dicts (input / output), indexed by process, then
            by parameter name (see _get_transfers()).
        shared_paths: dict (optional)
            holds information about shared resource paths base dirs for
            soma-workflow.
            If not specified, no translation will be used.
        forbidden_temp: set (optional)
            temporary paths which cannot be produced in the workflow (their
            producing node is disabled): using one as an input raises a
            ValueError.
        name: string (optional)
            job name. If empty, use the process name.
        priority: int (optional)
            priority assigned to the job
        step_name: str (optional)
            the step name will be stored in the job user_storage variable

        Returns
        -------
        job: Job
            a soma-workflow Job instance that will execute the CAPSUL process
        """
        def _replace_in_list(rlist, temp_map):
            for i, item in enumerate(rlist):
                if item in temp_map:
                    value = temp_map[item]
                    value = value.__class__(value)
                    if hasattr(item, 'pattern'):
                        # temp case (differs from shared case)
                        value.pattern = item.pattern
                    rlist[i] = value
                elif isinstance(item, (list, tuple)):
                    deeperlist = list(item)
                    _replace_in_list(deeperlist, temp_map)
                    rlist[i] = deeperlist
                elif item is Undefined:
                    rlist[i] = ''

        def _replace_transfers(rlist, process, itransfers, otransfers):
            param_name = None
            i = 3
            for item in rlist[3:]:
                if param_name is None:
                    param_name = item
                else:
                    transfer = itransfers.get(param_name)
                    if transfer is None:
                        transfer = otransfers.get(param_name)
                    if transfer is not None:
                        value = transfer[0]
                        if isinstance(item, list) or isinstance(item, tuple):
                            # TODO: handle lists of files [transfers]
                            #deeperlist = list(item)
                            #_replace_in_list(deeperlist, transfers)
                            #rlist[i] = deeperlist
                            print('*** LIST! ***')
                        else:
                            rlist[i] = value
                    param_name = None
                i += 1

        job_name = name
        if not job_name:
            job_name = process.name

        # check for special modified paths in parameters
        input_replaced_paths = []
        output_replaced_paths = []
        for param_name, parameter in six.iteritems(process.user_traits()):
            if param_name not in ('nodes_activation', 'selection_changed'):
                value = getattr(process, param_name)
                if isinstance(value, list):
                    values = value
                else:
                    values = [value]
                for value in values:
                    if isinstance(value, TempFile):
                        # duplicate swf temp and copy pattern into it
                        tval = temp_map[value]
                        tval = tval.__class__(tval)
                        tval.pattern = value.pattern
                        if parameter.output:
                            output_replaced_paths.append(tval)
                        else:
                            if value in forbidden_temp:
                                raise ValueError(
                                    'Temporary value used cannot be '
                                    'generated in the workflow: %s.%s'
                                    % (job_name, param_name))
                            input_replaced_paths.append(tval)
                    else:
                        _translated_path(value, shared_map, shared_paths,
                                         parameter)

        # Get the process command line
        process_cmdline = process.get_commandline()

        # and replace in commandline
        iproc_transfers = transfers[0].get(process, {})
        oproc_transfers = transfers[1].get(process, {})
        #proc_transfers = dict(iproc_transfers)
        #proc_transfers.update(oproc_transfers)
        _replace_in_list(process_cmdline, temp_map)
        _replace_in_list(process_cmdline, shared_map)
        _replace_transfers(
            process_cmdline, process, iproc_transfers, oproc_transfers)

        # Return the soma-workflow job
        job = swclient.Job(
            name=job_name,
            command=process_cmdline,
            referenced_input_files=input_replaced_paths
                + [x[0] for x in iproc_transfers.values()],
            referenced_output_files=output_replaced_paths
                + [x[0] for x in oproc_transfers.values()],
            priority=priority)
        if step_name:
            job.user_storage = step_name
        return job
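    # Usage sketch (comments only, not executed): build_job() is normally
    # called from workflow_from_graph() below, but in isolation it boils
    # down to something like the following, assuming ``proc`` is an
    # instantiated Capsul Process and the other names are the maps built
    # later in this function:
    #
    #     job = build_job(proc, temp_map=temp_subst_map,
    #                     shared_map=shared_map, transfers=transfers,
    #                     shared_paths=swf_paths[1],
    #                     name='my_node', priority=0, step_name='step1')
    #     # job.command is the process command line with temporary,
    #     # translated and transferred paths substituted in place.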
    def build_group(name, jobs):
        """ Create a group of jobs

        Parameters
        ----------
        name: str (mandatory)
            the group name
        jobs: list of Job (mandatory)
            the jobs we want to insert in the group

        Returns
        -------
        group: Group
            the soma-workflow Group instance
        """
        return swclient.Group(jobs, name=name)

    def get_jobs(group, groups):
        gqueue = list(group.elements)
        jobs = []
        while gqueue:
            group_or_job = gqueue.pop(0)
            if group_or_job in groups:
                gqueue += group_or_job.elements
            else:
                jobs.append(group_or_job)
        return jobs

    def assign_temporary_filenames(pipeline, count_start=0):
        ''' Find and temporarily assign necessary temporary file names'''
        temp_filenames = pipeline.find_empty_parameters()
        temp_map = {}
        count = count_start
        for node, plug_name, optional in temp_filenames:
            if hasattr(node, 'process'):
                process = node.process
            else:
                process = node
            trait = process.trait(plug_name)
            is_list = isinstance(trait.trait_type, List)
            values = []
            if is_list:
                todo = getattr(process, plug_name)
                trait = trait.inner_traits[0]
            else:
                todo = [Undefined]
            for item in todo:
                if item not in (Undefined, '', None):
                    # non-empty list element
                    values.append(item)
                    continue
                is_directory = isinstance(trait.trait_type, Directory)
                if trait.allowed_extensions:
                    suffix = trait.allowed_extensions[0]
                else:
                    suffix = ''
                swf_tmp = swclient.TemporaryPath(is_directory=is_directory,
                                                 suffix=suffix)
                tmp_file = TempFile('%d' % count)
                count += 1
                temp_map[tmp_file] = (swf_tmp, node, plug_name, optional)
                # set a TempFile value to identify the params / value
                values.append(tmp_file)
            if is_list:
                setattr(process, plug_name, values)
            else:
                setattr(process, plug_name, values[0])
        return temp_map

    def restore_empty_filenames(temporary_map):
        ''' Set back Undefined values to the parameters which were
        temporarily assigned file names (using
        assign_temporary_filenames()).
        '''
        for tmp_file, item in six.iteritems(temporary_map):
            node, plug_name = item[1:3]
            if hasattr(node, 'process'):
                process = node.process
            else:
                process = node
            value = getattr(process, plug_name)
            if isinstance(value, list):
                # FIXME TODO: only restore values in the list which
                # correspond to a temporary.
                # Problem: they are sometimes transformed into strings.
                # FIXME: several temp items can be part of the same list,
                # so this assignment is likely to be done several times.
                # It could probably be optimized.
                # WARNING: we set "" values instead of Undefined because
                # they may be mandatory.
                setattr(process, plug_name, [''] * len(value))
            else:
                setattr(process, plug_name, Undefined)
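    # Note (comment-only sketch): assign_temporary_filenames() and
    # restore_empty_filenames() are used as a pair around workflow
    # construction, roughly:
    #
    #     temp_map = assign_temporary_filenames(pipeline)
    #     try:
    #         ...  # build jobs while TempFile placeholders are in place
    #     finally:
    #         restore_empty_filenames(temp_map)
    #
    # so the pipeline parameters are never left holding TempFile
    # placeholders after the workflow has been built.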
    def _get_swf_paths(study_config):
        computing_resource = getattr(
            study_config, 'somaworkflow_computing_resource', None)
        if computing_resource in (None, Undefined):
            return [], {}
        resources_conf = getattr(
            study_config, 'somaworkflow_computing_resources_config', None)
        if resources_conf in (None, Undefined):
            return [], {}
        resource_conf = getattr(resources_conf, computing_resource, None)
        if resource_conf in (None, Undefined):
            return [], {}
        return (resource_conf.transfer_paths,
                resource_conf.path_translations.export_to_dict())

    def _propagate_transfer(node, param, path, output, transfers,
                            transfer_item):
        todo_plugs = [(node, param, output)]
        done_plugs = set()
        while todo_plugs:
            node, param, output = todo_plugs.pop()
            plug = node.plugs[param]
            is_pipeline = False
            if plug is None or not plug.enabled or not plug.activated \
                    or plug in done_plugs:
                continue
            done_plugs.add(plug)
            if isinstance(node, Switch):
                if output:
                    # propagate to active input
                    other_param = node.switch + '_switch_' + param
                    #plug = node.plugs[input_param]
                else:
                    other_param = param[len(node.switch + '_switch_'):]
                    #other_plug = node.plugs[other_param]
                todo_plugs.append((node, other_param, not output))
            else:
                process = node.process
                if hasattr(process, 'nodes'):
                    is_pipeline = True
                    #plug = process.nodes[''].plugs.get(param)
                else:
                    # process: replace its param
                    # check trait type (must be File or Directory, not Any)
                    trait = process.user_traits()[param]
                    #plug = node.plugs[param]
                    if isinstance(trait.trait_type, File) \
                            or isinstance(trait.trait_type, Directory):
                        transfers[bool(trait.output)].setdefault(
                            process, {})[param] = (transfer_item, path)
            #output = not output  # invert IO status
            #if plug is None or not plug.enabled or not plug.activated \
            #        or plug in done_plugs:
            #    continue
            if output:
                links = plug.links_to
            else:
                links = plug.links_from
            for proc_name, param_name, node, other_plug, act in links:
                if not node.activated or not node.enabled \
                        or not other_plug.activated \
                        or not other_plug.enabled \
                        or other_plug in done_plugs:
                    continue
                todo_plugs.append((node, param_name, not output))
            if is_pipeline:
                # in a pipeline node, go both directions
                if output:
                    links = plug.links_from
                else:
                    links = plug.links_to
                for proc_name, param_name, node, plug, act in links:
                    if not node.activated or not node.enabled \
                            or not plug.activated or not plug.enabled \
                            or plug in done_plugs:
                        continue
                    todo_plugs.append((node, param_name, output))
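    # Data-shape note (comment sketch, inferred from the code above):
    # _get_swf_paths() returns
    #
    #     (transfer_paths, path_translations)
    #
    # where transfer_paths is a list of base directories whose contents are
    # handled as FileTransfer objects, and path_translations is a dict of
    # the form {base_dir: (namespace, uuid)} used by _translated_path() to
    # build SharedResourcePath objects. Both come from the soma-workflow
    # resource configuration held in the StudyConfig.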
    def _get_transfers(pipeline, transfer_paths, merged_formats):
        """ Create and list FileTransfer objects needed in the pipeline.

        Parameters
        ----------
        pipeline: Pipeline
            pipeline to build workflow for
        transfer_paths: list
            paths basedirs for translations from soma-workflow config

        Returns
        -------
        [in_transfers, out_transfers]
        each of which is a dict: { Process: proc_dict }
            proc_dict is a dict: { file_path : FileTransfer object }
            FileTransfer objects are reused when referring to the same path
            used from different processes within the pipeline.
        """
        in_transfers = {}
        out_transfers = {}
        transfers = [in_transfers, out_transfers]
        todo_nodes = [pipeline.pipeline_node]
        while todo_nodes:
            node = todo_nodes.pop(0)
            if hasattr(node, 'process'):
                process = node.process
            else:
                process = node
            for param, trait in six.iteritems(process.user_traits()):
                if isinstance(trait.trait_type, File) \
                        or isinstance(trait.trait_type, Directory) \
                        or type(trait.trait_type) is Any:
                    # is value in paths
                    path = getattr(process, param)
                    if path is None or path is Undefined:
                        continue
                    output = bool(trait.output)
                    existing_transfers = transfers[output].get(process, {})
                    existing_transfer = existing_transfers.get(param)
                    if existing_transfer:
                        continue
                    for tpath in transfer_paths:
                        if path.startswith(os.path.join(tpath, '')):
                            transfer_item = swclient.FileTransfer(
                                is_input=not output,
                                client_path=path,
                                client_paths=_files_group(path,
                                                          merged_formats))
                            _propagate_transfer(node, param, path,
                                                not output, transfers,
                                                transfer_item)
                            break
            if hasattr(process, 'nodes'):
                todo_nodes += [sub_node
                               for name, sub_node
                                   in six.iteritems(process.nodes)
                               if name != ''
                                   and not isinstance(sub_node, Switch)]
        return transfers

    def _expand_nodes(nodes):
        '''Expands the nodes list or set to leaf nodes by replacing pipeline
        nodes by their children list.

        Returns
        -------
        set of leaf nodes.
        '''
        nodes_list = list(nodes)
        expanded_nodes = set()
        while nodes_list:
            node = nodes_list.pop(0)
            if not hasattr(node, 'process'):
                continue  # switch or something
            if isinstance(node.process, Pipeline):
                nodes_list.extend(
                    [p for p in six.itervalues(node.process.nodes)
                     if p is not node])
            else:
                expanded_nodes.add(node)
        return expanded_nodes

    def _handle_disable_nodes(pipeline, temp_map, transfers, disabled_nodes):
        '''Take into account disabled nodes by changing FileTransfer outputs
        for such nodes to inputs, and recording output temporary files, so
        as to ensure that missing temporary outputs will not be used later
        in the workflow.

        disabled_nodes should be a list, or preferably a set, of leaf
        process nodes. Use _expand_nodes() if needed before calling
        _handle_disable_nodes().
        '''
        move_to_input = {}
        remove_temp = set()
        for node in disabled_nodes:
            if not hasattr(node, 'process'):
                continue  # switch or something else
            process = node.process
            otrans = transfers[1].get(process, None)
            for param, trait in six.iteritems(process.user_traits()):
                if trait.output and (isinstance(trait.trait_type, File)
                                     or isinstance(trait.trait_type,
                                                   Directory)
                                     or type(trait.trait_type) is Any):
                    path = getattr(process, param)
                    if otrans is not None:
                        transfer, path2 = otrans.get(param, (None, None))
                    else:
                        transfer = None
                    if transfer is not None:
                        print('transferred output path:', path,
                              'from: %s.%s changes to input.'
                              % (node.name, param))
                        move_to_input[path] = transfer
                        transfer.initial_status \
                            = swclient.constants.FILES_ON_CLIENT
                    elif path in temp_map:
                        print('temp path in: %s.%s will not be produced.'
                              % (node.name, param))
                        remove_temp.add(path)
        return move_to_input, remove_temp
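    # Behavioural note (comment-only sketch): for a disabled node, an output
    # FileTransfer flips to an input one (its data is expected to already
    # exist on the client side), and any temporary output it would have
    # produced is recorded in remove_temp. build_job() then raises a
    # ValueError if an enabled job still needs one of those temporaries:
    #
    #     move_to_input, remove_temp = _handle_disable_nodes(
    #         pipeline, temp_subst_map, transfers, disabled_nodes)
    #     # remove_temp is later passed as forbidden_temp down to build_job()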
    def iter_to_workflow(process, node_name, step_name, temp_map, shared_map,
                         transfers, shared_paths, disabled_nodes,
                         remove_temp, steps, study_config, iteration):
        '''
        Build a workflow for a single iteration step of a
        process / sub-pipeline.

        Called for each iteration by build_iteration().

        Returns
        -------
        (jobs, dependencies, groups, root_jobs)
        '''
        if isinstance(process, Pipeline):
            temp_map2 = assign_temporary_filenames(process, len(temp_map))
            temp_subst_list = [(x1, x2[0]) for x1, x2
                               in six.iteritems(temp_map2)]
            temp_subst_map = dict(temp_subst_list)
            temp_subst_map.update(temp_map)
            try:
                graph = process.workflow_graph()
                (jobs, dependencies, groups, sub_root_jobs) = \
                    workflow_from_graph(
                        graph, temp_subst_map, shared_map, transfers,
                        shared_paths, disabled_nodes=disabled_nodes,
                        forbidden_temp=remove_temp, steps=steps,
                        study_config=study_config)
                group = build_group(node_name, six_values(sub_root_jobs))
                groups[(process, iteration)] = group
                root_jobs = {(process, iteration): group}
            finally:
                restore_empty_filenames(temp_map2)
        elif isinstance(process, ProcessIteration):
            # sub-iteration: recurse with the same workflow-building context
            return build_iteration(process, step_name, temp_map, shared_map,
                                   transfers, shared_paths, disabled_nodes,
                                   remove_temp, steps, study_config)
        else:
            # single process
            job = build_job(process, temp_map, shared_map, transfers,
                            shared_paths, forbidden_temp=remove_temp,
                            name=node_name, priority=jobs_priority,
                            step_name=step_name)
            jobs = {(process, iteration): job}
            groups = {}
            dependencies = {}
            root_jobs = {(process, iteration): job}

        return (jobs, dependencies, groups, root_jobs)
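    # Return-shape note (comment sketch): iter_to_workflow() above and
    # build_iteration() below both return a
    #
    #     (jobs, dependencies, groups, root_jobs)
    #
    # tuple, keyed by (process, iteration) where relevant, which
    # workflow_from_graph() merges into its own dicts before the final
    # soma-workflow Workflow is assembled.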
    def build_iteration(it_process, step_name, temp_map, shared_map,
                        transfers, shared_paths, disabled_nodes, remove_temp,
                        steps, study_config={}):
        '''
        Build workflow for an iterative process: the process / sub-pipeline
        is filled with appropriate parameters for each iteration, and its
        workflow is generated.

        Returns
        -------
        (jobs, dependencies, groups, root_jobs)
        '''
        no_output_value = None
        size = None
        size_error = False
        for parameter in it_process.iterative_parameters:
            trait = it_process.trait(parameter)
            psize = len(getattr(it_process, parameter))
            if psize:
                if size is None:
                    size = psize
                elif size != psize:
                    size_error = True
                    break
                if trait.output:
                    if no_output_value is None:
                        no_output_value = False
                    elif no_output_value:
                        size_error = True
                        break
            else:
                if trait.output:
                    if no_output_value is None:
                        no_output_value = True
                    elif not no_output_value:
                        size_error = True
                        break
                else:
                    if size is None:
                        size = psize
                    elif size != psize:
                        size_error = True
                        break

        if size_error:
            raise ValueError(
                'Iterative parameter values must be lists of the same '
                'size: %s'
                % ','.join('%s=%d' % (n, len(getattr(it_process, n)))
                           for n in it_process.iterative_parameters))

        jobs = {}
        workflows = []
        for parameter in it_process.regular_parameters:
            setattr(it_process.process, parameter,
                    getattr(it_process, parameter))

        jobs = {}
        dependencies = set()
        groups = {}
        root_jobs = {}

        if size == 0:
            return (jobs, dependencies, groups, root_jobs)

        if no_output_value:
            # this case is a "really" dynamic iteration: the number of
            # iterations and parameters are determined at runtime, so we
            # cannot handle it at the moment.
            raise ValueError('Dynamic iteration is not handled in this '
                             'version of CAPSUL / Soma-Workflow')

            for parameter in it_process.iterative_parameters:
                trait = it_process.trait(parameter)
                if trait.output:
                    setattr(it_process, parameter, [])
            outputs = {}
            for iteration in range(size):
                for parameter in it_process.iterative_parameters:
                    if not no_output_value \
                            or not it_process.trait(parameter).output:
                        setattr(it_process.process, parameter,
                                getattr(it_process, parameter)[iteration])

                # operate completion
                complete_iteration(it_process, iteration)

                #workflow = workflow_from_pipeline(it_process.process,
                    #study_config=study_config)
                #workflows.append(workflow)

                for parameter in it_process.iterative_parameters:
                    trait = it_process.trait(parameter)
                    if trait.output:
                        outputs.setdefault(parameter, []).append(
                            getattr(it_process.process, parameter))

            for parameter, value in six.iteritems(outputs):
                setattr(it_process, parameter, value)
        else:
            for iteration in range(size):
                for parameter in it_process.iterative_parameters:
                    setattr(it_process.process, parameter,
                            getattr(it_process, parameter)[iteration])

                # operate completion
                complete_iteration(it_process, iteration)

                process_name = it_process.process.name + '_%d' % iteration
                (sub_jobs, sub_dependencies, sub_groups, sub_root_jobs) = \
                    iter_to_workflow(
                        it_process.process, process_name, step_name,
                        temp_map, shared_map, transfers, shared_paths,
                        disabled_nodes, remove_temp, steps, study_config,
                        iteration)
                jobs.update(dict([((p, iteration), j)
                                  for p, j in six.iteritems(sub_jobs)]))
                dependencies.update(sub_dependencies)
                groups.update(sub_groups)
                root_jobs.update(sub_root_jobs)

        return (jobs, dependencies, groups, root_jobs)

    def complete_iteration(it_process, iteration):
        completion_engine = ProcessCompletionEngine.get_completion_engine(
            it_process)
        # check if it is an iterative completion engine
        if hasattr(completion_engine, 'complete_iteration_step'):
            completion_engine.complete_iteration_step(iteration)
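    # Comment-only example: for a ProcessIteration with
    # iterative_parameters = ['in_image', 'out_image'] and
    #
    #     it_process.in_image  = ['/data/s1.nii', '/data/s2.nii']
    #     it_process.out_image = ['/data/s1_out.nii', '/data/s2_out.nii']
    #
    # build_iteration() unrolls 2 iterations, assigning element [i] of each
    # list to the underlying process before building its jobs. Lists of
    # mismatched sizes raise the ValueError above. (Parameter names and
    # paths here are purely illustrative.)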
    def workflow_from_graph(graph, temp_map={}, shared_map={},
                            transfers=[{}, {}], shared_paths={},
                            disabled_nodes=set(), forbidden_temp=set(),
                            jobs_priority=0, steps={}, current_step='',
                            study_config={}):
        """ Convert a CAPSUL graph to a soma-workflow workflow

        Parameters
        ----------
        graph: Graph (mandatory)
            a CAPSUL graph
        temp_map: dict (optional)
            temporary files to replace by soma_workflow TemporaryPath
            objects
        shared_map: dict (optional)
            shared translated paths maps (global to pipeline).
            This dict is updated when needed during the process.
        transfers: list of 2 dicts (optional)
            file transfers dicts (input / output), indexed by process, then
            by file path.
        shared_paths: dict (optional)
            holds information about shared resource paths from the
            soma-workflow section in study config.
            If not specified, no translation will be used.
        jobs_priority: int (optional, default: 0)
            set this priority on soma-workflow jobs.
        steps: dict (optional)
            node name -> step name dict
        current_step: str (optional)
            the parent node step name
        study_config: StudyConfig instance (optional)
            used only for iterative nodes, to be passed to create
            sub-workflows

        Returns
        -------
        workflow: tuple (jobs, dependencies, groups, root_jobs)
            the corresponding soma-workflow workflow definition (to be
            passed to the Workflow constructor)
        """
        jobs = {}
        groups = {}
        root_jobs = {}
        dependencies = set()
        group_nodes = {}

        ordered_nodes = graph.topological_sort()
        proc_keys = dict([(node[1] if isinstance(node[1], Graph)
                           else node[1][0].process, i)
                          for i, node in enumerate(ordered_nodes)])

        # Go through all graph nodes
        for node_name, node in six.iteritems(graph._nodes):
            # If the node meta is a Graph store it
            if isinstance(node.meta, Graph):
                group_nodes[node_name] = node
            # Otherwise convert all the processes in meta as jobs
            else:
                sub_jobs = {}
                for pipeline_node in node.meta:
                    process = pipeline_node.process
                    if pipeline_node in disabled_nodes:
                        continue
                    step_name = current_step \
                        or steps.get(pipeline_node.name)
                    if isinstance(process, ProcessIteration):
                        # iterative node
                        group_nodes.setdefault(
                            node_name, []).append(pipeline_node)
                    elif (not isinstance(process, Pipeline)
                            and isinstance(process, Process)):
                        job = build_job(process, temp_map, shared_map,
                                        transfers, shared_paths,
                                        forbidden_temp=forbidden_temp,
                                        name=pipeline_node.name,
                                        priority=jobs_priority,
                                        step_name=step_name)
                        sub_jobs[process] = job
                        root_jobs[process] = [job]
                        #node.job = job
                jobs.update(sub_jobs)

        # Recurse on group nodes (sub-pipelines and iterative nodes)
        for node_name, node in six.iteritems(group_nodes):
            if isinstance(node, list):
                # iterative nodes
                for i, it_node in enumerate(node):
                    process = it_node.process
                    sub_workflows = build_iteration(
                        process, step_name, temp_map, shared_map, transfers,
                        shared_paths, disabled_nodes, {}, steps,
                        study_config={})
                    (sub_jobs, sub_deps, sub_groups, sub_root_jobs) = \
                        sub_workflows
                    group = build_group(node_name,
                                        six_values(sub_root_jobs))
                    groups.setdefault(process, []).append(group)
                    root_jobs.setdefault(process, []).append(group)
                    groups.update(sub_groups)
                    jobs.update(sub_jobs)
                    dependencies.update(sub_deps)
            else:
                # sub-pipeline
                wf_graph = node.meta
                step_name = current_step or steps.get(node_name, '')
                (sub_jobs, sub_deps, sub_groups, sub_root_jobs) \
                    = workflow_from_graph(
                        wf_graph, temp_map, shared_map, transfers,
                        shared_paths, disabled_nodes,
                        jobs_priority=jobs_priority,
                        steps=steps, current_step=step_name)
                group = build_group(node_name, six_values(sub_root_jobs))
                groups[node.meta] = group
                root_jobs[node.meta] = [group]
                jobs.update(sub_jobs)
                groups.update(sub_groups)
                dependencies.update(sub_deps)

        # Add dependencies between a source job and destination jobs
        for node_name, node in six.iteritems(graph._nodes):
            # Source job
            if isinstance(node.meta, list):
                if isinstance(node.meta[0].process, ProcessIteration):
                    sjobs = groups.get(node.meta[0].process)
                    if not sjobs:
                        continue  # disabled
                elif node.meta[0].process in jobs:
                    sjobs = [jobs[node.meta[0].process]]
                else:
                    continue  # disabled node
            else:
                sjobs = [groups[node.meta]]
            # Destination jobs
            for dnode in node.links_to:
                if isinstance(dnode.meta, list):
                    if isinstance(dnode.meta[0].process, ProcessIteration):
                        djobs = groups.get(dnode.meta[0].process)
                        if not djobs:
                            continue  # disabled
                    elif dnode.meta[0].process in jobs:
                        djobs = [jobs[dnode.meta[0].process]]
                    else:
                        continue  # disabled node
                else:
                    djobs = groups[dnode.meta]
                if not isinstance(djobs, list):
                    djobs = [djobs]
                for djob in djobs:
                    dependencies.update([(sjob, djob) for sjob in sjobs])

        # sort root jobs / groups
        root_jobs_list = []
        for p, js in six.iteritems(root_jobs):
            root_jobs_list.extend([(proc_keys[p], p, j) for j in js])
        root_jobs_list.sort()
        root_jobs = OrderedDict([x[1:] for x in root_jobs_list])

        return jobs, dependencies, groups, root_jobs
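    # Usage sketch (comments only): the tuple returned by
    # workflow_from_graph() is consumed at the end of
    # workflow_from_pipeline(), roughly:
    #
    #     jobs, dependencies, groups, root_jobs = workflow_from_graph(
    #         graph, temp_subst_map, shared_map, transfers, swf_paths[1],
    #         disabled_nodes=disabled_nodes, forbidden_temp=remove_temp,
    #         steps=steps, study_config=study_config)
    #     workflow = swclient.Workflow(jobs=six_values(jobs),
    #                                  dependencies=dependencies,
    #                                  root_group=six_values(root_jobs),
    #                                  name=pipeline.name)
    #
    # (the real call below also inserts the directory-creation job first).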
    def _create_directories_job(pipeline, shared_map={}, shared_paths={},
                                priority=0, transfer_paths=[]):
        def _is_transfer(d, transfer_paths):
            for path in transfer_paths:
                if d.startswith(os.path.join(path, '')):
                    return True
            return False

        directories = [d for d in pipeline_tools.get_output_directories(
                           pipeline)[1]
                       if not _is_transfer(d, transfer_paths)]
        if len(directories) == 0:
            return None  # no dirs to create.
        paths = []
        # check for path translations
        for path in directories:
            new_path = _translated_path(path, shared_map, shared_paths)
            paths.append(new_path or path)
        # use a python command to avoid the shell command mkdir
        cmdline = ['python', '-c',
                   'import sys, os; '
                   '[os.makedirs(p) if not os.path.exists(p) else None '
                   'for p in sys.argv[1:]]'] \
            + paths
        job = swclient.Job(
            name='output directories creation',
            command=cmdline,
            priority=priority)
        return job

    # TODO: handle formats in a separate, centralized place
    # formats: {name: ext_props}
    #     ext_props: {ext: [dependent_exts]}
    #     dependent_exts: (ext, mandatory)
    formats = {
        'NIFTI-1': {'.nii': [], '.img': [('.hdr', True)], '.nii.gz': []},
        'GIS': {'.ima': [('.dim', True)]},
        'GIFTI': {'.gii': []},
        'MESH': {'.mesh': []},
        'ARG': {'.arg': [('.data', False)]},
    }
    # transform it to an ext-based dict
    # merged_formats: {ext: [dependent_exts]}
    # (formats names are lost here)
    merged_formats = {}
    for format, values in six.iteritems(formats):
        merged_formats.update(values)
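    # Comment-only example: with the table above, merged_formats['.img'] is
    # [('.hdr', True)], so _files_group('/data/vol.img', merged_formats)
    # yields ['/data/vol.img', '/data/vol.hdr', '/data/vol.img.minf'],
    # letting a FileTransfer carry the dependent header and .minf files
    # along with the main file. ('/data/vol.img' is just an illustrative
    # path.)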
    if study_config is None:
        study_config = pipeline.get_study_config()

    if not isinstance(pipeline, Pipeline):
        # "pipeline" is actually a single process (or should be, if it is
        # not a pipeline). Get it into a pipeline (with a single node) to
        # build the workflow.
        new_pipeline = Pipeline()
        new_pipeline.set_study_config(study_config)
        new_pipeline.add_process('main', pipeline)
        new_pipeline.autoexport_nodes_parameters()
        pipeline = new_pipeline

    temp_map = assign_temporary_filenames(pipeline)
    temp_subst_list = [(x1, x2[0]) for x1, x2 in six.iteritems(temp_map)]
    temp_subst_map = dict(temp_subst_list)
    shared_map = {}
    swf_paths = _get_swf_paths(study_config)
    transfers = _get_transfers(pipeline, swf_paths[0], merged_formats)
    #print('disabling nodes:', disabled_nodes)

    # get complete list of disabled leaf nodes
    if disabled_nodes is None:
        disabled_nodes = pipeline.disabled_pipeline_steps_nodes()
    disabled_nodes = disabled_nodes \
        + [name for name, node in six.iteritems(pipeline.nodes)
           if node.node_type != 'processing_node'
           and name not in disabled_nodes]
    disabled_nodes = _expand_nodes(disabled_nodes)

    move_to_input, remove_temp = _handle_disable_nodes(
        pipeline, temp_subst_map, transfers, disabled_nodes)
    #print('changed transfers:', move_to_input)
    #print('removed temp:', remove_temp)
    #print('temp_map:', temp_map, '\n')
    #print('SWF transfers:', swf_paths[0])
    #print('shared paths:', swf_paths[1])

    if create_directories:
        # create job
        dirs_job = _create_directories_job(
            pipeline, shared_map=shared_map, shared_paths=swf_paths[1],
            transfer_paths=swf_paths[0])

    # build steps map
    steps = {}
    if hasattr(pipeline, 'pipeline_steps'):
        for step_name, step \
                in six.iteritems(pipeline.pipeline_steps.user_traits()):
            nodes = step.nodes
            steps.update(dict([(node, step_name) for node in nodes]))

    # Get a graph
    try:
        graph = pipeline.workflow_graph()
        (jobs, dependencies, groups, root_jobs) = workflow_from_graph(
            graph, temp_subst_map, shared_map, transfers, swf_paths[1],
            disabled_nodes=disabled_nodes, forbidden_temp=remove_temp,
            steps=steps, study_config=study_config)
    finally:
        restore_empty_filenames(temp_map)

    all_jobs = six_values(jobs)
    root_jobs = six_values(root_jobs)

    # if directories have to be created, all other primary jobs will depend
    # on this first one
    if create_directories and dirs_job is not None:
        dependend_jobs = set()
        for dependency in dependencies:
            dependend_jobs.add(dependency[1])
        new_deps = [(dirs_job, job) for job in all_jobs
                    if job not in dependend_jobs]
        dependencies.update(new_deps)
        all_jobs.insert(0, dirs_job)
        root_jobs.insert(0, dirs_job)

    workflow = swclient.Workflow(jobs=all_jobs,
                                 dependencies=dependencies,
                                 root_group=root_jobs,
                                 name=pipeline.name)

    return workflow
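# Usage sketch (comments only; ``my_pipeline`` and the controller set-up are
# assumptions, not part of this module): a built workflow is typically
# submitted through the soma-workflow client API, e.g.
#
#     from soma_workflow.client import WorkflowController
#
#     wf = workflow_from_pipeline(my_pipeline, study_config=study_config)
#     controller = WorkflowController()
#     controller.submit_workflow(workflow=wf, name=wf.name)
#
# Only workflow_from_pipeline() is defined here; submission is left to the
# caller (e.g. CAPSUL's soma-workflow execution backend).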