def with_workflow_storage(self):
    """Yield a workflow storage.

    A fresh ``Workflow`` is yielded to the caller first; when control
    returns, every step collected in ``workflow.steps`` is serialized
    to its own ``.cwl`` file under ``self.workflow_path``.  The whole
    operation runs while holding ``self.lock``.
    """
    with self.lock:
        # NOTE(review): imported here rather than at module level —
        # presumably to avoid a circular import; confirm before moving.
        from renku.models.cwl._ascwl import ascwl
        from renku.models.cwl.workflow import Workflow

        workflow = Workflow()
        yield workflow

        for step in workflow.steps:
            # File name = random hex prefix + sanitized base command,
            # e.g. ``<uuid4hex>_wc_-l.cwl``.
            step_name = '{0}_{1}.cwl'.format(
                uuid.uuid4().hex,
                secure_filename('_'.join(step.run.baseCommand)),
            )

            # Create the workflow directory lazily on first use.
            workflow_path = self.workflow_path
            if not workflow_path.exists():
                workflow_path.mkdir()

            step_path = workflow_path / step_name
            with step_path.open('w') as step_file:
                yaml.dump(
                    ascwl(
                        step.run,
                        # Serialize only attributes that have a value.
                        filter=lambda _, x: x is not None,
                        basedir=workflow_path,
                    ),
                    stream=step_file,
                    default_flow_style=False)
def test_workflow(runner, project):
    """Test workflow command."""
    # Produce ``data.csv`` via a tracked ``renku run`` invocation.
    result = runner.invoke(cli.cli, ['run', 'touch', 'data.csv'])
    assert 0 == result.exit_code

    # Capture stdout of ``renku run wc`` into ``counted.txt``; calling
    # ``cli.cli.main`` directly lets us redirect this process's stdout.
    with open('counted.txt', 'w') as stdout:
        with contextlib.redirect_stdout(stdout):
            try:
                cli.cli.main(
                    args=('run', 'wc', 'data.csv'),
                    prog_name=runner.get_default_prog_name(cli.cli),
                )
            except SystemExit as e:
                # Click may exit via SystemExit; 0/None means success.
                assert e.code in {None, 0}

    result = runner.invoke(
        cli.cli,
        ['workflow', 'create', 'counted.txt', '-o', 'workflow.cwl'],
        catch_exceptions=False,
    )
    assert 0 == result.exit_code

    with open('workflow.cwl', 'r') as f:
        workflow = Workflow.from_cwl(yaml.safe_load(f))

    # Steps must reference tools stored in the Renku workflow folder.
    assert workflow.steps[0].run.startswith('.renku/workflow/')

    # Compare default log and log for a specific file.
    result_default = runner.invoke(cli.cli, ['log'])
    result_arg = runner.invoke(cli.cli, ['log', 'counted.txt'])

    assert 0 == result_default.exit_code
    assert 0 == result_arg.exit_code
    assert result_default.output == result_arg.output
def test_workflow(runner):
    """Test workflow command."""
    # Produce ``data.csv`` via a tracked ``renku run`` invocation.
    result = runner.invoke(cli.cli, ['run', 'touch', 'data.csv'])
    assert result.exit_code == 0

    # Capture stdout of ``renku run wc`` into ``counted.txt``.
    with open('counted.txt', 'w') as stdout:
        with contextlib.redirect_stdout(stdout):
            try:
                cli.cli.main(
                    args=('run', 'wc', 'data.csv'),
                    prog_name=runner.get_default_prog_name(cli.cli),
                )
            except SystemExit as e:
                # Click may exit via SystemExit; 0/None means success.
                assert e.code in {None, 0}

    result = runner.invoke(
        cli.cli,
        ['workflow', 'create', 'counted.txt', '-o', 'workflow.cwl'])
    assert result.exit_code == 0

    with open('workflow.cwl', 'r') as f:
        # FIX: use ``safe_load`` instead of ``yaml.load`` — no arbitrary
        # Python object construction, and consistent with the sibling test.
        workflow = Workflow.from_cwl(yaml.safe_load(f))

    # Steps must reference tools stored in the Renku workflow folder.
    assert workflow.steps[0].run.startswith('.renku/workflow/')
def add_tool(self,
             commit,
             path,
             file_key=None,
             expand_workflow=True,
             is_step=False):
    """Add a tool and its dependencies to the graph.

    :param commit: Commit whose tree contains the CWL file.
    :param path: Path of the CWL file inside the commit tree.
    :param file_key: Optional ``(commit, path)`` key of an output file
        to connect to this tool.
    :param expand_workflow: When the document is a workflow (not a
        command-line tool), recurse into :meth:`add_workflow`.
    :param is_step: Set when adding a single workflow step; input and
        output wiring is then left to the caller.
    :return: The graph key of the added tool node.
    """
    data = (commit.tree / path).data_stream.read()
    # FIX: ``safe_load`` instead of ``yaml.load`` — never construct
    # arbitrary Python objects from a repository-provided CWL document.
    cwl = yaml.safe_load(data)

    try:
        tool = CommandLineTool.from_cwl(cwl)
    except TypeError:
        # Not a command-line tool; treat the document as a workflow.
        if expand_workflow:
            return self.add_workflow(commit, path, file_key=file_key, cwl=cwl)
        tool = Workflow.from_cwl(cwl)

    tool_key = self.add_node(commit, path, tool=tool)

    if is_step:
        return tool_key

    for input_path, input_id in self.iter_file_inputs(
            tool, os.path.dirname(path)):
        # Inputs come from the parent revision of this commit.
        input_key = self.add_file(input_path, revision='{0}^'.format(commit))
        #: Edge from an input to the tool.
        self.G.add_edge(input_key, tool_key, id=input_id)

    if file_key:
        _, path = file_key
        output_id = tool.get_output_id(path)
        if output_id:
            self.G.add_edge(tool_key, file_key, id=output_id)

    return tool_key
def add_tool(self,
             commit,
             path,
             file_key=None,
             expand_workflow=True,
             is_step=False):
    """Add a tool and its dependencies to the graph.

    :param commit: Commit whose tree contains the CWL file.
    :param path: Path of the CWL file inside the commit tree.
    :param file_key: Kept for interface compatibility with callers.
    :param expand_workflow: When the document is a workflow (not a
        command-line tool), recurse into :meth:`add_workflow`.
    :param is_step: Set when adding a single workflow step; input and
        output wiring is then left to the caller.
    :return: The graph key of the added tool node.
    """
    data = (commit.tree / path).data_stream.read()
    # FIX: ``safe_load`` instead of ``yaml.load`` — never construct
    # arbitrary Python objects from a repository-provided CWL document.
    cwl = yaml.safe_load(data)

    try:
        tool = CommandLineTool.from_cwl(cwl)
    except TypeError:
        # Not a command-line tool; treat the document as a workflow.
        if expand_workflow:
            return self.add_workflow(commit, path, file_key=file_key, cwl=cwl)
        tool = Workflow.from_cwl(cwl)

    tool_key = self.add_node(commit, path, tool=tool)

    if is_step:
        return tool_key

    for input_id, input_path in self.iter_input_files(
            tool, os.path.dirname(path)):
        # Inputs come from the parent revision of this commit.
        input_key = self.add_file(input_path, revision='{0}^'.format(commit))
        #: Edge from an input to the tool.
        self.G.add_edge(input_key, tool_key, id=input_id)

    # Find ALL siblings that MUST be generated in the same commit.
    for output_id, path in self.iter_output_files(tool):
        self.G.add_edge(tool_key, (str(commit), path), id=output_id)

    return tool_key
def ascwl(
    self,
    input_paths=None,
    output_paths=None,
    outputs=None,
    use_latest=True,
):
    """Serialize graph to CWL workflow.

    :param input_paths: Paths treated as external inputs; dependency
        traversal does not follow them.
    :param output_paths: Paths to expose as workflow outputs; derived
        from ``outputs`` when not given.
    :param outputs: Nodes whose producing processes seed the traversal.
    :param use_latest: Replace each plan with its latest known version.
    """
    if output_paths is None:
        output_paths = {
            node.path
            for node in outputs if _safe_path(node.path)
        }

    workflow = Workflow()
    processes = set()
    stack = []

    # Index nodes by (commit, path) for O(1) lookups below.
    output_keys = {(node.commit, node.path) for node in outputs}
    nodes = {(node.commit, node.path): node for node in self.nodes}

    def connect_file_to_directory(node):
        """Return step connecting file to a directory."""
        # Synthesize a LINK_CWL process that extracts ``node`` from its
        # parent directory.
        process = attr.evolve(
            LINK_CWL,
            inputs={
                'input_directory': 'Directory',
                'filename': {
                    'type': 'string',
                    'default':
                        str(Path(node.path).relative_to(node.parent.path)),
                },
            })
        process_run = ProcessRun(
            commit=node.commit,
            client=node.client,
            path=None,
            process=process,
            inputs={
                node.parent.path: Usage(
                    entity=node.parent,
                    role='input_directory',
                ),
            },
            outputs={
                node.path: 'output_file',
            },
        )
        # Register generated entities so later lookups resolve them.
        for generated in process_run.generated:
            nodes[(generated.commit, generated.path)] = generated
        return process_run

    # Seed the traversal with the process runs producing each output.
    for node in self.nodes:
        if (node.commit, node.path) not in output_keys:
            continue

        process_run = None

        if isinstance(node, Entity) and not hasattr(node, 'activity'):
            # Plain entity (e.g. file in a directory) — synthesize a
            # linking step for it.
            process_run = connect_file_to_directory(node)
            stack.append(process_run)
            processes.add(process_run)
        else:
            assert hasattr(node, 'activity'), node
            assert isinstance(node.activity, ProcessRun)

            plan = node.activity.association.plan
            latest = self.latest(plan)
            if use_latest and latest:
                plan = nodes[(latest, plan.path)]

            process_run = plan.activity

            if process_run not in processes:
                stack.append(process_run)
                processes.add(process_run)

    # Walk dependencies backwards until all producing runs are known.
    while stack:
        action = stack.pop()

        if not hasattr(action, 'inputs'):
            continue

        for path, dependency in action.inputs.items():
            # Do not follow defined input paths.
            if input_paths and path in input_paths:
                continue

            node = nodes.get((dependency.commit, dependency.path), dependency)

            if isinstance(node, Generation):
                process_run = node.activity
            elif isinstance(node, Collection) and node.parent:
                raise NotImplementedError('Can not connect subdirectory')
            elif isinstance(node, Entity) and node.parent:
                process_run = connect_file_to_directory(node)
            else:
                process_run = None

            # Skip existing commits
            if process_run and isinstance(process_run, ProcessRun):
                plan = process_run.association.plan
                latest = self.latest(plan)
                if process_run.path and use_latest and latest:
                    plan = nodes[(latest, plan.path)]
                    process_run = plan.activity

                if process_run not in processes:
                    stack.append(process_run)
                    processes.add(process_run)

    # Assign stable step identifiers: step_1, step_2, ...
    steps = {
        tool: 'step_{0}'.format(tool_index)
        for tool_index, tool in enumerate(processes, 1)
    }

    def _source_name(commit, path):
        """Find source name for a node."""
        try:
            process_run = nodes[(commit, path)].activity
            output_id = process_run.outputs[path]
            return '{0}/{1}'.format(steps[process_run], output_id)
        except (KeyError, AttributeError):
            # Node has no producing step — returns None implicitly.
            pass

    def _relative_default(client, default):
        """Evolve ``File`` or ``Directory`` path."""
        if isinstance(default, PATH_TYPES):
            path = (client.workflow_path / default.path).resolve()
            return attr.evolve(default, path=path)
        return default

    input_index = 1

    for action, step_id in steps.items():
        tool = action.process

        # Map each input role to the step output that produces it.
        ins = {}
        for path, dependency in action.inputs.items():
            alias = _source_name(dependency.commit, path)
            if alias:
                ins[dependency.role] = alias

        outs = list(set(action.outputs.values()))
        # Expose everything the step generated as workflow outputs too.
        for generated in action.generated:
            if generated.entity.path not in output_paths:
                output_paths.add(generated.entity.path)
                outputs.add(generated.entity)

        # Unconnected tool inputs become global workflow inputs.
        for input_ in tool.inputs:
            input_mapping = ins.get(input_.id)
            if input_mapping is None:
                input_id = 'input_{0}'.format(input_index)
                workflow.inputs.append(
                    InputParameter(
                        id=input_id,
                        type=input_.type,
                        default=_relative_default(self.client, input_.default),
                    ))
                input_index += 1
                ins[input_.id] = input_id

        workflow.add_step(
            # Synthesized runs have no path; embed the tool inline.
            run=self.client.path / action.path if action.path else tool,
            id=step_id,
            in_=ins,
            out=outs,
        )

    for index, node in enumerate(
        (node for node in outputs if node.path in output_paths)):
        commit, path = node.commit, node.path
        id_ = 'output_{0}'.format(index)

        process_run = nodes[(commit, path)].activity

        if process_run.process is None or process_run.path is None:
            continue

        output_id = process_run.outputs[path]
        type_ = next(output for output in process_run.process.outputs
                     if output.id == output_id).type
        # Everything except directories is exposed as a File.
        type_ = type_ if type_ == 'Directory' else 'File'

        output_source = _source_name(commit, path)

        if output_source is None:
            continue

        workflow.outputs.append(
            WorkflowOutputParameter(
                id=id_,
                type=type_,
                outputSource=output_source,
            ))

    return workflow
def ascwl(self, global_step_outputs=False):
    """Serialize graph to CWL workflow.

    :param global_step_outputs: Make all step outputs global.
    """
    workflow = Workflow()
    input_index = 1
    steps = {}

    def _source_name(key):
        """Find source name for a node."""
        # A node with no predecessor has no producing step.
        if self.G.in_degree(key) == 0:
            return None

        assert self.G.in_degree(key) == 1

        tool_key, node = list(self.G.pred[key].items())[0]
        return '{0}/{1}'.format(steps[tool_key], node['id'])

    def _relative_default(client, default):
        """Evolve ``File`` path."""
        if isinstance(default, File):
            path = (client.workflow_path / default.path).resolve()
            return attr.evolve(default, path=path)
        return default

    for tool_index, (key, node) in enumerate(self._tool_nodes, 1):
        _, path = key
        tool = node['tool']
        step_id = 'step_{0}'.format(tool_index)
        steps[key] = step_id

        # Map input edge ids to the producing step output (or None).
        ins = {
            edge_id: _source_name(target_id)
            for target_id, _, edge_id in self.G.in_edges(key, data='id')
        }
        outs = [
            edge_id for _, _, edge_id in self.G.out_edges(key, data='id')
        ]

        # Unconnected tool inputs become global workflow inputs.
        for input_ in tool.inputs:
            input_mapping = ins.get(input_.id)
            if input_mapping is None:
                input_id = 'input_{0}'.format(input_index)
                workflow.inputs.append(
                    InputParameter(
                        id=input_id,
                        type=input_.type,
                        default=_relative_default(self.client, input_.default),
                    ))
                input_index += 1
                ins[input_.id] = input_id

        workflow.add_step(
            run=self.client.path / path,
            id=step_id,
            in_=ins,
            out=outs,
        )

    # Either expose every step output or only the graph's sinks.
    output_keys = (key for _, key in self.G.out_edges(steps.keys())
                   ) if global_step_outputs else self._output_keys

    for index, key in enumerate(output_keys):
        output_id = 'output_{0}'.format(index)
        workflow.outputs.append(
            WorkflowOutputParameter(
                id=output_id,
                type='File',
                outputSource=_source_name(key),
            ))

    return workflow
def add_workflow(self, commit, path, cwl=None, file_key=None):
    """Add a workflow and its dependencies to the graph.

    :param commit: Commit whose tree contains the CWL workflow.
    :param path: Path of the workflow file inside the commit tree.
    :param cwl: Already-parsed CWL document; loaded from the commit
        tree when not given.
    :param file_key: Passed through to :meth:`add_tool` for each step.
    :return: The parsed ``Workflow`` instance.
    """
    if cwl is None:
        data = (commit.tree / path).data_stream.read()
        # FIX: ``safe_load`` instead of ``yaml.load`` — never construct
        # arbitrary Python objects from a repository-provided document.
        cwl = yaml.safe_load(data)

    workflow = Workflow.from_cwl(cwl)
    basedir = os.path.dirname(path)

    # Keep track of node identifiers for steps, inputs and outputs:
    step_map = {}
    input_map = {}
    output_map = {}

    #: First find workflow inputs, but don't connect them yet.
    for input_id, input_path in self.iter_input_files(workflow, basedir):
        input_key = self.add_file(input_path, revision='{0}^'.format(commit))
        input_map[input_id] = input_key

    for step in workflow.steps:
        tool_key = self.add_tool(
            commit,
            os.path.join(basedir, step.run),
            file_key=file_key,
            is_step=True,
        )

        step_tool = self.G.nodes[tool_key]['tool']

        for input_id, input_path in self.iter_input_files(
                step_tool, basedir):
            if input_path in commit.stats.files:
                #: Check intermediate committed files
                input_key = self.add_node(commit, input_path)
                #: Edge from an input to the tool.
                self.G.add_edge(input_key, tool_key, id=input_id)
            else:
                #: Global workflow input
                source = step.in_[input_id]
                self.G.add_edge(input_map[source], tool_key, id=input_id)

        # Find ALL siblings that MUST be generated in the same commit.
        for output_id, output_path in self.iter_output_files(step_tool):
            self.G.add_edge(tool_key, (str(commit), output_path),
                            id=output_id)

        # Remember which node feeds each "<step>/<input>" alias.
        output_map.update({
            step.id + '/' + name: target
            for target, _, name in self.G.in_edges(tool_key, data='id')
        })
        step_map[step.id] = tool_key

        self.G.nodes[tool_key]['workflow'] = workflow
        self.G.nodes[tool_key][
            'workflow_path'] = path + '#steps/' + step.id

    # Second pass: connect step-to-step edges ("<other_step>/<id>").
    for step in workflow.steps:
        for alias, source in step.in_.items():
            name = step.id + '/' + alias
            if name in output_map and '/' in source:
                other_step, id_ = source.split('/')
                other_key = step_map[other_step]
                self.G.add_edge(other_key, output_map[name], id=id_)

    return workflow
def ascwl(
    self,
    input_paths=None,
    output_paths=None,
    outputs=None,
    use_latest=True,
):
    """Serialize graph to CWL workflow.

    :param input_paths: Paths treated as external inputs; dependency
        traversal does not follow them.
    :param output_paths: Paths to expose as workflow outputs; derived
        from ``outputs`` when not given.
    :param outputs: Nodes whose producing processes seed the traversal.
    :param use_latest: Replace each run with its latest known version.
    """
    if output_paths is None:
        output_paths = {
            node.path
            for node in outputs if _safe_path(node.path)
        }

    workflow = Workflow()
    processes = set()
    stack = []

    # Index nodes by (commit, path) for O(1) lookups below.
    output_keys = {(node.commit, node.path) for node in outputs}
    nodes = {(node.commit, node.path): node for node in self.nodes}

    # Seed the traversal with the process runs producing each output.
    for node in self.nodes:
        if (node.commit, node.path) not in output_keys:
            continue

        process_run = None

        if isinstance(node, ProcessRun):
            process_run = node
        elif isinstance(node.activity, ProcessRun):
            process_run = node.activity

        if process_run:
            latest = self.latest(process_run)
            if use_latest and latest:
                process_run = nodes[(latest, process_run.path)]

            if process_run not in processes:
                stack.append(process_run)
                processes.add(process_run)

    # Walk dependencies backwards until all producing runs are known.
    while stack:
        action = stack.pop()

        if not hasattr(action, 'inputs'):
            continue

        for path, dependency in action.inputs.items():
            # Do not follow defined input paths.
            if input_paths and path in input_paths:
                continue

            try:
                process_run = nodes[(dependency.commit,
                                     dependency.path)].activity
            except AttributeError:
                # Dependency has no producing activity — external input.
                continue

            # Skip existing commits
            if process_run and isinstance(process_run, ProcessRun):
                latest = self.latest(process_run)
                if use_latest and latest:
                    process_run = nodes[(latest, process_run.path)]

                if process_run not in processes:
                    stack.append(process_run)
                    processes.add(process_run)

    # Assign stable step identifiers: step_1, step_2, ...
    steps = {
        tool: 'step_{0}'.format(tool_index)
        for tool_index, tool in enumerate(processes, 1)
    }

    def _source_name(commit, path):
        """Find source name for a node."""
        try:
            process_run = nodes[(commit, path)].activity
            output_id = process_run.outputs[path]
            return '{0}/{1}'.format(steps[process_run], output_id)
        except (KeyError, AttributeError):
            # Node has no producing step — returns None implicitly.
            pass

    def _relative_default(client, default):
        """Evolve ``File`` or ``Directory`` path."""
        if isinstance(default, PATH_TYPES):
            path = (client.workflow_path / default.path).resolve()
            return attr.evolve(default, path=path)
        return default

    input_index = 1

    for action, step_id in steps.items():
        tool = action.process

        # Map each input role to the step output that produces it.
        ins = {}
        for path, dependency in action.inputs.items():
            alias = _source_name(dependency.commit, path)
            if alias:
                ins[dependency.role] = alias

        outs = list(set(action.outputs.values()))
        # Expose everything the step generated as workflow outputs too.
        for generated in action.generated:
            if generated.entity.path not in output_paths:
                output_paths.add(generated.entity.path)
                outputs.add(generated.entity)

        # Unconnected tool inputs become global workflow inputs.
        for input_ in tool.inputs:
            input_mapping = ins.get(input_.id)
            if input_mapping is None:
                input_id = 'input_{0}'.format(input_index)
                workflow.inputs.append(
                    InputParameter(
                        id=input_id,
                        type=input_.type,
                        default=_relative_default(self.client, input_.default),
                    ))
                input_index += 1
                ins[input_.id] = input_id

        workflow.add_step(
            run=self.client.path / action.path,
            id=step_id,
            in_=ins,
            out=outs,
        )

    for index, node in enumerate(
        (node for node in outputs if node.path in output_paths)):
        commit, path = node.commit, node.path
        id_ = 'output_{0}'.format(index)

        process_run = nodes[(commit, path)].activity

        if process_run.process is None:
            continue

        output_id = process_run.outputs[path]
        type_ = next(output for output in process_run.process.outputs
                     if output.id == output_id).type
        # Everything except directories is exposed as a File.
        type_ = type_ if type_ == 'Directory' else 'File'

        output_source = _source_name(commit, path)

        if output_source is None:
            continue

        workflow.outputs.append(
            WorkflowOutputParameter(
                id=id_,
                type=type_,
                outputSource=output_source,
            ))

    return workflow