class WorkflowGenerator(object): """Class for creating a CWL workflow. The WorkflowGenerator class allows users to tie together inputs and outputs of the steps that need to be executed to perform a data processing task. The steps (i.e., command line tools and subworkflows) must be added to the steps library of the WorkflowGenerator object before they can be added to the workflow. To add steps to the steps library, the `load` method can be called with either a path to a directory containing CWL files: ``` from scriptcwl import WorkflowGenerator wf = WorkflowGenerator() wf.load(steps_dir='/path/to/dir/with/cwl/steps/') ``` Or a single CWL file: ``` wf.load(step_file='/path/to/cwl/step/file') ``` `wf.load` can be called multiple times. Step files are added to the steps library one after the other. For every step that is added to the steps library, a method with the same name is added to the WorkflowGenerator object. To add a step to the workflow, this method must be called (examples below). Next, the user should add one or more workflow inputs: ``` txt_dir = wf.add_inputs(txt_dir='Directory') ``` The `add_inputs` method expects (key, value) pairs as input parameters. Each pair connects an input name (`txt_dir` in the example) to a type (`'Directory'`). `addd_inputs` method returns a list of strings containing the names that can be used to connect these input parameters to step input parameter names. (Please note that because **kwargs are unordered, the list of input names may not be in the same order as the **kwargs. When a workflow has multiple inputs, it is probably safer to call `add_inputs` for every parameter separately.) Next, workflow steps can be added. To add a workflow step, its method must be called on the WorkflowGenerator object. This method expects a list of (key, value) pairs as input parameters. (To find out what inputs a step needs call `wf.inputs(<step name>)`. This method prints all the inputs and their types.) The method returns a list of strings containing output names that can be used as input for later steps, or that can be connected to workflow outputs. For example, to add a step called `frog-dir` to the workflow, the following method must be called: ``` frogout = wf.frog_dir(dir_in=txt_dir) ``` In a next step, `frogout` can be used as input: ``` saf = wf.frog_to_saf(in_files=frogout) txt = wf.saf_to_txt(in_files=saf) ``` Etcetera. When all steps of the workflow have been added, the user can specify workflow outputs: ``` wf.add_outputs(txt=txt) ``` Finally, the workflow can be saved to file: ``` wf.save('workflow.cwl') ``` To list steps and signatures available in the steps library, call: ``` wf.list_steps() ``` """ def __init__(self, steps_dir=None): self.wf_steps = CommentedMap() self.wf_inputs = CommentedMap() self.wf_outputs = CommentedMap() self.step_output_types = {} self.steps_library = {} self.has_workflow_step = False self.has_scatter_requirement = False self.load(steps_dir) def __getattr__(self, name, **kwargs): name = cwl_name(name) step = self._get_step(name) return partial(self._make_step, step, **kwargs) def load(self, steps_dir=None, step_file=None): """Load CWL steps into the WorkflowGenerator's steps library. Adds steps (command line tools and workflows) to the WorkflowGenerator's steps library. These steps can be used to create workflows. Args: steps_dir (str): path to directory containing CWL files. All CWL in the directory are loaded. step_file (str): path to a file containing a CWL step that will be added to the steps library. """ steps = load_steps(steps_dir=steps_dir, step_file=step_file) for n, step in steps.iteritems(): if n in self.steps_library.keys(): print 'WARNING: step "{}" already in steps library'.format(n) else: self.steps_library[n] = step def list_steps(self): """Prints the signature of all steps in the steps library. """ for name, step in self.steps_library.iteritems(): print 'Step "{}": {}'.format(name, step) def _has_requirements(self): """Returns True if the workflow needs a requirements section. Returns: bool: True if the workflow needs a requirements section, False otherwise. """ return bool(self.has_workflow_step or self.has_scatter_requirement) def inputs(self, name): """List input names and types of a step in the steps library. Args: name (str): name of a step in the steps library. """ s = self._get_step(name, make_copy=False) print s.list_inputs() def _add_step(self, step): """Add a step to the workflow. Args: step (Step): a step from the steps library. """ self.has_workflow_step = self.has_workflow_step or step.is_workflow self.wf_steps[step.name] = step.to_obj() def add_inputs(self, **kwargs): """Add workflow inputs. Args: kwargs (dict): A dict with `name=type` pairs, where name is the name (id) of the workflow input (e.g., `dir_in`) and type is the type of the input (e.g., `'Directory'`). The type of input parameters can be learned from `step.inputs(step_name=input_name)`. Returns: list of inputnames """ names = [] for name, typ in kwargs.iteritems(): self.wf_inputs[name] = typ names.append(name) if len(names) == 1: return names[0] return names def add_outputs(self, **kwargs): """Add workflow outputs. The output type is added automatically, based on the steps in the steps library. Args: kwargs (dict): A dict containing name=source name pairs. name is the name of the workflow output (e.g., `txt_files`) and source name is the name of the step that produced this output plus the output name (e.g., `saf-to-txt/out_files`). """ for name, source_name in kwargs.iteritems(): obj = {} obj['outputSource'] = source_name obj['type'] = self.step_output_types[source_name] self.wf_outputs[name] = obj def set_documentation(self, doc): """Set workflow documentation. Args: doc (str): documentation string. """ self.documentation = doc def _get_step(self, name, make_copy=True): """Return step from steps library. Optionally, the step returned is a deep copy from the step in the steps library, so additional information (e.g., about whether the step was scattered) can be stored in the copy. Args: name (str): name of the step in the steps library. make_copy (bool): whether a deep copy of the step should be returned or not (default: True). Returns: Step from steps library. Raises: ValueError: The requested step cannot be found in the steps library. """ s = self.steps_library.get(name) if s is None: msg = '"{}" not found in steps library. Please check your ' \ 'spelling or load additional steps' raise ValueError(msg.format(name)) if make_copy: s = copy.deepcopy(s) return s def to_obj(self): """Return the created workflow as a dict. The dict can be written to a yaml file. Returns: A yaml-compatible dict representing the workflow. """ obj = CommentedMap() obj['cwlVersion'] = 'v1.0' obj['class'] = 'Workflow' try: obj['doc'] = self.documentation except (AttributeError, ValueError): pass if self._has_requirements(): obj['requirements'] = [] if self.has_workflow_step: obj['requirements'].append( {'class': 'SubworkflowFeatureRequirement'}) if self.has_scatter_requirement: obj['requirements'].append({'class': 'ScatterFeatureRequirement'}) obj['inputs'] = self.wf_inputs obj['outputs'] = self.wf_outputs obj['steps'] = self.wf_steps return obj def to_script(self, wf_name='wf'): """Generated and print the scriptcwl script for the currunt workflow. Args: wf_name (str): string used for the WorkflowGenerator object in the generated script (default: wf). """ # Workflow documentation if self.documentation: if is_multiline(self.documentation): print 'doc = """' print self.documentation print '"""' print '{}.set_documentation(doc)'.format(wf_name) else: print '{}.set_documentation(\'{}\')'.format( wf_name, self.documentation) # Workflow inputs params = [] returns = [] for name, typ in self.wf_inputs.iteritems(): params.append('{}=\'{}\''.format(name, typ)) returns.append(name) print '{} = {}.add_inputs({})'.format(', '.join(returns), wf_name, ', '.join(params)) # Workflow steps returns = [] for name, step in self.wf_steps.iteritems(): s = Step(step['run']) returns = [ '{}_{}'.format(python_name(s.name), o) for o in step['out'] ] params = [ '{}={}'.format(name, python_name(param)) for name, param in step['in'].iteritems() ] print '{} = {}.{}({})'.format(', '.join(returns), wf_name, s.python_name, ', '.join(params)) # Workflow outputs params = [] for name, details in self.wf_outputs.iteritems(): params.append('{}={}'.format(name, python_name(details['outputSource']))) print '{}.add_outputs({})'.format(wf_name, ', '.join(params)) def _make_step(self, step, **kwargs): for k in step.get_input_names(): if k not in kwargs.keys() and k not in step.optional_input_names: raise ValueError( 'Expecting "{}" as a keyword argument.'.format(k)) if kwargs.get(k): step.set_input(k, kwargs[k]) if 'scatter' in kwargs.keys() or 'scatter_method' in kwargs.keys(): # Check whether both required keyword arguments are present msg = 'Expecting "{}" as a keyword argument.' if not kwargs.get('scatter'): raise ValueError(msg.format('scatter')) if not kwargs.get('scatter_method'): raise ValueError(msg.format('scatter_method')) # Check validity of scatterMethod scatter_methods = [ 'dotproduct', 'nested_crossproduct', 'flat_crossproduct' ] m = kwargs.get('scatter_method') if m not in scatter_methods: msg = 'Invalid scatterMethod "{}". Please use one of ({}).' raise ValueError(msg.format(m, ', '.join(scatter_methods))) step.scatter_method = m # Check whether the scatter variables are valid for this step scatter_vars = kwargs.get('scatter') if isinstance(scatter_vars, six.string_types): scatter_vars = [scatter_vars] for var in scatter_vars: if var not in step.get_input_names(): msg = 'Invalid variable "{}" for scatter.' raise ValueError(msg.format(var)) step.scattered_inputs.append(var) # Update step output types (outputs are now arrays) for name, typ in step.step_outputs.iteritems(): step.step_outputs[name] = {'type': 'array', 'items': typ} self.has_scatter_requirement = True step.is_scattered = True outputs = [] for n in step.output_names: oname = step.output_to_input(n) self.step_output_types[oname] = step.step_outputs[n] outputs.append(step.output_to_input(n)) self._add_step(step) if len(outputs) == 1: return outputs[0] return outputs def save(self, fname, encoding='utf-8'): """Save the workflow to file. Save the workflow to a CWL file that can be run with a CWL runner. Args: fname (str): file to save the workflow to. encoding (str): file encoding to use (default: utf-8). """ dirname = os.path.dirname(os.path.abspath(fname)) if not os.path.exists(dirname): os.makedirs(dirname) yaml.add_representer(str, str_presenter) with codecs.open(fname, 'wb', encoding=encoding) as yaml_file: yaml_file.write('#!/usr/bin/env cwl-runner\n') yaml_file.write( yaml.dump(self.to_obj(), Dumper=yaml.RoundTripDumper))