def _migrate_composite_step(client, workflow, path, commit=None):
    """Migrate a composite workflow."""
    if not commit:
        commit = client.find_previous_commit(path)
    run = Run(client=client, path=path, commit=commit)
    name = '{0}_migrated.yaml'.format(uuid.uuid4().hex)
    run.path = (client.workflow_path / name).relative_to(client.path)

    for step in workflow.steps:
        if isinstance(step.run, dict):
            continue

        path = client.workflow_path / step.run
        subrun = parse_cwl_cached(str(path))

        subprocess, _ = _migrate_single_step(client, subrun, path, commit=commit)
        subprocess.path = run.path
        run.add_subprocess(subprocess)

    with with_reference(run.path):
        wf = WorkflowRun.from_run(run, client, run.path, commit=commit)
        wf.to_yaml()
        client.add_to_activity_index(wf)

    return wf, run.path

def with_workflow_storage(self):
    """Yield a workflow storage."""
    from renku.core.models.cwl.workflow import Workflow

    workflow = Workflow()
    yield workflow

    for step in workflow.steps:
        step_name = '{0}_{1}.yaml'.format(
            uuid.uuid4().hex,
            secure_filename('_'.join(step.run.baseCommand)),
        )

        workflow_path = self.workflow_path
        if not workflow_path.exists():
            workflow_path.mkdir()

        path = workflow_path / step_name

        with with_reference(path):
            run = step.run.generate_process_run(
                client=self,
                commit=self.repo.head.commit,
                path=path,
            )
            run.to_yaml()
            self.add_to_activity_index(run)

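# Illustrative usage sketch for ``with_workflow_storage`` above (not part of
# the original module). It assumes the method is wrapped in
# ``contextlib.contextmanager``, as the "Yield" docstring suggests, and the
# step object below is hypothetical:
#
#     with client.with_workflow_storage() as workflow:
#         workflow.steps.append(tool_step)  # hypothetical step object
#
# On exit, each step is serialized to ``.renku/workflow/<uuid>_<command>.yaml``
# via ``generate_process_run`` and registered in the activity index.
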
def create_dataset(
    self,
    short_name=None,
    title=None,
    description=None,
    creators=None,
    keywords=None,
):
    """Create a dataset."""
    if not short_name:
        raise errors.ParameterError('Dataset short_name must be provided.')

    if not is_dataset_short_name_valid(short_name):
        raise errors.ParameterError(
            'Dataset short_name "{}" is not valid.'.format(short_name))

    if self.load_dataset(short_name=short_name):
        raise errors.DatasetExistsError(
            'Dataset exists: "{}".'.format(short_name))

    if not title:
        title = short_name

    identifier = str(uuid.uuid4())
    path = self.renku_datasets_path / identifier / self.METADATA
    if path.exists():
        raise errors.DatasetExistsError(
            'Dataset with reference {} exists'.format(path))

    path.parent.mkdir(parents=True, exist_ok=True)

    if creators is None:
        creators = [Person.from_git(self.repo)]

    keywords = keywords or ()

    with with_reference(path):
        dataset = Dataset(
            client=self,
            identifier=identifier,
            short_name=short_name,
            name=title,
            description=description,
            creator=creators,
            keywords=keywords,
        )

    dataset_ref = LinkReference.create(client=self, name='datasets/' + short_name)
    dataset_ref.set_reference(path)

    dataset.path = Path(dataset.path).relative_to(self.path)
    dataset.to_yaml()

    return dataset, path, dataset_ref

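# Illustrative usage sketch (``client`` is a hypothetical repository client,
# not defined here): creating a dataset writes its metadata file under
# ``.renku/datasets/<identifier>/`` and points a ``datasets/<short_name>``
# link reference at it.
#
#     dataset, path, ref = client.create_dataset(
#         short_name='my-data',
#         title='My data',
#         keywords=['example'],  # creators default to the repository's git author
#     )
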
def update(client, revision, no_output, siblings, paths):
    """Update existing files by rerunning their outdated workflow."""
    graph = Graph(client)
    outputs = graph.build(revision=revision, can_be_cwl=no_output, paths=paths)
    outputs = {node for node in outputs if graph.need_update(node)}
    if not outputs:
        click.secho("All files were generated from the latest inputs.", fg="green")
        sys.exit(0)

    # Check or extend siblings of outputs.
    outputs = siblings(graph, outputs)
    output_paths = {node.path for node in outputs if _safe_path(node.path)}

    # Get all clean nodes.
    input_paths = {node.path for node in graph.nodes} - output_paths

    # Store the generated workflow used for updating paths.
    workflow = graph.as_workflow(
        input_paths=input_paths,
        output_paths=output_paths,
        outputs=outputs,
    )

    wf, path = CWLConverter.convert(workflow, client)

    # Don't compute paths if storage is disabled.
    if client.check_external_storage():
        # Make sure all inputs are pulled from a storage.
        paths_ = (i.consumes.path for i in workflow.inputs)
        client.pull_paths_from_storage(*paths_)

    execute(client, path, output_paths=output_paths)

    paths = [o.produces.path for o in workflow.outputs]

    client.repo.git.add(*paths)

    if client.repo.is_dirty():
        commit_msg = "renku update: committing {} newly added files".format(len(paths))
        committer = Actor("renku {0}".format(__version__), version_url)
        client.repo.index.commit(
            commit_msg,
            committer=committer,
            skip_hooks=True,
        )

    workflow_name = "{0}_update.yaml".format(uuid.uuid4().hex)
    path = client.workflow_path / workflow_name
    workflow.update_id_and_label_from_commit_path(client, client.repo.head.commit, path)

    with with_reference(path):
        cls = WorkflowRun if workflow.subprocesses else ProcessRun
        run = cls.from_run(run=workflow, client=client, path=path, update_commits=True)
        run.to_yaml()
        client.add_to_activity_index(run)

def from_cwl(cls, data, __reference__=None):
    """Return an instance from CWL data."""
    class_name = data.get('class', None)
    cls = cls.registry.get(class_name, cls)

    if __reference__:
        with with_reference(__reference__):
            self = cls(
                **{k: v for k, v in iteritems(data) if k != 'class'})
    else:
        self = cls(**{k: v for k, v in iteritems(data) if k != 'class'})
    return self

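# Minimal sketch of the ``class``-based dispatch above, using hypothetical CWL
# data (the owning class name is an assumption): the ``class`` key selects a
# registered subclass from ``cls.registry`` and is dropped from the
# constructor arguments.
#
#     data = {'class': 'CommandLineTool', 'baseCommand': ['echo']}
#     tool = CWLClass.from_cwl(data)  # dispatches to the registered subclass
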
def create_dataset(self, name, short_name=None, description='', creators=None):
    """Create a dataset."""
    if not name:
        raise errors.ParameterError('Dataset name must be provided.')

    if not short_name:
        short_name = generate_default_short_name(name, None)

    if not is_dataset_name_valid(short_name):
        raise errors.ParameterError(
            'Dataset name "{}" is not valid.'.format(short_name))

    if self.load_dataset(name=short_name):
        raise errors.DatasetExistsError(
            'Dataset exists: "{}".'.format(short_name))

    identifier = str(uuid.uuid4())
    path = self.renku_datasets_path / identifier / self.METADATA
    if path.exists():
        raise errors.DatasetExistsError(
            'Dataset with reference {} exists'.format(path))

    path.parent.mkdir(parents=True, exist_ok=True)

    if creators is None:
        creators = [Person.from_git(self.repo)]

    with with_reference(path):
        dataset = Dataset(
            client=self,
            identifier=identifier,
            name=name,
            short_name=short_name,
            description=description,
            creator=creators,
        )

    dataset_ref = LinkReference.create(client=self, name='datasets/' + short_name)
    dataset_ref.set_reference(path)
    dataset.to_yaml()

    return dataset, path, dataset_ref

def _migrate_composite_step(client, workflow, path, commit=None):
    """Migrate a composite workflow."""
    if not commit:
        commit = client.find_previous_commit(path)
    run = Run(client=client, path=path, commit=commit)
    rel_path = Path(path).relative_to(client.path)
    label = f"{rel_path}@{commit.hexsha}"
    identifier = sha1(label.encode("utf-8")).hexdigest()
    run._id = Run.generate_id(client, identifier=identifier)
    name = "{0}_migrated.yaml".format(uuid.uuid4().hex)
    run.path = (client.workflow_path / name).relative_to(client.path)

    for step in workflow.steps:
        if isinstance(step.run, dict):
            continue

        path = client.workflow_path / step.run
        subrun = parse_cwl_cached(str(path))

        subprocess, _ = _migrate_single_step(client, subrun, path, parent_commit=commit)
        run.add_subprocess(subprocess)

    with with_reference(run.path):
        wf = WorkflowRun.from_run(run, client, run.path, commit=commit)

    # HACK: This fixes broken SoftwareAgent due to rebases done by users
    if isinstance(wf.association.agent, Person) or not wf.association.agent.label.startswith(
        "renku "
    ):
        wf.association.agent = default_missing_software_agent
    for p in wf._processes:
        if isinstance(p.association.agent, Person) or not p.association.agent.label.startswith(
            "renku "
        ):
            p.association.agent = default_missing_software_agent
    wf.to_yaml()
    client.add_to_activity_index(wf)

    return wf, run.path

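# Worked example of the deterministic-id scheme above (values hypothetical):
# the identifier is the SHA-1 of "<repo-relative path>@<commit sha>", so
# re-running the migration against the same commit reproduces the same run id
# instead of minting a duplicate.
#
#     from hashlib import sha1
#     sha1(b'path/to/workflow.cwl@f3b4a6d').hexdigest()
#     # same path + same commit -> same digest, every time
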
def rerun(client, revision, roots, siblings, inputs, paths):
    """Recreate files generated by a sequence of ``run`` commands."""
    graph = Graph(client)
    outputs = graph.build(paths=paths, revision=revision)

    # Check or extend siblings of outputs.
    outputs = siblings(graph, outputs)
    output_paths = {node.path for node in outputs}

    # Normalize and check all starting paths.
    roots = {graph.normalize_path(root) for root in roots}
    output_paths -= roots
    outputs = [o for o in outputs if o.path not in roots]

    # Generate workflow and check inputs.
    # NOTE The workflow creation is done before opening a new file.
    workflow = inputs(
        client,
        graph.as_workflow(
            input_paths=roots,
            output_paths=output_paths,
            outputs=outputs,
        )
    )

    wf, path = CWLConverter.convert(workflow, client)

    # Don't compute paths if storage is disabled.
    if client.check_external_storage():
        # Make sure all inputs are pulled from a storage.
        paths_ = (i.consumes.path for i in workflow.inputs)
        client.pull_paths_from_storage(*paths_)

    # Execute the workflow and relocate all output files.
    # FIXME get new output paths for edited tools
    # output_paths = {path for _, path in workflow.iter_output_files()}
    execute(
        client,
        path,
        output_paths=output_paths,
    )

    paths = [o.produces.path for o in workflow.outputs]

    client.repo.git.add(*paths)

    if client.repo.is_dirty():
        commit_msg = ('renku rerun: '
                      'committing {} newly added files').format(len(paths))

        committer = Actor('renku {0}'.format(__version__), version_url)

        client.repo.index.commit(
            commit_msg,
            committer=committer,
            skip_hooks=True,
        )

    workflow_name = '{0}_rerun.yaml'.format(uuid.uuid4().hex)
    path = client.workflow_path / workflow_name

    workflow.update_id_and_label_from_commit_path(
        client, client.repo.head.commit, path
    )

    with with_reference(path):
        run = WorkflowRun.from_run(workflow, client, path)
        run.to_yaml()
        client.add_to_activity_index(run)

def _migrate_single_step(client, cmd_line_tool, path, commit=None, parent_commit=None, persist=False):
    """Migrate a single step workflow."""
    if not commit:
        commit = client.find_previous_commit(
            path, revision=parent_commit if parent_commit else "HEAD")

    run = Run(client=client, path=path, commit=commit)
    run.command = " ".join(cmd_line_tool.baseCommand)
    run.successcodes = cmd_line_tool.successCodes

    inputs = list(cmd_line_tool.inputs)
    outputs = list(cmd_line_tool.outputs)

    # NOTE: Make run ids deterministic to prevent duplication.
    rel_path = Path(path).relative_to(client.path)
    if parent_commit:
        label = f"{rel_path}@{parent_commit.hexsha}"
    else:
        label = f"{rel_path}@{commit.hexsha}"
    identifier = sha1(label.encode("utf-8")).hexdigest()

    base_id = Run.generate_id(client, identifier=identifier)
    run._id = base_id

    if cmd_line_tool.stdin:
        name = cmd_line_tool.stdin.split(".")[1]

        if name.endswith(")"):
            name = name[:-1]

        matched_input = next(i for i in inputs if i.id == name)
        inputs.remove(matched_input)

        path = client.workflow_path / Path(matched_input.default["path"])
        stdin = path.resolve().relative_to(client.path)
        id_ = CommandInput.generate_id(base_id, "stdin")

        run.inputs.append(
            CommandInput(
                id=id_,
                consumes=_entity_from_path(client, stdin, commit),
                mapped_to=MappedIOStream(client=client, stream_type="stdin"),
            ))

    if cmd_line_tool.stdout:
        run.outputs.append(
            CommandOutput(
                id=CommandOutput.generate_id(base_id, "stdout"),
                produces=_entity_from_path(client, cmd_line_tool.stdout, commit),
                mapped_to=MappedIOStream(client=client, stream_type="stdout"),
                create_folder=False,
            ))

        # ``None`` default: a tool may lack an explicit output_stdout entry,
        # and a bare ``next()`` would raise StopIteration here.
        matched_output = next((o for o in outputs if o.id == "output_stdout"), None)

        if matched_output:
            outputs.remove(matched_output)

    if cmd_line_tool.stderr:
        run.outputs.append(
            CommandOutput(
                id=CommandOutput.generate_id(base_id, "stderr"),
                produces=_entity_from_path(client, cmd_line_tool.stderr, commit),
                mapped_to=MappedIOStream(client=client, stream_type="stderr"),
                create_folder=False,
            ))

        matched_output = next((o for o in outputs if o.id == "output_stderr"), None)

        if matched_output:
            outputs.remove(matched_output)

    created_outputs = []
    workdir_requirements = [
        r for r in cmd_line_tool.requirements
        if isinstance(r, InitialWorkDirRequirement)
    ]

    for r in workdir_requirements:
        for listing in r.listing:
            if listing.entry == '$({"listing": [], "class": "Directory"})':
                created_outputs.append(listing.entryname)

    for o in outputs:
        prefix = None
        position = None

        if o.outputBinding.glob.startswith("$(inputs."):
            name = o.outputBinding.glob.split(".")[1]

            if name.endswith(")"):
                name = name[:-1]

            matched_input = next(i for i in inputs if i.id == name)
            inputs.remove(matched_input)

            if isinstance(matched_input.default, dict):
                path = client.workflow_path / Path(
                    matched_input.default["path"])
            else:
                path = Path(matched_input.default)

            path = Path(os.path.abspath(client.path / path)).relative_to(
                client.path)

            if matched_input.inputBinding:
                prefix = matched_input.inputBinding.prefix
                position = matched_input.inputBinding.position

                if prefix and matched_input.inputBinding.separate:
                    prefix += " "
        else:
            path = Path(o.outputBinding.glob)

        create_folder = False

        check_path = path
        if not (client.path / path).is_dir():
            check_path = path.parent

        # Compare as strings: ``check_path`` is a ``Path`` and never equals
        # the string "." directly.
        if str(check_path) != "." and str(check_path) in created_outputs:
            create_folder = True

        run.outputs.append(
            CommandOutput(
                id=CommandOutput.generate_id(base_id, position),
                position=position,
                prefix=prefix,
                produces=_entity_from_path(client, path, commit),
                create_folder=create_folder,
            ))

    for i in inputs:
        prefix = None
        position = None

        if i.inputBinding:
            prefix = i.inputBinding.prefix
            position = i.inputBinding.position

            if prefix and i.inputBinding.separate:
                prefix += " "

        if isinstance(i.default, dict) and "class" in i.default and i.default["class"] in [
            "File", "Directory"
        ]:
            path = client.workflow_path / Path(i.default["path"])
            path = Path(os.path.abspath(path)).relative_to(client.path)

            run.inputs.append(
                CommandInput(
                    id=CommandInput.generate_id(base_id, position),
                    position=position,
                    prefix=prefix,
                    consumes=_entity_from_path(client, path, commit),
                ))
        else:
            run.arguments.append(
                CommandArgument(
                    id=CommandArgument.generate_id(base_id, position),
                    position=position,
                    prefix=prefix,
                    value=str(i.default),
                ))

    for a in cmd_line_tool.arguments:
        id_ = CommandArgument.generate_id(base_id, a["position"])
        run.arguments.append(
            CommandArgument(id=id_, position=a["position"], value=a["valueFrom"]))

    if not persist:
        return run, None

    step_name = "{0}_{1}.yaml".format(
        uuid.uuid4().hex,
        secure_filename("_".join(cmd_line_tool.baseCommand)),
    )

    absolute_path = client.workflow_path / step_name
    path = absolute_path.relative_to(client.path)

    with with_reference(absolute_path):
        run.path = path
        process_run = ProcessRun.from_run(run, client, path, commit=commit)
        process_run.invalidated = _invalidations_from_commit(client, commit)

        # HACK: This fixes broken SoftwareAgent due to rebases done by users
        if isinstance(
            process_run.association.agent, Person
        ) or not process_run.association.agent.label.startswith("renku "):
            process_run.association.agent = default_missing_software_agent
        process_run.to_yaml()
        client.add_to_activity_index(process_run)
        return process_run, absolute_path

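# Worked example of the ``$(inputs.<name>)`` parsing used above. For a CWL
# field such as ``stdin: $(inputs.input_3)``:
#
#     "$(inputs.input_3)".split(".")[1]  # -> "input_3)"
#
# and the trailing ")" is stripped, leaving the input id "input_3". For the
# three-part form ``$(inputs.input_3.path)``, ``split(".")[1]`` is already
# "input_3", which is why the ``endswith(")")`` check is conditional.
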
def from_jsonld(
    cls,
    data,
    client=None,
    commit=None,
    __reference__=None,
    __source__=None,
):
    """Instantiate a JSON-LD class from data."""
    if isinstance(data, cls):
        return data

    if not isinstance(data, dict):
        raise ValueError(data)

    if '@type' in data:
        # @type could be a string or a list - make sure it is a list
        type_ = data['@type']
        if not isinstance(type_, list):
            type_ = [type_]

        # If a json-ld class has multiple types, they are in a
        # sorted tuple. This is used as the key for the class
        # registry, so we have to match it here.
        type_ = tuple(sorted(type_))
        if type_ in cls.__type_registry__ and getattr(
            cls, '_jsonld_type', None
        ) != type_:
            new_cls = cls.__type_registry__[type_]
            if cls != new_cls:
                return new_cls.from_jsonld(
                    data, client=client, commit=commit
                )

    if cls._jsonld_translate:
        # perform the translation
        data = pyld.jsonld.compact(data, cls._jsonld_translate)
        # compact using the class json-ld context
        data.pop('@context', None)
        data = pyld.jsonld.compact(data, cls._jsonld_context)

    data.setdefault('@context', cls._jsonld_context)

    if data['@context'] != cls._jsonld_context:
        # merge new context into old context to prevent properties
        # getting lost in jsonld expansion
        if isinstance(data['@context'], str):
            data['@context'] = {'@base': data['@context']}
        data['@context'].update(cls._jsonld_context)
        try:
            compacted = pyld.jsonld.compact(data, cls._jsonld_context)
        except Exception:
            compacted = data
    else:
        compacted = data

    fields = cls._jsonld_fields

    data_ = {}
    # `client` and `commit` are passed in optionally for some classes
    # They might be unset if the metadata is used to instantiate
    # an object outside of a repo/client context.
    if client:
        data_['client'] = client
    if commit:
        data_['commit'] = commit

    for k, v in compacted.items():
        if k in fields:
            no_value_context = isinstance(v, dict) and '@context' not in v
            has_nested_context = (
                k in compacted['@context'] and
                '@context' in compacted['@context'][k]
            )
            if no_value_context and has_nested_context:
                # Propagate down context
                v['@context'] = compacted['@context'][k]['@context']

            data_[k.lstrip('_')] = v

    if __reference__:
        with with_reference(__reference__):
            self = cls(**data_)
    else:
        self = cls(**data_)

    if __source__:
        setattr(self, '__source__', __source__)

    return self

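# Illustrative sketch of the ``@type`` dispatch above, with hypothetical data
# and class names: a payload whose sorted ``@type`` tuple matches an entry in
# ``__type_registry__`` is re-dispatched to that registered subclass.
#
#     data = {
#         '@type': ['prov:Entity'],
#         '@context': {'prov': 'http://www.w3.org/ns/prov#'},
#         'path': 'data/file.csv',
#     }
#     entity = Entity.from_jsonld(data, client=client, commit=commit)
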
def from_cwl(cls, data, __reference__=None):
    """Return an instance from CWL data."""
    exclude_properties = ['class', '$namespaces', '@reverse']
    class_name = data.get('class', None)
    cls = cls.registry.get(class_name, cls)

    if '$namespaces' in data:
        # handle custom metadata
        keys = data.keys()
        metadata_keys = [(k, False) for k in keys if ':' in k]
        if '@reverse' in keys:
            metadata_keys.extend(
                (k, True) for k in data['@reverse'].keys() if ':' in k)

        attrs = fields(cls)
        for a in attrs:
            # map custom metadata
            if 'cwl_metadata' not in a.metadata:
                continue

            metadata = a.metadata['cwl_metadata']
            k = (metadata.get('property'), metadata.get('reverse', False))
            if k not in metadata_keys:
                continue

            metadata_type = metadata.get('type')
            if not metadata_type:
                raise ValueError('CWL metadata type not specified')

            if metadata.get('reverse', False):
                metadata_value = data['@reverse'][metadata['property']]
            else:
                metadata_value = data[metadata['property']]
                exclude_properties.append(metadata['property'])

            if isinstance(metadata_value, list):
                data[a.name] = [
                    type_from_metadata(metadata_type, v)
                    for v in metadata_value
                ]
            else:
                data[a.name] = type_from_metadata(metadata_type, metadata_value)

    if __reference__:
        with with_reference(__reference__):
            self = cls(
                **{
                    k: v
                    for k, v in iteritems(data)
                    if k not in exclude_properties
                })
    else:
        self = cls(**{
            k: v
            for k, v in iteritems(data) if k not in exclude_properties
        })
    return self

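# Hypothetical sketch of how an attrs field opts into the ``$namespaces``
# handling above; the field name, property, and type below are invented for
# illustration and are not taken from renku's actual CWL classes:
#
#     _derived = attr.ib(
#         default=None,
#         metadata={'cwl_metadata': {
#             'property': 'prov:wasDerivedFrom',  # namespaced key in the CWL file
#             'reverse': False,                   # read from data, not data['@reverse']
#             'type': 'Entity',                   # resolved via type_from_metadata
#         }},
#     )
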
def _migrate_single_step(client, cmd_line_tool, path, commit=None, persist=False):
    """Migrate a single step workflow."""
    if not commit:
        commit = client.find_previous_commit(path)

    run = Run(client=client, path=path, commit=commit)
    run.command = ' '.join(cmd_line_tool.baseCommand)
    run.successcodes = cmd_line_tool.successCodes

    inputs = list(cmd_line_tool.inputs)
    outputs = list(cmd_line_tool.outputs)

    if cmd_line_tool.stdin:
        name = cmd_line_tool.stdin.split('.')[1]

        if name.endswith(')'):
            name = name[:-1]

        matched_input = next(i for i in inputs if i.id == name)
        inputs.remove(matched_input)

        path = client.workflow_path / Path(matched_input.default['path'])
        stdin = path.resolve().relative_to(client.path)
        run.inputs.append(
            CommandInput(consumes=_entity_from_path(client, stdin, commit),
                         mapped_to=MappedIOStream(stream_type='stdin')))

    if cmd_line_tool.stdout:
        run.outputs.append(
            CommandOutput(produces=_entity_from_path(client,
                                                     cmd_line_tool.stdout,
                                                     commit),
                          mapped_to=MappedIOStream(stream_type='stdout'),
                          create_folder=False))

        # ``None`` default: a tool may lack an explicit output_stdout entry,
        # and a bare ``next()`` would raise StopIteration here.
        matched_output = next((o for o in outputs if o.id == 'output_stdout'), None)

        if matched_output:
            outputs.remove(matched_output)

    if cmd_line_tool.stderr:
        run.outputs.append(
            CommandOutput(produces=_entity_from_path(client,
                                                     cmd_line_tool.stderr,
                                                     commit),
                          mapped_to=MappedIOStream(stream_type='stderr'),
                          create_folder=False))

        matched_output = next((o for o in outputs if o.id == 'output_stderr'), None)

        if matched_output:
            outputs.remove(matched_output)

    created_outputs = []
    workdir_requirements = [
        r for r in cmd_line_tool.requirements
        if isinstance(r, InitialWorkDirRequirement)
    ]

    for r in workdir_requirements:
        for listing in r.listing:
            if listing.entry == '$({"listing": [], "class": "Directory"})':
                created_outputs.append(listing.entryname)

    for o in outputs:
        prefix = None
        position = None

        if o.outputBinding.glob.startswith('$(inputs.'):
            name = o.outputBinding.glob.split('.')[1]

            if name.endswith(')'):
                name = name[:-1]

            matched_input = next(i for i in inputs if i.id == name)
            inputs.remove(matched_input)

            if isinstance(matched_input.default, dict):
                path = client.workflow_path / Path(
                    matched_input.default['path'])
            else:
                path = Path(matched_input.default)

            path = Path(os.path.abspath(path)).relative_to(client.path)

            if matched_input.inputBinding:
                prefix = matched_input.inputBinding.prefix
                position = matched_input.inputBinding.position

                if prefix and matched_input.inputBinding.separate:
                    prefix += ' '
        else:
            path = Path(o.outputBinding.glob)

        create_folder = False

        check_path = path
        if not (client.path / path).is_dir():
            check_path = path.parent

        # Compare as strings: ``check_path`` is a ``Path`` and never equals
        # the string '.' directly.
        if str(check_path) != '.' and str(check_path) in created_outputs:
            create_folder = True

        run.outputs.append(
            CommandOutput(position=position,
                          prefix=prefix,
                          produces=_entity_from_path(client, path, commit),
                          create_folder=create_folder))

    for i in inputs:
        prefix = None
        position = None

        if i.inputBinding:
            prefix = i.inputBinding.prefix
            position = i.inputBinding.position

            if prefix and i.inputBinding.separate:
                prefix += ' '

        if (isinstance(i.default, dict) and 'class' in i.default
                and i.default['class'] in ['File', 'Directory']):
            path = client.workflow_path / Path(i.default['path'])
            path = Path(os.path.abspath(path)).relative_to(client.path)

            run.inputs.append(
                CommandInput(position=position,
                             prefix=prefix,
                             consumes=_entity_from_path(client, path,
                                                        commit)))
        else:
            run.arguments.append(
                CommandArgument(position=position,
                                prefix=prefix,
                                value=str(i.default)))

    for a in cmd_line_tool.arguments:
        run.arguments.append(
            CommandArgument(position=a['position'], value=a['valueFrom']))

    if not persist:
        return run, None

    step_name = '{0}_{1}.yaml'.format(
        uuid.uuid4().hex,
        secure_filename('_'.join(cmd_line_tool.baseCommand)),
    )

    path = (client.workflow_path / step_name).relative_to(client.path)

    with with_reference(path):
        run.path = path
        process_run = ProcessRun.from_run(run, client, path, commit=commit)
        process_run.invalidated = _invalidations_from_commit(client, commit)
        process_run.to_yaml()
        client.add_to_activity_index(process_run)
        return process_run, path
