def find_explicit_inputs(self):
    """Yield explicit inputs and command line input bindings if any."""
    input_paths = [
        input.default.path for input in self.inputs
        if input.type in PATH_OBJECTS
    ]
    input_id = len(self.inputs) + len(self.arguments)

    for explicit_input in self.explicit_inputs:
        if explicit_input in input_paths:
            continue

        try:
            explicit_input.relative_to(self.working_dir)
        except ValueError:
            raise errors.UsageError(
                "The input file or directory is not in the repository."
                "\n\n\t" + click.style(str(explicit_input), fg="yellow") +
                "\n\n")
        if self.is_existing_path(explicit_input) is None:
            raise errors.UsageError(
                "The input file or directory does not exist."
                "\n\n\t" + click.style(str(explicit_input), fg="yellow") +
                "\n\n")
        input_id += 1
        default, type, _ = self.guess_type(explicit_input)
        # Explicit inputs are either File or Directory
        assert type in PATH_OBJECTS
        # The inputBinding is None because these inputs won't
        # appear on the command line
        yield CommandInputParameter(
            id="input_{0}".format(input_id),
            type=type,
            default=default,
            inputBinding=None)
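
# Illustrative consumption (commented out because it needs a full
# CommandLineToolFactory instance; `factory` and its attributes are
# assumptions based on the code above, not part of the original module):
#
#     factory.inputs.extend(factory.find_explicit_inputs())
#
# Every yielded CommandInputParameter carries inputBinding=None, so explicit
# inputs are tracked for lineage without ever appearing on the command line.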
def config(key, value, remove, local_only, global_only):
    """Manage configuration options."""
    is_write = value is not None

    if is_write and remove:
        raise errors.UsageError('Cannot remove and set at the same time.')
    if remove and not key:
        raise errors.UsageError('KEY is missing.')
    if local_only and global_only:
        raise errors.UsageError('Cannot use --local and --global together.')

    if remove:
        update_config(key, remove=remove, global_only=global_only)
    elif is_write:
        update_config(key, value=value, global_only=global_only)
    else:
        value = read_config(key, local_only, global_only)
        click.secho(value)
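
# Illustrative invocations, assuming this function backs a `renku config`
# click command (the exact CLI wiring is not shown here):
#
#     renku config <key>             # read a value
#     renku config <key> <value>     # set a value
#     renku config <key> --remove    # remove a value
#
# Setting together with --remove, or --local together with --global, is
# rejected by the checks above.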
def add_data_to_dataset(self,
                        dataset,
                        urls,
                        force=False,
                        sources=(),
                        destination='',
                        ref=None,
                        link=False,
                        external=False,
                        extract=False,
                        all_at_once=False,
                        destination_names=None,
                        progress=None):
    """Import the data into the data directory."""
    warning_message = ''
    dataset_path = self.path / self.datadir / dataset.short_name

    destination = destination or Path('.')
    destination = self._resolve_path(dataset_path, destination)
    destination = self.path / dataset_path / destination

    files = []

    if all_at_once:  # Only for URLs
        files = self._add_from_urls(
            dataset=dataset,
            urls=urls,
            destination_names=destination_names,
            destination=destination,
            extract=extract,
            progress=progress)
    else:
        for url in urls:
            is_remote, is_git = _check_url(url)

            if is_git and is_remote:  # Remote git repo
                sources = sources or ()
                new_files = self._add_from_git(dataset, url, sources,
                                               destination, ref)
            else:
                if sources:
                    raise errors.UsageError(
                        'Cannot use "--source" with URLs or local files.')

                if not is_remote:  # Local path, might be git
                    if is_git:
                        warning_message = (
                            'Adding data from local Git repository. Use '
                            "remote's Git URL instead to enable lineage "
                            'information and updates.')
                    u = parse.urlparse(url)
                    new_files = self._add_from_local(
                        dataset, u.path, link, external, destination)
                else:  # Remote URL
                    new_files = self._add_from_url(
                        dataset, url, destination, extract,
                        progress=progress)

            files.extend(new_files)

    files_to_commit = {f['path'] for f in files if f['path']}
    ignored = self.find_ignored_paths(*files_to_commit)

    if not force:
        if ignored:
            raise errors.IgnoredFiles(ignored)
        if dataset.contains_any(files):
            raise errors.DatasetFileExists()

    # All files at this point can be force-added and overwritten
    for data in files:
        operation = data.pop('operation', None)
        if not operation:
            continue

        src, dst, action = operation

        # Remove existing file if any
        self.remove_file(dst)
        dst.parent.mkdir(parents=True, exist_ok=True)

        if action == 'copy':
            shutil.copy(src, dst)
        elif action == 'link':
            try:
                os.link(src, dst)
            except Exception as e:
                raise errors.OperationError(
                    'Could not create hard link. Retry without "--link."'
                ) from e
        elif action == 'symlink':
            self._create_external_file(src, dst)
            data['external'] = True

    # Track non-symlinks in LFS
    self.track_paths_in_storage(*files_to_commit)

    # Force-add to include possible ignored files
    self.repo.git.add(*files_to_commit, force=True)
    self.repo.git.add(self.renku_pointers_path, force=True)

    staged_files = self.repo.index.diff('HEAD')
    if staged_files:
        msg = 'renku dataset: committing {} newly added files'.format(
            len(files_to_commit))
        self.repo.index.commit(msg)

    # Generate the DatasetFiles
    dataset_files = []
    for data in files:
        if os.path.basename(str(data['path'])) == '.git':
            continue

        dataset_file = DatasetFile.from_revision(self, **data)

        # Set dataset file path relative to root for submodules
        if dataset_file.client != self:
            dataset_file.path = str(data['path'])
        dataset_files.append(dataset_file)

    dataset.update_files(dataset_files)
    return warning_message
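
# A minimal standalone sketch of the 'link' action above, using only the
# standard library. The real code raises errors.OperationError; RuntimeError
# stands in here so the sketch is self-contained.

import os


def hard_link_or_fail(src, dst):
    """Hard-link `dst` to `src`, mirroring the "--link" code path above."""
    try:
        os.link(src, dst)
    except OSError as e:
        # Hard links fail across devices or on unsupported filesystems.
        raise RuntimeError(
            'Could not create hard link. Retry without "--link".') from e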
def validate_path(self, attribute, value):
    """Path must exist."""
    if not value.exists():
        raise errors.UsageError('Directory must exist.')


def validate_command_line(self, attribute, value):
    """Check the command line structure."""
    if not value:
        raise errors.UsageError('Command line cannot be empty.')
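
# The (self, attribute, value) signatures above match attrs' method-style
# validators. A minimal, self-contained sketch of that wiring (class and
# field names are assumptions; ValueError stands in for errors.UsageError):

from pathlib import Path

import attr


@attr.s
class _FactorySketch:
    """Hypothetical holder for a command line and a working directory."""

    command_line = attr.ib()
    directory = attr.ib()

    @command_line.validator
    def validate_command_line(self, attribute, value):
        """Check the command line structure."""
        if not value:
            raise ValueError('Command line cannot be empty.')

    @directory.validator
    def validate_path(self, attribute, value):
        """Path must exist."""
        if not value.exists():
            raise ValueError('Directory must exist.')


# Example: _FactorySketch(command_line=['echo', 'hi'], directory=Path('.'))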
def run(
    client,
    explicit_inputs,
    explicit_outputs,
    no_output,
    no_input_detection,
    no_output_detection,
    success_codes,
    isolation,
    command_line,
):
    """Track work on a specific problem."""
    paths = explicit_outputs if no_output_detection else client.candidate_paths
    mapped_std = get_mapped_std_streams(paths, streams=("stdout", "stderr"))

    paths = explicit_inputs if no_input_detection else client.candidate_paths
    mapped_std_in = get_mapped_std_streams(paths, streams=("stdin", ))
    mapped_std.update(mapped_std_in)

    invalid = get_mapped_std_streams(explicit_inputs,
                                     streams=("stdout", "stderr"))
    if invalid:
        raise errors.UsageError(
            "Explicit input file cannot be used as stdout/stderr:"
            "\n\t" +
            click.style("\n\t".join(invalid.values()), fg="yellow") + "\n")

    invalid = get_mapped_std_streams(explicit_outputs, streams=("stdin", ))
    if invalid:
        raise errors.UsageError(
            "Explicit output file cannot be used as stdin:"
            "\n\t" +
            click.style("\n\t".join(invalid.values()), fg="yellow") + "\n")

    system_stdout = None
    system_stderr = None

    # /dev/tty is a virtual device that points to the terminal
    # of the currently executed process
    try:
        with open("/dev/tty", "w"):
            tty_exists = True
    except OSError:
        tty_exists = False

    try:
        stdout_redirected = "stdout" in mapped_std
        stderr_redirected = "stderr" in mapped_std

        if tty_exists:
            # If renku was called with redirected stdout/stderr, undo the
            # redirection here so error messages can be printed normally
            if stdout_redirected:
                system_stdout = open("/dev/tty", "w")
                old_stdout = sys.stdout
                sys.stdout = system_stdout

            if stderr_redirected:
                system_stderr = open("/dev/tty", "w")
                old_stderr = sys.stderr
                sys.stderr = system_stderr

        working_dir = client.repo.working_dir
        factory = CommandLineToolFactory(
            command_line=command_line,
            explicit_inputs=explicit_inputs,
            explicit_outputs=explicit_outputs,
            directory=os.getcwd(),
            working_dir=working_dir,
            no_input_detection=no_input_detection,
            no_output_detection=no_output_detection,
            successCodes=success_codes,
            **{
                name: os.path.relpath(path, working_dir)
                for name, path in mapped_std.items()
            },
        )
        with client.with_workflow_storage() as wf:
            with factory.watch(client, no_output=no_output) as tool:
                # Don't compute paths if storage is disabled.
                if client.check_external_storage():
                    # Make sure all inputs are pulled from a storage.
                    paths_ = (path for _, path in tool.iter_input_files(
                        client.workflow_path))
                    client.pull_paths_from_storage(*paths_)

                if tty_exists:
                    # Apply original output redirection
                    if stdout_redirected:
                        sys.stdout = old_stdout
                    if stderr_redirected:
                        sys.stderr = old_stderr

                return_code = call(
                    factory.command_line,
                    cwd=os.getcwd(),
                    **{key: getattr(sys, key)
                       for key in mapped_std.keys()},
                )

                sys.stdout.flush()
                sys.stderr.flush()

                if tty_exists:
                    # Change back to /dev/tty redirection
                    if stdout_redirected:
                        sys.stdout = system_stdout
                    if stderr_redirected:
                        sys.stderr = system_stderr

                if return_code not in (success_codes or {0}):
                    raise errors.InvalidSuccessCode(
                        return_code, success_codes=success_codes)

                wf.add_step(run=tool)

        if factory.messages:
            click.echo(factory.messages)

        if factory.warnings:
            click.echo(factory.warnings)
    finally:
        if system_stdout:
            sys.stdout = old_stdout
            system_stdout.close()
        if system_stderr:
            sys.stderr = old_stderr
            system_stderr.close()
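
# A standalone sketch of the /dev/tty save-and-restore pattern used above:
# when stdout is redirected into the tool's outputs, messages meant for the
# user are written to the controlling terminal instead, and the original
# stream is restored afterwards. POSIX-only (/dev/tty does not exist on
# Windows); the function name is an assumption, not part of the original.

import sys


def print_to_terminal(message):
    """Write `message` to the controlling terminal, bypassing redirection."""
    old_stdout = sys.stdout
    try:
        with open("/dev/tty", "w") as tty:
            sys.stdout = tty
            print(message)
    except OSError:
        # No controlling terminal (e.g. running under cron); fall back.
        print(message, file=old_stdout)
    finally:
        sys.stdout = old_stdout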
def add_data_to_dataset(self,
                        dataset,
                        urls,
                        force=False,
                        overwrite=False,
                        sources=(),
                        destination='',
                        ref=None,
                        external=False,
                        extract=False,
                        all_at_once=False,
                        destination_names=None,
                        progress=None):
    """Import the data into the data directory."""
    messages = []
    warning_messages = []
    dataset_datadir = self.path / dataset.data_dir

    destination = destination or Path('.')
    destination = self._resolve_path(dataset_datadir, destination)
    destination = self.path / dataset_datadir / destination

    if destination.exists() and not destination.is_dir():
        raise errors.ParameterError(
            f'Destination is not a directory: "{destination}"')

    self.check_external_storage()

    files = []
    if all_at_once:  # Importing a dataset
        files = self._add_from_urls(
            dataset=dataset,
            urls=urls,
            destination_names=destination_names,
            destination=destination,
            extract=extract,
            progress=progress)
    else:
        for url in urls:
            is_remote, is_git = _check_url(url)

            if is_git and is_remote:  # Remote git repo
                sources = sources or ()
                new_files = self._add_from_git(
                    dataset=dataset,
                    url=url,
                    sources=sources,
                    destination=destination,
                    ref=ref)
            else:
                if sources:
                    raise errors.UsageError(
                        'Cannot use "--source" with URLs or local files.')

                if not is_remote:  # Local path, might be git
                    if is_git:
                        warning_messages.append(
                            'Adding data from local Git repository: Use '
                            "remote's Git URL instead to enable lineage "
                            'information and updates.')
                    u = parse.urlparse(url)
                    new_files = self._add_from_local(
                        dataset=dataset,
                        path=u.path,
                        external=external,
                        destination=destination)
                else:  # Remote URL
                    new_files = self._add_from_url(
                        dataset=dataset,
                        url=url,
                        destination=destination,
                        extract=extract,
                        progress=progress)

            files.extend(new_files)

    # Remove all files that are under a .git directory
    paths_to_avoid = [
        f['path'] for f in files
        if '.git' in str(f['path']).split(os.path.sep)
    ]
    if paths_to_avoid:
        files = [f for f in files if f['path'] not in paths_to_avoid]
        warning_messages.append(
            'Ignored adding paths under a .git directory:\n ' +
            '\n '.join(str(p) for p in paths_to_avoid))

    files_to_commit = {str(self.path / f['path']) for f in files}

    if not force:
        ignored_files = self.find_ignored_paths(*files_to_commit)
        if ignored_files:
            ignored_files = set(ignored_files)
            files_to_commit = files_to_commit.difference(ignored_files)
            ignored_sources = []
            for file_ in files:
                if str(self.path / file_['path']) in ignored_files:
                    operation = file_.get('operation')
                    if operation:
                        src, _, _ = operation
                        ignored_sources.append(src)
                    else:
                        ignored_sources.append(file_['path'])

            files = [
                f for f in files
                if str(self.path / f['path']) in files_to_commit
            ]
            warning_messages.append(
                'These paths are ignored by one of your .gitignore '
                'files (use "--force" flag if you really want to add '
                'them):\n ' + '\n '.join([str(p) for p in ignored_sources]))

    # All files at this point can be force-added

    if not overwrite:
        existing_files = dataset.find_files(files_to_commit)
        if existing_files:
            files_to_commit = files_to_commit.difference(existing_files)
            files = [
                f for f in files
                if str(self.path / f['path']) in files_to_commit
            ]
            warning_messages.append(
                'These existing files were not overwritten '
                '(use "--overwrite" flag to overwrite them):\n ' +
                '\n '.join([str(p) for p in existing_files]))

    for data in files:
        operation = data.pop('operation', None)
        if not operation:
            continue

        src, dst, action = operation

        # Remove existing file if any
        self.remove_file(dst)
        dst.parent.mkdir(parents=True, exist_ok=True)

        if action == 'copy':
            shutil.copy(src, dst)
        elif action == 'move':
            shutil.move(src, dst, copy_function=shutil.copy)
        elif action == 'symlink':
            self._create_external_file(src, dst)
            data['external'] = True
        else:
            raise errors.OperationError(f'Invalid action {action}')

    # Track non-symlinks in LFS
    if self.check_external_storage():
        lfs_paths = self.track_paths_in_storage(*files_to_commit)
        show_message = self.get_value('renku', 'show_lfs_message')
        if lfs_paths and (show_message is None or show_message == 'True'):
            messages.append(
                'Adding these files to Git LFS:\n'
                '\t{}'.format('\n\t'.join(lfs_paths)) +
                '\nTo disable this message in the future, run:'
                '\n\trenku config show_lfs_message False')

    # Force-add to include possible ignored files
    self.repo.git.add(*files_to_commit, force=True)
    self.repo.git.add(self.renku_pointers_path, force=True)

    staged_files = self.repo.index.diff('HEAD')
    if staged_files:
        msg = 'renku dataset: committing {} newly added files'.format(
            len(files_to_commit))
        skip_hooks = not self.external_storage_requested
        self.repo.index.commit(msg, skip_hooks=skip_hooks)
    else:
        warning_messages.append('No file was added to project')

    # Generate the DatasetFiles
    dataset_files = []
    for data in files:
        dataset_file = DatasetFile.from_revision(self, **data)

        # Set dataset file path relative to root for submodules
        if dataset_file.client != self:
            dataset_file.path = str(data['path'])
        dataset_files.append(dataset_file)

    dataset.update_files(dataset_files)
    return warning_messages, messages
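
# Illustrative call (commented out; `client` is assumed to be a
# LocalClient-like object and `dataset` an existing Dataset — the URL is a
# placeholder, not a real endpoint):
#
#     warnings, msgs = client.add_data_to_dataset(
#         dataset,
#         urls=['https://example.com/data.csv'],
#         overwrite=True,
#     )
#     for line in warnings + msgs:
#         print(line)
#
# Note the contract: this variant returns (warning_messages, messages) as
# two lists, whereas the older variants above and below return a single
# warning string.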
def add_data_to_dataset(self,
                        dataset,
                        urls,
                        force=False,
                        sources=(),
                        destination='',
                        ref=None,
                        link=False,
                        extract=False,
                        all_at_once=False,
                        progress=None):
    """Import the data into the data directory."""
    warning_message = ''
    dataset_path = self.path / self.datadir / dataset.short_name

    destination = destination or Path('.')
    destination = self._resolve_path(dataset_path, destination)
    destination = self.path / dataset_path / destination

    files = []

    if all_at_once:  # Only for URLs
        files = self._add_from_urls(
            dataset=dataset,
            urls=urls,
            destination=destination,
            extract=extract,
            progress=progress)
    else:
        for url in urls:
            is_remote, is_git = _check_url(url)

            if is_git and is_remote:  # Remote git repo
                sources = sources or ()
                new_files = self._add_from_git(dataset, url, sources,
                                               destination, ref)
            else:
                if sources:
                    raise errors.UsageError(
                        'Cannot use "--source" with URLs or local files.')

                if not is_remote:  # Local path, might be git
                    if is_git:
                        warning_message = (
                            'Adding data from local Git repository. Use '
                            "remote's Git URL instead to enable lineage "
                            'information and updates.')
                    u = parse.urlparse(url)
                    new_files = self._add_from_local(
                        dataset, u.path, link, destination)
                else:  # Remote URL
                    new_files = self._add_from_url(
                        dataset, url, destination, extract)

            files.extend(new_files)

    self.track_paths_in_storage(*(f['path'] for f in files))

    ignored = self.find_ignored_paths(*(data['path']
                                        for data in files)) or []

    if ignored:
        if force:
            self.repo.git.add(*ignored, force=True)
        else:
            raise errors.IgnoredFiles(ignored)

    if dataset.contains_any(files) and force is False:
        raise errors.DatasetFileExists()

    # Commit all new data
    file_paths = {str(data['path']) for data in files if str(data['path'])}
    files_to_add = file_paths - set(ignored)

    self.repo.git.add(*files_to_add)

    if self.repo.is_dirty():
        commit_msg = ('renku dataset: '
                      'committing {} newly added files').format(
                          len(file_paths) + len(ignored))
        self.repo.index.commit(commit_msg)

    # Generate the DatasetFiles
    dataset_files = []
    for data in files:
        if os.path.basename(str(data['path'])) == '.git':
            continue

        dataset_file = DatasetFile.from_revision(self, **data)

        # Set dataset file path relative to root for submodules
        if dataset_file.client != self:
            dataset_file.path = str(data['path'])
        dataset_files.append(dataset_file)

    dataset.update_files(dataset_files)
    return warning_message
def guess_outputs(self, candidates):
    """Yield detected output and changed command input parameter."""
    # TODO what to do with duplicate paths & inputs with same defaults
    candidates = list(candidates)
    tree = DirectoryTree.from_list(candidates)

    input_candidates = {}
    conflicting_paths = {}

    for index, input in enumerate(self.inputs):
        # Convert input defaults to paths relative to working directory.
        if input.type not in PATH_OBJECTS:
            if self.no_input_detection:
                continue
            try:
                path = self.directory / str(input.default)
                input_path = Path(os.path.abspath(path)).relative_to(
                    self.working_dir)
            except FileNotFoundError:
                continue
        else:
            input_path = input.default.path.relative_to(self.working_dir)

        if input_path.is_dir() and tree.get(input_path):
            # The directory might exist before running the script
            subpaths = {
                str(input_path / path)
                for path in tree.get(input_path, default=[])
            }
            absolute_path = os.path.abspath(input_path)
            if Path(absolute_path) not in self.explicit_outputs:
                content = {
                    str(path)
                    for path in input_path.rglob("*")
                    if not path.is_dir() and path.name != ".gitkeep"
                }
                preexisting_paths = content - subpaths
                if preexisting_paths:
                    raise errors.InvalidOutputPath(
                        'The output directory "{0}" is not empty. \n\n'
                        "Delete existing files before running the "
                        "command:"
                        '\n  (use "git rm <file>..." to remove them '
                        "first)"
                        "\n\n".format(input_path) +
                        "\n".join("\t" + click.style(path, fg="yellow")
                                  for path in preexisting_paths) +
                        "\n\n"
                        "Once you have removed files that should be used "
                        "as outputs,\n"
                        "you can safely rerun the previous command.")

            # Remove files from the input directory
            candidates[:] = (path for path in candidates
                             if path not in subpaths)
            # Include input path in the candidates to check
            candidates.append(str(input_path))

            input_candidates[str(input_path)] = input
        elif input.type not in PATH_OBJECTS:
            # Input needs to be changed if an output is detected
            input_candidates[str(input_path)] = input
        else:
            # Names that cannot be outputs because they are already inputs
            conflicting_paths[str(input_path)] = input

    streams = {
        path
        for path in (getattr(self, name) for name in ("stdout", "stderr"))
        if path is not None
    }

    # TODO group by a common prefix

    for position, path in enumerate(candidates):
        candidate = self.is_existing_path(self.working_dir / path)

        if candidate is None:
            raise errors.UsageError(
                'Path "{0}" does not exist.'.format(path))

        glob = str(candidate.relative_to(self.working_dir))

        if glob in streams:
            continue

        new_input = None

        if glob in conflicting_paths:
            # It means that it is rewriting a file
            input = conflicting_paths[glob]
            new_input = attr.evolve(input, type="string", default=glob)
            input_candidates[glob] = new_input

            del conflicting_paths[glob]
            # TODO add warning ('Output already exists in inputs.')

        candidate_type = "Directory" if candidate.is_dir() else "File"

        if glob in input_candidates:
            input = input_candidates[glob]

            if new_input is None:
                new_input = input_candidates[glob] = attr.evolve(
                    input, type="string", default=glob)

            yield (
                CommandOutputParameter(
                    id="output_{0}".format(position),
                    type=candidate_type,
                    outputBinding=dict(
                        glob="$(inputs.{0})".format(input.id), ),
                ),
                new_input,
                glob,
            )
        else:
            yield (
                CommandOutputParameter(
                    id="output_{0}".format(position),
                    type=candidate_type,
                    outputBinding=dict(glob=glob, ),
                ),
                None,
                glob,
            )
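
# Illustrative consumption (commented out; `factory` is an assumed
# CommandLineToolFactory with populated inputs): each yielded item is an
# (output, new_input, glob) triple, where `new_input` is non-None when an
# existing input had to be converted to a plain string so that its path
# could be claimed as an output.
#
#     for output, new_input, glob in factory.guess_outputs(candidates):
#         factory.outputs.append(output)
#         if new_input is not None:
#             # Replace the matching entry in factory.inputs (sketch only)
#             factory.inputs = [
#                 new_input if i.id == new_input.id else i
#                 for i in factory.inputs
#             ]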