def push(self) -> None:
    """Push this resource's local files to its remote origin via rclone.

    Only runs when the resource's master copy is "local"; otherwise the push
    is skipped with a message to the user. Uses rclone "copy" (leaves extra
    remote files alone) or "sync" (mirrors the local directory) depending on
    self.sync_mode, and passes --size-only when configured.

    Raises ConfigurationError if the rclone invocation reports an error.
    """
    flags = ["--size-only"] if self.size_only else []
    flags.append("--verbose")
    if self.master != "local":
        # The local copy is not authoritative, so pushing would be wrong.
        # (Fixed typo: "Skiping" -> "Skipping".)
        click.echo("Skipping push of resource %s, master is %s" % (self.name, self.master))
        return
    if self.sync_mode == "copy":
        opname, op = "copy", self.rclone.copy
    else:
        opname, op = "sync", self.rclone.sync
    ret = op(self.local_path, self.remote_origin, flags=flags)
    if ret["code"] != 0:
        raise ConfigurationError(
            "rclone %s raised error %d: %s" % (opname, ret["code"], ret["error"]))
def validate_template(template):
    """Check that a results-directory template is well-formed.

    The template must begin with 'snapshots/' and may only reference
    variables listed in VALID_TEMPLATE_VARS. Raises ConfigurationError
    on any violation.
    """
    if not template.startswith("snapshots/"):
        raise ConfigurationError("Templates must start with 'snapshots/'")
    for match in TEMPLATE_VAR_RE.finditer(template):
        # Strip the surrounding delimiters to get the bare variable name.
        var_name = match.group(0)[1:-1]
        if var_name in VALID_TEMPLATE_VARS:
            continue
        raise ConfigurationError(
            "Unknown variable '%s' in results directory template '%s'"
            % (var_name, template))
def from_command_line(
    self,
    role,
    name,
    workspace,
    remote_origin,
    local_path,
    config,
    compute_hash,
    export,
    imported,
    master,
    sync_mode,
    size_only,
):
    """Create an RcloneResource from 'dws add rclone' command-line arguments.

    Validates the local path and the rclone remote, performs the initial copy
    from the remote, sets up the local path for hashing, and — when
    --imported was given — loads the exported lineage file and imports it
    into the workspace's lineage store.

    Raises ConfigurationError if prechecks fail, or if --imported was
    specified but the lineage file is missing, the workspace does not
    support lineage, or the lineage's resource name does not match.
    """
    rclone = self._add_prechecks(local_path, remote_origin, config)
    # Initial download of the remote contents into the local directory.
    self._copy_from_remote(name, local_path, remote_origin, rclone, master,
                           sync_mode, size_only, workspace.verbose)
    setup_path_for_hashes(role, name, workspace, local_path)
    if imported:
        # The exporting side wrote a lineage.json alongside the data.
        lineage_path = os.path.join(local_path, "lineage.json")
        if not os.path.exists(lineage_path):
            raise ConfigurationError(
                "--imported was specified, but missing exported lineage file %s"
                % lineage_path)
        if (not isinstance(workspace, SnapshotWorkspaceMixin)
                or not workspace.supports_lineage()):
            raise ConfigurationError(
                "--imported was specified, but this workspace does not support lineage"
            )
        with open(lineage_path, "r") as f:
            lineage_data = json.load(f)
        if lineage_data["resource_name"] != name:
            raise ConfigurationError(
                "Resource name in imported lineage '%s' does not match '%s'"
                % (lineage_data["resource_name"], name))
        cast(SnapshotWorkspaceMixin, workspace).get_lineage_store().import_lineage_file(
            name, lineage_data["lineages"])
    return RcloneResource(
        name,
        role,
        workspace,
        remote_origin,
        global_local_path=local_path,
        my_local_path=None,
        config=config,
        compute_hash=compute_hash,
        export=export,
        imported=imported,
        master=master,
        sync_mode=sync_mode,
        size_only=size_only,
    )
def validate_subpath_exists(self, subpath: str) -> None:
    """Verify that subpath exists in the bucket; raise ConfigurationError if not.

    When a snapshot is active, existence is checked against the snapshot's
    frozen filesystem view; otherwise against the live bucket filesystem.
    (Fixed grammar in the error message: "does not existing" -> "does not exist".)
    """
    if self.current_snapshot is not None:
        assert self.snapshot_fs is not None
        if not self.snapshot_fs.exists(subpath):
            raise ConfigurationError(
                f"Subpath {subpath} does not exist in bucket {self.bucket_name} as of snapshot {self.current_snapshot}"
            )
    elif not self.fs.exists(subpath):
        raise ConfigurationError(
            f"Subpath {subpath} does not currently exist in bucket {self.bucket_name}"
        )
def echo_git_status_for_user(cwd):
    """Run 'git status' in cwd, letting its output go directly to the user.

    Output is intentionally not captured: git prints straight to the user's
    terminal. Raises ConfigurationError if git is not installed or the
    status invocation fails. (Removed dead commented-out capture code.)
    """
    if GIT_EXE_PATH is None:
        raise ConfigurationError("git executable not found")
    cmd = [GIT_EXE_PATH, "status"]
    p = run(cmd, cwd=cwd, encoding="utf-8")
    if p.returncode != 0:
        raise ConfigurationError("Problem invoking %s status on %s" % (GIT_EXE_PATH, cwd))
def _add_prechecks(self, local_path, remote_path, config) -> RClone:
    """Validate local write access and the rclone remote before adding.

    Returns a configured RClone instance. Raises ConfigurationError if the
    local path exists but is not writable, or the remote name (the part of
    remote_path before ':') is unknown to rclone.
    """
    if os.path.exists(local_path) and not os.access(local_path, os.W_OK):
        raise ConfigurationError(local_path + " does not have write permission")
    rclone = RClone(cfgfile=config) if config else RClone()
    # remote_path looks like "remote_name:path" — validate the remote name.
    (remote_name, _) = remote_path.split(":")
    if remote_name not in rclone.listremotes():
        raise ConfigurationError("Remote '" + remote_name + "' not found by rclone")
    return rclone
def load_workspace(
        batch: bool, verbose: bool, parsed_uri: ParseResult) -> ws.Workspace:  # type: ignore
    """Instantiate a git-backend Workspace from a parsed workspace URI.

    Relative paths are resolved against the user's home/current directory.
    Raises ConfigurationError if the directory is missing or has not been
    initialized as a workspace (no .dataworkspace metadata directory).
    """
    path = parsed_uri.path
    if not isabs(path):
        path = abspath(expanduser(path))
    if not isdir(path):
        raise ConfigurationError("Workspace directory %s does not exist" % path)
    if not isdir(join(path, ".dataworkspace")):
        raise ConfigurationError(
            "Workspace directory %s does not correspond to an initialized git-backend workspace"
            % path)
    return Workspace(path, batch, verbose)
def lineage_graph_command(
    workspace: Workspace,
    output_file: str,
    resource_name: Optional[str],
    snapshot: Optional[str],
    format="html",
    width: int = 1024,
    height: int = 800,
) -> None:
    """Write a simplified lineage graph for a resource to output_file.

    workspace: must support snapshots and lineage.
    output_file: path where the rendered graph is written.
    resource_name: resource to graph; if None, defaults to the first
        RESULTS-role resource found in the workspace.
    snapshot: optional snapshot tag or hash; graphs lineage as of that
        snapshot rather than the current state.
    format/width/height: passed through to the graph renderer.

    Raises ConfigurationError if the workspace does not support snapshots or
    lineage, or if no suitable resource can be determined.
    """
    if not isinstance(workspace, SnapshotWorkspaceMixin):
        raise ConfigurationError(
            "Workspace %s does not support snapshots and lineage" % workspace.name)
    if not workspace.supports_lineage():
        raise ConfigurationError("Workspace %s does not support lineage" % workspace.name)
    store = workspace.get_lineage_store()
    snapshot_hash = None  # type: Optional[str]
    if snapshot is not None:
        # Accept either a tag or a hash; resolve to the canonical hash value.
        md = workspace.get_snapshot_by_tag_or_hash(snapshot)
        snapshot_hash = md.hashval
    if resource_name is not None:
        workspace.validate_resource_name(resource_name)
    else:
        # Default to the first results resource in the workspace.
        for r in workspace.get_resource_names():
            if workspace.get_resource_role(r) == ResourceRoles.RESULTS:
                resource_name = r
                break
        if resource_name is None:
            raise ConfigurationError(
                "Did not find a results resource in workspace. If you want to graph the lineage of a non-results resource, use the --resource option."
            )
    make_simplified_lineage_graph_for_resource(
        workspace.get_instance(),
        store,
        resource_name,
        output_file,
        snapshot_hash=snapshot_hash,
        format=format,
        width=width,
        height=height,
    )
    if snapshot is None:
        click.echo("Wrote lineage for %s to %s" % (resource_name, output_file))
    else:
        click.echo("Wrote lineage for %s as of snapshot %s to %s"
                   % (resource_name, snapshot, output_file))
def build_resource_list(
    workspace: Workspace, only: Optional[List[str]], skip: Optional[List[str]]
) -> List[str]:
    """Build up our resource name list for either push or pull commands.

    only: restrict the sync to these resource names (validated against the
        workspace's resources).
    skip: exclude these resource names (also validated).
    At most one of only/skip may be specified. Resources that have no local
    state are always skipped, with a notice echoed to the user.

    Returns the resource names to sync. Raises ConfigurationError for
    conflicting options or unknown resource names.
    """
    if (only is not None) and (skip is not None):
        raise ConfigurationError("Cannot specify both --only and --skip")
    all_resource_names_set = frozenset(workspace.get_resource_names())
    local_state_names_set = frozenset(workspace.get_names_of_resources_with_local_state())

    def echo_skipped(names):
        # Shared notice for all three branches (was duplicated three times).
        if len(names) > 0:
            click.echo(
                "Skipping the following resources, which do not have local state: %s"
                % ", ".join(sorted(names))
            )

    if only is not None:
        only_set = frozenset(only)
        invalid = only_set.difference(all_resource_names_set)
        if len(invalid) > 0:
            raise ConfigurationError(
                "Invalid resource names were included with --only: %s"
                % ", ".join(sorted(invalid))
            )
        echo_skipped(only_set.difference(local_state_names_set))
        return [rn for rn in only if rn in local_state_names_set]
    elif skip is not None:
        skip_set = frozenset(skip)
        invalid = skip_set.difference(all_resource_names_set)
        if len(invalid) > 0:
            raise ConfigurationError(
                "Invalid resource names were included with --skip: %s"
                % ", ".join(sorted(invalid))
            )
        nonsync_rnames = all_resource_names_set.difference(skip_set).difference(
            local_state_names_set
        )
        echo_skipped(nonsync_rnames)
        skip_set = skip_set.union(nonsync_rnames)
        return [rn for rn in workspace.get_resource_names() if rn not in skip_set]
    else:
        nonsync_rnames = all_resource_names_set.difference(local_state_names_set)
        echo_skipped(nonsync_rnames)
        return [rn for rn in workspace.get_resource_names() if rn not in nonsync_rnames]
def clone_scratch_directory(
    workspace_dir: str,
    global_params: Dict[str, Any],
    local_params: Dict[str, Any],
    batch: bool = False,
) -> str:
    """Set the scratch directory parameters for a cloned copy of the
    workspace, updating local_params if needed.

    Returns the absolute path for the scratch directory on this system.
    Raises ConfigurationError in batch mode when no in-workspace scratch
    directory is configured (we cannot prompt the user).
    """
    if SCRATCH_DIRECTORY in global_params:
        # Shared (in-workspace) scratch directory, stored relative to the workspace.
        return join(workspace_dir, global_params[SCRATCH_DIRECTORY])
    if batch:
        # TODO: come up with a standard way of handling this when called from
        # the API - either by letting the user provide values in advance or by
        # having some standard defaults.
        raise ConfigurationError(
            "Scratch directory was not within workspaces and we are running in batch mode. No way to ask user for location."
        )
    prompted = click.prompt(
        "Please specify a location for this workspace's scratch directory (must be outside of workspace)",
        type=LocalPathType(exists=False, must_be_outside_of_workspace=workspace_dir),
    )
    local_path = cast(str, prompted)
    local_params[LOCAL_SCRATCH_DIRECTORY] = local_path
    return local_path
def init_scratch_directory(
    scratch_dir: str,
    workspace_dir: str,
    global_params: Dict[str, Any],
    local_params: Dict[str, Any],
) -> Tuple[str, Optional[str]]:
    """Given the user-provided or default scratch directory, set the
    SCRATCH_DIRECTORY and LOCAL_SCRATCH_DIRECTORY parameters accordingly.
    Only one will be set, with preference to the global parameter, which is
    relative to the workspace.

    Returns a tuple of the absolute path and the gitignore entry (if any)
    for the scratch directory. Raises ConfigurationError if the scratch
    directory equals the workspace directory itself.
    """
    import os  # for os.sep in the boundary-aware prefix test below

    abs_scratch_dir = scratch_dir if isabs(scratch_dir) else abspath(expanduser(scratch_dir))
    scratch_dir_gitignore = None  # type: Optional[str]
    # Use a separator-aware prefix test: a plain startswith() would wrongly
    # treat e.g. /ws-backup as being inside /ws.
    inside_workspace = (abs_scratch_dir == workspace_dir
                        or abs_scratch_dir.startswith(
                            workspace_dir.rstrip(os.sep) + os.sep))
    if inside_workspace:
        rel_scratch_dir = get_subpath_from_absolute(workspace_dir, abs_scratch_dir)
        # Validate before mutating global_params (previously the None value
        # was stored first and then rejected).
        if rel_scratch_dir is None:
            raise ConfigurationError(
                "Scratch directory cannot be equal to workspace directory. "
                + "It should either be a subdirectory or completely outside it."
            )
        global_params[SCRATCH_DIRECTORY] = rel_scratch_dir  # always store a relative directory
        # The gitignore entry should start with / so it only matches the
        # exact path relative to the git repo root.
        if rel_scratch_dir.startswith("./"):
            scratch_dir_gitignore = rel_scratch_dir[1:]
        else:
            scratch_dir_gitignore = "/" + rel_scratch_dir
    else:
        local_params[LOCAL_SCRATCH_DIRECTORY] = abs_scratch_dir
    return (abs_scratch_dir, scratch_dir_gitignore)
def pull_command(
    workspace: Workspace,
    only: Optional[List[str]] = None,
    skip: Optional[List[str]] = None,
    only_workspace: bool = False,
) -> int:
    """Pull the latest state of the workspace and (optionally) its resources.

    Returns the number of resources that were pulled/cloned. Raises
    ConfigurationError for invalid option combinations and InternalError
    for unsupported workspace types.
    """
    if isinstance(workspace, SyncedWorkspaceMixin):
        # Sync the workspace itself first, then (optionally) its resources.
        click.echo("Syncing workspace")
        workspace = cast(Workspace, workspace.pull_workspace())
        rcount = 0 if only_workspace else _pull_and_clone_resources(workspace, only, skip)
    elif isinstance(workspace, CentralWorkspaceMixin):
        if only_workspace:
            raise ConfigurationError(
                "--only-workspace not valid for central workspace %s" % workspace.name)
        rcount = _pull_and_clone_resources(workspace, only, skip)
    else:
        raise InternalError(
            "Workspace %s is neither a SyncedWorkspaceMixin nor a CentralWorkspaceMixin"
            % workspace.name)
    workspace.save("Pull command")
    return rcount
def clone(self, params: JSONDict, workspace: Workspace) -> LocalStateResourceMixin:
    """Instantiate a resource that was created remotely. We need to verify
    that the local copy of the data exists -- we are not responsible for
    making certain it is in the correct place.
    """
    name = params["name"]
    # Check local_path, too, for backward compatibility with older metadata.
    global_local_path = (params["global_local_path"]
                         if "global_local_path" in params
                         else params["local_path"])  # type: str
    local_params = {}  # type: JSONDict
    if exists(global_local_path):
        local_path = global_local_path
    elif not workspace.batch:
        # Fixed garbled prompt text: "W\here" -> "Where".
        local_path = cast(
            str,
            click.prompt(
                "Local files resource '%s' was located at '%s' on the original system. Where is it located on this system?"
                % (name, global_local_path),
                type=LocalPathType(exists=True),
            ),
        )
        local_params["my_local_path"] = local_path
    else:
        raise ConfigurationError(
            "Local files resource %s is missing from %s." % (name, global_local_path))
    if not isinstance(workspace, git_backend.Workspace):
        # Non-git workspaces keep file hashes in a side directory.
        non_git_hashes = join(local_path, ".hashes")
        if not exists(non_git_hashes):
            os.mkdir(non_git_hashes)
    return self.from_json(params, local_params, workspace)
def _load_json_file(self, relative_path):
    """Load and parse the JSON metadata file at relative_path under the
    workspace directory. Raises ConfigurationError if the file is absent.
    """
    f_path = join(self.workspace_dir, relative_path)
    if not exists(f_path):
        raise ConfigurationError("Did not find workspace metadata file %s" % f_path)
    with open(f_path, "r") as f:
        return json.load(f)
def add_command(scheme: str, role: str, name: str, workspace: Workspace, *args):
    """Add a new resource to the workspace, choosing/validating its name.

    In batch mode, a name is suggested automatically when none is given and
    a duplicate name is an error; interactively, the user is prompted until
    a unique name is entered.

    Fixes: 'is None' instead of '== None'; user-facing typo
    "Successful added" -> "Successfully added".
    """
    current_names = set(workspace.get_resource_names())
    if workspace.batch:
        if name is None:
            name = workspace.suggest_resource_name(scheme, role, *args)
        elif name in current_names:
            raise ConfigurationError("Resource name '%s' already in use" % name)
    else:
        suggested_name = None
        while (name is None) or (name in current_names):
            if suggested_name is None:
                suggested_name = workspace.suggest_resource_name(scheme, role, *args)
            name = click.prompt(
                "Please enter a short, unique name for this resource",
                default=suggested_name)
            if name in current_names:
                click.echo("Resource name '%s' already in use." % name, err=True)
    workspace.add_resource(name, scheme, role, *args)
    workspace.save("add of %s" % name)
    click.echo("Successfully added resource '%s' to workspace." % name)
def deploy_run_command(workspace: Workspace, image_name: Optional[str],
                       no_mount_ssh_keys: bool) -> None:
    """Build argv for repo2docker and run the workspace's docker image.

    image_name: image to run; defaults to the workspace name.
    no_mount_ssh_keys: when False, the user's ~/.ssh is mounted into the
        container so ssh-based git remotes keep working.

    Raises ConfigurationError if repo2docker is not installed. Currently
    only implemented for git-backend workspaces (asserts otherwise).
    """
    try:
        from repo2docker.__main__ import make_r2d  # type: ignore
    except ImportError as e:
        raise ConfigurationError(R2D_IMPORT_ERROR) from e
    target_repo_dir = "/home/%s/%s" % (os.environ["USER"], workspace.name)
    if image_name is None:
        image_name = workspace.name
    argv = [
        "--target-repo-dir",
        target_repo_dir,
        "--image-name",
        image_name,
    ]
    if not no_mount_ssh_keys:
        # Mount the user's ssh keys into the container's home directory.
        dot_ssh = abspath(expanduser("~/.ssh"))
        argv.append("-v")
        argv.append("%s:/home/%s/.ssh" % (dot_ssh, os.environ["USER"]))
    if isinstance(workspace, git_backend.Workspace):
        workspace_dir = workspace.get_workspace_local_path_if_any()
        assert workspace_dir is not None
        # repo2docker clones via the dws+ form of the workspace's origin url.
        argv.append(
            "dws+" + get_remote_origin_url(workspace_dir, verbose=workspace.verbose))
    else:
        # need to figure out how the clone url works for a non-git workspace
        assert 0, "run build not yet implemented for non-git workspaces"
    if workspace.verbose:
        click.echo("Command args for repo2docker: %s" % repr(argv))
    r2d = make_r2d(argv=argv)
    r2d.initialize()
    r2d.run_image()
    click.echo("Run of image %s was successful." % image_name)
def __init__(self, cfgfile=None, cfgstring=None):
    """Wrapper around the rclone command-line tool.

    cfgfile: path to an rclone configuration file.
    cfgstring: literal configuration contents ("\\n" escapes are expanded).
    If neither is given, asks the installed rclone for its default config
    file location.

    Raises ConfigurationError if no configuration can be determined.
    """
    self.log = logging.getLogger("RClone")
    self._ensure_rclone_exists()
    self.cfgstring = ''
    self.cfgfile = None
    if cfgstring:
        # Allow callers to pass "\n" escapes in a single-line string.
        self.cfgstring = cfgstring.replace("\\n", "\n")
    elif cfgfile:
        # BUG FIX: previously assigned from cfgstring (None in this branch),
        # which raised AttributeError whenever only cfgfile was provided.
        self.cfgfile = cfgfile
    else:
        # Ask the rclone installation for its default config file.
        ret = self._execute(['rclone', 'config', 'file'])
        self.log.debug(ret)
        if ret['code'] == 0:
            # rclone config file output looks like:
            #
            # Configuration file is stored at:
            # filename
            #
            # so we take the second line.
            self.cfgfile = ret['out'].splitlines()[1].decode('utf_8')
        else:
            print(ret)
            raise ConfigurationError(
                "RClone requires either a configuration file or a configuration string"
            )
    assert (self.cfgstring or self.cfgfile
            ), 'Either a config string is given or a filename is given'
def get_scratch_directory(self) -> str:
    """Return the workspace's scratch directory path.

    Raises ConfigurationError when no scratch directory parameter has been
    configured for this workspace.
    """
    if self.scratch_dir is None:
        raise ConfigurationError(
            "Neither the %s nor %s parameters are set, so cannot find scratch directory. Please set one using 'dws config'."
            % (SCRATCH_DIRECTORY, LOCAL_SCRATCH_DIRECTORY))
    return self.scratch_dir
def read_results_file(self, subpath: str) -> JSONDict:
    """Read and parse json results data from the specified path in the
    resource. If the path does not exist or is not a file throw a
    ConfigurationError.
    """
    full_path = os.path.join(self.local_path, subpath)
    if not os.path.isfile(full_path):
        raise ConfigurationError(
            "subpath %s does not exist or is not a file in resource %s"
            % (subpath, self.name))
    with open(full_path, "r") as f:
        try:
            return json.load(f)
        except Exception as parse_error:
            # Surface parse problems as configuration errors, keeping the
            # original exception chained for debugging.
            raise ConfigurationError(
                "Parse error when reading %s in resource %s"
                % (subpath, self.name)) from parse_error
def setup_git_fat_for_repo(
    repo_dir: str,
    git_fat_remote: str,
    git_fat_user: Optional[str] = None,
    git_fat_port: Optional[int] = None,
    git_fat_attributes: Optional[str] = None,
    verbose: bool = False,
) -> None:
    """Setup git fat and all the associated configuration files for a repository.

    repo_dir: root of the git repository.
    git_fat_remote: rsync-style remote (HOSTNAME:/PATH) or a plain file path.
    git_fat_user / git_fat_port: optional ssh user/port for the remote.
    git_fat_attributes: comma-separated file extensions to manage via git-fat.

    Writes .gitfat (and .gitattributes when extensions are given), runs
    'git fat init', then adds and commits the configuration files. Raises
    ConfigurationError for an invalid remote address or username.
    """
    validate_git_fat_in_path()
    dot_git_fat_fpath = get_dot_gitfat_file_path(repo_dir)
    files_to_add = [
        ".gitfat",
    ]
    dot_git_attributes_fpath = None  # type: Optional[str]
    if git_fat_attributes:
        dot_git_attributes_fpath = join(repo_dir, ".gitattributes")
        files_to_add.append(".gitattributes")
    # The remote must be usable by rsync, which is git-fat's transport.
    if (RSYNC_RE.match(git_fat_remote) is None) and (FPATH_RE.match(git_fat_remote) is None):
        raise ConfigurationError(
            ("'%s' is not a valid remote address for rsync (used by git-fat). "
             + "Please use the format HOSTNAME:/PATH") % git_fat_remote)
    if git_fat_user is not None and USERNAME_RE.match(git_fat_user) is None:
        raise ConfigurationError(
            "'%s' is not a valid remote username for git-fat" % git_fat_user)
    import dataworkspaces.third_party.git_fat as git_fat
    # git-fat itself needs a python 2 interpreter to run.
    python2_exe = git_fat.find_python2_exe()
    # click.echo("Initializing git-fat with remote %s" % git_fat_remote)
    with open(dot_git_fat_fpath, "w") as f:
        f.write("[rsync]\nremote = %s\n" % git_fat_remote)
        if git_fat_user:
            f.write("sshuser = %s\n" % git_fat_user)
        if git_fat_port:
            f.write("sshport = %s\n" % git_fat_port)
    if git_fat_attributes is not None:
        with open(cast(str, dot_git_attributes_fpath), "w") as f:
            for extn in git_fat_attributes.split(","):
                f.write("%s filter=fat -crlf\n" % extn)
    git_fat.run_git_fat(python2_exe, ["init"], cwd=repo_dir, verbose=verbose)
    git_add(repo_dir, files_to_add, verbose)
    git_commit(repo_dir, "initialized git-fat with remote %s" % git_fat_remote, verbose)
def switch_git_branch(local_path, branch, verbose):
    """Check out the given branch in the git repo at local_path.

    Raises ConfigurationError (with the underlying error chained) if the
    checkout fails.
    """
    try:
        call_subprocess([GIT_EXE_PATH, "checkout", branch], cwd=local_path, verbose=verbose)
    except Exception as checkout_error:
        message = "Unable to switch git repo at %s to branch %s" % (local_path, branch)
        raise ConfigurationError(message) from checkout_error
def _get_snapshot_manifest_as_bytes(self, hash_val: str) -> bytes:
    """Return the raw bytes of the snapshot manifest for hash_val.

    Raises ConfigurationError when no manifest exists for that hash.
    """
    manifest_path = join(
        self.workspace_dir, SNAPSHOT_DIR_PATH,
        "snapshot-%s.json" % hash_val.lower())
    if not exists(manifest_path):
        raise ConfigurationError("No snapshot found for hash value %s" % hash_val)
    with open(manifest_path, "rb") as f:
        return f.read()
def get_snapshot_by_tag(self, tag: str) -> SnapshotMetadata:
    """Given a tag, return the associated snapshot metadata.

    This lookup could be slower, if a reverse index is not kept. Raises
    ConfigurationError when no snapshot carries the tag.
    """
    md_dir = join(self.workspace_dir, SNAPSHOT_METADATA_DIR_PATH)
    if not exists(md_dir):
        raise ConfigurationError(f"Snapshot for tag {tag} not found")
    tag_pattern = re.compile(re.escape(tag))
    for fname in os.listdir(md_dir):
        if not fname.endswith("_md.json"):
            continue
        with open(join(md_dir, fname), "r") as f:
            raw_data = f.read()
        # Cheap textual scan first; only JSON-parse files mentioning the tag.
        if tag_pattern.search(raw_data) is None:
            continue
        md = SnapshotMetadata.from_json(json.loads(raw_data))
        if md.has_tag(tag):
            return md
    raise ConfigurationError("Snapshot for tag %s not found" % tag)
def __init__(
    self,
    model_name: str,
    monitor: str = "val_loss",
    save_best_only: bool = False,
    mode: str = "auto",
    save_freq: Union[str, int] = "epoch",
    results_resource: Optional[Union[str, ResourceRef]] = None,
    workspace_dir: Optional[str] = None,
    verbose: Union[int, bool] = 0,
):
    """
    model_name is used to create the checkpoint filenames. The checkpoints
    will be saved as MODEL_NAME_{epoch}.

    Currently, only supports save_weights_only option.

    verbose can be either 0,1 in the style of tensorflow or a True,False
    in the style of Data Workspaces.

    Raises ConfigurationError if the results resource is not file-based.
    """
    self.dws_model_name = model_name
    # Normalize verbosity into tensorflow's 0/1 and DWS's boolean forms.
    if verbose == 0 or verbose == False:
        tf_verbose = 0
        dws_verbose = False
    else:
        tf_verbose = 1
        dws_verbose = True
    self.workspace = find_and_load_workspace(
        batch=True, verbose=dws_verbose, uri_or_local_path=workspace_dir)
    # Resolve which results resource checkpointed metrics will be written to.
    results_ref = _find_resource(self.workspace, ResourceRoles.RESULTS, results_resource)
    self.results_resource = self.workspace.get_resource(results_ref.name)
    if not isinstance(self.results_resource, FileResourceMixin):
        raise ConfigurationError(
            "Resource %s is not a file-based resource" % results_ref.name)
    self.results_subdir = results_ref.subpath  # type: Optional[str]
    # Checkpoints are written under a "checkpoints" subdirectory of the
    # workspace's scratch directory, created on demand.
    scratch_dir = self.workspace.get_scratch_directory()
    assert isdir(scratch_dir), "missing scratch directory %s" % scratch_dir
    self.dws_checkpoint_path = join(scratch_dir, "checkpoints")  # type: str
    if not isdir(self.dws_checkpoint_path):
        os.mkdir(self.dws_checkpoint_path)
    self.checkpoint_filepath_template = join(self.dws_checkpoint_path, model_name + "_{epoch}")
    super().__init__(
        filepath=self.checkpoint_filepath_template,
        monitor=monitor,
        save_best_only=save_best_only,
        mode=mode,
        save_freq=save_freq,
        save_weights_only=True,
        verbose=tf_verbose,
    )
def publish_command(workspace: Workspace, remote_repository: str) -> None:
    """Point the workspace at a new remote repository and report success.

    Raises ConfigurationError for workspaces that are not synced.
    """
    if not isinstance(workspace, SyncedWorkspaceMixin):
        raise ConfigurationError(
            "Workspace %s does not support publish command; only supported for synced workspaces"
            % workspace.name
        )
    workspace.publish(remote_repository)
    click.echo("Set remote origin to %s" % remote_repository)
def is_git_dirty(cwd):
    """See if the git repo is dirty. We are looking for untracked files,
    changes in staging, and changes in the working directory.

    Returns True if dirty, False if clean. Raises ConfigurationError if git
    is not available or the status command fails.
    """
    if GIT_EXE_PATH is None:
        raise ConfigurationError("git executable not found")
    cmd = [GIT_EXE_PATH, "status", "--porcelain"]
    p = run(cmd, cwd=cwd, stdout=PIPE, encoding="utf-8")
    # Check the exit code before trusting the output (previously the output
    # was scanned first, so a failed run with stray output could be
    # misreported as "dirty").
    if p.returncode != 0:
        raise ConfigurationError("Problem invoking %s status on %s" % (GIT_EXE_PATH, cwd))
    for line in p.stdout.split("\n"):
        # Every non-empty porcelain line describes a change. The previous
        # version only matched the ?/D/M/A status letters, which missed
        # renames (R), copies (C), and unmerged entries (U).
        if len(line) >= 2:
            return True
    return False
def restore_precheck(self, hashval):
    """Verify that the snapshot file for hashval is available, either in the
    local snapshot cache or in the bucket's .snapshots area on S3.

    Raises ConfigurationError when the snapshot cannot be found remotely.
    """
    snapshot_file = hashval + '.json.gz'
    if exists(join(self.snapshot_cache_dir, snapshot_file)):
        # Already cached locally; nothing to check remotely.
        return
    snapshot_s3_path = join(join(self.bucket_name, '.snapshots'), snapshot_file)
    if not self.fs.exists(snapshot_s3_path):
        raise ConfigurationError(
            f"File s3://{snapshot_s3_path} not found for snapshot {hashval}"
        )
def _get_resource_params(self, resource_name) -> JSONDict:
    """Get the parameters for this resource from the workspace's metadata
    store - used when instantiating resources. Throws an exception if the
    resource does not exist.
    """
    if resource_name in self.resource_params_by_name:
        return self.resource_params_by_name[resource_name]
    raise ConfigurationError(
        "A resource by the name '%s' does not exist in this workspace"
        % resource_name)
def get_remote_origin_url(repo_dir: str, verbose: bool) -> str:
    """Return the 'remote.origin.url' setting of the git repo at repo_dir.

    Raises ConfigurationError (with the cause chained) if the lookup fails,
    e.g. when no remote origin is configured.
    """
    cmd = [GIT_EXE_PATH, "config", "--get", "remote.origin.url"]
    try:
        return call_subprocess(cmd, cwd=repo_dir, verbose=verbose).strip()
    except Exception as e:
        raise ConfigurationError(
            "Problem getting remote origin from repository at %s. Do you have a remote origin configured?"
            % repo_dir) from e
def make_lineage_table(
    workspace_uri_or_path: Optional[str] = None,
    tag_or_hash: Optional[str] = None,
    verbose: bool = False,
) -> Iterable[Tuple[str, str, str, Optional[List[str]]]]:
    """Make a table of the lineage for each resource.
    The columns are: ref, lineage type, details, inputs
    """
    workspace = find_and_load_workspace(True, verbose, workspace_uri_or_path)
    # Both unsupported cases produce the identical error message.
    if (not isinstance(workspace, SnapshotWorkspaceMixin)
            or not workspace.supports_lineage()):
        raise ConfigurationError("Workspace %s does not support lineage" % workspace.name)
    snapshot_hash = None  # type: Optional[str]
    if tag_or_hash is not None:
        snapshot_hash = workspace.get_snapshot_by_tag_or_hash(tag_or_hash).hashval
    return lu.make_lineage_table(
        workspace.get_instance(), workspace.get_lineage_store(), snapshot_hash
    )