Пример #1
0
    def __init__(
        self,
        name: str,
        default_value: Any,
        optional: bool,
        help: str,
        ptype: ParamType,
        allow_missing: bool = False,
    ):
        """Describe one parameter (workspace config param or resource param).

        ``optional`` says whether None is an acceptable value. With
        ``optional=False`` and ``default_value=None`` the parameter has to
        be given explicitly. Also pass ``optional=False`` when None must
        never be accepted, i.e. the user may not override the default
        with None.

        ``allow_missing`` is meant for parameters that cannot be None but
        may be absent from a workspace created by an older version of DWS;
        the intent is that the default value is then applied with a warning
        (that handling lives outside this constructor). It therefore
        requires a non-None default value.

        A non-None ``default_value`` is checked with ``ptype.validate``.
        Raises InternalError if ``allow_missing`` is set without a default.
        """
        has_default = default_value is not None
        self.name, self.default_value = name, default_value
        self.optional, self.help, self.ptype = optional, help, ptype
        if has_default:
            # a supplied default must itself be a legal value for the type
            ptype.validate(default_value)
        self.allow_missing = allow_missing
        if allow_missing and not has_default:
            raise InternalError(
                "Parameter %s: Allow missing requires a default value" % name)
Пример #2
0
def pull_command(
    workspace: Workspace,
    only: Optional[List[str]] = None,
    skip: Optional[List[str]] = None,
    only_workspace: bool = False,
) -> int:
    """Pull the workspace and/or its resources and save the workspace.

    For a synced workspace the workspace itself is pulled first, and the
    resources are pulled afterwards unless ``only_workspace`` is set. For a
    central workspace only the resources are pulled, and ``only_workspace``
    is rejected with a ConfigurationError. Any other kind of workspace
    raises InternalError.

    Returns the count reported by ``_pull_and_clone_resources`` (0 when
    only the workspace was synced).
    """
    is_synced = isinstance(workspace, SyncedWorkspaceMixin)
    if not is_synced and not isinstance(workspace, CentralWorkspaceMixin):
        raise InternalError(
            "Workspace %s is neither a SyncedWorkspaceMixin nor a CentralWorkspaceMixin"
            % workspace.name)

    if is_synced:
        # the workspace itself has to be brought up to date first
        click.echo("Syncing workspace")
        workspace = cast(Workspace, workspace.pull_workspace())
        resource_count = (0 if only_workspace else
                          _pull_and_clone_resources(workspace, only, skip))
    else:
        if only_workspace:
            raise ConfigurationError(
                "--only-workspace not valid for central workspace %s" %
                workspace.name)
        resource_count = _pull_and_clone_resources(workspace, only, skip)

    workspace.save("Pull command")
    return resource_count
def commit_changes_in_repo_subdir(local_path,
                                  subdir,
                                  message,
                                  remove_empty_dirs=False,
                                  verbose=False):
    """Commit whatever changed under ``subdir`` of the repo at ``local_path``.

    The output of ``git status --porcelain`` restricted to ``subdir`` is
    walked line by line: untracked and modified working-tree entries are
    added, deleted ones are removed with ``git rm``, and entries that are
    already staged just mark the repo as needing a commit. A commit limited
    to ``subdir`` is made only if at least one such entry was seen.

    When ``remove_empty_dirs`` is set, the parent directory of every deleted
    path is handed to ``remove_dir_if_empty``. Raises InternalError if git
    reports a path outside ``subdir``.
    """
    subdir = subdir if subdir.endswith("/") else subdir + "/"

    def git(*git_args):
        # every git invocation here shares the same cwd and verbosity
        return call_subprocess([GIT_EXE_PATH] + list(git_args),
                               cwd=local_path,
                               verbose=verbose)

    porcelain = git("status", "--porcelain", subdir)
    emptied_dir_candidates = []
    dirty = False
    for entry in porcelain.split("\n"):
        if len(entry) < 2:
            continue
        # column 0: staging-area status, column 1: working-tree status,
        # remainder: path relative to the repo
        index_flag, tree_flag = entry[0], entry[1]
        path = entry[2:].strip()
        if path[0] == '"' and path[-1] == '"':  # issue 79
            # git wraps paths containing spaces in quotes
            path = path[1:-1]
        if not path.startswith(subdir):
            raise InternalError("Git status line not in subdirectory %s: %s" %
                                (subdir, entry))
        if tree_flag in ("?", "M"):
            # untracked or modified in the working tree: stage it
            git("add", path)
            dirty = True
        elif tree_flag == "D":
            git("rm", path)
            emptied_dir_candidates.append(dirname(join(local_path, path)))
            dirty = True
        elif index_flag in ("?", "A", "D", "M"):
            # already staged, nothing to add, but a commit is required
            dirty = True
            if index_flag == "D":
                emptied_dir_candidates.append(
                    dirname(join(local_path, path)))
        elif verbose:
            click.echo("Skipping git status line: '%s'" % entry)

    if remove_empty_dirs:
        top = join(local_path, subdir)
        for candidate in emptied_dir_candidates:
            remove_dir_if_empty(candidate, top, verbose=verbose)
    if dirty:
        git("commit", "--only", subdir, "-m", message)
Пример #4
0
def run_git_fat(python2_exe, args, cwd=curdir, verbose=False):
    """Run the ``git-fat`` script located next to this file.

    The script is executed with the interpreter ``python2_exe`` and the
    argument list ``args`` from directory ``cwd``. In verbose mode the
    command line is echoed and the child gets a copy of the current
    environment with ``GIT_FAT_VERBOSE=1``; otherwise no explicit
    environment is passed.

    Raises InternalError if the script file is missing or the process
    exits with a non-zero status.
    """
    script_path = join(THIS_FILES_DIRPATH, 'git-fat')
    if not exists(script_path):
        raise InternalError("Missing %s" % script_path)
    command_line = [python2_exe, script_path] + args
    child_env = None  # type: Optional[Dict[str,Any]]
    if verbose:
        click.echo("%s from %s" % (' '.join(command_line), cwd))
        verbose_env = os.environ.copy()
        verbose_env['GIT_FAT_VERBOSE'] = "1"
        child_env = verbose_env
    completed = subprocess.run(command_line, cwd=cwd, env=child_env)
    try:
        completed.check_returncode()
    except Exception as e:
        raise InternalError("git-fat execution with args '%s' failed." %
                            ' '.join(args)) from e
Пример #5
0
 def remove_tag_from_snapshot(self, hash_val: str, tag: str) -> None:
     """Remove the specified tag from the specified snapshot.

     The snapshot's metadata file (``<hash>_md.json`` in the snapshot
     metadata directory of the workspace) is loaded, the tag is dropped
     from its tag list, and the file is rewritten. All other tags are
     left in place.

     Raises InternalError if either the snapshot metadata file or the
     tag does not exist.
     """
     md_filename = join(
         join(self.workspace_dir, SNAPSHOT_METADATA_DIR_PATH),
         "%s_md.json" % hash_val.lower())
     if not exists(md_filename):
         raise InternalError("No metadata entry for snapshot %s" % hash_val)
     with open(md_filename, "r") as f:
         data = json.load(f)
     md = ws.SnapshotMetadata.from_json(data)
     assert md.hashval == hash_val
     if tag not in md.tags:
         raise InternalError("Tag %s not found in snapshot %s" %
                             (tag, hash_val))
     # The loop variable must not be called ``tag``: it would shadow the
     # parameter, the comparison would always be false, and every tag on
     # the snapshot would be dropped.
     md.tags = [existing for existing in md.tags if existing != tag]
     with open(md_filename, "w") as f:
         json.dump(md.to_json(), f, indent=2)
 def _load_snapshot(self, snapshot_hash: str) -> S3Snapshot:
     """Return the snapshot for ``snapshot_hash``, fetching it if needed.

     The snapshot file ``<hash>.json.gz`` is looked up in the local
     snapshot cache directory first. If it is not there, it is fetched
     through ``self.fs`` from the ``.snapshots`` prefix of the bucket into
     the cache. Raises InternalError if the file does not exist remotely.
     """
     filename = snapshot_hash + '.json.gz'
     cached_copy = join(self.snapshot_cache_dir, filename)
     if exists(cached_copy):
         return S3Snapshot.read_snapshot_from_file(cached_copy)

     # not cached yet: download it before reading
     snapshots_prefix = join(self.bucket_name, '.snapshots')
     remote_key = join(snapshots_prefix, filename)
     if not self.fs.exists(remote_key):
         raise InternalError(
             f"File s3://{remote_key} not found for snapshot {snapshot_hash}"
         )
     self.fs.get(remote_key, cached_copy)
     return S3Snapshot.read_snapshot_from_file(cached_copy)
def switch_git_branch_if_needed(local_path,
                                branch,
                                verbose,
                                ok_if_not_present=False):
    """Switch the repo at ``local_path`` to ``branch`` unless already on it.

    Branch information comes from ``get_branch_info``. Nothing is done when
    ``branch`` is the current branch. If the branch is not among the other
    branches reported for the repo, InternalError is raised unless
    ``ok_if_not_present`` is true, in which case the switch is attempted
    anyway.
    """
    (current, others) = get_branch_info(local_path, verbose)
    if branch == current:
        return
    if (branch not in others) and (not ok_if_not_present):
        # report the repository location, not the list of branches
        raise InternalError(
            "Trying to switch to branch %s not in repo at %s" %
            (branch, local_path))
    switch_git_branch(local_path, branch, verbose)
 def get_value(self, name: str) -> Any:
     """Return the effective value of parameter ``name``.

     A name without a stored value yields the default from its
     definition. A stored value is returned as-is unless it is None for a
     non-optional parameter: then the string "MISSING" is returned when
     the definition allows missing values, and InternalError is raised
     otherwise. A name without a definition raises KeyError from the
     ``self.defs`` lookup.
     """
     definition = self.defs[name]
     if name not in self.params:
         return definition.default_value
     stored = self.params[name]
     if stored is not None or definition.optional:
         return stored
     if definition.allow_missing:
         # NOTE(review): a literal marker string is returned here rather
         # than the default value - confirm callers expect this.
         return "MISSING"
     raise InternalError(
         "Missing value for %s %s parameter %s"
         % (self.get_scope(), self.get_what_for(), name)
     )
Пример #9
0
 def _get_local_scratch_space_for_resource(
         self,
         resource_name: str,
         create_if_not_present: bool = False) -> str:
     """Return the scratch directory of a resource inside the workspace.

     The directory is ``.dataworkspace/scratch/<resource_name>`` below the
     workspace directory. If it does not exist and
     ``create_if_not_present`` is False, InternalError is raised; otherwise
     the directory is created and a matching entry is ensured in
     ``.dataworkspace/.gitignore``.
     """
     scratch_path = join(self.workspace_dir,
                         ".dataworkspace/scratch/%s" % resource_name)
     if isdir(scratch_path):
         return scratch_path

     if create_if_not_present is False:
         raise InternalError(
             "Scratch path '%s' for resource %s is missing" %
             (scratch_path, resource_name))

     os.makedirs(scratch_path)
     gitignore_entry = "/scratch/%s/" % resource_name
     ensure_entry_in_gitignore(
         self.workspace_dir,
         ".dataworkspace/.gitignore",
         gitignore_entry,
         commit=True,
     )
     return scratch_path
def get_subdirectory_hash(repo_dir, relpath, verbose=False):
    """Return git's hash for ``relpath`` at the HEAD revision.

    This matches the hash that git is storing internally, so running
    ``git cat-file -p HASH`` should list the contents. The value is taken
    from the ``git ls-tree -t HEAD`` output line whose path equals
    ``relpath``; InternalError is raised if there is no such line.
    """
    cmd = [GIT_EXE_PATH, "ls-tree", "-t", "HEAD", relpath]
    if verbose:
        click.echo("%s [run in %s]" % (" ".join(cmd), repo_dir))
    completed = run(cmd,
                    cwd=repo_dir,
                    encoding="utf-8",
                    stdout=PIPE,
                    stderr=PIPE)
    completed.check_returncode()
    matches = (LS_TREE_RE.match(text) for text in completed.stdout.split("\n"))
    for found in matches:
        # lines that do not look like ls-tree entries are ignored
        if found is not None and found.group(2) == relpath:
            return found.group(1)
    raise InternalError("Did not find subdirectory '%s' in git ls-tree" %
                        relpath)
def get_branch_info(local_path, verbose=False):
    """Return ``(current_branch, other_branches)`` for a local git repo.

    Parses the output of ``git branch``: the line starting with ``*`` names
    the current branch, every other non-blank line is collected as another
    branch. Raises InternalError if no current branch line is found.
    """
    listing = call_subprocess([GIT_EXE_PATH, "branch"],
                              cwd=local_path,
                              verbose=verbose)
    active = None
    inactive = []
    for raw in listing.split("\n"):
        entry = raw.strip()
        if len(entry) == 0:
            continue
        if not entry.startswith("*"):
            inactive.append(entry)
            continue
        # only one branch may be marked as current
        assert active is None
        active = entry[2:]  # drop the leading "* " marker
    if active is None:
        raise InternalError(
            "Problem obtaining branch information for local git repo at %s" %
            local_path)
    return (active, inactive)
Пример #12
0
    def clone_workspace(local_params: JSONDict, batch: bool, verbose: bool,
                        *args) -> ws.Workspace:
        """Clone a git-backed workspace and return the loaded workspace.

        ``args`` is ``REPOSITORY_URL [DIRECTORY]``. When DIRECTORY is not
        given, the repo is first cloned into a uniquely named directory
        under the current directory and then renamed to the workspace name
        found in the cloned configuration file.

        After cloning, the local bookkeeping is set up: the scratch
        directory, the local params file (``local_params`` dumped as JSON),
        an empty resource-local-params file, the snapshot / snapshot
        metadata / current lineage directories if absent, and git-fat and
        git-lfs initialization where applicable.

        Raises ConfigurationError for bad arguments, an existing target
        directory, a missing or unwritable parent directory, an
        unreachable remote, or a missing configuration file; InternalError
        if the configuration file has no 'name' property. If anything
        fails after the clone has started, the cloned directory is removed
        and the exception is re-raised.
        """
        # args is REPOSITORY_URL [DIRECTORY]
        if len(args) == 0:
            raise ConfigurationError(
                "Need to specify a Git repository URL when cloning a workspace"
            )
        else:
            repository = args[0]  # type: str
        directory = args[1] if len(args) == 2 else None  # type: Optional[str]
        if len(args) > 2:
            raise ConfigurationError(
                "Clone of git backend expecting at most two arguments, received: %s"
                % repr(args))

        # initial checks on the directory
        if directory:
            directory = abspath(expanduser(directory))
            parent_dir = dirname(directory)
            if isdir(directory):
                raise ConfigurationError(
                    "Clone target directory '%s' already exists" % directory)
            initial_path = directory
        else:
            # no target given: clone under a temporary unique name and
            # rename once the workspace name is known
            parent_dir = abspath(expanduser(curdir))
            initial_path = join(
                parent_dir,
                uuid.uuid4().hex)  # get a unique name within this directory
        if not isdir(parent_dir):
            raise ConfigurationError("Parent directory '%s' does not exist" %
                                     parent_dir)
        if not os.access(parent_dir, os.W_OK):
            raise ConfigurationError("Unable to write into directory '%s'" %
                                     parent_dir)

        verify_git_config_initialized(parent_dir, batch=batch, verbose=verbose)

        # ping the remote repo
        cmd = [GIT_EXE_PATH, "ls-remote", "--quiet", repository]
        try:
            call_subprocess(cmd, parent_dir, verbose)
        except Exception as e:
            raise ConfigurationError(
                "Unable to access remote repository '%s'" % repository) from e

        # we have to clone the repo first to find out its name!
        try:
            cmd = [GIT_EXE_PATH, "clone", repository, initial_path]
            call_subprocess(cmd, parent_dir, verbose)
            config_file = join(initial_path, CONFIG_FILE_PATH)
            if not exists(config_file):
                raise ConfigurationError(
                    "Did not find configuration file in remote repo")
            with open(config_file, "r") as f:
                config_json = json.load(f)
            if "name" not in config_json:
                raise InternalError(
                    "Missing 'name' property in configuration file")
            workspace_name = config_json["name"]
            if directory is None:
                # move the temporary clone to its final, workspace-named
                # location next to it
                new_name = join(parent_dir, workspace_name)
                if isdir(new_name):
                    raise ConfigurationError(
                        "Clone target directory %s already exists" % new_name)
                safe_rename(initial_path, new_name)
                directory = new_name

            # from here on, work against the final directory
            cf_path = join(directory, CONFIG_FILE_PATH)
            if not exists(cf_path):
                raise ConfigurationError(
                    "Did not find workspace config file %s" % cf_path)
            with open(cf_path, "r") as f:
                cf_data = json.load(f)
            global_params = cf_data["global_params"]
            # get the scratch directory (also adds local param if needed)
            abs_scratch_dir = clone_scratch_directory(directory, global_params,
                                                      local_params, batch)
            if DWS_GIT_BRANCH in global_params and (
                    global_params[DWS_GIT_BRANCH] is not None):
                # if the branch is specified, make sure we are on it
                switch_git_branch_if_needed(directory,
                                            global_params[DWS_GIT_BRANCH],
                                            verbose=verbose)
            if not isdir(abs_scratch_dir):
                if verbose:
                    print("Creating scratch directory %s" % abs_scratch_dir)
                os.makedirs(abs_scratch_dir)
            with open(join(directory, LOCAL_PARAMS_PATH), "w") as f:
                json.dump(local_params, f,
                          indent=2)  # create an initial local params file
            with open(join(directory, RESOURCE_LOCAL_PARAMS_PATH), "w") as f:
                json.dump(
                    {}, f, indent=2
                )  # create resource local params, to be populated via resource clones
            snapshot_md_dir = join(directory, SNAPSHOT_METADATA_DIR_PATH)
            if not exists(snapshot_md_dir):
                # It is possible that we are cloning a repo with no snapshots
                os.mkdir(snapshot_md_dir)
            snapshot_dir = join(directory, SNAPSHOT_DIR_PATH)
            if not exists(snapshot_dir):
                # It is possible that we are cloning a repo with no snapshots
                os.mkdir(snapshot_dir)
            current_lineage_dir = join(directory, CURRENT_LINEAGE_DIR_PATH)
            if not exists(current_lineage_dir):
                os.mkdir(current_lineage_dir)
            if is_a_git_fat_repo(directory):
                validate_git_fat_in_path()
                # imported here, so the git_fat module is only loaded for
                # repos that actually use git-fat
                import dataworkspaces.third_party.git_fat as git_fat

                python2_exe = git_fat.find_python2_exe()
                git_fat.run_git_fat(python2_exe, ["init"],
                                    cwd=directory,
                                    verbose=verbose)
                # pull the objects referenced by the current head
                git_fat.run_git_fat(python2_exe, ["pull"],
                                    cwd=directory,
                                    verbose=verbose)
            ensure_git_lfs_configured_if_needed(directory, verbose=verbose)

        except:
            # Bare except on purpose: the partial clone is cleaned up for
            # any exception (including KeyboardInterrupt) and the
            # exception is always re-raised below.
            if isdir(initial_path):
                shutil.rmtree(initial_path)
            if (directory is not None) and isdir(directory):
                shutil.rmtree(directory)
            raise

        return WorkspaceFactory.load_workspace(batch, verbose,
                                               urlparse(directory))
Пример #13
0
 def publish(self, *args) -> None:
     """Publish the workspace by setting its git remote origin.

     Expects exactly one positional argument, the remote repository, which
     is passed to ``set_remote_origin`` for the workspace directory.
     Raises InternalError for any other number of arguments.
     """
     if len(args) != 1:
         # ``args`` is a tuple whose length is not 1 here, so using it
         # directly as the right-hand side of ``%`` would raise TypeError
         # instead of producing the message; format its repr instead.
         raise InternalError(
             "publish takes one argument: remote_repository, got %s" %
             repr(args))
     set_remote_origin(self.workspace_dir, args[0], verbose=self.verbose)
Пример #14
0
 def restore(self, restore_hashval: str) -> None:
     """Always fail: this resource cannot be restored.

     Raises InternalError naming the resource, regardless of
     ``restore_hashval``.
     """
     message = ("Attempt to restore resource %s, which is not restoreable" %
                self.name)
     raise InternalError(message)
Пример #15
0
def run_command(workspace: Workspace, step_name: str, cwd: str, command: str,
                args: List[str]):
    """Placeholder for the run command; always raises InternalError."""
    not_supported = InternalError("Run command not yet supported")
    raise not_supported
Пример #16
0
def restore_command(
    workspace: Workspace,
    tag_or_hash: str,
    only: Optional[List[str]] = None,
    leave: Optional[List[str]] = None,
    strict: bool = False,
) -> int:
    """Run the restore and return the number of resources affected.

    The snapshot is looked up by ``tag_or_hash``. The set of resources to
    restore starts from ``only`` (which also forces strict mode), from all
    known names minus ``leave``, or from all names in the workspace and
    snapshot. Result resources, resources no longer in the workspace and
    resources not in the snapshot are then either rejected (strict mode)
    or reported and dropped from the set.

    Unless the workspace is in batch mode, the user is asked to confirm.
    Returns 0 without restoring anything if no resources remain.

    Raises ConfigurationError if the workspace does not support snapshots,
    a resource name is unknown, or a strict-mode check fails; ApiParamError
    if both ``only`` and ``leave`` are given; InternalError if a selected
    resource is not a SnapshotResourceMixin; UserAbort if the user declines.
    """
    if not isinstance(workspace, SnapshotWorkspaceMixin):
        raise ConfigurationError("Workspace %s does not support snapshots" %
                                 workspace.name)
    mixin = cast(SnapshotWorkspaceMixin, workspace)
    # First, find the history entry
    md = mixin.get_snapshot_by_tag_or_hash(tag_or_hash)

    # process the lists of resources
    current_names = set(workspace.get_resource_names())
    # get the non-null resources in snapshot
    snapshot_names = set([
        rn for rn in md.restore_hashes.keys()
        if md.restore_hashes[rn] is not None
    ])
    all_names = current_names.union(snapshot_names)
    if (only is not None) and (leave is not None):
        raise ApiParamError(
            "Cannot specify both only and leave for restore command.")
    elif only is not None:
        # For only, we will be a little stricter, as the user is explicitly
        # specifying the resources.
        restore_set = set(only)
        strict = True
    elif leave is not None:
        restore_set = all_names.difference(leave)
    else:
        restore_set = all_names

    # We need to remove result resources from the restore set, as we
    # do not restore them to their prior state.
    # NOTE(review): this runs before the name validation further down, so
    # get_resource_role may be called with names that are not in the
    # current workspace - confirm it tolerates that.
    result_resources = {
        rname
        for rname in restore_set
        if workspace.get_resource_role(rname) == ResourceRoles.RESULTS
    }
    # result_resources was built from restore_set, so this intersection
    # contains the same names
    result_resources_in_restore_set = result_resources.intersection(
        restore_set)
    if len(result_resources_in_restore_set) > 0:
        if strict:
            raise ConfigurationError(
                "Restore set contains result resources, which cannot be restored. The following are result resources: %s"
                % ", ".join(result_resources_in_restore_set))
        else:
            click.echo(
                "Skipping the restore of the following result resources, which are left in their latest state: %s"
                % ", ".join(result_resources_in_restore_set))
            restore_set = restore_set.difference(result_resources)

    # error checking
    # names that are neither in the workspace nor in the snapshot
    invalid = restore_set.difference(all_names)
    if len(invalid) > 0:
        raise ConfigurationError("Resource name(s) not found: %s" %
                                 ", ".join(sorted(invalid)))
    # names selected for restore that are not in the current workspace
    removed_names = restore_set.difference(current_names)
    if len(removed_names) > 0:
        if strict:
            raise ConfigurationError(
                "Resources have been removed from workspace or have no restore hash and strict mode is enabled."
                + " Removed resources: %s" % ", ".join(sorted(removed_names)))
        else:
            click.echo(
                "Skipping restore of resources that have been removed from workspace or have no restore hash: %s"
                % ", ".join(sorted(removed_names)),
                err=True,
            )
            restore_set = restore_set.difference(removed_names)
    # names selected for restore that have no non-null hash in the snapshot
    added_names = restore_set.difference(snapshot_names)
    if len(added_names) > 0:
        if strict:
            raise ConfigurationError(
                "Resources have been added to workspace since restore, and strict mode enabled."
                + " Added resources: %s" % ", ".join(sorted(added_names)))
        else:
            click.echo(
                "Resources have been added to workspace since restore, will leave them as-is: %s"
                % ", ".join(sorted(added_names)),
                err=True,
            )
            restore_set = restore_set.difference(added_names)

    # get ordered list of names and resources as well as restore hashes
    restore_name_list = [
        rn for rn in workspace.get_resource_names() if rn in restore_set
    ]
    if len(restore_name_list) == 0:
        click.echo("No resources to restore.")
        return 0
    restore_resource_list = [
        workspace.get_resource(rn) for rn in restore_name_list
    ]
    for r in restore_resource_list:
        if not isinstance(r, SnapshotResourceMixin):
            raise InternalError(
                "Resource %s was in snapshot, but is not a SnapshotResourceMixin"
                % r.name)
    restore_hashes = {rn: md.restore_hashes[rn] for rn in restore_set}

    # the conditional applies to the whole formatted string: tagstr is ""
    # when the snapshot has no tags
    tagstr = " (%s)" % ",".join(md.tags) if len(md.tags) > 0 else ""
    click.echo("Restoring snapshot %s%s" % (md.hashval, tagstr))

    def fmt_rlist(rnames):
        # render a list of names for display, "None" when it is empty
        if len(rnames) > 0:
            return ", ".join(rnames)
        else:
            return "None"

    click.echo("  Resources to restore: %s" % fmt_rlist(restore_name_list))
    names_to_leave = sorted(current_names.difference(restore_set))
    click.echo("  Resources to leave: %s" % fmt_rlist(names_to_leave))
    if not workspace.batch:
        # Unless in batch mode, we always want to ask for confirmation
        resp = input("Should I perform this restore? [Y/n]")
        # an empty response counts as "yes"
        if resp.lower() != "y" and resp != "":
            raise UserAbort()

    # do the work!
    mixin.restore(md.hashval, restore_hashes,
                  cast(List[SnapshotResourceMixin], restore_resource_list))
    workspace.save("Restore to %s" % md.hashval)

    return len(restore_name_list)
 def clone(self, params: JSONDict,
           workspace: Workspace) -> LocalStateResourceMixin:
     """Instantiate a resource that was created remotely. This should not be called,
     since we have no local state.

     Always raises InternalError naming the resource from ``params['name']``.
     """
     # The message needs the f prefix; without it the placeholder text is
     # emitted literally instead of the resource name.
     raise InternalError(f"Clone called for S3 resource {params['name']}")