Example #1
def decompress_file(archive, dir_):
    """Decompress `archive` into a directory `dir_`

    Parameters
    ----------
    archive : str
        Path to the archive file to decompress.
    dir_ : str
        Path to an existing directory to extract into.
    """
    with swallow_outputs() as cmo:
        archive = ensure_bytes(archive)
        dir_ = ensure_bytes(dir_)
        patoolib.util.check_existing_filename(archive)
        patoolib.util.check_existing_filename(dir_, onlyfiles=False)
        # Call the protected _extract_archive below to avoid existence checks
        # on the unixified path
        outdir = unixify_path(dir_)
        # unicode (not bytes) should be supplied in PY3 to avoid b'' in paths
        outdir = ensure_unicode(outdir)
        archive = ensure_unicode(archive)

        format_compression = patoolib.get_archive_format(archive)
        if format_compression == ('gzip', None):
            # Yarik fell into the trap of being lazy and not providing proper
            # support for .gz .xz etc "stream archiver" formats in the handling
            # of archives. ATM our support for .gz relies on the behavior of 7z
            # while extracting them and respecting a possibly present .gz
            # filename header field.
            # See more https://github.com/datalad/datalad/pull/3176#issuecomment-466819861
            # TODO: provide proper handling of all those archives without
            # relying on any filename being stored in the header
            program = patoolib.find_archive_program(format_compression[0],
                                                    'extract')
            if basename(program) != '7z':
                raise MissingExternalDependency(
                    "cmd:7z",
                    msg="(Not) Funny enough but ATM we need p7zip installation "
                    "to handle .gz files extraction 'correctly'")

        patoolib._extract_archive(unixify_path(archive),
                                  outdir=outdir,
                                  verbosity=100)
        if cmo.out:
            lgr.debug("patool gave stdout:\n%s", cmo.out)
        if cmo.err:
            lgr.debug("patool gave stderr:\n%s", cmo.err)

    # Note: (ben) Experienced an issue where an extracted tarball
    # lacked the execution bit on directories, leading to not being
    # able to delete them despite having write permission.
    # Can't imagine a situation where we would want to fail on
    # that kind of mess, so, to be sure, set it.

    if not on_windows:
        os.chmod(dir_, os.stat(dir_).st_mode | os.path.stat.S_IEXEC)
        for root, dirs, files in os.walk(dir_, followlinks=False):
            for d in dirs:
                subdir = opj(root, d)
                os.chmod(subdir,
                         os.stat(subdir).st_mode | os.path.stat.S_IEXEC)
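
A minimal usage sketch of the function above. The import path is only an assumption based on the file names shown in these examples, and both paths are hypothetical:

import os
from datalad.support.archives import decompress_file  # assumed import location

# The target directory must already exist (the function checks for it).
os.makedirs('/tmp/example-extracted', exist_ok=True)
decompress_file('/tmp/example.tar.gz', '/tmp/example-extracted')
# On non-Windows systems the extracted directories also get the executable bit
# set afterwards, so they stay traversable and removable.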
Example #2
File: archives.py  Project: datalad/datalad
    def _extract_archive(self, path):
        # we need to extract the archive
        # TODO: extract to _tmp and then move in a single command so we
        # don't end up picking up broken pieces
        lgr.debug(
            u"Extracting {self._archive} under {path}".format(**locals()))
        if exists(path):
            lgr.debug(
                "Previously extracted (but probably not fully) cached archive "
                "found. Removing %s", path)
            rmtree(path)
        os.makedirs(path)
        assert (exists(path))
        # remove old stamp
        if exists(self.stamp_path):
            rmtree(self.stamp_path)
        decompress_file(self._archive, path, leading_directories=None)
        # TODO: must be made optional since we might want to use this content,
        # move it into the tree, etc.
        # lgr.debug("Adjusting permissions to R/O for the extracted content")
        # rotree(path)
        assert (exists(path))
        # create a stamp
        with open(self.stamp_path, 'wb') as f:
            f.write(ensure_bytes(self._archive))
        # assert that the stamp mtime is not older than the archive's directory
        assert (self.is_extracted)
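
The stamp file written at the end is what the `is_extracted` check relies on. A standalone sketch of that idiom (not datalad code; the helper name is hypothetical), assuming "extracted" means the stamp exists and is not older than the extracted directory:

import os

def is_extraction_complete(stamp_path, extracted_dir):
    # Extraction counts as complete only if the stamp file exists and its
    # mtime is not older than that of the extracted directory.
    return (
        os.path.exists(stamp_path)
        and os.path.exists(extracted_dir)
        and os.stat(stamp_path).st_mtime >= os.stat(extracted_dir).st_mtime
    )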
Example #3
def _get_cached_filename(archive):
    """A helper to generate a filename which has original filename and additional suffix
    which wouldn't collide across files with the same name from different locations
    """
    #return "%s_%s" % (basename(archive), hashlib.md5(archive).hexdigest()[:5])
    # per se there is no reason to maintain any long original name here.
    archive_cached = hashlib.md5(ensure_bytes(str(Path(archive).resolve()))).hexdigest()[:10]
    lgr.debug("Cached directory for archive %s is %s", archive, archive_cached)
    return archive_cached
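
The same naming scheme is easy to reproduce standalone; a minimal sketch with a hypothetical path:

import hashlib
from pathlib import Path

archive = "data/archive.tar.gz"  # hypothetical path
# md5 of the resolved (absolute) path, truncated to 10 hex characters, so
# identically named archives in different locations get distinct cache names.
cache_name = hashlib.md5(str(Path(archive).resolve()).encode()).hexdigest()[:10]
print(cache_name)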
Example #4
File: archives.py  Project: datalad/datalad
    def checkurl(self, url):
        # TODO: what about those MULTI and list to be returned?
        #  should we return all filenames or keys within the archive?
        #  might be way too many?
        #  only if just the archive portion of the url is given, or one
        #  pointing to a specific file?
        lgr.debug("Current directory: %s, url: %s", os.getcwd(), url)
        akey, afile, attrs = self._parse_url(url)
        size = attrs.get('size', None)

        # But reply that it is present only if the archive is present
        # TODO: this would throw an exception if not present, so this statement
        # is kinda bogus
        akey_path = self.get_contentlocation(akey, absolute=True)
        if akey_path:
            # Extract via cache only if size is not yet known
            if size is None:
                # if for testing we want to force getting the archive extracted
                efile = self.cache[akey_path].get_extracted_filename(afile)
                efile = ensure_bytes(efile)

                if op.exists(efile):
                    size = os.stat(efile).st_size

            # so it was a good successful one -- record
            self._last_url = url

            if size is None:
                return True
            else:
                # FIXME: providing filename causes annex to not even talk to
                # ask upon drop :-/
                return [dict(size=size)]  # , basename(afile))

        else:
            # TODO: theoretically we should first check if key is available
            # from any remote to know if file is available
            return False
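
For orientation, the reply follows a three-way contract: False when the archive key is not locally present, True when it is present but the size is unknown, and a list carrying a size entry when the size could be determined. A small illustration of how a caller might interpret such a reply (the helper is hypothetical, not datalad API):

def describe_checkurl_reply(reply):
    # Hypothetical helper, only to illustrate the three possible reply shapes.
    if reply is False:
        return "URL cannot be verified right now"
    if reply is True:
        return "URL is available, size unknown"
    return "URL is available, size {}".format(reply[0]["size"])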
Example #5
File: run.py  Project: ypid/datalad
def run_command(cmd,
                dataset=None,
                inputs=None,
                outputs=None,
                expand=None,
                assume_ready=None,
                explicit=False,
                message=None,
                sidecar=None,
                extra_info=None,
                rerun_info=None,
                extra_inputs=None,
                rerun_outputs=None,
                inject=False):
    """Run `cmd` in `dataset` and record the results.

    `Run.__call__` is a simple wrapper over this function. Aside from backward
    compatibility kludges, the only difference is that `Run.__call__` doesn't
    expose all the parameters of this function. The unexposed parameters are
    listed below.

    Parameters
    ----------
    extra_info : dict, optional
        Additional information to dump with the json run record. Any value
        given here will take precedence over the standard run key. Warning: To
        avoid collisions with future keys added by `run`, callers should try to
        use fairly specific key names and are encouraged to nest fields under a
        top-level "namespace" key (e.g., the project or extension name).
    rerun_info : dict, optional
        Record from a previous run. This is used internally by `rerun`.
    extra_inputs : list, optional
        Inputs to use in addition to those specified by `inputs`. Unlike
        `inputs`, these will not be injected into the {inputs} format field.
    rerun_outputs : list, optional
        Outputs, in addition to those in `outputs`, determined automatically
        from a previous run. This is used internally by `rerun`.
    inject : bool, optional
        Record results as if a command was run, skipping input and output
        preparation and command execution. In this mode, the caller is
        responsible for ensuring that the state of the working tree is
        appropriate for recording the command's results.

    Yields
    ------
    Result records for the run.
    """
    if not cmd:
        lgr.warning("No command given")
        return

    rel_pwd = rerun_info.get('pwd') if rerun_info else None
    if rel_pwd and dataset:
        # recording is relative to the dataset
        pwd = normpath(opj(dataset.path, rel_pwd))
        rel_pwd = relpath(pwd, dataset.path)
    else:
        pwd, rel_pwd = get_command_pwds(dataset)

    ds = require_dataset(dataset,
                         check_installed=True,
                         purpose='tracking outcomes of a command')
    ds_path = ds.path

    lgr.debug('tracking command output underneath %s', ds)

    if not (rerun_info or inject):  # Rerun already takes care of this.
        # For explicit=True, we probably want to check whether any inputs have
        # modifications. However, we can't just do is_dirty(..., path=inputs)
        # because we need to consider subdatasets and untracked files.
        # MIH: is_dirty() is gone, but status() can do all of the above!
        if not explicit and ds.repo.dirty:
            yield get_status_dict(
                'run',
                ds=ds,
                status='impossible',
                message=(
                    'clean dataset required to detect changes from command; '
                    'use `datalad status` to inspect unsaved changes'))
            return

    cmd = normalize_command(cmd)

    inputs = GlobbedPaths(inputs, pwd=pwd, expand=expand in ["inputs", "both"])
    extra_inputs = GlobbedPaths(
        extra_inputs,
        pwd=pwd,
        # Follow same expansion rules as `inputs`.
        expand=expand in ["inputs", "both"])
    outputs = GlobbedPaths(outputs,
                           pwd=pwd,
                           expand=expand in ["outputs", "both"])

    # ATTN: For correct path handling, all dataset command calls should be
    # unbound. They should (1) receive a string dataset argument, (2) receive
    # relative paths, and (3) happen within a chpwd(pwd) context.
    if not inject:
        with chpwd(pwd):
            for res in prepare_inputs(
                    ds_path,
                    [] if assume_ready in ["inputs", "both"] else inputs,
                    # Ignore --assume-ready for extra_inputs. It's an unexposed
                    # implementation detail that lets wrappers sneak in inputs.
                    extra_inputs):
                yield res

            if assume_ready not in ["outputs", "both"]:
                if outputs:
                    for res in _install_and_reglob(ds_path, outputs):
                        yield res
                    for res in _unlock_or_remove(ds_path, outputs.expand()):
                        yield res

                if rerun_outputs is not None:
                    for res in _unlock_or_remove(ds_path, rerun_outputs):
                        yield res
    else:
        # If an inject=True caller wants to override the exit code, they can do
        # so in extra_info.
        cmd_exitcode = 0
        exc = None

    try:
        cmd_expanded = format_command(
            ds,
            cmd,
            pwd=pwd,
            dspath=ds_path,
            # Check if the command contains "{tmpdir}" to avoid creating an
            # unnecessary temporary directory in most but not all cases.
            tmpdir=mkdtemp(prefix="datalad-run-") if "{tmpdir}" in cmd else "",
            inputs=inputs,
            outputs=outputs)
    except KeyError as exc:
        yield get_status_dict(
            'run',
            ds=ds,
            status='impossible',
            message=('command has an unrecognized placeholder: %s', exc))
        return

    if not inject:
        cmd_exitcode, exc = _execute_command(
            cmd_expanded,
            pwd,
            expected_exit=rerun_info.get("exit", 0) if rerun_info else None)

    # amend commit message with `run` info:
    # - pwd if inside the dataset
    # - the command itself
    # - exit code of the command
    run_info = {
        'cmd': cmd,
        'exit': cmd_exitcode,
        'chain': rerun_info["chain"] if rerun_info else [],
        'inputs': inputs.paths,
        'extra_inputs': extra_inputs.paths,
        'outputs': outputs.paths,
    }
    if rel_pwd is not None:
        # only when inside the dataset to not leak information
        run_info['pwd'] = rel_pwd
    if ds.id:
        run_info["dsid"] = ds.id
    if extra_info:
        run_info.update(extra_info)

    record = json.dumps(run_info, indent=1, sort_keys=True, ensure_ascii=False)

    if sidecar is None:
        use_sidecar = ds.config.get('datalad.run.record-sidecar',
                                    default=False)
        use_sidecar = anything2bool(use_sidecar)
    else:
        use_sidecar = sidecar

    if use_sidecar:
        # record ID is hash of record itself
        from hashlib import md5
        record_id = md5(record.encode('utf-8')).hexdigest()
        record_dir = ds.config.get('datalad.run.record-directory',
                                   default=op.join('.datalad', 'runinfo'))
        record_path = op.join(ds_path, record_dir, record_id)
        if not op.lexists(record_path):
            # go for compression; even for minimal records there is not much
            # difference, despite the offset cost
            # wrap in list -- there is just one record
            dump2stream([run_info], record_path, compressed=True)

    # compose commit message
    msg = u"""\
[DATALAD RUNCMD] {}

=== Do not change lines below ===
{}
^^^ Do not change lines above ^^^
"""
    msg = msg.format(
        message if message is not None else _format_cmd_shorty(cmd_expanded),
        '"{}"'.format(record_id) if use_sidecar else record)

    outputs_to_save = outputs.expand() if explicit else None
    if outputs_to_save is not None and use_sidecar:
        outputs_to_save.append(record_path)
    do_save = outputs_to_save is None or outputs_to_save
    if not rerun_info and cmd_exitcode:
        if do_save:
            repo = ds.repo
            msg_path = relpath(opj(str(repo.dot_git), "COMMIT_EDITMSG"))
            with open(msg_path, "wb") as ofh:
                ofh.write(ensure_bytes(msg))
            lgr.info(
                "The command had a non-zero exit code. "
                "If this is expected, you can save the changes with "
                "'datalad save -d . -r -F %s'", msg_path)
        raise exc
    elif do_save:
        with chpwd(pwd):
            for r in Save.__call__(dataset=ds_path,
                                   path=outputs_to_save,
                                   recursive=True,
                                   message=msg,
                                   return_type='generator'):
                yield r
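
A minimal usage sketch of `run_command`. The import path and the dataset location are assumptions not confirmed by the snippet above, and the command is a trivial placeholder:

from datalad.core.local.run import run_command  # assumed import path

# run_command is a generator yielding result records (dicts).
for res in run_command("echo hello > hello.txt",
                       dataset="/path/to/dataset",   # hypothetical dataset
                       outputs=["hello.txt"],
                       message="record a trivial command"):
    print(res.get("action"), res.get("status"), res.get("path"))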