Example no. 1
def compress_files(files, archive, path=None, overwrite=True):
    """Compress `files` into an `archive` file

    Parameters
    ----------
    files : list of str
    archive : str
    path : str
      Alternative directory under which the compressor will be invoked,
      e.g. to take relative paths of files and/or the archive into account
    overwrite : bool
      Whether to allow overwriting the target archive file if one already exists
    """
    runner = Runner(cwd=path)
    apath = Path(archive)
    if apath.exists():
        if overwrite:
            apath.unlink()
        else:
            raise ValueError(
                'Target archive {} already exists and overwrite is forbidden'.
                format(apath))
    if len(apath.suffixes) > 1 and apath.suffixes[-2] == '.tar':
        cmd = '7z u .tar -so -- {} | 7z u -si -- {}'.format(
            ' '.join(quote_cmdlinearg(f) for f in files),
            quote_cmdlinearg(str(apath)),
        )
    else:
        cmd = ['7z', 'u', str(apath), '--'] + files
    runner.run(cmd)
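Every example on this page uses `quote_cmdlinearg` from datalad.utils, which
quotes a single argument for safe interpolation into a shell command line.
Below is a minimal sketch of such a helper, assuming shlex.quote semantics on
POSIX and a simple double-quote scheme on Windows; the actual datalad
implementation may differ in detail.

import shlex
import sys

def quote_cmdlinearg(arg):
    """Quote one argument for safe use in a shell command line."""
    if sys.platform == 'win32':
        # cmd.exe-style quoting: wrap in double quotes, double embedded ones
        return '"{}"'.format(str(arg).replace('"', '""'))
    return shlex.quote(str(arg))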
Example no. 2
def test_configs(path=None):

    # set up dataset with registered procedure (c&p from test_basics):
    ds = Dataset(path).create(force=True)
    ds.run_procedure('cfg_yoda')
    # configure dataset to look for procedures in its code folder
    ds.config.add('datalad.locations.dataset-procedures',
                  'code',
                  scope='branch')

    # 1. run procedure based on execution guessing by run_procedure:
    ds.run_procedure(spec=['datalad_test_proc', 'some_arg'])
    # look for traces
    ok_file_has_content(op.join(ds.path, 'fromproc.txt'), 'some_arg\n')

    # 2. now configure a specific call format, including usage of the
    #    substitution config for run:
    ds.config.add('datalad.procedures.datalad_test_proc.call-format',
                  u'%s {script} {ds} {{mysub}} {args}' %
                  quote_cmdlinearg(sys.executable),
                  scope='branch')
    ds.config.add('datalad.run.substitutions.mysub',
                  'dataset-call-config',
                  scope='branch')
    # TODO: Should we allow for --inputs/--outputs arguments for run_procedure
    #       (to be passed into run)?
    ds.unlock("fromproc.txt")
    # run again:
    ds.run_procedure(spec=['datalad_test_proc', 'some_arg'])
    # look for traces
    ok_file_has_content(op.join(ds.path, 'fromproc.txt'),
                        'dataset-call-config\n')

    # 3. have a conflicting config at local (repository) level, which should
    #    override the config on dataset (branch) level:
    ds.config.add('datalad.procedures.datalad_test_proc.call-format',
                  u'%s {script} {ds} local {args}' %
                  quote_cmdlinearg(sys.executable),
                  scope='local')
    ds.unlock("fromproc.txt")
    # run again:
    ds.run_procedure(spec=['datalad_test_proc', 'some_arg'])
    # look for traces
    ok_file_has_content(op.join(ds.path, 'fromproc.txt'), 'local\n')

    # 4. get configured help message:
    r = ds.run_procedure('datalad_test_proc',
                         help_proc=True,
                         on_failure='ignore')
    assert_true(len(r) == 1)
    assert_in_results(r, status="impossible")

    ds.config.add('datalad.procedures.datalad_test_proc.help',
                  "This is a help message",
                  scope='branch')

    r = ds.run_procedure('datalad_test_proc', help_proc=True)
    assert_true(len(r) == 1)
    assert_in_results(r, message="This is a help message", status='ok')
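The call-format values configured above are plain str.format() templates that
run_procedure expands (see Example no. 14). A sketch of that expansion, with
hypothetical paths:

# hypothetical expansion of the call-format configured above
call_format = '/usr/bin/python3 {script} {ds} {{mysub}} {args}'
cmd = call_format.format(script='/ds/code/datalad_test_proc.py',
                         ds='/ds',
                         args='some_arg')
# -> '/usr/bin/python3 /ds/code/datalad_test_proc.py /ds {mysub} some_arg'
# the surviving {mysub} placeholder is later filled by `run` from the
# datalad.run.substitutions.mysub config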
Example no. 3
    def to_str(self, include_output=True):
        from datalad.utils import (
            ensure_unicode,
            ensure_list,
            quote_cmdlinearg,
        )
        to_str = "{}: ".format(self.__class__.__name__)
        if self.cmd:
            to_str += "'{}'".format(
                # go for a compact, normal looking, properly quoted
                # command rendering
                ' '.join(quote_cmdlinearg(c) for c in ensure_list(self.cmd)))
        if self.code:
            to_str += " failed with exitcode {}".format(self.code)
        if self.cwd:
            # only if not under standard PWD
            to_str += " under {}".format(self.cwd)
        if self.msg:
            # typically a command error has no specific idea
            to_str += " [{}]".format(ensure_unicode(self.msg))
        if not include_output:
            return to_str

        if self.stdout:
            to_str += " [out: '{}']".format(
                ensure_unicode(self.stdout).strip())
        if self.stderr:
            to_str += " [err: '{}']".format(
                ensure_unicode(self.stderr).strip())
        if self.kwargs:
            to_str += " [info keys: {}]".format(', '.join(self.kwargs.keys()))
        return to_str
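A usage sketch, assuming a CommandError-style exception whose constructor
accepts the cmd, code, and msg attributes read by to_str() above (the exact
constructor signature is an assumption):

err = CommandError(cmd=['git', 'status'], code=128, msg='not a repo')
print(err.to_str(include_output=False))
# -> CommandError: 'git status' failed with exitcode 128 [not a repo]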
Example no. 4
def normalize_command(command):
    """Convert `command` to the string representation.
    """
    if isinstance(command, list):
        command = list(map(assure_unicode, command))
        if len(command) == 1 and command[0] != "--":
            # This is either a quoted compound shell command or a simple
            # one-item command. Pass it as is.
            #
            # FIXME: This covers the predominant command-line case, but, for
            # Python API callers, it means values like ["./script with spaces"]
            # require additional string-like escaping, which is inconsistent
            # with the handling of multi-item lists (and subprocess's
            # handling). Once we have a way to detect "running from Python API"
            # (discussed in gh-2986), update this.
            command = command[0]
        else:
            if command and command[0] == "--":
                # Strip disambiguation marker. Note: "running from Python API"
                # FIXME from below applies to this too.
                command = command[1:]
            command = " ".join(quote_cmdlinearg(c) for c in command)
    else:
        command = assure_unicode(command)
    return command
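A usage sketch of the three branches above, assuming POSIX shlex.quote-style
behavior from quote_cmdlinearg:

normalize_command("echo hi && ls")        # -> "echo hi && ls" (plain string, as-is)
normalize_command(["echo hi && ls"])      # -> "echo hi && ls" (one item, as-is)
normalize_command(["echo", "a b"])        # -> "echo 'a b'" (items quoted, joined)
normalize_command(["--", "echo", "a b"])  # -> "echo 'a b'" (marker stripped)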
Example no. 5
def _guess_exec(script_file):

    state = None
    try:
        is_exec = os.stat(script_file).st_mode & stat.S_IEXEC
    except OSError as e:
        from errno import ENOENT
        if e.errno == ENOENT and op.islink(script_file):
            # broken symlink
            # does not exist; there's nothing to detect at all
            return {'type': None, 'template': None, 'state': 'absent'}
        else:
            raise e

    # TODO check for exec permission and rely on interpreter
    if is_exec and not os.path.isdir(script_file):
        return {'type': u'executable',
                'template': u'{script} {ds} {args}',
                'state': 'executable'}
    elif script_file.endswith('.sh'):
        return {'type': u'bash_script',
                'template': u'bash {script} {ds} {args}',
                'state': 'executable'}
    elif script_file.endswith('.py'):
        ex = quote_cmdlinearg(sys.executable)
        return {'type': u'python_script',
                'template': u'%s {script} {ds} {args}' % ex,
                'state': 'executable'}
    else:
        return {'type': None, 'template': None, 'state': None}
Example no. 6
    def put(self, source, destination, recursive=False, preserve_attrs=False):
        import shutil
        copy_fn = shutil.copy2 if preserve_attrs else shutil.copy
        if recursive:
            args = source, destination
            kwargs = {"copy_function": copy_fn}
            try:
                shutil.copytree(*args, **kwargs)
            except FileExistsError:
                # SSHConnection.put() is okay with copying a tree if the
                # destination directory already exists. With Python 3.8, we can
                # make copytree() do the same with dirs_exist_ok=True. But for
                # now, just rely on `cp`.
                cmd = ["cp", "--recursive"]
                if preserve_attrs:
                    cmd.append("--preserve")
                self(cmd + [quote_cmdlinearg(a) for a in args])
        else:
            copy_fn(source, destination)
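As the inline comment notes, the `cp` fallback exists only because
shutil.copytree() refuses to copy into an existing directory before
Python 3.8; on 3.8 and later the whole try/except collapses into one call:

# Python >= 3.8: copy into an existing destination directly
shutil.copytree(source, destination, copy_function=copy_fn, dirs_exist_ok=True)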
Example no. 7
def decompress_file(archive, dir_):
    """Decompress `archive` into a directory `dir_`

    This is an alternative implementation that does not use patool and
    instead calls 7z directly.

    Parameters
    ----------
    archive: str
    dir_: str
    """
    apath = Path(archive)
    runner = Runner(cwd=dir_)
    suffixes = _normalize_fname_suffixes(apath.suffixes)
    if len(suffixes) > 1 and suffixes[-2] == '.tar':
        # we have a compressed tar file that needs to be fed through the
        # decompressor first
        cmd = '7z x {} -so | 7z x -si -ttar'.format(quote_cmdlinearg(archive))
    else:
        # fire and forget
        cmd = ['7z', 'x', archive]
    runner.run(cmd, protocol=KillOutput)
Example no. 8
def decompress_file(archive, dir_):
    """Decompress `archive` into a directory `dir_`

    This is an alternative implementation that does not use patool and
    instead calls 7z directly.

    Parameters
    ----------
    archive: str
    dir_: str
    """
    apath = Path(archive)
    runner = Runner(cwd=dir_)
    if len(apath.suffixes) > 1 and apath.suffixes[-2] == '.tar':
        # we have a compressed tar file that needs to be fed through the
        # decompressor first
        # hangs somehow, do via single string arg
        #cmd = ['7z', 'x', archive, '-so', '|', '7z', 'x', '-si', '-ttar']
        cmd = '7z x {} -so | 7z x -si -ttar'.format(quote_cmdlinearg(archive))
    else:
        # fire and forget
        cmd = ['7z', 'x', archive]
    runner.run(cmd)
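The commented-out list form cannot work without a shell: subprocess would pass
the '|' as a literal argument to 7z rather than create a pipe. Two ways to get
a real pipe, with a hypothetical archive name:

import subprocess

# 1. let a shell interpret the pipe (what the single-string command does)
subprocess.run("7z x 'arch.tar.gz' -so | 7z x -si -ttar",
               shell=True, check=True)

# 2. or wire the pipe up explicitly, without a shell
p1 = subprocess.Popen(['7z', 'x', 'arch.tar.gz', '-so'],
                      stdout=subprocess.PIPE)
p2 = subprocess.Popen(['7z', 'x', '-si', '-ttar'], stdin=p1.stdout)
p1.stdout.close()  # let p1 receive SIGPIPE if p2 exits early
p2.communicate()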
Example no. 9
def test_runner(tempfile):

    # test non-dry command call
    runner = Runner()
    content = 'Testing äöü東 real run'
    cmd = 'echo %s > %s' % (content, quote_cmdlinearg(tempfile))
    ret = runner.run(cmd)
    assert_equal(ret, ('', ''))  # no out or err
    ok_file_has_content(tempfile, content, strip=True)
    os.unlink(tempfile)

    # Run with shell
    ret = runner.run(cmd, shell=True)
    assert_equal(ret, ('', ''))  # no out or err
    ok_file_has_content(tempfile, content, strip=True)
    os.unlink(tempfile)

    # Pass as a list and with shell - "not exactly what we expect"
    # Initial suspicion came from incorrect behavior of Runner as a runner
    # for patool.  Apparently (docs for 2.7):
    #   If args is a sequence, the first item specifies the command string,
    #   and any additional items will be treated as additional arguments to
    #   the shell itself.
    # which is what ruins it for us!  So, for now we are not testing/using
    # this form
    # ret = runner.run(split_cmdline(cmd), shell=True)
    # # ?! for some reason there is an empty line in stdout
    # # TODO: figure out.  It shouldn't though be of critical effect
    # ret = (ret[0].rstrip(), ret[1])
    # assert_equal(ret, ('', ''))  # no out or err
    # # And here we get kaboom ATM!
    # ok_file_has_content(tempfile, content, strip=True)

    # test non-dry python function call
    output = runner.call(os.path.join, 'foo', 'bar')
    assert_equal(os.path.join('foo', 'bar'), output,
                 "Call of: os.path.join, 'foo', 'bar' returned %s" % output)
Example no. 10
def _guess_exec(script_file):
    try:
        is_exec = os.stat(script_file).st_mode & stat.S_IEXEC
    except OSError as e:
        from errno import ENOENT
        if e.errno == ENOENT and op.islink(script_file):
            # broken symlink
            # does not exist; there's nothing to detect at all
            return {'type': None, 'template': None, 'state': 'absent'}
        else:
            raise e

    # on some FS the executable bit might not be all that reliable
    # but a procedure might nevertheless be supported.
    # go by extension with "known" interpreters first, and only then
    # try to execute something that looks executable
    if script_file.endswith('.sh'):
        return {
            'type': u'bash_script',
            'template': u'bash {script} {ds} {args}',
            'state': 'executable'
        }
    elif script_file.endswith('.py'):
        ex = quote_cmdlinearg(sys.executable)
        return {
            'type': u'python_script',
            'template': u'%s {script} {ds} {args}' % ex,
            'state': 'executable'
        }
    elif is_exec and not os.path.isdir(script_file):
        return {
            'type': u'executable',
            'template': u'{script} {ds} {args}',
            'state': 'executable'
        }
    else:
        return {'type': None, 'template': None, 'state': None}
Example no. 11
    def __call__(keyfile=None,
                 merge=False,
                 force_update=False,
                 bids=False,
                 non_bids_dir='non-bids',
                 dataset=None):
        ds = require_dataset(dataset, check_installed=True, purpose='update')

        repo = ds.repo
        if not keyfile:
            # will error out, if no config was given
            keyfile = repo.config.obtain('datalad.ukbiobank.keyfile')

        # prep for yield
        res = dict(
            action='ukb_update',
            path=ds.path,
            type='dataset',
            logger=lgr,
            refds=ds.path,
        )

        if repo.dirty:
            yield dict(
                res,
                status='error',
                message="Refuse to operate on dirty dataset",
            )
            return

        # check if we have 'ukbfetch' before we start fiddling with the dataset
        # and leave it in a mess for no reason
        try:
            subprocess.run(
                # pull version info
                ['ukbfetch', '-i'],
                capture_output=True,
            )
        except Exception as e:
            raise RuntimeError(
                "Cannot execute 'ukbfetch'. Original error: {}".format(e))

        # just to be nice, and to be able to check it out again,
        # when we are done
        initial_branch = repo.get_active_branch()
        initial_incoming = repo.get_hexsha('incoming')

        # make sure we are in incoming
        repo.call_git(['checkout', 'incoming'])

        # first wipe out all prev. downloaded zip files so we can detect
        # when some files are no longer available
        for fp in repo.pathobj.glob('[0-9]*_[0-9]*_[0-9]_[0-9].*'):
            fp.unlink()

        # a place to put the download logs
        # better be semi-persistent to ease inspection
        tmpdir = repo.pathobj / repo.get_git_dir(repo) / 'tmp' / 'ukb'
        tmpdir.mkdir(parents=True, exist_ok=True)

        # redownload, run with explicit mode, because we just deleted the
        # ZIP files and that is OK
        ds.run(
            cmd='ukbfetch -v -a{} -b.ukbbatch -o{}'.format(
                quote_cmdlinearg(keyfile),
                quote_cmdlinearg(str(tmpdir)),
            ),
            explicit=True,
            outputs=['.'],
            message="Update from UKbiobank",
        )

        # TODO what if something broke before? needs force switch
        if not force_update and repo.get_hexsha() == initial_incoming:
            yield dict(
                res,
                status='notneeded',
                message='No new content available',
            )
            repo.call_git(['checkout', initial_branch])
            # TODO drop?
            return

        # onto extraction and transformation of downloaded content
        repo.call_git(['checkout', 'incoming-processed'])

        # mark the incoming change as merged
        # (but we do not actually want any branch content)
        repo.call_git(['merge', 'incoming', '--strategy=ours'])

        for fp in repo.get_content_info(ref='incoming-processed',
                                        eval_file_type=False):
            fp.unlink()

        subid = None
        if bids:
            from datalad_ukbiobank.ukb2bids import restructure_ukb2bids
            # get participant ID from batch file
            subid = list(
                repo.call_git_items_(["cat-file", "-p", "incoming:.ukbbatch"
                                      ]))[0].split(maxsplit=1)[0]

        # discover all zip files present in the last commit in 'incoming'
        for fp, props in repo.get_content_annexinfo(
                ref='incoming', eval_availability=False).items():
            if fp.name.startswith('.'):
                # skip internals
                continue
            # we have to extract into per-instance directories, otherwise files
            # would conflict
            ids = fp.stem.split('_')
            if not len(ids) >= 3:
                raise RuntimeError(
                    'Unrecognized filename structure: {}'.format(fp))
            extract_dir = repo.pathobj / 'instance-{}'.format(ids[2])
            extract_dir.mkdir(exist_ok=True)

            if fp.suffix == '.zip':
                with chpwd(extract_dir):
                    # extract and add their content
                    AddArchiveContent.__call__(
                        props['key'],
                        key=True,
                        annex=repo,
                        # --use-current-dir due to
                        # https://github.com/datalad/datalad/issues/3995
                        use_current_dir=True,
                        allow_dirty=True,
                        commit=False,
                    )
            else:
                # move into the per-instance dir; strip the participant ID
                # and instance ID, but keep the array index,
                # e.g. 25747_3_0.adv -> instance-3/25747_0.adv
                repo.call_git([
                    'annex', 'fromkey', props['key'],
                    str(extract_dir /
                        ('_'.join(ids[1::2]) + ''.join(fp.suffixes)))
                ])

            if bids:
                yield from restructure_ukb2bids(
                    ds,
                    subid=subid,
                    unrecognized_dir=Path('ses-{}'.format(ids[2])) /
                    non_bids_dir,
                    base_path=extract_dir,
                    session=ids[2],
                )

        # save whatever the state is now, `save` will discover deletions
        # automatically and also commit them -- wonderful!
        ds.save(message="Track ZIP file content")
        yield dict(
            res,
            status='ok',
        )

        if not merge:
            return

        # and update active branch
        repo.call_git(['checkout', initial_branch])

        if initial_branch in ('incoming', 'incoming-processed'):
            yield dict(
                res,
                action='ukb_merge_update',
                status='impossible',
                message='Refuse to merge into incoming* branch',
            )
            return

        repo.call_git([
            'merge', '-m', "Merge update from UKbiobank", 'incoming-processed'
        ])

        yield dict(
            res,
            action='ukb_merge_update',
            status='ok',
        )
        return
Example no. 12
    def __str__(self):
        s = self._str
        if s is None:
            s = self._str = \
                '%s(%s)' % (self.__class__.__name__,
                            ut.quote_cmdlinearg(self.path))
        return s
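The pattern caches the computed representation in `_str`, so repeated str()
calls pay the quoting cost only once. A self-contained toy version, with a
hypothetical class and shlex.quote standing in for ut.quote_cmdlinearg:

import shlex

class Memoized:
    def __init__(self, path):
        self.path = path
        self._str = None  # cache slot for the computed representation

    def __str__(self):
        s = self._str
        if s is None:
            s = self._str = '%s(%s)' % (self.__class__.__name__,
                                        shlex.quote(self.path))
        return s

print(Memoized('/data/my dataset'))  # -> Memoized('/data/my dataset')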
Example no. 13
    def __call__(subjects='list', dataset=None, ifexists=None, force=False):
        from pyxnat import Interface as XNATInterface

        ds = require_dataset(dataset, check_installed=True, purpose='update')

        subjects = ensure_list(subjects)

        # require a clean dataset
        if ds.repo.dirty:
            yield get_status_dict(
                'update',
                ds=ds,
                status='impossible',
                message=(
                    'Clean dataset required; use `datalad status` to inspect '
                    'unsaved changes'))
            return

        # prep for yield
        res = dict(
            action='xnat_update',
            path=ds.path,
            type='dataset',
            logger=lgr,
            refds=ds.path,
        )
        # obtain configured XNAT url and project name
        xnat_cfg_name = ds.config.get('datalad.xnat.default-name', 'default')
        cfg_section = 'datalad.xnat.{}'.format(xnat_cfg_name)
        xnat_url = ds.config.get('{}.url'.format(cfg_section))
        xnat_project = ds.config.get('{}.project'.format(cfg_section))
        file_path = ds.config.get('{}.path'.format(cfg_section))

        # obtain user credentials
        parsed_url = urlparse(xnat_url)
        no_proto_url = '{}{}'.format(parsed_url.netloc,
                                     parsed_url.path).replace(' ', '')
        cred = UserPassword(name=no_proto_url, url=None)()
        xn = XNATInterface(server=xnat_url, **cred)

        # provide subject list
        if 'list' in subjects:
            from datalad.ui import ui
            subs = xn.select.project(xnat_project).subjects().get()
            ui.message('The following subjects are available for XNAT '
                       'project {}:'.format(xnat_project))
            for s in sorted(subs):
                ui.message(" {}".format(quote_cmdlinearg(s)))
            ui.message(
                'Specify one or more subjects, or "all", to download the '
                'associated files.')
            return

        # query the specified subject(s) to make sure it exists and is accessible
        if 'all' not in subjects:
            from datalad.ui import ui
            subs = []
            for s in subjects:
                sub = xn.select.project(xnat_project).subject(s)
                nexp = len(sub.experiments().get())
                if nexp > 0:
                    subs.append(s)
                else:
                    ui.message(
                        'Failed to obtain information on subject {} from XNAT '
                        'project {}:'.format(s, xnat_project))
                    return
        else:
            # if all, get list of all subjects
            subs = xn.select.project(xnat_project).subjects().get()

        # parse and download one subject at a time
        from datalad_xnat.parser import parse_xnat
        addurl_dir = ds.pathobj / 'code' / 'addurl_files'
        for sub in subs:
            yield from parse_xnat(
                ds,
                sub=sub,
                force=force,
                xn=xn,
                xnat_url=xnat_url,
                xnat_project=xnat_project,
            )

            # add file urls for subject
            lgr.info('Downloading files for subject %s', sub)
            table = f"{addurl_dir}/{sub}_table.csv"
            # this corresponds to the header field 'filename' in the csv table
            filename = '{filename}'
            filenameformat = f"{file_path}{filename}"
            ds.addurls(
                table,
                '{url}',
                filenameformat,
                ifexists=ifexists,
                save=False,
                cfg_proc='xnat_dataset',
                result_renderer='default',
            )

            ds.save(message=f"Update files for subject {sub}", recursive=True)

        lgr.info(
            'Files were updated for the following subjects in XNAT project %s:',
            xnat_project)
        for s in sorted(subs):
            lgr.info(" {}".format(quote_cmdlinearg(s)))

        yield dict(res, status='ok')
        return
Example no. 14
    def __call__(
            spec=None,
            dataset=None,
            discover=False,
            help_proc=False):
        if not spec and not discover:
            raise InsufficientArgumentsError('requires at least a procedure name')
        if help_proc and not spec:
            raise InsufficientArgumentsError('requires a procedure name')

        try:
            ds = require_dataset(
                dataset, check_installed=False,
                purpose='run a procedure')
        except NoDatasetFound:
            ds = None

        if discover:
            # specific path of procedures that were already reported
            reported = set()
            # specific names of procedure for which an active one has been
            # found
            active = set()
            for m, cmd_name, cmd_tmpl, cmd_help in \
                    _get_procedure_implementation('*', ds=ds):
                if m in reported:
                    continue
                ex = _guess_exec(m)
                # configured template (call-format string) takes precedence:
                if cmd_tmpl:
                    ex['template'] = cmd_tmpl
                if ex['state'] is None:
                    # doesn't seem like a match
                    lgr.debug("%s does not look like a procedure, ignored.", m)
                    continue
                state = 'overridden' if cmd_name in active else ex['state']
                message = ex['type'] if ex['type'] else 'unknown type'
                message += ' ({})'.format(state) if state != 'executable' else ''
                res = get_status_dict(
                    action='discover_procedure',
                    path=m,
                    type='file',
                    logger=lgr,
                    refds=ds.path if ds else None,
                    status='ok',
                    state=state,
                    procedure_name=cmd_name,
                    procedure_type=ex['type'],
                    procedure_callfmt=ex['template'],
                    procedure_help=cmd_help,
                    message=message)
                reported.add(m)
                if state == 'executable':
                    active.add(cmd_name)
                yield res
            return

        if not isinstance(spec, (tuple, list)):
            # maybe coming from config
            spec = split_cmdline(spec)
        name = spec[0]
        args = spec[1:]

        try:
            # get the first match and run with it
            procedure_file, cmd_name, cmd_tmpl, cmd_help = \
                next(_get_procedure_implementation(name, ds=ds))
        except StopIteration:
            res = get_status_dict(
                    action='run_procedure',
                    # TODO: Default renderer requires a key "path" to exist.
                    # Doesn't make a lot of sense in this case
                    path=name,
                    logger=lgr,
                    refds=ds.path if ds else None,
                    status='impossible',
                    message="Cannot find procedure with name '%s'" % name)
            yield res
            return

        ex = _guess_exec(procedure_file)
        # configured template (call-format string) takes precedence:
        if cmd_tmpl:
            ex['template'] = cmd_tmpl

        if help_proc:
            if cmd_help:
                res = get_status_dict(
                        action='procedure_help',
                        path=procedure_file,
                        type='file',
                        logger=lgr,
                        refds=ds.path if ds else None,
                        status='ok',
                        state=ex['state'],
                        procedure_name=cmd_name,
                        procedure_type=ex['type'],
                        procedure_callfmt=ex['template'],
                        message=cmd_help)
            else:
                res = get_status_dict(
                        action='procedure_help',
                        path=procedure_file,
                        type='file',
                        logger=lgr,
                        refds=ds.path if ds else None,
                        status='impossible',
                        state=ex['state'],
                        procedure_name=cmd_name,
                        procedure_type=ex['type'],
                        procedure_callfmt=ex['template'],
                        message="No help available for '%s'" % name)

            yield res
            return

        if not ex['template']:
            raise ValueError("No idea how to execute procedure %s. "
                             "Missing 'execute' permissions?" % procedure_file)

        cmd = ex['template'].format(
            script=quote_cmdlinearg(procedure_file),
            ds=quote_cmdlinearg(ds.path) if ds else '',
            args=(u' '.join(quote_cmdlinearg(a) for a in args) if args else ''))
        lgr.info(u"Running procedure %s", name)
        lgr.debug(u'Full procedure command: %r', cmd)
        for r in Run.__call__(
                cmd=cmd,
                dataset=ds,
                explicit=True,
                inputs=None,
                outputs=None,
                # pass through here
                on_failure='ignore',
                return_type='generator'
        ):
            yield r
Example no. 15
    def __call__(url,
                 path="{subject}/{session}/{scan}/",
                 project=None,
                 force=False,
                 dataset=None):
        from pyxnat import Interface as XNATInterface

        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='initialization')

        config = ds.config
        path = with_pathsep(path)

        # prep for yield
        res = dict(
            action='xnat_init',
            path=ds.path,
            type='dataset',
            logger=lgr,
            refds=ds.path,
        )

        # obtain user credentials, use simplified/stripped URL as identifier
        # given we don't have more knowledge than the user, do not
        # give a `url` to provide hints on how to obtain credentials
        parsed_url = urlparse(url)
        no_proto_url = '{}{}'.format(parsed_url.netloc,
                                     parsed_url.path).replace(' ', '')
        cred = UserPassword(name=no_proto_url, url=None)()

        xn = XNATInterface(server=url, **cred)

        # now we make a simple request to obtain the server version
        # we don't care much, but if the URL or the credentials are wrong
        # we will not get to see one
        try:
            xnat_version = xn.version()
            lgr.debug("XNAT server version is %s", xnat_version)
        except Exception as e:
            yield dict(
                res,
                status='error',
                message=('Failed to access the XNAT server. Full error:\n%s',
                         e),
            )
            return

        if project is None:
            from datalad.ui import ui
            projects = xn.select.projects().get()
            ui.message('No project name specified. The following projects are '
                       'available on {} for user {}:'.format(
                           url, cred['user']))
            for p in sorted(projects):
                # list and prep for C&P
                # TODO multi-column formatting?
                ui.message("  {}".format(quote_cmdlinearg(p)))
            return

        # query the specified project to make sure it exists and is accessible
        proj = xn.select.project(project)

        try:
            nsubj = len(proj.subjects().get())
        except Exception as e:
            yield dict(
                res,
                status='error',
                message=(
                    'Failed to obtain information on project %s from XNAT. '
                    'Full error:\n%s', project, e),
            )
            return

        lgr.info('XNAT reports %i subjects currently on-record for project %s',
                 nsubj, project)

        # check if dataset already initialized
        auth_dir = ds.pathobj / '.datalad' / 'providers'
        if auth_dir.exists() and not force:
            yield dict(
                res,
                status='error',
                message='Dataset already initialized, '
                'use `force` to reinitialize',
            )
            return

        # put essential configuration into the dataset
        config.set('datalad.xnat.default.url',
                   url,
                   where='dataset',
                   reload=False)
        config.set('datalad.xnat.default.project', project, where='dataset')
        config.set('datalad.xnat.default.path', path, where='dataset')

        ds.save(
            path=ds.pathobj / '.datalad' / 'config',
            to_git=True,
            message="Configure default XNAT url and project",
        )

        # Configure XNAT access authentication
        ds.run_procedure(spec='cfg_xnat_dataset')

        yield dict(
            res,
            status='ok',
        )
        return
Example no. 16
def _create_sibling_ria(
        ds,
        url,
        name,
        storage_sibling,
        storage_name,
        existing,
        shared,
        group,
        post_update_hook,
        trust_level,
        res_kwargs):
    # be safe across datasets
    res_kwargs = res_kwargs.copy()
    # update dataset
    res_kwargs['ds'] = ds

    if not isinstance(ds.repo, AnnexRepo):
        # No point in dealing with a special remote when there's no annex.
        # Note, that in recursive invocations this might only apply to some of
        # the datasets. Therefore dealing with it here rather than one level up.
        lgr.debug("No annex at %s. Ignoring special remote options.", ds.path)
        storage_sibling = False
        storage_name = None

    # parse target URL
    try:
        ssh_host, base_path, rewritten_url = verify_ria_url(url, ds.config)
    except ValueError as e:
        yield get_status_dict(
            status='error',
            message=str(e),
            **res_kwargs
        )
        return

    base_path = Path(base_path)

    git_url = decode_source_spec(
        # append dataset id to url and use magic from clone-helper:
        url + '#{}'.format(ds.id),
        cfg=ds.config
    )['giturl']
    # determine layout locations; go for a v1 layout
    repo_path, _, _ = get_layout_locations(1, base_path, ds.id)

    ds_siblings = [r['name'] for r in ds.siblings(result_renderer=None)]
    # Figure out whether we are supposed to skip this very dataset
    if existing == 'skip' and (
            name in ds_siblings or (
                storage_name and storage_name in ds_siblings)):
        yield get_status_dict(
            status='notneeded',
            message="Skipped on existing sibling",
            **res_kwargs
        )
        # if we skip here, nothing else can change that decision further
        # down
        return

    # figure out whether we need to skip or error due to an existing target
    # repo before we try to init a special remote.
    if ssh_host:
        from datalad import ssh_manager
        ssh = ssh_manager.get_connection(
            ssh_host,
            use_remote_annex_bundle=False)
        ssh.open()

    if existing in ['skip', 'error']:
        config_path = repo_path / 'config'
        # No .git -- if it's an existing repo in a RIA store it should be a
        # bare repo.
        # Theoretically we could have additional checks for whether we have
        # an empty repo dir or a non-bare repo or whatever else.
        if ssh_host:
            try:
                ssh('[ -e {p} ]'.format(p=quote_cmdlinearg(str(config_path))))
                exists = True
            except CommandError:
                exists = False
        else:
            exists = config_path.exists()

        if exists:
            if existing == 'skip':
                # 1. not rendered by default
                # 2. message doesn't show up in ultimate result
                #    record as shown by -f json_pp
                yield get_status_dict(
                    status='notneeded',
                    message="Skipped on existing remote "
                            "directory {}".format(repo_path),
                    **res_kwargs
                )
                return
            else:  # existing == 'error'
                yield get_status_dict(
                    status='error',
                    message="remote directory {} already "
                            "exists.".format(repo_path),
                    **res_kwargs
                )
                return

    if storage_sibling == 'only':
        lgr.info("create storage sibling '{}' ...".format(name))
    else:
        lgr.info("create sibling{} '{}'{} ...".format(
            's' if storage_name else '',
            name,
            " and '{}'".format(storage_name) if storage_name else '',
        ))
    create_ds_in_store(SSHRemoteIO(ssh_host) if ssh_host else LocalIO(),
                       base_path, ds.id, '2', '1')
    if storage_sibling:
        # we are using the main `name`, if the only thing we are creating
        # is the storage sibling
        srname = name if storage_sibling == 'only' else storage_name

        lgr.debug('init special remote {}'.format(srname))
        special_remote_options = [
            'type=external',
            'externaltype=ora',
            'encryption=none',
            'autoenable=true',
            'url={}'.format(url)]
        try:
            ds.repo.init_remote(
                srname,
                options=special_remote_options)
        except CommandError as e:
            if existing == 'reconfigure' \
                    and 'git-annex: There is already a special remote' \
                    in e.stderr:
                # run enableremote instead
                lgr.debug(
                    "special remote '%s' already exists. "
                    "Run enableremote instead.",
                    srname)
                # TODO: Use AnnexRepo.enable_remote (which needs to get
                #       `options` first)
                ds.repo.call_annex([
                    'enableremote',
                    srname] + special_remote_options)
            else:
                yield get_status_dict(
                    status='error',
                    message="initremote failed.\nstdout: %s\nstderr: %s"
                    % (e.stdout, e.stderr),
                    **res_kwargs
                )
                return

        if trust_level:
            ds.repo.call_annex([trust_level, srname])
        # get uuid for use in bare repo's config
        uuid = ds.config.get("remote.{}.annex-uuid".format(srname))

    if storage_sibling == 'only':
        # we can stop here, the rest of the function is about setting up
        # the git remote part of the sibling
        yield get_status_dict(
            status='ok',
            **res_kwargs,
        )
        return

    # 2. create a bare repository in-store:

    lgr.debug("init bare repository")
    # TODO: we should prob. check whether it's there already. How?
    # Note: like the special remote itself, we assume local FS if no
    # SSH host is specified
    disabled_hook = repo_path / 'hooks' / 'post-update.sample'
    enabled_hook = repo_path / 'hooks' / 'post-update'

    if group:
        chgrp_cmd = "chgrp -R {} {}".format(
            quote_cmdlinearg(str(group)),
            quote_cmdlinearg(str(repo_path)))

    if ssh_host:
        ssh('cd {rootdir} && git init --bare{shared}'.format(
            rootdir=quote_cmdlinearg(str(repo_path)),
            shared=" --shared='{}'".format(
                quote_cmdlinearg(shared)) if shared else ''
        ))

        if storage_sibling:
            # write the special remote's uuid into git-config, so clone can
            # tell which one it is supposed to be and enable it even with a
            # fallback URL
            ssh("cd {rootdir} && git config datalad.ora-remote.uuid {uuid}"
                "".format(rootdir=quote_cmdlinearg(str(repo_path)),
                          uuid=uuid))

        if post_update_hook:
            ssh('mv {} {}'.format(quote_cmdlinearg(str(disabled_hook)),
                                  quote_cmdlinearg(str(enabled_hook))))

        if group:
            # Either the repository existed before or a new directory was
            # created for it; set its group to the desired one (if provided)
            # with the chgrp command assembled above
            ssh(chgrp_cmd)
    else:
        gr = GitRepo(repo_path, create=True, bare=True,
                     shared=shared if shared else None)
        if storage_sibling:
            # write the special remote's uuid into git-config, so clone can
            # tell which one it is supposed to be and enable it even with a
            # fallback URL
            gr.config.add("datalad.ora-remote.uuid", uuid, where='local')

        if post_update_hook:
            disabled_hook.rename(enabled_hook)
        if group:
            # TODO; do we need a cwd here?
            # chgrp_cmd is a shell string, so it needs shell=True;
            # cwd must be a plain (unquoted) path
            subprocess.run(chgrp_cmd, cwd=ds.path, shell=True)

    # add a git remote to the bare repository
    # Note: needs annex-ignore! Otherwise we might push into dirhash
    # lower annex/object tree instead of mixed, since it's a bare
    # repo. This in turn would be an issue, if we want to pack the
    # entire thing into an archive. Special remote will then not be
    # able to access content in the "wrong" place within the archive
    lgr.debug("set up git remote")
    if name in ds_siblings:
        # otherwise we should have skipped or failed before
        assert existing == 'reconfigure'
    ds.config.set(
        "remote.{}.annex-ignore".format(name),
        value="true",
        where="local")
    ds.siblings(
        'configure',
        name=name,
        url=git_url
        if ssh_host
        else str(repo_path),
        recursive=False,
        # Note, that this should be None if storage_sibling was not set
        publish_depends=storage_name,
        result_renderer=None,
        # Note, that otherwise a subsequent publish will report
        # "notneeded".
        fetch=True
    )

    yield get_status_dict(
        status='ok',
        **res_kwargs,
    )
Example no. 17
def _create_sibling_ria(ds, url, name, ria_remote, ria_remote_name, existing,
                        shared, group, post_update_hook, res_kwargs):
    # be safe across datasets
    res_kwargs = res_kwargs.copy()

    # parse target URL
    try:
        ssh_host, base_path = verify_ria_url(url, ds.config)
    except ValueError as e:
        yield get_status_dict(status='error', message=str(e), **res_kwargs)
        return

    base_path = Path(base_path)

    git_url = decode_source_spec(
        # append dataset id to url and use magic from clone-helper:
        url + '#{}'.format(ds.id),
        cfg=ds.config)['giturl']
    # go for a v1 layout
    repo_path, _, _ = get_layout_locations(1, base_path, ds.id)

    ds_siblings = [r['name'] for r in ds.siblings(result_renderer=None)]
    # Figure out whether we are supposed to skip this very dataset
    if existing == 'skip' and (name in ds_siblings or
                               (ria_remote_name
                                and ria_remote_name in ds_siblings)):
        yield get_status_dict(status='notneeded',
                              message="Skipped on existing sibling",
                              **res_kwargs)
        # if we skip here, nothing else can change that decision further
        # down
        return

    # we might learn that some processing (remote repo creation)
    # is not desired
    skip = False

    lgr.info("create sibling{} '{}'{} ...".format(
        's' if ria_remote_name else '',
        name,
        " and '{}'".format(ria_remote_name) if ria_remote_name else '',
    ))
    if ssh_host:
        from datalad import ssh_manager
        ssh = ssh_manager.get_connection(ssh_host,
                                         use_remote_annex_bundle=False)
        ssh.open()

    # determine layout locations
    if ria_remote:
        lgr.debug('init special remote {}'.format(ria_remote_name))
        ria_remote_options = [
            'type=external', 'externaltype=ria', 'encryption=none',
            'autoenable=true', 'url={}'.format(url)
        ]
        try:
            ds.repo.init_remote(ria_remote_name, options=ria_remote_options)
        except CommandError as e:
            if existing in ['replace', 'reconfigure'] \
                    and 'git-annex: There is already a special remote' \
                    in e.stderr:
                # run enableremote instead
                lgr.debug(
                    "special remote '%s' already exists. "
                    "Run enableremote instead.", ria_remote_name)
                # TODO: Use AnnexRepo.enable_remote (which needs to get
                #       `options` first)
                cmd = ['git', 'annex', 'enableremote', ria_remote_name
                       ] + ria_remote_options
                # cwd must be a plain (unquoted) path
                subprocess.run(cmd, cwd=ds.repo.path)
            else:
                yield get_status_dict(
                    status='error',
                    message="initremote failed.\nstdout: %s\nstderr: %s" %
                    (e.stdout, e.stderr),
                    **res_kwargs)
                return

        # 1. create remote object store:
        # Note: All it actually takes is to trigger the special
        # remote's `prepare` method once.
        # ATM trying to achieve that by invoking a minimal fsck.
        # TODO: - It's probably faster to actually talk to the special
        #         remote (i.e. pretending to be annex and use
        #         the protocol to send PREPARE)
        #       - Alternatively we can create the remote directory and
        #         ria version file directly, but this means
        #         code duplication that then needs to be kept in sync
        #         with ria-remote implementation.
        #       - this leads to the third option: Have that creation
        #         routine importable and callable from
        #         ria-remote package without the need to actually
        #         instantiate a RIARemote object
        lgr.debug("initializing object store")
        ds.repo.fsck(remote=ria_remote_name,
                     fast=True,
                     annex_options=['--exclude=*/*'])
    else:
        # with no special remote we currently need to create the
        # required directories
        # TODO: This should be cleaner once we have access to the
        #       special remote's RemoteIO classes without
        #       talking via annex
        if ssh_host:
            try:
                stdout, stderr = ssh('test -e {repo}'.format(
                    repo=quote_cmdlinearg(str(repo_path))))
                exists = True
            except CommandError as e:
                exists = False
            if exists:
                if existing == 'skip':
                    # 1. not rendered by default
                    # 2. message doesn't show up in ultimate result
                    #    record as shown by -f json_pp
                    yield get_status_dict(status='notneeded',
                                          message="Skipped on existing remote "
                                          "directory {}".format(repo_path),
                                          **res_kwargs)
                    skip = True
                elif existing in ['error', 'reconfigure']:
                    yield get_status_dict(
                        status='error',
                        message="remote directory {} already "
                        "exists.".format(repo_path),
                        **res_kwargs)
                    return
                elif existing == 'replace':
                    ssh('chmod u+w -R {}'.format(
                        quote_cmdlinearg(str(repo_path))))
                    ssh('rm -rf {}'.format(quote_cmdlinearg(str(repo_path))))
            if not skip:
                ssh('mkdir -p {}'.format(quote_cmdlinearg(str(repo_path))))
        else:
            if repo_path.exists():
                if existing == 'skip':
                    skip = True
                elif existing in ['error', 'reconfigure']:
                    yield get_status_dict(
                        status='error',
                        message="remote directory {} already "
                        "exists.".format(repo_path),
                        **res_kwargs)
                    return
                elif existing == 'replace':
                    rmtree(repo_path)
            if not skip:
                repo_path.mkdir(parents=True)

    # Note that the skip decision could have changed above due to an existing
    # remote dir
    if skip:
        return

    # 2. create a bare repository in-store:

    lgr.debug("init bare repository")
    # TODO: we should prob. check whether it's there already. How?
    # Note: like the special remote itself, we assume local FS if no
    # SSH host is specified
    disabled_hook = repo_path / 'hooks' / 'post-update.sample'
    enabled_hook = repo_path / 'hooks' / 'post-update'

    if group:
        chgrp_cmd = "chgrp -R {} {}".format(quote_cmdlinearg(str(group)),
                                            quote_cmdlinearg(str(repo_path)))

    if ssh_host:
        ssh('cd {rootdir} && git init --bare{shared}'.format(
            rootdir=quote_cmdlinearg(str(repo_path)),
            shared=" --shared='{}'".format(quote_cmdlinearg(shared))
            if shared else ''))
        if post_update_hook:
            ssh('mv {} {}'.format(quote_cmdlinearg(str(disabled_hook)),
                                  quote_cmdlinearg(str(enabled_hook))))

        if group:
            # Either the repository existed before or a new directory was
            # created for it; set its group to the desired one (if provided)
            # with the chgrp command assembled above
            ssh(chgrp_cmd)
    else:
        GitRepo(repo_path,
                create=True,
                bare=True,
                shared=" --shared='{}'".format(quote_cmdlinearg(shared))
                if shared else None)
        if post_update_hook:
            disabled_hook.rename(enabled_hook)
        if group:
            # TODO; do we need a cwd here?
            # chgrp_cmd is a shell string, so it needs shell=True;
            # cwd must be a plain (unquoted) path
            subprocess.run(chgrp_cmd, cwd=ds.path, shell=True)

    # add a git remote to the bare repository
    # Note: needs annex-ignore! Otherwise we might push into default
    # annex/object tree instead of directory type tree with dirhash
    # lower. This in turn would be an issue, if we want to pack the
    # entire thing into an archive. Special remote will then not be
    # able to access content in the "wrong" place within the archive
    lgr.debug("set up git remote")
    # TODO:
    # - This sibings call results in "[WARNING] Failed to determine
    #   if datastore carries annex."
    #   (see https://github.com/datalad/datalad/issues/4028)
    #   => for now have annex-ignore configured before. Possibly allow
    #      configure/add to include that option
    #      - additionally there's
    #        https://github.com/datalad/datalad/issues/3989,
    #        where datalad-siblings might hang forever
    if name in ds_siblings:
        # otherwise we should have skipped or failed before
        assert existing in ['replace', 'reconfigure']
    ds.config.set("remote.{}.annex-ignore".format(name),
                  value="true",
                  where="local")
    ds.siblings(
        'configure',
        name=name,
        url=git_url if ssh_host else str(repo_path),
        recursive=False,
        # Note, that this should be None if ria_remote was not set
        publish_depends=ria_remote_name,
        result_renderer=None,
        # Note, that otherwise a subsequent publish will report
        # "notneeded".
        fetch=True)

    yield get_status_dict(
        status='ok',
        **res_kwargs,
    )
Example no. 18
    def __call__(keyfile=None,
                 merge=False,
                 force=False,
                 drop=None,
                 dataset=None):
        ds = require_dataset(dataset, check_installed=True, purpose='update')

        if drop and drop not in ('extracted', 'archives'):
            raise ValueError(
                "Unrecognized value for 'drop' option: {}".format(drop))

        repo = ds.repo
        if not keyfile:
            # will error out, if no config was given
            keyfile = repo.config.obtain(
                'datalad.ukbiobank.keyfile',
                dialog_type='question',
                title='Key file location',
                text=
                'Where is the location of the file with the UKB access key?',
            )

        # prep for yield
        res = dict(
            action='ukb_update',
            path=ds.path,
            type='dataset',
            logger=lgr,
            refds=ds.path,
        )

        if repo.dirty:
            yield dict(
                res,
                status='error',
                message="Refuse to operate on dirty dataset",
            )
            return

        # check if we have 'ukbfetch' before we start fiddling with the dataset
        # and leave it in a mess for no reason
        try:
            subprocess.run(
                # pull version info
                ['ukbfetch', '-i'],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
        except Exception as e:
            raise RuntimeError(
                "Cannot execute 'ukbfetch'. Original error: {}".format(e))

        # just to be nice, and to be able to check it out again,
        # when we are done
        initial_branch = repo.get_active_branch()
        initial_incoming = repo.get_hexsha('incoming')

        # make sure we are in incoming
        repo.call_git(['checkout', 'incoming'])

        # first wipe out all prev. downloaded zip files so we can detect
        # when some files are no longer available
        for fp in repo.pathobj.glob('[0-9]*_[0-9]*_[0-9]_[0-9].*'):
            fp.unlink()

        # a place to put the download logs
        # better be semi-persistent to ease inspection
        tmpdir = repo.pathobj / repo.get_git_dir(repo) / 'tmp' / 'ukb'
        tmpdir.mkdir(parents=True, exist_ok=True)

        # redownload, run with explicit mode, because we just deleted the
        # ZIP files and that is OK
        ds.run(
            cmd='ukbfetch -v -a{} -b.ukbbatch -o{}'.format(
                quote_cmdlinearg(keyfile),
                # use relative path to tmpdir to avoid leakage
                # of system-specific information into the run record
                quote_cmdlinearg(str(tmpdir.relative_to(repo.pathobj))),
            ),
            explicit=True,
            outputs=['.'],
            message="Update from UKBiobank",
        )

        # TODO what if something broke before? needs force switch
        if not force and repo.get_hexsha() == initial_incoming:
            yield dict(
                res,
                status='notneeded',
                message='No new content available',
            )
            repo.call_git(['checkout', initial_branch])
            # TODO drop?
            return

        # onto extraction and transformation of downloaded content
        repo.call_git(['checkout', 'incoming-native'])

        # mark the incoming change as merged
        # (but we do not actually want any branch content)
        repo.call_git(['merge', 'incoming', '--strategy=ours'])

        for fp in repo.get_content_info(ref='incoming-native',
                                        eval_file_type=False):
            fp.unlink()

        # discover all files present in the last commit in 'incoming'
        for fp, props in repo.get_content_annexinfo(
                ref='incoming', eval_availability=False).items():
            if fp.name.startswith('.'):
                # skip internals
                continue
            # we have to extract into per-instance directories, otherwise files
            # would conflict
            ids = fp.stem.split('_')
            if not len(ids) >= 3:
                raise RuntimeError(
                    'Unrecognized filename structure: {}'.format(fp))
            # build an ID from the data record and the array index
            rec_id = '_'.join(ids[1:])

            if fp.suffix == '.zip':
                extract_dir = repo.pathobj / rec_id
                extract_dir.mkdir(exist_ok=True)
                with chpwd(extract_dir):
                    # extract and add their content
                    AddArchiveContent.__call__(
                        props['key'],
                        key=True,
                        annex=repo,
                        # --use-current-dir due to
                        # https://github.com/datalad/datalad/issues/3995
                        use_current_dir=True,
                        allow_dirty=True,
                        commit=False,
                    )
            else:
                # strip the participant ID, but keep the data record ID,
                # instance ID, and array index
                repo.call_git([
                    'annex', 'fromkey', props['key'],
                    str(repo.pathobj / (rec_id + ''.join(fp.suffixes)))
                ])

        # save whatever the state is now, `save` will discover deletions
        # automatically and also commit them -- wonderful!
        ds.save(message="Update native layout")
        yield dict(
            res,
            status='ok',
        )

        want_bids = 'incoming-bids' in repo.get_branches()
        if want_bids:
            repo.call_git(['checkout', 'incoming-bids'])
            # mark the incoming change as merged
            # (but we do not actually want any branch content)
            repo.call_git(['merge', 'incoming', '--strategy=ours'])
            # prepare the worktree to match the latest state
            # of incoming-native but keep histories separate
            # (ie. no merge), because we cannot handle partial
            # changes
            repo.call_git(['read-tree', '-u', '--reset', 'incoming-native'])
            # unstage changes so that a later `datalad save` sees a single
            # changeset to be saved (otherwise it might try to keep staged
            # content staged and only save additional modifications)
            #repo.call_git(['restore', '--staged', '.'])
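            # ('git restore' is only available in git >= 2.23, hence the
            # equivalent 'reset')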
            repo.call_git(['reset', 'HEAD', '.'])

            # and now do the BIDSification
            from datalad_ukbiobank.ukb2bids import restructure_ukb2bids
            # get participant ID from batch file
            subid = list(repo.call_git_items_(
                ["cat-file", "-p", "incoming:.ukbbatch"]
            ))[0].split(maxsplit=1)[0]

            yield from restructure_ukb2bids(
                ds,
                subid=subid,
                unrecognized_dir='non-bids',
                base_path=repo.pathobj,
            )
            ds.save(message="Update BIDS layout")

        if drop:
            if drop == 'archives':
                # we need to force the drop, because the download is the
                # only copy we have in general
                drop_opts = ['--force', '--branch', 'incoming', '-I', '*.zip']
            else:  # drop == 'extracted':
                drop_opts = [
                    '--in', 'datalad-archives', '--branch', 'incoming-native'
                ]

            for rec in repo.call_annex_records(['drop'] + drop_opts):
                if not rec.get('success', False):
                    yield dict(
                        action='drop',
                        status='error',
                        message=rec.get('note', 'could not drop key'),
                        key=rec.get('key', None),
                        type='key',
                        path=ds.path,
                    )

        if not merge:
            return

        # and update active branch
        repo.call_git(['checkout', initial_branch])

        if initial_branch in ('incoming', 'incoming-native', 'incoming-bids'):
            yield dict(
                res,
                action='ukb_merge_update',
                status='impossible',
                message='Refuse to merge into incoming* branch',
            )
            return

        repo.call_git([
            'merge', '-m', "Merge update from UKBiobank",
            'incoming-bids' if want_bids else 'incoming-native'
        ])

        yield dict(
            res,
            action='ukb_merge_update',
            status='ok',
        )
        return
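
The ds.run() call near the top of this example splices file system paths into a single shell command line, which is why both the keyfile and the relative tmpdir go through quote_cmdlinearg() first. A minimal sketch of the effect on POSIX, with a made-up keyfile path (on Windows the helper quotes for the native shell instead):

from datalad.utils import quote_cmdlinearg

keyfile = '/home/me/ukb keys/main.key'  # hypothetical path with a space
cmd = 'ukbfetch -v -a{} -b.ukbbatch'.format(quote_cmdlinearg(keyfile))
print(cmd)
# -> ukbfetch -v -a'/home/me/ukb keys/main.key' -b.ukbbatch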
Exemplo n.º 19
0
def _test_bare_git_version_2(host, dspath, store):
    # Similar to _test_bare_git_version_1, this should ensure that a bare git
    # repo at the store location for a dataset doesn't conflict with the ORA
    # remote.
    # Note: Usability of the git remote by annex depends on the dataset layout
    #       version (dirhashlower vs. -mixed).
    #       For version 2 (mixed), upload via ORA and consumption via git
    #       should work, but not the other way around, since git-annex uses
    #       dirhashlower with bare repos.

    ds_path = Path(dspath)
    store = Path(store)
    ds = Dataset(ds_path).create()
    populate_dataset(ds)
    ds.save()

    bare_repo_path, _, _ = get_layout_locations(1, store, ds.id)
    # Use git itself to create the remote end, so that it is exactly what git
    # thinks a bare clone should look like
    subprocess.run([
        'git', 'clone', '--bare',
        quote_cmdlinearg(str(dspath)),
        quote_cmdlinearg(str(bare_repo_path))
    ])

    if host:
        url = "ria+ssh://{host}{path}".format(host=host, path=store)
    else:
        url = "ria+{}".format(store.as_uri())
    init_opts = common_init_opts + ['url={}'.format(url)]
    # set up store:
    io = SSHRemoteIO(host) if host else LocalIO()
    create_store(io, store, '1')
    # set up the dataset location, too.
    # Note: Dataset layout version 2 (dirhash mixed):
    create_ds_in_store(io, store, ds.id, '2', '1')

    # Now, let's have the bare repo as a git remote
    git_url = "ssh://{host}{path}".format(host=host, path=bare_repo_path) \
        if host else bare_repo_path.as_uri()
    ds.repo.add_remote('bare-git', git_url)
    ds.repo.enable_remote('bare-git')
    # and the ORA remote in addition:
    ds.repo.init_remote('ora-remote', options=init_opts)
    # upload keys via ORA:
    ds.repo.copy_to('.', 'ora-remote')
    # bare-git doesn't know yet:
    eq_(len(ds.repo.whereis('one.txt')), 2)
    # fsck to make availability known
    assert_status('ok', [
        annexjson2result(r, ds)
        for r in ds.repo.fsck(remote='bare-git', fast=True)
    ])
    eq_(len(ds.repo.whereis('one.txt')), 3)
    ds.drop('.')
    eq_(len(ds.repo.whereis('one.txt')), 2)
    # actually consumable via git remote:
    ds.repo.call_annex(['move', 'one.txt', '--from', 'bare-git'])
    eq_(len(ds.repo.whereis('one.txt')), 2)
    # now, move back via git - shouldn't be consumable via ORA
    ds.repo.call_annex(['move', 'one.txt', '--to', 'bare-git'])
    # fsck to make availability known, but there's nothing from POV of ORA:
    fsck_res = [
        annexjson2result(r, ds)
        for r in ds.repo.fsck(remote='ora-remote', fast=True)
    ]
    assert_result_count(fsck_res,
                        1,
                        status='error',
                        message='** Based on the location log, one.txt\n'
                        '** was expected to be present, '
                        'but its content is missing.')
    assert_result_count(fsck_res, 1, status='ok')
    eq_(len(fsck_res), 2)
    eq_(len(ds.repo.whereis('one.txt')), 1)
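
The version-1/version-2 asymmetry spelled out in the comments comes down to git-annex's two key-to-directory hashing schemes: bare repositories use dirhashlower, while ORA dataset layout version 2 uses dirhashmixed. A sketch for inspecting both locations of a key, assuming git-annex is installed and this is run inside an annex repository (the key itself is made up):

import subprocess

key = 'MD5E-s1024--99914b932bd37a50b983c5e7c90ae93b.txt'  # hypothetical key
out = subprocess.run(
    ['git', 'annex', 'examinekey', key,
     '--format=${hashdirlower}\n${hashdirmixed}\n'],
    capture_output=True, text=True, check=True)
print(out.stdout)  # first line: lower-case hash dir, second: mixed-case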
Exemplo n.º 20
0
def _test_bare_git_version_1(host, dspath, store):
    # This test should take a dataset and create a bare repository at the
    # remote end from it.
    # Given that it is placed correctly within the store's dataset tree, that
    # remote should then be usable as an ora-remote as well as a git-type
    # remote.
    # Note: Usability of the git remote by annex depends on the dataset layout
    #       version (dirhashlower vs. -mixed).
    #       For version 1 (lower), upload and consumption should be
    #       interchangeable; it doesn't matter which remote is used for which
    #       direction.
    ds_path = Path(dspath)
    store = Path(store)
    ds = Dataset(ds_path).create()
    populate_dataset(ds)
    ds.save()

    bare_repo_path, _, _ = get_layout_locations(1, store, ds.id)
    # Use git itself to create the remote end, so that it is exactly what git
    # thinks a bare clone should look like
    subprocess.run([
        'git', 'clone', '--bare',
        quote_cmdlinearg(str(dspath)),
        quote_cmdlinearg(str(bare_repo_path))
    ])

    if host:
        url = "ria+ssh://{host}{path}".format(host=host, path=store)
    else:
        url = "ria+{}".format(store.as_uri())
    init_opts = common_init_opts + ['url={}'.format(url)]
    # set up store:
    io = SSHRemoteIO(host) if host else LocalIO()
    create_store(io, store, '1')
    # set up the dataset location, too.
    # Note: Dataset layout version 1 (dirhash lower):
    create_ds_in_store(io, store, ds.id, '1', '1')

    # Now, let's have the bare repo as a git remote and use it with annex
    git_url = "ssh://{host}{path}".format(host=host, path=bare_repo_path) \
        if host else bare_repo_path.as_uri()
    ds.repo.add_remote('bare-git', git_url)
    ds.repo.enable_remote('bare-git')

    # copy files to the remote
    ds.repo.copy_to('.', 'bare-git')
    eq_(len(ds.repo.whereis('one.txt')), 2)

    # now we can drop all content locally, reobtain it, and survive an
    # fsck
    ds.drop('.')
    ds.get('.')
    assert_status('ok', [annexjson2result(r, ds) for r in ds.repo.fsck()])

    # Now, add the ora remote:
    ds.repo.init_remote('ora-remote', options=init_opts)
    # fsck to make availability known
    assert_status('ok', [
        annexjson2result(r, ds)
        for r in ds.repo.fsck(remote='ora-remote', fast=True)
    ])
    eq_(len(ds.repo.whereis('one.txt')), 3)

    # Now move content from git-remote to local and see it not being available
    # via bare-git anymore.
    ds.repo.call_annex(['move', '--all', '--from=bare-git'])
    # ora-remote doesn't know yet:
    eq_(len(ds.repo.whereis('one.txt')), 2)

    # But after fsck it does:
    fsck_res = [
        annexjson2result(r, ds)
        for r in ds.repo.fsck(remote='ora-remote', fast=True)
    ]
    assert_result_count(fsck_res,
                        1,
                        status='error',
                        message='** Based on the location log, one.txt\n'
                        '** was expected to be present, '
                        'but its content is missing.')
    assert_result_count(fsck_res,
                        1,
                        status='error',
                        message='** Based on the location log, subdir/two\n'
                        '** was expected to be present, '
                        'but its content is missing.')
    eq_(len(ds.repo.whereis('one.txt')), 1)
    # and the other way around: upload via ora-remote and have it available via
    # git-remote:
    ds.repo.copy_to('.', 'ora-remote')
    # fsck to make availability known
    assert_status('ok', [
        annexjson2result(r, ds)
        for r in ds.repo.fsck(remote='bare-git', fast=True)
    ])
    eq_(len(ds.repo.whereis('one.txt')), 3)
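
Both tests assemble their store URL the same way: ria+ssh://<host><path> when a host is given, otherwise a ria+file:// URL derived from the local path. A condensed sketch of that branching with hypothetical values (POSIX paths assumed for as_uri()):

from pathlib import Path

host = None  # e.g. 'datalad-test' for the SSH variant
store = Path('/tmp/ria-store')  # hypothetical store location
url = ("ria+ssh://{host}{path}".format(host=host, path=store) if host
       else "ria+{}".format(store.as_uri()))
print(url)  # -> ria+file:///tmp/ria-store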
Exemplo n.º 21
0
    def __call__(spec=None, *, dataset=None, discover=False, help_proc=False):
        if not spec and not discover:
            raise InsufficientArgumentsError(
                'requires at least a procedure name')
        if help_proc and not spec:
            raise InsufficientArgumentsError('requires a procedure name')

        try:
            ds = require_dataset(dataset,
                                 check_installed=False,
                                 purpose='run a procedure')
        except NoDatasetFound:
            ds = None

        if discover:
            # paths of procedures that were already reported
            reported = set()
            # names of procedures for which an active implementation has
            # been found
            active = set()
            for m, cmd_name, cmd_tmpl, cmd_help in \
                    _get_procedure_implementation('*', ds=ds):
                if m in reported:
                    continue
                ex = _guess_exec(m)
                # configured template (call-format string) takes precedence:
                if cmd_tmpl:
                    ex['template'] = cmd_tmpl
                if ex['state'] is None:
                    # doesn't seem like a match
                    lgr.debug("%s does not look like a procedure, ignored.", m)
                    continue
                state = 'overridden' if cmd_name in active else ex['state']
                message = ex['type'] if ex['type'] else 'unknown type'
                if state != 'executable':
                    message += ' ({})'.format(state)
                res = get_status_dict(action='discover_procedure',
                                      path=m,
                                      type='file',
                                      logger=lgr,
                                      refds=ds.path if ds else None,
                                      status='ok',
                                      state=state,
                                      procedure_name=cmd_name,
                                      procedure_type=ex['type'],
                                      procedure_callfmt=ex['template'],
                                      procedure_help=cmd_help,
                                      message=message)
                reported.add(m)
                if state == 'executable':
                    active.add(cmd_name)
                yield res
            return

        if isinstance(spec, dict):
            # Skip getting procedure implementation if called with a
            # dictionary (presumably coming from --discover)
            procedure_file = spec['path']
            cmd_name = spec['procedure_name']
            cmd_tmpl = spec['procedure_callfmt']
            cmd_help = spec['procedure_help']

            name = cmd_name
            args = []

        else:

            if not isinstance(spec, (tuple, list)):
                # maybe coming from config
                spec = split_cmdline(spec)
            name = spec[0]
            args = spec[1:]

            try:
                # get the first match and run with it
                procedure_file, cmd_name, cmd_tmpl, cmd_help = \
                    next(_get_procedure_implementation(name, ds=ds))
            except StopIteration:
                raise ValueError("Cannot find procedure with name '%s'" % name)

        ex = _guess_exec(procedure_file)
        # configured template (call-format string) takes precedence:
        if cmd_tmpl:
            ex['template'] = cmd_tmpl

        if help_proc:
            if cmd_help:
                res = get_status_dict(action='procedure_help',
                                      path=procedure_file,
                                      type='file',
                                      logger=lgr,
                                      refds=ds.path if ds else None,
                                      status='ok',
                                      state=ex['state'],
                                      procedure_name=cmd_name,
                                      procedure_type=ex['type'],
                                      procedure_callfmt=ex['template'],
                                      message=cmd_help)
            else:
                res = get_status_dict(action='procedure_help',
                                      path=procedure_file,
                                      type='file',
                                      logger=lgr,
                                      refds=ds.path if ds else None,
                                      status='impossible',
                                      state=ex['state'],
                                      procedure_name=cmd_name,
                                      procedure_type=ex['type'],
                                      procedure_callfmt=ex['template'],
                                      message="No help available for '%s'" %
                                      name)

            yield res
            return

        if not ex['template']:
            raise ValueError("No idea how to execute procedure %s. "
                             "Missing 'execute' permissions?" % procedure_file)

        cmd = ex['template'].format(
            script=guard_for_format(quote_cmdlinearg(procedure_file)),
            ds=guard_for_format(quote_cmdlinearg(ds.path)) if ds else '',
            args=join_cmdline(args) if args else '')
        lgr.info(u"Running procedure %s", name)
        lgr.debug(u'Full procedure command: %r', cmd)
        for r in Run.__call__(
                cmd=cmd,
                dataset=ds,
                explicit=True,
                inputs=None,
                outputs=None,
                # pass through here
                on_failure='ignore',
                return_type='generator',
                result_renderer='disabled'):
            yield r

        if ds:
            # the procedure ran and we have to anticipate that it might have
            # changed the dataset config, so we need to trigger an unforced
            # reload.
            # we have to do this despite "being done here", because
            # run_procedure() runs in the same process and reuses dataset (config
            # manager) instances, and the next interaction with a dataset should
            # be able to count on an up-to-date config
            ds.config.reload()
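
The final command assembly is a single str.format() pass over the call-format template, with guard_for_format() doubling any braces in the interpolated paths so that they survive the second format pass that run performs for its own placeholders and substitutions. A small sketch of that expansion, using a local stand-in for the helper and made-up paths:

from datalad.utils import quote_cmdlinearg


def guard_for_format(s):
    # stand-in for datalad's helper: double braces so a later
    # str.format() pass treats them as literal characters
    return s.replace('{', '{{').replace('}', '}}')


tmpl = 'python {script} {ds} {args}'
cmd = tmpl.format(
    script=guard_for_format(quote_cmdlinearg('/ds/code/my_proc.py')),
    ds=guard_for_format(quote_cmdlinearg('/ds')),
    args='some_arg')
print(cmd)  # -> python /ds/code/my_proc.py /ds some_arg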