Exemplo n.º 1
0
def test_decode_source_spec():
    """Exercise decode_source_spec() on datalad RIs, plain URLs, and RIA URIs."""
    # a '///' datalad RI resolves against DATASETS_TOPURL
    props = decode_source_spec('///subds')
    eq_(props,
        dict(source='///subds',
             giturl=consts.DATASETS_TOPURL + 'subds',
             version=None,
             type='dataladri',
             default_destpath='subds'))
    # custom server datalad RIs are not supported
    assert_raises(NotImplementedError, decode_source_spec, '//custom/subds')

    # ordinary URLs and paths pass through unchanged
    plain_specs = (
        'http://example.com',
        '/absolute/path',
        'file://localhost/some',
        'localhost/another/path',
        '[email protected]/mydir',
        'ssh://somewhe.re/else',
        'git://github.com/datalad/testrepo--basic--r1',
    )
    for spec in plain_specs:
        decoded = decode_source_spec(spec)
        # destination path is spec-dependent; only verify the stable keys
        decoded.pop('default_destpath')
        eq_(decoded, dict(source=spec, version=None, giturl=spec, type='giturl'))

    # RIA URIs with and without version specification
    dsid = '6d69ca68-7e85-11e6-904c-002590f97d84'
    ria_cases = (
        ('http', 'example.com', None),
        ('http', 'example.com', 'v1.0'),
        ('http', 'example.com', 'some_with@in_it'),
        ('ssh', 'example.com', 'some_with@in_it'),
    )
    for proto, loc, version in ria_cases:
        spec = 'ria+{}://{}{}{}'.format(
            proto,
            loc,
            '#{}'.format(dsid),
            '@{}'.format(version) if version else '')
        # giturl splits the dataset id into a 3-char prefix dir + remainder
        expected = dict(
            source=spec,
            giturl='{}://{}/{}/{}'.format(
                proto,
                loc,
                dsid[:3],
                dsid[3:]),
            version=version,
            default_destpath=dsid,
            type='ria')
        eq_(decode_source_spec(spec), expected)
    # the URI fragment must be a dataset UUID
    assert_raises(ValueError, decode_source_spec, 'ria+http://example.com#123')
Exemplo n.º 2
0
def _create_sibling_ria(
        ds,
        url,
        name,
        storage_sibling,
        storage_name,
        existing,
        shared,
        group,
        post_update_hook,
        trust_level,
        res_kwargs):
    """Create a RIA-store sibling for a single dataset.

    Generator yielding datalad result records (``get_status_dict``). Sets up
    up to two siblings in a RIA store: an ORA special remote for annexed file
    content (``storage_sibling``) and a plain git remote pointing at a bare
    repository created in the store.

    Parameters
    ----------
    ds : Dataset
      Dataset to process; its ``.repo``, ``.config``, ``.id``, ``.path`` and
      ``.siblings()`` are used.
    url : str
      RIA URL of the target store; ``#<dataset-id>`` is appended to derive
      the git URL via ``decode_source_spec``.
    name : str
      Name of the git sibling; also used for the special remote when
      ``storage_sibling == 'only'``.
    storage_sibling : bool or str
      Falsy to skip the special remote entirely; ``'only'`` to create just
      the storage sibling and return before the git-remote setup.
    storage_name : str or None
      Name for the ORA special remote (ignored when
      ``storage_sibling == 'only'``).
    existing : {'skip', 'error', 'reconfigure'}
      Behavior when a sibling or a remote repository already exists.
    shared : str or bool or None
      Forwarded to ``git init --shared`` (SSH) / ``GitRepo(shared=...)``.
    group : str or None
      If given, ``chgrp -R`` the remote repository directory to this group.
    post_update_hook : bool
      Whether to enable the post-update hook in the remote bare repository.
    trust_level : str or None
      git-annex trust subcommand (e.g. ``'trust'``) to apply to the special
      remote after initialization.
    res_kwargs : dict
      Common keyword arguments for result records; copied, then extended
      with this dataset.
    """
    # be safe across datasets
    res_kwargs = res_kwargs.copy()
    # update dataset
    res_kwargs['ds'] = ds

    if not isinstance(ds.repo, AnnexRepo):
        # No point in dealing with a special remote when there's no annex.
        # Note, that in recursive invocations this might only apply to some of
        # the datasets. Therefore dealing with it here rather than one level up.
        lgr.debug("No annex at %s. Ignoring special remote options.", ds.path)
        storage_sibling = False
        storage_name = None

    # parse target URL
    try:
        ssh_host, base_path, rewritten_url = verify_ria_url(url, ds.config)
    except ValueError as e:
        # invalid RIA URL -- report and stop processing this dataset
        yield get_status_dict(
            status='error',
            message=str(e),
            **res_kwargs
        )
        return

    base_path = Path(base_path)

    git_url = decode_source_spec(
        # append dataset id to url and use magic from clone-helper:
        url + '#{}'.format(ds.id),
        cfg=ds.config
    )['giturl']
    # determine layout locations; go for a v1 layout
    repo_path, _, _ = get_layout_locations(1, base_path, ds.id)

    ds_siblings = [r['name'] for r in ds.siblings(result_renderer=None)]
    # Figure whether we are supposed to skip this very dataset
    if existing == 'skip' and (
            name in ds_siblings or (
                storage_name and storage_name in ds_siblings)):
        yield get_status_dict(
            status='notneeded',
            message="Skipped on existing sibling",
            **res_kwargs
        )
        # if we skip here, nothing else can change that decision further
        # down
        return

    # figure whether we need to skip or error due an existing target repo before
    # we try to init a special remote.
    if ssh_host:
        from datalad import ssh_manager
        ssh = ssh_manager.get_connection(
            ssh_host,
            use_remote_annex_bundle=False)
        ssh.open()

    if existing in ['skip', 'error']:
        config_path = repo_path / 'config'
        # No .git -- if it's an existing repo in a RIA store it should be a
        # bare repo.
        # Theoretically we could have additional checks for whether we have
        # an empty repo dir or a non-bare repo or whatever else.
        if ssh_host:
            try:
                # remote existence probe: shell test for the config file
                ssh('[ -e {p} ]'.format(p=quote_cmdlinearg(str(config_path))))
                exists = True
            except CommandError:
                exists = False
        else:
            exists = config_path.exists()

        if exists:
            if existing == 'skip':
                # 1. not rendered by default
                # 2. message doesn't show up in ultimate result
                #    record as shown by -f json_pp
                yield get_status_dict(
                    status='notneeded',
                    message="Skipped on existing remote "
                            "directory {}".format(repo_path),
                    **res_kwargs
                )
                return
            else:  # existing == 'error'
                yield get_status_dict(
                    status='error',
                    message="remote directory {} already "
                            "exists.".format(repo_path),
                    **res_kwargs
                )
                return

    if storage_sibling == 'only':
        lgr.info("create storage sibling '{}' ...".format(name))
    else:
        lgr.info("create sibling{} '{}'{} ...".format(
            's' if storage_name else '',
            name,
            " and '{}'".format(storage_name) if storage_name else '',
        ))
    # create dataset directory in the store (RIA layout version 2,
    # object-tree version 1), over SSH or on the local filesystem
    create_ds_in_store(SSHRemoteIO(ssh_host) if ssh_host else LocalIO(),
                       base_path, ds.id, '2', '1')
    if storage_sibling:
        # we are using the main `name`, if the only thing we are creating
        # is the storage sibling
        srname = name if storage_sibling == 'only' else storage_name

        lgr.debug('init special remote {}'.format(srname))
        special_remote_options = [
            'type=external',
            'externaltype=ora',
            'encryption=none',
            'autoenable=true',
            'url={}'.format(url)]
        try:
            ds.repo.init_remote(
                srname,
                options=special_remote_options)
        except CommandError as e:
            if existing == 'reconfigure' \
                    and 'git-annex: There is already a special remote' \
                    in e.stderr:
                # run enableremote instead
                lgr.debug(
                    "special remote '%s' already exists. "
                    "Run enableremote instead.",
                    srname)
                # TODO: Use AnnexRepo.enable_remote (which needs to get
                #       `options` first)
                ds.repo.call_annex([
                    'enableremote',
                    srname] + special_remote_options)
            else:
                yield get_status_dict(
                    status='error',
                    message="initremote failed.\nstdout: %s\nstderr: %s"
                    % (e.stdout, e.stderr),
                    **res_kwargs
                )
                return

        if trust_level:
            # e.g. `git annex trust <srname>`
            ds.repo.call_annex([trust_level, srname])
        # get uuid for use in bare repo's config
        uuid = ds.config.get("remote.{}.annex-uuid".format(srname))

    if storage_sibling == 'only':
        # we can stop here, the rest of the function is about setting up
        # the git remote part of the sibling
        yield get_status_dict(
            status='ok',
            **res_kwargs,
        )
        return

    # 2. create a bare repository in-store:

    lgr.debug("init bare repository")
    # TODO: we should prob. check whether it's there already. How?
    # Note: like the special remote itself, we assume local FS if no
    # SSH host is specified
    disabled_hook = repo_path / 'hooks' / 'post-update.sample'
    enabled_hook = repo_path / 'hooks' / 'post-update'

    if group:
        chgrp_cmd = "chgrp -R {} {}".format(
            quote_cmdlinearg(str(group)),
            quote_cmdlinearg(str(repo_path)))

    if ssh_host:
        ssh('cd {rootdir} && git init --bare{shared}'.format(
            rootdir=quote_cmdlinearg(str(repo_path)),
            shared=" --shared='{}'".format(
                quote_cmdlinearg(shared)) if shared else ''
        ))

        if storage_sibling:
            # write special remote's uuid into git-config, so clone can
            # which one it is supposed to be and enable it even with
            # fallback URL
            ssh("cd {rootdir} && git config datalad.ora-remote.uuid {uuid}"
                "".format(rootdir=quote_cmdlinearg(str(repo_path)),
                          uuid=uuid))

        if post_update_hook:
            # git ships the hook disabled as *.sample; renaming enables it
            ssh('mv {} {}'.format(quote_cmdlinearg(str(disabled_hook)),
                                  quote_cmdlinearg(str(enabled_hook))))

        if group:
            # Either repository existed before or a new directory was
            # created for it, set its group to a desired one if was
            # provided with the same chgrp
            ssh(chgrp_cmd)
    else:
        gr = GitRepo(repo_path, create=True, bare=True,
                     shared=shared if shared else None)
        if storage_sibling:
            # write special remote's uuid into git-config, so clone can
            # which one it is supposed to be and enable it even with
            # fallback URL
            gr.config.add("datalad.ora-remote.uuid", uuid, where='local')

        if post_update_hook:
            disabled_hook.rename(enabled_hook)
        if group:
            # TODO; do we need a cwd here?
            # NOTE(review): chgrp_cmd is a single string and subprocess.run is
            # called without shell=True, which treats the whole string as the
            # program name; also cwd is passed through quote_cmdlinearg, which
            # quotes a path that is used verbatim -- both look wrong, confirm.
            subprocess.run(chgrp_cmd, cwd=quote_cmdlinearg(ds.path))

    # add a git remote to the bare repository
    # Note: needs annex-ignore! Otherwise we might push into dirhash
    # lower annex/object tree instead of mixed, since it's a bare
    # repo. This in turn would be an issue, if we want to pack the
    # entire thing into an archive. Special remote will then not be
    # able to access content in the "wrong" place within the archive
    lgr.debug("set up git remote")
    if name in ds_siblings:
        # otherwise we should have skipped or failed before
        assert existing == 'reconfigure'
    ds.config.set(
        "remote.{}.annex-ignore".format(name),
        value="true",
        where="local")
    ds.siblings(
        'configure',
        name=name,
        url=git_url
        if ssh_host
        else str(repo_path),
        recursive=False,
        # Note, that this should be None if storage_sibling was not set
        publish_depends=storage_name,
        result_renderer=None,
        # Note, that otherwise a subsequent publish will report
        # "notneeded".
        fetch=True
    )

    yield get_status_dict(
        status='ok',
        **res_kwargs,
    )
def _create_sibling_ria(ds, url, name, ria_remote, ria_remote_name, existing,
                        shared, group, post_update_hook, res_kwargs):
    """Create a RIA sibling for a single dataset (legacy ``ria`` variant).

    Generator yielding datalad result records (``get_status_dict``). Sets up
    a ``ria`` external special remote (optional) and a bare git repository in
    the store, then configures the git sibling.

    NOTE(review): this is a second module-level definition of
    ``_create_sibling_ria`` -- at import time it shadows the earlier
    definition above. Presumably two versions of the same snippet were
    pasted together; confirm which one is intended to survive.

    Parameters
    ----------
    ds : Dataset
      Dataset to process.
    url : str
      RIA URL of the target store.
    name : str
      Name of the git sibling.
    ria_remote : bool
      Whether to initialize the ``ria`` special remote.
    ria_remote_name : str or None
      Name for the special remote.
    existing : {'skip', 'error', 'reconfigure', 'replace'}
      Behavior when a sibling or a remote repository already exists;
      unlike the variant above, ``'replace'`` (wipe and recreate) is
      supported here.
    shared, group, post_update_hook :
      Same meaning as in the variant above.
    res_kwargs : dict
      Common keyword arguments for result records; copied before use.
    """
    # be safe across datasets
    res_kwargs = res_kwargs.copy()

    # parse target URL
    try:
        ssh_host, base_path = verify_ria_url(url, ds.config)
    except ValueError as e:
        yield get_status_dict(status='error', message=str(e), **res_kwargs)
        return

    base_path = Path(base_path)

    git_url = decode_source_spec(
        # append dataset id to url and use magic from clone-helper:
        url + '#{}'.format(ds.id),
        cfg=ds.config)['giturl']
    # go for a v1 layout
    repo_path, _, _ = get_layout_locations(1, base_path, ds.id)

    ds_siblings = [r['name'] for r in ds.siblings(result_renderer=None)]
    # Figure whether we are supposed to skip this very dataset
    if existing == 'skip' and (name in ds_siblings or
                               (ria_remote_name
                                and ria_remote_name in ds_siblings)):
        yield get_status_dict(status='notneeded',
                              message="Skipped on existing sibling",
                              **res_kwargs)
        # if we skip here, nothing else can change that decision further
        # down
        return

    # we might learn that some processing (remote repo creation is
    # not desired)
    skip = False

    lgr.info("create sibling{} '{}'{} ...".format(
        's' if ria_remote_name else '',
        name,
        " and '{}'".format(ria_remote_name) if ria_remote_name else '',
    ))
    if ssh_host:
        from datalad import ssh_manager
        ssh = ssh_manager.get_connection(ssh_host,
                                         use_remote_annex_bundle=False)
        ssh.open()

    # determine layout locations
    if ria_remote:
        lgr.debug('init special remote {}'.format(ria_remote_name))
        ria_remote_options = [
            'type=external', 'externaltype=ria', 'encryption=none',
            'autoenable=true', 'url={}'.format(url)
        ]
        try:
            ds.repo.init_remote(ria_remote_name, options=ria_remote_options)
        except CommandError as e:
            if existing in ['replace', 'reconfigure'] \
                    and 'git-annex: There is already a special remote' \
                    in e.stderr:
                # run enableremote instead
                lgr.debug(
                    "special remote '%s' already exists. "
                    "Run enableremote instead.", ria_remote_name)
                # TODO: Use AnnexRepo.enable_remote (which needs to get
                #       `options` first)
                # NOTE(review): cwd is passed through quote_cmdlinearg, which
                # quotes a path that subprocess uses verbatim -- looks wrong
                # for paths containing characters that get quoted; confirm.
                cmd = ['git', 'annex', 'enableremote', ria_remote_name
                       ] + ria_remote_options
                subprocess.run(cmd, cwd=quote_cmdlinearg(ds.repo.path))
            else:
                yield get_status_dict(
                    status='error',
                    message="initremote failed.\nstdout: %s\nstderr: %s" %
                    (e.stdout, e.stderr),
                    **res_kwargs)
                return

        # 1. create remote object store:
        # Note: All it actually takes is to trigger the special
        # remote's `prepare` method once.
        # ATM trying to achieve that by invoking a minimal fsck.
        # TODO: - It's probably faster to actually talk to the special
        #         remote (i.e. pretending to be annex and use
        #         the protocol to send PREPARE)
        #       - Alternatively we can create the remote directory and
        #         ria version file directly, but this means
        #         code duplication that then needs to be kept in sync
        #         with ria-remote implementation.
        #       - this leads to the third option: Have that creation
        #         routine importable and callable from
        #         ria-remote package without the need to actually
        #         instantiate a RIARemote object
        lgr.debug("initializing object store")
        ds.repo.fsck(remote=ria_remote_name,
                     fast=True,
                     annex_options=['--exclude=*/*'])
    else:
        # with no special remote we currently need to create the
        # required directories
        # TODO: This should be cleaner once we have access to the
        #       special remote's RemoteIO classes without
        #       talking via annex
        if ssh_host:
            try:
                # remote existence probe: shell test for the repo dir
                stdout, stderr = ssh('test -e {repo}'.format(
                    repo=quote_cmdlinearg(str(repo_path))))
                exists = True
            except CommandError as e:
                exists = False
            if exists:
                if existing == 'skip':
                    # 1. not rendered by default
                    # 2. message doesn't show up in ultimate result
                    #    record as shown by -f json_pp
                    yield get_status_dict(status='notneeded',
                                          message="Skipped on existing remote "
                                          "directory {}".format(repo_path),
                                          **res_kwargs)
                    skip = True
                elif existing in ['error', 'reconfigure']:
                    yield get_status_dict(
                        status='error',
                        message="remote directory {} already "
                        "exists.".format(repo_path),
                        **res_kwargs)
                    return
                elif existing == 'replace':
                    # make writable first; store content may be read-only
                    ssh('chmod u+w -R {}'.format(
                        quote_cmdlinearg(str(repo_path))))
                    ssh('rm -rf {}'.format(quote_cmdlinearg(str(repo_path))))
            if not skip:
                ssh('mkdir -p {}'.format(quote_cmdlinearg(str(repo_path))))
        else:
            if repo_path.exists():
                if existing == 'skip':
                    skip = True
                elif existing in ['error', 'reconfigure']:
                    yield get_status_dict(
                        status='error',
                        message="remote directory {} already "
                        "exists.".format(repo_path),
                        **res_kwargs)
                    return
                elif existing == 'replace':
                    rmtree(repo_path)
            if not skip:
                repo_path.mkdir(parents=True)

    # Note, that this could have changed since last tested due to existing
    # remote dir
    if skip:
        return

    # 2. create a bare repository in-store:

    lgr.debug("init bare repository")
    # TODO: we should prob. check whether it's there already. How?
    # Note: like the special remote itself, we assume local FS if no
    # SSH host is specified
    disabled_hook = repo_path / 'hooks' / 'post-update.sample'
    enabled_hook = repo_path / 'hooks' / 'post-update'

    if group:
        chgrp_cmd = "chgrp -R {} {}".format(quote_cmdlinearg(str(group)),
                                            quote_cmdlinearg(str(repo_path)))

    if ssh_host:
        ssh('cd {rootdir} && git init --bare{shared}'.format(
            rootdir=quote_cmdlinearg(str(repo_path)),
            shared=" --shared='{}'".format(quote_cmdlinearg(shared))
            if shared else ''))
        if post_update_hook:
            # git ships the hook disabled as *.sample; renaming enables it
            ssh('mv {} {}'.format(quote_cmdlinearg(str(disabled_hook)),
                                  quote_cmdlinearg(str(enabled_hook))))

        if group:
            # Either repository existed before or a new directory was
            # created for it, set its group to a desired one if was
            # provided with the same chgrp
            ssh(chgrp_cmd)
    else:
        # NOTE(review): `shared` here is the full " --shared='...'" CLI
        # fragment, while the variant above passes the raw value to
        # GitRepo(shared=...) -- one of the two is presumably wrong; confirm
        # against GitRepo's expected `shared` argument.
        GitRepo(repo_path,
                create=True,
                bare=True,
                shared=" --shared='{}'".format(quote_cmdlinearg(shared))
                if shared else None)
        if post_update_hook:
            disabled_hook.rename(enabled_hook)
        if group:
            # TODO; do we need a cwd here?
            # NOTE(review): chgrp_cmd is a single string and subprocess.run is
            # called without shell=True, which treats the whole string as the
            # program name; confirm.
            subprocess.run(chgrp_cmd, cwd=quote_cmdlinearg(ds.path))

    # add a git remote to the bare repository
    # Note: needs annex-ignore! Otherwise we might push into default
    # annex/object tree instead of directory type tree with dirhash
    # lower. This in turn would be an issue, if we want to pack the
    # entire thing into an archive. Special remote will then not be
    # able to access content in the "wrong" place within the archive
    lgr.debug("set up git remote")
    # TODO:
    # - This sibings call results in "[WARNING] Failed to determine
    #   if datastore carries annex."
    #   (see https://github.com/datalad/datalad/issues/4028)
    #   => for now have annex-ignore configured before. Evtl. Allow
    #      configure/add to include that option
    #      - additionally there's
    #        https://github.com/datalad/datalad/issues/3989,
    #        where datalad-siblings might hang forever
    if name in ds_siblings:
        # otherwise we should have skipped or failed before
        assert existing in ['replace', 'reconfigure']
    ds.config.set("remote.{}.annex-ignore".format(name),
                  value="true",
                  where="local")
    ds.siblings(
        'configure',
        name=name,
        url=git_url if ssh_host else str(repo_path),
        recursive=False,
        # Note, that this should be None if ria_remote was not set
        publish_depends=ria_remote_name,
        result_renderer=None,
        # Note, that otherwise a subsequent publish will report
        # "notneeded".
        fetch=True)

    yield get_status_dict(
        status='ok',
        **res_kwargs,
    )