Example #1
def get_repo_url(repo, access_protocol, github_login):
    """Report the repository access URL for Git matching the protocol"""
    prop = {'https': repo.clone_url, 'ssh': repo.ssh_url}[access_protocol]
    if access_protocol == 'https' and github_login:
        # we were provided explicit github login.  For ssh access it is
        # impossible to specify different login within ssh RI, but it is
        # possible to do so for https logins
        url = URL(prop)
        assert url.scheme in ('http', 'https')
        url.username = github_login
        prop = url.as_str()
    return prop
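
For reference, a minimal sketch of the https branch above, assuming datalad's URL class as used throughout these examples; the repository URL and login are hypothetical.

from datalad.support.network import URL

url = URL('https://github.com/some-org/some-repo.git')   # hypothetical clone URL
assert url.scheme in ('http', 'https')
url.username = 'mylogin'        # a login can only be embedded in http(s) URLs
print(url.as_str())             # expected: https://mylogin@github.com/some-org/some-repo.git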
Example #2
def test_url_base():
    # Basic checks
    assert_raises(ValueError,
                  URL,
                  "http://example.com",
                  hostname='example.com')
    url = URL("http://example.com")
    eq_(url.hostname, 'example.com')
    eq_(url.scheme, 'http')
    eq_(url.port, '')  # not specified -- empty strings
    eq_(url.username, '')  # not specified -- empty strings
    eq_(repr(url), "URL(hostname='example.com', scheme='http')")
    eq_(url, "http://example.com")  # automagic coercion in __eq__

    neq_(URL(), URL(hostname='x'))

    smth = URL('smth')
    eq_(smth.hostname, '')
    ok_(bool(smth))
    nok_(bool(URL()))

    assert_raises(ValueError, url._set_from_fields, unknown='1')

    with swallow_logs(new_level=logging.WARNING) as cml:
        # we don't "care" about params ATM so there is a warning if there are any
        purl = URL("http://example.com/;param")
        eq_(str(purl),
            'http://example.com/;param')  # but we do maintain original string
        assert_in('ParseResults contains params', cml.out)
        eq_(purl.as_str(), 'http://example.com/')
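
For comparison, a tiny sketch of the two construction styles this test exercises, parsing a string versus composing from fields (passing both at once raises ValueError); assumes datalad is installed.

from datalad.support.network import URL

parsed = URL("http://example.com")                      # parse a full URL string
composed = URL(scheme='http', hostname='example.com')   # or compose from fields
print(parsed.hostname, parsed.scheme)                    # example.com http
print(parsed == composed)                                # same fields, so expected True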
Example #3
def test_url_fragments_and_query():
    url = URL(hostname="host", query=OrderedDict((('a', 'x/b'), ('b', 'y'))))
    eq_(str(url), '//host?a=x%2Fb&b=y')
    eq_(url.query, 'a=x%2Fb&b=y')
    eq_(url.query_dict, {'a': 'x/b', 'b': 'y'})

    url = URL(hostname="host", fragment=OrderedDict((('b', 'x/b'), ('a', 'y'))))
    eq_(str(url), '//host#b=x/b&a=y')
    eq_(url.fragment, 'b=x/b&a=y')
    eq_(url.fragment_dict, {'a': 'y', 'b': 'x/b'})

    fname = get_most_obscure_supported_name()
    url = URL(hostname="host", fragment={'a': fname})
    eq_(url.fragment_dict, {'a': fname})
Example #4
def get_repo_url(repo, access_protocol, github_login):
    """Report the repository access URL for Git matching the protocol"""
    prop = {
        'https': repo.clone_url,
        'ssh': repo.ssh_url
    }[access_protocol]
    if access_protocol == 'https' and github_login:
        # we were provided explicit github login.  For ssh access it is
        # impossible to specify different login within ssh RI, but it is
        # possible to do so for https logins
        url = URL(prop)
        assert url.scheme in ('http', 'https')
        url.username = github_login
        prop = url.as_str()
    return prop
Example #5
def test_url_compose_archive_one():
    url = URL(scheme='dl+archive',
              path='KEY',
              fragment=OrderedDict((('path', 'f/p/ s+'), ('size', 30))))
    # funny - space is encoded as + but + is %2B
    eq_(str(url), 'dl+archive:KEY#path=f/p/+s%2B&size=30')
    eq_(url.fragment_dict, {'path': 'f/p/ s+', 'size': '30'})
Example #6
def add_version_to_url(url, version, replace=False):
    """Add a version ID to `url`.

    Parameters
    ----------
    url : datalad.support.network.URL
        A URL.
    version : str
        The value of 'versionId='.
    replace : boolean, optional
        If a versionID is already present in `url`, replace it.

    Returns
    -------
    A versioned URL (str)
    """
    version_id = "versionId={}".format(version)
    if not url.query:
        query = version_id
    else:
        ver_match = re.match(
            "(?P<pre>.*&)?"
            "(?P<vers>versionId=[^&]+)"
            "(?P<post>&.*)?", url.query)
        if ver_match:
            if replace:
                query = "".join([
                    ver_match.group("pre") or "", version_id,
                    ver_match.group("post") or ""
                ])
            else:
                query = url.query
        else:
            query = url.query + "&" + version_id
    return URL(**dict(url.fields, query=query)).as_str()
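
A quick sketch of the behaviour described in the docstring, using add_version_to_url() as defined above; the expected outputs mirror test_add_version_to_url() further down in these examples.

from datalad.support.network import URL

print(add_version_to_url(URL("http://ex.com/f.txt"), "new.id"))
# expected: http://ex.com/f.txt?versionId=new.id

versioned = URL("http://ex.com/f.txt?versionId=orig.id")
print(add_version_to_url(versioned, "new.id"))                # keep the existing ID
# expected: http://ex.com/f.txt?versionId=orig.id
print(add_version_to_url(versioned, "new.id", replace=True))  # replace it
# expected: http://ex.com/f.txt?versionId=new.id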
Example #7
    def get_file_url(self,
                     archive_file=None,
                     archive_key=None,
                     file=None,
                     size=None):
        """Given archive (file or a key) and a file -- compose URL for access

        Examples
        --------

        dl+archive:SHA256E-s176--69...3e.tar.gz#path=1/d2/2d&size=123
            when size of file within archive was known to be 123
        dl+archive:SHA256E-s176--69...3e.tar.gz#path=1/d2/2d
            when size of file within archive was not provided

        Parameters
        ----------
        size: int, optional
          Size of the file.  If not provided, will simply be empty
        """
        assert (file is not None)
        if archive_file is not None:
            if archive_key is not None:
                raise ValueError(
                    "Provide archive_file or archive_key - not both")
            archive_key = self.repo.get_file_annexinfo(archive_file)['key']
        assert (archive_key is not None)
        attrs = OrderedDict()  # looking forward for more
        if file:
            attrs['path'] = file.lstrip('/')
        if size is not None:
            attrs['size'] = size
        return str(
            URL(scheme=self.URL_SCHEME, path=archive_key, fragment=attrs))
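
A sketch of composing the dl+archive URLs from the docstring directly with URL(), mirroring test_url_compose_archive_one() above; the archive key is a hypothetical placeholder.

from collections import OrderedDict
from datalad.support.network import URL

url = URL(scheme='dl+archive',
          path='KEY',   # hypothetical annex key of the archive
          fragment=OrderedDict((('path', '1/d2/2d'), ('size', 123))))
print(str(url))   # expected: dl+archive:KEY#path=1/d2/2d&size=123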
Example #8
def fix_url(data, keys=['url']):
    """Given data, get value within 'url' key and fix up so it is legit url

    - replace spaces with %20
    """
    data = data.copy()
    for key in keys:
        if key in data:  # avoid a KeyError if the dictionary does not contain the key
            data[key] = URL(data[key]).as_str()
    yield data
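
A sketch of the expected behaviour given the docstring (spaces become %20); the input record is hypothetical and fix_url() is the generator defined above.

record = {'url': 'http://example.com/some file.txt', 'name': 'x'}   # hypothetical record
fixed = next(fix_url(record))
print(fixed['url'])   # expected per the docstring: http://example.com/some%20file.txt
print(fixed['name'])  # other keys are left untouched: x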
Example #9
def test_url_eq():
    eq_(URL(), URL())
    # doesn't make sense to ask what kind of URL an empty one is
    #eq_(RI(), RI())
    neq_(URL(), URL(hostname='x'))
    # Different types aren't equal even if have the same fields values
    neq_(URL(path='x'), PathRI(path='x'))
    neq_(URL(hostname='x'), SSHRI(hostname='x'))
    neq_(str(URL(hostname='x')), str(SSHRI(hostname='x')))
Example #10
def test_add_version_to_url():
    base_url = "http://ex.com/f.txt"
    base_url_query = "http://ex.com/f.txt?k=v"
    for replace in True, False:
        eq_(add_version_to_url(URL(base_url), "new.id", replace=replace),
            base_url + "?versionId=new.id")

        eq_(add_version_to_url(URL(base_url_query),
                               "new.id", replace=replace),
            base_url_query + "&versionId=new.id")

        expected = "new.id" if replace else "orig.id"
        eq_(add_version_to_url(URL(base_url + "?versionId=orig.id"),
                               "new.id",
                               replace=replace),
            base_url + "?versionId=" + expected)

        eq_(add_version_to_url(URL(base_url_query + "&versionId=orig.id"),
                               "new.id",
                               replace=replace),
            base_url_query + "&versionId=" + expected)
Example #11
    def _parse_url(self, url):
        """Parse url and return archive key, file within archive and
        additional attributes (such as size)"""
        url = URL(url)
        assert (url.scheme == self.URL_SCHEME)
        fdict = url.fragment_dict
        if 'path' not in fdict:
            # must be old-style key/path#size=
            assert '/' in url.path, "must be of key/path format"
            key, path = url.path.split('/', 1)
        else:
            key, path = url.path, fdict.pop('path')
        if 'size' in fdict:
            fdict['size'] = int(fdict['size'])
        return key, path, fdict
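
For reference, a sketch of how URL() dissects a new-style dl+archive URL into the pieces _parse_url() works with (scheme, archive key in the path, file path and size in the fragment); the URL is hypothetical and assumes datalad is installed.

from datalad.support.network import URL

u = URL('dl+archive:KEY#path=1/d2/2d&size=123')   # hypothetical new-style URL
print(u.scheme)          # dl+archive
print(u.path)            # KEY
print(u.fragment_dict)   # expected: {'path': '1/d2/2d', 'size': '123'} -- size still a string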
Example #12
def verify_ria_url(url, cfg):
    """Verify and decode ria url

    Expects a ria-URL pointing to a RIA store, applies rewrites and tries to
    decode potential host and base path for the store from it. Additionally
    raises if `url` is considered invalid.

    ria+ssh://somehost:/path/to/store
    ria+file:///path/to/store

    Parameters
    ----------
    url : str
      URL to verify and decode.
    cfg : dict-like
      Configuration settings for rewrite_url()

    Raises
    ------
    ValueError

    Returns
    -------
    tuple
      (host, base-path, rewritten url)
    """
    from datalad.config import rewrite_url
    from datalad.support.network import URL

    if not url:
        raise ValueError("Got no URL")

    url = rewrite_url(cfg, url)
    url_ri = URL(url)
    if not url_ri.scheme.startswith('ria+'):
        raise ValueError("Missing ria+ prefix in final URL: %s" % url)
    if url_ri.fragment:
        raise ValueError(
            "Unexpected fragment in RIA-store URL: %s" % url_ri.fragment)
    protocol = url_ri.scheme[4:]
    if protocol not in ['ssh', 'file', 'http', 'https']:
        raise ValueError("Unsupported protocol: %s. "
                         "Supported: ssh, file, http(s)" %
                         protocol)

    return url_ri.hostname if protocol != 'file' else None, \
        url_ri.path if url_ri.path else '/', \
        url
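
A sketch of what this variant is expected to return for the two URL forms named in the docstring, assuming no datalad URL rewrite rules are configured (datalad's global config manager is used as cfg); the store paths are hypothetical.

from datalad import cfg   # global ConfigManager; assumed to contain no rewrite rules

print(verify_ria_url('ria+ssh://somehost/path/to/store', cfg))
# expected: ('somehost', '/path/to/store', 'ria+ssh://somehost/path/to/store')
print(verify_ria_url('ria+file:///path/to/store', cfg))
# expected: (None, '/path/to/store', 'ria+file:///path/to/store')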
Example #13
def verify_ria_url(url, cfg):
    """Verify and decode ria url

    Expects a ria-URL pointing to a RIA store, applies rewrites and tries to
    decode potential host and base path for the store from it. Additionally
    raises if `url` is considered invalid.

    ria+ssh://somehost:/path/to/store
    ria+file:///path/to/store

    Parameters
    ----------
    url : str
      URL to verify and decode.
    cfg : dict-like
      Configuration settings for rewrite_url()

    Raises
    ------
    ValueError

    Returns
    -------
    tuple
      (host, base-path, rewritten url)
      `host` is not just a hostname, but is a stub URL that may also contain
      username, password, and port, if specified in a given URL.
    """
    from datalad.config import rewrite_url
    from datalad.support.network import URL

    if not url:
        raise ValueError("Got no URL")

    url = rewrite_url(cfg, url)
    url_ri = URL(url)
    if not url_ri.scheme.startswith('ria+'):
        raise ValueError("Missing ria+ prefix in final URL: %s" % url)
    if url_ri.fragment:
        raise ValueError("Unexpected fragment in RIA-store URL: %s" %
                         url_ri.fragment)
    protocol = url_ri.scheme[4:]
    if protocol not in ['ssh', 'file', 'http', 'https']:
        raise ValueError("Unsupported protocol: %s. "
                         "Supported: ssh, file, http(s)" % protocol)

    host = '{proto}://{user}{pdlm}{passwd}{udlm}{host}{portdlm}{port}'.format(
        proto=protocol,
        user=url_ri.username or '',
        pdlm=':' if url_ri.password else '',
        passwd=url_ri.password or '',
        udlm='@' if url_ri.username else '',
        host=url_ri.hostname or '',
        portdlm=':' if url_ri.port else '',
        port=url_ri.port or '',
    )
    # this != file is critical behavior, if removed, it will ruin the IO selection
    # in RIARemote!!
    return host if protocol != 'file' else None, \
        url_ri.path if url_ri.path else '/', \
        url
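
The same kind of sketch for this variant, which returns a stub URL as host so that username, password, and port survive; the credentials, host, and port below are hypothetical, and no rewrite rules are assumed.

from datalad import cfg   # global ConfigManager; assumed to contain no rewrite rules

print(verify_ria_url('ria+ssh://me:secret@somehost:2222/path/to/store', cfg))
# expected: ('ssh://me:secret@somehost:2222', '/path/to/store',
#            'ria+ssh://me:secret@somehost:2222/path/to/store')
print(verify_ria_url('ria+file:///path/to/store', cfg))
# expected: (None, '/path/to/store', 'ria+file:///path/to/store')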
Example #14
def postclonecfg_ria(ds, props):
    """Configure a dataset freshly cloned from a RIA store"""
    repo = ds.repo
    # RIA uses hashdir mixed, copying data to it via git-annex (if cloned via
    # ssh) would make it see a bare repo and establish a hashdir lower annex
    # object tree.
    # Moreover, we want the ORA remote to receive all data for the store, so its
    # objects could be moved into archives (the main point of a RIA store).
    RIA_REMOTE_NAME = 'origin'  # don't hardcode everywhere
    ds.config.set(
        'remote.{}.annex-ignore'.format(RIA_REMOTE_NAME), 'true',
        where='local')

    # chances are that if this dataset came from a RIA store, its subdatasets
    # may live there too. Place a subdataset source candidate config that makes
    # get probe this RIA store when obtaining subdatasets
    ds.config.set(
        # we use the label 'origin' for this candidate in order to not have to
        # generate a complicated name from the actual source specification.
        # we pick a cost of 200 to sort it before datalad's default candidates
        # for non-RIA URLs, because they prioritize hierarchical layouts that
        # cannot be found in a RIA store
        'datalad.get.subdataset-source-candidate-200origin',
        # use the entire original URL, up to the fragment, plus the dataset ID
        # placeholder; this should make things work with any store setup we
        # support (paths, ports, ...)
        props['source'].split('#', maxsplit=1)[0] + '#{id}',
        where='local')

    # setup publication dependency, if a corresponding special remote exists
    # and was enabled (there could be RIA stores that actually only have repos)
    # make this function be a generator
    ora_remotes = [s for s in ds.siblings('query', result_renderer='disabled')
                   if s.get('annex-externaltype') == 'ora']
    if not ora_remotes and any(
            r.get('externaltype') == 'ora'
            for r in (repo.get_special_remotes().values()
                      if hasattr(repo, 'get_special_remotes')
                      else [])):
        # no ORA remote autoenabled, but the configuration knows about at least
        # one. Let's check origin's config for datalad.ora-remote.uuid as stored
        # by create-sibling-ria and try enabling that one.
        lgr.debug("Found no autoenabled ORA special remote. Trying to look it "
                  "up in source config ...")

        # First figure whether we cloned via SSH, HTTP or local path and then
        # get that config file the same way:
        config_content = None
        scheme = props['giturl'].split(':', 1)[0]
        if scheme in ['http', 'https']:
            try:
                config_content = download_url(
                    "{}{}config".format(
                        props['giturl'],
                        '/' if not props['giturl'].endswith('/') else ''))
            except DownloadError as e:
                lgr.debug("Failed to get config file from source:\n%s",
                          exc_str(e))
        elif scheme == 'ssh':
            # TODO: switch the following to proper command abstraction:
            # SSHRemoteIO ignores the path part ATM. No remote CWD! (To be
            # changed with command abstractions). So we need to get that part to
            # have a valid path to origin's config file:
            cfg_path = PurePosixPath(URL(props['giturl']).path) / 'config'
            op = SSHRemoteIO(props['giturl'])
            try:
                config_content = op.read_file(cfg_path)
            except RIARemoteError as e:
                lgr.debug("Failed to get config file from source: %s",
                          exc_str(e))

        elif scheme == 'file':
            # TODO: switch the following to proper command abstraction:
            op = LocalIO()
            cfg_path = Path(URL(props['giturl']).localpath) / 'config'
            try:
                config_content = op.read_file(cfg_path)
            except (RIARemoteError, OSError) as e:
                lgr.debug("Failed to get config file from source: %s",
                          exc_str(e))
        else:
            lgr.debug("Unknown URL-Scheme %s in %s. Can handle SSH, HTTP or "
                      "FILE scheme URLs.", scheme, props['source'])

        # 3. And read it
        org_uuid = None
        if config_content:
            # TODO: We might be able to spare the saving to a file.
            #       "git config -f -" is not explicitly documented but happens
            #       to work and would read from stdin. Make sure we know this
            #       works for required git versions and on all platforms.
            with make_tempfile(content=config_content) as cfg_file:
                runner = GitWitlessRunner()
                try:
                    result = runner.run(
                        ['git', 'config', '-f', cfg_file,
                         'datalad.ora-remote.uuid'],
                        protocol=StdOutCapture
                    )
                    org_uuid = result['stdout'].strip()
                except CommandError as e:
                    # doesn't contain what we are looking for
                    lgr.debug("Found no UUID for ORA special remote at "
                              "'%s' (%s)", RIA_REMOTE_NAME, exc_str(e))

        # Now, enable it. If annex-init didn't fail to enable it as stored, we
        # wouldn't end up here, so enable with store URL as suggested by the URL
        # we cloned from.
        if org_uuid:
            srs = repo.get_special_remotes()
            if org_uuid in srs.keys():
                # TODO: - Double-check autoenable value and only do this when
                #         true?
                #       - What if still fails? -> Annex shouldn't change config
                #         in that case

                # we only need the store:
                new_url = props['source'].split('#')[0]
                try:
                    repo.enable_remote(srs[org_uuid]['name'],
                                       options=['url={}'.format(new_url)]
                                       )
                    lgr.info("Reconfigured %s for %s",
                             srs[org_uuid]['name'], new_url)
                    # update ora_remotes for considering publication dependency
                    # below
                    ora_remotes = [s for s in
                                   ds.siblings('query',
                                               result_renderer='disabled')
                                   if s.get('annex-externaltype', None) ==
                                   'ora']
                except CommandError as e:
                    lgr.debug("Failed to reconfigure ORA special remote: %s",
                              exc_str(e))
            else:
                lgr.debug("Unknown ORA special remote uuid at '%s': %s",
                          RIA_REMOTE_NAME, org_uuid)
    if ora_remotes:
        if len(ora_remotes) == 1:
            yield from ds.siblings('configure',
                                   name=RIA_REMOTE_NAME,
                                   publish_depends=ora_remotes[0]['name'],
                                   result_filter=None,
                                   result_renderer='disabled')
        else:
            lgr.warning("Found multiple ORA remotes. Couldn't decide which "
                        "publishing to 'origin' should depend on: %s. Consider "
                        "running 'datalad siblings configure -s origin "
                        "--publish-depends ORAREMOTENAME' to set publication "
                        "dependency manually.",
                        [r['name'] for r in ora_remotes])
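
A small sketch of how the subdataset source candidate template is derived from the clone source (see the comments near the ds.config.set() call above): everything up to the fragment is kept and an '{id}' placeholder is appended; the RIA URL and dataset ID are hypothetical.

# Hypothetical clone source as it would appear in props['source']
source = 'ria+ssh://somehost/path/to/store#6d69ca68-7e85-11e6-904c-002590f97d84'
candidate = source.split('#', maxsplit=1)[0] + '#{id}'
print(candidate)   # ria+ssh://somehost/path/to/store#{id}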
Example #15
def get_versioned_url(url,
                      guarantee_versioned=False,
                      return_all=False,
                      verify=False,
                      s3conn=None,
                      update=False):
    """Given a url return a versioned URL

    Originally targeting AWS S3 buckets with versioning enabled

    Parameters
    ----------
    url : string
    guarantee_versioned : bool, optional
      Fail if the bucket is determined to have versioning disabled.  It will
      not fail if we cannot determine whether the bucket is versioned or not.
    return_all: bool, optional
      If True, return a list with URLs for all versions of this file, sorted
      chronologically with the latest first (when possible, e.g. for S3).
      Delete markers are ignored.
    verify: bool, optional
      Verify that the URL is accessible.  As discovered, access to some
      versioned keys might be denied.
    update : bool, optional
      If the URL already contains a version ID, update it to the latest version
      ID.  This option has no effect if return_all is true.

    Returns
    -------
    string or list of string
    """
    url_rec = URL(url)

    s3_bucket, fpath = None, url_rec.path.lstrip('/')

    if url_rec.hostname.endswith('.s3.amazonaws.com'):
        if url_rec.scheme not in ('http', 'https'):
            raise ValueError("Do not know how to handle %s scheme" %
                             url_rec.scheme)
        # we know how to slice this cat
        s3_bucket = url_rec.hostname.split('.', 1)[0]
    elif url_rec.hostname == 's3.amazonaws.com':
        if url_rec.scheme not in ('http', 'https'):
            raise ValueError("Do not know how to handle %s scheme" %
                             url_rec.scheme)
        # url is s3.amazonaws.com/bucket/PATH
        s3_bucket, fpath = fpath.split('/', 1)
    elif url_rec.scheme == 's3':
        s3_bucket = url_rec.hostname  # must be
        # and for now implement magical conversion to URL
        # TODO: wouldn't work if needs special permissions etc
        # actually for now
        raise NotImplementedError

    was_versioned = False
    all_versions = []
    if s3_bucket:
        # TODO: cache
        if s3conn is None:
            # we need to reuse our providers
            from ..downloaders.providers import Providers
            providers = Providers.from_config_files()
            s3url = "s3://%s/" % s3_bucket
            s3provider = providers.get_provider(s3url)
            if s3provider.authenticator.bucket is not None and s3provider.authenticator.bucket.name == s3_bucket:
                # we have established connection before, so let's just reuse
                bucket = s3provider.authenticator.bucket
            else:
                bucket = s3provider.authenticator.authenticate(
                    s3_bucket, s3provider.credential
                )  # s3conn or _get_bucket_connection(S3_TEST_CREDENTIAL)
        else:
            bucket = s3conn.get_bucket(s3_bucket)

        supports_versioning = True  # assume that it does
        try:
            supports_versioning = bucket.get_versioning_status()  # TODO cache
        except S3ResponseError as e:
            # might be forbidden, i.e. "403 Forbidden", so we try anyway
            supports_versioning = 'maybe'

        if supports_versioning:
            all_keys = bucket.list_versions(fpath)
            # Filter and sort them so the newest one on top
            all_keys = [
                x for x in sorted(all_keys,
                                  key=lambda x: (x.last_modified, x.is_latest))
                if ((x.name == fpath)  # match exact name, not just prefix
                    )
            ][::-1]
            # our current assumptions
            assert (all_keys[0].is_latest)
            # and now filter out delete markers etc
            all_keys = [x for x in all_keys
                        if isinstance(x, Key)]  # ignore DeleteMarkers
            assert (all_keys)

            for key in all_keys:
                url_versioned = add_version_to_url(url_rec,
                                                   key.version_id,
                                                   replace=update
                                                   and not return_all)

                all_versions.append(url_versioned)
                if verify:
                    # it would throw HTTPError exception if not accessible
                    _ = urlopen(Request(url_versioned))
                was_versioned = True
                if not return_all:
                    break

    if guarantee_versioned and not was_versioned:
        raise RuntimeError("Could not version %s" % url)

    if not all_versions:
        # we didn't get a chance
        all_versions = [url_rec.as_str()]

    if return_all:
        return all_versions
    else:
        return all_versions[0]
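
For reference, a sketch of the bucket/key dissection that get_versioned_url() performs on the two supported http(s) S3 URL styles (no S3 connection involved); the bucket and key names are hypothetical.

from datalad.support.network import URL

# virtual-hosted style: the bucket is the leading hostname label
u = URL('http://mybucket.s3.amazonaws.com/some/key.txt')
print(u.hostname.split('.', 1)[0], u.path.lstrip('/'))   # mybucket some/key.txt

# path style: the bucket is the first path component
u = URL('https://s3.amazonaws.com/mybucket/some/key.txt')
print(u.path.lstrip('/').split('/', 1))                  # ['mybucket', 'some/key.txt']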
Example #16
    def __call__(urls,
                 dataset=None,
                 path=None,
                 overwrite=False,
                 archive=False,
                 save=True,
                 message=None):
        from ..downloaders.providers import Providers

        ds = None
        if save or dataset:
            try:
                ds = require_dataset(dataset,
                                     check_installed=True,
                                     purpose='downloading urls')
            except NoDatasetFound:
                pass

        common_report = {"action": "download_url", "ds": ds}

        got_ds_instance = isinstance(dataset, Dataset)
        dir_is_target = not path or path.endswith(op.sep)
        path = str(resolve_path(path or op.curdir, ds=dataset))
        if dir_is_target:
            # resolve_path() doesn't preserve trailing separators. Add one for
            # the download() call.
            path = path + op.sep
        urls = ensure_list_from_str(urls)

        if not dir_is_target:
            if len(urls) > 1:
                yield get_status_dict(
                    status="error",
                    message=
                    ("When specifying multiple urls, --path should point to "
                     "a directory target (with a trailing separator). Got %r",
                     path),
                    type="file",
                    path=path,
                    **common_report)
                return
            if archive:
                # make sure the file suffix indicated by a URL is preserved
                # so that any further archive processing doesn't have to
                # employ mime type inspection in order to determine the archive
                # type
                from datalad.support.network import URL
                suffixes = PurePosixPath(URL(urls[0]).path).suffixes
                if not Path(path).suffixes == suffixes:
                    path += ''.join(suffixes)
            # we know that we have a single URL
            # download() would be fine getting an existing directory and
            # downloading the URL underneath it, but let's enforce a trailing
            # slash here for consistency.
            if op.isdir(path):
                yield get_status_dict(
                    status="error",
                    message=(
                        "Non-directory path given (no trailing separator) "
                        "but a directory with that name (after adding archive "
                        "suffix) exists"),
                    type="file",
                    path=path,
                    **common_report)
                return

        # TODO setup fancy ui.progressbars doing this in parallel and reporting overall progress
        # in % of urls which were already downloaded
        providers = Providers.from_config_files()
        downloaded_paths = []
        path_urls = {}
        for url in urls:
            # somewhat "ugly"
            # providers.get_provider(url).get_downloader(url).download(url, path=path)
            # for now -- via sugaring
            try:
                downloaded_path = providers.download(url,
                                                     path=path,
                                                     overwrite=overwrite)
            except Exception as e:
                yield get_status_dict(status="error",
                                      message=exc_str(e),
                                      type="file",
                                      path=path,
                                      **common_report)
            else:
                downloaded_paths.append(downloaded_path)
                path_urls[downloaded_path] = url
                yield get_status_dict(status="ok",
                                      type="file",
                                      path=downloaded_path,
                                      **common_report)

        if downloaded_paths and save and ds is not None:
            msg = message or """\
[DATALAD] Download URLs

URLs:
  {}""".format("\n  ".join(urls))

            for r in Save()(
                    downloaded_paths,
                    message=msg,
                    # ATTN: Pass the original dataset argument to
                    # preserve relative path handling semantics.
                    dataset=dataset,
                    return_type="generator",
                    result_xfm=None,
                    result_filter=None,
                    on_failure="ignore"):
                yield r

            if isinstance(ds.repo, AnnexRepo):
                if got_ds_instance:
                    # Paths in `downloaded_paths` are already relative to the
                    # dataset.
                    rpaths = dict(zip(downloaded_paths, downloaded_paths))
                else:
                    # Paths in `downloaded_paths` are already relative to the
                    # current working directory. Take these relative to the
                    # dataset for use with the AnnexRepo method calls.
                    rpaths = {}
                    for orig_path, resolved in zip(
                            downloaded_paths,
                            resolve_path(downloaded_paths, ds=dataset)):
                        rpath = path_under_rev_dataset(ds, resolved)
                        if rpath:
                            rpaths[str(rpath)] = orig_path
                        else:
                            lgr.warning("Path %s not under dataset %s",
                                        orig_path, ds)
                annex_paths = [
                    p for p, annexed in zip(
                        rpaths, ds.repo.is_under_annex(list(rpaths.keys())))
                    if annexed
                ]
                if annex_paths:
                    for path in annex_paths:
                        url = path_urls[rpaths[path]]
                        try:
                            # The file is already present. This is just to
                            # register the URL.
                            ds.repo.add_url_to_file(
                                path,
                                url,
                                # avoid batch mode for single files
                                # https://github.com/datalad/datalad/issues/2849
                                batch=len(annex_paths) > 1,
                                # bypass URL size check, we already have the file
                                options=['--relaxed'])
                        except CommandError as exc:
                            lgr.warning("Registering %s with %s failed: %s",
                                        path, url, exc_str(exc))

                    if archive:
                        from datalad.api import add_archive_content
                        for path in annex_paths:
                            add_archive_content(path,
                                                annex=ds.repo,
                                                delete=True)
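
A sketch of the suffix-preservation step used above for --archive: the suffixes indicated by the URL's path are appended to the target path when missing; the URL and path are hypothetical.

from pathlib import Path, PurePosixPath
from datalad.support.network import URL

url = 'http://example.com/data/archive.tar.gz'    # hypothetical download URL
path = 'downloaded'                               # hypothetical target, no suffix yet
suffixes = PurePosixPath(URL(url).path).suffixes  # ['.tar', '.gz']
if Path(path).suffixes != suffixes:
    path += ''.join(suffixes)
print(path)   # downloaded.tar.gz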
Example #17
def test_get_url_path_on_fileurls():
    eq_(URL('file:///a').path, '/a')
    eq_(URL('file:///a/b').path, '/a/b')
    eq_(URL('file:///a/b').localpath, '/a/b')
    eq_(URL('file:///a/b#id').path, '/a/b')
    eq_(URL('file:///a/b?whatever').path, '/a/b')
Example #18
def test_url_dicts():
    eq_(URL("http://host").query_dict, {})
Example #19
File: s3.py  Project: datalad/datalad
def get_versioned_url(url, guarantee_versioned=False, return_all=False, verify=False,
                      s3conn=None, update=False):
    """Given a url return a versioned URL

    Originally targeting AWS S3 buckets with versioning enabled

    Parameters
    ----------
    url : string
    guarantee_versioned : bool, optional
      Fail if the bucket is determined to have versioning disabled.  It will
      not fail if we cannot determine whether the bucket is versioned or not.
    return_all: bool, optional
      If True, return a list with URLs for all versions of this file, sorted
      chronologically with the latest first (when possible, e.g. for S3).
      Delete markers are ignored.
    verify: bool, optional
      Verify that the URL is accessible.  As discovered, access to some
      versioned keys might be denied.
    update : bool, optional
      If the URL already contains a version ID, update it to the latest version
      ID.  This option has no effect if return_all is true.

    Returns
    -------
    string or list of string
    """
    url_rec = URL(url)

    s3_bucket, fpath = None, url_rec.path.lstrip('/')

    if url_rec.hostname.endswith('.s3.amazonaws.com'):
        if url_rec.scheme not in ('http', 'https'):
            raise ValueError("Do not know how to handle %s scheme" % url_rec.scheme)
        # we know how to slice this cat
        s3_bucket = url_rec.hostname.split('.', 1)[0]
    elif url_rec.hostname == 's3.amazonaws.com':
        if url_rec.scheme not in ('http', 'https'):
            raise ValueError("Do not know how to handle %s scheme" % url_rec.scheme)
        # url is s3.amazonaws.com/bucket/PATH
        s3_bucket, fpath = fpath.split('/', 1)
    elif url_rec.scheme == 's3':
        s3_bucket = url_rec.hostname  # must be
        # and for now implement magical conversion to URL
        # TODO: wouldn't work if needs special permissions etc
        # actually for now
        raise NotImplementedError

    was_versioned = False
    all_versions = []
    if s3_bucket:
        # TODO: cache
        if s3conn is None:
            # we need to reuse our providers
            from ..downloaders.providers import Providers
            providers = Providers.from_config_files()
            s3url = "s3://%s/" % s3_bucket
            s3provider = providers.get_provider(s3url)
            if s3provider.authenticator.bucket is not None and s3provider.authenticator.bucket.name == s3_bucket:
                # we have established connection before, so let's just reuse
                bucket = s3provider.authenticator.bucket
            else:
                bucket = s3provider.authenticator.authenticate(s3_bucket, s3provider.credential)  # s3conn or _get_bucket_connection(S3_TEST_CREDENTIAL)
        else:
            bucket = s3conn.get_bucket(s3_bucket)

        supports_versioning = True  # assume that it does
        try:
            supports_versioning = bucket.get_versioning_status()  # TODO cache
        except S3ResponseError as e:
            # might be forbidden, i.e. "403 Forbidden", so we try anyway
            supports_versioning = 'maybe'

        if supports_versioning:
            all_keys = bucket.list_versions(fpath)
            # Filter and sort them so the newest one on top
            all_keys = [x for x in sorted(all_keys, key=lambda x: (x.last_modified, x.is_latest))
                        if ((x.name == fpath)  # match exact name, not just prefix
                            )
                        ][::-1]
            # our current assumptions
            assert(all_keys[0].is_latest)
            # and now filter out delete markers etc
            all_keys = [x for x in all_keys if isinstance(x, Key)]  # ignore DeleteMarkers
            assert(all_keys)

            for key in all_keys:
                url_versioned = add_version_to_url(
                    url_rec, key.version_id, replace=update and not return_all)

                all_versions.append(url_versioned)
                if verify:
                    # it would throw HTTPError exception if not accessible
                    _ = urlopen(Request(url_versioned))
                was_versioned = True
                if not return_all:
                    break

    if guarantee_versioned and not was_versioned:
        raise RuntimeError("Could not version %s" % url)

    if not all_versions:
        # we didn't get a chance
        all_versions = [url_rec.as_str()]

    if return_all:
        return all_versions
    else:
        return all_versions[0]