# Imports assumed from datalad's test-suite layout.
from datalad.downloaders.tests.utils import get_test_providers
from datalad.support.s3 import get_versioned_url
from datalad.tests.utils import assert_raises, eq_, ok_startswith


def test_get_versioned_url():
    # to verify having credentials to access openfmri via S3
    get_test_providers('s3://openfmri/tarballs')

    for url_pref in ('http://openfmri.s3.amazonaws.com',
                     'https://s3.amazonaws.com/openfmri'):
        eq_(get_versioned_url(url_pref + "/tarballs/ds001_raw.tgz"),
            url_pref + "/tarballs/ds001_raw.tgz?versionId=null")

        eq_(get_versioned_url(url_pref + "/tarballs/ds001_raw.tgz?param=1"),
            url_pref + "/tarballs/ds001_raw.tgz?param=1&versionId=null")

        # We don't duplicate the version if it already exists.
        eq_(get_versioned_url(url_pref +
                              "/tarballs/ds001_raw.tgz?versionId=null"),
            url_pref + "/tarballs/ds001_raw.tgz?versionId=null")

    # something is wrong there
    # print(get_versioned_url("http://openfmri.s3.amazonaws.com/ds001/demographics.txt"))

    # should just return the original one
    eq_(get_versioned_url("someurl"), "someurl")
    assert_raises(RuntimeError, get_versioned_url, "someurl",
                  guarantee_versioned=True)

    # TODO: on a bucket without versioning
    url = "http://datalad-test0-nonversioned.s3.amazonaws.com/2versions-removed-recreated.txt"
    eq_(get_versioned_url(url), url)
    eq_(get_versioned_url(url, return_all=True), [url])

    assert_raises(NotImplementedError, get_versioned_url, "s3://buga")

    urls = get_versioned_url(
        "http://datalad-test0-versioned.s3.amazonaws.com/2versions-removed-recreated.txt",
        return_all=True,
        verify=True)
    eq_(len(set(urls)), len(urls))  # all unique
    for url in urls:
        # so we didn't grab other files along with the same prefix
        ok_startswith(
            url,
            'http://datalad-test0-versioned.s3.amazonaws.com/'
            '2versions-removed-recreated.txt?versionId=')

    # Update a versioned URL with a newer version tag.
    url_3ver = "http://datalad-test0-versioned.s3.amazonaws.com/3versions-allversioned.txt"
    url_3ver_input = url_3ver + "?versionId=b.qCuh7Sg58VIYj8TVHzbRS97EvejzEl"
    eq_(get_versioned_url(url_3ver_input), url_3ver_input)
    eq_(get_versioned_url(url_3ver_input, update=True),
        url_3ver + "?versionId=Kvuind11HZh._dCPaDAb0OY9dRrQoTMn")
def test_version_url_deleted():
    # to verify having credentials to access the datalad-test0-versioned
    # bucket via S3
    get_test_providers('s3://datalad-test0-versioned/', reload=True)

    # it existed and then was removed
    fpath = "1version-removed.txt"
    url = "http://datalad-test0-versioned.s3.amazonaws.com/%s" % fpath
    turl = "http://datalad-test0-versioned.s3.amazonaws.com/%s" \
           "?versionId=eZ5Hgwo8azfBv3QT7aW9dmm2sbLUY.QP" % fpath
    eq_(get_versioned_url(url), turl)
def test_get_versioned_url_anon():
    # The one without any authenticator was crashing. It also triggered
    # another bug about having a "." in the bucket name.
    url_on = "http://openneuro.org.s3.amazonaws.com/ds000001/dataset_description.json"
    url_on_versioned = get_versioned_url(url_on)
    ok_startswith(url_on_versioned, url_on + "?versionId=")
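
# A minimal usage sketch (not part of the test suite) distilled from the
# behavior exercised above. The bucket/key names here are made up for
# illustration, and running this against a real bucket needs network access
# and, for private buckets, credentials; get_versioned_url itself lives in
# datalad.support.s3.
def _example_get_versioned_url():
    from datalad.support.s3 import get_versioned_url

    # Non-S3 URLs are passed through unchanged.
    assert get_versioned_url("http://example.com/file.txt") == \
        "http://example.com/file.txt"

    # S3-backed URLs gain a versionId query parameter, e.g.
    # "http://some-bucket.s3.amazonaws.com/key?versionId=<opaque-id>";
    # the exact id depends on the bucket's versioning state.
    url = get_versioned_url("http://some-bucket.s3.amazonaws.com/key")
    assert url.startswith("http://some-bucket.s3.amazonaws.com/key")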
def __call__(dataset, urlfile, urlformat, filenameformat,
             input_type="ext", exclude_autometa=None, meta=None,
             message=None, dry_run=False, fast=False, ifexists=None,
             missing_value=None, save=True, version_urls=False):
    # Temporarily work around gh-2269.
    url_file = urlfile
    url_format, filename_format = urlformat, filenameformat

    from requests.exceptions import RequestException

    from datalad.distribution.add import Add
    from datalad.distribution.create import Create
    from datalad.distribution.dataset import Dataset, require_dataset
    from datalad.interface.results import get_status_dict
    from datalad.support.annexrepo import AnnexRepo

    lgr = logging.getLogger("datalad.plugin.addurls")

    dataset = require_dataset(dataset, check_installed=False)
    if dataset.repo and not isinstance(dataset.repo, AnnexRepo):
        yield get_status_dict(action="addurls",
                              ds=dataset,
                              status="error",
                              message="not an annex repo")
        return

    if input_type == "ext":
        extension = os.path.splitext(url_file)[1]
        input_type = "json" if extension == ".json" else "csv"

    with open(url_file) as fd:
        try:
            rows, subpaths = extract(fd, input_type,
                                     url_format, filename_format,
                                     exclude_autometa, meta,
                                     dry_run,
                                     missing_value)
        except (ValueError, RequestException) as exc:
            yield get_status_dict(action="addurls",
                                  ds=dataset,
                                  status="error",
                                  message=exc_str(exc))
            return

    if len(rows) != len(set(row["filename"] for row in rows)):
        yield get_status_dict(action="addurls",
                              ds=dataset,
                              status="error",
                              message=("There are file name collisions; "
                                       "consider using {_repindex}"))
        return

    if dry_run:
        for subpath in subpaths:
            lgr.info("Would create a subdataset at %s", subpath)
        for row in rows:
            lgr.info("Would download %s to %s",
                     row["url"],
                     os.path.join(dataset.path, row["filename"]))
            lgr.info("Metadata: %s",
                     sorted(u"{}={}".format(k, v)
                            for k, v in row["meta_args"].items()))
        yield get_status_dict(action="addurls",
                              ds=dataset,
                              status="ok",
                              message="dry-run finished")
        return

    if not dataset.repo:
        # Populate a new dataset with the URLs.
        for r in dataset.create(result_xfm=None,
                                return_type='generator',
                                save=save):
            yield r

    annex_options = ["--fast"] if fast else []

    for spath in subpaths:
        if os.path.exists(os.path.join(dataset.path, spath)):
            lgr.warning("Not creating subdataset at existing path: %s",
                        spath)
        else:
            for r in dataset.create(spath, result_xfm=None,
                                    return_type='generator', save=save):
                yield r

    for row in rows:
        # Add additional information that we'll need for various
        # operations.
        filename_abs = os.path.join(dataset.path, row["filename"])
        if row["subpath"]:
            ds_current = Dataset(os.path.join(dataset.path,
                                              row["subpath"]))
            ds_filename = os.path.relpath(filename_abs, ds_current.path)
        else:
            ds_current = dataset
            ds_filename = row["filename"]
        row.update({"filename_abs": filename_abs,
                    "ds": ds_current,
                    "ds_filename": ds_filename})

    if version_urls:
        num_urls = len(rows)
        log_progress(lgr.info, "addurls_versionurls",
                     "Versioning %d URLs", num_urls,
                     label="Versioning URLs",
                     total=num_urls, unit=" URLs")
        for row in rows:
            url = row["url"]
            try:
                row["url"] = get_versioned_url(url)
            except (ValueError, NotImplementedError) as exc:
                # We don't expect this to happen because get_versioned_url
                # should return the original URL if it isn't an S3 bucket.
                # It only raises exceptions if it doesn't know how to
                # handle the scheme for what looks like an S3 bucket.
                lgr.warning("error getting version of %s: %s",
                            row["url"], exc_str(exc))
            log_progress(lgr.info, "addurls_versionurls",
                         "Versioned result for %s: %s", url, row["url"],
                         update=1, increment=True)
        log_progress(lgr.info, "addurls_versionurls",
                     "Finished versioning URLs")

    files_to_add = set()
    for r in add_urls(rows, ifexists=ifexists, options=annex_options):
        if r["status"] == "ok":
            files_to_add.add(r["path"])
        yield r

    msg = message or """\
[DATALAD] add files from URLs

url_file='{}'
url_format='{}'
filename_format='{}'""".format(url_file, url_format, filename_format)

    if files_to_add:
        for r in dataset.add(files_to_add, save=False):
            yield r

        meta_rows = [r for r in rows if r["filename_abs"] in files_to_add]
        for r in add_meta(meta_rows):
            yield r

        # Save here rather than the add call above to trigger a metadata
        # commit on the git-annex branch.
        if save:
            for r in dataset.save(message=msg, recursive=True):
                yield r
def __call__(dataset, urlfile, urlformat, filenameformat,
             input_type="ext", exclude_autometa=None, meta=None,
             message=None, dry_run=False, fast=False, ifexists=None,
             missing_value=None, save=True, version_urls=False):
    # Temporarily work around gh-2269.
    url_file = urlfile
    url_format, filename_format = urlformat, filenameformat

    from requests.exceptions import RequestException

    from datalad.distribution.dataset import Dataset, require_dataset
    from datalad.interface.results import get_status_dict
    from datalad.support.annexrepo import AnnexRepo

    lgr = logging.getLogger("datalad.plugin.addurls")

    dataset = require_dataset(dataset, check_installed=False)
    if dataset.repo and not isinstance(dataset.repo, AnnexRepo):
        yield get_status_dict(action="addurls",
                              ds=dataset,
                              status="error",
                              message="not an annex repo")
        return

    if input_type == "ext":
        extension = os.path.splitext(url_file)[1]
        input_type = "json" if extension == ".json" else "csv"

    with open(url_file) as fd:
        try:
            rows, subpaths = extract(fd, input_type,
                                     url_format, filename_format,
                                     exclude_autometa, meta,
                                     dry_run,
                                     missing_value)
        except (ValueError, RequestException) as exc:
            yield get_status_dict(action="addurls",
                                  ds=dataset,
                                  status="error",
                                  message=exc_str(exc))
            return

    if len(rows) != len(set(row["filename"] for row in rows)):
        yield get_status_dict(action="addurls",
                              ds=dataset,
                              status="error",
                              message=("There are file name collisions; "
                                       "consider using {_repindex}"))
        return

    if dry_run:
        for subpath in subpaths:
            lgr.info("Would create a subdataset at %s", subpath)
        for row in rows:
            lgr.info("Would download %s to %s",
                     row["url"],
                     os.path.join(dataset.path, row["filename"]))
            lgr.info("Metadata: %s",
                     sorted(u"{}={}".format(k, v)
                            for k, v in row["meta_args"].items()))
        yield get_status_dict(action="addurls",
                              ds=dataset,
                              status="ok",
                              message="dry-run finished")
        return

    if not dataset.repo:
        # Populate a new dataset with the URLs.
        for r in dataset.create(result_xfm=None,
                                return_type='generator'):
            yield r

    annex_options = ["--fast"] if fast else []

    for spath in subpaths:
        if os.path.exists(os.path.join(dataset.path, spath)):
            lgr.warning("Not creating subdataset at existing path: %s",
                        spath)
        else:
            for r in dataset.create(spath, result_xfm=None,
                                    return_type='generator'):
                yield r

    for row in rows:
        # Add additional information that we'll need for various
        # operations.
        filename_abs = os.path.join(dataset.path, row["filename"])
        if row["subpath"]:
            ds_current = Dataset(os.path.join(dataset.path,
                                              row["subpath"]))
            ds_filename = os.path.relpath(filename_abs, ds_current.path)
        else:
            ds_current = dataset
            ds_filename = row["filename"]
        row.update({"filename_abs": filename_abs,
                    "ds": ds_current,
                    "ds_filename": ds_filename})

    if version_urls:
        num_urls = len(rows)
        log_progress(lgr.info, "addurls_versionurls",
                     "Versioning %d URLs", num_urls,
                     label="Versioning URLs",
                     total=num_urls, unit=" URLs")
        for row in rows:
            url = row["url"]
            try:
                row["url"] = get_versioned_url(url)
            except (ValueError, NotImplementedError) as exc:
                # We don't expect this to happen because get_versioned_url
                # should return the original URL if it isn't an S3 bucket.
                # It only raises exceptions if it doesn't know how to
                # handle the scheme for what looks like an S3 bucket.
                lgr.warning("error getting version of %s: %s",
                            row["url"], exc_str(exc))
            log_progress(lgr.info, "addurls_versionurls",
                         "Versioned result for %s: %s", url, row["url"],
                         update=1, increment=True)
        log_progress(lgr.info, "addurls_versionurls",
                     "Finished versioning URLs")

    files_to_add = set()
    for r in add_urls(rows, ifexists=ifexists, options=annex_options):
        if r["status"] == "ok":
            files_to_add.add(r["path"])
        yield r

    msg = message or """\
[DATALAD] add files from URLs

url_file='{}'
url_format='{}'
filename_format='{}'""".format(url_file, url_format, filename_format)

    if files_to_add:
        meta_rows = [r for r in rows if r["filename_abs"] in files_to_add]
        for r in add_meta(meta_rows):
            yield r

        if save:
            for r in dataset.save(path=files_to_add,
                                  message=msg,
                                  recursive=True):
                yield r