def _get_content_metadata(self):
    """Get ALL metadata for all dataset content.

    Returns
    -------
    generator((location, metadata_dict))
    """
    log_progress(
        lgr.info,
        'extractordataladcore',
        'Start core metadata extraction from %s', self.ds,
        total=len(self.paths),
        label='Core metadata extraction',
        unit=' Files',
    )
    if not isinstance(self.ds.repo, AnnexRepo):
        for p in self.paths:
            # this extractor gives a response for ANY file as it serves
            # as an indicator of file presence (i.e. a file list) in the
            # content metadata, even if we know nothing but the filename
            # about a file
            yield (p, dict())
        log_progress(
            lgr.info,
            'extractordataladcore',
            'Finished core metadata extraction from %s', self.ds
        )
        return
    valid_paths = None
    if self.paths and sum(len(i) for i in self.paths) > 500000:
        valid_paths = set(self.paths)
    # Availability information
    for file, whereis in self.ds.repo.whereis(
            self.paths if self.paths and valid_paths is None else '.',
            output='full').items():
        if file.startswith('.datalad') or valid_paths and file not in valid_paths:
            # do not report on our own internal annexed files (e.g. metadata blobs)
            continue
        log_progress(
            lgr.info,
            'extractordataladcore',
            'Extracted core metadata from %s', file,
            update=1,
            increment=True)
        # pull out proper (public) URLs
        # TODO possibly extend with special remote info later on
        meta = {'url': whereis[remote].get('urls', [])
                for remote in whereis
                # "web" remote
                if remote == "00000000-0000-0000-0000-000000000001"
                and whereis[remote].get('urls', None)}
        yield (file, meta)
    log_progress(
        lgr.info,
        'extractordataladcore',
        'Finished core metadata extraction from %s', self.ds
    )
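# The functions in this collection share a three-step progress-reporting
# convention: one "start" call that sets total/label/unit, one per-item call
# with update=1/increment=True, and a final "finished" call without counters.
# Below is a minimal, self-contained sketch of that pattern, assuming the
# `log_progress` helper lives in datalad.log as in DataLad's code base; the
# progress-bar id, logger name, and item list are made up for illustration.
import logging

from datalad.log import log_progress

lgr = logging.getLogger('datalad.example')


def process_items(items):
    pbar_id = 'exampleprogress'          # a unique id ties the three calls together
    log_progress(
        lgr.info, pbar_id,
        'Start processing %d items', len(items),
        total=len(items), label='Processing', unit=' Items',
    )
    for item in items:
        log_progress(
            lgr.info, pbar_id,
            'Processing %s', item,
            update=1, increment=True)    # advance the bar by one item
        yield item
    log_progress(lgr.info, pbar_id, 'Finished processing')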
def get_metadata(self, dataset, content):
    if not content:
        return {}, []
    log_progress(
        lgr.info,
        'extractoraudio',
        'Start audio metadata extraction from %s', self.ds,
        total=len(self.paths),
        label='audio metadata extraction',
        unit=' Files',
    )
    contentmeta = []
    for f in self.paths:
        absfp = opj(self.ds.path, f)
        log_progress(
            lgr.info,
            'extractoraudio',
            'Extract audio metadata from %s', absfp,
            update=1,
            increment=True)
        info = audiofile(absfp, easy=True)
        if info is None:
            continue
        meta = {
            vocab_map.get(k, k): info[k][0]
            if isinstance(info[k], list) and len(info[k]) == 1 else info[k]
            for k in info
        }
        if hasattr(info, 'mime') and len(info.mime):
            meta['format'] = 'mime:{}'.format(info.mime[0])
        for k in ('length', 'channels', 'bitrate', 'sample_rate'):
            if hasattr(info.info, k):
                val = getattr(info.info, k)
                if k == 'length':
                    # duration comes in seconds, cap at millisecond level
                    val = round(val, 3)
                meta[vocab_map.get(k, k)] = val
        contentmeta.append((f, meta))
    log_progress(
        lgr.info,
        'extractoraudio',
        'Finished audio metadata extraction from %s', self.ds
    )
    return {
        '@context': {
            'music': {
                '@id': 'http://purl.org/ontology/mo/',
                'description': 'Music Ontology with main concepts and properties for describing music',
                'type': vocabulary_id,
            },
            'duration(s)': {
                "@id": 'time:Duration',
                "unit": "uo:0000010",
                'unit_label': 'second',
            },
        },
    }, \
        contentmeta
def get_metadata(self, dataset, content):
    if not content:
        return {}, []
    contentmeta = []
    log_progress(
        lgr.info,
        'extractorimage',
        'Start image metadata extraction from %s', self.ds,
        total=len(self.paths),
        label='image metadata extraction',
        unit=' Files',
    )
    for f in self.paths:
        absfp = opj(self.ds.path, f)
        log_progress(
            lgr.info,
            'extractorimage',
            'Extract image metadata from %s', absfp,
            update=1,
            increment=True)
        try:
            img = Image.open(absfp)
        except Exception as e:
            lgr.debug("Image metadata extractor failed to load %s: %s",
                      absfp, exc_str(e))
            continue
        meta = {
            'type': 'dctype:Image',
        }
        # run all extractors
        meta.update({k: v(img) for k, v in self._extractors.items()})
        # filter useless fields (empty strings and NaNs)
        meta = {k: v for k, v in meta.items()
                if not (hasattr(v, '__len__') and not len(v))}
        contentmeta.append((f, meta))
    log_progress(
        lgr.info,
        'extractorimage',
        'Finished image metadata extraction from %s', self.ds
    )
    return {
        '@context': vocabulary,
    }, \
        contentmeta
def get_metadata(self, dataset, content):
    if not content:
        return {}, []
    log_progress(
        lgr.info,
        'extractorexif',
        'Start EXIF metadata extraction from %s', self.ds,
        total=len(self.paths),
        label='EXIF metadata extraction',
        unit=' Files',
    )
    contentmeta = []
    for f in self.paths:
        absfp = opj(self.ds.path, f)
        log_progress(
            lgr.info,
            'extractorexif',
            'Extract EXIF metadata from %s', absfp,
            update=1,
            increment=True)
        # TODO we might want to do some more elaborate extraction in the future
        # but for now plain EXIF, no maker extensions, no thumbnails
        info = process_file(open(opj(self.ds.path, f), 'rb'), details=False)
        if not info:
            # got nothing, likely nothing there
            continue
        meta = {
            k.split()[-1]: _return_as_appropriate_dtype(info[k].printable)
            for k in info}
        contentmeta.append((f, meta))
    log_progress(
        lgr.info,
        'extractorexif',
        'Finished EXIF metadata extraction from %s', self.ds
    )
    return {
        '@context': {
            'exif': {
                '@id': 'http://www.w3.org/2003/12/exif/ns/',
                'description': 'Vocabulary to describe an Exif format picture data',
                'type': vocabulary_id,
            },
        },
    }, \
        contentmeta
def branch_blobs(repo, branch):
    """Get all blobs for `branch`.

    Parameters
    ----------
    repo : GitRepo
    branch : str

    Returns
    -------
    A generator object that returns (hexsha, content, file name) for each
    blob in `branch`.  Note: By design a blob isn't tied to a particular
    file name; the returned file name matches what is returned by
    'git rev-list'.
    """
    git = repo.repo.git
    # Note: This might be nicer with rev-list's --filter and
    # --filter-print-omitted, but those aren't available until Git v2.16.
    lines = git.rev_list(branch, objects=True).splitlines()
    # Trees and blobs have an associated path printed.
    objects = (ln.split() for ln in lines)
    blob_trees = [obj for obj in objects if len(obj) == 2]

    num_objects = len(blob_trees)
    log_progress(lgr.info, "repodates_branch_blobs",
                 "Checking %d objects", num_objects,
                 label="Checking objects", total=num_objects, unit=" objects")
    # This is inefficient.  It makes a git call for each object, some of which
    # aren't even blobs.  We could instead use 'git cat-file --batch'.
    for obj, fname in blob_trees:
        log_progress(lgr.info, "repodates_branch_blobs",
                     "Checking %s", obj,
                     increment=True, update=1)
        try:
            yield obj, git.cat_file("blob", obj), fname
        except GitCommandError:
            # The object was a tree.
            continue
    log_progress(lgr.info, "repodates_branch_blobs",
                 "Finished checking %d objects", num_objects)
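# The comment above notes that per-object `git cat-file` calls are inefficient
# and that 'git cat-file --batch' could be used instead. The following is a
# hedged, untested sketch of what such a batched reader might look like using
# subprocess directly (not GitPython); it is not what branch_blobs() does.
import subprocess


def batch_read_blobs(repo_path, hexshas):
    """Yield (hexsha, content_bytes) for each blob in `hexshas`."""
    proc = subprocess.Popen(
        ['git', 'cat-file', '--batch'],
        cwd=repo_path,
        stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    try:
        for sha in hexshas:
            proc.stdin.write((sha + '\n').encode())
            proc.stdin.flush()
            # header is '<sha> <type> <size>' or '<sha> missing'
            header = proc.stdout.readline().decode().split()
            if len(header) == 2:
                continue                      # missing object, no body follows
            size = int(header[2])
            body = proc.stdout.read(size)
            proc.stdout.read(1)               # consume trailing newline
            if header[1] == 'blob':           # skip trees, commits, tags
                yield sha, body
    finally:
        proc.stdin.close()
        proc.wait()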
def branch_blobs_in_tree(repo, branch):
    """Get all blobs for the current tree of `branch`.

    Parameters
    ----------
    repo : GitRepo
    branch : str, optional

    Returns
    -------
    A generator object that returns (hexsha, content, file name) for each
    blob.  Note: If there are multiple files in the tree that point to the
    blob, only the first file name that is reported by 'git ls-tree' is
    used (i.e., one entry per blob is yielded).
    """
    seen_blobs = set()
    git = repo.repo.git
    out = git.ls_tree(branch, z=True, r=True)
    if out:
        lines = out.strip("\0").split("\0")
        num_lines = len(lines)
        log_progress(lgr.info, "repodates_blobs_in_tree",
                     "Checking %d objects in git-annex tree", num_lines,
                     label="Checking objects", total=num_lines,
                     unit=" objects")
        for line in lines:
            _, obj_type, obj, fname = line.split()
            log_progress(lgr.info, "repodates_blobs_in_tree",
                         "Checking %s", obj,
                         increment=True, update=1)
            if obj_type == "blob" and obj not in seen_blobs:
                yield obj, git.cat_file("blob", obj), fname
            seen_blobs.add(obj)
        log_progress(lgr.info, "repodates_blobs_in_tree",
                     "Finished checking %d blobs", num_lines)
def add_extra_filename_values(filename_format, rows, urls, dry_run):
    """Extend `rows` with values for special formatting fields.
    """
    file_fields = list(get_fmt_names(filename_format))
    if any(i.startswith("_url") for i in file_fields):
        for row, url in zip(rows, urls):
            row.update(get_url_parts(url))

    if any(i.startswith("_url_filename") for i in file_fields):
        if dry_run:  # Don't waste time making requests.
            dummy = get_file_parts("BASE.EXT", "_url_filename")
            for idx, row in enumerate(rows):
                row.update(
                    {k: v + str(idx) for k, v in dummy.items()})
        else:
            num_urls = len(urls)
            log_progress(lgr.info, "addurls_requestnames",
                         "Requesting file names for %d URLs", num_urls,
                         label="Requesting names", total=num_urls,
                         unit=" Files")
            for row, url in zip(rows, urls):
                # If we run into any issues here, we're just going to raise an
                # exception and then abort inside dlplugin.  It'd be good to
                # disentangle this from `extract` so that we could yield an
                # individual error, drop the row, and keep going.
                filename = get_url_filename(url)
                if filename:
                    row.update(get_file_parts(filename, "_url_filename"))
                else:
                    raise ValueError(
                        "{} does not contain a filename".format(url))
                log_progress(lgr.info, "addurls_requestnames",
                             "%s returned for %s", url, filename,
                             update=1, increment=True)
            log_progress(lgr.info, "addurls_requestnames",
                         "Finished requesting file names")
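# add_extra_filename_values() relies on a get_fmt_names() helper to discover
# which placeholder fields a format string references. One plausible shape of
# that helper, built on the standard library's string.Formatter; the actual
# implementation in the addurls module is not shown here and may differ.
import string


def get_fmt_names(format_string):
    """Yield field names referenced by `format_string`."""
    for _, name, _, _ in string.Formatter().parse(format_string):
        if name:
            yield name


# e.g. list(get_fmt_names("{_url_hostname}/{_url_basename}"))
#   -> ['_url_hostname', '_url_basename']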
def _yield_res_from_pre2019_extractor(
        ds, name, extractor_cls, process_type, paths):  # pragma: no cover
    """This implements dealing with our first extractor class concept"""

    want_dataset_meta = process_type in ('all', 'dataset') \
        if process_type else ds.config.obtain(
            'datalad.metadata.extract-dataset-{}'.format(
                name.replace('_', '-')),
            default=True,
            valtype=EnsureBool())
    want_content_meta = process_type in ('all', 'content') \
        if process_type else ds.config.obtain(
            'datalad.metadata.extract-content-{}'.format(
                name.replace('_', '-')),
            default=True,
            valtype=EnsureBool())

    if not (want_dataset_meta or want_content_meta):  # pragma: no cover
        log_progress(
            lgr.info,
            'metadataextractors',
            'Skipping %s metadata extraction from %s, '
            'disabled by configuration',
            name, ds,
        )
        return

    try:
        extractor = extractor_cls(ds, paths)
    except Exception as e:  # pragma: no cover
        log_progress(
            lgr.error,
            'metadataextractors',
            'Failed %s metadata extraction from %s', name, ds,
        )
        raise ValueError(
            "Failed to load metadata extractor for '%s', "
            "broken dataset configuration (%s)?: %s",
            name, ds, exc_str(e))

    # this is the old way of extractor operation
    dsmeta_t, contentmeta_t = extractor.get_metadata(
        dataset=want_dataset_meta,
        content=want_content_meta,
    )

    # fake the new way of reporting results directly
    # extractors had no way to report errors, hence
    # everything is unconditionally 'ok'
    for loc, meta in contentmeta_t or []:
        yield dict(
            status='ok',
            path=loc,
            type='file',
            metadata=meta,
        )
    yield dict(
        status='ok',
        path=ds.path,
        type='dataset',
        metadata=dsmeta_t,
    )
def __call__( url, name, dataset=None, storage_name=None, post_update_hook=False, shared=None, group=None, storage_sibling=True, existing='error', trust_level=None, recursive=False, recursion_limit=None, disable_storage__=None, ): if disable_storage__ is not None: import warnings warnings.warn( "datalad-create-sibling-ria --no-storage-sibling " "is deprecated, use --storage-sibling off instead.", DeprecationWarning) # recode to new setup disable_storage__ = None storage_sibling = False if storage_sibling == 'only' and storage_name: lgr.warning( "Sibling name will be used for storage sibling in " "storage-sibling-only mode, but a storage sibling name " "was provided") ds = require_dataset(dataset, check_installed=True, purpose='create sibling RIA') res_kwargs = dict( ds=ds, action="create-sibling-ria", logger=lgr, ) # parse target URL try: ssh_host, base_path, rewritten_url = verify_ria_url(url, ds.config) except ValueError as e: yield get_status_dict(status='error', message=str(e), **res_kwargs) return if ds.repo.get_hexsha() is None or ds.id is None: raise RuntimeError("Repository at {} is not a DataLad dataset, " "run 'datalad create [--force]' first.".format( ds.path)) if not storage_sibling and storage_name: lgr.warning( "Storage sibling setup disabled, but a storage sibling name " "was provided") if storage_sibling and not storage_name: storage_name = "{}-storage".format(name) if storage_sibling and name == storage_name: # leads to unresolvable, circular dependency with publish-depends raise ValueError("sibling names must not be equal") if not isinstance(url, str): raise TypeError("url is not a string, but %s" % type(url)) # Query existing siblings upfront in order to fail early on # existing=='error', since misconfiguration (particularly of special # remotes) only to fail in a subdataset later on with that config, can # be quite painful. # TODO: messages - this is "create-sibling". Don't confuse existence of # local remotes with existence of the actual remote sibling # in wording if existing == 'error': # in recursive mode this check could take a substantial amount of # time: employ a progress bar (or rather a counter, because we don't # know the total in advance pbar_id = 'check-siblings-{}'.format(id(ds)) log_progress( lgr.info, pbar_id, 'Start checking pre-existing sibling configuration %s', ds, label='Query siblings', unit=' Siblings', ) # even if we have to fail, let's report all conflicting siblings # in subdatasets failed = False for r in ds.siblings(result_renderer=None, recursive=recursive, recursion_limit=recursion_limit): log_progress(lgr.info, pbar_id, 'Discovered sibling %s in dataset at %s', r['name'], r['path'], update=1, increment=True) if not r['type'] == 'sibling' or r['status'] != 'ok': # this is an internal status query that has not consequence # for the outside world. 
Be silent unless something useful # can be said #yield r continue if r['name'] == name: res = get_status_dict( status='error', message="a sibling '{}' is already configured in " "dataset {}".format(name, r['path']), **res_kwargs, ) failed = True yield res continue if storage_name and r['name'] == storage_name: res = get_status_dict( status='error', message="a sibling '{}' is already configured in " "dataset {}".format(storage_name, r['path']), **res_kwargs, ) failed = True yield res continue log_progress( lgr.info, pbar_id, 'Finished checking pre-existing sibling configuration %s', ds, ) if failed: return # TODO: - URL parsing + store creation needs to be RF'ed based on # command abstractions # - more generally consider store creation a dedicated command or # option # Note: URL parsing is done twice ATM (for top-level ds). This can't be # reduced to single instance, since rewriting url based on config could # be different for subdatasets. create_store( SSHRemoteIO(ssh_host) if ssh_host else LocalIO(), Path(base_path), '1') yield from _create_sibling_ria(ds, url, name, storage_sibling, storage_name, existing, shared, group, post_update_hook, trust_level, res_kwargs) if recursive: # Note: subdatasets can be treated independently, so go full # recursion when querying for them and _no_recursion with the # actual call. Theoretically this can be parallelized. for subds in ds.subdatasets(fulfilled=True, recursive=True, recursion_limit=recursion_limit, result_xfm='datasets'): yield from _create_sibling_ria(subds, url, name, storage_sibling, storage_name, existing, shared, group, post_update_hook, trust_level, res_kwargs)
def __call__( url, name, *, # note that `name` is required but not posarg in CLI dataset=None, storage_name=None, alias=None, post_update_hook=False, shared=None, group=None, storage_sibling=True, existing='error', new_store_ok=False, trust_level=None, recursive=False, recursion_limit=None, disable_storage__=None, push_url=None): if disable_storage__ is not None: import warnings warnings.warn( "datalad-create-sibling-ria --no-storage-sibling " "is deprecated, use --storage-sibling off instead.", DeprecationWarning) # recode to new setup disable_storage__ = None storage_sibling = False if storage_sibling == 'only' and storage_name: lgr.warning( "Sibling name will be used for storage sibling in " "storage-sibling-only mode, but a storage sibling name " "was provided") ds = require_dataset(dataset, check_installed=True, purpose='create RIA sibling(s)') res_kwargs = dict( ds=ds, action="create-sibling-ria", logger=lgr, ) # parse target URL # Note: URL parsing is done twice ATM (for top-level ds). This can't be # reduced to single instance, since rewriting url based on config could # be different for subdatasets. try: ssh_host, base_path, rewritten_url = \ verify_ria_url(push_url if push_url else url, ds.config) except ValueError as e: yield get_status_dict(status='error', message=str(e), **res_kwargs) return if ds.repo.get_hexsha() is None or ds.id is None: raise RuntimeError("Repository at {} is not a DataLad dataset, " "run 'datalad create [--force]' first.".format( ds.path)) if not storage_sibling and storage_name: lgr.warning( "Storage sibling setup disabled, but a storage sibling name " "was provided") if storage_sibling and not storage_name: storage_name = "{}-storage".format(name) if storage_sibling and name == storage_name: # leads to unresolvable, circular dependency with publish-depends raise ValueError("sibling names must not be equal") if not isinstance(url, str): raise TypeError("url is not a string, but %s" % type(url)) # Query existing siblings upfront in order to fail early on # existing=='error', since misconfiguration (particularly of special # remotes) only to fail in a subdataset later on with that config, can # be quite painful. # TODO: messages - this is "create-sibling". Don't confuse existence of # local remotes with existence of the actual remote sibling # in wording if existing == 'error': failed = False for dpath, sname in _yield_ds_w_matching_siblings( ds, (name, storage_name), recursive=recursive, recursion_limit=recursion_limit): res = get_status_dict( status='error', message=( "a sibling %r is already configured in dataset %r", sname, dpath), type='sibling', name=sname, ds=ds, **res_kwargs, ) failed = True yield res if failed: return # TODO: - URL parsing + store creation needs to be RF'ed based on # command abstractions # - more generally consider store creation a dedicated command or # option io = SSHRemoteIO(ssh_host) if ssh_host else LocalIO() try: # determine the existence of a store by trying to read its layout. # Because this raises a FileNotFound error if non-existent, we need # to catch it io.read_file(Path(base_path) / 'ria-layout-version') except (FileNotFoundError, RIARemoteError, RemoteCommandFailedError) as e: if not new_store_ok: # we're instructed to only act in case of an existing RIA store res = get_status_dict(status='error', message="No store found at '{}'. 
Forgot " "--new-store-ok ?".format( Path(base_path)), **res_kwargs) yield res return log_progress( lgr.info, 'create-sibling-ria', 'Creating a new RIA store at %s', Path(base_path), ) create_store(io, Path(base_path), '1') yield from _create_sibling_ria(ds, url, push_url, name, storage_sibling, storage_name, alias, existing, shared, group, post_update_hook, trust_level, res_kwargs) if recursive: # Note: subdatasets can be treated independently, so go full # recursion when querying for them and _no_recursion with the # actual call. Theoretically this can be parallelized. for subds in ds.subdatasets(state='present', recursive=True, recursion_limit=recursion_limit, return_type='generator', result_renderer='disabled', result_xfm='datasets'): yield from _create_sibling_ria( subds, url, push_url, name, storage_sibling, storage_name, None, # subdatasets can't have the same alias as the parent existing, shared, group, post_update_hook, trust_level, res_kwargs)
def __call__(url, name, dataset=None, ria_remote_name=None, post_update_hook=False, shared=None, group=None, ria_remote=True, existing='error', recursive=False, recursion_limit=None): ds = require_dataset(dataset, check_installed=True, purpose='create sibling RIA') res_kwargs = dict( ds=ds, action="create-sibling-ria", logger=lgr, ) if ds.repo.get_hexsha() is None or ds.id is None: raise RuntimeError("Repository at {} is not a DataLad dataset, " "run 'datalad create [--force]' first.".format( ds.path)) if not ria_remote and ria_remote_name: lgr.warning( "RIA remote setup disabled, but a ria-remote name was provided" ) if ria_remote and not ria_remote_name: ria_remote_name = "{}-ria".format(name) if ria_remote and name == ria_remote_name: # leads to unresolvable, circular dependency with publish-depends raise ValueError("sibling names must not be equal") if not isinstance(url, str): raise TypeError("url is not a string, but %s" % type(url)) # Query existing siblings upfront in order to fail early on # existing=='error', since misconfiguration (particularly of special # remotes) only to fail in a subdataset later on with that config, can # be quite painful. # TODO: messages - this is "create-sibling". Don't confuse existence of # local remotes with existence of the actual remote sibling # in wording if existing == 'error': # in recursive mode this check could take a substantial amount of # time: employ a progress bar (or rather a counter, because we dont # know the total in advance pbar_id = 'check-siblings-{}'.format(id(ds)) log_progress( lgr.info, pbar_id, 'Start checking pre-existing sibling configuration %s', ds, label='Query siblings', unit=' Siblings', ) # even if we have to fail, let's report all conflicting siblings # in subdatasets failed = False for r in ds.siblings(result_renderer=None, recursive=recursive, recursion_limit=recursion_limit): log_progress(lgr.info, pbar_id, 'Discovered sibling %s in dataset at %s', r['name'], r['path'], update=1, increment=True) if not r['type'] == 'sibling' or r['status'] != 'ok': # this is an internal status query that has not consequence # for the outside world. Be silent unless something useful # can be said #yield r continue if r['name'] == name: res = get_status_dict( status='error', message="a sibling '{}' is already configured in " "dataset {}".format(name, r['path']), **res_kwargs, ) failed = True yield res continue if ria_remote_name and r['name'] == ria_remote_name: res = get_status_dict( status='error', message="a sibling '{}' is already configured in " "dataset {}".format(ria_remote_name, r['path']), **res_kwargs, ) failed = True yield res continue log_progress( lgr.info, pbar_id, 'Finished checking pre-existing sibling configuration %s', ds, ) if failed: return yield from _create_sibling_ria(ds, url, name, ria_remote, ria_remote_name, existing, shared, group, post_update_hook, res_kwargs) if recursive: # Note: subdatasets can be treated independently, so go full # recursion when querying for them and _no_recursion with the # actual call. Theoretically this can be parallelized. for subds in ds.subdatasets(fulfilled=True, recursive=True, recursion_limit=recursion_limit, result_xfm='datasets'): yield from _create_sibling_ria(subds, url, name, ria_remote, ria_remote_name, existing, shared, group, post_update_hook, res_kwargs)
def __call__(dataset, urlfile, urlformat, filenameformat, input_type="ext", exclude_autometa=None, meta=None, message=None, dry_run=False, fast=False, ifexists=None, missing_value=None, save=True, version_urls=False): # Temporarily work around gh-2269. url_file = urlfile url_format, filename_format = urlformat, filenameformat from requests.exceptions import RequestException from datalad.distribution.dataset import Dataset, require_dataset from datalad.interface.results import get_status_dict from datalad.support.annexrepo import AnnexRepo lgr = logging.getLogger("datalad.plugin.addurls") dataset = require_dataset(dataset, check_installed=False) if dataset.repo and not isinstance(dataset.repo, AnnexRepo): yield get_status_dict(action="addurls", ds=dataset, status="error", message="not an annex repo") return if input_type == "ext": extension = os.path.splitext(url_file)[1] input_type = "json" if extension == ".json" else "csv" with open(url_file) as fd: try: rows, subpaths = extract(fd, input_type, url_format, filename_format, exclude_autometa, meta, dry_run, missing_value) except (ValueError, RequestException) as exc: yield get_status_dict(action="addurls", ds=dataset, status="error", message=exc_str(exc)) return if len(rows) != len(set(row["filename"] for row in rows)): yield get_status_dict(action="addurls", ds=dataset, status="error", message=("There are file name collisions; " "consider using {_repindex}")) return if dry_run: for subpath in subpaths: lgr.info("Would create a subdataset at %s", subpath) for row in rows: lgr.info("Would download %s to %s", row["url"], os.path.join(dataset.path, row["filename"])) lgr.info("Metadata: %s", sorted(u"{}={}".format(k, v) for k, v in row["meta_args"].items())) yield get_status_dict(action="addurls", ds=dataset, status="ok", message="dry-run finished") return if not dataset.repo: # Populate a new dataset with the URLs. for r in dataset.create(result_xfm=None, return_type='generator'): yield r annex_options = ["--fast"] if fast else [] for spath in subpaths: if os.path.exists(os.path.join(dataset.path, spath)): lgr.warning( "Not creating subdataset at existing path: %s", spath) else: for r in dataset.create(spath, result_xfm=None, return_type='generator'): yield r for row in rows: # Add additional information that we'll need for various # operations. filename_abs = os.path.join(dataset.path, row["filename"]) if row["subpath"]: ds_current = Dataset(os.path.join(dataset.path, row["subpath"])) ds_filename = os.path.relpath(filename_abs, ds_current.path) else: ds_current = dataset ds_filename = row["filename"] row.update({"filename_abs": filename_abs, "ds": ds_current, "ds_filename": ds_filename}) if version_urls: num_urls = len(rows) log_progress(lgr.info, "addurls_versionurls", "Versioning %d URLs", num_urls, label="Versioning URLs", total=num_urls, unit=" URLs") for row in rows: url = row["url"] try: row["url"] = get_versioned_url(url) except (ValueError, NotImplementedError) as exc: # We don't expect this to happen because get_versioned_url # should return the original URL if it isn't an S3 bucket. # It only raises exceptions if it doesn't know how to # handle the scheme for what looks like an S3 bucket. 
lgr.warning("error getting version of %s: %s", row["url"], exc_str(exc)) log_progress(lgr.info, "addurls_versionurls", "Versioned result for %s: %s", url, row["url"], update=1, increment=True) log_progress(lgr.info, "addurls_versionurls", "Finished versioning URLs") files_to_add = set() for r in add_urls(rows, ifexists=ifexists, options=annex_options): if r["status"] == "ok": files_to_add.add(r["path"]) yield r msg = message or """\ [DATALAD] add files from URLs url_file='{}' url_format='{}' filename_format='{}'""".format(url_file, url_format, filename_format) if files_to_add: meta_rows = [r for r in rows if r["filename_abs"] in files_to_add] for r in add_meta(meta_rows): yield r if save: for r in dataset.save(path=files_to_add, message=msg, recursive=True): yield r
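# Hedged usage sketch for the addurls entry point above; the file name,
# column names, and URL are made up, and the keyword names simply mirror the
# signature shown in this section (the installed API's names may differ).
import datalad.api as dl

# urls.csv contains, e.g.:
#   subject,link
#   sub-01,https://example.com/data/sub-01.dat
#   sub-02,https://example.com/data/sub-02.dat
dl.addurls(
    dataset='.',
    urlfile='urls.csv',
    urlformat='{link}',                   # one URL per table row
    filenameformat='rawdata/{subject}.dat',
    fast=True,                            # register URLs without downloading content
)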
def clone_dataset(srcs, destds, reckless=None, description=None, result_props=None, cfg=None): """Internal helper to perform cloning without sanity checks (assumed done) This helper does not handle any saving of subdataset modification or adding in a superdataset. Parameters ---------- srcs : list Any suitable clone source specifications (paths, URLs) destds : Dataset Dataset instance for the clone destination reckless : {None, 'auto', 'ephemeral', 'shared-...'}, optional Mode switch to put cloned dataset into unsafe/throw-away configurations, i.e. sacrifice data safety for performance or resource footprint. description : str, optional Location description for the annex of the dataset clone (if there is any). result_props : dict, optional Default properties for any yielded result, passed on to get_status_dict(). cfg : ConfigManager, optional Configuration will be queried from this instance (i.e. from a particular dataset). If None is given, the global DataLad configuration will be queried. Yields ------ dict DataLad result records """ if not result_props: # in case the caller had no specific idea on how results should look # like, provide sensible defaults result_props = dict( action='install', logger=lgr, ds=destds, ) dest_path = destds.pathobj # decode all source candidate specifications candidate_sources = [decode_source_spec(s, cfg=cfg) for s in srcs] # now expand the candidate sources with additional variants of the decoded # giturl, while duplicating the other properties in the additional records # for simplicity. The hope is to overcome a few corner cases and be more # robust than git clone candidate_sources = [ dict(props, giturl=s) for props in candidate_sources for s in _get_flexible_source_candidates(props['giturl']) ] # important test! based on this `rmtree` will happen below after failed clone dest_path_existed = dest_path.exists() if dest_path_existed and any(dest_path.iterdir()): if destds.is_installed(): # check if dest was cloned from the given source before # this is where we would have installed this from # this is where it was actually installed from track_name, track_url = _get_tracking_source(destds) try: # this will get us track_url in system native path conventions, # whenever it is a path (and not a URL) # this is needed to match it to any potentially incoming local # source path in the 'notneeded' test below track_path = str(Path(track_url)) except Exception: # this should never happen, because Path() will let any non-path stringification # pass through unmodified, but we do not want any potential crash due to # pathlib behavior changes lgr.debug("Unexpected behavior of pathlib!") track_path = None for cand in candidate_sources: src = cand['giturl'] if track_url == src \ or get_local_file_url(track_url, compatibility='git') == src \ or track_path == expanduser(src): yield get_status_dict( status='notneeded', message=("dataset %s was already cloned from '%s'", destds, src), **result_props) return # anything else is an error yield get_status_dict( status='error', message= 'target path already exists and not empty, refuse to clone into target path', **result_props) return log_progress( lgr.info, 'cloneds', 'Cloning dataset to %s', destds, total=len(candidate_sources), label='Clone attempt', unit=' Candidate locations', ) error_msgs = OrderedDict( ) # accumulate all error messages formatted per each url for cand in candidate_sources: log_progress(lgr.info, 'cloneds', 'Attempting to clone from %s to %s', cand['giturl'], dest_path, update=1, increment=True) clone_opts = 
{} if cand.get('version', None): clone_opts['branch'] = cand['version'] try: # TODO for now GitRepo.clone() cannot handle Path instances, and PY35 # doesn't make it happen seemlessly GitRepo.clone(path=str(dest_path), url=cand['giturl'], clone_options=clone_opts, create=True) except CommandError as e: e_stderr = e.stderr error_msgs[cand['giturl']] = e lgr.debug("Failed to clone from URL: %s (%s)", cand['giturl'], exc_str(e)) if dest_path.exists(): lgr.debug("Wiping out unsuccessful clone attempt at: %s", dest_path) # We must not just rmtree since it might be curdir etc # we should remove all files/directories under it # TODO stringification can be removed once patlib compatible # or if PY35 is no longer supported rmtree(str(dest_path), children_only=dest_path_existed) if 'could not create work tree' in e_stderr.lower(): # this cannot be fixed by trying another URL re_match = re.match(r".*fatal: (.*)$", e_stderr, flags=re.MULTILINE | re.DOTALL) # cancel progress bar log_progress(lgr.info, 'cloneds', 'Completed clone attempts for %s', destds) yield get_status_dict(status='error', message=re_match.group(1).strip() if re_match else "stderr: " + e_stderr, **result_props) return # next candidate continue result_props['source'] = cand # do not bother with other sources if succeeded break log_progress(lgr.info, 'cloneds', 'Completed clone attempts for %s', destds) if not destds.is_installed(): if len(error_msgs): if all(not e.stdout and not e.stderr for e in error_msgs.values()): # there is nothing we can learn from the actual exception, # the exit code is uninformative, the command is predictable error_msg = "Failed to clone from all attempted sources: %s" error_args = list(error_msgs.keys()) else: error_msg = "Failed to clone from any candidate source URL. " \ "Encountered errors per each url were:\n- %s" error_args = '\n- '.join('{}\n {}'.format(url, exc_str(exc)) for url, exc in error_msgs.items()) else: # yoh: Not sure if we ever get here but I felt that there could # be a case when this might happen and original error would # not be sufficient to troubleshoot what is going on. error_msg = "Awkward error -- we failed to clone properly. " \ "Although no errors were encountered, target " \ "dataset at %s seems to be not fully installed. " \ "The 'succesful' source was: %s" error_args = (destds.path, cand['giturl']) yield get_status_dict(status='error', message=(error_msg, error_args), **result_props) return if not cand.get("version"): postclone_check_head(destds) # act on --reckless=shared-... # must happen prior git-annex-init, where we can cheaply alter the repo # setup through safe re-init'ing if reckless and reckless.startswith('shared-'): lgr.debug('Reinit %s to enable shared access permissions', destds) destds.repo.call_git(['init', '--shared={}'.format(reckless[7:])]) yield from postclonecfg_annexdataset(destds, reckless, description) # perform any post-processing that needs to know details of the clone # source if result_props['source']['type'] == 'ria': yield from postclonecfg_ria(destds, result_props['source']) # yield successful clone of the base dataset now, as any possible # subdataset clone down below will not alter the Git-state of the # parent yield get_status_dict(status='ok', **result_props)
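# clone_dataset() above is a generator of plain result records. A hedged
# sketch of how a caller might drive it; the destination path and source URL
# are made up, and error handling is omitted.
from datalad.distribution.dataset import Dataset

destds = Dataset('/tmp/myclone')                   # hypothetical clone destination
for res in clone_dataset(
        ['https://example.com/studies/demo.git'],  # candidate source(s)
        destds,
        reckless=None,
        description='laptop scratch clone'):
    # each record is a dict with at least 'status' and 'action'
    print(res['status'], res.get('message', ''))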
def __call__(target, opts=None, dataset=None):
    # only non-bare repos have hashdirmixed, so require one
    ds = require_dataset(
        dataset, check_installed=True, purpose='ORA archive export')
    ds_repo = ds.repo

    annex_objs = ds_repo.dot_git / 'annex' / 'objects'

    archive = resolve_path(target, dataset)
    if archive.is_dir():
        archive = archive / 'archive.7z'
    else:
        archive.parent.mkdir(exist_ok=True, parents=True)

    if not opts:
        # uncompressed by default
        opts = ['-mx0']

    res_kwargs = dict(
        action="export-archive-ora",
        logger=lgr,
    )

    if not annex_objs.is_dir():
        yield get_status_dict(
            ds=ds,
            status='notneeded',
            message='no annex keys present',
            **res_kwargs,
        )
        return

    exportdir = ds_repo.dot_git / 'datalad' / 'tmp' / 'ora_archive'
    if exportdir.exists():
        yield get_status_dict(
            ds=ds,
            status='error',
            message=(
                'export directory already exists, please remove first: %s',
                str(exportdir)),
            **res_kwargs,
        )
        return

    keypaths = [
        k for k in annex_objs.glob(op.join('**', '*'))
        if k.is_file()
    ]

    log_progress(
        lgr.info,
        'oraarchiveexport',
        'Start ORA archive export %s', ds,
        total=len(keypaths),
        label='ORA archive export',
        unit=' Keys',
    )

    link_fx = os.link
    for keypath in keypaths:
        key = keypath.name
        hashdir = op.join(keypath.parts[-4], keypath.parts[-3])
        log_progress(
            lgr.info,
            'oraarchiveexport',
            'Export key %s to %s', key, hashdir,
            update=1,
            increment=True)
        keydir = exportdir / hashdir / key
        keydir.mkdir(parents=True, exist_ok=True)
        try:
            link_fx(str(keypath), str(keydir / key))
        except OSError:
            lgr.warning(
                'No hard links supported at %s, will copy files instead',
                str(keydir))
            # no hard links supported
            # switch function after first error
            link_fx = shutil.copyfile
            link_fx(str(keypath), str(keydir / key))

    log_progress(
        lgr.info,
        'oraarchiveexport',
        'Finished RIA archive export from %s', ds
    )
    try:
        subprocess.run(
            ['7z', 'u', str(archive), '.'] + opts,
            cwd=str(exportdir),
        )
        yield get_status_dict(
            path=str(archive),
            type='file',
            status='ok',
            **res_kwargs)
    except Exception as e:
        yield get_status_dict(
            path=str(archive),
            type='file',
            status='error',
            message=('7z failed: %s', exc_str(e)),
            **res_kwargs)
        return
    finally:
        rmtree(str(exportdir))
def _push(dspath, content, target, data, force, jobs, res_kwargs, pbars, got_path_arg=False): force_git_push = force in ('all', 'gitpush') # nothing recursive in here, we only need a repo to work with ds = Dataset(dspath) repo = ds.repo res_kwargs.update(type='dataset', path=dspath) # content will be unique for every push (even on the same dataset) pbar_id = 'push-{}-{}'.format(target, id(content)) # register for final orderly take down pbars[pbar_id] = ds log_progress( lgr.info, pbar_id, 'Determine push target', unit=' Steps', label='Push', total=4, ) # pristine input arg _target = target # verified or auto-detected target = None if not _target: try: # let Git figure out what needs doing # we will reuse the result further down again, so nothing is wasted wannabe_gitpush = repo.push(remote=None, git_options=['--dry-run']) # we did not get an explicit push target, get it from Git target = set(p.get('remote', None) for p in wannabe_gitpush) # handle case where a pushinfo record did not have a 'remote' # property -- should not happen, but be robust target.discard(None) except Exception as e: lgr.debug( 'Dry-run push to determine default push target failed, ' 'assume no configuration: %s', e) target = set() if not len(target): yield dict( res_kwargs, status='impossible', message='No push target given, and none could be ' 'auto-detected, please specify via --to', ) return elif len(target) > 1: # dunno if this can ever happen, but if it does, report # nicely yield dict(res_kwargs, status='error', message=( 'No push target given, ' 'multiple candidates auto-detected: %s', list(target), )) return else: # can only be a single one at this point target = target.pop() if not target: if _target not in repo.get_remotes(): yield dict(res_kwargs, status='error', message=("Unknown target sibling '%s'.", _target)) return target = _target log_progress(lgr.info, pbar_id, "Push refspecs", label="Push to '{}'".format(target), update=1, total=4) # define config var name for potential publication dependencies depvar = 'remote.{}.datalad-publish-depends'.format(target) # list of remotes that are publication dependencies for the # target remote publish_depends = ensure_list(ds.config.get(depvar, [])) if publish_depends: lgr.debug("Discovered publication dependencies for '%s': %s'", target, publish_depends) # cache repo type is_annex_repo = isinstance(ds.repo, AnnexRepo) # TODO prevent this when `target` is a special remote # (possibly redo) a push attempt to figure out what needs pushing # do this on the main target only, and apply the result to all # dependencies try: if _target: # only do it when an explicit target was given, otherwise # we can reuse the result from the auto-probing above wannabe_gitpush = repo.push(remote=target, git_options=['--dry-run']) except Exception as e: lgr.debug( 'Dry-run push to check push configuration failed, ' 'assume no configuration: %s', e) wannabe_gitpush = [] refspecs2push = [ # if an upstream branch is set, go with it p['from_ref'] if ds.config.get( # refs come in as refs/heads/<branchname> # need to cut the prefix 'branch.{}.remote'.format(p['from_ref'][11:]), None) == target and ds.config.get('branch.{}.merge'.format(p['from_ref'][11:]), None) # if not, define target refspec explicitly to avoid having to # set an upstream branch, which would happen implicitly from # a users POV, and may also be hard to decide when publication # dependencies are present else '{}:{}'.format(p['from_ref'], p['to_ref']) for p in wannabe_gitpush # TODO: what if a publication dependency doesn't 
have it yet # should we not attempt to push, because the main target has it? if 'uptodate' not in p['operations'] and ( # cannot think of a scenario where we would want to push a # managed branch directly, instead of the corresponding branch 'refs/heads/adjusted' not in p['from_ref']) ] # TODO this is not right with managed branches active_branch = repo.get_active_branch() if active_branch and is_annex_repo: # we could face a managed branch, in which case we need to # determine the actual one and make sure it is sync'ed with the # managed one, and push that one instead. following methods can # be called unconditionally repo.localsync(managed_only=True) active_branch = repo.get_corresponding_branch( active_branch) or active_branch if not refspecs2push and not active_branch: # nothing was set up for push, and we have no active branch # this is a weird one, let's confess and stop here # I don't think we need to support such a scenario if not active_branch: yield dict( res_kwargs, status='impossible', message='There is no active branch, cannot determine remote ' 'branch') return # make sure that we always push the active branch (the context for the # potential path arguments) and the annex branch -- because we claim # to know better than any git config must_have_branches = [active_branch] if active_branch else [] if is_annex_repo: must_have_branches.append('git-annex') for branch in must_have_branches: _append_branch_to_refspec_if_needed(ds, refspecs2push, branch) # we know what to push and where, now dependency processing first for r in publish_depends: # simply make a call to this function again, all the same, but # target is different yield from _push( dspath, content, # to this particular dependency r, data, force, jobs, res_kwargs.copy(), pbars, got_path_arg=got_path_arg, ) # and lastly the primary push target target_is_git_remote = repo.config.get('remote.{}.url'.format(target), None) is not None # git-annex data copy # if is_annex_repo: if data != "nothing": log_progress(lgr.info, pbar_id, "Transfer data", label="Transfer data to '{}'".format(target), update=2, total=4) yield from _push_data( ds, target, content, data, force, jobs, res_kwargs.copy(), got_path_arg=got_path_arg, ) else: lgr.debug("Data transfer to '%s' disabled by argument", target) else: lgr.debug("No data transfer: %s is not a git annex repository", repo) if not target_is_git_remote: # there is nothing that we need to push or sync with on the git-side # of things with this remote return log_progress(lgr.info, pbar_id, "Update availability information", label="Update availability for '{}'".format(target), update=3, total=4) # TODO fetch is only needed if anything was actually transferred. Collect this # info and make the following conditional on it # after file transfer the remote might have different commits to # the annex branch. They have to be merged locally, otherwise a # push of it further down will fail try: # fetch remote, let annex sync them locally, so that the push # later on works. # We have to fetch via the push url (if there is any), # not a pull url. # The latter might be dumb and without the execution of a # post-update hook we might not be able to retrieve the # server-side git-annex branch updates (and git-annex does # not trigger the hook on copy), but we know we have # full access via the push url -- we have just used it to copy. 
lgr.debug("Fetch 'git-annex' branch updates from '%s'", target) fetch_cmd = ['fetch', target, 'git-annex'] pushurl = repo.config.get('remote.{}.pushurl'.format(target), None) if pushurl: # for some reason overwriting remote.{target}.url # does not have any effect... fetch_cmd = [ '-c', 'url.{}.insteadof={}'.format( pushurl, repo.config.get('remote.{}.url'.format(target), None)) ] + fetch_cmd lgr.debug("Sync local annex branch from pushurl after remote " 'availability update.') repo.call_git(fetch_cmd) # If no CommandError was raised, it means that remote has git-annex # but local repo might not be an annex yet. Since there is nothing to "sync" # from us, we just skip localsync without mutating repo into an AnnexRepo if is_annex_repo: repo.localsync(target) except CommandError as e: # it is OK if the remote doesn't have a git-annex branch yet # (e.g. fresh repo) # TODO is this possible? we just copied? Maybe check if anything # was actually copied? if "fatal: couldn't find remote ref git-annex" not in e.stderr.lower(): raise lgr.debug('Remote does not have a git-annex branch: %s', e) if not refspecs2push: lgr.debug('No refspecs found that need to be pushed') return # and push all relevant branches, plus the git-annex branch to announce # local availability info too yield from _push_refspecs( repo, target, refspecs2push, force_git_push, res_kwargs.copy(), )
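# Worked illustration (made-up records and config values) of the refspec
# mapping performed in _push() above: a branch whose upstream already points
# at `target` contributes its bare ref, other branches get an explicit
# '<from>:<to>' refspec, and up-to-date or adjusted/managed branches are
# skipped. Not the production code path, just the same comprehension applied
# to sample data.
target = 'origin'
config = {
    'branch.main.remote': 'origin',
    'branch.main.merge': 'refs/heads/main',
}
wannabe_gitpush = [
    {'from_ref': 'refs/heads/main', 'to_ref': 'refs/heads/main',
     'operations': ['forced-update']},
    {'from_ref': 'refs/heads/git-annex', 'to_ref': 'refs/heads/git-annex',
     'operations': ['uptodate']},          # already up to date -> dropped
]
refspecs2push = [
    p['from_ref']
    if config.get('branch.{}.remote'.format(p['from_ref'][11:])) == target
    and config.get('branch.{}.merge'.format(p['from_ref'][11:]))
    else '{}:{}'.format(p['from_ref'], p['to_ref'])
    for p in wannabe_gitpush
    if 'uptodate' not in p['operations']
    and 'refs/heads/adjusted' not in p['from_ref']
]
assert refspecs2push == ['refs/heads/main']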
def __call__(self, dataset, refcommit, process_type, status):
    if process_type not in ('all', 'dataset'):
        return None
    ds = dataset
    log_progress(
        lgr.info,
        'extractorstudyminimeta',
        'Start studyminimeta metadata extraction from {path}'.format(path=ds.path),
        total=len(tuple(status)) + 1,
        label='Studyminimeta metadata extraction',
        unit=' Files',
    )

    source_file = self._get_absolute_studyminimeta_file_name(dataset)
    try:
        with open(source_file, "rt") as input_stream:
            metadata_object = yaml.safe_load(input_stream)
    except FileNotFoundError:
        yield {
            "status": "error",
            "metadata": {},
            "type": process_type,
            "message": "file " + source_file + " could not be opened"
        }
        return
    except yaml.YAMLError as e:
        yield {
            "status": "error",
            "metadata": {},
            "type": process_type,
            "message": "YAML parsing failed with: " + str(e)
        }
        return

    ld_creator_result = LDCreator(
        dataset.id,
        refcommit,
        self._get_relative_studyminimeta_file_name(dataset)
    ).create_ld_from_spec(metadata_object)

    if ld_creator_result.success:
        log_progress(
            lgr.info,
            'extractorstudyminimeta',
            'Finished studyminimeta metadata extraction from {path}'.format(path=ds.path)
        )
        yield {
            "status": "ok",
            "metadata": ld_creator_result.json_ld_object,
            "type": process_type
        }
    else:
        log_progress(
            lgr.error,
            'extractorstudyminimeta',
            'Error in studyminimeta metadata extraction from {path}'.format(path=ds.path)
        )
        yield {
            "status": "error",
            "metadata": {},
            "type": process_type,
            "message": "data structure conversion to JSON-LD failed"
        }
def get_metadata(self, dataset, content): if not content: return {}, [] contentmeta = [] log_progress( lgr.info, 'extractornifti1', 'Start NIfTI1 metadata extraction from %s', self.ds, total=len(self.paths), label='NIfTI1 metadata extraction', unit=' Files', ) for f in self.paths: absfp = opj(self.ds.path, f) log_progress(lgr.info, 'extractornifti1', 'Extract NIfTI1 metadata from %s', absfp, update=1, increment=True) try: header = nibabel.load(absfp).header except Exception as e: lgr.debug("NIfTI metadata extractor failed to load %s: %s", absfp, exc_str(e)) continue if not isinstance(header, nibabel.Nifti1Header): # all we can do for now lgr.debug("Ignoring non-NIfTI1 file %s", absfp) continue # blunt conversion of the entire header meta = { self._key2stdkey.get(k, k): [np.asscalar(i) for i in v] if len(v.shape) # scalar else np.asscalar(v) for k, v in header.items() if k not in self._ignore } # more convenient info from nibabel's support functions meta.update({k: v(header) for k, v in self._extractors.items()}) # filter useless fields (empty strings and NaNs) meta = { k: v for k, v in meta.items() if not (isinstance(v, float) and isnan(v)) and not (hasattr(v, '__len__') and not len(v)) } # a few more convenient targeted extracts from the header # spatial resolution in millimeter spatial_unit = header.get_xyzt_units()[0] # by what factor to multiply by to get to 'mm' if spatial_unit == 'unknown': lgr.debug( "unit of spatial resolution for '{}' unknown, assuming 'millimeter'" .format(absfp)) spatial_unit_conversion = { 'unknown': 1, 'meter': 1000, 'mm': 1, 'micron': 0.001 }.get(spatial_unit, None) if spatial_unit_conversion is None: lgr.debug( "unexpected spatial unit code '{}' from NiBabel".format( spatial_unit)) # TODO does not see the light of day meta['spatial_resolution(mm)'] = \ [(i * spatial_unit_conversion) for i in header.get_zooms()[:3]] # time if len(header.get_zooms()) > 3: # got a 4th dimension rts_unit = header.get_xyzt_units()[1] if rts_unit == 'unknown': lgr.warn( "RTS unit '{}' unknown, assuming 'seconds'".format( absfp)) # normalize to seconds, if possible rts_unit_conversion = { 'msec': 0.001, 'micron': 0.000001 }.get(rts_unit, 1.0) if rts_unit not in ('hz', 'ppm', 'rads'): meta['temporal_spacing(s)'] = \ header.get_zooms()[3] * rts_unit_conversion contentmeta.append((f, meta)) # Decode entries which might be bytes # TODO: consider doing that in above "metalad" logic for k, v in meta.items(): if isinstance(v, bytes): meta[k] = v.decode() log_progress(lgr.info, 'extractornifti1', 'Finished NIfTI1 metadata extraction from %s', self.ds) return { '@context': vocabulary, }, \ contentmeta
def _proc(ds, refcommit, sources, status, extractors, process_type):
    dsmeta = dict()
    contentmeta = {}

    log_progress(
        lgr.info,
        'metadataextractors',
        'Start metadata extraction from %s', ds,
        total=len(sources),
        label='Metadata extraction',
        unit=' extractors',
    )
    for msrc in sources:
        msrc_key = msrc
        extractor = extractors[msrc]
        log_progress(
            lgr.info,
            'metadataextractors',
            'Engage %s metadata extractor', msrc_key,
            update=1,
            increment=True)

        # actually pull the metadata records out of the extractor
        for res in _run_extractor(
                extractor['class'],
                msrc,
                ds,
                refcommit,
                status,
                extractor['process_type']):
            # always have a path, use any absolute path coming in,
            # make any relative path absolute using the dataset anchor,
            # use the dataset path if nothing is coming in (better than
            # no path at all)
            # for now normalize the reported path to be a plain string
            # until DataLad as a whole can deal with pathlib objects
            if 'path' in res:
                res['path'] = text_type(Path(res['path']))
            res.update(
                path=ds.path if 'path' not in res
                else res['path'] if op.isabs(res['path'])
                else op.join(ds.path, res['path']))

            # the following two conditionals are untested, as a test would
            # require a metadata extractor to yield broken metadata, and in
            # order to have such one, we need a mechanism to have the test
            # inject one on the fly. MIH thinks that the code needed to do
            # that has more chances to be broken than the code it would test
            if success_status_map.get(res['status'], False) != 'success':  # pragma: no cover
                yield res
                # no further processing of broken stuff
                continue
            else:  # pragma: no cover
                # if the extractor was happy, check the result
                if not _ok_metadata(res, msrc, ds, None):
                    res.update(
                        # this will prevent further processing a few lines down
                        status='error',
                        # TODO have _ok_metadata report the real error
                        message=('Invalid metadata (%s)', msrc),
                    )
                    yield res
                    continue

            # we do not want to report info that there was no metadata
            if not res['metadata']:  # pragma: no cover
                lgr.debug(
                    'Skip %s %s metadata in record of %s: '
                    'extractor reported nothing',
                    msrc_key, res.get('type', ''), res['path'])
                continue

            if res['type'] == 'dataset':
                # TODO warn if two dataset records are generated by the same
                # extractor
                dsmeta[msrc_key] = res['metadata']
            else:
                # this is file metadata, _ok_metadata() checks unknown types
                # assign; we only ask each metadata extractor once, hence no
                # conflict possible
                loc_dict = contentmeta.get(res['path'], {})
                loc_dict[msrc_key] = res['metadata']
                contentmeta[res['path']] = loc_dict

    log_progress(
        lgr.info,
        'metadataextractors',
        'Finished metadata extraction from %s', ds,
    )

    # top-level code relies on the fact that any dataset metadata
    # is yielded before content metadata
    if process_type in (None, 'all', 'dataset') and \
            dsmeta and ds is not None and ds.is_installed():
        yield get_status_dict(
            ds=ds,
            metadata=dsmeta,
            # any errors will have been reported before
            status='ok',
        )

    for p in contentmeta:
        res = get_status_dict(
            # TODO avoid is_installed() call
            path=op.join(ds.path, p) if ds.is_installed() else p,
            metadata=contentmeta[p],
            type='file',
            # any errors will have been reported before
            status='ok',
        )
        # TODO avoid is_installed() call, check if such info is
        # useful and accurate at all
        if ds.is_installed():
            res['parentds'] = ds.path
        yield res
def __call__(dataset=None, path=None, sources=None, process_type=None, format='native'): ds = require_dataset(dataset or curdir, purpose="extract metadata", check_installed=not path) # check what extractors we want as sources, and whether they are # available if not sources: sources = ['metalad_core', 'metalad_annex'] \ + assure_list(get_metadata_type(ds)) # keep local, who knows what some extractors might pull in from pkg_resources import iter_entry_points # delayed heavy import extractors = {} for ep in iter_entry_points('datalad.metadata.extractors'): if ep.name not in sources: # not needed here continue rec = dict(entrypoint=ep) if ep.name in extractors: # pragma: no cover # potential conflict if extractors[ ep.name]['entrypoint'].dist.project_name == 'datalad': # this is OK, just state it is happening lgr.debug('Extractor %s overrides datalad-core variant', ep) extractors[ep.name] = rec elif ep.dist.project_name == 'datalad': # also OK lgr.debug('Prefer extractor %s over datalad-core variant', ep) else: msg = ('At least two DataLad extensions provide metadata ' 'extractor %s: %s vs. %s', ep.name, ep.dist, extractors[ep.name].dist) if ep.name in sources: # this extractor is required -> blow hard raise RuntimeError(msg[0] % msg[1:]) else: # still moan lgr.warn(msg) # ignore the newcomer, is listed second in sys.path else: # this fresh and unique extractors[ep.name] = rec for msrc in sources: if msrc not in extractors: # we said that we want to fail, rather then just moan about # less metadata raise ValueError( "Enabled metadata extractor '{}' not available".format( msrc), ) # load extractor implementation rec = extractors[msrc] rec['process_type'] = process_type \ if process_type and not process_type == 'extractors' \ else ds.config.obtain( 'datalad.metadata.extract-from-{}'.format( msrc.replace('_', '-')), default='all') # load the extractor class, no instantiation yet try: rec['class'] = rec['entrypoint'].load() except Exception as e: # pragma: no cover msg = ('Failed %s metadata extraction from %s: %s', msrc, ds, exc_str(e)) log_progress(lgr.error, 'metadataextractors', *msg) raise ValueError(msg[0] % msg[1:]) res_props = dict( action='meta_extract', logger=lgr, ) # build report on extractors and their state info if process_type == 'extractors': for ename, eprops in iteritems(extractors): state = {} # do not trip over old extractors if hasattr(eprops['class'], 'get_state'): state.update(eprops['class']().get_state(ds)) yield dict(action='meta_extract', path=ds.path, status='ok', logger=lgr, extractor=ename, state=dict( state, process_type=eprops['process_type'], )) return # build a representation of the dataset's content (incl subds # records) # go through a high-level command (not just the repo methods) to # get all the checks and sanitization of input arguments # this call is relatively expensive, but already anticipates # demand for information by our core extractors that always run # unconditionally, hence no real slowdown here # TODO this could be a dict, but MIH cannot think of an access # pattern that does not involve iteration over all items status = [] exclude_paths = [ ds.pathobj / PurePosixPath(e) for e in (list(exclude_from_metadata) + assure_list(ds.config.get('datalad.metadata.exclude-path', []))) ] if ds.is_installed(): # we can make use of status res_props.update(refds=ds.path) for r in ds.status( # let status sort out all path arg handling # but this will likely make it impossible to use this # command to just process an individual file independent # of a dataset 
path=path, # it is safe to ask for annex info even when a dataset is # plain Git # NOTE changing to 'annex=availability' has substantial # performance costs, as it involved resolving each annex # symlink on the file-system, which can be really slow # depending on the FS and the number of annexed files annex='basic', # TODO we never want to aggregate metadata from untracked # content, but we might just want to see what we can get # from a file untracked='no', # this command cannot and will not work recursively recursive=False, result_renderer='disabled'): # path reports are always absolute and anchored on the dataset # (no repo) path p = Path(r['path']) if p in exclude_paths or \ any(e in p.parents for e in exclude_paths): # this needs to be ignore for any further processing continue # strip useless context information status.append({ k: v for k, v in iteritems(r) if (k not in ('refds', 'parentds', 'action', 'status') and not k.startswith('prev_')) }) # determine the commit that we are describing refcommit = get_refcommit(ds) if refcommit is None or not len(status): # this seems extreme, but without a single commit there is # nothing we can have, or describe -> blow yield dict( res_props, status='error', message=\ 'No metadata-relevant repository content found. ' \ 'Cannot determine reference commit for metadata ID', type='dataset', path=ds.path, ) return # stamp every result res_props['refcommit'] = refcommit else: # no dataset at hand, take path arg at face value and hope # for the best # TODO we have to resolve the given path to make it match what # status is giving (abspath with ds (not repo) anchor) status = [dict(path=p, type='file') for p in assure_list(path)] # just for compatibility, mandatory argument list below refcommit = None if ds.is_installed(): # check availability requirements and obtain data as needed needed_paths = set() for rec in extractors.values(): if hasattr(rec['class'], 'get_required_content'): needed_paths.update( # new extractors do not need any instantiation args s['path'] for s in rec['class']().get_required_content( ds, rec['process_type'], status)) if needed_paths: for r in ds.get(path=needed_paths, return_type='generator', result_renderer='disabled'): if success_status_map.get( r['status'], False) != 'success': # pragma: no cover # online complain when something goes wrong yield r contexts = {} nodes_by_context = {} try: for res in _proc(ds, refcommit, sources, status, extractors, process_type): if format == 'native': # that is what we pass around internally res.update(**res_props) yield res elif format == 'jsonld': collect_jsonld_metadata(ds.pathobj, res, nodes_by_context, contexts) finally: # extractors can come from any source with no guarantee for # proper implementation. Let's make sure that we bring the # dataset back into a sane state (e.g. no batch processes # hanging around). We should do this here, as it is not # clear whether extraction results will be saved to the # dataset(which would have a similar sanitization effect) if ds.repo: ds.repo.precommit() if format == 'jsonld': yield dict(status='ok', type='dataset', path=ds.path, metadata=format_jsonld_metadata(nodes_by_context), **res_props)
def __call__(path=None, dataset=None, to=None, since=None, force=None, recursive=False, recursion_limit=None, jobs=None): # we resolve here, because we need to perform inspection on what was given # as an input argument further down paths = [resolve_path(p, dataset) for p in assure_list(path)] ds = require_dataset(dataset, check_installed=True, purpose='pushing') ds_repo = ds.repo res_kwargs = dict( action='publish', refds=ds.path, logger=lgr, ) get_remote_kwargs = {'exclude_special_remotes': False} \ if isinstance(ds_repo, AnnexRepo) else {} if to and to not in ds_repo.get_remotes(**get_remote_kwargs): # get again for proper error: sr = ds_repo.get_remotes(**get_remote_kwargs) # yield an error result instead of raising a ValueError, # to enable the use case of pushing to a target that # a superdataset doesn't know, but some subdatasets to # (in combination with '--on-failure ignore') yield dict(res_kwargs, status='error', message="Unknown push target '{}'. {}".format( to, 'Known targets: {}.'.format(', '.join( repr(s) for s in sr)) if sr else 'No targets configured in dataset.')) return if since: # will blow with ValueError if unusable ds_repo.get_hexsha(since) if not since and since is not None: # special case: --since='' # figure out state of remote branch and set `since` since = _get_corresponding_remote_state(ds_repo, to) if not since: lgr.info("No tracked remote for active branch, " "detection of last pushed state not in effect.") # obtain a generator for information on the datasets to process # idea is to turn the `paths` argument into per-dataset # content listings that can be acted upon ds_spec = _datasets_since_( # important to pass unchanged dataset arg dataset, since, paths, recursive, recursion_limit) # instead of a loop, this could all be done in parallel matched_anything = False for dspath, dsrecords in ds_spec: matched_anything = True lgr.debug('Attempt push of Dataset at %s', dspath) pbars = {} yield from _push(dspath, dsrecords, to, force, jobs, res_kwargs.copy(), pbars, got_path_arg=True if path else False) # take down progress bars for this dataset for i, ds in pbars.items(): log_progress(lgr.info, i, 'Finished push of %s', ds) if not matched_anything: yield dict( res_kwargs, status='notneeded', message= 'Given constraints did not match any changes to publish', type='dataset', path=ds.path, )
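# The empty-string `since` sentinel above triggers detection of the last
# known state of the push target (_get_corresponding_remote_state, not shown
# here). The sketch below is only a rough standalone approximation of that
# idea with plain git, assuming `to` is a remote that already carries a
# branch matching the local active branch.
import subprocess

def last_pushed_state(repo_path, to):
    """Return the commit sha of <to>/<active-branch>, or None if unknown."""
    branch = subprocess.run(
        ['git', '-C', repo_path, 'rev-parse', '--abbrev-ref', 'HEAD'],
        capture_output=True, text=True).stdout.strip()
    res = subprocess.run(
        ['git', '-C', repo_path, 'rev-parse', '{}/{}'.format(to, branch)],
        capture_output=True, text=True)
    return res.stdout.strip() if res.returncode == 0 else None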
def _push(dspath, content, target, force, jobs, res_kwargs, pbars, done_fetch=None, got_path_arg=False): if not done_fetch: done_fetch = set() # nothing recursive in here, we only need a repo to work with ds = Dataset(dspath) repo = ds.repo res_kwargs.update(type='dataset', path=dspath) # content will be unique for every push (even on the some dataset) pbar_id = 'push-{}-{}'.format(target, id(content)) # register for final orderly take down pbars[pbar_id] = ds log_progress( lgr.info, pbar_id, 'Determine push target', unit=' Steps', label='Push', total=4, ) if not target: try: # let Git figure out what needs doing wannabe_gitpush = repo.push(remote=None, git_options=['--dry-run']) # we did not get an explicit push target, get it from Git target = set(p.get('remote', None) for p in wannabe_gitpush) # handle case where a pushinfo record did not have a 'remote' # property -- should not happen, but be robust target.discard(None) except Exception as e: lgr.debug( 'Dry-run push to determine default push target failed, ' 'assume no configuration: %s', e) target = set() if not len(target): yield dict( res_kwargs, status='impossible', message='No push target given, and none could be ' 'auto-detected, please specific via --to', ) return elif len(target) > 1: # dunno if this can ever happen, but if it does, report # nicely yield dict(res_kwargs, status='error', message=( 'No push target given, ' 'multiple candidates auto-detected: %s', list(target), )) return else: # can only be a single one at this point target = target.pop() if target not in repo.get_remotes(): yield dict(res_kwargs, status='error', message=("Unknown target sibling '%s'.", target)) return log_progress(lgr.info, pbar_id, "Push refspecs", label="Push to '{}'".format(target), update=1, total=4) # define config var name for potential publication dependencies depvar = 'remote.{}.datalad-publish-depends'.format(target) # list of remotes that are publication dependencies for the # target remote publish_depends = assure_list(ds.config.get(depvar, [])) if publish_depends: lgr.debug("Discovered publication dependencies for '%s': %s'", target, publish_depends) # cache repo type is_annex_repo = isinstance(ds.repo, AnnexRepo) # TODO prevent this when `target` is a special remote # (possibly redo) a push attempt to figure out what needs pushing # do this on the main target only, and apply the result to all # dependencies try: wannabe_gitpush = repo.push(remote=target, git_options=['--dry-run']) except Exception as e: lgr.debug( 'Dry-run push to check push configuration failed, ' 'assume no configuration: %s', e) wannabe_gitpush = [] refspecs2push = [ # if an upstream branch is set, go with it p['from_ref'] if ds.config.get( # refs come in as refs/heads/<branchname> # need to cut the prefix 'branch.{}.remote'.format(p['from_ref'][11:]), None) == target and ds.config.get('branch.{}.merge'.format(p['from_ref'][11:]), None) # if not, define target refspec explicitly to avoid having to # set an upstream branch, which would happen implicitly from # a users POV, and may also be hard to decide when publication # dependencies are present else '{}:{}'.format(p['from_ref'], p['to_ref']) for p in wannabe_gitpush # TODO: what if a publication dependency doesn't have it yet # should we not attempt to push, because the main target has it? 
if 'uptodate' not in p['operations'] and ( # cannot think of a scenario where we would want to push a # managed branch directly, instead of the corresponding branch 'refs/heads/adjusted' not in p['from_ref']) ] if not refspecs2push: lgr.debug( 'No refspecs configured for push, attempting to use active branch') # nothing was set up for push, push the current branch at minimum # TODO this is not right with managed branches active_branch = repo.get_active_branch() if not active_branch: yield dict( res_kwargs, status='impossible', message='There is no active branch, cannot determine remote ' 'branch') return if is_annex_repo: # we could face a managed branch, in which case we need to # determine the actual one and make sure it is sync'ed with the # managed one, and push that one instead. following methods can # be called unconditionally repo.localsync(managed_only=True) active_branch = repo.get_corresponding_branch( active_branch) or active_branch refspecs2push.append( # same dance as above active_branch if ds.config. get('branch.{}.merge'.format(active_branch), None ) else '{ab}:{ab}'.format(ab=active_branch)) # we know what to push and where, now dependency processing first for r in publish_depends: # simply make a call to this function again, all the same, but # target is different, pass done_fetch to avoid duplicate # and expensive calls to git-fetch yield from _push( dspath, content, # to this particular dependency r, force, jobs, res_kwargs.copy(), pbars, done_fetch=None, got_path_arg=got_path_arg, ) # and lastly the primary push target target_is_git_remote = repo.config.get('remote.{}.url'.format(target), None) is not None # only attempt, if Git knows about a URL, otherwise this is # a pure special remote that doesn't deal with the git repo if target_is_git_remote: # push the main branches of interest first, but not yet (necessarily) # the git-annex branch. We ant to push first in order to hit any # conflicts or unknown history before we move data. Otherwise out # decision making done above (--since ...) might have been # inappropriate. push_ok = True for p in _push_refspecs(repo, target, refspecs2push, force, res_kwargs.copy()): if p['status'] not in ('ok', 'notneeded'): push_ok = False yield p if not push_ok: # error-type results have been yielded, the local status quo is # outdated/invalid, stop to let user decide how to proceed. # TODO final global error result for the dataset?! return # git-annex data move # if not is_annex_repo: return if force == 'no-datatransfer': lgr.debug("Data transfer to '%s' disabled by argument", target) return log_progress(lgr.info, pbar_id, "Transfer data", label="Transfer data to '{}'".format(target), update=2, total=4) yield from _push_data( ds, target, content, force, jobs, res_kwargs.copy(), got_path_arg=got_path_arg, ) if not target_is_git_remote: # there is nothing that we need to push or sync with on the git-side # of things with this remote return log_progress(lgr.info, pbar_id, "Update availability information", label="Update availability for '{}'".format(target), update=3, total=4) # after file transfer the remote might have different commits to # the annex branch. They have to be merged locally, otherwise a # push of it further down will fail try: # fetch remote, let annex sync them locally, so that the push # later on works. # We have to fetch via the push url (if there is any), # not a pull url. 
# The latter might be dumb and without the execution of a # post-update hook we might not be able to retrieve the # server-side git-annex branch updates (and git-annex does # not trigger the hook on copy), but we know we have # full access via the push url -- we have just used it to copy. lgr.debug("Fetch 'git-annex' branch updates from '%s'", target) fetch_cmd = ['fetch', target, 'git-annex'] pushurl = repo.config.get('remote.{}.pushurl'.format(target), None) if pushurl: # for some reason overwriting remote.{target}.url # does not have any effect... fetch_cmd = [ '-c', 'url.{}.insteadof={}'.format( pushurl, repo.config.get('remote.{}.url'.format(target), None)) ] + fetch_cmd lgr.debug("Sync local annex branch from pushurl after remote " 'availability update.') repo.call_git(fetch_cmd) repo.localsync(target) except CommandError as e: # it is OK if the remote doesn't have a git-annex branch yet # (e.g. fresh repo) # TODO is this possible? we just copied? Maybe check if anything # was actually copied? if "fatal: couldn't find remote ref git-annex" not in e.stderr.lower(): raise lgr.debug('Remote does not have a git-annex branch: %s', e) # and push the annex branch to announce local availability info # too yield from _push_refspecs( repo, target, [ 'git-annex' if ds.config.get('branch.git-annex.merge', None) else 'git-annex:git-annex' ], force, res_kwargs.copy(), )
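# _push() above probes the remote with a dry-run push to learn which refspecs
# actually need updating. The same probe can be reproduced with plain git;
# this sketch parses `git push --dry-run --porcelain`, independent of
# DataLad's GitRepo.push() wrapper used in the code above.
import subprocess

def dry_run_refspecs(repo_path, remote):
    """Yield (from_ref, to_ref) pairs that a push to `remote` would update."""
    out = subprocess.run(
        ['git', '-C', repo_path, 'push', '--dry-run', '--porcelain', remote],
        capture_output=True, text=True).stdout
    for line in out.splitlines():
        if not line or line.startswith('To ') or line == 'Done':
            continue
        # porcelain format: <flag>\t<from>:<to>\t<summary>
        fields = line.split('\t')
        if len(fields) < 2 or ':' not in fields[1]:
            continue
        src, dst = fields[1].split(':', 1)
        yield src, dst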
def _get_cnmeta(self, bids): # TODO any custom handling of participants infos should eventually # be done by pybids in one way or another path_props = {} participants_fname = opj(self.ds.path, 'participants.tsv') if exists(participants_fname): try: for rx, info in yield_participant_info(bids): path_props[rx] = {'subject': info} except Exception as exc: if isinstance(exc, ImportError): raise exc lgr.warning( "Failed to load participants info due to: %s. Skipping the rest of the file", exc_str(exc)) log_progress( lgr.info, 'extractorbids', 'Start BIDS metadata extraction from %s', self.ds, total=len(self.paths), label='BIDS metadata extraction', unit=' Files', ) # now go over all files in the dataset and query pybids for its take # on each of them for f in self.paths: absfp = opj(self.ds.path, f) log_progress(lgr.info, 'extractorbids', 'Extract BIDS metadata from %s', absfp, update=1, increment=True) # BIDS carries a substantial portion of its metadata in JSON # sidecar files. We ignore them here completely # this might yield some false-negatives in theory, but # this case has not been observed in practice yet, hence # doing it cheap for now if f.endswith('.json'): continue md = {} try: md.update({ k: v for k, v in bids.get_metadata( opj(self.ds.path, f), include_entities=True).items() # no nested structures for now (can be monstrous when DICOM # metadata is embedded) if not isinstance(v, dict) }) except ValueError as e: lgr.debug( 'PyBIDS errored on file %s in %s: %s ' '(possibly not BIDS-compliant or not recognized)', f, self.ds, exc_str(e)) lgr.debug('no usable BIDS metadata for %s in %s: %s', f, self.ds, exc_str(e)) # do not raise here: # https://github.com/datalad/datalad-neuroimaging/issues/34 except Exception as e: lgr.debug('no usable BIDS metadata for %s in %s: %s', f, self.ds, exc_str(e)) if cfg.get('datalad.runtime.raiseonerror'): raise # now check all props from other sources and apply them for rx in path_props: if rx.match(f): md.update(path_props[rx]) yield f, md log_progress(lgr.info, 'extractorbids', 'Finished BIDS metadata extraction from %s', self.ds)
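# yield_participant_info() (not shown) feeds the per-subject records used
# above. A minimal, hypothetical sketch of what reading a BIDS
# participants.tsv with the standard library looks like; it assumes the
# conventional 'participant_id' column and treats ''/'n/a' as missing.
import csv

def read_participants(tsv_path):
    """Return {participant_id: {column: value}} from a BIDS participants.tsv."""
    with open(tsv_path, newline='') as f:
        reader = csv.DictReader(f, delimiter='\t')
        return {
            row['participant_id']: {
                k: v for k, v in row.items()
                if k != 'participant_id' and v not in ('', 'n/a')
            }
            for row in reader
        }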
def __call__( archive, *, dataset=None, annex=None, add_archive_leading_dir=False, strip_leading_dirs=False, leading_dirs_depth=None, leading_dirs_consider=None, use_current_dir=False, delete=False, key=False, exclude=None, rename=None, existing='fail', annex_options=None, copy=False, commit=True, allow_dirty=False, stats=None, drop_after=False, delete_after=False): if exclude: exclude = ensure_tuple_or_list(exclude) if rename: rename = ensure_tuple_or_list(rename) ds = require_dataset(dataset, check_installed=True, purpose='add-archive-content') # set up common params for result records res_kwargs = { 'action': 'add-archive-content', 'logger': lgr, } if not isinstance(ds.repo, AnnexRepo): yield get_status_dict( ds=ds, status='impossible', message="Can't operate in a pure Git repository", **res_kwargs ) return if annex: warnings.warn( "datalad add_archive_content's `annex` parameter is " "deprecated and will be removed in a future release. " "Use the 'dataset' parameter instead.", DeprecationWarning) annex = ds.repo # get the archive path relative from the ds root archive_path = resolve_path(archive, ds=dataset) # let Status decide whether we can act on the given file for s in ds.status( path=archive_path, on_failure='ignore', result_renderer='disabled'): if s['status'] == 'error': if 'path not underneath the reference dataset %s' in s['message']: yield get_status_dict( ds=ds, status='impossible', message='Can not add archive outside of the dataset', **res_kwargs) return # status errored & we haven't anticipated the cause. Bubble up yield s return elif s['state'] == 'untracked': # we can't act on an untracked file message = ( "Can not add an untracked archive. " "Run 'datalad save {}'".format(archive) ) yield get_status_dict( ds=ds, status='impossible', message=message, **res_kwargs) return if not allow_dirty and annex.dirty: # error out here if the dataset contains untracked changes yield get_status_dict( ds=ds, status='impossible', message=( 'clean dataset required. ' 'Use `datalad status` to inspect unsaved changes'), **res_kwargs ) return # ensure the archive exists, status doesn't error on a non-existing file if not key and not lexists(archive_path): yield get_status_dict( ds=ds, status='impossible', message=( 'No such file: {}'.format(archive_path), ), **res_kwargs ) return if not key: check_path = archive_path.relative_to(ds.pathobj) # TODO: support adding archives content from outside the annex/repo origin = 'archive' # can become get_file_annexinfo once #6104 is merged key = annex.get_file_annexinfo(check_path)['key'] if not key: raise RuntimeError( f"Archive must be an annexed file in {ds}") archive_dir = Path(archive_path).parent else: origin = 'key' key = archive # We must not have anything to do with the location under .git/annex archive_dir = None # instead, we will go from the current directory use_current_dir = True archive_basename = file_basename(archive) if not key: # if we didn't manage to get a key, the file must be in Git raise NotImplementedError( "Provided file %s does not seem to be under annex control. " "We don't support adding everything straight to Git" % archive ) # figure out our location pwd = getpwd() # are we in a subdirectory of the repository? 
pwd_in_root = annex.path == archive_dir # then we should add content under that subdirectory, # get the path relative to the repo top if use_current_dir: # extract the archive under the current directory, not the directory # where the archive is located extract_rpath = Path(pwd).relative_to(ds.path) \ if not pwd_in_root \ else None else: extract_rpath = archive_dir.relative_to(ds.path) # relpath might return '.' as the relative path to curdir, which then normalize_paths # would take as instructions to really go from cwd, so we need to sanitize if extract_rpath == curdir: extract_rpath = None try: key_rpath = annex.get_contentlocation(key) except: # the only probable reason for this to fail is that there is no # content present raise RuntimeError( "Content of %s seems to be N/A. Fetch it first" % key ) # now we simply need to go through every file in that archive and lgr.info( "Adding content of the archive %s into annex %s", archive, annex ) from datalad.customremotes.archives import ArchiveAnnexCustomRemote # TODO: shouldn't we be able just to pass existing AnnexRepo instance? # TODO: we will use persistent cache so we could just (ab)use possibly extracted archive # OK, let's ignore that the following class is actually a special # remote implementation, and use it only to work with its cache annexarchive = ArchiveAnnexCustomRemote(annex=None, path=annex.path, persistent_cache=True) # We will move extracted content so it must not exist prior running annexarchive.cache.allow_existing = True earchive = annexarchive.cache[key_rpath] # make sure there is an enabled datalad-archives special remote ensure_datalad_remote(ds.repo, remote=ARCHIVES_SPECIAL_REMOTE, autoenable=True) precommitted = False old_always_commit = annex.always_commit # batch mode is disabled when faking dates, we want to always commit annex.always_commit = annex.fake_dates_enabled if annex_options: if isinstance(annex_options, str): annex_options = split_cmdline(annex_options) delete_after_rpath = None prefix_dir = basename(tempfile.mkdtemp(prefix=".datalad", dir=annex.path)) \ if delete_after \ else None # dedicated stats which would be added to passed in (if any) outside_stats = stats stats = ActivityStats() try: # keep track of extracted files for progress bar logging file_counter = 0 # iterative over all files in the archive extracted_files = list(earchive.get_extracted_files()) # start a progress bar for extraction pbar_id = f'add-archive-{archive_path}' log_progress( lgr.info, pbar_id, 'Extracting archive', label="Extracting archive", unit=' Files', total = len(extracted_files), noninteractive_level = logging.INFO) for extracted_file in extracted_files: file_counter += 1 files_left = len(extracted_files) - file_counter log_progress( lgr.info, pbar_id, "Files to extract %i ", files_left, update=1, increment=True, noninteractive_level=logging.DEBUG) stats.files += 1 extracted_path = Path(earchive.path) / Path(extracted_file) if extracted_path.is_symlink(): link_path = str(extracted_path.resolve()) if not exists(link_path): # TODO: config addarchive.symlink-broken='skip' lgr.warning( "Path %s points to non-existing file %s" % (extracted_path, link_path) ) stats.skipped += 1 continue # TODO: check if points outside of archive - warn & skip url = annexarchive.get_file_url( archive_key=key, file=extracted_file, size=os.stat(extracted_path).st_size) # preliminary target name which might get modified by renames target_file_orig = target_file = Path(extracted_file) # stream archives would not have had the original filename # 
information in them, so would be extracted under a name # derived from their annex key. # Provide ad-hoc handling for such cases if (len(extracted_files) == 1 and Path(archive).suffix in ('.xz', '.gz', '.lzma') and Path(key_rpath).name.startswith(Path( extracted_file).name)): # take archive's name without extension for filename & place # where it was originally extracted target_file = \ Path(extracted_file).parent / Path(archive).stem if strip_leading_dirs: leading_dir = earchive.get_leading_directory( depth=leading_dirs_depth, exclude=exclude, consider=leading_dirs_consider) leading_dir_len = \ len(leading_dir) + len(opsep) if leading_dir else 0 target_file = str(target_file)[leading_dir_len:] if add_archive_leading_dir: # place extracted content under a directory corresponding to # the archive name with suffix stripped. target_file = Path(archive_basename) / target_file if rename: target_file = apply_replacement_rules(rename, str(target_file)) # continue to next iteration if extracted_file in excluded if exclude: try: # since we need to skip outside loop from inside loop for regexp in exclude: if re.search(regexp, extracted_file): lgr.debug( "Skipping {extracted_file} since contains " "{regexp} pattern".format(**locals())) stats.skipped += 1 raise StopIteration except StopIteration: continue if delete_after: # place target file in a temporary directory target_file = Path(prefix_dir) / Path(target_file) # but also allow for it in the orig target_file_orig = Path(prefix_dir) / Path(target_file_orig) target_file_path_orig = annex.pathobj / target_file_orig # If we were invoked in a subdirectory, patch together the # correct path target_file_path = extract_rpath / target_file \ if extract_rpath else target_file target_file_path = annex.pathobj / target_file_path # when the file already exists... if lexists(target_file_path): handle_existing = True if md5sum(str(target_file_path)) == \ md5sum(str(extracted_path)): if not annex.is_under_annex(str(extracted_path)): # if under annex -- must be having the same content, # we should just add possibly a new extra URL # but if under git -- we cannot/should not do # anything about it ATM if existing != 'overwrite': continue else: handle_existing = False if not handle_existing: pass # nothing... just to avoid additional indentation elif existing == 'fail': message = \ "{} exists, but would be overwritten by new file " \ "{}. Consider adjusting --existing".format\ (target_file_path, extracted_file) yield get_status_dict( ds=ds, status='error', message=message, **res_kwargs) return elif existing == 'overwrite': stats.overwritten += 1 # to make sure it doesn't conflict -- might have been a # tree rmtree(target_file_path) else: # an elaborate dance to piece together new archive names target_file_path_orig_ = target_file_path # To keep extension intact -- operate on the base of the # filename p, fn = os.path.split(target_file_path) ends_with_dot = fn.endswith('.') fn_base, fn_ext = file_basename(fn, return_ext=True) if existing == 'archive-suffix': fn_base += '-%s' % archive_basename elif existing == 'numeric-suffix': pass # archive-suffix will have the same logic else: # we shouldn't get here, argparse should catch a # non-existing value for --existing right away raise ValueError(existing) # keep incrementing index in the suffix until file # doesn't collide suf, i = '', 0 while True: connector = \ ('.' 
if (fn_ext or ends_with_dot) else '') file = fn_base + suf + connector + fn_ext target_file_path_new = \ Path(p) / Path(file) if not lexists(target_file_path_new): # we found a file name that is not yet taken break lgr.debug("Iteration %i of file name finding. " "File %s already exists", i, target_file_path_new) i += 1 suf = '.%d' % i target_file_path = target_file_path_new lgr.debug("Original file %s will be saved into %s" % (target_file_path_orig_, target_file_path)) # TODO: should we reserve smth like # stats.clobbed += 1 if target_file_path != target_file_path_orig: stats.renamed += 1 if copy: raise NotImplementedError( "Not yet copying from 'persistent' cache" ) lgr.debug("Adding %s to annex pointing to %s and with options " "%r", target_file_path, url, annex_options) out_json = annex.add_url_to_file( target_file_path, url, options=annex_options, batch=True) if 'key' in out_json and out_json['key'] is not None: # annex.is_under_annex(target_file, batch=True): # due to http://git-annex.branchable.com/bugs/annex_drop_is_not___34__in_effect__34___for_load_which_was___34__addurl_--batch__34__ed_but_not_yet_committed/?updated # we need to maintain a list of those to be dropped files if drop_after: # drop extracted files after adding to annex annex.drop_key(out_json['key'], batch=True) stats.dropped += 1 stats.add_annex += 1 else: lgr.debug("File {} was added to git, not adding url".format( target_file_path)) stats.add_git += 1 if delete_after: # we count the removal here, but don't yet perform it # to not interfer with batched processes - any pure Git # action invokes precommit which closes batched processes. stats.removed += 1 # Done with target_file -- just to have clear end of the loop del target_file if delete and archive and origin != 'key': lgr.debug("Removing the original archive {}".format(archive)) # force=True since some times might still be staged and fail annex.remove(str(archive_path), force=True) lgr.info("Finished adding %s: %s", archive, stats.as_str(mode='line')) if outside_stats: outside_stats += stats if delete_after: # force since not committed. r=True for -r (passed into git call # to recurse) delete_after_rpath = opj(extract_rpath, prefix_dir) \ if extract_rpath else prefix_dir delete_after_rpath = resolve_path(delete_after_rpath, ds=dataset) lgr.debug( "Removing extracted and annexed files under %s", delete_after_rpath ) annex.remove(str(delete_after_rpath), r=True, force=True) if commit: archive_rpath = archive_path.relative_to(ds.path) commit_stats = outside_stats if outside_stats else stats # so batched ones close and files become annex symlinks etc annex.precommit() precommitted = True if any(r.get('state', None) != 'clean' for p, r in annex.status(untracked='no').items()): annex.commit( "Added content extracted from %s %s\n\n%s" % (origin, archive_rpath, commit_stats.as_str(mode='full')), _datalad_msg=True ) commit_stats.reset() else: # don't commit upon completion pass finally: # take down the progress bar log_progress( lgr.info, pbar_id, 'Finished extraction', noninteractive_level=logging.INFO) # since we batched addurl, we should close those batched processes # if haven't done yet. 
explicitly checked to avoid any possible # "double-action" if not precommitted: annex.precommit() if delete_after_rpath: delete_after_path = opj(annex.path, delete_after_rpath) delete_after_rpath = resolve_path(delete_after_rpath, ds=dataset) if exists(delete_after_path): # should not be there # but for paranoid yoh lgr.warning( "Removing temporary directory under which extracted " "files were annexed and should have been removed: %s", delete_after_path) rmtree(delete_after_path) annex.always_commit = old_always_commit # remove what is left and/or everything upon failure earchive.clean(force=True) # remove tempfile directories (not cleaned up automatically): if prefix_dir is not None and lexists(prefix_dir): os.rmdir(prefix_dir) yield get_status_dict( ds=ds, status='ok', **res_kwargs) return annex
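# The `--existing archive-suffix`/`numeric-suffix` handling above keeps
# incrementing a numeric suffix until a non-colliding file name is found,
# while keeping the extension intact. A simplified standalone restatement of
# that naming rule (checking against a set instead of the filesystem, and
# ignoring multi-part extensions):
from pathlib import Path

def find_free_name(target, existing):
    """Return `target`, or `target` with '.<i>' inserted before the suffix,
    such that the result is not contained in `existing` (a set of Paths)."""
    target = Path(target)
    base, ext = target.stem, target.suffix
    i = 0
    candidate = target
    while candidate in existing:
        i += 1
        candidate = target.with_name('{}.{}{}'.format(base, i, ext))
    return candidate

# e.g. find_free_name('data.csv', {Path('data.csv'), Path('data.1.csv')})
# -> Path('data.2.csv')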
def _get_metadata(ds, types, global_meta=None, content_meta=None, paths=None): """Make a direct query of a dataset to extract its metadata. Parameters ---------- ds : Dataset types : list """ errored = False dsmeta = dict() contentmeta = {} if global_meta is not None and content_meta is not None and \ not global_meta and not content_meta: # both are false and not just none return dsmeta, contentmeta, errored context = { '@vocab': 'http://docs.datalad.org/schema_v{}.json'.format( vocabulary_version)} fullpathlist = paths if paths and isinstance(ds.repo, AnnexRepo): # Ugly? Jep: #2055 content_info = zip(paths, ds.repo.file_has_content(paths), ds.repo.is_under_annex(paths)) paths = [p for p, c, a in content_info if not a or c] nocontent = len(fullpathlist) - len(paths) if nocontent: # TODO better fail, or support incremental and label this file as no present lgr.warn( '{} files have no content present, ' 'some extractors will not operate on {}'.format( nocontent, 'them' if nocontent > 10 else [p for p, c, a in content_info if not c and a]) ) # pull out potential metadata field blacklist config settings blacklist = [re.compile(bl) for bl in assure_list(ds.config.obtain( 'datalad.metadata.aggregate-ignore-fields', default=[]))] # enforce size limits max_fieldsize = ds.config.obtain('datalad.metadata.maxfieldsize') # keep local, who knows what some extractors might pull in from pkg_resources import iter_entry_points # delayed heavy import extractors = {ep.name: ep for ep in iter_entry_points('datalad.metadata.extractors')} log_progress( lgr.info, 'metadataextractors', 'Start metadata extraction from %s', ds, total=len(types), label='Metadata extraction', unit=' extractors', ) for mtype in types: mtype_key = mtype log_progress( lgr.info, 'metadataextractors', 'Engage %s metadata extractor', mtype_key, update=1, increment=True) if mtype_key not in extractors: # we said that we want to fail, rather then just moan about less metadata log_progress( lgr.error, 'metadataextractors', 'Failed %s metadata extraction from %s', mtype_key, ds, ) raise ValueError( 'Enabled metadata extractor %s is not available in this installation', mtype_key) try: extractor_cls = extractors[mtype_key].load() extractor = extractor_cls( ds, paths=paths if extractor_cls.NEEDS_CONTENT else fullpathlist) except Exception as e: log_progress( lgr.error, 'metadataextractors', 'Failed %s metadata extraction from %s', mtype_key, ds, ) raise ValueError( "Failed to load metadata extractor for '%s', " "broken dataset configuration (%s)?: %s", mtype, ds, exc_str(e)) continue try: dsmeta_t, contentmeta_t = extractor.get_metadata( dataset=global_meta if global_meta is not None else ds.config.obtain( 'datalad.metadata.aggregate-dataset-{}'.format(mtype.replace('_', '-')), default=True, valtype=EnsureBool()), content=content_meta if content_meta is not None else ds.config.obtain( 'datalad.metadata.aggregate-content-{}'.format(mtype.replace('_', '-')), default=True, valtype=EnsureBool())) except Exception as e: lgr.error('Failed to get dataset metadata ({}): {}'.format( mtype, exc_str(e))) if cfg.get('datalad.runtime.raiseonerror'): log_progress( lgr.error, 'metadataextractors', 'Failed %s metadata extraction from %s', mtype_key, ds, ) raise errored = True # if we dont get global metadata we do not want content metadata continue if dsmeta_t: if _ok_metadata(dsmeta_t, mtype, ds, None): dsmeta_t = _filter_metadata_fields( dsmeta_t, maxsize=max_fieldsize, blacklist=blacklist) dsmeta[mtype_key] = dsmeta_t else: errored = True unique_cm = {} 
extractor_unique_exclude = getattr(extractor_cls, "_unique_exclude", set()) # TODO: ATM neuroimaging extractors all provide their own internal # log_progress but if they are all generators, we could provide generic # handling of the progress here. Note also that log message is actually # seems to be ignored and not used, only the label ;-) # log_progress( # lgr.debug, # 'metadataextractors_loc', # 'Metadata extraction per location for %s', mtype, # # contentmeta_t is a generator... so no cound is known # # total=len(contentmeta_t or []), # label='Metadata extraction per location', # unit=' locations', # ) for loc, meta in contentmeta_t or {}: lgr.log(5, "Analyzing metadata for %s", loc) # log_progress( # lgr.debug, # 'metadataextractors_loc', # 'ignoredatm', # label=loc, # update=1, # increment=True) if not _ok_metadata(meta, mtype, ds, loc): errored = True # log_progress( # lgr.debug, # 'metadataextractors_loc', # 'ignoredatm', # label='Failed for %s' % loc, # ) continue # we also want to store info that there was no metadata(e.g. to get a list of # files that have no metadata) # if there is an issue that a extractor needlessly produces empty records, the # extractor should be fixed and not a general switch. For example the datalad_core # issues empty records to document the presence of a file #elif not meta: # continue # apply filters meta = _filter_metadata_fields( meta, maxsize=max_fieldsize, blacklist=blacklist) if not meta: continue # assign # only ask each metadata extractor once, hence no conflict possible loc_dict = contentmeta.get(loc, {}) loc_dict[mtype_key] = meta contentmeta[loc] = loc_dict if ds.config.obtain( 'datalad.metadata.generate-unique-{}'.format(mtype_key.replace('_', '-')), default=True, valtype=EnsureBool()): # go through content metadata and inject report of unique keys # and values into `dsmeta` for k, v in iteritems(meta): if k in dsmeta.get(mtype_key, {}): # if the dataset already has a dedicated idea # about a key, we skip it from the unique list # the point of the list is to make missing info about # content known in the dataset, not to blindly # duplicate metadata. Example: list of samples data # were recorded from. If the dataset has such under # a 'sample' key, we should prefer that, over an # aggregated list of a hopefully-kinda-ok structure continue elif k in extractor_unique_exclude: # the extractor thinks this key is worthless for the purpose # of discovering whole datasets # we keep the key (so we know that some file is providing this key), # but ignore any value it came with unique_cm[k] = None continue vset = unique_cm.get(k, set()) vset.add(_val2hashable(v)) unique_cm[k] = vset # log_progress( # lgr.debug, # 'metadataextractors_loc', # 'Finished metadata extraction across locations for %s', mtype) if unique_cm: # per source storage here too ucp = dsmeta.get('datalad_unique_content_properties', {}) # important: we want to have a stable order regarding # the unique values (a list). we cannot guarantee the # same order of discovery, hence even when not using a # set above we would still need sorting. 
the challenge # is that any value can be an arbitrarily complex nested # beast # we also want to have each unique value set always come # in a top-level list, so we know if some unique value # was a list, as opposed to a list of unique values def _ensure_serializable(val): if isinstance(val, ReadOnlyDict): return {k: _ensure_serializable(v) for k, v in iteritems(val)} if isinstance(val, (tuple, list)): return [_ensure_serializable(v) for v in val] else: return val ucp[mtype_key] = { k: [_ensure_serializable(i) for i in sorted( v, key=_unique_value_key)] if v is not None else None for k, v in iteritems(unique_cm) # v == None (disable unique, but there was a value at some point) # otherwise we only want actual values, and also no single-item-lists # of a non-value # those contribute no information, but bloat the operation # (inflated number of keys, inflated storage, inflated search index, ...) if v is None or (v and not v == {''})} dsmeta['datalad_unique_content_properties'] = ucp log_progress( lgr.info, 'metadataextractors', 'Finished metadata extraction from %s', ds, ) # always identify the effective vocabulary - JSON-LD style if context: dsmeta['@context'] = context return dsmeta, contentmeta, errored
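# The 'datalad_unique_content_properties' record assembled above boils down
# to: per metadata key, collect the set of values seen across all files and
# store them as a stably sorted list. A simplified standalone sketch of that
# reduction over plain dicts (no exclusions, and assuming hashable,
# directly sortable values):
def unique_content_properties(per_file_meta):
    """per_file_meta: iterable of (path, {key: value}) -> {key: sorted values}"""
    unique = {}
    for _path, meta in per_file_meta:
        for k, v in meta.items():
            unique.setdefault(k, set()).add(v)
    return {k: sorted(vals) for k, vals in unique.items()}

# e.g. unique_content_properties(
#     [('a.nii', {'task': 'rest'}), ('b.nii', {'task': 'motor'})])
# -> {'task': ['motor', 'rest']}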
def get_metadata(self, dataset, content): if not content: return {}, [] context = {} contentmeta = [] log_progress( lgr.info, 'extractorxmp', 'Start XMP metadata extraction from %s', self.ds, total=len(self.paths), label='XMP metadata extraction', unit=' Files', ) for f in self.paths: absfp = opj(self.ds.path, f) log_progress(lgr.info, 'extractorxmp', 'Extract XMP metadata from %s', absfp, update=1, increment=True) info = file_to_dict(absfp) if not info: # got nothing, likely nothing there # TODO check if this is an XMP sidecar file, parse that, and assign metadata # to the base file continue # update vocabulary vocab = { info[ns][0][0].split(':')[0]: { '@id': ns, 'type': vocabulary_id } for ns in info } # TODO this is dirty and assumed that XMP is internally consistent with the # definitions across all files -- which it likely isn't context.update(vocab) # now pull out actual metadata # cannot do simple dict comprehension, because we need to beautify things a little meta = {} for ns in info: for key, val, props in info[ns]: if not val: # skip everything empty continue if key.count('[') > 1: # this is a nested array # MIH: I do not think it is worth going here continue if props['VALUE_IS_ARRAY']: # we'll catch the actuall array values later continue # normalize value val = assure_unicode(val) # non-breaking space val = val.replace(u"\xa0", ' ') field, idx, qual = xmp_field_re.match(key).groups() normkey = u'{}{}'.format(field, qual) if '/' in key: normkey = u'{0}<{1}>'.format(*normkey.split('/')) if idx: # array arr = meta.get(normkey, []) arr.append(val) meta[normkey] = arr else: meta[normkey] = val # compact meta = { k: v[0] if isinstance(v, list) and len(v) == 1 else v for k, v in meta.items() } contentmeta.append((f, meta)) log_progress(lgr.info, 'extractorxmp', 'Finished XMP metadata extraction from %s', self.ds) return { '@context': context, }, \ contentmeta
def _yield_ds_w_matching_siblings( ds, names, recursive=False, recursion_limit=None): """(Recursively) inspect a dataset for siblings with particular name(s) Parameters ---------- ds: Dataset The dataset to be inspected. names: iterable Sibling names (str) to test for. recursive: bool, optional Whether to recurse into subdatasets. recursion_limit: int, optional Recursion depth limit. Yields ------ str, str Path to the dataset with a matching sibling, and name of the matching sibling in that dataset. """ def _discover_all_remotes(ds, refds, **kwargs): """Helper to be run on all relevant datasets via foreach """ # Note, that `siblings` doesn't tell us about not enabled special # remotes. There could still be conflicting names we need to know # about in order to properly deal with the `existing` switch. repo = ds.repo # list of known git remotes if isinstance(repo, AnnexRepo): remotes = repo.get_remotes(exclude_special_remotes=True) remotes.extend([v['name'] for k, v in repo.get_special_remotes().items()] ) else: remotes = repo.get_remotes() return remotes if not recursive: for name in _discover_all_remotes(ds, ds): if name in names: yield ds.path, name return # in recursive mode this check could take a substantial amount of # time: employ a progress bar (or rather a counter, because we don't # know the total in advance pbar_id = 'check-siblings-{}'.format(id(ds)) log_progress( lgr.info, pbar_id, 'Start checking pre-existing sibling configuration %s', ds, label='Query siblings', unit=' Siblings', ) for res in ds.foreach_dataset( _discover_all_remotes, recursive=recursive, recursion_limit=recursion_limit, return_type='generator', result_renderer='disabled', ): # unwind result generator if 'result' in res: for name in res['result']: log_progress( lgr.info, pbar_id, 'Discovered sibling %s in dataset at %s', name, res['path'], update=1, increment=True) if name in names: yield res['path'], name log_progress( lgr.info, pbar_id, 'Finished checking pre-existing sibling configuration %s', ds, )
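# Hypothetical use of the generator above: collect all (dataset path, sibling
# name) clashes for a candidate sibling name before configuring a new one,
# so a caller can honor an `existing` switch. `ds` is assumed to be an
# installed Dataset instance.
def find_sibling_clashes(ds, name):
    return list(
        _yield_ds_w_matching_siblings(ds, names=(name,), recursive=True))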
def __call__(path=None, dataset=None, to=None, since=None, data='auto-if-wanted', force=None, recursive=False, recursion_limit=None, jobs=None): # push uses '^' to annotate the previous pushed committish, and None for default # behavior. '' was/is (to be deprecated) used in `publish`. Alert user about the mistake if since == '': raise ValueError("'since' should point to commitish or use '^'.") # we resolve here, because we need to perform inspection on what was given # as an input argument further down paths = [resolve_path(p, dataset) for p in ensure_list(path)] ds = require_dataset(dataset, check_installed=True, purpose='push') ds_repo = ds.repo res_kwargs = dict( action='publish', refds=ds.path, logger=lgr, ) get_remote_kwargs = {'exclude_special_remotes': False} \ if isinstance(ds_repo, AnnexRepo) else {} if to and to not in ds_repo.get_remotes(**get_remote_kwargs): # get again for proper error: sr = ds_repo.get_remotes(**get_remote_kwargs) # yield an error result instead of raising a ValueError, # to enable the use case of pushing to a target that # a superdataset doesn't know, but some subdatasets to # (in combination with '--on-failure ignore') yield dict(res_kwargs, status='error', path=ds.path, message="Unknown push target '{}'. {}".format( to, 'Known targets: {}.'.format(', '.join( repr(s) for s in sr)) if sr else 'No targets configured in dataset.')) return if since == '^': # figure out state of remote branch and set `since` since = _get_corresponding_remote_state(ds_repo, to) if not since: lgr.info("No tracked remote for active branch, " "detection of last pushed state not in effect.") elif since: # will blow with ValueError if unusable ds_repo.get_hexsha(since) # obtain a generator for information on the datasets to process # idea is to turn the `paths` argument into per-dataset # content listings that can be acted upon ds_spec = _datasets_since_( # important to pass unchanged dataset arg dataset, since, paths, recursive, recursion_limit) # instead of a loop, this could all be done in parallel matched_anything = False for dspath, dsrecords in ds_spec: matched_anything = True lgr.debug('Attempt push of Dataset at %s', dspath) pbars = {} yield from _push(dspath, dsrecords, to, data, force, jobs, res_kwargs.copy(), pbars, got_path_arg=True if path else False) # take down progress bars for this dataset for i, ds in pbars.items(): log_progress(lgr.info, i, 'Finished push of %s', ds) if not matched_anything: potential_remote = False if not to and len(paths) == 1: # if we get a remote name without --to, provide a hint sr = ds_repo.get_remotes(**get_remote_kwargs) potential_remote = [p for p in ensure_list(path) if p in sr] if potential_remote: hint = "{} matches a sibling name and not a path. " \ "Forgot --to?".format(potential_remote) yield dict( res_kwargs, status='notneeded', message=hint, hints=hint, type='dataset', path=ds.path, ) # there's no matching path and we have generated a hint on # fixing the call - we can return now return yield dict( res_kwargs, status='notneeded', message= 'Given constraints did not match any changes to publish', type='dataset', path=ds.path, )
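# The command above is typically driven through DataLad's Python API. A
# hedged usage sketch, assuming a recent datalad that provides `push` and a
# dataset with a configured sibling named 'origin':
import datalad.api as dl

def push_new_changes(ds_path):
    # since='^' limits the operation to changes since the last recorded
    # state of the push target, mirroring the special case handled above
    return dl.push(dataset=ds_path, to='origin', since='^',
                   recursive=True, on_failure='ignore')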
def __call__( path=None, *, dataset=None, recursive=False, recursion_limit=None): refds = require_dataset(dataset, check_installed=True, purpose="unlock") # Before passing the results to status() # * record explicitly specified non-directory paths so that we can # decide whether to yield a result for reported paths # * filter out and yield results for paths that don't exist res_paths_nondir = set() paths_lexist = None res_paths = list() if path: # Note, that we need unresolved versions of the path input to be # passed on to status. See gh-5456 for example. path = ensure_list(path) res_paths = resolve_path(path, ds=dataset) paths_lexist = [] res_paths_lexist = [] for p, p_r in zip(path, res_paths): if p_r.exists() or p_r.is_symlink(): paths_lexist.append(p) res_paths_lexist.append(p_r) if not p_r.is_dir(): res_paths_nondir.add(p_r) res_kwargs = dict(action='unlock', logger=lgr, refds=refds.path) if res_paths: for p in set(res_paths).difference(set(res_paths_lexist)): yield get_status_dict( status="impossible", path=str(p), type="file", message="path does not exist", **res_kwargs) if not (paths_lexist or paths_lexist is None): return # Collect information on the paths to unlock. to_unlock = defaultdict(list) # ds => paths (relative to ds) for res in Status()( # ATTN: it is vital to pass the `dataset` argument as it, # and not a dataset instance in order to maintain the path # semantics between here and the status() call dataset=dataset, path=paths_lexist, untracked="normal" if res_paths_nondir else "no", annex="availability", recursive=recursive, recursion_limit=recursion_limit, result_renderer="disabled", return_type="generator", on_failure="ignore"): if res["action"] != "status" or res["status"] != "ok": yield res continue has_content = res.get("has_content") if has_content: parentds = res["parentds"] to_unlock[parentds].append(op.relpath(res["path"], parentds)) elif res_paths_nondir and Path(res["path"]) in res_paths_nondir: if has_content is False: msg = "no content present" status = "impossible" elif res["state"] == "untracked": msg = "untracked" status = "impossible" else: # This is either a regular git file or an unlocked annex # file. msg = "non-annex file" status = "notneeded" yield get_status_dict( status=status, path=res["path"], type="file", message="{}; cannot unlock".format(msg), **res_kwargs) # Do the actual unlocking. for ds_path, files in to_unlock.items(): # register for final orderly take down pbar_id = f'unlock-{ds_path}' nfiles = len(files) log_progress( lgr.info, pbar_id, 'Unlocking files', unit=' Files', label='Unlocking', total=nfiles, noninteractive_level=logging.INFO, ) ds = Dataset(ds_path) for r in ds.repo._call_annex_records_items_( ["unlock"], files=files, ): log_progress( lgr.info, pbar_id, "Files to unlock %i", nfiles, update=1, increment=True, noninteractive_level=logging.DEBUG) nfiles -= 1 yield get_status_dict( path=op.join(ds.path, r['file']), status='ok' if r['success'] else 'error', type='file', **res_kwargs) if nfiles < 1: # git-annex will spend considerable time after the last # file record to finish things up, let this be known log_progress( lgr.info, pbar_id, "Recording unlocked state in git", update=0, increment=True, noninteractive_level=logging.INFO) log_progress( lgr.info, pbar_id, "Completed unlocking files", noninteractive_level=logging.INFO)
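# Corresponding, hypothetical use of unlock through the Python API, e.g. to
# make a set of annexed files editable before modifying them in place
# (assumes the standard datalad.api entry point):
import datalad.api as dl

def unlock_for_editing(ds_path, files):
    return dl.unlock(path=files, dataset=ds_path, on_failure='ignore')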
def __call__(target, dataset=None, opts=None): # only non-bare repos have hashdirmixed, so require one ds = require_dataset(dataset, check_installed=True, purpose='RIA archive export') ds_repo = ds.repo # TODO remove once datalad 0.12rc7 or later is released if not hasattr(ds_repo, 'dot_git'): from datalad.support.gitrepo import GitRepo ds_repo.dot_git = ds_repo.pathobj / GitRepo.get_git_dir(ds_repo) annex_objs = ds_repo.dot_git / 'annex' / 'objects' archive = resolve_path(target, dataset) if archive.is_dir(): archive = archive / 'archive.7z' else: archive.parent.mkdir(exist_ok=True, parents=True) if not opts: # uncompressed by default opts = ['-mx0'] res_kwargs = dict( action="export-ria-archive", logger=lgr, ) if not annex_objs.is_dir(): yield get_status_dict( ds=ds, status='notneeded', message='no annex keys present', **res_kwargs, ) return exportdir = ds_repo.dot_git / 'datalad' / 'tmp' / 'ria_archive' if exportdir.exists(): yield get_status_dict( ds=ds, status='error', message=( 'export directory already exists, please remove first: %s', str(exportdir)), **res_kwargs, ) return keypaths = [ k for k in annex_objs.glob(op.join('**', '*')) if k.is_file() ] log_progress( lgr.info, 'riaarchiveexport', 'Start RIA archive export %s', ds, total=len(keypaths), label='RIA archive export', unit=' Keys', ) for keypath in keypaths: key = keypath.name hashdir = op.join(keypath.parts[-4], keypath.parts[-3]) log_progress(lgr.info, 'riaarchiveexport', 'Export key %s to %s', key, hashdir, update=1, increment=True) keydir = exportdir / hashdir / key keydir.mkdir(parents=True, exist_ok=True) os.link(str(keypath), str(keydir / key)) log_progress(lgr.info, 'riaarchiveexport', 'Finished RIA archive export from %s', ds) try: subprocess.run( ['7z', 'u', str(archive), '.'] + opts, cwd=str(exportdir), # raise on a non-zero 7z exit so the error handling below triggers check=True, ) yield get_status_dict(path=str(archive), type='file', status='ok', **res_kwargs) except Exception as e: yield get_status_dict(path=str(archive), type='file', status='error', message=('7z failed: %s', exc_str(e)), **res_kwargs) return finally: rmtree(str(exportdir))
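# The export above hardlinks every annex key into a temporary
# <hashdir1>/<hashdir2>/<key>/<key> tree and then archives that tree with 7z.
# A standalone sketch of the layout step only, assuming the usual
# .git/annex/objects structure of two hash levels above a per-key directory:
import os
from pathlib import Path

def build_export_tree(objects_dir, export_dir):
    for keypath in Path(objects_dir).glob('**/*'):
        if not keypath.is_file():
            continue
        # the two hash levels sit directly above the per-key directory
        hashdir = Path(*keypath.parts[-4:-2])
        keydir = Path(export_dir) / hashdir / keypath.name
        keydir.mkdir(parents=True, exist_ok=True)
        os.link(keypath, keydir / keypath.name)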
def get_metadata(self, dataset, content): imgseries = {} imgs = {} log_progress( lgr.info, 'extractordicom', 'Start DICOM metadata extraction from %s', self.ds, total=len(self.paths), label='DICOM metadata extraction', unit=' Files', ) for f in self.paths: absfp = op.join(self.ds.path, f) log_progress(lgr.info, 'extractordicom', 'Extract DICOM metadata from %s', absfp, update=1, increment=True) if op.basename(f).startswith('PSg'): # ignore those dicom files, since they appear to not contain # any relevant metadata for image series, but causing trouble # (see gh-2210). We might want to change that whenever we get # a better understanding of how to deal with those files. lgr.debug("Ignoring DICOM file %s", f) continue try: d = dcm.read_file(absfp, defer_size=1000, stop_before_pixels=True) except InvalidDicomError: # we can only ignore lgr.debug('"%s" does not look like a DICOM file, skipped', f) continue if isinstance(d, DicomDir): lgr.debug( "%s appears to be a DICOMDIR file. Extraction not yet" " implemented, skipped", f) continue ddict = None if content: ddict = _struct2dict(d) imgs[f] = ddict if d.SeriesInstanceUID not in imgseries: # start with a copy of the metadata of the first dicom in a series series = _struct2dict(d) if ddict is None else ddict.copy() # store directory containing the image series (good for sorted # DICOM datasets) series_dir = op.dirname(f) series[ 'SeriesDirectory'] = series_dir if series_dir else op.curdir series_files = [] else: series, series_files = imgseries[d.SeriesInstanceUID] # compare incoming with existing metadata set series = { k: series[k] for k in series # only keys that exist and have values that are identical # across all images in the series if _convert_value(getattr(d, k, None)) == series[k] } series_files.append(f) # store imgseries[d.SeriesInstanceUID] = (series, series_files) log_progress(lgr.info, 'extractordicom', 'Finished DICOM metadata extraction from %s', self.ds) dsmeta = { '@context': context, 'Series': [info for info, files in imgseries.values()] } return ( dsmeta, # yield the corresponding series description for each file imgs.items() if content else [])
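# The series bookkeeping above keeps only those DICOM fields whose values are
# identical across all images of a series. The same reduction over plain
# per-file dictionaries, as a standalone sketch:
def series_invariants(dicts):
    """Return the key/value pairs shared (with equal values) by all dicts."""
    it = iter(dicts)
    common = dict(next(it, {}))
    for d in it:
        common = {k: v for k, v in common.items() if d.get(k) == v}
    return common

# e.g. series_invariants([{'Modality': 'MR', 'InstanceNumber': 1},
#                         {'Modality': 'MR', 'InstanceNumber': 2}])
# -> {'Modality': 'MR'}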
def _mk_search_index(self, force_reindex):
    """Generic entrypoint to index generation

    The actual work that determines the structure and content of the index
    is done by functions that are passed in as arguments:

    `meta2doc` - must return a dict for an index document from a result input
    """
    from whoosh import index as widx
    from .metadata import get_ds_aggregate_db_locations
    dbloc, db_base_path = get_ds_aggregate_db_locations(self.ds)
    # what is the latest state of the aggregated metadata
    metadata_state = self.ds.repo.get_last_commit_hexsha(
        relpath(dbloc, start=self.ds.path))
    # use a location common to all index types, they would all invalidate
    # simultaneously
    stamp_fname = opj(self.index_dir, 'datalad_metadata_state')
    index_dir = opj(self.index_dir, self._mode_label)

    if (not force_reindex) and \
            exists(index_dir) and \
            exists(stamp_fname) and \
            open(stamp_fname).read() == metadata_state:
        try:
            # TODO check that the index schema is the same
            # as the one we would have used for reindexing
            # TODO support incremental re-indexing, whoosh can do it
            idx = widx.open_dir(index_dir)
            lgr.debug(
                'Search index contains %i documents',
                idx.doc_count())
            self.idx_obj = idx
            return
        except widx.LockError as e:
            raise e
        except widx.IndexError as e:
            # Generic index error.
            # we try to regenerate
            lgr.warning(
                "Cannot open existing index %s (%s), will regenerate",
                index_dir, exc_str(e)
            )
        except widx.IndexVersionError as e:  # (msg, version, release=None)
            # Raised when you try to open an index using a format that the
            # current version of Whoosh cannot read. That is, when the index
            # you're trying to open is either not backward or forward
            # compatible with this version of Whoosh.
            # we try to regenerate
            lgr.warning(exc_str(e))
            pass
        except widx.OutOfDateError as e:
            # Raised when you try to commit changes to an index which is not
            # the latest generation.
            # this should not happen here, but if it does ... KABOOM
            raise
        except widx.EmptyIndexError as e:
            # Raised when you try to work with an index that has no indexed
            # terms.
            # we can just continue with generating an index
            pass
        except ValueError as e:
            if 'unsupported pickle protocol' in str(e):
                lgr.warning(
                    "Cannot open existing index %s (%s), will regenerate",
                    index_dir, exc_str(e)
                )
            else:
                raise

    lgr.info('{} search index'.format(
        'Rebuilding' if exists(index_dir) else 'Building'))

    if not exists(index_dir):
        os.makedirs(index_dir)

    # this is a pretty cheap call that just pulls this info from a file
    dsinfo = self.ds.metadata(
        get_aggregates=True,
        return_type='list',
        result_renderer='disabled')

    self._mk_schema(dsinfo)

    idx_obj = widx.create_in(index_dir, self.schema)
    idx = idx_obj.writer(
        # cache size per process
        limitmb=cfg.obtain('datalad.search.indexercachesize'),
        # disable parallel indexing for now till #1927 is resolved
        ## number of processes for indexing
        #procs=multiprocessing.cpu_count(),
        ## write separate index segments in each process for speed
        ## asks for writer.commit(optimize=True)
        #multisegment=True,
    )

    # load metadata of the base dataset and what it knows about all its
    # subdatasets (recursively)
    old_idx_size = 0
    old_ds_rpath = ''
    idx_size = 0
    log_progress(
        lgr.info,
        'autofieldidxbuild',
        'Start building search index',
        total=len(dsinfo),
        label='Building search index',
        unit=' Datasets',
    )
    for res in query_aggregated_metadata(
            reporton=self.documenttype,
            ds=self.ds,
            aps=[dict(path=self.ds.path, type='dataset')],
            # MIH: I cannot see a case when we would not want recursion
            # (within the metadata)
            recursive=True):
        # this assumes that files are reported after each dataset report,
        # and after a subsequent dataset report no files for the previous
        # dataset will be reported again
        meta = res.get('metadata', {})
        doc = self._meta2doc(meta)
        admin = {
            'type': res['type'],
            'path': relpath(res['path'], start=self.ds.path),
        }
        if 'parentds' in res:
            admin['parentds'] = relpath(res['parentds'], start=self.ds.path)
        if admin['type'] == 'dataset':
            if old_ds_rpath:
                lgr.debug(
                    'Added %s on dataset %s',
                    single_or_plural(
                        'document',
                        'documents',
                        idx_size - old_idx_size,
                        include_count=True),
                    old_ds_rpath)
            log_progress(lgr.info, 'autofieldidxbuild',
                         'Indexed dataset at %s', old_ds_rpath,
                         update=1, increment=True)
            old_idx_size = idx_size
            old_ds_rpath = admin['path']
            admin['id'] = res.get('dsid', None)

        doc.update({k: assure_unicode(v) for k, v in admin.items()})
        lgr.debug("Adding document to search index: {}".format(doc))
        # inject into index
        idx.add_document(**doc)
        idx_size += 1

    if old_ds_rpath:
        lgr.debug(
            'Added %s on dataset %s',
            single_or_plural(
                'document',
                'documents',
                idx_size - old_idx_size,
                include_count=True),
            old_ds_rpath)

    lgr.debug("Committing index")
    idx.commit(optimize=True)
    log_progress(
        lgr.info, 'autofieldidxbuild', 'Done building search index')

    # "timestamp" the search index to allow for automatic invalidation
    with open(stamp_fname, 'w') as f:
        f.write(metadata_state)

    lgr.info('Search index contains %i documents', idx_size)
    self.idx_obj = idx_obj
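# NOTE: the indexing above goes through Whoosh's standard API
# (create_in/open_dir, writer.add_document, writer.commit). A minimal,
# self-contained sketch of those moving parts, using a hypothetical two-field
# schema instead of the schema produced by self._mk_schema():
import tempfile

from whoosh import index as widx
from whoosh.fields import ID, TEXT, Schema
from whoosh.qparser import QueryParser

idx_dir = tempfile.mkdtemp()
schema = Schema(path=ID(stored=True, unique=True), meta=TEXT)
ix = widx.create_in(idx_dir, schema)
writer = ix.writer()
# one document per (file or dataset) result, mirroring idx.add_document(**doc)
writer.add_document(path=u'sub01/anat.nii.gz', meta=u'T1-weighted anatomical')
writer.commit(optimize=True)
with ix.searcher() as searcher:
    hits = searcher.search(QueryParser('meta', ix.schema).parse(u'anatomical'))
    assert [h['path'] for h in hits] == ['sub01/anat.nii.gz']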
def __call__(
        target,
        opts=None,
        *,  # opts is positional but optional in CLI
        dataset=None,
        remote=None,
        annex_wanted=None,
        froms=None,
        missing_content='error'):
    # only non-bare repos have hashdirmixed, so require one
    ds = require_dataset(
        dataset, check_installed=True, purpose='export to ORA archive')
    ds_repo = ds.repo

    annex_objs = ds_repo.dot_git / 'annex' / 'objects'

    archive = resolve_path(target, dataset)
    if archive.is_dir():
        archive = archive / 'archive.7z'
    else:
        archive.parent.mkdir(exist_ok=True, parents=True)

    froms = ensure_list(froms)

    if not opts:
        # uncompressed by default
        opts = ['-mx0']

    res_kwargs = dict(
        action="export-archive-ora",
        logger=lgr,
    )

    if not annex_objs.is_dir():
        yield get_status_dict(
            ds=ds,
            status='notneeded',
            message='no annex keys present',
            **res_kwargs,
        )
        return

    exportdir = ds_repo.dot_git / 'datalad' / 'tmp' / 'ora_archive'
    if exportdir.exists():
        yield get_status_dict(
            ds=ds,
            status='error',
            message=(
                'export directory already exists, please remove first: %s',
                str(exportdir)),
            **res_kwargs,
        )
        return

    def expr_to_opts(expr):
        opts = []
        expr = expr.replace('(', ' ( ').replace(')', ' ) ')
        for sub_expr in expr.split(' '):
            if len(sub_expr):
                if sub_expr in '()':
                    opts.append(f"-{sub_expr}")
                else:
                    opts.append(f"--{sub_expr}")
        return opts

    find_filters = []
    if remote:
        find_filters = ['-('] + \
            expr_to_opts(ds_repo.get_preferred_content('wanted', remote)) + \
            ['-)']
    if annex_wanted:
        find_filters.extend(expr_to_opts(annex_wanted))

    # git-annex find results need to be uniqued with a set, as git-annex find
    # will return duplicates if multiple symlinks point to the same key.
    if froms:
        keypaths = set([
            annex_objs.joinpath(k) for treeish in froms
            for k in ds_repo.call_annex_items_([
                'find', *find_filters, f"--branch={treeish}",
                "--format=${hashdirmixed}${key}/${key}\\n"])
        ])
    else:
        keypaths = set(annex_objs.joinpath(k) for k in ds_repo.call_annex_items_([
            'find', *find_filters,
            "--format=${hashdirmixed}${key}/${key}\\n"
        ]))

    log_progress(
        lgr.info,
        'oraarchiveexport',
        'Start ORA archive export %s', ds,
        total=len(keypaths),
        label='ORA archive export',
        unit=' Keys',
    )

    if missing_content == 'continue':
        missing_file_lgr_func = lgr.warning
    elif missing_content == 'ignore':
        missing_file_lgr_func = lgr.debug

    link_fx = os.link
    for keypath in keypaths:
        key = keypath.name
        hashdir = op.join(keypath.parts[-4], keypath.parts[-3])
        log_progress(
            lgr.info,
            'oraarchiveexport',
            'Export key %s to %s', key, hashdir,
            update=1,
            increment=True)
        keydir = exportdir / hashdir / key
        keydir.mkdir(parents=True, exist_ok=True)
        try:
            link_fx(str(keypath), str(keydir / key))
        except FileNotFoundError as e:
            if missing_content == 'error':
                raise IOError('Key %s has no content available' % keypath)
            missing_file_lgr_func(
                'Key %s has no content available',
                str(keypath))
        except OSError:
            lgr.warning(
                'No hard links supported at %s, will copy files instead',
                str(keypath))
            # no hard links supported
            # switch function after first error
            link_fx = shutil.copyfile
            link_fx(str(keypath), str(keydir / key))

    log_progress(
        lgr.info,
        'oraarchiveexport',
        'Finished ORA archive export from %s', ds
    )
    try:
        subprocess.run(
            ['7z', 'u', str(archive), '.'] + opts,
            cwd=str(exportdir),
            # surface a non-zero 7z exit as an error result below
            check=True,
        )
        yield get_status_dict(
            path=str(archive),
            type='file',
            status='ok',
            **res_kwargs)
    except Exception as e:
        ce = CapturedException(e)
        yield get_status_dict(
            path=str(archive),
            type='file',
            status='error',
            message=('7z failed: %s', ce),
            exception=ce,
            **res_kwargs)
        return
    finally:
        rmtree(str(exportdir))
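# NOTE: expr_to_opts() above turns a git-annex preferred-content expression
# into `git annex find` filter options: parentheses become -( / -), every
# other term becomes a long option. A standalone re-statement of that
# translation, applied to a hypothetical expression, just to show the mapping:
def _demo_expr_to_opts(expr):
    opts = []
    expr = expr.replace('(', ' ( ').replace(')', ' ) ')
    for sub_expr in expr.split(' '):
        if len(sub_expr):
            opts.append(f"-{sub_expr}" if sub_expr in '()' else f"--{sub_expr}")
    return opts

assert _demo_expr_to_opts('(include=* and exclude=*.tmp)') == \
    ['-(', '--include=*', '--and', '--exclude=*.tmp', '-)']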
def __call__(dataset, urlfile, urlformat, filenameformat,
             input_type="ext", exclude_autometa=None, meta=None,
             message=None, dry_run=False, fast=False, ifexists=None,
             missing_value=None, save=True, version_urls=False):
    # Temporarily work around gh-2269.
    url_file = urlfile
    url_format, filename_format = urlformat, filenameformat

    from requests.exceptions import RequestException

    from datalad.distribution.dataset import Dataset, require_dataset
    from datalad.interface.results import get_status_dict
    from datalad.support.annexrepo import AnnexRepo

    lgr = logging.getLogger("datalad.plugin.addurls")

    dataset = require_dataset(dataset, check_installed=False)
    if dataset.repo and not isinstance(dataset.repo, AnnexRepo):
        yield get_status_dict(action="addurls",
                              ds=dataset,
                              status="error",
                              message="not an annex repo")
        return

    if input_type == "ext":
        extension = os.path.splitext(url_file)[1]
        input_type = "json" if extension == ".json" else "csv"

    with open(url_file) as fd:
        try:
            rows, subpaths = extract(fd, input_type,
                                     url_format, filename_format,
                                     exclude_autometa, meta,
                                     dry_run,
                                     missing_value)
        except (ValueError, RequestException) as exc:
            yield get_status_dict(action="addurls",
                                  ds=dataset,
                                  status="error",
                                  message=exc_str(exc))
            return

    if len(rows) != len(set(row["filename"] for row in rows)):
        yield get_status_dict(action="addurls",
                              ds=dataset,
                              status="error",
                              message=("There are file name collisions; "
                                       "consider using {_repindex}"))
        return

    if dry_run:
        for subpath in subpaths:
            lgr.info("Would create a subdataset at %s", subpath)
        for row in rows:
            lgr.info("Would download %s to %s",
                     row["url"],
                     os.path.join(dataset.path, row["filename"]))
            lgr.info("Metadata: %s",
                     sorted(u"{}={}".format(k, v)
                            for k, v in row["meta_args"].items()))
        yield get_status_dict(action="addurls",
                              ds=dataset,
                              status="ok",
                              message="dry-run finished")
        return

    if not dataset.repo:
        # Populate a new dataset with the URLs.
        for r in dataset.create(result_xfm=None, return_type='generator'):
            yield r

    annex_options = ["--fast"] if fast else []

    for spath in subpaths:
        if os.path.exists(os.path.join(dataset.path, spath)):
            lgr.warning(
                "Not creating subdataset at existing path: %s", spath)
        else:
            for r in dataset.create(spath, result_xfm=None,
                                    return_type='generator'):
                yield r

    for row in rows:
        # Add additional information that we'll need for various operations.
        filename_abs = os.path.join(dataset.path, row["filename"])
        if row["subpath"]:
            ds_current = Dataset(os.path.join(dataset.path, row["subpath"]))
            ds_filename = os.path.relpath(filename_abs, ds_current.path)
        else:
            ds_current = dataset
            ds_filename = row["filename"]
        row.update({"filename_abs": filename_abs,
                    "ds": ds_current,
                    "ds_filename": ds_filename})

    if version_urls:
        num_urls = len(rows)
        log_progress(lgr.info, "addurls_versionurls",
                     "Versioning %d URLs", num_urls,
                     label="Versioning URLs",
                     total=num_urls, unit=" URLs")
        for row in rows:
            url = row["url"]
            try:
                row["url"] = get_versioned_url(url)
            except (ValueError, NotImplementedError) as exc:
                # We don't expect this to happen because get_versioned_url
                # should return the original URL if it isn't an S3 bucket.
                # It only raises exceptions if it doesn't know how to handle
                # the scheme for what looks like an S3 bucket.
                lgr.warning("error getting version of %s: %s",
                            row["url"], exc_str(exc))
            log_progress(lgr.info, "addurls_versionurls",
                         "Versioned result for %s: %s", url, row["url"],
                         update=1, increment=True)
        log_progress(lgr.info, "addurls_versionurls",
                     "Finished versioning URLs")

    files_to_add = set()
    for r in add_urls(rows, ifexists=ifexists, options=annex_options):
        if r["status"] == "ok":
            files_to_add.add(r["path"])
        yield r

    msg = message or """\
[DATALAD] add files from URLs

url_file='{}'
url_format='{}'
filename_format='{}'""".format(url_file, url_format, filename_format)

    if files_to_add:
        meta_rows = [r for r in rows if r["filename_abs"] in files_to_add]
        for r in add_meta(meta_rows):
            yield r

        if save:
            for r in dataset.save(path=files_to_add, message=msg,
                                  recursive=True):
                yield r
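# NOTE: the command above consumes a table of URLs (CSV or JSON) plus two
# format templates whose {field} placeholders refer to columns of that table.
# A hypothetical input to illustrate the mapping; the column names, URLs, and
# templates below are made up for this example and do not invoke DataLad:
import csv
import io

_demo_csv = io.StringIO(
    "subject,extension,url\n"
    "sub-01,nii.gz,https://example.com/data/sub-01.nii.gz\n"
    "sub-02,nii.gz,https://example.com/data/sub-02.nii.gz\n"
)
_demo_url_format = "{url}"
_demo_filename_format = "{subject}/anat.{extension}"

for row in csv.DictReader(_demo_csv):
    # each table row yields one (download URL, target filename) pair
    print(_demo_url_format.format(**row),
          '->', _demo_filename_format.format(**row))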
def __call__(self, dataset, refcommit, process_type, status):
    # shortcut
    ds = dataset

    log_progress(
        lgr.info,
        'extractorcustom',
        'Start custom metadata extraction from %s', ds,
        total=len(status) + 1,
        label='Custom metadata extraction',
        unit=' Files',
    )
    if process_type in ('all', 'content'):
        mfile_expr = _get_fmeta_expr(ds)
        for rec in status:
            log_progress(
                lgr.info,
                'extractorcustom',
                'Extracted custom metadata from %s', rec['path'],
                update=1,
                increment=True)
            # build metadata file path
            meta_fpath = _get_fmeta_objpath(ds, mfile_expr, rec)
            if meta_fpath is not None and op.exists(meta_fpath):
                try:
                    meta = jsonload(text_type(meta_fpath))
                    if isinstance(meta, dict) and meta \
                            and '@id' not in meta:
                        # in case we have a single, top-level document, and
                        # it has no ID: assume that it describes the file and
                        # assign the datalad file ID
                        meta['@id'] = get_file_id(rec)
                    if meta:
                        yield dict(
                            path=rec['path'],
                            metadata=meta,
                            type=rec['type'],
                            status='ok',
                        )
                except Exception as e:
                    yield dict(
                        path=rec['path'],
                        type=rec['type'],
                        status='error',
                        message=exc_str(e),
                    )

    if process_type in ('all', 'dataset'):
        for r in _yield_dsmeta(ds):
            yield r
        log_progress(
            lgr.info,
            'extractorcustom',
            'Extracted custom metadata from %s', ds.path,
            update=1,
            increment=True)

    log_progress(
        lgr.info,
        'extractorcustom',
        'Finished custom metadata extraction from %s', ds.path)
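# NOTE: per file, the extractor above loads a JSON sidecar document and, if
# that document is a plain dict without an '@id', injects the DataLad file ID.
# A hypothetical sidecar document and the shape of the resulting record; the
# document contents and the file ID value are made up for this example:
_demo_sidecar = {
    "description": "T1-weighted anatomical scan",
    "license": "CC-BY-4.0",
}
_demo_file_id = "datalad:SHA256E-s100--deadbeef"  # stand-in for get_file_id(rec)

if isinstance(_demo_sidecar, dict) and _demo_sidecar and '@id' not in _demo_sidecar:
    _demo_sidecar['@id'] = _demo_file_id

assert _demo_sidecar['@id'] == _demo_file_id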