def _save_outputs(ds, to_save, msg):
    """Commit the outcome of a finished command execution.

    Parameters
    ----------
    ds
      Passed to `Save` as its `dataset` argument.
    to_save
      Passed to `Save` as its first positional argument.
    msg
      Passed to `Save` as the commit message.

    Returns
    -------
    Whatever `Save.__call__` returns for ``return_type='generator'``.
    """
    save_options = dict(
        dataset=ds,
        recursive=True,
        message=msg,
        return_type='generator',
    )
    return Save.__call__(to_save, **save_options)
def handle_dirty_dataset(ds, mode, msg=None):
    """Check a dataset for unsaved changes and react as requested by `mode`

    Parameters
    ----------
    ds : Dataset or None
      Dataset to check. With `None` the function returns immediately.
    mode : {'fail', 'ignore', 'save-before'}
      Reaction to unsaved changes: raise, do nothing, or save them first.
    msg : str or None
      Commit message for the 'save-before' case. A generic message is used
      when `None`.

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
      In 'fail' mode when the dataset has no repository or is dirty, and in
      'save-before' mode when the dataset is not installed.
    ValueError
      For any other value of `mode`.
    """
    if ds is None:
        # no dataset, no work
        return

    commit_msg = '[DATALAD] auto-saved changes' if msg is None else msg

    # flush everything that is still pending (batched annex operations,
    # etc.), so that Git reflects the actual state before it is inspected
    if ds.repo:
        ds.repo.precommit()

    if mode == 'ignore':
        return

    if mode == 'fail':
        has_unsaved = not ds.repo or ds.repo.dirty
        if has_unsaved:
            raise RuntimeError('dataset {} has unsaved changes'.format(ds))
        return

    if mode == 'save-before':
        if not ds.is_installed():
            raise RuntimeError('dataset {} is not yet installed'.format(ds))
        from datalad.core.local.save import Save
        Save.__call__(dataset=ds, message=commit_msg, updated=True)
        return

    raise ValueError("unknown if-dirty mode '{}'".format(mode))
def __call__(
        path=None,
        *,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        update_mode='target',
        incremental=False,
        force_extraction=False,
        save=True):
    # Aggregate metadata of the given paths/datasets into a dataset.
    #
    # Parameters
    # ----------
    # path : path or list of paths, optional
    #   What to aggregate metadata for. Without any path, the dataset
    #   itself is aggregated.
    # dataset : optional
    #   Reference dataset, resolved via `require_dataset()`; it must be
    #   installed.
    # recursive, recursion_limit
    #   Passed on to the path annotation (`recursive` also to the lookup
    #   of pre-aggregated metadata for absent datasets).
    # update_mode : {'target', 'all'}
    #   'target' updates the aggregate metadata of the reference dataset
    #   only, 'all' updates every dataset on the way to the aggregated
    #   ones.
    # incremental
    #   Passed on to `_update_ds_agginfo()`.
    # force_extraction
    #   Passed on to `_dump_extracted_metadata()`.
    # save : bool
    #   If false, modifications are left unsaved in the dataset(s).
    #
    # Yields
    # ------
    # Result records of the path annotation, of the aggregation itself,
    # and of the final save.
    #
    # Raises
    # ------
    # ValueError
    #   For an unknown `update_mode`.
    refds = require_dataset(dataset)

    # it really doesn't work without a dataset
    ds = require_dataset(
        dataset, check_installed=True, purpose='metadata aggregation')
    path = ensure_list(path)
    if not path:
        # then current/reference dataset is "aggregated"
        # We should not add ds.path always since then --recursive would
        # also recurse current even if paths are given
        path.append(ds.path)

    # the DB location itself is not needed here, only the base path
    _, agg_base_path = get_ds_aggregate_db_locations(
        ds,
        # do not warn here, next call triggers the same warning
        warn_absent=False)
    agginfo_db = load_ds_aggregate_db(ds, abspath=True)

    to_save = []
    to_aggregate = set()
    paths_by_ds, errors = get_paths_by_ds(
        # the reference dataset was resolved above already, and `path`
        # is guaranteed to be a list at this point
        refds,
        dataset,
        paths=path,
        subdsroot_mode='super')
    for ap in _minimal_annotate_paths(
            paths_by_ds,
            errors,
            action='aggregate_metadata',
            recursive=recursive,
            recursion_limit=recursion_limit):
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        ap_type = ap.get('type', None)
        ap_state = ap.get('state', None)
        assert('parentds' in ap or ap_type == 'dataset')
        if ap_type == 'dataset' and ap_state != 'absent':
            # a present dataset, we can take directly from it
            aggsrc = ap['path']
            lgr.info('Aggregate metadata for dataset %s', aggsrc)
        else:
            # everything else needs to come from the parent
            aggsrc = ap['parentds']
            if ap_state == 'absent':
                lgr.info(
                    'Attempt to use pre-aggregate metadata for absent %s from dataset at %s',
                    ap['path'],
                    aggsrc)
            else:
                lgr.info(
                    'Aggregate metadata for %s from dataset at %s',
                    ap['path'],
                    aggsrc)

        to_aggregate.add(aggsrc)

        if ap_state == 'absent':
            # key thought: recursive is done by path annotation, hence
            # once we hit an absent dataset, we are 100% certain that
            # there is nothing to recurse into on the file system
            # hence we only have to look into the aggregated metadata
            # of the last available dataset in the dataset tree edge
            #
            # if there is nothing at this path, we need to look into the
            # parentds and check if we know anything about this path
            # if we do, we need to grab all the info and objects
            # if not, we need to error
            res = _get_dsinfo_from_aggmetadata(
                aggsrc, ap['path'], recursive, agginfo_db)
            if not isinstance(res, list):
                # anything but a list is used as the failure message
                yield get_status_dict(
                    status='impossible',
                    message=res,
                    action='aggregate_metadata',
                    path=ap['path'],
                    logger=lgr)
                continue
            # cue for aggregation
            to_aggregate.update(res)
        else:
            # actually aggregate metadata for this dataset, immediately place
            # generated objects into the aggregated or reference dataset,
            # and put info into DB to get the distributed to all datasets
            # that need to be updated
            errored = _dump_extracted_metadata(
                ds,
                Dataset(aggsrc),
                agginfo_db,
                to_save,
                force_extraction,
                agg_base_path)
            if errored:
                yield get_status_dict(
                    status='error',
                    message='Metadata extraction failed (see previous error message, set datalad.runtime.raiseonerror=yes to fail immediately)',
                    action='aggregate_metadata',
                    path=aggsrc,
                    logger=lgr)

    # at this point we have dumped all aggregated metadata into object files
    # somewhere, we know what needs saving, but have not saved anything yet,
    # and we know about the states of all aggregated datasets in the DB
    # what remains to do is to update all datasets, so they have their own
    # copy of aggregated metadata and update their respective aggregate.json
    # with info on what states we just aggregated from

    # first, let's figure out what datasets need updating at all
    # get adjacency info of the dataset tree spanning the base to all leaf
    # datasets associated with the path arguments
    if update_mode == 'all':
        ds_adj = {}
        discover_dataset_trace_to_targets(
            ds.path, to_aggregate, [], ds_adj,
            # we know that to_aggregate only lists datasets, existing and
            # absent ones -- we want to aggregate all of them, either from
            # just extracted metadata, or from previously aggregated metadata
            # of the closest superdataset
            includeds=to_aggregate)
        # TODO we need to work in the info about dataset that we only got from
        # aggregated metadata, that had no trace on the file system in here!!
        subtrees = _adj2subtrees(ds.path, ds_adj, to_aggregate)
    elif update_mode == 'target':
        subtrees = {ds.path: list(agginfo_db.keys())}
    else:
        # exceptions do not interpolate logging-style arguments, the
        # message has to be formatted explicitly
        raise ValueError(
            "unknown `update_mode` '%s' for metadata aggregation"
            % update_mode)

    # go over datasets in bottom-up fashion
    for parentds_path in sorted(subtrees, reverse=True):
        lgr.info('Update aggregate metadata in dataset at: %s', parentds_path)

        _update_ds_agginfo(
            ds.path,
            parentds_path,
            subtrees[parentds_path],
            incremental,
            agginfo_db,
            to_save)
        # update complete
        res = get_status_dict(
            status='ok',
            action='aggregate_metadata',
            path=parentds_path,
            type='dataset',
            logger=lgr)
        res.update(agginfo_db.get(parentds_path, {}))
        yield res
    #
    # save potential modifications to dataset global metadata
    #
    if not to_save:
        return
    if not save:
        # respect the caller's request to leave modifications unsaved
        lgr.debug('Not calling `save` as instructed')
        return
    lgr.info('Attempting to save %i files/datasets', len(to_save))
    for res in Save.__call__(
            # save does not need any pre-annotated path hints
            path=[r['path'] for r in to_save],
            dataset=refds,
            message='[DATALAD] Dataset aggregate metadata update',
            return_type='generator',
            result_renderer='disabled',
            result_xfm=None,
            result_filter=None,
            on_failure='ignore'):
        yield res
def run_command(cmd, dataset=None, inputs=None, outputs=None, expand=None,
                explicit=False, message=None, sidecar=None,
                extra_info=None,
                rerun_info=None, extra_inputs=None, rerun_outputs=None,
                inject=False):
    """Run `cmd` in `dataset` and record the results.

    `Run.__call__` is a simple wrapper over this function. Aside from backward
    compatibility kludges, the only difference is that `Run.__call__` doesn't
    expose all the parameters of this function. The unexposed parameters are
    listed below.

    The function is a generator. Without a command it logs a warning and
    ends without yielding anything. For a fresh (non-rerun, non-inject) and
    non-explicit run on a dirty dataset it yields a single 'impossible'
    result and ends. The same happens when the command contains a
    placeholder that cannot be formatted.

    Parameters
    ----------
    extra_info : dict, optional
        Additional information to dump with the json run record. Any value
        given here will take precedence over the standard run key. Warning: To
        avoid collisions with future keys added by `run`, callers should try to
        use fairly specific key names and are encouraged to nest fields under a
        top-level "namespace" key (e.g., the project or extension name).
    rerun_info : dict, optional
        Record from a previous run. This is used internally by `rerun`.
    extra_inputs : list, optional
        Inputs to use in addition to those specified by `inputs`. Unlike
        `inputs`, these will not be injected into the {inputs} format field.
    rerun_outputs : list, optional
        Outputs, in addition to those in `outputs`, determined automatically
        from a previous run. This is used internally by `rerun`.
    inject : bool, optional
        Record results as if a command was run, skipping input and output
        preparation and command execution. In this mode, the caller is
        responsible for ensuring that the state of the working tree is
        appropriate for recording the command's results.

    Yields
    ------
    Result records for the run.

    Raises
    ------
    The exception object handed back by `_execute_command()`, when the
    command finished with a non-zero exit code and this is not a rerun.
    """
    if not cmd:
        lgr.warning("No command given")
        return

    rel_pwd = rerun_info.get('pwd') if rerun_info else None
    if rel_pwd and dataset:
        # recording is relative to the dataset
        # NOTE(review): `dataset.path` is accessed directly, so this branch
        # presumably expects a Dataset instance rather than a path string
        # -- confirm against the rerun caller
        pwd = normpath(opj(dataset.path, rel_pwd))
        rel_pwd = relpath(pwd, dataset.path)
    else:
        pwd, rel_pwd = get_command_pwds(dataset)

    ds = require_dataset(
        dataset, check_installed=True,
        purpose='tracking outcomes of a command')
    ds_path = ds.path

    lgr.debug('tracking command output underneath %s', ds)

    if not (rerun_info or inject):  # Rerun already takes care of this.
        # For explicit=True, we probably want to check whether any inputs have
        # modifications. However, we can't just do is_dirty(..., path=inputs)
        # because we need to consider subdatasets and untracked files.
        # MIH: is_dirty() is gone, but status() can do all of the above!
        if not explicit and ds.repo.dirty:
            yield get_status_dict(
                'run',
                ds=ds,
                status='impossible',
                message=(
                    'clean dataset required to detect changes from command; '
                    'use `datalad status` to inspect unsaved changes'))
            return

    cmd = normalize_command(cmd)

    # from here on `inputs`, `extra_inputs` and `outputs` are GlobbedPaths
    # objects, not the raw specifications passed by the caller
    inputs = GlobbedPaths(inputs, pwd=pwd,
                          expand=expand in ["inputs", "both"])
    extra_inputs = GlobbedPaths(extra_inputs, pwd=pwd,
                                # Follow same expansion rules as `inputs`.
                                expand=expand in ["inputs", "both"])
    outputs = GlobbedPaths(outputs, pwd=pwd,
                           expand=expand in ["outputs", "both"])

    # ATTN: For correct path handling, all dataset commands call should be
    # unbound. They should (1) receive a string dataset argument, (2) receive
    # relative paths, and (3) happen within a chpwd(pwd) context.
    if not inject:
        with chpwd(pwd):
            for res in prepare_inputs(ds_path, inputs, extra_inputs):
                yield res

            if outputs:
                for res in _install_and_reglob(ds_path, outputs):
                    yield res
                for res in _unlock_or_remove(ds_path, outputs.expand()):
                    yield res

            if rerun_outputs is not None:
                for res in _unlock_or_remove(ds_path, rerun_outputs):
                    yield res
    else:
        # If an inject=True caller wants to override the exit code, they can do
        # so in extra_info.
        cmd_exitcode = 0
        exc = None

    try:
        cmd_expanded = format_command(
            ds, cmd,
            pwd=pwd,
            dspath=ds_path,
            # Check if the command contains "{tmpdir}" to avoid creating an
            # unnecessary temporary directory in most but not all cases.
            tmpdir=mkdtemp(prefix="datalad-run-") if "{tmpdir}" in cmd else "",
            inputs=inputs,
            outputs=outputs)
    except KeyError as exc:
        yield get_status_dict(
            'run',
            ds=ds,
            status='impossible',
            message=('command has an unrecognized placeholder: %s',
                     exc))
        return

    if not inject:
        # for a rerun, the exit code of the original run (0 if it was not
        # recorded) is passed on as the expected one
        cmd_exitcode, exc = _execute_command(
            cmd_expanded, pwd,
            expected_exit=rerun_info.get("exit", 0) if rerun_info else None)

    # amend commit message with `run` info:
    # - pwd if inside the dataset
    # - the command itself
    # - exit code of the command
    run_info = {
        'cmd': cmd,
        'exit': cmd_exitcode,
        'chain': rerun_info["chain"] if rerun_info else [],
        'inputs': inputs.paths,
        'extra_inputs': extra_inputs.paths,
        'outputs': outputs.paths,
    }
    if rel_pwd is not None:
        # only when inside the dataset to not leak information
        run_info['pwd'] = rel_pwd
    if ds.id:
        run_info["dsid"] = ds.id
    if extra_info:
        # applied last, so caller-provided keys win over the standard ones
        run_info.update(extra_info)

    record = json.dumps(run_info, indent=1, sort_keys=True, ensure_ascii=False)

    if sidecar is None:
        # no explicit request, consult the dataset configuration
        use_sidecar = ds.config.get('datalad.run.record-sidecar', default=False)
        # If ConfigManager gets the ability to say "return single value",
        # update this code to use that.
        if isinstance(use_sidecar, tuple):
            # Use same precedence as 'git config'.
            use_sidecar = use_sidecar[-1]
        use_sidecar = anything2bool(use_sidecar)
    else:
        use_sidecar = sidecar

    if use_sidecar:
        # record ID is hash of record itself
        from hashlib import md5
        record_id = md5(record.encode('utf-8')).hexdigest()
        record_dir = ds.config.get('datalad.run.record-directory', default=op.join('.datalad', 'runinfo'))
        record_path = op.join(ds_path, record_dir, record_id)
        if not op.lexists(record_path):
            # go for compression, even for minimal records not much difference, despite offset cost
            # wrap in list -- there is just one record
            dump2stream([run_info], record_path, compressed=True)

    # compose commit message
    # with a sidecar file only the quoted record ID goes into the message,
    # otherwise the complete JSON record
    msg = u"""\
[DATALAD RUNCMD] {}

=== Do not change lines below ===
{}
^^^ Do not change lines above ^^^
"""
    msg = msg.format(
        message if message is not None else _format_cmd_shorty(cmd_expanded),
        '"{}"'.format(record_id) if use_sidecar else record)

    # `None` means "save everything"; in explicit mode only the declared
    # outputs are saved, and nothing at all when none were declared
    outputs_to_save = outputs.expand() if explicit else None
    do_save = outputs_to_save is None or outputs_to_save
    if not rerun_info and cmd_exitcode:
        if do_save:
            # do not commit after a failure, but leave the prepared message
            # behind so the changes can be saved manually
            repo = ds.repo
            msg_path = relpath(opj(str(repo.dot_git), "COMMIT_EDITMSG"))
            with open(msg_path, "wb") as ofh:
                ofh.write(assure_bytes(msg))
            lgr.info("The command had a non-zero exit code. "
                     "If this is expected, you can save the changes with "
                     "'datalad save -d . -r -F %s'",
                     msg_path)
        raise exc
    elif do_save:
        with chpwd(pwd):
            for r in Save.__call__(
                    dataset=ds_path,
                    path=outputs_to_save,
                    recursive=True,
                    message=msg,
                    return_type='generator'):
                yield r
def __call__(
        path=None,
        dataset=None,
        recursive=False,
        check=True,
        save=True,
        message=None,
        if_dirty='save-before'):
    # Remove content (files, directories, subdatasets) and save the removal.
    #
    # Parameters
    # ----------
    # path : optional
    #   What to remove. Without a path the reference dataset itself is
    #   processed.
    # dataset : optional
    #   Reference dataset. At least one of `path` and `dataset` is required.
    # recursive
    #   Passed on to the path annotation and to `Uninstall`.
    # check
    #   Passed on to the uninstall helpers; if true (and the repository has
    #   a `drop` method) files are dropped with checks before removal.
    # save : bool
    #   If false, the removal is performed but not saved.
    # message : str, optional
    #   Commit message; a default is used when not given.
    # if_dirty
    #   Passed on to `Uninstall`.
    #
    # Yields
    # ------
    # Result records of path annotation, uninstall/drop, removal, and save.
    #
    # Raises
    # ------
    # InsufficientArgumentsError
    #   If neither `path` nor `dataset` is given.
    # ValueError
    #   If `path_is_under()` flags any of the paths to process.
    res_kwargs = dict(action='remove', logger=lgr)
    if not dataset and not path:
        raise InsufficientArgumentsError(
            "insufficient information for `remove`: requires at least a path or dataset")
    refds_path = Interface.get_refds_path(dataset)
    res_kwargs['refds'] = refds_path
    if refds_path and not path and not GitRepo.is_valid_repo(refds_path):
        # nothing here, nothing to remove
        yield get_status_dict(path=refds_path, status='notneeded', **res_kwargs)
        return
    if refds_path and not path:
        # act on the whole dataset if nothing else was specified
        # TODO i think that would happen automatically in annotation?
        path = refds_path

    to_process = []

    for ap in AnnotatePaths.__call__(
            path=path,
            dataset=refds_path,
            recursive=recursive,
            # we only ever want to discover immediate subdatasets, the rest
            # will happen in `uninstall`
            recursion_limit=1,
            action='remove',
            unavailable_path_status='',
            nondataset_path_status='error',
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        if ap.get('state', None) == 'absent' and \
                ap.get('parentds', None) is None:
            # nothing exists at location, and there is no parent to
            # remove from
            ap['status'] = 'notneeded'
            ap['message'] = "path does not exist and is not in a dataset"
            yield ap
            continue
        if ap.get('raw_input', False) and ap.get('type', None) == 'dataset':
            # make sure dataset sorting yields a dedicated entry for this one
            ap['process_content'] = True
        to_process.append(ap)

    if not to_process:
        # nothing left to do, potentially all errored before
        return

    if path_is_under([ap['path'] for ap in to_process]):
        # behave like `rm` and refuse to remove where we are
        raise ValueError(
            "refusing to uninstall current or parent directory")

    # now sort into datasets so we can process them one by one
    content_by_ds, ds_props, completed, nondataset_paths = \
        annotated2content_by_ds(
            to_process,
            refds_path=refds_path)
    assert(not completed)

    # iterate over all datasets, starting at the bottom
    # to make the removal of dataset content known upstairs
    to_save = []
    # track which submodules we have removed in the process, to avoid
    # failure in case we revisit them due to a subsequent path argument
    subm_removed = []
    for ds_path in sorted(content_by_ds, reverse=True):
        ds = Dataset(ds_path)
        paths = content_by_ds[ds_path]
        # non-dataset content of this dataset, keyed by path
        to_reporemove = dict()
        # PLAN any dataset that was not raw_input, uninstall (passing recursive flag)
        # if dataset itself is in paths, skip any nondataset
        # sort reverse so we get subdatasets first
        for ap in sorted(paths, key=lambda x: x['path'], reverse=True):
            if ap.get('type', None) == 'dataset':
                # entire dataset needs to go, uninstall if present, pass recursive!
                uninstall_failed = False
                if ap['path'] == refds_path or \
                        (refds_path is None and ap.get('raw_input', False)):
                    # top-level handling, cannot use regular uninstall call, as
                    # it will refuse to uninstall a top-level dataset
                    # and rightfully so, it is really a remove in that case
                    # bypass all the safety by using low-level helper
                    for r in _uninstall_dataset(ds, check=check, has_super=False,
                                                **res_kwargs):
                        if r['status'] in ('impossible', 'error'):
                            # we need to inspect if something went wrong, in order
                            # to prevent failure from removing a non-empty dir below,
                            # but at the same time allow for continued processing
                            uninstall_failed = True
                        r['refds'] = refds_path
                        yield r
                # recheck that it wasn't removed during a previous iteration
                elif ap.get('state', None) != 'absent' and GitRepo.is_valid_repo(ap['path']):
                    # anything that is not the top-level -> regular uninstall
                    # this is for subdatasets of the to-be-removed dataset
                    # we want to simply uninstall them in a regular manner
                    for r in Uninstall.__call__(
                            ap['path'],
                            dataset=refds_path,
                            recursive=recursive,
                            check=check,
                            if_dirty=if_dirty,
                            result_xfm=None,
                            result_filter=None,
                            on_failure='ignore'):
                        if r['status'] in ('impossible', 'error'):
                            # we need to inspect if something went wrong, in order
                            # to prevent failure from removing a non-empty dir below,
                            # but at the same time allow for continued processing
                            uninstall_failed = True
                        yield r
                if not ap.get('raw_input', False):
                    # we only ever want to actually unregister subdatasets that
                    # were given explicitly
                    continue
                if not uninstall_failed and \
                        not ap['path'] in subm_removed and \
                        refds_path and \
                        ap.get('parentds', None) and \
                        not (relpath(ap['path'], start=refds_path).startswith(pardir) or
                             ap['path'] == refds_path) and \
                        ap.get('registered_subds', False):
                    # strip from superdataset, but only if a dataset was given explicitly
                    # as in "remove from this dataset", but not when just a path was given
                    # as in "remove from the filesystem"
                    # NOTE(review): `subds_relpath` is not used anywhere in
                    # the visible code
                    subds_relpath = relpath(ap['path'], start=ap['parentds'])
                    # remove submodule reference
                    parentds = Dataset(ap['parentds'])
                    # play safe, will fail on dirty
                    parentds.repo.deinit_submodule(ap['path'])
                    # remove now empty submodule link
                    parentds.repo.remove(ap['path'])
                    # make a record that we removed this already, should it be
                    # revisited via another path argument, because do not reannotate
                    # the paths after every removal
                    subm_removed.append(ap['path'])
                    yield dict(ap, status='ok', **res_kwargs)
                    # need .gitmodules update in parent
                    # NOTE(review): the key is spelled `parents`, other
                    # records here use `parentds` -- possibly a typo; only
                    # `path` of these records is read below
                    to_save.append(dict(
                        path=opj(parentds.path, '.gitmodules'),
                        parents=parentds.path,
                        type='file'))
                    # and the removal itself needs to be committed
                    # inform `save` that it is OK that this path
                    # doesn't exist on the filesystem anymore
                    ap['unavailable_path_status'] = ''
                    ap['process_content'] = False
                    to_save.append(ap)
                if not uninstall_failed and exists(ap['path']):
                    # could be an empty dir in case an already uninstalled subdataset
                    # got removed
                    rmdir(ap['path'])
            else:
                # anything that is not a dataset can simply be passed on
                to_reporemove[ap['path']] = ap
        # avoid unnecessary git calls when there is nothing to do
        if to_reporemove:
            if check and hasattr(ds.repo, 'drop'):
                for r in _drop_files(ds, list(to_reporemove), check=True):
                    if r['status'] == 'error':
                        # if drop errored on that path, we can't remove it
                        # (a default is given to `pop` so that an unknown
                        # path does not raise)
                        to_reporemove.pop(r['path'], 'avoidKeyError')
                    yield r

            # the drop above may have emptied the list of removal candidates
            if to_reporemove:
                for r in ds.repo.remove(list(to_reporemove), r=True):
                    # these were removed, but we still need to save the
                    # removal
                    r_abs = opj(ds.path, r)
                    if r_abs in to_reporemove:
                        ap = to_reporemove[r_abs]
                    else:
                        ap = {
                            'path': r_abs,
                            'parentds': ds.path,
                            'refds': refds_path
                        }
                    ap['unavailable_path_status'] = ''
                    to_save.append(ap)
                    # NOTE(review): the record reports `r` as returned by
                    # `repo.remove()`, not `r_abs` -- confirm this is intended
                    yield get_status_dict(
                        status='ok',
                        path=r,
                        **res_kwargs)

    if not to_save:
        # nothing left to do, potentially all errored before
        return
    if not save:
        lgr.debug('Not calling `save` as instructed')
        return

    for res in Save.__call__(
            path=[ap["path"] for ap in to_save],
            # we might have removed the reference dataset by now, recheck
            dataset=refds_path
            if (refds_path and GitRepo.is_valid_repo(refds_path))
            else None,
            message=message if message else '[DATALAD] removed content',
            return_type='generator',
            result_xfm=None,
            result_filter=None,
            on_failure='ignore'):
        yield res
def __call__(path=None, spec=None, dataset=None, subject=None,
             anon_subject=None, acquisition=None, properties=None):
    # Derive a study specification snippet from DICOM metadata and save it.
    #
    # Parameters
    # ----------
    # path
    #   Location to get DICOM metadata for. Exactly one path is supported
    #   at the moment.
    # spec
    #   Specification file to create or extend (required).
    # dataset : optional
    #   Dataset to operate on, resolved via `require_dataset()`; it must be
    #   installed.
    # subject, anon_subject
    #   Passed on to `add_to_spec()`.
    # acquisition
    #   Not used by the code below.
    # properties : optional
    #   File name or JSON string with properties to override; each one is
    #   wrapped into a record with `approved=True`.
    #
    # Yields
    # ------
    # Result records; those of the final save are relabeled with the action
    # 'dicom2spec'.
    #
    # Raises
    # ------
    # InsufficientArgumentsError
    #   If `path` or `spec` is missing.

    # TODO: acquisition can probably be removed (or made an alternative to
    # derive spec and/or dicom location from)

    # Change, so path needs to point directly to dicom ds?
    # Or just use acq and remove path?

    dataset = require_dataset(dataset, check_installed=True,
                              purpose="spec from dicoms")

    from datalad.utils import assure_list
    if path is not None:
        path = assure_list(path)
        path = [resolve_path(p, dataset) for p in path]
    else:
        raise InsufficientArgumentsError(
            "insufficient arguments for dicom2spec: a path is required")

    # TODO: We should be able to deal with several paths at once
    #       ATM we aren't (see also commit + message of actual spec)
    assert len(path) == 1

    if not spec:
        raise InsufficientArgumentsError(
            "insufficient arguments for dicom2spec: a spec file is required")
        # TODO: That's prob. wrong. We can derive default spec from acquisition
    else:
        spec = resolve_path(spec, dataset)

    # start from the records of an existing specification file, if any
    spec_series_list = \
        list(json_py.load_stream(spec)) if op.exists(spec) else []

    # get dataset level metadata:
    found_some = False
    for meta in dataset.meta_dump(
            path,
            recursive=False,  # always False?
            reporton='datasets',
            return_type='generator',
            result_renderer='disabled'):
        if meta.get('status', None) not in ['ok', 'notneeded']:
            yield meta
            continue

        if 'dicom' not in meta['metadata']:

            # TODO: Really "notneeded" or simply not a result at all?
            yield dict(
                status='notneeded',
                message=("found no DICOM metadata for %s",
                         meta['path']),
                path=meta['path'],
                type='dataset',
                action='dicom2spec',
                logger=lgr)
            continue

        if 'Series' not in meta['metadata']['dicom'] or \
                not meta['metadata']['dicom']['Series']:
            yield dict(
                status='impossible',
                message=("no image series detected in DICOM metadata of"
                         " %s", meta['path']),
                path=meta['path'],
                type='dataset',
                action='dicom2spec',
                logger=lgr)
            continue

        found_some = True

        overrides = dict()
        if properties:
            # load from file or json string
            props = json_py.load(properties) \
                if op.exists(properties) else json_py.loads(properties)
            # turn into editable, pre-approved records
            props = {
                k: dict(value=v, approved=True)
                for k, v in props.items()
            }
            overrides.update(props)

        spec_series_list = add_to_spec(
            meta,
            spec_series_list,
            op.dirname(spec),
            subject=subject,
            anon_subject=anon_subject,
            # session=session,
            # TODO: parameter "session" was what
            # we now call acquisition. This is
            # NOT a good default for bids_session!
            # Particularly wrt to anonymization
            overrides=overrides,
            dataset=dataset)

    if not found_some:
        yield dict(
            status='impossible',
            message="found no DICOM metadata",
            # a result record refers to a single path, not to a list of
            # paths; exactly one path is supported (see above)
            path=str(path[0]),
            type='file',  # TODO: arguable should be 'file' or 'dataset', depending on path
            action='dicom2spec',
            logger=lgr)
        return

    # TODO: RF needed. This rule should go elsewhere:
    # ignore duplicates (prob. reruns of aborted runs)
    # -> convert highest id only
    # Note: This sorting is a q&d hack!
    # TODO: Sorting needs to become more sophisticated + include notion of :all
    spec_series_list = sorted(
        spec_series_list,
        key=lambda x: get_specval(x, 'id') if 'id' in x else 0)
    for idx, series in enumerate(spec_series_list):
        # Note: Removed the following line from condition below,
        # since it appears to be pointless. Value for 'converter'
        # used to be 'heudiconv' or 'ignore' for a 'dicomseries', so
        # it's not clear ATM what case this could possibly have catched:
        # heuristic.has_specval(spec_series_list[i], "converter") and \
        #
        # a series is ignored when a later record with the same description
        # and a higher id carries the same "bids-run" value
        if series["type"] == "dicomseries" and \
                has_specval(series, "bids-run") and \
                get_specval(series, "bids-run") in [
                    get_specval(later, "bids-run")
                    for later in spec_series_list[idx + 1:]
                    if get_specval(later, "description") ==
                    get_specval(series, "description") and
                    get_specval(later, "id") > get_specval(series, "id")]:
            lgr.debug("Ignore SeriesNumber %s for conversion", idx)
            series["tags"].append('hirni-dicom-converter-ignore')

    lgr.debug("Storing specification (%s)", spec)
    # store as a stream (one record per file) to be able to
    # easily concat files without having to parse them, or
    # process them line by line without having to fully parse them
    from datalad_hirni.support.spec_helpers import sort_spec
    # Note: Sorting paradigm needs to change. See above.
    # spec_series_list = sorted(spec_series_list, key=lambda x: sort_spec(x))
    json_py.dump2stream(spec_series_list, spec)

    # make sure spec is in git:
    dataset.repo.set_gitattributes(
        [(spec, {'annex.largefiles': 'nothing'})], '.gitattributes')

    for r in Save.__call__(
            dataset=dataset,
            path=[spec, '.gitattributes'],
            to_git=True,
            message="[HIRNI] Added study specification "
                    "snippet for %s" %
                    op.relpath(path[0], dataset.path),
            return_type='generator',
            result_renderer='disabled'):
        if r.get('status', None) not in ['ok', 'notneeded']:
            yield r
        elif r['path'] in [spec, op.join(dataset.path, '.gitattributes')] \
                and r['type'] == 'file':
            # present the saved files as results of this command
            r['action'] = 'dicom2spec'
            r['logger'] = lgr
            yield r
        elif r['type'] == 'dataset':
            # 'ok' or 'notneeded' for a dataset is okay, since we commit
            # the spec. But it's not a result to yield
            continue
        else:
            # anything else shouldn't happen
            yield dict(
                status='error',
                message=("unexpected result from save: %s", r),
                path=spec,  # TODO: This actually isn't clear - get it from `r`
                type='file',
                action='dicom2spec',
                logger=lgr)
def __call__(urls, *, dataset=None, path=None, overwrite=False,
             archive=False, save=True, message=None):
    # Download one or more URLs, optionally save the files in a dataset,
    # register the URLs with git-annex, and extract downloaded archives.
    #
    # Yields one result record per attempted download, followed by the
    # records of the save and of the archive extraction.
    from ..downloaders.http import HTTPDownloader
    from ..downloaders.providers import Providers

    ds = None
    if save or dataset:
        try:
            ds = require_dataset(
                dataset, check_installed=True, purpose='download urls')
        except NoDatasetFound:
            # working without a dataset is fine
            pass

    report_base = {"action": "download_url", "ds": ds}

    dataset_is_instance = isinstance(dataset, Dataset)
    target_is_dir = not path or str(path).endswith(op.sep)
    path = str(resolve_path(path or op.curdir, ds=dataset))
    if target_is_dir:
        # the trailing separator got lost in resolve_path(), but
        # download() needs it to recognize a directory target
        path = path + op.sep
    urls = ensure_list_from_str(urls)

    if not target_is_dir and len(urls) > 1:
        yield get_status_dict(
            status="error",
            message=(
                "When specifying multiple urls, --path should point to "
                "a directory target (with a trailing separator). Got %r",
                path),
            type="file",
            path=path,
            **report_base)
        return

    if not target_is_dir:
        # a single URL with a file target
        if archive:
            # keep the file suffix(es) of the URL, so that later archive
            # processing can tell the archive type from the name alone,
            # without mime type inspection
            from datalad.support.network import URL
            url_suffixes = PurePosixPath(URL(urls[0]).path).suffixes
            if Path(path).suffixes != url_suffixes:
                path += ''.join(url_suffixes)
        # download() could cope with an existing directory and would place
        # the file underneath it, but a directory target must be given with
        # a trailing separator -- enforce that for consistency
        if op.isdir(path):
            yield get_status_dict(
                status="error",
                message=(
                    "Non-directory path given (no trailing separator) "
                    "but a directory with that name (after adding archive "
                    "suffix) exists"),
                type="file",
                path=path,
                **report_base)
            return

    # TODO setup fancy ui.progressbars doing this in parallel and reporting overall progress
    # in % of urls which were already downloaded
    providers = Providers.from_config_files()
    downloaded_paths = []
    # maps each downloaded path to the URL it came from
    path_urls = {}
    need_datalad_remote = False
    for url in urls:
        # somewhat "ugly"
        downloader = providers.get_provider(url).get_downloader(url)
        try:
            downloaded_path = downloader.download(
                url, path=path, overwrite=overwrite)
        except Exception as e:
            ce = CapturedException(e)
            yield get_status_dict(
                status="error",
                message=str(ce),
                type="file",
                path=path,
                exception=ce,
                **report_base)
            continue

        if not need_datalad_remote:
            requires_remote = (
                downloader.authenticator
                or downloader.credential
                or type(downloader) != HTTPDownloader)
            if requires_remote:
                need_datalad_remote = True
        downloaded_paths.append(downloaded_path)
        path_urls[downloaded_path] = url
        yield get_status_dict(
            status="ok",
            type="file",
            path=downloaded_path,
            **report_base)

    if not (downloaded_paths and save and ds is not None):
        # nothing to save, or nowhere to save it
        return

    msg = message or """\
[DATALAD] Download URLs

URLs:
 {}""".format("\n ".join(urls))

    yield from Save()(
        downloaded_paths,
        message=msg,
        # ATTN: Pass the original dataset argument to
        # preserve relative path handling semantics.
        dataset=dataset,
        return_type="generator",
        result_renderer='disabled',
        result_xfm=None,
        result_filter=None,
        on_failure="ignore")

    ds_repo = ds.repo
    if not isinstance(ds_repo, AnnexRepo):
        # URLs can only be registered in an annex
        return

    if need_datalad_remote:
        from datalad.customremotes.base import (
            ensure_datalad_remote,
        )
        ensure_datalad_remote(ds_repo, autoenable=True, encryption=None)

    if dataset_is_instance:
        # Paths in `downloaded_paths` are already relative to the
        # dataset.
        rpaths = {p: p for p in downloaded_paths}
    else:
        # Paths in `downloaded_paths` are already relative to the
        # current working directory. Take these relative to the
        # dataset for use with the AnnexRepo method calls.
        rpaths = {}
        resolved_paths = resolve_path(downloaded_paths, ds=dataset)
        for orig_path, resolved in zip(downloaded_paths, resolved_paths):
            rpath = path_under_rev_dataset(ds, resolved)
            if not rpath:
                lgr.warning(
                    "Path %s not under dataset %s", orig_path, ds)
                continue
            rpaths[str(rpath)] = orig_path

    annex_flags = ds_repo.is_under_annex(list(rpaths))
    annex_paths = [
        p for p, annexed in zip(rpaths, annex_flags) if annexed]

    # avoid batch mode for single files
    # https://github.com/datalad/datalad/issues/2849
    use_batch = len(annex_paths) > 1
    for annexed_path in annex_paths:
        url = path_urls[rpaths[annexed_path]]
        try:
            # The file is already present. This is just to
            # register the URL.
            ds_repo.add_url_to_file(
                annexed_path,
                url,
                batch=use_batch,
                # bypass URL size check, we already have the file
                options=['--relaxed'])
        except CommandError as exc:
            lgr.warning(
                "Registering %s with %s failed: %s",
                annexed_path, url, CapturedException(exc))

    if archive:
        for annexed_path in annex_paths:
            yield from ds.add_archive_content(
                annexed_path,
                delete=True,
                on_failure='ignore',
                return_type='generator',
                result_renderer='disabled')
def __call__(
        path=None,
        *,
        dataset=None,
        drop='datasets',
        reckless=None,
        message=None,
        jobs=None,
        # deprecated below
        recursive=None,
        check=None,
        save=None,
        if_dirty=None):
    # Remove content from a dataset: drop it first, then wipe whatever is
    # left on the file system (or unregister a vanished subdataset), and
    # finally save the result.
    #
    # `recursive`, `save` and `if_dirty` are ignored, and `check` is only
    # translated into `reckless`; using any of them triggers a
    # DeprecationWarning.

    # deprecation notices, issued in this order for every argument given
    deprecated_args = (
        (if_dirty,
         "The `if_dirty` argument of `datalad remove` is ignored, "
         "it can be removed for a safe-by-default behavior. For "
         "other cases consider the `reckless` argument."),
        (save,
         "The `save` argument of `datalad remove` is ignored. "
         "A dataset modification is always saved. Consider "
         "`save --amend` if post-remove fix-ups are needed."),
        (recursive,
         "The `recursive` argument of `datalad remove` is ignored. "
         "Removal operations are always recursive, and the parameter "
         "can be stripped from calls for a safe-by-default behavior. "),
        (check,
         "The `check` argument of `datalad remove` is deprecated, "
         "use the `reckless` argument instead."),
    )
    for value, notice in deprecated_args:
        if value is not None:
            warnings.warn(notice, DeprecationWarning)

    if check is False:
        # map the deprecated switch onto its replacement
        if reckless is not None:
            raise ValueError(
                'Must not use deprecated `check` argument, and new '
                '`reckless` argument together with `datalad remove`.')
        reckless = 'availability'

    refds = require_dataset(dataset, check_installed=True, purpose='remove')
    # resolve paths the same way drop is going to
    paths_by_ds, errors = get_paths_by_ds(
        refds,
        dataset,
        ensure_list(path),
        # super-mode will readily tell us which datasets to
        # save as the end
        subdsroot_mode='super')

    all_dropped = True
    drop_results = Drop.__call__(
        dataset=dataset,
        path=path,
        what=drop,
        reckless=reckless,
        recursive=True,
        recursion_limit=None,
        jobs=jobs,
        result_xfm=None,
        return_type='generator',
        result_renderer='disabled',
        # delegate error handling here
        on_failure='ignore')
    for drop_res in drop_results:
        if drop_res.get('status') not in ('ok', 'notneeded'):
            all_dropped = False
        yield drop_res

    if not all_dropped:
        # there will be 'rm -rf' below, so play safe
        lgr.debug('Observed drop failure, will not attempt remove')
        return

    for dpath, paths in paths_by_ds.items():
        # without specific paths the dataset itself is the target
        targets = [dpath] if paths is None else paths
        for delpath in targets:
            # cannot use .exists() must forsee dead symlinks
            if not lexists(str(delpath)):
                # nothing is left on the file system at this path. Either
                # the parent dataset vanished already, or a dataset was
                # dropped and still has to be unregistered from its
                # parent -> `git rm`
                if dpath.exists():
                    GitRepo(dpath).call_git(
                        # no need for recursion, we know that even the root
                        # path not longer exists
                        ['rm', '-q'],
                        files=[str(delpath.relative_to(dpath))])
                    # this path was already being removed by drop
                    # so it must belong to a dropped dataset
                    # save won't report about this, let's do it
                    yield dict(
                        action='remove',
                        status='ok',
                        path=str(delpath),
                        type='dataset',
                    )
                continue

            # something is still around on the file system. No need to
            # fiddle with Git, just wipe it out; the save() below will
            # act on it properly
            if delpath.is_dir():
                lgr.debug('Remove directory: %s', delpath)
                rmtree(delpath)
            else:
                lgr.debug('Remove file: %s', delpath)
                delpath.unlink()

    if not refds.is_installed():
        # we already dropped the whole thing
        return

    commit_msg = message if message else '[DATALAD] removed content'
    save_results = Save.__call__(
        dataset=dataset,
        path=path,
        message=commit_msg,
        return_type='generator',
        result_renderer='disabled',
        result_xfm=None,
        result_filter=None,
        on_failure='ignore')
    for save_res in save_results:
        if save_res.get('action') == 'delete':
            # normalize to previous remove results
            save_res['action'] = 'remove'
        yield save_res
def run_command(cmd, dataset=None, inputs=None, outputs=None, expand=None,
                assume_ready=None, explicit=False, message=None, sidecar=None,
                dry_run=False, jobs=None,
                extra_info=None,
                rerun_info=None,
                extra_inputs=None,
                rerun_outputs=None,
                inject=False,
                parametric_record=False,
                remove_outputs=False,
                skip_dirtycheck=False,
                yield_expanded=None):
    """Run `cmd` in `dataset` and record the results.

    `Run.__call__` is a simple wrapper over this function. Aside from backward
    compatibility kludges, the only difference is that `Run.__call__` doesn't
    expose all the parameters of this function. The unexposed parameters are
    listed below.

    Parameters
    ----------
    extra_info : dict, optional
        Additional information to dump with the json run record. Any value
        given here will take precedence over the standard run key. Warning: To
        avoid collisions with future keys added by `run`, callers should try to
        use fairly specific key names and are encouraged to nest fields under a
        top-level "namespace" key (e.g., the project or extension name).
    rerun_info : dict, optional
        Record from a previous run. This is used internally by `rerun`.
    extra_inputs : list, optional
        Inputs to use in addition to those specified by `inputs`. Unlike
        `inputs`, these will not be injected into the {inputs} format field.
    rerun_outputs : list, optional
        Outputs, in addition to those in `outputs`, determined automatically
        from a previous run. This is used internally by `rerun`.
    inject : bool, optional
        Record results as if a command was run, skipping input and output
        preparation and command execution. In this mode, the caller is
        responsible for ensuring that the state of the working tree is
        appropriate for recording the command's results.
    parametric_record : bool, optional
        If enabled, substitution placeholders in the input/output specification
        are retained verbatim in the run record. This enables using a single
        run record for multiple different re-runs via individual
        parametrization.
    remove_outputs : bool, optional
        If enabled, all declared outputs will be removed prior command
        execution, except for paths that are also declared inputs.
    skip_dirtycheck : bool, optional
        If enabled, a check for dataset modifications is unconditionally
        disabled, even if other parameters would indicate otherwise. This
        can be used by callers that already performed analog verifications
        to avoid duplicate processing.
    yield_expanded : {'inputs', 'outputs', 'both'}, optional
        Include a 'expanded_%s' item into the run result with the expanded list
        of paths matching the inputs and/or outputs specification,
        respectively.

    Yields
    ------
    Result records for the run.
    """
    if not cmd:
        lgr.warning("No command given")
        return

    specs = {
        k: ensure_list(v) for k, v in (('inputs', inputs),
                                       ('extra_inputs', extra_inputs),
                                       ('outputs', outputs))
    }

    rel_pwd = rerun_info.get('pwd') if rerun_info else None
    if rel_pwd and dataset:
        # recording is relative to the dataset
        pwd = op.normpath(op.join(dataset.path, rel_pwd))
        rel_pwd = op.relpath(pwd, dataset.path)
    else:
        pwd, rel_pwd = get_command_pwds(dataset)

    ds = require_dataset(
        dataset, check_installed=True,
        purpose='track command outcomes')
    ds_path = ds.path

    lgr.debug('tracking command output underneath %s', ds)

    # skip for callers that already take care of this
    if not (skip_dirtycheck or rerun_info or inject):
        # For explicit=True, we probably want to check whether any inputs have
        # modifications. However, we can't just do is_dirty(..., path=inputs)
        # because we need to consider subdatasets and untracked files.
        # MIH: is_dirty() is gone, but status() can do all of the above!
        if not explicit and ds.repo.dirty:
            yield get_status_dict(
                'run',
                ds=ds,
                status='impossible',
                message=(
                    'clean dataset required to detect changes from command; '
                    'use `datalad status` to inspect unsaved changes'))
            return

    # everything below expects the string-form of the command
    cmd = normalize_command(cmd)
    # pull substitutions from config
    cmd_fmt_kwargs = _get_substitutions(ds)
    # amend with unexpanded dependency/output specifications, which might
    # themselves contain substitution placeholder
    for n, val in specs.items():
        if val:
            cmd_fmt_kwargs[n] = val

    # apply the substitution to the IO specs
    expanded_specs = {
        k: _format_iospecs(v, **cmd_fmt_kwargs) for k, v in specs.items()
    }
    # try-except to catch expansion issues in _format_iospecs() which
    # expands placeholders in dependency/output specification before
    # globbing
    try:
        globbed = {
            k: GlobbedPaths(
                v,
                pwd=pwd,
                expand=expand in (
                    # extra_inputs follow same expansion rules as `inputs`.
                    ["both"] + (['outputs'] if k == 'outputs' else ['inputs'])
                ))
            for k, v in expanded_specs.items()
        }
    except KeyError as exc:
        yield get_status_dict(
            'run',
            ds=ds,
            status='impossible',
            message=(
                'input/output specification has an unrecognized '
                'placeholder: %s', exc))
        return

    if not (inject or dry_run):
        yield from _prep_worktree(
            ds_path, pwd, globbed,
            assume_ready=assume_ready,
            remove_outputs=remove_outputs,
            rerun_outputs=rerun_outputs,
            # forward the caller's `jobs` setting rather than discarding it;
            # the final save below receives the very same value
            jobs=jobs)
    else:
        # If an inject=True caller wants to override the exit code, they can do
        # so in extra_info.
        cmd_exitcode = 0
        exc = None

    # prepare command formatting by extending the set of configurable
    # substitutions with the essential components
    cmd_fmt_kwargs.update(
        pwd=pwd,
        dspath=ds_path,
        # Check if the command contains "{tmpdir}" to avoid creating an
        # unnecessary temporary directory in most but not all cases.
        tmpdir=mkdtemp(prefix="datalad-run-") if "{tmpdir}" in cmd else "",
        # the following override any matching non-glob substitution
        # values
        inputs=globbed['inputs'],
        outputs=globbed['outputs'],
    )
    try:
        cmd_expanded = format_command(ds, cmd, **cmd_fmt_kwargs)
    except KeyError as exc:
        yield get_status_dict(
            'run',
            ds=ds,
            status='impossible',
            message=('command has an unrecognized placeholder: %s',
                     exc))
        return

    # amend commit message with `run` info:
    # - pwd if inside the dataset
    # - the command itself
    # - exit code of the command
    run_info = {
        'cmd': cmd,
        # rerun does not handle any prop being None, hence all
        # the `or/else []`
        'chain': rerun_info["chain"] if rerun_info else [],
    }
    # for all following we need to make sure that the raw
    # specifications, incl. any placeholders make it into
    # the run-record to enable "parametric" re-runs
    # ...except when expansion was requested
    for k, v in specs.items():
        run_info[k] = globbed[k].paths \
            if expand in ["both"] + (
                ['outputs'] if k == 'outputs' else ['inputs']) \
            else (v if parametric_record
                  else expanded_specs[k]) or []

    if rel_pwd is not None:
        # only when inside the dataset to not leak information
        run_info['pwd'] = rel_pwd
    if ds.id:
        run_info["dsid"] = ds.id
    if extra_info:
        run_info.update(extra_info)

    if dry_run:
        yield get_status_dict(
            "run [dry-run]", ds=ds, status="ok", message="Dry run",
            run_info=run_info,
            dry_run_info=dict(
                cmd_expanded=cmd_expanded,
                pwd_full=pwd,
                **{k: globbed[k].expand() for k in ('inputs', 'outputs')},
            )
        )
        return

    if not inject:
        cmd_exitcode, exc = _execute_command(cmd_expanded, pwd)
        run_info['exit'] = cmd_exitcode

    # Re-glob to capture any new outputs.
    #
    # TODO: If a warning or error is desired when an --output pattern doesn't
    # have a match, this would be the spot to do it.
    if explicit or expand in ["outputs", "both"]:
        # also for explicit mode we have to re-glob to be able to save all
        # matching outputs
        globbed['outputs'].expand(refresh=True)
        if expand in ["outputs", "both"]:
            run_info["outputs"] = globbed['outputs'].paths

    # create the run record, either as a string, or written to a file
    # depending on the config/request
    record, record_path = _create_record(run_info, sidecar, ds)

    # abbreviate version of the command for illustrative purposes
    cmd_shorty = _format_cmd_shorty(cmd_expanded)

    # compose commit message
    msg = u"""\
[DATALAD RUNCMD] {}

=== Do not change lines below ===
{}
^^^ Do not change lines above ^^^
"""
    msg = msg.format(
        message if message is not None else cmd_shorty,
        '"{}"'.format(record) if record_path else record)

    # in explicit mode only the declared outputs (plus a sidecar record file,
    # if any) are saved; otherwise `None` is handed to save as the path
    outputs_to_save = globbed['outputs'].expand_strict() if explicit else None
    if outputs_to_save is not None and record_path:
        outputs_to_save.append(record_path)
    do_save = outputs_to_save is None or outputs_to_save
    msg_path = None
    if not rerun_info and cmd_exitcode:
        if do_save:
            repo = ds.repo
            # must record path to be relative to ds.path to meet
            # result record semantics (think symlink resolution, etc)
            msg_path = ds.pathobj / \
                repo.dot_git.relative_to(repo.pathobj) / "COMMIT_EDITMSG"
            msg_path.write_text(msg)

    # a non-zero exit is only acceptable if a previous run recorded the same
    expected_exit = rerun_info.get("exit", 0) if rerun_info else None
    if cmd_exitcode and expected_exit != cmd_exitcode:
        status = "error"
    else:
        status = "ok"

    run_result = get_status_dict(
        "run", ds=ds,
        status=status,
        # use the abbrev. command as the message to give immediate clarity what
        # completed/errors in the generic result rendering
        message=cmd_shorty,
        run_info=run_info,
        # use the same key that `get_status_dict()` would/will use
        # to record the exit code in case of an exception
        exit_code=cmd_exitcode,
        exception=exc,
        # Provide msg_path and explicit outputs so that, under
        # on_failure='stop', callers can react to a failure and then call
        # save().
        msg_path=str(msg_path) if msg_path else None,
    )
    if record_path:
        # when the record is in a sidecar file, report its ID
        run_result['record_id'] = record
    for s in ('inputs', 'outputs'):
        # this enables callers to further inspect the outputs without
        # performing globbing again. Together with remove_outputs=True
        # these would be guaranteed to be the outcome of the executed
        # command. in contrast to `outputs_to_save` this does not
        # include aux file, such as the run record sidecar file.
        # calling .expand_strict() again is largely reporting cached
        # information
        # (format: relative paths)
        if yield_expanded in (s, 'both'):
            run_result[f'expanded_{s}'] = globbed[s].expand_strict()
    yield run_result

    if do_save:
        with chpwd(pwd):
            for r in Save.__call__(
                    dataset=ds_path,
                    path=outputs_to_save,
                    recursive=True,
                    message=msg,
                    jobs=jobs,
                    return_type='generator',
                    # we want this command and its parameterization to be in
                    # full control about the rendering of results, hence we
                    # must turn off internal rendering
                    result_renderer='disabled',
                    on_failure='ignore'):
                yield r
def __call__(
        path=None,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        update_mode='target',
        incremental=False,
        force_extraction=False,
        save=True):
    # Aggregate metadata for the given paths into a dataset, then save.
    #
    # Generator yielding result records (dicts).
    #
    # path: single path or list of paths (normalized via assure_list());
    #   when empty, the path of the reference dataset is used
    # dataset: reference dataset specification, resolved via
    #   Interface.get_refds_path() and require_dataset()
    # recursive, recursion_limit: forwarded to the path annotation
    # update_mode: 'target' updates the reference dataset only, 'all'
    #   updates every dataset on the trace to the aggregated ones; any
    #   other value raises ValueError (after extraction has taken place)
    # incremental: forwarded to _update_ds_agginfo()
    # force_extraction: forwarded to _dump_extracted_metadata()
    # save: NOTE(review) never consulted in this function -- modifications
    #   are saved whenever there is something to save; confirm intent
    refds_path = Interface.get_refds_path(dataset)

    # it really doesn't work without a dataset
    ds = require_dataset(
        dataset, check_installed=True, purpose='metadata aggregation')
    path = assure_list(path)
    if not path:
        # then current/reference dataset is "aggregated"
        # We should not add ds.path always since then --recursive would
        # also recurse current even if paths are given
        path.append(ds.path)

    # only the base path is needed here, the DB location itself is unused
    _, agg_base_path = get_ds_aggregate_db_locations(
        ds,
        # do not warn here, next call triggers the same warning
        warn_absent=False)
    agginfo_db = load_ds_aggregate_db(ds, abspath=True)

    to_save = []
    to_aggregate = set()
    for ap in AnnotatePaths.__call__(
            dataset=refds_path,
            path=path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='aggregate_metadata',
            # uninstalled subdatasets could be queried via aggregated
            # metadata -> no 'error'
            unavailable_path_status='',
            nondataset_path_status='error',
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        ap_type = ap.get('type', None)
        ap_state = ap.get('state', None)
        assert('parentds' in ap or ap_type == 'dataset')
        if ap_type == 'dataset' and ap_state != 'absent':
            # a present dataset, we can take directly from it
            aggsrc = ap['path']
            lgr.info('Aggregate metadata for dataset %s', aggsrc)
        else:
            # everything else needs to come from the parent
            aggsrc = ap['parentds']
            if ap_state == 'absent':
                lgr.info(
                    'Attempt to use pre-aggregate metadata for absent %s from dataset at %s',
                    ap['path'],
                    aggsrc)
            else:
                lgr.info(
                    'Aggregate metadata for %s from dataset at %s',
                    ap['path'],
                    aggsrc)

        to_aggregate.add(aggsrc)

        if ap_state == 'absent':
            # key thought: recursive is done by path annotation, hence
            # once we hit an absent dataset, we are 100% certain that
            # there is nothing to recurse into on the file system
            # hence we only have to look into the aggregated metadata
            # of the last available dataset in the dataset tree edge
            #
            # if there is nothing at this path, we need to look into the
            # parentds and check if we know anything about this path
            # if we do, we need to grab all the info and objects
            # if not, we need to error
            res = _get_dsinfo_from_aggmetadata(
                aggsrc, ap['path'], recursive, agginfo_db)
            if not isinstance(res, list):
                # anything but a list is reported as the failure message
                yield get_status_dict(
                    status='impossible',
                    message=res,
                    action='aggregate_metadata',
                    path=ap['path'],
                    logger=lgr)
                continue
            # cue for aggregation
            to_aggregate.update(res)
        else:
            # actually aggregate metadata for this dataset, immediately place
            # generated objects into the aggregated or reference dataset,
            # and put info into DB to get the distributed to all datasets
            # that need to be updated
            errored = _dump_extracted_metadata(
                ds,
                Dataset(aggsrc),
                agginfo_db,
                to_save,
                force_extraction,
                agg_base_path)
            if errored:
                yield get_status_dict(
                    status='error',
                    message='Metadata extraction failed (see previous error message, set datalad.runtime.raiseonerror=yes to fail immediately)',
                    action='aggregate_metadata',
                    path=aggsrc,
                    logger=lgr)

    # at this point we have dumped all aggregated metadata into object files
    # somewhere, we know what needs saving, but haven't saved anything, and
    # we know about the states of all aggregated dataset in the DB
    # what remains to do is to update all dataset, so they have their own
    # copy of aggregated metadata and update their respective aggregate.json
    # with info on what states we just aggregated from

    # first, let's figure out what dataset need updating at all
    # get adjacency info of the dataset tree spanning the base to all leaf
    # dataset associated with the path arguments
    if update_mode == 'all':
        ds_adj = {}
        discover_dataset_trace_to_targets(
            ds.path, to_aggregate, [], ds_adj,
            # we know that to_aggregate only lists datasets, existing and
            # absent ones -- we want to aggregate all of them, either from
            # just extracted metadata, or from previously aggregated metadata
            # of the closest superdataset
            includeds=to_aggregate)
        # TODO we need to work in the info about dataset that we only got from
        # aggregated metadata, that had no trace on the file system in here!!
        subtrees = _adj2subtrees(ds.path, ds_adj, to_aggregate)
    elif update_mode == 'target':
        subtrees = {ds.path: list(agginfo_db.keys())}
    else:
        # interpolate the value into the message; handing it to ValueError
        # as a second argument would leave a literal '%s' in the text
        raise ValueError(
            "unknown `update_mode` '{}' for metadata aggregation".format(
                update_mode))

    # go over datasets in bottom-up fashion
    for parentds_path in sorted(subtrees, reverse=True):
        lgr.info('Update aggregate metadata in dataset at: %s', parentds_path)

        _update_ds_agginfo(
            ds.path,
            parentds_path,
            subtrees[parentds_path],
            incremental,
            agginfo_db,
            to_save)
        # update complete
        res = get_status_dict(
            status='ok',
            action='aggregate_metadata',
            path=parentds_path,
            type='dataset',
            logger=lgr)
        res.update(agginfo_db.get(parentds_path, {}))
        yield res
    #
    # save potential modifications to dataset global metadata
    #
    if not to_save:
        return
    lgr.info('Attempting to save %i files/datasets', len(to_save))
    for res in Save.__call__(
            # rev-save does not need any pre-annotated path hints
            path=[r['path'] for r in to_save],
            dataset=refds_path,
            message='[DATALAD] Dataset aggregate metadata update',
            return_type='generator',
            result_xfm=None,
            result_filter=None,
            on_failure='ignore'):
        yield res