def _create_record(run_info, sidecar_flag, ds): """ Returns ------- str or None, str or None The first value is either the full run record in JSON serialzied form, or content-based ID hash, if the record was written to a file. In that latter case, the second value is the path to the record sidecar file, or None otherwise. """ record = json.dumps(run_info, indent=1, sort_keys=True, ensure_ascii=False) if sidecar_flag is None: use_sidecar = ds.config.get('datalad.run.record-sidecar', default=False) use_sidecar = anything2bool(use_sidecar) else: use_sidecar = sidecar_flag record_id = None record_path = None if use_sidecar: # record ID is hash of record itself from hashlib import md5 record_id = md5(record.encode('utf-8')).hexdigest() # nosec record_dir = ds.config.get('datalad.run.record-directory', default=op.join('.datalad', 'runinfo')) record_path = ds.pathobj / record_dir / record_id if not op.lexists(record_path): # go for compression, even for minimal records not much difference, # despite offset cost # wrap in list -- there is just one record dump2stream([run_info], record_path, compressed=True) return record_id or record, record_path
def _get_format(self, log_name=False, log_pid=False):
    """Build the logging format string.

    A timestamp prefix is included when the 'datalad.log.timestamp'
    configuration evaluates to true; logger name and process ID segments
    are included when the respective flags are set.
    """
    from datalad import cfg
    from datalad.config import anything2bool
    segments = []
    if anything2bool(cfg.get('datalad.log.timestamp', False)):
        segments.append("$BOLD%(asctime)-15s$RESET ")
    if log_name:
        segments.append("%(name)-15s ")
    if log_pid:
        segments.append("{%(process)d}")
    segments.append("[%(levelname)s] %(message)s ")
    return "".join(segments)
def run_command(cmd, dataset=None, inputs=None, outputs=None, expand=None,
                explicit=False, message=None, sidecar=None,
                extra_info=None,
                rerun_info=None, extra_inputs=None, rerun_outputs=None,
                inject=False):
    """Run `cmd` in `dataset` and record the results.

    `Run.__call__` is a simple wrapper over this function. Aside from backward
    compatibility kludges, the only difference is that `Run.__call__` doesn't
    expose all the parameters of this function. The unexposed parameters are
    listed below.

    Parameters
    ----------
    extra_info : dict, optional
        Additional information to dump with the json run record. Any value
        given here will take precedence over the standard run key. Warning: To
        avoid collisions with future keys added by `run`, callers should try
        to use fairly specific key names and are encouraged to nest fields
        under a top-level "namespace" key (e.g., the project or extension
        name).
    rerun_info : dict, optional
        Record from a previous run. This is used internally by `rerun`.
    extra_inputs : list, optional
        Inputs to use in addition to those specified by `inputs`. Unlike
        `inputs`, these will not be injected into the {inputs} format field.
    rerun_outputs : list, optional
        Outputs, in addition to those in `outputs`, determined automatically
        from a previous run. This is used internally by `rerun`.
    inject : bool, optional
        Record results as if a command was run, skipping input and output
        preparation and command execution. In this mode, the caller is
        responsible for ensuring that the state of the working tree is
        appropriate for recording the command's results.

    Yields
    ------
    Result records for the run.
    """
    if not cmd:
        lgr.warning("No command given")
        return

    # working directory for the command: a rerun record stores it relative
    # to the dataset, otherwise it is derived from the current environment
    rel_pwd = rerun_info.get('pwd') if rerun_info else None
    if rel_pwd and dataset:
        # recording is relative to the dataset
        pwd = normpath(opj(dataset.path, rel_pwd))
        rel_pwd = relpath(pwd, dataset.path)
    else:
        pwd, rel_pwd = get_command_pwds(dataset)

    ds = require_dataset(
        dataset, check_installed=True,
        purpose='tracking outcomes of a command')
    ds_path = ds.path

    lgr.debug('tracking command output underneath %s', ds)

    if not (rerun_info or inject):  # Rerun already takes care of this.
        # For explicit=True, we probably want to check whether any inputs have
        # modifications. However, we can't just do is_dirty(..., path=inputs)
        # because we need to consider subdatasets and untracked files.
        # MIH: is_dirty() is gone, but status() can do all of the above!
        if not explicit and ds.repo.dirty:
            yield get_status_dict(
                'run',
                ds=ds,
                status='impossible',
                message=(
                    'clean dataset required to detect changes from command; '
                    'use `datalad status` to inspect unsaved changes'))
            return

    cmd = normalize_command(cmd)

    inputs = GlobbedPaths(inputs, pwd=pwd,
                          expand=expand in ["inputs", "both"])
    extra_inputs = GlobbedPaths(
        extra_inputs,
        pwd=pwd,
        # Follow same expansion rules as `inputs`.
        expand=expand in ["inputs", "both"])
    outputs = GlobbedPaths(outputs, pwd=pwd,
                           expand=expand in ["outputs", "both"])

    # ATTN: For correct path handling, all dataset commands call should be
    # unbound. They should (1) receive a string dataset argument, (2) receive
    # relative paths, and (3) happen within a chpwd(pwd) context.
    if not inject:
        # fetch/unlock content so the command can read inputs and (re)write
        # outputs
        with chpwd(pwd):
            for res in prepare_inputs(ds_path, inputs, extra_inputs):
                yield res
            if outputs:
                for res in _install_and_reglob(ds_path, outputs):
                    yield res
                for res in _unlock_or_remove(ds_path, outputs.expand()):
                    yield res
            if rerun_outputs is not None:
                for res in _unlock_or_remove(ds_path, rerun_outputs):
                    yield res
    else:
        # If an inject=True caller wants to override the exit code, they can do
        # so in extra_info.
        cmd_exitcode = 0
        exc = None

    try:
        cmd_expanded = format_command(
            ds, cmd,
            pwd=pwd,
            dspath=ds_path,
            # Check if the command contains "{tmpdir}" to avoid creating an
            # unnecessary temporary directory in most but not all cases.
            tmpdir=mkdtemp(prefix="datalad-run-") if "{tmpdir}" in cmd else "",
            inputs=inputs,
            outputs=outputs)
    except KeyError as exc:
        yield get_status_dict(
            'run',
            ds=ds,
            status='impossible',
            message=('command has an unrecognized placeholder: %s',
                     exc))
        return

    if not inject:
        cmd_exitcode, exc = _execute_command(
            cmd_expanded, pwd,
            expected_exit=rerun_info.get("exit", 0) if rerun_info else None)

    # amend commit message with `run` info:
    # - pwd if inside the dataset
    # - the command itself
    # - exit code of the command
    run_info = {
        'cmd': cmd,
        'exit': cmd_exitcode,
        'chain': rerun_info["chain"] if rerun_info else [],
        'inputs': inputs.paths,
        'extra_inputs': extra_inputs.paths,
        'outputs': outputs.paths,
    }
    if rel_pwd is not None:
        # only when inside the dataset to not leak information
        run_info['pwd'] = rel_pwd
    if ds.id:
        run_info["dsid"] = ds.id
    if extra_info:
        run_info.update(extra_info)

    record = json.dumps(run_info, indent=1, sort_keys=True, ensure_ascii=False)

    # decide whether the record goes verbatim into the commit message or is
    # stored in a sidecar file referenced by its content hash
    if sidecar is None:
        use_sidecar = ds.config.get('datalad.run.record-sidecar', default=False)
        # If ConfigManager gets the ability to say "return single value",
        # update this code to use that.
        if isinstance(use_sidecar, tuple):
            # Use same precedence as 'git config'.
            use_sidecar = use_sidecar[-1]
        use_sidecar = anything2bool(use_sidecar)
    else:
        use_sidecar = sidecar

    if use_sidecar:
        # record ID is hash of record itself
        from hashlib import md5
        record_id = md5(record.encode('utf-8')).hexdigest()
        record_dir = ds.config.get('datalad.run.record-directory',
                                   default=op.join('.datalad', 'runinfo'))
        record_path = op.join(ds_path, record_dir, record_id)
        if not op.lexists(record_path):
            # go for compression, even for minimal records not much difference, despite offset cost
            # wrap in list -- there is just one record
            dump2stream([run_info], record_path, compressed=True)

    # compose commit message
    msg = u"""\
[DATALAD RUNCMD] {}

=== Do not change lines below ===
{}
^^^ Do not change lines above ^^^
"""
    msg = msg.format(
        message if message is not None else _format_cmd_shorty(cmd_expanded),
        '"{}"'.format(record_id) if use_sidecar else record)

    # with explicit=True only the declared outputs are saved; otherwise a
    # full save is performed (outputs_to_save is None)
    outputs_to_save = outputs.expand() if explicit else None
    do_save = outputs_to_save is None or outputs_to_save
    if not rerun_info and cmd_exitcode:
        if do_save:
            # stash the prepared commit message so a user can still save the
            # results of a failed command manually
            repo = ds.repo
            msg_path = relpath(opj(str(repo.dot_git), "COMMIT_EDITMSG"))
            with open(msg_path, "wb") as ofh:
                ofh.write(assure_bytes(msg))
            lgr.info(
                "The command had a non-zero exit code. "
                "If this is expected, you can save the changes with "
                "'datalad save -d . -r -F %s'",
                msg_path)
        # surface the command failure to the caller
        raise exc
    elif do_save:
        with chpwd(pwd):
            for r in Save.__call__(
                    dataset=ds_path,
                    path=outputs_to_save,
                    recursive=True,
                    message=msg,
                    return_type='generator'):
                yield r
def run_command(cmd, dataset=None, inputs=None, outputs=None, expand=None,
                explicit=False, message=None, sidecar=None,
                extra_info=None,
                rerun_info=None, extra_inputs=None, rerun_outputs=None,
                inject=False, saver=_save_outputs):
    """Run `cmd` in `dataset` and record the results.

    `Run.__call__` is a simple wrapper over this function. Aside from backward
    compatibility kludges, the only difference is that `Run.__call__` doesn't
    expose all the parameters of this function. The unexposed parameters are
    listed below.

    Parameters
    ----------
    extra_info : dict, optional
        Additional information to dump with the json run record. Any value
        given here will take precedence over the standard run key. Warning: To
        avoid collisions with future keys added by `run`, callers should try
        to use fairly specific key names and are encouraged to nest fields
        under a top-level "namespace" key (e.g., the project or extension
        name).
    rerun_info : dict, optional
        Record from a previous run. This is used internally by `rerun`.
    extra_inputs : list, optional
        Inputs to use in addition to those specified by `inputs`. Unlike
        `inputs`, these will not be injected into the {inputs} format field.
    rerun_outputs : list, optional
        Outputs, in addition to those in `outputs`, determined automatically
        from a previous run. This is used internally by `rerun`.
    inject : bool, optional
        Record results as if a command was run, skipping input and output
        preparation and command execution. In this mode, the caller is
        responsible for ensuring that the state of the working tree is
        appropriate for recording the command's results.
    saver : callable, optional
        Must take a dataset instance, a list of paths to save, and a
        message string as arguments and must record any changes done to any
        content matching an entry in the path list. Must yield result
        dictionaries as a generator.

    Yields
    ------
    Result records for the run.
    """
    if not cmd:
        lgr.warning("No command given")
        return

    # working directory for the command: a rerun record stores it relative
    # to the dataset, otherwise it is derived from the current environment
    rel_pwd = rerun_info.get('pwd') if rerun_info else None
    if rel_pwd and dataset:
        # recording is relative to the dataset
        pwd = normpath(opj(dataset.path, rel_pwd))
        rel_pwd = relpath(pwd, dataset.path)
    else:
        pwd, rel_pwd = get_command_pwds(dataset)

    ds = require_dataset(
        dataset, check_installed=True,
        purpose='tracking outcomes of a command')

    # not needed ATM
    #refds_path = ds.path

    lgr.debug('tracking command output underneath %s', ds)

    if not (rerun_info or inject):  # Rerun already takes care of this.
        # For explicit=True, we probably want to check whether any inputs have
        # modifications. However, we can't just do is_dirty(..., path=inputs)
        # because we need to consider subdatasets and untracked files.
        if not explicit and ds.repo.dirty:
            yield get_status_dict(
                'run',
                ds=ds,
                status='impossible',
                message=('unsaved modifications present, '
                         'cannot detect changes by command'))
            return

    cmd = normalize_command(cmd)

    inputs = GlobbedPaths(inputs, pwd=pwd,
                          expand=expand in ["inputs", "both"])
    extra_inputs = GlobbedPaths(extra_inputs, pwd=pwd,
                                # Follow same expansion rules as `inputs`.
                                expand=expand in ["inputs", "both"])
    outputs = GlobbedPaths(outputs, pwd=pwd,
                           expand=expand in ["outputs", "both"])

    if not inject:
        # fetch/unlock content so the command can read inputs and (re)write
        # outputs
        for res in prepare_inputs(ds, inputs, extra_inputs):
            yield res

        if outputs:
            for res in _install_and_reglob(ds, outputs):
                yield res
            for res in _unlock_or_remove(ds, outputs.expand(full=True)):
                yield res

        if rerun_outputs is not None:
            # These are files we need to unlock/remove for a rerun that aren't
            # included in the explicit outputs. Unlike inputs/outputs, these are
            # full paths, so we can pass them directly to unlock.
            for res in _unlock_or_remove(ds, rerun_outputs):
                yield res
    else:
        # If an inject=True caller wants to override the exit code, they can do
        # so in extra_info.
        cmd_exitcode = 0
        exc = None

    try:
        cmd_expanded = format_command(ds, cmd,
                                      pwd=pwd,
                                      dspath=ds.path,
                                      inputs=inputs,
                                      outputs=outputs)
    except KeyError as exc:
        yield get_status_dict(
            'run',
            ds=ds,
            status='impossible',
            message=('command has an unrecognized placeholder: %s',
                     exc))
        return

    if not inject:
        cmd_exitcode, exc = _execute_command(
            cmd_expanded, pwd,
            expected_exit=rerun_info.get("exit", 0) if rerun_info else None)

    # amend commit message with `run` info:
    # - pwd if inside the dataset
    # - the command itself
    # - exit code of the command
    run_info = {
        'cmd': cmd,
        'exit': cmd_exitcode,
        'chain': rerun_info["chain"] if rerun_info else [],
        'inputs': inputs.paths,
        'extra_inputs': extra_inputs.paths,
        'outputs': outputs.paths,
    }
    if rel_pwd is not None:
        # only when inside the dataset to not leak information
        run_info['pwd'] = rel_pwd
    if ds.id:
        run_info["dsid"] = ds.id
    if extra_info:
        run_info.update(extra_info)

    record = json.dumps(run_info, indent=1, sort_keys=True, ensure_ascii=False)

    # decide whether the record goes verbatim into the commit message or is
    # stored in a sidecar file referenced by its content hash
    if sidecar is None:
        use_sidecar = ds.config.get('datalad.run.record-sidecar', default=False)
        # If ConfigManager gets the ability to say "return single value",
        # update this code to use that.
        if isinstance(use_sidecar, tuple):
            # Use same precedence as 'git config'.
            use_sidecar = use_sidecar[-1]
        use_sidecar = anything2bool(use_sidecar)
    else:
        use_sidecar = sidecar

    if use_sidecar:
        # record ID is hash of record itself
        from hashlib import md5
        record_id = md5(record.encode('utf-8')).hexdigest()
        record_dir = ds.config.get('datalad.run.record-directory',
                                   default=op.join('.datalad', 'runinfo'))
        record_path = op.join(ds.path, record_dir, record_id)
        if not op.lexists(record_path):
            # go for compression, even for minimal records not much difference, despite offset cost
            # wrap in list -- there is just one record
            dump2stream([run_info], record_path, compressed=True)

    # compose commit message
    msg = u"""\
[DATALAD RUNCMD] {}

=== Do not change lines below ===
{}
^^^ Do not change lines above ^^^
"""
    msg = msg.format(
        message if message is not None else _format_cmd_shorty(cmd_expanded),
        '"{}"'.format(record_id) if use_sidecar else record)

    # with explicit=True only declared outputs are saved, otherwise save '.'
    outputs_to_save = outputs.expand(full=True) if explicit else '.'
    if not rerun_info and cmd_exitcode:
        if outputs_to_save:
            # stash the prepared commit message so a user can still save the
            # results of a failed command manually
            msg_path = relpath(opj(ds.repo.path, ds.repo.get_git_dir(ds.repo),
                                   "COMMIT_EDITMSG"))
            with open(msg_path, "wb") as ofh:
                ofh.write(assure_bytes(msg))
            lgr.info("The command had a non-zero exit code. "
                     "If this is expected, you can save the changes with "
                     "'datalad add -d . -r -F %s .'",
                     msg_path)
        # surface the command failure to the caller
        raise exc
    elif outputs_to_save:
        for r in saver(ds, outputs_to_save, msg):
            yield r
# spec # subject # replacements? import sys import os import os.path as op from datalad.api import Dataset from datalad.utils import rmtree from datalad.config import anything2bool import hirni_heuristic as heuristic dataset = Dataset(sys.argv[1]) rel_spec_path = sys.argv[2] anonymize = anything2bool(sys.argv[3]) subject = sys.argv[4] location = sys.argv[5] from mock import patch from tempfile import mkdtemp # relative path to heuristic to be recorded by datalad-run heuristic_path = op.relpath(heuristic.__file__, dataset.path) # relative path to not-needed-heudiconv output: rel_trash_path = op.relpath( mkdtemp(prefix="hirni-tmp-", dir=op.join(dataset.path, ".git")), dataset.path)
def __call__(specfile, dataset=None, anonymize=False, only_type=None):
    """Convert acquisitions to BIDS by running the procedures listed in
    the given study specification file(s), yielding result records."""
    dataset = require_dataset(dataset, check_installed=True,
                              purpose="spec2bids")

    # normalize to a list of string paths resolved against the dataset
    specfile = assure_list(specfile)
    specfile = [resolve_path(p, dataset) for p in specfile]
    specfile = [str(p) for p in specfile]

    for spec_path in specfile:

        # Note/TODO: ran_procedure per spec file still isn't ideal. Could
        # be different spec files for same acquisition. It's actually about
        # the exact same call. How to best get around substitutions?
        # Also: per snippet isn't correct either.
        # substitutions is real issue. Example "copy {location} ."
        #
        # => datalad.interface.run.format_command / normalize_command ?

        # TODO: Also can we skip prepare_inputs within run? At least specify
        # more specifically. Note: Can be globbed!

        ran_procedure = dict()

        if not lexists(spec_path):
            # NOTE(review): no `continue` here -- processing falls through
            # even when the spec path does not exist; confirm intended
            yield get_status_dict(
                action='spec2bids',
                path=spec_path,
                status='impossible',
                message="{} not found".format(spec_path)
            )

        if op.isdir(spec_path):
            # a directory directly under the dataset root is treated as an
            # acquisition dir holding the configured spec file name
            if op.realpath(op.join(spec_path, op.pardir)) == \
                    op.realpath(dataset.path):
                spec_path = op.join(
                    spec_path,
                    dataset.config.get(
                        "datalad.hirni.studyspec.filename",
                        "studyspec.json")
                )
                # TODO: check existence of that file!
            else:
                # NOTE(review): no `continue` here either -- confirm the
                # impossible result is meant to be non-terminal
                yield get_status_dict(
                    action='spec2bids',
                    path=spec_path,
                    status='impossible',
                    message="{} is neither a specification file nor an "
                            "acquisition directory".format(spec_path)
                )

        # relative path to spec to be recorded:
        rel_spec_path = relpath(spec_path, dataset.path) \
            if isabs(spec_path) else spec_path

        # check each dict (snippet) in the specification for what to do
        # wrt conversion:
        for spec_snippet in load_stream(spec_path):

            if only_type and not spec_snippet['type'].startswith(only_type):
                # ignore snippets not matching `only_type`
                # Note/TODO: the .startswith part is meant for
                # matching "dicomseries:all" to given "dicomseries" but not
                # vice versa. This prob. needs refinement (and doc)
                continue

            if 'procedures' not in spec_snippet:
                # no conversion procedures defined at all:
                yield get_status_dict(
                    action='spec2bids',
                    path=spec_path,
                    snippet=spec_snippet,
                    status='notneeded',
                )
                continue

            procedure_list = spec_snippet['procedures']
            if not procedure_list:
                # no conversion procedures defined at all:
                yield get_status_dict(
                    action='spec2bids',
                    path=spec_path,
                    snippet=spec_snippet,
                    status='notneeded',
                )
                continue

            # accept a single dict as a one item list:
            if isinstance(procedure_list, dict):
                procedure_list = [procedure_list]

            # build a dict available for placeholders in format strings:
            # Note: This is flattening the structure since we don't need
            # value/approved for the substitutions. In addition 'subject'
            # and 'anon_subject' are not passed on, but a new key
            # 'bids_subject' instead the value of which depends on the
            # --anonymize switch.
            # Additionally 'location' is recomputed to be relative to
            # dataset.path, since this is where the procedures are running
            # from within.
            replacements = dict()
            for k, v in spec_snippet.items():
                if k == 'subject':
                    if not anonymize:
                        replacements['bids-subject'] = v['value']
                elif k == 'anon-subject':
                    if anonymize:
                        replacements['bids-subject'] = v['value']
                elif k == 'location':
                    replacements[k] = op.join(op.dirname(rel_spec_path), v)
                elif k == 'procedures':
                    # 'procedures' is a list of dicts (not suitable for
                    # substitutions) and it makes little sense to be
                    # referenced by converter format strings anyway:
                    continue
                else:
                    replacements[k] = v['value'] if isinstance(v, dict) else v

            # build dict to patch os.environ with for passing
            # replacements on to procedures:
            env_subs = dict()
            for k, v in replacements.items():
                env_subs['DATALAD_RUN_SUBSTITUTIONS_{}'
                         ''.format(k.upper().replace('-', '__'))] = str(v)
            env_subs['DATALAD_RUN_SUBSTITUTIONS_SPECPATH'] = rel_spec_path
            env_subs['DATALAD_RUN_SUBSTITUTIONS_ANONYMIZE'] = str(anonymize)

            # TODO: The above two blocks to build replacements dict and
            # env_subs should be joined eventually.

            for proc in procedure_list:
                if has_specval(proc, 'procedure-name'):
                    proc_name = get_specval(proc, 'procedure-name')
                else:
                    # invalid procedure spec
                    lgr.warning("conversion procedure missing key "
                                "'procedure-name' in %s: %s",
                                spec_path, proc)
                    # TODO: continue or yield impossible/error so it can be
                    # dealt with via on_failure?
                    continue

                if has_specval(proc, 'on-anonymize') \
                        and anything2bool(
                            get_specval(proc, 'on-anonymize')
                        ) and not anonymize:
                    # don't run that procedure, if we weren't called with
                    # --anonymize while procedure is specified to be run on
                    # that switch only
                    continue

                proc_call = get_specval(proc, 'procedure-call') \
                    if has_specval(proc, 'procedure-call') \
                    else None

                if ran_procedure.get(hash((proc_name, proc_call)), None):
                    # if we ran the exact same call already,
                    # don't call it again
                    # TODO: notneeded?
                    continue

                # if spec comes with call format string, it takes precedence
                # over what is generally configured for the procedure
                # TODO: Not sure yet whether this is how we should deal with it
                if proc_call:
                    env_subs['DATALAD_PROCEDURES_{}_CALL__FORMAT'
                             ''.format(proc_name.upper().replace('-', '__'))
                             ] = proc_call

                run_results = list()
                # Note, that we can't use dataset.config.overrides to
                # pass run-substitution config to procedures, since we
                # leave python context and thereby loose the dataset
                # instance. Use patched os.environ instead. Note also,
                # that this requires names of substitutions to not
                # contain underscores, since they would be translated to
                # '.' by ConfigManager when reading them from within the
                # procedure's datalad-run calls.
                from mock import patch

                # TODO: Reconsider that patching. Shouldn't it be an update?
                with patch.dict('os.environ', env_subs):
                    # apparently reload is necessary to consider config
                    # overrides via env:
                    dataset.config.reload()
                    for r in dataset.run_procedure(
                            spec=proc_name,
                            return_type='generator'
                    ):
                        # # if there was an issue yield original result,
                        # # otherwise swallow:
                        # if r['status'] not in ['ok', 'notneeded']:
                        yield r
                        run_results.append(r)

                # summarize the procedure outcome per snippet
                if not all(r['status'] in ['ok', 'notneeded']
                           for r in run_results):
                    yield {'action': proc_name,
                           'path': spec_path,
                           'snippet': spec_snippet,
                           'status': 'error',
                           'message': "acquisition conversion failed. "
                                      "See previous message(s)."}
                else:
                    yield {'action': proc_name,
                           'path': spec_path,
                           'snippet': spec_snippet,
                           'status': 'ok',
                           'message': "acquisition converted."}

                # mark as a procedure we ran on this acquisition:
                # TODO: rethink. Doesn't work that way. Disabled for now
                # ran_procedure[hash((proc_name, proc_call))] = True

                # elif proc_name != 'hirni-dicom-converter':
                #     # specific converter procedure call
                #
                #     from mock import patch
                #     with patch.dict('os.environ', env_subs):
                #         # apparently reload is necessary to consider config
                #         # overrides via env:
                #         dataset.config.reload()
                #
                #         for r in dataset.run_procedure(
                #                 spec=[proc_name, rel_spec_path, anonymize],
                #                 return_type='generator'
                #         ):
                #
                #             # if there was an issue with containers-run,
                #             # yield original result, otherwise swallow:
                #             if r['status'] not in ['ok', 'notneeded']:
                #                 yield r
                #
                #             run_results.append(r)
                #
                #     if not all(r['status'] in ['ok', 'notneeded']
                #                for r in run_results):
                #         yield {'action': proc_name,
                #                'path': spec_path,
                #                'snippet': spec_snippet,
                #                'status': 'error',
                #                'message': "Conversion failed. "
                #                           "See previous message(s)."}
                #
                #     else:
                #         yield {'action': proc_name,
                #                'path': spec_path,
                #                'snippet': spec_snippet,
                #                'status': 'ok',
                #                'message': "specification converted."}

                # elif ran_heudiconv and proc_name == 'hirni-dicom-converter':
                #     # in this case we acted upon this snippet already and
                #     # do not have to produce a result
                #     pass
                #
                # else:
                #     # this shouldn't happen!
                #     raise RuntimeError

        # per-spec-file summary result
        yield {'action': 'spec2bids',
               'path': spec_path,
               'status': 'ok'}