Example #1
def add_name_fixing(path):
    import datalad.support.json_py as json_py

    spec = [r for r in json_py.load_stream(path)]

    # Note: We append the procedures to dicomseries:all, since we do not
    # technically act upon a particular series. This is because the procedures
    # concern the outcome of the conversion, not the raw data. The file
    # selection has to be done within the procedure and can't be controlled by
    # the spec or hirni-spec2bids ATM.
    for snippet in spec:
        if snippet['type'] == 'dicomseries:all':
            # two independent fix-up procedures, each pre-approved and not
            # tied to anonymization
            snippet['procedures'].extend([
                {
                    'procedure-name': {
                        'value': 'change-dwi-run-to-acq_fix_all',
                        'approved': True
                    },
                    'on-anonymize': {
                        'value': False,
                        'approved': True
                    },
                },
                {
                    'procedure-name': {
                        'value': 'fieldmaps-to-phase-or-magnitude_fix_all',
                        'approved': True
                    },
                    'on-anonymize': {
                        'value': False,
                        'approved': True
                    },
                },
            ])

    json_py.dump2stream(spec, path)
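
For context, a hedged sketch of the snippet shape the loop above expects: only the 'type' and 'procedures' keys and the value/approved structure are taken from the code; the procedure name shown here is made up.

# Illustrative spec snippet (hypothetical values):
example_snippet = {
    'type': 'dicomseries:all',
    'procedures': [
        {
            'procedure-name': {'value': 'hirni-dicom-converter', 'approved': True},
            'on-anonymize': {'value': False, 'approved': True},
        },
    ],
}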
Example #2
    def put(self, path=None):
        if self.read_only:
            abort(403)
        args = self.rp.parse_args()
        path = path or args.path
        if path is None or args.content is None:
            # BadRequest
            abort(400)
        file_abspath = self._validate_file_path(
            path, fail_nonexistent=False)
        # TODO handle failure without crashing
        if op.exists(file_abspath):
            self.ds.repo.remove(file_abspath)
        # TODO git checkout of the removed files, when
        # below fails
        # TODO support file uploads
        dirname = op.dirname(file_abspath)
        if not op.exists(dirname):
            os.makedirs(dirname)
        if args.json == 'stream':
            json_py.dump2stream(
                json_py.loads(args.content), file_abspath)
        elif args.json == 'yes':
            json_py.dump(
                json_py.loads(args.content), file_abspath)
        else:
            with open(file_abspath, 'w') as f:
                f.write(args.content)
        self.ds.save(
            file_abspath,
            to_git=args.togit,
            #message="",
        )
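
The three write paths above differ only in how the request body is serialized. A minimal standalone sketch, assuming the json_py helpers behave as used in the handler (dump2stream writing one JSON object per line, dump writing a single JSON document); the /tmp paths are hypothetical.

import datalad.support.json_py as json_py

content = '[{"a": 1}, {"a": 2}]'
# json == 'stream': one JSON object per line
json_py.dump2stream(json_py.loads(content), '/tmp/demo_stream.json')
# json == 'yes': a single JSON document
json_py.dump(json_py.loads(content), '/tmp/demo.json')
# otherwise: the body is written verbatim
with open('/tmp/demo.txt', 'w') as f:
    f.write(content)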
Example #3
def _create_record(run_info, sidecar_flag, ds):
    """
    Returns
    -------
    str or None, str or None
      The first value is either the full run record in JSON-serialized form,
      or a content-based ID hash if the record was written to a sidecar file.
      In the latter case, the second value is the path to the record sidecar
      file; otherwise it is None.
    """
    record = json.dumps(run_info, indent=1, sort_keys=True, ensure_ascii=False)
    if sidecar_flag is None:
        use_sidecar = ds.config.get('datalad.run.record-sidecar',
                                    default=False)
        use_sidecar = anything2bool(use_sidecar)
    else:
        use_sidecar = sidecar_flag

    record_id = None
    record_path = None
    if use_sidecar:
        # record ID is hash of record itself
        from hashlib import md5
        record_id = md5(record.encode('utf-8')).hexdigest()  # nosec
        record_dir = ds.config.get('datalad.run.record-directory',
                                   default=op.join('.datalad', 'runinfo'))
        record_path = ds.pathobj / record_dir / record_id
        if not op.lexists(record_path):
            # go for compression: even for minimal records there is little
            # size difference, despite the fixed compression overhead
            # wrap in list -- there is just one record
            dump2stream([run_info], record_path, compressed=True)
    return record_id or record, record_path
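
A small self-contained sketch of the content-addressed ID computed above: the sidecar file name is simply the MD5 hex digest of the serialized record (the run_info values here are illustrative).

import json
from hashlib import md5

run_info = {'cmd': 'echo hello', 'exit': 0, 'chain': [], 'inputs': [], 'outputs': []}
record = json.dumps(run_info, indent=1, sort_keys=True, ensure_ascii=False)
record_id = md5(record.encode('utf-8')).hexdigest()  # nosec - not used for security
print(record_id)  # becomes the sidecar file name under the record directory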
Example #4
def test_dump2stream(path):
    stream = [dict(a=5), dict(b=4)]
    dump2stream([dict(a=5), dict(b=4)], path)
    eq_(list(load_stream(path)), stream)

    # the same for compression
    dump2xzstream([dict(a=5), dict(b=4)], path)
    eq_(list(load_xzstream(path)), stream)
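A hedged variant of the same round-trip using the compressed=True flag seen in the sidecar-writing code of the other examples, assuming it produces the xz stream that load_xzstream reads; the /tmp path is hypothetical.

from datalad.support.json_py import dump2stream, load_xzstream

records = [dict(a=5), dict(b=4)]
# assumption: compressed=True yields an xz-compressed stream readable by load_xzstream
dump2stream(records, '/tmp/records.json.xz', compressed=True)
print(list(load_xzstream('/tmp/records.json.xz')))  # expected: [{'a': 5}, {'b': 4}]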
Example #5
File: run.py Project: mprati/datalad
def run_command(cmd, dataset=None, inputs=None, outputs=None, expand=None,
                message=None, rerun_info=None, rerun_outputs=None, sidecar=None):
    rel_pwd = rerun_info.get('pwd') if rerun_info else None
    if rel_pwd and dataset:
        # recording is relative to the dataset
        pwd = normpath(opj(dataset.path, rel_pwd))
        rel_pwd = relpath(pwd, dataset.path)
    else:
        pwd, rel_pwd = get_command_pwds(dataset)

    ds = require_dataset(
        dataset, check_installed=True,
        purpose='tracking outcomes of a command')

    # not needed ATM
    #refds_path = ds.path

    # delayed imports
    from datalad.cmd import Runner

    lgr.debug('tracking command output underneath %s', ds)
    if not rerun_info and ds.repo.dirty:  # Rerun already takes care of this.
        yield get_status_dict(
            'run',
            ds=ds,
            status='impossible',
            message=('unsaved modifications present, '
                     'cannot detect changes by command'))
        return

    cmd = normalize_command(cmd)

    inputs = GlobbedPaths(inputs, pwd=pwd,
                          expand=expand in ["inputs", "both"])
    if inputs:
        for res in ds.get(inputs.expand(full=True), on_failure="ignore"):
            yield res

    outputs = GlobbedPaths(outputs, pwd=pwd,
                           expand=expand in ["outputs", "both"],
                           warn=not rerun_info)
    if outputs:
        for res in _unlock_or_remove(ds, outputs.expand(full=True)):
            yield res

    if rerun_outputs is not None:
        # These are files we need to unlock/remove for a rerun that aren't
        # included in the explicit outputs. Unlike inputs/outputs, these are
        # full paths, so we can pass them directly to unlock.
        for res in _unlock_or_remove(ds, rerun_outputs):
            yield res

    sfmt = SequenceFormatter()
    cmd_expanded = sfmt.format(cmd,
                               pwd=pwd,
                               dspath=ds.path,
                               inputs=inputs.expand(dot=False),
                               outputs=outputs.expand(dot=False))

    # we have a clean dataset, let's run things
    exc = None
    cmd_exitcode = None
    runner = Runner(cwd=pwd)
    try:
        lgr.info("== Command start (output follows) =====")
        runner.run(
            cmd_expanded,
            # immediate output
            log_online=True,
            # not yet sure what we should do with the command output
            # IMHO `run` itself should be very silent and let the command talk
            log_stdout=False,
            log_stderr=False,
            expect_stderr=True,
            expect_fail=True,
            # TODO stdin
        )
    except CommandError as e:
        # strip our own info from the exception. The original command output
        # went to stdout/err -- we just have to relay the exit code in the same way
        exc = e
        cmd_exitcode = e.code

        if rerun_info and rerun_info.get("exit", 0) != cmd_exitcode:
            # we failed in a different way during a rerun.  This can easily
            # happen if we try to alter a locked file
            #
            # TODO add the ability to `git reset --hard` the dataset tree on failure
            # we know that we started clean, so we could easily go back, needs gh-1424
            # to be able to do it recursively
            raise exc

    lgr.info("== Command exit (modification check follows) =====")

    # amend commit message with `run` info:
    # - pwd if inside the dataset
    # - the command itself
    # - exit code of the command
    run_info = {
        'cmd': cmd,
        'exit': cmd_exitcode if cmd_exitcode is not None else 0,
        'chain': rerun_info["chain"] if rerun_info else [],
        'inputs': inputs.paths,
        'outputs': outputs.paths,
    }
    if rel_pwd is not None:
        # only when inside the dataset to not leak information
        run_info['pwd'] = rel_pwd
    if ds.id:
        run_info["dsid"] = ds.id

    record = json.dumps(run_info, indent=1, sort_keys=True, ensure_ascii=False)

    use_sidecar = sidecar or (
        sidecar is None and
        ds.config.get('datalad.run.record-sidecar', default=False))

    if use_sidecar:
        # record ID is hash of record itself
        from hashlib import md5
        record_id = md5(record.encode('utf-8')).hexdigest()
        record_dir = ds.config.get('datalad.run.record-directory', default=op.join('.datalad', 'runinfo'))
        record_path = op.join(ds.path, record_dir, record_id)
        if not op.lexists(record_path):
            # go for compression: even for minimal records there is little size difference, despite the fixed overhead
            # wrap in list -- there is just one record
            dump2stream([run_info], record_path, compressed=True)

    # compose commit message
    msg = u"""\
[DATALAD RUNCMD] {}

=== Do not change lines below ===
{}
^^^ Do not change lines above ^^^
"""
    msg = msg.format(
        message if message is not None else _format_cmd_shorty(cmd),
        '"{}"'.format(record_id) if use_sidecar else record)
    msg = assure_bytes(msg)

    if not rerun_info and cmd_exitcode:
        msg_path = opj(relpath(ds.repo.repo.git_dir), "COMMIT_EDITMSG")
        with open(msg_path, "wb") as ofh:
            ofh.write(msg)
        lgr.info("The command had a non-zero exit code. "
                 "If this is expected, you can save the changes with "
                 "'datalad save -r -F%s .'",
                 msg_path)
        raise exc
    else:
        for r in ds.add('.', recursive=True, message=msg):
            yield r
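
For illustration, when no sidecar is used the commit message produced above embeds the full JSON record between the "Do not change" markers; the record below is made up, the template is the one defined in the code.

msg_template = u"""\
[DATALAD RUNCMD] {}

=== Do not change lines below ===
{}
^^^ Do not change lines above ^^^
"""
fake_record = '{\n "cmd": "echo hello",\n "exit": 0\n}'
print(msg_template.format('echo hello', fake_record))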
Example #6
File: run.py Project: nicholsn/datalad
def run_command(cmd,
                dataset=None,
                inputs=None,
                outputs=None,
                expand=None,
                explicit=False,
                message=None,
                sidecar=None,
                extra_info=None,
                rerun_info=None,
                extra_inputs=None,
                rerun_outputs=None,
                inject=False):
    """Run `cmd` in `dataset` and record the results.

    `Run.__call__` is a simple wrapper over this function. Aside from backward
    compatibility kludges, the only difference is that `Run.__call__` doesn't
    expose all the parameters of this function. The unexposed parameters are
    listed below.

    Parameters
    ----------
    extra_info : dict, optional
        Additional information to dump with the json run record. Any value
        given here will take precedence over the standard run key. Warning: To
        avoid collisions with future keys added by `run`, callers should try to
        use fairly specific key names and are encouraged to nest fields under a
        top-level "namespace" key (e.g., the project or extension name).
    rerun_info : dict, optional
        Record from a previous run. This is used internally by `rerun`.
    extra_inputs : list, optional
        Inputs to use in addition to those specified by `inputs`. Unlike
        `inputs`, these will not be injected into the {inputs} format field.
    rerun_outputs : list, optional
        Outputs, in addition to those in `outputs`, determined automatically
        from a previous run. This is used internally by `rerun`.
    inject : bool, optional
        Record results as if a command was run, skipping input and output
        preparation and command execution. In this mode, the caller is
        responsible for ensuring that the state of the working tree is
        appropriate for recording the command's results.

    Yields
    ------
    Result records for the run.
    """
    if not cmd:
        lgr.warning("No command given")
        return

    rel_pwd = rerun_info.get('pwd') if rerun_info else None
    if rel_pwd and dataset:
        # recording is relative to the dataset
        pwd = normpath(opj(dataset.path, rel_pwd))
        rel_pwd = relpath(pwd, dataset.path)
    else:
        pwd, rel_pwd = get_command_pwds(dataset)

    ds = require_dataset(dataset,
                         check_installed=True,
                         purpose='tracking outcomes of a command')
    ds_path = ds.path

    lgr.debug('tracking command output underneath %s', ds)

    if not (rerun_info or inject):  # Rerun already takes care of this.
        # For explicit=True, we probably want to check whether any inputs have
        # modifications. However, we can't just do is_dirty(..., path=inputs)
        # because we need to consider subdatasets and untracked files.
        # MIH: is_dirty() is gone, but status() can do all of the above!
        if not explicit and ds.repo.dirty:
            yield get_status_dict(
                'run',
                ds=ds,
                status='impossible',
                message=(
                    'clean dataset required to detect changes from command; '
                    'use `datalad status` to inspect unsaved changes'))
            return

    cmd = normalize_command(cmd)

    inputs = GlobbedPaths(inputs, pwd=pwd, expand=expand in ["inputs", "both"])
    extra_inputs = GlobbedPaths(
        extra_inputs,
        pwd=pwd,
        # Follow same expansion rules as `inputs`.
        expand=expand in ["inputs", "both"])
    outputs = GlobbedPaths(outputs,
                           pwd=pwd,
                           expand=expand in ["outputs", "both"])

    # ATTN: For correct path handling, all dataset commands call should be
    # unbound. They should (1) receive a string dataset argument, (2) receive
    # relative paths, and (3) happen within a chpwd(pwd) context.
    if not inject:
        with chpwd(pwd):
            for res in prepare_inputs(ds_path, inputs, extra_inputs):
                yield res

            if outputs:
                for res in _install_and_reglob(ds_path, outputs):
                    yield res
                for res in _unlock_or_remove(ds_path, outputs.expand()):
                    yield res

            if rerun_outputs is not None:
                for res in _unlock_or_remove(ds_path, rerun_outputs):
                    yield res
    else:
        # If an inject=True caller wants to override the exit code, they can do
        # so in extra_info.
        cmd_exitcode = 0
        exc = None

    try:
        cmd_expanded = format_command(
            ds,
            cmd,
            pwd=pwd,
            dspath=ds_path,
            # Check if the command contains "{tmpdir}" to avoid creating an
            # unnecessary temporary directory in most but not all cases.
            tmpdir=mkdtemp(prefix="datalad-run-") if "{tmpdir}" in cmd else "",
            inputs=inputs,
            outputs=outputs)
    except KeyError as exc:
        yield get_status_dict(
            'run',
            ds=ds,
            status='impossible',
            message=('command has an unrecognized placeholder: %s', exc))
        return

    if not inject:
        cmd_exitcode, exc = _execute_command(
            cmd_expanded,
            pwd,
            expected_exit=rerun_info.get("exit", 0) if rerun_info else None)

    # amend commit message with `run` info:
    # - pwd if inside the dataset
    # - the command itself
    # - exit code of the command
    run_info = {
        'cmd': cmd,
        'exit': cmd_exitcode,
        'chain': rerun_info["chain"] if rerun_info else [],
        'inputs': inputs.paths,
        'extra_inputs': extra_inputs.paths,
        'outputs': outputs.paths,
    }
    if rel_pwd is not None:
        # only when inside the dataset to not leak information
        run_info['pwd'] = rel_pwd
    if ds.id:
        run_info["dsid"] = ds.id
    if extra_info:
        run_info.update(extra_info)

    record = json.dumps(run_info, indent=1, sort_keys=True, ensure_ascii=False)

    if sidecar is None:
        use_sidecar = ds.config.get('datalad.run.record-sidecar',
                                    default=False)
        # If ConfigManager gets the ability to say "return single value",
        # update this code to use that.
        if isinstance(use_sidecar, tuple):
            # Use same precedence as 'git config'.
            use_sidecar = use_sidecar[-1]
        use_sidecar = anything2bool(use_sidecar)
    else:
        use_sidecar = sidecar

    if use_sidecar:
        # record ID is hash of record itself
        from hashlib import md5
        record_id = md5(record.encode('utf-8')).hexdigest()
        record_dir = ds.config.get('datalad.run.record-directory',
                                   default=op.join('.datalad', 'runinfo'))
        record_path = op.join(ds_path, record_dir, record_id)
        if not op.lexists(record_path):
            # go for compression: even for minimal records there is little size difference, despite the fixed overhead
            # wrap in list -- there is just one record
            dump2stream([run_info], record_path, compressed=True)

    # compose commit message
    msg = u"""\
[DATALAD RUNCMD] {}

=== Do not change lines below ===
{}
^^^ Do not change lines above ^^^
"""
    msg = msg.format(
        message if message is not None else _format_cmd_shorty(cmd_expanded),
        '"{}"'.format(record_id) if use_sidecar else record)

    outputs_to_save = outputs.expand() if explicit else None
    do_save = outputs_to_save is None or outputs_to_save
    if not rerun_info and cmd_exitcode:
        if do_save:
            repo = ds.repo
            msg_path = relpath(opj(str(repo.dot_git), "COMMIT_EDITMSG"))
            with open(msg_path, "wb") as ofh:
                ofh.write(assure_bytes(msg))
            lgr.info(
                "The command had a non-zero exit code. "
                "If this is expected, you can save the changes with "
                "'datalad save -d . -r -F %s'", msg_path)
        raise exc
    elif do_save:
        with chpwd(pwd):
            for r in Save.__call__(dataset=ds_path,
                                   path=outputs_to_save,
                                   recursive=True,
                                   message=msg,
                                   return_type='generator'):
                yield r
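
A hedged sketch of the sidecar decision above, factored out so the precedence handling is easy to see; it assumes anything2bool comes from datalad.utils and that a tuple config value means multiple entries with the last one winning, as with 'git config'.

from datalad.utils import anything2bool

def decide_sidecar(sidecar, cfg_value):
    """Return True if a sidecar run record should be written (sketch)."""
    if sidecar is not None:
        return sidecar
    if isinstance(cfg_value, tuple):
        # multiple config entries: last one wins, as with 'git config'
        cfg_value = cfg_value[-1]
    return anything2bool(cfg_value)

assert decide_sidecar(None, ('0', 'yes'))  # config says yes
assert not decide_sidecar(False, 'yes')    # explicit argument wins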
Example #7
File: run.py Project: hanke/datalad
def run_command(cmd, dataset=None, inputs=None, outputs=None, expand=None,
                explicit=False, message=None, sidecar=None,
                extra_info=None,
                rerun_info=None,
                extra_inputs=None,
                rerun_outputs=None,
                inject=False,
                saver=_save_outputs):
    """Run `cmd` in `dataset` and record the results.

    `Run.__call__` is a simple wrapper over this function. Aside from backward
    compatibility kludges, the only difference is that `Run.__call__` doesn't
    expose all the parameters of this function. The unexposed parameters are
    listed below.

    Parameters
    ----------
    extra_info : dict, optional
        Additional information to dump with the json run record. Any value
        given here will take precedence over the standard run key. Warning: To
        avoid collisions with future keys added by `run`, callers should try to
        use fairly specific key names and are encouraged to nest fields under a
        top-level "namespace" key (e.g., the project or extension name).
    rerun_info : dict, optional
        Record from a previous run. This is used internally by `rerun`.
    extra_inputs : list, optional
        Inputs to use in addition to those specified by `inputs`. Unlike
        `inputs`, these will not be injected into the {inputs} format field.
    rerun_outputs : list, optional
        Outputs, in addition to those in `outputs`, determined automatically
        from a previous run. This is used internally by `rerun`.
    inject : bool, optional
        Record results as if a command was run, skipping input and output
        preparation and command execution. In this mode, the caller is
        responsible for ensuring that the state of the working tree is
        appropriate for recording the command's results.
    saver : callable, optional
        Must take a dataset instance, a list of paths to save, and a
        message string as arguments and must record any changes done
        to any content matching an entry in the path list. Must yield
        result dictionaries as a generator.

    Yields
    ------
    Result records for the run.
    """
    if not cmd:
        lgr.warning("No command given")
        return

    rel_pwd = rerun_info.get('pwd') if rerun_info else None
    if rel_pwd and dataset:
        # recording is relative to the dataset
        pwd = normpath(opj(dataset.path, rel_pwd))
        rel_pwd = relpath(pwd, dataset.path)
    else:
        pwd, rel_pwd = get_command_pwds(dataset)

    ds = require_dataset(
        dataset, check_installed=True,
        purpose='tracking outcomes of a command')

    # not needed ATM
    #refds_path = ds.path

    lgr.debug('tracking command output underneath %s', ds)

    if not (rerun_info or inject):  # Rerun already takes care of this.
        # For explicit=True, we probably want to check whether any inputs have
        # modifications. However, we can't just do is_dirty(..., path=inputs)
        # because we need to consider subdatasets and untracked files.
        if not explicit and ds.repo.dirty:
            yield get_status_dict(
                'run',
                ds=ds,
                status='impossible',
                message=('unsaved modifications present, '
                         'cannot detect changes by command'))
            return

    cmd = normalize_command(cmd)

    inputs = GlobbedPaths(inputs, pwd=pwd,
                          expand=expand in ["inputs", "both"])
    extra_inputs = GlobbedPaths(extra_inputs, pwd=pwd,
                                # Follow same expansion rules as `inputs`.
                                expand=expand in ["inputs", "both"])
    outputs = GlobbedPaths(outputs, pwd=pwd,
                           expand=expand in ["outputs", "both"])

    if not inject:
        for res in prepare_inputs(ds, inputs, extra_inputs):
            yield res

        if outputs:
            for res in _install_and_reglob(ds, outputs):
                yield res
            for res in _unlock_or_remove(ds, outputs.expand(full=True)):
                yield res

        if rerun_outputs is not None:
            # These are files we need to unlock/remove for a rerun that aren't
            # included in the explicit outputs. Unlike inputs/outputs, these are
            # full paths, so we can pass them directly to unlock.
            for res in _unlock_or_remove(ds, rerun_outputs):
                yield res
    else:
        # If an inject=True caller wants to override the exit code, they can do
        # so in extra_info.
        cmd_exitcode = 0
        exc = None

    try:
        cmd_expanded = format_command(ds, cmd,
                                      pwd=pwd,
                                      dspath=ds.path,
                                      inputs=inputs,
                                      outputs=outputs)
    except KeyError as exc:
        yield get_status_dict(
            'run',
            ds=ds,
            status='impossible',
            message=('command has an unrecognized placeholder: %s',
                     exc))
        return

    if not inject:
        cmd_exitcode, exc = _execute_command(
            cmd_expanded, pwd,
            expected_exit=rerun_info.get("exit", 0) if rerun_info else None)


    # amend commit message with `run` info:
    # - pwd if inside the dataset
    # - the command itself
    # - exit code of the command
    run_info = {
        'cmd': cmd,
        'exit': cmd_exitcode,
        'chain': rerun_info["chain"] if rerun_info else [],
        'inputs': inputs.paths,
        'extra_inputs': extra_inputs.paths,
        'outputs': outputs.paths,
    }
    if rel_pwd is not None:
        # only when inside the dataset to not leak information
        run_info['pwd'] = rel_pwd
    if ds.id:
        run_info["dsid"] = ds.id
    if extra_info:
        run_info.update(extra_info)

    record = json.dumps(run_info, indent=1, sort_keys=True, ensure_ascii=False)

    if sidecar is None:
        use_sidecar = ds.config.get('datalad.run.record-sidecar', default=False)
        # If ConfigManager gets the ability to say "return single value",
        # update this code to use that.
        if isinstance(use_sidecar, tuple):
            # Use same precedence as 'git config'.
            use_sidecar = use_sidecar[-1]
        use_sidecar = anything2bool(use_sidecar)
    else:
        use_sidecar = sidecar


    if use_sidecar:
        # record ID is hash of record itself
        from hashlib import md5
        record_id = md5(record.encode('utf-8')).hexdigest()
        record_dir = ds.config.get('datalad.run.record-directory', default=op.join('.datalad', 'runinfo'))
        record_path = op.join(ds.path, record_dir, record_id)
        if not op.lexists(record_path):
            # go for compression: even for minimal records there is little size difference, despite the fixed overhead
            # wrap in list -- there is just one record
            dump2stream([run_info], record_path, compressed=True)

    # compose commit message
    msg = u"""\
[DATALAD RUNCMD] {}

=== Do not change lines below ===
{}
^^^ Do not change lines above ^^^
"""
    msg = msg.format(
        message if message is not None else _format_cmd_shorty(cmd_expanded),
        '"{}"'.format(record_id) if use_sidecar else record)

    outputs_to_save = outputs.expand(full=True) if explicit else '.'
    if not rerun_info and cmd_exitcode:
        if outputs_to_save:
            msg_path = relpath(opj(ds.repo.path, ds.repo.get_git_dir(ds.repo),
                                   "COMMIT_EDITMSG"))
            with open(msg_path, "wb") as ofh:
                ofh.write(assure_bytes(msg))
            lgr.info("The command had a non-zero exit code. "
                     "If this is expected, you can save the changes with "
                     "'datalad add -d . -r -F %s .'",
                     msg_path)
        raise exc
    elif outputs_to_save:
        for r in saver(ds, outputs_to_save, msg):
            yield r
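
This variant still accepts a `saver` callable. A minimal custom saver matching the contract in the docstring (dataset instance, list of paths, commit message; yields result dictionaries) might look like the following sketch; it assumes the dataset-bound `add` accepts `return_type='generator'` like other datalad commands.

def add_based_saver(ds, paths, message):
    """Toy saver: record changes via the dataset's own add()."""
    for res in ds.add(paths, message=message, return_type='generator'):
        yield res

# could then be passed as run_command(..., saver=add_based_saver)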
Example #8
    def __call__(path=None,
                 spec=None,
                 dataset=None,
                 subject=None,
                 anon_subject=None,
                 acquisition=None,
                 properties=None):

        # TODO: acquisition can probably be removed (or made an alternative to
        # derive spec and/or dicom location from)

        # Change, so path needs to point directly to dicom ds?
        # Or just use acq and remove path?

        dataset = require_dataset(dataset,
                                  check_installed=True,
                                  purpose="spec from dicoms")

        from datalad.utils import assure_list
        if path is not None:
            path = assure_list(path)
            path = [resolve_path(p, dataset) for p in path]
        else:
            raise InsufficientArgumentsError(
                "insufficient arguments for dicom2spec: a path is required")

        # TODO: We should be able to deal with several paths at once
        #       ATM we aren't (see also commit + message of actual spec)
        assert len(path) == 1

        if not spec:
            # TODO: That's probably wrong. We could derive a default spec from
            # the acquisition.
            raise InsufficientArgumentsError(
                "insufficient arguments for dicom2spec: a spec file is required"
            )
        else:
            spec = resolve_path(spec, dataset)

        spec_series_list = \
            [r for r in json_py.load_stream(spec)] if op.exists(spec) else list()

        # get dataset level metadata:
        found_some = False
        for meta in dataset.meta_dump(
                path,
                recursive=False,  # always False?
                reporton='datasets',
                return_type='generator',
                result_renderer='disabled'):
            if meta.get('status', None) not in ['ok', 'notneeded']:
                yield meta
                continue

            if 'dicom' not in meta['metadata']:

                # TODO: Really "notneeded" or simply not a result at all?
                yield dict(status='notneeded',
                           message=("found no DICOM metadata for %s",
                                    meta['path']),
                           path=meta['path'],
                           type='dataset',
                           action='dicom2spec',
                           logger=lgr)
                continue

            if 'Series' not in meta['metadata']['dicom'] or \
                    not meta['metadata']['dicom']['Series']:
                yield dict(
                    status='impossible',
                    message=("no image series detected in DICOM metadata of"
                             " %s", meta['path']),
                    path=meta['path'],
                    type='dataset',
                    action='dicom2spec',
                    logger=lgr)
                continue

            found_some = True

            overrides = dict()
            if properties:
                # load from file or json string
                props = json_py.load(properties) \
                        if op.exists(properties) else json_py.loads(properties)
                # turn into editable, pre-approved records
                props = {
                    k: dict(value=v, approved=True)
                    for k, v in props.items()
                }
                overrides.update(props)

            spec_series_list = add_to_spec(
                meta,
                spec_series_list,
                op.dirname(spec),
                subject=subject,
                anon_subject=anon_subject,
                # session=session,
                # TODO: parameter "session" was what
                # we now call acquisition. This is
                # NOT a good default for bids_session!
                # Particularly with respect to anonymization
                overrides=overrides,
                dataset=dataset)

        if not found_some:
            yield dict(
                status='impossible',
                message="found no DICOM metadata",
                path=path,
                type='file',  # TODO: arguably should be 'file' or 'dataset', depending on path
                action='dicom2spec',
                logger=lgr)
            return

        # TODO: RF needed. This rule should go elsewhere:
        # ignore duplicates (prob. reruns of aborted runs)
        # -> convert highest id only
        # Note: This sorting is a quick-and-dirty hack!
        # TODO: Sorting needs to become more sophisticated + include notion of :all
        spec_series_list = sorted(spec_series_list,
                                  key=lambda x: get_specval(x, 'id')
                                  if 'id' in x.keys() else 0)
        for i in range(len(spec_series_list)):
            # Note: Removed the following line from condition below,
            # since it appears to be pointless. Value for 'converter'
            # used to be 'heudiconv' or 'ignore' for a 'dicomseries', so
            # it's not clear ATM what case this could possibly have caught:
            # heuristic.has_specval(spec_series_list[i], "converter") and \
            if spec_series_list[i]["type"] == "dicomseries" and \
                has_specval(spec_series_list[i], "bids-run") and \
                get_specval(spec_series_list[i], "bids-run") in \
                    [get_specval(s, "bids-run")
                     for s in spec_series_list[i + 1:]
                     if get_specval(
                            s,
                            "description") == get_specval(
                                spec_series_list[i], "description") and \
                     get_specval(s, "id") > get_specval(
                                             spec_series_list[i], "id")
                     ]:
                lgr.debug("Ignore SeriesNumber %s for conversion" % i)
                spec_series_list[i]["tags"].append(
                    'hirni-dicom-converter-ignore')

        lgr.debug("Storing specification (%s)", spec)
        # store as a stream (one record per line) to be able to
        # easily concat files without having to parse them, or
        # process them line by line without having to fully parse them
        from datalad_hirni.support.spec_helpers import sort_spec
        # Note: Sorting paradigm needs to change. See above.
        # spec_series_list = sorted(spec_series_list, key=lambda x: sort_spec(x))
        json_py.dump2stream(spec_series_list, spec)

        # make sure spec is in git:
        dataset.repo.set_gitattributes([(spec, {
            'annex.largefiles': 'nothing'
        })], '.gitattributes')

        for r in Save.__call__(dataset=dataset,
                               path=[spec, '.gitattributes'],
                               to_git=True,
                               message="[HIRNI] Added study specification "
                               "snippet for %s" %
                               op.relpath(path[0], dataset.path),
                               return_type='generator',
                               result_renderer='disabled'):
            if r.get('status', None) not in ['ok', 'notneeded']:
                yield r
            elif r['path'] in [spec, op.join(dataset.path, '.gitattributes')] \
                    and r['type'] == 'file':
                r['action'] = 'dicom2spec'
                r['logger'] = lgr
                yield r
            elif r['type'] == 'dataset':
                # 'ok' or 'notneeded' for a dataset is okay, since we commit
                # the spec. But it's not a result to yield
                continue
            else:
                # anything else shouldn't happen
                yield dict(
                    status='error',
                    message=("unexpected result from save: %s", r),
                    path=spec,  # TODO: This actually isn't clear - get it from `r`
                    type='file',
                    action='dicom2spec',
                    logger=lgr)
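
A reading-side counterpart for the stream written above, assuming load_stream yields one dict per stored record as in the other examples; the helper name is made up.

import datalad.support.json_py as json_py

def iter_series(spec_path):
    """Yield only the dicomseries snippets from a stored spec stream."""
    for snippet in json_py.load_stream(spec_path):
        if snippet.get('type', '').startswith('dicomseries'):
            yield snippet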
Example #9
    def __call__(path,
                 dataset=None,
                 spec_file=None,
                 properties=None,
                 replace=False):
        # TODO: message

        dataset = require_dataset(dataset,
                                  check_installed=True,
                                  purpose="hirni spec4anything")
        path = assure_list(path)
        path = [resolve_path(p, dataset) for p in path]

        res_kwargs = dict(action='hirni spec4anything', logger=lgr)
        res_kwargs['refds'] = Interface.get_refds_path(dataset)

        # ### This might become superfluous. See datalad-gh-2653
        ds_path = PathRI(dataset.path)
        # ###

        updated_files = []
        paths = []
        for ap in AnnotatePaths.__call__(
                dataset=dataset,
                path=path,
                action='hirni spec4anything',
                unavailable_path_status='impossible',
                nondataset_path_status='error',
                return_type='generator',
                # TODO: Check this one out:
                on_failure='ignore',
                # Note/TODO: Not sure yet whether and when we need those.
                # Generally we want to be able to create a spec for subdatasets,
                # too:
                # recursive=recursive,
                # recursion_limit=recursion_limit,
                # force_subds_discovery=True,
                # force_parentds_discovery=True,
        ):

            if ap.get('status', None) in ['error', 'impossible']:
                yield ap
                continue

            # ### This might become superfluous. See datalad-gh-2653
            ap_path = PathRI(ap['path'])
            # ###

            # find acquisition and respective specification file:
            rel_path = posixpath.relpath(ap_path.posixpath, ds_path.posixpath)

            path_parts = rel_path.split('/')

            # TODO: Note: Commented out this warning for now. We used to not have
            # a spec file at the toplevel of the study dataset, but now we do.
            # The logic afterwards works, but should be revisited. At least,
            # `acq` should be called differently now.
            # if len(path_parts) < 2:
            #     lgr.warning("Not within an acquisition")
            acq = path_parts[0]

            # TODO: spec file specifiable or fixed path?
            #       if we want the former, what we actually need is an
            #       association of acquisition and its spec path
            #       => prob. not an option but a config

            spec_path = spec_file if spec_file \
                else posixpath.join(ds_path.posixpath, acq,
                                    dataset.config.get("datalad.hirni.studyspec.filename",
                                                       "studyspec.json"))

            spec = [r for r in json_py.load_stream(spec_path)] \
                if posixpath.exists(spec_path) else list()

            lgr.debug("Add specification snippet for %s", ap['path'])
            # XXX 'add' does not seem to be the thing we want to do
            # rather 'set', so we have to check whether a spec for a location
            # is already known and fail or replace it (maybe with --force)

            # go through all existing specs and extract unique value
            # and also assign them to the new record (subjects, ...), but only
            # editable fields!!
            uniques = dict()
            for s in spec:
                for k in s:
                    if isinstance(s[k], dict) and 'value' in s[k]:
                        if k not in uniques:
                            uniques[k] = set()
                        uniques[k].add(s[k]['value'])
            overrides = dict()
            for k in uniques:
                if len(uniques[k]) == 1:
                    overrides[k] = _get_edit_dict(value=uniques[k].pop(),
                                                  approved=False)

            if properties:

                # TODO: This entire reading of properties needs to be refactored
                # into proper generalized functions.
                # spec got more complex. update() prob. can't simply override
                # (think: 'procedures' and 'tags' prob. need to be appended
                # instead)

                # load from file or json string
                if isinstance(properties, dict):
                    props = properties
                elif op.exists(properties):
                    props = json_py.load(properties)
                else:
                    props = json_py.loads(properties)
                # turn into editable, pre-approved records
                spec_props = {
                    k: dict(value=v, approved=True)
                    for k, v in props.items()
                    if k not in non_editables + ['tags', 'procedures']
                }
                spec_props.update({
                    k: v
                    for k, v in props.items() if k in non_editables + ['tags']
                })

                # TODO: still wrong. It's a list. Append or override? How to decide?
                spec_props.update({
                    o_k: [{
                        i_k: dict(value=i_v, approved=True)
                        for i_k, i_v in o_v.items()
                    }]
                    for o_k, o_v in props.items() if o_k in ['procedures']
                })

                overrides.update(spec_props)

            # TODO: It's probably wrong to use uniques for overwriting! At least
            # they cannot be used to overwrite values explicitly set in
            # _add_to_spec like "location", "type", etc.
            #
            # But then: This should concern non-editable fields only, right?

            spec = _add_to_spec(spec,
                                posixpath.split(spec_path)[0],
                                ap,
                                dataset,
                                overrides=overrides,
                                replace=replace)

            # Note: Not sure whether we really want one commit per snippet.
            #       If not - consider:
            #       - What if we fail amidst? => Don't write to file yet.
            #       - What about input paths from different acquisitions?
            #         => store specs per acquisition in memory
            # MIH: One commit per line seems silly. why not update all files
            # collect paths of updated files, and give them to a single `add`
            # at the very end?
            # MIH: if we fail, we fail and nothing is committed
            from datalad_hirni.support.spec_helpers import sort_spec
            json_py.dump2stream(sorted(spec, key=lambda x: sort_spec(x)),
                                spec_path)
            updated_files.append(spec_path)

            yield get_status_dict(status='ok',
                                  type=ap['type'],
                                  path=ap['path'],
                                  **res_kwargs)
            paths.append(ap)

        from datalad.dochelpers import single_or_plural
        from os import linesep
        message = "[HIRNI] Add specification {n_snippets} for: {paths}".format(
            n_snippets=single_or_plural("snippet", "snippets", len(paths)),
            paths=(linesep.join(" - " + op.relpath(p['path'], dataset.path)
                                for p in paths)
                   if len(paths) > 1
                   else op.relpath(paths[0]['path'], dataset.path)))
        for r in dataset.save(updated_files,
                              to_git=True,
                              message=message,
                              return_type='generator',
                              result_renderer='disabled'):
            yield r
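
The spec records built above consistently wrap field values in value/approved dictionaries. A tiny helper sketch of that shape; the helper name and the example values are illustrative, not part of the example.

def edit_field(value, approved=False):
    """Build the editable {'value': ..., 'approved': ...} record used in spec snippets."""
    return {'value': value, 'approved': approved}

override_example = {
    'subject': edit_field('01', approved=True),
    'comment': edit_field('spec added retroactively'),
}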