Example #1
def test_path_is_subpath():
    ok_(path_is_subpath('/a/b', '/a'))
    ok_(path_is_subpath('/a/b/c', '/a'))
    nok_(path_is_subpath('/a/b', '/a/b'))
    nok_(path_is_subpath('/a/b', '/a/b/'))
    nok_(path_is_subpath('/a/b/', '/a/b'))
    ok_(path_is_subpath('/a/b', '/'))
    ok_(path_is_subpath('/aaa/b/c', '/aaa'))
    nok_(path_is_subpath('/aaa/b/c', '/aa'))
    nok_(path_is_subpath('/a/b', '/a/c'))
    nok_(path_is_subpath('/a/b/c', '/a/c'))
    # must not mix relative and abs
    assert_raises(ValueError, path_is_subpath, 'a/b', '/a')
    assert_raises(ValueError, path_is_subpath, '/a/b', 'a')
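The assertions above pin down the contract of path_is_subpath: trailing separators are ignored, a path is not a subpath of itself, matching happens on whole path components, and mixing relative with absolute paths raises ValueError. A minimal sketch consistent with these assertions (POSIX-style separators assumed; not necessarily DataLad's actual implementation):

import os.path as op

def path_is_subpath_sketch(path, prefix):
    # both arguments must be absolute, or both relative
    if op.isabs(path) != op.isabs(prefix):
        raise ValueError("cannot mix relative and absolute paths")
    # ignore trailing separators so '/a/b/' compares equal to '/a/b'
    path = path.rstrip('/') or '/'
    prefix = prefix.rstrip('/') or '/'
    if path == prefix:
        # a path is not a subpath of itself
        return False
    # compare on component boundaries so '/aa' does not match '/aaa/b/c'
    return path.startswith(prefix if prefix == '/' else prefix + '/')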
Example #2
def _get_containingds_from_agginfo(info, rpath):
    """Return the relative path of a dataset that contains a relative query path

    Parameters
    ----------
    info : dict
      Content of aggregate.json (dict with relative subdataset paths as keys)
    rpath : str
      Relative query path

    Returns
    -------
    str or None
      None is returned if there is no match, the relative path of the closest
      containing subdataset otherwise.
    """
    if rpath in info:
        dspath = rpath
    else:
        # not a direct hit, hence we find the closest
        # containing subdataset (if there is any)
        containing_ds = sorted(
            [subds for subds in sorted(info) if path_is_subpath(rpath, subds)],
            # TODO os.sep might not be OK on windows,
            # depending on where it was aggregated, ensure uniform UNIX
            # storage
            key=lambda x: x.count(os.sep),
            reverse=True)
        dspath = containing_ds[0] if len(containing_ds) else None
    return dspath
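A hedged usage sketch for this helper, using a toy info dict with hypothetical subdataset keys (relies on path_is_subpath behaving as in Example #1 and on a POSIX os.sep):

info = {'sub1': {}, 'sub1/subsub': {}}

# direct hit: the dataset path itself is returned
assert _get_containingds_from_agginfo(info, 'sub1') == 'sub1'
# a file path resolves to the deepest containing subdataset
assert _get_containingds_from_agginfo(info, 'sub1/subsub/file.dat') == 'sub1/subsub'
# nothing contains the query path
assert _get_containingds_from_agginfo(info, 'elsewhere/file.dat') is None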
Example #3
def _get_containingds_from_agginfo(info, rpath):
    """Return the path of a dataset that contains a query path

    If a query path matches a dataset path directly, the matching dataset path
    is returned -- not the parent dataset!

    Parameters
    ----------
    info : dict
      Content of aggregate.json (dict with (relative) subdataset paths as keys)
    rpath : str
      Query path can be absolute or relative, but must match the convention
      used in the info dict.

    Returns
    -------
    str or None
      None is returned if there is no match, the path of the closest
      containing subdataset otherwise (in the convention used in the
      info dict).
    """
    if rpath in info:
        dspath = rpath
    else:
        # not a direct hit, hence we find the closest
        # containing subdataset (if there is any)
        containing_ds = sorted(
            [subds for subds in sorted(info) if path_is_subpath(rpath, subds)],
            # TODO os.sep might not be OK on windows,
            # depending on where it was aggregated, ensure uniform UNIX
            # storage
            key=lambda x: x.count(os.sep),
            reverse=True)
        dspath = containing_ds[0] if len(containing_ds) else None
    return dspath
Example #4
def _get_containingds_from_agginfo(info, rpath):
    """Return the path of a dataset that contains a query path

    If a query path matches a dataset path directly, the matching dataset path
    is returned -- not the parent dataset!

    Parameters
    ----------
    info : dict
      Content of aggregate.json (dict with (relative) subdataset paths as keys)
    rpath : str
      Query path can be absolute or relative, but must match the convention
      used in the info dict.

    Returns
    -------
    str or None
      None is returned if there is no match, the path of the closest
      containing subdataset otherwise (in the convention used in the
      info dict).
    """
    if rpath in info:
        dspath = rpath
    else:
        # not a direct hit, hence we find the closest
        # containing subdataset (if there is any)
        containing_ds = sorted(
            [subds for subds in sorted(info)
             if path_is_subpath(rpath, subds)],
            # TODO os.sep might not be OK on windows,
            # depending on where it was aggregated, ensure uniform UNIX
            # storage
            key=lambda x: x.count(os.sep), reverse=True)
        dspath = containing_ds[0] if len(containing_ds) else None
    return dspath
Example #5
 def _filterpaths(basepath, paths, exclude):
     final_paths = []
     for rp in [op.join(basepath, p) if basepath else p for p in paths]:
         if rp in exclude:
             continue
         elif any(path_is_subpath(ep, rp) for ep in exclude):
             final_paths.extend(
                 _filterpaths(rp, listdir(op.join(ds.path, rp)), exclude))
         else:
             final_paths.append(rp)
     return final_paths
Example #6
 def _filterpaths(basepath, paths, exclude):
     final_paths = []
     for rp in [op.join(basepath, p) if basepath else p for p in paths]:
         if rp in exclude:
             continue
         elif any(path_is_subpath(ep, rp) for ep in exclude):
             final_paths.extend(
                 _filterpaths(rp, listdir(op.join(ds.path, rp)), exclude))
         else:
             final_paths.append(rp)
     return final_paths
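_filterpaths is clearly an inner helper: ds, op (os.path), and listdir come from its enclosing scope. A self-contained sketch of the same filtering logic, with those dependencies passed in explicitly and a crude stand-in for path_is_subpath; all names and the directory layout below are hypothetical, POSIX paths assumed:

import os
import os.path as op
import tempfile
from os import listdir

def _filterpaths_sketch(ds_path, basepath, paths, exclude):
    # keep a path unless it is excluded; descend into it only when an
    # excluded path lies somewhere underneath it
    final_paths = []
    for rp in [op.join(basepath, p) if basepath else p for p in paths]:
        if rp in exclude:
            continue
        elif any(ep.startswith(rp + os.sep) for ep in exclude):
            final_paths.extend(
                _filterpaths_sketch(ds_path, rp, listdir(op.join(ds_path, rp)), exclude))
        else:
            final_paths.append(rp)
    return final_paths

root = tempfile.mkdtemp()
for d in ('data/raw', 'data/derived', 'code'):
    os.makedirs(op.join(root, d))
# 'data/raw' is dropped, 'data/derived' and 'code' survive
print(sorted(_filterpaths_sketch(root, '', ['data', 'code'], ['data/raw'])))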
Example #7
def _recursive_install_subds_underneath(ds, recursion_limit, reckless, start=None,
                                        refds_path=None, description=None):
    if isinstance(recursion_limit, int) and recursion_limit <= 0:
        return
    # install using a helper that gives some flexibility regarding where to
    # get the module from
    for sub in ds.subdatasets(
            return_type='generator', result_renderer='disabled'):
        subds = Dataset(sub['path'])
        if sub.get('gitmodule_datalad-recursiveinstall', '') == 'skip':
            lgr.debug(
                "subdataset %s is configured to be skipped on recursive installation",
                sub['path'])
            continue
        if start is not None and not path_is_subpath(subds.path, start):
            # this one we can ignore, not underneath the start path
            continue
        if sub.get('state', None) != 'absent':
            # dataset was already found to exist
            yield get_status_dict(
                'install', ds=subds, status='notneeded', logger=lgr,
                refds=refds_path)
            # do not continue, even if an intermediate dataset exists it
            # does not imply that everything below it does too
        else:
            # try to get this dataset
            try:
                subds = _install_subds_from_flexible_source(
                    ds,
                    relpath(sub['path'], start=ds.path),
                    sub['gitmodule_url'],
                    reckless,
                    description=description)
                yield get_status_dict(
                    'install', ds=subds, status='ok', logger=lgr, refds=refds_path,
                    message=("Installed subdataset %s", subds), parentds=ds.path)
            except Exception as e:
                # skip everything underneath, if we didn't manage to install the subdataset
                yield get_status_dict(
                    'install', ds=subds, status='error', logger=lgr, refds=refds_path,
                    message=("Installation of subdatasets %s failed with exception: %s",
                             subds, exc_str(e)))
                continue
        # otherwise recurse
        # we can skip the start expression, we know we are within
        for res in _recursive_install_subds_underneath(
                subds,
                recursion_limit=recursion_limit - 1 if isinstance(recursion_limit, int) else recursion_limit,
                reckless=reckless,
                refds_path=refds_path):
            yield res
Example #8
File: get.py Project: hanke/datalad
def _recursive_install_subds_underneath(ds, recursion_limit, reckless, start=None,
                                        refds_path=None, description=None):
    if isinstance(recursion_limit, int) and recursion_limit <= 0:
        return
    # install using a helper that gives some flexibility regarding where to
    # get the module from
    for sub in ds.subdatasets(
            return_type='generator', result_renderer='disabled'):
        subds = Dataset(sub['path'])
        if sub.get('gitmodule_datalad-recursiveinstall', '') == 'skip':
            lgr.debug(
                "subdataset %s is configured to be skipped on recursive installation",
                sub['path'])
            continue
        if start is not None and not path_is_subpath(subds.path, start):
            # this one we can ignore, not underneath the start path
            continue
        if sub.get('state', None) != 'absent':
            # dataset was already found to exist
            yield get_status_dict(
                'install', ds=subds, status='notneeded', logger=lgr,
                refds=refds_path)
            # do not continue, even if an intermediate dataset exists it
            # does not imply that everything below it does too
        else:
            # try to get this dataset
            try:
                subds = _install_subds_from_flexible_source(
                    ds,
                    relpath(sub['path'], start=ds.path),
                    sub['gitmodule_url'],
                    reckless,
                    description=description)
                yield get_status_dict(
                    'install', ds=subds, status='ok', logger=lgr, refds=refds_path,
                    message=("Installed subdataset %s", subds), parentds=ds.path)
            except Exception as e:
                # skip everything underneath, if we didn't manage to install the subdataset
                yield get_status_dict(
                    'install', ds=subds, status='error', logger=lgr, refds=refds_path,
                    message=("Installation of subdatasets %s failed with exception: %s",
                             subds, exc_str(e)))
                continue
        # otherwise recurse
        # we can skip the start expression, we know we are within
        for res in _recursive_install_subds_underneath(
                subds,
                recursion_limit=recursion_limit - 1 if isinstance(recursion_limit, int) else recursion_limit,
                reckless=reckless,
                refds_path=refds_path):
            yield res
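A hedged sketch of how a caller might drive this generator; the Dataset instance ds, the 'subdir' start path, and the inspected result keys are assumptions, not taken from the listing:

# install every registered subdataset underneath <ds.path>/subdir,
# at most two levels deep, surfacing each status record as it arrives
for res in _recursive_install_subds_underneath(
        ds,                     # an installed Dataset instance
        recursion_limit=2,
        reckless=None,
        start=op.join(ds.path, 'subdir'),
        refds_path=ds.path):
    print(res.get('action'), res.get('status'), res.get('path'))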
Example #9
def _get_dsinfo_from_aggmetadata(ds_path, path, recursive, db):
    """Grab info on aggregated metadata for a path from a given dataset.

    The actual info is stored in a `db` dict under the absolute path
    of the dataset that contains the query path, plus any subdataset
    in case of recursion (with their own DB entries).

    Parameters
    ----------
    ds : Dataset
      source dataset
    path : str
      absolute path for which to obtain metadata
    recursive : bool

    Returns
    -------
    str or list
      A string/tuple is an error message; a list contains the absolute paths
      of all datasets for which info was put into the DB.
    """
    # TODO cache these
    agginfos = load_ds_aggregate_db(Dataset(ds_path), abspath=True)

    seed_ds = _get_containingds_from_agginfo(agginfos, path)
    if seed_ds is None:
        # nothing found
        # this will be the message in the result for the query path
        # and could be a tuple
        return (
            "No matching aggregated metadata for path '%s' in Dataset at %s",
            op.relpath(path, start=ds_path), ds_path)

    # easy peasy
    db[seed_ds] = agginfos[seed_ds]
    hits = [seed_ds]

    if not recursive:
        return hits

    # a little more complicated: we need to loop over all subdataset
    # records and pick the ones that are underneath the seed
    for agginfo_path in agginfos:
        if path_is_subpath(agginfo_path, seed_ds):
            db[agginfo_path] = agginfos[agginfo_path]
            hits.append(agginfo_path)
    # TODO we must keep the info on these recursively discovered datasets
    # somewhere, because we cannot rediscover them on the filesystem
    # when updating the datasets later on
    return hits
Example #10
def _get_dsinfo_from_aggmetadata(ds_path, path, recursive, db):
    """Grab info on aggregated metadata for a path from a given dataset.

    The actual info is stored in a `db` dict under the absolute path
    of the dataset that contains the query path, plus any subdataset
    in case of recursion (with their own DB entries).

    Parameters
    ----------
    ds : Dataset
      source dataset
    path : str
      absolute path for which to obtain metadata
    recursive : bool

    Returns
    -------
    str or list
      A string/tuple is an error message; a list contains the absolute paths
      of all datasets for which info was put into the DB.
    """
    # TODO cache these
    agginfos = load_ds_aggregate_db(Dataset(ds_path), abspath=True)

    seed_ds = _get_containingds_from_agginfo(agginfos, path)
    if seed_ds is None:
        # nothing found
        # this will be the message in the result for the query path
        # and could be a tuple
        return ("No matching aggregated metadata for path '%s' in Dataset at %s",
                op.relpath(path, start=ds_path), ds_path)

    # easy peasy
    db[seed_ds] = agginfos[seed_ds]
    hits = [seed_ds]

    if not recursive:
        return hits

    # a little more complicated: we need to loop over all subdataset
    # records and pick the ones that are underneath the seed
    for agginfo_path in agginfos:
        if path_is_subpath(agginfo_path, seed_ds):
            db[agginfo_path] = agginfos[agginfo_path]
            hits.append(agginfo_path)
    # TODO we must keep the info on these recursively discovered datasets
    # somewhere, because we cannot rediscover them on the filesystem
    # when updating the datasets later on
    return hits
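A hedged call pattern for this helper (paths are hypothetical): db is populated in-place, and the return value is either a message tuple (no match) or the list of dataset paths that received DB entries:

db = {}
res = _get_dsinfo_from_aggmetadata(
    '/tmp/super', '/tmp/super/sub1/file.dat', recursive=True, db=db)
if isinstance(res, tuple):
    # no aggregated metadata for the query path: a message template plus arguments
    lgr.warning(*res)
else:
    # every dataset path for which info landed in `db`
    print('aggregated metadata found for:', res)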
Example #11
def yield_recursive(ds, path, action, recursion_limit):
    # make sure we get everything relevant in all _checked out_
    # subdatasets; obtaining previously unavailable subdatasets
    # happens elsewhere
    for subd_res in ds.subdatasets(recursive=True,
                                   recursion_limit=recursion_limit,
                                   return_type='generator'):
        # this check is not the same as subdatasets --contains=path
        # because we want all subdatasets below a path, not just the
        # containing one
        if path_is_subpath(subd_res['path'], path):
            # this subdataset is underneath the search path
            # be careful to not overwrite anything, in case
            # this subdataset has been processed before
            subd_res['action'] = action
            # mark as "notprocessed"
            subd_res['status'] = ''
            # we know that this is a known subdataset, that is how
            # we got here, make a record
            subd_res['registered_subds'] = True
            yield subd_res
Example #12
def yield_recursive(ds, path, action, recursion_limit):
    # make sure we get everything relevant in all _checked out_
    # subdatasets; obtaining previously unavailable subdatasets
    # happens elsewhere
    for subd_res in ds.subdatasets(
            recursive=True,
            recursion_limit=recursion_limit,
            return_type='generator'):
        # this check is not the same as subdatasets --contains=path
        # because we want all subdatasets below a path, not just the
        # containing one
        if path_is_subpath(subd_res['path'], path):
            # this subdataset is underneath the search path
            # be careful to not overwrite anything, in case
            # this subdataset has been processed before
            subd_res['action'] = action
            # mark as "notprocessed"
            subd_res['status'] = ''
            # we know that this is a known subdataset, that is how
            # we got here, make a record
            subd_res['registered_subds'] = True
            yield subd_res
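A hedged usage sketch (the 'derivatives' subdirectory and the inspected keys are assumptions): collect a result record for every known subdataset below a path, primed for a subsequent action:

results = list(yield_recursive(ds, op.join(ds.path, 'derivatives'),
                               action='get', recursion_limit=None))
for r in results:
    # every record is marked unprocessed and flagged as a registered subdataset
    assert r['status'] == '' and r['registered_subds'] is True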
Example #13
def query_aggregated_metadata(reporton, ds, aps, recursive=False, **kwargs):
    """Query the aggregated metadata in a dataset

    Query paths (`aps`) have to be composed in an intelligent fashion
    by the caller of this function, i.e. it should have been decided
    outside which dataset to query for any given path.

    Also this function doesn't cache anything, hence the caller must
    make sure to only call this once per dataset to avoid waste.

    Parameters
    ----------
    reporton : {None, 'none', 'dataset', 'files', 'all'}
      If `None`, reporting will be based on the `type` property of the
      incoming annotated paths.
    ds : Dataset
      Dataset to query
    aps : list
      Sequence of annotated paths to query metadata for.
    recursive : bool
      Whether or not to report metadata underneath all query paths
      recursively.
    **kwargs
      Any other argument will be passed on to the query result dictionary.

    Returns
    -------
    generator
      Of result dictionaries.
    """
    from datalad.coreapi import get
    # look for and load the aggregation info for the base dataset
    info_fpath = opj(ds.path, agginfo_relpath)
    agg_base_path = dirname(info_fpath)
    agginfos = _load_json_object(info_fpath)
    if not agginfos and not exists(info_fpath):
        # This dataset does not have aggregated metadata.  Does it have any
        # other version?
        info_glob = agginfo_relpath_template.format('*')
        info_files = glob.glob(info_glob)
        msg = "Found no aggregated metadata info file %s." \
              % info_fpath
        old_metadata_file = opj(ds.path, METADATA_DIR, METADATA_FILENAME)
        if exists(old_metadata_file):
            msg += " Found metadata generated with pre-0.10 version of " \
                   "DataLad, but it will not be used."
        upgrade_msg = ""
        if info_files:
            msg += " Found following info files, which might have been " \
                   "generated with newer version(s) of datalad: %s." \
                   % (', '.join(info_files))
            upgrade_msg = ", upgrade datalad"
        msg += " You will likely need to either update the dataset from its " \
               "original location,%s or reaggregate metadata locally." \
               % upgrade_msg
        lgr.warning(msg)

    # cache once loaded metadata objects for additional lookups
    # TODO possibly supply this cache from outside, if objects could
    # be needed again -- their filename does not change in a superdataset
    # if done, cache under relpath, not abspath key
    cache = {
        'objcache': {},
        'subds_relpaths': None,
    }
    reported = set()

    # for all query paths
    for ap in aps:
        # all metadata is registered via its relative path to the
        # dataset that is being queried
        rpath = relpath(ap['path'], start=ds.path)
        if rpath in reported:
            # we already had this, probably via recursion of some kind
            continue
        rap = dict(ap, rpath=rpath, type=ap.get('type', None))

        # we really have to look this up from the aggregated metadata
        # and cannot use any 'parentds' property in the incoming annotated
        # path. the latter will reflect the situation on disk, we need
        # the record of the containing subdataset in the aggregated metadata
        # instead
        containing_ds = _get_containingds_from_agginfo(agginfos, rpath)
        if containing_ds is None:
            # could happen if there was no aggregated metadata at all
            # or the path is in this dataset, but luckily the queried dataset
            # is known to be present
            containing_ds = curdir
        rap['metaprovider'] = containing_ds

        # build list of datasets and paths to be queried for this annotated path
        # in the simple case this is just the containing dataset and the actual
        # query path
        to_query = [rap]
        if recursive:
            # in case of recursion this is also anything in any dataset underneath
            # the query path
            matching_subds = [
                {
                    'metaprovider': sub,
                    'rpath': sub,
                    'type': 'dataset'
                } for sub in sorted(agginfos)
                # we already have the base dataset
                if (rpath == curdir and sub != curdir)
                or path_is_subpath(sub, rpath)
            ]
            to_query.extend(matching_subds)

        to_query_available = []
        for qap in to_query:
            if qap['metaprovider'] not in agginfos:
                res = get_status_dict(
                    status='impossible',
                    path=qap['path'],
                    message=
                    ('Dataset at %s contains no aggregated metadata on this path',
                     qap['metaprovider']),
                )
                res.update(res, **kwargs)
                if 'type' in qap:
                    res['type'] = qap['type']
                yield res
            else:
                to_query_available.append(qap)

        # one heck of a beast to get the set of filenames for all metadata objects that are
        # required to be present to fulfill this query
        objfiles = set(
            agginfos.get(qap['metaprovider'], {}).get(t, None)
            for qap in to_query_available
            for t in ('dataset_info',) + \
            (('content_info',)
                if ((reporton is None and qap.get('type', None) == 'file') or
                    reporton in ('files', 'all')) else tuple())
        )
        # in case there was no metadata provider, we do not want to start
        # downloading everything: see https://github.com/datalad/datalad/issues/2458
        objfiles.difference_update([None])
        lgr.debug(
            'Verifying/achieving local availability of %i metadata objects',
            len(objfiles))
        if objfiles:
            get(path=[
                dict(path=opj(agg_base_path, of),
                     parentds=ds.path,
                     type='file') for of in objfiles if of
            ],
                dataset=ds,
                result_renderer='disabled')
        for qap in to_query_available:
            # info about the dataset that contains the query path
            dsinfo = agginfos.get(qap['metaprovider'], dict(id=ds.id))
            res_tmpl = get_status_dict()
            for s, d in (('id', 'dsid'), ('refcommit', 'refcommit')):
                if s in dsinfo:
                    res_tmpl[d] = dsinfo[s]

            # pull up dataset metadata, always needed if only for the context
            dsmeta = {}
            dsobjloc = dsinfo.get('dataset_info', None)
            if dsobjloc is not None:
                dsmeta = _load_json_object(opj(agg_base_path, dsobjloc),
                                           cache=cache['objcache'])

            for r in _query_aggregated_metadata_singlepath(
                    ds, agginfos, agg_base_path, qap, reporton, cache, dsmeta,
                    dsinfo.get('content_info', None)):
                r.update(res_tmpl, **kwargs)
                # if we are coming from `search` we want to record why this is being
                # reported
                if 'query_matched' in ap:
                    r['query_matched'] = ap['query_matched']
                if r.get('type', None) == 'file':
                    r['parentds'] = normpath(opj(ds.path, qap['metaprovider']))
                yield r
                reported.add(qap['rpath'])
Example #14
def query_aggregated_metadata(reporton, ds, aps, recursive=False,
                              **kwargs):
    """Query the aggregated metadata in a dataset

    Query paths (`aps`) have to be composed in an intelligent fashion
    by the caller of this function, i.e. it should have been decided
    outside which dataset to query for any given path.

    Also this function doesn't cache anything, hence the caller must
    make sure to only call this once per dataset to avoid waste.

    Parameters
    ----------
    reporton : {None, 'none', 'dataset', 'files', 'all'}
      If `None`, reporting will be based on the `type` property of the
      incoming annotated paths.
    ds : Dataset
      Dataset to query
    aps : list
      Sequence of annotated paths to query metadata for.
    recursive : bool
      Whether or not to report metadata underneath all query paths
      recursively.
    **kwargs
      Any other argument will be passed on to the query result dictionary.

    Returns
    -------
    generator
      Of result dictionaries.
    """
    from datalad.coreapi import get
    # look for and load the aggregation info for the base dataset
    agginfos, agg_base_path = load_ds_aggregate_db(ds)

    # cache once loaded metadata objects for additional lookups
    # TODO possibly supply this cache from outside, if objects could
    # be needed again -- their filename does not change in a superdataset
    # if done, cache under relpath, not abspath key
    cache = {
        'objcache': {},
        'subds_relpaths': None,
    }
    reported = set()

    # for all query paths
    for ap in aps:
        # all metadata is registered via its relative path to the
        # dataset that is being queried
        rpath = op.relpath(ap['path'], start=ds.path)
        if rpath in reported:
            # we already had this, probably via recursion of some kind
            continue
        rap = dict(ap, rpath=rpath, type=ap.get('type', None))

        # we really have to look this up from the aggregated metadata
        # and cannot use any 'parentds' property in the incoming annotated
        # path. the latter will reflect the situation on disk, we need
        # the record of the containing subdataset in the aggregated metadata
        # instead
        containing_ds = _get_containingds_from_agginfo(agginfos, rpath)
        if containing_ds is None:
            # could happen if there was no aggregated metadata at all
            # or the path is in this dataset, but luckily the queried dataset
            # is known to be present
            containing_ds = op.curdir
        rap['metaprovider'] = containing_ds

        # build list of datasets and paths to be queried for this annotated path
        # in the simple case this is just the containing dataset and the actual
        # query path
        to_query = [rap]
        if recursive:
            # in case of recursion this is also anything in any dataset underneath
            # the query path
            matching_subds = [{'metaprovider': sub, 'rpath': sub, 'type': 'dataset'}
                              for sub in sorted(agginfos)
                              # we already have the base dataset
                              if (rpath == op.curdir and sub != op.curdir) or
                              path_is_subpath(sub, rpath)]
            to_query.extend(matching_subds)

        to_query_available = []
        for qap in to_query:
            if qap['metaprovider'] not in agginfos:
                res = get_status_dict(
                    status='impossible',
                    path=qap['path'],
                    message=(
                        'Dataset at %s contains no aggregated metadata on this path',
                        qap['metaprovider']),
                )
                res.update(res, **kwargs)
                if 'type' in qap:
                    res['type'] = qap['type']
                yield res
            else:
                to_query_available.append(qap)

        # one heck of a beast to get the set of filenames for all metadata objects that are
        # required to be present to fulfill this query
        objfiles = set(
            agginfos.get(qap['metaprovider'], {}).get(t, None)
            for qap in to_query_available
            for t in ('dataset_info',) + \
            (('content_info',)
                if ((reporton is None and qap.get('type', None) == 'file') or
                    reporton in ('files', 'all')) else tuple())
        )
        # in case there was no metadata provider, we do not want to start
        # downloading everything: see https://github.com/datalad/datalad/issues/2458
        objfiles.difference_update([None])
        lgr.debug(
            'Verifying/achieving local availability of %i metadata objects',
            len(objfiles))
        if objfiles:
            get(path=[dict(path=op.join(agg_base_path, of),
                           parentds=ds.path, type='file')
                      for of in objfiles if of],
                dataset=ds,
                result_renderer='disabled')
        for qap in to_query_available:
            # info about the dataset that contains the query path
            dsinfo = agginfos.get(qap['metaprovider'], dict(id=ds.id))
            res_tmpl = get_status_dict()
            for s, d in (('id', 'dsid'), ('refcommit', 'refcommit')):
                if s in dsinfo:
                    res_tmpl[d] = dsinfo[s]

            # pull up dataset metadata, always needed if only for the context
            dsmeta = {}
            dsobjloc = dsinfo.get('dataset_info', None)
            if dsobjloc is not None:
                dsmeta = _load_json_object(
                    op.join(agg_base_path, dsobjloc),
                    cache=cache['objcache'])

            for r in _query_aggregated_metadata_singlepath(
                    ds, agginfos, agg_base_path, qap, reporton,
                    cache, dsmeta,
                    dsinfo.get('content_info', None)):
                r.update(res_tmpl, **kwargs)
                # if we are coming from `search` we want to record why this is being
                # reported
                if 'query_matched' in ap:
                    r['query_matched'] = ap['query_matched']
                if r.get('type', None) == 'file':
                    r['parentds'] = op.normpath(op.join(ds.path, qap['metaprovider']))
                yield r
                reported.add(qap['rpath'])
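A hedged sketch of consuming this query; the annotated paths and the printed keys are assumptions, and reporton='all' requests both dataset- and file-level records:

aps = [{'path': op.join(ds.path, 'sub1'), 'type': 'dataset'},
       {'path': op.join(ds.path, 'sub1', 'file.dat'), 'type': 'file'}]
for res in query_aggregated_metadata('all', ds, aps, recursive=False,
                                     action='metadata'):
    print(res.get('path'), res.get('type'), res.get('status'))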
Example #15
def results_from_annex_noinfo(ds,
                              requested_paths,
                              respath_by_status,
                              dir_fail_msg,
                              noinfo_dir_msg,
                              noinfo_file_msg,
                              noinfo_status='notneeded',
                              **kwargs):
    """Helper to yield results based on what information git annex did no give us.

    The helper assumes that the annex command returned without an error code,
    determines which of the requested paths we have heard nothing about, and
    assumes that git annex was happy with their current state.

    Parameters
    ----------
    ds : Dataset
      All results have to be concerning this single dataset (used to resolve
      relpaths).
    requested_paths : list
      List of path arguments sent to `git annex`
    respath_by_status : dict
      Mapping of 'success' or 'failure' labels to lists of result paths
      reported by `git annex`. Everything that is not in here, we assume
      that `git annex` was happy about.
    dir_fail_msg : str
      Message template to inject into the result for a requested directory where
      a failure was reported for some of its content. The template contains two
      string placeholders that will be expanded with 1) the path of the
      directory, and 2) the content failure paths for that directory
    noinfo_dir_msg : str
      Message template to inject into the result for a requested directory that
      `git annex` was silent about (incl. any content). There must be one string
      placeholder that is expanded with the path of that directory.
    noinfo_file_msg : str
      Message to inject into the result for a requested file that `git
      annex` was silent about.
    noinfo_status : str
      Status to report when annex provides no information
    **kwargs
      Any further kwargs are included in the yielded result dictionary.
    """
    for p in requested_paths:
        # any relpath is relative to the currently processed dataset
        # not the global reference dataset
        p = p if isabs(p) else normpath(opj(ds.path, p))
        if any(p in ps for ps in respath_by_status.values()):
            # we have a report for this path already
            continue
        common_report = dict(path=p, **kwargs)
        if isdir(p):
            # `annex` itself will not report on directories, but if a
            # directory was requested, we want to say something about
            # it in the results.  we are inside a single, existing
            # repo, hence all directories are already present, if not
            # we had an error
            # do we have any failures in a subdir of the requested dir?
            failure_results = [
                fp for fp in respath_by_status.get('failure', [])
                if path_is_subpath(fp, p)
            ]
            if failure_results:
                # we were not able to process all requested_paths, so label
                # this 'impossible' to get a warning-type report;
                # after all, we have the directory itself, but not
                # (some of) its requested_paths
                yield get_status_dict(status='impossible',
                                      type='directory',
                                      message=(dir_fail_msg, p,
                                               failure_results),
                                      **common_report)
            else:
                # otherwise cool, but how cool?
                success_results = [
                    fp for fp in respath_by_status.get('success', [])
                    if path_is_subpath(fp, p)
                ]
                yield get_status_dict(
                    status='ok' if success_results else noinfo_status,
                    message=None if success_results else (noinfo_dir_msg, p),
                    type='directory',
                    **common_report)
            continue
        else:
            # not a directory, and we have had no word from `git annex`,
            # yet no exception, hence the file was most probably
            # already in the desired state
            yield get_status_dict(status=noinfo_status,
                                  type='file',
                                  message=noinfo_file_msg,
                                  **common_report)
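A hedged sketch of feeding this helper after an annex call; the paths, message templates, and the extra action kwarg are assumptions:

respath_by_status = {
    'success': [op.join(ds.path, 'data', 'a.dat')],
    'failure': [op.join(ds.path, 'data', 'broken.dat')],
}
for res in results_from_annex_noinfo(
        ds,
        requested_paths=['data', 'README'],
        respath_by_status=respath_by_status,
        dir_fail_msg='could not get some content in %s %s',
        noinfo_dir_msg='nothing to do for directory %s',
        noinfo_file_msg='nothing to do',
        action='get'):
    print(res['type'], res['status'], res['path'])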
Example #16
    def __call__(
            path=None,
            dataset=None,
            get_aggregates=False,
            reporton='all',
            recursive=False):
        # prep results
        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(action='metadata', logger=lgr)
        if refds_path:
            res_kwargs['refds'] = refds_path

        if get_aggregates:
            # yield all datasets for which we have aggregated metadata as results;
            # these are actual dataset results, so we can turn them into dataset
            # instances using generic top-level code if desired
            ds = require_dataset(
                refds_path,
                check_installed=True,
                purpose='aggregate metadata query')
            agginfos = load_ds_aggregate_db(
                ds,
                version=str(aggregate_layout_version),
                abspath=True
            )
            if not agginfos:
                # if an aggregation run had ever been performed, this file
                # would exist; hence there has not been one, and we need to
                # tell people
                yield get_status_dict(
                    ds=ds,
                    status='impossible',
                    action='metadata',
                    logger=lgr,
                    message='metadata aggregation has never been performed in this dataset')
                return
            parentds = []
            for dspath in sorted(agginfos):
                info = agginfos[dspath]
                if parentds and not path_is_subpath(dspath, parentds[-1]):
                    parentds.pop()
                info.update(
                    path=dspath,
                    type='dataset',
                    status='ok',
                )
                if dspath == ds.path:
                    info['layout_version'] = aggregate_layout_version
                if parentds:
                    info['parentds'] = parentds[-1]
                yield dict(
                    info,
                    **res_kwargs
                )
                parentds.append(dspath)
            return

        if not dataset and not path:
            # makes no sense to have no dataset, go with "here"
            # error generation happens during annotation
            path = op.curdir

        content_by_ds = OrderedDict()
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                # MIH: we are querying the aggregated metadata anyways, and that
                # mechanism has its own, faster way to go down the hierarchy
                #recursive=recursive,
                #recursion_limit=recursion_limit,
                action='metadata',
                # uninstalled subdatasets could be queried via aggregated metadata
                # -> no 'error'
                unavailable_path_status='',
                nondataset_path_status='error',
                # we need to know when to look into aggregated data
                force_subds_discovery=True,
                force_parentds_discovery=True,
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('type', None) == 'dataset' and GitRepo.is_valid_repo(ap['path']):
                ap['process_content'] = True
            to_query = None
            if ap.get('state', None) == 'absent' or \
                    ap.get('type', 'dataset') != 'dataset':
                # this is a lonely absent dataset/file or content in a present dataset
                # -> query through parent
                # there must be a parent, otherwise this would be a non-dataset path
                # and would have errored during annotation
                to_query = ap['parentds']
            else:
                to_query = ap['path']
            if to_query:
                pcontent = content_by_ds.get(to_query, [])
                pcontent.append(ap)
                content_by_ds[to_query] = pcontent

        for ds_path in content_by_ds:
            ds = Dataset(ds_path)
            query_agg = [ap for ap in content_by_ds[ds_path]
                         # this is an available subdataset, will be processed in another
                         # iteration
                         if ap.get('state', None) == 'absent' or
                         not(ap.get('type', None) == 'dataset' and ap['path'] != ds_path)]
            if not query_agg:
                continue
            # report from aggregated metadata
            for r in query_aggregated_metadata(
                    reporton,
                    # by default query the reference dataset, only if there is none
                    # try our luck in the dataset that contains the queried path
                    # this is consistent with e.g. `get_aggregates` reporting the
                    # situation in the reference dataset only
                    Dataset(refds_path) if refds_path else ds,
                    query_agg,
                    # recursion above could only recurse into datasets
                    # on the filesystem, but there might be any number of
                    # uninstalled datasets underneath the last installed one
                    # for which we might have metadata
                    recursive=recursive,
                    **res_kwargs):
                yield r
        return
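This __call__ backs the metadata command of the DataLad 0.10-era API; a hedged invocation sketch, assuming datalad.api.metadata exposes the parameters shown above plus the usual result-handling keywords:

from datalad.api import metadata

# list every dataset for which aggregated metadata is available
for res in metadata(dataset='.', get_aggregates=True,
                    return_type='generator', on_failure='ignore'):
    print(res.get('path'), res.get('status'))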
Example #17
def discover_dataset_trace_to_targets(basepath,
                                      targetpaths,
                                      current_trace,
                                      spec,
                                      includeds=None):
    """Discover the edges and nodes in a dataset tree to given target paths

    Parameters
    ----------
    basepath : path
      Path to a start or top-level dataset. Really has to be a path to a
      dataset!
    targetpaths : list(path)
      Any non-zero number of paths that are termination points for the
      search algorithm. Can be paths to datasets, directories, or files
      (and any combination thereof).
    current_trace : list
      For a top-level call this should probably always be `[]`
    spec : dict
      `content_by_ds`-style dictionary that will receive information about the
      discovered datasets. Specifically, for each discovered dataset there
      will be an item with its path under the key (path) of the respective
      superdataset.
    includeds : sequence, optional
      Any paths given are treated as existing subdatasets, regardless of
      whether they can be found in the filesystem. Such subdatasets will appear
      under the key of the closest existing dataset in the `spec`.

    Returns
    -------
    None
      Function calls itself recursively and populates `spec` dict in-place.
      Keys are dataset paths, values are sets of subdataset paths
    """
    # convert to set for faster lookup
    includeds = includeds if isinstance(includeds, set) else \
        set() if includeds is None else set(includeds)
    # this beast walks the directory tree from a given `basepath` until
    # it discovers any of the given `targetpaths`
    # if it finds one, it commits any accumulated trace of visited
    # datasets on this edge to the spec
    valid_repo = GitRepo.is_valid_repo(basepath)
    if valid_repo:
        # we are passing into a new dataset, extend the dataset trace
        current_trace = current_trace + [basepath]
    # this edge is not done, we need to try to reach any downstream
    # dataset
    undiscovered_ds = set(t for t in targetpaths)  # if t != basepath)
    # whether anything in this directory matched a targetpath
    filematch = False
    if isdir(basepath):
        for p in listdir(basepath):
            p = ensure_unicode(opj(basepath, p))
            if not isdir(p):
                if p in targetpaths:
                    filematch = True
                # we cannot have anything below this one
                continue
            # OPT listdir might be large and we could have only a few items
            # in `targetpaths` -- so traverse only those in spec which have
            # leading dir basepath
            # filter targets matching this downward path
            downward_targets = set(t for t in targetpaths
                                   if path_startswith(t, p))
            if not downward_targets:
                continue
            # remove the matching ones from the "todo" list
            undiscovered_ds.difference_update(downward_targets)
            # go one deeper
            discover_dataset_trace_to_targets(
                p,
                downward_targets,
                current_trace,
                spec,
                includeds=includeds
                if not includeds else includeds.intersection(downward_targets))
    undiscovered_ds = [
        t for t in undiscovered_ds if includeds
        and path_is_subpath(t, current_trace[-1]) and t in includeds
    ]
    if filematch or basepath in targetpaths or undiscovered_ds:
        for i, p in enumerate(current_trace[:-1]):
            # TODO RF prepare proper annotated path dicts
            subds = spec.get(p, set())
            subds.add(current_trace[i + 1])
            spec[p] = subds
        if undiscovered_ds:
            spec[current_trace[-1]] = spec.get(current_trace[-1],
                                               set()).union(undiscovered_ds)
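A hedged trace-discovery sketch (all paths hypothetical): spec is populated in-place with superdataset-to-subdataset edges leading towards the targets:

spec = {}
discover_dataset_trace_to_targets(
    '/tmp/super',                                   # must itself be a dataset
    ['/tmp/super/sub1/file.dat', '/tmp/super/sub2'],
    [],                                             # start with an empty trace
    spec)
# spec now maps each discovered dataset path to the set of subdatasets that
# lie on the way to a target, e.g. {'/tmp/super': {'/tmp/super/sub1', ...}}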
Example #18
def query_aggregated_metadata(reporton, ds, aps, recursive=False, **kwargs):
    """Query the aggregated metadata in a dataset

    Query paths (`aps`) have to be composed in an intelligent fashion
    by the caller of this function, i.e. it should have been decided
    outside which dataset to query for any given path.

    Also this function doesn't cache anything, hence the caller must
    make sure to only call this once per dataset to avoid waste.

    Parameters
    ----------
    reporton : {None, 'none', 'dataset', 'files', 'all'}
      If `None`, reporting will be based on the `type` property of the
      incoming annotated paths.
    ds : Dataset
      Dataset to query
    aps : list
      Sequence of annotated paths to query metadata for.
    recursive : bool
      Whether or not to report metadata underneath all query paths
      recursively.
    **kwargs
      Any other argument will be passed on to the query result dictionary.

    Returns
    -------
    generator
      Of result dictionaries.
    """
    from datalad.coreapi import get
    # look for and load the aggregation info for the base dataset
    info_fpath = opj(ds.path, agginfo_relpath)
    agg_base_path = dirname(info_fpath)
    agginfos = _load_json_object(info_fpath)

    # cache once loaded metadata objects for additional lookups
    # TODO possibly supply this cache from outside, if objects could
    # be needed again -- their filename does not change in a superdataset
    # if done, cache under relpath, not abspath key
    cache = {
        'objcache': {},
        'subds_relpaths': None,
    }
    reported = set()

    # for all query paths
    for ap in aps:
        # all metadata is registered via its relative path to the
        # dataset that is being queried
        rpath = relpath(ap['path'], start=ds.path)
        if rpath in reported:
            # we already had this, probably via recursion of some kind
            continue
        rap = dict(ap, rpath=rpath, type=ap.get('type', None))

        # we really have to look this up from the aggregated metadata
        # and cannot use any 'parentds' property in the incoming annotated
        # path. the latter will reflect the situation on disk, we need
        # the record of the containing subdataset in the aggregated metadata
        # instead
        containing_ds = _get_containingds_from_agginfo(agginfos, rpath)
        if containing_ds is None:
            # could happen if there was no aggregated metadata at all
            # or the path is in this dataset, but luckily the queried dataset
            # is known to be present
            containing_ds = curdir
        rap['metaprovider'] = containing_ds

        # build list of datasets and paths to be queried for this annotated path
        # in the simple case this is just the containing dataset and the actual
        # query path
        to_query = [rap]
        if recursive:
            # in case of recursion this is also anything in any dataset underneath
            # the query path
            matching_subds = [
                {
                    'metaprovider': sub,
                    'rpath': sub,
                    'type': 'dataset'
                } for sub in sorted(agginfos)
                # we already have the base dataset
                if (rpath == curdir and sub != curdir)
                or path_is_subpath(sub, rpath)
            ]
            to_query.extend(matching_subds)

        # one heck of a beast to get the set of filenames for all metadata objects that are
        # required to be present to fulfill this query
        objfiles = set(
            agginfos.get(qap['metaprovider'], {}).get(t, None)
            for qap in to_query
            for t in ('dataset_info',) + \
            (('content_info',)
                if ((reporton is None and qap.get('type', None) == 'file') or
                    reporton in ('files', 'all')) else tuple())
        )
        lgr.debug(
            'Verifying/achieving local availability of %i metadata objects',
            len(objfiles))
        get(path=[
            dict(path=opj(agg_base_path, of), parentds=ds.path, type='file')
            for of in objfiles if of
        ],
            dataset=ds,
            result_renderer='disabled')
        for qap in to_query:
            # info about the dataset that contains the query path
            dsinfo = agginfos.get(qap['metaprovider'], dict(id=ds.id))
            res_tmpl = get_status_dict()
            for s, d in (('id', 'dsid'), ('refcommit', 'refcommit')):
                if s in dsinfo:
                    res_tmpl[d] = dsinfo[s]

            # pull up dataset metadata, always needed if only for the context
            dsmeta = {}
            dsobjloc = dsinfo.get('dataset_info', None)
            if dsobjloc is not None:
                dsmeta = _load_json_object(opj(agg_base_path, dsobjloc),
                                           cache=cache['objcache'])

            for r in _query_aggregated_metadata_singlepath(
                    ds, agginfos, agg_base_path, qap, reporton, cache, dsmeta,
                    dsinfo.get('content_info', None)):
                r.update(res_tmpl, **kwargs)
                # if we are coming from `search` we want to record why this is being
                # reported
                if 'query_matched' in ap:
                    r['query_matched'] = ap['query_matched']
                if r.get('type', None) == 'file':
                    r['parentds'] = normpath(opj(ds.path, qap['metaprovider']))
                yield r
                reported.add(qap['rpath'])
Example #19
def _get_dsinfo_from_aggmetadata(ds_path, path, recursive, db):
    """Grab info on aggregated metadata for a path from a given dataset.

    The actual info is stored in a `db` dict under the absolute path
    of the dataset that contains the query path, plus any subdataset
    in case of recursion (with their own DB entries).

    Parameters
    ----------
    ds : Dataset
      source dataset
    path : str
      absolute path for which to obtain metadata
    recursive : bool

    Returns
    -------
    str or list
      A string/tuple is an error message; a list contains the absolute paths
      of all datasets for which info was put into the DB.
    """
    info_fpath = opj(ds_path, agginfo_relpath)
    info_basepath = dirname(info_fpath)
    # TODO cache these
    agginfos = _load_json_object(info_fpath)

    def _ensure_abs_obj_location(rec):
        # object location in the DB must be absolute so we can copy easily
        # to all relevant datasets
        for key in location_keys:
            if key in rec and not isabs(rec[key]):
                rec[key] = opj(info_basepath, rec[key])
        return rec

    rpath = relpath(path, start=ds_path)
    seed_ds = _get_containingds_from_agginfo(agginfos, rpath)
    if seed_ds is None:
        # nothing found
        # this will be the message in the result for the query path
        # and could be a tuple
        return ("No matching aggregated metadata for path '%s' in Dataset at %s", rpath, ds_path)

    # easy peasy
    seed_abs = opj(ds_path, seed_ds)
    db[seed_abs] = _ensure_abs_obj_location(agginfos[seed_ds])
    hits = [seed_abs]

    if not recursive:
        return hits

    # a little more complicated: we need to loop over all subdataset
    # records and pick the ones that are underneath the seed
    for agginfo_path in agginfos:
        if path_is_subpath(agginfo_path, seed_ds):
            absp = opj(ds_path, agginfo_path)
            db[absp] = _ensure_abs_obj_location(agginfos[agginfo_path])
            hits.append(absp)
    # TODO we must keep the info on these recursively discovered datasets
    # somewhere, because we cannot rediscover them on the filesystem
    # when updating the datasets later on
    return hits
Example #20
def discover_dataset_trace_to_targets(basepath, targetpaths, current_trace,
                                      spec, includeds=None):
    """Discover the edges and nodes in a dataset tree to given target paths

    Parameters
    ----------
    basepath : path
      Path to a start or top-level dataset. Really has to be a path to a
      dataset!
    targetpaths : list(path)
      Any non-zero number of paths that are termination points for the
      search algorithm. Can be paths to datasets, directories, or files
      (and any combination thereof).
    current_trace : list
      For a top-level call this should probably always be `[]`
    spec : dict
      `content_by_ds`-style dictionary that will receive information about the
      discovered datasets. Specifically, for each discovered dataset there
      will be an item with its path under the key (path) of the respective
      superdataset.
    includeds : sequence, optional
      Any paths given are treated as existing subdatasets, regardless of
      whether they can be found in the filesystem. Such subdatasets will appear
      under the key of the closest existing dataset in the `spec`.

    Returns
    -------
    None
      Function calls itself recursively and populates `spec` dict in-place.
      Keys are dataset paths, values are sets of subdataset paths
    """
    # convert to set for faster lookup
    includeds = includeds if isinstance(includeds, set) else \
        set() if includeds is None else set(includeds)
    # this beast walks the directory tree from a given `basepath` until
    # it discovers any of the given `targetpaths`
    # if it finds one, it commits any accumulated trace of visited
    # datasets on this edge to the spec
    valid_repo = GitRepo.is_valid_repo(basepath)
    if valid_repo:
        # we are passing into a new dataset, extend the dataset trace
        current_trace = current_trace + [basepath]
    # this edge is not done, we need to try to reach any downstream
    # dataset
    undiscovered_ds = set(t for t in targetpaths) # if t != basepath)
    # whether anything in this directory matched a targetpath
    filematch = False
    if isdir(basepath):
        for p in listdir(basepath):
            p = assure_unicode(opj(basepath, p))
            if not isdir(p):
                if p in targetpaths:
                    filematch = True
                # we cannot have anything below this one
                continue
            # OPT listdir might be large and we could have only a few items
            # in `targetpaths` -- so traverse only those in spec which have
            # leading dir basepath
            # filter targets matching this downward path
            downward_targets = set(
                t for t in targetpaths if path_startswith(t, p))
            if not downward_targets:
                continue
            # remove the matching ones from the "todo" list
            undiscovered_ds.difference_update(downward_targets)
            # go one deeper
            discover_dataset_trace_to_targets(
                p, downward_targets, current_trace, spec,
                includeds=includeds if not includeds else includeds.intersection(
                    downward_targets))
    undiscovered_ds = [t for t in undiscovered_ds
                       if includeds and
                          path_is_subpath(t, current_trace[-1]) and
                          t in includeds]
    if filematch or basepath in targetpaths or undiscovered_ds:
        for i, p in enumerate(current_trace[:-1]):
            # TODO RF prepare proper annotated path dicts
            subds = spec.get(p, set())
            subds.add(current_trace[i + 1])
            spec[p] = subds
        if undiscovered_ds:
            spec[current_trace[-1]] = spec.get(current_trace[-1], set()).union(
                undiscovered_ds)
Example #21
def get_modified_subpaths(aps,
                          refds,
                          revision,
                          recursion_limit=None,
                          report_no_revision_change=True,
                          report_untracked='all'):
    """
    Parameters
    ----------
    aps : list
    refds : Dataset
    revision : str
      Commit-ish
    """
    # TODO needs recursion limit
    # NOTE this is implemented as a generator despite the fact that we need
    # to sort through _all_ the inputs initially, diff'ing each involved
    # dataset takes time that we can use to already act on intermediate
    # result paths, without having to wait for 100% completion
    if revision is None:
        # we want all, subds not matching the ref are assumed to have been
        # sorted out before (e.g. one level up)
        for r in aps:
            yield r

    # life is simple: we diff the base dataset
    modified = []
    # Diff.__call__ is used to get access to the now obsolete interface.diff
    # that exists merely for annotate_paths. (refds.diff corresponds to
    # core.local.diff.)
    from datalad.interface.diff import Diff
    for r in Diff.__call__(
            dataset=refds,
            # we cannot really limit the diff paths easily because we might get
            # or miss content (e.g. subdatasets) if we don't figure out which
            # ones are known -- and we don't want that
            path=None,
            # `revision` can be anything that Git supports for `diff`
            # `True` is code for diff without revision
            revision=revision if revision is not True else None,
            # it is important that staged is False, otherwise we would miss unstaged
            # changes when e.g. diffing against HEAD (save does that)
            staged=False,
            # we might want to consider putting 'untracked' here
            # maybe that is a little faster, not tested yet
            ignore_subdatasets='none',
            # by default, we want to see any individual untracked file, this simplifies further
            # processing dramatically, but may require subsequent filtering
            # in order to avoid flooding user output with useless info
            report_untracked=report_untracked,
            # no recursion, we need to update `revision` for every subdataset
            # before we can `diff`
            recursive=False,
            return_type='generator',
            result_renderer=None,
            # need to be able to yield the errors
            on_failure='ignore'):
        if r['status'] in ('impossible', 'error'):
            # something unexpected, tell daddy
            yield r
            continue
        # if asked, and no change in revision -- skip
        if not report_no_revision_change \
                and (r.get('revision_src') or r.get('revision')) \
                and (r.get('revision_src') == r.get('revision')):
            continue
        r['status'] = ''
        modified.append(r)

    if not len(modified):
        # nothing modified nothing to report
        return

    # now we can grab the APs that are in this dataset and yield them
    for ap in aps:
        # need to preserve pristine info first
        ap = ap if isinstance(ap, dict) else rawpath2ap(ap, refds.path)
        for m in modified:
            if ap['path'] == m['path']:
                # is directly modified, yield input AP
                # but update with what we learned about the modification
                ap.update(m)
                yield ap
                break
            if path_is_subpath(m['path'], ap['path']):
                # a modified path is underneath this AP
                # yield the modified one instead
                yield m
                continue

    mod_subs = [m for m in modified if m.get('type', None) == 'dataset']
    if not mod_subs or (recursion_limit is not None and recursion_limit < 1):
        return

    aps = [
        ap if isinstance(ap, dict) else rawpath2ap(ap, refds.path)
        for ap in aps
    ]
    # now for all submodules that were found modified
    for sub in [m for m in modified if m.get('type', None) == 'dataset']:
        sub_path_ = _with_sep(sub['path'])
        # these AP match something inside this submodule, or the whole submodule
        sub_aps = [
            ap for ap in aps if _with_sep(ap['path']).startswith(sub_path_)
        ]
        if not sub_aps:
            continue
        # we are interested in the modifications within this subdataset
        # from the state we previously had on record, till the state
        # we have in record now
        diff_range = '{}..{}'.format(
            sub['revision_src'] if sub['revision_src'] else PRE_INIT_COMMIT_SHA,
            sub['revision'] if sub['revision'] else '')
        if sub['revision_src'] and sub['revision_src'] == sub['revision']:
            # this is a special case, where subdataset reported changes without
            # a change in state/commit -- this is code for uncommitted changes
            # in the subdataset (including staged ones). In such a case, we
            # must not provide a diff range, but only the source commit we want
            # to diff against
            # XXX if this is changed, likely the same logic in diff needs
            # changing too!
            diff_range = sub['revision_src']

        for r in get_modified_subpaths(
                sub_aps,
                Dataset(sub['path']),
                diff_range,
                recursion_limit=(recursion_limit - 1) if recursion_limit is not None else None):
            yield r
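A hedged usage sketch for the generator above; the dataset location is
hypothetical, and `revision` can be anything `git diff` accepts:

from datalad.distribution.dataset import Dataset

ds = Dataset('/tmp/demo-ds')        # hypothetical, must be an installed dataset
for res in get_modified_subpaths(
        [ds.path],                  # query the whole dataset
        refds=ds,
        revision='HEAD~1'):         # report what changed since the parent commit
    print(res['path'], res.get('state', ''))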
Example #22
    def __call__(path=None,
                 dataset=None,
                 get_aggregates=False,
                 reporton='all',
                 recursive=False):
        # prep results
        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(action='metadata', logger=lgr)
        if refds_path:
            res_kwargs['refds'] = refds_path

        if get_aggregates:
            # yield all datasets for which we have aggregated metadata as results;
            # these are actual dataset results, so we can turn them into dataset
            # instances using generic top-level code if desired
            ds = require_dataset(refds_path,
                                 check_installed=True,
                                 purpose='aggregate metadata query')
            info_fpath = opj(ds.path, agginfo_relpath)
            if not exists(info_fpath):
                # if there had ever been an aggregation run, this file would
                # exist; since it does not, aggregation has never happened and
                # we need to say so
                yield get_status_dict(
                    ds=ds,
                    status='impossible',
                    action='metadata',
                    logger=lgr,
                    message=
                    'metadata aggregation has never been performed in this dataset'
                )
                return
            agginfos = _load_json_object(info_fpath)
            parentds = []
            for sd in sorted(agginfos):
                info = agginfos[sd]
                dspath = normpath(opj(ds.path, sd))
                if parentds and not path_is_subpath(dspath, parentds[-1]):
                    parentds.pop()
                info.update(
                    path=dspath,
                    type='dataset',
                    status='ok',
                )
                if sd == curdir:
                    info['layout_version'] = aggregate_layout_version
                if parentds:
                    info['parentds'] = parentds[-1]
                yield dict(info, **res_kwargs)
                parentds.append(dspath)
            return

        if not dataset and not path:
            # makes no sense to have no dataset, go with "here"
            # error generation happens during annotation
            path = curdir

        content_by_ds = OrderedDict()
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                # MIH: we are querying the aggregated metadata anyways, and that
                # mechanism has its own, faster way to go down the hierarchy
                #recursive=recursive,
                #recursion_limit=recursion_limit,
                action='metadata',
                # uninstalled subdatasets could be queried via aggregated metadata
                # -> no 'error'
                unavailable_path_status='',
                nondataset_path_status='error',
                # we need to know when to look into aggregated data
                force_subds_discovery=True,
                force_parentds_discovery=True,
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('type', None) == 'dataset' and GitRepo.is_valid_repo(
                    ap['path']):
                ap['process_content'] = True
            to_query = None
            if ap.get('state', None) == 'absent' or \
                    ap.get('type', 'dataset') != 'dataset':
                # this is a lonely absent dataset/file or content in a present dataset
                # -> query through parent
                # there must be a parent, otherwise this would be a non-dataset path
                # and would have errored during annotation
                to_query = ap['parentds']
            else:
                to_query = ap['path']
            if to_query:
                pcontent = content_by_ds.get(to_query, [])
                pcontent.append(ap)
                content_by_ds[to_query] = pcontent

        for ds_path in content_by_ds:
            ds = Dataset(ds_path)
            query_agg = [
                ap for ap in content_by_ds[ds_path]
                # this is an available subdataset, will be processed in another
                # iteration
                if ap.get('state', None) == 'absent' or not (ap.get(
                    'type', None) == 'dataset' and ap['path'] != ds_path)
            ]
            if not query_agg:
                continue
            # report from aggregated metadata
            for r in query_aggregated_metadata(
                    reporton,
                    # by default query the reference dataset, only if there is none
                    # try our luck in the dataset that contains the queried path
                    # this is consistent with e.g. `get_aggregates` reporting the
                    # situation in the reference dataset only
                    Dataset(refds_path) if refds_path else ds,
                    query_agg,
                    # recursion above could only recurse into datasets
                    # on the filesystem, but there might be any number of
                    # uninstalled datasets underneath the last installed one
                    # for which we might have metadata
                    recursive=recursive,
                    **res_kwargs):
                yield r
        return
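This `__call__` belongs to DataLad's `metadata` command interface; a minimal
sketch of driving it through the auto-generated Python API (assuming a DataLad
version that still ships the `metadata` command, and that aggregation has
already been run in the dataset at hand):

import datalad.api as dl

# list every dataset for which aggregated metadata is available under '.'
for res in dl.metadata(dataset='.', get_aggregates=True,
                       return_type='generator', on_failure='ignore'):
    print(res.get('path'), res.get('status'))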
Example #23
def results_from_annex_noinfo(ds, requested_paths, respath_by_status, dir_fail_msg,
                              noinfo_dir_msg, noinfo_file_msg, noinfo_status='notneeded',
                              **kwargs):
    """Helper to yield results based on what information git annex did no give us.

    The helper assumes that the annex command returned without an error code,
    and interprets which of the requested paths we have heard nothing about,
    and assumes that git annex was happy with their current state.

    Parameters
    ----------
    ds : Dataset
      All results have to be concerning this single dataset (used to resolve
      relpaths).
    requested_paths : list
      List of path arguments sent to `git annex`
    respath_by_status : dict
      Mapping of 'success' or 'failure' labels to lists of result paths
      reported by `git annex`. Everything that is not in here, we assume
      that `git annex` was happy about.
    dir_fail_msg : str
      Message template to inject into the result for a requested directory where
      a failure was reported for some of its content. The template contains two
      string placeholders that will be expanded with 1) the path of the
      directory, and 2) the content failure paths for that directory
    noinfo_dir_msg : str
      Message template to inject into the result for a requested directory that
      `git annex` was silent about (incl. any content). There must be one string
      placeholder that is expanded with the path of that directory.
    noinfo_file_msg : str
      Message to inject into the result for a requested file that `git
      annex` was silent about.
    noinfo_status : str
      Status to report when annex provides no information
    **kwargs
      Any further kwargs are included in the yielded result dictionary.
    """
    for p in requested_paths:
        # any relpath is relative to the currently processed dataset
        # not the global reference dataset
        p = p if isabs(p) else normpath(opj(ds.path, p))
        if any(p in ps for ps in respath_by_status.values()):
            # we have a report for this path already
            continue
        common_report = dict(path=p, **kwargs)
        if isdir(p):
            # `annex` itself will not report on directories, but if a
            # directory was requested, we want to say something about
            # it in the results.  we are inside a single, existing
            # repo, hence all directories are already present, if not
            # we had an error
            # do we have any failures in a subdir of the requested dir?
            failure_results = [
                fp for fp in respath_by_status.get('failure', [])
                if path_is_subpath(fp, p)]
            if failure_results:
                # we were not able to process all requested_paths, let's label
                # this 'impossible' to get a warning-type report
                # after all we have the directory itself, but not
                # (some) of its requested_paths
                yield get_status_dict(
                    status='impossible', type='directory',
                    message=(dir_fail_msg, p, failure_results),
                    **common_report)
            else:
                # otherwise cool, but how cool?
                success_results = [
                    fp for fp in respath_by_status.get('success', [])
                    if path_is_subpath(fp, p)]
                yield get_status_dict(
                    status='ok' if success_results else noinfo_status,
                    message=None if success_results else (noinfo_dir_msg, p),
                    type='directory', **common_report)
            continue
        else:
            # not a directory, and we have had no word from `git annex`,
            # yet no exception, hence the file was most probably
            # already in the desired state
            yield get_status_dict(
                status=noinfo_status, type='file',
                message=noinfo_file_msg,
                **common_report)
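A hedged sketch of feeding the helper above; the dataset, paths, and message
templates are made up for illustration, following the parameter descriptions
in the docstring:

from datalad.distribution.dataset import Dataset

ds = Dataset('/tmp/demo-ds')                      # hypothetical dataset
respath_by_status = {
    'success': ['/tmp/demo-ds/ok.dat'],
    'failure': ['/tmp/demo-ds/subdir/broken.dat'],
}
for res in results_from_annex_noinfo(
        ds,
        requested_paths=['ok.dat', 'subdir', 'silent.dat'],
        respath_by_status=respath_by_status,
        dir_fail_msg='could not process some content in %s: %s',
        noinfo_dir_msg='nothing to report for directory %s',
        noinfo_file_msg='already in the requested state',
        noinfo_status='notneeded',
        action='get'):                            # extra kwargs land in each result
    print(res['path'], res['status'])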
Example #24
def get_modified_subpaths(aps, refds, revision, recursion_limit=None,
                          report_no_revision_change=True,
                          report_untracked='all'):
    """
    Parameters
    ----------
    aps : list
    refds : Dataset
    revision : str
      Commit-ish
    """
    # TODO needs recursion limit
    # NOTE this is implemented as a generator despite the fact that we need
    # to sort through _all_ the inputs initially, diff'ing each involved
    # dataset takes time that we can use to already act on intermediate
    # result paths, without having to wait for 100% completion
    if revision is None:
        # we want all, subds not matching the ref are assumed to have been
        # sorted out before (e.g. one level up)
        for r in aps:
            yield r
        return

    # life is simple: we diff the base dataset
    modified = []
    # Diff.__call__ is used to get access to the now obsolete interface.diff
    # that exists merely for annotate_paths. (refds.diff corresponds to
    # core.local.diff.)
    from datalad.interface.diff import Diff
    for r in Diff.__call__(
            dataset=refds,
            # we cannot really limit the diff paths easily because we might get
            # or miss content (e.g. subdatasets) if we don't figure out which
            # ones are known -- and we don't want that
            path=None,
            # `revision` can be anything that Git supports for `diff`
            # `True` is code for diff without revision
            revision=revision if revision is not True else None,
            # it is important that staged is False, otherwise we would miss unstaged
            # changes when e.g. diffing against HEAD (save does that)
            staged=False,
            # we might want to consider putting 'untracked' here
            # maybe that is a little faster, not tested yet
            ignore_subdatasets='none',
            # by default, we want to see any individual untracked file, this simplifies further
            # processing dramatically, but may require subsequent filtering
            # in order to avoid flooding user output with useless info
            report_untracked=report_untracked,
            # no recursion, we need to update `revision` for every subdataset
            # before we can `diff`
            recursive=False,
            return_type='generator',
            result_renderer=None,
            # need to be able to yield the errors
            on_failure='ignore'):
        if r['status'] in ('impossible', 'error'):
            # something unexpected, tell daddy
            yield r
            continue
        # if asked, and no change in revision -- skip
        if not report_no_revision_change \
                and (r.get('revision_src') or r.get('revision')) \
                and (r.get('revision_src') == r.get('revision')):
            continue
        r['status'] = ''
        modified.append(r)

    if not len(modified):
        # nothing modified nothing to report
        return

    # now we can grab the APs that are in this dataset and yield them
    for ap in aps:
        # need to preserve pristine info first
        ap = ap if isinstance(ap, dict) else rawpath2ap(ap, refds.path)
        for m in modified:
            if ap['path'] == m['path']:
                # is directly modified, yield input AP
                # but update with what we learned about the modification
                ap.update(m)
                yield ap
                break
            if path_is_subpath(m['path'], ap['path']):
                # a modified path is underneath this AP
                # yield the modified one instead
                yield m
                continue

    mod_subs = [m for m in modified if m.get('type', None) == 'dataset']
    if not mod_subs or (recursion_limit is not None and recursion_limit < 1):
        return

    aps = [ap if isinstance(ap, dict) else rawpath2ap(ap, refds.path) for ap in aps]
    # now for all submodules that were found modified
    for sub in [m for m in modified if m.get('type', None) == 'dataset']:
        sub_path_ = _with_sep(sub['path'])
        # these AP match something inside this submodule, or the whole submodule
        sub_aps = [ap for ap in aps if _with_sep(ap['path']).startswith(sub_path_)]
        if not sub_aps:
            continue
        # we are interested in the modifications within this subdataset
        # from the state we previously had on record, till the state
        # we have in record now
        diff_range = '{}..{}'.format(
            sub['revision_src'] if sub['revision_src'] else PRE_INIT_COMMIT_SHA,
            sub['revision'] if sub['revision'] else '')
        if sub['revision_src'] and sub['revision_src'] == sub['revision']:
            # this is a special case, where subdataset reported changes without
            # a change in state/commit -- this is code for uncommitted changes
            # in the subdataset (including staged ones). In such a case, we
            # must not provide a diff range, but only the source commit we want
            # to diff against
            # XXX if this is changed, likely the same logic in diff needs
            # changing too!
            diff_range = sub['revision_src']

        for r in get_modified_subpaths(
                sub_aps,
                Dataset(sub['path']),
                diff_range,
                recursion_limit=(recursion_limit - 1) if recursion_limit is not None else None):
            yield r
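The `diff_range` handling in both copies of this function encodes three cases:
no previously recorded state (diff from DataLad's pre-init anchor), a recorded
state change (a `src..dst` range), and identical recorded states (code for
uncommitted changes, diff against the single source commit). A standalone
sketch of that logic; the import location of `PRE_INIT_COMMIT_SHA` is assumed
to be `datalad.consts`:

from datalad.consts import PRE_INIT_COMMIT_SHA   # assumed import location

def _sub_diff_range(revision_src, revision):
    """Mirror of the range construction used above (illustration only)."""
    if revision_src and revision_src == revision:
        # unchanged recorded state: code for uncommitted changes in the
        # subdataset, so diff against that single commit, not a range
        return revision_src
    return '{}..{}'.format(revision_src or PRE_INIT_COMMIT_SHA,
                           revision or '')

print(_sub_diff_range(None, 'abc123'))      # '<PRE_INIT_COMMIT_SHA>..abc123'
print(_sub_diff_range('abc123', 'def456'))  # 'abc123..def456'
print(_sub_diff_range('abc123', 'abc123'))  # 'abc123'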
Example #25
    def __call__(path=None,
                 *,
                 dataset=None,
                 get_aggregates=False,
                 reporton='all',
                 recursive=False):
        # prep results
        refds_path = dataset if dataset is None \
            else require_dataset(dataset).path
        res_kwargs = dict(action='metadata', logger=lgr)
        if refds_path:
            res_kwargs['refds'] = refds_path

        if get_aggregates:
            # yield all datasets for which we have aggregated metadata as results;
            # these are actual dataset results, so we can turn them into dataset
            # instances using generic top-level code if desired
            ds = require_dataset(refds_path,
                                 check_installed=True,
                                 purpose='aggregate metadata query')
            agginfos = load_ds_aggregate_db(
                ds, version=str(aggregate_layout_version), abspath=True)
            if not agginfos:
                # if there had ever been an aggregation run, the aggregate DB
                # would have entries; since it does not, aggregation has never
                # happened and we need to say so
                yield get_status_dict(
                    ds=ds,
                    status='impossible',
                    action='metadata',
                    logger=lgr,
                    message=
                    'metadata aggregation has never been performed in this dataset'
                )
                return
            parentds = []
            for dspath in sorted(agginfos):
                info = agginfos[dspath]
                if parentds and not path_is_subpath(dspath, parentds[-1]):
                    parentds.pop()
                info.update(
                    path=dspath,
                    type='dataset',
                    status='ok',
                )
                if dspath == ds.path:
                    info['layout_version'] = aggregate_layout_version
                if parentds:
                    info['parentds'] = parentds[-1]
                yield dict(info, **res_kwargs)
                parentds.append(dspath)
            return

        if not dataset and not path:
            # makes no sense to have no dataset, go with "here"
            # error generation happens during annotation
            path = op.curdir

        paths_by_ds, errors = get_paths_by_ds(require_dataset(dataset),
                                              dataset,
                                              paths=ensure_list(path),
                                              subdsroot_mode='super')
        content_by_ds = OrderedDict()
        for ap in _minimal_annotate_paths(paths_by_ds,
                                          errors,
                                          action='metadata',
                                          refds=refds_path):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('type', None) == 'dataset' and GitRepo.is_valid_repo(
                    ap['path']):
                ap['process_content'] = True
            to_query = None
            if ap.get('state', None) == 'absent' or \
                    ap.get('type', 'dataset') != 'dataset':
                # this is a lonely absent dataset/file or content in a present dataset
                # -> query through parent
                # there must be a parent, otherwise this would be a non-dataset path
                # and would have errored during annotation
                to_query = ap['parentds']
            else:
                to_query = ap['path']
            if to_query:
                pcontent = content_by_ds.get(to_query, [])
                pcontent.append(ap)
                content_by_ds[to_query] = pcontent

        for ds_path in content_by_ds:
            ds = Dataset(ds_path)
            query_agg = [
                ap for ap in content_by_ds[ds_path]
                # this is an available subdataset, will be processed in another
                # iteration
                if ap.get('state', None) == 'absent' or not (ap.get(
                    'type', None) == 'dataset' and ap['path'] != ds_path)
            ]
            if not query_agg:
                continue
            # report from aggregated metadata
            for r in query_aggregated_metadata(
                    reporton,
                    # by default query the reference dataset, only if there is none
                    # try our luck in the dataset that contains the queried path
                    # this is consistent with e.g. `get_aggregates` reporting the
                    # situation in the reference dataset only
                    Dataset(refds_path) if refds_path else ds,
                    query_agg,
                    # recursion above could only recurse into datasets
                    # on the filesystem, but there might be any number of
                    # uninstalled datasets underneath the last installed one
                    # for which we might have metadata
                    recursive=recursive,
                    **res_kwargs):
                yield r
        return
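Both metadata implementations use the same stack-based bookkeeping to attach a
`parentds` to each reported dataset: paths are walked in sorted order, and the
top of the stack is popped when the current path is no longer below it. A
self-contained illustration with a simplified stand-in for `path_is_subpath`
(the real helper from `datalad.utils` additionally rejects mixed
relative/absolute input and handles trailing-slash edge cases):

def path_is_subpath_simplified(path, prefix):
    # stand-in for datalad.utils.path_is_subpath: strictly below `prefix`
    return path != prefix and path.startswith(prefix.rstrip('/') + '/')

dspaths = sorted(['/ds', '/ds/sub-a', '/ds/sub-b', '/ds/sub-b/deep'])
parentds = []
for dspath in dspaths:
    if parentds and not path_is_subpath_simplified(dspath, parentds[-1]):
        parentds.pop()
    print(dspath, '-> parentds:', parentds[-1] if parentds else None)
    parentds.append(dspath)
# /ds            -> parentds: None
# /ds/sub-a      -> parentds: /ds
# /ds/sub-b      -> parentds: /ds
# /ds/sub-b/deep -> parentds: /ds/sub-b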