# NOTE: assumed imports for this test excerpt (helper names as used in
# the datalad test suite; the exact module paths are assumptions)
import datalad.utils as ut
from datalad.api import Dataset
from datalad.support.gitrepo import GitRepo
from datalad.tests.utils import (
    assert_equal,
    assert_in,
    assert_not_in,
    assert_raises,
    get_convoluted_situation,
    with_tempfile,
)


@with_tempfile
def test_get_content_info(path):
    repo = GitRepo(path)
    assert_equal(repo.get_content_info(), {})
    # an unresolvable reference (no commit yet, so HEAD does not resolve)
    # causes an exception
    assert_raises(ValueError, repo.get_content_info, ref='HEAD')

    ds = get_convoluted_situation(path)
    repopath = ds.repo.pathobj

    assert_equal(ds.repo.pathobj, ut.Path(path))
    assert_equal(ds.pathobj, ut.Path(path))

    # verify general rules on fused info records that are incrementally
    # assembled: git content info first, amended with annex info on 'HEAD'
    # (to get the last committed state, and with it content that may have
    # since vanished), and lastly annex info wrt the present worktree, to
    # also get info on added/staged content
    # this fuses the info reported from
    # - git ls-files
    # - git annex findref HEAD
    # - git annex find --include '*'
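    # for illustration, a fused record for a committed annexed file can
    # carry keys from all three sources (values here are made up):
    #   {'type': 'file', 'gitshasum': '1b2c...', 'key': 'MD5E-s5--...',
    #    'keyname': '...', 'backend': 'MD5E', 'bytesize': 5,
    #    'has_content': True}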
    for f, r in ds.repo.annexstatus().items():
        if f.match('*_untracked'):
            assert (r.get('gitshasum', None) is None)
        if f.match('*_deleted'):
            assert not f.exists() and not f.is_symlink()
        if f.match('subds_*'):
            assert r['type'] == (
                'dataset' if r.get('gitshasum', None) else 'directory')
        if f.match('file_*'):
            # which one exactly depends on many things
            assert_in(r['type'], ('file', 'symlink'))
        if f.match('file_ingit*'):
            assert (r['type'] == 'file')
        elif '.datalad' not in f.parts and not f.match('.git*') and \
                r.get('gitshasum', None) and not f.match('subds*'):
            # this should be known to annex, one way or another
            # regardless of whether things are deleted or staged
            # or anything in between
            assert_in('key', r, f)
            assert_in('keyname', r, f)
            assert_in('backend', r, f)
            assert_in('bytesize', r, f)
            # no duplication with path
            assert_not_in('file', r, f)

    # query full untracked report
    res = ds.repo.get_content_info()
    assert_in(repopath.joinpath('dir_untracked', 'file_untracked'), res)
    assert_not_in(repopath.joinpath('dir_untracked'), res)
    # query for compact untracked report
    res = ds.repo.get_content_info(untracked='normal')
    assert_not_in(repopath.joinpath('dir_untracked', 'file_untracked'), res)
    assert_in(repopath.joinpath('dir_untracked'), res)
    # query no untracked report
    res = ds.repo.get_content_info(untracked='no')
    assert_not_in(repopath.joinpath('dir_untracked', 'file_untracked'), res)
    assert_not_in(repopath.joinpath('dir_untracked'), res)

    # git status integrity
    status = ds.repo.status()
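    # each status record is expected to carry at least 'type' and 'state';
    # the loop below checks them against the naming convention of the
    # convoluted test situation (e.g. 'file_modified' must have state
    # 'modified')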
    for t in ('subds', 'file'):
        for s in ('untracked', 'added', 'deleted', 'clean', 'ingit_clean',
                  'dropped_clean', 'modified', 'ingit_modified'):
            for l in ('', ut.PurePosixPath('subdir')):
                if (t == 'subds' and 'ingit' in s) or 'dropped' in s:
                    # invalid combination
                    continue
                if t == 'subds' and s == 'deleted':
                    # same as subds_unavailable -> clean
                    continue
                p = repopath.joinpath(l, '{}_{}'.format(t, s))
                assert p.match('*_{}'.format(status[p]['state'])), p
                if t == 'subds':
                    assert_in(status[p]['type'], ('dataset', 'directory'), p)
                else:
                    assert_in(status[p]['type'], ('file', 'symlink'), p)

    # git annex status integrity
    annexstatus = ds.repo.annexstatus()
    for t in ('file', ):
        for s in ('untracked', 'added', 'deleted', 'clean', 'ingit_clean',
                  'dropped_clean', 'modified', 'ingit_modified'):
            for l in ('', ut.PurePosixPath('subdir')):
                p = repopath.joinpath(l, '{}_{}'.format(t, s))
                if s in ('untracked', 'ingit_clean', 'ingit_modified'):
                    # annex knows nothing about these things
                    assert_not_in('key', annexstatus[p])
                    continue
                assert_in('key', annexstatus[p])
                # dear future,
                # if the next assertion fails, git-annex might have changed
                # the nature of the paths that are being reported by
                # `annex find --json`:
                # when this was written, `hashdir*` was a native path, but
                # `file` was a POSIX path
                assert_equal(annexstatus[p]['has_content'], 'dropped' not in s)

    # check the different subds evaluation modes
    someds = Dataset(ds.pathobj / 'subds_modified' / 'someds')
    dirtyds_path = someds.pathobj / 'dirtyds'
    assert_not_in('state',
                  someds.repo.status(eval_submodule_state='no')[dirtyds_path])
    assert_equal(
        'clean',
        someds.repo.status(
            eval_submodule_state='commit')[dirtyds_path]['state'])
    assert_equal(
        'modified',
        someds.repo.status(eval_submodule_state='full')[dirtyds_path]['state'])
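

# A minimal sketch (not part of the original test), assuming `repo_path`
# points at an existing git/git-annex repository: it exercises the three
# `untracked` reporting modes shown above.
def _sketch_untracked_modes(repo_path):
    repo = GitRepo(repo_path)
    # default: every individual untracked file is reported
    full = repo.get_content_info()
    # untracked content is folded into its containing directory
    compact = repo.get_content_info(untracked='normal')
    # untracked content is not reported at all
    none = repo.get_content_info(untracked='no')
    return full, compact, none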
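
# NOTE: assumed imports for this metadata excerpt (the module paths for
# the JSON loaders are assumptions; they must provide plain and stream
# JSON loading)
from six import iteritems, text_type

import datalad.utils as ut
from datalad.support.json_py import (
    load as json_load,
    load_stream as json_streamload,
)

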
def _yield_metadata_records(aggdspath, agg_record, query_paths, reporton,
                            parentds):
    dsmeta = None
    if reporton in ('datasets', 'all', 'jsonld') \
            and 'dataset_info' in agg_record:
        # we do not need path matching here, we already know
        # that something in this dataset is relevant
        objfile = text_type(agg_record['dataset_info'])
        # TODO: if the object file does not exist but was requested,
        # report 'impossible'?
        dsmeta = json_load(objfile)
        info = dict(
            path=text_type(aggdspath),
            status='ok',
            type='dataset',
            metadata=dsmeta,
            # fields that should be present, but may be missing in old
            # records -- treat them as optional to be more robust
            dsid=agg_record.get('id', None),
            refcommit=agg_record.get('refcommit', None),
            datalad_version=agg_record.get('datalad_version', None),
        )
        if parentds:
            info['parentds'] = parentds
        yield info
    if reporton in ('files', 'all', 'jsonld') and 'content_info' in agg_record:
        objfile = text_type(agg_record['content_info'])
        # TODO: if the object file does not exist but was requested,
        # report 'impossible'?
        for file_record in json_streamload(objfile):
            if 'path' not in file_record:  # pragma: no cover
                yield dict(
                    status='error',
                    message=("content metadata contains record "
                             "without a 'path' specification: %s", agg_record),
                    type='dataset',
                    path=text_type(aggdspath),
                )
                continue
            # absolute path for this file record
            # metadata record always uses POSIX conventions
            fpath = aggdspath / ut.PurePosixPath(file_record['path'])
            if not any(p == fpath or p in fpath.parents for p in query_paths):
                # ignore any file record that doesn't match any query
                # path (direct hit or git-annex-like recursion within a
                # dataset)
                continue
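            # e.g. a query path 'sub/dir' matches both the record for
            # 'sub/dir' itself (direct hit) and the one for
            # 'sub/dir/deep/file' ('sub/dir' is among its parents)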
            if dsmeta is not None and \
                    '@context' in dsmeta and \
                    '@context' not in file_record:
                file_record['@context'] = dsmeta['@context']
            info = dict(
                path=text_type(fpath),
                parentds=text_type(aggdspath),
                status='ok',
                type='file',
                metadata={
                    k: v
                    for k, v in iteritems(file_record) if k not in ('path', )
                },
                # really old extracts did not have 'id'
                dsid=agg_record.get('id', None),
                refcommit=agg_record['refcommit'],
                datalad_version=agg_record['datalad_version'],
            )
            yield info
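

# For illustration (not part of the original module): judging from the keys
# accessed above, an aggregate record minimally points to the serialized
# metadata objects and carries some provenance information; all values
# below are made up.
_EXAMPLE_AGG_RECORD = {
    'dataset_info': '/some/objects/dsmeta.json',    # dataset-level metadata
    'content_info': '/some/objects/content.jsonl',  # per-file records (JSON stream)
    'id': '<dataset UUID>',
    'refcommit': '<SHA of the last metadata-relevant commit>',
    'datalad_version': '0.12.0',
}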