def test_find_files():
    tests_dir = dirname(__file__)
    proj_dir = normpath(opj(dirname(__file__), pardir))

    ff = find_files('.*', proj_dir)
    ok_generator(ff)
    files = list(ff)
    assert len(files) > 10  # we have more than 10 test files here
    assert_in(opj(tests_dir, 'test_utils.py'), files)
    # and no directories should be mentioned
    assert_not_in(tests_dir, files)

    ff2 = find_files('.*', proj_dir, dirs=True)
    files2 = list(ff2)
    assert_in(opj(tests_dir, 'test_utils.py'), files2)
    assert_in(tests_dir, files2)

    # now actually matching the path
    ff3 = find_files(r'.*\\test_.*\.py$' if on_windows else r'.*/test_.*\.py$',
                     proj_dir, dirs=True)
    files3 = list(ff3)
    assert_in(opj(tests_dir, 'test_utils.py'), files3)
    assert_not_in(tests_dir, files3)
    for f in files3:
        ok_startswith(basename(f), 'test_')

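# For orientation only: a simplified sketch (an assumption, not DataLad's actual
# implementation) of the find_files() behaviour the tests above rely on --
# walk topdir, yield every path whose full path matches the regex, include
# directories only when dirs=True, and skip .git paths unless exclude_vcs=False.
import os
import re


def _find_files_sketch(regex, topdir, dirs=False, exclude_vcs=True):
    expr = re.compile(regex)
    for root, dirnames, filenames in os.walk(topdir):
        names = list(filenames) + (list(dirnames) if dirs else [])
        for name in names:
            path = os.path.join(root, name)
            # skip anything with a .git component when VCS paths are excluded
            if exclude_vcs and '.git' in path.split(os.sep):
                continue
            if expr.search(path):
                yield path
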
def test_add_delete_after_and_drop(self):
    # To test that .tar gets removed
    # but that new stuff was added to annex repo.  We know the key since default
    # backend and content remain the same
    key1 = 'SHA256E-s5--16d3ad1974655987dd7801d70659990b89bfe7e931a0a358964e64e901761cc0.dat'

    # previous state of things:
    prev_files = list(find_files('.*', self.annex.path))
    assert_equal(self.annex.whereis(key1, key=True, output='full'), {})
    commits_prior = list(self.annex.get_branch_commits_('git-annex'))
    add_archive_content('1.tar', annex=self.annex,
                        strip_leading_dirs=True, delete_after=True)
    commits_after = list(self.annex.get_branch_commits_('git-annex'))
    # There should be a single commit for all additions +1 to initiate
    # datalad-archives gh-1258
    # If faking dates, there should be another +1 because
    # annex.alwayscommit isn't set to false.
    assert_equal(
        len(commits_after),
        # We expect one more when faking dates because
        # annex.alwayscommit isn't set to false.
        len(commits_prior) + 2 + self.annex.fake_dates_enabled)
    assert_equal(prev_files, list(find_files('.*', self.annex.path)))
    w = self.annex.whereis(key1, key=True, output='full')
    assert_equal(len(w), 2)  # in archive, and locally since we didn't drop

    # Let's now do the same but also drop content
    add_archive_content('1.tar', annex=self.annex,
                        strip_leading_dirs=True, delete_after=True,
                        drop_after=True)
    assert_equal(prev_files, list(find_files('.*', self.annex.path)))
    w = self.annex.whereis(key1, key=True, output='full')
    assert_equal(len(w), 1)  # in archive

    # there should be no .datalad temporary files hanging around
    self.assert_no_trash_left_behind()

def test_find_files_exclude_vcs(repo):
    ff = find_files('.*', repo, dirs=True)
    files = list(ff)
    assert_equal({basename(f) for f in files}, {'d1', 'git'})
    assert_not_in(opj(repo, '.git'), files)

    ff = find_files('.*', repo, dirs=True, exclude_vcs=False)
    files = list(ff)
    assert_equal({basename(f) for f in files}, {'d1', 'git', '.git', '1'})
    assert_in(opj(repo, '.git'), files)

def assert_no_trash_left_behind(self):
    assert_equal(
        list(find_files(r'\.datalad..*', self.annex.path,
                        exclude="config", dirs=True)),
        [])

def test_crawl_api_recursive(get_subdatasets_, run_pipeline_,
                             load_pipeline_from_config_,
                             get_repo_pipeline_script_path_,
                             get_lofilename_, chpwd_, tdir):
    pwd = getpwd()
    with chpwd(tdir):
        output, stats = crawl(recursive=True)
    assert_equal(pwd, getpwd())
    if external_versions['mock'] < '1.0.1':
        raise SkipTest(
            "needs a more recent mock which throws exceptions in side_effects")
    assert_equal(output, [[]] * 4 + [None])  # for now output is just a list of outputs
    assert_equal(
        stats,
        ActivityStats(datasets_crawled=5,
                      datasets_crawl_failed=1))  # nothing was done but we got it crawled
    chpwd_.assert_has_calls(
        [
            call(None),
            call('path1'),
            call('path1/path1_1'),
            call('path2'),
        ],
        any_order=True)
    assert_equal(
        list(find_files('.*', tdir, exclude_vcs=False)),
        [_path_(tdir, 'some.log')])  # no files were generated besides the log

def get_core_metadata_filenames(self):
    return list(find_files(
        r'meta\.json',
        topdir=opj(self.ds.path, '.datalad', 'meta'),
        exclude=None,
        exclude_vcs=False,
        exclude_datalad=False,
        dirs=False))

def test_balsa_pipeline1(ind, topurl, outd, clonedir):
    list(initiate_dataset(
        template="balsa",
        dataset_name='dataladtest-WG33',
        path=outd,
        data_fields=['dataset_id'])({'dataset_id': 'WG33'}))

    with chpwd(outd):
        pipeline = ofpipeline('WG33', url=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)

    # since we now base incoming on master -- and there was nothing custom
    # in master after incoming-processed, both branches should be the same
    eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # but that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    get_branch_commits = repo.get_branch_commits_ \
        if hasattr(repo, 'get_branch_commits_') else repo.get_branch_commits
    commits = {b: list(get_branch_commits(b)) for b in branches}
    # all commits out there -- init ds + init crawler + 1*(incoming, processed)
    # The number of commits in master differs based on the create variant used
    # (the one on DataLad's master makes only one commit).
    ncommits_master = len(commits["master"])
    assert_in(ncommits_master, [4, 5])
    # incoming branches from master but lacks one merge commit.
    eq_(len(commits['incoming']), ncommits_master - 1)
    # incoming-processed is on master.
    eq_(len(commits['incoming-processed']), ncommits_master)

    with chpwd(outd):
        eq_(set(glob('*')), {'dir1', 'file1.nii'})
        all_files = sorted(find_files('.'))

    fpath = opj(outd, 'file1.nii')
    ok_file_has_content(fpath, "content of file1.nii")
    ok_file_under_git(fpath, annexed=True)
    fpath2 = opj(outd, 'dir1', 'file2.nii')
    ok_file_has_content(fpath2, "content of file2.nii")
    ok_file_under_git(fpath2, annexed=True)

    target_files = {
        './.datalad/crawl/crawl.cfg',
        './.datalad/crawl/statuses/incoming.json',
        './.datalad/meta/balsa.json',
        './.datalad/config',
        './file1.nii',
        './dir1/file2.nii',
    }
    eq_(set(all_files), target_files)

def test_balsa_pipeline1(ind, topurl, outd, clonedir):
    list(initiate_dataset(
        template="balsa",
        dataset_name='dataladtest-WG33',
        path=outd,
        data_fields=['dataset_id'])({'dataset_id': 'WG33'}))

    with chpwd(outd):
        pipeline = ofpipeline('WG33', url=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    commits = {b: list(repo.get_branch_commits(b)) for b in branches}
    eq_(len(commits['incoming']), 1)
    eq_(len(commits['incoming-processed']), 2)
    # all commits out there -- init ds + init crawler + 1*(incoming, processed, merge)
    eq_(len(commits['master']), 6)

    with chpwd(outd):
        eq_(set(glob('*')), {'dir1', 'file1.nii'})
        all_files = sorted(find_files('.'))

    fpath = opj(outd, 'file1.nii')
    ok_file_has_content(fpath, "content of file1.nii")
    ok_file_under_git(fpath, annexed=True)
    fpath2 = opj(outd, 'dir1', 'file2.nii')
    ok_file_has_content(fpath2, "content of file2.nii")
    ok_file_under_git(fpath2, annexed=True)

    target_files = {
        './.datalad/crawl/crawl.cfg',
        './.datalad/crawl/statuses/incoming.json',
        './.datalad/meta/balsa.json',
        './.datalad/config',
        './file1.nii',
        './dir1/file2.nii',
    }
    eq_(set(all_files), target_files)

def test_add_delete_after_and_drop(self):
    # To test that .tar gets removed
    # but that new stuff was added to annex repo.  We know the key since
    # default backend and content remain the same
    key1 = 'MD5E-s5--db87ebcba59a8c9f34b68e713c08a718.dat'
    repo = self.ds.repo

    # previous state of things:
    prev_files = list(find_files('.*', self.ds.path))
    assert_equal(repo.whereis(key1, key=True, output='full'), {})
    commits_prior = list(repo.get_branch_commits_('git-annex'))
    self.ds.add_archive_content('1.tar',
                                strip_leading_dirs=True,
                                delete_after=True)
    commits_after = list(repo.get_branch_commits_('git-annex'))
    # There should be a single commit for all additions +1 to initiate
    # datalad-archives gh-1258
    # If faking dates, there should be another +1 because
    # annex.alwayscommit isn't set to false.
    assert_equal(
        len(commits_after),
        # We expect one more when faking dates because
        # annex.alwayscommit isn't set to false.
        len(commits_prior) + 2 + repo.fake_dates_enabled)
    assert_equal(prev_files, list(find_files('.*', self.ds.path)))
    w = repo.whereis(key1, key=True, output='full')
    assert_equal(len(w), 2)  # in archive, and locally since we didn't drop

    # Let's now do the same but also drop content
    self.ds.add_archive_content('1.tar',
                                strip_leading_dirs=True,
                                delete_after=True,
                                drop_after=True)
    assert_equal(prev_files, list(find_files('.*', self.ds.path)))
    w = repo.whereis(key1, key=True, output='full')
    assert_equal(len(w), 1)  # in archive

    # there should be no .datalad temporary files hanging around
    self.assert_no_trash_left_behind()

def test_add_archive_dirs(path_orig=None, url=None, repo_path=None):
    # change to repo_path
    with chpwd(repo_path):
        # create annex repo
        ds = Dataset(repo_path).create(force=True)
        repo = ds.repo
        # add archive to the repo so we could test
        with swallow_outputs():
            repo.add_url_to_file('1.tar.gz', opj(url, '1.tar.gz'))
        repo.commit("added 1.tar.gz")

        # test with excludes and annex options
        add_archive_content(
            '1.tar.gz',
            existing='archive-suffix',
            # Since inconsistent and seems in many cases no
            # leading dirs to strip, keep them as provided
            strip_leading_dirs=True,
            delete=True,
            leading_dirs_consider=['crcns.*', '1'],
            leading_dirs_depth=2,
            use_current_dir=False,
            exclude='.*__MACOSX.*')  # some junk penetrates

        eq_(repo.get_description(
                uuid=DATALAD_SPECIAL_REMOTES_UUIDS[ARCHIVES_SPECIAL_REMOTE]),
            '[%s]' % ARCHIVES_SPECIAL_REMOTE)

        all_files = sorted(find_files('.'))
        # posixify paths to make it work on Windows as well
        all_files = [Path(file).as_posix() for file in all_files]
        target_files = {
            'CR24A/behaving1/1 f.txt',
            'CR24C/behaving3/3 f.txt',
            'CR24D/behaving2/2 f.txt',
            '.datalad/config',
        }
        eq_(set(all_files), target_files)

        # regression test: the subdir in MACOSX wasn't excluded and its name was
        # getting stripped by leading_dir_len
        # if stripping and exclude didn't work this fails
        assert_false(exists('__MACOSX'))
        # if exclude doesn't work then name of subdir gets stripped by
        # leading_dir_len
        assert_false(exists('c-1_data'))
        # if exclude doesn't work but everything else works this fails
        assert_false(exists('CR24B'))

def test_add_archive_dirs(path_orig, url, repo_path):
    # change to repo_path
    with chpwd(repo_path):
        # create annex repo
        repo = AnnexRepo(repo_path, create=True)
        # add archive to the repo so we could test
        with swallow_outputs():
            repo.add_urls([opj(url, '1.tar.gz')], options=["--pathdepth", "-1"])
        repo.commit("added 1.tar.gz")

        # test with excludes and annex options
        add_archive_content(
            '1.tar.gz',
            existing='archive-suffix',
            # Since inconsistent and seems in many cases no leading dirs to
            # strip, keep them as provided
            strip_leading_dirs=True,
            delete=True,
            leading_dirs_consider=['crcns.*', '1'],
            leading_dirs_depth=2,
            use_current_dir=False,
            exclude='.*__MACOSX.*')  # some junk penetrates

        if external_versions['cmd:annex'] >= '6.20170208':
            # should have fixed remotes
            eq_(repo.get_description(
                    uuid=DATALAD_SPECIAL_REMOTES_UUIDS[ARCHIVES_SPECIAL_REMOTE]),
                '[%s]' % ARCHIVES_SPECIAL_REMOTE)

        all_files = sorted(find_files('.'))
        target_files = {
            './CR24A/behaving1/1 f.txt',
            './CR24C/behaving3/3 f.txt',
            './CR24D/behaving2/2 f.txt',
        }
        eq_(set(all_files), target_files)

        # regression test: the subdir in MACOSX wasn't excluded and its name was
        # getting stripped by leading_dir_len
        # if stripping and exclude didn't work this fails
        assert_false(exists('__MACOSX'))
        # if exclude doesn't work then name of subdir gets stripped by
        # leading_dir_len
        assert_false(exists('c-1_data'))
        # if exclude doesn't work but everything else works this fails
        assert_false(exists('CR24B'))

def get_mtimes_and_digests(target_path):
    """Return digests (md5) and mtimes for all the files under target_path"""
    from datalad.utils import find_files
    from datalad.support.digests import Digester
    digester = Digester(['md5'])

    # bother only with existing ones for this test, i.e. skip annexed files
    # without content
    target_files = [
        f for f in find_files('.*', topdir=target_path,
                              exclude_vcs=False, exclude_datalad=False)
        if exists(f)
    ]
    # let's leave only relative paths for easier analysis
    target_files_ = [relpath(f, target_path) for f in target_files]

    digests = {frel: digester(f) for f, frel in zip(target_files, target_files_)}
    mtimes = {frel: os.stat(f).st_mtime for f, frel in zip(target_files, target_files_)}
    return digests, mtimes

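# A minimal usage sketch (an assumption, not part of the original suite):
# snapshot a dataset before and after an operation and report files whose md5
# digest changed; `dataset_path` in the commented example is a hypothetical
# checkout location.
def _changed_files(before, after):
    """Return relative paths whose digests differ between two snapshots

    `before` and `after` are (digests, mtimes) tuples as returned by
    get_mtimes_and_digests()
    """
    digests_before, _ = before
    digests_after, _ = after
    return {f for f in digests_after
            if digests_before.get(f) != digests_after[f]}

# before = get_mtimes_and_digests(dataset_path)
# ... run the operation under test ...
# after = get_mtimes_and_digests(dataset_path)
# assert not _changed_files(before, after), "content unexpectedly changed"
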
def test_balsa_pipeline2(ind, topurl, outd, clonedir):
    list(initiate_dataset(
        template="balsa",
        dataset_name='dataladtest-WG33',
        path=outd,
        data_fields=['dataset_id'])({'dataset_id': 'WG33'}))

    with chpwd(outd):
        with swallow_logs(new_level=logging.WARN) as cml:
            pipeline = ofpipeline('WG33', url=topurl)
            out = run_pipeline(pipeline)
            assert_true(
                'The following files do not exist in the canonical tarball, '
                'but are individually listed files and will not be kept:'
                in cml.out)
            assert_true(
                './file1.nii varies in content from the individually downloaded '
                'file with the same name, it is removed and file from canonical '
                'tarball is kept' in cml.out)
    eq_(len(out), 1)

    with chpwd(outd):
        eq_(set(glob('*')), {'dir1', 'file1.nii'})
        all_files = sorted(find_files('.'))

    fpath = opj(outd, 'file1.nii')
    ok_file_has_content(fpath, "content of file1.nii")
    ok_file_under_git(fpath, annexed=True)
    fpath2 = opj(outd, 'dir1', 'file2.nii')
    ok_file_has_content(fpath2, "content of file2.nii")
    ok_file_under_git(fpath2, annexed=True)

    target_files = {
        './.datalad/config',
        './.datalad/crawl/crawl.cfg',
        './.datalad/crawl/statuses/incoming.json',
        './.datalad/meta/balsa.json',
        './file1.nii',
        './dir1/file2.nii',
    }
    eq_(set(all_files), target_files)

def get_obsolete(self):
    """Returns full paths for files which weren't queried, thus must have been deleted

    Note that it doesn't track across branches, etc.
    """
    if not self._track_queried:
        raise RuntimeError(
            "Cannot determine which files were removed since track_queried "
            "was set to False")
    obsolete = []
    # those aren't tracked by annexificator
    datalad_path = opj(self.annex.path, HANDLE_META_DIR)
    for fpath in find_files('.*', topdir=self.annex.path):
        filepath = self._get_filepath(fpath)
        if filepath.startswith(datalad_path):
            continue
        if fpath not in self._queried_filepaths:
            obsolete.append(filepath)
    return obsolete

def test_openfmri_pipeline1(ind, topurl, outd, clonedir):
    index_html = opj(ind, 'ds666', 'index.html')

    list(initiate_dataset(
        template="openfmri",
        dataset_name='dataladtest-ds666',
        path=outd,
        data_fields=['dataset'])({'dataset': 'ds666'}))

    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Since datalad 0.11.2 all .metadata/objects go under annex.
    # Here we have a test where we force drop all annexed content,
    # to mitigate that let's place all metadata under git
    dotdatalad_attributes_file = opj('.datalad', '.gitattributes')
    repo.set_gitattributes(
        [('metadata/objects/**', {'annex.largefiles': '(nothing)'})],
        dotdatalad_attributes_file)
    # --amend so we do not cause change in # of commits below
    repo.commit("gitattributes", files=dotdatalad_attributes_file,
                options=['--amend'])

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows
    # incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    t1w_fpath_nover = opj(outd, 'sub1', 'anat', 'sub-1_T1w.dat')
    ok_file_has_content(t1w_fpath_nover, "mighty load in old format")

    #
    # And now versioned files were specified!
    #
    add_to_index(index_html, content=_versioned_files)

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    ok_(not exists(t1w_fpath_nover),
        "%s file should no longer be there if unversioned files get removed "
        "correctly" % t1w_fpath_nover)

    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows
    # incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    # actually the tree should look quite neat with 1.0.0 tag having 1 parent in incoming
    # 1.0.1 having 1.0.0 and the 2nd commit in incoming as parents
    commits_hexsha = {b: list(_get_branch_commits(repo, b)) for b in branches}
    commits_l = {
        b: list(_get_branch_commits(repo, b, limit='left-only'))
        for b in branches
    }
    # all commits out there:
    #  dataset init, crawler init
    #    (2 commits)
    #  + 3*(incoming, processed, merge)
    #  + 3*aggregate-metadata update
    #  - 1 since now that incoming starts with master, there is one less merge
    # In --incremental mode there is a side effect of absent now
    #  2*remove of obsolete metadata object files,
    #  see https://github.com/datalad/datalad/issues/2772
    # TODO inspect by knowledgeable person and re-enable
    #ncommits_master = len(commits_hexsha['master'])
    #assert_in(ncommits_master, [13, 14])
    #assert_in(len(commits_l['master']), [8, 9])

    # TODO inspect by knowledgeable person and re-enable
    #eq_(len(commits_hexsha['incoming']), ncommits_master - 8)
    #eq_(len(commits_l['incoming']), ncommits_master - 8)
    #eq_(len(commits_hexsha['incoming-processed']), ncommits_master - 5)
    #eq_(len(commits_l['incoming-processed']), ncommits_master - 8)

    # Check tags for the versions
    eq_(out[0]['datalad_stats'].get_total().versions, ['1.0.0', '1.0.1'])
    # +1 because original "release" was assumed to be 1.0.0
    repo_tags = repo.get_tags()
    eq_(repo.get_tags(output='name'), ['1.0.0', '1.0.0+1', '1.0.1'])
    # Ben: The tagged ones currently are the ones with the message
    # '[DATALAD] dataset aggregate metadata update\n':
    #eq_(repo_tags[0]['hexsha'], commits_l['master'][4])  # next to the last one
    #eq_(repo_tags[-1]['hexsha'], commits_l['master'][0])  # the last one

    def hexsha(l):
        return l.__class__(x.hexsha for x in l)

    # TODO requires additional tooling to re-enable
    ## Verify that we have desired tree of merges
    #eq_(hexsha(commits_l['incoming-processed'][0].parents),
    #    (commits_l['incoming-processed'][1],
    #     commits_l['incoming'][0]))
    #eq_(hexsha(commits_l['incoming-processed'][2].parents),
    #    (commits_l['incoming-processed'][3],  # also in master
    #     commits_l['incoming'][2],))

    # ben: The following two comparisons are targeting these commits:
    # commit "Merge branch 'incoming-processed'\n" in commits_l['master'],
    # parents are:
    # commit "[DATALAD] dataset aggregate metadata update\n" in commits_l['master'] and
    # commit "[DATALAD] Added files from extracted archives\n\nFiles processed: 6\n renamed: 2\n +annex: 3\nBranches merged: incoming->incoming-processed\n" in commits_l['incoming-processed']
    # TODO requires additional tooling to re-enable
    #eq_(hexsha(commits_l['master'][1].parents), (commits_l['master'][2],
    #                                             commits_l['incoming-processed'][0]))
    #eq_(hexsha(commits_l['master'][3].parents), (commits_l['master'][4],
    #                                             commits_l['incoming-processed'][1]))

    with chpwd(outd):
        eq_(set(glob('*')), {'changelog.txt', 'sub-1'})
        all_files = sorted(find_files('.'))

    t1w_fpath = opj(outd, 'sub-1', 'anat', 'sub-1_T1w.dat')
    ok_file_has_content(t1w_fpath, "mighty load 1.0.1")
    ok_file_under_git(opj(outd, 'changelog.txt'), annexed=False)
    ok_file_under_git(t1w_fpath, annexed=True)

    try:
        # this is the new way
        from datalad.metadata.metadata import get_ds_aggregate_db_locations
        ds = Dataset('.')
        dbloc, objbase = get_ds_aggregate_db_locations(ds)
        dbloc = op.relpath(dbloc, start=ds.path)
    except ImportError:
        # this stopped working in early 2019 versions of datalad
        from datalad.metadata.metadata import agginfo_relpath
        dbloc = agginfo_relpath

    target_files = {
        './.datalad/config',
        './.datalad/crawl/crawl.cfg',
        # no more!
        # './.datalad/config.ttl', './.datalad/datalad.ttl',
        './.datalad/crawl/statuses/incoming.json',
        './.datalad/crawl/versions/incoming.json',
        './changelog.txt',
        './sub-1/anat/sub-1_T1w.dat',
        './sub-1/beh/responses.tsv',
        './' + dbloc,
    }
    target_incoming_files = {
        '.gitattributes',  # we marked default backend right in the incoming
        # we now base 'incoming' on master branch, so we get all those as well
        '.datalad/.gitattributes',
        '.datalad/config',
        '.datalad/crawl/crawl.cfg',
        'changelog.txt',
        'ds666.tar.gz',
        'ds666-beh_R1.0.1.tar.gz',
        'ds666_R1.0.0.tar.gz',
        'ds666_R1.0.1.tar.gz',
        'ds666_R2.0.0.tar.gz',
        '.datalad/crawl/statuses/incoming.json',
        '.datalad/crawl/versions/incoming.json'
    }
    # Ben: metadata object files may differ in their names containing some
    # checksum-ish shit ...
    # TODO: Check how those names are constructed and may be at least count
    # the number of created object files in addition to that comparison
    eq_(set([f for f in all_files
             if not f.startswith('./.datalad/metadata/objects/')]),
        target_files)

    # check that -beh was committed in 2nd commit in incoming, not the first one
    assert_not_in('ds666-beh_R1.0.1.tar.gz',
                  repo.get_files(commits_l['incoming'][-1]))
    assert_in('ds666-beh_R1.0.1.tar.gz',
              repo.get_files(commits_l['incoming'][0]))

    # rerun pipeline -- make sure we are on the same in all branches!
    with chpwd(outd):
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    commits_hexsha_ = {b: list(_get_branch_commits(repo, b)) for b in branches}
    eq_(commits_hexsha, commits_hexsha_)  # i.e. nothing new
    # actually we do manage to add_git 1 (README) since it is generated
    # committed directly to git
    # BUT now fixed -- if not committed (was the same), should be marked as skipped
    # Nothing was committed so stats leaked all the way up
    eq_(out[0]['datalad_stats'], ActivityStats(files=5, skipped=5, urls=5))
    eq_(out[0]['datalad_stats'], out[0]['datalad_stats'].get_total())

    # rerun pipeline when new content is available
    # add new revision, rerun pipeline and check that stuff was processed/added correctly
    add_to_index(
        index_html,
        content='<a href="ds666_R2.0.0.tar.gz">Raw data on AWS version 2.0.0</a>')

    with chpwd(outd):
        out = run_pipeline(pipeline)
        all_files_updated = sorted(find_files('.'))
    eq_(len(out), 1)
    assert_not_equal(out[0]['datalad_stats'].get_total(), ActivityStats())
    # there are no overlays ATM, so behav would be gone since no 2.0.0 for it!
    target_files.remove('./sub-1/beh/responses.tsv')

    # Ben: metadata object files may differ in their names containing some
    # checksum-ish shit ...
    # TODO: Check how those names are constructed and may be at least count
    # the number of created object files in addition to that comparison
    eq_(set([f for f in all_files_updated
             if not f.startswith('./.datalad/metadata/objects/')]),
        target_files)

    # new instance so it re-reads git stuff etc
    # repo = AnnexRepo(outd, create=False)  # to be used in the checks
    commits_hexsha_ = {b: list(_get_branch_commits(repo, b)) for b in branches}
    commits_l_ = {
        b: list(_get_branch_commits(repo, b, limit='left-only'))
        for b in branches
    }
    assert_not_equal(commits_hexsha, commits_hexsha_)
    eq_(out[0]['datalad_stats'], ActivityStats())  # commit happened so stats were consumed
    # numbers seem to be right
    total_stats = out[0]['datalad_stats'].get_total()
    # but for some reason downloaded_size fluctuates.... why? probably archiving...?
    total_stats.downloaded_size = 0
    eq_(total_stats,
        ActivityStats(files=8, skipped=5, downloaded=1, renamed=1, urls=6,
                      add_annex=2,
                      # add_git=1,  # README
                      versions=['2.0.0'],
                      merges=[['incoming', 'incoming-processed']]))

    check_dropall_get(repo)

    # Let's see if pipeline would remove files we stopped tracking
    remove_from_index(index_html, '<a href=.ds666_R1.0.0[^<]*</a>')
    with chpwd(outd):
        with swallow_logs(new_level=logging.WARNING) as cml:
            out = run_pipeline(pipeline)
            # since files get removed in incoming, but reprocessed completely into
            # incoming-processed and merged into master -- new commits will come.
            # They shouldn't have any difference but still should be new commits
            assert_in("There is already a tag 2.0.0 in the repository", cml.out)
    eq_(len(out), 1)

    incoming_files = repo.get_files('incoming')
    target_incoming_files.remove('ds666_R1.0.0.tar.gz')
    eq_(set(incoming_files), target_incoming_files)

    commits_hexsha_removed = {b: list(_get_branch_commits(repo, b)) for b in branches}
    # our 'statuses' database should have recorded the change thus got a diff
    # which propagated through all branches
    for b in 'master', 'incoming-processed':
        # with non persistent DB we had no changes
        # eq_(repo.repo.branches[b].commit.diff(commits_hexsha_[b][0]), [])
        assert_in(repo.pathobj / '.datalad/crawl/statuses/incoming.json',
                  repo.diff(b, commits_hexsha_[b][0]))
    dincoming = repo.diff('incoming', commits_hexsha_['incoming'][0])
    eq_(len(dincoming), 2)  # 2 diff objects -- 1 file removed, 1 statuses updated
    eq_(set(dincoming.keys()),
        {repo.pathobj / '.datalad/crawl/statuses/incoming.json',
         repo.pathobj / 'ds666_R1.0.0.tar.gz'})

    eq_(out[0]['datalad_stats'].get_total().removed, 1)
    assert_not_equal(commits_hexsha_, commits_hexsha_removed)

    # we will check if a clone would be crawling just as good
    from datalad.api import crawl

    # make a brand new clone
    GitRepo.clone(outd, clonedir)

    def _pipeline(*args, **kwargs):
        """Helper to mock openfmri.pipeline invocation so it looks at our 'server'"""
        kwargs = updated(kwargs, {'topurl': topurl, 'versioned_urls': False})
        return ofpipeline(*args, **kwargs)

    with chpwd(clonedir), patch.object(openfmri, 'pipeline', _pipeline):
        output, stats = crawl()  # we should be able to recrawl without doing anything
        ok_(stats, ActivityStats(files=6, skipped=6, urls=5))