def test_openfmri_superdataset_pipeline1(ind, topurl, outd):
    list(initiate_dataset(
        template="openfmri",
        template_func="superdataset_pipeline",
        template_kwargs={'url': topurl},
        path=outd,
    )())

    with chpwd(outd):
        crawl()
        #pipeline = ofcpipeline(url=topurl)
        #out = run_pipeline(pipeline)
        #eq_(out, [{'datalad_stats': ActivityStats()}])

    # TODO: replace below command with the one listing subdatasets
    subdatasets = ['ds000001', 'ds000002']
    eq_(Dataset(outd).get_subdatasets(fulfilled=True), subdatasets)

    # Check that crawling configuration was created for every one of those
    for sub in subdatasets:
        repo = GitRepo(opj(outd, sub))
        assert not repo.dirty
        assert exists(opj(repo.path, CRAWLER_META_CONFIG_PATH))

    # TODO: check that configuration for the crawler is up to the standard
    # Ideally should also crawl some fake datasets I guess
def check_crawl_autoaddtext(gz, ind, topurl, outd):
    ds = create(outd)
    ds.run_procedure("cfg_text2git")
    with chpwd(outd):  # TODO -- dataset argument
        template_kwargs = {
            'url': topurl,
            'a_href_match_': '.*',
        }
        if gz:
            template_kwargs['archives_re'] = r"\.gz$"
        crawl_init(
            template_kwargs,
            save=True,
            template='simple_with_archives')
        try:
            crawl()
        except MissingExternalDependency as exc:
            raise SkipTest(exc_str(exc))
    ok_clean_git(outd)
    ok_file_under_git(outd, "anothertext", annexed=False)
    ok_file_under_git(outd, "d/textfile", annexed=False)
    ok_file_under_git(outd, "d/tooshort", annexed=True)

    if 'compressed.dat.gz' in TEST_TREE2:
        if gz:
            ok_file_under_git(outd, "compressed.dat", annexed=False)
            ok_file_has_content(op.join(outd, "compressed.dat"),
                                u"мама мыла раму")
        else:
            ok_file_under_git(outd, "compressed.dat.gz", annexed=True)
    else:
        raise SkipTest(
            "Need datalad >= 0.11.2 to test .gz files decompression")
def test_obscure_names(path):
    bucket = "datalad-test2-obscurenames-versioned"
    get_test_providers('s3://' + bucket)  # to verify having s3 credentials
    create(path)
    with externals_use_cassette(
            'test_simple_s3_test2_obscurenames_versioned_crawl_ext'), \
            chpwd(path):
        crawl_init(template="simple_s3", args=dict(bucket=bucket), save=True)
        crawl()
    # fun with unicode was postponed
    ok_clean_git(path, annex=True)
    for f in ['f &$=@:+,?;', "f!-_.*'( )", 'f 1', 'f [1][2]']:
        ok_file_under_git(path, f, annexed=True)
def test_crawl_autoaddtext(ind, topurl, outd):
    ds = create(outd, text_no_annex=True)
    with chpwd(outd):  # TODO -- dataset argument
        crawl_init(
            {'url': topurl, 'a_href_match_': '.*'},
            save=True,
            template='simple_with_archives')
        crawl()
    ok_clean_git(outd)
    ok_file_under_git(outd, "anothertext", annexed=False)
    ok_file_under_git(outd, "d/textfile", annexed=False)
    ok_file_under_git(outd, "d/tooshort", annexed=True)
def test_crawl_api_recursive(get_subdatasets_, run_pipeline_,
                             load_pipeline_from_config_,
                             get_repo_pipeline_script_path_,
                             get_lofilename_, chpwd_, tdir):
    pwd = getpwd()
    with chpwd(tdir):
        output, stats = crawl(recursive=True)
    assert_equal(pwd, getpwd())

    if external_versions['mock'] < '1.0.1':
        raise SkipTest(
            "needs a more recent mock which throws exceptions in side_effects")

    assert_equal(output, [[]] * 4 + [None])  # for now output is just a list of outputs
    assert_equal(
        stats,
        ActivityStats(datasets_crawled=5,
                      datasets_crawl_failed=1))  # nothing was done but we got it crawled
    chpwd_.assert_has_calls(
        [
            call(None),
            call('path1'),
            call('path1/path1_1'),
            call('path2'),
        ],
        any_order=True)
    assert_equal(
        list(find_files('.*', tdir, exclude_vcs=False)),
        [_path_(tdir, 'some.log')])  # no files were generated besides the log
def test_crawl(tempd):
    if not _get_github_cred().is_known:
        raise SkipTest("no github credential")
    ds = create(tempd)
    with chpwd(tempd):
        crawl_init(
            template='gh', save=True,
            args={'org': 'datalad-collection-1', 'include': 'kaggle'})
        crawl()
    subdss = ds.subdatasets(fulfilled=True, result_xfm='datasets')
    assert all('kaggle' in d.path for d in subdss)
    assert_greater(len(subdss), 1)
    assert_false(ds.repo.dirty)
def test_crawl_api_chdir(run_pipeline_, load_pipeline_from_config_, chpwd_):
    output, stats = crawl('some_path_not_checked', chdir='somedir')

    assert_equal(stats, ActivityStats(datasets_crawled=1))  # nothing was done but we got it
    assert_equal(output, None)

    chpwd_.assert_called_with('somedir')
    load_pipeline_from_config_.assert_called_with('some_path_not_checked')
    run_pipeline_.assert_called_with(['pipeline'],
                                     stats=ActivityStats(datasets_crawled=1))
def _test_drop(path, drop_immediately):
    s3url = 's3://datalad-test0-nonversioned'
    providers = get_test_providers(s3url)  # to verify having s3 credentials
    # vcr tape is getting bound to the session object, so we need to
    # force re-establishing the session for the bucket.
    # TODO (in datalad): make a dedicated API for that, now too obscure
    _ = providers.get_status(s3url, allow_old_session=False)
    create(path)
    # unfortunately this doesn't work without force dropping since I guess vcr
    # stops and then gets queried again for the same tape while testing for
    # drop :-/
    with chpwd(path):
        crawl_init(
            template="simple_s3",
            args=dict(
                bucket="datalad-test0-nonversioned",
                drop=True,
                drop_force=True,  # so test goes faster
                drop_immediately=drop_immediately,
            ),
            save=True)
    if drop_immediately:
        # cannot figure out but taping that interaction results in
        # git annex addurl error.  No time to figure it out,
        # so we just crawl without vcr for now.  TODO: figure out WTF
        with chpwd(path):
            crawl()
    else:
        with externals_use_cassette(
                'test_simple_s3_test0_nonversioned_crawl_ext'
                + ('_immediately' if drop_immediately else '')), \
                chpwd(path):
            crawl()
    # test that all was dropped
    repo = AnnexRepo(path, create=False)
    files = glob(_path_(path, '*'))
    eq_(len(files), 8)
    for f in files:
        assert_false(repo.file_has_content(f))
def test_drop(path):
    get_test_providers('s3://datalad-test0-nonversioned')  # to verify having s3 credentials
    create(path)
    # unfortunately this doesn't work without force dropping since I guess vcr
    # stops and then gets queried again for the same tape while testing for
    # drop :-/
    with externals_use_cassette('test_simple_s3_test0_nonversioned_crawl_ext'), \
            chpwd(path):
        crawl_init(
            template="simple_s3",
            args=dict(
                bucket="datalad-test0-nonversioned",
                drop=True,
                drop_force=True  # so test goes faster
            ),
            save=True)
        crawl()
    # test that all was dropped
    repo = AnnexRepo(path, create=False)
    files = glob(_path_(path, '*'))
    eq_(len(files), 8)
    for f in files:
        assert_false(repo.file_has_content(f))
def test_simple1(ind, topurl, outd):
    list(initiate_dataset(
        template="simple_with_archives",
        dataset_name='test1',
        path=outd,
        add_fields={'url': topurl + 'study/show/WG33',
                    'a_href_match_': '.*download.*'}
    )({}))
    with chpwd(outd):
        out, stats = crawl()
    eq_(stats.add_annex, 3)
    ok_file_under_git(outd, 'file1.nii', annexed=True)
    ok_file_has_content(opj(outd, 'file1.nii'), 'content of file1.nii')
    ok_file_under_git(outd, _path_('dir1/file2.nii'), annexed=True)
    ok_file_has_content(opj(outd, 'dir1', 'file2.nii'), 'content of file2.nii')
    eq_(len(out), 1)
def test_openfmri_pipeline1(ind, topurl, outd, clonedir):
    index_html = opj(ind, 'ds666', 'index.html')

    list(initiate_dataset(
        template="openfmri",
        dataset_name='dataladtest-ds666',
        path=outd,
        data_fields=['dataset'])({'dataset': 'ds666'}))

    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Since datalad 0.11.2 all .metadata/objects go under annex.
    # Here we have a test where we force drop all annexed content;
    # to mitigate that let's place all metadata under git
    dotdatalad_attributes_file = opj('.datalad', '.gitattributes')
    repo.set_gitattributes(
        [('metadata/objects/**', {'annex.largefiles': '(nothing)'})],
        dotdatalad_attributes_file)
    # --amend so we do not cause change in # of commits below
    repo.commit("gitattributes", files=dotdatalad_attributes_file,
                options=['--amend'])

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    t1w_fpath_nover = opj(outd, 'sub1', 'anat', 'sub-1_T1w.dat')
    ok_file_has_content(t1w_fpath_nover, "mighty load in old format")

    #
    # And now versioned files were specified!
    #
    add_to_index(index_html, content=_versioned_files)

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    ok_(not exists(t1w_fpath_nover),
        "%s file should no longer be there if unversioned files get removed correctly"
        % t1w_fpath_nover)
    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    # actually the tree should look quite neat with 1.0.0 tag having 1 parent in incoming
    # and 1.0.1 having 1.0.0 and the 2nd commit in incoming as parents
    commits_hexsha = {b: list(_get_branch_commits(repo, b)) for b in branches}
    commits_l = {b: list(_get_branch_commits(repo, b, limit='left-only'))
                 for b in branches}

    # all commits out there:
    # dataset init, crawler init
    #   (2 commits)
    # + 3*(incoming, processed, merge)
    # + 3*aggregate-metadata update
    #   - 1 since now that incoming starts with master, there is one less merge
    # In --incremental mode there is a side effect of absent now
    #   2*remove of obsolete metadata object files,
    #   see https://github.com/datalad/datalad/issues/2772
    # TODO inspect by knowledgeable person and re-enable
    #ncommits_master = len(commits_hexsha['master'])
    #assert_in(ncommits_master, [13, 14])
    #assert_in(len(commits_l['master']), [8, 9])

    # TODO inspect by knowledgeable person and re-enable
    #eq_(len(commits_hexsha['incoming']), ncommits_master - 8)
    #eq_(len(commits_l['incoming']), ncommits_master - 8)
    #eq_(len(commits_hexsha['incoming-processed']), ncommits_master - 5)
    #eq_(len(commits_l['incoming-processed']), ncommits_master - 8)

    # Check tags for the versions
    eq_(out[0]['datalad_stats'].get_total().versions, ['1.0.0', '1.0.1'])
    # +1 because original "release" was assumed to be 1.0.0
    repo_tags = repo.get_tags()
    eq_(repo.get_tags(output='name'), ['1.0.0', '1.0.0+1', '1.0.1'])
    # Ben: The tagged ones currently are the ones with the message
    # '[DATALAD] dataset aggregate metadata update\n':
    #eq_(repo_tags[0]['hexsha'], commits_l['master'][4])  # next to the last one
    #eq_(repo_tags[-1]['hexsha'], commits_l['master'][0])  # the last one

    def hexsha(l):
        return l.__class__(x.hexsha for x in l)

    # TODO requires additional tooling to re-enable
    ## Verify that we have desired tree of merges
    #eq_(hexsha(commits_l['incoming-processed'][0].parents),
    #    (commits_l['incoming-processed'][1],
    #     commits_l['incoming'][0]))
    #eq_(hexsha(commits_l['incoming-processed'][2].parents),
    #    (commits_l['incoming-processed'][3],  # also in master
    #     commits_l['incoming'][2],))

    # ben: The following two comparisons are targeting these commits:
    # commit "Merge branch 'incoming-processed'\n" in commits_l['master'],
    # parents are:
    # commit "[DATALAD] dataset aggregate metadata update\n" in commits_l['master'] and
    # commit "[DATALAD] Added files from extracted archives\n\nFiles processed: 6\n renamed: 2\n +annex: 3\nBranches merged: incoming->incoming-processed\n" in commits_l['incoming-processed']
    # TODO requires additional tooling to re-enable
    #eq_(hexsha(commits_l['master'][1].parents),
    #    (commits_l['master'][2],
    #     commits_l['incoming-processed'][0]))
    #eq_(hexsha(commits_l['master'][3].parents),
    #    (commits_l['master'][4],
    #     commits_l['incoming-processed'][1]))

    with chpwd(outd):
        eq_(set(glob('*')), {'changelog.txt', 'sub-1'})
        all_files = sorted(find_files('.'))

    t1w_fpath = opj(outd, 'sub-1', 'anat', 'sub-1_T1w.dat')
    ok_file_has_content(t1w_fpath, "mighty load 1.0.1")
    ok_file_under_git(opj(outd, 'changelog.txt'), annexed=False)
    ok_file_under_git(t1w_fpath, annexed=True)

    try:
        # this is the new way
        from datalad.metadata.metadata import get_ds_aggregate_db_locations
        ds = Dataset('.')
        dbloc, objbase = get_ds_aggregate_db_locations(ds)
        dbloc = op.relpath(dbloc, start=ds.path)
    except ImportError:
        # this stopped working in early 2019 versions of datalad
        from datalad.metadata.metadata import agginfo_relpath
        dbloc = agginfo_relpath

    target_files = {
        './.datalad/config',
        './.datalad/crawl/crawl.cfg',
        # no more!
        # './.datalad/config.ttl', './.datalad/datalad.ttl',
        './.datalad/crawl/statuses/incoming.json',
        './.datalad/crawl/versions/incoming.json',
        './changelog.txt', './sub-1/anat/sub-1_T1w.dat', './sub-1/beh/responses.tsv',
        './' + dbloc,
    }
    target_incoming_files = {
        '.gitattributes',  # we marked default backend right in the incoming
        # we now base 'incoming' on master branch, so we get all those as well
        '.datalad/.gitattributes',
        '.datalad/config',
        '.datalad/crawl/crawl.cfg',
        'changelog.txt',
        'ds666.tar.gz',
        'ds666-beh_R1.0.1.tar.gz',
        'ds666_R1.0.0.tar.gz',
        'ds666_R1.0.1.tar.gz',
        'ds666_R2.0.0.tar.gz',
        '.datalad/crawl/statuses/incoming.json',
        '.datalad/crawl/versions/incoming.json'
    }
    # Ben: metadata object files may differ in their names containing some checksum-ish shit ...
    # TODO: Check how those names are constructed and maybe at least count the number of
    #       created object files in addition to that comparison
    eq_(set([f for f in all_files
             if not f.startswith('./.datalad/metadata/objects/')]),
        target_files)

    # check that -beh was committed in 2nd commit in incoming, not the first one
    assert_not_in('ds666-beh_R1.0.1.tar.gz',
                  repo.get_files(commits_l['incoming'][-1]))
    assert_in('ds666-beh_R1.0.1.tar.gz',
              repo.get_files(commits_l['incoming'][0]))

    # rerun pipeline -- make sure we are on the same in all branches!
    with chpwd(outd):
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    commits_hexsha_ = {b: list(_get_branch_commits(repo, b)) for b in branches}
    eq_(commits_hexsha, commits_hexsha_)  # i.e. nothing new
    # actually we do manage to add_git 1 (README) since it is generated and
    # committed directly to git
    # BUT now fixed -- if not committed (was the same), should be marked as skipped
    # Nothing was committed so stats leaked all the way up
    eq_(out[0]['datalad_stats'], ActivityStats(files=5, skipped=5, urls=5))
    eq_(out[0]['datalad_stats'], out[0]['datalad_stats'].get_total())

    # rerun pipeline when new content is available
    # add new revision, rerun pipeline and check that stuff was processed/added correctly
    add_to_index(
        index_html,
        content='<a href="ds666_R2.0.0.tar.gz">Raw data on AWS version 2.0.0</a>')

    with chpwd(outd):
        out = run_pipeline(pipeline)
        all_files_updated = sorted(find_files('.'))
    eq_(len(out), 1)
    assert_not_equal(out[0]['datalad_stats'].get_total(), ActivityStats())
    # there are no overlays ATM, so behav would be gone since there is no 2.0.0 for it!
    target_files.remove('./sub-1/beh/responses.tsv')
    # Ben: metadata object files may differ in their names containing some checksum-ish shit ...
    # TODO: Check how those names are constructed and maybe at least count the number of
    #       created object files in addition to that comparison
    eq_(set([f for f in all_files_updated
             if not f.startswith('./.datalad/metadata/objects/')]),
        target_files)

    # new instance so it re-reads git stuff etc
    # repo = AnnexRepo(outd, create=False)  # to be used in the checks
    commits_hexsha_ = {b: list(_get_branch_commits(repo, b)) for b in branches}
    commits_l_ = {b: list(_get_branch_commits(repo, b, limit='left-only'))
                  for b in branches}
    assert_not_equal(commits_hexsha, commits_hexsha_)
    eq_(out[0]['datalad_stats'], ActivityStats())  # commit happened so stats were consumed
    # numbers seem to be right
    total_stats = out[0]['datalad_stats'].get_total()
    # but for some reason downloaded_size fluctuates.... why? probably archiving...?
    total_stats.downloaded_size = 0
    eq_(total_stats,
        ActivityStats(
            files=8, skipped=5, downloaded=1, renamed=1, urls=6,
            add_annex=2,  # add_git=1,  # README
            versions=['2.0.0'],
            merges=[['incoming', 'incoming-processed']]))

    check_dropall_get(repo)

    # Let's see if pipeline would remove files we stopped tracking
    remove_from_index(index_html, '<a href=.ds666_R1.0.0[^<]*</a>')
    with chpwd(outd):
        with swallow_logs(new_level=logging.WARNING) as cml:
            out = run_pipeline(pipeline)
            # since files get removed in incoming, but reprocessed completely
            # in incoming-processed and merged into master -- new commits will come.
            # They shouldn't have any difference but still should be new commits
            assert_in("There is already a tag 2.0.0 in the repository", cml.out)
    eq_(len(out), 1)
    incoming_files = repo.get_files('incoming')
    target_incoming_files.remove('ds666_R1.0.0.tar.gz')
    eq_(set(incoming_files), target_incoming_files)

    commits_hexsha_removed = {b: list(_get_branch_commits(repo, b))
                              for b in branches}
    # our 'statuses' database should have recorded the change thus got a diff
    # which propagated through all branches
    for b in 'master', 'incoming-processed':
        # with non persistent DB we had no changes
        # eq_(repo.repo.branches[b].commit.diff(commits_hexsha_[b][0]), [])
        assert_in(repo.pathobj / '.datalad/crawl/statuses/incoming.json',
                  repo.diff(b, commits_hexsha_[b][0]))
    dincoming = repo.diff('incoming', commits_hexsha_['incoming'][0])
    eq_(len(dincoming), 2)  # 2 diff objects -- 1 file removed, 1 statuses updated
    eq_(set(dincoming.keys()),
        {repo.pathobj / '.datalad/crawl/statuses/incoming.json',
         repo.pathobj / 'ds666_R1.0.0.tar.gz'})

    eq_(out[0]['datalad_stats'].get_total().removed, 1)
    assert_not_equal(commits_hexsha_, commits_hexsha_removed)

    # we will check if a clone would be crawling just as good
    from datalad.api import crawl

    # make a brand new clone
    GitRepo.clone(outd, clonedir)

    def _pipeline(*args, **kwargs):
        """Helper to mock openfmri.pipeline invocation so it looks at our 'server'"""
        kwargs = updated(kwargs, {'topurl': topurl, 'versioned_urls': False})
        return ofpipeline(*args, **kwargs)

    with chpwd(clonedir), patch.object(openfmri, 'pipeline', _pipeline):
        output, stats = crawl()  # we should be able to recrawl without doing anything
        ok_(stats, ActivityStats(files=6, skipped=6, urls=5))
def test_openfmri_pipeline1(ind, topurl, outd, clonedir):
    index_html = opj(ind, 'ds666', 'index.html')

    list(initiate_dataset(
        template="openfmri",
        dataset_name='dataladtest-ds666',
        path=outd,
        data_fields=['dataset'])({'dataset': 'ds666'}))

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    t1w_fpath_nover = opj(outd, 'sub1', 'anat', 'sub-1_T1w.dat')
    ok_file_has_content(t1w_fpath_nover, "mighty load in old format")

    #
    # And now versioned files were specified!
    #
    add_to_index(index_html, content=_versioned_files)

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    ok_(not exists(t1w_fpath_nover),
        "%s file should no longer be there if unversioned files get removed correctly"
        % t1w_fpath_nover)
    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    # actually the tree should look quite neat with 1.0.0 tag having 1 parent in incoming
    # and 1.0.1 having 1.0.0 and the 2nd commit in incoming as parents
    commits = {b: list(repo.get_branch_commits(b)) for b in branches}
    commits_hexsha = {b: list(repo.get_branch_commits(b, value='hexsha'))
                      for b in branches}
    commits_l = {b: list(repo.get_branch_commits(b, limit='left-only'))
                 for b in branches}
    eq_(len(commits['incoming']), 3)
    eq_(len(commits_l['incoming']), 3)
    eq_(len(commits['incoming-processed']), 6)
    eq_(len(commits_l['incoming-processed']), 4)  # because original merge has only 1 parent - incoming
    eq_(len(commits['master']), 12)  # all commits out there -- dataset init, crawler init + 3*(incoming, processed, meta data aggregation, merge)
    eq_(len(commits_l['master']), 6)

    # Check tags for the versions
    eq_(out[0]['datalad_stats'].get_total().versions, ['1.0.0', '1.0.1'])
    # +1 because original "release" was assumed to be 1.0.0
    eq_([x.name for x in repo.repo.tags], ['1.0.0', '1.0.0+1', '1.0.1'])
    eq_(repo.repo.tags[0].commit.hexsha, commits_l['master'][-4].hexsha)  # next to the last one
    eq_(repo.repo.tags[-1].commit.hexsha, commits_l['master'][0].hexsha)  # the last one

    def hexsha(l):
        return l.__class__(x.hexsha for x in l)

    # Verify that we have desired tree of merges
    eq_(hexsha(commits_l['incoming-processed'][0].parents),
        (commits_l['incoming-processed'][1].hexsha,
         commits_l['incoming'][0].hexsha))
    eq_(hexsha(commits_l['incoming-processed'][2].parents),
        (commits_l['incoming'][2].hexsha,))
    eq_(hexsha(commits_l['master'][0].parents),
        (commits_l['master'][1].hexsha,
         commits_l['incoming-processed'][0].hexsha))
    eq_(hexsha(commits_l['master'][1].parents),
        (commits_l['master'][2].hexsha,
         commits_l['incoming-processed'][1].hexsha))

    with chpwd(outd):
        eq_(set(glob('*')), {'changelog.txt', 'sub-1'})
        all_files = sorted(find_files('.'))

    t1w_fpath = opj(outd, 'sub-1', 'anat', 'sub-1_T1w.dat')
    ok_file_has_content(t1w_fpath, "mighty load 1.0.1")
    ok_file_under_git(opj(outd, 'changelog.txt'), annexed=False)
    ok_file_under_git(t1w_fpath, annexed=True)

    target_files = {
        './.datalad/config', './.datalad/meta/meta.json',
        './.datalad/crawl/crawl.cfg',
        # no more!
        # './.datalad/config.ttl', './.datalad/datalad.ttl',
        './.datalad/crawl/statuses/incoming.json',
        './.datalad/crawl/versions/incoming.json',
        './changelog.txt', './sub-1/anat/sub-1_T1w.dat', './sub-1/beh/responses.tsv'}
    target_incoming_files = {
        '.gitattributes',  # we marked default backend right in the incoming
        'changelog.txt',
        'ds666.tar.gz',
        'ds666-beh_R1.0.1.tar.gz',
        'ds666_R1.0.0.tar.gz',
        'ds666_R1.0.1.tar.gz',
        'ds666_R2.0.0.tar.gz',
        '.datalad/crawl/statuses/incoming.json',
        '.datalad/crawl/versions/incoming.json'
    }
    eq_(set(all_files), target_files)

    # check that -beh was committed in 2nd commit in incoming, not the first one
    assert_not_in('ds666-beh_R1.0.1.tar.gz',
                  repo.get_files(commits_l['incoming'][-1]))
    assert_in('ds666-beh_R1.0.1.tar.gz',
              repo.get_files(commits_l['incoming'][0]))

    # rerun pipeline -- make sure we are on the same in all branches!
    with chpwd(outd):
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    commits_hexsha_ = {b: list(repo.get_branch_commits(b, value='hexsha'))
                       for b in branches}
    eq_(commits_hexsha, commits_hexsha_)  # i.e. nothing new
    # actually we do manage to add_git 1 (README) since it is generated and
    # committed directly to git
    # BUT now fixed -- if not committed (was the same), should be marked as skipped
    # Nothing was committed so stats leaked all the way up
    eq_(out[0]['datalad_stats'], ActivityStats(files=5, skipped=5, urls=5))
    eq_(out[0]['datalad_stats'], out[0]['datalad_stats'].get_total())

    # rerun pipeline when new content is available
    # add new revision, rerun pipeline and check that stuff was processed/added correctly
    add_to_index(
        index_html,
        content='<a href="ds666_R2.0.0.tar.gz">Raw data on AWS version 2.0.0</a>')

    with chpwd(outd):
        out = run_pipeline(pipeline)
        all_files_updated = sorted(find_files('.'))
    eq_(len(out), 1)
    assert_not_equal(out[0]['datalad_stats'].get_total(), ActivityStats())
    # there are no overlays ATM, so behav would be gone since there is no 2.0.0 for it!
    target_files.remove('./sub-1/beh/responses.tsv')
    eq_(set(all_files_updated), target_files)

    # new instance so it re-reads git stuff etc
    # repo = AnnexRepo(outd, create=False)  # to be used in the checks
    commits_ = {b: list(repo.get_branch_commits(b)) for b in branches}
    commits_hexsha_ = {b: list(repo.get_branch_commits(b, value='hexsha'))
                       for b in branches}
    commits_l_ = {b: list(repo.get_branch_commits(b, limit='left-only'))
                  for b in branches}
    assert_not_equal(commits_hexsha, commits_hexsha_)
    eq_(out[0]['datalad_stats'], ActivityStats())  # commit happened so stats were consumed
    # numbers seem to be right
    total_stats = out[0]['datalad_stats'].get_total()
    # but for some reason downloaded_size fluctuates.... why? probably archiving...?
    total_stats.downloaded_size = 0
    eq_(total_stats,
        ActivityStats(
            files=8, skipped=5, downloaded=1, renamed=1, urls=6,
            add_annex=2,  # add_git=1,  # README
            versions=['2.0.0'],
            merges=[['incoming', 'incoming-processed']]))

    check_dropall_get(repo)

    # Let's see if pipeline would remove files we stopped tracking
    remove_from_index(index_html, '<a href=.ds666_R1.0.0[^<]*</a>')
    with chpwd(outd):
        with swallow_logs(new_level=logging.WARNING) as cml:
            out = run_pipeline(pipeline)
            # since files get removed in incoming, but reprocessed completely
            # in incoming-processed and merged into master -- new commits will come.
            # They shouldn't have any difference but still should be new commits
            assert_in("There is already a tag 2.0.0 in the repository", cml.out)
    eq_(len(out), 1)
    incoming_files = repo.get_files('incoming')
    target_incoming_files.remove('ds666_R1.0.0.tar.gz')
    eq_(set(incoming_files), target_incoming_files)

    commits_hexsha_removed = {b: list(repo.get_branch_commits(b, value='hexsha'))
                              for b in branches}
    # our 'statuses' database should have recorded the change thus got a diff
    # which propagated through all branches
    for b in 'master', 'incoming-processed':
        # with non persistent DB we had no changes
        # eq_(repo.repo.branches[b].commit.diff(commits_hexsha_[b][0]), [])
        eq_(repo.repo.branches[b].commit.diff(commits_hexsha_[b][0])[0].a_path,
            '.datalad/crawl/statuses/incoming.json')
    dincoming = repo.repo.branches['incoming'].commit.diff(
        commits_hexsha_['incoming'][0])
    eq_(len(dincoming), 2)  # 2 diff objects -- 1 file removed, 1 statuses updated
    eq_(set([d.a_path for d in dincoming]),
        {'.datalad/crawl/statuses/incoming.json',
         'ds666_R1.0.0.tar.gz'})
    # since it seems to diff "from current to the specified", it will be listed as new_file
    assert any(d.new_file for d in dincoming)

    eq_(out[0]['datalad_stats'].get_total().removed, 1)
    assert_not_equal(commits_hexsha_, commits_hexsha_removed)

    # we will check if a clone would be crawling just as good
    from datalad.api import crawl

    # make a brand new clone
    GitRepo.clone(outd, clonedir)

    def _pipeline(*args, **kwargs):
        """Helper to mock openfmri.pipeline invocation so it looks at our 'server'"""
        kwargs = updated(kwargs, {'topurl': topurl, 'versioned_urls': False})
        return ofpipeline(*args, **kwargs)

    with chpwd(clonedir), patch.object(openfmri, 'pipeline', _pipeline):
        output, stats = crawl()  # we should be able to recrawl without doing anything
        ok_(stats, ActivityStats(files=6, skipped=6, urls=5))
def __call__(path=None, is_pipeline=False, is_template=False,
             recursive=False, chdir=None):  # dry_run=False,
    dry_run = False
    from datalad_crawler.pipeline import (
        load_pipeline_from_config, load_pipeline_from_module,
        get_repo_pipeline_config_path, get_repo_pipeline_script_path
    )
    from datalad_crawler.pipeline import run_pipeline
    from datalad.utils import chpwd  # import late so we could mock during tests

    with chpwd(chdir):

        assert not (is_pipeline and is_template), \
            "it is either a pipeline or a template name, can't be both"

        if is_template:
            # generate a config and overload path with its filename
            path = initiate_pipeline_config(template=path,  # kwargs=TODO,
                                            commit=True)

        # TODO: centralize via _params_ handling
        if dry_run:
            dryrun_optlabel = 'datalad.crawl.dryrun'
            if dryrun_optlabel in cfg:
                cfg.unset(dryrun_optlabel, where='local', reload=False)
            cfg.add(dryrun_optlabel, "True", where='local')

        if path is None:

            # get config from the current repository/dataset
            if is_pipeline:
                raise ValueError("You must specify the file if --pipeline")

            # Let's see if there is a config or pipeline in this repo
            path = get_repo_pipeline_config_path()
            if not path or not exists(path):
                # Check if there may be the pipeline provided
                path = get_repo_pipeline_script_path()
                if path and exists(path):
                    is_pipeline = True

        stats = ActivityStats()

        if not path:
            raise RuntimeError("Cannot locate crawler config or pipeline file")

        if is_pipeline:
            lgr.info("Loading pipeline definition from %s" % path)
            pipeline = load_pipeline_from_module(path)
        else:
            lgr.info("Loading pipeline specification from %s" % path)
            pipeline = load_pipeline_from_config(path)

        lgr.info("Running pipeline %s" % str(pipeline))
        # TODO: capture the state of all branches so in case of crash
        # we could gracefully reset back
        try:
            output = run_pipeline(pipeline, stats=stats)
        except Exception as exc:
            # TODO: config.crawl.failure = full-reset | last-good-master
            # probably ask via ui which action should be performed unless
            # explicitly specified
            raise
        stats.datasets_crawled += 1

        # TODO: Move gc/clean over here!

        stats_total = stats.get_total()

        if recursive:
            # get all subdatasets, and crawl them too!
            ## ? assert path_orig is None, "Otherwise not sure what to do with path=%r in subdatasets" % path
            import os
            from datalad.distribution.dataset import Dataset
            from datalad.api import crawl
            from datalad.utils import swallow_logs
            from datalad.dochelpers import exc_str
            # Note: we could collect all datasets to be crawled here or pass recursive=True
            # into the subdatasets' crawl.  We will collect all of them here so we might later
            # also introduce automatic commits when super-dataset got successfully updated
            subdatasets = Dataset(os.curdir).subdatasets(recursive=recursive,
                                                         result_xfm='relpaths')

            lgr.info("Crawling %d subdatasets", len(subdatasets))
            output = [output]
            # TODO: parallelize
            # TODO: assumes that all sub-datasets are 'crawllable', and if not
            # just adds them to crawl_failed count.  But may be we should make it more
            # explicit, that some sub-datasets might not need to be crawled, so they get
            # skipped explicitly?
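            # Summary of the loop below: each subdataset is crawled via a nested
            # crawl(chdir=...) call with its log output swallowed into a per-dataset
            # logfile; a failure only bumps datasets_crawl_failed and records None in
            # the output, it does not abort the remaining subdatasets.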
            for ds_ in subdatasets:
                ds_logfile = utils.get_logfilename(ds_, 'crawl')
                try:
                    # TODO: might be cool to be able to report a 'heart beat'
                    # from the swallow into pbar or smth
                    with swallow_logs(file_=ds_logfile) as cml:
                        output_, stats_ = crawl(chdir=ds_)
                        stats_total += stats_
                        output.append(output_)
                    lgr.info("Crawled %s: %s (log: %s)",
                             ds_, stats_.as_str(mode='line'), ds_logfile)
                except Exception as exc:
                    stats_total.datasets_crawl_failed += 1
                    stats_total.datasets_crawled += 1
                    output += [None]
                    lgr.warning("Crawling of %s has failed (more in %s): %s.",  # Log output: %s",
                                ds_, ds_logfile, exc_str(exc))  # , cml.out)

        lgr.info("Total stats: %s", stats_total.as_str(mode='line'))

        return output, stats_total
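
# For context, a minimal usage sketch of this entry point as the tests above exercise
# it (the working directory, url, and match pattern below are placeholders, not values
# taken from this codebase):
#
#     from datalad.api import crawl, crawl_init
#     from datalad.utils import chpwd
#
#     with chpwd('/path/to/existing/dataset'):
#         crawl_init({'url': 'http://example.com/data/', 'a_href_match_': '.*'},
#                    save=True, template='simple_with_archives')
#         output, stats = crawl()   # crawl this dataset
#         # crawl(recursive=True) would also crawl all registered subdatasets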