Example #1
def test_pipeline_looping():
    count = [0, 0]

    def count_threetimes(data):
        """helper to not yield anything if done it 3 times by now"""
        if count[0] >= 3:
            return
        count[0] += 1
        for i in range(count[0]):
            yield updated(data, dict(somevar=(i, count[0])))

    def add_count(data):
        count[1] += 1
        yield updated(data, {'count': count[0]})

    def passthrough(data):
        yield data

    pipeline_output = run_pipeline([{
        'loop': True
    }, count_threetimes], dict(x=0))
    eq_(pipeline_output, _out([{'x': 0}]))
    eq_(count, [3, 0])

    # and even if the node that stops yielding is not the first node
    pipeline_output = run_pipeline([{
        'loop': True
    }, passthrough, count_threetimes], dict(x=0))
    eq_(pipeline_output, _out([{'x': 0}]))
    eq_(count, [3, 0])

    count[0] = 0
    # Let's rerun with an explicit 'last-output', which also affects the output of this pipeline
    pipeline_output = run_pipeline([{
        'loop': True,
        'output': 'last-output'
    }, count_threetimes], dict(x=0))
    eq_(pipeline_output, _out([{'x': 0, 'somevar': (2, 3)}]))
    eq_(count, [3, 0])

    # and with a composite pipeline, i.e. more than a single step, to make sure everything is called
    count[0] = 0
    pipeline_output = run_pipeline([{
        'loop': True
    }, count_threetimes, add_count], dict(x=0))
    eq_(pipeline_output, _out([{'x': 0}]))
    eq_(count, [3, 6])

    count[0] = count[1] = 0
    # Let's rerun with an explicit 'last-output', which also affects the output of this pipeline
    pipeline_output = run_pipeline([{
        'loop': True,
        'output': 'last-output'
    }, count_threetimes, add_count], dict(x=0))
    eq_(pipeline_output, _out([{'x': 0, 'somevar': (2, 3), 'count': 3}]))
    eq_(count, [3, 6])
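
The assertions above lean on `updated()` from `datalad.utils`; as used here it behaves like a non-mutating dict merge. A minimal sketch of that behaviour (not the library's actual code):

# Sketch of the dict-merge behaviour the test relies on; the real helper
# is datalad.utils.updated.
def updated(d, update):
    """Return a copy of dict `d` with `update` merged in, leaving `d` unchanged."""
    d = d.copy()
    d.update(update)
    return d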
Example #2
def _test_dataset(dataset, error, create, skip, tmpdir):

    with chpwd(tmpdir):

        if create:
            with open("README.txt", 'w') as f:
                f.write(" ")

        pipe = [
            crawl_url(TOPURL),
            [
                assign({'dataset': dataset}),
                skip_if(
                    {
                        'dataset':
                        'Cleveland CCF|Durham_Madden|NewYork_Test-Retest_Reliability'
                    },
                    re=True),
                sub({
                    'response': {
                        '<div class="tableParam">([^<]*)</div>': r'\1'
                    }
                }),
                find_dataset(dataset),
                extract_readme,
            ]
        ]

        if error:
            assert_raises((InvalidURL, RuntimeError), run_pipeline, pipe)
            return

        try:
            run_pipeline(pipe)
        except InvalidURL as exc:
            raise SkipTest(
                "This version of requests considers %s to be invalid.  "
                "See https://github.com/kennethreitz/requests/issues/3683#issuecomment-261947670 : %s"
                % (TOPURL, exc_str(exc)))

        if skip:
            assert_false(exists("README.txt"))
            return
        assert_true(exists("README.txt"))

        f = open("README.txt", 'r')
        contents = f.read()
        assert_true("Author(s)" and "Details" in contents)
Example #3
def test_pipeline_linear_top_isnested_pipeline():
    # check that even if no generated data reaches the end node, it still gets executed
    was_called = []
    pipeline = [
        # range_node(1),
        [
            range_node(1, "out2"),
        ],
        lambda d: was_called.append('yes')
    ]
    pipeline_output = run_pipeline(pipeline)
    eq_(was_called, ['yes'])
Example #4
def test_pipeline_dropped_stats():
    def n1(data):
        data['datalad_stats'].increment('add_git')
        yield data

    def n2(data):  # doesn't care to maintain previous stats
        yield {'out': 1}

    pipeline_output = run_pipeline([{'output': 'outputs'}, n1, n2])
    eq_(pipeline_output, [{
        'datalad_stats': ActivityStats(add_git=1),
        'out': 1
    }])
Example #5
def test_pipeline_linear_simple():
    sink = Sink()
    pipeline = [range_node(2, "out1"), range_node(3, "out2"), sink]
    pipeline_output = run_pipeline(pipeline)
    eq_(
        pipeline_output, DEFAULT_OUTPUT
    )  # by default the pipeline's output is its input, and the input is made an empty dict if not provided
    eq_(sink.data, [{
        'out1': 0,
        'out2': 0
    }, {
        'out1': 0,
        'out2': 1
    }, {
        'out1': 0,
        'out2': 2
    }, {
        'out1': 1,
        'out2': 0
    }, {
        'out1': 1,
        'out2': 1
    }, {
        'out1': 1,
        'out2': 2
    }])

    # if we extend the pipeline with a matching interrupt_if, the entire pipeline
    # should stop at that matching point, but otherwise there should be no crash etc
    sink.clean()
    pipeline_output = run_pipeline(pipeline +
                                   [interrupt_if({
                                       'out1': 0,
                                       'out2': 1
                                   })])
    eq_(pipeline_output, DEFAULT_OUTPUT)
    eq_(sink.data, [{'out1': 0, 'out2': 0}, {'out1': 0, 'out2': 1}])
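
`Sink` and `range_node` are small test helpers whose definitions are not shown in this snippet; the expected `sink.data` contents are consistent with something along these lines (an assumed sketch, not the project's helpers):

# Assumed shape of the test helpers used above.
class Sink(object):
    """Collect every data item that flows through this node."""
    def __init__(self):
        self.data = []

    def clean(self):
        self.data = []

    def __call__(self, data):
        self.data.append(data)
        yield data


def range_node(n, output='output'):
    """Return a node yielding `n` copies of the data dict with `output` set to 0..n-1."""
    def node(data):
        for i in range(n):
            yield dict(data, **{output: i})
    return node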
Example #6
def test_pipeline_recursive():
    def less3(data):
        """a little helper which would not yield whenever input x>3"""
        if data['x'] < 3:
            yield updated(data, dict(x=data['x'] + 1))

    pipeline = [
        {
            'loop': True,
            'output': 'outputs'
        },
        less3,
    ]
    pipeline_output = run_pipeline(pipeline, dict(x=0))
    eq_(pipeline_output, _out([{'x': 1}, {'x': 2}, {'x': 3}]))
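
With `loop: True` the pipeline feeds what it produces back in as new input until a node yields nothing, and `output: 'outputs'` keeps every intermediate result. A plain-Python approximation of that feedback loop for a single-node pipeline (a sketch of the semantics, not the crawler's implementation):

# Approximate semantics of [{'loop': True, 'output': 'outputs'}, node]
def loop_outputs(node, data):
    outputs = []
    pending = [data]
    while pending:
        produced = list(node(pending.pop(0)))  # a node may yield zero or more items
        outputs.extend(produced)
        pending.extend(produced)               # feed produced items back in as input
    return outputs

# loop_outputs(less3, dict(x=0)) yields the same values the test wraps with
# _out(): [{'x': 1}, {'x': 2}, {'x': 3}]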
Example #7
def test_pipeline_updated_stats():
    def n1(data):
        data['datalad_stats'].increment('add_git')
        yield data

    def n2(data):  # doesn't care to maintain previous stats
        data = data.copy()
        data['datalad_stats'] = ActivityStats(files=2)
        data['out'] = 1
        yield data

    pipeline_output = run_pipeline([{'output': 'outputs'}, n1, n2])
    eq_(pipeline_output, [{
        'datalad_stats': ActivityStats(files=2, add_git=1),
        'out': 1
    }])
Example #8
def test_pipeline_linear_nested_order():
    sink = Sink()
    sink2 = Sink()
    assert_order = AssertOrder()

    pipeline = [
        assert_order(1),
        range_node(2, "out1"),
        assert_order({2, 5}),
        [
            assert_order({3, 6}),
            range_node(3, "out2"),
            sink,
        ],
        assert_order({4, 7}), sink2
    ]
    pipeline_output = run_pipeline(pipeline)
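
`AssertOrder` is another test helper not defined in this snippet; judging by its use it hands out nodes that assert the position at which they are invoked, roughly like this (an assumption, not the project's code):

# Assumed shape of the AssertOrder helper used above.
class AssertOrder(object):
    """Produce nodes that assert the global call position at which they are invoked."""
    def __init__(self):
        self._ncalls = 0

    def __call__(self, expected):
        expected = expected if isinstance(expected, set) else {expected}

        def node(data):
            self._ncalls += 1
            assert self._ncalls in expected, \
                "invoked as call #%d, expected one of %s" % (self._ncalls, expected)
            yield data
        return node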
Example #9
def assert_pipeline(pipeline):
    eq_(run_pipeline(pipeline), [{'datalad_stats': target_stats}])
Example #10
def test_pipeline_linear_nested():
    sink = Sink()
    sink2 = Sink()
    assert_order = AssertOrder()

    pipeline = [range_node(2, "out1"), [
        range_node(3, "out2"),
        sink,
    ], sink2]
    all_pairs = [{
        'out1': 0,
        'out2': 0
    }, {
        'out1': 0,
        'out2': 1
    }, {
        'out1': 0,
        'out2': 2
    }, {
        'out1': 1,
        'out2': 0
    }, {
        'out1': 1,
        'out2': 1
    }, {
        'out1': 1,
        'out2': 2
    }]
    pipeline_output = run_pipeline(pipeline)
    eq_(pipeline_output, DEFAULT_OUTPUT)
    eq_(sink.data, all_pairs)
    # and output is not seen outside of the nested pipeline
    eq_(sink2.data, [{'out1': 0}, {'out1': 1}])

    # Let's make nested pipeline yield all
    sink.clean()
    sink2.clean()
    pipeline[1].insert(0, {'output': 'outputs'})

    pipeline_output = run_pipeline(pipeline)
    eq_(pipeline_output, DEFAULT_OUTPUT)  # by default no output produced
    eq_(sink.data, all_pairs)
    # and output was passed outside from the nested pipeline
    eq_(sink2.data, all_pairs)

    # Let's make it yield the last-output one
    sink2.clean()
    pipeline[1][0] = {'output': 'last-output'}
    pipeline_output = run_pipeline(pipeline)
    eq_(pipeline_output, DEFAULT_OUTPUT)  # by default no output produced
    # only the last output from the nested pipeline appeared outside
    eq_(sink2.data, [{'out1': 0, 'out2': 2}, {'out1': 1, 'out2': 2}])

    # Let's now add output to the top-most pipeline
    pipeline.insert(0, {'output': 'outputs'})
    pipeline_output = run_pipeline(pipeline)
    eq_(pipeline_output, _out([{
        'out1': 0,
        'out2': 2
    }, {
        'out1': 1,
        'out2': 2
    }]))

    # and if we ask only for the last one
    pipeline[0] = {'output': 'last-output'}
    pipeline_output = run_pipeline(pipeline)
    eq_(pipeline_output, _out([{'out1': 1, 'out2': 2}]))
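
Taken together, the runs above exercise the three 'output' settings of a (sub-)pipeline: the default (described in these tests' comments as 'input') passes the original input downstream, 'outputs' passes every produced data item, and 'last-output' only the final one. In option-dict form, as used in these tests:

# 'output' values exercised above and what they let flow downstream
opts_default = {'output': 'input'}        # the original input only (the default)
opts_all     = {'output': 'outputs'}      # every data item the sub-pipeline produced
opts_last    = {'output': 'last-output'}  # only the last produced data item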
Example #11
    def __call__(path=None, is_pipeline=False, is_template=False,
                 recursive=False, chdir=None):  # dry_run=False,
        dry_run = False

        from datalad_crawler.pipeline import (
            load_pipeline_from_config, load_pipeline_from_module,
            get_repo_pipeline_config_path, get_repo_pipeline_script_path
        )
        from datalad_crawler.pipeline import run_pipeline
        from datalad.utils import chpwd  # import late so we could mock during tests

        with chpwd(chdir):

            assert not (is_pipeline and is_template), "it is either a pipeline or a template name, can't be both"
            if is_template:
                # generate a config and overload path with its filename
                path = initiate_pipeline_config(template=path,  # kwargs=TODO,
                                                commit=True)

            # TODO: centralize via _params_ handling
            if dry_run:
                dryrun_optlabel = 'datalad.crawl.dryrun'
                if dryrun_optlabel in cfg:
                    cfg.unset(dryrun_optlabel, where='local', reload=False)
                cfg.add(dryrun_optlabel, "True", where='local')

            if path is None:

                # get config from the current repository/dataset
                if is_pipeline:
                    raise ValueError("You must specify the file if --pipeline")

                # Let's see if there is a config or pipeline in this repo
                path = get_repo_pipeline_config_path()
                if not path or not exists(path):
                    # Check if there might be a pipeline script provided
                    path = get_repo_pipeline_script_path()
                    if path and exists(path):
                        is_pipeline = True

            stats = ActivityStats()

            if not path:
                raise RuntimeError("Cannot locate crawler config or pipeline file")

            if is_pipeline:
                lgr.info("Loading pipeline definition from %s" % path)
                pipeline = load_pipeline_from_module(path)
            else:
                lgr.info("Loading pipeline specification from %s" % path)
                pipeline = load_pipeline_from_config(path)

            lgr.info("Running pipeline %s" % str(pipeline))
            # TODO: capture the state of all branches so in case of crash
            # we could gracefully reset back
            try:
                output = run_pipeline(pipeline, stats=stats)
            except Exception as exc:
                # TODO: config.crawl.failure = full-reset | last-good-master
                # probably ask via ui which action should be performed unless
                # explicitly specified
                raise
            stats.datasets_crawled += 1

            # TODO:  Move gc/clean over here!

            stats_total = stats.get_total()

            if recursive:
                # get all subdatasets, and crawl them too!
                ## ? assert path_orig is None, "Otherwise not sure what to do with path=%r in subdatasets" % path
                import os
                from datalad.distribution.dataset import Dataset
                from datalad.api import crawl
                from datalad.utils import swallow_logs
                from datalad.dochelpers import exc_str
                # Note: we could collect all datasets to be crawled here or pass recursive=True
                # into the subdatasets' crawl.  We collect all of them here so that we might later
                # also introduce automatic commits when the super-dataset gets successfully updated
                subdatasets = Dataset(os.curdir).subdatasets(recursive=recursive, result_xfm='relpaths')

                lgr.info("Crawling %d subdatasets", len(subdatasets))
                output = [output]
                # TODO: parallelize
                # TODO: assumes that all sub-datasets are 'crawlable', and if not,
                # just adds them to the crawl_failed count.  But maybe we should make it
                # more explicit that some sub-datasets might not need to be crawled, so
                # they get skipped explicitly?
                for ds_ in subdatasets:
                    ds_logfile = utils.get_logfilename(ds_, 'crawl')
                    try:
                        # TODO: might be cool to be able to report a 'heartbeat' from the swallowed logs into a progress bar or something
                        with swallow_logs(file_=ds_logfile) as cml:
                            output_, stats_ = crawl(chdir=ds_)
                            stats_total += stats_
                            output.append(output_)
                        lgr.info("Crawled %s: %s (log: %s)", ds_, stats_.as_str(mode='line'), ds_logfile)
                    except Exception as exc:
                        stats_total.datasets_crawl_failed += 1
                        stats_total.datasets_crawled += 1
                        output += [None]
                        lgr.warning("Crawling of %s has failed (more in %s): %s.",  # Log output: %s",
                                    ds_, ds_logfile, exc_str(exc))  # , cml.out)

            lgr.info("Total stats: %s", stats_total.as_str(mode='line'))

            return output, stats_total
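
Stripped of the command-level plumbing, the core of the call above reduces to loading a pipeline definition and running it with a stats accumulator. A simplified sketch (the config path and the import location of ActivityStats are assumptions):

# Minimal load-and-run sequence, mirroring the calls used in __call__ above.
from datalad_crawler.pipeline import load_pipeline_from_config, run_pipeline
from datalad.support.stats import ActivityStats  # assumed import location

def crawl_once(config_path):
    stats = ActivityStats()
    pipeline = load_pipeline_from_config(config_path)
    output = run_pipeline(pipeline, stats=stats)
    return output, stats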