def test_pipeline_looping():
    count = [0, 0]

    def count_threetimes(data):
        """Helper which stops yielding anything after being run 3 times"""
        if count[0] >= 3:
            return
        count[0] += 1
        for i in range(count[0]):
            yield updated(data, dict(somevar=(i, count[0])))

    def add_count(data):
        count[1] += 1
        yield updated(data, {'count': count[0]})

    def passthrough(data):
        yield data

    pipeline_output = run_pipeline([{'loop': True}, count_threetimes], dict(x=0))
    eq_(pipeline_output, _out([{'x': 0}]))
    eq_(count, [3, 0])

    # and even if the node which stops yielding is not the first node
    pipeline_output = run_pipeline(
        [{'loop': True}, passthrough, count_threetimes], dict(x=0))
    eq_(pipeline_output, _out([{'x': 0}]))
    eq_(count, [3, 0])
    count[0] = 0

    # Let's rerun with explicit last-output, which would also affect output of this pipeline
    pipeline_output = run_pipeline(
        [{'loop': True, 'output': 'last-output'}, count_threetimes], dict(x=0))
    eq_(pipeline_output, _out([{'x': 0, 'somevar': (2, 3)}]))
    eq_(count, [3, 0])

    # and if the pipeline is composite, i.e. has more than a single step,
    # so we could make sure everything is called
    count[0] = 0
    pipeline_output = run_pipeline(
        [{'loop': True}, count_threetimes, add_count], dict(x=0))
    eq_(pipeline_output, _out([{'x': 0}]))
    eq_(count, [3, 6])
    count[0] = count[1] = 0

    # Let's rerun with explicit last-output, which would also affect output of this pipeline
    pipeline_output = run_pipeline(
        [{'loop': True, 'output': 'last-output'}, count_threetimes, add_count],
        dict(x=0))
    eq_(pipeline_output, _out([{'x': 0, 'somevar': (2, 3), 'count': 3}]))
    eq_(count, [3, 6])

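# A minimal sketch of what the `updated` helper used by the nodes above is
# assumed to do (in DataLad it comes from datalad.utils): return a shallow copy
# of the data record with extra fields applied, leaving the original dict
# untouched.  `_updated_sketch` is a hypothetical name used here only for
# illustration; it is not the library's implementation.
def _updated_sketch(d, update):
    d = d.copy()       # do not mutate the record flowing through the pipeline
    d.update(update)   # overlay the new/changed fields
    return d
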
def _test_dataset(dataset, error, create, skip, tmpdir):
    with chpwd(tmpdir):
        if create:
            with open("README.txt", 'w') as f:
                f.write(" ")
        pipe = [
            crawl_url(TOPURL),
            [
                assign({'dataset': dataset}),
                skip_if(
                    {'dataset': 'Cleveland CCF|Durham_Madden|NewYork_Test-Retest_Reliability'},
                    re=True),
                sub({
                    'response': {
                        '<div class="tableParam">([^<]*)</div>': r'\1'
                    }
                }),
                find_dataset(dataset),
                extract_readme,
            ]
        ]
        if error:
            assert_raises((InvalidURL, RuntimeError), run_pipeline, pipe)
            return
        try:
            run_pipeline(pipe)
        except InvalidURL as exc:
            raise SkipTest(
                "This version of requests considers %s to be invalid. "
                "See https://github.com/kennethreitz/requests/issues/3683#issuecomment-261947670 : %s"
                % (TOPURL, exc_str(exc)))
        if skip:
            assert_false(exists("README.txt"))
            return
        assert_true(exists("README.txt"))
        with open("README.txt", 'r') as f:
            contents = f.read()
        # check both substrings explicitly ('"a" and "b" in s' would only test "b")
        assert_true("Author(s)" in contents and "Details" in contents)

def test_pipeline_linear_top_isnested_pipeline():
    # check that even if no generated data reaches the end node, that node
    # still gets executed
    was_called = []
    pipeline = [
        # range_node(1),
        [
            range_node(1, "out2"),
        ],
        lambda d: was_called.append('yes')
    ]
    pipeline_output = run_pipeline(pipeline)
    eq_(was_called, ['yes'])

def test_pipeline_dropped_stats():
    def n1(data):
        data['datalad_stats'].increment('add_git')
        yield data

    def n2(data):
        # doesn't care to maintain previous stats
        yield {'out': 1}

    pipeline_output = run_pipeline([{'output': 'outputs'}, n1, n2])
    eq_(pipeline_output,
        [{'datalad_stats': ActivityStats(add_git=1), 'out': 1}])

def test_pipeline_linear_simple():
    sink = Sink()
    pipeline = [range_node(2, "out1"), range_node(3, "out2"), sink]
    pipeline_output = run_pipeline(pipeline)
    # by default 'input' is output and input is made empty dict if not provided
    eq_(pipeline_output, DEFAULT_OUTPUT)
    eq_(sink.data,
        [{'out1': 0, 'out2': 0},
         {'out1': 0, 'out2': 1},
         {'out1': 0, 'out2': 2},
         {'out1': 1, 'out2': 0},
         {'out1': 1, 'out2': 1},
         {'out1': 1, 'out2': 2}])

    # if we extend pipeline with matching interrupt_if, the entire pipeline should
    # stop at that matching point, but otherwise there should be no crash etc
    sink.clean()
    pipeline_output = run_pipeline(
        pipeline + [interrupt_if({'out1': 0, 'out2': 1})])
    eq_(pipeline_output, DEFAULT_OUTPUT)
    eq_(sink.data, [{'out1': 0, 'out2': 0}, {'out1': 0, 'out2': 1}])

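# A rough sketch of the test helpers exercised above, assuming their usual
# semantics: `range_node(n, key)` is taken to be a node factory whose node
# yields one record per value in range(n) with that value stored under `key`,
# and `Sink` simply collects every record that flows through it.  The names
# `_range_node_sketch` and `_SinkSketch` are hypothetical, for illustration
# only; the real helpers live in the test utilities.
def _range_node_sketch(n, output='output'):
    def node(data):
        for i in range(n):
            d = dict(data)
            d[output] = i
            yield d
    return node


class _SinkSketch(object):
    def __init__(self):
        self.data = []

    def clean(self):
        self.data = []

    def __call__(self, data):
        self.data.append(data)  # record what flowed through
        yield data              # and pass it along unchanged
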
def test_pipeline_recursive():
    def less3(data):
        """A little helper which stops yielding once input x reaches 3"""
        if data['x'] < 3:
            yield updated(data, dict(x=data['x'] + 1))

    pipeline = [
        {'loop': True, 'output': 'outputs'},
        less3,
    ]
    pipeline_output = run_pipeline(pipeline, dict(x=0))
    eq_(pipeline_output, _out([{'x': 1}, {'x': 2}, {'x': 3}]))

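# An illustration (not the actual run_pipeline implementation) of what the
# {'loop': True} option is expected to do, judging from the tests above: the
# pipeline's outputs are fed back in as new inputs until a pass produces
# nothing, and with 'output': 'outputs' every intermediate record is reported.
# `_loop_sketch` is a hypothetical single-node driver for illustration only.
def _loop_sketch(node, data):
    outputs = []
    queue = [data]
    while queue:
        produced = []
        for d in queue:
            produced.extend(node(d))   # run one pass of the (single-node) pipeline
        outputs.extend(produced)
        queue = produced               # feed results back in; stop when empty
    return outputs

# e.g. _loop_sketch(less3, {'x': 0}) would give [{'x': 1}, {'x': 2}, {'x': 3}],
# matching the expectation in test_pipeline_recursive (modulo datalad_stats).
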
def test_pipeline_updated_stats():
    def n1(data):
        data['datalad_stats'].increment('add_git')
        yield data

    def n2(data):
        # doesn't care to maintain previous stats
        data = data.copy()
        data['datalad_stats'] = ActivityStats(files=2)
        data['out'] = 1
        yield data

    pipeline_output = run_pipeline([{'output': 'outputs'}, n1, n2])
    eq_(pipeline_output,
        [{'datalad_stats': ActivityStats(files=2, add_git=1), 'out': 1}])

def test_pipeline_linear_nested_order():
    sink = Sink()
    sink2 = Sink()
    assert_order = AssertOrder()
    pipeline = [
        assert_order(1),
        range_node(2, "out1"),
        assert_order({2, 5}),
        [
            assert_order({3, 6}),
            range_node(3, "out2"),
            sink,
        ],
        assert_order({4, 7}),
        sink2
    ]
    pipeline_output = run_pipeline(pipeline)

def assert_pipeline(pipeline):
    eq_(run_pipeline(pipeline), [{'datalad_stats': target_stats}])

def test_pipeline_linear_nested():
    sink = Sink()
    sink2 = Sink()
    assert_order = AssertOrder()
    pipeline = [range_node(2, "out1"),
                [
                    range_node(3, "out2"),
                    sink,
                ],
                sink2]
    all_pairs = [{'out1': 0, 'out2': 0},
                 {'out1': 0, 'out2': 1},
                 {'out1': 0, 'out2': 2},
                 {'out1': 1, 'out2': 0},
                 {'out1': 1, 'out2': 1},
                 {'out1': 1, 'out2': 2}]
    pipeline_output = run_pipeline(pipeline)
    eq_(pipeline_output, DEFAULT_OUTPUT)
    eq_(sink.data, all_pairs)
    # and output is not seen outside of the nested pipeline
    eq_(sink2.data, [{'out1': 0}, {'out1': 1}])

    # Let's make the nested pipeline yield all of its outputs
    sink.clean()
    sink2.clean()
    pipeline[1].insert(0, {'output': 'outputs'})
    pipeline_output = run_pipeline(pipeline)
    eq_(pipeline_output, DEFAULT_OUTPUT)  # by default no output is produced
    eq_(sink.data, all_pairs)
    # and output was passed outside from the nested pipeline
    eq_(sink2.data, all_pairs)

    # Let's make it yield only the last output
    sink2.clean()
    pipeline[1][0] = {'output': 'last-output'}
    pipeline_output = run_pipeline(pipeline)
    eq_(pipeline_output, DEFAULT_OUTPUT)  # by default no output is produced
    # only the last output from the nested pipeline appeared outside
    eq_(sink2.data, [{'out1': 0, 'out2': 2}, {'out1': 1, 'out2': 2}])

    # Let's now add output to the top-most pipeline
    pipeline.insert(0, {'output': 'outputs'})
    pipeline_output = run_pipeline(pipeline)
    eq_(pipeline_output, _out([{'out1': 0, 'out2': 2},
                               {'out1': 1, 'out2': 2}]))

    # and if we ask only for the last one
    pipeline[0] = {'output': 'last-output'}
    pipeline_output = run_pipeline(pipeline)
    eq_(pipeline_output, _out([{'out1': 1, 'out2': 2}]))

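# The expectations above are phrased via the module's `_out` helper and
# DEFAULT_OUTPUT constant.  A hypothetical sketch of the assumed convention:
# every record coming out of run_pipeline carries a 'datalad_stats' entry
# (ActivityStats, as already used by the tests above), so expected plain dicts
# get decorated with a fresh ActivityStats before comparison, and with no
# explicit 'output' option the pipeline reports just its (empty) input record.
def _out_sketch(records):
    return [dict(d, datalad_stats=ActivityStats()) for d in records]

_DEFAULT_OUTPUT_SKETCH = _out_sketch([{}])
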
def __call__(path=None, is_pipeline=False, is_template=False,
             recursive=False, chdir=None):  # dry_run=False,
    dry_run = False

    from datalad_crawler.pipeline import (
        load_pipeline_from_config,
        load_pipeline_from_module,
        get_repo_pipeline_config_path,
        get_repo_pipeline_script_path,
    )
    from datalad_crawler.pipeline import run_pipeline
    from datalad.utils import chpwd  # import late so we could mock during tests

    with chpwd(chdir):
        assert not (is_pipeline and is_template), \
            "it is either a pipeline or a template name, can't be both"

        if is_template:
            # generate a config and overload path with its filename
            path = initiate_pipeline_config(template=path,
                                            # kwargs=TODO,
                                            commit=True)

        # TODO: centralize via _params_ handling
        if dry_run:
            dryrun_optlabel = 'datalad.crawl.dryrun'
            if dryrun_optlabel in cfg:
                cfg.unset(dryrun_optlabel, where='local', reload=False)
            cfg.add(dryrun_optlabel, "True", where='local')

        if path is None:
            # get config from the current repository/dataset
            if is_pipeline:
                raise ValueError("You must specify the file if --pipeline")

            # Let's see if there is a config or pipeline in this repo
            path = get_repo_pipeline_config_path()
            if not path or not exists(path):
                # Check if there may be a pipeline provided
                path = get_repo_pipeline_script_path()
                if path and exists(path):
                    is_pipeline = True

        stats = ActivityStats()

        if not path:
            raise RuntimeError("Cannot locate crawler config or pipeline file")

        if is_pipeline:
            lgr.info("Loading pipeline definition from %s" % path)
            pipeline = load_pipeline_from_module(path)
        else:
            lgr.info("Loading pipeline specification from %s" % path)
            pipeline = load_pipeline_from_config(path)

        lgr.info("Running pipeline %s" % str(pipeline))
        # TODO: capture the state of all branches so in case of a crash
        # we could gracefully reset back
        try:
            output = run_pipeline(pipeline, stats=stats)
        except Exception as exc:
            # TODO: config.crawl.failure = full-reset | last-good-master
            # probably ask via ui which action should be performed unless
            # explicitly specified
            raise
        stats.datasets_crawled += 1

        # TODO: Move gc/clean over here!
        stats_total = stats.get_total()

        if recursive:
            # get all subdatasets, and crawl them too!
            ## ? assert path_orig is None, "Otherwise not sure what to do with path=%r in subdatasets" % path
            import os
            from datalad.distribution.dataset import Dataset
            from datalad.api import crawl
            from datalad.utils import swallow_logs
            from datalad.dochelpers import exc_str
            # Note: we could collect all datasets to be crawled here or pass
            # recursive=True into the subdatasets' crawl.  We collect all of
            # them here so we might later also introduce automatic commits
            # when the super-dataset got successfully updated
            subdatasets = Dataset(os.curdir).subdatasets(recursive=recursive,
                                                         result_xfm='relpaths')

            lgr.info("Crawling %d subdatasets", len(subdatasets))
            output = [output]
            # TODO: parallelize
            # TODO: this assumes that all sub-datasets are 'crawlable', and if
            # not, just adds them to the crawl_failed count.  But maybe we
            # should make it more explicit that some sub-datasets might not
            # need to be crawled, so they get skipped explicitly?
            for ds_ in subdatasets:
                ds_logfile = utils.get_logfilename(ds_, 'crawl')
                try:
                    # TODO: might be cool to be able to report a 'heart beat'
                    # from the swallow into pbar or smth
                    with swallow_logs(file_=ds_logfile) as cml:
                        output_, stats_ = crawl(chdir=ds_)
                        stats_total += stats_
                        output.append(output_)
                    lgr.info("Crawled %s: %s (log: %s)",
                             ds_, stats_.as_str(mode='line'), ds_logfile)
                except Exception as exc:
                    stats_total.datasets_crawl_failed += 1
                    stats_total.datasets_crawled += 1
                    output += [None]
                    lgr.warning("Crawling of %s has failed (more in %s): %s.",  # Log output: %s",
                                ds_, ds_logfile, exc_str(exc))  # , cml.out)

        lgr.info("Total stats: %s", stats_total.as_str(mode='line'))

        return output, stats_total

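# A hypothetical usage sketch, assuming the command is exposed as
# `datalad.api.crawl` (as the recursive branch above itself relies on): run the
# crawler in a dataset directory, recursing into subdatasets, and inspect the
# accumulated statistics.  `_example_crawl_usage` and the path are illustrative
# only, not part of the module.
def _example_crawl_usage():
    from datalad.api import crawl
    # crawl the dataset in the given directory and all of its subdatasets
    output, stats_total = crawl(chdir='path/to/dataset', recursive=True)
    print("Total stats: %s" % stats_total.as_str(mode='line'))
    return output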