def test_spark_producer(app):
    """Run a Spark-backed producer through a pipeline and count consumed items.

    A consumer appends every received ``a`` value to a local list; the test
    passes when exactly 20 values have been collected after the pipelines ran.
    """
    producer_output = []

    @app.consumer()
    def callback_func(a):
        # Record each consumed value so the test can count them afterwards.
        producer_output.append(a)

    @app.pipeline()
    def spark_simple_pipeline(pipeline, a, b):
        # Only `a` is forwarded to the consumer; `b` is accepted but unused.
        return a.subscribe_consumer(callback_func)

    @app.spark_producer(spark_simple_pipeline)
    def spark_producer():
        """
        Build a DataFrame from the JSON rows of ``test.row`` and return it.
        More information here: http://stairspy.com/#producer
        """
        spark = SparkSession.builder.getOrCreate()

        # NOTE(review): `sc` is not defined inside this function; presumably a
        # module-level SparkContext -- confirm it exists in this module.
        raw_rows = sc.textFile("test.row", 10)
        return spark.read.json(raw_rows)

    spark_producer.flush()
    spark_producer()

    spark_simple_pipeline.compile()
    run_pipelines(app)

    assert len(producer_output) == 20
def test_connected_pipelines_multiple(app):
    """Subscribe the same sub-pipeline twice and merge both branches.

    The general pipeline routes its data through ``p_builder`` two times,
    applies ``NameExtractionFlowMultiple`` to each branch, and stores the
    concatenated ``v1``/``v2`` result. Both branches must report the
    lower-cased name ``"oleg"`` first.
    """
    storage = GlobalTestData()

    @app.pipeline()
    def p_builder(worker, sentence, use_lower):
        merged = concatenate(sentence=sentence, use_lower=use_lower)
        return merged.subscribe_flow(NameExtractionOneWayFlow())

    @app.pipeline()
    def p_builder_general(worker, sentence, use_lower):
        merged = concatenate(sentence=sentence, use_lower=use_lower)

        first_branch = (
            merged
            .subscribe_pipeline(p_builder)
            .subscribe_flow(NameExtractionFlowMultiple(use_lower=use_lower))
        )
        second_branch = (
            merged
            .subscribe_pipeline(p_builder)
            .subscribe_flow(NameExtractionFlowMultiple(use_lower=use_lower))
        )

        combined = concatenate(v1=first_branch, v2=second_branch)
        return combined.subscribe_func(storage.save_one_item)

    p_builder.compile()
    p_builder_general.compile()

    p_builder_general(sentence="Oleg", use_lower=True)
    run_pipelines(app)

    result = storage.get_result()
    assert result['v1']['names'][0] == "oleg"
    assert result['v2']['names'][0] == "oleg"
def test_multiple_pipelines(app):
    """Attach one producer to two pipelines and count the stored items.

    The producer emits 100 items and is registered against both pipelines;
    each pipeline stores what it receives, and the test expects 200 stored
    items in total.
    """
    storage = GlobalTestData()

    @app.pipeline()
    def callback_func(pipeline, x):
        return x.subscribe_func(storage.save_multiple_items)

    @app.pipeline()
    def callback_func2(pipeline, x):
        return x.subscribe_func(storage.save_multiple_items)

    @app.producer(callback_func, callback_func2)
    def simple_producer():
        # The loop index is irrelevant: every emitted item is identical.
        for _ in range(100):
            yield {"x": 1}

    callback_func.compile()
    callback_func2.compile()

    simple_producer.flush()
    simple_producer.run()
    run_pipelines(app)

    assert len(storage.get_result()) == 200
def test_make_add_value_with_flow(app):
    """Check `rename`, `add_value` and a config passed via `subscribe_pipeline`.

    ``p_builder3`` is declared with ``path='123'`` but is subscribed from the
    general pipeline with ``path='321'``; the stored result is expected to
    carry ``'321'`` together with the ``sentence`` and ``names`` keys.
    """
    storage = GlobalTestData()

    @app.pipeline(config={"path": "123"})
    def p_builder3(worker, sentence2):
        renamed = sentence2.rename(sentence=sentence2)
        with_path = renamed.add_value(path=worker.config['path'])
        with_names = with_path.subscribe_flow(
            NameExtractionOneWayFlow(use_lower=True)
        )
        return with_names.subscribe_func(storage.save_one_item)

    @app.pipeline()
    def p_builder_general3(worker, sentence2: DataFrame):
        # The config given here is what the final assertion expects to see.
        return sentence2.subscribe_pipeline(p_builder3, config={"path": "321"})

    p_builder3.compile()
    p_builder_general3.compile()

    p_builder_general3(sentence2="Oleg")
    run_pipelines(app)

    result = storage.get_result()
    assert check_keys(result.keys(), ['sentence', 'path', 'names'])
    assert result['names'][0] == "oleg"
    assert result['path'] == '321'
def test_worker_producer(app):
    """Drive a batch producer through the jobs processor into a pipeline.

    ``worker_producer`` is invoked twice (once before and once inside the
    guarded section), the jobs processor is run for ``batch_handler``, and the
    pipeline is expected to have stored 20 items afterwards.
    """
    storage = GlobalTestData()

    @app.pipeline()
    def simple_pipeline(pipeline, x):
        return x.subscribe_func(storage.save_multiple_items)

    @app.producer(simple_pipeline)
    def batch_handler(x):
        yield {"x": x}

    @app.batch_producer(batch_handler)
    def worker_producer():
        for _ in range(10):
            # here we should yield batch of data
            yield {"x": 1}

    simple_pipeline.compile()
    worker_producer()

    try:
        worker_producer()
        run_jobs_processor(
            app.project,
            [batch_handler],
            die_when_empty=True,
        )
    except SystemExit:
        # A SystemExit raised in the guarded calls is deliberately ignored.
        pass

    run_pipelines(app)

    # Original note on the expected count: 10 batches with 2 jobs.
    assert len(storage.get_result()) == 20
def test_connected_pipelines(app):
    """Read one item back from an iterating consumer fed by a pipeline.

    The pipeline chains a plain function step and a producer-style function
    step into a ``consumer_iter`` consumer; the first item pulled from the
    consumer must contain exactly two entries.

    NOTE(review): another function named ``test_connected_pipelines`` follows
    in this view. If both live in the same module, the later definition
    replaces this one and this test is never collected -- confirm.
    """
    @app.consumer_iter()
    def save_globaly(func1, func2, **kwargs):
        return {"func1": func1, "func2": func2}

    @app.pipeline()
    def p_builder(worker, sentence):
        after_root1 = sentence.subscribe_func(
            lambda sentence: dict(func1="ok"),
            name='root1',
        )
        after_root2 = after_root1.subscribe_func_as_producer(
            lambda sentence: [dict(func2="ok"), dict(func2="ok")],
            name='root2',
        )
        return after_root2.subscribe_consumer(save_globaly)

    p_builder.compile()
    p_builder(sentence="Oleg")
    run_pipelines(app)

    # Pull only the first item from the consumer's iterator.
    result = next(save_globaly.iter(die_when_empty=True))

    assert len(result) == 2
def test_connected_pipelines(app):
    """Run a pipeline that subscribes another pipeline, with inline flows.

    Both flows are attached with ``as_worker=False``. The test makes no
    assertions; it only requires compiling and running to complete without
    raising.
    """
    @app.pipeline()
    def p_builder(worker, sentence, use_lower):
        merged = concatenate(sentence=sentence, use_lower=use_lower)
        return merged.subscribe_flow(
            NameExtractionOneWayFlow(),
            as_worker=False,
        )

    @app.pipeline()
    def p_builder_general(worker, sentence, use_lower):
        merged = concatenate(sentence=sentence, use_lower=use_lower)
        return (
            merged
            .subscribe_pipeline(p_builder)
            .subscribe_flow(NameExtractionFlowMultiple(), as_worker=False)
        )

    p_builder.compile()
    p_builder_general.compile()

    p_builder_general(sentence="Oleg", use_lower=True)
    run_pipelines(app)
def test_deep_tree_functions(app):
    """Fan a root pipeline out into three sub-pipelines and check each result.

    Every branch tags its data with its own name, produces a ``result`` equal
    to that name, and hands both to a consumer that stores ``{name: result}``.
    The test expects three stored items, each with keys equal to its values.
    """
    storage = GlobalTestData()

    @app.consumer()
    def save_globaly(name, result, **kwargs):
        storage.save_multiple_items(**{name: result})

    @app.pipeline()
    def p_builder(worker, sentence):
        root_branch = (
            sentence
            .subscribe_func(lambda sentence: dict(func1="ok"), name='root1')
            .subscribe_func(lambda sentence: dict(func2="ok"), name='root2')
        )

        # The branch pipelines are defined below; these names are looked up
        # only when this function is called.
        return concatenate(
            branch_1=root_branch.subscribe_pipeline(branch_1),
            branch_2=root_branch.subscribe_pipeline(branch_2),
            branch_3=root_branch.subscribe_pipeline(branch_3),
        )

    @app.pipeline()
    def branch_1(work, func1, func2):
        root_branch = concatenate(func1=func1, func2=func2)
        return (
            root_branch
            .add_value(name='branch_1')
            .subscribe_func(lambda func1, func2: dict(func1_1="ok"),
                            name='branch_1_1')
            .subscribe_func(lambda func1, func2: dict(result="branch_1"),
                            name='branch_1_2')
            .subscribe_consumer(save_globaly)
        )

    @app.pipeline()
    def branch_2(work, func1, func2):
        root_branch = concatenate(func1=func1, func2=func2)
        return (
            root_branch
            .add_value(name='branch_2')
            .subscribe_func(lambda func1, func2: dict(func1_1="ok"),
                            name='branch_2_1')
            .subscribe_func(lambda func1, func2: dict(result="branch_2"),
                            name='branch_2_2')
            .subscribe_consumer(save_globaly)
        )

    @app.pipeline()
    def branch_3(work, func1, func2):
        root_branch = concatenate(func1=func1, func2=func2)
        return (
            root_branch
            .add_value(name='branch_3')
            .subscribe_func(lambda func1, func2: dict(func1_1="ok"),
                            name='branch_3_1')
            .subscribe_func(lambda func1, func2: dict(result="branch_3"),
                            name='branch_3_2')
            .subscribe_consumer(save_globaly)
        )

    p_builder.compile()
    branch_1.compile()
    branch_2.compile()
    branch_3.compile()

    p_builder(sentence="Oleg", use_lower=True)
    run_pipelines(app)

    result = storage.get_result()
    assert len(result) == 3

    # Each stored item maps a branch name to a result equal to that name.
    for position in range(3):
        stored = result[position]
        assert list(stored.keys()) == list(stored.values())