async def test_simple_pipeline_multiple_run_and_agg(self, ray_context):
    """Interleave three runs with two aggregations and check the size of each result."""
    # m1:g1 ----------> \
    # m2:g2 -> m3:g2 -> m4:g3(agg)
    builder = ParallelPipeline()
    builder.add_module(self.ModuleA('m1', group='g1'))
    builder.add_module(self.ModuleB('m2', group='g2'))

    middle = self.ModuleB('m3', group='g2')
    builder.add_module(middle.depends_on(builder.get_module('m2')))

    aggregator = self.ModuleAgg('m4', group='g3')
    for upstream in ('m3', 'm1'):
        aggregator = aggregator.depends_on(builder.get_module(upstream))
    builder.add_module(aggregator.expose_result('final'))

    runtime = await builder.build()

    first_run, _ = await runtime.run('R1')
    second_run, _ = await runtime.run('R2')
    first_agg, _ = await runtime.process('A1')
    third_run, _ = await runtime.run('R3')
    second_agg, _ = await runtime.process('A1')

    # Every regular run is expected to expose three results.
    for output in (first_run, second_run, third_run):
        assert len(output) == 3
    # Two runs precede the first aggregation, one run precedes the second.
    assert len(first_agg['final']) == 2
    assert len(second_agg['final']) == 1
async def test_can_create(self, ray_context):
    """A builder with one module in group 'g1' builds a runtime holding exactly that group."""
    builder = ParallelPipeline()
    builder.add_module(ModuleA('m1', group='g1'))

    runtime = await builder.build()

    assert isinstance(runtime, ParallelPipeline.Runtime)
    assert len(runtime.groups) == 1
    assert {group.name for group in runtime.groups} == {'g1'}
async def test_close_pipeline(self, ray_context):
    """After close(), 'down' holds one entry per module and further requests are refused."""
    # m1:g1 ----------> \
    # m2:g2 -> m3:g2 -> m4:g3
    builder = ParallelPipeline()
    builder.add_module(ModuleTestTeardown('m1', group='g1'))
    builder.add_module(ModuleTestTeardown('m2', group='g2'))

    third = ModuleTestTeardown('m3', group='g2')
    builder.add_module(third.depends_on(builder.get_module('m2')))

    last = ModuleTestTeardown('m4', group='g3')
    for upstream in ('m3', 'm1'):
        last = last.depends_on(builder.get_module(upstream))
    builder.add_module(last.expose_result('final'))

    up_queue, down_queue = Queue(), Queue()
    runtime = await builder.build({'up': up_queue, 'down': down_queue})

    # Right after the build only the 'up' queue has been filled.
    assert up_queue.size() == len(runtime.modules)
    assert down_queue.size() == 0

    await runtime.run()
    await runtime.close()

    assert runtime.closed
    assert up_queue.size() == len(runtime.modules)
    assert down_queue.size() == len(runtime.modules)

    # Both request entry points must refuse to work on a closed pipeline.
    for rejected_call in (runtime.run, runtime.process):
        with pytest.raises(ClosedPipelineException):
            await rejected_call()
async def test_cannot_create_empty_group(self, ray_context):
    """Building fails when group 'g4' is registered but no module is assigned to it."""
    builder = ParallelPipeline()
    builder.add_module(ModuleA('m1', group='g1'))

    second = ModuleB('m2', group='g2')
    builder.add_module(second.depends_on(builder.get_module('m1')))

    third = ModuleC('m3', group='g3')
    builder.add_module(third.depends_on(builder.get_module('m2')))

    # 'g4' is declared explicitly and never referenced by any module above.
    orphan_group = builder.Group('g4')
    builder.add_group(orphan_group)

    with pytest.raises(Exception):
        await builder.build()
async def test_should_fail_on_adding_to_group_non_callable_hooks(self, ray_context):
    """A hook list that mixes a callable with a plain string must raise."""
    def callable_1():
        pass

    mixed_hooks = [callable_1, "test"]
    builder = ParallelPipeline()

    # Both creating the group and registering it stay inside the guarded block,
    # so the exception may come from either call.
    with pytest.raises(Exception):
        group = builder.Group('g1', after_created=mixed_hooks)
        builder.add_group(group)
async def test_can_pass_context(self, ray_context):
    """The context given to build() is readable on the runtime and from inside a module run."""
    tag = 'output'
    context = {'ctx': 'MAGDA'}

    @expose(tag)
    @finalize
    class ModuleWithContext(Module.Runtime):
        def run(self, *args, **kwargs):
            # Echo the context back so the test can compare it with the input.
            return self.context

    builder = ParallelPipeline()
    builder.add_module(ModuleWithContext('m1', group='g1'))

    runtime = await builder.build(context)
    assert runtime.context == context

    outputs, _ = await runtime.run()
    assert outputs[tag] == context
async def test_request(self, ray_context):
    """A single request through a three-group pipeline exposes 'R1:C' under 'final'."""
    builder = ParallelPipeline()
    builder.add_module(ModuleA('m1', group='g1'))
    builder.add_module(ModuleB('m2', group='g2'))

    third = ModuleB('m3', group='g2')
    builder.add_module(third.depends_on(builder.get_module('m2')))

    sink = ModuleC('m4', group='g3')
    for upstream in ('m3', 'm1'):
        sink = sink.depends_on(builder.get_module(upstream))
    builder.add_module(sink.expose_result('final'))

    runtime = await builder.build()
    outputs, _ = await runtime.run('R1')

    assert outputs['final'] == 'R1:C'
async def test_can_pass_shared_parameters(self, ray_context):
    """Shared parameters given to build() are readable on the runtime and inside a module run."""
    tag = 'output'
    context = {'ctx': 'MAGDA'}
    shared_parameters = {'shared_param1': 1}

    @expose(tag)
    @finalize
    class ModuleWithSharedParams(Module.Runtime):
        def run(self, *args, **kwargs):
            # Echo the shared parameters back so the test can compare them with the input.
            return self.shared_parameters

    builder = ParallelPipeline()
    builder.add_module(ModuleWithSharedParams('m1', group='g1'))

    runtime = await builder.build(context, shared_parameters)
    assert runtime.shared_parameters == shared_parameters

    outputs, _ = await runtime.run()
    assert outputs[tag] == shared_parameters
async def test_build_run_with_defaults(self, ray_context):
    """build() and run() called without arguments still deliver the exposed module output."""
    tag = 'output_tag'

    @expose(tag)
    @finalize
    class MockModule(Module.Runtime):
        def run(self, *args, **kwargs):
            sleep(0.01)
            return 'output_result'

    builder = ParallelPipeline()
    builder.add_module(MockModule('m1', group='g1'))
    runtime = await builder.build()

    outputs, _ = await runtime.run()

    assert tag in outputs
    assert outputs[tag] == 'output_result'
async def test_can_create_multiple_dependent_groups(self, ray_context):
    """A chain m1 -> m2 -> m3 across three groups yields matching group dependencies."""
    builder = ParallelPipeline()
    builder.add_module(ModuleA('m1', group='g1'))

    second = ModuleB('m2', group='g2')
    builder.add_module(second.depends_on(builder.get_module('m1')))

    third = ModuleC('m3', group='g3')
    builder.add_module(third.depends_on(builder.get_module('m2')))

    runtime = await builder.build()

    assert isinstance(runtime, ParallelPipeline.Runtime)
    assert len(runtime.groups) == 3
    assert {group.name for group in runtime.groups} == {'g1', 'g2', 'g3'}

    def group_named(name):
        # First group carrying the requested name.
        return next((group for group in runtime.groups if group.name == name))

    first_group = group_named('g1')
    second_group = group_named('g2')
    third_group = group_named('g3')

    assert first_group.dependencies == set()
    assert second_group.dependencies == {'g1'}
    assert third_group.dependencies == {'g2'}
async def test_can_create_multiple_groups(self, ray_context):
    """Two modules placed in two different groups produce a runtime with both groups."""
    builder = ParallelPipeline()
    builder.add_module(ModuleA('m1', group='g1'))

    second = ModuleB('m2', group='g2')
    builder.add_module(second.depends_on(builder.get_module('m1')))

    runtime = await builder.build()

    assert isinstance(runtime, ParallelPipeline.Runtime)
    assert len(runtime.groups) == 2
    assert {group.name for group in runtime.groups} == {'g1', 'g2'}
async def test_stateful_pipeline_with_only_nonregular_modules(self, ray_context):
    """With an aggregator as the only source, run() exposes nothing and process() three results."""
    builder = ParallelPipeline()
    builder.add_module(self.ModuleAgg('m1', group='g1'))

    # Both followers hang directly off the aggregator, one per group.
    for name, group in (('m2', 'g1'), ('m3', 'g2')):
        follower = self.ModuleC(name, group=group)
        builder.add_module(follower.depends_on(builder.get_module('m1')))

    runtime = await builder.build()

    run_output, _ = await runtime.run('R1')
    agg_output, _ = await runtime.process('A1')

    assert len(run_output) == 0
    assert len(agg_output) == 3
    assert agg_output['agg'] == [[]]
    assert agg_output['m2'] == agg_output['m3'] == [[], []]
def _add_group_options(cls, group_options, pipeline): if group_options and isinstance(pipeline, ParallelPipeline): for name, params in group_options.items(): group = ParallelPipeline.Group(name) group.set_replicas(params['replicas'] if 'replicas' in params else 1) params.pop('replicas', None) if params.keys(): group.set_options(**params) pipeline.add_group(group) return pipeline
async def test_listing_group_modules(self, modules, ray_context):
    """The runtime lists every added module, and each group lists exactly its own modules."""
    builder = ParallelPipeline()
    for module in modules:
        builder.add_module(module)
    runtime = await builder.build()

    expected_names = {module.name for module in modules}
    runtime_names = {module.name for module in runtime.modules}
    assert expected_names == runtime_names

    # One boolean per group: do the group's modules match those declared for it?
    group_matches = []
    for group in runtime.groups:
        declared = {module.name for module in modules if module.group == group.name}
        listed = {module.name for module in group.modules}
        group_matches.append(declared == listed)

    assert all(group_matches)
async def test_parallel_requests(self, ray_context):
    """Three requests scheduled as concurrent tasks each produce their own 'final' output."""
    builder = ParallelPipeline()
    builder.add_module(ModuleA('m1', group='g1'))
    builder.add_module(ModuleB('m2', group='g2'))

    third = ModuleB('m3', group='g2')
    builder.add_module(third.depends_on(builder.get_module('m2')))

    sink = ModuleC('m4', group='g3')
    for upstream in ('m3', 'm1'):
        sink = sink.depends_on(builder.get_module(upstream))
    builder.add_module(sink.expose_result('final'))

    runtime = await builder.build()

    # Schedule all requests first, then wait for them together.
    tasks = [
        asyncio.create_task(runtime.run(request))
        for request in ('R1', 'R2', 'R3')
    ]
    outcomes = await asyncio.gather(*tasks)

    finals = {outcome[0]['final'] for outcome in outcomes}
    assert finals == {'R1:C', 'R2:C', 'R3:C'}
def _add_group_options(cls, group_options, pipeline, hooks): if group_options and isinstance(pipeline, ParallelPipeline): for name, params in group_options.items(): if hooks: group_hooks = hooks[name] if isinstance(hooks, dict) else hooks else: group_hooks = None group = ParallelPipeline.Group(name, after_created=group_hooks) group.set_replicas(params['replicas'] if 'replicas' in params else 1) params.pop('replicas', None) if params.keys(): group.set_options(**params) pipeline.add_group(group) return pipeline
# Build a pipeline runtime from a YAML configuration string.
#
# Steps visible in the body: validate `config_parameters` when given, substitute declared
# variables into `config`, parse the result with `yaml.safe_load`, extract modules, shared
# parameters and group options, choose ParallelPipeline when any module declares a group
# (SequentialPipeline otherwise), add the modules and group options, connect each module to
# the modules named in its `depends_on` (raising AttributeError when `pipeline.get_module`
# does not resolve a dependency name), and finally await
# `pipeline.build(context, shared_parameters)` and return its result.
#
# NOTE(review): this definition is collapsed onto one physical line, so everything after the
# inline "# connect modules" marker is currently read as a comment, and the original nesting
# (e.g. whether the variable substitution sits inside `if config_parameters:`) cannot be
# recovered from this view. Restore the original line breaks before changing the logic.
async def read( cls, config: str, module_factory: ModuleFactory, config_parameters: Optional[Dict] = None, context: Optional[Any] = None, shared_parameters: Optional[Dict] = None ): if config_parameters: cls._validate_config_parameters_structure(config_parameters) config = cls._check_and_substitute_declared_variables(config, config_parameters) parsed_yaml = yaml.safe_load(config) modules, shared_parameters, group_options = \ cls._extract_information_from_yaml(parsed_yaml, shared_parameters) pipeline = ( ParallelPipeline() if any([m.group is not None for m in modules]) else SequentialPipeline() ) pipeline = cls._add_modules_to_pipeline(modules, pipeline, module_factory) pipeline = cls._add_group_options(group_options, pipeline) # connect modules for mod in modules: curr_mod_obj = pipeline.get_module(mod.name) for dependent_mod_name in mod.depends_on: dependent_mod_obj = pipeline.get_module(dependent_mod_name) if dependent_mod_obj: curr_mod_obj.depends_on(dependent_mod_obj) else: raise AttributeError( f"Module '{dependent_mod_name}' hasn't been defined in the config file, " "whereas it's used as a dependency." ) runtime = await pipeline.build(context, shared_parameters) return runtime
async def test_stateful_pipeline_invalid(self, ray_context):
    """Building the layout sketched below must raise."""
    # m1:g1 -> m2:g1(agg)-\
    # m3:g2 -> m4:g2 ------> m5:g3
    builder = ParallelPipeline()

    def register(module, *upstream_names):
        # Wire the dependencies in the given order, then hand the module to the builder.
        for upstream in upstream_names:
            module = module.depends_on(builder.get_module(upstream))
        builder.add_module(module)

    register(self.ModuleA('m1', group='g1'))
    register(self.ModuleAgg('m2', group='g1'), 'm1')
    register(self.ModuleA('m3', group='g2'))
    register(self.ModuleB('m4', group='g2'), 'm3')
    register(self.ModuleC('m5', group='g3'), 'm2', 'm4')

    with pytest.raises(Exception):
        await builder.build()
async def test_stateful_pipeline_with_two_agg_modules_double_run_and_agg(self, ray_context):
    """Alternate run/process twice on a pipeline with two aggregators and check result shapes."""
    #                                        /-> m12:g6 -> m13:g6
    # m1:g1 -> m2:g1 -> \ /--> m5:g3 -> m6:g3 --> agg10:g5 -> m11:g5
    # m3:g2 ----------> m4:g3 -> agg7:g3 ---------> m8:g4 -> m9:g4
    builder = ParallelPipeline()

    def register(module, *upstream_names):
        # Wire the dependencies in the given order, then hand the module to the builder.
        for upstream in upstream_names:
            module = module.depends_on(builder.get_module(upstream))
        builder.add_module(module)

    register(self.ModuleA('m1', group='g1'))
    register(self.ModuleB('m2', group='g1'), 'm1')
    register(self.ModuleA('m3', group='g2'))
    register(self.ModuleB('m4', group='g3'), 'm2', 'm3')
    register(self.ModuleB('m5', group='g3'), 'm4')
    register(self.ModuleB('m6', group='g3'), 'm5')
    register(self.ModuleAgg('agg7', group='g3'), 'm4')
    register(self.ModuleC('m8', group='g4'), 'agg7')
    register(self.ModuleC('m9', group='g4'), 'm8')

    # The second aggregator is exposed under an explicit name after its dependency is set.
    second_aggregator = self.ModuleAgg('agg10', group='g5')
    second_aggregator = second_aggregator.depends_on(builder.get_module('m6'))
    builder.add_module(second_aggregator.expose_result('second_agg'))

    register(self.ModuleC('m11', group='g5'), 'agg10')
    register(self.ModuleB('m12', group='g6'), 'm6')
    register(self.ModuleB('m13', group='g6'), 'm12')

    runtime = await builder.build()

    first_run, _ = await runtime.run('R1')
    first_agg, _ = await runtime.process('A1')
    second_run, _ = await runtime.run('R2')
    second_agg, _ = await runtime.process('A2')

    assert len(first_run) == len(second_run) == 8
    assert len(first_agg) == len(second_agg) == 5

    expected_types = (
        ('agg', list),
        ('m8', list),
        ('m9', str),
        ('second_agg', list),
        ('m11', str),
    )
    for key, expected_type in expected_types:
        assert type(second_agg[key]) is expected_type
async def test_stateful_pipeline_confluence_example(self, ray_context):
    """Three runs followed by one aggregation on the layout sketched below."""
    # m1:g1 -> m2:g1 -> \ /--> m5:g3 -> m6:g3
    # m3:g2 ----------> m4:g3 -> agg7:g3 ----> m8:g4 -> m9:g4
    builder = ParallelPipeline()

    def register(module, *upstream_names):
        # Wire the dependencies in the given order, then hand the module to the builder.
        for upstream in upstream_names:
            module = module.depends_on(builder.get_module(upstream))
        builder.add_module(module)

    register(self.ModuleA('m1', group='g1'))
    register(self.ModuleB('m2', group='g1'), 'm1')
    register(self.ModuleA('m3', group='g2'))
    register(self.ModuleB('m4', group='g3'), 'm2', 'm3')
    register(self.ModuleB('m5', group='g3'), 'm4')
    register(self.ModuleB('m6', group='g3'), 'm5')
    register(self.ModuleAgg('agg7', group='g3'), 'm4')
    register(self.ModuleC('m8', group='g4'), 'agg7')
    register(self.ModuleC('m9', group='g4'), 'm8')

    runtime = await builder.build()

    # Only the output of the third run is inspected.
    await runtime.run('R1')
    await runtime.run('R2')
    run_output, _ = await runtime.run('R3')
    agg_output, _ = await runtime.process('A1')

    assert len(run_output) == 6
    assert len(agg_output) == 3

    aggregated = agg_output['agg']
    assert type(aggregated) is list
    assert len(aggregated) == 3

    after_agg = agg_output['m8']
    assert type(after_agg) is list
    assert len(after_agg) == 6

    tail = agg_output['m9']
    assert type(tail) is str
    assert tail == 'c'
async def test_stateful_pipeline_with_two_afteragg_modules(self, ray_context):
    """Two runs, then one aggregation that feeds two modules placed after the aggregator."""
    # m1:g1 -> m2:g1 -> \
    # m3:g2 -> m4:g2 ---> m5:g3 -> m6:g3(agg) -> m7:g4 -> m8:g4
    builder = ParallelPipeline()

    def register(module, *upstream_names):
        # Wire the dependencies in the given order, then hand the module to the builder.
        for upstream in upstream_names:
            module = module.depends_on(builder.get_module(upstream))
        builder.add_module(module)

    register(self.ModuleA('m1', group='g1'))
    register(self.ModuleB('m2', group='g1'), 'm1')
    register(self.ModuleA('m3', group='g2'))
    register(self.ModuleB('m4', group='g2'), 'm3')
    register(self.ModuleB('m5', group='g3'), 'm2', 'm4')
    register(self.ModuleAgg('m6', group='g3'), 'm5')
    register(self.ModuleC('m7', group='g4'), 'm6')
    register(self.ModuleC('m8', group='g4'), 'm7')

    runtime = await builder.build()

    await runtime.run('R1')
    await runtime.run('R2')
    agg_output, _ = await runtime.process('A1')

    assert len(agg_output) == 3

    aggregated = agg_output['agg']
    assert type(aggregated) is list
    assert len(aggregated) == 2

    after_agg = agg_output['m7']
    assert type(after_agg) is list
    assert len(after_agg) == 4

    tail = agg_output['m8']
    assert type(tail) is str
    assert tail == 'c'
async def test_stateful_pipeline_with_stateful_group(self, ray_context):
    """Two runs and one aggregation where the aggregator opens its own group 'g4'."""
    # m1:g1 -> m2:g1 -> \
    # m3:g2 -> m4:g2 ---> m5:g3 -> m6:g4(agg) -> m7:g4
    builder = ParallelPipeline()

    def register(module, *upstream_names):
        # Wire the dependencies in the given order, then hand the module to the builder.
        for upstream in upstream_names:
            module = module.depends_on(builder.get_module(upstream))
        builder.add_module(module)

    register(self.ModuleA('m1', group='g1'))
    register(self.ModuleB('m2', group='g1'), 'm1')
    register(self.ModuleA('m3', group='g2'))
    register(self.ModuleB('m4', group='g2'), 'm3')
    register(self.ModuleB('m5', group='g3'), 'm2', 'm4')
    register(self.ModuleAgg('m6', group='g4'), 'm5')
    register(self.ModuleC('m7', group='g4'), 'm6')

    runtime = await builder.build()

    await runtime.run('R1')
    run_output, _ = await runtime.run('R2')
    agg_output, _ = await runtime.process('A1')

    assert len(run_output) == 5
    assert len(agg_output) == 2

    aggregated = agg_output['agg']
    assert len(aggregated) == 2
    # Both collected entries start with the same ('m5', 'b') pair.
    assert aggregated[0][0] == aggregated[1][0] == ('m5', 'b')
async def test_failed_pipeline_run_not_affecting_other_runs(self, ray_context):
    """A request that raises inside a module must not break a request running alongside it."""
    @finalize
    class ModuleA(Module.Runtime):
        def run(self, request, *args, **kwargs):
            sleep(0.01)
            return 'a'

    @accept(ModuleA)
    @finalize
    class ModuleError(Module.Runtime):
        def run(self, request, *args, **kwargs):
            # Only the request literally named 'error' fails.
            if request == 'error':
                raise Exception('Whoops')

    @accept(ModuleError)
    @expose()
    @finalize
    class ModuleC(Module.Runtime):
        def run(self, request, *args, **kwargs):
            sleep(0.01)
            return 'c'

    builder = ParallelPipeline()
    builder.add_module(ModuleA('m1', group='g1'))

    second = ModuleError('m2', group='g1')
    builder.add_module(second.depends_on(builder.get_module('m1')))

    builder.add_module(ModuleError('m3', group='g2'))

    sink = ModuleC('m4', group='g3')
    for upstream in ('m2', 'm3'):
        sink = sink.depends_on(builder.get_module(upstream))
    builder.add_module(sink)

    runtime = await builder.build()

    tasks = [
        asyncio.create_task(runtime.run(request_value))
        for request_value in ['error', 'R2']
    ]
    outcomes = await asyncio.gather(*tasks)
    failed_result, failed_error = outcomes[0]
    good_result, good_error = outcomes[1]

    # The failing request reports its exception and carries no result.
    assert failed_result is None
    assert isinstance(failed_error, Exception)
    assert str(failed_error) == 'Whoops'

    # The other request completes normally.
    assert 'm4' in good_result
    assert good_result['m4'] == 'c'
    assert good_error is None
async def test_should_call_hooks_in_groups(self, ray_context):
    """Each group's after_created hooks are invoked once per group that lists them."""
    first_hook_calls = Queue()
    second_hook_calls = Queue()

    def callable_1():
        first_hook_calls.put(1)

    def callable_2():
        second_hook_calls.put(2)

    builder = ParallelPipeline()

    # callable_2 is attached to both groups, callable_1 only to 'g1'.
    hooks_by_group = (
        ('g1', [callable_1, callable_2]),
        ('g2', [callable_2]),
    )
    for group_name, group_hooks in hooks_by_group:
        builder.add_group(builder.Group(group_name, after_created=group_hooks))

    builder.add_module(ModuleA('m1', group='g1'))
    second = ModuleB('m2', group='g2')
    builder.add_module(second.depends_on(builder.get_module('m1')))

    runtime = await builder.build()

    assert isinstance(runtime, ParallelPipeline.Runtime)
    assert len(runtime.groups) == 2
    assert {group.name for group in runtime.groups} == {'g1', 'g2'}
    assert first_hook_calls.qsize() == 1
    assert second_hook_calls.qsize() == 2
async def test_should_fail_on_adding_to_group_hooks_with_incorrect_type(self, ray_context):
    """Passing a plain string as after_created must raise."""
    builder = ParallelPipeline()

    # Both creating the group and registering it stay inside the guarded block,
    # so the exception may come from either call.
    with pytest.raises(Exception):
        group = builder.Group('g1', after_created="test")
        builder.add_group(group)
async def test_stateful_pipeline_mixed(self, ray_context):
    """One run and one aggregation with the aggregator in the middle of group 'g3'."""
    # m1:g1 -> m2:g1 -> \
    # m3:g2 -> m4:g2 ---> m5:g3 -> m6:g3(agg) -> m7:g3
    builder = ParallelPipeline()

    def register(module, *upstream_names):
        # Wire the dependencies in the given order, then hand the module to the builder.
        for upstream in upstream_names:
            module = module.depends_on(builder.get_module(upstream))
        builder.add_module(module)

    register(self.ModuleA('m1', group='g1'))
    register(self.ModuleB('m2', group='g1'), 'm1')
    register(self.ModuleA('m3', group='g2'))
    register(self.ModuleB('m4', group='g2'), 'm3')
    register(self.ModuleB('m5', group='g3'), 'm2', 'm4')
    register(self.ModuleAgg('m6', group='g3'), 'm5')
    register(self.ModuleC('m7', group='g3'), 'm6')

    runtime = await builder.build()

    # NOTE(review): the values returned by run()/process() are used directly here,
    # without unpacking a (result, error) pair — confirm which API version this targets.
    run_output = await runtime.run('R1')
    agg_output = await runtime.process('A1')

    assert len(run_output) == 5
    assert len(agg_output['agg']) == 1
    assert len(agg_output['m7']) == 2
# Build a pipeline runtime from a YAML configuration string.
#
# Steps visible in the body: validate `config_parameters` when given, substitute declared
# variables into `config`, parse the result with `yaml.safe_load`, extract the pipeline name,
# modules, shared parameters and group options, check the expose settings, resolve the
# pipeline name (the `name` argument takes precedence over the one from the config, with a
# warning when both are set) and check it, choose ParallelPipeline when any module declares a
# group (SequentialPipeline otherwise), add the modules and group options, connect each module
# to the modules named in its `depends_on` (raising AttributeError when `pipeline.get_module`
# does not resolve a dependency name), and finally await `pipeline.build(...)` with `context`,
# `shared_parameters` and the keyword-only `logger`, returning its result.
#
# NOTE(review): this definition is collapsed onto one physical line, so everything after the
# inline "# connect modules" marker is currently read as a comment, and the original nesting
# (e.g. whether the variable substitution sits inside `if config_parameters:`) cannot be
# recovered from this view. Restore the original line breaks before changing the logic.
async def read( cls: Type[ConfigReader], config: str, module_factory: ModuleFactory, config_parameters: Optional[Dict] = None, name: Optional[str] = None, context: Optional[Any] = None, shared_parameters: Optional[Dict] = None, *, logger: Optional[MagdaLogger.Config] = None, ) -> BasePipeline.Runtime: if config_parameters: cls._validate_config_parameters_structure(config_parameters) config = cls._check_and_substitute_declared_variables( config, config_parameters) parsed_yaml = yaml.safe_load(config) config_pipeline_name, modules, shared_parameters, group_options = \ cls._extract_information_from_yaml(parsed_yaml, shared_parameters) cls._check_expose_settings(modules) if name and config_pipeline_name: warnings.warn( 'The pipeline name specified in config wil be overriden ' 'by ConfigReader.read parameter') name = name or config_pipeline_name if name: cls._check_pipeline_name(name) pipeline = (ParallelPipeline(name=name) if any( [m.group is not None for m in modules]) else SequentialPipeline(name=name)) pipeline = cls._add_modules_to_pipeline(modules, pipeline, module_factory) pipeline = cls._add_group_options(group_options, pipeline) # connect modules for mod in modules: curr_mod_obj = pipeline.get_module(mod.name) for dependent_mod_name in mod.depends_on: dependent_mod_obj = pipeline.get_module(dependent_mod_name) if dependent_mod_obj: curr_mod_obj.depends_on(dependent_mod_obj) else: raise AttributeError( f"Module '{dependent_mod_name}' hasn't been defined in the config file, " "whereas it's used as a dependency.") runtime = await pipeline.build( context=context, shared_parameters=shared_parameters, logger=logger, ) return runtime
async def test_cannot_create_without_group(self, modules, ray_context):
    """Building must raise for the module sets supplied through the `modules` argument."""
    builder = ParallelPipeline()
    for candidate in modules:
        builder.add_module(candidate)

    with pytest.raises(Exception):
        await builder.build()