def test_service_job_success(self):
  """Tests task generation when example-gen service job succeeds."""
  test_utils.fake_example_gen_run(self._mlmd_connection, self._example_gen,
                                  1, 1)

  [
      eg_update_node_state_task, sg_update_node_state_task,
      sg_exec_node_task
  ] = self._generate_and_test(
      True,
      num_initial_executions=1,
      num_tasks_generated=3,
      num_new_executions=1,
      num_active_executions=1,
      expected_exec_nodes=[self._stats_gen])

  self.assertTrue(
      task_lib.is_update_node_state_task(eg_update_node_state_task))
  self.assertEqual('my_example_gen',
                   eg_update_node_state_task.node_uid.node_id)
  self.assertEqual(pstate.NodeState.COMPLETE,
                   eg_update_node_state_task.state)

  self.assertTrue(
      task_lib.is_update_node_state_task(sg_update_node_state_task))
  self.assertEqual('my_statistics_gen',
                   sg_update_node_state_task.node_uid.node_id)
  self.assertEqual(pstate.NodeState.RUNNING, sg_update_node_state_task.state)

  self.assertTrue(task_lib.is_exec_node_task(sg_exec_node_task))
def test_task_generation_when_node_stopped(self, stop_stats_gen):
  """Tests stopped nodes are ignored when generating tasks."""
  test_utils.fake_example_gen_run(self._mlmd_connection, self._example_gen,
                                  1, 1)
  num_initial_executions = 1
  if stop_stats_gen:
    num_tasks_generated = 0
    num_new_executions = 0
    num_active_executions = 0
    with self._mlmd_connection as m:
      pipeline_state = test_utils.get_or_create_pipeline_state(
          m, self._pipeline)
      with pipeline_state:
        with pipeline_state.node_state_update_context(
            task_lib.NodeUid.from_pipeline_node(
                self._pipeline, self._stats_gen)) as node_state:
          node_state.update(
              pstate.NodeState.STOPPING,
              status_lib.Status(code=status_lib.Code.CANCELLED))
  else:
    num_tasks_generated = 1
    num_new_executions = 1
    num_active_executions = 1
  tasks = self._generate_and_test(
      True,
      num_initial_executions=num_initial_executions,
      num_tasks_generated=num_tasks_generated,
      num_new_executions=num_new_executions,
      num_active_executions=num_active_executions,
      ignore_update_node_state_tasks=True)
  self.assertLen(tasks, num_tasks_generated)
def test_generate_resolved_info(self):
  otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 2, 1)
  with self._mlmd_connection as m:
    resolved_info = task_gen_utils.generate_resolved_info(m, self._transform)
    self.assertCountEqual(['my_pipeline', 'my_transform'],
                          [c.name for c in resolved_info.contexts])
    self.assertLen(resolved_info.input_artifacts['examples'], 1)
    self.assertProtoPartiallyEquals(
        """
        id: 1
        type_id: 4
        uri: "my_examples_uri"
        custom_properties {
          key: "span"
          value {
            int_value: 2
          }
        }
        custom_properties {
          key: "version"
          value {
            int_value: 1
          }
        }
        state: LIVE""",
        resolved_info.input_artifacts['examples'][0].mlmd_artifact,
        ignored_fields=[
            'create_time_since_epoch', 'last_update_time_since_epoch'
        ])
def test_task_generation_ignore_nodes(self, ignore_transform):
  """Tests nodes can be ignored while generating tasks."""
  # Simulate that ExampleGen has already completed successfully.
  otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1, 1)

  # Generate once.
  with self.subTest(generate=1):
    num_initial_executions = 1
    if ignore_transform:
      num_tasks_generated = 0
      num_new_executions = 0
      num_active_executions = 0
      ignore_node_ids = set([self._transform.node_info.id])
    else:
      num_tasks_generated = 1
      num_new_executions = 1
      num_active_executions = 1
      ignore_node_ids = None
    tasks, active_executions = self._generate_and_test(
        True,
        num_initial_executions=num_initial_executions,
        num_tasks_generated=num_tasks_generated,
        num_new_executions=num_new_executions,
        num_active_executions=num_active_executions,
        ignore_node_ids=ignore_node_ids)
    if ignore_transform:
      self.assertEmpty(tasks)
      self.assertEmpty(active_executions)
    else:
      self._verify_exec_node_task(self._transform, active_executions[0].id,
                                  tasks[0])
def test_pipeline_failure_strategies(self, fail_fast):
  """Tests pipeline failure strategies."""
  test_utils.fake_example_gen_run(self._mlmd_connection, self._example_gen,
                                  1, 1)
  self._run_next(False, expect_nodes=[self._stats_gen], fail_fast=fail_fast)
  self._run_next(False, expect_nodes=[self._schema_gen], fail_fast=fail_fast)

  # Both example-validator and transform are ready to execute.
  [example_validator_task, transform_task] = self._generate(
      False, True, fail_fast=fail_fast)
  self.assertEqual(self._example_validator.node_info.id,
                   example_validator_task.node_uid.node_id)
  self.assertEqual(self._transform.node_info.id,
                   transform_task.node_uid.node_id)

  # Simulate Transform success.
  self._finish_node_execution(False, transform_task)

  # But fail example-validator.
  with self._mlmd_connection as m:
    with mlmd_state.mlmd_execution_atomic_op(
        m, example_validator_task.execution_id) as ev_exec:
      # Fail example-validator execution.
      ev_exec.last_known_state = metadata_store_pb2.Execution.FAILED
      data_types_utils.set_metadata_value(
          ev_exec.custom_properties[constants.EXECUTION_ERROR_MSG_KEY],
          'example-validator error')

  if fail_fast:
    # Pipeline run should immediately fail because example-validator failed.
    [finalize_task] = self._generate(False, True, fail_fast=fail_fast)
    self.assertTrue(task_lib.is_finalize_pipeline_task(finalize_task))
    self.assertEqual(status_lib.Code.ABORTED, finalize_task.status.code)
  else:
    # Trainer and downstream nodes can execute as transform has finished.
    # The example-validator failure does not impact them as it is not
    # upstream. The pipeline run will still fail, but only once no more
    # progress can be made.
    self._run_next(False, expect_nodes=[self._trainer], fail_fast=fail_fast)
    self._run_next(False, expect_nodes=[self._chore_a], fail_fast=fail_fast)
    self._run_next(False, expect_nodes=[self._chore_b], fail_fast=fail_fast)
    [finalize_task] = self._generate(False, True, fail_fast=fail_fast)
    self.assertTrue(task_lib.is_finalize_pipeline_task(finalize_task))
    self.assertEqual(status_lib.Code.ABORTED, finalize_task.status.code)
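# The two failure strategies exercised above reduce to a simple decision
# rule. The helper below is an illustrative sketch only (hypothetical names,
# not the actual task generator logic): in fail-fast mode any node failure
# finalizes the run immediately; otherwise the run is finalized only once no
# further progress is possible.
def _should_finalize_run(fail_fast: bool, node_failed: bool,
                         progress_possible: bool) -> bool:
  """Returns True if the pipeline run should be finalized (aborted)."""
  if fail_fast:
    # Any node failure aborts the run immediately.
    return node_failed
  # A failed node aborts the run only after all runnable nodes have been
  # given a chance to execute.
  return node_failed and not progress_possible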
def test_restart_node_cancelled_due_to_stopping(self):
  """Tests that a node previously cancelled due to stopping can be restarted."""
  test_utils.fake_example_gen_run(self._mlmd_connection, self._example_gen,
                                  1, 1)
  [stats_gen_task] = self._generate_and_test(
      False,
      num_initial_executions=1,
      num_tasks_generated=1,
      num_new_executions=1,
      num_active_executions=1,
      ignore_update_node_state_tasks=True)
  node_uid = task_lib.NodeUid.from_pipeline_node(self._pipeline,
                                                 self._stats_gen)
  self.assertEqual(node_uid, stats_gen_task.node_uid)

  # Simulate stopping the node while it is under execution, which leads to
  # the node execution being cancelled.
  with self._mlmd_connection as m:
    with mlmd_state.mlmd_execution_atomic_op(
        m, stats_gen_task.execution_id) as stats_gen_exec:
      stats_gen_exec.last_known_state = metadata_store_pb2.Execution.CANCELED
      data_types_utils.set_metadata_value(
          stats_gen_exec.custom_properties[constants.EXECUTION_ERROR_MSG_KEY],
          'manually stopped')

  # Change state of the node to STARTING.
  with self._mlmd_connection as m:
    pipeline_state = test_utils.get_or_create_pipeline_state(
        m, self._pipeline)
    with pipeline_state:
      with pipeline_state.node_state_update_context(node_uid) as node_state:
        node_state.update(pstate.NodeState.STARTING)

  # A new execution should be created for the previously cancelled node when
  # its node state is STARTING.
  [update_node_state_task, stats_gen_task] = self._generate_and_test(
      False,
      num_initial_executions=2,
      num_tasks_generated=2,
      num_new_executions=1,
      num_active_executions=1)
  self.assertTrue(task_lib.is_update_node_state_task(update_node_state_task))
  self.assertEqual(node_uid, update_node_state_task.node_uid)
  self.assertEqual(pstate.NodeState.RUNNING, update_node_state_task.state)
  self.assertEqual(node_uid, stats_gen_task.node_uid)
def test_triggering_upon_executor_spec_change(self):
  test_utils.fake_example_gen_run(self._mlmd_connection, self._example_gen,
                                  1, 1)
  with mock.patch.object(task_gen_utils,
                         'get_executor_spec') as mock_get_executor_spec:
    mock_get_executor_spec.side_effect = _fake_executor_spec(1)
    [exec_transform_task] = self._generate_and_test(
        False,
        num_initial_executions=1,
        num_tasks_generated=1,
        num_new_executions=1,
        num_active_executions=1,
        expected_exec_nodes=[self._transform],
        ignore_update_node_state_tasks=True)

  # Fail the registered execution.
  with self._mlmd_connection as m:
    with mlmd_state.mlmd_execution_atomic_op(
        m, exec_transform_task.execution_id) as execution:
      execution.last_known_state = metadata_store_pb2.Execution.FAILED

  # Try to generate with same executor spec. This should not trigger as
  # there are no changes since last run.
  with mock.patch.object(task_gen_utils,
                         'get_executor_spec') as mock_get_executor_spec:
    mock_get_executor_spec.side_effect = _fake_executor_spec(1)
    self._generate_and_test(
        False,
        num_initial_executions=2,
        num_tasks_generated=0,
        num_new_executions=0,
        num_active_executions=0,
        ignore_update_node_state_tasks=True)

  # Generating with a different executor spec should trigger.
  with mock.patch.object(task_gen_utils,
                         'get_executor_spec') as mock_get_executor_spec:
    mock_get_executor_spec.side_effect = _fake_executor_spec(2)
    self._generate_and_test(
        False,
        num_initial_executions=2,
        num_tasks_generated=1,
        num_new_executions=1,
        num_active_executions=1,
        expected_exec_nodes=[self._transform],
        ignore_update_node_state_tasks=True)
def test_cached_execution(self):
  """Tests that cached execution is used if one is available."""
  # Fake ExampleGen run.
  otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1, 1)

  # Invoking generator should produce an ExecNodeTask for StatsGen.
  [stats_gen_task], _ = self._generate_and_test(
      False,
      num_initial_executions=1,
      num_tasks_generated=1,
      num_new_executions=1,
      num_active_executions=1)
  self.assertEqual('my_statistics_gen', stats_gen_task.node_uid.node_id)

  # Finish StatsGen execution.
  otu.fake_execute_node(self._mlmd_connection, stats_gen_task)

  # Prepare another pipeline with a new pipeline_run_id.
  pipeline_run_id = str(uuid.uuid4())
  new_pipeline = self._make_pipeline(self._pipeline_root, pipeline_run_id)
  stats_gen = otu.get_node(new_pipeline, 'my_statistics_gen')

  # Invoking generator for the new pipeline should result in:
  # 1. StatsGen execution succeeds with state "CACHED" but no ExecNodeTask
  #    generated.
  # 2. An ExecNodeTask is generated for SchemaGen (component downstream of
  #    StatsGen) with an active execution in MLMD.
  [schema_gen_task], _ = self._generate_and_test(
      False,
      pipeline=new_pipeline,
      num_initial_executions=2,
      num_tasks_generated=1,
      num_new_executions=2,
      num_active_executions=1)
  self.assertEqual('my_schema_gen', schema_gen_task.node_uid.node_id)

  # Check that StatsGen execution is successful in state "CACHED".
  with self._mlmd_connection as m:
    executions = task_gen_utils.get_executions(m, stats_gen)
    self.assertLen(executions, 1)
    execution = executions[0]
    self.assertTrue(execution_lib.is_execution_successful(execution))
    self.assertEqual(metadata_store_pb2.Execution.CACHED,
                     execution.last_known_state)
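# Conceptually, the caching behavior verified above is: if a previous
# successful execution of the node resolved the same input signature, a new
# execution is registered in state CACHED and no ExecNodeTask is emitted.
# A rough sketch over plain dict records (hypothetical structure, not the
# MLMD-backed logic that the task generator actually uses):
def _find_cached_execution(prior_executions, input_signature):
  """Returns a prior successful execution with a matching signature, if any."""
  for execution in prior_executions:
    if execution['successful'] and execution['signature'] == input_signature:
      return execution
  return None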
def test_node_failed(self, fail_fast):
  """Tests task generation when a node registers a failed execution."""
  test_utils.fake_example_gen_run(self._mlmd_connection, self._example_gen,
                                  1, 1)

  [stats_gen_task] = self._generate_and_test(
      False,
      num_initial_executions=1,
      num_tasks_generated=1,
      num_new_executions=1,
      num_active_executions=1,
      ignore_update_node_state_tasks=True,
      fail_fast=fail_fast)
  self.assertEqual(
      task_lib.NodeUid.from_pipeline_node(self._pipeline, self._stats_gen),
      stats_gen_task.node_uid)

  with self._mlmd_connection as m:
    with mlmd_state.mlmd_execution_atomic_op(
        m, stats_gen_task.execution_id) as stats_gen_exec:
      # Fail stats-gen execution.
      stats_gen_exec.last_known_state = metadata_store_pb2.Execution.FAILED
      data_types_utils.set_metadata_value(
          stats_gen_exec.custom_properties[constants.EXECUTION_ERROR_MSG_KEY],
          'foobar error')

  # Test generation of FinalizePipelineTask.
  [update_node_state_task, finalize_task] = self._generate_and_test(
      True,
      num_initial_executions=2,
      num_tasks_generated=2,
      num_new_executions=0,
      num_active_executions=0,
      fail_fast=fail_fast)

  self.assertTrue(task_lib.is_update_node_state_task(update_node_state_task))
  self.assertEqual('my_statistics_gen',
                   update_node_state_task.node_uid.node_id)
  self.assertEqual(pstate.NodeState.FAILED, update_node_state_task.state)
  self.assertRegexMatch(update_node_state_task.status.message,
                        ['foobar error'])

  self.assertTrue(task_lib.is_finalize_pipeline_task(finalize_task))
  self.assertEqual(status_lib.Code.ABORTED, finalize_task.status.code)
  self.assertRegexMatch(finalize_task.status.message, ['foobar error'])
def test_triggering_upon_exec_properties_change(self):
  test_utils.fake_example_gen_run(self._mlmd_connection, self._example_gen,
                                  1, 1)

  [exec_transform_task] = self._generate_and_test(
      False,
      num_initial_executions=1,
      num_tasks_generated=1,
      num_new_executions=1,
      num_active_executions=1,
      expected_exec_nodes=[self._transform],
      ignore_update_node_state_tasks=True)

  # Fail the registered execution.
  with self._mlmd_connection as m:
    with mlmd_state.mlmd_execution_atomic_op(
        m, exec_transform_task.execution_id) as execution:
      execution.last_known_state = metadata_store_pb2.Execution.FAILED

  # Try to generate with same execution properties. This should not trigger
  # as there are no changes since last run.
  self._generate_and_test(
      False,
      num_initial_executions=2,
      num_tasks_generated=0,
      num_new_executions=0,
      num_active_executions=0,
      ignore_update_node_state_tasks=True)

  # Change execution properties of last run.
  with self._mlmd_connection as m:
    with mlmd_state.mlmd_execution_atomic_op(
        m, exec_transform_task.execution_id) as execution:
      execution.custom_properties['a_param'].int_value = 20

  # Generating with different execution properties should trigger.
  self._generate_and_test(
      False,
      num_initial_executions=2,
      num_tasks_generated=1,
      num_new_executions=1,
      num_active_executions=1,
      expected_exec_nodes=[self._transform],
      ignore_update_node_state_tasks=True)
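# Note: together with test_triggering_upon_executor_spec_change above, this
# exercises what appears to be the same retry rule from two angles: a failed
# node is re-triggered only when something observable changed since the last
# attempt, in one case the exec properties, in the other the executor spec.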
def test_node_failed(self, use_task_queue):
  """Tests task generation when a node registers a failed execution."""
  otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1, 1)

  def _ensure_node_services(unused_pipeline_state, node_id):
    self.assertEqual(self._example_gen.node_info.id, node_id)
    return service_jobs.ServiceStatus.SUCCESS

  self._mock_service_job_manager.ensure_node_services.side_effect = (
      _ensure_node_services)

  tasks, active_executions = self._generate_and_test(
      use_task_queue,
      num_initial_executions=1,
      num_tasks_generated=1,
      num_new_executions=1,
      num_active_executions=1)
  self.assertEqual(
      task_lib.NodeUid.from_pipeline_node(self._pipeline, self._stats_gen),
      tasks[0].node_uid)
  stats_gen_exec = active_executions[0]

  # Fail stats-gen execution.
  stats_gen_exec.last_known_state = metadata_store_pb2.Execution.FAILED
  data_types_utils.set_metadata_value(
      stats_gen_exec.custom_properties[constants.EXECUTION_ERROR_MSG_KEY],
      'foobar error')
  with self._mlmd_connection as m:
    m.store.put_executions([stats_gen_exec])
  if use_task_queue:
    task = self._task_queue.dequeue()
    self._task_queue.task_done(task)

  # Test generation of FinalizePipelineTask.
  tasks, _ = self._generate_and_test(
      True,
      num_initial_executions=2,
      num_tasks_generated=1,
      num_new_executions=0,
      num_active_executions=0)
  self.assertLen(tasks, 1)
  self.assertTrue(task_lib.is_finalize_pipeline_task(tasks[0]))
  self.assertEqual(status_lib.Code.ABORTED, tasks[0].status.code)
  self.assertRegexMatch(tasks[0].status.message, ['foobar error'])
def test_task_generation_when_node_stopped(self, stop_transform):
  """Tests stopped nodes are ignored when generating tasks."""
  # Simulate that ExampleGen has already completed successfully.
  test_utils.fake_example_gen_run(self._mlmd_connection, self._example_gen,
                                  1, 1)

  # Generate once.
  num_initial_executions = 1
  if stop_transform:
    num_tasks_generated = 1
    num_new_executions = 0
    num_active_executions = 0
    with self._mlmd_connection as m:
      pipeline_state = test_utils.get_or_create_pipeline_state(
          m, self._pipeline)
      with pipeline_state:
        with pipeline_state.node_state_update_context(
            task_lib.NodeUid.from_pipeline_node(
                self._pipeline, self._transform)) as node_state:
          node_state.update(
              pstate.NodeState.STOPPING,
              status_lib.Status(code=status_lib.Code.CANCELLED))
  else:
    num_tasks_generated = 3
    num_new_executions = 1
    num_active_executions = 1
  tasks = self._generate_and_test(
      True,
      num_initial_executions=num_initial_executions,
      num_tasks_generated=num_tasks_generated,
      num_new_executions=num_new_executions,
      num_active_executions=num_active_executions)
  self.assertLen(tasks, num_tasks_generated)
  if stop_transform:
    self.assertTrue(task_lib.is_update_node_state_task(tasks[0]))
    self.assertEqual(pstate.NodeState.RUNNING, tasks[0].state)
  else:
    self.assertTrue(task_lib.is_update_node_state_task(tasks[0]))
    self.assertEqual(pstate.NodeState.RUNNING, tasks[0].state)
    self.assertTrue(task_lib.is_update_node_state_task(tasks[1]))
    self.assertEqual(pstate.NodeState.RUNNING, tasks[1].state)
    self.assertTrue(task_lib.is_exec_node_task(tasks[2]))
def test_node_success(self):
  """Tests task generation when a node execution succeeds."""
  test_utils.fake_example_gen_run(self._mlmd_connection, self._example_gen,
                                  1, 1)

  [stats_gen_task] = self._generate_and_test(
      False,
      num_initial_executions=1,
      num_tasks_generated=1,
      num_new_executions=1,
      num_active_executions=1,
      ignore_update_node_state_tasks=True)

  # Finish stats-gen execution.
  self._finish_node_execution(False, stats_gen_task)

  [
      stats_gen_update_node_state_task, schema_gen_update_node_state_task,
      schema_gen_exec_node_task
  ] = self._generate_and_test(
      False,
      num_initial_executions=2,
      num_tasks_generated=3,
      num_new_executions=1,
      num_active_executions=1,
      expected_exec_nodes=[self._schema_gen])

  self.assertTrue(
      task_lib.is_update_node_state_task(stats_gen_update_node_state_task))
  self.assertEqual('my_statistics_gen',
                   stats_gen_update_node_state_task.node_uid.node_id)
  self.assertEqual(pstate.NodeState.COMPLETE,
                   stats_gen_update_node_state_task.state)

  self.assertTrue(
      task_lib.is_update_node_state_task(schema_gen_update_node_state_task))
  self.assertEqual('my_schema_gen',
                   schema_gen_update_node_state_task.node_uid.node_id)
  self.assertEqual(pstate.NodeState.RUNNING,
                   schema_gen_update_node_state_task.state)

  self.assertTrue(task_lib.is_exec_node_task(schema_gen_exec_node_task))
def test_task_generation_ignore_nodes(self, ignore_transform):
  """Tests nodes can be ignored while generating tasks."""
  # Simulate that ExampleGen has already completed successfully.
  otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1, 1)

  def _ensure_node_services(unused_pipeline_state, node_id):
    self.assertEqual('my_example_gen', node_id)
    return service_jobs.ServiceStatus.RUNNING

  self._mock_service_job_manager.ensure_node_services.side_effect = (
      _ensure_node_services)

  # Generate once.
  with self.subTest(generate=1):
    num_initial_executions = 1
    if ignore_transform:
      num_tasks_generated = 0
      num_new_executions = 0
      num_active_executions = 0
      ignore_node_ids = set([self._transform.node_info.id])
    else:
      num_tasks_generated = 1
      num_new_executions = 1
      num_active_executions = 1
      ignore_node_ids = None
    tasks, active_executions = self._generate_and_test(
        True,
        num_initial_executions=num_initial_executions,
        num_tasks_generated=num_tasks_generated,
        num_new_executions=num_new_executions,
        num_active_executions=num_active_executions,
        ignore_node_ids=ignore_node_ids)
    if ignore_transform:
      self.assertEmpty(tasks)
      self.assertEmpty(active_executions)
    else:
      self._verify_exec_node_task(self._transform, active_executions[0].id,
                                  tasks[0])
def test_get_executions(self):
  with self._mlmd_connection as m:
    for node in [n.pipeline_node for n in self._pipeline.nodes]:
      self.assertEmpty(task_gen_utils.get_executions(m, node))

  # Create executions for the same nodes under different pipeline contexts.
  self._set_pipeline_context('my_pipeline1')
  otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1, 1)
  otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 2, 1)
  otu.fake_transform_output(self._mlmd_connection, self._transform)
  self._set_pipeline_context('my_pipeline2')
  otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1, 1)
  otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 2, 1)
  otu.fake_transform_output(self._mlmd_connection, self._transform)

  # Get all executions across all pipeline contexts.
  with self._mlmd_connection as m:
    all_eg_execs = sorted(
        m.store.get_executions_by_type(
            self._example_gen.node_info.type.name),
        key=lambda e: e.id)
    all_transform_execs = sorted(
        m.store.get_executions_by_type(self._transform.node_info.type.name),
        key=lambda e: e.id)

  # Check that the correct executions are returned for each node in each
  # pipeline.
  self._set_pipeline_context('my_pipeline1')
  with self._mlmd_connection as m:
    self.assertCountEqual(
        all_eg_execs[0:2],
        task_gen_utils.get_executions(m, self._example_gen))
    self.assertCountEqual(
        all_transform_execs[0:1],
        task_gen_utils.get_executions(m, self._transform))
    self.assertEmpty(task_gen_utils.get_executions(m, self._trainer))
  self._set_pipeline_context('my_pipeline2')
  with self._mlmd_connection as m:
    self.assertCountEqual(
        all_eg_execs[2:],
        task_gen_utils.get_executions(m, self._example_gen))
    self.assertCountEqual(
        all_transform_execs[1:],
        task_gen_utils.get_executions(m, self._transform))
    self.assertEmpty(task_gen_utils.get_executions(m, self._trainer))
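# The scoping behavior verified above can be pictured with plain data: an
# execution belongs to a node only within the pipeline context it was
# created under. An illustrative sketch (hypothetical record layout, not the
# MLMD query that task_gen_utils.get_executions actually issues):
def _executions_for_node(all_executions, pipeline_ctx, node_type):
  """Filters executions to one node within one pipeline context."""
  return [
      e for e in all_executions
      if e['context'] == pipeline_ctx and e['type'] == node_type
  ]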
def test_tasks_generated_when_upstream_done(self, use_task_queue):
  """Tests that tasks are generated when upstream is done.

  Args:
    use_task_queue: If task queue is enabled, new tasks are only generated
      if a task with the same task_id does not already exist in the queue.
      `use_task_queue=False` is useful to test the case of task generation
      when the task queue is empty (e.g. due to orchestrator restart).
  """
  # Simulate that ExampleGen has already completed successfully.
  otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1, 1)

  def _ensure_node_services(unused_pipeline_state, node_id):
    self.assertIn(
        node_id,
        (self._example_gen.node_info.id, self._transform.node_info.id))
    return service_jobs.ServiceStatus.SUCCESS

  self._mock_service_job_manager.ensure_node_services.side_effect = (
      _ensure_node_services)

  # Generate once. Stats-gen task should be generated.
  tasks, active_executions = self._generate_and_test(
      use_task_queue,
      num_initial_executions=1,
      num_tasks_generated=1,
      num_new_executions=1,
      num_active_executions=1)
  execution_id = active_executions[0].id
  self._verify_exec_node_task(self._stats_gen, execution_id, tasks[0])
  self._mock_service_job_manager.ensure_node_services.assert_called_with(
      mock.ANY, self._example_gen.node_info.id)
  self._mock_service_job_manager.reset_mock()

  # Finish stats-gen execution.
  self._finish_node_execution(use_task_queue, self._stats_gen,
                              active_executions[0])

  # Schema-gen should execute next.
  tasks, active_executions = self._generate_and_test(
      use_task_queue,
      num_initial_executions=2,
      num_tasks_generated=1,
      num_new_executions=1,
      num_active_executions=1)
  execution_id = active_executions[0].id
  self._verify_exec_node_task(self._schema_gen, execution_id, tasks[0])

  # Finish schema-gen execution.
  self._finish_node_execution(use_task_queue, self._schema_gen,
                              active_executions[0])

  # Transform and ExampleValidator should both execute next.
  tasks, active_executions = self._generate_and_test(
      use_task_queue,
      num_initial_executions=3,
      num_tasks_generated=2,
      num_new_executions=2,
      num_active_executions=2)
  self._verify_exec_node_task(self._example_validator,
                              active_executions[0].id, tasks[0])
  transform_exec = active_executions[1]
  self._verify_exec_node_task(self._transform, transform_exec.id, tasks[1])

  # Transform is a "mixed service node".
  self._mock_service_job_manager.ensure_node_services.assert_called_once_with(
      mock.ANY, self._transform.node_info.id)
  self._mock_service_job_manager.reset_mock()

  # Finish example-validator execution.
  self._finish_node_execution(use_task_queue, self._example_validator,
                              active_executions[0])

  # Since transform hasn't finished, trainer will not be triggered yet.
  tasks, active_executions = self._generate_and_test(
      use_task_queue,
      num_initial_executions=5,
      num_tasks_generated=0 if use_task_queue else 1,
      num_new_executions=0,
      num_active_executions=1)
  if not use_task_queue:
    self._verify_exec_node_task(self._transform, active_executions[0].id,
                                tasks[0])

  # Finish transform execution.
  self._finish_node_execution(use_task_queue, self._transform,
                              transform_exec)

  # Now all trainer upstream nodes are done, so trainer will be triggered.
  tasks, active_executions = self._generate_and_test(
      use_task_queue,
      num_initial_executions=5,
      num_tasks_generated=1,
      num_new_executions=1,
      num_active_executions=1)
  self._verify_exec_node_task(self._trainer, active_executions[0].id,
                              tasks[0])

  # Finish trainer execution.
  self._finish_node_execution(use_task_queue, self._trainer,
                              active_executions[0])

  # No more components to execute; FinalizePipelineTask should be generated.
  tasks, _ = self._generate_and_test(
      use_task_queue,
      num_initial_executions=6,
      num_tasks_generated=1,
      num_new_executions=0,
      num_active_executions=0)
  self.assertLen(tasks, 1)
  self.assertTrue(task_lib.is_finalize_pipeline_task(tasks[0]))
  self.assertEqual(status_lib.Code.OK, tasks[0].status.code)
  if use_task_queue:
    self.assertTrue(self._task_queue.is_empty())
def test_tasks_generated_when_upstream_done(self):
  # Simulate that ExampleGen has already completed successfully.
  otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1, 1)

  # Before generation, there's 1 execution.
  with self._mlmd_connection as m:
    executions = m.store.get_executions()
    self.assertLen(executions, 1)

  # Generate once.
  with self.subTest(generate=1):
    tasks, active_executions = self._generate_and_test(
        num_initial_executions=1,
        num_tasks_generated=1,
        num_new_executions=1,
        num_active_executions=1)
    self._verify_node_execution_task(self._transform, active_executions[0],
                                     tasks[0])

  # Should be fine to regenerate multiple times. There should be no new
  # effects.
  with self.subTest(generate=2):
    tasks, active_executions = self._generate_and_test(
        num_initial_executions=2,
        num_tasks_generated=1,
        num_new_executions=0,
        num_active_executions=1)
    self._verify_node_execution_task(self._transform, active_executions[0],
                                     tasks[0])
  with self.subTest(generate=3):
    tasks, active_executions = self._generate_and_test(
        num_initial_executions=2,
        num_tasks_generated=1,
        num_new_executions=0,
        num_active_executions=1)
    self._verify_node_execution_task(self._transform, active_executions[0],
                                     tasks[0])

  # Mark transform execution complete.
  otu.fake_transform_output(self._mlmd_connection, self._transform,
                            active_executions[0])

  # Trainer execution task should be generated when generate called again.
  with self.subTest(generate=4):
    tasks, active_executions = self._generate_and_test(
        num_initial_executions=2,
        num_tasks_generated=1,
        num_new_executions=1,
        num_active_executions=1)
    self._verify_node_execution_task(self._trainer, active_executions[0],
                                     tasks[0])

  # Mark the trainer execution complete.
  otu.fake_trainer_output(self._mlmd_connection, self._trainer,
                          active_executions[0])

  # No more components to execute so no tasks are generated.
  with self.subTest(generate=5):
    self._generate_and_test(
        num_initial_executions=3,
        num_tasks_generated=0,
        num_new_executions=0,
        num_active_executions=0)
def test_task_generation(self, use_task_queue):
  """Tests async pipeline task generation.

  Args:
    use_task_queue: If task queue is enabled, new tasks are only generated
      if a task with the same task_id does not already exist in the queue.
      `use_task_queue=False` is useful to test the case of task generation
      when the task queue is empty (e.g. due to orchestrator restart).
  """
  # Simulate that ExampleGen has already completed successfully.
  otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1, 1)

  # Before generation, there's 1 execution in MLMD.
  with self._mlmd_connection as m:
    executions = m.store.get_executions()
    self.assertLen(executions, 1)

  # Generate once.
  with self.subTest(generate=1):
    tasks, active_executions = self._generate_and_test(
        use_task_queue,
        num_initial_executions=1,
        num_tasks_generated=1,
        num_new_executions=1,
        num_active_executions=1)
    self._verify_exec_node_task(self._transform, active_executions[0].id,
                                tasks[0])

  # No new effects if generate called again.
  with self.subTest(generate=2):
    tasks, active_executions = self._generate_and_test(
        use_task_queue,
        num_initial_executions=2,
        num_tasks_generated=0 if use_task_queue else 1,
        num_new_executions=0,
        num_active_executions=1)
    execution_id = active_executions[0].id
    if not use_task_queue:
      self._verify_exec_node_task(self._transform, execution_id, tasks[0])

  # Mark transform execution complete.
  otu.fake_transform_output(self._mlmd_connection, self._transform,
                            active_executions[0])
  # Dequeue the corresponding task if task queue is enabled.
  self._dequeue_and_test(use_task_queue, self._transform,
                         active_executions[0].id)

  # Trainer execution task should be generated next.
  with self.subTest(generate=3):
    tasks, active_executions = self._generate_and_test(
        use_task_queue,
        num_initial_executions=2,
        num_tasks_generated=1,
        num_new_executions=1,
        num_active_executions=1)
    execution_id = active_executions[0].id
    self._verify_exec_node_task(self._trainer, execution_id, tasks[0])

  # Mark the trainer execution complete.
  otu.fake_trainer_output(self._mlmd_connection, self._trainer,
                          active_executions[0])
  # Dequeue the corresponding task if task queue is enabled.
  self._dequeue_and_test(use_task_queue, self._trainer, execution_id)

  # No more tasks should be generated as there are no new inputs.
  with self.subTest(generate=4):
    self._generate_and_test(
        use_task_queue,
        num_initial_executions=3,
        num_tasks_generated=0,
        num_new_executions=0,
        num_active_executions=0)

  # Fake another ExampleGen run.
  otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1, 1)

  # Both transform and trainer tasks should be generated as they both find
  # new inputs.
  with self.subTest(generate=5):
    tasks, active_executions = self._generate_and_test(
        use_task_queue,
        num_initial_executions=4,
        num_tasks_generated=2,
        num_new_executions=2,
        num_active_executions=2)
    self._verify_exec_node_task(self._transform, active_executions[0].id,
                                tasks[0])
    self._verify_exec_node_task(self._trainer, active_executions[1].id,
                                tasks[1])

  # Re-generation will produce the same tasks when the task queue is
  # disabled.
  with self.subTest(generate=6):
    tasks, active_executions = self._generate_and_test(
        use_task_queue,
        num_initial_executions=6,
        num_tasks_generated=0 if use_task_queue else 2,
        num_new_executions=0,
        num_active_executions=2)
    if not use_task_queue:
      self._verify_exec_node_task(self._transform, active_executions[0].id,
                                  tasks[0])
      self._verify_exec_node_task(self._trainer, active_executions[1].id,
                                  tasks[1])

  # Mark transform execution complete.
  otu.fake_transform_output(self._mlmd_connection, self._transform,
                            active_executions[0])
  # Dequeue the corresponding task.
  self._dequeue_and_test(use_task_queue, self._transform,
                         active_executions[0].id)

  # Mark the trainer execution complete.
  otu.fake_trainer_output(self._mlmd_connection, self._trainer,
                          active_executions[1])
  self._dequeue_and_test(use_task_queue, self._trainer,
                         active_executions[1].id)

  # Trainer should be triggered again due to transform producing new output.
  with self.subTest(generate=7):
    tasks, active_executions = self._generate_and_test(
        use_task_queue,
        num_initial_executions=6,
        num_tasks_generated=1,
        num_new_executions=1,
        num_active_executions=1)
    self._verify_exec_node_task(self._trainer, active_executions[0].id,
                                tasks[0])

  # Finally, no new tasks once trainer completes.
  otu.fake_trainer_output(self._mlmd_connection, self._trainer,
                          active_executions[0])
  # Dequeue the corresponding task.
  self._dequeue_and_test(use_task_queue, self._trainer,
                         active_executions[0].id)
  with self.subTest(generate=8):
    self._generate_and_test(
        use_task_queue,
        num_initial_executions=7,
        num_tasks_generated=0,
        num_new_executions=0,
        num_active_executions=0)
  if use_task_queue:
    self.assertTrue(self._task_queue.is_empty())
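# The tests above rely on a `_dequeue_and_test` helper. A minimal sketch of
# such a method (an assumption; the real helper may differ) dequeues the
# task when the queue is in use and checks that it targets the expected node
# and execution, mirroring the dequeue/task_done pattern used in
# test_node_failed above:
def _dequeue_and_test(self, use_task_queue, node, execution_id):
  if use_task_queue:
    task = self._task_queue.dequeue()
    self._task_queue.task_done(task)
    # The dequeued task should correspond to the just-completed execution.
    self.assertEqual(node.node_info.id, task.node_uid.node_id)
    self.assertEqual(execution_id, task.execution_id)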
def test_tasks_generated_when_upstream_done(self, use_task_queue):
  """Tests that tasks are generated when upstream is done.

  Args:
    use_task_queue: If task queue is enabled, new tasks are only generated
      if a task with the same task_id does not already exist in the queue.
      `use_task_queue=False` is useful to test the case of task generation
      when the task queue is empty (e.g. due to orchestrator restart).
  """
  # Simulate that ExampleGen has already completed successfully.
  otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1, 1)

  # Before generation, there's 1 execution.
  with self._mlmd_connection as m:
    executions = m.store.get_executions()
    self.assertLen(executions, 1)

  # Generate once.
  with self.subTest(generate=1):
    tasks, active_executions = self._generate_and_test(
        use_task_queue,
        num_initial_executions=1,
        num_tasks_generated=1,
        num_new_executions=1,
        num_active_executions=1)
    self._verify_exec_node_task(self._transform, active_executions[0].id,
                                tasks[0])

  # Should be fine to regenerate multiple times. There should be no new
  # effects.
  with self.subTest(generate=2):
    tasks, active_executions = self._generate_and_test(
        use_task_queue,
        num_initial_executions=2,
        num_tasks_generated=0 if use_task_queue else 1,
        num_new_executions=0,
        num_active_executions=1)
    if not use_task_queue:
      self._verify_exec_node_task(self._transform, active_executions[0].id,
                                  tasks[0])
  with self.subTest(generate=3):
    tasks, active_executions = self._generate_and_test(
        use_task_queue,
        num_initial_executions=2,
        num_tasks_generated=0 if use_task_queue else 1,
        num_new_executions=0,
        num_active_executions=1)
    execution_id = active_executions[0].id
    if not use_task_queue:
      self._verify_exec_node_task(self._transform, execution_id, tasks[0])

  # Mark transform execution complete.
  otu.fake_transform_output(self._mlmd_connection, self._transform,
                            active_executions[0])
  # Dequeue the corresponding task if task queue is enabled.
  self._dequeue_and_test(use_task_queue, self._transform, execution_id)

  # Trainer execution task should be generated when generate called again.
  with self.subTest(generate=4):
    tasks, active_executions = self._generate_and_test(
        use_task_queue,
        num_initial_executions=2,
        num_tasks_generated=1,
        num_new_executions=1,
        num_active_executions=1)
    execution_id = active_executions[0].id
    self._verify_exec_node_task(self._trainer, execution_id, tasks[0])

  # Mark the trainer execution complete.
  otu.fake_trainer_output(self._mlmd_connection, self._trainer,
                          active_executions[0])
  # Dequeue the corresponding task if task queue is enabled.
  self._dequeue_and_test(use_task_queue, self._trainer, execution_id)

  # No more components to execute so no tasks are generated.
  with self.subTest(generate=5):
    self._generate_and_test(
        use_task_queue,
        num_initial_executions=3,
        num_tasks_generated=0,
        num_new_executions=0,
        num_active_executions=0)
  if use_task_queue:
    self.assertTrue(self._task_queue.is_empty())
def test_tasks_generated_when_upstream_done(self, use_task_queue):
  """Tests that tasks are generated when upstream is done.

  Args:
    use_task_queue: If task queue is enabled, new tasks are only generated
      if a task with the same task_id does not already exist in the queue.
      `use_task_queue=False` is useful to test the case of task generation
      when the task queue is empty (e.g. due to orchestrator restart).
  """
  # Simulate that ExampleGen has already completed successfully.
  otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1, 1)

  def _ensure_node_services(unused_pipeline_state, node_id):
    self.assertEqual(self._example_gen.node_info.id, node_id)
    return service_jobs.ServiceStatus.SUCCESS

  self._mock_service_job_manager.ensure_node_services.side_effect = (
      _ensure_node_services)

  # Generate once.
  with self.subTest(generate=1):
    tasks, active_executions = self._generate_and_test(
        use_task_queue,
        num_initial_executions=1,
        num_tasks_generated=1,
        num_new_executions=1,
        num_active_executions=1)
    self._verify_exec_node_task(self._transform, active_executions[0].id,
                                tasks[0])
    self._mock_service_job_manager.ensure_node_services.assert_called()

  # Should be fine to regenerate multiple times. There should be no new
  # effects.
  with self.subTest(generate=2):
    tasks, active_executions = self._generate_and_test(
        use_task_queue,
        num_initial_executions=2,
        num_tasks_generated=0 if use_task_queue else 1,
        num_new_executions=0,
        num_active_executions=1)
    if not use_task_queue:
      self._verify_exec_node_task(self._transform, active_executions[0].id,
                                  tasks[0])
  with self.subTest(generate=3):
    tasks, active_executions = self._generate_and_test(
        use_task_queue,
        num_initial_executions=2,
        num_tasks_generated=0 if use_task_queue else 1,
        num_new_executions=0,
        num_active_executions=1)
    execution_id = active_executions[0].id
    if not use_task_queue:
      self._verify_exec_node_task(self._transform, execution_id, tasks[0])

  # Mark transform execution complete.
  otu.fake_transform_output(self._mlmd_connection, self._transform,
                            active_executions[0])
  # Dequeue the corresponding task if task queue is enabled.
  self._dequeue_and_test(use_task_queue, self._transform, execution_id)

  # Trainer execution task should be generated when generate called again.
  with self.subTest(generate=4):
    tasks, active_executions = self._generate_and_test(
        use_task_queue,
        num_initial_executions=2,
        num_tasks_generated=1,
        num_new_executions=1,
        num_active_executions=1)
    execution_id = active_executions[0].id
    self._verify_exec_node_task(self._trainer, execution_id, tasks[0])

  # Mark the trainer execution complete.
  otu.fake_trainer_output(self._mlmd_connection, self._trainer,
                          active_executions[0])
  # Dequeue the corresponding task if task queue is enabled.
  self._dequeue_and_test(use_task_queue, self._trainer, execution_id)

  # No more components to execute; FinalizePipelineTask should be generated.
  with self.subTest(generate=5):
    tasks, _ = self._generate_and_test(
        use_task_queue,
        num_initial_executions=3,
        num_tasks_generated=1,
        num_new_executions=0,
        num_active_executions=0)
    self.assertLen(tasks, 1)
    self.assertTrue(task_lib.is_finalize_pipeline_task(tasks[0]))
    self.assertEqual(status_lib.Code.OK, tasks[0].status.code)
  if use_task_queue:
    self.assertTrue(self._task_queue.is_empty())
def test_task_generation(self, use_task_queue):
  """Tests async pipeline task generation.

  Args:
    use_task_queue: If task queue is enabled, new tasks are only generated
      if a task with the same task_id does not already exist in the queue.
      `use_task_queue=False` is useful to test the case of task generation
      when the task queue is empty (e.g. due to orchestrator restart).
  """
  # Simulate that ExampleGen has already completed successfully.
  otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1, 1)

  def _ensure_node_services(unused_pipeline_state, node_id):
    self.assertEqual('my_example_gen', node_id)
    return service_jobs.ServiceStatus.RUNNING

  self._mock_service_job_manager.ensure_node_services.side_effect = (
      _ensure_node_services)

  # Generate once.
  tasks, active_executions = self._generate_and_test(
      use_task_queue,
      num_initial_executions=1,
      num_tasks_generated=1,
      num_new_executions=1,
      num_active_executions=1)
  self._verify_exec_node_task(self._transform, active_executions[0].id,
                              tasks[0])
  self._mock_service_job_manager.ensure_node_services.assert_called()

  # No new effects if generate called again.
  tasks, active_executions = self._generate_and_test(
      use_task_queue,
      num_initial_executions=2,
      num_tasks_generated=0 if use_task_queue else 1,
      num_new_executions=0,
      num_active_executions=1)
  execution_id = active_executions[0].id
  if not use_task_queue:
    self._verify_exec_node_task(self._transform, execution_id, tasks[0])

  # Mark transform execution complete.
  self._finish_node_execution(use_task_queue, self._transform,
                              active_executions[0])

  # Trainer execution task should be generated next.
  tasks, active_executions = self._generate_and_test(
      use_task_queue,
      num_initial_executions=2,
      num_tasks_generated=1,
      num_new_executions=1,
      num_active_executions=1)
  execution_id = active_executions[0].id
  self._verify_exec_node_task(self._trainer, execution_id, tasks[0])

  # Mark the trainer execution complete.
  self._finish_node_execution(use_task_queue, self._trainer,
                              active_executions[0])

  # No more tasks should be generated as there are no new inputs.
  self._generate_and_test(
      use_task_queue,
      num_initial_executions=3,
      num_tasks_generated=0,
      num_new_executions=0,
      num_active_executions=0)

  # Fake another ExampleGen run.
  otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1, 1)

  # Both transform and trainer tasks should be generated as they both find
  # new inputs.
  tasks, active_executions = self._generate_and_test(
      use_task_queue,
      num_initial_executions=4,
      num_tasks_generated=2,
      num_new_executions=2,
      num_active_executions=2)
  self._verify_exec_node_task(self._transform, active_executions[0].id,
                              tasks[0])
  self._verify_exec_node_task(self._trainer, active_executions[1].id,
                              tasks[1])

  # Re-generation will produce the same tasks when the task queue is
  # disabled.
  tasks, active_executions = self._generate_and_test(
      use_task_queue,
      num_initial_executions=6,
      num_tasks_generated=0 if use_task_queue else 2,
      num_new_executions=0,
      num_active_executions=2)
  if not use_task_queue:
    self._verify_exec_node_task(self._transform, active_executions[0].id,
                                tasks[0])
    self._verify_exec_node_task(self._trainer, active_executions[1].id,
                                tasks[1])

  # Mark transform execution complete.
  self._finish_node_execution(use_task_queue, self._transform,
                              active_executions[0])
  # Mark the trainer execution complete.
  self._finish_node_execution(use_task_queue, self._trainer,
                              active_executions[1])

  # Trainer should be triggered again due to transform producing new output.
  tasks, active_executions = self._generate_and_test(
      use_task_queue,
      num_initial_executions=6,
      num_tasks_generated=1,
      num_new_executions=1,
      num_active_executions=1)
  self._verify_exec_node_task(self._trainer, active_executions[0].id,
                              tasks[0])

  # Finally, no new tasks once trainer completes.
  self._finish_node_execution(use_task_queue, self._trainer,
                              active_executions[0])
  self._generate_and_test(
      use_task_queue,
      num_initial_executions=7,
      num_tasks_generated=0,
      num_new_executions=0,
      num_active_executions=0)
  if use_task_queue:
    self.assertTrue(self._task_queue.is_empty())
def test_task_generation(self, use_task_queue):
  """Tests async pipeline task generation.

  Args:
    use_task_queue: If task queue is enabled, new tasks are only generated
      if a task with the same task_id does not already exist in the queue.
      `use_task_queue=False` is useful to test the case of task generation
      when the task queue is empty (e.g. due to orchestrator restart).
  """
  # Simulate that ExampleGen has already completed successfully.
  test_utils.fake_example_gen_run(self._mlmd_connection, self._example_gen,
                                  1, 1)

  # Generate once.
  [update_example_gen_task, update_transform_task,
   exec_transform_task] = self._generate_and_test(
       use_task_queue,
       num_initial_executions=1,
       num_tasks_generated=3,
       num_new_executions=1,
       num_active_executions=1,
       expected_exec_nodes=[self._transform])
  self.assertTrue(
      task_lib.is_update_node_state_task(update_example_gen_task))
  self.assertEqual(pstate.NodeState.RUNNING, update_example_gen_task.state)
  self.assertTrue(task_lib.is_update_node_state_task(update_transform_task))
  self.assertEqual(pstate.NodeState.RUNNING, update_transform_task.state)
  self.assertTrue(task_lib.is_exec_node_task(exec_transform_task))
  self._mock_service_job_manager.ensure_node_services.assert_has_calls([
      mock.call(mock.ANY, self._example_gen.node_info.id),
      mock.call(mock.ANY, self._transform.node_info.id)
  ])

  # No new effects if generate called again.
  tasks = self._generate_and_test(
      use_task_queue,
      num_initial_executions=2,
      num_tasks_generated=1 if use_task_queue else 3,
      num_new_executions=0,
      num_active_executions=1,
      expected_exec_nodes=[] if use_task_queue else [self._transform])
  if not use_task_queue:
    exec_transform_task = tasks[2]

  # Mark transform execution complete.
  self._finish_node_execution(use_task_queue, exec_transform_task)

  # Trainer execution task should be generated next.
  [
      update_example_gen_task, update_transform_task, update_trainer_task,
      exec_trainer_task
  ] = self._generate_and_test(
      use_task_queue,
      num_initial_executions=2,
      num_tasks_generated=4,
      num_new_executions=1,
      num_active_executions=1,
      expected_exec_nodes=[self._trainer])
  self.assertTrue(
      task_lib.is_update_node_state_task(update_example_gen_task))
  self.assertEqual(pstate.NodeState.RUNNING, update_example_gen_task.state)
  self.assertTrue(task_lib.is_update_node_state_task(update_transform_task))
  self.assertEqual(pstate.NodeState.STARTED, update_transform_task.state)
  self.assertTrue(task_lib.is_update_node_state_task(update_trainer_task))
  self.assertEqual(pstate.NodeState.RUNNING, update_trainer_task.state)
  self.assertTrue(task_lib.is_exec_node_task(exec_trainer_task))

  # Mark the trainer execution complete.
  self._finish_node_execution(use_task_queue, exec_trainer_task)

  # Only UpdateNodeStateTasks are generated as there are no new inputs.
  tasks = self._generate_and_test(
      use_task_queue,
      num_initial_executions=3,
      num_tasks_generated=3,
      num_new_executions=0,
      num_active_executions=0)
  for task in tasks:
    self.assertTrue(task_lib.is_update_node_state_task(task))
  # tasks[0] is the update task for the example-gen service node.
  self.assertEqual(pstate.NodeState.RUNNING, tasks[0].state)

  # Fake another ExampleGen run.
  test_utils.fake_example_gen_run(self._mlmd_connection, self._example_gen,
                                  1, 1)

  # Both transform and trainer tasks should be generated as they both find
  # new inputs.
  [
      update_example_gen_task, update_transform_task, exec_transform_task,
      update_trainer_task, exec_trainer_task
  ] = self._generate_and_test(
      use_task_queue,
      num_initial_executions=4,
      num_tasks_generated=5,
      num_new_executions=2,
      num_active_executions=2,
      expected_exec_nodes=[self._transform, self._trainer])
  self.assertTrue(
      task_lib.is_update_node_state_task(update_example_gen_task))
  self.assertEqual(pstate.NodeState.RUNNING, update_example_gen_task.state)
  self.assertTrue(task_lib.is_update_node_state_task(update_transform_task))
  self.assertEqual(pstate.NodeState.RUNNING, update_transform_task.state)
  self.assertTrue(task_lib.is_exec_node_task(exec_transform_task))
  self.assertTrue(task_lib.is_update_node_state_task(update_trainer_task))
  self.assertEqual(pstate.NodeState.RUNNING, update_trainer_task.state)
  self.assertTrue(task_lib.is_exec_node_task(exec_trainer_task))

  # Re-generation will produce the same tasks when the task queue is
  # disabled.
  tasks = self._generate_and_test(
      use_task_queue,
      num_initial_executions=6,
      num_tasks_generated=1 if use_task_queue else 5,
      num_new_executions=0,
      num_active_executions=2,
      expected_exec_nodes=[]
      if use_task_queue else [self._transform, self._trainer])
  if not use_task_queue:
    self.assertTrue(task_lib.is_update_node_state_task(tasks[0]))
    self.assertEqual(pstate.NodeState.RUNNING, tasks[0].state)
    self.assertTrue(task_lib.is_update_node_state_task(tasks[1]))
    self.assertEqual(pstate.NodeState.RUNNING, tasks[1].state)
    self.assertTrue(task_lib.is_exec_node_task(tasks[2]))
    self.assertTrue(task_lib.is_update_node_state_task(tasks[3]))
    self.assertEqual(pstate.NodeState.RUNNING, tasks[3].state)
    self.assertTrue(task_lib.is_exec_node_task(tasks[4]))
    exec_transform_task = tasks[2]
    exec_trainer_task = tasks[4]
  else:
    self.assertTrue(task_lib.is_update_node_state_task(tasks[0]))
    self.assertEqual(pstate.NodeState.RUNNING, tasks[0].state)

  # Mark transform execution complete.
  self._finish_node_execution(use_task_queue, exec_transform_task)
  # Mark the trainer execution complete.
  self._finish_node_execution(use_task_queue, exec_trainer_task)

  # Trainer should be triggered again due to transform producing new output.
  [
      update_example_gen_task, update_transform_task, update_trainer_task,
      exec_trainer_task
  ] = self._generate_and_test(
      use_task_queue,
      num_initial_executions=6,
      num_tasks_generated=4,
      num_new_executions=1,
      num_active_executions=1,
      expected_exec_nodes=[self._trainer])
  self.assertTrue(
      task_lib.is_update_node_state_task(update_example_gen_task))
  self.assertEqual(pstate.NodeState.RUNNING, update_example_gen_task.state)
  self.assertTrue(task_lib.is_update_node_state_task(update_transform_task))
  self.assertEqual(pstate.NodeState.STARTED, update_transform_task.state)
  self.assertTrue(task_lib.is_update_node_state_task(update_trainer_task))
  self.assertEqual(pstate.NodeState.RUNNING, update_trainer_task.state)
  self.assertTrue(task_lib.is_exec_node_task(exec_trainer_task))

  # Finally, no new tasks once trainer completes.
  self._finish_node_execution(use_task_queue, exec_trainer_task)
  [update_example_gen_task, update_transform_task,
   update_trainer_task] = self._generate_and_test(
       use_task_queue,
       num_initial_executions=7,
       num_tasks_generated=3,
       num_new_executions=0,
       num_active_executions=0)
  self.assertTrue(
      task_lib.is_update_node_state_task(update_example_gen_task))
  self.assertEqual(pstate.NodeState.RUNNING, update_example_gen_task.state)
  self.assertTrue(task_lib.is_update_node_state_task(update_transform_task))
  self.assertEqual(pstate.NodeState.STARTED, update_transform_task.state)
  self.assertTrue(task_lib.is_update_node_state_task(update_trainer_task))
  self.assertEqual(pstate.NodeState.STARTED, update_trainer_task.state)
  if use_task_queue:
    self.assertTrue(self._task_queue.is_empty())
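# Hypothetical summary of the async node-state transitions asserted above
# (inferred from the assertions, not an authoritative state machine):
#   STARTED -> RUNNING  when new inputs yield an active execution.
#   RUNNING -> STARTED  when the execution completes and no new inputs exist.
# Service nodes such as example-gen appear to remain RUNNING while their
# service job is up.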
def test_tasks_generated_when_upstream_done(self, use_task_queue):
  """Tests that tasks are generated when upstream is done.

  Args:
    use_task_queue: If task queue is enabled, new tasks are only generated
      if a task with the same task_id does not already exist in the queue.
      `use_task_queue=False` is useful to test the case of task generation
      when the task queue is empty (e.g. due to orchestrator restart).
  """
  # Simulate that ExampleGen has already completed successfully.
  otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1, 1)

  # Generate once. Stats-gen task should be generated.
  [stats_gen_task] = self._generate_and_test(
      use_task_queue,
      num_initial_executions=1,
      num_tasks_generated=1,
      num_new_executions=1,
      num_active_executions=1,
      expected_exec_nodes=[self._stats_gen])
  self._mock_service_job_manager.ensure_node_services.assert_called_with(
      mock.ANY, self._example_gen.node_info.id)
  self._mock_service_job_manager.reset_mock()

  # Finish stats-gen execution.
  self._finish_node_execution(use_task_queue, stats_gen_task)

  # Schema-gen should execute next.
  [schema_gen_task] = self._generate_and_test(
      use_task_queue,
      num_initial_executions=2,
      num_tasks_generated=1,
      num_new_executions=1,
      num_active_executions=1,
      expected_exec_nodes=[self._schema_gen])

  # Finish schema-gen execution.
  self._finish_node_execution(use_task_queue, schema_gen_task)

  # Transform and ExampleValidator should both execute next.
  [example_validator_task, transform_task] = self._generate_and_test(
      use_task_queue,
      num_initial_executions=3,
      num_tasks_generated=2,
      num_new_executions=2,
      num_active_executions=2,
      expected_exec_nodes=[self._example_validator, self._transform])

  # Transform is a "mixed service node".
  self._mock_service_job_manager.ensure_node_services.assert_called_once_with(
      mock.ANY, self._transform.node_info.id)
  self._mock_service_job_manager.reset_mock()

  # Finish example-validator execution.
  self._finish_node_execution(use_task_queue, example_validator_task)

  # Since transform hasn't finished, trainer will not be triggered yet.
  tasks = self._generate_and_test(
      use_task_queue,
      num_initial_executions=5,
      num_tasks_generated=0 if use_task_queue else 1,
      num_new_executions=0,
      num_active_executions=1,
      expected_exec_nodes=[] if use_task_queue else [self._transform])
  if not use_task_queue:
    transform_task = tasks[0]

  # Finish transform execution.
  self._finish_node_execution(use_task_queue, transform_task)

  # Now all trainer upstream nodes are done, so trainer will be triggered.
  [trainer_task] = self._generate_and_test(
      use_task_queue,
      num_initial_executions=5,
      num_tasks_generated=1,
      num_new_executions=1,
      num_active_executions=1,
      expected_exec_nodes=[self._trainer])

  # Finish trainer execution.
  self._finish_node_execution(use_task_queue, trainer_task)

  # Test task-only dependencies: the chore_a and chore_b nodes have no input
  # or output specs but should still be executed in the DAG order.
  [chore_a_task] = self._generate_and_test(
      use_task_queue,
      num_initial_executions=6,
      num_tasks_generated=1,
      num_new_executions=1,
      num_active_executions=1,
      expected_exec_nodes=[self._chore_a])
  self._finish_node_execution(use_task_queue, chore_a_task)
  [chore_b_task] = self._generate_and_test(
      use_task_queue,
      num_initial_executions=7,
      num_tasks_generated=1,
      num_new_executions=1,
      num_active_executions=1,
      expected_exec_nodes=[self._chore_b])
  self._finish_node_execution(use_task_queue, chore_b_task)

  # No more components to execute; FinalizePipelineTask should be generated.
  [finalize_task] = self._generate_and_test(
      use_task_queue,
      num_initial_executions=8,
      num_tasks_generated=1,
      num_new_executions=0,
      num_active_executions=0)
  self.assertTrue(task_lib.is_finalize_pipeline_task(finalize_task))
  self.assertEqual(status_lib.Code.OK, finalize_task.status.code)
  if use_task_queue:
    self.assertTrue(self._task_queue.is_empty())
def test_task_generation(self):
  # Simulate that ExampleGen has already completed successfully.
  otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1, 1)

  # Before generation, there's 1 execution in MLMD.
  with self._mlmd_connection as m:
    executions = m.store.get_executions()
    self.assertLen(executions, 1)

  # Generate once.
  with self.subTest(generate=1):
    tasks, active_executions = self._generate_and_test(
        num_initial_executions=1,
        num_tasks_generated=1,
        num_new_executions=1,
        num_active_executions=1)
    self._verify_node_execution_task(self._transform, active_executions[0],
                                     tasks[0])

  # No new effects if generate called again.
  with self.subTest(generate=2):
    tasks, active_executions = self._generate_and_test(
        num_initial_executions=2,
        num_tasks_generated=1,
        num_new_executions=0,
        num_active_executions=1)
    self._verify_node_execution_task(self._transform, active_executions[0],
                                     tasks[0])

  # Mark transform execution complete.
  otu.fake_transform_output(self._mlmd_connection, self._transform,
                            active_executions[0])

  # Trainer execution task should be generated next.
  with self.subTest(generate=3):
    tasks, active_executions = self._generate_and_test(
        num_initial_executions=2,
        num_tasks_generated=1,
        num_new_executions=1,
        num_active_executions=1)
    self._verify_node_execution_task(self._trainer, active_executions[0],
                                     tasks[0])

  # Mark the trainer execution complete.
  otu.fake_trainer_output(self._mlmd_connection, self._trainer,
                          active_executions[0])

  # No more tasks should be generated as there are no new inputs.
  with self.subTest(generate=4):
    self._generate_and_test(
        num_initial_executions=3,
        num_tasks_generated=0,
        num_new_executions=0,
        num_active_executions=0)

  # Fake another ExampleGen run.
  otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1, 1)

  # Both transform and trainer tasks should be generated as they both find
  # new inputs.
  with self.subTest(generate=5):
    tasks, active_executions = self._generate_and_test(
        num_initial_executions=4,
        num_tasks_generated=2,
        num_new_executions=2,
        num_active_executions=2)
    self._verify_node_execution_task(self._transform, active_executions[0],
                                     tasks[0])
    self._verify_node_execution_task(self._trainer, active_executions[1],
                                     tasks[1])

  # Re-generation will produce the same tasks again.
  with self.subTest(generate=6):
    tasks, active_executions = self._generate_and_test(
        num_initial_executions=6,
        num_tasks_generated=2,
        num_new_executions=0,
        num_active_executions=2)
    self._verify_node_execution_task(self._transform, active_executions[0],
                                     tasks[0])
    self._verify_node_execution_task(self._trainer, active_executions[1],
                                     tasks[1])

  # Mark transform execution complete.
  otu.fake_transform_output(self._mlmd_connection, self._transform,
                            active_executions[0])
  # Mark the trainer execution complete.
  otu.fake_trainer_output(self._mlmd_connection, self._trainer,
                          active_executions[1])

  # Trainer should be triggered again due to transform producing new output.
  with self.subTest(generate=7):
    tasks, active_executions = self._generate_and_test(
        num_initial_executions=6,
        num_tasks_generated=1,
        num_new_executions=1,
        num_active_executions=1)
    self._verify_node_execution_task(self._trainer, active_executions[0],
                                     tasks[0])

  # Finally, no new tasks once trainer completes.
  otu.fake_trainer_output(self._mlmd_connection, self._trainer,
                          active_executions[0])
  with self.subTest(generate=8):
    self._generate_and_test(
        num_initial_executions=7,
        num_tasks_generated=0,
        num_new_executions=0,
        num_active_executions=0)
def test_pipeline_succeeds_when_terminal_nodes_succeed(
    self, use_task_queue, fail_fast):
  """Tests that the pipeline is finalized only after terminal nodes succeed.

  Args:
    use_task_queue: If task queue is enabled, new tasks are only generated
      if a task with the same task_id does not already exist in the queue.
      `use_task_queue=False` is useful to test the case of task generation
      when the task queue is empty (e.g. due to orchestrator restart).
    fail_fast: If `True`, the pipeline run is aborted immediately if any
      node fails.
  """
  # Check the expected terminal nodes.
  layers = sptg._topsorted_layers(self._pipeline)
  self.assertEqual(
      {
          self._example_validator.node_info.id,
          self._chore_b.node_info.id,
          # The evaluator execution will be skipped as it is run
          # conditionally and the condition always evaluates to False in the
          # current test.
          self._evaluator.node_info.id,
      },
      sptg._terminal_node_ids(layers))

  # Start executing the pipeline:
  test_utils.fake_example_gen_run(self._mlmd_connection, self._example_gen,
                                  1, 1)
  self._run_next(use_task_queue, expect_nodes=[self._stats_gen])
  self._run_next(use_task_queue, expect_nodes=[self._schema_gen])

  # Both example-validator and transform are ready to execute.
  [example_validator_task, transform_task] = self._generate(
      use_task_queue, True, fail_fast=fail_fast)
  self.assertEqual(self._example_validator.node_info.id,
                   example_validator_task.node_uid.node_id)
  self.assertEqual(self._transform.node_info.id,
                   transform_task.node_uid.node_id)

  # Start processing (but do not finish) example-validator.
  self._start_processing(use_task_queue, example_validator_task)

  # But finish transform, which is in the same layer.
  self._finish_node_execution(use_task_queue, transform_task)

  # Readability note: below, the example-validator task should continue to
  # be generated when not using the task queue, because its execution is
  # still active.

  # Trainer and downstream nodes can execute as transform is finished.
  self._run_next(
      use_task_queue,
      expect_nodes=[self._trainer]
      if use_task_queue else [self._example_validator, self._trainer],
      finish_nodes=[self._trainer],
      fail_fast=fail_fast)
  self._run_next(
      use_task_queue,
      expect_nodes=[self._chore_a]
      if use_task_queue else [self._example_validator, self._chore_a],
      finish_nodes=[self._chore_a],
      fail_fast=fail_fast)
  self._run_next(
      use_task_queue,
      expect_nodes=[self._chore_b]
      if use_task_queue else [self._example_validator, self._chore_b],
      finish_nodes=[self._chore_b],
      fail_fast=fail_fast)
  self._run_next(
      use_task_queue,
      expect_nodes=[] if use_task_queue else [self._example_validator],
      finish_nodes=[],
      fail_fast=fail_fast)

  # FinalizePipelineTask is generated only after example-validator finishes.
  test_utils.fake_execute_node(self._mlmd_connection, example_validator_task)
  self._finish_processing(use_task_queue, example_validator_task)
  [finalize_task] = self._generate(use_task_queue, True, fail_fast=fail_fast)
  self.assertTrue(task_lib.is_finalize_pipeline_task(finalize_task))
  self.assertEqual(status_lib.Code.OK, finalize_task.status.code)
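# The terminal-node computation asserted above can be approximated as: a
# node is terminal if it has no downstream nodes. Illustrative sketch over a
# simple adjacency mapping (an assumption; sptg._terminal_node_ids may
# additionally account for skipped conditional nodes such as the evaluator):
def _terminal_ids(downstream_map):
  """downstream_map: node_id -> list of downstream node_ids."""
  return {
      node_id for node_id, downstream in downstream_map.items()
      if not downstream
  }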
def setUp(self):
  super().setUp()
  pipeline_root = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self.id())

  # Makes sure multiple connections within a test always connect to the same
  # MLMD instance.
  metadata_path = os.path.join(pipeline_root, 'metadata', 'metadata.db')
  self._metadata_path = metadata_path
  connection_config = metadata.sqlite_metadata_connection_config(
      metadata_path)
  connection_config.sqlite.SetInParent()
  self._mlmd_connection = metadata.Metadata(
      connection_config=connection_config)

  # Sets up the pipeline.
  pipeline = test_async_pipeline.create_pipeline()

  # Extracts components.
  self._example_gen = pipeline.nodes[0].pipeline_node
  self._transform = pipeline.nodes[1].pipeline_node
  self._trainer = pipeline.nodes[2].pipeline_node

  # Pack deployment config for testing.
  deployment_config = pipeline_pb2.IntermediateDeploymentConfig()
  executor_spec = pipeline_pb2.ExecutorSpec.PythonClassExecutorSpec(
      class_path='fake.ClassPath')
  deployment_config.executor_specs[self._trainer.node_info.id].Pack(
      executor_spec)
  deployment_config.executor_specs[self._transform.node_info.id].Pack(
      executor_spec)
  self._type_url = deployment_config.executor_specs[
      self._trainer.node_info.id].type_url
  pipeline.deployment_config.Pack(deployment_config)
  self._pipeline = pipeline
  self._pipeline_info = pipeline.pipeline_info
  self._pipeline_runtime_spec = pipeline.runtime_spec
  self._pipeline_runtime_spec.pipeline_root.field_value.string_value = (
      pipeline_root)

  ts.TaskSchedulerRegistry.clear()
  self._task_queue = tq.TaskQueue()

  # Run fake example-gen to prepare downstream component triggers.
  test_utils.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1,
                                  1)

  # Task generator should produce two tasks for transform. The first one is
  # UpdateNodeStateTask and the second one is ExecNodeTask.
  with self._mlmd_connection as m:
    pipeline_state = pstate.PipelineState.new(m, self._pipeline)
    tasks = asptg.AsyncPipelineTaskGenerator(
        m, self._task_queue.contains_task_id,
        service_jobs.DummyServiceJobManager()).generate(pipeline_state)
  self.assertLen(tasks, 2)
  self.assertTrue(task_lib.is_update_node_state_task(tasks[0]))
  self.assertEqual(pstate.NodeState.RUNNING, tasks[0].state)
  self.assertEqual('my_transform', tasks[0].node_uid.node_id)
  self.assertTrue(task_lib.is_exec_node_task(tasks[1]))
  self.assertEqual('my_transform', tasks[1].node_uid.node_id)
  self.assertTrue(os.path.exists(tasks[1].stateful_working_dir))
  self.assertTrue(os.path.exists(tasks[1].tmp_dir))
  self._task = tasks[1]
  self._output_artifact_uri = self._task.output_artifacts[
      'transform_graph'][0].uri
  self.assertTrue(os.path.exists(self._output_artifact_uri))
  self._task_queue.enqueue(self._task)

  # There should be 1 active execution in MLMD.
  with self._mlmd_connection as m:
    executions = m.store.get_executions()
  active_executions = [
      e for e in executions
      if e.last_known_state == metadata_store_pb2.Execution.RUNNING
  ]
  self.assertLen(active_executions, 1)

  # Active execution id.
  self._execution_id = active_executions[0].id
def setUp(self):
  super().setUp()
  pipeline_root = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self.id())

  # Makes sure multiple connections within a test always connect to the same
  # MLMD instance.
  metadata_path = os.path.join(pipeline_root, 'metadata', 'metadata.db')
  self._metadata_path = metadata_path
  connection_config = metadata.sqlite_metadata_connection_config(
      metadata_path)
  connection_config.sqlite.SetInParent()
  self._mlmd_connection = metadata.Metadata(
      connection_config=connection_config)

  # Sets up the pipeline.
  pipeline = pipeline_pb2.Pipeline()
  self.load_proto_from_text(
      os.path.join(
          os.path.dirname(__file__), 'testdata', 'async_pipeline.pbtxt'),
      pipeline)

  # Extracts components.
  self._example_gen = pipeline.nodes[0].pipeline_node
  self._transform = pipeline.nodes[1].pipeline_node
  self._trainer = pipeline.nodes[2].pipeline_node

  # Pack deployment config for testing.
  deployment_config = pipeline_pb2.IntermediateDeploymentConfig()
  executor_spec = pipeline_pb2.ExecutorSpec.PythonClassExecutorSpec(
      class_path='fake.ClassPath')
  deployment_config.executor_specs[self._trainer.node_info.id].Pack(
      executor_spec)
  deployment_config.executor_specs[self._transform.node_info.id].Pack(
      executor_spec)
  self._type_url = deployment_config.executor_specs[
      self._trainer.node_info.id].type_url
  pipeline.deployment_config.Pack(deployment_config)
  self._pipeline = pipeline
  self._pipeline_info = pipeline.pipeline_info
  self._pipeline_runtime_spec = pipeline.runtime_spec
  self._pipeline_runtime_spec.pipeline_root.field_value.string_value = (
      pipeline_root)

  ts.TaskSchedulerRegistry.clear()
  self._task_queue = tq.TaskQueue()

  # Run fake example-gen to prepare downstream component triggers.
  test_utils.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1,
                                  1)

  # Task generator should produce a task to run transform.
  with self._mlmd_connection as m:
    pipeline_state = pstate.PipelineState(m, self._pipeline, 0)
    tasks = asptg.AsyncPipelineTaskGenerator(
        m, pipeline_state, self._task_queue.contains_task_id,
        service_jobs.DummyServiceJobManager()).generate()
  self.assertLen(tasks, 1)
  self._task = tasks[0]
  self.assertEqual('my_transform', self._task.node_uid.node_id)
  self._task_queue.enqueue(self._task)

  # There should be 1 active execution in MLMD.
  with self._mlmd_connection as m:
    executions = m.store.get_executions()
  active_executions = [
      e for e in executions
      if e.last_known_state == metadata_store_pb2.Execution.RUNNING
  ]
  self.assertLen(active_executions, 1)

  # Active execution id.
  self._execution_id = active_executions[0].id
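
# Illustrative sketch, not part of the tests: both setUp methods above count
# RUNNING executions through the pipeline's Metadata handle. The same check
# can be made directly against the backing SQLite database with public
# ml-metadata APIs; `metadata_path` is whatever path the test wrote to.
from ml_metadata import metadata_store
from ml_metadata.proto import metadata_store_pb2


def _sketch_count_active_executions(metadata_path):
  """Returns the number of RUNNING executions recorded in MLMD."""
  config = metadata_store_pb2.ConnectionConfig()
  config.sqlite.filename_uri = metadata_path
  config.sqlite.connection_mode = (
      metadata_store_pb2.SqliteMetadataSourceConfig.READWRITE_OPENCREATE)
  store = metadata_store.MetadataStore(config)
  return sum(1 for e in store.get_executions()
             if e.last_known_state == metadata_store_pb2.Execution.RUNNING)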
def test_conditional_execution(self, evaluate):
  """Tests conditionals in the pipeline.

  Args:
    evaluate: Whether to run the conditional evaluator.
  """
  # Check the expected terminal nodes.
  layers = sptg._topsorted_layers(self._pipeline)
  self.assertEqual(
      {
          self._example_validator.node_info.id,
          self._chore_b.node_info.id,
          self._evaluator.node_info.id,
      },
      sptg._terminal_node_ids(layers))

  # Start executing the pipeline:
  test_utils.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1,
                                  1)
  self._run_next(False, expect_nodes=[self._stats_gen])
  self._run_next(False, expect_nodes=[self._schema_gen])
  self._run_next(
      False, expect_nodes=[self._example_validator, self._transform])

  # Evaluator is run conditionally based on whether the Model artifact
  # produced by the trainer has a custom property evaluate=1.
  self._run_next(
      False,
      expect_nodes=[self._trainer],
      artifact_custom_properties={'evaluate': 1} if evaluate else None)

  tasks = self._generate(False)
  [evaluator_update_node_state_task] = [
      t for t in tasks if task_lib.is_update_node_state_task(t) and
      t.node_uid.node_id == 'my_evaluator'
  ]
  self.assertEqual(
      pstate.NodeState.RUNNING if evaluate else pstate.NodeState.SKIPPED,
      evaluator_update_node_state_task.state)

  exec_node_tasks = [t for t in tasks if task_lib.is_exec_node_task(t)]
  if evaluate:
    [chore_a_exec_node_task, evaluator_exec_node_task] = exec_node_tasks
    self.assertEqual('chore_a', chore_a_exec_node_task.node_uid.node_id)
    self.assertEqual('my_evaluator',
                     evaluator_exec_node_task.node_uid.node_id)
    self._finish_node_execution(False, chore_a_exec_node_task)
    self._finish_node_execution(False, evaluator_exec_node_task)
  else:
    [chore_a_exec_node_task] = exec_node_tasks
    self.assertEqual('chore_a', chore_a_exec_node_task.node_uid.node_id)
    self._finish_node_execution(False, chore_a_exec_node_task)

  self._run_next(False, expect_nodes=[self._chore_b])

  # All nodes executed, finalization task should be produced.
  [finalize_task] = self._generate(False, True)
  self.assertTrue(task_lib.is_finalize_pipeline_task(finalize_task))
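
# Illustrative sketch, not part of the test: the conditional above gates the
# evaluator on the trainer's Model artifact carrying custom property
# evaluate=1. TFX evaluates such predicates from compiled placeholder
# expressions; a hand-rolled equivalent over a raw MLMD artifact proto
# (metadata_store_pb2.Artifact) might look like this. The helper name is
# hypothetical.
def _sketch_should_run_evaluator(model_artifact):
  """True iff `model_artifact` has custom property evaluate == 1."""
  # Membership test first: indexing a proto map inserts a default entry.
  if 'evaluate' not in model_artifact.custom_properties:
    return False
  return model_artifact.custom_properties['evaluate'].int_value == 1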