def testStubbedTaxiPipelineLocal(self):
  # Run pipeline with stub executors.
  stub_component_launcher.StubComponentLauncher.initialize(
      test_data_dir=self._recorded_output_dir, test_component_ids=[])

  stub_pipeline_config = pipeline_config.PipelineConfig(
      supported_launcher_classes=[
          stub_component_launcher.StubComponentLauncher,
      ])
  local_dag_runner.LocalDagRunner(config=stub_pipeline_config).run(
      self.taxi_pipeline)

  self.assertTrue(fileio.exists(self._metadata_path))

  metadata_config = metadata.sqlite_metadata_connection_config(
      self._metadata_path)

  # Verify that recorded files are successfully copied to the output uris.
  with metadata.Metadata(metadata_config) as m:
    artifacts = m.store.get_artifacts()
    artifact_count = len(artifacts)
    executions = m.store.get_executions()
    execution_count = len(executions)
    # Artifact count is greater by 3 due to extra artifacts produced by
    # Evaluator(blessing and evaluation), Trainer(model and model_run) and
    # Transform(example, graph, cache) minus Resolver which doesn't generate
    # new artifacts.
    self.assertEqual(artifact_count, execution_count + 3)
    self.assertLen(self.taxi_pipeline.components, execution_count)

    for execution in executions:
      component_id = execution.properties[
          metadata._EXECUTION_TYPE_KEY_COMPONENT_ID].string_value  # pylint: disable=protected-access
      if component_id.startswith('ResolverNode'):
        continue
      eid = [execution.id]
      events = m.store.get_events_by_execution_ids(eid)
      output_events = [
          x for x in events if x.type == metadata_store_pb2.Event.OUTPUT
      ]
      for event in output_events:
        steps = event.path.steps
        self.assertTrue(steps[0].HasField('key'))
        name = steps[0].key
        artifacts = m.store.get_artifacts_by_id([event.artifact_id])
        for idx, artifact in enumerate(artifacts):
          self.assertDirectoryEqual(
              artifact.uri,
              os.path.join(self._recorded_output_dir, component_id, name,
                           str(idx)))

  # Calls verifier for pipeline output artifacts, excluding the resolver node.
  local_dag_runner.LocalDagRunner().run(self.taxi_pipeline)
  pipeline_outputs = executor_verifier_utils.get_pipeline_outputs(
      self.taxi_pipeline.metadata_connection_config,
      self.taxi_pipeline.pipeline_info)

  verifier_map = {
      'model': self._verify_model,
      'model_run': self._verify_model,
      'examples': self._verify_examples,
      'schema': self._verify_schema,
      'anomalies': self._verify_anomalies,
      'evaluation': self._verify_evaluation,
  }

  # List of components to verify. ResolverNode is ignored because it
  # doesn't have an executor.
  verify_component_ids = [
      component.id
      for component in self.taxi_pipeline.components
      if not component.id.startswith('ResolverNode')
  ]

  for component_id in verify_component_ids:
    logging.info('Verifying %s', component_id)
    for key, artifact_dict in pipeline_outputs[component_id].items():
      for idx, artifact in artifact_dict.items():
        recorded_uri = os.path.join(self._recorded_output_dir, component_id,
                                    key, str(idx))
        verifier_map.get(key, self._verify_file_path)(artifact.uri,
                                                      recorded_uri)
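# A minimal sketch of a custom verifier that could be registered in the
# verifier_map above. The name `_verify_file_count` is hypothetical (not part
# of TFX); it only illustrates the contract every verifier follows: accept the
# live artifact uri plus the recorded uri, and assert on whatever property
# should match between the two.
def _verify_file_count(self, output_uri: str, expected_uri: str) -> None:
  """Asserts both artifact directories contain the same number of files."""

  def _count_files(root: str) -> int:
    # Walk the local directory tree and total up the regular files.
    return sum(len(files) for _, _, files in os.walk(root))

  self.assertEqual(_count_files(output_uri), _count_files(expected_uri))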
def testStubbedTaxiPipelineBeam(self):
  pipeline_ir = compiler.Compiler().compile(self.taxi_pipeline)

  logging.info('Replacing with test_data_dir:%s', self._recorded_output_dir)
  pipeline_mock.replace_executor_with_stub(pipeline_ir,
                                           self._recorded_output_dir, [])

  BeamDagRunner().run_with_ir(pipeline_ir)

  self.assertTrue(fileio.exists(self._metadata_path))

  metadata_config = metadata.sqlite_metadata_connection_config(
      self._metadata_path)

  # Verify that recorded files are successfully copied to the output uris.
  with metadata.Metadata(metadata_config) as m:
    artifacts = m.store.get_artifacts()
    artifact_count = len(artifacts)
    executions = m.store.get_executions()
    execution_count = len(executions)
    # Artifact count is greater by 7 due to extra artifacts produced by
    # Evaluator(blessing and evaluation), Trainer(model and model_run) and
    # Transform(example, graph, cache, pre_transform_statistics,
    # pre_transform_schema, post_transform_statistics, post_transform_schema,
    # post_transform_anomalies) minus Resolver which doesn't generate new
    # artifacts.
    self.assertEqual(artifact_count, execution_count + 7)
    self.assertLen(self.taxi_pipeline.components, execution_count)

    for execution in executions:
      component_id = pipeline_recorder_utils.get_component_id_from_execution(
          m, execution)
      if component_id.startswith('Resolver'):
        continue
      eid = [execution.id]
      events = m.store.get_events_by_execution_ids(eid)
      output_events = [
          x for x in events if x.type == metadata_store_pb2.Event.OUTPUT
      ]
      for event in output_events:
        steps = event.path.steps
        self.assertTrue(steps[0].HasField('key'))
        name = steps[0].key
        artifacts = m.store.get_artifacts_by_id([event.artifact_id])
        for idx, artifact in enumerate(artifacts):
          self.assertDirectoryEqual(
              artifact.uri,
              os.path.join(self._recorded_output_dir, component_id, name,
                           str(idx)))

  # Calls verifier for pipeline output artifacts, excluding the resolver node.
  BeamDagRunner().run(self.taxi_pipeline)
  pipeline_outputs = executor_verifier_utils.get_pipeline_outputs(
      self.taxi_pipeline.metadata_connection_config, self._pipeline_name)

  verifier_map = {
      'model': self._verify_model,
      'model_run': self._verify_model,
      'examples': self._verify_examples,
      'schema': self._verify_schema,
      'anomalies': self._verify_anomalies,
      'evaluation': self._verify_evaluation,
      # A subdirectory of updated_analyzer_cache has a changing name.
      'updated_analyzer_cache': self._veryify_root_dir,
  }

  # List of components to verify. Resolver is ignored because it
  # doesn't have an executor.
  verify_component_ids = [
      component.id
      for component in self.taxi_pipeline.components
      if not component.id.startswith('Resolver')
  ]

  for component_id in verify_component_ids:
    logging.info('Verifying %s', component_id)
    for key, artifact_dict in pipeline_outputs[component_id].items():
      for idx, artifact in artifact_dict.items():
        recorded_uri = os.path.join(self._recorded_output_dir, component_id,
                                    key, str(idx))
        verifier_map.get(key, self._verify_file_path)(artifact.uri,
                                                      recorded_uri)
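# A sketch of partial stubbing. It assumes the third argument to
# pipeline_mock.replace_executor_with_stub is the list of component ids that
# keep their real executors; 'Trainer' is an assumed component id and the test
# name is hypothetical. Every other component replays recorded outputs, so
# only the component under test does real work.
def testStubbedTaxiPipelineWithRealTrainer(self):
  pipeline_ir = compiler.Compiler().compile(self.taxi_pipeline)
  # 'Trainer' runs with its real executor; all others are stubbed.
  pipeline_mock.replace_executor_with_stub(pipeline_ir,
                                           self._recorded_output_dir,
                                           ['Trainer'])
  BeamDagRunner().run_with_ir(pipeline_ir)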
def testStubbedImdbPipelineBeam(self):
  pipeline_ir = compiler.Compiler().compile(self.imdb_pipeline)

  pipeline_mock.replace_executor_with_stub(pipeline_ir,
                                           self._recorded_output_dir, [])

  BeamDagRunner().run_with_ir(pipeline_ir)

  self.assertTrue(fileio.exists(self._metadata_path))

  metadata_config = metadata.sqlite_metadata_connection_config(
      self._metadata_path)

  # Verify that recorded files are successfully copied to the output uris.
  with metadata.Metadata(metadata_config) as m:
    for execution in m.store.get_executions():
      component_id = pipeline_recorder_utils.get_component_id_from_execution(
          m, execution)
      if component_id.startswith('Resolver'):
        continue
      eid = [execution.id]
      events = m.store.get_events_by_execution_ids(eid)
      output_events = [
          x for x in events if x.type == metadata_store_pb2.Event.OUTPUT
      ]
      for event in output_events:
        steps = event.path.steps
        self.assertTrue(steps[0].HasField('key'))
        name = steps[0].key
        artifacts = m.store.get_artifacts_by_id([event.artifact_id])
        for idx, artifact in enumerate(artifacts):
          self.assertDirectoryEqual(
              artifact.uri,
              os.path.join(self._recorded_output_dir, component_id, name,
                           str(idx)))

  # Calls verifier for pipeline output artifacts, excluding the resolver node.
  BeamDagRunner().run(self.imdb_pipeline)
  pipeline_outputs = executor_verifier_utils.get_pipeline_outputs(
      self.imdb_pipeline.metadata_connection_config, self._pipeline_name)

  verifier_map = {
      'model': self._verify_model,
      'model_run': self._verify_model,
      'examples': self._verify_examples,
      'schema': self._verify_schema,
      'anomalies': self._verify_anomalies,
      'evaluation': self._verify_evaluation,
      # A subdirectory of updated_analyzer_cache has a changing name.
      'updated_analyzer_cache': self._veryify_root_dir,
  }

  # List of components to verify. Resolver is ignored because it
  # doesn't have an executor.
  verify_component_ids = [
      component.id
      for component in self.imdb_pipeline.components
      if not component.id.startswith('Resolver')
  ]

  for component_id in verify_component_ids:
    logging.info('Verifying %s', component_id)
    for key, artifact_dict in pipeline_outputs[component_id].items():
      for idx, artifact in artifact_dict.items():
        recorded_uri = os.path.join(self._recorded_output_dir, component_id,
                                    key, str(idx))
        verifier_map.get(key, self._verify_file_path)(artifact.uri,
                                                      recorded_uri)
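# For reference, a minimal sketch of what the assertDirectoryEqual helper used
# above might look like; the real helper lives on the shared test base class
# and may differ. It recursively checks that two directory trees contain the
# same entries with matching files, using only the standard library.
def assertDirectoryEqual(self, dir1: str, dir2: str) -> None:
  """Asserts two directory trees have the same structure and contents."""
  import filecmp  # local import to keep the sketch self-contained
  cmp = filecmp.dircmp(dir1, dir2)
  self.assertEmpty(cmp.left_only)    # no files present only in dir1
  self.assertEmpty(cmp.right_only)   # no files present only in dir2
  self.assertEmpty(cmp.diff_files)   # no common files that differ
  for subdir in cmp.common_dirs:
    self.assertDirectoryEqual(
        os.path.join(dir1, subdir), os.path.join(dir2, subdir))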
def testStubbedImdbPipelineBeam(self):
  # Run pipeline with stub executors that replay from self._recorded_output_dir.
  stub_component_launcher.StubComponentLauncher.initialize(
      test_data_dir=self._recorded_output_dir, test_component_ids=[])

  stub_pipeline_config = pipeline_config.PipelineConfig(
      supported_launcher_classes=[
          stub_component_launcher.StubComponentLauncher,
      ])
  BeamDagRunner(config=stub_pipeline_config).run(self.imdb_pipeline)

  self.assertTrue(fileio.exists(self._metadata_path))

  metadata_config = metadata.sqlite_metadata_connection_config(
      self._metadata_path)

  # Verify that recorded files are successfully copied to the output uris.
  with metadata.Metadata(metadata_config) as m:
    for execution in m.store.get_executions():
      component_id = execution.properties[
          metadata._EXECUTION_TYPE_KEY_COMPONENT_ID].string_value  # pylint: disable=protected-access
      if component_id.startswith('ResolverNode'):
        continue
      eid = [execution.id]
      events = m.store.get_events_by_execution_ids(eid)
      output_events = [
          x for x in events if x.type == metadata_store_pb2.Event.OUTPUT
      ]
      for event in output_events:
        steps = event.path.steps
        self.assertTrue(steps[0].HasField('key'))
        name = steps[0].key
        artifacts = m.store.get_artifacts_by_id([event.artifact_id])
        for idx, artifact in enumerate(artifacts):
          self.assertDirectoryEqual(
              artifact.uri,
              os.path.join(self._recorded_output_dir, component_id, name,
                           str(idx)))

  # Calls verifier for pipeline output artifacts, excluding the resolver node.
  BeamDagRunner().run(self.imdb_pipeline)
  pipeline_outputs = executor_verifier_utils.get_pipeline_outputs(
      self.imdb_pipeline.metadata_connection_config,
      self.imdb_pipeline.pipeline_info)

  verifier_map = {
      'model': self._verify_model,
      'model_run': self._verify_model,
      'examples': self._verify_examples,
      'schema': self._verify_schema,
      'anomalies': self._verify_anomalies,
      'evaluation': self._verify_evaluation,
  }

  # List of components to verify. ResolverNode is ignored because it
  # doesn't have an executor.
  verify_component_ids = [
      component.id
      for component in self.imdb_pipeline.components
      if not component.id.startswith('ResolverNode')
  ]

  for component_id in verify_component_ids:
    logging.info('Verifying %s', component_id)
    for key, artifact_dict in pipeline_outputs[component_id].items():
      for idx, artifact in artifact_dict.items():
        recorded_uri = os.path.join(self._recorded_output_dir, component_id,
                                    key, str(idx))
        verifier_map.get(key, self._verify_file_path)(artifact.uri,
                                                      recorded_uri)
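# The recorded test data consumed by these tests has to be produced once from
# a real pipeline run. A hedged sketch of that recording step follows: it
# assumes the record_pipeline signature from
# tfx.experimental.pipeline_testing.pipeline_recorder_utils (keyword names may
# differ across TFX versions), and all paths and names are placeholders.
def _record_pipeline_outputs_sketch():
  pipeline_recorder_utils.record_pipeline(
      output_dir='/path/to/recorded_output_dir',  # destination for copies
      metadata_db_uri='/path/to/metadata.db',     # MLMD store of the real run
      pipeline_name='imdb_pipeline',              # assumed pipeline name
      run_id=None)                                # None records the latest run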