def setUp(self):
  super(BaseExampleGenExecutorTest, self).setUp()
  output_data_dir = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName)

  # Create output dict.
  self._examples = standard_artifacts.Examples()
  self._examples.uri = output_data_dir
  self._output_dict = {utils.EXAMPLES_KEY: [self._examples]}

  self._train_output_file = os.path.join(
      self._examples.uri, 'train', 'data_tfrecord-00000-of-00001.gz')
  self._eval_output_file = os.path.join(
      self._examples.uri, 'eval', 'data_tfrecord-00000-of-00001.gz')

  # Create exec properties for output splits.
  self._exec_properties = {
      utils.INPUT_CONFIG_KEY:
          proto_utils.proto_to_json(
              example_gen_pb2.Input(splits=[
                  example_gen_pb2.Input.Split(name='single', pattern='single/*'),
              ])),
      utils.OUTPUT_CONFIG_KEY:
          proto_utils.proto_to_json(
              example_gen_pb2.Output(
                  split_config=example_gen_pb2.SplitConfig(splits=[
                      example_gen_pb2.SplitConfig.Split(
                          name='train', hash_buckets=2),
                      example_gen_pb2.SplitConfig.Split(
                          name='eval', hash_buckets=1)
                  ])))
  }
def testDoWithOutputExamplesSpecifiedSplits(self):
  self._exec_properties['data_spec'] = proto_utils.proto_to_json(
      text_format.Parse(
          """
              example_splits: 'unlabelled'
          """, bulk_inferrer_pb2.DataSpec()))
  self._exec_properties['output_example_spec'] = proto_utils.proto_to_json(
      text_format.Parse(
          """
              output_columns_spec {
                classify_output {
                  label_column: 'classify_label'
                  score_column: 'classify_score'
                }
              }
          """, bulk_inferrer_pb2.OutputExampleSpec()))

  # Run executor.
  bulk_inferrer = executor.Executor(self._context)
  bulk_inferrer.Do(self._input_dict, self._output_dict_oe,
                   self._exec_properties)

  # Check outputs.
  self.assertTrue(fileio.exists(self._output_examples_dir))
  self._verify_example_split('unlabelled')
  self.assertFalse(
      fileio.exists(os.path.join(self._output_examples_dir, 'unlabelled2')))
def setUp(self):
  super(ExecutorTest, self).setUp()

  # Setup Mocks
  patcher = mock.patch.object(request_builder, 'build_requests')
  self.build_requests_mock = patcher.start()
  self.addCleanup(patcher.stop)

  # Setup directories
  source_data_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')
  base_output_dir = os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR',
                                   self.get_temp_dir())
  output_data_dir = os.path.join(base_output_dir, self._testMethodName)

  # Setup input_dict.
  self._model = standard_artifacts.Model()
  self._model.uri = os.path.join(source_data_dir, 'trainer', 'current')
  self._model_path = path_utils.serving_model_path(self._model.uri)
  examples = standard_artifacts.Examples()
  examples.uri = os.path.join(source_data_dir, 'transform',
                              'transformed_examples', 'eval')
  examples.split_names = artifact_utils.encode_split_names(['eval'])

  self._input_dict = {
      MODEL_KEY: [self._model],
      EXAMPLES_KEY: [examples],
  }
  self._blessing = standard_artifacts.InfraBlessing()
  self._blessing.uri = os.path.join(output_data_dir, 'blessing')
  self._output_dict = {BLESSING_KEY: [self._blessing]}
  temp_dir = os.path.join(output_data_dir, '.temp')
  self._context = executor.Executor.Context(tmp_dir=temp_dir, unique_id='1')

  self._serving_spec = _make_serving_spec({
      'tensorflow_serving': {
          'tags': ['1.15.0']
      },
      'local_docker': {},
      'model_name': 'chicago-taxi',
  })
  self._serving_binary = serving_bins.parse_serving_binaries(
      self._serving_spec)[0]
  self._validation_spec = _make_validation_spec({
      'max_loading_time_seconds': 10,
      'num_tries': 3
  })
  self._request_spec = _make_request_spec({
      'tensorflow_serving': {
          'signature_names': ['serving_default'],
      },
      'num_examples': 1
  })
  self._exec_properties = {
      SERVING_SPEC_KEY: proto_utils.proto_to_json(self._serving_spec),
      VALIDATION_SPEC_KEY: proto_utils.proto_to_json(self._validation_spec),
      REQUEST_SPEC_KEY: proto_utils.proto_to_json(self._request_spec),
  }
def testDoWithCustomSplits(self):
  # Update input dict.
  io_utils.copy_dir(
      os.path.join(self._testdata_dir, 'penguin/data/train'),
      os.path.join(self._output_data_dir, 'data/training'))
  io_utils.copy_dir(
      os.path.join(self._testdata_dir, 'penguin/data/eval'),
      os.path.join(self._output_data_dir, 'data/evaluating'))
  examples = standard_artifacts.Examples()
  examples.uri = os.path.join(self._output_data_dir, 'data')
  examples.split_names = artifact_utils.encode_split_names(
      ['training', 'evaluating'])
  self._input_dict[standard_component_specs.EXAMPLES_KEY] = [examples]

  # Update exec properties skeleton with custom splits.
  self._exec_properties[
      standard_component_specs.TRAIN_ARGS_KEY] = proto_utils.proto_to_json(
          trainer_pb2.TrainArgs(splits=['training'], num_steps=1000))
  self._exec_properties[
      standard_component_specs.EVAL_ARGS_KEY] = proto_utils.proto_to_json(
          trainer_pb2.EvalArgs(splits=['evaluating'], num_steps=500))
  self._exec_properties[
      standard_component_specs.MODULE_FILE_KEY] = os.path.join(
          self._testdata_dir, 'module_file', 'tuner_module.py')

  tuner = executor.Executor(self._context)
  tuner.Do(
      input_dict=self._input_dict,
      output_dict=self._output_dict,
      exec_properties=self._exec_properties)
  self._verify_output()
def testDoWithCustomSplits(self):
  # Update input dict.
  io_utils.copy_dir(
      os.path.join(self._source_data_dir,
                   'transform/transformed_examples/data/train'),
      os.path.join(self._output_data_dir, 'data/training'))
  io_utils.copy_dir(
      os.path.join(self._source_data_dir,
                   'transform/transformed_examples/data/eval'),
      os.path.join(self._output_data_dir, 'data/evaluating'))
  examples = standard_artifacts.Examples()
  examples.uri = os.path.join(self._output_data_dir, 'data')
  examples.split_names = artifact_utils.encode_split_names(
      ['training', 'evaluating'])
  self._input_dict[standard_component_specs.EXAMPLES_KEY] = [examples]

  # Update exec properties skeleton with custom splits.
  self._exec_properties[
      standard_component_specs.TRAIN_ARGS_KEY] = proto_utils.proto_to_json(
          trainer_pb2.TrainArgs(splits=['training'], num_steps=1000))
  self._exec_properties[
      standard_component_specs.EVAL_ARGS_KEY] = proto_utils.proto_to_json(
          trainer_pb2.EvalArgs(splits=['evaluating'], num_steps=500))
  self._exec_properties[
      standard_component_specs.MODULE_FILE_KEY] = self._module_file

  self._do(self._trainer_executor)
  self._verify_model_exports()
  self._verify_model_run_exports()
def testGetCommonFnArgs(self):
  source_data_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')

  # Create input dict.
  examples = standard_artifacts.Examples()
  examples.uri = os.path.join(source_data_dir,
                              'transform/transformed_examples')
  examples.split_names = artifact_utils.encode_split_names(['train', 'eval'])

  transform_output = standard_artifacts.TransformGraph()
  transform_output.uri = os.path.join(source_data_dir,
                                      'transform/transform_graph')

  schema = standard_artifacts.Schema()
  schema.uri = os.path.join(source_data_dir, 'schema_gen')

  base_model = standard_artifacts.Model()
  base_model.uri = os.path.join(source_data_dir, 'trainer/previous')

  input_dict = {
      standard_component_specs.EXAMPLES_KEY: [examples],
      standard_component_specs.TRANSFORM_GRAPH_KEY: [transform_output],
      standard_component_specs.SCHEMA_KEY: [schema],
      standard_component_specs.BASE_MODEL_KEY: [base_model],
  }

  # Create exec properties skeleton.
  exec_properties = {
      'train_args':
          proto_utils.proto_to_json(trainer_pb2.TrainArgs(num_steps=1000)),
      'eval_args':
          proto_utils.proto_to_json(trainer_pb2.EvalArgs(num_steps=500)),
  }

  fn_args = fn_args_utils.get_common_fn_args(input_dict, exec_properties,
                                             'tempdir')
  self.assertEqual(fn_args.working_dir, 'tempdir')
  self.assertEqual(fn_args.train_steps, 1000)
  self.assertEqual(fn_args.eval_steps, 500)
  self.assertLen(fn_args.train_files, 1)
  self.assertEqual(fn_args.train_files[0],
                   os.path.join(examples.uri, 'Split-train', '*'))
  self.assertLen(fn_args.eval_files, 1)
  self.assertEqual(fn_args.eval_files[0],
                   os.path.join(examples.uri, 'Split-eval', '*'))
  self.assertEqual(fn_args.schema_path,
                   os.path.join(schema.uri, 'schema.pbtxt'))
  # Depending on execution environment, the base model may have been stored
  # at .../Format-Servo/... or .../Format-Serving/... directory patterns.
  self.assertRegex(
      fn_args.base_model,
      os.path.join(base_model.uri,
                   r'Format-(Servo|Serving)/export/chicago-taxi/\d+'))
  self.assertEqual(fn_args.transform_graph_path, transform_output.uri)
  self.assertIsInstance(fn_args.data_accessor, fn_args_utils.DataAccessor)
def testQueryBasedDriver(self):
  # Create exec properties.
  exec_properties = {
      standard_component_specs.INPUT_CONFIG_KEY:
          proto_utils.proto_to_json(
              example_gen_pb2.Input(splits=[
                  example_gen_pb2.Input.Split(
                      name='s1',
                      pattern="select * from table where span={SPAN} and split='s1'"
                  ),
                  example_gen_pb2.Input.Split(
                      name='s2',
                      pattern="select * from table where span={SPAN} and split='s2'")
              ])),
      standard_component_specs.RANGE_CONFIG_KEY:
          proto_utils.proto_to_json(
              range_config_pb2.RangeConfig(
                  static_range=range_config_pb2.StaticRange(
                      start_span_number=2, end_span_number=2))),
  }
  # Prepare output_dict
  example = standard_artifacts.Examples()
  example.uri = 'my_uri'
  output_dict = {standard_component_specs.EXAMPLES_KEY: [example]}

  query_based_driver = driver.QueryBasedDriver(self._mock_metadata)
  result = query_based_driver.run(
      portable_data_types.ExecutionInfo(
          output_dict=output_dict, exec_properties=exec_properties))

  self.assertEqual(exec_properties[utils.SPAN_PROPERTY_NAME], 2)
  self.assertIsNone(exec_properties[utils.VERSION_PROPERTY_NAME])
  self.assertIsNone(exec_properties[utils.FINGERPRINT_PROPERTY_NAME])
  updated_input_config = example_gen_pb2.Input()
  proto_utils.json_to_proto(
      exec_properties[standard_component_specs.INPUT_CONFIG_KEY],
      updated_input_config)
  self.assertProtoEquals(
      """
      splits {
        name: "s1"
        pattern: "select * from table where span=2 and split='s1'"
      }
      splits {
        name: "s2"
        pattern: "select * from table where span=2 and split='s2'"
      }""", updated_input_config)
  self.assertLen(
      result.output_artifacts[
          standard_component_specs.EXAMPLES_KEY].artifacts, 1)
  output_example = result.output_artifacts[
      standard_component_specs.EXAMPLES_KEY].artifacts[0]
  self.assertEqual(output_example.uri, example.uri)
  self.assertEqual(
      output_example.custom_properties[
          utils.SPAN_PROPERTY_NAME].string_value, '2')
def testDo(self, mock_client):
  # Mock query result schema for _BigQueryConverter.
  mock_client.return_value.query.return_value.result.return_value.schema = self._schema

  output_data_dir = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName)

  # Create output dict.
  examples = standard_artifacts.Examples()
  examples.uri = output_data_dir
  output_dict = {'examples': [examples]}

  # Create exec properties.
  exec_properties = {
      'input_config':
          proto_utils.proto_to_json(
              example_gen_pb2.Input(splits=[
                  example_gen_pb2.Input.Split(
                      name='bq', pattern='SELECT i, b, f, s FROM `fake`'),
              ])),
      'output_config':
          proto_utils.proto_to_json(
              example_gen_pb2.Output(
                  split_config=example_gen_pb2.SplitConfig(splits=[
                      example_gen_pb2.SplitConfig.Split(
                          name='train', hash_buckets=2),
                      example_gen_pb2.SplitConfig.Split(
                          name='eval', hash_buckets=1)
                  ])))
  }

  # Run executor.
  big_query_example_gen = executor.Executor(
      base_beam_executor.BaseBeamExecutor.Context(
          beam_pipeline_args=['--project=test-project']))
  big_query_example_gen.Do({}, output_dict, exec_properties)

  mock_client.assert_called_with(project='test-project')
  self.assertEqual(
      artifact_utils.encode_split_names(['train', 'eval']),
      examples.split_names)

  # Check BigQuery example gen outputs.
  train_output_file = os.path.join(examples.uri, 'Split-train',
                                   'data_tfrecord-00000-of-00001.gz')
  eval_output_file = os.path.join(examples.uri, 'Split-eval',
                                  'data_tfrecord-00000-of-00001.gz')
  self.assertTrue(fileio.exists(train_output_file))
  self.assertTrue(fileio.exists(eval_output_file))
  self.assertGreater(
      fileio.open(train_output_file).size(),
      fileio.open(eval_output_file).size())
def setUp(self):
  super().setUp()
  self._testdata_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')
  self._output_data_dir = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName)
  self._context = executor.Executor.Context(
      tmp_dir=self._output_data_dir, unique_id='1')

  # Create input dict.
  e1 = standard_artifacts.Examples()
  e1.uri = os.path.join(self._testdata_dir, 'penguin', 'data')
  e1.split_names = artifact_utils.encode_split_names(['train', 'eval'])
  e2 = copy.deepcopy(e1)

  self._single_artifact = [e1]
  self._multiple_artifacts = [e1, e2]

  schema = standard_artifacts.Schema()
  schema.uri = os.path.join(self._testdata_dir, 'penguin', 'schema')

  base_model = standard_artifacts.Model()
  base_model.uri = os.path.join(self._testdata_dir, 'trainer/previous')

  self._input_dict = {
      standard_component_specs.EXAMPLES_KEY: self._single_artifact,
      standard_component_specs.SCHEMA_KEY: [schema],
      standard_component_specs.BASE_MODEL_KEY: [base_model]
  }

  # Create output dict.
  self._best_hparams = standard_artifacts.Model()
  self._best_hparams.uri = os.path.join(self._output_data_dir, 'best_hparams')

  self._output_dict = {
      standard_component_specs.BEST_HYPERPARAMETERS_KEY: [self._best_hparams],
  }

  # Create exec properties.
  self._exec_properties = {
      standard_component_specs.TRAIN_ARGS_KEY:
          proto_utils.proto_to_json(trainer_pb2.TrainArgs(num_steps=100)),
      standard_component_specs.EVAL_ARGS_KEY:
          proto_utils.proto_to_json(trainer_pb2.EvalArgs(num_steps=50)),
  }
def testDo(self):
  output_data_dir = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName)

  # Create output dict.
  examples = standard_artifacts.Examples()
  examples.uri = output_data_dir
  output_dict = {utils.EXAMPLES_KEY: [examples]}

  # Create exec properties.
  exec_properties = {
      utils.INPUT_BASE_KEY:
          self._input_data_dir,
      utils.INPUT_CONFIG_KEY:
          proto_utils.proto_to_json(
              example_gen_pb2.Input(splits=[
                  example_gen_pb2.Input.Split(
                      name='parquet', pattern='parquet/*'),
              ])),
      utils.OUTPUT_CONFIG_KEY:
          proto_utils.proto_to_json(
              example_gen_pb2.Output(
                  split_config=example_gen_pb2.SplitConfig(splits=[
                      example_gen_pb2.SplitConfig.Split(
                          name='train', hash_buckets=2),
                      example_gen_pb2.SplitConfig.Split(
                          name='eval', hash_buckets=1)
                  ])))
  }

  # Run executor.
  parquet_example_gen = parquet_executor.Executor()
  parquet_example_gen.Do({}, output_dict, exec_properties)

  self.assertEqual(
      artifact_utils.encode_split_names(['train', 'eval']),
      examples.split_names)

  # Check Parquet example gen outputs.
  train_output_file = os.path.join(examples.uri, 'train',
                                   'data_tfrecord-00000-of-00001.gz')
  eval_output_file = os.path.join(examples.uri, 'eval',
                                  'data_tfrecord-00000-of-00001.gz')
  self.assertTrue(fileio.exists(train_output_file))
  self.assertTrue(fileio.exists(eval_output_file))
  self.assertGreater(
      fileio.open(train_output_file).size(),
      fileio.open(eval_output_file).size())
def testDoInputSplit(self):
  # Create exec properties for input split.
  self._exec_properties = {
      standard_component_specs.INPUT_CONFIG_KEY:
          proto_utils.proto_to_json(
              example_gen_pb2.Input(splits=[
                  example_gen_pb2.Input.Split(name='train', pattern='train/*'),
                  example_gen_pb2.Input.Split(name='eval', pattern='eval/*')
              ])),
      standard_component_specs.OUTPUT_CONFIG_KEY:
          proto_utils.proto_to_json(example_gen_pb2.Output())
  }

  self._testDo()
def testGetCommonFnArgs(self):
  source_data_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')

  # Create input dict.
  examples = standard_artifacts.Examples()
  examples.uri = os.path.join(source_data_dir,
                              'transform/transformed_examples')
  examples.split_names = artifact_utils.encode_split_names(['train', 'eval'])

  transform_output = standard_artifacts.TransformGraph()
  transform_output.uri = os.path.join(source_data_dir,
                                      'transform/transform_graph')

  schema = standard_artifacts.Schema()
  schema.uri = os.path.join(source_data_dir, 'schema_gen')

  input_dict = {
      standard_component_specs.EXAMPLES_KEY: [examples],
      standard_component_specs.TRANSFORM_GRAPH_KEY: [transform_output],
      standard_component_specs.SCHEMA_KEY: [schema],
  }

  # Create exec properties skeleton.
  exec_properties = {
      'train_args':
          proto_utils.proto_to_json(trainer_pb2.TrainArgs(num_steps=1000)),
      'eval_args':
          proto_utils.proto_to_json(trainer_pb2.EvalArgs(num_steps=500)),
  }

  fn_args = fn_args_utils.get_common_fn_args(input_dict, exec_properties,
                                             'tempdir')
  self.assertEqual(fn_args.working_dir, 'tempdir')
  self.assertEqual(fn_args.train_steps, 1000)
  self.assertEqual(fn_args.eval_steps, 500)
  self.assertLen(fn_args.train_files, 1)
  self.assertEqual(fn_args.train_files[0],
                   os.path.join(examples.uri, 'train', '*'))
  self.assertLen(fn_args.eval_files, 1)
  self.assertEqual(fn_args.eval_files[0],
                   os.path.join(examples.uri, 'eval', '*'))
  self.assertEqual(fn_args.schema_path,
                   os.path.join(schema.uri, 'schema.pbtxt'))
  self.assertEqual(fn_args.transform_graph_path, transform_output.uri)
  self.assertIsInstance(fn_args.data_accessor, fn_args_utils.DataAccessor)
def testDo(self):
  output_data_dir = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName)

  # Create output dict.
  examples = standard_artifacts.Examples()
  examples.uri = output_data_dir
  output_dict = {'examples': [examples]}

  # Create exec properties.
  exec_properties = {
      'input_config':
          proto_utils.proto_to_json(
              example_gen_pb2.Input(splits=[
                  example_gen_pb2.Input.Split(
                      name='bq', pattern='SELECT i, f, s FROM `fake`'),
              ])),
      'custom_config':
          proto_utils.proto_to_json(example_gen_pb2.CustomConfig()),
      'output_config':
          proto_utils.proto_to_json(
              example_gen_pb2.Output(
                  split_config=example_gen_pb2.SplitConfig(splits=[
                      example_gen_pb2.SplitConfig.Split(
                          name='train', hash_buckets=2),
                      example_gen_pb2.SplitConfig.Split(
                          name='eval', hash_buckets=1)
                  ]))),
  }

  # Run executor.
  presto_example_gen = executor.Executor()
  presto_example_gen.Do({}, output_dict, exec_properties)

  self.assertEqual(
      artifact_utils.encode_split_names(['train', 'eval']),
      examples.split_names)

  # Check Presto example gen outputs.
  train_output_file = os.path.join(examples.uri, 'Split-train',
                                   'data_tfrecord-00000-of-00001.gz')
  eval_output_file = os.path.join(examples.uri, 'Split-eval',
                                  'data_tfrecord-00000-of-00001.gz')
  self.assertTrue(fileio.exists(train_output_file))
  self.assertTrue(fileio.exists(eval_output_file))
  self.assertGreater(
      fileio.open(train_output_file).size(),
      fileio.open(eval_output_file).size())
def testDoWithTuneArgsAndTrainingInputOverride(self):
  executor = ai_platform_tuner_executor.Executor()
  self._exec_properties['tune_args'] = proto_utils.proto_to_json(
      tuner_pb2.TuneArgs(num_parallel_trials=6))

  self._exec_properties['custom_config'][
      ai_platform_trainer_executor.TRAINING_ARGS_KEY].update({
          'scaleTier': 'CUSTOM',
          'masterType': 'n1-highmem-16',
          'workerType': 'n1-highmem-16',
          'workerCount': 2,
      })

  executor.Do(self._inputs, self._outputs,
              self._serialize_custom_config_under_test())

  self.mock_runner.start_aip_training.assert_called_with(
      self._inputs,
      self._outputs,
      self._serialize_custom_config_under_test(),
      self._executor_class_path,
      {
          'project': self._project_id,
          'jobDir': self._job_dir,
          # Confirm scale tier and machine types are not overwritten.
          'scaleTier': 'CUSTOM',
          'masterType': 'n1-highmem-16',
          'workerType': 'n1-highmem-16',
          # Confirm workerCount has been adjusted to num_parallel_trials.
          'workerCount': 5,
      },
      mock.ANY)
def testDoSkippedModelCreation(self, mock_runner, mock_run_model_inference,
                               _):
  input_dict = {
      'examples': [self._examples],
      'model': [self._model],
      'model_blessing': [self._model_blessing],
  }
  output_dict = {
      'inference_result': [self._inference_result],
  }
  ai_platform_serving_args = {
      'model_name': 'model_name',
      'project_id': 'project_id'
  }
  # Create exec properties.
  exec_properties = {
      'data_spec':
          proto_utils.proto_to_json(bulk_inferrer_pb2.DataSpec()),
      'custom_config':
          json_utils.dumps(
              {executor.SERVING_ARGS_KEY: ai_platform_serving_args}),
  }
  mock_runner.get_service_name_and_api_version.return_value = ('ml', 'v1')
  mock_runner.create_model_for_aip_prediction_if_not_exist.return_value = False

  # Run executor.
  bulk_inferrer = executor.Executor(self._context)
  bulk_inferrer.Do(input_dict, output_dict, exec_properties)

  ai_platform_prediction_model_spec = (
      model_spec_pb2.AIPlatformPredictionModelSpec(
          project_id='project_id',
          model_name='model_name',
          version_name=self._model_version))
  ai_platform_prediction_model_spec.use_serialization_config = True
  inference_endpoint = model_spec_pb2.InferenceSpecType()
  inference_endpoint.ai_platform_prediction_model_spec.CopyFrom(
      ai_platform_prediction_model_spec)
  mock_run_model_inference.assert_called_once_with(mock.ANY, mock.ANY,
                                                   mock.ANY, mock.ANY,
                                                   mock.ANY,
                                                   inference_endpoint)

  executor_class_path = '%s.%s' % (bulk_inferrer.__class__.__module__,
                                   bulk_inferrer.__class__.__name__)
  with telemetry_utils.scoped_labels(
      {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}):
    job_labels = telemetry_utils.make_labels_dict()
  mock_runner.deploy_model_for_aip_prediction.assert_called_once_with(
      serving_path=path_utils.serving_model_path(self._model.uri),
      model_version_name=mock.ANY,
      ai_platform_serving_args=ai_platform_serving_args,
      labels=job_labels,
      api=mock.ANY,
      skip_model_endpoint_creation=True,
      set_default=False)
  mock_runner.delete_model_from_aip_if_exists.assert_called_once_with(
      model_version_name=mock.ANY,
      ai_platform_serving_args=ai_platform_serving_args,
      api=mock.ANY,
      delete_model_endpoint=False)
def get_value_and_set_type(
    value: types.ExecPropertyTypes,
    value_type: pipeline_pb2.Value.Schema.ValueType) -> types.Property:
  """Returns serialized value and sets value_type."""
  if isinstance(value, bool):
    if set_schema:
      value_type.boolean_type.SetInParent()
    return value
  elif isinstance(value, message.Message):
    # TODO(b/171794016): Investigate if a file descriptor set is needed for
    # tfx-owned protos already built into the launcher binary.
    if set_schema:
      proto_type = value_type.proto_type
      proto_type.message_type = type(value).DESCRIPTOR.full_name
      proto_utils.build_file_descriptor_set(value, proto_type.file_descriptors)
    return proto_utils.proto_to_json(value)
  elif isinstance(value, list) and len(value):
    if set_schema:
      value_type.list_type.SetInParent()
    value = [
        get_value_and_set_type(val, value_type.list_type) for val in value
    ]
    return json_utils.dumps(value)
  elif isinstance(value, (int, float, str)):
    return value
  else:
    raise ValueError('Unexpected type %s' % type(value))
def _parse_parameters(self, raw_args: Mapping[str, Any]):
  """Parse arguments to ComponentSpec."""
  unparsed_args = set(raw_args.keys())
  inputs = {}
  outputs = {}
  self.exec_properties = {}

  # First, check that the arguments are set.
  for arg_name, arg in itertools.chain(self.PARAMETERS.items(),
                                       self.INPUTS.items(),
                                       self.OUTPUTS.items()):
    if arg_name not in unparsed_args:
      if arg.optional:
        continue
      else:
        raise ValueError('Missing argument %r to %s.' %
                         (arg_name, self.__class__))
    unparsed_args.remove(arg_name)

    # Type check the argument.
    value = raw_args[arg_name]
    if arg.optional and value is None:
      continue
    arg.type_check(arg_name, value)

  # Populate the appropriate dictionary for each parameter type.
  for arg_name, arg in self.PARAMETERS.items():
    if arg.optional and arg_name not in raw_args:
      continue
    value = raw_args[arg_name]

    if (inspect.isclass(arg.type) and
        issubclass(arg.type, message.Message) and value and
        not _is_runtime_param(value)):
      if arg.use_proto:
        if isinstance(value, dict):
          value = proto_utils.dict_to_proto(value, arg.type())
        elif isinstance(value, str):
          value = proto_utils.json_to_proto(value, arg.type())
      else:
        # Create deterministic json string as it will be stored in metadata
        # for cache check.
        if isinstance(value, dict):
          value = json_utils.dumps(value)
        elif not isinstance(value, str):
          value = proto_utils.proto_to_json(value)

    self.exec_properties[arg_name] = value

  for arg_dict, param_dict in ((self.INPUTS, inputs), (self.OUTPUTS,
                                                       outputs)):
    for arg_name, arg in arg_dict.items():
      if arg.optional and not raw_args.get(arg_name):
        continue
      value = raw_args[arg_name]
      param_dict[arg_name] = value

  self.inputs = inputs
  self.outputs = outputs
def setUp(self):
  super(ExecutorTest, self).setUp()
  self._input_data_dir = os.path.join(
      os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'testdata',
      'external')

  # Create values in exec_properties
  self._input_config = proto_utils.proto_to_json(
      example_gen_pb2.Input(splits=[
          example_gen_pb2.Input.Split(name='tfrecord', pattern='tfrecord/*'),
      ]))
  self._output_config = proto_utils.proto_to_json(
      example_gen_pb2.Output(
          split_config=example_gen_pb2.SplitConfig(splits=[
              example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=2),
              example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1)
          ])))
def test_do_with_empty_analyze_splits(self):
  self._exec_properties['splits_config'] = proto_utils.proto_to_json(
      transform_pb2.SplitsConfig(analyze=[], transform=['train', 'eval']))
  self._exec_properties['module_file'] = self._module_file
  with self.assertRaises(ValueError):
    self._transform_executor.Do(self._input_dict, self._output_dict,
                                self._exec_properties)
def test_do_with_custom_splits(self):
  self._exec_properties['splits_config'] = proto_utils.proto_to_json(
      transform_pb2.SplitsConfig(
          analyze=['train'], transform=['train', 'eval']))
  self._exec_properties['module_file'] = self._module_file
  self._transform_executor.Do(self._input_dict, self._output_dict,
                              self._exec_properties)
  self._verify_transform_outputs()
def setUp(self):
  super(ExecutorTest, self).setUp()
  self._testdata_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')
  self._output_data_dir = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName)
  self._context = executor.Executor.Context(
      tmp_dir=self._output_data_dir, unique_id='1')

  # Create input dict.
  e1 = standard_artifacts.Examples()
  e1.uri = os.path.join(self._testdata_dir, 'penguin', 'data')
  e1.split_names = artifact_utils.encode_split_names(['train', 'eval'])
  e2 = copy.deepcopy(e1)

  self._single_artifact = [e1]
  self._multiple_artifacts = [e1, e2]

  schema = standard_artifacts.Schema()
  schema.uri = os.path.join(self._testdata_dir, 'penguin', 'schema')

  self._input_dict = {
      'examples': self._single_artifact,
      'schema': [schema],
  }

  # Create output dict.
  self._best_hparams = standard_artifacts.Model()
  self._best_hparams.uri = os.path.join(self._output_data_dir, 'best_hparams')

  self._output_dict = {
      'best_hyperparameters': [self._best_hparams],
  }

  # Create exec properties.
  self._exec_properties = {
      'train_args':
          proto_utils.proto_to_json(trainer_pb2.TrainArgs(num_steps=100)),
      'eval_args':
          proto_utils.proto_to_json(trainer_pb2.EvalArgs(num_steps=50)),
  }
def testTuneArgs(self):
  with self.assertRaises(ValueError):
    self._exec_properties['tune_args'] = proto_utils.proto_to_json(
        tuner_pb2.TuneArgs(num_parallel_trials=3))

    tuner = executor.Executor(self._context)
    tuner.Do(
        input_dict=self._input_dict,
        output_dict=self._output_dict,
        exec_properties=self._exec_properties)
def test_do_with_empty_analyze_splits(self):
  self._exec_properties[
      standard_component_specs.SPLITS_CONFIG_KEY] = proto_utils.proto_to_json(
          transform_pb2.SplitsConfig(analyze=[], transform=['train', 'eval']))
  self._exec_properties[
      standard_component_specs.MODULE_FILE_KEY] = self._module_file
  with self.assertRaises(ValueError):
    self._transform_executor.Do(self._input_dict, self._output_dict,
                                self._exec_properties)
def _testFeatureBasedPartition(self, partition_feature_name):
  self._exec_properties[utils.OUTPUT_CONFIG_KEY] = proto_utils.proto_to_json(
      example_gen_pb2.Output(
          split_config=example_gen_pb2.SplitConfig(
              splits=[
                  example_gen_pb2.SplitConfig.Split(
                      name='train', hash_buckets=2),
                  example_gen_pb2.SplitConfig.Split(
                      name='eval', hash_buckets=1)
              ],
              partition_feature_name=partition_feature_name)))
def test_do_with_custom_splits(self):
  self._exec_properties[
      standard_component_specs.SPLITS_CONFIG_KEY] = proto_utils.proto_to_json(
          transform_pb2.SplitsConfig(
              analyze=['train'], transform=['train', 'eval']))
  self._exec_properties[
      standard_component_specs.MODULE_FILE_KEY] = self._module_file
  self._transform_executor.Do(self._input_dict, self._output_dict,
                              self._exec_properties)
  self._verify_transform_outputs()
def testTuneArgs(self):
  with self.assertRaises(ValueError):
    self._exec_properties[
        standard_component_specs.TUNE_ARGS_KEY] = proto_utils.proto_to_json(
            tuner_pb2.TuneArgs(num_parallel_trials=3))

    tuner = executor.Executor(self._context)
    tuner.Do(
        input_dict=self._input_dict,
        output_dict=self._output_dict,
        exec_properties=self._exec_properties)
def testPrestoToExample(self):
  with beam.Pipeline() as pipeline:
    examples = (
        pipeline | 'ToTFExample' >> executor._PrestoToExample(
            exec_properties={
                'input_config':
                    proto_utils.proto_to_json(example_gen_pb2.Input()),
                'custom_config':
                    proto_utils.proto_to_json(example_gen_pb2.CustomConfig())
            },
            split_pattern='SELECT i, f, s FROM `fake`'))

    feature = {}
    feature['i'] = tf.train.Feature(int64_list=tf.train.Int64List(value=[1]))
    feature['f'] = tf.train.Feature(
        float_list=tf.train.FloatList(value=[2.0]))
    feature['s'] = tf.train.Feature(
        bytes_list=tf.train.BytesList(value=[tf.compat.as_bytes('abc')]))
    example_proto = tf.train.Example(
        features=tf.train.Features(feature=feature))
    util.assert_that(examples, util.equal_to([example_proto]))
def run_executor(
    self, execution_info: data_types.ExecutionInfo
) -> execution_result_pb2.ExecutorOutput:
  """Execute underlying component implementation."""

  context = placeholder_utils.ResolutionContext(
      exec_info=execution_info,
      executor_spec=self._executor_spec,
      platform_config=self._platform_config)

  component_executor_spec = (
      executor_specs.TemplatedExecutorContainerSpec(
          image=self._container_executor_spec.image,
          command=[
              placeholder_utils.resolve_placeholder_expression(cmd, context)
              for cmd in self._container_executor_spec.commands
          ]))

  logging.info('Container spec: %s', vars(component_executor_spec))
  logging.info('Docker platform config: %s',
               proto_utils.proto_to_json(self._docker_platform_config))

  # Call client.containers.run and wait for completion.
  # ExecutorContainerSpec follows the k8s container spec, which uses different
  # names than Docker's container spec: its command maps to Docker's
  # entrypoint and its args map to Docker's command.
  if self._docker_platform_config.docker_server_url:
    client = docker.DockerClient(
        base_url=self._docker_platform_config.docker_server_url)
  else:
    client = docker.from_env()

  run_args = self._build_run_args(self._docker_platform_config)
  container = client.containers.run(
      image=component_executor_spec.image,
      command=component_executor_spec.command,
      detach=True,
      **run_args)

  # Streaming logs
  for log in container.logs(stream=True):
    logging.info('Docker: %s', log.decode('utf-8'))

  exit_code = container.wait()['StatusCode']
  if exit_code != 0:
    raise RuntimeError(
        'Container exited with error code "{}"'.format(exit_code))
  # TODO(b/141192583): Report data to publisher
  # - report container digest
  # - report replaced command line entrypoints
  # - report docker run args
  return execution_result_pb2.ExecutorOutput()
def testDoWithSequenceExamples(self):
  self._input_config = proto_utils.proto_to_json(
      example_gen_pb2.Input(splits=[
          example_gen_pb2.Input.Split(
              name='tfrecord_sequence', pattern='tfrecord_sequence/*'),
      ]))
  self._testDo(example_gen_pb2.PayloadFormat.FORMAT_TF_SEQUENCE_EXAMPLE)
  self.assertEqual(
      example_gen_pb2.PayloadFormat.Name(
          example_gen_pb2.PayloadFormat.FORMAT_TF_SEQUENCE_EXAMPLE),
      self.examples.get_string_custom_property(
          utils.PAYLOAD_FORMAT_PROPERTY_NAME))
def testDo_MakeSavedModelWarmup(self):
  infra_validator = executor.Executor(self._context)
  self._request_spec.make_warmup = True
  self._exec_properties[REQUEST_SPEC_KEY] = (
      proto_utils.proto_to_json(self._request_spec))

  with mock.patch.object(infra_validator, '_ValidateOnce'):
    infra_validator.Do(self._input_dict, self._output_dict,
                       self._exec_properties)

  warmup_file = path_utils.warmup_file_path(
      path_utils.stamped_model_path(self._blessing.uri))
  self.assertFileExists(warmup_file)
  self.assertEqual(self._blessing.get_int_custom_property('has_model'), 1)