def get_parameter_from_output(file_path: str, param_name: str):
    """Reads a single output parameter from an output metadata JSON file.

    The file content is fetched with
    `_gcs_helper.GCSHelper.read_from_gcs_path` and parsed into a
    `pipeline_spec_pb2.ExecutorOutput` message.

    Args:
        file_path: Location of the output metadata JSON file.
        param_name: Key of the wanted entry in `ExecutorOutput.parameters`.

    Returns:
        The content of whichever member of the parameter's `value` oneof is
        populated.
    """
    metadata_json = _gcs_helper.GCSHelper.read_from_gcs_path(file_path)
    executor_output = pipeline_spec_pb2.ExecutorOutput()
    json_format.Parse(text=metadata_json, message=executor_output)

    parameter = executor_output.parameters[param_name]
    # NOTE(review): presumably `WhichOneof` yields None for a parameter that
    # was never set, which would make `getattr` fail -- confirm with callers.
    populated_field = parameter.WhichOneof('value')
    return getattr(parameter, populated_field)
def get_artifact_from_output(file_path: str,
                             output_name: str) -> artifact.Artifact:
    """Builds a Python artifact object from an output metadata JSON file.

    Args:
        file_path: Location of the output metadata JSON file, read through
            `_gcs_helper.GCSHelper.read_from_gcs_path`.
        output_name: Key of the wanted entry in `ExecutorOutput.artifacts`.

    Returns:
        The `artifact.Artifact` deserialized from the first runtime artifact
        stored under `output_name`.
    """
    metadata_json = _gcs_helper.GCSHelper.read_from_gcs_path(file_path)
    executor_output = pipeline_spec_pb2.ExecutorOutput()
    json_format.Parse(text=metadata_json, message=executor_output)

    # Each output is assumed to hold exactly one artifact, so only index 0 of
    # the artifact list is looked at.
    runtime_artifact = executor_output.artifacts[output_name].artifacts[0]

    # The runtime artifact is turned into a Python artifact by going through
    # its JSON representation.
    serialized = json_format.MessageToJson(runtime_artifact, sort_keys=True)
    return artifact.Artifact.deserialize(serialized)
def get_executor_output(
    output_artifacts: Dict[str, artifact.Artifact],
    output_params: Dict[str, Union[int, float, str]]
) -> pipeline_spec_pb2.ExecutorOutput:
    """Assembles the output metadata message for an execution.

    Args:
        output_artifacts: Output name -> Python artifact. The artifact's
            `runtime_artifact` is stored as a one-element `ArtifactList`.
        output_params: Output name -> raw parameter value. Each value is
            converted with `_get_pipeline_value` before being stored.

    Returns:
        A `pipeline_spec_pb2.ExecutorOutput` holding every given artifact and
        parameter.
    """
    executor_output = pipeline_spec_pb2.ExecutorOutput()

    for artifact_key, py_artifact in output_artifacts.items():
        single_item_list = pipeline_spec_pb2.ArtifactList(
            artifacts=[py_artifact.runtime_artifact])
        executor_output.artifacts[artifact_key].CopyFrom(single_item_list)

    for param_key, raw_value in output_params.items():
        pipeline_value = _get_pipeline_value(raw_value)
        executor_output.parameters[param_key].CopyFrom(pipeline_value)

    return executor_output
def testDriverWithoutSpan(self):
    """Runs the driver on split patterns that carry no span component.

    Expects the written output metadata to report span '0', a fingerprint
    line per split, and the input config unchanged.
    """
    # One data file per split. The (atime, mtime) pairs are pinned so the
    # fingerprint is reproducible -- presumably the checksums of 1 and 3
    # asserted below come from these mtimes; verify against the fingerprint
    # implementation.
    fixtures = (
        ('split1', 'testing', (0, 1)),
        ('split2', 'testing2', (0, 3)),
    )
    for split_dir, content, times in fixtures:
        data_path = os.path.join(_TEST_INPUT_DIR, split_dir, 'data')
        io_utils.write_string_file(data_path, content)
        os.utime(data_path, times)

    input_config_json = json_format.MessageToJson(
        example_gen_pb2.Input(splits=[
            example_gen_pb2.Input.Split(name='s1', pattern='split1/*'),
            example_gen_pb2.Input.Split(name='s2', pattern='split2/*'),
        ]))
    invocation_params = self._executor_invocation.inputs.parameters
    invocation_params['input_config'].string_value = input_config_json

    flags = [
        '--json_serialized_invocation_args',
        json_format.MessageToJson(message=self._executor_invocation),
    ]
    # Invoke the driver.
    driver.main(driver._parse_flags(flags))

    # Load the output metadata file the driver wrote.
    with open(_TEST_OUTPUT_METADATA_JSON) as metadata_file:
        metadata_text = metadata_file.read()
    output_metadata = pipeline_pb2.ExecutorOutput()
    json_format.Parse(
        metadata_text, output_metadata, ignore_unknown_fields=True)

    params = output_metadata.parameters
    self.assertEqual(params['span'].string_value, '0')
    self.assertEqual(
        params['input_fingerprint'].string_value,
        'split:s1,num_files:1,total_bytes:7,xor_checksum:1,sum_checksum:1\n'
        'split:s2,num_files:1,total_bytes:8,xor_checksum:3,sum_checksum:3')
    # The patterns have nothing to substitute, so the config must come back
    # exactly as it went in.
    self.assertEqual(params['input_config'].string_value, input_config_json)
def testEntryPointWithDriver(self):
    """Test the entrypoint with Driver's output metadata.

    A mocked driver output file is written first; the executor entrypoint is
    then expected to merge its parameters into the exec_properties handed to
    the executor ("key_1" overwritten, "key_3" added) and to overwrite the
    file with the executor's own output metadata.

    The metadata file is removed in a `finally` clause so that a failing
    assertion does not leave it behind for other tests.
    """
    # Mock the driver's output metadata.
    output_metadata = pipeline_spec_pb2.ExecutorOutput()
    output_metadata.parameters["key_1"].string_value = "driver"
    output_metadata.parameters["key_3"].string_value = "driver3"
    fileio.makedirs(os.path.dirname(_TEST_OUTPUT_METADATA_JSON))
    with fileio.open(_TEST_OUTPUT_METADATA_JSON, "wb") as f:
        f.write(json_format.MessageToJson(output_metadata, sort_keys=True))

    try:
        with _ArgsCapture() as args_capture:
            args = [
                "--executor_class_path",
                "%s.%s" % (_FakeExecutor.__module__, _FakeExecutor.__name__),
                "--json_serialized_invocation_args",
                self._serialized_metadata,
            ]
            kubeflow_v2_run_executor.main(
                kubeflow_v2_run_executor._parse_flags(args))

            # TODO(b/131417512): Add equal comparison to types.Artifact class
            # so we can use asserters.
            self.assertEqual(
                set(args_capture.input_dict.keys()), {"input_1", "input_2"})
            self.assertEqual(
                set(args_capture.output_dict.keys()),
                {"output", standard_component_specs.BLESSING_KEY})
            # Verify that exec_properties use driver's output metadata.
            self.assertEqual(
                args_capture.exec_properties,
                {
                    "key_1": "driver",  # Overwrite.
                    "key_2": 536870911,
                    "key_3": "driver3",  # Append.
                })

        # Test what's been output.
        with open(_TEST_OUTPUT_METADATA_JSON) as output_meta_json:
            actual_output = json.dumps(
                json.load(output_meta_json), indent=2, sort_keys=True)

        self.assertEqual(actual_output, self._expected_output)
    finally:
        # Always clean up, even when an assertion above fails.
        os.remove(_TEST_OUTPUT_METADATA_JSON)
def testDriverWithSpan(self):
    """Checks span alignment across splits and latest-span selection.

    The input config used here is whatever `self._executor_invocation`
    already carries (presumably set up by the fixture with span-aware
    patterns -- confirm in `setUp`).
    """
    # Test align of span number: span1 has data for both splits, span2 only
    # for split1, so the latest span differs between the splits.
    span1_split1 = os.path.join(_TEST_INPUT_DIR, 'span1', 'split1', 'data')
    io_utils.write_string_file(span1_split1, 'testing11')
    span1_split2 = os.path.join(_TEST_INPUT_DIR, 'span1', 'split2', 'data')
    io_utils.write_string_file(span1_split2, 'testing12')
    span2_split1 = os.path.join(_TEST_INPUT_DIR, 'span2', 'split1', 'data')
    io_utils.write_string_file(span2_split1, 'testing21')

    serialized_args = [
        '--json_serialized_invocation_args',
        json_format.MessageToJson(message=self._executor_invocation),
    ]
    # `assertRaisesRegex` replaces the deprecated `assertRaisesRegexp` alias,
    # which no longer exists in Python 3.12+.
    with self.assertRaisesRegex(
            ValueError, 'Latest span should be the same for each split'):
        driver.main(driver._parse_flags(serialized_args))

    # Test if latest span is selected when span aligns for each split.
    span2_split2 = os.path.join(_TEST_INPUT_DIR, 'span2', 'split2', 'data')
    io_utils.write_string_file(span2_split2, 'testing22')

    driver.main(driver._parse_flags(serialized_args))

    # Check the output metadata file for the expected outputs.
    with open(_TEST_OUTPUT_METADATA_JSON) as output_meta_json:
        output_metadata = pipeline_pb2.ExecutorOutput()
        json_format.Parse(
            output_meta_json.read(),
            output_metadata,
            ignore_unknown_fields=True)

    self.assertEqual(output_metadata.parameters['span'].string_value, '2')
    # The patterns written back must point at the selected span.
    self.assertEqual(
        output_metadata.parameters['input_config'].string_value,
        json_format.MessageToJson(
            example_gen_pb2.Input(splits=[
                example_gen_pb2.Input.Split(
                    name='s1', pattern='span2/split1/*'),
                example_gen_pb2.Input.Split(
                    name='s2', pattern='span2/split2/*'),
            ])))
def testGetExecutorOutput(self):
    """Compares `get_executor_output` against the golden JSON in testdata."""
    model_artifact = ontology_artifacts.Model()
    model_artifact.name = 'test-artifact'
    model_artifact.uri = 'gs://root/execution/output'
    model_artifact.metadata['test_property'] = 'test value'

    params = {
        'int_output': 42,
        'string_output': 'hello world!',
        'float_output': 12.12,
    }
    actual = entrypoint_utils.get_executor_output(
        output_artifacts={'output': model_artifact}, output_params=params)

    # The golden file is parsed into a proto and both sides are converted to
    # dicts the same way; comparing raw JSON would mismatch on how int values
    # are rendered.
    expected = json_format.Parse(
        text=_get_text_from_testdata('executor_output.json'),
        message=pipeline_spec_pb2.ExecutorOutput())

    self.assertDictEqual(
        json_format.MessageToDict(expected),
        json_format.MessageToDict(actual))
def _run_executor(args: argparse.Namespace, beam_args: List[str]) -> None:
    """Imports the executor class named in the flags and runs it.

    Inputs, outputs and execution properties are rebuilt from the serialized
    `ExecutorInput`. If the output metadata file already exists, its
    parameters are merged into the execution properties before the run. After
    the run, the output artifacts are written back to the same file as an
    `ExecutorOutput` message.

    Args:
        args:
            - executor_class_path: The import path of the executor class.
            - json_serialized_invocation_args: Full JSON-serialized
              parameters for this execution.
        beam_args: Optional parameter that maps to the optional_pipeline_args
            parameter in the pipeline, which provides additional
            configuration options for apache-beam and tensorflow.logging. For
            more about the beam arguments please refer to:
            https://cloud.google.com/dataflow/docs/guides/specifying-exec-params
    """
    logging.set_verbosity(logging.INFO)

    # Rehydrate inputs/outputs/exec_properties from the serialized metadata.
    executor_input = pipeline_spec_pb2.ExecutorInput()
    json_format.Parse(
        args.json_serialized_invocation_args,
        executor_input,
        ignore_unknown_fields=True)

    metadata_uri = executor_input.outputs.output_file
    raw_inputs = executor_input.inputs.artifacts
    raw_outputs = executor_input.outputs.artifacts
    raw_parameters = executor_input.inputs.parameters

    if fileio.exists(metadata_uri):
        # A driver ran before us and left updated exec_properties in this
        # file.
        with fileio.open(metadata_uri, 'rb') as metadata_file:
            driver_json = metadata_file.read()
        driver_output = pipeline_spec_pb2.ExecutorOutput()
        json_format.Parse(
            driver_json, driver_output, ignore_unknown_fields=True)
        # Append/overwrite exec_properties with the driver's values.
        for key, value in driver_output.parameters.items():
            raw_parameters[key].CopyFrom(value)

    # Shared between input and output parsing and reused when the outputs are
    # translated back below.
    name_from_id = {}

    inputs = kubeflow_v2_entrypoint_utils.parse_raw_artifact_dict(
        raw_inputs, name_from_id)
    outputs = kubeflow_v2_entrypoint_utils.parse_raw_artifact_dict(
        raw_outputs, name_from_id)
    exec_properties = kubeflow_v2_entrypoint_utils.parse_execution_properties(
        raw_parameters)
    logging.info(
        'Executor %s do: inputs: %s, outputs: %s, exec_properties: %s',
        args.executor_class_path, inputs, outputs, exec_properties)

    executor_cls = import_utils.import_class_by_path(args.executor_class_path)
    # Beam executors receive the extra args as beam pipeline args; every
    # other executor receives them as extra flags.
    if issubclass(executor_cls, base_beam_executor.BaseBeamExecutor):
        executor_context = base_beam_executor.BaseBeamExecutor.Context(
            beam_pipeline_args=beam_args, unique_id='', tmp_dir='/tmp')
    else:
        executor_context = base_executor.BaseExecutor.Context(
            extra_flags=beam_args, unique_id='', tmp_dir='/tmp')
    executor = executor_cls(executor_context)

    logging.info('Starting executor')
    executor.Do(inputs, outputs, exec_properties)

    # TODO(b/182316162): Unify publisher handling so that post-execution
    # artifact logic is more cleanly handled.
    outputs_utils.tag_output_artifacts_with_version(outputs)  # pylint: disable=protected-access

    # TODO(b/169583143): Remove this workaround when TFX migrates to use
    # str-typed id/name to identify artifacts.
    # Convert ModelBlessing artifact to use managed MLMD resource name.
    is_evaluator = issubclass(executor_cls, evaluator_executor.Executor)
    if is_evaluator and standard_component_specs.BLESSING_KEY in outputs:
        blessing = artifact_utils.get_single_instance(
            outputs[standard_component_specs.BLESSING_KEY])
        # Parse the parent prefix for managed MLMD resource name.
        kubeflow_v2_entrypoint_utils.refactor_model_blessing(
            blessing, name_from_id)

    # Log the output metadata to a file so that it can be picked up by MP.
    executor_output = pipeline_spec_pb2.ExecutorOutput()
    translated = kubeflow_v2_entrypoint_utils.translate_executor_output(
        outputs, name_from_id)
    for key, artifact_list in translated.items():
        executor_output.artifacts[key].CopyFrom(artifact_list)

    fileio.makedirs(os.path.dirname(metadata_uri))
    with fileio.open(metadata_uri, 'wb') as f:
        f.write(json_format.MessageToJson(executor_output))
def _run_driver(executor_input: pipeline_spec_pb2.ExecutorInput) -> None:
    """Runs the driver, writing its output as a ExecutorOutput proto.

    The driver resolves the span and version of the input data and computes
    its fingerprint, so that the executor invocation can be skipped when the
    ExampleGen component has already run on the same data with the same
    configuration. Span, fingerprint, version (only when one was resolved)
    and the rewritten input config are stored as parameters of an
    ExecutorOutput proto, together with the example artifact, and written to
    `executor_input.outputs.output_file`. The CAIP pipelines system reads
    this file and updates MLMD with the new execution properties.

    Args:
        executor_input: pipeline_spec_pb2.ExecutorInput that contains TFX
            artifacts and exec_properties information.

    Raises:
        ValueError: If the parsed outputs hold no entry under
            `standard_component_specs.EXAMPLES_KEY`.
    """
    exec_properties = kubeflow_v2_entrypoint_utils.parse_execution_properties(
        executor_input.inputs.parameters)
    name_from_id = {}
    outputs_dict = kubeflow_v2_entrypoint_utils.parse_raw_artifact_dict(
        executor_input.outputs.artifacts, name_from_id)
    # Destination of the ExecutorOutput message with the updated execution
    # properties and output artifacts. The CAIP Pipelines service will update
    # the task's properties and artifacts prior to running the executor.
    output_metadata_uri = executor_input.outputs.output_file

    logging.set_verbosity(logging.INFO)
    logging.info('exec_properties = %s\noutput_metadata_uri = %s',
                 exec_properties, output_metadata_uri)

    input_base_uri = exec_properties.get(
        standard_component_specs.INPUT_BASE_KEY)

    input_config = example_gen_pb2.Input()
    proto_utils.json_to_proto(
        exec_properties[standard_component_specs.INPUT_CONFIG_KEY],
        input_config)

    # The range config is optional and stays None when absent or empty.
    range_config = None
    serialized_range_config = exec_properties.get(
        standard_component_specs.RANGE_CONFIG_KEY)
    if serialized_range_config:
        range_config = range_config_pb2.RangeConfig()
        proto_utils.json_to_proto(serialized_range_config, range_config)

    processor = input_processor.FileBasedInputProcessor(
        input_base_uri, input_config.splits, range_config)
    span, version = processor.resolve_span_and_version()
    fingerprint = processor.get_input_fingerprint(span, version)

    logging.info('Calculated span: %s', span)
    logging.info('Calculated fingerprint: %s', fingerprint)

    exec_properties[utils.SPAN_PROPERTY_NAME] = span
    exec_properties[utils.FINGERPRINT_PROPERTY_NAME] = fingerprint
    exec_properties[utils.VERSION_PROPERTY_NAME] = version

    # Rewrite every split pattern for the resolved span and version.
    for split in input_config.splits:
        split.pattern = processor.get_pattern_for_span_version(
            split.pattern, span, version)
    updated_config_json = proto_utils.proto_to_json(input_config)
    exec_properties[
        standard_component_specs.INPUT_CONFIG_KEY] = updated_config_json

    if standard_component_specs.EXAMPLES_KEY not in outputs_dict:
        raise ValueError(
            'Example artifact was missing in the ExampleGen outputs.')
    example_artifact = artifact_utils.get_single_instance(
        outputs_dict[standard_component_specs.EXAMPLES_KEY])

    driver.update_output_artifact(
        exec_properties=exec_properties,
        output_artifact=example_artifact.mlmd_artifact)

    # Assemble the output metadata file content.
    output_metadata = pipeline_spec_pb2.ExecutorOutput()
    out_params = output_metadata.parameters
    out_params[utils.SPAN_PROPERTY_NAME].int_value = span
    out_params[utils.FINGERPRINT_PROPERTY_NAME].string_value = fingerprint
    if version is not None:
        out_params[utils.VERSION_PROPERTY_NAME].int_value = version
    out_params[
        standard_component_specs.INPUT_CONFIG_KEY
    ].string_value = updated_config_json

    examples_entry = output_metadata.artifacts[
        standard_component_specs.EXAMPLES_KEY]
    examples_entry.artifacts.add().CopyFrom(
        kubeflow_v2_entrypoint_utils.to_runtime_artifact(
            example_artifact, name_from_id))

    fileio.makedirs(os.path.dirname(output_metadata_uri))
    with fileio.open(output_metadata_uri, 'wb') as f:
        f.write(json_format.MessageToJson(output_metadata, sort_keys=True))
def _run_driver(exec_properties: Dict[str, Any],
                outputs_dict: Dict[str, List[artifact.Artifact]],
                output_metadata_uri: str,
                name_from_id: Optional[Dict[int, str]] = None) -> None:
    """Runs the driver, writing its output as a ExecutorOutput proto.

    The driver resolves the span and version of the input data and computes
    its fingerprint, so that the executor invocation can be skipped when the
    ExampleGen component has already run on the same data with the same
    configuration. Fingerprint, span (as a string) and the rewritten input
    config are stored as parameters of an ExecutorOutput proto, together with
    the example artifact, and written to `output_metadata_uri`. The CAIP
    pipelines system reads this file and updates MLMD with the new execution
    properties.

    Args:
        exec_properties: Execution properties. This dict is updated in place
            with the resolved span, fingerprint, version and rewritten input
            config. The following keys are read:
            - `standard_component_specs.INPUT_BASE_KEY`: Base location handed
              to the input processor.
            - `standard_component_specs.INPUT_CONFIG_KEY` (required): A
              json-serialized `example_gen_pb2.Input` proto message. See
              https://www.tensorflow.org/tfx/guide/examplegen for more
              details.
            - `standard_component_specs.RANGE_CONFIG_KEY` (optional): A
              json-serialized `range_config_pb2.RangeConfig` proto message.
        outputs_dict: The mapping of the output artifacts. Must contain an
            entry under `standard_component_specs.EXAMPLES_KEY`.
        output_metadata_uri: A path at which an ExecutorOutput message will
            be written with updated execution properties and output
            artifacts. The CAIP Pipelines service will update the task's
            properties and artifacts prior to running the executor.
        name_from_id: Optional. Mapping from the converted int-typed id to
            str-typed runtime artifact name, which should be unique. An empty
            dict is used when omitted.

    Raises:
        ValueError: If `outputs_dict` holds no entry under
            `standard_component_specs.EXAMPLES_KEY`.
    """
    if name_from_id is None:
        name_from_id = {}

    logging.set_verbosity(logging.INFO)
    logging.info('exec_properties = %s\noutput_metadata_uri = %s',
                 exec_properties, output_metadata_uri)

    input_base_uri = exec_properties.get(
        standard_component_specs.INPUT_BASE_KEY)

    input_config = example_gen_pb2.Input()
    proto_utils.json_to_proto(
        exec_properties[standard_component_specs.INPUT_CONFIG_KEY],
        input_config)

    # The range config is optional and stays None when absent or empty.
    range_config = None
    serialized_range_config = exec_properties.get(
        standard_component_specs.RANGE_CONFIG_KEY)
    if serialized_range_config:
        range_config = range_config_pb2.RangeConfig()
        proto_utils.json_to_proto(serialized_range_config, range_config)

    processor = input_processor.FileBasedInputProcessor(
        input_base_uri, input_config.splits, range_config)
    span, version = processor.resolve_span_and_version()
    fingerprint = processor.get_input_fingerprint(span, version)

    logging.info('Calculated span: %s', span)
    logging.info('Calculated fingerprint: %s', fingerprint)

    exec_properties[utils.SPAN_PROPERTY_NAME] = span
    exec_properties[utils.FINGERPRINT_PROPERTY_NAME] = fingerprint
    exec_properties[utils.VERSION_PROPERTY_NAME] = version

    # Rewrite every split pattern for the resolved span and version.
    for split in input_config.splits:
        split.pattern = processor.get_pattern_for_span_version(
            split.pattern, span, version)
    exec_properties[standard_component_specs.INPUT_CONFIG_KEY] = (
        proto_utils.proto_to_json(input_config))

    if standard_component_specs.EXAMPLES_KEY not in outputs_dict:
        raise ValueError(
            'Example artifact was missing in the ExampleGen outputs.')
    example_artifact = artifact_utils.get_single_instance(
        outputs_dict[standard_component_specs.EXAMPLES_KEY])

    driver.update_output_artifact(
        exec_properties=exec_properties,
        output_artifact=example_artifact.mlmd_artifact)

    # Assemble the output metadata file content. The resolved version is
    # kept in exec_properties only and is not written as an output parameter.
    output_metadata = pipeline_spec_pb2.ExecutorOutput()
    out_params = output_metadata.parameters
    out_params[utils.FINGERPRINT_PROPERTY_NAME].string_value = fingerprint
    out_params[utils.SPAN_PROPERTY_NAME].string_value = str(span)
    out_params[
        standard_component_specs.INPUT_CONFIG_KEY
    ].string_value = json_format.MessageToJson(input_config)

    examples_entry = output_metadata.artifacts[
        standard_component_specs.EXAMPLES_KEY]
    examples_entry.artifacts.add().CopyFrom(
        kubeflow_v2_entrypoint_utils.to_runtime_artifact(
            example_artifact, name_from_id))

    fileio.makedirs(os.path.dirname(output_metadata_uri))
    with fileio.open(output_metadata_uri, 'wb') as f:
        f.write(json_format.MessageToJson(output_metadata, sort_keys=True))