def testUpdateArtifactsMinimumExecProperties(self):
  """Updates the output artifact with the minimum set of exec properties.

  Only the span is provided; version and fingerprint are explicitly None.
  """
  mlmd_artifact = _OutputArtifact()._artifact
  minimal_properties = {
      utils.SPAN_PROPERTY_NAME: 2,
      utils.VERSION_PROPERTY_NAME: None,
      utils.FINGERPRINT_PROPERTY_NAME: None,
  }
  driver.update_output_artifact(minimal_properties, mlmd_artifact)
  # Span must be copied through as an int-valued custom property.
  self.assertEqual(
      2, mlmd_artifact.custom_properties[utils.SPAN_PROPERTY_NAME].int_value)
def testUpdateArtifactsAllExecProperties(self):
  """Updates the output artifact with every supported exec property set."""
  mlmd_artifact = _OutputArtifact()._artifact
  full_properties = {
      utils.SPAN_PROPERTY_NAME: 2,
      utils.VERSION_PROPERTY_NAME: 1,
      utils.FINGERPRINT_PROPERTY_NAME: 'fp',
  }
  driver.update_output_artifact(full_properties, mlmd_artifact)
  # Span and version land as int-valued custom properties, fingerprint as a
  # string-valued one.
  self.assertEqual(
      2, mlmd_artifact.custom_properties[utils.SPAN_PROPERTY_NAME].int_value)
  self.assertEqual(
      1,
      mlmd_artifact.custom_properties[utils.VERSION_PROPERTY_NAME].int_value)
  self.assertEqual(
      'fp',
      mlmd_artifact.custom_properties[
          utils.FINGERPRINT_PROPERTY_NAME].string_value)
def _run_driver(exec_properties: Dict[str, Any],
                outputs_dict: Dict[str, List[artifact.Artifact]],
                output_metadata_uri: str,
                name_from_id: Optional[Dict[int, str]] = None) -> None:
  """Runs the driver and persists its result as an ExecutorOutput proto.

  The driver computes the span and fingerprint of the input data so that the
  executor invocation can be skipped when the ExampleGen component has already
  been run on the same data with the same configuration. The span and
  fingerprint are attached as new custom execution properties on an
  ExecutorOutput proto written to a GCS path; the CAIP pipelines system reads
  that file and updates MLMD with the new execution properties.

  Args:
    exec_properties: These are required to contain the following properties:
      'input_base_uri': A path from which files will be read and their
        span/fingerprint calculated.
      'input_config': A json-serialized tfx.proto.example_gen_pb2.InputConfig
        proto message. See https://www.tensorflow.org/tfx/guide/examplegen
        for more details.
      'output_config': A json-serialized
        tfx.proto.example_gen_pb2.OutputConfig proto message. See
        https://www.tensorflow.org/tfx/guide/examplegen for more details.
    outputs_dict: The mapping of the output artifacts.
    output_metadata_uri: A path at which an ExecutorOutput message will be
      written with updated execution properties and output artifacts. The
      CAIP Pipelines service will update the task's properties and artifacts
      prior to running the executor.
    name_from_id: Optional. Mapping from the converted int-typed id to
      str-typed runtime artifact name, which should be unique.
  """
  name_from_id = {} if name_from_id is None else name_from_id

  logging.set_verbosity(logging.INFO)
  logging.info('exec_properties = %s\noutput_metadata_uri = %s',
               exec_properties, output_metadata_uri)

  base_uri = exec_properties[utils.INPUT_BASE_KEY]
  parsed_input_config = example_gen_pb2.Input()
  proto_utils.json_to_proto(exec_properties[utils.INPUT_CONFIG_KEY],
                            parsed_input_config)

  # TODO(b/161734559): Support range config.
  fingerprint, select_span, version = (
      utils.calculate_splits_fingerprint_span_and_version(
          base_uri, parsed_input_config.splits))
  logging.info('Calculated span: %s', select_span)
  logging.info('Calculated fingerprint: %s', fingerprint)

  # Surface the computed values as exec properties so the output artifact can
  # be stamped with them below.
  exec_properties[utils.SPAN_PROPERTY_NAME] = select_span
  exec_properties[utils.FINGERPRINT_PROPERTY_NAME] = fingerprint
  exec_properties[utils.VERSION_PROPERTY_NAME] = version

  if utils.EXAMPLES_KEY not in outputs_dict:
    raise ValueError('Example artifact was missing in the ExampleGen outputs.')
  example_artifact = artifact_utils.get_single_instance(
      outputs_dict[utils.EXAMPLES_KEY])

  driver.update_output_artifact(
      exec_properties=exec_properties,
      output_artifact=example_artifact.mlmd_artifact)

  # Assemble the ExecutorOutput message carrying the updated execution
  # properties and the single output artifact.
  executor_output = pipeline_pb2.ExecutorOutput()
  executor_output.parameters[
      utils.FINGERPRINT_PROPERTY_NAME].string_value = fingerprint
  executor_output.parameters[utils.SPAN_PROPERTY_NAME].string_value = str(
      select_span)
  executor_output.parameters[
      utils.INPUT_CONFIG_KEY].string_value = json_format.MessageToJson(
          parsed_input_config)
  executor_output.artifacts[utils.EXAMPLES_KEY].artifacts.add().CopyFrom(
      kubeflow_v2_entrypoint_utils.to_runtime_artifact(
          example_artifact, name_from_id))

  # Log the output metadata file.
  fileio.makedirs(os.path.dirname(output_metadata_uri))
  with fileio.open(output_metadata_uri, 'wb') as f:
    f.write(json_format.MessageToJson(executor_output, sort_keys=True))
def _run_driver(executor_input: pipeline_spec_pb2.ExecutorInput) -> None:
  """Runs the driver and persists its result as an ExecutorOutput proto.

  The driver computes the span and fingerprint of the input data so that the
  executor invocation can be skipped when the ExampleGen component has already
  been run on the same data with the same configuration. The span and
  fingerprint are attached as new custom execution properties on an
  ExecutorOutput proto written to a GCS path; the CAIP pipelines system reads
  that file and updates MLMD with the new execution properties.

  Args:
    executor_input: pipeline_spec_pb2.ExecutorInput that contains TFX
      artifacts and exec_properties information.
  """
  exec_properties = kubeflow_v2_entrypoint_utils.parse_execution_properties(
      executor_input.inputs.parameters)
  name_from_id = {}
  outputs_dict = kubeflow_v2_entrypoint_utils.parse_raw_artifact_dict(
      executor_input.outputs.artifacts, name_from_id)
  # Destination for the ExecutorOutput message with updated execution
  # properties and output artifacts; the CAIP Pipelines service applies it to
  # the task's properties and artifacts before running the executor.
  output_metadata_uri = executor_input.outputs.output_file

  logging.set_verbosity(logging.INFO)
  logging.info('exec_properties = %s\noutput_metadata_uri = %s',
               exec_properties, output_metadata_uri)

  base_uri = exec_properties.get(standard_component_specs.INPUT_BASE_KEY)
  parsed_input_config = example_gen_pb2.Input()
  proto_utils.json_to_proto(
      exec_properties[standard_component_specs.INPUT_CONFIG_KEY],
      parsed_input_config)

  serialized_range_config = exec_properties.get(
      standard_component_specs.RANGE_CONFIG_KEY)
  range_config = None
  if serialized_range_config:
    range_config = range_config_pb2.RangeConfig()
    proto_utils.json_to_proto(serialized_range_config, range_config)

  processor = input_processor.FileBasedInputProcessor(
      base_uri, parsed_input_config.splits, range_config)
  span, version = processor.resolve_span_and_version()
  fingerprint = processor.get_input_fingerprint(span, version)
  logging.info('Calculated span: %s', span)
  logging.info('Calculated fingerprint: %s', fingerprint)

  # Surface the computed values as exec properties so the output artifact can
  # be stamped with them below.
  exec_properties[utils.SPAN_PROPERTY_NAME] = span
  exec_properties[utils.FINGERPRINT_PROPERTY_NAME] = fingerprint
  exec_properties[utils.VERSION_PROPERTY_NAME] = version

  # Rewrite every split pattern to the concrete pattern for the resolved
  # span/version before serializing the config back.
  for split in parsed_input_config.splits:
    split.pattern = processor.get_pattern_for_span_version(
        split.pattern, span, version)
  exec_properties[
      standard_component_specs.INPUT_CONFIG_KEY] = proto_utils.proto_to_json(
          parsed_input_config)

  if standard_component_specs.EXAMPLES_KEY not in outputs_dict:
    raise ValueError('Example artifact was missing in the ExampleGen outputs.')
  example_artifact = artifact_utils.get_single_instance(
      outputs_dict[standard_component_specs.EXAMPLES_KEY])

  driver.update_output_artifact(
      exec_properties=exec_properties,
      output_artifact=example_artifact.mlmd_artifact)

  # Assemble the ExecutorOutput message carrying the updated execution
  # properties and the single output artifact.
  executor_output = pipeline_spec_pb2.ExecutorOutput()
  executor_output.parameters[utils.SPAN_PROPERTY_NAME].int_value = span
  executor_output.parameters[
      utils.FINGERPRINT_PROPERTY_NAME].string_value = fingerprint
  if version is not None:
    executor_output.parameters[
        utils.VERSION_PROPERTY_NAME].int_value = version
  executor_output.parameters[
      standard_component_specs.INPUT_CONFIG_KEY
  ].string_value = proto_utils.proto_to_json(parsed_input_config)
  executor_output.artifacts[
      standard_component_specs.EXAMPLES_KEY].artifacts.add().CopyFrom(
          kubeflow_v2_entrypoint_utils.to_runtime_artifact(
              example_artifact, name_from_id))

  # Log the output metadata file.
  fileio.makedirs(os.path.dirname(output_metadata_uri))
  with fileio.open(output_metadata_uri, 'wb') as f:
    f.write(json_format.MessageToJson(executor_output, sort_keys=True))