def GetModelPath(self, input_dict: Dict[str, List[types.Artifact]]) -> str:
    """Resolve the path of the model that should be pushed.

    The model can arrive through either of two inputs. The `model` input is
    preferred; when it is absent or empty, the `infra_blessing` input is used
    instead, provided it is flagged as carrying a model. Subclasses of the
    Pusher executor should call this method to obtain the source model path.

    Args:
      input_dict: The artifact dictionary passed as the first argument to
        Executor.Do().

    Returns:
      The resolved input model path.

    Raises:
      RuntimeError: If neither input yields a model, or if the InfraBlessing
        artifact is not flagged as containing a model.
    """
    # Preferred source: input_dict['model'].
    model_artifacts = input_dict.get(standard_component_specs.MODEL_KEY)
    if model_artifacts:
        model_artifact = artifact_utils.get_single_instance(model_artifacts)
        model_uri = model_artifact.uri
        is_old_artifact = path_utils.is_old_model_artifact(model_artifact)
        return path_utils.serving_model_path(model_uri, is_old_artifact)

    # Fallback source: input_dict['infra_blessing'].
    infra_blessings = input_dict.get(
        standard_component_specs.INFRA_BLESSING_KEY)
    if infra_blessings:
        infra_blessing = artifact_utils.get_single_instance(infra_blessings)
        has_model = infra_blessing.get_int_custom_property(
            _INFRA_BLESSING_MODEL_FLAG_KEY)
        if has_model:
            return path_utils.stamped_model_path(infra_blessing.uri)
        raise RuntimeError('InfraBlessing does not contain a model. Check '
                           'request_spec.make_warmup is set to True.')

    # Should not reach here; Pusher.__init__ prohibits creating a component
    # without having any of model or infra_blessing inputs.
    raise RuntimeError('Pusher has no model input.')
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]):
    """Overrides the tfx_pusher_executor.

    Deploys the exported model through `runner.deploy_model_for_cmle_serving`
    and then delegates to the parent executor's Do() so that the output
    artifacts are populated in the standard way.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - model_export: exported model from trainer.
        - model_blessing: model blessing path from model_validator.
      output_dict: Output dict from key to a list of artifacts, including:
        - model_push: A list of 'ModelPushPath' artifact of size one. It will
          include the model in this push execution if the model was pushed.
      exec_properties: Mostly a passthrough input dict for
        tfx.components.Pusher.executor.
        custom_config.ai_platform_serving_args is consumed by this class. For
        the full set of parameters supported by Google Cloud AI Platform,
        refer to
        https://cloud.google.com/ml-engine/docs/tensorflow/deploying-models#creating_a_model_version.

    Returns:
      None

    Raises:
      ValueError: if ai_platform_serving_args is not in
        exec_properties.custom_config.
      RuntimeError: not raised directly here; presumably propagated from the
        deployment runner when the deployment fails (TODO confirm).
    """
    self._log_startup(input_dict, output_dict, exec_properties)
    if not self.CheckBlessing(input_dict, output_dict):
        return

    model_export = artifact_utils.get_single_instance(
        input_dict['model_export'])
    model_export_uri = model_export.uri
    model_blessing_uri = artifact_utils.get_single_uri(
        input_dict['model_blessing'])
    model_push = artifact_utils.get_single_instance(
        output_dict['model_push'])

    # TODO(jyzhao): should this be in driver or executor.
    # NOTE(review): this file-based check runs in addition to CheckBlessing()
    # above; it is kept as-is because the two checks may not be equivalent.
    if not tf.gfile.Exists(os.path.join(model_blessing_uri, 'BLESSED')):
        model_push.set_int_custom_property('pushed', 0)
        tf.logging.info('Model on %s was not blessed', model_blessing_uri)
        return

    exec_properties_copy = exec_properties.copy()
    # A missing or None custom_config is treated as empty so that the
    # documented ValueError (rather than KeyError/TypeError) is raised below.
    custom_config = exec_properties_copy.pop('custom_config', None) or {}
    if 'ai_platform_serving_args' not in custom_config:
        raise ValueError(
            '\'ai_platform_serving_args\' is missing in \'custom_config\'')
    ai_platform_serving_args = custom_config['ai_platform_serving_args']

    # Deploy the model.
    model_path = path_utils.serving_model_path(model_export_uri)
    # Note: we do not have a logical model version right now. This
    # model_version is a timestamp mapped to trainer's exporter.
    model_version = os.path.basename(model_path)
    # An explicit None value skips the remote deployment entirely.
    if ai_platform_serving_args is not None:
        runner.deploy_model_for_cmle_serving(model_path, model_version,
                                             ai_platform_serving_args)

    # Make sure artifacts are populated in a standard way by calling
    # tfx.pusher.executor.Executor.Do().
    exec_properties_copy['push_destination'] = exec_properties.get(
        'push_destination') or self._make_local_temp_destination()
    super(Executor, self).Do(input_dict, output_dict, exec_properties_copy)
def Do(self, input_dict: Mapping[str, Sequence[artifact.Artifact]],
       output_dict: Mapping[str, Sequence[artifact.Artifact]],
       exec_properties: Mapping[str, Any]) -> None:
    """Overrides BaseExecutor.Do().

    Records the call arguments on the shared `_ArgsCapture.instance`, tags the
    "output" artifact with a test property, and fills the "blessing" artifact
    with fixed model-id and blessed custom properties.
    """
    captured = _ArgsCapture.instance
    captured.input_dict = input_dict
    captured.output_dict = output_dict
    captured.exec_properties = exec_properties

    output_artifact = artifact_utils.get_single_instance(output_dict["output"])
    output_artifact.set_string_custom_property(_TEST_OUTPUT_PROPERTY_KEY,
                                               _TEST_OUTPUT_PROPERTY_VALUE)

    blessing = artifact_utils.get_single_instance(output_dict["blessing"])
    blessing_properties = (
        # Set to the hash value of
        # 'projects/123456789/locations/us-central1/metadataStores/default/artifacts/1'
        (constants.ARTIFACT_PROPERTY_BASELINE_MODEL_ID_KEY,
         5743745765020341227),
        # Set to the hash value of
        # 'projects/123456789/locations/us-central1/metadataStores/default/artifacts/2'
        (constants.ARTIFACT_PROPERTY_CURRENT_MODEL_ID_KEY,
         7228748496289751000),
        # Set the blessing result
        (constants.ARTIFACT_PROPERTY_BLESSED_KEY, constants.BLESSED_VALUE),
    )
    for property_key, property_value in blessing_properties:
        blessing.set_int_custom_property(property_key, property_value)
def CheckBlessing(self, input_dict: Dict[Text, List[types.Artifact]],
                  output_dict: Dict[Text, List[types.Artifact]]) -> bool:
    """Report whether the upstream validator blessed the model.

    When the model is not blessed, the `model_push` output artifact is marked
    with `pushed = 0` before returning.

    Args:
      input_dict: Input dict from input key to a list of artifacts:
        - model_blessing: model blessing artifact from model_validator. The
          blessed state is decided by `model_utils.is_model_blessed`.
      output_dict: Output dict from key to a list of artifacts, including:
        - model_push: A list of 'ModelPushPath' artifact of size one.

    Returns:
      True if the model is blessed by validator, False otherwise.
    """
    blessing = artifact_utils.get_single_instance(
        input_dict['model_blessing'])
    pushed_model = artifact_utils.get_single_instance(
        output_dict['model_push'])

    # TODO(jyzhao): should this be in driver or executor.
    if model_utils.is_model_blessed(blessing):
        return True

    pushed_model.set_int_custom_property('pushed', 0)
    absl.logging.info('Model on %s was not blessed', blessing.uri)
    return False
def CheckBlessing(self, input_dict: Dict[Text, List[types.Artifact]]) -> bool:
    """Check that model is blessed by upstream validators.

    Args:
      input_dict: Input dict from input key to a list of artifacts:
        - model_blessing: A `ModelBlessing` artifact from model validator or
          evaluator. Required. Pusher looks for a custom property `blessed`
          in the artifact to check it is safe to push.
        - infra_blessing: An `InfraBlessing` artifact from infra validator.
          Optional; only checked when the key is present. Pusher looks for a
          custom property `blessed` in the artifact to determine whether the
          model is mechanically servable from the model server to which
          Pusher is going to push.

    Returns:
      True if the model is blessed by every validator that was provided.
    """
    model_blessing = artifact_utils.get_single_instance(
        input_dict[MODEL_BLESSING_KEY])
    # TODO(jyzhao): should this be in driver or executor.
    if not model_utils.is_model_blessed(model_blessing):
        logging.info('Model on %s was not blessed by model validation',
                     model_blessing.uri)
        return False

    if INFRA_BLESSING_KEY in input_dict:
        infra_blessing = artifact_utils.get_single_instance(
            input_dict[INFRA_BLESSING_KEY])
        if not model_utils.is_infra_validated(infra_blessing):
            # Report the artifact that actually rejected the model; the
            # model blessing already passed at this point.
            logging.info('Model on %s was not blessed by infra validator',
                         infra_blessing.uri)
            return False

    return True
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
    """Contract for running InfraValidator Executor.

    Resolves the artifacts, parses the serialized specs (filling in defaults
    for unset fields) and hands everything to `self._Do` inside the context
    manager returned by `self._InstallGracefulShutdownHandler()`.

    Args:
      input_dict:
        - `model`: Single `Model` artifact that we're validating.
        - `examples`: `Examples` artifacts to be used for test requests.
          Optional; `None` is passed on when absent or empty.
      output_dict:
        - `blessing`: Single `InfraBlessing` artifact containing the
          validated result. It is an empty file with the name either of
          INFRA_BLESSED or INFRA_NOT_BLESSED.
      exec_properties:
        - `serving_spec`: Serialized `ServingSpec` configuration.
        - `validation_spec`: Serialized `ValidationSpec` configuration.
          Optional.
        - `request_spec`: Serialized `RequestSpec` configuration. Optional.
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    model = artifact_utils.get_single_instance(input_dict[_MODEL_KEY])
    blessing = artifact_utils.get_single_instance(output_dict[_BLESSING_KEY])
    examples = (
        artifact_utils.get_single_instance(input_dict[_EXAMPLES_KEY])
        if input_dict.get(_EXAMPLES_KEY) else None)

    # Serving spec is mandatory; only the model name has a default.
    serving_spec = infra_validator_pb2.ServingSpec()
    json_format.Parse(exec_properties[_SERVING_SPEC_KEY], serving_spec)
    if not serving_spec.model_name:
        serving_spec.model_name = _DEFAULT_MODEL_NAME

    # Validation spec is optional; unset fields fall back to the defaults.
    validation_spec = infra_validator_pb2.ValidationSpec()
    serialized_validation_spec = exec_properties.get(_VALIDATION_SPEC_KEY)
    if serialized_validation_spec:
        json_format.Parse(serialized_validation_spec, validation_spec)
    if not validation_spec.num_tries:
        validation_spec.num_tries = _DEFAULT_NUM_TRIES
    if not validation_spec.max_loading_time_seconds:
        validation_spec.max_loading_time_seconds = (
            _DEFAULT_MAX_LOADING_TIME_SEC)

    # Request spec is optional and has no default.
    request_spec = None
    serialized_request_spec = exec_properties.get(_REQUEST_SPEC_KEY)
    if serialized_request_spec:
        request_spec = infra_validator_pb2.RequestSpec()
        json_format.Parse(serialized_request_spec, request_spec)

    with self._InstallGracefulShutdownHandler():
        self._Do(
            model=model,
            examples=examples,
            blessing=blessing,
            serving_spec=serving_spec,
            validation_spec=validation_spec,
            request_spec=request_spec,
        )
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]):
    """Overrides the tfx_pusher_executor.

    Deploys the exported model through
    `runner.deploy_model_for_aip_prediction` when the model is blessed, and
    records the outcome on the `model_push` output artifact.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - model_export: exported model from trainer.
        - model_blessing: model blessing path from model_validator.
      output_dict: Output dict from key to a list of artifacts, including:
        - model_push: A list of 'ModelPushPath' artifact of size one. It will
          include the model in this push execution if the model was pushed.
      exec_properties: Mostly a passthrough input dict for
        tfx.components.Pusher.executor.
        custom_config.ai_platform_serving_args is consumed by this class. For
        the full set of parameters supported by Google Cloud AI Platform,
        refer to
        https://cloud.google.com/ml-engine/docs/tensorflow/deploying-models#creating_a_model_version.

    Returns:
      None

    Raises:
      ValueError: If ai_platform_serving_args is missing from (or empty in)
        exec_properties.custom_config. The original contract also lists a
        serving model path that does not start with gs://; that check is not
        performed in this method (presumably in the runner; TODO confirm).
      RuntimeError: not raised directly here; presumably propagated from the
        deployment runner when the deployment fails (TODO confirm).
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    model_push = artifact_utils.get_single_instance(
        output_dict[tfx_pusher_executor.PUSHED_MODEL_KEY])
    if not self.CheckBlessing(input_dict):
        model_push.set_int_custom_property('pushed', 0)
        return

    model_export = artifact_utils.get_single_instance(
        input_dict[tfx_pusher_executor.MODEL_KEY])
    model_export_uri = model_export.uri

    # Use .get() at both levels so that a missing custom_config, a None
    # custom_config, or a missing serving-args key all surface as the
    # documented ValueError instead of a KeyError/TypeError.
    custom_config = exec_properties.get('custom_config') or {}
    ai_platform_serving_args = custom_config.get(SERVING_ARGS_KEY)
    if not ai_platform_serving_args:
        raise ValueError(
            '\'ai_platform_serving_args\' is missing in \'custom_config\'')

    # Deploy the model.
    model_path = path_utils.serving_model_path(model_export_uri)
    # Note: we do not have a logical model version right now. This
    # model_version is a timestamp mapped to trainer's exporter.
    model_version = os.path.basename(model_path)
    executor_class_path = '%s.%s' % (self.__class__.__module__,
                                     self.__class__.__name__)
    runner.deploy_model_for_aip_prediction(
        model_path,
        model_version,
        ai_platform_serving_args,
        executor_class_path,
    )

    model_push.set_int_custom_property('pushed', 1)
    model_push.set_string_custom_property('pushed_model', model_path)
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
    """Runs batch inference on a given model with given input examples.

    The outcome is recorded on the output artifact as the int custom property
    `inferred` (1 on success, 0 when the model is not blessed or inference
    reports failure).

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - examples: examples for inference.
        - model: exported model.
        - model_blessing: model blessing result, optional.
      output_dict: Output dict from output key to a list of Artifacts.
        - inference_result: bulk inference results.
      exec_properties: A dict of execution properties.
        - model_spec: JSON string of bulk_inferrer_pb2.ModelSpec instance.
        - data_spec: JSON string of bulk_inferrer_pb2.DataSpec instance.

    Returns:
      None

    Raises:
      ValueError: If `examples` or `model` is missing from input_dict, or
        `inference_result` is missing from output_dict.
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    # Validate the required keys up front, in a fixed order.
    if 'examples' not in input_dict:
        raise ValueError('\'examples\' is missing in input dict.')
    if 'inference_result' not in output_dict:
        raise ValueError('\'inference_result\' is missing in output dict.')
    inference_result = artifact_utils.get_single_instance(
        output_dict['inference_result'])
    if 'model' not in input_dict:
        raise ValueError('Input models are not valid, model '
                         'need to be specified.')

    if 'model_blessing' not in input_dict:
        logging.info(
            'Model blessing is not provided, exported model will be '
            'used.')
    else:
        blessing = artifact_utils.get_single_instance(
            input_dict['model_blessing'])
        if not model_utils.is_model_blessed(blessing):
            inference_result.set_int_custom_property('inferred', 0)
            logging.info('Model on %s was not blessed', blessing.uri)
            return

    exported_model = artifact_utils.get_single_instance(input_dict['model'])
    model_path = path_utils.serving_model_path(exported_model.uri)
    logging.info('Use exported model from %s.', model_path)

    data_spec = bulk_inferrer_pb2.DataSpec()
    json_format.Parse(exec_properties['data_spec'], data_spec)

    succeeded = self._run_model_inference(
        data_spec, input_dict['examples'], inference_result.uri,
        self._get_inference_spec(model_path, exec_properties))
    inference_result.set_int_custom_property('inferred',
                                             1 if succeeded else 0)
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
    """Push model to target directory.

    The serving model is copied twice, each time under a version directory
    named after the current wall-clock time in seconds: once below the
    `model_push` artifact URI, and once below the filesystem base directory
    of the push destination (skipped when that directory already exists).
    No blessing check is performed in this method.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - model_export: exported model from trainer.
      output_dict: Output dict from key to a list of artifacts, including:
        - model_push: A list of 'ModelPushPath' artifact of size one. It will
          include the model in this push execution if the model was pushed.
      exec_properties: A dict of execution properties, including:
        - push_destination: JSON string of pusher_pb2.PushDestination
          instance, providing instruction of destination to push model.

    Returns:
      None
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    pushed_artifact = artifact_utils.get_single_instance(
        output_dict[PUSHED_MODEL_KEY])
    archive_root = pushed_artifact.uri
    exported_model = artifact_utils.get_single_instance(input_dict[MODEL_KEY])
    exported_uri = exported_model.uri
    logging.info('Model pushing.')

    # Copy the model to pushing uri.
    source_path = path_utils.serving_model_path(exported_uri)
    # The version is the push time, not the trainer's export version.
    version = str(int(time.time()))
    logging.info('Model version is %s', version)
    archive_path = os.path.join(archive_root, version)
    io_utils.copy_dir(source_path, archive_path)
    logging.info('Model written to %s.', archive_root)

    destination = pusher_pb2.PushDestination()
    json_format.Parse(exec_properties['push_destination'], destination)
    serving_path = os.path.join(destination.filesystem.base_directory,
                                version)
    if not tf.io.gfile.exists(serving_path):
        # tf.serving won't load partial model, it will retry until fully
        # copied.
        io_utils.copy_dir(source_path, serving_path)
        logging.info('Model written to serving path %s.', serving_path)
    else:
        logging.info(
            'Destination directory %s already exists, skipping current push.',
            serving_path)

    pushed_artifact.set_int_custom_property('pushed', 1)
    pushed_artifact.set_string_custom_property('pushed_model', exported_uri)
    pushed_artifact.set_int_custom_property('pushed_model_id',
                                            exported_model.id)
    logging.info('Model pushed to %s.', serving_path)
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
    """Copy the input_data to the output_data.

    That is all this example executor does: every file of every input split
    is copied into the matching output split. A real custom component would
    put its actual functionality here, and might read and write artifact
    types other than Examples.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - input_data: A list of type `standard_artifacts.Examples` which will
          often contain two splits, 'train' and 'eval'.
      output_dict: Output dict from key to a list of artifacts, including:
        - output_data: A list of type `standard_artifacts.Examples` which
          will usually contain the same splits as input_data.
      exec_properties: A dict of execution properties, including:
        - name: Optional unique name. Necessary iff multiple Hello components
          are declared in the same pipeline.

    Returns:
      None

    Raises:
      OSError and its subclasses
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    source = artifact_utils.get_single_instance(input_dict['input_data'])
    target = artifact_utils.get_single_instance(output_dict['output_data'])
    target.split_names = source.split_names

    # Resolve every input split directory before any file is copied.
    source_dirs = {
        split_name: artifact_utils.get_split_uri([source], split_name)
        for split_name in json.loads(source.split_names)
    }

    for split_name, source_dir in source_dirs.items():
        target_dir = artifact_utils.get_split_uri([target], split_name)
        for filename in tf.io.gfile.listdir(source_dir):
            io_utils.copy_file(
                src=os.path.join(source_dir, filename),
                dst=os.path.join(target_dir, filename),
                overwrite=True)
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
    """Push an artifact to the configured filesystem destination.

    The artifact directory is copied to the destination's base directory and
    then to the output artifact URI for archiving. No blessing check is
    performed in this method.

    Args:
      input_dict: Input dict from input key to a list of artifacts. The
        artifact to push is read from `input_dict[ARTIFACT_KEY]`.
      output_dict: Output dict from key to a list of artifacts. The archive
        location is the URI of `output_dict[PUSHED_ARTIFACT_KEY]`.
      exec_properties: A dict of execution properties, including:
        - push_destination: JSON string of pusher_pb2.PushDestination
          instance, providing instruction of destination to push artifact.

    Returns:
      None

    Raises:
      NotImplementedError: If the push destination is not `filesystem`.
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    source_artifact = artifact_utils.get_single_instance(
        input_dict[ARTIFACT_KEY])
    source_path = source_artifact.uri
    pushed_artifact = artifact_utils.get_single_instance(
        output_dict[PUSHED_ARTIFACT_KEY])

    destination = pusher_pb2.PushDestination()
    json_format.Parse(exec_properties['push_destination'], destination)
    destination_kind = destination.WhichOneof('destination')

    # Only filesystem destinations are supported.
    if destination_kind != 'filesystem':
        raise NotImplementedError(
            'Invalid push destination {}'.format(destination_kind))

    serving_path = destination.filesystem.base_directory
    copy_dir(source_path, serving_path)
    absl.logging.info('artifact written to serving path %s.', serving_path)

    # Copy the artifact to pushing uri for archiving.
    copy_dir(source_path, pushed_artifact.uri)
    absl.logging.info('artifact pushed to %s.', pushed_artifact.uri)
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
    """Read data through a dynamically loaded data step and write TFRecords.

    Args:
        input_dict: Not read by this method.
        output_dict: Output dict from key to a list of artifacts; the single
            artifact under `DATA_SPLIT_NAME` receives the written split.
        exec_properties: Execution properties, including:
            - StepKeys.SOURCE: source path of the data step class to load.
            - StepKeys.ARGS: keyword arguments used to instantiate it.
    """
    step_source = exec_properties[StepKeys.SOURCE]
    step_args = exec_properties[StepKeys.ARGS]
    step_class = source_utils.load_source_path_class(step_source)
    data_step: BaseDataStep = step_class(**step_args)

    # Get output split path
    examples_artifact = artifact_utils.get_single_instance(
        output_dict[DATA_SPLIT_NAME])
    examples_artifact.split_names = artifact_utils.encode_split_names(
        [DATA_SPLIT_NAME])
    output_split_path = artifact_utils.get_split_uri([examples_artifact],
                                                     DATA_SPLIT_NAME)

    with self._make_beam_pipeline() as pipeline:
        records = pipeline | data_step.read_from_source()
        # NOTE: data_step.convert_to_dict() is intentionally not applied
        # between reading and writing.
        _ = records | WriteToTFRecord(data_step.schema, output_split_path)
def _prepare_output_artifacts(
    self,
    input_artifacts: Dict[Text, List[types.Artifact]],
    output_dict: Dict[Text, types.Channel],
    exec_properties: Dict[Text, Any],
    execution_id: int,
    pipeline_info: data_types.PipelineInfo,
    component_info: data_types.ComponentInfo,
) -> Dict[Text, List[types.Artifact]]:
    """Overrides BaseDriver._prepare_output_artifacts().

    Assigns an output URI to the single examples artifact and copies the
    fingerprint and span execution properties onto it as string custom
    properties.

    Raises:
      RuntimeError: If the number of output keys is not exactly one.
    """
    del input_artifacts  # Unused.

    output_artifacts = channel_utils.unwrap_channel_dict(output_dict)
    if len(output_artifacts) != 1:
        raise RuntimeError('Multiple output artifacts are not supported.')

    component_output_dir = os.path.join(pipeline_info.pipeline_root,
                                        component_info.component_id)
    examples = artifact_utils.get_single_instance(
        output_artifacts[utils.EXAMPLES_KEY])
    examples.uri = base_driver._generate_output_uri(  # pylint: disable=protected-access
        component_output_dir, utils.EXAMPLES_KEY, execution_id)

    for property_name in (utils.FINGERPRINT_PROPERTY_NAME,
                          utils.SPAN_PROPERTY_NAME):
        examples.set_string_custom_property(property_name,
                                            exec_properties[property_name])

    base_driver._prepare_output_paths(examples)  # pylint: disable=protected-access
    return output_artifacts
def testPrepareOutputArtifacts(self):
    """Checks the URI and custom properties of the prepared Examples."""
    execution_id = 1
    pipeline_info = data_types.PipelineInfo(
        pipeline_name='name', pipeline_root=self._test_dir, run_id='rid')
    component_info = data_types.ComponentInfo(
        component_type='type', component_id='cid',
        pipeline_info=pipeline_info)

    output_channels = {
        utils.EXAMPLES_KEY:
            channel_utils.as_channel([standard_artifacts.Examples()])
    }
    exec_properties = {
        utils.SPAN_PROPERTY_NAME: 2,
        utils.VERSION_PROPERTY_NAME: 1,
        utils.FINGERPRINT_PROPERTY_NAME: 'fp'
    }

    prepared = self._example_gen_driver._prepare_output_artifacts(
        {}, output_channels, exec_properties, execution_id, pipeline_info,
        component_info)
    examples = artifact_utils.get_single_instance(
        prepared[utils.EXAMPLES_KEY])

    expected_uri = base_driver._generate_output_uri(  # pylint: disable=protected-access
        os.path.join(self._test_dir, component_info.component_id),
        'examples', execution_id)
    self.assertEqual(examples.uri, expected_uri)

    # Numeric execution properties are expected back as strings.
    expected_properties = (
        (utils.FINGERPRINT_PROPERTY_NAME, 'fp'),
        (utils.SPAN_PROPERTY_NAME, '2'),
        (utils.VERSION_PROPERTY_NAME, '1'),
    )
    for property_name, expected_value in expected_properties:
        self.assertEqual(
            examples.get_string_custom_property(property_name),
            expected_value)
def _build_importer_spec(self) -> ImporterSpec:
    """Builds ImporterSpec.

    The spec is populated from the importer node's output channel (artifact
    properties and type schema) and from the execution properties (reimport
    flag and source URI).

    Returns:
      The populated ImporterSpec.
    """
    assert isinstance(self._node, importer.Importer)
    import_channel = self._node.outputs[importer.IMPORT_RESULT_KEY]
    spec = ImporterSpec()

    # Importer's output channel contains one artifact instance with
    # additional properties.
    channel_artifacts = list(import_channel.get())
    packed_properties = compiler_utils.pack_artifact_properties(
        channel_artifacts[0])
    if packed_properties:
        spec.metadata.CopyFrom(packed_properties)

    spec.reimport = bool(self._exec_properties[importer.REIMPORT_OPTION_KEY])
    source_uri = compiler_utils.value_converter(
        self._exec_properties[importer.SOURCE_URI_KEY])
    spec.artifact_uri.CopyFrom(source_uri)

    # Fetched from the channel again; get_single_instance() is the helper
    # used to obtain the one artifact for the type schema.
    imported_artifact = artifact_utils.get_single_instance(
        list(import_channel.get()))
    instance_schema = compiler_utils.get_artifact_schema(imported_artifact)
    spec.type_schema.CopyFrom(
        pipeline_pb2.ArtifactTypeSchema(instance_schema=instance_schema))
    return spec
def testPrepareOutputArtifacts(self):
    """Checks the URI, fingerprint and span of the prepared Examples."""
    pipeline_info = data_types.PipelineInfo(
        pipeline_name='name', pipeline_root=self._test_dir, run_id='rid')
    component_info = data_types.ComponentInfo(
        component_type='type', component_id='cid',
        pipeline_info=pipeline_info)

    output_channels = {
        utils.EXAMPLES_KEY:
            channel_utils.as_channel([standard_artifacts.Examples()])
    }
    exec_properties = {
        utils.SPAN_PROPERTY_NAME: '02',
        utils.FINGERPRINT_PROPERTY_NAME: 'fp'
    }

    prepared = self._example_gen_driver._prepare_output_artifacts(
        {}, output_channels, exec_properties, 1, pipeline_info,
        component_info)
    examples = artifact_utils.get_single_instance(
        prepared[utils.EXAMPLES_KEY])

    # The trailing '' makes the expected URI end with a path separator.
    expected_uri = os.path.join(self._test_dir, 'cid', 'examples', '1', '')
    self.assertEqual(examples.uri, expected_uri)

    expected_properties = (
        (utils.FINGERPRINT_PROPERTY_NAME, 'fp'),
        (utils.SPAN_PROPERTY_NAME, '02'),
    )
    for property_name, expected_value in expected_properties:
        self.assertEqual(
            examples.get_string_custom_property(property_name),
            expected_value)
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
    """Stores `custom_config` as an artifact of type `artifacts.PipelineConfiguration`.

    The configuration string is written to `custom_config.json` under the
    output artifact's URI; "{}" is written when no configuration is given.

    Args:
      input_dict: Empty
      output_dict: Output dict from key to a list of artifacts, including:
        - pipeline_configuration: A list of type
          `artifacts.PipelineConfiguration`
      exec_properties: A dict of execution properties, including:
        - custom_config: the configuration to save.

    Returns:
      None

    Raises:
      OSError and its subclasses
      ValueError
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    config_artifact = artifact_utils.get_single_instance(
        output_dict[PIPELINE_CONFIGURATION_KEY])
    serialized_config = exec_properties.get(CUSTOM_CONFIG_KEY, "{}")

    target_file = os.path.join(
        artifact_utils.get_single_uri([config_artifact]),
        'custom_config.json')
    io_utils.write_string_file(target_file, serialized_config)
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
    """TensorFlow SchemaGen executor entrypoint.

    Infers one schema with tensorflow_data_validation from the precomputed
    statistics: the first non-excluded split seeds the schema and each
    further non-excluded split updates it.

    Args:
      input_dict: Input dict from input key to a list of artifacts. The
        statistics artifact is read from `input_dict[STATISTICS_KEY]`; its
        split names decide which splits are processed.
      output_dict: Output dict from key to a list of artifacts. The schema
        is written below the URI of `output_dict[SCHEMA_KEY]`.
      exec_properties: A dict of execution properties, includes:
        - infer_feature_shape: Whether or not to infer the shape of the
          feature.
        - exclude_splits: Names of splits that will not be taken into
          consideration when auto-generating a schema.

    Returns:
      None

    Raises:
      ValueError: If the deserialized exclude_splits is not a list.
    """
    # TODO(zhitaoli): Move constants between this file and component.py to a
    # constants.py.
    infer_feature_shape = exec_properties.get(INFER_FEATURE_SHAPE_KEY)

    # Load and deserialize exclude splits from execution properties.
    serialized_exclude_splits = exec_properties.get(EXCLUDE_SPLITS_KEY,
                                                    'null')
    exclude_splits = json_utils.loads(serialized_exclude_splits) or []
    if not isinstance(exclude_splits, list):
        raise ValueError('exclude_splits in execution properties needs to be a '
                         'list. Got %s instead.' % type(exclude_splits))

    stats_artifact = artifact_utils.get_single_instance(
        input_dict[STATISTICS_KEY])
    split_names = artifact_utils.decode_split_names(
        stats_artifact.split_names)

    # Only one schema is generated for all splits.
    schema = None
    for split in split_names:
        if split in exclude_splits:
            continue

        logging.info('Processing schema from statistics for split %s.', split)
        split_dir = os.path.join(stats_artifact.uri, split)
        stats_uri = io_utils.get_only_uri_in_dir(split_dir)
        split_stats = tfdv.load_statistics(stats_uri)
        if schema:
            schema = tfdv.update_schema(schema, split_stats,
                                        infer_feature_shape)
        else:
            schema = tfdv.infer_schema(split_stats, infer_feature_shape)

    schema_dir = artifact_utils.get_single_uri(output_dict[SCHEMA_KEY])
    output_uri = os.path.join(schema_dir, _DEFAULT_FILE_NAME)
    io_utils.write_pbtxt_file(output_uri, schema)
    logging.info('Schema written to %s.', output_uri)
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
    """Bind a data view to the input examples.

    The output examples artifact becomes a copy of the input examples
    artifact, extended with the id and URI of the data view artifact as
    custom properties.

    Args:
      input_dict: Input artifacts; read under `_DATA_VIEW_KEY` and
        `_INPUT_EXAMPLES_KEY`.
      output_dict: Output artifacts; read under `_OUTPUT_EXAMPLES_KEY`.
      exec_properties: Execution properties; only passed to the startup log.
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    data_view = artifact_utils.get_single_instance(
        input_dict.get(_DATA_VIEW_KEY))
    source_examples = artifact_utils.get_single_instance(
        input_dict.get(_INPUT_EXAMPLES_KEY))
    bound_examples = artifact_utils.get_single_instance(
        output_dict.get(_OUTPUT_EXAMPLES_KEY, []))

    # The output artifact shares the URI and all other properties with the
    # input, with the following additional custom properties added.
    bound_examples.copy_from(source_examples)
    bound_examples.set_int_custom_property(
        constants.DATA_VIEW_ID_PROPERTY_KEY, data_view.id)
    bound_examples.set_string_custom_property(
        constants.DATA_VIEW_URI_PROPERTY_KEY, data_view.uri)
def CheckBlessing(self, input_dict: Dict[Text, List[types.Artifact]]) -> bool:
    """Check that model is blessed by upstream validators.

    Both blessing inputs are optional. A blessing that is present must pass;
    when neither is present the model is treated as pushable and a warning
    is logged.

    Args:
      input_dict: Input dict from input key to a list of artifacts:
        - model_blessing: A `ModelBlessing` artifact from model validator or
          evaluator. Pusher looks for a custom property `blessed` in the
          artifact to check it is safe to push.
        - infra_blessing: An `InfraBlessing` artifact from infra validator.
          Pusher looks for a custom property `blessed` in the artifact to
          determine whether the model is mechanically servable from the
          model server to which Pusher is going to push.

    Returns:
      True if the model is blessed by validator.
    """
    # TODO(jyzhao): should this be in driver or executor.
    model_blessings = input_dict.get(
        standard_component_specs.MODEL_BLESSING_KEY)
    if model_blessings:
        model_blessing = artifact_utils.get_single_instance(model_blessings)
        blessed = model_utils.is_model_blessed(model_blessing)
        if not blessed:
            logging.info('Model on %s was not blessed by model validation',
                         model_blessing.uri)
            return False

    infra_blessings = input_dict.get(
        standard_component_specs.INFRA_BLESSING_KEY)
    if infra_blessings:
        infra_blessing = artifact_utils.get_single_instance(infra_blessings)
        validated = model_utils.is_infra_validated(infra_blessing)
        if not validated:
            logging.info('Model on %s was not blessed by infra validator',
                         infra_blessing.uri)
            return False

    if not (model_blessings or infra_blessings):
        logging.warning(
            'Pusher is going to push the model without validation. '
            'Consider using Evaluator or InfraValidator in your '
            'pipeline.')
    return True
def __init__(self,
             statistics: types.Channel = None,
             schema: types.Channel = None,
             exclude_splits: Optional[List[Text]] = None,
             output: Optional[types.Channel] = None,
             stats: Optional[types.Channel] = None,
             instance_name: Optional[Text] = None):
    """Construct an ExampleValidator component.

    Args:
      statistics: A Channel of type `standard_artifacts.ExampleStatistics`.
        When no `output` channel is given, its split names (minus
        `exclude_splits`) become the split names of the anomalies artifact.
      schema: A Channel of type `standard_artifacts.Schema`. _required_
      exclude_splits: Names of splits that the example validator should not
        validate. Default behavior (when exclude_splits is set to None) is
        excluding no splits.
      output: Output channel of type `standard_artifacts.ExampleAnomalies`.
        Created automatically when not provided.
      stats: Backwards compatibility alias for the 'statistics' argument.
        Takes precedence over `statistics` when both are given.
      instance_name: Optional name assigned to this specific instance of
        ExampleValidator. Required only if multiple ExampleValidator
        components are declared in the same pipeline.

    Either `stats` or `statistics` must be present in the arguments.
    """
    if stats:
        # The warning must name this component, not StatisticsGen.
        logging.warning(
            'The "stats" argument to the ExampleValidator component has '
            'been renamed to "statistics" and is deprecated. Please update your '
            'usage as support for this argument will be removed soon.')
        statistics = stats

    if exclude_splits is None:
        exclude_splits = []
        logging.info('Excluding no splits because exclude_splits is not set.')

    anomalies = output
    if not anomalies:
        # Build a default output channel whose artifact carries every
        # statistics split that is not excluded.
        anomalies_artifact = standard_artifacts.ExampleAnomalies()
        statistics_split_names = artifact_utils.decode_split_names(
            artifact_utils.get_single_instance(list(
                statistics.get())).split_names)
        split_names = [
            split for split in statistics_split_names
            if split not in exclude_splits
        ]
        anomalies_artifact.split_names = artifact_utils.encode_split_names(
            split_names)
        anomalies = types.Channel(
            type=standard_artifacts.ExampleAnomalies,
            artifacts=[anomalies_artifact])

    spec = ExampleValidatorSpec(
        statistics=statistics,
        schema=schema,
        exclude_splits=json_utils.dumps(exclude_splits),
        anomalies=anomalies)
    super(ExampleValidator, self).__init__(
        spec=spec, instance_name=instance_name)
def testGetFromSingleList(self):
    """Exercise the retrieval helpers on a list holding one Examples artifact."""
    examples = standard_artifacts.Examples()
    examples.uri = '/tmp/evaluri'
    examples.split_names = '["eval"]'
    artifacts = [examples]

    self.assertEqual(examples, artifact_utils.get_single_instance(artifacts))
    self.assertEqual('/tmp/evaluri', artifact_utils.get_single_uri(artifacts))
    self.assertEqual('/tmp/evaluri/eval',
                     artifact_utils.get_split_uri(artifacts, 'eval'))
    # 'train' is not one of the artifact's split names, so lookup must fail.
    self.assertRaises(ValueError, artifact_utils.get_split_uri, artifacts,
                      'train')
def __init__(self,
             examples: Optional[types.Channel] = None,
             schema: Optional[types.Channel] = None,
             stats_options: Optional[tfdv.StatsOptions] = None,
             output: Optional[types.Channel] = None,
             input_data: Optional[types.Channel] = None,
             instance_name: Optional[Text] = None,
             enable_cache: Optional[bool] = None):
    """Construct a StatisticsGen component.

    Args:
      examples: A Channel of `ExamplesPath` type, likely generated by the
        [ExampleGen component](https://www.tensorflow.org/tfx/guide/examplegen).
        This needs to contain two splits labeled `train` and `eval`.
        _required_
      schema: A `Schema` channel to use for automatically configuring the
        value of stats options passed to TFDV.
      stats_options: The StatsOptions instance to configure optional TFDV
        behavior. When stats_options.schema is set, it will be used instead
        of the `schema` channel input. Due to the requirement that
        stats_options be serialized, the slicer functions and custom stats
        generators are dropped and are therefore not usable.
      output: `ExampleStatisticsPath` channel for statistics of each split
        provided in the input examples. When omitted, a channel is created
        whose artifact copies the split names of the input examples.
      input_data: Backwards compatibility alias for the `examples` argument.
        When given, it replaces `examples`.
      instance_name: Optional name assigned to this specific instance of
        StatisticsGen. Required only if multiple StatisticsGen components are
        declared in the same pipeline.
      enable_cache: Optional boolean to indicate if cache is enabled for the
        StatisticsGen component. If not specified, defaults to the value
        specified for pipeline's enable_cache parameter.

    Raises:
      ValueError: If neither `examples` nor `input_data` is provided.
    """
    if input_data:
        absl.logging.warning(
            'The "input_data" argument to the StatisticsGen component has '
            'been renamed to "examples" and is deprecated. Please update your '
            'usage as support for this argument will be removed soon.')
        examples = input_data
    if examples is None:
        # Fail with a clear message instead of an AttributeError on None below.
        raise ValueError(
            'Either "examples" or the deprecated "input_data" argument must '
            'be provided to StatisticsGen.')
    if not output:
        # The statistics artifact covers exactly the splits of the input.
        statistics_artifact = standard_artifacts.ExampleStatistics()
        statistics_artifact.split_names = artifact_utils.get_single_instance(
            list(examples.get())).split_names
        output = types.Channel(
            type=standard_artifacts.ExampleStatistics,
            artifacts=[statistics_artifact])
    # TODO(b/150802589): Move jsonable interface to tfx_bsl and use json_utils.
    stats_options_json = stats_options.to_json() if stats_options else None
    spec = StatisticsGenSpec(
        examples=examples,
        schema=schema,
        stats_options_json=stats_options_json,
        statistics=output)
    super(StatisticsGen, self).__init__(
        spec=spec, instance_name=instance_name, enable_cache=enable_cache)
def test_get_from_split_list(self):
    """Check the retrieval helpers on a list with one Artifact per split."""
    names = ['train', 'eval']
    split_list = [artifact.Artifact('MyTypeName', split=name) for name in names]
    for name, instance in zip(names, split_list):
        instance.uri = '/tmp/' + name

    # The single-artifact accessors must reject a two-element list.
    self.assertRaises(ValueError, artifact_utils.get_single_instance,
                      split_list)
    self.assertRaises(ValueError, artifact_utils.get_single_uri, split_list)

    # Each split resolves to its own artifact and URI.
    for name, expected in zip(names, split_list):
        self.assertEqual(
            expected, artifact_utils._get_split_instance(split_list, name))
        self.assertEqual('/tmp/' + name,
                         artifact_utils.get_split_uri(split_list, name))
def translate_executor_output(
        output_dict: Mapping[str, List[artifact.Artifact]],
        name_from_id: Mapping[int, str]) -> Dict[str, pipeline_pb2.ArtifactList]:
    """Translates output_dict to a Kubeflow ArtifactList mapping.

    Args:
      output_dict: Mapping from output key to a list of artifacts; the single
        instance of each list is extracted with
        `artifact_utils.get_single_instance`.
      name_from_id: Mapping forwarded unchanged to `to_runtime_artifact`.

    Returns:
      A dict with the same keys as `output_dict`, each mapped to an
      `ArtifactList` holding one converted artifact.
    """

    def _as_artifact_list(artifacts):
        # Wrap the lone artifact of one output key into an ArtifactList.
        single = artifact_utils.get_single_instance(artifacts)
        return pipeline_pb2.ArtifactList(
            artifacts=[to_runtime_artifact(single, name_from_id)])

    return {
        key: _as_artifact_list(artifacts)
        for key, artifacts in output_dict.items()
    }
def Do(self,
       input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
    """Optionally train a tokenizer, then write tokenized copies of each split.

    Args:
      input_dict: Reads "examples": every split directory of every artifact in
        that list is both listed (as tokenizer training files) and read as
        TFRecords.
      output_dict: Reads "tokenizer" (single URI passed to the tokenizer's
        `save`) and "output_examples" (single artifact whose `split_names` is
        set to all input splits and under which each split is written).
      exec_properties: Reads `StepKeys.SOURCE` (source path of the tokenizer
        class) and `StepKeys.ARGS` (keyword arguments for its constructor).
    """
    step_source = exec_properties[StepKeys.SOURCE]
    step_args = exec_properties[StepKeys.ARGS]
    tokenizer_cls = source_utils.load_source_path_class(step_source)
    tokenizer_step: BaseTokenizer = tokenizer_cls(**step_args)

    tokenizer_location = artifact_utils.get_single_uri(output_dict["tokenizer"])

    # Gather every (split, directory) pair and all files beneath them.
    split_names = []
    split_uris = []
    all_files = []
    for examples in input_dict["examples"]:
        for split in artifact_utils.decode_split_names(examples.split_names):
            split_dir = os.path.join(examples.uri, split)
            split_names.append(split)
            split_uris.append((split, split_dir))
            all_files.extend(path_utils.list_dir(split_dir))

    # The output artifact must advertise its splits before get_split_uri is
    # called on it inside the pipeline below.
    output_examples = artifact_utils.get_single_instance(
        output_dict["output_examples"])
    output_examples.split_names = artifact_utils.encode_split_names(
        split_names)

    if not tokenizer_step.skip_training:
        tokenizer_step.train(files=all_files)
        tokenizer_step.save(output_dir=tokenizer_location)

    with self._make_beam_pipeline() as p:
        for split, uri in split_uris:
            input_uri = io_utils.all_files_pattern(uri)
            records = (
                p
                | 'ReadData.' + split >> beam.io.ReadFromTFRecord(
                    file_pattern=input_uri))
            parsed = (
                records
                | "ParseTFExFromString." + split >> beam.Map(
                    tf.train.Example.FromString))
            tokenized = (
                parsed
                | "AddTokens." + split >> beam.Map(
                    append_tf_example, tokenizer_step=tokenizer_step))
            serialized = (
                tokenized
                | 'Serialize.' + split >> beam.Map(
                    lambda x: x.SerializeToString()))
            _ = (
                serialized
                | 'WriteSplit.' + split >> WriteSplit(
                    get_split_uri(output_dict["output_examples"], split)))
def Do(self,
       input_dict: Dict[Text, List[Artifact]],
       output_dict: Dict[Text, List[Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
    """Record evaluator output as custom properties of the BenchmarkResult.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - evaluation: Model evaluation results (a single URI is expected).
      output_dict: Output dict from key to a list of artifacts, including:
        - benchmark_result: `BenchmarkResult` artifact to update.
      exec_properties: A dict of execution properties. The keys read are:
        - benchmark_name: Stored as a string custom property.
        - run: Stored as an int custom property.
        - num_runs: Stored as an int custom property.
        - additional_context: Decoded with `serialize.decode`; every entry is
          stored as a string custom property.

    Raises:
      ValueError: If evaluation uri doesn't exist.
    """
    evaluation_uri = artifact_utils.get_single_uri(input_dict['evaluation'])
    if not tf.io.gfile.exists(evaluation_uri):
        raise ValueError('The uri="{}" does not exist.'.format(evaluation_uri))

    result = artifact_utils.get_single_instance(output_dict['benchmark_result'])

    def _publish_as_strings(properties):
        # TODO(b/151723291): Use correct type instead of string.
        for name, val in properties.items():
            result.set_string_custom_property(name, str(val))

    result.set_string_custom_property(
        br.BenchmarkResult.BENCHMARK_NAME_KEY,
        exec_properties['benchmark_name'])
    result.set_int_custom_property(
        br.BenchmarkResult.BENCHMARK_RUN_KEY, exec_properties['run'])
    result.set_int_custom_property(
        br.BenchmarkResult.RUNS_PER_BENCHMARK_KEY,
        exec_properties['num_runs'])

    # Publish evaluation metrics
    _publish_as_strings(self._load_evaluation(evaluation_uri))

    # TODO(b/175802446): Add additional properties storing
    # `additional_context` and `metric` keys so user can distinguish between
    # custom properties.
    _publish_as_strings(
        serialize.decode(exec_properties['additional_context']))
def test_get_from_single_list(self):
    """Check the retrieval helpers on a list holding one 'eval' Artifact."""
    eval_artifact = artifact.Artifact('MyTypeName', split='eval')
    single_list = [eval_artifact]
    eval_artifact.uri = '/tmp/evaluri'

    self.assertEqual(eval_artifact,
                     artifact_utils.get_single_instance(single_list))
    self.assertEqual('/tmp/evaluri', artifact_utils.get_single_uri(single_list))
    self.assertEqual(
        eval_artifact, artifact_utils._get_split_instance(single_list, 'eval'))
    self.assertEqual('/tmp/evaluri',
                     artifact_utils.get_split_uri(single_list, 'eval'))

    # No artifact carries the 'train' split, so both lookups must fail.
    self.assertRaises(ValueError, artifact_utils._get_split_instance,
                      single_list, 'train')
    self.assertRaises(ValueError, artifact_utils.get_split_uri, single_list,
                      'train')
def testGetFromSplits(self):
    """Check the retrieval helpers on one Examples artifact with two splits."""
    examples = standard_artifacts.Examples()
    examples.uri = '/tmp'
    examples.split_names = artifact_utils.encode_split_names(['train', 'eval'])
    artifacts = [examples]

    self.assertEqual(examples.split_names, '["train", "eval"]')
    self.assertIs(artifact_utils.get_single_instance(artifacts), examples)
    self.assertEqual('/tmp', artifact_utils.get_single_uri(artifacts))
    # Each split URI is the artifact URI joined with the split name.
    for split in ('train', 'eval'):
        self.assertEqual('/tmp/' + split,
                         artifact_utils.get_split_uri(artifacts, split))
def __init__(self,
             input_examples: types.Channel,
             data_view: types.Channel,
             output_examples: Optional[types.Channel] = None,
             instance_name: Optional[Text] = None):
    """Construct the component from an examples channel and a data view.

    Args:
      input_examples: Channel holding the input examples; it must resolve to
        a single artifact when `output_examples` is not supplied.
      data_view: Channel passed through to the component spec as `data_view`.
      output_examples: Optional output channel. When omitted, a channel is
        built around a new `Examples` artifact copied from the single input
        examples artifact.
      instance_name: Optional name forwarded to the base component.
    """
    if not output_examples:
        copied_examples = standard_artifacts.Examples()
        source_examples = artifact_utils.get_single_instance(
            list(input_examples.get()))
        copied_examples.copy_from(source_examples)
        output_examples = channel_utils.as_channel([copied_examples])

    super().__init__(
        spec=_DataViewBinderComponentSpec(
            input_examples=input_examples,
            data_view=data_view,
            output_examples=output_examples),
        instance_name=instance_name)