Example No. 1
def get_payload_format(examples: types.Artifact) -> int:
    """Returns the payload format of Examples artifact.

  If the Examples artifact does not contain the "payload_format" custom
  property, it was created before TFX supported multiple payload formats, and
  can be regarded as being in tf.Example format.

  Args:
    examples: A standard_artifacts.Examples artifact.

  Returns:
    payload_format: One of the enums in example_gen_pb2.PayloadFormat.
  """
    assert examples.type is standard_artifacts.Examples, (
        'examples must be of type standard_artifacts.Examples')
    if examples.has_custom_property(
            example_gen_utils.PAYLOAD_FORMAT_PROPERTY_NAME):
        return example_gen_pb2.PayloadFormat.Value(
            examples.get_string_custom_property(
                example_gen_utils.PAYLOAD_FORMAT_PROPERTY_NAME))
    else:
        logging.warning(
            'Examples artifact does not have %s custom property. '
            'Falling back to %s',
            example_gen_utils.PAYLOAD_FORMAT_PROPERTY_NAME,
            example_gen_pb2.PayloadFormat.Name(_DEFAULT_PAYLOAD_FORMAT))
        return _DEFAULT_PAYLOAD_FORMAT
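
A minimal round-trip sketch, pairing this getter with the set_payload_format helper shown in Example No. 4. It assumes a TFX installation where the modules used in these snippets are importable, and works on a freshly created artifact rather than one read from MLMD:

# Usage sketch under the assumptions above.
from tfx.proto import example_gen_pb2
from tfx.types import standard_artifacts

examples = standard_artifacts.Examples()
set_payload_format(examples, example_gen_pb2.PayloadFormat.FORMAT_TF_EXAMPLE)
# The enum is stored by name as a string custom property and decoded on read.
assert (get_payload_format(examples) ==
        example_gen_pb2.PayloadFormat.FORMAT_TF_EXAMPLE)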
Example No. 2
def _CreateWarmupModel(self, blessing: types.Artifact, model_path: str,
                       warmup_requests: List[iv_types.Request]):
    output_model_path = path_utils.stamped_model_path(blessing.uri)
    io_utils.copy_dir(src=model_path, dst=output_model_path)
    io_utils.write_tfrecord_file(
        path_utils.warmup_file_path(output_model_path),
        *[_convert_to_prediction_log(r) for r in warmup_requests])
    blessing.set_int_custom_property(_MODEL_FLAG_KEY, 1)
Example No. 3
def set_file_format(examples: types.Artifact, file_format: str):
    """Sets the file format custom property for `examples`.

  Args:
    examples: A standard_artifacts.Examples artifact.
    file_format: One of the file formats that tfx_bsl understands.
  """
    assert examples.type_name == standard_artifacts.Examples.TYPE_NAME, (
        'examples must be of type standard_artifacts.Examples')
    examples.set_string_custom_property(
        example_gen_utils.FILE_FORMAT_PROPERTY_NAME, file_format)
Example No. 4
def set_payload_format(examples: types.Artifact, payload_format: int):
    """Sets the payload format custom property for `examples`.

  Args:
    examples: A standard_artifacts.Examples artifact.
    payload_format: One of the enums in example_gen_pb2.PayloadFormat.
  """
    assert examples.type is standard_artifacts.Examples, (
        'examples must be of type standard_artifacts.Examples')
    examples.set_string_custom_property(
        example_gen_utils.PAYLOAD_FORMAT_PROPERTY_NAME,
        example_gen_pb2.PayloadFormat.Name(payload_format))
Example No. 5
def _set_artifact_properties(artifact: types.Artifact,
                             properties: Optional[Dict[str, Any]],
                             custom_properties: Optional[Dict[str, Any]]):
    """Sets properties and custom_properties to the given artifact."""
    if properties is not None:
        for key, value in properties.items():
            setattr(artifact, key, value)
    if custom_properties is not None:
        for key, value in custom_properties.items():
            if isinstance(value, int):
                artifact.set_int_custom_property(key, value)
            elif isinstance(value, (str, bytes)):
                artifact.set_string_custom_property(key, value)
            else:
                raise NotImplementedError(
                    f'Unexpected custom_property value type: {type(value)}')
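
A small usage sketch for this private helper. The values are hypothetical; 'span' is assumed to be a declared property of Examples, as in recent TFX versions, so it goes through setattr, while the other keys become custom properties:

from tfx.types import standard_artifacts

examples = standard_artifacts.Examples()
_set_artifact_properties(
    examples,
    properties={'span': 3},                 # declared property -> setattr
    custom_properties={'run_id': 'run-42',  # str -> string custom property
                       'tuner_trial': 7})   # int -> int custom property
# A float value would raise NotImplementedError in the loop above.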
Example No. 6
def is_model_blessed(model_blessing: types.Artifact) -> bool:
    """Returns whether model is blessed by upstream ModelValidator.

  Args:
    model_blessing: model blessing artifact from model_validator.

  Returns:
    True if the model is blessed by validator.
  """
    return model_blessing.get_int_custom_property('blessed') == 1
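
A minimal sketch of the convention this check relies on; it assumes standard_artifacts.ModelBlessing is the blessing artifact type, as in standard TFX:

from tfx.types import standard_artifacts

blessing = standard_artifacts.ModelBlessing()
blessing.set_int_custom_property('blessed', 1)   # normally written by the validator
assert is_model_blessed(blessing)
blessing.set_int_custom_property('blessed', 0)
assert not is_model_blessed(blessing)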
Example No. 7
def is_infra_validated(infra_blessing: types.Artifact) -> bool:
    """Returns whether model is infra blessed by upstream InfraValidator.

  Args:
    infra_blessing: A `InfraBlessing` artifact from infra validator.

  Returns:
    Whether model is infra validated or not.
  """
    return infra_blessing.get_int_custom_property('blessed') == 1
Example No. 8
def get_file_format(examples: types.Artifact) -> str:
    """Returns the file format of Examples artifact.

  If the Examples artifact does not contain the "file_format" custom property,
  it was produced by OSS ExampleGen and can be treated as 'tfrecords_gzip' format.

  Args:
    examples: A standard_artifacts.Examples artifact.

  Returns:
    One of the file formats that tfx_bsl understands.
  """
    assert examples.type_name == standard_artifacts.Examples.TYPE_NAME, (
        'examples must be of type standard_artifacts.Examples')
    if examples.has_custom_property(
            example_gen_utils.FILE_FORMAT_PROPERTY_NAME):
        return examples.get_string_custom_property(
            example_gen_utils.FILE_FORMAT_PROPERTY_NAME)
    else:
        return _DEFAULT_FILE_FORMAT
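
A hedged round-trip sketch pairing this getter with set_file_format from Example No. 3; the fallback assumes _DEFAULT_FILE_FORMAT is 'tfrecords_gzip', as the docstring suggests:

from tfx.types import standard_artifacts

examples = standard_artifacts.Examples()
# No custom property set yet: falls back to _DEFAULT_FILE_FORMAT.
assert get_file_format(examples) == _DEFAULT_FILE_FORMAT
set_file_format(examples, 'tfrecords_gzip')
assert get_file_format(examples) == 'tfrecords_gzip'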
Example No. 9
def _attach_artifact_properties(spec: pipeline_pb2.OutputSpec.ArtifactSpec,
                                artifact: types.Artifact):
    """Attaches properties of an artifact using ArtifactSpec."""
    for key, value in spec.additional_properties.items():
        if not value.HasField('field_value'):
            raise RuntimeError('Property value is not a field_value for %s' %
                               key)
        setattr(artifact, key,
                data_types_utils.get_metadata_value(value.field_value))

    for key, value in spec.additional_custom_properties.items():
        if not value.HasField('field_value'):
            raise RuntimeError('Property value is not a field_value for %s' %
                               key)
        value_type = value.field_value.WhichOneof('value')
        if value_type == 'int_value':
            artifact.set_int_custom_property(key, value.field_value.int_value)
        elif value_type == 'string_value':
            artifact.set_string_custom_property(key,
                                                value.field_value.string_value)
        elif value_type == 'double_value':
            artifact.set_float_custom_property(key,
                                               value.field_value.double_value)
        else:
            raise RuntimeError(f'Unexpected value_type: {value_type}')
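
A hedged sketch of building an ArtifactSpec this helper can consume, assuming the orchestration pipeline_pb2 layout used above (map values carrying a field_value with an int/string/double oneof):

from tfx.proto.orchestration import pipeline_pb2
from tfx.types import standard_artifacts

spec = pipeline_pb2.OutputSpec.ArtifactSpec()
spec.additional_custom_properties['run_id'].field_value.string_value = 'run-42'
spec.additional_custom_properties['precision'].field_value.double_value = 0.9

model = standard_artifacts.Model()
_attach_artifact_properties(spec, model)
# 'run_id' and 'precision' are now custom properties on the Model artifact.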
Example No. 10
def _MarkPushed(self, model_push: types.Artifact, pushed_destination: Text,
                pushed_version: Optional[Text] = None) -> None:
  model_push.set_int_custom_property('pushed', 1)
  model_push.set_string_custom_property(
      _PUSHED_DESTINATION_KEY, pushed_destination)
  if pushed_version is not None:
    model_push.set_string_custom_property(_PUSHED_VERSION_KEY, pushed_version)
Example No. 11
def get_data_view_uri(examples: types.Artifact) -> Optional[Text]:
    """Returns the URI to the DataView attached to an Examples artifact.

  Or None, if not attached.

  Args:
    examples: an Examples artifact.
  Returns:
    The URI to the DataView or None.
  """
    assert examples.type is standard_artifacts.Examples, (
        'examples must be of type standard_artifacts.Examples')
    data_view_uri = examples.get_string_custom_property(
        constants.DATA_VIEW_URI_PROPERTY_KEY)
    return data_view_uri if data_view_uri else None
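
A usage sketch, assuming the tfx-internal constants module imported by this snippet; an unset property reads back as an empty string, which maps to None:

from tfx.types import standard_artifacts

examples = standard_artifacts.Examples()
assert get_data_view_uri(examples) is None      # property unset
examples.set_string_custom_property(
    constants.DATA_VIEW_URI_PROPERTY_KEY, '/pipelines/DataView/1')
assert get_data_view_uri(examples) == '/pipelines/DataView/1'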
Example No. 12
def get_payload_format(examples: types.Artifact) -> int:
    """Returns the payload format of `examples`.

  Args:
    examples: A standard_artifacts.Examples artifact.

  Returns:
    payload_format: One of the enums in example_gen_pb2.PayloadFormat.
  """
    assert examples.type is standard_artifacts.Examples, (
        'examples must be of type standard_artifacts.Examples')
    payload_format_from_artifact = examples.get_string_custom_property(
        example_gen_utils.PAYLOAD_FORMAT_PROPERTY_NAME)
    if payload_format_from_artifact:
        return example_gen_pb2.PayloadFormat.Value(
            payload_format_from_artifact)
    else:
        return example_gen_pb2.PayloadFormat.FORMAT_TF_EXAMPLE
Example No. 13
def ExampleValidator(
    stats_path: InputPath('ExampleStatistics'),
    #statistics_path: InputPath('ExampleStatistics'),
    schema_path: InputPath('Schema'),

    output_path: OutputPath('ExampleValidation'),
):
    """
    A TFX component to validate input examples.

    The ExampleValidator component uses [Tensorflow Data
    Validation](https://www.tensorflow.org/tfx/data_validation) to
    validate the statistics of some splits on input examples against a schema.

    The ExampleValidator component identifies anomalies in training and serving
    data. The component can be configured to detect different classes of anomalies
    in the data. It can:
        - perform validity checks by comparing data statistics against a schema that
        codifies expectations of the user.
        - detect data drift by looking at a series of data.
        - detect changes in dataset-wide data (i.e., num_examples) across spans or
        versions.

    Schema Based Example Validation
    The ExampleValidator component identifies any anomalies in the example data by
    comparing data statistics computed by the StatisticsGen component against a
    schema. The schema codifies properties which the input data is expected to
    satisfy, and is provided and maintained by the user.

    Please see https://www.tensorflow.org/tfx/data_validation for more details.

    Args:
        stats: A Channel of 'ExampleStatisticsPath' type. This should contain at
            least the 'eval' split. Other splits are currently ignored. Will be
            deprecated in the future in favor of the `statistics` parameter.
        #statistics: Future replacement of the 'stats' argument.
        schema: A Channel of 'SchemaPath' type. _required_
    Returns:
        output: Output channel of 'ExampleValidationPath' type.

    Either `stats` or `statistics` must be present in the arguments.
    """
    from tfx.components.example_validator.component import ExampleValidator
    component_class = ExampleValidator
    input_channels_with_splits = {'stats', 'statistics'}
    output_channels_with_splits = set()


    import json
    import os
    from google.protobuf import json_format, message
    from tfx.types import Artifact, channel_utils

    arguments = locals().copy()

    component_class_args = {}

    for name, execution_parameter in component_class.SPEC_CLASS.PARAMETERS.items():
        argument_value_obj = argument_value = arguments.get(name, None)
        if argument_value is None:
            continue
        parameter_type = execution_parameter.type
        if isinstance(parameter_type, type) and issubclass(parameter_type, message.Message): # execution_parameter.type can also be a tuple
            argument_value_obj = parameter_type()
            json_format.Parse(argument_value, argument_value_obj)
        component_class_args[name] = argument_value_obj

    for name, channel_parameter in component_class.SPEC_CLASS.INPUTS.items():
        artifact_path = arguments[name + '_path']
        artifacts = []
        if name in input_channels_with_splits:
            # Recovering splits
            splits = sorted(os.listdir(artifact_path))
            for split in splits:
                artifact = Artifact(type_name=channel_parameter.type_name)
                artifact.split = split
                artifact.uri = os.path.join(artifact_path, split) + '/'
                artifacts.append(artifact)
        else:
            artifact = Artifact(type_name=channel_parameter.type_name)
            artifact.uri = artifact_path + '/' # ?
            artifacts.append(artifact)
        component_class_args[name] = channel_utils.as_channel(artifacts)

    component_class_instance = component_class(**component_class_args)

    input_dict = {name: channel.get() for name, channel in component_class_instance.inputs.get_all().items()}
    output_dict = {name: channel.get() for name, channel in component_class_instance.outputs.get_all().items()}
    exec_properties = component_class_instance.exec_properties

    # Generating paths for output artifacts
    for name, artifacts in output_dict.items():
        base_artifact_path = arguments[name + '_path']
        for artifact in artifacts:
            artifact.uri = os.path.join(base_artifact_path, artifact.split) # Default split is ''

    print('component instance: ' + str(component_class_instance))

    #executor = component_class.EXECUTOR_SPEC.executor_class() # Same
    executor = component_class_instance.executor_spec.executor_class()
    executor.Do(
        input_dict=input_dict,
        output_dict=output_dict,
        exec_properties=exec_properties,
    )
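
This wrapper follows the pattern used to turn plain Python functions into Kubeflow Pipelines components. A hedged sketch of how it might be instantiated, assuming an older kfp SDK where func_to_container_op exists and a TFX base image is available (the image tag is hypothetical):

from kfp.components import func_to_container_op

# The *_path parameters become artifact inputs/outputs of the component;
# kfp strips the '_path' suffix, so callers wire 'stats' and 'schema'.
example_validator_op = func_to_container_op(
    ExampleValidator, base_image='tensorflow/tfx:0.15.0')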
Example No. 14
def _MarkBlessedIfSucceeded(self, blessing: types.Artifact) -> None:
    if not self._validation_failed:
        logging.info(
            'Model passed infra validation; marking model as blessed.')
        io_utils.write_string_file(os.path.join(blessing.uri, BLESSED), '')
        blessing.set_int_custom_property('blessed', 1)
Example No. 15
def _mark_not_blessed(blessing: types.Artifact) -> None:
    logging.info('Model failed infra validation.')
    io_utils.write_string_file(os.path.join(blessing.uri, NOT_BLESSED), '')
    blessing.set_int_custom_property('blessed', 0)
Example No. 16
def _MarkNotBlessed(self, blessing: types.Artifact) -> None:
    if not self._validation_failed:
        self._validation_failed = True
        io_utils.write_string_file(os.path.join(blessing.uri, NOT_BLESSED),
                                   '')
        blessing.set_int_custom_property('blessed', 0)
Example No. 17
def Evaluator(
    examples_path: InputPath('Examples'),
    model_exports_path: InputPath('Model'),
    #model_path: InputPath('Model'),
    output_path: OutputPath('ModelEval'),
    feature_slicing_spec: 'JsonObject: evaluator_pb2.FeatureSlicingSpec' = None,
):
    """
    A TFX component to evaluate models trained by a TFX Trainer component.

    The Evaluator component performs model evaluations in the TFX pipeline and
    the resultant metrics can be viewed in a Jupyter notebook.  It uses the
    input examples generated from the
    [ExampleGen](https://www.tensorflow.org/tfx/guide/examplegen)
    component to evaluate the models.

    Specifically, it can provide:
        - metrics computed on entire training and eval dataset
        - tracking metrics over time
        - model quality performance on different feature slices

    ## Exporting the EvalSavedModel in Trainer

    In order to setup Evaluator in a TFX pipeline, an EvalSavedModel needs to be
    exported during training, which is a special SavedModel containing
    annotations for the metrics, features, labels, and so on in your model.
    Evaluator uses this EvalSavedModel to compute metrics.

    As part of this, the Trainer component creates eval_input_receiver_fn,
    analogous to the serving_input_receiver_fn, which will extract the features
    and labels from the input data. As with serving_input_receiver_fn, there are
    utility functions to help with this.

    Please see https://www.tensorflow.org/tfx/model_analysis for more details.

    Args:
        examples: A Channel of 'ExamplesPath' type, usually produced by the
            ExampleGen component. Must have the 'eval' split. _required_
        model_exports: A Channel of 'ModelExportPath' type, usually produced by
            the Trainer component. Will be deprecated in the future in favor of
            the `model` parameter.
        #model: Future replacement of the `model_exports` argument.
        feature_slicing_spec:
            [evaluator_pb2.FeatureSlicingSpec](https://github.com/tensorflow/tfx/blob/master/tfx/proto/evaluator.proto)
            instance that describes how Evaluator should slice the data.
    Returns:
        output: Channel of `ModelEvalPath` to store the evaluation results.

    Either `model_exports` or `model` must be present in the input arguments.

    """
    from tfx.components.evaluator.component import Evaluator
    component_class = Evaluator
    input_channels_with_splits = {'examples'}
    output_channels_with_splits = set()

    import json
    import os
    from google.protobuf import json_format, message
    from tfx.types import Artifact, channel_utils

    arguments = locals().copy()

    component_class_args = {}

    for name, execution_parameter in component_class.SPEC_CLASS.PARAMETERS.items(
    ):
        argument_value_obj = argument_value = arguments.get(name, None)
        if argument_value is None:
            continue
        parameter_type = execution_parameter.type
        if isinstance(parameter_type, type) and issubclass(
                parameter_type, message.Message
        ):  # execution_parameter.type can also be a tuple
            argument_value_obj = parameter_type()
            json_format.Parse(argument_value, argument_value_obj)
        component_class_args[name] = argument_value_obj

    for name, channel_parameter in component_class.SPEC_CLASS.INPUTS.items():
        artifact_path = arguments[name + '_path']
        artifacts = []
        if name in input_channels_with_splits:
            # Recovering splits
            splits = sorted(os.listdir(artifact_path))
            for split in splits:
                artifact = Artifact(type_name=channel_parameter.type_name)
                artifact.split = split
                artifact.uri = os.path.join(artifact_path, split) + '/'
                artifacts.append(artifact)
        else:
            artifact = Artifact(type_name=channel_parameter.type_name)
            artifact.uri = artifact_path + '/'  # ?
            artifacts.append(artifact)
        component_class_args[name] = channel_utils.as_channel(artifacts)

    component_class_instance = component_class(**component_class_args)

    input_dict = {
        name: channel.get()
        for name, channel in component_class_instance.inputs.get_all().items()
    }
    output_dict = {
        name: channel.get()
        for name, channel in
        component_class_instance.outputs.get_all().items()
    }
    exec_properties = component_class_instance.exec_properties

    # Generating paths for output artifacts
    for name, artifacts in output_dict.items():
        base_artifact_path = arguments[name + '_path']
        for artifact in artifacts:
            artifact.uri = os.path.join(base_artifact_path,
                                        artifact.split)  # Default split is ''

    print('component instance: ' + str(component_class_instance))

    #executor = component_class.EXECUTOR_SPEC.executor_class() # Same
    executor = component_class_instance.executor_spec.executor_class()
    executor.Do(
        input_dict=input_dict,
        output_dict=output_dict,
        exec_properties=exec_properties,
    )
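
The feature_slicing_spec parameter is parsed with json_format.Parse in the loop above, so it is passed as a JSON-encoded evaluator_pb2.FeatureSlicingSpec. A hedged example (field names follow the TFX 0.x proto; the column name is hypothetical):

# json_format.Parse accepts the proto's original snake_case field names.
feature_slicing_spec = '{"specs": [{"column_for_slicing": ["trip_start_hour"]}]}'
# Evaluator(examples_path=..., model_exports_path=..., output_path=...,
#           feature_slicing_spec=feature_slicing_spec)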
Example No. 18
def _mark_blessed(blessing: types.Artifact) -> None:
  logging.info('Model passed infra validation.')
  io_utils.write_string_file(
      os.path.join(blessing.uri, _BLESSED_FILENAME), '')
  blessing.set_int_custom_property(_BLESSED_KEY, 1)
Example No. 19
def _get_span_custom_property(artifact: types.Artifact) -> int:
    # For backward compatibility, span may be stored as a string.
    str_span = artifact.get_string_custom_property(utils.SPAN_PROPERTY_NAME)
    if str_span:
        return int(str_span)
    return artifact.get_int_custom_property(utils.SPAN_PROPERTY_NAME)
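
A sketch of the backward compatibility this helper handles, assuming the utils module from these snippets exposes SPAN_PROPERTY_NAME:

from tfx.types import standard_artifacts

old_style = standard_artifacts.Examples()
old_style.set_string_custom_property(utils.SPAN_PROPERTY_NAME, '7')  # legacy
new_style = standard_artifacts.Examples()
new_style.set_int_custom_property(utils.SPAN_PROPERTY_NAME, 7)
assert (_get_span_custom_property(old_style) ==
        _get_span_custom_property(new_style) == 7)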
Example No. 20
def BigQueryExampleGen(
    example_artifacts_path: OutputPath('Examples'),
    query: str = None,
    input_config: 'JsonObject: example_gen_pb2.Input' = None,
    output_config: 'JsonObject: example_gen_pb2.Output' = None,
):
    """
    Official TFX BigQueryExampleGen component.

    The BigQueryExampleGen component takes a query and generates train
    and eval examples for downstream components.


    Args:
        query: BigQuery sql string, query result will be treated as a single
            split, can be overwritten by input_config.
        input_config: An example_gen_pb2.Input instance with Split.pattern as
            BigQuery sql string. If set, it overwrites the 'query' arg, and allows
            different queries per split.
        output_config: An example_gen_pb2.Output instance, providing output
            configuration. If unset, default splits will be 'train' and 'eval' with
            size 2:1.
    Returns:
        example_artifacts: Optional channel of 'ExamplesPath' for output train and
            eval examples.

    Raises:
        RuntimeError: Only one of query and input_config should be set.
    """
    from tfx.components.example_gen.big_query_example_gen.component import BigQueryExampleGen
    component_class = BigQueryExampleGen
    input_channels_with_splits = set()
    output_channels_with_splits = {'example_artifacts'}

    import json
    import os
    from google.protobuf import json_format, message
    from tfx.types import Artifact, channel_utils

    arguments = locals().copy()

    component_class_args = {}

    for name, execution_parameter in component_class.SPEC_CLASS.PARAMETERS.items(
    ):
        argument_value_obj = argument_value = arguments.get(name, None)
        if argument_value is None:
            continue
        parameter_type = execution_parameter.type
        if isinstance(parameter_type, type) and issubclass(
                parameter_type, message.Message
        ):  # execution_parameter.type can also be a tuple
            argument_value_obj = parameter_type()
            json_format.Parse(argument_value, argument_value_obj)
        component_class_args[name] = argument_value_obj

    for name, channel_parameter in component_class.SPEC_CLASS.INPUTS.items():
        artifact_path = arguments[name + '_path']
        artifacts = []
        if name in input_channels_with_splits:
            # Recovering splits
            splits = sorted(os.listdir(artifact_path))
            for split in splits:
                artifact = Artifact(type_name=channel_parameter.type_name)
                artifact.split = split
                artifact.uri = os.path.join(artifact_path, split) + '/'
                artifacts.append(artifact)
        else:
            artifact = Artifact(type_name=channel_parameter.type_name)
            artifact.uri = artifact_path + '/'  # ?
            artifacts.append(artifact)
        component_class_args[name] = channel_utils.as_channel(artifacts)

    component_class_instance = component_class(**component_class_args)

    input_dict = {
        name: channel.get()
        for name, channel in component_class_instance.inputs.get_all().items()
    }
    output_dict = {
        name: channel.get()
        for name, channel in
        component_class_instance.outputs.get_all().items()
    }
    exec_properties = component_class_instance.exec_properties

    # Generating paths for output artifacts
    for name, artifacts in output_dict.items():
        base_artifact_path = arguments[name + '_path']
        for artifact in artifacts:
            artifact.uri = os.path.join(base_artifact_path,
                                        artifact.split)  # Default split is ''

    print('component instance: ' + str(component_class_instance))

    #executor = component_class.EXECUTOR_SPEC.executor_class() # Same
    executor = component_class_instance.executor_spec.executor_class()
    executor.Do(
        input_dict=input_dict,
        output_dict=output_dict,
        exec_properties=exec_properties,
    )
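
output_config is likewise parsed from JSON into an example_gen_pb2.Output. A hedged example reproducing the default 2:1 train/eval split described in the docstring:

output_config = '''
{
  "split_config": {
    "splits": [
      {"name": "train", "hash_buckets": 2},
      {"name": "eval", "hash_buckets": 1}
    ]
  }
}
'''
# BigQueryExampleGen(example_artifacts_path=...,
#                    query='SELECT * FROM `project.dataset.table`',
#                    output_config=output_config)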
Example No. 21
def Transform(
    input_data_path: InputPath('Examples'),
    #examples: InputPath('Examples'),
    schema_path: InputPath('Schema'),
    transform_output_path: OutputPath('TransformGraph'),
    #transform_graph_path: OutputPath('TransformGraph'),
    transformed_examples_path: OutputPath('Examples'),
    module_file: 'Uri' = None,
    preprocessing_fn: str = None,
):
    """A TFX component to transform the input examples.

    The Transform component wraps TensorFlow Transform (tf.Transform) to
    preprocess data in a TFX pipeline. This component loads the
    preprocessing_fn from the input module file, preprocesses both the 'train'
    and 'eval' splits of the input examples, generates the `tf.Transform`
    output, and saves both the transform function and the transformed examples
    to the locations designated by the orchestrator.

    ## Providing a preprocessing function
    The Transform executor loads the file provided in `module_file` and looks
    specifically for the `preprocessing_fn()` function within that file, which
    it uses to preprocess the examples.

    An example of `preprocessing_fn()` can be found in the [user-supplied
    code](https://github.com/tensorflow/tfx/blob/master/tfx/examples/chicago_taxi_pipeline/taxi_utils.py)
    of the TFX Chicago Taxi pipeline example.

    Args:
      input_data: A Channel of 'Examples' type (required). This should
        contain the two splits 'train' and 'eval'.
      #examples: Forwards compatibility alias for the 'input_data' argument.
      schema: A Channel of 'SchemaPath' type. This should contain a single
        schema artifact.
      module_file: The file path to a python module file, from which the
        'preprocessing_fn' function will be loaded. The function must have the
        following signature.

        def preprocessing_fn(inputs: Dict[Text, Any]) -> Dict[Text, Any]:
          ...

        where the values of the input and returned Dict are either tf.Tensor or
        tf.SparseTensor.  Exactly one of 'module_file' or 'preprocessing_fn'
        must be supplied.
      preprocessing_fn: The path to python function that implements a
         'preprocessing_fn'. See 'module_file' for expected signature of the
         function. Exactly one of 'module_file' or 'preprocessing_fn' must
         be supplied.

    Returns:
      transform_output: Optional output 'TransformPath' channel for output of
        'tf.Transform', which includes an exported Tensorflow graph suitable for
        both training and serving;
      transformed_examples: Optional output 'ExamplesPath' channel for
        materialized transformed examples, which includes both 'train' and
        'eval' splits.

    Raises:
      ValueError: When both or neither of 'module_file' and 'preprocessing_fn'
        is supplied.
    """
    from tfx.components.transform.component import Transform
    component_class = Transform
    input_channels_with_splits = {'input_data', 'examples'}
    output_channels_with_splits = {'transformed_examples'}

    import json
    import os
    import tfx
    from google.protobuf import json_format, message
    from tfx.types import Artifact, channel_utils

    arguments = locals().copy()

    component_class_args = {}

    for name, execution_parameter in component_class.SPEC_CLASS.PARAMETERS.items(
    ):
        argument_value_obj = argument_value = arguments.get(name, None)
        if argument_value is None:
            continue
        parameter_type = execution_parameter.type
        if isinstance(parameter_type, type) and issubclass(
                parameter_type, message.Message
        ):  # Maybe FIX: execution_parameter.type can also be a tuple
            argument_value_obj = parameter_type()
            json_format.Parse(argument_value, argument_value_obj)
        component_class_args[name] = argument_value_obj

    for name, channel_parameter in component_class.SPEC_CLASS.INPUTS.items():
        artifact_path = arguments[name + '_path']
        artifacts = []
        if name in input_channels_with_splits:
            # Recovering splits
            splits = sorted(os.listdir(artifact_path))
            for split in splits:
                artifact = Artifact(type_name=channel_parameter.type_name)
                artifact.split = split
                artifact.uri = os.path.join(artifact_path, split) + '/'
                artifacts.append(artifact)
        else:
            artifact = Artifact(type_name=channel_parameter.type_name)
            artifact.uri = artifact_path + '/'  # ?
            artifacts.append(artifact)
        component_class_args[name] = channel_utils.as_channel(artifacts)

    component_class_instance = component_class(**component_class_args)

    input_dict = {
        name: channel.get()
        for name, channel in component_class_instance.inputs.get_all().items()
    }
    output_dict = {
        name: channel.get()
        for name, channel in
        component_class_instance.outputs.get_all().items()
    }
    exec_properties = component_class_instance.exec_properties

    # Generating paths for output artifacts
    for name, artifacts in output_dict.items():
        base_artifact_path = arguments[name + '_path']
        for artifact in artifacts:
            artifact.uri = os.path.join(base_artifact_path,
                                        artifact.split)  # Default split is ''

    print('component instance: ' + str(component_class_instance))

    #executor = component_class.EXECUTOR_SPEC.executor_class() # Same
    executor = component_class_instance.executor_spec.executor_class()
    executor.Do(
        input_dict=input_dict,
        output_dict=output_dict,
        exec_properties=exec_properties,
    )
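
A minimal preprocessing_fn matching the signature documented above. This is a sketch only: it assumes tensorflow_transform is available and uses the hypothetical feature names 'x' and 'label':

import tensorflow_transform as tft

def preprocessing_fn(inputs):
    """Scales a numeric feature to z-scores and forwards the label unchanged."""
    return {
        'x_scaled': tft.scale_to_z_score(inputs['x']),
        'label': inputs['label'],
    }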
Example No. 22
def ImportExampleGen(
    input_base_path: InputPath('ExternalPath'),
    #input_path: InputPath('ExternalPath'),
    example_artifacts_path: OutputPath('Examples'),
    input_config: 'JsonObject: example_gen_pb2.Input' = None,
    output_config: 'JsonObject: example_gen_pb2.Output' = None,
):
    """
    Official TFX ImportExampleGen component.

    The ImportExampleGen component takes TFRecord files in TF Example data
    format, and generates train and eval examples for downstream components.
    This component provides consistent and configurable partitioning, and it
    also shuffles the dataset per ML best practice.

    Args:
        input_base: A Channel of 'ExternalPath' type, which includes one artifact
            whose uri is an external directory with TFRecord files inside
            (required).
        #input: Forwards compatibility alias for the 'input_base' argument.
        input_config: An example_gen_pb2.Input instance, providing input
            configuration. If unset, the files under input_base will be treated as a
            single split.
        output_config: An example_gen_pb2.Output instance, providing output
            configuration. If unset, default splits will be 'train' and 'eval' with
            size 2:1.
    Returns:
        example_artifacts: Optional channel of 'ExamplesPath' for output train and
            eval examples.

    Raises:
        RuntimeError: Only one of query and input_config should be set.
    """
    from tfx.components.example_gen.import_example_gen.component import ImportExampleGen
    component_class = ImportExampleGen
    input_channels_with_splits = set()
    output_channels_with_splits = {'example_artifacts'}

    import json
    import os
    from google.protobuf import json_format, message
    from tfx.types import Artifact, channel_utils

    arguments = locals().copy()

    component_class_args = {}

    for name, execution_parameter in component_class.SPEC_CLASS.PARAMETERS.items(
    ):
        argument_value_obj = argument_value = arguments.get(name, None)
        if argument_value is None:
            continue
        parameter_type = execution_parameter.type
        if isinstance(parameter_type, type) and issubclass(
                parameter_type, message.Message
        ):  # execution_parameter.type can also be a tuple
            argument_value_obj = parameter_type()
            json_format.Parse(argument_value, argument_value_obj)
        component_class_args[name] = argument_value_obj

    for name, channel_parameter in component_class.SPEC_CLASS.INPUTS.items():
        artifact_path = arguments[name + '_path']
        artifacts = []
        if name in input_channels_with_splits:
            # Recovering splits
            splits = sorted(os.listdir(artifact_path))
            for split in splits:
                artifact = Artifact(type_name=channel_parameter.type_name)
                artifact.split = split
                artifact.uri = os.path.join(artifact_path, split) + '/'
                artifacts.append(artifact)
        else:
            artifact = Artifact(type_name=channel_parameter.type_name)
            artifact.uri = artifact_path + '/'  # ?
            artifacts.append(artifact)
        component_class_args[name] = channel_utils.as_channel(artifacts)

    component_class_instance = component_class(**component_class_args)

    input_dict = {
        name: channel.get()
        for name, channel in component_class_instance.inputs.get_all().items()
    }
    output_dict = {
        name: channel.get()
        for name, channel in
        component_class_instance.outputs.get_all().items()
    }
    exec_properties = component_class_instance.exec_properties

    # Generating paths for output artifacts
    for name, artifacts in output_dict.items():
        base_artifact_path = arguments[name + '_path']
        for artifact in artifacts:
            artifact.uri = os.path.join(base_artifact_path,
                                        artifact.split)  # Default split is ''

    print('component instance: ' + str(component_class_instance))

    #executor = component_class.EXECUTOR_SPEC.executor_class() # Same
    executor = component_class_instance.executor_spec.executor_class()
    executor.Do(
        input_dict=input_dict,
        output_dict=output_dict,
        exec_properties=exec_properties,
    )
Example No. 23
def _MarkNotPushed(self, model_push: types.Artifact):
    model_push.set_int_custom_property('pushed', 0)
Example No. 24
def Trainer(
    examples_path: InputPath('Examples'),
    transform_output_path: InputPath('TransformGraph'), # ? = None
    #transform_graph_path: InputPath('TransformGraph'),
    schema_path: InputPath('Schema'),

    output_path: OutputPath('Model'),

    module_file: str = None,
    trainer_fn: str = None,
    train_args: 'JsonObject: tfx.proto.trainer_pb2.TrainArgs' = None,
    eval_args: 'JsonObject: tfx.proto.trainer_pb2.EvalArgs' = None,
    #custom_config: dict = None,
    #custom_executor_spec: Optional[executor_spec.ExecutorSpec] = None,
):
    """
    A TFX component to train a TensorFlow model.

    The Trainer component is used to train and evaluate a model using given
    inputs and a user-supplied estimator.  This component includes a custom
    driver that can optionally grab a previous model to warm-start from.

    ## Providing an estimator
    The TFX executor will use the estimator provided in the `module_file` file
    to train the model.  The Trainer executor will look specifically for the
    `trainer_fn()` function within that file.  Before training, the executor will
    call that function expecting the following returned as a dictionary:

        - estimator: The
        [estimator](https://www.tensorflow.org/api_docs/python/tf/estimator/Estimator)
        to be used by TensorFlow to train the model.
        - train_spec: The
        [configuration](https://www.tensorflow.org/api_docs/python/tf/estimator/TrainSpec)
        to be used by the "train" part of the TensorFlow `train_and_evaluate()`
        call.
        - eval_spec: The
        [configuration](https://www.tensorflow.org/api_docs/python/tf/estimator/EvalSpec)
        to be used by the "eval" part of the TensorFlow `train_and_evaluate()` call.
        - eval_input_receiver_fn: The
        [configuration](https://www.tensorflow.org/tfx/model_analysis/get_started#modify_an_existing_model)
        to be used
        by the [ModelValidator](https://www.tensorflow.org/tfx/guide/modelval)
        component when validating the model.

    An example of `trainer_fn()` can be found in the [user-supplied
    code](https://github.com/tensorflow/tfx/blob/master/tfx/examples/chicago_taxi_pipeline/taxi_utils.py)
    of the TFX Chicago Taxi pipeline example.


    Args:
      examples: A Channel of 'ExamplesPath' type, serving as the source of
        examples that are used in training (required). May be raw or
        transformed.
      transform_output: An optional Channel of 'TransformPath' type, serving as
        the input transform graph if present.
      #transform_graph: Forwards compatibility alias for the 'transform_output'
      #  argument.
      schema:  A Channel of 'SchemaPath' type, serving as the schema of training
        and eval data.
      module_file: A path to python module file containing UDF model definition.
        The module_file must implement a function named `trainer_fn` at its
        top level. The function must have the following signature.

        def trainer_fn(tf.contrib.training.HParams,
                       tensorflow_metadata.proto.v0.schema_pb2) -> Dict:
          ...

        where the returned Dict has the following key-values.
          'estimator': an instance of tf.estimator.Estimator
          'train_spec': an instance of tf.estimator.TrainSpec
          'eval_spec': an instance of tf.estimator.EvalSpec
          'eval_input_receiver_fn': an instance of tfma.export.EvalInputReceiver

        Exactly one of 'module_file' or 'trainer_fn' must be supplied.
      trainer_fn:  A python path to UDF model definition function. See
        'module_file' for the required signature of the UDF.
        Exactly one of 'module_file' or 'trainer_fn' must be supplied.
      train_args: A trainer_pb2.TrainArgs instance, containing args used for
        training. Currently only num_steps is available.
      eval_args: A trainer_pb2.EvalArgs instance, containing args used for eval.
        Currently only num_steps is available.
      #custom_config: A dict which contains the training job parameters to be
      #  passed to Google Cloud ML Engine.  For the full set of parameters
      #  supported by Google Cloud ML Engine, refer to
      #  https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#Job
      #custom_executor_spec: Optional custom executor spec.
    Returns:
      output: Optional 'ModelExportPath' channel for result of exported models.
    Raises:
      ValueError:
        - When both or neither of 'module_file' and 'trainer_fn' is supplied.
        - When both or neither of 'examples' and 'transformed_examples'
            is supplied.
        - When 'transformed_examples' is supplied but 'transform_output'
            is not supplied.
    """
    from tfx.components.trainer.component import Trainer
    component_class = Trainer
    input_channels_with_splits = {'examples'}
    output_channels_with_splits = set()


    import json
    import os
    from google.protobuf import json_format, message
    from tfx.types import Artifact, channel_utils

    arguments = locals().copy()

    component_class_args = {}

    for name, execution_parameter in component_class.SPEC_CLASS.PARAMETERS.items():
        argument_value_obj = argument_value = arguments.get(name, None)
        if argument_value is None:
            continue
        parameter_type = execution_parameter.type
        if isinstance(parameter_type, type) and issubclass(parameter_type, message.Message): # execution_parameter.type can also be a tuple
            argument_value_obj = parameter_type()
            json_format.Parse(argument_value, argument_value_obj)
        component_class_args[name] = argument_value_obj

    for name, channel_parameter in component_class.SPEC_CLASS.INPUTS.items():
        artifact_path = arguments[name + '_path']
        artifacts = []
        if name in input_channels_with_splits:
            # Recovering splits
            splits = sorted(os.listdir(artifact_path))
            for split in splits:
                artifact = Artifact(type_name=channel_parameter.type_name)
                artifact.split = split
                artifact.uri = os.path.join(artifact_path, split) + '/'
                artifacts.append(artifact)
        else:
            artifact = Artifact(type_name=channel_parameter.type_name)
            artifact.uri = artifact_path + '/' # ?
            artifacts.append(artifact)
        component_class_args[name] = channel_utils.as_channel(artifacts)

    component_class_instance = component_class(**component_class_args)

    input_dict = {name: channel.get() for name, channel in component_class_instance.inputs.get_all().items()}
    output_dict = {name: channel.get() for name, channel in component_class_instance.outputs.get_all().items()}
    exec_properties = component_class_instance.exec_properties

    # Generating paths for output artifacts
    for name, artifacts in output_dict.items():
        base_artifact_path = arguments[name + '_path']
        for artifact in artifacts:
            artifact.uri = os.path.join(base_artifact_path, artifact.split) # Default split is ''

    print('component instance: ' + str(component_class_instance))

    #executor = component_class.EXECUTOR_SPEC.executor_class() # Same
    executor = component_class_instance.executor_spec.executor_class()
    executor.Do(
        input_dict=input_dict,
        output_dict=output_dict,
        exec_properties=exec_properties,
    )
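
A skeleton trainer_fn returning the four keys the docstring requires. This is a sketch under stated assumptions: TF 1.x estimator APIs, a hypothetical numeric feature 'x', and user-supplied helpers _input_fn and _eval_input_receiver_fn (the latter returning a tfma.export.EvalInputReceiver):

import tensorflow as tf

def trainer_fn(hparams, schema):
    """Skeleton of the documented contract; model details are stand-ins."""
    estimator = tf.estimator.LinearClassifier(
        feature_columns=[tf.feature_column.numeric_column('x')])
    return {
        'estimator': estimator,
        'train_spec': tf.estimator.TrainSpec(
            input_fn=lambda: _input_fn(hparams, schema, 'train'),
            max_steps=1000),
        'eval_spec': tf.estimator.EvalSpec(
            input_fn=lambda: _input_fn(hparams, schema, 'eval')),
        # Hypothetical helper; must return a tfma.export.EvalInputReceiver.
        'eval_input_receiver_fn': _eval_input_receiver_fn,
    }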