Example #1
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]):
        self._log_startup(input_dict, output_dict, exec_properties)

        schema = io_utils.SchemaReader().read(
            io_utils.get_only_uri_in_dir(
                artifact_utils.get_single_uri(input_dict[SCHEMA_KEY])))

        groups = group_stats_and_examples(input_dict)
        for examples, datasets in groups:
            datasets = DatasetFeatureStatisticsList(
                datasets=list(datasets.values()))
            partitions = lists_to_partitions(
                datasets, schema, examples,
                partition_fn(datasets, schema, examples))

            for partition in partitions:
                output_uri = os.path.join(
                    artifact_utils.get_single_uri(output_dict[PARTITIONS_KEY]),
                    partition.name)
                io_utils.write_pbtxt_file(
                    os.path.join(output_uri, 'schema.pbtxt'), partition.schema)

                for i in range(0, len(partition.statistics.datasets)):
                    dataset = partition.statistics.datasets[i]
                    example_splits = partition.example_splits[i]

                    io_utils.write_tfrecord_file(
                        os.path.join(output_uri, example_splits.split,
                                     'stats_tfrecord'), dataset)
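
Every example on this page ends by handing a protobuf message to io_utils.write_pbtxt_file. As a point of reference, here is a minimal sketch of what such a helper does, assuming only the protobuf runtime and tensorflow_metadata; the function name and path below are illustrative, not the TFX implementation:

import os

from google.protobuf import text_format
from tensorflow_metadata.proto.v0 import schema_pb2


def write_pbtxt_file_sketch(file_name, proto):
    # Serialize the message to protobuf text format and write it to disk.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    with open(file_name, 'w') as f:
        f.write(text_format.MessageToString(proto))


# Usage mirroring the examples above: persist an (empty) Schema proto.
write_pbtxt_file_sketch('/tmp/schema/schema.pbtxt', schema_pb2.Schema())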
Example #2
    def Do(self, input_dict, output_dict, exec_properties):
        """TensorFlow ExampleValidator executor entrypoint.

    This validates the statistics on the 'eval' split against the schema.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - stats: A list of 'ExampleStatisticsPath' type which should contain
          split 'eval'. Stats on other splits are ignored.
        - schema: A list of 'SchemaPath' type which should contain a single
          schema artifact.
      output_dict: Output dict from key to a list of artifacts, including:
        - output: A list of 'ExampleValidationPath' artifact of size one. It
          will include a single pbtxt file which contains all anomalies found.
      exec_properties: A dict of execution properties. Not used yet.

    Returns:
      None
    """
        self._log_startup(input_dict, output_dict, exec_properties)

        tf.logging.info('Validating schema against the computed statistics.')
        schema = io_utils.SchemaReader().read(
            io_utils.get_only_uri_in_dir(
                types.get_single_uri(input_dict['schema'])))
        stats = tfdv.load_statistics(
            io_utils.get_only_uri_in_dir(
                types.get_split_uri(input_dict['stats'], 'eval')))
        output_uri = types.get_single_uri(output_dict['output'])
        anomalies = tfdv.validate_statistics(stats, schema)
        io_utils.write_pbtxt_file(os.path.join(output_uri, DEFAULT_FILE_NAME),
                                  anomalies)
        tf.logging.info(
            'Validation complete. Anomalies written to {}.'.format(output_uri))
Example #3
File: executor.py  Project: zvrr/tfx
    def _Validate(self, inputs: Dict[Text, Any], outputs: Dict[Text,
                                                               Any]) -> None:
        """Validate the inputs and put validate result into outputs.

      This is the implementation part of example validator executor. This is
      intended for using or extending the executor without artifact dependecy.

    Args:
      inputs: A dictionary of labeled input values, including:
        - labels.STATS: the feature statistics to validate
        - labels.SCHEMA: the schema to respect
        - (Optional) labels.ENVIRONMENT: if an environment is specified, only
          validate the feature statistics of the fields in that environment.
          Otherwise, validate all fields.
        - (Optional) labels.PREV_SPAN_FEATURE_STATISTICS: the feature
          statistics of a previous span.
        - (Optional) labels.PREV_VERSION_FEATURE_STATISTICS: the feature
          statistics of a previous version.
        - (Optional) labels.FEATURES_NEEDED: the features that need to be
          validated.
        - (Optional) labels.VALIDATION_CONFIG: the configuration of this
          validation.
        - (Optional) labels.EXTERNAL_CONFIG_VERSION: the version number of
          external config file.
      outputs: A dictionary of labeled output values, including:
          - labels.SCHEMA_DIFF_PATH: the path to write the schema diff to
    """
        schema = value_utils.GetSoleValue(inputs, labels.SCHEMA)
        stats = value_utils.GetSoleValue(inputs, labels.STATS)
        schema_diff_path = value_utils.GetSoleValue(outputs,
                                                    labels.SCHEMA_DIFF_PATH)
        anomalies = tfdv.validate_statistics(stats, schema)
        io_utils.write_pbtxt_file(
            os.path.join(schema_diff_path, DEFAULT_FILE_NAME), anomalies)
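
The anomalies pbtxt written by the validator examples can be read back with the standard protobuf text-format parser. A hedged sketch of inspecting such an output follows; the path is an assumption, and DEFAULT_FILE_NAME is whatever the executor module defines:

from google.protobuf import text_format
from tensorflow_metadata.proto.v0 import anomalies_pb2

anomalies = anomalies_pb2.Anomalies()
with open('/tmp/example_validation/anomalies.pbtxt') as f:
    text_format.Parse(f.read(), anomalies)

# anomaly_info maps each feature name to the anomaly found for it.
for feature_name, info in anomalies.anomaly_info.items():
    print(feature_name, info.short_description)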
Example #4
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """TensorFlow SchemaGen executor entrypoint.

    This infers the schema using tensorflow_data_validation on the precomputed
    stats of 'train' split.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - 'stats': A list of 'ExampleStatistics' type which must contain
          split 'train'. Stats on other splits are ignored.
        - 'statistics': Synonym for 'stats'.
        - 'schema': A singleton list of 'Schema' type. If provided, pass
          it through as the output as fixed schema. If not provided, infer
          schema from stats.
        If both or neither of 'stats'/'statistics' and 'schema' are provided,
        an error is raised.
      output_dict: Output dict from key to a list of artifacts, including:
        - output: A list of 'Schema' artifact of size one.
      exec_properties: A dict of execution properties, includes:
        - infer_feature_shape: Whether or not to infer the shape of the feature.

    Returns:
      None
    """
        output_uri = os.path.join(
            artifact_utils.get_single_uri(output_dict['output']),
            _DEFAULT_FILE_NAME)

        # Materialize the schema as an output artifact of SchemaGen so that its
        # metadata is logged in the same way whether it was inferred or fixed.
        io_utils.write_pbtxt_file(
            output_uri, self._provide_schema(input_dict, exec_properties))
        absl.logging.info('Schema written to {}.'.format(output_uri))
Example #5
  def Do(self, input_dict,
         output_dict,
         exec_properties):
    """TensorFlow SchemaGen executor entrypoint.

    This infers the schema using tensorflow_data_validation on the precomputed
    stats of 'train' split.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - stats: A list of 'ExampleStatisticsPath' type which should contain
          split 'train'. Stats on other splits are ignored.
      output_dict: Output dict from key to a list of artifacts, including:
        - output: A list of 'SchemaPath' artifact of size one.
      exec_properties: A dict of execution properties. Not used yet.

    Returns:
      None
    """
    # TODO(zhitaoli): Move constants between this file and component.py to a
    # constants.py.
    train_stats_uri = io_utils.get_only_uri_in_dir(
        types.get_split_uri(input_dict['stats'], 'train'))
    output_uri = os.path.join(
        types.get_single_uri(output_dict['output']), _DEFAULT_FILE_NAME)

    infer_feature_shape = False
    tf.logging.info('Inferring schema from statistics.')
    schema = tfdv.infer_schema(
        tfdv.load_statistics(train_stats_uri), infer_feature_shape)
    io_utils.write_pbtxt_file(output_uri, schema)
    tf.logging.info('Schema written to {}.'.format(output_uri))
Example #6
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """TensorFlow SchemaGen executor entrypoint.

    This infers the schema using tensorflow_data_validation on the precomputed
    stats of 'train' split.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - 'stats': A list of 'ExampleStatistics' type which must contain
          split 'train'. Stats on other splits are ignored.
        - 'statistics': Synonym for 'stats'.
      output_dict: Output dict from key to a list of artifacts, including:
        - output: A list of 'Schema' artifact of size one.
      exec_properties: A dict of execution properties, includes:
        - infer_feature_shape: Whether or not to infer the shape of the feature.

    Returns:
      None
    """
        # TODO(zhitaoli): Move constants between this file and component.py to a
        # constants.py.
        train_stats_uri = io_utils.get_only_uri_in_dir(
            artifact_utils.get_split_uri(input_dict['stats'], 'train'))
        output_uri = os.path.join(
            artifact_utils.get_single_uri(output_dict['output']),
            _DEFAULT_FILE_NAME)

        infer_feature_shape = exec_properties['infer_feature_shape']
        absl.logging.info('Inferring schema from statistics.')
        schema = tfdv.infer_schema(tfdv.load_statistics(train_stats_uri),
                                   infer_feature_shape)
        io_utils.write_pbtxt_file(output_uri, schema)
        absl.logging.info('Schema written to %s.' % output_uri)
Example #7
def create_sample_pipeline(m: metadata.Metadata,
                           pipeline_id: str,
                           run_num: int,
                           export_ir_path: str = '',
                           external_ir_file: str = '',
                           deployment_config: Optional[message.Message] = None,
                           execute_nodes_func: Callable[
                               [metadata.Metadata, pipeline_pb2.Pipeline, int],
                               None] = _execute_nodes):
  """Creates a list of pipeline and node execution."""
  ir_path = _get_ir_path(external_ir_file)
  for i in range(run_num):
    run_id = 'run%02d' % i
    pipeline = _test_pipeline(ir_path, pipeline_id, run_id, deployment_config)
    if export_ir_path:
      output_path = os.path.join(export_ir_path,
                                 '%s_%s.pbtxt' % (pipeline_id, run_id))
      io_utils.write_pbtxt_file(output_path, pipeline)
    pipeline_state = pipeline_ops.initiate_pipeline_start(m, pipeline)
    if not external_ir_file:
      execute_nodes_func(m, pipeline, i)
    if i < run_num - 1:
      with pipeline_state:
        pipeline_state.set_pipeline_execution_state(
            metadata_store_pb2.Execution.COMPLETE)
Example #8
def annotate_schema(
    ignore_features: Parameter[str],
    original_schema: InputArtifact[standard_artifacts.Schema],
    schema: OutputArtifact[standard_artifacts.Schema],
) -> None:  # pytype: disable=invalid-annotation,wrong-arg-types
    r"""Updates a schema with additional metadata.

  Args:
    ignore_features: Newline ('\n') separated list of features to mark as
      disabled in the output schema.
    original_schema: The Schema artifact to modify.
    schema: The output Schema with updates.
  """

    schema_file = io_utils.get_only_uri_in_dir(original_schema.uri)
    dataset_schema = schema_pb2.Schema()
    io_utils.parse_pbtxt_file(schema_file, dataset_schema)

    ignore_features = ignore_features.split("\n")
    for feature in dataset_schema.feature:
        if feature.name in ignore_features:
            logging.info("Marking '%s' as DISABLED.", feature.name)
            feature.lifecycle_stage = schema_pb2.LifecycleStage.DISABLED

    io_utils.write_pbtxt_file(os.path.join(schema.uri, "schema.txt"),
                              dataset_schema)
Example #9
  def Do(self, input_dict: Dict[Text, List[types.Artifact]],
         output_dict: Dict[Text, List[types.Artifact]],
         exec_properties: Dict[Text, Any]) -> None:
    """TensorFlow SchemaGen executor entrypoint.

    This infers the schema using tensorflow_data_validation on the precomputed
    stats of 'train' split.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - 'stats': A list of 'ExampleStatistics' type which must contain
          split 'train'. Stats on other splits are ignored.
        - 'statistics': Synonym for 'stats'.
      output_dict: Output dict from key to a list of artifacts, including:
        - output: A list of 'Schema' artifact of size one.
      exec_properties: A dict of execution properties, includes:
        - infer_feature_shape: Whether or not to infer the shape of the feature.
        - exclude_splits: Names of splits that will not be taken into
          consideration when auto-generating a schema.

    Returns:
      None
    """
    # TODO(zhitaoli): Move constants between this file and component.py to a
    # constants.py.
    infer_feature_shape = exec_properties.get(INFER_FEATURE_SHAPE_KEY)

    # Load and deserialize exclude splits from execution properties.
    exclude_splits = json_utils.loads(
        exec_properties.get(EXCLUDE_SPLITS_KEY, 'null')) or []
    if not isinstance(exclude_splits, list):
      raise ValueError('exclude_splits in execution properties needs to be a '
                       'list. Got %s instead.' % type(exclude_splits))

    # Only one schema is generated for all splits.
    schema = None
    stats_artifact = artifact_utils.get_single_instance(
        input_dict[STATISTICS_KEY])
    for split in artifact_utils.decode_split_names(stats_artifact.split_names):
      if split in exclude_splits:
        continue

      logging.info('Processing schema from statistics for split %s.', split)
      stats_uri = io_utils.get_only_uri_in_dir(
          os.path.join(stats_artifact.uri, split))
      if not schema:
        schema = tfdv.infer_schema(
            tfdv.load_statistics(stats_uri), infer_feature_shape)
      else:
        schema = tfdv.update_schema(schema, tfdv.load_statistics(stats_uri),
                                    infer_feature_shape)

    output_uri = os.path.join(
        artifact_utils.get_single_uri(output_dict[SCHEMA_KEY]),
        _DEFAULT_FILE_NAME)
    io_utils.write_pbtxt_file(output_uri, schema)
    logging.info('Schema written to %s.', output_uri)
Example #10
def create_sample_pipeline(m: metadata.Metadata,
                           pipeline_id: str,
                           run_num: int,
                           export_ir_path: str = ''):
    """Creates a list of pipeline and node execution."""
    for i in range(run_num):
        run_id = 'run%02d' % i
        pipeline = _test_pipeline(pipeline_id, run_id)
        if export_ir_path:
            output_path = os.path.join(export_ir_path,
                                       '%s_%s.pbtxt' % (pipeline_id, run_id))
            io_utils.write_pbtxt_file(output_path, pipeline)
        pipeline_state = pipeline_ops.initiate_pipeline_start(m, pipeline)
        _execute_nodes(m, pipeline, i)
        if i < run_num - 1:
            with pipeline_state:
                pipeline_state.execution.last_known_state = (
                    metadata_store_pb2.Execution.COMPLETE)
Example #11
    def setUp(self):
        super(ExecutorTest, self).setUp()
        self._source_data_dir = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), 'testdata')
        self._output_data_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)

        io_utils.write_pbtxt_file(
            os.path.join(self._output_data_dir, 'schema', 'schema.pbtxt'),
            schema_pb2.Schema())

        self._context = executor.Executor.Context(
            tmp_dir=self._output_data_dir, unique_id='1')

        # Create input dict.
        examples = standard_artifacts.Examples()
        examples.uri = 'path'
        examples.split_names = artifact_utils.encode_split_names(
            ['train', 'eval'])
        schema = standard_artifacts.Schema()
        schema.uri = os.path.join(self._output_data_dir, 'schema')

        self._input_dict = {
            'examples': [examples],
            'schema': [schema],
        }

        # Create output dict.
        model = standard_artifacts.Model()
        model.uri = os.path.join(self._output_data_dir, 'model')
        self._best_hparams = standard_artifacts.Model()
        self._best_hparams.uri = os.path.join(self._output_data_dir,
                                              'best_hparams')

        self._output_dict = {
            'model': [model],
            'study_best_hparams_path': [self._best_hparams],
        }
Example #12
File: executor.py  Project: luvneries/tfx
  def Do(self, input_dict,
         output_dict,
         exec_properties):
    """TensorFlow ExampleValidator executor entrypoint.

    This validates the statistics on the 'eval' split against the schema.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - stats: A list of 'ExampleStatisticsPath' type which should contain
          split 'eval'. Stats on other splits are ignored.
        - schema: A list of 'SchemaPath' type which should contain a single
          schema artifact.
      output_dict: Output dict from key to a list of artifacts, including:
        - output: A list of 'ExampleValidationPath' artifact of size one. It
          will include a single pbtxt file which contains all anomalies found.
      exec_properties: A dict of execution properties. Not used yet.

    Returns:
      None
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    tf.logging.info('Validating schema against the computed statistics.')
    schema = io_utils.SchemaReader().read(
        io_utils.get_only_uri_in_dir(
            types.get_single_uri(input_dict['schema'])))
    stats = tfdv.load_statistics(
        io_utils.get_only_uri_in_dir(
            types.get_split_uri(input_dict['stats'], 'eval')))
    output_uri = types.get_single_uri(output_dict['output'])
    anomalies = tfdv.validate_statistics(stats, schema)
    io_utils.write_pbtxt_file(
        os.path.join(output_uri, DEFAULT_FILE_NAME), anomalies)
    tf.logging.info(
        'Validation complete. Anomalies written to {}.'.format(output_uri))
Example #13
    def Do(self, input_dict: Dict[str, List[types.Artifact]],
           output_dict: Dict[str, List[types.Artifact]],
           exec_properties: Dict[str, Any]) -> None:
        """TensorFlow SchemaGen executor entrypoint.

    This infers the schema using tensorflow_data_validation on the precomputed
    stats of 'train' split.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - 'statistics': A list of 'ExampleStatistics' type which must contain
          split 'train'.
      output_dict: Output dict from key to a list of artifacts, including:
        - schema: A list of 'Schema' artifact of size one.
      exec_properties: A dict of execution properties, includes:
        - infer_feature_shape: Whether or not to infer the shape of the feature.
        - exclude_splits: Names of splits that will not be taken into
          consideration when auto-generating a schema.

    Returns:
      None
    """
        infer_feature_shape = bool(
            exec_properties.get(
                standard_component_specs.INFER_FEATURE_SHAPE_KEY, True))

        # Load and deserialize exclude splits from execution properties.
        exclude_splits = json_utils.loads(
            exec_properties.get(standard_component_specs.EXCLUDE_SPLITS_KEY,
                                'null')) or []
        if not isinstance(exclude_splits, list):
            raise ValueError(
                'exclude_splits in execution properties needs to be a '
                'list. Got %s instead.' % type(exclude_splits))

        # Only one schema is generated for all splits.
        schema = None
        stats_artifact = artifact_utils.get_single_instance(
            input_dict[standard_component_specs.STATISTICS_KEY])
        for split in artifact_utils.decode_split_names(
                stats_artifact.split_names):
            if split in exclude_splits:
                continue

            logging.info('Processing schema from statistics for split %s.',
                         split)
            stats_uri = io_utils.get_only_uri_in_dir(
                artifact_utils.get_split_uri([stats_artifact], split))
            if artifact_utils.is_artifact_version_older_than(
                    stats_artifact,
                    artifact_utils._ARTIFACT_VERSION_FOR_STATS_UPDATE):  # pylint: disable=protected-access
                stats = tfdv.load_statistics(stats_uri)
            else:
                stats = tfdv.load_stats_binary(stats_uri)
            if not schema:
                schema = tfdv.infer_schema(stats, infer_feature_shape)
            else:
                schema = tfdv.update_schema(schema, stats, infer_feature_shape)

        output_uri = os.path.join(
            artifact_utils.get_single_uri(
                output_dict[standard_component_specs.SCHEMA_KEY]),
            DEFAULT_FILE_NAME)
        io_utils.write_pbtxt_file(output_uri, schema)
        logging.info('Schema written to %s.', output_uri)
Example #14
def _get_ir_path(external_ir_file: str):
  if external_ir_file:
    return external_ir_file
  ir_file_path = tempfile.mktemp(suffix='.pbtxt')
  io_utils.write_pbtxt_file(ir_file_path, test_sync_pipeline.create_pipeline())
  return ir_file_path
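
tempfile.mktemp in the helper above is deprecated: the returned name can be claimed by another process before the file is created. Below is a hedged alternative sketch (the helper name is illustrative) that reserves the file atomically before handing it to io_utils.write_pbtxt_file:

import os
import tempfile

from tfx.utils import io_utils


def _write_temp_pbtxt(proto, suffix='.pbtxt'):
    # mkstemp creates the file atomically instead of only reserving a name,
    # avoiding the race inherent in tempfile.mktemp.
    fd, path = tempfile.mkstemp(suffix=suffix)
    os.close(fd)
    io_utils.write_pbtxt_file(path, proto)
    return path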