예제 #1
0
 def test_raise_if_read_as_raw_but_raw_column_name_not_provided(self):
     """Reading raw records without a raw record column name must fail."""
     artifact = standard_artifacts.Examples()
     expected_error = 'must provide raw_record_column_name'
     with self.assertRaisesRegex(AssertionError, expected_error):
         # Building the factory and invoking it both stay inside the context
         # manager, so the assertion may fire at either step.
         factory = tfxio_utils.get_tfxio_factory_from_artifact(
             artifact, _TELEMETRY_DESCRIPTORS, read_as_raw_records=True)
         factory(_FAKE_FILE_PATTERN)
예제 #2
0
 def test_raise_if_data_view_uri_not_available(self):
     """FORMAT_PROTO examples with no DataView URI set must be rejected."""
     artifact = standard_artifacts.Examples()
     proto_format = example_gen_pb2.PayloadFormat.FORMAT_PROTO
     examples_utils.set_payload_format(artifact, proto_format)
     with self.assertRaisesRegex(AssertionError, 'requires a DataView'):
         # Building the factory and invoking it both stay inside the context
         # manager, so the assertion may fire at either step.
         factory = tfxio_utils.get_tfxio_factory_from_artifact(
             artifact, _TELEMETRY_DESCRIPTORS)
         factory(_FAKE_FILE_PATTERN)
예제 #3
0
  def test_get_tfxio_factory_from_artifact_data_view_legacy(self):
    """Checks the factory when the DataView create time is an int property.

    This tests FORMAT_PROTO with data view where the DATA_VIEW_CREATE_TIME_KEY
    is an int value. This is a legacy property type and should be string type
    in the future.
    """
    # Compare the major version numerically: a plain string comparison orders
    # lexicographically, so e.g. '10.0' < '2' would evaluate to True.
    if int(tf.__version__.split('.')[0]) < 2:
      self.skipTest('DataView is not supported under TF 1.x.')

    examples = standard_artifacts.Examples()
    examples_utils.set_payload_format(
        examples, example_gen_pb2.PayloadFormat.FORMAT_PROTO)
    data_view_uri = tempfile.mkdtemp(dir=self.get_temp_dir())
    tf_graph_record_decoder.save_decoder(_SimpleTfGraphRecordDecoder(),
                                         data_view_uri)
    examples.set_string_custom_property(constants.DATA_VIEW_URI_PROPERTY_KEY,
                                        data_view_uri)
    # Pass a real int so the legacy int-typed property is what gets exercised,
    # instead of relying on the setter coercing a string.
    examples.set_int_custom_property(constants.DATA_VIEW_CREATE_TIME_KEY, 1)
    tfxio_factory = tfxio_utils.get_tfxio_factory_from_artifact(
        [examples],
        _TELEMETRY_DESCRIPTORS,
        _SCHEMA,
        read_as_raw_records=False,
        raw_record_column_name=None)
    tfxio = tfxio_factory(_FAKE_FILE_PATTERN)
    self.assertIsInstance(tfxio, record_to_tensor_tfxio.TFRecordToTensorTFXIO)
    # We currently only create RecordBasedTFXIO and the check below relies on
    # that.
    self.assertIsInstance(tfxio, record_based_tfxio.RecordBasedTFXIO)
    self.assertEqual(tfxio.telemetry_descriptors, _TELEMETRY_DESCRIPTORS)
    # Since we provide a schema, ArrowSchema() should not raise.
    _ = tfxio.ArrowSchema()
예제 #4
0
 def test_get_tfxio_factory_from_artifact(self,
                                          payload_format,
                                          expected_tfxio_type,
                                          raw_record_column_name=None,
                                          provide_data_view_uri=False,
                                          read_as_raw_records=False):
     """Builds a TFXIO from an Examples artifact and checks its properties."""
     artifact = standard_artifacts.Examples()
     if payload_format is not None:
         examples_utils.set_payload_format(artifact, payload_format)
     if provide_data_view_uri:
         # Save a decoder into a fresh directory and point the artifact at it.
         decoder_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
         tf_graph_record_decoder.save_decoder(_SimpleTfGraphRecordDecoder(),
                                              decoder_dir)
         artifact.set_string_custom_property(
             constants.DATA_VIEW_URI_PROPERTY_KEY, decoder_dir)

     factory = tfxio_utils.get_tfxio_factory_from_artifact(
         artifact, _TELEMETRY_DESCRIPTORS, _SCHEMA, read_as_raw_records,
         raw_record_column_name)
     created = factory(_FAKE_FILE_PATTERN)

     self.assertIsInstance(created, expected_tfxio_type)
     # Only RecordBasedTFXIO instances are created at the moment; the
     # attribute checks below depend on that.
     self.assertIsInstance(created, record_based_tfxio.RecordBasedTFXIO)
     self.assertEqual(created.telemetry_descriptors, _TELEMETRY_DESCRIPTORS)
     self.assertEqual(created.raw_record_column_name, raw_record_column_name)
     # A schema was supplied, so ArrowSchema() is expected not to raise.
     _ = created.ArrowSchema()
예제 #5
0
    def ReadExamplesArtifact(self,
                             examples: types.Artifact,
                             num_examples: int,
                             split_name: Optional[Text] = None):
        """Reads records from one split of an Examples artifact.

        The files under the split directory are read as raw records through a
        TFXIO built for the artifact, and the resulting dataset is passed to
        `self._ReadFromDataset`. The artifact's payload format is stored on
        `self._payload_format`.

        Args:
          examples: `Examples` artifact.
          num_examples: Number of examples to read; used as the dataset batch
            size. If it exceeds the number of available examples, all examples
            would be read.
          split_name: Name of the split to read from the Examples artifact.
            Defaults to the first split listed in the artifact.

        Raises:
          RuntimeError: If records have already been read.
          ValueError: If `num_examples` is less than 1, the artifact lists no
            splits, `split_name` is not among them, or no file matches the
            split directory.
        """
        if self._records:
            raise RuntimeError('Cannot read records twice.')
        if num_examples < 1:
            raise ValueError('num_examples < 1 (got {})'.format(num_examples))

        splits = artifact_utils.decode_split_names(examples.split_names)
        if not splits:
            raise ValueError(
                'No split_name is available in given Examples artifact.')
        chosen_split = splits[0] if split_name is None else split_name
        if chosen_split not in splits:
            message = 'No split_name {}; available split names: {}'.format(
                chosen_split, ', '.join(splits))
            raise ValueError(message)

        # ExampleGen writes each split under its own sub-directory.
        pattern = os.path.join(examples.uri, chosen_split, '*')
        make_tfxio = tfxio_utils.get_tfxio_factory_from_artifact(
            examples=[examples],
            telemetry_descriptors=_TELEMETRY_DESCRIPTORS,
            schema=None,
            read_as_raw_records=True,
            raw_record_column_name=_RAW_RECORDS_COLUMN)

        # A NotFoundError from the glob is treated like an empty match.
        try:
            matched_files = fileio.glob(pattern)
        except tf.errors.NotFoundError:
            matched_files = []
        if not matched_files:
            raise ValueError(
                'Unable to find examples matching {}.'.format(pattern))

        self._payload_format = examples_utils.get_payload_format(examples)
        tfxio = make_tfxio(matched_files)
        read_options = dataset_options.TensorFlowDatasetOptions(
            batch_size=num_examples)
        self._ReadFromDataset(tfxio.TensorFlowDataset(read_options))
예제 #6
0
    def _run_model_inference(
        self,
        data_spec: bulk_inferrer_pb2.DataSpec,
        output_example_spec: bulk_inferrer_pb2.OutputExampleSpec,
        examples: List[types.Artifact],
        output_examples: Optional[types.Artifact],
        inference_result: Optional[types.Artifact],
        inference_endpoint: model_spec_pb2.InferenceSpecType,
    ) -> None:
        """Runs model inference over the selected splits of `examples`.

        For every selected split a Beam branch reads raw records and applies
        `_RunInference`. The per-split results are optionally written as output
        examples and optionally flattened into one prediction-log file set.

        Args:
          data_spec: bulk_inferrer_pb2.DataSpec instance. When its
            `example_splits` is non-empty only those splits are processed;
            otherwise every split is processed.
          output_example_spec: bulk_inferrer_pb2.OutputExampleSpec instance.
          examples: List of `standard_artifacts.Examples` artifacts.
          output_examples: Optional output `standard_artifacts.Examples`
            artifact.
          inference_result: Optional output `standard_artifacts.InferenceResult`
            artifact.
          inference_endpoint: Model inference endpoint.
        """
        # Map each selected split to its URI. If a split appears in several
        # artifacts, the entry from the later artifact overwrites the earlier.
        wanted_splits = data_spec.example_splits
        split_to_uri = {}
        for artifact in examples:
            for split in artifact_utils.decode_split_names(
                    artifact.split_names):
                if wanted_splits and split not in wanted_splits:
                    continue
                split_to_uri[split] = artifact_utils.get_split_uri(
                    [artifact], split)

        payload_format, _ = tfxio_utils.resolve_payload_format_and_data_view_uri(
            examples)

        # A raw record column name is required to build a raw-record TFXIO,
        # but the RecordBatches are never consumed here, so its value is
        # irrelevant.
        tfxio_factory = tfxio_utils.get_tfxio_factory_from_artifact(
            examples,
            _TELEMETRY_DESCRIPTORS,
            schema=None,
            read_as_raw_records=True,
            raw_record_column_name='unused')

        if output_examples:
            output_examples.split_names = artifact_utils.encode_split_names(
                sorted(split_to_uri))

        with self._make_beam_pipeline() as pipeline:
            per_split_results = []
            for split, split_uri in split_to_uri.items():
                tfxio = tfxio_factory([io_utils.all_files_pattern(split_uri)])
                assert isinstance(tfxio, record_based_tfxio.RecordBasedTFXIO), (
                    'Unable to use TFXIO {} as it does not support reading raw records.'
                    .format(type(tfxio)))
                # pylint: disable=no-value-for-parameter
                read_label = 'ReadData[{}]'.format(split)
                infer_label = 'RunInference[{}]'.format(split)
                predictions = (
                    pipeline
                    | read_label >> tfxio.RawRecordBeamSource()
                    | infer_label >> _RunInference(payload_format,
                                                   inference_endpoint))
                if output_examples:
                    split_output_uri = artifact_utils.get_split_uri(
                        [output_examples], split)
                    logging.info('Path of output examples split `%s` is %s.',
                                 split, split_output_uri)
                    write_label = 'WriteExamples[{}]'.format(split)
                    _ = (predictions
                         | write_label >> _WriteExamples(
                             output_example_spec, split_output_uri))
                    # pylint: enable=no-value-for-parameter

                per_split_results.append(predictions)

            if inference_result:
                logs_path = os.path.join(inference_result.uri,
                                         _PREDICTION_LOGS_FILE_NAME)
                _ = (
                    per_split_results
                    | 'FlattenInferenceResult' >> beam.Flatten(
                        pipeline=pipeline)
                    | 'WritePredictionLogs' >> beam.io.WriteToTFRecord(
                        logs_path,
                        file_name_suffix='.gz',
                        coder=beam.coders.ProtoCoder(
                            prediction_log_pb2.PredictionLog)))

        if output_examples:
            logging.info('Output examples written to %s.', output_examples.uri)
        if inference_result:
            logging.info('Inference result written to %s.',
                         inference_result.uri)
예제 #7
0
파일: executor.py 프로젝트: jay90099/tfx
  def Do(self, input_dict: Dict[str, List[types.Artifact]],
         output_dict: Dict[str, List[types.Artifact]],
         exec_properties: Dict[str, Any]) -> None:
    """Generates statistics for every non-excluded split of the input examples.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - examples: A list of type `standard_artifacts.Examples`. This should
          contain both 'train' and 'eval' split.
        - schema: Optionally, a list of type `standard_artifacts.Schema`. Must
          not be provided when the stats_options exec_property already carries
          a schema.
      output_dict: Output dict from output key to a list of Artifacts.
        - statistics: A list of type `standard_artifacts.ExampleStatistics`.
          This should contain both the 'train' and 'eval' splits.
      exec_properties: A dict of execution properties.
        - stats_options_json: Optionally, a JSON representation of StatsOptions.
          Must not contain a schema when a schema is provided as an input.
        - exclude_splits: JSON-serialized list of names of splits where
          statistics and sample should not be generated.

    Raises:
      ValueError: If exclude_splits does not deserialize to a list, or if a
        schema is provided both as an input and as part of the StatsOptions
        exec_property.

    Returns:
      None
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    # Deserialize the excluded splits; a missing or null value means "none".
    raw_exclude_splits = exec_properties.get(
        standard_component_specs.EXCLUDE_SPLITS_KEY, 'null')
    exclude_splits = json_utils.loads(raw_exclude_splits) or []
    if not isinstance(exclude_splits, list):
      raise ValueError('exclude_splits in execution properties needs to be a '
                       'list. Got %s instead.' % type(exclude_splits))

    # The output artifact advertises exactly the splits that are processed.
    examples = artifact_utils.get_single_instance(
        input_dict[standard_component_specs.EXAMPLES_KEY])
    kept_splits = [
        name
        for name in artifact_utils.decode_split_names(examples.split_names)
        if name not in exclude_splits
    ]
    statistics_artifact = artifact_utils.get_single_instance(
        output_dict[standard_component_specs.STATISTICS_KEY])
    statistics_artifact.split_names = artifact_utils.encode_split_names(
        kept_splits)

    stats_options_json = exec_properties.get(
        standard_component_specs.STATS_OPTIONS_JSON_KEY)
    if stats_options_json:
      # TODO(b/150802589): Move jsonable interface to tfx_bsl and use
      # json_utils
      stats_options = options.StatsOptions.from_json(stats_options_json)
    else:
      stats_options = options.StatsOptions()

    schema_artifacts = input_dict.get(standard_component_specs.SCHEMA_KEY)
    if schema_artifacts:
      if stats_options.schema:
        raise ValueError('A schema was provided as an input and the '
                         'stats_options exec_property also contains a schema '
                         'value. At most one of these may be set.')
      schema_file = io_utils.get_only_uri_in_dir(
          artifact_utils.get_single_uri(schema_artifacts))
      stats_options.schema = io_utils.SchemaReader().read(schema_file)

    # Build one TFXIO per processed split before the pipeline is opened.
    tfxio_factory = tfxio_utils.get_tfxio_factory_from_artifact(
        examples=[examples],
        telemetry_descriptors=_TELEMETRY_DESCRIPTORS)
    split_and_tfxio = []
    for split in kept_splits:
      split_uri = artifact_utils.get_split_uri([examples], split)
      split_and_tfxio.append(
          (split, tfxio_factory(io_utils.all_files_pattern(split_uri))))

    with self._make_beam_pipeline() as p:
      for split, tfxio in split_and_tfxio:
        logging.info('Generating statistics for split %s.', split)
        output_uri = artifact_utils.get_split_uri(
            output_dict[standard_component_specs.STATISTICS_KEY], split)
        output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
        _ = (
            p
            | 'TFXIORead[%s]' % split >> tfxio.BeamSource()
            | 'GenerateStatistics[%s]' % split >>
            stats_api.GenerateStatistics(stats_options)
            | 'WriteStatsOutput[%s]' % split >>
            stats_api.WriteStatisticsToBinaryFile(output_path))
        logging.info('Statistics for split %s written to %s.', split,
                     output_uri)
예제 #8
0
 def test_raise_if_not_example(self):
     """Passing a DataView artifact instead of Examples must be rejected."""
     not_examples = standard_artifacts.DataView()
     expected_error = 'must be of type standard_artifacts.Example'
     with self.assertRaisesRegex(AssertionError, expected_error):
         tfxio_utils.get_tfxio_factory_from_artifact(
             not_examples, _TELEMETRY_DESCRIPTORS)
예제 #9
0
파일: executor.py 프로젝트: tinally/tfx
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Runs a batch job to evaluate the eval_model against the given input.

        The model(s) are evaluated with TFMA inside a Beam pipeline and the
        results are written to the evaluation output URI. When the eval config
        yields metric thresholds, the blessing artifact is additionally
        populated and a blessed / not-blessed marker file is written.

        Args:
          input_dict: Input dict from input key to a list of Artifacts.
            - constants.EXAMPLES_KEY (required): examples for eval the model.
            - constants.MODEL_KEY (required): the candidate model; at most one
              artifact.
            - constants.BASELINE_MODEL_KEY (optional): the baseline model; at
              most one artifact.
            - constants.SCHEMA_KEY (optional): schema read and passed to the
              TFXIO factory.
          output_dict: Output dict from output key to a list of Artifacts.
            - constants.EVALUATION_KEY (required): model evaluation results.
            - constants.BLESSING_KEY: blessing artifact; only accessed when
              validation runs.
          exec_properties: A dict of execution properties.
            - eval_config: JSON string of tfma.EvalConfig.
            - feature_slicing_spec: JSON string of
              evaluator_pb2.FeatureSlicingSpec instance, providing the way to
              slice the data. Deprecated, use eval_config.slicing_specs
              instead. Only read when eval_config is unset.
            - example_splits: JSON-serialized list of names of splits on which
              the metrics are computed. Default behavior (when example_splits
              is set to None) is using the 'eval' split.
            - fairness_indicator_thresholds: optional thresholds; when set, a
              fairness indicators metrics callback is added.
            - current_component_id: optional; copied onto the blessing
              artifact as 'component_id'.

        Raises:
          ValueError: If a required key is missing, more than one candidate or
            baseline model is given, or the eval config has more than two
            model specs.
          AssertionError: If both eval_config and feature_slicing_spec are
            unset.

        Returns:
          None
        """
        # Validate the required inputs/outputs before doing any work.
        if constants.EXAMPLES_KEY not in input_dict:
            raise ValueError('EXAMPLES_KEY is missing from input dict.')
        if constants.MODEL_KEY not in input_dict:
            raise ValueError('MODEL_KEY is missing from input dict.')
        if constants.EVALUATION_KEY not in output_dict:
            raise ValueError('EVALUATION_KEY is missing from output dict.')
        if len(input_dict[constants.MODEL_KEY]) > 1:
            raise ValueError(
                'There can be only one candidate model, there are %d.' %
                (len(input_dict[constants.MODEL_KEY])))
        if constants.BASELINE_MODEL_KEY in input_dict and len(
                input_dict[constants.BASELINE_MODEL_KEY]) > 1:
            raise ValueError(
                'There can be only one baseline model, there are %d.' %
                (len(input_dict[constants.BASELINE_MODEL_KEY])))

        self._log_startup(input_dict, output_dict, exec_properties)

        # Add fairness indicator metric callback if necessary.
        fairness_indicator_thresholds = exec_properties.get(
            'fairness_indicator_thresholds', None)
        add_metrics_callbacks = None
        if fairness_indicator_thresholds:
            add_metrics_callbacks = [
                tfma.post_export_metrics.fairness_indicators(  # pytype: disable=module-attr
                    thresholds=fairness_indicator_thresholds),
            ]

        output_uri = artifact_utils.get_single_uri(
            output_dict[constants.EVALUATION_KEY])

        # A user-supplied 'custom_eval_shared_model' function takes precedence
        # over TFMA's default shared-model constructor.
        eval_shared_model_fn = udf_utils.try_get_fn(
            exec_properties=exec_properties,
            fn_name='custom_eval_shared_model'
        ) or tfma.default_eval_shared_model

        run_validation = False
        models = []
        # Two configuration paths: a tfma.EvalConfig (preferred) or the
        # feature_slicing_spec fallback handled in the else branch.
        if 'eval_config' in exec_properties and exec_properties['eval_config']:
            slice_spec = None
            has_baseline = bool(input_dict.get(constants.BASELINE_MODEL_KEY))
            eval_config = tfma.EvalConfig()
            json_format.Parse(exec_properties['eval_config'], eval_config)
            eval_config = tfma.update_eval_config_with_defaults(
                eval_config,
                maybe_add_baseline=has_baseline,
                maybe_remove_baseline=not has_baseline)
            tfma.verify_eval_config(eval_config)
            # Do not validate model when there is no thresholds configured. This is to
            # avoid accidentally blessing models when users forget to set thresholds.
            run_validation = bool(
                tfma.metrics.metric_thresholds_from_metrics_specs(
                    eval_config.metrics_specs))
            if len(eval_config.model_specs) > 2:
                raise ValueError(
                    """Cannot support more than two models. There are %d models in this
             eval_config.""" % (len(eval_config.model_specs)))
            # Extract model artifacts.
            for model_spec in eval_config.model_specs:
                if model_spec.is_baseline:
                    model_uri = artifact_utils.get_single_uri(
                        input_dict[constants.BASELINE_MODEL_KEY])
                else:
                    model_uri = artifact_utils.get_single_uri(
                        input_dict[constants.MODEL_KEY])
                # Estimator models are loaded from the eval model path; every
                # other model type uses the serving model path.
                if tfma.get_model_type(model_spec) == tfma.TF_ESTIMATOR:
                    model_path = path_utils.eval_model_path(model_uri)
                else:
                    model_path = path_utils.serving_model_path(model_uri)
                logging.info('Using %s as %s model.', model_path,
                             model_spec.name)
                models.append(
                    eval_shared_model_fn(
                        eval_saved_model_path=model_path,
                        model_name=model_spec.name,
                        eval_config=eval_config,
                        add_metrics_callbacks=add_metrics_callbacks))
        else:
            # Fallback path: no eval_config, so slicing comes from
            # feature_slicing_spec and a single eval model is loaded.
            # run_validation stays False here, so no blessing is produced.
            eval_config = None
            assert ('feature_slicing_spec' in exec_properties
                    and exec_properties['feature_slicing_spec']
                    ), 'both eval_config and feature_slicing_spec are unset.'
            feature_slicing_spec = evaluator_pb2.FeatureSlicingSpec()
            json_format.Parse(exec_properties['feature_slicing_spec'],
                              feature_slicing_spec)
            slice_spec = self._get_slice_spec_from_feature_slicing_spec(
                feature_slicing_spec)
            model_uri = artifact_utils.get_single_uri(
                input_dict[constants.MODEL_KEY])
            model_path = path_utils.eval_model_path(model_uri)
            logging.info('Using %s for model eval.', model_path)
            models.append(
                eval_shared_model_fn(
                    eval_saved_model_path=model_path,
                    model_name='',
                    eval_config=None,
                    add_metrics_callbacks=add_metrics_callbacks))

        # A single model is passed unwrapped; several are passed as a list.
        eval_shared_model = models[0] if len(models) == 1 else models
        schema = None
        if constants.SCHEMA_KEY in input_dict:
            schema = io_utils.SchemaReader().read(
                io_utils.get_only_uri_in_dir(
                    artifact_utils.get_single_uri(
                        input_dict[constants.SCHEMA_KEY])))

        # Load and deserialize example splits from execution properties.
        example_splits = json_utils.loads(
            exec_properties.get(constants.EXAMPLE_SPLITS_KEY, 'null'))
        if not example_splits:
            example_splits = ['eval']
            logging.info(
                "The 'example_splits' parameter is not set, using 'eval' "
                'split.')

        logging.info('Evaluating model.')
        with self._make_beam_pipeline() as pipeline:
            # One PCollection per split; they are flattened before evaluation.
            examples_list = []
            tensor_adapter_config = None
            # pylint: disable=expression-not-assigned
            if _USE_TFXIO and tfma.is_batched_input(eval_shared_model,
                                                    eval_config):
                # Batched input path: read each split through TFXIO.
                tfxio_factory = tfxio_utils.get_tfxio_factory_from_artifact(
                    examples=[
                        artifact_utils.get_single_instance(
                            input_dict[constants.EXAMPLES_KEY])
                    ],
                    telemetry_descriptors=_TELEMETRY_DESCRIPTORS,
                    schema=schema,
                    raw_record_column_name=tfma_constants.ARROW_INPUT_COLUMN)
                # TODO(b/161935932): refactor after TFXIO supports multiple patterns.
                for split in example_splits:
                    file_pattern = io_utils.all_files_pattern(
                        artifact_utils.get_split_uri(
                            input_dict[constants.EXAMPLES_KEY], split))
                    tfxio = tfxio_factory(file_pattern)
                    data = (pipeline
                            | 'ReadFromTFRecordToArrow[%s]' % split >>
                            tfxio.BeamSource())
                    examples_list.append(data)
                if schema is not None:
                    # Use last tfxio as TensorRepresentations and ArrowSchema are fixed.
                    # `tfxio` is the loop variable left over from the last
                    # split; example_splits is never empty at this point.
                    tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
                        arrow_schema=tfxio.ArrowSchema(),
                        tensor_representations=tfxio.TensorRepresentations())
            else:
                # Non-batched path: read the TFRecord files of each split
                # directly with Beam.
                for split in example_splits:
                    file_pattern = io_utils.all_files_pattern(
                        artifact_utils.get_split_uri(
                            input_dict[constants.EXAMPLES_KEY], split))
                    data = (
                        pipeline
                        | 'ReadFromTFRecord[%s]' % split >>
                        beam.io.ReadFromTFRecord(file_pattern=file_pattern))
                    examples_list.append(data)

            # Optional user-supplied extractors; None is passed through when
            # no 'custom_extractors' function is found.
            custom_extractors = udf_utils.try_get_fn(
                exec_properties=exec_properties, fn_name='custom_extractors')
            extractors = None
            if custom_extractors:
                extractors = custom_extractors(
                    eval_shared_model=eval_shared_model,
                    eval_config=eval_config,
                    tensor_adapter_config=tensor_adapter_config)

            # NOTE(review): the eval_shared_model argument below repeats the
            # expression already stored in `eval_shared_model` above.
            (examples_list | 'FlattenExamples' >> beam.Flatten()
             | 'ExtractEvaluateAndWriteResults' >>
             tfma.ExtractEvaluateAndWriteResults(
                 eval_shared_model=models[0] if len(models) == 1 else models,
                 eval_config=eval_config,
                 extractors=extractors,
                 output_path=output_uri,
                 slice_spec=slice_spec,
                 tensor_adapter_config=tensor_adapter_config))
        logging.info('Evaluation complete. Results written to %s.', output_uri)

        # Without thresholds there is nothing to validate, so the blessing
        # artifact is left untouched.
        if not run_validation:
            # TODO(jinhuang): delete the BLESSING_KEY from output_dict when supported.
            logging.info('No threshold configured, will not validate model.')
            return
        # Set up blessing artifact
        blessing = artifact_utils.get_single_instance(
            output_dict[constants.BLESSING_KEY])
        blessing.set_string_custom_property(
            constants.ARTIFACT_PROPERTY_CURRENT_MODEL_URI_KEY,
            artifact_utils.get_single_uri(input_dict[constants.MODEL_KEY]))
        blessing.set_int_custom_property(
            constants.ARTIFACT_PROPERTY_CURRENT_MODEL_ID_KEY,
            input_dict[constants.MODEL_KEY][0].id)
        # Record the baseline model's URI and id when one was provided.
        if input_dict.get(constants.BASELINE_MODEL_KEY):
            baseline_model = input_dict[constants.BASELINE_MODEL_KEY][0]
            blessing.set_string_custom_property(
                constants.ARTIFACT_PROPERTY_BASELINE_MODEL_URI_KEY,
                baseline_model.uri)
            blessing.set_int_custom_property(
                constants.ARTIFACT_PROPERTY_BASELINE_MODEL_ID_KEY,
                baseline_model.id)
        if 'current_component_id' in exec_properties:
            blessing.set_string_custom_property(
                'component_id', exec_properties['current_component_id'])
        # Check validation result and write BLESSED file accordingly.
        logging.info('Checking validation results.')
        validation_result = tfma.load_validation_result(output_uri)
        # The marker is an empty file whose name encodes the outcome; the same
        # outcome is stored as an int custom property on the artifact.
        if validation_result.validation_ok:
            io_utils.write_string_file(
                os.path.join(blessing.uri, constants.BLESSED_FILE_NAME), '')
            blessing.set_int_custom_property(
                constants.ARTIFACT_PROPERTY_BLESSED_KEY,
                constants.BLESSED_VALUE)
        else:
            io_utils.write_string_file(
                os.path.join(blessing.uri, constants.NOT_BLESSED_FILE_NAME),
                '')
            blessing.set_int_custom_property(
                constants.ARTIFACT_PROPERTY_BLESSED_KEY,
                constants.NOT_BLESSED_VALUE)
        logging.info('Blessing result %s written to %s.',
                     validation_result.validation_ok, blessing.uri)
예제 #10
0
파일: executor.py 프로젝트: Mistobaan/tfx
  def Do(self, input_dict: Dict[Text, List[types.Artifact]],
         output_dict: Dict[Text, List[types.Artifact]],
         exec_properties: Dict[Text, Any]) -> None:
    """Generates statistics for every split of every input examples artifact.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - input_data: A list of type `standard_artifacts.Examples`. This should
          contain both 'train' and 'eval' split.
        - schema: Optionally, a list of type `standard_artifacts.Schema`. Must
          not be provided when the stats_options exec_property already carries
          a schema.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: A list of type `standard_artifacts.ExampleStatistics`. This
          should contain both the 'train' and 'eval' splits.
      exec_properties: A dict of execution properties.
        - stats_options_json: Optionally, a JSON representation of StatsOptions.
          Must not contain a schema when a schema is provided as an input.

    Raises:
      ValueError when a schema is provided both as an input and as part of the
      StatsOptions exec_property.

    Returns:
      None
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    # A missing key and an empty value both fall back to default options.
    stats_options_json = exec_properties.get(STATS_OPTIONS_JSON_KEY)
    if stats_options_json:
      # TODO(b/150802589): Move jsonable interface to tfx_bsl and use
      # json_utils
      stats_options = options.StatsOptions.from_json(stats_options_json)
    else:
      stats_options = options.StatsOptions()

    schema_artifacts = input_dict.get(SCHEMA_KEY)
    if schema_artifacts:
      if stats_options.schema:
        raise ValueError('A schema was provided as an input and the '
                         'stats_options exec_property also contains a schema '
                         'value. At most one of these may be set.')
      schema_file = io_utils.get_only_uri_in_dir(
          artifact_utils.get_single_uri(schema_artifacts))
      stats_options.schema = io_utils.SchemaReader().read(schema_file)

    # Build one TFXIO per (artifact, split) before the pipeline is opened.
    split_and_tfxio = []
    for artifact in input_dict[EXAMPLES_KEY]:
      tfxio_factory = tfxio_utils.get_tfxio_factory_from_artifact(
          examples=artifact, telemetry_descriptors=_TELEMETRY_DESCRIPTORS)
      for split in artifact_utils.decode_split_names(artifact.split_names):
        split_dir = os.path.join(artifact.uri, split)
        source = tfxio_factory(io_utils.all_files_pattern(split_dir))
        split_and_tfxio.append((split, source))

    with self._make_beam_pipeline() as p:
      for split, tfxio in split_and_tfxio:
        absl.logging.info('Generating statistics for split {}'.format(split))
        output_uri = artifact_utils.get_split_uri(output_dict[STATISTICS_KEY],
                                                  split)
        output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
        _ = (
            p
            | 'TFXIORead[{}]'.format(split) >> tfxio.BeamSource()
            | 'GenerateStatistics[{}]'.format(split) >>
            stats_api.GenerateStatistics(stats_options)
            | 'WriteStatsOutput[{}]'.format(split) >>
            stats_api.WriteStatisticsToTFRecord(output_path))
        absl.logging.info('Statistics for split {} written to {}.'.format(
            split, output_uri))
예제 #11
0
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Evaluates the configured example splits with TFMA.

        Args:
            input_dict: Input artifacts keyed by name. `constants.EXAMPLES`
                is required; `constants.SCHEMA` and `constants.MODEL` are
                used when present.
            output_dict: Output artifacts keyed by name.
                `constants.EVALUATION` is required; its single URI is where
                the results are written.
            exec_properties: Execution properties. `StepKeys.SOURCE` is the
                source path of the evaluator step class and `StepKeys.ARGS`
                holds the keyword arguments used to instantiate it.

        Raises:
            ValueError: If the examples input or the evaluation output is
                missing.
            AssertionError: If listing the examples URI yields nothing.
        """
        # Guard: the examples input must exist and point at some data.
        if constants.EXAMPLES not in input_dict:
            raise ValueError(f'{constants.EXAMPLES} is missing from inputs')
        examples = input_dict[constants.EXAMPLES]

        examples_uri = artifact_utils.get_single_uri(examples)
        if len(zenml_path_utils.list_dir(examples_uri)) == 0:
            raise AssertionError(
                'ZenML can not run the evaluation as the provided input '
                'configuration does not point towards any data. Specifically, '
                'if you are using the agnostic evaluator, please make sure '
                'that you are using a proper test_fn in your trainer step to '
                'write these results.')

        # Guard: the evaluation output must exist.
        if constants.EVALUATION not in output_dict:
            raise ValueError(
                f'{constants.EVALUATION} is missing from outputs')
        output_uri = artifact_utils.get_single_uri(
            output_dict[constants.EVALUATION])

        # Read the schema when one was supplied; otherwise it stays None.
        schema = None
        if constants.SCHEMA in input_dict:
            schema_dir = artifact_utils.get_single_uri(
                input_dict[constants.SCHEMA])
            schema = io_utils.SchemaReader().read(
                io_utils.get_only_uri_in_dir(schema_dir))

        # Instantiate the evaluator step from its source path and arguments.
        step_source = exec_properties[StepKeys.SOURCE]
        step_kwargs = exec_properties[StepKeys.ARGS]
        step_class = source_utils.load_source_path_class(step_source)
        evaluator_step: BaseEvaluatorStep = step_class(**step_kwargs)

        # Build the eval config, fill in defaults and validate it.
        raw_config = evaluator_step.build_config()
        eval_config = tfma.update_eval_config_with_defaults(raw_config)
        tfma.verify_eval_config(eval_config)

        # Build the shared model only when a model artifact was supplied.
        eval_shared_model = None
        if constants.MODEL in input_dict:
            model_uri = artifact_utils.get_single_uri(
                input_dict[constants.MODEL])
            model_path = path_utils.serving_model_path(model_uri)

            # A step-provided factory wins over the TFMA default.
            shared_model_fn = try_get_fn(evaluator_step.CUSTOM_MODULE,
                                         'custom_eval_shared_model')
            if not shared_model_fn:
                shared_model_fn = tfma.default_eval_shared_model

            eval_shared_model = shared_model_fn(
                model_name='',  # TODO: Fix with model names
                eval_saved_model_path=model_path,
                eval_config=eval_config)

        self._log_startup(input_dict, output_dict, exec_properties)

        logging.info('Evaluating model.')
        with self._make_beam_pipeline() as pipeline:
            per_split_records = []
            tensor_adapter_config = None

            if tfma.is_batched_input(eval_shared_model, eval_config):
                # Batched input: read every split through a TFXIO source.
                single_examples = artifact_utils.get_single_instance(examples)
                tfxio_factory = tfxio_utils.get_tfxio_factory_from_artifact(
                    examples=[single_examples],
                    telemetry_descriptors=_TELEMETRY_DESCRIPTORS,
                    schema=schema,
                    raw_record_column_name=tfma_constants.ARROW_INPUT_COLUMN)
                for split in evaluator_step.splits:
                    split_uri = artifact_utils.get_split_uri(examples, split)
                    tfxio = tfxio_factory(io_utils.all_files_pattern(split_uri))
                    read_label = 'ReadFromTFRecordToArrow[%s]' % split
                    per_split_records.append(
                        pipeline | read_label >> tfxio.BeamSource())
                if schema is not None:
                    # Uses the tfxio left over from the last loop iteration.
                    tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
                        arrow_schema=tfxio.ArrowSchema(),
                        tensor_representations=tfxio.TensorRepresentations())
            else:
                # Unbatched input: read the raw TFRecords of every split.
                for split in evaluator_step.splits:
                    split_uri = artifact_utils.get_split_uri(examples, split)
                    split_pattern = io_utils.all_files_pattern(split_uri)
                    read_label = 'ReadFromTFRecord[%s]' % split
                    per_split_records.append(
                        pipeline | read_label >> beam.io.ReadFromTFRecord(
                            file_pattern=split_pattern))

            # Both optional hooks are called with the same keyword arguments.
            hook_kwargs = dict(
                eval_shared_model=eval_shared_model,
                eval_config=eval_config,
                tensor_adapter_config=tensor_adapter_config)

            # Custom extractors, if the step module defines them.
            extractors = None
            extractors_fn = try_get_fn(evaluator_step.CUSTOM_MODULE,
                                       'custom_extractors')
            if extractors_fn:
                extractors = extractors_fn(**hook_kwargs)

            # Custom evaluators, if the step module defines them.
            evaluators = None
            evaluators_fn = try_get_fn(evaluator_step.CUSTOM_MODULE,
                                       'custom_evaluators')
            if evaluators_fn:
                evaluators = evaluators_fn(**hook_kwargs)

            # Merge all splits, then extract, evaluate and write.
            flattened = (per_split_records
                         | 'FlattenExamples' >> beam.Flatten())
            _ = (flattened
                 | 'ExtractEvaluateAndWriteResults' >>
                 tfma.ExtractEvaluateAndWriteResults(
                     eval_config=eval_config,
                     eval_shared_model=eval_shared_model,
                     output_path=output_uri,
                     extractors=extractors,
                     evaluators=evaluators,
                     tensor_adapter_config=tensor_adapter_config))
        logging.info('Evaluation complete. Results written to %s.',
                     output_uri)
예제 #12
0
파일: executor.py 프로젝트: jay90099/tfx
  def Do(self, input_dict: Dict[str, List[types.Artifact]],
         output_dict: Dict[str, List[types.Artifact]],
         exec_properties: Dict[str, Any]) -> None:
    """Runs a batch job to evaluate the eval_model against the given input.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - model: exported model. At most one is accepted. May be absent when
          every model spec in eval_config sets a prediction_key.
        - baseline_model: optional baseline model. At most one is accepted.
        - examples: examples for eval the model. Required.
        - schema: optional schema, read and passed to the TFXIO factory.
      output_dict: Output dict from output key to a list of Artifacts.
        - evaluation: model evaluation results. Required.
        - blessing: blessing artifact; only accessed when validation runs.
      exec_properties: A dict of execution properties.
        - eval_config: JSON string of tfma.EvalConfig.
        - feature_slicing_spec: JSON string of evaluator_pb2.FeatureSlicingSpec
          instance, providing the way to slice the data. Deprecated, use
          eval_config.slicing_specs instead.
        - example_splits: JSON-serialized list of names of splits on which the
          metrics are computed. Default behavior (when example_splits is set to
          None) is using the 'eval' split.
        - FAIRNESS_INDICATOR_THRESHOLDS_KEY: optional JSON value; when it
          decodes to something truthy, a fairness indicators metrics callback
          is added.
        - MODULE_PATH_KEY: optional user module key; the pip packages decoded
          from it are appended to the Beam pipeline args as extra packages.
        - 'current_component_id': optional; copied onto the blessing artifact.

    Returns:
      None

    Raises:
      ValueError: If the examples input or the evaluation output is missing,
        if more than one candidate or baseline model is given, if eval_config
        declares more than two model specs, or if no model is given and a
        model spec lacks a prediction_key.
      AssertionError: If neither eval_config nor feature_slicing_spec is set.
    """
    if standard_component_specs.EXAMPLES_KEY not in input_dict:
      raise ValueError('EXAMPLES_KEY is missing from input dict.')
    if standard_component_specs.EVALUATION_KEY not in output_dict:
      raise ValueError('EVALUATION_KEY is missing from output dict.')
    if standard_component_specs.MODEL_KEY in input_dict and len(
        input_dict[standard_component_specs.MODEL_KEY]) > 1:
      raise ValueError('There can be only one candidate model, there are %d.' %
                       (len(input_dict[standard_component_specs.MODEL_KEY])))
    if standard_component_specs.BASELINE_MODEL_KEY in input_dict and len(
        input_dict[standard_component_specs.BASELINE_MODEL_KEY]) > 1:
      raise ValueError(
          'There can be only one baseline model, there are %d.' %
          (len(input_dict[standard_component_specs.BASELINE_MODEL_KEY])))

    self._log_startup(input_dict, output_dict, exec_properties)

    # Add fairness indicator metric callback if necessary.
    fairness_indicator_thresholds = json_utils.loads(
        exec_properties.get(
            standard_component_specs.FAIRNESS_INDICATOR_THRESHOLDS_KEY, 'null'))
    add_metrics_callbacks = None
    if fairness_indicator_thresholds:
      add_metrics_callbacks = [
          tfma.post_export_metrics.fairness_indicators(  # pytype: disable=module-attr
              thresholds=fairness_indicator_thresholds),
      ]

    # Look the output up through the same key that was validated above, so the
    # presence check and the read cannot disagree.
    output_uri = artifact_utils.get_single_uri(
        output_dict[standard_component_specs.EVALUATION_KEY])

    # Make sure user packages get propagated to the remote Beam worker.
    unused_module_path, extra_pip_packages = udf_utils.decode_user_module_key(
        exec_properties.get(standard_component_specs.MODULE_PATH_KEY, None))
    for pip_package_path in extra_pip_packages:
      local_pip_package_path = io_utils.ensure_local(pip_package_path)
      self._beam_pipeline_args.append('--extra_package=%s' %
                                      local_pip_package_path)

    # A user-supplied factory takes precedence over the TFMA default.
    eval_shared_model_fn = udf_utils.try_get_fn(
        exec_properties=exec_properties,
        fn_name='custom_eval_shared_model') or tfma.default_eval_shared_model

    run_validation = False
    models = []
    if (standard_component_specs.EVAL_CONFIG_KEY in exec_properties
        and exec_properties[standard_component_specs.EVAL_CONFIG_KEY]):
      slice_spec = None
      has_baseline = bool(
          input_dict.get(standard_component_specs.BASELINE_MODEL_KEY))
      eval_config = tfma.EvalConfig()
      proto_utils.json_to_proto(
          exec_properties[standard_component_specs.EVAL_CONFIG_KEY],
          eval_config)
      # rubber_stamp is always assumed true, i.e., change threshold will always
      # be ignored when a baseline model is missing.
      if hasattr(tfma, 'utils'):
        eval_config = tfma.utils.update_eval_config_with_defaults(
            eval_config, has_baseline=has_baseline, rubber_stamp=True)
        tfma.utils.verify_eval_config(eval_config)
      else:
        # TODO(b/171992041): Replaced by tfma.utils.
        eval_config = tfma.update_eval_config_with_defaults(
            eval_config, has_baseline=has_baseline, rubber_stamp=True)
        tfma.verify_eval_config(eval_config)
      # Do not validate model when there is no thresholds configured. This is to
      # avoid accidentally blessing models when users forget to set thresholds.
      run_validation = bool(
          tfma.metrics.metric_thresholds_from_metrics_specs(
              eval_config.metrics_specs, eval_config=eval_config))
      if len(eval_config.model_specs) > 2:
        raise ValueError(
            """Cannot support more than two models. There are %d models in this
             eval_config.""" % (len(eval_config.model_specs)))
      # Extract model artifacts.
      for model_spec in eval_config.model_specs:
        if standard_component_specs.MODEL_KEY not in input_dict:
          # Without a model artifact, evaluation relies on the prediction key
          # of each model spec; no shared model is built for it.
          if not model_spec.prediction_key:
            raise ValueError(
                'model_spec.prediction_key required if model not provided')
          continue
        if model_spec.is_baseline:
          model_artifact = artifact_utils.get_single_instance(
              input_dict[standard_component_specs.BASELINE_MODEL_KEY])
        else:
          model_artifact = artifact_utils.get_single_instance(
              input_dict[standard_component_specs.MODEL_KEY])
        # TODO(b/171992041): tfma.get_model_type replaced by tfma.utils.
        # Parentheses make the and/or grouping explicit; the truth table is
        # unchanged.
        if ((hasattr(tfma, 'utils') and
             tfma.utils.get_model_type(model_spec) == tfma.TF_ESTIMATOR) or
            (hasattr(tfma, 'get_model_type') and
             tfma.get_model_type(model_spec) == tfma.TF_ESTIMATOR)):
          model_path = path_utils.eval_model_path(
              model_artifact.uri,
              path_utils.is_old_model_artifact(model_artifact))
        else:
          model_path = path_utils.serving_model_path(
              model_artifact.uri,
              path_utils.is_old_model_artifact(model_artifact))
        logging.info('Using %s as %s model.', model_path, model_spec.name)
        models.append(
            eval_shared_model_fn(
                eval_saved_model_path=model_path,
                model_name=model_spec.name,
                eval_config=eval_config,
                add_metrics_callbacks=add_metrics_callbacks))
    else:
      # Legacy path: no eval_config, fall back to feature_slicing_spec.
      eval_config = None
      assert (standard_component_specs.FEATURE_SLICING_SPEC_KEY
              in exec_properties and
              exec_properties[standard_component_specs.FEATURE_SLICING_SPEC_KEY]
             ), 'both eval_config and feature_slicing_spec are unset.'
      feature_slicing_spec = evaluator_pb2.FeatureSlicingSpec()
      proto_utils.json_to_proto(
          exec_properties[standard_component_specs.FEATURE_SLICING_SPEC_KEY],
          feature_slicing_spec)
      slice_spec = self._get_slice_spec_from_feature_slicing_spec(
          feature_slicing_spec)
      model_artifact = artifact_utils.get_single_instance(
          input_dict[standard_component_specs.MODEL_KEY])
      model_path = path_utils.eval_model_path(
          model_artifact.uri, path_utils.is_old_model_artifact(model_artifact))
      logging.info('Using %s for model eval.', model_path)
      models.append(
          eval_shared_model_fn(
              eval_saved_model_path=model_path,
              model_name='',
              eval_config=None,
              add_metrics_callbacks=add_metrics_callbacks))

    # A single model is passed on bare; otherwise the list (possibly empty) is
    # passed as is.
    eval_shared_model = models[0] if len(models) == 1 else models
    schema = None
    if standard_component_specs.SCHEMA_KEY in input_dict:
      schema = io_utils.SchemaReader().read(
          io_utils.get_only_uri_in_dir(
              artifact_utils.get_single_uri(
                  input_dict[standard_component_specs.SCHEMA_KEY])))

    # Load and deserialize example splits from execution properties.
    example_splits = json_utils.loads(
        exec_properties.get(standard_component_specs.EXAMPLE_SPLITS_KEY,
                            'null'))
    if not example_splits:
      example_splits = ['eval']
      logging.info("The 'example_splits' parameter is not set, using 'eval' "
                   'split.')

    logging.info('Evaluating model.')
    # TempPipInstallContext is needed here so that subprocesses (which
    # may be created by the Beam multi-process DirectRunner) can find the
    # needed dependencies.
    # TODO(b/187122662): Move this to the ExecutorOperator or Launcher.
    with udf_utils.TempPipInstallContext(extra_pip_packages):
      with self._make_beam_pipeline() as pipeline:
        examples_list = []
        tensor_adapter_config = None
        # pylint: disable=expression-not-assigned
        if tfma.is_batched_input(eval_shared_model, eval_config):
          tfxio_factory = tfxio_utils.get_tfxio_factory_from_artifact(
              examples=input_dict[standard_component_specs.EXAMPLES_KEY],
              telemetry_descriptors=_TELEMETRY_DESCRIPTORS,
              schema=schema,
              raw_record_column_name=tfma_constants.ARROW_INPUT_COLUMN)
          # TODO(b/161935932): refactor after TFXIO supports multiple patterns.
          for split in example_splits:
            split_uris = artifact_utils.get_split_uris(
                input_dict[standard_component_specs.EXAMPLES_KEY], split)
            for index, split_uri in enumerate(split_uris):
              file_pattern = io_utils.all_files_pattern(split_uri)
              tfxio = tfxio_factory(file_pattern)
              data = (
                  pipeline
                  | f'ReadFromTFRecordToArrow[{split}][{index}]' >>
                  tfxio.BeamSource())
              examples_list.append(data)
          if schema is not None:
            # Use last tfxio as TensorRepresentations and ArrowSchema are fixed.
            tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
                arrow_schema=tfxio.ArrowSchema(),
                tensor_representations=tfxio.TensorRepresentations())
        else:
          for split in example_splits:
            split_uris = artifact_utils.get_split_uris(
                input_dict[standard_component_specs.EXAMPLES_KEY], split)
            for index, split_uri in enumerate(split_uris):
              file_pattern = io_utils.all_files_pattern(split_uri)
              data = (
                  pipeline
                  | f'ReadFromTFRecord[{split}][{index}]' >>
                  beam.io.ReadFromTFRecord(file_pattern=file_pattern))
              examples_list.append(data)

        custom_extractors = udf_utils.try_get_fn(
            exec_properties=exec_properties, fn_name='custom_extractors')
        extractors = None
        if custom_extractors:
          extractors = custom_extractors(
              eval_shared_model=eval_shared_model,
              eval_config=eval_config,
              tensor_adapter_config=tensor_adapter_config)

        # Reuse eval_shared_model computed above instead of re-deriving it
        # from `models`.
        (examples_list | 'FlattenExamples' >> beam.Flatten()
         | 'ExtractEvaluateAndWriteResults' >>
         (tfma.ExtractEvaluateAndWriteResults(
             eval_shared_model=eval_shared_model,
             eval_config=eval_config,
             extractors=extractors,
             output_path=output_uri,
             slice_spec=slice_spec,
             tensor_adapter_config=tensor_adapter_config)))
    logging.info('Evaluation complete. Results written to %s.', output_uri)

    if not run_validation:
      # TODO(jinhuang): delete the BLESSING_KEY from output_dict when supported.
      logging.info('No threshold configured, will not validate model.')
      return
    # Set up blessing artifact
    blessing = artifact_utils.get_single_instance(
        output_dict[standard_component_specs.BLESSING_KEY])
    # NOTE(review): this path indexes MODEL_KEY unconditionally; presumably
    # validation is only configured when a candidate model is supplied -
    # confirm against callers.
    blessing.set_string_custom_property(
        constants.ARTIFACT_PROPERTY_CURRENT_MODEL_URI_KEY,
        artifact_utils.get_single_uri(
            input_dict[standard_component_specs.MODEL_KEY]))
    blessing.set_int_custom_property(
        constants.ARTIFACT_PROPERTY_CURRENT_MODEL_ID_KEY,
        input_dict[standard_component_specs.MODEL_KEY][0].id)
    if input_dict.get(standard_component_specs.BASELINE_MODEL_KEY):
      baseline_model = input_dict[
          standard_component_specs.BASELINE_MODEL_KEY][0]
      blessing.set_string_custom_property(
          constants.ARTIFACT_PROPERTY_BASELINE_MODEL_URI_KEY,
          baseline_model.uri)
      blessing.set_int_custom_property(
          constants.ARTIFACT_PROPERTY_BASELINE_MODEL_ID_KEY, baseline_model.id)
    if 'current_component_id' in exec_properties:
      blessing.set_string_custom_property(
          'component_id', exec_properties['current_component_id'])
    # Check validation result and write BLESSED file accordingly.
    logging.info('Checking validation results.')
    validation_result = tfma.load_validation_result(output_uri)
    if validation_result.validation_ok:
      io_utils.write_string_file(
          os.path.join(blessing.uri, constants.BLESSED_FILE_NAME), '')
      blessing.set_int_custom_property(constants.ARTIFACT_PROPERTY_BLESSED_KEY,
                                       constants.BLESSED_VALUE)
    else:
      io_utils.write_string_file(
          os.path.join(blessing.uri, constants.NOT_BLESSED_FILE_NAME), '')
      blessing.set_int_custom_property(constants.ARTIFACT_PROPERTY_BLESSED_KEY,
                                       constants.NOT_BLESSED_VALUE)
    logging.info('Blessing result %s written to %s.',
                 validation_result.validation_ok, blessing.uri)