Example #1
  def Do(self, input_dict: Dict[Text, List[types.Artifact]],
         output_dict: Dict[Text, List[types.Artifact]],
         exec_properties: Dict[Text, Any]) -> None:
    # KerasTuner generates tuning state (e.g., oracle, trials) to working dir.
    working_dir = self._get_tmp_dir()

    train_path = artifact_utils.get_split_uri(input_dict['examples'], 'train')
    eval_path = artifact_utils.get_split_uri(input_dict['examples'], 'eval')
    schema_file = io_utils.get_only_uri_in_dir(
        artifact_utils.get_single_uri(input_dict['schema']))
    schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())

    tuner_fn = self._GetTunerFn(exec_properties)
    tuner_spec = tuner_fn(working_dir, io_utils.all_files_pattern(train_path),
                          io_utils.all_files_pattern(eval_path), schema)
    tuner = tuner_spec.tuner

    tuner.search_space_summary()
    # TODO(jyzhao): assert v2 behavior as KerasTuner doesn't work in v1.
    # TODO(jyzhao): make epochs configurable.
    tuner.search(
        tuner_spec.train_dataset,
        epochs=5,
        validation_data=tuner_spec.eval_dataset)
    tuner.results_summary()

    best_hparams = tuner.oracle.get_best_trials(
        1)[0].hyperparameters.get_config()
    best_hparams_path = os.path.join(
        artifact_utils.get_single_uri(output_dict['study_best_hparams_path']),
        _DEFAULT_FILE_NAME)
    io_utils.write_string_file(best_hparams_path, json.dumps(best_hparams))
    absl.logging.info('Best HParams is written to %s.' % best_hparams_path)
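
The executor above serializes the winning trial's hyperparameters with get_config() and json.dumps. A hedged sketch of how a downstream step could load them back; the file path is hypothetical (the executor writes to _DEFAULT_FILE_NAME under the study_best_hparams_path artifact URI):

import json

import kerastuner
import tensorflow as tf

# Hypothetical path; in the executor it is <study_best_hparams_path uri>/_DEFAULT_FILE_NAME.
best_hparams_path = '/tmp/study_best_hparams_path/best_hparams.json'
with tf.io.gfile.GFile(best_hparams_path) as f:
  best_hparams = kerastuner.HyperParameters.from_config(json.loads(f.read()))
print(best_hparams.values)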
Example #2
File: fn_args_utils.py  Project: zvrr/tfx
def get_common_fn_args(input_dict: Dict[Text, List[types.Artifact]],
                       exec_properties: Dict[Text, Any],
                       working_dir: Text = None) -> FnArgs:
  """Get common args of training and tuning."""
  train_files = [
      io_utils.all_files_pattern(
          artifact_utils.get_split_uri(input_dict[constants.EXAMPLES_KEY],
                                       'train'))
  ]
  eval_files = [
      io_utils.all_files_pattern(
          artifact_utils.get_split_uri(input_dict[constants.EXAMPLES_KEY],
                                       'eval'))
  ]

  if input_dict.get(constants.TRANSFORM_GRAPH_KEY):
    transform_graph_path = artifact_utils.get_single_uri(
        input_dict[constants.TRANSFORM_GRAPH_KEY])
  else:
    transform_graph_path = None

  if input_dict.get(constants.SCHEMA_KEY):
    schema_path = io_utils.get_only_uri_in_dir(
        artifact_utils.get_single_uri(input_dict[constants.SCHEMA_KEY]))
  else:
    schema_path = None

  train_args = trainer_pb2.TrainArgs()
  eval_args = trainer_pb2.EvalArgs()
  json_format.Parse(exec_properties[constants.TRAIN_ARGS_KEY], train_args)
  json_format.Parse(exec_properties[constants.EVAL_ARGS_KEY], eval_args)

  # https://github.com/tensorflow/tfx/issues/45: Replace num_steps=0 with
  # num_steps=None.  Conversion of the proto to python will set the default
  # value of an int as 0 so modify the value here.  Tensorflow will raise an
  # error if num_steps <= 0.
  train_steps = train_args.num_steps or None
  eval_steps = eval_args.num_steps or None

  # TODO(b/156929910): Refactor Trainer to be consistent with empty or None
  #                    custom_config handling.
  custom_config = json_utils.loads(
      exec_properties.get(constants.CUSTOM_CONFIG_KEY, 'null'))

  return FnArgs(
      working_dir=working_dir,
      train_files=train_files,
      eval_files=eval_files,
      train_steps=train_steps,
      eval_steps=eval_steps,
      schema_path=schema_path,
      transform_graph_path=transform_graph_path,
      custom_config=custom_config,
  )
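
A small check of the num_steps handling described in the comment above: proto3 int fields parse to 0 when unset, and get_common_fn_args maps that 0 to None before TensorFlow sees it.

# Minimal illustration of the num_steps default handling; assumes tfx.proto.trainer_pb2 is importable.
from google.protobuf import json_format
from tfx.proto import trainer_pb2

train_args = trainer_pb2.TrainArgs()
json_format.Parse('{}', train_args)          # no num_steps provided
assert train_args.num_steps == 0             # proto3 default for int fields
train_steps = train_args.num_steps or None   # what get_common_fn_args does
assert train_steps is None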
Example #3
    def _generate_blessing_result(self, eval_examples_uri: Text,
                                  slice_spec: List[
                                      tfma.slicer.SingleSliceSpec],
                                  current_model_dir: Text,
                                  blessed_model_dir: Text) -> bool:
        current_model_eval_result_path = os.path.join(
            self._temp_path, constants.CURRENT_MODEL_EVAL_RESULT_PATH)
        blessed_model_eval_result_path = os.path.join(
            self._temp_path, constants.BLESSED_MODEL_EVAL_RESULT_PATH)

        with self._make_beam_pipeline() as pipeline:
            eval_data = (pipeline | 'ReadData' >> beam.io.ReadFromTFRecord(
                file_pattern=io_utils.all_files_pattern(eval_examples_uri)))

            current_model = tfma.default_eval_shared_model(
                eval_saved_model_path=path_utils.eval_model_path(
                    current_model_dir))
            (eval_data
             | 'EvalCurrentModel' >> tfma.ExtractEvaluateAndWriteResults(  # pylint: disable=expression-not-assigned
                 eval_shared_model=current_model,
                 slice_spec=slice_spec,
                 output_path=current_model_eval_result_path))

            if blessed_model_dir is not None:
                blessed_model = tfma.default_eval_shared_model(
                    eval_saved_model_path=path_utils.eval_model_path(
                        blessed_model_dir))
                (eval_data
                 | 'EvalBlessedModel' >> tfma.ExtractEvaluateAndWriteResults(  # pylint: disable=expression-not-assigned
                     eval_shared_model=blessed_model,
                     slice_spec=slice_spec,
                     output_path=blessed_model_eval_result_path))

        absl.logging.info(
            'all files in current_model_eval_result_path: [%s]',
            str(tf.io.gfile.listdir(current_model_eval_result_path)))
        current_model_eval_result = tfma.load_eval_result(
            output_path=current_model_eval_result_path)

        if not self._pass_threshold(current_model_eval_result):
            absl.logging.info('Current model does not pass threshold.')
            return False
        absl.logging.info('Current model passes threshold.')

        if blessed_model_dir is None:
            absl.logging.info('No blessed model yet.')
            return True
        absl.logging.info(
            'all files in blessed_model_eval_result: [%s]',
            str(tf.io.gfile.listdir(blessed_model_eval_result_path)))
        blessed_model_eval_result = tfma.load_eval_result(
            output_path=blessed_model_eval_result_path)

        if (self._compare_eval_result(current_model_eval_result,
                                      blessed_model_eval_result)):
            absl.logging.info('Current model better than blessed model.')
            return True
        else:
            absl.logging.info('Current model worse than blessed model.')
            return False
Example #4
File: executor.py  Project: reddqian/tfx
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Runs a batch job to evaluate the eval_model against the given input.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - model_exports: exported model.
        - examples: examples used to evaluate the model.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: model evaluation results.
      exec_properties: A dict of execution properties.
        - feature_slicing_spec: JSON string of evaluator_pb2.FeatureSlicingSpec
          instance, providing the way to slice the data.

    Returns:
      None
    """
        if 'model_exports' not in input_dict:
            raise ValueError('\'model_exports\' is missing in input dict.')
        if 'examples' not in input_dict:
            raise ValueError('\'examples\' is missing in input dict.')
        if 'output' not in output_dict:
            raise ValueError('\'output\' is missing in output dict.')

        self._log_startup(input_dict, output_dict, exec_properties)

        # Extract input artifacts
        model_exports_uri = artifact_utils.get_single_uri(
            input_dict['model_exports'])

        feature_slicing_spec = evaluator_pb2.FeatureSlicingSpec()
        json_format.Parse(exec_properties['feature_slicing_spec'],
                          feature_slicing_spec)
        slice_spec = self._get_slice_spec_from_feature_slicing_spec(
            feature_slicing_spec)

        output_uri = artifact_utils.get_single_uri(output_dict['output'])

        eval_model_path = path_utils.eval_model_path(model_exports_uri)

        tf.logging.info('Using {} for model eval.'.format(eval_model_path))
        eval_shared_model = tfma.default_eval_shared_model(
            eval_saved_model_path=eval_model_path)

        tf.logging.info('Evaluating model.')
        with self._make_beam_pipeline() as pipeline:
            # pylint: disable=expression-not-assigned
            (pipeline
             | 'ReadData' >>
             beam.io.ReadFromTFRecord(file_pattern=io_utils.all_files_pattern(
                 artifact_utils.get_split_uri(input_dict['examples'], 'eval')))
             | 'ExtractEvaluateAndWriteResults' >>
             tfma.ExtractEvaluateAndWriteResults(
                 eval_shared_model=eval_shared_model,
                 slice_spec=slice_spec,
                 output_path=output_uri))
        tf.logging.info(
            'Evaluation complete. Results written to {}.'.format(output_uri))
Example #5
File: executor.py  Project: luvneries/tfx
  def Do(self, input_dict,
         output_dict,
         exec_properties):
    """Runs a batch job to evaluate the eval_model against the given input.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - model_exports: exported model.
        - examples: examples used to evaluate the model.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: model evaluation results.
      exec_properties: A dict of execution properties.
        - feature_slicing_spec: JSON string of evaluator_pb2.FeatureSlicingSpec
          instance, providing the way to slice the data.

    Returns:
      None
    """
    if 'model_exports' not in input_dict:
      raise ValueError('\'model_exports\' is missing in input dict.')
    if 'examples' not in input_dict:
      raise ValueError('\'examples\' is missing in input dict.')
    if 'output' not in output_dict:
      raise ValueError('\'output\' is missing in output dict.')

    self._log_startup(input_dict, output_dict, exec_properties)

    # Extract input artifacts
    model_exports_uri = types.get_single_uri(input_dict['model_exports'])

    feature_slicing_spec = evaluator_pb2.FeatureSlicingSpec()
    json_format.Parse(exec_properties['feature_slicing_spec'],
                      feature_slicing_spec)
    slice_spec = self._get_slice_spec_from_feature_slicing_spec(
        feature_slicing_spec)

    output_uri = types.get_single_uri(output_dict['output'])

    eval_model_path = path_utils.eval_model_path(model_exports_uri)

    tf.logging.info('Using {} for model eval.'.format(eval_model_path))
    eval_shared_model = tfma.default_eval_shared_model(
        eval_saved_model_path=eval_model_path)

    tf.logging.info('Evaluating model.')
    with beam.Pipeline(argv=self._get_beam_pipeline_args()) as pipeline:
      # pylint: disable=expression-not-assigned
      (pipeline
       | 'ReadData' >> beam.io.ReadFromTFRecord(
           file_pattern=io_utils.all_files_pattern(
               types.get_split_uri(input_dict['examples'], 'eval')))
       |
       'ExtractEvaluateAndWriteResults' >> tfma.ExtractEvaluateAndWriteResults(
           eval_shared_model=eval_shared_model,
           slice_spec=slice_spec,
           output_path=output_uri))
    tf.logging.info(
        'Evaluation complete. Results written to {}.'.format(output_uri))
Example #6
    def _run_model_inference(
            self, data_spec: bulk_inferrer_pb2.DataSpec,
            examples: List[types.Artifact], output_uri: Text,
            inference_endpoint: model_spec_pb2.InferenceSpecType) -> bool:
        """Runs model inference on given example data.

    Args:
      data_spec: bulk_inferrer_pb2.DataSpec instance.
      examples: List of example artifacts.
      output_uri: Output artifact uri.
      inference_endpoint: Model inference endpoint.

    Returns:
      Whether the inference job succeeded.
    """

        example_uris = {}
        if data_spec.example_splits:
            for example in examples:
                for split in artifact_utils.decode_split_names(
                        example.split_names):
                    if split in data_spec.example_splits:
                        example_uris[split] = os.path.join(example.uri, split)
        else:
            for example in examples:
                for split in artifact_utils.decode_split_names(
                        example.split_names):
                    example_uris[split] = os.path.join(example.uri, split)
        output_path = os.path.join(output_uri, _PREDICTION_LOGS_DIR_NAME)
        logging.info('BulkInferrer generates prediction log to %s',
                     output_path)

        with self._make_beam_pipeline() as pipeline:
            data_list = []
            for split, example_uri in example_uris.items():
                data = (
                    pipeline
                    | 'ReadData[{}]'.format(split) >> beam.io.ReadFromTFRecord(
                        file_pattern=io_utils.all_files_pattern(example_uri)))
                data_list.append(data)
            _ = (
                data_list
                | 'FlattenExamples' >> beam.Flatten(pipeline=pipeline)
                # TODO(b/131873699): Use the correct Example type here, which
                # is either Example or SequenceExample.
                | 'ParseExamples' >> beam.Map(tf.train.Example.FromString)
                | 'RunInference' >>
                run_inference.RunInference(inference_endpoint)
                | 'WritePredictionLogs' >> beam.io.WriteToTFRecord(
                    output_path,
                    file_name_suffix='.gz',
                    coder=beam.coders.ProtoCoder(
                        prediction_log_pb2.PredictionLog)))
        logging.info('Inference result written to %s.', output_path)
Example #7
    def _run_model_inference(self, model_path: Text,
                             example_uris: Mapping[Text,
                                                   Text], output_path: Text,
                             model_spec: bulk_inferrer_pb2.ModelSpec) -> None:
        """Runs model inference on given example data.

    Args:
      model_path: Path to model.
      example_uris: Mapping of example split name to example uri.
      output_path: Path to output generated prediction logs.
      model_spec: bulk_inferrer_pb2.ModelSpec instance.

    Returns:
      None
    """

        try:
            from tfx_bsl.public.beam import run_inference
            from tfx_bsl.public.proto import model_spec_pb2
        except ImportError:
            # TODO(b/151468119): Remove this branch after next release.
            run_inference = importlib.import_module(
                'tfx_bsl.beam.run_inference')
            model_spec_pb2 = importlib.import_module(
                'tfx_bsl.proto.model_spec_pb2')
        saved_model_spec = model_spec_pb2.SavedModelSpec(
            model_path=model_path,
            tag=model_spec.tag,
            signature_name=model_spec.model_signature_name)
        # TODO(b/151468119): Remove this branch after next release.
        if getattr(model_spec_pb2, 'InferenceEndpoint', False):
            inference_endpoint = getattr(model_spec_pb2, 'InferenceEndpoint')()
        else:
            inference_endpoint = model_spec_pb2.InferenceSpecType()
        inference_endpoint.saved_model_spec.CopyFrom(saved_model_spec)
        with self._make_beam_pipeline() as pipeline:
            data_list = []
            for split, example_uri in example_uris.items():
                data = (
                    pipeline
                    | 'ReadData[{}]'.format(split) >> beam.io.ReadFromTFRecord(
                        file_pattern=io_utils.all_files_pattern(example_uri)))
                data_list.append(data)
            _ = ([data for data in data_list]
                 | 'FlattenExamples' >> beam.Flatten(pipeline=pipeline)
                 | 'ParseExamples' >> beam.Map(tf.train.Example.FromString)
                 | 'RunInference' >>
                 run_inference.RunInference(inference_endpoint)
                 | 'WritePredictionLogs' >> beam.io.WriteToTFRecord(
                     output_path,
                     file_name_suffix='.gz',
                     coder=beam.coders.ProtoCoder(
                         prediction_log_pb2.PredictionLog)))
        logging.info('Inference result written to %s.', output_path)
Example #8
    def _generate_blessing_result(self, eval_examples_uri, slice_spec,
                                  current_model_dir, blessed_model_dir):
        current_model_eval_result_path = os.path.join(
            self._temp_path, CURRENT_MODEL_EVAL_RESULT_PATH)
        blessed_model_eval_result_path = os.path.join(
            self._temp_path, BLESSED_MODEL_EVAL_RESULT_PATH)

        with beam.Pipeline(argv=self._get_beam_pipeline_args()) as pipeline:
            eval_data = (pipeline | 'ReadData' >> beam.io.ReadFromTFRecord(
                file_pattern=io_utils.all_files_pattern(eval_examples_uri)))

            current_model = tfma.default_eval_shared_model(
                eval_saved_model_path=path_utils.eval_model_path(
                    current_model_dir))
            (eval_data
             | 'EvalCurrentModel' >> tfma.ExtractEvaluateAndWriteResults(  # pylint: disable=expression-not-assigned
                 eval_shared_model=current_model,
                 slice_spec=slice_spec,
                 output_path=current_model_eval_result_path))

            if blessed_model_dir is not None:
                blessed_model = tfma.default_eval_shared_model(
                    eval_saved_model_path=path_utils.eval_model_path(
                        blessed_model_dir))
                (eval_data
                 | 'EvalBlessedModel' >> tfma.ExtractEvaluateAndWriteResults(  # pylint: disable=expression-not-assigned
                     eval_shared_model=blessed_model,
                     slice_spec=slice_spec,
                     output_path=blessed_model_eval_result_path))

        current_model_eval_result = tfma.load_eval_result(
            output_path=current_model_eval_result_path)

        if not self._pass_threshold(current_model_eval_result):
            tf.logging.info('Current model does not pass threshold.')
            return False
        tf.logging.info('Current model passes threshold.')

        if blessed_model_dir is None:
            tf.logging.info('No blessed model yet.')
            return True

        blessed_model_eval_result = tfma.load_eval_result(
            output_path=blessed_model_eval_result_path)

        if (self._compare_eval_result(current_model_eval_result,
                                      blessed_model_eval_result)):
            tf.logging.info('Current model better than blessed model.')
            return True
        else:
            tf.logging.info('Current model worse than blessed model.')
            return False
Example #9
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:

        source = exec_properties[StepKeys.SOURCE]
        args = exec_properties[StepKeys.ARGS]

        c = source_utils.load_source_path_class(source)
        tokenizer_step: BaseTokenizer = c(**args)

        tokenizer_location = artifact_utils.get_single_uri(
            output_dict["tokenizer"])

        split_uris, split_names, all_files = [], [], []
        for artifact in input_dict["examples"]:
            for split in artifact_utils.decode_split_names(
                    artifact.split_names):
                split_names.append(split)
                uri = os.path.join(artifact.uri, split)
                split_uris.append((split, uri))
                all_files += path_utils.list_dir(uri)

        # Get output split path
        output_examples = artifact_utils.get_single_instance(
            output_dict["output_examples"])
        output_examples.split_names = artifact_utils.encode_split_names(
            split_names)

        if not tokenizer_step.skip_training:
            tokenizer_step.train(files=all_files)

            tokenizer_step.save(output_dir=tokenizer_location)

        with self._make_beam_pipeline() as p:
            for split, uri in split_uris:
                input_uri = io_utils.all_files_pattern(uri)

                _ = (p
                     | 'ReadData.' + split >> beam.io.ReadFromTFRecord(
                            file_pattern=input_uri)
                     | "ParseTFExFromString." + split >> beam.Map(
                            tf.train.Example.FromString)
                     | "AddTokens." + split >> beam.Map(
                            append_tf_example,
                            tokenizer_step=tokenizer_step)
                     | 'Serialize.' + split >> beam.Map(
                            lambda x: x.SerializeToString())
                     | 'WriteSplit.' + split >> WriteSplit(
                            get_split_uri(
                                output_dict["output_examples"],
                                split)))
Example #10
    def _run_sampling(self, example_uris: Mapping[Text, Text], to_key_fn: Text,
                      output_artifact: Artifact, samples_per_key: int) -> None:
        """Runs stratified sampling on given example data.
    Args:
      example_uris: Mapping of example split name to example uri.
      to_key_fn: function to convert an example to a key
      output_artifact: Output artifact.
      samples_per_key: number of examples to keep per value of the key.
    Returns:
      None
    """

        d = {}
        exec(to_key_fn, globals(), d)  # how ugly is that?
        to_key = d['to_key']

        def to_keyed_value(m):
            return to_key(m), m

        with self._make_beam_pipeline() as pipeline:
            for split_name, example_uri in example_uris.items():
                data_list = [
                    (pipeline | 'ReadData[{}]'.format(split_name) >>
                     beam.io.ReadFromTFRecord(
                         file_pattern=io_utils.all_files_pattern(example_uri)))
                ]

                dest_path = os.path.join(
                    artifact_utils.get_split_uri([output_artifact],
                                                 split_name),
                    _STRATIFIED_EXAMPLES_FILE_PREFIX)

                _ = ([data for data in data_list]
                     | 'FlattenExamples ({})'.format(split_name) >>
                     beam.Flatten(pipeline=pipeline)
                     | 'ParseExamples ({})'.format(split_name) >> beam.Map(
                         tf.train.Example.FromString)
                     |
                     'Key ({})'.format(split_name) >> beam.Map(to_keyed_value)
                     | 'Sample per key ({})'.format(split_name) >>
                     beam.combiners.Sample.FixedSizePerKey(samples_per_key)
                     | 'Values ({})'.format(split_name) >> beam.Values()
                     | 'Flatten lists ({})'.format(split_name) >>
                     beam.FlatMap(lambda elements: elements)
                     | 'WriteStratifiedSamples ({})'.format(split_name) >>
                     beam.io.WriteToTFRecord(dest_path,
                                             file_name_suffix='.gz',
                                             coder=beam.coders.ProtoCoder(
                                                 tf.train.Example)))
                logging.info('Sampling result written to %s.', dest_path)
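
The to_key_fn above arrives as a string and is exec'd to obtain a to_key function, which is then applied to each parsed tf.train.Example. A hedged sketch of such a string; the 'label' feature name is an illustrative assumption:

# Hypothetical to_key_fn payload for the sampler above; it must define to_key.
to_key_fn = """
def to_key(example):
  # Key each tf.train.Example by its integer 'label' feature (assumed to exist).
  return example.features.feature['label'].int64_list.value[0]
"""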
Example #11
def _RunInference(
    pipeline: beam.Pipeline, example_uri: Text,
    inference_endpoint: model_spec_pb2.InferenceSpecType
) -> beam.pvalue.PCollection:
  """Runs model inference on given examples data."""
  # TODO(b/174703893): adopt standardized input.
  return (
      pipeline
      | 'ReadData' >> beam.io.ReadFromTFRecord(
          file_pattern=io_utils.all_files_pattern(example_uri))
      # TODO(b/131873699): Use the correct Example type here, which
      # is either Example or SequenceExample.
      | 'ParseExamples' >> beam.Map(tf.train.Example.FromString)
      | 'RunInference' >> run_inference.RunInference(inference_endpoint))
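
_RunInference above only reads examples and runs inference; the caller still writes the resulting PredictionLogs. A hedged usage sketch that mirrors the write step in the bulk inferrer examples on this page (output_path, example_uri and inference_endpoint are assumed to be prepared by the caller):

import apache_beam as beam
from tensorflow_serving.apis import prediction_log_pb2


def run_and_write(example_uri, inference_endpoint, output_path, beam_pipeline_args=None):
  # Hypothetical wrapper; pipeline options come from the caller.
  with beam.Pipeline(argv=beam_pipeline_args) as pipeline:
    _ = (_RunInference(pipeline, example_uri, inference_endpoint)
         | 'WritePredictionLogs' >> beam.io.WriteToTFRecord(
             output_path,
             file_name_suffix='.gz',
             coder=beam.coders.ProtoCoder(prediction_log_pb2.PredictionLog)))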
Example #12
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Computes stats for each split of input using tensorflow_data_validation.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - input_data: A list of type `standard_artifacts.Examples`. This should
          contain both 'train' and 'eval' split.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: A list of type `standard_artifacts.ExampleStatistics`. This
          should contain both the 'train' and 'eval' splits.
      exec_properties: A dict of execution properties. Not used yet.

    Returns:
      None
    """
        self._log_startup(input_dict, output_dict, exec_properties)

        split_uris = []
        for artifact in input_dict['input_data']:
            for split in artifact_utils.decode_split_names(
                    artifact.split_names):
                uri = os.path.join(artifact.uri, split)
                split_uris.append((split, uri))
        with self._make_beam_pipeline() as p:
            # TODO(b/126263006): Support more stats_options through config.
            stats_options = options.StatsOptions()
            for split, uri in split_uris:
                absl.logging.info(
                    'Generating statistics for split {}'.format(split))
                input_uri = io_utils.all_files_pattern(uri)
                output_uri = artifact_utils.get_split_uri(
                    output_dict['output'], split)
                output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
                _ = (p
                     | 'ReadData.' + split >>
                     beam.io.ReadFromTFRecord(file_pattern=input_uri)
                     | 'DecodeData.' + split >>
                     tf_example_decoder.DecodeTFExample()
                     | 'GenerateStatistics.' + split >>
                     stats_api.GenerateStatistics(stats_options)
                     | 'WriteStatsOutput.' + split >> beam.io.WriteToTFRecord(
                         output_path,
                         shard_name_template='',
                         coder=beam.coders.ProtoCoder(
                             statistics_pb2.DatasetFeatureStatisticsList)))
                absl.logging.info(
                    'Statistics for split {} written to {}.'.format(
                        split, output_uri))
Example #13
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        self._log_startup(input_dict, output_dict, exec_properties)

        example_uris = {}
        for example in input_dict['examples']:
            for split in artifact_utils.decode_split_names(
                    example.split_names):
                example_uris[split] = os.path.join(example.uri, split)

        model = artifact_utils.get_single_instance(input_dict['model'])
        model_path = path_utils.serving_model_path(model.uri)
        absl.logging.info('Using {} as current model.'.format(model_path))

        output_uri = os.path.join(
            artifact_utils.get_single_uri(output_dict['output_data']),
            'pred.csv')
        with self._make_beam_pipeline() as pipeline:
            test_data = []
            for split, example_uri in example_uris.items():
                test_data.append(pipeline | 'ReadFromTFRecord_{}'.format(
                    split) >> beam.io.ReadFromTFRecord(
                        file_pattern=io_utils.all_files_pattern(example_uri)))

            (test_data | 'Flattern' >> beam.Flatten()
             | 'ParseToExample' >> beam.Map(tf.train.Example.FromString)
             | 'Prediction' >> beam.ParDo(
                 RunModel(model_path, 'serving_default', 'PassengerId'))
             | 'ParseToKVPair' >> beam.Map(lambda x: ParseResultToKV(x))
             | 'AddSameKey' >> beam.Map(lambda x: (1, x))
             | 'Window' >> beam.WindowInto(beam.window.GlobalWindows())
             | 'GroupByKey' >> beam.GroupByKey()
             | 'Sort' >> beam.Map(
                 lambda group_data: sorted(group_data[1], key=lambda x: x[0]))
             | 'Flatten' >> beam.FlatMap(lambda x: x)
             | 'ToStr' >> beam.Map(
                 lambda x: '{},{}'.format(x[0], '0' if x[1] < 0.5 else '1'))
             | 'WriteToFile' >> beam.io.WriteToText(
                 output_uri,
                 num_shards=1,
                 shard_name_template='',
                 header='PassengerId,Survived'))
        absl.logging.info('TestPredComponent result written to %s.',
                          output_uri)
Example #14
    def Do(self, input_dict: Dict[Text, List[types.TfxType]],
           output_dict: Dict[Text, List[types.TfxType]],
           exec_properties: Dict[Text, Any]) -> None:
        """Computes stats for each split of input using tensorflow_data_validation.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - input_data: A list of 'ExamplesPath' type. This should contain both
          'train' and 'eval' split.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: A list of 'ExampleStatisticsPath' type. This should contain
          both 'train' and 'eval' split.
      exec_properties: A dict of execution properties. Not used yet.

    Returns:
      None
    """
        self._log_startup(input_dict, output_dict, exec_properties)

        split_to_instance = {x.split: x for x in input_dict['input_data']}
        with beam.Pipeline(argv=self._get_beam_pipeline_args()) as p:
            # TODO(b/126263006): Support more stats_options through config.
            stats_options = options.StatsOptions()
            for split, instance in split_to_instance.items():
                tf.logging.info(
                    'Generating statistics for split {}'.format(split))
                input_uri = io_utils.all_files_pattern(instance.uri)
                output_uri = types.get_split_uri(output_dict['output'], split)
                output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
                _ = (p
                     | 'ReadData.' + split >>
                     beam.io.ReadFromTFRecord(file_pattern=input_uri)
                     | 'DecodeData.' + split >>
                     tf_example_decoder.DecodeTFExample()
                     | 'GenerateStatistics.' + split >>
                     stats_api.GenerateStatistics(stats_options)
                     | 'WriteStatsOutput.' + split >> beam.io.WriteToTFRecord(
                         output_path,
                         shard_name_template='',
                         coder=beam.coders.ProtoCoder(
                             statistics_pb2.DatasetFeatureStatisticsList)))
            tf.logging.info('Statistics written to {}.'.format(output_uri))
Example #15
    def _run_model_inference(self, model_path: Text,
                             example_uris: Mapping[Text,
                                                   Text], output_path: Text,
                             model_spec: bulk_inferrer_pb2.ModelSpec) -> None:
        """Runs model inference on given example data.

    Args:
      model_path: Path to model.
      example_uris: Mapping of example split name to example uri.
      output_path: Path to output generated prediction logs.
      model_spec: bulk_inferrer_pb2.ModelSpec instance.

    Returns:
      None
    """

        saved_model_spec = model_spec_pb2.SavedModelSpec(
            model_path=model_path,
            tag=model_spec.tag,
            signature_name=model_spec.model_signature_name)
        inference_endpoint = model_spec_pb2.InferenceEndpoint()
        inference_endpoint.saved_model_spec.CopyFrom(saved_model_spec)
        with self._make_beam_pipeline() as pipeline:
            data_list = []
            for split, example_uri in example_uris.items():
                data = (
                    pipeline
                    | 'ReadData[{}]'.format(split) >> beam.io.ReadFromTFRecord(
                        file_pattern=io_utils.all_files_pattern(example_uri)))
                data_list.append(data)
            _ = ([data for data in data_list]
                 | 'FlattenExamples' >> beam.Flatten(pipeline=pipeline)
                 | 'ParseExamples' >> beam.Map(tf.train.Example.FromString)
                 | 'RunInference' >>
                 run_inference.RunInference(inference_endpoint)
                 | 'WritePredictionLogs' >> beam.io.WriteToTFRecord(
                     output_path,
                     file_name_suffix='.gz',
                     coder=beam.coders.ProtoCoder(
                         prediction_log_pb2.PredictionLog)))
        logging.info('Inference result written to %s.', output_path)
Example #16
File: executor.py  Project: luvneries/tfx
  def Do(self, input_dict,
         output_dict,
         exec_properties):
    """Computes stats for each split of input using tensorflow_data_validation.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - input_data: A list of 'ExamplesPath' type. This should contain both
          'train' and 'eval' split.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: A list of 'ExampleStatisticsPath' type. This should contain
          both 'train' and 'eval' split.
      exec_properties: A dict of execution properties. Not used yet.

    Returns:
      None
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    split_to_instance = {x.split: x for x in input_dict['input_data']}
    with beam.Pipeline(argv=self._get_beam_pipeline_args()) as p:
      # TODO(b/126263006): Support more stats_options through config.
      stats_options = options.StatsOptions()
      for split, instance in split_to_instance.items():
        tf.logging.info('Generating statistics for split {}'.format(split))
        input_uri = io_utils.all_files_pattern(instance.uri)
        output_uri = types.get_split_uri(output_dict['output'], split)
        output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
        _ = (
            p
            | 'ReadData.' + split >>
            beam.io.ReadFromTFRecord(file_pattern=input_uri)
            | 'DecodeData.' + split >> tf_example_decoder.DecodeTFExample()
            | 'GenerateStatistics.' + split >>
            stats_api.GenerateStatistics(stats_options)
            | 'WriteStatsOutput.' + split >> beam.io.WriteToTFRecord(
                output_path,
                shard_name_template='',
                coder=beam.coders.ProtoCoder(
                    statistics_pb2.DatasetFeatureStatisticsList)))
      tf.logging.info('Statistics written to {}.'.format(output_uri))
Example #17
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        self._log_startup(input_dict, output_dict, exec_properties)

        absl.logging.info('Hello Component - Executor - Do Start')

        assert (len(input_dict['input_data']) == 1)
        for artifact in input_dict['input_data']:
            input_dir = artifact.uri
            output_dir = artifact_utils.get_single_uri(
                output_dict['output_data'])

            input_uri = io_utils.all_files_pattern(input_dir)
            output_uri = os.path.join(output_dir, 'result.csv')

            with self._make_beam_pipeline() as p:
                intrim = p | 'ReadData' >> beam.io.ReadFromTFRecord(
                    file_pattern=input_uri,
                    coder=beam.coders.ProtoCoder(
                        prediction_log_pb2.PredictionLog))
                intrim = intrim | 'Process' >> beam.Map(process_item)
                intrim = intrim | 'SameKey' >> beam.Map(lambda it: (0, it))
                intrim = intrim | 'SameWindow' >> beam.WindowInto(
                    beam.window.GlobalWindows())
                intrim = intrim | 'GroupAll' >> GroupByKey()
                intrim = intrim | 'RemoveDummyKey' >> beam.Map(
                    lambda item: item[1])
                intrim = intrim | 'SortAll' >> beam.Map(sort_data)
                intrim = intrim | 'InMemorySink' >> beam.Map(
                    lambda item: write_data(item, output_uri))

            # intrim | 'Sink' >> beam.io.WriteToText(file_path_prefix=output_uri,
            #                                          file_name_suffix='.csv',
            #                                          num_shards=1,
            #                                          # CompressionTypes.UNCOMPRESSED,
            #                                          header='ID_code,target')

        absl.logging.info('Hello Component - Executor - Do End')
Example #18
from tensorflow_serving.apis import prediction_log_pb2
import apache_beam as beam
import tensorflow as tf
from tfx.utils import io_utils


def print_item(item, file):
    example_bytes = item.predict_log.request.inputs['input_example_tensor'].string_val[0]
    
    # parsed = tf.train.Example.FromString(example_bytes)
    # (that would yield a tf.train.Example, i.e. a list of features)

    features = {
        'ID_code': tf.io.FixedLenFeature((), tf.string)
    }
    parsed = tf.io.parse_single_example(example_bytes, features=features)
    # parsed['ID_code'] is a string Tensor; .numpy() gets the value, e.g. b'id1'
    id_string = parsed['ID_code'].numpy().decode()
    output = item.predict_log.response.outputs['output_0'].float_val[0]
    file.write('{0},{1}\n'.format(id_string, 1 if output >= 0.5 else 0))
    
input_dir = '/var/tmp/santander/keras-tft/HelloComponent.HelloWorld/output_data/10'
input_uri = io_utils.all_files_pattern(input_dir)
with tf.io.gfile.GFile('/var/tmp/output.csv', 'w') as file:
    file.write('ID_code,target\n')

    p = beam.Pipeline()
    out = p | 'ReadExamples' >> beam.io.ReadFromTFRecord(file_pattern=input_uri, coder=beam.coders.ProtoCoder(prediction_log_pb2.PredictionLog))
    out = out | 'Print' >> beam.Map(lambda item: print_item(item, file))
    result = p.run()
    result.wait_until_finish()

print('done')
Example #19
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Computes stats for each split of input using tensorflow_data_validation.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - input_data: A list of type `standard_artifacts.Examples`. This should
          contain both 'train' and 'eval' split.
        - schema: Optionally, a list of type `standard_artifacts.Schema`. When
          the stats_options exec_property also contains a schema, this input
          should not be provided.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: A list of type `standard_artifacts.ExampleStatistics`. This
          should contain both the 'train' and 'eval' splits.
      exec_properties: A dict of execution properties.
        - stats_options_json: Optionally, a JSON representation of StatsOptions.
          When a schema is provided as an input, the StatsOptions value should
          not also contain a schema.

    Raises:
      ValueError when a schema is provided both as an input and as part of the
      StatsOptions exec_property.

    Returns:
      None
    """
        self._log_startup(input_dict, output_dict, exec_properties)

        stats_options = options.StatsOptions()
        if STATS_OPTIONS_JSON_KEY in exec_properties:
            stats_options_json = exec_properties[STATS_OPTIONS_JSON_KEY]
            if stats_options_json:
                # TODO(b/150802589): Move jsonable interface to tfx_bsl and use
                # json_utils
                stats_options = options.StatsOptions.from_json(
                    stats_options_json)
        if input_dict.get(SCHEMA_KEY):
            if stats_options.schema:
                raise ValueError(
                    'A schema was provided as an input and the '
                    'stats_options exec_property also contains a schema '
                    'value. At most one of these may be set.')
            else:
                schema = io_utils.SchemaReader().read(
                    io_utils.get_only_uri_in_dir(
                        artifact_utils.get_single_uri(input_dict[SCHEMA_KEY])))
                stats_options.schema = schema

        split_uris = []
        for artifact in input_dict[EXAMPLES_KEY]:
            for split in artifact_utils.decode_split_names(
                    artifact.split_names):
                uri = os.path.join(artifact.uri, split)
                split_uris.append((split, uri))
        with self._make_beam_pipeline() as p:
            for split, uri in split_uris:
                absl.logging.info(
                    'Generating statistics for split {}'.format(split))
                input_uri = io_utils.all_files_pattern(uri)
                tfxio_kwargs = {'file_pattern': input_uri}
                # TODO(b/151624179): clean this up after tfx_bsl is released with the
                # below flag.
                if getattr(tfxio, 'TFXIO_HAS_TELEMETRY', False):
                    tfxio_kwargs[
                        'telemetry_descriptors'] = _TELEMETRY_DESCRIPTORS
                input_tfxio = tf_example_record.TFExampleRecord(**tfxio_kwargs)
                output_uri = artifact_utils.get_split_uri(
                    output_dict[STATISTICS_KEY], split)
                output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
                data = p | 'TFXIORead[{}]'.format(
                    split) >> input_tfxio.BeamSource()
                # TODO(b/153368237): Clean this up after a release post tfx 0.21.
                if not getattr(tfdv, 'TFDV_ACCEPT_RECORD_BATCH', False):
                    data |= 'RecordBatchToTable[{}]'.format(split) >> beam.Map(
                        lambda rb: pa.Table.from_batches([rb]))
                _ = (data
                     | 'GenerateStatistics[{}]'.format(split) >>
                     stats_api.GenerateStatistics(stats_options)
                     | 'WriteStatsOutput[{}]'.format(split) >>
                     beam.io.WriteToTFRecord(
                         output_path,
                         shard_name_template='',
                         coder=beam.coders.ProtoCoder(
                             statistics_pb2.DatasetFeatureStatisticsList)))
                absl.logging.info(
                    'Statistics for split {} written to {}.'.format(
                        split, output_uri))
Example #20
  def testAllFilesPattern(self):
    self.assertEqual('model*', io_utils.all_files_pattern('model'))
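
A hedged sketch of the behavior this test asserts; the real implementation lives in tfx.utils.io_utils and may differ across TFX versions:

def all_files_pattern(file_pattern):
  """Returns a glob pattern matching all files that share the given prefix."""
  return '{}*'.format(file_pattern)

assert all_files_pattern('model') == 'model*'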
Example #21
  def Do(self, input_dict: Dict[Text, List[types.Artifact]],
         output_dict: Dict[Text, List[types.Artifact]],
         exec_properties: Dict[Text, Any]) -> None:
    """TensorFlow Transform executor entrypoint.

    This implements BaseExecutor.Do() and is invoked by orchestration systems.
    This is not intended for manual usage or further customization. Please use
    the Transform() function which takes an input format with no artifact
    dependency.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - input_data: A list of 'ExamplesPath' type which should contain two
          splits 'train' and 'eval'.
        - schema: A list of 'SchemaPath' type which should contain a single
          schema artifact.
      output_dict: Output dict from key to a list of artifacts, including:
        - transform_output: Output of 'tf.Transform', which includes an exported
          TensorFlow graph suitable for both training and serving;
        - transformed_examples: Materialized transformed examples, which
          includes both 'train' and 'eval' splits.
      exec_properties: A dict of execution properties, including:
        - module_file: The file path to a python module file, from which the
          'preprocessing_fn' function will be loaded.

    Returns:
      None
    """
    self._log_startup(input_dict, output_dict, exec_properties)
    train_data_uri = artifact_utils.get_split_uri(input_dict['input_data'],
                                                  'train')
    eval_data_uri = artifact_utils.get_split_uri(input_dict['input_data'],
                                                 'eval')
    schema_file = io_utils.get_only_uri_in_dir(
        artifact_utils.get_single_uri(input_dict['schema']))
    transform_output = artifact_utils.get_single_uri(
        output_dict['transform_output'])
    transformed_train_output = artifact_utils.get_split_uri(
        output_dict['transformed_examples'], 'train')
    transformed_eval_output = artifact_utils.get_split_uri(
        output_dict['transformed_examples'], 'eval')
    temp_path = os.path.join(transform_output, _TEMP_DIR_IN_TRANSFORM_OUTPUT)
    tf.logging.debug('Using temp path %s for tft.beam', temp_path)

    def _GetCachePath(label, params_dict):
      if label not in params_dict:
        return None
      else:
        return artifact_utils.get_single_uri(params_dict[label])

    label_inputs = {
        labels.COMPUTE_STATISTICS_LABEL:
            False,
        labels.SCHEMA_PATH_LABEL:
            schema_file,
        labels.EXAMPLES_DATA_FORMAT_LABEL:
            labels.FORMAT_TF_EXAMPLE,
        labels.ANALYZE_AND_TRANSFORM_DATA_PATHS_LABEL:
            io_utils.all_files_pattern(train_data_uri),
        labels.TRANSFORM_ONLY_DATA_PATHS_LABEL:
            io_utils.all_files_pattern(eval_data_uri),
        labels.TFT_STATISTICS_USE_TFDV_LABEL:
            True,
        labels.PREPROCESSING_FN:
            exec_properties['module_file'],
    }
    cache_input = _GetCachePath('cache_input_path', input_dict)
    if cache_input is not None:
      label_inputs[labels.CACHE_INPUT_PATH_LABEL] = cache_input

    label_outputs = {
        labels.TRANSFORM_METADATA_OUTPUT_PATH_LABEL: transform_output,
        labels.TRANSFORM_MATERIALIZE_OUTPUT_PATHS_LABEL: [
            os.path.join(transformed_train_output,
                         _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX),
            os.path.join(transformed_eval_output,
                         _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX),
        ],
        labels.TEMP_OUTPUT_LABEL: str(temp_path),
    }
    cache_output = _GetCachePath('cache_output_path', output_dict)
    if cache_output is not None:
      label_outputs[labels.CACHE_OUTPUT_PATH_LABEL] = cache_output
    status_file = 'status_file'  # Unused
    self.Transform(label_inputs, label_outputs, status_file)
    tf.logging.info('Cleaning up temp path %s on executor success', temp_path)
    io_utils.delete_dir(temp_path)
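
The Transform executor above loads a preprocessing_fn from exec_properties['module_file']. A hedged sketch of such a module file; the feature names 'x' and 'label' are illustrative assumptions:

import tensorflow_transform as tft


def preprocessing_fn(inputs):
  """tf.Transform callback: z-scores a numeric feature and passes the label through."""
  return {
      'x_scaled': tft.scale_to_z_score(inputs['x']),
      'label': inputs['label'],
  }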
Example #22
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """
        Write description regarding this beautiful executor.

        Args:
            input_dict:
            output_dict:
            exec_properties:
        """
        self._log_startup(input_dict, output_dict, exec_properties)

        schema = parse_schema(input_dict=input_dict)

        statistics = parse_statistics(
            split_name=DATA_SPLIT_NAME,
            statistics=input_dict[constants.STATISTICS])

        source = exec_properties[StepKeys.SOURCE]
        args = exec_properties[StepKeys.ARGS]

        # pass the schema and stats straight to the Step
        args[constants.SCHEMA] = schema
        args[constants.STATISTICS] = statistics

        c = source_utils.load_source_path_class(source)
        split_step: BaseSplit = c(**args)

        # infer the names of the splits from the config
        split_names = split_step.get_split_names()

        # Get output split path
        examples_artifact = artifact_utils.get_single_instance(
            output_dict[constants.OUTPUT_EXAMPLES])
        if SKIP in split_names:
            sanitized_names = [name for name in split_names if name != SKIP]
            examples_artifact.split_names = artifact_utils.encode_split_names(
                sanitized_names)
        else:
            examples_artifact.split_names = artifact_utils.encode_split_names(
                split_names)

        split_uris = []
        for artifact in input_dict[constants.INPUT_EXAMPLES]:
            for split in artifact_utils.decode_split_names(
                    artifact.split_names):
                uri = os.path.join(artifact.uri, split)
                split_uris.append((split, uri))

        with self._make_beam_pipeline() as p:
            # The outer loop will for now only run once
            for split, uri in split_uris:
                input_uri = io_utils.all_files_pattern(uri)

                new_splits = (
                    p
                    | 'ReadData.' + split >>
                    beam.io.ReadFromTFRecord(file_pattern=input_uri)
                    | beam.Map(tf.train.Example.FromString)
                    |
                    'Split' >> beam.Partition(split_step.partition_fn()[0],
                                              split_step.get_num_splits(),
                                              **split_step.partition_fn()[1]))

                for split_name, new_split in zip(split_names,
                                                 list(new_splits)):
                    if split_name != SKIP:
                        # WriteSplit function writes to TFRecord again
                        (new_split
                         | 'Serialize.' + split_name >>
                         beam.Map(lambda x: x.SerializeToString())
                         | 'WriteSplit_' + split_name >> WriteSplit(
                             get_split_uri(
                                 output_dict[constants.OUTPUT_EXAMPLES],
                                 split_name)))
Example #23
    def _run_model_inference(
        self,
        data_spec: bulk_inferrer_pb2.DataSpec,
        output_example_spec: bulk_inferrer_pb2.OutputExampleSpec,
        examples: List[types.Artifact],
        output_examples: Optional[types.Artifact],
        inference_result: Optional[types.Artifact],
        inference_endpoint: model_spec_pb2.InferenceSpecType,
    ) -> None:
        """Runs model inference on given examples data.

    Args:
      data_spec: bulk_inferrer_pb2.DataSpec instance.
      output_example_spec: bulk_inferrer_pb2.OutputExampleSpec instance.
      examples: List of `standard_artifacts.Examples` artifacts.
      output_examples: Optional output `standard_artifacts.Examples` artifact.
      inference_result: Optional output `standard_artifacts.InferenceResult`
        artifact.
      inference_endpoint: Model inference endpoint.
    """

        example_uris = {}
        for example_artifact in examples:
            for split in artifact_utils.decode_split_names(
                    example_artifact.split_names):
                if data_spec.example_splits:
                    if split in data_spec.example_splits:
                        example_uris[split] = artifact_utils.get_split_uri(
                            [example_artifact], split)
                else:
                    example_uris[split] = artifact_utils.get_split_uri(
                        [example_artifact], split)

        payload_format, _ = tfxio_utils.resolve_payload_format_and_data_view_uri(
            examples)

        tfxio_factory = tfxio_utils.get_tfxio_factory_from_artifact(
            examples,
            _TELEMETRY_DESCRIPTORS,
            schema=None,
            read_as_raw_records=True,
            # We have to specify this parameter in order to create a RawRecord TFXIO
            # but we won't use the RecordBatches so the column name of the raw
            # records does not matter.
            raw_record_column_name='unused')

        if output_examples:
            output_examples.split_names = artifact_utils.encode_split_names(
                sorted(example_uris.keys()))

        with self._make_beam_pipeline() as pipeline:
            data_list = []
            for split, example_uri in example_uris.items():
                tfxio = tfxio_factory(
                    [io_utils.all_files_pattern(example_uri)])
                assert isinstance(
                    tfxio, record_based_tfxio.RecordBasedTFXIO
                ), ('Unable to use TFXIO {} as it does not support reading raw records.'
                    .format(type(tfxio)))
                # pylint: disable=no-value-for-parameter
                data = (pipeline
                        | 'ReadData[{}]'.format(split) >>
                        tfxio.RawRecordBeamSource()
                        | 'RunInference[{}]'.format(split) >> _RunInference(
                            payload_format, inference_endpoint))
                if output_examples:
                    output_examples_split_uri = artifact_utils.get_split_uri(
                        [output_examples], split)
                    logging.info('Path of output examples split `%s` is %s.',
                                 split, output_examples_split_uri)
                    _ = (data
                         | 'WriteExamples[{}]'.format(split) >> _WriteExamples(
                             output_example_spec, output_examples_split_uri))
                    # pylint: enable=no-value-for-parameter

                data_list.append(data)

            if inference_result:
                _ = (
                    data_list
                    |
                    'FlattenInferenceResult' >> beam.Flatten(pipeline=pipeline)
                    | 'WritePredictionLogs' >> beam.io.WriteToTFRecord(
                        os.path.join(inference_result.uri,
                                     _PREDICTION_LOGS_FILE_NAME),
                        file_name_suffix='.gz',
                        coder=beam.coders.ProtoCoder(
                            prediction_log_pb2.PredictionLog)))

        if output_examples:
            logging.info('Output examples written to %s.', output_examples.uri)
        if inference_result:
            logging.info('Inference result written to %s.',
                         inference_result.uri)
Example #24
File: executor.py  Project: ktsitsi/tfx
  def Do(self, input_dict: Dict[Text, List[types.Artifact]],
         output_dict: Dict[Text, List[types.Artifact]],
         exec_properties: Dict[Text, Any]) -> None:
    """Runs a batch job to evaluate the eval_model against the given input.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - model_exports: exported model.
        - examples: examples used to evaluate the model.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: model evaluation results.
      exec_properties: A dict of execution properties.
        - eval_config: JSON string of tfma.EvalConfig.
        - feature_slicing_spec: JSON string of evaluator_pb2.FeatureSlicingSpec
          instance, providing the way to slice the data. Deprecated, use
          eval_config.slicing_specs instead.

    Returns:
      None
    """
    if constants.EXAMPLES_KEY not in input_dict:
      raise ValueError('EXAMPLES_KEY is missing from input dict.')
    if constants.MODEL_KEY not in input_dict:
      raise ValueError('MODEL_KEY is missing from input dict.')
    if constants.EVALUATION_KEY not in output_dict:
      raise ValueError('EVALUATION_KEY is missing from output dict.')
    if len(input_dict[constants.MODEL_KEY]) > 1:
      raise ValueError(
          'There can be only one candidate model, there are {}.'.format(
              len(input_dict[constants.MODEL_KEY])))
    if constants.BASELINE_MODEL_KEY in input_dict and len(
        input_dict[constants.BASELINE_MODEL_KEY]) > 1:
      raise ValueError(
          'There can be only one baseline model, there are {}.'.format(
              len(input_dict[constants.BASELINE_MODEL_KEY])))

    self._log_startup(input_dict, output_dict, exec_properties)

    # Add fairness indicator metric callback if necessary.
    fairness_indicator_thresholds = exec_properties.get(
        'fairness_indicator_thresholds', None)
    add_metrics_callbacks = None
    if fairness_indicator_thresholds:
      # Need to import the following module so that the fairness indicator
      # post-export metric is registered.
      import tensorflow_model_analysis.addons.fairness.post_export_metrics.fairness_indicators  # pylint: disable=g-import-not-at-top, unused-variable
      add_metrics_callbacks = [
          tfma.post_export_metrics.fairness_indicators(  # pytype: disable=module-attr
              thresholds=fairness_indicator_thresholds),
      ]

    def _get_eval_saved_model(artifact: List[types.Artifact],
                              tags=None) -> tfma.EvalSharedModel:
      model_uri = artifact_utils.get_single_uri(artifact)
      if tags and tf.saved_model.SERVING in tags:
        model_path = path_utils.serving_model_path(model_uri)
      else:
        model_path = path_utils.eval_model_path(model_uri)
      return tfma.default_eval_shared_model(
          eval_saved_model_path=model_path,
          tags=tags,
          add_metrics_callbacks=add_metrics_callbacks)

    output_uri = artifact_utils.get_single_uri(
        output_dict[constants.EVALUATION_KEY])

    run_validation = False
    if 'eval_config' in exec_properties and exec_properties['eval_config']:
      slice_spec = None
      eval_config = tfma.EvalConfig()
      json_format.Parse(exec_properties['eval_config'], eval_config)
      # Do not validate the model when no thresholds are configured. This avoids
      # accidentally blessing models when users forget to set thresholds.
      for metrics_spec in eval_config.metrics_specs:
        if (metrics_spec.thresholds or any(
            metric.HasField('threshold') for metric in metrics_spec.metrics)):
          run_validation = True
          break
      if len(eval_config.model_specs) > 2:
        raise ValueError(
            'Cannot support more than two models. There are {} models in this '
            'eval_config.'.format(len(eval_config.model_specs)))
      if not eval_config.model_specs:
        eval_config.model_specs.add()
      # Remove baseline model_spec and all change thresholds if there is no
      # baseline model provided.
      if not input_dict.get(constants.BASELINE_MODEL_KEY):
        tmp_model_specs = []
        for model_spec in eval_config.model_specs:
          if not model_spec.is_baseline:
            tmp_model_specs.append(model_spec)
        del eval_config.model_specs[:]
        eval_config.model_specs.extend(tmp_model_specs)
        absl.logging.info("""No baseline model provided, ignoring all
            baseline model_spec.""")
        for metrics_spec in eval_config.metrics_specs:
          for metric in metrics_spec.metrics:
            metric.threshold.ClearField('change_threshold')
          for threshold in metrics_spec.thresholds.values():
            threshold.ClearField('change_threshold')
        absl.logging.info("""No baseline model provided, ignoring all
            change thresholds.""")
      # Extract model artifacts.
      models = {}
      for model_spec in eval_config.model_specs:
        if model_spec.signature_name != 'eval':
          tags = [tf.saved_model.SERVING]
        else:
          tags = None
        if model_spec.is_baseline:
          models[model_spec.name] = _get_eval_saved_model(
              input_dict[constants.BASELINE_MODEL_KEY], tags)
          absl.logging.info('Using {} as baseline model.'.format(
              models[model_spec.name].model_path))
        else:
          models[model_spec.name] = _get_eval_saved_model(
              input_dict[constants.MODEL_KEY], tags)
          absl.logging.info('Using {} for model eval.'.format(
              models[model_spec.name].model_path))
    else:
      eval_config = None
      assert ('feature_slicing_spec' in exec_properties and
              exec_properties['feature_slicing_spec']
             ), 'both eval_config and feature_slicing_spec are unset.'
      feature_slicing_spec = evaluator_pb2.FeatureSlicingSpec()
      json_format.Parse(exec_properties['feature_slicing_spec'],
                        feature_slicing_spec)
      slice_spec = self._get_slice_spec_from_feature_slicing_spec(
          feature_slicing_spec)
      models = _get_eval_saved_model(input_dict[constants.MODEL_KEY])
      absl.logging.info('Using {} for model eval.'.format(models.model_path))

    absl.logging.info('Evaluating model.')
    with self._make_beam_pipeline() as pipeline:
      # pylint: disable=expression-not-assigned
      (pipeline
       | 'ReadData' >> beam.io.ReadFromTFRecord(
           file_pattern=io_utils.all_files_pattern(
               artifact_utils.get_split_uri(input_dict[constants.EXAMPLES_KEY],
                                            'eval')))
       |
       'ExtractEvaluateAndWriteResults' >> tfma.ExtractEvaluateAndWriteResults(
           eval_shared_model=models,
           eval_config=eval_config,
           output_path=output_uri,
           slice_spec=slice_spec))
    absl.logging.info(
        'Evaluation complete. Results written to {}.'.format(output_uri))

    if not run_validation:
      # TODO(jinhuang): delete the BLESSING_KEY from output_dict when supported.
      absl.logging.info('No threshold configured, will not validate model.')
      return
    # Set up blessing artifact
    blessing = artifact_utils.get_single_instance(
        output_dict[constants.BLESSING_KEY])
    blessing.set_string_custom_property(
        constants.ARTIFACT_PROPERTY_CURRENT_MODEL_URI_KEY,
        artifact_utils.get_single_uri(input_dict[constants.MODEL_KEY]))
    blessing.set_int_custom_property(
        constants.ARTIFACT_PROPERTY_CURRENT_MODEL_ID_KEY,
        input_dict[constants.MODEL_KEY][0].id)
    if input_dict.get(constants.BASELINE_MODEL_KEY):
      baseline_model = input_dict[constants.BASELINE_MODEL_KEY][0]
      blessing.set_string_custom_property(
          constants.ARTIFACT_PROPERTY_BASELINE_MODEL_URI_KEY,
          baseline_model.uri)
      blessing.set_int_custom_property(
          constants.ARTIFACT_PROPERTY_BASELINE_MODEL_ID_KEY, baseline_model.id)
    if 'current_component_id' in exec_properties:
      blessing.set_string_custom_property(
          'component_id', exec_properties['current_component_id'])
    # Check validation result and write BLESSED file accordingly.
    validation_file = os.path.join(output_uri, tfma.constants.VALIDATIONS_KEY)
    absl.logging.info('Checking validation results.')
    validation_result = tfma.load_validation_result(validation_file)
    if validation_result.validation_ok:
      io_utils.write_string_file(
          os.path.join(blessing.uri, constants.BLESSED_FILE_NAME), '')
      blessing.set_int_custom_property(constants.ARTIFACT_PROPERTY_BLESSED_KEY,
                                       constants.BLESSED_VALUE)
    else:
      io_utils.write_string_file(
          os.path.join(blessing.uri, constants.NOT_BLESSED_FILE_NAME), '')
      blessing.set_int_custom_property(constants.ARTIFACT_PROPERTY_BLESSED_KEY,
                                       constants.NOT_BLESSED_VALUE)
    absl.logging.info('Blessing result {} written to {}.'.format(
        validation_result.validation_ok, blessing.uri))
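
A note on the eval_config exec_property consumed above: validation only runs when at least one metric carries a threshold. The snippet below is a minimal sketch of such a config, serialized the way the executor reads it from exec_properties['eval_config']; the metric choice and the 0.7 bound are illustrative and not taken from any project listed here.

import tensorflow_model_analysis as tfma
from google.protobuf import json_format

eval_config = tfma.EvalConfig(
    model_specs=[tfma.ModelSpec(label_key='label')],
    metrics_specs=[
        tfma.MetricsSpec(metrics=[
            tfma.MetricConfig(
                class_name='BinaryAccuracy',
                threshold=tfma.MetricThreshold(
                    value_threshold=tfma.GenericValueThreshold(
                        lower_bound={'value': 0.7})))
        ])
    ],
    slicing_specs=[tfma.SlicingSpec()])
# Any configured threshold makes run_validation True in the executor above.
exec_properties = {'eval_config': json_format.MessageToJson(eval_config)}
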
Example #25
 def testAllFilesPattern(self):
     self.assertEqual('model*', io_utils.all_files_pattern('model'))
Example #26
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Runs a batch job to evaluate the eval_model against the given input.

        Args:
          input_dict: Input dict from input key to a list of Artifacts.
            - model_exports: exported model.
            - examples: examples used to evaluate the model.
          output_dict: Output dict from output key to a list of Artifacts.
            - output: model evaluation results.
          exec_properties: A dict of execution properties.
            - eval_config: JSON string of tfma.EvalConfig.
            - feature_slicing_spec: JSON string of evaluator_pb2.FeatureSlicingSpec
              instance, providing the way to slice the data. Deprecated, use
              eval_config.slicing_specs instead.

        Returns:
          None
        """
        if constants.EXAMPLES_KEY not in input_dict:
            raise ValueError('EXAMPLES_KEY is missing from input dict.')
        if constants.MODEL_KEY not in input_dict:
            raise ValueError('MODEL_KEY is missing from input dict.')
        if constants.EVALUATION_KEY not in output_dict:
            raise ValueError('EVALUATION_KEY is missing from output dict.')
        if len(input_dict[constants.MODEL_KEY]) > 1:
            raise ValueError(
                'There can be only one candidate model, there are {}.'.format(
                    len(input_dict[constants.MODEL_KEY])))
        if constants.BASELINE_MODEL_KEY in input_dict and len(
                input_dict[constants.BASELINE_MODEL_KEY]) > 1:
            raise ValueError(
                'There can be only one baseline model, there are {}.'.format(
                    len(input_dict[constants.BASELINE_MODEL_KEY])))

        self._log_startup(input_dict, output_dict, exec_properties)

        # Add fairness indicator metric callback if necessary.
        fairness_indicator_thresholds = exec_properties.get(
            'fairness_indicator_thresholds', None)
        add_metrics_callbacks = None
        if fairness_indicator_thresholds:
            # Need to import the following module so that the fairness indicator
            # post-export metric is registered.
            import tensorflow_model_analysis.addons.fairness.post_export_metrics.fairness_indicators  # pylint: disable=g-import-not-at-top, unused-variable
            add_metrics_callbacks = [
                tfma.post_export_metrics.fairness_indicators(  # pytype: disable=module-attr
                    thresholds=fairness_indicator_thresholds),
            ]

        output_uri = artifact_utils.get_single_uri(
            output_dict[constants.EVALUATION_KEY])

        run_validation = False
        models = []
        if 'eval_config' in exec_properties and exec_properties['eval_config']:
            slice_spec = None
            has_baseline = bool(input_dict.get(constants.BASELINE_MODEL_KEY))
            eval_config = tfma.EvalConfig()
            json_format.Parse(exec_properties['eval_config'], eval_config)
            eval_config = tfma.update_eval_config_with_defaults(
                eval_config,
                maybe_add_baseline=has_baseline,
                maybe_remove_baseline=not has_baseline)
            tfma.verify_eval_config(eval_config)
            # Do not validate the model when no thresholds are configured. This
            # avoids accidentally blessing models when users forget to set
            # thresholds.
            run_validation = bool(
                tfma.metrics.metric_thresholds_from_metrics_specs(
                    eval_config.metrics_specs))
            if len(eval_config.model_specs) > 2:
                raise ValueError(
                    'Cannot support more than two models. There are {} models '
                    'in this eval_config.'.format(len(eval_config.model_specs)))
            # Extract model artifacts.
            for model_spec in eval_config.model_specs:
                if model_spec.is_baseline:
                    model_uri = artifact_utils.get_single_uri(
                        input_dict[constants.BASELINE_MODEL_KEY])
                else:
                    model_uri = artifact_utils.get_single_uri(
                        input_dict[constants.MODEL_KEY])
                if tfma.get_model_type(model_spec) == tfma.TF_ESTIMATOR:
                    model_path = path_utils.eval_model_path(model_uri)
                else:
                    model_path = path_utils.serving_model_path(model_uri)
                absl.logging.info('Using {} as {} model.'.format(
                    model_path, model_spec.name))
                models.append(
                    tfma.default_eval_shared_model(
                        model_name=model_spec.name,
                        eval_saved_model_path=model_path,
                        add_metrics_callbacks=add_metrics_callbacks,
                        eval_config=eval_config))
        else:
            eval_config = None
            assert ('feature_slicing_spec' in exec_properties
                    and exec_properties['feature_slicing_spec']
                    ), 'both eval_config and feature_slicing_spec are unset.'
            feature_slicing_spec = evaluator_pb2.FeatureSlicingSpec()
            json_format.Parse(exec_properties['feature_slicing_spec'],
                              feature_slicing_spec)
            slice_spec = self._get_slice_spec_from_feature_slicing_spec(
                feature_slicing_spec)
            model_uri = artifact_utils.get_single_uri(
                input_dict[constants.MODEL_KEY])
            model_path = path_utils.eval_model_path(model_uri)
            absl.logging.info('Using {} for model eval.'.format(model_path))
            models.append(
                tfma.default_eval_shared_model(
                    eval_saved_model_path=model_path,
                    add_metrics_callbacks=add_metrics_callbacks))

        file_pattern = io_utils.all_files_pattern(
            artifact_utils.get_split_uri(input_dict[constants.EXAMPLES_KEY],
                                         'eval'))
        eval_shared_model = models[0] if len(models) == 1 else models
        schema = None
        if constants.SCHEMA_KEY in input_dict:
            schema = io_utils.SchemaReader().read(
                io_utils.get_only_uri_in_dir(
                    artifact_utils.get_single_uri(
                        input_dict[constants.SCHEMA_KEY])))

        absl.logging.info('Evaluating model.')
        with self._make_beam_pipeline() as pipeline:
            # pylint: disable=expression-not-assigned
            if _USE_TFXIO:
                tensor_adapter_config = None
                if tfma.is_batched_input(eval_shared_model, eval_config):
                    tfxio = tf_example_record.TFExampleRecord(
                        file_pattern=file_pattern,
                        schema=schema,
                        raw_record_column_name=tfma.BATCHED_INPUT_KEY)
                    if schema is not None:
                        tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
                            arrow_schema=tfxio.ArrowSchema(),
                            tensor_representations=tfxio.TensorRepresentations(
                            ))
                    data = pipeline | 'ReadFromTFRecordToArrow' >> tfxio.BeamSource(
                    )
                else:
                    data = pipeline | 'ReadFromTFRecord' >> beam.io.ReadFromTFRecord(
                        file_pattern=file_pattern)
                (data
                 | 'ExtractEvaluateAndWriteResults' >>
                 tfma.ExtractEvaluateAndWriteResults(
                     eval_shared_model=eval_shared_model,
                     eval_config=eval_config,
                     output_path=output_uri,
                     slice_spec=slice_spec,
                     tensor_adapter_config=tensor_adapter_config))
            else:
                data = pipeline | 'ReadFromTFRecord' >> beam.io.ReadFromTFRecord(
                    file_pattern=file_pattern)
                (data
                 | 'ExtractEvaluateAndWriteResults' >>
                 tfma.ExtractEvaluateAndWriteResults(
                     eval_shared_model=eval_shared_model,
                     eval_config=eval_config,
                     output_path=output_uri,
                     slice_spec=slice_spec))
        absl.logging.info(
            'Evaluation complete. Results written to {}.'.format(output_uri))

        if not run_validation:
            # TODO(jinhuang): delete the BLESSING_KEY from output_dict when supported.
            absl.logging.info(
                'No threshold configured, will not validate model.')
            return
        # Set up blessing artifact
        blessing = artifact_utils.get_single_instance(
            output_dict[constants.BLESSING_KEY])
        blessing.set_string_custom_property(
            constants.ARTIFACT_PROPERTY_CURRENT_MODEL_URI_KEY,
            artifact_utils.get_single_uri(input_dict[constants.MODEL_KEY]))
        blessing.set_int_custom_property(
            constants.ARTIFACT_PROPERTY_CURRENT_MODEL_ID_KEY,
            input_dict[constants.MODEL_KEY][0].id)
        if input_dict.get(constants.BASELINE_MODEL_KEY):
            baseline_model = input_dict[constants.BASELINE_MODEL_KEY][0]
            blessing.set_string_custom_property(
                constants.ARTIFACT_PROPERTY_BASELINE_MODEL_URI_KEY,
                baseline_model.uri)
            blessing.set_int_custom_property(
                constants.ARTIFACT_PROPERTY_BASELINE_MODEL_ID_KEY,
                baseline_model.id)
        if 'current_component_id' in exec_properties:
            blessing.set_string_custom_property(
                'component_id', exec_properties['current_component_id'])
        # Check validation result and write BLESSED file accordingly.
        absl.logging.info('Checking validation results.')
        validation_result = tfma.load_validation_result(output_uri)
        if validation_result.validation_ok:
            io_utils.write_string_file(
                os.path.join(blessing.uri, constants.BLESSED_FILE_NAME), '')
            blessing.set_int_custom_property(
                constants.ARTIFACT_PROPERTY_BLESSED_KEY,
                constants.BLESSED_VALUE)
        else:
            io_utils.write_string_file(
                os.path.join(blessing.uri, constants.NOT_BLESSED_FILE_NAME),
                '')
            blessing.set_int_custom_property(
                constants.ARTIFACT_PROPERTY_BLESSED_KEY,
                constants.NOT_BLESSED_VALUE)
        absl.logging.info('Blessing result {} written to {}.'.format(
            validation_result.validation_ok, blessing.uri))
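
This newer variant lets tfma.update_eval_config_with_defaults add or strip the baseline model_spec depending on whether a baseline artifact was actually supplied. Below is a hedged sketch of an eval_config that pairs a candidate with a baseline and uses a change threshold; the model names, the AUC metric, and the near-zero bound are illustrative only.

import tensorflow_model_analysis as tfma

eval_config = tfma.EvalConfig(
    model_specs=[
        tfma.ModelSpec(name='candidate', label_key='label'),
        tfma.ModelSpec(name='baseline', label_key='label', is_baseline=True),
    ],
    metrics_specs=[
        tfma.MetricsSpec(metrics=[
            tfma.MetricConfig(
                class_name='AUC',
                threshold=tfma.MetricThreshold(
                    change_threshold=tfma.GenericChangeThreshold(
                        direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                        absolute={'value': -1e-10})))
        ])
    ],
    slicing_specs=[tfma.SlicingSpec()])
# When no baseline artifact is provided, the executor above removes the baseline
# spec (and with it the change thresholds) via update_eval_config_with_defaults.
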
Example #27
  def Do(self, input_dict: Dict[Text, List[types.Artifact]],
         output_dict: Dict[Text, List[types.Artifact]],
         exec_properties: Dict[Text, Any]) -> None:
    """Get human review result on a model through Slack channel.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - model_export: exported model from trainer.
        - model_blessing: model blessing path from model_validator.
      output_dict: Output dict from key to a list of artifacts, including:
        - slack_blessing: model blessing result.
      exec_properties: A dict of execution properties, including:
        - slack_token: Token used to setup connection with slack server.
        - slack_channel_id: The id of the Slack channel to send and receive
          messages.
        - timeout_sec: How long do we wait for response, in seconds.

    Returns:
      None

    Raises:
      TimeoutError:
        When there is no decision made within timeout_sec.
      ConnectionError:
        When connection to slack server cannot be established.

    """
    self._log_startup(input_dict, output_dict, exec_properties)
    transform_graph_uri = artifact_utils.get_single_uri(
        input_dict[TRANSFORM_GRAPH_KEY])
    temp_path = os.path.join(transform_graph_uri, _TEMP_DIR_IN_TRANSFORM_OUTPUT)
    # transformed_schema_file = os.path.join(
    #   transform_graph_uri,
    #   tft.TFTransformOutput.TRANSFORMED_METADATA_DIR,
    #   'schema.pbtxt'
    # )
    # transformed_schema_proto = io_utils.parse_pbtxt_file(
    #   transformed_schema_file,
    #   schema_pb2.Schema()
    # )
    transformed_train_output = artifact_utils.get_split_uri(
      output_dict[TRANSFORMED_EXAMPLES_KEY], 'train')
    transformed_eval_output = artifact_utils.get_split_uri(
      output_dict[TRANSFORMED_EXAMPLES_KEY], 'eval')

    tf_transform_output = tft.TFTransformOutput(transform_graph_uri)
    # transform_output_dataset_metadata = dataset_metadata.DatasetMetadata(
    #   schema=transformed_schema_proto
    # )

    # transform_fn = (tf_transform_output.transform_raw_features, transform_output_dataset_metadata)
    # feature_spec = schema_utils.schema_as_feature_spec(schema_proto).feature_spec
    schema_file = io_utils.get_only_uri_in_dir(
        artifact_utils.get_single_uri(input_dict[SCHEMA_KEY]))
    schema_proto = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())
    transform_input_dataset_metadata = dataset_metadata.DatasetMetadata(
      schema_proto
    )

    train_data_uri = artifact_utils.get_split_uri(
      input_dict[EXAMPLES_KEY],
      'train'
    )
    eval_data_uri = artifact_utils.get_split_uri(
      input_dict[EXAMPLES_KEY],
      'eval'
    )
    analyze_data_paths = [io_utils.all_files_pattern(train_data_uri)]
    transform_data_paths = [
      io_utils.all_files_pattern(train_data_uri),
      io_utils.all_files_pattern(eval_data_uri),
    ]
    materialize_output_paths = [
      os.path.join(transformed_train_output, _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX),
      os.path.join(transformed_eval_output, _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX)
    ]
    transform_data_list = self._MakeDatasetList(
      transform_data_paths,
      materialize_output_paths
    )
    analyze_data_list = self._MakeDatasetList(
      analyze_data_paths,
    )

    with self._make_beam_pipeline() as pipeline:
      with tft_beam.Context(temp_dir=temp_path):
        # NOTE: Unclear if there is a difference between input_dataset_metadata
        # and transform_input_dataset_metadata. Look at Transform executor.
        decode_fn = tft.coders.ExampleProtoCoder(schema_proto, serialized=True).decode

        input_analysis_data = {}
        for dataset in analyze_data_list:
          infix = 'AnalysisIndex{}'.format(dataset.index)
          dataset.serialized = (
            pipeline
            | 'ReadDataset[{}]'.format(infix) >> self._ReadExamples(
                dataset, transform_input_dataset_metadata))
          dataset.decoded = (
            dataset.serialized
            | 'Decode[{}]'.format(infix)
            >> self._DecodeInputs(decode_fn))
          input_analysis_data[dataset.dataset_key] = dataset.decoded

        if not hasattr(tft_beam.analyzer_cache, 'DatasetKey'):
          input_analysis_data = (
              [
                  dataset for dataset in input_analysis_data.values()
                  if dataset is not None
              ]
              | 'FlattenAnalysisDatasetsBecauseItIsRequired' >>
              beam.Flatten(pipeline=pipeline))

        transform_fn = (
            (input_analysis_data, transform_input_dataset_metadata)
            | 'Analyze' >> tft_beam.AnalyzeDataset(
                tf_transform_output.transform_raw_features, pipeline=pipeline))

        for dataset in transform_data_list:
          infix = 'TransformIndex{}'.format(dataset.index)
          dataset.serialized = (
            pipeline
            | 'ReadDataset[{}]'.format(infix) >> self._ReadExamples(
                dataset, transform_input_dataset_metadata))

          dataset.decoded = (
            dataset.serialized
            | 'Decode[{}]'.format(infix)
            >> self._DecodeInputs(decode_fn))

          dataset.transformed, metadata = (
              ((dataset.decoded, transform_input_dataset_metadata), transform_fn)
              | 'Transform[{}]'.format(infix) >> tft_beam.TransformDataset())

          dataset.transformed_and_serialized = (
              dataset.transformed
              | 'EncodeAndSerialize[{}]'.format(infix)
              >> beam.ParDo(self._EncodeAsSerializedExamples(), _GetSchemaProto(metadata)))

          _ = (
            dataset.transformed_and_serialized
            | 'Materialize[{}]'.format(infix) >> self._WriteExamples(dataset.materialize_output_path))
Example #28
  def Do(self, input_dict: Dict[Text, List[types.Artifact]],
         output_dict: Dict[Text, List[types.Artifact]],
         exec_properties: Dict[Text, Any]) -> None:
    """Runs a batch job to evaluate the eval_model against the given input.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - model_exports: exported model.
        - examples: examples used to evaluate the model.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: model evaluation results.
      exec_properties: A dict of execution properties.
        - feature_slicing_spec: JSON string of evaluator_pb2.FeatureSlicingSpec
          instance, providing the way to slice the data.

    Returns:
      None
    """
    if 'model_exports' not in input_dict:
      raise ValueError('\'model_exports\' is missing in input dict.')
    if 'examples' not in input_dict:
      raise ValueError('\'examples\' is missing in input dict.')
    if 'output' not in output_dict:
      raise ValueError('\'output\' is missing in output dict.')

    self._log_startup(input_dict, output_dict, exec_properties)

    # Extract input artifacts
    model_exports_uri = artifact_utils.get_single_uri(
        input_dict['model_exports'])

    feature_slicing_spec = evaluator_pb2.FeatureSlicingSpec()
    json_format.Parse(exec_properties['feature_slicing_spec'],
                      feature_slicing_spec)
    slice_spec = self._get_slice_spec_from_feature_slicing_spec(
        feature_slicing_spec)

    output_uri = artifact_utils.get_single_uri(output_dict['output'])

    eval_model_path = path_utils.eval_model_path(model_exports_uri)

    # Add fairness indicator metric callback if necessary.
    fairness_indicator_thresholds = exec_properties.get(
        'fairness_indicator_thresholds', None)
    add_metrics_callbacks = None
    if fairness_indicator_thresholds:
      # Need to import the following module so that the fairness indicator
      # post-export metric is registered.
      import tensorflow_model_analysis.addons.fairness.post_export_metrics.fairness_indicators  # pylint: disable=g-import-not-at-top, unused-variable
      add_metrics_callbacks = [
          tfma.post_export_metrics.fairness_indicators(  # pytype: disable=module-attr
              thresholds=fairness_indicator_thresholds),
      ]

    absl.logging.info('Using {} for model eval.'.format(eval_model_path))
    eval_shared_model = tfma.default_eval_shared_model(
        eval_saved_model_path=eval_model_path,
        add_metrics_callbacks=add_metrics_callbacks)

    absl.logging.info('Evaluating model.')
    with self._make_beam_pipeline() as pipeline:
      # pylint: disable=expression-not-assigned
      (pipeline
       | 'ReadData' >> beam.io.ReadFromTFRecord(
           file_pattern=io_utils.all_files_pattern(
               artifact_utils.get_split_uri(input_dict['examples'], 'eval')))
       |
       'ExtractEvaluateAndWriteResults' >> tfma.ExtractEvaluateAndWriteResults(
           eval_shared_model=eval_shared_model,
           slice_spec=slice_spec,
           output_path=output_uri))
    absl.logging.info(
        'Evaluation complete. Results written to {}.'.format(output_uri))
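
Unlike the later variants, this executor is configured solely through the deprecated feature_slicing_spec execution property. A hedged sketch of how such a spec could be built and serialized follows; 'weekday' stands in for a real feature name.

from google.protobuf import json_format
from tfx.proto import evaluator_pb2

feature_slicing_spec = evaluator_pb2.FeatureSlicingSpec(specs=[
    evaluator_pb2.SingleSlicingSpec(),  # overall slice
    evaluator_pb2.SingleSlicingSpec(column_for_slicing=['weekday']),
])
exec_properties = {
    'feature_slicing_spec': json_format.MessageToJson(feature_slicing_spec),
}
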
Example #29
File: executor.py Project: jay90099/tfx
  def Do(self, input_dict: Dict[str, List[types.Artifact]],
         output_dict: Dict[str, List[types.Artifact]],
         exec_properties: Dict[str, Any]) -> None:
    """Computes stats for each split of input using tensorflow_data_validation.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - examples: A list of type `standard_artifacts.Examples`. This should
          contain both the 'train' and 'eval' splits.
        - schema: Optionally, a list of type `standard_artifacts.Schema`. When
          the stats_options exec_property also contains a schema, this input
          should not be provided.
      output_dict: Output dict from output key to a list of Artifacts.
        - statistics: A list of type `standard_artifacts.ExampleStatistics`.
          This should contain both the 'train' and 'eval' splits.
      exec_properties: A dict of execution properties.
        - stats_options_json: Optionally, a JSON representation of StatsOptions.
          When a schema is provided as an input, the StatsOptions value should
          not also contain a schema.
        - exclude_splits: JSON-serialized list of names of splits where
          statistics and sample should not be generated.

    Raises:
      ValueError when a schema is provided both as an input and as part of the
      StatsOptions exec_property.

    Returns:
      None
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    # Load and deserialize exclude splits from execution properties.
    exclude_splits = json_utils.loads(
        exec_properties.get(standard_component_specs.EXCLUDE_SPLITS_KEY,
                            'null')) or []
    if not isinstance(exclude_splits, list):
      raise ValueError('exclude_splits in execution properties needs to be a '
                       'list. Got %s instead.' % type(exclude_splits))
    # Setup output splits.
    examples = artifact_utils.get_single_instance(
        input_dict[standard_component_specs.EXAMPLES_KEY])
    examples_split_names = artifact_utils.decode_split_names(
        examples.split_names)
    split_names = [
        split for split in examples_split_names if split not in exclude_splits
    ]
    statistics_artifact = artifact_utils.get_single_instance(
        output_dict[standard_component_specs.STATISTICS_KEY])
    statistics_artifact.split_names = artifact_utils.encode_split_names(
        split_names)

    stats_options = options.StatsOptions()
    stats_options_json = exec_properties.get(
        standard_component_specs.STATS_OPTIONS_JSON_KEY)
    if stats_options_json:
      # TODO(b/150802589): Move jsonable interface to tfx_bsl and use
      # json_utils
      stats_options = options.StatsOptions.from_json(stats_options_json)
    if input_dict.get(standard_component_specs.SCHEMA_KEY):
      if stats_options.schema:
        raise ValueError('A schema was provided as an input and the '
                         'stats_options exec_property also contains a schema '
                         'value. At most one of these may be set.')
      else:
        schema = io_utils.SchemaReader().read(
            io_utils.get_only_uri_in_dir(
                artifact_utils.get_single_uri(
                    input_dict[standard_component_specs.SCHEMA_KEY])))
        stats_options.schema = schema

    split_and_tfxio = []
    tfxio_factory = tfxio_utils.get_tfxio_factory_from_artifact(
        examples=[examples],
        telemetry_descriptors=_TELEMETRY_DESCRIPTORS)
    for split in artifact_utils.decode_split_names(examples.split_names):
      if split in exclude_splits:
        continue

      uri = artifact_utils.get_split_uri([examples], split)
      split_and_tfxio.append(
          (split, tfxio_factory(io_utils.all_files_pattern(uri))))
    with self._make_beam_pipeline() as p:
      for split, tfxio in split_and_tfxio:
        logging.info('Generating statistics for split %s.', split)
        output_uri = artifact_utils.get_split_uri(
            output_dict[standard_component_specs.STATISTICS_KEY], split)
        output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
        data = p | 'TFXIORead[%s]' % split >> tfxio.BeamSource()
        _ = (
            data
            | 'GenerateStatistics[%s]' % split >>
            stats_api.GenerateStatistics(stats_options)
            | 'WriteStatsOutput[%s]' % split >>
            stats_api.WriteStatisticsToBinaryFile(output_path))
        logging.info('Statistics for split %s written to %s.', split,
                     output_uri)
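
A hedged sketch of execution properties for the executor above. The key strings are assumed here to be 'exclude_splits' and 'stats_options_json' (mirroring the standard_component_specs constants it reads), and the StatsOptions settings are illustrative; the executor parses them back via StatsOptions.from_json().

import json

from tensorflow_data_validation import StatsOptions

exec_properties = {
    # Skip statistics for the 'eval' split.
    'exclude_splits': json.dumps(['eval']),
    # Serialized StatsOptions; must not contain a schema when a schema
    # artifact is also provided as an input.
    'stats_options_json': StatsOptions(num_top_values=20).to_json(),
}
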
Example #30
    def Do(self, input_dict: Dict[Text, List[Artifact]],
           output_dict: Dict[Text, List[Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        split_uris: List[Text] = []
        for artifact in input_dict[executor.EXAMPLES_KEY]:
            for split in artifact_utils.decode_split_names(
                    artifact.split_names):
                split_uris.append(split)

        self._log_startup(input_dict, output_dict, exec_properties)
        data_uris = []
        for split in split_uris:
            data_uris.append(
                artifact_utils.get_split_uri(input_dict[executor.EXAMPLES_KEY],
                                             split))

        schema_file = io_utils.get_only_uri_in_dir(
            artifact_utils.get_single_uri(input_dict[executor.SCHEMA_KEY]))
        transform_output = artifact_utils.get_single_uri(
            output_dict[executor.TRANSFORM_GRAPH_KEY])
        transformed_data_uris = []
        for split in split_uris:
            transformed_data_uris.append(
                artifact_utils.get_split_uri(
                    output_dict[executor.TRANSFORMED_EXAMPLES_KEY], split))
        temp_path = os.path.join(transform_output,
                                 executor._TEMP_DIR_IN_TRANSFORM_OUTPUT)
        logging.debug('Using temp path %s for tft.beam', temp_path)

        def _GetCachePath(label, params_dict):
            if label not in params_dict:
                return None
            else:
                return artifact_utils.get_single_uri(params_dict[label])

        label_inputs = {
            labels.COMPUTE_STATISTICS_LABEL:
            False,
            labels.SCHEMA_PATH_LABEL:
            schema_file,
            labels.EXAMPLES_DATA_FORMAT_LABEL:
            labels.FORMAT_TF_EXAMPLE,
            labels.ANALYZE_DATA_PATHS_LABEL:
            io_utils.all_files_pattern(data_uris[0]),
            labels.ANALYZE_PATHS_FILE_FORMATS_LABEL:
            labels.FORMAT_TFRECORD,
            labels.TRANSFORM_DATA_PATHS_LABEL:
            [io_utils.all_files_pattern(uri) for uri in data_uris],
            labels.TRANSFORM_PATHS_FILE_FORMATS_LABEL:
            [labels.FORMAT_TFRECORD for uri in data_uris],
            labels.TFT_STATISTICS_USE_TFDV_LABEL:
            True,
            labels.MODULE_FILE:
            exec_properties.get('module_file', None),
            labels.PREPROCESSING_FN:
            exec_properties.get('preprocessing_fn', None),
            # TODO(b/149754658): switch to True once the TFXIO integration is
            # complete.
            labels.USE_TFXIO_LABEL:
            False,
        }
        cache_input = _GetCachePath('cache_input_path', input_dict)
        if cache_input is not None:
            label_inputs[labels.CACHE_INPUT_PATH_LABEL] = cache_input

        label_outputs = {
            labels.TRANSFORM_METADATA_OUTPUT_PATH_LABEL:
            transform_output,
            labels.TRANSFORM_MATERIALIZE_OUTPUT_PATHS_LABEL: [
                os.path.join(uri,
                             executor._DEFAULT_TRANSFORMED_EXAMPLES_PREFIX)
                for uri in transformed_data_uris
            ],
            labels.TEMP_OUTPUT_LABEL:
            str(temp_path),
        }
        cache_output = _GetCachePath('cache_output_path', output_dict)
        if cache_output is not None:
            label_outputs[labels.CACHE_OUTPUT_PATH_LABEL] = cache_output
        status_file = 'status_file'  # Unused
        self.Transform(label_inputs, label_outputs, status_file)
        logging.debug('Cleaning up temp path %s on executor success',
                      temp_path)
        io_utils.delete_dir(temp_path)
Example #31
def get_common_fn_args(input_dict: Dict[Text, List[types.Artifact]],
                       exec_properties: Dict[Text, Any],
                       working_dir: Text = None) -> FnArgs:
  """Get common args of training and tuning."""
  if input_dict.get(standard_component_specs.TRANSFORM_GRAPH_KEY):
    transform_graph_path = artifact_utils.get_single_uri(
        input_dict[standard_component_specs.TRANSFORM_GRAPH_KEY])
  else:
    transform_graph_path = None

  if input_dict.get(standard_component_specs.SCHEMA_KEY):
    schema_path = io_utils.get_only_uri_in_dir(
        artifact_utils.get_single_uri(
            input_dict[standard_component_specs.SCHEMA_KEY]))
  else:
    schema_path = None

  train_args = trainer_pb2.TrainArgs()
  eval_args = trainer_pb2.EvalArgs()
  proto_utils.json_to_proto(
      exec_properties[standard_component_specs.TRAIN_ARGS_KEY], train_args)
  proto_utils.json_to_proto(
      exec_properties[standard_component_specs.EVAL_ARGS_KEY], eval_args)

  # Default behavior is train on `train` split (when splits is empty in train
  # args) and evaluate on `eval` split (when splits is empty in eval args).
  if not train_args.splits:
    train_args.splits.append('train')
    absl.logging.info("Train on the 'train' split when train_args.splits is "
                      'not set.')
  if not eval_args.splits:
    eval_args.splits.append('eval')
    absl.logging.info("Evaluate on the 'eval' split when eval_args.splits is "
                      'not set.')

  train_files = []
  for train_split in train_args.splits:
    train_files.extend([
        io_utils.all_files_pattern(uri)
        for uri in artifact_utils.get_split_uris(
            input_dict[standard_component_specs.EXAMPLES_KEY], train_split)
    ])

  eval_files = []
  for eval_split in eval_args.splits:
    eval_files.extend([
        io_utils.all_files_pattern(uri)
        for uri in artifact_utils.get_split_uris(
            input_dict[standard_component_specs.EXAMPLES_KEY], eval_split)
    ])

  data_accessor = DataAccessor(
      tf_dataset_factory=tfxio_utils.get_tf_dataset_factory_from_artifact(
          input_dict[standard_component_specs.EXAMPLES_KEY],
          _TELEMETRY_DESCRIPTORS),
      record_batch_factory=tfxio_utils.get_record_batch_factory_from_artifact(
          input_dict[standard_component_specs.EXAMPLES_KEY],
          _TELEMETRY_DESCRIPTORS),
      data_view_decode_fn=tfxio_utils.get_data_view_decode_fn_from_artifact(
          input_dict[standard_component_specs.EXAMPLES_KEY],
          _TELEMETRY_DESCRIPTORS)
      )

  # https://github.com/tensorflow/tfx/issues/45: Replace num_steps=0 with
  # num_steps=None.  Conversion of the proto to python will set the default
  # value of an int as 0 so modify the value here.  Tensorflow will raise an
  # error if num_steps <= 0.
  train_steps = train_args.num_steps or None
  eval_steps = eval_args.num_steps or None

  # Load and deserialize custom config from execution properties.
  # Note that in the component interface the default serialization of custom
  # config is 'null' instead of '{}'. Therefore we need to default the
  # json_utils.loads to 'null' then populate it with an empty dict when
  # needed.
  custom_config = json_utils.loads(
      exec_properties.get(standard_component_specs.CUSTOM_CONFIG_KEY, 'null'))

  return FnArgs(
      working_dir=working_dir,
      train_files=train_files,
      eval_files=eval_files,
      train_steps=train_steps,
      eval_steps=eval_steps,
      schema_path=schema_path,
      transform_graph_path=transform_graph_path,
      data_accessor=data_accessor,
      custom_config=custom_config,
  )
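
A hedged sketch of the execution properties get_common_fn_args consumes. The key strings are assumed to match the standard_component_specs constants ('train_args', 'eval_args', 'custom_config'), and the step counts are purely illustrative.

from google.protobuf import json_format
from tfx.proto import trainer_pb2

exec_properties = {
    'train_args': json_format.MessageToJson(
        trainer_pb2.TrainArgs(num_steps=1000)),
    'eval_args': json_format.MessageToJson(
        trainer_pb2.EvalArgs(num_steps=100)),
    # Default serialization of an unset custom_config is 'null'.
    'custom_config': 'null',
}
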
Example #32
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """
        Main execution logic for the Sequencer component

        :param input_dict: input channels
        :param output_dict: output channels
        :param exec_properties: the execution properties defined in the spec
        """

        source = exec_properties[StepKeys.SOURCE]
        args = exec_properties[StepKeys.ARGS]

        c = source_utils.load_source_path_class(source)

        # Get the schema
        schema_path = io_utils.get_only_uri_in_dir(
            artifact_utils.get_single_uri(input_dict[constants.SCHEMA]))
        schema = io_utils.SchemaReader().read(schema_path)

        # TODO: Getting the statistics might help the future implementations

        sequence_step: BaseSequencerStep = c(schema=schema,
                                             statistics=None,
                                             **args)

        # Get split names
        input_artifact = artifact_utils.get_single_instance(
            input_dict[constants.INPUT_EXAMPLES])
        split_names = artifact_utils.decode_split_names(
            input_artifact.split_names)

        # Create output artifact
        output_artifact = artifact_utils.get_single_instance(
            output_dict[constants.OUTPUT_EXAMPLES])
        output_artifact.split_names = artifact_utils.encode_split_names(
            split_names)

        with self._make_beam_pipeline() as p:
            for s in split_names:
                input_uri = io_utils.all_files_pattern(
                    artifact_utils.get_split_uri(
                        input_dict[constants.INPUT_EXAMPLES], s))

                output_uri = artifact_utils.get_split_uri(
                    output_dict[constants.OUTPUT_EXAMPLES], s)
                output_path = os.path.join(output_uri, self._DEFAULT_FILENAME)

                # Read and decode the data
                data = \
                    (p
                     | 'Read_' + s >> beam.io.ReadFromTFRecord(
                                file_pattern=input_uri)
                     | 'Decode_' + s >> tf_example_decoder.DecodeTFExample()
                     | 'ToDataFrame_' + s >> beam.ParDo(utils.ConvertToDataframe()))

                # Window into sessions
                s_data = \
                    (data
                     | 'AddCategory_' + s >> beam.ParDo(
                                sequence_step.get_category_do_fn())
                     | 'AddTimestamp_' + s >> beam.ParDo(
                                sequence_step.get_timestamp_do_fn())
                     | 'Sessions_' + s >> beam.WindowInto(
                                sequence_step.get_window()))

                # Combine and transform
                p_data = \
                    (s_data
                     | 'Combine_' + s >> beam.CombinePerKey(
                                sequence_step.get_combine_fn()))

                # Write the results
                _ = \
                    (p_data
                     | 'Global_' + s >> beam.WindowInto(GlobalWindows())
                     | 'RemoveKey_' + s >> beam.ParDo(RemoveKey())
                     | 'ToExample_' + s >> beam.Map(utils.df_to_example)
                     | 'Serialize_' + s >> beam.Map(utils.serialize)
                     | 'Write_' + s >> beam.io.WriteToTFRecord(
                                output_path,
                                file_name_suffix='.gz'))
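
The Sequencer executor above only relies on the loaded class exposing get_category_do_fn, get_timestamp_do_fn, get_window and get_combine_fn, and accepting schema, statistics and extra keyword args. Below is a purely illustrative sketch of such a step; the session key, timestamp column, 10-minute gap and list-combine are invented for the example and not taken from the original project.

import apache_beam as beam
from apache_beam.transforms import window


class _AddSessionKey(beam.DoFn):
    """Keys each row by a made-up 'session_id' column."""

    def process(self, row):
        yield row.get('session_id', 'default'), row


class _AddEventTime(beam.DoFn):
    """Re-emits keyed rows using a made-up 'timestamp' column as event time."""

    def process(self, keyed_row):
        key, row = keyed_row
        yield window.TimestampedValue((key, row), row.get('timestamp', 0))


class ToySequencerStep:
    """Hypothetical BaseSequencerStep implementation; illustrative only."""

    def __init__(self, schema, statistics=None, **kwargs):
        self._schema = schema

    def get_category_do_fn(self):
        return _AddSessionKey()

    def get_timestamp_do_fn(self):
        return _AddEventTime()

    def get_window(self):
        return window.Sessions(gap_size=10 * 60)  # 10-minute session gap

    def get_combine_fn(self):
        return beam.combiners.ToListCombineFn()  # collect rows per session
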
Example #33
File: executor.py Project: vibhatha/tfx
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Computes stats for each split of input using tensorflow_data_validation.

        Args:
          input_dict: Input dict from input key to a list of Artifacts.
            - input_data: A list of type `standard_artifacts.Examples`. This should
              contain both the 'train' and 'eval' splits.
            - schema: Optionally, a list of type `standard_artifacts.Schema`. When
              the stats_options exec_property also contains a schema, this input
              should not be provided.
          output_dict: Output dict from output key to a list of Artifacts.
            - output: A list of type `standard_artifacts.ExampleStatistics`. This
              should contain both the 'train' and 'eval' splits.
          exec_properties: A dict of execution properties.
            - stats_options_json: Optionally, a JSON representation of StatsOptions.
              When a schema is provided as an input, the StatsOptions value should
              not also contain a schema.

        Raises:
          ValueError when a schema is provided both as an input and as part of the
          StatsOptions exec_property.

        Returns:
          None
        """
        self._log_startup(input_dict, output_dict, exec_properties)

        stats_options = options.StatsOptions()
        if STATS_OPTIONS_JSON_KEY in exec_properties:
            stats_options_json = exec_properties[STATS_OPTIONS_JSON_KEY]
            if stats_options_json:
                # TODO(b/150802589): Move jsonable interface to tfx_bsl and use
                # json_utils
                stats_options = options.StatsOptions.from_json(
                    stats_options_json)
        if input_dict.get(SCHEMA_KEY):
            if stats_options.schema:
                raise ValueError(
                    'A schema was provided as an input and the '
                    'stats_options exec_property also contains a schema '
                    'value. At most one of these may be set.')
            else:
                schema = io_utils.SchemaReader().read(
                    io_utils.get_only_uri_in_dir(
                        artifact_utils.get_single_uri(input_dict[SCHEMA_KEY])))
                stats_options.schema = schema

        split_uris = []
        for artifact in input_dict[EXAMPLES_KEY]:
            for split in artifact_utils.decode_split_names(
                    artifact.split_names):
                uri = os.path.join(artifact.uri, split)
                split_uris.append((split, uri))
        with self._make_beam_pipeline() as p:
            for split, uri in split_uris:
                absl.logging.info(
                    'Generating statistics for split {}'.format(split))
                input_uri = io_utils.all_files_pattern(uri)
                output_uri = artifact_utils.get_split_uri(
                    output_dict[STATISTICS_KEY], split)
                output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
                _ = (p
                     | 'ReadData.' + split >>
                     beam.io.ReadFromTFRecord(file_pattern=input_uri)
                     | 'DecodeData.' + split >>
                     tf_example_decoder.DecodeTFExample()
                     | 'GenerateStatistics.' + split >>
                     stats_api.GenerateStatistics(stats_options)
                     | 'WriteStatsOutput.' + split >> beam.io.WriteToTFRecord(
                         output_path,
                         shard_name_template='',
                         coder=beam.coders.ProtoCoder(
                             statistics_pb2.DatasetFeatureStatisticsList)))
                absl.logging.info(
                    'Statistics for split {} written to {}.'.format(
                        split, output_uri))
Example #34
File: executor.py Project: luvneries/tfx
  def Do(self, input_dict,
         output_dict,
         exec_properties):
    """TensorFlow Transform executor entrypoint.

    This implements BaseExecutor.Do() and is invoked by orchestration systems.
    This is not intended for manual usage or further customization. Please use
    the Transform() function which takes an input format with no artifact
    dependency.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - input_data: A list of 'ExamplesPath' type which should contain two
          splits 'train' and 'eval'.
        - schema: A list of 'SchemaPath' type which should contain a single
          schema artifact.
      output_dict: Output dict from key to a list of artifacts, including:
        - transform_output: Output of 'tf.Transform', which includes an exported
          Tensorflow graph suitable for both training and serving;
        - transformed_examples: Materialized transformed examples, which
          includes both 'train' and 'eval' splits.
      exec_properties: A dict of execution properties, including:
        - module_file: The file path to a python module file, from which the
          'preprocessing_fn' function will be loaded.

    Returns:
      None
    """
    self._log_startup(input_dict, output_dict, exec_properties)
    train_data_uri = types.get_split_uri(input_dict['input_data'], 'train')
    eval_data_uri = types.get_split_uri(input_dict['input_data'], 'eval')
    schema_file = io_utils.get_only_uri_in_dir(
        types.get_single_uri(input_dict['schema']))

    transform_output = types.get_single_uri(output_dict['transform_output'])
    if tf.gfile.Exists(transform_output):
      io_utils.delete_dir(transform_output)

    transformed_train_output = types.get_split_uri(
        output_dict['transformed_examples'], 'train')
    if tf.gfile.Exists(transformed_train_output):
      io_utils.delete_dir(transformed_train_output)

    transformed_eval_output = types.get_split_uri(
        output_dict['transformed_examples'], 'eval')
    if tf.gfile.Exists(transformed_eval_output):
      io_utils.delete_dir(transformed_eval_output)

    temp_path = os.path.join(transform_output, _TEMP_DIR_IN_TRANSFORM_OUTPUT)
    tf.logging.debug('Using temp path %s for tft.beam', temp_path)

    label_inputs = {
        labels.COMPUTE_STATISTICS_LABEL:
            False,
        labels.SCHEMA_PATH_LABEL:
            schema_file,
        labels.EXAMPLES_DATA_FORMAT_LABEL:
            labels.FORMAT_TF_EXAMPLE,
        labels.ANALYZE_AND_TRANSFORM_DATA_PATHS_LABEL:
            io_utils.all_files_pattern(train_data_uri),
        labels.TRANSFORM_ONLY_DATA_PATHS_LABEL:
            io_utils.all_files_pattern(eval_data_uri),
        labels.TFT_STATISTICS_USE_TFDV_LABEL:
            True,
        labels.PREPROCESSING_FN:
            exec_properties['module_file'],
    }

    label_outputs = {
        labels.TRANSFORM_METADATA_OUTPUT_PATH_LABEL: transform_output,
        labels.TRANSFORM_MATERIALIZE_OUTPUT_PATHS_LABEL: [
            os.path.join(transformed_train_output,
                         _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX),
            os.path.join(transformed_eval_output,
                         _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX),
        ],
        labels.TEMP_OUTPUT_LABEL: str(temp_path),
    }
    status_file = 'status_file'  # Unused
    self.Transform(label_inputs, label_outputs, status_file)
    tf.logging.info('Cleaning up temp path %s on executor success', temp_path)
    io_utils.delete_dir(temp_path)
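
Per the docstring above, the module_file this executor loads must expose a preprocessing_fn. A minimal sketch of such a module follows; the feature names ('fare', 'payment_type') are made up for illustration.

import tensorflow_transform as tft


def preprocessing_fn(inputs):
  """Preprocesses raw features into transformed features."""
  return {
      'fare_scaled': tft.scale_to_z_score(inputs['fare']),
      'payment_type_id': tft.compute_and_apply_vocabulary(
          inputs['payment_type']),
  }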