Example #1
    def Do(self, input_dict, output_dict, exec_properties):
        """TensorFlow ExampleValidator executor entrypoint.

    This validates the statistics on the 'eval' split against the schema.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - stats: A list of 'ExampleStatisticsPath' type which should contain
          split 'eval'. Stats on other splits are ignored.
        - schema: A list of 'SchemaPath' type which should contain a single
          schema artifact.
      output_dict: Output dict from key to a list of artifacts, including:
        - output: A list of 'ExampleValidationPath' artifact of size one. It
          will include a single pbtxt file which contains all anomalies found.
      exec_properties: A dict of execution properties. Not used yet.

    Returns:
      None
    """
        self._log_startup(input_dict, output_dict, exec_properties)

        tf.logging.info('Validating schema against the computed statistics.')
        schema = io_utils.SchemaReader().read(
            io_utils.get_only_uri_in_dir(
                types.get_single_uri(input_dict['schema'])))
        stats = tfdv.load_statistics(
            io_utils.get_only_uri_in_dir(
                types.get_split_uri(input_dict['stats'], 'eval')))
        output_uri = types.get_single_uri(output_dict['output'])
        anomalies = tfdv.validate_statistics(stats, schema)
        io_utils.write_pbtxt_file(os.path.join(output_uri, DEFAULT_FILE_NAME),
                                  anomalies)
        tf.logging.info(
            'Validation complete. Anomalies written to {}.'.format(output_uri))
Example #2
  def Do(self, input_dict: Dict[Text, List[types.TfxType]],
         output_dict: Dict[Text, List[types.TfxType]],
         exec_properties: Dict[Text, Any]) -> None:
    """Get human review result on a model through Slack channel.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - model_export: exported model from trainer.
        - model_blessing: model blessing path from model_validator.
      output_dict: Output dict from key to a list of artifacts, including:
        - slack_blessing: model blessing result.
      exec_properties: A dict of execution properties, including:
        - slack_token: Token used to set up a connection with the Slack server.
        - channel_id: The id of the Slack channel in which to send and receive
          messages.
        - timeout_sec: How long to wait for a response, in seconds.

    Returns:
      None
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    # Fetch execution properties from exec_properties dict.
    slack_token = exec_properties['slack_token']
    channel_id = exec_properties['channel_id']
    timeout_sec = exec_properties['timeout_sec']

    # Fetch input URIs from input_dict.
    model_export_uri = types.get_single_uri(input_dict['model_export'])
    model_blessing_uri = types.get_single_uri(input_dict['model_blessing'])

    # Fetch output artifact from output_dict.
    slack_blessing = types.get_single_instance(output_dict['slack_blessing'])

    # We only consider a model as blessed if both of the following conditions
    # are met:
    # - The model is blessed by model validator. This is determined by looking
    #   for file named 'BLESSED' from the output from Model Validator.
    # - The model is blessed by a human reviewer. This logic is in
    #   _fetch_slack_blessing().
    try:
      with Timeout(timeout_sec):
        blessed = tf.gfile.Exists(os.path.join(
            model_blessing_uri, 'BLESSED')) and self._fetch_slack_blessing(
                slack_token, channel_id, model_export_uri)
    except TimeoutError:  # pylint: disable=undefined-variable
      tf.logging.info('Timeout fetching manual model evaluation result.')
      blessed = False

    # If model is blessed, write an empty file named 'BLESSED' in the assigned
    # output path. Otherwise, write an empty file named 'NOT_BLESSED' instead.
    if blessed:
      io_utils.write_string_file(
          os.path.join(slack_blessing.uri, 'BLESSED'), '')
      slack_blessing.set_int_custom_property('blessed', 1)
    else:
      io_utils.write_string_file(
          os.path.join(slack_blessing.uri, 'NOT_BLESSED'), '')
      slack_blessing.set_int_custom_property('blessed', 0)
    tf.logging.info('Blessing result %s written to %s.', blessed,
                    slack_blessing.uri)
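
Note on the example above: `Timeout` and `TimeoutError` (flagged `undefined-variable` for Python 2) are defined elsewhere in the component's module and are not shown here. Below is a minimal sketch of a signal-based context manager that fits this usage; the names come from the snippet, but the body is an assumption, not the TFX implementation.

import signal


class Timeout(object):
  """Context manager that raises TimeoutError after the given seconds.

  Assumes a Unix main thread (signal.alarm does not work elsewhere) and
  Python 3, where TimeoutError is a builtin.
  """

  def __init__(self, sec):
    self.sec = sec

  def __enter__(self):
    signal.signal(signal.SIGALRM, self._raise_timeout)
    signal.alarm(self.sec)

  def __exit__(self, exc_type, exc_value, traceback):
    # Cancel any pending alarm on exit.
    signal.alarm(0)

  def _raise_timeout(self, signum, frame):
    raise TimeoutError('Timed out after {} seconds.'.format(self.sec))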
Example #3
File: executor.py Project: luvneries/tfx
  def Do(self, input_dict,
         output_dict,
         exec_properties):
    """Runs a batch job to evaluate the eval_model against the given input.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - model_exports: exported model.
        - examples: examples for eval the model.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: model evaluation results.
      exec_properties: A dict of execution properties.
        - feature_slicing_spec: JSON string of evaluator_pb2.FeatureSlicingSpec
          instance, providing the way to slice the data.

    Returns:
      None
    """
    if 'model_exports' not in input_dict:
      raise ValueError('\'model_exports\' is missing in input dict.')
    if 'examples' not in input_dict:
      raise ValueError('\'examples\' is missing in input dict.')
    if 'output' not in output_dict:
      raise ValueError('\'output\' is missing in output dict.')

    self._log_startup(input_dict, output_dict, exec_properties)

    # Extract input artifacts
    model_exports_uri = types.get_single_uri(input_dict['model_exports'])

    feature_slicing_spec = evaluator_pb2.FeatureSlicingSpec()
    json_format.Parse(exec_properties['feature_slicing_spec'],
                      feature_slicing_spec)
    slice_spec = self._get_slice_spec_from_feature_slicing_spec(
        feature_slicing_spec)

    output_uri = types.get_single_uri(output_dict['output'])

    eval_model_path = path_utils.eval_model_path(model_exports_uri)

    tf.logging.info('Using {} for model eval.'.format(eval_model_path))
    eval_shared_model = tfma.default_eval_shared_model(
        eval_saved_model_path=eval_model_path)

    tf.logging.info('Evaluating model.')
    with beam.Pipeline(argv=self._get_beam_pipeline_args()) as pipeline:
      # pylint: disable=expression-not-assigned
      (pipeline
       | 'ReadData' >> beam.io.ReadFromTFRecord(
           file_pattern=io_utils.all_files_pattern(
               types.get_split_uri(input_dict['examples'], 'eval')))
       |
       'ExtractEvaluateAndWriteResults' >> tfma.ExtractEvaluateAndWriteResults(
           eval_shared_model=eval_shared_model,
           slice_spec=slice_spec,
           output_path=output_uri))
    tf.logging.info(
        'Evaluation complete. Results written to {}.'.format(output_uri))
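
`_get_slice_spec_from_feature_slicing_spec` is a private helper of this evaluator and is not shown in the snippet. Below is a plausible sketch (written as a free function) under the assumption that each FeatureSlicingSpec entry lists columns to slice on, plus an empty overall slice; the real helper may differ.

from typing import List

import tensorflow_model_analysis as tfma
from tfx.proto import evaluator_pb2


def _get_slice_spec_from_feature_slicing_spec(
    spec: evaluator_pb2.FeatureSlicingSpec
) -> List[tfma.slicer.slicer.SingleSliceSpec]:
  """Maps each proto entry to a TFMA SingleSliceSpec."""
  result = []
  for single_spec in spec.specs:
    result.append(
        tfma.slicer.slicer.SingleSliceSpec(
            columns=single_spec.column_for_slicing))
  # An empty spec stands for the overall (unsliced) metrics.
  result.append(tfma.slicer.slicer.SingleSliceSpec())
  return result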
Example #4
    def Do(self, input_dict: Dict[Text, List[types.TfxArtifact]],
           output_dict: Dict[Text, List[types.TfxArtifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Runs a batch job to evaluate the eval_model against the given input.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - model_exports: exported model.
        - examples: examples for eval the model.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: model evaluation results.
      exec_properties: A dict of execution properties.
        - feature_slicing_spec: JSON string of evaluator_pb2.FeatureSlicingSpec
          instance, providing the way to slice the data.

    Returns:
      None
    """
        if 'model_exports' not in input_dict:
            raise ValueError('\'model_exports\' is missing in input dict.')
        if 'examples' not in input_dict:
            raise ValueError('\'examples\' is missing in input dict.')
        if 'output' not in output_dict:
            raise ValueError('\'output\' is missing in output dict.')

        self._log_startup(input_dict, output_dict, exec_properties)

        # Extract input artifacts
        model_exports_uri = types.get_single_uri(input_dict['model_exports'])

        feature_slicing_spec = evaluator_pb2.FeatureSlicingSpec()
        json_format.Parse(exec_properties['feature_slicing_spec'],
                          feature_slicing_spec)
        slice_spec = self._get_slice_spec_from_feature_slicing_spec(
            feature_slicing_spec)

        output_uri = types.get_single_uri(output_dict['output'])

        eval_model_path = path_utils.eval_model_path(model_exports_uri)

        tf.logging.info('Using {} for model eval.'.format(eval_model_path))
        eval_shared_model = tfma.default_eval_shared_model(
            eval_saved_model_path=eval_model_path)

        tf.logging.info('Evaluating model.')
        with beam.Pipeline(argv=self._get_beam_pipeline_args()) as pipeline:
            # pylint: disable=expression-not-assigned
            (pipeline
             | 'ReadData' >>
             beam.io.ReadFromTFRecord(file_pattern=io_utils.all_files_pattern(
                 types.get_split_uri(input_dict['examples'], 'eval')))
             | 'ExtractEvaluateAndWriteResults' >>
             tfma.ExtractEvaluateAndWriteResults(
                 eval_shared_model=eval_shared_model,
                 slice_spec=slice_spec,
                 output_path=output_uri))
        tf.logging.info(
            'Evaluation complete. Results written to {}.'.format(output_uri))
Example #5
    def Do(self, input_dict: Dict[Text, List[types.TfxArtifact]],
           output_dict: Dict[Text, List[types.TfxArtifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Get human review result on a model through Slack channel.

        Args:
          input_dict: Input dict from input key to a list of artifacts, including:
            - input_example: an example for an input

          output_dict: Output dict from key to a list of artifacts, including:
            - output_example: an example for an output
          exec_properties: A dict of execution properties, including:
            - string_parameter: An string execution parameter (only used in here, not persistent or shared up stream)
            - integer_parameter: An integer execution parameter (only used in here, not persistent or shared up stream)
            - input_config: not of concern here, only relevant for Driver
            - output_config: not of concern here, only relevant for Driver

        Returns:
          None
        """
        self._log_startup(input_dict, output_dict, exec_properties)

        # Fetch execution properties from exec_properties dict.
        string_parameter = exec_properties['string_execution_parameter']
        integer_parameter = exec_properties['integer_execution_parameter']

        # Fetch input URIs from input_dict.
        input_example_uri = types.get_single_uri(input_dict['input_example'])

        # Fetch output artifact from output_dict.
        output_example = types.get_single_instance(
            output_dict['output_example'])

        print("I AM RUNNING!")
        print(string_parameter)
        print(integer_parameter)
        print(input_example_uri)
        print(output_example)

        input_data = ""

        # Load the input, if present. Use tf.gfile so non-local URIs also work.
        if tf.gfile.Exists(input_example_uri):
            with tf.gfile.GFile(input_example_uri, "r") as f:
                input_data = f.read()

        # make some changes
        output_data = input_data + " changed by an awesome custom executor!"

        # Update the output URI so downstream components know the filename.
        output_example.uri = os.path.join(output_example.uri,
                                          _DEFAULT_FILE_NAME)

        # write the changes back to your output
        io_utils.write_string_file(output_example.uri, output_data)

        # Optionally, set custom properties so that checks in downstream
        # components can be made more quickly.
        output_example.set_string_custom_property('stringProperty', "Awesome")
        output_example.set_int_custom_property('intProperty', 42)
Example #6
  def Do(self, input_dict,
         output_dict,
         exec_properties):
    """TensorFlow SchemaGen executor entrypoint.

    This infers the schema using tensorflow_data_validation on the precomputed
    stats of 'train' split.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - stats: A list of 'ExampleStatisticsPath' type which should contain
          split 'train'. Stats on other splits are ignored.
      output_dict: Output dict from key to a list of artifacts, including:
        - output: A list of 'SchemaPath' artifact of size one.
      exec_properties: A dict of execution properties. Not used yet.

    Returns:
      None
    """
    # TODO(zhitaoli): Move constants between this file and component.py to a
    # constants.py.
    train_stats_uri = io_utils.get_only_uri_in_dir(
        types.get_split_uri(input_dict['stats'], 'train'))
    output_uri = os.path.join(
        types.get_single_uri(output_dict['output']), _DEFAULT_FILE_NAME)

    infer_feature_shape = False
    tf.logging.info('Inferring schema from statistics.')
    schema = tfdv.infer_schema(
        tfdv.load_statistics(train_stats_uri), infer_feature_shape)
    io_utils.write_pbtxt_file(output_uri, schema)
    tf.logging.info('Schema written to {}.'.format(output_uri))
Example #7
def _ImportExample(  # pylint: disable=invalid-name
    pipeline: beam.Pipeline,
    input_dict: Dict[Text, List[types.TfxArtifact]],
    exec_properties: Dict[Text, Any],  # pylint: disable=unused-argument
    split_pattern: Text) -> beam.pvalue.PCollection:
  """Read TFRecord files to PCollection of TF examples.

  Note that each input split will be transformed by this function separately.

  Args:
    pipeline: beam pipeline.
    input_dict: Input dict from input key to a list of Artifacts.
      - input-base: input dir that contains tf example data.
    exec_properties: A dict of execution properties.
    split_pattern: Split.pattern in Input config, glob relative file pattern
      that maps to input files with root directory given by input-base.

  Returns:
    PCollection of TF examples.
  """
  input_base_uri = types.get_single_uri(input_dict['input-base'])
  input_split_pattern = os.path.join(input_base_uri, split_pattern)
  tf.logging.info(
      'Reading input TFExample data {}.'.format(input_split_pattern))

  # TODO(jyzhao): profile input examples.
  return (pipeline
          # TODO(jyzhao): support multiple input format.
          | 'ReadFromTFRecord' >>
          beam.io.ReadFromTFRecord(file_pattern=input_split_pattern)
          # TODO(jyzhao): consider move serialization out of base example gen.
          | 'ToTFExample' >> beam.Map(tf.train.Example.FromString))
Example #8
def _ParquetToExample(  # pylint: disable=invalid-name
    pipeline: beam.Pipeline,
    input_dict: Dict[Text, List[types.TfxArtifact]],
    exec_properties: Dict[Text, Any],  # pylint: disable=unused-argument
    split_pattern: Text) -> beam.pvalue.PCollection:
    """Read Parquet files and transform to TF examples.

  Note that each input split will be transformed by this function separately.

  Args:
    pipeline: beam pipeline.
    input_dict: Input dict from input key to a list of Artifacts.
      - input_base: input dir that contains Parquet data.
    exec_properties: A dict of execution properties.
    split_pattern: Split.pattern in Input config, glob relative file pattern
      that maps to input files with root directory given by input_base.

  Returns:
    PCollection of TF examples.
  """
    input_base_uri = types.get_single_uri(input_dict['input_base'])
    parquet_pattern = os.path.join(input_base_uri, split_pattern)
    tf.logging.info('Processing input parquet data {} to TFExample.'.format(
        parquet_pattern))

    return (pipeline
            # TODO(jyzhao): support per column read by input_config.
            | 'ReadFromParquet' >> beam.io.ReadFromParquet(parquet_pattern)
            | 'ToTFExample' >> beam.Map(_dict_to_example))
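
`_dict_to_example` is referenced here and in the CSV example below but is not shown. Below is a minimal sketch of such a dict-to-tf.train.Example conversion, assuming scalar int/float/string column values; the real helper also handles list values and may differ in detail.

import tensorflow as tf


def _dict_to_example(instance):
  """Converts a dict of column name -> value into a tf.train.Example."""
  feature = {}
  for key, value in instance.items():
    if value is None:
      feature[key] = tf.train.Feature()
    elif isinstance(value, int):
      feature[key] = tf.train.Feature(
          int64_list=tf.train.Int64List(value=[value]))
    elif isinstance(value, float):
      feature[key] = tf.train.Feature(
          float_list=tf.train.FloatList(value=[value]))
    else:
      feature[key] = tf.train.Feature(
          bytes_list=tf.train.BytesList(value=[tf.compat.as_bytes(value)]))
  return tf.train.Example(features=tf.train.Features(feature=feature))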
Example #9
File: executor.py Project: zorrock/tfx
    def Do(self, input_dict: Dict[Text, List[types.TfxArtifact]],
           output_dict: Dict[Text, List[types.TfxArtifact]],
           exec_properties: Dict[Text, Any]):
        """Overrides the tfx_pusher_executor.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - model_export: exported model from trainer.
        - model_blessing: model blessing path from model_validator.
      output_dict: Output dict from key to a list of artifacts, including:
        - model_push: A list of 'ModelPushPath' artifact of size one. It will
          include the model in this push execution if the model was pushed.
      exec_properties: Mostly a passthrough input dict for
        tfx.components.Pusher.executor.  custom_config.ai_platform_serving_args
        is consumed by this class.  For the full set of parameters supported by
        Google Cloud AI Platform, refer to
        https://cloud.google.com/ml-engine/docs/tensorflow/deploying-models#creating_a_model_version.

    Returns:
      None
    Raises:
      ValueError: if ai_platform_serving_args is not in
        exec_properties.custom_config.
      RuntimeError: if the Google Cloud AI Platform training job failed.
    """
        self._log_startup(input_dict, output_dict, exec_properties)
        if not self.CheckBlessing(input_dict, output_dict):
            return

        model_export = types.get_single_instance(input_dict['model_export'])
        model_export_uri = model_export.uri
        model_blessing_uri = types.get_single_uri(input_dict['model_blessing'])
        model_push = types.get_single_instance(output_dict['model_push'])
        # TODO(jyzhao): should this be in driver or executor.
        if not tf.gfile.Exists(os.path.join(model_blessing_uri, 'BLESSED')):
            model_push.set_int_custom_property('pushed', 0)
            tf.logging.info('Model on %s was not blessed', model_blessing_uri)
            return

        exec_properties_copy = exec_properties.copy()
        custom_config = exec_properties_copy.pop('custom_config', {})
        ai_platform_serving_args = custom_config['ai_platform_serving_args']

        # Deploy the model.
        model_path = path_utils.serving_model_path(model_export_uri)
        # Note: we do not have a logical model version right now. This
        # model_version is a timestamp mapped to trainer's exporter.
        model_version = os.path.basename(model_path)
        if ai_platform_serving_args is not None:
            cmle_runner.deploy_model_for_cmle_serving(
                model_path, model_version, ai_platform_serving_args)

        # Make sure artifacts are populated in a standard way by calling
        # tfx.pusher.executor.Executor.Do().
        exec_properties_copy['push_destination'] = exec_properties.get(
            'push_destination', self._make_local_temp_destination())
        super(Executor, self).Do(input_dict, output_dict, exec_properties_copy)
Example #10
  def testGetSingleUriDeprecated(self):
    with mock.patch.object(tf_logging, 'warning'):
      warn_mock = mock.MagicMock()
      tf_logging.warning = warn_mock
      my_artifact = artifact.Artifact('TestType')
      my_artifact.uri = '123'
      self.assertEqual('123', types.get_single_uri([my_artifact]))
      warn_mock.assert_called_once()
      self.assertIn('tfx.utils.types.get_single_uri has been renamed to',
                    warn_mock.call_args[0][5])
Example #11
  def test_get_from_split_list(self):
    """Test various retrieval utilities on a list of split TfxTypes."""
    split_list = []
    for split in ['train', 'eval']:
      instance = types.TfxType('MyTypeName', split=split)
      instance.uri = '/tmp/' + split
      split_list.append(instance)

    with self.assertRaises(ValueError):
      types.get_single_instance(split_list)

    with self.assertRaises(ValueError):
      types.get_single_uri(split_list)

    self.assertEqual(split_list[0],
                     types._get_split_instance(split_list, 'train'))
    self.assertEqual('/tmp/train', types.get_split_uri(split_list, 'train'))
    self.assertEqual(split_list[1], types._get_split_instance(
        split_list, 'eval'))
    self.assertEqual('/tmp/eval', types.get_split_uri(split_list, 'eval'))
Example #12
def _CsvToExample(  # pylint: disable=invalid-name
    pipeline: beam.Pipeline,
    input_dict: Dict[Text, List[types.TfxArtifact]],
    exec_properties: Dict[Text, Any],  # pylint: disable=unused-argument
    split_pattern: Text) -> beam.pvalue.PCollection:
    """Read CSV files and transform to TF examples.

  Note that each input split will be transformed by this function separately.

  Args:
    pipeline: beam pipeline.
    input_dict: Input dict from input key to a list of Artifacts.
      - input_base: input dir that contains csv data. csv files must have a
        header line.
    exec_properties: A dict of execution properties.
    split_pattern: Split.pattern in Input config, glob relative file pattern
      that maps to input files with root directory given by input_base.

  Returns:
    PCollection of TF examples.

  Raises:
    RuntimeError: if split is empty or csv headers are not equal.
  """
    input_base_uri = types.get_single_uri(input_dict['input_base'])
    csv_pattern = os.path.join(input_base_uri, split_pattern)
    tf.logging.info(
        'Processing input csv data {} to TFExample.'.format(csv_pattern))

    csv_files = tf.gfile.Glob(csv_pattern)
    if not csv_files:
        raise RuntimeError(
            'Split pattern {} does not match any files.'.format(csv_pattern))

    column_names = io_utils.load_csv_column_names(csv_files[0])
    for csv_file in csv_files[1:]:
        if io_utils.load_csv_column_names(csv_file) != column_names:
            raise RuntimeError(
                'Files in same split {} have different header.'.format(
                    csv_pattern))

    # TODO(pachristopher): Remove this once TFDV 0.14 is released.
    (major, minor, _) = tfdv.__version__.split('.')
    if int(major) > 0 or int(minor) >= 14:
        decoder = csv_decoder.DecodeCSVToDict
    else:
        decoder = csv_decoder.DecodeCSV
    return (pipeline
            | 'ReadFromText' >> beam.io.ReadFromText(file_pattern=csv_pattern,
                                                     skip_header_lines=1)
            | 'ParseCSV' >> decoder(column_names)
            | 'ToTFExample' >> beam.Map(_dict_to_example))
Example #13
  def test_get_from_single_list(self):
    """Test various retrieval utilities on a single list of TfxType."""
    single_list = [types.TfxType('MyTypeName', split='eval')]
    single_list[0].uri = '/tmp/evaluri'
    self.assertEqual(single_list[0], types.get_single_instance(single_list))
    self.assertEqual('/tmp/evaluri', types.get_single_uri(single_list))
    self.assertEqual(single_list[0],
                     types._get_split_instance(single_list, 'eval'))
    self.assertEqual('/tmp/evaluri', types.get_split_uri(single_list, 'eval'))
    with self.assertRaises(ValueError):
      types._get_split_instance(single_list, 'train')
    with self.assertRaises(ValueError):
      types.get_split_uri(single_list, 'train')
Example #14
File: executor.py Project: luvneries/tfx
  def Do(self, input_dict,
         output_dict,
         exec_properties):
    """TensorFlow ExampleValidator executor entrypoint.

    This validates the statistics on the 'eval' split against the schema.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - stats: A list of 'ExampleStatisticsPath' type which should contain
          split 'eval'. Stats on other splits are ignored.
        - schema: A list of 'SchemaPath' type which should contain a single
          schema artifact.
      output_dict: Output dict from key to a list of artifacts, including:
        - output: A list of 'ExampleValidationPath' artifact of size one. It
          will include a single pbtxt file which contains all anomalies found.
      exec_properties: A dict of execution properties. Not used yet.

    Returns:
      None
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    tf.logging.info('Validating schema against the computed statistics.')
    schema = io_utils.SchemaReader().read(
        io_utils.get_only_uri_in_dir(
            types.get_single_uri(input_dict['schema'])))
    stats = tfdv.load_statistics(
        io_utils.get_only_uri_in_dir(
            types.get_split_uri(input_dict['stats'], 'eval')))
    output_uri = types.get_single_uri(output_dict['output'])
    anomalies = tfdv.validate_statistics(stats, schema)
    io_utils.write_pbtxt_file(
        os.path.join(output_uri, DEFAULT_FILE_NAME), anomalies)
    tf.logging.info(
        'Validation complete. Anomalies written to {}.'.format(output_uri))
Example #15
File: executor.py Project: dizcology/tfx
    def CheckBlessing(self, input_dict: Dict[Text, List[types.TfxType]],
                      output_dict: Dict[Text, List[types.TfxType]]) -> bool:
        """Check that model is blessed by upstream ModelValidator, or update output.

    Args:
      input_dict: Input dict from input key to a list of artifacts:
        - model_blessing: model blessing path from model_validator. Pusher looks
          for a file named 'BLESSED' to consider the model blessed and safe to
          push.
      output_dict: Output dict from key to a list of artifacts, including:
        - model_push: A list of 'ModelPushPath' artifact of size one.

    Returns:
      True if the model is blessed by validator.
    """
        model_blessing_uri = types.get_single_uri(input_dict['model_blessing'])
        model_push = types.get_single_instance(output_dict['model_push'])
        # TODO(jyzhao): should this be in driver or executor.
        if not tf.gfile.Exists(os.path.join(model_blessing_uri, 'BLESSED')):
            model_push.set_int_custom_property('pushed', 0)
            tf.logging.info('Model on %s was not blessed', model_blessing_uri)
            return False
        return True
Example #16
File: executor.py Project: ashishML/tfx
    def Do(self, input_dict, output_dict, exec_properties):
        """Push model to target if blessed.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - model_export: exported model from trainer.
        - model_blessing: model blessing path from model_validator.
      output_dict: Output dict from key to a list of artifacts, including:
        - model_push: A list of 'ModelPushPath' artifact of size one. It will
          include the model in this push execution if the model was pushed.
      exec_properties: A dict of execution properties, including:
        - push_destination: JSON string of pusher_pb2.PushDestination instance,
          providing instruction of destination to push model.

    Returns:
      None
    """
        self._log_startup(input_dict, output_dict, exec_properties)
        model_export = types.get_single_instance(input_dict['model_export'])
        model_export_uri = model_export.uri
        model_blessing_uri = types.get_single_uri(input_dict['model_blessing'])
        model_push = types.get_single_instance(output_dict['model_push'])
        model_push_uri = model_push.uri
        # TODO(jyzhao): should this be in driver or executor.
        if not tf.gfile.Exists(os.path.join(model_blessing_uri, 'BLESSED')):
            model_push.set_int_custom_property('pushed', 0)
            tf.logging.info('Model on %s was not blessed', model_blessing_uri)
            return
        tf.logging.info('Model pushing.')
        # Copy the model being pushed into the push output directory.
        model_path = path_utils.serving_model_path(model_export_uri)
        # Note: we do not have a logical model version right now. This
        # model_version is a timestamp mapped to trainer's exporter.
        model_version = os.path.basename(model_path)
        tf.logging.info('Model version is %s', model_version)
        io_utils.copy_dir(model_path,
                          os.path.join(model_push_uri, model_version))
        tf.logging.info('Model written to %s.', model_push_uri)

        # Copy to a fixed outside path, which can be watched by a model server.
        #
        # If the model was already successfully copied outside before, stop
        # copying. This is because the model validator might bless the same
        # model twice (check the mv driver) with different blessing outputs;
        # we still want Pusher to handle the mv output again to keep metadata
        # tracking, but there is no need to copy to the outside path again.
        # TODO(jyzhao): support rpc push and verification.
        push_destination = pusher_pb2.PushDestination()
        json_format.Parse(exec_properties['push_destination'],
                          push_destination)
        serving_path = os.path.join(push_destination.filesystem.base_directory,
                                    model_version)
        if tf.gfile.Exists(serving_path):
            tf.logging.info(
                'Destination directory %s already exists, skipping current push.',
                serving_path)
        else:
            # tf.serving won't load partial model, it will retry until fully copied.
            io_utils.copy_dir(model_path, serving_path)
            tf.logging.info('Model written to serving path %s.', serving_path)

        model_push.set_int_custom_property('pushed', 1)
        model_push.set_string_custom_property('pushed_model', model_export_uri)
        model_push.set_int_custom_property('pushed_model_id', model_export.id)
        tf.logging.info('Model pushed to %s.', serving_path)

        if exec_properties.get('custom_config'):
            cmle_serving_args = exec_properties.get(
                'custom_config', {}).get('cmle_serving_args')
            if cmle_serving_args is not None:
                return cmle_runner.deploy_model_for_serving(
                    serving_path, model_version, cmle_serving_args,
                    exec_properties['log_root'])
Example #17
    def Do(self, input_dict: Dict[Text, List[types.TfxArtifact]],
           output_dict: Dict[Text, List[types.TfxArtifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Uses a user-supplied tf.estimator to train a TensorFlow model locally.

    The Trainer Executor invokes a training_fn callback function provided by
    the user via the module_file parameter.  With the tf.estimator returned by
    this function, the Trainer Executor then builds a TensorFlow model using the
    user-provided tf.estimator.

    Args:
      input_dict: Input dict from input key to a list of ML-Metadata Artifacts.
        - transformed_examples: Transformed example.
        - transform_output: Input transform graph.
        - schema: Schema of the data.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: Exported model.
      exec_properties: A dict of execution properties.
        - train_args: JSON string of trainer_pb2.TrainArgs instance, providing
          args for training.
        - eval_args: JSON string of trainer_pb2.EvalArgs instance, providing
          args for eval.
        - module_file: Python module file containing UDF model definition.
        - warm_starting: Whether or not we need to do warm starting.
        - warm_start_from: Optional. If warm_starting is True, this is the
          directory to find previous model to warm start on.

    Returns:
      None

    Raises:
      None
    """
        self._log_startup(input_dict, output_dict, exec_properties)

        # TODO(zhitaoli): Deprecate this in a future version.
        if exec_properties.get('custom_config', None):
            cmle_args = exec_properties.get('custom_config',
                                            {}).get('cmle_training_args')
            if cmle_args:
                executor_class_path = '.'.join(
                    [Executor.__module__, Executor.__name__])
                tf.logging.warn(
                    'Passing \'cmle_training_args\' to trainer directly is deprecated, '
                    'please use extension executor at '
                    'tfx.extensions.google_cloud_ai_platform.trainer.executor instead'
                )

                return cmle_runner.start_cmle_training(input_dict, output_dict,
                                                       exec_properties,
                                                       executor_class_path,
                                                       cmle_args)

        trainer_fn = io_utils.import_func(exec_properties['module_file'],
                                          'trainer_fn')

        # Set up training parameters
        train_files = [
            _all_files_pattern(
                types.get_split_uri(input_dict['transformed_examples'],
                                    'train'))
        ]
        transform_output = types.get_single_uri(input_dict['transform_output'])
        eval_files = [
            _all_files_pattern(
                types.get_split_uri(input_dict['transformed_examples'],
                                    'eval'))
        ]
        schema_file = io_utils.get_only_uri_in_dir(
            types.get_single_uri(input_dict['schema']))

        train_args = trainer_pb2.TrainArgs()
        eval_args = trainer_pb2.EvalArgs()
        json_format.Parse(exec_properties['train_args'], train_args)
        json_format.Parse(exec_properties['eval_args'], eval_args)

        # https://github.com/tensorflow/tfx/issues/45: Replace num_steps=0 with
        # num_steps=None.  Conversion of the proto to python will set the default
        # value of an int as 0 so modify the value here.  Tensorflow will raise an
        # error if num_steps <= 0.
        train_steps = train_args.num_steps or None
        eval_steps = eval_args.num_steps or None

        output_path = types.get_single_uri(output_dict['output'])
        serving_model_dir = path_utils.serving_model_dir(output_path)
        eval_model_dir = path_utils.eval_model_dir(output_path)

        # Assemble warm start path if needed.
        warm_start_from = None
        if exec_properties.get('warm_starting') and exec_properties.get(
                'warm_start_from'):
            previous_model_dir = os.path.join(
                exec_properties['warm_start_from'],
                path_utils.SERVING_MODEL_DIR)
            if previous_model_dir and tf.gfile.Exists(
                    os.path.join(previous_model_dir,
                                 self._CHECKPOINT_FILE_NAME)):
                warm_start_from = previous_model_dir

        # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
        hparams = tf.contrib.training.HParams(
            # A list of uris for train files.
            train_files=train_files,
            # A single uri for transform graph produced by TFT.
            transform_output=transform_output,
            # A single uri for the output directory of the serving model.
            serving_model_dir=serving_model_dir,
            # A list of uris for eval files.
            eval_files=eval_files,
            # A single uri for schema file.
            schema_file=schema_file,
            # Number of train steps.
            train_steps=train_steps,
            # Number of eval steps.
            eval_steps=eval_steps,
            # A single uri for the model directory to warm start from.
            warm_start_from=warm_start_from)

        schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())

        training_spec = trainer_fn(hparams, schema)

        # Train the model
        tf.logging.info('Training model.')
        tf.estimator.train_and_evaluate(training_spec['estimator'],
                                        training_spec['train_spec'],
                                        training_spec['eval_spec'])
        tf.logging.info('Training complete.  Model written to %s',
                        serving_model_dir)

        # Export an eval savedmodel for TFMA
        tf.logging.info('Exporting eval_savedmodel for TFMA.')
        tfma.export.export_eval_savedmodel(
            estimator=training_spec['estimator'],
            export_dir_base=eval_model_dir,
            eval_input_receiver_fn=training_spec['eval_input_receiver_fn'])

        tf.logging.info('Exported eval_savedmodel to %s.', eval_model_dir)
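
The executor above imports a user-supplied `trainer_fn` from `module_file` and consumes the returned dict by key. Below is a minimal sketch of that contract; the feature column, input_fn, and receiver are toy assumptions, not the real TFX taxi example utilities.

import tensorflow as tf
import tensorflow_model_analysis as tfma


def trainer_fn(hparams, schema):  # pylint: disable=unused-argument
  """Returns the handles that the Trainer executor above expects."""
  feature_columns = [tf.feature_column.numeric_column('a_feature')]  # Assumed.
  estimator = tf.estimator.LinearClassifier(feature_columns=feature_columns)

  def input_fn():
    # Toy stand-in; a real input_fn would read hparams.train_files /
    # hparams.eval_files using the transform output and schema.
    return tf.data.Dataset.from_tensors(
        ({'a_feature': [[1.0]]}, [[0]])).repeat()

  def eval_input_receiver_fn():
    # Receives serialized tf.Examples for TFMA; real parsing is elided.
    serialized = tf.placeholder(dtype=tf.string, shape=[None])
    features = {'a_feature': tf.ones([tf.size(serialized), 1])}
    return tfma.export.EvalInputReceiver(
        features=features,
        labels=tf.zeros([tf.size(serialized), 1]),
        receiver_tensors={'examples': serialized})

  return {
      'estimator': estimator,
      'train_spec': tf.estimator.TrainSpec(
          input_fn, max_steps=hparams.train_steps),
      'eval_spec': tf.estimator.EvalSpec(input_fn, steps=hparams.eval_steps),
      'eval_input_receiver_fn': eval_input_receiver_fn,
  }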
Example #18
    def Do(self, input_dict, output_dict, exec_properties):
        """Validate current model against last blessed model.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - examples: examples for eval the model.
        - model: current model for validation.
      output_dict: Output dict from output key to a list of Artifacts.
        - blessing: model blessing result.
        - results: model validation results.
      exec_properties: A dict of execution properties.
        - blessed_model: last blessed model for validation.
        - blessed_model_id: last blessed model id.

    Returns:
      None
    """
        self._log_startup(input_dict, output_dict, exec_properties)

        # TODO(b/125451545): Provide a safe temp path from base executor instead.
        self._temp_path = os.path.join(
            types.get_single_uri(output_dict['results']), '.temp')
        tf.logging.info('Using temp path {} for tft.beam'.format(
            self._temp_path))

        eval_examples_uri = types.get_split_uri(input_dict['examples'], 'eval')
        blessing = types.get_single_instance(output_dict['blessing'])

        # Current model.
        current_model = types.get_single_instance(input_dict['model'])
        tf.logging.info('Using {} as current model.'.format(current_model.uri))
        blessing.set_string_custom_property('current_model', current_model.uri)
        blessing.set_int_custom_property('current_model_id', current_model.id)

        # Blessed model.
        blessed_model_dir = exec_properties['blessed_model']
        blessed_model_id = exec_properties['blessed_model_id']
        tf.logging.info('Using {} as blessed model.'.format(blessed_model_dir))
        if blessed_model_dir:
            blessing.set_string_custom_property('blessed_model',
                                                blessed_model_dir)
            blessing.set_int_custom_property('blessed_model_id',
                                             blessed_model_id)

        tf.logging.info('Validating model.')
        # TODO(b/125853306): support customized slice spec.
        blessed = self._generate_blessing_result(
            eval_examples_uri=eval_examples_uri,
            slice_spec=[tfma.slicer.slicer.SingleSliceSpec()],
            current_model_dir=current_model.uri,
            blessed_model_dir=blessed_model_dir)

        if blessed:
            io_utils.write_string_file(os.path.join(blessing.uri, 'BLESSED'),
                                       '')
            blessing.set_int_custom_property('blessed', 1)
        else:
            io_utils.write_string_file(
                os.path.join(blessing.uri, 'NOT_BLESSED'), '')
            blessing.set_int_custom_property('blessed', 0)
        tf.logging.info('Blessing result {} written to {}.'.format(
            blessed, blessing.uri))

        io_utils.delete_dir(self._temp_path)
        tf.logging.info('Cleaned up temp path {} on executor success.'.format(
            self._temp_path))
Example #19
    def Do(self, input_dict, output_dict, exec_properties):
        """Runs trainer job the given input.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - transformed_examples: Transformed example.
        - transform_output: Input transform graph.
        - schema: Schema of the data.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: Exported model.
      exec_properties: A dict of execution properties.
        - train_args: JSON string of trainer_pb2.TrainArgs instance, providing
          args for training.
        - eval_args: JSON string of trainer_pb2.EvalArgs instance, providing
          args for eval.
        - module_file: Python module file containing UDF model definition.
        - warm_starting: Whether or not we need to do warm starting.
        - warm_start_from: Optional. If warm_starting is True, this is the
          directory to find previous model to warm start on.

    Returns:
      None
    """
        self._log_startup(input_dict, output_dict, exec_properties)

        # TODO(khaas): Move this to tfx/extensions.
        if exec_properties.get('custom_config', None):
            cmle_args = exec_properties.get('custom_config',
                                            {}).get('cmle_training_args')
            if cmle_args:
                return cmle_runner.start_cmle_training(input_dict, output_dict,
                                                       exec_properties,
                                                       cmle_args)

        trainer_fn = io_utils.import_func(exec_properties['module_file'],
                                          'trainer_fn')

        # Set up training parameters
        train_files = [
            _all_files_pattern(
                types.get_split_uri(input_dict['transformed_examples'],
                                    'train'))
        ]
        transform_output = types.get_single_uri(input_dict['transform_output'])
        eval_files = _all_files_pattern(
            types.get_split_uri(input_dict['transformed_examples'], 'eval'))
        schema_file = io_utils.get_only_uri_in_dir(
            types.get_single_uri(input_dict['schema']))

        train_args = trainer_pb2.TrainArgs()
        eval_args = trainer_pb2.EvalArgs()
        json_format.Parse(exec_properties['train_args'], train_args)
        json_format.Parse(exec_properties['eval_args'], eval_args)

        # https://github.com/tensorflow/tfx/issues/45: Replace num_steps=0 with
        # num_steps=None.  Conversion of the proto to python will set the default
        # value of an int as 0 so modify the value here.  Tensorflow will raise an
        # error if num_steps <= 0.
        train_steps = train_args.num_steps or None
        eval_steps = eval_args.num_steps or None

        output_path = types.get_single_uri(output_dict['output'])
        serving_model_dir = path_utils.serving_model_dir(output_path)
        eval_model_dir = path_utils.eval_model_dir(output_path)

        # Assemble warm start path if needed.
        warm_start_from = None
        if exec_properties.get('warm_starting') and exec_properties.get(
                'warm_start_from'):
            previous_model_dir = os.path.join(
                exec_properties['warm_start_from'],
                path_utils.SERVING_MODEL_DIR)
            if previous_model_dir and tf.gfile.Exists(
                    os.path.join(previous_model_dir,
                                 self._CHECKPOINT_FILE_NAME)):
                warm_start_from = previous_model_dir

        # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
        hparams = tf.contrib.training.HParams(
            train_files=train_files,
            transform_output=transform_output,
            output_dir=output_path,
            serving_model_dir=serving_model_dir,
            eval_files=eval_files,
            schema_file=schema_file,
            train_steps=train_steps,
            eval_steps=eval_steps,
            warm_start_from=warm_start_from)

        schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())

        training_spec = trainer_fn(hparams, schema)

        # Train the model
        tf.logging.info('Training model.')
        tf.estimator.train_and_evaluate(training_spec['estimator'],
                                        training_spec['train_spec'],
                                        training_spec['eval_spec'])
        tf.logging.info('Training complete.  Model written to %s',
                        serving_model_dir)

        # Export an eval savedmodel for TFMA
        tf.logging.info('Exporting eval_savedmodel for TFMA.')
        tfma.export.export_eval_savedmodel(
            estimator=training_spec['estimator'],
            export_dir_base=eval_model_dir,
            eval_input_receiver_fn=training_spec['eval_input_receiver_fn'])

        tf.logging.info('Exported eval_savedmodel to %s.', eval_model_dir)
Example #20
File: executor.py Project: luvneries/tfx
  def Do(self, input_dict,
         output_dict,
         exec_properties):
    """TensorFlow Transform executor entrypoint.

    This implements BaseExecutor.Do() and is invoked by orchestration systems.
    It is not intended for manual use or further customization. Please use
    the Transform() function, which takes an input format with no artifact
    dependency.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - input_data: A list of 'ExamplesPath' type which should contain two
          splits 'train' and 'eval'.
        - schema: A list of 'SchemaPath' type which should contain a single
          schema artifact.
      output_dict: Output dict from key to a list of artifacts, including:
        - transform_output: Output of 'tf.Transform', which includes an exported
          Tensorflow graph suitable for both training and serving;
        - transformed_examples: Materialized transformed examples, which
          includes both 'train' and 'eval' splits.
      exec_properties: A dict of execution properties, including:
        - module_file: The file path to a python module file, from which the
          'preprocessing_fn' function will be loaded.

    Returns:
      None
    """
    self._log_startup(input_dict, output_dict, exec_properties)
    train_data_uri = types.get_split_uri(input_dict['input_data'], 'train')
    eval_data_uri = types.get_split_uri(input_dict['input_data'], 'eval')
    schema_file = io_utils.get_only_uri_in_dir(
        types.get_single_uri(input_dict['schema']))

    transform_output = types.get_single_uri(output_dict['transform_output'])
    if tf.gfile.Exists(transform_output):
      io_utils.delete_dir(transform_output)

    transformed_train_output = types.get_split_uri(
        output_dict['transformed_examples'], 'train')
    if tf.gfile.Exists(transformed_train_output):
      io_utils.delete_dir(transformed_train_output)

    transformed_eval_output = types.get_split_uri(
        output_dict['transformed_examples'], 'eval')
    if tf.gfile.Exists(transformed_eval_output):
      io_utils.delete_dir(transformed_eval_output)

    temp_path = os.path.join(transform_output, _TEMP_DIR_IN_TRANSFORM_OUTPUT)
    tf.logging.debug('Using temp path %s for tft.beam', temp_path)

    label_inputs = {
        labels.COMPUTE_STATISTICS_LABEL:
            False,
        labels.SCHEMA_PATH_LABEL:
            schema_file,
        labels.EXAMPLES_DATA_FORMAT_LABEL:
            labels.FORMAT_TF_EXAMPLE,
        labels.ANALYZE_AND_TRANSFORM_DATA_PATHS_LABEL:
            io_utils.all_files_pattern(train_data_uri),
        labels.TRANSFORM_ONLY_DATA_PATHS_LABEL:
            io_utils.all_files_pattern(eval_data_uri),
        labels.TFT_STATISTICS_USE_TFDV_LABEL:
            True,
        labels.PREPROCESSING_FN:
            exec_properties['module_file'],
    }

    label_outputs = {
        labels.TRANSFORM_METADATA_OUTPUT_PATH_LABEL: transform_output,
        labels.TRANSFORM_MATERIALIZE_OUTPUT_PATHS_LABEL: [
            os.path.join(transformed_train_output,
                         _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX),
            os.path.join(transformed_eval_output,
                         _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX),
        ],
        labels.TEMP_OUTPUT_LABEL: str(temp_path),
    }
    status_file = 'status_file'  # Unused
    self.Transform(label_inputs, label_outputs, status_file)
    tf.logging.info('Cleaning up temp path %s on executor success', temp_path)
    io_utils.delete_dir(temp_path)
Example #21
File: executor.py Project: luvneries/tfx
  def Do(self, input_dict,
         output_dict,
         exec_properties):
    """Push model to target if blessed.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - model_export: exported model from trainer.
        - model_blessing: model blessing path from model_validator.
      output_dict: Output dict from key to a list of artifacts, including:
        - model_push: A list of 'ModelPushPath' artifact of size one. It will
          include the model in this push execution if the model was pushed.
      exec_properties: A dict of execution properties, including:
        - push_destination: JSON string of pusher_pb2.PushDestination instance,
          providing instruction of destination to push model.

    Returns:
      None
    """
    self._log_startup(input_dict, output_dict, exec_properties)
    model_export = types.get_single_instance(input_dict['model_export'])
    model_export_uri = model_export.uri
    model_blessing_uri = types.get_single_uri(input_dict['model_blessing'])
    model_push = types.get_single_instance(output_dict['model_push'])
    model_push_uri = model_push.uri
    # TODO(jyzhao): should this be in driver or executor.
    if not tf.gfile.Exists(os.path.join(model_blessing_uri, 'BLESSED')):
      model_push.set_int_custom_property('pushed', 0)
      tf.logging.info('Model on %s was not blessed', model_blessing_uri)
      return
    tf.logging.info('Model pushing.')
    # Copy the model being pushed into the push output directory.
    model_path = path_utils.serving_model_path(model_export_uri)
    # Note: we do not have a logical model version right now. This
    # model_version is a timestamp mapped to trainer's exporter.
    model_version = os.path.basename(model_path)
    tf.logging.info('Model version is %s', model_version)
    io_utils.copy_dir(model_path, os.path.join(model_push_uri, model_version))
    tf.logging.info('Model written to %s.', model_push_uri)

    # Copy to a fixed outside path, which can be watched by a model server.
    #
    # If the model was already successfully copied outside before, stop
    # copying. This is because the model validator might bless the same model
    # twice (check the mv driver) with different blessing outputs; we still
    # want Pusher to handle the mv output again to keep metadata tracking,
    # but there is no need to copy to the outside path again.
    # TODO(jyzhao): support rpc push and verification.
    push_destination = pusher_pb2.PushDestination()
    json_format.Parse(exec_properties['push_destination'], push_destination)
    serving_path = os.path.join(push_destination.filesystem.base_directory,
                                model_version)
    if tf.gfile.Exists(serving_path):
      tf.logging.info(
          'Destination directory %s already exists, skipping current push.',
          serving_path)
    else:
      # tf.serving won't load partial model, it will retry until fully copied.
      io_utils.copy_dir(model_path, serving_path)
      tf.logging.info('Model written to serving path %s.', serving_path)

    model_push.set_int_custom_property('pushed', 1)
    model_push.set_string_custom_property('pushed_model', model_export_uri)
    model_push.set_int_custom_property('pushed_model_id', model_export.id)
    tf.logging.info('Model pushed to %s.', serving_path)

    if exec_properties.get('custom_config'):
      cmle_serving_args = exec_properties.get('custom_config',
                                              {}).get('cmle_serving_args')
      if cmle_serving_args is not None:
        return cmle_runner.deploy_model_for_serving(serving_path, model_version,
                                                    cmle_serving_args,
                                                    exec_properties['log_root'])
Example #22
  def Do(self, input_dict: Dict[Text, List[types.TfxArtifact]],
         output_dict: Dict[Text, List[types.TfxArtifact]],
         exec_properties: Dict[Text, Any]) -> None:
    input_path = types.get_single_uri(input_dict['input'])
    output_path = types.get_single_uri(output_dict['output'])
    tf.gfile.Copy(input_path, output_path)
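
Every `Do` above shares the same calling convention. Below is a minimal hand-driven invocation of this pass-through executor, assuming the method belongs to a class named Executor and that artifacts are constructed as in the test snippets above; the type name and paths are illustrative.

# In practice an orchestrator populates these dicts from ML Metadata.
input_artifact = types.TfxArtifact('ExternalPath')
input_artifact.uri = '/tmp/pass_through/input.txt'
output_artifact = types.TfxArtifact('ExternalPath')
output_artifact.uri = '/tmp/pass_through/output.txt'

executor = Executor()
executor.Do(
    input_dict={'input': [input_artifact]},
    output_dict={'output': [output_artifact]},
    exec_properties={})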
Example #23
    def Do(self, input_dict, output_dict, exec_properties):
        """TensorFlow Transform executor entrypoint.

    This implements BaseExecutor.Do() and is invoked by orchestration systems.
    It is not intended for manual use or further customization. Please use
    the Transform() function, which takes an input format with no artifact
    dependency.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - input_data: A list of 'ExamplesPath' type which should contain two
          splits 'train' and 'eval'.
        - schema: A list of 'SchemaPath' type which should contain a single
          schema artifact.
      output_dict: Output dict from key to a list of artifacts, including:
        - transform_output: Output of 'tf.Transform', which includes an exported
          Tensorflow graph suitable for both training and serving;
        - transformed_examples: Materialized transformed examples, which
          includes both 'train' and 'eval' splits.
      exec_properties: A dict of execution properties, including:
        - module_file: The file path to a python module file, from which the
          'preprocessing_fn' function will be loaded.

    Returns:
      None
    """
        self._log_startup(input_dict, output_dict, exec_properties)
        train_data_uri = types.get_split_uri(input_dict['input_data'], 'train')
        eval_data_uri = types.get_split_uri(input_dict['input_data'], 'eval')
        schema_file = io_utils.get_only_uri_in_dir(
            types.get_single_uri(input_dict['schema']))

        transform_output = types.get_single_uri(
            output_dict['transform_output'])
        if tf.gfile.Exists(transform_output):
            io_utils.delete_dir(transform_output)

        transformed_train_output = types.get_split_uri(
            output_dict['transformed_examples'], 'train')
        if tf.gfile.Exists(transformed_train_output):
            io_utils.delete_dir(transformed_train_output)

        transformed_eval_output = types.get_split_uri(
            output_dict['transformed_examples'], 'eval')
        if tf.gfile.Exists(transformed_eval_output):
            io_utils.delete_dir(transformed_eval_output)

        temp_path = os.path.join(transform_output,
                                 _TEMP_DIR_IN_TRANSFORM_OUTPUT)
        tf.logging.debug('Using temp path %s for tft.beam', temp_path)

        label_inputs = {
            labels.COMPUTE_STATISTICS_LABEL:
            False,
            labels.SCHEMA_PATH_LABEL:
            schema_file,
            labels.EXAMPLES_DATA_FORMAT_LABEL:
            labels.FORMAT_TF_EXAMPLE,
            labels.ANALYZE_AND_TRANSFORM_DATA_PATHS_LABEL:
            io_utils.all_files_pattern(train_data_uri),
            labels.TRANSFORM_ONLY_DATA_PATHS_LABEL:
            io_utils.all_files_pattern(eval_data_uri),
            labels.TFT_STATISTICS_USE_TFDV_LABEL:
            True,
            labels.PREPROCESSING_FN:
            exec_properties['module_file'],
        }

        label_outputs = {
            labels.TRANSFORM_METADATA_OUTPUT_PATH_LABEL:
            transform_output,
            labels.TRANSFORM_MATERIALIZE_OUTPUT_PATHS_LABEL: [
                os.path.join(transformed_train_output,
                             _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX),
                os.path.join(transformed_eval_output,
                             _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX),
            ],
            labels.TEMP_OUTPUT_LABEL:
            str(temp_path),
        }
        status_file = 'status_file'  # Unused
        self.Transform(label_inputs, label_outputs, status_file)
        tf.logging.info('Cleaning up temp path %s on executor success',
                        temp_path)
        io_utils.delete_dir(temp_path)
Example #24
    def Do(self, input_dict: Dict[Text, List[types.TfxArtifact]],
           output_dict: Dict[Text, List[types.TfxArtifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Get human review result on a model through Slack channel.

        Args:
          input_dict: Input dict from input key to a list of artifacts, including:
            - model_export: exported model from trainer.
            - model_blessing: model blessing path from model_validator.
          output_dict: Output dict from key to a list of artifacts, including:
            - slack_blessing: model blessing result.
          exec_properties: A dict of execution properties, including:
            - slack_token: Token used to set up a connection with the Slack
              server.
            - channel_id: The id of the Slack channel in which to send and
              receive messages.
            - timeout_sec: How long to wait for a response, in seconds.

        Returns:
          None

        Raises:
          TimeoutError: When no decision is made within timeout_sec.
          ConnectionError: When a connection to the Slack server cannot be
            established.
        """
        self._log_startup(input_dict, output_dict, exec_properties)

        # Fetch execution properties from exec_properties dict.
        slack_token = exec_properties['slack_token']
        channel_id = exec_properties['channel_id']
        timeout_sec = exec_properties['timeout_sec']

        # Fetch input URIs from input_dict.
        model_export_uri = types.get_single_uri(input_dict['model_export'])
        model_blessing_uri = types.get_single_uri(input_dict['model_blessing'])

        # Fetch output artifact from output_dict.
        slack_blessing = types.get_single_instance(output_dict['slack_blessing'])

        # We only consider a model as blessed if both of the following conditions
        # are met:
        # - The model is blessed by model validator. This is determined by looking
        #   for file named 'BLESSED' from the output from Model Validator.
        # - The model is blessed by a human reviewer. This logic is in
        #   _fetch_slack_blessing().
        slack_response = None
        with Timeout(timeout_sec):
            if tf.gfile.Exists(os.path.join(model_blessing_uri, 'BLESSED')):
                slack_response = self._fetch_slack_blessing(slack_token, channel_id,
                                                            model_export_uri)

        # If model is blessed, write an empty file named 'BLESSED' in the assigned
        # output path. Otherwise, write an empty file named 'NOT_BLESSED' instead.
        if slack_response and slack_response.approved:
            io_utils.write_string_file(
                os.path.join(slack_blessing.uri, 'BLESSED'), '')
            slack_blessing.set_int_custom_property('blessed', 1)
        else:
            io_utils.write_string_file(
                os.path.join(slack_blessing.uri, 'NOT_BLESSED'), '')
            slack_blessing.set_int_custom_property('blessed', 0)
        if slack_response:
            slack_blessing.set_string_custom_property('slack_decision_maker',
                                                      slack_response.user_id)
            slack_blessing.set_string_custom_property('slack_decision_message',
                                                      slack_response.message)
            slack_blessing.set_string_custom_property('slack_decision_channel',
                                                      slack_response.channel_id)
            slack_blessing.set_string_custom_property('slack_decision_thread',
                                                      slack_response.thread_ts)
        tf.logging.info('Blessing result written to %s.', slack_blessing.uri)