Example #1
  def _save_pipeline(self, pipeline_args: Dict[str, Any]) -> None:
    """Creates/updates pipeline folder in the handler directory.

    Args:
      pipeline_args: Pipeline details obtained from DSL.
    """
    pipeline_name = pipeline_args[labels.PIPELINE_NAME]
    handler_pipeline_path = self._get_pipeline_info_path(pipeline_name)

    # If updating pipeline, first delete pipeline directory.
    if fileio.exists(handler_pipeline_path):
      io_utils.delete_dir(handler_pipeline_path)

    # Dump pipeline_args to handler pipeline folder as json.
    fileio.makedirs(handler_pipeline_path)
    with open(os.path.join(
        handler_pipeline_path, 'pipeline_args.json'), 'w') as f:
      json.dump(pipeline_args, f)

    # Copy dsl to pipeline folder
    pipeline_dsl_path = self.flags_dict[labels.PIPELINE_DSL_PATH]
    io_utils.copy_file(
        pipeline_dsl_path,
        os.path.join(handler_pipeline_path,
                     os.path.basename(pipeline_dsl_path)))
Example #2
    def _save_pipeline(self, pipeline_args: Dict[Text, Any]) -> None:
        """Creates/updates pipeline folder in the handler directory.

    Args:
      pipeline_args: Pipeline details obtained from DSL.
    """
        # Path to pipeline folder in Airflow.
        handler_pipeline_path = os.path.join(
            self._handler_home_dir, pipeline_args[labels.PIPELINE_NAME], '')

        # If updating pipeline, first delete pipeline directory.
        if tf.io.gfile.exists(handler_pipeline_path):
            io_utils.delete_dir(handler_pipeline_path)

        # Dump pipeline_args to handler pipeline folder as json.
        tf.io.gfile.makedirs(handler_pipeline_path)
        with open(os.path.join(handler_pipeline_path, 'pipeline_args.json'),
                  'w') as f:
            json.dump(pipeline_args, f)

        # Copy dsl to pipeline folder
        pipeline_dsl_path = self.flags_dict[labels.PIPELINE_DSL_PATH]
        io_utils.copy_file(
            pipeline_dsl_path,
            os.path.join(handler_pipeline_path,
                         os.path.basename(pipeline_dsl_path)))
Example #3
  def _save_pipeline(self, pipeline_args) -> None:
    """Creates/updates pipeline folder in the handler directory."""

    # Path to pipeline folder in airflow.
    handler_pipeline_path = self._get_handler_pipeline_path(
        pipeline_args[labels.PIPELINE_NAME])

    # If updating pipeline, first delete pipeline directory.
    if tf.io.gfile.exists(handler_pipeline_path):
      io_utils.delete_dir(handler_pipeline_path)

    # Dump pipeline_args to handler pipeline folder as json.
    tf.io.gfile.makedirs(handler_pipeline_path)
    with open(os.path.join(
        handler_pipeline_path, 'pipeline_args.json'), 'w') as f:
      json.dump(pipeline_args, f)

    # Copy dsl to pipeline folder
    io_utils.copy_file(
        self.flags_dict[labels.PIPELINE_DSL_PATH],
        os.path.join(
            handler_pipeline_path,
            os.path.basename(self.flags_dict[labels.PIPELINE_DSL_PATH])
            )
        )
Example #4
  def delete_pipeline(self) -> None:
    """Delete pipeline in Kubeflow."""
    try:
      # Check if pipeline exists.
      pipeline_id = self._get_pipeline_id(self.flags_dict[labels.PIPELINE_NAME])
      self._client._pipelines_api.get_pipeline(pipeline_id)  # pylint: disable=protected-access

      # Delete pipeline from the KFP server.
      self._client._pipelines_api.delete_pipeline(id=pipeline_id)  # pylint: disable=protected-access

      # Delete experiment from server.
      experiment_id = self._client.get_experiment(
          experiment_name=self.flags_dict[labels.PIPELINE_NAME]).id
      self._client._experiment_api.delete_experiment(experiment_id)  # pylint: disable=protected-access

    except kfp_server_api.rest.ApiException as err:
      sys.exit(self._print_error(err))

    # Path to pipeline folder.
    handler_pipeline_path = self._get_handler_pipeline_path(
        self.flags_dict[labels.PIPELINE_NAME])

    # Delete pipeline from the home directory.
    io_utils.delete_dir(handler_pipeline_path)

    click.echo('Pipeline ' + self.flags_dict[labels.PIPELINE_NAME] +
               ' deleted successfully.')
Example #5
    def _extract_pipeline_args(self) -> Dict[Text, Any]:
        """Get pipeline args from the DSL."""
        if os.path.isdir(self.flags_dict[labels.PIPELINE_DSL_PATH]):
            sys.exit('Provide dsl file path.')

        # Create an environment for subprocess.
        temp_env = os.environ.copy()

        # Create temp file to store pipeline_args from pipeline dsl.
        temp_file = tempfile.mkstemp(prefix='cli_tmp_',
                                     suffix='_pipeline_args')[1]

        # Store temp_file path in temp_env.
        temp_env[labels.TFX_JSON_EXPORT_PIPELINE_ARGS_PATH] = temp_file

        # Run dsl with mock environment to store pipeline args in temp_file.
        subprocess.call(['python', self.flags_dict[labels.PIPELINE_DSL_PATH]],
                        env=temp_env)
        if os.stat(temp_file).st_size != 0:
            # Load pipeline_args from temp_file for TFX pipelines
            with open(temp_file, 'r') as f:
                pipeline_args = json.load(f)
        else:
            # For non-TFX pipelines, extract pipeline name from the dsl filename.
            pipeline_args = {
                labels.PIPELINE_NAME:
                os.path.basename(
                    self.flags_dict[labels.PIPELINE_DSL_PATH]).split('.')[0]
            }

        # Delete temp file
        io_utils.delete_dir(temp_file)

        return pipeline_args
Example #6
  def testDeleteDir(self):
    self.createFiles({
        'dir': {
            'file.txt': 'testing'
        }
    })
    io_utils.delete_dir(self.relpath('dir'))
    self.assertDirectoryEqual(self._base_dir, {})
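
A note on the helper itself: every snippet in this listing relies on io_utils.delete_dir. As a rough orientation, a minimal sketch of such a helper might look like the following, assuming it is a thin wrapper over TFX's fileio filesystem abstraction (the real implementation may differ):

from tfx.dsl.io import fileio


def delete_dir(path: str) -> None:
  """Deletes the directory at `path` if it exists; otherwise does nothing."""
  # Sketch only: assumes fileio.isdir/fileio.rmtree semantics; the actual
  # io_utils helper may handle errors or non-directory paths differently.
  if fileio.isdir(path):
    fileio.rmtree(path)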
Example #7
  def delete_pipeline(self) -> None:
    """Delete pipeline in the environment."""
    pipeline_name = self.flags_dict[labels.PIPELINE_NAME]
    self._check_pipeline_existence(pipeline_name)

    io_utils.delete_dir(os.path.join(self._handler_home_dir, pipeline_name))

    click.echo('Pipeline ' + pipeline_name + ' deleted successfully.')
Example #8
  def delete_pipeline(self) -> None:
    """Delete pipeline in Airflow."""
    pipeline_name = self.flags_dict[labels.PIPELINE_NAME]

    # Check if pipeline exists.
    self._check_pipeline_existence(pipeline_name)

    # Delete pipeline folder.
    io_utils.delete_dir(self._get_pipeline_info_path(pipeline_name))
    click.echo('Pipeline "{}" deleted successfully.'.format(pipeline_name))
Example #9
  def delete_pipeline(self) -> None:
    """Delete pipeline in Airflow."""
    # Path to pipeline folder in airflow.
    handler_pipeline_path = self._get_handler_pipeline_path(
        self.flags_dict[labels.PIPELINE_NAME])

    # Check if pipeline exists.
    if not tf.io.gfile.exists(handler_pipeline_path):
      sys.exit('Pipeline {} does not exist.'
               .format(self.flags_dict[labels.PIPELINE_NAME]))

    # Delete pipeline folder.
    io_utils.delete_dir(handler_pipeline_path)
Example #10
  def _save_pipeline(self, pipeline_args: Dict[Text, Any]) -> None:
    """Creates/updates pipeline folder in the handler directory."""
    pipeline_name = pipeline_args[labels.PIPELINE_NAME]

    # Path to pipeline folder.
    handler_pipeline_path = os.path.join(self._handler_home_dir, pipeline_name,
                                         '')

    # When updating a pipeline, delete it from the server and the home dir.
    if tf.io.gfile.exists(handler_pipeline_path):

      # Delete pipeline from the KFP server.
      pipeline_id = self._get_pipeline_id(pipeline_name)

      try:
        self._client._pipelines_api.delete_pipeline(id=pipeline_id)  # pylint: disable=protected-access
      except kfp_server_api.rest.ApiException as err:
        sys.exit(self._print_error(err))

      # Delete pipeline from the home directory.
      io_utils.delete_dir(handler_pipeline_path)

    pipeline_package_path = self.flags_dict[labels.PIPELINE_PACKAGE_PATH]
    try:
      # Now upload pipeline to server.
      upload_response = self._client.upload_pipeline(
          pipeline_package_path=pipeline_package_path,
          pipeline_name=pipeline_name)
      click.echo(upload_response)

      # Create experiment with pipeline name as experiment name.
      experiment_name = pipeline_name
      experiment_id = self._client.create_experiment(experiment_name).id

    except kfp_server_api.rest.ApiException as err:
      sys.exit(self._print_error(err))

    # Add pipeline details to pipeline_args.
    pipeline_args[labels.PIPELINE_NAME] = upload_response.name
    pipeline_args[labels.PIPELINE_ID] = upload_response.id
    pipeline_args[labels.PIPELINE_PACKAGE_PATH] = pipeline_package_path
    pipeline_args[labels.EXPERIMENT_ID] = experiment_id

    # Path to pipeline_args.json.
    pipeline_args_path = os.path.join(handler_pipeline_path,
                                      'pipeline_args.json')

    # Copy pipeline_args to pipeline folder.
    tf.io.gfile.makedirs(handler_pipeline_path)
    with open(pipeline_args_path, 'w') as f:
      json.dump(pipeline_args, f)
Example #11
    def _save_pipeline(self, pipeline_args: Dict[Text, Any]) -> None:
        """Creates/updates pipeline folder in the handler directory."""
        pipeline_name = pipeline_args[labels.PIPELINE_NAME]

        # Path to pipeline folder.
        handler_pipeline_path = os.path.join(self._handler_home_dir,
                                             pipeline_name, '')

        # When updating a pipeline, delete it from the server and the home dir.
        if tf.io.gfile.exists(handler_pipeline_path):

            # Delete pipeline from the KFP server.
            pipeline_id = self._get_pipeline_id(pipeline_name)

            self._client._pipelines_api.delete_pipeline(id=pipeline_id)  # pylint: disable=protected-access

            # Delete pipeline from the home directory.
            io_utils.delete_dir(handler_pipeline_path)

        pipeline_package_path = self.flags_dict[labels.PIPELINE_PACKAGE_PATH]

        # Now upload pipeline to server.
        upload_response = self._client.upload_pipeline(
            pipeline_package_path=pipeline_package_path,
            pipeline_name=pipeline_name)

        # Display the link to the pipeline detail page in KFP UI.
        click.echo(upload_response)
        click.echo('Please access the pipeline detail page at '
                   '{prefix}/#/pipelines/details/{pipeline_id}'.format(
                       prefix=self._client._get_url_prefix(),  # pylint: disable=protected-access
                       pipeline_id=upload_response.id))

        # Create experiment with pipeline name as experiment name.
        experiment_name = pipeline_name
        experiment_id = self._client.create_experiment(experiment_name).id

        # Add pipeline details to pipeline_args.
        pipeline_args[labels.PIPELINE_NAME] = upload_response.name
        pipeline_args[labels.PIPELINE_ID] = upload_response.id
        pipeline_args[labels.PIPELINE_PACKAGE_PATH] = pipeline_package_path
        pipeline_args[labels.EXPERIMENT_ID] = experiment_id

        # Path to pipeline_args.json.
        pipeline_args_path = os.path.join(handler_pipeline_path,
                                          'pipeline_args.json')

        # Copy pipeline_args to pipeline folder.
        tf.io.gfile.makedirs(handler_pipeline_path)
        with open(pipeline_args_path, 'w') as f:
            json.dump(pipeline_args, f)
Example #12
    def delete_pipeline(self) -> None:
        """Delete pipeline in Airflow."""
        pipeline_name = self.flags_dict[labels.PIPELINE_NAME]

        # Path to pipeline folder.
        handler_pipeline_path = os.path.join(self._handler_home_dir,
                                             pipeline_name, '')

        # Check if pipeline exists.
        self._check_pipeline_existence(pipeline_name)

        # Delete pipeline folder.
        io_utils.delete_dir(handler_pipeline_path)
        click.echo('Pipeline "{}" deleted successfully.'.format(pipeline_name))
Example #13
    def _prepare_pipeline_dir(self, pipeline_name: str, required: bool) -> str:
        """Create a directory for pipeline definition in the handler directory."""

        self._check_pipeline_existence(pipeline_name, required)

        handler_pipeline_path = self._get_pipeline_dir(pipeline_name)

        # If updating pipeline, first delete the pipeline directory.
        if fileio.exists(handler_pipeline_path):
            io_utils.delete_dir(handler_pipeline_path)

        fileio.makedirs(handler_pipeline_path)

        # pipeline.json will be stored in KubeflowV2DagRunner.run().
        return handler_pipeline_path
Example #14
    def delete_pipeline(self) -> None:
        """Delete pipeline in the environment."""

        pipeline_name = self.flags_dict[labels.PIPELINE_NAME]

        # Path to pipeline folder.
        handler_pipeline_path = os.path.join(self._handler_home_dir,
                                             pipeline_name, '')
        # Check if pipeline exists.
        self._check_pipeline_existence(pipeline_name)

        # Delete pipeline from the home directory.
        io_utils.delete_dir(handler_pipeline_path)

        click.echo('Pipeline ' + pipeline_name + ' deleted successfully.')
Example #15
    def delete_pipeline(self) -> None:
        """Deletes pipeline in Beam."""
        # Path to pipeline folder.
        handler_pipeline_path = self._get_handler_pipeline_path(
            self.flags_dict[labels.PIPELINE_NAME])

        # Check if pipeline exists.
        if not tf.io.gfile.exists(handler_pipeline_path):
            sys.exit('Pipeline "{}" does not exist.'.format(
                self.flags_dict[labels.PIPELINE_NAME]))

        # Delete pipeline folder.
        io_utils.delete_dir(handler_pipeline_path)
        click.echo('Pipeline "{}" deleted successfully.'.format(
            self.flags_dict[labels.PIPELINE_NAME]))
Example #16
    def _extract_pipeline_args(self) -> Dict[Text, Any]:
        """Get pipeline args from the DSL.

    Returns:
      Python dictionary with pipeline details extracted from DSL.
    """
        # TODO(b/157599419): Consider using a better way to extract pipeline info:
        # e.g. pipeline name/root. Currently we rely on consulting an env var when
        # creating the Pipeline object, which is brittle.
        pipeline_dsl_path = self.flags_dict[labels.PIPELINE_DSL_PATH]
        if os.path.isdir(pipeline_dsl_path):
            sys.exit('Provide dsl file path.')

        # Create an environment for subprocess.
        temp_env = os.environ.copy()

        # Create temp file to store pipeline_args from pipeline dsl.
        temp_file = tempfile.mkstemp(prefix='cli_tmp_',
                                     suffix='_pipeline_args')[1]

        # Store temp_file path in temp_env.
        # LINT.IfChange
        temp_env[labels.TFX_JSON_EXPORT_PIPELINE_ARGS_PATH] = temp_file
        # LINT.ThenChange(
        #     ../../../orchestration/beam/beam_dag_runner.py,
        #     ../../../orchestration/local/local_dag_runner.py,
        #     ../../../orchestration/portable/beam_dag_runner.py,
        # )

        # Run dsl with mock environment to store pipeline args in temp_file.
        self._subprocess_call([sys.executable, pipeline_dsl_path],
                              env=temp_env)
        if os.stat(temp_file).st_size != 0:
            # Load pipeline_args from temp_file for TFX pipelines
            with open(temp_file, 'r') as f:
                pipeline_args = json.load(f)
        else:
            # For non-TFX pipelines, extract pipeline name from the dsl filename.
            pipeline_args = {
                labels.PIPELINE_NAME:
                os.path.basename(pipeline_dsl_path).split('.')[0]
            }

        # Delete temp file
        io_utils.delete_dir(temp_file)

        return pipeline_args
Example #17
    def _save_pipeline(self, pipeline_args: Dict[Text, Any]) -> None:
        """Creates/updates pipeline folder in the handler directory."""
        # Add pipeline dsl path to pipeline args.
        pipeline_args[labels.PIPELINE_DSL_PATH] = self.flags_dict[
            labels.PIPELINE_DSL_PATH]

        # Path to pipeline folder in beam.
        handler_pipeline_path = os.path.join(
            self._handler_home_dir, pipeline_args[labels.PIPELINE_NAME], '')

        # If updating pipeline, first delete pipeline directory.
        if tf.io.gfile.exists(handler_pipeline_path):
            io_utils.delete_dir(handler_pipeline_path)

        # Dump pipeline_args to handler pipeline folder as json.
        tf.io.gfile.makedirs(handler_pipeline_path)
        with open(os.path.join(handler_pipeline_path, 'pipeline_args.json'),
                  'w') as f:
            json.dump(pipeline_args, f)
Example #18
    def _extract_pipeline_args(self) -> Dict[Text, Any]:
        """Get pipeline args from the DSL.

    Returns:
      Python dictionary with pipeline details extracted from DSL.
    """
        pipeline_dsl_path = self.flags_dict[labels.PIPELINE_DSL_PATH]
        if os.path.isdir(pipeline_dsl_path):
            sys.exit('Provide dsl file path.')

        # Create an environment for subprocess.
        temp_env = os.environ.copy()

        # Create temp file to store pipeline_args from pipeline dsl.
        temp_file = tempfile.mkstemp(prefix='cli_tmp_',
                                     suffix='_pipeline_args')[1]

        # Store temp_file path in temp_env.
        temp_env[labels.TFX_JSON_EXPORT_PIPELINE_ARGS_PATH] = temp_file

        # Mark the SDK environment if not in a template.
        if 'pipelines.kubeflow.org/pipeline-sdk-type' not in temp_env:
            temp_env['pipelines.kubeflow.org/pipeline-sdk-type'] = 'tfx-cli'

        # Run dsl with mock environment to store pipeline args in temp_file.
        self._subprocess_call([sys.executable, pipeline_dsl_path],
                              env=temp_env)
        if os.stat(temp_file).st_size != 0:
            # Load pipeline_args from temp_file for TFX pipelines
            with open(temp_file, 'r') as f:
                pipeline_args = json.load(f)
        else:
            # For non-TFX pipelines, extract pipeline name from the dsl filename.
            pipeline_args = {
                labels.PIPELINE_NAME:
                os.path.basename(pipeline_dsl_path).split('.')[0]
            }

        # Delete temp file
        io_utils.delete_dir(temp_file)

        return pipeline_args
Example #19
    def _save_pipeline(self, pipeline_args: Dict[Text, Any]) -> None:
        """Creates/updates pipeline folder in the handler directory."""
        # Add pipeline dsl path to pipeline args.
        pipeline_args[labels.PIPELINE_DSL_PATH] = self.flags_dict[
            labels.PIPELINE_DSL_PATH]

        # Path to pipeline folder in beam.
        handler_pipeline_path = os.path.join(
            self._handler_home_dir, pipeline_args[labels.PIPELINE_NAME])

        # If updating pipeline, first delete the pipeline directory.
        if fileio.exists(handler_pipeline_path):
            io_utils.delete_dir(handler_pipeline_path)

        # TODO(b/157599419): Consider deprecating PipelineArgs.
        # Dump pipeline_args to handler pipeline folder as json.
        fileio.makedirs(handler_pipeline_path)
        with open(os.path.join(handler_pipeline_path, _PIPELINE_ARG_FILE),
                  'w') as f:
            json.dump(pipeline_args, f)
Example #20
    def _save_pipeline(self, pipeline_args: Dict[Text, Any]) -> None:
        """Creates/updates pipeline folder in the handler directory."""

        # Path to pipeline folder in Kubeflow.
        handler_pipeline_path = self._get_handler_pipeline_path(
            pipeline_args[labels.PIPELINE_NAME])

        # Path to pipeline_args.json.
        pipeline_args_path = os.path.join(handler_pipeline_path,
                                          'pipeline_args.json')

        # When updating a pipeline, delete it from the server and the home dir.
        if tf.io.gfile.exists(handler_pipeline_path):

            # Get pipeline_id from pipeline_args.json
            with open(pipeline_args_path, 'r') as f:
                pipeline_args = json.load(f)
            pipeline_id = pipeline_args[labels.PIPELINE_ID]

            # Delete pipeline from the home directory.
            io_utils.delete_dir(handler_pipeline_path)

            # Delete pipeline from the KFP server.
            self._client._pipelines_api.delete_pipeline(id=pipeline_id)  # pylint: disable=protected-access

        # Now upload pipeline to server.
        upload_response = self._client.upload_pipeline(
            pipeline_package_path=self.flags_dict[
                labels.PIPELINE_PACKAGE_PATH],
            pipeline_name=pipeline_args[labels.PIPELINE_NAME])
        click.echo(upload_response)

        # Add pipeline_id and pipeline_name to pipeline_args.
        pipeline_args[labels.PIPELINE_NAME] = upload_response.name
        pipeline_args[labels.PIPELINE_ID] = upload_response.id

        # Copy pipeline_args to pipeline folder.
        tf.io.gfile.makedirs(handler_pipeline_path)
        with open(pipeline_args_path, 'w') as f:
            json.dump(pipeline_args, f)
Example #21
    def delete_pipeline(self) -> None:
        """Delete pipeline in Kubeflow."""

        pipeline_name = self.flags_dict[labels.PIPELINE_NAME]
        # Check if pipeline exists on server.
        pipeline_id, experiment_id = self._get_pipeline_id_and_experiment_id(
            pipeline_name)
        self._client._pipelines_api.get_pipeline(pipeline_id)  # pylint: disable=protected-access

        # Delete pipeline from the KFP server.
        self._client._pipelines_api.delete_pipeline(id=pipeline_id)  # pylint: disable=protected-access

        # Delete experiment from server.
        self._client._experiment_api.delete_experiment(experiment_id)  # pylint: disable=protected-access

        # Path to pipeline folder.
        handler_pipeline_path = os.path.join(self._handler_home_dir,
                                             pipeline_name, '')

        # Delete pipeline from the home directory.
        io_utils.delete_dir(handler_pipeline_path)

        click.echo('Pipeline ' + pipeline_name + ' deleted successfully.')
Example #22
  def _delete_pipeline_output(self, pipeline_name: str):
    """Deletes output produced by the named pipeline."""
    io_utils.delete_dir(self._pipeline_root(pipeline_name))
Example #23
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Validate current model against last blessed model.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - examples: examples used to evaluate the model.
        - model: current model for validation.
      output_dict: Output dict from output key to a list of Artifacts.
        - blessing: model blessing result.
      exec_properties: A dict of execution properties.
        - blessed_model: last blessed model for validation.
        - blessed_model_id: last blessed model id.

    Returns:
      None
    """
        self._log_startup(input_dict, output_dict, exec_properties)
        self._temp_path = self._get_tmp_dir()
        absl.logging.info('Using temp path {} for tft.beam'.format(
            self._temp_path))

        eval_examples_uri = artifact_utils.get_split_uri(
            input_dict[constants.EXAMPLES_KEY], 'eval')
        blessing = artifact_utils.get_single_instance(
            output_dict[constants.BLESSING_KEY])

        # Current model to be validated.
        current_model = artifact_utils.get_single_instance(
            input_dict[constants.MODEL_KEY])
        absl.logging.info('Using {} as current model.'.format(
            current_model.uri))
        blessing.set_string_custom_property(
            constants.ARTIFACT_PROPERTY_CURRENT_MODEL_URI_KEY,
            current_model.uri)
        blessing.set_int_custom_property(
            constants.ARTIFACT_PROPERTY_CURRENT_MODEL_ID_KEY, current_model.id)

        # Denote model component_name.
        component_id = exec_properties['current_component_id']
        blessing.set_string_custom_property('component_id', component_id)

        # Previous blessed model to be validated against.
        blessed_model_dir = exec_properties['blessed_model']
        blessed_model_id = exec_properties['blessed_model_id']
        absl.logging.info(
            'Using {} as blessed model.'.format(blessed_model_dir))
        if blessed_model_dir:
            blessing.set_string_custom_property(
                constants.ARTIFACT_PROPERTY_BLESSED_MODEL_URI_KEY,
                blessed_model_dir)
            blessing.set_int_custom_property(
                constants.ARTIFACT_PROPERTY_BLESSED_MODEL_ID_KEY,
                blessed_model_id)

        absl.logging.info('Validating model.')
        # TODO(b/125853306): support customized slice spec.
        blessed = self._generate_blessing_result(
            eval_examples_uri=eval_examples_uri,
            slice_spec=[tfma.slicer.SingleSliceSpec()],
            current_model_dir=current_model.uri,
            blessed_model_dir=blessed_model_dir)

        if blessed:
            io_utils.write_string_file(
                os.path.join(blessing.uri, constants.BLESSED_FILE_NAME), '')
            blessing.set_int_custom_property(
                constants.ARTIFACT_PROPERTY_BLESSED_KEY,
                constants.BLESSED_VALUE)
        else:
            io_utils.write_string_file(
                os.path.join(blessing.uri, constants.NOT_BLESSED_FILE_NAME),
                '')
            blessing.set_int_custom_property(
                constants.ARTIFACT_PROPERTY_BLESSED_KEY,
                constants.NOT_BLESSED_VALUE)
        absl.logging.info('Blessing result {} written to {}.'.format(
            blessed, blessing.uri))

        io_utils.delete_dir(self._temp_path)
        absl.logging.info(
            'Cleaned up temp path {} on executor success.'.format(
                self._temp_path))
Example #24
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Validate current model against last blessed model.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - examples: examples used to evaluate the model.
        - model: current model for validation.
      output_dict: Output dict from output key to a list of Artifacts.
        - blessing: model blessing result.
      exec_properties: A dict of execution properties.
        - blessed_model: last blessed model for validation.
        - blessed_model_id: last blessed model id.

    Returns:
      None
    """
        self._log_startup(input_dict, output_dict, exec_properties)
        self._temp_path = self._get_tmp_dir()
        tf.logging.info('Using temp path {} for tft.beam'.format(
            self._temp_path))

        eval_examples_uri = artifact_utils.get_split_uri(
            input_dict['examples'], 'eval')
        blessing = artifact_utils.get_single_instance(output_dict['blessing'])

        # Current model.
        current_model = artifact_utils.get_single_instance(input_dict['model'])
        tf.logging.info('Using {} as current model.'.format(current_model.uri))
        blessing.set_string_custom_property('current_model', current_model.uri)
        blessing.set_int_custom_property('current_model_id', current_model.id)

        # Denote model component_name.
        component_id = exec_properties['component_id']
        blessing.set_string_custom_property('component_id', component_id)

        # Blessed model.
        blessed_model_dir = exec_properties['blessed_model']
        blessed_model_id = exec_properties['blessed_model_id']
        tf.logging.info('Using {} as blessed model.'.format(blessed_model_dir))
        if blessed_model_dir:
            blessing.set_string_custom_property('blessed_model',
                                                blessed_model_dir)
            blessing.set_int_custom_property('blessed_model_id',
                                             blessed_model_id)

        tf.logging.info('Validating model.')
        # TODO(b/125853306): support customized slice spec.
        blessed = self._generate_blessing_result(
            eval_examples_uri=eval_examples_uri,
            slice_spec=[tfma.slicer.slicer.SingleSliceSpec()],
            current_model_dir=current_model.uri,
            blessed_model_dir=blessed_model_dir)

        if blessed:
            io_utils.write_string_file(os.path.join(blessing.uri, 'BLESSED'),
                                       '')
            blessing.set_int_custom_property('blessed', 1)
        else:
            io_utils.write_string_file(
                os.path.join(blessing.uri, 'NOT_BLESSED'), '')
            blessing.set_int_custom_property('blessed', 0)
        tf.logging.info('Blessing result {} written to {}.'.format(
            blessed, blessing.uri))

        io_utils.delete_dir(self._temp_path)
        tf.logging.info('Cleaned up temp path {} on executor success.'.format(
            self._temp_path))
Example #25
  def testDeleteDir(self):
    file_path = os.path.join(self._base_dir, 'file', 'path')
    io_utils.write_string_file(file_path, 'testing')
    self.assertTrue(tf.gfile.Exists(file_path))
    io_utils.delete_dir(os.path.dirname(file_path))
    self.assertFalse(tf.gfile.Exists(file_path))
Example #26
  def Do(self, input_dict: Dict[Text, List[types.Artifact]],
         output_dict: Dict[Text, List[types.Artifact]],
         exec_properties: Dict[Text, Any]) -> None:
    """TensorFlow Transform executor entrypoint.

    This implements BaseExecutor.Do() and is invoked by orchestration systems.
    This is not intended for manual usage or further customization. Please use
    the Transform() function which takes an input format with no artifact
    dependency.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - input_data: A list of 'ExamplesPath' type which should contain two
          splits 'train' and 'eval'.
        - schema: A list of 'SchemaPath' type which should contain a single
          schema artifact.
      output_dict: Output dict from key to a list of artifacts, including:
        - transform_output: Output of 'tf.Transform', which includes an exported
          Tensorflow graph suitable for both training and serving;
        - transformed_examples: Materialized transformed examples, which
          includes both 'train' and 'eval' splits.
      exec_properties: A dict of execution properties, including:
        - module_file: The file path to a python module file, from which the
          'preprocessing_fn' function will be loaded.

    Returns:
      None
    """
    self._log_startup(input_dict, output_dict, exec_properties)
    train_data_uri = artifact_utils.get_split_uri(input_dict['input_data'],
                                                  'train')
    eval_data_uri = artifact_utils.get_split_uri(input_dict['input_data'],
                                                 'eval')
    schema_file = io_utils.get_only_uri_in_dir(
        artifact_utils.get_single_uri(input_dict['schema']))
    transform_output = artifact_utils.get_single_uri(
        output_dict['transform_output'])
    transformed_train_output = artifact_utils.get_split_uri(
        output_dict['transformed_examples'], 'train')
    transformed_eval_output = artifact_utils.get_split_uri(
        output_dict['transformed_examples'], 'eval')
    temp_path = os.path.join(transform_output, _TEMP_DIR_IN_TRANSFORM_OUTPUT)
    tf.logging.debug('Using temp path %s for tft.beam', temp_path)

    def _GetCachePath(label, params_dict):
      if label not in params_dict:
        return None
      else:
        return artifact_utils.get_single_uri(params_dict[label])

    label_inputs = {
        labels.COMPUTE_STATISTICS_LABEL:
            False,
        labels.SCHEMA_PATH_LABEL:
            schema_file,
        labels.EXAMPLES_DATA_FORMAT_LABEL:
            labels.FORMAT_TF_EXAMPLE,
        labels.ANALYZE_AND_TRANSFORM_DATA_PATHS_LABEL:
            io_utils.all_files_pattern(train_data_uri),
        labels.TRANSFORM_ONLY_DATA_PATHS_LABEL:
            io_utils.all_files_pattern(eval_data_uri),
        labels.TFT_STATISTICS_USE_TFDV_LABEL:
            True,
        labels.PREPROCESSING_FN:
            exec_properties['module_file'],
    }
    cache_input = _GetCachePath('cache_input_path', input_dict)
    if cache_input is not None:
      label_inputs[labels.CACHE_INPUT_PATH_LABEL] = cache_input

    label_outputs = {
        labels.TRANSFORM_METADATA_OUTPUT_PATH_LABEL: transform_output,
        labels.TRANSFORM_MATERIALIZE_OUTPUT_PATHS_LABEL: [
            os.path.join(transformed_train_output,
                         _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX),
            os.path.join(transformed_eval_output,
                         _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX),
        ],
        labels.TEMP_OUTPUT_LABEL: str(temp_path),
    }
    cache_output = _GetCachePath('cache_output_path', output_dict)
    if cache_output is not None:
      label_outputs[labels.CACHE_OUTPUT_PATH_LABEL] = cache_output
    status_file = 'status_file'  # Unused
    self.Transform(label_inputs, label_outputs, status_file)
    tf.logging.info('Cleaning up temp path %s on executor success', temp_path)
    io_utils.delete_dir(temp_path)
Example #27
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Contract for running InfraValidator Executor.

    Args:
      input_dict:
        - `model`: Single `Model` artifact that we're validating.
        - `examples`: `Examples` artifacts to be used for test requests.
      output_dict:
        - `blessing`: Single `InfraBlessing` artifact containing the validated
          result. It is an empty file with the name either of INFRA_BLESSED or
          INFRA_NOT_BLESSED.
      exec_properties:
        - `serving_spec`: Serialized `ServingSpec` configuration.
        - `validation_spec`: Serialized `ValidationSpec` configuration.
        - `request_spec`: Serialized `RequestSpec` configuration.
    """
        self._log_startup(input_dict, output_dict, exec_properties)

        model = artifact_utils.get_single_instance(input_dict['model'])
        blessing = artifact_utils.get_single_instance(output_dict['blessing'])

        serving_spec = infra_validator_pb2.ServingSpec()
        json_format.Parse(exec_properties['serving_spec'], serving_spec)
        if not serving_spec.model_name:
            serving_spec.model_name = _DEFAULT_MODEL_NAME

        validation_spec = infra_validator_pb2.ValidationSpec()
        if 'validation_spec' in exec_properties:
            json_format.Parse(exec_properties['validation_spec'],
                              validation_spec)
        if not validation_spec.num_tries:
            validation_spec.num_tries = _DEFAULT_NUM_TRIES
        if not validation_spec.max_loading_time_seconds:
            validation_spec.max_loading_time_seconds = _DEFAULT_MAX_LOADING_TIME_SEC

        if _is_query_mode(input_dict, exec_properties):
            logging.info('InfraValidator will be run in LOAD_AND_QUERY mode.')
            request_spec = infra_validator_pb2.RequestSpec()
            json_format.Parse(exec_properties['request_spec'], request_spec)
            examples = artifact_utils.get_single_instance(
                input_dict['examples'])
            requests = request_builder.build_requests(
                model_name=os.path.basename(
                    os.path.dirname(path_utils.serving_model_path(model.uri))),
                examples=examples,
                request_spec=request_spec)
        else:
            logging.info('InfraValidator will be run in LOAD_ONLY mode.')
            requests = []

        model_path = self._PrepareModelPath(model.uri, serving_spec)
        try:
            # TODO(jjong): Make logic parallel.
            all_passed = True
            for serving_binary in serving_bins.parse_serving_binaries(
                    serving_spec):
                all_passed &= self._ValidateWithRetry(
                    model_path=model_path,
                    serving_binary=serving_binary,
                    serving_spec=serving_spec,
                    validation_spec=validation_spec,
                    requests=requests)
        finally:
            io_utils.delete_dir(self._get_tmp_dir())

        if all_passed:
            _mark_blessed(blessing)
        else:
            _mark_not_blessed(blessing)
Example #28
    def testDeleteDir(self):
        file_path = os.path.join(self._base_dir, 'file', 'path')
        io_utils.write_string_file(file_path, 'testing')
        self.assertTrue(tf.gfile.Exists(file_path))
        io_utils.delete_dir(os.path.dirname(file_path))
        self.assertFalse(tf.gfile.Exists(file_path))
Example #29
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """TensorFlow Transform executor entrypoint.

    This implements BaseExecutor.Do() and is invoked by orchestration systems.
    This is not intended for manual usage or further customization. Please use
    the Transform() function which takes an input format with no artifact
    dependency.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - input_data: A list of type `standard_artifacts.Examples` which
          should contain two splits 'train' and 'eval'.
        - schema: A list of type `standard_artifacts.Schema` which should
          contain a single schema artifact.
      output_dict: Output dict from key to a list of artifacts, including:
        - transform_output: Output of 'tf.Transform', which includes an exported
          Tensorflow graph suitable for both training and serving;
        - transformed_examples: Materialized transformed examples, which
          includes both 'train' and 'eval' splits.
      exec_properties: A dict of execution properties, including either one of:
        - module_file: The file path to a python module file, from which the
          'preprocessing_fn' function will be loaded.
        - preprocessing_fn: The module path to a python function that
          implements 'preprocessing_fn'.

    Returns:
      None
    """
        self._log_startup(input_dict, output_dict, exec_properties)

        train_data_uri = artifact_utils.get_split_uri(input_dict[EXAMPLES_KEY],
                                                      'train')
        eval_data_uri = artifact_utils.get_split_uri(input_dict[EXAMPLES_KEY],
                                                     'eval')
        payload_format, data_view_uri = (
            tfxio_utils.resolve_payload_format_and_data_view_uri(
                input_dict[EXAMPLES_KEY]))
        schema_file = io_utils.get_only_uri_in_dir(
            artifact_utils.get_single_uri(input_dict[SCHEMA_KEY]))

        transform_graph_uri = artifact_utils.get_single_uri(
            input_dict[TRANSFORM_GRAPH_KEY])
        transform_output = artifact_utils.get_single_uri(
            output_dict[TRANSFORM_OUTPUT_KEY])

        temp_path = os.path.join(transform_output,
                                 _TEMP_DIR_IN_TRANSFORM_OUTPUT)
        absl.logging.debug('Using temp path %s for tft.beam', temp_path)

        materialize_output_paths = []
        if output_dict.get(TRANSFORMED_EXAMPLES_KEY) is not None:
            transformed_example_artifact = artifact_utils.get_single_instance(
                output_dict[TRANSFORMED_EXAMPLES_KEY])
            # TODO(b/161490287): move the split_names setting to executor for all
            # components.
            transformed_example_artifact.split_names = (
                artifact_utils.encode_split_names(
                    artifact.DEFAULT_EXAMPLE_SPLITS))
            transformed_train_output = artifact_utils.get_split_uri(
                output_dict[TRANSFORMED_EXAMPLES_KEY], 'train')
            transformed_eval_output = artifact_utils.get_split_uri(
                output_dict[TRANSFORMED_EXAMPLES_KEY], 'eval')
            materialize_output_paths = [
                os.path.join(transformed_train_output,
                             _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX),
                os.path.join(transformed_eval_output,
                             _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX)
            ]

        def _GetCachePath(label, params_dict):
            if label not in params_dict:
                return None
            else:
                return artifact_utils.get_single_uri(params_dict[label])

        label_inputs = {
            'transform_graph_uri':
            transform_graph_uri,
            labels.COMPUTE_STATISTICS_LABEL:
            False,
            labels.SCHEMA_PATH_LABEL:
            schema_file,
            labels.EXAMPLES_DATA_FORMAT_LABEL:
            payload_format,
            labels.DATA_VIEW_LABEL:
            data_view_uri,
            labels.ANALYZE_DATA_PATHS_LABEL:
            io_utils.all_files_pattern(train_data_uri),
            labels.ANALYZE_PATHS_FILE_FORMATS_LABEL:
            labels.FORMAT_TFRECORD,
            labels.TRANSFORM_DATA_PATHS_LABEL: [
                io_utils.all_files_pattern(train_data_uri),
                io_utils.all_files_pattern(eval_data_uri)
            ],
            labels.TRANSFORM_PATHS_FILE_FORMATS_LABEL:
            [labels.FORMAT_TFRECORD, labels.FORMAT_TFRECORD],
            labels.CUSTOM_CONFIG:
            exec_properties.get('custom_config', None),
        }
        cache_input = _GetCachePath('cache_input_path', input_dict)
        if cache_input is not None:
            label_inputs[labels.CACHE_INPUT_PATH_LABEL] = cache_input

        label_outputs = {
            labels.TRANSFORM_METADATA_OUTPUT_PATH_LABEL: transform_output,
            labels.TRANSFORM_MATERIALIZE_OUTPUT_PATHS_LABEL:
            materialize_output_paths,
            labels.TEMP_OUTPUT_LABEL: str(temp_path),
        }
        cache_output = _GetCachePath('cache_output_path', output_dict)
        if cache_output is not None:
            label_outputs[labels.CACHE_OUTPUT_PATH_LABEL] = cache_output
        status_file = 'status_file'  # Unused
        self.Transform(label_inputs, label_outputs, status_file)
        absl.logging.debug('Cleaning up temp path %s on executor success',
                           temp_path)
        io_utils.delete_dir(temp_path)
Example #30
  def Do(self, input_dict,
         output_dict,
         exec_properties):
    """TensorFlow Transform executor entrypoint.

    This implements BaseExecutor.Do() and is invoked by orchestration systems.
    This is not intended for manual usage or further customization. Please use
    the Transform() function which takes an input format with no artifact
    dependency.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - input_data: A list of 'ExamplesPath' type which should contain two
          splits 'train' and 'eval'.
        - schema: A list of 'SchemaPath' type which should contain a single
          schema artifact.
      output_dict: Output dict from key to a list of artifacts, including:
        - transform_output: Output of 'tf.Transform', which includes an exported
          Tensorflow graph suitable for both training and serving;
        - transformed_examples: Materialized transformed examples, which
          includes both 'train' and 'eval' splits.
      exec_properties: A dict of execution properties, including:
        - module_file: The file path to a python module file, from which the
          'preprocessing_fn' function will be loaded.

    Returns:
      None
    """
    self._log_startup(input_dict, output_dict, exec_properties)
    train_data_uri = types.get_split_uri(input_dict['input_data'], 'train')
    eval_data_uri = types.get_split_uri(input_dict['input_data'], 'eval')
    schema_file = io_utils.get_only_uri_in_dir(
        types.get_single_uri(input_dict['schema']))

    transform_output = types.get_single_uri(output_dict['transform_output'])
    if tf.gfile.Exists(transform_output):
      io_utils.delete_dir(transform_output)

    transformed_train_output = types.get_split_uri(
        output_dict['transformed_examples'], 'train')
    if tf.gfile.Exists(transformed_train_output):
      io_utils.delete_dir(transformed_train_output)

    transformed_eval_output = types.get_split_uri(
        output_dict['transformed_examples'], 'eval')
    if tf.gfile.Exists(transformed_eval_output):
      io_utils.delete_dir(transformed_eval_output)

    temp_path = os.path.join(transform_output, _TEMP_DIR_IN_TRANSFORM_OUTPUT)
    tf.logging.debug('Using temp path %s for tft.beam', temp_path)

    label_inputs = {
        labels.COMPUTE_STATISTICS_LABEL:
            False,
        labels.SCHEMA_PATH_LABEL:
            schema_file,
        labels.EXAMPLES_DATA_FORMAT_LABEL:
            labels.FORMAT_TF_EXAMPLE,
        labels.ANALYZE_AND_TRANSFORM_DATA_PATHS_LABEL:
            io_utils.all_files_pattern(train_data_uri),
        labels.TRANSFORM_ONLY_DATA_PATHS_LABEL:
            io_utils.all_files_pattern(eval_data_uri),
        labels.TFT_STATISTICS_USE_TFDV_LABEL:
            True,
        labels.PREPROCESSING_FN:
            exec_properties['module_file'],
    }

    label_outputs = {
        labels.TRANSFORM_METADATA_OUTPUT_PATH_LABEL: transform_output,
        labels.TRANSFORM_MATERIALIZE_OUTPUT_PATHS_LABEL: [
            os.path.join(transformed_train_output,
                         _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX),
            os.path.join(transformed_eval_output,
                         _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX),
        ],
        labels.TEMP_OUTPUT_LABEL: str(temp_path),
    }
    status_file = 'status_file'  # Unused
    self.Transform(label_inputs, label_outputs, status_file)
    tf.logging.info('Cleaning up temp path %s on executor success', temp_path)
    io_utils.delete_dir(temp_path)
Example #31
    def Do(self, input_dict: Dict[Text, List[Artifact]],
           output_dict: Dict[Text, List[Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        split_uris: List[Text] = []
        for artifact in input_dict[executor.EXAMPLES_KEY]:
            for split in artifact_utils.decode_split_names(
                    artifact.split_names):
                split_uris.append(split)

        self._log_startup(input_dict, output_dict, exec_properties)
        data_uris = []
        for split in split_uris:
            data_uris.append(
                artifact_utils.get_split_uri(input_dict[executor.EXAMPLES_KEY],
                                             split))

        schema_file = io_utils.get_only_uri_in_dir(
            artifact_utils.get_single_uri(input_dict[executor.SCHEMA_KEY]))
        transform_output = artifact_utils.get_single_uri(
            output_dict[executor.TRANSFORM_GRAPH_KEY])
        transformed_data_uris = []
        for split in split_uris:
            transformed_data_uris.append(
                artifact_utils.get_split_uri(
                    output_dict[executor.TRANSFORMED_EXAMPLES_KEY], split))
        temp_path = os.path.join(transform_output,
                                 executor._TEMP_DIR_IN_TRANSFORM_OUTPUT)
        logging.debug('Using temp path %s for tft.beam', temp_path)

        def _GetCachePath(label, params_dict):
            if label not in params_dict:
                return None
            else:
                return artifact_utils.get_single_uri(params_dict[label])

        label_inputs = {
            labels.COMPUTE_STATISTICS_LABEL:
            False,
            labels.SCHEMA_PATH_LABEL:
            schema_file,
            labels.EXAMPLES_DATA_FORMAT_LABEL:
            labels.FORMAT_TF_EXAMPLE,
            labels.ANALYZE_DATA_PATHS_LABEL:
            io_utils.all_files_pattern(data_uris[0]),
            labels.ANALYZE_PATHS_FILE_FORMATS_LABEL:
            labels.FORMAT_TFRECORD,
            labels.TRANSFORM_DATA_PATHS_LABEL:
            [io_utils.all_files_pattern(uri) for uri in data_uris],
            labels.TRANSFORM_PATHS_FILE_FORMATS_LABEL:
            [labels.FORMAT_TFRECORD for uri in data_uris],
            labels.TFT_STATISTICS_USE_TFDV_LABEL:
            True,
            labels.MODULE_FILE:
            exec_properties.get('module_file', None),
            labels.PREPROCESSING_FN:
            exec_properties.get('preprocessing_fn', None),
            # TODO(b/149754658): switch to True once the TFXIO integration is
            # complete.
            labels.USE_TFXIO_LABEL:
            False,
        }
        cache_input = _GetCachePath('cache_input_path', input_dict)
        if cache_input is not None:
            label_inputs[labels.CACHE_INPUT_PATH_LABEL] = cache_input

        label_outputs = {
            labels.TRANSFORM_METADATA_OUTPUT_PATH_LABEL:
            transform_output,
            labels.TRANSFORM_MATERIALIZE_OUTPUT_PATHS_LABEL: [
                os.path.join(uri,
                             executor._DEFAULT_TRANSFORMED_EXAMPLES_PREFIX)
                for uri in transformed_data_uris
            ],
            labels.TEMP_OUTPUT_LABEL:
            str(temp_path),
        }
        cache_output = _GetCachePath('cache_output_path', output_dict)
        if cache_output is not None:
            label_outputs[labels.CACHE_OUTPUT_PATH_LABEL] = cache_output
        status_file = 'status_file'  # Unused
        self.Transform(label_inputs, label_outputs, status_file)
        logging.debug('Cleaning up temp path %s on executor success',
                      temp_path)
        io_utils.delete_dir(temp_path)
Example #32
  def _delete_pipeline_data(self):
    path = f'gs://{self._BUCKET_NAME}/tfx_pipeline_output/{self._pipeline_name}'
    io_utils.delete_dir(path)
    path = (f'gs://{self._BUCKET_NAME}/{self._DATA_DIRECTORY_NAME}/'
            f'{self._pipeline_name}')
    io_utils.delete_dir(path)