Example #1
 def testScopedLabels(self):
     """Test for scoped_labels."""
     orig_labels = telemetry_utils.get_labels_dict()
     with telemetry_utils.scoped_labels({'foo': 'bar'}):
         self.assertDictEqual(telemetry_utils.get_labels_dict(),
                              dict({'foo': 'bar'}, **orig_labels))
         with telemetry_utils.scoped_labels({'inner': 'baz'}):
             self.assertDictEqual(
                 telemetry_utils.get_labels_dict(),
                 dict({
                     'foo': 'bar',
                     'inner': 'baz'
                 }, **orig_labels))
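
The assertions above describe the contract of scoped_labels: extra labels are layered over the current label dict and removed on exit, with nesting supported. As an illustrative sketch only (a minimal stand-in, not the actual tfx.utils.telemetry_utils implementation; the module-level _LABELS store is an assumption), such a context manager could look like:

import contextlib

# Hypothetical module-level label store; the real telemetry_utils also seeds
# default labels (e.g. the TFX version), which is why the test merges orig_labels.
_LABELS = {}


def get_labels_dict():
    """Return a copy of the currently active labels."""
    return dict(_LABELS)


@contextlib.contextmanager
def scoped_labels(extra_labels):
    """Temporarily overlay extra_labels on top of the active labels."""
    global _LABELS
    previous = _LABELS
    _LABELS = dict(previous, **extra_labels)
    try:
        yield
    finally:
        _LABELS = previous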
Example #2
 def testDoBlessed(self, mock_runner, _):
     self._model_blessing.uri = os.path.join(self._source_data_dir,
                                             'model_validator/blessed')
     self._model_blessing.set_int_custom_property('blessed', 1)
     mock_runner.get_service_name_and_api_version.return_value = ('ml',
                                                                  'v1')
     self._executor.Do(self._input_dict, self._output_dict,
                       self._serialize_custom_config_under_test())
     executor_class_path = '%s.%s' % (self._executor.__class__.__module__,
                                      self._executor.__class__.__name__)
     with telemetry_utils.scoped_labels(
         {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}):
         job_labels = telemetry_utils.get_labels_dict()
     mock_runner.deploy_model_for_aip_prediction.assert_called_once_with(
         mock.ANY,
         self._model_push.uri,
         mock.ANY,
         mock.ANY,
         job_labels,
     )
     self.assertPushed()
     version = self._model_push.get_string_custom_property('pushed_version')
     self.assertEqual(
         self._model_push.get_string_custom_property('pushed_destination'),
         'projects/project_id/models/model_name/versions/{}'.format(
             version))
Example #3
def _ReadFromBigQueryImpl(  # pylint: disable=invalid-name
        pipeline: beam.Pipeline,
        query: Text,
        use_bigquery_source: bool = False) -> beam.pvalue.PCollection:
    """Read from BigQuery.

  Args:
    pipeline: beam pipeline.
    query: a BigQuery sql string.
    use_bigquery_source: Whether to use BigQuerySource instead of experimental
      `ReadFromBigQuery` PTransform.

  Returns:
    PCollection of dict.
  """
    # TODO(b/155441037): Consolidate to ReadFromBigQuery once its performance
    # on dataflow runner is on par with BigQuerySource.
    if use_bigquery_source:
        return (
            pipeline
            | 'ReadFromBigQuerySource' >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True)))

    return (pipeline
            | 'ReadFromBigQuery' >> beam_bigquery.ReadFromBigQuery(
                query=query,
                use_standard_sql=True,
                bigquery_job_labels=telemetry_utils.get_labels_dict()))
Example #4
 def setUp(self):
     super(RunnerTest, self).setUp()
     self._output_data_dir = os.path.join(
         os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
         self._testMethodName)
     self._project_id = '12345'
     self._mock_api_client = mock.Mock()
     self._inputs = {}
     self._outputs = {}
     self._training_inputs = {
         'project': self._project_id,
     }
     self._job_id = 'my_jobid'
     # Dict format of exec_properties. custom_config needs to be serialized
     # before being passed into start_aip_training function.
     self._exec_properties = {
         'custom_config': {
             executor.TRAINING_ARGS_KEY: self._training_inputs,
         },
     }
     self._model_name = 'model_name'
     self._ai_platform_serving_args = {
         'model_name': self._model_name,
         'project_id': self._project_id,
     }
     self._executor_class_path = 'my.executor.Executor'
     with telemetry_utils.scoped_labels(
         {telemetry_utils.LABEL_TFX_EXECUTOR: self._executor_class_path}):
         self._job_labels = telemetry_utils.get_labels_dict()
Example #5
def _ReadFromBigQueryImpl(  # pylint: disable=invalid-name
        pipeline: beam.Pipeline,
        query: Text,
        use_bigquery_source: bool = False) -> beam.pvalue.PCollection:
    """Read from BigQuery.

  Args:
    pipeline: beam pipeline.
    query: a BigQuery sql string.
    use_bigquery_source: Whether to use BigQuerySource instead of experimental
      `ReadFromBigQuery` PTransform.

  Returns:
    PCollection of dict.
  """
    if use_bigquery_source:
        return (
            pipeline
            | 'ReadFromBigQuerySource' >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True)))

    return (pipeline
            | 'ReadFromBigQuery' >> ReadFromBigQuery(
                query=query,
                use_standard_sql=True,
                bigquery_job_labels=telemetry_utils.get_labels_dict()))
Example #6
    def testDoWithBlessedModel(self, mock_runner, mock_run_model_inference, _):
        input_dict = {
            'examples': [self._examples],
            'model': [self._model],
            'model_blessing': [self._model_blessing],
        }
        output_dict = {
            'inference_result': [self._inference_result],
        }
        ai_platform_serving_args = {
            'model_name': 'model_name',
            'project_id': 'project_id'
        }
        # Create exe properties.
        exec_properties = {
            'data_spec':
            proto_utils.proto_to_json(bulk_inferrer_pb2.DataSpec()),
            'custom_config':
            json_utils.dumps(
                {executor.SERVING_ARGS_KEY: ai_platform_serving_args}),
        }
        mock_runner.get_service_name_and_api_version.return_value = ('ml',
                                                                     'v1')
        mock_runner.create_model_for_aip_prediction_if_not_exist.return_value = True

        # Run executor.
        bulk_inferrer = executor.Executor(self._context)
        bulk_inferrer.Do(input_dict, output_dict, exec_properties)

        ai_platform_prediction_model_spec = (
            model_spec_pb2.AIPlatformPredictionModelSpec(
                project_id='project_id',
                model_name='model_name',
                version_name=self._model_version))
        ai_platform_prediction_model_spec.use_serialization_config = True
        inference_endpoint = model_spec_pb2.InferenceSpecType()
        inference_endpoint.ai_platform_prediction_model_spec.CopyFrom(
            ai_platform_prediction_model_spec)
        mock_run_model_inference.assert_called_once_with(
            mock.ANY, mock.ANY, mock.ANY, mock.ANY, mock.ANY,
            inference_endpoint)
        executor_class_path = '%s.%s' % (bulk_inferrer.__class__.__module__,
                                         bulk_inferrer.__class__.__name__)
        with telemetry_utils.scoped_labels(
            {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}):
            job_labels = telemetry_utils.get_labels_dict()
        mock_runner.deploy_model_for_aip_prediction.assert_called_once_with(
            mock.ANY,
            path_utils.serving_model_path(self._model.uri),
            mock.ANY,
            ai_platform_serving_args,
            job_labels,
            skip_model_creation=True,
            set_default_version=False,
        )
        mock_runner.delete_model_version_from_aip_if_exists.assert_called_once_with(
            mock.ANY, mock.ANY, ai_platform_serving_args)
        mock_runner.delete_model_from_aip_if_exists.assert_called_once_with(
            mock.ANY, ai_platform_serving_args)
Example #7
    def run(self,
            pipeline: tfx_pipeline.Pipeline,
            parameter_values: Optional[Dict[Text, Any]] = None,
            write_out: Optional[bool] = True) -> Dict[Text, Any]:
        """Compiles a pipeline DSL object into pipeline file.

    Args:
      pipeline: TFX pipeline object.
      parameter_values: mapping from runtime parameter names to their values.
      write_out: set to True to actually write out the file to the place
        designated by output_dir and output_filename. Otherwise return the
        JSON-serialized pipeline job spec.

    Returns:
      Returns the JSON pipeline job spec.

    Raises:
      RuntimeError: if trying to write out to a place occupied by an existing
      file.
    """
        # TODO(b/166343606): Support user-provided labels.
        # TODO(b/169095387): Deprecate .run() method in favor of the unified API
        # client.
        display_name = (self._config.display_name
                        or pipeline.pipeline_info.pipeline_name)
        pipeline_spec = pipeline_builder.PipelineBuilder(
            tfx_pipeline=pipeline,
            default_image=self._config.default_image,
            default_commands=self._config.default_commands).build()
        pipeline_spec.sdk_version = 'tfx-{}'.format(version.__version__)
        pipeline_spec.schema_version = _SCHEMA_VERSION
        runtime_config = pipeline_builder.RuntimeConfigBuilder(
            pipeline_info=pipeline.pipeline_info,
            parameter_values=parameter_values).build()
        with telemetry_utils.scoped_labels(
            {telemetry_utils.LABEL_TFX_RUNNER: 'kubeflow_v2'}):
            result = pipeline_spec_pb2.PipelineJob(
                display_name=display_name
                or pipeline.pipeline_info.pipeline_name,
                labels=telemetry_utils.get_labels_dict(),
                runtime_config=runtime_config)
        result.pipeline_spec.update(json_format.MessageToDict(pipeline_spec))
        pipeline_json_dict = json_format.MessageToDict(result)
        if write_out:
            if fileio.exists(
                    self._output_dir) and not fileio.isdir(self._output_dir):
                raise RuntimeError('Output path: %s is pointed to a file.' %
                                   self._output_dir)
            if not fileio.exists(self._output_dir):
                fileio.makedirs(self._output_dir)

            with fileio.open(
                    os.path.join(self._output_dir, self._output_filename),
                    'wb') as f:
                f.write(json.dumps(pipeline_json_dict, sort_keys=True))

        return pipeline_json_dict
Example #8
    def _assertDeployModelMockCalls(self,
                                    expected_models_create_body=None,
                                    expected_versions_create_body=None,
                                    expect_set_default=True):
        if not expected_models_create_body:
            expected_models_create_body = {
                'name': self._model_name,
                'regions': [],
            }

        if not expected_versions_create_body:
            with telemetry_utils.scoped_labels({
                    telemetry_utils.LABEL_TFX_EXECUTOR:
                    self._executor_class_path
            }):
                labels = telemetry_utils.get_labels_dict()

            expected_versions_create_body = {
                'name':
                self._model_version,
                'deployment_uri':
                self._serving_path,
                'runtime_version':
                runner._get_tf_runtime_version(tf.__version__),
                'python_version':
                runner._get_caip_python_version(
                    runner._get_tf_runtime_version(tf.__version__)),
                'labels':
                labels
            }

        self._mock_models_create.assert_called_with(
            body=mock.ANY,
            parent='projects/{}'.format(self._project_id),
        )
        (_, models_create_kwargs) = self._mock_models_create.call_args
        self.assertDictEqual(expected_models_create_body,
                             models_create_kwargs['body'])

        self._mock_versions_create.assert_called_with(
            body=mock.ANY,
            parent='projects/{}/models/{}'.format(self._project_id,
                                                  self._model_name))
        (_, versions_create_kwargs) = self._mock_versions_create.call_args

        self.assertDictEqual(expected_versions_create_body,
                             versions_create_kwargs['body'])

        if not expect_set_default:
            return

        self._mock_set_default.assert_called_with(
            name='projects/{}/models/{}/versions/{}'.format(
                self._project_id, self._model_name, self._model_version))
        self._mock_set_default_execute.assert_called_with()
Example #9
def ReadFromBigQuery(
    pipeline: beam.Pipeline, query: Text) -> beam.pvalue.PCollection:
  """Read data from BigQuery.

  Args:
    pipeline: Beam pipeline.
    query: A BigQuery sql string.

  Returns:
    PCollection of dict.
  """
  return (pipeline
          | 'ReadFromBigQuery' >> bigquery.ReadFromBigQuery(
              query=query,
              use_standard_sql=True,
              bigquery_job_labels=telemetry_utils.get_labels_dict()))
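
A usage sketch for the wrapper above (assumptions: Apache Beam is installed, GCP credentials and a BigQuery-enabled project are configured, and the placeholder query below is replaced with a real one):

import apache_beam as beam

# Placeholder standard-SQL query; substitute a real query against your project.
_QUERY = 'SELECT 1 AS x'

with beam.Pipeline() as pipeline:
    # ReadFromBigQuery here is the wrapper defined in the example above.
    rows = ReadFromBigQuery(pipeline, _QUERY)
    _ = rows | 'LogRows' >> beam.Map(print)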
Example #10
def _ReadFromBigQueryImpl(  # pylint: disable=invalid-name
        pipeline: beam.Pipeline, query: Text) -> beam.pvalue.PCollection:
    """Read from BigQuery.

  Args:
    pipeline: beam pipeline.
    query: a BigQuery sql string.

  Returns:
    PCollection of dict.
  """
    return (pipeline
            | 'ReadFromBigQuery' >> beam_bigquery.ReadFromBigQuery(
                query=query,
                use_standard_sql=True,
                bigquery_job_labels=telemetry_utils.get_labels_dict()))
Example #11
    def testDeployModelForAIPPredictionWithCustomRegion(self, mock_discovery):
        mock_discovery.build.return_value = self._mock_api_client
        self._setUpPredictionMocks()

        self._ai_platform_serving_args['regions'] = ['custom-region']
        runner.deploy_model_for_aip_prediction(self._serving_path,
                                               self._model_version,
                                               self._ai_platform_serving_args,
                                               self._executor_class_path)

        self._mock_models_create.assert_called_with(
            body=mock.ANY,
            parent='projects/{}'.format(self._project_id),
        )
        (_, models_create_kwargs) = self._mock_models_create.call_args
        models_create_body = models_create_kwargs['body']
        self.assertDictEqual(
            {
                'name': 'model_name',
                'regions': ['custom-region']
            }, models_create_body)

        self._mock_versions_create.assert_called_with(
            body=mock.ANY,
            parent='projects/{}/models/{}'.format(self._project_id,
                                                  'model_name'))
        (_, versions_create_kwargs) = self._mock_versions_create.call_args
        versions_create_body = versions_create_kwargs['body']
        with telemetry_utils.scoped_labels(
            {telemetry_utils.LABEL_TFX_EXECUTOR: self._executor_class_path}):
            labels = telemetry_utils.get_labels_dict()
        runtime_version = runner._get_tf_runtime_version(tf.__version__)
        self.assertDictEqual(
            {
                'name': self._model_version,
                'deployment_uri': self._serving_path,
                'runtime_version': runtime_version,
                'python_version':
                runner._get_caip_python_version(runtime_version),
                'labels': labels,
            }, versions_create_body)
        self._mock_get.assert_called_with(name='op_name')

        self._mock_set_default.assert_called_with(
            name='projects/{}/models/{}/versions/{}'.format(
                self._project_id, 'model_name', self._model_version))
        self._mock_set_default_execute.assert_called_with()
Example #12
    def testDeployModelForAIPPredictionWithCustomRuntime(self, mock_discovery):
        mock_discovery.build.return_value = self._mock_api_client
        self._setUpPredictionMocks()

        self._ai_platform_serving_args['runtime_version'] = '1.23.45'
        runner.deploy_model_for_aip_prediction(self._serving_path,
                                               self._model_version,
                                               self._ai_platform_serving_args,
                                               self._executor_class_path)

        with telemetry_utils.scoped_labels(
            {telemetry_utils.LABEL_TFX_EXECUTOR: self._executor_class_path}):
            labels = telemetry_utils.get_labels_dict()

        expected_versions_create_body = {
            'name': self._model_version,
            'deployment_uri': self._serving_path,
            'runtime_version': '1.23.45',
            'python_version': runner._get_caip_python_version('1.23.45'),
            'labels': labels,
        }
        self._assertDeployModelMockCalls(
            expected_versions_create_body=expected_versions_create_body)
Example #13
  def Do(self, input_dict: Dict[Text, List[types.Artifact]],
         output_dict: Dict[Text, List[types.Artifact]],
         exec_properties: Dict[Text, Any]) -> None:
    """Runs batch inference on a given model with given input examples.

    This function creates a new model (if necessary) and a new model version
    before inference, and cleans up resources after inference. It provides
    re-executability because it cleans up (only) the model resources created
    during the process, even if the inference job failed.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - examples: examples for inference.
        - model: exported model.
        - model_blessing: model blessing result
      output_dict: Output dict from output key to a list of Artifacts.
        - output: bulk inference results.
      exec_properties: A dict of execution properties.
        - data_spec: JSON string of bulk_inferrer_pb2.DataSpec instance.
        - custom_config: custom_config.ai_platform_serving_args need to contain
          the serving job parameters sent to Google Cloud AI Platform. For the
          full set of parameters, refer to
          https://cloud.google.com/ml-engine/reference/rest/v1/projects.models

    Returns:
      None
    """
    self._log_startup(input_dict, output_dict, exec_properties)
    if 'examples' not in input_dict:
      raise ValueError('\'examples\' is missing in input dict.')
    if 'inference_result' not in output_dict:
      raise ValueError('\'inference_result\' is missing in output dict.')
    output = artifact_utils.get_single_instance(output_dict['inference_result'])
    if 'model' not in input_dict:
      raise ValueError('Input models are not valid, model '
                       'need to be specified.')
    if 'model_blessing' in input_dict:
      model_blessing = artifact_utils.get_single_instance(
          input_dict['model_blessing'])
      if not model_utils.is_model_blessed(model_blessing):
        output.set_int_custom_property('inferred', 0)
        logging.info('Model on %s was not blessed', model_blessing.uri)
        return
    else:
      logging.info('Model blessing is not provided, exported model will be '
                   'used.')
    if _CUSTOM_CONFIG_KEY not in exec_properties:
      raise ValueError('Input exec properties are not valid, {} '
                       'need to be specified.'.format(_CUSTOM_CONFIG_KEY))

    custom_config = json_utils.loads(
        exec_properties.get(_CUSTOM_CONFIG_KEY, 'null'))
    if custom_config is not None and not isinstance(custom_config, Dict):
      raise ValueError('custom_config in execution properties needs to be a '
                       'dict.')
    ai_platform_serving_args = custom_config.get(SERVING_ARGS_KEY)
    if not ai_platform_serving_args:
      raise ValueError(
          '\'ai_platform_serving_args\' is missing in \'custom_config\'')
    service_name, api_version = runner.get_service_name_and_api_version(
        ai_platform_serving_args)
    executor_class_path = '%s.%s' % (self.__class__.__module__,
                                     self.__class__.__name__)
    with telemetry_utils.scoped_labels(
        {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}):
      job_labels = telemetry_utils.get_labels_dict()
    model = artifact_utils.get_single_instance(input_dict['model'])
    model_path = path_utils.serving_model_path(model.uri)
    logging.info('Use exported model from %s.', model_path)
    # Use model artifact uri to generate model version to guarantee the
    # 1:1 mapping from model version to model.
    model_version = 'version_' + hashlib.sha256(model.uri.encode()).hexdigest()
    inference_spec = self._get_inference_spec(model_path, model_version,
                                              ai_platform_serving_args)
    data_spec = bulk_inferrer_pb2.DataSpec()
    json_format.Parse(exec_properties['data_spec'], data_spec)
    api = discovery.build(service_name, api_version)
    new_model_created = False
    try:
      new_model_created = runner.create_model_for_aip_prediction_if_not_exist(
          api, job_labels, ai_platform_serving_args)
      runner.deploy_model_for_aip_prediction(
          api,
          model_path,
          model_version,
          ai_platform_serving_args,
          job_labels,
          skip_model_creation=True,
          set_default_version=False,
      )
      self._run_model_inference(data_spec, input_dict['examples'], output.uri,
                                inference_spec)
    except Exception as e:
      logging.error('Error in executing CloudAIBulkInferrerComponent: %s',
                    str(e))
      output.set_int_custom_property('inferred', 0)
      raise
    finally:
      # Guarantee newly created resources are cleaned up even if the inference
      # job failed.

      # Clean up the newly deployed model.
      runner.delete_model_version_from_aip_if_exists(api, model_version,
                                                     ai_platform_serving_args)
      if new_model_created:
        runner.delete_model_from_aip_if_exists(api, ai_platform_serving_args)
    # Mark the inference as successful after resources are cleaned up.
    output.set_int_custom_property('inferred', 1)
Example #14
  def Do(self, input_dict: Dict[Text, List[types.Artifact]],
         output_dict: Dict[Text, List[types.Artifact]],
         exec_properties: Dict[Text, Any]):
    """Overrides the tfx_pusher_executor.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - model_export: exported model from trainer.
        - model_blessing: model blessing path from model_validator.
      output_dict: Output dict from key to a list of artifacts, including:
        - model_push: A list of 'ModelPushPath' artifact of size one. It will
          include the model in this push execution if the model was pushed.
      exec_properties: Mostly a passthrough input dict for
        tfx.components.Pusher.executor.  custom_config.bigquery_serving_args is
        consumed by this class.  For the full set of parameters supported by
        Big Query ML, refer to https://cloud.google.com/bigquery-ml/

    Returns:
      None
    Raises:
      ValueError:
        If bigquery_serving_args is not in exec_properties.custom_config.
        If pipeline_root is not 'gs://...'
      RuntimeError: if the Big Query job failed.
    """
    self._log_startup(input_dict, output_dict, exec_properties)
    model_push = artifact_utils.get_single_instance(
        output_dict[tfx_pusher_executor.PUSHED_MODEL_KEY])
    if not self.CheckBlessing(input_dict):
      self._MarkNotPushed(model_push)
      return

    model_export = artifact_utils.get_single_instance(
        input_dict[tfx_pusher_executor.MODEL_KEY])
    model_export_uri = model_export.uri

    custom_config = json_utils.loads(
        exec_properties.get(_CUSTOM_CONFIG_KEY, 'null'))
    if custom_config is not None and not isinstance(custom_config, Dict):
      raise ValueError('custom_config in execution properties needs to be a '
                       'dict.')

    bigquery_serving_args = custom_config.get(SERVING_ARGS_KEY)
    # if configuration is missing error out
    if bigquery_serving_args is None:
      raise ValueError('Big Query ML configuration was not provided')

    bq_model_uri = '.'.join([
        bigquery_serving_args[_PROJECT_ID_KEY],
        bigquery_serving_args[_BQ_DATASET_ID_KEY],
        bigquery_serving_args[_MODEL_NAME_KEY],
    ])

    # Deploy the model.
    io_utils.copy_dir(
        src=path_utils.serving_model_path(model_export_uri), dst=model_push.uri)
    model_path = model_push.uri
    if not model_path.startswith(_GCS_PREFIX):
      raise ValueError('pipeline_root must be gs:// for BigQuery ML Pusher.')

    logging.info('Deploying the model to BigQuery ML for serving: %s from %s',
                 bigquery_serving_args, model_path)

    query = _BQML_CREATE_OR_REPLACE_MODEL_QUERY_TEMPLATE.format(
        model_uri=bq_model_uri, model_path=model_path)

    # TODO(zhitaoli): Refactor the executor_class_path creation into a common
    # utility function.
    executor_class_path = '%s.%s' % (self.__class__.__module__,
                                     self.__class__.__name__)
    with telemetry_utils.scoped_labels(
        {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}):
      default_query_job_config = bigquery.job.QueryJobConfig(
          labels=telemetry_utils.get_labels_dict())
    client = bigquery.Client(default_query_job_config=default_query_job_config)

    try:
      query_job = client.query(query)
      query_job.result()  # Waits for the query to finish
    except Exception as e:
      raise RuntimeError('BigQuery ML Push failed: {}'.format(e))

    logging.info('Successfully deployed model %s serving from %s', bq_model_uri,
                 model_path)

    # Setting the push_destination to bigquery uri
    self._MarkPushed(model_push, pushed_destination=bq_model_uri)
Example #15
File: runner.py Project: zvrr/tfx
def deploy_model_for_aip_prediction(
    serving_path: Text,
    model_version: Text,
    ai_platform_serving_args: Dict[Text, Any],
    executor_class_path: Text,
):
  """Deploys a model for serving with AI Platform.

  Args:
    serving_path: The path to the model. Must be a GCS URI.
    model_version: Version of the model being deployed. Must be different from
      what is currently being served.
    ai_platform_serving_args: Dictionary containing arguments for pushing to AI
      Platform. For the full set of parameters supported, refer to
      https://cloud.google.com/ml-engine/reference/rest/v1/projects.models.versions#Version
    executor_class_path: class path for TFX core default trainer.

  Raises:
    RuntimeError: if an error is encountered when trying to push.
  """
  logging.info(
      'Deploying to model with version %s to AI Platform for serving: %s',
      model_version, ai_platform_serving_args)

  model_name = ai_platform_serving_args['model_name']
  project_id = ai_platform_serving_args['project_id']
  regions = ai_platform_serving_args.get('regions', [])
  default_runtime_version = _get_tf_runtime_version(tf.__version__)
  runtime_version = ai_platform_serving_args.get('runtime_version',
                                                 default_runtime_version)
  python_version = _get_caip_python_version(runtime_version)

  api = discovery.build('ml', 'v1')
  body = {'name': model_name, 'regions': regions}
  parent = 'projects/{}'.format(project_id)
  try:
    api.projects().models().create(body=body, parent=parent).execute()
  except errors.HttpError as e:
    # If the error is to create an already existing model, it's ok to ignore.
    # TODO(b/135211463): Remove the disable once the pytype bug is fixed.
    if e.resp.status == 409:  # pytype: disable=attribute-error
      logging.warn('Model %s already exists', model_name)
    else:
      raise RuntimeError('AI Platform Push failed: {}'.format(e))
  with telemetry_utils.scoped_labels(
      {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}):
    job_labels = telemetry_utils.get_labels_dict()
  body = {
      'name': model_version,
      'deployment_uri': serving_path,
      'runtime_version': runtime_version,
      'python_version': python_version,
      'labels': job_labels,
  }

  # Push to AIP, and record the operation name so we can poll for its state.
  model_name = 'projects/{}/models/{}'.format(project_id, model_name)
  response = api.projects().models().versions().create(
      body=body, parent=model_name).execute()
  op_name = response['name']

  deploy_status_resc = api.projects().operations().get(name=op_name)
  while not deploy_status_resc.execute().get('done'):
    time.sleep(_POLLING_INTERVAL_IN_SECONDS)
    logging.info('Model still being deployed...')

  deploy_status = deploy_status_resc.execute()

  if deploy_status.get('error'):
    # The operation completed with an error.
    raise RuntimeError(
        'Failed to deploy model to AI Platform for serving: {}'.format(
            deploy_status['error']))

  # Set the new version as default.
  # By API specification, if Long-Running-Operation is done and there is
  # no error, 'response' is guaranteed to exist.
  api.projects().models().versions().setDefault(name='{}/versions/{}'.format(
      model_name, deploy_status['response']['name'])).execute()

  logging.info(
      'Successfully deployed model %s with version %s, serving from %s',
      model_name, model_version, serving_path)
Example #16
File: runner.py Project: zvrr/tfx
def start_aip_training(input_dict: Dict[Text, List[types.Artifact]],
                       output_dict: Dict[Text, List[types.Artifact]],
                       exec_properties: Dict[Text,
                                             Any], executor_class_path: Text,
                       training_inputs: Dict[Text,
                                             Any], job_id: Optional[Text]):
  """Start a trainer job on AI Platform (AIP).

  This is done by forwarding the inputs/outputs/exec_properties to the
  tfx.scripts.run_executor module on an AI Platform training job interpreter.

  Args:
    input_dict: Passthrough input dict for tfx.components.Trainer.executor.
    output_dict: Passthrough input dict for tfx.components.Trainer.executor.
    exec_properties: Passthrough input dict for tfx.components.Trainer.executor.
    executor_class_path: class path for TFX core default trainer.
    training_inputs: Training input argument for AI Platform training job.
      'pythonModule', 'pythonVersion' and 'runtimeVersion' will be inferred. For
      the full set of parameters, refer to
      https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#TrainingInput
    job_id: Job ID for AI Platform Training job. If not supplied,
      system-determined unique ID is given. Refer to
    https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#resource-job

  Returns:
    None
  Raises:
    RuntimeError: if the Google Cloud AI Platform training job failed/cancelled.
  """
  training_inputs = training_inputs.copy()

  json_inputs = artifact_utils.jsonify_artifact_dict(input_dict)
  logging.info('json_inputs=\'%s\'.', json_inputs)
  json_outputs = artifact_utils.jsonify_artifact_dict(output_dict)
  logging.info('json_outputs=\'%s\'.', json_outputs)
  json_exec_properties = json.dumps(exec_properties, sort_keys=True)
  logging.info('json_exec_properties=\'%s\'.', json_exec_properties)

  # Configure AI Platform training job
  api_client = discovery.build('ml', 'v1')

  # We use custom containers to launch training on AI Platform, which invokes
  # the specified image using the container's entrypoint. The default
  # entrypoint for TFX containers is to call scripts/run_executor.py. The
  # arguments below are passed to this run_executor entry to run the executor
  # specified in `executor_class_path`.
  job_args = [
      '--executor_class_path', executor_class_path, '--inputs', json_inputs,
      '--outputs', json_outputs, '--exec-properties', json_exec_properties
  ]

  if not training_inputs.get('masterConfig'):
    training_inputs['masterConfig'] = {
        'imageUri': _TFX_IMAGE,
    }

  training_inputs['args'] = job_args

  # Pop project_id so AIP doesn't complain about an unexpected parameter.
  # It's been a stowaway in aip_args and has finally reached its destination.
  project = training_inputs.pop('project')
  project_id = 'projects/{}'.format(project)
  with telemetry_utils.scoped_labels(
      {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}):
    job_labels = telemetry_utils.get_labels_dict()

  # 'tfx_YYYYmmddHHMMSS' is the default job ID if not explicitly specified.
  job_id = job_id or 'tfx_{}'.format(
      datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
  job_spec = {
      'jobId': job_id,
      'trainingInput': training_inputs,
      'labels': job_labels,
  }

  # Submit job to AIP Training
  logging.info('Submitting job=\'%s\', project=\'%s\' to AI Platform.', job_id,
               project)
  request = api_client.projects().jobs().create(
      body=job_spec, parent=project_id)
  request.execute()

  # Wait for AIP Training job to finish
  job_name = '{}/jobs/{}'.format(project_id, job_id)
  request = api_client.projects().jobs().get(name=job_name)
  response = request.execute()
  retry_count = 0

  # Monitors the long-running operation by polling the job state periodically,
  # and retries the polling when a transient connectivity issue is encountered.
  #
  # Long-running operation monitoring:
  #   The possible states of "get job" response can be found at
  #   https://cloud.google.com/ai-platform/training/docs/reference/rest/v1/projects.jobs#State
  #   where SUCCEEDED/FAILED/CANCELLED are considered to be final states.
  #   The following logic will keep polling the state of the job until the job
  #   enters a final state.
  #
  # During the polling, if a connection error was encountered, the GET request
  # will be retried by recreating the Python API client to refresh the lifecycle
  # of the connection being used. See
  # https://github.com/googleapis/google-api-python-client/issues/218
  # for a detailed description of the problem. If the error persists for
  # _CONNECTION_ERROR_RETRY_LIMIT consecutive attempts, the function will exit
  # with code 1.
  while response['state'] not in ('SUCCEEDED', 'FAILED', 'CANCELLED'):
    time.sleep(_POLLING_INTERVAL_IN_SECONDS)
    try:
      response = request.execute()
      retry_count = 0
    # Handle transient connection error.
    except ConnectionError as err:
      if retry_count < _CONNECTION_ERROR_RETRY_LIMIT:
        retry_count += 1
        logging.warning(
            'ConnectionError (%s) encountered when polling job: %s. Trying to '
            'recreate the API client.', err, job_id)
        # Recreate the Python API client.
        api_client = discovery.build('ml', 'v1')
        request = api_client.projects().jobs().get(name=job_name)
      else:
        # TODO(b/158433873): Consider raising the error instead of exit with
        # code 1 after CMLE supports configurable retry policy.
        # Currently CMLE will automatically retry the job unless return code
        # 1-128 is returned.
        logging.error('Request failed after %s retries.',
                      _CONNECTION_ERROR_RETRY_LIMIT)
        sys.exit(1)

  if response['state'] in ('FAILED', 'CANCELLED'):
    err_msg = 'Job \'{}\' did not succeed.  Detailed response {}.'.format(
        job_name, response)
    logging.error(err_msg)
    raise RuntimeError(err_msg)

  # AIP training complete
  logging.info('Job \'%s\' successful.', job_name)
Example #17
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]):
        """Overrides the tfx_pusher_executor.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - model_export: exported model from trainer.
        - model_blessing: model blessing path from evaluator.
      output_dict: Output dict from key to a list of artifacts, including:
        - model_push: A list of 'ModelPushPath' artifact of size one. It will
          include the model in this push execution if the model was pushed.
      exec_properties: Mostly a passthrough input dict for
        tfx.components.Pusher.executor.  custom_config.ai_platform_serving_args
        is consumed by this class.  For the full set of parameters supported by
        Google Cloud AI Platform, refer to
        https://cloud.google.com/ml-engine/docs/tensorflow/deploying-models#creating_a_model_version.

    Raises:
      ValueError:
        If ai_platform_serving_args is not in exec_properties.custom_config.
        If Serving model path does not start with gs://.
      RuntimeError: if the Google Cloud AI Platform training job failed.
    """
        self._log_startup(input_dict, output_dict, exec_properties)
        model_push = artifact_utils.get_single_instance(
            output_dict[tfx_pusher_executor.PUSHED_MODEL_KEY])
        if not self.CheckBlessing(input_dict):
            self._MarkNotPushed(model_push)
            return

        model_export = artifact_utils.get_single_instance(
            input_dict[tfx_pusher_executor.MODEL_KEY])

        custom_config = json_utils.loads(
            exec_properties.get(_CUSTOM_CONFIG_KEY, 'null'))
        if custom_config is not None and not isinstance(custom_config, Dict):
            raise ValueError(
                'custom_config in execution properties needs to be a '
                'dict.')

        ai_platform_serving_args = custom_config.get(SERVING_ARGS_KEY)
        if not ai_platform_serving_args:
            raise ValueError(
                '\'ai_platform_serving_args\' is missing in \'custom_config\'')
        service_name, api_version = runner.get_service_name_and_api_version(
            ai_platform_serving_args)
        # Deploy the model.
        io_utils.copy_dir(src=path_utils.serving_model_path(model_export.uri),
                          dst=model_push.uri)
        model_path = model_push.uri
        # TODO(jjong): Introduce Versioning.
        # Note that we're adding a "v" prefix because Cloud AI Prediction only
        # allows version names that start with a letter and contain only
        # letters, digits, and underscores.
        model_version = 'v{}'.format(int(time.time()))
        executor_class_path = '%s.%s' % (self.__class__.__module__,
                                         self.__class__.__name__)
        with telemetry_utils.scoped_labels(
            {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}):
            job_labels = telemetry_utils.get_labels_dict()
        runner.deploy_model_for_aip_prediction(
            discovery.build(service_name, api_version),
            model_path,
            model_version,
            ai_platform_serving_args,
            job_labels,
        )

        self._MarkPushed(
            model_push,
            pushed_destination=_CAIP_MODEL_VERSION_PATH_FORMAT.format(
                project_id=ai_platform_serving_args['project_id'],
                model=ai_platform_serving_args['model_name'],
                version=model_version),
            pushed_version=model_version)
Example #18
def start_aip_training(input_dict: Dict[Text, List[types.Artifact]],
                       output_dict: Dict[Text, List[types.Artifact]],
                       exec_properties: Dict[Text,
                                             Any], executor_class_path: Text,
                       training_inputs: Dict[Text,
                                             Any], job_id: Optional[Text]):
    """Start a trainer job on AI Platform (AIP).

  This is done by forwarding the inputs/outputs/exec_properties to the
  tfx.scripts.run_executor module on an AI Platform training job interpreter.

  Args:
    input_dict: Passthrough input dict for tfx.components.Trainer.executor.
    output_dict: Passthrough input dict for tfx.components.Trainer.executor.
    exec_properties: Passthrough input dict for tfx.components.Trainer.executor.
    executor_class_path: class path for TFX core default trainer.
    training_inputs: Training input argument for AI Platform training job.
      'pythonModule', 'pythonVersion' and 'runtimeVersion' will be inferred. For
      the full set of parameters, refer to
      https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#TrainingInput
    job_id: Job ID for AI Platform Training job. If not supplied,
      system-determined unique ID is given. Refer to
    https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#resource-job

  Returns:
    None
  Raises:
    RuntimeError: if the Google Cloud AI Platform training job failed.
  """
    training_inputs = training_inputs.copy()

    json_inputs = artifact_utils.jsonify_artifact_dict(input_dict)
    absl.logging.info('json_inputs=\'%s\'.', json_inputs)
    json_outputs = artifact_utils.jsonify_artifact_dict(output_dict)
    absl.logging.info('json_outputs=\'%s\'.', json_outputs)
    json_exec_properties = json.dumps(exec_properties, sort_keys=True)
    absl.logging.info('json_exec_properties=\'%s\'.', json_exec_properties)

    # Configure AI Platform training job
    api_client = discovery.build('ml', 'v1')

    # We use custom containers to launch training on AI Platform, which invokes
    # the specified image using the container's entrypoint. The default
    # entrypoint for TFX containers is to call scripts/run_executor.py. The
    # arguments below are passed to this run_executor entry to run the executor
    # specified in `executor_class_path`.
    job_args = [
        '--executor_class_path', executor_class_path, '--inputs', json_inputs,
        '--outputs', json_outputs, '--exec-properties', json_exec_properties
    ]

    if not training_inputs.get('masterConfig'):
        training_inputs['masterConfig'] = {
            'imageUri': _TFX_IMAGE,
        }

    training_inputs['args'] = job_args

    # Pop project_id so AIP doesn't complain about an unexpected parameter.
    # It's been a stowaway in aip_args and has finally reached its destination.
    project = training_inputs.pop('project')
    project_id = 'projects/{}'.format(project)
    job_labels = telemetry_utils.get_labels_dict(
        tfx_executor=executor_class_path)

    # 'tfx_YYYYmmddHHMMSS' is the default job ID if not explicitly specified.
    job_id = job_id or 'tfx_%s' % datetime.datetime.now().strftime(
        '%Y%m%d%H%M%S')
    job_spec = {
        'jobId': job_id,
        'trainingInput': training_inputs,
        'labels': job_labels,
    }

    # Submit job to AIP Training
    absl.logging.info(
        'Submitting job=\'{}\', project=\'{}\' to AI Platform.'.format(
            job_id, project))
    request = api_client.projects().jobs().create(body=job_spec,
                                                  parent=project_id)
    request.execute()

    # Wait for AIP Training job to finish
    job_name = '{}/jobs/{}'.format(project_id, job_id)
    request = api_client.projects().jobs().get(name=job_name)
    response = request.execute()
    while response['state'] not in ('SUCCEEDED', 'FAILED'):
        time.sleep(_POLLING_INTERVAL_IN_SECONDS)
        response = request.execute()

    if response['state'] == 'FAILED':
        err_msg = 'Job \'{}\' did not succeed.  Detailed response {}.'.format(
            job_name, response)
        absl.logging.error(err_msg)
        raise RuntimeError(err_msg)

    # AIP training complete
    absl.logging.info('Job \'{}\' successful.'.format(job_name))
Example #19
  def Do(self, input_dict: Dict[Text, List[types.Artifact]],
         output_dict: Dict[Text, List[types.Artifact]],
         exec_properties: Dict[Text, Any]):
    """Overrides the tfx_pusher_executor.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - model_export: exported model from trainer.
        - model_blessing: model blessing path from model_validator.
      output_dict: Output dict from key to a list of artifacts, including:
        - model_push: A list of 'ModelPushPath' artifact of size one. It will
          include the model in this push execution if the model was pushed.
      exec_properties: Mostly a passthrough input dict for
        tfx.components.Pusher.executor.  custom_config.bigquery_serving_args is
        consumed by this class.  For the full set of parameters supported by
        Big Query ML, refer to https://cloud.google.com/bigquery-ml/

    Returns:
      None
    Raises:
      ValueError:
        If bigquery_serving_args is not in exec_properties.custom_config.
        If pipeline_root is not 'gs://...'
      RuntimeError: if the Big Query job failed.
    """
    self._log_startup(input_dict, output_dict, exec_properties)
    model_push = artifact_utils.get_single_instance(output_dict['model_push'])
    if not self.CheckBlessing(input_dict):
      model_push.set_int_custom_property('pushed', 0)
      return

    model_export = artifact_utils.get_single_instance(
        input_dict['model_export'])
    model_export_uri = model_export.uri

    custom_config = exec_properties.get('custom_config', {})
    bigquery_serving_args = custom_config.get('bigquery_serving_args', None)
    # if configuration is missing error out
    if bigquery_serving_args is None:
      raise ValueError('Big Query ML configuration was not provided')

    bq_model_uri = '`{}`.`{}`.`{}`'.format(
        bigquery_serving_args['project_id'],
        bigquery_serving_args['bq_dataset_id'],
        bigquery_serving_args['model_name'])

    # Deploy the model.
    model_path = path_utils.serving_model_path(model_export_uri)

    if not model_path.startswith('gs://'):
      raise ValueError(
          'pipeline_root must be gs:// for BigQuery ML Pusher.')

    absl.logging.info(
        'Deploying the model to BigQuery ML for serving: {} from {}'.format(
            bigquery_serving_args, model_path))

    query = ("""
      CREATE OR REPLACE MODEL {}
      OPTIONS (model_type='tensorflow',
               model_path='{}')""".format(bq_model_uri,
                                          os.path.join(model_path, '*')))

    # TODO(zhitaoli): Refactor the executor_class_path creation into a common
    # utility function.
    executor_class_path = '%s.%s' % (self.__class__.__module__,
                                     self.__class__.__name__)
    default_query_job_config = bigquery.job.QueryJobConfig(
        labels=telemetry_utils.get_labels_dict(
            tfx_executor=executor_class_path))
    client = bigquery.Client(default_query_job_config=default_query_job_config)

    try:
      query_job = client.query(query)
      query_job.result()  # Waits for the query to finish
    except Exception as e:
      raise RuntimeError('BigQuery ML Push failed: {}'.format(e))

    absl.logging.info('Successfully deployed model {} serving from {}'.format(
        bq_model_uri, model_path))

    # Setting the push_destination to bigquery uri
    model_push.set_int_custom_property('pushed', 1)
    model_push.set_string_custom_property('pushed_model', bq_model_uri)
Example #20
  def Do(self, input_dict: Dict[Text, List[types.Artifact]],
         output_dict: Dict[Text, List[types.Artifact]],
         exec_properties: Dict[Text, Any]):
    """Overrides the tfx_pusher_executor.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - model_export: exported model from trainer.
        - model_blessing: model blessing path from evaluator.
      output_dict: Output dict from key to a list of artifacts, including:
        - model_push: A list of 'ModelPushPath' artifact of size one. It will
          include the model in this push execution if the model was pushed.
      exec_properties: Mostly a passthrough input dict for
        tfx.components.Pusher.executor.  custom_config.bigquery_serving_args is
        consumed by this class, including:
        - bq_dataset_id: ID of the dataset you're creating or replacing
        - model_name: name of the model you're creating or replacing
        - project_id: GCP project where the model will be stored. It is also
          the project where the query is executed unless a compute_project_id
          is provided.
        - compute_project_id: GCP project where the query is executed. If not
          provided, the query is executed in project_id.
        For the full set of parameters supported by
        Big Query ML, refer to https://cloud.google.com/bigquery-ml/

    Returns:
      None
    Raises:
      ValueError:
        If bigquery_serving_args is not in exec_properties.custom_config.
        If pipeline_root is not 'gs://...'
      RuntimeError: if the Big Query job failed.

    Example usage:
      from tfx.extensions.google_cloud_big_query.pusher import executor

      pusher = Pusher(
        model=trainer.outputs['model'],
        model_blessing=evaluator.outputs['blessing'],
        custom_executor_spec=executor_spec.ExecutorClassSpec(executor.Executor),
        custom_config={
          'bigquery_serving_args': {
            'model_name': 'your_model_name',
            'project_id': 'your_gcp_storage_project',
            'bq_dataset_id': 'your_dataset_id',
            'compute_project_id': 'your_gcp_compute_project',
          },
        },
      )
    """
    self._log_startup(input_dict, output_dict, exec_properties)
    model_push = artifact_utils.get_single_instance(
        output_dict[standard_component_specs.PUSHED_MODEL_KEY])
    if not self.CheckBlessing(input_dict):
      self._MarkNotPushed(model_push)
      return

    custom_config = json_utils.loads(
        exec_properties.get(_CUSTOM_CONFIG_KEY, 'null'))
    if custom_config is not None and not isinstance(custom_config, Dict):
      raise ValueError('custom_config in execution properties needs to be a '
                       'dict.')

    bigquery_serving_args = custom_config.get(SERVING_ARGS_KEY)
    # if configuration is missing error out
    if bigquery_serving_args is None:
      raise ValueError('Big Query ML configuration was not provided')

    bq_model_uri = '.'.join([
        bigquery_serving_args[_PROJECT_ID_KEY],
        bigquery_serving_args[_BQ_DATASET_ID_KEY],
        bigquery_serving_args[_MODEL_NAME_KEY],
    ])

    # Deploy the model.
    io_utils.copy_dir(src=self.GetModelPath(input_dict), dst=model_push.uri)
    model_path = model_push.uri
    if not model_path.startswith(_GCS_PREFIX):
      raise ValueError('pipeline_root must be gs:// for BigQuery ML Pusher.')

    logging.info('Deploying the model to BigQuery ML for serving: %s from %s',
                 bigquery_serving_args, model_path)

    query = _BQML_CREATE_OR_REPLACE_MODEL_QUERY_TEMPLATE.format(
        model_uri=bq_model_uri, model_path=model_path)

    # TODO(zhitaoli): Refactor the executor_class_path creation into a common
    # utility function.
    executor_class_path = '%s.%s' % (self.__class__.__module__,
                                     self.__class__.__name__)
    with telemetry_utils.scoped_labels(
        {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}):
      default_query_job_config = bigquery.job.QueryJobConfig(
          labels=telemetry_utils.get_labels_dict())
    # TODO(b/181368842) Add integration test for BQML Pusher + Managed Pipeline
    project_id = (
        bigquery_serving_args.get(_COMPUTE_PROJECT_ID_KEY) or
        bigquery_serving_args[_PROJECT_ID_KEY])
    client = bigquery.Client(
        default_query_job_config=default_query_job_config, project=project_id)

    try:
      query_job = client.query(query)
      query_job.result()  # Waits for the query to finish
    except Exception as e:
      raise RuntimeError('BigQuery ML Push failed: {}'.format(e)) from e

    logging.info('Successfully deployed model %s serving from %s', bq_model_uri,
                 model_path)

    # Setting the push_destination to bigquery uri
    self._MarkPushed(model_push, pushed_destination=bq_model_uri)
Example #21
    def create_training_args(self, input_dict: Dict[Text,
                                                    List[types.Artifact]],
                             output_dict: Dict[Text, List[types.Artifact]],
                             exec_properties: Dict[Text, Any],
                             executor_class_path: Text,
                             training_inputs: Dict[Text, Any],
                             job_id: Optional[Text]) -> Dict[Text, Any]:
        """Get training args for runner._launch_aip_training.

    The training args contain the inputs/outputs/exec_properties to the
    tfx.scripts.run_executor module.

    Args:
      input_dict: Passthrough input dict for tfx.components.Trainer.executor.
      output_dict: Passthrough input dict for tfx.components.Trainer.executor.
      exec_properties: Passthrough input dict for
        tfx.components.Trainer.executor.
      executor_class_path: class path for TFX core default trainer.
      training_inputs: Training input argument for AI Platform training job.
        'pythonModule', 'pythonVersion' and 'runtimeVersion' will be inferred.
        For the full set of parameters, refer to
        https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#TrainingInput
      job_id: Job ID for AI Platform Training job. If not supplied,
        system-determined unique ID is given. Refer to
      https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#resource-job

    Returns:
      A dict containing the training arguments
    """
        training_inputs = training_inputs.copy()

        json_inputs = artifact_utils.jsonify_artifact_dict(input_dict)
        logging.info('json_inputs=\'%s\'.', json_inputs)
        json_outputs = artifact_utils.jsonify_artifact_dict(output_dict)
        logging.info('json_outputs=\'%s\'.', json_outputs)
        json_exec_properties = json.dumps(exec_properties, sort_keys=True)
        logging.info('json_exec_properties=\'%s\'.', json_exec_properties)

        # We use custom containers to launch training on AI Platform, which invokes
        # the specified image using the container's entrypoint. The default
        # entrypoint for TFX containers is to call scripts/run_executor.py. The
        # arguments below are passed to this run_executor entry to run the executor
        # specified in `executor_class_path`.
        container_command = _CONTAINER_COMMAND + [
            '--executor_class_path',
            executor_class_path,
            '--inputs',
            json_inputs,
            '--outputs',
            json_outputs,
            '--exec-properties',
            json_exec_properties,
        ]

        if not training_inputs.get('masterConfig'):
            training_inputs['masterConfig'] = {
                'imageUri': _TFX_IMAGE,
            }

        # Always use our own entrypoint instead of relying on container default.
        if 'containerCommand' in training_inputs['masterConfig']:
            logging.warn('Overriding custom value of containerCommand')
        training_inputs['masterConfig']['containerCommand'] = container_command

        # Pop project_id so AIP doesn't complain about an unexpected parameter.
        # It's been a stowaway in aip_args and has finally reached its destination.
        project = training_inputs.pop('project')
        with telemetry_utils.scoped_labels(
            {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}):
            job_labels = telemetry_utils.get_labels_dict()

        # 'tfx_YYYYmmddHHMMSS' is the default job ID if not explicitly specified.
        job_id = job_id or 'tfx_{}'.format(
            datetime.datetime.now().strftime('%Y%m%d%H%M%S'))

        training_args = {
            'job_id': job_id,
            'project': project,
            'training_input': training_inputs,
            'job_labels': job_labels
        }

        return training_args
Example #22
    def create_training_args(self, input_dict: Dict[Text,
                                                    List[types.Artifact]],
                             output_dict: Dict[Text, List[types.Artifact]],
                             exec_properties: Dict[Text, Any],
                             executor_class_path: Text,
                             training_inputs: Dict[Text, Any],
                             job_id: Optional[Text]) -> Dict[Text, Any]:
        """Get training args for runner._launch_aip_training.

    The training args contain the inputs/outputs/exec_properties to the
    tfx.scripts.run_executor module.

    Args:
      input_dict: Passthrough input dict for tfx.components.Trainer.executor.
      output_dict: Passthrough input dict for tfx.components.Trainer.executor.
      exec_properties: Passthrough input dict for
        tfx.components.Trainer.executor.
      executor_class_path: class path for TFX core default trainer.
      training_inputs: Spec for CustomJob for AI Platform (Unified) custom
        training job. See
        https://cloud.google.com/ai-platform-unified/docs/reference/rest/v1/CustomJobSpec
          for the detailed schema.
      job_id: Display name for AI Platform (Unified) custom training job. If
        not supplied, a system-determined unique ID is given. Refer to
        https://cloud.google.com/ai-platform-unified/docs/reference/rest/v1/projects.locations.customJobs

    Returns:
      A dict containing the training arguments
    """
        training_inputs = training_inputs.copy()

        json_inputs = artifact_utils.jsonify_artifact_dict(input_dict)
        logging.info('json_inputs=\'%s\'.', json_inputs)
        json_outputs = artifact_utils.jsonify_artifact_dict(output_dict)
        logging.info('json_outputs=\'%s\'.', json_outputs)
        json_exec_properties = json.dumps(exec_properties, sort_keys=True)
        logging.info('json_exec_properties=\'%s\'.', json_exec_properties)

        # We use custom containers to launch training on AI Platform (unified),
        # which invokes the specified image using the container's entrypoint. The
        # default entrypoint for TFX containers is to call scripts/run_executor.py.
        # The arguments below are passed to this run_executor entry to run the
        # executor specified in `executor_class_path`.
        container_command = _CONTAINER_COMMAND + [
            '--executor_class_path',
            executor_class_path,
            '--inputs',
            json_inputs,
            '--outputs',
            json_outputs,
            '--exec-properties',
            json_exec_properties,
        ]

        if not training_inputs.get('worker_pool_specs'):
            training_inputs['worker_pool_specs'] = [{}]

        for worker_pool_spec in training_inputs['worker_pool_specs']:
            if not worker_pool_spec.get('container_spec'):
                worker_pool_spec['container_spec'] = {
                    'image_uri': _TFX_IMAGE,
                }

            # Always use our own entrypoint instead of relying on container default.
            if 'command' in worker_pool_spec['container_spec']:
                logging.warning(
                    'Overriding custom value of container_spec.command')
            worker_pool_spec['container_spec']['command'] = container_command

        # Pop project_id so AIP doesn't complain about an unexpected parameter.
        # It's been a stowaway in aip_args and has finally reached its destination.
        project = training_inputs.pop('project')
        with telemetry_utils.scoped_labels(
            {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}):
            job_labels = telemetry_utils.get_labels_dict()

        # 'tfx_YYYYmmddHHMMSS' is the default job display name if not explicitly
        # specified.
        job_id = job_id or 'tfx_{}'.format(
            datetime.datetime.now().strftime('%Y%m%d%H%M%S'))

        training_args = {
            'job_id': job_id,
            'project': project,
            'training_input': training_inputs,
            'job_labels': job_labels
        }

        return training_args
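
# --- Illustrative sketch (not part of the example above) ---
# The same idea for the AI Platform (Unified) variant: worker_pool_specs take
# the place of masterConfig, and each pool gets a container_spec with the TFX
# image and the run_executor command injected. Keys mirror the CustomJobSpec
# schema linked in the docstring; values are assumptions for illustration.
training_inputs = {
    'project': 'my-gcp-project',
    'worker_pool_specs': [{
        'machine_spec': {'machine_type': 'n1-standard-4'},
        'replica_count': 1,
        # No 'container_spec' given, so the default TFX image is injected
        # and 'command' is set to the run_executor entrypoint built above.
    }],
}
# After create_training_args, the first worker pool spec looks roughly like:
expected_worker_pool_spec = {
    'machine_spec': {'machine_type': 'n1-standard-4'},
    'replica_count': 1,
    'container_spec': {
        'image_uri': '<default TFX image>',                 # _TFX_IMAGE
        'command': ['<run_executor entrypoint>', '--executor_class_path',
                    '<executor_class_path>', '...'],
    },
}
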
Example #23
0
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]):
        """Overrides the tfx_pusher_executor.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - model_export: exported model from trainer.
        - model_blessing: model blessing path from evaluator.
      output_dict: Output dict from key to a list of artifacts, including:
        - model_push: A list of 'ModelPushPath' artifacts of size one. It will
          contain the model from this push execution if the model was pushed.
      exec_properties: Mostly a passthrough input dict for
        tfx.components.Pusher.executor.  The following keys in `custom_config`
        are consumed by this class:
        - ai_platform_serving_args: For the full set of parameters supported
          by Google Cloud AI Platform, refer to
          https://cloud.google.com/ml-engine/reference/rest/v1/projects.models.versions#Version.
        - endpoint: Optional endpoint override. Should be in the format
          `https://[region]-ml.googleapis.com`. Defaults to the global endpoint
          if not set. Cloud AI Platform recommends using a regional endpoint.
          When set, the 'regions' key in ai_platform_serving_args cannot be set.
          For more details, please see
          https://cloud.google.com/ai-platform/prediction/docs/regional-endpoints#using_regional_endpoints

    Raises:
      ValueError:
        If ai_platform_serving_args is not in exec_properties.custom_config.
        If Serving model path does not start with gs://.
        If 'endpoint' and 'regions' are set simultaneously.
      RuntimeError: if the Google Cloud AI Platform deployment failed.
    """
        self._log_startup(input_dict, output_dict, exec_properties)

        custom_config = json_utils.loads(
            exec_properties.get(_CUSTOM_CONFIG_KEY, 'null'))
        if custom_config is not None and not isinstance(custom_config, Dict):
            raise ValueError(
                'custom_config in execution properties needs to be a '
                'dict.')
        # Guard against a missing custom_config so the ValueError below is
        # raised instead of an AttributeError on None.
        ai_platform_serving_args = (custom_config or {}).get(SERVING_ARGS_KEY)
        if not ai_platform_serving_args:
            raise ValueError(
                '\'ai_platform_serving_args\' is missing in \'custom_config\'')
        endpoint = custom_config.get(ENDPOINT_ARGS_KEY)
        if endpoint and 'regions' in ai_platform_serving_args:
            raise ValueError(
                '\'endpoint\' and \'ai_platform_serving_args.regions\' cannot be set simultaneously'
            )

        model_push = artifact_utils.get_single_instance(
            output_dict[standard_component_specs.PUSHED_MODEL_KEY])
        if not self.CheckBlessing(input_dict):
            self._MarkNotPushed(model_push)
            return

        service_name, api_version = runner.get_service_name_and_api_version(
            ai_platform_serving_args)
        # Deploy the model.
        io_utils.copy_dir(src=self.GetModelPath(input_dict),
                          dst=model_push.uri)
        model_path = model_push.uri
        # TODO(jjong): Introduce Versioning.
        # Note that we're adding a "v" prefix because Cloud AI Prediction only
        # allows version names that start with a letter and contain only
        # letters, digits, and underscores.
        model_version = 'v{}'.format(int(time.time()))
        executor_class_path = '%s.%s' % (self.__class__.__module__,
                                         self.__class__.__name__)
        with telemetry_utils.scoped_labels(
            {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}):
            job_labels = telemetry_utils.get_labels_dict()
        endpoint = endpoint or runner.DEFAULT_ENDPOINT
        api = discovery.build(
            service_name,
            api_version,
            client_options=client_options.ClientOptions(api_endpoint=endpoint),
        )
        runner.deploy_model_for_aip_prediction(
            api,
            model_path,
            model_version,
            ai_platform_serving_args,
            job_labels,
        )

        self._MarkPushed(
            model_push,
            pushed_destination=_CAIP_MODEL_VERSION_PATH_FORMAT.format(
                project_id=ai_platform_serving_args['project_id'],
                model=ai_platform_serving_args['model_name'],
                version=model_version),
            pushed_version=model_version)
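
# --- Illustrative sketch (not part of the example above) ---
# A hedged example of the exec_properties this Pusher executor expects. The
# 'custom_config' key and the serving-args layout follow the docstring above;
# the project, model and endpoint values are invented, and json_utils.dumps is
# assumed to be the counterpart of the json_utils.loads call used in Do().
from tfx.utils import json_utils

custom_config = {
    'ai_platform_serving_args': {
        'project_id': 'my-gcp-project',
        'model_name': 'my_model',
        # Do not set 'regions' here when 'endpoint' below is set; Do() raises
        # ValueError for that combination.
    },
    'endpoint': 'https://us-central1-ml.googleapis.com',
}
exec_properties = {'custom_config': json_utils.dumps(custom_config)}
# Pushed version names are generated as 'v<unix-timestamp>', e.g. 'v1700000000',
# because Cloud AI Prediction version names must start with a letter.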