Example No. 1
def load_source_path_class(source: Text) -> Type:
    """
    Loads a Python class from the source.

    Args:
        source (str): class_source e.g. this.module.Class[@sha]
    """
    pin = source.split('@')[-1]
    is_standard = is_standard_pin(pin)
    is_pinned = '@' in source
    source = source.split('@')[0]

    if is_pinned and not is_standard:
        logger.debug('Pinned step found with git sha. '
                     'Loading class from git history.')
        wrapper: GitWrapper = Repository.get_instance().get_git_wrapper()

        module_source = get_module_source_from_source(source)
        relative_module_path = get_relative_path_from_module_source(
            module_source)

        logger.warning('Found source with a pinned sha. Will now checkout '
                       f'module: {module_source}')

        # critical step
        if not wrapper.check_module_clean(source):
            raise Exception(f'One of the files at {relative_module_path} '
                            f'is not committed and we '
                            f'are trying to load that directory from git '
                            f'history due to a pinned step in the pipeline. '
                            f'Please commit the file and then run the '
                            f'pipeline.')

        # Check out the directory at that sha
        wrapper.checkout(sha_or_branch=pin, directory=relative_module_path)

        # After this point, all exceptions will first undo the above
        try:
            class_ = import_class_by_path(source)
            wrapper.reset(relative_module_path)
            wrapper.checkout(directory=relative_module_path)
        except Exception:
            wrapper.reset(relative_module_path)
            wrapper.checkout(directory=relative_module_path)
            raise
    elif is_pinned and is_standard:
        logger.debug(f'Default {APP_NAME} class used. Loading directly.')
        # TODO: [LOW] Check if ZenML version is installed before loading.
        class_ = import_class_by_path(source)
    else:
        logger.debug('Unpinned step found with no git sha. Attempting to '
                     'load class from current repository state.')
        class_ = import_class_by_path(source)

    return class_
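
Every snippet on this page ultimately resolves a dotted class path through import_utils.import_class_by_path (or the ZenML equivalent used in Example No. 1). The helper itself is not shown on this page; the following is only a minimal sketch of what such a loader typically does, assuming a 'pkg.module.ClassName' string:

```
import importlib
from typing import Type


def import_class_by_path(class_path: str) -> Type:
    """Resolves a dotted 'pkg.module.ClassName' string to the class object."""
    module_name, class_name = class_path.rsplit('.', 1)
    module = importlib.import_module(module_name)
    return getattr(module, class_name)
```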
Example No. 2
    def __init__(
        self,
        executor_class_path,
        name,
        input_dict,
        outputs,
        exec_properties,
    ):
        raw_args = exec_properties.get('beam_pipeline_args', [])

        # Beam expects str types for its pipeline args. Ensure unicode type is
        # converted to str if required.
        beam_pipeline_args = []
        for arg in raw_args:
            if isinstance(arg, unicode):
                arg = arg.encode('ascii', 'ignore')
            beam_pipeline_args.append(arg)

        # TODO(zhitaoli): Revisit usage of setup_file here.
        module_dir = os.path.dirname(os.path.dirname(tfx.__file__))
        setup_file = os.path.join(module_dir, 'setup.py')
        beam_pipeline_args.append('--setup_file={}'.format(setup_file))

        executor_cls = import_utils.import_class_by_path(executor_class_path)
        self._executor = executor_cls(beam_pipeline_args=beam_pipeline_args)

        self._input_dict = input_dict
        self._output_dict = types.parse_tfx_type_dict(outputs)
        self._exec_properties = exec_properties
        self._component_name = to_snake_case(name)
Example No. 3
def _run_executor(args, pipeline_args):
    """Select a particular executor and run it based on name."""
    tf.logging.set_verbosity(tf.logging.INFO)

    (inputs_str, outputs_str,
     exec_properties_str) = (args.inputs
                             or base64.b64decode(args.inputs_base64),
                             args.outputs
                             or base64.b64decode(args.outputs_base64),
                             args.exec_properties
                             or base64.b64decode(args.exec_properties_base64))

    inputs = types.parse_tfx_type_dict(inputs_str)
    outputs = types.parse_tfx_type_dict(outputs_str)
    exec_properties = json.loads(exec_properties_str)
    tf.logging.info(
        'Executor {} do: inputs: {}, outputs: {}, exec_properties: {}'.format(
            args.executor, inputs, outputs, exec_properties))

    executor_cls = import_utils.import_class_by_path(args.executor_class_path)
    executor = executor_cls(beam_pipeline_args=pipeline_args)
    tf.logging.info('Starting executor')
    executor.Do(inputs, outputs, exec_properties)

    # The last line of stdout will be pushed to xcom by Airflow.
    if args.write_outputs_stdout:
        print(types.jsonify_tfx_type_dict(outputs))
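
Examples No. 3, 14 and 15 accept inputs, outputs and exec_properties either directly as serialized strings or base64-encoded via the corresponding *_base64 flags. A small sketch of how a caller could produce the encoded form, assuming the plain value is a JSON string (the key name below is purely illustrative), follows:

```
import base64
import json

# Hypothetical plain-JSON value that would be passed as --exec_properties ...
exec_properties_json = json.dumps({'output_config': '{}'})
# ... and its --exec_properties_base64 equivalent.
exec_properties_b64 = base64.b64encode(
    exec_properties_json.encode('utf-8')).decode('ascii')
assert base64.b64decode(exec_properties_b64).decode('utf-8') == exec_properties_json
```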
Example No. 4
def main():
    # Log to the container's stdout so Kubeflow Pipelines UI can display logs to
    # the user.
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    logging.getLogger().setLevel(logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument('--pipeline_name', type=str, required=True)
    parser.add_argument('--pipeline_root', type=str, required=True)
    parser.add_argument('--kubeflow_metadata_config', type=str, required=True)
    parser.add_argument('--beam_pipeline_args', type=str, required=True)
    parser.add_argument('--additional_pipeline_args', type=str, required=True)
    parser.add_argument('--component_launcher_class_path',
                        type=str,
                        required=True)
    parser.add_argument('--enable_cache', action='store_true')
    parser.add_argument('--serialized_component', type=str, required=True)
    parser.add_argument('--component_config', type=str, required=True)

    args = parser.parse_args()

    component = json_utils.loads(args.serialized_component)
    component_config = json_utils.loads(args.component_config)
    component_launcher_class = import_utils.import_class_by_path(
        args.component_launcher_class_path)
    if not issubclass(component_launcher_class,
                      base_component_launcher.BaseComponentLauncher):
        raise TypeError(
            'component_launcher_class "%s" is not subclass of base_component_launcher.BaseComponentLauncher'
            % component_launcher_class)

    kubeflow_metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
    json_format.Parse(args.kubeflow_metadata_config, kubeflow_metadata_config)
    metadata_connection = kubeflow_metadata_adapter.KubeflowMetadataAdapter(
        _get_metadata_connection_config(kubeflow_metadata_config))
    driver_args = data_types.DriverArgs(enable_cache=args.enable_cache)

    beam_pipeline_args = _make_beam_pipeline_args(args.beam_pipeline_args)

    additional_pipeline_args = json.loads(args.additional_pipeline_args)

    launcher = component_launcher_class.create(
        component=component,
        pipeline_info=data_types.PipelineInfo(
            pipeline_name=args.pipeline_name,
            pipeline_root=args.pipeline_root,
            run_id=os.environ['WORKFLOW_ID']),
        driver_args=driver_args,
        metadata_connection=metadata_connection,
        beam_pipeline_args=beam_pipeline_args,
        additional_pipeline_args=additional_pipeline_args,
        component_config=component_config)

    execution_info = launcher.launch()

    # Dump the UI metadata.
    _dump_ui_metadata(component, execution_info)
Example No. 5
def main():
    # Log to the container's stdout so it can be streamed by the orchestrator.
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    logging.getLogger().setLevel(logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument('--pipeline_name', type=str, required=True)
    parser.add_argument('--pipeline_root', type=str, required=True)
    parser.add_argument('--run_id', type=str, required=True)
    parser.add_argument('--metadata_config', type=str, required=True)
    parser.add_argument('--beam_pipeline_args', type=str, required=True)
    parser.add_argument('--additional_pipeline_args', type=str, required=True)
    parser.add_argument('--component_launcher_class_path',
                        type=str,
                        required=True)
    parser.add_argument('--enable_cache', action='store_true')
    parser.add_argument('--serialized_component', type=str, required=True)
    parser.add_argument('--component_config', type=str, required=True)

    args = parser.parse_args()

    component = json_utils.loads(args.serialized_component)
    component_config = json_utils.loads(args.component_config)
    component_launcher_class = import_utils.import_class_by_path(
        args.component_launcher_class_path)
    if not issubclass(component_launcher_class,
                      base_component_launcher.BaseComponentLauncher):
        raise TypeError(
            'component_launcher_class "%s" is not subclass of base_component_launcher.BaseComponentLauncher'
            % component_launcher_class)

    metadata_config = metadata_store_pb2.ConnectionConfig()
    json_format.Parse(args.metadata_config, metadata_config)
    driver_args = data_types.DriverArgs(enable_cache=args.enable_cache)
    beam_pipeline_args = json.loads(args.beam_pipeline_args)
    additional_pipeline_args = json.loads(args.additional_pipeline_args)

    launcher = component_launcher_class.create(
        component=component,
        pipeline_info=data_types.PipelineInfo(
            pipeline_name=args.pipeline_name,
            pipeline_root=args.pipeline_root,
            run_id=args.run_id,
        ),
        driver_args=driver_args,
        metadata_connection=metadata.Metadata(
            connection_config=metadata_config),
        beam_pipeline_args=beam_pipeline_args,
        additional_pipeline_args=additional_pipeline_args,
        component_config=component_config)

    # Attach necessary labels to distinguish different runner and DSL.
    with telemetry_utils.scoped_labels({
            telemetry_utils.LABEL_TFX_RUNNER:
            'kubernetes',
    }):
        launcher.launch()
Example No. 6
    def __init__(self, driver_spec: message.Message,
                 mlmd_connection: metadata.Metadata):
        """Constructor.

    Args:
      driver_spec: The specification of how to initialize the driver.
      mlmd_connection: ML metadata connection.

    Raises:
      RuntimeError: if the driver_spec is not supported.
    """
        super().__init__(driver_spec, mlmd_connection)

        python_class_driver_spec = cast(
            pipeline_pb2.ExecutorSpec.PythonClassExecutorSpec, driver_spec)
        self._driver = import_utils.import_class_by_path(
            python_class_driver_spec.class_path)(self._mlmd_connection)
Example No. 7
    def __init__(self,
                 executor_spec: message.Message,
                 platform_spec: Optional[message.Message] = None):
        """Initialize an PythonExecutorOperator.

    Args:
      executor_spec: The specification of how to initialize the executor.
      platform_spec: The specification of how to allocate resource for the
        executor.
    """
        # Python executors run locally, so platform_spec is not used.
        del platform_spec
        super(PythonExecutorOperator, self).__init__(executor_spec)
        python_class_executor_spec = cast(
            pipeline_pb2.ExecutorSpec.PythonClassExecutorSpec,
            self._executor_spec)
        self._executor_cls = import_utils.import_class_by_path(
            python_class_executor_spec.class_path)
Example No. 8
    def __init__(
        self,
        executor_class_path: Text,
        name: Text,
        input_dict: Dict[Text, List[types.TfxArtifact]],
        outputs: Text,
        exec_properties: Dict[Text, Any],
    ):
        self._input_dict = input_dict
        self._output_dict = types.parse_tfx_type_dict(outputs)
        self._component_name = to_snake_case(name)
        self._exec_properties = exec_properties
        self._output_dir = self._exec_properties['output_dir']
        self._workflow_id = os.environ['WORKFLOW_ID']

        raw_args = self._exec_properties.get('beam_pipeline_args', [])

        # Beam expects str types for its pipeline args. Ensure unicode type is
        # converted to str if required.
        beam_pipeline_args = []
        for arg in raw_args:
            # In order to support both Py2 and Py3: Py3 doesn't have `unicode` type.
            if six.PY2 and isinstance(arg, unicode):
                arg = arg.encode('ascii', 'ignore')

            beam_pipeline_args.append(arg)

        # TODO(zhitaoli): Revisit usage of setup_file here.
        module_dir = os.path.dirname(os.path.dirname(version.__file__))
        setup_file = os.path.join(module_dir, 'setup.py')
        tf.logging.info('Using setup_file \'%s\' to capture TFX dependencies',
                        setup_file)
        beam_pipeline_args.append('--setup_file={}'.format(setup_file))

        executor_cls = import_utils.import_class_by_path(executor_class_path)
        # TODO(swoonna): Switch to execution_id when available
        unique_id = '{}_{}'.format(self._component_name, self._workflow_id)
        # TODO(swoonna): Add tmp_dir to additional_pipeline_args
        executor_context = base_executor.BaseExecutor.Context(
            beam_pipeline_args=beam_pipeline_args,
            tmp_dir=os.path.join(self._output_dir, '.temp', ''),
            unique_id=unique_id)
        self._executor = executor_cls(executor_context)
Example No. 9
  def __init__(self,
               executor_spec: message.Message,
               platform_config: Optional[message.Message] = None):
    """Initializes a BeamExecutorOperator.

    Args:
      executor_spec: The specification of how to initialize the executor.
      platform_config: The specification of how to allocate resource for the
        executor.
    """
    del platform_config
    super().__init__(executor_spec)
    beam_executor_spec = cast(executable_spec_pb2.BeamExecutableSpec,
                              self._executor_spec)
    self._executor_cls = import_utils.import_class_by_path(
        beam_executor_spec.python_executor_spec.class_path)
    self.extra_flags = []
    self.extra_flags.extend(beam_executor_spec.python_executor_spec.extra_flags)
    self.beam_pipeline_args = []
    self.beam_pipeline_args.extend(beam_executor_spec.beam_pipeline_args)
Example No. 10
  def __init__(self,
               executor_spec: message.Message,
               platform_config: Optional[message.Message] = None):
    """Initializes a PythonExecutorOperator.

    Args:
      executor_spec: The specification of how to initialize the executor.
      platform_config: The specification of how to allocate resource for the
        executor.
    """
    # Python executors run locally, so platform_config is not used.
    del platform_config
    super().__init__(executor_spec)
    python_class_executor_spec = cast(
        executable_spec_pb2.PythonClassExecutableSpec, self._executor_spec)
    self._executor_cls = import_utils.import_class_by_path(
        python_class_executor_spec.class_path)
    self.extra_flags = []
    self.extra_flags.extend(python_class_executor_spec.extra_flags)
    self.extra_flags.extend(sys.argv[1:])
Example No. 11
def _parse_raw_artifact(
        artifact_pb: pipeline_pb2.RuntimeArtifact,
        name_from_id: MutableMapping[int, str]) -> artifact.Artifact:
    """Parses RuntimeArtifact proto message without artifact_type."""
    # This parser can only preserve what's inside the RuntimeArtifact pb message.

    # Recovers the type information from artifact type schema.
    # TODO(b/170261670): Replace this workaround by a more resilient
    # implementation. Currently custom artifact type can hardly be supported.
    assert (artifact_pb.type
            and artifact_pb.type.WhichOneof('kind') == 'instance_schema'
            and artifact_pb.type.instance_schema), (
                'RuntimeArtifact is expected to have '
                'instance_schema populated.')
    # 1. Import the artifact class from preloaded TFX library.
    type_path = _retrieve_class_path(artifact_pb.type.instance_schema)
    artifact_cls = import_utils.import_class_by_path(type_path)

    # 2. Copy properties and custom properties to the MLMD artifact pb.
    mlmd_artifact = metadata_store_pb2.Artifact()
    # TODO(b/135056715): Change to a unified getter/setter of Artifact type
    # once it's ready.
    if artifact_pb.name:
        # TODO(b/169583143): Remove this workaround when TFX migrates to use
        # str-typed id/name to identify artifacts.
        # Convert and populate the MLMD artifact ID.
        mlmd_artifact.id = _get_hashed_id(artifact_pb.name, name_from_id)

    mlmd_artifact.uri = artifact_pb.uri
    for k, v in artifact_pb.properties.items():
        mlmd_artifact.properties[k].CopyFrom(compiler_utils.get_mlmd_value(v))

    for k, v in artifact_pb.custom_properties.items():
        mlmd_artifact.custom_properties[k].CopyFrom(
            compiler_utils.get_mlmd_value(v))

    # 3. Instantiate the artifact Python object.
    result = artifact_cls()
    result.set_mlmd_artifact(mlmd_artifact)

    return result
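
The workaround above leans on a helper _get_hashed_id that maps the string artifact name to an int ID and records the reverse mapping in name_from_id. Its body is not shown on this page; the following is a purely hypothetical sketch of such a helper, assuming a deterministic hash truncated to fit MLMD's int64 id field:

```
import hashlib
from typing import MutableMapping


def _get_hashed_id(name: str, name_from_id: MutableMapping[int, str]) -> int:
    """Hypothetical: derive a deterministic int ID from a name and remember the mapping."""
    artifact_id = int(hashlib.sha256(name.encode('utf-8')).hexdigest(), 16) % (1 << 63)
    name_from_id[artifact_id] = name
    return artifact_id
```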
Example No. 12
  def __init__(self, driver_spec: message.Message,
               mlmd_connection: metadata.Metadata,
               pipeline_info: pipeline_pb2.PipelineInfo,
               pipeline_node: pipeline_pb2.PipelineNode):
    """Constructor.

    Args:
      driver_spec: The specification of how to initialize the driver.
      mlmd_connection: ML metadata connection.
      pipeline_info: The information of the pipeline that this driver is in.
      pipeline_node: The specification of the node that this driver is in.

    Raises:
      RuntimeError: if the driver_spec is not supported.
    """
    super(PythonDriverOperator, self).__init__(driver_spec, mlmd_connection,
                                               pipeline_info, pipeline_node)

    python_class_driver_spec = cast(
        pipeline_pb2.ExecutorSpec.PythonClassExecutorSpec, driver_spec)
    self._driver = import_utils.import_class_by_path(
        python_class_driver_spec.class_path)(self._mlmd_connection,
                                             self._pipeline_info,
                                             self._pipeline_node)
Example No. 13
def _run_executor(args: argparse.Namespace, beam_args: List[str]) -> None:
  """Selects a particular executor and run it based on name.

  Args:
    args:
      --executor_class_path: The import path of the executor class.
      --json_serialized_invocation_args: Full JSON-serialized parameters for
        this execution.
    beam_args: Optional parameter that maps to the optional_pipeline_args
      parameter in the pipeline, which provides additional configuration options
      for apache-beam and tensorflow.logging.
    For more about the beam arguments please refer to:
    https://cloud.google.com/dataflow/docs/guides/specifying-exec-params
  """
  logging.set_verbosity(logging.INFO)

  # Rehydrate inputs/outputs/exec_properties from the serialized metadata.
  executor_input = pipeline_spec_pb2.ExecutorInput()
  json_format.Parse(
      args.json_serialized_invocation_args,
      executor_input,
      ignore_unknown_fields=True)

  inputs_dict = executor_input.inputs.artifacts
  outputs_dict = executor_input.outputs.artifacts
  inputs_parameter = executor_input.inputs.parameters

  if fileio.exists(executor_input.outputs.output_file):
    # It has a driver that outputs the updated exec_properties in this file.
    with fileio.open(executor_input.outputs.output_file,
                     'rb') as output_meta_json:
      output_metadata = pipeline_spec_pb2.ExecutorOutput()
      json_format.Parse(
          output_meta_json.read(), output_metadata, ignore_unknown_fields=True)
      # Append/Overwrite exec_properties.
      for k, v in output_metadata.parameters.items():
        inputs_parameter[k].CopyFrom(v)

  name_from_id = {}

  inputs = kubeflow_v2_entrypoint_utils.parse_raw_artifact_dict(
      inputs_dict, name_from_id)
  outputs = kubeflow_v2_entrypoint_utils.parse_raw_artifact_dict(
      outputs_dict, name_from_id)
  exec_properties = kubeflow_v2_entrypoint_utils.parse_execution_properties(
      inputs_parameter)
  logging.info('Executor %s do: inputs: %s, outputs: %s, exec_properties: %s',
               args.executor_class_path, inputs, outputs, exec_properties)
  executor_cls = import_utils.import_class_by_path(args.executor_class_path)
  if issubclass(executor_cls, base_beam_executor.BaseBeamExecutor):
    executor_context = base_beam_executor.BaseBeamExecutor.Context(
        beam_pipeline_args=beam_args, unique_id='', tmp_dir='/tmp')
  else:
    executor_context = base_executor.BaseExecutor.Context(
        extra_flags=beam_args, unique_id='', tmp_dir='/tmp')
  executor = executor_cls(executor_context)
  logging.info('Starting executor')
  executor.Do(inputs, outputs, exec_properties)

  # TODO(b/182316162): Unify publisher handling so that post-execution artifact
  # logic is more cleanly handled.
  outputs_utils.tag_output_artifacts_with_version(outputs)  # pylint: disable=protected-access

  # TODO(b/169583143): Remove this workaround when TFX migrates to use str-typed
  # id/name to identify artifacts.
  # Convert ModelBlessing artifact to use managed MLMD resource name.
  if (issubclass(executor_cls, evaluator_executor.Executor) and
      standard_component_specs.BLESSING_KEY in outputs):
    # Parse the parent prefix for managed MLMD resource name.
    kubeflow_v2_entrypoint_utils.refactor_model_blessing(
        artifact_utils.get_single_instance(
            outputs[standard_component_specs.BLESSING_KEY]), name_from_id)

  # Log the output metadata to a file. So that it can be picked up by MP.
  metadata_uri = executor_input.outputs.output_file
  executor_output = pipeline_spec_pb2.ExecutorOutput()
  for k, v in kubeflow_v2_entrypoint_utils.translate_executor_output(
      outputs, name_from_id).items():
    executor_output.artifacts[k].CopyFrom(v)

  fileio.makedirs(os.path.dirname(metadata_uri))
  with fileio.open(metadata_uri, 'wb') as f:
    f.write(json_format.MessageToJson(executor_output))
Example No. 14
def _run_executor(args, pipeline_args) -> None:
    r"""Select a particular executor and run it based on name.

  # pylint: disable=line-too-long
  _run_executor() is used to invoke a class subclassing
  tfx.components.base.base_executor.BaseExecutor.  This function can be used for
  both invoking the executor on remote environments as well as for unit testing
  of executors.

  How to invoke an executor as standalone:
  # TODO(b/132958430): Create utility script to generate arguments for run_executor.py
  First, the input data needs to be prepared.  An easy way to generate the test
  data is to fully run the pipeline once.  This will generate the data to be
  used for testing as well as log the artifacts to be used as input parameters.
  In each executed component, three log entries will be generated similar to the
  below:
  ```
  [2019-05-16 08:59:27,117] {logging_mixin.py:95} INFO - [2019-05-16 08:59:27,116] {base_executor.py:72} INFO - Starting Executor execution.
  [2019-05-16 08:59:27,117] {logging_mixin.py:95} INFO - [2019-05-16 08:59:27,117] {base_executor.py:74} INFO - Inputs for Executor is: {"input_base": [{"artifact": {"id": "1", "typeId": "1", "uri": "/usr/local/google/home/khaas/taxi/data/simple", "properties": {"split": {"stringValue": ""}, "state": {"stringValue": "published"}, "span": {"intValue": "1"}, "type_name": {"stringValue": "ExternalPath"}}}, "artifact_type": {"id": "1", "name": "ExternalPath", "properties": {"span": "INT", "name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING"}}}]}
  [2019-05-16 08:59:27,117] {logging_mixin.py:95} INFO - [2019-05-16 08:59:27,117] {base_executor.py:76} INFO - Outputs for Executor is: {"examples": [{"artifact": {"uri": "/usr/local/google/home/khaas/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/train/", "properties": {"type_name": {"stringValue": "ExamplesPath"}, "split": {"stringValue": "train"}, "span": {"intValue": "1"}}}, "artifact_type": {"name": "ExamplesPath", "properties": {"name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING", "span": "INT"}}}, {"artifact": {"uri": "/usr/local/google/home/khaas/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/eval/", "properties": {"type_name": {"stringValue": "ExamplesPath"}, "split": {"stringValue": "eval"}, "span": {"intValue": "1"}}}, "artifact_type": {"name": "ExamplesPath", "properties": {"name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING", "span": "INT"}}}]}
  [2019-05-16 08:59:27,117] {logging_mixin.py:95} INFO - [2019-05-16 08:59:27,117] {base_executor.py:78} INFO - Execution properties for Executor is: {"output": "{  \"splitConfig\": {\"splits\": [{\"name\": \"train\", \"hashBuckets\": 2}, {\"name\": \"eval\",\"hashBuckets\": 1}]}}"}
  ```
  Each of these maps directly to the input parameters expected by run_executor():
  ```
  python scripts/run_executor.py \
      --executor_class_path=tfx.components.example_gen.big_query_example_gen.executor.Executor \
      --inputs={"input_base": [{"artifact": {"id": "1", "typeId": "1", "uri": "/usr/local/google/home/khaas/taxi/data/simple", "properties": {"split": {"stringValue": ""}, "state": {"stringValue": "published"}, "span": {"intValue": "1"}, "type_name": {"stringValue": "ExternalPath"}}}, "artifact_type": {"id": "1", "name": "ExternalPath", "properties": {"span": "INT", "name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING"}}}]} \
      --outputs={"examples": [{"artifact": {"uri": "/usr/local/google/home/khaas/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/train/", "properties": {"type_name": {"stringValue": "ExamplesPath"}, "split": {"stringValue": "train"}, "span": {"intValue": "1"}}}, "artifact_type": {"name": "ExamplesPath", "properties": {"name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING", "span": "INT"}}}, {"artifact": {"uri": "/usr/local/google/home/khaas/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/eval/", "properties": {"type_name": {"stringValue": "ExamplesPath"}, "split": {"stringValue": "eval"}, "span": {"intValue": "1"}}}, "artifact_type": {"name": "ExamplesPath", "properties": {"name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING", "span": "INT"}}}]} \
      --exec-properties={"output": "{  \"splitConfig\": {\"splits\": [{\"name\": \"train\", \"hashBuckets\": 2}, {\"name\": \"eval\",\"hashBuckets\": 1}]}}"}
  ```
  # pylint: disable=line-too-long

  Args:
    args:
      - inputs: The input artifacts for this execution, serialized as JSON.
      - outputs: The output artifacts to be generated by this execution,
        serialized as JSON.
      - exec_properties: The execution properties to be used by this execution,
        serialized as JSON. Technically all the exec_properties values should be
        a primitive, and nested exec_properties needs to be JSON-encoded as a
        string. But as a convenience, the script allows you to feed in
        non-serialized values of exec_properties, which is then automatically
        serialized.
    pipeline_args: Optional parameter that maps to the optional_pipeline_args
    parameter in the pipeline, which provides additional configuration options
    for apache-beam and tensorflow.logging.

  Returns:
    None

  Raises:
    None
  """
    (inputs_str, outputs_str,
     exec_properties_str) = (args.inputs
                             or base64.b64decode(args.inputs_base64),
                             args.outputs
                             or base64.b64decode(args.outputs_base64),
                             args.exec_properties
                             or base64.b64decode(args.exec_properties_base64))
    inputs = artifact_utils.parse_artifact_dict(inputs_str)
    outputs = artifact_utils.parse_artifact_dict(outputs_str)
    exec_properties = json.loads(exec_properties_str)

    # Technically an exec_properties value can only be a primitive (e.g. a string),
    # and one of our conventions is to use proto objects by JSON-serializing them.
    # Unfortunately, the run_executor.py script accepts serialized exec_properties
    # as input, so a proto object value would be serialized twice. This is really
    # inconvenient when manually constructing exec_properties, so we allow feeding
    # in non-serialized values of exec_properties and serialize them here.
    for key, value in exec_properties.items():
        if isinstance(value, (dict, list)):
            exec_properties[key] = json.dumps(value)

    logging.info(
        'Executor %s do: inputs: %s, outputs: %s, exec_properties: %s',
        args.executor_class_path, inputs, outputs, exec_properties)
    executor_cls = import_utils.import_class_by_path(args.executor_class_path)
    executor_context = base_executor.BaseExecutor.Context(
        beam_pipeline_args=pipeline_args,
        tmp_dir=args.temp_directory_path,
        unique_id='')
    executor = executor_cls(executor_context)
    logging.info('Starting executor')
    executor.Do(inputs, outputs, exec_properties)

    # The last line of stdout will be pushed to xcom by Airflow.
    if args.write_outputs_stdout:
        print(artifact_utils.jsonify_artifact_dict(outputs))
Example No. 15
def _run_executor(args, pipeline_args) -> None:
    r"""Select a particular executor and run it based on name.

  # pylint: disable=line-too-long
  _run_executor() is used to invoke a class subclassing
  tfx.components.base.base_executor.BaseExecutor.  This function can be used for
  both invoking the executor on remote environments as well as for unit testing
  of executors.

  How to invoke an executor as standalone:
  # TODO(b/132958430): Create utility script to generate arguments for run_executor.py
  First, the input data needs to be prepared.  An easy way to generate the test
  data is to fully run the pipeline once.  This will generate the data to be
  used for testing as well as log the artifacts to be used as input parameters.
  In each executed component, three log entries will be generated similar to the
  below:
  ```
  [2019-05-16 08:59:27,117] {logging_mixin.py:95} INFO - [2019-05-16 08:59:27,116] {base_executor.py:72} INFO - Starting Executor execution.
  [2019-05-16 08:59:27,117] {logging_mixin.py:95} INFO - [2019-05-16 08:59:27,117] {base_executor.py:74} INFO - Inputs for Executor is: {"input_base": [{"artifact": {"id": "1", "typeId": "1", "uri": "/usr/local/google/home/khaas/taxi/data/simple", "properties": {"split": {"stringValue": ""}, "state": {"stringValue": "published"}, "span": {"intValue": "1"}, "type_name": {"stringValue": "ExternalPath"}}}, "artifact_type": {"id": "1", "name": "ExternalPath", "properties": {"span": "INT", "name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING"}}}]}
  [2019-05-16 08:59:27,117] {logging_mixin.py:95} INFO - [2019-05-16 08:59:27,117] {base_executor.py:76} INFO - Outputs for Executor is: {"examples": [{"artifact": {"uri": "/usr/local/google/home/khaas/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/train/", "properties": {"type_name": {"stringValue": "ExamplesPath"}, "split": {"stringValue": "train"}, "span": {"intValue": "1"}}}, "artifact_type": {"name": "ExamplesPath", "properties": {"name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING", "span": "INT"}}}, {"artifact": {"uri": "/usr/local/google/home/khaas/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/eval/", "properties": {"type_name": {"stringValue": "ExamplesPath"}, "split": {"stringValue": "eval"}, "span": {"intValue": "1"}}}, "artifact_type": {"name": "ExamplesPath", "properties": {"name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING", "span": "INT"}}}]}
  [2019-05-16 08:59:27,117] {logging_mixin.py:95} INFO - [2019-05-16 08:59:27,117] {base_executor.py:78} INFO - Execution properties for Executor is: {"output": "{  \"splitConfig\": {\"splits\": [{\"name\": \"train\", \"hashBuckets\": 2}, {\"name\": \"eval\",\"hashBuckets\": 1}]}}"}
  ```
  Each of these maps directly to the input parameters expected by run_executor():
  ```
  python scripts/run_executor.py \
      --executor_class_path=tfx.components.example_gen.big_query_example_gen.executor.Executor \
      --inputs={"input_base": [{"artifact": {"id": "1", "typeId": "1", "uri": "/usr/local/google/home/khaas/taxi/data/simple", "properties": {"split": {"stringValue": ""}, "state": {"stringValue": "published"}, "span": {"intValue": "1"}, "type_name": {"stringValue": "ExternalPath"}}}, "artifact_type": {"id": "1", "name": "ExternalPath", "properties": {"span": "INT", "name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING"}}}]} \
      --outputs={"examples": [{"artifact": {"uri": "/usr/local/google/home/khaas/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/train/", "properties": {"type_name": {"stringValue": "ExamplesPath"}, "split": {"stringValue": "train"}, "span": {"intValue": "1"}}}, "artifact_type": {"name": "ExamplesPath", "properties": {"name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING", "span": "INT"}}}, {"artifact": {"uri": "/usr/local/google/home/khaas/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/eval/", "properties": {"type_name": {"stringValue": "ExamplesPath"}, "split": {"stringValue": "eval"}, "span": {"intValue": "1"}}}, "artifact_type": {"name": "ExamplesPath", "properties": {"name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING", "span": "INT"}}}]} \
      --exec-properties={"output": "{  \"splitConfig\": {\"splits\": [{\"name\": \"train\", \"hashBuckets\": 2}, {\"name\": \"eval\",\"hashBuckets\": 1}]}}"}
  ```
  # pylint: disable=line-too-long

  Args:
    args:
      - inputs: The input artifacts for this execution, serialized as JSON.
      - outputs: The output artifacts to be generated by this execution,
        serialized as JSON.
      - exec_properties: The execution properties to be used by this execution,
        serialized as JSON.
    pipeline_args: Optional parameter that maps to the optional_pipeline_args
    parameter in the pipeline, which provides additional configuration options
    for apache-beam and tensorflow.logging.

  Returns:
    None

  Raises:
    None
  """

    tf.logging.set_verbosity(tf.logging.INFO)

    (inputs_str, outputs_str,
     exec_properties_str) = (args.inputs
                             or base64.b64decode(args.inputs_base64),
                             args.outputs
                             or base64.b64decode(args.outputs_base64),
                             args.exec_properties
                             or base64.b64decode(args.exec_properties_base64))

    inputs = artifact_utils.parse_artifact_dict(inputs_str)
    outputs = artifact_utils.parse_artifact_dict(outputs_str)
    exec_properties = json.loads(exec_properties_str)
    tf.logging.info(
        'Executor {} do: inputs: {}, outputs: {}, exec_properties: {}'.format(
            args.executor_class_path, inputs, outputs, exec_properties))
    executor_cls = import_utils.import_class_by_path(args.executor_class_path)
    executor_context = base_executor.BaseExecutor.Context(
        beam_pipeline_args=pipeline_args,
        tmp_dir=args.temp_directory_path,
        unique_id='')
    executor = executor_cls(executor_context)
    tf.logging.info('Starting executor')
    executor.Do(inputs, outputs, exec_properties)

    # The last line of stdout will be pushed to xcom by Airflow.
    if args.write_outputs_stdout:
        print(artifact_utils.jsonify_artifact_dict(outputs))
Example No. 16
def _parse_raw_artifact(
        artifact_pb: pipeline_pb2.RuntimeArtifact,
        name_from_id: MutableMapping[int, str]) -> artifact.Artifact:
    """Parses RuntimeArtifact proto message without artifact_type."""
    # This parser can only preserve what's inside the RuntimeArtifact pb message.

    # Recovers the type information from artifact type schema.
    # TODO(b/170261670): Replace this workaround by a more resilient
    # implementation. Currently custom artifact type can hardly be supported.
    assert artifact_pb.type, 'RuntimeArtifact is expected to have a type.'

    # 1. Import the artifact class from preloaded TFX library.
    type_path = _retrieve_class_path(artifact_pb.type)
    artifact_cls = import_utils.import_class_by_path(type_path)

    # 2. Copy properties and custom properties to the MLMD artifact pb.
    mlmd_artifact = metadata_store_pb2.Artifact()
    # TODO(b/135056715): Change to a unified getter/setter of Artifact type
    # once it's ready.
    if artifact_pb.name:
        # TODO(b/169583143): Remove this workaround when TFX migrates to use
        # str-typed id/name to identify artifacts.
        # Convert and populate the MLMD artifact ID.
        mlmd_artifact.id = _get_hashed_id(artifact_pb.name, name_from_id)

    mlmd_artifact.uri = artifact_pb.uri
    for k, v in artifact_pb.properties.items():
        mlmd_artifact.properties[k].CopyFrom(compiler_utils.get_mlmd_value(v))

    for k, v in artifact_pb.custom_properties.items():
        mlmd_artifact.custom_properties[k].CopyFrom(
            compiler_utils.get_mlmd_value(v))

    # Translate metadata items into properties and custom properties.
    mlmd_artifact_type = artifact_cls().artifact_type
    metadata_dict = json_format.MessageToDict(artifact_pb.metadata)
    for k, v in metadata_dict.items():
        if k in mlmd_artifact_type.properties:
            property_type = mlmd_artifact_type.properties[k]
            if property_type == metadata_store_pb2.INT and isinstance(
                    v, float):
                mlmd_artifact.properties[k].int_value = int(v)
                continue
            elif property_type == metadata_store_pb2.DOUBLE and isinstance(
                    v, float):
                mlmd_artifact.properties[k].double_value = v
                continue
            elif property_type == metadata_store_pb2.STRING and isinstance(
                    v, str):
                mlmd_artifact.properties[k].string_value = v
                continue
            elif property_type == metadata_store_pb2.STRUCT:
                mlmd_artifact.properties[k].struct_value.CopyFrom(
                    artifact._encode_struct_value(v))  # pylint: disable=protected-access
                continue
            # We fell through, which means the property doesn't actually fit the
            # schema. Therefore, we treat it as a custom property.

        # First, we drop the custom property prefix if we had to add it because
        # of a property name conflict.
        if k.startswith(artifact.CUSTOM_PROPERTIES_PREFIX):
            stripped_k = k[len(artifact.CUSTOM_PROPERTIES_PREFIX):]
            if stripped_k in mlmd_artifact_type.properties:
                k = stripped_k
        mlmd_artifact.custom_properties[k].struct_value.CopyFrom(
            artifact._encode_struct_value(v))  # pylint: disable=protected-access

    # 3. Instantiate the artifact Python object.
    result = artifact_cls()
    result.set_mlmd_artifact(mlmd_artifact)

    return result
Example No. 17
 def test_import_class_by_path(self):
     """Test import_class_by_path."""
     class_path = '.'.join(
         [ImportUtilsTest.__module__, ImportUtilsTest.__name__])
     imported_class = import_utils.import_class_by_path(class_path)
     self.assertEqual(ImportUtilsTest, imported_class)
Example No. 18
 def testImportClassByPath(self):
     test_class = test_fn.TestClass
     class_path = '%s.%s' % (test_class.__module__, test_class.__name__)
     imported_class = import_utils.import_class_by_path(class_path)
     self.assertEqual(test_class, imported_class)
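
Both tests build the class path exactly the way the loader consumes it: the module path and the class name joined by a dot. A short illustration of that round trip, using a standard-library class purely for demonstration:

```
import importlib
from collections import OrderedDict

class_path = '{}.{}'.format(OrderedDict.__module__, OrderedDict.__name__)
# class_path == 'collections.OrderedDict'
module_name, class_name = class_path.rsplit('.', 1)
assert getattr(importlib.import_module(module_name), class_name) is OrderedDict
```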
Example No. 19
 def _reconstruct_from_executor_class_path(executor_class_path):
   executor_class = import_utils.import_class_by_path(executor_class_path)
   return ExecutorClassSpec(executor_class)
Example No. 20
def run_component(
    full_component_class_name: Text,
    temp_directory_path: Text = None,
    beam_pipeline_args: List[Text] = None,
    **arguments
):
  r"""Loads a component, instantiates it with arguments and runs its executor.

  The component class is instantiated, so the component code is executed,
  not just the executor code.

  To pass artifact URI, use <input_name>_uri argument name.
  To pass artifact property, use <input_name>_<property> argument name.
  Protobuf property values can be passed as JSON-serialized protobufs.

  # pylint: disable=line-too-long

  Example::

    # When run as a script:
    python3 scripts/run_component.py \
      --full-component-class-name tfx.components.StatisticsGen \
      --examples-uri gs://my_bucket/chicago_taxi_simple/CsvExamplesGen/examples/1/ \
      --examples-split-names '["train", "eval"]' \
      --output-uri gs://my_bucket/chicago_taxi_simple/StatisticsGen/output/1/

    # When run as a function:
    run_component(
      full_component_class_name='tfx.components.StatisticsGen',
      examples_uri='gs://my_bucket/chicago_taxi_simple/CsvExamplesGen/sxamples/1/',
      examples_split_names='["train", "eval"]',
      output_uri='gs://my_bucket/chicago_taxi_simple/StatisticsGen/output/1/',
    )

  Args:
    full_component_class_name: The component class name including module name.
    temp_directory_path: Optional. Temporary directory path for the executor.
    beam_pipeline_args: Optional. Arguments to pass to the Beam pipeline.
    **arguments: Key-value pairs with component arguments.
  """
  component_class = import_utils.import_class_by_path(full_component_class_name)

  component_arguments = {}

  for name, execution_param in component_class.SPEC_CLASS.PARAMETERS.items():
    argument_value = arguments.get(name, None)
    if argument_value is None:
      continue
    param_type = execution_param.type
    if (isinstance(param_type, type) and
        issubclass(param_type, message.Message)):
      argument_value_obj = param_type()
      json_format.Parse(argument_value, argument_value_obj)
    else:
      argument_value_obj = argument_value
    component_arguments[name] = argument_value_obj

  for input_name, channel_param in component_class.SPEC_CLASS.INPUTS.items():
    uri = (arguments.get(input_name + '_uri') or
           arguments.get(input_name + '_path'))
    if uri:
      artifact = channel_param.type()
      artifact.uri = uri
      # Setting the artifact properties
      for property_name in channel_param.type.PROPERTIES:
        property_arg_name = input_name + '_' + property_name
        if property_arg_name in arguments:
          setattr(artifact, property_name, arguments[property_arg_name])
      component_arguments[input_name] = channel_utils.as_channel([artifact])

  component_instance = component_class(**component_arguments)

  input_dict = channel_utils.unwrap_channel_dict(
      component_instance.inputs.get_all())
  output_dict = channel_utils.unwrap_channel_dict(
      component_instance.outputs.get_all())
  exec_properties = component_instance.exec_properties

  # Generating paths for output artifacts
  for output_name, artifacts in output_dict.items():
    uri = (arguments.get('output_' + output_name + '_uri') or
           arguments.get(output_name + '_uri') or
           arguments.get(output_name + '_path'))
    if uri:
      for artifact in artifacts:
        artifact.uri = uri

  executor_context = base_executor.BaseExecutor.Context(
      beam_pipeline_args=beam_pipeline_args,
      tmp_dir=temp_directory_path,
      unique_id='',
  )
  executor = component_instance.executor_spec.executor_class(executor_context)
  executor.Do(
      input_dict=input_dict,
      output_dict=output_dict,
      exec_properties=exec_properties,
  )
Example No. 21
def main():
    # Log to the container's stdout so Kubeflow Pipelines UI can display logs to
    # the user.
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    logging.getLogger().setLevel(logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument('--pipeline_name', type=str, required=True)
    parser.add_argument('--pipeline_root', type=str, required=True)
    parser.add_argument('--kubeflow_metadata_config', type=str, required=True)
    parser.add_argument('--additional_pipeline_args', type=str, required=True)
    parser.add_argument('--component_id', type=str, required=True)
    parser.add_argument('--component_type', type=str, required=True)
    parser.add_argument('--driver_class_path', type=str, required=True)
    parser.add_argument('--executor_spec', type=str, required=True)
    parser.add_argument('--component_launcher_class_path',
                        type=str,
                        required=True)
    parser.add_argument('--inputs', type=str, required=True)
    parser.add_argument('--outputs', type=str, required=True)
    parser.add_argument('--exec_properties', type=str, required=True)
    parser.add_argument('--enable_cache', action='store_true')

    args = parser.parse_args()

    inputs = artifact_utils.parse_artifact_dict(args.inputs)
    input_dict = _make_channel_dict(inputs)

    outputs = artifact_utils.parse_artifact_dict(args.outputs)
    output_dict = _make_channel_dict(outputs)

    exec_properties = json.loads(args.exec_properties)

    driver_class = import_utils.import_class_by_path(args.driver_class_path)
    executor_spec = json_utils.loads(args.executor_spec)

    component_launcher_class = import_utils.import_class_by_path(
        args.component_launcher_class_path)
    if not issubclass(component_launcher_class,
                      base_component_launcher.BaseComponentLauncher):
        raise TypeError(
            'component_launcher_class "%s" is not subclass of base_component_launcher.BaseComponentLauncher'
            % component_launcher_class)

    kubeflow_metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
    json_format.Parse(args.kubeflow_metadata_config, kubeflow_metadata_config)
    connection_config = _get_metadata_connection_config(
        kubeflow_metadata_config)

    component_info = data_types.ComponentInfo(
        component_type=args.component_type, component_id=args.component_id)

    driver_args = data_types.DriverArgs(enable_cache=args.enable_cache)

    additional_pipeline_args = _make_additional_pipeline_args(
        args.additional_pipeline_args)

    # TODO(hongyes): create a classmethod to create launcher from a deserialized
    # component.
    launcher = component_launcher_class(
        component_info=component_info,
        driver_class=driver_class,
        component_executor_spec=executor_spec,
        input_dict=input_dict,
        output_dict=output_dict,
        exec_properties=exec_properties,
        pipeline_info=data_types.PipelineInfo(
            pipeline_name=args.pipeline_name,
            pipeline_root=args.pipeline_root,
            run_id=os.environ['WORKFLOW_ID']),
        driver_args=driver_args,
        metadata_connection_config=connection_config,
        additional_pipeline_args=additional_pipeline_args)

    launcher.launch()
Example No. 22
def main():
  # Log to the container's stdout so Kubeflow Pipelines UI can display logs to
  # the user.
  logging.basicConfig(stream=sys.stdout, level=logging.INFO)
  logging.getLogger().setLevel(logging.INFO)

  parser = argparse.ArgumentParser()
  parser.add_argument('--pipeline_name', type=str, required=True)
  parser.add_argument('--pipeline_root', type=str, required=True)
  parser.add_argument('--kubeflow_metadata_config', type=str, required=True)
  parser.add_argument('--beam_pipeline_args', type=str, required=True)
  parser.add_argument('--additional_pipeline_args', type=str, required=True)
  parser.add_argument(
      '--component_launcher_class_path', type=str, required=True)
  parser.add_argument('--enable_cache', action='store_true')
  parser.add_argument('--serialized_component', type=str, required=True)
  parser.add_argument('--component_config', type=str, required=True)

  args = parser.parse_args()

  component = json_utils.loads(args.serialized_component)
  component_config = json_utils.loads(args.component_config)
  component_launcher_class = import_utils.import_class_by_path(
      args.component_launcher_class_path)
  if not issubclass(component_launcher_class,
                    base_component_launcher.BaseComponentLauncher):
    raise TypeError(
        'component_launcher_class "%s" is not subclass of base_component_launcher.BaseComponentLauncher'
        % component_launcher_class)

  kubeflow_metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
  json_format.Parse(args.kubeflow_metadata_config, kubeflow_metadata_config)
  metadata_connection = kubeflow_metadata_adapter.KubeflowMetadataAdapter(
      _get_metadata_connection_config(kubeflow_metadata_config))
  driver_args = data_types.DriverArgs(enable_cache=args.enable_cache)

  beam_pipeline_args = json.loads(args.beam_pipeline_args)

  additional_pipeline_args = json.loads(args.additional_pipeline_args)

  launcher = component_launcher_class.create(
      component=component,
      pipeline_info=data_types.PipelineInfo(
          pipeline_name=args.pipeline_name,
          pipeline_root=args.pipeline_root,
          run_id=os.environ['WORKFLOW_ID']),
      driver_args=driver_args,
      metadata_connection=metadata_connection,
      beam_pipeline_args=beam_pipeline_args,
      additional_pipeline_args=additional_pipeline_args,
      component_config=component_config)

  # Attach necessary labels to distinguish different runner and DSL.
  # TODO(zhitaoli): Pass this from KFP runner side when the same container
  # entrypoint can be used by a different runner.
  with telemetry_utils.scoped_labels({
      telemetry_utils.LABEL_TFX_RUNNER: 'kfp',
  }):
    execution_info = launcher.launch()

  # Dump the UI metadata.
  _dump_ui_metadata(component, execution_info)
Example No. 23
def run_component(full_component_class_name: str,
                  temp_directory_path: Optional[str] = None,
                  beam_pipeline_args: Optional[List[str]] = None,
                  **arguments):
    r"""Loads a component, instantiates it with arguments and runs its executor.

  The component class is instantiated, so the component code is executed,
  not just the executor code.

  To pass artifact URI, use <input_name>_uri argument name.
  To pass artifact property, use <input_name>_<property> argument name.
  Protobuf property values can be passed as JSON-serialized protobufs.

  # pylint: disable=line-too-long

  Example::

    # When run as a script:
    python3 scripts/run_component.py \
      --full-component-class-name tfx.components.StatisticsGen \
      --examples-uri gs://my_bucket/chicago_taxi_simple/CsvExamplesGen/examples/1/ \
      --examples-split-names '["train", "eval"]' \
      --output-uri gs://my_bucket/chicago_taxi_simple/StatisticsGen/output/1/

    # When run as a function:
    run_component(
      full_component_class_name='tfx.components.StatisticsGen',
      examples_uri='gs://my_bucket/chicago_taxi_simple/CsvExamplesGen/sxamples/1/',
      examples_split_names='["train", "eval"]',
      output_uri='gs://my_bucket/chicago_taxi_simple/StatisticsGen/output/1/',
    )

  Args:
    full_component_class_name: The component class name including module name.
    temp_directory_path: Optional. Temporary directory path for the executor.
    beam_pipeline_args: Optional. Arguments to pass to the Beam pipeline.
    **arguments: Key-value pairs with component arguments.
  """
    component_class = import_utils.import_class_by_path(
        full_component_class_name)

    component_arguments = {}

    for name, execution_param in component_class.SPEC_CLASS.PARAMETERS.items():
        argument_value = arguments.get(name, None)
        if argument_value is None:
            continue
        param_type = execution_param.type
        if (isinstance(param_type, type)
                and issubclass(param_type, message.Message)):
            argument_value_obj = param_type()
            proto_utils.json_to_proto(argument_value, argument_value_obj)
        elif param_type is int:
            argument_value_obj = int(argument_value)
        elif param_type is float:
            argument_value_obj = float(argument_value)
        else:
            argument_value_obj = argument_value
        component_arguments[name] = argument_value_obj

    for input_name, channel_param in component_class.SPEC_CLASS.INPUTS.items():
        uri = (arguments.get(input_name + '_uri')
               or arguments.get(input_name + '_path'))
        if uri:
            artifact = channel_param.type()
            artifact.uri = uri
            # Setting the artifact properties
            for property_name, property_spec in (channel_param.type.PROPERTIES
                                                 or {}).items():
                property_arg_name = input_name + '_' + property_name
                if property_arg_name in arguments:
                    property_value = arguments[property_arg_name]
                    if property_spec.type == PropertyType.INT:
                        property_value = int(property_value)
                    if property_spec.type == PropertyType.FLOAT:
                        property_value = float(property_value)
                    setattr(artifact, property_name, property_value)
            component_arguments[input_name] = channel_utils.as_channel(
                [artifact])

    component_instance = component_class(**component_arguments)

    input_dict = channel_utils.unwrap_channel_dict(component_instance.inputs)
    output_dict = channel_utils.unwrap_channel_dict(component_instance.outputs)
    exec_properties = component_instance.exec_properties

    # Generating paths for output artifacts
    for output_name, channel_param in component_class.SPEC_CLASS.OUTPUTS.items(
    ):
        uri = (arguments.get('output_' + output_name + '_uri')
               or arguments.get(output_name + '_uri')
               or arguments.get(output_name + '_path'))
        if uri:
            artifacts = output_dict[output_name]
            if not artifacts:
                artifacts.append(channel_param.type())
            for artifact in artifacts:
                artifact.uri = uri

    if issubclass(component_instance.executor_spec.executor_class,
                  base_beam_executor.BaseBeamExecutor):
        executor_context = base_beam_executor.BaseBeamExecutor.Context(
            beam_pipeline_args=beam_pipeline_args,
            tmp_dir=temp_directory_path,
            unique_id='',
        )
    else:
        executor_context = base_executor.BaseExecutor.Context(
            extra_flags=beam_pipeline_args,
            tmp_dir=temp_directory_path,
            unique_id='',
        )
    executor = component_instance.executor_spec.executor_class(
        executor_context)
    executor.Do(
        input_dict=input_dict,
        output_dict=output_dict,
        exec_properties=exec_properties,
    )

    # Writing out the output artifact properties
    for output_name, channel_param in component_class.SPEC_CLASS.OUTPUTS.items(
    ):
        for property_name in channel_param.type.PROPERTIES or []:
            property_path_arg_name = output_name + '_' + property_name + '_path'
            property_path = arguments.get(property_path_arg_name)
            if property_path:
                artifacts = output_dict[output_name]
                for artifact in artifacts:
                    property_value = getattr(artifact, property_name)
                    os.makedirs(os.path.dirname(property_path), exist_ok=True)
                    with open(property_path, 'w') as f:
                        f.write(str(property_value))
Example No. 24
def _run_executor(args: argparse.Namespace, beam_args: List[str]) -> None:
    """Selects a particular executor and run it based on name.

  Args:
    args:
      --executor_class_path: The import path of the executor class.
      --json_serialized_invocation_args: Full JSON-serialized parameters for
        this execution. See go/mp-alpha-placeholder for details.
    beam_args: Optional parameter that maps to the optional_pipeline_args
      parameter in the pipeline, which provides additional configuration options
      for apache-beam and tensorflow.logging.
    For more about the beam arguments please refer to:
    https://cloud.google.com/dataflow/docs/guides/specifying-exec-params
  """
    logging.set_verbosity(logging.INFO)

    # Rehydrate inputs/outputs/exec_properties from the serialized metadata.
    executor_input = pipeline_pb2.ExecutorInput()
    json_format.Parse(args.json_serialized_invocation_args,
                      executor_input,
                      ignore_unknown_fields=True)

    inputs_dict = executor_input.inputs.artifacts
    outputs_dict = executor_input.outputs.artifacts
    inputs_parameter = executor_input.inputs.parameters

    name_from_id = {}

    inputs = kubeflow_v2_entrypoint_utils.parse_raw_artifact_dict(
        inputs_dict, name_from_id)
    outputs = kubeflow_v2_entrypoint_utils.parse_raw_artifact_dict(
        outputs_dict, name_from_id)
    exec_properties = kubeflow_v2_entrypoint_utils.parse_execution_properties(
        inputs_parameter)
    logging.info(
        'Executor %s do: inputs: %s, outputs: %s, exec_properties: %s',
        args.executor_class_path, inputs, outputs, exec_properties)
    executor_cls = import_utils.import_class_by_path(args.executor_class_path)
    executor_context = base_executor.BaseExecutor.Context(
        beam_pipeline_args=beam_args, unique_id='')
    executor = executor_cls(executor_context)
    logging.info('Starting executor')
    executor.Do(inputs, outputs, exec_properties)

    # TODO(b/169583143): Remove this workaround when TFX migrates to use str-typed
    # id/name to identify artifacts.
    # Convert ModelBlessing artifact to use managed MLMD resource name.
    if (issubclass(executor_cls, evaluator_executor.Executor)
            and constants.BLESSING_KEY in outputs):
        # Parse the parent prefix for managed MLMD resource name.
        kubeflow_v2_entrypoint_utils.refactor_model_blessing(
            artifact_utils.get_single_instance(
                outputs[constants.BLESSING_KEY]), name_from_id)

    # Log the output metadata to a file. So that it can be picked up by MP.
    metadata_uri = executor_input.outputs.output_file
    executor_output = pipeline_pb2.ExecutorOutput()
    for k, v in kubeflow_v2_entrypoint_utils.translate_executor_output(
            outputs, name_from_id).items():
        executor_output.artifacts[k].CopyFrom(v)

    with fileio.open(metadata_uri, 'wb') as f:
        f.write(json_format.MessageToJson(executor_output))