Example #1
  def run(self, pipeline: pipeline_py.Pipeline) -> None:
    """Runs given logical pipeline locally.

    Args:
      pipeline: Logical pipeline containing pipeline args and components.
    """
    # When the CLI creates or updates a pipeline, it only extracts
    # pipeline_args, so we skip executing the pipeline here.
    if 'TFX_JSON_EXPORT_PIPELINE_ARGS_PATH' in os.environ:
      return

    for component in pipeline.components:
      # TODO(b/187122662): Pass through pip dependencies as a first-class
      # component flag.
      if isinstance(component, base_component.BaseComponent):
        component._resolve_pip_dependencies(  # pylint: disable=protected-access
            pipeline.pipeline_info.pipeline_root)

    c = compiler.Compiler()
    pipeline = c.compile(pipeline)

    # Substitute the runtime parameter with a concrete run_id.
    runtime_parameter_utils.substitute_runtime_parameter(
        pipeline, {
            constants.PIPELINE_RUN_ID_PARAMETER_NAME:
                datetime.datetime.now().isoformat(),
        })

    deployment_config = runner_utils.extract_local_deployment_config(pipeline)
    connection_config = deployment_config.metadata_connection_config

    logging.info('Running pipeline:\n %s', pipeline)
    logging.info('Using deployment config:\n %s', deployment_config)
    logging.info('Using connection config:\n %s', connection_config)

    with telemetry_utils.scoped_labels(
        {telemetry_utils.LABEL_TFX_RUNNER: 'local'}):
      # Run each component. Note that the compiled pipeline.nodes list is in
      # topological order.
      #
      # TODO(b/171319478): After IR-based execution is used, use multi-threaded
      # execution so that independent components can be run in parallel.
      for node in pipeline.nodes:
        pipeline_node = node.pipeline_node
        node_id = pipeline_node.node_info.id
        executor_spec = runner_utils.extract_executor_spec(
            deployment_config, node_id)
        custom_driver_spec = runner_utils.extract_custom_driver_spec(
            deployment_config, node_id)

        component_launcher = launcher.Launcher(
            pipeline_node=pipeline_node,
            mlmd_connection=metadata.Metadata(connection_config),
            pipeline_info=pipeline.pipeline_info,
            pipeline_runtime_spec=pipeline.runtime_spec,
            executor_spec=executor_spec,
            custom_driver_spec=custom_driver_spec)
        logging.info('Component %s is running.', node_id)
        component_launcher.launch()
        logging.info('Component %s is finished.', node_id)
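
This run method is TFX's local runner; a minimal usage sketch, assuming it is exposed as tfx.orchestration.LocalDagRunner (the name recent TFX releases use) and that create_pipeline() is a hypothetical helper returning the logical pipeline:

from tfx import v1 as tfx

# Hypothetical helper: builds and returns the logical pipeline object.
pipeline = create_pipeline()

# Runs every component in-process, in topological order, as shown above.
tfx.orchestration.LocalDagRunner().run(pipeline)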
Example #2
    def run(self,
            pipeline: tfx_pipeline.Pipeline,
            run_name: Optional[str] = None) -> "airflow.DAG":
        """Deploys given logical pipeline on Airflow.

        Args:
          pipeline: Logical pipeline containing pipeline args and components.
          run_name: Optional name for the run.

        Returns:
          An Airflow DAG.
        """
        # Only import these when needed.
        import airflow  # noqa

        from zenml.integrations.airflow.orchestrators import airflow_component

        # Merge the Airflow-specific DAG config into the DAG constructor args.
        airflow_dag = airflow.DAG(
            dag_id=pipeline.pipeline_info.pipeline_name,
            **(typing.cast(AirflowPipelineConfig,
                           self._config).airflow_dag_config),
            is_paused_upon_creation=False,
            catchup=False,  # no backfill
        )
        if "tmp_dir" not in pipeline.additional_pipeline_args:
            tmp_dir = os.path.join(pipeline.pipeline_info.pipeline_root,
                                   ".temp", "")
            pipeline.additional_pipeline_args["tmp_dir"] = tmp_dir

        for component in pipeline.components:
            if isinstance(component, base_component.BaseComponent):
                component._resolve_pip_dependencies(
                    pipeline.pipeline_info.pipeline_root)
            self._replace_runtime_params(component)

        c = compiler.Compiler()
        pipeline = c.compile(pipeline)

        run_name = run_name or datetime.now().strftime("%d_%h_%y-%H_%M_%S_%f")
        # Substitute the runtime parameter with a concrete run_id.
        runtime_parameter_utils.substitute_runtime_parameter(
            pipeline,
            {
                "pipeline-run-id": run_name,
            },
        )
        deployment_config = runner_utils.extract_local_deployment_config(
            pipeline)
        connection_config = deployment_config.metadata_connection_config  # type: ignore[attr-defined] # noqa

        component_impl_map = {}

        for node in pipeline.nodes:
            pipeline_node = node.pipeline_node
            node_id = pipeline_node.node_info.id
            executor_spec = runner_utils.extract_executor_spec(
                deployment_config, node_id)
            custom_driver_spec = runner_utils.extract_custom_driver_spec(
                deployment_config, node_id)

            current_airflow_component = airflow_component.AirflowComponent(
                parent_dag=airflow_dag,
                pipeline_node=pipeline_node,
                mlmd_connection=connection_config,
                pipeline_info=pipeline.pipeline_info,
                pipeline_runtime_spec=pipeline.runtime_spec,
                executor_spec=executor_spec,
                custom_driver_spec=custom_driver_spec,
            )
            component_impl_map[node_id] = current_airflow_component
            for upstream_node in node.pipeline_node.upstream_nodes:
                assert (upstream_node in component_impl_map
                        ), "Components are not in topological order"
                current_airflow_component.set_upstream(
                    component_impl_map[upstream_node])

        return airflow_dag
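
Since run returns the airflow.DAG rather than executing it, the result must be bound to a module-level name in a file under Airflow's dags folder for the scheduler to discover it. A sketch, where runner and my_pipeline are hypothetical stand-ins:

# dags/my_tfx_dag.py (hypothetical DAG file)
# `runner` is an instance of the orchestrator above; `my_pipeline` is the
# logical pipeline. Airflow only schedules DAG objects found at module scope.
dag = runner.run(my_pipeline, run_name="manual_run")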
Example #3
  def _extract_deployment_config(
      self, pipeline: pipeline_pb2.Pipeline
  ) -> local_deployment_config_pb2.LocalDeploymentConfig:
    """Unpacks the proto.Any pipeline.deployment_config into a LocalDeploymentConfig."""
    return runner_utils.extract_local_deployment_config(pipeline)
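
runner_utils.extract_local_deployment_config is not shown here; a plausible equivalent under the hood is a manual proto.Any unpack (a sketch, not the actual helper):

def _unpack_deployment_config(
    pipeline: pipeline_pb2.Pipeline,
) -> local_deployment_config_pb2.LocalDeploymentConfig:
  # Sketch: pipeline.deployment_config is a google.protobuf.Any; Unpack()
  # returns False when the payload is not the requested message type.
  config = local_deployment_config_pb2.LocalDeploymentConfig()
  if not pipeline.deployment_config.Unpack(config):
    raise ValueError('deployment_config is not a LocalDeploymentConfig')
  return config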
Example #4
def main(argv):
    # Log to the container's stdout so the Kubeflow Pipelines UI can display
    # logs to the user.
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    logging.getLogger().setLevel(logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument('--pipeline_root', type=str, required=True)
    parser.add_argument('--metadata_ui_path',
                        type=str,
                        required=False,
                        default='/mlpipeline-ui-metadata.json')
    parser.add_argument('--kubeflow_metadata_config', type=str, required=True)
    parser.add_argument('--tfx_ir', type=str, required=True)
    parser.add_argument('--node_id', type=str, required=True)
    # There may be multiple runtime parameters; with action="append",
    # `args.runtime_parameter` becomes a List[str].
    parser.add_argument('--runtime_parameter', type=str, action='append')

    # TODO(b/196892362): Replace hooking with a more straightforward mechanism.
    launcher._register_execution = _register_execution  # pylint: disable=protected-access

    args = parser.parse_args(argv)

    tfx_ir = pipeline_pb2.Pipeline()
    json_format.Parse(args.tfx_ir, tfx_ir)

    _resolve_runtime_parameters(tfx_ir, args.runtime_parameter)

    deployment_config = runner_utils.extract_local_deployment_config(tfx_ir)

    kubeflow_metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
    json_format.Parse(args.kubeflow_metadata_config, kubeflow_metadata_config)
    metadata_connection = metadata.Metadata(
        _get_metadata_connection_config(kubeflow_metadata_config))

    node_id = args.node_id
    # Attach the necessary labels to distinguish between different runners
    # and DSLs.
    # TODO(zhitaoli): Pass this from KFP runner side when the same container
    # entrypoint can be used by a different runner.
    with telemetry_utils.scoped_labels({
            telemetry_utils.LABEL_TFX_RUNNER: 'kfp',
    }):
        custom_executor_operators = {
            executable_spec_pb2.ContainerExecutableSpec:
            kubernetes_executor_operator.KubernetesExecutorOperator
        }

        executor_spec = runner_utils.extract_executor_spec(
            deployment_config, node_id)
        custom_driver_spec = runner_utils.extract_custom_driver_spec(
            deployment_config, node_id)

        pipeline_node = _get_pipeline_node(tfx_ir, node_id)
        component_launcher = launcher.Launcher(
            pipeline_node=pipeline_node,
            mlmd_connection=metadata_connection,
            pipeline_info=tfx_ir.pipeline_info,
            pipeline_runtime_spec=tfx_ir.runtime_spec,
            executor_spec=executor_spec,
            custom_driver_spec=custom_driver_spec,
            custom_executor_operators=custom_executor_operators)
        logging.info('Component %s is running.', node_id)
        execution_info = component_launcher.launch()
        logging.info('Component %s is finished.', node_id)

    # Dump the UI metadata.
    _dump_ui_metadata(pipeline_node, execution_info, args.metadata_ui_path)
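
The action='append' on --runtime_parameter means each repeated occurrence of the flag is appended to a list; a self-contained stdlib illustration (the 'name=value' strings are placeholders, since the real serialization format is not shown here):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--runtime_parameter', type=str, action='append')
args = parser.parse_args(
    ['--runtime_parameter', 'param_a=1', '--runtime_parameter', 'param_b=2'])
assert args.runtime_parameter == ['param_a=1', 'param_b=2']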
Example #5
    def run(
        self, pipeline: tfx_pipeline.Pipeline, run_name: Optional[str] = None
    ) -> None:
        """Runs given logical pipeline locally.

        Args:
          pipeline: Logical pipeline containing pipeline args and components.
          run_name: Optional name for the run.
        """
        for component in pipeline.components:
            if isinstance(component, base_component.BaseComponent):
                component._resolve_pip_dependencies(
                    pipeline.pipeline_info.pipeline_root
                )

        c = compiler.Compiler()
        pipeline = c.compile(pipeline)

        run_name = run_name or datetime.now().strftime("%d_%h_%y-%H_%M_%S_%f")
        # Substitute the runtime parameter with a concrete run_id.
        runtime_parameter_utils.substitute_runtime_parameter(
            pipeline,
            {
                PIPELINE_RUN_ID_PARAMETER_NAME: run_name,
            },
        )

        deployment_config = runner_utils.extract_local_deployment_config(
            pipeline
        )
        connection_config = deployment_config.metadata_connection_config  # type: ignore[attr-defined] # noqa

        logger.debug(f"Using deployment config:\n {deployment_config}")
        logger.debug(f"Using connection config:\n {connection_config}")

        # Run each component. Note that the compiled pipeline.nodes list is
        # in topological order.
        for node in pipeline.nodes:
            pipeline_node = node.pipeline_node
            node_id = pipeline_node.node_info.id
            executor_spec = runner_utils.extract_executor_spec(
                deployment_config, node_id
            )
            custom_driver_spec = runner_utils.extract_custom_driver_spec(
                deployment_config, node_id
            )

            component_launcher = launcher.Launcher(
                pipeline_node=pipeline_node,
                mlmd_connection=metadata.Metadata(connection_config),
                pipeline_info=pipeline.pipeline_info,
                pipeline_runtime_spec=pipeline.runtime_spec,
                executor_spec=executor_spec,
                custom_driver_spec=custom_driver_spec,
            )
            start = time.time()
            logger.info(f"Step `{node_id}` has started.")
            component_launcher.launch()
            end = time.time()
            logger.info(
                f"Step `{node_id}` has finished"
                f" in {format_timedelta_pretty(end - start)}."
            )
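
format_timedelta_pretty is not defined in this snippet; a minimal stand-in consistent with how it is called above (a float number of seconds in, a short human-readable string out):

def format_timedelta_pretty(seconds: float) -> str:
    # Hypothetical stand-in for the undefined helper: render an elapsed
    # duration, given in seconds, as a compact human-readable string.
    minutes, secs = divmod(seconds, 60)
    if minutes:
        return f"{int(minutes)}m {secs:.3f}s"
    return f"{secs:.3f}s"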
Example #6
def main():
    # Log to the container's stdout so the Kubeflow Pipelines UI can display
    # logs to the user.
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    logging.getLogger().setLevel(logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument('--pipeline_root', type=str, required=True)
    parser.add_argument('--kubeflow_metadata_config', type=str, required=True)
    parser.add_argument('--serialized_component', type=str, required=True)
    parser.add_argument('--tfx_ir', type=str, required=True)
    parser.add_argument('--node_id', type=str, required=True)
    launcher._register_execution = _register_execution  # pylint: disable=protected-access

    args = parser.parse_args()

    tfx_ir = pipeline_pb2.Pipeline()
    json_format.Parse(args.tfx_ir, tfx_ir)
    # Substitute the runtime parameter with a concrete run_id.
    runtime_parameter_utils.substitute_runtime_parameter(
        tfx_ir, {
            constants.PIPELINE_RUN_ID_PARAMETER_NAME:
            os.environ['WORKFLOW_ID'],
        })

    deployment_config = runner_utils.extract_local_deployment_config(tfx_ir)

    kubeflow_metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
    json_format.Parse(args.kubeflow_metadata_config, kubeflow_metadata_config)
    metadata_connection = kubeflow_metadata_adapter.KubeflowMetadataAdapter(
        _get_metadata_connection_config(kubeflow_metadata_config))

    node_id = args.node_id
    # Attach the necessary labels to distinguish between different runners
    # and DSLs.
    # TODO(zhitaoli): Pass this from KFP runner side when the same container
    # entrypoint can be used by a different runner.
    with telemetry_utils.scoped_labels({
            telemetry_utils.LABEL_TFX_RUNNER: 'kfp',
    }):
        custom_executor_operators = {
            executable_spec_pb2.ContainerExecutableSpec:
            kubernetes_executor_operator.KubernetesExecutorOperator
        }

        executor_spec = runner_utils.extract_executor_spec(
            deployment_config, node_id)
        custom_driver_spec = runner_utils.extract_custom_driver_spec(
            deployment_config, node_id)

        pipeline_node = _get_pipeline_node(tfx_ir, node_id)
        component_launcher = launcher.Launcher(
            pipeline_node=pipeline_node,
            mlmd_connection=metadata_connection,
            pipeline_info=tfx_ir.pipeline_info,
            pipeline_runtime_spec=tfx_ir.runtime_spec,
            executor_spec=executor_spec,
            custom_driver_spec=custom_driver_spec,
            custom_executor_operators=custom_executor_operators)
        logging.info('Component %s is running.', node_id)
        execution_info = component_launcher.launch()
        logging.info('Component %s is finished.', node_id)

    # Dump the UI metadata.
    _dump_ui_metadata(pipeline_node, execution_info)
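
Both container entrypoints receive the pipeline IR through --tfx_ir as a JSON-serialized pipeline_pb2.Pipeline; a round-trip sketch of that wire format, where compiled_pipeline is a hypothetical compiled IR:

from google.protobuf import json_format
from tfx.proto.orchestration import pipeline_pb2

# Serialize the IR the way a KFP runner would pass it to the container...
serialized = json_format.MessageToJson(compiled_pipeline)

# ...and parse it back exactly as main() does above.
round_tripped = pipeline_pb2.Pipeline()
json_format.Parse(serialized, round_tripped)
assert round_tripped == compiled_pipeline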
Example #7
  def run_with_ir(
      self,
      pipeline: pipeline_pb2.Pipeline,
      run_options: Optional[pipeline_pb2.RunOptions] = None,
  ) -> None:
    """Runs given pipeline locally.

    Args:
      pipeline: Pipeline IR containing pipeline args and components.
      run_options: Optional args for the run.

    Raises:
      ValueError: If run_options is provided, and both from_nodes and to_nodes
        in run_options.partial_run are empty.
    """
    # Substitute the runtime parameter with a concrete run_id.
    runtime_parameter_utils.substitute_runtime_parameter(
        pipeline, {
            constants.PIPELINE_RUN_ID_PARAMETER_NAME:
                datetime.datetime.now().isoformat(),
        })

    deployment_config = runner_utils.extract_local_deployment_config(pipeline)
    connection_config = getattr(
        deployment_config.metadata_connection_config,
        deployment_config.metadata_connection_config.WhichOneof(
            'connection_config'))

    logging.info('Using deployment config:\n %s', deployment_config)
    logging.info('Using connection config:\n %s', connection_config)

    if run_options:
      logging.info('Using run_options:\n %s', run_options)
      pr_opts = run_options.partial_run
      partial_run_utils.mark_pipeline(
          pipeline,
          from_nodes=pr_opts.from_nodes or None,
          to_nodes=pr_opts.to_nodes or None,
          snapshot_settings=pr_opts.snapshot_settings)

    with telemetry_utils.scoped_labels(
        {telemetry_utils.LABEL_TFX_RUNNER: 'local'}):
      # Run each component. Note that the compiled pipeline.nodes list is in
      # topological order.
      #
      # TODO(b/171319478): After IR-based execution is used, use multi-threaded
      # execution so that independent components can be run in parallel.
      for node in pipeline.nodes:
        pipeline_node = node.pipeline_node
        node_id = pipeline_node.node_info.id
        if pipeline_node.execution_options.HasField('skip'):
          logging.info('Skipping component %s.', node_id)
          continue
        executor_spec = runner_utils.extract_executor_spec(
            deployment_config, node_id)
        custom_driver_spec = runner_utils.extract_custom_driver_spec(
            deployment_config, node_id)

        component_launcher = launcher.Launcher(
            pipeline_node=pipeline_node,
            mlmd_connection=metadata.Metadata(connection_config),
            pipeline_info=pipeline.pipeline_info,
            pipeline_runtime_spec=pipeline.runtime_spec,
            executor_spec=executor_spec,
            custom_driver_spec=custom_driver_spec)
        logging.info('Component %s is running.', node_id)
        if pipeline_node.execution_options.run.perform_snapshot:
          with metadata.Metadata(connection_config) as mlmd_handle:
            partial_run_utils.snapshot(mlmd_handle, pipeline)
        component_launcher.launch()
        logging.info('Component %s is finished.', node_id)
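
A sketch of driving a partial run through run_with_ir, using only the fields the method reads above (partial_run.from_nodes, to_nodes, snapshot_settings); runner, compiled_pipeline, and the node IDs are hypothetical:

from tfx.proto.orchestration import pipeline_pb2

# Re-run everything from Trainer through Pusher; nodes outside that range are
# marked to be skipped by partial_run_utils.mark_pipeline.
run_options = pipeline_pb2.RunOptions(
    partial_run=pipeline_pb2.PartialRun(
        from_nodes=['Trainer'],
        to_nodes=['Pusher'],
    ))
runner.run_with_ir(compiled_pipeline, run_options=run_options)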