def run(self, pipeline: tfx_pipeline.Pipeline) -> None:
  """Deploys given logical pipeline on Kubernetes.

  When `kube_utils.is_inside_cluster()` is false, the whole pipeline is handed
  to `kubernetes_remote_runner.run_as_kubernetes_job` and this method returns.
  Otherwise every component in `pipeline.components` is launched sequentially
  through `launch_container_component`, after checking that all of its
  upstream nodes have already been launched by this loop.

  Args:
    pipeline: Logical pipeline containing pipeline args and components.

  Raises:
    AssertionError: If a component appears in `pipeline.components` before one
      of its upstream nodes.
    ValueError: If neither `KubernetesComponentLauncher` nor
      `InProcessComponentLauncher` reports that it can launch a component.
  """
  # Only assign a run id when the pipeline does not already carry one.
  if not pipeline.pipeline_info.run_id:
    pipeline.pipeline_info.run_id = datetime.datetime.now().isoformat()

  if not kube_utils.is_inside_cluster():
    kubernetes_remote_runner.run_as_kubernetes_job(
        pipeline=pipeline, tfx_image=self._config.tfx_image)
    return

  # TODO(ericlege): Support running components in parallel.
  ran_components = set()

  # Runs component in topological order.
  for component in pipeline.components:
    # Verify that components are in topological order. This is an explicit
    # raise rather than an `assert` statement because `ran_components` is used
    # for nothing else: with `python -O` an `assert` would be stripped and an
    # out-of-order pipeline would be launched without any error.
    for upstream_node in getattr(component, 'upstream_nodes', None) or ():
      if upstream_node not in ran_components:
        raise AssertionError('Components is not in topological order')

    (component_launcher_class,
     component_config) = config_utils.find_component_launch_info(
         self._config, component)

    # Check if the component is launchable as a container component.
    if kubernetes_component_launcher.KubernetesComponentLauncher.can_launch(
        component.executor_spec, component_config):
      launch_container_component(component, component_launcher_class,
                                 component_config, pipeline)
    # Otherwise, the component should be launchable with the in process
    # component launcher. Wrap the component to a container component.
    elif in_process_component_launcher.InProcessComponentLauncher.can_launch(
        component.executor_spec, component_config):
      wrapped_component = self._wrap_container_component(
          component=component,
          component_launcher_class=component_launcher_class,
          component_config=component_config,
          pipeline=pipeline)

      # Component launch info is updated by wrapping the component into a
      # container component. Therefore, these properties need to be reloaded.
      (wrapped_component_launcher_class,
       wrapped_component_config) = config_utils.find_component_launch_info(
           self._config, wrapped_component)

      launch_container_component(wrapped_component,
                                 wrapped_component_launcher_class,
                                 wrapped_component_config, pipeline)
    else:
      raise ValueError('Can not find suitable launcher for component.')

    ran_components.add(component)
def testFindComponentLaunchInfoFailWithNoLauncherClassFound(self):
  """Expects RuntimeError when only DockerComponentLauncher is supported."""
  fake_component = test_utils._FakeComponent(
      name='FakeComponent',
      input_channel=channel_utils.as_channel([test_utils._InputArtifact()]))
  docker_only_config = pipeline_config.PipelineConfig(
      supported_launcher_classes=[
          docker_component_launcher.DockerComponentLauncher
      ])

  # DockerComponentLauncher cannot launch class executor.
  with self.assertRaises(RuntimeError):
    config_utils.find_component_launch_info(docker_only_config, fake_component)
def testFindComponentLaunchInfoReturnConfigOverride(self):
  """Expects the per-component override config to win over the default one."""
  container_spec = executor_spec.ExecutorContainerSpec(
      image='gcr://test', args=['{{input_dict["input"][0].uri}}'])
  fake_component = test_utils._FakeComponent(
      name='FakeComponent',
      input_channel=channel_utils.as_channel([test_utils._InputArtifact()]),
      custom_executor_spec=container_spec)

  # Two Docker configs: an unnamed default and a named override keyed by the
  # component under test.
  fallback_config = docker_component_config.DockerComponentConfig()
  expected_config = docker_component_config.DockerComponentConfig(name='test')
  config_under_test = pipeline_config.PipelineConfig(
      supported_launcher_classes=[
          docker_component_launcher.DockerComponentLauncher
      ],
      default_component_configs=[fallback_config],
      component_config_overrides={
          '_FakeComponent.FakeComponent': expected_config
      })

  found_launcher, found_config = config_utils.find_component_launch_info(
      config_under_test, fake_component)

  self.assertEqual(docker_component_launcher.DockerComponentLauncher,
                   found_launcher)
  self.assertEqual(expected_config, found_config)
def run(self, tfx_pipeline: pipeline.Pipeline) -> None:
  """Schedules every component of the pipeline as a Beam ParDo step.

  Args:
    tfx_pipeline: Logical pipeline whose components are scheduled in the
      order they appear in `tfx_pipeline.components`.
  """
  with beam.Pipeline(argv=self._beam_orchestrator_args) as beam_pipeline:
    root = beam_pipeline | 'CreateRoot' >> beam.Create([None])

    # Maps each already-scheduled component to the output of its ParDo step.
    scheduled = {}
    for component in tfx_pipeline.components:
      component_id = component.id

      # Collect the outputs of all upstream steps; each must exist already.
      upstream_signals = []
      for upstream_node in component.upstream_nodes or ():
        assert upstream_node in scheduled, (
            'Components is not in topological order')
        upstream_signals.append(scheduled[upstream_node])
      logger.debug('Component %s depends on %s.', component_id,
                   [signal.producer.full_label for signal in upstream_signals])

      launcher_class, launcher_config = (
          config_utils.find_component_launch_info(self._config, component))

      component_do_fn = _ComponentAsDoFn(component, launcher_class,
                                         launcher_config, tfx_pipeline)
      upstream_views = [
          beam.pvalue.AsIter(signal) for signal in upstream_signals
      ]
      step_label = 'Run[%s]' % component_id
      scheduled[component] = (
          root | step_label >> beam.ParDo(component_do_fn, *upstream_views))
      logger.debug('Component %s is scheduled.', component_id)
def run(self, tfx_pipeline: pipeline.Pipeline) -> None:
  """Deploys given logical pipeline on Beam.

  Does nothing when the `TFX_JSON_EXPORT_PIPELINE_ARGS_PATH` environment
  variable is set. Otherwise stamps a fresh run id on the pipeline and
  schedules each component as a Beam ParDo step under the 'beam' telemetry
  runner label.

  Args:
    tfx_pipeline: Logical pipeline containing pipeline args and components.
  """
  # The CLI sets this variable while creating or updating a pipeline, where
  # only pipeline_args are extracted; the pipeline must not be executed then.
  if 'TFX_JSON_EXPORT_PIPELINE_ARGS_PATH' in os.environ:
    return

  pipeline_info = tfx_pipeline.pipeline_info
  pipeline_info.run_id = datetime.datetime.now().isoformat()

  runner_labels = {telemetry_utils.LABEL_TFX_RUNNER: 'beam'}
  with telemetry_utils.scoped_labels(runner_labels):
    with beam.Pipeline(argv=self._beam_orchestrator_args) as beam_pipeline:
      # Root PCollection that triggers every component DoFn.
      root = beam_pipeline | 'CreateRoot' >> beam.Create([None])

      # Component -> signal produced by that component's step.
      scheduled = {}

      # pipeline.components are in topological order.
      for component in tfx_pipeline.components:
        # TODO(b/187122662): Pass through pip dependencies as a first-class
        # component flag.
        if isinstance(component, base_component.BaseComponent):
          component._resolve_pip_dependencies(  # pylint: disable=protected-access
              tfx_pipeline.pipeline_info.pipeline_root)

        component_id = component.id

        # Signals from upstream components.
        upstream_signals = []
        for upstream_node in component.upstream_nodes or ():
          assert upstream_node in scheduled, (
              'Components is not in topological order')
          upstream_signals.append(scheduled[upstream_node])
        absl.logging.info(
            'Component %s depends on %s.', component_id,
            [signal.producer.full_label for signal in upstream_signals])

        launcher_class, launcher_config = (
            config_utils.find_component_launch_info(self._config, component))

        # Each signal is an empty PCollection. AsIter ensures component will
        # be triggered after upstream components are finished.
        component_do_fn = _ComponentAsDoFn(component, launcher_class,
                                           launcher_config, tfx_pipeline)
        upstream_views = [
            beam.pvalue.AsIter(signal) for signal in upstream_signals
        ]
        step_label = 'Run[%s]' % component_id
        scheduled[component] = (
            root | step_label >> beam.ParDo(component_do_fn, *upstream_views))
        absl.logging.info('Component %s is scheduled.', component_id)
def run(self, tfx_pipeline: pipeline.Pipeline):
  """Deploys given logical pipeline on Airflow.

  Creates one `AirflowComponent` per TFX component inside a new DAG and wires
  each one to the Airflow components of its upstream nodes.

  Args:
    tfx_pipeline: Logical pipeline containing pipeline args and components.

  Returns:
    An Airflow DAG.
  """
  pipeline_info = tfx_pipeline.pipeline_info

  # Merge airflow-specific configs with pipeline args
  airflow_config = typing.cast(AirflowPipelineConfig, self._config)
  airflow_dag = models.DAG(
      dag_id=pipeline_info.pipeline_name, **airflow_config.airflow_dag_config)

  # Default the temp directory to `<pipeline_root>/.temp/` when none is given.
  extra_args = tfx_pipeline.additional_pipeline_args
  if 'tmp_dir' not in extra_args:
    extra_args['tmp_dir'] = os.path.join(pipeline_info.pipeline_root, '.temp',
                                         '')

  # TFX component -> the AirflowComponent created for it.
  airflow_nodes = {}
  for tfx_component in tfx_pipeline.components:
    # TODO(b/187122662): Pass through pip dependencies as a first-class
    # component flag.
    if isinstance(tfx_component, base_component.BaseComponent):
      tfx_component._resolve_pip_dependencies(  # pylint: disable=protected-access
          tfx_pipeline.pipeline_info.pipeline_root)

    tfx_component = self._replace_runtime_params(tfx_component)

    launcher_class, launcher_config = config_utils.find_component_launch_info(
        self._config, tfx_component)
    airflow_node = airflow_component.AirflowComponent(
        parent_dag=airflow_dag,
        component=tfx_component,
        component_launcher_class=launcher_class,
        pipeline_info=tfx_pipeline.pipeline_info,
        enable_cache=tfx_pipeline.enable_cache,
        metadata_connection_config=tfx_pipeline.metadata_connection_config,
        beam_pipeline_args=tfx_pipeline.beam_pipeline_args,
        additional_pipeline_args=tfx_pipeline.additional_pipeline_args,
        component_config=launcher_config)
    airflow_nodes[tfx_component] = airflow_node

    for upstream_node in tfx_component.upstream_nodes:
      assert upstream_node in airflow_nodes, (
          'Components is not in topological order')
      airflow_node.set_upstream(airflow_nodes[upstream_node])

  return airflow_dag
def testFindComponentLaunchInfoReturnDefaultLaunchInfo(self):
  """Expects InProcessComponentLauncher and no config for a default config."""
  fake_component = test_utils._FakeComponent(
      name='FakeComponent',
      input_channel=channel_utils.as_channel(
          [types.Artifact(type_name='InputPath')]))

  # A PipelineConfig built with no arguments.
  found_launcher, found_config = config_utils.find_component_launch_info(
      pipeline_config.PipelineConfig(), fake_component)

  self.assertEqual(in_process_component_launcher.InProcessComponentLauncher,
                   found_launcher)
  self.assertIsNone(found_config)
def run(self, tfx_pipeline: pipeline.Pipeline) -> None:
  """Deploys given logical pipeline on Beam.

  Does nothing when the `TFX_JSON_EXPORT_PIPELINE_ARGS_PATH` environment
  variable is set. Otherwise stamps a fresh run id on the pipeline and
  schedules each component as a Beam ParDo step.

  Args:
    tfx_pipeline: Logical pipeline containing pipeline args and components.
  """
  # The CLI sets this variable while creating or updating a pipeline, where
  # only pipeline_args are extracted; the pipeline must not be deployed then.
  if 'TFX_JSON_EXPORT_PIPELINE_ARGS_PATH' in os.environ:
    return

  pipeline_info = tfx_pipeline.pipeline_info
  pipeline_info.run_id = datetime.datetime.now().isoformat()

  with beam.Pipeline(argv=self._beam_orchestrator_args) as beam_pipeline:
    # Root PCollection that triggers every component DoFn.
    root = beam_pipeline | 'CreateRoot' >> beam.Create([None])

    # Component -> signal produced by that component's step.
    scheduled = {}

    # pipeline.components are in topological order.
    for component in tfx_pipeline.components:
      component_id = component.id

      # Signals from upstream components.
      upstream_signals = []
      for upstream_node in component.upstream_nodes or ():
        assert upstream_node in scheduled, (
            'Components is not in topological order')
        upstream_signals.append(scheduled[upstream_node])
      absl.logging.info(
          'Component %s depends on %s.', component_id,
          [signal.producer.full_label for signal in upstream_signals])

      launcher_class, launcher_config = (
          config_utils.find_component_launch_info(self._config, component))

      # Each signal is an empty PCollection. AsIter ensures component will be
      # triggered after upstream components are finished.
      component_do_fn = _ComponentAsDoFn(component, launcher_class,
                                         launcher_config, tfx_pipeline)
      upstream_views = [
          beam.pvalue.AsIter(signal) for signal in upstream_signals
      ]
      step_label = 'Run[%s]' % component_id
      scheduled[component] = (
          root | step_label >> beam.ParDo(component_do_fn, *upstream_views))
      absl.logging.info('Component %s is scheduled.', component_id)
def run(self, tfx_pipeline: pipeline.Pipeline):
  """Deploys given logical pipeline on Airflow.

  Creates one `AirflowComponent` per TFX component inside a new DAG and wires
  each one to the Airflow components of its upstream nodes.

  Args:
    tfx_pipeline: Logical pipeline containing pipeline args and components.

  Returns:
    An Airflow DAG.
  """
  pipeline_info = tfx_pipeline.pipeline_info

  # Merge airflow-specific configs with pipeline args
  airflow_dag = models.DAG(
      dag_id=pipeline_info.pipeline_name, **self._config.airflow_dag_config)

  # Default the temp directory to `<pipeline_root>/.temp/` when none is given.
  extra_args = tfx_pipeline.additional_pipeline_args
  if 'tmp_dir' not in extra_args:
    extra_args['tmp_dir'] = os.path.join(pipeline_info.pipeline_root, '.temp',
                                         '')

  # TFX component -> the AirflowComponent created for it.
  airflow_nodes = {}
  for tfx_component in tfx_pipeline.components:
    tfx_component = self._replace_runtime_params(tfx_component)

    launcher_class, launcher_config = config_utils.find_component_launch_info(
        self._config, tfx_component)
    airflow_node = airflow_component.AirflowComponent(
        airflow_dag,
        component=tfx_component,
        component_launcher_class=launcher_class,
        pipeline_info=tfx_pipeline.pipeline_info,
        enable_cache=tfx_pipeline.enable_cache,
        metadata_connection_config=tfx_pipeline.metadata_connection_config,
        beam_pipeline_args=tfx_pipeline.beam_pipeline_args,
        additional_pipeline_args=tfx_pipeline.additional_pipeline_args,
        component_config=launcher_config)
    airflow_nodes[tfx_component] = airflow_node

    for upstream_node in tfx_component.upstream_nodes:
      assert upstream_node in airflow_nodes, (
          'Components is not in topological order')
      airflow_node.set_upstream(airflow_nodes[upstream_node])

  return airflow_dag
def _construct_pipeline_graph(self, pipeline: tfx_pipeline.Pipeline,
                              pipeline_root: dsl.PipelineParam):
  """Constructs a Kubeflow Pipeline graph.

  Wraps every TFX component in a Kubeflow `BaseComponent`, applies the
  configured operator functions and pod labels to its container op, and
  records that op so downstream components can depend on it.

  Args:
    pipeline: The logical TFX pipeline to base the construction on.
    pipeline_root: dsl.PipelineParam representing the pipeline root.
  """
  container_ops = {}

  # Assumption: There is a partial ordering of components in the list, i.e.,
  # if component A depends on component B and C, then A appears after B and C
  # in the list.
  for component in pipeline.components:
    # The set of upstream dsl.ContainerOps for this component.
    upstream_ops = {
        container_ops[upstream] for upstream in component.upstream_nodes
    }

    launcher_class, launcher_config = config_utils.find_component_launch_info(
        self._config, component)

    kfp_component = base_component.BaseComponent(
        component=component,
        component_launcher_class=launcher_class,
        depends_on=upstream_ops,
        pipeline=pipeline,
        pipeline_name=pipeline.pipeline_info.pipeline_name,
        pipeline_root=pipeline_root,
        tfx_image=self._config.tfx_image,
        kubeflow_metadata_config=self._config.kubeflow_metadata_config,
        component_config=launcher_config)

    for operator_func in self._config.pipeline_operator_funcs:
      kfp_component.container_op.apply(operator_func)

    kfp_component.container_op.add_pod_label(SDK_ENV_LABEL, self._sdk_env)
    assert self._pipeline_id, 'Failed to generate pipeline ID.'
    kfp_component.container_op.add_pod_label(PIPELINE_UUID_LABEL,
                                             self._pipeline_id)

    container_ops[component] = kfp_component.container_op
def run(self, tfx_pipeline: pipeline.Pipeline) -> None:
  """Runs given logical pipeline locally.

  Does nothing when the `TFX_JSON_EXPORT_PIPELINE_ARGS_PATH` environment
  variable is set. Otherwise stamps a fresh run id on the pipeline and
  launches its components one after another under the 'local' telemetry
  runner label.

  Args:
    tfx_pipeline: Logical pipeline containing pipeline args and components.
  """
  # The CLI sets this variable while creating or updating a pipeline, where
  # only pipeline_args are extracted; the pipeline must not be executed then.
  if 'TFX_JSON_EXPORT_PIPELINE_ARGS_PATH' in os.environ:
    return

  pipeline_info = tfx_pipeline.pipeline_info
  pipeline_info.run_id = datetime.datetime.now().isoformat()

  runner_labels = {telemetry_utils.LABEL_TFX_RUNNER: 'local'}
  with telemetry_utils.scoped_labels(runner_labels):
    # Run each component. Note that the pipeline.components list is in
    # topological order.
    #
    # TODO(b/171319478): After IR-based execution is used, used multi-threaded
    # execution so that independent components can be run in parallel.
    for component in tfx_pipeline.components:
      # TODO(b/187122662): Pass through pip dependencies as a first-class
      # component flag.
      if isinstance(component, base_component.BaseComponent):
        component._resolve_pip_dependencies(  # pylint: disable=protected-access
            tfx_pipeline.pipeline_info.pipeline_root)

      launcher_class, launcher_config = (
          config_utils.find_component_launch_info(self._config, component))

      # A new DriverArgs and Metadata object is built for every component.
      node_launcher = launcher_class.create(
          component=component,
          pipeline_info=tfx_pipeline.pipeline_info,
          driver_args=data_types.DriverArgs(
              enable_cache=tfx_pipeline.enable_cache),
          metadata_connection=metadata.Metadata(
              tfx_pipeline.metadata_connection_config),
          beam_pipeline_args=tfx_pipeline.beam_pipeline_args,
          additional_pipeline_args=tfx_pipeline.additional_pipeline_args,
          component_config=launcher_config)

      logging.info('Component %s is running.', component.id)
      node_launcher.launch()
      logging.info('Component %s is finished.', component.id)
def run(self, tfx_pipeline: pipeline.Pipeline) -> None:
  """Launches every component of the pipeline one after another.

  Args:
    tfx_pipeline: Logical pipeline whose components are launched in the order
      they appear in `tfx_pipeline.components`.
  """
  for component in tfx_pipeline.components:
    launcher_class, launcher_config = config_utils.find_component_launch_info(
        self._config, component)

    # A new DriverArgs and Metadata object is built for every component.
    launcher = launcher_class.create(
        component=component,
        pipeline_info=tfx_pipeline.pipeline_info,
        driver_args=data_types.DriverArgs(
            enable_cache=tfx_pipeline.enable_cache),
        metadata_connection=metadata.Metadata(
            tfx_pipeline.metadata_connection_config),
        beam_pipeline_args=tfx_pipeline.beam_pipeline_args,
        additional_pipeline_args=tfx_pipeline.additional_pipeline_args,
        component_config=launcher_config)

    logger.info('Component %s is running.', component.id)
    launcher.launch()
    logger.info('Component %s is finished.', component.id)