def testPipeline(self):
  self._copyTemplate()

  # Uncomment all variables in config.
  self._uncommentMultiLineVariables(
      os.path.join('pipeline', 'configs.py'), [
          'GOOGLE_CLOUD_REGION',
          'BIG_QUERY_WITH_DIRECT_RUNNER_BEAM_PIPELINE_ARGS',
          'BIG_QUERY_QUERY',
          'DATAFLOW_BEAM_PIPELINE_ARGS',
          'GCP_AI_PLATFORM_TRAINING_ARGS',
          'GCP_AI_PLATFORM_SERVING_ARGS',
      ])

  # Prepare data.
  self._prepare_data()
  self._replaceFileContent('kubeflow_v2_dag_runner.py', [
      ('_DATA_PATH = \'gs://{}/tfx-template/data/\'.'
       'format(configs.GCS_BUCKET_NAME)',
       '_DATA_PATH = \'gs://{{}}/{}/{}\'.format(configs.GCS_BUCKET_NAME)'
       .format(self._DATA_DIRECTORY_NAME, self._pipeline_name)),
  ])

  # Create a pipeline with only one component.
  self._create_pipeline()

  # Extract the compiled pipeline spec.
  kubeflow_v2_pb = pipeline_spec_pb2.PipelineJob()
  io_utils.parse_json_file(
      file_name=os.path.join(os.getcwd(), 'pipeline.json'),
      message=kubeflow_v2_pb)

  # There should be one step in the compiled pipeline.
  self.assertLen(kubeflow_v2_pb.pipeline_spec['tasks'], 1)
def _create_pipeline_job(
    self,
    pipeline_spec: pipeline_spec_pb2.PipelineSpec,
    pipeline_root: str,
    pipeline_parameters: Optional[Mapping[str, Any]] = None,
) -> pipeline_spec_pb2.PipelineJob:
  """Creates the pipeline job spec object.

  Args:
    pipeline_spec: The pipeline spec object.
    pipeline_root: The root of the pipeline outputs.
    pipeline_parameters: The mapping from parameter names to values. Optional.

  Returns:
    A PipelineJob proto representing the compiled pipeline.
  """
  runtime_config = compiler_utils.build_runtime_config_spec(
      pipeline_root=pipeline_root, pipeline_parameters=pipeline_parameters)
  pipeline_job = pipeline_spec_pb2.PipelineJob(runtime_config=runtime_config)
  pipeline_job.pipeline_spec.update(json_format.MessageToDict(pipeline_spec))
  return pipeline_job
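
# A minimal usage sketch for the helper above, assuming it lives on a compiler
# test fixture or similar object (`owner` below). The empty spec and the
# bucket path are placeholders for illustration only.
def _example_create_pipeline_job_usage(owner):
  pipeline_spec = pipeline_spec_pb2.PipelineSpec()  # Normally built elsewhere.
  pipeline_job = owner._create_pipeline_job(
      pipeline_spec=pipeline_spec,
      pipeline_root='gs://my-bucket/pipeline-root')  # Hypothetical path.
  # The compiled spec is carried on the PipelineJob as a Struct field.
  return json_format.MessageToDict(pipeline_job)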
def _mock_subprocess_call(cmd: Sequence[Optional[Text]],
                          env: Mapping[Text, Text]) -> int:
  """Mocks the subprocess call."""
  assert len(cmd) == 2, 'Unexpected number of commands: {}'.format(cmd)
  del env
  dsl_path = cmd[1]

  if dsl_path.endswith('test_pipeline_bad.py'):
    sys.exit(1)
  if not dsl_path.endswith('test_pipeline_1.py') and not dsl_path.endswith(
      'test_pipeline_2.py'):
    raise ValueError('Unexpected dsl path: {}'.format(dsl_path))

  spec_pb = pipeline_pb2.PipelineSpec(
      pipeline_info=pipeline_pb2.PipelineInfo(name='chicago_taxi_kubeflow'))
  runtime_pb = pipeline_pb2.PipelineJob.RuntimeConfig(
      gcs_output_directory=os.path.join(os.environ['HOME'], 'tfx', 'pipelines',
                                        'chicago_taxi_kubeflow'))
  job_pb = pipeline_pb2.PipelineJob(runtime_config=runtime_pb)
  job_pb.pipeline_spec.update(json_format.MessageToDict(spec_pb))
  io_utils.write_string_file(
      file_name='pipeline.json',
      string_value=json_format.MessageToJson(message=job_pb, sort_keys=True))
  return 0
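
# A hedged sketch of exercising the mock above the way the CLI handler would
# invoke a real subprocess (python <dsl_path>). The DSL path is a placeholder;
# only its basename matters to the mock, which writes a compiled
# 'pipeline.json' into the current directory and returns 0.
def _example_mock_subprocess_call_usage():
  returncode = _mock_subprocess_call(
      cmd=[sys.executable, '/tmp/test_pipeline_1.py'], env=os.environ.copy())
  assert returncode == 0
  assert os.path.exists('pipeline.json')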
def run(self,
        pipeline: tfx_pipeline.Pipeline,
        parameter_values: Optional[Dict[Text, Any]] = None,
        write_out: Optional[bool] = True) -> Dict[Text, Any]:
  """Compiles a pipeline DSL object into a pipeline file.

  Args:
    pipeline: TFX pipeline object.
    parameter_values: Mapping from runtime parameter names to their values.
    write_out: Set to True to actually write out the file to the place
      designated by output_dir and output_filename. Otherwise only the
      JSON-serialized pipeline job spec is returned.

  Returns:
    Returns the JSON pipeline job spec.

  Raises:
    RuntimeError: if trying to write out to a place occupied by an existing
      file.
  """
  # TODO(b/166343606): Support user-provided labels.
  # TODO(b/169095387): Deprecate .run() method in favor of the unified API
  # client.
  display_name = (
      self._config.display_name or pipeline.pipeline_info.pipeline_name)
  pipeline_spec = pipeline_builder.PipelineBuilder(
      tfx_pipeline=pipeline,
      default_image=self._config.default_image,
      default_commands=self._config.default_commands).build()
  pipeline_spec.sdk_version = 'tfx-{}'.format(version.__version__)
  pipeline_spec.schema_version = _SCHEMA_VERSION
  runtime_config = pipeline_builder.RuntimeConfigBuilder(
      pipeline_info=pipeline.pipeline_info,
      parameter_values=parameter_values).build()
  with telemetry_utils.scoped_labels(
      {telemetry_utils.LABEL_TFX_RUNNER: 'kubeflow_v2'}):
    result = pipeline_spec_pb2.PipelineJob(
        display_name=display_name or pipeline.pipeline_info.pipeline_name,
        labels=telemetry_utils.get_labels_dict(),
        runtime_config=runtime_config)
  result.pipeline_spec.update(json_format.MessageToDict(pipeline_spec))
  pipeline_json_dict = json_format.MessageToDict(result)

  if write_out:
    if fileio.exists(self._output_dir) and not fileio.isdir(self._output_dir):
      raise RuntimeError('Output path: %s points to an existing file.' %
                         self._output_dir)
    if not fileio.exists(self._output_dir):
      fileio.makedirs(self._output_dir)

    with fileio.open(
        os.path.join(self._output_dir, self._output_filename), 'wb') as f:
      f.write(json.dumps(pipeline_json_dict, sort_keys=True))

  return pipeline_json_dict
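
# A hedged usage sketch for the run() method above, assuming it belongs to
# TFX's KubeflowV2DagRunner. The config fields mirror the attributes the
# method reads (display_name, default_image); the output location and image
# name are placeholders, not values from the source.
def _example_kubeflow_v2_dag_runner_usage(tfx_pipeline_obj):
  runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner(
      config=kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig(
          display_name='my-pipeline',  # Hypothetical display name.
          default_image='gcr.io/my-project/my-tfx-image'),  # Hypothetical.
      output_dir='/tmp/pipeline_output',
      output_filename='pipeline.json')
  # Compiles the pipeline without triggering execution; the JSON job spec is
  # written under output_dir and also returned as a dict.
  return runner.run(tfx_pipeline_obj, write_out=True)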
def _extract_pipeline_args(self) -> Dict[Text, Any]:
  """Gets pipeline args from the DSL by compiling the pipeline.

  Returns:
    Python dictionary with pipeline details extracted from the DSL.

  Raises:
    RuntimeError: when the given pipeline arg file location is occupied.
  """
  pipeline_dsl_path = self.flags_dict[labels.PIPELINE_DSL_PATH]
  if os.path.isdir(pipeline_dsl_path):
    sys.exit('Provide a valid dsl file path.')

  # Create an environment for the subprocess.
  temp_env = os.environ.copy()

  # We don't need the image name and project ID for extracting pipeline info,
  # so they can be optional.
  runner_env = {
      kubeflow_labels.TFX_IMAGE_ENV:
          self.flags_dict.get(kubeflow_labels.TFX_IMAGE_ENV, ''),
      kubeflow_labels.GCP_PROJECT_ID_ENV:
          self.flags_dict.get(kubeflow_labels.GCP_PROJECT_ID_ENV, ''),
  }

  temp_env.update(runner_env)

  # Run the pipeline DSL. Because RUN_FLAG_ENV is not set here, actual
  # execution will not be triggered; instead the DSL outputs a compiled
  # pipeline spec.
  self._subprocess_call(
      command=[sys.executable, pipeline_dsl_path], env=temp_env)

  # Only import pipeline_spec_pb2 when needed to guard the CLI dependency.
  from kfp.pipeline_spec import pipeline_spec_pb2  # pylint: disable=g-import-not-at-top

  # Extract the needed information from the compiled pipeline spec.
  job_message = pipeline_spec_pb2.PipelineJob()
  io_utils.parse_json_file(
      file_name=os.path.join(os.getcwd(), _PIPELINE_SPEC_FILE),
      message=job_message)

  pipeline_spec_pb = json_format.ParseDict(job_message.pipeline_spec,
                                           pipeline_spec_pb2.PipelineSpec())
  pipeline_name = pipeline_spec_pb.pipeline_info.name
  pipeline_args = {
      'pipeline_name': pipeline_name,
      'pipeline_root': job_message.runtime_config.gcs_output_directory
  }

  return pipeline_args
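
# A hedged sketch of driving the extraction step above from a handler
# instance. The handler object and DSL path are assumptions for illustration;
# the shape of the returned dict follows directly from the code above.
def _example_extract_pipeline_args(handler):
  handler.flags_dict[labels.PIPELINE_DSL_PATH] = 'my_pipeline_dsl.py'  # Hypothetical.
  pipeline_args = handler._extract_pipeline_args()
  # Expected keys: 'pipeline_name' and 'pipeline_root'.
  return pipeline_args['pipeline_name'], pipeline_args['pipeline_root']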
def _create_pipeline_v2(
    self,
    pipeline_func: Callable[..., Any],
    pipeline_root: Optional[str] = None,
    pipeline_name: Optional[str] = None,
    pipeline_parameters_override: Optional[Mapping[str, Any]] = None,
) -> pipeline_spec_pb2.PipelineJob:
  """Creates a pipeline instance and constructs the pipeline spec from it.

  Args:
    pipeline_func: Pipeline function with @dsl.pipeline decorator.
    pipeline_root: The root of the pipeline outputs. Optional.
    pipeline_name: The name of the pipeline. Optional.
    pipeline_parameters_override: The mapping from parameter names to values.
      Optional.

  Returns:
    A PipelineJob proto representing the compiled pipeline.
  """

  # Create the arg list with no default values and call pipeline function.
  # Assign type information to the PipelineParam.
  pipeline_meta = _python_op._extract_component_interface(pipeline_func)
  pipeline_name = pipeline_name or pipeline_meta.name

  pipeline_root = pipeline_root or getattr(pipeline_func, 'output_directory',
                                           None)
  if not pipeline_root:
    warnings.warn('pipeline_root is None or empty. A valid pipeline_root '
                  'must be provided at job submission.')

  args_list = []
  signature = inspect.signature(pipeline_func)
  for arg_name in signature.parameters:
    arg_type = None
    for pipeline_input in pipeline_meta.inputs or []:
      if arg_name == pipeline_input.name:
        arg_type = pipeline_input.type
        break
    args_list.append(
        dsl.PipelineParam(
            sanitize_k8s_name(arg_name, True), param_type=arg_type))

  with dsl.Pipeline(pipeline_name) as dsl_pipeline:
    pipeline_func(*args_list)

  self._sanitize_and_inject_artifact(dsl_pipeline)

  # Fill in the default values.
  args_list_with_defaults = []
  if pipeline_meta.inputs:
    args_list_with_defaults = [
        dsl.PipelineParam(
            sanitize_k8s_name(input_spec.name, True),
            param_type=input_spec.type,
            value=input_spec.default) for input_spec in pipeline_meta.inputs
    ]

  # Making the pipeline group name unique to prevent name clashes with
  # templates.
  pipeline_group = dsl_pipeline.groups[0]
  temp_pipeline_group_name = uuid.uuid4().hex
  pipeline_group.name = temp_pipeline_group_name

  pipeline_spec = self._create_pipeline_spec(
      args_list_with_defaults,
      dsl_pipeline,
  )

  pipeline_parameters = {
      param.name: param for param in args_list_with_defaults
  }
  # Update pipeline parameters override if there were any.
  pipeline_parameters_override = pipeline_parameters_override or {}
  for k, v in pipeline_parameters_override.items():
    if k not in pipeline_parameters:
      raise ValueError('Pipeline parameter {} does not match any known '
                       'pipeline argument.'.format(k))
    pipeline_parameters[k].value = v

  runtime_config = compiler_utils.build_runtime_config_spec(
      output_directory=pipeline_root, pipeline_parameters=pipeline_parameters)
  pipeline_job = pipeline_spec_pb2.PipelineJob(runtime_config=runtime_config)
  pipeline_job.pipeline_spec.update(json_format.MessageToDict(pipeline_spec))

  return pipeline_job
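
# A hedged sketch of exercising the helper above with a trivial pipeline
# function. The pipeline name and root are placeholders, and `compiler_obj` is
# assumed to be an instance of the compiler class this method belongs to; a
# real pipeline would instantiate components inside the function body.
def _example_create_pipeline_v2_usage(compiler_obj):

  @dsl.pipeline(name='hello-pipeline', description='A trivial pipeline.')
  def hello_pipeline(text: str = 'hi'):
    # Components would normally be created here.
    pass

  pipeline_job = compiler_obj._create_pipeline_v2(
      pipeline_func=hello_pipeline,
      pipeline_root='gs://my-bucket/pipeline-root')  # Hypothetical bucket.
  return json_format.MessageToJson(pipeline_job)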
def _create_pipeline(
    self,
    pipeline_func: Callable[..., Any],
    output_directory: str,
    pipeline_name: Optional[str] = None,
    pipeline_parameters_override: Optional[Mapping[str, Any]] = None,
) -> pipeline_spec_pb2.PipelineJob:
  """Creates a pipeline instance and constructs the pipeline spec from it.

  Args:
    pipeline_func: Pipeline function with @dsl.pipeline decorator.
    output_directory: The root of the pipeline outputs.
    pipeline_name: The name of the pipeline. Optional.
    pipeline_parameters_override: The mapping from parameter names to values.
      Optional.

  Returns:
    A PipelineJob proto representing the compiled pipeline.
  """

  # Create the arg list with no default values and call pipeline function.
  # Assign type information to the PipelineParam.
  pipeline_meta = _python_op._extract_component_interface(pipeline_func)
  pipeline_name = pipeline_name or pipeline_meta.name

  args_list = []
  signature = inspect.signature(pipeline_func)
  for arg_name in signature.parameters:
    arg_type = None
    for pipeline_input in pipeline_meta.inputs or []:
      if arg_name == pipeline_input.name:
        arg_type = pipeline_input.type
        break
    args_list.append(
        dsl.PipelineParam(
            sanitize_k8s_name(arg_name, True), param_type=arg_type))

  with dsl.Pipeline(pipeline_name) as dsl_pipeline:
    pipeline_func(*args_list)

  # Fill in the default values.
  args_list_with_defaults = []
  if pipeline_meta.inputs:
    args_list_with_defaults = [
        dsl.PipelineParam(
            sanitize_k8s_name(input_spec.name, True),
            param_type=input_spec.type,
            value=input_spec.default) for input_spec in pipeline_meta.inputs
    ]

  pipeline_spec = self._create_pipeline_spec(
      args_list_with_defaults,
      dsl_pipeline,
  )

  pipeline_parameters = {
      arg.name: arg.value for arg in args_list_with_defaults
  }
  # Update pipeline parameters override if there were any.
  pipeline_parameters.update(pipeline_parameters_override or {})

  runtime_config = compiler_utils.build_runtime_config_spec(
      output_directory=output_directory,
      pipeline_parameters=pipeline_parameters)
  pipeline_job = pipeline_spec_pb2.PipelineJob(runtime_config=runtime_config)
  pipeline_job.pipeline_spec.update(json_format.MessageToDict(pipeline_spec))

  return pipeline_job