def test_processing_url_runtime_specific_component(monkeypatch, processor, sample_metadata, tmpdir):
    # Define the appropriate reader for a URL-type component definition
    kfp_supported_file_types = [".yaml"]
    reader = UrlComponentCatalogConnector(kfp_supported_file_types)

    # Assign test resource location
    url = (
        "https://raw.githubusercontent.com/elyra-ai/elyra/master/"
        "elyra/tests/pipeline/resources/components/filter_text.yaml"
    )

    # Read the contents of the given URL -- get_entry_data() returns an EntryData object
    # whose 'definition' attribute holds the component definition content
    entry_data = reader.get_entry_data({"url": url}, {})
    component_definition = entry_data.definition

    # Instantiate a url-based component
    component_id = "test_component"
    component = Component(
        id=component_id,
        name="Filter text",
        description="",
        op="filter-text",
        catalog_type="url-catalog",
        component_reference={"url": url},
        definition=component_definition,
        categories=[],
        properties=[],
    )

    # Fabricate the component cache to include a single url-based component for testing
    ComponentCache.instance()._component_cache[processor._type.name] = {
        "spoofed_catalog": {"components": {component_id: component}}
    }

    # Construct hypothetical operation for component
    operation_name = "Filter text test"
    operation_params = {"text": "path/to/text.txt", "pattern": "hello"}
    operation = Operation(
        id="filter-text-id",
        type="execution_node",
        classifier=component_id,
        name=operation_name,
        parent_operation_ids=[],
        component_params=operation_params,
    )

    # Build a mock runtime config for use in _cc_pipeline
    mocked_runtime = Metadata(
        name="test-metadata", display_name="test", schema_name="kfp", metadata=sample_metadata
    )
    mocked_func = mock.Mock(return_value="default", side_effect=[mocked_runtime, sample_metadata])
    monkeypatch.setattr(processor, "_get_metadata_configuration", mocked_func)

    # Construct single-operation pipeline
    pipeline = Pipeline(
        id="pipeline-id", name="kfp_test", runtime="kfp", runtime_config="test", source="filter_text.pipeline"
    )
    pipeline.operations[operation.id] = operation

    # Establish path and function to construct pipeline
    pipeline_path = os.path.join(tmpdir, "kfp_test.yaml")
    constructed_pipeline_function = lambda: processor._cc_pipeline(pipeline=pipeline, pipeline_name="test_pipeline")

    # TODO Check against both argo and tekton compilations
    # Compile pipeline and save into pipeline_path
    kfp_argo_compiler.Compiler().compile(constructed_pipeline_function, pipeline_path)

    # Read contents of pipeline YAML
    with open(pipeline_path) as f:
        pipeline_yaml = yaml.safe_load(f.read())

    # Check the pipeline file contents for correctness
    pipeline_template = pipeline_yaml["spec"]["templates"][0]
    assert (
        pipeline_template["metadata"]["annotations"]["pipelines.kubeflow.org/task_display_name"] == operation_name
    )
    assert pipeline_template["inputs"]["artifacts"][0]["raw"]["data"] == operation_params["text"]
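
# A minimal sketch of how the TODO above (checking the Tekton compilation as well) might
# look, assuming the optional kfp-tekton package is installed; TektonCompiler is that
# package's compiler API. Tekton emits a PipelineRun document rather than an Argo Workflow,
# so the Argo-specific "spec.templates" assertions above would not carry over unchanged.
# The function name and its parameters are illustrative, not part of the existing test suite.
def _example_tekton_compilation_check(constructed_pipeline_function, tmpdir):
    from kfp_tekton.compiler import TektonCompiler

    tekton_pipeline_path = os.path.join(tmpdir, "kfp_test_tekton.yaml")
    TektonCompiler().compile(constructed_pipeline_function, tekton_pipeline_path)

    # Read the compiled output and perform a basic structural check; Tekton-specific
    # assertions would replace the Argo ones used in the test above
    with open(tekton_pipeline_path) as f:
        tekton_pipeline_yaml = yaml.safe_load(f.read())
    assert tekton_pipeline_yaml["kind"] == "PipelineRun"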
def _cc_pipeline(self, pipeline: Pipeline, pipeline_name: str, pipeline_instance_id: str = None) -> OrderedDict:
    """Compile the pipeline in preparation for DAG generation"""

    runtime_configuration = self._get_metadata_configuration(
        schemaspace=Runtimes.RUNTIMES_SCHEMASPACE_ID, name=pipeline.runtime_config
    )
    image_namespace = self._get_metadata_configuration(
        schemaspace=RuntimeImages.RUNTIME_IMAGES_SCHEMASPACE_ID, name=None
    )

    cos_endpoint = runtime_configuration.metadata.get("cos_endpoint")
    cos_username = runtime_configuration.metadata.get("cos_username")
    cos_password = runtime_configuration.metadata.get("cos_password")
    cos_secret = runtime_configuration.metadata.get("cos_secret")
    cos_bucket = runtime_configuration.metadata.get("cos_bucket")

    pipeline_instance_id = pipeline_instance_id or pipeline_name

    artifact_object_prefix = join_paths(pipeline.pipeline_parameters.get(COS_OBJECT_PREFIX), pipeline_instance_id)

    self.log_pipeline_info(
        pipeline_name,
        f"processing pipeline dependencies for upload to '{cos_endpoint}' "
        f"bucket '{cos_bucket}' folder '{artifact_object_prefix}'",
    )

    # Collect the target (operator) configuration for each operation; the list is
    # reordered into an OrderedDict keyed by operation id before it is returned
    target_ops = []

    t0_all = time.time()

    # Sort operations based on dependency graph (topological order)
    sorted_operations = PipelineProcessor._sort_operations(pipeline.operations)

    # Determine whether access to cloud storage is required and check connectivity
    for operation in sorted_operations:
        if isinstance(operation, GenericOperation):
            self._verify_cos_connectivity(runtime_configuration)
            break

    # All previous operation outputs should be propagated throughout the pipeline.
    # In order to process this recursively, the current operation's inputs should be combined
    # from its parent's inputs (which themselves are derived from the outputs of their parent)
    # and its parent's outputs.
    PipelineProcessor._propagate_operation_inputs_outputs(pipeline, sorted_operations)

    # Scrub all node labels of invalid characters
    scrubbed_operations = self._scrub_invalid_characters_from_list(sorted_operations)
    # Generate unique names for all operations
    unique_operations = self._create_unique_node_names(scrubbed_operations)

    for operation in unique_operations:
        if isinstance(operation, GenericOperation):
            operation_artifact_archive = self._get_dependency_archive_name(operation)

            self.log.debug(f"Creating pipeline component:\n {operation} archive : {operation_artifact_archive}")

            # Collect env variables
            pipeline_envs = self._collect_envs(
                operation, cos_secret=cos_secret, cos_username=cos_username, cos_password=cos_password
            )

            # Generate a unique ELYRA_RUN_NAME value and expose it as an
            # environment variable in the container.
            # Notebook | script nodes are implemented using the kubernetes_pod_operator
            # (https://airflow.apache.org/docs/apache-airflow/1.10.12/_api/airflow/contrib/operators/kubernetes_pod_operator/index.html)
            # Environment variables that are passed to this operator are
            # pre-processed by Airflow at runtime, and placeholder values (expressed as '{{ xyz }}'
            # - see https://airflow.apache.org/docs/apache-airflow/1.10.12/macros-ref#default-variables)
            # are replaced.
            if pipeline_envs is None:
                pipeline_envs = {}
            pipeline_envs["ELYRA_RUN_NAME"] = f"{pipeline_name}-{{{{ ts_nodash }}}}"

            image_pull_policy = None
            runtime_image_pull_secret = None
            for image_instance in image_namespace:
                if image_instance.metadata["image_name"] == operation.runtime_image:
                    if image_instance.metadata.get("pull_policy"):
                        image_pull_policy = image_instance.metadata["pull_policy"]
                    if image_instance.metadata.get("pull_secret"):
                        runtime_image_pull_secret = image_instance.metadata["pull_secret"]
                    break

            bootscript = BootscriptBuilder(
                filename=operation.filename,
                pipeline_name=pipeline_name,
                cos_endpoint=cos_endpoint,
                cos_bucket=cos_bucket,
                cos_directory=artifact_object_prefix,
                cos_dependencies_archive=operation_artifact_archive,
                inputs=operation.inputs,
                outputs=operation.outputs,
            )

            target_op = {
                "notebook": operation.name,
                "id": operation.id,
                "argument_list": bootscript.container_cmd,
                "runtime_image": operation.runtime_image,
                "pipeline_envs": pipeline_envs,
                "parent_operation_ids": operation.parent_operation_ids,
                "image_pull_policy": image_pull_policy,
                "cpu_request": operation.cpu,
                "mem_request": operation.memory,
                "gpu_limit": operation.gpu,
                "operator_source": operation.component_params["filename"],
                "is_generic_operator": True,
                "doc": operation.doc,
                "volume_mounts": operation.component_params.get(MOUNTED_VOLUMES, []),
                "kubernetes_secrets": operation.component_params.get(KUBERNETES_SECRETS, []),
            }

            if runtime_image_pull_secret is not None:
                target_op["runtime_image_pull_secret"] = runtime_image_pull_secret

            target_ops.append(target_op)

            self.log_pipeline_info(
                pipeline_name,
                f"processing operation dependencies for id '{operation.id}'",
                operation_name=operation.name,
            )

            self._upload_dependencies_to_object_store(
                runtime_configuration, pipeline_name, operation, prefix=artifact_object_prefix
            )

        else:
            # Retrieve component from cache
            component = ComponentCache.instance().get_component(self._type, operation.classifier)

            # Convert the user-entered value of certain properties according to their type
            for component_property in component.properties:
                # Skip properties for which no value was given
                if component_property.ref not in operation.component_params.keys():
                    continue

                # Get the corresponding property's value from the parsed pipeline
                property_value_dict = operation.component_params.get(component_property.ref)

                # The type and value of this property can vary depending on what the user chooses
                # in the pipeline editor. So we get the current active parameter (e.g. StringControl)
                # from the activeControl value
                active_property_name = property_value_dict["activeControl"]

                # Once we have the active control name (e.g. StringControl) we can retrieve the value
                # assigned to it
                property_value = property_value_dict.get(active_property_name, None)

                # If the value is not found, assign it the default value assigned in the parser
                if property_value is None:
                    property_value = component_property.value

                self.log.debug(f"Active property name : {active_property_name}, value : {property_value}")
                self.log.debug(
                    f"Processing component parameter '{component_property.name}' "
                    f"of type '{component_property.data_type}'"
                )

                if (
                    property_value
                    and str(property_value)[0] == "{"
                    and str(property_value)[-1] == "}"
                    and isinstance(json.loads(json.dumps(property_value)), dict)
                    and set(json.loads(json.dumps(property_value)).keys()) == {"value", "option"}
                ):
                    parent_node_name = self._get_node_name(
                        target_ops, json.loads(json.dumps(property_value))["value"]
                    )
                    processed_value = "\"{{ ti.xcom_pull(task_ids='" + parent_node_name + "') }}\""
                    operation.component_params[component_property.ref] = processed_value
                elif component_property.data_type == "boolean":
                    operation.component_params[component_property.ref] = property_value
                elif component_property.data_type == "string":
                    # Add surrounding quotation marks to string value for correct rendering
                    # in jinja DAG template
                    operation.component_params[component_property.ref] = json.dumps(property_value)
                elif component_property.data_type == "dictionary":
                    processed_value = self._process_dictionary_value(property_value)
                    operation.component_params[component_property.ref] = processed_value
                elif component_property.data_type == "list":
                    processed_value = self._process_list_value(property_value)
                    operation.component_params[component_property.ref] = processed_value

            # Remove inputs and outputs from params dict until support for data exchange is provided
            operation.component_params_as_dict.pop("inputs")
            operation.component_params_as_dict.pop("outputs")

            # Locate the import statement. If not found raise...
            import_stmts = []
            # Check for import statement on Component object, otherwise get from class_import_map
            import_stmt = component.import_statement or self.class_import_map.get(component.name)
            if import_stmt:
                import_stmts.append(import_stmt)
            else:
                # If we didn't find a mapping to the import statement, let's check if the component
                # name includes a package prefix. If it does, log a warning, but proceed; otherwise
                # raise an exception.
                if len(component.name.split(".")) > 1:
                    # We (presumably) have a package prefix
                    self.log.warning(
                        f"Operator '{component.name}' of node '{operation.name}' is not configured "
                        f"in the list of available Airflow operators but appears to include a "
                        f"package prefix and processing will proceed."
                    )
                else:
                    raise ValueError(
                        f"Operator '{component.name}' of node '{operation.name}' is not configured "
                        f"in the list of available operators. Please add the fully-qualified "
                        f"package name for '{component.name}' to the "
                        f"AirflowPipelineProcessor.available_airflow_operators configuration."
                    )

            target_op = {
                "notebook": operation.name,
                "id": operation.id,
                "imports": import_stmts,
                "class_name": component.name,
                "parent_operation_ids": operation.parent_operation_ids,
                "component_params": operation.component_params_as_dict,
                "operator_source": component.component_source,
                "is_generic_operator": False,
                "doc": operation.doc,
            }

            target_ops.append(target_op)

    # Reorder the collected operations so that every node appears after all of its parents
    ordered_target_ops = OrderedDict()

    while target_ops:
        for i in range(len(target_ops)):
            target_op = target_ops.pop(0)
            if not target_op["parent_operation_ids"]:
                ordered_target_ops[target_op["id"]] = target_op
                self.log.debug(f"Added root node {ordered_target_ops[target_op['id']]}")
            elif all(deps in ordered_target_ops.keys() for deps in target_op["parent_operation_ids"]):
                ordered_target_ops[target_op["id"]] = target_op
                self.log.debug(f"Added dependent node {ordered_target_ops[target_op['id']]}")
            else:
                target_ops.append(target_op)

    self.log_pipeline_info(pipeline_name, "pipeline dependencies processed", duration=(time.time() - t0_all))

    return ordered_target_ops
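
# A minimal sketch of how the OrderedDict returned by _cc_pipeline might feed DAG generation
# with a Jinja2 template. The loader path, template name, and template variables below are
# illustrative assumptions, not Elyra's actual artifacts; the point is that each entry already
# carries the imports, class name, rendered parameters, and parent_operation_ids needed to emit
# one operator stanza and wire up its upstream dependencies.
def render_dag_example(processor, pipeline: Pipeline, pipeline_name: str, instance_id: str) -> str:
    from jinja2 import Environment, PackageLoader

    ordered_ops = processor._cc_pipeline(pipeline, pipeline_name, instance_id)
    env = Environment(loader=PackageLoader("elyra", "templates/airflow"), trim_blocks=True)
    template = env.get_template("airflow_template.jinja2")  # hypothetical template name
    return template.render(operations_list=ordered_ops, pipeline_name=pipeline_name)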