def download_file_content_from_url(url: str, bearer_token: str = None) -> bytes:

    request_headers = dict()

    if bearer_token and "?token=" not in url:
        request_headers.update({"Authorization": f"Bearer {bearer_token}"})

    try:
        raw_url = url.replace("/blob/", "/") \
            .replace("/github.ibm.com/", "/raw.github.ibm.com/") \
            .replace("/github.com/", "/raw.githubusercontent.com/")

        response = requests.get(raw_url, allow_redirects=True, headers=request_headers)

        if response.ok:
            file_content = response.content
            return file_content

    except Exception as e:
        raise ApiError(f"Could not download file '{url}'. \n{str(e)}", 422)

    raise ApiError(f"Could not download file '{url}'. Reason: {response.reason}",
                   response.status_code)
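# Hedged usage sketch (not part of the original module): shows how the function
# above might be called for a public GitHub file and for an IBM Enterprise GitHub
# file that needs a token. The URLs and the token value are placeholders.
def _example_download_readme():
    public_bytes = download_file_content_from_url(
        "https://github.com/machine-learning-exchange/mlx/blob/main/README.md")
    private_bytes = download_file_content_from_url(
        "https://github.ibm.com/org/repo/blob/main/README.md",
        bearer_token="<personal-access-token>")
    return public_bytes, private_bytes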
def _download_notebook(url: str, enterprise_github_api_token: str) -> dict:

    request_headers = dict()

    if "ibm.com" in url and "?token=" not in url:
        if not enterprise_github_api_token and not ghe_api_token:
            raise ApiError(
                f"Must provide API token to access notebooks on Enterprise GitHub: {url}",
                422)
        else:
            request_headers.update({
                'Authorization': f'token {enterprise_github_api_token or ghe_api_token}'
            })

    try:
        raw_url = url.replace("/github.ibm.com/", "/raw.github.ibm.com/") \
            .replace("/github.com/", "/raw.githubusercontent.com/") \
            .replace("/blob/", "/")

        response = requests.get(raw_url, allow_redirects=True, headers=request_headers)

        if response.ok:
            notebook_dict = response.json()
            return notebook_dict

    except Exception as e:
        raise ApiError(f"Could not download notebook file '{url}'. \n{str(e)}", 422)

    raise ApiError(
        f"Could not download notebook file '{url}'. Reason: {response.reason}",
        response.status_code)
def _update_bucket_policy(bucket_name: str, prefix: str):

    client = _get_minio_client()

    try:
        bucket_policy = json.loads(client.get_bucket_policy(bucket_name))
    except NoSuchBucketPolicy:
        bucket_policy = dict(_bucket_policy_template)

    getobject_stmts = [s for s in bucket_policy["Statement"] if s.get("Sid") == _bucket_policy_sid] or \
                      [s for s in bucket_policy["Statement"] if "s3:GetObject" in s["Action"]]

    if not getobject_stmts:
        bucket_policy["Statement"].append(_bucket_policy_stmt)
        # keep this as a (one-element) list so the "Resource" lookup below works either way
        getobject_stmts = bucket_policy["Statement"][-1:]

    resources = getobject_stmts[-1]["Resource"]

    new_resource = f"arn:aws:s3:::{bucket_name}/{prefix}"

    if new_resource not in resources and not any(
            [r.strip("*") in new_resource for r in resources]):
        resources.append(new_resource)

    new_policy_str = json.dumps(bucket_policy)

    try:
        client.set_bucket_policy(bucket_name, new_policy_str)
    except ResponseError as e:
        if e.code == 'XMinioPolicyNesting':
            raise ApiError(
                f"{e.message.split('.')[0]}."
                f" New policy: '{new_policy_str}'."
                f" Existing policy: '{client.get_bucket_policy(bucket_name)}'")
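# Hedged sketch of the module-level policy objects the function above relies on.
# The real _bucket_policy_template, _bucket_policy_stmt, and _bucket_policy_sid are
# defined elsewhere in the module; this is only an assumed shape for an
# anonymous-read ("s3:GetObject") MinIO bucket policy.
_example_bucket_policy_sid = "AllowPublicRead"
_example_bucket_policy_stmt = {
    "Sid": _example_bucket_policy_sid,
    "Effect": "Allow",
    "Principal": {"AWS": ["*"]},
    "Action": ["s3:GetObject"],
    "Resource": [],  # filled with "arn:aws:s3:::<bucket>/<prefix>" entries
}
_example_bucket_policy_template = {
    "Version": "2012-10-17",
    "Statement": [_example_bucket_policy_stmt],
}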
def run_pipeline_in_experiment(api_pipeline: ApiPipeline, parameters: dict = None,
                               run_name: str = None, wait_for_status: bool = False):
    try:
        client = KfpClient(_pipeline_service_url)
        experiment = client.create_experiment('PIPELINE_RUNS')
        run_result = client.run_pipeline(experiment_id=experiment.id,
                                         job_name=run_name or api_pipeline.name,
                                         params=parameters,
                                         pipeline_id=api_pipeline.id)
        run_id = run_result.id

        if wait_for_status:

            run_details = wait_for_run_status(client, run_id, 10)
            run_status = json.loads(run_details.pipeline_runtime.workflow_manifest)["status"]

            if run_status \
                    and run_status.get("phase", "").lower() in ["failed", "error"] \
                    and run_status.get("message"):
                raise RuntimeError(f"Run {run_id} failed with error: {run_status['message']}")

        return run_id

    except Exception as e:
        print(f"Exception trying to run pipeline {api_pipeline.id} '{api_pipeline.name}'"
              f" with parameters {parameters}:"
              f" {e}\n")
        # not every exception carries the KFP ApiException attributes, so fall back gracefully
        raise ApiError(message=f"{getattr(e, 'body', str(e))}\nKFP URL: {_pipeline_service_url}",
                       http_status_code=getattr(e, 'status', 422) or 422)

    return None
def upload_pipeline_to_kfp(uploadfile: str, name: str = None) -> ApiPipeline:

    config = UploadClientConfig()
    config.host = _pipeline_service_url
    api_client = UploadApiClient(configuration=config)
    api_instance = PipelineUploadServiceApi(api_client=api_client)

    try:
        kfp_pipeline: KfpPipeline = api_instance.upload_pipeline(uploadfile=uploadfile, name=name)
        api_pipeline: ApiPipeline = ApiPipeline.from_dict(kfp_pipeline.to_dict())
        api_pipeline.status = kfp_pipeline.error
        return api_pipeline

    except PipelineUploadApiException as e:
        kfp_host = api_instance.api_client.configuration.host
        print(f"Error calling PipelineServiceApi ({kfp_host}) -> upload_pipeline(name='{name}'): {e}")

        error_body = json.loads(e.body) or {"error_message": str(e)}
        error_msg = error_body["error_message"]
        status_code = 409 if "already exist. Please specify a new name" in error_msg else e.status

        raise ApiError(error_msg, status_code)

    return None
def _validate_schema(table_name: str, swagger_class):

    # swagger_object.swagger_types dictionary maintains insertion order since Python 3.6
    # but does not show defaults, use inspection to get argument list from constructor
    sig = inspect.signature(swagger_class.__init__)

    swagger_columns_w_type = []

    for _, p in sig.parameters.items():
        if p.name == "self":
            continue
        col_name = _convert_attr_name_to_col_name(p.name)
        col_type = _get_column_type(p.name, p.annotation, swagger_class)
        swagger_columns_w_type.append((col_name, col_type))

    query = f"SELECT COLUMN_NAME, SUBSTR(COLUMN_TYPE,1,64) as COLUMN_TYPE " \
            f" FROM INFORMATION_SCHEMA.COLUMNS " \
            f" WHERE TABLE_SCHEMA = '{_database}' AND TABLE_NAME = '{table_name}'"

    cnx = _get_connection()
    cursor = cnx.cursor(buffered=True)

    table_columns_w_type = []

    try:
        cursor.execute(query)
        for column_name, column_type in cursor:
            table_columns_w_type.append((column_name, column_type))
    except Error as err:
        print(err.msg)
        raise err
    finally:
        cursor.close()
        cnx.close()

    if table_columns_w_type and set(table_columns_w_type) != set(swagger_columns_w_type):

        if isinstance(swagger_class, Model):
            swagger_class = type(swagger_class)

        cols_found = "\n - ".join([f"'{n}' {t}" for n, t in table_columns_w_type])
        cols_expect = "\n - ".join([f"'{n}' {t}" for n, t in swagger_columns_w_type])

        err_msg = f"The MySQL table '{_database}.{table_name}' does not match Swagger" \
                  f" class '{swagger_class.__name__}'.\n" \
                  f" Found table with columns:\n" \
                  f" - {cols_found}.\n" \
                  f" Expected table with columns:\n" \
                  f" - {cols_expect}.\n" \
                  f" Delete and recreate the table by calling the API endpoint 'DELETE /{table_name}/*'"

        raise ApiError(err_msg)

    return len(table_columns_w_type) > 0
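# Standalone illustration (made-up columns, no database involved) of the schema
# comparison above: the table matches the Swagger class when the
# (column_name, column_type) pairs agree as sets, i.e. column order is ignored
# but names and types must match exactly.
example_found = [("id", "varchar(64)"), ("name", "varchar(256)"), ("created_at", "int(11)")]
example_expected = [("name", "varchar(256)"), ("id", "varchar(64)"), ("created_at", "int(11)")]
print(set(example_found) == set(example_expected))  # True, despite different column order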
def upload_pipeline_to_kfp(uploadfile: str, name: str = None, description: str = None) -> ApiPipeline:

    kfp_client = KfpClient()

    try:
        kfp_pipeline: KfpPipeline = kfp_client.upload_pipeline(
            pipeline_package_path=uploadfile,
            pipeline_name=name,
            description=description)
        api_pipeline: ApiPipeline = ApiPipeline.from_dict(kfp_pipeline.to_dict())
        api_pipeline.status = kfp_pipeline.error
        return api_pipeline

    except PipelineApiException as e:
        kfp_host = _pipeline_service_url
        print(f"Error calling PipelineServiceApi ({kfp_host}) -> upload_pipeline(name='{name}'): {e}")

        error_body = json.loads(e.body) or {"error_message": str(e)}
        error_msg = error_body["error_message"]
        status_code = 409 if "already exist. Please specify a new name" in error_msg else e.status

        raise ApiError(error_msg, status_code)

    return None
def _convert_value_to_mysql(value, python_type: type, mysql_type_override: str = None, quote_str=False):

    # turn child attributes of type swagger._base_model.Model into dicts
    def to_dict(v):
        return v.to_dict() if hasattr(v, "to_dict") else v

    if type(python_type) == typing._GenericAlias:  # or str(python_type).startswith("typing."):
        python_type = eval(python_type._name.lower())

    if value and not issubclass(type(value), python_type) \
            and not (isinstance(value, dict) and issubclass(python_type, Model)):
        err_msg = f"The type '{type(value)}' does not match expected target type '{python_type}' for value '{value}'"
        raise ApiError(err_msg, 422)

    if not value:
        if python_type == bool:
            return False
        elif python_type in [int, float]:
            return 0
        elif python_type == dict and mysql_type_override == "json":
            return "{}"
        elif python_type in [str, dict, list] or issubclass(python_type, Model):
            return ""
        else:
            return None

    if hasattr(value, "to_dict"):
        mysql_value = json.dumps(value.to_dict())

    elif python_type == list:  # or isinstance(value, list):
        mysql_value = json.dumps(list(map(to_dict, value)))

    elif python_type == dict or issubclass(python_type, Model) and isinstance(value, dict):
        mysql_value = json.dumps(dict(map(lambda item: (item[0], to_dict(item[1])), value.items())))

    elif python_type == datetime:  # or isinstance(value, datetime):
        mysql_value = int(value.timestamp())

    else:
        mysql_value = value

    # DON'T quote strings when using MySQL queries with parameters
    if mysql_value and quote_str and type(mysql_value) == str:
        mysql_value_escaped = mysql_value.replace("'", r"\'")
        mysql_value = f"'{mysql_value_escaped}'"

    return mysql_value
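# Small self-contained sketch of the typing-alias handling above (assumes the
# CPython typing internals the code relies on): an unsubscripted alias such as
# typing.List carries its name in "_name", which is lowered and evaluated to the
# matching builtin container type before the value is validated.
import typing
print(eval(typing.List._name.lower()))   # <class 'list'>
print(eval(typing.Dict._name.lower()))   # <class 'dict'>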
def store_data(swagger_object: Model) -> str:

    table_name = _get_table_name(swagger_object)

    _verify_or_create_table(table_name, swagger_object)

    swagger_fields = swagger_object.to_dict().keys()

    # TODO: remove generate_id() calls in controller_impl methods, do it here
    if "id" in swagger_fields and not swagger_object.id:
        swagger_object.id = generate_id(swagger_object.name if "name" in swagger_fields else None)

    # TODO: remove creating a new date in controller_impl methods, do it here
    if "created_at" in swagger_fields and not swagger_object.created_at:
        swagger_object.created_at = datetime.now()

    column_names = [_convert_attr_name_to_col_name(f) for f in swagger_fields]
    column_values = [
        _convert_value_to_mysql(getattr(swagger_object, f), swagger_object.swagger_types[f])
        for f in swagger_fields
    ]

    column_names_str = ", ".join(column_names)
    values_list_str = ('%s,' * len(column_values)).rstrip(',')

    insert_stmt = (f"INSERT INTO {table_name} "
                   f"({column_names_str}) "
                   f"VALUES ({values_list_str})")

    cnx = _get_connection()
    cnx.autocommit = True
    cursor = cnx.cursor()

    try:
        cursor.execute(insert_stmt, tuple(column_values))
        cnx.commit()
    except IntegrityError as e:
        cnx.rollback()
        raise ApiError(e.msg, 409)
    except Error as err:
        cnx.rollback()
        print(err.msg)
        print(insert_stmt)
        print(column_values)
        raise err
    finally:
        cursor.close()
        cnx.close()

    return swagger_object.id
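# Illustrative sketch (hypothetical table and column data, no database connection)
# of the parameterized INSERT statement that store_data() builds before executing it
# with the value tuple.
example_columns = ["id", "name", "created_at"]
example_values = ["comp-1234", "My component", 1700000000]
example_insert_stmt = (f"INSERT INTO components "
                       f"({', '.join(example_columns)}) "
                       f"VALUES ({('%s,' * len(example_values)).rstrip(',')})")
# -> INSERT INTO components (id, name, created_at) VALUES (%s,%s,%s)
print(example_insert_stmt)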
def create_secret(secret_name: str, secret_contents: dict):

    output = None

    try:
        command = ['kubectl', '-n', _namespace, 'create', 'secret', 'generic', secret_name]

        for key, value in secret_contents.items():
            if type(value) == dict:
                raise ApiError(f"Secret values must not be of type 'dict'")
            if type(value) == list:
                value = ",".join([str(v) for v in value])
            if type(value) == str and " " in value:
                value = f"\"{value}\""
            command.append(f"--from-literal={key}={value or ''}")

        output = subprocess.run(command, capture_output=True, check=True, timeout=10)
        pprint(output.stdout.decode())

    except Exception as e:
        if output and output.stderr:
            pprint(output.stderr.decode())
        raise ApiError(f"Error trying to create secret '{secret_name}': {e}")
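# Standalone sketch (hypothetical namespace and secret contents, no kubectl
# invocation) of the command list create_secret() assembles: list values are
# joined with commas and values containing spaces are quoted.
example_namespace = "kubeflow"
example_contents = {"api_key": "abc123", "tags": ["dev", "test"], "note": "two words"}
example_command = ["kubectl", "-n", example_namespace, "create", "secret", "generic", "my-secret"]
for example_key, example_value in example_contents.items():
    if isinstance(example_value, list):
        example_value = ",".join(str(v) for v in example_value)
    if isinstance(example_value, str) and " " in example_value:
        example_value = f"\"{example_value}\""
    example_command.append(f"--from-literal={example_key}={example_value or ''}")
# -> [..., '--from-literal=api_key=abc123', '--from-literal=tags=dev,test',
#      '--from-literal=note="two words"']
print(example_command)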
def run_custom_pipeline_in_experiment(custom_pipeline: ApiPipelineCustom, run_name: str, parameters: dict,
                                      wait_for_status: bool = False):
    try:
        source_code = generate_custom_pipeline_run_script(custom_pipeline, parameters, run_name,
                                                          hide_secrets=False)
    except Exception as e:
        # TODO: remove this debug logging for development only
        print(f"Error trying to generate code for custom pipeline run '{run_name or custom_pipeline.name}': {e}")
        print(custom_pipeline)
        print(parameters)
        raise e

    try:
        run_id = run_code_in_experiment(source_code, wait_for_status)

    except SyntaxError as e:
        print(f"SyntaxError trying to run pipeline DSL '{run_name or custom_pipeline.name}': {e}")
        print(source_code)
        print("Custom pipeline payload:")
        print(custom_pipeline)
        raise ApiError(f"SyntaxError trying to run pipeline DSL: {e.msg}\n"
                       f"{source_code}", 500)

    except Exception as e:
        # TODO: remove this debug logging for development only
        print(f"Error trying to run custom pipeline code '{run_name or custom_pipeline.name}': {e}")
        print(custom_pipeline)
        print(source_code)
        raise e

    # TODO: remove this debug logging for development only
    print("Custom pipeline payload:")
    print(custom_pipeline)
    print("Pipeline DSL:")
    print(source_code)

    return run_id
def get_yaml_file_content_from_uploadfile(uploadfile: FileStorage):

    file_name = uploadfile.filename
    file_ext = file_name.lower().split(".")[-1]

    if file_ext in ["tgz", "gz"]:
        yaml_file_content = extract_yaml_from_tarfile(uploadfile)
    elif file_ext in ["yaml", "yml"]:
        yaml_file_content = uploadfile.stream.read()
    else:
        raise ApiError(
            f"File extension not supported: '{file_ext}', uploadfile: '{file_name}'."
            f" Supported file extensions: .tar.gz, .gz, .yaml, .yml", 501)

    return yaml_file_content
def delete_secret(secret_name):

    if secret_name == "*":
        return delete_all_secrets()

    output = None

    try:
        delete_command = ['kubectl', 'delete', '-n', _namespace, 'secret', secret_name]
        output = subprocess.run(delete_command, capture_output=True, check=True, timeout=10)
        print(f"Credential {secret_name} was deleted")

    except Exception as e:
        if output and output.stderr:
            pprint(output.stderr.decode())
        raise ApiError(f"Error trying to delete secret '{secret_name}': {e}")
def delete_kfp_pipeline(pipeline_id: str):

    config = PipelineClientConfig()
    config.host = _pipeline_service_url
    api_client = PipelineApiClient(configuration=config)
    api_instance = PipelineServiceApi(api_client=api_client)

    try:
        api_instance.delete_pipeline(pipeline_id)

    except AttributeError as e:
        # ignore KFP AttributeError. It is a bug in the Swagger-generated client code for Kubeflow Pipelines
        if not str(e) == "module 'kfp_pipeline.models' has no attribute 'ERRORUNKNOWN'":
            raise e

    except PipelineApiException as e:
        kfp_host = api_instance.api_client.configuration.host
        print(f"Exception when calling PipelineServiceApi ({kfp_host}) -> delete_pipeline: {e}\n")
        raise ApiError(message=f"{e.body}\nKFP URL: {kfp_host}",
                       http_status_code=e.status or 422)
def get_secret(secret_name, decode=False) -> dict:

    output = None

    try:
        get_command = ['kubectl', '-n', _namespace, '-o', 'json', 'get', 'secret', secret_name]
        output = subprocess.run(get_command, capture_output=True, check=True, timeout=10)
        secret_data = json.loads(output.stdout.decode()) or {}
        # a secret without a "data" field should yield an empty dict, not None
        secret = secret_data.get("data") or {}

        if decode:
            for k, v in secret.items():
                secret[k] = b64decode(v).decode()

        return secret

    except Exception as e:
        if output and output.stderr:
            pprint(output.stderr.decode())
        raise ApiError(f"Error trying to retrieve secret '{secret_name}': {e}")
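# Self-contained sketch (made-up secret payload) of the decode step above:
# Kubernetes returns secret values base64-encoded under the "data" key, so each
# value is base64-decoded when decode=True.
from base64 import b64decode, b64encode
example_secret_data = {"username": b64encode(b"mlx-user").decode(),
                       "password": b64encode(b"s3cret").decode()}
example_decoded = {k: b64decode(v).decode() for k, v in example_secret_data.items()}
# -> {'username': 'mlx-user', 'password': 's3cret'}
print(example_decoded)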
def list_secrets(name_prefix=secret_name_prefix, decode=False) -> [dict]:

    output = None

    try:
        list_command = ['kubectl', '-n', _namespace, '-o', 'json', 'get', 'secrets']
        output = subprocess.run(list_command, capture_output=True, check=True, timeout=10)
        secrets_data = json.loads(output.stdout.decode()) or {}
        mlx_secrets = [
            d for d in secrets_data.get("items") or []
            if d["metadata"]["name"].startswith(name_prefix)
        ]

        if decode:
            for s in mlx_secrets:
                for k, v in s["data"].items():
                    s["data"][k] = b64decode(v).decode()

        return mlx_secrets

    except Exception as e:
        if output and output.stderr:
            pprint(output.stderr.decode())
        raise ApiError(f"Error trying to list secrets '{name_prefix}...': {e}")
def generate_dataset_run_script(dataset: ApiDataset, dataset_template_url, run_parameters=dict(),
                                run_name: str = None, fail_on_missing_prereqs=False):

    name = f"{dataset.name} ({generate_id(length=4)})"
    description = dataset.description.strip().replace("'", "\\'")

    # TODO: some of the parameters, template URLs should move out of here

    # dataset_parameters = dataset.parameters
    # TODO: ApiParameters should not be defined here
    dataset_parameters = [ApiParameter(name="action", default="create"),
                          ApiParameter(name="namespace", default=_namespace)]

    pipeline_method_args = generate_pipeline_method_args(dataset_parameters)

    parameter_names = ",".join([p.name for p in dataset_parameters])

    # TODO: the action parameter is required by DLF-to-PVC op, so it should not be dynamically generated here
    parameter_dict = {
        "action": "create",
        "namespace": run_parameters.get("namespace", _namespace)
    }

    # see component name at https://github.com/machine-learning-exchange/mlx/blob/main/components/component-samples/dax-to-dlf/component.yaml#L1
    dax_to_dlf_component_id = generate_id(name="Generate Dataset Metadata")

    # see component name at https://github.com/machine-learning-exchange/mlx/blob/main/components/component-samples/dlf/component.yaml#L1
    dlf_to_pvc_component_id = generate_id(name="Create Dataset Volume")

    dax_to_dlf_component_url = get_object_url(bucket_name="mlpipeline",
                                              prefix=f"components/{dax_to_dlf_component_id}/",
                                              file_extensions=[".yaml"])

    dlf_to_pvc_component_url = get_object_url(bucket_name="mlpipeline",
                                              prefix=f"components/{dlf_to_pvc_component_id}/",
                                              file_extensions=[".yaml"])

    if fail_on_missing_prereqs:
        if not dax_to_dlf_component_url:
            raise ApiError(f"Missing required component '{dax_to_dlf_component_id}'")
        if not dlf_to_pvc_component_url:
            raise ApiError(f"Missing required component '{dlf_to_pvc_component_id}'")

    namespace = run_parameters.get("namespace", _namespace)

    pipeline_server = "" if "POD_NAMESPACE" in os.environ else f"'{_pipeline_service_url}'"

    run_name = (run_name or "").replace("'", "\"") or dataset.name

    substitutions = dict(locals())

    template_file = f"run_dataset.TEMPLATE.py"

    with open(join(CODE_TEMPLATE_DIR, template_file), 'r') as f:
        template_raw = f.read()

    template_rendered = Template(template_raw).substitute(substitutions)

    run_script = autopep8.fix_code(template_rendered, options={"aggressive": 2})

    return run_script
def generate_custom_pipeline_function_body(custom_pipeline: ApiPipelineCustom, hide_secrets=True):

    function_body = """
    from kfp import components
"""

    component_template_raw = """
    ${comp_name} = components.load_component_from_url('${template_url}')
    ${op_name} = ${comp_name}(${component_args})
"""

    op_dependency_template_raw = """
    ${op_name}.after(${required_op_name})
"""

    for task in custom_pipeline.dag.tasks:

        parameters = []

        if task.artifact_type == "notebook":

            component_s3_prefix = f"components/jupyter/"

            notebook_url = get_object_url(bucket_name="mlpipeline",
                                          prefix=f"notebooks/{task.artifact_id}/",
                                          file_extensions=[".ipynb"])

            if not notebook_url:
                raise ApiError(f"Could not find notebook '{task.artifact_id}'")

            task_parameters = list(task.arguments.parameters) if task.arguments and task.arguments.parameters else []

            for p in task_parameters:
                if type(p.value) == str and p.value.startswith("{{inputs.parameters."):
                    raise ApiError("Referencing '{{inputs.parameters.*}}' is not supported for notebook parameter"
                                   f" values: {task.to_dict()}", 422)

            notebook_parameters = {p.name: p.value or p.default for p in task_parameters}
            notebook_parameters_str = json.dumps(notebook_parameters) if notebook_parameters else ""

            jupyter_component_parameters = {
                "notebook_url": notebook_url,
                "notebook_params": notebook_parameters_str,
                "api_token": "",
                "endpoint_url": "",
                "bucket_name": "",
                "object_name": "",
                "access_key": "",
                "secret_access_key": ""
            }

            if not hide_secrets:
                output_folder = f"notebooks/{task.artifact_id}/runs/{datetime.now().strftime('%Y%m%d-%H%M%S')}"
                notebook_file_name = notebook_url.split("/")[-1]
                output_file_name = notebook_file_name.replace(r'.ipynb', '_out.ipynb')
                output_file_path = f"{output_folder}/{output_file_name}"
                output_bucket = "mlpipeline"

                jupyter_component_parameters.update({
                    "endpoint_url": "minio-service:9000",  # f"{minio_host}:{minio_port}",
                    "bucket_name": output_bucket,
                    "object_name": output_file_path,
                    "access_key": minio_access_key,
                    "secret_access_key": minio_secret_key
                })

            for name, value in jupyter_component_parameters.items():
                parameters.append(f"{name} = '{value}'")

        elif task.artifact_type == "component":

            component_s3_prefix = f"components/{task.artifact_id}/"

            # replace parameter values that reference pipeline input parameters {{inputs.parameters.parameter_name}}
            task_parameters = list(task.arguments.parameters) if task.arguments and task.arguments.parameters else []

            missing_parameter_values = [p.name for p in task_parameters
                                        if not p.value and not p.default and p.description
                                        and p.description.title().startswith("Required")]

            if missing_parameter_values:
                raise ApiError(f"Missing required task parameters {missing_parameter_values}", 422)

            for p in task_parameters:
                if type(p.value) == str and p.value.startswith("{{inputs.parameters."):
                    match = re.match(r"{{inputs.parameters.(?P<pipeline_parameter_name>\w+)}}", p.value)
                    if not match:
                        raise ApiError(f"Cannot match pipeline input.parameter '{p.value}'", 422)
                    pipeline_param_ref = match.groupdict().get("pipeline_parameter_name")
                    parameters.append(f"{p.name} = {pipeline_param_ref}")
                else:
                    arg = generate_method_arg_from_parameter(p)
                    parameters.append(arg)

        else:
            raise ApiError(f"Unknown or unsupported artifact_type '{task.artifact_type}':\n'{task}'", 422)

        comp_name = "comp_" + re.sub(r"\W+", "_", task.name, flags=re.ASCII).lower()
        op_name = "op_" + re.sub(r"\W+", "_", task.name, flags=re.ASCII).lower()

        template_url = get_object_url(bucket_name="mlpipeline",
                                      prefix=component_s3_prefix,
                                      file_extensions=[".yaml", ".yml"])

        if not template_url:
            raise ApiError(f"Could not find component template '{component_s3_prefix}'")

        substitutions = {
            "comp_name": comp_name,
            "op_name": op_name,
            "template_url": template_url,
            "component_args": ", ".join(parameters)
        }

        template_rendered = Template(component_template_raw).substitute(substitutions)
        function_body += template_rendered

    for task in custom_pipeline.dag.tasks:
        for required_task_name in task.dependencies or []:
            substitutions = {
                "op_name": "op_" + re.sub(r"\W+", "_", task.name, flags=re.ASCII).lower(),
                "required_op_name": "op_" + re.sub(r"\W+", "_", required_task_name, flags=re.ASCII).lower()
            }
            template_rendered = Template(op_dependency_template_raw).substitute(substitutions)
            function_body += template_rendered

    return function_body