def run_pipeline_in_experiment(api_pipeline: ApiPipeline, parameters: dict = None,
                               run_name: str = None, wait_for_status: bool = False):
    try:
        client = KfpClient(_pipeline_service_url)
        experiment = client.create_experiment('PIPELINE_RUNS')
        run_result = client.run_pipeline(experiment_id=experiment.id,
                                         job_name=run_name or api_pipeline.name,
                                         params=parameters,
                                         pipeline_id=api_pipeline.id)
        run_id = run_result.id

        if wait_for_status:
            run_details = wait_for_run_status(client, run_id, 10)
            run_status = json.loads(
                run_details.pipeline_runtime.workflow_manifest)["status"]

            if run_status \
                    and run_status.get("phase", "").lower() in ["failed", "error"] \
                    and run_status.get("message"):
                raise RuntimeError(f"Run {run_id} failed with error:"
                                   f" {run_status['message']}")

        return run_id

    except Exception as e:
        print(f"Exception trying to run pipeline {api_pipeline.id} '{api_pipeline.name}'"
              f" with parameters {parameters}: {e}")
        # e is normally an ApiException from the KFP server; guard the attribute
        # access so non-API errors still surface a useful message.
        raise ApiError(message=f"{getattr(e, 'body', str(e))}\nKFP URL: {_pipeline_service_url}",
                       http_status_code=getattr(e, 'status', None) or 422)
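# Usage sketch: submit an existing pipeline and block until it finishes. How the
# ApiPipeline object is obtained is an assumption; the parameter values are
# illustrative only.
def example_submit(pipeline: ApiPipeline):
    run_id = run_pipeline_in_experiment(
        pipeline,
        parameters={"epochs": "5"},   # pipeline-specific parameters (example values)
        run_name="smoke-test",
        wait_for_status=True,         # raises RuntimeError if the run fails
    )
    print(f"Run started: {run_id}")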
def upload_experiments(
    client: kfp.Client,
    pipeline_name: str,
    github_sha: str,
    experiment_name: str = "",
) -> str:
    """Register an experiment with Kubeflow Pipelines.

    The experiment is registered under the name:

        {pipeline_name}-{experiment_name}

    unless experiment_name is "Default", in which case the built-in
    "Default" experiment is used as-is. If no experiment with the
    resulting name exists, it is created.

    Args:
        client (kfp.Client): KFP client.
        pipeline_name (str): The name of the pipeline function.
        github_sha (str): GitHub SHA generated in GitHub Actions.
        experiment_name (str): The experiment name. (Optional)

    Returns:
        str: The ID of the experiment.
    """
    register_name = (f"{pipeline_name}-{experiment_name}"
                     if experiment_name != "Default" else experiment_name)
    try:
        experiment_id = client.get_experiment(
            experiment_name=register_name).to_dict()["id"]
    except ValueError:
        # get_experiment raises ValueError when no experiment matches the name.
        experiment_id = client.create_experiment(
            name=register_name).to_dict()["id"]
        logging.info(f"The experiment is newly registered: {register_name}")
    return experiment_id
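# Usage sketch: register (or reuse) an experiment during a CI run. The KFP host
# is an assumption about the deployment; GITHUB_SHA mirrors what GitHub Actions
# injects into the environment.
client = kfp.Client(host="http://localhost:8080")  # assumed endpoint
experiment_id = upload_experiments(
    client,
    pipeline_name="train-model",
    github_sha=os.environ.get("GITHUB_SHA", "local"),
    experiment_name="ci",
)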
def main():
    args = parse_arguments()
    test_cases = []
    test_name = args.testname + ' Sample Test'

    ###### Initialization ######
    client = Client(namespace=args.namespace)

    ###### Check Input File ######
    utils.add_junit_test(test_cases, 'input generated yaml file',
                         os.path.exists(args.input), 'yaml file is not generated')
    if not os.path.exists(args.input):
        utils.write_junit_xml(test_name, args.result, test_cases)
        print('Error: job not found.')
        exit(1)

    ###### Create Experiment ######
    experiment_name = args.testname + ' sample experiment'
    response = client.create_experiment(experiment_name)
    experiment_id = response.id
    utils.add_junit_test(test_cases, 'create experiment', True)

    ###### Create Job ######
    job_name = args.testname + '_sample'
    params = {}
    response = client.run_pipeline(experiment_id, job_name, args.input, params)
    run_id = response.id
    utils.add_junit_test(test_cases, 'create pipeline run', True)

    ###### Monitor Job ######
    start_time = datetime.now()
    response = client.wait_for_run_completion(run_id, 1200)
    succ = (response.run.status.lower() == 'succeeded')
    end_time = datetime.now()
    elapsed_time = (end_time - start_time).seconds
    utils.add_junit_test(test_cases, 'job completion', succ,
                         'waiting for job completion failure', elapsed_time)

    ###### Output Argo Log for Debugging ######
    workflow_json = client._get_workflow_json(run_id)
    workflow_id = workflow_json['metadata']['name']
    argo_log, _ = utils.run_bash_command('argo logs -n {} -w {}'.format(
        args.namespace, workflow_id))
    print("=========Argo Workflow Log=========")
    print(argo_log)

    if not succ:
        utils.write_junit_xml(test_name, args.result, test_cases)
        exit(1)

    ###### Delete Job ######
    #TODO: add deletion when the backend API offers the interface.

    ###### Write out the test result in junit xml ######
    utils.write_junit_xml(test_name, args.result, test_cases)
def run_pipeline_func(client: kfp.Client,
                      pipeline_name: str,
                      pipeline_id: str,
                      pipeline_paramters_path: str,
                      recurring_flag: str = 'false',
                      cron_exp: str = ''):
    pipeline_params = read_pipeline_params(
        pipeline_paramters_path=pipeline_paramters_path)
    pipeline_params = pipeline_params if pipeline_params is not None else {}

    experiment_id = None
    experiment_name = "{}-{}".format(pipeline_name,
                                     os.environ["INPUT_EXPERIMENT_NAME"])
    try:
        experiment_id = client.get_experiment(
            experiment_name=experiment_name).to_dict()["id"]
    except ValueError:
        experiment_id = client.create_experiment(
            name=experiment_name).to_dict()["id"]

    # Treat a missing or blank namespace input as "use the default namespace"
    # (str.isspace(None) would raise, so check for None first).
    raw_namespace = os.getenv("INPUT_PIPELINE_NAMESPACE")
    namespace = (raw_namespace
                 if raw_namespace and not raw_namespace.isspace() else None)

    job_name = 'Run {} on {}'.format(
        pipeline_name, datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

    logging.info(f"experiment_id: {experiment_id}, "
                 f"job_name: {job_name}, "
                 f"pipeline_params: {pipeline_params}, "
                 f"pipeline_id: {pipeline_id}, "
                 f"namespace: {namespace}, "
                 f"cron_exp: {cron_exp}")

    # GitHub Actions inputs arrive as strings, hence the comparison with "true".
    if recurring_flag == "true":
        client.create_recurring_run(experiment_id=experiment_id,
                                    job_name=job_name,
                                    params=pipeline_params,
                                    pipeline_id=pipeline_id,
                                    cron_expression=cron_exp)
        logging.info(
            "Successfully started the recurring pipeline, head over to kubeflow to check it out"
        )
        # Return here so a recurring run does not also trigger a one-off run.
        return

    client.run_pipeline(experiment_id=experiment_id,
                        job_name=job_name,
                        params=pipeline_params,
                        pipeline_id=pipeline_id)
    logging.info(
        "Successfully started the pipeline, head over to kubeflow to check it out"
    )
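# Usage sketch for exercising the Action entrypoint locally. The INPUT_*
# variables mimic what GitHub Actions would inject; the host, pipeline id, and
# params file are illustrative placeholders.
os.environ.setdefault("INPUT_EXPERIMENT_NAME", "dev")
os.environ.setdefault("INPUT_PIPELINE_NAMESPACE", "kubeflow")
run_pipeline_func(
    client=kfp.Client(host="http://localhost:8080"),      # assumed endpoint
    pipeline_name="train-model",
    pipeline_id="11111111-2222-3333-4444-555555555555",   # hypothetical id
    pipeline_paramters_path="params.yaml",
    recurring_flag="false",
)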
def get_or_create_experiment(experiment_name: str,
                             client: kfp.Client) -> ApiExperiment:
    existing_experiments = client.list_experiments().experiments
    if existing_experiments is not None:
        # Pick the first experiment whose name matches, or None.
        exp = next((exp for exp in existing_experiments
                    if exp.name == experiment_name), None)
    else:
        exp = None

    if exp is None:
        exp = client.create_experiment(experiment_name)
        print('Experiment %s created with ID %s' % (exp.name, exp.id))
    else:
        print('Experiment already exists with id %s' % exp.id)

    return exp
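# Usage sketch: idempotently resolve an experiment before submitting a run.
# The host and package path are assumptions for illustration.
client = kfp.Client(host="http://localhost:8080")  # assumed endpoint
experiment = get_or_create_experiment("nightly-training", client)
client.run_pipeline(experiment.id, "nightly",
                    pipeline_package_path="pipeline.yaml")  # assumed package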
def _mock_pipelines_creation(kfp_client_mock: kfp.Client):

    def _mock_create_experiment(name, description=None, namespace=None):
        return kfp_server_api.models.ApiExperiment(
            id="some-exp-id",
            name=name,
            description=description,
        )

    def _mock_run_pipeline(
        experiment_id,
        job_name,
        pipeline_package_path=None,
        params=None,
        pipeline_id=None,
        version_id=None,
    ):
        return kfp_server_api.models.ApiRun(id="some-run-id", name=job_name)

    kfp_client_mock.create_experiment = _mock_create_experiment
    kfp_client_mock.run_pipeline = _mock_run_pipeline
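# Usage sketch in a test. A MagicMock stands in for the real client; the test
# name and assertions are illustrative, not from the original suite.
from unittest import mock

def test_submits_run():
    client = mock.MagicMock(spec=kfp.Client)
    _mock_pipelines_creation(client)
    experiment = client.create_experiment("exp")
    run = client.run_pipeline(experiment.id, "job")
    assert run.id == "some-run-id"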
def main():
    args = parse_arguments()
    test_cases = []
    test_name = 'Kubeflow Sample Test'

    ###### Initialization ######
    client = Client()

    ###### Check Input File ######
    utils.add_junit_test(test_cases, 'input generated yaml file',
                         os.path.exists(args.input), 'yaml file is not generated')
    if not os.path.exists(args.input):
        utils.write_junit_xml(test_name, args.result, test_cases)
        exit()

    ###### Create Experiment ######
    experiment_name = 'kubeflow sample experiment'
    response = client.create_experiment(experiment_name)
    experiment_id = response.id
    utils.add_junit_test(test_cases, 'create experiment', True)

    ###### Create Job ######
    job_name = 'kubeflow_sample'
    params = {
        'output': args.output,
        'project': 'ml-pipeline-test',
        'evaluation': 'gs://ml-pipeline-dataset/sample-test/flower/eval15.csv',
        'train': 'gs://ml-pipeline-dataset/sample-test/flower/train30.csv',
        'hidden-layer-size': '10,5',
        'steps': '5'
    }
    response = client.run_pipeline(experiment_id, job_name, args.input, params)
    run_id = response.id
    utils.add_junit_test(test_cases, 'create pipeline run', True)

    ###### Monitor Job ######
    start_time = datetime.now()
    response = client.wait_for_run_completion(run_id, 1200)
    succ = (response.run.status.lower() == 'succeeded')
    end_time = datetime.now()
    elapsed_time = (end_time - start_time).seconds
    utils.add_junit_test(test_cases, 'job completion', succ,
                         'waiting for job completion failure', elapsed_time)
    if not succ:
        utils.write_junit_xml(test_name, args.result, test_cases)
        exit()

    ###### Output Argo Log for Debugging ######
    workflow_json = client._get_workflow_json(run_id)
    workflow_id = workflow_json['metadata']['name']
    #TODO: remove the namespace dependency or make it configurable.
    argo_log, _ = utils.run_bash_command(
        'argo logs -n kubeflow -w {}'.format(workflow_id))
    print("=========Argo Workflow Log=========")
    print(argo_log)

    ###### Validate the results ######
    # The confusion matrix should show three columns for the flower data:
    # target, predicted, count.
    cm_tar_path = './confusion_matrix.tar.gz'
    cm_filename = 'mlpipeline-ui-metadata.json'
    utils.get_artifact_in_minio(workflow_json, 'confusionmatrix', cm_tar_path)
    tar_handler = tarfile.open(cm_tar_path)
    tar_handler.extractall()

    with open(cm_filename, 'r') as f:
        cm_data = json.load(f)
        utils.add_junit_test(
            test_cases, 'confusion matrix format',
            (len(cm_data['outputs'][0]['schema']) == 3),
            'the column number of the confusion matrix output is not equal to three'
        )

    ###### Delete Job ######
    #TODO: add deletion when the backend API offers the interface.

    ###### Write out the test result in junit xml ######
    utils.write_junit_xml(test_name, args.result, test_cases)
class KubeflowClient(object):

    log = logging.getLogger(__name__)

    def __init__(self, config, project_name, context):
        token = AuthHandler().obtain_id_token()
        self.host = config.host
        self.client = Client(self.host, existing_token=token)
        self.project_name = project_name
        self.pipeline_description = config.run_config.description
        self.generator = PipelineGenerator(config, project_name, context)

    def list_pipelines(self):
        pipelines = self.client.list_pipelines(page_size=30).pipelines
        return tabulate(map(lambda x: [x.name, x.id], pipelines),
                        headers=["Name", "ID"])

    def run_once(
        self,
        pipeline,
        image,
        experiment_name,
        run_name,
        wait,
        image_pull_policy="IfNotPresent",
    ) -> None:
        run = self.client.create_run_from_pipeline_func(
            self.generator.generate_pipeline(pipeline, image, image_pull_policy),
            arguments={},
            experiment_name=experiment_name,
            run_name=run_name,
        )
        if wait:
            run.wait_for_run_completion(timeout=WAIT_TIMEOUT)

    def compile(self, pipeline, image, output, image_pull_policy="IfNotPresent"):
        Compiler().compile(
            self.generator.generate_pipeline(pipeline, image, image_pull_policy),
            output,
        )
        self.log.info("Generated pipeline definition was saved to %s" % output)

    def upload(self, pipeline, image, image_pull_policy="IfNotPresent"):
        pipeline = self.generator.generate_pipeline(pipeline, image,
                                                    image_pull_policy)

        if self._pipeline_exists(self.project_name):
            pipeline_id = self._get_pipeline_id(self.project_name)
            version_id = self._upload_pipeline_version(pipeline, pipeline_id)
            self.log.info("New version of pipeline created: %s", version_id)
        else:
            (pipeline_id, version_id) = self._upload_pipeline(pipeline)
            self.log.info("Pipeline created")

        self.log.info(
            f"Pipeline link: {self.host}/#/pipelines/details/%s/version/%s",
            pipeline_id,
            version_id,
        )

    def _pipeline_exists(self, pipeline_name):
        return self._get_pipeline_id(pipeline_name) is not None

    def _get_pipeline_id(self, pipeline_name):
        pipelines = self.client.pipelines.list_pipelines(filter=json.dumps({
            "predicates": [{
                "key": "name",
                "op": 1,
                "string_value": pipeline_name,
            }]
        })).pipelines
        if pipelines:
            return pipelines[0].id

    def _upload_pipeline_version(self, pipeline_func, pipeline_id):
        version_name = f"{clean_name(self.project_name)}-{uuid.uuid4()}"[:100]
        with NamedTemporaryFile(suffix=".yaml") as f:
            Compiler().compile(pipeline_func, f.name)
            return self.client.pipeline_uploads.upload_pipeline_version(
                f.name,
                name=version_name,
                pipelineid=pipeline_id,
                _request_timeout=10000,
            ).id

    def _upload_pipeline(self, pipeline_func):
        with NamedTemporaryFile(suffix=".yaml") as f:
            Compiler().compile(pipeline_func, f.name)
            pipeline = self.client.pipeline_uploads.upload_pipeline(
                f.name,
                name=self.project_name,
                description=self.pipeline_description,
                _request_timeout=10000,
            )
            return (pipeline.id, pipeline.default_version.id)

    def _ensure_experiment_exists(self, experiment_name):
        try:
            experiment = self.client.get_experiment(
                experiment_name=experiment_name)
            self.log.info(f"Existing experiment found: {experiment.id}")
        except ValueError as e:
            if not str(e).startswith("No experiment is found"):
                raise
            experiment = self.client.create_experiment(experiment_name)
            self.log.info(f"New experiment created: {experiment.id}")
        return experiment.id

    def schedule(self, experiment_name, cron_expression):
        experiment_id = self._ensure_experiment_exists(experiment_name)
        pipeline_id = self._get_pipeline_id(self.project_name)
        self._disable_runs(experiment_id, pipeline_id)
        self.client.create_recurring_run(
            experiment_id,
            f"{self.project_name} on {cron_expression}",
            cron_expression=cron_expression,
            pipeline_id=pipeline_id,
        )
        self.log.info("Pipeline scheduled to %s", cron_expression)

    def _disable_runs(self, experiment_id, pipeline_id):
        runs = self.client.list_recurring_runs(experiment_id=experiment_id)
        if runs.jobs is not None:
            my_runs = [
                job for job in runs.jobs
                if job.pipeline_spec.pipeline_id == pipeline_id
            ]
            for job in my_runs:
                self.client.jobs.delete_job(job.id)
                self.log.info(f"Previous schedule deleted {job.id}")
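# Usage sketch: schedule the project's pipeline, replacing any previous
# schedule for it. `plugin_config` is a placeholder for the configuration
# object (it must expose `host` and `run_config.description`); the project
# name and cron expression are illustrative.
kf_client = KubeflowClient(plugin_config, project_name="my-project", context=None)
kf_client.schedule("my-project-schedules", cron_expression="0 2 * * *")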
def run_pipeline(pipeline, arguments=None, experiment=None, run=None,
                 namespace=None, artifact_path=None, ops=None,
                 url=None, remote=False):
    """remote KubeFlow pipeline execution

    Submit a workflow task to KFP via mlrun API service

    :param pipeline:      KFP pipeline function or path to .yaml/.zip pipeline file
    :param arguments:     pipeline arguments
    :param experiment:    experiment name
    :param run:           optional, run name
    :param namespace:     Kubernetes namespace (if not using default)
    :param url:           optional, url to mlrun API service
    :param artifact_path: target location/url for mlrun artifacts
    :param ops:           additional operators (.apply() to all pipeline functions)
    :param remote:        use mlrun remote API service vs direct KFP APIs

    :returns: kubeflow pipeline id
    """
    namespace = namespace or mlconf.namespace
    arguments = arguments or {}

    if remote or url:
        mldb = get_run_db(url).connect()
        if mldb.kind != 'http':
            raise ValueError(
                'run pipeline requires access to remote api-service'
                ', please set the dbpath url')
        id = mldb.submit_pipeline(pipeline, arguments, experiment=experiment,
                                  run=run, namespace=namespace, ops=ops,
                                  artifact_path=artifact_path)
    else:
        client = Client(namespace=namespace)
        if isinstance(pipeline, str):
            experiment = client.create_experiment(name=experiment)
            run_result = client.run_pipeline(experiment.id, run, pipeline,
                                             params=arguments)
        else:
            conf = new_pipe_meta(artifact_path, ops)
            run_result = client.create_run_from_pipeline_func(
                pipeline, arguments, run_name=run,
                experiment_name=experiment, pipeline_conf=conf)
        id = run_result.run_id

    logger.info('Pipeline run id={}, check UI or DB for progress'.format(id))
    return id
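# Usage sketch: submit a compiled pipeline file through the direct KFP branch.
# The file name, experiment, run name, and arguments are illustrative only.
run_id = run_pipeline("pipeline.yaml",
                      arguments={"rounds": "20"},
                      experiment="dev",
                      run="dev-run-1")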
def main():
    args = parse_arguments()
    test_cases = []
    test_name = args.testname + ' Sample Test'

    ###### Initialization ######
    host = 'ml-pipeline.%s.svc.cluster.local:8888' % args.namespace
    client = Client(host=host)

    ###### Check Input File ######
    utils.add_junit_test(test_cases, 'input generated yaml file',
                         os.path.exists(args.input), 'yaml file is not generated')
    if not os.path.exists(args.input):
        utils.write_junit_xml(test_name, args.result, test_cases)
        print('Error: job not found.')
        exit(1)

    ###### Create Experiment ######
    experiment_name = args.testname + ' sample experiment'
    response = client.create_experiment(experiment_name)
    experiment_id = response.id
    utils.add_junit_test(test_cases, 'create experiment', True)

    ###### Create Job ######
    job_name = args.testname + '_sample'

    ###### Test-specific arguments #######
    if args.testname == 'tfx_cab_classification':
        params = {
            'output': args.output,
            'project': 'ml-pipeline-test',
            'column-names': 'gs://ml-pipeline-dataset/sample-test/taxi-cab-classification/column-names.json',
            'evaluation': 'gs://ml-pipeline-dataset/sample-test/taxi-cab-classification/eval20.csv',
            'train': 'gs://ml-pipeline-dataset/sample-test/taxi-cab-classification/train50.csv',
            'hidden-layer-size': '5',
            'steps': '5'
        }
    elif args.testname == 'xgboost_training_cm':
        params = {
            'output': args.output,
            'project': 'ml-pipeline-test',
            'train-data': 'gs://ml-pipeline-dataset/sample-test/sfpd/train_50.csv',
            'eval-data': 'gs://ml-pipeline-dataset/sample-test/sfpd/eval_20.csv',
            'schema': 'gs://ml-pipeline-dataset/sample-test/sfpd/schema.json',
            'rounds': '20',
            'workers': '2'
        }
    else:
        # Basic tests require no additional params.
        params = {}

    response = client.run_pipeline(experiment_id, job_name, args.input, params)
    run_id = response.id
    utils.add_junit_test(test_cases, 'create pipeline run', True)

    ###### Monitor Job ######
    try:
        start_time = datetime.now()
        if args.testname == 'xgboost_training_cm':
            response = client.wait_for_run_completion(run_id, 1800)
        else:
            response = client.wait_for_run_completion(run_id, 1200)
        succ = (response.run.status.lower() == 'succeeded')
        end_time = datetime.now()
        elapsed_time = (end_time - start_time).seconds
        utils.add_junit_test(test_cases, 'job completion', succ,
                             'waiting for job completion failure', elapsed_time)
    finally:
        ###### Output Argo Log for Debugging ######
        workflow_json = client._get_workflow_json(run_id)
        workflow_id = workflow_json['metadata']['name']
        argo_log, _ = utils.run_bash_command('argo logs -n {} -w {}'.format(
            args.namespace, workflow_id))
        print('=========Argo Workflow Log=========')
        print(argo_log)

    if not succ:
        utils.write_junit_xml(test_name, args.result, test_cases)
        exit(1)

    ###### Validate the results for specific test cases ######
    #TODO: Add result check for tfx-cab-classification after launch.
    if args.testname == 'xgboost_training_cm':
        cm_tar_path = './confusion_matrix.tar.gz'
        utils.get_artifact_in_minio(workflow_json, 'confusion-matrix',
                                    cm_tar_path, 'mlpipeline-ui-metadata')
        with tarfile.open(cm_tar_path) as tar_handle:
            file_handles = tar_handle.getmembers()
            assert len(file_handles) == 1
            with tar_handle.extractfile(file_handles[0]) as f:
                cm_data = f.read()
                utils.add_junit_test(test_cases, 'confusion matrix format',
                                     (len(cm_data) > 0),
                                     'the confusion matrix file is empty')

    ###### Delete Job ######
    #TODO: add deletion when the backend API offers the interface.

    ###### Write out the test result in junit xml ######
    utils.write_junit_xml(test_name, args.result, test_cases)
class KubeflowClient:
    """
    A wrapper around the existing Kubeflow Pipelines Client which enriches it
    to be able to access more of the Kubeflow Pipelines API.
    """

    def __init__(self,
                 host: Optional[str] = None,
                 client_id: Optional[str] = None,
                 namespace: Optional[str] = "kubeflow"):
        """
        Instantiate a new KubeflowClient

        Args:
            host (str): The host we can find the Kubeflow API at
                (e.g. https://{APP_NAME}.endpoints.{PROJECT_ID}.cloud.goog/pipeline)
            client_id (str): The IAP client id we can use for authorisation
                (e.g. "XXXXXX-XXXXXXXXX.apps.googleusercontent.com")
            namespace (str): The Kubernetes / Kubeflow namespace to deploy to
                (e.g. kubeflow)
        """
        self.host = host
        self.client_id = client_id
        self.namespace = namespace

        logging.info(f"KubeflowClient: host: {host}, client_id: {client_id}")

        self.kfp_client = Client(host, client_id, namespace)
        self.config = self.kfp_client._load_config(self.host, self.client_id,
                                                   self.namespace, None, None)

        self.kfp_pipelines = self._connect_pipelines_api()
        self.kfp_runs = self._connect_runs_api()
        self.kfp_jobs = self._connect_jobs_api()

    def create_pipeline(self, pipeline_func, pipeline_name):
        """
        Create a new Kubeflow Pipeline using the provided pipeline function

        Args:
            pipeline_func: The method decorated by @dsl.pipeline which defines
                the pipeline

        Returns:
            The Kubeflow Pipeline object created
        """
        try:
            (_, pipeline_package_path) = tempfile.mkstemp(suffix=".zip")
            compiler.Compiler().compile(pipeline_func, pipeline_package_path)
            logging.info(f"Compiled pipeline to: {pipeline_package_path}")
            return self.kfp_client.upload_pipeline(pipeline_package_path,
                                                   pipeline_name)
        finally:
            pass
            # os.remove(pipeline_package_path)

    def create_experiment(self, experiment_name):
        """
        Create a new Kubeflow Pipelines Experiment (grouping of pipelines / runs)

        Args:
            experiment_name (str): The name of the experiment

        Returns:
            The Kubeflow experiment object created
        """
        return self.kfp_client.create_experiment(name=experiment_name)

    def list_experiments(self):
        """
        List the Experiments in the current namespace

        Returns:
            A list of all the Experiments
        """
        all_experiments = list()
        next_page_token = ""
        while next_page_token is not None:
            response = self.kfp_client.list_experiments(
                page_size=100, page_token=next_page_token)
            if response.experiments is None:
                break
            all_experiments.extend(response.experiments)
            next_page_token = response.next_page_token
        return all_experiments

    def find_job(self, job_name):
        """
        Look up a job by its name (in the current namespace).
        Returns None if the job cannot be found

        Args:
            job_name (str): The name of the job to find

        Returns:
            A reference to the job if found, and None if not.
""" jobs = self.list_jobs() if jobs is None: return None for j in jobs: if j.name == job_name: return j return None def list_jobs(self): """ List the Jobs in the current namespace Returns: A list of all the Jobs """ all_jobs = list() next_page_token = "" while next_page_token is not None: response = self.kfp_jobs.list_jobs(page_size=100, page_token=next_page_token) if response.jobs is None: break all_jobs.extend(response.jobs) next_page_token = response.next_page_token count = len(all_jobs) # print(f"all_jobs: found {count}") return all_jobs def delete_job(self, job): """ Delete a `Job` using its job.id Args: job (KubeflowJob): A `Job` object to delete Returns: True if the `Job` was deleted succesfully """ self.kfp_jobs.delete_job(id=job.id) return True def create_job(self, name: str, pipeline, experiment, description=None, enabled=True, max_concurrency=1, cron=None): """ Create a new Kubeflow Pipelines Job Args: name (str): The name of the `Job` pipeline (Pipeline): The `Pipeline` object to execute when the `Job` is called experiment (Experiment): The `Experiment` object to create the `Job` in. description (str): A description of what the `Job` is all about enabled (bool): Should be `Job` be enabled? max_concurrency (int): How many concurrent executions of the `Job` are allowed? cron (str): The CRON expression to use to execute the job periodicalls Returns: The Kubeflow API response object. """ key = kfp_server_api.models.ApiResourceKey( id=experiment.id, type=kfp_server_api.models.ApiResourceType.EXPERIMENT) reference = kfp_server_api.models.ApiResourceReference( key, kfp_server_api.models.ApiRelationship.OWNER) spec = kfp_server_api.models.ApiPipelineSpec(pipeline_id=pipeline.id) trigger = None if cron is not None: cron_schedule = kfp_server_api.models.api_cron_schedule.ApiCronSchedule( cron=cron) trigger = kfp_server_api.models.api_trigger.ApiTrigger( cron_schedule=cron_schedule) run_body = kfp_server_api.models.ApiJob( name=name, description=description, pipeline_spec=spec, resource_references=[reference], enabled=True, trigger=trigger, max_concurrency=str(max_concurrency), ) response = self.kfp_jobs.create_job(body=run_body) return response def list_runs(self, experiment_name): """ List the `Runs` in the specified Exper`iment Args: experiment_name (str): The name of the `Experiment` Returns: A list of all the `Runs` in the current `Experiment` """ experiment = self.get_experiment(experiment_name=experiment_name) all_runs = list() next_page_token = "" while next_page_token is not None: response = self.kfp_client.list_runs(page_size=100, page_token=next_page_token) if response.runs is None: break all_runs.extend(response.runs) next_page_token = response.next_page_token run_count = len(all_runs) # print(f"list_runs: found {run_count}") return all_runs def list_pipelines(self): """ List the `Pipelines` in the current namespace Returns: A list of all the `Pipelines` in the current `Experiment` """ all_pipelines = list() response = self.kfp_client.list_pipelines(page_size=100) next_page_token = "" while next_page_token is not None: response = self.kfp_client.list_pipelines( page_size=100, page_token=next_page_token) if response.pipelines is None: break all_pipelines.extend(response.pipelines) next_page_token = response.next_page_token pipeline_count = len(all_pipelines) # print(f"list_pipelines: found {pipeline_count}") return all_pipelines def find_experiment(self, id=None, name=None): """ Look up an `Experiment` by its name or id. Returns None if the `Experiment` cannot be found. 
        Both `id` and `name` are optional, but at least one must be provided.
        Where both are provided, the function will return the first
        `Experiment` matching either id or name.

        Args:
            id (str): The `id` of the `Experiment` to find
            name (string): The `name` of the `Experiment` to find

        Returns:
            A reference to the `Experiment` if found, and None if not.
        """
        experiments = self.list_experiments()
        if experiments is None:
            return None
        for e in experiments:
            if e.name == name:
                return e
            if e.id == id:
                return e
        return None

    def find_pipeline(self, name):
        """
        Look up a `Pipeline` by its name (in the current namespace).
        Returns None if the `Pipeline` cannot be found

        Args:
            name (str): The name of the `Pipeline` to find

        Returns:
            A reference to the `Pipeline` if found, and `None` if not.
        """
        pipelines = self.list_pipelines()
        if pipelines is None:
            return None
        for p in pipelines:
            if p.name == name:
                return p
        return None

    def delete_pipeline(self, pipeline):
        """
        Delete the specified `Pipeline`

        Args:
            pipeline: The pipeline object to delete

        Returns:
            True if successful
        """
        self.kfp_pipelines.delete_pipeline(pipeline.id)
        return True

    def _connect_pipelines_api(self):
        """ Create a new PipelineServiceApi client """
        api_client = kfp_server_api.api_client.ApiClient(self.config)
        return kfp_server_api.api.pipeline_service_api.PipelineServiceApi(
            api_client)

    def _connect_runs_api(self):
        """ Create a new RunServiceApi client """
        api_client = kfp_server_api.api_client.ApiClient(self.config)
        return kfp_server_api.api.run_service_api.RunServiceApi(api_client)

    def _connect_jobs_api(self):
        """ Create a new JobServiceApi client """
        api_client = kfp_server_api.api_client.ApiClient(self.config)
        return kfp_server_api.api.job_service_api.JobServiceApi(api_client)
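# Usage sketch: wire the wrapper together to schedule an existing pipeline.
# The host, client_id, names, and cron expression are placeholders for a real
# IAP-protected deployment.
kf = KubeflowClient(
    host="https://myapp.endpoints.my-project.cloud.goog/pipeline",
    client_id="XXXXXX-XXXXXXXXX.apps.googleusercontent.com",
)
experiment = kf.create_experiment("scheduled-runs")
pipeline = kf.find_pipeline("my-pipeline")
if pipeline is not None:
    kf.create_job("nightly", pipeline, experiment,
                  description="Nightly run", cron="0 2 * * *")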
class KubeflowClient(object):

    log = logging.getLogger(__name__)

    def __init__(self, config, project_name, context):
        token = self.obtain_id_token()
        self.host = config.host
        self.client = Client(self.host, existing_token=token)
        self.project_name = project_name
        self.context = context
        dsl.ContainerOp._DISABLE_REUSABLE_COMPONENT_WARNING = True
        self.volume_meta = config.run_config.volume

    def list_pipelines(self):
        pipelines = self.client.list_pipelines(page_size=30).pipelines
        return tabulate(map(lambda x: [x.name, x.id], pipelines),
                        headers=["Name", "ID"])

    def run_once(
        self,
        pipeline,
        image,
        experiment_name,
        run_name,
        wait,
        image_pull_policy="IfNotPresent",
    ) -> None:
        run = self.client.create_run_from_pipeline_func(
            self.generate_pipeline(pipeline, image, image_pull_policy),
            arguments={},
            experiment_name=experiment_name,
            run_name=run_name,
        )
        if wait:
            run.wait_for_run_completion(timeout=WAIT_TIMEOUT)

    def obtain_id_token(self):
        from google.auth.transport.requests import Request
        from google.oauth2 import id_token
        from google.auth.exceptions import DefaultCredentialsError

        client_id = os.environ.get(IAP_CLIENT_ID, None)

        jwt_token = None

        if not client_id:
            self.log.info(
                "No IAP_CLIENT_ID provided, skipping custom IAP authentication"
            )
            return jwt_token

        try:
            self.log.debug("Obtaining JWT token for %s.", client_id)
            jwt_token = id_token.fetch_id_token(Request(), client_id)
            self.log.info("Obtained JWT token for MLFLOW connectivity.")
        except DefaultCredentialsError as ex:
            self.log.warning(
                str(ex) +
                (" Note that this authentication method does not work with default"
                 " credentials obtained via 'gcloud auth application-default login'"
                 " command. Refer to documentation on how to configure service account"
                 " locally"
                 " (https://cloud.google.com/docs/authentication/production#manually)"
                 ))
        except Exception as e:
            self.log.error("Failed to obtain IAP access token. " + str(e))
        finally:
            return jwt_token

    def generate_pipeline(self, pipeline, image, image_pull_policy):
        @dsl.pipeline(
            name=self.project_name,
            description="Kubeflow pipeline for Kedro project",
        )
        def convert_kedro_pipeline_to_kfp() -> None:
            """Convert from a Kedro pipeline into a kfp container graph."""
            node_volumes = (_setup_volumes()
                            if self.volume_meta is not None else {})
            node_dependencies = self.context.pipelines.get(
                pipeline).node_dependencies
            kfp_ops = _build_kfp_ops(node_dependencies, node_volumes)
            for node, dependencies in node_dependencies.items():
                for dependency in dependencies:
                    kfp_ops[node.name].after(kfp_ops[dependency.name])

        def _setup_volumes():
            vop = dsl.VolumeOp(
                name="data-volume-create",
                resource_name="data-volume",
                size=self.volume_meta.size,
                modes=self.volume_meta.access_modes,
                storage_class=self.volume_meta.storageclass,
            )
            if self.volume_meta.skip_init:
                return {"/home/kedro/data": vop.volume}
            else:
                volume_init = dsl.ContainerOp(
                    name="data-volume-init",
                    image=image,
                    command=["sh", "-c"],
                    arguments=[
                        " ".join([
                            "cp",
                            "--verbose",
                            "-r",
                            "/home/kedro/data/*",
                            "/home/kedro/datavolume",
                        ])
                    ],
                    pvolumes={"/home/kedro/datavolume": vop.volume},
                )
                volume_init.container.set_image_pull_policy(image_pull_policy)
                return {"/home/kedro/data": volume_init.pvolume}

        def _build_kfp_ops(node_dependencies: Dict[Node, Set[Node]],
                           node_volumes: Dict) -> Dict[str, dsl.ContainerOp]:
            """Build kfp container graph from Kedro node dependencies.
""" kfp_ops = {} env = [ V1EnvVar(name=IAP_CLIENT_ID, value=os.environ.get(IAP_CLIENT_ID, "")) ] if is_mlflow_enabled(): kfp_ops["mlflow-start-run"] = dsl.ContainerOp( name="mlflow-start-run", image=image, command=["kedro"], arguments=[ "kubeflow", "mlflow-start", dsl.RUN_ID_PLACEHOLDER, ], file_outputs={"mlflow_run_id": "/tmp/mlflow_run_id"}, ) kfp_ops["mlflow-start-run"].container.set_image_pull_policy( image_pull_policy) env.append( V1EnvVar( name="MLFLOW_RUN_ID", value=kfp_ops["mlflow-start-run"].output, )) for node in node_dependencies: name = _clean_name(node.name) kfp_ops[node.name] = dsl.ContainerOp( name=name, image=image, command=["kedro"], arguments=["run", "--node", node.name], pvolumes=node_volumes, container_kwargs={"env": env}, ) kfp_ops[node.name].container.set_image_pull_policy( image_pull_policy) return kfp_ops return convert_kedro_pipeline_to_kfp def compile(self, pipeline, image, output, image_pull_policy="IfNotPresent"): Compiler().compile( self.generate_pipeline(pipeline, image, image_pull_policy), output) self.log.info("Generated pipeline definition was saved to %s" % output) def upload(self, pipeline, image, image_pull_policy="IfNotPresent"): pipeline = self.generate_pipeline(pipeline, image, image_pull_policy) if self._pipeline_exists(self.project_name): pipeline_id = self._get_pipeline_id(self.project_name) version_id = self._upload_pipeline_version(pipeline, pipeline_id, self.project_name) self.log.info("New version of pipeline created: %s", version_id) else: (pipeline_id, version_id) = self._upload_pipeline(pipeline, self.project_name) self.log.info("Pipeline created") self.log.info( f"Pipeline link: {self.host}/#/pipelines/details/%s/version/%s", pipeline_id, version_id, ) def _pipeline_exists(self, pipeline_name): return self._get_pipeline_id(pipeline_name) is not None def _get_pipeline_id(self, pipeline_name): pipelines = self.client.pipelines.list_pipelines(filter=json.dumps({ "predicates": [{ "key": "name", "op": 1, "string_value": pipeline_name, }] })).pipelines if pipelines: return pipelines[0].id def _upload_pipeline_version(self, pipeline_func, pipeline_id, pipeline_name): version_name = f"{_clean_name(pipeline_name)}-{uuid.uuid4()}"[:100] with NamedTemporaryFile(suffix=".yaml") as f: Compiler().compile(pipeline_func, f.name) return self.client.pipeline_uploads.upload_pipeline_version( f.name, name=version_name, pipelineid=pipeline_id).id def _upload_pipeline(self, pipeline_func, pipeline_name): with NamedTemporaryFile(suffix=".yaml") as f: Compiler().compile(pipeline_func, f.name) pipeline = self.client.pipeline_uploads.upload_pipeline( f.name, name=pipeline_name) return (pipeline.id, pipeline.default_version.id) def _ensure_experiment_exists(self, experiment_name): try: experiment = self.client.get_experiment( experiment_name=experiment_name) self.log.info(f"Existing experiment found: {experiment.id}") except ValueError as e: if not str(e).startswith("No experiment is found"): raise experiment = self.client.create_experiment(experiment_name) self.log.info(f"New experiment created: {experiment.id}") return experiment.id def schedule(self, experiment_name, cron_expression): experiment_id = self._ensure_experiment_exists(experiment_name) pipeline_id = self._get_pipeline_id(self.project_name) self._disable_runs(experiment_id, pipeline_id) self.client.create_recurring_run( experiment_id, f"{self.project_name} on {cron_expression}", cron_expression=cron_expression, pipeline_id=pipeline_id, ) self.log.info("Pipeline scheduled to %s", cron_expression) def 
_disable_runs(self, experiment_id, pipeline_id): runs = self.client.list_recurring_runs(experiment_id=experiment_id) if runs.jobs is not None: my_runs = [ job for job in runs.jobs if job.pipeline_spec.pipeline_id == pipeline_id ] for job in my_runs: self.client.jobs.delete_job(job.id) self.log.info(f"Previous schedule deleted {job.id}")
def main():
    args = parse_arguments()
    test_cases = []
    test_name = 'Resnet CMLE Test'

    ###### Initialization ######
    host = 'ml-pipeline.%s.svc.cluster.local:8888' % args.namespace
    client = Client(host=host)

    ###### Check Input File ######
    utils.add_junit_test(test_cases, 'input generated yaml file',
                         os.path.exists(args.input), 'yaml file is not generated')
    if not os.path.exists(args.input):
        utils.write_junit_xml(test_name, args.result, test_cases)
        print('Error: job not found.')
        exit(1)

    ###### Create Experiment ######
    experiment_name = 'resnet cmle sample experiment'
    response = client.create_experiment(experiment_name)
    experiment_id = response.id
    utils.add_junit_test(test_cases, 'create experiment', True)

    ###### Create Job ######
    job_name = 'cmle_sample'
    params = {
        'output': args.output,
        'project_id': 'ml-pipeline-test',
        'region': 'us-central1',
        'model': 'bolts',
        'version': 'beta1',
        # Watch out! If 1.9 is no longer supported we need to set it to a newer version.
        'tf_version': '1.9',
        'train_csv': 'gs://ml-pipeline-dataset/sample-test/bolts/bolt_images_train_sample1000.csv',
        'validation_csv': 'gs://ml-pipeline-dataset/sample-test/bolts/bolt_images_validate_sample200.csv',
        'labels': 'gs://bolts_image_dataset/labels.txt',
        'depth': 50,
        'train_batch_size': 32,
        'eval_batch_size': 32,
        'steps_per_eval': 128,
        'train_steps': 128,
        'num_train_images': 1000,
        'num_eval_images': 200,
        'num_label_classes': 10
    }
    response = client.run_pipeline(experiment_id, job_name, args.input, params)
    run_id = response.id
    utils.add_junit_test(test_cases, 'create pipeline run', True)

    ###### Monitor Job ######
    try:
        start_time = datetime.now()
        response = client.wait_for_run_completion(run_id, 1800)
        succ = (response.run.status.lower() == 'succeeded')
        end_time = datetime.now()
        elapsed_time = (end_time - start_time).seconds
        utils.add_junit_test(test_cases, 'job completion', succ,
                             'waiting for job completion failure', elapsed_time)
    finally:
        ###### Output Argo Log for Debugging ######
        workflow_json = client._get_workflow_json(run_id)
        workflow_id = workflow_json['metadata']['name']
        argo_log, _ = utils.run_bash_command('argo logs -n {} -w {}'.format(
            args.namespace, workflow_id))
        print("=========Argo Workflow Log=========")
        print(argo_log)

    if not succ:
        utils.write_junit_xml(test_name, args.result, test_cases)
        exit(1)

    ###### Delete Job ######
    #TODO: add deletion when the backend API offers the interface.

    ###### Write out the test result in junit xml ######
    utils.write_junit_xml(test_name, args.result, test_cases)
def main():
    args = parse_arguments()
    test_cases = []
    test_name = 'TFX Sample Test'

    ###### Initialization ######
    client = Client(namespace=args.namespace)

    ###### Check Input File ######
    utils.add_junit_test(test_cases, 'input generated yaml file',
                         os.path.exists(args.input), 'yaml file is not generated')
    if not os.path.exists(args.input):
        utils.write_junit_xml(test_name, args.result, test_cases)
        exit()

    ###### Create Experiment ######
    experiment_name = 'TFX sample experiment'
    response = client.create_experiment(experiment_name)
    experiment_id = response.id
    utils.add_junit_test(test_cases, 'create experiment', True)

    ###### Create Job ######
    job_name = 'TFX_sample'
    params = {
        'output': args.output,
        'project': 'ml-pipeline-test',
        'column-names': 'gs://ml-pipeline-dataset/sample-test/taxi-cab-classification/column-names.json',
        'evaluation': 'gs://ml-pipeline-dataset/sample-test/taxi-cab-classification/eval20.csv',
        'train': 'gs://ml-pipeline-dataset/sample-test/taxi-cab-classification/train50.csv',
        'hidden-layer-size': '5',
        'steps': '5'
    }
    response = client.run_pipeline(experiment_id, job_name, args.input, params)
    run_id = response.id
    utils.add_junit_test(test_cases, 'create pipeline run', True)

    ###### Monitor Job ######
    start_time = datetime.now()
    response = client.wait_for_run_completion(run_id, 1200)
    succ = (response.run.status.lower() == 'succeeded')
    end_time = datetime.now()
    elapsed_time = (end_time - start_time).seconds
    utils.add_junit_test(test_cases, 'job completion', succ,
                         'waiting for job completion failure', elapsed_time)
    if not succ:
        utils.write_junit_xml(test_name, args.result, test_cases)
        exit()

    ###### Output Argo Log for Debugging ######
    workflow_json = client._get_workflow_json(run_id)
    workflow_id = workflow_json['metadata']['name']
    argo_log, _ = utils.run_bash_command('argo logs -n {} -w {}'.format(
        args.namespace, workflow_id))
    print("=========Argo Workflow Log=========")
    print(argo_log)

    ###### Validate the results ######
    #TODO: enable after launch
    # model analysis html is validated
    # argo_workflow_id = workflow_json['metadata']['name']
    # gcs_html_path = os.path.join(os.path.join(args.output, str(argo_workflow_id)),
    #                              'analysis/output_display.html')
    # print('Output display HTML path is ' + gcs_html_path)
    # utils.run_bash_command('gsutil cp ' + gcs_html_path + './')
    # display_file = open('./output_display.html', 'r')
    # is_next_line_state = False
    # for line in display_file:
    #     if is_next_line_state:
    #         state = line.strip()
    #         break
    #     if line.strip() == '<script type="application/vnd.jupyter.widget-state+json">':
    #         is_next_line_state = True
    # import json
    # state_json = json.loads(state)
    # succ = ('state' in state_json and 'version_major' in state_json and
    #         'version_minor' in state_json)
    # utils.add_junit_test(test_cases, 'output display html', succ,
    #                      'the state json does not contain state, version_major, or version_minor')

    ###### Delete Job ######
    #TODO: add deletion when the backend API offers the interface.

    ###### Write out the test result in junit xml ######
    utils.write_junit_xml(test_name, args.result, test_cases)
    def check(self):
        """Run sample test and check results."""
        test_cases = []
        test_name = self._testname + ' Sample Test'

        ###### Initialization ######
        host = 'ml-pipeline.%s.svc.cluster.local:8888' % self._namespace
        client = Client(host=host)

        ###### Check Input File ######
        utils.add_junit_test(test_cases, 'input generated yaml file',
                             os.path.exists(self._input),
                             'yaml file is not generated')
        if not os.path.exists(self._input):
            utils.write_junit_xml(test_name, self._result, test_cases)
            print('Error: job not found.')
            exit(1)

        ###### Create Experiment ######
        experiment_name = self._testname + ' sample experiment'
        response = client.create_experiment(experiment_name)
        experiment_id = response.id
        utils.add_junit_test(test_cases, 'create experiment', True)

        ###### Create Job ######
        job_name = self._testname + '_sample'

        ###### Figure out arguments from associated config files. #######
        test_args = {}
        try:
            with open(DEFAULT_CONFIG, 'r') as f:
                raw_args = yaml.safe_load(f)
        except yaml.YAMLError as yamlerr:
            raise RuntimeError('Illegal default config:{}'.format(yamlerr))
        except OSError as ose:
            raise FileExistsError('Default config not found:{}'.format(ose))
        else:
            test_timeout = raw_args['test_timeout']

        try:
            with open(os.path.join(CONFIG_DIR,
                                   '%s.config.yaml' % self._testname), 'r') as f:
                raw_args = yaml.safe_load(f)
        except yaml.YAMLError as yamlerr:
            print('No legit yaml config file found, use default args:{}'.format(
                yamlerr))
        except OSError as ose:
            print('Config file with the same name not found, use default args:{}'
                  .format(ose))
        else:
            test_args.update(raw_args['arguments'])
            if 'output' in test_args.keys():
                # output is a special param that has to be specified dynamically.
                test_args['output'] = self._output
            if 'test_timeout' in raw_args.keys():
                test_timeout = raw_args['test_timeout']

        response = client.run_pipeline(experiment_id, job_name, self._input,
                                       test_args)
        run_id = response.id
        utils.add_junit_test(test_cases, 'create pipeline run', True)

        ###### Monitor Job ######
        try:
            start_time = datetime.now()
            response = client.wait_for_run_completion(run_id, test_timeout)
            succ = (response.run.status.lower() == 'succeeded')
            end_time = datetime.now()
            elapsed_time = (end_time - start_time).seconds
            utils.add_junit_test(test_cases, 'job completion', succ,
                                 'waiting for job completion failure',
                                 elapsed_time)
        finally:
            ###### Output Argo Log for Debugging ######
            workflow_json = client._get_workflow_json(run_id)
            workflow_id = workflow_json['metadata']['name']
            argo_log, _ = utils.run_bash_command('argo logs -n {} -w {}'.format(
                self._namespace, workflow_id))
            print('=========Argo Workflow Log=========')
            print(argo_log)

        if not succ:
            utils.write_junit_xml(test_name, self._result, test_cases)
            exit(1)

        ###### Validate the results for specific test cases ######
        #TODO: Add result check for tfx-cab-classification after launch.
        if self._testname == 'xgboost_training_cm':
            # For xgboost sample, check its confusion matrix.
            cm_tar_path = './confusion_matrix.tar.gz'
            utils.get_artifact_in_minio(workflow_json, 'confusion-matrix',
                                        cm_tar_path, 'mlpipeline-ui-metadata')
            with tarfile.open(cm_tar_path) as tar_handle:
                file_handles = tar_handle.getmembers()
                assert len(file_handles) == 1
                with tar_handle.extractfile(file_handles[0]) as f:
                    cm_data = f.read()
                    utils.add_junit_test(test_cases, 'confusion matrix format',
                                         (len(cm_data) > 0),
                                         'the confusion matrix file is empty')

        ###### Delete Job ######
        #TODO: add deletion when the backend API offers the interface.
        ###### Write out the test result in junit xml ######
        utils.write_junit_xml(test_name, self._result, test_cases)
class PySampleChecker(object):

    def __init__(self, testname, input, output, result, namespace='kubeflow'):
        """Util class for checking python sample test running results.

        :param testname: test name.
        :param input: The path of a pipeline file that will be submitted.
        :param output: The path of the test output.
        :param result: The path of the test result that will be exported.
        :param namespace: namespace of the deployed pipeline system.
            Default: kubeflow
        """
        self._testname = testname
        self._input = input
        self._output = output
        self._result = result
        self._namespace = namespace
        self._run_pipeline = None
        self._test_timeout = None
        self._test_cases = []
        self._test_name = self._testname + ' Sample Test'
        self._client = None
        self._experiment_id = None
        self._job_name = None
        self._test_args = None
        self._run_id = None

    def run(self):
        """Run compiled KFP pipeline."""

        ###### Initialization ######
        host = 'ml-pipeline.%s.svc.cluster.local:8888' % self._namespace
        self._client = Client(host=host)

        ###### Check Input File ######
        utils.add_junit_test(self._test_cases, 'input generated yaml file',
                             os.path.exists(self._input),
                             'yaml file is not generated')
        if not os.path.exists(self._input):
            utils.write_junit_xml(self._test_name, self._result,
                                  self._test_cases)
            print('Error: job not found.')
            exit(1)

        ###### Create Experiment ######
        experiment_name = self._testname + ' sample experiment'
        response = self._client.create_experiment(experiment_name)
        self._experiment_id = response.id
        utils.add_junit_test(self._test_cases, 'create experiment', True)

        ###### Create Job ######
        self._job_name = self._testname + '_sample'

        ###### Figure out arguments from associated config files. #######
        self._test_args = {}
        config_schema = yamale.make_schema(SCHEMA_CONFIG)
        try:
            with open(DEFAULT_CONFIG, 'r') as f:
                raw_args = yaml.safe_load(f)
            default_config = yamale.make_data(DEFAULT_CONFIG)
            yamale.validate(config_schema,
                            default_config)  # If fails, a ValueError will be raised.
        except yaml.YAMLError as yamlerr:
            raise RuntimeError('Illegal default config:{}'.format(yamlerr))
        except OSError as ose:
            raise FileExistsError('Default config not found:{}'.format(ose))
        else:
            self._test_timeout = raw_args['test_timeout']
            self._run_pipeline = raw_args['run_pipeline']

        try:
            with open(
                    os.path.join(CONFIG_DIR,
                                 '%s.config.yaml' % self._testname), 'r') as f:
                raw_args = yaml.safe_load(f)
            test_config = yamale.make_data(
                os.path.join(CONFIG_DIR, '%s.config.yaml' % self._testname))
            yamale.validate(config_schema,
                            test_config)  # If fails, a ValueError will be raised.
        except yaml.YAMLError as yamlerr:
            print('No legit yaml config file found, use default args:{}'.format(
                yamlerr))
        except OSError as ose:
            print('Config file with the same name not found, use default args:{}'
                  .format(ose))
        else:
            self._test_args.update(raw_args['arguments'])
            if 'output' in self._test_args.keys():
                # output is a special param that has to be specified dynamically.
                self._test_args['output'] = self._output
            if 'test_timeout' in raw_args.keys():
                self._test_timeout = raw_args['test_timeout']
            if 'run_pipeline' in raw_args.keys():
                self._run_pipeline = raw_args['run_pipeline']

        # Submit for pipeline running.
        if self._run_pipeline:
            response = self._client.run_pipeline(self._experiment_id,
                                                 self._job_name, self._input,
                                                 self._test_args)
            self._run_id = response.id
            utils.add_junit_test(self._test_cases, 'create pipeline run', True)

    def check(self):
        """Check pipeline run results."""
        if self._run_pipeline:
            ###### Monitor Job ######
            try:
                start_time = datetime.now()
                response = self._client.wait_for_run_completion(
                    self._run_id, self._test_timeout)
                succ = (response.run.status.lower() == 'succeeded')
                end_time = datetime.now()
                elapsed_time = (end_time - start_time).seconds
                utils.add_junit_test(self._test_cases, 'job completion', succ,
                                     'waiting for job completion failure',
                                     elapsed_time)
            finally:
                ###### Output Argo Log for Debugging ######
                workflow_json = self._client._get_workflow_json(self._run_id)
                workflow_id = workflow_json['metadata']['name']
                argo_log, _ = utils.run_bash_command(
                    'argo logs -n {} -w {}'.format(self._namespace,
                                                   workflow_id))
                print('=========Argo Workflow Log=========')
                print(argo_log)

            if not succ:
                utils.write_junit_xml(self._test_name, self._result,
                                      self._test_cases)
                exit(1)

            ###### Validate the results for specific test cases ######
            #TODO: Add result check for tfx-cab-classification after launch.
            if self._testname == 'xgboost_training_cm':
                # For xgboost sample, check its confusion matrix.
                cm_tar_path = './confusion_matrix.tar.gz'
                utils.get_artifact_in_minio(workflow_json, 'confusion-matrix',
                                            cm_tar_path,
                                            'mlpipeline-ui-metadata')
                with tarfile.open(cm_tar_path) as tar_handle:
                    file_handles = tar_handle.getmembers()
                    assert len(file_handles) == 1
                    with tar_handle.extractfile(file_handles[0]) as f:
                        cm_data = f.read()
                        utils.add_junit_test(
                            self._test_cases, 'confusion matrix format',
                            (len(cm_data) > 0),
                            'the confusion matrix file is empty')

        ###### Delete Job ######
        #TODO: add deletion when the backend API offers the interface.

        ###### Write out the test result in junit xml ######
        utils.write_junit_xml(self._test_name, self._result, self._test_cases)
def run_pipeline(
    pipeline,
    arguments=None,
    project=None,
    experiment=None,
    run=None,
    namespace=None,
    artifact_path=None,
    ops=None,
    url=None,
    ttl=None,
):
    """remote KubeFlow pipeline execution

    Submit a workflow task to KFP via mlrun API service

    :param pipeline:      KFP pipeline function or path to .yaml/.zip pipeline file
    :param arguments:     pipeline arguments
    :param project:       project name, used to fill the artifact_path template
    :param experiment:    experiment name
    :param run:           optional, run name
    :param namespace:     Kubernetes namespace (if not using default)
    :param url:           optional, url to mlrun API service
    :param artifact_path: target location/url for mlrun artifacts
    :param ops:           additional operators (.apply() to all pipeline functions)
    :param ttl:           pipeline ttl in secs (after that the pods will be removed)

    :returns: kubeflow pipeline id
    """
    remote = not get_k8s_helper(
        silent=True).is_running_inside_kubernetes_cluster()

    artifact_path = artifact_path or mlconf.artifact_path
    if artifact_path and "{{run.uid}}" in artifact_path:
        # str.replace returns a new string; the result must be assigned back.
        artifact_path = artifact_path.replace("{{run.uid}}",
                                              "{{workflow.uid}}")
    if artifact_path and "{{run.project}}" in artifact_path:
        if not project:
            raise ValueError("project name must be specified with this" +
                             f" artifact_path template {artifact_path}")
        artifact_path = artifact_path.replace("{{run.project}}", project)
    if not artifact_path:
        raise ValueError("artifact path was not specified")

    namespace = namespace or mlconf.namespace
    arguments = arguments or {}

    if remote or url:
        mldb = get_run_db(url)
        if mldb.kind != "http":
            raise ValueError(
                "run pipeline requires access to remote api-service"
                ", please set the dbpath url")
        id = mldb.submit_pipeline(
            pipeline,
            arguments,
            experiment=experiment,
            run=run,
            namespace=namespace,
            ops=ops,
            artifact_path=artifact_path,
        )
    else:
        client = Client(namespace=namespace)
        if isinstance(pipeline, str):
            experiment = client.create_experiment(name=experiment)
            run_result = client.run_pipeline(experiment.id, run, pipeline,
                                             params=arguments)
        else:
            conf = new_pipe_meta(artifact_path, ttl, ops)
            run_result = client.create_run_from_pipeline_func(
                pipeline,
                arguments,
                run_name=run,
                experiment_name=experiment,
                pipeline_conf=conf,
            )
        id = run_result.run_id

    logger.info("Pipeline run id={}, check UI or DB for progress".format(id))
    return id
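# Usage sketch: run a @dsl.pipeline-decorated function with an artifact_path
# template. `my_pipeline`, the project name, and the v3io path are illustrative
# placeholders.
run_id = run_pipeline(
    my_pipeline,
    arguments={"rounds": 20},
    project="my-project",
    experiment="dev",
    run="dev-run-1",
    artifact_path="v3io:///projects/{{run.project}}/artifacts",
)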
def main():
    args = parse_arguments()
    test_cases = []
    test_name = 'XGBoost Sample Test'

    ###### Initialization ######
    host = 'ml-pipeline.%s.svc.cluster.local:8888' % args.namespace
    client = Client(host=host)

    ###### Check Input File ######
    utils.add_junit_test(test_cases, 'input generated yaml file',
                         os.path.exists(args.input), 'yaml file is not generated')
    if not os.path.exists(args.input):
        utils.write_junit_xml(test_name, args.result, test_cases)
        print('Error: job not found.')
        exit(1)

    ###### Create Experiment ######
    experiment_name = 'xgboost sample experiment'
    response = client.create_experiment(experiment_name)
    experiment_id = response.id
    utils.add_junit_test(test_cases, 'create experiment', True)

    ###### Create Job ######
    job_name = 'xgboost_sample'
    params = {
        'output': args.output,
        'project': 'ml-pipeline-test',
        'train-data': 'gs://ml-pipeline-dataset/sample-test/sfpd/train_50.csv',
        'eval-data': 'gs://ml-pipeline-dataset/sample-test/sfpd/eval_20.csv',
        'schema': 'gs://ml-pipeline-dataset/sample-test/sfpd/schema.json',
        'rounds': '20',
        'workers': '2'
    }
    response = client.run_pipeline(experiment_id, job_name, args.input, params)
    run_id = response.id
    utils.add_junit_test(test_cases, 'create pipeline run', True)

    ###### Monitor Job ######
    start_time = datetime.now()
    response = client.wait_for_run_completion(run_id, 1800)
    succ = (response.run.status.lower() == 'succeeded')
    end_time = datetime.now()
    elapsed_time = (end_time - start_time).seconds
    utils.add_junit_test(test_cases, 'job completion', succ,
                         'waiting for job completion failure', elapsed_time)

    ###### Output Argo Log for Debugging ######
    workflow_json = client._get_workflow_json(run_id)
    workflow_id = workflow_json['metadata']['name']
    argo_log, _ = utils.run_bash_command('argo logs -n {} -w {}'.format(
        args.namespace, workflow_id))
    print("=========Argo Workflow Log=========")
    print(argo_log)

    ###### If the job fails, skip the result validation ######
    if not succ:
        utils.write_junit_xml(test_name, args.result, test_cases)
        exit(1)

    ###### Validate the results ######
    # The confusion matrix artifact should be present and non-empty.
    cm_tar_path = './confusion_matrix.tar.gz'
    cm_filename = 'mlpipeline-ui-metadata.json'
    utils.get_artifact_in_minio(workflow_json, 'confusion-matrix', cm_tar_path)
    tar_handler = tarfile.open(cm_tar_path)
    tar_handler.extractall()

    with open(cm_filename, 'r') as f:
        cm_data = f.read()
        utils.add_junit_test(test_cases, 'confusion matrix format',
                             (len(cm_data) > 0),
                             'the confusion matrix file is empty')

    ###### Delete Job ######
    #TODO: add deletion when the backend API offers the interface.

    ###### Write out the test result in junit xml ######
    utils.write_junit_xml(test_name, args.result, test_cases)