def titanic_suvival_prediction(region='aws-region',
                               log_s3_uri="s3://mlops-kubeflow-pipeline-data/emr/titanic/logs",
                               cluster_name="emr-cluster",
                               job_name='spark-ml-trainner',
                               input='s3://mlops-kubeflow-pipeline-data/emr/titanic/train.csv',
                               output='s3://mlops-kubeflow-pipeline-data/emr/titanic/output',
                               jar_path='s3://mlops-kubeflow-pipeline-data/emr/titanic/titanic-survivors-prediction_2.11-1.0.jar',
                               main_class='com.amazonaws.emr.titanic.Titanic',
                               instance_type="m4.xlarge",
                               instance_count="3"):
    create_cluster = emr_create_cluster_op(
        region=region, name=cluster_name, instance_type=instance_type,
        instance_count=instance_count, log_s3_uri=log_s3_uri,
    ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))

    training_and_prediction = emr_submit_spark_job_op(
        region=region, jobflow_id=create_cluster.output, job_name=job_name,
        jar_path=jar_path, main_class=main_class, input=input, output=output,
    ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))

    delete_cluster = emr_delete_cluster_op(
        region=region, jobflow_id=create_cluster.output,
        dependent=training_and_prediction.outputs['job_id'],
    ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
def twitter_classification(s3_raw_data='s3://kubeflow-meda/data/raw/tweets.csv',
                           s3_model_data='s3://kubeflow-meda/models',
                           model_name='NNET'):
    # Preprocess the data: cleansing and feature engineering. Also creates the S3
    # folder structure used to store the data and artifacts of this model run.
    preprocess = preprocess_op(s3_raw_data=s3_raw_data, model_name=model_name).apply(
        use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))

    training = train_op(
        s3_training_data=preprocess.outputs['s3_training_data'],
        s3_training_predictions=preprocess.outputs['s3_training_predictions'],
        s3_model_artifacts=preprocess.outputs['s3_model_artifacts'],
        model_name=model_name,
        max_length=preprocess.outputs['max_length'],
        vocab_size=preprocess.outputs['vocab_size']).apply(
            use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))

    testing = test_op(
        s3_testing_data=preprocess.outputs['s3_testing_data'],
        s3_testing_predictions=preprocess.outputs['s3_testing_predictions'],
        s3_model_artifacts=training.outputs['s3_model_artifacts'],
        model_name=model_name).apply(
            use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
def batch_transform_pipeline(
        region="", image="", model_name="", job_name="", model_artifact_url="",
        instance_type="", instance_count="", data_input="", data_type="",
        content_type="", compression_type="", output_location="", max_concurrent="",
        max_payload="", batch_strategy="", split_type="", network_isolation="",
        role=""):
    create_model = sagemaker_model_op(
        region=region, model_name=model_name, image=image,
        model_artifact_url=model_artifact_url, network_isolation=network_isolation,
        role=role,
    ).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"))

    sagemaker_batch_transform_op(
        region=region, model_name=create_model.output, job_name=job_name,
        instance_type=instance_type, instance_count=instance_count,
        max_concurrent=max_concurrent, max_payload=max_payload,
        batch_strategy=batch_strategy, input_location=data_input,
        data_type=data_type, content_type=content_type, split_type=split_type,
        compression_type=compression_type, output_location=output_location,
    ).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"))
def mnist_classification(region='us-west-2',
                         image='174872318107.dkr.ecr.us-west-2.amazonaws.com/kmeans:1',
                         dataset_path='s3://kubeflow-pipeline-data/mnist_kmeans_example/data',
                         instance_type='ml.c4.8xlarge',
                         instance_count='2',
                         volume_size='50',
                         model_output_path='s3://kubeflow-pipeline-data/mnist_kmeans_example/model',
                         batch_transform_input='s3://kubeflow-pipeline-data/mnist_kmeans_example/input',
                         batch_transform_ouput='s3://kubeflow-pipeline-data/mnist_kmeans_example/output',
                         role_arn=''):
    training = sagemaker_train_op(
        region=region, image=image, instance_type=instance_type,
        instance_count=instance_count, volume_size=volume_size,
        dataset_path=dataset_path, model_artifact_path=model_output_path,
        role=role_arn,
    ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))

    create_model = sagemaker_model_op(
        region=region, image=image,
        model_artifact_url=training.outputs['model_artifact_url'],
        model_name=training.outputs['job_name'],
        role=role_arn).apply(
            use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))

    prediction = sagemaker_deploy_op(
        region=region, model_name=create_model.output).apply(
            use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))

    batch_transform = sagemaker_batch_transform_op(
        region=region, model_name=create_model.output,
        input_location=batch_transform_input,
        output_location=batch_transform_ouput).apply(
            use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
def training(region='us-east-1',
             endpoint_url='',
             image='382416733822.dkr.ecr.us-east-1.amazonaws.com/kmeans:1',
             training_input_mode='File',
             hyperparameters='{"k": "10", "feature_dim": "784"}',
             channels='[{"ChannelName": "train", \
                         "DataSource": { \
                             "S3DataSource": { \
                                 "S3Uri": "s3://kubeflow-pipeline-data/mnist_kmeans_example/data", \
                                 "S3DataType": "S3Prefix", \
                                 "S3DataDistributionType": "FullyReplicated" \
                             } \
                         }, \
                         "ContentType": "", \
                         "CompressionType": "None", \
                         "RecordWrapperType": "None", \
                         "InputMode": "File" \
                        }]',
             instance_type='ml.p2.xlarge',
             instance_count='1',
             volume_size='50',
             max_run_time='3600',
             model_artifact_path='s3://kubeflow-pipeline-data/mnist_kmeans_example/data',
             output_encryption_key='',
             network_isolation='True',
             traffic_encryption='False',
             spot_instance='False',
             max_wait_time='3600',
             checkpoint_config='{}',
             role=''):
    training = sagemaker_train_op(
        region=region, endpoint_url=endpoint_url, image=image,
        training_input_mode=training_input_mode, hyperparameters=hyperparameters,
        channels=channels, instance_type=instance_type, instance_count=instance_count,
        volume_size=volume_size, max_run_time=max_run_time,
        model_artifact_path=model_artifact_path,
        output_encryption_key=output_encryption_key,
        network_isolation=network_isolation, traffic_encryption=traffic_encryption,
        spot_instance=spot_instance, max_wait_time=max_wait_time,
        checkpoint_config=checkpoint_config, role=role,
    ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
def titanic_suvival_prediction(region="us-west-2",
                               log_s3_uri="s3://kubeflow-pipeline-data/emr/titanic/logs",
                               cluster_name="emr-cluster",
                               job_name="spark-ml-trainner",
                               input="s3://kubeflow-pipeline-data/emr/titanic/train.csv",
                               output="s3://kubeflow-pipeline-data/emr/titanic/output",
                               jar_path="s3://kubeflow-pipeline-data/emr/titanic/titanic-survivors-prediction_2.11-1.0.jar",
                               main_class="com.amazonaws.emr.titanic.Titanic",
                               instance_type="m4.xlarge",
                               instance_count="3"):
    create_cluster = emr_create_cluster_op(
        region=region, name=cluster_name, instance_type=instance_type,
        instance_count=instance_count, log_s3_uri=log_s3_uri,
    ).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"))

    training_and_prediction = emr_submit_spark_job_op(
        region=region, jobflow_id=create_cluster.output, job_name=job_name,
        jar_path=jar_path, main_class=main_class, input=input, output=output,
    ).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"))

    delete_cluster = emr_delete_cluster_op(
        region=region, jobflow_id=create_cluster.output,
        dependent=training_and_prediction.outputs["job_id"],
    ).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"))
def pipeline_use_aws_secret():
    secret_name = "kfp-aws-secret"
    dsl.ContainerOp(
        name='mnist_use_aws_secret',
        image='kangwoo/kfp-mnist-storage:0.0.1',
        arguments=['--model', 's3://tensorflow/kfp/mnist/model']).apply(
            aws.use_aws_secret(secret_name,
                               aws_access_key_id_name='AWS_ACCESS_KEY_ID',
                               aws_secret_access_key_name='AWS_SECRET_ACCESS_KEY'))
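# Note: every example in this collection assumes an Opaque Kubernetes secret (named
# "kfp-aws-secret" above, "aws-secret" in most other snippets) already exists in the
# namespace the pipeline runs in. The helper below is a hypothetical sketch of creating
# such a secret with the Kubernetes Python client; the namespace, secret name, and key
# names are assumptions and must match what use_aws_secret() is asked to read.
from kubernetes import client, config


def create_aws_secret(namespace="kubeflow",
                      name="kfp-aws-secret",
                      access_key_id="YOUR_ACCESS_KEY_ID",
                      secret_access_key="YOUR_SECRET_ACCESS_KEY"):
    """Create the Opaque secret whose keys use_aws_secret() maps into env variables."""
    config.load_kube_config()  # assumes a local kubeconfig; use load_incluster_config() in-cluster
    secret = client.V1Secret(
        metadata=client.V1ObjectMeta(name=name),
        type="Opaque",
        # string_data lets the API server handle the base64 encoding
        string_data={
            "AWS_ACCESS_KEY_ID": access_key_id,
            "AWS_SECRET_ACCESS_KEY": secret_access_key,
        },
    )
    client.CoreV1Api().create_namespaced_secret(namespace=namespace, body=secret)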
def create_endpoint_pipeline(
        region="", endpoint_url="", image="", model_name="", endpoint_config_name="",
        endpoint_name="", model_artifact_url="", variant_name_1="", instance_type_1="",
        initial_instance_count_1="", initial_variant_weight_1="", network_isolation="",
        role=""):
    create_model = sagemaker_model_op(
        region=region, endpoint_url=endpoint_url, model_name=model_name, image=image,
        model_artifact_url=model_artifact_url, network_isolation=network_isolation,
        role=role,
    ).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"))

    sagemaker_deploy_op(
        region=region, endpoint_url=endpoint_url,
        endpoint_config_name=endpoint_config_name, endpoint_name=endpoint_name,
        model_name_1=create_model.output, variant_name_1=variant_name_1,
        instance_type_1=instance_type_1,
        initial_instance_count_1=initial_instance_count_1,
        initial_variant_weight_1=initial_variant_weight_1,
    ).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"))
def test_use_aws_secret(self):
    with Pipeline('somename') as p:
        op1 = ContainerOp(name='op1', image='image')
        op1 = op1.apply(use_aws_secret('myaws-secret', 'key_id', 'access_key'))

        assert len(op1.env_variables) == 2
        index = 0
        for expected in ['key_id', 'access_key']:
            assert op1.env_variables[index].name == expected
            assert op1.env_variables[index].value_from.secret_key_ref.name == 'myaws-secret'
            assert op1.env_variables[index].value_from.secret_key_ref.key == expected
            index += 1
def test_use_aws_secret(self):
    op1 = ContainerOp(name='op1', image='image')
    op1 = op1.apply(use_aws_secret('myaws-secret', 'key_id', 'access_key'))

    assert len(op1.container.env) == 2
    index = 0
    for expected_name, expected_key in [('AWS_ACCESS_KEY_ID', 'key_id'),
                                        ('AWS_SECRET_ACCESS_KEY', 'access_key')]:
        assert op1.container.env[index].name == expected_name
        assert op1.container.env[index].value_from.secret_key_ref.name == 'myaws-secret'
        assert op1.container.env[index].value_from.secret_key_ref.key == expected_key
        index += 1
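# The two tests above pin down the behaviour every pipeline in this collection relies on:
# use_aws_secret() returns a modifier for ContainerOp.apply() that injects two environment
# variables backed by secretKeyRef entries. The function below is a minimal sketch that is
# consistent with those assertions; it is illustrative, not the library source.
from kubernetes import client as k8s_client


def use_aws_secret_sketch(secret_name='aws-secret',
                          aws_access_key_id_name='AWS_ACCESS_KEY_ID',
                          aws_secret_access_key_name='AWS_SECRET_ACCESS_KEY'):
    """Return an op modifier that maps two secret keys into AWS env variables."""
    def _apply(task):
        for env_name, secret_key in (('AWS_ACCESS_KEY_ID', aws_access_key_id_name),
                                      ('AWS_SECRET_ACCESS_KEY', aws_secret_access_key_name)):
            task.container.add_env_variable(
                k8s_client.V1EnvVar(
                    name=env_name,
                    value_from=k8s_client.V1EnvVarSource(
                        secret_key_ref=k8s_client.V1SecretKeySelector(
                            name=secret_name, key=secret_key))))
        return task
    return _apply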
def hpo_pipeline(
        region="", algorithm_name="", training_input_mode="", static_parameters="",
        integer_parameters="", channels="", categorical_parameters="",
        early_stopping_type="", max_parallel_jobs="", max_num_jobs="", metric_name="",
        metric_type="", hpo_strategy="", instance_type="", instance_count="",
        volume_size="", max_run_time="", output_location="", network_isolation="",
        max_wait_time="", role=""):
    sagemaker_hpo_op(
        region=region, algorithm_name=algorithm_name,
        training_input_mode=training_input_mode, static_parameters=static_parameters,
        integer_parameters=integer_parameters, channels=channels,
        categorical_parameters=categorical_parameters,
        early_stopping_type=early_stopping_type, max_parallel_jobs=max_parallel_jobs,
        max_num_jobs=max_num_jobs, metric_name=metric_name, metric_type=metric_type,
        strategy=hpo_strategy, instance_type=instance_type,
        instance_count=instance_count, volume_size=volume_size,
        max_run_time=max_run_time, output_location=output_location,
        network_isolation=network_isolation, max_wait_time=max_wait_time, role=role,
    ).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"))
def create_model_pipeline(
        region="", endpoint_url="", image="", model_name="", model_artifact_url="",
        network_isolation="", role=""):
    sagemaker_model_op(
        region=region, endpoint_url=endpoint_url, model_name=model_name, image=image,
        model_artifact_url=model_artifact_url, network_isolation=network_isolation,
        role=role,
    ).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"))
def training(region='us-east-1',
             endpoint_url='',
             image='382416733822.dkr.ecr.us-east-1.amazonaws.com/kmeans:1',
             training_input_mode='File',
             hyperparameters={"k": "10", "feature_dim": "784"},
             channels=channelObjList,
             instance_type='ml.p2.xlarge',
             instance_count=1,
             volume_size=50,
             max_run_time=3600,
             model_artifact_path='s3://kubeflow-pipeline-data/mnist_kmeans_example/data',
             output_encryption_key='',
             network_isolation=True,
             traffic_encryption=False,
             spot_instance=False,
             max_wait_time=3600,
             checkpoint_config={},
             role=''):
    training = sagemaker_train_op(
        region=region, endpoint_url=endpoint_url, image=image,
        training_input_mode=training_input_mode, hyperparameters=hyperparameters,
        channels=channels, instance_type=instance_type, instance_count=instance_count,
        volume_size=volume_size, max_run_time=max_run_time,
        model_artifact_path=model_artifact_path,
        output_encryption_key=output_encryption_key,
        network_isolation=network_isolation, traffic_encryption=traffic_encryption,
        spot_instance=spot_instance, max_wait_time=max_wait_time,
        checkpoint_config=checkpoint_config, role=role,
    ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
def training_pipeline(
        region="", endpoint_url="", image="", training_input_mode="",
        hyperparameters="", channels="", instance_type="", instance_count="",
        volume_size="", max_run_time="", model_artifact_path="",
        output_encryption_key="", network_isolation="", traffic_encryption="",
        spot_instance="", max_wait_time="", checkpoint_config="{}", role=""):
    sagemaker_train_op(
        region=region, endpoint_url=endpoint_url, image=image,
        training_input_mode=training_input_mode, hyperparameters=hyperparameters,
        channels=channels, instance_type=instance_type, instance_count=instance_count,
        volume_size=volume_size, max_run_time=max_run_time,
        model_artifact_path=model_artifact_path,
        output_encryption_key=output_encryption_key,
        network_isolation=network_isolation, traffic_encryption=traffic_encryption,
        spot_instance=spot_instance, max_wait_time=max_wait_time,
        checkpoint_config=checkpoint_config, role=role,
    ).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"))
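# The sagemaker_*_op factories referenced throughout these pipelines are not defined in
# the snippets themselves; they are normally created from the reusable component
# definitions that ship with Kubeflow Pipelines. A sketch of loading a few of them is
# shown below; the file paths are assumptions and depend on your checkout or release.
from kfp import components
from kfp.aws import use_aws_secret

# Assumed locations of the SageMaker component specs (adjust to your tree/version).
sagemaker_train_op = components.load_component_from_file(
    'components/aws/sagemaker/train/component.yaml')
sagemaker_model_op = components.load_component_from_file(
    'components/aws/sagemaker/model/component.yaml')
sagemaker_deploy_op = components.load_component_from_file(
    'components/aws/sagemaker/deploy/component.yaml')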
def iris_prod_pipeline(
        location: dsl.PipelineParam = dsl.PipelineParam(name='location',
                                                        value='FOLDER_NAME_TO_MODELS'),
        model_name: dsl.PipelineParam = dsl.PipelineParam(name="model_name",
                                                          value="MODEL NAME"),
        is_deploy: dsl.PipelineParam = dsl.PipelineParam(name="is_deploy",
                                                         param_type='bool')):
    _load_s3 = load_s3_op(location, model_name).apply(
        aws.use_aws_secret(secret_name='s3-secrets'))

    seldon_config = yaml.load(
        open("iris_prod_pipeline/components/deploy/deploy_iris.yaml"))

    with dsl.Condition(is_deploy == True, name='deploy'):
        _deploy = dsl.ResourceOp(
            name="seldondeploy",
            k8s_resource=seldon_config,
            attribute_outputs={"name": "{.metadata.name}"})
        _deploy.after(_load_s3)
def iris_train_pipeline(
        kernel: dsl.PipelineParam = dsl.PipelineParam(
            name='kernel', value='linear, poly, rbf, sigmoid or precomputed'),
        C: dsl.PipelineParam = dsl.PipelineParam(
            name='C', value='Float value, default value is 1'),
        n_neighbors: dsl.PipelineParam = dsl.PipelineParam(
            name='n_neighbors', value='int value'),
        n_splits: dsl.PipelineParam = dsl.PipelineParam(
            name='n_splits', value="Number of splits for fold"),
        location: dsl.PipelineParam = dsl.PipelineParam(
            name='location', value='FOLDER_NAME_TO_MODELS'),
        svm_filename: dsl.PipelineParam = dsl.PipelineParam(
            name='svm-filename', value='SVM_NAME'),
        lr_filename: dsl.PipelineParam = dsl.PipelineParam(
            name='logistic-regression-filename', value='LOGISTIC_REGRESSION_NAME'),
        dt_filename: dsl.PipelineParam = dsl.PipelineParam(
            name='decision-tree-filename', value='DECISION_TREE_NAME'),
        knn_filename: dsl.PipelineParam = dsl.PipelineParam(
            name='knn-filename', value='KNN_NAME'),
        label1: dsl.PipelineParam = dsl.PipelineParam(name='labels', value='Label 1'),
        label2: dsl.PipelineParam = dsl.PipelineParam(name='labels', value='Label 2'),
        label3: dsl.PipelineParam = dsl.PipelineParam(name='labels', value='Label 3')):
    _load_data = load_op()

    _transform = transform_op(
        dsl.InputArgumentPath(_load_data.outputs['iris'])).after(_load_data)

    _svm = svm_op(
        str(svm_filename) + '.pkl',
        dsl.InputArgumentPath(_transform.outputs['X_train']),
        dsl.InputArgumentPath(_transform.outputs['y_train']),
        dsl.InputArgumentPath(_transform.outputs['X_test']),
        kernel, C, n_splits).after(_transform)

    _lr = lr_op(
        dsl.InputArgumentPath(_transform.outputs['X_train']),
        dsl.InputArgumentPath(_transform.outputs['y_train']),
        dsl.InputArgumentPath(_transform.outputs['X_test']),
        str(lr_filename) + '.pkl', n_splits).after(_transform)

    _dt = dt_op(
        dsl.InputArgumentPath(_transform.outputs['X_train']),
        dsl.InputArgumentPath(_transform.outputs['y_train']),
        dsl.InputArgumentPath(_transform.outputs['X_test']),
        str(dt_filename) + '.pkl', n_splits).after(_transform)

    _knn = knn_op(
        dsl.InputArgumentPath(_transform.outputs['X_train']),
        dsl.InputArgumentPath(_transform.outputs['y_train']),
        dsl.InputArgumentPath(_transform.outputs['X_test']),
        n_neighbors, n_splits,
        str(knn_filename) + '.pkl',
    ).after(_transform)

    models = [
        dsl.InputArgumentPath(_svm.outputs['svm_model']),
        dsl.InputArgumentPath(_lr.outputs['lr_model']),
        dsl.InputArgumentPath(_dt.outputs['dt_model']),
        dsl.InputArgumentPath(_knn.outputs['knn_model']),
    ]

    _save_s3 = save_s3_op(
        models, location,
        [svm_filename, lr_filename, dt_filename, knn_filename]).after(
            _svm, _lr, _dt, _knn).apply(aws.use_aws_secret(secret_name='s3-secrets'))

    _evaluation_knn = evaluation_op(
        dsl.InputArgumentPath(_knn.outputs['knn_predict']),
        dsl.InputArgumentPath(_transform.outputs['y_test']),
        [label1, label2, label3],
        dsl.InputArgumentPath(_transform.outputs['y_train']),
        dsl.InputArgumentPath(_knn.outputs['knn_y_scores'])).after(_knn)

    _evaluation_dt = evaluation_op(
        dsl.InputArgumentPath(_dt.outputs['dt_predict']),
        dsl.InputArgumentPath(_transform.outputs['y_test']),
        [label1, label2, label3],
        dsl.InputArgumentPath(_transform.outputs['y_train']),
        dsl.InputArgumentPath(_dt.outputs['dt_y_scores'])).after(_dt)

    _evaluation_svm = evaluation_op(
        dsl.InputArgumentPath(_svm.outputs['svm_predict']),
        dsl.InputArgumentPath(_transform.outputs['y_test']),
        [label1, label2, label3],
        dsl.InputArgumentPath(_transform.outputs['y_train']),
        dsl.InputArgumentPath(_svm.outputs['svm_y_scores'])).after(_svm)

    _evaluation_lr = evaluation_op(
        dsl.InputArgumentPath(_lr.outputs['lr_predict']),
        dsl.InputArgumentPath(_transform.outputs['y_test']),
        [label1, label2, label3],
        dsl.InputArgumentPath(_transform.outputs['y_train']),
        dsl.InputArgumentPath(_lr.outputs['lr_y_scores'])).after(_lr)
def mnist_classification(region='us-west-2',
                         image='174872318107.dkr.ecr.us-west-2.amazonaws.com/kmeans:1',
                         training_input_mode='File',
                         hpo_strategy='Bayesian',
                         hpo_metric_name='test:msd',
                         hpo_metric_type='Minimize',
                         hpo_early_stopping_type='Off',
                         hpo_static_parameters={"k": "10", "feature_dim": "784"},
                         hpo_integer_parameters=[
                             {"Name": "mini_batch_size", "MinValue": "500", "MaxValue": "600"},
                             {"Name": "extra_center_factor", "MinValue": "10", "MaxValue": "20"}],
                         hpo_continuous_parameters=[],
                         hpo_categorical_parameters=[
                             {"Name": "init_method", "Values": ["random", "kmeans++"]}],
                         hpo_channels=hpoChannels,
                         hpo_spot_instance=False,
                         hpo_max_wait_time=3600,
                         hpo_checkpoint_config={},
                         output_location='s3://kubeflow-pipeline-data/mnist_kmeans_example/output',
                         output_encryption_key='',
                         instance_type='ml.p2.16xlarge',
                         instance_count=1,
                         volume_size=50,
                         hpo_max_num_jobs=9,
                         hpo_max_parallel_jobs=3,
                         max_run_time=3600,
                         endpoint_url='',
                         network_isolation=True,
                         traffic_encryption=False,
                         train_channels=trainChannels,
                         train_spot_instance=False,
                         train_max_wait_time=3600,
                         train_checkpoint_config={},
                         batch_transform_instance_type='ml.m4.xlarge',
                         batch_transform_input='s3://kubeflow-pipeline-data/mnist_kmeans_example/input',
                         batch_transform_data_type='S3Prefix',
                         batch_transform_content_type='text/csv',
                         batch_transform_compression_type='None',
                         batch_transform_ouput='s3://kubeflow-pipeline-data/mnist_kmeans_example/output',
                         batch_transform_max_concurrent=4,
                         batch_transform_max_payload=6,
                         batch_strategy='MultiRecord',
                         batch_transform_split_type='Line',
                         role_arn=''):
    hpo = sagemaker_hpo_op(
        region=region, endpoint_url=endpoint_url, image=image,
        training_input_mode=training_input_mode, strategy=hpo_strategy,
        metric_name=hpo_metric_name, metric_type=hpo_metric_type,
        early_stopping_type=hpo_early_stopping_type,
        static_parameters=hpo_static_parameters,
        integer_parameters=hpo_integer_parameters,
        continuous_parameters=hpo_continuous_parameters,
        categorical_parameters=hpo_categorical_parameters,
        channels=hpo_channels, output_location=output_location,
        output_encryption_key=output_encryption_key, instance_type=instance_type,
        instance_count=instance_count, volume_size=volume_size,
        max_num_jobs=hpo_max_num_jobs, max_parallel_jobs=hpo_max_parallel_jobs,
        max_run_time=max_run_time, network_isolation=network_isolation,
        traffic_encryption=traffic_encryption, spot_instance=hpo_spot_instance,
        max_wait_time=hpo_max_wait_time, checkpoint_config=hpo_checkpoint_config,
        role=role_arn,
    ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))

    training = sagemaker_train_op(
        region=region, endpoint_url=endpoint_url, image=image,
        training_input_mode=training_input_mode,
        hyperparameters=hpo.outputs['best_hyperparameters'],
        channels=train_channels, instance_type=instance_type,
        instance_count=instance_count, volume_size=volume_size,
        max_run_time=max_run_time, model_artifact_path=output_location,
        output_encryption_key=output_encryption_key,
        network_isolation=network_isolation, traffic_encryption=traffic_encryption,
        spot_instance=train_spot_instance, max_wait_time=train_max_wait_time,
        checkpoint_config=train_checkpoint_config, role=role_arn,
    ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))

    create_model = sagemaker_model_op(
        region=region, endpoint_url=endpoint_url,
        model_name=training.outputs['job_name'],
        image=training.outputs['training_image'],
        model_artifact_url=training.outputs['model_artifact_url'],
        network_isolation=network_isolation, role=role_arn).apply(
            use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))

    prediction = sagemaker_deploy_op(
        region=region, endpoint_url=endpoint_url, model_name_1=create_model.output,
    ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))

    batch_transform = sagemaker_batch_transform_op(
        region=region, endpoint_url=endpoint_url, model_name=create_model.output,
        instance_type=batch_transform_instance_type, instance_count=instance_count,
        max_concurrent=batch_transform_max_concurrent,
        max_payload=batch_transform_max_payload, batch_strategy=batch_strategy,
        input_location=batch_transform_input, data_type=batch_transform_data_type,
        content_type=batch_transform_content_type,
        split_type=batch_transform_split_type,
        compression_type=batch_transform_compression_type,
        output_location=batch_transform_ouput).apply(
            use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
def kf_pipeline(input_data='reddit_train.csv'):
    """ Pipeline """
    tokenize_training_step = dsl.ContainerOp(
        name='tokenize',
        image=f"{REGISTRY}/tokenize:{TAG}",
        command="python",
        arguments=["-m", "src.steps.tokenize.pipeline_step"],
        file_outputs={"tokenize_location": "/tokenized_location.txt",
                      "labels_location": "/labels_location.txt"},
        pvolumes={}
    ).apply(aws.use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')) \
        .set_image_pull_policy('Always')

    vectorize_training_step = dsl.ContainerOp(
        name='vectorize',
        image=f"{REGISTRY}/tokenize:{TAG}",
        command="python",
        arguments=["-m", "src.steps.tfidftransformer.pipeline_step",
                   "--input-data", tokenize_training_step.outputs['tokenize_location']],
        file_outputs={"tfidftransformer_location": "/vectorizer_location.txt",
                      "tfidfvectors_location": "/vectors_location.txt"},
        pvolumes={}
    ).apply(aws.use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')) \
        .set_image_pull_policy('Always')

    lr_training_step = dsl.ContainerOp(
        name='logistic regression',
        image=f"{REGISTRY}/tokenize:{TAG}",
        command="python",
        arguments=["-m", "src.steps.lrclassifier.pipeline_step",
                   "--labels-data", tokenize_training_step.outputs['labels_location'],
                   "--vectors-data", vectorize_training_step.outputs['tfidfvectors_location']],
        file_outputs={"lr_model_location": "/lr_model_location.txt"},
        pvolumes={}
    ).apply(aws.use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')) \
        .set_image_pull_policy('Always')

    tokenize_build_step = dsl.ContainerOp(
        name='Build tokenize Serving',
        image=f"{REGISTRY}/kaniko-executor:{TAG}",
        arguments=[
            "--dockerfile=Dockerfile",
            f"--build-arg=TOKENIZE_MODEL={tokenize_training_step.outputs['tokenize_location']}",
            "--context=dir:///workspace",
            f"--destination={REGISTRY}/tokenizeserving:{TAG}"],
        pvolumes={}
    ).apply(aws.use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')) \
        .set_image_pull_policy('Always') \
        .after(lr_training_step)

    vectorize_build_step = dsl.ContainerOp(
        name='Build vectorize Serving',
        image=f"{REGISTRY}/kaniko-executor:{TAG}",
        arguments=[
            "--dockerfile=Dockerfile",
            f"--build-arg=VECTORIZE_MODEL={vectorize_training_step.outputs['tfidftransformer_location']}",
            "--context=dir:///workspace",
            f"--destination={REGISTRY}/vecotrizeserving:{TAG}"],
        pvolumes={}
    ).apply(aws.use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')) \
        .set_image_pull_policy('Always') \
        .after(tokenize_build_step)

    lr_model_build_step = dsl.ContainerOp(
        name='Build LR Serving',
        image=f"{REGISTRY}/kaniko-executor:{TAG}",
        arguments=[
            "--dockerfile=Dockerfile",
            f"--build-arg=LR_MODEL={lr_training_step.outputs['lr_model_location']}",
            "--context=dir:///workspace",
            f"--destination={REGISTRY}/lrserving:{TAG}"],
        pvolumes={}
    ).apply(aws.use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')) \
        .set_image_pull_policy('Always') \
        .after(vectorize_build_step)
def mnist_classification(region='us-west-2',
                         image='174872318107.dkr.ecr.us-west-2.amazonaws.com/kmeans:1',
                         training_input_mode='File',
                         hpo_strategy='Bayesian',
                         hpo_metric_name='test:msd',
                         hpo_metric_type='Minimize',
                         hpo_early_stopping_type='Off',
                         hpo_static_parameters='{"k": "10", "feature_dim": "784"}',
                         hpo_integer_parameters='[{"Name": "mini_batch_size", "MinValue": "500", "MaxValue": "600"}, {"Name": "extra_center_factor", "MinValue": "10", "MaxValue": "20"}]',
                         hpo_continuous_parameters='[]',
                         hpo_categorical_parameters='[{"Name": "init_method", "Values": ["random", "kmeans++"]}]',
                         hpo_channels='[{"ChannelName": "train", \
                                         "DataSource": { \
                                             "S3DataSource": { \
                                                 "S3Uri": "s3://kubeflow-pipeline-data/mnist_kmeans_example/train_data", \
                                                 "S3DataType": "S3Prefix", \
                                                 "S3DataDistributionType": "FullyReplicated" \
                                             } \
                                         }, \
                                         "ContentType": "", \
                                         "CompressionType": "None", \
                                         "RecordWrapperType": "None", \
                                         "InputMode": "File"}, \
                                        {"ChannelName": "test", \
                                         "DataSource": { \
                                             "S3DataSource": { \
                                                 "S3Uri": "s3://kubeflow-pipeline-data/mnist_kmeans_example/test_data", \
                                                 "S3DataType": "S3Prefix", \
                                                 "S3DataDistributionType": "FullyReplicated" \
                                             } \
                                         }, \
                                         "ContentType": "", \
                                         "CompressionType": "None", \
                                         "RecordWrapperType": "None", \
                                         "InputMode": "File"}]',
                         output_location='s3://kubeflow-pipeline-data/mnist_kmeans_example/output',
                         output_encryption_key='',
                         instance_type='ml.p2.16xlarge',
                         instance_count='1',
                         volume_size='50',
                         hpo_max_num_jobs='9',
                         hpo_max_parallel_jobs='3',
                         max_run_time='3600',
                         network_isolation='True',
                         traffic_encryption='False',
                         train_channels='[{"ChannelName": "train", \
                                           "DataSource": { \
                                               "S3DataSource": { \
                                                   "S3Uri": "s3://kubeflow-pipeline-data/mnist_kmeans_example/train_data", \
                                                   "S3DataType": "S3Prefix", \
                                                   "S3DataDistributionType": "FullyReplicated" \
                                               } \
                                           }, \
                                           "ContentType": "", \
                                           "CompressionType": "None", \
                                           "RecordWrapperType": "None", \
                                           "InputMode": "File"}]',
                         batch_transform_instance_type='ml.m4.xlarge',
                         batch_transform_input='s3://kubeflow-pipeline-data/mnist_kmeans_example/input',
                         batch_transform_data_type='S3Prefix',
                         batch_transform_content_type='text/csv',
                         batch_transform_compression_type='None',
                         batch_transform_ouput='s3://kubeflow-pipeline-data/mnist_kmeans_example/output',
                         batch_transform_max_concurrent='4',
                         batch_transform_max_payload='6',
                         batch_strategy='MultiRecord',
                         batch_transform_split_type='Line',
                         role_arn=''):
    hpo = sagemaker_hpo_op(
        region=region, image=image, training_input_mode=training_input_mode,
        strategy=hpo_strategy, metric_name=hpo_metric_name,
        metric_type=hpo_metric_type, early_stopping_type=hpo_early_stopping_type,
        static_parameters=hpo_static_parameters,
        integer_parameters=hpo_integer_parameters,
        continuous_parameters=hpo_continuous_parameters,
        categorical_parameters=hpo_categorical_parameters,
        channels=hpo_channels, output_location=output_location,
        output_encryption_key=output_encryption_key, instance_type=instance_type,
        instance_count=instance_count, volume_size=volume_size,
        max_num_jobs=hpo_max_num_jobs, max_parallel_jobs=hpo_max_parallel_jobs,
        max_run_time=max_run_time, network_isolation=network_isolation,
        traffic_encryption=traffic_encryption, role=role_arn,
    ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))

    training = sagemaker_train_op(
        region=region, image=image, training_input_mode=training_input_mode,
        hyperparameters=hpo.outputs['best_hyperparameters'],
        channels=train_channels, instance_type=instance_type,
        instance_count=instance_count, volume_size=volume_size,
        max_run_time=max_run_time, model_artifact_path=output_location,
        output_encryption_key=output_encryption_key,
        network_isolation=network_isolation, traffic_encryption=traffic_encryption,
        role=role_arn,
    ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))

    create_model = sagemaker_model_op(
        region=region, model_name=training.outputs['job_name'],
        image=training.outputs['training_image'],
        model_artifact_url=training.outputs['model_artifact_url'],
        network_isolation=network_isolation, role=role_arn).apply(
            use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))

    prediction = sagemaker_deploy_op(
        region=region, model_name_1=create_model.output,
    ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))

    batch_transform = sagemaker_batch_transform_op(
        region=region, model_name=create_model.output,
        instance_type=batch_transform_instance_type, instance_count=instance_count,
        max_concurrent=batch_transform_max_concurrent,
        max_payload=batch_transform_max_payload, batch_strategy=batch_strategy,
        input_location=batch_transform_input, data_type=batch_transform_data_type,
        content_type=batch_transform_content_type,
        split_type=batch_transform_split_type,
        compression_type=batch_transform_compression_type,
        output_location=batch_transform_ouput).apply(
            use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
def _cc_pipeline(self, pipeline, pipeline_name, pipeline_version='',
                 experiment_name='', cos_directory=None, export=False):
    runtime_configuration = self._get_metadata_configuration(
        namespace=MetadataManager.NAMESPACE_RUNTIMES, name=pipeline.runtime_config)

    cos_endpoint = runtime_configuration.metadata['cos_endpoint']
    cos_username = runtime_configuration.metadata['cos_username']
    cos_password = runtime_configuration.metadata['cos_password']
    cos_secret = runtime_configuration.metadata.get('cos_secret')
    if cos_directory is None:
        cos_directory = pipeline_name
    cos_bucket = runtime_configuration.metadata['cos_bucket']

    self.log_pipeline_info(pipeline_name,
                           f"processing pipeline dependencies to: {cos_endpoint} "
                           f"bucket: {cos_bucket} folder: {cos_directory}")
    t0_all = time.time()

    emptydir_volume_size = ''
    container_runtime = bool(os.getenv('CRIO_RUNTIME', 'False').lower() == 'true')

    # Create dictionary that maps component Id to its ContainerOp instance
    notebook_ops = {}

    # Sort operations based on dependency graph (topological order)
    sorted_operations = PipelineProcessor._sort_operations(pipeline.operations)

    # All previous operation outputs should be propagated throughout the pipeline.
    # In order to process this recursively, the current operation's inputs should be combined
    # from its parent's inputs (which, themselves are derived from the outputs of their parent)
    # and its parent's outputs.
    PipelineProcessor._propagate_operation_inputs_outputs(pipeline, sorted_operations)

    for operation in sorted_operations:
        operation_artifact_archive = self._get_dependency_archive_name(operation)

        self.log.debug("Creating pipeline component :\n {op} archive : {archive}".format(
            op=operation, archive=operation_artifact_archive))

        if container_runtime:
            # Volume size to create when using CRI-o, NOTE: IBM Cloud minimum is 20Gi
            emptydir_volume_size = '20Gi'

        # Collect env variables
        pipeline_envs = self._collect_envs(operation,
                                           cos_secret=cos_secret,
                                           cos_username=cos_username,
                                           cos_password=cos_password)
        # Include any envs set on the operation
        pipeline_envs.update(operation.env_vars_as_dict(logger=self.log))

        sanitized_operation_name = self._sanitize_operation_name(operation.name)

        # create pipeline operation
        notebook_ops[operation.id] = NotebookOp(
            name=sanitized_operation_name,
            pipeline_name=pipeline_name,
            experiment_name=experiment_name,
            notebook=operation.filename,
            cos_endpoint=cos_endpoint,
            cos_bucket=cos_bucket,
            cos_directory=cos_directory,
            cos_dependencies_archive=operation_artifact_archive,
            pipeline_version=pipeline_version,
            pipeline_source=pipeline.source,
            pipeline_inputs=operation.inputs,
            pipeline_outputs=operation.outputs,
            pipeline_envs=pipeline_envs,
            emptydir_volume_size=emptydir_volume_size,
            cpu_request=operation.cpu,
            mem_request=operation.memory,
            gpu_limit=operation.gpu,
            image=operation.runtime_image,
            file_outputs={
                'mlpipeline-metrics': '{}/mlpipeline-metrics.json'.format(
                    pipeline_envs['ELYRA_WRITABLE_CONTAINER_DIR']),
                'mlpipeline-ui-metadata': '{}/mlpipeline-ui-metadata.json'.format(
                    pipeline_envs['ELYRA_WRITABLE_CONTAINER_DIR'])
            })

        if cos_secret and not export:
            notebook_ops[operation.id].apply(use_aws_secret(cos_secret))

        image_namespace = self._get_metadata_configuration(
            namespace=MetadataManager.NAMESPACE_RUNTIME_IMAGES)
        for image_instance in image_namespace:
            if image_instance.metadata['image_name'] == operation.runtime_image and \
                    image_instance.metadata.get('pull_policy'):
                notebook_ops[operation.id].container.set_image_pull_policy(
                    image_instance.metadata['pull_policy'])

        self.log_pipeline_info(pipeline_name,
                               f"processing operation dependencies for id: {operation.id}",
                               operation_name=operation.name)

        self._upload_dependencies_to_object_store(runtime_configuration, cos_directory, operation)

    # Process dependencies after all the operations have been created
    for operation in pipeline.operations.values():
        op = notebook_ops[operation.id]
        for parent_operation_id in operation.parent_operations:
            parent_op = notebook_ops[parent_operation_id]  # Parent Operation
            op.after(parent_op)

    self.log_pipeline_info(pipeline_name, "pipeline dependencies processed",
                           duration=(time.time() - t0_all))

    return notebook_ops
def _cc_pipeline(self, pipeline, pipeline_name, pipeline_version="",
                 experiment_name="", cos_directory=None, export=False):
    runtime_configuration = self._get_metadata_configuration(
        schemaspace=Runtimes.RUNTIMES_SCHEMASPACE_ID, name=pipeline.runtime_config
    )

    cos_endpoint = runtime_configuration.metadata["cos_endpoint"]
    cos_username = runtime_configuration.metadata.get("cos_username")
    cos_password = runtime_configuration.metadata.get("cos_password")
    cos_secret = runtime_configuration.metadata.get("cos_secret")
    cos_bucket = runtime_configuration.metadata.get("cos_bucket")
    if cos_directory is None:
        cos_directory = pipeline_name

    engine = runtime_configuration.metadata["engine"]

    self.log_pipeline_info(
        pipeline_name,
        f"processing pipeline dependencies to: {cos_endpoint} "
        f"bucket: {cos_bucket} folder: {cos_directory}",
    )
    t0_all = time.time()

    emptydir_volume_size = ""
    container_runtime = bool(os.getenv("CRIO_RUNTIME", "False").lower() == "true")

    # Create dictionary that maps component Id to its ContainerOp instance
    target_ops = {}

    # Sort operations based on dependency graph (topological order)
    sorted_operations = PipelineProcessor._sort_operations(pipeline.operations)

    # Determine whether access to cloud storage is required
    for operation in sorted_operations:
        if isinstance(operation, GenericOperation):
            self._verify_cos_connectivity(runtime_configuration)
            break

    # All previous operation outputs should be propagated throughout the pipeline.
    # In order to process this recursively, the current operation's inputs should be combined
    # from its parent's inputs (which, themselves are derived from the outputs of their parent)
    # and its parent's outputs.
    PipelineProcessor._propagate_operation_inputs_outputs(pipeline, sorted_operations)

    for operation in sorted_operations:
        if container_runtime:
            # Volume size to create when using CRI-o, NOTE: IBM Cloud minimum is 20Gi
            emptydir_volume_size = "20Gi"

        sanitized_operation_name = self._sanitize_operation_name(operation.name)

        # Create pipeline operation
        # If operation is one of the "generic" set of NBs or scripts, construct custom ExecuteFileOp
        if isinstance(operation, GenericOperation):

            # Collect env variables
            pipeline_envs = self._collect_envs(
                operation, cos_secret=cos_secret, cos_username=cos_username, cos_password=cos_password
            )

            operation_artifact_archive = self._get_dependency_archive_name(operation)

            self.log.debug(f"Creating pipeline component:\n {operation} archive : {operation_artifact_archive}")

            target_ops[operation.id] = ExecuteFileOp(
                name=sanitized_operation_name,
                pipeline_name=pipeline_name,
                experiment_name=experiment_name,
                notebook=operation.filename,
                cos_endpoint=cos_endpoint,
                cos_bucket=cos_bucket,
                cos_directory=cos_directory,
                cos_dependencies_archive=operation_artifact_archive,
                pipeline_version=pipeline_version,
                pipeline_source=pipeline.source,
                pipeline_inputs=operation.inputs,
                pipeline_outputs=operation.outputs,
                pipeline_envs=pipeline_envs,
                emptydir_volume_size=emptydir_volume_size,
                cpu_request=operation.cpu,
                mem_request=operation.memory,
                gpu_limit=operation.gpu,
                workflow_engine=engine,
                image=operation.runtime_image,
                file_outputs={
                    "mlpipeline-metrics": f"{pipeline_envs['ELYRA_WRITABLE_CONTAINER_DIR']}/mlpipeline-metrics.json",  # noqa
                    "mlpipeline-ui-metadata": f"{pipeline_envs['ELYRA_WRITABLE_CONTAINER_DIR']}/mlpipeline-ui-metadata.json",  # noqa
                },
            )

            if operation.doc:
                target_ops[operation.id].add_pod_annotation("elyra/node-user-doc", operation.doc)

            # TODO Can we move all of this to apply to non-standard components as well? Test when servers are up
            if cos_secret and not export:
                target_ops[operation.id].apply(use_aws_secret(cos_secret))

            image_namespace = self._get_metadata_configuration(RuntimeImages.RUNTIME_IMAGES_SCHEMASPACE_ID)
            for image_instance in image_namespace:
                if image_instance.metadata["image_name"] == operation.runtime_image and image_instance.metadata.get(
                    "pull_policy"
                ):
                    target_ops[operation.id].container.set_image_pull_policy(image_instance.metadata["pull_policy"])

            self.log_pipeline_info(
                pipeline_name,
                f"processing operation dependencies for id: {operation.id}",
                operation_name=operation.name,
            )

            self._upload_dependencies_to_object_store(runtime_configuration, cos_directory, operation)

        # If operation is a "non-standard" component, load its spec and create the operation with a factory function
        else:
            # Retrieve component from cache
            component = ComponentCache.instance().get_component(self._type, operation.classifier)

            # Convert the user-entered value of certain properties according to their type
            for component_property in component.properties:
                # Get corresponding property's value from parsed pipeline
                property_value = operation.component_params.get(component_property.ref)

                self.log.debug(
                    f"Processing component parameter '{component_property.name}' "
                    f"of type '{component_property.data_type}'"
                )

                if component_property.data_type == "inputpath":
                    output_node_id = property_value["value"]
                    output_node_parameter_key = property_value["option"].replace("elyra_output_", "")
                    operation.component_params[component_property.ref] = target_ops[output_node_id].outputs[
                        output_node_parameter_key
                    ]
                elif component_property.data_type == "inputvalue":
                    active_property = property_value["activeControl"]
                    active_property_value = property_value.get(active_property, None)

                    # If the value is not found, assign it the default value assigned in parser
                    if active_property_value is None:
                        active_property_value = component_property.value

                    if isinstance(active_property_value, dict) and set(active_property_value.keys()) == {
                        "value",
                        "option",
                    }:
                        output_node_id = active_property_value["value"]
                        output_node_parameter_key = active_property_value["option"].replace("elyra_output_", "")
                        operation.component_params[component_property.ref] = target_ops[output_node_id].outputs[
                            output_node_parameter_key
                        ]
                    elif component_property.default_data_type == "dictionary":
                        processed_value = self._process_dictionary_value(active_property_value)
                        operation.component_params[component_property.ref] = processed_value
                    elif component_property.default_data_type == "list":
                        processed_value = self._process_list_value(active_property_value)
                        operation.component_params[component_property.ref] = processed_value
                    else:
                        operation.component_params[component_property.ref] = active_property_value

            # Build component task factory
            try:
                factory_function = components.load_component_from_text(component.definition)
            except Exception as e:
                # TODO Fix error messaging and break exceptions down into categories
                self.log.error(f"Error loading component spec for {operation.name}: {str(e)}")
                raise RuntimeError(f"Error loading component spec for {operation.name}.")

            # Add factory function, which returns a ContainerOp task instance, to pipeline operation dict
            try:
                comp_spec_inputs = [
                    inputs.name.lower().replace(" ", "_") for inputs in factory_function.component_spec.inputs
                ]

                # Remove inputs and outputs from params dict
                # TODO: need to have way to retrieve only required params
                parameter_removal_list = ["inputs", "outputs"]
                for component_param in operation.component_params_as_dict.keys():
                    if component_param not in comp_spec_inputs:
                        parameter_removal_list.append(component_param)

                for parameter in parameter_removal_list:
                    operation.component_params_as_dict.pop(parameter, None)

                # Create ContainerOp instance and assign appropriate user-provided name
                sanitized_component_params = {
                    self._sanitize_param_name(name): value
                    for name, value in operation.component_params_as_dict.items()
                }
                container_op = factory_function(**sanitized_component_params)
                container_op.set_display_name(operation.name)

                if operation.doc:
                    container_op.add_pod_annotation("elyra/node-user-doc", operation.doc)

                target_ops[operation.id] = container_op
            except Exception as e:
                # TODO Fix error messaging and break exceptions down into categories
                self.log.error(f"Error constructing component {operation.name}: {str(e)}")
                raise RuntimeError(f"Error constructing component {operation.name}.")

    # Process dependencies after all the operations have been created
    for operation in pipeline.operations.values():
        op = target_ops[operation.id]
        for parent_operation_id in operation.parent_operation_ids:
            parent_op = target_ops[parent_operation_id]  # Parent Operation
            op.after(parent_op)

    self.log_pipeline_info(pipeline_name, "pipeline dependencies processed", duration=(time.time() - t0_all))

    return target_ops
def ground_truth_test(region='us-west-2',
                      team_name='ground-truth-demo-team',
                      team_description='Team for mini image classification labeling job',
                      user_pool='',
                      user_groups='',
                      client_id='',
                      ground_truth_train_job_name='mini-image-classification-demo-train',
                      ground_truth_validation_job_name='mini-image-classification-demo-validation',
                      ground_truth_label_attribute_name='category',
                      ground_truth_train_manifest_location='s3://your-bucket-name/gt-demo-images/ground-truth-demo/train.manifest',
                      ground_truth_validation_manifest_location='s3://your-bucket-name/gt-demo-images/ground-truth-demo/validation.manifest',
                      ground_truth_output_location='s3://your-bucket-name/gt-demo-images/ground-truth-demo/output',
                      ground_truth_task_type='image classification',
                      ground_truth_worker_type='private',
                      ground_truth_label_category_config='s3://your-bucket-name/gt-demo-images/ground-truth-demo/class_labels.json',
                      ground_truth_ui_template='s3://your-bucket-name/gt-demo-images/ground-truth-demo/instructions.template',
                      ground_truth_title='Mini image classification',
                      ground_truth_description='Test for Ground Truth KFP component',
                      ground_truth_num_workers_per_object='1',
                      ground_truth_time_limit='30',
                      ground_truth_task_availibility='3600',
                      ground_truth_max_concurrent_tasks='20',
                      training_algorithm_name='image classification',
                      training_input_mode='Pipe',
                      training_hyperparameters='{"num_classes": "2", "num_training_samples": "14", "mini_batch_size": "2"}',
                      training_channels='[{"ChannelName": "train", \
                                           "DataSource": { \
                                               "S3DataSource": { \
                                                   "S3Uri": "", \
                                                   "S3DataType": "AugmentedManifestFile", \
                                                   "S3DataDistributionType": "FullyReplicated", \
                                                   "AttributeNames": ["source-ref", "category"] \
                                               } \
                                           }, \
                                           "ContentType": "application/x-recordio", \
                                           "CompressionType": "None", \
                                           "RecordWrapperType": "RecordIO"}, \
                                          {"ChannelName": "validation", \
                                           "DataSource": { \
                                               "S3DataSource": { \
                                                   "S3Uri": "", \
                                                   "S3DataType": "AugmentedManifestFile", \
                                                   "S3DataDistributionType": "FullyReplicated", \
                                                   "AttributeNames": ["source-ref", "category"] \
                                               } \
                                           }, \
                                           "ContentType": "application/x-recordio", \
                                           "CompressionType": "None", \
                                           "RecordWrapperType": "RecordIO"}]',
                      training_output_location='s3://your-bucket-name/gt-demo-images/training-output',
                      training_instance_type='ml.p2.xlarge',
                      training_instance_count='1',
                      training_volume_size='50',
                      training_max_run_time='3600',
                      role_arn=''):
    workteam = sagemaker_workteam_op(
        region=region, team_name=team_name, description=team_description,
        user_pool=user_pool, user_groups=user_groups, client_id=client_id).apply(
            use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))

    ground_truth_train = sagemaker_gt_op(
        region=region, role=role_arn, job_name=ground_truth_train_job_name,
        label_attribute_name=ground_truth_label_attribute_name,
        manifest_location=ground_truth_train_manifest_location,
        output_location=ground_truth_output_location,
        task_type=ground_truth_task_type, worker_type=ground_truth_worker_type,
        workteam_arn=workteam.output,
        label_category_config=ground_truth_label_category_config,
        ui_template=ground_truth_ui_template, title=ground_truth_title,
        description=ground_truth_description,
        num_workers_per_object=ground_truth_num_workers_per_object,
        time_limit=ground_truth_time_limit,
        task_availibility=ground_truth_task_availibility,
        max_concurrent_tasks=ground_truth_max_concurrent_tasks).apply(
            use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))

    ground_truth_validation = sagemaker_gt_op(
        region=region, role=role_arn, job_name=ground_truth_validation_job_name,
        label_attribute_name=ground_truth_label_attribute_name,
        manifest_location=ground_truth_validation_manifest_location,
        output_location=ground_truth_output_location,
        task_type=ground_truth_task_type, worker_type=ground_truth_worker_type,
        workteam_arn=workteam.output,
        label_category_config=ground_truth_label_category_config,
        ui_template=ground_truth_ui_template, title=ground_truth_title,
        description=ground_truth_description,
        num_workers_per_object=ground_truth_num_workers_per_object,
        time_limit=ground_truth_time_limit,
        task_availibility=ground_truth_task_availibility,
        max_concurrent_tasks=ground_truth_max_concurrent_tasks).apply(
            use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))

    training = sagemaker_train_op(
        region=region, algorithm_name=training_algorithm_name,
        training_input_mode=training_input_mode,
        hyperparameters=training_hyperparameters, channels=training_channels,
        data_location_1=ground_truth_train.outputs['output_manifest_location'],
        data_location_2=ground_truth_validation.outputs['output_manifest_location'],
        instance_type=training_instance_type, instance_count=training_instance_count,
        volume_size=training_volume_size, max_run_time=training_max_run_time,
        model_artifact_path=training_output_location, role=role_arn).apply(
            use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
def ground_truth_test(region='us-west-2',
                      team_name='ground-truth-demo-team',
                      team_description='Team for mini image classification labeling job',
                      user_pool='',
                      user_groups='',
                      client_id='',
                      ground_truth_train_job_name='mini-image-classification-demo-train',
                      ground_truth_validation_job_name='mini-image-classification-demo-validation',
                      ground_truth_label_attribute_name='category',
                      ground_truth_train_manifest_location='s3://your-bucket-name/mini-image-classification/ground-truth-demo/train.manifest',
                      ground_truth_validation_manifest_location='s3://your-bucket-name/mini-image-classification/ground-truth-demo/validation.manifest',
                      ground_truth_output_location='s3://your-bucket-name/mini-image-classification/ground-truth-demo/output',
                      ground_truth_task_type='image classification',
                      ground_truth_worker_type='private',
                      ground_truth_label_category_config='s3://your-bucket-name/mini-image-classification/ground-truth-demo/class_labels.json',
                      ground_truth_ui_template='s3://your-bucket-name/mini-image-classification/ground-truth-demo/instructions.template',
                      ground_truth_title='Mini image classification',
                      ground_truth_description='Test for Ground Truth KFP component',
                      ground_truth_num_workers_per_object=1,
                      ground_truth_time_limit=30,
                      ground_truth_task_availibility=3600,
                      ground_truth_max_concurrent_tasks=20,
                      training_algorithm_name='image classification',
                      training_input_mode='Pipe',
                      training_hyperparameters={
                          "num_classes": "2",
                          "num_training_samples": "14",
                          "mini_batch_size": "2"
                      },
                      training_output_location='s3://your-bucket-name/mini-image-classification/training-output',
                      training_instance_type='ml.p2.xlarge',
                      training_instance_count=1,
                      training_volume_size=50,
                      training_max_run_time=3600,
                      role_arn=''):
    workteam = sagemaker_workteam_op(
        region=region, team_name=team_name, description=team_description,
        user_pool=user_pool, user_groups=user_groups, client_id=client_id).apply(
            use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))

    ground_truth_train = sagemaker_gt_op(
        region=region, role=role_arn, job_name=ground_truth_train_job_name,
        label_attribute_name=ground_truth_label_attribute_name,
        manifest_location=ground_truth_train_manifest_location,
        output_location=ground_truth_output_location,
        task_type=ground_truth_task_type, worker_type=ground_truth_worker_type,
        workteam_arn=workteam.output,
        label_category_config=ground_truth_label_category_config,
        ui_template=ground_truth_ui_template, title=ground_truth_title,
        description=ground_truth_description,
        num_workers_per_object=ground_truth_num_workers_per_object,
        time_limit=ground_truth_time_limit,
        task_availibility=ground_truth_task_availibility,
        max_concurrent_tasks=ground_truth_max_concurrent_tasks).apply(
            use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))

    ground_truth_validation = sagemaker_gt_op(
        region=region, role=role_arn, job_name=ground_truth_validation_job_name,
        label_attribute_name=ground_truth_label_attribute_name,
        manifest_location=ground_truth_validation_manifest_location,
        output_location=ground_truth_output_location,
        task_type=ground_truth_task_type, worker_type=ground_truth_worker_type,
        workteam_arn=workteam.output,
        label_category_config=ground_truth_label_category_config,
        ui_template=ground_truth_ui_template, title=ground_truth_title,
        description=ground_truth_description,
        num_workers_per_object=ground_truth_num_workers_per_object,
        time_limit=ground_truth_time_limit,
        task_availibility=ground_truth_task_availibility,
        max_concurrent_tasks=ground_truth_max_concurrent_tasks).apply(
            use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))

    channelObj['ChannelName'] = 'train'
    channelObj['DataSource']['S3DataSource']['S3Uri'] = str(
        ground_truth_train.outputs['output_manifest_location'])
    channelObjList.append(copy.deepcopy(channelObj))

    channelObj['ChannelName'] = 'validation'
    channelObj['DataSource']['S3DataSource']['S3Uri'] = str(
        ground_truth_validation.outputs['output_manifest_location'])
    channelObjList.append(copy.deepcopy(channelObj))

    training = sagemaker_train_op(
        region=region, algorithm_name=training_algorithm_name,
        training_input_mode=training_input_mode,
        hyperparameters=training_hyperparameters,
        channels=json.dumps(channelObjList),
        instance_type=training_instance_type, instance_count=training_instance_count,
        volume_size=training_volume_size, max_run_time=training_max_run_time,
        model_artifact_path=training_output_location, role=role_arn).apply(
            use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
def s3_sync_pipeline():
    echo_task = s3_sync().apply(
        use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
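# Any of these pipeline functions can be compiled to an Argo package or submitted
# directly with the standard KFP client. A brief sketch follows; the endpoint URL is a
# placeholder and the pipeline function is assumed to carry the usual @dsl.pipeline
# decorator as in the original samples.
import kfp

if __name__ == '__main__':
    # Compile to a package that can be uploaded through the KFP UI ...
    kfp.compiler.Compiler().compile(s3_sync_pipeline, 's3_sync_pipeline.yaml')
    # ... or submit a run directly against a reachable KFP endpoint.
    client = kfp.Client(host='http://localhost:8080')  # placeholder endpoint
    client.create_run_from_pipeline_func(s3_sync_pipeline, arguments={})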
def _cc_pipeline(self, pipeline, pipeline_name, pipeline_version='',
                 experiment_name='', cos_directory=None, export=False):
    runtime_configuration = self._get_metadata_configuration(
        namespace=MetadataManager.NAMESPACE_RUNTIMES, name=pipeline.runtime_config)

    cos_endpoint = runtime_configuration.metadata['cos_endpoint']
    cos_username = runtime_configuration.metadata['cos_username']
    cos_password = runtime_configuration.metadata['cos_password']
    cos_secret = runtime_configuration.metadata.get('cos_secret')
    if cos_directory is None:
        cos_directory = pipeline_name
    cos_bucket = runtime_configuration.metadata['cos_bucket']

    self.log_pipeline_info(
        pipeline_name,
        f"processing pipeline dependencies to: {cos_endpoint} "
        f"bucket: {cos_bucket} folder: {cos_directory}")
    t0_all = time.time()

    emptydir_volume_size = ''
    container_runtime = bool(os.getenv('CRIO_RUNTIME', 'False').lower() == 'true')

    # Create dictionary that maps component Id to its ContainerOp instance
    notebook_ops = {}

    # Sort operations based on dependency graph (topological order)
    sorted_operations = PipelineProcessor._sort_operations(pipeline.operations)

    # All previous operation outputs should be propagated throughout the pipeline.
    # In order to process this recursively, the current operation's inputs should be combined
    # from its parent's inputs (which, themselves are derived from the outputs of their parent)
    # and its parent's outputs.
    PipelineProcessor._propagate_operation_inputs_outputs(pipeline, sorted_operations)

    for operation in sorted_operations:
        operation_artifact_archive = self._get_dependency_archive_name(operation)

        self.log.debug(
            "Creating pipeline component :\n {op} archive : {archive}".format(
                op=operation, archive=operation_artifact_archive))

        if container_runtime:
            # Volume size to create when using CRI-o, NOTE: IBM Cloud minimum is 20Gi
            emptydir_volume_size = '20Gi'

        # Collect env variables
        pipeline_envs = dict()
        if not cos_secret:
            pipeline_envs['AWS_ACCESS_KEY_ID'] = cos_username
            pipeline_envs['AWS_SECRET_ACCESS_KEY'] = cos_password
        # Convey pipeline logging enablement to operation
        pipeline_envs['ELYRA_ENABLE_PIPELINE_INFO'] = str(self.enable_pipeline_info)
        # Setting identifies a writable directory in the container image.
        # Only Unix-style path spec is supported.
        pipeline_envs['ELYRA_WRITABLE_CONTAINER_DIR'] = self.WCD

        if operation.env_vars:
            for env_var in operation.env_vars:
                # Strip any of these special characters from both key and value
                # Splits on the first occurrence of '='
                result = [x.strip(' \'\"') for x in env_var.split('=', 1)]
                # Should be non empty key with a value
                if len(result) == 2 and result[0] != '':
                    pipeline_envs[result[0]] = result[1]

        sanitized_operation_name = self._sanitize_operation_name(operation.name)

        # create pipeline operation
        notebook_ops[operation.id] = NotebookOp(
            name=sanitized_operation_name,
            pipeline_name=pipeline_name,
            experiment_name=experiment_name,
            notebook=operation.filename,
            cos_endpoint=cos_endpoint,
            cos_bucket=cos_bucket,
            cos_directory=cos_directory,
            cos_dependencies_archive=operation_artifact_archive,
            pipeline_version=pipeline_version,
            pipeline_source=pipeline.source,
            pipeline_inputs=operation.inputs,
            pipeline_outputs=operation.outputs,
            pipeline_envs=pipeline_envs,
            emptydir_volume_size=emptydir_volume_size,
            cpu_request=operation.cpu,
            mem_request=operation.memory,
            gpu_limit=operation.gpu,
            image=operation.runtime_image,
            file_outputs={
                'mlpipeline-metrics': '{}/mlpipeline-metrics.json'.format(
                    pipeline_envs['ELYRA_WRITABLE_CONTAINER_DIR']),
                'mlpipeline-ui-metadata': '{}/mlpipeline-ui-metadata.json'.format(
                    pipeline_envs['ELYRA_WRITABLE_CONTAINER_DIR'])
            })

        if cos_secret and not export:
            notebook_ops[operation.id].apply(use_aws_secret(cos_secret))

        image_namespace = self._get_metadata_configuration(
            namespace=MetadataManager.NAMESPACE_RUNTIME_IMAGES)
        for image_instance in image_namespace:
            if image_instance.metadata['image_name'] == operation.runtime_image and \
                    image_instance.metadata.get('pull_policy'):
                notebook_ops[operation.id].container.set_image_pull_policy(
                    image_instance.metadata['pull_policy'])

        self.log_pipeline_info(
            pipeline_name,
            f"processing operation dependencies for id: {operation.id}",
            operation_name=operation.name)

        self._upload_dependencies_to_object_store(runtime_configuration,
                                                  cos_directory, operation)

    # Process dependencies after all the operations have been created
    for operation in pipeline.operations.values():
        op = notebook_ops[operation.id]
        for parent_operation_id in operation.parent_operations:
            parent_op = notebook_ops[parent_operation_id]  # Parent Operation
            op.after(parent_op)

    self.log_pipeline_info(pipeline_name,
                           "pipeline dependencies processed",
                           duration=(time.time() - t0_all))

    return notebook_ops
def hpo_test(
    region='us-west-2',
    hpo_job_name='HPO-kmeans-sample',
    image='',
    algorithm_name='K-Means',
    training_input_mode='File',
    metric_definitions='{}',
    strategy='Bayesian',
    metric_name='test:msd',
    metric_type='Minimize',
    early_stopping_type='Off',
    static_parameters='{"k": "10", "feature_dim": "784"}',
    integer_parameters='[{"Name": "mini_batch_size", "MinValue": "450", "MaxValue": "550"}, \
                         {"Name": "extra_center_factor", "MinValue": "10", "MaxValue": "20"}]',
    continuous_parameters='[]',
    categorical_parameters='[{"Name": "init_method", "Values": ["random", "kmeans++"]}]',
    channels='[{"ChannelName": "train", \
                "DataSource": { \
                    "S3DataSource": { \
                        "S3Uri": "s3://kubeflow-pipeline-data/mnist_kmeans_example/data", \
                        "S3DataType": "S3Prefix", \
                        "S3DataDistributionType": "FullyReplicated" \
                    } \
                }, \
                "ContentType": "", \
                "CompressionType": "None", \
                "RecordWrapperType": "None", \
                "InputMode": "File"}, \
               {"ChannelName": "test", \
                "DataSource": { \
                    "S3DataSource": { \
                        "S3Uri": "s3://kubeflow-pipeline-data/mnist_kmeans_example/data", \
                        "S3DataType": "S3Prefix", \
                        "S3DataDistributionType": "FullyReplicated" \
                    } \
                }, \
                "ContentType": "", \
                "CompressionType": "None", \
                "RecordWrapperType": "None", \
                "InputMode": "File"}]',
    output_location='s3://kubeflow-pipeline-data/mnist_kmeans_example/output',
    output_encryption_key='',
    instance_type='ml.p2.16xlarge',
    instance_count='1',
    volume_size='50',
    max_num_jobs='1',
    max_parallel_jobs='1',
    resource_encryption_key='',
    max_run_time='3600',
    vpc_security_group_ids='',
    vpc_subnets='',
    network_isolation='True',
    traffic_encryption='False',
    warm_start_type='',
    parent_hpo_jobs='',
    tags='{}',
    role_arn='',
):
    training = sagemaker_hpo_op(
        region=region,
        job_name=hpo_job_name,
        image=image,
        training_input_mode=training_input_mode,
        algorithm_name=algorithm_name,
        metric_definitions=metric_definitions,
        strategy=strategy,
        metric_name=metric_name,
        metric_type=metric_type,
        early_stopping_type=early_stopping_type,
        static_parameters=static_parameters,
        integer_parameters=integer_parameters,
        continuous_parameters=continuous_parameters,
        categorical_parameters=categorical_parameters,
        channels=channels,
        output_location=output_location,
        output_encryption_key=output_encryption_key,
        instance_type=instance_type,
        instance_count=instance_count,
        volume_size=volume_size,
        max_num_jobs=max_num_jobs,
        max_parallel_jobs=max_parallel_jobs,
        resource_encryption_key=resource_encryption_key,
        max_run_time=max_run_time,
        vpc_security_group_ids=vpc_security_group_ids,
        vpc_subnets=vpc_subnets,
        network_isolation=network_isolation,
        traffic_encryption=traffic_encryption,
        warm_start_type=warm_start_type,
        parent_hpo_jobs=parent_hpo_jobs,
        tags=tags,
        role=role_arn,
    ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID',
                           'AWS_SECRET_ACCESS_KEY'))
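# Hedged usage sketch for the hpo_test pipeline above: in the upstream-style
# samples such a function is normally decorated with @kfp.dsl.pipeline and then
# compiled into a package that can be uploaded to Kubeflow Pipelines. The
# decorator and output file name here are assumptions; Compiler().compile()
# also accepts an undecorated pipeline function in KFP v1.
import kfp

if __name__ == '__main__':
    kfp.compiler.Compiler().compile(hpo_test, __file__ + '.zip')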
def apply_config_map_and_aws_secret(op):
    return (op.apply(use_config_map(configmap))
              .apply(use_aws_secret())
              .set_image_pull_policy('Always'))
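# Hedged usage sketch for the helper above. `configmap` and `use_config_map`
# come from the originating code base and are not shown here; the ContainerOp
# below is a placeholder used only to illustrate where the helper is applied.
from kfp import dsl


def example_pipeline():
    train = dsl.ContainerOp(name='train',
                            image='python:3.8',
                            command=['python', 'train.py'])
    # Mounts the config map, injects the AWS credentials secret, and forces
    # the image to be re-pulled on every run.
    train = apply_config_map_and_aws_secret(train)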
def _cc_pipeline(self,
                 pipeline,
                 pipeline_name,
                 pipeline_version='',
                 experiment_name='',
                 cos_directory=None,
                 export=False):

    runtime_configuration = self._get_metadata_configuration(
        namespace=MetadataManager.NAMESPACE_RUNTIMES,
        name=pipeline.runtime_config)

    cos_endpoint = runtime_configuration.metadata['cos_endpoint']
    cos_username = runtime_configuration.metadata['cos_username']
    cos_password = runtime_configuration.metadata['cos_password']
    cos_secret = runtime_configuration.metadata.get('cos_secret')
    if cos_directory is None:
        cos_directory = pipeline_name
    cos_bucket = runtime_configuration.metadata['cos_bucket']

    self.log_pipeline_info(
        pipeline_name,
        f"processing pipeline dependencies to: {cos_endpoint} "
        f"bucket: {cos_bucket} folder: {cos_directory}")
    t0_all = time.time()

    emptydir_volume_size = ''
    container_runtime = bool(
        os.getenv('CRIO_RUNTIME', 'False').lower() == 'true')

    # Create dictionary that maps component Id to its ContainerOp instance
    notebook_ops = {}

    # Sort operations based on dependency graph (topological order)
    sorted_operations = PipelineProcessor._sort_operations(
        pipeline.operations)

    # All previous operation outputs should be propagated throughout the pipeline.
    # In order to process this recursively, the current operation's inputs should be combined
    # from its parent's inputs (which, themselves are derived from the outputs of their parent)
    # and its parent's outputs.
    PipelineProcessor._propagate_operation_inputs_outputs(
        pipeline, sorted_operations)

    for operation in sorted_operations:

        if container_runtime:
            # Volume size to create when using CRI-o, NOTE: IBM Cloud minimum is 20Gi
            emptydir_volume_size = '20Gi'

        # Collect env variables
        pipeline_envs = self._collect_envs(operation,
                                           cos_secret=cos_secret,
                                           cos_username=cos_username,
                                           cos_password=cos_password)

        sanitized_operation_name = self._sanitize_operation_name(
            operation.name)

        # Create pipeline operation
        # If operation is one of the "standard" set of NBs or scripts, construct custom NotebookOp
        if operation.classifier in [
                "execute-notebook-node", "execute-python-node",
                "execute-r-node"
        ]:
            operation_artifact_archive = self._get_dependency_archive_name(
                operation)

            self.log.debug(
                "Creating pipeline component :\n {op} archive : {archive}".format(
                    op=operation, archive=operation_artifact_archive))

            notebook_ops[operation.id] = NotebookOp(
                name=sanitized_operation_name,
                pipeline_name=pipeline_name,
                experiment_name=experiment_name,
                notebook=operation.filename,
                cos_endpoint=cos_endpoint,
                cos_bucket=cos_bucket,
                cos_directory=cos_directory,
                cos_dependencies_archive=operation_artifact_archive,
                pipeline_version=pipeline_version,
                pipeline_source=pipeline.source,
                pipeline_inputs=operation.inputs,
                pipeline_outputs=operation.outputs,
                pipeline_envs=pipeline_envs,
                emptydir_volume_size=emptydir_volume_size,
                cpu_request=operation.cpu,
                mem_request=operation.memory,
                gpu_limit=operation.gpu,
                image=operation.runtime_image,
                file_outputs={
                    'mlpipeline-metrics':
                        '{}/mlpipeline-metrics.json'.format(
                            pipeline_envs['ELYRA_WRITABLE_CONTAINER_DIR']),
                    'mlpipeline-ui-metadata':
                        '{}/mlpipeline-ui-metadata.json'.format(
                            pipeline_envs['ELYRA_WRITABLE_CONTAINER_DIR'])
                })

            # TODO: Can we move all of this to apply to non-standard components as well?
            #       Test when servers are up
            if cos_secret and not export:
                notebook_ops[operation.id].apply(use_aws_secret(cos_secret))

            image_namespace = self._get_metadata_configuration(
                namespace=MetadataManager.NAMESPACE_RUNTIME_IMAGES)
            for image_instance in image_namespace:
                if image_instance.metadata['image_name'] == operation.runtime_image and \
                        image_instance.metadata.get('pull_policy'):
                    notebook_ops[operation.id].container. \
                        set_image_pull_policy(image_instance.metadata['pull_policy'])

            self.log_pipeline_info(
                pipeline_name,
                f"processing operation dependencies for id: {operation.id}",
                operation_name=operation.name)

            self._upload_dependencies_to_object_store(
                runtime_configuration, cos_directory, operation)

        # If operation is a "non-standard" component, load its spec and create operation with factory function
        else:
            component_source = {}
            component_source[
                operation.component_source_type] = operation.component_source

            # Build component task factory
            try:
                factory_function = components.load_component(
                    **component_source)
            except Exception:
                # TODO: Fix error messaging and break exceptions down into categories
                self.log.error(
                    f"There was an error while loading component spec for {operation.name}.")
                raise RuntimeError(
                    f"There was an error while loading component spec for {operation.name}.")

            # Add factory function, which returns a ContainerOp task instance, to pipeline operation dict
            try:
                notebook_ops[operation.id] = factory_function(
                    **operation.component_params)
            except Exception:
                # TODO: Fix error messaging and break exceptions down into categories
                self.log.error(
                    f"There was an error while constructing component {operation.name}.")
                raise RuntimeError(
                    f"There was an error while constructing component {operation.name}.")

    # Process dependencies after all the operations have been created
    for operation in pipeline.operations.values():
        op = notebook_ops[operation.id]
        for parent_operation_id in operation.parent_operations:
            parent_op = notebook_ops[parent_operation_id]  # Parent Operation
            op.after(parent_op)

    self.log_pipeline_info(pipeline_name,
                           "pipeline dependencies processed",
                           duration=(time.time() - t0_all))

    return notebook_ops
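# Hedged sketch of the non-standard component path above: kfp.components.load_component()
# accepts exactly one of `filename`, `url`, or `text`, which is why component_source is
# keyed by the operation's component_source_type. The spec file name and parameter values
# below are illustrative assumptions, not taken from the snippet.
from kfp import components

component_source = {'filename': 'filter_text_using_shell_and_grep.yaml'}
factory_function = components.load_component(**component_source)

# Calling the factory returns a ContainerOp-style task, just like NotebookOp above.
filter_task = factory_function(text='input.txt', pattern='error')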
def hpo_test(
    region='us-west-2',
    hpo_job_name='HPO-kmeans-sample',
    image='',
    algorithm_name='K-Means',
    training_input_mode='File',
    metric_definitions={},
    strategy='Bayesian',
    metric_name='test:msd',
    metric_type='Minimize',
    early_stopping_type='Off',
    static_parameters={"k": "10", "feature_dim": "784"},
    integer_parameters=[
        {"Name": "mini_batch_size", "MinValue": "450", "MaxValue": "550"},
        {"Name": "extra_center_factor", "MinValue": "10", "MaxValue": "20"},
    ],
    continuous_parameters=[],
    categorical_parameters=[
        {"Name": "init_method", "Values": ["random", "kmeans++"]},
    ],
    channels=channelObjList,
    output_location='s3://kubeflow-pipeline-data/mnist_kmeans_example/output',
    output_encryption_key='',
    instance_type='ml.p2.16xlarge',
    instance_count=1,
    volume_size=50,
    max_num_jobs=1,
    max_parallel_jobs=1,
    resource_encryption_key='',
    max_run_time=3600,
    vpc_security_group_ids='',
    vpc_subnets='',
    endpoint_url='',
    network_isolation=True,
    traffic_encryption=False,
    warm_start_type='',
    parent_hpo_jobs='',
    spot_instance=False,
    max_wait_time=3600,
    checkpoint_config={},
    tags={},
    role_arn='',
):
    training = sagemaker_hpo_op(
        region=region,
        endpoint_url=endpoint_url,
        job_name=hpo_job_name,
        image=image,
        training_input_mode=training_input_mode,
        algorithm_name=algorithm_name,
        metric_definitions=metric_definitions,
        strategy=strategy,
        metric_name=metric_name,
        metric_type=metric_type,
        early_stopping_type=early_stopping_type,
        static_parameters=static_parameters,
        integer_parameters=integer_parameters,
        continuous_parameters=continuous_parameters,
        categorical_parameters=categorical_parameters,
        channels=channels,
        output_location=output_location,
        output_encryption_key=output_encryption_key,
        instance_type=instance_type,
        instance_count=instance_count,
        volume_size=volume_size,
        max_num_jobs=max_num_jobs,
        max_parallel_jobs=max_parallel_jobs,
        resource_encryption_key=resource_encryption_key,
        max_run_time=max_run_time,
        vpc_security_group_ids=vpc_security_group_ids,
        vpc_subnets=vpc_subnets,
        network_isolation=network_isolation,
        traffic_encryption=traffic_encryption,
        warm_start_type=warm_start_type,
        parent_hpo_jobs=parent_hpo_jobs,
        spot_instance=spot_instance,
        max_wait_time=max_wait_time,
        checkpoint_config=checkpoint_config,
        tags=tags,
        role=role_arn,
    ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID',
                           'AWS_SECRET_ACCESS_KEY'))
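# The hpo_test variant above references channelObjList without defining it; in the
# originating sample such a list would be built before the pipeline definition so it
# can serve as a default argument. A hedged sketch of how it could be constructed,
# mirroring the string-typed channels in the earlier hpo_test variant (the S3 URI is
# taken from that variant; the rest is an assumption about the upstream sample):
import copy

channelObj = {
    'ChannelName': '',
    'DataSource': {
        'S3DataSource': {
            'S3Uri': '',
            'S3DataType': 'S3Prefix',
            'S3DataDistributionType': 'FullyReplicated',
        }
    },
    'ContentType': '',
    'CompressionType': 'None',
    'RecordWrapperType': 'None',
    'InputMode': 'File',
}

channelObjList = []
for channel_name in ['train', 'test']:
    channel = copy.deepcopy(channelObj)
    channel['ChannelName'] = channel_name
    channel['DataSource']['S3DataSource']['S3Uri'] = \
        's3://kubeflow-pipeline-data/mnist_kmeans_example/data'
    channelObjList.append(channel)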