def workflow(client):
    workflow = Workflow(name=state_machine_name,
                        definition=definition,
                        role=role_arn,
                        client=client)
    workflow.create()
    return workflow
def test_workflow_update_when_statemachinearn_is_none(client):
    workflow = Workflow(name=state_machine_name,
                        definition=definition,
                        role=role_arn,
                        client=client)
    new_definition = steps.Pass('HelloWorld')
    with pytest.raises(WorkflowNotFound):
        workflow.update(definition=new_definition)
def get_existing_monitor_pipeline(workflow_arn):
    """Create a dummy implementation to get the existing data pipeline."""
    data_pipeline = Workflow(
        name="data_pipeline_name",
        definition=Chain([]),
        role="workflow_execution_role",
    )
    return data_pipeline.attach(workflow_arn)
def get_existing_training_pipeline(workflow_arn):
    """Create a dummy implementation to get the existing training pipeline."""
    training_pipeline = Workflow(
        name="training_pipeline_name",
        definition=Chain([]),
        role="workflow_execution_role",
    )
    return training_pipeline.attach(workflow_arn)
def get_existing_inference_pipeline(workflow_arn):
    """Create a dummy implementation to get the existing inference pipeline.

    TODO: This could be a good PR for the SDK.
    """
    inference_pipeline = Workflow(
        name="inference_pipeline_name",
        definition=Chain([]),
        role="workflow_execution_role",
    )
    return inference_pipeline.attach(workflow_arn)
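# Usage sketch for the attach-based getters above (a hedged example: the ARN is
# a placeholder, and an AWS session with Step Functions permissions is assumed):
#
# workflow_arn = "arn:aws:states:us-east-1:123456789012:stateMachine:inference_pipeline_name"
# inference_pipeline = get_existing_inference_pipeline(workflow_arn)
# execution = inference_pipeline.execute(inputs={})
# print(execution.describe()["status"])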
def _build_workflow(self):
    """Create a Step Functions workflow from the chain_of_tasks."""
    logger.debug(
        f"creating a chain from all the different steps. \n {self.chain_of_tasks}"
    )
    workflow_definition = steps.Chain(self.chain_of_tasks)
    logger.debug(f"creating a workflow with name {self.unique_name}")
    self.client = boto3.client("stepfunctions")
    self.workflow = Workflow(
        name=self.unique_name,
        definition=workflow_definition,
        role=self.role.role_arn,
        client=self.client,
    )
def create_workflow_and_check_definition(workflow_graph, workflow_name,
                                         sfn_client, sfn_role_arn):
    # Create workflow
    workflow = Workflow(name=workflow_name,
                        definition=workflow_graph,
                        role=sfn_role_arn,
                        client=sfn_client)
    state_machine_arn = workflow.create()

    # Check that the deployed workflow definition matches the local one
    state_machine_desc = sfn_client.describe_state_machine(
        stateMachineArn=state_machine_arn)
    assert workflow.definition.to_dict() == json.loads(
        state_machine_desc.get('definition'))

    return workflow
def workflow(client):
    execution_input = ExecutionInput()

    test_step_01 = Pass(state_id='StateOne',
                        parameters={
                            'ParamA': execution_input['Key02']['Key03'],
                            'ParamD': execution_input['Key01']['Key03'],
                        })

    test_step_02 = Pass(state_id='StateTwo',
                        parameters={
                            'ParamC': execution_input["Key05"],
                            "ParamB": "SampleValueB",
                            "ParamE": test_step_01.output()["Response"]["Key04"]
                        })

    test_step_03 = Pass(state_id='StateThree',
                        parameters={
                            'ParamG': "SampleValueG",
                            "ParamF": execution_input["Key06"],
                            "ParamH": "SampleValueH",
                            "ParamI": test_step_02.output()
                        })

    workflow_definition = Chain([test_step_01, test_step_02, test_step_03])
    workflow = Workflow(name='TestWorkflow',
                        definition=workflow_definition,
                        role='testRoleArn',
                        execution_input=execution_input,
                        client=client)
    return workflow
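# A hypothetical execution of the fixture's workflow: the nested input shape
# below mirrors the ExecutionInput references in the Pass states above.
#
# sample_inputs = {
#     'Key01': {'Key03': 'value-d'},  # feeds ParamD in StateOne
#     'Key02': {'Key03': 'value-a'},  # feeds ParamA in StateOne
#     'Key05': 'value-c',             # feeds ParamC in StateTwo
#     'Key06': 'value-f',             # feeds ParamF in StateThree
# }
# execution = workflow(client).execute(inputs=sample_inputs)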
def main():
    stepfunctions.set_stream_logger(level=logging.INFO)
    workflow_execution_role = 'arn:aws:iam::829044821271:role/StepFunctionsWorkflowExecutionRole'

    # Load the workflow and job names
    with open('./stepfunctions_name.json', 'r') as f:
        stepfunctions_name = json.load(f)
    with open('./face_clip/aws_batch/batch_names.json', 'r') as f:
        face_clip_name = json.load(f)
    with open('./tag_extraction/aws_batch/batch_names.json', 'r') as f:
        tag_extraction_name = json.load(f)

    # Define steps
    face_clip_step = steps.BatchSubmitJobStep(
        state_id='Face Clip Step',
        parameters={
            'JobDefinition': face_clip_name['jobDefinition'],
            'JobName': face_clip_name['job'],
            'JobQueue': face_clip_name['jobQueue']
        }
    )
    tag_extraction_step = steps.BatchSubmitJobStep(
        state_id='Tag Extraction Step',
        parameters={
            'JobDefinition': tag_extraction_name['jobDefinition'],
            'JobName': tag_extraction_name['job'],
            'JobQueue': tag_extraction_name['jobQueue']
        }
    )

    # Define the workflow
    chain_list = [face_clip_step, tag_extraction_step]
    workflow_definition = steps.Chain(chain_list)
    workflow = Workflow(
        name=stepfunctions_name['workflow'],
        definition=workflow_definition,
        role=workflow_execution_role,
    )

    # Create the workflow
    workflow.create()
def test_workflow_creation_failure_duplicate_state_ids(client):
    improper_definition = steps.Chain(
        [steps.Pass('HelloWorld'), steps.Succeed('HelloWorld')])
    with pytest.raises(ValueError):
        workflow = Workflow(name=state_machine_name,
                            definition=improper_definition,
                            role=role_arn,
                            client=client)
def test_catch_state_machine_creation(sfn_client, sfn_role_arn,
                                      training_job_parameters):
    catch_state_name = "TaskWithCatchState"
    custom_error = "CustomError"
    task_failed_error = "States.TaskFailed"
    all_fail_error = "States.ALL"
    custom_error_state_name = "Custom Error End"
    task_failed_state_name = "Task Failed End"
    all_error_state_name = "Catch All End"
    catch_state_result = "Catch Result"
    task_resource = "arn:aws:states:::sagemaker:createTrainingJob.sync"

    # Change the parameters to cause the task state to fail
    training_job_parameters["AlgorithmSpecification"]["TrainingImage"] = "not_an_image"

    asl_state_machine_definition = {
        "StartAt": catch_state_name,
        "States": {
            catch_state_name: {
                "Resource": task_resource,
                "Parameters": training_job_parameters,
                "Type": "Task",
                "End": True,
                "Catch": [
                    {
                        "ErrorEquals": [all_fail_error],
                        "Next": all_error_state_name
                    }
                ]
            },
            all_error_state_name: {
                "Type": "Pass",
                "Result": catch_state_result,
                "End": True
            }
        }
    }
    task = steps.Task(
        catch_state_name,
        parameters=training_job_parameters,
        resource=task_resource
    )
    task.add_catch(
        steps.Catch(
            error_equals=[all_fail_error],
            next_step=steps.Pass(all_error_state_name, result=catch_state_result)
        )
    )
    workflow = Workflow(
        'Test_Catch_Workflow',
        definition=task,
        role=sfn_role_arn
    )

    workflow_test_suite(sfn_client, workflow, asl_state_machine_definition,
                        catch_state_result)
def test_parallel_state_machine_creation(sfn_client, sfn_role_arn):
    parallel_state_name = "Parallel"
    left_pass_name = "Left Pass"
    right_pass_name = "Right Pass"
    final_state_name = "Final State"
    parallel_state_result = "Parallel Result"

    asl_state_machine_definition = {
        "StartAt": parallel_state_name,
        "States": {
            parallel_state_name: {
                "Type": "Parallel",
                "Next": final_state_name,
                "Branches": [
                    {
                        "StartAt": left_pass_name,
                        "States": {
                            left_pass_name: {
                                "Type": "Pass",
                                "End": True
                            }
                        }
                    },
                    {
                        "StartAt": right_pass_name,
                        "States": {
                            right_pass_name: {
                                "Type": "Pass",
                                "End": True
                            }
                        }
                    }
                ]
            },
            final_state_name: {
                "Type": "Pass",
                "Result": parallel_state_result,
                "End": True
            }
        }
    }
    parallel_waits = steps.Parallel(parallel_state_name)
    parallel_waits.add_branch(steps.Pass(left_pass_name))
    parallel_waits.add_branch(steps.Pass(right_pass_name))

    definition = steps.Chain([
        parallel_waits,
        steps.Pass(final_state_name, result=parallel_state_result)
    ])

    workflow = Workflow(
        'Test_Parallel_Workflow',
        definition=definition,
        role=sfn_role_arn
    )

    workflow_test_suite(sfn_client, workflow, asl_state_machine_definition,
                        parallel_state_result)
def test_map_state_machine_creation(sfn_client, sfn_role_arn):
    map_state_name = "Map State"
    iterated_state_name = "Pass State"
    final_state_name = "Final State"
    items_path = "$.array"
    max_concurrency = 0
    map_state_result = "Map Result"
    state_machine_input = {
        "array": [1, 2, 3]
    }

    asl_state_machine_definition = {
        "StartAt": map_state_name,
        "States": {
            map_state_name: {
                "ItemsPath": items_path,
                "Iterator": {
                    "StartAt": iterated_state_name,
                    "States": {
                        iterated_state_name: {
                            "Type": "Pass",
                            "End": True
                        }
                    }
                },
                "MaxConcurrency": max_concurrency,
                "Type": "Map",
                "Next": final_state_name
            },
            final_state_name: {
                "Type": "Pass",
                "Result": map_state_result,
                "End": True
            }
        }
    }
    map_state = steps.Map(
        map_state_name,
        items_path=items_path,
        iterator=steps.Pass(iterated_state_name),
        max_concurrency=max_concurrency)

    definition = steps.Chain([
        map_state,
        steps.Pass(final_state_name, result=map_state_result)
    ])

    workflow = Workflow(
        'Test_Map_Workflow',
        definition=definition,
        role=sfn_role_arn
    )

    workflow_test_suite(sfn_client, workflow, asl_state_machine_definition,
                        map_state_result, state_machine_input)
def test_retry_state_machine_creation(sfn_client, sfn_role_arn,
                                      training_job_parameters):
    retry_state_name = "RetryStateName"
    all_fail_error = "States.ALL"
    interval_seconds = 1
    max_attempts = 2
    backoff_rate = 2
    task_resource = "arn:aws:states:::sagemaker:createTrainingJob.sync"

    # Change the parameters to cause the task state to fail
    training_job_parameters["AlgorithmSpecification"]["TrainingImage"] = "not_an_image"

    asl_state_machine_definition = {
        "StartAt": retry_state_name,
        "States": {
            retry_state_name: {
                "Resource": task_resource,
                "Parameters": training_job_parameters,
                "Type": "Task",
                "End": True,
                "Retry": [
                    {
                        "ErrorEquals": [all_fail_error],
                        "IntervalSeconds": interval_seconds,
                        "MaxAttempts": max_attempts,
                        "BackoffRate": backoff_rate
                    }
                ]
            }
        }
    }
    task = steps.Task(
        retry_state_name,
        parameters=training_job_parameters,
        resource=task_resource
    )
    task.add_retry(
        steps.Retry(
            error_equals=[all_fail_error],
            interval_seconds=interval_seconds,
            max_attempts=max_attempts,
            backoff_rate=backoff_rate
        )
    )
    workflow = Workflow(
        'Test_Retry_Workflow',
        definition=task,
        role=sfn_role_arn
    )

    workflow_test_suite(sfn_client, workflow, asl_state_machine_definition, None)
def __init__(self, preprocessor, estimator, inputs, s3_bucket, role,
             client=None, **kwargs):
    """
    Args:
        preprocessor (sagemaker.estimator.EstimatorBase): The estimator used
            to preprocess and transform the training data.
        estimator (sagemaker.estimator.EstimatorBase): The estimator to use
            for training. Can be a BYO estimator, Framework estimator, or
            Amazon algorithm estimator.
        role (str): An AWS IAM role (either name or full Amazon Resource
            Name (ARN)). This role is used to create, manage, and execute
            the Step Functions workflows.
        inputs: Information about the training data. Please refer to the
            `fit()` method of the associated estimator, as this can take
            any of the following forms:

            * (str) - The S3 location where training data is saved.
            * (dict[str, str] or dict[str, `sagemaker.session.s3_input`]) -
              If using multiple channels for training data, you can specify
              a dict mapping channel names to strings or
              `sagemaker.session.s3_input` objects.
            * (`sagemaker.session.s3_input`) - Channel configuration for S3
              data sources that can provide additional information about
              the training dataset. See `sagemaker.session.s3_input` for
              full details.
            * (`sagemaker.amazon.amazon_estimator.RecordSet`) - A collection
              of Amazon `Record` objects serialized and stored in S3. For
              use with an estimator for an Amazon algorithm.
            * (list[`sagemaker.amazon.amazon_estimator.RecordSet`]) - A list
              of `sagemaker.amazon.amazon_estimator.RecordSet` objects,
              where each instance is a different channel of training data.
        s3_bucket (str): S3 bucket under which the output artifacts from
            the training job will be stored. The parent path used is built
            using the format:
            ``s3://{s3_bucket}/{pipeline_name}/models/{job_name}/``.
            In this format, `pipeline_name` refers to the keyword argument
            provided for InferencePipeline. If a `pipeline_name` argument
            was not provided, one is auto-generated by the pipeline as
            `inference-pipeline-<timestamp>`. Also, in the format,
            `job_name` refers to the job name provided when calling the
            :meth:`InferencePipeline.run()` method.
        client (SFN.Client, optional): boto3 client to use for creating and
            interacting with the inference pipeline in Step Functions.
            (default: None)

    Keyword Args:
        compression_type (str, optional): Compression type (Gzip/None) of
            the file for TransformJob. (default: None)
        content_type (str, optional): Content type (MIME) of the document
            to be used in the preprocessing script. See the SageMaker
            documentation for more details. (default: None)
        pipeline_name (str, optional): Name of the pipeline. This name will
            be used to name jobs (if not provided when calling execute()),
            models, endpoints, and S3 objects created by the pipeline. If a
            `pipeline_name` argument was not provided, one is auto-generated
            by the pipeline as `inference-pipeline-<timestamp>`.
            (default: None)
    """
    self.preprocessor = preprocessor
    self.estimator = estimator
    self.inputs = inputs
    self.s3_bucket = s3_bucket

    for key in self.__class__.__allowed_kwargs:
        setattr(self, key, kwargs.pop(key, None))

    if not self.pipeline_name:
        self.pipeline_name = 'inference-pipeline-{date}'.format(
            date=self._generate_timestamp())

    self.definition = self.build_workflow_definition()
    self.input_template = self._extract_input_template(self.definition)

    workflow = Workflow(name=self.pipeline_name,
                        definition=self.definition,
                        role=role,
                        format_json=True,
                        client=client)

    super(InferencePipeline, self).__init__(s3_bucket=s3_bucket,
                                            workflow=workflow,
                                            role=role,
                                            client=client)
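# Hypothetical instantiation of the pipeline above (the estimator objects,
# bucket, and role ARN are placeholders; see the docstring for the accepted
# `inputs` forms):
#
# pipeline = InferencePipeline(
#     preprocessor=sklearn_preprocessor,
#     estimator=xgb_estimator,
#     inputs='s3://my-bucket/train/',
#     s3_bucket='my-bucket',
#     role='arn:aws:iam::123456789012:role/StepFunctionsWorkflowExecutionRole',
#     pipeline_name='my-inference-pipeline',
# )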
def main():
    sagemaker_session = sagemaker.Session()
    stepfunctions.set_stream_logger(level=logging.INFO)

    bucket = 's3://pixiv-image-backet'
    sagemaker_execution_role = 'arn:aws:iam::829044821271:role/service-role/AmazonSageMaker-ExecutionRole-20200412T194702'
    workflow_execution_role = 'arn:aws:iam::829044821271:role/StepFunctionsWorkflowExecutionRole'

    estimator1 = PyTorch(entry_point='train.py',
                         source_dir='projection_discriminator',
                         role=sagemaker_execution_role,
                         framework_version='1.4.0',
                         train_instance_count=2,
                         train_instance_type='ml.m5.2xlarge',
                         hyperparameters={
                             'train_epoch': 1,
                         })

    estimator2 = PyTorch(entry_point='train.py',
                         source_dir='wgan_gp',
                         role=sagemaker_execution_role,
                         framework_version='1.4.0',
                         train_instance_count=2,
                         train_instance_type='ml.m5.2xlarge',
                         hyperparameters={
                             'train_epoch': 1,
                         })

    training_step1 = steps.TrainingStep(state_id='Train Step1',
                                        estimator=estimator1,
                                        data={
                                            'training': bucket,
                                        },
                                        job_name='PD-Train-{0}'.format(uuid.uuid4()))

    training_step2 = steps.TrainingStep(state_id='Train Step2',
                                        estimator=estimator2,
                                        data={
                                            'training': bucket,
                                        },
                                        job_name='PD-Train-{0}'.format(uuid.uuid4()))

    parallel_state = steps.Parallel(state_id='Parallel')
    parallel_state.add_branch(training_step1)
    parallel_state.add_branch(training_step2)

    workflow_definition = steps.Chain([parallel_state])
    workflow = Workflow(
        name='MyTraining-{0}'.format(uuid.uuid4()),
        definition=workflow_definition,
        role=workflow_execution_role,
    )

    workflow.create()
    workflow.execute()
def build_workflow(self):
    """Create a Step Functions workflow from the chain_of_tasks."""
    self.chain_of_tasks = self._construct_toposorted_chain_of_tasks()
    logger.debug("creating a chain from all the different steps.")
    self.chain_of_tasks = self._integrate_notification_in_workflow(
        chain_of_tasks=self.chain_of_tasks)
    logger.debug(f"creating a workflow with name {self.unique_name}")
    sfn_client = boto3.client("stepfunctions")
    self.workflow = Workflow(
        name=self.unique_name,
        definition=self.chain_of_tasks,
        role=self.role.role_arn,
        client=sfn_client,
        **self.kwargs,
    )
def test_list_workflows(client):
    paginator = client.get_paginator('list_state_machines')
    paginator.paginate = MagicMock(return_value=[{
        'stateMachines': [{
            'stateMachineArn': state_machine_arn,
            'name': state_machine_name,
            'creationDate': datetime(2019, 1, 1)
        }],
        'NextToken': 'Token'
    }])
    client.get_paginator = MagicMock(return_value=paginator)

    workflows = Workflow.list_workflows(max_items=999, client=client)

    paginator.paginate.assert_called_with(PaginationConfig={
        'MaxItems': 999,
        'PageSize': 1000
    })
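# The mocked paginator above mirrors the real `list_state_machines` response
# shape. Against a live account (credentials assumed), a hypothetical helper
# could flatten the same call:
def list_workflow_names(client=None):
    # Each returned entry is a dict with 'stateMachineArn', 'name', and 'creationDate'.
    return [wf['name'] for wf in Workflow.list_workflows(max_items=100, client=client)]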
def find_state_machine_arn(state_machine: str) -> str:
    """Look up the state machine ARN based on the state machine name."""
    workflows = Workflow.list_workflows()
    state_machine_object = [
        workflow for workflow in workflows
        if workflow.get("name") == state_machine
    ]
    if len(state_machine_object) == 1:
        logger.debug(f"we have found one state machine {state_machine_object[0]}")
        return state_machine_object[0].get("stateMachineArn")
    elif len(state_machine_object) == 0:
        logger.error(f"state machine {state_machine} not found.")
        raise LookupError("no state machine found.")
    else:
        logger.error(f"more than one state machine found with name {state_machine}.")
        raise Exception(
            "more than one state machine found. Something strange is going on ..."
        )
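# Example lookup (a minimal sketch; "my-data-pipeline" is a placeholder name,
# and AWS credentials with permission to list state machines are assumed):
#
# arn = find_state_machine_arn("my-data-pipeline")
# execution = Workflow.attach(arn).execute(inputs={})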
def test_pass_state_machine_creation(sfn_client, sfn_role_arn):
    pass_state_name = "Pass"
    pass_state_result = "Pass Result"
    asl_state_machine_definition = {
        "StartAt": pass_state_name,
        "States": {
            pass_state_name: {
                "Result": pass_state_result,
                "Type": "Pass",
                "End": True
            }
        }
    }

    definition = steps.Pass(pass_state_name, result=pass_state_result)
    workflow = Workflow(unique_name_from_base('Test_Pass_Workflow'),
                        definition=definition,
                        role=sfn_role_arn)

    workflow_test_suite(sfn_client, workflow, asl_state_machine_definition,
                        pass_state_result)
def test_task_state_machine_creation(sfn_client, sfn_role_arn,
                                     training_job_parameters):
    task_state_name = "TaskState"
    final_state_name = "FinalState"
    resource = "arn:aws:states:::sagemaker:createTrainingJob.sync"
    task_state_result = "Task State Result"
    asl_state_machine_definition = {
        "StartAt": task_state_name,
        "States": {
            task_state_name: {
                "Resource": resource,
                "Parameters": training_job_parameters,
                "Type": "Task",
                "Next": final_state_name
            },
            final_state_name: {
                "Type": "Pass",
                "Result": task_state_result,
                "End": True
            }
        }
    }

    definition = steps.Chain([
        steps.Task(
            task_state_name,
            resource=resource,
            parameters=training_job_parameters
        ),
        steps.Pass(final_state_name, result=task_state_result)
    ])

    workflow = Workflow(
        'Test_Task_Workflow',
        definition=definition,
        role=sfn_role_arn
    )

    workflow_test_suite(sfn_client, workflow, asl_state_machine_definition,
                        task_state_result)
def create_sfn_workflow(params, steps):
    sfn_workflow_name = params['sfn-workflow-name']
    workflow_execution_role = params['sfn-role-arn']

    workflow_graph = Chain(steps)
    branching_workflow = Workflow(
        name=sfn_workflow_name,
        definition=workflow_graph,
        role=workflow_execution_role,
    )
    branching_workflow.create()
    branching_workflow.update(workflow_graph)

    # Give the update a moment to propagate before the workflow is used
    time.sleep(5)

    return branching_workflow
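# Sketch of calling the helper above; the params keys match those read inside
# create_sfn_workflow, and all values are placeholders:
#
# params = {
#     'sfn-workflow-name': 'my-branching-workflow',
#     'sfn-role-arn': 'arn:aws:iam::123456789012:role/StepFunctionsWorkflowExecutionRole',
# }
# workflow = create_sfn_workflow(params, [Pass('Start'), Pass('Finish')])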
training_step.add_branch(train_step_A)
training_step.add_branch(train_step_B)

# Chain the steps together to generate the full AWS Step Functions workflow definition
workflow_definition = steps.Chain([
    training_step,
    endpoint_create_step,
    endpoint_wait_step,
    model_test_step
])

# Create an AWS Step Functions workflow based on the inputs
workflow = Workflow(
    name=state_machine_name,
    state_machine_arn=state_machine_arn,
    definition=workflow_definition,
    role=workflow_role,
    execution_input=event_input
)

### END WORKFLOW DEFINITION ####

# Manually update some settings that are not generated correctly by the
# AWS Step Functions Data Science SDK.
jsonDef = workflow.definition.to_json(pretty=True)
jsonDef = jsonDef.replace("TrainingImage\": \"latesta", "TrainingImage.$\": \"$$.Execution.Input['ecrArnA']")
jsonDef = jsonDef.replace("TrainingImage\": \"latestb", "TrainingImage.$\": \"$$.Execution.Input['ecrArnB']")
jsonDef = jsonDef.replace("Image\": \"latesta", "Image.$\": \"$$.Execution.Input['ecrArnA']")
jsonDef = jsonDef.replace("Image\": \"latestb", "Image.$\": \"$$.Execution.Input['ecrArnB']")
jsonDef = jsonDef.replace("ModelDataUrl.$\": \"$['ModelArtifacts']['S3ModelArtifacts']", "ModelDataUrl.$\": \"$['train_step_result']['ModelArtifacts']['S3ModelArtifacts']")
jsonDef = jsonDef.replace("TrainingJobName", "TrainingJobName.$")
    }
)

## Step that runs the SageMaker training job
estimator = create_estimator()
data_path = {'train': args.data_path}

training_step = steps.TrainingStep(
    'Train Step',
    estimator=estimator,
    data=data_path,
    job_name=execution_input['TrainJobName'],
    # Since the flow was changed to open a pull request to Bitbucket after the
    # Step Functions execution, True might be fine here.
    wait_for_completion=False
)

# Chain the steps together
chain_list = [etl_step, training_step]
workflow_definition = steps.Chain(chain_list)

# Create the workflow
workflow = Workflow(
    name=FLOW_NAME,
    definition=workflow_definition,
    role=WORKFLOW_ROLE,
    execution_input=execution_input
)
workflow.create()

# Execute the workflow
execution = workflow.execute(inputs=inputs)
workflow_definition = steps.Chain([
    etl_step,
    training_step,
    model_step,
    lambda_step,
    check_accuracy_step
])

try:
    # Update the existing workflow so you can still see all of the Step
    # Functions run history. You could alternatively delete and recreate
    # the workflow.
    state_machine_arn = ('arn:aws:states:ap-southeast-2:' + account_id +
                         ':stateMachine:' + workflow_name)
    workflow = Workflow.attach(state_machine_arn=state_machine_arn)
    workflow.update(
        definition=workflow_definition,
        role=workflow_execution_role
    )
except:
    # Fall back to creating a brand-new workflow
    workflow = Workflow(
        name=workflow_name,
        definition=workflow_definition,
        role=workflow_execution_role,
        execution_input=execution_input
    )
    workflow.create()

# Documentation states the following:
def define_inference_pipeline(
    sm_role,
    workflow_execution_role,
    inference_pipeline_name,
    return_yaml=True,
    dump_yaml_file="templates/sagemaker_inference_pipeline.yaml",
    kms_key_id=None,
):
    """
    Return a YAML definition of the inference pipeline, which consists of
    multiple AWS Step Functions steps.

    sm_role: ARN of the SageMaker execution role
    workflow_execution_role: ARN of the Step Functions execution role
    return_yaml: Whether to return the YAML representation; if False, an
        instance of `stepfunctions.workflow.Workflow` is returned
    dump_yaml_file: If not None, a YAML file will be generated at this file
        location
    """
    # Pass required parameters dynamically for each execution using placeholders.
    execution_input = ExecutionInput(
        schema={
            "InputDataURL": str,
            "PreprocessingJobName": str,
            "InferenceJobName": str,
            "ProcModelS3": str,
            "PreprocessingCodeURL": str,
            "InferenceCodeURL": str,
            "ModelS3": str,
            "PreprocessedTrainDataURL": str,
            "PreprocessedTestDataURL": str,
            "OutputPathURL": str,
        })

    """
    Create Preprocessing Model from model artifact.
    """
    # sagemaker_session = sagemaker.Session()
    sklearn_processor = SKLearnProcessor(
        framework_version="0.20.0",
        role=sm_role,
        instance_type="ml.m5.xlarge",
        instance_count=1,
        max_runtime_in_seconds=1200,
    )
    # Create ProcessingInput and ProcessingOutput objects for the inputs and
    # outputs of the SageMaker Processing Job
    inputs = [
        ProcessingInput(
            source=execution_input["InputDataURL"],
            destination="/opt/ml/processing/input",
            input_name="input-1",
        ),
        ProcessingInput(
            source=execution_input["PreprocessingCodeURL"],
            destination="/opt/ml/processing/input/code",
            input_name="code",
        ),
        ProcessingInput(
            source=execution_input["ProcModelS3"],
            destination="/opt/ml/processing/model",
            input_name="proc_model",
        ),
    ]
    outputs = [
        ProcessingOutput(
            source="/opt/ml/processing/test",
            destination=execution_input["PreprocessedTestDataURL"],
            output_name="test_data",
        ),
    ]
    processing_step = ProcessingStep(
        "SageMaker pre-processing step",
        processor=sklearn_processor,
        job_name=execution_input["PreprocessingJobName"],
        inputs=inputs,
        outputs=outputs,
        container_arguments=["--mode", "infer"],
        container_entrypoint=[
            "python3",
            "/opt/ml/processing/input/code/preprocessing.py",
        ],
        kms_key_id=kms_key_id,
    )

    """
    Create the inference step with SKLearn processing. Inputs are the
    preprocessed data S3 URL, the inference code S3 URL, and the model S3 URL.
    Output is the inferred data.
    """
    sklearn_processor2 = SKLearnProcessor(
        framework_version="0.20.0",
        role=sm_role,
        instance_type="ml.m5.xlarge",
        instance_count=1,
        max_runtime_in_seconds=1200,
    )
    inputs = [
        ProcessingInput(
            source=execution_input["PreprocessedTestDataURL"],
            destination="/opt/ml/processing/input",
            input_name="input-1",
        ),
        ProcessingInput(
            source=execution_input["InferenceCodeURL"],
            destination="/opt/ml/processing/input/code",
            input_name="code",
        ),
        ProcessingInput(
            source=execution_input["ModelS3"],
            destination="/opt/ml/processing/model",
            input_name="model",
        ),
    ]
    outputs = [
        ProcessingOutput(
            source="/opt/ml/processing/test",
            destination=execution_input["OutputPathURL"],
            output_name="test_data",
        ),
    ]
    inference_step = ProcessingStep(
        "SageMaker inference step",
        processor=sklearn_processor2,
        job_name=execution_input["InferenceJobName"],
        inputs=inputs,
        outputs=outputs,
        container_entrypoint=[
            "python3",
            "/opt/ml/processing/input/code/inference.py",
        ],
        kms_key_id=kms_key_id,
    )

    # Create a Fail state to mark the workflow failed in case any of the steps fail.
    failed_state_sagemaker_processing_failure = stepfunctions.steps.states.Fail(
        "ML Workflow failed", cause="SageMakerProcessingJobFailed")

    # Add error handling to the workflow
    catch_state_processing = stepfunctions.steps.states.Catch(
        error_equals=["States.TaskFailed"],
        next_step=failed_state_sagemaker_processing_failure,
    )
    processing_step.add_catch(catch_state_processing)
    inference_step.add_catch(catch_state_processing)

    # Create the workflow
    workflow_graph = Chain([processing_step, inference_step])
    inference_pipeline = Workflow(
        name=inference_pipeline_name,
        definition=workflow_graph,
        role=workflow_execution_role,
    )
    return inference_pipeline
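# A minimal sketch of using the returned pipeline (the name and role ARNs are
# placeholders; get_cloudformation_template() renders the workflow as a
# CloudFormation template for IaC deployment):
#
# pipeline = define_inference_pipeline(sm_role, workflow_execution_role,
#                                      "my-inference-pipeline")
# pipeline.create()
# print(pipeline.get_cloudformation_template())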
def test_attach_existing_workflow(client):
    workflow = Workflow.attach(state_machine_arn, client)
    assert workflow.name == state_machine_name
    assert workflow.role == role_arn
    assert workflow.state_machine_arn == state_machine_arn
    model_name=job_name,
    initial_instance_count=1,
    instance_type='ml.m5.large')

endpoint_step = steps.EndpointStep(
    "Create or Update Endpoint",
    endpoint_name=execution_input['EndpointName'],
    endpoint_config_name=job_name,
    update=update_endpoint)

workflow_definition = steps.Chain(
    [training_step, model_step, endpoint_config_step, endpoint_step])

# Update the workflow that has already been created
workflow = Workflow.attach(workflow_arn)
workflow.update(definition=workflow_definition)
print('Workflow updated: {}'.format(workflow_arn))

# Sleep for 5 seconds, then execute once the update is applied
time.sleep(5)
execution = workflow.execute(inputs=execution_params)
stepfunction_arn = execution.execution_arn
print('Workflow executed: {}'.format(stepfunction_arn))

# Export environment variables
if not os.path.exists('cloud_formation'):
    os.makedirs('cloud_formation')
def define_training_pipeline(
    sm_role,
    workflow_execution_role,
    training_pipeline_name,
    return_yaml=True,
    dump_yaml_file="templates/sagemaker_training_pipeline.yaml",
    kms_key_id=None,
):
    """
    Return a YAML definition of the training pipeline, which consists of
    multiple AWS Step Functions steps.

    sm_role: ARN of the SageMaker execution role
    workflow_execution_role: ARN of the Step Functions execution role
    return_yaml: Whether to return the YAML representation; if False, an
        instance of `stepfunctions.workflow.Workflow` is returned
    dump_yaml_file: If not None, a YAML file will be generated at this file
        location
    """
    # Pass required parameters dynamically for each execution using placeholders.
    execution_input = ExecutionInput(
        schema={
            "InputDataURL": str,
            "PreprocessingJobName": str,
            "PreprocessingCodeURL": str,
            "TrainingJobName": str,
            # Prevent the SageMaker config from hardcoding
            # sagemaker_submit_directory in the workflow definition
            "SMSubmitDirURL": str,
            # Prevent the SageMaker config from hardcoding sagemaker_region
            # in the workflow definition
            "SMRegion": str,
            "EvaluationProcessingJobName": str,
            "EvaluationCodeURL": str,
            "EvaluationResultURL": str,
            "PreprocessedTrainDataURL": str,
            "PreprocessedTestDataURL": str,
            "PreprocessedModelURL": str,
            "SMOutputDataURL": str,
            "SMDebugOutputURL": str,
        })

    """
    Data pre-processing and feature engineering
    """
    sklearn_processor = SKLearnProcessor(
        framework_version="0.20.0",
        role=sm_role,
        instance_type="ml.m5.xlarge",
        instance_count=1,
        max_runtime_in_seconds=1200,
    )
    # Create ProcessingInput and ProcessingOutput objects for the inputs and
    # outputs of the SageMaker Processing Job
    inputs = [
        ProcessingInput(
            source=execution_input["InputDataURL"],
            destination="/opt/ml/processing/input",
            input_name="input-1",
        ),
        ProcessingInput(
            source=execution_input["PreprocessingCodeURL"],
            destination="/opt/ml/processing/input/code",
            input_name="code",
        ),
    ]
    outputs = [
        ProcessingOutput(
            source="/opt/ml/processing/train",
            destination=execution_input["PreprocessedTrainDataURL"],
            output_name="train_data",
        ),
        ProcessingOutput(
            source="/opt/ml/processing/test",
            destination=execution_input["PreprocessedTestDataURL"],
            output_name="test_data",
        ),
        ProcessingOutput(
            source="/opt/ml/processing/model",
            destination=execution_input["PreprocessedModelURL"],
            output_name="proc_model",
        ),
    ]
    processing_step = ProcessingStep(
        "SageMaker pre-processing step",
        processor=sklearn_processor,
        job_name=execution_input["PreprocessingJobName"],
        inputs=inputs,
        outputs=outputs,
        container_arguments=[
            "--train-test-split-ratio", "0.2", "--mode", "train"
        ],
        container_entrypoint=[
            "python3",
            "/opt/ml/processing/input/code/preprocessing.py",
        ],
        kms_key_id=kms_key_id,
    )

    """
    Training using the pre-processed data
    """
    sklearn = SKLearn(
        entry_point="../../src/mlmax/train.py",
        train_instance_type="ml.m5.xlarge",
        role=sm_role,
        py_version="py3",
        framework_version="0.20.0",
        output_kms_key=kms_key_id,
    )
    training_step = MLMaxTrainingStep(
        "SageMaker Training Step",
        estimator=sklearn,
        job_name=execution_input["TrainingJobName"],
        train_data=execution_input["PreprocessedTrainDataURL"],
        test_data=execution_input["PreprocessedTestDataURL"],
        sm_submit_url=execution_input["SMSubmitDirURL"],
        sm_region=execution_input["SMRegion"],
        sm_output_data=execution_input["SMOutputDataURL"],
        sm_debug_output_data=execution_input["SMDebugOutputURL"],
        wait_for_completion=True,
    )

    """
    Model evaluation
    """
    # Create input and output objects for the Model Evaluation ProcessingStep.
    inputs_evaluation = [
        ProcessingInput(
            source=execution_input["PreprocessedTestDataURL"],
            destination="/opt/ml/processing/test",
            input_name="input-1",
        ),
        ProcessingInput(
            source=training_step.get_expected_model().model_data,
            destination="/opt/ml/processing/model",
            input_name="input-2",
        ),
        ProcessingInput(
            source=execution_input["EvaluationCodeURL"],
            destination="/opt/ml/processing/input/code",
            input_name="code",
        ),
    ]
    outputs_evaluation = [
        ProcessingOutput(
            source="/opt/ml/processing/evaluation",
            destination=execution_input["EvaluationResultURL"],
            output_name="evaluation",
        ),
    ]
    model_evaluation_processor = SKLearnProcessor(
        framework_version="0.20.0",
        role=sm_role,
        instance_type="ml.m5.xlarge",
        instance_count=1,
        max_runtime_in_seconds=1200,
    )
    processing_evaluation_step = ProcessingStep(
        "SageMaker Processing Model Evaluation step",
        processor=model_evaluation_processor,
        job_name=execution_input["EvaluationProcessingJobName"],
        inputs=inputs_evaluation,
        outputs=outputs_evaluation,
        container_entrypoint=[
            "python3", "/opt/ml/processing/input/code/evaluation.py"
        ],
    )

    # Create a Fail state to mark the workflow failed in case any of the steps fail.
    failed_state_sagemaker_processing_failure = stepfunctions.steps.states.Fail(
        "ML Workflow failed", cause="SageMakerProcessingJobFailed")

    # Add error handling to the workflow
    catch_state_processing = stepfunctions.steps.states.Catch(
        error_equals=["States.TaskFailed"],
        next_step=failed_state_sagemaker_processing_failure,
    )
    processing_step.add_catch(catch_state_processing)
    processing_evaluation_step.add_catch(catch_state_processing)
    training_step.add_catch(catch_state_processing)

    # Create the workflow
    workflow_graph = Chain(
        [processing_step, training_step, processing_evaluation_step])
    training_pipeline = Workflow(
        name=training_pipeline_name,
        definition=workflow_graph,
        role=workflow_execution_role,
    )
    return training_pipeline
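# Sketch: materialize the training pipeline and review its Amazon States
# Language definition before creating the state machine (names are placeholders):
#
# pipeline = define_training_pipeline(sm_role, workflow_execution_role,
#                                     "my-training-pipeline")
# print(pipeline.definition.to_json(pretty=True))
# pipeline.create()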
def main(
    git_branch,
    codebuild_id,
    pipeline_name,
    model_name,
    deploy_role,
    sagemaker_role,
    sagemaker_bucket,
    data_dir,
    output_dir,
    ecr_dir,
    kms_key_id,
    workflow_role_arn,
    notification_arn,
    sagemaker_project_id,
    tags,
):
    # Define the function names
    create_experiment_function_name = "mlops-create-experiment"
    query_training_function_name = "mlops-query-training"

    # Get the region
    region = boto3.Session().region_name
    print("region: {}".format(region))

    if ecr_dir:
        # Load the image URI and input data config
        with open(os.path.join(ecr_dir, "imageDetail.json"), "r") as f:
            image_uri = json.load(f)["ImageURI"]
    else:
        # Get the managed image URI for the current region
        image_uri = get_training_image(region)
    print("image uri: {}".format(image_uri))

    with open(os.path.join(data_dir, "inputData.json"), "r") as f:
        input_data = json.load(f)
    print("training uri: {}".format(input_data["TrainingUri"]))
    print("validation uri: {}".format(input_data["ValidationUri"]))
    print("baseline uri: {}".format(input_data["BaselineUri"]))

    # Get the job id and source revisions
    job_id = get_pipeline_execution_id(pipeline_name, codebuild_id)
    revisions = get_pipeline_revisions(pipeline_name, job_id)
    git_commit_id = revisions["ModelSourceOutput"]
    data_version_id = revisions["DataSourceOutput"]
    print("job id: {}".format(job_id))
    print("git commit: {}".format(git_commit_id))
    print("data version: {}".format(data_version_id))

    # Set the output data
    output_data = {
        "ModelOutputUri": "s3://{}/{}".format(sagemaker_bucket, model_name),
        "BaselineOutputUri": f"s3://{sagemaker_bucket}/{model_name}/monitoring/baseline/{model_name}-pbl-{job_id}",
    }
    print("model output uri: {}".format(output_data["ModelOutputUri"]))

    # Pass these into the training method
    hyperparameters = {}
    if os.path.exists(os.path.join(data_dir, "hyperparameters.json")):
        with open(os.path.join(data_dir, "hyperparameters.json"), "r") as f:
            hyperparameters = json.load(f)
        for i in hyperparameters:
            hyperparameters[i] = str(hyperparameters[i])

    # Define the Step Functions execution input schema
    execution_input = ExecutionInput(
        schema={
            "GitBranch": str,
            "GitCommitHash": str,
            "DataVersionId": str,
            "ExperimentName": str,
            "TrialName": str,
            "BaselineJobName": str,
            "BaselineOutputUri": str,
            "TrainingJobName": str,
        })

    # Create the experiment, baseline, and training steps
    experiment_step = create_experiment_step(create_experiment_function_name)
    baseline_step = create_baseline_step(input_data, execution_input, region,
                                         sagemaker_role)
    training_step = create_training_step(
        image_uri,
        hyperparameters,
        input_data,
        output_data,
        execution_input,
        query_training_function_name,
        region,
        sagemaker_role,
    )
    workflow_definition = create_graph(experiment_step, baseline_step,
                                       training_step)

    # Create the workflow, named after the model
    workflow = Workflow(model_name, workflow_definition, workflow_role_arn)
    print("Creating workflow: {0}-{1}".format(model_name, sagemaker_project_id))

    # Create the output directory
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    # Write the workflow graph to JSON
    with open(os.path.join(output_dir, "workflow-graph.json"), "w") as f:
        f.write(workflow.definition.to_json(pretty=True))

    # Write the workflow graph to YAML
    with open(os.path.join(output_dir, "workflow-graph.yml"), "w") as f:
        f.write(workflow.get_cloudformation_template())

    # Write the workflow inputs to file
    with open(os.path.join(output_dir, "workflow-input.json"), "w") as f:
        workflow_inputs = {
            "ExperimentName": "{}".format(model_name),
            "TrialName": "{}-{}".format(model_name, job_id),
            "GitBranch": git_branch,
            "GitCommitHash": git_commit_id,
"DataVersionId": data_verison_id, "BaselineJobName": "{}-pbl-{}".format(model_name, job_id), "BaselineOutputUri": output_data["BaselineOutputUri"], "TrainingJobName": "{}-{}".format(model_name, job_id), } json.dump(workflow_inputs, f) # Write the dev & prod params for CFN with open(os.path.join(output_dir, "deploy-model-dev.json"), "w") as f: config = get_dev_config(model_name, job_id, deploy_role, image_uri, kms_key_id, sagemaker_project_id) json.dump(config, f) with open(os.path.join(output_dir, "deploy-model-prd.json"), "w") as f: config = get_prd_config( model_name, job_id, deploy_role, image_uri, kms_key_id, notification_arn, sagemaker_project_id, ) json.dump(config, f)