def test_datajob_stack_creates_resources_on_exit_only_when_no_error_occurs(
    self, m_create_resources
):
    exception_ = None
    try:
        with DataJobStack(scope=self.app, id="datajob-stack-with-error") as djs:
            raise Exception("some exception")
    except Exception as e:
        exception_ = e
    self.assertEqual(m_create_resources.call_count, 0)
    self.assertIsNotNone(exception_)

    with DataJobStack(scope=self.app, id="datajob-stack-without-error") as djs:
        pass
    self.assertEqual(m_create_resources.call_count, 1)
def test_create_glue_pythonshell_successfully(self):
    djs = DataJobStack(scope=self.app, id="some-stack", stage="stg")
    glue_job = GlueJob(djs, "some-task", "some/path/task.py")
    self.assertEqual(glue_job.job_type, GlueJobType.PYTHONSHELL.value)
    self.assertEqual(glue_job.glue_version, "1.0")
    self.assertEqual(glue_job.job_path, "some/path/task.py")
    self.assertEqual(glue_job.python_version, "3")
def test_datajob_stack_with_stage_passed_to_datajob_stack(self):
    stage_value = "some-value"
    with DataJobStack(
        scope=self.app, id="datajob-stack-no-error", stage=stage_value
    ) as djs:
        pass
    self.assertEqual(djs.stage, stage_value)
def test_datajob_context_initiates_without_error(self):
    exception_ = None
    try:
        app = core.App()
        djs = DataJobStack(scope=app, id="some-stack-name")
        DataJobContext(djs, unique_stack_name="some-unique-name")
    except Exception as e:
        exception_ = e
    self.assertIsNone(exception_)
def test_datajob_stack_initiates_without_error(self):
    exception_ = None
    try:
        with DataJobStack(scope=self.app, id="datajob-stack-no-error") as djs:
            pass
    except Exception as e:
        exception_ = e
    self.assertIsNone(exception_)
def test_datajob_context_initiates_without_stage(self):
    exception_ = None
    try:
        app = core.App()
        djs = DataJobStack(scope=app, id="some-stack-name")
        djc = DataJobContext(djs)
    except Exception as e:
        exception_ = e
    self.assertIsNone(exception_)
    self.assertIsNone(djc.stage)
    # some random characters (4 of them) are appended to the bucket names.
    self.assertEqual(len(djc.data_bucket_name.split("-")[-1]), 4)
    self.assertEqual(len(djc.deployment_bucket_name.split("-")[-1]), 4)
def test_datajob_context_with_stage(self):
    exception_ = None
    try:
        stack_name = "some-stack"
        stage = "some-stage"
        app = core.App()
        djs = DataJobStack(scope=app, id=stack_name, stage=stage)
        djc = DataJobContext(djs)
        self.assertIsNotNone(djc.stage)
        self.assertEqual(djc.data_bucket_name, djs.unique_stack_name)
        self.assertEqual(
            djc.deployment_bucket_name,
            f"{djs.unique_stack_name}-deployment-bucket",
        )
    except Exception as e:
        exception_ = e
    self.assertIsNone(exception_)
def test_sagemaker_transform_step_successfully(self, m_default_bucket):
    m_default_bucket.return_value = "sagemaker-bucket-name"
    with DataJobStack(scope=self.app, id="some-stack", stage="stg") as djs:
        transformer = Transformer(
            model_name="some-model",
            instance_count=1,
            instance_type="ml.t2.medium",
            sagemaker_session=self.sagemaker_session,
        )
        transform_step = TransformStep(
            datajob_stack=djs,
            name="transform-job",
            transformer=transformer,
            data="s3://some-bucket/some-data.csv",
        )
        estimator = SKLearn(
            entry_point=str(pathlib.Path(current_dir, "resources", "train.py")),
            train_instance_type="ml.m5.xlarge",
            role=self.role,
            framework_version="0.20.0",
            py_version="py3",
            sagemaker_session=self.sagemaker_session,
        )
        tuner = HyperparameterTuner(
            estimator=estimator,
            hyperparameter_ranges={"alpha": ContinuousParameter(0.0001, 0.05)},
            objective_metric_name="rmse",
        )
        tuner_step = TuningStep(
            datajob_stack=djs,
            name="tuning-step",
            tuner=tuner,
            data="s3://some-bucket/some-data.csv",
        )
        with StepfunctionsWorkflow(djs, "sequential") as sfn_workflow:
            transform_step >> tuner_step
def test_datajob_stack_with_no_stage(self):
    with DataJobStack(scope=self.app, id="datajob-stack-no-stage") as djs:
        pass
    self.assertIsNone(djs.stage)
def test_datajob_stack_with_stage_passed_via_cli(self):
    stage_value = "some-value"
    scope = core.App(context={"stage": stage_value})
    with DataJobStack(scope=scope, id="datajob-stack-no-error") as djs:
        pass
    self.assertEqual(djs.stage, stage_value)
def test_datajob_stack_with_no_stage(self):
    with DataJobStack(scope=self.app, id="datajob-stack-no-stage") as djs:
        pass
    self.assertEqual(djs.stage, DEFAULT_STACK_STAGE)
""" import pathlib from aws_cdk import core from datajob.datajob_stack import DataJobStack from datajob.glue.glue_job import GlueJob from datajob.stepfunctions.stepfunctions_workflow import StepfunctionsWorkflow app = core.App() current_dir = pathlib.Path(__file__).parent.absolute() app = core.App() datajob_stack = DataJobStack(scope=app, id="data-pipeline-pkg", project_root=current_dir) datajob_stack.init_datajob_context() task1 = GlueJob(datajob_stack=datajob_stack, name="task1", job_path="glue_jobs/task1.py") task2 = GlueJob(datajob_stack=datajob_stack, name="task2", job_path="glue_jobs/task2.py") with StepfunctionsWorkflow(datajob_stack=datajob_stack, name="workflow") as sfn: task1 >> task2 datajob_stack.create_resources()
import sagemaker
from aws_cdk import core
from sagemaker import image_uris

from datajob.datajob_stack import DataJobStack
from datajob.glue.glue_job import GlueJob
from datajob.sagemaker import get_default_sagemaker_role
from datajob.sagemaker.sagemaker_job import EndpointConfigStep
from datajob.sagemaker.sagemaker_job import EndpointStep
from datajob.sagemaker.sagemaker_job import ModelStep
from datajob.sagemaker.sagemaker_job import TrainingStep
from datajob.stepfunctions.stepfunctions_workflow import StepfunctionsWorkflow

app = core.App()

with DataJobStack(scope=app, id="datajob-ml-pipeline-abalone") as djs:
    sagemaker_default_role = get_default_sagemaker_role(datajob_stack=djs)

    train_path = f"s3://{djs.context.data_bucket_name}/train/abalone.train"
    validation_path = (
        f"s3://{djs.context.data_bucket_name}/validation/abalone.validation"
    )
    test_path = f"s3://{djs.context.data_bucket_name}/test/abalone.test"

    prepare_dataset_step = GlueJob(
        datajob_stack=djs,
        name="prepare-dataset",
        job_path="jobs/prepare_dataset.py",
        job_type="pythonshell",
        max_capacity=1,
        arguments={
import pathlib

from aws_cdk import core

from datajob.datajob_stack import DataJobStack
from datajob.glue.glue_job import GlueJob
from datajob.stepfunctions.stepfunctions_workflow import StepfunctionsWorkflow

current_dir = pathlib.Path(__file__).parent.absolute()

app = core.App()

# the datajob_stack is the instance that will result in a cloudformation stack.
# we inject the datajob_stack object through all the resources that we want to add.
with DataJobStack(
    scope=app, id="data-pipeline-pkg", project_root=current_dir
) as datajob_stack:

    # here we define 2 glue jobs with the path to the source code.
    task1 = GlueJob(
        datajob_stack=datajob_stack, name="task1", job_path="glue_jobs/task1.py"
    )
    task2 = GlueJob(
        datajob_stack=datajob_stack, name="task2", job_path="glue_jobs/task2.py"
    )

    # we instantiate a step functions workflow
    # and orchestrate the glue jobs.
    with StepfunctionsWorkflow(
        datajob_stack=datajob_stack, name="workflow"
    ) as step_functions_workflow:
        task1 >> task2
from aws_cdk import core

from datajob.datajob_stack import DataJobStack
from datajob.glue.glue_job import GlueJob
from datajob.stepfunctions.stepfunctions_workflow import StepfunctionsWorkflow

app = core.App()

with DataJobStack(scope=app, id="data-pipeline-parallel") as datajob_stack:

    task1 = GlueJob(
        datajob_stack=datajob_stack, name="task1", job_path="glue_jobs/task.py"
    )
    task2 = GlueJob(
        datajob_stack=datajob_stack, name="task2", job_path="glue_jobs/task.py"
    )
    task3 = GlueJob(
        datajob_stack=datajob_stack, name="task3", job_path="glue_jobs/task.py"
    )
    task4 = GlueJob(
        datajob_stack=datajob_stack, name="task4", job_path="glue_jobs/task.py"
    )
    task5 = GlueJob(
        datajob_stack=datajob_stack, name="task5", job_path="glue_jobs/task.py"
    )

    # Task2 comes after task1, and task4 comes after task3.
    # Task5 depends on both task2 and task4 to be finished.
    # Therefore task1 and task3 can run in parallel,
    # as well as task2 and task4 (see the sketch below).
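    # A minimal sketch (not part of the original snippet) of how this dependency
    # graph could be declared, assuming the same StepfunctionsWorkflow context
    # manager and ">>" operator used in the other examples:
    with StepfunctionsWorkflow(
        datajob_stack=datajob_stack, name="workflow"
    ) as step_functions_workflow:
        task1 >> task2
        task3 >> task4
        task2 >> task5
        task4 >> task5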
import pathlib

from aws_cdk import core

from datajob.datajob_stack import DataJobStack
from datajob.glue.glue_job import GlueJob
from datajob.stepfunctions.stepfunctions_workflow import StepfunctionsWorkflow

current_dir = str(pathlib.Path(__file__).parent.absolute())

app = core.App()

with DataJobStack(
    scope=app, id="datajob-python-pyspark", project_root=current_dir
) as datajob_stack:

    pyspark_job = GlueJob(
        datajob_stack=datajob_stack,
        name="pyspark-job",
        job_path="glue_job/glue_pyspark_example.py",
        job_type="glueetl",
        glue_version="2.0",  # we only support glue 2.0
        python_version="3",
        worker_type="Standard",  # options are Standard / G.1X / G.2X
        number_of_workers=1,
        arguments={
            "--source": f"s3://{datajob_stack.context.data_bucket_name}/raw/iris_dataset.csv",
            "--destination": f"s3://{datajob_stack.context.data_bucket_name}/target/pyspark_job/iris_dataset.parquet",
        },
""" same as ./datajob_stack.py but more explicit """ from aws_cdk import core from datajob.datajob_stack import DataJobStack from datajob.glue.glue_job import GlueJob from datajob.stepfunctions.stepfunctions_workflow import StepfunctionsWorkflow app = core.App() datajob_stack = DataJobStack(scope=app, id="data-pipeline-simple") datajob_stack.init_datajob_context() task1 = GlueJob(datajob_stack=datajob_stack, name="task1", job_path="glue_jobs/task1.py") task2 = GlueJob(datajob_stack=datajob_stack, name="task2", job_path="glue_jobs/task2.py") with StepfunctionsWorkflow(datajob_stack=datajob_stack, name="workflow") as sfn: task1 >> task2 datajob_stack.create_resources() app.synth()
def test_sagemaker_services_successfully(self, m_default_bucket):
    m_default_bucket.return_value = "sagemaker-bucket-name"
    with DataJobStack(scope=self.app, id="some-stack", stage="stg") as djs:
        processor = SKLearnProcessor(
            framework_version="0.23-1",
            role=self.role,
            instance_type="local",
            instance_count=1,
            sagemaker_session=self.sagemaker_session,
        )
        processing_step = ProcessingStep(
            datajob_stack=djs,
            name="processing-job",
            processor=processor,
        )
        estimator = SKLearn(
            entry_point=str(pathlib.Path(current_dir, "resources", "train.py")),
            train_instance_type="ml.m5.xlarge",
            role=self.role,
            framework_version="0.20.0",
            py_version="py3",
            sagemaker_session=self.sagemaker_session,
        )
        training_step = TrainingStep(
            datajob_stack=djs,
            name="training-job",
            estimator=estimator,
        )
        model_step = ModelStep(
            datajob_stack=djs,
            name="model-step",
            model=training_step.sfn_task.get_expected_model(),
        )
        endpoint_config_step = EndpointConfigStep(
            datajob_stack=djs,
            name="endpoint-config-step",
            model_name=model_step.model_name,
        )
        endpoint_step = EndpointStep(
            datajob_stack=djs,
            name="endpoint-step",
            endpoint_config_name=endpoint_config_step.name,
        )

        with StepfunctionsWorkflow(djs, "sequential") as sfn_workflow_sequential:
            (
                processing_step
                >> training_step
                >> model_step
                >> endpoint_config_step
                >> endpoint_step
            )

        with StepfunctionsWorkflow(djs, "parallel") as sfn_workflow_parallel:
            processing_step >> processing_step
            training_step >> training_step

    # check if we have the expected value for the execution input
    self.assertDictEqual(
        djs.execution_input.execution_input_schema,
        {
            "some-stack-stg-processing-job": str,
            "some-stack-stg-training-job": str,
            "some-stack-stg-model-step": str,
            "some-stack-stg-endpoint-config-step": str,
            "some-stack-stg-endpoint-step": str,
        },
    )

    # the execution input is added to the cloudformation output
    self.assertDictEqual(
        djs.outputs,
        {
            "DatajobExecutionInput": json.dumps(
                [
                    "some-stack-stg-processing-job",
                    "some-stack-stg-training-job",
                    "some-stack-stg-model-step",
                    "some-stack-stg-endpoint-config-step",
                    "some-stack-stg-endpoint-step",
                ]
            )
        },
    )
def setUp(self) -> None:
    self.app = core.App()
    self.djs = DataJobStack(scope=self.app, id="datajob-stack-no-error")
import boto3
import sagemaker
from aws_cdk import core
from sagemaker.processing import ProcessingInput
from sagemaker.processing import ProcessingOutput
from sagemaker.sklearn import SKLearnProcessor
from sagemaker.sklearn.estimator import SKLearn

from datajob.datajob_stack import DataJobStack
from datajob.sagemaker import get_default_sagemaker_role
from datajob.sagemaker.sagemaker_job import ModelStep
from datajob.sagemaker.sagemaker_job import ProcessingStep
from datajob.sagemaker.sagemaker_job import TrainingStep
from datajob.stepfunctions.stepfunctions_workflow import StepfunctionsWorkflow

app = core.App()

with DataJobStack(scope=app, id="datajob-ml-pipeline-scikitlearn") as djs:
    role = get_default_sagemaker_role(datajob_stack=djs)
    sagemaker_session = sagemaker.Session(
        boto_session=boto3.session.Session(region_name=djs.env.region)
    )
    s3_bucket_base_uri = "{}{}".format("s3://", sagemaker_session.default_bucket())
    output_data = "{}/{}".format(s3_bucket_base_uri, "data/sklearn_processing/output")
    input_data = f"s3://sagemaker-sample-data-{djs.env.region}/processing/census/census-income.csv"

    input_code = sagemaker_session.upload_data(
        "resources/preprocessing.py",
        bucket=sagemaker_session.default_bucket(),