Example #1
def test_add_step_builder():
    default_task_json = {
        'Resource': {
            'Fn::Join': ['', ['arn:', {
                'Ref': 'AWS::Partition'
            }, ':states:::elasticmapreduce:addStep.sync']]
        },
        'Parameters': {
            'ClusterId': 'test-cluster-id',
            'Step': {
                'Name': 'test-step',
                'ActionOnFailure': 'CONTINUE',
                'HadoopJarStep': {
                    'Jar': 'Jar',
                    'MainClass': 'Main',
                    'Args': ['Arg1', 'Arg2'],
                    'Properties': []
                }
            }
        },
        'End': True,
        'Type': 'Task'
    }

    stack = core.Stack(core.App(), 'test-stack')

    task = emr_tasks.AddStepBuilder.build(
        stack, 'test-task',
        cluster_id='test-cluster-id',
        emr_step=emr_code.EMRStep('test-step', 'Jar', 'Main', ['Arg1', 'Arg2']),
    )

    print_and_assert(default_task_json, task)
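
The test relies on names the snippet does not show: the CDK core and aws_stepfunctions modules, the aws-emr-launch construct modules, and a print_and_assert helper. A minimal sketch of the assumed imports and a hypothetical helper (the project's real helper may differ):

# Assumed imports; module paths follow the aws-emr-launch layout and the
# CDK v1 "core" namespace used throughout these snippets.
from aws_cdk import core
from aws_cdk import aws_stepfunctions as sfn
from aws_emr_launch.constructs.emr_constructs import emr_code
from aws_emr_launch.constructs.step_functions import emr_chains, emr_tasks


def print_and_assert(expected_json: dict, task) -> None:
    # Hypothetical helper: resolve CDK tokens in the task's state JSON via
    # its enclosing stack, then compare against the expected definition.
    resolved = core.Stack.of(task).resolve(task.to_state_json())
    print(resolved)
    assert expected_json == resolved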
Example #2
# Create a Parallel Task for the Steps
steps = sfn.Parallel(stack, 'Steps', result_path='$.Result.Steps')

# Add a Failure catch to our Parallel phase
steps.add_catch(fail, errors=['States.ALL'], result_path='$.Error')

# Create a parallel Step for each matching file. The number of concurrently
# running Steps is defined in the Cluster Configuration
for file in emr_code.Code.files_in_path('./step_sources', 'test_step_*.py'):
    # Define an AddStep Task for Each Step
    step_task = emr_tasks.AddStepBuilder.build(
        stack,
        f'Step_{file}',
        emr_step=emr_code.EMRStep(
            name=f'Step - {file}',
            jar='command-runner.jar',
            args=['spark-submit', f'{step_code.s3_path}/{file}', 'Arg1'],
            code=step_code),
        cluster_id=sfn.TaskInput.from_data_at(
            '$.LaunchClusterResult.ClusterId').value)
    steps.branch(step_task)

# Define a Task to Terminate the Cluster
terminate_cluster = emr_tasks.TerminateClusterBuilder.build(
    stack,
    'TerminateCluster',
    name='Terminate Cluster',
    cluster_id=sfn.TaskInput.from_data_at(
        '$.LaunchClusterResult.ClusterId').value,
    result_path='$.TerminateResult').add_catch(fail,
                                               errors=['States.ALL'],
                                               result_path='$.Error')
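
Following the pattern in Example #4, the Parallel steps and the terminate task can then be assembled into a Step Functions state machine. A minimal sketch, assuming the launch-cluster chain is handled elsewhere:

# Sketch: chain the pieces defined above into a state machine definition.
# Only "steps", "terminate_cluster" and "stack" come from the snippet above;
# the construct id is an assumption.
definition = sfn.Chain \
    .start(steps) \
    .next(terminate_cluster)

state_machine = sfn.StateMachine(
    stack, 'PipelineStateMachine', definition=definition)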
Example #3
phase_1 = sfn.Parallel(stack, 'Phase1', result_path='$.Result.Phase1')

# Add a Failure catch to our Parallel phase
phase_1.add_catch(terminate_failed_cluster, errors=['States.ALL'], result_path='$.Error')

# Create 5 Phase 1 Parallel Steps. The number of concurrently running Steps is
# defined in the Cluster Configuration
for file in emr_code.Code.files_in_path('./step_sources', 'test_step_*.sh'):
    # Define an AddStep Task for Each Step
    step_task = emr_tasks.AddStepBuilder.build(
        stack, f'Phase1_{file}',
        emr_step=emr_code.EMRStep(
            name=f'Phase 1 - {file}',
            jar='s3://us-west-2.elasticmapreduce/libs/script-runner/script-runner.jar',
            args=[
                f'{step_code.s3_path}/{file}',
                'Arg1',
                'Arg2'
            ],
            code=step_code
        ),
        cluster_id=sfn.TaskInput.from_data_at('$.LaunchClusterResult.ClusterId').value)
    phase_1.branch(step_task)

# Define an AddStep Task for the Validation Step
validate_phase_1 = emr_chains.AddStepWithArgumentOverrides(
    stack, 'ValidatePhase1',
    emr_step=emr_code.EMRStep(
        name='Validate Phase 1',
        jar='s3://us-west-2.elasticmapreduce/libs/script-runner/script-runner.jar',
        args=[
            f'{step_code.s3_path}/phase_1/test_validation.sh',
Example #4
    def __init__(self, scope: core.Construct, id: str, emr_launch_stack,
                 artifact_bucket, output_bucket, **kwargs):
        super().__init__(scope, id, **kwargs)

        launch_function = emr_launch_stack.launch_function

        # Create DynamoDB table for tracking
        dynamo_table = dynamo.Table(
            self,
            "dynamotable",
            partition_key=dynamo.Attribute(name="BatchId",
                                           type=dynamo.AttributeType.STRING),
            sort_key=dynamo.Attribute(name="Name",
                                      type=dynamo.AttributeType.STRING),
            billing_mode=dynamo.BillingMode.PAY_PER_REQUEST)

        emr_role = aws_iam.Role.from_role_arn(
            self, "emr_role_iam", role_arn=emr_launch_stack.instance_role_arn)

        emr_role.add_to_policy(
            aws_iam.PolicyStatement(actions=["dynamodb:*"],
                                    resources=[dynamo_table.table_arn]))

        emr_role.add_to_policy(
            aws_iam.PolicyStatement(actions=[
                "logs:CreateLogStream", "logs:DescribeLogStreams",
                "logs:CreateLogGroup", "logs:PutLogEvents", "ec2:DescribeTags"
            ],
                                    resources=["*"]))

        # SNS Topics for Success/Failures messages from our Pipeline
        success_topic = sns.Topic(self, 'SuccessTopic')
        failure_topic = sns.Topic(self, 'FailureTopic')

        # Upload artifacts to S3
        step_code = s3d.BucketDeployment(
            self,
            id='sparkscript',
            destination_bucket=artifact_bucket,
            destination_key_prefix='steps',
            sources=[
                s3d.Source.asset('infrastructure/emr_orchestration/steps/')
            ])

        # Create a Chain to receive Failure messages
        fail = emr_chains.Fail(self,
                               'FailChain',
                               message=sfn.TaskInput.from_data_at('$.Error'),
                               subject='Pipeline Failure',
                               topic=failure_topic)

        # Define a Task to Terminate the Cluster on failure
        terminate_failed_cluster = emr_tasks.TerminateClusterBuilder.build(
            self,
            'TerminateFailedCluster',
            name='Terminate Failed Cluster',
            cluster_id=sfn.TaskInput.from_data_at(
                '$.LaunchClusterResult.ClusterId').value,
            result_path='$.TerminateResult').add_catch(fail,
                                                       errors=['States.ALL'],
                                                       result_path='$.Error')

        terminate_failed_cluster.next(fail)

        # Use a NestedStateMachine to launch the cluster
        launch_cluster = emr_chains.NestedStateMachine(
            self,
            'NestedStateMachine',
            name='Launch Cluster StateMachine',
            state_machine=launch_function.state_machine,
            fail_chain=fail)

        pyspark_step = emr_chains.AddStepWithArgumentOverrides(
            self,
            'PySparkSceneDetection',
            emr_step=emr_code.EMRStep(
                name='Scene Detection - PySpark Job',
                jar='command-runner.jar',
                args=[
                    'spark-submit', '--master', 'yarn', '--deploy-mode',
                    'cluster', '--packages',
                    'com.audienceproject:spark-dynamodb_2.12:1.1.2',
                    os.path.join(f's3://{artifact_bucket.bucket_name}',
                                 'steps', 'scene_detection.py'), '--batch-id',
                    'DynamoDB.BatchId', '--batch-metadata-table-name',
                    dynamo_table.table_name, '--output-bucket',
                    output_bucket.bucket_name, '--synchronized-table-name',
                    'synchronized-signals'
                ]),
            cluster_id=sfn.TaskInput.from_data_at(
                '$.LaunchClusterResult.ClusterId').value,
            result_path='$.PySparkResult',
            fail_chain=terminate_failed_cluster)

        # Define a Task to Terminate the Cluster
        terminate_cluster = emr_tasks.TerminateClusterBuilder.build(
            self,
            'TerminateCluster',
            name='Terminate Cluster',
            cluster_id=sfn.TaskInput.from_data_at(
                '$.LaunchClusterResult.ClusterId').value,
            result_path='$.TerminateResult').add_catch(fail,
                                                       errors=['States.ALL'],
                                                       result_path='$.Error')

        # A Chain for Success notification when the pipeline completes
        success = emr_chains.Success(
            self,
            'SuccessChain',
            message=sfn.TaskInput.from_data_at('$.TerminateResult'),
            subject='Pipeline Succeeded',
            topic=success_topic)

        # Assemble the Pipeline
        definition = sfn.Chain \
            .start(launch_cluster) \
            .next(pyspark_step) \
            .next(terminate_cluster) \
            .next(success)

        # Create the State Machine
        self.state_machine = sfn.StateMachine(
            self,
            'SceneDetectionStateMachine',
            state_machine_name='scene-detection-pipeline',
            definition=definition)
        self.dynamo_table = dynamo_table
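
Once deployed, the scene-detection-pipeline state machine can be started like any other Step Functions state machine. A minimal sketch using boto3; the ARN is a placeholder and the input shape is an assumption, not part of the snippet above:

import json

import boto3

sfn_client = boto3.client('stepfunctions')

# Placeholder state machine ARN and example execution input.
sfn_client.start_execution(
    stateMachineArn=('arn:aws:states:us-west-2:111122223333:'
                     'stateMachine:scene-detection-pipeline'),
    input=json.dumps({'BatchId': 'batch-0001'}))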
Example #5
def test_add_step_with_argument_overrides():
    default_fragment_json = {
        'Type':
        'Parallel',
        'End':
        True,
        'Branches': [{
            'StartAt': 'test-fragment: test-step - Override Args',
            'States': {
                'test-fragment: test-step - Override Args': {
                    'Next':
                    'test-fragment: test-step',
                    'Catch': [{
                        'ErrorEquals': ['States.ALL'],
                        'ResultPath': '$.Error',
                        'Next': 'test-fail'
                    }],
                    'Parameters': {
                        'ExecutionInput.$': '$$.Execution.Input',
                        'StepName': 'test-step',
                        'Args': ['Arg1', 'Arg2']
                    },
                    'Type':
                    'Task',
                    'Resource': {
                        'Fn::GetAtt': ['OverrideStepArgsE9376C9F', 'Arn']
                    },
                    'ResultPath':
                    '$.test-fragmentResultArgs'
                },
                'test-fragment: test-step': {
                    'End':
                    True,
                    'Catch': [{
                        'ErrorEquals': ['States.ALL'],
                        'ResultPath': '$.Error',
                        'Next': 'test-fail'
                    }],
                    'Parameters': {
                        'ClusterId': 'test-cluster-id',
                        'Step': {
                            'Name': 'test-step',
                            'ActionOnFailure': 'CONTINUE',
                            'HadoopJarStep': {
                                'Jar': 'Jar',
                                'MainClass': 'Main',
                                'Args.$': '$.test-fragmentResultArgs',
                                'Properties': []
                            }
                        }
                    },
                    'Type':
                    'Task',
                    'Resource': {
                        'Fn::Join': [
                            '',
                            [
                                'arn:', {
                                    'Ref': 'AWS::Partition'
                                }, ':states:::elasticmapreduce:addStep.sync'
                            ]
                        ]
                    }
                },
                'test-fail': {
                    'Type': 'Fail'
                }
            }
        }]
    }

    stack = core.Stack(core.App(), 'test-stack')

    fragment = emr_chains.AddStepWithArgumentOverrides(
        stack,
        'test-fragment',
        emr_step=emr_code.EMRStep('test-step', 'Jar', 'Main',
                                  ['Arg1', 'Arg2']),
        cluster_id='test-cluster-id',
        fail_chain=sfn.Fail(stack, 'test-fail'))

    print_and_assert(default_fragment_json, fragment)