Example #1
import time
from unittest.mock import MagicMock

import boto3
import pytest
from botocore.exceptions import ClientError

import emr_basics


# make_stubber, make_unique_name, and error_code are pytest fixtures supplied
# by the surrounding test suite.
def test_run_job_flow(make_stubber, make_unique_name, error_code):
    emr_client = boto3.client('emr')
    emr_stubber = make_stubber(emr_client)
    cluster_name = make_unique_name('cluster-')
    log_uri = 's3://test-bucket'
    release = 'emr-5.30.1'
    instance_type = 'm5.xlarge'
    instance_count = 3
    keep_alive = True
    steps = [{
        'name': make_unique_name('step-'),
        'script_uri': 's3://test-bucket',
        'script_args': ('--testing', )
    }]
    applications = ['test-app']
    cluster_id = 'i-123456789'
    # MagicMock objects stand in for the IAM roles and EC2 security groups
    # that the real demo creates.
    job_flow_role = MagicMock()
    job_flow_role.name = 'job-flow-role'
    service_role = MagicMock()
    service_role.name = 'service_role'
    security_groups = \
        {'manager': MagicMock(id='sg-1234'), 'worker': MagicMock(id='sg-5678')}

    # Stub the RunJobFlow call; when error_code is set, the stubber makes the
    # call raise a matching ClientError instead of returning the cluster ID.
    emr_stubber.stub_run_job_flow(cluster_name,
                                  log_uri,
                                  release,
                                  instance_type,
                                  instance_count,
                                  keep_alive,
                                  steps,
                                  applications,
                                  job_flow_role.name,
                                  service_role.name,
                                  security_groups,
                                  cluster_id,
                                  error_code=error_code)

    if error_code is None:
        got_id = emr_basics.run_job_flow(cluster_name, log_uri, keep_alive,
                                         applications, job_flow_role,
                                         service_role, security_groups, steps,
                                         emr_client)
        assert got_id == cluster_id
    else:
        with pytest.raises(ClientError) as exc_info:
            emr_basics.run_job_flow(cluster_name, log_uri, keep_alive,
                                    applications, job_flow_role, service_role,
                                    security_groups, steps, emr_client)
        assert exc_info.value.response['Error']['Code'] == error_code
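

# The fixtures used above (make_stubber, make_unique_name, error_code) are
# assumed to come from the example suite's shared test tools. A minimal sketch
# of two of them under that assumption; names and behavior here are
# illustrative, not the suite's actual implementation.
@pytest.fixture
def make_unique_name():
    # Return a factory that appends a nanosecond timestamp so resource names
    # don't collide between test runs.
    def _make_unique_name(prefix):
        return f'{prefix}{time.time_ns()}'
    return _make_unique_name


# Run each test once for the success path and once with a simulated failure.
@pytest.fixture(params=[None, 'TestException'])
def error_code(request):
    return request.param
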
def demo_long_lived_cluster():
    """
    Shows how to create a long-lived cluster that waits after all steps are run so
    that more steps can be run. At the end of the demo, the cluster is optionally
    terminated.
    """
    print('-' * 88)
    print(f"Welcome to the Amazon EMR long-lived cluster demo.")
    print('-' * 88)

    prefix = 'aws-demo-long-emr'

    s3_resource = boto3.resource('s3')
    iam_resource = boto3.resource('iam')
    emr_client = boto3.client('emr')
    ec2_resource = boto3.resource('ec2')

    # Set up resources for the demo.
    bucket_name = f'{prefix}-{time.time_ns()}'
    script_file_name = 'pyspark_top_product_keyword.py'
    script_key = f'scripts/{script_file_name}'
    bucket = setup_bucket(bucket_name, script_file_name, script_key,
                          s3_resource)
    job_flow_role, service_role = \
        create_roles(f'{prefix}-ec2-role', f'{prefix}-service-role', iam_resource)
    security_groups = create_security_groups(prefix, ec2_resource)
    print(
        "Wait for 10 seconds to give roles and profiles time to propagate...")
    time.sleep(10)

    max_tries = 5
    while True:
        try:
            cluster_id = emr_basics.run_job_flow(
                f'{prefix}-cluster', f's3://{bucket_name}/logs', True,
                ['Hadoop', 'Hive', 'Spark'], job_flow_role, service_role,
                security_groups, [], emr_client)
            print(f"Running job flow for cluster {cluster_id}...")
            break
        except ClientError as error:
            max_tries -= 1
            if max_tries > 0 and \
                    error.response['Error']['Code'] == 'ValidationException':
                print(
                    "Instance profile is not ready, let's give it more time..."
                )
                time.sleep(10)
            else:
                raise
    # status_poller is a helper defined alongside this demo; a sketch of it
    # appears after this function.
    status_poller(
        "Waiting for cluster, this typically takes several minutes...",
        'WAITING',
        lambda: emr_basics.describe_cluster(cluster_id, emr_client)['Status']['State'])

    # Queue two sample queries as EMR steps: the top 20 books matching 'fire'
    # and the top 20 grocery products matching 'cheese'.
    add_top_product_step('20', 'Books', 'fire', cluster_id, bucket, script_key,
                         emr_client)

    add_top_product_step('20', 'Grocery', 'cheese', cluster_id, bucket,
                         script_key, emr_client)

    # List the category folders in the public review dataset. Each common
    # prefix looks like 'parquet/product_category=Books/'; keep just 'Books'.
    review_bucket_folders = s3_resource.meta.client.list_objects_v2(
        Bucket='amazon-reviews-pds',
        Prefix='parquet/',
        Delimiter='/',
        MaxKeys=100)
    categories = [
        cat['Prefix'].split('=')[1][:-1]
        for cat in review_bucket_folders['CommonPrefixes']
    ]
    while True:
        while True:
            input_cat = input(
                f"Your turn! Possible categories are: {categories}. Which category "
                f"would you like to search (enter 'none' when you're done)? ")
            if input_cat.lower() == 'none' or input_cat in categories:
                break
            print(f"Sorry, {input_cat} is not an allowed category!")
        if input_cat.lower() == 'none':
            break
        else:
            input_keyword = input(
                "What keyword would you like to search for? ")
            input_count = input("How many items would you like to list? ")
            add_top_product_step(input_count, input_cat, input_keyword,
                                 cluster_id, bucket, script_key, emr_client)

    # Clean up demo resources (if you want to).
    remove_everything = input(
        "Do you want to terminate the cluster and delete the security roles, "
        "groups, bucket, and all of its contents (y/n)? ")
    if remove_everything.lower() == 'y':
        emr_basics.terminate_cluster(cluster_id, emr_client)
        status_poller(
            "Waiting for cluster to terminate.",
            'TERMINATED', lambda: emr_basics.describe_cluster(
                cluster_id, emr_client)['Status']['State'])
        delete_security_groups(security_groups)
        delete_roles([job_flow_role, service_role])
        delete_bucket(bucket)
    else:
        print(
            "Remember that running Amazon EMR clusters and objects kept in an "
            "Amazon S3 bucket can incur charges against your account.")
    print("Thanks for watching!")
def demo_short_lived_cluster():
    """
    Shows how to create a short-lived cluster that runs a step and automatically
    terminates after the step completes.
    """
    print('-' * 88)
    print(f"Welcome to the Amazon EMR short-lived cluster demo.")
    print('-' * 88)

    prefix = 'aws-demo-short-emr'

    s3_resource = boto3.resource('s3')
    iam_resource = boto3.resource('iam')
    emr_client = boto3.client('emr')
    ec2_resource = boto3.resource('ec2')

    # Set up resources for the demo.
    bucket_name = f'{prefix}-{time.time_ns()}'
    script_file_name = 'pyspark_estimate_pi.py'
    script_key = f'scripts/{script_file_name}'
    bucket = setup_bucket(bucket_name, script_file_name, script_key,
                          s3_resource)
    job_flow_role, service_role = create_roles(f'{prefix}-ec2-role',
                                               f'{prefix}-service-role',
                                               iam_resource)
    security_groups = create_security_groups(prefix, ec2_resource)

    # Run the job.
    output_prefix = 'pi-calc-output'
    pi_step = {
        'name': 'estimate-pi-step',
        'script_uri': f's3://{bucket_name}/{script_key}',
        'script_args': [
            '--partitions', '3',
            '--output_uri', f's3://{bucket_name}/{output_prefix}']
    }
    print(
        "Wait for 10 seconds to give roles and profiles time to propagate...")
    time.sleep(10)
    max_tries = 5
    while True:
        try:
            cluster_id = emr_basics.run_job_flow(
                f'{prefix}-cluster', f's3://{bucket_name}/logs', False,
                ['Hadoop', 'Hive', 'Spark'], job_flow_role, service_role,
                security_groups, [pi_step], emr_client)
            print(f"Running job flow for cluster {cluster_id}...")
            break
        except ClientError as error:
            max_tries -= 1
            if max_tries > 0 and \
                    error.response['Error']['Code'] == 'ValidationException':
                print(
                    "Instance profile is not ready, let's give it more time..."
                )
                time.sleep(10)
            else:
                raise

    status_poller(
        "Waiting for cluster, this typically takes several minutes...",
        'RUNNING',
        lambda: emr_basics.describe_cluster(cluster_id, emr_client)['Status']['State'])
    status_poller(
        "Waiting for step to complete...",
        'PENDING',
        lambda: emr_basics.list_steps(cluster_id, emr_client)[0]['Status']['State'])
    status_poller(
        "Waiting for cluster to terminate.",
        'TERMINATED',
        lambda: emr_basics.describe_cluster(cluster_id, emr_client)['Status']['State'])

    print(f"Job complete!. The script, logs, and output for this demo are in "
          f"Amazon S3 bucket {bucket_name}. The output is:")
    for obj in bucket.objects.filter(Prefix=output_prefix):
        print(obj.get()['Body'].read().decode())

    # Clean up demo resources (if you want to).
    remove_everything = input(
        "Do you want to delete the security roles, groups, and bucket (y/n)? ")
    if remove_everything.lower() == 'y':
        delete_security_groups(security_groups)
        delete_roles([job_flow_role, service_role])
        delete_bucket(bucket)
    else:
        print(
            "Remember that objects kept in an Amazon S3 bucket can incur "
            "charges against your account.")
    print("Thanks for watching!")
Example #4
import time

import boto3
from botocore.exceptions import ClientError

import emr_basics


# create_roles, create_security_groups, delete_security_groups, delete_roles,
# and status_poller are helper functions assumed to be defined alongside this
# demo, as in the examples above.
def demo_short_lived_cluster(PREFIX, S3_BUCKET, S3_KEY, LOCAL_SCRIPT_KEY,
                             REGION, AWS_ACCESS_KEY, AWS_SECRET):
    """
    Create a short-lived cluster that runs a step and automatically
    terminates after the step completes.

    :param PREFIX: The prefix to use when creating the EMR cluster and
                   security groups.
    :param S3_BUCKET: The name of the S3 bucket used.
    :param S3_KEY: The key where the PySpark script will be uploaded.
    :param LOCAL_SCRIPT_KEY: The local path of the PySpark script.
    :param REGION: The AWS Region to create resources in.
    :param AWS_ACCESS_KEY: The AWS access key ID used to create the clients.
    :param AWS_SECRET: The AWS secret access key used to create the clients.
    """
    print('-' * 88)
    print("Welcome to the Amazon EMR short-lived cluster.")
    print('-' * 88)

    prefix = PREFIX

    # Build one session from the supplied credentials and derive all clients
    # and resources from it.
    session = boto3.Session(region_name=REGION,
                            aws_access_key_id=AWS_ACCESS_KEY,
                            aws_secret_access_key=AWS_SECRET)
    s3_resource = session.resource('s3')
    iam_resource = session.resource('iam')
    emr_client = session.client('emr')
    ec2_resource = session.resource('ec2')

    S3_URI = f's3://{S3_BUCKET}/{S3_KEY}'

    # Upload the PySpark script to Amazon S3.
    s3_resource.meta.client.upload_file(LOCAL_SCRIPT_KEY, S3_BUCKET, S3_KEY)

    # Set up resources.
    bucket_name = S3_BUCKET

    # Use the local time to make the role and security group names unique.
    named_tuple = time.localtime()
    time_string = time.strftime("%m.%d.%Y-%Hh%Mm%Ss", named_tuple)

    job_flow_role, service_role = create_roles(
        f'{time_string}-{prefix}-ec2-role',
        f'{time_string}-{prefix}-service-role', iam_resource)

    security_groups = create_security_groups(f'{time_string}-{prefix}',
                                             ec2_resource)

    # Run the job.
    step = {'name': 'pyspark_test', 'script_uri': S3_URI, 'script_args': []}
    print(
        "Wait for 10 seconds to give roles and profiles time to propagate...")
    time.sleep(10)
    max_tries = 5
    while True:
        try:
            cluster_id = emr_basics.run_job_flow(
                f'{prefix}-cluster', f's3://{bucket_name}/logs', False,
                ['Hadoop', 'Hive', 'Spark'], job_flow_role, service_role,
                security_groups, [step], emr_client)
            print(f"Running job flow for cluster {cluster_id}...")
            break
        except ClientError as error:
            max_tries -= 1
            if max_tries > 0 and \
                    error.response['Error']['Code'] == 'ValidationException':
                print(
                    "Instance profile is not ready, let's give it more time..."
                )
                time.sleep(10)
            else:
                raise

    status_poller(
        "Waiting for cluster, this typically takes several minutes...",
        'RUNNING',
        lambda: emr_basics.describe_cluster(cluster_id, emr_client)['Status']['State'])
    status_poller(
        "Waiting for step to complete...",
        'PENDING',
        lambda: emr_basics.list_steps(cluster_id, emr_client)[0]['Status']['State'])
    status_poller(
        "Waiting for cluster to terminate.",
        'TERMINATED',
        lambda: emr_basics.describe_cluster(cluster_id, emr_client)['Status']['State'])

    print(f"Job complete!. The script, logs, and output are in "
          f"Amazon S3 bucket {bucket_name}/logs.")

    delete_security_groups(security_groups)
    delete_roles([job_flow_role, service_role])


# Example invocation; the Region and credential values below are placeholders.
#if __name__ == '__main__':
#    logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
#    demo_short_lived_cluster(
#        'CS_GO_PIPELINE', 'fpmacedo', 'spark/pyspark_script.py',
#        'pyspark_script.py', 'us-east-1', 'YOUR_ACCESS_KEY', 'YOUR_SECRET_KEY')
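
# Passing raw keys around works, but boto3 can also resolve credentials from
# environment variables, the shared config file, or an instance role. A sketch
# of the same client setup using a named profile; the profile name and Region
# here are assumptions:
#
#   session = boto3.Session(profile_name='default', region_name='us-east-1')
#   emr_client = session.client('emr')
#   s3_resource = session.resource('s3')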