Example #1
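All five layout generators below rely on the same names; a minimal set of imports (assumed, since the snippets omit them) would be:

# Assumed imports: the snippets below rely on these names.
import boto3
from botocore.exceptions import ClientError

# Blueprint layout classes from the AWS Glue blueprint library.
from awsglue.blueprint.workflow import Workflow, Entities
from awsglue.blueprint.job import Job
from awsglue.blueprint.crawler import Crawler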
def generate_layout(user_params, system_params):
    file_name = "compaction_{0}_{1}.py".format(
        user_params['SourceDatabaseName'], user_params['SourceTableName'])

    session = boto3.Session(region_name=system_params['region'])
    glue = session.client('glue')
    s3_client = session.client('s3')

    workflow_name = user_params['WorkflowName']

    # Validate params (validate_params is assumed to be defined elsewhere in the blueprint)
    validate_params(user_params, system_params)

    # Create Source Database if it does not exist
    try:
        glue.create_database(
            DatabaseInput={'Name': user_params['SourceDatabaseName']})
        print("New database is created.")
    except glue.exceptions.AlreadyExistsException:
        print("Existing database is used.")

    # us-east-1 rejects an explicit LocationConstraint, so build the
    # CreateBucketConfiguration kwargs conditionally (as Example #5 below does).
    if system_params['region'] == "us-east-1":
        bucket_config = {}
    else:
        bucket_config = {
            'CreateBucketConfiguration': {
                'LocationConstraint': system_params['region']
            }
        }

    # Creating script bucket
    the_script_bucket = f"aws-glue-scripts-{system_params['accountId']}-{system_params['region']}"
    try:
        s3_client.head_bucket(Bucket=the_script_bucket)
        print("Script bucket already exists: ", the_script_bucket)
    except ClientError as ce:
        print(ce)
        print(ce.response['ResponseMetadata'])
        print("Creating script bucket: ", the_script_bucket)
        s3_client.create_bucket(Bucket=the_script_bucket, **bucket_config)

    # Creating temp bucket
    the_temp_bucket = f"aws-glue-temporary-{system_params['accountId']}-{system_params['region']}"
    the_temp_prefix = f"{workflow_name}/"
    the_temp_location = f"s3://{the_temp_bucket}/{the_temp_prefix}"
    try:
        s3_client.head_bucket(Bucket=the_temp_bucket)
        print("Temp bucket already exists: ", the_temp_bucket)
    except ClientError as ce:
        print(ce)
        print(ce.response['ResponseMetadata'])
        print("Creating temp bucket: ", the_temp_bucket)
        s3_client.create_bucket(Bucket=the_temp_bucket, **bucket_config)

    # Creating manifest bucket
    if user_params['EnableManifest']:
        the_manifest_bucket = f"aws-glue-blueprint-compaction-manifest-{system_params['accountId']}-{system_params['region']}"
        the_manifest_prefix = f"{workflow_name}/"
        the_manifest_location = f"s3://{the_manifest_bucket}/{the_manifest_prefix}"
        try:
            s3_client.head_bucket(Bucket=the_manifest_bucket)
            print("Manifest bucket already exists: ", the_manifest_bucket)
        except ClientError as ce:
            print(ce)
            print(ce.response['ResponseMetadata'])
            print("Creating Manifest bucket: ", the_manifest_bucket)
            s3_client.create_bucket(Bucket=the_manifest_bucket, **bucket_config)

    # Upload job script to script bucket
    the_script_key = f"{workflow_name}/{file_name}"
    the_script_location = f"s3://{the_script_bucket}/{the_script_key}"
    with open("compaction/compaction.py", "rb") as f:
        s3_client.upload_fileobj(f, the_script_bucket, the_script_key)

    jobs = []
    crawlers = []

    command = {
        "Name": "glueetl",
        "ScriptLocation": the_script_location,
        "PythonVersion": "3"
    }
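    # DefaultArguments mixes Glue-reserved options (--TempDir, --enable-*,
    # --job-*) with script-specific parameters read by the compaction script.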
    arguments = {
        "--region": system_params['region'],
        "--TempDir": the_temp_location,
        "--job-bookmark-option": "job-bookmark-disable",
        "--job-language": "python",
        "--enable-s3-parquet-optimized-committer": "",
        "--enable-rename-algorithm-v2": "",
        "--enable-metrics": "",
        "--enable-continuous-cloudwatch-log": "true",
        "--enable_size_control": user_params['EnableSizeControl'],
        "--input_database": user_params['SourceDatabaseName'],
        "--input_table": user_params['SourceTableName'],
        "--input_format": user_params['InputDataFormat'],
        "--output_path": user_params['OutputDataLocation'],
        "--desired_size_mb": user_params['DesiredFileSizeMB'],
        "--enable_manifest": user_params['EnableManifest']
    }
    if user_params['InputDataFormatOptions']:
        arguments["--input_format_options"] = user_params['InputDataFormatOptions']
    if user_params['EnableManifest']:
        arguments["--manifest_path"] = the_manifest_location

    crawler_source = None

    try:
        # Get the source table definition and validate the parameters with it.
        src_table = glue.get_table(
            DatabaseName=user_params['SourceDatabaseName'],
            Name=user_params['SourceTableName'])
        src_location = src_table['Table']['StorageDescriptor']['Location']
        if src_location == user_params['OutputDataLocation']:
            err_msg = 'Location of the source table is the same as OutputDataLocation.'
            raise ClientError(
                {
                    "Error": {
                        "Code": "InvalidInputException",
                        "Message": err_msg
                    }
                }, 'validate_params')
        if user_params['InputDataLocation'] and user_params['InputDataLocation'] != "" \
                and src_table['Table']['StorageDescriptor']['Location'] != user_params['InputDataLocation']:
            err_msg = 'Location on the source table is different from InputDataLocation.'
            raise ClientError(
                {
                    "Error": {
                        "Code": "InvalidInputException",
                        "Message": err_msg
                    }
                }, 'validate_params')
        print("Existing table is used.")

    except glue.exceptions.EntityNotFoundException:
        if user_params['InputDataLocation']:
            # Create a new source table if it does not exist
            glue.create_table(DatabaseName=user_params['SourceDatabaseName'],
                              TableInput={
                                  'Name': user_params['SourceTableName'],
                                  'StorageDescriptor': {
                                      'Location':
                                      user_params['InputDataLocation']
                                  }
                              })
            print("New table is created.")
        else:
            err_msg = 'Source table does not exist, and input data location is not provided.'
            raise ClientError(
                {
                    "Error": {
                        "Code": "InvalidInputException",
                        "Message": err_msg
                    }
                }, 'validate_params')

    if user_params['InputDataLocation']:
        targets_source = {
            "CatalogTargets": [{
                "DatabaseName": user_params['SourceDatabaseName'],
                "Tables": [user_params['SourceTableName']]
            }]
        }
        crawler_source = Crawler(
            Name="{}_crawler_source".format(workflow_name),
            Role=user_params['IAMRole'],
            Grouping={"TableGroupingPolicy": "CombineCompatibleSchemas"},
            Targets=targets_source,
            SchemaChangePolicy={"DeleteBehavior": "LOG"},
        )
        crawlers.append(crawler_source)

    job_kwargs = dict(
        Name="{0}_compaction_{1}_{2}".format(
            workflow_name, user_params['SourceDatabaseName'],
            user_params['SourceTableName']),
        Command=command,
        Role=user_params['IAMRole'],
        DefaultArguments=arguments,
        WorkerType="G.1X",
        NumberOfWorkers=user_params['NumberOfWorkers'],
        GlueVersion="2.0")
    if crawler_source:
        # Run the compaction job only after the source crawler succeeds.
        job_kwargs['DependsOn'] = {crawler_source: "SUCCEEDED"}
    transform_job = Job(**job_kwargs)

    jobs.append(transform_job)

    # Create destination database if it does not exist
    try:
        glue.create_database(
            DatabaseInput={'Name': user_params['DestinationDatabaseName']})
        print("New database is created.")
    except glue.exceptions.AlreadyExistsException:
        print("Existing database is used.")

    try:
        # Get the destination table and validate the parameters with it.
        dst_table = glue.get_table(
            DatabaseName=user_params['DestinationDatabaseName'],
            Name=user_params['DestinationTableName'])
        if dst_table['Table']['StorageDescriptor']['Location'] != user_params[
                'OutputDataLocation']:
            err_msg = 'Location of the destination table is different from OutputDataLocation.'
            raise ClientError(
                {
                    "Error": {
                        "Code": "InvalidInputException",
                        "Message": err_msg
                    }
                }, 'validate_params')
        print("Existing table is used.")
    except glue.exceptions.EntityNotFoundException:
        # Create destination table if it does not exist
        glue.create_table(DatabaseName=user_params['DestinationDatabaseName'],
                          TableInput={
                              'Name': user_params['DestinationTableName'],
                              'StorageDescriptor': {
                                  'Location': user_params['OutputDataLocation']
                              }
                          })
        print("New table is created.")

    targets_destination = {
        "CatalogTargets": [{
            "DatabaseName": user_params['DestinationDatabaseName'],
            "Tables": [user_params['DestinationTableName']]
        }]
    }
    crawler_destination = Crawler(
        Name="{}_crawler_destination".format(workflow_name),
        Role=user_params['IAMRole'],
        Targets=targets_destination,
        SchemaChangePolicy={"DeleteBehavior": "LOG"},
        DependsOn={transform_job: "SUCCEEDED"})
    crawlers.append(crawler_destination)

    if user_params['Frequency']:
        if user_params['Frequency'] == 'Custom':
            schedule = user_params['FrequencyCronFormat']
        else:
            schedule = generate_schedule(user_params['Frequency'])
    else:
        schedule = None
    workflow = Workflow(Name=workflow_name,
                        Entities=Entities(Jobs=jobs, Crawlers=crawlers),
                        OnSchedule=schedule)

    return workflow
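Examples #1 and #4 call generate_schedule, which is defined elsewhere in the blueprint. A minimal sketch, assuming the helper simply maps a frequency keyword to a Glue cron expression (the keywords 'Hourly', 'Daily', 'Weekly', 'Monthly' are illustrative):

def generate_schedule(frequency):
    # Glue triggers accept cron expressions of the form
    # cron(Minutes Hours Day-of-month Month Day-of-week Year).
    schedules = {
        'Hourly': 'cron(0 * * * ? *)',
        'Daily': 'cron(0 0 * * ? *)',
        'Weekly': 'cron(0 0 ? * SUN *)',
        'Monthly': 'cron(0 0 1 * ? *)'
    }
    return schedules[frequency]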
Example #2
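This example reports progress through a module-level logger and delegates bucket creation to a create_s3_bucket_if_needed helper (sketched after the function). The logger setup is assumed to look like:

import logging

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)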
def generate_layout(user_params, system_params):
    file_name = f"custom_connection_to_catalog_{user_params['SourceConnectionName']}.py"

    session = boto3.Session(region_name=system_params['region'])
    glue = session.client('glue')
    s3_client = session.client('s3')

    workflow_name = user_params['WorkflowName']

    # Create Database if it does not exist
    try:
        glue.create_database(
            DatabaseInput={'Name': user_params['DestinationDatabaseName']})
        logger.info("New database is created.")
    except glue.exceptions.AlreadyExistsException:
        logger.info("Existing database is used.")

    # Creating script bucket
    the_script_bucket = f"aws-glue-scripts-{system_params['accountId']}-{system_params['region']}"
    create_s3_bucket_if_needed(s3_client, the_script_bucket,
                               system_params['region'])

    # Creating temp bucket
    the_temp_bucket = f"aws-glue-temporary-{system_params['accountId']}-{system_params['region']}"
    create_s3_bucket_if_needed(s3_client, the_temp_bucket,
                               system_params['region'])
    the_temp_prefix = f"{workflow_name}/"
    the_temp_location = f"s3://{the_temp_bucket}/{the_temp_prefix}"

    # Upload job script to script bucket
    the_script_key = f"{workflow_name}/{file_name}"
    the_script_location = f"s3://{the_script_bucket}/{the_script_key}"
    with open("custom_connection_to_catalog/custom_connection_to_catalog.py",
              "rb") as f:
        s3_client.upload_fileobj(f, the_script_bucket, the_script_key)

    jobs = []
    crawlers = []

    command = {
        "Name": "glueetl",
        "ScriptLocation": the_script_location,
        "PythonVersion": "3"
    }
    arguments = {
        "--region": system_params['region'],
        "--TempDir": the_temp_location,
        "--job-bookmark-option": "job-bookmark-disable",
        "--job-language": "python",
        "--enable-s3-parquet-optimized-committer": "",
        "--enable-rename-algorithm-v2": "",
        "--enable-metrics": "",
        "--enable-continuous-cloudwatch-log": "true",
        "--source_connection": user_params['SourceConnectionName'],
        "--show_tables_query_string": user_params['ShowTablesQueryString'],
        "--output_database": user_params['DestinationDatabaseName']
    }
    if user_params['DestinationTableNamePrefix']:
        arguments["--output_table_prefix"] = user_params['DestinationTableNamePrefix']

    transform_job = Job(
        Name=f"{workflow_name}_custom_connection_to_catalog_{user_params['SourceConnectionName']}",
        Command=command,
        Role=user_params['GlueExecutionRole'],
        DefaultArguments=arguments,
        Connections={"Connections": [user_params['SourceConnectionName']]},
        WorkerType="G.1X",
        NumberOfWorkers=user_params['NumberOfWorkers'],
        GlueVersion="2.0")

    jobs.append(transform_job)

    workflow = Workflow(Name=workflow_name,
                        Entities=Entities(Jobs=jobs, Crawlers=crawlers))

    return workflow
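create_s3_bucket_if_needed is not shown in the snippet; a sketch that follows the same head_bucket/create_bucket pattern as the other examples, including the us-east-1 special case from Example #5:

def create_s3_bucket_if_needed(s3_client, bucket_name, region):
    # head_bucket raises ClientError (404) when the bucket does not exist
    # (or 403 when it exists but is not accessible).
    try:
        s3_client.head_bucket(Bucket=bucket_name)
        logger.info("Bucket already exists: %s", bucket_name)
    except ClientError:
        logger.info("Creating bucket: %s", bucket_name)
        if region == "us-east-1":
            # us-east-1 rejects an explicit LocationConstraint.
            s3_client.create_bucket(Bucket=bucket_name)
        else:
            s3_client.create_bucket(
                Bucket=bucket_name,
                CreateBucketConfiguration={'LocationConstraint': region})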
Example #3
def generate_layout(user_params, system_params):
    file_name = f"s3_to_dynamodb_{user_params['DynamoDbTableName']}.py"

    session = boto3.Session(region_name=system_params['region'])
    glue = session.client('glue')
    s3_client = session.client('s3')

    workflow_name = user_params['WorkflowName']

    # us-east-1 rejects an explicit LocationConstraint, so build the
    # CreateBucketConfiguration kwargs conditionally.
    if system_params['region'] == "us-east-1":
        bucket_config = {}
    else:
        bucket_config = {
            'CreateBucketConfiguration': {
                'LocationConstraint': system_params['region']
            }
        }

    # Creating script bucket
    the_script_bucket = f"aws-glue-scripts-{system_params['accountId']}-{system_params['region']}"
    try:
        s3_client.head_bucket(Bucket=the_script_bucket)
        print("Script bucket already exists: ", the_script_bucket)
    except ClientError as ce:
        print(ce)
        print(ce.response['ResponseMetadata'])
        print("Creating script bucket: ", the_script_bucket)
        s3_client.create_bucket(Bucket=the_script_bucket, **bucket_config)

    # Creating temp bucket
    the_temp_bucket = f"aws-glue-temporary-{system_params['accountId']}-{system_params['region']}"
    the_temp_prefix = f"{workflow_name}/"
    the_temp_location = f"s3://{the_temp_bucket}/{the_temp_prefix}"
    try:
        s3_client.head_bucket(Bucket=the_temp_bucket)
        print("Temp bucket already exists: ", the_temp_bucket)
    except ClientError as ce:
        print(ce)
        print(ce.response['ResponseMetadata'])
        print("Creating temp bucket: ", the_temp_bucket)
        s3_client.create_bucket(Bucket=the_temp_bucket, **bucket_config)

    # Upload job script to script bucket
    the_script_key = f"{workflow_name}/{file_name}"
    the_script_location = f"s3://{the_script_bucket}/{the_script_key}"
    with open("s3_to_dynamodb/s3_to_dynamodb.py", "rb") as f:
        s3_client.upload_fileobj(f, the_script_bucket, the_script_key)

    jobs = []
    crawlers = []

    command = {
        "Name": "glueetl",
        "ScriptLocation": the_script_location,
        "PythonVersion": "3"
    }
    arguments = {
        "--region": system_params['region'],
        "--TempDir": the_temp_location,
        "--job-bookmark-option": "job-bookmark-disable",
        "--job-language": "python",
        "--enable-s3-parquet-optimized-committer": "",
        "--enable-rename-algorithm-v2": "",
        "--enable-metrics": "",
        "--enable-continuous-cloudwatch-log": "true",
        "--input_path": user_params['InputDataLocation'],
        "--input_format": user_params['InputDataFormat'],
        "--dynamodb_table": user_params['DynamoDbTableName'],
        "--dynamodb_write_throughput_percent": user_params['DynamoDbWriteThroughputPercent']
    }
    if user_params['InputDataFormatOptions']:
        arguments["--input_format_options"] = user_params['InputDataFormatOptions']

    transform_job = Job(
        Name=f"{workflow_name}_s3_to_dynamodb_{user_params['DynamoDbTableName']}",
        Command=command,
        Role=user_params['IAMRole'],
        DefaultArguments=arguments,
        WorkerType="G.1X",
        NumberOfWorkers=user_params['NumberOfWorkers'],
        GlueVersion="2.0")

    jobs.append(transform_job)

    workflow = Workflow(Name=workflow_name,
                        Entities=Entities(Jobs=jobs, Crawlers=crawlers))

    return workflow
Example #4
def generate_layout(user_params, system_params):
    file_name = "conversion_{0}_{1}.py".format(
        user_params['DestinationDatabaseName'],
        user_params['DestinationTableName'])

    session = boto3.Session(region_name=system_params['region'])
    glue = session.client('glue')
    s3_client = session.client('s3')

    workflow_name = user_params['WorkflowName']

    # Create Database if it does not exist
    try:
        glue.create_database(
            DatabaseInput={'Name': user_params['DestinationDatabaseName']})
        print("New database is created.")
    except glue.exceptions.AlreadyExistsException:
        print("Existing database is used.")

    # us-east-1 rejects an explicit LocationConstraint, so build the
    # CreateBucketConfiguration kwargs conditionally.
    if system_params['region'] == "us-east-1":
        bucket_config = {}
    else:
        bucket_config = {
            'CreateBucketConfiguration': {
                'LocationConstraint': system_params['region']
            }
        }

    # Creating script bucket
    the_script_bucket = f"aws-glue-scripts-{system_params['accountId']}-{system_params['region']}"
    try:
        s3_client.head_bucket(Bucket=the_script_bucket)
        print("Script bucket already exists: ", the_script_bucket)
    except ClientError as ce:
        print(ce)
        print(ce.response['ResponseMetadata'])
        print("Creating script bucket: ", the_script_bucket)
        s3_client.create_bucket(Bucket=the_script_bucket, **bucket_config)

    # Creating temp bucket
    the_temp_bucket = f"aws-glue-temporary-{system_params['accountId']}-{system_params['region']}"
    the_temp_prefix = f"{workflow_name}/"
    the_temp_location = f"s3://{the_temp_bucket}/{the_temp_prefix}"
    try:
        s3_client.head_bucket(Bucket=the_temp_bucket)
        print("Temp bucket already exists: ", the_temp_bucket)
    except ClientError as ce:
        print(ce)
        print(ce.response['ResponseMetadata'])
        print("Creating temp bucket: ", the_temp_bucket)
        s3_client.create_bucket(Bucket=the_temp_bucket, **bucket_config)

    # Upload job script to script bucket
    the_script_key = f"{workflow_name}/{file_name}"
    the_script_location = f"s3://{the_script_bucket}/{the_script_key}"
    with open("conversion/conversion.py", "rb") as f:
        s3_client.upload_fileobj(f, the_script_bucket, the_script_key)

    # Structure workflow
    jobs = []
    crawlers = []

    # Create tmp table if it does not exist
    try:
        glue.create_table(DatabaseName=user_params['DestinationDatabaseName'],
                          TableInput={
                              'Name': "source_" + user_params['DestinationTableName'],
                              'StorageDescriptor': {
                                  'Location': user_params['InputDataLocation']
                              },
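                              # groupFiles/groupSize tell Glue to combine small
                              # input files into ~32 MB groups when reading.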
                              'Parameters': {
                                  'groupFiles': 'inPartition',
                                  'groupSize': '33554432',
                              }
                          })
        print("New table is created.")
    except glue.exceptions.AlreadyExistsException:
        print("Existing table is used.")

    targets = {
        "CatalogTargets": [{
            "DatabaseName": user_params['DestinationDatabaseName'],
            "Tables": ["source_" + user_params['DestinationTableName']]
        }]
    }

    crawler = Crawler(
        Name="{}_crawler".format(workflow_name),
        Role=user_params['IAMRole'],
        Targets=targets,
        Grouping={"TableGroupingPolicy": "CombineCompatibleSchemas"},
        SchemaChangePolicy={"DeleteBehavior": "LOG"},
    )
    crawlers.append(crawler)

    command = {
        "Name": "glueetl",
        "ScriptLocation": the_script_location,
        "PythonVersion": "3"
    }
    arguments = {
        "--TempDir": the_temp_location,
        "--job-bookmark-option": "job-bookmark-enable",
        "--job-language": "python",
        "--enable-s3-parquet-optimized-committer": "",
        "--enable-rename-algorithm-v2": "",
        "--enable-metrics": "",
        "--enable-continuous-cloudwatch-log": "true",
        "--output_database": user_params['DestinationDatabaseName'],
        "--tmp_table": "source_" + user_params['DestinationTableName'],
        "--output_table": user_params['DestinationTableName'],
        "--output_path": user_params['OutputDataLocation']
    }

    transform_job = Job(Name="{0}_conversion_{1}_{2}".format(
        workflow_name, user_params['DestinationDatabaseName'],
        user_params['DestinationTableName']),
                        Command=command,
                        Role=user_params['IAMRole'],
                        DefaultArguments=arguments,
                        WorkerType="G.1X",
                        NumberOfWorkers=user_params['NumberOfWorkers'],
                        GlueVersion="2.0",
                        DependsOn={crawler: "SUCCEEDED"})

    jobs.append(transform_job)

    if user_params['Frequency']:
        if user_params['Frequency'] == 'Custom':
            schedule = user_params['FrequencyCronFormat']
        else:
            schedule = generate_schedule(user_params['Frequency'])
    else:
        schedule = None

    workflow = Workflow(Name=workflow_name,
                        Entities=Entities(Jobs=jobs, Crawlers=crawlers),
                        OnSchedule=schedule)

    return workflow
Example #5
def generate_layout(user_params, system_params):
    file_name = f"standard_table_to_governed_{user_params['SourceDatabaseName']}_{user_params['SourceTableName']}.py"

    session = boto3.Session(region_name=system_params['region'])
    glue = session.client('glue')
    s3_client = session.client('s3')

    workflow_name = user_params['WorkflowName']

    # Create Database if it does not exist
    try:
        glue.create_database(
            DatabaseInput={
                'Name': user_params['DestinationDatabaseName']
            }
        )
        print("New database is created.")
    except glue.exceptions.AlreadyExistsException:
        print("Existing database is used.")

    location = {'LocationConstraint': system_params['region']}
    # Creating script bucket
    the_script_bucket = f"aws-glue-scripts-{system_params['accountId']}-{system_params['region']}"
    try:
        s3_client.head_bucket(Bucket=the_script_bucket)
        print("Script bucket already exists: ", the_script_bucket)
    except ClientError as ce:
        print(ce)
        print(ce.response['ResponseMetadata'])
        print("Creating script bucket: ", the_script_bucket)
        if system_params['region'] == "us-east-1":
            bucket = s3_client.create_bucket(Bucket=the_script_bucket)
        else:
            bucket = s3_client.create_bucket(Bucket=the_script_bucket, CreateBucketConfiguration=location)

    # Creating temp bucket
    the_temp_bucket = f"aws-glue-temporary-{system_params['accountId']}-{system_params['region']}"
    the_temp_prefix = f"{workflow_name}/"
    the_temp_location = f"s3://{the_temp_bucket}/{the_temp_prefix}"
    try:
        s3_client.head_bucket(Bucket=the_temp_bucket)
        print("Temp bucket already exists: ", the_temp_bucket)
    except ClientError as ce:
        print(ce)
        print(ce.response['ResponseMetadata'])
        print("Creating temp bucket: ", the_temp_bucket)
        if system_params['region'] == "us-east-1":
            bucket = s3_client.create_bucket(Bucket=the_temp_bucket)
        else:
            bucket = s3_client.create_bucket(Bucket=the_temp_bucket, CreateBucketConfiguration=location)

    # Upload job script to script bucket
    the_script_key = f"{workflow_name}/{file_name}"
    the_script_location = f"s3://{the_script_bucket}/{the_script_key}"
    with open("standard_table_to_governed/standard_table_to_governed.py", "rb") as f:
        s3_client.upload_fileobj(f, the_script_bucket, the_script_key)

    jobs = []
    crawlers = []

    command = {
        "Name": "glueetl",
        "ScriptLocation": the_script_location,
        "PythonVersion": "3"
    }
    arguments = {
        "--region": system_params['region'],
        "--TempDir": the_temp_location,
        "--job-bookmark-option": "job-bookmark-enable",
        "--job-language": "python",
        "--enable-s3-parquet-optimized-committer": "",
        "--enable-rename-algorithm-v2": "",
        "--enable-metrics": "",
        "--enable-continuous-cloudwatch-log": "true",
        "--input_database": user_params['SourceDatabaseName'],
        "--input_table": user_params['SourceTableName'],
        "--output_database": user_params['DestinationDatabaseName'],
        "--output_table": user_params['DestinationTableName'],
        "--output_path": user_params['OutputDataLocation']
    }

    transform_job = Job(
        Name=f"{workflow_name}_standard_table_to_governed_{user_params['SourceDatabaseName']}_{user_params['SourceTableName']}",
        Command=command,
        Role=user_params['GlueExecutionRole'],
        DefaultArguments=arguments,
        WorkerType="G.1X",
        NumberOfWorkers=user_params['NumberOfWorkers'],
        GlueVersion="2.0"
    )

    jobs.append(transform_job)

    workflow = Workflow(Name=workflow_name, Entities=Entities(Jobs=jobs, Crawlers=crawlers))

    return workflow
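In the AWS Glue blueprint samples, functions like these generate_layout definitions serve as the blueprint entry point: the blueprint's blueprint.cfg names the function through its layoutGenerator field, and Glue invokes it with the user-supplied parameters plus system parameters (such as region and accountId, as used above) to materialize the returned Workflow.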