Example #1
 def _create_glue_job(
     self,
     context: DataJobContext,
     glue_job_name: str,
     s3_url_glue_job: str = None,
     arguments: dict = None,
     job_type: str = "pythonshell",
     python_version: str = "3",
     glue_version: str = None,
     max_capacity: int = None,
     worker_type: str = None,
     number_of_workers: int = None,
     *args,
     **kwargs,
 ) -> None:
     """Create a glue job with the necessary configuration like,
     paths to wheel and business logic and arguments"""
     logger.debug(f"creating Glue Job {glue_job_name}")
     default_arguments = None
     if context.s3_url_wheel:
         extra_py_files = {
             # path to the wheel of this project
             "--extra-py-files": context.s3_url_wheel
         }
         default_arguments = {**extra_py_files, **(arguments or {})}
     glue.CfnJob(
         self,
         id=glue_job_name,
         name=glue_job_name,
         role=self.role.role_arn,
         command=glue.CfnJob.JobCommandProperty(
             name=job_type,
             python_version=python_version,
             script_location=s3_url_glue_job,
         ),
         glue_version=glue_version,
         max_capacity=max_capacity,
         default_arguments=default_arguments,
         worker_type=worker_type,
         number_of_workers=number_of_workers,
         *args,
         **kwargs,
     )
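
A minimal sketch of how this helper might be invoked from the surrounding stack; the context object, S3 locations, and argument values below are assumptions for illustration only:

     # hypothetical call site, assuming a DataJobContext whose s3_url_wheel points at the project wheel
     self._create_glue_job(
         context=context,
         glue_job_name="my-pythonshell-job",
         s3_url_glue_job="s3://my-artifacts-bucket/scripts/job.py",  # assumed script location
         arguments={"--stage": "dev"},
         job_type="pythonshell",
         python_version="3",
         max_capacity=1,
     )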
Example #2
    def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
        super().__init__(scope, id, **kwargs)

        glue_job_role = iam.Role(
            self,
            "Glue-Job-Role",
            assumed_by=iam.ServicePrincipal("glue.amazonaws.com"),
            managed_policies=[
                iam.ManagedPolicy.from_aws_managed_policy_name(
                    "AmazonS3FullAccess")
            ],
        )

        job = glue.CfnJob(
            self,
            "glue-test-job",
            role=glue_job_role.role_arn,
            allocated_capacity=1,
            command=glue.CfnJob.JobCommandProperty(
                name="glueetl",
                script_location="s3://my-bucket/glue-scripts/job.scala"),
            glue_version="2.0",
        )
        #
        # file_asset = aws_s3_assets.Asset(self, "glue-asssets", path=os.path.join(ROOT_DIR, "glue"))
        # print(file_asset.bucket)
        bucket_glue = aws_s3.Bucket(self, "BucketGlue")
        # file_asset = aws_s3_assets.Asset(self, "GlueAssets", path="/Users/vincent/Workspace/python_lambda_iac_deployment/python_lambda_iac_deployment/glue/glue_job.py")
        aws_s3_deployment.BucketDeployment(
            self,
            "GlueJobDeployment",
            sources=[
                aws_s3_deployment.Source.asset(
                    "/Users/vincent/Workspace/python_lambda_iac_deployment/python_lambda_iac_deployment/glue"
                )
            ],
            destination_bucket=bucket_glue,
            destination_key_prefix="jobs")
Example #3
    def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
        super().__init__(scope, id, **kwargs)

        glue.CfnJob(
            scope=self,
            id=modname,
            command={
                'name': 'glueetl',
                'python_version': '3',
                'script_location': 's3://shaw-stc-edl-etl-config-playpen/test/glue/spark/__main__.py'
            },
            role=etl_role.role_arn,
            default_arguments={'--enable-glue-datacatalog': ''},
            allocated_capacity=10,
            description='Test Spark Glue ETL',
            glue_version='1.0',
            max_capacity=10,
            max_retries=0,
            number_of_workers=1,
            timeout=2880,
            worker_type='Standard')
Example #4
    def __init__(
        self,
        scope: cdk.Construct,
        construct_id: str,
        stack_log_level: str,
        vpc,
        my_sql_db_sg,
        store_events_db_endpoint,
        sales_events_bkt,
        _glue_etl_role,
        glue_db_name: str,
        glue_table_name: str,
        tgt_db_secret,
        **kwargs,
    ) -> None:
        super().__init__(scope, construct_id, **kwargs)

        self.template_options.metadata = {"License": "Miztiik Corp."}

        # ADD Permissions to our Glue JOB Role to Access Secrets
        tgt_db_secret.grant_read(_glue_etl_role)

        # # Create GLUE JDBC Connection for RDS MySQL

        # Allow ALL PORTS within SG for GLUE Connections to connect
        # https://docs.aws.amazon.com/glue/latest/dg/connection-defining.html#connection-properties-jdbc
        # https://docs.aws.amazon.com/glue/latest/dg/setup-vpc-for-glue-access.html
        # https://docs.amazonaws.cn/en_us/glue/latest/dg/connection-defining.html

        rds_mysql_conn_props = _glue.CfnConnection.ConnectionInputProperty(
            connection_type="JDBC",
            description="Glue Connection for RDS MySQL Store Events Database",
            name="rdsMySQL57Conn",
            physical_connection_requirements=_glue.CfnConnection.PhysicalConnectionRequirementsProperty(
                security_group_id_list=[my_sql_db_sg.security_group_id],
                subnet_id=vpc.select_subnets(
                        subnet_type=_ec2.SubnetType.PRIVATE
                ).subnet_ids[1]
            ),
            connection_properties={
                "JDBC_CONNECTION_URL": f"jdbc:mysql://{store_events_db_endpoint}:3306/store_events",
                "JDBC_ENFORCE_SSL": "false",
                "USERNAME": "******",
                "PASSWORD": "******"
            }
        )

        rds_mysql_conn = _glue.CfnConnection(
            self,
            "rdsMySQLGlueConnection",
            catalog_id=f"{cdk.Aws.ACCOUNT_ID}",
            connection_input=rds_mysql_conn_props
        )

        # Create the Glue job to ingest the incoming JSON into RDS
        # Read the Glue job script
        try:
            with open(
                "stacks/back_end/glue_stacks/glue_job_scripts/load_json_to_rds.py",
                encoding="utf-8",
                mode="r",
            ) as f:
                load_json_to_rds = f.read()
        except OSError:
            print("Unable to read Glue Job Code")
            raise

        etl_script_asset = _s3_assets.Asset(
            self,
            "etlScriptAsset",
            path="stacks/back_end/glue_stacks/glue_job_scripts/load_json_to_rds.py"
        )

        self.etl_prefix = "stream-etl"
        _glue_etl_job = _glue.CfnJob(
            self,
            "glues3ToRdsIngestorJob",
            name="s3-to-rds-ingestor",
            description="Glue Job to ingest JSON data from S3 to RDS",
            role=_glue_etl_role.role_arn,
            glue_version="2.0",
            command=_glue.CfnJob.JobCommandProperty(
                name="glueetl",
                script_location=f"s3://{etl_script_asset.s3_bucket_name}/{etl_script_asset.s3_object_key}",
                python_version="3"
            ),
            connections={"connections": [rds_mysql_conn_props.name]},
            default_arguments={
                # Glue job arguments are passed as strings in the CloudFormation template
                "--enable-metrics": "true",
                "--enable-continuous-cloudwatch-log": "true",
                "--job-bookmark-option": "job-bookmark-enable",
                "--TempDir": f"s3://{sales_events_bkt.bucket_name}/bookmarks",
                "--src_db_name": glue_db_name,
                "--src_etl_bkt": f"{sales_events_bkt.bucket_name}",
                "--crawler_tbl_prefix": "txns_",
                "--tgt_db_secret_arn": tgt_db_secret.secret_arn,
                "--tgt_tbl_name": glue_table_name,
                "--conn_name": f"{rds_mysql_conn_props.name}"
            },
            allocated_capacity=1,
            # timeout=2,
            max_retries=2,
            execution_property=_glue.CfnJob.ExecutionPropertyProperty(
                max_concurrent_runs=2)
        )

        # Configure a scheduled trigger (note: cron(0 1 * * ? *) fires once a day at 01:00 UTC, not every hour)
        _glue_etl_job_trigger = _glue.CfnTrigger(
            self,
            "glueEtlJobtrigger",
            type="SCHEDULED",
            description="Miztiik Automation: Trigger S3 to RDS Ingestor glue job every hour",
            schedule="cron(0 1 * * ? *)",
            start_on_creation=False,
            actions=[
                _glue.CfnTrigger.ActionProperty(
                    job_name=f"{_glue_etl_job.name}",
                    timeout=2
                )
            ]
        )
        _glue_etl_job_trigger.add_depends_on(_glue_etl_job)

        # Configure Glue Workflow
        _glue_etl_job_workflow = _glue.CfnWorkflow(
            self,
            "glueEtlJobWorkflow"
        )

        ###########################################
        ################# OUTPUTS #################
        ###########################################
        output_0 = cdk.CfnOutput(
            self,
            "AutomationFrom",
            value=f"{GlobalArgs.SOURCE_INFO}",
            description="To know more about this automation stack, check out our github page.",
        )

        output_1 = cdk.CfnOutput(
            self,
            "RDSIngestorETLGlueJob",
            value=f"https://console.aws.amazon.com/gluestudio/home?region={cdk.Aws.REGION}#/jobs",
            description="Glue Job to ingest JSON data from S3 to RDS.",
        )
Example #5
    def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
        super().__init__(scope, id, **kwargs)

        with open('./props/tasksetting.json', 'r') as f1:
            py_json1 = json.load(f1)
            ts = json.dumps(py_json1)

        # with open('./props/mappingrule.json', 'r') as f2:
        #     py_json2 = json.load(f2)
        #     mr = json.dumps(py_json2)

        with open('./props/config.json', 'r') as f2:
            configuration = json.load(f2)

        def getMappingrules(self, table_list):
            rules = []
            for index, value in enumerate(table_list, 1):
                rules.append({
                    "rule-type": "selection",
                    "rule-id": str(index),
                    "rule-name": str(index),
                    "object-locator": {
                        "schema-name": value['schemaName'],
                        "table-name": value['tableName']
                    },
                    "rule-action": "include",
                    "filters": []
                })
            mapping_rules = {"rules": rules}
            return json.dumps(mapping_rules)

        # The code that defines your stack goes here
        S3Accessrole = _iam.Role(
            self,
            'dmsrole',
            assumed_by=_iam.ServicePrincipal('dms.amazonaws.com'),
            managed_policies=[
                _iam.ManagedPolicy.from_aws_managed_policy_name(
                    'AmazonS3FullAccess')
            ])

        raw_bucket = s3.Bucket(self,
                               'rawbucket',
                               bucket_name='rawbucket-datalake-cdk-oregon')
        raw_bucket.add_lifecycle_rule(
            enabled=configuration['s3LifecycleRule']['enabled'],
            expiration=core.Duration.days(
                configuration['s3LifecycleRule']['expiration']))

        #my_table = ddb.Table(self, id ='dunamoTable', table_name = 'testcdktable',
        #partition_key = ddb.Attribute(name ='lastname',type = ddb.AttributeType.STRING) )

        dl_dms = _dms.CfnReplicationInstance(
            self,
            'dmsreplication',
            replication_instance_class=configuration['DMS_instance_setting']['instance_class'],
            replication_instance_identifier='datalake-instance-cdk',
            allocated_storage=configuration['DMS_instance_setting']['allocated_storage'])

        source_endpoint = _dms.CfnEndpoint(
            self,
            'sourceendpoint',
            endpoint_type='source',
            engine_name=configuration['engineName'],
            database_name=configuration['databaseName'],
            username=configuration['username'],
            password=configuration['password'],
            port=configuration['port'],
            server_name=configuration['serverName'],
        )

        target_endpoint = _dms.CfnEndpoint(
            self,
            'targetendpoint',
            endpoint_type='target',
            engine_name='s3',
            s3_settings={
                'bucketName': raw_bucket.bucket_name,
                'serviceAccessRoleArn': S3Accessrole.role_arn
            },
            extra_connection_attributes='dataFormat=parquet')

        dms_task = _dms.CfnReplicationTask(
            self,
            'data2lake-task',
            migration_type='full-load-and-cdc',
            replication_instance_arn=dl_dms.ref,
            source_endpoint_arn=source_endpoint.ref,
            target_endpoint_arn=target_endpoint.ref,
            replication_task_settings=ts,
            table_mappings=getMappingrules(self, configuration['tableList']))

        my_table = ddb.Table(self,
                             id='dynamoTable',
                             table_name='ControllerTable',
                             partition_key=ddb.Attribute(
                                 name='path', type=ddb.AttributeType.STRING),
                             billing_mode=ddb.BillingMode.PAY_PER_REQUEST)

        datalake_bucket = s3.Bucket(self,
                                    'datalakebucket',
                                    bucket_name='datalake-bucket-cdk-oregon')

        glue_role = _iam.Role(
            self,
            'gluerole',
            assumed_by=_iam.ServicePrincipal('glue.amazonaws.com'),
            managed_policies=[
                _iam.ManagedPolicy.from_aws_managed_policy_name(
                    'service-role/AWSGlueServiceRole')
            ])

        raw_bucket.grant_read(glue_role)
        datalake_bucket.grant_read_write(glue_role)

        # Lake Formation settings
        # If you have attached the managed policy 'AWSLakeFormationDataAdmin' to your own IAM user, extend that policy to allow "lakeformation:PutDataLakeSettings"
        # so that the data lake settings below can be applied by this CDK code.
        lake_admin_setting = _lakeformation.CfnDataLakeSettings(
            self,
            'data-lake-GrantAdmin',
            admins=[
                _lakeformation.CfnDataLakeSettings.DataLakePrincipalProperty(
                    data_lake_principal_identifier=configuration[
                        'executiveArn'])
            ])

        glue_database = _glue.Database(self,
                                       'gluedatabase',
                                       database_name='data_lake_gluedb')

        glue_database.node.add_dependency(lake_admin_setting)

        glue_role_permission_inLakeFormation = _lakeformation.CfnPermissions(
            self,
            'permission-glueRole',
            data_lake_principal=_lakeformation.CfnPermissions.DataLakePrincipalProperty(
                data_lake_principal_identifier=glue_role.role_arn),
            resource=_lakeformation.CfnPermissions.ResourceProperty(
                database_resource=_lakeformation.CfnPermissions.DatabaseResourceProperty(
                    name=glue_database.database_name)),
            permissions=['ALL'])

        crawler = _glue.CfnCrawler(
            self,
            'datalakecrawler',
            name='Crawler-datalake-cdk',
            role=glue_role.role_arn,
            targets={
                's3Targets': [{
                    'path':
                    's3://' + datalake_bucket.bucket_name + '/datalake/'
                }]
            },
            database_name='data_lake_gluedb',
            configuration=
            "{\"Version\":1.0,\"CrawlerOutput\":{\"Partitions\":{\"AddOrUpdateBehavior\":\"InheritFromTable\"},\"Tables\":{\"AddOrUpdateBehavior\":\"MergeNewColumns\"}}}"
        )

        initialload_script = S3Assets.Asset(self,
                                            'initial-load-code',
                                            path='./Gluejob/InitialLoad.py')
        incrementalload_script = S3Assets.Asset(
            self, 'incremental-load-code', path='./Gluejob/IncrementalLoad.py')

        initialload_script.grant_read(glue_role)
        incrementalload_script.grant_read(glue_role)
        my_table.grant_full_access(glue_role)

        initial_load_job = _glue.CfnJob(
            self,
            'initial-job',
            name='InitialLoad-cdk',
            command=_glue.CfnJob.JobCommandProperty(
                name='glueetl',
                python_version='3',
                script_location='s3://' + initialload_script.s3_bucket_name +
                '/' + initialload_script.s3_object_key),
            role=glue_role.role_arn,
            default_arguments={
                '--prefix': str(configuration['tableList']),
                '--bucket': raw_bucket.bucket_name,
                '--datalake_bucket': datalake_bucket.bucket_name,
                '--datalake_prefix': 'datalake/',
                '--region': CdkpyStack.of(self).region,
                '--controller_table_name': my_table.table_name
            },
            allocated_capacity=configuration['glue_job_setting']['job_capacity'],
            execution_property=_glue.CfnJob.ExecutionPropertyProperty(
                max_concurrent_runs=configuration['glue_job_setting']['max_concurrent_run_JobExecution']))

        incremental_load_job = _glue.CfnJob(
            self,
            'increment-job',
            name='IncrementalLoad-cdk',
            command=_glue.CfnJob.JobCommandProperty(
                name='glueetl',
                script_location='s3://' +
                incrementalload_script.s3_bucket_name + '/' +
                incrementalload_script.s3_object_key,
                python_version='3'),
            role=glue_role.role_arn,
            default_arguments={
                '--prefix': str(configuration['tableList']),
                '--bucket': raw_bucket.bucket_name,
                '--datalake_bucket': datalake_bucket.bucket_name,
                '--datalake_prefix': 'datalake/',
                '--region': CdkpyStack.of(self).region,
                '--controller_table_name': my_table.table_name
            },
            allocated_capacity=2,
            execution_property=_glue.CfnJob.ExecutionPropertyProperty(
                max_concurrent_runs=1))

        job_trigger = _glue.CfnTrigger(
            self,
            'datalake-glue-trigger',
            type='SCHEDULED',
            schedule=configuration['job_trigger_schedule'],
            start_on_creation=False,
            actions=[
                _glue.CfnTrigger.ActionProperty(job_name='IncrementalLoad-cdk')
            ])

        dl_sns = _sns.Topic(self, 'datalake_sns', display_name='data-lake-sns')

        endpoint_email = configuration['emailSubscriptionList']

        for emails in endpoint_email:
            dl_sns.add_subscription(_subscrption.EmailSubscription(emails))

        #Another way to subscribe: dl_subscription = _sns.Subscription(self,'email-subscrption',topic = dl_sns,endpoint='*****@*****.**',protocol= _sns.SubscriptionProtocol.EMAIL)

        glue_events_target = _events_targets.SnsTopic(dl_sns)

        glue_events_rule = _events.Rule(
            self,
            'gluejobevents-datalake',
            description='Using for tracking the failed glue job of data lake',
            rule_name='dl-gluejob-event',
            event_pattern=_events.EventPattern(
                source=['aws.glue'],
                detail_type=['Glue Job State Change'],
                detail={
                    "jobName": [initial_load_job.name],
                    "state": ["FAILED"]
                }),
            targets=[glue_events_target])

        dms_subscription = _dms.CfnEventSubscription(
            self,
            'dmsevents-datalake',
            sns_topic_arn=dl_sns.topic_arn,
            subscription_name='datalake-dmsevents',
            source_type='replication-task',
            event_categories=['failure'])
Example #6
    def __init__(
        self,
        scope: cdk.Construct,
        construct_id: str,
        stack_log_level: str,
        glue_db_name: str,
        glue_table_name: str,
        etl_bkt,
        src_stream,
        **kwargs,
    ) -> None:
        super().__init__(scope, construct_id, **kwargs)

        self.template_options.metadata = {"License": "Miztiik Corp."}

        # Glue Job IAM Role
        self._glue_etl_role = _iam.Role(
            self,
            "glueJobRole",
            assumed_by=_iam.ServicePrincipal("glue.amazonaws.com"),
            managed_policies=[
                _iam.ManagedPolicy.from_aws_managed_policy_name(
                    "AmazonS3ReadOnlyAccess"),
                _iam.ManagedPolicy.from_aws_managed_policy_name(
                    "service-role/AWSGlueServiceRole")
            ])
        self._glue_etl_role.add_to_policy(
            _iam.PolicyStatement(
                actions=["s3:*"],
                resources=[f"{etl_bkt.bucket_arn}",
                           f"{etl_bkt.bucket_arn}/*"]))

        self._glue_etl_role.add_to_policy(
            _iam.PolicyStatement(actions=["kinesis:DescribeStream"],
                                 resources=[f"{src_stream.stream_arn}"]))

        src_stream.grant_read(self._glue_etl_role)

        # Create the Glue job to convert incoming JSON to parquet
        # Read GlueSpark Code
        try:
            with open(
                    "stacks/back_end/glue_stacks/glue_job_scripts/kinesis_streams_batch_to_s3_etl.py",
                    encoding="utf-8",
                    mode="r",
            ) as f:
                kinesis_streams_batch_to_s3_etl = f.read()
        except OSError:
            print("Unable to read Glue Job Code")
            raise

        etl_script_asset = _s3_assets.Asset(
            self,
            "etlScriptAsset",
            path="stacks/back_end/glue_stacks/glue_job_scripts/kinesis_streams_batch_to_s3_etl.py"
        )

        self.etl_prefix = "stream-etl"
        _glue_etl_job = _glue.CfnJob(
            self,
            "glueJsonToParquetJob",
            name="stream-etl-processor",
            description="Glue Job to process stream of events from Kinesis data stream and store them in parquet format in S3",
            role=self._glue_etl_role.role_arn,
            glue_version="2.0",
            command=_glue.CfnJob.JobCommandProperty(
                name="gluestreaming",
                script_location=f"s3://{etl_script_asset.s3_bucket_name}/{etl_script_asset.s3_object_key}",
                python_version="3"),
            default_arguments={
                "--src_db_name": glue_db_name,
                "--src_tbl_name": glue_table_name,
                "--datalake_bkt_name": etl_bkt.bucket_name,
                "--datalake_bkt_prefix": f"{self.etl_prefix}/",
                "--job-bookmark-option": "job-bookmark-enable"
            },
            allocated_capacity=1,
            # timeout=2,
            max_retries=2,
            execution_property=_glue.CfnJob.ExecutionPropertyProperty(
                max_concurrent_runs=1))

        # Configure a scheduled trigger (note: cron(0 1 * * ? *) fires once a day at 01:00 UTC, not every hour)
        _glue_etl_job_trigger = _glue.CfnTrigger(
            self,
            "glueEtlJobtrigger",
            type="SCHEDULED",
            description="Miztiik Automation: Trigger streaming etl glue job every hour",
            schedule="cron(0 1 * * ? *)",
            start_on_creation=False,
            actions=[
                _glue.CfnTrigger.ActionProperty(
                    job_name=f"{_glue_etl_job.name}", timeout=2)
            ])
        _glue_etl_job_trigger.add_depends_on(_glue_etl_job)

        ###########################################
        ################# OUTPUTS #################
        ###########################################
        output_0 = cdk.CfnOutput(
            self,
            "AutomationFrom",
            value=f"{GlobalArgs.SOURCE_INFO}",
            description="To know more about this automation stack, check out our github page.",
        )

        output_1 = cdk.CfnOutput(
            self,
            "StreamingETLGlueJob",
            value=f"https://console.aws.amazon.com/gluestudio/home?region={cdk.Aws.REGION}#/jobs",
            description="Glue ETL Job.",
        )
Example #7
    def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
        super().__init__(scope, id, **kwargs)

        policy_statement = iam.PolicyStatement(
            actions=['logs:*', 's3:*', 'iam:*', 'cloudwatch:*', 'dynamodb:*', 'glue:*'])
        policy_statement.add_all_resources()

        # define role
        glue_job_role = iam.Role(
            self,
            'Glue-Job-Role',
            assumed_by=iam.ServicePrincipal('glue.amazonaws.com'))
        glue_job_role.add_to_policy(policy_statement)

        # define job
        job = glue.CfnJob(
            self,
            'glue-test-job',
            role=glue_job_role.role_arn,
            allocated_capacity=10,
            worker_type="G.1X",
            command=glue.CfnJob.JobCommandProperty(
                name='glueetl',
                script_location='s3://base-nonprod/GlueETLScripts//hello.py'))

# create inline statement, policy then role
# statement = iam.PolicyStatement(actions=["s3:GetObject","s3:PutObject"],
#                                         resources=["arn:aws:s3:::mybucketname",
#                                                     "arn:aws:s3:::mybucketname/data_warehouse/units/*"])
#         write_to_s3_policy = iam.PolicyDocument(statements=[statement])
#         glue_role = iam.Role(
#             self, 'GlueCrawlerFormyDataScienceRole',
#             role_name = 'GlueCrawlerFormyDataScienceRole',
#             inline_policies=[write_to_s3_policy],
#             assumed_by=iam.ServicePrincipal('glue.amazonaws.com'),
#             managed_policies=[iam.ManagedPolicy.from_aws_managed_policy_name('service-role/AWSGlueServiceRole')]
#         )

#define crawler
        # glue_crawler = glue.CfnCrawler(
        #     self, 'glue-crawler-id',
        #     description="Glue Crawler for my-data-science-s3",
        #     name='any name',
        #     database_name='units',
        #     schedule={"scheduleExpression": "cron(5 * * * ? *)"},
        #     role=glue_role.role_arn,
        #     targets={"s3Targets": [{"path": "s3://mybucketname/data_warehouse/units"}]}
        # )
# from aws_cdk import core as cdk
# from aws_cdk import awsglue as glue
# from aws_cdk import aws
# #import * as glue from "@aws-cdk/aws-glue";
# import * as s3 from "@aws-cdk/aws-s3";
# import * as s3Deployment from "@aws-cdk/aws-s3-deployment";
# import * as iam from "@aws-cdk/aws-iam";
# import { replaceValues } from "./lib";
# import { config } from "dotenv";
# config();
#
# const PYTHON_VERSION = "3";
# const GLUE_VERSION = "1.0";
#
# //This value must be glueetl for Apache Spark
# const COMMAND_NAME = "glueetl";
#
# const { RTK, COLLECTIONS, BUCKET_NAME }= process.env;
#
# class GlueETLStack extends cdk.Stack {
#     constructor(scope: cdk.Construct, id: string, props?: cdk.StackProps) {
#         super(scope, id, props);
#
#         const s3Bucket = new s3.Bucket(this, "etl-bucket", {
#             bucketName: BUCKET_NAME,
#             removalPolicy: cdk.RemovalPolicy.DESTROY
#         });
#
#         const dependenciesDeployment = new s3Deployment.BucketDeployment(this, "dependencies-deployment", {
#             sources: [s3Deployment.Source.asset("../dependencies")],
#             destinationBucket: s3Bucket,
#             destinationKeyPrefix: "dependencies"
#         });
#
#         # // Replace hardcoded values in script
#         # replaceValues(
#         #     "scripts/script.py",
#         #     RTK as string,
#         #     MONGO_SERVER as string,
#         #     MONGO_USER as string,
#         #     MONGO_PASSWORD as string,
#         #     MONGO_PORT as string,
#         #     MONGO_SSL == "true" ? "True" : "False",
#         #     MONGO_DATABASE as string,
#         #     `s3://${BUCKET_NAME}/${MONGO_DATABASE as string}/`,
#         #     COLLECTIONS as string
#         # );
#
#         const scriptsDeployment = new s3Deployment.BucketDeployment(this, "scripts-deployment", {
#             sources: [s3Deployment.Source.asset("scripts")],
#             destinationBucket: s3Bucket,
#             destinationKeyPrefix: "scripts"
#         });
#
#         const glueRole = new iam.Role(this, "glue-role", {
#             roleName: "glue-etl-role",
#             assumedBy: new iam.ServicePrincipal("glue.amazonaws.com"),
#             managedPolicies: [
#                 iam.ManagedPolicy.fromAwsManagedPolicyName("AmazonS3FullAccess")
#             ],
#         });
#
#         const glueJob = new glue.CfnJob(this, "glue-job", {
#             name: "glue-job",
#             role: glueRole.roleArn,
#             command: {
#                 name: COMMAND_NAME,
#                 pythonVersion: PYTHON_VERSION,
#                 scriptLocation: `s3://${s3Bucket.bucketName}/scripts/script.py`
#             },
#             glueVersion: GLUE_VERSION,
#             defaultArguments: {
#                 "--extra-jars": `s3://${s3Bucket.bucketName}/${JDBC_PATH}`
#             }
#         });
#
#         const glueTrigger = new glue.CfnTrigger(this, "glue-trigger", {
#             name: "etl-trigger",
#             schedule: "cron(5 * * * ? *)",
#             type: "SCHEDULED",
#             actions: [
#                 {
#                     jobName: glueJob.name
#                 }
#             ],
#             startOnCreation: true
#         });
#         glueTrigger.addDependsOn(glueJob);
#     }
# }
Example #8
    def __init__(self, app: core.App, cfn_name: str, stack_env):
        super().__init__(scope=app, id=f"{cfn_name}-{stack_env}")

        glue_code = s3_assets.Asset(
            scope=self,
            id=f"{cfn_name}-glue-script",
            path="./glue_script/glue_job_script.py",
        )

        glue_s3_access_role = iam.Role(
            scope=self,
            id=f"glue_s3_access_role_{stack_env}",
            role_name=f"glue_s3_access_role_{stack_env}",
            assumed_by=iam.ServicePrincipal("glue.amazonaws.com"))

        # add policy to access S3
        glue_s3_access_role.add_to_policy(
            iam.PolicyStatement(effect=iam.Effect.ALLOW,
                                resources=["*"],
                                actions=["s3:*"]))

        # add policy to access CloudWatch Logs
        glue_s3_access_role.add_to_policy(
            iam.PolicyStatement(effect=iam.Effect.ALLOW,
                                resources=["arn:aws:logs:*:*:*"],
                                actions=[
                                    "logs:CreateLogGroup",
                                    "logs:CreateLogStream",
                                    "logs:PutLogEvents",
                                    "logs:DescribeLogStreams"
                                ]))

        # Glue job
        # Specify the job name explicitly, because an auto-generated name cannot be referenced later (e.g. by the Step Functions task below).
        glue_job_name = f"{cfn_name}-glue-job"
        _ = glue.CfnJob(
            scope=self,
            id=glue_job_name,
            name=glue_job_name,
            command=glue.CfnJob.JobCommandProperty(
                # glueetl or pythonshell
                name=self.GLUE_JOB_COMMAND_GLUE_ETL,
                script_location=f"s3://{glue_code.s3_bucket_name}/{glue_code.s3_object_key}"),
            # pass the role by name (Glue also accepts the role ARN)
            role=glue_s3_access_role.role_name,
            glue_version=self.GLUE_VERSION_2_0,
            number_of_workers=2,
            worker_type=self.GLUE_WORKER_TYPE_STANDARD,
            timeout=1800)

        # StepFunction Tasks
        sfn_task_pass = sfn.Pass(scope=self,
                                 id=f"{cfn_name}-sfn-pass",
                                 comment="pass example",
                                 input_path="$",
                                 result_path="$.source",
                                 result=sfn.Result.from_string("example"),
                                 output_path="$")

        # wait until the job completes: sfn.IntegrationPattern.RUN_JOB
        # proceed to the next step without waiting: sfn.IntegrationPattern.REQUEST_RESPONSE
        sfn_task_glue_job = sfn_tasks.GlueStartJobRun(
            scope=self,
            id=f"{cfn_name}-sfn-lambda-task",
            glue_job_name=glue_job_name,
            integration_pattern=sfn.IntegrationPattern.RUN_JOB,
            input_path="$",
            result_path="$.result",
            output_path="$.output")

        # stepfunctions
        definition = sfn_task_pass.next(sfn_task_glue_job)

        _ = sfn.StateMachine(scope=self,
                             id=f"{cfn_name}-SFn-{stack_env}",
                             definition=definition)
Example #9
    def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
        super().__init__(scope, construct_id, **kwargs)

        # Glue job execution IAM Role
        glue_job_role = iam.Role(
            self,
            'Glue-Job-Role',
            assumed_by=iam.ServicePrincipal('glue.amazonaws.com'),
            managed_policies=[
                iam.ManagedPolicy.from_aws_managed_policy_name(
                    'service-role/AWSGlueServiceRole')
            ])

        S3_BUCKET_NAME = "MyCdkGlueJobBucket"

        # S3 Bucket to host glue scripts
        bucket = s3.Bucket(self,
                           S3_BUCKET_NAME,
                           versioned=True,
                           removal_policy=RemovalPolicy.DESTROY,
                           auto_delete_objects=True,
                           block_public_access=s3.BlockPublicAccess.BLOCK_ALL)

        # asset to sync local scripts folder with S3 bucket
        asset = s3deploy.Source.asset("./resources/glue-scripts")

        # Sync local scripts with S3 bucket
        s3deploy.BucketDeployment(self,
                                  "DeployGlueJobScripts",
                                  sources=[asset],
                                  destination_bucket=bucket,
                                  destination_key_prefix="glue-python-scripts")

        # Grant read write access for glue execution IAM role for S3 bucket
        bucket.grant_read_write(glue_job_role)

        scriptLocation = 's3://' + bucket.bucket_name + '/glue-python-scripts/hello.py'

        # Python-shell Glue job
        job = glue.CfnJob(self,
                          'Glue-job',
                          name='cdk-test-glue-python-job',
                          role=glue_job_role.role_arn,
                          command=glue.CfnJob.JobCommandProperty(
                              name='pythonshell',
                              python_version='3',
                              script_location=scriptLocation))

        # Glue Start Job Run task for the Step Functions state machine (integration_pattern RUN_JOB, i.e. .sync)
        glue_task = sfn_tasks.GlueStartJobRun(
            self,
            "Task",
            glue_job_name=job.name,
            integration_pattern=sfn.IntegrationPattern.RUN_JOB,
            arguments=sfn.TaskInput.from_object(
                {"--message": sfn.JsonPath.string_at("$.message")}),
            timeout=Duration.minutes(6),
            notify_delay_after=Duration.minutes(6))

        # Step Functions state machine definition
        definition = glue_task
        state_machine = sfn.StateMachine(self,
                                         "GlueJobStateMachine",
                                         definition=definition,
                                         timeout=Duration.minutes(10))

        # CDK Outputs
        CfnOutput(scope=self,
                  id='StateMachineArn',
                  value=state_machine.state_machine_arn)
        CfnOutput(scope=self, id='GlueJobName', value=job.name)
        CfnOutput(scope=self, id='S3BucketName', value=bucket.bucket_name)
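
Because the Glue task maps --message from the execution input ($.message), an execution of this state machine needs an input carrying that key; a sketch using boto3, where the ARN value is taken from the StateMachineArn output and the variable names are assumptions:

        # hypothetical: start an execution of the deployed state machine
        import json
        import boto3

        sfn_client = boto3.client("stepfunctions")
        sfn_client.start_execution(
            stateMachineArn=state_machine_arn,  # value of the StateMachineArn CfnOutput
            input=json.dumps({"message": "hello from Step Functions"}),
        )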
Example #10
    def __init__(self, scope: core.Construct, construct_id: str, **kwargs) -> None:
        super().__init__(scope, construct_id, **kwargs)

        policy_statement = iam.PolicyStatement(
            actions=['logs:*', 's3:*', 'iam:*', 'cloudwatch:*', 'glue:*']
        )

        policy_statement.add_all_resources()

        my_lambda = _lambda.Function(
            self, 'lambdaHandler',
            runtime=_lambda.Runtime.PYTHON_3_8,
            code=_lambda.Code.asset('lambda'),
            handler='handler.handler',
        )
        my_lambda_role = iam.Role(
            self,
            'my_lambda_role',
            assumed_by=iam.ServicePrincipal('lambda.amazonaws.com')
        )
        my_lambda_role.add_to_policy(
            policy_statement
        )

        my_bucket = _s3.Bucket(
            self,
            id='s3buckettest',
            bucket_name='csvconverterbv',
        )

        notification = aws_s3_notifications.LambdaDestination(my_lambda)

        my_bucket.add_event_notification(_s3.EventType.OBJECT_CREATED, notification)

        glue_job_role = iam.Role(
            self,
            'Glue-Job-Role',
            assumed_by=iam.ServicePrincipal('glue.amazonaws.com')
        )
        glue_job_role.add_to_policy(
            policy_statement
        )

        code_bucket = _s3.Bucket.from_bucket_attributes(
            self, 'CodeBucket',
            bucket_name='csvconverterbv'
        )

        aws_s3_deployment.BucketDeployment(
            self,
            'S3Deployment',
            destination_bucket=code_bucket,
            sources=[aws_s3_deployment.Source.asset('glue/')],
            destination_key_prefix='glue/'
        )


        job = glue.CfnJob(
            self,
            'glue-test-job',
            name='glue-test-job',
            role=glue_job_role.role_arn,
            allocated_capacity=10,
            command=glue.CfnJob.JobCommandProperty(
                name='glueetl',
                script_location='s3://csvconverterbv/glue/gluejob.py'
            ))