Example #1
    def __init__(
        self,
        scope: core.Construct,
        rds_stack: RDSStack,
        data_lake_bronze_bucket: BaseDataLakeBucket,
        **kwargs,
    ):
        self.deploy_env = active_environment
        self.rds_stack = rds_stack
        self.data_lake_bronze_bucket = data_lake_bronze_bucket

        self.rds_endpoint = dms.CfnEndpoint(
            scope=scope,
            id=f"dms-{self.deploy_env.value}-ecommerce-rds-endpoint",
            endpoint_type="source",
            endpoint_identifier=f"dms-source-{self.deploy_env.value}-ecommerce-rds-endpoint",
            engine_name="postgres",
            password=db_password,  # should not be hardcoded. Move to SecretsManager and use dynamic reference
            username=db_username,
            database_name=db_name,
            port=5432,
            server_name=self.rds_stack.ecommerce_rds.db_instance_endpoint_address,
            extra_connection_attributes="captureDDLs=Y",  # Capture changes in tables
        )

        self.s3_endpoint = dms.CfnEndpoint(
            scope=scope,
            id=f"dms-{self.deploy_env.value}-ecommerce-s3-endpoint",
            endpoint_type="target",
            endpoint_identifier=f"dms-target-{self.deploy_env.value}-ecommerce-s3-endpoint",
            engine_name="s3",
            extra_connection_attributes="DataFormat=parquet;maxFileSize=131072;timestampColumnName=extracted_at;includeOpForFullLoad=true;cdcMaxBatchInterval=120",
            s3_settings=dms.CfnEndpoint.S3SettingsProperty(
                bucket_name=self.data_lake_bronze_bucket.bucket_name,
                bucket_folder="ecommerce_rds",
                compression_type="gzip",
                csv_delimiter=",",
                csv_row_delimiter="\n",
                service_access_role_arn=RawDMSRole(scope, self.data_lake_bronze_bucket).role_arn,
            ),
        )
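
The hard-coded credentials above can be replaced with a dynamic reference to AWS Secrets Manager, which is exactly what Example #2 does. A minimal sketch, assuming the RDS construct exposes an attached secret (the secret attribute is an assumption here):

        # Sketch: resolve the password at deploy time from Secrets Manager instead of hardcoding it.
        # Assumes self.rds_stack.ecommerce_rds.secret is an ISecret attached to the instance.
        db_secret_arn = self.rds_stack.ecommerce_rds.secret.secret_arn
        password = core.CfnDynamicReference(
            core.CfnDynamicReferenceService.SECRETS_MANAGER,
            key=f"{db_secret_arn}:SecretString:password",
        ).to_string()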
Example #2
    def __init__(
        self,
        scope: core.Construct,
        common_stack: CommonStack,
        data_lake_raw_bucket: BaseDataLakeBucket,
        **kwargs,
    ) -> None:
        self.data_lake_raw_bucket = data_lake_raw_bucket
        self.common_stack = common_stack
        self.deploy_env = scope.deploy_env
        self.rds_endpoint = dms.CfnEndpoint(
            scope,
            f"dms-{self.deploy_env.value}-orders-rds-endpoint",
            endpoint_type="source",
            endpoint_identifier=f"dms-source-{self.deploy_env.value}-orders-rds-endpoint",
            engine_name="postgres",
            password=core.CfnDynamicReference(
                core.CfnDynamicReferenceService.SECRETS_MANAGER,
                key=f"{self.common_stack.orders_rds.secret.secret_arn}:SecretString:password",
            ).to_string(),
            username=core.CfnDynamicReference(
                core.CfnDynamicReferenceService.SECRETS_MANAGER,
                key=f"{self.common_stack.orders_rds.secret.secret_arn}:SecretString:username",
            ).to_string(),
            database_name=core.CfnDynamicReference(
                core.CfnDynamicReferenceService.SECRETS_MANAGER,
                key=f"{self.common_stack.orders_rds.secret.secret_arn}:SecretString:dbname",
            ).to_string(),
            port=5432,
            server_name=self.common_stack.orders_rds.db_instance_endpoint_address,
            extra_connection_attributes="captureDDLs=Y",
        )

        self.s3_endpoint = dms.CfnEndpoint(
            scope,
            f"dms-{self.deploy_env.value}-orders-s3-endpoint",
            endpoint_type="target",
            engine_name="s3",
            endpoint_identifier=f"dms-target-{self.deploy_env.value}-orders-s3-endpoint",
            extra_connection_attributes="DataFormat=parquet;maxFileSize=131072;timestampColumnName=extracted_at;includeOpForFullLoad=true;cdcMaxBatchInterval=120",
            s3_settings=dms.CfnEndpoint.S3SettingsProperty(
                bucket_name=self.data_lake_raw_bucket.bucket_name,
                bucket_folder="orders",
                compression_type="gzip",
                csv_delimiter=",",
                csv_row_delimiter="\n",
                service_access_role_arn=RawDMSRole(
                    scope, self.data_lake_raw_bucket
                ).role_arn,
            ),
        )

        self.dms_sg = ec2.SecurityGroup(
            scope,
            f"dms-{self.deploy_env.value}-sg",
            vpc=self.common_stack.custom_vpc,
            security_group_name=f"dms-{self.deploy_env.value}-sg",
        )

        self.dms_subnet_group = dms.CfnReplicationSubnetGroup(
            scope,
            f"dms-{self.deploy_env.value}-replication-subnet",
            replication_subnet_group_description="dms replication instance subnet group",
            subnet_ids=[
                subnet.subnet_id
                for subnet in self.common_stack.custom_vpc.private_subnets
            ],
            replication_subnet_group_identifier=f"dms-{self.deploy_env.value}-replication-subnet",
        )

        self.instance = dms.CfnReplicationInstance(
            scope,
            f"dms-replication-instance-{self.deploy_env.value}",
            allocated_storage=100,
            publicly_accessible=False,
            engine_version="3.4.4",
            replication_instance_class="dms.t2.small",
            replication_instance_identifier=f"dms-{self.deploy_env.value}-replication-instance",
            vpc_security_group_ids=[self.dms_sg.security_group_id],
            replication_subnet_group_identifier=self.dms_subnet_group.replication_subnet_group_identifier,
        )

        self.instance.node.add_dependency(self.dms_subnet_group)
        self.instance.node.add_dependency(self.dms_sg)

        super().__init__(
            scope,
            f"{self.deploy_env.value}-dms-task-orders-rds",
            migration_type="full-load-and-cdc",
            replication_task_identifier=f"{self.deploy_env.value}-dms-task-orders-rds",
            replication_instance_arn=self.instance.ref,
            source_endpoint_arn=self.rds_endpoint.ref,
            target_endpoint_arn=self.s3_endpoint.ref,
            table_mappings=json.dumps(
                {
                    "rules": [
                        {
                            "rule-type": "selection",
                            "rule-id": "1",
                            "rule-name": "1",
                            "object-locator": {
                                "schema-name": "%",
                                "table-name": "%",
                            },
                            "rule-action": "include",
                            "filters": [],
                        }
                    ]
                }
            ),
        )
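
Examples #1-#3 show only an __init__ body; the module-level imports and the class declaration are omitted. The keyword arguments passed to super().__init__ in Example #2 match dms.CfnReplicationTask, so a minimal sketch of the assumed scaffolding (class name and project-specific imports are assumptions) looks like this:

import json

from aws_cdk import core
from aws_cdk import aws_dms as dms
from aws_cdk import aws_ec2 as ec2

# CommonStack, BaseDataLakeBucket and RawDMSRole are project-specific constructs
# assumed to be importable from elsewhere in the same repository.


class OrdersDMSTask(dms.CfnReplicationTask):  # hypothetical class name
    # The __init__ shown above goes here: it creates the endpoints, security group,
    # subnet group and replication instance, then calls super().__init__ with the
    # CfnReplicationTask properties.
    ...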
Example #3
    def __init__(self, scope: core.Construct, common: Common,
                 data_lake: DataLake, **kwargs) -> None:
        self.rds_endpoint = dms.CfnEndpoint(
            scope,
            f'dms-{common.env}-orders-rds-endpoint',
            endpoint_type='source',
            endpoint_identifier=f'dms-source-{common.env}-orders-rds-endpoint',
            engine_name='postgres',
            password=core.CfnDynamicReference(
                core.CfnDynamicReferenceService.SECRETS_MANAGER,
                key=f'{common.orders_rds.secret.secret_arn}:SecretString:password'
            ).to_string(),
            username=core.CfnDynamicReference(
                core.CfnDynamicReferenceService.SECRETS_MANAGER,
                key=f'{common.orders_rds.secret.secret_arn}:SecretString:username'
            ).to_string(),
            database_name=core.CfnDynamicReference(
                core.CfnDynamicReferenceService.SECRETS_MANAGER,
                key=f'{common.orders_rds.secret.secret_arn}:SecretString:dbname'
            ).to_string(),
            port=5432,
            server_name=common.orders_rds.db_instance_endpoint_address,
        )

        self.s3_endpoint = dms.CfnEndpoint(
            scope,
            f'dms-{common.env}-orders-s3-endpoint',
            endpoint_type='target',
            engine_name='s3',
            endpoint_identifier=f'dms-target-{common.env}-orders-s3-endpoint',
            extra_connection_attributes="DataFormat=parquet;maxFileSize=131072;timestampColumnName=extracted_at;includeOpForFullLoad=true;cdcInsertsAndUpdates=true",
            s3_settings=dms.CfnEndpoint.S3SettingsProperty(
                bucket_name=data_lake.data_lake_raw_bucket.bucket_name,
                bucket_folder='orders',
                compression_type='gzip',
                csv_delimiter=',',
                csv_row_delimiter='\n',
                service_access_role_arn=RawDMSRole(
                    scope, common.env, data_lake.data_lake_raw_bucket
                ).role_arn,
            ),
        )

        self.dms_sg = ec2.SecurityGroup(
            scope,
            f'dms-{common.env}-sg',
            vpc=common.custom_vpc,
            security_group_name=f'dms-{common.env}-sg',
        )

        self.dms_subnet_group = dms.CfnReplicationSubnetGroup(
            scope,
            f'dms-{common.env}-replication-subnet',
            replication_subnet_group_description='dms replication instance subnet group',
            subnet_ids=[
                subnet.subnet_id
                for subnet in common.custom_vpc.private_subnets
            ],
            replication_subnet_group_identifier=f'dms-{common.env}-replication-subnet',
        )

        self.instance = dms.CfnReplicationInstance(
            scope,
            f'dms-replication-instance-{common.env}',
            allocated_storage=100,
            publicly_accessible=False,
            engine_version='3.3.2',
            replication_instance_class='dms.t2.small',
            replication_instance_identifier=f'dms-{common.env}-replication-instance',
            vpc_security_group_ids=[self.dms_sg.security_group_id],
            replication_subnet_group_identifier=self.dms_subnet_group.replication_subnet_group_identifier,
        )

        super().__init__(
            scope,
            f'{common.env}-dms-task-orders-rds',
            migration_type='full-load-and-cdc',
            replication_task_identifier=f'{common.env}-dms-task-orders-rds',
            replication_instance_arn=self.instance.ref,
            source_endpoint_arn=self.rds_endpoint.ref,
            target_endpoint_arn=self.s3_endpoint.ref,
            table_mappings=json.dumps({
                "rules": [{
                    "rule-type": "selection",
                    "rule-id": "1",
                    "rule-name": "1",
                    "object-locator": {
                        "schema-name": "%",
                        "table-name": "%",
                    },
                    "rule-action": "include",
                    "filters": []
                }]
            }))
Example #4
    def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
        super().__init__(scope, id, **kwargs)

        with open('./props/tasksetting.json', 'r') as f1:
            py_json1 = json.load(f1)
            ts = json.dumps(py_json1)

        # with open('./props/mappingrule.json', 'r') as f2:
        #     py_json2 = json.load(f2)
        #     mr = json.dumps(py_json2)

        with open('./props/config.json', 'r') as f2:
            configuration = json.load(f2)

        def getMappingrules(self, table_list):
            rules = []
            for index, value in enumerate(table_list, 1):
                rules.append({
                    "rule-type": "selection",
                    "rule-id": str(index),
                    "rule-name": str(index),
                    "object-locator": {
                        "schema-name": value['schemaName'],
                        "table-name": value['tableName']
                    },
                    "rule-action": "include",
                    "filters": []
                })
            mapping_rules = {"rules": rules}
            return json.dumps(mapping_rules)

        # The code that defines your stack goes here
        S3Accessrole = _iam.Role(
            self,
            'dmsrole',
            assumed_by=_iam.ServicePrincipal('dms.amazonaws.com'),
            managed_policies=[
                _iam.ManagedPolicy.from_aws_managed_policy_name(
                    'AmazonS3FullAccess')
            ])

        raw_bucket = s3.Bucket(self,
                               'rawbucket',
                               bucket_name='rawbucket-datalake-cdk-oregon')
        raw_bucket.add_lifecycle_rule(
            enabled=configuration['s3LifecycleRule']['enabled'],
            expiration=core.Duration.days(
                configuration['s3LifecycleRule']['expiration']))

        #my_table = ddb.Table(self, id ='dunamoTable', table_name = 'testcdktable',
        #partition_key = ddb.Attribute(name ='lastname',type = ddb.AttributeType.STRING) )

        dl_dms = _dms.CfnReplicationInstance(
            self,
            'dmsreplication',
            replication_instance_class=configuration['DMS_instance_setting']['instance_class'],
            replication_instance_identifier='datalake-instance-cdk',
            allocated_storage=configuration['DMS_instance_setting']['allocated_storage'])

        source_endpoint = _dms.CfnEndpoint(
            self,
            'sourceendpoint',
            endpoint_type='source',
            engine_name=configuration['engineName'],
            database_name=configuration['databaseName'],
            username=configuration['username'],
            password=configuration['password'],
            port=configuration['port'],
            server_name=configuration['serverName'],
        )

        target_endpoint = _dms.CfnEndpoint(
            self,
            'targetendpoint',
            endpoint_type='target',
            engine_name='s3',
            s3_settings={
                'bucketName': raw_bucket.bucket_name,
                'serviceAccessRoleArn': S3Accessrole.role_arn
            },
            extra_connection_attributes='dataFormat=parquet')

        dms_task = _dms.CfnReplicationTask(
            self,
            'data2lake-task',
            migration_type='full-load-and-cdc',
            replication_instance_arn=dl_dms.ref,
            source_endpoint_arn=source_endpoint.ref,
            target_endpoint_arn=target_endpoint.ref,
            replication_task_settings=ts,
            table_mappings=getMappingrules(self, configuration['tableList']))

        my_table = ddb.Table(self,
                             id='dynamoTable',
                             table_name='ControllerTable',
                             partition_key=ddb.Attribute(
                                 name='path', type=ddb.AttributeType.STRING),
                             billing_mode=ddb.BillingMode.PAY_PER_REQUEST)

        datalake_bucket = s3.Bucket(self,
                                    'datalakebucket',
                                    bucket_name='datalake-bucket-cdk-oregon')

        glue_role = _iam.Role(
            self,
            'gluerole',
            assumed_by=_iam.ServicePrincipal('glue.amazonaws.com'),
            managed_policies=[
                _iam.ManagedPolicy.from_aws_managed_policy_name(
                    'service-role/AWSGlueServiceRole')
            ])

        raw_bucket.grant_read(glue_role)
        datalake_bucket.grant_read_write(glue_role)

        #lake formation settings
        #If the managed policy 'AWSLakeFormationDataAdmin' is attached to your own IAM user, extend that policy
        #to allow "lakeformation:PutDataLakeSettings" so that the data lake settings below can be applied by the CDK.
        lake_admin_setting = _lakeformation.CfnDataLakeSettings(
            self,
            'data-lake-GrantAdmin',
            admins=[
                _lakeformation.CfnDataLakeSettings.DataLakePrincipalProperty(
                    data_lake_principal_identifier=configuration['executiveArn'])
            ])

        glue_database = _glue.Database(self,
                                       'gluedatabase',
                                       database_name='data_lake_gluedb')

        glue_database.node.add_dependency(lake_admin_setting)

        glue_role_permission_inLakeFormation = _lakeformation.CfnPermissions(
            self,
            'permission-glueRole',
            data_lake_principal=_lakeformation.CfnPermissions.DataLakePrincipalProperty(
                data_lake_principal_identifier=glue_role.role_arn),
            resource=_lakeformation.CfnPermissions.ResourceProperty(
                database_resource=_lakeformation.CfnPermissions.DatabaseResourceProperty(
                    name=glue_database.database_name)),
            permissions=['ALL'])

        crawler = _glue.CfnCrawler(
            self,
            'datalakecrawler',
            name='Crawler-datalake-cdk',
            role=glue_role.role_arn,
            targets={
                's3Targets': [{
                    'path': 's3://' + datalake_bucket.bucket_name + '/datalake/'
                }]
            },
            database_name='data_lake_gluedb',
            configuration='{"Version":1.0,"CrawlerOutput":{"Partitions":{"AddOrUpdateBehavior":"InheritFromTable"},"Tables":{"AddOrUpdateBehavior":"MergeNewColumns"}}}'
        )

        initialload_script = S3Assets.Asset(self,
                                            'initial-load-code',
                                            path='./Gluejob/InitialLoad.py')
        incrementalload_script = S3Assets.Asset(
            self, 'incremental-load-code', path='./Gluejob/IncrementalLoad.py')

        initialload_script.grant_read(glue_role)
        incrementalload_script.grant_read(glue_role)
        my_table.grant_full_access(glue_role)

        initial_load_job = _glue.CfnJob(
            self,
            'initial-job',
            name='InitialLoad-cdk',
            command=_glue.CfnJob.JobCommandProperty(
                name='glueetl',
                python_version='3',
                script_location='s3://' + initialload_script.s3_bucket_name + '/' + initialload_script.s3_object_key),
            role=glue_role.role_arn,
            default_arguments={
                '--prefix': str(configuration['tableList']),
                '--bucket': raw_bucket.bucket_name,
                '--datalake_bucket': datalake_bucket.bucket_name,
                '--datalake_prefix': 'datalake/',
                '--region': CdkpyStack.of(self).region,
                '--controller_table_name': my_table.table_name
            },
            allocated_capacity=configuration['glue_job_setting']['job_capacity'],
            execution_property=_glue.CfnJob.ExecutionPropertyProperty(
                max_concurrent_runs=configuration['glue_job_setting']['max_concurrent_run_JobExecution']))

        incremental_load_job = _glue.CfnJob(
            self,
            'increment-job',
            name='IncrementalLoad-cdk',
            command=_glue.CfnJob.JobCommandProperty(
                name='glueetl',
                script_location='s3://' + incrementalload_script.s3_bucket_name + '/' + incrementalload_script.s3_object_key,
                python_version='3'),
            role=glue_role.role_arn,
            default_arguments={
                '--prefix': str(configuration['tableList']),
                '--bucket': raw_bucket.bucket_name,
                '--datalake_bucket': datalake_bucket.bucket_name,
                '--datalake_prefix': 'datalake/',
                '--region': CdkpyStack.of(self).region,
                '--controller_table_name': my_table.table_name
            },
            allocated_capacity=2,
            execution_property=_glue.CfnJob.ExecutionPropertyProperty(
                max_concurrent_runs=1))

        job_trigger = _glue.CfnTrigger(
            self,
            'datalake-glue-trigger',
            type='SCHEDULED',
            schedule=configuration['job_trigger_schedule'],
            start_on_creation=False,
            actions=[
                _glue.CfnTrigger.ActionProperty(job_name='IncrementalLoad-cdk')
            ])

        dl_sns = _sns.Topic(self, 'datalake_sns', display_name='data-lake-sns')

        endpoint_email = configuration['emailSubscriptionList']

        for emails in endpoint_email:
            dl_sns.add_subscription(_subscrption.EmailSubscription(emails))

        #Another way to subscribe: dl_subscription = _sns.Subscription(self,'email-subscrption',topic = dl_sns,endpoint='*****@*****.**',protocol= _sns.SubscriptionProtocol.EMAIL)

        glue_events_target = _events_targets.SnsTopic(dl_sns)

        glue_events_rule = _events.Rule(
            self,
            'gluejobevents-datalake',
            description='Using for tracking the failed glue job of data lake',
            rule_name='dl-gluejob-event',
            event_pattern=_events.EventPattern(
                source=['aws.glue'],
                detail_type=['Glue Job State Change'],
                detail={
                    "jobName": [initial_load_job.name],
                    "state": ["FAILED"]
                }),
            targets=[glue_events_target])

        dms_subscription = _dms.CfnEventSubscription(
            self,
            'dmsevents-datalake',
            sns_topic_arn=dl_sns.topic_arn,
            subscription_name='datalake-dmsevents',
            source_type='replication-task',
            event_categories=['failure'])
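
Example #4 reads most of its settings from ./props/config.json and ./props/tasksetting.json. The config file itself is not part of the snippet; the sketch below reconstructs its shape as a Python dict, using only the keys the stack actually reads. All values are placeholders and should be treated as assumptions:

# Shape of ./props/config.json as read by the stack above (placeholder values):
config = {
    "s3LifecycleRule": {"enabled": True, "expiration": 90},          # lifecycle rule for the raw bucket, in days
    "DMS_instance_setting": {"instance_class": "dms.t3.medium",
                             "allocated_storage": 50},
    "engineName": "mysql",                                           # source endpoint engine
    "databaseName": "sourcedb",
    "username": "admin",
    "password": "change-me",                                         # better kept in Secrets Manager, as in Example #2
    "port": 3306,
    "serverName": "source-db.example.com",
    "tableList": [{"schemaName": "sales", "tableName": "orders"}],   # drives getMappingrules()
    "executiveArn": "arn:aws:iam::111111111111:user/data-lake-admin",
    "glue_job_setting": {"job_capacity": 2,
                         "max_concurrent_run_JobExecution": 1},
    "job_trigger_schedule": "cron(0 3 * * ? *)",
    "emailSubscriptionList": ["alerts@example.com"],
}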
Example #5
    def __init__(self, scope: core.Construct, construct_id: str,
                 **kwargs) -> None:
        super().__init__(scope, construct_id, **kwargs)

        # The code that defines your stack goes here

        # Look up the existing VPC
        #vpc = ec2.Vpc.from_lookup(self, 'default',is_default=True,vpc_name='default')
        vpc = ec2.Vpc.from_lookup(self,
                                  'dms-vpc',
                                  vpc_id='vpc-08b56fb6053ca2c75')

        # Create the RDS parameter group
        db_parameter = rds.ParameterGroup(
            self,
            'dms-param-mysql5.7',
            engine=rds.DatabaseInstanceEngine.mysql(
                version=rds.MysqlEngineVersion.VER_5_7),
            parameters={"binlog_format": "ROW"})

        # sourceDB = rds.DatabaseInstanceFromSnapshot(
        #     self,'dms-rds-soruce',
        #     snapshot_identifier= 'tickets-mysql57',
        #     engine=rds.DatabaseInstanceEngine.MYSQL,
        #     instance_type=ec2.InstanceType.of(ec2.InstanceClass.BURSTABLE3,ec2.InstanceSize.MEDIUM),
        #     vpc=vpc,
        #     parameter_group=db_parameter
        #     )

        # sourceDB = rds.DatabaseInstance(
        #     self,'dms-rds-soruce',
        #     #instance_identifier='dms-rds-soruce',
        #     engine=rds.DatabaseInstanceEngine.mysql(
        #         version=rds.MysqlEngineVersion.VER_5_7
        #     ),
        #     instance_type=ec2.InstanceType.of(ec2.InstanceClass.BURSTABLE3,ec2.InstanceSize.MEDIUM),
        #     vpc=vpc,
        #     parameter_group=db_parameter,
        #     #credentials=rdsPasswordSecret
        #     )

        # sourceDB.connections.allow_default_port_internally()

        dms_rep = dms.CfnReplicationInstance(
            self,
            'dms-replication',
            replication_instance_class='dms.c5.large',
            engine_version='3.4.0')

        stream = kinesis.Stream(self, 'dms-steam')

        streamWriteRole = iam.Role(
            self,
            'dms-stream-role',
            assumed_by=iam.ServicePrincipal('dms.amazonaws.com'))

        streamWriteRole.add_to_policy(
            iam.PolicyStatement(resources=[stream.stream_arn],
                                actions=[
                                    'kinesis:DescribeStream',
                                    'kinesis:PutRecord', 'kinesis:PutRecords'
                                ]))

        source = dms.CfnEndpoint(
            self,
            'dms-source',
            endpoint_type='source',
            engine_name='mysql',
            username='******',
            password='******',
            server_name="dms-rdssource.c7iucbqgd2xo.us-east-1.rds.amazonaws.com",
            port=3306)

        target = dms.CfnEndpoint(
            self,
            'dms-target',
            endpoint_type='target',
            engine_name='kinesis',
            kinesis_settings={
                "messageFormat": "JSON",
                "streamArn": stream.stream_arn,
                "serviceAccessRoleArn": streamWriteRole.role_arn
            })

        dmsTableMappings = {
            "rules": [{
                "rule-type": "selection",
                "rule-id": "1",
                "rule-name": "1",
                "object-locator": {
                    "schema-name": "dms_sample",
                    "table-name": "t_log_levelup"
                },
                "rule-action": "include"
            }]
        }

        dms.CfnReplicationTask(self,
                               'dms-stream-repTask',
                               replication_instance_arn=dms_rep.ref,
                               migration_type='full-load-and-cdc',
                               source_endpoint_arn=source.ref,
                               target_endpoint_arn=target.ref,
                               table_mappings=json.dumps(dmsTableMappings))

        analyticsRole = iam.Role(
            self,
            'KinesisAnalyticsRole',
            assumed_by=iam.ServicePrincipal('kinesisanalytics.amazonaws.com'))

        kinesisanalytics.CfnApplicationV2(
            self,
            'KinesisAnalytics',
            application_name='dms-stream-anlytics',
            service_execution_role=analyticsRole.role_arn,
            runtime_environment='SQL-1_0',
            application_configuration={
                'sqlApplicationConfiguration': {
                    'inputs': [{
                        'namePrefix': "exampleNamePrefix",
                        'inputSchema': {
                            'recordColumns': [{
                                'name': "example",
                                'sqlType': "VARCHAR(16)",
                                'mapping': "$.example"
                            }],
                            'recordFormat': {
                                'recordFormatType': "JSON",
                                'mappingParameters': {
                                    'jsonMappingParameters': {
                                        'recordRowPath': "$"
                                    }
                                }
                            }
                        },
                        'kinesisStreamsInput': {
                            'resourceArn': stream.stream_arn
                        }
                    }]
                },
                'applicationCodeConfiguration': {
                    'codeContent': {
                        'textContent': "Example Application Code"
                    },
                    'codeContentType': "PLAINTEXT"
                }
            })
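
Examples #4 and #5 are complete stacks: each snippet is the body of a core.Stack subclass. A minimal sketch of the app entry point Example #5 assumes follows; the class and stack names, account and region are placeholders, and the snippet's own imports (json, aws_dms, aws_ec2, aws_iam, aws_kinesis, aws_kinesisanalytics, aws_rds) are omitted here. Note that ec2.Vpc.from_lookup only works when the stack is given a concrete environment.

from aws_cdk import core


class DmsStreamStack(core.Stack):  # hypothetical name; the body is the Example #5 snippet
    def __init__(self, scope: core.Construct, construct_id: str, **kwargs) -> None:
        super().__init__(scope, construct_id, **kwargs)
        # ... resources from Example #5 go here ...


app = core.App()
DmsStreamStack(
    app,
    'dms-stream-stack',
    # from_lookup requires an explicit account and region; placeholders shown here
    env=core.Environment(account='111111111111', region='us-east-1'),
)
app.synth()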