def __init__(
    self,
    scope: core.Construct,
    rds_stack: RDSStack,
    data_lake_bronze_bucket: BaseDataLakeBucket,
    **kwargs,
):
    self.deploy_env = active_environment
    self.rds_stack = rds_stack
    self.data_lake_bronze_bucket = data_lake_bronze_bucket

    # Source endpoint: the e-commerce PostgreSQL RDS instance
    self.rds_endpoint = dms.CfnEndpoint(
        scope=scope,
        id=f"dms-{self.deploy_env.value}-ecommerce-rds-endpoint",
        endpoint_type="source",
        endpoint_identifier=f"dms-source-{self.deploy_env.value}-ecommerce-rds-endpoint",
        engine_name="postgres",
        password=db_password,  # should not be hardcoded; move to Secrets Manager and use a dynamic reference
        username=db_username,
        database_name=db_name,
        port=5432,
        server_name=self.rds_stack.ecommerce_rds.db_instance_endpoint_address,
        extra_connection_attributes="captureDDLs=Y",  # capture DDL changes to tables
    )

    # Target endpoint: parquet files in the bronze data lake bucket
    self.s3_endpoint = dms.CfnEndpoint(
        scope=scope,
        id=f"dms-{self.deploy_env.value}-ecommerce-s3-endpoint",
        endpoint_type="target",
        endpoint_identifier=f"dms-target-{self.deploy_env.value}-ecommerce-s3-endpoint",
        engine_name="s3",
        extra_connection_attributes="DataFormat=parquet;maxFileSize=131072;timestampColumnName=extracted_at;includeOpForFullLoad=true;cdcMaxBatchInterval=120",
        s3_settings=dms.CfnEndpoint.S3SettingsProperty(
            bucket_name=self.data_lake_bronze_bucket.bucket_name,
            bucket_folder="ecommerce_rds",
            compression_type="gzip",
            csv_delimiter=",",
            csv_row_delimiter="\n",
            service_access_role_arn=RawDMSRole(
                scope, self.data_lake_bronze_bucket
            ).role_arn,
        ),
    )
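# A minimal sketch of the Secrets Manager approach suggested in the comment above,
# assuming the RDS construct exposes a generated secret (e.g. self.rds_stack.ecommerce_rds.secret);
# that attribute name is illustrative, not confirmed by this stack. CloudFormation resolves the
# dynamic reference at deploy time, so the plaintext password never appears in the synthesized template:
#
#     password=core.CfnDynamicReference(
#         core.CfnDynamicReferenceService.SECRETS_MANAGER,
#         key=f"{self.rds_stack.ecommerce_rds.secret.secret_arn}:SecretString:password",
#     ).to_string(),
#
# The next snippet applies exactly this pattern for the username, password, and database name.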
def __init__(
    self,
    scope: core.Construct,
    common_stack: CommonStack,
    data_lake_raw_bucket: BaseDataLakeBucket,
    **kwargs,
) -> None:
    self.data_lake_raw_bucket = data_lake_raw_bucket
    self.common_stack = common_stack
    self.deploy_env = scope.deploy_env

    self.rds_endpoint = dms.CfnEndpoint(
        scope,
        f"dms-{self.deploy_env.value}-orders-rds-endpoint",
        endpoint_type="source",
        endpoint_identifier=f"dms-source-{self.deploy_env.value}-orders-rds-endpoint",
        engine_name="postgres",
        password=core.CfnDynamicReference(
            core.CfnDynamicReferenceService.SECRETS_MANAGER,
            key=f"{self.common_stack.orders_rds.secret.secret_arn}:SecretString:password",
        ).to_string(),
        username=core.CfnDynamicReference(
            core.CfnDynamicReferenceService.SECRETS_MANAGER,
            key=f"{self.common_stack.orders_rds.secret.secret_arn}:SecretString:username",
        ).to_string(),
        database_name=core.CfnDynamicReference(
            core.CfnDynamicReferenceService.SECRETS_MANAGER,
            key=f"{self.common_stack.orders_rds.secret.secret_arn}:SecretString:dbname",
        ).to_string(),
        port=5432,
        server_name=self.common_stack.orders_rds.db_instance_endpoint_address,
        extra_connection_attributes="captureDDLs=Y",
    )

    self.s3_endpoint = dms.CfnEndpoint(
        scope,
        f"dms-{self.deploy_env.value}-orders-s3-endpoint",
        endpoint_type="target",
        engine_name="s3",
        endpoint_identifier=f"dms-target-{self.deploy_env.value}-orders-s3-endpoint",
        extra_connection_attributes="DataFormat=parquet;maxFileSize=131072;timestampColumnName=extracted_at;includeOpForFullLoad=true;cdcMaxBatchInterval=120",
        s3_settings=dms.CfnEndpoint.S3SettingsProperty(
            bucket_name=self.data_lake_raw_bucket.bucket_name,
            bucket_folder="orders",
            compression_type="gzip",
            csv_delimiter=",",
            csv_row_delimiter="\n",
            service_access_role_arn=RawDMSRole(
                scope, self.data_lake_raw_bucket
            ).role_arn,
        ),
    )

    self.dms_sg = ec2.SecurityGroup(
        scope,
        f"dms-{self.deploy_env.value}-sg",
        vpc=self.common_stack.custom_vpc,
        security_group_name=f"dms-{self.deploy_env.value}-sg",
    )

    self.dms_subnet_group = dms.CfnReplicationSubnetGroup(
        scope,
        f"dms-{self.deploy_env.value}-replication-subnet",
        replication_subnet_group_description="dms replication instance subnet group",
        subnet_ids=[
            subnet.subnet_id
            for subnet in self.common_stack.custom_vpc.private_subnets
        ],
        replication_subnet_group_identifier=f"dms-{self.deploy_env.value}-replication-subnet",
    )

    self.instance = dms.CfnReplicationInstance(
        scope,
        f"dms-replication-instance-{self.deploy_env.value}",
        allocated_storage=100,
        publicly_accessible=False,
        engine_version="3.4.4",
        replication_instance_class="dms.t2.small",
        replication_instance_identifier=f"dms-{self.deploy_env.value}-replication-instance",
        vpc_security_group_ids=[self.dms_sg.security_group_id],
        replication_subnet_group_identifier=self.dms_subnet_group.replication_subnet_group_identifier,
    )
    self.instance.node.add_dependency(self.dms_subnet_group)
    self.instance.node.add_dependency(self.dms_sg)

    super().__init__(
        scope,
        f"{self.deploy_env.value}-dms-task-orders-rds",
        migration_type="full-load-and-cdc",
        replication_task_identifier=f"{self.deploy_env.value}-dms-task-orders-rds",
        replication_instance_arn=self.instance.ref,
        source_endpoint_arn=self.rds_endpoint.ref,
        target_endpoint_arn=self.s3_endpoint.ref,
        table_mappings=json.dumps(
            {
                "rules": [
                    {
                        "rule-type": "selection",
                        "rule-id": "1",
                        "rule-name": "1",
                        "object-locator": {
                            "schema-name": "%",
                            "table-name": "%",
                        },
                        "rule-action": "include",
                        "filters": [],
                    }
                ]
            }
        ),
    )
def __init__(self, scope: core.Construct, common: Common, data_lake: DataLake, **kwargs) -> None:
    self.rds_endpoint = dms.CfnEndpoint(
        scope,
        f'dms-{common.env}-orders-rds-endpoint',
        endpoint_type='source',
        endpoint_identifier=f'dms-source-{common.env}-orders-rds-endpoint',
        engine_name='postgres',
        password=core.CfnDynamicReference(
            core.CfnDynamicReferenceService.SECRETS_MANAGER,
            key=f'{common.orders_rds.secret.secret_arn}:SecretString:password',
        ).to_string(),
        username=core.CfnDynamicReference(
            core.CfnDynamicReferenceService.SECRETS_MANAGER,
            key=f'{common.orders_rds.secret.secret_arn}:SecretString:username',
        ).to_string(),
        database_name=core.CfnDynamicReference(
            core.CfnDynamicReferenceService.SECRETS_MANAGER,
            key=f'{common.orders_rds.secret.secret_arn}:SecretString:dbname',
        ).to_string(),
        port=5432,
        server_name=common.orders_rds.db_instance_endpoint_address,
    )

    self.s3_endpoint = dms.CfnEndpoint(
        scope,
        f'dms-{common.env}-orders-s3-endpoint',
        endpoint_type='target',
        engine_name='s3',
        endpoint_identifier=f'dms-target-{common.env}-orders-s3-endpoint',
        extra_connection_attributes='DataFormat=parquet;maxFileSize=131072;timestampColumnName=extracted_at;includeOpForFullLoad=true;cdcInsertsAndUpdates=true',
        s3_settings=dms.CfnEndpoint.S3SettingsProperty(
            bucket_name=data_lake.data_lake_raw_bucket.bucket_name,
            bucket_folder='orders',
            compression_type='gzip',
            csv_delimiter=',',
            csv_row_delimiter='\n',
            service_access_role_arn=RawDMSRole(
                scope, common.env, data_lake.data_lake_raw_bucket
            ).role_arn,
        ),
    )

    self.dms_sg = ec2.SecurityGroup(
        scope,
        f'dms-{common.env}-sg',
        vpc=common.custom_vpc,
        security_group_name=f'dms-{common.env}-sg',
    )

    self.dms_subnet_group = dms.CfnReplicationSubnetGroup(
        scope,
        f'dms-{common.env}-replication-subnet',
        replication_subnet_group_description='dms replication instance subnet group',
        subnet_ids=[
            subnet.subnet_id for subnet in common.custom_vpc.private_subnets
        ],
        replication_subnet_group_identifier=f'dms-{common.env}-replication-subnet',
    )

    self.instance = dms.CfnReplicationInstance(
        scope,
        f'dms-replication-instance-{common.env}',
        allocated_storage=100,
        publicly_accessible=False,
        engine_version='3.3.2',
        replication_instance_class='dms.t2.small',
        replication_instance_identifier=f'dms-{common.env}-replication-instance',
        vpc_security_group_ids=[self.dms_sg.security_group_id],
        replication_subnet_group_identifier=self.dms_subnet_group.replication_subnet_group_identifier,
    )

    super().__init__(
        scope,
        f'{common.env}-dms-task-orders-rds',
        migration_type='full-load-and-cdc',
        replication_task_identifier=f'{common.env}-dms-task-orders-rds',
        replication_instance_arn=self.instance.ref,
        source_endpoint_arn=self.rds_endpoint.ref,
        target_endpoint_arn=self.s3_endpoint.ref,
        table_mappings=json.dumps({
            "rules": [{
                "rule-type": "selection",
                "rule-id": "1",
                "rule-name": "1",
                "object-locator": {
                    "schema-name": "%",
                    "table-name": "%",
                },
                "rule-action": "include",
                "filters": []
            }]
        }),
    )
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    with open('./props/tasksetting.json', 'r') as f1:
        py_json1 = json.load(f1)
        ts = json.dumps(py_json1)

    # with open('./props/mappingrule.json', 'r') as f2:
    #     py_json2 = json.load(f2)
    #     mr = json.dumps(py_json2)

    with open('./props/config.json', 'r') as f2:
        configuration = json.load(f2)

    # Build the DMS table-mapping JSON from the table list in config.json
    def getMappingrules(self, table_list):
        rules = []
        for index, value in enumerate(table_list, 1):
            rules.append({
                "rule-type": "selection",
                "rule-id": str(index),
                "rule-name": str(index),
                "object-locator": {
                    "schema-name": value['schemaName'],
                    "table-name": value['tableName']
                },
                "rule-action": "include",
                "filters": []
            })
        mapping_rules = {"rules": rules}
        return json.dumps(mapping_rules)

    # The code that defines your stack goes here
    S3Accessrole = _iam.Role(
        self,
        'dmsrole',
        assumed_by=_iam.ServicePrincipal('dms.amazonaws.com'),
        managed_policies=[
            _iam.ManagedPolicy.from_aws_managed_policy_name('AmazonS3FullAccess')
        ])

    raw_bucket = s3.Bucket(self, 'rawbucket', bucket_name='rawbucket-datalake-cdk-oregon')
    raw_bucket.add_lifecycle_rule(
        enabled=configuration['s3LifecycleRule']['enabled'],
        expiration=core.Duration.days(configuration['s3LifecycleRule']['expiration']))

    # my_table = ddb.Table(self, id='dunamoTable', table_name='testcdktable',
    #                      partition_key=ddb.Attribute(name='lastname', type=ddb.AttributeType.STRING))

    dl_dms = _dms.CfnReplicationInstance(
        self,
        'dmsreplication',
        replication_instance_class=configuration['DMS_instance_setting']['instance_class'],
        replication_instance_identifier='datalake-instance-cdk',
        allocated_storage=configuration['DMS_instance_setting']['allocated_storage'])

    source_endpoint = _dms.CfnEndpoint(
        self,
        'sourceendpoint',
        endpoint_type='source',
        engine_name=configuration['engineName'],
        database_name=configuration['databaseName'],
        username=configuration['username'],
        password=configuration['password'],
        port=configuration['port'],
        server_name=configuration['serverName'],
    )

    target_endpoint = _dms.CfnEndpoint(
        self,
        'targetendpoint',
        endpoint_type='target',
        engine_name='s3',
        s3_settings={
            'bucketName': raw_bucket.bucket_name,
            'serviceAccessRoleArn': S3Accessrole.role_arn
        },
        extra_connection_attributes='dataFormat=parquet')

    dms_task = _dms.CfnReplicationTask(
        self,
        'data2lake-task',
        migration_type='full-load-and-cdc',
        replication_instance_arn=dl_dms.ref,
        source_endpoint_arn=source_endpoint.ref,
        target_endpoint_arn=target_endpoint.ref,
        replication_task_settings=ts,
        table_mappings=getMappingrules(self, configuration['tableList']))

    my_table = ddb.Table(
        self,
        id='dynamoTable',
        table_name='ControllerTable',
        partition_key=ddb.Attribute(name='path', type=ddb.AttributeType.STRING),
        billing_mode=ddb.BillingMode.PAY_PER_REQUEST)

    datalake_bucket = s3.Bucket(self, 'datalakebucket', bucket_name='datalake-bucket-cdk-oregon')

    glue_role = _iam.Role(
        self,
        'gluerole',
        assumed_by=_iam.ServicePrincipal('glue.amazonaws.com'),
        managed_policies=[
            _iam.ManagedPolicy.from_aws_managed_policy_name('service-role/AWSGlueServiceRole')
        ])
    raw_bucket.grant_read(glue_role)
    datalake_bucket.grant_read_write(glue_role)

    # Lake Formation settings
    # If you have attached the managed policy 'AWSLakeFormationDataAdmin' to your own IAM user,
    # you should change that policy to allow "lakeformation:PutDataLakeSettings",
    # so that the lake settings below can be applied by the CDK.
    lake_admin_setting = _lakeformation.CfnDataLakeSettings(
        self,
        'data-lake-GrantAdmin',
        admins=[
            _lakeformation.CfnDataLakeSettings.DataLakePrincipalProperty(
                data_lake_principal_identifier=configuration['executiveArn'])
        ])

    glue_database = _glue.Database(self, 'gluedatabase', database_name='data_lake_gluedb')
    glue_database.node.add_dependency(lake_admin_setting)

    glue_role_permission_inLakeFormation = _lakeformation.CfnPermissions(
        self,
        'permission-glueRole',
        data_lake_principal=_lakeformation.CfnPermissions.DataLakePrincipalProperty(
            data_lake_principal_identifier=glue_role.role_arn),
        resource=_lakeformation.CfnPermissions.ResourceProperty(
            database_resource=_lakeformation.CfnPermissions.DatabaseResourceProperty(
                name=glue_database.database_name)),
        permissions=['ALL'])

    crawler = _glue.CfnCrawler(
        self,
        'datalakecrawler',
        name='Crawler-datalake-cdk',
        role=glue_role.role_arn,
        targets={
            's3Targets': [{
                'path': 's3://' + datalake_bucket.bucket_name + '/datalake/'
            }]
        },
        database_name='data_lake_gluedb',
        configuration="{\"Version\":1.0,\"CrawlerOutput\":{\"Partitions\":{\"AddOrUpdateBehavior\":\"InheritFromTable\"},\"Tables\":{\"AddOrUpdateBehavior\":\"MergeNewColumns\"}}}")

    initialload_script = S3Assets.Asset(self, 'initial-load-code', path='./Gluejob/InitialLoad.py')
    incrementalload_script = S3Assets.Asset(self, 'incremental-load-code', path='./Gluejob/IncrementalLoad.py')
    initialload_script.grant_read(glue_role)
    incrementalload_script.grant_read(glue_role)
    my_table.grant_full_access(glue_role)

    initial_load_job = _glue.CfnJob(
        self,
        'initial-job',
        name='InitialLoad-cdk',
        command=_glue.CfnJob.JobCommandProperty(
            name='glueetl',
            python_version='3',
            script_location='s3://' + initialload_script.s3_bucket_name + '/' + initialload_script.s3_object_key),
        role=glue_role.role_arn,
        default_arguments={
            '--prefix': str(configuration['tableList']),
            '--bucket': raw_bucket.bucket_name,
            '--datalake_bucket': datalake_bucket.bucket_name,
            '--datalake_prefix': 'datalake/',
            '--region': CdkpyStack.of(self).region,
            '--controller_table_name': my_table.table_name
        },
        allocated_capacity=configuration['glue_job_setting']['job_capacity'],
        execution_property=_glue.CfnJob.ExecutionPropertyProperty(
            max_concurrent_runs=configuration['glue_job_setting']['max_concurrent_run_JobExecution']))

    incremental_load_job = _glue.CfnJob(
        self,
        'increment-job',
        name='IncrementalLoad-cdk',
        command=_glue.CfnJob.JobCommandProperty(
            name='glueetl',
            script_location='s3://' + incrementalload_script.s3_bucket_name + '/' + incrementalload_script.s3_object_key,
            python_version='3'),
        role=glue_role.role_arn,
        default_arguments={
            '--prefix': str(configuration['tableList']),
            '--bucket': raw_bucket.bucket_name,
            '--datalake_bucket': datalake_bucket.bucket_name,
            '--datalake_prefix': 'datalake/',
            '--region': CdkpyStack.of(self).region,
            '--controller_table_name': my_table.table_name
        },
        allocated_capacity=2,
        execution_property=_glue.CfnJob.ExecutionPropertyProperty(max_concurrent_runs=1))

    job_trigger = _glue.CfnTrigger(
        self,
        'datalake-glue-trigger',
        type='SCHEDULED',
        schedule=configuration['job_trigger_schedule'],
        start_on_creation=False,
        actions=[
            _glue.CfnTrigger.ActionProperty(job_name='IncrementalLoad-cdk')
        ])

    dl_sns = _sns.Topic(self, 'datalake_sns', display_name='data-lake-sns')

    endpoint_email = configuration['emailSubscriptionList']
    for emails in endpoint_email:
        dl_sns.add_subscription(_subscrption.EmailSubscription(emails))

    # Another way to subscribe:
    # dl_subscription = _sns.Subscription(self, 'email-subscrption', topic=dl_sns,
    #                                     endpoint='*****@*****.**',
    #                                     protocol=_sns.SubscriptionProtocol.EMAIL)

    glue_events_target = _events_targets.SnsTopic(dl_sns)

    glue_events_rule = _events.Rule(
        self,
        'gluejobevents-datalake',
        description='Using for tracking the failed glue job of data lake',
        rule_name='dl-gluejob-event',
        event_pattern=_events.EventPattern(
            source=['aws.glue'],
            detail_type=['Glue Job State Change'],
            detail={
                "jobName": [initial_load_job.name],
                "state": ["FAILED"]
            }),
        targets=[glue_events_target])

    dms_subscription = _dms.CfnEventSubscription(
        self,
        'dmsevents-datalake',
        sns_topic_arn=dl_sns.topic_arn,
        subscription_name='datalake-dmsevents',
        source_type='replication-task',
        event_categories=['failure'])
def __init__(self, scope: core.Construct, construct_id: str, **kwargs) -> None:
    super().__init__(scope, construct_id, **kwargs)

    # The code that defines your stack goes here

    # Look up the VPC
    # vpc = ec2.Vpc.from_lookup(self, 'default', is_default=True, vpc_name='default')
    vpc = ec2.Vpc.from_lookup(self, 'dms-vpc', vpc_id='vpc-08b56fb6053ca2c75')

    # Create the RDS parameter group
    db_parameter = rds.ParameterGroup(
        self,
        'dms-param-mysql5.7',
        engine=rds.DatabaseInstanceEngine.mysql(version=rds.MysqlEngineVersion.VER_5_7),
        parameters={"binlog_format": "ROW"})

    # sourceDB = rds.DatabaseInstanceFromSnapshot(
    #     self, 'dms-rds-soruce',
    #     snapshot_identifier='tickets-mysql57',
    #     engine=rds.DatabaseInstanceEngine.MYSQL,
    #     instance_type=ec2.InstanceType.of(ec2.InstanceClass.BURSTABLE3, ec2.InstanceSize.MEDIUM),
    #     vpc=vpc,
    #     parameter_group=db_parameter
    # )

    # sourceDB = rds.DatabaseInstance(
    #     self, 'dms-rds-soruce',
    #     # instance_identifier='dms-rds-soruce',
    #     engine=rds.DatabaseInstanceEngine.mysql(
    #         version=rds.MysqlEngineVersion.VER_5_7
    #     ),
    #     instance_type=ec2.InstanceType.of(ec2.InstanceClass.BURSTABLE3, ec2.InstanceSize.MEDIUM),
    #     vpc=vpc,
    #     parameter_group=db_parameter,
    #     # credentials=rdsPasswordSecret
    # )

    # sourceDB.connections.allow_default_port_internally()

    dms_rep = dms.CfnReplicationInstance(
        self,
        'dms-replication',
        replication_instance_class='dms.c5.large',
        engine_version='3.4.0')

    stream = kinesis.Stream(self, 'dms-steam')

    streamWriteRole = iam.Role(
        self,
        'dms-stream-role',
        assumed_by=iam.ServicePrincipal('dms.amazonaws.com'))
    streamWriteRole.add_to_policy(
        iam.PolicyStatement(
            resources=[stream.stream_arn],
            actions=[
                'kinesis:DescribeStream',
                'kinesis:PutRecord',
                'kinesis:PutRecords'
            ]))

    source = dms.CfnEndpoint(
        self,
        'dms-source',
        endpoint_type='source',
        engine_name='mysql',
        username='******',
        password='******',
        server_name="dms-rdssource.c7iucbqgd2xo.us-east-1.rds.amazonaws.com",
        port=3306)

    target = dms.CfnEndpoint(
        self,
        'dms-target',
        endpoint_type='target',
        engine_name='kinesis',
        kinesis_settings={
            "messageFormat": "JSON",
            "streamArn": stream.stream_arn,
            "serviceAccessRoleArn": streamWriteRole.role_arn
        })

    dmsTableMappings = {
        "rules": [{
            "rule-type": "selection",
            "rule-id": "1",
            "rule-name": "1",
            "object-locator": {
                "schema-name": "dms_sample",
                "table-name": "t_log_levelup"
            },
            "rule-action": "include"
        }]
    }

    dms.CfnReplicationTask(
        self,
        'dms-stream-repTask',
        replication_instance_arn=dms_rep.ref,
        migration_type='full-load-and-cdc',
        source_endpoint_arn=source.ref,
        target_endpoint_arn=target.ref,
        table_mappings=json.dumps(dmsTableMappings))

    analyticsRole = iam.Role(
        self,
        'KinesisAnalyticsRole',
        assumed_by=iam.ServicePrincipal('kinesisanalytics.amazonaws.com'))

    kinesisanalytics.CfnApplicationV2(
        self,
        'KinesisAnalytics',
        application_name='dms-stream-anlytics',
        service_execution_role=analyticsRole.role_arn,
        runtime_environment='SQL-1_0',
        application_configuration={
            'sqlApplicationConfiguration': {
                'inputs': [{
                    'namePrefix': "exampleNamePrefix",
                    'inputSchema': {
                        'recordColumns': [{
                            'name': "example",
                            'sqlType': "VARCHAR(16)",
                            'mapping': "$.example"
                        }],
                        'recordFormat': {
                            'recordFormatType': "JSON",
                            'mappingParameters': {
                                'jsonMappingParameters': {
                                    'recordRowPath': "$"
                                }
                            }
                        }
                    },
                    'kinesisStreamsInput': {
                        'resourceArn': stream.stream_arn
                    }
                }]
            },
            'applicationCodeConfiguration': {
                'codeContent': {
                    'textContent': "Example Application Code"
                },
                'codeContentType': "PLAINTEXT"
            }
        })