def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    # CloudFormation parameters
    glue_db_name = core.CfnParameter(
        self,
        "GlueDatabaseName",
        type="String",
        description="Glue Database where the Table belongs.",
        allowed_pattern=r"[\w-]+",
    )

    glue_table_name = core.CfnParameter(
        self,
        "GlueTableName",
        type="String",
        description="Glue Table where access will be granted.",
        allowed_pattern=r"[\w-]+",
    )

    grantee_role_arn = core.CfnParameter(
        self,
        "GranteeIAMRoleARN",
        type="String",
        description="IAM Role's ARN.",
        allowed_pattern=r"arn:(aws[a-zA-Z-]*)?:iam::\d{12}:role\/?[a-zA-Z0-9_+=,.@\-]+",
    )

    grantee_vpc = core.CfnParameter(
        self,
        "GranteeVPC",
        type="String",
        description="VPC ID from where the S3 access point will be accessed.",
        allowed_pattern=r"vpc-[a-zA-Z0-9]+",
    )

    is_lakeformation = core.CfnParameter(
        self,
        "LakeFormationParam",
        type="String",
        description=(
            "If Lake Formation is used, the stack must be deployed using an "
            "IAM role with Lake Formation Admin permissions."
        ),
        allowed_values=["Yes", "No"],
    )

    # CloudFormation parameter groups
    self.template_options.description = (
        "This template deploys an S3 Access Point which provides a given IAM Role "
        "access to the underlying data location for a given Glue Table.\n"
        "The main use case for this template is to grant an ETL process in another "
        "AWS Account access to the S3 objects (e.g., Parquet files) associated with "
        "a Glue Table."
    )
    self.template_options.metadata = {
        "AWS::CloudFormation::Interface": {
            "License": "MIT-0",
            "ParameterGroups": [
                {
                    "Label": {"default": "Lake Formation (Producer Account)"},
                    "Parameters": [is_lakeformation.logical_id],
                },
                {
                    "Label": {"default": "Source Data Catalog Resource (Producer Account)"},
                    "Parameters": [glue_db_name.logical_id, glue_table_name.logical_id],
                },
                {
                    "Label": {"default": "Grantee IAM Role (Consumer Account)"},
                    "Parameters": [grantee_role_arn.logical_id, grantee_vpc.logical_id],
                },
            ],
            "ParameterLabels": {
                is_lakeformation.logical_id: {
                    "default": "Are data permissions managed by Lake Formation?"
                },
                glue_db_name.logical_id: {
                    "default": "What is the Glue DB Name for the Table?"
                },
                glue_table_name.logical_id: {
                    "default": "What is the Glue Table Name?"
                },
                grantee_role_arn.logical_id: {
                    "default": "What is the ARN of the IAM Role?"
                },
                grantee_vpc.logical_id: {
                    "default": "What VPC will be used to access the S3 Access Point?"
                },
            },
        }
    }

    is_lakeformation_condition = core.CfnCondition(
        self,
        "IsLakeFormation",
        expression=core.Fn.condition_equals("Yes", is_lakeformation),
    )

    # Create the S3 Access Point to share the dataset objects
    grantee_role = iam.Role.from_role_arn(
        self, "GranteeIAMRole", grantee_role_arn.value_as_string)

    glue_table_arn = (
        f"arn:aws:glue:{core.Aws.REGION}:{core.Aws.ACCOUNT_ID}:"
        f"table/{glue_db_name.value_as_string}/{glue_table_name.value_as_string}"
    )
    glue_table = glue.Table.from_table_arn(self, "GlueTable", table_arn=glue_table_arn)

    # Invoke a Lambda function to obtain the S3 bucket and S3 prefix from the Glue Table
    get_s3_from_table_execution_role = iam.Role(
        self,
        "GetS3FromTableServiceRole",
        assumed_by=iam.ServicePrincipal("lambda.amazonaws.com"),
        managed_policies=[
            iam.ManagedPolicy.from_aws_managed_policy_name(
                "service-role/AWSLambdaBasicExecutionRole"),
            iam.ManagedPolicy.from_aws_managed_policy_name(
                "service-role/AWSGlueServiceRole"),
        ],
    )

    lf_permission = lf.CfnPermissions(
        self,
        "LFPermissionForLambda",
        data_lake_principal=lf.CfnPermissions.DataLakePrincipalProperty(
            data_lake_principal_identifier=get_s3_from_table_execution_role.role_arn),
        resource=lf.CfnPermissions.ResourceProperty(
            table_resource=lf.CfnPermissions.TableResourceProperty(
                name=glue_table_name.value_as_string,
                database_name=glue_db_name.value_as_string)),
        permissions=["DESCRIBE"],
    )
    lf_permission.apply_removal_policy(
        core.RemovalPolicy.DESTROY, apply_to_update_replace_policy=True)
    lf_permission.node.add_dependency(get_s3_from_table_execution_role)
    lf_permission.cfn_options.condition = is_lakeformation_condition

    lf_wait_condition_handle = cfn.CfnWaitConditionHandle(
        self, "LFWaitConditionHandle")
    lf_wait_condition_handle.add_metadata(
        "WaitForLFPermissionIfExists",
        core.Fn.condition_if(is_lakeformation_condition.logical_id,
                             lf_permission.logical_id, ""))

    with open("lambda/get_s3_from_table.py", encoding="utf8") as fp:
        get_s3_from_table_code = fp.read()

    get_s3_from_table_fn = _lambda.Function(
        self,
        "GetS3FromTableHandler",
        runtime=_lambda.Runtime.PYTHON_3_7,
        code=_lambda.Code.from_inline(get_s3_from_table_code),
        handler="index.handler",
        role=get_s3_from_table_execution_role,
        timeout=core.Duration.seconds(600),
    )

    get_s3_from_table = core.CustomResource(
        self,
        "GetS3FromTable",
        service_token=get_s3_from_table_fn.function_arn,
        resource_type="Custom::GetS3FromTable",
        properties={
            "GlueDatabase": glue_db_name.value_as_string,
            "GlueTable": glue_table_name.value_as_string,
        },
    )
    get_s3_from_table.node.add_dependency(lf_wait_condition_handle)

    table_bucket = get_s3_from_table.get_att_string("TableBucket")
    table_prefix = get_s3_from_table.get_att_string("TablePrefix")

    # Create the S3 Access Point
    table_name_normalized = core.Fn.join(
        "-", core.Fn.split("_", glue_table_name.value_as_string))
    random_suffix = core.Fn.select(
        0,
        core.Fn.split(
            "-", core.Fn.select(2, core.Fn.split("/", core.Aws.STACK_ID))))

    s3_accesspoint_name = f"{table_name_normalized}-{random_suffix}"
    s3_accesspoint_arn = (
        f"arn:aws:s3:{core.Aws.REGION}:{core.Aws.ACCOUNT_ID}:accesspoint/{s3_accesspoint_name}"
    )
    glue_table_accesspoint_path = f"{s3_accesspoint_arn}/object/{table_prefix}"

    # s3_accesspoint_block_config = s3.CfnAccessPoint.PublicAccessBlockConfigurationProperty(
    #     block_public_acls=True, block_public_policy=True,
    #     ignore_public_acls=True, restrict_public_buckets=True)

    s3_accesspoint_policy = iam.PolicyDocument(statements=[
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            principals=[iam.ArnPrincipal(arn=grantee_role.role_arn)],
            actions=["s3:GetObject*"],
            resources=[f"{glue_table_accesspoint_path}*"]),
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            principals=[iam.ArnPrincipal(arn=grantee_role.role_arn)],
            actions=["s3:ListBucket*"],
            resources=[s3_accesspoint_arn],
            conditions={"StringLike": {"s3:prefix": f"{table_prefix}*"}}),
    ])

    s3_accesspoint = s3.CfnAccessPoint(
        self,
        "S3AccessPoint",
        bucket=f"{table_bucket}",
        name=s3_accesspoint_name,
        # network_origin="Internet",
        policy=s3_accesspoint_policy,
        vpc_configuration=s3.CfnAccessPoint.VpcConfigurationProperty(
            vpc_id=grantee_vpc.value_as_string),
    )

    glue_table_accesspoint_path_output = (
        f"arn:aws:s3:{core.Aws.REGION}:{core.Aws.ACCOUNT_ID}:"
        f"accesspoint/{s3_accesspoint.name}/object/{table_prefix}"
    )

    # Outputs
    core.CfnOutput(self, "IAMRoleArnOutput",
                   value=grantee_role.role_arn,
                   description="IAM Role Arn")
    core.CfnOutput(self, "GlueTableOutput",
                   value=glue_table.table_arn,
                   description="Glue Table ARN")
    core.CfnOutput(self, "S3AccessPointPathOutput",
                   value=glue_table_accesspoint_path_output,
                   description="S3 Access Point Path for Glue Table")
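
# Illustrative usage sketch (not part of the stack above): once the stack is deployed,
# the grantee role in the consumer account can read the shared Glue Table objects by
# passing the S3 Access Point ARN wherever S3 expects a bucket name. The function name
# and both arguments are hypothetical placeholders for the values shown in the stack outputs.
def read_objects_via_access_point_example(access_point_arn: str, table_prefix: str) -> list:
    import boto3

    s3_client = boto3.client("s3")
    # List the objects under the table prefix through the access point
    listing = s3_client.list_objects_v2(Bucket=access_point_arn, Prefix=table_prefix)
    keys = [obj["Key"] for obj in listing.get("Contents", [])]
    # Download each object body (e.g., the Parquet files backing the Glue Table)
    return [
        s3_client.get_object(Bucket=access_point_arn, Key=key)["Body"].read()
        for key in keys
    ]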
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    # Load the DMS task settings and the stack configuration from local JSON files
    with open('./props/tasksetting.json', 'r') as f1:
        py_json1 = json.load(f1)
        ts = json.dumps(py_json1)

    # with open('./props/mappingrule.json', 'r') as f2:
    #     py_json2 = json.load(f2)
    #     mr = json.dumps(py_json2)

    with open('./props/config.json', 'r') as f2:
        configuration = json.load(f2)

    def getMappingrules(self, table_list):
        # Build the DMS table-mapping JSON: one selection rule per table in the config
        rules = []
        for index, value in enumerate(table_list, 1):
            rules.append({
                "rule-type": "selection",
                "rule-id": str(index),
                "rule-name": str(index),
                "object-locator": {
                    "schema-name": value['schemaName'],
                    "table-name": value['tableName']
                },
                "rule-action": "include",
                "filters": []
            })
        mapping_rules = {"rules": rules}
        return json.dumps(mapping_rules)

    # The code that defines your stack goes here
    S3Accessrole = _iam.Role(
        self,
        'dmsrole',
        assumed_by=_iam.ServicePrincipal('dms.amazonaws.com'),
        managed_policies=[
            _iam.ManagedPolicy.from_aws_managed_policy_name('AmazonS3FullAccess')
        ])

    raw_bucket = s3.Bucket(self, 'rawbucket',
                           bucket_name='rawbucket-datalake-cdk-oregon')
    raw_bucket.add_lifecycle_rule(
        enabled=configuration['s3LifecycleRule']['enabled'],
        expiration=core.Duration.days(
            configuration['s3LifecycleRule']['expiration']))

    # my_table = ddb.Table(self, id='dunamoTable', table_name='testcdktable',
    #                      partition_key=ddb.Attribute(name='lastname', type=ddb.AttributeType.STRING))

    dl_dms = _dms.CfnReplicationInstance(
        self,
        'dmsreplication',
        replication_instance_class=configuration['DMS_instance_setting']['instance_class'],
        replication_instance_identifier='datalake-instance-cdk',
        allocated_storage=configuration['DMS_instance_setting']['allocated_storage'])

    source_endpoint = _dms.CfnEndpoint(
        self,
        'sourceendpoint',
        endpoint_type='source',
        engine_name=configuration['engineName'],
        database_name=configuration['databaseName'],
        username=configuration['username'],
        password=configuration['password'],
        port=configuration['port'],
        server_name=configuration['serverName'],
    )

    target_endpoint = _dms.CfnEndpoint(
        self,
        'targetendpoint',
        endpoint_type='target',
        engine_name='s3',
        s3_settings={
            'bucketName': raw_bucket.bucket_name,
            'serviceAccessRoleArn': S3Accessrole.role_arn
        },
        extra_connection_attributes='dataFormat=parquet')

    dms_task = _dms.CfnReplicationTask(
        self,
        'data2lake-task',
        migration_type='full-load-and-cdc',
        replication_instance_arn=dl_dms.ref,
        source_endpoint_arn=source_endpoint.ref,
        target_endpoint_arn=target_endpoint.ref,
        replication_task_settings=ts,
        table_mappings=getMappingrules(self, configuration['tableList']))

    my_table = ddb.Table(self,
                         id='dynamoTable',
                         table_name='ControllerTable',
                         partition_key=ddb.Attribute(
                             name='path', type=ddb.AttributeType.STRING),
                         billing_mode=ddb.BillingMode.PAY_PER_REQUEST)

    datalake_bucket = s3.Bucket(self, 'datalakebucket',
                                bucket_name='datalake-bucket-cdk-oregon')

    glue_role = _iam.Role(
        self,
        'gluerole',
        assumed_by=_iam.ServicePrincipal('glue.amazonaws.com'),
        managed_policies=[
            _iam.ManagedPolicy.from_aws_managed_policy_name(
                'service-role/AWSGlueServiceRole')
        ])
    raw_bucket.grant_read(glue_role)
    datalake_bucket.grant_read_write(glue_role)

    # Lake Formation settings.
    # If you have attached the managed policy 'AWSLakeFormationDataAdmin' to your own IAM
    # user, you should change that policy to allow "lakeformation:PutDataLakeSettings",
    # so that the data lake settings below can be applied by the CDK.
    lake_admin_setting = _lakeformation.CfnDataLakeSettings(
        self,
        'data-lake-GrantAdmin',
        admins=[
            _lakeformation.CfnDataLakeSettings.DataLakePrincipalProperty(
                data_lake_principal_identifier=configuration['executiveArn'])
        ])

    glue_database = _glue.Database(self, 'gluedatabase',
                                   database_name='data_lake_gluedb')
    glue_database.node.add_dependency(lake_admin_setting)

    glue_role_permission_inLakeFormation = _lakeformation.CfnPermissions(
        self,
        'permission-glueRole',
        data_lake_principal=_lakeformation.CfnPermissions.DataLakePrincipalProperty(
            data_lake_principal_identifier=glue_role.role_arn),
        resource=_lakeformation.CfnPermissions.ResourceProperty(
            database_resource=_lakeformation.CfnPermissions.DatabaseResourceProperty(
                name=glue_database.database_name)),
        permissions=['ALL'])

    crawler = _glue.CfnCrawler(
        self,
        'datalakecrawler',
        name='Crawler-datalake-cdk',
        role=glue_role.role_arn,
        targets={
            's3Targets': [{
                'path': 's3://' + datalake_bucket.bucket_name + '/datalake/'
            }]
        },
        database_name='data_lake_gluedb',
        configuration=(
            "{\"Version\":1.0,"
            "\"CrawlerOutput\":{"
            "\"Partitions\":{\"AddOrUpdateBehavior\":\"InheritFromTable\"},"
            "\"Tables\":{\"AddOrUpdateBehavior\":\"MergeNewColumns\"}}}"))

    initialload_script = S3Assets.Asset(self, 'initial-load-code',
                                        path='./Gluejob/InitialLoad.py')
    incrementalload_script = S3Assets.Asset(self, 'incremental-load-code',
                                            path='./Gluejob/IncrementalLoad.py')
    initialload_script.grant_read(glue_role)
    incrementalload_script.grant_read(glue_role)
    my_table.grant_full_access(glue_role)

    initial_load_job = _glue.CfnJob(
        self,
        'initial-job',
        name='InitialLoad-cdk',
        command=_glue.CfnJob.JobCommandProperty(
            name='glueetl',
            python_version='3',
            script_location='s3://' + initialload_script.s3_bucket_name + '/' +
            initialload_script.s3_object_key),
        role=glue_role.role_arn,
        default_arguments={
            '--prefix': str(configuration['tableList']),
            '--bucket': raw_bucket.bucket_name,
            '--datalake_bucket': datalake_bucket.bucket_name,
            '--datalake_prefix': 'datalake/',
            '--region': CdkpyStack.of(self).region,
            '--controller_table_name': my_table.table_name
        },
        allocated_capacity=configuration['glue_job_setting']['job_capacity'],
        execution_property=_glue.CfnJob.ExecutionPropertyProperty(
            max_concurrent_runs=configuration['glue_job_setting']
            ['max_concurrent_run_JobExecution']))

    incremental_load_job = _glue.CfnJob(
        self,
        'increment-job',
        name='IncrementalLoad-cdk',
        command=_glue.CfnJob.JobCommandProperty(
            name='glueetl',
            script_location='s3://' + incrementalload_script.s3_bucket_name +
            '/' + incrementalload_script.s3_object_key,
            python_version='3'),
        role=glue_role.role_arn,
        default_arguments={
            '--prefix': str(configuration['tableList']),
            '--bucket': raw_bucket.bucket_name,
            '--datalake_bucket': datalake_bucket.bucket_name,
            '--datalake_prefix': 'datalake/',
            '--region': CdkpyStack.of(self).region,
            '--controller_table_name': my_table.table_name
        },
        allocated_capacity=2,
        execution_property=_glue.CfnJob.ExecutionPropertyProperty(
            max_concurrent_runs=1))

    job_trigger = _glue.CfnTrigger(
        self,
        'datalake-glue-trigger',
        type='SCHEDULED',
        schedule=configuration['job_trigger_schedule'],
        start_on_creation=False,
        actions=[
            _glue.CfnTrigger.ActionProperty(job_name='IncrementalLoad-cdk')
        ])

    dl_sns = _sns.Topic(self, 'datalake_sns', display_name='data-lake-sns')

    endpoint_email = configuration['emailSubscriptionList']
    for emails in endpoint_email:
        dl_sns.add_subscription(_subscrption.EmailSubscription(emails))

    # Another way to subscribe:
    # dl_subscription = _sns.Subscription(self, 'email-subscription', topic=dl_sns,
    #                                     endpoint='*****@*****.**',
    #                                     protocol=_sns.SubscriptionProtocol.EMAIL)

    glue_events_target = _events_targets.SnsTopic(dl_sns)
    glue_events_rule = _events.Rule(
        self,
        'gluejobevents-datalake',
        description='Used for tracking failed data lake Glue jobs',
        rule_name='dl-gluejob-event',
        event_pattern=_events.EventPattern(
            source=['aws.glue'],
            detail_type=['Glue Job State Change'],
            detail={
                "jobName": [initial_load_job.name],
                "state": ["FAILED"]
            }),
        targets=[glue_events_target])

    dms_subscription = _dms.CfnEventSubscription(
        self,
        'dmsevents-datalake',
        sns_topic_arn=dl_sns.topic_arn,
        subscription_name='datalake-dmsevents',
        source_type='replication-task',
        event_categories=['failure'])
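
# Illustrative operational sketch (not part of the stack above): after deployment, an
# operator might start the DMS full-load-and-CDC task and then run the initial-load Glue
# job with boto3. The function name and the replication task ARN argument are hypothetical
# placeholders; the Glue job name matches the 'InitialLoad-cdk' job defined above.
def start_datalake_pipeline_example(replication_task_arn: str) -> None:
    import boto3

    dms_client = boto3.client("dms")
    # Start the replication task created by CfnReplicationTask
    dms_client.start_replication_task(
        ReplicationTaskArn=replication_task_arn,
        StartReplicationTaskType="start-replication")

    glue_client = boto3.client("glue")
    # Run the initial-load Glue job once the full load has landed in the raw bucket
    glue_client.start_job_run(JobName="InitialLoad-cdk")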
def _setup_redshift(self) -> None:
    port = 5439
    database = "test"
    schema = "public"
    redshift_role = iam.Role(
        self,
        "aws-data-wrangler-redshift-role",
        assumed_by=iam.ServicePrincipal("redshift.amazonaws.com"),
        inline_policies={
            "KMS": iam.PolicyDocument(
                statements=[
                    iam.PolicyStatement(
                        effect=iam.Effect.ALLOW,
                        actions=[
                            "kms:Encrypt",
                            "kms:Decrypt",
                            "kms:GenerateDataKey",
                        ],
                        resources=[self.key.key_arn],
                    )
                ]
            ),
            "S3": iam.PolicyDocument(
                statements=[
                    iam.PolicyStatement(
                        effect=iam.Effect.ALLOW,
                        actions=[
                            "s3:Get*",
                            "s3:List*",
                            "s3:Put*",
                        ],
                        resources=[
                            self.bucket.bucket_arn,
                            f"{self.bucket.bucket_arn}/*",
                        ],
                    )
                ]
            ),
            "LakeFormation": iam.PolicyDocument(
                statements=[
                    iam.PolicyStatement(
                        effect=iam.Effect.ALLOW,
                        actions=[
                            "lakeformation:GetDataAccess",
                            "lakeformation:GrantPermissions",
                            "lakeformation:GetWorkUnits",
                            "lakeformation:StartQueryPlanning",
                            "lakeformation:GetWorkUnitResults",
                            "lakeformation:GetQueryState",
                        ],
                        resources=["*"],
                    )
                ]
            ),
            "Glue": iam.PolicyDocument(
                statements=[
                    iam.PolicyStatement(
                        effect=iam.Effect.ALLOW,
                        actions=[
                            "glue:SearchTables",
                            "glue:GetConnections",
                            "glue:GetDataCatalogEncryptionSettings",
                            "glue:GetTables",
                            "glue:GetTableVersions",
                            "glue:GetPartitions",
                            "glue:DeleteTableVersion",
                            "glue:BatchGetPartition",
                            "glue:GetDatabases",
                            "glue:GetTags",
                            "glue:GetTable",
                            "glue:GetDatabase",
                            "glue:GetPartition",
                            "glue:GetTableVersion",
                            "glue:GetConnection",
                            "glue:GetUserDefinedFunction",
                            "glue:GetUserDefinedFunctions",
                        ],
                        resources=["*"],
                    )
                ]
            ),
        },
    )
    lf.CfnPermissions(
        self,
        "CodeBuildTestRoleLFPermissions",
        data_lake_principal=lf.CfnPermissions.DataLakePrincipalProperty(
            data_lake_principal_identifier=redshift_role.role_arn
        ),
        resource=lf.CfnPermissions.ResourceProperty(
            table_resource=lf.CfnPermissions.TableResourceProperty(
                database_name="aws_data_wrangler",
                table_wildcard={},  # type: ignore
            )
        ),
        permissions=["SELECT", "ALTER", "DESCRIBE", "DROP", "DELETE", "INSERT"],
    )
    redshift.ClusterSubnetGroup(
        self,
        "aws-data-wrangler-redshift-subnet-group",
        description="AWS Data Wrangler Test Athena - Redshift Subnet Group",
        vpc=self.vpc,
        vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC),
    )
    redshift_cluster = redshift.Cluster(
        self,
        "aws-data-wrangler-redshift-cluster",
        default_database_name=database,
        master_user=redshift.Login(
            master_username=self.db_username,
            master_password=self.db_password_secret,
        ),
        cluster_type=redshift.ClusterType.SINGLE_NODE,
        publicly_accessible=True,
        port=port,
        vpc=self.vpc,
        vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC),
        security_groups=[self.db_security_group],
        roles=[redshift_role],
    )
    glue.Connection(
        self,
        "aws-data-wrangler-redshift-glue-connection",
        description="Connect to Redshift.",
        type=glue.ConnectionType.JDBC,
        connection_name="aws-data-wrangler-redshift",
        properties={
            "JDBC_CONNECTION_URL": f"jdbc:redshift://{redshift_cluster.cluster_endpoint.hostname}:{port}/{database}",  # noqa: E501
            "USERNAME": self.db_username,
            "PASSWORD": self.db_password,
        },
        subnet=self.vpc.private_subnets[0],
        security_groups=[self.db_security_group],
    )
    secret = secrets.Secret(
        self,
        "aws-data-wrangler-redshift-secret",
        secret_name="aws-data-wrangler/redshift",
        description="Redshift credentials",
        generate_secret_string=secrets.SecretStringGenerator(
            generate_string_key="dummy",
            secret_string_template=json.dumps(
                {
                    "username": self.db_username,
                    "password": self.db_password,
                    "engine": "redshift",
                    "host": redshift_cluster.cluster_endpoint.hostname,
                    "port": port,
                    "dbClusterIdentifier": redshift_cluster.cluster_name,
                }
            ),
        ),
    )

    CfnOutput(self, "RedshiftSecretArn", value=secret.secret_arn)
    CfnOutput(self, "RedshiftIdentifier", value=redshift_cluster.cluster_name)
    CfnOutput(
        self,
        "RedshiftAddress",
        value=redshift_cluster.cluster_endpoint.hostname,
    )
    CfnOutput(self, "RedshiftPort", value=str(port))
    CfnOutput(self, "RedshiftDatabase", value=database)
    CfnOutput(self, "RedshiftSchema", value=schema)
    CfnOutput(self, "RedshiftRole", value=redshift_role.role_arn)
def __init__(self, scope: core.Construct, id: str, source_bucket_name: str,
             glue_database_name: str, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    # Get the source bucket - this object is an IBucket proxy, not a Bucket construct,
    # so it cannot be used to add an event notification directly. Instead, a custom
    # resource adds the event trigger later.
    source_bucket = s3.Bucket.from_bucket_name(
        self, "MySourceBucket", bucket_name=source_bucket_name)

    # Create the new destination bucket - this bucket holds the csv file that contains
    # the FITS header information.
    # The name of the bucket will be <stack-id>-fitsstorebucketXXXXXXXX-YYYYYYYYYYYYY,
    # e.g. my-fits-datalake-fitsstorebucket1234567f-098765432d
    target_bucket = s3.Bucket(self, "FITSSTORE_BUCKET")

    # Add the astropy and numpy layers for the Lambda function that is used as the
    # event trigger on the source_bucket
    layer_astropy = lambda_.LayerVersion(
        self,
        'AstroFitsioLayer',
        code=lambda_.Code.from_asset("resources_layer/astropy.zip"),
        compatible_runtimes=[lambda_.Runtime.PYTHON_3_7])
    # Use an AWS-provided layer for numpy
    layer_numpy = lambda_.LayerVersion.from_layer_version_arn(
        self, "NumpyLayer",
        "arn:aws:lambda:us-east-1:668099181075:layer:AWSLambda-Python37-SciPy1x:22")

    # Create the FITS header extractor Lambda function and pass the FITSSTORE_BUCKET
    # to it as an environment variable
    handler = lambda_.Function(
        self,
        "FITSHeaderExtractorHandler",
        runtime=lambda_.Runtime.PYTHON_3_7,
        code=lambda_.Code.from_asset("resources"),
        handler="fits_header_extractor.fits_header_extractor_handler",
        environment=dict(FITSSTORE_BUCKET=target_bucket.bucket_name),
        layers=[layer_astropy, layer_numpy])

    # Grant the handler read access on the source bucket
    source_bucket.grant_read(handler)

    # Give the Lambda function a resource-based policy;
    # both source_arn and source_account are needed for security reasons
    handler.add_permission(
        's3-trigger-lambda-s3-invoke-function',
        principal=iam_.ServicePrincipal('s3.amazonaws.com'),
        action='lambda:InvokeFunction',
        source_arn=source_bucket.bucket_arn,
        source_account=self.account)

    # Grant access to the handler
    # - this is a lot easier than adding policies, but not all constructs support it
    target_bucket.grant_read_write(handler)

    # Mapping the put event to the handler directly doesn't work here, because
    # source_bucket is not really a Bucket object (it is an IBucket proxy).
    # You can use this approach if the bucket is created as a new Bucket object:
    # notification = s3_notifications.LambdaDestination(handler)
    # source_bucket.add_object_created_notification(self, notification)

    # Use a custom resource to add an event trigger on the source bucket -
    # the custom resource makes an SDK call to create the event notification on it.
    # Action reference: https://docs.aws.amazon.com/AWSJavaScriptSDK/latest/AWS/S3.html
    # Events reference: https://docs.aws.amazon.com/AmazonS3/latest/dev/NotificationHowTo.html
    custom_s3_resource = custom_resources_.AwsCustomResource(
        self,
        's3-putobject-custom-notification-resource',
        policy=custom_resources_.AwsCustomResourcePolicy.from_statements([
            iam_.PolicyStatement(effect=iam_.Effect.ALLOW,
                                 resources=['*'],
                                 actions=['s3:PutBucketNotification'])
        ]),
        on_create=custom_resources_.AwsSdkCall(
            service="S3",
            action="putBucketNotificationConfiguration",
            parameters={
                "Bucket": source_bucket.bucket_name,
                "NotificationConfiguration": {
                    "LambdaFunctionConfigurations": [{
                        "Events": ['s3:ObjectCreated:*', 's3:ObjectRemoved:*'],
                        "LambdaFunctionArn": handler.function_arn,
                        "Filter": {
                            "Key": {
                                "FilterRules": [{
                                    'Name': 'suffix',
                                    'Value': 'fits'
                                }]
                            }
                        }
                    }]
                }
            },
            physical_resource_id=custom_resources_.PhysicalResourceId.of(
                f's3-notification-resource-{str(uuid.uuid1())}'),
            region=self.region))

    # Make sure the Lambda function is created first
    custom_s3_resource.node.add_dependency(
        handler.permissions_node.find_child(
            's3-trigger-lambda-s3-invoke-function'))

    # Create a Glue crawler to build the data catalog.
    # Step 1: create a role for AWS Glue
    glue_role = iam_.Role(
        self,
        "glue_role",
        assumed_by=iam_.ServicePrincipal('glue.amazonaws.com'),
        managed_policies=[
            iam_.ManagedPolicy.from_managed_policy_arn(
                self,
                'MyFitsCrawlerGlueRole',
                managed_policy_arn='arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole')
        ])
    # The Glue role needs "*" read/write access - otherwise the crawler will not be able
    # to create tables (and there will be no error messages in the crawler logs)
    glue_role.add_to_policy(
        iam_.PolicyStatement(actions=[
            's3:GetObject', 's3:PutObject', 'lakeformation:GetDataAccess'
        ],
                             effect=iam_.Effect.ALLOW,
                             resources=['*']))

    # Step 2: create a database in the data catalog
    db = glue_.Database(self, "MyFitsDatabase", database_name=glue_database_name)

    # Step 3: create a crawler named "fitsdatalakecrawler-<hex>" and schedule it to run
    # every 15 minutes. You can change the frequency based on your needs.
    # Cron schedule format: cron(Minutes Hours Day-of-month Month Day-of-week Year)
    glue_.CfnCrawler(
        self,
        "fits-datalake-crawler",
        database_name=glue_database_name,
        role=glue_role.role_arn,
        schedule={"scheduleExpression": "cron(0/15 * * * ? *)"},
        targets={"s3Targets": [{"path": target_bucket.bucket_name}]},
    )

    # When your AWS Lake Formation data catalog settings are not set to
    # "Use only IAM access control for new databases" or
    # "Use only IAM access control for new tables in new databases",
    # you need to grant additional permissions on the data catalog database
    # in order for the crawler to run.
    location_resource = lakeformation_.CfnResource(
        self,
        "MyFitsDatalakeLocationResource",
        resource_arn=target_bucket.bucket_arn,
        use_service_linked_role=True)
    lakeformation_.CfnPermissions(
        self,
        "MyFitsDatalakeDatabasePermission",
        data_lake_principal=lakeformation_.CfnPermissions.DataLakePrincipalProperty(
            data_lake_principal_identifier=glue_role.role_arn),
        resource=lakeformation_.CfnPermissions.ResourceProperty(
            database_resource=lakeformation_.CfnPermissions.DatabaseResourceProperty(
                name=db.database_name)),
        permissions=["ALTER", "DROP", "CREATE_TABLE"],
    )
    location_permission = lakeformation_.CfnPermissions(
        self,
        "MyFitsDatalakeLocationPermission",
        data_lake_principal=lakeformation_.CfnPermissions.DataLakePrincipalProperty(
            data_lake_principal_identifier=glue_role.role_arn),
        resource=lakeformation_.CfnPermissions.ResourceProperty(
            data_location_resource=lakeformation_.CfnPermissions.DataLocationResourceProperty(
                s3_resource=target_bucket.bucket_arn)),
        permissions=["DATA_LOCATION_ACCESS"],
    )
    # Make sure the location resource is created first
    location_permission.node.add_dependency(location_resource)
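
# Illustrative verification sketch (not part of the stack above): after the crawler has
# run on its 15-minute schedule, the tables it registered can be listed from the Glue
# data catalog. The function name is hypothetical; the database name argument corresponds
# to glue_database_name used above.
def list_fits_catalog_tables_example(glue_database_name: str) -> list:
    import boto3

    glue_client = boto3.client("glue")
    # Return the names of the tables the crawler created in the FITS datalake database
    response = glue_client.get_tables(DatabaseName=glue_database_name)
    return [table["Name"] for table in response["TableList"]]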