Example #1
    def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
        super().__init__(scope, id, **kwargs)

        # CloudFormation Parameters

        glue_db_name = core.CfnParameter(
            self,
            "GlueDatabaseName",
            type="String",
            description="Glue Database where the Table belongs.",
            allowed_pattern="[\w-]+",
        )

        glue_table_name = core.CfnParameter(
            self,
            "GlueTableName",
            type="String",
            description="Glue Table where access will be granted.",
            allowed_pattern="[\w-]+",
        )

        grantee_role_arn = core.CfnParameter(
            self,
            "GranteeIAMRoleARN",
            type="String",
            description="IAM Role's ARN.",
            allowed_pattern=
            r"arn:(aws[a-zA-Z-]*)?:iam::\d{12}:role/?[a-zA-Z0-9_+=,.@\-]+")

        grantee_vpc = core.CfnParameter(
            self,
            "GranteeVPC",
            type="String",
            description=
            "VPC ID from where the S3 access point will be accessed.",
            allowed_pattern="vpc-[a-zA-Z0-9]+")

        is_lakeformation = core.CfnParameter(
            self,
            "LakeFormationParam",
            type="String",
            description=
            "If Lake Formation is used, the stack must be deployed using an IAM role with Lake Formation Admin permissions.",
            allowed_values=["Yes", "No"])

        # CloudFormation Parameter Groups

        self.template_options.description = "\
This template deploys an S3 Access Point which provides a given IAM Role \
access to the underlying data location for a given Glue Table.\n\
Main use case for this template is to grant an ETL process in another AWS Account \
access to the S3 objects (e.g., Parquet files) associated with a Glue Table."

        self.template_options.metadata = {
            "AWS::CloudFormation::Interface": {
                "License":
                "MIT-0",
                "ParameterGroups": [{
                    "Label": {
                        "default": "Lake Formation (Producer Account)"
                    },
                    "Parameters": [is_lakeformation.logical_id]
                }, {
                    "Label": {
                        "default":
                        "Source Data Catalog Resource (Producer Account)"
                    },
                    "Parameters":
                    [glue_db_name.logical_id, glue_table_name.logical_id]
                }, {
                    "Label": {
                        "default": "Grantee IAM Role (Consumer Account)"
                    },
                    "Parameters":
                    [grantee_role_arn.logical_id, grantee_vpc.logical_id]
                }],
                "ParameterLabels": {
                    is_lakeformation.logical_id: {
                        "default":
                        "Are data permissions managed by Lake Formation?"
                    },
                    glue_db_name.logical_id: {
                        "default": "What is the Glue DB Name for the Table?"
                    },
                    glue_table_name.logical_id: {
                        "default": "What is the Glue Table Name?"
                    },
                    grantee_role_arn.logical_id: {
                        "default": "What is the ARN of the IAM Role?"
                    },
                    grantee_vpc.logical_id: {
                        "default":
                        "What VPC will be used to access the S3 Access Point?"
                    }
                }
            }
        }

        is_lakeformation_condition = core.CfnCondition(
            self,
            "IsLakeFormation",
            expression=core.Fn.condition_equals("Yes", is_lakeformation.value_as_string))

        # Create S3 Access Point to share dataset objects

        grantee_role = iam.Role.from_role_arn(self, "GranteeIAMRole",
                                              grantee_role_arn.value_as_string)

        glue_table_arn = f"arn:aws:glue:{core.Aws.REGION}:{core.Aws.ACCOUNT_ID}:table/{glue_db_name.value_as_string}/{glue_table_name.value_as_string}"

        glue_table = glue.Table.from_table_arn(self,
                                               "GlueTable",
                                               table_arn=glue_table_arn)

        # Invoke Lambda to obtain S3 bucket and S3 prefix from Glue Table

        get_s3_from_table_execution_role = iam.Role(
            self,
            "GetS3FromTableServiceRole",
            assumed_by=iam.ServicePrincipal('lambda.amazonaws.com'),
            managed_policies=[
                iam.ManagedPolicy.from_aws_managed_policy_name(
                    "service-role/AWSLambdaBasicExecutionRole"),
                iam.ManagedPolicy.from_aws_managed_policy_name(
                    "service-role/AWSGlueServiceRole")
            ])

        lf_permission = lf.CfnPermissions(
            self,
            "LFPermissionForLambda",
            data_lake_principal=lf.CfnPermissions.DataLakePrincipalProperty(
                data_lake_principal_identifier=get_s3_from_table_execution_role
                .role_arn),
            resource=lf.CfnPermissions.ResourceProperty(
                table_resource=lf.CfnPermissions.TableResourceProperty(
                    name=glue_table_name.value_as_string,
                    database_name=glue_db_name.value_as_string)),
            permissions=["DESCRIBE"])

        lf_permission.apply_removal_policy(core.RemovalPolicy.DESTROY,
                                           apply_to_update_replace_policy=True)
        lf_permission.node.add_dependency(get_s3_from_table_execution_role)
        lf_permission.cfn_options.condition = is_lakeformation_condition

        lf_wait_condition_handle = cfn.CfnWaitConditionHandle(
            self, "LFWaitConditionHandle")
        lf_wait_condition_handle.add_metadata(
            "WaitForLFPermissionIfExists",
            core.Fn.condition_if(is_lakeformation_condition.logical_id,
                                 lf_permission.logical_id, ""))

        with open("lambda/get_s3_from_table.py", encoding="utf8") as fp:
            get_s3_from_table_code = fp.read()

        get_s3_from_table_fn = _lambda.Function(
            self,
            "GetS3FromTableHandler",
            runtime=_lambda.Runtime.PYTHON_3_7,
            code=_lambda.InlineCode.from_inline(get_s3_from_table_code),
            handler="index.handler",
            role=get_s3_from_table_execution_role,
            timeout=core.Duration.seconds(600))

        get_s3_from_table = core.CustomResource(
            self,
            "GetS3FromTable",
            service_token=get_s3_from_table_fn.function_arn,
            resource_type="Custom::GetS3FromTable",
            properties={
                "GlueDatabase": glue_db_name.value_as_string,
                "GlueTable": glue_table_name.value_as_string
            })

        get_s3_from_table.node.add_dependency(lf_wait_condition_handle)

        table_bucket = get_s3_from_table.get_att_string("TableBucket")
        table_prefix = get_s3_from_table.get_att_string("TablePrefix")

        # Create S3 Access Point
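        # Access point names cannot contain underscores, so replace them with hyphens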

        table_name_normalized = core.Fn.join(
            "-", core.Fn.split("_", glue_table_name.value_as_string))
        random_suffix = core.Fn.select(
            0,
            core.Fn.split(
                "-", core.Fn.select(2, core.Fn.split("/", core.Aws.STACK_ID))))

        s3_accesspoint_name = f"{table_name_normalized}-{random_suffix}"

        s3_accesspoint_arn = f"arn:aws:s3:{core.Aws.REGION}:{core.Aws.ACCOUNT_ID}:accesspoint/{s3_accesspoint_name}"

        glue_table_accesspoint_path = f"{s3_accesspoint_arn}/object/{table_prefix}"

        # s3_accesspoint_block_config = s3.CfnAccessPoint.PublicAccessBlockConfigurationProperty(block_public_acls=True, block_public_policy=True, ignore_public_acls=True, restrict_public_buckets=True)
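
        # Access point policy: allow the grantee role to read objects under the table
        # prefix and to list the bucket only for keys under that prefix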

        s3_accesspoint_policy = iam.PolicyDocument(statements=[
            iam.PolicyStatement(
                effect=iam.Effect.ALLOW,
                principals=[iam.ArnPrincipal(arn=grantee_role.role_arn)],
                actions=["s3:GetObject*"],
                resources=[f"{glue_table_accesspoint_path}*"]),
            iam.PolicyStatement(
                effect=iam.Effect.ALLOW,
                principals=[iam.ArnPrincipal(arn=grantee_role.role_arn)],
                actions=["s3:ListBucket*"],
                resources=[s3_accesspoint_arn],
                conditions={"StringLike": {
                    "s3:prefix": f"{table_prefix}*"
                }})
        ])

        s3_accesspoint = s3.CfnAccessPoint(
            self,
            "S3AccessPoint",
            bucket=f"{table_bucket}",
            name=s3_accesspoint_name,
            # network_origin = "Internet",
            policy=s3_accesspoint_policy,
            vpc_configuration=s3.CfnAccessPoint.VpcConfigurationProperty(
                vpc_id=grantee_vpc.value_as_string))

        glue_table_accesspoint_path_output = f"arn:aws:s3:{core.Aws.REGION}:{core.Aws.ACCOUNT_ID}:accesspoint/{s3_accesspoint.name}/object/{table_prefix}"

        # Output

        core.CfnOutput(self,
                       "IAMRoleArnOutput",
                       value=grantee_role.role_arn,
                       description="IAM Role Arn")

        core.CfnOutput(self,
                       "GlueTableOutput",
                       value=glue_table.table_arn,
                       description="Glue Table ARN")

        core.CfnOutput(self,
                       "S3AccessPointPathOutput",
                       value=glue_table_accesspoint_path_output,
                       description="S3 Access Point Path for Glue Table")

Example #2
    def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
        super().__init__(scope, id, **kwargs)

        with open('./props/tasksetting.json', 'r') as f1:
            py_json1 = json.load(f1)
            ts = json.dumps(py_json1)
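        # "ts" now holds the replication task settings re-serialized as a JSON string for the DMS task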

        # with open('./props/mappingrule.json', 'r') as f2:
        #     py_json2 = json.load(f2)
        #     mr = json.dumps(py_json2)

        with open('./props/config.json', 'r') as f2:
            configuration = json.load(f2)

        def getMappingrules(self, table_list):
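            # Build a DMS table-mapping document with one "selection" rule per
            # schema/table entry in table_list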
            rules = []
            for index, value in enumerate(table_list, 1):
                rules.append({
                    "rule-type": "selection",
                    "rule-id": str(index),
                    "rule-name": str(index),
                    "object-locator": {
                        "schema-name": value['schemaName'],
                        "table-name": value['tableName']
                    },
                    "rule-action": "include",
                    "filters": []
                })
            mapping_rules = {"rules": rules}
            return json.dumps(mapping_rules)

        # IAM role that DMS assumes to write replicated data to S3
        S3Accessrole = _iam.Role(
            self,
            'dmsrole',
            assumed_by=_iam.ServicePrincipal('dms.amazonaws.com'),
            managed_policies=[
                _iam.ManagedPolicy.from_aws_managed_policy_name(
                    'AmazonS3FullAccess')
            ])

        raw_bucket = s3.Bucket(self,
                               'rawbucket',
                               bucket_name='rawbucket-datalake-cdk-oregon')
        raw_bucket.add_lifecycle_rule(
            enabled=configuration['s3LifecycleRule']['enabled'],
            expiration=core.Duration.days(
                configuration['s3LifecycleRule']['expiration']))

        #my_table = ddb.Table(self, id ='dunamoTable', table_name = 'testcdktable',
        #partition_key = ddb.Attribute(name ='lastname',type = ddb.AttributeType.STRING) )
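
        # DMS resources: a replication instance sized from the configuration, a source
        # database endpoint, and an S3 target endpoint that writes Parquet files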

        dl_dms = _dms.CfnReplicationInstance(
            self,
            'dmsreplication',
            replication_instance_class=configuration['DMS_instance_setting']
            ['instance_class'],
            replication_instance_identifier='datalake-instance-cdk',
            allocated_storage=configuration['DMS_instance_setting']
            ['allocated_storage'])

        source_endpoint = _dms.CfnEndpoint(
            self,
            'sourceendpoint',
            endpoint_type='source',
            engine_name=configuration['engineName'],
            database_name=configuration['databaseName'],
            username=configuration['username'],
            password=configuration['password'],
            port=configuration['port'],
            server_name=configuration['serverName'],
        )

        target_endpoint = _dms.CfnEndpoint(
            self,
            'targetendpoint',
            endpoint_type='target',
            engine_name='s3',
            s3_settings={
                'bucketName': raw_bucket.bucket_name,
                'serviceAccessRoleArn': S3Accessrole.role_arn
            },
            extra_connection_attributes='dataFormat=parquet')
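
        # Full-load + CDC replication task tying together the instance, the endpoints,
        # the task settings JSON and the generated table mappings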

        dms_task = _dms.CfnReplicationTask(
            self,
            'data2lake-task',
            migration_type='full-load-and-cdc',
            replication_instance_arn=dl_dms.ref,
            source_endpoint_arn=source_endpoint.ref,
            target_endpoint_arn=target_endpoint.ref,
            replication_task_settings=ts,
            table_mappings=getMappingrules(self, configuration['tableList']))

        my_table = ddb.Table(self,
                             id='dynamoTable',
                             table_name='ControllerTable',
                             partition_key=ddb.Attribute(
                                 name='path', type=ddb.AttributeType.STRING),
                             billing_mode=ddb.BillingMode.PAY_PER_REQUEST)

        datalake_bucket = s3.Bucket(self,
                                    'datalakebucket',
                                    bucket_name='datalake-bucket-cdk-oregon')

        glue_role = _iam.Role(
            self,
            'gluerole',
            assumed_by=_iam.ServicePrincipal('glue.amazonaws.com'),
            managed_policies=[
                _iam.ManagedPolicy.from_aws_managed_policy_name(
                    'service-role/AWSGlueServiceRole')
            ])

        raw_bucket.grant_read(glue_role)
        datalake_bucket.grant_read_write(glue_role)

        # Lake Formation settings
        # If the managed policy 'AWSLakeFormationDataAdmin' is attached to your IAM user,
        # extend it to allow "lakeformation:PutDataLakeSettings" so that the data lake
        # settings below can be applied by the CDK.
        lake_admin_setting = _lakeformation.CfnDataLakeSettings(
            self,
            'data-lake-GrantAdmin',
            admins=[
                _lakeformation.CfnDataLakeSettings.DataLakePrincipalProperty(
                    data_lake_principal_identifier=configuration[
                        'executiveArn'])
            ])

        glue_database = _glue.Database(self,
                                       'gluedatabase',
                                       database_name='data_lake_gluedb')

        glue_database.node.add_dependency(lake_admin_setting)

        glue_role_permission_inLakeFormation = _lakeformation.CfnPermissions(
            self,
            'permission-glueRole',
            data_lake_principal=_lakeformation.CfnPermissions.
            DataLakePrincipalProperty(
                data_lake_principal_identifier=glue_role.role_arn),
            resource=_lakeformation.CfnPermissions.ResourceProperty(
                database_resource=_lakeformation.CfnPermissions.
                DatabaseResourceProperty(name=glue_database.database_name)),
            permissions=['ALL'])

        crawler = _glue.CfnCrawler(
            self,
            'datalakecrawler',
            name='Crawler-datalake-cdk',
            role=glue_role.role_arn,
            targets={
                's3Targets': [{
                    'path':
                    's3://' + datalake_bucket.bucket_name + '/datalake/'
                }]
            },
            database_name='data_lake_gluedb',
            configuration=
            "{\"Version\":1.0,\"CrawlerOutput\":{\"Partitions\":{\"AddOrUpdateBehavior\":\"InheritFromTable\"},\"Tables\":{\"AddOrUpdateBehavior\":\"MergeNewColumns\"}}}"
        )

        initialload_script = S3Assets.Asset(self,
                                            'initial-load-code',
                                            path='./Gluejob/InitialLoad.py')
        incrementalload_script = S3Assets.Asset(
            self, 'incremental-load-code', path='./Gluejob/IncrementalLoad.py')

        initialload_script.grant_read(glue_role)
        incrementalload_script.grant_read(glue_role)
        my_table.grant_full_access(glue_role)
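
        # Glue ETL jobs: an initial full load and a scheduled incremental load, both
        # parameterized with the raw/data-lake buckets and the DynamoDB controller table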

        initial_load_job = _glue.CfnJob(
            self,
            'initial-job',
            name='InitialLoad-cdk',
            command=_glue.CfnJob.JobCommandProperty(
                name='glueetl',
                python_version='3',
                script_location='s3://' + initialload_script.s3_bucket_name +
                '/' + initialload_script.s3_object_key),
            role=glue_role.role_arn,
            default_arguments={
                '--prefix': str(configuration['tableList']),
                '--bucket': raw_bucket.bucket_name,
                '--datalake_bucket': datalake_bucket.bucket_name,
                '--datalake_prefix': 'datalake/',
                '--region': CdkpyStack.of(self).region,
                '--controller_table_name': my_table.table_name
            },
            allocated_capacity=configuration['glue_job_setting']
            ['job_capacity'],
            execution_property=_glue.CfnJob.ExecutionPropertyProperty(
                max_concurrent_runs=configuration['glue_job_setting']
                ['max_concurrent_run_JobExecution']))

        incremental_load_job = _glue.CfnJob(
            self,
            'increment-job',
            name='IncrementalLoad-cdk',
            command=_glue.CfnJob.JobCommandProperty(
                name='glueetl',
                script_location='s3://' +
                incrementalload_script.s3_bucket_name + '/' +
                incrementalload_script.s3_object_key,
                python_version='3'),
            role=glue_role.role_arn,
            default_arguments={
                '--prefix': str(configuration['tableList']),
                '--bucket': raw_bucket.bucket_name,
                '--datalake_bucket': datalake_bucket.bucket_name,
                '--datalake_prefix': 'datalake/',
                '--region': CdkpyStack.of(self).region,
                '--controller_table_name': my_table.table_name
            },
            allocated_capacity=2,
            execution_property=_glue.CfnJob.ExecutionPropertyProperty(
                max_concurrent_runs=1))

        job_trigger = _glue.CfnTrigger(
            self,
            'datalake-glue-trigger',
            type='SCHEDULED',
            schedule=configuration['job_trigger_schedule'],
            start_on_creation=False,
            actions=[
                _glue.CfnTrigger.ActionProperty(job_name='IncrementalLoad-cdk')
            ])

        dl_sns = _sns.Topic(self, 'datalake_sns', display_name='data-lake-sns')

        endpoint_email = configuration['emailSubscriptionList']

        for email in endpoint_email:
            dl_sns.add_subscription(_subscrption.EmailSubscription(email))

        # Another way to subscribe: dl_subscription = _sns.Subscription(self, 'email-subscription', topic=dl_sns, endpoint='*****@*****.**', protocol=_sns.SubscriptionProtocol.EMAIL)

        glue_events_target = _events_targets.SnsTopic(dl_sns)

        glue_events_rule = _events.Rule(
            self,
            'gluejobevents-datalake',
            description='Tracks failed Glue jobs for the data lake',
            rule_name='dl-gluejob-event',
            event_pattern=_events.EventPattern(
                source=['aws.glue'],
                detail_type=['Glue Job State Change'],
                detail={
                    "jobName": [initial_load_job.name],
                    "state": ["FAILED"]
                }),
            targets=[glue_events_target])

        dms_subscription = _dms.CfnEventSubscription(
            self,
            'dmsevents-datalake',
            sns_topic_arn=dl_sns.topic_arn,
            subscription_name='datalake-dmsevents',
            source_type='replication-task',
            event_categories=['failure'])

Example #3
 def _setup_redshift(self) -> None:
     port = 5439
     database = "test"
     schema = "public"
     redshift_role = iam.Role(
         self,
         "aws-data-wrangler-redshift-role",
         assumed_by=iam.ServicePrincipal("redshift.amazonaws.com"),
         inline_policies={
             "KMS": iam.PolicyDocument(
                 statements=[
                     iam.PolicyStatement(
                         effect=iam.Effect.ALLOW,
                         actions=[
                             "kms:Encrypt",
                             "kms:Decrypt",
                             "kms:GenerateDataKey",
                         ],
                         resources=[self.key.key_arn],
                     )
                 ]
             ),
             "S3": iam.PolicyDocument(
                 statements=[
                     iam.PolicyStatement(
                         effect=iam.Effect.ALLOW,
                         actions=[
                             "s3:Get*",
                             "s3:List*",
                             "s3:Put*",
                         ],
                         resources=[
                             self.bucket.bucket_arn,
                             f"{self.bucket.bucket_arn}/*",
                         ],
                     )
                 ]
             ),
             "LakeFormation": iam.PolicyDocument(
                 statements=[
                     iam.PolicyStatement(
                         effect=iam.Effect.ALLOW,
                         actions=[
                             "lakeformation:GetDataAccess",
                             "lakeformation:GrantPermissions",
                             "lakeformation:GetWorkUnits",
                             "lakeformation:StartQueryPlanning",
                             "lakeformation:GetWorkUnitResults",
                             "lakeformation:GetQueryState",
                         ],
                         resources=["*"],
                     )
                 ]
             ),
             "Glue": iam.PolicyDocument(
                 statements=[
                     iam.PolicyStatement(
                         effect=iam.Effect.ALLOW,
                         actions=[
                             "glue:SearchTables",
                             "glue:GetConnections",
                             "glue:GetDataCatalogEncryptionSettings",
                             "glue:GetTables",
                             "glue:GetTableVersions",
                             "glue:GetPartitions",
                             "glue:DeleteTableVersion",
                             "glue:BatchGetPartition",
                             "glue:GetDatabases",
                             "glue:GetTags",
                             "glue:GetTable",
                             "glue:GetDatabase",
                             "glue:GetPartition",
                             "glue:GetTableVersion",
                             "glue:GetConnection",
                             "glue:GetUserDefinedFunction",
                             "glue:GetUserDefinedFunctions",
                         ],
                         resources=["*"],
                     )
                 ]
             ),
         },
     )
     lf.CfnPermissions(
         self,
         "CodeBuildTestRoleLFPermissions",
         data_lake_principal=lf.CfnPermissions.DataLakePrincipalProperty(
             data_lake_principal_identifier=redshift_role.role_arn
         ),
         resource=lf.CfnPermissions.ResourceProperty(
             table_resource=lf.CfnPermissions.TableResourceProperty(
                 database_name="aws_data_wrangler",
                 table_wildcard={},  # type: ignore
             )
         ),
         permissions=["SELECT", "ALTER", "DESCRIBE", "DROP", "DELETE", "INSERT"],
     )
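     # Subnet group plus a publicly accessible single-node Redshift cluster in the VPC's public subnets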
     redshift.ClusterSubnetGroup(
         self,
         "aws-data-wrangler-redshift-subnet-group",
         description="AWS Data Wrangler Test Athena - Redshift Subnet Group",
         vpc=self.vpc,
         vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC),
     )
     redshift_cluster = redshift.Cluster(
         self,
         "aws-data-wrangler-redshift-cluster",
         default_database_name=database,
         master_user=redshift.Login(
             master_username=self.db_username,
             master_password=self.db_password_secret,
         ),
         cluster_type=redshift.ClusterType.SINGLE_NODE,
         publicly_accessible=True,
         port=port,
         vpc=self.vpc,
         vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC),
         security_groups=[self.db_security_group],
         roles=[redshift_role],
     )
     glue.Connection(
         self,
         "aws-data-wrangler-redshift-glue-connection",
         description="Connect to Redshift.",
         type=glue.ConnectionType.JDBC,
         connection_name="aws-data-wrangler-redshift",
         properties={
             "JDBC_CONNECTION_URL": f"jdbc:redshift://{redshift_cluster.cluster_endpoint.hostname}:{port}/{database}",  # noqa: E501
             "USERNAME": self.db_username,
             "PASSWORD": self.db_password,
         },
         subnet=self.vpc.private_subnets[0],
         security_groups=[self.db_security_group],
     )
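     # Secrets Manager secret exposing the Redshift connection details (username, password, host, port)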
     secret = secrets.Secret(
         self,
         "aws-data-wrangler-redshift-secret",
         secret_name="aws-data-wrangler/redshift",
         description="Redshift credentials",
         generate_secret_string=secrets.SecretStringGenerator(
             generate_string_key="dummy",
             secret_string_template=json.dumps(
                 {
                     "username": self.db_username,
                     "password": self.db_password,
                     "engine": "redshift",
                     "host": redshift_cluster.cluster_endpoint.hostname,
                     "port": port,
                     "dbClusterIdentifier": redshift_cluster.cluster_name,
                 }
             ),
         ),
     )
     CfnOutput(self, "RedshiftSecretArn", value=secret.secret_arn)
     CfnOutput(self, "RedshiftIdentifier", value=redshift_cluster.cluster_name)
     CfnOutput(
         self,
         "RedshiftAddress",
         value=redshift_cluster.cluster_endpoint.hostname,
     )
     CfnOutput(self, "RedshiftPort", value=str(port))
     CfnOutput(self, "RedshiftDatabase", value=database)
     CfnOutput(self, "RedshiftSchema", value=schema)
     CfnOutput(self, "RedshiftRole", value=redshift_role.role_arn)
Example #4
    def __init__(self, scope: core.Construct, id: str, source_bucket_name: str,
                 glue_database_name: str, **kwargs) -> None:
        super().__init__(scope, id, **kwargs)

        # get the source bucket - this object is an IBucket proxy, not a Bucket construct,
        # so it cannot be used to add an event notification directly. Instead, a custom resource adds the event trigger later
        source_bucket = s3.Bucket.from_bucket_name(
            self, "MySourceBucket", bucket_name=source_bucket_name)

        # create the new destination bucket - this bucket holds the csv file that contains the FITS header information
        # the name of the bucket will be <stack-id>-fitsstorebucketXXXXXXXX-YYYYYYYYYYYYY
        # e.g. my-fits-datalake-fitsstorebucket1234567f-098765432d
        target_bucket = s3.Bucket(self, "FITSSTORE_BUCKET")

        # Add the astropy and numpy layers for the lambda function that is used as the event trigger on the source_bucket
        layer_astropy = lambda_.LayerVersion(
            self,
            'AstroFitsioLayer',
            code=lambda_.Code.from_asset("resources_layer/astropy.zip"),
            compatible_runtimes=[lambda_.Runtime.PYTHON_3_7])
        # use an AWS provided layer for numpy
        layer_numpy = lambda_.LayerVersion.from_layer_version_arn(
            self, "NumpyLayer",
            "arn:aws:lambda:us-east-1:668099181075:layer:AWSLambda-Python37-SciPy1x:22"
        )

        # create the FITS header extractor lambda function
        # pass the FITSSTORE_BUCKET to the lambda function as an environment variable
        handler = lambda_.Function(
            self,
            "FITSHeaderExtractorHandler",
            runtime=lambda_.Runtime.PYTHON_3_7,
            code=lambda_.Code.from_asset("resources"),
            handler="fits_header_extractor.fits_header_extractor_handler",
            environment=dict(FITSSTORE_BUCKET=target_bucket.bucket_name),
            layers=[layer_astropy, layer_numpy])

        # grant read access to handler on source bucket
        source_bucket.grant_read(handler)

        # Give the lambda function a resource-based policy;
        # both source_arn and source_account are needed for security reasons
        handler.add_permission(
            's3-trigger-lambda-s3-invoke-function',
            principal=iam_.ServicePrincipal('s3.amazonaws.com'),
            action='lambda:InvokeFunction',
            source_arn=source_bucket.bucket_arn,
            source_account=self.account)

        # grant access to the handler
        # - this is a lot easier than adding policies, but not all constructs support this
        target_bucket.grant_read_write(handler)

        # map the put event to the handler - this doesn't work as source_bucket is not really a Bucket object (IBucketProxy)
        # You can use this approach if the bucket is created as a new Bucket object
        #notification = s3_notifications.LambdaDestination(handler)
        #source_bucket.add_object_created_notification(self, notification )

        # use a custom resource to add an event trigger on the source bucket -
        # the custom resource makes an SDK call to create the event notification on that bucket
        # Action reference https://docs.aws.amazon.com/AWSJavaScriptSDK/latest/AWS/S3.html
        # Events reference https://docs.aws.amazon.com/AmazonS3/latest/dev/NotificationHowTo.html
        custom_s3_resource = custom_resources_.AwsCustomResource(
            self,
            's3-putobject-custom-notification-resource',
            policy=custom_resources_.AwsCustomResourcePolicy.from_statements([
                iam_.PolicyStatement(effect=iam_.Effect.ALLOW,
                                     resources=['*'],
                                     actions=['s3:PutBucketNotification'])
            ]),
            on_create=custom_resources_.AwsSdkCall(
                service="S3",
                action="putBucketNotificationConfiguration",
                parameters={
                    "Bucket": source_bucket.bucket_name,
                    "NotificationConfiguration": {
                        "LambdaFunctionConfigurations": [{
                            "Events":
                            ['s3:ObjectCreated:*', 's3:ObjectRemoved:*'],
                            "LambdaFunctionArn":
                            handler.function_arn,
                            "Filter": {
                                "Key": {
                                    "FilterRules": [{
                                        'Name': 'suffix',
                                        'Value': 'fits'
                                    }]
                                }
                            }
                        }]
                    }
                },
                physical_resource_id=custom_resources_.PhysicalResourceId.of(
                    f's3-notification-resource-{str(uuid.uuid1())}'),
                region=self.region))

        # Make sure the Lambda invoke permission is created before the notification is configured
        custom_s3_resource.node.add_dependency(
            handler.permissions_node.find_child(
                's3-trigger-lambda-s3-invoke-function'))

        # create a glue crawler to build the data catalog
        # Step 1 . create a role for AWS Glue
        glue_role = iam_.Role(
            self,
            "glue_role",
            assumed_by=iam_.ServicePrincipal('glue.amazonaws.com'),
            managed_policies=[
                iam_.ManagedPolicy.from_managed_policy_arn(
                    self,
                    'MyFitsCrawlerGlueRole',
                    managed_policy_arn=
                    'arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole')
            ])
        # glue role needs "*" read/write - otherwise the crawler will not be able to create tables (and there are no error messages in the crawler logs)
        glue_role.add_to_policy(
            iam_.PolicyStatement(actions=[
                's3:GetObject', 's3:PutObject', 'lakeformation:GetDataAccess'
            ],
                                 effect=iam_.Effect.ALLOW,
                                 resources=['*']))

        # Step 2. create a database in data catalog
        db = glue_.Database(self,
                            "MyFitsDatabase",
                            database_name=glue_database_name)

        # Step 3. create a crawler named "fitsdatalakecrawler-<hex>" and schedule it to run every 15 minutes
        # You can change the frequency based on your needs
        # cron schedule format cron(Minutes Hours Day-of-month Month Day-of-week Year)
        glue_.CfnCrawler(
            self,
            "fits-datalake-crawler",
            database_name=glue_database_name,
            role=glue_role.role_arn,
            schedule={"scheduleExpression": "cron(0/15 * * * ? *)"},
            targets={"s3Targets": [{
                "path": target_bucket.bucket_name
            }]},
        )

        # When your AWS Lake Formation data catalog settings are not set to
        # "Use only IAM access control for new databases" or
        # "Use only IAM access control for new tables in new databases",
        # you need to grant additional permissions on the data catalog database
        # so that the crawler can run; the Lake Formation resources below do that

        location_resource = lakeformation_.CfnResource(
            self,
            "MyFitsDatalakeLocationResource",
            resource_arn=target_bucket.bucket_arn,
            use_service_linked_role=True)
        lakeformation_.CfnPermissions(
            self,
            "MyFitsDatalakeDatabasePermission",
            data_lake_principal=lakeformation_.CfnPermissions.
            DataLakePrincipalProperty(
                data_lake_principal_identifier=glue_role.role_arn),
            resource=lakeformation_.CfnPermissions.ResourceProperty(
                database_resource=lakeformation_.CfnPermissions.
                DatabaseResourceProperty(name=db.database_name)),
            permissions=["ALTER", "DROP", "CREATE_TABLE"],
        )
        location_permission = lakeformation_.CfnPermissions(
            self,
            "MyFitsDatalakeLocationPermission",
            data_lake_principal=lakeformation_.CfnPermissions.
            DataLakePrincipalProperty(
                data_lake_principal_identifier=glue_role.role_arn),
            resource=lakeformation_.CfnPermissions.ResourceProperty(
                data_location_resource=lakeformation_.CfnPermissions.
                DataLocationResourceProperty(
                    s3_resource=target_bucket.bucket_arn)),
            permissions=["DATA_LOCATION_ACCESS"],
        )
        #make sure the location resource is created first
        location_permission.node.add_dependency(location_resource)