def __init__(self, scope: core.Construct, construct_id: str, **kwargs) -> None:
    super().__init__(scope, construct_id, **kwargs)

    # The code that defines your stack goes here
    glue_trigger = glue.CfnTrigger(
        self, "gluetrigger",
        name="etl-trigger",
        type="ON_DEMAND",
        schedule=None,
        actions=[{"jobName": "glue_crawler"}])

    glue_crawler = glue.CfnCrawler(
        self, 'glue-crawler-id',
        description="Glue Crawler for my-data-science-s3",
        name='nickcrawler',
        database_name='nike',
        schedule=None,
        role='arn:aws:iam::919238404395:role/service-role/AWSGlueServiceRole-my_2nd_iamrole',
        targets={"s3Targets": [{"path": "s3://nikhils3/file/Titanic.csv"}]})

    glue_trigger.add_depends_on(glue_crawler)
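# Hedged sketch (not part of the original stack above): the ON_DEMAND trigger above uses the
# "jobName" action key, which starts a Glue *job*; if the intent is to start the crawler
# defined above instead, CfnTrigger also accepts a "crawlerName" action key. The function
# name and construct id below are hypothetical, for illustration only.
def _crawler_trigger_sketch(stack, glue_crawler):
    crawler_trigger = glue.CfnTrigger(
        stack, "gluecrawlertrigger",               # hypothetical construct id
        name="etl-crawler-trigger",                # hypothetical trigger name
        type="ON_DEMAND",
        actions=[{"crawlerName": "nickcrawler"}])  # targets the crawler created above
    crawler_trigger.add_depends_on(glue_crawler)
    return crawler_trigger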
def __init__(self, scope: core.Construct, construct_id: str, **kwargs) -> None:
    super().__init__(scope, construct_id, **kwargs)

    s3bucket = s3.Bucket(self, 'vika-yy')
    kds = data_stream.Stream(self, 'data_stream', shard_count=1)

    # IAM role assumed by Kinesis Data Firehose to read from the stream and write to S3
    delivery_stream_role = iam.Role(
        self, 'kdfdelivery_stream_role_role',
        assumed_by=iam.ServicePrincipal('firehose.amazonaws.com'))
    delivery_stream_role.add_managed_policy(
        iam.ManagedPolicy.from_aws_managed_policy_name('AmazonKinesisFullAccess'))
    delivery_stream_role.add_to_policy(
        iam.PolicyStatement(effect=iam.Effect.ALLOW,
                            resources=[s3bucket.bucket_arn],
                            actions=["s3:*"]))

    #s3bucket = s3.Bucket(self, 'vika-yy', bucket_name='yellowtaxicdk-input')
    s3_dest_config = delivery_stream.CfnDeliveryStream.ExtendedS3DestinationConfigurationProperty(
        bucket_arn=s3bucket.bucket_arn,
        buffering_hints=delivery_stream.CfnDeliveryStream.BufferingHintsProperty(
            interval_in_seconds=60, size_in_m_bs=128),
        role_arn=delivery_stream_role.role_arn,
        compression_format='UNCOMPRESSED',
        s3_backup_mode='Disabled')
    stream_source_config = delivery_stream.CfnDeliveryStream.KinesisStreamSourceConfigurationProperty(
        kinesis_stream_arn=kds.stream_arn,
        role_arn=delivery_stream_role.role_arn)
    kfirehose = delivery_stream.CfnDeliveryStream(
        self, 'kfirehose',
        delivery_stream_name='deliverystream',
        delivery_stream_type='KinesisStreamAsSource',
        extended_s3_destination_configuration=s3_dest_config,
        kinesis_stream_source_configuration=stream_source_config)

    glue_role = iam.Role(
        self, 'glue_role',
        assumed_by=iam.ServicePrincipal('glue.amazonaws.com'))
    glue_role.add_to_policy(
        iam.PolicyStatement(effect=iam.Effect.ALLOW,
                            resources=[s3bucket.bucket_arn],
                            actions=["s3:*"]))
    glue_role.add_managed_policy(
        iam.ManagedPolicy.from_aws_managed_policy_name('service-role/AWSGlueServiceRole'))

    bucket_name = s3bucket.bucket_name
    glue_crawler = glue.CfnCrawler(
        self, 'glue_crawler',
        database_name='yellow-taxis',
        role=glue_role.role_arn,
        #targets={"s3Targets": [{"path": f'{BUCKET}/input/'}]}
        # The crawler include path is an S3 URI, hence the "s3://" prefix added here.
        targets={"s3Targets": [{"path": f's3://{bucket_name}/input/'}]})
def __init__(self, scope: core.Construct, construct_id: str, **kwargs) -> None:
    super().__init__(scope, construct_id, **kwargs)

    bucket = s3.Bucket(self, 'csv-bucket')

    glue_role = iam.Role(
        self, 'glue-role',
        assumed_by=iam.ServicePrincipal('glue.amazonaws.com'),
        managed_policies=[
            iam.ManagedPolicy.from_aws_managed_policy_name('service-role/AWSGlueServiceRole'),
            iam.ManagedPolicy.from_aws_managed_policy_name('AmazonS3FullAccess')
        ])

    crawler = glue.CfnCrawler(
        self, 'csv-crawler',
        role=glue_role.role_arn,
        database_name='csv_db',
        targets={
            's3Targets': [{"path": f"s3://{bucket.bucket_name}/csv/"}],
        })
def __init__(self, scope: core.Construct, data_lake: DataLake, **kwargs) -> None:
    self.env = data_lake.env.value
    super().__init__(scope, id=f'{self.env}-glue-catalog', **kwargs)

    self.atomic_events_crawler = glue.CfnCrawler(
        self, f'{self.env}-atomic-events-crawler',
        name=f'{self.env}-atomic-events-crawler',
        description='Crawler to detect schema of data stored in data lake raw, atomic events',
        schedule=glue.CfnCrawler.ScheduleProperty(
            schedule_expression='cron(0/15 * * * ? *)'),
        role=data_lake.data_lake_role.role_arn,
        targets=glue.CfnCrawler.TargetsProperty(s3_targets=[
            glue.CfnCrawler.S3TargetProperty(
                path=f's3://{data_lake.data_lake_raw_bucket.bucket_name}/atomic_events')
        ]),
        database_name=data_lake.data_lake_raw_database.database_name)

    self.orders_table = glue.Table(
        self, f'{self.env}-orders-table',
        table_name='orders',
        description='orders captured from Postgres using DMS CDC',
        database=data_lake.data_lake_raw_database,
        compressed=True,
        data_format=glue.DataFormat.PARQUET,
        s3_prefix='orders/public/orders',
        bucket=data_lake.data_lake_raw_bucket,
        columns=[
            glue.Column(name='op',
                        type=glue.Type(input_string='string', is_primitive=True)),
            glue.Column(name='extracted_at',
                        type=glue.Type(input_string='string', is_primitive=True)),
            glue.Column(name='created_at',
                        type=glue.Type(input_string='timestamp', is_primitive=True)),
            glue.Column(name='order_id',
                        type=glue.Type(input_string='int', is_primitive=True)),
            glue.Column(name='product_name',
                        type=glue.Type(input_string='string', is_primitive=True)),
            glue.Column(name='value',
                        type=glue.Type(input_string='double', is_primitive=True))
        ])
def __init__(self, scope: core.Construct, construct_id: str, **kwargs) -> None:
    super().__init__(scope, construct_id, **kwargs)

    # The code that defines your stack goes here
    glue_crawler = glue.CfnCrawler(
        self, 'phoenix_crawler',
        description="Glue Crawler for my-data-science-s3",
        name='phoenixcrawler',
        database_name='phoenixdb',
        #schedule={"scheduleExpression": "cron(45 12 * * ? *)"},
        role='arn:aws:iam::147279300887:role/successnick',
        targets={"s3Targets": [{"path": "s3://mydemoobuckett31"}]})
def __init__(self, scope: cdk.Stack, id: str, base_module, stream_module, **kwargs):
    super().__init__(scope, id, **kwargs)
    self.base_module = base_module
    self.stream_module = stream_module

    self.glue_service_iam_role = aws_iam.Role(
        self, "GlueIAMRole",
        role_name="GlueCrawler-{}".format(self.stack_name),
        assumed_by=aws_iam.ServicePrincipal(service='glue.amazonaws.com'),
    )

    # Attaching the default AWS managed Glue service policy, and an S3 access policy for the
    # curated bucket path. (attach_managed_policy(arn) is an older CDK API; current releases
    # use add_managed_policy(aws_iam.ManagedPolicy.from_managed_policy_arn(...)).)
    self.glue_service_iam_role.attach_managed_policy(
        'arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole')

    self.glue_s3_iam_policy_statement = aws_iam.PolicyStatement()
    actions = ["s3:GetObject", "s3:PutObject"]
    for action in actions:
        self.glue_s3_iam_policy_statement.add_action(action)
    self.glue_s3_iam_policy_statement.add_resource(
        self.stream_module.output_bucket.bucket_arn + '/twitter-curated/*')

    self.glue_iam_policy = aws_iam.Policy(
        self, "GlueIAMPolicy",
        statements=[self.glue_s3_iam_policy_statement],
    )
    self.glue_iam_policy.attach_to_role(self.glue_service_iam_role)

    self.glue_database = aws_glue.Database(
        self, "GlueDatabaseTwitterData",
        database_name=self.stack_name,
    )

    self.glue_crawler = aws_glue.CfnCrawler(
        self, "GlueCrawlerTwitterDB",
        database_name=self.glue_database.database_name,
        role=self.glue_service_iam_role.role_arn,
        targets={
            "s3Targets": [{
                "path": "s3://{}/twitter-curated/".format(
                    self.stream_module.output_bucket.bucket_name)
            }]
        },
        table_prefix=self.stack_name)
def create_crawler(self, location):
    """Create a crawler in the data lake for the given dataset location."""
    crawler_name = generate_crawler_name(s3url=location)
    table_prefix = generate_table_prefix(s3url=location)

    # Ensure the crawler role exists before it is referenced below. Note: calling
    # self.crawler_role() only works if crawler_role is a callable factory/property;
    # if it is just an unset attribute (None), this line raises a TypeError and the
    # role must be created by a helper instead (not shown in this module).
    if not self.crawler_role:
        self.crawler_role()

    glue.CfnCrawler(
        scope=self,
        id=f"oedi-data-lake-crawler--{crawler_name}",
        name=crawler_name,
        role=self.crawler_role.role_name,
        targets={"s3Targets": [{"path": location}]},
        database_name=self.database_name,
        table_prefix=table_prefix,
    )
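# Hedged sketch (assumption - the original module does not show how self.crawler_role is
# created): create_crawler() above expects a lazily created crawler role. A minimal helper
# such as this hypothetical _ensure_crawler_role() could back that attribute, assuming the
# module imports aws_iam as iam.
def _ensure_crawler_role(self):
    """Create and cache the IAM role used by the data lake crawlers (hypothetical helper)."""
    if getattr(self, "crawler_role", None) is None:
        self.crawler_role = iam.Role(
            self,
            "oedi-data-lake-crawler-role",   # hypothetical construct id
            assumed_by=iam.ServicePrincipal("glue.amazonaws.com"),
            managed_policies=[
                iam.ManagedPolicy.from_aws_managed_policy_name(
                    "service-role/AWSGlueServiceRole")
            ])
    return self.crawler_role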
def __init__(self, scope: core.Construct, id: str, props: GlueCrawlerProps, **kwargs) -> None:
    # Glue Crawler property shorthands
    crawlerCfn = glue.CfnCrawler
    targetProperty = crawlerCfn.TargetsProperty
    S3TargetProperty = crawlerCfn.S3TargetProperty
    ScheduleProperty = crawlerCfn.ScheduleProperty

    # Glue Crawler
    self.glue_crawler = glue.CfnCrawler(
        scope, 'clickstream_crawler',
        role=props.role.role_arn,
        targets=targetProperty(s3_targets=[
            S3TargetProperty(path=f's3://{props.bucket.bucket_name}/kinesis/')
        ]),
        database_name='clickstream_db',
        name='clickstream',
        schedule=ScheduleProperty(schedule_expression='cron(0 * ? * * *)'))
def _create_glue_crawler(self):
    """
    Create a Glue crawler that runs on our defined data bucket.
    :return: the CfnCrawler construct
    """
    s3_target = glue.CfnCrawler.S3TargetProperty(
        path=f"s3://{self.data_bucket.bucket_name}/")
    # schedule = "cron(30 5 * * ? *)"
    db_name = self.glue_db_name
    crawler = glue.CfnCrawler(
        self,
        id=f"glue-crawler-{db_name}",
        name=f"{db_name}-crawl",
        database_name=db_name,
        role=self.glue_role.role_arn,
        targets=glue.CfnCrawler.TargetsProperty(s3_targets=[s3_target]),
        # schedule=glue.CfnCrawler.ScheduleProperty(schedule_expression=schedule),
    )
    return crawler
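# Hedged sketch (assumption, not in the original stack): a variant of _create_glue_crawler()
# that re-enables the commented-out schedule above so the crawl runs daily at 05:30 UTC.
# The method name and construct id are hypothetical.
def _create_scheduled_glue_crawler(self, schedule_expression: str = "cron(30 5 * * ? *)"):
    """Hypothetical scheduled variant of _create_glue_crawler()."""
    db_name = self.glue_db_name
    return glue.CfnCrawler(
        self,
        id=f"glue-crawler-scheduled-{db_name}",   # hypothetical construct id
        name=f"{db_name}-scheduled-crawl",
        database_name=db_name,
        role=self.glue_role.role_arn,
        targets=glue.CfnCrawler.TargetsProperty(s3_targets=[
            glue.CfnCrawler.S3TargetProperty(
                path=f"s3://{self.data_bucket.bucket_name}/")
        ]),
        # ScheduleProperty takes a standard cron(...) expression, as in the comment above.
        schedule=glue.CfnCrawler.ScheduleProperty(
            schedule_expression=schedule_expression),
    )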
def __init__(
    self,
    scope: cdk.Construct,
    construct_id: str,
    stack_log_level: str,
    _glue_etl_role,
    etl_bkt,
    etl_bkt_prefix,
    glue_db_name: str,
    **kwargs,
) -> None:
    super().__init__(scope, construct_id, **kwargs)
    self.template_options.metadata = {"License": "Miztiik Corp."}

    # Glue Crawler
    sale_txns_crawler = _glue.CfnCrawler(
        self,
        "glueDataLakeCrawler",
        name="sale_txns_crawler",
        role=_glue_etl_role.role_arn,
        database_name=f"{glue_db_name}",
        table_prefix="sales_txns_in_parquet_",
        description="Miztiik Automation: Crawl the sales transactions in parquet format, store in table to enable Athena Querying",
        targets={
            "s3Targets": [{
                "path": f"s3://{etl_bkt.bucket_name}/{etl_bkt_prefix}",
                "exclusions": ["checkpoint/**"]
            }]
        },
        configuration="{\"Version\":1.0,\"CrawlerOutput\":{\"Partitions\":{\"AddOrUpdateBehavior\":\"InheritFromTable\"},\"Tables\":{\"AddOrUpdateBehavior\":\"MergeNewColumns\"}}}",
        schedule=_glue.CfnCrawler.ScheduleProperty(
            schedule_expression="cron(0 * * * ? *)"))

    # Configuration As JSON in human readable format
    """
    {
        "Version": 1,
        "CrawlerOutput": {
            "Partitions": { "AddOrUpdateBehavior": "InheritFromTable" },
            "Tables": { "AddOrUpdateBehavior": "MergeNewColumns" }
        }
    }
    and SchemaChangePolicy
    {
        "UpdateBehavior": "UPDATE_IN_DATABASE",
        "DeleteBehavior": "DEPRECATE_IN_DATABASE"
    }
    """

    ###########################################
    ################# OUTPUTS #################
    ###########################################
    output_0 = cdk.CfnOutput(
        self,
        "AutomationFrom",
        value=f"{GlobalArgs.SOURCE_INFO}",
        description="To know more about this automation stack, check out our github page.",
    )
    output_1 = cdk.CfnOutput(
        self,
        "SaleTransactionsCrawler",
        value=f" https://console.aws.amazon.com/glue/home?region={cdk.Aws.REGION}#crawler:name={sale_txns_crawler.name}",
        description="Glue ETL Job.",
    )
def __init__(
    self,
    scope: cdk.Construct,
    construct_id: str,
    stack_log_level: str,
    sales_event_bkt,
    glue_db_name: str,
    **kwargs,
) -> None:
    super().__init__(scope, construct_id, **kwargs)
    self.template_options.metadata = {"License": "Miztiik Corp."}

    # Glue Job IAM Role
    self._glue_etl_role = _iam.Role(
        self,
        "glueJobRole",
        assumed_by=_iam.ServicePrincipal("glue.amazonaws.com"),
        managed_policies=[
            _iam.ManagedPolicy.from_aws_managed_policy_name(
                "AmazonS3ReadOnlyAccess"),
            _iam.ManagedPolicy.from_aws_managed_policy_name(
                "service-role/AWSGlueServiceRole")
        ])
    self._glue_etl_role.add_to_policy(
        _iam.PolicyStatement(
            actions=["s3:*"],
            resources=[
                f"{sales_event_bkt.bucket_arn}",
                f"{sales_event_bkt.bucket_arn}/*"
            ]))

    # Glue Crawler
    sale_txns_crawler = _glue.CfnCrawler(
        self,
        "glueDataLakeCrawler",
        name="sale_txns_crawler",
        role=self._glue_etl_role.role_arn,
        database_name=f"{glue_db_name}",
        table_prefix="txns_",
        description="Miztiik Automation: Crawl the sales transactions in JSON format, store in table to enable querying",
        targets={
            "s3Targets": [{
                "path": f"s3://{sales_event_bkt.bucket_name}",
                "exclusions": ["db_connectors/**", "bookmarks/**"]
            }]
        },
        configuration="{\"Version\":1.0,\"CrawlerOutput\":{\"Partitions\":{\"AddOrUpdateBehavior\":\"InheritFromTable\"},\"Tables\":{\"AddOrUpdateBehavior\":\"MergeNewColumns\"}}}",
        schema_change_policy=_glue.CfnCrawler.SchemaChangePolicyProperty(
            update_behavior="UPDATE_IN_DATABASE",
            delete_behavior="LOG"),
        schedule=_glue.CfnCrawler.ScheduleProperty(
            schedule_expression="cron(0 * * * ? *)"))

    # Configuration As JSON in human readable format
    """
    {
        "Version": 1,
        "CrawlerOutput": {
            "Partitions": { "AddOrUpdateBehavior": "InheritFromTable" },
            "Tables": { "AddOrUpdateBehavior": "MergeNewColumns" }
        }
    }
    and SchemaChangePolicy
    {
        "UpdateBehavior": "UPDATE_IN_DATABASE",
        "DeleteBehavior": "DEPRECATE_IN_DATABASE"
    }
    """

    ###########################################
    ################# OUTPUTS #################
    ###########################################
    output_0 = cdk.CfnOutput(
        self,
        "AutomationFrom",
        value=f"{GlobalArgs.SOURCE_INFO}",
        description="To know more about this automation stack, check out our github page.",
    )
    output_1 = cdk.CfnOutput(
        self,
        "SaleTransactionsCrawler",
        value=f" https://console.aws.amazon.com/glue/home?region={cdk.Aws.REGION}#crawler:name={sale_txns_crawler.name}",
        description="Glue ETL Job.",
    )
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    with open('./props/tasksetting.json', 'r') as f1:
        py_json1 = json.load(f1)
        ts = json.dumps(py_json1)

    # with open('./props/mappingrule.json', 'r') as f2:
    #     py_json2 = json.load(f2)
    #     mr = json.dumps(py_json2)

    with open('./props/config.json', 'r') as f2:
        configuration = json.load(f2)

    def getMappingrules(self, table_list):
        rules = []
        for index, value in enumerate(table_list, 1):
            rules.append({
                "rule-type": "selection",
                "rule-id": str(index),
                "rule-name": str(index),
                "object-locator": {
                    "schema-name": value['schemaName'],
                    "table-name": value['tableName']
                },
                "rule-action": "include",
                "filters": []
            })
        mapping_rules = {"rules": rules}
        return json.dumps(mapping_rules)

    # The code that defines your stack goes here
    S3Accessrole = _iam.Role(
        self, 'dmsrole',
        assumed_by=_iam.ServicePrincipal('dms.amazonaws.com'),
        managed_policies=[
            _iam.ManagedPolicy.from_aws_managed_policy_name('AmazonS3FullAccess')
        ])

    raw_bucket = s3.Bucket(self, 'rawbucket',
                           bucket_name='rawbucket-datalake-cdk-oregon')
    raw_bucket.add_lifecycle_rule(
        enabled=configuration['s3LifecycleRule']['enabled'],
        expiration=core.Duration.days(configuration['s3LifecycleRule']['expiration']))

    #my_table = ddb.Table(self, id='dunamoTable', table_name='testcdktable',
    #                     partition_key=ddb.Attribute(name='lastname', type=ddb.AttributeType.STRING))

    dl_dms = _dms.CfnReplicationInstance(
        self, 'dmsreplication',
        replication_instance_class=configuration['DMS_instance_setting']['instance_class'],
        replication_instance_identifier='datalake-instance-cdk',
        allocated_storage=configuration['DMS_instance_setting']['allocated_storage'])

    source_endpoint = _dms.CfnEndpoint(
        self, 'sourceendpoint',
        endpoint_type='source',
        engine_name=configuration['engineName'],
        database_name=configuration['databaseName'],
        username=configuration['username'],
        password=configuration['password'],
        port=configuration['port'],
        server_name=configuration['serverName'],
    )

    target_endpoint = _dms.CfnEndpoint(
        self, 'targetendpoint',
        endpoint_type='target',
        engine_name='s3',
        s3_settings={
            'bucketName': raw_bucket.bucket_name,
            'serviceAccessRoleArn': S3Accessrole.role_arn
        },
        extra_connection_attributes='dataFormat=parquet')

    dms_task = _dms.CfnReplicationTask(
        self, 'data2lake-task',
        migration_type='full-load-and-cdc',
        replication_instance_arn=dl_dms.ref,
        source_endpoint_arn=source_endpoint.ref,
        target_endpoint_arn=target_endpoint.ref,
        replication_task_settings=ts,
        table_mappings=getMappingrules(self, configuration['tableList']))

    my_table = ddb.Table(self, id='dynamoTable',
                         table_name='ControllerTable',
                         partition_key=ddb.Attribute(name='path', type=ddb.AttributeType.STRING),
                         billing_mode=ddb.BillingMode.PAY_PER_REQUEST)

    datalake_bucket = s3.Bucket(self, 'datalakebucket',
                                bucket_name='datalake-bucket-cdk-oregon')

    glue_role = _iam.Role(
        self, 'gluerole',
        assumed_by=_iam.ServicePrincipal('glue.amazonaws.com'),
        managed_policies=[
            _iam.ManagedPolicy.from_aws_managed_policy_name('service-role/AWSGlueServiceRole')
        ])
    raw_bucket.grant_read(glue_role)
    datalake_bucket.grant_read_write(glue_role)

    # Lake Formation settings.
    # If you have attached the managed policy 'AWSLakeFormationDataAdmin' to your own IAM user,
    # you should change that policy to allow "lakeformation:PutDataLakeSettings",
    # so that the lake settings below can be applied by this CDK code.
    lake_admin_setting = _lakeformation.CfnDataLakeSettings(
        self, 'data-lake-GrantAdmin',
        admins=[
            _lakeformation.CfnDataLakeSettings.DataLakePrincipalProperty(
                data_lake_principal_identifier=configuration['executiveArn'])
        ])

    glue_database = _glue.Database(self, 'gluedatabase',
                                   database_name='data_lake_gluedb')
    glue_database.node.add_dependency(lake_admin_setting)

    glue_role_permission_inLakeFormation = _lakeformation.CfnPermissions(
        self, 'permission-glueRole',
        data_lake_principal=_lakeformation.CfnPermissions.DataLakePrincipalProperty(
            data_lake_principal_identifier=glue_role.role_arn),
        resource=_lakeformation.CfnPermissions.ResourceProperty(
            database_resource=_lakeformation.CfnPermissions.DatabaseResourceProperty(
                name=glue_database.database_name)),
        permissions=['ALL'])

    crawler = _glue.CfnCrawler(
        self, 'datalakecrawler',
        name='Crawler-datalake-cdk',
        role=glue_role.role_arn,
        targets={
            's3Targets': [{
                'path': 's3://' + datalake_bucket.bucket_name + '/datalake/'
            }]
        },
        database_name='data_lake_gluedb',
        configuration="{\"Version\":1.0,\"CrawlerOutput\":{\"Partitions\":{\"AddOrUpdateBehavior\":\"InheritFromTable\"},\"Tables\":{\"AddOrUpdateBehavior\":\"MergeNewColumns\"}}}")

    initialload_script = S3Assets.Asset(self, 'initial-load-code',
                                        path='./Gluejob/InitialLoad.py')
    incrementalload_script = S3Assets.Asset(self, 'incremental-load-code',
                                            path='./Gluejob/IncrementalLoad.py')
    initialload_script.grant_read(glue_role)
    incrementalload_script.grant_read(glue_role)
    my_table.grant_full_access(glue_role)

    initial_load_job = _glue.CfnJob(
        self, 'initial-job',
        name='InitialLoad-cdk',
        command=_glue.CfnJob.JobCommandProperty(
            name='glueetl',
            python_version='3',
            script_location='s3://' + initialload_script.s3_bucket_name + '/' + initialload_script.s3_object_key),
        role=glue_role.role_arn,
        default_arguments={
            '--prefix': str(configuration['tableList']),
            '--bucket': raw_bucket.bucket_name,
            '--datalake_bucket': datalake_bucket.bucket_name,
            '--datalake_prefix': 'datalake/',
            '--region': CdkpyStack.of(self).region,
            '--controller_table_name': my_table.table_name
        },
        allocated_capacity=configuration['glue_job_setting']['job_capacity'],
        execution_property=_glue.CfnJob.ExecutionPropertyProperty(
            max_concurrent_runs=configuration['glue_job_setting']['max_concurrent_run_JobExecution']))

    incremental_load_job = _glue.CfnJob(
        self, 'increment-job',
        name='IncrementalLoad-cdk',
        command=_glue.CfnJob.JobCommandProperty(
            name='glueetl',
            script_location='s3://' + incrementalload_script.s3_bucket_name + '/' + incrementalload_script.s3_object_key,
            python_version='3'),
        role=glue_role.role_arn,
        default_arguments={
            '--prefix': str(configuration['tableList']),
            '--bucket': raw_bucket.bucket_name,
            '--datalake_bucket': datalake_bucket.bucket_name,
            '--datalake_prefix': 'datalake/',
            '--region': CdkpyStack.of(self).region,
            '--controller_table_name': my_table.table_name
        },
        allocated_capacity=2,
        execution_property=_glue.CfnJob.ExecutionPropertyProperty(
            max_concurrent_runs=1))

    job_trigger = _glue.CfnTrigger(
        self, 'datalake-glue-trigger',
        type='SCHEDULED',
        schedule=configuration['job_trigger_schedule'],
        start_on_creation=False,
        actions=[
            _glue.CfnTrigger.ActionProperty(job_name='IncrementalLoad-cdk')
        ])

    dl_sns = _sns.Topic(self, 'datalake_sns', display_name='data-lake-sns')
    endpoint_email = configuration['emailSubscriptionList']
    for emails in endpoint_email:
        dl_sns.add_subscription(_subscrption.EmailSubscription(emails))

    # Another way to subscribe:
    # dl_subscription = _sns.Subscription(self, 'email-subscrption', topic=dl_sns,
    #                                     endpoint='*****@*****.**',
    #                                     protocol=_sns.SubscriptionProtocol.EMAIL)

    glue_events_target = _events_targets.SnsTopic(dl_sns)
    glue_events_rule = _events.Rule(
        self, 'gluejobevents-datalake',
        description='Used for tracking failed Glue jobs of the data lake',
        rule_name='dl-gluejob-event',
        event_pattern=_events.EventPattern(
            source=['aws.glue'],
            detail_type=['Glue Job State Change'],
            detail={
                "jobName": [initial_load_job.name],
                "state": ["FAILED"]
            }),
        targets=[glue_events_target])

    dms_subscription = _dms.CfnEventSubscription(
        self, 'dmsevents-datalake',
        sns_topic_arn=dl_sns.topic_arn,
        subscription_name='datalake-dmsevents',
        source_type='replication-task',
        event_categories=['failure'])
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    s3_logs_bucket = s3.Bucket(
        self, "LogsBucket",
        encryption=s3.BucketEncryption.KMS_MANAGED,
        block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
        lifecycle_rules=[
            s3.LifecycleRule(
                abort_incomplete_multipart_upload_after=core.Duration.days(7),
                expiration=core.Duration.days(30))
        ])

    s3_data_bucket = s3.Bucket(
        self, "DataBucket",
        encryption=s3.BucketEncryption.KMS_MANAGED,
        block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
        server_access_logs_bucket=s3_logs_bucket,
        server_access_logs_prefix=f"s3accesslogs/{PROJECT_NAME}/")

    glue_database = glue.Database(self, "GlueDatabase", database_name=PROJECT_NAME)

    glue_table = glue.Table(
        self, "GlueTable",
        columns=[
            glue.Column(name="timestamp",
                        type=glue.Type(input_string="int", is_primitive=True)),
            glue.Column(name="celcius",
                        type=glue.Type(input_string="double", is_primitive=True)),
            glue.Column(name="fahrenheit",
                        type=glue.Type(input_string="double", is_primitive=True))
        ],
        database=glue_database,
        data_format=glue.DataFormat(
            input_format=glue.InputFormat("org.apache.hadoop.mapred.TextInputFormat"),
            output_format=glue.OutputFormat(
                "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"),
            serialization_library=glue.SerializationLibrary(
                "org.openx.data.jsonserde.JsonSerDe")),
        table_name=PROJECT_NAME,
        encryption=glue.TableEncryption.S3_MANAGED,
        partition_keys=[
            glue.Column(name="year",
                        type=glue.Type(input_string="int", is_primitive=True)),
            glue.Column(name="month",
                        type=glue.Type(input_string="int", is_primitive=True)),
            glue.Column(name="day",
                        type=glue.Type(input_string="int", is_primitive=True))
        ])

    glue_crawler_role = iam.Role(
        self, "GlueCrawlerRole",
        assumed_by=iam.ServicePrincipal("glue.amazonaws.com"),
        managed_policies=[
            # The AWS managed Glue service policy lives under the service-role/ path.
            iam.ManagedPolicy.from_aws_managed_policy_name(
                "service-role/AWSGlueServiceRole")
        ])
    s3_data_bucket.grant_read(glue_crawler_role,
                              objects_key_pattern=f"{PROJECT_PREFIX}/")
    s3_data_bucket.grant_put(glue_crawler_role,
                             objects_key_pattern=f"{PROJECT_PREFIX}/")

    glue_crawler = glue.CfnCrawler(
        self, "GlueCrawler",
        role=glue_crawler_role.role_arn,
        database_name=glue_database.database_name,
        targets={
            "s3Targets": [{
                # The crawler include path is an S3 URI, hence the "s3://" prefix added here.
                "path": f"s3://{s3_data_bucket.bucket_name}/{PROJECT_PREFIX}/"
            }]
        },
        schedule={"scheduleExpression": "cron(30 04 * * ? *)"})
def __init__(self, scope: core.Construct, id: str, source_bucket_name: str,
             glue_database_name: str, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    # Get the source bucket - this object is an IBucket proxy, not a Bucket construct,
    # so it cannot be used to add an event notification directly. Instead, use a custom
    # resource to add an event trigger later.
    source_bucket = s3.Bucket.from_bucket_name(
        self, "MySourceBucket", bucket_name=source_bucket_name)

    # Create the new destination bucket - this bucket holds the csv file that contains the FITS header information.
    # The name of the bucket will be <stack-id>-fitsstorebucketXXXXXXXX-YYYYYYYYYYYYY
    # e.g. my-fits-datalake-fitsstorebucket1234567f-098765432d
    target_bucket = s3.Bucket(self, "FITSSTORE_BUCKET")

    # Add the astropy and numpy layers for the lambda function that is used as the event trigger on the source_bucket
    layer_astropy = lambda_.LayerVersion(
        self, 'AstroFitsioLayer',
        code=lambda_.Code.from_asset("resources_layer/astropy.zip"),
        compatible_runtimes=[lambda_.Runtime.PYTHON_3_7])
    # Use an AWS-provided layer for numpy
    layer_numpy = lambda_.LayerVersion.from_layer_version_arn(
        self, "NumpyLayer",
        "arn:aws:lambda:us-east-1:668099181075:layer:AWSLambda-Python37-SciPy1x:22")

    # Create the FITS header extractor lambda function.
    # Pass the FITSSTORE_BUCKET to the lambda function as an environment variable.
    handler = lambda_.Function(
        self, "FITSHeaderExtractorHandler",
        runtime=lambda_.Runtime.PYTHON_3_7,
        code=lambda_.Code.asset("resources"),
        handler="fits_header_extractor.fits_header_extractor_handler",
        environment=dict(FITSSTORE_BUCKET=target_bucket.bucket_name),
        layers=[layer_astropy, layer_numpy])

    # Grant the handler read access on the source bucket
    source_bucket.grant_read(handler)

    # Give the lambda a resource-based policy;
    # both source_arn and source_account are needed for security reasons
    handler.add_permission(
        's3-trigger-lambda-s3-invoke-function',
        principal=iam_.ServicePrincipal('s3.amazonaws.com'),
        action='lambda:InvokeFunction',
        source_arn=source_bucket.bucket_arn,
        source_account=self.account)

    # Grant access to the handler
    # - this is a lot easier than adding policies, but not all constructs support this
    target_bucket.grant_read_write(handler)

    # Mapping the put event to the handler directly doesn't work, as source_bucket is not
    # really a Bucket object (it is an IBucket proxy). You can use this approach if the
    # bucket is created as a new Bucket object:
    #notification = s3_notifications.LambdaDestination(handler)
    #source_bucket.add_object_created_notification(self, notification)

    # Use a custom resource to add an event trigger on the source bucket -
    # the custom resource creation makes an SDK call to create the event notification.
    # Action reference https://docs.aws.amazon.com/AWSJavaScriptSDK/latest/AWS/S3.html
    # Events reference https://docs.aws.amazon.com/AmazonS3/latest/dev/NotificationHowTo.html
    custom_s3_resource = custom_resources_.AwsCustomResource(
        self, 's3-putobject-custom-notification-resource',
        policy=custom_resources_.AwsCustomResourcePolicy.from_statements([
            iam_.PolicyStatement(effect=iam_.Effect.ALLOW,
                                 resources=['*'],
                                 actions=['s3:PutBucketNotification'])
        ]),
        on_create=custom_resources_.AwsSdkCall(
            service="S3",
            action="putBucketNotificationConfiguration",
            parameters={
                "Bucket": source_bucket.bucket_name,
                "NotificationConfiguration": {
                    "LambdaFunctionConfigurations": [{
                        "Events": ['s3:ObjectCreated:*', 's3:ObjectRemoved:*'],
                        "LambdaFunctionArn": handler.function_arn,
                        "Filter": {
                            "Key": {
                                "FilterRules": [{
                                    'Name': 'suffix',
                                    'Value': 'fits'
                                }]
                            }
                        }
                    }]
                }
            },
            physical_resource_id=custom_resources_.PhysicalResourceId.of(
                f's3-notification-resource-{str(uuid.uuid1())}'),
            region=self.region))

    # Make sure the lambda function is created first
    custom_s3_resource.node.add_dependency(
        handler.permissions_node.find_child('s3-trigger-lambda-s3-invoke-function'))

    # Create a glue crawler to build the data catalog.
    # Step 1. Create a role for AWS Glue
    glue_role = iam_.Role(
        self, "glue_role",
        assumed_by=iam_.ServicePrincipal('glue.amazonaws.com'),
        managed_policies=[
            iam_.ManagedPolicy.from_managed_policy_arn(
                self, 'MyFitsCrawlerGlueRole',
                managed_policy_arn='arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole')
        ])
    # The glue role needs "*" read/write - otherwise the crawler will not be able to create
    # tables (and there are no error messages in the crawler logs)
    glue_role.add_to_policy(
        iam_.PolicyStatement(
            actions=['s3:GetObject', 's3:PutObject', 'lakeformation:GetDataAccess'],
            effect=iam_.Effect.ALLOW,
            resources=['*']))

    # Step 2. Create a database in the data catalog
    db = glue_.Database(self, "MyFitsDatabase", database_name=glue_database_name)

    # Step 3. Create a crawler named "fitsdatalakecrawler-<hex>", and schedule it to run every 15 mins.
    # You can change the frequency based on your needs.
    # Cron schedule format: cron(Minutes Hours Day-of-month Month Day-of-week Year)
    glue_.CfnCrawler(
        self, "fits-datalake-crawler",
        database_name=glue_database_name,
        role=glue_role.role_arn,
        schedule={"scheduleExpression": "cron(0/15 * * * ? *)"},
        # The crawler include path is an S3 URI, hence the "s3://" prefix added here.
        targets={"s3Targets": [{"path": f"s3://{target_bucket.bucket_name}"}]},
    )

    # When your AWS Lake Formation Data Catalog settings are not set to
    # "Use only IAM access control for new databases" or
    # "Use only IAM access control for new tables in new databases",
    # you need to grant additional permissions on the data catalog database.
    # In order for the crawler to run, we add the Lake Formation permissions below.
    location_resource = lakeformation_.CfnResource(
        self, "MyFitsDatalakeLocationResource",
        resource_arn=target_bucket.bucket_arn,
        use_service_linked_role=True)

    lakeformation_.CfnPermissions(
        self, "MyFitsDatalakeDatabasePermission",
        data_lake_principal=lakeformation_.CfnPermissions.DataLakePrincipalProperty(
            data_lake_principal_identifier=glue_role.role_arn),
        resource=lakeformation_.CfnPermissions.ResourceProperty(
            database_resource=lakeformation_.CfnPermissions.DatabaseResourceProperty(
                name=db.database_name)),
        permissions=["ALTER", "DROP", "CREATE_TABLE"],
    )

    location_permission = lakeformation_.CfnPermissions(
        self, "MyFitsDatalakeLocationPermission",
        data_lake_principal=lakeformation_.CfnPermissions.DataLakePrincipalProperty(
            data_lake_principal_identifier=glue_role.role_arn),
        resource=lakeformation_.CfnPermissions.ResourceProperty(
            data_location_resource=lakeformation_.CfnPermissions.DataLocationResourceProperty(
                s3_resource=target_bucket.bucket_arn)),
        permissions=["DATA_LOCATION_ACCESS"],
    )
    # Make sure the location resource is created first
    location_permission.node.add_dependency(location_resource)