def create_database(self):
    """Create the data lake database in Glue."""
    id_suffix = self.database_name.replace("_", "-")
    glue.Database(
        scope=self,
        id=f"oedi-data-lake-database--{id_suffix}",
        database_name=self.database_name,
    )

def _create_glue_db(self):
    """Create a glue database that will be visible in Athena."""
    db_name = self.glue_db_name
    db = glue.Database(
        self,
        f"{db_name}-id",
        database_name=db_name,
        location_uri=f"s3://{self.data_bucket.bucket_name}/",
    )
    return db

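# NOTE: the snippets in this section omit their import blocks. A minimal sketch of the
# CDK v1 modules they rely on is shown here; the exact aliases vary per snippet (e.g.
# glue, aws_glue, _glue, glue_), so treat this list as an assumption rather than the
# originals' actual imports.
from aws_cdk import core
from aws_cdk import aws_glue as glue
from aws_cdk import aws_iam as iam
from aws_cdk import aws_s3 as s3
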
def __init__(self, scope: cdk.Stack, id: str, base_module, stream_module, **kwargs):
    super().__init__(scope, id, **kwargs)
    self.base_module = base_module
    self.stream_module = stream_module

    self.glue_service_iam_role = aws_iam.Role(
        self, "GlueIAMRole",
        role_name="GlueCrawler-{}".format(self.stack_name),
        assumed_by=aws_iam.ServicePrincipal(service='glue.amazonaws.com'),
    )
    # Attach the default AWS managed policy, plus an S3 access policy scoped to the curated bucket path
    self.glue_service_iam_role.attach_managed_policy(
        'arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole')

    self.glue_s3_iam_policy_statement = aws_iam.PolicyStatement()
    actions = ["s3:GetObject", "s3:PutObject"]
    for action in actions:
        self.glue_s3_iam_policy_statement.add_action(action)
    self.glue_s3_iam_policy_statement.add_resource(
        self.stream_module.output_bucket.bucket_arn + '/twitter-curated/*')

    self.glue_iam_policy = aws_iam.Policy(
        self, "GlueIAMPolicy",
        statements=[self.glue_s3_iam_policy_statement],
    )
    self.glue_iam_policy.attach_to_role(self.glue_service_iam_role)

    self.glue_database = aws_glue.Database(
        self, "GlueDatabaseTwitterData",
        database_name=self.stack_name,
    )

    self.glue_crawler = aws_glue.CfnCrawler(
        self, "GlueCrawlerTwitterDB",
        database_name=self.glue_database.database_name,
        role=self.glue_service_iam_role.role_arn,
        targets={
            "s3Targets": [{
                "path": "s3://{}/twitter-curated/".format(
                    self.stream_module.output_bucket.bucket_name)
            }]
        },
        table_prefix=self.stack_name)

def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    self._region = 'aws_region'
    self._account_id = 'aws_account_id'

    bucket = s3.Bucket.from_bucket_name(self, 'my_bucket_id', 'my_bucket')
    database = glue.Database(self, id='my_database_id', database_name='poc')
    table = glue.Table(
        self,
        id='my_table_id',
        database=database,
        table_name='my_table',
        columns=[
            glue.Column(name='col1', type=glue.Type(input_string='string', is_primitive=True)),
            glue.Column(name='col2', type=glue.Type(input_string='int', is_primitive=True)),
        ],
        partition_keys=[
            glue.Column(name='dt', type=glue.Type(input_string='string', is_primitive=True)),
        ],
        bucket=bucket,
        s3_prefix='test_data',
        data_format=glue.DataFormat(
            input_format=glue.InputFormat('org.apache.hadoop.mapred.TextInputFormat'),
            output_format=glue.OutputFormat(
                'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'),
            serialization_library=glue.SerializationLibrary(
                'org.openx.data.jsonserde.JsonSerDe')))

def __init__(self, scope: core.Construct, construct_id: str, **kwargs) -> None:
    super().__init__(scope, construct_id, **kwargs)

    # The code that defines your stack goes here
    dbdemo = glue.Database(self, "nikhildb", database_name="nike")

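# A stack constructor like the one above is only synthesized when it is instantiated from
# a CDK app entry point. A minimal sketch of that entry point (the GlueDemoStack name is a
# placeholder, not taken from the original project):
from aws_cdk import core

app = core.App()
GlueDemoStack(app, "glue-demo-stack")
app.synth()
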
def __init__(self, scope: core.Construct, id: str, landing_zone: ILandingZone,
             directory: DirectoryServicesConstruct, group_names: List[str], **kwargs) -> None:
    super().__init__(scope, id, **kwargs)
    self.__landing_zone = landing_zone

    # Configure the security groups
    self.security_group = ec2.SecurityGroup(
        self, 'SecurityGroup',
        vpc=landing_zone.networking.vpc,
        allow_all_outbound=True,
        description='HadoopConstruct Security Group',
        security_group_name='hadoop-mapreduce-group')

    for port in services.keys():
        self.security_group.add_ingress_rule(
            peer=ec2.Peer.any_ipv4(),
            connection=ec2.Port(protocol=ec2.Protocol.TCP,
                                from_port=port,
                                to_port=port,
                                string_representation=services[port]))

    self.security_group.add_ingress_rule(
        peer=ec2.Peer.any_ipv4(),
        connection=ec2.Port(protocol=ec2.Protocol.UDP,
                            from_port=0,
                            to_port=65535,
                            string_representation='Allow All UDP Traffic'))

    self.security_group.add_ingress_rule(
        peer=ec2.Peer.any_ipv4(),
        connection=ec2.Port(protocol=ec2.Protocol.TCP,
                            from_port=0,
                            to_port=65535,
                            string_representation='Allow All TCP Traffic'))

    # Setup roles...
    self.jobFlowRole = iam.Role(
        self, 'JobFlowRole',
        assumed_by=iam.ServicePrincipal(service='ec2.amazonaws.com'),
        managed_policies=[
            iam.ManagedPolicy.from_aws_managed_policy_name('AmazonSSMManagedInstanceCore'),
            iam.ManagedPolicy.from_aws_managed_policy_name(
                'service-role/AmazonElasticMapReduceforEC2Role'),
        ])

    profile_name = 'jobflowprofile@{}-{}'.format(
        landing_zone.zone_name,
        core.Stack.of(self).region)
    job_flow_instance_profile = iam.CfnInstanceProfile(
        self, 'JobFlowInstanceProfile',
        instance_profile_name=profile_name,
        roles=[self.jobFlowRole.role_name])

    serviceRole = iam.Role(
        self, 'ServiceRole',
        assumed_by=iam.ServicePrincipal(service='elasticmapreduce.amazonaws.com'),
        managed_policies=[
            iam.ManagedPolicy.from_aws_managed_policy_name(
                'service-role/AmazonElasticMapReduceRole')
        ])

    self.database = g.Database(self, 'GlueStore', database_name='demo-database')

    self.bucket = s3.Bucket(self, 'LogBucket', removal_policy=core.RemovalPolicy.DESTROY)

    emr_fs = EmrfsConstruct(
        self, 'Emrfs',
        landing_zone=landing_zone,
        directory=directory,
        group_names=group_names,
        job_flow_role=self.jobFlowRole)

    # https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-elasticmapreduce-instancefleetconfig.html
    self.cluster = emr.CfnCluster(
        self, 'Hadoop',
        name='HadoopCluster',
        job_flow_role=profile_name,  # 'EMR_EC2_DefaultRole',
        service_role=serviceRole.role_name,
        log_uri='s3://' + self.bucket.bucket_name + '/logs',
        release_label='emr-6.2.0',
        applications=[
            emr.CfnCluster.ApplicationProperty(name='Spark'),
            emr.CfnCluster.ApplicationProperty(name='Presto'),
            emr.CfnCluster.ApplicationProperty(name='Hue'),
            emr.CfnCluster.ApplicationProperty(name='Hive'),
            emr.CfnCluster.ApplicationProperty(name='JupyterHub'),
        ],
        configurations=[
            emr.CfnCluster.ConfigurationProperty(
                classification='spark-hive-site',
                configuration_properties={
                    'hive.metastore.client.factory.class':
                    'com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory'
                }),
            emr.CfnCluster.ConfigurationProperty(
                classification='hive-site',
                configuration_properties={
                    'hive.metastore.client.factory.class':
                    'com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory',
                    'aws.glue.partition.num.segments': '10',  # 1 to 10; (default=5)
                    'hive.metastore.schema.verification': 'false',
                })
        ],
        security_configuration=emr_fs.security_configuration.ref,
        # kerberos_attributes=emr.CfnCluster.KerberosAttributesProperty(
        #     kdc_admin_password=directory.password,
        #     realm=directory.mad.name.upper(),
        #     ad_domain_join_password=directory.password,
        #     ad_domain_join_user=directory.admin
        # ),
        managed_scaling_policy=emr.CfnCluster.ManagedScalingPolicyProperty(
            compute_limits=emr.CfnCluster.ComputeLimitsProperty(
                minimum_capacity_units=1,
                maximum_capacity_units=25,
                unit_type='InstanceFleetUnits')),
        instances=emr.CfnCluster.JobFlowInstancesConfigProperty(
            # hadoop_version='2.4.0',
            termination_protected=False,
            master_instance_fleet=emr.CfnCluster.InstanceFleetConfigProperty(
                target_spot_capacity=1,
                instance_type_configs=[
                    emr.CfnCluster.InstanceTypeConfigProperty(
                        instance_type='m5.xlarge',
                    )
                ]),
            core_instance_fleet=emr.CfnCluster.InstanceFleetConfigProperty(
                target_spot_capacity=1,
                instance_type_configs=[
                    emr.CfnCluster.InstanceTypeConfigProperty(
                        instance_type='m5.xlarge',
                        ebs_configuration=emr.CfnCluster.EbsConfigurationProperty(
                            ebs_block_device_configs=[
                                emr.CfnCluster.EbsBlockDeviceConfigProperty(
                                    volume_specification=emr.CfnCluster.VolumeSpecificationProperty(
                                        size_in_gb=50, volume_type='gp2'))
                            ]))
                ]),
            additional_master_security_groups=[self.security_group.security_group_id],
            additional_slave_security_groups=[self.security_group.security_group_id],
            ec2_subnet_ids=[
                net.subnet_id for net in landing_zone.networking.vpc._select_subnet_objects(
                    subnet_group_name='Hadoop')
            ],
        ))

    self.cluster.add_depends_on(job_flow_instance_profile)

def __init__(self, scope: cdk.Construct, construct_id: str, **kwargs: str) -> None:
    super().__init__(scope, construct_id, **kwargs)

    self.vpc = ec2.Vpc(
        self, "aws-data-wrangler-vpc",
        cidr="11.19.224.0/19",
        enable_dns_hostnames=True,
        enable_dns_support=True,
    )
    cdk.Tags.of(self.vpc).add("Name", "aws-data-wrangler")

    self.key = kms.Key(
        self,
        id="aws-data-wrangler-key",
        description="Aws Data Wrangler Test Key.",
        policy=iam.PolicyDocument(
            statements=[
                iam.PolicyStatement(
                    sid="Enable IAM User Permissions",
                    effect=iam.Effect.ALLOW,
                    actions=["kms:*"],
                    principals=[iam.AccountRootPrincipal()],
                    resources=["*"],
                )
            ]
        ),
    )
    kms.Alias(
        self, "aws-data-wrangler-key-alias",
        alias_name="alias/aws-data-wrangler-key",
        target_key=self.key,
    )

    self.bucket = s3.Bucket(
        self,
        id="aws-data-wrangler",
        block_public_access=s3.BlockPublicAccess(
            block_public_acls=True,
            block_public_policy=True,
            ignore_public_acls=True,
            restrict_public_buckets=True,
        ),
        lifecycle_rules=[
            s3.LifecycleRule(
                id="CleaningUp",
                enabled=True,
                expiration=cdk.Duration.days(1),
                abort_incomplete_multipart_upload_after=cdk.Duration.days(1),
            ),
        ],
        versioned=True,
    )

    glue_db = glue.Database(
        self,
        id="aws_data_wrangler_glue_database",
        database_name="aws_data_wrangler",
        location_uri=f"s3://{self.bucket.bucket_name}",
    )

    log_group = logs.LogGroup(
        self,
        id="aws_data_wrangler_log_group",
        retention=logs.RetentionDays.ONE_MONTH,
    )
    log_stream = logs.LogStream(
        self,
        id="aws_data_wrangler_log_stream",
        log_group=log_group,
    )

    cdk.CfnOutput(self, "Region", value=self.region)
    cdk.CfnOutput(self, "VPC", value=self.vpc.vpc_id,
                  export_name="aws-data-wrangler-base-VPC")
    cdk.CfnOutput(self, "PublicSubnet1", value=self.vpc.public_subnets[0].subnet_id,
                  export_name="aws-data-wrangler-base-PublicSubnet1")
    cdk.CfnOutput(self, "PublicSubnet2", value=self.vpc.public_subnets[1].subnet_id,
                  export_name="aws-data-wrangler-base-PublicSubnet2")
    cdk.CfnOutput(self, "PrivateSubnet", value=self.vpc.private_subnets[0].subnet_id,
                  export_name="aws-data-wrangler-base-PrivateSubnet")
    cdk.CfnOutput(self, "KmsKeyArn", value=self.key.key_arn,
                  export_name="aws-data-wrangler-base-KmsKeyArn")
    cdk.CfnOutput(self, "BucketName", value=self.bucket.bucket_name,
                  export_name="aws-data-wrangler-base-BucketName")
    cdk.CfnOutput(self, "GlueDatabaseName", value=glue_db.database_name)
    cdk.CfnOutput(self, "LogGroupName", value=log_group.log_group_name)
    cdk.CfnOutput(self, "LogStream", value=log_stream.log_stream_name)

def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    with open('./props/tasksetting.json', 'r') as f1:
        py_json1 = json.load(f1)
        ts = json.dumps(py_json1)

    # with open('./props/mappingrule.json', 'r') as f2:
    #     py_json2 = json.load(f2)
    #     mr = json.dumps(py_json2)

    with open('./props/config.json', 'r') as f2:
        configuration = json.load(f2)

    def getMappingrules(self, table_list):
        rules = []
        for index, value in enumerate(table_list, 1):
            rules.append({
                "rule-type": "selection",
                "rule-id": str(index),
                "rule-name": str(index),
                "object-locator": {
                    "schema-name": value['schemaName'],
                    "table-name": value['tableName']
                },
                "rule-action": "include",
                "filters": []
            })
        mapping_rules = {"rules": rules}
        return json.dumps(mapping_rules)

    # The code that defines your stack goes here
    S3Accessrole = _iam.Role(
        self, 'dmsrole',
        assumed_by=_iam.ServicePrincipal('dms.amazonaws.com'),
        managed_policies=[
            _iam.ManagedPolicy.from_aws_managed_policy_name('AmazonS3FullAccess')
        ])

    raw_bucket = s3.Bucket(self, 'rawbucket', bucket_name='rawbucket-datalake-cdk-oregon')
    raw_bucket.add_lifecycle_rule(
        enabled=configuration['s3LifecycleRule']['enabled'],
        expiration=core.Duration.days(configuration['s3LifecycleRule']['expiration']))

    # my_table = ddb.Table(self, id='dunamoTable', table_name='testcdktable',
    #                      partition_key=ddb.Attribute(name='lastname', type=ddb.AttributeType.STRING))

    dl_dms = _dms.CfnReplicationInstance(
        self, 'dmsreplication',
        replication_instance_class=configuration['DMS_instance_setting']['instance_class'],
        replication_instance_identifier='datalake-instance-cdk',
        allocated_storage=configuration['DMS_instance_setting']['allocated_storage'])

    source_endpoint = _dms.CfnEndpoint(
        self, 'sourceendpoint',
        endpoint_type='source',
        engine_name=configuration['engineName'],
        database_name=configuration['databaseName'],
        username=configuration['username'],
        password=configuration['password'],
        port=configuration['port'],
        server_name=configuration['serverName'],
    )

    target_endpoint = _dms.CfnEndpoint(
        self, 'targetendpoint',
        endpoint_type='target',
        engine_name='s3',
        s3_settings={
            'bucketName': raw_bucket.bucket_name,
            'serviceAccessRoleArn': S3Accessrole.role_arn
        },
        extra_connection_attributes='dataFormat=parquet')

    dms_task = _dms.CfnReplicationTask(
        self, 'data2lake-task',
        migration_type='full-load-and-cdc',
        replication_instance_arn=dl_dms.ref,
        source_endpoint_arn=source_endpoint.ref,
        target_endpoint_arn=target_endpoint.ref,
        replication_task_settings=ts,
        table_mappings=getMappingrules(self, configuration['tableList']))

    my_table = ddb.Table(
        self, id='dynamoTable',
        table_name='ControllerTable',
        partition_key=ddb.Attribute(name='path', type=ddb.AttributeType.STRING),
        billing_mode=ddb.BillingMode.PAY_PER_REQUEST)

    datalake_bucket = s3.Bucket(self, 'datalakebucket',
                                bucket_name='datalake-bucket-cdk-oregon')

    glue_role = _iam.Role(
        self, 'gluerole',
        assumed_by=_iam.ServicePrincipal('glue.amazonaws.com'),
        managed_policies=[
            _iam.ManagedPolicy.from_aws_managed_policy_name('service-role/AWSGlueServiceRole')
        ])

    raw_bucket.grant_read(glue_role)
    datalake_bucket.grant_read_write(glue_role)

    # Lake Formation settings.
    # If you have attached the managed policy 'AWSLakeFormationDataAdmin' to your own IAM
    # user, change that policy to allow "lakeformation:PutDataLakeSettings" so that the
    # data lake settings below can be applied by this CDK code.
    lake_admin_setting = _lakeformation.CfnDataLakeSettings(
        self, 'data-lake-GrantAdmin',
        admins=[
            _lakeformation.CfnDataLakeSettings.DataLakePrincipalProperty(
                data_lake_principal_identifier=configuration['executiveArn'])
        ])

    glue_database = _glue.Database(self, 'gluedatabase', database_name='data_lake_gluedb')
    glue_database.node.add_dependency(lake_admin_setting)

    glue_role_permission_inLakeFormation = _lakeformation.CfnPermissions(
        self, 'permission-glueRole',
        data_lake_principal=_lakeformation.CfnPermissions.DataLakePrincipalProperty(
            data_lake_principal_identifier=glue_role.role_arn),
        resource=_lakeformation.CfnPermissions.ResourceProperty(
            database_resource=_lakeformation.CfnPermissions.DatabaseResourceProperty(
                name=glue_database.database_name)),
        permissions=['ALL'])

    crawler = _glue.CfnCrawler(
        self, 'datalakecrawler',
        name='Crawler-datalake-cdk',
        role=glue_role.role_arn,
        targets={
            's3Targets': [{
                'path': 's3://' + datalake_bucket.bucket_name + '/datalake/'
            }]
        },
        database_name='data_lake_gluedb',
        configuration="{\"Version\":1.0,\"CrawlerOutput\":{\"Partitions\":{\"AddOrUpdateBehavior\":\"InheritFromTable\"},\"Tables\":{\"AddOrUpdateBehavior\":\"MergeNewColumns\"}}}"
    )

    initialload_script = S3Assets.Asset(self, 'initial-load-code',
                                        path='./Gluejob/InitialLoad.py')
    incrementalload_script = S3Assets.Asset(self, 'incremental-load-code',
                                            path='./Gluejob/IncrementalLoad.py')
    initialload_script.grant_read(glue_role)
    incrementalload_script.grant_read(glue_role)
    my_table.grant_full_access(glue_role)

    initial_load_job = _glue.CfnJob(
        self, 'initial-job',
        name='InitialLoad-cdk',
        command=_glue.CfnJob.JobCommandProperty(
            name='glueetl',
            python_version='3',
            script_location='s3://' + initialload_script.s3_bucket_name + '/' +
            initialload_script.s3_object_key),
        role=glue_role.role_arn,
        default_arguments={
            '--prefix': str(configuration['tableList']),
            '--bucket': raw_bucket.bucket_name,
            '--datalake_bucket': datalake_bucket.bucket_name,
            '--datalake_prefix': 'datalake/',
            '--region': CdkpyStack.of(self).region,
            '--controller_table_name': my_table.table_name
        },
        allocated_capacity=configuration['glue_job_setting']['job_capacity'],
        execution_property=_glue.CfnJob.ExecutionPropertyProperty(
            max_concurrent_runs=configuration['glue_job_setting']['max_concurrent_run_JobExecution']))

    incremental_load_job = _glue.CfnJob(
        self, 'increment-job',
        name='IncrementalLoad-cdk',
        command=_glue.CfnJob.JobCommandProperty(
            name='glueetl',
            script_location='s3://' + incrementalload_script.s3_bucket_name + '/' +
            incrementalload_script.s3_object_key,
            python_version='3'),
        role=glue_role.role_arn,
        default_arguments={
            '--prefix': str(configuration['tableList']),
            '--bucket': raw_bucket.bucket_name,
            '--datalake_bucket': datalake_bucket.bucket_name,
            '--datalake_prefix': 'datalake/',
            '--region': CdkpyStack.of(self).region,
            '--controller_table_name': my_table.table_name
        },
        allocated_capacity=2,
        execution_property=_glue.CfnJob.ExecutionPropertyProperty(max_concurrent_runs=1))

    job_trigger = _glue.CfnTrigger(
        self, 'datalake-glue-trigger',
        type='SCHEDULED',
        schedule=configuration['job_trigger_schedule'],
        start_on_creation=False,
        actions=[_glue.CfnTrigger.ActionProperty(job_name='IncrementalLoad-cdk')])

    dl_sns = _sns.Topic(self, 'datalake_sns', display_name='data-lake-sns')
    endpoint_email = configuration['emailSubscriptionList']
    for emails in endpoint_email:
        dl_sns.add_subscription(_subscrption.EmailSubscription(emails))

    # Another way to subscribe:
    # dl_subscription = _sns.Subscription(self, 'email-subscrption', topic=dl_sns,
    #                                     endpoint='*****@*****.**',
    #                                     protocol=_sns.SubscriptionProtocol.EMAIL)

    glue_events_target = _events_targets.SnsTopic(dl_sns)
    glue_events_rule = _events.Rule(
        self, 'gluejobevents-datalake',
        description='Used for tracking failed Glue jobs of the data lake',
        rule_name='dl-gluejob-event',
        event_pattern=_events.EventPattern(
            source=['aws.glue'],
            detail_type=['Glue Job State Change'],
            detail={
                "jobName": [initial_load_job.name],
                "state": ["FAILED"]
            }),
        targets=[glue_events_target])

    dms_subscription = _dms.CfnEventSubscription(
        self, 'dmsevents-datalake',
        sns_topic_arn=dl_sns.topic_arn,
        subscription_name='datalake-dmsevents',
        source_type='replication-task',
        event_categories=['failure'])

def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    s3_logs_bucket = s3.Bucket(
        self, "LogsBucket",
        encryption=s3.BucketEncryption.KMS_MANAGED,
        block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
        lifecycle_rules=[
            s3.LifecycleRule(
                abort_incomplete_multipart_upload_after=core.Duration.days(7),
                expiration=core.Duration.days(30))
        ])

    s3_data_bucket = s3.Bucket(
        self, "DataBucket",
        encryption=s3.BucketEncryption.KMS_MANAGED,
        block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
        server_access_logs_bucket=s3_logs_bucket,
        server_access_logs_prefix=f"s3accesslogs/{PROJECT_NAME}/")

    glue_database = glue.Database(self, "GlueDatabase", database_name=PROJECT_NAME)

    glue_table = glue.Table(
        self, "GlueTable",
        columns=[
            glue.Column(name="timestamp", type=glue.Type(input_string="int", is_primitive=True)),
            glue.Column(name="celcius", type=glue.Type(input_string="double", is_primitive=True)),
            glue.Column(name="fahrenheit", type=glue.Type(input_string="double", is_primitive=True))
        ],
        database=glue_database,
        data_format=glue.DataFormat(
            input_format=glue.InputFormat("org.apache.hadoop.mapred.TextInputFormat"),
            output_format=glue.OutputFormat(
                "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"),
            serialization_library=glue.SerializationLibrary(
                "org.openx.data.jsonserde.JsonSerDe")),
        table_name=PROJECT_NAME,
        encryption=glue.TableEncryption.S3_MANAGED,
        partition_keys=[
            glue.Column(name="year", type=glue.Type(input_string="int", is_primitive=True)),
            glue.Column(name="month", type=glue.Type(input_string="int", is_primitive=True)),
            glue.Column(name="day", type=glue.Type(input_string="int", is_primitive=True))
        ])

    glue_crawler_role = iam.Role(
        self, "GlueCrawlerRole",
        assumed_by=iam.ServicePrincipal("glue.amazonaws.com"),
        managed_policies=[
            iam.ManagedPolicy.from_aws_managed_policy_name("service-role/AWSGlueServiceRole")
        ])

    s3_data_bucket.grant_read(glue_crawler_role, objects_key_pattern=f"{PROJECT_PREFIX}/")
    s3_data_bucket.grant_put(glue_crawler_role, objects_key_pattern=f"{PROJECT_PREFIX}/")

    glue_crawler = glue.CfnCrawler(
        self, "GlueCrawler",
        role=glue_crawler_role.role_arn,
        database_name=glue_database.database_name,
        targets={
            "s3Targets": [{
                "path": f"{s3_data_bucket.bucket_name}/{PROJECT_PREFIX}/"
            }]
        },
        schedule={"scheduleExpression": "cron(30 04 * * ? *)"})

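# The stack above references PROJECT_NAME and PROJECT_PREFIX, which are not defined in this
# section. Hedged placeholders (names kept, values are assumptions):
PROJECT_NAME = "iot_temperature"      # assumed value; also used as the Glue database/table name
PROJECT_PREFIX = "iot-temperature"    # assumed value; S3 key prefix read and crawled above
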
def __init__(self, scope: core.Construct, id: str, config_dict, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    """ Create the datalake database """
    createDatalakeDB = glue.Database(
        self, "createDatalakeDB",
        database_name=config_dict['datalake_db_name'])

    core.CfnOutput(self, "createDatalakeDBName", value=createDatalakeDB.database_name)

    """ Create Comp Reg Table """
    createDatalakeCompRegTable = glue.Table(
        self, "createDatalakeCompRegTable",
        columns=[
            glue.Column(name="lot_compound_id", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="version_id", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="parent_id", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="smiles", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="parent_mw", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="salt_multiplicity", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="salt_name", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="formula_weight", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="parent_alias", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="stereochemistry", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="stereocomment", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="geometric_isomerism", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="parent_comment", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="parent_project", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="elnref", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="msmethod", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="msmass", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="provider", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="purity", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="puritymethod", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="nmrshifts", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="lotalias", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="lot_comment", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="lot_project", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="molfile", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="checksum", type=glue.Type(input_string="string", is_primitive=True))
        ],
        database=createDatalakeDB.from_database_arn(
            self, "GetDBArn", database_arn=createDatalakeDB.database_arn),
        data_format=glue.DataFormat(
            input_format=glue.InputFormat.PARQUET,
            output_format=glue.OutputFormat.PARQUET,
            serialization_library=glue.SerializationLibrary.PARQUET),
        table_name="tbl_compound_data",
        bucket=s3.Bucket.from_bucket_name(
            self, "getIBucket", bucket_name=config_dict['datalake_bucket_name']),
        compressed=True,
        description="This table contains data regarding compound registration coming from RDS",
        partition_keys=[
            glue.Column(name="dt", type=glue.Type(input_string="string", is_primitive=True))
        ],
        s3_prefix="compound_reg/compound_data/")

    core.CfnOutput(self, "createDatalakeCompRegTableName",
                   value=createDatalakeCompRegTable.table_name)

def __init__(self, scope: core.Construct, construct_id: str, **kwargs) -> None:
    super().__init__(scope, construct_id, **kwargs)

    # create db for glue schema
    glue_db = glue.Database(
        self, 'GlueDB',
        database_name='reddit_data',
    )

    # data schema
    glue_table = glue.Table(
        self, 'GlueTable',
        table_name='sentiment',
        columns=[
            glue.Column(name='@timestamp', type=glue.Schema.TIMESTAMP),
            glue.Column(name='id', type=glue.Schema.STRING),
            glue.Column(name='subreddit', type=glue.Schema.STRING),
            glue.Column(name='body', type=glue.Schema.STRING),
            glue.Column(name='is_submitter', type=glue.Schema.BOOLEAN),
            glue.Column(name='polarity', type=glue.Schema.FLOAT),
            glue.Column(name='subjectivity', type=glue.Schema.FLOAT),
            glue.Column(name='author', type=glue.Schema.STRING),
        ],
        database=glue_db,
        data_format=glue.DataFormat.PARQUET,
        bucket=s3.Bucket.from_bucket_arn(self, 'DataBucket', BUCKET_ARN),
        s3_prefix='reddit/',
    )

    # role assumed by firehose
    stream_role = iam.Role(
        self, 'FirehoseRole',
        assumed_by=iam.ServicePrincipal('firehose.amazonaws.com'),
        description='role used by Firehose to access s3 bucket',
    )

    # add s3 statement
    stream_role.add_to_policy(
        iam.PolicyStatement(
            resources=[BUCKET_ARN, f'{BUCKET_ARN}/*'],
            actions=[
                's3:AbortMultipartUpload',
                's3:GetBucketLocation',
                's3:GetObject',
                's3:ListBucket',
                's3:ListBucketMultipartUploads',
                's3:PutObject',
            ],
        ))

    # add glue statement
    stream_role.add_to_policy(
        iam.PolicyStatement(
            resources=[
                glue_table.table_arn,
                glue_db.database_arn,
                glue_db.catalog_arn,
            ],
            actions=[
                'glue:GetTable',
                'glue:GetTableVersion',
                'glue:GetTableVersions',
            ],
        ))

    # cloudwatch statement
    stream_role.add_to_policy(
        iam.PolicyStatement(
            resources=['*'],
            actions=['logs:PutLogEvents'],
        ))

    data_format_conversion_configuration = kf.CfnDeliveryStream.DataFormatConversionConfigurationProperty(
        enabled=True,
        input_format_configuration=kf.CfnDeliveryStream.InputFormatConfigurationProperty(
            deserializer=kf.CfnDeliveryStream.DeserializerProperty(
                hive_json_ser_de=kf.CfnDeliveryStream.HiveJsonSerDeProperty())),
        output_format_configuration=kf.CfnDeliveryStream.OutputFormatConfigurationProperty(
            serializer=kf.CfnDeliveryStream.SerializerProperty(
                parquet_ser_de=kf.CfnDeliveryStream.ParquetSerDeProperty())),
        schema_configuration=kf.CfnDeliveryStream.SchemaConfigurationProperty(
            database_name=glue_db.database_name,
            table_name=glue_table.table_name,
            role_arn=stream_role.role_arn,
            region='us-east-2',
        ),
    )

    s3_config = kf.CfnDeliveryStream.ExtendedS3DestinationConfigurationProperty(
        bucket_arn=BUCKET_ARN,  # temporary, will replace with env variable
        role_arn=stream_role.role_arn,
        data_format_conversion_configuration=data_format_conversion_configuration,
        prefix='reddit/',
        buffering_hints=kf.CfnDeliveryStream.BufferingHintsProperty(size_in_m_bs=64),
    )

    firehose = kf.CfnDeliveryStream(
        self, 'FirehoseStream',
        delivery_stream_name='RedditDataStream',
        extended_s3_destination_configuration=s3_config,
    )

    # add role dependency
    firehose.node.add_dependency(stream_role)

    # add ECS Fargate instance
    app_role = iam.Role(
        self, 'RedditStreamingAppRole',
        assumed_by=iam.ServicePrincipal('ecs-tasks.amazonaws.com'),
        description='Role used by the Reddit Streaming Application Fargate Task',
    )

    # add firehose permissions
    app_role.add_to_policy(
        iam.PolicyStatement(
            resources=[firehose.attr_arn],
            actions=[
                'firehose:DeleteDeliveryStream',
                'firehose:PutRecord',
                'firehose:PutRecordBatch',
                'firehose:UpdateDestination',
            ],
        ))

    # add ecs and cloudwatch permissions
    app_role.add_to_policy(
        iam.PolicyStatement(
            resources=['*'],
            actions=[
                'ecr:GetAuthorizationToken',
                'ecr:BatchCheckLayerAvailability',
                'ecr:GetDownloadUrlForLayer',
                'ecr:BatchGetImage',
                'logs:CreateLogStream',
                'logs:PutLogEvents',
            ],
        ))

    vpc = ec2.Vpc(self, 'RedditVpc', max_azs=3)
    cluster = ecs.Cluster(self, 'RedditCluster', vpc=vpc)

    task_definition = ecs.FargateTaskDefinition(
        self, 'TaskDefinition',
        memory_limit_mib=512,
        cpu=256,
        task_role=app_role,
    )

    task_definition.add_container(
        id='RedditStreamingApp',
        image=ecs.ContainerImage.from_asset('./sentiment_analysis'),
        command=['all'],
        environment={
            'FIREHOSE_STREAM_NAME': firehose.delivery_stream_name,
            'PRAW_CLIENT_SECRET': os.environ['PRAW_CLIENT_SECRET'],
            'PRAW_CLIENT_ID': os.environ['PRAW_CLIENT_ID'],
            'PRAW_USER_AGENT': os.environ['PRAW_USER_AGENT'],
        },
        logging=ecs.LogDriver.aws_logs(stream_prefix='reddit'),
    )

    container = ecs.FargateService(
        self, 'StreamingApplication',
        desired_count=1,
        task_definition=task_definition,
        cluster=cluster,
        assign_public_ip=True,
    )

def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    # Kinesis to lambda
    self.stream_lambda = kinesis_lambda.KinesisStreamsToLambda(
        self, 'clickstream',
        lambda_function_props=_lambda.FunctionProps(
            runtime=_lambda.Runtime.PYTHON_3_7,
            handler='index.lambda_handler',
            code=_lambda.Code.inline(get_code('send_data_to_firehose.py'))),
        kinesis_stream_props=kinesis.StreamProps(
            stream_name='clickstream',
            retention_period=core.Duration.days(1),
            shard_count=4),
        kinesis_event_source_props=lambda_sources.KinesisEventSourceProps(
            starting_position=_lambda.StartingPosition.TRIM_HORIZON,
            batch_size=1))

    # Lambda to produce data
    self.produce_fake_data = _lambda.Function(
        self, 'produce_data',
        runtime=_lambda.Runtime.PYTHON_3_7,
        timeout=core.Duration.seconds(90),
        handler='index.lambda_handler',
        code=_lambda.Code.inline(get_code('produce_data.py')),
        environment={
            'STREAM_NAME': self.stream_lambda.kinesis_stream.stream_name
        })
    self.stream_lambda.kinesis_stream.grant_read_write(self.produce_fake_data)

    # EventBridge to activate my function above
    self.event_rule = events.Rule(
        self, 'scheduledRule',
        schedule=events.Schedule.expression('rate(1 minute)'))
    self.event_rule.add_target(targets.LambdaFunction(self.produce_fake_data))

    # S3 Bucket
    self.bucket = s3.Bucket(self, 'data-clicks-lake',
                            removal_policy=core.RemovalPolicy.DESTROY,
                            auto_delete_objects=True)

    # Glue
    self.glue_db_analytical = glue.Database(
        self, 'analytic_clickstream',
        database_name='clickstream_db',
        location_uri=None,
    )

    self.glue_table_analytical = glue.Table(
        self, 'analytical-table',
        table_name='analytical-table',
        columns=[
            glue_column('custid', 'int'),
            glue_column('trafficfrom', 'string'),
            glue_column('url', 'string'),
            glue_column('device', 'string'),
            glue_column('touchproduct', 'int'),
            glue_column('trans_timestamp', 'string')
        ],
        database=self.glue_db_analytical,
        data_format=glue.DataFormat.PARQUET,
        bucket=self.bucket,
        s3_prefix='kinesis/',
    )

    # Firehose
    iam_role_firehose_analytical = self.create_firehose_role()
    self.bucket.grant_read_write(iam_role_firehose_analytical)

    firehose_props = FirehoseProps(
        bucket=self.bucket,
        role=iam_role_firehose_analytical,
        stream=self.stream_lambda.kinesis_stream,
        glue_db=self.glue_db_analytical,
        glue_table=self.glue_table_analytical)

    self.firehose = FirehoseLib(self, 'firehose_clickstream', firehose_props)

    # Elasticsearch
    self.es_domain = ElasticsearchLib(self, 'ES-clickstream-domain').es_domain

    # Lambda to send data to Elasticsearch
    self.send_data_to_elasticsearch = lambda_python.PythonFunction(
        self, 'clickstream_to_es',
        entry='./analytics_ml_flow/lambda/lambda_with_requirements/',
        handler='handler',
        timeout=core.Duration.seconds(180),
        index='Kinesis_ES.py',
        environment={
            'ES_HOST_HTTP': self.es_domain.domain_endpoint,
            'ES_INDEX': 'clickstream',
            'ES_IND_TYPE': 'transactions',
            'ES_REGION': 'us-west-2',
        })
    self.es_domain.grant_index_read_write('clickstream', self.send_data_to_elasticsearch)
    self.es_domain.grant_read_write(self.send_data_to_elasticsearch)

    stream_source = lambda_sources.KinesisEventSource(
        self.stream_lambda.kinesis_stream,
        starting_position=_lambda.StartingPosition.TRIM_HORIZON,
        batch_size=1)
    self.stream_lambda.kinesis_stream.grant_read(self.send_data_to_elasticsearch)
    self.send_data_to_elasticsearch.add_event_source(stream_source)

    # Glue Crawler
    crawler_role = self.create_crawler_permissions()
    glue_props = GlueCrawlerProps(bucket=self.bucket, role=crawler_role)
    self.glue_crawler = GlueCrawlerLib(self, 'glueCrawler', glue_props)

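# The snippet above builds its columns with a glue_column() helper that is not defined in
# this section. A minimal sketch of such a helper, assuming it wraps glue.Column with a
# primitive Hive type string (an assumption, not the original implementation):
def glue_column(name: str, type_name: str) -> glue.Column:
    """Return a Glue column with the given name and primitive type (e.g. 'int', 'string')."""
    return glue.Column(name=name, type=glue.Type(input_string=type_name, is_primitive=True))
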
def __init__(self, scope: core.Construct, id: str, source_bucket_name: str,
             glue_database_name: str, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    # Get the source bucket - this object is an IBucket proxy, not a Bucket construct,
    # so it cannot be used to add an event directly. Instead, a custom resource adds the
    # event trigger later.
    source_bucket = s3.Bucket.from_bucket_name(self, "MySourceBucket",
                                               bucket_name=source_bucket_name)

    # Create the new destination bucket - this bucket holds the csv file that contains the
    # FITS header information. The name of the bucket will be
    # <stack-id>-fitsstorebucketXXXXXXXX-YYYYYYYYYYYYY,
    # e.g. my-fits-datalake-fitsstorebucket1234567f-098765432d
    target_bucket = s3.Bucket(self, "FITSSTORE_BUCKET")

    # Add the astropy and numpy layers for the lambda function that is used as the event
    # trigger on the source_bucket
    layer_astropy = lambda_.LayerVersion(
        self, 'AstroFitsioLayer',
        code=lambda_.Code.from_asset("resources_layer/astropy.zip"),
        compatible_runtimes=[lambda_.Runtime.PYTHON_3_7])
    # use an AWS provided layer for numpy
    layer_numpy = lambda_.LayerVersion.from_layer_version_arn(
        self, "NumpyLayer",
        "arn:aws:lambda:us-east-1:668099181075:layer:AWSLambda-Python37-SciPy1x:22")

    # Create the FITS header extractor lambda function and pass the FITSSTORE_BUCKET
    # to it as an environment variable
    handler = lambda_.Function(
        self, "FITSHeaderExtractorHandler",
        runtime=lambda_.Runtime.PYTHON_3_7,
        code=lambda_.Code.asset("resources"),
        handler="fits_header_extractor.fits_header_extractor_handler",
        environment=dict(FITSSTORE_BUCKET=target_bucket.bucket_name),
        layers=[layer_astropy, layer_numpy])

    # grant read access to handler on source bucket
    source_bucket.grant_read(handler)

    # Give the lambda a resource-based policy; both source_arn and source_account are
    # needed for security reasons
    handler.add_permission(
        's3-trigger-lambda-s3-invoke-function',
        principal=iam_.ServicePrincipal('s3.amazonaws.com'),
        action='lambda:InvokeFunction',
        source_arn=source_bucket.bucket_arn,
        source_account=self.account)

    # Grant access to the handler - this is a lot easier than adding policies,
    # but not all constructs support it
    target_bucket.grant_read_write(handler)

    # Mapping the put event to the handler directly doesn't work, because source_bucket
    # is not really a Bucket object (it is an IBucket proxy). You can use this approach
    # if the bucket is created as a new Bucket object:
    # notification = s3_notifications.LambdaDestination(handler)
    # source_bucket.add_object_created_notification(self, notification)

    # Use a custom resource to add an event trigger on the source bucket - the custom
    # resource creation makes an SDK call to create the event notification on the bucket.
    # Action reference: https://docs.aws.amazon.com/AWSJavaScriptSDK/latest/AWS/S3.html
    # Events reference: https://docs.aws.amazon.com/AmazonS3/latest/dev/NotificationHowTo.html
    custom_s3_resource = custom_resources_.AwsCustomResource(
        self, 's3-putobject-custom-notification-resource',
        policy=custom_resources_.AwsCustomResourcePolicy.from_statements([
            iam_.PolicyStatement(effect=iam_.Effect.ALLOW,
                                 resources=['*'],
                                 actions=['s3:PutBucketNotification'])
        ]),
        on_create=custom_resources_.AwsSdkCall(
            service="S3",
            action="putBucketNotificationConfiguration",
            parameters={
                "Bucket": source_bucket.bucket_name,
                "NotificationConfiguration": {
                    "LambdaFunctionConfigurations": [{
                        "Events": ['s3:ObjectCreated:*', 's3:ObjectRemoved:*'],
                        "LambdaFunctionArn": handler.function_arn,
                        "Filter": {
                            "Key": {
                                "FilterRules": [{
                                    'Name': 'suffix',
                                    'Value': 'fits'
                                }]
                            }
                        }
                    }]
                }
            },
            physical_resource_id=custom_resources_.PhysicalResourceId.of(
                f's3-notification-resource-{str(uuid.uuid1())}'),
            region=self.region))

    # Make sure the lambda function is created first
    custom_s3_resource.node.add_dependency(
        handler.permissions_node.find_child('s3-trigger-lambda-s3-invoke-function'))

    # Create a glue crawler to build the data catalog.
    # Step 1. Create a role for AWS Glue
    glue_role = iam_.Role(
        self, "glue_role",
        assumed_by=iam_.ServicePrincipal('glue.amazonaws.com'),
        managed_policies=[
            iam_.ManagedPolicy.from_managed_policy_arn(
                self, 'MyFitsCrawlerGlueRole',
                managed_policy_arn='arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole')
        ])
    # The glue role needs "*" read/write - otherwise the crawler will not be able to create
    # tables (and there are no error messages in the crawler logs)
    glue_role.add_to_policy(
        iam_.PolicyStatement(
            actions=['s3:GetObject', 's3:PutObject', 'lakeformation:GetDataAccess'],
            effect=iam_.Effect.ALLOW,
            resources=['*']))

    # Step 2. Create a database in the data catalog
    db = glue_.Database(self, "MyFitsDatabase", database_name=glue_database_name)

    # Step 3. Create a crawler named "fitsdatalakecrawler-<hex>" and schedule it to run
    # every 15 minutes. You can change the frequency based on your needs.
    # Cron schedule format: cron(Minutes Hours Day-of-month Month Day-of-week Year)
    glue_.CfnCrawler(
        self, "fits-datalake-crawler",
        database_name=glue_database_name,
        role=glue_role.role_arn,
        schedule={"scheduleExpression": "cron(0/15 * * * ? *)"},
        targets={"s3Targets": [{"path": target_bucket.bucket_name}]},
    )

    # When your AWS Lake Formation Data Catalog settings are not set to
    # "Use only IAM access control for new databases" or
    # "Use only IAM access control for new tables in new databases",
    # you need to grant additional permissions on the data catalog database.
    # In order for the crawler to run, add some permissions to Lake Formation.
    location_resource = lakeformation_.CfnResource(
        self, "MyFitsDatalakeLocationResource",
        resource_arn=target_bucket.bucket_arn,
        use_service_linked_role=True)

    lakeformation_.CfnPermissions(
        self, "MyFitsDatalakeDatabasePermission",
        data_lake_principal=lakeformation_.CfnPermissions.DataLakePrincipalProperty(
            data_lake_principal_identifier=glue_role.role_arn),
        resource=lakeformation_.CfnPermissions.ResourceProperty(
            database_resource=lakeformation_.CfnPermissions.DatabaseResourceProperty(
                name=db.database_name)),
        permissions=["ALTER", "DROP", "CREATE_TABLE"],
    )

    location_permission = lakeformation_.CfnPermissions(
        self, "MyFitsDatalakeLocationPermission",
        data_lake_principal=lakeformation_.CfnPermissions.DataLakePrincipalProperty(
            data_lake_principal_identifier=glue_role.role_arn),
        resource=lakeformation_.CfnPermissions.ResourceProperty(
            data_location_resource=lakeformation_.CfnPermissions.DataLocationResourceProperty(
                s3_resource=target_bucket.bucket_arn)),
        permissions=["DATA_LOCATION_ACCESS"],
    )
    # Make sure the location resource is created first
    location_permission.node.add_dependency(location_resource)

def __init__(self, scope: core.Construct, id: str, region_name: str, db_name: str, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    # CloudTrail
    bucket = s3.Bucket(self, 'TrailBucket', versioned=True)
    trail = cloudtrail.Trail(self, 'CloudTrail', bucket=bucket)
    db = glue.Database(self, 'cloudtrail', database_name=db_name)
    awg = core.CfnResource(
        self, 'AthenaWorkGroup',
        type="AWS::Athena::WorkGroup",
        properties={
            "Name": f"{db_name}",
            "State": "ENABLED",
            "WorkGroupConfiguration": {
                "ResultConfiguration": {
                    "OutputLocation": f"s3://{bucket.bucket_name}/athena_output/"
                }
            }
        })

    # Pipeline for working on data
    project = codebuild.Project(
        self, 'learner_build',
        build_spec=codebuild.BuildSpec.from_source_filename('buildspec.yml'),
        environment_variables={
            'arn': {'value': '-- Put ARN Here --'},
            'athena_database': {'value': db_name},
            'region_name': {'value': region_name},
            'bucket': {'value': bucket.bucket_name}
        },
        source=codebuild.Source.s3(bucket=bucket, path='pipeline/learner.zip'))

    project.add_to_role_policy(iam.PolicyStatement(actions=['athena:*'], resources=['*']))
    project.add_to_role_policy(iam.PolicyStatement(actions=['iam:*'], resources=['*']))
    project.add_to_role_policy(iam.PolicyStatement(actions=['glue:*'], resources=['*']))
    project.add_to_role_policy(iam.PolicyStatement(actions=['s3:*'], resources=['*']))

    # Lambdas and API Gateway
    api = agw.RestApi(self, "learner-api",
                      rest_api_name="Learner Service",
                      description="System to learn roles")

    switcher = lambda_.Function(
        self, "Switcher",
        runtime=lambda_.Runtime.PYTHON_3_8,
        code=lambda_.Code.from_asset("lambdas/switcher"),
        handler="main.handler",
    )
    switcher.add_to_role_policy(iam.PolicyStatement(actions=['iam:*'], resources=['*']))

    frontend = lambda_.Function(
        self, "Frontend",
        runtime=lambda_.Runtime.PYTHON_3_8,
        code=lambda_.Code.from_asset("lambdas/frontend"),
        handler="main.handler",
    )

    learner = lambda_.Function(
        self, "Learner",
        runtime=lambda_.Runtime.PYTHON_3_8,
        code=lambda_.Code.from_asset("lambdas/learner"),
        handler="main.handler",
        environment={
            'codebuild': project.project_name,
            'region_name': region_name
        })
    learner.add_to_role_policy(
        iam.PolicyStatement(actions=['codebuild:StartBuild'],
                            resources=[project.project_arn]))

    get_switcher_integration = agw.LambdaIntegration(
        switcher, request_templates={"application/json": '{ "statusCode": "200" }'})
    get_frontend_integration = agw.LambdaIntegration(
        frontend, request_templates={"application/json": '{ "statusCode": "200" }'})
    get_learner_integration = agw.LambdaIntegration(
        learner, request_templates={"application/json": '{ "statusCode": "200" }'})

    api.root.add_method("GET", get_frontend_integration)
    switch = api.root.add_resource('switch')
    switch.add_method("GET", get_switcher_integration)
    learn = api.root.add_resource('learn')
    learn.add_method("GET", get_learner_integration)

    # Outputs
    core.CfnOutput(self, 'BucketName', value=bucket.bucket_name)

def create_glue_resources(self) -> None:
    '''Creates Glue Database and Tables'''
    if not hasattr(self, 'glue_attr'):
        self.prepare_glue_attr_types()

    col = aws_glue.Column

    # Kinesis and Athena depend on data schema declarations that should
    # be in a Database and Table in AWS Glue
    self.glue_db_analytical = aws_glue.Database(
        self, 'sls-blog-analytical-db',
        database_name='sls-blog-analytical',
        location_uri=None,
    )

    self.glue_table_analytical = aws_glue.Table(
        self, 'analytical-table',
        table_name='analytical-table',
        columns=[
            col(name='id', type=self.glue_attr_string),
            col(name='publish_timestamp', type=self.glue_attr_timestamp),
            col(name='publisher_email', type=self.glue_attr_string),
            col(name='publisher_name', type=self.glue_attr_string),
            col(name='item_type', type=self.glue_attr_string),
            col(name='title', type=self.glue_attr_string),
            col(name='body', type=self.glue_attr_string),
        ],
        database=self.glue_db_analytical,
        data_format=aws_glue.DataFormat.PARQUET,
        bucket=self.bucket_analytical,
        s3_prefix='kinesis/',
    )

    self.glue_table_likes = aws_glue.Table(
        self, 'likes-table',
        table_name='likes-table',
        columns=[
            col(name='id', type=self.glue_attr_string),
            col(name='like', type=self.glue_attr_integer),
        ],
        database=self.glue_db_analytical,
        data_format=aws_glue.DataFormat.PARQUET,
        bucket=self.bucket_likes,
        s3_prefix='kinesis/',
    )

    self.glue_table_apirequests = aws_glue.Table(
        self, 'apirequests-table',
        table_name='apirequests-table',
        columns=[
            col(name='id', type=self.glue_attr_string),
            col(name='item_type', type=self.glue_attr_string),
            col(name='http_method', type=self.glue_attr_string),
            col(name='timestamp', type=self.glue_attr_timestamp),
            col(name='datetime', type=self.glue_attr_date),
            col(name='ip_address', type=self.glue_attr_string),
            col(name='user_agent', type=self.glue_attr_string),
            col(name='origin', type=self.glue_attr_string),
            col(name='country_code', type=self.glue_attr_string),
            col(name='device_type', type=self.glue_attr_string),
            col(name='action', type=self.glue_attr_string),
            col(name='article_id', type=self.glue_attr_string),
        ],
        database=self.glue_db_analytical,
        data_format=aws_glue.DataFormat.PARQUET,
        bucket=self.bucket_apirequests,
        s3_prefix='kinesis/',
    )

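# create_glue_resources() expects prepare_glue_attr_types() to have populated the
# self.glue_attr_* fields it uses. A hedged sketch of that method, assuming it simply
# caches the predefined aws_glue.Schema types (not the original implementation):
def prepare_glue_attr_types(self) -> None:
    '''Cache the Glue column types shared by the analytical tables.'''
    self.glue_attr = True  # marker checked by create_glue_resources()
    self.glue_attr_string = aws_glue.Schema.STRING
    self.glue_attr_integer = aws_glue.Schema.INTEGER
    self.glue_attr_timestamp = aws_glue.Schema.TIMESTAMP
    self.glue_attr_date = aws_glue.Schema.DATE
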
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    s3_org_data = _s3.Bucket(self, ORIGINAL_DATA_BUCKET_NAME,
                             bucket_name=ORIGINAL_DATA_BUCKET_NAME,
                             removal_policy=core.RemovalPolicy.RETAIN)

    s3_transformed_data = _s3.Bucket(self, TRANSFORMED_DATA_BUCKET_NAME,
                                     bucket_name=TRANSFORMED_DATA_BUCKET_NAME,
                                     removal_policy=core.RemovalPolicy.RETAIN)

    # title-read
    s3_deployment.BucketDeployment(
        self, "s3-deployment-{}".format(TITLE_READ),
        sources=[s3_deployment.Source.asset("data/{}/".format(TITLE_READ))],
        destination_bucket=s3_org_data,
        destination_key_prefix="{}/".format(TITLE_READ))

    # title
    s3_deployment.BucketDeployment(
        self, "s3-deployment-{}".format(TITLE),
        sources=[s3_deployment.Source.asset("data/{}/".format(TITLE))],
        destination_bucket=s3_org_data,
        destination_key_prefix="{}/".format(TITLE))

    # user
    s3_deployment.BucketDeployment(
        self, "s3-deployment-{}".format(USER),
        sources=[s3_deployment.Source.asset("data/{}/".format(USER))],
        destination_bucket=s3_org_data,
        destination_key_prefix="{}/".format(USER))

    statement = iam.PolicyStatement(
        actions=[
            "s3:*", "glue:*", "iam:ListRolePolicies", "iam:GetRole", "iam:GetRolePolicy"
        ],
        resources=["*"])
    write_to_s3_policy = iam.PolicyDocument(statements=[statement])

    glue_role = iam.Role(
        self, 'GlueCrawlerRole-dna',
        role_name='GlueCrawlerRole-dna',
        inline_policies={'write_to_s3_policy': write_to_s3_policy},
        assumed_by=iam.ServicePrincipal('glue.amazonaws.com'),
        managed_policies=[
            iam.ManagedPolicy.from_aws_managed_policy_name('service-role/AWSGlueServiceRole')
        ])

    # TODO add IAM role for ctas lambda
    dna_database = glue.Database(self, "dna-glue-database-id",
                                 database_name=GLUE_DATABASE_NAME)

    # create glue table
    title_read_table = glue.Table(
        self, "{}-table-id".format(TITLE_READ),
        table_name="{}_table".format(TITLE_READ).replace("-", "_"),
        database=dna_database,
        columns=[
            {"name": "USER_ID", "type": glue.Schema.STRING},
            {"name": "ITEM_ID", "type": glue.Schema.STRING},
            {"name": "TIMESTAMP", "type": glue.Schema.BIG_INT},
            {"name": "TITLE", "type": glue.Schema.STRING},
            {"name": "EVENT_TYPE", "type": glue.Schema.STRING},
        ],
        data_format=glue.DataFormat.CSV,
        bucket=s3_org_data,
        s3_prefix=TITLE_READ)

    title_table = glue.Table(
        self, "{}-table-id".format(TITLE),
        table_name="{}_table".format(TITLE).replace("-", "_"),
        database=dna_database,
        columns=[
            {"name": "ITEM_ID", "type": glue.Schema.STRING},
            {"name": "CREATION_TIMESTAMP", "type": glue.Schema.BIG_INT},
            {"name": "TITLE", "type": glue.Schema.STRING},
            {"name": "TAG", "type": glue.Schema.STRING},
        ],
        data_format=glue.DataFormat.CSV,
        bucket=s3_org_data,
        s3_prefix=TITLE)

    user_table = glue.Table(
        self, "{}-table-id".format(USER),
        table_name="{}_table".format(USER).replace("-", "_"),
        database=dna_database,
        columns=[
            {"name": "USER_ID", "type": glue.Schema.STRING},
            {"name": "NAME", "type": glue.Schema.STRING},
            {"name": "EMAIL", "type": glue.Schema.STRING},
            {"name": "GENDER", "type": glue.Schema.STRING, "categorical": True},
            {"name": "AGE", "type": glue.Schema.BIG_INT, "categorical": True},
        ],
        data_format=glue.DataFormat.CSV,
        bucket=s3_org_data,
        s3_prefix=USER)

    _athena.CfnWorkGroup(self, "athena_workgroup_id", name=ATHENA_WORKGROUP)

    ctas_lambda_trigger = _event.Rule(
        self, "ctas-lambda-trigger-event-id",
        rule_name="ctas-lambda-trigger-event",
        schedule=_event.Schedule.cron(minute="10", hour="*"))

    s3_statement = iam.PolicyStatement(
        effect=iam.Effect.ALLOW,
        # resources=[s3_bucket.bucket_arn, "{}/*".format(s3_bucket.bucket_arn)],
        resources=["*"],
        actions=["s3:*"])
    athena_statement = iam.PolicyStatement(
        effect=iam.Effect.ALLOW,
        resources=["*"],
        actions=["athena:StartQueryExecution", "glue:*"])

    ctas_lambda_func = _lambda.Function(
        self, "CTAS_query",
        function_name="CTAS_query",
        runtime=_lambda.Runtime.PYTHON_3_7,
        code=_lambda.Code.asset("src/lambda"),
        handler="ctas_lambda.lambda_handler",
        description="CTAS query to transform AVRO file, batch execution",
        environment={
            "BUCKET_NAME": s3_transformed_data.bucket_name,
            "DATABASE_NAME": GLUE_DATABASE_NAME,
            "ATHENA_WORKGROUP": ATHENA_WORKGROUP
        },
        timeout=core.Duration.minutes(3))

    ctas_lambda_func.add_to_role_policy(s3_statement)
    ctas_lambda_func.add_to_role_policy(athena_statement)

    ctas_lambda_trigger.add_target(_target.LambdaFunction(ctas_lambda_func))

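# The stack above relies on several module-level constants that are not shown in this
# section. Hedged placeholders (names kept, all values are assumptions):
ORIGINAL_DATA_BUCKET_NAME = "original-data-dna-example"        # assumed bucket name
TRANSFORMED_DATA_BUCKET_NAME = "transformed-data-dna-example"  # assumed bucket name
TITLE_READ = "title-read"  # dataset prefixes; hyphens are stripped when deriving table names
TITLE = "title"
USER = "user"
GLUE_DATABASE_NAME = "dna_database"  # assumed Glue database name
ATHENA_WORKGROUP = "dna-workgroup"   # assumed Athena workgroup name
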