def __init__(self, scope: core.Construct, data_lake: DataLake, **kwargs) -> None:
    """Create the Glue catalog resources for the raw zone of the data lake.

    Two resources are defined:

    * a crawler, scheduled every 15 minutes, that scans the
      ``atomic_events`` prefix of the raw bucket, and
    * an ``orders`` table stored under the ``orders`` prefix of the same
      bucket and read with the OpenCSV serde.

    :param scope: parent construct.
    :param data_lake: provides the environment name, the Glue role, the raw
        bucket and the raw database used by both resources.
    :param kwargs: forwarded unchanged to the parent constructor.
    """
    self.env = data_lake.env.value
    super().__init__(scope, id=f'{self.env}-glue-catalog', **kwargs)

    crawler_name = f'{self.env}-atomic-events-crawler'
    self.atomic_events_crawler = glue.CfnCrawler(
        self,
        crawler_name,
        name=crawler_name,
        description=
        'Crawler to detect schema of data sored in data lake raw, atomic events',
        schedule=glue.CfnCrawler.ScheduleProperty(
            schedule_expression='cron(0/15 * * * ? *)'),
        role=data_lake.data_lake_role.role_arn,
        targets=glue.CfnCrawler.TargetsProperty(s3_targets=[
            glue.CfnCrawler.S3TargetProperty(
                path=
                f's3://{data_lake.data_lake_raw_bucket.bucket_name}/atomic_events'
            )
        ]),
        database_name=data_lake.data_lake_raw_database.database_name)

    # Column types must be Glue/Hive catalog type names: 'timestamp' and
    # 'int' replace the previous 'datetime' and 'integer', which are not
    # catalog types.
    # NOTE(review): confirm the timestamp text written by DMS is parseable
    # by the OpenCSV serde used below.
    self.orders_table = glue.Table(
        self,
        f'{self.env}-orders-table',
        table_name='orders',
        description='orders captured from Postgres using DMS CDC',
        database=data_lake.data_lake_raw_database,
        compressed=True,
        data_format=glue.DataFormat(
            input_format=glue.InputFormat.TEXT,
            output_format=glue.OutputFormat.HIVE_IGNORE_KEY_TEXT,
            serialization_library=glue.SerializationLibrary.OPEN_CSV),
        s3_prefix='orders',
        bucket=data_lake.data_lake_raw_bucket,
        columns=[
            glue.Column(name='created_at',
                        type=glue.Type(input_string='timestamp',
                                       is_primitive=True)),
            glue.Column(name='order_id',
                        type=glue.Type(input_string='int',
                                       is_primitive=True)),
            glue.Column(name='product_name',
                        type=glue.Type(input_string='string',
                                       is_primitive=True)),
            glue.Column(name='value',
                        type=glue.Type(input_string='float',
                                       is_primitive=True)),
        ])
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    """Define the ``poc`` Glue database and its JSON-backed ``my_table``.

    The table reads from the ``test_data`` prefix of the existing bucket
    ``my_bucket`` and is partitioned by the string column ``dt``.

    :param scope: parent construct.
    :param id: construct id, forwarded to the parent constructor.
    :param kwargs: forwarded unchanged to the parent constructor.
    """
    super().__init__(scope, id, **kwargs)

    self._region = 'aws_region'
    self._account_id = 'aws_account_id'

    def primitive(type_name):
        # Every column of this table uses a primitive Glue type.
        return glue.Type(input_string=type_name, is_primitive=True)

    source_bucket = s3.Bucket.from_bucket_name(self, 'my_bucket_id',
                                               'my_bucket')
    poc_database = glue.Database(self,
                                 id='my_database_id',
                                 database_name='poc')

    data_columns = [
        glue.Column(name='col1', type=primitive('string')),
        glue.Column(name='col2', type=primitive('int')),
    ]
    partition_columns = [
        glue.Column(name='dt', type=primitive('string')),
    ]

    # Plain-text input/output classes combined with the OpenX JSON serde.
    json_format = glue.DataFormat(
        input_format=glue.InputFormat(
            'org.apache.hadoop.mapred.TextInputFormat'),
        output_format=glue.OutputFormat(
            'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'),
        serialization_library=glue.SerializationLibrary(
            'org.openx.data.jsonserde.JsonSerDe'),
    )

    glue.Table(
        self,
        id='my_table_id',
        database=poc_database,
        table_name='my_table',
        columns=data_columns,
        partition_keys=partition_columns,
        bucket=source_bucket,
        s3_prefix='test_data',
        data_format=json_format,
    )
def __init__(
    self,
    scope: core.Construct,
    glue_database: BaseDataLakeGlueDatabase,
    glue_role: BaseDataLakeGlueRole,
    **kwargs,
) -> None:
    """Define the Parquet ``orders`` table inside *glue_database*.

    The deploy environment and the data lake bucket are both taken from
    *glue_database*; the construct id is derived from the deploy
    environment value.

    :param scope: parent construct.
    :param glue_database: database that owns the table.
    :param glue_role: stored on the instance; not otherwise used here.
    :param kwargs: forwarded unchanged to the parent constructor.
    """
    self.glue_role = glue_role
    self.glue_database = glue_database
    self.deploy_env = glue_database.deploy_env
    self.data_lake_bucket = glue_database.data_lake_bucket
    self.obj_name = f"glue-{self.deploy_env.value}-orders-table"

    # (column name, Glue type string) in table order.
    column_specs = (
        ("op", "string"),
        ("extracted_at", "string"),
        ("created_at", "timestamp"),
        ("order_id", "int"),
        ("product_name", "string"),
        ("value", "double"),
    )
    table_columns = [
        glue.Column(
            name=column_name,
            type=glue.Type(input_string=type_name, is_primitive=True),
        )
        for column_name, type_name in column_specs
    ]

    super().__init__(
        scope,
        self.obj_name,
        table_name="orders",
        description="orders captured from Postgres using DMS CDC",
        database=self.glue_database,
        compressed=True,
        data_format=glue.DataFormat.PARQUET,
        s3_prefix="orders/public/orders",
        bucket=self.data_lake_bucket,
        columns=table_columns,
        **kwargs,
    )
def glue_column(name, col_type, is_primitive=True):
    """Build a ``glue.Column`` called *name* whose type wraps *col_type*.

    :param name: column name.
    :param col_type: type string passed as ``input_string`` to ``glue.Type``.
    :param is_primitive: passed through to ``glue.Type``; defaults to True.
    :return: the constructed ``glue.Column``.
    """
    column_type = glue.Type(input_string=col_type, is_primitive=is_primitive)
    return glue.Column(name=name, type=column_type)
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    """Provision the buckets, Glue catalog objects and crawler for the project.

    Resources created, in order:

    * a logs bucket (objects expire after 30 days) and a data bucket whose
      server access logs go to the logs bucket;
    * a Glue database and a JSON table named after ``PROJECT_NAME``,
      partitioned by ``year``/``month``/``day``;
    * an IAM role for Glue with access to the ``PROJECT_PREFIX`` objects of
      the data bucket;
    * a crawler over that prefix, scheduled with ``cron(30 04 * * ? *)``.

    :param scope: parent construct.
    :param id: construct id, forwarded to the parent constructor.
    :param kwargs: forwarded unchanged to the parent constructor.
    """
    super().__init__(scope, id, **kwargs)

    # NOTE(review): S3 server access log delivery may require SSE-S3 on the
    # target bucket rather than KMS - confirm logs actually arrive here.
    s3_logs_bucket = s3.Bucket(
        self,
        "LogsBucket",
        encryption=s3.BucketEncryption.KMS_MANAGED,
        block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
        lifecycle_rules=[
            s3.LifecycleRule(
                abort_incomplete_multipart_upload_after=core.Duration.days(7),
                expiration=core.Duration.days(30))
        ])

    s3_data_bucket = s3.Bucket(
        self,
        "DataBucket",
        encryption=s3.BucketEncryption.KMS_MANAGED,
        block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
        server_access_logs_bucket=s3_logs_bucket,
        server_access_logs_prefix=f"s3accesslogs/{PROJECT_NAME}/")

    glue_database = glue.Database(self,
                                  "GlueDatabase",
                                  database_name=PROJECT_NAME)

    # NOTE(review): no bucket/s3_prefix is passed here, so the table is not
    # pointed at s3_data_bucket - confirm that is intended.
    glue.Table(
        self,
        "GlueTable",
        columns=[
            glue.Column(name="timestamp",
                        type=glue.Type(input_string="int",
                                       is_primitive=True)),
            glue.Column(name="celcius",
                        type=glue.Type(input_string="double",
                                       is_primitive=True)),
            glue.Column(name="fahrenheit",
                        type=glue.Type(input_string="double",
                                       is_primitive=True))
        ],
        database=glue_database,
        data_format=glue.DataFormat(
            input_format=glue.InputFormat(
                "org.apache.hadoop.mapred.TextInputFormat"),
            output_format=glue.OutputFormat(
                "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"
            ),
            serialization_library=glue.SerializationLibrary(
                "org.openx.data.jsonserde.JsonSerDe")),
        table_name=PROJECT_NAME,
        encryption=glue.TableEncryption.S3_MANAGED,
        partition_keys=[
            glue.Column(name="year",
                        type=glue.Type(input_string="int",
                                       is_primitive=True)),
            glue.Column(name="month",
                        type=glue.Type(input_string="int",
                                       is_primitive=True)),
            glue.Column(name="day",
                        type=glue.Type(input_string="int",
                                       is_primitive=True))
        ])

    # The AWS managed Glue service policy lives under the "service-role/"
    # path; without that path the generated ARN names a policy that does
    # not exist.
    glue_crawler_role = iam.Role(
        self,
        "GlueCrawlerRole",
        assumed_by=iam.ServicePrincipal("glue.amazonaws.com"),
        managed_policies=[
            iam.ManagedPolicy.from_aws_managed_policy_name(
                "service-role/AWSGlueServiceRole")
        ])

    # The key pattern needs a trailing wildcard to cover the objects stored
    # under the prefix; "<prefix>/" alone only matches that literal key.
    project_objects = f"{PROJECT_PREFIX}/*"
    s3_data_bucket.grant_read(glue_crawler_role,
                              objects_key_pattern=project_objects)
    s3_data_bucket.grant_put(glue_crawler_role,
                             objects_key_pattern=project_objects)

    # Crawler S3 targets are given as full s3:// URIs.
    glue.CfnCrawler(
        self,
        "GlueCrawler",
        role=glue_crawler_role.role_arn,
        database_name=glue_database.database_name,
        targets={
            "s3Targets": [{
                "path":
                f"s3://{s3_data_bucket.bucket_name}/{PROJECT_PREFIX}/"
            }]
        },
        schedule={"scheduleExpression": "cron(30 04 * * ? *)"})
def __init__(self, scope: core.Construct, id: str, config_dict,
             **kwargs) -> None:
    """Create the data lake Glue database and the compound registration table.

    The database name and the bucket name are read from *config_dict*
    (keys ``datalake_db_name`` and ``datalake_bucket_name``). The names of
    the database and of the table are exported as stack outputs.

    :param scope: parent construct.
    :param id: construct id, forwarded to the parent constructor.
    :param config_dict: mapping holding the two configuration keys above.
    :param kwargs: forwarded unchanged to the parent constructor.
    """
    super().__init__(scope, id, **kwargs)

    # Create the datalake database
    datalake_db = glue.Database(
        self,
        "createDatalakeDB",
        database_name=config_dict['datalake_db_name'])
    core.CfnOutput(self,
                   "createDatalakeDBName",
                   value=datalake_db.database_name)

    # Create Comp Reg Table
    # Every data column is declared as a primitive string.
    string_column_names = (
        "lot_compound_id",
        "version_id",
        "parent_id",
        "smiles",
        "parent_mw",
        "salt_multiplicity",
        "salt_name",
        "formula_weight",
        "parent_alias",
        "stereochemistry",
        "stereocomment",
        "geometric_isomerism",
        "parent_comment",
        "parent_project",
        "elnref",
        "msmethod",
        "msmass",
        "provider",
        "purity",
        "puritymethod",
        "nmrshifts",
        "lotalias",
        "lot_comment",
        "lot_project",
        "molfile",
        "checksum",
    )
    table_columns = [
        glue.Column(name=column_name,
                    type=glue.Type(input_string="string", is_primitive=True))
        for column_name in string_column_names
    ]

    # The table references the database through its ARN.
    database_by_arn = glue.Database.from_database_arn(
        self, "GetDBArn", database_arn=datalake_db.database_arn)
    parquet_format = glue.DataFormat(
        input_format=glue.InputFormat.PARQUET,
        output_format=glue.OutputFormat.PARQUET,
        serialization_library=glue.SerializationLibrary.PARQUET)
    datalake_bucket = s3.Bucket.from_bucket_name(
        self, "getIBucket", bucket_name=config_dict['datalake_bucket_name'])

    comp_reg_table = glue.Table(
        self,
        "createDatalakeCompRegTable",
        columns=table_columns,
        database=database_by_arn,
        data_format=parquet_format,
        table_name="tbl_compound_data",
        bucket=datalake_bucket,
        compressed=True,
        description=
        "This table contains data regarding compound registration coming from RDS",
        partition_keys=[
            glue.Column(name="dt",
                        type=glue.Type(input_string="string",
                                       is_primitive=True))
        ],
        s3_prefix="compound_reg/compound_data/")
    core.CfnOutput(self,
                   "createDatalakeCompRegTableName",
                   value=comp_reg_table.table_name)
def __init__(self, scope: core.Construct, app: PMIApp, cid: str, *,
             partner: str, bucket: s3.IBucket, database: glue.IDatabase):
    """Define the TSV table of aggregated viewability metrics for *partner*.

    The table is named after *partner*, partitioned by ``estdate`` and
    located under an imported data prefix followed by ``<partner>/``.
    After construction the serde parameters and a retention of 365 are
    applied as raw property overrides.

    :param scope: parent construct.
    :param app: only ``app.env_id`` is read, to build the import name.
    :param cid: construct id.
    :param partner: table name and last segment of the S3 prefix.
    :param bucket: bucket holding the table data.
    :param database: Glue database that owns the table.
    """
    integer = glue.Schema.INTEGER
    big_int = glue.Schema.BIG_INT

    # (column name, column type) in table order.
    metric_columns = (
        ('hit_date', glue.Schema.DATE),
        ('measurement_source_id', integer),
        ('partner_measured_advertiser_id', big_int),
        ('partner_measured_campaign_id', big_int),
        ('partner_measured_channel_id', big_int),
        ('partner_measured_placement_id', big_int),
        ('partner_measured_creative_id', big_int),
        ('media_type_id', integer),
        ('below_the_fold_imps', integer),
        ('on_the_fold_imps', integer),
        ('above_the_fold_imps', integer),
        ('time_on_page', big_int),
        ('in_view_time', big_int),
        ('in_view_imps', integer),
        ('in_view_5s_imps', integer),
        ('in_view_15s_imps', integer),
        ('not_in_view_imps', integer),
        ('never_in_view_imps', integer),
        ('in_view_load_imps', integer),
        ('in_view_unload_imps', integer),
        ('completed_1q_imps', integer),
        ('in_view_1q_imps', integer),
        ('completed_2q_imps', integer),
        ('in_view_2q_imps', integer),
        ('completed_3q_imps', integer),
        ('in_view_3q_imps', integer),
        ('completed_4q_imps', integer),
        ('in_view_4q_imps', integer),
        ('never_started_imps', integer),
        ('muted_imps', integer),
        ('full_screen_imps', integer),
        ('click_through_imps', integer),
        ('sivt_in_view_imps', integer),
        ('sivt_not_in_view_imps', integer),
        ('groupm_time_on_page', big_int),
        ('groupm_in_view_time', big_int),
        ('groupm_in_view_imps', integer),
        ('groupm_in_view_5s_imps', integer),
        ('groupm_in_view_15s_imps', integer),
        ('groupm_not_in_view_imps', integer),
        ('groupm_never_in_view_imps', integer),
        ('groupm_in_view_load_imps', integer),
        ('groupm_in_view_unload_imps', integer),
        ('groupm_completed_1q_imps', integer),
        ('groupm_in_view_1q_imps', integer),
        ('groupm_completed_2q_imps', integer),
        ('groupm_in_view_2q_imps', integer),
        ('groupm_completed_3q_imps', integer),
        ('groupm_in_view_3q_imps', integer),
        ('groupm_completed_4q_imps', integer),
        ('groupm_in_view_4q_imps', integer),
        ('groupm_never_started_imps', integer),
        ('groupm_muted_imps', integer),
        ('groupm_full_screen_imps', integer),
        ('groupm_click_through_imps', integer),
        ('groupm_sivt_in_view_imps', integer),
        ('groupm_sivt_not_in_view_imps', integer),
        ('suspicious_imps', integer),
        ('measured_imps', integer),
        ('groupm_measured_imps', integer),
        ('general_invalid_imps', integer),
        ('viewability_measurement_trusted_imps', integer),
        ('imps', integer),
        ('sitting_duck_bot_imps', integer),
        ('standard_bot_imps', integer),
        ('volunteer_bot_imps', integer),
        ('profile_bot_imps', integer),
        ('masked_bot_imps', integer),
        ('nomadic_bot_imps', integer),
        ('other_bot_imps', integer),
        ('true_view_viewable_imps', integer),
        ('true_view_measurable_imps', integer),
        ('yahoo_gemini_billable_imps', integer),
        ('full_ad_in_view_imps', integer),
        ('pm_platform', glue.Schema.STRING),
        ('publicis_in_view_imps', integer),
        ('yahoo_gemini_billable_suspicious_imps', integer),
        ('average_in_view_time', glue.Schema.DOUBLE),
        ('in_view_lt_1s_imps', integer),
        ('in_view_1s_2s_imps', integer),
        ('in_view_2s_5s_imps', integer),
        ('in_view_5s_10s_imps', integer),
        ('in_view_10s_15s_imps', integer),
        ('in_view_15s_20s_imps', integer),
        ('in_view_20s_25s_imps', integer),
        ('in_view_25s_30s_imps', integer),
        ('in_view_30s_35s_imps', integer),
        ('in_view_35s_40s_imps', integer),
        ('in_view_40s_45s_imps', integer),
        ('in_view_45s_50s_imps', integer),
        ('in_view_ge_50s_imps', integer),
        ('viewability_measured_or_fraud_ads', integer),
    )

    # Imported data prefix followed by "<partner>/".
    mart_prefix = core.Fn.import_value(
        f'{SHARED_RESOURCES_STACK_NAME_BASE}-{app.env_id}:DataLakeMartBucketDataPrefix'
    )
    table_prefix = core.Fn.join('', [mart_prefix, partner, '/'])

    super().__init__(
        scope,
        cid,
        database=database,
        table_name=partner,
        description=f"Aggregated viewability metrics for {partner}.",
        columns=[
            glue.Column(name=column_name, type=column_type)
            for column_name, column_type in metric_columns
        ],
        partition_keys=[
            glue.Column(name='estdate', type=glue.Schema.STRING)
        ],
        data_format=glue.DataFormat.TSV,
        bucket=bucket,
        s3_prefix=table_prefix,
        compressed=True)

    cfn_table: glue.CfnTable = self.node.default_child

    # serialization properties
    serde_parameters = {
        'field.delim': '\t',
        'serialization.null.format': '\\N'
    }
    cfn_table.add_property_override(
        'TableInput.StorageDescriptor.SerdeInfo.Parameters',
        serde_parameters)

    # data retention period
    cfn_table.add_property_override('TableInput.Retention', 365)
def __init__(self, scope: core.Construct, construct_id: str,
             **kwargs) -> None:
    """Provision the Reddit sentiment streaming pipeline.

    Creates, in order: the Glue database and Parquet table describing the
    records, the IAM role and Firehose delivery stream that convert
    incoming JSON to Parquet and write it under ``reddit/`` in the bucket
    identified by ``BUCKET_ARN``, and the VPC, ECS cluster and Fargate
    service that run the streaming application.

    :param scope: parent construct.
    :param construct_id: construct id, forwarded to the parent constructor.
    :param kwargs: forwarded unchanged to the parent constructor.
    :raises KeyError: if ``PRAW_CLIENT_SECRET``, ``PRAW_CLIENT_ID`` or
        ``PRAW_USER_AGENT`` is missing from the process environment.
    """
    super().__init__(scope, construct_id, **kwargs)

    # create db for glue schema
    glue_db = glue.Database(
        self,
        'GlueDB',
        database_name='reddit_data',
    )

    # data schema
    glue_table = glue.Table(
        self,
        'GlueTable',
        table_name='sentiment',
        columns=[
            glue.Column(name='@timestamp', type=glue.Schema.TIMESTAMP),
            glue.Column(name='id', type=glue.Schema.STRING),
            glue.Column(name='subreddit', type=glue.Schema.STRING),
            glue.Column(name='body', type=glue.Schema.STRING),
            glue.Column(name='is_submitter', type=glue.Schema.BOOLEAN),
            glue.Column(name='polarity', type=glue.Schema.FLOAT),
            glue.Column(name='subjectivity', type=glue.Schema.FLOAT),
            glue.Column(name='author', type=glue.Schema.STRING),
        ],
        database=glue_db,
        data_format=glue.DataFormat.PARQUET,
        bucket=s3.Bucket.from_bucket_arn(self, 'DataBucket', BUCKET_ARN),
        s3_prefix='reddit/',
    )

    # role assumed by firehose
    stream_role = iam.Role(
        self,
        'FirehoseRole',
        assumed_by=iam.ServicePrincipal('firehose.amazonaws.com'),
        description='role used by Firehose to access s3 bucket',
    )

    # add s3 statement
    stream_role.add_to_policy(
        iam.PolicyStatement(
            resources=[BUCKET_ARN, f'{BUCKET_ARN}/*'],
            actions=[
                's3:AbortMultipartUpload',
                's3:GetBucketLocation',
                's3:GetObject',
                's3:ListBucket',
                's3:ListBucketMultipartUploads',
                's3:PutObject',
            ],
        ))

    # add glue statement
    stream_role.add_to_policy(
        iam.PolicyStatement(
            resources=[
                glue_table.table_arn,
                glue_db.database_arn,
                glue_db.catalog_arn,
            ],
            actions=[
                'glue:GetTable',
                'glue:GetTableVersion',
                'glue:GetTableVersions',
            ],
        ))

    # cloudwatch statement
    stream_role.add_to_policy(
        iam.PolicyStatement(
            resources=['*'],
            actions=[
                'logs:PutLogEvents',
            ],
        ))

    # The Glue table above is created by this stack, so Firehose must look
    # the schema up in the region the stack is deployed to rather than in
    # a hard-coded one.
    schema_configuration = kf.CfnDeliveryStream.SchemaConfigurationProperty(
        database_name=glue_db.database_name,
        table_name=glue_table.table_name,
        role_arn=stream_role.role_arn,
        region=core.Stack.of(self).region,
    )

    # JSON in, Parquet out.
    data_format_conversion_configuration = kf.CfnDeliveryStream.DataFormatConversionConfigurationProperty(
        enabled=True,
        input_format_configuration=kf.CfnDeliveryStream.
        InputFormatConfigurationProperty(
            deserializer=kf.CfnDeliveryStream.DeserializerProperty(
                hive_json_ser_de=kf.CfnDeliveryStream.HiveJsonSerDeProperty(),
            ),
        ),
        output_format_configuration=kf.CfnDeliveryStream.
        OutputFormatConfigurationProperty(
            serializer=kf.CfnDeliveryStream.SerializerProperty(
                parquet_ser_de=kf.CfnDeliveryStream.ParquetSerDeProperty(),
            ),
        ),
        schema_configuration=schema_configuration,
    )

    s3_config = kf.CfnDeliveryStream.ExtendedS3DestinationConfigurationProperty(
        bucket_arn=BUCKET_ARN,  # temporary, will replace with env variable
        role_arn=stream_role.role_arn,
        data_format_conversion_configuration=
        data_format_conversion_configuration,
        prefix='reddit/',
        buffering_hints=kf.CfnDeliveryStream.BufferingHintsProperty(
            size_in_m_bs=64,
        ),
    )

    firehose = kf.CfnDeliveryStream(
        self,
        'FirehoseStream',
        delivery_stream_name='RedditDataStream',
        extended_s3_destination_configuration=s3_config,
    )

    # add role dependency
    firehose.node.add_dependency(stream_role)

    # role for the Fargate task running the streaming application
    app_role = iam.Role(
        self,
        'RedditStreamingAppRole',
        assumed_by=iam.ServicePrincipal('ecs-tasks.amazonaws.com'),
        description=
        'Role used by the Reddit Streaming Application Fargate Task',
    )

    # add firehose permissions
    app_role.add_to_policy(
        iam.PolicyStatement(
            resources=[firehose.attr_arn],
            actions=[
                'firehose:DeleteDeliveryStream',
                'firehose:PutRecord',
                'firehose:PutRecordBatch',
                'firehose:UpdateDestination',
            ],
        ))

    # add ecs and cloudwatch permissions
    app_role.add_to_policy(
        iam.PolicyStatement(
            resources=['*'],
            actions=[
                'ecr:GetAuthorizationToken',
                'ecr:BatchCheckLayerAvailability',
                'ecr:GetDownloadUrlForLayer',
                'ecr:BatchGetImage',
                'logs:CreateLogStream',
                'logs:PutLogEvents',
            ],
        ))

    vpc = ec2.Vpc(self, 'RedditVpc', max_azs=3)
    cluster = ecs.Cluster(self, 'RedditCluster', vpc=vpc)

    task_definition = ecs.FargateTaskDefinition(
        self,
        'TaskDefinition',
        memory_limit_mib=512,
        cpu=256,
        task_role=app_role,
    )

    # NOTE(review): the PRAW credentials are read from the synthesizing
    # process and passed as plain container environment values - consider
    # whether a secret store is more appropriate.
    task_definition.add_container(
        id='RedditStreamingApp',
        image=ecs.ContainerImage.from_asset('./sentiment_analysis'),
        command=['all'],
        environment={
            'FIREHOSE_STREAM_NAME': firehose.delivery_stream_name,
            'PRAW_CLIENT_SECRET': os.environ['PRAW_CLIENT_SECRET'],
            'PRAW_CLIENT_ID': os.environ['PRAW_CLIENT_ID'],
            'PRAW_USER_AGENT': os.environ['PRAW_USER_AGENT'],
        },
        logging=ecs.LogDriver.aws_logs(stream_prefix='reddit'),
    )

    ecs.FargateService(
        self,
        'StreamingApplication',
        desired_count=1,
        task_definition=task_definition,
        cluster=cluster,
        assign_public_ip=True,
    )
def __init__(self, scope: core.Construct, app: PMIApp, cid: str, *,
             partner: str, bucket: s3.IBucket, database: glue.IDatabase):
    """Define the Avro table of ad sessions (JAS) for *partner*.

    The table is named after *partner*, partitioned by ``estdate`` and
    ``esthour`` and located under an imported data prefix followed by
    ``<partner>/``. After construction a retention of 90 is applied as a
    raw property override.

    :param scope: parent construct.
    :param app: only ``app.env_id`` is read, to build the import name.
    :param cid: construct id.
    :param partner: table name and last segment of the S3 prefix.
    :param bucket: bucket holding the table data.
    :param database: Glue database that owns the table.
    """
    string = glue.Schema.STRING
    integer = glue.Schema.INTEGER
    big_int = glue.Schema.BIG_INT
    boolean = glue.Schema.BOOLEAN

    # (column name, column type) in table order.
    session_columns = (
        ('impression_id', string),
        ('site', string),
        ('measurement_source_id', integer),
        ('partner_measured_advertiser_id', big_int),
        ('partner_measured_campaign_id', big_int),
        ('partner_measured_channel_id', big_int),
        ('partner_measured_placement_id', big_int),
        ('partner_measured_creative_id', big_int),
        ('media_type_id', integer),
        ('below_the_fold', boolean),
        ('on_the_fold', boolean),
        ('above_the_fold', boolean),
        ('time_on_page', integer),
        ('in_view_time', integer),
        ('in_view', boolean),
        ('in_view_5s', boolean),
        ('in_view_15s', boolean),
        ('not_in_view', boolean),
        ('never_in_view', boolean),
        ('in_view_load', boolean),
        ('in_view_unload', boolean),
        ('completed_1q', boolean),
        ('in_view_1q', boolean),
        ('completed_2q', boolean),
        ('in_view_2q', boolean),
        ('completed_3q', boolean),
        ('in_view_3q', boolean),
        ('completed_4q', boolean),
        ('in_view_4q', boolean),
        ('never_started', boolean),
        ('muted', boolean),
        ('full_screen', boolean),
        ('click_through', boolean),
        ('sivt_in_view', boolean),
        ('sivt_not_in_view', boolean),
        ('groupm_time_on_page', integer),
        ('groupm_in_view_time', integer),
        ('groupm_in_view', boolean),
        ('groupm_in_view_5s', boolean),
        ('groupm_in_view_15s', boolean),
        ('groupm_not_in_view', boolean),
        ('groupm_never_in_view', boolean),
        ('groupm_in_view_load', boolean),
        ('groupm_in_view_unload', boolean),
        ('groupm_completed_1q', boolean),
        ('groupm_in_view_1q', boolean),
        ('groupm_completed_2q', boolean),
        ('groupm_in_view_2q', boolean),
        ('groupm_completed_3q', boolean),
        ('groupm_in_view_3q', boolean),
        ('groupm_completed_4q', boolean),
        ('groupm_in_view_4q', boolean),
        ('groupm_never_started', boolean),
        ('groupm_muted', boolean),
        ('groupm_full_screen', boolean),
        ('groupm_click_through', boolean),
        ('groupm_sivt_in_view', boolean),
        ('groupm_sivt_not_in_view', boolean),
        ('suspicious', boolean),
        ('measured', boolean),
        ('groupm_measured', boolean),
        ('general_invalid', boolean),
        ('viewability_measurement_trusted', boolean),
        ('sitting_duck_bot', boolean),
        ('standard_bot', boolean),
        ('volunteer_bot', boolean),
        ('profile_bot', boolean),
        ('masked_bot', boolean),
        ('nomadic_bot', boolean),
        ('other_bot', boolean),
        ('true_view_viewable', boolean),
        ('true_view_measurable', boolean),
        ('yahoo_gemini_billable', boolean),
        ('full_ad_in_view', boolean),
        ('publicis_in_view', boolean),
        ('yahoo_gemini_billable_suspicious', boolean),
        ('average_in_view_time', glue.Schema.DOUBLE),
        ('in_view_lt_1s', boolean),
        ('in_view_1s_2s', boolean),
        ('in_view_2s_5s', boolean),
        ('in_view_5s_10s', boolean),
        ('in_view_10s_15s', boolean),
        ('in_view_15s_20s', boolean),
        ('in_view_20s_25s', boolean),
        ('in_view_25s_30s', boolean),
        ('in_view_30s_35s', boolean),
        ('in_view_35s_40s', boolean),
        ('in_view_40s_45s', boolean),
        ('in_view_45s_50s', boolean),
        ('in_view_ge_50s', boolean),
        ('viewability_measured_or_fraud', boolean),
    )
    partition_names = ('estdate', 'esthour')

    # Imported data prefix followed by "<partner>/".
    jas_prefix = core.Fn.import_value(
        f'{SHARED_RESOURCES_STACK_NAME_BASE}-{app.env_id}:DataLakeJASBucketDataPrefix'
    )
    table_prefix = core.Fn.join('', [jas_prefix, partner, '/'])

    super().__init__(
        scope,
        cid,
        database=database,
        table_name=partner,
        description=f"Ad sessions (JAS) for {partner}.",
        columns=[
            glue.Column(name=column_name, type=column_type)
            for column_name, column_type in session_columns
        ],
        partition_keys=[
            glue.Column(name=partition_name, type=glue.Schema.STRING)
            for partition_name in partition_names
        ],
        data_format=glue.DataFormat.AVRO,
        bucket=bucket,
        s3_prefix=table_prefix)

    # data retention period
    cfn_table: glue.CfnTable = self.node.default_child
    cfn_table.add_property_override('TableInput.Retention', 90)