	def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
		super().__init__(scope, id, **kwargs)

		# CloudFormation Parameters

		glue_db_name = core.CfnParameter(self, "GlueDatabaseNameNycTlc", 
				type="String",
				description="Name of Glue Database to be created for NYC TLC.",
				allowed_pattern=r"[\w-]+",
				default = "nyc_tlc_db"
			)

		glue_table_name = core.CfnParameter(self, "GlueTableNameNycTlc", 
				type="String",
				description="Name of Glue Table to be created for NYC TLC.",
				allowed_pattern=r"[\w-]+",
				default = "nyc_tlc_table"
			)

		self.template_options.description = (
			"This template deploys the dataset containing New York City Taxi and Limousine Commission (TLC) Trip Record Data.\n"
			"Sample data is copied from the public dataset into a local S3 bucket, a database and table are created in AWS Glue, "
			"and the S3 location is registered with AWS Lake Formation."
		)

		self.template_options.metadata = {
			"AWS::CloudFormation::Interface": {
				"License": "MIT-0"
			}
		}
		# Create S3 bucket for storing a copy of the dataset locally in the AWS account

		local_dataset_bucket = s3.Bucket(self, "LocalNycTlcBucket",
			block_public_access = s3.BlockPublicAccess(
				block_public_acls=True, 
				block_public_policy=True, 
				ignore_public_acls=True, 
				restrict_public_buckets=True),
			# NOTE: DESTROY removes the bucket on stack deletion only if the bucket is empty
			removal_policy = core.RemovalPolicy.DESTROY)

		public_dataset_bucket = s3.Bucket.from_bucket_arn(self, "PublicDatasetBucket", BUCKET_ARN)

		with open("lambda/s3_copy.py", encoding="utf8") as fp:
			s3_copy_code = fp.read()
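		# lambda/s3_copy.py itself is not shown here. A minimal sketch of what it
		# could contain (hypothetical, for illustration only): copy the object on
		# Create/Update and always signal CloudFormation via cfnresponse, using
		# the properties passed to the Custom::S3Copy resource below.
		#
		#   import boto3, cfnresponse
		#
		#   def handler(event, context):
		#       props = event["ResourceProperties"]
		#       s3 = boto3.client("s3")
		#       try:
		#           if event["RequestType"] in ("Create", "Update"):
		#               s3.copy_object(
		#                   Bucket=props["LocalDatasetBucket"],
		#                   Key=f'{props["LocalDatasetPrefix"]}/{props["PublicDatasetObject"]}',
		#                   CopySource={"Bucket": props["PublicDatasetBucket"],
		#                               "Key": props["PublicDatasetObject"]})
		#           cfnresponse.send(event, context, cfnresponse.SUCCESS, {})
		#       except Exception as exc:
		#           cfnresponse.send(event, context, cfnresponse.FAILED, {"Error": str(exc)})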

		s3_copy_execution_role = iam.Role(self, "S3CopyHandlerServiceRole",
			assumed_by = iam.ServicePrincipal('lambda.amazonaws.com'),
			managed_policies = [
				iam.ManagedPolicy.from_aws_managed_policy_name("service-role/AWSLambdaBasicExecutionRole"),
			],
			inline_policies = { "S3CopyHandlerRoleInlinePolicy" : iam.PolicyDocument( 
				statements = [
					iam.PolicyStatement(
						effect=iam.Effect.ALLOW,
						actions=[
							"s3:Get*"
						],
						resources=[
							public_dataset_bucket.bucket_arn,
							public_dataset_bucket.arn_for_objects("*")
						]),
					iam.PolicyStatement(
						effect=iam.Effect.ALLOW,
						actions=[
							"s3:PutObject",
							"s3:GetObject",
							"s3:DeleteObject"
						],
						resources=[local_dataset_bucket.arn_for_objects("*")]
						)
					]
				) }
			)

		s3_copy_fn = _lambda.Function(self, "S3CopyHandler", 
			runtime = _lambda.Runtime.PYTHON_3_7,
			code = _lambda.Code.from_inline(s3_copy_code),
			handler = "index.handler",
			role =  s3_copy_execution_role,
			timeout = core.Duration.seconds(600)
		)

		s3_copy = core.CustomResource(self, "S3Copy", 
			service_token = s3_copy_fn.function_arn,
			resource_type = "Custom::S3Copy",
			properties = {
				"PublicDatasetBucket": public_dataset_bucket.bucket_name,
				"LocalDatasetBucket" : local_dataset_bucket.bucket_name,
				"PublicDatasetObject": OBJECT,
				"LocalDatasetPrefix": glue_table_name.value_as_string
			} 
		)	

		# Create Glue Database and Table for the NYC TLC dataset and register its S3 location with Lake Formation

		lakeformation_resource = lf.CfnResource(self, "LakeFormationResource", 
			resource_arn = local_dataset_bucket.bucket_arn, 
			use_service_linked_role = True)

		lakeformation_resource.node.add_dependency(s3_copy)

		cfn_glue_db = glue.CfnDatabase(self, "GlueDatabase", 
			catalog_id = core.Aws.ACCOUNT_ID,
			database_input = glue.CfnDatabase.DatabaseInputProperty(
				name = glue_db_name.value_as_string, 
				location_uri=local_dataset_bucket.s3_url_for_object(),
			)
		)

		nyc_tlc_table = glue.CfnTable(self, "GlueTableNycTlc", 
			catalog_id = cfn_glue_db.catalog_id,
			database_name = glue_db_name.value_as_string,
			table_input = glue.CfnTable.TableInputProperty(
				description = "New York City Taxi and Limousine Commission (TLC) Trip Record Data",
				name = glue_table_name.value_as_string,
				parameters = {
					"skip.header.line.count": "1",
					"compressionType": "none",
					"classification": "csv",
					"delimiter": ",",
					"typeOfData": "file"
				},
				storage_descriptor = glue.CfnTable.StorageDescriptorProperty(
					columns = [
						{"name":"vendorid","type":"bigint"},
						{"name":"lpep_pickup_datetime","type":"string"},
						{"name":"lpep_dropoff_datetime","type":"string"},
						{"name":"store_and_fwd_flag","type":"string"},
						{"name":"ratecodeid","type":"bigint"},
						{"name":"pulocationid","type":"bigint"},
						{"name":"dolocationid","type":"bigint"},
						{"name":"passenger_count","type":"bigint"},
						{"name":"trip_distance","type":"double"},
						{"name":"fare_amount","type":"double"},
						{"name":"extra","type":"double"},
						{"name":"mta_tax","type":"double"},
						{"name":"tip_amount","type":"double"},
						{"name":"tolls_amount","type":"double"},
						{"name":"ehail_fee","type":"string"},
						{"name":"improvement_surcharge","type":"double"},
						{"name":"total_amount","type":"double"},
						{"name":"payment_type","type":"bigint"},
						{"name":"trip_type","type":"bigint"},
						{"name":"congestion_surcharge","type":"double"}],
					location = local_dataset_bucket.s3_url_for_object() + "/" + glue_table_name.value_as_string + "/",
					input_format = "org.apache.hadoop.mapred.TextInputFormat",
					output_format = "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
					compressed = False,
					serde_info = glue.CfnTable.SerdeInfoProperty( 
						serialization_library = "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
						parameters = {
							"field.delim": ","
						}
					)
				),
				table_type = "EXTERNAL_TABLE"
			)
		)

		nyc_tlc_table.node.add_dependency(cfn_glue_db)
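		# Once deployed, the table can be queried with Athena, for example
		# (names assume the parameter defaults above):
		#
		#   SELECT passenger_count, AVG(total_amount)
		#   FROM nyc_tlc_db.nyc_tlc_table
		#   GROUP BY passenger_count;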

		core.CfnOutput(self, "LocalNycTlcBucketOutput", 
			value=local_dataset_bucket.bucket_name, 
			description="S3 Bucket created to store the dataset")

		core.CfnOutput(self, "GlueDatabaseOutput", 
			value=cfn_glue_db.ref, 
			description="Glue DB created to host the dataset table")

		core.CfnOutput(self, "GlueTableNycTlcOutput", 
			value=nyc_tlc_table.ref, 
			description="Glue Table created to host the dataset")
Example #2
    def __init__(
        self,
        scope: cdk.Construct,
        construct_id: str,
        stack_log_level: str,
        src_stream,
        **kwargs,
    ) -> None:
        super().__init__(scope, construct_id, **kwargs)

        self.template_options.description = "Miztiik Automation: Sales Transactions Table Stack."
        self.template_options.metadata = {"License": "Miztiik Corp."}

        # CloudFormation Parameters

        self.glue_db_name = cdk.CfnParameter(
            self,
            "GlueTxnsDbName",
            type="String",
            description="Name of Glue Database to be created for Sales Transactions.",
            allowed_pattern=r"[\w-]+",
            default="miztiik_sales_db",
        )

        self.glue_table_name = cdk.CfnParameter(
            self,
            "GlueTxnsTableName",
            type="String",
            description="Name of Glue Table to be created for Sales Transactions (JSON).",
            allowed_pattern=r"[\w-]+",
            default="sales_txns_tbl",
        )

        cfn_txn_db = _glue.CfnDatabase(
            self,
            "GlueTxnsDb",
            catalog_id=cdk.Aws.ACCOUNT_ID,
            database_input=_glue.CfnDatabase.DatabaseInputProperty(
                name=self.glue_db_name.value_as_string,
                description="Database for Sales Transactions."
                # location_uri=txns_bucket.s3_url_for_object(),
            ),
        )

        # Ref: https://docs.aws.amazon.com/glue/latest/dg/add-job-streaming.html
        cfn_txn_table = _glue.CfnTable(
            self,
            "glueTxnsTable01",
            catalog_id=cfn_txn_db.catalog_id,
            database_name=self.glue_db_name.value_as_string,
            table_input=_glue.CfnTable.TableInputProperty(
                description="Sales Transactions Table",
                name=self.glue_table_name.value_as_string,
                parameters={
                    "classification": "json",
                    # "typeOfData": "file"
                },
                # partition_keys=[
                #     {
                #         "name": "product_category",
                #         "type": "string"
                #     }
                # ],
                table_type="EXTERNAL_TABLE",
                storage_descriptor=_glue.CfnTable.StorageDescriptorProperty(
                    # columns=[
                    #     {
                    #         "name": "marketplace",
                    #         "type": "string"
                    #     }
                    # ],
                    location=f"{src_stream.stream_name}",
                    parameters={
                        "endpointUrl":
                        f"https://kinesis.{cdk.Aws.REGION}.amazonaws.com",
                        "streamName": f"{src_stream.stream_name}",
                        "typeOfData": "kinesis"
                    },
                    input_format="org.apache.hadoop.mapred.TextInputFormat",
                    output_format="org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
                    serde_info=_glue.CfnTable.SerdeInfoProperty(
                        name="miztiikAutomationSerDeConfig",
                        serialization_library="org.openx.data.jsonserde.JsonSerDe",
                        parameters={
                            "paths": "",
                            # "typeOfData": "file"
                        },
                    ),
                ),
            ),
        )

        cfn_txn_table.add_depends_on(cfn_txn_db)
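        # With the table registered, a Glue streaming job (see the ref above) can
        # read the Kinesis source through the Data Catalog. A hedged sketch of the
        # job-script side, not part of this stack (names assume the parameter
        # defaults above):
        #
        #   data_frame = glueContext.create_data_frame.from_catalog(
        #       database="miztiik_sales_db",
        #       table_name="sales_txns_tbl",
        #       additional_options={"startingPosition": "TRIM_HORIZON",
        #                           "inferSchema": "true"})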

        ###########################################
        ################# OUTPUTS #################
        ###########################################
        output_0 = cdk.CfnOutput(
            self,
            "AutomationFrom",
            value=f"{GlobalArgs.SOURCE_INFO}",
            description="To know more about this automation stack, check out our github page.",
        )

        output_1 = cdk.CfnOutput(
            self,
            "GlueTxnsTable",
            value=f"https://console.aws.amazon.com/glue/home?region={cdk.Aws.REGION}#table:catalog={cdk.Aws.ACCOUNT_ID};name={self.glue_table_name.value_as_string};namespace={self.glue_db_name.value_as_string}",
            description="Glue Transactions Table.",
        )
Example #3
    def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
        super().__init__(scope, construct_id, **kwargs)

        #the S3 bucket where CloudFront Access Logs will be stored
        cf_access_logs = s3.Bucket(self, "LogBucket")

        #S3 bucket where Athena will put the results
        athena_results = s3.Bucket(self, "AthenaResultsBucket")

        #create an Athena database
        glue_database_name = "serverlessland_database"
        account = Stack.of(self).account  # catalog id of the current account (assumes Stack is imported from aws_cdk)
        myDatabase = glue.CfnDatabase(
            self,
            id=glue_database_name,
            catalog_id=account,
            database_input=glue.CfnDatabase.DatabaseInputProperty(
                description=f"Glue database '{glue_database_name}'",
                name=glue_database_name,
            )
        )

        #define a table with the structure of CloudFront Logs https://docs.aws.amazon.com/athena/latest/ug/cloudfront-logs.html
        athena_table = glue.CfnTable(self,
            id='cfaccesslogs',
            catalog_id=account,
            database_name=glue_database_name,
            table_input=glue.CfnTable.TableInputProperty(
                name='cf_access_logs',
                description='CloudFront access logs',
                table_type='EXTERNAL_TABLE',
                parameters = {
                    'skip.header.line.count': '2',
                },
                storage_descriptor=glue.CfnTable.StorageDescriptorProperty(
                    location="s3://"+cf_access_logs.bucket_name+"/",
                    input_format='org.apache.hadoop.mapred.TextInputFormat',
                    output_format='org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat',
                    compressed=False,
                    serde_info=glue.CfnTable.SerdeInfoProperty(
                        serialization_library='org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe',
                        parameters={
                            'field.delim': '\t'  # CloudFront access logs are tab-delimited
                        }
                    ),
                    columns=[
                        glue.CfnTable.ColumnProperty(name='date', type='date'),
                        glue.CfnTable.ColumnProperty(name='time', type='string'),
                        glue.CfnTable.ColumnProperty(name='location', type='string'),
                        glue.CfnTable.ColumnProperty(name='bytes', type='bigint'),
                        glue.CfnTable.ColumnProperty(name='request_ip', type='string'),
                        glue.CfnTable.ColumnProperty(name='method', type='string'),
                        glue.CfnTable.ColumnProperty(name='host', type='string'),
                        glue.CfnTable.ColumnProperty(name='uri', type='string'),
                        glue.CfnTable.ColumnProperty(name='status', type='string'),
                        glue.CfnTable.ColumnProperty(name='referer', type='string'),
                        glue.CfnTable.ColumnProperty(name='user_agent', type='string'),
                        glue.CfnTable.ColumnProperty(name='query_string', type='string'),
                        glue.CfnTable.ColumnProperty(name='cookie', type='string'),
                        glue.CfnTable.ColumnProperty(name='result_type', type='string'),
                        glue.CfnTable.ColumnProperty(name='request_id', type='string'),
                        glue.CfnTable.ColumnProperty(name='host_header', type='string'),
                        glue.CfnTable.ColumnProperty(name='request_protocol', type='string'),
                        glue.CfnTable.ColumnProperty(name='request_bytes', type='bigint'),
                        glue.CfnTable.ColumnProperty(name='time_taken', type='float'),
                        glue.CfnTable.ColumnProperty(name='xforwarded_for', type='string'),
                        glue.CfnTable.ColumnProperty(name='ssl_protocol', type='string'),
                        glue.CfnTable.ColumnProperty(name='ssl_cipher', type='string'),
                        glue.CfnTable.ColumnProperty(name='response_result_type', type='string'),
                        glue.CfnTable.ColumnProperty(name='http_version', type='string'),
                        glue.CfnTable.ColumnProperty(name='fle_status', type='string'),
                        glue.CfnTable.ColumnProperty(name='fle_encrypted_fields', type='int'),
                        glue.CfnTable.ColumnProperty(name='c_port', type='int'),
                        glue.CfnTable.ColumnProperty(name='time_to_first_byte', type='float'),
                        glue.CfnTable.ColumnProperty(name='x_edge_detailed_result_type', type='string'),
                        glue.CfnTable.ColumnProperty(name='sc_content_type', type='string'),
                        glue.CfnTable.ColumnProperty(name='sc_content_len', type='string'),
                        glue.CfnTable.ColumnProperty(name='sc_range_start', type='bigint'),
                        glue.CfnTable.ColumnProperty(name='sc_range_end', type='bigint')
                    ]
                ),
            )
        )

        #submit the query and wait for the results
        start_query_execution_job = tasks.AthenaStartQueryExecution(self, "Start Athena Query",
            query_string="SELECT uri FROM cf_access_logs limit 10",
            integration_pattern=sf.IntegrationPattern.RUN_JOB, #executes the command in SYNC mode
            query_execution_context=tasks.QueryExecutionContext(
                database_name=glue_database_name
            ),
            result_configuration=tasks.ResultConfiguration(
                output_location=s3.Location(
                    bucket_name=athena_results.bucket_name,
                    object_key="results"
                )
            )
        )

        #get the results
        get_query_results_job = tasks.AthenaGetQueryResults(self, "Get Query Results",
            query_execution_id=sf.JsonPath.string_at("$.QueryExecution.QueryExecutionId"),
            result_path=sf.JsonPath.string_at("$.GetQueryResults"),
        )

        #prepare the query to see if more results are available (up to 1000 can be retrieved)
        prepare_next_params = sf.Pass(self, "Prepare Next Query Params",
            parameters={
                "QueryExecutionId.$": "$.StartQueryParams.QueryExecutionId",
                "NextToken.$": "$.GetQueryResults.NextToken"
            },
            result_path=sf.JsonPath.string_at("$.StartQueryParams")
        )

        #check to see if more results are available
        has_more_results = sf.Choice(self, "Has More Results?").when(
                    sf.Condition.is_present("$.GetQueryResults.NextToken"),
                    prepare_next_params.next(get_query_results_job)
                ).otherwise(sf.Succeed(self, "Done"))


        #do something with each result
        #here add your own logic
        map_state = sf.Map(self, "Map State",
            max_concurrency=1,
            input_path=sf.JsonPath.string_at("$.GetQueryResults.ResultSet.Rows[1:]"), #skip the header row
            result_path=sf.JsonPath.DISCARD
        )
        map_state.iterator(sf.Pass(self, "DoSomething"))


        # Step Function to orchestrate the Athena query and retrieve the results
        workflow = sf.StateMachine(self, "AthenaQuery",
            definition=start_query_execution_job.next(get_query_results_job).next(map_state).next(has_more_results),
            timeout=Duration.minutes(60)
        )

        CfnOutput(self, "Logs",
            value=cf_access_logs.bucket_name, export_name='LogsBucket')

        CfnOutput(self, "SFName",
            value=workflow.state_machine_name, export_name='SFName')

        CfnOutput(self, "SFArn",
            value = workflow.state_machine_arn,
            export_name = 'StepFunctionArn',
            description = 'Step Function arn')
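        # The state machine can be started outside CDK once deployed, e.g. with
        # boto3 (a sketch; substitute the StepFunctionArn exported above):
        #
        #   import boto3
        #   sfn = boto3.client("stepfunctions")
        #   sfn.start_execution(stateMachineArn="<StepFunctionArn>")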