def _create_glue_job(
    self,
    context: DataJobContext,
    glue_job_name: str,
    s3_url_glue_job: str = None,
    arguments: dict = None,
    job_type: str = "pythonshell",
    python_version: str = "3",
    glue_version: str = None,
    max_capacity: int = None,
    worker_type: str = None,
    number_of_workers: int = None,
    *args,
    **kwargs,
) -> None:
    """Create a Glue job with the necessary configuration, such as the paths to
    the wheel and the business logic script, and the job arguments."""
    logger.debug(f"creating Glue Job {glue_job_name}")
    default_arguments = None
    if context.s3_url_wheel:
        extra_py_files = {
            # path to the wheel of this project
            "--extra-py-files": context.s3_url_wheel
        }
        # merge the wheel path with any user-supplied arguments
        default_arguments = {**extra_py_files, **(arguments or {})}
    glue.CfnJob(
        self,
        id=glue_job_name,
        name=glue_job_name,
        role=self.role.role_arn,
        command=glue.CfnJob.JobCommandProperty(
            name=job_type,
            python_version=python_version,
            script_location=s3_url_glue_job,
        ),
        glue_version=glue_version,
        max_capacity=max_capacity,
        default_arguments=default_arguments,
        worker_type=worker_type,
        number_of_workers=number_of_workers,
        *args,
        **kwargs,
    )
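# A minimal usage sketch (hypothetical names and values): called from the same
# construct's __init__, assuming `self.role` is set and `context` is a
# DataJobContext whose `s3_url_wheel` points at this project's packaged wheel.
self._create_glue_job(
    context=context,
    glue_job_name="my-pythonshell-job",
    s3_url_glue_job="s3://my-bucket/glue/my_job.py",
    arguments={"--stage": "dev"},
    glue_version="1.0",
    max_capacity=1,
)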
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    glue_job_role = iam.Role(
        self,
        "Glue-Job-Role",
        assumed_by=iam.ServicePrincipal("glue.amazonaws.com"),
        managed_policies=[
            iam.ManagedPolicy.from_aws_managed_policy_name("AmazonS3FullAccess")
        ],
    )

    job = glue.CfnJob(
        self,
        "glue-test-job",
        role=glue_job_role.role_arn,
        allocated_capacity=1,
        command=glue.CfnJob.JobCommandProperty(
            name="glueetl",
            script_location="s3://my-bucket/glue-scripts/job.scala"),
        glue_version="2.0",
    )

    # file_asset = aws_s3_assets.Asset(self, "glue-asssets", path=os.path.join(ROOT_DIR, "glue"))
    # print(file_asset.bucket)
    bucket_glue = aws_s3.Bucket(self, "BucketGlue")
    # file_asset = aws_s3_assets.Asset(self, "GlueAssets", path="/Users/vincent/Workspace/python_lambda_iac_deployment/python_lambda_iac_deployment/glue/glue_job.py")
    aws_s3_deployment.BucketDeployment(
        self,
        "GlueJobDeployment",
        sources=[
            aws_s3_deployment.Source.asset(
                "/Users/vincent/Workspace/python_lambda_iac_deployment/python_lambda_iac_deployment/glue"
            )
        ],
        destination_bucket=bucket_glue,
        destination_key_prefix="jobs")
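    # A hedged variation (assumption: the directory deployed above contains a
    # glue_job.py script under the "jobs" prefix), pointing the job at the
    # deployed bucket instead of the hard-coded s3://my-bucket location above.
    job_from_deployment = glue.CfnJob(
        self,
        "glue-deployed-job",
        role=glue_job_role.role_arn,
        command=glue.CfnJob.JobCommandProperty(
            name="glueetl",
            python_version="3",
            script_location=f"s3://{bucket_glue.bucket_name}/jobs/glue_job.py",
        ),
        glue_version="2.0",
    )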
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)
    # `modname` and `etl_role` are assumed to be defined elsewhere in the stack.
    glue.CfnJob(
        scope=self,
        id=modname,
        command={
            'name': 'glueetl',
            'python_version': '3',
            'script_location': 's3://shaw-stc-edl-etl-config-playpen/test/glue/spark/__main__.py'
        },
        role=etl_role.role_arn,
        default_arguments={'--enable-glue-datacatalog': ''},
        allocated_capacity=10,
        description='Test Spark Glue ETL',
        glue_version='1.0',
        max_capacity=10,
        max_retries=0,
        number_of_workers=1,
        timeout=2880,
        worker_type='Standard')
def __init__(
    self,
    scope: cdk.Construct,
    construct_id: str,
    stack_log_level: str,
    vpc,
    my_sql_db_sg,
    store_events_db_endpoint,
    sales_events_bkt,
    _glue_etl_role,
    glue_db_name: str,
    glue_table_name: str,
    tgt_db_secret,
    **kwargs,
) -> None:
    super().__init__(scope, construct_id, **kwargs)
    self.template_options.metadata = {"License": "Miztiik Corp."}

    # ADD Permissions to our Glue JOB Role to Access Secrets
    tgt_db_secret.grant_read(_glue_etl_role)

    # Create GLUE JDBC Connection for RDS MySQL
    # Allow ALL PORTS within SG for GLUE Connections to connect
    # https://docs.aws.amazon.com/glue/latest/dg/connection-defining.html#connection-properties-jdbc
    # https://docs.aws.amazon.com/glue/latest/dg/setup-vpc-for-glue-access.html
    # https://docs.amazonaws.cn/en_us/glue/latest/dg/connection-defining.html
    rds_mysql_conn_props = _glue.CfnConnection.ConnectionInputProperty(
        connection_type="JDBC",
        description="Glue Connection for RDS MySQL Store Events Database",
        name="rdsMySQL57Conn",
        physical_connection_requirements=_glue.CfnConnection.PhysicalConnectionRequirementsProperty(
            security_group_id_list=[my_sql_db_sg.security_group_id],
            subnet_id=vpc.select_subnets(
                subnet_type=_ec2.SubnetType.PRIVATE
            ).subnet_ids[1]
        ),
        connection_properties={
            "JDBC_CONNECTION_URL": f"jdbc:mysql://{store_events_db_endpoint}:3306/store_events",
            "JDBC_ENFORCE_SSL": "false",
            "USERNAME": "******",
            "PASSWORD": "******"
        }
    )

    rds_mysql_conn = _glue.CfnConnection(
        self,
        "rdsMySQLGlueConnection",
        catalog_id=f"{cdk.Aws.ACCOUNT_ID}",
        connection_input=rds_mysql_conn_props
    )

    # Create the Glue job to convert incoming JSON to parquet
    # Read Glue Spark Code
    try:
        with open(
            "stacks/back_end/glue_stacks/glue_job_scripts/load_json_to_rds.py",
            encoding="utf-8",
            mode="r",
        ) as f:
            load_json_to_rds = f.read()
    except OSError:
        print("Unable to read Glue Job Code")
        raise

    etl_script_asset = _s3_assets.Asset(
        self,
        "etlScriptAsset",
        path="stacks/back_end/glue_stacks/glue_job_scripts/load_json_to_rds.py"
    )

    self.etl_prefix = "stream-etl"

    _glue_etl_job = _glue.CfnJob(
        self,
        "glues3ToRdsIngestorJob",
        name="s3-to-rds-ingestor",
        description="Glue Job to ingest JSON data from S3 to RDS",
        role=_glue_etl_role.role_arn,
        glue_version="2.0",
        command=_glue.CfnJob.JobCommandProperty(
            name="glueetl",
            script_location=f"s3://{etl_script_asset.s3_bucket_name}/{etl_script_asset.s3_object_key}",
            python_version="3"
        ),
        connections={"connections": [rds_mysql_conn_props.name]},
        default_arguments={
            # Glue default arguments are string key/value pairs
            "--enable-metrics": "true",
            "--enable-continuous-cloudwatch-log": "true",
            "--job-bookmark-option": "job-bookmark-enable",
            "--TempDir": f"s3://{sales_events_bkt.bucket_name}/bookmarks",
            "--src_db_name": glue_db_name,
            "--src_etl_bkt": f"{sales_events_bkt.bucket_name}",
            "--crawler_tbl_prefix": "txns_",
            "--tgt_db_secret_arn": tgt_db_secret.secret_arn,
            "--tgt_tbl_name": glue_table_name,
            "--conn_name": f"{rds_mysql_conn_props.name}"
        },
        allocated_capacity=1,
        # timeout=2,
        max_retries=2,
        execution_property=_glue.CfnJob.ExecutionPropertyProperty(
            max_concurrent_runs=2)
    )

    # Configure a Trigger - Every hour
    _glue_etl_job_trigger = _glue.CfnTrigger(
        self,
        "glueEtlJobtrigger",
        type="SCHEDULED",
        description="Miztiik Automation: Trigger S3 to RDS Ingestor glue job every hour",
        schedule="cron(0 1 * * ? *)",
        start_on_creation=False,
        actions=[
            _glue.CfnTrigger.ActionProperty(
                job_name=f"{_glue_etl_job.name}",
                timeout=2
            )
        ]
    )
    _glue_etl_job_trigger.add_depends_on(_glue_etl_job)

    # Configure Glue Workflow
    _glue_etl_job_workflow = _glue.CfnWorkflow(
        self,
        "glueEtlJobWorkflow"
    )

    ###########################################
    ################# OUTPUTS #################
    ###########################################
    output_0 = cdk.CfnOutput(
        self,
        "AutomationFrom",
        value=f"{GlobalArgs.SOURCE_INFO}",
        description="To know more about this automation stack, check out our github page.",
    )
    output_1 = cdk.CfnOutput(
        self,
        "RDSIngestorETLGlueJob",
        value=f"https://console.aws.amazon.com/gluestudio/home?region={cdk.Aws.REGION}#/jobs",
        description="Glue Job to ingest JSON data from S3 to RDS.",
    )
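    # A hedged sketch (not in the original stack): the CfnWorkflow created above
    # is never wired to the job; one way to attach it is a trigger that names
    # the workflow. Construct identifiers here are hypothetical.
    _glue_etl_workflow_trigger = _glue.CfnTrigger(
        self,
        "glueEtlWorkflowTrigger",
        type="ON_DEMAND",
        workflow_name=_glue_etl_job_workflow.ref,  # Ref of AWS::Glue::Workflow is its name
        actions=[
            _glue.CfnTrigger.ActionProperty(job_name=_glue_etl_job.name)
        ]
    )
    _glue_etl_workflow_trigger.add_depends_on(_glue_etl_job_workflow)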
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    with open('./props/tasksetting.json', 'r') as f1:
        py_json1 = json.load(f1)
        ts = json.dumps(py_json1)

    # with open('./props/mappingrule.json', 'r') as f2:
    #     py_json2 = json.load(f2)
    #     mr = json.dumps(py_json2)

    with open('./props/config.json', 'r') as f2:
        configuration = json.load(f2)

    def getMappingrules(self, table_list):
        rules = []
        for index, value in enumerate(table_list, 1):
            rules.append({
                "rule-type": "selection",
                "rule-id": str(index),
                "rule-name": str(index),
                "object-locator": {
                    "schema-name": value['schemaName'],
                    "table-name": value['tableName']
                },
                "rule-action": "include",
                "filters": []
            })
        mapping_rules = {"rules": rules}
        return json.dumps(mapping_rules)

    # The code that defines your stack goes here
    S3Accessrole = _iam.Role(
        self,
        'dmsrole',
        assumed_by=_iam.ServicePrincipal('dms.amazonaws.com'),
        managed_policies=[
            _iam.ManagedPolicy.from_aws_managed_policy_name('AmazonS3FullAccess')
        ])

    raw_bucket = s3.Bucket(self, 'rawbucket', bucket_name='rawbucket-datalake-cdk-oregon')
    raw_bucket.add_lifecycle_rule(
        enabled=configuration['s3LifecycleRule']['enabled'],
        expiration=core.Duration.days(configuration['s3LifecycleRule']['expiration']))

    # my_table = ddb.Table(self, id='dunamoTable', table_name='testcdktable',
    #                      partition_key=ddb.Attribute(name='lastname', type=ddb.AttributeType.STRING))

    dl_dms = _dms.CfnReplicationInstance(
        self,
        'dmsreplication',
        replication_instance_class=configuration['DMS_instance_setting']['instance_class'],
        replication_instance_identifier='datalake-instance-cdk',
        allocated_storage=configuration['DMS_instance_setting']['allocated_storage'])

    source_endpoint = _dms.CfnEndpoint(
        self,
        'sourceendpoint',
        endpoint_type='source',
        engine_name=configuration['engineName'],
        database_name=configuration['databaseName'],
        username=configuration['username'],
        password=configuration['password'],
        port=configuration['port'],
        server_name=configuration['serverName'],
    )

    target_endpoint = _dms.CfnEndpoint(
        self,
        'targetendpoint',
        endpoint_type='target',
        engine_name='s3',
        s3_settings={
            'bucketName': raw_bucket.bucket_name,
            'serviceAccessRoleArn': S3Accessrole.role_arn
        },
        extra_connection_attributes='dataFormat=parquet')

    dms_task = _dms.CfnReplicationTask(
        self,
        'data2lake-task',
        migration_type='full-load-and-cdc',
        replication_instance_arn=dl_dms.ref,
        source_endpoint_arn=source_endpoint.ref,
        target_endpoint_arn=target_endpoint.ref,
        replication_task_settings=ts,
        table_mappings=getMappingrules(self, configuration['tableList']))

    my_table = ddb.Table(
        self,
        id='dynamoTable',
        table_name='ControllerTable',
        partition_key=ddb.Attribute(name='path', type=ddb.AttributeType.STRING),
        billing_mode=ddb.BillingMode.PAY_PER_REQUEST)

    datalake_bucket = s3.Bucket(self, 'datalakebucket', bucket_name='datalake-bucket-cdk-oregon')

    glue_role = _iam.Role(
        self,
        'gluerole',
        assumed_by=_iam.ServicePrincipal('glue.amazonaws.com'),
        managed_policies=[
            _iam.ManagedPolicy.from_aws_managed_policy_name('service-role/AWSGlueServiceRole')
        ])
    raw_bucket.grant_read(glue_role)
    datalake_bucket.grant_read_write(glue_role)

    # Lake Formation settings
    # If you have attached the managed policy ('AWSLakeFormationDataAdmin') to your own IAM user,
    # you should change that policy to allow "lakeformation:PutDataLakeSettings",
    # so that the lake settings below can be applied by the CDK.
    lake_admin_setting = _lakeformation.CfnDataLakeSettings(
        self,
        'data-lake-GrantAdmin',
        admins=[
            _lakeformation.CfnDataLakeSettings.DataLakePrincipalProperty(
                data_lake_principal_identifier=configuration['executiveArn'])
        ])

    glue_database = _glue.Database(self, 'gluedatabase', database_name='data_lake_gluedb')
    glue_database.node.add_dependency(lake_admin_setting)

    glue_role_permission_inLakeFormation = _lakeformation.CfnPermissions(
        self,
        'permission-glueRole',
        data_lake_principal=_lakeformation.CfnPermissions.DataLakePrincipalProperty(
            data_lake_principal_identifier=glue_role.role_arn),
        resource=_lakeformation.CfnPermissions.ResourceProperty(
            database_resource=_lakeformation.CfnPermissions.DatabaseResourceProperty(
                name=glue_database.database_name)),
        permissions=['ALL'])

    crawler = _glue.CfnCrawler(
        self,
        'datalakecrawler',
        name='Crawler-datalake-cdk',
        role=glue_role.role_arn,
        targets={
            's3Targets': [{
                'path': 's3://' + datalake_bucket.bucket_name + '/datalake/'
            }]
        },
        database_name='data_lake_gluedb',
        configuration="{\"Version\":1.0,\"CrawlerOutput\":{\"Partitions\":{\"AddOrUpdateBehavior\":\"InheritFromTable\"},\"Tables\":{\"AddOrUpdateBehavior\":\"MergeNewColumns\"}}}")

    initialload_script = S3Assets.Asset(self, 'initial-load-code', path='./Gluejob/InitialLoad.py')
    incrementalload_script = S3Assets.Asset(self, 'incremental-load-code', path='./Gluejob/IncrementalLoad.py')
    initialload_script.grant_read(glue_role)
    incrementalload_script.grant_read(glue_role)
    my_table.grant_full_access(glue_role)

    initial_load_job = _glue.CfnJob(
        self,
        'initial-job',
        name='InitialLoad-cdk',
        command=_glue.CfnJob.JobCommandProperty(
            name='glueetl',
            python_version='3',
            script_location='s3://' + initialload_script.s3_bucket_name + '/' + initialload_script.s3_object_key),
        role=glue_role.role_arn,
        default_arguments={
            '--prefix': str(configuration['tableList']),
            '--bucket': raw_bucket.bucket_name,
            '--datalake_bucket': datalake_bucket.bucket_name,
            '--datalake_prefix': 'datalake/',
            '--region': CdkpyStack.of(self).region,
            '--controller_table_name': my_table.table_name
        },
        allocated_capacity=configuration['glue_job_setting']['job_capacity'],
        execution_property=_glue.CfnJob.ExecutionPropertyProperty(
            max_concurrent_runs=configuration['glue_job_setting']['max_concurrent_run_JobExecution']))

    incremental_load_job = _glue.CfnJob(
        self,
        'increment-job',
        name='IncrementalLoad-cdk',
        command=_glue.CfnJob.JobCommandProperty(
            name='glueetl',
            script_location='s3://' + incrementalload_script.s3_bucket_name + '/' + incrementalload_script.s3_object_key,
            python_version='3'),
        role=glue_role.role_arn,
        default_arguments={
            '--prefix': str(configuration['tableList']),
            '--bucket': raw_bucket.bucket_name,
            '--datalake_bucket': datalake_bucket.bucket_name,
            '--datalake_prefix': 'datalake/',
            '--region': CdkpyStack.of(self).region,
            '--controller_table_name': my_table.table_name
        },
        allocated_capacity=2,
        execution_property=_glue.CfnJob.ExecutionPropertyProperty(
            max_concurrent_runs=1))

    job_trigger = _glue.CfnTrigger(
        self,
        'datalake-glue-trigger',
        type='SCHEDULED',
        schedule=configuration['job_trigger_schedule'],
        start_on_creation=False,
        actions=[
            _glue.CfnTrigger.ActionProperty(job_name='IncrementalLoad-cdk')
        ])

    dl_sns = _sns.Topic(self, 'datalake_sns', display_name='data-lake-sns')
    endpoint_email = configuration['emailSubscriptionList']
    for emails in endpoint_email:
        dl_sns.add_subscription(_subscrption.EmailSubscription(emails))
    # Another way to subscribe:
    # dl_subscription = _sns.Subscription(self, 'email-subscrption', topic=dl_sns,
    #                                     endpoint='*****@*****.**',
    #                                     protocol=_sns.SubscriptionProtocol.EMAIL)

    glue_events_target = _events_targets.SnsTopic(dl_sns)
    glue_events_rule = _events.Rule(
        self,
        'gluejobevents-datalake',
        description='Using for tracking the failed glue job of data lake',
        rule_name='dl-gluejob-event',
        event_pattern=_events.EventPattern(
            source=['aws.glue'],
            detail_type=['Glue Job State Change'],
            detail={
                "jobName": [initial_load_job.name],
                "state": ["FAILED"]
            }),
        targets=[glue_events_target])

    dms_subscription = _dms.CfnEventSubscription(
        self,
        'dmsevents-datalake',
        sns_topic_arn=dl_sns.topic_arn,
        subscription_name='datalake-dmsevents',
        source_type='replication-task',
        event_categories=['failure'])
def __init__(
    self,
    scope: cdk.Construct,
    construct_id: str,
    stack_log_level: str,
    glue_db_name: str,
    glue_table_name: str,
    etl_bkt,
    src_stream,
    **kwargs,
) -> None:
    super().__init__(scope, construct_id, **kwargs)
    self.template_options.metadata = {"License": "Miztiik Corp."}

    # Glue Job IAM Role
    self._glue_etl_role = _iam.Role(
        self,
        "glueJobRole",
        assumed_by=_iam.ServicePrincipal("glue.amazonaws.com"),
        managed_policies=[
            _iam.ManagedPolicy.from_aws_managed_policy_name("AmazonS3ReadOnlyAccess"),
            _iam.ManagedPolicy.from_aws_managed_policy_name("service-role/AWSGlueServiceRole")
        ])
    self._glue_etl_role.add_to_policy(
        _iam.PolicyStatement(
            actions=["s3:*"],
            resources=[f"{etl_bkt.bucket_arn}", f"{etl_bkt.bucket_arn}/*"]))
    self._glue_etl_role.add_to_policy(
        _iam.PolicyStatement(
            actions=["kinesis:DescribeStream"],
            resources=[f"{src_stream.stream_arn}"]))
    src_stream.grant_read(self._glue_etl_role)

    # Create the Glue job to convert incoming JSON to parquet
    # Read Glue Spark Code
    try:
        with open(
            "stacks/back_end/glue_stacks/glue_job_scripts/kinesis_streams_batch_to_s3_etl.py",
            encoding="utf-8",
            mode="r",
        ) as f:
            kinesis_streams_batch_to_s3_etl = f.read()
    except OSError:
        print("Unable to read Glue Job Code")
        raise

    etl_script_asset = _s3_assets.Asset(
        self,
        "etlScriptAsset",
        path="stacks/back_end/glue_stacks/glue_job_scripts/kinesis_streams_batch_to_s3_etl.py"
    )

    self.etl_prefix = "stream-etl"

    _glue_etl_job = _glue.CfnJob(
        self,
        "glueJsonToParquetJob",
        name="stream-etl-processor",
        description="Glue Job to process stream of events from Kinesis data stream and store them in parquet format in S3",
        role=self._glue_etl_role.role_arn,
        glue_version="2.0",
        command=_glue.CfnJob.JobCommandProperty(
            name="gluestreaming",
            script_location=f"s3://{etl_script_asset.s3_bucket_name}/{etl_script_asset.s3_object_key}",
            python_version="3"),
        default_arguments={
            "--src_db_name": glue_db_name,
            "--src_tbl_name": glue_table_name,
            "--datalake_bkt_name": etl_bkt.bucket_name,
            "--datalake_bkt_prefix": f"{self.etl_prefix}/",
            "--job-bookmark-option": "job-bookmark-enable"
        },
        allocated_capacity=1,
        # timeout=2,
        max_retries=2,
        execution_property=_glue.CfnJob.ExecutionPropertyProperty(
            max_concurrent_runs=1))

    # Configure a Trigger - Every hour
    _glue_etl_job_trigger = _glue.CfnTrigger(
        self,
        "glueEtlJobtrigger",
        type="SCHEDULED",
        description="Miztiik Automation: Trigger streaming etl glue job every hour",
        schedule="cron(0 1 * * ? *)",
        start_on_creation=False,
        actions=[
            _glue.CfnTrigger.ActionProperty(
                job_name=f"{_glue_etl_job.name}",
                timeout=2)
        ])
    _glue_etl_job_trigger.add_depends_on(_glue_etl_job)

    ###########################################
    ################# OUTPUTS #################
    ###########################################
    output_0 = cdk.CfnOutput(
        self,
        "AutomationFrom",
        value=f"{GlobalArgs.SOURCE_INFO}",
        description="To know more about this automation stack, check out our github page.",
    )
    output_1 = cdk.CfnOutput(
        self,
        "StreamingETLGlueJob",
        value=f"https://console.aws.amazon.com/gluestudio/home?region={cdk.Aws.REGION}#/jobs",
        description="Glue ETL Job.",
    )
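    # A hedged sketch (not part of the original snippet): glue_db_name and
    # glue_table_name are assumed to refer to a Data Catalog table backed by the
    # Kinesis stream, which could be declared roughly like this.
    src_stream_table = _glue.CfnTable(
        self,
        "kinesisSourceTable",
        catalog_id=cdk.Aws.ACCOUNT_ID,
        database_name=glue_db_name,
        table_input=_glue.CfnTable.TableInputProperty(
            name=glue_table_name,
            parameters={"classification": "json"},
            storage_descriptor=_glue.CfnTable.StorageDescriptorProperty(
                location=src_stream.stream_name,
                parameters={
                    "typeOfData": "kinesis",
                    "streamARN": src_stream.stream_arn
                })))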
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    policy_statement = iam.PolicyStatement(
        actions=['logs:*', 's3:*', 'iam:*', 'cloudwatch:*', 'dynamodb:*', 'glue:*'])
    policy_statement.add_all_resources()

    # define role
    glue_job_role = iam.Role(
        self,
        'Glue-Job-Role',
        assumed_by=iam.ServicePrincipal('glue.amazonaws.com'))
    glue_job_role.add_to_policy(policy_statement)

    # define job
    job = glue.CfnJob(
        self,
        'glue-test-job',
        role=glue_job_role.role_arn,
        allocated_capacity=10,
        worker_type="G.1X",
        command=glue.CfnJob.JobCommandProperty(
            name='glueetl',
            script_location='s3://base-nonprod/GlueETLScripts//hello.py'))

    # create inline statement, policy then role
    # statement = iam.PolicyStatement(actions=["s3:GetObject", "s3:PutObject"],
    #                                 resources=["arn:aws:s3:::mybucketname",
    #                                            "arn:aws:s3:::mybucketname/data_warehouse/units/*"])
    # write_to_s3_policy = iam.PolicyDocument(statements=[statement])
    # glue_role = iam.Role(
    #     self, 'GlueCrawlerFormyDataScienceRole',
    #     role_name='GlueCrawlerFormyDataScienceRole',
    #     inline_policies=[write_to_s3_policy],
    #     assumed_by=iam.ServicePrincipal('glue.amazonaws.com'),
    #     managed_policies=[iam.ManagedPolicy.from_aws_managed_policy_name('service-role/AWSGlueServiceRole')]
    # )

    # define crawler
    # glue_crawler = glue.CfnCrawler(
    #     self, 'glue-crawler-id',
    #     description="Glue Crawler for my-data-science-s3",
    #     name='any name',
    #     database_name='units',
    #     schedule={"scheduleExpression": "cron(5 * * * ? *)"},
    #     role=glue_role.role_arn,
    #     targets={"s3Targets": [{"path": "s3://mybucketname/data_warehouse/units"}]}
    # )

    # from aws_cdk import core as cdk
    # from aws_cdk import aws_glue as glue
    #
    # The equivalent TypeScript version, kept for reference:
    # import * as glue from "@aws-cdk/aws-glue";
    # import * as s3 from "@aws-cdk/aws-s3";
    # import * as s3Deployment from "@aws-cdk/aws-s3-deployment";
    # import * as iam from "@aws-cdk/aws-iam";
    # import { replaceValues } from "./lib";
    # import { config } from "dotenv";
    # config();
    #
    # const PYTHON_VERSION = "3";
    # const GLUE_VERSION = "1.0";
    #
    # // This value must be glueetl for Apache Spark
    # const COMMAND_NAME = "glueetl";
    #
    # const { RTK, COLLECTIONS, BUCKET_NAME } = process.env;
    #
    # class GlueETLStack extends cdk.Stack {
    #   constructor(scope: cdk.Construct, id: string, props?: cdk.StackProps) {
    #     super(scope, id, props);
    #
    #     const s3Bucket = new s3.Bucket(this, "etl-bucket", {
    #       bucketName: BUCKET_NAME,
    #       removalPolicy: cdk.RemovalPolicy.DESTROY
    #     });
    #
    #     const dependenciesDeployment = new s3Deployment.BucketDeployment(this, "dependencies-deployment", {
    #       sources: [s3Deployment.Source.asset("../dependencies")],
    #       destinationBucket: s3Bucket,
    #       destinationKeyPrefix: "dependencies"
    #     });
    #
    #     // Replace hardcoded values in script
    #     // replaceValues(
    #     //   "scripts/script.py",
    #     //   RTK as string,
    #     //   MONGO_SERVER as string,
    #     //   MONGO_USER as string,
    #     //   MONGO_PASSWORD as string,
    #     //   MONGO_PORT as string,
    #     //   MONGO_SSL == "true" ? "True" : "False",
    #     //   MONGO_DATABASE as string,
    #     //   `s3://${BUCKET_NAME}/${MONGO_DATABASE as string}/`,
    #     //   COLLECTIONS as string
    #     // );
    #
    #     const scriptsDeployment = new s3Deployment.BucketDeployment(this, "scripts-deployment", {
    #       sources: [s3Deployment.Source.asset("scripts")],
    #       destinationBucket: s3Bucket,
    #       destinationKeyPrefix: "scripts"
    #     });
    #
    #     const glueRole = new iam.Role(this, "glue-role", {
    #       roleName: "glue-etl-role",
    #       assumedBy: new iam.ServicePrincipal("glue.amazonaws.com"),
    #       managedPolicies: [
    #         iam.ManagedPolicy.fromAwsManagedPolicyName("AmazonS3FullAccess")
    #       ],
    #     });
    #
    #     const glueJob = new glue.CfnJob(this, "glue-job", {
    #       name: "glue-job",
    #       role: glueRole.roleArn,
    #       command: {
    #         name: COMMAND_NAME,
    #         pythonVersion: PYTHON_VERSION,
    #         scriptLocation: `s3://${s3Bucket.bucketName}/scripts/script.py`
    #       },
    #       glueVersion: GLUE_VERSION,
    #       defaultArguments: {
    #         "--extra-jars": `s3://${s3Bucket.bucketName}/${JDBC_PATH}`
    #       }
    #     });
    #
    #     const glueTrigger = new glue.CfnTrigger(this, "glue-trigger", {
    #       name: "etl-trigger",
    #       schedule: "cron(5 * * * ? *)",
    #       type: "SCHEDULED",
    #       actions: [
    #         {
    #           jobName: glueJob.name
    #         }
    #       ],
    #       startOnCreation: true
    #     });
    #     glueTrigger.addDependsOn(glueJob);
    #   }
    # }
def __init__(self, app: core.App, cfn_name: str, stack_env):
    super().__init__(scope=app, id=f"{cfn_name}-{stack_env}")

    glue_code = s3_assets.Asset(
        scope=self,
        id=f"{cfn_name}-glue-script",
        path="./glue_script/glue_job_script.py",
    )

    glue_s3_access_role = iam.Role(
        scope=self,
        id=f"glue_s3_access_role_{stack_env}",
        role_name=f"glue_s3_access_role_{stack_env}",
        assumed_by=iam.ServicePrincipal("glue.amazonaws.com"))
    # add policy to access S3
    glue_s3_access_role.add_to_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            resources=["*"],
            actions=["s3:*"]))
    # add policy to access CloudWatch Logs
    glue_s3_access_role.add_to_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            resources=["arn:aws:logs:*:*:*"],
            actions=[
                "logs:CreateLogGroup",
                "logs:CreateLogStream",
                "logs:PutLogEvents",
                "logs:DescribeLogStreams"
            ]))

    # Glue job
    # Specify the name explicitly, because the deployed name cannot be obtained otherwise.
    glue_job_name = f"{cfn_name}-glue-job"
    _ = glue.CfnJob(
        scope=self,
        id=glue_job_name,
        name=glue_job_name,
        command=glue.CfnJob.JobCommandProperty(
            # glueetl or pythonshell
            name=self.GLUE_JOB_COMMAND_GLUE_ETL,
            script_location=f"s3://{glue_code.s3_bucket_name}/{glue_code.s3_object_key}"),
        # set role-name!
        role=glue_s3_access_role.role_name,
        glue_version=self.GLUE_VERSION_2_0,
        number_of_workers=2,
        worker_type=self.GLUE_WORKER_TYPE_STANDARD,
        timeout=1800)

    # Step Functions tasks
    sfn_task_pass = sfn.Pass(
        scope=self,
        id=f"{cfn_name}-sfn-pass",
        comment="pass example",
        input_path="$",
        result_path="$.source",
        result=sfn.Result.from_string("example"),
        output_path="$")

    # wait until the job completes: sfn.IntegrationPattern.RUN_JOB
    # proceed to the next step without waiting: sfn.IntegrationPattern.REQUEST_RESPONSE
    sfn_task_glue_job = sfn_tasks.GlueStartJobRun(
        scope=self,
        id=f"{cfn_name}-sfn-lambda-task",
        glue_job_name=glue_job_name,
        integration_pattern=sfn.IntegrationPattern.RUN_JOB,
        input_path="$",
        result_path="$.result",
        output_path="$.output")

    # Step Functions state machine
    definition = sfn_task_pass.next(sfn_task_glue_job)
    _ = sfn.StateMachine(
        scope=self,
        id=f"{cfn_name}-SFn-{stack_env}",
        definition=definition)
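# A hedged sketch of what ./glue_script/glue_job_script.py referenced above might
# contain: the standard glueetl boilerplate (the script contents are an assumption,
# not part of the original stack).
import sys

from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext

args = getResolvedOptions(sys.argv, ["JOB_NAME"])
glue_context = GlueContext(SparkContext.getOrCreate())
job = Job(glue_context)
job.init(args["JOB_NAME"], args)
# ... transformation logic would go here ...
job.commit()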
def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
    super().__init__(scope, construct_id, **kwargs)

    # Glue job execution IAM Role
    glue_job_role = iam.Role(
        self,
        'Glue-Job-Role',
        assumed_by=iam.ServicePrincipal('glue.amazonaws.com'),
        managed_policies=[
            iam.ManagedPolicy.from_aws_managed_policy_name('service-role/AWSGlueServiceRole')
        ])

    S3_BUCKET_NAME = "MyCdkGlueJobBucket"

    # S3 Bucket to host glue scripts
    bucket = s3.Bucket(
        self,
        S3_BUCKET_NAME,
        versioned=True,
        removal_policy=RemovalPolicy.DESTROY,
        auto_delete_objects=True,
        block_public_access=s3.BlockPublicAccess.BLOCK_ALL)

    # Asset to sync local scripts folder with the S3 bucket
    asset = s3deploy.Source.asset("./resources/glue-scripts")

    # Sync local scripts with the S3 bucket
    s3deploy.BucketDeployment(
        self,
        "DeployGlueJobScripts",
        sources=[asset],
        destination_bucket=bucket,
        destination_key_prefix="glue-python-scripts")

    # Grant read/write access on the S3 bucket to the Glue execution IAM role
    bucket.grant_read_write(glue_job_role)

    scriptLocation = 's3://' + bucket.bucket_name + '/glue-python-scripts/hello.py'

    # Python-shell Glue job
    job = glue.CfnJob(
        self,
        'Glue-job',
        name='cdk-test-glue-python-job',
        role=glue_job_role.role_arn,
        command=glue.CfnJob.JobCommandProperty(
            name='pythonshell',
            python_version='3',
            script_location=scriptLocation))

    # Glue Start Job Run task for the Step Functions state machine (integration_pattern = .sync)
    glue_task = sfn_tasks.GlueStartJobRun(
        self,
        "Task",
        glue_job_name=job.name,
        integration_pattern=sfn.IntegrationPattern.RUN_JOB,
        arguments=sfn.TaskInput.from_object(
            {"--message": sfn.JsonPath.string_at("$.message")}),
        timeout=Duration.minutes(6),
        notify_delay_after=Duration.minutes(6))

    # State machine definition
    definition = glue_task
    state_machine = sfn.StateMachine(
        self,
        "GlueJobStateMachine",
        definition=definition,
        timeout=Duration.minutes(10))

    # CDK Outputs
    CfnOutput(scope=self, id='StateMachineArn', value=state_machine.state_machine_arn)
    CfnOutput(scope=self, id='GlueJobName', value=job.name)
    CfnOutput(scope=self, id='S3BucketName', value=bucket.bucket_name)
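# A hedged sketch of what resources/glue-scripts/hello.py might look like
# (hypothetical contents): a python-shell script that reads the --message
# argument passed in by the Step Functions task above.
import sys

from awsglue.utils import getResolvedOptions

args = getResolvedOptions(sys.argv, ["message"])
print(f"Hello from Glue: {args['message']}")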
def __init__(self, scope: core.Construct, construct_id: str, **kwargs) -> None:
    super().__init__(scope, construct_id, **kwargs)

    policy_statement = iam.PolicyStatement(
        actions=['logs:*', 's3:*', 'iam:*', 'cloudwatch:*', 'glue:*'])
    policy_statement.add_all_resources()

    my_lambda = _lambda.Function(
        self,
        'lambdaHandler',
        runtime=_lambda.Runtime.PYTHON_3_8,
        code=_lambda.Code.asset('lambda'),
        handler='handler.handler',
    )

    my_lambda_role = iam.Role(
        self,
        'my_lambda_role',
        assumed_by=iam.ServicePrincipal('lambda.amazonaws.com')
    )
    my_lambda_role.add_to_policy(policy_statement)

    my_bucket = _s3.Bucket(
        self,
        id='s3buckettest',
        bucket_name='csvconverterbv',
    )

    notification = aws_s3_notifications.LambdaDestination(my_lambda)
    my_bucket.add_event_notification(_s3.EventType.OBJECT_CREATED, notification)

    glue_job_role = iam.Role(
        self,
        'Glue-Job-Role',
        assumed_by=iam.ServicePrincipal('glue.amazonaws.com')
    )
    glue_job_role.add_to_policy(policy_statement)

    code_bucket = _s3.Bucket.from_bucket_attributes(
        self,
        'CodeBucket',
        bucket_name='csvconverterbv'
    )
    aws_s3_deployment.BucketDeployment(
        self,
        'S3Deployment',
        destination_bucket=code_bucket,
        sources=[aws_s3_deployment.Source.asset('glue/')],
        destination_key_prefix='glue/'
    )

    job = glue.CfnJob(
        self,
        'glue-test-job',
        name='glue-test-job',
        role=glue_job_role.role_arn,
        allocated_capacity=10,
        command=glue.CfnJob.JobCommandProperty(
            name='glueetl',
            script_location='s3://csvconverterbv/glue/gluejob.py'
        ))
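# A hedged sketch of what lambda/handler.py might do (hypothetical contents):
# start the Glue job defined above for each object that triggered the S3
# OBJECT_CREATED notification. The --input_key argument name is an assumption.
import boto3

glue_client = boto3.client("glue")

def handler(event, context):
    for record in event.get("Records", []):
        key = record["s3"]["object"]["key"]
        glue_client.start_job_run(
            JobName="glue-test-job",
            Arguments={"--input_key": key},  # hypothetical job argument
        )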