import re
import time

import boto3
import botocore

# AthenaUtil is an in-house helper for running Athena queries and saving the
# results to S3; the import path below is an assumption -- adjust it to
# wherever AthenaUtil actually lives.
from athena_util import AthenaUtil
class AthenaCTAS(object):
    region_name = 'us-west-2'
    client = boto3.client(service_name='glue',
                          region_name='us-west-2',
                          endpoint_url='https://glue.us-west-2.amazonaws.com')
    s3_staging_folder_csv = None
    s3_target_folder_parquet = None
    glue_script_location = None
    glue_job_name = None
    glue_role = None
    util = None
    glue_job = None

    def __init__(self, s3_staging_folder_csv, s3_target_folder_parquet,
                 glue_role, glue_script_location):
        """Constructor.

        Parameters:
            s3_staging_folder_csv: s3 folder (write permission required) where
                Athena saves CSV query results
            s3_target_folder_parquet: s3 folder where the Glue job writes Parquet
            glue_role: IAM role the Glue job runs as
            glue_script_location: s3 path of the Glue ETL script
        """
        print 's3_staging_folder_csv =', s3_staging_folder_csv
        print 's3_target_folder_parquet =', s3_target_folder_parquet
        self.s3_staging_folder_csv = s3_staging_folder_csv
        self.s3_target_folder_parquet = s3_target_folder_parquet

        # Athena initialization
        self.util = AthenaUtil(s3_staging_folder=self.s3_staging_folder_csv)

        # Derive the Glue job name from s3_target_folder_parquet: each path
        # component after the bucket is stripped of non-alphanumerics,
        # title-cased, and the pieces are joined with dots.
        job_name = '.'.join([
            re.sub('[^0-9a-zA-Z]+', '', x).title()
            for x in s3_target_folder_parquet.replace('s3://', '').split('/')[1:]
        ])
        self.glue_job_name = job_name
        self.glue_job = self.client.create_job(
            Name=self.glue_job_name,
            Role=glue_role,
            Command={
                'Name': 'glueetl',
                'ScriptLocation': glue_script_location
            })
        self.glue_script_location = glue_script_location

    def athena_query_execute_save_s3(self, sql_query):
        print "Started athena_query_execute_save_s3 .."
        print 'athena_ctas_query =', sql_query
        self.util.execute_save_s3(sql_query, self.s3_staging_folder_csv)
        return True

    def wait_for_job_to_complete(self, JobRunId):
        """Polls the Glue job run until it leaves a pending/running state."""
        print 'Waiting for JobName=%s and JobId=%s to complete processing ...' % (
            self.glue_job_name, JobRunId)
        error_count = 0
        response = self.client.get_job_run(JobName=self.glue_job_name,
                                           RunId=JobRunId)
        status = response["JobRun"]["JobRunState"]
        # JobRunState is one of:
        # 'STARTING'|'RUNNING'|'STOPPING'|'STOPPED'|'SUCCEEDED'|'FAILED'
        while status in ("QUEUED", "RUNNING", "STARTING"):
            try:
                response = self.client.get_job_run(JobName=self.glue_job_name,
                                                   RunId=JobRunId)
                status = response["JobRun"]["JobRunState"]
                time.sleep(0.5)
            except botocore.exceptions.ClientError as ce:
                error_count += 1
                if error_count > 3:
                    status = "FAILED"
                    print(str(ce))
                    break  # give up after repeated client errors
                if "ExpiredTokenException" in str(ce):
                    # Rebuild the client so the next poll picks up fresh credentials.
                    self.client = boto3.session.Session(
                        region_name=self.region_name).client('glue')
        if response is None:
            return {"SUCCESS": False, "STATUS": status}
        return response

    def glue_etl_execute_csv2parquet(self, target_table_name):
        print 'Starting glue_etl_execute_csv2parquet process for %s ...' % (
            self.glue_job_name)
        print 's3_staging_folder_csv =', self.s3_staging_folder_csv
        print 's3_target_folder_parquet =', self.s3_target_folder_parquet
        print ''
        response = self.client.start_job_run(
            JobName=self.glue_job_name,
            Arguments={
                '--s3_location_csv_file': self.s3_staging_folder_csv,
                '--s3_location_parquet_file': self.s3_target_folder_parquet,
                '--table_name': target_table_name
            })
        return response

    def job_cleanup(self):
        print self.banner("Job Cleanup")
        return self.client.delete_job(JobName=self.glue_job_name)

    @staticmethod
    def banner(text, ch='=', length=78):
        spaced_text = ' %s ' % text
        return spaced_text.center(length, ch)
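
# The Glue script at glue_script_location is not part of this file. Given the
# three arguments the class passes (--s3_location_csv_file,
# --s3_location_parquet_file, --table_name), a minimal sketch of what it might
# contain -- everything beyond those argument names is an assumption -- is:
#
#     import sys
#     from awsglue.utils import getResolvedOptions
#     from pyspark.context import SparkContext
#     from awsglue.context import GlueContext
#
#     args = getResolvedOptions(sys.argv, ['s3_location_csv_file',
#                                          's3_location_parquet_file',
#                                          'table_name'])
#     spark = GlueContext(SparkContext.getOrCreate()).spark_session
#
#     # Read the staged Athena CSV output and rewrite it as Parquet.
#     df = spark.read.csv(args['s3_location_csv_file'],
#                         header=True, inferSchema=True)
#     df.write.mode('overwrite').parquet(args['s3_location_parquet_file'])


# Example end-to-end usage -- a minimal sketch. The bucket names, role, script
# path, query, and table name below are hypothetical placeholders, and it is
# assumed that AthenaUtil.execute_save_s3 blocks until the CSV results land in
# the staging folder.
if __name__ == '__main__':
    ctas = AthenaCTAS(
        s3_staging_folder_csv='s3://my-bucket/staging/csv/',
        s3_target_folder_parquet='s3://my-bucket/target/parquet/',
        glue_role='MyGlueServiceRole',
        glue_script_location='s3://my-bucket/scripts/csv2parquet.py')

    # 1. Run the query; Athena saves the result set as CSV in the staging folder.
    ctas.athena_query_execute_save_s3('SELECT * FROM my_db.my_table')

    # 2. Start the Glue job that converts the staged CSV to Parquet and block
    #    until it reaches a terminal state.
    response = ctas.glue_etl_execute_csv2parquet('my_table')
    final = ctas.wait_for_job_to_complete(response['JobRunId'])
    print 'Final job run:', final.get('JobRun', final)

    # 3. Delete the single-use Glue job definition created in the constructor.
    ctas.job_cleanup()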