Example #1
 def athena_query_execute_save_s3(self):
     """Run self.sql_query through Athena and save the result set to the S3 staging folder."""
     text = "AthenaUtil Initialization .."
     print(text)
     util = AthenaUtil(s3_staging_folder=self.s3_staging_folder_csv)
     text = "Started athena_query_execute_save_s3 .."
     print(text)
     print('athena_ctas_query= ', self.sql_query)
     util.execute_save_s3(self.sql_query, self.s3_staging_folder_csv)
     return True
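
Below is a minimal, hedged usage sketch for the method above (not part of the original example). It assumes a host class that stores sql_query and s3_staging_folder_csv, and that AthenaUtil can be imported from a project helper module; the module name athena_util, the class name AthenaQueryRunner, the query, and the bucket path are illustrative placeholders.

from athena_util import AthenaUtil  # assumed helper module providing execute_save_s3()


class AthenaQueryRunner(object):
    """Minimal host class so the snippet above has the attributes it expects."""

    def __init__(self, sql_query, s3_staging_folder_csv):
        self.sql_query = sql_query                          # Athena SQL to execute
        self.s3_staging_folder_csv = s3_staging_folder_csv  # writable S3 staging prefix

    def athena_query_execute_save_s3(self):
        # Same logic as Example #1, repeated here only to keep the sketch self-contained.
        util = AthenaUtil(s3_staging_folder=self.s3_staging_folder_csv)
        util.execute_save_s3(self.sql_query, self.s3_staging_folder_csv)
        return True


runner = AthenaQueryRunner(
    sql_query="SELECT * FROM my_db.my_table LIMIT 10",           # placeholder query
    s3_staging_folder_csv="s3://example-bucket/athena-staging/")  # placeholder bucket
runner.athena_query_execute_save_s3()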
Example #2
import re
import time

import boto3
import botocore.exceptions

# Note: AthenaUtil is a project helper class; its import is not shown in the original example.


class AthenaCTAS(object):
    region_name = 'us-west-2'
    client = boto3.client(service_name='glue',
                          region_name='us-west-2',
                          endpoint_url='https://glue.us-west-2.amazonaws.com')

    s3_staging_folder_csv = None
    s3_target_folder_parquet = None
    glue_script_location = None
    glue_job_name = None
    glue_role = None
    util = None
    glue_job = None

    def __init__(self, s3_staging_folder_csv, s3_target_folder_parquet,
                 glue_role, glue_script_location):
        """ constructor requires s3 staging folder for storing results
        Parameters:
        s3_staging_folder = s3 folder with write permissions for storing results
        """
        print 's3_staging_folder_csv=', s3_staging_folder_csv
        print 's3_target_folder_parquet = ', s3_target_folder_parquet
        self.s3_staging_folder_csv = s3_staging_folder_csv
        self.s3_target_folder_parquet = s3_target_folder_parquet
        #         self.sql_query = sql_query
        # Athena Initialization
        self.util = AthenaUtil(s3_staging_folder=self.s3_staging_folder_csv)
        # Derive the Glue job name from the s3_target_folder_parquet path
        job_name = '.'.join([
            re.sub('[^0-9a-zA-Z]+', '', x).title()
            for x in s3_target_folder_parquet.replace('s3://', '').split('/')[1:]
        ])
        self.glue_job_name = job_name  # e.g. 'athena_ctas_part2'
        self.glue_job = self.client.create_job(
            Name=self.glue_job_name,
            Role=glue_role,
            Command={
                'Name': 'glueetl',
                'ScriptLocation': glue_script_location
            })

        self.glue_script_location = glue_script_location  # 's3://move-dataeng-temp-dev/glue-etl/scripts/athena_ctas_part2.py'

    def athena_query_execute_save_s3(self, sql_query):
        text = "Started athena_query_execute_save_s3 .."
        print(text)
        #         self.banner(text)
        print('athena_ctas_query= ', sql_query)
        self.util.execute_save_s3(sql_query, self.s3_staging_folder_csv)
        return True

    def wait_for_job_to_complete(self, JobRunId):
        """ waits for the Glue job run to finish """
        text = 'Waiting for JobName = %s and JobId=%s to Complete processing ...' % (
            self.glue_job_name, JobRunId)
        print(text)
        #         self.banner(text)
        status = "STARTING"  # assumed
        error_count = 0
        response = None
        response = self.client.get_job_run(JobName=self.glue_job_name,
                                           RunId=JobRunId)
        status = response["JobRun"]["JobRunState"]
        # 'JobRunState': 'STARTING'|'RUNNING'|'STOPPING'|'STOPPED'|'SUCCEEDED'|'FAILED'
        while status in ("QUEUED", "RUNNING", "STARTING"):
            try:
                response = self.client.get_job_run(JobName=self.glue_job_name,
                                                   RunId=JobRunId)
                status = response["JobRun"]["JobRunState"]
                # my_print(status)
                time.sleep(0.5)
            except botocore.exceptions.ClientError as ce:

                error_count = error_count + 1
                if (error_count > 3):
                    status = "FAILED"
                    print(str(ce))
                    break  # out of the loop
                if "ExpiredTokenException" in str(ce):
                    self.client = boto3.session.Session(
                        region_name=self.region_name).client('glue')

        if (status == "FAILED" or status == "STOPPED"):
            # print(response)
            pass

        if response is None:
            return {"SUCCESS": False, "STATUS": status}
        else:
            return response

    def glue_etl_execute_csv2parquet(self, target_table_name):
        text = 'Starting glue_etl_execute_csv2parquet process  for %s ....' % (
            self.glue_job_name)
        print(text)
        print('s3_staging_folder_csv=', self.s3_staging_folder_csv)
        print('s3_target_folder_parquet = ', self.s3_target_folder_parquet)
        print('')
        #         self.banner(text)
        response = self.client.start_job_run(
            JobName=self.glue_job_name,
            Arguments={
                '--s3_location_csv_file': self.s3_staging_folder_csv,
                '--s3_location_parquet_file': self.s3_target_folder_parquet,
                '--table_name': target_table_name
            })
        return response
        # if self.wait_for_job_to_complete(response['JobRunId']):
        #     return True
        # else:
        #     return False

    def job_cleanup(self):
        print(self.banner("Job Cleanup"))  # banner() only returns the string, so print it here
        return self.client.delete_job(JobName=self.glue_job_name)

    @staticmethod
    def banner(text, ch='=', length=78):
        spaced_text = ' %s ' % text
        banner = spaced_text.center(length, ch)
        return banner
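
Below is a hedged, end-to-end usage sketch for the AthenaCTAS class above (not part of the original example). The bucket paths, IAM role, Glue script location, SQL query, and table name are illustrative placeholders, and it assumes the AthenaUtil helper used by the class is importable in the same module.

ctas = AthenaCTAS(
    s3_staging_folder_csv='s3://example-bucket/athena-staging-csv/',             # placeholder
    s3_target_folder_parquet='s3://example-bucket/curated/events_parquet/',      # placeholder
    glue_role='ExampleGlueServiceRole',                                          # placeholder IAM role
    glue_script_location='s3://example-bucket/glue-etl/scripts/csv2parquet.py')  # placeholder

# 1. Run the query in Athena and stage the CSV results in S3.
ctas.athena_query_execute_save_s3("SELECT * FROM my_db.my_table")  # placeholder query

# 2. Start the Glue job that converts the staged CSV into Parquet.
run = ctas.glue_etl_execute_csv2parquet(target_table_name='events_parquet')

# 3. Poll until the job run finishes, then delete the temporary Glue job.
ctas.wait_for_job_to_complete(run['JobRunId'])
ctas.job_cleanup()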