def __init__(self, s3_staging_folder_csv, s3_target_folder_parquet, glue_role, glue_script_location):
    """
    Constructor requires S3 staging/target folders for storing results.

    Parameters:
        s3_staging_folder_csv = S3 folder with write permissions for storing intermediate CSV results
        s3_target_folder_parquet = S3 folder where the Glue job writes the Parquet output
    """
    print('s3_staging_folder_csv=', s3_staging_folder_csv)
    print('s3_target_folder_parquet = ', s3_target_folder_parquet)
    self.s3_staging_folder_csv = s3_staging_folder_csv
    self.s3_target_folder_parquet = s3_target_folder_parquet
    # Athena initialization
    self.util = AthenaUtil(s3_staging_folder=self.s3_staging_folder_csv)
    # Glue: create a job whose name is derived from the s3_target_folder_parquet path components
    job_name = '.'.join([
        re.sub('[^0-9a-zA-Z]+', '', x).title()
        for x in s3_target_folder_parquet.replace('s3://', '').split('/')[1:]
    ])
    self.glue_job_name = job_name
    glue_job = self.client.create_job(
        Name=self.glue_job_name,
        Role=glue_role,
        Command={
            'Name': 'glueetl',
            'ScriptLocation': glue_script_location
        })
    # e.g. 's3://move-dataeng-temp-dev/glue-etl/scripts/athena_ctas_part2.py'
    self.glue_script_location = glue_script_location
def drop_table(table_name, env):
    # Drop an Athena table, staging the query results in the environment's temp folder.
    s3_location_target = 's3://move-dataeng-temp-%s/athena_ctas/tmp/' % (env)
    util = AthenaUtil(s3_staging_folder=s3_location_target)
    sql_query = """drop table %s""" % (table_name)
    print(sql_query)
    QueryExecutionId = util.start_query_execution(sql_query=sql_query)
    results = util.get_results(QueryExecutionId)
    print(results)
def athena_query_execute_save_s3(self):
    text = "AthenaUtil Initialization .."
    print(text)
    util = AthenaUtil(s3_staging_folder=self.s3_staging_folder_csv)
    text = "Started athena_query_execute_save_s3 .."
    print(text)
    print('athena_ctas_query= ', self.sql_query)
    # Run the CTAS-style query and save the result set to the CSV staging folder.
    util.execute_save_s3(self.sql_query, self.s3_staging_folder_csv)
    return True
def extract_athena_data(weekly_query):
    # Assumes module-level imports: os, xlrd, xlwt, and copy (from xlutils.copy),
    # plus a module-level weekstartdate used as the sheet name.
    try:
        print("Extract the data from Athena...!!!")
        # conn = connect_athena()
        # curr = conn.cursor()
        if os.path.exists('WeeklyKpiReport.xls'):
            wbk = xlrd.open_workbook('WeeklyKpiReport.xls', formatting_info=True)
            workbook = copy(wbk)
        else:
            workbook = xlwt.Workbook()
        worksheet = workbook.add_sheet(weekstartdate)
        date_format = xlwt.XFStyle()
        date_format.num_format_str = 'yyyy-mm-dd'
        # Write the data headers.
        worksheet.write(0, 0, 'metric_name')
        worksheet.write(0, 1, 'metric_id')
        worksheet.write(0, 2, 'Week Start Date')
        worksheet.write(0, 3, 'Week End Date')
        worksheet.write(0, 4, 'time_dim_key')
        worksheet.write(0, 5, 'metric_source_id')
        worksheet.write(0, 6, 'metric_value')
        worksheet.write(0, 7, 'created_by')
        worksheet.write(0, 8, 'created_date')
        worksheet.write(0, 9, 'updated_by')
        worksheet.write(0, 10, 'updated_date')
        row = 1
        col = 0
        util = AthenaUtil(s3_staging_folder='s3://move-dataeng-temp-prod/athena-results/')
        for k, v in weekly_query.items():
            print(v)
            snap_result = util.execute_query(sql_query=v, use_cache=False)
            queryoutput = snap_result["ResultSet"]["Rows"][0]['Data']
            print(queryoutput[0])
            worksheet.write(row, col, k)
            for i in range(len(queryoutput)):
                # Indices 1, 2, 7 and 9 hold dates (week start/end, created_date, updated_date).
                if i in (1, 2, 7, 9):
                    worksheet.write(row, col + 1 + i, queryoutput[i]['VarCharValue'], date_format)
                else:
                    worksheet.write(row, col + 1 + i, queryoutput[i]['VarCharValue'])
            # Build the downstream INSERT statement and store it alongside the row.
            query = ("""INSERT INTO dm_rdckpi.metric_actual_fact(metric_id, timedimkey, metric_source_id, metric_value, created_by, created_date, updated_by, updated_date) """
                     """VALUES(%s,'%s',%s,%s,'%s','%s','%s','%s');"""
                     % (queryoutput[0]['VarCharValue'], queryoutput[3]['VarCharValue'],
                        queryoutput[4]['VarCharValue'], queryoutput[5]['VarCharValue'],
                        queryoutput[6]['VarCharValue'], queryoutput[7]['VarCharValue'],
                        queryoutput[8]['VarCharValue'], queryoutput[9]['VarCharValue']))
            worksheet.write(row, 11, query)
            row += 1
        workbook.save('WeeklyKpiReport.xls')
    except Exception as err:
        print("Here is the error...('%s')" % (err))
def refresh_partitions(target_table, env='dev'):
    ''' Refresh the partitions of the target table via Athena MSCK REPAIR TABLE '''
    from move_dl_common_api.athena_util import AthenaUtil

    s3_location_target = 's3://move-dataeng-temp-%s/apillai/ctas-test' % (env)
    util = AthenaUtil(s3_staging_folder=s3_location_target)
    athena_ctas_query = '''msck repair table %s;''' % (target_table)
    print('athena_ctas_query= ', athena_ctas_query)
    result = util.execute_query(athena_ctas_query)
    return result
def start_query_execution_and_wait_for_completion(self, sql):
    """
    Starts a query execution and waits for it to complete. Use this for large queries
    where you need a query id and want to download results page by page.

    Parameters:
        sql : query to execute

    Returns:
        query_status_result dictionary with the following structure:
        {"SUCCESS": True | False,
         "STATUS": status,
         "QUERY_TYPE": "first word of the query, e.g. SELECT or INSERT or ALTER",
         "QUERY": "actual query",
         "StateChangeReason": None | "error string, if any"}
    """
    query_status_result = None
    for attempt in range(3):
        try:
            if not hasattr(thread_data, 'util'):
                # One AthenaUtil instance per worker thread (thread_data is a thread-local holder).
                thread_data.util = AthenaUtil(self.s3_staging_folder)
            util = thread_data.util
            result = util.start_query_execution(sql_query=sql)
            query_status_result = util.__wait_for_query_to_complete(result)
            if query_status_result["SUCCESS"]:
                return query_status_result
        except Exception as e:
            print("attempt", attempt, str(e), sql)
            # Back off quadratically before retrying.
            time.sleep((attempt + 1) ** 2)
    return None
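# A minimal usage sketch for the method above (assumptions: it is a method on
# AthenaUtil, its module defines a thread-local holder such as
# thread_data = threading.local(), and the staging folder and queries below are
# placeholders, not values from this repository).
from concurrent.futures import ThreadPoolExecutor

from move_dl_common_api.athena_util import AthenaUtil

util = AthenaUtil(s3_staging_folder='s3://example-temp-bucket/athena-results/')
queries = [
    "SELECT count(*) FROM move_dl.hit_data_raw WHERE year = '2017'",
    "SELECT count(*) FROM move_dl.hit_data_raw WHERE year = '2018'",
]

# Each worker thread lazily builds its own AthenaUtil via thread_data inside the method.
with ThreadPoolExecutor(max_workers=2) as pool:
    for status in pool.map(util.start_query_execution_and_wait_for_completion, queries):
        print(status["STATUS"] if status else "FAILED after 3 attempts")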
def get_pdt_partitions2process_show_partitions(source_table, target_table, env='dev'):
    '''
    Use Athena (or LGK) to derive the to-be-processed partitions.
    Source PDT table is partitioned in year/month/day/hour format (Hive);
    the target table is partitioned in YYYYMMDD format.
    '''
    from move_dl_common_api.athena_util import AthenaUtil

    s3_location_target = 's3://move-dataeng-temp-%s/glue-ctas/athena-results' % (env)
    util = AthenaUtil(s3_staging_folder=s3_location_target)
    df_src = util.get_pandas_frame(
        util.execute_query('''show partitions %s;''' % (source_table)))
    # year=YYYY/month=MM/day=DD/hour=HH -> YYYYMMDD (drop the hour component)
    df_src['date_partition'] = df_src['partition'].apply(
        lambda part_str: ''.join(
            [x.split('=')[1] for x in part_str.split('/')][:-1]))
    # Keep only days for which all 24 hourly partitions have landed.
    s = df_src.groupby('date_partition').size()
    src_list = [y for x, y in zip(s, s.index) if x == 24]
    # If there is no target table yet, return the full source list.
    try:
        df_tgt = util.get_pandas_frame(
            util.execute_query(''' show partitions %s;''' % (target_table)))
        df_tgt['date_partition'] = df_tgt['partition'].apply(
            lambda x: x.split('=')[1].replace('-', ''))
        return sorted(list(set(src_list) - set(df_tgt['date_partition'])), reverse=True)
    except Exception:
        return sorted(list(set(src_list)), reverse=True)
def get_partitions2process(source_table, target_table, env='dev'):
    ''' Use Athena (or LGK) to derive the to-be-processed partitions '''
    from move_dl_common_api.athena_util import AthenaUtil

    s3_location_target = 's3://move-dataeng-temp-%s/apillai/ctas-test' % (env)
    util = AthenaUtil(s3_staging_folder=s3_location_target)
    athena_ctas_query = '''show partitions %s;''' % (source_table)
    print('athena_ctas_query= ', athena_ctas_query)
    result = util.execute_query(athena_ctas_query)
    df_src = util.get_pandas_frame(result)
    # year=YYYY/month=MM/day=DD/hour=HH -> YYYY-MM-DD (drop the hour component)
    df_src['date_partition'] = df_src['partition'].apply(
        lambda part_str: '-'.join(
            [x.split('=')[1] for x in part_str.split('/')][:-1]))
    athena_ctas_query = ''' show partitions %s;''' % (target_table)
    print('athena_ctas_query= ', athena_ctas_query)
    result = util.execute_query(athena_ctas_query)
    df_tgt = util.get_pandas_frame(result)
    df_tgt['date_partition'] = df_tgt['partition'].apply(
        lambda x: x.split('=')[1])
    # Partitions present in the source but missing from the target, newest first.
    return sorted(
        list(set(df_src['date_partition']) - set(df_tgt['date_partition'])),
        reverse=True)
def get_pdt_partitions2process(source_table, target_table, env='dev'):
    '''
    Use Athena (or LGK) to derive the to-be-processed partitions.
    Source PDT table is partitioned in year/month/day/hour format (Hive);
    the target table is partitioned in YYYYMMDD format.
    '''
    from move_dl_common_api.athena_util import AthenaUtil

    s3_location_target = 's3://move-dataeng-temp-%s/glue-ctas/athena-results' % (env)
    util = AthenaUtil(s3_staging_folder=s3_location_target)
    data = {'source_table': source_table, 'target_table': target_table}
    # Incremental query: source days (current and previous month) with all 24 hourly
    # partitions present that are not yet in the target.
    query_str_a = '''
    WITH src AS (
        SELECT concat(year, month, day) AS event_date,
               cardinality(array_agg(DISTINCT hour)) AS hours_list
        FROM {source_table}
        WHERE year = split(cast(current_date AS varchar), '-')[1]
          AND (month = split(cast(current_date AS varchar), '-')[2]
               OR month = split(cast(date_add('day', -30, current_date) AS varchar), '-')[2])
        GROUP BY 1
        HAVING cardinality(array_agg(DISTINCT hour)) = 24
    ),
    tgt AS (
        SELECT DISTINCT event_date FROM {target_table}
    )
    SELECT src.event_date
    FROM src LEFT OUTER JOIN tgt ON (src.event_date = tgt.event_date)
    WHERE tgt.event_date IS NULL
    ORDER BY src.event_date DESC
    '''.format(**data)
    # Fallback query: all complete source days, ignoring the target table.
    query_str_b = '''
    SELECT concat(year, month, day) AS event_date,
           cardinality(array_agg(DISTINCT hour)) AS hours_list
    FROM %s
    WHERE year = split(cast(current_date AS varchar), '-')[1]
      AND (month = split(cast(current_date AS varchar), '-')[2]
           OR month = split(cast(date_add('day', -30, current_date) AS varchar), '-')[2])
    GROUP BY 1
    HAVING cardinality(array_agg(DISTINCT hour)) = 24
    ORDER BY event_date DESC
    ''' % (source_table)
    # If there is no target table yet, fall back to the source-only list.
    try:
        print('query_str_a=', query_str_a)
        df_delta = util.get_pandas_frame(util.execute_query(query_str_a))
        return sorted(list(df_delta['event_date']), reverse=True)
    except Exception:
        print('Inc Query failed! Falling back to query_str_b=', query_str_b)
        df_delta = util.get_pandas_frame(util.execute_query(query_str_b))
        return sorted(list(df_delta['event_date']), reverse=True)
def get_partitions2process(source_table, target_table, env='dev'):
    '''
    Use Athena (or LGK) to derive the to-be-processed partitions.
    Source PDT table is partitioned in year/month/day/hour format (Hive);
    the target table is partitioned in YYYYMMDD format.
    '''
    from move_dl_common_api.athena_util import AthenaUtil

    s3_location_target = 's3://move-dataeng-temp-%s/glue-ctas/athena-results' % (env)
    util = AthenaUtil(s3_staging_folder=s3_location_target)
    data = {'source_table': source_table, 'target_table': target_table}
    # Incremental query: source event_dates that are not yet present in the target.
    query_str_a = '''
    WITH src AS (SELECT DISTINCT event_date FROM {source_table}),
         tgt AS (SELECT DISTINCT event_date FROM {target_table})
    SELECT src.event_date
    FROM src LEFT OUTER JOIN tgt ON (src.event_date = tgt.event_date)
    WHERE tgt.event_date IS NULL
    ORDER BY src.event_date DESC
    '''.format(**data)
    # Fallback query: all source event_dates, ignoring the target table.
    query_str_b = '''
    SELECT DISTINCT event_date FROM %s ORDER BY event_date DESC
    ''' % (source_table)
    # If there is no target table yet, fall back to the source-only list.
    try:
        print('query_str_a=', query_str_a)
        df_delta = util.get_pandas_frame(util.execute_query(query_str_a))
        return sorted(list(df_delta['event_date'][1:]), reverse=True)
    except Exception:
        print('Inc Query failed! Falling back to query_str_b=', query_str_b)
        df_delta = util.get_pandas_frame(util.execute_query(query_str_b))
        return sorted(list(df_delta['event_date'][1:]), reverse=True)
def get_pdt_partitions2process(source_table, target_table):
    '''
    Use Athena (or LGK) to derive the to-be-processed partitions.
    Source PDT table is partitioned in year/month/day/hour format (Hive);
    the target table is partitioned in YYYYMMDD format.
    '''
    from move_dl_common_api.athena_util import AthenaUtil

    s3_location_target = 's3://move-dataeng-temp-dev/apillai/ctas-test'
    util = AthenaUtil(s3_staging_folder=s3_location_target)
    df_src = util.get_pandas_frame(
        util.execute_query('''show partitions %s;''' % (source_table)))
    # year=YYYY/month=MM/day=DD/hour=HH -> YYYYMMDD (drop the hour component)
    df_src['date_partition'] = df_src['partition'].apply(
        lambda part_str: ''.join(
            [x.split('=')[1] for x in part_str.split('/')][:-1]))
    df_tgt = util.get_pandas_frame(
        util.execute_query(''' show partitions %s;''' % (target_table)))
    df_tgt['date_partition'] = df_tgt['partition'].apply(
        lambda x: x.split('=')[1].replace('-', ''))
    # Partitions present in the source but missing from the target, newest first.
    return sorted(
        list(set(df_src['date_partition']) - set(df_tgt['date_partition'])),
        reverse=True)
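# A self-contained sketch of the partition-string parsing used above (assumptions:
# SHOW PARTITIONS returns Hive-style strings like the samples below, and the target
# partition key is named event_date; the sample values are illustrative and no
# Athena call is made here).
src_parts = ['year=2018/month=01/day=05/hour=00', 'year=2018/month=01/day=05/hour=01']
tgt_parts = ['event_date=2018-01-04']

# Source: drop the hour component and concatenate year/month/day -> YYYYMMDD.
src_dates = {''.join(x.split('=')[1] for x in p.split('/')[:-1]) for p in src_parts}
# Target: strip the dashes from the partition value -> YYYYMMDD.
tgt_dates = {p.split('=')[1].replace('-', '') for p in tgt_parts}

print(sorted(src_dates - tgt_dates, reverse=True))  # -> ['20180105']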
def get_record_count(table_name, env):
    s3_location_target = 's3://move-dataeng-temp-%s/athena_ctas/tmp/' % (env)
    util = AthenaUtil(s3_staging_folder=s3_location_target)
    sql_query = """select count(1) as ct from %s""" % (table_name)
    df_delta = util.get_pandas_frame(util.execute_query(sql_query))
    return ''.join(list(df_delta['ct']))
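# A minimal usage sketch (assumptions: get_record_count above is available as a
# module-level helper, and the Parquet table name is a hypothetical placeholder).
src_ct = get_record_count('move_dl.hit_data_raw', env='dev')
tgt_ct = get_record_count('move_dl.hit_data_parquet', env='dev')  # hypothetical target table
print('source=%s target=%s counts_match=%s' % (src_ct, tgt_ct, src_ct == tgt_ct))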
from move_dl_common_api.athena_util import AthenaUtil
import boto3, sys, json, time, uuid, datetime, botocore, re
import pandas as pd


def get_hour_from_filename(in_filename):
    # Return the date plus the 2-digit hour extracted from a file name of the
    # form <prefix>_<date>-<time>.<ext>.
    date, time = in_filename.split('/')[-1].split('.')[0].split('_')[1].split('-')
    return date + time[:2]


env = 'dev'
s3_location_target = 's3://move-dataeng-temp-%s/apillai/audit-rd-pdt' % (env)
util = AthenaUtil(s3_staging_folder=s3_location_target)

from argparse import ArgumentParser
parser = ArgumentParser(
    description="Audit program to be called with mandatory input arguments")
parser.add_argument('--input_date', help='RD input date in yyyy-mm-dd format')
ar = parser.parse_args()
input_date = ar.input_date  # e.g. '2017-12-01'

s3_base_location = 's3://move-dataeng-omniture-prod/homerealtor/raw-data-uncompressed/hit_data'
y, m, d = input_date.split('-')
table_name = 'move_dl.hit_data_raw'
sql_text = """
ALTER TABLE %s ADD IF NOT EXISTS
PARTITION (year='%s', month='%s', day='%s') location '%s/year=%s/month=%s/day=%s';
""" % (table_name, y, m, d, s3_base_location, y, m, d)
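# A follow-on sketch, assuming the generated ALTER TABLE statement is meant to be
# executed with the AthenaUtil instance created above; execute_query and
# get_pandas_frame are the same calls used in the other snippets, and the sanity
# check below is a hypothetical addition.
print('sql_text=', sql_text)
result = util.execute_query(sql_text)

# Hypothetical sanity check: confirm the partition now exists.
df = util.get_pandas_frame(
    util.execute_query("show partitions %s;" % table_name))
print(df[df['partition'] == 'year=%s/month=%s/day=%s' % (y, m, d)])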
class AthenaCTAS(object):
    region_name = 'us-west-2'
    client = boto3.client(service_name='glue',
                          region_name='us-west-2',
                          endpoint_url='https://glue.us-west-2.amazonaws.com')
    s3_staging_folder_csv = None
    s3_target_folder_parquet = None
    glue_script_location = None
    glue_job_name = None
    glue_role = None
    util = None
    glue_job = None

    def __init__(self, s3_staging_folder_csv, s3_target_folder_parquet, glue_role, glue_script_location):
        """
        Constructor requires S3 staging/target folders for storing results.

        Parameters:
            s3_staging_folder_csv = S3 folder with write permissions for storing intermediate CSV results
            s3_target_folder_parquet = S3 folder where the Glue job writes the Parquet output
        """
        print('s3_staging_folder_csv=', s3_staging_folder_csv)
        print('s3_target_folder_parquet = ', s3_target_folder_parquet)
        self.s3_staging_folder_csv = s3_staging_folder_csv
        self.s3_target_folder_parquet = s3_target_folder_parquet
        # Athena initialization
        self.util = AthenaUtil(s3_staging_folder=self.s3_staging_folder_csv)
        # Glue: create a job whose name is derived from the s3_target_folder_parquet path components
        job_name = '.'.join([
            re.sub('[^0-9a-zA-Z]+', '', x).title()
            for x in s3_target_folder_parquet.replace('s3://', '').split('/')[1:]
        ])
        self.glue_job_name = job_name
        glue_job = self.client.create_job(
            Name=self.glue_job_name,
            Role=glue_role,
            Command={
                'Name': 'glueetl',
                'ScriptLocation': glue_script_location
            })
        # e.g. 's3://move-dataeng-temp-dev/glue-etl/scripts/athena_ctas_part2.py'
        self.glue_script_location = glue_script_location

    def athena_query_execute_save_s3(self, sql_query):
        text = "Started athena_query_execute_save_s3 .."
        print(text)
        print('athena_ctas_query= ', sql_query)
        # Run the CTAS-style query and save the result set as CSV in the staging folder.
        self.util.execute_save_s3(sql_query, self.s3_staging_folder_csv)
        return True

    def wait_for_job_to_complete(self, JobRunId):
        """ Polls the Glue job run until it reaches a terminal state. """
        text = 'Waiting for JobName = %s and JobId=%s to Complete processing ...' % (
            self.glue_job_name, JobRunId)
        print(text)
        error_count = 0
        response = self.client.get_job_run(JobName=self.glue_job_name, RunId=JobRunId)
        status = response["JobRun"]["JobRunState"]
        # JobRunState: 'STARTING'|'RUNNING'|'STOPPING'|'STOPPED'|'SUCCEEDED'|'FAILED'
        while status in ("QUEUED", "RUNNING", "STARTING"):
            try:
                response = self.client.get_job_run(JobName=self.glue_job_name, RunId=JobRunId)
                status = response["JobRun"]["JobRunState"]
                time.sleep(0.5)
            except botocore.exceptions.ClientError as ce:
                error_count = error_count + 1
                if error_count > 3:
                    status = "FAILED"
                    print(str(ce))
                    break  # out of the loop
                if "ExpiredTokenException" in str(ce):
                    # Refresh the Glue client when the session token expires.
                    self.client = boto3.session.Session(
                        region_name=self.region_name).client('glue')
        if response is None:
            return {"SUCCESS": False, "STATUS": status}
        else:
            return response

    def glue_etl_execute_csv2parquet(self, target_table_name):
        text = 'Starting glue_etl_execute_csv2parquet process for %s ....' % (
            self.glue_job_name)
        print(text)
        print('s3_staging_folder_csv=', self.s3_staging_folder_csv)
        print('s3_target_folder_parquet = ', self.s3_target_folder_parquet)
        print('')
        # Kick off the Glue job that converts the staged CSV output to Parquet.
        response = self.client.start_job_run(
            JobName=self.glue_job_name,
            Arguments={
                '--s3_location_csv_file': self.s3_staging_folder_csv,
                '--s3_location_parquet_file': self.s3_target_folder_parquet,
                '--table_name': target_table_name
            })
        return response

    def job_cleanup(self):
        print(self.banner("Job Cleanup"))
        return self.client.delete_job(JobName=self.glue_job_name)

    @staticmethod
    def banner(text, ch='=', length=78):
        spaced_text = ' %s ' % text
        return spaced_text.center(length, ch)
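# A minimal end-to-end usage sketch for AthenaCTAS (assumptions: the bucket names,
# IAM role, query, and target table name below are placeholders; the call order
# simply chains the methods defined above).
ctas = AthenaCTAS(
    s3_staging_folder_csv='s3://example-temp-bucket/athena_ctas/csv/',
    s3_target_folder_parquet='s3://example-temp-bucket/glue-etl/my_target_table/',
    glue_role='ExampleGlueServiceRole',
    glue_script_location='s3://example-temp-bucket/glue-etl/scripts/athena_ctas_part2.py')

# 1. Stage the query results as CSV, 2. convert to Parquet via Glue, 3. wait, 4. clean up.
ctas.athena_query_execute_save_s3("SELECT * FROM move_dl.hit_data_raw WHERE year = '2017'")
run = ctas.glue_etl_execute_csv2parquet(target_table_name='my_target_table')
result = ctas.wait_for_job_to_complete(run['JobRunId'])
print(ctas.banner('Glue job finished: %s' % result.get('JobRun', {}).get('JobRunState', 'UNKNOWN')))
ctas.job_cleanup()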
from move_dl_common_api.athena_util import AthenaUtil
import pandas as pd

aws_region_name = 'us-west-2'
temp_location = 's3://move-dataeng-temp-dev/sql_refractor/'

# Input_SQL_Redshift.txt should hold the queries to be executed on Athena, separated by ";".
with open('Input_SQL_Redshift.txt', 'r') as f:
    s = f.read()
d = s.split(';')

athena_df = pd.DataFrame()
util = AthenaUtil(s3_staging_folder=temp_location)
for query in d:
    try:
        result = util.execute_query(sql_query=query)
        temp = util.get_pandas_frame(result)
        print(temp)
        athena_df = pd.concat([athena_df, temp], ignore_index=True)
    except Exception as e:
        print("Exception")
        print("Not Executed --------- ", query)
        print(e)

# athena_df.columns = ['metric_id', 'start_date', 'end_date', 'fy_monthdimkey', 'metric_value']
athena_df.to_csv('ATHENA_SQL_OUTPUT.csv', index=False)