def get_pdt_partitions2process_show_partitions(source_table, target_table, env='dev'):
    '''
    Use Athena or LGK to derive to-be-processed partitions.
    Source PDT table partition in year/month/day/hour format (Hive)
    Target table partition in YYYYMMDD format
    '''
    from move_dl_common_api.athena_util import AthenaUtil
    s3_location_target = 's3://move-dataeng-temp-%s/glue-ctas/athena-results' % (env)
    util = AthenaUtil(s3_staging_folder=s3_location_target)
    df_src = util.get_pandas_frame(
        util.execute_query('''show partitions %s;''' % (source_table)))
    # Collapse year=/month=/day=/hour= partition strings to YYYYMMDD (hour dropped)
    df_src['date_partition'] = df_src['partition'].apply(
        lambda part_str: ''.join(
            [x.split('=')[1] for x in part_str.split('/')][:-1]))
    # Keep only dates for which all 24 hourly source partitions are present
    s = df_src.groupby('date_partition').size()
    src_list = [y for x, y in zip(s, s.index) if x == 24]
    # ----
    # If the target has no partitions (or does not exist), return the full source list
    try:
        df_tgt = util.get_pandas_frame(
            util.execute_query('''show partitions %s;''' % (target_table)))
        df_tgt['date_partition'] = df_tgt['partition'].apply(
            lambda x: x.split('=')[1].replace('-', ''))
        return sorted(list(set(src_list) - set(df_tgt['date_partition'])),
                      reverse=True)
    except Exception:
        return sorted(list(set(src_list)), reverse=True)
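# Usage sketch (table names below are hypothetical): the returned list holds only
# YYYYMMDD dates that have all 24 hourly source partitions and are not yet in the target.
pending_dates = get_pdt_partitions2process_show_partitions(
    'move_dl.hit_data_pdt',     # hypothetical source PDT table
    'move_dl.hit_data_daily',   # hypothetical daily target table
    env='dev')
print 'partitions to process:', pending_dates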
def get_partitions2process(source_table, target_table, env='dev'):
    '''
    Use Athena or LGK to derive to-be-processed partitions.
    '''
    from move_dl_common_api.athena_util import AthenaUtil
    s3_location_target = 's3://move-dataeng-temp-%s/apillai/ctas-test' % (env)
    util = AthenaUtil(s3_staging_folder=s3_location_target)
    athena_ctas_query = '''show partitions %s;''' % (source_table)
    print 'athena_ctas_query= ', athena_ctas_query
    result = util.execute_query(athena_ctas_query)
    df_src = util.get_pandas_frame(result)
    df_src['date_partition'] = df_src['partition'].apply(
        lambda part_str: '-'.join(
            [x.split('=')[1] for x in part_str.split('/')][:-1]))
    # ----
    athena_ctas_query = '''show partitions %s;''' % (target_table)
    print 'athena_ctas_query= ', athena_ctas_query
    result = util.execute_query(athena_ctas_query)
    df_tgt = util.get_pandas_frame(result)
    df_tgt['date_partition'] = df_tgt['partition'].apply(
        lambda x: x.split('=')[1])
    return sorted(
        list(set(df_src['date_partition']) - set(df_tgt['date_partition'])),
        reverse=True)
def get_pdt_partitions2process(source_table, target_table, env='dev'):
    '''
    Use Athena or LGK to derive to-be-processed partitions.
    Source PDT table partition in year/month/day/hour format (Hive)
    Target table partition in YYYYMMDD format
    '''
    from move_dl_common_api.athena_util import AthenaUtil
    s3_location_target = 's3://move-dataeng-temp-%s/glue-ctas/athena-results' % (env)
    util = AthenaUtil(s3_staging_folder=s3_location_target)
    data = {'source_table': source_table, 'target_table': target_table}
    # Incremental query: source dates (current/previous month) with all 24 hourly
    # partitions that are not yet present in the target
    query_str_a = '''
        WITH src as (
            select concat(year, month, day) as event_date,
                   cardinality(array_agg(distinct hour)) as hours_list
            from {source_table}
            where year = split(cast(current_date as varchar), '-')[1]
              and ( month = split(cast(current_date as varchar), '-')[2]
                 OR month = split(cast(date_add('day', -30, current_date) as varchar), '-')[2] )
            group by 1
            having cardinality(array_agg(distinct hour)) = 24
        ),
        tgt as (
            select distinct event_date from {target_table}
        )
        select src.event_date
        from src left outer join tgt on (src.event_date = tgt.event_date)
        where tgt.event_date IS NULL
        order by src.event_date desc
        '''.format(**data)
    # Fallback query: all complete source dates, ignoring the target
    query_str_b = '''
        select concat(year, month, day) as event_date,
               cardinality(array_agg(distinct hour)) as hours_list
        from %s
        where year = split(cast(current_date as varchar), '-')[1]
          and ( month = split(cast(current_date as varchar), '-')[2]
             OR month = split(cast(date_add('day', -30, current_date) as varchar), '-')[2] )
        group by 1
        having cardinality(array_agg(distinct hour)) = 24
        order by event_date desc
        ''' % (source_table)
    # If the incremental query fails (e.g. no target table yet), fall back to the source-only list
    try:
        print 'query_str_a=', query_str_a
        df_delta = util.get_pandas_frame(util.execute_query(query_str_a))
        return sorted(list(df_delta['event_date']), reverse=True)
    except Exception:
        print 'Inc Query failed! Falling back to query_str_b=', query_str_b
        df_delta = util.get_pandas_frame(util.execute_query(query_str_b))
        return sorted(list(df_delta['event_date']), reverse=True)
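# Usage sketch (hypothetical table names): query_str_a is tried first and the function
# falls back to query_str_b when the target table is missing; dates come back newest first.
pdt_dates = get_pdt_partitions2process(
    'move_dl.pdt_events_hourly',   # hypothetical hourly source table
    'move_dl.pdt_events_daily',    # hypothetical daily target table
    env='dev')
for event_date in pdt_dates:
    print 'pending event_date:', event_date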
def extract_athena_data(weekly_query):
    try:
        print ("Extract the data from Athena...!!!")
        #conn = connect_athena()
        #curr = conn.cursor()
        # Append a new sheet to the existing report, or start a fresh workbook
        if os.path.exists('WeeklyKpiReport.xls'):
            wbk = xlrd.open_workbook('WeeklyKpiReport.xls', formatting_info=True)
            workbook = copy(wbk)
        else:
            workbook = xlwt.Workbook()
        worksheet = workbook.add_sheet(weekstartdate)
        '''bold = workbook.add_format({'bold': True})
        date_format = workbook.add_format({'num_format': 'yyyy-mm-dd'})
        text_format = workbook.add_format()
        # Add a number format for cells with money.
        money = workbook.add_format({'num_format': '$#,##0'})'''
        date_format = xlwt.XFStyle()
        date_format.num_format_str = 'yyyy-mm-dd'
        # Write the data headers.
        worksheet.write(0, 0, 'metric_name')
        worksheet.write(0, 1, 'metric_id')
        worksheet.write(0, 2, 'Week Start Date')
        worksheet.write(0, 3, 'Week End Date')
        worksheet.write(0, 4, 'time_dim_key')
        worksheet.write(0, 5, 'metric_source_id')
        worksheet.write(0, 6, 'metric_value')
        worksheet.write(0, 7, 'created_by')
        worksheet.write(0, 8, 'created_date')
        worksheet.write(0, 9, 'updated_by')
        worksheet.write(0, 10, 'updated_date')
        row = 1
        col = 0
        util = AthenaUtil(s3_staging_folder='s3://move-dataeng-temp-prod/athena-results/')
        for k, v in weekly_query.items():
            print (v)
            #curr.execute(v)
            #queryoutput = curr.fetchone()
            snap_result = util.execute_query(sql_query=v, use_cache=False)
            queryoutput = snap_result["ResultSet"]["Rows"][0]['Data']
            print (queryoutput[0])
            worksheet.write(row, col, k)
            # Result columns 1, 2, 7 and 9 hold date values (week start/end, created/updated dates)
            for i in range(len(queryoutput)):
                if i == 1 or i == 2 or i == 7 or i == 9:
                    worksheet.write(row, col + 1 + i, queryoutput[i]['VarCharValue'], date_format)
                else:
                    worksheet.write(row, col + 1 + i, queryoutput[i]['VarCharValue'])
            # Build the INSERT statement for the KPI fact table and store it next to the row
            query = ("""INSERT INTO dm_rdckpi.metric_actual_fact(metric_id, timedimkey, metric_source_id, metric_value, created_by, created_date, updated_by, updated_date)
                        VALUES(%s,'%s',%s,%s,'%s','%s','%s','%s');"""
                     % (queryoutput[0]['VarCharValue'], queryoutput[3]['VarCharValue'],
                        queryoutput[4]['VarCharValue'], queryoutput[5]['VarCharValue'],
                        queryoutput[6]['VarCharValue'], queryoutput[7]['VarCharValue'],
                        queryoutput[8]['VarCharValue'], queryoutput[9]['VarCharValue']))
            worksheet.write(row, 11, query)
            row += 1
        workbook.save('WeeklyKpiReport.xls')
    except Exception as err:
        print ("Here is the error...('%s')" % (err))
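# Usage sketch: extract_athena_data expects a dict of metric name -> Athena SQL, where each
# query returns one row whose columns line up with the sheet headers written above.
# The metric names and query strings below are hypothetical placeholders.
weekly_query = {
    'weekly_unique_users': "SELECT ... FROM dm_rdckpi.some_metric_table",  # hypothetical query
    'weekly_leads':        "SELECT ... FROM dm_rdckpi.another_table",      # hypothetical query
}
extract_athena_data(weekly_query)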
def get_partitions2process(source_table, target_table, env='dev'):
    '''
    Use Athena or LGK to derive to-be-processed partitions.
    Source PDT table partition in year/month/day/hour format (Hive)
    Target table partition in YYYYMMDD format
    '''
    from move_dl_common_api.athena_util import AthenaUtil
    s3_location_target = 's3://move-dataeng-temp-%s/glue-ctas/athena-results' % (env)
    util = AthenaUtil(s3_staging_folder=s3_location_target)
    data = {'source_table': source_table, 'target_table': target_table}
    # Incremental query: source event_dates not yet present in the target
    query_str_a = '''
        WITH src as (SELECT DISTINCT event_date FROM {source_table}),
             tgt as (SELECT DISTINCT event_date FROM {target_table})
        select src.event_date
        from src left outer join tgt ON (src.event_date = tgt.event_date)
        WHERE tgt.event_date IS NULL
        ORDER BY src.event_date DESC
        '''.format(**data)
    # Fallback query: all source event_dates, ignoring the target
    query_str_b = '''
        select distinct event_date from %s order by event_date desc
        ''' % (source_table)
    # If the incremental query fails (e.g. no target table yet), fall back to the source-only list.
    # Note: the first row (newest event_date, per the DESC ordering) is skipped via the [1:] slice.
    try:
        print 'query_str_a=', query_str_a
        df_delta = util.get_pandas_frame(util.execute_query(query_str_a))
        return sorted(list(df_delta['event_date'][1:]), reverse=True)
    except Exception:
        print 'Inc Query failed! Falling back to query_str_b=', query_str_b
        df_delta = util.get_pandas_frame(util.execute_query(query_str_b))
        return sorted(list(df_delta['event_date'][1:]), reverse=True)
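# Usage sketch (hypothetical names): a typical driver iterates over the delta list and
# kicks off one load per event_date; process_event_date is an assumed per-partition step.
for event_date in get_partitions2process('move_dl.src_events', 'move_dl.tgt_events', env='dev'):
    print 'processing event_date:', event_date
    # process_event_date(event_date)   # hypothetical per-partition load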
def refresh_partitions(target_table, env='dev'):
    '''
    Use Athena MSCK REPAIR to register any new partitions of the target table.
    '''
    from move_dl_common_api.athena_util import AthenaUtil
    s3_location_target = 's3://move-dataeng-temp-%s/apillai/ctas-test' % (env)
    util = AthenaUtil(s3_staging_folder=s3_location_target)
    athena_ctas_query = '''msck repair table %s;''' % (target_table)
    print 'athena_ctas_query= ', athena_ctas_query
    result = util.execute_query(athena_ctas_query)
    return result
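# Usage sketch: after new data lands under the target table's S3 prefix, MSCK REPAIR
# picks up the new Hive-style partitions (the table name below is hypothetical).
refresh_partitions('move_dl.tgt_events', env='dev')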
def get_pdt_partitions2process(source_table, target_table):
    '''
    Use Athena or LGK to derive to-be-processed partitions.
    Source PDT table partition in year/month/day/hour format (Hive)
    Target table partition in YYYYMMDD format
    '''
    from move_dl_common_api.athena_util import AthenaUtil
    s3_location_target = 's3://move-dataeng-temp-dev/apillai/ctas-test'
    util = AthenaUtil(s3_staging_folder=s3_location_target)
    df_src = util.get_pandas_frame(
        util.execute_query('''show partitions %s;''' % (source_table)))
    df_src['date_partition'] = df_src['partition'].apply(
        lambda part_str: ''.join(
            [x.split('=')[1] for x in part_str.split('/')][:-1]))
    # ----
    df_tgt = util.get_pandas_frame(
        util.execute_query('''show partitions %s;''' % (target_table)))
    df_tgt['date_partition'] = df_tgt['partition'].apply(
        lambda x: x.split('=')[1].replace('-', ''))
    return sorted(
        list(set(df_src['date_partition']) - set(df_tgt['date_partition'])),
        reverse=True)
def get_record_count(table_name, env):
    '''
    Return the row count of table_name (as a string) via an Athena count query.
    '''
    from move_dl_common_api.athena_util import AthenaUtil
    s3_location_target = 's3://move-dataeng-temp-%s/athena_ctas/tmp/' % (env)
    util = AthenaUtil(s3_staging_folder=s3_location_target)
    sql_query = """select count(1) as ct from %s""" % (table_name)
    df_delta = util.get_pandas_frame(util.execute_query(sql_query))
    return ''.join(list(df_delta['ct']))
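# Usage sketch (hypothetical table names): a simple source/target row-count audit.
# get_record_count returns the count as a string, so the values are compared as strings.
src_ct = get_record_count('move_dl.src_events', 'dev')
tgt_ct = get_record_count('move_dl.tgt_events', 'dev')
if src_ct != tgt_ct:
    print 'record count mismatch: source=%s target=%s' % (src_ct, tgt_ct)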
##--
from argparse import ArgumentParser

parser = ArgumentParser(
    description="Audit program to be called with mandatory Input Arguments")
parser.add_argument('--input_date', help='RD Input date in yyyy-mm-dd format')
#input_date = '2017-12-01'
ar = parser.parse_args()
input_date = ar.input_date

# util is assumed to be an AthenaUtil instance created earlier in this script
s3_base_location = 's3://move-dataeng-omniture-prod/homerealtor/raw-data-uncompressed/hit_data'
y, m, d = input_date.split('-')
table_name = 'move_dl.hit_data_raw'
# Register the day's partition for the raw hit_data table
sql_text = """ ALTER TABLE %s ADD IF NOT EXISTS PARTITION (year='%s',month='%s',day='%s') location '%s/year=%s/month=%s/day=%s'; """ % (
    table_name, y, m, d, s3_base_location, y, m, d)
print sql_text
util.execute_query(sql_text)

s3_base_location = 's3://move-dataeng-omniture-prod/homerealtor/raw-data-uncompressed/column_headers'
table_name = 'move_dl.omtr_column_headers_raw'
# Register the day's partition for the raw column_headers table
sql_text = """ ALTER TABLE %s ADD IF NOT EXISTS PARTITION (year='%s',month='%s',day='%s') location '%s/year=%s/month=%s/day=%s'; """ % (
    table_name, y, m, d, s3_base_location, y, m, d)
print sql_text
util.execute_query(sql_text)

data = {'year': y, 'month': m, 'day': d}
rd_audit_query = """ WITH dataset as( SELECT d.source_filename, map( m.s3_data , d.s3_data ) AS query_map FROM
aws_region_name = 'us-west-2'
#s3_bucket = 'aws-athena-query-results-057425096214-us-west-2'
#s3_key = 'Unsaved/Abtest_data'
temp_location = 's3://move-dataeng-temp-dev/sql_refractor/'
result = pd.DataFrame()

# Input_SQL_Redshift.txt should hold the queries to be executed on Athena, separated by ";"
with open('Input_SQL_Redshift.txt', 'r') as f:
    s = f.read()
d = s.split(';')

athena_df = pd.DataFrame()
util = AthenaUtil(s3_staging_folder=temp_location)
for _ in d:
    try:
        result = util.execute_query(sql_query=_)
        temp = util.get_pandas_frame(result)
        print(temp)
        athena_df = pd.concat([athena_df, temp], ignore_index=True)
    except Exception as e:
        print("Exception")
        print("Not Executed --------- ", _)
        print(e)

#athena_df.columns = ['metric_id', 'start_date', 'end_date', 'fy_monthdimkey', 'metric_value']
athena_df.to_csv('ATHENA_SQL_OUTPUT.csv', index=False)
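# Example Input_SQL_Redshift.txt content (hypothetical queries): statements are separated
# by ';' and are run against Athena in order; failures are logged and skipped.
#
#   select count(1) as ct from move_dl.hit_data_raw;
#   select count(1) as ct from move_dl.omtr_column_headers_raw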