def get_pdt_partitions2process_show_partitions(source_table, target_table, env='dev'):
    '''Derive the to-be-processed partitions by diffing SHOW PARTITIONS output.

    The source PDT table is partitioned year/month/day/hour (Hive style);
    the target table is partitioned by a single YYYYMMDD value.

    :param source_table: fully-qualified Athena source table name
    :param target_table: fully-qualified Athena target table name
    :param env: environment suffix for the staging bucket (default 'dev')
    :return: list of YYYYMMDD strings, newest first.  Only source dates with
             all 24 hourly partitions present are considered complete.  If the
             target table cannot be queried (e.g. it does not exist yet),
             every complete source date is returned.
    '''
    from move_dl_common_api.athena_util import AthenaUtil

    s3_location_target = 's3://move-dataeng-temp-%s/glue-ctas/athena-results' % (
        env)
    util = AthenaUtil(s3_staging_folder=s3_location_target)

    df_src = util.get_pandas_frame(
        util.execute_query('''show partitions %s;''' % (source_table)))
    # 'partition' looks like year=YYYY/month=MM/day=DD/hour=HH; drop the hour
    # component and join the remaining values into YYYYMMDD.
    df_src['date_partition'] = df_src['partition'].apply(
        lambda part_str: ''.join(
            [x.split('=')[1] for x in part_str.split('/')][:-1]))
    # Keep only dates for which all 24 hourly partitions have landed.
    s = df_src.groupby('date_partition').size()
    src_list = [y for x, y in zip(s, s.index) if x == 24]

    try:
        df_tgt = util.get_pandas_frame(
            util.execute_query(''' show partitions %s;''' % (target_table)))
        # Target partition looks like key=YYYY-MM-DD; normalize to YYYYMMDD.
        df_tgt['date_partition'] = df_tgt['partition'].apply(
            lambda x: x.split('=')[1].replace('-', ''))
        return sorted(list(set(src_list) - set(df_tgt['date_partition'])),
                      reverse=True)
    except Exception:
        # Target table missing or unreadable: process every complete source date.
        return sorted(list(set(src_list)), reverse=True)
def get_partitions2process(source_table, target_table, env='dev'): ''' Use Athena or LGK to derive to-be processed partitions ''' from move_dl_common_api.athena_util import AthenaUtil import json import sys s3_location_target = 's3://move-dataeng-temp-%s/apillai/ctas-test' % (env) util = AthenaUtil(s3_staging_folder=s3_location_target) athena_ctas_query = '''show partitions %s;''' % (source_table) print 'athena_ctas_query= ', athena_ctas_query result = util.execute_query(athena_ctas_query) df_src = util.get_pandas_frame(result) df_src['date_partition'] = df_src['partition'].apply( lambda part_str: '-'.join( [x.split('=')[1] for x in part_str.split('/')][:-1])) #---- athena_ctas_query = ''' show partitions %s;''' % (target_table) print 'athena_ctas_query= ', athena_ctas_query result = util.execute_query(athena_ctas_query) df_tgt = util.get_pandas_frame(result) df_tgt['date_partition'] = df_tgt['partition'].apply( lambda x: x.split('=')[1]) return sorted( list(set(df_src['date_partition']) - set(df_tgt['date_partition'])), reverse=True)
def get_pdt_partitions2process(source_table, target_table, env='dev'): ''' Use Athena or LGK to derive to-be processed partitions Source PDT table partition in year/month/day/hour format(Hive) Target table partition in YYYYMMDD format ''' from move_dl_common_api.athena_util import AthenaUtil import json import sys s3_location_target = 's3://move-dataeng-temp-%s/glue-ctas/athena-results' % ( env) util = AthenaUtil(s3_staging_folder=s3_location_target) data = {'source_table': source_table, 'target_table': target_table} query_str_a = ''' WITH src as(select concat(year, month, day) as event_date, cardinality(array_agg(distinct hour ) )as hours_list from {source_table} where year = split(cast(current_date as varchar), '-')[1] and ( month = split(cast(current_date as varchar), '-')[2] OR month = split(cast(date_add('day', -30, current_date) as varchar), '-')[2] ) group by 1 having cardinality(array_agg(distinct hour ) ) = 24 ), tgt as ( select distinct event_date from {target_table} ) select src.event_date from src left outer join tgt on (src.event_date = tgt.event_date ) where tgt.event_date IS NULL order by src.event_date desc '''.format(**data) query_str_b = ''' select concat(year, month, day) as event_date, cardinality(array_agg(distinct hour ) )as hours_list from %s where year = split(cast(current_date as varchar), '-')[1] and ( month = split(cast(current_date as varchar), '-')[2] OR month = split(cast(date_add('day', -30, current_date) as varchar), '-')[2] ) group by 1 having cardinality(array_agg(distinct hour ) ) = 24 order by event_date desc ''' % (source_table) # If no target, return the source list try: print 'query_str_a=', query_str_a df_delta = util.get_pandas_frame(util.execute_query(query_str_a)) return sorted(list(df_delta['event_date']), reverse=True) except: print 'Inc Query failed! Falling back to query_str_b=', query_str_b df_delta = util.get_pandas_frame(util.execute_query(query_str_b)) return sorted(list(df_delta['event_date']), reverse=True)
def get_partitions2process(source_table, target_table, env='dev' ): ''' Use Athena or LGK to derive to-be processed partitions Source PDT table partition in year/month/day/hour format(Hive) Target table partition in YYYYMMDD format ''' from move_dl_common_api.athena_util import AthenaUtil import json import sys s3_location_target = 's3://move-dataeng-temp-%s/glue-ctas/athena-results' %(env) util = AthenaUtil(s3_staging_folder = s3_location_target) data = { 'source_table': source_table,'target_table':target_table} query_str_a = ''' WITH src as(SELECT DISTINCT event_date FROM {source_table} ), tgt as( SELECT DISTINCT event_date FROM {target_table} ) select src.event_date from src left outer join tgt ON (src.event_date = tgt.event_date ) WHERE tgt.event_date IS NULL ORDER BY src.event_date DESC '''.format(**data) query_str_b = ''' select distinct event_date from %s order by event_date desc ''' %(source_table) # If no target, return the source list try: print 'query_str_a=', query_str_a df_delta = util.get_pandas_frame(util.execute_query(query_str_a) ) return sorted(list(df_delta['event_date'][1:] ), reverse=True) except: print 'Inc Query failed! Falling back to query_str_b=', query_str_b df_delta = util.get_pandas_frame(util.execute_query(query_str_b) ) return sorted(list(df_delta['event_date'][1:] ), reverse=True)
def get_pdt_partitions2process(source_table, target_table, env='dev'):
    '''Derive to-be-processed partitions by diffing SHOW PARTITIONS output.

    The source PDT table is partitioned year/month/day/hour (Hive style);
    the target table is partitioned by a single YYYYMMDD value.

    :param source_table: fully-qualified Athena source table name
    :param target_table: fully-qualified Athena target table name
    :param env: environment suffix for the staging bucket (default 'dev',
                which preserves the previously hard-coded dev bucket)
    :return: YYYYMMDD strings present in the source but not the target,
             newest first.

    NOTE(review): this shadows the earlier ``get_pdt_partitions2process`` in
    this module -- confirm which definition is intended to be live.
    '''
    from move_dl_common_api.athena_util import AthenaUtil

    s3_location_target = 's3://move-dataeng-temp-%s/apillai/ctas-test' % (env)
    util = AthenaUtil(s3_staging_folder=s3_location_target)

    df_src = util.get_pandas_frame(
        util.execute_query('''show partitions %s;''' % (source_table)))
    # year=YYYY/month=MM/day=DD/hour=HH -> YYYYMMDD (hour dropped).
    df_src['date_partition'] = df_src['partition'].apply(
        lambda part_str: ''.join(
            [x.split('=')[1] for x in part_str.split('/')][:-1]))

    df_tgt = util.get_pandas_frame(
        util.execute_query(''' show partitions %s;''' % (target_table)))
    # key=YYYY-MM-DD -> YYYYMMDD.
    df_tgt['date_partition'] = df_tgt['partition'].apply(
        lambda x: x.split('=')[1].replace('-', ''))

    return sorted(
        list(set(df_src['date_partition']) - set(df_tgt['date_partition'])),
        reverse=True)
def get_record_count(table_name, env):
    '''Run ``select count(1)`` against *table_name* via Athena.

    :param table_name: fully-qualified Athena table name
    :param env: environment suffix for the staging bucket
    :return: the count as a string (the 'ct' column values joined together)
    '''
    staging = 's3://move-dataeng-temp-%s/athena_ctas/tmp/' % (env)
    athena = AthenaUtil(s3_staging_folder=staging)
    count_query = """select count(1) as ct from %s""" % (table_name)
    frame = athena.get_pandas_frame(athena.execute_query(count_query))
    return ''.join(frame['ct'])
ON ( d.year = m.year AND d.month=m.month AND d.day=m.day AND d.col_num = m.col_num ) WHERE m.year = '{year}' AND m.month = '{month}' AND m.day = '{day}' ) select source_filename, count(1) as rd_ct from dataset group by 1 order by 1 """.format(**data) print rd_audit_query df_rd = util.get_pandas_frame(util.execute_query(rd_audit_query)) data = {'input_date': input_date} pdt_audit_query = """ select etl_source_filename, count(1) as pdt_ct from cnpd_omtr_pdt.hit_data_forqa where cast (concat(year, '-', month, '-', day) as date) between date_add('day', -2, cast( '{input_date}' as date)) and date_add('day', 2, cast( '{input_date}' as date)) group by 1 order by 1 """.format(**data) print pdt_audit_query df_pdt = util.get_pandas_frame(util.execute_query(pdt_audit_query)) df_rd_clean = df_rd[1:]
# Batch-run the ';'-separated SQL statements in Input_SQL_Redshift.txt against
# Athena, concatenate all result frames, and dump them to a single CSV.
aws_region_name = 'us-west-2'
#s3_bucket = 'aws-athena-query-results-057425096214-us-west-2'
#s3_key = 'Unsaved/Abtest_data'
temp_location = 's3://move-dataeng-temp-dev/sql_refractor/'

# Input_SQL_Redshift.txt holds the queries to be executed on Athena,
# separated by ";".
with open('Input_SQL_Redshift.txt', 'r') as f:
    queries = f.read().split(';')

athena_df = pd.DataFrame()
util = AthenaUtil(s3_staging_folder=temp_location)
for sql_text in queries:
    # Skip the empty fragments produced by a trailing/duplicated ';' so they
    # don't needlessly go through the failure path below.
    if not sql_text.strip():
        continue
    try:
        result = util.execute_query(sql_query=sql_text)
        temp = util.get_pandas_frame(result)
        print(temp)
        athena_df = pd.concat([athena_df, temp], ignore_index=True)
    except Exception as e:
        # Best-effort batch: report the failed statement and keep going.
        print("Exception")
        print("Not Executed --------- ", sql_text)
        print(e)
#athena_df.columns = ['metric_id', 'start_date', 'end_date', 'fy_monthdimkey', 'metric_value']
athena_df.to_csv('ATHENA_SQL_OUTPUT.csv', index=False)