示例#1
0
def get_pdt_partitions2process_show_partitions(source_table,
                                               target_table,
                                               env='dev'):
    '''
    Use Athena SHOW PARTITIONS to derive the to-be-processed partitions.

    Source PDT table partition is in year/month/day/hour format (Hive);
    target table partition is a single YYYYMMDD value.

    :param source_table: fully qualified Athena source table name
    :param target_table: fully qualified Athena target table name
    :param env: environment suffix used to build the S3 staging bucket
    :return: YYYYMMDD strings complete in the source (all 24 hourly
             partitions present) but missing from the target, newest first
    '''
    from move_dl_common_api.athena_util import AthenaUtil
    s3_location_target = 's3://move-dataeng-temp-%s/glue-ctas/athena-results' % (
        env)
    util = AthenaUtil(s3_staging_folder=s3_location_target)
    df_src = util.get_pandas_frame(
        util.execute_query('''show partitions %s;''' % (source_table)))
    # 'partition' looks like year=YYYY/month=MM/day=DD/hour=HH: drop the
    # trailing hour component and join the values into YYYYMMDD.
    df_src['date_partition'] = df_src['partition'].apply(
        lambda part_str: ''.join(
            [x.split('=')[1] for x in part_str.split('/')][:-1]))
    # Keep only days for which all 24 hourly partitions have landed.
    s = df_src.groupby('date_partition').size()
    src_list = [day for cnt, day in zip(s, s.index) if cnt == 24]
    #----
    # If the target table is missing/empty (e.g. first run), fall back to
    # returning the full source list.
    try:
        df_tgt = util.get_pandas_frame(
            util.execute_query(''' show partitions %s;''' % (target_table)))
        df_tgt['date_partition'] = df_tgt['partition'].apply(
            lambda x: x.split('=')[1].replace('-', ''))
        return sorted(list(set(src_list) - set(df_tgt['date_partition'])),
                      reverse=True)
    except Exception:
        # Was a bare except: narrowed so SystemExit/KeyboardInterrupt propagate.
        return sorted(list(set(src_list)), reverse=True)
示例#2
0
def get_partitions2process(source_table, target_table, env='dev'):
    '''
    Use Athena or LGK to derive to-be processed partitions
    '''
    from move_dl_common_api.athena_util import AthenaUtil
    import json
    import sys
    s3_location_target = 's3://move-dataeng-temp-%s/apillai/ctas-test' % (env)
    util = AthenaUtil(s3_staging_folder=s3_location_target)
    athena_ctas_query = '''show partitions %s;''' % (source_table)
    print 'athena_ctas_query= ', athena_ctas_query
    result = util.execute_query(athena_ctas_query)
    df_src = util.get_pandas_frame(result)
    df_src['date_partition'] = df_src['partition'].apply(
        lambda part_str: '-'.join(
            [x.split('=')[1] for x in part_str.split('/')][:-1]))
    #----
    athena_ctas_query = ''' show partitions %s;''' % (target_table)
    print 'athena_ctas_query= ', athena_ctas_query
    result = util.execute_query(athena_ctas_query)
    df_tgt = util.get_pandas_frame(result)
    df_tgt['date_partition'] = df_tgt['partition'].apply(
        lambda x: x.split('=')[1])
    return sorted(
        list(set(df_src['date_partition']) - set(df_tgt['date_partition'])),
        reverse=True)
def get_pdt_partitions2process(source_table, target_table, env='dev'):
    '''
    Use Athena or LGK to derive to-be processed partitions
    Source PDT table partition in year/month/day/hour format(Hive)
    Target table partition in YYYYMMDD format
    '''
    from move_dl_common_api.athena_util import AthenaUtil
    import json
    import sys
    s3_location_target = 's3://move-dataeng-temp-%s/glue-ctas/athena-results' % (
        env)
    util = AthenaUtil(s3_staging_folder=s3_location_target)
    data = {'source_table': source_table, 'target_table': target_table}
    query_str_a = '''
       WITH src as(select concat(year, month, day) as event_date, cardinality(array_agg(distinct  hour  ) )as hours_list 
                    from {source_table}  
                    where  year  =  split(cast(current_date as varchar), '-')[1]
                    and ( month  =  split(cast(current_date as varchar), '-')[2]  
                       OR month  =  split(cast(date_add('day', -30, current_date) as varchar), '-')[2] 
                        )
                    group by 1 
                    having cardinality(array_agg(distinct  hour  ) ) = 24 ),
            tgt as ( select distinct event_date
                       from {target_table} )
       select src.event_date 
       from src left outer join tgt on (src.event_date = tgt.event_date )
       where tgt.event_date IS NULL
       order by src.event_date desc '''.format(**data)

    query_str_b = '''
       select concat(year, month, day) as event_date, cardinality(array_agg(distinct  hour  ) )as hours_list 
                    from %s
                    where  year  =  split(cast(current_date as varchar), '-')[1]
                    and ( month  =  split(cast(current_date as varchar), '-')[2]  
                       OR month  =  split(cast(date_add('day', -30, current_date) as varchar), '-')[2] 
                        )
                    group by 1 
                    having cardinality(array_agg(distinct  hour  ) ) = 24  
                    order by event_date desc ''' % (source_table)

    # If no target, return the source list
    try:
        print 'query_str_a=', query_str_a
        df_delta = util.get_pandas_frame(util.execute_query(query_str_a))
        return sorted(list(df_delta['event_date']), reverse=True)
    except:
        print 'Inc Query failed! Falling back to query_str_b=', query_str_b
        df_delta = util.get_pandas_frame(util.execute_query(query_str_b))
        return sorted(list(df_delta['event_date']), reverse=True)
示例#4
0
def extract_athena_data(weekly_query):
    '''
    Run each weekly KPI query on Athena and append the results to
    WeeklyKpiReport.xls, one row per metric.

    :param weekly_query: dict mapping metric name -> SQL query string;
        each query is expected to return a single row whose columns match
        the headers written below (metric_id, dates, metric_value, ...).

    NOTE(review): relies on module-level names not visible here:
    `os`, `xlrd`, `xlwt`, `copy` (presumably xlutils.copy), `AthenaUtil`
    and the global `weekstartdate` used as the sheet name — confirm they
    are defined/imported at module scope.
    '''
    try:
        print ("Extract the data from Athena...!!!")
        #conn = connect_athena()
        #curr = conn.cursor()
        # Re-open and copy the existing workbook (xlwt cannot edit in place);
        # otherwise start a fresh one.
        if os.path.exists('WeeklyKpiReport.xls'):
            wbk = xlrd.open_workbook('WeeklyKpiReport.xls', formatting_info=True)
            workbook = copy(wbk)
        else:
            workbook = xlwt.Workbook()

        worksheet = workbook.add_sheet(weekstartdate)
        '''bold = workbook.add_format({'bold': True})
        date_format = workbook.add_format({'num_format': 'yyyy-mm-dd'})
        text_format = workbook.add_format()

        # Add a number format for cells with money.
        money = workbook.add_format({'num_format': '$#,##0'})'''
        date_format = xlwt.XFStyle()
        date_format.num_format_str = 'yyyy-mm-dd'
        # Write some data headers.
        worksheet.write(0, 0, 'metric_name')
        worksheet.write(0, 1, 'metric_id')
        worksheet.write(0, 2, 'Week Start Date')
        worksheet.write(0, 3, 'Week End Date')
        worksheet.write(0, 4, 'time_dim_key')
        worksheet.write(0, 5, 'metric_source_id')
        worksheet.write(0, 6, 'metric_value')
        worksheet.write(0, 7, 'created_by')
        worksheet.write(0, 8, 'created_date')
        worksheet.write(0, 9, 'updated_by')
        worksheet.write(0, 10, 'updated_date')
        row = 1
        col = 0

        util = AthenaUtil(s3_staging_folder='s3://move-dataeng-temp-prod/athena-results/')
        for k, v in weekly_query.items():
            print (v)
            #curr.execute(v)
            #queryoutput = curr.fetchone()
            # Take the first result row's cells from the raw Athena response.
            snap_result = util.execute_query(sql_query=v, use_cache=False)
            queryoutput = snap_result["ResultSet"]["Rows"][0]['Data']
            print (queryoutput[0])
            worksheet.write(row, col, k)
            # Columns 1, 2, 7 and 9 hold dates; apply the yyyy-mm-dd style.
            for i in range(len(queryoutput)):
                if i == 1 or i == 2 or i == 7 or i == 9:
                    worksheet.write(row, col + 1 + i, queryoutput[i]['VarCharValue'], date_format)
                else:
                    worksheet.write(row, col + 1 + i, queryoutput[i]['VarCharValue'])

            # Also record an equivalent INSERT statement in column 11 —
            # presumably for a later manual load into dm_rdckpi; confirm.
            query = ("""INSERT INTO dm_rdckpi.metric_actual_fact(metric_id, timedimkey, metric_source_id, metric_value, created_by, created_date, updated_by, updated_date) VALUES(%s,'%s',%s,%s,'%s','%s','%s','%s');""" %(queryoutput[0]['VarCharValue'],queryoutput[3]['VarCharValue'],queryoutput[4]['VarCharValue'],queryoutput[5]['VarCharValue'],queryoutput[6]['VarCharValue'],queryoutput[7]['VarCharValue'],queryoutput[8]['VarCharValue'], queryoutput[9]['VarCharValue']))
            worksheet.write(row, 11, query)
            row += 1
        workbook.save('WeeklyKpiReport.xls')

    except Exception as err:
        # Best-effort: any failure is reported but not re-raised.
        print ("Here is the error...('%s')" %(err))
示例#5
0
def get_partitions2process(source_table, target_table, env='dev' ):
    '''
    Use Athena or LGK to derive to-be processed partitions
    Source PDT table partition in year/month/day/hour format(Hive)
    Target table partition in YYYYMMDD format
    '''
    from move_dl_common_api.athena_util import AthenaUtil
    import json
    import sys
    s3_location_target = 's3://move-dataeng-temp-%s/glue-ctas/athena-results' %(env)
    util = AthenaUtil(s3_staging_folder = s3_location_target)
    data = { 'source_table': source_table,'target_table':target_table}
    query_str_a = '''
      WITH src as(SELECT DISTINCT event_date  
                  FROM  {source_table}  ),
          tgt as( SELECT DISTINCT event_date  
                  FROM {target_table} )
    select   src.event_date
    from src left outer join  tgt
    ON (src.event_date = tgt.event_date )
    WHERE tgt.event_date IS NULL
    ORDER BY  src.event_date DESC '''.format(**data)
   

    query_str_b = '''
       select distinct  event_date 
                    from %s
                    order by event_date desc ''' %(source_table)   
    
    
    
    # If no target, return the source list
    try:
        print 'query_str_a=', query_str_a
        df_delta = util.get_pandas_frame(util.execute_query(query_str_a) )
        return sorted(list(df_delta['event_date'][1:] ), reverse=True)
    except:
        print 'Inc Query failed! Falling back to query_str_b=', query_str_b
        df_delta = util.get_pandas_frame(util.execute_query(query_str_b) )
        return sorted(list(df_delta['event_date'][1:] ), reverse=True)
示例#6
0
def refresh_partitions(target_table, env='dev'):
    '''
    Use Athena or LGK to derive to-be processed partitions
    '''
    from move_dl_common_api.athena_util import AthenaUtil
    import json
    import sys
    s3_location_target = 's3://move-dataeng-temp-%s/apillai/ctas-test' % (env)
    util = AthenaUtil(s3_staging_folder=s3_location_target)
    athena_ctas_query = '''msck repair table   %s;''' % (target_table)
    print 'athena_ctas_query= ', athena_ctas_query
    result = util.execute_query(athena_ctas_query)
    return result
示例#7
0
def get_pdt_partitions2process(source_table, target_table, env='dev'):
    '''
    Use Athena SHOW PARTITIONS to derive the to-be-processed partitions.

    Source PDT table partition is in year/month/day/hour format (Hive);
    target table partition is a single YYYYMMDD value.

    :param source_table: fully qualified Athena source table name
    :param target_table: fully qualified Athena target table name
    :param env: environment suffix for the staging bucket (new, defaults
        to 'dev' to preserve the previous hard-coded behavior)
    :return: YYYYMMDD strings present in the source but not the target,
             newest first
    '''
    from move_dl_common_api.athena_util import AthenaUtil
    # Bucket suffix was hard-coded to -dev; parameterized for other envs.
    s3_location_target = 's3://move-dataeng-temp-%s/apillai/ctas-test' % (env)
    util = AthenaUtil(s3_staging_folder=s3_location_target)
    df_src = util.get_pandas_frame(
        util.execute_query('''show partitions %s;''' % (source_table)))
    # 'partition' looks like year=YYYY/month=MM/day=DD/hour=HH: drop the
    # trailing hour component and join the values into YYYYMMDD.
    df_src['date_partition'] = df_src['partition'].apply(
        lambda part_str: ''.join(
            [x.split('=')[1] for x in part_str.split('/')][:-1]))
    #----
    df_tgt = util.get_pandas_frame(
        util.execute_query(''' show partitions %s;''' % (target_table)))
    df_tgt['date_partition'] = df_tgt['partition'].apply(
        lambda x: x.split('=')[1].replace('-', ''))
    return sorted(
        list(set(df_src['date_partition']) - set(df_tgt['date_partition'])),
        reverse=True)
def get_record_count(table_name, env):
    '''Return the row count of table_name (as a string) via an Athena COUNT(1).'''
    staging_folder = 's3://move-dataeng-temp-%s/athena_ctas/tmp/' % (env)
    athena = AthenaUtil(s3_staging_folder=staging_folder)
    count_sql = """select count(1) as ct from %s""" % (table_name)
    count_frame = athena.get_pandas_frame(athena.execute_query(count_sql))
    return ''.join(count_frame['ct'])
示例#9
0
##--
# Audit script: registers the raw hit_data and column_headers partitions
# for a given date, then (below) builds an audit query.
from argparse import ArgumentParser
parser = ArgumentParser(
    description="Audit  program to be called with manadatory Input Arguments")
parser.add_argument('--input_date', help='RD Input date in yyyy-mm-dd format')
#input_date = '2017-12-01'
ar = parser.parse_args()
input_date = ar.input_date
s3_base_location = 's3://move-dataeng-omniture-prod/homerealtor/raw-data-uncompressed/hit_data'
y, m, d = input_date.split('-')
table_name = 'move_dl.hit_data_raw'
# Register the hit_data partition for the requested date (idempotent via
# IF NOT EXISTS).
sql_text = """
ALTER TABLE %s ADD IF NOT EXISTS PARTITION  (year='%s',month='%s',day='%s') location '%s/year=%s/month=%s/day=%s';
""" % (table_name, y, m, d, s3_base_location, y, m, d)
print sql_text
# NOTE(review): `util` is not defined in this chunk — presumably an
# AthenaUtil instance created earlier in the file; confirm.
util.execute_query(sql_text)
s3_base_location = 's3://move-dataeng-omniture-prod/homerealtor/raw-data-uncompressed/column_headers'
table_name = 'move_dl.omtr_column_headers_raw'
# Same registration for the column_headers table.
sql_text = """
ALTER TABLE %s ADD IF NOT EXISTS PARTITION  (year='%s',month='%s',day='%s') location '%s/year=%s/month=%s/day=%s';
""" % (table_name, y, m, d, s3_base_location, y, m, d)
print sql_text
util.execute_query(sql_text)

# Parameters for the audit query built below (continues past this chunk).
data = {'year': y, 'month': m, 'day': d}
rd_audit_query = """
WITH dataset as(
SELECT d.source_filename,
         map( m.s3_data ,
         d.s3_data ) AS query_map
FROM 
示例#10
0
aws_region_name = 'us-west-2'
#s3_bucket = 'aws-athena-query-results-057425096214-us-west-2'
#s3_key = 'Unsaved/Abtest_data'
temp_location = 's3://move-dataeng-temp-dev/sql_refractor/'
result = pd.DataFrame()

# Input_SQL_Redshift.txt holds the SQL statements to run on Athena,
# separated by ";".
with open('Input_SQL_Redshift.txt', 'r') as sql_file:
    s = sql_file.read()
    d = s.split(';')

athena_df = pd.DataFrame()
util = AthenaUtil(s3_staging_folder=temp_location)

# Run each statement, collecting all result frames into one DataFrame;
# failures are reported and skipped.
for statement in d:
    try:
        result = util.execute_query(sql_query=statement)
        frame = util.get_pandas_frame(result)
        print(frame)
        athena_df = pd.concat([athena_df, frame], ignore_index=True)
    except Exception as e:
        print("Exception")
        print("Not Executed --------- ", statement)
        print(e)

#athena_df.columns = ['metric_id', 'start_date', 'end_date', 'fy_monthdimkey', 'metric_value']
athena_df.to_csv('ATHENA_SQL_OUTPUT.csv', index=False)