Example #1
0
def update_athena_partition(*args, **kwargs):
    """Register the execution month's S3 location as a partition on the
    bikeshare Athena table.

    Intended as an Airflow PythonOperator callable: expects the templated
    ``ds`` execution date (``YYYY-MM-DD``) in ``kwargs`` and reads the
    table/bucket configuration from Airflow Variables.
    """
    execution_date = datetime.datetime.strptime(kwargs['ds'], '%Y-%m-%d')
    execution_month = execution_date.month
    execution_year = execution_date.year
    s3_prefix = Variable.get('bikeshare_s3_prefix')
    logs_location = Variable.get('bikeshare_athena_logs')
    athena_db = Variable.get('bikeshare_athena_db')
    athena_table_name = Variable.get('bikeshare_athena_table')
    file_location = ('s3://bikeshare-data-copy/' + s3_prefix
                     + f'year={execution_year}/month={execution_month}/')
    result_configuration = {"OutputLocation": logs_location}
    # Athena (Hive) DDL string literals must be single-quoted; the original
    # mixed double and single quotes (year="{}" vs month='{}') and used a
    # double-quoted LOCATION, which Athena rejects. IF NOT EXISTS makes the
    # task idempotent so Airflow retries don't fail on an existing partition.
    partition_update_query = """
    ALTER TABLE {} ADD IF NOT EXISTS PARTITION (year='{}', month='{}')
    LOCATION '{}';
    """
    athena_hook = AWSAthenaHook(aws_conn_id='aws_credentials')
    # client_request_token makes the StartQueryExecution call idempotent
    # on the AWS side as well.
    athena_hook.run_query(
        partition_update_query.format(
            athena_table_name, execution_year, execution_month,
            file_location),
        result_configuration=result_configuration,
        query_context={"Database": athena_db},
        client_request_token=str(uuid4()))
Example #2
0
def check_data_in_redshift(*args, **kwargs):
    """Data-quality check: compare the trip count for the execution month
    between S3 (via Athena) and Redshift, raising ``Partial_Data_Missing``
    on any mismatch.

    Intended as an Airflow PythonOperator callable: expects the templated
    ``ds`` execution date (``YYYY-MM-DD``) in ``kwargs``.
    """
    execution_date = datetime.datetime.strptime(kwargs['ds'], '%Y-%m-%d')
    execution_month = execution_date.month
    execution_year = execution_date.year
    logs_location = Variable.get("bikeshare_athena_logs")
    athena_db = Variable.get("bikeshare_athena_db")
    # Both values interpolated below are ints derived from strptime, so the
    # f-string queries cannot be injected into.
    num_records_athena_query = f"""
    SELECT COUNT(*) FROM trips
    WHERE year = {execution_year} AND month = {execution_month}
    """
    num_records_redshift_query = f"""
    SELECT COUNT(*) FROM trips
    WHERE date_part(year, trips.start_time) = {execution_year} AND date_part(month, trips.start_time) = {execution_month}
    """
    athena_hook = AWSAthenaHook(aws_conn_id='aws_credentials')
    query_id = athena_hook.run_query(
        query=num_records_athena_query,
        query_context={"Database": athena_db},
        result_configuration={"OutputLocation": logs_location},
        client_request_token=str(uuid4()))
    # Poll until the Athena query reaches a terminal state instead of a
    # fixed time.sleep(20), which can return before a slow query finishes
    # and then read empty/partial results.
    athena_hook.poll_query_status(query_execution_id=query_id)
    athena_query_results = athena_hook.get_query_results(
        query_execution_id=query_id)
    # Row 0 of the Athena result set is the header; row 1 holds COUNT(*).
    num_records_in_s3 = int(athena_query_results['ResultSet']['Rows'][1]
                            ['Data'][0]['VarCharValue'])
    redshift_hook = PostgresHook('redshift_connection')
    redshift_query_results = redshift_hook.get_records(
        sql=num_records_redshift_query)
    num_records_in_redshift = int(redshift_query_results[0][0])
    logging.info(f"Number of records in S3 - {num_records_in_s3}")
    logging.info(f"Number of records in Redshift - {num_records_in_redshift}")
    if num_records_in_s3 == num_records_in_redshift:
        logging.info(
            f"Successfully Copied data for the year - {execution_year}, month - {execution_month}"
        )
    else:
        raise Partial_Data_Missing
Example #3
0
 def get_hook(self):
     """Build and return the AWSAthenaHook this operator talks to,
     configured from the operator's connection id and poll interval."""
     athena_hook = AWSAthenaHook(self.aws_conn_id, self.sleep_time)
     return athena_hook