Exemplo n.º 1
0
def insert_into_date():
    """Populate public.dim_date with dates present in etl_1 but missing there.

    Selects distinct unprocessed dates from public.etl_1 that have no match
    in public.dim_date, derives day/month/quarter/year from each date's ISO
    form, and inserts one dim_date row per date.
    """
    with UseRedshift(redshift_db_config) as cursor:
        cursor.execute(rf"""SELECT DISTINCT etl_1.date 
                            FROM public.etl_1 etl_1
                            LEFT JOIN public.dim_date dim_date 
                                ON dim_date.date = etl_1.date
                            WHERE etl_1.in_etl_2=False
                                AND dim_date.date IS NULL""")
        for (date,) in cursor.fetchall():
            # isoformat() yields 'YYYY-MM-DD', so the parts split cleanly.
            year, month, day = date.isoformat().split('-')
            # Months 01-03 -> Q1, 04-06 -> Q2, 07-09 -> Q3, 10-12 -> Q4.
            qtr = str((int(month) - 1) // 3 + 1)
            date_id = str(uuid.uuid4())
            print('-------- Executing INSERT DATE Statement --------')
            # BUG FIX: the original bound the whole fetched row (a tuple) to
            # the date column; bind the date value itself.
            cursor.execute(
                rf"""INSERT INTO public.dim_date(
                                id, date, day, month, quarter, year)
                                VALUES(%s, %s, %s, %s, %s, %s)""",
                (date_id, date, day, month, qtr, year))
            print('-------- Insert Statement Complete -------- ')
Exemplo n.º 2
0
def create_table():
    """Create the public.s3_load staging table in Redshift if it is absent.

    All log fields are stored as VARCHAR at this stage; typed conversion
    happens downstream in the ETL.
    """
    with UseRedshift(redshift_db_config) as cursor:
        ddl = rf"""CREATE TABLE IF NOT EXISTS public.s3_load (
                    date VARCHAR(20), 
                    time VARCHAR(20), 
                    server_ip VARCHAR(20), 
                    method VARCHAR(10), 
                    uri_stem VARCHAR(80), 
                    uri_query VARCHAR(1000), 
                    server_port VARCHAR(10), 
                    username VARCHAR(60), 
                    client_ip VARCHAR(20), 
                    client_browser VARCHAR(1000), 
                    client_cookie VARCHAR(1000), 
                    client_referrer VARCHAR(1000), 
                    status VARCHAR(10), 
                    substatus VARCHAR(10), 
                    win32_status VARCHAR(100), 
                    bytes_sent VARCHAR(100), 
                    bytes_received VARCHAR(100), 
                    duration VARCHAR(100), 
                    in_etl_1 BOOLEAN)
                    """
        try:
            print('-------- Executing CREATE TABLE --------')
            cursor.execute(ddl)
            print('-------- SQL CREATE Complete --------')
        except Exception as err:
            print('Error executing SQL: ', err)
Exemplo n.º 3
0
def create_table():
    """Create the typed public.sparktest table in Redshift if it is missing."""
    # Typed target schema for the Spark-produced parquet data.
    ddl = rf"""CREATE TABLE IF NOT EXISTS public.sparktest (
                    date DATE, 
                    time TIMESTAMP, 
                    server_ip VARCHAR(20), 
                    method VARCHAR(10), 
                    uri_stem VARCHAR(80), 
                    uri_query VARCHAR(1000), 
                    server_port INT, 
                    username VARCHAR(60), 
                    client_ip VARCHAR(20), 
                    client_browser VARCHAR(1000), 
                    client_cookie VARCHAR(1000), 
                    client_referrer VARCHAR(1000), 
                    status INT, 
                    substatus INT, 
                    win32_status INT, 
                    bytes_sent INT, 
                    bytes_received INT, 
                    duration INT
                    )"""
    try:
        with UseRedshift(redshift_db_config) as cursor:
            cursor.execute(ddl)
    except Exception as err:
        print('Error: ', err)
Exemplo n.º 4
0
def insert_into_file():
    """Add one dim_file row per new (uri_stem, bytes_sent) pair in etl_1."""
    with UseRedshift(redshift_db_config) as cursor:
        cursor.execute(rf"""SELECT DISTINCT etl_1.uri_stem, 
                                            etl_1.bytes_sent
                            FROM public.etl_1 etl_1
                            LEFT JOIN public.dim_file dim_file
                                ON dim_file.uri_stem = etl_1.uri_stem
                                AND dim_file.bytes_sent = etl_1.bytes_sent
                            WHERE etl_1.in_etl_2=False
                                AND dim_file.uri_stem IS NULL
                                AND dim_file.bytes_sent IS NULL""")

        for uri_stem, bytes_sent in cursor.fetchall():
            # File type is the trailing '.ext' of the stem, or None if absent.
            ext_match = re.search(r'\.[A-Za-z0-9]+$', uri_stem)
            file_type = ext_match.group(0) if ext_match is not None else None
            # A request for /robots.txt is treated as a crawler hit.
            is_crawler = uri_stem == '/robots.txt'
            file_id = str(uuid.uuid4())
            print('-------- Executing INSERT FILE Statement --------')
            cursor.execute(
                rf"""INSERT INTO public.dim_file(
                                id, uri_stem, bytes_sent, file_type, is_crawler)
                                VALUES(%s, %s, %s, %s, %s)""",
                (file_id, uri_stem, bytes_sent, file_type, is_crawler))
            print('-------- Insert Statement Complete -------- ')
Exemplo n.º 5
0
def insert_into_location():
    """Populate public.dim_location for client IPs not yet dimensioned.

    Looks up each new client IP against the ipinfo.io API and stores
    postcode/city/region/country; missing fields are stored as NULL.
    """
    with UseRedshift(redshift_db_config) as cursor:
        cursor.execute(rf"""SELECT DISTINCT etl_1.client_ip
                            FROM public.etl_1 etl_1
                            LEFT JOIN public.dim_location dim_location
                                ON dim_location.client_ip = etl_1.client_ip
                            WHERE etl_1.in_etl_2=False
                                AND dim_location.client_ip IS NULL""")
        for (client_ip,) in cursor.fetchall():
            results = requests.get(f'https://ipinfo.io/{client_ip}')
            results_dict = results.json()
            # dict.get returns None for absent keys, replacing the
            # hand-rolled "'key' in results_dict.keys()" checks.
            postcode = results_dict.get('postal')
            city = results_dict.get('city')
            region = results_dict.get('region')
            # BUG FIX: 'country' was indexed directly and raised KeyError
            # whenever the API omitted it; treat it like the other fields.
            country = results_dict.get('country')
            location_id = str(uuid.uuid4())
            print('-------- Executing INSERT LOCATION Statement --------')
            cursor.execute(
                rf"""INSERT INTO public.dim_location(
                                id, client_ip, postcode, city, region, country)
                                VALUES(%s, %s, %s, %s, %s, %s)""",
                (location_id, client_ip, postcode, city, region, country))
            print('-------- Insert Statement Complete -------- ')
Exemplo n.º 6
0
def insert_into_time():
    """Populate public.dim_time with timestamps present in etl_1 but missing.

    Selects distinct unprocessed timestamps from public.etl_1, derives
    hour/minute/second from the ISO form and inserts one dim_time row each.
    """
    with UseRedshift(redshift_db_config) as cursor:
        cursor.execute(rf"""SELECT DISTINCT etl_1.time 
                            FROM public.etl_1 etl_1
                            LEFT JOIN public.dim_time dim_time 
                                ON dim_time.time = etl_1.time
                            WHERE etl_1.in_etl_2=False
                                AND dim_time.time IS NULL""")
        for (ts,) in cursor.fetchall():
            # isoformat() yields 'YYYY-MM-DDTHH:MM:SS'; take the time half
            # directly. BUG FIX: the original re-split BOTH halves in a loop
            # that shadowed the outer loop variable and only worked because
            # the time half happened to be processed last.
            hour, minute, second = ts.isoformat().split('T')[1].split(':')
            time_id = str(uuid.uuid4())
            print('-------- Executing INSERT TIME Statement --------')
            # BUG FIX: bind the timestamp value, not the whole fetched row
            # tuple, to the time column.
            cursor.execute(
                rf"""INSERT INTO public.dim_time(
                                id, time, hour, minute, second)
                                VALUES(%s, %s, %s, %s, %s)""",
                (time_id, ts, hour, minute, second))
            print('-------- Insert Statement Complete -------- ')
Exemplo n.º 7
0
def insert_ids_to_fact():
    """Back-fill dimension-table ids onto public.etl_1 fact rows.

    Runs one UPDATE per dimension (date, time, location, request, file,
    visit), matching on the natural keys; nullable keys (client_referrer,
    bytes_sent) are matched with explicit NULL-equals-NULL clauses. Rows are
    flagged in_etl_2=True only when every update succeeded.
    """
    with UseRedshift(redshift_db_config) as cursor:
        INSERT_DATE_ID = rf"""UPDATE public.etl_1
                            SET date_id = public.dim_date.id
                            FROM public.dim_date
                                WHERE public.etl_1.date = public.dim_date.date
                                AND public.etl_1.date_id IS NULL;"""
        INSERT_TIME_ID = rf"""UPDATE public.etl_1
                            SET time_id = public.dim_time.id
                            FROM public.dim_time
                                WHERE public.etl_1.time = public.dim_time.time
                                AND public.etl_1.time_id IS NULL;"""
        INSERT_LOCATION_ID = rf"""UPDATE public.etl_1
                                SET location_id = public.dim_location.id
                                FROM public.dim_location
                                    WHERE public.etl_1.client_ip = public.dim_location.client_ip
                                    AND public.etl_1.location_id IS NULL;"""
        INSERT_REQUEST_ID = rf"""UPDATE public.etl_1
                                SET request_id = public.dim_request.id
                                FROM public.dim_request
                                    WHERE public.etl_1.method        = public.dim_request.method
                                    AND public.etl_1.client_browser  = public.dim_request.client_browser
                                    AND (public.etl_1.client_referrer = public.dim_request.client_referrer 
                                        OR (public.etl_1.client_referrer IS NULL
                                            AND public.dim_request.client_referrer IS NULL))
                                    AND public.etl_1.status          = public.dim_request.status
                                    AND public.etl_1.duration        = public.dim_request.duration
                                    AND public.etl_1.request_id IS NULL;"""
        # fixed by accepting NULLs for client_referrer

        INSERT_FILE_ID = rf"""UPDATE public.etl_1
                            SET file_id = public.dim_file.id
                            FROM public.dim_file
                            WHERE public.etl_1.uri_stem = public.dim_file.uri_stem
                                AND (public.etl_1.bytes_sent = public.dim_file.bytes_sent
                                        OR (public.etl_1.bytes_sent IS NULL 
                                            AND public.dim_file.bytes_sent IS NULL))
                                AND public.etl_1.file_id IS NULL;"""
        # fixed by accepting NULLs for bytes_sent
        INSERT_VISIT_ID = rf"""UPDATE public.etl_1
                            SET visit_id = public.dim_visit.id
                            FROM public.dim_visit
                            WHERE public.etl_1.client_cookie = public.dim_visit.client_cookie
                                AND public.etl_1.visit_id IS NULL;"""
        try:
            cursor.execute(INSERT_DATE_ID)
            cursor.execute(INSERT_TIME_ID)
            cursor.execute(INSERT_LOCATION_ID)
            cursor.execute(INSERT_REQUEST_ID)
            cursor.execute(INSERT_FILE_ID)
            cursor.execute(INSERT_VISIT_ID)
        except Exception as err:
            print('Error: ', err)
        else:
            # BUG FIX: only mark rows as processed when every dimension-id
            # update succeeded. Previously this UPDATE ran unconditionally,
            # so rows were flagged in_etl_2=True even after a swallowed
            # exception left their ids unset.
            cursor.execute(rf"""UPDATE public.etl_1 SET in_etl_2=True
                                WHERE in_etl_2=False""")
Exemplo n.º 8
0
def copy_to_redshift():
    """Bulk-load the spark-etl5 S3 folder into public.sparktest via COPY."""
    copy_cmd = rf"""COPY public.sparktest FROM 's3://la-ticket-bucket-eu/spark-etl5'
                    IAM_ROLE 'arn:aws:iam::900056063831:role/RedshiftCopyUnload'
                    FORMAT AS PARQUET;
                    """
    try:
        with UseRedshift(redshift_db_config) as cursor:
            cursor.execute(copy_cmd)
    except Exception as err:
        print('Error: ', err)
Exemplo n.º 9
0
def create_table():
    """Create the six star-schema dimension tables in Redshift if missing."""
    with UseRedshift(redshift_db_config) as cursor:
        # One DDL statement per dimension table, executed in declaration order.
        dimension_ddls = (
            rf"""CREATE TABLE IF NOT EXISTS public.dim_date (
                    id VARCHAR(50), 
                    date DATE, 
                    day INTEGER,
                    week INTEGER, 
                    month INTEGER, 
                    quarter INTEGER,
                    year INTEGER)""",
            rf"""CREATE TABLE IF NOT EXISTS public.dim_time (
                    id VARCHAR(50), 
                    time TIMESTAMP, 
                    hour INTEGER, 
                    minute INTEGER, 
                    second INTEGER)""",
            rf"""CREATE TABLE IF NOT EXISTS public.dim_location (
                    id VARCHAR(50), 
                    client_ip VARCHAR(200), 
                    postcode VARCHAR(10), 
                    city VARCHAR(300), 
                    region VARCHAR(200), 
                    country VARCHAR(200))""",
            rf"""CREATE TABLE IF NOT EXISTS public.dim_request (
                    id VARCHAR(50), 
                    method VARCHAR(10), 
                    client_browser VARCHAR(1000), 
                    client_referrer VARCHAR(1000), 
                    status INTEGER, 
                    duration INTEGER)""",
            rf"""CREATE TABLE IF NOT EXISTS public.dim_file (
                    id VARCHAR(50), 
                    uri_stem VARCHAR(80), 
                    bytes_sent INTEGER, 
                    file_type VARCHAR(10), 
                    is_crawler BOOLEAN)""",
            rf"""CREATE TABLE IF NOT EXISTS public.dim_visit (
                    id VARCHAR(50), 
                    client_cookie VARCHAR(1000))""",
        )
        try:
            print('-------- Executing CREATE TABLE --------')
            for ddl in dimension_ddls:
                cursor.execute(ddl)
            print('-------- SQL CREATE Complete --------')
        except Exception as err:
            print('Error executing SQL: ', err)
Exemplo n.º 10
0
def insert_into_visit():
    """Populate public.dim_visit with client cookies not yet dimensioned.

    NULL cookies are stored as empty strings so later joins on
    client_cookie can still match.
    """
    with UseRedshift(redshift_db_config) as cursor:
        cursor.execute(rf"""SELECT DISTINCT etl_1.client_cookie
                            FROM public.etl_1 etl_1
                            LEFT JOIN public.dim_visit dim_visit
                                ON dim_visit.client_cookie = etl_1.client_cookie
                            WHERE etl_1.in_etl_2=False
                                AND dim_visit.client_cookie IS NULL""")

        for (cookie,) in cursor.fetchall():
            # Idiom fix: compare against None with 'is', not '=='.
            client_cookie = '' if cookie is None else cookie
            visit_id = str(uuid.uuid4())
            print('-------- Executing INSERT VISIT Statement --------')
            cursor.execute(
                rf"""INSERT INTO public.dim_visit(
                                id, client_cookie)
                                VALUES(%s, %s)""", (visit_id, client_cookie))
            print('-------- Insert Statement Complete -------- ')
Exemplo n.º 11
0
def insert_into_request():
    """Add a dim_request row for each new request attribute combination."""
    with UseRedshift(redshift_db_config) as cursor:
        cursor.execute(rf"""SELECT DISTINCT etl_1.method, 
                                            etl_1.client_browser, 
                                            etl_1.client_referrer, 
                                            etl_1.status, 
                                            etl_1.duration
                            FROM public.etl_1 etl_1
                            LEFT JOIN public.dim_request dim_request
                                ON dim_request.method = etl_1.method
                                AND dim_request.client_browser = etl_1.client_browser
                                AND dim_request.client_referrer = etl_1.client_referrer
                                AND dim_request.status = etl_1.status
                                AND dim_request.duration = etl_1.duration
                            WHERE etl_1.in_etl_2=False
                                AND dim_request.method IS NULL
                                AND dim_request.client_browser IS NULL
                                AND dim_request.client_referrer IS NULL
                                AND dim_request.status IS NULL
                                AND dim_request.duration IS NULL""")
        # Unpack each distinct combination straight from the result row.
        for method, client_browser, client_referrer, status, duration in cursor.fetchall():
            request_id = str(uuid.uuid4())
            print('-------- Executing INSERT REQUEST Statement --------')
            cursor.execute(
                rf"""INSERT INTO public.dim_request(
                                id, method, client_browser, client_referrer, status, duration)
                                VALUES(%s, %s, %s, %s, %s, %s)""",
                (request_id, method, client_browser, client_referrer, status,
                 duration))
            print('-------- Insert Statement Complete -------- ')
Exemplo n.º 12
0
def create_table():
    """Create the public.etl_1 fact staging table in Redshift if absent.

    Holds typed log rows plus one foreign-key id column per dimension and
    the in_etl_2 processed flag.
    """
    with UseRedshift(redshift_db_config) as cursor:
        ddl = rf"""CREATE TABLE IF NOT EXISTS public.etl_1 (
                    id VARCHAR(50), 
                    date_id VARCHAR(50), 
                    time_id VARCHAR(50), 
                    location_id VARCHAR(50), 
                    request_id VARCHAR(50), 
                    file_id VARCHAR(50), 
                    visit_id VARCHAR(50),
                    date DATE, 
                    time TIMESTAMP, 
                    server_ip VARCHAR(20), 
                    method VARCHAR(10), 
                    uri_stem VARCHAR(80), 
                    uri_query VARCHAR(1000), 
                    server_port INT, 
                    username VARCHAR(60), 
                    client_ip VARCHAR(20), 
                    client_browser VARCHAR(1000), 
                    client_cookie VARCHAR(1000), 
                    client_referrer VARCHAR(1000), 
                    status INT, 
                    substatus INT, 
                    win32_status INT, 
                    bytes_sent INT, 
                    bytes_received INT, 
                    duration INT, 
                    in_etl_2 BOOLEAN)"""
        try:
            print('-------- Executing CREATE TABLE --------')
            cursor.execute(ddl)
            print('-------- SQL CREATE Complete --------')
        except Exception as err:
            print('Error executing SQL: ', err)
Exemplo n.º 13
0
def insert_into_table():
    """Parse one IIS log file from S3 and stage its rows into public.s3_load.

    Lines starting with '#' are IIS header comments and are skipped; data
    lines are expected to carry either 18 or 14 space-separated fields.
    """
    with UseRedshift(redshift_db_config) as cursor:
        s3_client = boto3.client('s3',
                                 aws_access_key_id=aws_access_key,
                                 aws_secret_access_key=aws_secret_key)

        # NOTE: iterating every object under BI_logs/ (list_objects + loop
        # over the 'Contents' keys) froze after a few hours on large
        # buckets, so a single named log file is processed per run instead.
        file_object = s3_client.get_object(Bucket='la-ticket-bucket-eu',
                                           Key='BI_logs/u_ex110407.log')
        log_lines = file_object['Body'].read().decode().split('\n')
        in_etl_1_flag = False

        for line in log_lines:
            log = line.split(' ')
            if log[0].startswith('#'):
                continue  # header/comment line, no data to stage
            if len(log) == 18:
                print('-------- Executing INSERT Statement --------')
                cursor.execute(
                    rf"""INSERT INTO public.s3_load(
                                    date, time, server_ip, 
                                    method, uri_stem, uri_query, 
                                    server_port, username, client_ip, 
                                    client_browser, client_cookie, 
                                    client_referrer, status, 
                                    substatus, win32_status, 
                                    bytes_sent, bytes_received, 
                                    duration, in_etl_1) VALUES (%s, %s, %s, %s, %s, 
                                    %s, %s, %s, %s, %s, %s, %s, %s, %s, 
                                    %s, %s, %s, %s, %s)""",
                    log + [in_etl_1_flag])
                print('-------- Insert Statement Complete -------- ')
            elif len(log) == 14:
                print('-------- Executing INSERT Statement --------')
                cursor.execute(
                    rf"""INSERT INTO public.s3_load(
                                    date, time, server_ip, 
                                    method, uri_stem, uri_query, 
                                    server_port, username, client_ip, 
                                    client_browser, status, 
                                    substatus, win32_status,  
                                    duration, in_etl_1) VALUES (%s, %s, %s, %s, %s, 
                                    %s, %s, %s, %s, %s, %s, %s, %s, %s, %s 
                                    )""",
                    log + [in_etl_1_flag])
                print('-------- Insert Statement Complete -------- ')
            else:
                print('Unknown log length.')
Exemplo n.º 14
0
def insert_into_table():
    """Move unprocessed rows from public.s3_load into typed public.etl_1.

    Each staged row is converted (integer fields parsed, date+time strings
    joined into one timestamp string, NULL cookies normalised to '') and
    inserted with a fresh UUID id and in_etl_2=False. Finally every copied
    s3_load row is flagged in_etl_1=True.
    """
    with UseRedshift(redshift_db_config) as cursor:

        # Select all if not already present in public.etl_1 table
        cursor.execute(rf"""SELECT * FROM public.s3_load WHERE in_etl_1 = False""")

        in_etl_2_flag = False

        logs = cursor.fetchall()
        for single_log in logs:
            if len(single_log) == 19:
                new_row = []
                new_row.append(single_log[0])  # date
                # Date + time strings joined into a single timestamp string.
                new_row.append(" ".join(single_log[0:2]))  # timestamp
                new_row.append(single_log[2])  # server_ip
                new_row.append(single_log[3])  # method
                new_row.append(single_log[4])  # uri_stem
                new_row.append(single_log[5])  # uri_query
                new_row.append(int(single_log[6]))  # server_port
                new_row.append(single_log[7])  # username
                new_row.append(single_log[8])  # client_ip
                new_row.append(single_log[9])  # client_browser
                # NULL cookies become '' so dimension joins can match.
                # (Idiom fix throughout: 'is None' instead of '== None'.)
                new_row.append('' if single_log[10] is None else single_log[10])  # client_cookie
                new_row.append(single_log[11])  # client_referrer
                new_row.append(int(single_log[12]))  # status
                new_row.append(int(single_log[13]))  # substatus
                new_row.append(int(single_log[14]))  # win32_status
                # bytes_sent / bytes_received may be NULL; cast only real values.
                new_row.append(None if single_log[15] is None else int(single_log[15]))  # bytes_sent
                new_row.append(None if single_log[16] is None else int(single_log[16]))  # bytes_received
                new_row.append(int(single_log[17]))  # duration
                log_id = str(uuid.uuid4())
                print('.')
                cursor.execute(rf"""INSERT INTO public.etl_1(
                                id, date, time, server_ip, 
                                method, uri_stem, uri_query, 
                                server_port, username, client_ip, 
                                client_browser, client_cookie, 
                                client_referrer, status, 
                                substatus, win32_status, 
                                bytes_sent, bytes_received, 
                                duration, in_etl_2) VALUES (%s, %s, %s, %s, %s, %s, 
                                %s, %s, %s, %s, %s, %s, %s, %s, %s, 
                                %s, %s, %s, %s, %s)""",
                               [log_id] + new_row + [in_etl_2_flag])
                print('.')
            elif len(single_log) == 15:
                # NOTE(review): SELECT * returns a fixed column count, so
                # this branch looks unreachable for a 19-column s3_load —
                # confirm whether an alternate schema ever feeds this path.
                new_row = []
                new_row.append(single_log[0])  # date
                new_row.append(" ".join(single_log[0:2]))  # timestamp
                new_row.append(single_log[2])  # server_ip
                new_row.append(single_log[3])  # method
                new_row.append(single_log[4])  # uri_stem
                new_row.append(single_log[5])  # uri_query
                new_row.append(int(single_log[6]))  # server_port
                new_row.append(single_log[7])  # username
                new_row.append(single_log[8])  # client_ip
                new_row.append(single_log[9])  # client_browser
                new_row.append(int(single_log[10]))  # status
                new_row.append(int(single_log[11]))  # substatus
                new_row.append(int(single_log[12]))  # win32_status
                new_row.append(int(single_log[13]))  # duration
                new_row.append('')  # client_cookie: replace NULL with ''
                log_id = str(uuid.uuid4())
                print('.')
                # BUG FIX: this statement names 17 columns and receives 17
                # values but only had 16 %s placeholders, so execute() would
                # always fail; the missing placeholder is restored.
                cursor.execute(rf"""INSERT INTO public.etl_1(
                                id, date, time, server_ip, 
                                method, uri_stem, uri_query, 
                                server_port, username, client_ip, 
                                client_browser, status, 
                                substatus, win32_status,  
                                duration, client_cookie, in_etl_2) VALUES (%s, %s, %s, %s, %s, %s, 
                                %s, %s, %s, %s, %s, %s, %s, %s, %s, 
                                %s, %s)""",
                               [log_id] + new_row + [in_etl_2_flag])
                print('.')
            else:
                # Message fix: the old text ('not 18 columns long') named the
                # wrong width; match the sibling loader's wording.
                print('Unknown log length.')

        cursor.execute(rf"""UPDATE public.s3_load SET in_etl_1=True
                            WHERE in_etl_1=False""")