def transform_data_func(df):
    """Deduplicate rows and drop those with dangling foreign references.

    After removing fully duplicated rows, a row is kept only when its
    ``order_id``, ``product_id`` and ``seller_id`` each appear in the
    matching ``olist_db`` table (orders, products, sellers).

    Args:
        df: DataFrame with ``order_id``, ``product_id`` and ``seller_id``
            columns.

    Returns:
        The filtered DataFrame. It is empty when any of the reference
        tables returns no rows.

    Raises:
        ValueError: If any step fails. The original exception is chained
            as ``__cause__``. Interpreter-level signals such as
            ``KeyboardInterrupt`` are not converted and propagate as-is.
    """
    # (column in df, olist_db table holding the valid values for it)
    references = (
        ('order_id', 'olist_orders_dataset'),
        ('product_id', 'olist_products_dataset'),
        ('seller_id', 'olist_sellers_dataset'),
    )
    try:
        # Removes duplicate rows based on all columns.
        df = df.drop_duplicates()

        # One handle serves all three lookups.
        # NOTE(review): assumes the object returned by engine_connect() can
        # run several statements in sequence - confirm against `storage`.
        db_handle = storage.engine_connect()
        for column, table in references:
            result = db_handle.execute(
                'SELECT DISTINCT {column} FROM olist_db.{table};'.format(
                    column=column, table=table))
            # A plain list also covers an empty result set, where indexing
            # column 0 of a DataFrame built from the rows would fail.
            valid_values = [row[0] for row in result]
            df = df[df[column].isin(valid_values)]

        return df
    except Exception as e:
        raise ValueError(e) from e
def query_execute(query, table_name):
    """Run ``query``, then print the row count of an ``olist_db`` table.

    Both statements are executed inside a single ``begin()`` block on the
    handle returned by ``storage.engine_connect()``.

    Args:
        query: SQL text handed to ``execute`` unchanged.
        table_name: Name of the ``olist_db`` table to count afterwards. It
            is interpolated into the SQL text without escaping, so it must
            be a trusted identifier.

    Raises:
        ValueError: If either statement fails. The original exception is
            chained as ``__cause__``. Interpreter-level signals such as
            ``KeyboardInterrupt`` are not converted.
    """
    try:
        with storage.engine_connect().begin() as connection:
            connection.execute(query)
            result = connection.execute(
                'SELECT COUNT(*) FROM olist_db.{db_table_name};'.format(
                    db_table_name=table_name))
            # COUNT(*) yields one row with one column; scalar() reads it
            # directly, so the count prints as a number rather than as
            # the repr of a one-element set.
            print('Row count: ' + str(result.scalar()))
    except Exception as e:
        raise ValueError(e) from e
def transform_data_func(df):
    """Deduplicate sellers and drop those with an unknown zip code prefix.

    Args:
        df: DataFrame with ``seller_id`` and ``seller_zip_code_prefix``
            columns.

    Returns:
        The DataFrame with one row per ``seller_id``, restricted to rows
        whose ``seller_zip_code_prefix`` appears in
        ``olist_db.olist_geolocation_dataset``. It is empty when that
        table returns no rows.

    Raises:
        ValueError: If any step fails. The original exception is chained
            as ``__cause__``. Interpreter-level signals such as
            ``KeyboardInterrupt`` are not converted.
    """
    try:
        # Remove duplicated primary key
        df = df.drop_duplicates(subset=['seller_id'])

        # Load zip_code_prefix from geolocation
        result = storage.engine_connect().execute(
            'SELECT DISTINCT geolocation_zip_code_prefix FROM olist_db.olist_geolocation_dataset;')
        # A plain list also covers an empty result set, where indexing
        # column 0 of a DataFrame built from the rows would fail.
        known_prefixes = [row[0] for row in result]

        # Filter by zip_code_prefix
        return df[df.seller_zip_code_prefix.isin(known_prefixes)]
    except Exception as e:
        raise ValueError(e) from e
# Example #4
0
def transform_data_func(df):
    """Deduplicate orders and drop those that reference an unknown customer.

    Args:
        df: DataFrame with ``order_id`` and ``customer_id`` columns.

    Returns:
        The DataFrame with one row per ``order_id``, restricted to rows
        whose ``customer_id`` appears in
        ``olist_db.olist_customers_dataset``. It is empty when that table
        returns no rows.

    Raises:
        ValueError: If any step fails. The original exception is chained
            as ``__cause__``. Interpreter-level signals such as
            ``KeyboardInterrupt`` are not converted.
    """
    try:
        # Remove duplicated primary key
        df = df.drop_duplicates(subset=['order_id'])

        # Load customer_id from customers
        result = storage.engine_connect().execute(
            'SELECT DISTINCT customer_id FROM olist_db.olist_customers_dataset;')
        # A plain list also covers an empty result set, where indexing
        # column 0 of a DataFrame built from the rows would fail.
        known_customers = [row[0] for row in result]

        # Filter by customer_id
        return df[df.customer_id.isin(known_customers)]
    except Exception as e:
        raise ValueError(e) from e
def sales_fact(ds, **kwargs):
    """Build the ``f_sales`` fact table from the staged olist tables.

    Steps, in order:
      1. Recreate ``olist_db.temp_city`` (customer_id paired with the
         matching city/state ids from ``d_city``/``d_state``).
      2. Recreate ``olist_db.temp_payment`` (payments with a generated
         ``payment_id`` and the ``type_id`` from ``d_payment_type``).
      3. Read the fact rows with ``pd.read_sql`` and hand them to
         ``storage.load_data_into_db`` for ``olist_db`` / ``f_sales``.
      4. Drop both temporary tables.

    Args:
        ds: Not used by this function.
        **kwargs: Not used by this function.
            NOTE(review): the signature looks like a scheduler callback
            (e.g. an Airflow PythonOperator callable) - confirm.

    Raises:
        ValueError: Propagated from ``query_execute`` when one of the DDL
            steps fails. Errors raised by the read or the load step are
            not wrapped here.
    """
    # CREATE TEMPORARY TABLES
    query = '''
    DROP TABLE IF EXISTS olist_db.temp_city;
    CREATE TABLE olist_db.temp_city
    SELECT 
    location.city_id,
    location.state_id,
    location.city,
    location.state,
    customers_dataset.customer_id
    FROM
    (SELECT city_id, location_state.state_id, city, state FROM olist_db.d_city AS location_city
    INNER JOIN olist_db.d_state AS location_state ON location_city.state_id = location_state.state_id) AS location,
    (SELECT customers_dataset.customer_id, customer_city AS city, customer_state AS state FROM olist_db.olist_customers_dataset AS customers_dataset
    INNER JOIN olist_db.olist_orders_dataset AS orders_datase ON orders_datase.customer_id = customers_dataset.customer_id) AS customers_dataset
    WHERE customers_dataset.city = location.city
    AND customers_dataset.state = location.state;
    '''
    print('temp_city')
    query_execute(query, 'temp_city')

    # @rownr is a session variable used to number the payment rows.
    query = '''
    DROP TABLE IF EXISTS olist_db.temp_payment;
    SET @rownr=0;
    CREATE TABLE olist_db.temp_payment
    SELECT @rownr:=@rownr+1 AS payment_id, type_id, order_id, payment_sequential, payment_installments, payment_value FROM olist_db.olist_order_payments_dataset AS payments_dataset
    INNER JOIN olist_db.d_payment_type AS payment_type ON payment_type.payment_type = payments_dataset.payment_type;
    '''
    print('temp_payment')
    query_execute(query, 'temp_payment')

    # LOAD FACT SALES
    # NOTE(review): every join is an INNER JOIN, so an order with several
    # items, payments or reviews produces one row per combination -
    # confirm this fan-out is intended.
    with storage.engine_connect().begin() as connection:
        f_sales = pd.read_sql("""
            SELECT 
            orders_dataset.order_id
            , product_id
            , city_id
            , payment_id
            , review_id
            , (SELECT hour_id FROM olist_db.d_hour WHERE hour = HOUR(order_approved_at)) AS hour_id
            , (SELECT day_id FROM olist_db.d_day WHERE day = DAY(order_approved_at)) AS day_id
            , (SELECT month_id FROM olist_db.d_month WHERE month = MONTH(order_approved_at)) AS month_id
            , (SELECT year_id FROM olist_db.d_year WHERE year = YEAR(order_approved_at)) AS year_id
            , order_items_dataset.price
            FROM 
            olist_db.olist_orders_dataset AS orders_dataset
            INNER JOIN olist_db.olist_order_items_dataset AS order_items_dataset ON order_items_dataset.order_id = orders_dataset.order_id
            INNER JOIN olist_db.temp_payment AS temp_payment ON temp_payment.order_id = orders_dataset.order_id
            INNER JOIN olist_db.olist_customers_dataset AS customers_dataset ON customers_dataset.customer_id = orders_dataset.customer_id
            INNER JOIN olist_db.temp_city AS temp_city ON temp_city.customer_id = customers_dataset.customer_id
            INNER JOIN olist_db.olist_order_reviews_dataset AS olist_order_reviews_dataset ON olist_order_reviews_dataset.order_id = orders_dataset.order_id
            WHERE order_approved_at IS NOT NULL;
        """, connection)
    storage.load_data_into_db(f_sales, 'olist_db', 'f_sales')

    # DROP TEMP TABLES
    # 'f_sales' is passed so the row count printed afterwards is that of
    # the freshly loaded fact table.
    query = '''
    DROP TABLE IF EXISTS olist_db.temp_city;
    DROP TABLE IF EXISTS olist_db.temp_payment;
    '''
    query_execute(query, 'f_sales')