def transform_data_func(df):
    """Deduplicate rows and drop those whose foreign keys are unknown.

    Exact duplicate rows (compared on all columns) are removed first. The
    frame is then filtered three times, keeping only rows whose
    ``order_id``, ``product_id`` and ``seller_id`` each appear in the
    matching ``olist_db`` table (orders, products and sellers).

    Args:
        df: pandas DataFrame with ``order_id``, ``product_id`` and
            ``seller_id`` columns.

    Returns:
        The filtered DataFrame. When a reference table returns no rows the
        result is empty rather than an error.

    Raises:
        ValueError: wraps any ordinary error raised while querying or
            filtering; the original exception is attached as ``__cause__``.
    """
    # (DataFrame column, query listing the key values that column may hold).
    key_checks = (
        ('order_id',
         'SELECT DISTINCT order_id FROM olist_db.olist_orders_dataset;'),
        ('product_id',
         'SELECT DISTINCT product_id FROM olist_db.olist_products_dataset;'),
        ('seller_id',
         'SELECT DISTINCT seller_id FROM olist_db.olist_sellers_dataset;'),
    )
    try:
        # Remove duplicate rows based on all columns.
        df = df.drop_duplicates()

        # One explicitly scoped connection serves all three lookups.
        # NOTE(review): assumes storage.engine_connect() returns an object
        # whose .begin() is a context manager yielding a connection, as the
        # other database helpers in this file rely on - confirm.
        with storage.engine_connect().begin() as connection:
            for column, query in key_checks:
                # Read the single selected column positionally so the code
                # does not depend on how a DataFrame would label it.
                known_keys = [row[0] for row in connection.execute(query)]
                df = df[df[column].isin(known_keys)]
        return df
    except Exception as e:
        # Exception (not BaseException) so KeyboardInterrupt / SystemExit
        # propagate instead of being disguised as ValueError.
        raise ValueError(e) from e
def query_execute(query, table_name):
    """Run ``query`` and print the row count of ``olist_db.<table_name>``.

    Both statements are executed on one connection obtained from
    ``storage.engine_connect().begin()``.

    Args:
        query: SQL text passed unchanged to ``connection.execute``.
        table_name: Name of the ``olist_db`` table to count afterwards. It
            is interpolated into the SQL text as-is, so it must come from
            trusted code, never from user input.

    Raises:
        ValueError: wraps any ordinary error raised while executing the
            query or counting rows; the original exception is attached as
            ``__cause__``.
    """
    try:
        with storage.engine_connect().begin() as connection:
            connection.execute(query)
            result = connection.execute(
                'SELECT COUNT(*) FROM olist_db.{db_table_name};'.format(
                    db_table_name=table_name))
            # Each row is rendered as a set of its values, so the line
            # printed looks like "Row count: {42}". Kept as-is so existing
            # log output does not change.
            row_value_sets = [set(row) for row in result]
            print('Row count: ' + str(row_value_sets[0]))
    except Exception as e:
        # Exception (not BaseException) so KeyboardInterrupt / SystemExit
        # propagate instead of being disguised as ValueError.
        raise ValueError(e) from e
def transform_data_func(df):
    """Deduplicate sellers and keep those with a known zip-code prefix.

    Rows are deduplicated on ``seller_id`` (pandas keeps the first
    occurrence). The frame is then restricted to rows whose
    ``seller_zip_code_prefix`` appears in
    ``olist_db.olist_geolocation_dataset.geolocation_zip_code_prefix``.

    Args:
        df: pandas DataFrame with ``seller_id`` and
            ``seller_zip_code_prefix`` columns.

    Returns:
        The filtered DataFrame. When the geolocation table returns no rows
        the result is empty rather than an error.

    Raises:
        ValueError: wraps any ordinary error raised while querying or
            filtering; the original exception is attached as ``__cause__``.
    """
    try:
        # Remove duplicated primary key.
        df = df.drop_duplicates(subset=['seller_id'])

        # Load the known zip-code prefixes on an explicitly scoped
        # connection.
        # NOTE(review): assumes storage.engine_connect() returns an object
        # whose .begin() is a context manager yielding a connection, as the
        # other database helpers in this file rely on - confirm.
        with storage.engine_connect().begin() as connection:
            result = connection.execute(
                'SELECT DISTINCT geolocation_zip_code_prefix '
                'FROM olist_db.olist_geolocation_dataset;')
            # Read the single selected column positionally so the code does
            # not depend on how a DataFrame would label it.
            known_prefixes = [row[0] for row in result]

        # Filter by zip_code_prefix.
        return df[df.seller_zip_code_prefix.isin(known_prefixes)]
    except Exception as e:
        # Exception (not BaseException) so KeyboardInterrupt / SystemExit
        # propagate instead of being disguised as ValueError.
        raise ValueError(e) from e
def transform_data_func(df):
    """Deduplicate orders and keep those that reference a known customer.

    Rows are deduplicated on ``order_id`` (pandas keeps the first
    occurrence). The frame is then restricted to rows whose ``customer_id``
    appears in ``olist_db.olist_customers_dataset``.

    Args:
        df: pandas DataFrame with ``order_id`` and ``customer_id`` columns.

    Returns:
        The filtered DataFrame. When the customers table returns no rows the
        result is empty rather than an error.

    Raises:
        ValueError: wraps any ordinary error raised while querying or
            filtering; the original exception is attached as ``__cause__``.
    """
    try:
        # Remove duplicated primary key.
        df = df.drop_duplicates(subset=['order_id'])

        # Load the known customer ids on an explicitly scoped connection.
        # NOTE(review): assumes storage.engine_connect() returns an object
        # whose .begin() is a context manager yielding a connection, as the
        # other database helpers in this file rely on - confirm.
        with storage.engine_connect().begin() as connection:
            result = connection.execute(
                'SELECT DISTINCT customer_id '
                'FROM olist_db.olist_customers_dataset;')
            # Read the single selected column positionally so the code does
            # not depend on how a DataFrame would label it.
            known_customers = [row[0] for row in result]

        # Filter by customer_id.
        return df[df.customer_id.isin(known_customers)]
    except Exception as e:
        # Exception (not BaseException) so KeyboardInterrupt / SystemExit
        # propagate instead of being disguised as ValueError.
        raise ValueError(e) from e
def sales_fact(ds, **kwargs):
    """Build the ``f_sales`` fact table from the staged Olist tables.

    Steps, in order:
      1. Recreate ``olist_db.temp_city`` (customer -> city/state ids).
      2. Recreate ``olist_db.temp_payment`` (payments with a generated
         ``payment_id`` and the payment ``type_id``).
      3. Read the joined fact rows for approved orders into a DataFrame.
      4. Hand that frame to ``storage.load_data_into_db`` with
         ``'olist_db'`` and ``'f_sales'``.
      5. Drop both temporary tables.

    ``query_execute`` prints a row count after each statement batch it runs.

    Args:
        ds: Not used by this function.
        **kwargs: Not used by this function.
            NOTE(review): the signature looks like an Airflow python
            callable - confirm against the DAG definition.
    """
    temp_city_sql = '''
        DROP TABLE IF EXISTS olist_db.temp_city;
        CREATE TABLE olist_db.temp_city
        SELECT location.city_id, location.state_id, location.city, location.state, customers_dataset.customer_id
        FROM (SELECT city_id, location_state.state_id, city, state
              FROM olist_db.d_city AS location_city
              INNER JOIN olist_db.d_state AS location_state
              ON location_city.state_id = location_state.state_id) AS location,
             (SELECT customers_dataset.customer_id, customer_city AS city, customer_state AS state
              FROM olist_db.olist_customers_dataset AS customers_dataset
              INNER JOIN olist_db.olist_orders_dataset AS orders_datase
              ON orders_datase.customer_id = customers_dataset.customer_id) AS customers_dataset
        WHERE customers_dataset.city = location.city
        AND customers_dataset.state = location.state;
    '''

    temp_payment_sql = '''
        DROP TABLE IF EXISTS olist_db.temp_payment;
        SET @rownr=0;
        CREATE TABLE olist_db.temp_payment
        SELECT @rownr:=@rownr+1 AS payment_id, type_id, order_id, payment_sequential, payment_installments, payment_value
        FROM olist_db.olist_order_payments_dataset AS payments_dataset
        INNER JOIN olist_db.d_payment_type AS payment_type
        ON payment_type.payment_type = payments_dataset.payment_type;
    '''

    fact_sql = """
        SELECT orders_dataset.order_id
        , product_id
        , city_id
        , payment_id
        , review_id
        , (SELECT hour_id FROM olist_db.d_hour WHERE hour = HOUR(order_approved_at)) AS hour_id
        , (SELECT day_id FROM olist_db.d_day WHERE day = DAY(order_approved_at)) AS day_id
        , (SELECT month_id FROM olist_db.d_month WHERE month = MONTH(order_approved_at)) AS month_id
        , (SELECT year_id FROM olist_db.d_year WHERE year = YEAR(order_approved_at)) AS year_id
        , order_items_dataset.price
        FROM olist_db.olist_orders_dataset AS orders_dataset
        INNER JOIN olist_db.olist_order_items_dataset AS order_items_dataset
        ON order_items_dataset.order_id = orders_dataset.order_id
        INNER JOIN olist_db.temp_payment AS temp_payment
        ON temp_payment.order_id = orders_dataset.order_id
        INNER JOIN olist_db.olist_customers_dataset AS customers_dataset
        ON customers_dataset.customer_id = orders_dataset.customer_id
        INNER JOIN olist_db.temp_city AS temp_city
        ON temp_city.customer_id = customers_dataset.customer_id
        INNER JOIN olist_db.olist_order_reviews_dataset AS olist_order_reviews_dataset
        ON olist_order_reviews_dataset.order_id = orders_dataset.order_id
        WHERE order_approved_at IS NOT NULL;
    """

    drop_temp_sql = '''
        DROP TABLE IF EXISTS olist_db.temp_city;
        DROP TABLE IF EXISTS olist_db.temp_payment;
    '''

    # CREATE TEMPORARY TABLES
    # Each table name is printed before its statements run; query_execute
    # then prints that table's row count.
    for temp_name, temp_sql in (
        ('temp_city', temp_city_sql),
        ('temp_payment', temp_payment_sql),
    ):
        print(temp_name)
        query_execute(temp_sql, temp_name)

    # LOAD FACT SALES
    with storage.engine_connect().begin() as connection:
        fact_rows = pd.read_sql(fact_sql, connection)
    storage.load_data_into_db(fact_rows, 'olist_db', 'f_sales')

    # DROP TEMP TABLES
    # Passing 'f_sales' makes query_execute print the fact table's row count
    # once the temporary tables are gone.
    query_execute(drop_temp_sql, 'f_sales')