def test_etl(self):
    """Run extract, transform and load in sequence and check each stage.

    Reads ``self.file_full_path`` through ``storage.extract_data_from_csv``,
    passes the result to ``task.transform_data_func`` and finally hands it
    to ``storage.load_data_into_db`` with ``self.db_name`` and
    ``self.table_name``.
    """
    # Extract: the fixture file is expected to produce three rows.
    extracted = storage.extract_data_from_csv(self.file_full_path)
    self.assertEqual(len(extracted.index), 3)

    # Transform: two rows are expected to be left afterwards.
    transformed = task.transform_data_func(extracted)
    self.assertEqual(len(transformed.index), 2)

    # Load: the storage helper's return value must compare equal to True.
    load_result = storage.load_data_into_db(
        transformed, self.db_name, self.table_name
    )
    self.assertEqual(load_result, True)
def load_data(ds, **kwargs):
    """Store the output of the ``transform_data`` task in the database.

    The value is fetched with ``xcom_pull(task_ids='transform_data')`` on
    the task instance found in ``kwargs`` and passed to
    ``storage.load_data_into_db`` together with ``db_name`` and
    ``table_name``, both of which are defined outside this function.

    Args:
        ds: Not used by this function.
            NOTE(review): presumably the scheduler's date stamp - confirm.
        **kwargs: Must contain a ``'task_instance'`` entry exposing
            ``xcom_pull``.

    Raises:
        KeyError: If ``kwargs`` has no ``'task_instance'`` entry.
    """
    task_instance = kwargs['task_instance']
    transformed = task_instance.xcom_pull(task_ids='transform_data')
    storage.load_data_into_db(transformed, db_name, table_name)
def sales_fact(ds, **kwargs):
    """Build the ``olist_db.f_sales`` fact table from the staged datasets.

    Steps, in order:

    1. (Re)create ``olist_db.temp_city``, which pairs each customer id
       with the matching ``d_city`` / ``d_state`` row.
    2. (Re)create ``olist_db.temp_payment``, which numbers the payment rows
       and attaches the ``d_payment_type`` type id.
    3. Read the fact rows with ``pd.read_sql`` and pass the resulting
       frame to ``storage.load_data_into_db`` for table ``f_sales``.
    4. Drop both temporary tables.

    Step 4 runs in a ``finally`` clause so the temporary tables are removed
    even when an earlier step raises; the original exception still
    propagates to the caller.

    Args:
        ds: Not used by this function.
            NOTE(review): presumably the scheduler's date stamp - confirm.
        **kwargs: Not used by this function.
    """
    # The SQL texts are assembled from adjacent literals purely to keep the
    # source lines short; the resulting strings are unchanged.
    temp_city_sql = (
        " DROP TABLE IF EXISTS olist_db.temp_city;"
        " CREATE TABLE olist_db.temp_city"
        " SELECT location.city_id, location.state_id, location.city,"
        " location.state, customers_dataset.customer_id"
        " FROM (SELECT city_id, location_state.state_id, city, state"
        " FROM olist_db.d_city AS location_city"
        " INNER JOIN olist_db.d_state AS location_state"
        " ON location_city.state_id = location_state.state_id) AS location,"
        " (SELECT customers_dataset.customer_id, customer_city AS city,"
        " customer_state AS state"
        " FROM olist_db.olist_customers_dataset AS customers_dataset"
        " INNER JOIN olist_db.olist_orders_dataset AS orders_datase"
        " ON orders_datase.customer_id = customers_dataset.customer_id)"
        " AS customers_dataset"
        " WHERE customers_dataset.city = location.city"
        " AND customers_dataset.state = location.state; "
    )

    # @rownr is a session variable used to hand out sequential payment ids.
    temp_payment_sql = (
        " DROP TABLE IF EXISTS olist_db.temp_payment;"
        " SET @rownr=0;"
        " CREATE TABLE olist_db.temp_payment"
        " SELECT @rownr:=@rownr+1 AS payment_id, type_id, order_id,"
        " payment_sequential, payment_installments, payment_value"
        " FROM olist_db.olist_order_payments_dataset AS payments_dataset"
        " INNER JOIN olist_db.d_payment_type AS payment_type"
        " ON payment_type.payment_type = payments_dataset.payment_type; "
    )

    # Time dimension keys are looked up from the parts of order_approved_at;
    # orders without an approval timestamp are excluded by the WHERE clause.
    fact_sql = (
        " SELECT orders_dataset.order_id"
        " , product_id"
        " , city_id"
        " , payment_id"
        " , review_id"
        " , (SELECT hour_id FROM olist_db.d_hour"
        " WHERE hour = HOUR(order_approved_at)) AS hour_id"
        " , (SELECT day_id FROM olist_db.d_day"
        " WHERE day = DAY(order_approved_at)) AS day_id"
        " , (SELECT month_id FROM olist_db.d_month"
        " WHERE month = MONTH(order_approved_at)) AS month_id"
        " , (SELECT year_id FROM olist_db.d_year"
        " WHERE year = YEAR(order_approved_at)) AS year_id"
        " , order_items_dataset.price"
        " FROM olist_db.olist_orders_dataset AS orders_dataset"
        # The line break below is part of the original query text.
        " INNER JOIN \n"
        "olist_db.olist_order_items_dataset AS order_items_dataset"
        " ON order_items_dataset.order_id = orders_dataset.order_id"
        " INNER JOIN olist_db.temp_payment AS temp_payment"
        " ON temp_payment.order_id = orders_dataset.order_id"
        " INNER JOIN olist_db.olist_customers_dataset AS customers_dataset"
        " ON customers_dataset.customer_id = orders_dataset.customer_id"
        " INNER JOIN olist_db.temp_city AS temp_city"
        " ON temp_city.customer_id = customers_dataset.customer_id"
        " INNER JOIN olist_db.olist_order_reviews_dataset"
        " AS olist_order_reviews_dataset"
        " ON olist_order_reviews_dataset.order_id = orders_dataset.order_id"
        " WHERE order_approved_at IS NOT NULL; "
    )

    drop_temp_sql = (
        " DROP TABLE IF EXISTS olist_db.temp_city;"
        " DROP TABLE IF EXISTS olist_db.temp_payment; "
    )

    try:
        # CREATE TEMPORARY TABLES
        print('temp_city')
        query_execute(temp_city_sql, 'temp_city')

        print('temp_payment')
        query_execute(temp_payment_sql, 'temp_payment')

        # LOAD FACT SALES
        with storage.engine_connect().begin() as connection:
            f_sales = pd.read_sql(fact_sql, connection)

        storage.load_data_into_db(f_sales, 'olist_db', 'f_sales')
    finally:
        # DROP TEMP TABLES
        # Both statements use IF EXISTS, so this is safe even when a table
        # was never created because an earlier step failed.
        query_execute(drop_temp_sql, 'f_sales')