import pandas as pd import numpy as np import sklearn from sklearn.model_selection import train_test_split from tpot import TPOTRegressor #import data from olist.order import Order from olist.data import Olist data = Olist().get_data() training_orders = Order().get_training_data() orders = data['olist_orders_dataset'] orders['estimate_wait_time'] = (pd.to_datetime(orders['order_estimated_delivery_date'])\ - pd.to_datetime(orders['order_purchase_timestamp'])) / np.timedelta64(24, 'h') training_orders =\ training_orders.merge(orders[['estimate_wait_time', 'order_id']], on='order_id') X = training_orders.drop(['order_id', 'wait_time', 'delay_vs_expected'], axis=1) y = training_orders['wait_time'] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)
import numpy as np import pandas as pd from sklearn.model_selection import train_test_split from sklearn.ensemble import GradientBoostingRegressor from sklearn.tree import DecisionTreeRegressor from sklearn.metrics import mean_squared_error from sklearn.preprocessing import MinMaxScaler from sklearn.pipeline import make_pipeline from olist.order import Order model = 'gradient_boost_2' print("Importing dataset") orders = Order().get_training_data() # Create train and target variable X = orders.drop( ['order_id', 'wait_time', 'delay_vs_expected', 'expected_wait_time'], axis=1) y = orders['wait_time'] # Training test split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42) # Average CV score on the training set was:-62.99467662836531 exported_pipeline = make_pipeline( MinMaxScaler(),
import numpy as np import pandas as pd from sklearn.model_selection import train_test_split from sklearn.ensemble import RandomForestRegressor from sklearn.metrics import mean_squared_error from olist.data import Olist from olist.order import Order model = 'random_forest_3' print("Importing dataset") orders = Order().get_training_data() data = Olist().get_data() products = data['olist_products_dataset'] order_items = data['olist_order_items_dataset'] # Build training set order_items_products = \ order_items.groupby(['order_id', 'product_id'], as_index=False)\ .agg({'order_item_id': 'count'})\ .merge(products.drop(['product_category_name'], axis=1), on='product_id') order_items_products = order_items_products\ .groupby('order_id',
def __init__(self): # Import only data once olist = Olist() self.data = olist.get_data() self.matching_table = olist.get_matching_table() self.order = Order()
class Product: def __init__(self): # Import data only once olist = Olist() self.data = olist.get_data() self.matching_table = olist.get_matching_table() self.order = Order() def get_product_features(self): """ Returns a DataFrame with: 'product_id', 'product_category_name', 'product_name_length', 'product_description_length', 'product_photos_qty', 'product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm' """ products = self.data['products'] # (optional) convert name to English en_category = self.data['product_category_name_translation'] df = products.merge(en_category, on='product_category_name') df.drop(['product_category_name'], axis=1, inplace=True) df.rename(columns={ 'product_category_name_english': 'category', 'product_name_lenght': 'product_name_length', 'product_description_lenght': 'product_description_length' }, inplace=True) return df def get_price(self): """ Return a DataFrame with: 'product_id', 'price' """ order_items = self.data['order_items'] # There are many different order_items per product_id, each with different prices. Take the mean of various prices return order_items[['product_id', 'price']].groupby('product_id').mean() def get_wait_time(self): """ Returns a DataFrame with: 'product_id', 'wait_time' """ matching_table = self.matching_table orders_wait_time = self.order.get_wait_time() df = matching_table.merge(orders_wait_time, on='order_id') return df.groupby('product_id', as_index=False).agg({'wait_time': 'mean'}) def get_review_score(self): """ Returns a DataFrame with: 'product_id', 'share_of_five_stars', 'share_of_one_stars', 'review_score' """ matching_table = self.matching_table orders_reviews = self.order.get_review_score() # Since the same products can appear multiple times in the same # order, create a product <> order matching table matching_table = matching_table[['order_id', 'product_id']].drop_duplicates() df = matching_table.merge(orders_reviews, on='order_id') df = df.groupby('product_id', as_index=False).agg({ 'dim_is_one_star': 'mean', 'dim_is_five_star': 'mean', 'review_score': 'mean', }) df.columns = [ 'product_id', 'share_of_one_stars', 'share_of_five_stars', 'review_score' ] return df def get_quantity(self): """ Returns a DataFrame with: 'product_id', 'n_orders', 'quantity' """ order_items = self.data['order_items'] n_orders =\ order_items.groupby('product_id')['order_id'].nunique().reset_index() n_orders.columns = ['product_id', 'n_orders'] quantity = \ order_items.groupby('product_id', as_index=False).agg({'order_id': 'count'}) quantity.columns = ['product_id', 'quantity'] return n_orders.merge(quantity, on='product_id') def get_sales(self): """ Returns a DataFrame with: 'seller_id', 'sales' """ return self.data['order_items'][['product_id', 'price']]\ .groupby('product_id')\ .sum()\ .rename(columns={'price': 'sales'}) def get_training_data(self): training_set =\ self.get_product_features()\ .merge( self.get_wait_time(), on='product_id' ).merge( self.get_price(), on='product_id' ).merge( self.get_review_score(), on='product_id' ).merge( self.get_quantity(), on='product_id' ).merge( self.get_sales(), on='product_id' ) return training_set
class Seller: def __init__(self): # Import only data once olist = Olist() self.data = olist.get_data() self.matching_table = olist.get_matching_table() self.order = Order() def get_seller_features(self): """ Returns a DataFrame with: 'seller_id', 'seller_city', 'seller_state' """ sellers = self.data['sellers'] sellers.drop('seller_zip_code_prefix', axis=1, inplace=True) # There is multiple rows per seller sellers.drop_duplicates(inplace=True) return sellers def get_seller_delay_wait_time(self): """ Returns a DataFrame with: 'seller_id', 'delay_to_carrier', 'wait_time' """ # Get data order_items = self.data['order_items'] orders = self.data['orders'].query("order_status=='delivered'") ship = order_items.merge(orders, on='order_id') # Handle datetime ship.loc[:, 'shipping_limit_date'] =\ pd.to_datetime(ship['shipping_limit_date']) ship.loc[:, 'order_delivered_carrier_date'] =\ pd.to_datetime(ship['order_delivered_carrier_date']) ship.loc[:, 'order_delivered_customer_date'] =\ pd.to_datetime(ship['order_delivered_customer_date']) ship.loc[:, 'order_purchase_timestamp'] =\ pd.to_datetime(ship['order_purchase_timestamp']) # Compute delay and wait_time def delay_to_logistic_partner(d): days = np.mean( (d.shipping_limit_date - d.order_delivered_carrier_date)/np.timedelta64(24, 'h') ) if days < 0: return abs(days) else: return 0 def order_wait_time(d): days = np.mean( (d.order_delivered_customer_date - d.order_purchase_timestamp)/np.timedelta64(24, 'h') ) return days delay = ship.groupby('seller_id')\ .apply(delay_to_logistic_partner)\ .reset_index() delay.columns = ['seller_id', 'delay_to_carrier'] wait = ship.groupby('seller_id')\ .apply(order_wait_time)\ .reset_index() wait.columns = ['seller_id', 'wait_time'] df = delay.merge(wait, on='seller_id') return df def get_active_dates(self): orders = self.data['orders'][['order_id', 'order_approved_at']].copy() # create two new columns in view of aggregating orders.loc[:,'date_first_sale'] = pd.to_datetime(orders['order_approved_at']) orders['date_last_sale'] = orders['date_first_sale'] return orders.merge(self.matching_table[['seller_id', 'order_id']], on="order_id")\ .groupby('seller_id')\ .agg({ "date_first_sale": min, "date_last_sale": max }) def get_review_score(self): """ Returns a DataFrame with: 'seller_id', 'share_of_five_stars', 'share_of_one_stars', 'review_score' """ matching_table = self.matching_table orders_reviews = self.order.get_review_score() # Since same seller can appear multiple times in the same # order, create a seller <> order matching table matching_table = matching_table[['order_id', 'seller_id']].drop_duplicates() df = matching_table.merge(orders_reviews, on='order_id') df = df.groupby('seller_id', as_index=False).agg({'dim_is_one_star': 'mean', 'dim_is_five_star': 'mean', 'review_score': 'mean'}) df.columns = ['seller_id', 'share_one_stars', 'share_of_five_stars', 'review_score'] return df def get_quantity(self): """ Returns a DataFrame with: 'seller_id', 'n_orders', 'quantity' """ matching_table = self.matching_table n_orders = matching_table.groupby('seller_id')['order_id']\ .nunique()\ .reset_index() n_orders.columns = ['seller_id', 'n_orders'] quantity = matching_table.groupby('seller_id', as_index=False).agg({'order_id': 'count'}) quantity.columns = ['seller_id', 'quantity'] return n_orders.merge(quantity, on='seller_id') def get_sales(self): """ Returns a DataFrame with: 'seller_id', 'sales' """ return self.data['order_items'][['seller_id', 'price']]\ .groupby('seller_id')\ .sum()\ .rename(columns={'price': 'sales'}) def get_training_data(self): training_set =\ self.get_seller_features()\ .merge( self.get_seller_delay_wait_time(), on='seller_id' ).merge( self.get_active_dates(), on='seller_id' ).merge( self.get_review_score(), on='seller_id' ).merge( self.get_quantity(), on='seller_id' ).merge( self.get_sales(), on='seller_id' ) return training_set
class Product: def __init__(self): # Import only data once olist = Olist() self.data = olist.get_data() self.matching_table = olist.get_matching_table() self.order = Order() def get_product_features(self): """ 03-01 > Returns a DataFrame with: 'product_id', 'product_category_name', 'product_name_lenght', 'product_description_lenght', 'product_photos_qty', 'product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm' """ products = self.data['olist_products_dataset'] en_category = self.data['product_category_name_translation'] df = products.merge(en_category, on='product_category_name') df.drop(['product_category_name'], axis=1, inplace=True) df.rename(columns={'product_category_name_english': 'category'}, inplace=True) return df def get_wait_time(self): """ 03-01 > Returns a DataFrame with: 'product_id', 'wait_time' """ mathing_table = self.matching_table orders_wait_time = self.order.get_wait_time() df = mathing_table.merge(orders_wait_time, on='order_id') return df.groupby('product_id', as_index=False).agg({'wait_time': 'mean'}) def get_review_score(self): """ 03-01 > Returns a DataFrame with: 'product_id', 'share_of_five_stars', 'share_of_one_stars', 'avg_review_score' """ matching_table = self.matching_table orders_reviews = self.order.get_review_score() # Since same products can appear multiple times in the same # order, create a product <> order matching table matching_table = matching_table[['order_id', 'product_id']].drop_duplicates() df = matching_table.merge(orders_reviews, on='order_id') df = df.groupby('product_id', as_index=False).agg({'dim_is_one_star': 'mean', 'dim_is_five_star': 'mean', 'review_score': 'mean'}) df.columns = ['product_id', 'share_of_one_stars', 'share_of_five_stars', 'avg_review_score'] return df def get_quantity(self): """ 03-01 > Returns a DataFrame with: 'product_id', 'n_orders', 'quantity' """ matching_table = self.matching_table n_orders =\ matching_table.groupby('product_id')['order_id'].nunique().reset_index() n_orders.columns = ['product_id', 'n_orders'] quantity = \ matching_table.groupby('product_id', as_index=False).agg({'order_id': 'count'}) quantity.columns = ['product_id', 'quantity'] return n_orders.merge(quantity, on='product_id') def get_training_data(self): training_set =\ self.get_product_features()\ .merge( self.get_wait_time(), on='product_id' ).merge( self.get_review_score(), on='product_id' ).merge( self.get_quantity(), on='product_id' ) return training_set
class Seller: def __init__(self): # Import only data once olist = Olist() self.data = olist.get_data() self.matching_table = olist.get_matching_table() self.order = Order() def get_seller_features(self): """ 03-01 > Returns a DataFrame with: 'seller_id', 'seller_city', 'seller_state' """ sellers = self.data['olist_sellers_dataset'] sellers = sellers.dropna() df = sellers[['seller_id', 'seller_city', 'seller_state']] return df def get_seller_delay_wait_time(self): """ 03-01 > Returns a DataFrame with: 'seller_id', 'delay_to_carrier', 'seller_wait_time' """ df_merge_col = pd.merge(self.data['olist_order_items_dataset'], self.data['olist_orders_dataset'], on='order_id') # handle datetime df_merge_col['shipping_limit_date'] = pd.to_datetime( df_merge_col['shipping_limit_date']) df_merge_col['order_purchase_timestamp'] = pd.to_datetime( df_merge_col['order_purchase_timestamp']) df_merge_col['order_approved_at'] = pd.to_datetime( df_merge_col['order_approved_at']) df_merge_col['order_delivered_carrier_date'] = pd.to_datetime( df_merge_col['order_delivered_carrier_date']) df_merge_col['order_estimated_delivery_date'] = pd.to_datetime( df_merge_col['order_estimated_delivery_date']) df_merge_col['order_delivered_customer_date'] = pd.to_datetime( df_merge_col['order_delivered_customer_date']) # compute delay_to_carrier (if the order is delivered after the shipping limit date, # return the number of days between two dates, otherwise 0) df_merge_col['delay_to_carrier'] = \ (df_merge_col['order_delivered_carrier_date']-df_merge_col['shipping_limit_date']) / np.timedelta64(24, 'h') #df_merge_col['delay_to_carrier'] = df_merge_col['delay_to_carrier'] if df_merge_col['delay_to_carrier'] > 0 else 0 # compute seller_wait_time (Average number of days customers waited) df_merge_col['seller_wait_time'] = \ (df_merge_col['order_delivered_customer_date'] - df_merge_col['order_purchase_timestamp']) / np.timedelta64(24, 'h') df = df_merge_col.groupby(['seller_id', 'order_id' ])[['delay_to_carrier', 'seller_wait_time' ]].mean().reset_index() df = df_merge_col.groupby(['seller_id' ])[['delay_to_carrier', 'seller_wait_time' ]].mean().reset_index() return df def get_review_score(self): """ 03-01 > Returns a DataFrame with: 'seller_id', 'share_of_five_stars', 'share_of_one_stars', 'review_score' """ matching_table = self.matching_table orders_reviews = self.order.get_review_score() # Since same seller can appear multiple times in the same # order, create a seller <> order matching table matching_table = matching_table[['order_id', 'seller_id']].drop_duplicates() df = matching_table.merge(orders_reviews, on='order_id') df = df.groupby('seller_id', as_index=False).agg({ 'dim_is_one_star': 'mean', 'dim_is_five_star': 'mean', 'review_score': 'mean' }) df.columns = [ 'seller_id', 'share_one_stars', 'share_of_five_stars', 'avg_review_score' ] return df def get_quantity(self): """ 03-01 > Returns a DataFrame with: 'seller_id', 'n_orders', 'quantity' """ # n_orders (int) The number of orders the seller was involved with. matching_table = self.matching_table n_orders = matching_table.groupby( 'seller_id')['order_id'].nunique().reset_index() n_orders = n_orders.rename(columns={'order_id': 'n_orders'}) quantity = matching_table.groupby('seller_id', as_index=False)['order_id'].count() quantity = quantity.rename(columns={'order_id': 'quantity'}) get_quantity = n_orders.merge(quantity, how='left') return get_quantity def get_training_data(self): """ 03 > 01 Returns a DataFrame with: seller_id, seller_state, seller_city, delay_to_carrier, seller_wait_time, share_of_five_stars, share_of_one_stars, seller_review_score, n_orders """ seller_training_set =\ self.get_seller_features()\ .merge( self.get_seller_delay_wait_time(), on='seller_id' ).merge( self.get_review_score(), on='seller_id' ).merge( self.get_quantity(), on='seller_id' ) return seller_training_set
class Product: def __init__(self): # Import only data once olist = Olist() self.data = olist.get_data() self.matching_table = olist.get_matching_table() self.order = Order() def get_product_features(self): """ Returns a DataFrame with: 'product_id', 'product_category_name', 'product_name_length', 'product_description_length', 'product_photos_qty', 'product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm' """ products = self.data['products'] # convert name to english en_category = self.data['product_category_name_translation'] df = products.merge(en_category, on='product_category_name') df.drop(['product_category_name'], axis=1, inplace=True) df.rename(columns={'product_category_name_english': 'category'}, inplace=True) return df def get_price(self): """ Return a DataFrame with: 'product_id', 'price' """ order_items = self.data['order_items'] # There are many different order_items per product_id, each with different prices. Take the mean of various prices return order_items[['product_id', 'price']].groupby('product_id').mean() def get_wait_time(self): """ Returns a DataFrame with: 'product_id', 'wait_time' """ matching_table = self.matching_table orders_wait_time = self.order.get_wait_time() df = matching_table.merge(orders_wait_time, on='order_id') return df.groupby('product_id', as_index=False).agg({'wait_time': 'mean'}) def get_review_score(self): """ Returns a DataFrame with: 'product_id', 'share_of_five_stars', 'share_of_one_stars', 'review_score' """ matching_table = self.matching_table orders_reviews = self.order.get_review_score() # Since same products can appear multiple times in the same # order, create a product <> order matching table matching_table = matching_table[['order_id', 'product_id']].drop_duplicates() df = matching_table.merge(orders_reviews, on='order_id') df['cost_of_reviews'] = df.review_score.map({ 1: 100, 2: 50, 3: 40, 4: 0, 5: 0 }) df = df.groupby('product_id', as_index=False).agg({ 'dim_is_one_star': 'mean', 'dim_is_five_star': 'mean', 'review_score': 'mean', 'cost_of_reviews': 'sum' }) df.columns = [ 'product_id', 'share_of_one_stars', 'share_of_five_stars', 'review_score', 'cost_of_reviews' ] return df def get_quantity(self): """ Returns a DataFrame with: 'product_id', 'n_orders', 'quantity' """ matching_table = self.matching_table n_orders =\ matching_table.groupby('product_id')['order_id'].nunique().reset_index() n_orders.columns = ['product_id', 'n_orders'] quantity = \ matching_table.groupby('product_id', as_index=False).agg({'order_id': 'count'}) quantity.columns = ['product_id', 'quantity'] return n_orders.merge(quantity, on='product_id') def get_sales(self): """ Returns a DataFrame with: 'seller_id', 'sales' """ return self.data['order_items'][['product_id', 'price']]\ .groupby('product_id')\ .sum()\ .rename(columns={'price': 'sales'}) def get_training_data(self): training_set =\ self.get_product_features()\ .merge( self.get_wait_time(), on='product_id' ).merge( self.get_price(), on='product_id' ).merge( self.get_review_score(), on='product_id' ).merge( self.get_quantity(), on='product_id' ).merge( self.get_sales(), on='product_id' ) # compute economics (revenues, profits) olist_sales_cut = 0.1 training_set['revenues'] = olist_sales_cut * training_set['sales'] training_set['profits'] = training_set['revenues'] - training_set[ 'cost_of_reviews'] return training_set def get_product_cat(self, agg="median"): ''' Returns a DataFrame aggregating various properties for each product 'category', using the aggregation function passed in argument. The 'quantity' columns refers to the total number of product sold for this category. ''' product_cat = self.get_training_data().groupby("category").agg({ "review_score": agg, "share_of_one_stars": agg, "share_of_five_stars": agg, "wait_time": agg, "price": agg, "product_weight_g": agg, "product_length_cm": agg, "product_height_cm": agg, "product_width_cm": agg, "product_photos_qty": agg, "n_orders": agg, "quantity": "sum", }) return product_cat