示例#1
0
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from tpot import TPOTRegressor

#import data
from olist.order import Order
from olist.data import Olist

data = Olist().get_data()
training_orders = Order().get_training_data()

orders = data['olist_orders_dataset']

orders['estimate_wait_time'] = (pd.to_datetime(orders['order_estimated_delivery_date'])\
    - pd.to_datetime(orders['order_purchase_timestamp'])) / np.timedelta64(24, 'h')

training_orders =\
    training_orders.merge(orders[['estimate_wait_time', 'order_id']], on='order_id')

X = training_orders.drop(['order_id', 'wait_time', 'delay_vs_expected'],
                         axis=1)
y = training_orders['wait_time']

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)

tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)
示例#2
0
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline

from olist.order import Order

model = 'gradient_boost_2'

print("Importing dataset")
orders = Order().get_training_data()

# Create train and target variable
X = orders.drop(
    ['order_id', 'wait_time', 'delay_vs_expected', 'expected_wait_time'],
    axis=1)
y = orders['wait_time']

# Training test split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.30,
                                                    random_state=42)

# Average CV score on the training set was:-62.99467662836531
exported_pipeline = make_pipeline(
    MinMaxScaler(),
示例#3
0
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from olist.data import Olist
from olist.order import Order

model = 'random_forest_3'

print("Importing dataset")

orders = Order().get_training_data()
data = Olist().get_data()

products = data['olist_products_dataset']
order_items = data['olist_order_items_dataset']

# Build training set
order_items_products = \
    order_items.groupby(['order_id',
                         'product_id'],
                        as_index=False)\
               .agg({'order_item_id': 'count'})\
               .merge(products.drop(['product_category_name'],
                                    axis=1),
                      on='product_id')

order_items_products = order_items_products\
                        .groupby('order_id',
示例#4
0
 def __init__(self):
     # Import only data once
     olist = Olist()
     self.data = olist.get_data()
     self.matching_table = olist.get_matching_table()
     self.order = Order()
示例#5
0
class Product:
    def __init__(self):
        # Import data only once
        olist = Olist()
        self.data = olist.get_data()
        self.matching_table = olist.get_matching_table()
        self.order = Order()

    def get_product_features(self):
        """
        Returns a DataFrame with:
       'product_id', 'product_category_name', 'product_name_length',
       'product_description_length', 'product_photos_qty', 'product_weight_g',
       'product_length_cm', 'product_height_cm', 'product_width_cm'
        """

        products = self.data['products']

        # (optional) convert name to English
        en_category = self.data['product_category_name_translation']
        df = products.merge(en_category, on='product_category_name')
        df.drop(['product_category_name'], axis=1, inplace=True)
        df.rename(columns={
            'product_category_name_english': 'category',
            'product_name_lenght': 'product_name_length',
            'product_description_lenght': 'product_description_length'
        },
                  inplace=True)

        return df

    def get_price(self):
        """
        Return a DataFrame with:
        'product_id', 'price'
        """
        order_items = self.data['order_items']
        # There are many different order_items per product_id, each with different prices. Take the mean of various prices
        return order_items[['product_id',
                            'price']].groupby('product_id').mean()

    def get_wait_time(self):
        """
        Returns a DataFrame with:
        'product_id', 'wait_time'
        """
        matching_table = self.matching_table
        orders_wait_time = self.order.get_wait_time()

        df = matching_table.merge(orders_wait_time, on='order_id')

        return df.groupby('product_id',
                          as_index=False).agg({'wait_time': 'mean'})

    def get_review_score(self):
        """
        Returns a DataFrame with:
        'product_id', 'share_of_five_stars', 'share_of_one_stars',
        'review_score'
        """
        matching_table = self.matching_table
        orders_reviews = self.order.get_review_score()

        # Since the same products can appear multiple times in the same
        # order, create a product <> order matching table

        matching_table = matching_table[['order_id',
                                         'product_id']].drop_duplicates()
        df = matching_table.merge(orders_reviews, on='order_id')
        df = df.groupby('product_id', as_index=False).agg({
            'dim_is_one_star':
            'mean',
            'dim_is_five_star':
            'mean',
            'review_score':
            'mean',
        })
        df.columns = [
            'product_id', 'share_of_one_stars', 'share_of_five_stars',
            'review_score'
        ]

        return df

    def get_quantity(self):
        """
        Returns a DataFrame with:
        'product_id', 'n_orders', 'quantity'
        """
        order_items = self.data['order_items']

        n_orders =\
            order_items.groupby('product_id')['order_id'].nunique().reset_index()
        n_orders.columns = ['product_id', 'n_orders']

        quantity = \
            order_items.groupby('product_id',
                                   as_index=False).agg({'order_id': 'count'})
        quantity.columns = ['product_id', 'quantity']

        return n_orders.merge(quantity, on='product_id')

    def get_sales(self):
        """
        Returns a DataFrame with:
        'seller_id', 'sales'
        """
        return self.data['order_items'][['product_id', 'price']]\
            .groupby('product_id')\
            .sum()\
            .rename(columns={'price': 'sales'})

    def get_training_data(self):

        training_set =\
            self.get_product_features()\
                .merge(
                self.get_wait_time(), on='product_id'
               ).merge(
                self.get_price(), on='product_id'
               ).merge(
                self.get_review_score(), on='product_id'
               ).merge(
                self.get_quantity(), on='product_id'
               ).merge(
                self.get_sales(), on='product_id'
               )

        return training_set
示例#6
0
class Seller:

    def __init__(self):
        # Import only data once
        olist = Olist()
        self.data = olist.get_data()
        self.matching_table = olist.get_matching_table()
        self.order = Order()

    def get_seller_features(self):
        """
        Returns a DataFrame with:
        'seller_id', 'seller_city', 'seller_state'
        """
        sellers = self.data['sellers']
        sellers.drop('seller_zip_code_prefix',
                     axis=1,
                     inplace=True)
        # There is multiple rows per seller
        sellers.drop_duplicates(inplace=True)

        return sellers

    def get_seller_delay_wait_time(self):
        """
        Returns a DataFrame with:
        'seller_id', 'delay_to_carrier', 'wait_time'
        """
        # Get data
        order_items = self.data['order_items']
        orders = self.data['orders'].query("order_status=='delivered'")

        ship = order_items.merge(orders, on='order_id')

        # Handle datetime
        ship.loc[:, 'shipping_limit_date'] =\
            pd.to_datetime(ship['shipping_limit_date'])
        ship.loc[:, 'order_delivered_carrier_date'] =\
            pd.to_datetime(ship['order_delivered_carrier_date'])
        ship.loc[:, 'order_delivered_customer_date'] =\
            pd.to_datetime(ship['order_delivered_customer_date'])
        ship.loc[:, 'order_purchase_timestamp'] =\
            pd.to_datetime(ship['order_purchase_timestamp'])

        # Compute delay and wait_time
        def delay_to_logistic_partner(d):
            days = np.mean(
                (d.shipping_limit_date - d.order_delivered_carrier_date)/np.timedelta64(24, 'h')
                )
            if days < 0:
                return abs(days)
            else:
                return 0

        def order_wait_time(d):
            days = np.mean(
                (d.order_delivered_customer_date - d.order_purchase_timestamp)/np.timedelta64(24, 'h')
                )
            return days

        delay = ship.groupby('seller_id')\
                    .apply(delay_to_logistic_partner)\
                    .reset_index()
        delay.columns = ['seller_id', 'delay_to_carrier']

        wait = ship.groupby('seller_id')\
                   .apply(order_wait_time)\
                   .reset_index()
        wait.columns = ['seller_id', 'wait_time']

        df = delay.merge(wait, on='seller_id')

        return df

    def get_active_dates(self):
        orders = self.data['orders'][['order_id', 'order_approved_at']].copy()

        # create two new columns in view of aggregating
        orders.loc[:,'date_first_sale'] = pd.to_datetime(orders['order_approved_at'])
        orders['date_last_sale'] = orders['date_first_sale']

        return orders.merge(self.matching_table[['seller_id', 'order_id']], on="order_id")\
            .groupby('seller_id')\
            .agg({
            "date_first_sale": min,
            "date_last_sale": max
        })

    def get_review_score(self):
        """
        Returns a DataFrame with:
        'seller_id', 'share_of_five_stars', 'share_of_one_stars', 'review_score'
        """
        matching_table = self.matching_table
        orders_reviews = self.order.get_review_score()

        # Since same seller can appear multiple times in the same
        # order, create a seller <> order matching table

        matching_table = matching_table[['order_id',
                                         'seller_id']].drop_duplicates()
        df = matching_table.merge(orders_reviews,
                                  on='order_id')

        df = df.groupby('seller_id',
                        as_index=False).agg({'dim_is_one_star': 'mean',
                                             'dim_is_five_star': 'mean',
                                             'review_score': 'mean'})
        df.columns = ['seller_id', 'share_one_stars',
                      'share_of_five_stars', 'review_score']

        return df

    def get_quantity(self):
        """
        Returns a DataFrame with:
        'seller_id', 'n_orders', 'quantity'
        """
        matching_table = self.matching_table

        n_orders = matching_table.groupby('seller_id')['order_id']\
            .nunique()\
            .reset_index()
        n_orders.columns = ['seller_id', 'n_orders']

        quantity = matching_table.groupby('seller_id', as_index=False).agg({'order_id': 'count'})
        quantity.columns = ['seller_id', 'quantity']

        return n_orders.merge(quantity, on='seller_id')

    def get_sales(self):
        """
        Returns a DataFrame with:
        'seller_id', 'sales'
        """
        return self.data['order_items'][['seller_id', 'price']]\
            .groupby('seller_id')\
            .sum()\
            .rename(columns={'price': 'sales'})


    def get_training_data(self):

        training_set =\
            self.get_seller_features()\
                .merge(
                self.get_seller_delay_wait_time(), on='seller_id'
               ).merge(
                self.get_active_dates(), on='seller_id'
               ).merge(
                self.get_review_score(), on='seller_id'
               ).merge(
                self.get_quantity(), on='seller_id'
               ).merge(
                self.get_sales(), on='seller_id'
               )

        return training_set
示例#7
0
class Product:

    def __init__(self):
        # Import only data once
        olist = Olist()
        self.data = olist.get_data()
        self.matching_table = olist.get_matching_table()
        self.order = Order()

    def get_product_features(self):
        """
        03-01 > Returns a DataFrame with:
       'product_id', 'product_category_name', 'product_name_lenght',
       'product_description_lenght', 'product_photos_qty', 'product_weight_g',
       'product_length_cm', 'product_height_cm', 'product_width_cm'
        """

        products = self.data['olist_products_dataset']
        en_category = self.data['product_category_name_translation']
        df = products.merge(en_category,
                            on='product_category_name')
        df.drop(['product_category_name'], axis=1, inplace=True)
        df.rename(columns={'product_category_name_english': 'category'},
                  inplace=True)
        return df

    def get_wait_time(self):
        """
        03-01 > Returns a DataFrame with:
        'product_id', 'wait_time'
        """
        mathing_table = self.matching_table
        orders_wait_time = self.order.get_wait_time()

        df = mathing_table.merge(orders_wait_time,
                                 on='order_id')

        return df.groupby('product_id',
                          as_index=False).agg({'wait_time': 'mean'})

    def get_review_score(self):
        """
        03-01 > Returns a DataFrame with:
        'product_id', 'share_of_five_stars', 'share_of_one_stars',
        'avg_review_score'
        """
        matching_table = self.matching_table
        orders_reviews = self.order.get_review_score()

        # Since same products can appear multiple times in the same
        # order, create a product <> order matching table

        matching_table = matching_table[['order_id',
                                         'product_id']].drop_duplicates()
        df = matching_table.merge(orders_reviews,
                                  on='order_id')

        df = df.groupby('product_id',
                        as_index=False).agg({'dim_is_one_star': 'mean',
                                             'dim_is_five_star': 'mean',
                                             'review_score': 'mean'})
        df.columns = ['product_id', 'share_of_one_stars',
                      'share_of_five_stars', 'avg_review_score']

        return df

    def get_quantity(self):
        """
        03-01 > Returns a DataFrame with:
        'product_id', 'n_orders', 'quantity'
        """
        matching_table = self.matching_table

        n_orders =\
            matching_table.groupby('product_id')['order_id'].nunique().reset_index()
        n_orders.columns = ['product_id', 'n_orders']

        quantity = \
            matching_table.groupby('product_id',
                                   as_index=False).agg({'order_id': 'count'})
        quantity.columns = ['product_id', 'quantity']

        return n_orders.merge(quantity, on='product_id')

    def get_training_data(self):

        training_set =\
            self.get_product_features()\
                .merge(
                self.get_wait_time(), on='product_id'
               ).merge(
                self.get_review_score(), on='product_id'
               ).merge(
                self.get_quantity(), on='product_id'
               )

        return training_set
示例#8
0
class Seller:
    def __init__(self):
        # Import only data once
        olist = Olist()
        self.data = olist.get_data()
        self.matching_table = olist.get_matching_table()
        self.order = Order()

    def get_seller_features(self):
        """
        03-01 > Returns a DataFrame with:
       'seller_id', 'seller_city', 'seller_state'
        """
        sellers = self.data['olist_sellers_dataset']
        sellers = sellers.dropna()
        df = sellers[['seller_id', 'seller_city', 'seller_state']]
        return df

    def get_seller_delay_wait_time(self):
        """
        03-01 > Returns a DataFrame with:
       'seller_id', 'delay_to_carrier', 'seller_wait_time'
        """
        df_merge_col = pd.merge(self.data['olist_order_items_dataset'],
                                self.data['olist_orders_dataset'],
                                on='order_id')
        # handle datetime
        df_merge_col['shipping_limit_date'] = pd.to_datetime(
            df_merge_col['shipping_limit_date'])
        df_merge_col['order_purchase_timestamp'] = pd.to_datetime(
            df_merge_col['order_purchase_timestamp'])
        df_merge_col['order_approved_at'] = pd.to_datetime(
            df_merge_col['order_approved_at'])
        df_merge_col['order_delivered_carrier_date'] = pd.to_datetime(
            df_merge_col['order_delivered_carrier_date'])
        df_merge_col['order_estimated_delivery_date'] = pd.to_datetime(
            df_merge_col['order_estimated_delivery_date'])
        df_merge_col['order_delivered_customer_date'] = pd.to_datetime(
            df_merge_col['order_delivered_customer_date'])
        # compute delay_to_carrier (if the order is delivered after the shipping limit date,
        # return the number of days between two dates, otherwise 0)
        df_merge_col['delay_to_carrier'] = \
                (df_merge_col['order_delivered_carrier_date']-df_merge_col['shipping_limit_date']) / np.timedelta64(24, 'h')
        #df_merge_col['delay_to_carrier'] = df_merge_col['delay_to_carrier'] if df_merge_col['delay_to_carrier'] > 0 else 0
        # compute seller_wait_time (Average number of days customers waited)
        df_merge_col['seller_wait_time'] = \
                (df_merge_col['order_delivered_customer_date'] - df_merge_col['order_purchase_timestamp']) / np.timedelta64(24, 'h')
        df = df_merge_col.groupby(['seller_id', 'order_id'
                                   ])[['delay_to_carrier', 'seller_wait_time'
                                       ]].mean().reset_index()
        df = df_merge_col.groupby(['seller_id'
                                   ])[['delay_to_carrier', 'seller_wait_time'
                                       ]].mean().reset_index()
        return df

    def get_review_score(self):
        """
        03-01 > Returns a DataFrame with:
        'seller_id', 'share_of_five_stars', 'share_of_one_stars',
        'review_score'
        """
        matching_table = self.matching_table
        orders_reviews = self.order.get_review_score()
        # Since same seller can appear multiple times in the same
        # order, create a seller <> order matching table
        matching_table = matching_table[['order_id',
                                         'seller_id']].drop_duplicates()
        df = matching_table.merge(orders_reviews, on='order_id')
        df = df.groupby('seller_id', as_index=False).agg({
            'dim_is_one_star': 'mean',
            'dim_is_five_star': 'mean',
            'review_score': 'mean'
        })
        df.columns = [
            'seller_id', 'share_one_stars', 'share_of_five_stars',
            'avg_review_score'
        ]
        return df

    def get_quantity(self):
        """
        03-01 > Returns a DataFrame with:
        'seller_id', 'n_orders', 'quantity'
        """
        # n_orders (int) The number of orders the seller was involved with.
        matching_table = self.matching_table
        n_orders = matching_table.groupby(
            'seller_id')['order_id'].nunique().reset_index()
        n_orders = n_orders.rename(columns={'order_id': 'n_orders'})
        quantity = matching_table.groupby('seller_id',
                                          as_index=False)['order_id'].count()
        quantity = quantity.rename(columns={'order_id': 'quantity'})
        get_quantity = n_orders.merge(quantity, how='left')
        return get_quantity

    def get_training_data(self):
        """
        03 > 01 Returns a DataFrame with:
        seller_id, seller_state, seller_city, delay_to_carrier,
        seller_wait_time, share_of_five_stars, share_of_one_stars,
        seller_review_score, n_orders
        """
        seller_training_set =\
            self.get_seller_features()\
                .merge(
            self.get_seller_delay_wait_time(), on='seller_id'
               ).merge(
            self.get_review_score(), on='seller_id'
               ).merge(
            self.get_quantity(), on='seller_id'
            )
        return seller_training_set
示例#9
0
class Product:
    def __init__(self):
        # Import only data once
        olist = Olist()
        self.data = olist.get_data()
        self.matching_table = olist.get_matching_table()
        self.order = Order()

    def get_product_features(self):
        """
        Returns a DataFrame with:
       'product_id', 'product_category_name', 'product_name_length',
       'product_description_length', 'product_photos_qty', 'product_weight_g',
       'product_length_cm', 'product_height_cm', 'product_width_cm'
        """

        products = self.data['products']

        # convert name to english
        en_category = self.data['product_category_name_translation']
        df = products.merge(en_category, on='product_category_name')
        df.drop(['product_category_name'], axis=1, inplace=True)
        df.rename(columns={'product_category_name_english': 'category'},
                  inplace=True)

        return df

    def get_price(self):
        """
        Return a DataFrame with:
        'product_id', 'price'
        """
        order_items = self.data['order_items']
        # There are many different order_items per product_id, each with different prices. Take the mean of various prices
        return order_items[['product_id',
                            'price']].groupby('product_id').mean()

    def get_wait_time(self):
        """
        Returns a DataFrame with:
        'product_id', 'wait_time'
        """
        matching_table = self.matching_table
        orders_wait_time = self.order.get_wait_time()

        df = matching_table.merge(orders_wait_time, on='order_id')

        return df.groupby('product_id',
                          as_index=False).agg({'wait_time': 'mean'})

    def get_review_score(self):
        """
        Returns a DataFrame with:
        'product_id', 'share_of_five_stars', 'share_of_one_stars',
        'review_score'
        """
        matching_table = self.matching_table
        orders_reviews = self.order.get_review_score()

        # Since same products can appear multiple times in the same
        # order, create a product <> order matching table

        matching_table = matching_table[['order_id',
                                         'product_id']].drop_duplicates()
        df = matching_table.merge(orders_reviews, on='order_id')

        df['cost_of_reviews'] = df.review_score.map({
            1: 100,
            2: 50,
            3: 40,
            4: 0,
            5: 0
        })

        df = df.groupby('product_id', as_index=False).agg({
            'dim_is_one_star':
            'mean',
            'dim_is_five_star':
            'mean',
            'review_score':
            'mean',
            'cost_of_reviews':
            'sum'
        })
        df.columns = [
            'product_id', 'share_of_one_stars', 'share_of_five_stars',
            'review_score', 'cost_of_reviews'
        ]

        return df

    def get_quantity(self):
        """
        Returns a DataFrame with:
        'product_id', 'n_orders', 'quantity'
        """
        matching_table = self.matching_table

        n_orders =\
            matching_table.groupby('product_id')['order_id'].nunique().reset_index()
        n_orders.columns = ['product_id', 'n_orders']

        quantity = \
            matching_table.groupby('product_id',
                                   as_index=False).agg({'order_id': 'count'})
        quantity.columns = ['product_id', 'quantity']

        return n_orders.merge(quantity, on='product_id')

    def get_sales(self):
        """
        Returns a DataFrame with:
        'seller_id', 'sales'
        """
        return self.data['order_items'][['product_id', 'price']]\
            .groupby('product_id')\
            .sum()\
            .rename(columns={'price': 'sales'})

    def get_training_data(self):

        training_set =\
            self.get_product_features()\
                .merge(
                self.get_wait_time(), on='product_id'
               ).merge(
                self.get_price(), on='product_id'
               ).merge(
                self.get_review_score(), on='product_id'
               ).merge(
                self.get_quantity(), on='product_id'
               ).merge(
                self.get_sales(), on='product_id'
               )

        # compute economics (revenues, profits)
        olist_sales_cut = 0.1
        training_set['revenues'] = olist_sales_cut * training_set['sales']
        training_set['profits'] = training_set['revenues'] - training_set[
            'cost_of_reviews']
        return training_set

    def get_product_cat(self, agg="median"):
        '''
        Returns a DataFrame aggregating various properties for each product 'category',
        using the aggregation function passed in argument.
        The 'quantity' columns refers to the total number of product sold for this category.
        '''
        product_cat = self.get_training_data().groupby("category").agg({
            "review_score":
            agg,
            "share_of_one_stars":
            agg,
            "share_of_five_stars":
            agg,
            "wait_time":
            agg,
            "price":
            agg,
            "product_weight_g":
            agg,
            "product_length_cm":
            agg,
            "product_height_cm":
            agg,
            "product_width_cm":
            agg,
            "product_photos_qty":
            agg,
            "n_orders":
            agg,
            "quantity":
            "sum",
        })
        return product_cat