示例#1
0
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from tpot import TPOTRegressor

#import data
from olist.order import Order
from olist.data import Olist

# Load the raw Olist tables and the prepared order-level training set.
data = Olist().get_data()
training_orders = Order().get_training_data()

orders = data['olist_orders_dataset']

# Estimated wait time, in days: estimated delivery date minus purchase
# timestamp, divided by a 24-hour timedelta.
estimated_delivery = pd.to_datetime(orders['order_estimated_delivery_date'])
purchased_at = pd.to_datetime(orders['order_purchase_timestamp'])
orders['estimate_wait_time'] = \
    (estimated_delivery - purchased_at) / np.timedelta64(24, 'h')

# Bring the new feature into the training set, keyed on order_id.
training_orders = training_orders.merge(
    orders[['estimate_wait_time', 'order_id']],
    on='order_id',
)

# Features: everything except the identifier, the target itself and
# 'delay_vs_expected'. Target: the wait time.
X = training_orders.drop(
    columns=['order_id', 'wait_time', 'delay_vs_expected'])
y = training_orders['wait_time']

# Hold out 33% of the rows for testing; the seed is fixed so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)
示例#2
0
 def __init__(self):
     """Store the Olist data, the matching table and an Order instance."""
     # A single Olist instance serves both lookups below.
     source = Olist()
     self.data = source.get_data()
     self.matching_table = source.get_matching_table()
     self.order = Order()
 def __init__(self):
     """Store the result of Olist().get_data() on the instance as .data."""
     loader = Olist()
     self.data = loader.get_data()
    def get_distance_seller_customer(self):
        """
        Build a DataFrame holding, for each order_id, the mean distance
        between the order's seller(s) and its customer.

        Coordinates are looked up by zip-code prefix in the 'geolocation'
        table (first row per prefix) and each seller/customer pair is
        scored with haversine_distance. Rows with any missing value after
        the joins are dropped before distances are computed.

        Returns:
            DataFrame with columns 'order_id' and
            'distance_seller_customer'.
        """
        tables = self.data
        order_links = Olist().get_matching_table()

        # A zip-code prefix may carry several (lat, lng) rows: keep the first
        zip_groups = tables['geolocation'].groupby(
            'geolocation_zip_code_prefix', as_index=False)
        zip_coords = zip_groups.first()

        seller_table = tables['sellers']
        customer_table = tables['customers']

        # Attach coordinates to sellers and keep only the columns of interest
        seller_columns = [
            'seller_id', 'seller_zip_code_prefix',
            'seller_city', 'seller_state',
            'geolocation_lat', 'geolocation_lng',
        ]
        sellers_located = seller_table.merge(
            zip_coords,
            how='left',
            left_on='seller_zip_code_prefix',
            right_on='geolocation_zip_code_prefix')
        sellers_located = sellers_located[seller_columns]

        # Same treatment for customers
        customer_columns = [
            'customer_id', 'customer_zip_code_prefix',
            'customer_city', 'customer_state',
            'geolocation_lat', 'geolocation_lng',
        ]
        customers_located = customer_table.merge(
            zip_coords,
            how='left',
            left_on='customer_zip_code_prefix',
            right_on='geolocation_zip_code_prefix')
        customers_located = customers_located[customer_columns]

        # Join both sides through the matching table; the colliding
        # coordinate columns get the '_seller' / '_customer' suffixes
        with_sellers = order_links.merge(sellers_located, on='seller_id')
        pairs = with_sellers.merge(customers_located,
                                   on='customer_id',
                                   suffixes=('_seller', '_customer'))

        # Discard rows containing any missing value
        pairs = pairs.dropna()

        def row_distance(row):
            # Argument order is (lng, lat) of the seller, then of the customer
            return haversine_distance(row['geolocation_lng_seller'],
                                      row['geolocation_lat_seller'],
                                      row['geolocation_lng_customer'],
                                      row['geolocation_lat_customer'])

        pairs.loc[:, 'distance_seller_customer'] = \
            pairs.apply(row_distance, axis=1)

        # An order can involve several sellers: average the distance per order
        per_order = pairs.groupby('order_id', as_index=False)
        return per_order.agg({'distance_seller_customer': 'mean'})
示例#5
0
 def __init__(self):
     """Give every new instance a .data attribute from Olist().get_data()."""
     olist_source = Olist()
     self.data = olist_source.get_data()