import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from tpot import TPOTRegressor

# Project-local data access helpers
from olist.order import Order
from olist.data import Olist

# Load the raw tables and the per-order training set.
data = Olist().get_data()
training_orders = Order().get_training_data()

# Feature: time between purchase and the estimated delivery date,
# expressed in days (the timedelta is divided by a 24-hour unit).
orders = data['olist_orders_dataset']
orders['estimate_wait_time'] = (
    pd.to_datetime(orders['order_estimated_delivery_date'])
    - pd.to_datetime(orders['order_purchase_timestamp'])
) / np.timedelta64(24, 'h')

# Attach the new feature to the training set, matching rows on order_id.
training_orders = training_orders.merge(
    orders[['estimate_wait_time', 'order_id']],
    on='order_id',
)

# Target is the observed wait time; the identifier, the target itself and
# delay_vs_expected are excluded from the feature matrix.
X = training_orders.drop(columns=['order_id', 'wait_time', 'delay_vs_expected'])
y = training_orders['wait_time']

# Hold out 33% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Configure the TPOT regressor; it is only constructed here, not fitted.
tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)
def __init__(self):
    """
    Build the instance state from a single Olist loader:
    ``data`` (result of ``get_data``), ``matching_table`` (result of
    ``get_matching_table``) and ``order`` (a fresh ``Order`` instance).
    """
    # One shared loader so get_data() and get_matching_table() come from
    # the same Olist object instead of instantiating it twice.
    loader = Olist()
    self.data = loader.get_data()
    self.matching_table = loader.get_matching_table()

    self.order = Order()
def __init__(self):
    """Store the result of ``Olist().get_data()`` on the instance as ``data``."""
    loader = Olist()
    self.data = loader.get_data()
def get_distance_seller_customer(self):
    """
    Returns a DataFrame with one row per order_id and a
    'distance_seller_customer' column holding the mean distance
    between the order's seller(s) and its customer.

    Rows of the merged table that contain any missing value are
    dropped before the distances are computed.
    """
    # Distances are computed with haversine_distance, which the original
    # hint places in olist/utils.py.
    tables = self.data
    order_links = Olist().get_matching_table()

    # One zip code prefix can map to several (lat, lng) pairs:
    # keep the first pair found for each prefix.
    zip_coordinates = tables['geolocation'].groupby(
        'geolocation_zip_code_prefix', as_index=False
    ).first()

    def with_coordinates(frame, zip_column, kept_columns):
        # Left-join the per-zip coordinates onto `frame`, then keep
        # only the requested columns.
        joined = frame.merge(
            zip_coordinates,
            how='left',
            left_on=zip_column,
            right_on='geolocation_zip_code_prefix',
        )
        return joined[kept_columns]

    # Sellers with their coordinates
    sellers_geo = with_coordinates(
        tables['sellers'],
        'seller_zip_code_prefix',
        ['seller_id', 'seller_zip_code_prefix', 'seller_city',
         'seller_state', 'geolocation_lat', 'geolocation_lng'],
    )

    # Customers with their coordinates
    customers_geo = with_coordinates(
        tables['customers'],
        'customer_zip_code_prefix',
        ['customer_id', 'customer_zip_code_prefix', 'customer_city',
         'customer_state', 'geolocation_lat', 'geolocation_lng'],
    )

    # Link each matching-table row to its seller, then to its customer.
    # The lat/lng columns collide on the second merge, so they receive
    # the '_seller' / '_customer' suffixes.
    seller_side = order_links.merge(sellers_geo, on='seller_id')
    pairs = seller_side.merge(
        customers_geo,
        on='customer_id',
        suffixes=('_seller', '_customer'),
    )

    # Discard rows with any missing value
    pairs = pairs.dropna()

    def row_distance(row):
        # Arguments go in as (lng, lat) of the seller, then (lng, lat)
        # of the customer.
        return haversine_distance(
            row['geolocation_lng_seller'],
            row['geolocation_lat_seller'],
            row['geolocation_lng_customer'],
            row['geolocation_lat_customer'],
        )

    pairs.loc[:, 'distance_seller_customer'] = pairs.apply(row_distance, axis=1)

    # An order can involve several sellers: report the average
    # distance per order.
    per_order = pairs.groupby('order_id', as_index=False)
    return per_order.agg({'distance_seller_customer': 'mean'})
def __init__(self):
    """
    Give every new Order instance a ``data`` attribute holding the
    result of ``Olist().get_data()``.
    """
    loader = Olist()
    self.data = loader.get_data()