"""Search for a regression pipeline predicting Olist order wait time with TPOT.

The script loads the Olist datasets, derives the wait time implied by the
estimated delivery date, joins it onto the training orders, fits a
``TPOTRegressor`` on a train split, prints its score on the held-out split
and exports the best pipeline found as a Python file.
"""
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tpot import TPOTRegressor

# Import data
from olist.order import Order
from olist.data import Olist

data = Olist().get_data()
training_orders = Order().get_training_data()

orders = data['olist_orders_dataset']

# Estimated wait time: span between the purchase timestamp and the estimated
# delivery date, expressed in units of 24 hours (i.e. days, as a float).
orders['estimate_wait_time'] = (
    pd.to_datetime(orders['order_estimated_delivery_date'])
    - pd.to_datetime(orders['order_purchase_timestamp'])
) / np.timedelta64(24, 'h')

# Attach the derived feature to the training set; merge() defaults to an
# inner join, so only order_ids present in both frames are kept.
training_orders = training_orders.merge(
    orders[['estimate_wait_time', 'order_id']],
    on='order_id',
)

# Features: everything except the identifier, the target ('wait_time') and
# 'delay_vs_expected'.
# NOTE(review): 'delay_vs_expected' is presumably dropped because it is
# derived from the target -- confirm against Order.get_training_data().
X = training_orders.drop(['order_id', 'wait_time', 'delay_vs_expected'], axis=1)
y = training_orders['wait_time']

# Hold out 33% of the rows for evaluation; fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

# Evolutionary pipeline search: 5 generations with a population of 50.
tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))

# NOTE(review): the export file name mentions "boston" although the data
# comes from Olist; it is kept unchanged so existing consumers of the file
# are not broken.
tpot.export('tpot_boston_pipeline.py')
'product_id'], as_index=False)\ .agg({'order_item_id': 'count'})\ .merge(products.drop(['product_category_name'], axis=1), on='product_id') order_items_products = order_items_products\ .groupby('order_id', as_index=False)\ .agg({'product_weight_g': 'sum', 'product_length_cm': 'max', 'product_height_cm': 'max', 'product_width_cm': 'max'}) orders_2 = orders.merge(order_items_products, on='order_id', how='left').dropna() # Create train and target variable X_2 = orders_2.drop( ['order_id', 'expected_wait_time', 'delay_vs_expected', 'wait_time'], axis=1) y_2 = orders_2['wait_time'] # Training test split X_2_train, X_2_test, y_2_train, y_2_test = train_test_split(X_2, y_2, test_size=0.30, random_state=42) # Average CV score on the training set was:-62.99467662836531 exported_pipeline = RandomForestRegressor(bootstrap=True,