示例#1
0
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tpot import TPOTRegressor

#import data
from olist.order import Order
from olist.data import Olist

# Load the raw Olist tables and the prepared per-order training frame.
data = Olist().get_data()
training_orders = Order().get_training_data()

orders = data['olist_orders_dataset']

# Estimated wait time, expressed in fractional days: the gap between the
# estimated delivery date and the purchase timestamp, divided by 24 hours.
# NOTE: this adds a column to the frame stored in `data`.
one_day = np.timedelta64(24, 'h')
estimated_delivery = pd.to_datetime(orders['order_estimated_delivery_date'])
purchased_at = pd.to_datetime(orders['order_purchase_timestamp'])
orders['estimate_wait_time'] = (estimated_delivery - purchased_at) / one_day

# Attach the new feature to the training frame (inner join on order_id).
training_orders = training_orders.merge(
    orders[['estimate_wait_time', 'order_id']], on='order_id')

# Features: everything except the identifier, the target itself and
# delay_vs_expected. Target: the observed wait time.
X = training_orders.drop(['order_id', 'wait_time', 'delay_vs_expected'],
                         axis=1)
y = training_orders['wait_time']

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)

# Seed the pipeline search with the same value as the split so that the
# printed score and the exported pipeline are reproducible between runs.
tpot = TPOTRegressor(generations=5,
                     population_size=50,
                     verbosity=2,
                     random_state=42)
tpot.fit(X_train, y_train)

# Score of the best pipeline found, evaluated on the held-out split.
print(tpot.score(X_test, y_test))

# Write the best pipeline out as a standalone Python script.
tpot.export('tpot_boston_pipeline.py')
示例#2
0
                         'product_id'],
                        as_index=False)\
               .agg({'order_item_id': 'count'})\
               .merge(products.drop(['product_category_name'],
                                    axis=1),
                      on='product_id')

# Collapse to a single row per order_id: weights are summed, while length,
# height and width each keep their largest value within the order.
order_items_products = (
    order_items_products
    .groupby('order_id', as_index=False)
    .agg({
        'product_weight_g': 'sum',
        'product_length_cm': 'max',
        'product_height_cm': 'max',
        'product_width_cm': 'max',
    })
)

# Left-join the aggregated columns onto the orders, then discard every row
# that contains a missing value.
orders_2 = orders.merge(order_items_products, how='left', on='order_id')
orders_2 = orders_2.dropna()

# Feature matrix: all columns except the identifier and the three
# wait-time columns. Target vector: the wait_time column.
X_2 = orders_2.drop(
    columns=['order_id', 'expected_wait_time', 'delay_vs_expected',
             'wait_time'])
y_2 = orders_2['wait_time']

# Hold out 30% of the rows for evaluation, with a fixed seed.
X_2_train, X_2_test, y_2_train, y_2_test = train_test_split(
    X_2, y_2, test_size=0.30, random_state=42)

# Average CV score on the training set was: -62.99467662836531
exported_pipeline = RandomForestRegressor(bootstrap=True,