def create_predictions_sales(train, test, load_or_run='run'):
    """Train (or load) an XGBoost regressor for Sales and return predictions.

    Parameters
    ----------
    train : pandas.DataFrame
        Training data; must contain 'Sales', 'Customers', and 'Date' columns.
        'Customers' and 'Date' are dropped before modelling.
    test : pandas.DataFrame
        Test data with the same columns (its 'Sales' column is dropped).
    load_or_run : str, default 'run'
        'load' restores a previously fitted model from "sales2.joblib.dat";
        any other value runs a fresh grid search and persists the result.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        (trainpreds, testpreds): predictions over the full training matrix
        and over the test matrix, respectively.
    """
    sub = train.drop(['Customers', 'Date'], axis=1)
    subtest = test.drop(['Sales', 'Customers', 'Date'], axis=1)
    # 'Open' arrives as a non-integer dtype in the test frame; the model
    # expects a numeric column.
    subtest.Open = subtest.Open.astype('int')

    # Label-encode categorical columns. Each encoder is fitted on the UNION
    # of train and test values for the column, so a given category maps to
    # the same integer in both frames. (Fitting separate encoders per frame
    # — the previous behavior — silently produced inconsistent codes and
    # corrupted test-time predictions.)
    for c in sub.columns:
        if sub[c].dtype == 'object' or sub[c].dtype.name == 'category':
            print(c)
            lbl = preprocessing.LabelEncoder()
            values = list(sub[c].values)
            if c in subtest.columns:
                values += list(subtest[c].values)
            lbl.fit(values)
            sub[c] = lbl.transform(sub[c].values)
            if c in subtest.columns:
                subtest[c] = lbl.transform(subtest[c].values)

    # Encode any categorical column that exists only in the test frame
    # (shared columns were already converted to ints above).
    for c in subtest.columns:
        if subtest[c].dtype == 'object' or subtest[c].dtype.name == 'category':
            print(c)
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(subtest[c].values))
            subtest[c] = lbl.transform(subtest[c].values)

    target = np.array(sub.Sales)
    sub = sub.drop('Sales', axis=1)
    traincols = sub.columns
    sub = np.array(sub)
    subtest = np.array(subtest)

    # Hold-out split used only to report validation error below.
    trn, tst, trgt_train, trgt_test = train_test_split(
        sub, target, test_size=.3, random_state=42)

    def rmse(preds, target):
        """Print and return root-mean-squared error."""
        error = np.sqrt(((preds - target) ** 2).mean())
        print(error)
        return error

    def mae(preds, target):
        """Print and return mean absolute error."""
        error = np.mean(np.abs(preds - target))
        print(error)
        return error

    if load_or_run == 'load':
        xg = joblib.load("sales2.joblib.dat")
        print('loaded')
    else:
        param_grid = {
            'n_jobs': [4],
            'learning_rate': [.05, .1, .2],
            'max_depth': [8, 10],
            'n_estimators': [500],
            'booster': ['gbtree'],
            'gamma': [0],
            'subsample': [1],
            'colsample_bytree': [1],
        }
        # NOTE(review): `silent` is deprecated in newer xgboost releases
        # (replaced by `verbosity`); kept for compatibility with the
        # version this script was written against — confirm before upgrading.
        xg = XGBRegressor(silent=0)
        xg = GridSearchCV(xg, param_grid)
        xg.fit(X=trn, y=trgt_train)
        # Stash the feature names on the fitted search object so a later
        # 'load' run can recover the column order.
        xg.Features = traincols
        joblib.dump(xg, "sales2.joblib.dat")
        print('ran')

    print(xg.best_estimator_)

    # Report hold-out error, then predict over the full matrices.
    preds = xg.predict(tst)
    rmse(preds, trgt_test)
    mae(preds, trgt_test)
    testpreds = xg.predict(subtest)
    trainpreds = xg.predict(sub)

    return (trainpreds, testpreds)