Example #1
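This example trains an XGBoost sales regressor: categorical columns are label-encoded, a small hyperparameter grid is searched with GridSearchCV, the fit is checked on a 30% hold-out split with RMSE and MAE, and predictions for the full training and test frames are returned.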
import numpy as np
import joblib
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBRegressor


def create_predictions_sales(train, test, load_or_run='run'):
    # Drop columns that are not used as model features.
    sub = train.drop(['Customers', 'Date'], axis=1)
    subtest = test.drop(['Sales', 'Customers', 'Date'], axis=1)

    # Make sure the Open flag in the test frame is integer-typed.
    subtest.Open = subtest.Open.astype('int')
    # Label-encode every object/categorical column in the training frame.
    for c in sub.columns:
        if sub[c].dtype == 'object' or sub[c].dtype.name == 'category':
            print(c)
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(sub[c].values))
            sub[c] = lbl.transform(sub[c].values)

    # Do the same label encoding for the test frame.
    for c in subtest.columns:
        if subtest[c].dtype == 'object' or subtest[c].dtype.name == 'category':
            print(c)
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(subtest[c].values))
            subtest[c] = lbl.transform(subtest[c].values)

    # Separate the target and convert the feature frames to numpy arrays.
    target = np.array(sub.Sales)
    sub = sub.drop('Sales', axis=1)
    traincols = sub.columns
    sub = np.array(sub)
    subtest = np.array(subtest)

    # Hold out 30% of the training data for evaluation.
    trn, tst, trgt_train, trgt_test = train_test_split(sub,
                                                       target,
                                                       test_size=.3,
                                                       random_state=42)

    def rmse(preds, target):
        # Root mean squared error, printed for quick inspection.
        error = np.sqrt(((preds - target) ** 2).mean())
        print(error)
        return error

    def mae(preds, target):
        # Mean absolute error, printed for quick inspection.
        error = np.mean(np.abs(preds - target))
        print(error)
        return error

    if load_or_run == 'load':
        # Reuse a previously fitted model from disk.
        xg = joblib.load("sales2.joblib.dat")
        print('loaded')

    else:
        # Grid-search an XGBoost regressor over a small hyperparameter grid,
        # then cache the fitted search object to disk.
        param_grid = {
            'n_jobs': [4],
            'learning_rate': [.05, .1, .2],
            'max_depth': [8, 10],
            'n_estimators': [500],
            'booster': ['gbtree'],
            'gamma': [0],
            'subsample': [1],
            'colsample_bytree': [1]
        }
        xg = XGBRegressor(silent=0)
        xg = GridSearchCV(xg, param_grid)
        xg.fit(X=trn, y=trgt_train)
        # Attach the feature names so they can be recovered later.
        xg.Features = traincols
        joblib.dump(xg, "sales2.joblib.dat")
        print('ran')


    # feats = pd.DataFrame({'feats': traincols, 'importances': xg2.feature_importances_})
    # feats.plot.bar()
    print(xg.best_estimator_)
    # Score the hold-out split, then predict for the full train and test sets.
    preds = xg.predict(tst)
    rmse(preds, trgt_test)
    mae(preds, trgt_test)
    testpreds = xg.predict(subtest)
    trainpreds = xg.predict(sub)
    # Alternative model, kept for reference:
    # rf = RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=4)
    # rf.fit(trn, trgt_train)
    # preds = rf.predict(tst)
    # rmse(preds, trgt_test)
    # mae(preds, trgt_test)

    return trainpreds, testpreds
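
A minimal usage sketch, assuming the train and test frames come from Rossmann-style CSV files (the file names and the placeholder-column handling below are assumptions; the original snippet does not show how the inputs are prepared):

import pandas as pd

# Hypothetical input files; adjust paths and column handling to your data.
train = pd.read_csv("train.csv", parse_dates=['Date'])
test = pd.read_csv("test.csv", parse_dates=['Date'])

# create_predictions_sales() drops 'Sales' and 'Customers' from the test frame,
# so add placeholder columns if the raw test file does not contain them.
for col in ('Sales', 'Customers'):
    if col not in test.columns:
        test[col] = 0

trainpreds, testpreds = create_predictions_sales(train, test, load_or_run='run')
print(testpreds[:10])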