Пример #1
0
 def test_regressor(self):
     """Train a default RGFRegressor and require a test-set MSE below 6.0."""
     model = RGFRegressor()
     model.fit(self.X_train, self.y_train)
     error = mean_squared_error(self.y_test, model.predict(self.X_test))
     print("MSE: {0:.5f}".format(error))
     self.assertLess(error, 6.0)
Пример #2
0
    def test_attributes(self):
        """Check fitted attributes raise before fit() and are consistent after it."""
        model = RGFRegressor()

        # Each learned attribute must raise NotFittedError on an unfitted model.
        for name in ('n_features_', 'fitted_', 'sl2_', 'min_samples_leaf_',
                     'n_iter_'):
            with self.assertRaises(NotFittedError):
                getattr(model, name)

        model.fit(self.X_train, self.y_train)
        self.assertEqual(model.n_features_, self.X_train.shape[-1])
        self.assertTrue(model.fitted_)

        # sl2_ is expected to mirror l2 whenever sl2 itself is None.
        expected_sl2 = model.l2 if model.sl2 is None else model.sl2
        self.assertEqual(model.sl2_, expected_sl2)

        if model.min_samples_leaf < 1:
            # A fractional setting is only bounded, not compared exactly.
            upper_bound = 0.5 * self.X_train.shape[0]
            self.assertLessEqual(model.min_samples_leaf_, upper_bound)
        else:
            self.assertEqual(model.min_samples_leaf_, model.min_samples_leaf)

        # With n_iter unset the expected value depends on the loss: 10 for "LS", else 5.
        if model.n_iter is not None:
            expected_n_iter = model.n_iter
        elif model.loss == "LS":
            expected_n_iter = 10
        else:
            expected_n_iter = 5
        self.assertEqual(model.n_iter_, expected_n_iter)
Пример #3
0
 def test_regressor(self):
     """Fit RGFRegressor with default settings; its test MSE must be under 6.0."""
     estimator = RGFRegressor()
     estimator.fit(self.X_train, self.y_train)
     predictions = estimator.predict(self.X_test)
     test_mse = mean_squared_error(self.y_test, predictions)
     print("MSE: {0:.5f}".format(test_mse))
     self.assertLess(test_mse, 6.0)
Пример #4
0
 def test_regressor_sparse_input(self):
     """Fit and predict on CSR, CSC and COO inputs; each MSE must be below 6.0."""
     model = RGFRegressor(prefix='reg')
     for to_sparse in (csr_matrix, csc_matrix, coo_matrix):
         features = to_sparse(self.X)
         model.fit(features, self.y)
         error = mean_squared_error(self.y, model.predict(features))
         self.assertLess(error, 6.0)
Пример #5
0
 def test_regressor_sparse_input(self):
     """Fit and predict on every listed scipy sparse format; each MSE must be below 6.0."""
     model = RGFRegressor()
     sparse_formats = (
         sparse.bsr_matrix,
         sparse.coo_matrix,
         sparse.csc_matrix,
         sparse.csr_matrix,
         sparse.dia_matrix,
         sparse.dok_matrix,
         sparse.lil_matrix,
     )
     for to_sparse in sparse_formats:
         features = to_sparse(self.X)
         model.fit(features, self.y)
         error = mean_squared_error(self.y, model.predict(features))
         self.assertLess(error, 6.0)
Пример #6
0
    def test_joblib_pickle(self):
        """A joblib dump/load round trip must reproduce the original predictions.

        The fitted estimator is dumped to disk, ``_cleanup()`` is called
        (per the original comment, to remove the model file), and the
        reloaded estimator must predict the same values as before.
        """
        import os
        import shutil
        import tempfile

        reg = RGFRegressor()
        reg.fit(self.X_train, self.y_train)
        y_pred1 = reg.predict(self.X_test)

        # Dump into a private temporary directory: the previous version wrote
        # 'test_reg.pkl' into the working directory and never removed it.
        tmp_dir = tempfile.mkdtemp()
        pkl_path = os.path.join(tmp_dir, 'test_reg.pkl')
        try:
            joblib.dump(reg, pkl_path)

            # Remove model file
            _cleanup()

            reg2 = joblib.load(pkl_path)
            y_pred2 = reg2.predict(self.X_test)
        finally:
            # Remove the whole directory rather than one file, so any extra
            # files the dump may have written are cleaned up as well.
            shutil.rmtree(tmp_dir, ignore_errors=True)

        np.testing.assert_allclose(y_pred1, y_pred2)
Пример #7
0
    def test_pickle(self):
        """A pickle round trip must reproduce the original predictions."""
        original = RGFRegressor()
        original.fit(self.X_train, self.y_train)
        expected = original.predict(self.X_test)
        payload = pickle.dumps(original)

        # Remove model file
        _cleanup()

        restored = pickle.loads(payload)
        actual = restored.predict(self.X_test)

        np.testing.assert_allclose(expected, actual)
Пример #8
0
    def test_cleanup(self):
        """cleanup() on one estimator must not affect another fitted estimator."""
        cleaned = RGFRegressor()
        cleaned.fit(self.X_train, self.y_train)

        untouched = RGFRegressor()
        untouched.fit(self.X_train, self.y_train)

        # The first call must return a non-zero value, a repeated call must return 0.
        self.assertNotEqual(cleaned.cleanup(), 0)
        self.assertEqual(cleaned.cleanup(), 0)

        # No file carrying the cleaned estimator's prefix may remain.
        pattern = os.path.join(_get_temp_path(), cleaned._file_prefix + "*")
        leftovers = glob.glob(pattern)
        self.assertFalse(leftovers)

        with self.assertRaises(NotFittedError):
            cleaned.predict(self.X_test)
        untouched.predict(self.X_test)
Пример #9
0
    def test_params(self):
        """fit() must accept the valid parameter set and reject each invalid value."""
        reg = RGFRegressor()

        valid_params = {
            'max_leaf': 300,
            'test_interval': 100,
            'algorithm': 'RGF_Sib',
            'loss': 'Log',
            'reg_depth': 1.1,
            'l2': 0.1,
            'sl2': None,
            'normalize': False,
            'min_samples_leaf': 9,
            'n_iter': None,
            'n_tree_search': 2,
            'opt_interval': 100,
            'learning_rate': 0.4,
            'verbose': True,
            'prefix': 'rgf_regressor',
            'inc_prefix': True,
            'clean': True,
        }
        reg.set_params(**valid_params)
        reg.fit(self.X_train, self.y_train)

        non_valid_params = {
            'max_leaf': 0,
            'test_interval': 0,
            'algorithm': 'RGF_Test',
            'loss': True,
            'reg_depth': 0.1,
            'l2': 11,
            'sl2': -1.1,
            'normalize': 'False',
            'min_samples_leaf': 0.7,
            'n_iter': 11.1,
            'n_tree_search': 0,
            'opt_interval': 100.1,
            'learning_rate': -0.5,
            'verbose': -1,
            'prefix': '',
            'inc_prefix': 1,
            'clean': 0,
        }
        for name, bad_value in non_valid_params.items():
            # Start from the valid set, then break exactly one parameter.
            reg.set_params(**valid_params)
            reg.set_params(**{name: bad_value})
            with self.assertRaises(ValueError):
                reg.fit(self.X_train, self.y_train)
Пример #10
0
    def test_sample_weight(self):
        """Check that ``fit`` honours sample weights.

        Unit weights must reproduce the unweighted predictions, and giving a
        near-zero weight to deliberately corrupted rows must yield a lower
        test MSE than fitting on the corrupted data without weights.
        """
        reg = RGFRegressor()

        y_pred = reg.fit(self.X_train, self.y_train).predict(self.X_test)
        y_pred_weighted = reg.fit(self.X_train,
                                  self.y_train,
                                  np.ones(self.y_train.shape[0])
                                  ).predict(self.X_test)
        np.testing.assert_allclose(y_pred, y_pred_weighted)

        # A private RandomState(42) draws the same indices as seeding the
        # global generator with 42, but leaves the global state untouched.
        rng = np.random.RandomState(42)
        idx = rng.choice(400, 80, replace=False)

        # Corrupt a copy so the fixture on ``self`` is not modified in place.
        X_corrupt = self.X_train.copy()
        X_corrupt[idx] = -99999  # Add some outliers

        y_pred_corrupt = reg.fit(X_corrupt, self.y_train).predict(self.X_test)
        mse_corrupt = mean_squared_error(self.y_test, y_pred_corrupt)

        weights = np.ones(self.y_train.shape[0])
        # Smallest positive float32: effectively removes the outliers while
        # keeping every weight strictly positive.
        weights[idx] = np.nextafter(np.float32(0), np.float32(1))  # Eliminate outliers
        y_pred_weighted = reg.fit(X_corrupt, self.y_train, weights).predict(self.X_test)
        mse_fixed = mean_squared_error(self.y_test, y_pred_weighted)
        self.assertLess(mse_fixed, mse_corrupt)
Пример #11
0
    def test_sample_weight(self):
        """Check that ``fit`` honours sample weights.

        Unit weights must reproduce the unweighted predictions, and giving a
        near-zero weight to deliberately corrupted rows must yield a lower
        test MSE than fitting on the corrupted data without weights.
        """
        reg = RGFRegressor()

        y_pred = reg.fit(self.X_train, self.y_train).predict(self.X_test)
        y_pred_weighted = reg.fit(self.X_train,
                                  self.y_train,
                                  np.ones(self.y_train.shape[0])
                                  ).predict(self.X_test)
        np.testing.assert_allclose(y_pred, y_pred_weighted)

        # A private RandomState(42) draws the same indices as seeding the
        # global generator with 42, but leaves the global state untouched.
        rng = np.random.RandomState(42)
        idx = rng.choice(400, 80, replace=False)

        # Corrupt a copy so the fixture on ``self`` is not modified in place.
        X_corrupt = self.X_train.copy()
        X_corrupt[idx] = -99999  # Add some outliers

        y_pred_corrupt = reg.fit(X_corrupt, self.y_train).predict(self.X_test)
        mse_corrupt = mean_squared_error(self.y_test, y_pred_corrupt)

        weights = np.ones(self.y_train.shape[0])
        # Smallest positive float32: effectively removes the outliers while
        # keeping every weight strictly positive.
        weights[idx] = np.nextafter(np.float32(0), np.float32(1))  # Eliminate outliers
        y_pred_weighted = reg.fit(X_corrupt, self.y_train, weights).predict(self.X_test)
        mse_fixed = mean_squared_error(self.y_test, y_pred_weighted)
        self.assertLess(mse_fixed, mse_corrupt)
Пример #12
0
    def test_params(self):
        """fit() must accept the valid parameter set and reject each invalid value."""
        reg = RGFRegressor()

        valid_params = {
            'max_leaf': 300,
            'test_interval': 100,
            'algorithm': 'RGF_Sib',
            'loss': 'Log',
            'reg_depth': 1.1,
            'l2': 0.1,
            'sl2': None,
            'normalize': False,
            'min_samples_leaf': 9,
            'n_iter': None,
            'n_tree_search': 2,
            'opt_interval': 100,
            'learning_rate': 0.4,
            'memory_policy': 'conservative',
            'verbose': True,
        }
        reg.set_params(**valid_params)
        reg.fit(self.X_train, self.y_train)

        non_valid_params = {
            'max_leaf': 0,
            'test_interval': 0,
            'algorithm': 'RGF_Test',
            'loss': True,
            'reg_depth': 0.1,
            'l2': 11,
            'sl2': -1.1,
            'normalize': 'False',
            'min_samples_leaf': 0.7,
            'n_iter': 11.1,
            'n_tree_search': 0,
            'opt_interval': 100.1,
            'learning_rate': -0.5,
            'memory_policy': 'Generos',
            'verbose': -1,
        }
        for name, bad_value in non_valid_params.items():
            # Start from the valid set, then break exactly one parameter.
            reg.set_params(**valid_params)
            reg.set_params(**{name: bad_value})
            with self.assertRaises(ValueError):
                reg.fit(self.X_train, self.y_train)
Пример #13
0
    def test_attributes(self):
        """Check fitted attributes raise before fit() and are consistent after it."""
        model = RGFRegressor()

        # Each learned attribute must raise NotFittedError on an unfitted model.
        for name in ('n_features_', 'fitted_', 'sl2_', 'min_samples_leaf_', 'n_iter_'):
            with self.assertRaises(NotFittedError):
                getattr(model, name)

        model.fit(self.X_train, self.y_train)
        self.assertEqual(model.n_features_, self.X_train.shape[-1])
        self.assertTrue(model.fitted_)

        # sl2_ is expected to mirror l2 whenever sl2 itself is None.
        expected_sl2 = model.l2 if model.sl2 is None else model.sl2
        self.assertEqual(model.sl2_, expected_sl2)

        if model.min_samples_leaf < 1:
            # A fractional setting is only bounded, not compared exactly.
            upper_bound = 0.5 * self.X_train.shape[0]
            self.assertLessEqual(model.min_samples_leaf_, upper_bound)
        else:
            self.assertEqual(model.min_samples_leaf_, model.min_samples_leaf)

        # With n_iter unset the expected value depends on the loss: 10 for "LS", else 5.
        if model.n_iter is not None:
            expected_n_iter = model.n_iter
        elif model.loss == "LS":
            expected_n_iter = 10
        else:
            expected_n_iter = 5
        self.assertEqual(model.n_iter_, expected_n_iter)
Пример #14
0
# Drop irrelevant columns from train and test
x_train = x_train.drop([id_column], axis=1)
test = test.drop([id_column], axis=1)

# -----------------------------------------------------------------------------
#           STEP 4 - TRAIN ML MODEL AND GENERATE PREDICTIONS
# -----------------------------------------------------------------------------
# XGBoost: both training and prediction go through DMatrix wrappers.
if ml_algorithm == 'XGBoost':
    d_train = xgb.DMatrix(x_train, label=y_train)
    d_test = xgb.DMatrix(test)
    model = xgb.train(params, d_train, num_rounds, verbose_eval=10)
    prediction = model.predict(d_test)
# LightGBM: only the training data is wrapped in a Dataset.
elif ml_algorithm == 'LightGBM':
    d_train = lgb.Dataset(x_train, label=y_train)
    model = lgb.train(params, d_train, num_rounds, verbose_eval=10)
    # Booster.predict() expects the raw feature data; handing it an
    # lgb.Dataset (as the previous code did) is rejected by LightGBM.
    prediction = model.predict(test)
# RGFRegressor, FastRGFRegressor, Ridge Regression, Lasso Regression:
# `model` is expected to already hold an estimator with fit()/predict().
else:
    model.fit(x_train, y_train)
    prediction = model.predict(test)

# -----------------------------------------------------------------------------
#               STEP 5 - GENERATE KAGGLE SUBMISSION FILE
# -----------------------------------------------------------------------------
print('Generate Submission ...')
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
submission = pd.concat([
    submission,
    pd.DataFrame({id_column_label: test_id, target_column_label: prediction}),
])
submission.to_csv(submission_file_path, index=False)
from rgf.sklearn import FastRGFRegressor, RGFRegressor

# Shuffle the Boston data with a fixed seed so the split is reproducible.
boston = load_boston()
rng = check_random_state(42)
perm = rng.permutation(boston.target.size)
boston.data = boston.data[perm]
boston.target = boston.target[perm]

# First 300 shuffled rows train the models, the remainder scores them.
train_x, test_x = boston.data[:300], boston.data[300:]
train_y, test_y = boston.target[:300], boston.target[300:]

# Time construction + fit + score for each RGF variant and report both.
for _estimator_cls, _timing_msg in ((RGFRegressor, "RGF: {} sec"),
                                    (FastRGFRegressor, "FastRGF: {} sec")):
    start = time.time()
    reg = _estimator_cls()
    reg.fit(train_x, train_y)
    score = reg.score(test_x, test_y)
    end = time.time()
    print(_timing_msg.format(end - start))
    print("score: {}".format(score))

start = time.time()
reg = RandomForestRegressor(n_estimators=100)
Пример #16
0
    def stacklearning(self):
        """Fit and evaluate several stacked-regressor experiments.

        Builds feature-extraction pipelines around RGF regressors, combines
        them with ``StackingRegressor``, cross-validates the outer stack and
        prints train/test RMSE and correlation for the stacks and for a few
        single models.

        NOTE(review): ``X``, ``y``, ``X_train``, ``X_test``, ``y_train`` and
        ``y_test`` are not defined inside this method; presumably they are
        module-level globals -- confirm.
        """

        class extAll(BaseEstimator, TransformerMixin):
            """Pass-through transformer that hands the input on unchanged."""

            def __init__(self):
                pass

            def fit(self, X, y=None):
                return self

            def transform(self, X):
                # Return the data: the previous version returned ``self``,
                # which would feed the transformer object to the next step.
                return X

            def predict(self, X):
                # NOTE(review): returns the transformer itself; left as-is
                # because the intended output is not clear from this file.
                return self

        class extMorgan(BaseEstimator, TransformerMixin):
            """Keep only the second table returned by ``sepTables``."""

            def __init__(self):
                pass

            def fit(self, X, y=None):
                return self

            def transform(self, X):
                _, morgan, _ = sepTables(X)
                return morgan

        class extMACCS(BaseEstimator, TransformerMixin):
            """Concatenate the second and first ``sepTables`` tables column-wise."""

            def __init__(self):
                pass

            def fit(self, X, y=None):
                return self

            def transform(self, X):
                maccs, morgan, _ = sepTables(X)
                maccs = pd.concat([morgan, maccs], axis=1)

                return maccs

        class extDescriptor(BaseEstimator, TransformerMixin):
            """Concatenate all three ``sepTables`` tables column-wise."""

            def __init__(self):
                pass

            def fit(self, X, y=None):
                return self

            def transform(self, X):
                maccs, morgan, descriptor = sepTables(X)
                descriptor = pd.concat([morgan, descriptor], axis=1)
                descriptor = pd.concat([maccs, descriptor], axis=1)
                return descriptor

        class extPCA(BaseEstimator, TransformerMixin):
            """Append a 64-component PCA projection of ``X`` to the morgan table."""

            def __init__(self):
                pass

            def fit(self, X, y=None):
                # Learn the projection once, on the data passed to fit().
                # The previous version refitted PCA inside transform(), so
                # train and test data were projected onto different axes.
                self.model_ = PCA(n_components=64).fit(X)
                return self

            def transform(self, X):
                _, morgan, _ = sepTables(X)
                morgan = morgan.reset_index().drop('index', axis=1)
                W = pd.DataFrame(self.model_.transform(X))
                W = pd.concat([morgan, W], axis=1)
                return W

        def report(model):
            """Fit *model* on the train split and print train/test RMSE and correlation."""
            model.fit(X_train, y_train)
            y_pred = model.predict(X_train)
            y_val = model.predict(X_test)
            print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
            print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
            print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
            print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))

        # ---- Experiment 1: stacks of RGF pipelines over feature subsets ----
        # Several estimators/pipelines below are not wired into any stack;
        # they are kept so the experiment can be reconfigured quickly.
        lgbm = LGBMRegressor(boosting_type='gbdt', num_leaves=60, learning_rate=0.06)
        rgf = RGFRegressor(max_leaf=1000, algorithm="RGF", test_interval=100, loss="LS", verbose=False, l2=1.0)
        rgf1 = RGFRegressor(max_leaf=1000, algorithm="RGF", test_interval=100, loss="LS", verbose=False, l2=1.0)
        rgf2 = RGFRegressor(max_leaf=1000, algorithm="RGF", test_interval=100, loss="LS", verbose=False, l2=1.0)
        rgf3 = RGFRegressor(max_leaf=1000, algorithm="RGF", test_interval=100, loss="LS", verbose=False, l2=1.0)
        rgf4 = RGFRegressor(max_leaf=1000, algorithm="RGF", test_interval=100, loss="LS", verbose=False, l2=1.0)

        pipe1 = make_pipeline(extMACCS(), rgf)
        pipe2 = make_pipeline(extMorgan(), rgf1)
        pipe3 = make_pipeline(extDescriptor(), rgf2)
        pipe4 = make_pipeline(extPCA(), rgf3)
        pipe7 = make_pipeline(extDescriptor(), rgf4)
        pipe8 = make_pipeline(extDescriptor(), rgf4)

        xgb = xgboost.XGBRegressor()
        nbrs = KNeighborsRegressor(2)
        svr = SVR(gamma='auto', kernel='linear')
        sgd = SGDRegressor(max_iter=1000)
        pls = PLSRegression(n_components=3)
        ext = ExtraTreesRegressor(n_estimators=30, max_features=20, min_samples_split=5, max_depth=50, min_samples_leaf=5)

        pipe5 = make_pipeline(extMorgan(), nbrs)
        pipe6 = make_pipeline(extMACCS(), rgf)
        alldata = make_pipeline(extAll())

        meta = RandomForestRegressor(max_depth=20, random_state=0, n_estimators=400)

        stack1 = StackingRegressor(regressors=[pipe1, pipe2, pipe3], meta_regressor=rgf, verbose=1)
        stack2 = StackingRegressor(regressors=[stack1, pipe5, pipe7, pipe1], meta_regressor=rgf, verbose=1)

        scores = cross_val_score(stack2, X, y, cv=10)
        print("R^2 Score: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), 'stacking'))
        # NOTE(review): these two cross-validation results are computed but
        # never reported.
        stack1_score = cross_val_score(stack1, X, y, cv=10)
        rgf_score = cross_val_score(rgf, X, y, cv=10)

        report(stack2)
        report(rgf)
        report(pipe1)

        # ---- Experiment 2: heterogeneous base regressors, ExtraTrees meta ----
        # Only this column subset is used (earlier assignments were overwritten).
        cols = [1, 2, 3]
        # Initialize base regressors
        reg1 = Ridge(random_state=1)
        reg2 = ExtraTreesRegressor(n_estimators=50, max_features=50, min_samples_split=5, max_depth=50, min_samples_leaf=5)
        reg3 = SVR(gamma='auto', kernel='linear')
        reg4 = LGBMRegressor(boosting_type='gbdt', num_leaves=60, learning_rate=0.06)
        pls = PLSRegression(n_components=3)
        pipe1 = make_pipeline(ColumnSelector(cols=cols), ExtraTreesRegressor(n_estimators=50))
        rgf = RGFRegressor(max_leaf=1000, algorithm="RGF", test_interval=100, loss="LS", verbose=False, l2=1.0)
        nbrs = KNeighborsRegressor(2)
        pipe2 = make_pipeline(ColumnSelector(cols=cols), KNeighborsRegressor(31))

        meta = ExtraTreesRegressor(n_estimators=50, max_features=7, min_samples_split=5, max_depth=50, min_samples_leaf=5)

        stackReg = StackingRegressor(regressors=[reg1, reg2, reg3, pipe1, pls, nbrs, rgf], meta_regressor=meta, verbose=1)
        report(stackReg)

        # The previous code fitted ``rgf`` here but predicted with ``reg4``,
        # which was never fitted; fit and report the same model instead.
        report(reg4)
Пример #17
0
def _ensure_dir(path):
    """Create *path* (including parents) if needed and return it unchanged."""
    os.makedirs(path, exist_ok=True)
    return path


def _fit_score_stage(prefix, cols, model_names, pack, train, test, df_scale,
                     y, seed, folder_preds, folder_models):
    """Train one feature-group ensemble and save its blended score and models.

    Scales/selects ``cols``, fits every estimator in ``pack`` on the same
    matrix through ``TrendsModelSklearn``, then saves the out-of-fold blend to
    ``<prefix>_score_seed<seed>.npy``, the test prediction to
    ``<prefix>_score_test_seed<seed>.npy`` and the fitted models.
    """
    print('Creating {} score...'.format(prefix.upper()))

    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, cols)
    names = ['{}_{}_seed{}'.format(name, prefix, seed) for name in model_names]

    # every model of the pack is trained on the same feature matrix
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * len(pack), y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * len(pack), names)

    # save oof, pred, models
    np.save(folder_preds + '{}_score_seed{}.npy'.format(prefix, seed),
            score_blend)
    np.save(folder_preds + '{}_score_test_seed{}.npy'.format(prefix, seed),
            pred)
    zoo.save_models(names, folder=folder_models)


def run(seed):
    """Train the two-level age model for one seed and write its predictions.

    Steps, in order:
      1. load raw data, engineered features and per-column biases;
      2. merge everything on ``Id`` and split into train/test by label presence;
      3. add the biases to the test rows (site-specific for Site2 ids);
      4. fit one small ensemble per feature group (fnc, agg, pca, im, dl) and
         store its blended prediction as a ``*_score`` feature;
      5. fit seven first-level models on four differently scaled datasets;
      6. stack their out-of-fold predictions with RGF and Bayesian ridge and
         save the 0.25 / 0.75 blend to ``age_stack_seed<seed>.csv``.

    Parameters
    ----------
    seed : value forwarded to ``TrendsModelSklearn`` and embedded in every
        saved file name.
    """
    # create folders for scores models and preds
    folder_models = _ensure_dir('./models/age/scores/')
    folder_preds = _ensure_dir('./predicts/age/scores/')

    print('Loading data...')

    # load biases
    ic_bias = read_pickle('./data/biases/ic_biases.pickle')
    ic_bias_site = read_pickle('./data/biases/ic_biases_site.pickle')
    fnc_bias = read_pickle('./data/biases/fnc_biases.pickle')
    fnc_bias_site = read_pickle('./data/biases/fnc_biases_site.pickle')
    pca_bias = read_pickle('./data/biases/200pca_biases.pickle')
    pca_bias_site = read_pickle('./data/biases/200pca_biases_site.pickle')

    # load classifier and add extra sites2
    extra_site = pd.DataFrame()
    extra_site['Id'] = np.load('./predicts/classifier/site2_test_new_9735.npy')

    # load competition data
    ids_df = pd.read_csv('./data/raw/reveal_ID_site2.csv')
    fnc_df = pd.read_csv('./data/raw/fnc.csv')
    loading_df = pd.read_csv('./data/raw/loading.csv')
    labels_df = pd.read_csv('./data/raw/train_scores.csv')

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    ids_df = pd.concat([ids_df, extra_site])
    print('Detected Site2 ids count: ', ids_df['Id'].nunique())

    # load created features
    agg_df = pd.read_csv('./data/features/agg_feats.csv')
    im_df = pd.read_csv('./data/features/im_feats.csv')
    dl_df = pd.read_csv('./data/features/dl_feats.csv')

    # the PCA features are split over six files; the first one keeps the Id
    pca_df = pd.read_csv('./data/features/200pca_feats/200pca_3d_k0.csv')
    for i in range(1, 6):
        part = pd.read_csv(
            './data/features/200pca_feats/200pca_3d_k{}.csv'.format(i))
        del part['Id']
        pca_df = pd.concat((pca_df, part), axis=1)

    # merge data (first column of every table is skipped when listing features)
    ic_cols = list(loading_df.columns[1:])
    fnc_cols = list(fnc_df.columns[1:])
    agg_cols = list(agg_df.columns[1:])
    im_cols = list(im_df.columns[1:])
    pca_cols = list(pca_df.columns[1:])
    dl_cols = list(dl_df.columns[1:])

    df = fnc_df.merge(loading_df, on='Id')
    df = df.merge(agg_df, how='left', on='Id')
    df = df.merge(im_df, how='left', on='Id')
    df = df.merge(pca_df, how='left', on='Id')
    df = df.merge(dl_df, how='left', on='Id')
    df = df.merge(labels_df, how='left', on='Id')

    del loading_df, fnc_df, agg_df, im_df, pca_df, dl_df
    gc.collect()

    # split train and test: rows whose Id has labels are training rows.
    # Explicit copies are taken because both frames are modified below.
    is_train = df['Id'].isin(labels_df['Id'])
    train = df[is_train].copy()
    test = df[~is_train].copy()
    y = train['age'].copy().reset_index(drop=True)

    # apply biases: Site2 rows get the site-specific bias, all others the
    # general one (the row mask does not change, so it is computed once)
    is_site2 = test['Id'].isin(ids_df['Id'])
    for bias, bias_site in ((ic_bias, ic_bias_site),
                            (fnc_bias, fnc_bias_site),
                            (pca_bias, pca_bias_site)):
        for c in bias_site.keys():
            test.loc[~is_site2, c] += bias[c]
            test.loc[is_site2, c] += bias_site[c]

    # save df for scaling
    df_scale = pd.concat([train, test], axis=0)

    # I.-V. Create one blended score per feature group
    score_stages = [
        ('fnc', fnc_cols, ['RGF', 'ENet', 'BRidge', 'Huber', 'OMP'], [
            RGFRegressor(max_leaf=1000, reg_depth=5, normalize=True),
            ElasticNet(alpha=0.05, l1_ratio=0.5, random_state=0),
            BayesianRidge(),
            HuberRegressor(epsilon=2.5, alpha=1),
            OrthogonalMatchingPursuit(n_nonzero_coefs=300),
        ]),
        ('agg', agg_cols, ['RGF', 'ENet', 'Huber'], [
            RGFRegressor(max_leaf=1000,
                         reg_depth=5,
                         min_samples_leaf=100,
                         normalize=True),
            ElasticNet(alpha=0.05, l1_ratio=0.3, random_state=0),
            HuberRegressor(epsilon=2.5, alpha=1),
        ]),
        ('pca', pca_cols, ['RGF', 'ENet', 'BRidge', 'OMP'], [
            RGFRegressor(max_leaf=1000,
                         reg_depth=5,
                         min_samples_leaf=100,
                         normalize=True),
            ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
            BayesianRidge(),
            OrthogonalMatchingPursuit(),
        ]),
        ('im', im_cols, ['RGF', 'ENet', 'BRidge', 'OMP'], [
            RGFRegressor(max_leaf=1000,
                         reg_depth=5,
                         min_samples_leaf=100,
                         normalize=True),
            ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
            BayesianRidge(),
            OrthogonalMatchingPursuit(),
        ]),
        ('dl', dl_cols, ['RGF', 'ENet', 'BRidge'], [
            RGFRegressor(max_leaf=1000,
                         reg_depth=5,
                         min_samples_leaf=100,
                         normalize=True),
            ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
            BayesianRidge(),
        ]),
    ]
    for prefix, cols, model_names, pack in score_stages:
        _fit_score_stage(prefix, cols, model_names, pack, train, test,
                         df_scale, y, seed, folder_preds, folder_models)

    # VI. Training and predicting procedure
    print('Training has started...')
    print('Reading scores from ', folder_preds)

    # add scores
    for prefix in ['fnc', 'agg', 'im', 'pca', 'dl']:
        train[prefix + '_score'] = np.load(
            folder_preds + '{}_score_seed{}.npy'.format(prefix, seed))
        test[prefix + '_score'] = np.load(
            folder_preds + '{}_score_test_seed{}.npy'.format(prefix, seed))
    score_cols = [c for c in train.columns if c.endswith('_score')]

    # save df for scaling (rebuilt so it contains the new score columns)
    df_scale = pd.concat([train, test], axis=0)

    # create different datasets ('IC_20' is excluded from all of them)
    # linear
    linear_cols = sorted(
        list(
            set(ic_cols + fnc_cols + pca_cols + agg_cols + im_cols) -
            set(['IC_20'])))
    train_linear, test_linear = scale_select_data(train, test, df_scale,
                                                  linear_cols)

    # kernel
    kernel_cols = sorted(list(set(ic_cols + pca_cols) - set(['IC_20'])))
    train_kernel, test_kernel = scale_select_data(train=train,
                                                  test=test,
                                                  df_scale=df_scale,
                                                  cols=kernel_cols,
                                                  scale_cols=pca_cols)

    # score
    sc_cols = sorted(list(set(ic_cols + score_cols) - set(['IC_20'])))
    train_sc, test_sc = scale_select_data(train, test, df_scale, sc_cols)

    # dl
    dict_cols = sorted(
        list(
            set(ic_cols + fnc_cols + dl_cols + im_cols + agg_cols) -
            set(['IC_20'])))
    train_dl, test_dl = scale_select_data(train, test, df_scale, dict_cols)

    # learning process on different datasets
    names = ['MLP', 'RGF', 'SVM', 'BR', 'OMP', 'EN', 'KR']
    names = [name + '_seed{}'.format(seed) for name in names]
    pack = [
        MLPRegressor(activation='tanh', random_state=0),
        RGFRegressor(max_leaf=1500, loss='Abs'),
        NuSVR(C=10, nu=0.4, kernel='rbf'),
        BayesianRidge(),
        OrthogonalMatchingPursuitCV(),
        ElasticNet(alpha=0.5, l1_ratio=0.7, random_state=0),
        KernelRidge(kernel='poly', alpha=0.5)
    ]

    # datasets are matched to the models above by position
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_sc] * 2 + [train_kernel] + [train_linear] * 2 +
            [train_dl] * 2, y)
    de_blend = zoo.blend_oof()
    preds = zoo.predict([test_sc] * 2 + [test_kernel] + [test_linear] * 2 +
                        [test_dl] * 2,
                        names,
                        is_blend=False)

    # rewrite folders for models and preds
    folder_models = _ensure_dir('./models/age/stack/')
    folder_preds = _ensure_dir('./predicts/age/stack/')

    print('Saving models to', folder_models)
    print('Saving predictions to', folder_preds)

    # save oofs and models
    zoo.save_oofs(names, folder=folder_preds)
    zoo.save_models(names, folder=folder_models)

    # stacking predictions
    print('Stacking predictions...')
    folds = KFold(n_splits=10, shuffle=True, random_state=0)
    stack = pd.DataFrame(zoo.oof_preds).T
    stack.columns = names

    model_stacker_rgf = RGFRegressor(max_leaf=1000,
                                     reg_depth=25,
                                     verbose=False)
    rgf_pred = cross_val_predict(model_stacker_rgf,
                                 stack,
                                 y.dropna(),
                                 cv=folds,
                                 n_jobs=-1)

    model_stacker_br = BayesianRidge()
    br_pred = cross_val_predict(model_stacker_br,
                                stack,
                                y.dropna(),
                                cv=folds,
                                n_jobs=-1)

    model_stacker_rgf.fit(stack, y.dropna())
    model_stacker_br.fit(stack, y.dropna())

    # save models
    save_pickle(model_stacker_br,
                folder_models + 'BRidge_stack_seed{}'.format(seed))
    save_pickle(model_stacker_rgf,
                folder_models + 'RGF_stack_seed{}'.format(seed))
    print('Final age NMAE: {:.5f}'.format(
        NMAE(y, 0.75 * br_pred + 0.25 * rgf_pred)))

    test_preds = pd.DataFrame(preds).T
    test_preds.columns = names

    # final prediction uses the same 0.25 RGF / 0.75 Bayesian ridge blend
    age_prediction = pd.DataFrame()
    age_prediction['Id'] = test['Id'].values
    age_prediction['pred'] = 0.25 * model_stacker_rgf.predict(
        test_preds) + 0.75 * model_stacker_br.predict(test_preds)
    age_prediction.to_csv(folder_preds + 'age_stack_seed{}.csv'.format(seed),
                          index=False)
    print('age seed pred is saved as',
          folder_preds + 'age_stack_seed{}.csv'.format(seed))
Пример #18
0
    def stacklearning(self):
        """Run a scratch-pad of single-model and stacking experiments.

        Builds a set of base regressors (FM, XGBoost, LightGBM, RGF, random
        forest, extra trees, SVR, PLS, k-NN, ridge, ...), wraps them in
        feature-extraction pipelines and ``StackingRegressor`` ensembles,
        cross-validates several of them and prints the scores.

        The method reads ``X``, ``y``, ``X2``, ``y2``, ``X_train``,
        ``X_test``, ``y_train``, ``y_test``, ``extractDf``, ``changeList``
        and ``myScoreFunc``, none of which are defined here -- presumably
        module-level globals; confirm before calling.

        Many names (``fm``, ``rf``, ``svr``, ``testmodel``, ``stack1`` ...)
        are rebound several times; only the last binding before a use is
        effective.  The numbers in the comments are scores recorded by the
        original author for the configuration that follows them.

        Returns:
            None.  Results are printed.
        """
        class sparseNorm(BaseEstimator, TransformerMixin):
            """Stateless transformer: convert a DataFrame to a normalised
            sparse CSC matrix using ``preprocessing.normalize`` defaults."""

            def __init__(self):
                pass

            def fit(self, X, y=None):
                # Nothing to learn.
                return self

            def transform(self, X):
                from sklearn import preprocessing
                Y = preprocessing.normalize(sp.sparse.csc_matrix(X.values))
                return Y

        # First FM configuration; it is overwritten by the one just below and
        # therefore never used.
        fm = sgd.FMRegression(
            n_iter=4743,
            init_stdev=0.1,
            rank=100,
            l2_reg_w=0,
            l2_reg_V=0,
            step_size=0.1,
        )
        fm = sgd.FMRegression(
            n_iter=9943,
            init_stdev=0.1,
            rank=219,
            l2_reg_w=0,
            l2_reg_V=0.06454,
            step_size=0.1,
        )
        pipe = make_pipeline(sparseNorm(), fm)
        calcACC(pipe, X=X2)

        xgb = xgboost.XGBRegressor(
                    n_estimators=100,
                    max_depth=7,
                    gamma=0,
                    colsample_bytree=0.1
                )
        lgbm = LGBMRegressor(
            boosting_type='gbdt', num_leaves=367,
            learning_rate=0.06,feature_fraction=0.14,
            max_depth=28, min_data_in_leaf=8
        )
        rgf = RGFRegressor(
            max_leaf=1211, algorithm="RGF", test_interval=100,
            loss="LS", verbose=False, l2=0.93,
            min_samples_leaf=2
        )
        # Tuned random forest; replaced by a default-parameter forest on the
        # next statement, so every pipeline below uses the default one.
        rf = RandomForestRegressor(
            max_depth=20, random_state=0,
            n_estimators=56,min_samples_split=2,
            max_features=0.21
        )
        rf = RandomForestRegressor()
        ext = ExtraTreesRegressor(
            n_estimators=384,max_features= 2228,
            min_samples_split= 0.01,max_depth= 856,
            min_samples_leaf= 1
        )
        svr = SVR(
            gamma=9.5367431640625e-07,
            epsilon=0.0009765625,
            C= 2048.0
        )

        #test combination
        # One pipeline per feature-extractor combination; all share the same
        # `rf` estimator object.
        desNew = make_pipeline(extdescriptorNew(),rf)
        morNew = make_pipeline(extMorganNew(),rf)
        kotNew = make_pipeline(extklekotaTothNew(),rf)
        macNew = make_pipeline(extMACCSNew(),rf)

        desMac = make_pipeline(extDescriptorMACCS(),rf)
        morMac = make_pipeline(extMorganMACCS(),rf)
        kotMac = make_pipeline(extKlekotaTothMACCS(),rf)

        morKotNew = make_pipeline(extMorganKlekotaTothNew(),rf)
        des = make_pipeline(extOnlyDescriptor(),rf)
        mor = make_pipeline(extOnlyMorgan(),rf)
        kot = make_pipeline(extOnlyklekotaToth(),rf)
        mac = make_pipeline(extOnlyMACCS(),rf)
        # NOTE: `all` shadows the builtin of the same name inside this method.
        all = make_pipeline(extAll(),rf)
        allwithoutNew = make_pipeline(extAllwithoutNew(),rf)
        allwithoutMaccs = make_pipeline(extAllwithoutMaccs(),rf)
        allwithoutDes = make_pipeline(extAllwithoutDescriptor(),rf)

        testDic = {"Desc+New":desNew,"Mor+New":morNew,"kot+New":kotNew,"MACCS+New":macNew,"Des+MAC":desMac,"Morgan+Maccs":morMac,"Kot+MACCS":kotMac,"mor+kot+New":morKotNew,
        "descriptor":des,"morgan":mor,"kot":kot,"MACCS":mac,"All":all,"All without "
                                                                      "new":allwithoutNew,
                   "All without MACCS":allwithoutMaccs,"All without Des":allwithoutDes}

        #10fold
        cv = KFold(n_splits=10, shuffle=True, random_state=0)

        #Fingerprinttest
        # Cross-validate every feature combination and print mean RMSE and
        # correlation; `resultDic` collects the same pair per name.
        resultDic={}
        resultDic2={}
        for name,model in testDic.items():
            #model = StackingRegressor(regressors=[name], meta_regressor=rf,verbose=1)
            #calcACC(model,X=X,y=y2,name=name)

            Scores = cross_validate(model, X2, y2, cv=cv,scoring=myScoreFunc)
            RMSETmp = Scores['test_RMSE'].mean()
            CORRTmP = Scores['test_Correlation coefficient'].mean()
            resultDic.update({name:[RMSETmp,CORRTmP]})
            print(name,RMSETmp,CORRTmP)

        #stacking
        alldata = make_pipeline(extAll())
        # random forest
        #1.1546 0.70905
        stack = StackingRegressor(regressors=[alldata], meta_regressor=rf,verbose=1)

        # Light Gradient boosting
        # 1.160732 0.703776
        testmodel = StackingRegressor(regressors=[alldata], meta_regressor=lgbm,verbose=1)

        # XGboost
        # 1.1839805 0.689571
        testmodel = StackingRegressor(regressors=[alldata], meta_regressor=xgb,verbose=1)

        # Regularized greedy forest
        # 1.17050 0.6992
        testmodel = StackingRegressor(regressors=[alldata], meta_regressor=rgf,verbose=1)

        #pls 22.808047774809697 0.6410026452910016 i=4
        # Sweep the number of PLS components from 3 to 10.
        for i in np.arange(3,11,1):
            pls = PLSRegression(n_components=i)
            testmodel = StackingRegressor(regressors=[alldata], meta_regressor=pls,verbose=0)
            calcACC(testmodel)
        pls = PLSRegression(n_components=4)

        #SVR
        svr = SVR(gamma=9.5367431640625/10000000,C=1559.4918100725592,
                  epsilon=0.0009765625,)
        svr = SVR(kernel='rbf',gamma=9.5367431640625e-07,epsilon=0.0009765625,C=2048.0)

        testmodel = StackingRegressor(regressors=[alldata], meta_regressor=svr, verbose=1)
        # NOTE(review): the bare `svr` is scored here, not the stacked
        # `testmodel` built on the previous line -- confirm this is intended.
        calcACC(svr)

        #Extratree  1.157420824123527 0.7061010221224269
        testmodel = StackingRegressor(regressors=[alldata], meta_regressor=ext, verbose=1)
        calcACC(testmodel)

        #k-NN
        nbrs = KNeighborsRegressor(3)

        ##Linear regressions
        #Stochastic Gradient Descent
        # Deliberately NOT named `sgd`: binding `sgd` anywhere in this method
        # would make the name local to the whole function and turn the
        # `sgd.FMRegression(...)` calls at the top into UnboundLocalError.
        sgd_reg = SGDRegressor(max_iter=1000)
        # Ridge
        for i in [1,10,100,1000]:
            ridge = Ridge(alpha=i)
            calcACC(ridge)
        ridge = Ridge(alpha=45.50940042350705)
        calcACC(ridge)
        # multiple linear
        lin = make_pipeline(forlinear(),LinearRegression(n_jobs=-1))
        calcACC(lin)



        #stacking
        #0.69
        testmodel = StackingRegressor(regressors=[alldata,nbrs,all], meta_regressor=rf,verbose=1)
        #1.1532 0.70926
        testmodel = StackingRegressor(regressors=[alldata,nbrs,all,xgb,lgbm,rgf], meta_regressor=rf,
                              verbose=1)
        #1.16420 0.7041
        testmodel = StackingRegressor(regressors=[alldata,alldata,all], meta_regressor=rf,verbose=1)
        #1.16379 0.7044
        stack1 = StackingRegressor(regressors=[alldata,nbrs,all,xgb,lgbm,rgf], meta_regressor=rf,verbose=1)
        testmodel  = StackingRegressor(regressors=[alldata,stack1,stack1], meta_regressor=rf,verbose=1)
        #1.1535496740699531 0.7108839199109559
        pcaFeature = make_pipeline(extPCA())
        testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,rf,xgb,lgbm,rgf]
                                      ,meta_regressor=rf,verbose=1)
        #1.181801005432221 0.6889745579620922
        testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,rf,xgb,lgbm,rgf]
                                      ,meta_regressor=lgbm,verbose=1)
        #0.70613
        testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,rf,xgb,lgbm,rgf,ext]
                                      ,meta_regressor=xgb,verbose=1)
        #0.71641717
        testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,rf,xgb,lgbm,rgf,ext]
                                      ,meta_regressor=rf,verbose=1)
        #0.7146922
        testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,ridge,rf,xgb,lgbm,rgf,ext]
                                      ,meta_regressor=rf,verbose=1)

        #new features
        pcaFeature = make_pipeline(extPCA())

        #old
        pipe1 = make_pipeline(extMACCS(), rf)
        pipe2 = make_pipeline(extMorgan(), rf)
        pipe3 = make_pipeline(extDescriptor(), rf)

        pipe4 = make_pipeline(extPCA(), rgf)
        pipe7 =make_pipeline(extDescriptor(), rgf)
        pipe8 =make_pipeline(extDescriptor(), rgf)

        # From here on the default-parameter variants replace the tuned
        # xgb / nbrs / svr objects created earlier.
        xgb = xgboost.XGBRegressor()
        nbrs = KNeighborsRegressor(2)
        svr = SVR(gamma='auto',kernel='linear')

        pls = PLSRegression(n_components=4)

        extMACCSdata = make_pipeline(extMACCS())

        nbrsPipe = make_pipeline(extMorgan(), nbrs)
        pipe6 = make_pipeline(extMACCS(), rgf)
        alldata = make_pipeline(extAll())
        ave = extAverage()
        withoutdesc =  make_pipeline(extMACCS())

        meta = RandomForestRegressor(max_depth=20, random_state=0, n_estimators=400)
        #stack1 = StackingRegressor(regressors=[rgf, nbrs, alldata], meta_regressor=rgf, verbose=1)

        #0.70
        stack = StackingRegressor(regressors=[pipe1,pipe2,pipe3,xgb,lgbm,rgf,rf], meta_regressor=ave, verbose=1)

        #stack2 = StackingRegressor(regressors=[stack1,nbrs, svr,pls,rgf], meta_regressor=lgbm, verbose=1)

        #0.69######################
        stack1 = StackingRegressor(regressors=[pipe1,pipe2,pipe3], meta_regressor=rf, verbose=1)
        #0.70
        stack2 = StackingRegressor(regressors=[stack1,alldata,rgf,lgbm,xgb], meta_regressor=rf,verbose=1)

        #0.71
        stack3 = StackingRegressor(regressors=[stack2,pipe1], meta_regressor=ave, verbose=1)
        ###########################
        ###########################
        stack1 = StackingRegressor(regressors=[pipe1,pipe2,pipe3], meta_regressor=rf, verbose=1)
        stack2 = StackingRegressor(regressors=[stack1,withoutdesc,lgbm,rgf], meta_regressor=rf,verbose=1)
        stack3 = StackingRegressor(regressors=[stack2,pipe1,xgb], meta_regressor=ave, verbose=1)
        ###########################

        #stackingwithknn
        # stack1/stack2 are rebound here; stack3 still wraps the previous
        # stack2 object.
        stack1 = StackingRegressor(regressors=[pipe1,pipe2,pipe3], meta_regressor=rf, verbose=1)
        stack2 = StackingRegressor(regressors=[stack1,nbrs,pipe1], meta_regressor=rf, verbose=1)


        #stack3 = StackingRegressor(regressors=[rgf, nbrs, alldata], meta_regressor=ave, verbose=1)

        # The ShuffleSplit splitter is immediately replaced by KFold.
        cv = ShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
        cv = KFold(n_splits=10, shuffle=True, random_state=0)
        # The `...mean()**(1/2)` expressions below discard their value; they
        # only had an effect when run interactively.
        St1Scores = cross_validate(stack1,X,y,cv=cv)
        St1Scores['test_score'].mean()**(1/2)

        St2Scores = cross_validate(stack2,X,y,cv=cv)
        St2Scores['test_score'].mean()**(1/2)

        St3Scores = cross_validate(stack3,X,y,cv=cv)
        St3Scores['test_score'].mean()**(1/2)

        stackScore = cross_validate(stack, X, y, cv=cv)
        stackScore['test_score'].mean()**(1/2)

        lgbmScores =cross_validate(lgbm,X,y,cv=cv)
        lgbmScores['test_score'].mean()**(1/2)

        rgfScores = cross_validate(rgf,X,y,cv=cv)
        rgfScores['test_score'].mean()**(1/2)

        RFScores = cross_validate(rf,X,y,cv=cv)
        RFScores['test_score'].mean()**(1/2)

        scores = cross_validate(stack2,X,y,cv=cv)
        scores['test_score'].mean()**(1/2)
        print("R^2 Score: %0.2f (+/- %0.2f) [%s]" % (scores['test_score'].mean(), scores['test_score'].std(), 'stacking'))

        stack3.fit(X, y)
        y_pred = stack3.predict(X_train)
        y_val = stack3.predict(X_test)
        #stack3.score(X_train, y_train)
        exX = preprocess(extractDf, changeList)
        # Predictions are exponentiated with base 10 -- presumably the target
        # is a log10 value; confirm against the data preparation.
        valy =  (10 **(stack3.predict(exX))).tolist()
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))

        stack1.fit(X, y)
        valy =  (10 **(stack1.predict(exX))).tolist()

        sgd_reg.fit(X,y)
        valy =  (10 **(sgd_reg.predict(exX))).tolist()

        # NOTE(review): `rgfpipe` is built but never used; the bare `rgf` is
        # fitted instead.
        rgfpipe = make_pipeline(extMACCS(), rf)
        rgf.fit(X,y)
        valy =  (10 **(rgf.predict(exX))).tolist()

        nbrs.fit(X,y)
        valy =  (10 **(nbrs.predict(exX))).tolist()

        pipe = make_pipeline(extMACCS(), rf)
        pipe.fit(X,y)
        valy =  (10 **(pipe.predict(exX))).tolist()


        rf.fit(X, y)
        y_pred = rf.predict(X_train)
        y_val = rf.predict(X_test)
        exX = preprocess(extractDf, changeList)
        valy =  (10 **(rf.predict(exX))).tolist()
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))

        lgbm.fit(X, y)
        #y_pred = pipe1.predict(X_train)
        #y_val = pipe1.predict(X_test)
        exX = preprocess(extractDf, changeList)
        valy =  (10 **(lgbm.predict(exX))).tolist()
        # NOTE(review): y_pred / y_val were last assigned from `rf` above, so
        # the four lines below re-print the random-forest metrics, not
        # LightGBM's.
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))
Пример #19
0
class RGF(ModelBase):
    """RGF regression model trained and applied over pre-split k-fold data.

    The class reads ``self.parameters``, ``self.kfold``, ``self.InputDir``,
    ``self.OutputDir`` and ``self.data_format``; none of them are set here,
    so they are presumably provided by ``ModelBase`` -- confirm there.

    The target column is ``Item_Outlet_Sales``.
    """

    # Columns always removed from the feature matrix: the target and the row
    # id.  Treated as read-only; per-fit additions go into a local copy.
    _l_drop_cols = ['Item_Outlet_Sales', 'index']

    ## training, parameter tuning for single L1
    def train(self, importance=False):
        """Fit one model per fold and evaluate it on that fold's test split.

        For every fold the train/test files are loaded from
        ``<InputDir>/kfold/<fold>/``, a model is fitted, and the loaded
        frames are written back unchanged under ``<OutputDir>/kfold/<fold>/``.

        Args:
            importance: Accepted for interface compatibility; not used.

        Returns:
            dict mapping fold number to the RMSE on that fold's test split.
        """
        print('\n parameters %s \n' % self.parameters)
        d_fold_val = {}
        for fold in range(self.kfold):
            print('\n---- fold %s begins.\n' % fold)

            ## load data
            TrainFile = '%s/kfold/%s/train.%s' % (self.InputDir, fold,
                                                  self.data_format)
            TestFile = '%s/kfold/%s/test.%s' % (self.InputDir, fold,
                                                self.data_format)
            self.TrainData = DataUtil.load(TrainFile, format=self.data_format)
            self.TestData = DataUtil.load(TestFile, format=self.data_format)

            ## train and predict on valid
            self.__fit()
            fold_rmse = self.__predict()
            d_fold_val[fold] = fold_rmse

            ## save
            OutputDir = '%s/kfold/%s' % (self.OutputDir, fold)
            if not os.path.exists(OutputDir):
                os.makedirs(OutputDir)
            DataUtil.save(self.TrainData,
                          '%s/train.%s' % (OutputDir, self.data_format),
                          format=self.data_format)
            DataUtil.save(self.TestData,
                          '%s/test.%s' % (OutputDir, self.data_format),
                          format=self.data_format)

            print('\n---- Fold %d done. ----\n' % fold)

        return d_fold_val

    ## inferring for fold data and holdout data
    def infer(self, head, HoldoutData, SubmitData, metric_pk=False):
        """Produce out-of-fold, holdout and submit predictions.

        A model is refitted on every fold.  Its predictions for the fold's
        test split are stored in column ``head``; its predictions for the
        holdout and submit frames are stored in ``fold<k>`` columns, which
        are then averaged into ``head``.

        Outputs written under ``self.OutputDir``:
          * ``kfold/<k>/train`` -- out-of-fold predictions of all folds != k
          * ``kfold/<k>/test``  -- out-of-fold predictions of fold k
          * ``holdout/test``    -- holdout predictions
          * ``submit/test``     -- submit predictions

        Args:
            head: Name of the column that receives this model's prediction.
            HoldoutData: Frame with the feature columns plus ``index`` and
                ``Item_Outlet_Sales``.
            SubmitData: Frame with the feature columns.
            metric_pk: When true, print the RMSE of every input feature
                column against the target on the holdout frame next to the
                RMSE of the averaged prediction.

        Returns:
            None.
        """
        ##
        l_pred_fold = []
        PredHoldout = pd.DataFrame(index=HoldoutData.index)
        PredHoldout['index'] = HoldoutData['index']
        PredHoldout['Item_Outlet_Sales'] = HoldoutData['Item_Outlet_Sales']
        PredSubmit = pd.DataFrame(index=SubmitData.index)
        for fold in range(self.kfold):
            ## load
            TrainFile = '%s/kfold/%s/train.%s' % (self.InputDir, fold,
                                                  self.data_format)
            TestFile = '%s/kfold/%s/test.%s' % (self.InputDir, fold,
                                                self.data_format)
            self.TrainData = DataUtil.load(TrainFile, format=self.data_format)
            self.TestData = DataUtil.load(TestFile, format=self.data_format)

            ## fit
            PredFold = pd.DataFrame(index=self.TestData.index)
            PredFold['index'] = self.TestData['index']
            PredFold['Item_Outlet_Sales'] = self.TestData['Item_Outlet_Sales']
            PredFold['fold'] = fold
            self.__fit()

            ## inferring
            PredFold[head] = self._model.predict(
                self.TestData[self._l_train_columns])
            PredHoldout['fold%s' % (fold)] = self._model.predict(
                HoldoutData[self._l_train_columns])
            PredSubmit['fold%s' % fold] = self._model.predict(
                SubmitData[self._l_train_columns])
            l_pred_fold.append(PredFold)
        ## aggregate folds data
        PredKFold = pd.concat(l_pred_fold, axis=0, ignore_index=True)
        ## save for folds data
        for fold in range(self.kfold):
            FoldOutputDir = '%s/kfold/%s' % (self.OutputDir, fold)
            if not os.path.exists(FoldOutputDir):
                os.makedirs(FoldOutputDir)
            TrainFile = '%s/train.%s' % (FoldOutputDir, self.data_format)
            TestFile = '%s/test.%s' % (FoldOutputDir, self.data_format)

            # Re-split the stacked out-of-fold predictions by fold label.
            TrainData = PredKFold[PredKFold['fold'] != fold]
            TestData = PredKFold[PredKFold['fold'] == fold]
            DataUtil.save(TrainData, TrainFile, format=self.data_format)
            DataUtil.save(TestData, TestFile, format=self.data_format)

        # The same 'fold<k>' column names exist in PredHoldout and PredSubmit.
        HoldCols = [
            col for col in PredHoldout.columns if col.startswith('fold')
        ]
        ## save for holdout data
        PredHoldout[head] = PredHoldout[HoldCols].mean(axis=1)
        HoldoutOutputDir = '%s/holdout' % self.OutputDir
        if not os.path.exists(HoldoutOutputDir):
            os.makedirs(HoldoutOutputDir)
        DataUtil.save(PredHoldout,
                      '%s/test.%s' % (HoldoutOutputDir, self.data_format),
                      format=self.data_format)
        ## save for submit data
        PredSubmit[head] = PredSubmit[HoldCols].mean(axis=1)
        SubmitOutputDir = '%s/submit' % self.OutputDir
        if not os.path.exists(SubmitOutputDir):
            os.makedirs(SubmitOutputDir)
        DataUtil.save(PredSubmit,
                      '%s/test.%s' % (SubmitOutputDir, self.data_format),
                      format=self.data_format)

        ## metric PK
        if metric_pk:
            d_metric = {}
            # Each feature column is compared directly with the target, so
            # the features are presumably predictions of earlier models --
            # confirm against the caller.
            for col in self._l_train_columns:
                diff = (HoldoutData[col] - HoldoutData['Item_Outlet_Sales'])
                rmse = np.sqrt(np.sum(diff * diff) / len(diff))
                d_metric[col] = rmse
            diff = PredHoldout[head] - PredHoldout['Item_Outlet_Sales']
            ensemble_metric = np.sqrt(np.sum(diff * diff) / len(diff))
            print('\n===== metric pk result ====\n')
            print('single model: %s, ensemble model %s: %s' %
                  (d_metric, head, ensemble_metric))
            print('\n===== metric pk result ====\n')

        return

    ## L1 fitting
    def __fit(self):
        """Fit an ``RGFRegressor`` on ``self.TrainData``.

        Sets ``self._l_train_columns`` (the feature columns actually used)
        and ``self._model`` (the fitted regressor).
        """
        start = time.time()
        ##
        id_cols = [
            col for col in self.TrainData.columns
            if (col.startswith('Item_Identifier'))
        ]
        # Build a per-call drop list.  Extending the class-level
        # `_l_drop_cols` in place would leak these columns into every other
        # instance and grow the list on each fold.
        drop_cols = list(self._l_drop_cols) + id_cols
        X = self.TrainData.drop(drop_cols, axis=1)
        Y = self.TrainData['Item_Outlet_Sales']
        ##
        self._l_train_columns = X.columns
        print('Size of feature space: %s' % len(self._l_train_columns))
        ##
        self._model = RGFRegressor(
            algorithm=self.parameters['algorithm'],
            loss=self.parameters['loss'],
            learning_rate=self.parameters['learning_rate'],
            n_iter=self.parameters['n_iter'],
            reg_depth=self.parameters['reg_depth'],
            l2=self.parameters['l2'],
            sl2=self.parameters['sl2'],
            #min_samples_leaf= self.parameters['min_samples_leaf'],
            max_leaf=self.parameters['max_leaf'],
            verbose=True)
        self._model.fit(X, Y)
        end = time.time()
        print('\nTraining is done. Time elapsed %ds' % (end - start))

        return

    ## predict
    def __predict(self):
        """Predict on ``self.TestData`` and return the RMSE against the target.

        Requires a prior call to ``__fit`` (uses ``self._model`` and
        ``self._l_train_columns``).
        """
        start = time.time()
        ##
        x_test = self.TestData[self._l_train_columns]
        pred_test = self._model.predict(x_test)
        truth_test = self.TestData['Item_Outlet_Sales']
        ## RMSE
        diff = (pred_test - truth_test)
        rmse = np.sqrt(np.sum(diff * diff) / len(diff))

        ##
        end = time.time()
        print('\n Prediction done. Time consumed %ds' % (end - start))

        return rmse
Пример #20
0
        ):  # explain for regression convert y to bins and use that for split
            dev_X, val_X = train.iloc[dev_index, :], train.iloc[val_index, :]
            dev_y, val_y = y[dev_index], y[val_index]
            dev_X = dev_X[(dev_y > lbound) & (dev_y < ubound)]
            dev_y = dev_y[(dev_y > lbound) & (dev_y < ubound)]
            val_X2 = val_X[(val_y > lbound) & (val_y < ubound)]
            val_y2 = val_y[(val_y > lbound) & (val_y < ubound)]
            print(dev_X.shape)
            rgf = RGFRegressor(max_leaf=1000,
                               algorithm="RGF_Sib",
                               test_interval=100,
                               loss="LS",
                               learning_rate=0.01,
                               verbose=False)

            model = rgf.fit(dev_X, dev_y)
            print("predicting..")
            preds = model.predict(val_X)
            oobval[val_index] += preds.reshape(-1, 1)
            valerr.append(mean_absolute_error(val_y, preds))
            print(valerr, "mean:", np.mean(valerr), "std:", np.std(valerr))
            oobtest += model.predict(test.values).reshape(-1, 1)
            val_scores.append(mean_absolute_error(model.predict(valid), yval))
            del (rgf, model)
            gc.collect()

            print(val_scores, np.mean(val_scores), "---", np.std(val_scores))

# Average the test predictions accumulated in `oobtest`; the divisor assumes
# one accumulation per (bag, fold) pair -- TODO confirm against the loops.
pred2 = oobtest / (nbag * nfold)
# NOTE(review): presumably every validation row is accumulated once per bag,
# hence the division by nbag only -- confirm against the enclosing loops.
oobpred2 = oobval / (nbag)
# MAE of the averaged out-of-fold predictions against the target `y`.
print(mean_absolute_error(y, oobpred2))
Пример #21
0
def rgf_state_prediction(state, lookback, horizon, predictors):
    """Train one RGF model per city and forecast step, and save its metrics.

    Clusters are read from ``../analysis/clusters_<state>.pkl``.  For every
    city of every cluster, lagged features are built, the data is split
    70/30 without shuffling, and a separate ``RGFRegressor`` is fitted for
    each step ``1..horizon``.  Metrics are pickled to
    ``saved_models/rgf/<state>/rgf_metrics_<city>.pkl`` and the predictions
    are plotted.  Cities whose metrics file already exists are skipped.

    Args:
        state: State identifier used in the input and output paths.
        lookback: Passed to ``build_lagged_features``.
        horizon: Number of forecast steps; one model is trained per step.
        predictors: Column selection passed to ``get_cluster_data``.

    Returns:
        None.
    """
    clusters = pd.read_pickle('../analysis/clusters_{}.pkl'.format(state))

    for cluster in clusters:
        data_full, group = get_cluster_data(geocode=cluster[0],
                                            clusters=clusters,
                                            data_types=DATA_TYPES,
                                            cols=predictors)
        for city in cluster:
            metrics_path = 'saved_models/rgf/{}/rgf_metrics_{}.pkl'.format(
                state, city)
            if os.path.isfile(metrics_path):
                print(city, 'done')
                continue

            target = 'casos_est_{}'.format(city)
            casos_est_columns = ['casos_est_{}'.format(i) for i in group]
            casos_columns = ['casos_{}'.format(i) for i in group]

            data = data_full.drop(casos_columns, axis=1)
            data_lag = build_lagged_features(data, lookback)
            # dropna() returns a new frame; the result must be kept or the
            # call has no effect.
            data_lag = data_lag.dropna()
            # targets[d] is the target series shifted so that row t holds the
            # value at t + (d - 1); the trailing rows without a value are cut.
            targets = {}
            for d in range(1, horizon + 1):
                if d == 1:
                    targets[d] = data_lag[target].shift(-(d - 1))
                else:
                    targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]

            X_data = data_lag.drop(casos_est_columns, axis=1)
            X_train, X_test, y_train, y_test = train_test_split(
                X_data,
                data_lag[target],
                train_size=0.7,
                test_size=0.3,
                shuffle=False)

            city_name = get_city_names([city, 0])[0][1]
            # NaN-initialised so that columns left unfilled after an early
            # `break` do not contain arbitrary memory.
            preds = np.full((len(data_lag), horizon), np.nan)
            metrics = pd.DataFrame(index=('mean_absolute_error',
                                          'explained_variance_score',
                                          'mean_squared_error',
                                          'mean_squared_log_error',
                                          'median_absolute_error', 'r2_score'))
            for d in range(1, horizon + 1):
                model = RGFRegressor(max_leaf=300,
                                     algorithm="RGF_Sib",
                                     test_interval=100,
                                     loss="LS",
                                     verbose=False)

                tgt = targets[d][:len(X_train)]
                tgtt = targets[d][len(X_train):]
                try:
                    model.fit(X_train, tgt)
                except ValueError as err:
                    print(
                        '-----------------------------------------------------'
                    )
                    print(city, 'ERRO', err)
                    print(
                        '-----------------------------------------------------'
                    )
                    # NOTE(review): the (partial) metrics are still saved
                    # below, so this city is reported as 'done' on the next
                    # run -- confirm that is intended.
                    break
                pred = model.predict(X_data[:len(targets[d])])

                # Pad with NaN so every column has len(data_lag) rows.
                dif = len(data_lag) - len(pred)
                if dif > 0:
                    pred = list(pred) + ([np.nan] * dif)
                preds[:, (d - 1)] = pred
                pred_m = model.predict(X_test[:(len(tgtt))])
                metrics[d] = calculate_metrics(pred_m, tgtt)

            metrics.to_pickle(metrics_path)
            plot_prediction(preds, targets[1], city_name, len(X_train))
            # plt.show()
    return None
Пример #22
0
# In[9]:
#https://www.analyticsvidhya.com/blog/2018/02/introductory-guide-regularized-greedy-forests-rgf-python/

###############Regressor#####################
from rgf.sklearn import RGFRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.utils.validation import check_random_state
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Baseline RGF regressor for the 'kda_ratio' target.
rgf = RGFRegressor(max_leaf=400,
                    algorithm="RGF_Sib",
                    test_interval=100,
                    verbose=True)

rgf.fit(train_all[features], train_all['kda_ratio'])

# Score the regressor that was just fitted.  The previous code replaced these
# predictions with `model.predict(...)`, but `model` is only bound further
# down (the grid search), so the RMSE was not the RGF model's.
valid_preds = rgf.predict(validation_all[features])
# NOTE(review): the truth comes from `validation_set` while the predictions
# use `validation_all` -- confirm both hold the same rows in the same order.
print('The rmse of prediction using validation set is:', mean_squared_error(validation_set['kda_ratio'], valid_preds) ** 0.5)

test_preds = list(rgf.predict(test_all[features]))


##Using grid search
# Candidate hyper-parameters for the RGF estimator:
# 10 max_leaf x 3 l2 x 2 min_samples_leaf = 60 combinations.
parameters = {'max_leaf':[1000,1200,1300,1400,1500,1600,1700,1800,1900,2000],
              'l2':[0.1,0.2,0.3],
              'min_samples_leaf':[5,10]}

model = GridSearchCV(estimator=rgf,