def RidgeRegressionEnsembleTest():
    """Train a ridge ensemble on Friedman #3 and dump predictions to a .mat file.

    Also fits a single cross-validated RidgeCV model as a baseline; both sets
    of test-set predictions are written to the output file for the MATLAB code.
    """
    # alternative synthetic datasets kept for reference:
    # dataset = DatasetFactory.friedman1(n_samples=200200)
    # dataset = DatasetFactory.friedman2(n_samples=200200)
    dataset = DatasetFactory.friedman3(n_samples=200200)

    # 200 training samples; the remaining ~200k samples form the test set
    X_tr, X_te, y_tr, y_te = model_selection.train_test_split(
        dataset.data, dataset.target, random_state=0, train_size=200)

    # ensemble of ridge regressors, every one trained on all 200 samples
    # (full overlap)
    ensemble = EnsembleRegressor(type='ridge')
    ensemble.fit(X_tr, y_tr, samples_per_regressor=200, regressor_overlap=200)

    # single RidgeCV baseline over a small alpha grid
    ridgecv = linear_model.RidgeCV(alphas=np.arange(.1, 1, .2),
                                   fit_intercept=True,
                                   normalize=True)
    ridgecv.fit(X_tr, y_tr)
    y_ridgecv = ridgecv.predict(X_te)

    Z = ensemble.predict(X_te)
    sio.savemat(
        'RidgeRegression_Friedman3_200k.mat', {
            'names': ensemble.regressor_labels,
            'Z': Z,
            'y': y_te,
            # 'Ztrain': Z_train,
            'ytrain': y_tr,
            'y_RidgeCV': y_ridgecv,
            'samples_per_regressor': 200,
            'regressor_samples_overlap': 200,
            'Ey': np.mean(y_te),
            'Ey2': np.mean(y_te**2),
            'Description': 'Ridge Regression (Friedman #3)'
        })
def UnequalMLPsEnsembleTest():
    """Train an 'auto_large' ensemble on Friedman #3 and save train/test
    prediction matrices to a .mat file for the MATLAB analysis code.

    Fixes:
      * ``sklearn.cross_validation`` was removed in scikit-learn 0.20; use
        ``sklearn.model_selection`` (as the other tests in this file do).
      * The output path used a literal backslash (``...\\Different...``),
        which is an invalid escape sequence (SyntaxWarning since Python 3.6)
        and non-portable; build it with ``path.join`` instead, matching the
        other real-dataset tests in this file.
    """
    # alternative synthetic datasets kept for reference:
    # dataset = DatasetFactory.friedman1(n_samples=200200)
    # dataset = DatasetFactory.friedman2(n_samples=200200)
    dataset = DatasetFactory.friedman3(n_samples=200200)

    # 200 training samples; the rest (~200k) are held out for evaluation
    Xtrain, X, ytrain, y = model_selection.train_test_split(
        dataset.data, dataset.target, random_state=0, train_size=200)

    ensemble = EnsembleRegressor(type='auto_large')
    # every regressor sees all 200 samples (full overlap)
    ensemble.fit(Xtrain, ytrain, samples_per_regressor=200, regressor_overlap=200)

    Ztrain = ensemble.predict(Xtrain)
    Z = ensemble.predict(X)
    sio.savemat(
        path.join('ManualEnsembleDatasets', 'DifferentRegressors_Friedman3.mat'), {
            'names': ensemble.regressor_labels,
            'Z': Z,
            'y': y,
            'Ztrain': Ztrain,
            'ytrain': ytrain,
            'samples_per_regressor': 200,
            'regressor_samples_overlap': 200,
            'Ey': np.mean(y),
            'Ey2': np.mean(y ** 2),
            'Description': 'Different Regressors (Friedman #3)'
        })
def UnequalMLPsEnsembleTest():
    """Train an 'auto_large' ensemble on Friedman #3 and save train/test
    prediction matrices to a .mat file for the MATLAB analysis code.

    Fix: the output path used a literal backslash (``...\\Different...``),
    which is an invalid escape sequence (SyntaxWarning since Python 3.6) and
    non-portable; build it with ``path.join`` instead, matching the other
    real-dataset tests in this file.
    """
    # alternative synthetic datasets kept for reference:
    # dataset = DatasetFactory.friedman1(n_samples=200200)
    # dataset = DatasetFactory.friedman2(n_samples=200200)
    dataset = DatasetFactory.friedman3(n_samples=200200)

    # 200 training samples; the rest (~200k) are held out for evaluation
    Xtrain, X, ytrain, y = model_selection.train_test_split(
        dataset.data, dataset.target, random_state=0, train_size=200)

    ensemble = EnsembleRegressor(type='auto_large')
    # every regressor sees all 200 samples (full overlap)
    ensemble.fit(Xtrain, ytrain, samples_per_regressor=200, regressor_overlap=200)

    Ztrain = ensemble.predict(Xtrain)
    Z = ensemble.predict(X)
    sio.savemat(
        path.join('ManualEnsembleDatasets', 'DifferentRegressors_Friedman3.mat'), {
            'names': ensemble.regressor_labels,
            'Z': Z,
            'y': y,
            'Ztrain': Ztrain,
            'ytrain': ytrain,
            'samples_per_regressor': 200,
            'regressor_samples_overlap': 200,
            'Ey': np.mean(y),
            'Ey2': np.mean(y**2),
            'Description': 'Different Regressors (Friedman #3)'
        })
def RealDatasetsManualEnsembleTest():
    """Run the 'auto' ensemble over every dataset in ``dataset_list`` and save
    train/test prediction matrices to per-dataset .mat files.

    Fixes:
      * ``dict.iteritems()`` is Python 2 only (this file already uses the
        Python 3 ``print(..., end="")``) -- use ``.items()``.
      * ``name is 'blog_feedback'`` tests identity, not equality -- use ``==``.
      * ``sklearn.cross_validation`` was removed in 0.20 -- use
        ``sklearn.model_selection``.
      * The saved 'samples_per_regressor'/'regressor_samples_overlap' fields
        recorded ``train_size`` instead of the values actually passed to
        ``fit`` (200 and 0); record the real parameters.
    """
    for name, func in dataset_list.items():
        print(name + ":", end="")
        dataset = func()
        print(" X.shape = " + str(dataset.data.shape))

        ensemble = EnsembleRegressor(type='auto', verbose=True)  # auto_large

        if name == 'blog_feedback':
            continue
            # samples_per_regressor = 2810
            # overlap = 2810
            # train_size = 2810
        else:
            samples_per_regressor = 200
            overlap = 0

        # each regressor gets its own disjoint 200-sample slice of the
        # training set
        train_size = samples_per_regressor * ensemble.regressor_count
        if len(dataset.target) < train_size + 500:  # ignore datasets that leave too little test data
            continue
        # if dataset.data.shape[1] < 5:  # ignore datasets with less than 5 covariates
        #     continue

        Xtrain, X, ytrain, y = model_selection.train_test_split(
            dataset.data, dataset.target, random_state=0, train_size=train_size)

        ensemble.fit(Xtrain, ytrain,
                     samples_per_regressor=samples_per_regressor,
                     regressor_overlap=overlap)
        Ztrain = ensemble.predict(Xtrain)
        Z = ensemble.predict(X)
        sio.savemat(
            path.join('ManualEnsembleDatasets', name + '.mat'), {
                'names': ensemble.regressor_labels,
                'Z': Z,
                'y': y,
                'Ztrain': Ztrain,
                'ytrain': ytrain,
                'samples_per_regressor': samples_per_regressor,
                'regressor_samples_overlap': overlap,
                'Ey': np.mean(y),
                'Ey2': np.mean(y ** 2),
                'Description': ('Different Regressors (%s)' % name)
            })
def RealDatasetsLargeMLPEnsembleTest():
    """Run the 'mlp_large' ensemble over every dataset in ``dataset_list`` and
    save train/test prediction matrices to per-dataset .mat files.

    Fixes:
      * ``dict.iteritems()`` is Python 2 only -- use ``.items()``.
      * ``name is '<str>'`` tests identity, not equality -- use ``==``.
      * ``sklearn.cross_validation`` was removed in 0.20 -- use
        ``sklearn.model_selection``.
    """
    for name, func in dataset_list.items():
        print(name)
        dataset = func()
        if len(dataset.target) < 5500:  # ignore datasets with less than 6000 samples
            continue
        if dataset.data.shape[1] < 5:  # ignore datasets with less than 5 covariates
            continue

        if name == 'blog_feedback':
            train_size = 10000
        else:
            train_size = 500

        Xtrain, X, ytrain, y = model_selection.train_test_split(
            dataset.data, dataset.target, random_state=0, train_size=train_size)

        if name == 'affairs':  # categorical target; skipped for regression
            # ytrain, y = [np_utils.to_categorical(x) for x in (ytrain, y)]
            continue

        ensemble = EnsembleRegressor(type='mlp_large', verbose=True)
        # every MLP is trained on the full training set (full overlap)
        ensemble.fit(Xtrain, ytrain,
                     samples_per_regressor=train_size,
                     regressor_overlap=train_size)
        Ztrain = ensemble.predict(Xtrain)
        Z = ensemble.predict(X)
        sio.savemat(
            path.join('ManualEnsembleDatasets', name + '_10mlp.mat'), {
                'names': ensemble.regressor_labels,
                'Z': Z,
                'y': y,
                'Ztrain': Ztrain,
                'ytrain': ytrain,
                'samples_per_regressor': train_size,
                'regressor_samples_overlap': train_size,
                'Ey': np.mean(y),
                'Ey2': np.mean(y ** 2),
                'Description': ('Different Regressors (%s)' % name)
            })
def RidgeRegressionEnsembleTest():
    """Train a ridge ensemble on Friedman #3, compare against a single
    cross-validated RidgeCV baseline, and save predictions to a .mat file.

    Fix: ``sklearn.cross_validation`` was removed in scikit-learn 0.20; use
    ``sklearn.model_selection`` (as the other tests in this file do).
    """
    # alternative synthetic datasets kept for reference:
    # dataset = DatasetFactory.friedman1(n_samples=200200)
    # dataset = DatasetFactory.friedman2(n_samples=200200)
    dataset = DatasetFactory.friedman3(n_samples=200200)

    # 200 training samples; the remaining ~200k samples form the test set
    Xtrain, X, ytrain, y = model_selection.train_test_split(
        dataset.data, dataset.target, random_state=0, train_size=200)

    # ensemble of ridge regressors, each trained on all 200 samples
    ensemble = EnsembleRegressor(type='ridge')
    ensemble.fit(Xtrain, ytrain, samples_per_regressor=200, regressor_overlap=200)

    # single RidgeCV baseline over a small alpha grid
    ridgecv = linear_model.RidgeCV(alphas=np.arange(.1, 1, .2),
                                   fit_intercept=True, normalize=True)
    ridgecv.fit(Xtrain, ytrain)
    y_ridgecv = ridgecv.predict(X)

    Z = ensemble.predict(X)
    sio.savemat(
        'RidgeRegression_Friedman3_200k.mat', {
            'names': ensemble.regressor_labels,
            'Z': Z,
            'y': y,
            # 'Ztrain': Z_train,
            'ytrain': ytrain,
            'y_RidgeCV': y_ridgecv,
            'samples_per_regressor': 200,
            'regressor_samples_overlap': 200,
            'Ey': np.mean(y),
            'Ey2': np.mean(y ** 2),
            'Description': 'Ridge Regression (Friedman #3)'
        })
def RealDatasetsManualEnsembleTest():
    """Run the 'auto' ensemble over every dataset in ``dataset_list`` and save
    train/test prediction matrices to per-dataset .mat files.

    Fixes:
      * ``dict.iteritems()`` is Python 2 only (this file already uses the
        Python 3 ``print(..., end="")``) -- use ``.items()``.
      * ``name is 'blog_feedback'`` tests identity, not equality -- use ``==``.
      * The saved 'samples_per_regressor'/'regressor_samples_overlap' fields
        recorded ``train_size`` instead of the values actually passed to
        ``fit`` (200 and 0); record the real parameters.
    """
    for name, func in dataset_list.items():
        print(name + ":", end="")
        dataset = func()
        print(" X.shape = " + str(dataset.data.shape))

        ensemble = EnsembleRegressor(type='auto', verbose=True)  # auto_large

        if name == 'blog_feedback':
            continue
            # samples_per_regressor = 2810
            # overlap = 2810
            # train_size = 2810
        else:
            samples_per_regressor = 200
            overlap = 0

        # each regressor gets its own disjoint 200-sample slice of the
        # training set
        train_size = samples_per_regressor * ensemble.regressor_count
        if len(dataset.target) < train_size + 500:  # ignore datasets that leave too little test data
            continue
        # if dataset.data.shape[1] < 5:  # ignore datasets with less than 5 covariates
        #     continue

        Xtrain, X, ytrain, y = model_selection.train_test_split(
            dataset.data, dataset.target, random_state=0, train_size=train_size)

        ensemble.fit(Xtrain, ytrain,
                     samples_per_regressor=samples_per_regressor,
                     regressor_overlap=overlap)
        Ztrain = ensemble.predict(Xtrain)
        Z = ensemble.predict(X)
        sio.savemat(
            path.join('ManualEnsembleDatasets', name + '.mat'), {
                'names': ensemble.regressor_labels,
                'Z': Z,
                'y': y,
                'Ztrain': Ztrain,
                'ytrain': ytrain,
                'samples_per_regressor': samples_per_regressor,
                'regressor_samples_overlap': overlap,
                'Ey': np.mean(y),
                'Ey2': np.mean(y**2),
                'Description': ('Different Regressors (%s)' % name)
            })
def RealDatasetsLargeMLPEnsembleTest():
    """Run the 'mlp_large' ensemble over every dataset in ``dataset_list`` and
    save train/test prediction matrices to per-dataset .mat files.

    Fixes:
      * ``dict.iteritems()`` is Python 2 only -- use ``.items()``.
      * ``name is '<str>'`` tests identity, not equality -- use ``==``.
    """
    for name, func in dataset_list.items():
        print(name)
        dataset = func()
        if len(dataset.target) < 5500:  # ignore datasets with less than 6000 samples
            continue
        if dataset.data.shape[1] < 5:  # ignore datasets with less than 5 covariates
            continue

        if name == 'blog_feedback':
            train_size = 10000
        else:
            train_size = 500

        Xtrain, X, ytrain, y = model_selection.train_test_split(
            dataset.data, dataset.target, random_state=0, train_size=train_size)

        if name == 'affairs':  # categorical target; skipped for regression
            # ytrain, y = [np_utils.to_categorical(x) for x in (ytrain, y)]
            continue

        ensemble = EnsembleRegressor(type='mlp_large', verbose=True)
        # every MLP is trained on the full training set (full overlap)
        ensemble.fit(Xtrain, ytrain,
                     samples_per_regressor=train_size,
                     regressor_overlap=train_size)
        Ztrain = ensemble.predict(Xtrain)
        Z = ensemble.predict(X)
        sio.savemat(
            path.join('ManualEnsembleDatasets', name + '_10mlp.mat'), {
                'names': ensemble.regressor_labels,
                'Z': Z,
                'y': y,
                'Ztrain': Ztrain,
                'ytrain': ytrain,
                'samples_per_regressor': train_size,
                'regressor_samples_overlap': train_size,
                'Ey': np.mean(y),
                'Ey2': np.mean(y**2),
                'Description': ('Different Regressors (%s)' % name)
            })
est_y.extend(estimator.predict(X_test2).tolist()) true_y.extend(y_test2.tolist()) return bias_variance(est_y, true_y) df = pd.DataFrame(boston.data, columns=boston.feature_names) df['target'] = boston.target X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target, test_size=0.2) print X_train.shape, X_test.shape estimators = [ { "context": EnsembleRegressor(), "tuned_parameters": [], "name": "EnsembleRegressor" }, { "context": BaggingRegressor(tree.DecisionTreeRegressor(max_depth=12), max_samples=0.9, max_features=0.5, n_estimators=50), "tuned_parameters": [], "name": "Bagging" }, { "context":
def make_large_ensemble(dataset, mat_filename='large_ensemble.mat'):
    """ construct_ensemble splits the dataset into train and 'test'. The ensemble regressors are trained on the training set.
        The test set is saved to the mat file to be used by matlab code.
    :param dataset: a dataset object created by DatasetFactory
    :param mat_filename: name of the mat file to save the results to
    # :param ensemble_type: 'auto' or 'mlp' for the choice of regressors in the ensemble
    # :param train_size: proportion or number of samples used for training (defaults to 25% for n>20,000, otherwise 50%)
    # :param test_size: proportion or number of samples used for testing (defaults to 75% for n>20,000, and 50% otherwise)
    # :param samples_per_regressor: Number of samples from X_train that each regressor will be trained on.
    #                               Default 'None' will cause all regressors to be trained on all samples.
    # :param overlap: this is the number of samples overlapping for every adjacent pair of regressors.
    #                 Defaults to no overlap if there are at least 100 samples per regressor, else full overlap (overlap=n).
    # :param plotting: plots results
    # :param ensemble_train_size: The number of samples to output for training supervised ensemble methods
    # :param scale_data: boolean, if True the data will be scaled to mean-centered and variance 1.
    # :param Description: The text that will be written in the 'Description' field in the output file

    :returns: a pandas DataFrame with one row per regressor holding the
        normalized train/validation MSEs and R^2 scores.
    """
    # Init
    # Hard-coded here (the commented-out params above suggest this was once
    # configurable like make_ensemble).
    ensemble_type = 'auto_large'  # 'mlp_different' # 'mlp_large'
    ######################################################################

    (n_samples, n_features) = dataset.data.shape
    ensemble = EnsembleRegressor(verbose=False, type=ensemble_type)
    m = ensemble.regressor_count

    # Training-set size grows with dimensionality.
    train_size = np.max([200, 100 * dataset.data.shape[1]
                         ])  # at least 100 samples per dimension (n/p >= 100)
    n = n_samples - train_size  # everything not used for training goes to validation
    n_train = 200  # taken out of val_size
    samples_per_regressor = train_size  #//m
    overlap = samples_per_regressor  # 0

    # scale data
    dataset.data = preprocessing.scale(dataset.data)

    # split to train / validation
    X_train, X_val, y_train, y_val = model_selection.train_test_split(
        dataset.data,
        dataset.target,
        random_state=0,
        train_size=train_size,
        test_size=n)

    msg = "features=%s, n_tot=%d, training each regressors on %d, n=%d, n_train=%d, m=%d" % \
        (n_features, n_samples, samples_per_regressor, n, n_train, m)
    print(msg)

    # NOTE(review): samples_per_regressor/overlap are computed above but not
    # passed to fit -- every regressor is trained on all of X_train.
    ensemble.fit(
        X_train, y_train
    )  # full overlap #, samples_per_regressor=samples_per_regressor, regressor_overlap=overlap)

    # Per-regressor diagnostics; MSEs are normalized by Var(y) so 1.0 means
    # "no better than predicting the mean".
    scores_train = ensemble.score(X_train, y_train)
    MSEs_train = ensemble.mean_squared_error(X_train, y_train) / np.var(y_train)
    scores_val = ensemble.score(X_val, y_val)
    MSEs_val = ensemble.mean_squared_error(X_val, y_val) / np.var(y_val)
    for i, regr in enumerate(ensemble.regressors):
        print('## ' + str(i) + '. ' + regr.__class__.__name__ + ':')
        print(regr)
        print('\tMSE/Var(Y): %.2f/%.2f' % (MSEs_train[i], MSEs_val[i]))
        print('\tVariance score (R^2): %.2f/%.2f\n' %
              (scores_train[i], scores_val[i]))

    # create predictions matrix on the test set
    Zval = ensemble.predict(X_val)

    # Set aside n_train samples as a training set for the supervised ensemble learners
    Z_train, Z, y_ensemble_train, y_ensemble_test = \
        model_selection.train_test_split(Zval.T, y_val, random_state=42,
                                         train_size=n_train)
    Z_train = Z_train.T
    Z = Z.T

    # Add Description if none
    # str(locals()) embeds the current local variables for provenance.
    Description = "%s was generated with %s regressors of type %s:\n%s" % \
        (mat_filename, msg, ensemble_type, str(locals()))

    sio.savemat(
        mat_filename, {
            'names': ensemble.regressor_labels,
            'Z': Z,
            'y': y_ensemble_test,
            'Ztrain': Z_train,
            'ytrain': y_ensemble_train,
            'samples_per_regressor': samples_per_regressor,
            'regressor_samples_overlap': overlap,
            'Ey': np.mean(y_ensemble_test),  # np.mean(dataset.target),
            'Ey2': np.mean(y_ensemble_test**2),  # np.mean(dataset.target ** 2)
            'Description': Description
        })

    results_df = pd.DataFrame({
        'i': range(1, 1 + len(MSEs_train)),
        'MSE_train': MSEs_train,
        'MSE_val': MSEs_val,
        'R2_train': scores_train,
        'R2_val': scores_val
    })
    return results_df
def make_ensemble(dataset,
                  mat_filename='ensemble.mat',
                  ensemble_type='auto',
                  train_size=None,
                  test_size=None,
                  samples_per_regressor=None,
                  overlap=None,
                  plotting=True,
                  Description=None,
                  scale_data=False,
                  ensemble_train_size=200):
    """ construct_ensemble splits the dataset into train and 'test'. The ensemble regressors are trained on the training set.
        The test set is saved to the mat file to be used by matlab code.
    :param dataset: a dataset object created by DatasetFactory
    :param mat_filename: name of the mat file to save the results to
    :param ensemble_type: 'auto' or 'mlp' for the choice of regressors in the ensemble
    :param train_size: proportion or number of samples used for training (defaults to 25% for n>20,000, otherwise 50%)
    :param test_size: proportion or number of samples used for testing (defaults to 75% for n>20,000, and 50% otherwise)
    :param samples_per_regressor: Number of samples from X_train that each regressor will be trained on.
                                  Default 'None' will cause all regressors to be trained on all samples.
    :param overlap: this is the number of samples overlapping for every adjacent pair of regressors.
                    Defaults to no overlap if there are at least 100 samples per regressor, else full overlap (overlap=n).
    :param plotting: plots results
    :param ensemble_train_size: The number of samples to output for training supervised ensemble methods
    :param scale_data: boolean, if True the data will be scaled to mean-centered and variance 1.
    :param Description: The text that will be written in the 'Description' field in the output file
    """
    if scale_data:
        dataset.data = preprocessing.scale(dataset.data)

    # Default split sizes depend on dataset size (see docstring).
    if (train_size is None) and (test_size is None):
        if len(dataset.target) < 20000:
            (test_size, train_size) = (0.75, 0.25)
        else:
            (test_size, train_size) = (0.5, 0.5)

    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        dataset.data,  # preprocessing.scale(dataset.data)
        dataset.target,
        random_state=0,
        test_size=test_size,
        train_size=train_size)

    # Prepare ensemble regressors
    ensemble = EnsembleRegressor(verbose=False, type=ensemble_type)

    n = len(y_train)
    m = ensemble.regressor_count

    # decide on how many samples per regressor and what's the overlap between regressors
    if samples_per_regressor and (overlap is not None):
        pass  # both were defined by caller
    elif (overlap is not None) and (samples_per_regressor is None):
        samples_per_regressor = (
            n // m) + overlap  # '//' is python operator for floor of n/m
    else:  # both are None or only samples_per_regressor was given
        if n < m * 100:  # reserve at least 100 samples for training the individual regressors
            overlap = n
            samples_per_regressor = (samples_per_regressor or n)
        else:  # we have enough samples to be training on different parts of the dataset
            overlap = 0
            samples_per_regressor = (samples_per_regressor or n // m)

    # NOTE(review): this identity only makes sense when train_size was passed
    # as an absolute sample count; with the default fractional sizes (e.g.
    # 0.25) it compares a proportion to a sample count -- confirm callers pass
    # absolute sizes whenever they also pass samples_per_regressor/overlap.
    assert train_size == (samples_per_regressor * m) - overlap * (m - 1), "inconsistent parameters"

    print("Training set size: %d with %d attributes" % X_train.shape)
    print("Each regressor is trained on %d samples" % samples_per_regressor)
    print("Test set size: %d" % len(y_test))

    ensemble.fit(X_train,
                 y_train,
                 samples_per_regressor=samples_per_regressor,
                 regressor_overlap=overlap)

    # Per-regressor training diagnostics.
    scores = ensemble.score(X_train, y_train)
    MSEs = ensemble.mean_squared_error(X_train, y_train)
    for i, regr in enumerate(ensemble.regressors):
        print('## ' + str(i) + '. ' + regr.__class__.__name__ + ':')
        print(regr)
        print('\tMSE: %.2f' % MSEs[i])
        print('\tVariance score (R^2): %.2f\n' % scores[i])

    # create predictions matrix on the test set
    Z = ensemble.predict(X_test)

    # Set aside 200 samples as a training set for the supervised ensemble learners
    Z_train, Z, y_ensemble_train, y_ensemble_test = \
        model_selection.train_test_split(Z.T, y_test, random_state=0,
                                         train_size=ensemble_train_size)
    Z_train = Z_train.T
    Z = Z.T

    # Add Description if none
    # str(locals()) embeds the current local variables for provenance.
    if not Description:
        Description = "%s was generated with %d samples and %d regressors of type %s:\n%s" % \
            (mat_filename, n, m, ensemble_type, str(locals()))

    sio.savemat(
        mat_filename, {
            'names': ensemble.regressor_labels,
            'Z': Z,
            'y': y_ensemble_test,
            'Ztrain': Z_train,
            'ytrain': y_ensemble_train,
            'samples_per_regressor': samples_per_regressor,
            'regressor_samples_overlap': overlap,
            'Ey': np.mean(y_ensemble_test),  # np.mean(dataset.target),
            'Ey2': np.mean(y_ensemble_test**2),  # np.mean(dataset.target ** 2)
            'Description': Description
        })

    if plotting:
        plot_regression_results(ensemble, Z, y_ensemble_test)
        plot_y_e_correlation(ensemble, Z, y_ensemble_test)
def make_ensemble(dataset,
                  mat_filename='ensemble.mat',
                  ensemble_type='auto',
                  train_size=None,
                  test_size=None,
                  samples_per_regressor=None,
                  overlap=None,
                  plotting=True,
                  Description=None,
                  scale_data=False,
                  ensemble_train_size=200):
    """ construct_ensemble splits the dataset into train and 'test'. The ensemble regressors are trained on the training set.
        The test set is saved to the mat file to be used by matlab code.
    :param dataset: a dataset object created by DatasetFactory
    :param mat_filename: name of the mat file to save the results to
    :param ensemble_type: 'auto' or 'mlp' for the choice of regressors in the ensemble
    :param train_size: proportion or number of samples used for training (defaults to 25% for n>20,000, otherwise 50%)
    :param test_size: proportion or number of samples used for testing (defaults to 75% for n>20,000, and 50% otherwise)
    :param samples_per_regressor: Number of samples from X_train that each regressor will be trained on.
                                  Default 'None' will cause all regressors to be trained on all samples.
    :param overlap: this is the number of samples overlapping for every adjacent pair of regressors.
                    Defaults to no overlap if there are at least 100 samples per regressor, else full overlap (overlap=n).
    :param plotting: plots results
    :param ensemble_train_size: The number of samples to output for training supervised ensemble methods
    :param scale_data: boolean, if True the data will be scaled to mean-centered and variance 1.
    :param Description: The text that will be written in the 'Description' field in the output file

    Fixes:
      * ``sklearn.cross_validation`` was removed in 0.20 -- use
        ``sklearn.model_selection``.
      * ``if samples_per_regressor and overlap:`` treated an explicit
        ``overlap=0`` (disjoint training sets) as "not given"; test with
        ``is not None`` instead.
      * ``ensemble_train_size`` added (default 200, backward compatible)
        instead of the hard-coded 200 in the second split.
    """
    if scale_data:
        dataset.data = preprocessing.scale(dataset.data)

    # Default split sizes depend on dataset size (see docstring).
    if (train_size is None) and (test_size is None):
        if len(dataset.target) < 20000:
            (test_size, train_size) = (0.75, 0.25)
        else:
            (test_size, train_size) = (0.5, 0.5)

    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        dataset.data,  # preprocessing.scale(dataset.data)
        dataset.target,
        random_state=0,
        test_size=test_size,
        train_size=train_size)

    # Prepare ensemble regressors
    ensemble = EnsembleRegressor(verbose=False, type=ensemble_type)

    n = len(y_train)
    m = ensemble.regressor_count

    # decide on how many samples per regressor and what's the overlap between
    # regressors; 'is not None' so that an explicit overlap=0 is honored
    if samples_per_regressor and (overlap is not None):
        pass  # both were defined by caller
    elif (overlap is not None) and (samples_per_regressor is None):
        samples_per_regressor = (
            n // m) + overlap  # '//' is python operator for floor of n/m
    else:  # both are None or only samples_per_regressor was given
        if n < m * 100:  # reserve at least 100 samples for training the individual regressors
            overlap = n
            samples_per_regressor = (samples_per_regressor or n)
        else:  # we have enough samples to be training on different parts of the dataset
            overlap = 0
            samples_per_regressor = (samples_per_regressor or n // m)

    print("Training set size: %d with %d attributes" % X_train.shape)
    print("Each regressor is trained on %d samples" % samples_per_regressor)
    print("Test set size: %d" % len(y_test))

    ensemble.fit(X_train,
                 y_train,
                 samples_per_regressor=samples_per_regressor,
                 regressor_overlap=overlap)

    # Per-regressor training diagnostics.
    scores = ensemble.score(X_train, y_train)
    MSEs = ensemble.mean_squared_error(X_train, y_train)
    for i, regr in enumerate(ensemble.regressors):
        print('## ' + str(i) + '. ' + regr.__class__.__name__ + ':')
        print(regr)
        print('\tMSE: %.2f' % MSEs[i])
        print('\tVariance score (R^2): %.2f\n' % scores[i])

    # create predictions matrix on the test set
    Z = ensemble.predict(X_test)

    # Set aside ensemble_train_size samples as a training set for the
    # supervised ensemble learners
    Z_train, Z, y_ensemble_train, y_ensemble_test = \
        model_selection.train_test_split(Z.T, y_test, random_state=0,
                                         train_size=ensemble_train_size)
    Z_train = Z_train.T
    Z = Z.T

    # Add Description if none
    if not Description:
        Description = "%s was generated with %d samples and %d regressors of type %s:\n%s" % \
            (mat_filename, n, m, ensemble_type, str(locals()))

    sio.savemat(
        mat_filename, {
            'names': ensemble.regressor_labels,
            'Z': Z,
            'y': y_ensemble_test,
            'Ztrain': Z_train,
            'ytrain': y_ensemble_train,
            'samples_per_regressor': samples_per_regressor,
            'regressor_samples_overlap': overlap,
            'Ey': np.mean(y_ensemble_test),  # np.mean(dataset.target),
            'Ey2': np.mean(y_ensemble_test ** 2),  # np.mean(dataset.target ** 2)
            'Description': Description
        })

    if plotting:
        plot_regression_results(ensemble, Z, y_ensemble_test)
        plot_y_e_correlation(ensemble, Z, y_ensemble_test)
def run(self, train_data_path):
    """Fit, cross-validate, and persist the final ensemble model.

    :param train_data_path: path to the training data CSV file.

    Builds an ensemble of three XGBoost and two LightGBM regressors,
    cross-validates it on the training data, and saves the fitted model
    via finalize_and_save.
    """
    # Load training data and select the predictor columns.
    train_df = pd.read_csv(train_data_path)
    features = self.get_predictors(train_df)

    base_learners = [
        # Model I
        XGBRegressor(learning_rate=0.06, n_estimators=1000, max_depth=2,
                     min_child_weight=2, gamma=0, subsample=0.4,
                     colsample_bytree=0.2, objective='reg:linear',
                     nthread=-1, scale_pos_weight=1, seed=27,
                     reg_alpha=77),
        # Model II
        XGBRegressor(learning_rate=0.04, n_estimators=1500, max_depth=2,
                     min_child_weight=0, gamma=0, subsample=0.7,
                     colsample_bytree=0.2, objective='reg:linear',
                     nthread=-1, scale_pos_weight=1, seed=99,
                     reg_alpha=1.7),
        # Model III
        XGBRegressor(learning_rate=0.02, n_estimators=1200, max_depth=3,
                     min_child_weight=2, gamma=0, subsample=0.65,
                     colsample_bytree=0.2, objective='reg:linear',
                     nthread=-1, scale_pos_weight=1, seed=585,
                     reg_alpha=5000),
        # Model IV
        LGBMRegressor(objective='regression', num_leaves=4,
                      min_data_in_leaf=5, learning_rate=0.02,
                      n_estimators=3000, max_bin=320, bagging_fraction=0.85,
                      bagging_freq=10, bagging_seed=9, feature_fraction=0.2,
                      feature_fraction_seed=9, data_random_seed=9,
                      reg_alpha=0.55, reg_lambda=0.3, verbose=-1),
        # Model V
        LGBMRegressor(objective='regression', num_leaves=4,
                      min_data_in_leaf=3, learning_rate=0.01,
                      n_estimators=4000, max_bin=295, bagging_fraction=0.5,
                      bagging_freq=10, bagging_seed=24, feature_fraction=0.2,
                      feature_fraction_seed=24, data_random_seed=24,
                      reg_alpha=10, reg_lambda=0.7, verbose=-1),
    ]

    # Ensembling all the five models
    ens_model = EnsembleRegressor(base_learners)

    # Performs cross validation on the ensembled model.
    self.cross_validate(cv=5,
                        model=ens_model,
                        X=train_df[features],
                        y=train_df[self.target_var],
                        n_jobs=1)
    #CV Score is: 0.92528287952747 all predictors

    # Saving the final model.
    self.finalize_and_save(ens_model, self.output_file_path,
                           train_df[features], train_df[self.target_var])
bagging_seed=24, feature_fraction=0.2, feature_fraction_seed=24, data_random_seed=24, reg_alpha=10, reg_lambda=0.7, verbose=-1) #CV Score is: 0.9243765697929301 ####################################################################### """ ENSEMBLING """ xgb_ens = EnsembleRegressor([xgboost,xgboost2, xgboost3]) #CV Score is: 0.9246359450211432 xgb_ens = EnsembleRegressor([xgboost,xgboost2, xgboost3, lightgbm]) #CV Score is: 0.9249748684043093 xgb_ens = EnsembleRegressor([xgboost,xgboost2, xgboost3, lightgbm, lightgbm2]) #CV Score is: 0.92528287952747 #CV Score is: 0.9253181909342896 ###################################################################### """ CROSS VALIDATION""" ms =ModelSelector() ms.cross_validate(cv=5,model=xgb_ens,X=dtrain.drop(['SalePrice'], axis=1)[predictor_vars], y=dtrain['SalePrice'], n_jobs = 1) #CV Score is: 0.92528287952747 all predictor variables #Using feature importance to check for improvement.