def bagging_regressor(self, data):
    train, validacion = data
    x_tr, y_tr = train
    x_val, y_val = validacion
    # print("The training set has {} rows and {} columns".format(x_tr.shape[0], x_tr.shape[1]))
    # print("The validation set has {} rows and {} columns".format(x_val.shape[0], x_val.shape[1]))
    print('Start training BaggingRegressor...')
    start_time = self.timer()
    bg = BaggingRegressor(oob_score=True, verbose=1)
    bg.fit(x_tr, y_tr)
    print("The R2 is: {}".format(bg.score(x_tr, y_tr)))
    # print("The alpha chosen by CV is: {}".format(krrl.alpha_))
    self.timer(start_time)

    print("Making prediction on validation data")
    # the target was log1p-transformed, so invert it before scoring
    y_val = np.expm1(y_val)
    y_val_pred = np.expm1(bg.predict(x_val))
    mae = mean_absolute_error(y_val, y_val_pred)
    print("The mean absolute error is {}".format(mae))

    print('Saving model into a pickle')
    try:
        os.mkdir('pickles')
    except OSError:
        pass
    with open('pickles/bg.pkl', 'wb') as f:
        pickle.dump(bg, f)

    print('Making prediction and saving into a csv')
    y_test = bg.predict(self.x_test)
    return y_test
def model_fit_rf_bagging():
    def in_limits(x):
        # relevance scores are bounded to [1, 3]
        if x < 1:
            return 1
        if x > 3:
            return 3
        return x

    print("STARTING MODEL")
    X = full_data[['count_words', 'count_digits', 'match_d_title', 'match_d_description',
                   'match_w_title', 'match_w_description', 'match_d_attribute', 'match_w_attribute']].values
    y = full_data['relevance'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
    clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    in_limits = np.vectorize(in_limits, otypes=[float])
    y_pred = in_limits(y_pred)
    RMSE = mean_squared_error(y_test, y_pred) ** 0.5
    print("RMSE: ", RMSE)

    # for the submission
    real_X_test = real_full_test[['count_words', 'count_digits', 'match_d_title', 'match_d_description',
                                  'match_w_title', 'match_w_description', 'match_d_attribute', 'match_w_attribute']].values
    test_pred = clf.predict(real_X_test)
    test_pred = in_limits(test_pred)
    return test_pred
class LinReg: lr = None sc = None X_train = None y_train = None def __init__(self): pass def train(self, X_train, y_train, model_num): # Scale features self.sc = StandardScaler() self.X_train = self.sc.fit_transform(X_train) self.y_train = y_train base_model = Ridge(fit_intercept=True) self.lr = BaggingRegressor(base_estimator=base_model, n_estimators=model_num).fit( self.X_train, self.y_train) def predict(self, x_test, retstd=True): x_pred = self.sc.transform(x_test) if retstd is False: return self.lr.predict(x_pred) error = [] for x in range(len(x_pred)): preds = [] for pred in self.lr.estimators_: preds.append(pred.predict([x_pred[x]])[0]) error.append(statistics.stdev(preds)) error = np.array(error) return self.lr.predict(x_pred), error
class MFBaggingRegressorStacked(MFModel):
    """Stacked BaggingRegressor multi-fidelity predictor."""

    def __init__(self, **args):
        """Init model."""
        self.model_lf = BaggingRegressor(**copy.deepcopy(args))
        self.model_hf = BaggingRegressor(**copy.deepcopy(args))

    def fit(self, X_train_lf, y_train_lf, X_train_hf, y_train_hf):
        """Fit low- and high-fidelity samples using a stacking scheme:
        the low-fidelity prediction is appended as an extra feature for
        the high-fidelity BaggingRegressor."""
        self.model_lf.fit(X_train_lf, y_train_lf)
        X_train_hf = np.hstack(
            (X_train_hf, self.model_lf.predict(X_train_hf).reshape(-1, 1)))
        self.model_hf.fit(X_train_hf, y_train_hf)

    def predict_hf(self, X):
        """Predict high-fidelity values."""
        y_pred_lf = self.model_lf.predict(X)
        X = np.hstack((X, y_pred_lf.reshape(-1, 1)))
        base_preds = [e.predict(X) for e in self.model_hf.estimators_]
        y_pred_hf = np.mean(base_preds, axis=0)
        rho = linregress(y_pred_lf, y_pred_hf)[0]  # get slope
        return rho, y_pred_hf, np.std(base_preds, axis=0)

    def predict_lf(self, X):
        """Predict low-fidelity values."""
        base_preds = [e.predict(X) for e in self.model_lf.estimators_]
        return np.mean(base_preds, axis=0), np.std(base_preds, axis=0)
def bagging_model(df_dict,filename): result_bagging=pd.DataFrame() for i in range(36): df=df_dict['Model'+str(i+1)] mae_gs=pd.DataFrame() for feature in [int(x) for x in np.linspace(start = 1, stop = 10, num = 5)]: for sample in [int(x) for x in np.linspace(start = 1, stop = 10, num = 5)]: for estimator in [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)]: bagging=BaggingRegressor(max_features=feature, max_samples=sample, n_estimators=estimator) bagging.fit(split_data(df)[0],split_data(df)[1]) bagging_ypred=bagging.predict(split_data(df)[2]) bagging_mae=MAE(split_data(df)[3],bagging_ypred) mae_gs=mae_gs.append(pd.DataFrame({'max_features':[feature], 'max_samples':[sample], 'n_estimators':[estimator], 'valid_mae':[bagging_mae]}),ignore_index=True) parameter_best=mae_gs.loc[mae_gs['valid_mae'].idxmin(),:] bagging_best=BaggingRegressor(max_features=int(parameter_best['max_features']), max_samples=int(parameter_best['max_samples']), n_estimators=int(parameter_best['n_estimators'])) bagging_best.fit(split_data(df)[0],split_data(df)[1]) bagging_ypred=bagging_best.predict(split_data(df)[4]) mae_bagging=pd.Series(MAE(split_data(df)[5],bagging_ypred)) df=parameter_best.append(mae_bagging) print(i) result_bagging=result_bagging.append(df,ignore_index=True) #pickle.dump(bagging_best,open(path+ '/model_bagging/' + filename + '_bagging_%s.sav'%('df'+str(i)),'wb')) result_bagging.columns=['mae','max_features','max_samples','n_estimators','valid_mae'] return result_bagging
def Bagging_reg(X,y,X_test): BR = BaggingRegressor(SVR(kernel='linear',epsilon=0.01),n_estimators=5,warm_start=True) #BR = BaggingRegressor(base_estimator=LinearRegression(fit_intercept=False),n_estimators=5) BR.fit(X, y) y_BR_pred = BR.predict(X_test) pred_BR = pd.DataFrame(y_BR_pred) y_BR_test = BR.predict(X) test_BR = pd.DataFrame(y_BR_test) return pred_BR,test_BR
class ModelSelectionRegressor(BaseEstimator, RegressorMixin): def __init__(self, objective, optimizer, bagging_params={}, random_state=None): self.objective = objective self.optimizer = optimizer self.bagging_params = bagging_params self.is_frozen = False self.random_state = random_state def freeze(self): self.is_frozen = True def fit(self, X, y): self.objective.X = X self.objective.y = y if not self.is_frozen: self.best_parameters = self.optimizer.execute(self.objective) #logging.info('best_parameters: %s' % str(self.best_parameters)) if self.bagging_params == {}: self.model = self.objective.instantiate(self.best_parameters) else: self.model = BaggingRegressor( base_estimator=self.objective.instantiate( self.best_parameters), **(self.bagging_params), random_state=self.random_state) self.model.fit(X[:, self.best_parameters['features']], y) return self def predict(self, X): return self.model.predict(X[:, self.best_parameters['features']]) def predict_std(self, X): if isinstance(self.model, BaggingRegressor): ens_preds = [] for e in self.model.estimators_: ens_preds.append( e.predict(X[:, self.best_parameters['features']])) ens_preds = np.stack(ens_preds, axis=1) return self.model.predict( X[:, self.best_parameters['features']]), np.std(ens_preds, axis=1) else: return self.model.predict( X[:, self.best_parameters['features']]), np.repeat(0.0, len(X)) def score(self): return self.best_parameters['score']
def runEnsembleBaggingLR(X_train, y_train, X_test, y_test): # Trained on sparse format bag_knn = BaggingRegressor( base_estimator=KNeighborsRegressor(), random_state=1, ).fit(X_train, y_train) mse = mean_squared_error(y_test, bag_knn.predict(X_test)) r2 = r2_score(y_test, bag_knn.predict(X_test)) print('\nEnsemble Bagging using KNN Regression:') print('MSE = ', mse) print('R^2: ', r2) return bag_knn, mse, r2
def estimate_forest_variance(X, y, runs=100, **kwargs): X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) imputer = SimpleImputer(strategy='mean') X_train = imputer.fit_transform(X_train) X_test = imputer.transform(X_test) h_bar = BaggingRegressor(base_estimator=RandomForestRegressor( n_estimators=100, **kwargs), n_estimators=runs, n_jobs=1, max_samples=0.8) h_bar.fit(X_train, y_train) h_bar_preds = h_bar.predict(X_test) estimators_preds = [] for tree in h_bar.estimators_: estimators_preds.append(tree.predict(X_test)) estimators_preds = np.array(estimators_preds) var = np.mean((estimators_preds - h_bar_preds)**2) return var
def avmPredict(params): town = getPlace(params['lat'], params['long'])[0] x, y, z = getXYZ(params['lat'], params['long']) r = 1.0 data = [] target = [] header = [] with open('../../../data/working22.csv') as f: f = csv.reader(f) header = next(f) for row in f: t = (map(float, row[:3] + row[4:]), float(row[3])) if weightF([x, y, z], t[0][0:3], r): data.append(t[0]) target.append(t[1]) ensemble = BaggingRegressor() ensemble.fit(data, target) test = createTest(params) return ensemble.predict(test)
def Breg(self, n_estimators=10, max_samples=1.0, max_features=1.0, bootstrap=True,
         bootstrap_features=False, warm_start=False, n_jobs=None, random_state=None, verbose=0):
    model = BaggingRegressor(n_estimators=n_estimators, max_samples=max_samples,
                             max_features=max_features, bootstrap=bootstrap,
                             bootstrap_features=bootstrap_features, warm_start=warm_start,
                             n_jobs=n_jobs, random_state=random_state, verbose=verbose)
    kf = KFold(n_splits=self.cv)
    test_result = {'RMSE': [], 'R2': [], 'MAE': []}
    for train_index, test_index in kf.split(self.data.copy()):
        train_X = self.data.copy().drop([self.name_target], axis=1).iloc[train_index, :]
        train_y = self.data.copy().loc[train_index, [self.name_target]]
        test_X = self.data.copy().drop([self.name_target], axis=1).iloc[test_index, :]
        test_y = self.data.copy().loc[test_index, [self.name_target]]
        model.fit(train_X, train_y)
        # evaluate on the held-out fold
        y_pred = model.predict(test_X)
        test_result['RMSE'].append(np.sqrt(metrics.mean_squared_error(test_y, y_pred)))
        test_result['R2'].append(metrics.r2_score(test_y, y_pred))
        test_result['MAE'].append(metrics.mean_absolute_error(test_y, y_pred))
    for key, values in test_result.items():
        test_result[key] = np.array(values).mean()
    return model, test_result, train_X, test_X, test_y
def fitPredict(): df_train = pd.read_pickle('../../resources/data/dframes/train_df.pickle') print('Loaded train df') df_test = pd.read_pickle('../../resources/data/dframes/test_df.pickle') print('Loaded test df') id_test = df_test['id'] y_train = df_train['relevance'].values X_train = df_train.drop(['id', 'relevance'], axis=1).values X_test = df_test.drop(['id', 'relevance'], axis=1).values rf = RandomForestRegressor(n_estimators=15, max_depth=None, random_state=0) clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25) print('Fitting') clf.fit(X_train, y_train) print('Predicting') y_pred = clf.predict(X_test) pd.DataFrame({ "id": id_test, "relevance": y_pred }).to_csv('../../resources/results/submission.csv', index=False)
class BaggingLearner:
    # see more here: http://scikit-learn.org/stable/modules/ensemble.html#bagging
    def __init__(self, estimator):
        self.estimator = estimator
        self.model = BaggingRegressor(self.estimator.model)

    def fit(self, X, y):
        if isinstance(y, pd.DataFrame):
            y = y.iloc[:, 0]
        self.model.fit(X, y)

    def predict(self, X):
        prediction = pd.DataFrame(index=X.index,
                                  data={self.GetName(): self.model.predict(X)})
        return prediction

    def GetName(self):
        return "BaggingLearner (" + self.estimator.GetName() + ")"

    def Save(self, filename):
        self.estimator.Save(filename + '-estimator.pkl')
        joblib.dump(self.model, filename + '-model.pkl')
        joblib.dump(self, filename + '.pkl')

    def Load(self, filename):
        self.estimator.Load(filename + '-estimator.pkl')
        self.model = joblib.load(filename + '-model.pkl')
        self = joblib.load(filename + '.pkl')
class BAGGING(): """docstring for ClassName""" def __init__(self, BaggingRegressor, N): self.cores_number = int(np.ceil(multiprocessing.cpu_count()/N)) self.model = BaggingRegressor( base_estimator=None, bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs= self.cores_number, oob_score=False, random_state=None, verbose=0, warm_start=False) print("Bagging Cores: ", self.cores_number) def fit(self, X_train, y_train, X_test, y_test, error_type = "MAE"): error_dict = {"MSE":"rmse", "R2":{"l1","l2"}, "MAE":"mae","LOGLOSS": "multi_logloss" } error_metric = error_dict[error_type] self.model.fit(X_train, y_train ) def predict(self, X_test): prediction=self.model.predict(X_test) return(prediction)
class _BaggingRegressorImpl: def __init__( self, base_estimator=None, n_estimators=10, *, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, warm_start=False, n_jobs=None, random_state=None, verbose=0, ): estimator_impl = base_estimator self._hyperparams = { "base_estimator": estimator_impl, "n_estimators": n_estimators, "max_samples": max_samples, "max_features": max_features, "bootstrap": bootstrap, "bootstrap_features": bootstrap_features, "oob_score": oob_score, "warm_start": warm_start, "n_jobs": n_jobs, "random_state": random_state, "verbose": verbose, } self._wrapped_model = SKLModel(**self._hyperparams) self._hyperparams["base_estimator"] = base_estimator def get_params(self, deep=True): out = self._wrapped_model.get_params(deep=deep) # we want to return the lale operator, not the underlying impl out["base_estimator"] = self._hyperparams["base_estimator"] return out def fit(self, X, y, sample_weight=None): if isinstance(X, pd.DataFrame): feature_transformer = FunctionTransformer( func=lambda X_prime: pd.DataFrame(X_prime, columns=X.columns), inverse_func=None, check_inverse=False, ) self._hyperparams["base_estimator"] = ( feature_transformer >> self._hyperparams["base_estimator"] ) self._wrapped_model = SKLModel(**self._hyperparams) self._wrapped_model.fit(X, y, sample_weight) return self def predict(self, X): return self._wrapped_model.predict(X) def score(self, X, y, sample_weight=None): return self._wrapped_model.score(X, y, sample_weight)
def estimate_tree_variance(X, y, runs=100, **kwargs): X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) imputer = SimpleImputer(strategy='mean') X_train = imputer.fit_transform(X_train) X_test = imputer.transform(X_test) """ The BaggingRegressor here estimates the average of multiple decision trees each trained on a random sub-sample of the training set (~80% of it) by averaging their predictions on each testing sample as the final prediction """ h_bar = BaggingRegressor(base_estimator=DecisionTreeRegressor( random_state=42, **kwargs), n_estimators=runs, n_jobs=6, max_samples=0.8, random_state=42) h_bar.fit(X_train, y_train) h_bar_preds = h_bar.predict(X_test) """ Here we access each individual tree and take its prediction on the test samples """ estimators_preds = [] for tree in h_bar.estimators_: estimators_preds.append(tree.predict(X_test)) """ Here we simply implement the variance defintion Var(h) = E[E_x[(h(x) - h_bar(x)) ^ 2]] """ estimators_preds = np.array(estimators_preds) var = np.mean((estimators_preds - h_bar_preds)**2) return var
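# A quick sanity check on the computation above: BaggingRegressor.predict() is the mean of its
# members' predictions (every member sees all features here, since max_features stays at 1.0), so
# the same Var(h) estimate can be written with that mean made explicit. A minimal sketch, assuming
# the arrays from estimate_tree_variance are still in scope:
member_mean = estimators_preds.mean(axis=0)               # explicit ensemble mean per test point
assert np.allclose(member_mean, h_bar_preds)              # the bagged prediction is exactly this mean
var_alt = np.mean((estimators_preds - member_mean) ** 2)  # same variance estimate as `var` above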
def bagging_logistic_regression():
    data_train = pd.read_csv(TRAIN_FILE_PATH)
    # build the logistic regression model
    raw_data_train, processer = process_train_data(data_train)
    data_train = raw_data_train.as_matrix()
    y = data_train[:, 0]   # the Survived column
    X = data_train[:, 1:]  # the feature columns
    clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
    bagging_clf = BaggingRegressor(clf, n_estimators=20, max_samples=0.8, max_features=1,
                                   bootstrap=True, bootstrap_features=False, n_jobs=-1)
    # the ensemble must be fitted before it can predict
    bagging_clf.fit(X, y)

    # apply the same preprocessing to the test data
    raw_data_test = pd.read_csv(TEST_FILE_PATH)
    data_test = process_test_data(raw_data_test, processer)

    # predict
    predictions = bagging_clf.predict(data_test.as_matrix())
    predict_result = pd.DataFrame(
        dict(PassengerId=raw_data_test.PassengerId.as_matrix(),
             Survived=predictions.astype(np.int32)))
    # predict_result.to_csv(RESULT_OUTPUT_PATH, index=False)
    print(predict_result)
class BaggingRegressorPrim(primitive): def __init__(self, random_state=0): super(BaggingRegressorPrim, self).__init__(name='BaggingRegressor') self.hyperparams = [] self.type = 'Regressor' self.description = "A Bagging regressor. A Bagging regressor is an ensemble meta-estimator that fits base regressors each on random subsets of the original dataset and then aggregate their individual predictions (either by voting or by averaging) to form a final prediction. Such a meta-estimator can typically be used as a way to reduce the variance of a black-box estimator (e.g., a decision tree), by introducing randomization into its construction procedure and then making an ensemble out of it. This algorithm encompasses several works from the literature. When random subsets of the dataset are drawn as random subsets of the samples, then this algorithm is known as Pasting [1]. If samples are drawn with replacement, then the method is known as Bagging [2]. When random subsets of the dataset are drawn as random subsets of the features, then the method is known as Random Subspaces [3]. Finally, when base estimators are built on subsets of both samples and features, then the method is known as Random Patches [4]." self.hyperparams_run = {'default': True} self.random_state = random_state self.model = BaggingRegressor(random_state=random_state, n_jobs=5) self.accept_type = 'c_r' def can_accept(self, data): return self.can_accept_c(data, 'Regression') def is_needed(self, data): # data = handle_data(data) return True def fit(self, data): data = handle_data(data) self.model.fit(data['X'], data['Y']) def produce(self, data): output = handle_data(data) output['predictions'] = self.model.predict(output['X']) output['X'] = pd.DataFrame(output['predictions'], columns=[self.name + "Pred"]) final_output = {0: output} return final_output
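# The four variants named in the description above correspond to specific BaggingRegressor
# parameter combinations. A minimal, self-contained sketch; the base estimator and the fractions
# are illustrative choices, not part of the original snippet:
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

base = DecisionTreeRegressor()
# Pasting: random subsets of the samples, drawn without replacement
pasting = BaggingRegressor(base, max_samples=0.5, bootstrap=False)
# Bagging: random subsets of the samples, drawn with replacement
bagging = BaggingRegressor(base, max_samples=0.5, bootstrap=True)
# Random Subspaces: random subsets of the features, all samples used
subspaces = BaggingRegressor(base, max_features=0.5, bootstrap=False)
# Random Patches: random subsets of both samples and features
patches = BaggingRegressor(base, max_samples=0.5, max_features=0.5, bootstrap=True)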
def KNeighborsBagging(neigh): kn1 = neighbors.KNeighborsRegressor(n_neighbors=neigh, weights='uniform') kn2 = neighbors.KNeighborsRegressor(n_neighbors=neigh, weights='distance') bgg = BaggingRegressor(kn1, n_estimators=10, max_samples=0.7, max_features=0.9, verbose=0) #, max_features=0.5 bgg.fit(X_train, y_train) print(bgg.score(X_train, y_train)) y_pred = bgg.predict(X_test) # Generate ROC curve values: fpr, tpr, thresholds fpr, tpr, thresholds = roc_curve(y_test, y_pred) # Plot ROC curve plt.plot([0, 1], [0, 1], 'k--') plt.plot(fpr, tpr) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('ROC Curve') plt.show()
class Predictor(BaseEstimator):
    '''Predictor: modify this class to create a predictor of your choice. This could be your own
    algorithm, or one of the scikit-learn models, for which you choose the hyper-parameters.'''

    def __init__(self):
        '''This method initializes the predictor.'''
        self.mod = BaggingRegressor(base_estimator=RandomForestRegressor(n_estimators=50))
        print("PREDICTOR=" + self.mod.__str__())

    def fit(self, X, y):
        '''This is the training method: parameters are adjusted with training data.'''
        self.mod = self.mod.fit(X, y)
        return self

    def predict(self, X):
        '''This is called to make predictions on test data. Predicted values are output.'''
        return self.mod.predict(X)

    def save(self, path="./"):
        # pickle files must be opened in binary mode
        pickle.dump(self, open(path + '_model.pickle', "wb"))

    def load(self, path="./"):
        self = pickle.load(open(path + '_model.pickle', "rb"))
        return self
def train_model(train, test, labels): rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=10) #rf = RandomForestRegressor(n_estimators=45, max_depth=9, random_state=10) clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.2, random_state=25) clf.fit(train, labels) #clf = SVR(C=1.0, epsilon=0.2) #clf.fit(train, labels) #clf = GaussianNB() #clf.fit(train, labels) print "Good!" predictions = clf.predict(test) print predictions.shape predictions = pd.DataFrame(predictions, columns = ['relevance']) print "Good again!" print "Predictions head -------" print predictions.head() print predictions.shape print "TEST head -------" print test.head() print test.shape #test['id'].to_csv("TEST_TEST.csv",index=False) #predictions.to_csv("PREDICTIONS.csv",index=False) #test = test.reset_index() #predictions = predictions.reset_index() #test = test.groupby(level=0).first() #predictions = predictions.groupby(level=0).first() predictions = pd.concat([test['id'],predictions], axis=1, verify_integrity=False) print predictions return predictions
def run_gpr(down_station, input_list, include_time, sample_size, network_type, include_diff, n_estimators, b): start_time_run = time.time() result_dir = util.get_result_dir(down_station, network_type, n_estimators, b, sample_size) if not os.path.exists(result_dir): os.makedirs(result_dir) (y_train, x_train, y_cv, x_cv, _, _, _, _, train_y_max, train_y_min, _, _, _, _, _) = data.construct(down_station, input_list, include_time, sample_size, network_type) # n_estimators = 50 gpr = BaggingRegressor(GaussianProcessRegressor(copy_X_train=False), max_samples=1.0 / n_estimators, n_estimators=n_estimators, n_jobs=1) # svr = SVR(C=_C, epsilon=_epsilon, verbose=True, cache_size=1024) # No bagging # gpr = GaussianProcessRegressor(copy_X_train=False) gpr.fit(x_train, y_train) util.save_sklearn_model(gpr, result_dir) y_cv_pred = gpr.predict(x_cv) predict.plot_prediction(y_cv_pred, result_dir, y_cv, train_y_max, train_y_min) elapsed_time_run = time.time() - start_time_run print( time.strftime("Fitting time : %H:%M:%S", time.gmtime(elapsed_time_run)))
def procedureA(goldenFlag=False): # Trains and generates a prediction file # Uses hard heuristic for buy_or_not popFlag = True X, Y = getDataXY(currYearFlag=False, popFlag=popFlag) X, Y = shuffle(X, Y, random_state=0) if popFlag: encoder = oneHot(X[:, 2:]) Xt = encoder.transform(X[:, 2:]) Xt = np.hstack((X[:, :2], Xt)) else: encoder = oneHot(X) Xt = encoder.transform(X) buySet = set() for i in range(X.shape[0]): tmpTup = (X[i][0], X[i][2]) buySet.add(tmpTup) # Y_buy = [1] * Xt.shape[0] min_max_scaler = preprocessing.MinMaxScaler() # Xt = min_max_scaler.fit_transform(Xt) if goldenFlag: print Xt.shape Xt = getGoldenX(Xt, 2, 2 + encoder.feature_indices_[1], 2 + encoder.feature_indices_[0], 2 + min(9, encoder.feature_indices_[1])) split = 0.9 X_train, X_test = Xt[:(int(Xt.shape[0] * split)), :], Xt[int(Xt.shape[0] * split):, :] Y_train, Y_test = Y[:(int(Y.shape[0] * split)), :], Y[int(Y.shape[0] * split):, :] Y_train = Y_train.ravel() Y_test = Y_test.ravel() print X_train.shape print X_test.shape # clf = Ridge(alpha = 100) # clf = SVR(C = 10.0, kernel = 'poly', degree = 2) # clf = LinearSVR(C = 1.0) clf = BaggingRegressor(DecisionTreeRegressor(), n_estimators=125, n_jobs=4, random_state=0) # clf = AdaBoostRegressor(DecisionTreeRegressor(), n_estimators = 100) # clf = DecisionTreeRegressor() # clf = RandomForestRegressor(random_state = 0, n_estimators = 200, n_jobs = 4) clf.fit(X_train, Y_train.ravel()) Y_pred = clf.predict(X_test) evaluatePred(Y_pred, Y_test) return clf, encoder, min_max_scaler
def random_forest(X,Y,Xt): print('learn') rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0) clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25) clf.fit(X, Y) print('predict') Yp_clamped = clf.predict(Xt) return Yp_clamped
def bagging_regressor_train(X_train, y_train, X_valid, y_valid): model = BaggingRegressor(base_estimator=SVR(), n_estimators=10, random_state=0) model.fit(X_train, y_train) y_pred = model.predict(X_valid) valid_score = np.sqrt(mean_squared_error(y_valid, y_pred)) return model, valid_score
def run(): print "Bagged Decision Tree Regression started..." #Preparing Training data dir_path = "" train_file_path = dir_path + "train.csv" train_file = read_csv(train_file_path, skiprows=1, header=None) train_file = train_file.drop(train_file.columns[0], axis=1) train_file = train_file.values #Combining previous 5 time step data into one row train_X_temp = train_file[5:50000, :-1] train_Y = train_file[6:50001, -1] train_X = np.zeros((train_X_temp.shape[0], 8 * 5)) for i in range(train_X_temp.shape[0]): for j in range(5): for k in range(8): train_X[i][j * 8 + k] = train_X_temp[i - j][k] #Preparing testing data test_file_name = dir_path + "test2.csv" test_file = read_csv(test_file_name, skiprows=1, header=None) test_file = test_file.values test_X = np.array(test_file[:, :-1]) test_y = test_file[:, -1] # print "\nSimple Decison Tree:" # dec_tree = DecisionTreeRegressor(max_depth = 5) # dec_tree.fit(train_X, train_Y) # prediction = dec_tree.predict(test_X) # print "Predictions: \n",prediction # print "Score: ",dec_tree.score(test_X,test_y) # print "\nADABoost Decision Tree:" # ada_boost = AdaBoostRegressor(DecisionTreeRegressor(max_depth = 5),n_estimators = 10) # ada_boost.fit(train_X, train_Y) # prediction = ada_boost.predict(test_X) # print "Predictions: \n",prediction # print "Score: ",ada_boost.score(test_X,test_y) #Model training and prediction print "\nBagged Decision Tree:" start = time.time() bag_reg = BaggingRegressor(DecisionTreeRegressor(), n_jobs=2, random_state=0).fit(train_X, train_Y) #bag_reg.set_params(n_jobs=1) #Calculating and printing Results prediction = bag_reg.predict(test_X) mse = np.mean((prediction - test_y)**2) print "MSE: ", mse # print "Predictions: \n",prediction print "Score: ", bag_reg.score(test_X, test_y) print "Time: ", (time.time() - start) print "Decision Tree Regressor done...\n"
def test_sparse_regression(): # Check regression for various parameter settings on sparse input. rng = check_random_state(0) X_train, X_test, y_train, y_test = train_test_split(boston.data[:50], boston.target[:50], random_state=rng) class CustomSVR(SVR): """SVC variant that records the nature of the training set""" def fit(self, X, y): super().fit(X, y) self.data_type_ = type(X) return self parameter_sets = [ {"max_samples": 0.5, "max_features": 2, "bootstrap": True, "bootstrap_features": True}, {"max_samples": 1.0, "max_features": 4, "bootstrap": True, "bootstrap_features": True}, {"max_features": 2, "bootstrap": False, "bootstrap_features": True}, {"max_samples": 0.5, "bootstrap": True, "bootstrap_features": False}, ] for sparse_format in [csc_matrix, csr_matrix]: X_train_sparse = sparse_format(X_train) X_test_sparse = sparse_format(X_test) for params in parameter_sets: # Trained on sparse format sparse_classifier = BaggingRegressor( base_estimator=CustomSVR(), random_state=1, **params ).fit(X_train_sparse, y_train) sparse_results = sparse_classifier.predict(X_test_sparse) # Trained on dense format dense_results = BaggingRegressor( base_estimator=CustomSVR(), random_state=1, **params ).fit(X_train, y_train).predict(X_test) sparse_type = type(X_train_sparse) types = [i.data_type_ for i in sparse_classifier.estimators_] assert_array_almost_equal(sparse_results, dense_results) assert all([t == sparse_type for t in types]) assert_array_almost_equal(sparse_results, dense_results)
def test_sparse_regression(): # Check regression for various parameter settings on sparse input. rng = check_random_state(0) X_train, X_test, y_train, y_test = train_test_split(diabetes.data[:50], diabetes.target[:50], random_state=rng) class CustomSVR(SVR): """SVC variant that records the nature of the training set""" def fit(self, X, y): super().fit(X, y) self.data_type_ = type(X) return self parameter_sets = [ {"max_samples": 0.5, "max_features": 2, "bootstrap": True, "bootstrap_features": True}, {"max_samples": 1.0, "max_features": 4, "bootstrap": True, "bootstrap_features": True}, {"max_features": 2, "bootstrap": False, "bootstrap_features": True}, {"max_samples": 0.5, "bootstrap": True, "bootstrap_features": False}, ] for sparse_format in [csc_matrix, csr_matrix]: X_train_sparse = sparse_format(X_train) X_test_sparse = sparse_format(X_test) for params in parameter_sets: # Trained on sparse format sparse_classifier = BaggingRegressor( base_estimator=CustomSVR(), random_state=1, **params ).fit(X_train_sparse, y_train) sparse_results = sparse_classifier.predict(X_test_sparse) # Trained on dense format dense_results = BaggingRegressor( base_estimator=CustomSVR(), random_state=1, **params ).fit(X_train, y_train).predict(X_test) sparse_type = type(X_train_sparse) types = [i.data_type_ for i in sparse_classifier.estimators_] assert_array_almost_equal(sparse_results, dense_results) assert all([t == sparse_type for t in types]) assert_array_almost_equal(sparse_results, dense_results)
def test_parallel_regression(): # Check parallel regression. rng = check_random_state(0) X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target, random_state=rng) ensemble = BaggingRegressor(DecisionTreeRegressor(), n_jobs=3, random_state=0).fit(X_train, y_train) ensemble.set_params(n_jobs=1) y1 = ensemble.predict(X_test) ensemble.set_params(n_jobs=2) y2 = ensemble.predict(X_test) assert_array_almost_equal(y1, y2) ensemble = BaggingRegressor(DecisionTreeRegressor(), n_jobs=1, random_state=0).fit(X_train, y_train) y3 = ensemble.predict(X_test) assert_array_almost_equal(y1, y3)
def train_and_validate(X_inp, Y_inp, seed):
    # To compensate for imbalance, we need to define a different test size on each
    # error visibility level
    testsizes = [0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
                 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]
    X_train = []
    Y_train = []
    X_validation = []
    Y_validation = []
    for i in range(0, 20):
        X_this = [X_inp[j, :] for j in range(0, len(Y_inp))
                  if Y_inp[j] > i / 20 - 0.001 and Y_inp[j] < i / 20 + 0.001]
        Y_this = [Y_inp[j] for j in range(0, len(Y_inp))
                  if Y_inp[j] > i / 20 - 0.001 and Y_inp[j] < i / 20 + 0.001]
        X_train_this, X_validation_this, Y_train_this, Y_validation_this = \
            model_selection.train_test_split(X_this, Y_this,
                                             test_size=testsizes[i], random_state=seed)
        Y_train.extend(Y_train_this)
        X_train.extend(X_train_this)
        Y_validation.extend(Y_validation_this)
        X_validation.extend(X_validation_this)

    # =====================================================================
    # Regression training here. You can just use whatever regression model
    # you like.
    #
    # First, scale the features
    scaler = MinMaxScaler(feature_range=(0.0001, 1))
    X_train = scaler.fit_transform(X_train)
    # Then, define the model and the parameters
    # model = GradientBoostingRegressor()
    # model = MLPRegressor()
    # model = SVR()
    # model = RandomForestRegressor()
    model = BaggingRegressor(
        base_estimator=ensemble.GradientBoostingRegressor())
    model.fit(X_train, Y_train)

    # =====================================================================
    # Validation part starts here. Nothing very special.
    #
    X_validation = scaler.transform(X_validation)
    Y_pred = model.predict(X_validation)
    return Y_pred, Y_validation
def HHT_MARS_TEST(series, regressors=4, delay=1, N=2000): series = series[len(series) - 2000:] series = np.array(series) series = series.reshape(-1, 1) D = regressors # number of regressors T = delay # delay N = N series = series[500:] data = np.zeros((N - 500 - T - (D - 1) * T, D)) lbls = np.zeros((N - 500 - T - (D - 1) * T, )) for t in range((D - 1) * T, N - 500 - T): data[t - (D - 1) * T, :] = [ series[t - 3 * T], series[t - 2 * T], series[t - T], series[t] ] lbls[t - (D - 1) * T] = series[t + T] trnData = data[:lbls.size - round(lbls.size * 0.3), :] trnLbls = lbls[:lbls.size - round(lbls.size * 0.3)] chkData = data[lbls.size - round(lbls.size * 0.3):, :] chkLbls = lbls[lbls.size - round(lbls.size * 0.3):] aa = np.array(chkLbls[-4:]).reshape(1, -1) chkData = np.append(chkData, aa, axis=0) mars = Earth() mars.fit(trnData, trnLbls) boosted_mars = AdaBoostRegressor(base_estimator=mars, n_estimators=25, learning_rate=0.1, loss='exponential') bag = BaggingRegressor(base_estimator=mars, n_estimators=25) bag.fit(trnData, trnLbls) boosted_mars.fit(trnData, trnLbls) pred2 = bag.predict(chkData) oos_preds = boosted_mars.predict(chkData) stack_predict = np.vstack([oos_preds, pred2]).T params_xgd = { 'max_depth': 7, 'objective': 'reg:linear', 'learning_rate': 0.05, 'n_estimators': 10000 } clf = xgb.XGBRegressor(**params_xgd) clf.fit(stack_predict[:-1, :], chkLbls, eval_set=[(stack_predict[:-1, :], chkLbls)], eval_metric='rmse', early_stopping_rounds=20, verbose=False) xgb_pred = clf.predict(stack_predict) return xgb_pred
def Bagging(x_train, y_train, x_test, y_test): estimator = BaggingRegressor(n_estimators=1000, random_state=0, n_jobs=-1) estimator.fit(x_train, y_train) t = estimator.score(x_train, y_train) y_pred = estimator.predict(x_test) mse_score = mse(y_test, y_pred) print("mse_score: " + str(mse_score)) r2_score = r2(y_test, y_pred) print("r2_score: " + str(r2_score)) print(t)
def main(): arg = args() train = rw.read(arg.train) test = rw.read(arg.test) X = train.loc[:, train.columns != 'Market Share_total'] y = train['Market Share_total'] bagging_regressor = BaggingRegressor() bagging_regressor.fit(X, y) predictions = bagging_regressor.predict(test) rw.write(predictions, "test_results.csv")
def Bagging(Xtrain, Ytrain, Xtest, Ytest):
    """ Apply the bagging regressor """
    from sklearn.ensemble import BaggingRegressor
    print('\nBagging regressor:')
    clf = BaggingRegressor(n_estimators=100, n_jobs=-1).fit(Xtrain, Ytrain)
    print('Accuracy: {0}'.format(clf.score(Xtrain, Ytrain)))

    # find the training error
    prediction = clf.predict(Xtrain)
    Etrain = error(prediction, Ytrain)
    print('Training error: {0}'.format(Etrain))

    # find the test error
    prediction = clf.predict(Xtest)
    Etest = error(prediction, Ytest)
    print('Test error: {0}'.format(Etest))
def get_bagging_prediction(X_train, y_train, X_test, X_valid=None, GS=False): if not GS: rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0) clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) if X_valid is None: return y_pred else: return y_pred, clf.predict(X_valid) else: rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0) clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25) param_grid = {'rfr__max_features': [10], 'rfr__max_depth': [20]} model = grid_search.GridSearchCV(estimator=clf, param_grid=param_grid, n_jobs=-1, cv=2, verbose=VERBOSE, scoring=RMSE) model.fit(X_train, y_train) y_pred = model.predict(X_test) if X_valid is None: return y_pred else: return y_pred, model.predict(X_valid)
def runTests(): # Generate the training samples, extract training features and target trainSamples = GenSamples(numSamples) trainFeatures = extractFeatures(trainSamples) trainPred = extractPred(trainSamples) # Generate the test samples, extracr test features and target testSamples = GenSamples(numTestSamples) testFeatures = extractFeatures(testSamples) testPred = extractPred(testSamples) R2List = OrderedDict() R2List['TrainROI'] = [] R2List['TestROI'] = [] print 'Running Tests: ' for i in range(numTests): # Bootstrap is True by default i.e., sampling with replacement # Bootstrap features is False by default i.e., all features used classifier = BaggingRegressor(base_estimator=DecisionTreeRegressor(), n_estimators=numTrees, max_samples=int(0.5*numSamples), max_features=int(1)) classifier.fit(trainFeatures, trainPred) predictROI = {} predictROI['Training'] = classifier.predict(trainFeatures) predictROI['Test'] = classifier.predict(testFeatures) R2 = {} R2['Train'] = r2_score(trainPred, predictROI['Training']) R2['Test'] = r2_score(testPred, predictROI['Test']) R2List['TrainROI'].append(R2['Train']) R2List['TestROI'].append(R2['Test']) print 'Best Train ROI: ', max(R2List['TrainROI']) print 'Best Test ROI: ', max(R2List['TestROI'])
def test_single_estimator(): # Check singleton ensembles. rng = check_random_state(0) X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target, random_state=rng) clf1 = BaggingRegressor(base_estimator=KNeighborsRegressor(), n_estimators=1, bootstrap=False, bootstrap_features=False, random_state=rng).fit(X_train, y_train) clf2 = KNeighborsRegressor().fit(X_train, y_train) assert_array_almost_equal(clf1.predict(X_test), clf2.predict(X_test))
def train_model(training, testing, window=5, n=5): X_train, y_train = prepare_data(training) X_test, y_test = prepare_data(testing) rf = RandomForestRegressor() rf.fit(X_train, y_train) predrf = rf.predict(X_test) print "mse for random forest regressor: ", mean_squared_error(predrf, y_test) gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.025) gb.fit(X_train, y_train) predgb = gb.predict(X_test) print "mse for gradient boosting regressor: ", mean_squared_error(predgb, y_test) ## plot feature importance using GBR results fx_imp = pd.Series(gb.feature_importances_, index=['bb', 'momentum', 'sma', 'volatility']) fx_imp /= fx_imp.max() # normalize fx_imp.sort() ax = fx_imp.plot(kind='barh') fig = ax.get_figure() fig.savefig("output/feature_importance.png") adb = AdaBoostRegressor(DecisionTreeRegressor()) adb.fit(X_train, y_train) predadb = adb.predict(X_test) print "mse for adaboosting decision tree regressor: ", mean_squared_error(predadb, y_test) scale = StandardScaler() scale.fit(X_train) X_trainscale = scale.transform(X_train) X_testscale = scale.transform(X_test) knn = BaggingRegressor(KNeighborsRegressor(n_neighbors=10), max_samples=0.5, max_features=0.5) knn.fit(X_trainscale, y_train) predknn = knn.predict(X_testscale) print "mse for bagging knn regressor: ", mean_squared_error(predknn, y_test) pred_test = 0.1*predrf+0.2*predgb+0.1*predadb+0.6*predknn print "mse for ensemble all the regressors: ", mean_squared_error(pred_test, y_test) result = testing.copy() result.ix[5:-5, 'trend'] = pred_test result.ix[10:, 'pred'] = pred_test * result.ix[5:-5, 'IBM'].values result.ix[:-5, 'pred_date'] = result.index[5:] return result
def procc_modelfusion(df_test, data_test):
    from sklearn.ensemble import BaggingRegressor
    from sklearn import linear_model

    train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title')
    train_np = train_df.as_matrix()

    # y is the Survived outcome
    y = train_np[:, 0]
    # X holds the feature values
    X = train_np[:, 1:]

    # fit into a BaggingRegressor
    clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
    bagging_clf = BaggingRegressor(clf, n_estimators=10, max_samples=0.8, max_features=1.0,
                                   bootstrap=True, bootstrap_features=False, n_jobs=-1)
    bagging_clf.fit(X, y)

    test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title')
    predictions = bagging_clf.predict(test)
    result = pd.DataFrame({'PassengerId': data_test['PassengerId'].as_matrix(),
                           'Survived': predictions.astype(np.int32)})
    result.to_csv("logistic_regression_predictions3.csv", index=False)
class Regressor(BaseEstimator): def __init__(self): # self.clf = GradientBoostingRegressor(n_estimators=200, max_features="sqrt", max_depth=5) # self.clf = LinearRegression() self.clf = BaggingRegressor(LinearRegression()) # self.clf = GaussianProcess(theta0=4) # self.sp = RandomizedLasso() self.sp = SparseRandomProjection(n_components=5) # self.sp = TruncatedSVD() # self.sp = KernelPCA(n_components=3, tol=0.0001, kernel="poly") # self.clf = ExtraTreesRegressor(n_estimators=200, max_features="sqrt", max_depth=5) def fit(self, X, y): # print(self.sp) # Xr = self.sp.fit_transform(X, y) self.clf.fit(X, y.ravel()) def predict(self, X): # Xr = self.sp.transform(X) return self.clf.predict(X)
class BaggingRegressor(BaseEstimator): """ Usage: ``` "model": { "class": "ume.ensemble.BaggingRegressor", "params": { "base_estimator": { "class": "sklearn.svm.SVR", "params": { "kernel": "rbf", "degree": 1, "C": 1000000.0, "epsilon": 0.01, }, }, "bag_kwargs": { "n_estimators": 100, "n_jobs": 5, "max_samples": 0.9, }, } } ``` """ def __init__(self, base_estimator=None, bag_kwargs=None): klass = dynamic_load(base_estimator['class']) svr_reg = klass(**base_estimator['params']) self.__clf = SK_BaggingRegressor(base_estimator=svr_reg, **bag_kwargs) def fit(self, X, y): return self.__clf.fit(X, y) def predict(self, X): return self.__clf.predict(X)
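# For illustration, the wrapper above can be driven directly with the dictionaries shown in its
# docstring. A minimal sketch on toy data, assuming the snippet's own imports (dynamic_load and
# the aliased SK_BaggingRegressor) are available; the data and parameter values are purely
# illustrative:
import numpy as np

X_toy = np.random.rand(50, 3)
y_toy = np.random.rand(50)

model = BaggingRegressor(
    base_estimator={
        "class": "sklearn.svm.SVR",
        "params": {"kernel": "rbf", "degree": 1, "C": 1000000.0, "epsilon": 0.01},
    },
    bag_kwargs={"n_estimators": 10, "n_jobs": 1, "max_samples": 0.9},
)
model.fit(X_toy, y_toy)
preds = model.predict(X_toy)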
sorted_pairs = sorted(pairs, key = lambda pair: pair[1]) features_sorted, featImportances_sorted = zip(*sorted_pairs) fig, ax = plt.subplots() plt.barh(pos, featImportances_sorted, 1, color = "blue") plt.yticks(pos,features_sorted) ax.set_title('Gradient Boosting: Relative Feature Importance') #Tree Bagging TreeBagger=BaggingRegressor() TreeBagger.fit(Xtrain, Ytrain) fig = plt.figure() ax1 = fig.add_subplot(2, 1, 1) ax1.plot_date(dates, modeldata.Load[45000:50000], 'r-',tz=None, xdate=True, ydate=False, label='Actual Load') ax1.set_title('Tree Bagging: Actual and Predicted Loads') plt.plot(dates, TreeBagger.predict(Xtest), 'g-',label='Predicted Load') ax1.legend() ax2 = fig.add_subplot(2, 1, 2) ax2.plot_date(dates, modeldata.Load[45000:50000]-TreeBagger.predict(Xtest), 'r-',tz=None, xdate=True, ydate=False) ax2.set_title('Error between actual and predicted loads, MW') MSEs_Bagging=[mean_squared_error(Ytest, TreeBagger.predict(Xtest)), mean_squared_error(Ytrain, TreeBagger.predict(Xtrain))] #Model Comparison: Bar charts fig, ax = plt.subplots() width=.3 rects1 = ax.bar([0,1,2], [MSEs_Boost[0],MSEs_lm[0], MSEs_Bagging[0]], width, color='y') rects2 = ax.bar([width, width+1, width+2], [MSEs_Boost[1],MSEs_lm[1], MSEs_Bagging[1]], width, color='b') ax.set_xticks([width, width+1, width+2]) ax.set_xticklabels(('Gradient Boosting', 'Linear Model', 'Tree Bagging'))
# define the training and testing sets X_train = train.iloc[:, 1:] y_train = train.iloc[:, 0] X_test = test.iloc[:, 1:] y_test = test.iloc[:, 0] # instruct BaggingRegressor to use DecisionTreeRegressor as the "base estimator" from sklearn.ensemble import BaggingRegressor bagreg = BaggingRegressor(DecisionTreeRegressor(), n_estimators=500, bootstrap=True, oob_score=True, random_state=1) # fit and predict bagreg.fit(X_train, y_train) y_pred = bagreg.predict(X_test) y_pred # calculate RMSE np.sqrt(metrics.mean_squared_error(y_test, y_pred)) # ## Estimating out-of-sample error # # For bagged models, out-of-sample error can be estimated without using **train/test split** or **cross-validation**! # # On average, each bagged tree uses about **two-thirds** of the observations. For each tree, the **remaining observations** are called "out-of-bag" observations. # show the first bootstrap sample samples[0]
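# Continuing from the fitted `bagreg` above (oob_score=True), the out-of-bag observations give an
# out-of-sample error estimate without a test set. A minimal sketch using standard scikit-learn
# attributes; note that rows never left out of any bootstrap sample can make the estimate noisy:
oob_rmse = np.sqrt(metrics.mean_squared_error(y_train, bagreg.oob_prediction_))
print("Out-of-bag RMSE estimate:", oob_rmse)   # compare against the test-set RMSE computed above
print("Out-of-bag R^2:", bagreg.oob_score_)    # R^2 of the out-of-bag predictions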
# You have probably seen quiz shows where the contestant asks the studio audience and takes the
# most-voted answer as their own: every person gives a judgment, and in the end we trust that the
# answer lies with the majority.
# To put it more plainly: suppose you are close with the math ace in your class and "copy" his
# homework every time. Most of the time he gets it right, so you are right too. But the day he has
# a lapse and writes down a wrong number, well... you are wrong along with him.
# Now consider another scenario: you are on good terms with five math aces, and every time you
# collect all of their homework, compare the answers, and then "do it yourself". If one of them
# slips up one day but the other four get it right, you will surely trust the answer the four
# agree on.
# That is roughly what the simplest form of model ensembling means. For a classification problem,
# when we have a pile of classifiers trained on the same dataset (e.g. logistic regression, SVM,
# KNN, random forest, neural networks), we let each of them make a judgment, tally the votes, and
# take the majority result as the final answer.
# Ensembling helps alleviate the overfitting produced during training, which in turn tends to
# improve the accuracy of the result.
# Back to our current problem. So far we have only covered logistic regression; if we still want
# to use this ensembling idea to improve our result, what can we do?
# Since the model is fixed, let's work on the data instead. If the model overfits, it must be
# overfitting our training set. So let's not use the full training set: each time we train on a
# subset of it. Although we use the same learning algorithm, the resulting models differ; and
# since no subset covers all the data, any overfitting happens on a sub-training-set rather than
# on the whole data. Fusing these models may therefore help the final result. Yes, this is the
# commonly used Bagging.
# We use the Bagging implementation in scikit-learn to carry out this idea; the process is very
# simple. The code is as follows:

from sklearn.ensemble import BaggingRegressor

train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title')
train_np = train_df.as_matrix()

y = train_np[:, 0]   # y is the Survived outcome
X = train_np[:, 1:]  # X holds the feature values

clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
# fit it inside a BaggingRegressor
bagging_clf = BaggingRegressor(clf, n_estimators=20, max_samples=0.8, max_features=1.0,
                               bootstrap=True, bootstrap_features=False, n_jobs=-1)
bagging_clf.fit(X, y)

test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title')
predictions = bagging_clf.predict(test)
result = pd.DataFrame({'PassengerId': data_test['PassengerId'].as_matrix(),
                       'Survived': predictions.astype(np.int32)})
result.to_csv("./tmp_dataset/Kaggle-Titanic/result.csv", index=False)
# 0.75598; surprisingly lower this time, perhaps just bad luck in the BaggingRegressor's random sampling
# The previous result matched the blog author's exactly, yet this second run turned out different
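# A possible reason for the unstable score, beyond random sampling luck: BaggingRegressor averages
# the 0/1 outputs of the bagged logistic regressions, and astype(np.int32) then truncates the
# averaged value toward 0. A hedged alternative sketch using BaggingClassifier, which votes on
# class labels directly (it reuses clf, X, y and test from the snippet above):
from sklearn.ensemble import BaggingClassifier

bagging_vote_clf = BaggingClassifier(clf, n_estimators=20, max_samples=0.8, max_features=1.0,
                                     bootstrap=True, bootstrap_features=False, n_jobs=-1)
bagging_vote_clf.fit(X, y)
vote_predictions = bagging_vote_clf.predict(test)  # already integer class labels, no truncation needed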
model = grid_search.GridSearchCV(estimator=clf, param_grid=param_grid, n_jobs=-1, cv=5, verbose=0, scoring=RMSE) errors = [] X_train = df.drop(['product_uid', 'id', 'relevance'], axis=1).values y_train = df['relevance'].values model.fit(X_train, y_train) print("Best parameters found by grid search:") print(model.best_params_) print("Best CV score:") print(model.best_score_) del X_train, y_train kf = KFold(df.shape[0], n_folds=K_fold) for train_index, test_index in kf: train_set = df.iloc[train_index] test_set = df.iloc[test_index] y_train = train_set['relevance'].values X_train = train_set.drop(['product_uid', 'id', 'relevance'], axis=1).values y_test = train_set['relevance'].values X_test = test_set.drop(['product_uid', 'id', 'relevance'], axis=1).values clf2.fit(X_train,y_train) result = clf2.predict(X_test) error = np.sqrt(mean_squared_error(result,y_test)) errors.extend([error]) print np.mean(errors)
rf = RandomForestRegressor() br = BaggingRegressor(rf) pipe = pipeline.Pipeline([('rf', rf), ('br', br)]) parameters = dict(rf__n_estimators=[5, 10, 15, 20], rf__max_depth=[2, 4, 6, 8, 10], rf__random_state=[0, 5, 10, 15], br__n_estimators=[5, 15, 25, 35, 45, 55], br__max_samples=[0.1, 0.2, 0.3], br__random_state=[0, 5, 10, 15, 20, 25, 30]) model = grid_search.GridSearchCV(pipe, parameters) model.fit(features_train, labels_train) print("Best parameters:") print(model.best_params_) print("Best CV score:") print(model.best_score_) #Best parameters: #{'br__max_samples': 0.1, 'br__n_estimators': 45, 'rf__max_depth': 6, 'br__random_state': 25, 'rf__random_state': 0, 'rf__n_estimators': 5} #Best CV score: 0.13390585367 pred = model.predict(features_test) """ # Use the best parameters from gridsearch rf = RandomForestRegressor(n_estimators=5, max_depth=6, random_state=0) clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25) clf.fit(features_train, labels_train) pred = clf.predict(features_test) # Write predicted numbers to submission.csv file pd.DataFrame({"id": id_test, "relevance": pred}).to_csv('submission.csv',index=False)
lm_bagged = BaggingRegressor( base_estimator = lm, n_estimators = 75, max_samples = n_samp, max_features = n_feat, bootstrap = True, oob_score = False, warm_start = False, n_jobs = -1 ) log_bagged = BaggingClassifier( base_estimator = log, n_estimators = 75, max_samples = n_samp, max_features = n_feat, bootstrap = True, oob_score = False, warm_start = False, n_jobs = -1 ) lm_bagged.fit(X = train[features], y = train['y']) log_bagged.fit(X = train[features], y = train['y']) lm_bagged_preds = lm_bagged.predict(X = test[features]) log_bagged_preds = log_bagged.predict_proba(X = test[features]) write_function(lm_bagged_preds, '/tmp/lm_bagged_preds_nsamp-%s_nfeat-%s.txt' % (n_samp, n_feat)) write_function(second_pos_clip(log_bagged_preds), '/tmp/log_bagged_preds_nsamp-%s_nfeat-%s.txt' % (n_samp, n_feat))
#RMSE : 0.508722146314 #trainSetFeatures = SelectKBest(f_regression, k=180).fit_transform(trainSetFeatures, trainSetLabels) #testSetFeatures = SelectKBest(f_regression, k=180).fit_transform(testSetFeatures, testSetLabels) #RMSE : 0.508874063656 #trainSetFeatures = SelectKBest(f_regression, k=200).fit_transform(trainSetFeatures, trainSetLabels) #testSetFeatures = SelectKBest(f_regression, k=200).fit_transform(testSetFeatures, testSetLabels) #RMSE :0.512003679803 #trainSetFeatures = SelectKBest(f_regression, k=220).fit_transform(trainSetFeatures, trainSetLabels) #testSetFeatures = SelectKBest(f_regression, k=220).fit_transform(testSetFeatures, testSetLabels) print "\nBegin training..." #train the model random_forest_regressor = RandomForestRegressor(n_estimators=15, max_depth=100, random_state=0) bagging_regressor = BaggingRegressor(random_forest_regressor, n_estimators=45, max_samples=0.1, random_state=25) bagging_regressor.fit(trainSetFeatures, trainSetLabels) print "\nBegin prediction..." #make the prediction on the test set predictedLabels = bagging_regressor.predict(testSetFeatures) print "\nOutput the result..." #output the prediction testSetId = testSet['id'] pd.DataFrame({"id": testSetId, "relevance": predictedLabels}).to_csv('IOFolder/random_forest_results.csv', index=False) print "RMSE :\t", utils.getRMSE(testSetLabels, predictedLabels)
plt.legend(loc='upper right')
plt.grid(b=True)

plt.subplot(132)
t = np.arange(N)
plt.plot(t, x, 'r-', lw=1, label=u'original data')
plt.plot(abnormal, x[abnormal], 'go', markeredgecolor='g', ms=3, label=u'outliers')
plt.legend(loc='upper right')
plt.title(u'Outlier detection', fontsize=18)
plt.grid(b=True)

# prediction
plt.subplot(133)
select = np.ones(N, dtype=bool)
select[abnormal] = False
t = np.arange(N)
dtr = DecisionTreeRegressor(criterion='mse', max_depth=10)
br = BaggingRegressor(dtr, n_estimators=10, max_samples=0.3)
br.fit(t[select].reshape(-1, 1), x[select])
y = br.predict(np.arange(N).reshape(-1, 1))
y[select] = x[select]
plt.plot(x, 'g--', lw=1, label=u'original values')   # original values
plt.plot(y, 'r-', lw=1, label=u'corrected values')   # corrected values
plt.legend(loc='upper right')
plt.title(u'Outlier correction', fontsize=18)
plt.grid(b=True)

plt.tight_layout(1.5, rect=(0, 0, 1, 0.95))
plt.suptitle(u'Outlier detection and correction for the pollution-discharge data', fontsize=22)
plt.show()
# Getting Testing Data out of the DF test_data_frame = data_frame_regression.iloc[num_train:] # Getting IDs for Testing Data id_test = test_data_frame['id'] relevance_train = train_data_frame['relevance'].values # All the Independent Variables in the Regressor # These are Words in Title, Desription, Values X_train = train_data_frame.drop(['id', 'relevance'], axis=1).values # Same for Test Data X_test = test_data_frame.drop(['id', 'relevance'], axis=1).values # Using RandomForest Regressor rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0) # Using Bagging Regressor clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25) # Fit the Training Data to a Model clf.fit(X_train, relevance_train) # Predicting the relevance for Testind Data relevance_pred = clf.predict(X_test) # Writing the Relevance Values to Submission.csv pandas.DataFrame({"id": id_test, "relevance": relevance_pred}).to_csv('submission.csv', index=False)
for train, test in kf: TR.append(train) TS.append(test) A = [] B = [] for k in range(kfcv): print k X_train = X[TR[k], :] y_train = y[TR[k]] X_test = X[TS[k], :] y_test = y[TS[k]] model.fit(X_train, y_train) y_predict = model.predict(X_test) # plt.subplot(2, 10, k + 1) # plt.scatter(y_predict, y_test) # plt.xlabel('y_predict') # plt.ylabel('y_true') # plt.title('Fold = %d' % (k + 1)) A.extend(list(y_predict)) B.extend(list(y_test)) # mse = mean_squared_error(y_predict, y_test) # print 'mse = %f' % mse mse = mean_squared_error(A, B)
} model_gbr_allfeatures = grid_search.GridSearchCV(estimator =gbr, param_grid = parameters, n_jobs = -1, cv = 2, verbose = 20, scoring='mean_squared_error') model_gbr_allfeatures.fit(X_train, Y_train) print(model_gbr_allfeatures.best_params_) #'max_depth': 6, 'n_estimators': 500, 'learning_rate': 0.1, 'max_features': 'auto' predictions_gbr_allfeatures = model_gbr_allfeatures.predict(X_test) mean_squared_error(Y_test, predictions_gbr_allfeatures) #7.071566 #ensembling randomForest model using bagging bag = BaggingRegressor(rfr, n_estimators=500, max_samples=0.1, random_state=25) bag.fit(X_train, Y_train) predictions_rfr_bagging = bag.predict(X_test) mean_squared_error(Y_test, predictions_rfr_bagging) #recursive selection of features for randomForest from sklearn.feature_selection import RFECV rfecv = RFECV(estimator=rfr, step=1, cv=3, scoring='mean_squared_error') rfecv.fit(X_train, Y_train) print("Optimal number of features : %d" % rfecv.n_features_)
# midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) + (test_scores_mean[-1] - test_scores_std[-1])) / 2
# diff = (train_scores_mean[-1] + train_scores_std[-1]) - (test_scores_mean[-1] - test_scores_std[-1])
# return midpoint, diff
#
# plot_learning_curve(model, u"learning curve", X, Y)

# 6................... model ensembling .....................#
train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
train_np = train_df.as_matrix()

# y is the Survived outcome
y = train_np[:, 0]
x = train_np[:, 1:]

# fit into a BaggingRegressor
model = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
bagging_model = BaggingRegressor(model, n_estimators=20, max_samples=0.8, max_features=1.0,
                                 bootstrap=True, bootstrap_features=False, n_jobs=-1)
bagging_model.fit(x, y)

test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
predictions = bagging_model.predict(test)
result = pd.DataFrame({'PassengerId': data_test['PassengerId'].as_matrix(),
                       'Survived': predictions.astype(np.int32)})
result.to_csv('./result.csv', index=False)
if not os.path.exists(out_dir): os.mkdir(out_dir) filenames = [] for sts12_train_id, sts12_test_id, sts13_test_id, sts14_test_id in id_pairs: # combine 2012, 2013 training and test data X_sts12_train, y_sts12_train = ntnu_sts12.read_train_data(sts12_train_id, feats) X_sts12_test, y_sts12_test = ntnu_sts12.read_test_data(sts12_test_id, feats) X_sts13_test, y_sts13_test = sts13.read_test_data(sts13_test_id, feats) X_train = np.vstack([X_sts12_train, X_sts12_test, X_sts13_test]) y_train = np.hstack([y_sts12_train, y_sts12_test, y_sts13_test]) regressor.fit(X_train, y_train) X_test = read_blind_test_data(sts14_test_id, feats) y_test = regressor.predict(X_test) test_input = read_system_input(test_input_fnames[sts14_test_id]) postprocess(test_input, y_test) fname = "{}/STS-en.output.{}.txt".format(out_dir, sts14_test_id) write_scores(fname, y_test) filenames.append(fname) descr_fname = "{}/STS-en-{}-{}.description.txt".format(out_dir, GROUP, APPROACH) open(descr_fname, "w").write(DESCRIPTION) filenames.append(descr_fname) filenames = " ".join(filenames) zipfile = "STS-en-{}-{}.zip".format(GROUP, APPROACH)
]
full_predictions = []
for alg, predictors in algorithms:
    if alg == "xgboost_Label":
        full_predictions.append(xgboost_Label(train, test, labels))
    elif alg == "xgboost_Vect":
        full_predictions.append(xgboost_Vect(train, test, labels))
    elif alg == "xgboost_Dummies":
        full_predictions.append(xgboost_Dummies(train, test, labels))
    else:
        if predictors == "dummies":
            print("Train ", alg.__class__.__name__, " dummies Model ")
            alg = BaggingRegressor(alg)
            alg.fit(train_du, labels)
            print("Prediction :", alg.__class__.__name__, " dummies Model ")
            prediction = alg.predict(test_du)
            full_predictions.append(prediction)
        else:
            print("Train ", alg.__class__.__name__, " Label Model ")
            alg = BaggingRegressor(alg)
            alg.fit(train_rf, labels)
            print("Prediction :", alg.__class__.__name__, " Label Model ")
            prediction = alg.predict(test_rf)
            full_predictions.append(prediction)

# Ensemble models
RF_label_pred = full_predictions[0]
RF_dummies_pred = full_predictions[1]
pred_xgb_dummies = full_predictions[2]
pred_xgb_Label = full_predictions[3]
pred_xgb_Vect = full_predictions[4]
df_all['letter_in_description'] = df_all['product_info'].map( lambda x: str_common_letter(x.split('\t')[0], x.split('\t')[2])) print("Drop columns that were changed...") df_all = df_all.drop(['search_term', 'product_title', 'product_description', 'product_info'], axis=1) # Set up training and test sets df_train = df_all.iloc[:num_train] df_test = df_all.iloc[num_train:] id_test = df_test['id'] y_train = df_train['relevance'].values # Drop 'id' and 'relevance' columns from the training and test sets X_train = df_train.drop(['id', 'relevance'], axis=1).values X_test = df_test.drop(['id', 'relevance'], axis=1).values # Setup RandomForest and Bagging Regressors rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0) clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25) # Fit the training data into the regression model using the output values clf.fit(X_train, y_train) # Run the prediction y_pred = clf.predict(X_test) # Set up our Data Frame datafr = pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv('../dataset/submission.csv', index=False) print(datafr)
# -*- coding: utf-8 -*- import numpy as np import pandas as pd from sklearn.ensemble import BaggingRegressor from sklearn.tree import DecisionTreeRegressor import matplotlib.pyplot as plt def f(x): return 0.5 * np.exp(-(x+3)**2) + np.exp(-x**2) + 0.5 * np.exp(-(x-3)**2) N = 200 # 200 samples x_train = np.linspace(-5.5, 5.5, N) X_train = pd.DataFrame({"x": x_train}) y_train = f(x_train) + (np.random.rand(N) - 0.5) * (2 * 0.05) dtr = DecisionTreeRegressor(max_depth=5) br = BaggingRegressor(dtr, n_estimators=200, max_samples=0.2) br.fit(X_train, y_train) x_test = np.linspace(x_train.min() * 1.1, x_train.max() * 1.1, 1000) X_test = pd.DataFrame({"x": x_test}) y_test = f(x_test) y_predict = br.predict(X_test) plt.scatter(x_train, y_train) plt.scatter(x_test, y_test) plt.scatter(x_test, y_predict) plt.show()