predictions = np.zeros_like(valid_salaries)
for cur_class_id in range(num_classes + 1):
    predictions_part, idx = predict(cur_class_id)
    if idx is not None:
        predictions[idx] = predictions_part
        print("Part MAE: ", metric(valid_salaries[idx], predictions_part))

if submission:
    dio.save_prediction(name, predictions, type_n=type_v)
    dio.write_submission(name + ".csv", predictions=predictions)
else:
    dio.compare_valid_pred(valid_salaries, predictions)
    metric = dio.error_metric
    mae = metric(valid_salaries, predictions)
    print("MAE validation: ", mae)
    dio.save_model(ExtraTreesRegressor(), name, mae)
    dio.save_prediction(name, predictions, type_n=type_v)

#oob_predictions = classifier.oob_prediction_
#mae_oob = mean_absolute_error(salaries, oob_predictions)
#print("MAE OOB: ", mae_oob)
#classifier1 = ExtraTreesRegressor(n_estimators=n_trees,
#                                  verbose=1,
#                                  n_jobs=3,
#                                  oob_score=False,
#                                  min_samples_split=min_samples_split,
#                                  random_state=3465343)
#scores = cross_val_score(classifier1, features, salaries, cv=3, score_func=metric, verbose=1)
#print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() / 2))
#mae_cv = "%0.2f (+/- %0.2f)" % (scores.mean(), scores.std() / 2)
#dio.save_model(classifier, name, mae_cv=mae_cv, parameters=param)
def createExtraTree():
    clf = ExtraTreesRegressor(n_estimators=50)
    return clf
X.head()

# In[34]:
y.head()

# In[35]:
X.shape, y.shape

# In[36]:
# Now let's look at the feature importances
from sklearn.ensemble import ExtraTreesRegressor
model = ExtraTreesRegressor()

# In[37]:
model.fit(X, y)

# In[38]:
print(model.feature_importances_)

# In[39]:
# Let's plot the feature importances for better visualization
feat_importance = pd.Series(model.feature_importances_, index=X.columns)
feat_importance
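# The cell above builds the importance Series but never draws the chart the
# comment promises. A minimal plotting sketch (an addition, assuming
# matplotlib is available and `feat_importance` comes from the cell above):
import matplotlib.pyplot as plt

# Show the ten most important features as a horizontal bar chart.
feat_importance.nlargest(10).plot(kind='barh')
plt.xlabel('importance')
plt.tight_layout()
plt.show()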
!pip3 install catboost

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor, XGBRFRegressor
from catboost import CatBoostRegressor
from sklearn import metrics

kf = KFold(shuffle=True, random_state=19)

trees = {
    'linear': LinearRegression(),
    'randomfor': RandomForestRegressor(random_state=19),
    'gradientb': GradientBoostingRegressor(random_state=19),
    'xgb': XGBRegressor(random_state=19),
    'xgbrf': XGBRFRegressor(random_state=19),
    'catboost': CatBoostRegressor(random_state=19, silent=True),
    'DecisionTr': DecisionTreeRegressor(random_state=19),
    'extratre': ExtraTreesRegressor(random_state=19),
}

scores = []
rmse = []
mse = []
mae = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    regression_model = LinearRegression()
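# The loop above stops after instantiating a single LinearRegression; one
# plausible way to score every entry in `trees` with the same folds (a sketch,
# not the original code; assumes X, y, kf, and `metrics` from above):
import numpy as np

for name, model in trees.items():
    fold_mae = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        fold_mae.append(metrics.mean_absolute_error(y_test, pred))
    print(f"{name}: mean MAE {np.mean(fold_mae):.4f}")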
x_t1 = np.zeros(hours)
u_T = np.empty((0, hours))
u_t = np.zeros(hours)
r_T = np.empty((0, hours))
r_t = np.zeros(hours)
q_T = np.empty((0, hours))
q_t = np.zeros(hours)
rew = []
r_cum = 0
trees = []
model = []
for i in range(hours):
    inputs = pd.DataFrame({'X': [x_t[i]], 'U': [u_t[i]], 't': [i + 7]})
    outputs = q_t[i]
    model = ExtraTreesRegressor(n_estimators=50)
    model.fit(inputs, [outputs])
    trees.append(model)

# Build the set of four-tuples (x_t, u_t, r_t, x_{t+1})
for d in range(days):
    x_k = 0
    r_cum = 0
    for i, t in enumerate(t_time):
        x_t[i] = x_k
        model = trees[i]
        if np.random.random() > epsilon:
            a = []
            for u in U:
                a.append(model.predict(
# Ensemble methods, each scaled inside a pipeline
ensembles = []
ensembles.append(('ScaledAB', Pipeline([('Scaler', StandardScaler()), ('AB', AdaBoostRegressor())])))
ensembles.append(('ScaledGBM', Pipeline([('Scaler', StandardScaler()), ('GBM', GradientBoostingRegressor())])))
ensembles.append(('ScaledRF', Pipeline([('Scaler', StandardScaler()), ('RF', RandomForestRegressor())])))
ensembles.append(('ScaledET', Pipeline([('Scaler', StandardScaler()), ('ET', ExtraTreesRegressor())])))

results = []
names = []
for name, model in ensembles:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
mae_rf = metrics.mean_absolute_error(y_test, forest_test_pred)

#%%
from sklearn.model_selection import cross_val_score

forest_scores = cross_val_score(regr, x, y, cv=5, scoring='neg_mean_absolute_error')
print(f'Cross-validation scores for RF: {-np.mean(forest_scores)}')

#%%
extra = ExtraTreesRegressor(n_estimators=300, criterion='mae', random_state=1, n_jobs=-1)
regr = TransformedTargetRegressor(regressor=extra, func=func, inverse_func=inverse_func)
regr.fit(x_train, y_train)
extra_train_pred = regr.predict(x_train)
extra_test_pred = regr.predict(x_test)
print('MAE train data: %.3f, MAE test data: %.3f' % (
    metrics.mean_absolute_error(y_train, extra_train_pred),
    metrics.mean_absolute_error(y_test, extra_test_pred)))
mae_extra = metrics.mean_absolute_error(y_test, extra_test_pred)

#%%
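# The TransformedTargetRegressor above relies on `func` and `inverse_func`
# defined elsewhere. One plausible definition, if the target was modelled on a
# log scale (an assumption, not taken from the original script):
import numpy as np

func = np.log1p          # forward transform applied to y before fitting
inverse_func = np.expm1  # inverse applied to the predictions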
import pandas as pd
import sklearn
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv('deploy_df')
df.drop('Unnamed: 0', axis=1, inplace=True)
print(df.head())

x = df.drop('rate', axis=1)
y = df['rate']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3, random_state=10)

# Build the model
ET_Model = ExtraTreesRegressor(n_estimators=120)
ET_Model.fit(x_train, y_train)
y_predict = ET_Model.predict(x_test)
print(y_predict)

# Save the model with pickle
import pickle
pickle.dump(ET_Model, open('model.pkl', 'wb'))
model = pickle.load(open('model.pkl', 'rb'))
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'], random_state=123)

# Average CV score on the training set was: 0.9721928061326809
exported_pipeline = make_pipeline(
    make_union(
        FunctionTransformer(copy),
        SelectFromModel(
            estimator=ExtraTreesRegressor(max_features=0.6000000000000001, n_estimators=100),
            threshold=0.2)),
    GaussianProcessRegressor(kernel=Matern(length_scale=3.7, nu=1.5),
                             n_restarts_optimizer=20, normalize_y=False))

# Fix random state for all the steps in the exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 123)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
print("Coefficient of determination R^2 <-- on test set: {}".format(
    support_regressor.score(X_test, y_test)))

dtr = DecisionTreeRegressor()
dtr.fit(X_train, y_train)
print("Coefficient of determination R^2 <-- on train set: {}".format(
    dtr.score(X_train, y_train)))
print("Coefficient of determination R^2 <-- on test set: {}".format(
    dtr.score(X_test, y_test)))

indiana_jones = Lasso(alpha=1.0)
indiana_jones.fit(X_train, y_train)
print("Coefficient of determination R^2 <-- on train set: {}".format(
    indiana_jones.score(X_train, y_train)))
print("Coefficient of determination R^2 <-- on test set: {}".format(
    indiana_jones.score(X_test, y_test)))

etr = ExtraTreesRegressor(n_estimators=300)
etr.fit(X_train, y_train)
print(etr.feature_importances_)

indices = np.argsort(etr.feature_importances_)[::-1]
plt.figure(num=None, figsize=(14, 10), dpi=80, facecolor='w')
plt.title("Feature importances")
plt.bar(range(X_train.shape[1]), etr.feature_importances_[indices], color="r", align="center")
plt.xticks(range(X_train.shape[1]), indices)
plt.show()
xtrain, ytrain, xtest, x_cv, y_cv = (np.array(train)[:, :20], np.array(train)[:, 20],
                                     np.array(test)[:, :20], np.array(data_10)[:, :20],
                                     np.array(data_10)[:, 20])
print('train', np.array(train).shape)
print(xtrain[1])
print('xtrain', xtrain.shape)
print('ytrain', ytrain.shape)
print('test', xtest.shape)

estimators = 100
#sup_vec = svm.SVC(C=11000, verbose=2, probability=True)
#sup_vec = RandomForestRegressor(n_estimators=estimators, verbose=2, n_jobs=-1, max_leaf_nodes=100)
#sup_vec = ExtraTreesRegressor(n_estimators=estimators, verbose=2, n_jobs=-1, max_leaf_nodes=100)
#sup_vec = AdaBoostRegressor(RandomForestRegressor(n_estimators=100, verbose=2, n_jobs=-1), n_estimators=100)
sup_vec = AdaBoostRegressor(ExtraTreesRegressor(n_estimators=100, verbose=2, n_jobs=-1),
                            n_estimators=160, loss='exponential')
#sup_vec = AdaBoostRegressor(DecisionTreeRegressor(max_depth=10), n_estimators=300)
#dt_stump = DecisionTreeClassifier(max_depth=4, min_samples_leaf=1)
#dt_stump.fit(xtrain, ytrain)
#dt_stump_err = 1.0 - dt_stump.score(xtrain, ytrain)
#n_estimators = 400
# A learning rate of 1. may not be optimal for both SAMME and SAMME.R
#learning_rate = 1.
#sup_vec = AdaBoostClassifier(
#    base_estimator=dt_stump,
# ExtraTreesRegressor: rather than trying every variable, try a few splits with a few
# variables, thus reducing correlation between the individual trees
########################################
from BlueBookForBullDozers.utilities import proc_df, split_vals, print_score, draw_tree, plt, set_rf_samples

# Read the dataframe from the saved feather file
df_raw = pd.read_feather("tmp/raw")

# Create the instance of the model we want.
# If we remove max_depth, #leaves = #rows in the data set, i.e. highly overfit trees:
# m = RandomForestRegressor(n_jobs=-1, n_estimators=1, max_depth=3, bootstrap=False)
# min_samples_leaf: stop when the number of samples in a leaf becomes <= 3
# max_features: only use half of the columns at every split
m = ExtraTreesRegressor(n_jobs=-1, n_estimators=40, min_samples_leaf=3,
                        max_features=0.5, oob_score=False)

# Replace categorical variables by their numeric codes, handle missing values,
# and separate out SalePrice
df, y, _ = proc_df(df_raw, y_fld="SalePrice")

# Create the validation set and the final training set
n_validation = 12000
n_training = len(df) - n_validation
X_training, X_validation = split_vals(df, n_training)
y_training, y_validation = split_vals(y, n_training)

# Rather than limiting the training data to the first 30k rows or any contiguous
# subset, sample a random 20k rows. This way, given enough trees, we can access
# the entire data set.
set_rf_samples(20000)
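# To see the trade-off the opening comment describes, a minimal sketch
# comparing the two ensembles on the same split (assumes X_training,
# y_training, X_validation, y_validation from above; not part of the original):
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

# Same tree budget; ExtraTrees draws split thresholds at random, which
# decorrelates the trees at the cost of a little extra bias per tree.
for cls in (RandomForestRegressor, ExtraTreesRegressor):
    model = cls(n_jobs=-1, n_estimators=40, min_samples_leaf=3, max_features=0.5)
    model.fit(X_training, y_training)
    print(cls.__name__, model.score(X_validation, y_validation))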
""" 5: t-SNE """ tsne = manifold.TSNE(n_components=3) model_tsne = tsne.fit(d7) model_tsne.metric data_tsne = tsne.fit_transform(d7) data_pca_fa_ica_tsne = tsne.fit_transform(data_pca_fa_ica) """------------------------------------------------------------------------ """ """Step 15: Regression Modelling """ """------------------------------------------------------------------------ """ """Regression Models """ forest = ExtraTreesRegressor(n_estimators=250,random_state=0) rforest = RandomForestRegressor() params = {'n_estimators': 10, 'max_depth': 10, 'min_samples_split': 10, 'learning_rate': 0.1, 'loss': 'ls'} clf = ensemble.GradientBoostingRegressor(**params) """ 1: ML No Dimensionality Reduction """ """------------------------------------------------------------------------ """ X = d7.copy(deep=True) y=target X = X.astype(np.float32) offset = int(X.shape[0] * 0.9) X_train, y_train = X[:offset], y[:offset] X_test, y_test = X[offset:], y[offset:]
score = clf.score(X_test, Y_test)
print(score)
y_pred = clf.predict(X_test)

names = [
    "Decision Tree Regressor", "MLP Regressor", "Random Forest Regressor",
    "AdaBoost", "Bagging Regressor", "Extra Trees Regressor"
]
regressors = [
    DecisionTreeRegressor(max_depth=5, max_features=1),
    MLPRegressor(alpha=1, max_iter=200, power_t=0.9, batch_size=50),
    RandomForestRegressor(max_depth=5, max_features=1, n_estimators=10),
    AdaBoostRegressor(n_estimators=10),
    BaggingRegressor(max_features=1, n_estimators=10, base_estimator=clf),
    ExtraTreesRegressor(max_depth=5)
]
for name, clf in zip(names, regressors):
    clf.fit(X_train, Y_train)
    score = clf.score(X_test, Y_test)
    y_pred = clf.predict(X_test)
    print(name + ": " + str(score))
    msle = mean_squared_log_error(Y_test, y_pred)
    print('MSLE: %.4f' % msle)
    # print(confusion_matrix(Y_test, y_pred, labels=None))
    # print(cohen_kappa_score(Y_test, y_pred, labels=None))
    # print(classification_report(Y_test, y_pred, labels=None))
                        n_jobs=20, random_state=2017, max_features="auto", verbose=1)
adaboost = AdaBoostRegressor(n_estimators=30, random_state=2017, learning_rate=0.01)
gbdt = GradientBoostingRegressor(learning_rate=0.04, n_estimators=100, subsample=0.8,
                                 random_state=2017, max_depth=5, verbose=1)
extratree = ExtraTreesRegressor(n_estimators=600, max_depth=8, max_features="auto",
                                n_jobs=20, random_state=2017, verbose=1)
lr_reg = LinearRegression(n_jobs=-1)

#############################################################################################
# parameters : regression ###################################################################
#############################################################################################

if __name__ == '__main__':
    for i in range(1, folds + 1):
        train_xgboost_regression(trainingSet, testingSet, feature_names, i, nbags,
                                 xgbModelName, xgbParameters, num_rounds)
    fulltrain_xgboost_regression(trainingSet, testingSet, feature_names, nbags,
                                 xgbModelName, xgbParameters, num_rounds)
X_train, X_test, y_train, y_test = train_test_split(dat1, target, test_size=0.2, random_state=42)
y_train = y_train.values.ravel()

models = []
models.append(('SVR', SVR()))
models.append(('KNN', KNeighborsRegressor()))
models.append(('DT', DecisionTreeRegressor()))
models.append(('RF', RandomForestRegressor()))
models.append(('L', Lasso()))
models.append(('EN', ElasticNet()))
models.append(('R', Ridge()))
models.append(('BR', BayesianRidge()))
models.append(('GBR', GradientBoostingRegressor()))
models.append(('AB', AdaBoostRegressor()))
models.append(('ET', ExtraTreesRegressor()))
models.append(('BgR', BaggingRegressor()))

scoring = 'neg_mean_squared_error'
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=42)
    cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

pipeline = make_pipeline(preprocessing.StandardScaler(), GradientBoostingRegressor(random_state=42))
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

imputer = SimpleImputer(strategy="median")
imputer.fit(training_features)
training_features = imputer.transform(training_features)
testing_features = imputer.transform(testing_features)

# Score on the training set was: -0.04520857750956616
exported_pipeline = make_pipeline(
    VarianceThreshold(threshold=0.15000000000000002),
    ExtraTreesRegressor(bootstrap=False, max_features=0.6500000000000001,
                        min_samples_leaf=11, min_samples_split=8, n_estimators=100))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
def feature_selection(self, X, y, method):
    """
    Purpose: select features
    Input:
        X: train data
        y: label
        method: method to use
    Return:
    """
    X_indices = np.arange(X.shape[-1])
    score = []

    # Removing features with low variance
    # Correlation coefficient:
    # SelectKBest(lambda X, Y: np.array(map(lambda x: pearsonr(x, Y), X.T)).T, k=2).fit_transform(data, target)
    # Mutual information:
    # SelectKBest(lambda X, Y: array(map(lambda x: mic(x, Y), X.T)).T, k=2).fit_transform(data, target)

    # Univariate feature selection (for classification)
    if method == 'chi-squared':
        skb = SelectKBest(chi2)
        skb.fit_transform(X, y)
        score = skb.scores_
    # Univariate feature selection (for regression)
    elif method == 'f_regression':
        skb = SelectKBest(f_regression)
        skb.fit_transform(X, y)
        score = skb.scores_
    # L1-based feature selection (for classification)
    elif method == 'LinearSVC':
        lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
        sfm = SelectFromModel(lsvc, prefit=True)
        X_new = sfm.transform(X)
    # L1-based feature selection (for regression)
    elif method == 'LassoCV':
        lasso = LassoCV().fit(X, y)
        score = lasso.coef_
        sfm = SelectFromModel(lasso, threshold=0.25, prefit=True)
        X_new = sfm.transform(X)
    # Tree-based feature selection (for classification)
    elif method == 'ExtraTreesClassifier':
        clf = ExtraTreesClassifier()
        clf = clf.fit(X, y)
        score = clf.feature_importances_
        sfm = SelectFromModel(clf, threshold=0.25, prefit=True)
        X_new = sfm.transform(X)
    # Tree-based feature selection (for regression)
    elif method == 'ExtraTreesRegressor':
        clf = ExtraTreesRegressor()
        clf = clf.fit(X, y)
        score = clf.feature_importances_
        sfm = SelectFromModel(clf, threshold=0.25, prefit=True)
        X_new = sfm.transform(X)
    # Tree-based feature selection (for classification)
    elif method == 'GradientBoostingClassifier':
        clf = GradientBoostingClassifier(learning_rate=0.01)
        clf = clf.fit(X, y)
        score = clf.feature_importances_
        sfm = SelectFromModel(clf, threshold=0.25, prefit=True)
        X_new = sfm.transform(X)
    # Tree-based feature selection (for regression)
    elif method == 'GradientBoostingRegressor':
        clf = GradientBoostingRegressor(learning_rate=0.01)
        clf = clf.fit(X, y)
        score = clf.feature_importances_
        sfm = SelectFromModel(clf, threshold=0.25, prefit=True)
        X_new = sfm.transform(X)

    # Print the feature ranking
    indices = np.argsort(score)[::-1]
    print("Feature ranking:")
    for f in X_indices:
        print("feature %d: %s (%f)" % (indices[f], self.columns[indices[f]], score[indices[f]]))

    # Draw the plot
    plt.figure()
    # plt.bar(indices, score, width=0.2, color='r')
    plt.barh(indices, score, height=0.2, color='r')
    plt.title(method)
    plt.xlabel("score")
    plt.ylabel("feature")
    plt.grid(axis='x')
    plt.show()
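# Hypothetical usage of the method above (`selector` stands in for an instance
# of the class that defines feature_selection and carries a `columns`
# attribute with feature names; not part of the original):
selector.feature_selection(X, y, method='ExtraTreesRegressor')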
n_faces = 5
rng = check_random_state(4)
face_ids = rng.randint(test.shape[0], size=(n_faces,))
test = test[face_ids, :]

n_pixels = data.shape[1]
X_train = train[:, :int(np.ceil(0.5 * n_pixels))]   # Upper half of the faces
y_train = train[:, int(np.floor(0.5 * n_pixels)):]  # Lower half of the faces
X_test = test[:, :int(np.ceil(0.5 * n_pixels))]
y_test = test[:, int(np.floor(0.5 * n_pixels)):]

# Fit estimators
ESTIMATORS = {
    "Extra trees": ExtraTreesRegressor(n_estimators=10, max_features=32, random_state=0),
    "K-nn": KNeighborsRegressor(),
    "Linear regression": LinearRegression(),
    "Ridge": RidgeCV(),
    "Lasso": Lasso(),
    # "ElasticNet_0.5": ElasticNet(alpha=100000, l1_ratio=0.001),
    # "ElasticNet_0.1": ElasticNet(alpha=0.0001, l1_ratio=0.01),
}

y_test_predict = dict()
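# The snippet stops after declaring y_test_predict; the natural next step is
# the multi-output fit/predict loop the ESTIMATORS dict sets up (each model
# predicts the lower half of a face from the upper half):
for name, estimator in ESTIMATORS.items():
    estimator.fit(X_train, y_train)
    y_test_predict[name] = estimator.predict(X_test)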
# IterativeImputer is experimental and needs the explicit enable import:
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

    # dft = dft.drop(columns=i)
    # dfv = dfv.drop(columns=i)
    # dfkaggle2 = dfkaggle2.drop(columns=i)
    l.append(i)

encoder = ce.CatBoostEncoder(cols=l, return_df=1, drop_invariant=1,
                             handle_missing=False, sigma=None, a=2)
encoder.fit(X=dft, y=dft['Total_Yearly_Income_EUR'])
dft = encoder.transform(dft)
dfv = encoder.transform(dfv)
dfkaggle2 = encoder.transform(dfkaggle2)

imp = IterativeImputer(max_iter=3, estimator=ExtraTreesRegressor(), n_nearest_features=5)

dfc = dft.copy()
dft = dft.drop(columns='Total_Yearly_Income_EUR')
dfvc = dfv.copy()
dfv = dfv.drop(columns='Total_Yearly_Income_EUR')
dfkc = dfkaggle2.copy()
dfkaggle2 = dfkaggle2.drop(columns='Total_Yearly_Income_EUR')
dfcolumns = dft.columns
imp.fit(dft)
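# `imp` is fitted above but never applied; a minimal sketch of the transform
# step (assumes the fitted imputer and `dfcolumns` from above; not original):
import pandas as pd

# IterativeImputer returns bare arrays, so restore the column labels.
dft = pd.DataFrame(imp.transform(dft), columns=dfcolumns)
dfv = pd.DataFrame(imp.transform(dfv), columns=dfcolumns)
dfkaggle2 = pd.DataFrame(imp.transform(dfkaggle2), columns=dfcolumns)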
boston = load_boston()
X, y = boston.data, boston.target

# Make train/test split.
# As usual in a machine learning task we have X_train, y_train, and X_test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# print(X_train.shape)

# First-layer models
models_1 = [
    ExtraTreesRegressor(random_state=0, n_jobs=1, n_estimators=100, max_depth=3),
    RandomForestRegressor(random_state=0, n_jobs=1, n_estimators=100, max_depth=3),
    XGBRegressor(random_state=0, n_jobs=1, learning_rate=0.1, n_estimators=100, max_depth=3)
]

# Second-layer models
models_2 = [
    GradientBoostingRegressor(random_state=0),
    SVR(),
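# The snippet cuts off inside models_2, before the stacking step itself; one
# common way to turn the first-layer models into features for a second layer
# is out-of-fold prediction (a sketch, not the original code; assumes
# models_1, X_train, y_train, X_test from above):
import numpy as np
from sklearn.model_selection import cross_val_predict

# Out-of-fold predictions become the second-layer training matrix;
# full-fit predictions on the test split become its test matrix.
stacked_train = np.column_stack(
    [cross_val_predict(m, X_train, y_train, cv=5) for m in models_1])
stacked_test = np.column_stack(
    [m.fit(X_train, y_train).predict(X_test) for m in models_1])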
def featureSelection(X, y, method='lasso', select=500):
    t0 = time.time()

    # sparse (15 seconds)
    if method == 'lasso':
        from sklearn import linear_model
        a = 0.861 if select == 500 else 0.0755
        lasso = linear_model.Lasso(alpha=a)
        lasso.fit(X, y)
        XSelected = X[:, lasso.coef_ != 0]
        indices = np.where(lasso.coef_ != 0)
        if len(indices[0]) > select:
            indices = np.argsort(-lasso.coef_)[:select]

    # non-sparse (157 seconds)
    if method == 'rf':
        from sklearn.ensemble import ExtraTreesRegressor
        from sklearn.feature_selection import SelectFromModel
        t = ExtraTreesRegressor(n_estimators=50)
        t.fit(X, y)
        model = SelectFromModel(t, prefit=True, max_features=select)
        XSelected = model.transform(X)
        indices = np.where(model.get_support())

    # non-sparse (8.5 seconds)
    if method == 'svm':
        from sklearn.svm import SVR
        from sklearn.feature_selection import SelectFromModel
        SVMReg = SVR(kernel='linear', gamma='scale', C=1.0, epsilon=0.2)
        SVMReg.fit(X, y)
        model = SelectFromModel(SVMReg, prefit=True, max_features=select)
        XSelected = model.transform(X)
        indices = np.where(model.get_support())

    # wrapper model (preset number of features) (1000 seconds / 5000 seconds)
    if method == 'hsiclasso':
        from pyHSICLasso import HSICLasso
        hsic_lasso = HSICLasso()
        hsic_lasso.input(X, y)
        hsic_lasso.regression(select)
        XSelected = X[:, hsic_lasso.get_index()]
        indices = hsic_lasso.get_index()

    # Dimensionality reduction alternatives: PCA, MDS, PLS, DWT

    # f = h5py.File('selected/' + str(select) + '/X_' + method + '.hdf5', "w")
    # f.create_dataset('X', data=XSelected)
    # f.create_dataset('indices', data=indices)
    # f.close()
    # return indices
    np.savetxt('selected/' + str(select) + '/X_' + method + '.dat', indices)
    # np.savetxt('selected/' + str(select) + '/X_' + method + '.dat', XSelected)
    print("--- %s seconds ---" % (time.time() - t0))
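# Hypothetical call of the helper above with the ExtraTrees-based path
# (assumes X, y in memory and that the selected/500/ output directory exists):
featureSelection(X, y, method='rf', select=500)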
Xtrain_scaled = robustscaler.transform(Xtrain)
Xtest_scaled = robustscaler.transform(Xtest)
Xvalidate_scaled = robustscaler.transform(Xvalidate)

# Transform pandas objects into numpy arrays (no need to do it if you are
# scaling the results)
Ytrain = Ytrain.values
Ytest = Ytest.values
Yvalidate = Yvalidate.values

# Best pipeline recommended by TPOT
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=LinearRegression()),
    StackingEstimator(estimator=ExtraTreesRegressor(bootstrap=True, max_features=0.9000000000000001,
                                                    min_samples_leaf=3, min_samples_split=10,
                                                    n_estimators=100, random_state=42)),
    ExtraTreesRegressor(bootstrap=False, max_features=0.55, min_samples_leaf=5,
                        min_samples_split=17, n_estimators=100, random_state=42)
)
exported_pipeline.fit(Xtrain_scaled, Ytrain)

print('Print MAE - test')
y_predicted = exported_pipeline.predict(Xtest_scaled)
mae = mean_absolute_error(Ytest, y_predicted)
print(mae)
def _select_estimator(estimator, n_jobs, n_estimators, random_state=None):
    '''Select estimator and parameters from argument name.'''
    # Regressors
    if estimator == 'RandomForestRegressor':
        param_dist = {**parameters['ensemble'], **parameters['bootstrap']}
        estimator = RandomForestRegressor(n_jobs=n_jobs, n_estimators=n_estimators,
                                          random_state=random_state)
    elif estimator == 'ExtraTreesRegressor':
        param_dist = {**parameters['ensemble'], **parameters['bootstrap']}
        estimator = ExtraTreesRegressor(n_jobs=n_jobs, n_estimators=n_estimators,
                                        random_state=random_state)
    elif estimator == 'GradientBoostingRegressor':
        param_dist = parameters['ensemble']
        estimator = GradientBoostingRegressor(n_estimators=n_estimators,
                                              random_state=random_state)
    elif estimator == 'SVR':
        param_dist = {**parameters['svm'], 'epsilon': [0.0, 0.1]}
        estimator = SVR(kernel='rbf')
    elif estimator == 'LinearSVR':
        param_dist = {**parameters['svm'], 'epsilon': [0.0, 0.1]}
        estimator = SVR(kernel='linear')
    elif estimator == 'Ridge':
        param_dist = parameters['linear']
        estimator = Ridge(solver='auto', random_state=random_state)
    elif estimator == 'Lasso':
        param_dist = parameters['linear']
        estimator = Lasso(random_state=random_state)
    elif estimator == 'ElasticNet':
        param_dist = parameters['linear']
        estimator = ElasticNet(random_state=random_state)
    elif estimator == 'KNeighborsRegressor':
        param_dist = parameters['kneighbors']
        estimator = KNeighborsRegressor(algorithm='auto')
    # Classifiers
    elif estimator == 'RandomForestClassifier':
        param_dist = {**parameters['ensemble'], **parameters['bootstrap'],
                      **parameters['criterion']}
        estimator = RandomForestClassifier(n_jobs=n_jobs, n_estimators=n_estimators,
                                           random_state=random_state)
    elif estimator == 'ExtraTreesClassifier':
        param_dist = {**parameters['ensemble'], **parameters['bootstrap'],
                      **parameters['criterion']}
        estimator = ExtraTreesClassifier(n_jobs=n_jobs, n_estimators=n_estimators,
                                         random_state=random_state)
    elif estimator == 'GradientBoostingClassifier':
        param_dist = parameters['ensemble']
        estimator = GradientBoostingClassifier(n_estimators=n_estimators,
                                               random_state=random_state)
    elif estimator == 'LinearSVC':
        param_dist = parameters['linear_svm']
        estimator = LinearSVC(random_state=random_state)
    elif estimator == 'SVC':
        param_dist = parameters['svm']
        estimator = SVC(kernel='rbf', random_state=random_state)
    elif estimator == 'KNeighborsClassifier':
        param_dist = parameters['kneighbors']
        estimator = KNeighborsClassifier(algorithm='auto')
    return param_dist, estimator
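# The (param_dist, estimator) pair this helper returns is shaped for a
# hyperparameter search; a usage sketch (assumes the module-level `parameters`
# dict plus X_train, y_train; not part of the original module):
from sklearn.model_selection import RandomizedSearchCV

param_dist, est = _select_estimator('ExtraTreesRegressor', n_jobs=-1,
                                    n_estimators=100, random_state=0)
search = RandomizedSearchCV(est, param_distributions=param_dist, n_iter=20, cv=3)
search.fit(X_train, y_train)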
#ranked_features = []
#for f in range(X.shape[1]):
#    ranked_features.append(features[indices[f]])
#    print("%d. %s (%f)" % (f + 1, features[indices[f]], importances[indices[f]]))

# Plot the feature importances of the forest
#plt.figure()
#plt.title("Feature importances")
#plt.bar(range(X.shape[1]), importances[indices],
#        color="r", yerr=std[indices], align="center")
#plt.xticks(range(X.shape[1]), ranked_features, fontsize=6.5)
#plt.xlim([-1, X.shape[1]])
#plt.savefig('random_forrest_feature_importance.png')

for score in scores:
    forest = GridSearchCV(ExtraTreesRegressor(random_state=1), tuned_parameters,
                          verbose=10, cv=4, n_jobs=4, scoring='%s' % score)
    model = forest.fit(X_train, y_train)
    model_train = model.predict(X_train)
    model_test = model.predict(X_test)
    r2_score_train = r2_score(y_train, model_train)
    mse_score_train = mean_squared_error(y_train, model_train)
    rmse_score_train = np.sqrt(mse_score_train)
    r2_score_test = r2_score(y_test, model_test)
    mse_score_test = mean_squared_error(y_test, model_test)
def __init__(self):
    self.etr_model = ExtraTreesRegressor(n_estimators=200, random_state=2019)
df_2 = pd.read_csv("/home/mldm/covid_BRTT/dataset_finali/" + dataset,
                   parse_dates=["Data"], infer_datetime_format=True)
rnd_state = rnd

################################################################################
train_2 = df_2.tail(int(len(df_2) * 0.7))
test_2 = df_2.drop(train_2.index)
train_X_2 = train_2[predictor_columns]
train_y_2 = train_2[feature]
test_X_2 = test_2[predictor_columns]
test_y_2 = test_2[feature]

################################################################################
extra_tree_regressor = ExtraTreesRegressor(bootstrap=False, random_state=rnd_state)

################################################################################
imp = SimpleImputer(missing_values=np.nan, strategy="mean")
imp = imp.fit(train_X_2)

################################################################################
grid_regressor_2 = GridSearchCV(extra_tree_regressor, param_grd, n_jobs=-1, verbose=0)

################################################################################
grid_regressor_2.fit(imp.transform(train_X_2), train_y_2)

################################################################################
best_regressor_2 = grid_regressor_2.best_estimator_

imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
imputer = imputer.fit(test_X_2)
train = train.drop('Item_Outlet_Sales', axis=1)
features = train.columns
target = 'Item_Outlet_Sales'
X_train, X_test = train, test

# Starting with different supervised learning algorithms, let us check which
# one gives us the best results
print('=====\nTrain against different learning algorithms and pick one that produces the best result\n=====')
model_factory = [
    RandomForestRegressor(),
    # XGBRegressor(nthread=1),
    # MLPRegressor(),
    Ridge(),
    BayesianRidge(),
    ExtraTreesRegressor(),
    ElasticNet(),
    KNeighborsRegressor(),
    GradientBoostingRegressor()
]

for model in model_factory:
    model.seed = 42
    num_folds = 3
    scores = cross_val_score(
        model, X_train, y_train, cv=num_folds,
        scoring='neg_mean_squared_error'
def main():
    boston = loadData()
    X = boston.data
    Y = boston.target
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.25, random_state=33)

    print('The max target value is', np.max(boston.target))
    print('The min target value is', np.min(boston.target))
    print('The average target value is', np.mean(boston.target))

    # Standardize the data
    ss_X = StandardScaler()
    ss_Y = StandardScaler()
    X_train = ss_X.fit_transform(X_train)
    X_test = ss_X.transform(X_test)
    Y_train = ss_Y.fit_transform(Y_train.reshape(-1, 1))
    Y_test = ss_Y.transform(Y_test.reshape(-1, 1))

    # Import and train a linear regression model
    lr = LinearRegression()
    lr.fit(X_train, Y_train)
    lr_Y_predict = lr.predict(X_test)

    # Import and train an SGD regression model
    sgdr = SGDRegressor()
    sgdr.fit(X_train, Y_train.ravel())
    sgdr_Y_predict = sgdr.predict(X_test)

    # Compare model performance; note that the built-in score is equivalent to r2_score
    print('-----------------------------------------------------------------------')
    print('The value of default measurement of LinearRegression is', lr.score(X_test, Y_test))
    print('The value of R-squared of LinearRegression is', r2_score(Y_test, lr_Y_predict))
    print('The mean squared error of LinearRegression is', mean_squared_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(lr_Y_predict)))
    print('The mean absolute error of LinearRegression is', mean_absolute_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(lr_Y_predict)))
    print('-----------------------------------------------------------------------')
    print('The value of default measurement of SGDRegressor is', sgdr.score(X_test, Y_test))
    print('The value of R-squared of SGDRegressor is', r2_score(Y_test, sgdr_Y_predict))
    print('The mean squared error of SGDRegressor is', mean_squared_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(sgdr_Y_predict)))
    print('The mean absolute error of SGDRegressor is', mean_absolute_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(sgdr_Y_predict)))

    # SVM regression
    # Linear-kernel SVR
    linear_svr = SVR(kernel='linear')
    linear_svr.fit(X_train, Y_train.ravel())
    linear_svr_y_predict = linear_svr.predict(X_test)

    # Polynomial-kernel SVR
    poly_svr = SVR(kernel='poly')
    poly_svr.fit(X_train, Y_train.ravel())
    poly_svr_y_predict = poly_svr.predict(X_test)

    # RBF-kernel SVR
    rbf_svr = SVR(kernel="rbf")
    rbf_svr.fit(X_train, Y_train.ravel())
    rbf_svr_y_predict = rbf_svr.predict(X_test)

    # Evaluate the three SVR kernels
    print('-----------------------------------------------------------------------')
    print('R-square value of linear SVR is:', linear_svr.score(X_test, Y_test))
    print('The MSE of linear SVR is:', mean_squared_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(linear_svr_y_predict)))
    print('The MAE of linear SVR is:', mean_absolute_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(linear_svr_y_predict)))
    print('-----------------------------------------------------------------------')
    print('R-square value of poly SVR is:', poly_svr.score(X_test, Y_test))
    print('The MSE of poly SVR is:', mean_squared_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(poly_svr_y_predict)))
    print('The MAE of poly SVR is:', mean_absolute_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(poly_svr_y_predict)))
    print('-----------------------------------------------------------------------')
    print('R-square value of rbf SVR is:', rbf_svr.score(X_test, Y_test))
    print('The MSE of rbf SVR is:', mean_squared_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(rbf_svr_y_predict)))
    print('The MAE of rbf SVR is:', mean_absolute_error(
        ss_Y.inverse_transform(Y_test),
        ss_Y.inverse_transform(rbf_svr_y_predict)))

    # Two K-nearest-neighbor models
    # Prediction mode: uniform averaging
    uni_knr = KNeighborsRegressor(weights='uniform')
    uni_knr.fit(X_train, Y_train.ravel())
    uni_knr_y_predict = uni_knr.predict(X_test)

    # Prediction mode: distance weighting
    dis_knr = KNeighborsRegressor(weights='distance')
    dis_knr.fit(X_train, Y_train.ravel())
    dis_knr_y_predict = dis_knr.predict(X_test)

    # Evaluate the two KNN models
    print('-----------------------------------------------------------------------')
    print('R-square value of uniform-weighted KNR is:', uni_knr.score(X_test, Y_test))
    print('The MSE of uniform-weighted KNR is:', mean_squared_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(uni_knr_y_predict)))
    print('The MAE of uniform-weighted KNR is:', mean_absolute_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(uni_knr_y_predict)))
    print('-----------------------------------------------------------------------')
    print('R-square value of distance-weighted KNR is:', dis_knr.score(X_test, Y_test))
    print('The MSE of distance-weighted KNR is:', mean_squared_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(dis_knr_y_predict)))
    print('The MAE of distance-weighted KNR is:', mean_absolute_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(dis_knr_y_predict)))

    # Regression tree model
    dtr = DecisionTreeRegressor()
    dtr.fit(X_train, Y_train.ravel())
    dtr_y_predict = dtr.predict(X_test)

    # Evaluate the regression tree
    print('-----------------------------------------------------------------------')
    print('R-square value of DecisionTreeRegressor is:', dtr.score(X_test, Y_test))
    print('The MSE of DecisionTreeRegressor is:', mean_squared_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(dtr_y_predict)))
    print('The MAE of DecisionTreeRegressor is:', mean_absolute_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(dtr_y_predict)))

    # Train three ensemble models
    rfr = RandomForestRegressor()
    rfr.fit(X_train, Y_train.ravel())
    rfr_y_predict = rfr.predict(X_test)

    etr = ExtraTreesRegressor()
    etr.fit(X_train, Y_train.ravel())
    etr_y_predict = etr.predict(X_test)

    gbr = GradientBoostingRegressor()
    gbr.fit(X_train, Y_train.ravel())
    gbr_y_predict = gbr.predict(X_test)

    # Evaluate the three ensemble models
    print('-----------------------------------------------------------------------')
    print('R-square value of RandomForestRegressor is:', rfr.score(X_test, Y_test))
    print('The MSE of RandomForestRegressor is:', mean_squared_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(rfr_y_predict)))
    print('The MAE of RandomForestRegressor is:', mean_absolute_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(rfr_y_predict)))
    print('-----------------------------------------------------------------------')
    print('R-square value of ExtraTreesRegressor is:', etr.score(X_test, Y_test))
    print('The MSE of ExtraTreesRegressor is:', mean_squared_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(etr_y_predict)))
    print('The MAE of ExtraTreesRegressor is:', mean_absolute_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(etr_y_predict)))
    print(np.sort(list(zip(etr.feature_importances_, boston.feature_names)), axis=0))
    print('-----------------------------------------------------------------------')
    print('R-square value of GradientBoostingRegressor is:', gbr.score(X_test, Y_test))
    print('The MSE of GradientBoostingRegressor is:', mean_squared_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(gbr_y_predict)))
    print('The MAE of GradientBoostingRegressor is:', mean_absolute_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(gbr_y_predict)))
train = pd.read_hdf('train.h5')
train = train[col]
d_mean = train.median(axis=0)
train = o.train[col]
n = train.isnull().sum(axis=1)

for c in train.columns:
    train[c + '_nan_'] = pd.isnull(train[c])
    d_mean[c + '_nan_'] = 0

train = train.fillna(d_mean)
train['znull'] = n
n = []

print("start train for Trees")
rfr = ExtraTreesRegressor(n_estimators=100, max_depth=4, n_jobs=-1, random_state=17, verbose=0)
model1 = rfr.fit(train, o.train['y'])
# clf_gbdt = GradientBoostingRegressor(n_estimators=15, learning_rate=0.5, max_depth=6, random_state=0)
# model1 = clf_gbdt.fit(train, o.train['y'])
# model1 = rfr.fit(train, o.train['y'])
# model_xgb = runXGB(train, o.train['y'])
print("finished train")

# https://www.kaggle.com/bguberfain/two-sigma-financial-modeling/univariate-model-with-clip/run/482189
low_y_cut = -0.06
high_y_cut = 0.06
y_is_above_cut = (o.train.y > high_y_cut)
y_is_below_cut = (o.train.y < low_y_cut)
y_is_within_cut = (~y_is_above_cut & ~y_is_below_cut)
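# In the linked kernel, cut masks like these are used to drop extreme targets
# before fitting; a sketch of that step (an assumption about how this script
# continues, not original code):
train_within = train[y_is_within_cut.values]
y_within = o.train.loc[y_is_within_cut, 'y']
model1 = rfr.fit(train_within, y_within)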