class EXTRATREE():
    """Thin wrapper around an injected extra-trees regressor class.

    The regressor class is supplied by the caller instead of being imported
    here.  It is instantiated once in ``__init__`` with a fixed set of
    hyper-parameters and then driven through ``fit`` and ``predict``.
    """

    # Maps the public ``error_type`` names onto the criterion strings handed
    # to the model's ``set_params``.
    _CRITERION_BY_ERROR_TYPE = {"MSE": "mse", "MAE": "mae"}

    def __init__(self, ExtraTreesRegressor, N):
        """Create the underlying model.

        Args:
            ExtraTreesRegressor: Class (or factory) called with keyword
                arguments only to build the model.
            N: Divisor applied to the machine's CPU count; the rounded-up
                quotient becomes the model's ``n_jobs``.
        """
        cpu_total = multiprocessing.cpu_count()
        self.cores_number = int(np.ceil(cpu_total / N))

        # NOTE(review): several of these values ('mse', 'auto',
        # min_impurity_split) look tied to an older scikit-learn API -
        # confirm against the installed version before changing them.
        model_kwargs = dict(
            bootstrap=False,
            criterion='mse',
            max_depth=None,
            max_features='auto',
            max_leaf_nodes=None,
            min_impurity_decrease=0.0,
            min_impurity_split=None,
            min_samples_leaf=1,
            min_samples_split=2,
            min_weight_fraction_leaf=0.0,
            n_estimators=200,
            n_jobs=self.cores_number,
            oob_score=False,
            random_state=None,
            verbose=True,
            warm_start=False,
        )
        self.model = ExtraTreesRegressor(**model_kwargs)
        print("ExtraTreesRegressor Cores: ", self.cores_number)

    def fit(self, X_train, y_train, X_test, y_test, error_type="MAE"):
        """Set the split criterion from ``error_type`` and fit the model.

        Args:
            X_train: Training features, forwarded to the model's ``fit``.
            y_train: Training targets, forwarded to the model's ``fit``.
            X_test: Accepted but not used by this method.
            y_test: Accepted but not used by this method.
            error_type: ``"MSE"`` or ``"MAE"``.

        Raises:
            KeyError: If ``error_type`` is not one of the two known names.
        """
        criterion = self._CRITERION_BY_ERROR_TYPE[error_type]
        self.model.set_params(criterion=criterion)
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        """Return the model's predictions for ``X_test``."""
        return self.model.predict(X_test)
def __init__(self, nb_classes, bags=1, params=None):
    """Create ``bags`` extra-trees regressors that share ``params``.

    Args:
        nb_classes: Stored unchanged on ``self.nb_classes``.
        bags: Number of models to build.
        params: Mapping of keyword parameters applied to every model through
            ``set_params``.  ``None`` (the default) means no overrides.  When
            it holds a non-``None`` ``'random_state'``, model number *i* is
            given ``random_state + i``; otherwise the seed is left exactly as
            ``params`` (or the estimator default) configured it.
    """
    # params
    # A literal ``{}`` default would be a single dict shared by every
    # instance, so the empty mapping is created per call instead.
    self.params = {} if params is None else params

    # common
    self.nb_classes = nb_classes
    self.bags = bags
    self.train_y = None

    # Looking the seed up with .get() keeps construction working when the
    # caller did not provide one (previously a KeyError).
    base_seed = self.params.get('random_state')

    models = []
    for bag in range(self.bags):
        model = ExtraTreesRegressor()
        model.set_params(**self.params)
        if base_seed is not None:
            # Offset the seed so the bagged models are not identical.
            model.set_params(random_state=base_seed + bag)
        models.append(model)
    self.bags_models = tuple(models)
def Extra_trees_regression(X_train, y_train, X_test, params):
    """Fit an extra-trees regressor and predict ``X_test``.

    The tuning strategy is selected by the name ``hyperparameters``, which is
    read from the enclosing scope (it is not a parameter of this function):

    * ``'RandomGridSearch'`` - randomized search, 5 candidates, 3 folds.
    * ``'GridSearch'`` - exhaustive search over the same grid, 3 folds.
    * ``'Custom'`` - use ``params`` as given and score the estimator with a
      shuffled 3-fold cross-validation before fitting on all training data.

    Args:
        X_train: Training features.
        y_train: Training targets.
        X_test: Features to predict.
        params: Keyword parameters for the estimator; only used in
            ``'Custom'`` mode.

    Returns:
        Tuple ``(predicted, validation_score)``.  For the two search modes the
        score is the search object's ``best_score_``; for ``'Custom'`` it is
        whatever ``cross_val_score`` returns.  Scoring is always
        ``'neg_mean_absolute_error'``.

    Raises:
        ValueError: If ``hyperparameters`` is none of the three modes above
            (previously this surfaced as an ``UnboundLocalError``).
    """
    scoring = 'neg_mean_absolute_error'

    if hyperparameters in ('RandomGridSearch', 'GridSearch'):
        # Both search modes explore the same grid with 3-fold cross-validation.
        param_grid = {
            'max_depth': [5, 10, 15, 20, 25],
            'min_samples_split': [2, 5, 10],
            'max_leaf_nodes': [10, 50, 100],
        }
        # Model to be tuned.
        estimator = ExtraTreesRegressor(n_estimators=200, n_jobs=8)

        # The former ``iid='deprecated'`` argument is intentionally omitted:
        # it only repeated the library's own deprecation placeholder, and
        # scikit-learn releases that removed ``iid`` raise TypeError on it.
        if hyperparameters == 'RandomGridSearch':
            optimizer = RandomizedSearchCV(
                estimator, param_grid, n_iter=5, cv=3, scoring=scoring)
            # NOTE(review): unlike the other modes, y_train is passed
            # without np.ravel here - kept as in the original.
            optimizer.fit(X_train, y_train)
        else:
            optimizer = GridSearchCV(
                estimator, param_grid, cv=3, scoring=scoring)
            optimizer.fit(X_train, np.ravel(y_train))

        regression = optimizer.best_estimator_
        predicted = regression.predict(X_test)
        validation_score = optimizer.best_score_

    elif hyperparameters == 'Custom':
        estimator = ExtraTreesRegressor()
        # Apply the caller-supplied parameters.
        estimator.set_params(**params)

        # Cross-validated check of the chosen parameters.
        fold = KFold(n_splits=3, shuffle=True)
        validation_score = cross_val_score(
            estimator=estimator, X=X_train, y=y_train, cv=fold,
            scoring=scoring)

        # Train the final model on all of the training data.
        estimator.fit(X_train, np.ravel(y_train))
        predicted = estimator.predict(X_test)

    else:
        raise ValueError(
            "hyperparameters must be 'RandomGridSearch', 'GridSearch' or "
            "'Custom', got {!r}".format(hyperparameters))

    return (predicted, validation_score)
def extra_trees_regression(X_train, y_train, X_test, params, use_cv: bool = True):
    """Fit an extra-trees regressor and predict ``X_test``.

    With ``use_cv`` false the model is fitted directly (for data sets too
    small to cross-validate).  Otherwise the tuning strategy is selected by
    the name ``hyperparameters``, which is read from the enclosing scope (it
    is not a parameter of this function):

    * ``'RandomGridSearch'`` - randomized search, 5 candidates, 3 folds.
    * ``'GridSearch'`` - exhaustive search over the same grid, 3 folds.
    * ``'Custom'`` - use ``params`` as given and score the estimator with a
      shuffled 3-fold cross-validation before fitting on all training data.

    Args:
        X_train: Training features.
        y_train: Training targets.
        X_test: Features to predict.
        params: Keyword parameters for the estimator, or ``None`` for the
            estimator defaults.  Used when ``use_cv`` is false and in
            ``'Custom'`` mode.
        use_cv: Whether to cross-validate.  Any falsy value disables it.

    Returns:
        Tuple ``(predicted, validation_score)``.  Without cross-validation
        the score is the mean absolute error on the *training* data.  For the
        search modes it is the search object's ``best_score_`` and for
        ``'Custom'`` it is whatever ``cross_val_score`` returns, both under
        ``'neg_mean_absolute_error'`` scoring.

    Raises:
        ValueError: If cross-validation is requested and ``hyperparameters``
            is none of the three modes above (previously this surfaced as an
            ``UnboundLocalError``).
    """
    # Not enough points for cross-validation: fit once and score on train.
    # Truthiness (rather than ``is False``) also covers numpy booleans.
    if not use_cv:
        if params is None:
            model = ExtraTreesRegressor()
        else:
            model = ExtraTreesRegressor(**params)
        model.fit(X_train, y_train)
        predicted = model.predict(X_test)

        train_predicted = model.predict(X_train)
        validation_score = mean_absolute_error(
            np.ravel(y_train), np.ravel(train_predicted))
        return predicted, validation_score

    scoring = 'neg_mean_absolute_error'

    if hyperparameters in ('RandomGridSearch', 'GridSearch'):
        # Both search modes explore the same grid with 3-fold cross-validation.
        param_grid = {
            'max_depth': [5, 10, 15, 20, 25],
            'min_samples_split': [2, 5, 10],
            'max_leaf_nodes': [10, 50, 100],
        }
        # Model to be tuned.
        estimator = ExtraTreesRegressor(n_estimators=50, n_jobs=-1)

        # The former ``iid='deprecated'`` argument is intentionally omitted:
        # it only repeated the library's own deprecation placeholder, and
        # scikit-learn releases that removed ``iid`` raise TypeError on it.
        if hyperparameters == 'RandomGridSearch':
            optimizer = RandomizedSearchCV(
                estimator, param_grid, n_iter=5, cv=3, scoring=scoring)
            # NOTE(review): unlike the other modes, y_train is passed
            # without np.ravel here - kept as in the original.
            optimizer.fit(X_train, y_train)
        else:
            optimizer = GridSearchCV(
                estimator, param_grid, cv=3, scoring=scoring)
            optimizer.fit(X_train, np.ravel(y_train))

        regression = optimizer.best_estimator_
        predicted = regression.predict(X_test)
        validation_score = optimizer.best_score_

    elif hyperparameters == 'Custom':
        estimator = ExtraTreesRegressor()
        # ``params=None`` is accepted on the no-CV path above, so accept it
        # here too instead of failing on ``**None``.
        if params is not None:
            estimator.set_params(**params)

        # Cross-validated check of the chosen parameters.
        fold = KFold(n_splits=3, shuffle=True)
        validation_score = cross_val_score(
            estimator=estimator, X=X_train, y=y_train, cv=fold,
            scoring=scoring)

        # Train the final model on all of the training data.
        estimator.fit(X_train, np.ravel(y_train))
        predicted = estimator.predict(X_test)

    else:
        raise ValueError(
            "hyperparameters must be 'RandomGridSearch', 'GridSearch' or "
            "'Custom', got {!r}".format(hyperparameters))

    return predicted, validation_score
# Script: sweep the number of estimators of an ExtraTreesRegressor, keep the
# setting with the best score on the held-out split, refit, and save the model.
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
import pickle

# Load features/targets through the project helper and hold out 20% of them.
x, y = datahelper.get_xy('data/', num_hours=3, error_minutes=15)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

extra_trees = ExtraTreesRegressor(
    n_jobs=-1)  # n_jobs=-1 is used for parallel calculations without limit

# Candidate ensemble sizes: 10, 20, ..., 990.
estimators = np.arange(10, 1000, 10)

# scores maps the position of a candidate in `estimators` to its score on
# the held-out split.
scores = dict()
current_index = 0
for n in estimators:
    extra_trees.set_params(n_estimators=n)
    extra_trees.fit(x_train, y_train)
    scores[current_index] = extra_trees.score(x_test, y_test)
    current_index += 1

# Order candidate positions by score, best first, and take the winner.
sorted_by_scores = [(k, scores[k])
                    for k in sorted(scores, key=scores.get, reverse=True)]
index, score = sorted_by_scores[0]

# Refit with the winning number of estimators before saving.
extra_trees.set_params(n_estimators=estimators[index])
extra_trees.fit(x_train, y_train)

filename = '../models/extra_trees_model.sav'
print('Saving model with ', estimators[index], ' estimators to file ',
      filename)
# NOTE(review): the body of this with-statement is not visible in this chunk;
# presumably it pickles the fitted model into `h` - confirm.
with open(filename, 'wb') as h:
# Script: grid-search an ExtraTreesRegressor, save its predictions, round-trip
# the best parameters through a pickle file, and check that a fresh estimator
# built from the reloaded parameters reproduces the predictions.
import os
import shutil
from tempfile import mkdtemp

import joblib

import ch9util

X_train, X_test, y_train, y_test = ch9util.temp_split()

# Integer min_samples_split values below 2 are rejected by current
# scikit-learn releases, so the grid uses 2 instead of 1.  Every
# min_samples_leaf candidate is >= 3, so a node needs at least 6 samples to
# be split regardless of which of these min_samples_split values is used.
params = {
    'min_samples_split': [2, 3],
    'bootstrap': [True, False],
    'min_samples_leaf': [3, 4],
}
gscv = GridSearchCV(ExtraTreesRegressor(random_state=41),
                    param_grid=params, cv=5)
gscv.fit(X_train, y_train)
preds = gscv.predict(X_test)
ch9util.npy_save('etr.npy', preds)

# Round-trip the best parameters through a pickle in a scratch directory.
# The directory is removed afterwards; `tmp_dir` replaces the former name
# `dir`, which shadowed the builtin.
tmp_dir = mkdtemp()
try:
    pkl = os.path.join(tmp_dir, 'params.pkl')
    joblib.dump(gscv.best_params_, pkl)
    params = joblib.load(pkl)
finally:
    shutil.rmtree(tmp_dir, ignore_errors=True)

print('Best params', gscv.best_params_)
print('From pkl', params)

# Rebuild an estimator from the reloaded parameters and compare predictions.
est = ExtraTreesRegressor(random_state=41)
est.set_params(**params)
est.fit(X_train, y_train)
preds2 = est.predict(X_test)
# Absolute difference: a signed maximum would hide negative deviations.
print('Max diff', abs(preds - preds2).max())
# Script: grid-search an ExtraTreesRegressor, save its predictions, round-trip
# the best parameters through a pickle file, and check that a fresh estimator
# built from the reloaded parameters reproduces the predictions.
import os
import shutil
from tempfile import mkdtemp

import joblib
from sklearn.ensemble import ExtraTreesRegressor

# GridSearchCV lives in sklearn.model_selection in current scikit-learn; the
# old sklearn.grid_search module is kept only as a fallback for installations
# that still ship it.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

import ch9util

X_train, X_test, y_train, y_test = ch9util.temp_split()

# Integer min_samples_split values below 2 are rejected by current
# scikit-learn releases, so the grid uses 2 instead of 1.  Every
# min_samples_leaf candidate is >= 3, so a node needs at least 6 samples to
# be split regardless of which of these min_samples_split values is used.
params = {
    'min_samples_split': [2, 3],
    'bootstrap': [True, False],
    'min_samples_leaf': [3, 4],
}
gscv = GridSearchCV(ExtraTreesRegressor(random_state=41),
                    param_grid=params, cv=5)
gscv.fit(X_train, y_train)
preds = gscv.predict(X_test)
ch9util.npy_save('etr.npy', preds)

# Round-trip the best parameters through a pickle in a scratch directory.
# The directory is removed afterwards; `tmp_dir` replaces the former name
# `dir`, which shadowed the builtin.
tmp_dir = mkdtemp()
try:
    pkl = os.path.join(tmp_dir, 'params.pkl')
    joblib.dump(gscv.best_params_, pkl)
    params = joblib.load(pkl)
finally:
    shutil.rmtree(tmp_dir, ignore_errors=True)

print('Best params', gscv.best_params_)
print('From pkl', params)

# Rebuild an estimator from the reloaded parameters and compare predictions.
est = ExtraTreesRegressor(random_state=41)
est.set_params(**params)
est.fit(X_train, y_train)
preds2 = est.predict(X_test)
# Absolute difference: a signed maximum would hide negative deviations.
print('Max diff', abs(preds - preds2).max())