def _logreg_gridsearch_model(
        task,
        numeric_features,
        categoric_features,
        learning_rate,
        use_dask,
        n_iter,
        scoring,
):
    """Build a RandomizedSearchCV over a logistic-regression pipeline.

    When ``learning_rate`` is None a plain LogisticRegression is searched
    over its regularisation strength; otherwise an SGD-based logistic
    regression is used, with ``learning_rate`` interpreted as a constant
    eta0 (float) or as a named schedule (str).
    """
    if learning_rate is None:
        param_space = {
            'clf__C': np.logspace(-5, 5, 100),
            'clf__class_weight': ['balanced', None],
        }
        model = LogisticRegression(max_iter=10000, fit_intercept=False)
    else:
        # A float means "constant schedule with this eta0"; a string names
        # an sklearn schedule (eta0 then unused, passed as 0).
        if isinstance(learning_rate, float):
            schedule, eta0 = 'constant', learning_rate
        else:
            schedule, eta0 = learning_rate, 0
        param_space = {
            'clf__penalty': ['l1', 'l2'],
            'clf__alpha': np.logspace(-5, 5, 100),
            'clf__class_weight': ['balanced', None],
        }
        model = SGDClassifier(
            learning_rate=schedule,
            eta0=eta0,
            loss='log',
            max_iter=10000,
            fit_intercept=False,
        )

    preprocessing_step = simple_proc_for_linear_algoritms(
        numeric_features, categoric_features)
    pipe = Pipeline([
        ('preprocessing', preprocessing_step),
        ('clf', model),
    ])

    # dask_ml and sklearn expose API-compatible RandomizedSearchCV classes.
    if use_dask:
        from dask_ml.model_selection import RandomizedSearchCV
    else:
        from sklearn.model_selection import RandomizedSearchCV
    return RandomizedSearchCV(pipe,
                              param_space,
                              n_iter=n_iter,
                              scoring=scoring,
                              cv=5)
def main():
    """Randomized hyperparameter search for a RandomForestClassifier.

    Loads the dataset, searches a random-forest parameter grid with
    multi-metric scoring (refit on Matthews correlation coefficient), and
    pickles the search results to ``<RESULTS_DIR>/log_results/new_RF.pickle``.
    """
    print("Loading data...", end='\r')
    # ``iterator`` is passed to RandomizedSearchCV as ``cv``, so it is
    # presumably a CV splitter or an iterable of (train, test) index pairs —
    # TODO confirm against load_data_nozeros_bypoint.
    x, y, iterator = load_data_nozeros_bypoint()
    print("Loaded ")
    # Number of trees: 10 evenly spaced values in [50, 1000]. The
    # comprehension variable ``x`` shadows the feature matrix only inside the
    # comprehension scope (Python 3), so the data is unaffected.
    n_estimators = [int(x) for x in np.linspace(start=50, stop=1000, num=10)]
    # Number of features to consider at every split
    max_features = ['auto', 'sqrt']
    # Maximum number of levels in tree (None = grow until pure)
    max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
    max_depth.append(None)
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]
    # Create the random grid
    params = {
        'n_estimators': n_estimators,
        'max_features': max_features,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'bootstrap': bootstrap
    }
    # Multi-metric scoring; ``refit='mcc'`` below selects the best model by
    # Matthews correlation coefficient.
    scoring = {
        'accuracy': 'accuracy',
        'precision': 'precision',
        'recall': 'recall',
        'f1': 'f1',
        'mcc': sklearn.metrics.make_scorer(sklearn.metrics.matthews_corrcoef)
    }
    out_dir = gouda.ensure_dir(os.path.join(RESULTS_DIR, 'log_results'))
    print(out_dir)
    rf = RandomForestClassifier()
    # NOTE(review): ``iid`` and ``cache_cv`` are dask_ml.RandomizedSearchCV
    # arguments (``iid`` was removed from scikit-learn in 0.24), and
    # ProgressBar suggests dask diagnostics — confirm which RandomizedSearchCV
    # is imported at module level.
    with ProgressBar():
        grid_search = RandomizedSearchCV(rf,
                                         params,
                                         scoring=scoring,
                                         n_jobs=-1,
                                         cv=iterator,
                                         refit='mcc',
                                         iid=True,
                                         cache_cv=True)
        grid_search.fit(x, y)
    configs, means = get_configurations(grid_search.cv_results_)
    output_path = os.path.join(out_dir, 'new_RF.pickle')
    # The pickle holds alternating string labels and values; the leading
    # 'test' label is kept as-is so any existing readers keep working.
    dump([
        'test', 'grid_search.cv_results_', grid_search.cv_results_,
        'grid_search.best_params_', grid_search.best_params_,
        'grid_search.best_score_', grid_search.best_score_,
        'grid_search.best_estimator_', grid_search.best_estimator_
    ], output_path)
def _mlp_gridsearch_model(
        task,
        numeric_features,
        categoric_features,
        learning_rate,
        use_dask,
        n_iter,
        scoring,
):
    """Build a RandomizedSearchCV over an MLP pipeline.

    Chooses MLPClassifier or MLPRegressor depending on ``task`` and searches
    over architecture, activation, batch size, alpha and the learning-rate
    schedule; ``learning_rate`` seeds the initial learning rate.
    """
    param_space = {
        'clf__hidden_layer_sizes': [
            (24, ),
            (12, 12),
            (6, 6, 6, 6),
            (4, 4, 4, 4, 4, 4),
            (12, 6, 3, 3),
        ],
        'clf__activation': ['relu', 'logistic', 'tanh'],
        'clf__batch_size': [16, 32, 64, 128, 256, 512],
        'clf__alpha': uniform(0.0001, 0.9),
        'clf__learning_rate': ['constant', 'adaptive'],
    }

    if task == 'classification':
        model = MLPClassifier(learning_rate_init=learning_rate)
    else:
        model = MLPRegressor(learning_rate_init=learning_rate)

    preprocessing_step = simple_proc_for_linear_algoritms(
        numeric_features, categoric_features)
    pipe = Pipeline([
        ('preprocessing', preprocessing_step),
        ('clf', model),
    ])

    # dask_ml and sklearn expose API-compatible RandomizedSearchCV classes.
    if use_dask:
        from dask_ml.model_selection import RandomizedSearchCV
    else:
        from sklearn.model_selection import RandomizedSearchCV
    return RandomizedSearchCV(pipe,
                              param_space,
                              n_iter=n_iter,
                              scoring=scoring,
                              cv=5)
def _xgboost_gridsearch_model(
        task,
        numeric_features,
        categoric_features,
        learning_rate,
        use_dask,
        n_iter,
        scoring,
):
    """Build a RandomizedSearchCV over an XGBoost pipeline.

    Chooses XGBClassifier or XGBRegressor depending on ``task`` and searches
    the usual tree/regularisation hyperparameters; ``learning_rate`` is fixed
    on the estimator rather than searched.
    """
    param_space = {
        'clf__max_depth': randint(2, 11),
        'clf__min_child_weight': randint(1, 11),
        'clf__subsample': uniform(0.5, 0.5),
        'clf__colsample_bytree': uniform(0.5, 0.5),
        'clf__colsample_bylevel': uniform(0.5, 0.5),
        'clf__gamma': uniform(0, 1),
        'clf__reg_alpha': uniform(0, 1),
        'clf__reg_lambda': uniform(0, 10),
        'clf__base_score': uniform(0.1, 0.9),
        'clf__scale_pos_weight': uniform(0.1, 9.9),
    }

    if task == 'classification':
        model = xgbsk.XGBClassifier(learning_rate=learning_rate)
    else:
        model = xgbsk.XGBRegressor(learning_rate=learning_rate)

    preprocessing_step = simple_proc_for_tree_algoritms(
        numeric_features, categoric_features)
    pipe = Pipeline([
        ('preprocessing', preprocessing_step),
        ('clf', model),
    ])

    # dask_ml and sklearn expose API-compatible RandomizedSearchCV classes.
    if use_dask:
        from dask_ml.model_selection import RandomizedSearchCV
    else:
        from sklearn.model_selection import RandomizedSearchCV
    return RandomizedSearchCV(pipe,
                              param_space,
                              n_iter=n_iter,
                              scoring=scoring,
                              cv=5)
def model_selection(pipeline: Pipeline, X, y, n_iter: int, log) -> RandomizedSearchCV:
    """Performs model selection using randomized search with cross-validation

    Parameters
    ----------
    pipeline: Pipeline
        pipeline on which the search is to be performed
    X:
        dataframe containing the features on which model selection is performed
    y:
        dataframe containing the outcome on which model selection is performed
    n_iter: int
        number of search steps to be performed
    log:
        logger object

    Returns
    -------
    RandomizedSearchCV
        the fitted search object (exposes ``best_estimator_``,
        ``best_params_`` and ``cv_results_``); the original docstring's
        "Returns None" was inaccurate
    """
    # Fixed candidate lists (not distributions): the search samples
    # combinations of feature-generation and regressor settings.
    param_dists = {
        "feature_gen__cat_avg__category points_average__min_count": [15, 30, 50],
        "feature_gen__cat_not_winery__category_cutoff__min_count": [15, 30, 50],
        "feature_gen__cat_winery__category_cutoff__min_count": [5, 10],
        "feature_gen__designation__decomposition__n_components": [5, 20, 50],
        "feature_gen__designation__vectorizer__sublinear_tf": [True, False],
        "feature_gen__description__decomposition__n_components": [20, 50, 75],
        "feature_gen__description__vectorizer__sublinear_tf": [True, False],
        "feature_gen__title__decomposition__n_components": [5, 20, 50],
        "feature_gen__title__vectorizer__sublinear_tf": [True, False],
        "regressor__min_samples_leaf": [5, 10, 25, 50, 100],
        "regressor__max_features": ['sqrt', 'log2'],
        "regressor__n_estimators": [50, 100, 300]
    }
    log.info('Running model selection')
    log.info(f'n_iter = {n_iter}')
    # int(n_iter) guards against a string being passed despite the hint.
    searchcv = RandomizedSearchCV(estimator=pipeline,
                                  param_distributions=param_dists,
                                  n_iter=int(n_iter),
                                  cv=5,
                                  scoring='neg_mean_squared_error',
                                  return_train_score=False)
    searchcv.fit(X, y)
    log.info('Model selection done')
    return searchcv
def search(model, X, y, params, method="randomized", n_iter=30, cv=5,
           **kwargs):
    """Run a cross-validated search for hyperparameters.

    Parameters
    ----------
    model : estimator to tune
    X, y : training data
    params : dict of parameter candidates/distributions/spaces, keyed for the
        chosen search class
    method : one of "randomized", "grid", "bayes" (case-insensitive)
    n_iter : number of sampled candidates (ignored for "grid")
    cv : number of CV folds
    **kwargs : accepted for interface compatibility; currently unused

    Returns
    -------
    The fitted search object.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    method_key = method.lower()
    # Renamed local (was ``search``) so it no longer shadows this function.
    if method_key == "randomized":
        searcher = RandomizedSearchCV(model,
                                      param_distributions=params,
                                      n_iter=n_iter,
                                      cv=cv)
    elif method_key == "grid":
        searcher = GridSearchCV(model, param_grid=params, cv=cv)
    elif method_key == "bayes":
        searcher = BayesSearchCV(model,
                                 search_spaces=params,
                                 n_iter=n_iter,
                                 cv=cv)
    else:
        message = ("'method' must be either 'randomized', 'grid' or 'bayes'."
                   " Got method='{}'".format(method))
        LOGGER.error(message)
        raise ValueError(message)

    method_name = method.capitalize() + "SearchCV"
    LOGGER.info("Beginning " + method_name)
    when_started = time()
    progress(searcher.fit(X, y))
    total_time = time() - when_started
    n_settings = len(searcher.cv_results_['params'])
    # Logger.warn is deprecated; warning() is the supported spelling.
    LOGGER.warning(
        "{} took {:.2f} seconds for {} candidates parameter settings.".format(
            method_name, total_time, n_settings))
    return searcher
def train(self):
    """Split the data, then fit (or load) an RFECV-wrapped GBR.

    Side effects: sets ``self.X_test``/``self.Y_test``, ``self.model`` and
    ``self.hp`` (None when a saved model is loaded), optionally ``self.rscv``,
    and persists the fitted model to ``<name>.pkl``.
    """
    # random.seed returns None; call it for the side effect only (the old
    # ``seed = random.seed(42)`` binding was dead).
    random.seed(42)
    # Stratify the continuous target by quintile bins so the split preserves
    # the outcome distribution.
    bins = qcut(self.Y, 5, labels=False, duplicates='drop')
    X_train, self.X_test, Y_train, self.Y_test = train_test_split(
        self.X, self.Y, test_size=0.3, stratify=bins, random_state=42)
    estimator = GradientBoostingRegressor(random_state=42)
    selector = RFECV(estimator, cv=2, min_features_to_select=1)
    if self.load_save:
        self.logger.warning('The predefined parameters are going to be used')
        self.model = joblib.load(self.name + '.pkl')
        self.hp = None
    else:
        self.logger.warning('The RandomizedSearchCV() is going to be used')
        grid = {
            'estimator__n_estimators':
            [int(v) for v in linspace(10, 1000, num=101)],
            'estimator__max_depth':
            [int(v) for v in linspace(1, 100, num=101)],
            'estimator__min_samples_split':
            [int(v) for v in linspace(2, 50, num=49)],
            'estimator__min_samples_leaf':
            [int(v) for v in linspace(2, 50, num=49)]
        }
        # client = Client('192.168.200.1:8786')
        # NOTE(review): ``iid`` was removed from sklearn's RandomizedSearchCV
        # in 0.24 — this presumably targets an older release; confirm before
        # upgrading.
        self.rscv = RandomizedSearchCV(
            estimator=selector,
            param_distributions=grid,
            # <-- change the number of simulations here! 4000 is the original
            n_iter=50,
            scoring='r2',
            cv=2,
            iid=False,
            random_state=42,
            n_jobs=3)  # , scheduler=client)
        self.rscv.fit(X_train, Y_train)
        self.model = self.rscv.best_estimator_
        joblib.dump(self.model, self.name + '.pkl', compress=1)
        self.hp = self.rscv.best_params_
def hpo_dask(self,
             model,
             params,
             X,
             y,
             exp_name='exp_0',
             joblib=True,
             cv=2,
             n_iter=10,
             verbose=10,
             n_jobs=-1,
             random_state=0,
             report=True):
    """Hyperparameter-optimise ``model`` on a dask cluster.

    When ``joblib`` is True, sklearn's RandomizedSearchCV is run under the
    joblib dask backend; otherwise dask_ml's native RandomizedSearchCV is
    used. When ``report`` is True, cv_results_ (CSV), the best estimator
    (pickle) and best_params_ (JSON) are written under the report/model dirs.

    Returns the best fitted estimator.
    """
    self.__engine_init()
    if joblib:
        from sklearn.model_selection import RandomizedSearchCV
        # NOTE: this import rebinds the boolean ``joblib`` parameter to the
        # joblib module; the flag is not needed after this point.
        import joblib
        search = RandomizedSearchCV(model,
                                    params,
                                    cv=cv,
                                    n_iter=n_iter,
                                    verbose=verbose,
                                    n_jobs=n_jobs,
                                    random_state=random_state)
        print("Using dask backend")
        print("Started fitting")
        # BUG FIX: the fit must run INSIDE the parallel_backend context —
        # previously it ran after the ``with`` block, so the default joblib
        # backend was used and the dask cluster sat idle.
        with joblib.parallel_backend('dask'):
            search.fit(X, y)
    else:
        from dask_ml.model_selection import RandomizedSearchCV
        search = RandomizedSearchCV(model,
                                    params,
                                    cv=cv,
                                    n_iter=n_iter,
                                    n_jobs=-1,
                                    random_state=random_state)
        print("Started fitting")
        search.fit(X, y)

    best = search.best_estimator_
    print("Best score {}".format(search.best_score_))
    print("Best params {}".format(search.best_params_))

    if report:
        from joblib import dump
        import json
        self.__check_dirs()
        print("Saving report and best model")
        cv_report = pd.DataFrame(search.cv_results_)
        rep_name = "{}_cv_results.csv".format(exp_name)
        path_report = os.path.join(self.report_dir, rep_name)
        cv_report.to_csv(path_report)
        # BUG FIX: the filename was never formatted — the model was saved
        # literally as "{}_best.pkl".
        best_name = "{}_best.pkl".format(exp_name)
        path_model = os.path.join(self.models_dir, best_name)
        dump(best, path_model)
        # BUG FIX: the JSON filename was formatted with best_params_ (the
        # dict repr) instead of the experiment name.
        param_name = "{}_best_params.json".format(exp_name)
        path_best_params = os.path.join(self.report_dir, param_name)
        with open(path_best_params, 'w') as fp:
            json.dump(search.best_params_, fp)
    return best
'gamma': [0, 0.03, 0.1, 0.3],
'min_child_weight': [1.5, 6, 10],
'learning_rate': [0.1, 0.07],
'max_depth': [3, 5],
# A single large value — presumably early stopping or the learning rate is
# expected to control the effective number of trees; TODO confirm.
'n_estimators': [10000],
'reg_alpha': [1e-5, 1e-2, 0.75],
'reg_lambda': [1e-5, 1e-2, 0.45],
'subsample': [0.6, 0.95]
}
print("hyperparameters")
#gb_model = xgboost.XGBRegressor(learning_rate =0.1, n_estimators=1000, max_depth=5,
# min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, nthread=6, scale_pos_weight=1, seed=27)
# 10-fold shuffled CV for the randomized search below.
cv = KFold(10, shuffle=True)
# NOTE(review): ``iid`` was removed from scikit-learn's RandomizedSearchCV in
# 0.24, and ``grid_scores_`` (printed below) was replaced by ``cv_results_``
# even earlier — this presumably targets an old sklearn release; confirm.
rsearch1 = RandomizedSearchCV(estimator=bst,
                              param_distributions=parameters_for_testing,
                              n_jobs=-1,
                              iid=False,
                              n_iter=100,
                              cv=cv,
                              scoring='neg_mean_squared_error')
rsearch1.fit(X_train, y_train)
print('######################################################')
print(rsearch1.grid_scores_)
print('best params')
print(rsearch1.best_params_)
print('best score')
print(rsearch1.best_score_)
#bst.save_model('0001.model')
def train_by_subject(labels_df, features_df, cohort, device, instrument,
                     subject_id, label):
    """Hyperparameter-search a clipped random-forest regressor for one label.

    Merges features with labels on ``measurement_id``, builds a
    scale -> variance-select -> clipped-RF pipeline, and runs a 20-iteration
    randomized search scored by negative MSE under stratified CV.

    NOTE(review): ``device`` and ``instrument`` are accepted but never used
    in the body — confirm whether they should influence the search.

    Returns
    -------
    None when no rows carry the requested ``label``; otherwise a tuple
    ``(winner, cv_results_df, resultset_json)`` where ``winner`` is the
    pipeline configured with the rank-1 parameters (NOT refitted, since the
    search runs with refit=False).
    """
    # Columns that are labels/identifiers rather than model features.
    label_cols = ['on_off', 'dyskinesia', 'tremor', 'subject_id']
    id_cols = ['measurement_id', 'id']
    labels_df["subject_id"] = labels_df["subject_id"].astype(str)
    # NOTE(review): subj_means is computed but never used below — dead code?
    subj_means = labels_df.groupby('subject_id').mean()
    # Inner join drops feature rows with NaNs or without labels.
    df = features_df.dropna().merge(labels_df,
                                    right_on='measurement_id',
                                    left_on='measurement_id')
    print('%d rows dropped due to nans in features' %
          (features_df.shape[0] - df.shape[0]))
    # Model
    ## Model spec
    scaler = preprocessing.RobustScaler(quantile_range=(1, 99))
    scaler_pg = {
        'scaler__quantile_range': [(.1, 99.9), (.5, 99.5), (1, 99), (5, 95),
                                   (10, 90)],
    }
    # Keep features w/ variance in top x%ile
    var = lambda X, y: np.var(X, axis=0)
    f_select = feature_selection.SelectPercentile(var, percentile=95)
    f_select_pg = {'f_select__percentile': stats.uniform(0, 100)}
    model = ensemble.RandomForestRegressor()
    model_pg = {
        'model__regressor__n_estimators': stats.randint(50, 100),
        'model__regressor__max_depth': stats.randint(10, 25),
        'model__regressor__max_features': [.25, 'auto']
    }
    # Clip predictions into [0, 4] on the inverse transform — presumably the
    # valid severity-score range for these labels; confirm.
    clip_out = preprocessing.FunctionTransformer(np.clip,
                                                 kw_args={
                                                     'a_min': 0,
                                                     'a_max': 4
                                                 })
    clipped_model = compose.TransformedTargetRegressor(
        regressor=model, inverse_func=clip_out.transform)
    pipe = pipeline.Pipeline([
        ('scaler', scaler),
        ('f_select', f_select),
        ('model', clipped_model),
    ],
                             verbose=1)
    param_grid = {
        **scaler_pg,
        **f_select_pg,
        **model_pg,
    }
    # greater_is_better=False: scores are negated MSE, so higher rank = lower error.
    metric = metrics.make_scorer(metrics.mean_squared_error,
                                 greater_is_better=False)
    cv = model_selection.StratifiedKFold(shuffle=True)
    ## Model eval
    subj_df = df
    print(f'working on {label}')
    labeled_samps = subj_df.dropna(subset=[label])
    if not labeled_samps.shape[0]:
        # No rows carry this label for the subject — nothing to train on.
        print(f'skipping {label}')
        return None
    print(labeled_samps.columns.values.tolist())
    y = subj_df.loc[labeled_samps.index, label].astype('int')
    X = labeled_samps.drop(columns=[*label_cols, *id_cols])
    # refit=False: only cv_results_ are needed; the winning pipeline is
    # reconstructed (unfitted) via set_params below.
    search = RandomizedSearchCV(pipe,
                                param_grid,
                                n_iter=20,
                                scoring=metric,
                                cv=cv,
                                refit=False)
    cv_fit = search.fit(X, y)
    cv_results_df = pd.DataFrame(cv_fit.cv_results_)
    resultset_json = {
        'cohort': cohort,
        'subject_id': subject_id,
        'model_type': str(type(model).__name__),
        'label': label
    }
    win_params = cv_results_df.loc[cv_results_df.rank_test_score == 1,
                                   'params'].values[0]
    winner = pipe.set_params(**win_params)
    return winner, cv_results_df, resultset_json
class model(object):
    """Train, summarise and apply an RFECV-wrapped GradientBoostingRegressor.

    Parameters
    ----------
    name : str
        Base name for the persisted model (``<name>.pkl``) and predictions
        (``<name>.txt``).
    X, Y
        Training features and target (pandas DataFrame / Series).
    x : DataFrame, optional
        Features to predict on after training; skipped when not a DataFrame.
    logger : logging.Logger
        Destination for progress and summary messages.
    load_save : bool
        When True, load a previously saved model instead of searching.
    """

    def __init__(self, name, X, Y, x=None, logger=None, load_save=False):
        self.name = name
        self.X = X
        self.Y = Y
        self.x = x
        self.columns = self.X.columns
        self.logger = logger
        self.load_save = load_save
        # The whole train -> summary -> predict flow runs at construction.
        self.run()

    def summary(self):
        """Log hyperparameters, selected predictors and test-set metrics."""
        # self.hp is None when a pre-trained model was loaded from disk.
        if self.hp is not None:
            # Plain loop (was a list comprehension used only for side effects).
            for key in self.hp:
                self.logger.info('Hyperparameter "{kn}" = "{kv}"'.format(
                    kn=key, kv=self.hp[key]))
        self.logger.info(
            "Best predictors = %s" %
            ", ".join(self.columns[self.model.get_support(indices=True)]))
        self.logger.info(
            "Importances = %s" %
            ", ".join(map(str, self.model.estimator_.feature_importances_)))
        Y_pred = self.model.predict(self.X_test)
        # r2, RMSE, and percent bias on the held-out split.
        stats = (metrics.r2_score(self.Y_test, Y_pred),
                 metrics.mean_squared_error(self.Y_test, Y_pred)**0.5,
                 (sum(self.Y_test - Y_pred) / sum(self.Y_test)) * 100)
        self.logger.info("r2 = %s, RMSE = %s, PBIAS = %s" % stats)

    def train(self):
        """Split the data, then fit (or load) the RFECV-wrapped GBR."""
        # random.seed returns None; call it for the side effect only (the old
        # ``seed = random.seed(42)`` binding was dead).
        random.seed(42)
        # Stratify the continuous target by quintile bins.
        bins = qcut(self.Y, 5, labels=False, duplicates='drop')
        X_train, self.X_test, Y_train, self.Y_test = train_test_split(
            self.X, self.Y, test_size=0.3, stratify=bins, random_state=42)
        estimator = GradientBoostingRegressor(random_state=42)
        selector = RFECV(estimator, cv=2, min_features_to_select=1)
        if self.load_save:
            self.logger.warning(
                'The predefined parameters are going to be used')
            self.model = joblib.load(self.name + '.pkl')
            self.hp = None
        else:
            self.logger.warning(
                'The RandomizedSearchCV() is going to be used')
            grid = {
                'estimator__n_estimators':
                [int(v) for v in linspace(10, 1000, num=101)],
                'estimator__max_depth':
                [int(v) for v in linspace(1, 100, num=101)],
                'estimator__min_samples_split':
                [int(v) for v in linspace(2, 50, num=49)],
                'estimator__min_samples_leaf':
                [int(v) for v in linspace(2, 50, num=49)]
            }
            # client = Client('192.168.200.1:8786')
            # NOTE(review): ``iid`` was removed from sklearn >= 0.24 — this
            # presumably targets an older release; confirm before upgrading.
            self.rscv = RandomizedSearchCV(
                estimator=selector,
                param_distributions=grid,
                # <-- change the number of simulations here! 4000 is the original
                n_iter=50,
                scoring='r2',
                cv=2,
                iid=False,
                random_state=42,
                n_jobs=3)  # , scheduler=client)
            self.rscv.fit(X_train, Y_train)
            self.model = self.rscv.best_estimator_
            joblib.dump(self.model, self.name + '.pkl', compress=1)
            self.hp = self.rscv.best_params_

    def predict(self):
        """Predict on ``self.x`` (when given) and save to ``<name>.txt``."""
        if isinstance(self.x, DataFrame):
            Y_mod = self.model.predict(self.x)
            self.Y_mod = Y_mod.astype(float)
            savetxt(self.name + '.txt', self.Y_mod)

    def run(self):
        """Train, log a summary, then predict."""
        self.logger.info('Starting the %s processing' % self.name)
        self.train()
        self.summary()
        self.predict()