import logging
import pickle
from configparser import ConfigParser

import scipy.stats as st
import xgboost as xgb
from sklearn.linear_model import Lasso
from sklearn.model_selection import (KFold, RandomizedSearchCV,
                                     cross_val_predict)


def fit_lasso(X_train, y_train, X_test, y_test, nfolds=10, n_jobs=7):
    """
    Fit a Lasso regression model with randomized hyperparameter search.

    Parameters
    ----------
    X_train : pd.DataFrame
        feature matrix for training
    y_train : pd.DataFrame/Series
        objective values for training
    X_test : pd.DataFrame
        feature matrix for testing/evaluating
    y_test : pd.DataFrame/Series
        objective values for testing/evaluating (currently unused)
    nfolds : int
        number of cross-validation folds
    n_jobs : int
        number of parallel jobs for the hyperparameter search

    Returns
    -------
    list
        predictions for the train set
    list
        cross-validation predictions for the train set
    list
        test predictions
    """
    model = Lasso()

    # Hyperparameter space to sample from; note that 'normalize' was
    # deprecated and later removed from Lasso (scikit-learn 1.2), so drop
    # it when running on newer versions
    params = {
        'alpha': [0.005, 0.01, 0.1, 1.0, 5.0, 10.0, 100.0, 500.0, 750.0, 1000.0],
        'copy_X': [True],
        'fit_intercept': [True, False],
        'normalize': [True, False],
        'precompute': [False]
    }

    cv = KFold(n_splits=nfolds, shuffle=True, random_state=42)

    n_iter_search = 40
    random_search = RandomizedSearchCV(model,
                                       param_distributions=params,
                                       n_iter=n_iter_search,
                                       verbose=10,
                                       scoring="neg_mean_absolute_error",
                                       n_jobs=n_jobs,
                                       cv=cv)
    random_search = random_search.fit(X_train, y_train)

    # Predict with the refitted best estimator
    lasso_model = random_search.best_estimator_
    test_preds = lasso_model.predict(X_test)
    train_preds = lasso_model.predict(X_train)

    # Retrain with the best hyperparameters to obtain out-of-fold predictions
    model = Lasso(**random_search.best_params_)
    train_cross_preds = cross_val_predict(model, X_train, y_train, cv=cv)

    random_search.feats = X_train.columns

    return train_preds, train_cross_preds, test_preds
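# A minimal usage sketch for fit_lasso, assuming pandas DataFrames for the
# features (fit_lasso reads X_train.columns); the synthetic data and column
# names below are illustrative only, not part of the original code:
#
#     import pandas as pd
#     from sklearn.datasets import make_regression
#     from sklearn.model_selection import train_test_split
#
#     X, y = make_regression(n_samples=200, n_features=10, random_state=42)
#     X = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])])
#     X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=42)
#     train_preds, cv_preds, test_preds = fit_lasso(X_tr, y_tr, X_te, y_te,
#                                                   nfolds=5, n_jobs=1)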
def train_xgb(X, y, mod_number=1, cv=None, outfile="model.pickle",
              n_iter_search=100, nfolds=20, random_state=42):
    """
    Train an XGBoost classifier with hyperparameter optimization.

    Parameters
    ----------
    X : matrix
        Matrix with all the features, every instance should be coupled to
        the y-value
    y : vector
        Vector with the class, every value should be coupled to an x-vector
        with features
    mod_number : int
        model identifier (currently unused)
    cv : object
        cross-validation splitter; defaults to a shuffled KFold
    outfile : str
        location to pickle the fitted RandomizedSearchCV object to
    n_iter_search : int
        number of hyperparameter combinations to sample
    nfolds : int
        number of cross-validation folds
    random_state : int
        seed for the cross-validation splitter

    Returns
    -------
    float
        best cross-validated score of the hyperparameter search; the fitted
        RandomizedSearchCV object itself is pickled to `outfile`
    """
    xgb_handle = xgb.XGBClassifier()

    one_to_left = st.beta(10, 1)
    from_zero_positive = st.expon(0, 50)

    # Define distributions to sample from for hyperparameter optimization
    param_dist = {
        "n_estimators": st.randint(25, 150),
        "max_depth": st.randint(5, 10),
        "learning_rate": st.uniform(0.05, 0.4),
        #"colsample_bytree": one_to_left,
        "subsample": one_to_left,
        "gamma": st.uniform(0, 10),
        "reg_alpha": from_zero_positive,
        "min_child_weight": from_zero_positive,
    }

    if not cv:
        cv = KFold(n_splits=nfolds, shuffle=True, random_state=random_state)

    random_search = RandomizedSearchCV(xgb_handle,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search,
                                       verbose=10,
                                       scoring="roc_auc",
                                       n_jobs=1,
                                       refit=True,
                                       cv=cv)
    random_search.fit(X, y)
    random_search.feats = X.columns

    with open(outfile, "wb") as f:
        pickle.dump(random_search, f)

    return random_search.best_score_
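# A usage sketch for train_xgb, assuming binary class labels; the file name
# and variables are illustrative. The fitted search can be reloaded from the
# pickle for downstream predictions:
#
#     best_auc = train_xgb(X, y, outfile="xgb_search.pickle",
#                          n_iter_search=20, nfolds=5)
#     with open("xgb_search.pickle", "rb") as f:
#         search = pickle.load(f)
#     probas = search.best_estimator_.predict_proba(X)[:, 1]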
def fit_xgb(X_train, y_train, X_test, y_test, config_file="config.ini"):
    """
    Fit an XGBoost regressor with randomized hyperparameter search, sampling
    the hyperparameter space defined in a configuration file.

    Parameters
    ----------
    X_train : pd.DataFrame
        feature matrix
    y_train : pd.DataFrame/Series
        objective values for training
    X_test : pd.DataFrame
        feature matrix for testing/evaluating
    y_test : pd.DataFrame/Series
        objective values for testing/evaluating
    config_file : str
        location of the configuration file that contains the hyperparameter
        spaces

    Returns
    -------
    list
        predictions for the train set
    list
        cross-validation predictions (hyperparameters still determined on the
        training set; not the model parameters)
    list
        test predictions
    sklearn.model_selection.RandomizedSearchCV
        object containing the model and training settings
    """
    cparser = ConfigParser()
    cparser.read(config_file)

    # Get the hyperparameter space to sample from; the space-valued entries
    # are evaluated as Python expressions, so only read trusted config files
    n_estimators = eval(cparser.get("fitXGB", "n_estimators"))
    max_depth = eval(cparser.get("fitXGB", "max_depth"))
    learning_rate = eval(cparser.get("fitXGB", "learning_rate"))
    gamma = eval(cparser.get("fitXGB", "gamma"))
    reg_alpha = eval(cparser.get("fitXGB", "reg_alpha"))
    reg_lambda = eval(cparser.get("fitXGB", "reg_lambda"))
    random_state = cparser.getint("fitXGB", "random_state")
    nfolds = cparser.getint("fitXGB", "nfolds")
    n_iter_search = cparser.getint("fitXGB", "n_iter_search")
    verbose = cparser.getint("fitXGB", "verbose")
    n_jobs = cparser.getint("fitXGB", "n_jobs")
    eval_metric = cparser.get("fitXGB", "eval_metric").strip('"')

    model = xgb.XGBRegressor()

    params = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'learning_rate': learning_rate,
        'gamma': gamma,
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
        'n_jobs': [n_jobs]
    }

    cv = KFold(n_splits=nfolds, shuffle=True, random_state=random_state)

    random_search = RandomizedSearchCV(model,
                                       param_distributions=params,
                                       n_iter=n_iter_search,
                                       verbose=verbose,
                                       scoring=eval_metric,
                                       cv=cv,
                                       random_state=random_state)
    random_search = random_search.fit(X_train, y_train)

    xgb_model = random_search.best_estimator_
    train_preds = xgb_model.predict(X_train)

    # Train using the best hyperparameters and make cross-validation preds
    model = xgb.XGBRegressor(**random_search.best_params_)
    if verbose > 0:
        logging.debug("Predicting tR with CV now...")
    train_cross_preds = cross_val_predict(model, X_train, y_train, cv=cv)

    random_search.feats = X_train.columns

    test_preds = xgb_model.predict(X_test)

    if verbose > 0:
        logging.debug("=====")
        logging.debug(random_search.best_params_)
        logging.debug(random_search.best_score_)
        logging.debug("=====")

    return train_preds, train_cross_preds, test_preds, random_search
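# A sketch of the [fitXGB] section that fit_xgb expects in config.ini. The
# key names match the lookups above, but these particular values and ranges
# are assumptions, not the project's shipped defaults. The space-valued
# entries are evaluated as Python expressions, and eval_metric keeps its
# quotes in the file because the code strips them with .strip('"'):
#
#     [fitXGB]
#     n_estimators = [100, 250, 500]
#     max_depth = [5, 7, 9]
#     learning_rate = [0.05, 0.1, 0.2]
#     gamma = [0, 1, 5]
#     reg_alpha = [0.0, 0.1, 1.0]
#     reg_lambda = [0.0, 0.1, 1.0]
#     random_state = 42
#     nfolds = 10
#     n_iter_search = 40
#     verbose = 1
#     n_jobs = 4
#     eval_metric = "neg_mean_absolute_error"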