def train_h2o(X: pd.DataFrame, y: pd.Series, config: Config):
    """Train an H2O AutoML leader model and persist it to disk.

    Builds a single H2OFrame from X plus the target, runs AutoML for ~90% of
    the remaining time budget, and stores the saved-model path under
    config['params']['pipeline'][config["stage"]]["model"].

    Side effects: starts/attaches an H2O cluster; temporarily appends a
    "target" column to the caller's X (removed again before returning).
    """
    h2o.init()
    # The target is appended to X only to build one combined H2OFrame.
    # try/finally guarantees the caller's DataFrame is restored even if
    # H2OFrame construction or training raises (the original only dropped
    # the column on the success path).
    X["target"] = y
    try:
        train = h2o.H2OFrame(X)
        train_x = train.columns
        train_y = "target"
        train_x.remove(train_y)
        if config["mode"] == "classification":
            # H2O needs a factor (categorical) target for classification.
            train[train_y] = train[train_y].asfactor()
        aml = H2OAutoML(
            # Leave a 10% safety margin on the remaining time budget.
            max_runtime_secs=int(config.time_left() * 0.9),
            max_models=20,
            nfolds=3,
            exclude_algos=["GBM", "DeepLearning", "DRF"],
            seed=42,
        )
        aml.train(x=train_x, y=train_y, training_frame=train)
        config['params']['pipeline'][config["stage"]]["model"] = h2o.save_model(
            model=aml.leader, path=config.model_dir + "/h2o.model", force=True)
        if config.verbose:
            print(aml.leaderboard)
    finally:
        # Undo the in-place mutation of the caller's frame.
        X.drop("target", axis=1, inplace=True)
def train_lightgbm(X: pd.DataFrame, y: pd.Series, config: Config):
    """Train a LightGBM model with tuned hyperparameters.

    Pipeline: hyperopt on a 20k-row sample, an early-stopped fit on a
    train/validation split (saved via config.save()), then — time permitting —
    a refit on the full data at 1.2x the early-stopped round count.

    Stores the trained booster in config["model"].
    """
    params = {
        "objective": "regression" if config.is_regression() else "binary",
        "metric": "rmse" if config.is_regression() else "auc",
        "verbosity": -1,
        "seed": 1,
    }

    # Tune on a subsample so hyperopt stays cheap.
    X_sample, y_sample = data_sample(X, y, config, nrows=20000)
    hyperparams = hyperopt_lightgbm(X_sample, y_sample, params, config)

    X_train, X_val, y_train, y_val = data_split(X, y, config)
    config["model"] = lgb.train(
        {**params, **hyperparams},
        lgb.Dataset(X_train, label=y_train),
        5000,
        lgb.Dataset(X_val, label=y_val),
        early_stopping_rounds=100,
        verbose_eval=100,
    )
    # Persist the validation-split model before attempting the full refit,
    # so a timeout below still leaves a usable model.
    config.save()

    # best_iteration is 0 when early stopping never triggered (older LightGBM
    # versions — TODO confirm for the pinned version); int(1.2 * 0) would
    # refit with zero rounds and clobber the good model, so fall back to the
    # full 5000-round budget in that case.
    best_iter = config["model"].best_iteration
    num_rounds = int(1.2 * best_iter) if best_iter else 5000

    # Guard against a zero/negative remaining budget before entering
    # time_limit (the original could call it with a non-positive duration).
    budget = config.time_left() - 10
    if budget <= 0:
        Log.print("Timed out!")
        return
    try:
        with time_limit(budget):
            config["model"] = lgb.train(
                {**params, **hyperparams},
                lgb.Dataset(X, label=y),
                num_rounds,
            )
    except TimeoutException:
        Log.print("Timed out!")
def train_lightgbm(X: pd.DataFrame, y: pd.Series, config: Config):
    """Bagged k-fold LightGBM training.

    Tunes hyperparameters on a subsample, then runs 5-fold CV (stratified for
    classification, plain KFold otherwise), collecting one booster per fold in
    config["model"] (a list) and writing the per-fold validation scores plus
    their mean to a csv under config['path_pred'].

    NOTE(review): this redefines train_lightgbm and shadows the earlier
    definition of the same name in this file.
    """
    params = {
        "objective": "regression" if config["mode"] == "regression" else "binary",
        "metric": "rmse" if config["mode"] == "regression" else "auc",
        "verbosity": -1,
        "seed": 1,
    }

    # Hyperparameter search on a subsample keeps tuning cheap.
    X_sample, y_sample = data_sample(X, y)
    hyperparams = hyperopt_lightgbm(X_sample, y_sample, params, config)

    # Bagging loop — currently a single round; each round reseeds LightGBM.
    for i in range(1):
        print('################################################################## cv ' + str(i))
        t1_bagging = time.time()
        params['seed'] = i + 1

        # cv
        nfold = 5
        if config["mode"] == 'classification':
            splitter = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=777)
        else:
            splitter = KFold(n_splits=nfold, shuffle=True, random_state=777)

        log('####################################################################### begin cv')
        log('####### cur time = ' + str(datetime.datetime.now().strftime("%Y/%m/%d %H:%M:%S")))

        fold_scores = []
        config["model"] = []
        for fid, (train_idx, valid_idx) in enumerate(splitter.split(X, y)):
            t1_cv = time.time()
            print("FoldID:{}".format(fid))
            X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
            X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

            dtrain = lgb.Dataset(X_train, label=y_train)
            dvalid = lgb.Dataset(X_valid, label=y_valid, reference=dtrain)
            fold_model = lgb.train(
                {**params, **hyperparams},
                dtrain,
                3000,
                dvalid,
                early_stopping_rounds=50,
                verbose_eval=100,
            )
            config["model"].append(fold_model)
            fold_scores.append(fold_model.best_score)
            # gc.collect()
            sys.stdout.flush()

            t2_cv = time.time()
            time_left = config.time_left()
            print('######### cv' + str(time_left))
            # NOTE(review): dead check — the early break was deliberately
            # disabled; CV always runs all folds.
            if (t2_cv - t1_cv) * (nfold - fid + 1) >= time_left:
                pass  # break

        log('######################################################################### end cv')
        log('####### cur time = ' + str(datetime.datetime.now().strftime("%Y/%m/%d %H:%M:%S")))

        # Despite the name this holds whichever metric params['metric'] is
        # (rmse for regression, auc for classification).
        valid_auc = np.array([s['valid_0'][params['metric']] for s in fold_scores])
        print('valid', valid_auc, np.mean(valid_auc))

        # Per-fold scores with the mean appended as a final row.
        cv_score = pd.DataFrame({'cv': np.hstack([valid_auc, np.mean(valid_auc)])})
        path = config['path_pred']
        print(path)
        cv_score.to_csv(path + '/cv_score_' + str(i) + '.csv', index=False)

        t2_bagging = time.time()
        time_left = config.time_left()
        print('#########bagging' + str(time_left))
        # NOTE(review): likewise disabled — bagging never stops early.
        if (t2_bagging - t1_bagging) * 1.5 >= time_left:
            # break
            pass