import os

import numpy as np

import misc  # project-local helpers: logging, CV fitting, grid search, XML persistence


def train_EXT(estimator, trainX, trainY, method, n_jobs=4, skip=False):
    # Extremely Randomized Trees
    logger = misc.init_logger(method)
    xmlPath = os.path.join(os.path.dirname(__file__), "params",
                           '%s.xml' % method[method.find('_') + 1:])
    if not skip:
        logger.info("Begin to train ExtraTrees...")
        misc.modelfit(estimator, trainX, trainY, method, n_jobs=n_jobs)

        # fine-tune n_estimators
        param_grid = {"n_estimators": np.arange(50, 601, 50)}
        best_params, best_score = misc.run_gridsearch(
            trainX, trainY, estimator, param_grid, sample_weight=False,
            cv=5, scoring='roc_auc', n_jobs=n_jobs, method=method)
        estimator.set_params(n_estimators=best_params['n_estimators'])

        # fine-tune max_depth and min_samples_split
        param_grid = {
            "max_depth": np.arange(5, 30, 2),
            "min_samples_split": np.arange(0.005, 0.031, 0.005)
        }
        best_params, best_score = misc.run_gridsearch(
            trainX, trainY, estimator, param_grid, sample_weight=False,
            cv=5, scoring='roc_auc', n_jobs=n_jobs, method=method)
        estimator.set_params(
            max_depth=best_params['max_depth'],
            min_samples_split=best_params['min_samples_split'])

        # fine-tune min_samples_leaf
        param_grid = {"min_samples_leaf": np.arange(5, 51, 5)}
        best_params, best_score = misc.run_gridsearch(
            trainX, trainY, estimator, param_grid, sample_weight=False,
            cv=5, scoring='roc_auc', n_jobs=n_jobs, method=method)
        estimator.set_params(min_samples_leaf=best_params['min_samples_leaf'])

        # fine-tune max_features
        feat_num = len(list(trainX.columns))
        param_grid = {
            "max_features": np.arange(int(np.sqrt(feat_num)),
                                      int(0.4 * feat_num), 2)
        }
        best_params, best_score = misc.run_gridsearch(
            trainX, trainY, estimator, param_grid, sample_weight=False,
            cv=5, scoring='roc_auc', n_jobs=n_jobs, method=method)
        if best_params['max_features'] == int(np.sqrt(feat_num)):
            # 'sqrt' replaces the deprecated 'auto' (identical for classifiers)
            estimator.set_params(max_features='sqrt')
        else:
            estimator.set_params(max_features=best_params['max_features'])

        # re-tune n_estimators on a wider grid
        param_grid = {"n_estimators": np.arange(40, 1001, 40)}
        best_params, best_score = misc.run_gridsearch(
            trainX, trainY, estimator, param_grid, sample_weight=False,
            cv=5, scoring='roc_auc', n_jobs=n_jobs, method=method)
        estimator.set_params(n_estimators=best_params['n_estimators'])

        misc.update_params_toXML(estimator, method, xmlPath)
        logger.info("After parameter tuning, get the CV score...")
        misc.modelfit(estimator, trainX, trainY, method, n_jobs=n_jobs)
    else:
        try:
            estimator = misc.load_params_fromXML(estimator, method, xmlPath)
        except Exception:
            return estimator
    logger.info("After parameter tuning, the current parameters are\n %s"
                % str(estimator.get_params()))
    return estimator
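
# ---------------------------------------------------------------------------
# Reference sketches (assumption): `misc` is a project-local module whose
# source is not shown here. Inferred purely from the call sites above, its
# grid-search and CV-fit helpers could look roughly like the thin sklearn
# wrappers below; the real implementations may also log per-candidate
# results, plot feature importances, and handle `method` differently.
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.utils.class_weight import compute_sample_weight


def _sketch_run_gridsearch(trainX, trainY, estimator, param_grid,
                           sample_weight=False, cv=5, scoring='roc_auc',
                           n_jobs=4, method=None):
    # Hypothetical stand-in for misc.run_gridsearch.
    fit_params = {}
    if sample_weight:
        # Assumption: sample_weight=True means class-balancing weights.
        fit_params['sample_weight'] = compute_sample_weight('balanced', trainY)
    gs = GridSearchCV(estimator, param_grid, scoring=scoring, cv=cv,
                      n_jobs=n_jobs)
    gs.fit(trainX, trainY, **fit_params)
    return gs.best_params_, gs.best_score_


def _sketch_modelfit(estimator, trainX, trainY, method, n_jobs=4, cv=5):
    # Hypothetical stand-in for misc.modelfit: cross-validated AUC and
    # accuracy, then a final fit on the full training set.
    auc = cross_val_score(estimator, trainX, trainY, scoring='roc_auc',
                          cv=cv, n_jobs=n_jobs).mean()
    acc = cross_val_score(estimator, trainX, trainY, scoring='accuracy',
                          cv=cv, n_jobs=n_jobs).mean()
    estimator.fit(trainX, trainY)
    return auc, acc
# ---------------------------------------------------------------------------
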
def train_GBDT(estimator, trainX, trainY, method, n_jobs=4, skip=False):
    # Gradient Boosted Decision Trees
    logger = misc.init_logger(method)
    xmlPath = os.path.join(os.path.dirname(__file__), "params",
                           '%s.xml' % method[method.find('_') + 1:])
    if not skip:
        logger.info("Begin to train GBDT...")
        misc.modelfit(estimator, trainX, trainY, method, n_jobs=n_jobs)

        # fine-tune n_estimators
        param_grid = {"n_estimators": np.arange(50, 601, 50)}
        best_params, best_score = misc.run_gridsearch(
            trainX, trainY, estimator, param_grid, sample_weight=True,
            cv=5, scoring='roc_auc', n_jobs=n_jobs, method=method)
        best_n_estimators = best_params['n_estimators']
        estimator.set_params(n_estimators=best_n_estimators)

        # fine-tune max_depth and min_samples_split
        param_grid = {
            "max_depth": np.arange(5, 30, 2),
            "min_samples_split": np.arange(0.005, 0.031, 0.005)
        }
        best_params, best_score = misc.run_gridsearch(
            trainX, trainY, estimator, param_grid, sample_weight=True,
            cv=5, scoring='roc_auc', n_jobs=n_jobs, method=method)
        estimator.set_params(
            max_depth=best_params['max_depth'],
            min_samples_split=best_params['min_samples_split'])

        # fine-tune min_samples_leaf
        param_grid = {"min_samples_leaf": np.arange(5, 51, 5)}
        best_params, best_score = misc.run_gridsearch(
            trainX, trainY, estimator, param_grid, sample_weight=True,
            cv=5, scoring='roc_auc', n_jobs=n_jobs, method=method)
        estimator.set_params(min_samples_leaf=best_params['min_samples_leaf'])

        # fine-tune max_features
        feat_num = len(list(trainX.columns))
        param_grid = {
            "max_features": np.arange(int(np.sqrt(feat_num)),
                                      int(0.4 * feat_num), 2)
        }
        best_params, best_score = misc.run_gridsearch(
            trainX, trainY, estimator, param_grid, sample_weight=True,
            cv=5, scoring='roc_auc', n_jobs=n_jobs, method=method)
        if best_params['max_features'] == int(np.sqrt(feat_num)):
            # 'sqrt' replaces the deprecated 'auto' (identical for classifiers)
            estimator.set_params(max_features='sqrt')
        else:
            estimator.set_params(max_features=best_params['max_features'])

        # fine-tune subsample
        param_grid = {"subsample": [0.6, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0]}
        best_params, best_score = misc.run_gridsearch(
            trainX, trainY, estimator, param_grid, sample_weight=True,
            cv=5, scoring='roc_auc', n_jobs=n_jobs, method=method)
        estimator.set_params(subsample=best_params['subsample'])

        # re-tune n_estimators jointly with learning_rate: lowering the rate
        # scales up the number of trees proportionally
        pairs = [(0.1, best_n_estimators),
                 (0.075, int(best_n_estimators * 4.0 / 3)),
                 (0.05, best_n_estimators * 2),
                 (0.04, int(best_n_estimators * 5.0 / 2)),
                 (0.03, int(best_n_estimators * 10.0 / 3)),
                 (0.01, best_n_estimators * 10),
                 (0.005, best_n_estimators * 20)]
        max_n_estimators = 2400
        opt_params = None
        opt_score = 0.0
        for learning_rate, n_estimators in pairs:
            if n_estimators > max_n_estimators:
                break
            estimator.set_params(learning_rate=learning_rate,
                                 n_estimators=n_estimators)
            auc_score, acc_score = misc.modelfit(estimator, trainX, trainY,
                                                 method, n_jobs=n_jobs)
            logger.info(
                "With learning_rate %s, n_estimators %s, auc_score is %s, acc_score is %s"
                % (learning_rate, n_estimators, auc_score, acc_score))
            if auc_score > opt_score:
                opt_params = (learning_rate, n_estimators)
                opt_score = auc_score
        logger.info(
            "Best learning_rate is %s, best n_estimators is %s. "
            "The corresponding auc_score is %s"
            % (opt_params[0], opt_params[1], opt_score))
        estimator.set_params(learning_rate=opt_params[0],
                             n_estimators=opt_params[1])

        misc.update_params_toXML(estimator, method, xmlPath)
        logger.info("After parameter tuning, get the CV score...")
        misc.modelfit(estimator, trainX, trainY, method, n_jobs=n_jobs)
    else:
        try:
            estimator = misc.load_params_fromXML(estimator, method, xmlPath)
        except Exception:
            return estimator
    logger.info("After parameter tuning, the current parameters are\n %s"
                % str(estimator.get_params()))
    return estimator
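
# ---------------------------------------------------------------------------
# Usage sketch (assumption): the estimator settings, method strings, and toy
# data below are illustrative, not part of the original project. The
# '<prefix>_<name>' method convention matches the xmlPath slicing above, and
# trainX must be a pandas DataFrame because the trainers read trainX.columns.
if __name__ == '__main__':
    import pandas as pd
    from sklearn.ensemble import (ExtraTreesClassifier,
                                  GradientBoostingClassifier)

    rng = np.random.default_rng(0)
    trainX = pd.DataFrame(rng.normal(size=(500, 30)),
                          columns=['f%d' % i for i in range(30)])
    trainY = pd.Series(rng.integers(0, 2, size=500))

    ext = train_EXT(ExtraTreesClassifier(random_state=42), trainX, trainY,
                    method='clf_EXT', n_jobs=4)
    gbdt = train_GBDT(GradientBoostingClassifier(random_state=42), trainX,
                      trainY, method='clf_GBDT', n_jobs=4)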