def __init__(self, output_dim, boosting_type='gbdt', num_leaves=31, max_depth=-1,
             learning_rate=0.1, n_estimators=100, subsample_for_bin=200000,
             class_weight=None, min_split_gain=0., min_child_weight=1e-3,
             min_child_samples=20, subsample=1., subsample_freq=0,
             colsample_bytree=1., reg_alpha=0., reg_lambda=0., random_state=None,
             n_jobs=-1, silent=True, importance_type='split'):
    if output_dim > 2:
        objective = "multiclass"
    else:
        objective = "regression"
    self.model = lightgbm.LGBMModel(
        boosting_type=boosting_type, num_leaves=num_leaves, max_depth=max_depth,
        learning_rate=learning_rate, n_estimators=n_estimators,
        subsample_for_bin=subsample_for_bin, objective=objective,
        class_weight=class_weight, min_split_gain=min_split_gain,
        min_child_weight=min_child_weight, min_child_samples=min_child_samples,
        subsample=subsample, subsample_freq=subsample_freq,
        colsample_bytree=colsample_bytree, reg_alpha=reg_alpha,
        reg_lambda=reg_lambda, random_state=random_state, n_jobs=n_jobs,
        silent=silent, importance_type=importance_type,
        n_classes=output_dim, first_metric_only=True)
    self.output_dim = output_dim
    self.custom_metrics = {}
    self.result = None
    self.rgetter = RGetter()
    self.stop_training = False
    self.custom_loss_callable = None
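# Minimal usage sketch (not from the original source). Note that LightGBM's own
# multiclass parameter is spelled `num_class`; the `n_classes` kwarg above is
# forwarded through **kwargs and is assumed to be understood by the wrapper's
# LightGBM version. Data and names below are illustrative only.
import numpy as np
import lightgbm

X = np.random.rand(100, 5)
y = np.random.randint(0, 3, size=100)

model = lightgbm.LGBMModel(objective='multiclass', num_class=3)
model.fit(X, y)
proba = model.predict(X)  # shape (100, 3): per-class probabilities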
def optimize_hyperparams(self, param_grid, X, Y, cv=4,
                         scoring='neg_mean_squared_error', verbose=1):
    '''Use GridSearchCV to optimize the model's params.'''
    params = dict(self.params)  # copy so the instance defaults are not mutated
    params['learning_rate'] = 0.05
    params['n_estimators'] = 1000
    # `iid` was removed in scikit-learn 0.24, and `grid_scores_` has long been
    # replaced by `cv_results_`; the `cv` argument is now actually used.
    gsearch1 = GridSearchCV(estimator=lgb.LGBMModel(**params),
                            param_grid=param_grid,
                            scoring=scoring,
                            n_jobs=1,
                            cv=cv)
    gsearch1.fit(X, Y)
    scores = gsearch1.cv_results_
    best_params = gsearch1.best_params_
    best_score = np.sqrt(-gsearch1.best_score_)
    if verbose > 0:
        if verbose > 1:
            print('Scores are: ', scores)
        print('Best params: ', best_params)
        print('Best score: ', best_score)
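# Hedged usage sketch (grid values and variable names are illustrative, not
# from the source): a small grid over tree complexity, assuming `self.params`
# holds the base estimator parameters.
param_grid = {
    'num_leaves': [31, 63, 127],
    'max_depth': [-1, 6, 10],
}
# model.optimize_hyperparams(param_grid, X_train, y_train, cv=4)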
def __init__(
        self,
        boosting='dart',
        learning_rate=0.05,
        min_data_in_leaf=20,
        # application='binary'
        feature_fraction=0.7,
        num_leaves=41,
        metric='auc',
        drop_rate=0.15):
    self.parameters = {
        'boosting': boosting,                  # dart (dropout trees) often performs better
        # 'application': application,          # Binary classification
        'learning_rate': learning_rate,        # Controls the size of a gradient-descent step
        'min_data_in_leaf': min_data_in_leaf,  # Data set is quite small, so reduce this a bit
        'feature_fraction': feature_fraction,  # Proportion of features in each boost; controls overfitting
        'num_leaves': num_leaves,              # Controls tree size, since LightGBM splits leaf-wise
        'metric': metric,                      # Area under the ROC curve as the evaluation metric
        'drop_rate': drop_rate
    }
    self.evaluation_results = {}
    self.model = lgb.LGBMModel()
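# Hedged sketch (not from the source): a native-API parameter dict like the one
# above is normally consumed by lgb.train rather than by the bare LGBMModel
# constructed last. Data and names here are illustrative.
import numpy as np
import lightgbm as lgb

X, y = np.random.rand(200, 10), np.random.randint(0, 2, 200)
params = {'boosting': 'dart', 'objective': 'binary', 'metric': 'auc',
          'learning_rate': 0.05, 'num_leaves': 41, 'drop_rate': 0.15}
evaluation_results = {}
train_set = lgb.Dataset(X, label=y)
booster = lgb.train(params, train_set, valid_sets=[train_set],
                    callbacks=[lgb.record_evaluation(evaluation_results)])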
def __init__(self, **kwargs):
    # TODO: use config file to set default parameters (like in candle)
    # self.model = lgb.LGBMModel(
    #     objective=LGBM_REGRESSOR.ml_objective,
    #     n_estimators=n_estimators,
    #     n_jobs=n_jobs,
    #     random_state=random_state)
    self.model = lgb.LGBMModel(
        objective=LGBM_CLASSIFIER.ml_objective,
        **kwargs)
def __init__(self, n_estimators=100, eval_metric=['l2', 'l1'], n_jobs=1,
             random_state=None, logger=None):
    # TODO: use config file to set default parameters (like in candle)
    # Note: `eval_metric` and `logger` are accepted but not forwarded here.
    self.model = lgb.LGBMModel(objective=LGBM_REGRESSOR.ml_objective,
                               n_estimators=n_estimators,
                               n_jobs=n_jobs,
                               random_state=random_state)
def run_rfe(self, model_params, target, X_vars, threshold=0, model_type='indicator'):
    if model_type == 'indicator':
        model = lightgbm.LGBMModel(**model_params, importance_type='gain')
    elif model_type == 'regressor':
        model = lightgbm.LGBMRegressor(**model_params, importance_type='gain')

    eval_set = [(self.df_tune[X_vars], self.df_tune[self.target])]
    model.fit(X=self.df_train[X_vars], y=self.df_train[self.target],
              eval_set=eval_set, verbose=False)

    # DataFrame of features and their corresponding importance by gain
    importance_df = pd.DataFrame(data={
        'features': X_vars,
        'gain_importances': model.feature_importances_
    })

    while sum(model.feature_importances_ <= threshold) > 0:
        print(f"{sum(model.feature_importances_ <= threshold)} features below threshold")
        print("The following features will be removed:")
        print(importance_df.loc[model.feature_importances_ <= threshold]
              ['features'].tolist())

        features = importance_df.loc[
            model.feature_importances_ > threshold]['features'].tolist()
        eval_set = [(self.df_tune[features], self.df_tune[self.target])]
        model.fit(X=self.df_train[features], y=self.df_train[self.target],
                  eval_set=eval_set, verbose=False)
        importance_df = pd.DataFrame(data={
            'features': model.booster_.feature_name(),
            'gain_importances': model.feature_importances_
        })

    self.feature_importance_df = importance_df.sort_values(
        by=['gain_importances'], ascending=False)
    self.post_rfe_model = model
    return self.post_rfe_model, self.feature_importance_df
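# Hedged usage sketch: the attribute names (`df_train`, `df_tune`, `target`)
# come from the method above; the instance name and parameter values below are
# illustrative assumptions.
# selector.run_rfe(model_params={'objective': 'binary', 'n_estimators': 200},
#                  target='label', X_vars=feature_columns,
#                  threshold=0, model_type='indicator')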
def Model_stack(df_train_x, df_train_y, df_test):
    # kernel is one of 'linear'/'poly'/'rbf'/'sigmoid'/'precomputed'/callable;
    # defaults to 'rbf' if not given ('precomputed' expects a precomputed kernel matrix)
    svr_ = SVR(kernel='linear', degree=3, coef0=0.0, tol=0.001, C=1.0,
               epsilon=0.1, shrinking=True, cache_size=20)
    lgb_ = lgb.LGBMModel(boosting_type='gbdt', num_leaves=35, max_depth=20,
                         max_bin=255, learning_rate=0.03,
                         n_estimators=10,  # fixed: was misspelled `n_estimator`
                         subsample_for_bin=2000, objective='regression',
                         min_split_gain=0.0, min_child_weight=0.001,
                         min_child_samples=20, subsample=1.0, verbose=0,
                         subsample_freq=1, colsample_bytree=1.0, reg_alpha=0.0,
                         reg_lambda=0.0, random_state=None, n_jobs=-1, silent=True)
    RF_model = RandomForestRegressor(n_estimators=50, max_depth=25,
                                     min_samples_split=20, min_samples_leaf=10,
                                     max_features='sqrt', oob_score=True,
                                     random_state=10)
    # Bayesian ridge regression
    BR_model = BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False,
                             copy_X=True, fit_intercept=True, lambda_1=1e-06,
                             lambda_2=1e-06, n_iter=300, normalize=False,
                             tol=0.0000001, verbose=False)
    linear_model = LinearRegression()
    ls = Lasso(alpha=0.00375)
    x_train, x_test, y_train, y_test = train_test_split(df_train_x, df_train_y,
                                                        test_size=0.6)
    rg = RidgeCV(cv=5)
    stack = pd.DataFrame()
    stack_test = pd.DataFrame()
    ls.fit(x_train, y_train)
    lgb_.fit(x_train, y_train)
    RF_model.fit(x_train, y_train)
    svr_.fit(x_train, y_train)
    linear_model.fit(x_train, y_train)
    BR_model.fit(x_train, y_train)
    # Column names now match the model that produced each prediction
    # (the original labels were shuffled, e.g. Lasso was stored as 'rf').
    stack['lasso'] = ls.predict(x_test)
    stack['lightgbm'] = lgb_.predict(x_test)
    stack['rf'] = RF_model.predict(x_test)
    stack['svr'] = svr_.predict(x_test)
    stack['linear_model'] = linear_model.predict(x_test)
    stack['BR'] = BR_model.predict(x_test)
    # print('stacking_model: ', Cross_validation(stack, y_test, rg))
    rg.fit(stack, y_test)
    stack_test['lasso'] = ls.predict(df_test)
    stack_test['lightgbm'] = lgb_.predict(df_test)
    stack_test['rf'] = RF_model.predict(df_test)
    stack_test['svr'] = svr_.predict(df_test)
    stack_test['linear_model'] = linear_model.predict(df_test)
    stack_test['BR'] = BR_model.predict(df_test)
    final_ans = rg.predict(stack_test)
    pd.DataFrame(final_ans).to_csv('predict_drop+3.txt', index=False, header=False)
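# Hedged alternative sketch (a swapped-in technique, not the author's code):
# scikit-learn's StackingRegressor implements the same idea, using out-of-fold
# meta-features instead of a single train/holdout split.
from sklearn.ensemble import StackingRegressor, RandomForestRegressor
from sklearn.linear_model import Lasso, RidgeCV
from sklearn.svm import SVR

stacker = StackingRegressor(
    estimators=[('lasso', Lasso(alpha=0.00375)),
                ('rf', RandomForestRegressor(n_estimators=50)),
                ('svr', SVR(kernel='linear'))],
    final_estimator=RidgeCV(cv=5))
# stacker.fit(df_train_x, df_train_y); final_ans = stacker.predict(df_test)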
def lgb_params(self):
    return lgb.LGBMModel(boosting_type=self.boosting_type,
                         num_leaves=self.num_leaves,
                         max_depth=self.max_depth,
                         learning_rate=self.learning_rate,
                         n_estimators=self.n_estimators,
                         max_bin=self.max_bin,
                         subsample_for_bin=self.subsample_for_bin,
                         objective=self.objective,
                         min_split_gain=self.min_split_gain,
                         min_child_weight=self.min_child_weight,
                         min_child_samples=self.min_child_samples,
                         subsample=self.subsample,
                         subsample_freq=self.subsample_freq,
                         colsample_bytree=self.colsample_bytree,
                         reg_alpha=self.reg_alpha,
                         reg_lambda=self.reg_lambda,
                         # random_state=self.random_state,
                         # n_jobs=n_jobs,
                         # silent=silent,
                         )
def main():
    args = handleArguments()

    # Read data
    XImgTrain, yImgTrain = loadEdgesDataFromDirs(
        True, target=args.targetTrainDir, nonTarget=args.nonTargetTrainDir)
    XAudioTargetTrain = extractMFCCFromDir(args.targetTrainDir)
    # yAudioTargetTrain = [1 for i in range(len(XAudioTargetTrain))]
    XAudioNonTargetTrain = extractMFCCFromDir(args.nonTargetTrainDir)
    # yAudioNonTargetTrain = [0 for i in range(len(XAudioNonTargetTrain))]

    # Train and pickle
    if args.hmmModelOutput:
        hmmClassifier = HMMBinaryModel()
        hmmClassifier.fit(XAudioTargetTrain, XAudioNonTargetTrain)
        pickle.dump(hmmClassifier, args.hmmModelOutput)
    if args.lgbmModelOutput:
        gbmClassifier = lgbm.LGBMModel(objective='binary', random_state=42)
        gbmClassifier.fit(XImgTrain, yImgTrain)
        pickle.dump(gbmClassifier, args.lgbmModelOutput)
def model_train(train_data, test_data, train_target, test_target):
    # Multiple linear regression
    model = LinearRegression()
    model.fit(train_data, train_target)
    score = mean_squared_error(test_target, model.predict(test_data))
    print('LinearRegression:', score)

    # K-nearest-neighbors regression
    model = KNeighborsRegressor(n_neighbors=8)
    model.fit(train_data, train_target)
    score = mean_squared_error(test_target, model.predict(test_data))
    print('KNeighborsRegressor:', score)

    # Decision-tree regression
    model = DecisionTreeRegressor(random_state=0)
    model.fit(train_data, train_target)
    score = mean_squared_error(test_target, model.predict(test_data))
    print('DecisionTreeRegressor:', score)

    # Random-forest regression
    model = RandomForestRegressor(n_estimators=200)
    model.fit(train_data, train_target)
    score = mean_squared_error(test_target, model.predict(test_data))
    print('RandomForestRegressor:', score)

    # LightGBM regression
    model = lgb.LGBMModel(
        learning_rate=0.01,
        max_depth=-1,
        n_estimators=5000,
        boosting_type='gbdt',
        random_state=2020,
        objective='regression',
    )
    model.fit(train_data, train_target)
    score = mean_squared_error(test_target, model.predict(test_data))
    print('lightGbm:', score)
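# Hedged refactor sketch (illustrative, not from the source): the repeated
# fit/score/print blocks above collapse into a loop over a name->model dict,
# reusing the same variables the function receives.
models = {
    'LinearRegression': LinearRegression(),
    'KNeighborsRegressor': KNeighborsRegressor(n_neighbors=8),
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state=0),
    'RandomForestRegressor': RandomForestRegressor(n_estimators=200),
    'lightGbm': lgb.LGBMModel(objective='regression', learning_rate=0.01,
                              n_estimators=5000, random_state=2020),
}
for name, m in models.items():
    m.fit(train_data, train_target)
    print(name, mean_squared_error(test_target, m.predict(test_data)))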
def init(self, param: dict) -> None:
    """Initialize predictor."""
    self._model = lgb.LGBMModel(**param)
import lightgbm as lgbm

# Note: several keyword arguments below duplicate each other through LightGBM
# parameter aliases (n_estimators/num_boost_round, subsample/bagging_fraction,
# seed/random_state); LightGBM warns about such duplicates and resolves them
# by alias precedence.
clf = lgbm.LGBMModel(boosting_type='gbdt',
                     objective='regression',
                     n_estimators=300,
                     num_boost_round=200000,
                     num_leaves=30,
                     learning_rate=0.05,
                     min_split_gain=0.25,
                     min_child_weight=1,
                     min_child_samples=10,
                     scale_pos_weight=1,
                     seed=42,
                     max_depth=-1,
                     subsample=0.8,
                     bagging_fraction=1,
                     max_bin=5000,
                     bagging_freq=20,
                     colsample_bytree=0.6,
                     metric="rmse",
                     n_jobs=10,
                     silent=False)
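# Hedged cleaned-up sketch (illustrative): the same configuration without the
# duplicated aliases, keeping only the sklearn-style spellings.
clf_dedup = lgbm.LGBMModel(boosting_type='gbdt', objective='regression',
                           n_estimators=300, num_leaves=30, learning_rate=0.05,
                           min_split_gain=0.25, min_child_weight=1,
                           min_child_samples=10, random_state=42, max_depth=-1,
                           subsample=0.8, subsample_freq=20,
                           colsample_bytree=0.6, max_bin=5000, n_jobs=10)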
add_name = np.array([n for n in names if n[:3] == 'AD_'])
if len(add_name) != 0:
    first_X, first_y, second_X, second_y, second_index = Ad_split(X, y, names, AD=True)
    kf = KFold(n_splits=5)
    mse = []
    for train_idx, test_idx in tqdm(kf.split(first_X), total=5):
        train_X, test_X = first_X[train_idx], first_X[test_idx]
        train_y, test_y = first_y[train_idx], first_y[test_idx]
        vector = []
        for i in range(train_y.shape[1]):
            first_model = lgb.LGBMModel(objective='regression')
            first_model.fit(train_X, train_y[:, i])
            first_model.booster_.save_model('first_model_' + str(i) + '.txt')
            pred = first_model.predict(test_X)
            vector.append(mean_squared_error(test_y[:, i], pred))
        mse.append(vector)
    print('5-Fold score(mse):', np.mean(mse, axis=0))

    add = np.zeros((len(second_X), first_y.shape[1]), dtype=np.float32)
    for i in range(first_y.shape[1]):
        bst = lgb.Booster(model_file='first_model_' + str(i) + '.txt')
        add[:, i] = bst.predict(second_X)
    second_X = np.concatenate([second_X, add], axis=1)
else:
    second_X, second_y, second_index = Ad_split(X, y, names, AD=False)
results.reset_index(inplace=True, drop=True)

# Extract the ideal number of estimators and hyperparameters,
# converting the stored params from a string back to a dictionary
best_bayes_estimators = int(results.loc[0, 'estimators'])
best_bayes_params = ast.literal_eval(results.loc[0, 'params']).copy()
del best_bayes_params['metric']

print("Creating Model")
# Re-create the best model and train on the training data
best_bayes_model = lgb.LGBMModel(n_estimators=best_bayes_estimators,
                                 n_jobs=-1,
                                 metric='multi_error',
                                 random_state=50,
                                 **best_bayes_params)
print("Fitting Model")
best_bayes_model.fit(train_features, train_labels)

print("Predicting Model")
# Evaluate on the testing data
Predictions = best_bayes_model.predict(test_features)

correct = 0
# Count how often the model's highest-probability class matches the test labels
for i in range(0, Predictions.shape[0]):
    maxProbability = np.max(Predictions[i, :])
    for j in range(0, len(Predictions[i, :])):
                  verbose=0,
                  warm_start=False)
# myGBR.get_params  # Get all of the model's parameters

##############################--lightgbm--####################################
mylgb = lgb.LGBMModel(boosting_type='gbdt',
                      num_leaves=40,
                      max_depth=7,
                      max_bin=233,
                      learning_rate=0.03,
                      n_estimators=10,  # fixed: was misspelled `n_estimator`
                      subsample_for_bin=300,
                      objective='regression',
                      min_split_gain=0.0,
                      min_child_weight=0.1,
                      min_child_samples=20,
                      subsample=1.0,
                      verbose=0,
                      subsample_freq=1,
                      colsample_bytree=1.0,
                      reg_alpha=0.0,
                      reg_lambda=0.0,
                      random_state=None,
                      n_jobs=-1,
                      silent=True)

###############################--xgboost--######################################
cv_params = {'n_estimators': [2000]}
other_params = {
    'learning_rate': 0.005,
def objective(hpp):
    '''Returns compet. cv validation score - avg ensemble of folds'''
    # CV main loop
    for i, (train_index, test_index) in enumerate(kf.split(x_train_test)):
        # Get current fold
        x_train = x_train_test[train_index]
        y_train = y_train_test[train_index]
        x_test = x_train_test[test_index]
        y_test = y_train_test[test_index]

        # Create lgb booster
        bst = lgb.LGBMModel(
            objective='regression',
            num_leaves=int(hpp['num_leaves']),
            learning_rate=hpp['lr'],
            n_estimators=10000,
            min_child_samples=int(hpp['min_child_samples']),
            subsample=hpp['subsample'],
            reg_lambda=hpp['reg_lambda'],
        )
        bst.fit(
            X=x_train,
            y=y_train,
            eval_set=[(x_test, y_test)],
            eval_metric='rmse',
            early_stopping_rounds=15,
            categorical_feature=ccols,
            verbose=False,
        )

        # Calculate competition metric below
        # Predict
        y_pred = bst.predict(x_test)
        y_pred = np.clip(y_pred, 0, np.inf)
        y_pred = np.expm1(np.squeeze(y_pred))

        # Build dataframe with predictions and truth per session
        session_df = train_test_df.iloc[test_index, [0, -1]].copy()
        session_df['y_pred'] = y_pred

        # Aggregate pred and truth per user
        y_true_user = session_df.groupby(
            'fullVisitorId')['totals.transactionRevenue'].sum().values
        y_pred_user = session_df.groupby('fullVisitorId')['y_pred'].sum().values

        # Apply log1p to aggregated predictions
        y_pred_user = np.log1p(y_pred_user)
        y_true_user = np.log1p(y_true_user)

        # Comp. metric
        val_loss = np.sqrt(np.mean(np.power(y_pred_user - y_true_user, 2)))

    return {
        'loss': val_loss,
        'params': hpp,
        'status': STATUS_OK,
    }
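# Hedged usage sketch: the hyperopt API calls are real, but the search-space
# bounds are illustrative. The keys match what `objective` reads from `hpp`,
# and the dict it returns ('loss'/'status') is exactly what fmin expects.
import numpy as np
from hyperopt import fmin, tpe, hp, Trials

space = {
    'num_leaves': hp.quniform('num_leaves', 20, 150, 1),
    'lr': hp.loguniform('lr', np.log(0.005), np.log(0.5)),
    'min_child_samples': hp.quniform('min_child_samples', 20, 500, 5),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),
}
trials = Trials()
best = fmin(objective, space=space, algo=tpe.suggest,
            max_evals=50, trials=trials)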
def fit(self, clinical, genes, treatments, outcome,
        optimization_n_call=50,
        optimization_n_folds=2,
        optimization_early_stopping_rounds=1,
        clinical_marker_selection_threshold=.05,
        gene_selection_threshold=.05,
        dae_early_stopping_rounds=1000,
        dae_decay_rate=0.1,
        dae_learning_rate=1e-4,
        dae_steps=50000,
        lgb_fixed_parameters=dict(),
        lgb_early_stopping_rounds=10,
        predictor_n_folds=5):
    """ """
    self.__reset__()

    ############################################################################################
    # Select gene expressions
    ############################################################################################
    self.selected_clinical = self.select_markers(
        clinical, outcome, threshold=clinical_marker_selection_threshold)
    self.selected_genes = self.select_markers(
        genes, outcome, threshold=gene_selection_threshold)

    if self.n_gene_limit is None:
        self.n_gene_limit = self.select_k_top_markers(self.selected_genes[2])
    if self.n_gene_limit is not None:
        if 4 <= self.n_gene_limit < len(self.selected_genes[0]):
            self.selected_genes = (self.selected_genes[0][:self.n_gene_limit],
                                   self.selected_genes[1][:self.n_gene_limit],
                                   self.selected_genes[2][:self.n_gene_limit])

    pd.DataFrame({'clinical_marker': self.selected_clinical[0],
                  'pvalue': self.selected_clinical[1],
                  'entropy': self.selected_clinical[2]}).to_csv(
        os.path.join(self.output_path, 'selected_markers',
                     'clinical_{0:03}_{1:03}.csv'.format(
                         self.experiment_number, self.number_of_experiments)),
        index=False)

    pd.DataFrame({'gene': self.selected_genes[0],
                  'pvalue': self.selected_genes[1],
                  'entropy': self.selected_genes[2]}).to_csv(
        os.path.join(self.output_path, 'selected_markers',
                     'genes_{0:03}_{1:03}.csv'.format(
                         self.experiment_number, self.number_of_experiments)),
        index=False)

    clinical = clinical.loc[:, self.selected_clinical[0]].join(treatments)
    genes = genes.loc[:, self.selected_genes[0]]

    ############################################################################################
    # Normalizing Gene Expression Data
    ############################################################################################
    self.genes_min_max_scaler = MinMaxScaler()
    genes = pd.DataFrame(self.genes_min_max_scaler.fit_transform(genes),
                         index=genes.index, columns=genes.columns)

    ############################################################################################
    # Genetic Profiling
    ############################################################################################
    self.fit_genetic_profiling(genes)
    profiling = self.predict_genetic_profiling(genes)
    clinical = pd.concat([clinical, profiling], axis=1)

    ############################################################################################
    # Gene Clustering
    ############################################################################################
    self.fit_gene_clustering(genes)
    gene_clusters = self.predict_gene_clustering(genes)
    clinical = pd.concat([clinical, gene_clusters], axis=1)

    ############################################################################################
    # Normalizing Clinical Data
    ############################################################################################
    self.clinical_min_max_scaler = MinMaxScaler()
    clinical = pd.DataFrame(self.clinical_min_max_scaler.fit_transform(clinical),
                            index=clinical.index, columns=clinical.columns)
    clinical = clinical.fillna(0)

    ############################################################################################
    # Denoising Autoencoder
    ############################################################################################
    self.fit_dae(markers=genes,
                 decay_rate=dae_decay_rate,
                 learning_rate=dae_learning_rate,
                 steps=dae_steps,
                 early_stopping_rounds=dae_early_stopping_rounds)
    dda = self.predict_dae(genes)

    ############################################################################################
    # Joining all features
    ############################################################################################
    join = clinical.join(genes, how='inner').join(dda, how='inner')
    x = join.values
    y = outcome.values

    # smote = SMOTE(sampling_strategy='minority', random_state=self.random_state, n_jobs=-1)
    # x, y = smote.fit_resample(x, y)
    # del smote

    ############################################################################################
    # LightGBM Hyperparameter Optimization
    ############################################################################################
    lgb_params = LightGBMOptimizer(
        n_calls=optimization_n_call,
        n_folds=optimization_n_folds,
        fixed_parameters=lgb_fixed_parameters,
        early_stopping_rounds=optimization_early_stopping_rounds,
        random_state=self.random_state).optimize(x, y)
    self.lgb_optimized_params = lgb_params
    lgb_params = {**lgb_params, **lgb_fixed_parameters}

    ############################################################################################
    # Training
    ############################################################################################
    # shuffle=True is required when passing random_state to StratifiedKFold
    kkfold = StratifiedKFold(predictor_n_folds, shuffle=True,
                             random_state=self.random_state)
    for iii, (t_index, v_index) in enumerate(kkfold.split(x, y)):
        x_train, y_train = x[t_index, :], y[t_index]
        x_valid, y_valid = x[v_index, :], y[v_index]

        ###############################################################################
        # Light GBM
        ###############################################################################
        lgb = lightgbm.LGBMModel(**lgb_params)
        lgb.fit(X=x_train, y=y_train,
                eval_set=[(x_valid, y_valid)],
                early_stopping_rounds=lgb_early_stopping_rounds,
                verbose=self.verbose is not None and self.verbose > 0)
        y_train_hat_lgb = lgb.predict(x_train)
        y_valid_hat_lgb = lgb.predict(x_valid)

        self.lgb_models.append(lgb)
        self.lgb_mins.append(min(np.min(y_train_hat_lgb), np.min(y_valid_hat_lgb)))
        self.lgb_maxs.append(max(np.max(y_train_hat_lgb), np.max(y_valid_hat_lgb)))

        ###############################################################################
        # Performance metrics
        ###############################################################################
        # y_train_hat = (y_train_hat_lgb - self.lgb_mins[-1]) / (self.lgb_maxs[-1] - self.lgb_mins[-1])
        # y_valid_hat = (y_valid_hat_lgb - self.lgb_mins[-1]) / (self.lgb_maxs[-1] - self.lgb_mins[-1])
        y_train_hat = y_train_hat_lgb
        y_valid_hat = y_valid_hat_lgb

        self.predictor_train_losses.append(log_loss(y_train, y_train_hat))
        self.predictor_train_aucs.append(roc_auc_score(y_train, y_train_hat))
        self.predictor_valid_losses.append(log_loss(y_valid, y_valid_hat))
        self.predictor_valid_aucs.append(roc_auc_score(y_valid, y_valid_hat))

    # {0:.3f} (rather than the original {0:03}) prints three decimal places
    print('TRAIN mean log loss: {0:.3f}'.format(np.mean(self.predictor_train_losses)))
    print('TRAIN mean AUC: {0:.3f}'.format(np.mean(self.predictor_train_aucs)))
    print('VALID mean log loss: {0:.3f}'.format(np.mean(self.predictor_valid_losses)))
    print('VALID mean AUC: {0:.3f}'.format(np.mean(self.predictor_valid_aucs)))
                           metrics='auc',
                           seed=42)

    # results to return
    score = cv_results['auc-mean'][-1]
    estimators = len(cv_results['auc-mean'])
    hyperparameters['n_estimators'] = estimators
    return [score, hyperparameters, iteration]

score, params, iteration = objective(default_params, 1)
print('The cross-validation ROC AUC was {:.5f}.'.format(score))

# Create a default model
model = lgb.LGBMModel()
model.get_params()

# Hyperparameter grid
param_grid = {
    'boosting_type': ['gbdt', 'goss', 'dart'],
    'num_leaves': list(range(20, 150)),
    'learning_rate': list(np.logspace(np.log10(0.005), np.log10(0.5),
                                      base=10, num=1000)),
    'subsample_for_bin': list(range(20000, 300000, 20000)),
    'min_child_samples': list(range(20, 500, 5)),
    'reg_alpha': list(np.linspace(0, 1)),
    'reg_lambda':
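# Hedged sketch (illustrative): once the grid above is complete, random search
# samples one value per hyperparameter, the usual companion to a grid this large.
import random

random.seed(50)
hyperparameters = {k: random.sample(v, 1)[0] for k, v in param_grid.items()}
# score, hyperparameters, iteration = objective(hyperparameters, 1)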
def fit(self, genes, outcome, clinical=None, treatments=None,
        optimization_n_call=50,
        optimization_n_folds=2,
        optimization_early_stopping_rounds=1,
        clinical_marker_selection_threshold=.05,
        gene_selection_threshold=.05,
        dae_early_stopping_rounds=1000,
        dae_decay_rate=0.1,
        dae_learning_rate=1e-4,
        dae_steps=50000,
        dae_keep_probability=.75,
        use_predictor=True,
        lgb_fixed_parameters=None,
        lgb_early_stopping_rounds=10,
        predictor_n_folds=5,
        minor_class_augmentation=False):
    """ """
    self.__reset__()
    self.minor_class_augmentation = minor_class_augmentation
    x = None

    ############################################################################################
    # Select gene expressions
    ############################################################################################
    if clinical is not None:
        self.selected_clinical = self.select_markers(
            clinical, outcome,
            threshold=clinical_marker_selection_threshold,
            random_state=self.random_state)

    self.selected_genes = self.select_markers(
        genes, outcome,
        threshold=gene_selection_threshold,
        random_state=self.random_state)

    # if self.n_gene_limit is None:
    #     self.n_gene_limit = self.select_k_top_markers(self.selected_genes[2])
    if self.n_gene_limit is not None:
        if 4 <= self.n_gene_limit < len(self.selected_genes[0]):
            self.selected_genes = (
                self.selected_genes[0][:self.n_gene_limit],
                self.selected_genes[1][:self.n_gene_limit],
                self.selected_genes[2][:self.n_gene_limit])

    if self.export_metadata:
        if clinical is not None:
            pd.DataFrame({
                'clinical_marker': self.selected_clinical[0],
                'pvalue': self.selected_clinical[1],
                'entropy': self.selected_clinical[2]
            }).to_csv(os.path.join(
                self.output_path, 'selected_markers',
                'clinical_{0:03}_{1:03}.csv'.format(
                    self.experiment_number, self.number_of_experiments)),
                index=False)

        pd.DataFrame({
            'gene': self.selected_genes[0],
            'pvalue': self.selected_genes[1],
            'entropy': self.selected_genes[2]
        }).to_csv(os.path.join(
            self.output_path, 'selected_markers',
            'genes_{0:03}_{1:03}.csv'.format(self.experiment_number,
                                             self.number_of_experiments)),
            index=False)

    if clinical is not None:
        if len(self.selected_clinical[0]) > 0:
            x = clinical.loc[:, self.selected_clinical[0]]
    if treatments is not None:
        if treatments.shape[1] > 0:
            x = treatments if x is None else x.join(treatments)

    assert len(self.selected_genes[0]) >= 4, \
        'At least 4 genes are required for MuLT approach. You can increase the threshold.'
    genes = genes.loc[:, self.selected_genes[0]]

    # self.raw_genes_min_max_scaler = MinMaxScaler()
    # (the scaler is assumed to be initialized elsewhere, e.g. in __reset__)
    genes_norm = self.raw_genes_min_max_scaler.fit_transform(genes)
    genes_norm = pd.DataFrame(genes_norm, columns=genes.columns, index=genes.index)

    ############################################################################################
    # Normalizing Gene Expression Data
    ############################################################################################
    self.genes_min_max_scaler = MinMaxScaler()
    genes = pd.DataFrame(self.genes_min_max_scaler.fit_transform(np.log1p(genes)),
                         index=genes.index, columns=genes.columns)

    ############################################################################################
    # Genetic Profiling
    ############################################################################################
    self.fit_genetic_profiling(genes_norm)
    profiling = self.predict_genetic_profiling(genes_norm)
    x = pd.concat([x, profiling], axis=1) if x is not None else profiling

    ############################################################################################
    # Gene Clustering
    ############################################################################################
    self.fit_gene_clustering(genes_norm)
    gene_clusters = self.predict_gene_clustering(genes_norm)
    x = pd.concat([x, gene_clusters], axis=1)

    ############################################################################################
    # Denoising Autoencoder
    ############################################################################################
    self.fit_dae(markers=genes,
                 keep_probability=dae_keep_probability,
                 decay_rate=dae_decay_rate,
                 learning_rate=dae_learning_rate,
                 steps=dae_steps,
                 early_stopping_rounds=dae_early_stopping_rounds)

    if use_predictor:
        dda = self.predict_dae(genes)

        ############################################################################################
        # Joining all features
        ############################################################################################
        x = x.join(genes_norm).join(dda, how='inner').fillna(0)
        x, y = x.values, outcome.values

        if minor_class_augmentation:
            smote = SMOTE(sampling_strategy='minority',
                          random_state=self.random_state, n_jobs=-1)
            x, y = smote.fit_resample(x, y)
            del smote

        ############################################################################################
        # LightGBM Hyperparameter Optimization
        ############################################################################################
        if lgb_fixed_parameters is None:
            lgb_fixed_parameters = dict()

        self.predictor_optimizer = LightGBMOptimizer(
            n_calls=optimization_n_call,
            n_folds=optimization_n_folds,
            fixed_parameters=lgb_fixed_parameters,
            early_stopping_rounds=optimization_early_stopping_rounds,
            random_state=self.random_state)
        lgb_params = self.predictor_optimizer.optimize(x, y)

        self.lgb_optimized_params = lgb_params
        lgb_params = {**lgb_params, **lgb_fixed_parameters}

        ############################################################################################
        # Training
        ############################################################################################
        if predictor_n_folds > 1:
            # shuffle=True is required when passing random_state to StratifiedKFold
            kkfold = StratifiedKFold(predictor_n_folds, shuffle=True,
                                     random_state=self.random_state)
            splits = kkfold.split(x, y)
        else:
            splits = [(list(range(0, x.shape[0])), None)]

        for iii, (t_index, v_index) in enumerate(splits):
            x_train, y_train = x[t_index, :], y[t_index]
            if v_index is not None:
                x_valid, y_valid = x[v_index, :], y[v_index]

            ###############################################################################
            # Light GBM
            ###############################################################################
            lgb = lightgbm.LGBMModel(**lgb_params)
            lgb.fit(X=x_train, y=y_train,
                    eval_set=[(x_valid, y_valid)] if v_index is not None else None,
                    early_stopping_rounds=lgb_early_stopping_rounds
                    if v_index is not None else None,
                    verbose=self.verbose is not None and self.verbose > 0)

            y_train_hat_lgb = lgb.predict(x_train)
            self.lgb_models.append(lgb)

            if v_index is not None:
                y_valid_hat_lgb = lgb.predict(x_valid)
                self.lgb_mins.append(min(np.min(y_train_hat_lgb), np.min(y_valid_hat_lgb)))
                self.lgb_maxs.append(max(np.max(y_train_hat_lgb), np.max(y_valid_hat_lgb)))
            else:
                self.lgb_mins.append(min(y_train_hat_lgb))
                self.lgb_maxs.append(max(y_train_hat_lgb))

            ###############################################################################
            # Performance metrics
            ###############################################################################
            y_train_hat = (y_train_hat_lgb - self.lgb_mins[-1]) / (
                self.lgb_maxs[-1] - self.lgb_mins[-1])
            if v_index is not None:
                y_valid_hat = (y_valid_hat_lgb - self.lgb_mins[-1]) / (
                    self.lgb_maxs[-1] - self.lgb_mins[-1])

            self.predictor_train_losses.append(log_loss(y_train, y_train_hat))
            self.predictor_train_aucs.append(roc_auc_score(y_train, y_train_hat))
            if v_index is not None:
                self.predictor_valid_losses.append(log_loss(y_valid, y_valid_hat))
                self.predictor_valid_aucs.append(roc_auc_score(y_valid, y_valid_hat))

            if self.verbose:
                # {0:.3f} (rather than the original {0:03}) prints three decimal places
                print('Train mean log loss: {0:.3f}'.format(
                    np.mean(self.predictor_train_losses)))
                print('Train mean AUC: {0:.3f}'.format(
                    np.mean(self.predictor_train_aucs)))
                if v_index is not None:
                    print('Valid mean log loss: {0:.3f}'.format(
                        np.mean(self.predictor_valid_losses)))
                    print('Valid mean AUC: {0:.3f}'.format(
                        np.mean(self.predictor_valid_aucs)))
def run_hyperopt(self, param_space, X_vars, model_params, fmin_max_evals,
                 algo='tpe', metric='balanced_accuracy_score',
                 trials_obj=None, model_type='indicator'):
    '''Run Bayesian or random-search hyperparameter optimization.'''
    # Build the model object to tune
    if model_type == 'indicator':
        hyperopt_model = lightgbm.LGBMModel(**model_params, importance_type='gain')
    elif model_type == 'regressor':
        hyperopt_model = lightgbm.LGBMRegressor(**model_params, importance_type='gain')

    eval_set = [(self.df_tune[X_vars], self.df_tune[self.target])]
    hyperopt_model.fit(X=self.df_train[X_vars], y=self.df_train[self.target],
                       eval_set=eval_set, verbose=False)
    data = self.df_tune

    def evaluate_metric(params):
        hyperopt_model.set_params(**params, bagging_freq=1).fit(
            X=self.df_train[X_vars],
            y=self.df_train[self.target],
            eval_set=eval_set,
            verbose=False)

        eval_x = data[X_vars]
        y_true = data[self.target]
        y_score = hyperopt_model.predict(eval_x)
        y_pred = [np.argmax(i) for i in y_score]

        if isinstance(metric, str):
            sk_scorer = getattr(metrics, metric, None)
            if sk_scorer is None:
                print(f"Specified metric {metric} does not exist in sklearn")
        score = sk_scorer(y_true=y_true, y_pred=y_pred)
        return {'loss': -score, 'params': params, 'status': STATUS_OK}

    if trials_obj is None:
        self.trials = Trials()
    else:
        self.trials = trials_obj

    if algo == 'tpe':
        algo = tpe.suggest
    elif algo == 'random':
        algo = rand.suggest

    best_params = fmin(evaluate_metric,
                       space=param_space,
                       algo=algo,
                       max_evals=fmin_max_evals,
                       rstate=np.random.RandomState(self.seed),
                       trials=self.trials)
    return best_params, self.trials
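# Hedged usage sketch: hyperopt's hp API is real, but the space bounds and the
# instance/column names below are illustrative assumptions.
from hyperopt import hp

param_space = {
    'num_leaves': hp.quniform('num_leaves', 20, 150, 1),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.005), np.log(0.5)),
}
# best_params, trials = tuner.run_hyperopt(
#     param_space, X_vars=feature_columns,
#     model_params={'objective': 'multiclass', 'num_class': 3},
#     fmin_max_evals=50)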
oof_y_train_test_pred = np.zeros(y_train_test.shape[0])
y_val_preds_sess = np.zeros((x_val.shape[0], num_folds))

for i, (train_index, test_index) in enumerate(sess_folds):
    # Get current fold
    x_train, y_train = x_train_test[train_index], y_train_test[train_index]
    x_test, y_test = x_train_test[test_index], y_train_test[test_index]

    # Create lgb booster
    bst = lgb.LGBMModel(
        objective='regression',
        num_leaves=73,
        learning_rate=0.072,
        n_estimators=10000,
        min_child_samples=155,
        subsample=0.99,
        reg_lambda=0.0,
    )
    bst.fit(
        X=x_train,
        y=y_train,
        eval_set=[(x_test, y_test)],
        eval_metric='rmse',
        early_stopping_rounds=20,
        categorical_feature=cat_col_nums,
        verbose=False,
    )
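# Hedged completion sketch (an assumption, since the snippet ends before the
# two arrays initialized above are filled): inside the loop one would typically do
#     oof_y_train_test_pred[test_index] = bst.predict(x_test)
#     y_val_preds_sess[:, i] = bst.predict(x_val)
# and afterwards average the per-fold validation predictions:
# y_val_pred = y_val_preds_sess.mean(axis=1)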