def fit(self, X, y, indicator):
    """Create a hold-out split, then score fixed parameters or run a search.

    The split is stored on the instance as ``Xe_train``, ``Xe_test``,
    ``ys_train`` and ``ys_test`` so that ``gbc_objective`` can reuse it.

    Args:
        X: Feature matrix.
        y: Labels; flattened with ``ravel()`` before splitting.
        indicator: ``1`` fits LightGBM a single time using
            ``self.param_space`` as a fixed parameter set. Any other value
            runs hyperopt (TPE) over ``self.param_space``.

    Returns:
        The hold-out AUC when ``indicator == 1``. Otherwise a tuple
        ``(params, score)`` where ``params`` is the best point found (also
        stored in ``self.best_params``) and ``score`` is ``1 - min(loss)``
        over all trials.
    """
    split = train_test_split(X, y.ravel(),
                             test_size=self.test_size,
                             random_state=self.seed,
                             shuffle=True)
    self.Xe_train, self.Xe_test, self.ys_train, self.ys_test = split

    if indicator != 1:
        # Search mode: let hyperopt minimise the loss from gbc_objective.
        history = Trials()
        winner = fmin(fn=self.gbc_objective,
                      space=self.param_space,
                      algo=tpe.suggest,
                      trials=history,
                      max_evals=self.max_evaluations)
        chosen = space_eval(self.param_space, winner)
        self.best_params = chosen
        lowest_loss = np.min([entry['loss'] for entry in history.results])
        return chosen, 1 - lowest_loss

    # Single-fit mode: evaluate the fixed parameter set once.
    booster = LGBMClassifier(random_state=self.seed,
                             min_data=1,
                             min_data_in_bin=1)
    booster.set_params(**self.param_space)
    booster.fit(self.Xe_train, self.ys_train)
    positive_scores = booster.predict_proba(self.Xe_test)[:, 1]
    return auc_metric(self.ys_test.reshape(-1, 1),
                      positive_scores.reshape(-1, 1))
def do_generate_metrics_lgbm_optimazed_model(X_train, y_train, X_test, y_test,
                                             grid):
    """Refit LightGBM with the grid search's best parameters and report it.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features used for the metrics and the printed score.
        y_test: Test labels.
        grid: Fitted search object exposing ``best_params_`` and
            ``best_score_``.

    Returns:
        Tuple ``(model, metrics)``: the refitted classifier and whatever
        ``calculate_metrics(model, X_test, y_test)`` returns.
    """
    model = LGBMClassifier(random_state=0)
    model.set_params(**grid.best_params_)
    model.fit(X_train, y_train)
    metrics = calculate_metrics(model, X_test, y_test)
    # score() has to be called: printing the bare attribute would only show
    # the bound-method repr, not the test-set score.
    print(model.get_params(), " ", model.score(X_test, y_test))
    print(grid.best_params_, " ", grid.best_score_)
    return model, metrics
def gbc_objective(self, space):
    """Hyperopt objective: fit one candidate and return its hold-out loss.

    Uses the split stored on the instance (``Xe_train``, ``ys_train``,
    ``Xe_test``, ``ys_test``).

    Args:
        space: Mapping of LightGBM parameters for this trial.

    Returns:
        Dict with ``'loss'`` set to ``1 - AUC`` on the hold-out split and
        ``'status'`` set to ``STATUS_OK``.
    """
    candidate = LGBMClassifier(random_state=self.seed,
                               min_data=1,
                               min_data_in_bin=1)
    candidate.set_params(**space)
    candidate.fit(self.Xe_train, self.ys_train)

    # Column 1 of predict_proba holds the positive-class probability.
    positive_scores = candidate.predict_proba(self.Xe_test)[:, 1]
    truth_column = self.ys_test.reshape(-1, 1)
    score_column = positive_scores.reshape(-1, 1)
    holdout_auc = auc_metric(truth_column, score_column)
    return {'loss': (1 - holdout_auc), 'status': STATUS_OK}
def fit_lgb(self, X_train, y_train, X_val, y_val, X_test, y_test, **param):
    """Fit LightGBM with tuned parameters, save it, and report test metrics.

    The classifier is trained with early stopping against the validation
    set, dumped with joblib to ``lgb_model.pkl``, and evaluated on the test
    set. Feature-importance, precision-recall and ROC plots are shown.

    Args:
        X_train: Training features.
        y_train: Training response.
        X_val: Validation features (early-stopping evaluation set).
        y_val: Validation response.
        X_test: Test features.
        y_test: Test response.
        **param: LightGBM parameters applied on top of the defaults
            ``is_unbalance=True`` and ``reg_lambda=1``.

    Returns:
        F1 score of the predicted labels on the test set.
    """
    classifier = LGBMClassifier(is_unbalance=True, reg_lambda=1)
    classifier.set_params(**param)
    classifier.fit(X_train,
                   y_train,
                   eval_set=[(X_val, y_val)],
                   early_stopping_rounds=150)

    # Predict with the iteration count chosen by early stopping.
    test_labels = classifier.predict(
        X_test, num_iteration=classifier.best_iteration_)
    test_probabilities = classifier.predict_proba(
        X_test, num_iteration=classifier.best_iteration_)
    positive_probs = test_probabilities[:, 1]

    joblib.dump(classifier, 'lgb_model.pkl')

    f1 = sklearn.metrics.f1_score(y_test, test_labels)
    print("lgb turning model - f1_score:{}".format(f1))

    # Feature importance chart.
    importance_axes = lgb.plot_importance(classifier.booster_)
    importance_axes.plot()
    plt.title(
        "Feature Importance for selected features in the final LGB model")
    plt.xlabel('Importance')
    plt.ylabel('Features')
    plt.show()

    # Area under the precision-recall curve.
    precision, recall, _ = precision_recall_curve(y_test, positive_probs)
    pr_auc = auc(recall, precision)
    print('AUC: %.3f' % pr_auc)

    # Precision-recall curve.
    self.plot_pr_curve(y_test, positive_probs)

    # ROC curve.
    sklearn.metrics.plot_roc_curve(classifier, X_test, y_test)
    plt.title("ROC for NN + lightGBM")
    plt.show()

    return f1
def do_generate_metrics_lgbm_optimazed_model(X_train, y_train, X_test, y_test,
                                             grid):
    """Refit LightGBM with the search's best parameters and log the outcome.

    Three entries are written through ``file_operations.write_logs`` to
    ``FILENAME``: a start marker, the refitted model's parameters with its
    test-set score, and the search's best parameters with its best score.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.
        grid: Fitted search object exposing ``best_params_`` and
            ``best_score_``.

    Returns:
        Tuple ``(model, metrics)``: the refitted classifier and the result
        of ``calculate_metrics(model, X_test, y_test)``.
    """
    file_operations.write_logs(FILENAME, "LGBM metrics calculation\n")

    model = LGBMClassifier(random_state=0)
    model.set_params(**grid.best_params_)
    model.fit(X_train, y_train)
    metrics = calculate_metrics(model, X_test, y_test)

    model_report = (
        f"Generated model params and results\n params:{model.get_params()}"
        f"\nscore {model.score(X_test, y_test)}")
    file_operations.write_logs(FILENAME, model_report)

    search_report = (
        f"Search grid best params and results\n params:{grid.best_params_}"
        f"\nscore {grid.best_score_}")
    file_operations.write_logs(FILENAME, search_report)

    return model, metrics
def get_best_hyperparameters(self, train_data, train_labels, validation_ratio,
                             random_state):
    """Pick between the fixed hyperparameters and the best hyperopt trial.

    A stratified train/validation split is stored on the instance so that
    ``self.objective`` evaluates every trial on the same data. The fixed
    hyperparameters are scored once, then hyperopt (TPE) searches
    ``self._search_space`` for ``self._max_evaluations`` trials.

    Args:
        train_data: Feature matrix to split.
        train_labels: Labels; also used for stratification.
        validation_ratio: Fraction of the data held out for validation.
        random_state: Seed for the split.

    Returns:
        ``self._fixed_hyperparameters`` if its validation ROC AUC is
        strictly greater than the best trial's, otherwise the best trial's
        hyperparameters.
    """
    print('\nFile: {} Class: {} Function: {} State: {}'.format(
        'hyperparameters_tuner.py', 'HyperparametersTuner',
        'get_best_hyperparameters', 'Start'))

    (self._train_data, self._validation_data, self._train_labels,
     self._validation_labels) = train_test_split(train_data,
                                                 train_labels,
                                                 test_size=validation_ratio,
                                                 random_state=random_state,
                                                 shuffle=True,
                                                 stratify=train_labels)

    # Baseline: score the fixed hyperparameters on the validation split.
    classifier = LGBMClassifier()
    classifier.set_params(**self._fixed_hyperparameters)
    classifier.fit(self._train_data, self._train_labels)
    predictions = classifier.predict_proba(self._validation_data)[:, 1]
    labels = self._validation_labels
    fixed_hyperparameters_score = roc_auc_score(labels, predictions)
    print('labels.shape: {}'.format(labels.shape))
    print('predictions.shape: {}'.format(predictions.shape))

    # Search: the objective reports 1 - AUC, so the best score is 1 - min.
    trials = Trials()
    best = fmin(fn=self.objective,
                space=self._search_space,
                algo=tpe.suggest,
                trials=trials,
                max_evals=self._max_evaluations)
    best_trial_hyperparameters = space_eval(self._search_space, best)
    best_trial_hyperparameters_score = 1 - np.min(
        [x['loss'] for x in trials.results])

    if fixed_hyperparameters_score > best_trial_hyperparameters_score:
        best_score = fixed_hyperparameters_score
        best_hyperparameters = self._fixed_hyperparameters
    else:
        best_score = best_trial_hyperparameters_score
        best_hyperparameters = best_trial_hyperparameters
    print('best auc score: {}'.format(best_score))

    # Emit the closing trace line before returning so it is always printed.
    print('File: {} Class: {} Function: {} State: {} \n'.format(
        'hyperparameters_tuner.py', 'HyperparametersTuner',
        'get_best_hyperparameters', 'End'))
    return best_hyperparameters
def objective(self, trial_hyperparameters):
    """Hyperopt objective: score one trial's hyperparameters.

    Fits a LightGBM classifier configured with ``trial_hyperparameters`` on
    the training split stored on the instance and evaluates ROC AUC on the
    stored validation split.

    Args:
        trial_hyperparameters: Mapping of LightGBM parameters sampled by
            hyperopt for this trial.

    Returns:
        Dict with ``'loss'`` set to ``1 - ROC AUC`` and ``'status'`` set to
        ``STATUS_OK``.
    """
    print('\nFile: {} Class: {} Function: {} State: {}'.format(
        'hyperparameters_tuner.py', 'HyperparametersTuner', 'objective',
        'Start'))
    print('trial_hyperparameters: {}'.format(trial_hyperparameters))

    classifier = LGBMClassifier()
    # The trial's own parameters must be applied here; fitting the fixed
    # set would make every trial return the same loss.
    classifier.set_params(**trial_hyperparameters)
    classifier.fit(self._train_data, self._train_labels)

    predictions = classifier.predict_proba(self._validation_data)[:, 1]
    labels = self._validation_labels
    trial_score = roc_auc_score(labels, predictions)
    print('labels.shape: {}'.format(labels.shape))
    print('predictions.shape: {}'.format(predictions.shape))

    print('File: {} Class: {} Function: {} State: {} \n'.format(
        'hyperparameters_tuner.py', 'HyperparametersTuner', 'objective',
        'End'))
    return {'loss': (1 - trial_score), 'status': STATUS_OK}
def fit_predict(self, X_train, y_train, X_valid, y_valid, X_test, **kwargs):
    """Fit LightGBM with early stopping and predict on validation and test.

    The fitted estimator is stored in ``self.clf``.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_valid: Validation features.
        y_valid: Validation labels.
        X_test: Test features.
        **kwargs: Extra keyword arguments forwarded to ``fit``.

    Returns:
        Tuple ``(valid_predict, test_predict)`` of ``predict_proba`` outputs
        for the validation and test features.
    """
    estimator = LGBMClassifier()
    if self.params is not None:
        estimator.set_params(**self.params)

    # Both splits are monitored, under the names 'Train' and 'Valid'.
    fit_options = dict(
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_metric=None,
        eval_names=('Train', 'Valid'),
        verbose=100,
        early_stopping_rounds=100,
    )
    self.clf = estimator.fit(X_train, y_train, **fit_options, **kwargs)

    return estimator.predict_proba(X_valid), estimator.predict_proba(X_test)
def modelLGBMClassifier(self, trial: optuna.trial.Trial):
    """Build an LGBMClassifier configured from an Optuna trial.

    Hyperparameters are sampled from ``trial`` and then overridden by any
    entries in ``self.params``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        An unfitted ``LGBMClassifier`` with the merged parameters applied.
    """
    # Sampled one by one, in the same order as the parameter names below.
    sampled = {}
    sampled['num_leaves'] = trial.suggest_int("num_leaves", 2, 2**8)
    sampled['learning_rate'] = trial.suggest_discrete_uniform(
        'learning_rate', 0.001, 1, 0.001)
    sampled['n_estimators'] = trial.suggest_int("n_estimators",
                                                2,
                                                2**10,
                                                log=True)
    sampled['min_child_samples'] = trial.suggest_int('min_child_samples', 2,
                                                     2**8)
    sampled['min_child_weight'] = trial.suggest_loguniform(
        'min_child_weight', 1e-8, 1)
    sampled['min_split_gain'] = trial.suggest_loguniform(
        'min_split_gain', 1e-8, 1)
    sampled['subsample'] = trial.suggest_uniform('subsample', 0.4, 1)
    sampled['subsample_freq'] = trial.suggest_int("subsample_freq", 0, 2**4)
    sampled['colsample_bytree'] = trial.suggest_uniform(
        'colsample_bytree', 0.4, 1)
    sampled['reg_alpha'] = trial.suggest_loguniform('reg_alpha', 1e-8, 10)
    sampled['reg_lambda'] = trial.suggest_loguniform('reg_lambda', 1e-8, 10)

    # Explicit starting configuration for the estimator.
    baseline = dict(
        boosting_type='gbdt',
        num_leaves=31,
        max_depth=-1,
        learning_rate=0.1,
        n_estimators=100,
        subsample_for_bin=200000,
        objective=None,
        class_weight=None,
        min_split_gain=0.,
        min_child_weight=1e-3,
        min_child_samples=20,
        subsample=1.,
        subsample_freq=0,
        colsample_bytree=1.,
        reg_alpha=0.,
        reg_lambda=0.,
        random_state=None,
        n_jobs=-1,
        silent=True,
        importance_type='split',
    )
    clf = LGBMClassifier(**baseline)

    # Entries in self.params take precedence over the sampled values.
    merged = {}
    merged.update(sampled)
    merged.update(self.params)
    clf.set_params(**merged)
    return clf
# {'subsample': [i/10.0 for i in range(6,10)], # 'colsample_bytree':[i/10.0 for i in range(6,10)]}, # {'reg_alpha': [1e-2, 0.1, 1, 2, 5, 10], # 'reg_lambda': [0.01,0.1, 1, 2, 5, 10]}, # {'learning_rate':np.linspace(0.01, 1.0, 50)} ] for params in lt_params: grid = GridSearchCV(estimator=gbm, param_grid=params) grid.fit(X, y) bestParams.update(grid.best_params_) gbm.set_params(**bestParams) print('最优参数:\n', bestParams) print('score=', grid.best_score_) mdl = grid.best_estimator_ y_pred = mdl.predict(X, num_iteration=mdl.best_iteration_) displayClassifierMetrics(y, y_pred, grid.classes_) y_prob = mdl.predict_proba(X, num_iteration=mdl.best_iteration_) displayROCurve(y, y_prob, grid.classes_) # 显示特征重要性 # lightgbm.LGBMClassifier # (boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
# %% class Counter(TransformerMixin): def fit(self, X, y=None): return self def transform(self, X): print(X.shape[1]) return X # %% sel_mod = LGBMClassifier(metric='auc', n_estimators=200, boosting_type=BOOSTING) sel_mod.set_params(**pars) model = make_pipeline( SelectFromModel(sel_mod), Counter(), LGBMClassifier(metric='auc', boosting_type=BOOSTING, n_estimators=2000)) # %% prun = PrunedCV(N_FOLD, 0.02, minimize=False) # %% def objective(trial): joblib.dump(study, 'study_{}.pkl'.format(BOOSTING)) params = { 'selectfrommodel__threshold':
class EveryTime:
    """Accumulate training batches and refit one LightGBM model per predict.

    ``fit`` appends each incoming batch to the stored training set (after
    under-sampling and size capping). ``predict`` tunes hyperparameters the
    first time it is called, refits a single classifier while the time
    budget allows, and returns positive-class probabilities.
    """

    NAME = 'EveryTime'

    def __init__(self, datainfo, timeinfo):
        """Read dataset metadata and set the fixed configuration.

        Args:
            datainfo: Dataset description passed to ``extract``.
            timeinfo: Timing description passed to ``extract``.
        """
        self._info = extract(datainfo, timeinfo)
        print_data_info(self._info)
        print_time_info(self._info)

        self._validation_ratio = 0.25
        self._max_data = 400000
        self._iteration = 0  # incremented by every fit() and predict() call
        self._random_state = 13
        self._max_evaluations = 25
        self._dataset_budget_threshold = 0.8
        self._should_correct = False
        self._correction_threshold = 0.8
        self._correction_n_splits = 8
        self._epsilon = 0.001
        self._ensemble_size = 4
        self._minority_threshold = 10000
        self._large_fraction = 8
        self._small_fraction = 4

        self._categorical_frequency_map = {}
        self._mvc_frequency_map = {}

        self._train_data = []
        self._train_labels = []

        self._best_hyperparameters = None
        self._classifier = None

        self._imbalanced_sampler = OldRandomMajorityUnderSampler(
            self._random_state, self._small_fraction)
        self._too_much_data_sampler = StratifiedRandomSampler(
            self._max_data, self._random_state)
        self._test_sampler = RandomSampler(self._random_state)

        self._profile = Profile.LGBM_ORIGINAL_NAME
        self._is_first = True

    def fit(self, F, y, datainfo, timeinfo):
        """Add a labelled batch to the stored training set.

        Args:
            F: Raw feature container passed to ``get_data``.
            y: Labels for the batch; flattened with ``ravel()``.
            datainfo: Dataset description passed to ``extract``.
            timeinfo: Timing description passed to ``extract``.
        """
        print('\nFile: {} Class: {} Function: {} State: {}'.format(
            'architectures.py', 'OriginalEnsemble', 'fit', 'Start'))

        info = extract(datainfo, timeinfo)
        self._info.update(info)
        print_time_info(self._info)

        data = get_data(F, self._info)
        y = y.ravel()
        print('data.shape: {}'.format(data.shape))
        print('y.shape: {}'.format(y.shape))

        bincount = np.bincount(y.astype(int))
        print('Number of 0 label: {}'.format(bincount[0]))
        print('Number of 1 label: {}'.format(bincount[1]))

        if min(bincount) < self._minority_threshold:
            # Small minority class: switch to the larger sampling fraction
            # and cap the data size relative to the minority count.
            self._imbalanced_sampler = OldRandomMajorityUnderSampler(
                self._random_state, self._large_fraction)
            size = int(min(bincount) * self._large_fraction * 2.5)
            self._too_much_data_sampler = StratifiedRandomSampler(
                size, self._random_state)

        # Rebuild the frequency maps from this batch; the transformed
        # output itself is not needed here.
        self._categorical_frequency_map = {}
        self._mvc_frequency_map = {}
        self._transform(data, DataType.TRAIN)

        self._train_data = np.concatenate(
            (self._train_data, data),
            axis=0) if len(self._train_data) > 0 else data
        self._train_labels = np.concatenate(
            (self._train_labels, y),
            axis=0) if len(self._train_labels) > 0 else y

        self._train_data, self._train_labels = self._imbalanced_sampler.sample(
            self._train_data, self._train_labels)
        self._train_data, self._train_labels = self._too_much_data_sampler.sample(
            self._train_data, self._train_labels)

        print('self._train_data.shape: {}'.format(self._train_data.shape))
        print('self._train_labels.shape: {}'.format(self._train_labels.shape))

        self._iteration += 1
        print('File: {} Class: {} Function: {} State: {} \n'.format(
            'architectures.py', 'OriginalEnsemble', 'fit', 'End'))

    def predict(self, F, datainfo, timeinfo):
        """Refit the classifier if possible and score the given test batch.

        Args:
            F: Raw feature container passed to ``get_data``.
            datainfo: Dataset description passed to ``extract``.
            timeinfo: Timing description passed to ``extract``.

        Returns:
            Positive-class probabilities for the test batch.
        """
        print('\nFile: {} Class: {} Function: {} State: {}'.format(
            'architectures.py', 'OriginalEnsemble', 'predict', 'Start'))

        info = extract(datainfo, timeinfo)
        self._info.update(info)
        print_time_info(self._info)

        test_data = get_data(F, self._info)
        print('test_data.shape: {}'.format(test_data.shape))

        # Test data is transformed first so its values are counted in the
        # frequency maps before the training data is encoded.
        transformed_test_data = self._transform(test_data, DataType.TEST)
        train_data = self._transform(self._train_data, DataType.TRAIN)
        train_labels = self._train_labels
        print('transformed_test_data.shape: {}'.format(
            transformed_test_data.shape))
        print('train_data.shape: {}'.format(train_data.shape))

        size = min(len(train_data), len(transformed_test_data))
        train_weights = correct_covariate_shift(
            train_data, self._test_sampler.sample(transformed_test_data, size),
            self._random_state, self._correction_threshold,
            self._correction_n_splits) if self._should_correct else None

        fixed_hyperparameters, search_space = Profile.parse_profile(
            self._profile)
        if self._best_hyperparameters is None:
            # Hyperparameters are tuned once and reused afterwards.
            tuner = HyperparametersTuner(fixed_hyperparameters, search_space,
                                         self._max_evaluations)
            self._best_hyperparameters = tuner.get_best_hyperparameters(
                train_data, train_labels, self._validation_ratio,
                self._random_state)
            print('self._best_hyperparameters: {}'.format(
                self._best_hyperparameters))

        # Always train when no classifier exists yet; otherwise an exhausted
        # budget on the first call would leave nothing to predict with.
        if has_sufficient_time(self._dataset_budget_threshold,
                               self._info) or self._classifier is None:
            self._classifier = LGBMClassifier()
            self._classifier.set_params(**self._best_hyperparameters)
            self._classifier.fit(train_data,
                                 train_labels,
                                 sample_weight=train_weights)
        else:
            print('Time budget exceeded.')

        predictions = self._classifier.predict_proba(
            transformed_test_data)[:, 1]
        self._iteration += 1

        print('predictions.shape: {}'.format(predictions.shape))
        print('File: {} Class: {} Function: {} State: {} \n'.format(
            'architectures.py', 'OriginalEnsemble', 'predict', 'End'))
        return predictions

    @staticmethod
    def _append_columns(base, extra):
        """Return ``extra`` if ``base`` is empty, else join them column-wise."""
        if len(base) == 0:
            return extra
        return np.concatenate((base, extra), axis=1)

    def _transform(self, data, datatype):
        """Build the model input matrix from raw columns.

        Time, numerical, frequency-encoded categorical and frequency-encoded
        MVC columns are concatenated in that order. The frequency maps are
        updated for test data, and for training data only when
        ``self._iteration`` is even.

        Args:
            data: Raw data matrix accepted by ``split_data_by_type``.
            datatype: ``DataType.TRAIN`` or ``DataType.TEST``.

        Returns:
            The transformed matrix with NaNs replaced via ``np.nan_to_num``.
        """
        transformed_data = np.array([])
        time_data, numerical_data, categorical_data, mvc_data = split_data_by_type(
            data, self._info)
        should_count = (datatype == DataType.TRAIN and
                        self._iteration % 2 == 0) or datatype == DataType.TEST

        if len(time_data) > 0:
            transformed_data = subtract_min_time(time_data)
            transformed_data = np.concatenate(
                (transformed_data, difference_between_time_columns(time_data)),
                axis=1)
            transformed_data = np.concatenate(
                (transformed_data, extract_detailed_time(time_data)), axis=1)

        if len(numerical_data) > 0:
            transformed_data = self._append_columns(transformed_data,
                                                    numerical_data)

        if len(categorical_data) > 0:
            if should_count:
                self._categorical_frequency_map = count_frequency(
                    self._categorical_frequency_map, categorical_data)
            encoded_categorical_data = encode_frequency(
                self._categorical_frequency_map, categorical_data)
            # Goes through the helper so that data with no time or numerical
            # columns does not fail on an empty 1-D base array.
            transformed_data = self._append_columns(transformed_data,
                                                    encoded_categorical_data)

        if len(mvc_data) > 0:
            if should_count:
                self._mvc_frequency_map = count_frequency(
                    self._mvc_frequency_map, mvc_data)
            encoded_mvc_data = encode_frequency(self._mvc_frequency_map,
                                                mvc_data)
            transformed_data = self._append_columns(transformed_data,
                                                    encoded_mvc_data)

        return np.nan_to_num(transformed_data)
def predict(self, F, datainfo, timeinfo):
    """Grow the weighted ensemble if possible and score the test batch.

    While the time budget allows (or when the ensemble is still empty) a new
    LightGBM classifier is trained, every member is re-weighted on a
    validation split, and the worst member is dropped once the ensemble
    exceeds ``self._ensemble_size``.

    Args:
        F: Raw feature container passed to ``get_data``.
        datainfo: Dataset description passed to ``extract``.
        timeinfo: Timing description passed to ``extract``.

    Returns:
        Positive-class probabilities for the test batch: the single member's
        output, or the weight-normalised sum over all members.
    """
    trace = 'File: {} Class: {} Function: {} State: {}'
    print(('\n' + trace).format('architectures.py', 'OriginalEnsemble',
                                'predict', 'Start'))

    self._info.update(extract(datainfo, timeinfo))
    print_time_info(self._info)

    test_data = get_data(F, self._info)
    print('test_data.shape: {}'.format(test_data.shape))

    # Test data first, then training data, as the frequency maps are shared.
    transformed_test_data = self._transform(test_data, DataType.TEST)
    train_data = self._transform(self._train_data, DataType.TRAIN)
    train_labels = self._train_labels
    print('transformed_test_data.shape: {}'.format(
        transformed_test_data.shape))
    print('train_data.shape: {}'.format(train_data.shape))

    if len(transformed_test_data) > len(train_data):
        size = len(train_data)
    else:
        size = len(transformed_test_data)

    if self._should_correct:
        train_weights = correct_covariate_shift(
            train_data, self._test_sampler.sample(transformed_test_data, size),
            self._random_state, self._correction_threshold,
            self._correction_n_splits)
    else:
        train_weights = None

    fixed_hyperparameters, search_space = Profile.parse_profile(self._profile)
    if self._best_hyperparameters is None:
        tuner = HyperparametersTuner(fixed_hyperparameters, search_space,
                                     self._max_evaluations)
        self._best_hyperparameters = tuner.get_best_hyperparameters(
            train_data, train_labels, self._validation_ratio,
            self._random_state)
        print('self._best_hyperparameters: {}'.format(
            self._best_hyperparameters))

    can_train = has_sufficient_time(self._dataset_budget_threshold,
                                    self._info) or len(self._classifiers) == 0
    if not can_train:
        print('Time budget exceeded.')
    else:
        # Only the validation half of the split is used, for weighting.
        _, validation_data, _, validation_labels = train_test_split(
            train_data,
            train_labels,
            test_size=self._validation_ratio,
            random_state=self._random_state,
            shuffle=True,
            stratify=train_labels)

        fresh_member = LGBMClassifier()
        fresh_member.set_params(**self._best_hyperparameters)
        fresh_member.fit(train_data, train_labels, sample_weight=train_weights)
        fresh_weight = compute_weight(
            fresh_member.predict_proba(validation_data)[:, 1],
            validation_labels, self._epsilon)

        # Recompute every existing member's weight on the current split.
        self._ensemble_weights = np.array([])
        for member in self._classifiers:
            member_scores = member.predict_proba(validation_data)[:, 1]
            member_weight = compute_weight(member_scores, validation_labels,
                                           self._epsilon)
            self._ensemble_weights = np.append(self._ensemble_weights,
                                               member_weight)

        self._classifiers = np.append(self._classifiers, fresh_member)
        self._ensemble_weights = np.append(self._ensemble_weights,
                                           fresh_weight)
        print('self._ensemble_weights: {}'.format(self._ensemble_weights))

        if len(self._classifiers) > self._ensemble_size:
            worst = remove_worst_classifier(self._classifiers, validation_data,
                                            validation_labels)
            print('Removed classifier: {}'.format(worst))
            self._classifiers = np.delete(self._classifiers, worst)
            self._ensemble_weights = np.delete(self._ensemble_weights, worst)

    if len(self._classifiers) == 1:
        predictions = self._classifiers[0].predict_proba(
            transformed_test_data)[:, 1]
    else:
        # Weighted average of the members' positive-class probabilities.
        predictions = np.zeros(len(transformed_test_data))
        for position, member in enumerate(self._classifiers):
            member_probs = member.predict_proba(transformed_test_data)[:, 1]
            predictions = np.add(
                predictions, self._ensemble_weights[position] * member_probs)
        predictions = np.divide(predictions, np.sum(self._ensemble_weights))

    self._iteration += 1
    print('predictions.shape: {}'.format(predictions.shape))
    print((trace + ' \n').format('architectures.py', 'OriginalEnsemble',
                                 'predict', 'End'))
    return predictions
class LGBMClassifierCV(object):
    """Cross-validated LightGBM producing out-of-fold (OOF) predictions."""

    def __init__(self, params=None, cv=5, random_state=None, n_repeats=None):
        """Create the estimator and the stratified splitter.

        Args:
            params: Optional LightGBM parameters applied with ``set_params``.
            cv: Number of folds.
            random_state: Seed for the splitter.
            n_repeats: If truthy, use repeated stratified K-fold with this
                many repetitions; otherwise a single shuffled K-fold.
        """
        self.clf = LGBMClassifier()
        if params:
            self.clf.set_params(**params)

        # Keyword arguments are required here: positionally, the second
        # argument of RepeatedStratifiedKFold is n_repeats, not shuffle.
        if n_repeats:
            self._kf = RepeatedStratifiedKFold(n_splits=cv,
                                               n_repeats=n_repeats,
                                               random_state=random_state)
            self._num_preds = cv * n_repeats
        else:
            self._kf = StratifiedKFold(n_splits=cv,
                                       shuffle=True,
                                       random_state=random_state)
            self._num_preds = cv

    def fit(self,
            X,
            y,
            X_test=None,
            feval=roc_auc_score,
            sample_weight=None,
            init_score=None,
            eval_metric='auc',
            early_stopping_rounds=100,
            verbose=100,
            feature_name='auto',
            categorical_feature='auto',
            callbacks=None):
        """Fit one model per fold and collect OOF predictions.

        ``X`` and ``y`` are indexed with integer index arrays, so they are
        expected to be array-like objects supporting that.

        Args:
            X: Training features.
            y: Training labels.
            X_test: Optional test features; the first row of ``X`` is used
                as a placeholder when omitted.
            feval: Metric called as ``feval(y, oof_train)``; skipped if falsy.
            sample_weight: Optional per-row weights aligned with ``X``.
            init_score: Optional per-row initial scores aligned with ``X``.
            eval_metric: Forwarded to ``LGBMClassifier.fit``.
            early_stopping_rounds: Forwarded to ``LGBMClassifier.fit``.
            verbose: Forwarded to ``LGBMClassifier.fit``; also enables the
                per-fold start message.
            feature_name: Forwarded to ``LGBMClassifier.fit``.
            categorical_feature: Forwarded to ``LGBMClassifier.fit``.
            callbacks: Forwarded to ``LGBMClassifier.fit``.

        Returns:
            The OOF training score when ``feval`` is given, else ``None``.
        """
        if X_test is None:
            X_test = X[:1]  # use the first row as a stand-in test set

        self.oof_train = np.zeros(len(X))
        # One column of test predictions per fitted fold.
        self.oof_test = np.zeros((len(X_test), self._num_preds))

        for n_fold, (train_index,
                     valid_index) in enumerate(self._kf.split(X, y)):
            if verbose:
                print("\033[94mFold %s started at %s\033[0m" %
                      (n_fold + 1, time.ctime()))
            X_train, y_train = X[train_index], y[train_index]
            X_valid, y_valid = X[valid_index], y[valid_index]
            # Both splits are evaluated during training.
            eval_set = [(X_train, y_train), (X_valid, y_valid)]

            # Per-row inputs must be restricted to this fold's training rows
            # so their length matches X_train.
            fold_weight = None if sample_weight is None else np.asarray(
                sample_weight)[train_index]
            fold_init_score = None if init_score is None else np.asarray(
                init_score)[train_index]

            self.clf.fit(X_train,
                         y_train,
                         sample_weight=fold_weight,
                         init_score=fold_init_score,
                         eval_set=eval_set,
                         eval_names=('Train', 'Valid'),
                         eval_sample_weight=None,
                         eval_class_weight=None,
                         eval_init_score=None,
                         eval_metric=eval_metric,
                         early_stopping_rounds=early_stopping_rounds,
                         verbose=verbose,
                         feature_name=feature_name,
                         categorical_feature=categorical_feature,
                         callbacks=callbacks)

            self.oof_train[valid_index] = self.clf.predict_proba(X_valid)[:, 1]
            self.oof_test[:, n_fold] = self.clf.predict_proba(X_test)[:, 1]

        # Test-set outputs: mean per-fold rank (scaled by row count) and the
        # plain mean of the per-fold probabilities.
        self.oof_test_rank = (pd.DataFrame(self.oof_test).rank().mean(axis=1) /
                              len(self.oof_test)).values
        self.oof_test = self.oof_test.mean(axis=1)

        assert len(X) == len(self.oof_train)
        assert len(X_test) == len(self.oof_test)

        # Score the training-set OOF predictions.
        if feval:
            self.oof_train_score = feval(y, self.oof_train)
            print(
                f"\n\033[94mtrain CV Score: {self.oof_train_score} ended at {time.ctime()}\033[0m"
            )
            return self.oof_train_score

    def oof_submit(self, ids, pred_ranking=False, file=None, preds=None):
        """Write ``ids`` and predictions to a header-less CSV file.

        Args:
            ids: Identifiers; wrapped in a DataFrame if not already one.
            pred_ranking: Selects which stored test prediction is written
                when ``preds`` is not given (see the note in the body).
            file: Output path; defaults to ``submit_<oof_train_score>.csv``.
            preds: Explicit predictions to write instead of the stored ones.
        """
        if file is None:
            file = f'submit_{self.oof_train_score}.csv'
        print(f'Save {file} ...')

        if preds is None:
            # NOTE(review): pred_ranking=True selects the mean probability
            # and False selects the rank-based score, which looks inverted
            # relative to the flag name. Left unchanged; confirm intent.
            preds = self.oof_test if pred_ranking else self.oof_test_rank

        if not isinstance(ids, pd.DataFrame):
            ids = pd.DataFrame(ids)
        ids.assign(preds=preds).to_csv(file, index=False, header=False)

    @property
    def oof_train_and_test(self):
        """Training OOF predictions followed by the test predictions."""
        return np.r_[self.oof_train, self.oof_test]

    def oof_save(self, file='./oof_train_and_test.csv'):
        """Save ``oof_train_and_test`` as a single-column CSV file."""
        pd.DataFrame(self.oof_train_and_test,
                     columns=['oof_train_and_test']).to_csv(file, index=False)

    def plot_feature_importances(self,
                                 feature_names=None,
                                 topk=20,
                                 figsize=(10, 6),
                                 pic_name=None):
        """Plot and save the top-k feature importances of the last fold's model.

        Args:
            feature_names: Names aligned with the importances; defaults to
                ``F_0``, ``F_1``, ...
            topk: Number of most important features to plot.
            figsize: Matplotlib figure size.
            pic_name: Output image path; defaults to
                ``importances_<oof_train_score>.png``.
        """
        columns = ['Importances', 'Features']
        importances = self.clf.feature_importances_.tolist()
        if feature_names is None:
            feature_names = [f'F_{i}' for i in range(len(importances))]

        df = pd.DataFrame(list(zip(importances, feature_names)),
                          columns=columns).sort_values('Importances',
                                                       axis=0,
                                                       ascending=False)

        plt.figure(figsize=figsize)
        sns.barplot(x='Importances', y='Features', data=df[:topk])
        plt.title('Features Importances\n')
        plt.tight_layout()

        # Honour an explicit file name instead of silently saving nothing.
        if pic_name is None:
            pic_name = f'importances_{self.oof_train_score}.png'
        plt.savefig(pic_name)
# Parameters of the model # https://lightgbm.readthedocs.io/en/latest/Parameters.html params = { 'objective': 'binary', 'learning_rate': 0.005, 'num_leaves': 3, 'min_data_in_leaf': 10, 'colsample_bytree': 1, 'max_bin': 10, 'random_seed': RS } # We will define the model for various C's and make a search of the optimum model_lightgbm_L1 = LGBMClassifier() model_lightgbm_L1.set_params(**params) iter = [500, 1000, 1500] print('\nLightGBM Level 1 CV...') print('########################################################') scores = [] for nrounds in iter: model_lightgbm_L1.set_params(n_estimators=nrounds) print('\nn rounds: ', nrounds) s = Model_cv(model_lightgbm_L1, n_folds, X1_train, X1_test, Y_train, RS, makepred=False)
class Model: def __init__(self, data_info, time_info): # Print data information info_dict = extract(data_info, time_info) print_data_info(info_dict) # # Install hyperopt and lightgbm # pip_install('hyperopt') # pip_install('lightgbm') print('Using algo: {}'.format(params['algo'])) # Settings if params['algo'] == Algo.ORIGINAL: self._dataset_budget_threshold = 0.8 self._max_train_data = 200000 self.batch_size = 50000 self.delta_n_estimators = 100 self.delta_num_leaves = 20 self.delta_learning_rate = 0.005 self.delta_max_depth = 1 self.delta_feature_fraction = 0.1 self.delta_bagging_fraction = 0.1 self.delta_bagging_freq = 1 self.max_evaluation = 30 self.param_choice_fixed = { 'n_estimators': 400, 'learning_rate': 0.01, 'num_leaves': 50, 'feature_fraction': 0.6, 'bagging_fraction': 0.6, 'bagging_freq': 2, 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc' } elif params['algo'] == Algo.FACEBOOK_LR: self._dataset_budget_threshold = 0.8 self._max_train_data = 100000 self.batch_size = 25000 self.delta_n_estimators = 50 self.delta_num_leaves = 10 self.delta_learning_rate = 0.005 self.delta_max_depth = 1 self.delta_feature_fraction = 0.1 self.delta_bagging_fraction = 0.1 self.delta_bagging_freq = 1 self.max_evaluation = 30 self.param_choice_fixed = { 'n_estimators': 75, 'learning_rate': 0.01, 'num_leaves': 15, 'feature_fraction': 0.6, 'bagging_fraction': 0.6, 'bagging_freq': 2, 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc' } elif params['algo'] == Algo.BASIC: self._dataset_budget_threshold = 0.8 self._max_train_data = 100000 self.batch_size = 25000 self.delta_n_estimators = 50 self.delta_num_leaves = 10 self.delta_learning_rate = 0.005 self.delta_max_depth = 1 self.delta_feature_fraction = 0.1 self.delta_bagging_fraction = 0.1 self.delta_bagging_freq = 1 self.max_evaluation = 30 self.param_choice_fixed = { 'n_estimators': 75, 'learning_rate': 0.01, 'num_leaves': 15, 'feature_fraction': 0.6, 'bagging_fraction': 0.6, 'bagging_freq': 2, 
'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc' } self._train_data = np.array([]) self._train_labels = np.array([]) self._transformed_train_data = np.array([]) self.best_hyperparams = {} self._classifier = None self._classifier2 = None self._data_processor = DataProcessor(info_dict) self._sampler = Sampler() self.mdl = StreamSaveRetrainPredictor() def fit(self, F, y, data_info, time_info): ''' This function trains the model parameters. Args: X: Training data matrix of dim num_train_samples * num_feat. y: Training label matrix of dim num_train_samples * num_labels. Both inputs are numpy arrays. If fit is called multiple times on incremental data (train, test1, test2, etc.) you should warm-start your training from the pre-trained model. Past data will NOT be available for re-training. ''' info_dict = extract(data_info, time_info) print_time_info(info_dict) if params['algo'] == Algo.OLD_CODE: return self.mdl.partial_fit(F, y, data_info, time_info) elif params['algo'] == Algo.ORIGINAL: return self._original_fit(F, y, info_dict) elif params['algo'] == Algo.FACEBOOK_LR: return self._facebook_lr_fit(F, y, info_dict) elif params['algo'] == Algo.BASIC: return self._basic_fit(F, y, info_dict) def predict(self, F, data_info, time_info): ''' This function should provide predictions of labels on (test) data. Make sure that the predicted values are in the correct format for the scoring metric. For example, binary classification problems often expect predictions in the form of a discriminant value (if the area under the ROC curve it the metric) rather that predictions of the class labels themselves. The function predict eventually returns probabilities or continuous values. 
''' info_dict = extract(data_info, time_info) print_time_info(info_dict) if params['algo'] == Algo.OLD_CODE: return self.mdl.predict(F, data_info, time_info) elif params['algo'] == Algo.ORIGINAL: return self._original_predict(F, info_dict) elif params['algo'] == Algo.FACEBOOK_LR: return self._facebook_lr_predict(F, info_dict) elif params['algo'] == Algo.BASIC: return self._basic_predict(F, info_dict) def save(self, path="./"): pickle.dump(self, open(path + '_model.pickle', "w")) def load(self, path="./"): modelfile = path + '_model.pickle' if isfile(modelfile): with open(modelfile) as f: self = pickle.load(f) return self def _original_fit(self, F, y, info_dict): data = self._convert_nan_to_num(F, info_dict) if self._data_processor.is_uninitialized: self._data_processor.preprocess(data) sampled_data, sampled_labels = self._sampler.majority_undersampling( data, y) if len(self._train_data) == 0 and len(self._train_labels) == 0: self._train_data = sampled_data self._train_labels = sampled_labels else: self._train_data = np.concatenate((self._train_data, sampled_data), axis=0) self._train_labels = np.concatenate( (self._train_labels, sampled_labels), axis=0) def _original_predict(self, F, info_dict): data = self._convert_nan_to_num(F, info_dict) if self._has_sufficient_time(info_dict) or self._classifier is None: self._data_processor.preprocess(data) self._data_processor.prepare_frequency_map() current_train_data = self._train_data current_train_labels = self._train_labels print('self._train_data.shape: {}'.format(self._train_data.shape)) print('self._train_labels.shape: {}'.format( self._train_labels.shape)) print('self._train_data.size: {}'.format(self._train_data.size)) print('len(self._train_data): {}'.format(len(self._train_data))) print('self._max_train_data: {}'.format(self._max_train_data)) if self._too_much_training_data(): remove_percentage = 1.0 - (float(self._max_train_data) / len(self._train_data)) print('remove_percentage: {}'.format(remove_percentage)) 
current_train_data, current_train_labels = self._sampler.random_sample_in_order(self._train_data, \ self._train_labels.reshape(-1,1), \ remove_percentage) print('current_train_data.shape: {}'.format( current_train_data.shape)) print('current_train_labels: {}'.format( current_train_labels.shape)) self._train_data, self._train_labels = current_train_data, current_train_labels.reshape( (-1, )) print('new self._train_data.shape: {}'.format( self._train_data.shape)) print('new self._train_labels.shape: {}'.format( self._train_labels.shape)) self._transformed_train_data = self._data_processor.transform_data( current_train_data) self._transformed_train_labels = current_train_labels if not self.best_hyperparams: self._find_best_hyperparameters() self._classifier = LGBMClassifier(random_state=20, min_data=1, min_data_in_bin=1) self._classifier.set_params(**self.best_hyperparams) self._classifier.fit(self._transformed_train_data, self._transformed_train_labels.ravel()) if data.shape[ 0] <= self.batch_size: ### if it is relatively small array probs = self._classifier.predict_proba( self._data_processor.transform_data(data))[:, 1] return probs else: print('BATCH') print('data.shape: {}'.format(data.shape)) results = np.array( []) ## for chunking results to handle memory limit for i in range(0, data.shape[0], self.batch_size): Xsplit = data[i:(i + self.batch_size), :] results = np.append( results, self._classifier.predict_proba( self._data_processor.transform_data(Xsplit))[:, 1]) del Xsplit print('results.shape: {}'.format(results.shape)) return results return [] def _facebook_lr_fit(self, F, y, info_dict): data = self._convert_nan_to_num(F, info_dict) if self._data_processor.is_uninitialized: self._data_processor.preprocess(data) sampled_data, sampled_labels = self._sampler.majority_undersampling( data, y) if len(self._train_data) == 0 and len(self._train_labels) == 0: self._train_data = sampled_data self._train_labels = sampled_labels else: self._train_data = 
np.concatenate((self._train_data, sampled_data), axis=0) self._train_labels = np.concatenate( (self._train_labels, sampled_labels), axis=0) def _facebook_lr_predict(self, F, info_dict): data = self._convert_nan_to_num(F, info_dict) if self._has_sufficient_time( info_dict ) or self._classifier is None or self._classifier2 is None: self._data_processor.preprocess(data) self._data_processor.prepare_frequency_map() current_train_data = self._train_data current_train_labels = self._train_labels print('self._train_data.shape: {}'.format(self._train_data.shape)) print('self._train_labels.shape: {}'.format( self._train_labels.shape)) print('self._train_data.size: {}'.format(self._train_data.size)) print('len(self._train_data): {}'.format(len(self._train_data))) print('self._max_train_data: {}'.format(self._max_train_data)) if self._too_much_training_data(): remove_percentage = 1.0 - (float(self._max_train_data) / len(self._train_data)) print('remove_percentage: {}'.format(remove_percentage)) current_train_data, current_train_labels = self._sampler.random_sample_in_order(self._train_data, \ self._train_labels.reshape(-1,1), \ remove_percentage) print('current_train_data.shape: {}'.format( current_train_data.shape)) print('current_train_labels: {}'.format( current_train_labels.shape)) self._train_data, self._train_labels = current_train_data, current_train_labels.reshape( (-1, )) print('new self._train_data.shape: {}'.format( self._train_data.shape)) print('new self._train_labels.shape: {}'.format( self._train_labels.shape)) self._transformed_train_data = self._data_processor.transform_data( current_train_data) self._transformed_train_labels = current_train_labels if not self.best_hyperparams: self._find_best_hyperparameters() self._classifier = LGBMClassifier(random_state=20, min_data=1, min_data_in_bin=1) self._classifier.set_params(**self.best_hyperparams) self._classifier.fit(self._transformed_train_data, self._transformed_train_labels.ravel()) probs = 
self._classifier.predict(self._transformed_train_data, pred_leaf=True) new_probs = onehot_sparse(probs) self._classifier2 = LogisticRegression() self._classifier2.fit(new_probs, self._transformed_train_labels.ravel()) del probs del new_probs if data.shape[ 0] <= self.batch_size: ### if it is relatively small array probs = probs = self._classifier.predict( self._data_processor.transform_data(data), pred_leaf=True) new_probs = onehot_sparse(probs) actual_probs = self._classifier2.predict_proba(new_probs)[:, 1] return actual_probs else: print('BATCH') print('data.shape: {}'.format(data.shape)) results = np.array( []) ## for chunking results to handle memory limit for i in range(0, data.shape[0], self.batch_size): Xsplit = data[i:(i + self.batch_size), :] probs = probs = self._classifier.predict( self._data_processor.transform_data(Xsplit), pred_leaf=True) new_probs = onehot_sparse(probs) actual_probs = self._classifier2.predict_proba(new_probs)[:, 1] results = np.append(results, actual_probs) del Xsplit del probs del new_probs del actual_probs print('results.shape: {}'.format(results.shape)) return results return [] def _basic_fit(self, F, y, info_dict): data = self._convert_nan_to_num(F, info_dict) y = y.reshape((-1, )) # if self._data_processor.is_uninitialized: # self._data_processor.preprocess(data) print('data.shape: {}'.format(data.shape)) print('y.shape: {}'.format(y.shape)) if self._has_sufficient_time(info_dict) or self._classifier is None: self._classifier = LGBMClassifier(random_state=20, min_data=1, min_data_in_bin=1) self._classifier.set_params(**self.param_choice_fixed) transformed_data = self._data_processor.simple_transform_data(data) self._classifier.fit(transformed_data, y) def _basic_predict(self, F, info_dict): data = self._convert_nan_to_num(F, info_dict) transformed_data = self._data_processor.simple_transform_data(data) probs = self._classifier.predict_proba(transformed_data)[:, 1] return probs def _convert_nan_to_num(self, F, info_dict): # 
Convert time and numerical nan data = F['numerical'] data = np.nan_to_num(data) # Convert categorical nan if info_dict['no_of_categorical_features'] > 0: categorical_data = F['CAT'].fillna('nan').values data = np.concatenate((data, categorical_data), axis=1) del categorical_data # Convert mvc nan if info_dict['no_of_mvc_features'] > 0: mvc_data = F['MV'].fillna('nan').values data = np.concatenate((data, mvc_data), axis=1) del mvc_data return data def _has_sufficient_time(self, info_dict): return info_dict['dataset_time_spent'] < info_dict[ 'time_budget'] * self._dataset_budget_threshold def _too_much_training_data(self): return self._train_data.shape[0] > self._max_train_data def _find_best_hyperparameters(self): param_choice_fixed = self.param_choice_fixed autohyper = HyperparametersTuner(parameter_space=param_choice_fixed) best_score_choice1 = autohyper.fit( self._transformed_train_data, self._transformed_train_labels.ravel(), 1) #Get the AUC for the fixed hyperparameter+Hyperopt combination on the internal validation set #Step:1-Define the search space for Hyperopt to be a small delta region over the initial set of fixed hyperparameters n_estimators_low = param_choice_fixed[ 'n_estimators'] - self.delta_n_estimators n_estimators_high = param_choice_fixed[ 'n_estimators'] + self.delta_n_estimators learning_rate_low = np.log(0.001) if ( param_choice_fixed['learning_rate'] - self.delta_learning_rate ) < 0.001 else np.log(param_choice_fixed['learning_rate'] - self.delta_learning_rate) learning_rate_high = np.log(param_choice_fixed['learning_rate'] + self.delta_learning_rate) num_leaves_low = param_choice_fixed[ 'num_leaves'] - self.delta_num_leaves num_leaves_high = param_choice_fixed[ 'num_leaves'] + self.delta_num_leaves feature_fraction_low = np.log(0.05) if ( param_choice_fixed['feature_fraction'] - self.delta_feature_fraction) < 0.05 else np.log( param_choice_fixed['feature_fraction'] - self.delta_feature_fraction) feature_fraction_high = np.log(1.0) if ( 
param_choice_fixed['feature_fraction'] + self.delta_feature_fraction) > 1.0 else np.log( param_choice_fixed['feature_fraction'] + self.delta_feature_fraction) bagging_fraction_low = np.log(0.05) if ( param_choice_fixed['bagging_fraction'] - self.delta_bagging_fraction) < 0.05 else np.log( param_choice_fixed['bagging_fraction'] - self.delta_bagging_fraction) bagging_fraction_high = np.log(1.0) if ( param_choice_fixed['bagging_fraction'] + self.delta_bagging_fraction) > 1.0 else np.log( param_choice_fixed['bagging_fraction'] + self.delta_bagging_fraction) bagging_freq_low = 1 if ( param_choice_fixed['bagging_freq'] - self.delta_bagging_freq ) < 1 else param_choice_fixed['bagging_freq'] - self.delta_bagging_freq bagging_freq_high = param_choice_fixed[ 'bagging_freq'] + self.delta_bagging_freq boosting_type = param_choice_fixed['boosting_type'] objective = param_choice_fixed['objective'] metric = param_choice_fixed['metric'] #set the search space to be explored by Hyperopt param_space_forFixed = { 'objective': "binary", 'n_estimators': hp.choice( 'n_estimators', np.arange(n_estimators_low, n_estimators_high, 50, dtype=int)), 'num_leaves': hp.choice('num_leaves', np.arange(num_leaves_low, num_leaves_high, 5, dtype=int)), 'feature_fraction': hp.loguniform('feature_fraction', feature_fraction_low, feature_fraction_high), 'bagging_fraction': hp.loguniform('bagging_fraction', bagging_fraction_low, bagging_fraction_high), 'bagging_freq': hp.choice( 'bagging_freq', np.arange(bagging_freq_low, bagging_freq_high + 1, 1, dtype=int)), 'learning_rate': hp.loguniform('learning_rate', learning_rate_low, learning_rate_high), 'boosting_type': boosting_type, 'metric': metric, 'verbose': -1 } #run Hyperopt to search nearby region in the hope to obtain a better combination of hyper-parameters autohyper = HyperparametersTuner(max_evaluations=self.max_evaluation, parameter_space=param_space_forFixed) best_hyperparams_choice2, best_score_choice2 = autohyper.fit( 
self._transformed_train_data, self._transformed_train_labels.ravel(), 0) #Compare choice-1 & choice-2 and take the better one if best_score_choice1 >= best_score_choice2: self.best_hyperparams = param_choice_fixed else: self.best_hyperparams = best_hyperparams_choice2 print('\nBest Hyperparams: {}\n'.format(self.best_hyperparams))
class LGBMClassifierCV(object):
    """Cross-validated LGBMClassifier producing out-of-fold predictions.

    Works like ``cross_val_predict``: every fold is fitted on its training
    part, the held-out part fills ``oof_train`` and each fold also predicts
    ``X_test``; the per-fold test predictions are averaged after the last
    fold.
    """

    def __init__(self, params=None, cv=5, cv_seed=None, n_repeats=None):
        """Create the classifier and the fold splitter.

        Args:
            params: optional dict forwarded to ``LGBMClassifier.set_params``.
            cv: number of folds.
            cv_seed: random state of the splitter.
            n_repeats: when truthy, use repeated stratified K-fold with this
                many repetitions; otherwise plain shuffled stratified K-fold.
        """
        self.clf = LGBMClassifier()
        self.cv = cv
        if params:
            self.clf.set_params(**params)

        if n_repeats:
            # RepeatedStratifiedKFold has no ``shuffle`` argument and must be
            # told the repeat count, otherwise it yields a different number
            # of folds than ``_num_preds`` columns are allocated for.
            self._kf = RepeatedStratifiedKFold(n_splits=cv,
                                               n_repeats=n_repeats,
                                               random_state=cv_seed)
            self._num_preds = cv * n_repeats
        else:
            self._kf = StratifiedKFold(cv, shuffle=True, random_state=cv_seed)
            self._num_preds = cv

    def fit(self,
            X,
            y,
            X_test=None,
            feval=roc_auc_score,
            fix_valid_index=None,
            sample_weight=None,
            init_score=None,
            eval_metric='auc',
            early_stopping_rounds=300,
            verbose=100,
            feature_name='auto',
            categorical_feature='auto',
            callbacks=None):
        """Fit one model per fold and collect out-of-fold predictions.

        Args:
            X: feature array, indexed with integer index arrays.
            y: label array, indexed the same way.
            X_test: optional test features; defaults to the first row of
                ``X`` so that the test-side bookkeeping still runs.
            feval: ``feval(y, oof_train)`` used for the overall score; pass
                ``None`` to skip scoring.
            fix_valid_index: by default each fold validates on its held-out
                rows; when given (indices into ``X``), validation is
                restricted to the held-out rows that are also listed here.
            sample_weight: optional per-row weights aligned with ``X``; the
                training rows of each fold are selected from it.
            init_score: optional per-row init scores aligned with ``X``;
                sliced per fold like ``sample_weight``.
            eval_metric, early_stopping_rounds, verbose, feature_name,
            categorical_feature, callbacks: forwarded to
                ``LGBMClassifier.fit``.

        Returns:
            ``self.oof_score``: the value of ``feval`` on the out-of-fold
            predictions, or ``None`` when ``feval`` is ``None``.
        """
        self.best_info = {}
        self.feature_importances = 0
        self.oof_score = None

        if X_test is None:
            X_test = X[:1]

        self.oof_train = np.zeros(len(X))
        self.oof_test = np.zeros((len(X_test), self._num_preds))

        for n_fold, (train_index,
                     valid_index) in enumerate(self._kf.split(X, y)):
            if verbose:
                print("\033[94mFold %s started at %s\033[0m" %
                      (n_fold + 1, time.ctime()))

            # Restrict the validation / early-stopping rows to the requested
            # indices of the original X.
            if fix_valid_index is not None:
                valid_index = list(set(fix_valid_index) & set(valid_index))

            X_train, y_train = X[train_index], y[train_index]
            X_valid, y_valid = X[valid_index], y[valid_index]
            eval_set = [(X_train, y_train), (X_valid, y_valid)]

            # Per-row inputs must be cut down to the rows of this fold.
            fold_weight = (None if sample_weight is None else
                           np.asarray(sample_weight)[train_index])
            fold_init_score = (None if init_score is None else
                               np.asarray(init_score)[train_index])

            self.clf.fit(X_train,
                         y_train,
                         sample_weight=fold_weight,
                         init_score=fold_init_score,
                         eval_set=eval_set,
                         eval_names=('Train', 'Valid'),
                         eval_sample_weight=None,
                         eval_class_weight=None,
                         eval_init_score=None,
                         eval_metric=eval_metric,
                         early_stopping_rounds=early_stopping_rounds,
                         verbose=verbose,
                         feature_name=feature_name,
                         categorical_feature=categorical_feature,
                         callbacks=callbacks)

            self.oof_train[valid_index] = self.clf.predict_proba(X_valid)[:, 1]
            self.oof_test[:, n_fold] = self.clf.predict_proba(X_test)[:, 1]

            # Best-iteration bookkeeping.
            # TODO: support multi-class and metrics other than 'auc'.
            self.best_info.setdefault('best_iteration',
                                      []).append(self.clf.best_iteration_)
            self.best_info.setdefault('best_score_train', []).append(
                self.clf.best_score_['Train']['auc'])
            self.best_info.setdefault('best_score_valid', []).append(
                self.clf.best_score_['Valid']['auc'])

            # Average importances over every fitted fold (repeats included).
            self.feature_importances += (self.clf.feature_importances_ /
                                         self._num_preds)

        # Aggregate the per-fold test predictions.
        self.oof_test_rank = (pd.DataFrame(self.oof_test).rank().mean(1) /
                              len(self.oof_test)).values
        self.oof_test = self.oof_test.mean(1)

        assert len(X) == len(self.oof_train)
        assert len(X_test) == len(self.oof_test)

        # Score the out-of-fold train predictions. The score is computed
        # regardless of verbosity so callers can always read ``oof_score``;
        # only the report is conditional.
        if feval is not None:
            self.oof_score = feval(y, self.oof_train)
            if verbose and verbose > 0:
                print("\n\033[94mScore Info:\033[0m")
                print(
                    f"\033[94m     {self.cv:>2} CV: {self.oof_score:.6f}\033[0m"
                )
                iterations = np.array(self.best_info['best_iteration'])
                print(
                    f"\033[94m      Iter: {iterations.mean():.0f} +/- {iterations.std():.0f}\033[0m"
                )
                valid_scores = np.array(self.best_info['best_score_valid'])
                print(
                    f"\033[94m     Valid: {valid_scores.mean():.6f} +/- {valid_scores.std():.6f} \033[0m\n"
                )

        return self.oof_score

    def oof_submit(self, ids, pred_ranking=False, file=None, preds=None):
        """Write ``ids`` plus a prediction column to a header-less CSV.

        Args:
            ids: identifiers; wrapped in a DataFrame when not one already.
            pred_ranking: use ``oof_test_rank`` instead of ``oof_test``.
            file: output path; defaults to a name built from ``cv`` and
                ``oof_score``.
            preds: custom predictions to write instead of the stored ones.
        """
        if file is None:
            file = f'submit_cv{self.cv}_{self.oof_score}.csv'
        print(f'Save {file} ...')

        if preds is None:
            preds = self.oof_test_rank if pred_ranking else self.oof_test

        if not isinstance(ids, pd.DataFrame):
            ids = pd.DataFrame(ids)
        ids.assign(preds=preds).to_csv(file, index=False, header=False)

    @property
    def oof_train_and_test(self):
        """Out-of-fold train predictions followed by the test predictions."""
        return np.r_[self.oof_train, self.oof_test]

    def oof_save(self, file='./oof_train_and_test.csv'):
        """Save ``oof_train_and_test`` as a one-column CSV."""
        pd.DataFrame(self.oof_train_and_test,
                     columns=['oof_train_and_test']).to_csv(file, index=False)

    def plot_feature_importances(self,
                                 feature_names=None,
                                 topk=20,
                                 figsize=None,
                                 pic_name=None):
        """Plot the ``topk`` averaged feature importances and save the figure.

        Args:
            feature_names: names aligned with the importances; defaults to
                ``F_0``, ``F_1``, ...
            topk: number of bars to draw.
            figsize: figure size; defaults to ``(14, topk // 5)`` with a
                minimum height of 1.
            pic_name: output file; defaults to a name built from
                ``oof_score``.
        """
        columns = ['Importances', 'Features']
        importances = self.feature_importances.tolist()
        if feature_names is None:
            feature_names = [f'F_{i}' for i in range(len(importances))]

        ranked = sorted(zip(importances, feature_names), reverse=True)
        self.df_feature_importances = pd.DataFrame(ranked, columns=columns)

        if figsize is None:
            # A zero-height figure is invalid, so small topk gets height 1.
            figsize = (14, max(topk // 5, 1))
        plt.figure(figsize=figsize)
        sns.barplot(x=columns[0],
                    y=columns[1],
                    data=self.df_feature_importances[:topk])
        plt.title('Features Importances\n')
        plt.tight_layout()

        if pic_name is None:
            pic_name = f'importances_{self.oof_score}.png'
        plt.savefig(pic_name)

    @classmethod
    def opt_cv(cls,
               X,
               y,
               X_test=None,
               cv_list=range(3, 16),
               params=None,
               cv_seed=777,
               topk=5):
        """Fit one instance per fold count and return the ``topk`` best.

        Returns:
            List of ``(oof_score, cv, fitted_instance)`` tuples ordered by
            descending score, then descending ``cv``.
        """
        oofs = []
        for cv in tqdm(cv_list, desc='opt cv'):
            oof = cls(params, cv, cv_seed=cv_seed)
            oof.fit(X, y, X_test, verbose=0)
            oofs.append((oof.oof_score, cv, oof))

        # Sort on (score, cv) only so ties never compare the model objects.
        ranked = sorted(oofs, key=lambda item: item[:2], reverse=True)
        return ranked[:topk]
def _run_lgbm_deployment_scripts(rp_dir, acct_id, root_dir):
    """Run the PMML-creation and test-file-generation scripts for an account."""
    import subprocess

    for script in ('/PMML_creation.py', '/generate_Test_Files.py'):
        # Argument list instead of a shell string: values containing spaces
        # or shell metacharacters are passed through literally.
        subprocess.run(['python', rp_dir + script,
                        str(acct_id),
                        str(root_dir)],
                       check=False)


def _train_and_report_lgbm(model, X_train, y_train, predictions_path,
                           test_path, model_path, acct_id, summary, dock_path,
                           rp_dir, root_dir, scores_label, accuracy_label):
    """Fit ``model``, report 5-fold CV scores, save the summary and offer the follow-up steps."""
    clf = model.fit(X_train, y_train)
    scores = cross_val_score(clf, X_train, y_train, cv=5)
    st.write(scores_label + str(scores))
    st.write(accuracy_label + str(scores.mean() * 100))

    summary['cross-val scores'] = str(scores)
    mod_spec = str(clf).split('(')[0] + ": " + str(clf.get_params())
    summary['model specs'] = str(mod_spec)
    save_summary(summary, dock_path)
    st.success('Model Training Completed!')

    if st.checkbox(label='See predictions'):
        make_predictions(predictions_path, test_path, model_path, clf,
                         summary, dock_path)
        if st.button("Generate Test Files"):
            with st.spinner("Execution in Progress"):
                _run_lgbm_deployment_scripts(rp_dir, acct_id, root_dir)
            st.success("Test Files Generated")
            st.success("Account ready for Deployment")


def lgbm(df, predictions_path, test_path, model_path, acct_id, summary,
         dock_path, rp_dir, root_dir):
    """Streamlit page for configuring, training and deploying a LightGBM classifier.

    The training split comes from ``train_model(df)``. The user either fills
    in individual parameter widgets ('Basic Parameters') or types a parameter
    dictionary ('Enter Manually'); an empty dictionary field starts
    ``hyper_tune`` instead. After training, 5-fold cross-validation scores
    and the model spec are written into ``summary`` and saved with
    ``save_summary``; predictions and deployment files can then be produced.

    Args:
        df: input data passed to ``train_model`` / ``hyper_tune``.
        predictions_path, test_path, model_path: forwarded to
            ``make_predictions``.
        acct_id, root_dir: passed as arguments to the deployment scripts.
        summary: dict updated in place with the training results.
        dock_path: forwarded to ``save_summary`` and ``make_predictions``.
        rp_dir: directory containing the deployment scripts.
    """
    X_train, y_train = train_model(df)
    ch = st.radio("Choose From", ('Basic Parameters', 'Enter Manually'))

    if ch == 'Basic Parameters':
        n_estimators = st.number_input(
            label='Enter Number of Estimator (Integer)',
            value=100,
            min_value=2)
        random_state = st.number_input(label='Enter Random state(Integer)',
                                       value=0,
                                       min_value=0)
        max_depth = st.number_input(label='Enter Depth of Tree (Integer)',
                                    value=-1)
        learning_rate = st.text_input(label='Enter learning rate',
                                      value='0.01',
                                      max_chars=10,
                                      type='default')
        subsample = st.number_input(label='Enter value for subsample ',
                                    value=1.0,
                                    min_value=0.0)
        num_leaves = st.number_input(label='Enter number of leaves (Integer)',
                                     value=31,
                                     min_value=0)
        reg_alpha = st.text_input(label='Enter value for reg alpha',
                                  value='0',
                                  max_chars=10,
                                  type='default')
        reg_lambda = st.text_input(label='Enter value for reg lambda',
                                   value='0',
                                   max_chars=10,
                                   type='default')
        class_weight = st.text_input(
            label='Enter class weights in dictionary format',
            value='balanced',
            max_chars=20,
            type='default')
        boosting_type = st.selectbox(label='Select boosting type',
                                     options=['gbdt', 'dart', 'goss', 'rf'])

        if class_weight != 'balanced':
            # SECURITY: eval() executes arbitrary code typed into the text
            # box. Only acceptable while the app is used by trusted operators.
            class_weight = eval(class_weight)

        if st.checkbox(label='Train Model'):
            # Text inputs return strings; LightGBM needs real numbers.
            try:
                learning_rate = float(learning_rate)
                reg_alpha = float(reg_alpha)
                reg_lambda = float(reg_lambda)
            except ValueError:
                st.error('learning rate, reg alpha and reg lambda '
                         'must be numeric')
                return

            model = LGBMClassifier(max_depth=max_depth,
                                   subsample=subsample,
                                   random_state=random_state,
                                   num_leaves=num_leaves,
                                   n_estimators=n_estimators,
                                   learning_rate=learning_rate,
                                   reg_alpha=reg_alpha,
                                   reg_lambda=reg_lambda,
                                   class_weight=class_weight,
                                   boosting_type=boosting_type)
            _train_and_report_lgbm(
                model, X_train, y_train, predictions_path, test_path,
                model_path, acct_id, summary, dock_path, rp_dir, root_dir,
                scores_label='cross-validation scores: ',
                accuracy_label='accuracy score of cross validation :')
    else:
        params = st.text_input(label='Enter Best Parameters')
        mod_o = LGBMClassifier()

        if params == "":
            # No parameters typed: run the hyper-parameter tuning flow.
            hyper_tune(mod_o, df)
        elif st.checkbox(label='Train Model'):
            st.text("Training Model with user defined Parameters")
            # SECURITY: eval() executes arbitrary code typed into the text
            # box. Kept because the help text below advertises expressions
            # such as randint(...); restrict to trusted operators.
            params = eval(params)
            st.text(params)
            model = LGBMClassifier()
            model = model.set_params(**params)
            _train_and_report_lgbm(
                model, X_train, y_train, predictions_path, test_path,
                model_path, acct_id, summary, dock_path, rp_dir, root_dir,
                scores_label='Cross-validation scores: ',
                accuracy_label='Accuracy score of cross validation :')

        if st.checkbox(label='Show Help Text?'):
            st.beta_expander("FAQ")
            st.write("Please Enter Parameters in the following format:")
            st.text("{'boosting_type': 'gbdt', 'class_weight':" + str({
                0: 1,
                1: 5
            }) + ", 'colsample_bytree': 1.0, 'learning_rate': 0.08, }")
            st.write(
                "If you wish to enter a range of params for hyper tuning ")
            st.text(
                "{'num_leaves': randint(6, 50),'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100]}"
            )