def get_useful_features_byLightBGM(X, Y):
    # Special parameter settings
    importance_filter = 6
    model_3 = LGBMRegressor(num_leaves=36, n_estimators=100, learning_rate=0.07, random_state=0)
    Y_log = np.log1p(Y)
    model_3.fit(X, Y_log, verbose=True)
    feature_score = model_3.feature_importances_
    importance_feature_map = list(zip(feature_score, X.columns))
    useless_feature = []
    for score, name in importance_feature_map:
        if score <= importance_filter:
            useless_feature.append(name)
    feature = [c for c in X.columns]
    useful_feature = [aa for aa in feature if aa not in useless_feature]
    print('useful:', len(useful_feature))
    print('useless:', len(useless_feature))
    print('total:', len(feature))
    return useful_feature
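# A minimal usage sketch for get_useful_features_byLightBGM. The synthetic
# frame below is a stand-in, not the original dataset, and fit(verbose=True)
# assumes a lightgbm version (< 4.0) whose sklearn fit() still accepts it:
import numpy as np
import pandas as pd

X_demo = pd.DataFrame(np.random.rand(200, 8), columns=['f%d' % i for i in range(8)])
Y_demo = pd.Series(np.random.rand(200) * 100)
useful = get_useful_features_byLightBGM(X_demo, Y_demo)
X_reduced = X_demo[useful]  # keep only features whose importance clears the filter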
def score_of_nonlinearmodel(self, model=None):
    """
    Tree model.
    :param model:
    :return:
    """
    if model is None:
        if (self.numNull != 0) | (self.numInf != 0):
            print('Features contain NaN or Inf!!!')
            print('NaN:{},Inf:{}'.format(self.numNull, self.numInf))
        model = LGBMRegressor(n_estimators=100)
    model_name = str(model).split('(')[0]
    model.fit(self.train_X, self.train_y)
    if self.showFig:
        sns.barplot(abs(model.feature_importances_), self.continuous_feature_names)
        plt.title('{} importances of features'.format(model_name))
        plt.show()
    sc = [abs(x) for x in model.feature_importances_]
    sum_sc = sum(sc)
    featureScore = [round(s / sum_sc, 4) for s in sc]
    print(model_name + ' is finished')
    return featureScore
def lightBGM_model(X, Y):
    model = LGBMRegressor(num_leaves=36, n_estimators=100, learning_rate=0.07, random_state=0)
    model.fit(X, Y, verbose=True)
    return model
class LGBMRegressorPrim(primitive):
    def __init__(self, random_state=0):
        super(LGBMRegressorPrim, self).__init__(name='LGBMRegressor')
        self.hyperparams = []
        self.type = 'Regressor'
        self.description = "LightGBM is a gradient boosting framework that uses tree based learning algorithms."
        self.hyperparams_run = {'default': True}
        self.random_state = random_state
        self.model = LGBMRegressor()
        self.accept_type = 'c_r'

    def can_accept(self, data):
        return self.can_accept_c(data, 'Regression')

    def is_needed(self, data):
        # data = handle_data(data)
        return True

    def fit(self, data):
        data = handle_data(data)
        self.model.fit(data['X'], data['Y'])

    def produce(self, data):
        output = handle_data(data)
        output['predictions'] = self.model.predict(output['X'])
        output['X'] = pd.DataFrame(output['predictions'], columns=[self.name + "Pred"])
        final_output = {0: output}
        return final_output
def init(PROPERTIES_PATH, LOAD_FROM_DISK):
    # boost_params = {'n_estimators': 200,
    #                 'min_samples_split': 40,
    #                 'min_samples_leaf': 4,
    #                 'max_features': 'sqrt',
    #                 'max_depth': 20,
    #                 'learning_rate': 0.05}
    # boost = GradientBoostingRegressor(**boost_params)
    boost = LGBMRegressor(learning_rate=0.05, n_estimators=1127, max_depth=-1,
                          min_child_weight=0, num_leaves=68, min_child_samples=5,
                          objective='regression', subsample_for_bin=1000,
                          min_split_gain=0, feature_fraction=0.5, nthread=-1)
    train_data = load_all_data(get_connection(PROPERTIES_PATH), TABLE_LIST,
                               is_train=True, load_from_disk=LOAD_FROM_DISK)
    train_data = data_preprocessing(train_data)
    train_X, train_Y = train_data
    boost.fit(train_X, train_Y)
    np.save('col.npy', train_X.columns)
    print("training has been completed successfully!")
    print("--------------------------------------------")
    return boost
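# Hypothetical inference-side counterpart to the np.save('col.npy', ...) call
# above: reload the training column order so test features can be aligned.
import numpy as np

cols = np.load('col.npy', allow_pickle=True)  # array of training column names
# test_X = test_X[cols]  # reorder a test DataFrame to the training layout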
def bulid_onetrain(train_data, test, pred=features, label='label', seed=1099, est=6000, is_shuffle=True):
    train_x, train_y = train_data[features].values, train_data[label].values
    clf = LGBMRegressor(
        learning_rate=0.01,
        boosting_type='gbdt',
        objective='regression',
        n_estimators=est,
        num_leaves=156,
        subsample=0.8,
        n_jobs=-1,
        max_depth=8,
        reg_lambda=0,
        colsample_bytree=0.8,
        random_state=2019,  # 2019
        metric=['mse'])
    clf.fit(train_x, train_y,
            eval_set=[(train_x, train_y)],
            eval_metric=['mse'],
            categorical_feature='auto',
            verbose=100)
    # train_pred = clf.predict(train_x, num_iteration=clf.best_iteration_)
    test_pred = clf.predict(test[pred], num_iteration=clf.best_iteration_)
    # print('mean_squared_error:', mean_squared_error(train_y, train_pred))
    test['label'] = test_pred
    return test[['loadingOrder', 'label']], clf
def train_lightgbm(verbose=True):
    """Train a boosted tree with LightGBM."""
    if verbose:
        print("Training with LightGBM")
    df = pd.read_csv(STAGE1_LABELS)
    x = np.array([
        np.mean(np.load(os.path.join(FEATURE_FOLDER, '%s.npy' % str(id))), axis=0).flatten()
        for id in df['id'].tolist()
    ])
    y = df['cancer'].values
    ids = df['id']
    '''
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'l2'},
        'num_leaves': 21,
        'learning_rate': 0.001,
        'nthread': 24,
        'subsample': 0.80,
        'colsample_bytree': 0.80,
        'seed': 42,
        'verbose': verbose,
    }
    '''
    skf = StratifiedKFold(n_splits=5, random_state=2048, shuffle=True)
    clfs = []
    oof_preds = []
    for train_index, test_index in skf.split(x, y):
        trn_x, val_x = x[train_index, :], x[test_index, :]
        trn_y, val_y = y[train_index], y[test_index]
        val_ids = pd.DataFrame(ids.iloc[test_index].values, columns=['id'])
        clf = LGBMRegressor(max_depth=50, num_leaves=21, n_estimators=5000,
                            min_child_weight=1, learning_rate=0.001, nthread=24,
                            subsample=0.80, colsample_bytree=0.80, seed=42)
        clf.fit(trn_x, trn_y, eval_set=[(val_x, val_y)], verbose=verbose,
                eval_metric='l2', early_stopping_rounds=300)
        val_preds = pd.DataFrame(clf.predict(val_x), columns=["cancer"])
        oof_preds.append(pd.concat([val_ids, val_preds], axis=1))
        clfs.append(clf)
    return clfs, oof_preds
def get_ntree():
    rmse_t_total, rmse_v_total = [], []
    for ntree in range(10, 500, 10):
        lgb_base = LGBMRegressor(n_estimators=ntree, objective='regression',
                                 random_state=1234, n_jobs=2,
                                 colsample_bytree=0.8, reg_alpha=1,
                                 max_depth=10, subsample=0.8)
        print('current ntree = %s' % ntree)
        lgb_base.fit(X_t, y_t)
        y_t_pre = lgb_base.predict(X_t)
        y_v_pre = lgb_base.predict(X_v)
        rmse_t_each = np.sqrt(mean_squared_error(y_t, y_t_pre))
        rmse_v_each = np.sqrt(mean_squared_error(y_v, y_v_pre))
        rmse_t_total.append(rmse_t_each)
        rmse_v_total.append(rmse_v_each)
        with open('D:\\workspace python\\statContest\\save\\' + 'lgbbase2_rmse_0412.txt',
                  'a', encoding='utf-8') as myfile:
            print(rmse_t_each, ',', rmse_v_each, file=myfile)
    return rmse_t_total, rmse_v_total
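# A sketch of how the two RMSE curves returned by get_ntree might be plotted
# to choose n_estimators (matplotlib assumed available; the X_t/y_t/X_v/y_v
# globals that get_ntree reads must already be defined):
import matplotlib.pyplot as plt

rmse_t_total, rmse_v_total = get_ntree()
trees = list(range(10, 500, 10))  # the same grid used inside get_ntree
plt.plot(trees, rmse_t_total, label='train RMSE')
plt.plot(trees, rmse_v_total, label='validation RMSE')
plt.xlabel('n_estimators')
plt.ylabel('RMSE')
plt.legend()
plt.show()  # the elbow of the validation curve suggests a sensible tree count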
def lightGBM_train_nocross(j, param, x_train, x_test, y_train, y_test):
    gbm = LGBMRegressor(**param, num_leaves=31, learning_rate=0.01, objective='regression')
    gbm.fit(x_train, y_train)
    y_pred = gbm.predict(x_test)
    y_pred = DataFrame(y_pred)
    rmse_lightGBM.append(np.sqrt(mean_squared_error(y_pred, y_test)))
    r2_lightGBM.append(r2_score(y_test, y_pred))
    return rmse_lightGBM, r2_lightGBM, gbm
def lgb(x_train, y_train, x_val, y_val):
    lgb = LGBMRegressor(n_estimators=1000, max_depth=10, subsample=0.8,
                        colsample_bytree=0.8, learning_rate=0.01, random_state=2020)
    lgb.fit(x_train, y_train)
    result = lgb.predict(x_val)
    score = mean_absolute_error(result, y_val)
    return score
def train_lightgbm(trn_x, val_x, trn_y, val_y):
    clf = LGBMRegressor(max_depth=50, num_leaves=21, n_estimators=5000,
                        min_child_weight=9, learning_rate=0.01, nthread=24,
                        subsample=0.80, colsample_bytree=0.80, seed=42)
    clf.fit(trn_x, trn_y, eval_set=[(val_x, val_y)], verbose=True,
            eval_metric='l2', early_stopping_rounds=300)
    return clf
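# Usage sketch for train_lightgbm above, on placeholder arrays. Because the
# model is fit with early_stopping_rounds, predictions are taken at the best
# iteration; fit(verbose=..., early_stopping_rounds=...) assumes lightgbm < 4.0:
import numpy as np
from sklearn.model_selection import train_test_split

X_demo, y_demo = np.random.rand(500, 10), np.random.rand(500)
trn_x, val_x, trn_y, val_y = train_test_split(X_demo, y_demo, test_size=0.2, random_state=42)
clf = train_lightgbm(trn_x, val_x, trn_y, val_y)
val_pred = clf.predict(val_x, num_iteration=clf.best_iteration_)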
def build_model(train_data, test, pred, label, seed=2099, is_shuffle=True):
    train_pred = np.zeros((train_data.shape[0], ))
    test_pred = np.zeros((test.shape[0], ))
    n_splits = 5
    # KFold
    fold = KFold(n_splits=n_splits, shuffle=is_shuffle, random_state=seed)
    kf_way = fold.split(train_data[pred])
    # params
    # test_x = np.concatenate([test[pred].values, geohash_test], axis=1)
    # train
    for n_fold, (train_idx, valid_idx) in enumerate(kf_way, start=1):
        train_x, train_y = train_data[pred].iloc[train_idx].values, train_data[label].iloc[train_idx]
        valid_x, valid_y = train_data[pred].iloc[valid_idx].values, train_data[label].iloc[valid_idx]
        # geohash_tr_x, geohash_val_x = geohash_train[train_idx], geohash_train[valid_idx]
        # train_x = np.concatenate([train_x, geohash_tr_x], axis=1)
        # valid_x = np.concatenate([valid_x, geohash_val_x], axis=1)
        # Data loading
        clf = LGBMRegressor(
            learning_rate=0.5,
            n_estimators=6000,
            boosting_type='gbdt',
            objective='regression',
            num_leaves=156,
            subsample=0.8,
            n_jobs=-1,
            max_depth=6,
            reg_lambda=0,
            colsample_bytree=0.8,
            random_state=2019,  # 2019
            metric=['mse'])
        clf.fit(train_x, train_y,
                eval_set=[(valid_x, valid_y)],
                eval_metric=['mse'],
                categorical_feature='auto',
                early_stopping_rounds=100,
                verbose=100)
        train_pred[valid_idx] = clf.predict(valid_x, num_iteration=clf.best_iteration_)
        test_pred += clf.predict(test[pred], num_iteration=clf.best_iteration_) / fold.n_splits
    print('mean_squared_error:', mean_squared_error(train_data[label].values, train_pred))
    test['label'] = test_pred
    return test[['loadingOrder', 'label']], clf
def setUp(self):
    X_train, y_train, X_test, y_test = titanic_fare()
    self.test_len = len(X_test)
    train_names, test_names = titanic_names()
    _, self.names = titanic_names()
    model = LGBMRegressor()
    model.fit(X_train, y_train)
    self.explainer = RegressionExplainer(model, X_test, y_test, r2_score,
                                         shap='tree',
                                         cats=['Sex', 'Deck', 'Embarked'],
                                         idxs=test_names, units="$")
def train_LGBM(self, train, t_target, valid, v_target, parm, use_custom_loss=False, reg_alpha=0, reg_lambda=0):
    #entity_features_columns = ['total_floor','building_material','city_town', 'building_type', 'building_use', 'parking_way', 'I_index_50', 'I_index_500', 'I_index_1000', 'I_index_5000', 'I_index_10000', 'II_index_50', 'II_index_500', 'II_index_1000', 'II_index_5000', 'II_index_10000', 'III_index_50', 'III_index_500', 'III_index_1000', 'III_index_5000', 'III_index_10000', 'IV_index_50', 'IV_index_500', 'IV_index_1000', 'IV_index_5000', 'IV_index_10000', 'V_index_50', 'V_index_500', 'V_index_1000', 'V_index_5000', 'V_index_10000', 'VI_index_50', 'VI_index_500', 'VI_index_1000', 'VI_index_5000', 'VI_index_10000', 'VII_index_50', 'VII_index_500', 'VII_index_1000', 'VII_index_5000', 'VII_index_10000', 'VIII_index_50', 'VIII_index_500', 'VIII_index_1000', 'VIII_index_5000', 'VIII_index_10000', 'IX_index_50', 'IX_index_500', 'IX_index_1000', 'IX_index_5000', 'IX_index_10000', 'X_index_50', 'X_index_500', 'X_index_1000', 'X_index_5000', 'X_index_10000', 'XI_index_50', 'XI_index_500', 'XI_index_1000', 'XI_index_5000', 'XI_index_10000', 'XII_index_50', 'XII_index_500', 'XII_index_1000', 'XII_index_5000', 'XII_index_10000', 'XIII_index_50', 'XIII_index_500', 'XIII_index_1000', 'XIII_index_5000', 'XIII_index_10000', 'XIV_index_50', 'XIV_index_500', 'XIV_index_1000', 'XIV_index_5000', 'XIV_index_10000','parking_price_isna','txn_floor_isna']
    #entity_features_columns = ['building_material', 'city', 'town', 'village', 'building_type', 'building_use', 'parking_way','parking_price_isna','txn_floor_isna']
    if use_custom_loss:
        self.loss = custom_loss
    learning_rate = parm['learning_rate']
    n_estimators = parm['n_estimators']
    max_depth = parm['max_depth']
    num_leaves = parm['num_leaves']
    feature_fraction = parm['feature_fraction']
    flag = True
    good_depth = 0
    good_leaves = 0
    good_fraction = 0
    for depth in max_depth:
        for leaves in num_leaves:
            for fraction in feature_fraction:
                rf = LGBMRegressor(learning_rate=learning_rate,
                                   objective='regression',
                                   n_estimators=n_estimators,
                                   max_depth=depth,
                                   num_leaves=leaves,
                                   reg_alpha=reg_alpha,
                                   reg_lambda=reg_lambda,
                                   feature_fraction=fraction,
                                   bagging_freq=1,
                                   metric='rmse')
                # should we drop the features that are not correlated with our target?
                rf.fit(train, t_target,
                       eval_set=[(train, t_target), (valid, v_target)],
                       # early_stopping_rounds=100,
                       verbose=5000,
                       eval_metric=self.loss,
                       categorical_feature=self.entity_features_columns)
                print("Finished.")
                if flag:
                    self.model = rf
                    flag = False
                y_predict, y_true = self.predict(valid, v_target)
                point = self.score(y_true, y_predict)
                if point > self.max_point:
                    self.max_point = point
                    self.model = rf
                    good_depth = depth
                    good_leaves = leaves
                    good_fraction = fraction
    print(f"depth : {good_depth} leaves : {good_leaves} fraction :{good_fraction}")
    self.model.booster_.save_model(f'models/lightgbm{good_depth}_{good_leaves}_{good_fraction}.txt')
    return self
def predict(X_train, Y_train, X_test):
    print("Y_train is 1:", Y_train.count(1))
    print("Y_train is 0:", Y_train.count(0))
    clfs = [
        LGBMRegressor(learning_rate=0.0475, max_depth=13, n_estimators=100, num_leaves=80),
        XGBRegressor(learning_rate=0.0475, max_depth=4, n_estimators=300)]
    X = np.array(X_train, dtype='float32')
    y = np.array(Y_train, dtype='float32')
    X_predict = np.array(X_test, dtype='float32')
    dataset_blend_train = np.zeros((X.shape[0], len(clfs)), dtype='float32')
    dataset_blend_test = np.zeros((X_predict.shape[0], len(clfs)), dtype='float32')
    '''5-fold stacking'''
    n_folds = 5
    skf = StratifiedKFold(n_splits=n_folds)
    for j, clf in enumerate(clfs):
        '''Train each base model in turn'''
        print("clf", j)
        dataset_blend_test_j = np.zeros((X_predict.shape[0], n_folds), dtype='float32')
        for i, (train, test) in enumerate(skf.split(X, y)):
            '''Use fold i for prediction and the remaining folds for training;
            the model's predictions become the new feature for fold i.'''
            print("stacking Fold", i)
            X_train, y_train, X_test, y_test = X[train], y[train], X[test], y[test]
            # if j == 0:
            #     class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
            #     clf.class_weight = dict(enumerate(class_weights))
            # else:
            #     class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
            #     clf.scale_pos_weight = class_weights[1] / class_weights[0]
            #     print('scale_pos_weight:', clf.scale_pos_weight)
            clf.fit(X_train, y_train)
            y_submission = clf.predict(X_test)
            dataset_blend_train[test, j] = y_submission
            dataset_blend_test_j[:, i] = clf.predict(X_predict)
        '''For the test set, use the mean of the k models' predictions as the new feature'''
        dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)
        del dataset_blend_test_j
        # print("val auc Score: %f" % roc_auc_score(y_predict, dataset_blend_test[:, j]))
    # clf = LogisticRegression()
    # clf = GradientBoostingRegressor(learning_rate=0.02, max_depth=6)
    clf = LGBMRegressor()
    class_weights = class_weight.compute_class_weight('balanced', np.unique(y), y)
    clf.class_weight = dict(enumerate(class_weights))
    dataset_blend_train = np.append(dataset_blend_train, X, axis=1)
    dataset_blend_test = np.append(dataset_blend_test, X_predict, axis=1)
    clf.fit(dataset_blend_train, y)
    y_submission = clf.predict(dataset_blend_test)
    return y_submission
def tune_params():
    rmse_t_total, rmse_v_total = [], []
    for max_depth in range(6, 11):
        for subsample in [0.6, 0.7, 0.8]:
            for colsample_bytree in [0.6, 0.7, 0.8]:
                for reg_alpha in [0.1, 1, 10]:
                    lgb_base = LGBMRegressor(n_estimators=150, objective='regression',
                                             random_state=1234, n_jobs=3,
                                             colsample_bytree=colsample_bytree,
                                             reg_alpha=reg_alpha, max_depth=max_depth,
                                             subsample=subsample)
                    _params = {
                        'max_depth': max_depth,
                        'subsample': subsample,
                        'colsample_bytree': colsample_bytree,
                        'reg_alpha': reg_alpha,
                    }
                    lgb_base.fit(X_t, y_t)
                    y_t_pre = lgb_base.predict(X_t)
                    y_v_pre = lgb_base.predict(X_v)
                    rmse_t_each = np.sqrt(mean_squared_error(y_t, y_t_pre))
                    rmse_v_each = np.sqrt(mean_squared_error(y_v, y_v_pre))
                    rmse_t_total.append(rmse_t_each)
                    rmse_v_total.append(rmse_v_each)
                    print(_params)
                    with open('D:\\workspace python\\statContest\\save\\' + 'lgbbase2_saveparams_rmse_0412.txt',
                              'a', encoding='utf-8') as myfile1:
                        print(_params['max_depth'], _params['subsample'],
                              _params['colsample_bytree'], _params['reg_alpha'], file=myfile1)
                    print(rmse_t_each, rmse_v_each)
                    with open('D:\\workspace python\\statContest\\save\\' + 'lgbbase2_tunparms_rmse_0412.txt',
                              'a', encoding='utf-8') as myfile:
                        print(rmse_t_each, ',', rmse_v_each, file=myfile)
    return rmse_t_total, rmse_v_total
def LGB_train(self, X_train, X_valid, labels_train, labels_valid, X_test, lgb_param_all):
    lgb_param_contrl = {'early_stopping_rounds': 100, 'categorical_feature': 'auto'}
    lgb_param = lgb_param_all.copy()
    objective_type = lgb_param['objective_type']
    lgb_param.pop('objective_type')
    for k in ['early_stopping_rounds', 'categorical_feature']:
        if k in lgb_param:
            lgb_param_contrl[k] = lgb_param[k]
            lgb_param.pop(k)
    if not self.config.retrain:
        # Load an existing model and continue training incrementally
        model_load = self.load_model()
        if not model_load:
            print('Model {} does not exist; training from scratch'.format(self.modelName))
            if objective_type == 'regressor':
                clf = LGBMRegressor(**lgb_param)
            else:
                clf = LGBMClassifier(**lgb_param)
            clf.fit(X_train, labels_train, eval_set=[(X_valid, labels_valid)], eval_metric='rmse',
                    early_stopping_rounds=lgb_param_contrl['early_stopping_rounds'],
                    categorical_feature=lgb_param_contrl['categorical_feature'])
        else:
            clf = model_load.fit(X_train, labels_train, eval_set=[(X_valid, labels_valid)], eval_metric='rmse',
                                 early_stopping_rounds=lgb_param_contrl['early_stopping_rounds'],
                                 categorical_feature=lgb_param_contrl['categorical_feature'])
    else:
        if objective_type == 'regressor':
            clf = LGBMRegressor(**lgb_param)
        else:
            clf = LGBMClassifier(**lgb_param)
        clf.fit(X_train, labels_train, eval_set=[(X_valid, labels_valid)], eval_metric='rmse',
                early_stopping_rounds=lgb_param_contrl['early_stopping_rounds'],
                categorical_feature=lgb_param_contrl['categorical_feature'])
    val_lgb_pre = clf.predict(X_valid.values, num_iteration=clf.best_iteration_)
    test_lgb_pre = clf.predict(X_test.values, num_iteration=clf.best_iteration_)
    metrics_name = self.config.metrics_name
    myMetrics = defindMetrics.MyMetrics(metrics_name)
    score_lgb = myMetrics.metricsFunc(val_lgb_pre, labels_valid)
    self.save_model(clf, self.config.saveModel)
    return val_lgb_pre, test_lgb_pre, score_lgb
def train_LightGBM(x_train, y_train):
    clf = LGBMRegressor(
        n_estimators=10000,
        learning_rate=0.02,
        boosting_type='gbdt',
        objective='regression_l1',
        max_depth=-1,
        num_leaves=31,
        min_child_samples=20,
        feature_fraction=0.8,
        bagging_freq=1,
        bagging_fraction=0.8,
        lambda_l2=2,
        random_state=2020,
    )
    clf.fit(x_train, y_train)
    return clf
def train_lgb_model(best_nodes, X_train_scaled, Y_train):
    rsg = LGBMRegressor(
        learning_rate=best_nodes["learning_rate"],
        n_estimators=int(best_nodes["n_estimators"]),
        max_depth=best_nodes["max_depth"],
        # eval_metric=best_nodes["eval_metric"],
        num_leaves=best_nodes["num_leaves"],
        subsample=best_nodes["subsample"],
        colsample_bytree=best_nodes["colsample_bytree"],
        min_child_samples=best_nodes["min_child_samples"],
        min_child_weight=best_nodes["min_child_weight"])
    rsg.fit(X_train_scaled, Y_train)
    Y_pred = rsg.predict(X_train_scaled)
    print("mse:", np.mean((Y_pred - Y_train)**2))
    print("rmse:", np.sqrt(np.mean((Y_pred - Y_train)**2)))
    return rsg
def lightBGM_model_with_test(X, Y):
    model = LGBMRegressor(num_leaves=36, n_estimators=100, learning_rate=0.07, random_state=0)
    useful_feature = get_useful_features_byLightBGM(X, Y)
    X_U = X[useful_feature]
    x1, x2, y1, y2 = train_test_split(X_U, Y, test_size=0.2)
    y1_log = np.log1p(y1)
    model.fit(x1, y1_log, verbose=True)
    predict_log = model.predict(x2)
    predict = np.expm1(predict_log)
    error = error_fun(predict, y2)[1]
    del x1, x2, y1, y2
    return error
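# The log1p/expm1 pair used above is an exact round trip for nonnegative
# targets, so predictions come back on the original scale; a quick check:
import numpy as np

y_check = np.array([0.0, 1.0, 10.0, 1000.0])
assert np.allclose(np.expm1(np.log1p(y_check)), y_check)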
def train_lightgbm(verbose=True):
    """Train a boosted tree with LightGBM."""
    if verbose:
        print("Training with LightGBM")
    df = pd.read_csv(STAGE1_LABELS)
    x = np.array([np.mean(np.load(FEATURE_FOLDER + '%s.npy' % str(id)), axis=0).flatten()
                  for id in df['id'].tolist()])
    y = df['cancer'].values
    trn_x, val_x, trn_y, val_y = train_test_split(x, y, random_state=42,
                                                  stratify=y, test_size=0.20)
    clf = LGBMRegressor(max_depth=50, num_leaves=21, n_estimators=5000,
                        min_child_weight=1, learning_rate=0.001, nthread=24,
                        subsample=0.80, colsample_bytree=0.80, seed=42)
    clf.fit(trn_x, trn_y, eval_set=[(val_x, val_y)], verbose=verbose,
            eval_metric='l2', early_stopping_rounds=300)
    return clf
def fit(self):
    if self.First_change:
        # Box-Cox transform
        act = boxcox(self.train_label + 0.1)[0]
        self.act_ = boxcox(self.train_label + 0.1)[1]
    else:
        act = self.train_label
    steps = self.steps
    actual = act
    n_samples = len(self.train_label)
    y_pred_train = np.zeros(n_samples, np.float32)
    n_estimators_list = self.n_estimators_list
    for i in range(1):
        num = np.random.randint(0, 5000)
        print("----training begin----")
        for step in range(steps):
            print(step)
            actual = actual - y_pred_train  # compute residuals
            if step > 0:
                # Squash the residual labels (sigmoid), then apply Box-Cox
                actual_ = sigmod(actual)
                actual_box = boxcox(actual_)[0]
                actual_box_val = boxcox(actual_)[1]
                self.box_value.append(actual_box_val)
                actual_used = actual_box
            else:
                actual_used = actual
            # Build the stage model
            model = LGBMRegressor(n_estimators=n_estimators_list[step], max_depth=3,
                                  learning_rate=0.02, subsample=1, colsample_bytree=1)
            model.fit(self.train.values, actual_used)  # train the stage model
            y_pred_train_ = model.predict(self.train.values)  # stage predictions
            if step > 0:
                # Invert the stage transforms to recover the original scale
                y_pred_train = (y_pred_train_ * actual_box_val + 1)**(1 / actual_box_val)
                y_pred_train = sigmod_trans(y_pred_train)
            else:
                y_pred_train = y_pred_train_
            self.model_list.append(model)  # store the stage model
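# The inverse used in the stage loop above follows the Box-Cox definition:
# for lmbda != 0, y_bc = (y**lmbda - 1) / lmbda, hence
# y = (y_bc * lmbda + 1)**(1 / lmbda). A round-trip check with scipy
# (illustrative; assumes the fitted lmbda is nonzero):
import numpy as np
from scipy.stats import boxcox

y_check = np.random.rand(100) + 0.1  # Box-Cox requires strictly positive data
y_bc, lmbda = boxcox(y_check)
assert np.allclose((y_bc * lmbda + 1) ** (1 / lmbda), y_check)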
def modelingLGBM(hold_out_train, hold_out_test):
    from sklearn.linear_model import LassoCV as LaCV
    from sklearn.ensemble import RandomForestRegressor as RFR
    from sklearn.linear_model import Ridge
    from sklearn.linear_model import RANSACRegressor
    from sklearn.neural_network import MLPRegressor as MLP
    from xgboost.sklearn import XGBRegressor as XGBR
    from xgboost.sklearn import DMatrix
    from lightgbm.sklearn import LGBMRegressor as LGBM

    traindata = hold_out_train.copy()
    testdata = hold_out_test.copy()
    traindata = traindata.drop(['Store', 'Customers', 'Date', 'Open', 'PromoInterval', 'monthstr'], axis=1)
    testdata = testdata.drop(['Store', 'Customers', 'Date', 'Open', 'PromoInterval', 'monthstr'], axis=1)
    train_x = traindata.drop(['Sales'], axis=1)
    train_y = np.log1p(traindata['Sales'])
    test_x = testdata.drop(['Sales'], axis=1)
    # # Normalization
    # min_max_scaler = MinMaxScaler()
    # train_x = min_max_scaler.fit_transform(train_x)
    # test_x = min_max_scaler.fit_transform(test_x)
    smalest_rmspe = 1000
    subsamples = np.arange(0.5, 0.6, 0.1)
    for subsample in subsamples:
        time1 = time.time()
        lgbmModel = LGBM(n_estimators=8000, subsample=subsample)
        print(lgbmModel)
        lgbmModel.fit(train_x, train_y)
        sales_predict = lgbmModel.predict(test_x)
        rmspe = RMSPE(testdata['Sales'], np.expm1(sales_predict))
        print(rmspe)
        time2 = time.time()
        print('Elapsed time:', (time2 - time1))
        if smalest_rmspe > rmspe:
            smalest_rmspe = rmspe
            best_model = lgbmModel
    return best_model
def predict_lgb(X, y, df2, params, ind):
    X_train, y_train = X, y
    output = df2[(df2.index >= ind) & (df2.index < (ind + 28))]  # dataset for prediction
    X = output.iloc[:, 1:]  # this basically drops the "value" column
    lgb_model = LGBMRegressor(**params)
    lgb_reg = lgb_model.fit(X_train, y_train.value.ravel())
    preds = lgb_reg.predict(X)
    return preds
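# A self-contained sketch of predict_lgb on synthetic data. All names below
# are stand-ins; df2_demo mimics the layout the function expects ('value'
# first, features after), and LGBMRegressor must be importable where
# predict_lgb is defined:
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df2_demo = pd.DataFrame({'value': rng.random(300),
                         'feat1': rng.random(300),
                         'feat2': rng.random(300)})
X_hist = df2_demo.iloc[:200, 1:]         # history features
y_hist = df2_demo.iloc[:200][['value']]  # history targets, as a one-column frame
preds = predict_lgb(X_hist, y_hist, df2_demo, {'n_estimators': 50}, ind=200)
print(preds.shape)  # (28,) -- one prediction per row of the 28-row window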
def get_model_result(self, params: dict) -> dict:
    X, y = self.X, self.Y
    X_test, y_test = self.X_test, self.Y_test
    # X, y = self.X.values, self.Y.values
    # X_test, y_test = self.X_test.values, self.Y_test.values
    if isinstance(self.estimator, lgb.Booster):
        params["metric"] = "auc"
        estimator = lgb.train(params, self.dataset_train)
        pred_train = pd.Series(estimator.predict(self.dataset_train), index=self.X.index)
        pred_test = pd.Series(estimator.predict(self.dataset_test), index=self.X_test.index)
    elif isinstance(self.estimator, LGBMRegressor):
        estimator = LGBMRegressor(**params)
        estimator.fit(X, y, eval_metric="auc")
        pred_train = pd.Series(estimator.predict(X), index=self.X.index)
        pred_test = pd.Series(estimator.predict(X_test), index=self.X_test.index)
    elif isinstance(self.estimator, LGBMClassifier):
        estimator = LGBMClassifier(**params)
        estimator.fit(X, y, eval_metric="auc")
        pred_train = pd.Series(estimator.predict_proba(X)[:, 1], index=self.X.index)
        pred_test = pd.Series(estimator.predict_proba(X_test)[:, 1], index=self.X_test.index)
    else:
        raise TypeError(
            "Input model should be a `lgb.Booster` or `LGBMClassifier`/`LGBMRegressor`!"
        )
    # Null out scores for samples outside the hit indices
    pred_train.loc[~pred_train.index.isin(self.hit_indices)] = np.nan
    pred_test.loc[~pred_test.index.isin(self.hit_indices)] = np.nan
    # Compute model evaluation metrics
    ks_train, ks_test = calc_ks(-pred_train, y), calc_ks(-pred_test, y_test)
    auc_train, auc_test = calc_auc(pred_train, y), calc_auc(pred_test, y_test)
    # return {'train': (ks_train, auc_train), 'test': (ks_test, auc_test)}
    return {"ks": (ks_train, ks_test), "auc": (auc_train, auc_test)}
def model_lgb(self, X, Y):
    # create dataset for lightgbm
    # specify your configurations as a dict
    # params = {
    #     'task': 'train',
    #     'boosting_type': 'gbdt',  # can be swapped for rf (random forest), dart, or goss
    #     'objective': 'binary',
    #     'metric': {'cross_entropy'},  # cross_entropy
    #     'num_leaves': 80,  # 50
    #     # 'max_depth': 6,  # 6
    #     'learning_rate': 0.06,
    #     'bagging_fraction': 0.8,
    #     'bagging_freq': 5,
    #     'seed': 0,
    #     # 'min_data_in_leaf': 100,
    # }  # f1 0.43
    # train
    # X, Y = SMOTE().fit_sample(X, Y)
    # print("Y is 1:", Y.count(1))
    # print("Y is 0:", Y.count(0))
    class_weights = class_weight.compute_class_weight('balanced', np.unique(Y), Y)
    class_weights = dict(enumerate(class_weights))
    print("class_weights", class_weights)
    lgb_model = LGBMRegressor(learning_rate=0.0475, max_depth=13, n_estimators=100,
                              num_leaves=50, class_weight=class_weights)
    # lgb_model = LGBMRegressor(learning_rate=0.0475, max_depth=13, n_estimators=100, num_leaves=60,
    #                           class_weight=class_weights)  # 0.552
    # lgb_model = LGBMRegressor(learning_rate=0.0475, max_depth=13, n_estimators=100, num_leaves=70,
    #                           class_weight=class_weights)  # 0.542
    # {'learning_rate': 0.0475, 'max_depth': 13, 'n_estimators': 100, 'num_leaves': 70} 0.464
    print("Training lgb model....")
    gbm = lgb_model.fit(X, Y)
    print("feature_importances_ : ", gbm.feature_importances_)
    print("Save model to " + self.model_path)
    dump(gbm, self.model_path)
for i in range(len(duration)):
    duration_hours.append(int(duration[i].split(sep="h")[0]))
    duration_mins.append(int(duration[i].split(sep="m")[0].split(sep="h")[-1]))

X["Duration_hours"] = duration_hours
X["Duration_mins"] = duration_mins
X.drop(["Duration"], axis=1, inplace=True)
X.drop(["Dep_Time"], axis=1, inplace=True)
X.replace({"non-stop": 0, "1 stop": 1, "2 stops": 2, "3 stops": 3, "4 stops": 4}, inplace=True)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

from lightgbm.sklearn import LGBMRegressor
reg = LGBMRegressor()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

from sklearn import metrics
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

filename = 'flightfare.pkl'
pickle.dump(reg, open(filename, 'wb'))
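# The pickled regressor can be loaded back for inference (same path as the
# dump above):
import pickle

with open('flightfare.pkl', 'rb') as f:
    loaded_reg = pickle.load(f)
# loaded_reg.predict(X_test) reproduces y_pred from the session that saved it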
best_params = space_eval(hyper_space, best_vals)
print("BEST PARAMETERS: " + str(best_params))

# Print best CV score
scores = [-trial['result']['loss'] for trial in trials.trials]
print("BEST CV SCORE: " + str(np.max(scores)))

# Print execution time
tdiff = trials.trials[-1]['book_time'] - trials.trials[0]['book_time']
print("ELAPSED TIME: " + str(tdiff.total_seconds() / 60))

# Set params
est.set_params(**best_params)

# Fit
est.fit(X_train, y_train)
y_pred = est.predict(X_test)  # Predict
score = r2_score(y_test, y_pred)
print("R2 SCORE ON TEST DATA: {}".format(score))

#==============================================================================
# Tree structure of hyperparameter space (Optional)
#==============================================================================

# You must change the evaluate function in order to extract learning rate
# and n_estimators from choices. Please add the following code to the start of
# the evaluate function:
#
# # Choices
# if 'choices' in params.keys():
#     params['learning_rate'] = params['choices']['learning_rate']
class b_model:
    # Class-level globals can be defined here
    params = {
        'learning_rate': 0.015,
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'mse',
        'num_leaves': 12,
        'max_depth': 9,
        'max_bin': 130,
        'feature_fraction': 0.9,
        'reg_lambda': 50,
        'min_data': 25,
        'min_child_weight': 0.001,
        'verbose': -1,
    }
    no_use = [
        "血糖", "blood_sugar", "id", "blood_sugar_log", '体检日期',
        'feature_5_less_25', 'feature_4_less_60', '性别'
    ]

    def __init__(self):
        # Parameters needed when the class is instantiated
        self.model = LGBMRegressor(learning_rate=0.015, objective="regression",
                                   metric='mse', num_leaves=12, max_depth=9,
                                   max_bin=130, feature_fraction=0.9, reg_lambda=50,
                                   min_data=25, min_child_weight=0.001,
                                   num_boost_round=3000, random_state=42)

    def __make_feature(self, train, test):
        # Build features
        if train.empty:
            test['性别'] = test['性别'].map({'男': 1, '女': 0, '??': 1})
            return test
        if test.empty:
            train['性别'] = train['性别'].map({'男': 1, '女': 0, '??': 1})
            return train
        else:
            train_id = train.id.values.copy()
            test_id = test.id.values.copy()
            data = pd.concat([train, test])
            data['性别'] = data['性别'].map({'男': 1, '女': 0, '??': 1})
            train_feat = data[data.id.isin(train_id)]
            test_feat = data[data.id.isin(test_id)]
            return train_feat, test_feat

    def fit(self, X, y=None):
        X.drop(X[X["年龄"] >= 84].index, inplace=True)
        fea_train = pd.read_csv("./feature/fea_train.csv")
        fea_train1 = pd.read_csv("./feature/fea_train_1.csv")
        fea_train2 = pd.read_csv("./feature/fea_train_2.csv")
        X = pd.merge(X, fea_train, how="left", on="id")
        X = pd.merge(X, fea_train1, how="left", on="id")
        X = pd.merge(X, fea_train2, how="left", on="id")
        X = self.__make_feature(train=X, test=pd.DataFrame())
        if y is None:
            y = X["血糖"].values
        predictors = [f for f in list(X.columns) if f not in self.no_use]
        X_train, X_test, y_train, y_test = train_test_split(X[predictors], y,
                                                            test_size=0.1, random_state=42)
        self.model.fit(X_train[predictors], y_train, eval_metric="mse",
                       early_stopping_rounds=100, verbose=100,
                       eval_set=(X_test[predictors], y_test))
        from sklearn.metrics import mean_squared_error
        print("Offline error: {}".format(0.5 * mean_squared_error(
            y_test, self.model.predict(X_test[predictors]))))
        return self

    def predict(self, X):
        # Predict on the test set, given the model and the test data
        fea_test = pd.read_csv("./feature/fea_test.csv")
        fea_test1 = pd.read_csv("./feature/fea_test_1.csv")
        fea_test2 = pd.read_csv("./feature/fea_test_2.csv")
        X = pd.merge(X, fea_test, how="left", on="id")
        X = pd.merge(X, fea_test1, how="left", on="id")
        X = pd.merge(X, fea_test2, how="left", on="id")
        X = self.__make_feature(test=X, train=pd.DataFrame())
        predictors = [f for f in list(X.columns) if f not in self.no_use]
        test_pred = self.model.predict(X[predictors])
        print("Max prediction: {}".format(test_pred.max()))
        return test_pred

    def get_params(self):
        return self.params
X_val, Y_val = train.iloc[val_idx][feats], train.iloc[val_idx].label
clf = LGBMRegressor(
    n_estimators=100000,
    learning_rate=0.1,
    num_leaves=255,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=2020,
    metric='RMSE',
    n_jobs=24,
)
clf.fit(
    X_trn, Y_trn,
    eval_set=[(X_val, Y_val)],
    early_stopping_rounds=200,
    verbose=1000,
)
oof[val_idx] = clf.predict(X_val)
sub += clf.predict(X_test) / skf.n_splits

sub = pd.DataFrame({
    'queryid': test.query_id,
    'documentid': test.doc_id,
    'predict_label': sub,
})
oof = pd.DataFrame({
    'query_id': train.query_id,
    'doc_id': train.doc_id,