def fit_model(self, X_train, y_train, X_test, y_test): clf = XGBRegressor(learning_rate=self.learning_rate, n_estimators=self.n_estimators, max_depth=self.max_depth, min_child_weight=self.min_child_weight, gamma=self.gamma, subsample=self.subsample, colsample_bytree=self.colsample_bytree, objective=self.objective, nthread=self.nthread, scale_pos_weight=self.scale_pos_weight, reg_alpha=self.reg_alpha, reg_lambda=self.reg_lambda, seed=self.seed) clf.fit(X_train, y_train) # y_pre = clf.predict(X_test) # y_pro = clf.predict_proba(X_test)[:, 1] # print # "pred_leaf=T AUC Score : %f" % metrics.roc_auc_score(y_test, y_pro) # print("pred_leaf=T Accuracy : %.4g" % metrics.accuracy_score(y_test, y_pre)) new_feature = clf.apply(X_train) X_train_new = self.mergeToOne(X_train, new_feature) new_feature_test = clf.apply(X_test) X_test_new = self.mergeToOne(X_test, new_feature_test) print("Training set sample number remains the same") return X_train_new, y_train, X_test_new, y_test
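# The fit_model variants in this collection (here and in fit_model_split / the
# scaled fit_model further down) call a self.mergeToOne helper that is never shown.
# A minimal sketch of what such a method might look like, under the assumption that
# it simply appends the leaf indices returned by clf.apply() as extra feature
# columns; the name and behaviour are assumptions, not the original implementation.
import numpy as np

def mergeToOne(self, X, X_leaves):
    # Column-wise concatenation of the original features and the per-tree leaf indices.
    return np.hstack([np.asarray(X, dtype=float), np.asarray(X_leaves, dtype=float)])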
def cbd_model(cbd_df,cbd_finalinput): ''' function that creates model from the cbd dataframe and returns the predicted number of crimes for the next three days ''' X_cbd=cbd_df[['year', 'month', 'day', 'tmax', 'tmin', 'consumer_price_index', 'gdp_millions_2007', 'seasonally_adjusted_unemployment', 'unadjusted_unemployment', 'Possession, cocaine ', 'Heroin, possession ', 'Heroin Price Canada', 'day_segment_1200pm-1159pm', 'day_of_week_Monday', 'day_of_week_Saturday', 'day_of_week_Sunday', 'day_of_week_Thursday', 'day_of_week_Tuesday', 'day_of_week_Wednesday']] y_cbd=cbd_df['number_of_crimes'] scaler = StandardScaler() scaler.fit(X_cbd) # Don't cheat - fit only on training data X_cbd = scaler.transform(X_cbd) cbd_input_scaled = scaler.transform(cbd_finalinput) xgb=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3, min_child_weight=1, missing=None, n_estimators=100, n_jobs=1, nthread=None, objective='reg:linear', random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, silent=True, subsample=1) xgb.fit(X_cbd,y_cbd) predict_cbd=xgb.predict(cbd_input_scaled) return predict_cbd
def XGB_reg_evaluation(individual, evaluation_method='roll_win'): ''' evaluation_method : can be roll_win, mse ''' if evaluation_method == 'roll_win': trainNumber = individual[6] # the train num param = { 'eta': individual[0], 'silent': True, 'objective': "reg:linear", 'nthread': -1, 'min_child_weight': individual[1], 'max_depth': individual[2], 'subsample': individual[3], 'colsample_bylevel': individual[4], 'seed': 0 } roll_win_mseValue = 0 for i in range(N_validation): trainingX, trainingY = trainX[(trainNum - (i + 1) * window - trainNumber):(trainNum - (i + 1) * window),:],\ trainY[(trainNum - (i + 1) * window - trainNumber):(trainNum - (i + 1) * window)] testingX, testingY = trainX[(trainNum - (i + 1) * window):(trainNum - i * window),:], \ trainY[(trainNum - (i + 1) * window):(trainNum - i * window)] dtrain = xgb.DMatrix(data=trainingX, label=trainingY) bst = xgb.train(params=param, dtrain=dtrain, num_boost_round=individual[5]) testingX = xgb.DMatrix(testingX) roll_win_mseValue += sum( (testingY - bst.predict(testingX))**2) / window roll_win_mseValue /= N_validation return (roll_win_mseValue, ) if evaluation_method == 'mse': ### The cross validation evaluation N_SPLITS = N_splits kf = KFold(n_splits=N_SPLITS) cv_mseValue = 0 fc = XGBRegressor(learning_rate=individual[0], n_estimators=individual[5], silent=True, objective="reg:linear", nthread=-1, gamma=0, min_child_weight=individual[1], max_depth=individual[2], subsample=individual[3], colsample_bylevel=individual[4], seed=0) for train, test in kf.split(trainX): fc.fit(trainX[train, :], trainY[train]) cv_mseValue += sum( (trainY[test] - fc.predict(trainX[test, :]))**2) / len(test) cv_mseValue = cv_mseValue / N_SPLITS return (cv_mseValue, ) print("There is no evaluation method for %s" % evaluation_method) raise Exception("evaluation_method is not valid")
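# A hedged, self-contained sketch of how XGB_reg_evaluation might be driven. The
# module-level names it relies on (trainX, trainY, trainNum, window, N_validation,
# N_splits) are assumptions filled with synthetic data here, and the individual
# layout [eta, min_child_weight, max_depth, subsample, colsample_bylevel,
# num_boost_round, train_size] is inferred from the indices used above; xgboost,
# XGBRegressor and KFold are assumed to be imported elsewhere in the source module.
import numpy as np

rng = np.random.RandomState(0)
trainX = rng.rand(500, 10)
trainY = trainX @ rng.rand(10) + 0.1 * rng.rand(500)
trainNum = trainX.shape[0]   # total number of training rows
window = 20                  # size of each rolling test window
N_validation = 5             # number of rolling windows to average over
N_splits = 5                 # folds used by the 'mse' branch

example_individual = [0.1, 1, 4, 0.9, 0.9, 100, 300]
print(XGB_reg_evaluation(example_individual, evaluation_method='roll_win'))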
def go(data_dict,feats_to_use, params={"seed":0,"silent":False,"n_jobs":-1}, parameter_tuning=False): ''' if with_gpu: xgb = XGBRegressor(seed=0, silent=False, tree_method='gpu_hist', n_gpus=-1) else: xgb = XGBRegressor(seed=0, silent=False, n_jobs=-1) ''' X_train=data_dict['X_train'][feats_to_use].copy() y_train=data_dict['y_train'].copy() X_test=data_dict['X_test'][feats_to_use].copy() X_val=data_dict['X_val'][feats_to_use].copy() y_val=data_dict['y_val'].copy() if parameter_tuning: fit_params={ "early_stopping_rounds":10, "eval_metric" : "rmse", "eval_set" : [(X_val,y_val)]} xgb=XGBRegressor() train_val_features=pd.concat([X_train,X_val]) train_val_labels=pd.concat([y_train,y_val]) test_fold = np.zeros(train_val_features.shape[0]) # initialize all index to 0 test_fold[:X_train.shape[0]] = -1 # set index of training set to -1, indicating not to use it in validation ps=PredefinedSplit(test_fold=test_fold) X_train=data_dict['X_train'][feats_to_use] y_train=data_dict['y_train'] X_test=data_dict['X_test'][feats_to_use] grid=GridSearchCV(xgb,params,fit_params=fit_params,scoring=RMSE , cv=ps, verbose=32, n_jobs=-1) start=time.time() grid.fit(train_val_features,train_val_labels) elapsed=time.time()-start print (elapsed) print ('best params:',grid.best_params_) print ('best score:',grid.best_score_) return grid.best_params_, grid.best_estimator_ else: xgb=XGBRegressor(**params) print (xgb) print ('start xgboost training') start=time.time() eval_set=[(X_val,y_val)] xgb.fit(X_train,y_train, eval_set=eval_set,eval_metric='rmse',early_stopping_rounds=30) elapsed=time.time()-start print (elapsed) data_dict['y_pred']=np.exp(xgb.predict(X_test))-1 #generate submission data_dict['X_test']['item_cnt_month']=data_dict['y_pred'] test=pd.read_csv('test.csv') submission=pd.merge(test,data_dict['X_test'], on=['shop_id','item_id'],how='left')[['ID','item_cnt_month']] return submission, xgb
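# go() above hands GridSearchCV a scorer named RMSE that is not defined in this
# collection. A minimal sketch of a compatible scorer (the name and sign handling
# are assumptions): scikit-learn maximises scores, so the RMSE is negated via
# greater_is_better=False.
import numpy as np
from sklearn.metrics import make_scorer, mean_squared_error

def _rmse(y_true, y_pred):
    # Plain root-mean-squared error.
    return np.sqrt(mean_squared_error(y_true, y_pred))

RMSE = make_scorer(_rmse, greater_is_better=False)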
class my_model: def __init__(self, d): self.linear_reg = linear_model.Ridge() self.xgb_reg = XGBRegressor(max_depth=7) self.d = d def fit(self, X, y): self.linear_reg.fit(X[:, 0].reshape(-1, 1), y) self.l_reg_res = self.linear_reg.predict(X[:, 0].reshape(-1, 1)) self.xgb_reg.fit(X[:, 1:], y - self.l_reg_res) X_nn = np.hstack([ X, self.xgb_reg.predict(X[:, 1:]).reshape(-1, 1), self.l_reg_res.reshape(-1, 1) ]) return X_nn def predict(self, X): if isinstance(X[0, -1], str): for i in range(X.shape[0]): X[i, -1] = self.d[X[i, -1]] X = X.astype(np.float64, copy=False) X_nn_final = np.hstack([ X, self.xgb_reg.predict(X[:, 1:]).reshape(-1, 1), self.linear_reg.predict(X[:, 0].reshape(-1, 1)).reshape(-1, 1) ]) return X_nn_final
def run_xgb(output_train, df_X_train, df_Y_train, output_test, df_X_test, df_Y_test): xgb_estimator = XGBRegressor() param_grid = { 'nthread': [4], #when use hyperthread, xgboost may become slower 'objective': ['reg:linear'], 'learning_rate': [.03, 0.05, .07], #so called `eta` value 'max_depth': [5, 6, 7], 'min_child_weight': [4], 'silent': [1], 'subsample': [0.7], 'colsample_bytree': [0.7], 'n_estimators': [30] } opt_pars = {"score": None, "alpha": None} xgb_grid = GridSearchCV(xgb_estimator, param_grid) xgb_grid.fit(df_X_train, df_Y_train.cnt) r2_train = xgb_grid.best_score_ opt_pars = xgb_grid.best_params_ # n_estimators = 30,max_features='log2',bootstrap=True, max_depth=None xgb_opt = XGBRegressor(random_state=1).set_params(**opt_pars) xgb_opt.fit(df_X_train, df_Y_train.cnt) r2_train = xgb_opt.score(df_X_train, df_Y_train.cnt) r2_test = xgb_opt.score(df_X_test, df_Y_test.cnt) result = df_proc.compare_results("XGBoost", xgb_opt, output_train, df_X_train, output_test, df_X_test) return { "r2": [r2_train, r2_test], "R2": [result[1], result[2]], "plot": result[0] }
class XGBWrapper_regr(object): """ A wrapper for xgboost model so that we will have a single api for various models. """ def __init__(self): self.model = XGBRegressor() def fit(self, X_train, y_train, X_valid=None, y_valid=None, X_holdout=None, y_holdout=None, params=None): self.model = self.model.set_params(**params) eval_set = [(X_train, y_train)] if X_valid is not None: eval_set.append((X_valid, y_valid)) if X_holdout is not None: eval_set.append((X_holdout, y_holdout)) self.model.fit(X=X_train, y=y_train, eval_set=eval_set, eval_metric='rmse', verbose=params['verbose'], early_stopping_rounds=params['early_stopping_rounds']) scores = self.model.evals_result() self.best_score_ = {k: {m: m_v[-1] for m, m_v in v.items()} for k, v in scores.items()} # self.best_score_ = {k: {m: n if m != 'cappa' else -n for m, n in v.items()} for k, v in self.best_score_.items()} self.feature_importances_ = self.model.feature_importances_ def predict(self, X_test): return self.model.predict(X_test)
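# A hedged usage sketch for XGBWrapper_regr. The params dict must carry 'verbose'
# and 'early_stopping_rounds' because fit() reads them back out; the other keys and
# the synthetic data are illustrative assumptions, and an xgboost version whose
# fit() still accepts early_stopping_rounds is assumed.
import numpy as np

rng = np.random.RandomState(42)
X = rng.rand(400, 8)
y = X.sum(axis=1) + rng.normal(scale=0.1, size=400)
X_tr, y_tr, X_va, y_va = X[:300], y[:300], X[300:], y[300:]

params = {
    'n_estimators': 200,
    'learning_rate': 0.1,
    'max_depth': 4,
    'verbose': False,
    'early_stopping_rounds': 20,
}

wrapper = XGBWrapper_regr()
wrapper.fit(X_tr, y_tr, X_valid=X_va, y_valid=y_va, params=params)
print(wrapper.best_score_)
print(wrapper.predict(X_va)[:5])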
def xgboostmodel(self): df = pd.read_csv(datafile, encoding='utf-8', index_col=0) print(df.shape) traindata = df.iloc[:, :].values x = traindata[:, :-1] y = traindata[:, -1] x_train, x_test, y_train, y_test = train_test_split( x, y, train_size=0.7) # list if self.params is None: params = {'max_depth': 80, 'n_estimators': 512} else: params = self.params raw_model = XGBRegressor(max_depth=params['max_depth'], n_estimators=params['n_estimators'], learning_rate=0.01, silent=False) raw_model.fit(x_train, y_train) raw_model.save_model(self.model_file) pred = raw_model.predict(x_test) self.true = y_test self.pred = pred self.show_save_figure(fig_path=self.fig_path, modelname=self.job_name, detal_idx=500) t_mean = self.cal_mean(self.true) p_mean = self.cal_mean(self.pred) self.save_result(self.result_path, true_mean=t_mean, pred_mean=p_mean)
def train_xg_boost(params): xg_model = XGBRegressor(n_estimators=int(params['n_estimators']), learning_rate=params['eta'], n_jobs=-1, max_depth=int(params['max_depth']), gamma=params['gamma'], colsample_bytree=params['colsample_bytree'], min_child_weight=params['min_child_weight'], reg_alpha=params['xg_reg_alpha'], subsample=params['subsample'], reg_lambda=params['xg_reg_lambda'] ) # In[ ]: xg_model.fit(X_train.values, y_train.values) training_values = xg_model.predict(X_train.values) print(training_values) training_rmse = math.sqrt(mean_squared_error(y_train, training_values)) print("training_rmse", training_rmse) validation_values = xg_model.predict(X_validtn.values) validation_rmse = math.sqrt(mean_squared_error(y_validtn, validation_values)) print("validation_rmse", validation_rmse) """test_submission = pd.DataFrame() test_submission["Score"] = xg_model.predict(combined_test_data) test_submission.to_excel('submission4.xlsx', index=False)""" return { 'loss': validation_rmse, 'status': STATUS_OK, 'eval_time': time.time(), }
class XGBRegressorMetaPrim(primitive): def __init__(self, random_state=0): super(XGBRegressorMetaPrim, self).__init__(name='XGBRegressorMeta') self.hyperparams = [] self.type = 'ensemble' self.description = "XGBoost is an optimized distributed gradient boosting library designed to be highly efficient, flexible and portable." self.hyperparams_run = {'default': True} self.random_state = random_state self.model = XGBRegressor(random_state=self.random_state, n_jobs=5) self.accept_type = 'xgb' def can_accept(self, data): return self.can_accept_c(data, 'Regression') def is_needed(self, data): # data = handle_data(data) return True def fit(self, data): data = handle_data(data) self.model.fit(data['X'], data['Y']) def produce(self, data): output = handle_data(data) output['predictions'] = self.model.predict(output['X']) output['X'] = pd.DataFrame(output['predictions'], columns=[self.name+"Pred"]) final_output = {0: output} return final_output
def __train_model(self, features): combo_list = [ ['available_year_avg', 'min_nights_year_avg', 'price_year_avg'] # ['available_winter_avg', 'min_nights_winter_avg', 'price_winter_avg'], # ['available_spring_avg', 'min_nights_spring_avg', 'price_spring_avg'], # ['available_summer_avg', 'min_nights_summer_avg', 'price_summer_avg'] ] for combo in combo_list: X_base = features.drop([ 'price_year_avg', 'price_winter_avg', 'price_spring_avg', 'price_summer_avg', 'price_fall_avg', 'available_year_avg', 'available_winter_avg', 'available_spring_avg', 'available_summer_avg', 'available_fall_avg', 'min_nights_year_avg', 'min_nights_winter_avg', 'min_nights_spring_avg', 'min_nights_summer_avg', 'min_nights_fall_avg' ], axis=1) X_base[combo[0]] = features[combo[0]] X_base[combo[1]] = features[combo[1]] y = features[combo[2]] X_train, X_test, y_train, y_test = train_test_split(X_base, y, test_size=.25, random_state=42, shuffle=True) model = XGBRegressor( objective='reg:squarederror', learning_rate=0.1, max_depth=8, n_estimators=200, cv=5, n_jobs=-1 ) model.fit(X_train, y_train) self.logger.info('Gradient boost model:') self.logger.info(f'Target label: {combo[2]}') self.logger.info(f'R^2: {model.score(X_test, y_test)}') self.logger.info(f'MAE: {mean_absolute_error(y_test, model.predict(X_test))}') return model
def over_sample(train, test, feat): predictors = [x for x in train.columns if x not in ['ID', 'y']] groups = list(train[feat].unique()) result = None for name in groups: train_temp = pd.concat([train, train[train[feat] == name]]) test_temp = test[test[feat] == name] model = XGBRegressor(max_depth=4, learning_rate=0.0045, n_estimators=1250, silent=True, objective='reg:linear', nthread=-1, min_child_weight=1, max_delta_step=0, subsample=0.93, seed=27) model.fit(train_temp[predictors], train_temp['y']) pred = model.predict(test_temp[predictors]) if result is None: result = pd.DataFrame({'ID': test_temp['ID'].values, 'y': pred}) else: result = pd.concat([ result, pd.DataFrame({ 'ID': test_temp['ID'].values, 'y': pred }) ]) result.sort_values('ID', inplace=True) return result
def generate_XGB_model(train_df): train_df.drop(['conversionTime'], axis=1, inplace=True) print('Train And Fix Missing App Count Value...') train_df, xgb_appcount = train_model_for_appcounts(train_df) joblib.dump(xgb_appcount, 'XGB_missing.model') '''print('Train And Fix Missing Age Value...') train_df, xgb_age = train_model_for_age(train_df) joblib.dump(xgb_age, 'XGB_age.model')''' train_df.drop(['marriageStatus', 'haveBaby', 'sitesetID', 'positionType'], axis=1, inplace=True) print('Done') print(train_df.info()) print(train_df.describe()) print(train_df.isnull().sum()) train_np = train_df.values y = train_np[:, 0] X = train_np[:, 1:] print('Train Xgboost Model...') start_time = datetime.datetime.now() xgb_clf = XGBRegressor(n_estimators=100, max_depth=6, objective="binary:logistic", silent=False) xgb_clf.fit(X, y) end_time = datetime.datetime.now() print('Training Done..., Time Cost: %d' % ((end_time - start_time).seconds)) model_df = pd.DataFrame({ 'columns': list(train_df.columns)[1:], 'values': xgb_clf.feature_importances_ }) print(model_df) return xgb_clf
def get_ntree(): rmse_t_total, rmse_v_total = [], [] for ntree in range(10, 500, 10): xgb_base = XGBRegressor(objective='reg:linear', n_estimators=ntree, random_state=1234, silent=0, booster='gbtree', eval_metric='rmse') rmse_t_1, rmse_v_1 = [], [] print('current ntree = %s' % ntree) for train, test in get_cv(y=y_train, n_splits=5, random_state=42): X_t, y_t = X_train[train], y_train[train] X_v, y_v = X_train[test], y_train[test] xgb_base.fit(X_t, y_t) y_t_pre = xgb_base.predict(X_t) y_v_pre = xgb_base.predict(X_v) rmse_t_each = np.sqrt(mean_squared_error(y_t, y_t_pre)) rmse_v_each = np.sqrt(mean_squared_error(y_v, y_v_pre)) rmse_t_1.append(rmse_t_each) rmse_v_1.append(rmse_v_each) rmse_t = np.mean(rmse_t_1) rmse_v = np.mean(rmse_v_1) rmse_t_total.append(rmse_t) rmse_v_total.append(rmse_v) return rmse_t_total, rmse_v_total
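# get_ntree above iterates over a get_cv helper that is not defined in this
# collection. A minimal sketch under the assumption that it just yields
# (train_idx, test_idx) index pairs from a shuffled KFold:
import numpy as np
from sklearn.model_selection import KFold

def get_cv(y, n_splits=5, random_state=42):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    # KFold only needs the number of rows, so a dummy feature matrix is enough.
    for train_idx, test_idx in kf.split(np.zeros((len(y), 1))):
        yield train_idx, test_idx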
def xgboost_single_pred(self): x_train = self.x_train y_train = self.y_train x_test = self.x_test y_test = self.y_test self.y_pred_all_xgb = [] y_train = list(y_train) xgboost_clf = XGBRegressor(learning_rate=0.1, n_estimators=75) for i in range(len(x_test)): xgboost_clf.fit(x_train, y_train) x_test_one = x_test.iloc[i:i + 1] y_test_one = xgboost_clf.predict(x_test_one) self.y_pred_all_xgb.append(list(y_test_one)[0]) x_train = x_train.append(x_test_one) y_train.append(y_test[i]) xgboost_mse = mean_squared_error(self.y_test, self.y_pred_all_xgb) xgboost_rmse = np.sqrt(xgboost_mse) y_pred_all_xgb = pd.DataFrame(list(self.y_pred_all_xgb)) ratio_single_xgb = pd.DataFrame(list(self.y_test)) / y_pred_all_xgb return xgboost_rmse, y_pred_all_xgb, ratio_single_xgb
def fit_model_split(self, X_train, y_train, X_test, y_test): ## X_train_1 is used to fit the model; X_train_2 is combined with the new features to form the new training set X_train_1, X_train_2, y_train_1, y_train_2 = train_test_split( X_train, y_train, test_size=0.6, random_state=0) clf = XGBRegressor(learning_rate=self.learning_rate, n_estimators=self.n_estimators, max_depth=self.max_depth, min_child_weight=self.min_child_weight, gamma=self.gamma, subsample=self.subsample, colsample_bytree=self.colsample_bytree, objective=self.objective, nthread=self.nthread, scale_pos_weight=self.scale_pos_weight, reg_alpha=self.reg_alpha, reg_lambda=self.reg_lambda, seed=self.seed) clf.fit(X_train_1, y_train_1) # y_pre = clf.predict(X_train_2) # y_pro = clf.predict_proba(X_train_2)[:, 1] # print # "pred_leaf=T AUC Score : %f" % metrics.roc_auc_score(y_train_2, y_pro) # print # "pred_leaf=T Accuracy : %.4g" % metrics.accuracy_score(y_train_2, y_pre) new_feature = clf.apply(X_train_2) X_train_new2 = self.mergeToOne(X_train_2, new_feature) new_feature_test = clf.apply(X_test) X_test_new = self.mergeToOne(X_test, new_feature_test) print("Training set is 40% smaller than before") return X_train_new2, y_train_2, X_test_new, y_test
def fit_model(self, data, target, test): clf = XGBRegressor(learning_rate=self.learning_rate, n_estimators=self.n_estimators, max_depth=self.max_depth, min_child_weight=self.min_child_weight, gamma=self.gamma, subsample=self.subsample, colsample_bytree=self.colsample_bytree, objective=self.objective, nthread=self.nthread, scale_pos_weight=self.scale_pos_weight, reg_alpha=self.reg_alpha, reg_lambda=self.reg_lambda, seed=self.seed) data = np.array(data).astype(float) scaler = MinMaxScaler() temp = scaler.fit(data) data = scaler.transform(data) test = scaler.transform(test) target = scaler.fit_transform(target) clf.fit(data, target) new_feature = clf.apply(data) new_test = clf.apply(test) X_train_new = self.mergeToOne(pd.DataFrame(data), new_feature) X_test_new = self.mergeToOne(pd.DataFrame(test), new_test) X_train_new = pd.DataFrame(X_train_new) X_test_new = pd.DataFrame(X_test_new) return X_train_new, target, X_test_new
def FI_xgb_sklearn(): X, y = load_traindata(encodetype='le') cols = list(X.columns) rndcol = np.random.randn(X.shape[0]) X = np.column_stack((X, rndcol)) cols.append('random') xgb1 = XGBRegressor(learning_rate=0.01, n_estimators=3320, max_depth=3, min_child_weight=4, colsample_bytree=0.8, subsample=0.8, importance_type='total_gain', objective='reg:linear', n_jobs=-1, random_state=0, seed=27, silent=True) xgb1.fit(X, y) imp = sorted(list(zip(cols, xgb1.feature_importances_)), key=lambda t: abs(t[1]), reverse=True) imp = pd.DataFrame(imp, columns=['Feature', 'Importance']) rnd_idx = np.argwhere(imp['Feature'] == 'random')[0][0] print(imp.iloc[:rnd_idx + 1, :]) return imp
def xgt_regressor(lr, max_d, estimators, X_train, X_test, y_train, y_test, obj): rmse = 10000 for i in lr: for j in max_d: for k in estimators: clf=XGBRegressor(learning_rate=i, n_estimators=k, max_depth=j, min_child_weight=1, gamma=1, subsample=0.5, colsample_bytree=0.8, objective=obj, nthread=4, scale_pos_weight=1, missing=np.nan) clf.fit(X_train,y_train) y_pred=clf.predict(X_test) a,b,c = pred_eval(y_pred,y_test) if a < rmse: b_lr = i b_d = j b_e = k rmse = a clf_b = clf return clf_b, (b_lr, b_d, b_e)
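# xgt_regressor above scores each grid point with a pred_eval helper that is not
# included here. A minimal sketch under the assumption that it returns
# (rmse, mae, r2), the first value being the one compared against the running best
# inside the loop:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def pred_eval(y_pred, y_test):
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return rmse, mae, r2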
def xgbt_base_rmse_mode(train_input, train_target, test_input, test_target): param = { 'n_estimators': 10, 'learning_rate': 0.01, } adj_params = { 'n_estimators': [10, 50, 100, 200, 300, 400, 500, 1000], 'learning_rate': [0.01, 0.1, 1] } xgbt = XGBRegressor(**param) cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1) cscv = GridSearchCV(xgbt, adj_params, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1) cscv.fit(train_input, train_target) xgbt = XGBRegressor(**cscv.best_params_) xgbt.fit(train_input, train_target.ravel()) predicted = xgbt.predict(test_input) xgbt_base_rmse = np.sqrt(metrics.mean_squared_error( test_target, predicted)) print("xgbt_base_rmse: ", xgbt_base_rmse) #print ("RMSE:", np.sqrt(metrics.mean_squared_error(test_target, predicted))) return xgbt_base_rmse
def learn_model(X_train, y_train, X_valid, y_valid): t1 = time() model = XGBRegressor(max_depth=7, n_estimators=500) model.fit(X_train, y_train, eval_metric="rmse", eval_set=[(X_train, y_train), (X_valid, y_valid)], verbose=True, early_stopping_rounds=10) t2 = time() print('Total of training time: ', t2 - t1) return model
def apply_model(self, feature_names, cv_type=None, search_type=None, scorer=None): self.cv_type = cv_type self.search_type = search_type self.scorer = scorer if cv_type is None: print("Seed:", self.seed) x_gb = XGBRegressor(learning_rate=0.1, n_estimators=300, random_state=self.seed) x_gb.fit(self.X_train, self.y_train) else: if search_type == "random": parameter_grid = self._get_random_parameter_grid() print("Seed:", self.seed) x_gb_search = RandomizedSearchCV( estimator=XGBRegressor(n_estimators=300, random_state=self.seed), param_distributions=parameter_grid, scoring=self._get_scorer(), n_iter=150, cv=self._get_cv(), iid=False, random_state=self.seed, n_jobs=-1) x_gb_search.fit(self.X_train, self.y_train) x_gb = x_gb_search.best_estimator_ mean_test_score = x_gb_search.cv_results_["mean_test_score"] std_test_score = x_gb_search.cv_results_["std_test_score"] print(max(mean_test_score), std_test_score[np.argmax(mean_test_score)]) print(x_gb) elif search_type == "smac": smac = self._get_smac() try: incumbent = smac.optimize() finally: incumbent = smac.solver.incumbent x_gb = XGBRegressor( learning_rate=0.1, n_estimators=300, seed=self.seed, max_depth=incumbent["max_depth"], min_child_weight=incumbent["min_child_weight"], gamma=incumbent["gamma"], subsample=incumbent["subsample"], colsample_bytree=incumbent["colsample_bytree"]) x_gb.fit(self.X_train, self.y_train) print(x_gb) else: raise ValueError("search_type must be either random or smac.") train_prediction, test_prediction = self.apply_predict(x_gb) return train_prediction, test_prediction
def xgb_regressor(x, y): model = XGBRegressor(n_estimators=33, learning_rate=0.1, subsample=0.6, max_depth=1, objective='rank:pairwise', random_state=0) model.fit(x, y) return model
def get_feat_imp(train,ID='id',target='price_doc'): predictors = [x for x in train.columns if x not in [ID,target]] model = XGBRegressor( max_depth=5, learning_rate=0.05, n_estimators=385, silent=True, objective='reg:linear', nthread=-1, min_child_weight=1, max_delta_step=0, subsample=0.93, seed=27) model.fit(train[predictors],train[target]) feat_imp = pd.Series(model.booster().get_fscore(),index=predictors).sort_values(ascending=False) return feat_imp
def cv_test(train, cv=5): t0 = time.time() target = 'y' predictors = [x for x in train.columns if x not in ['ID', 'y']] train_X = train[predictors] train_Y = train[target] mean_r2 = [] kf = KFold(len(train_Y), n_folds=cv, shuffle=True, random_state=520) for i, (train_index, test_index) in enumerate(kf): x_train = train_X.iloc[train_index] x_test = train_X.iloc[test_index] y_train = train_Y.iloc[train_index] y_test = train_Y.iloc[test_index] lgb_model = LGBMRegressor(boosting_type='gbdt', num_leaves=10, max_depth=4, learning_rate=0.005, n_estimators=675, max_bin=25, subsample_for_bin=50000, min_split_gain=0, min_child_weight=5, min_child_samples=10, subsample=0.995, subsample_freq=1, colsample_bytree=1, reg_alpha=0, reg_lambda=0, seed=0, nthread=-1, silent=True) xgb_model = XGBRegressor(max_depth=4, learning_rate=0.0045, n_estimators=1250, silent=True, objective='reg:linear', nthread=-1, min_child_weight=1, max_delta_step=0, subsample=0.93, seed=27) xgb_model.fit(x_train, y_train) pred = xgb_model.predict(x_test) from sklearn.metrics import r2_score score = r2_score(y_test, pred) mean_r2.append(score) print('{0}: r2:{1}\n\n'.format(i + 1, score)) print('mean r2: %s' % (np.array(mean_r2).mean())) print('Done in %.1fs!' % (time.time() - t0)) return None
def run(): # Load data set X_train, Y_train, X_test, submission_file_content = load_data() Y_train = np.log(Y_train + 200) # Cross validation cross_validation_iterator = ShuffleSplit(n_splits=CROSS_VALIDATION_NUM, test_size=0.1, random_state=0) for cross_validation_index, (train_index, valid_index) in enumerate( cross_validation_iterator.split(X_train), start=1): print("Working on {}/{} ...".format(cross_validation_index, CROSS_VALIDATION_NUM)) submission_file_path = os.path.join( SUBMISSION_FOLDER_PATH, "submission_{}.csv".format(cross_validation_index)) if os.path.isfile(submission_file_path): continue model = XGBRegressor(learning_rate=0.01, max_depth=12, n_estimators=N_ESTIMATORS, silent=False, objective="reg:linear", gamma=1, min_child_weight=1, subsample=0.8, colsample_bytree=0.5, reg_alpha=1, seed=cross_validation_index, nthread=-1) model.fit(X_train[train_index], Y_train[train_index], eval_set=[(X_train[valid_index], Y_train[valid_index])], eval_metric=lambda y_predicted, y_true: ("actual_mae", mean_absolute_error(np.exp(y_true.get_label()), np.exp(y_predicted))), early_stopping_rounds=EARLY_STOPPING_ROUNDS, verbose=True) # Perform the testing procedure Y_test = model.predict(X_test) # Save submission to disk if not os.path.isdir(SUBMISSION_FOLDER_PATH): os.makedirs(SUBMISSION_FOLDER_PATH) submission_file_content[LABEL_COLUMN_NAME] = np.exp(Y_test) - 200 submission_file_content.to_csv(submission_file_path, index=False) # Perform ensembling ensemble_predictions() print("All done!")
def model(df, alpha): X = df y = df.pop('y') X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42) upper = _random_search( X_train, y_train, XGBOOSTQUANTILE(quant_alpha=1 - alpha / 2, quant_delta=1, quant_thres=6, quant_var=3.2), { 'quant_delta': uniform(.01, 12), 'quant_thres': uniform(1, 12), 'quant_var': uniform(1, 12) }).predict(X_test) lower = _random_search( X_train, y_train, XGBOOSTQUANTILE(quant_alpha=alpha / 2, quant_delta=1, quant_thres=6, quant_var=3.2), { 'quant_delta': uniform(.01, 12), 'quant_thres': uniform(1, 12), 'quant_var': uniform(1, 12) }).predict(X_test) median = _random_search( X_train, y_train, XGBOOSTQUANTILE(quant_alpha=.5, quant_delta=1, quant_thres=6, quant_var=3.2), { 'quant_delta': uniform(.01, 12), 'quant_thres': uniform(1, 12), 'quant_var': uniform(1, 12) }).predict(X_test) xgbls = XGBRegressor() xgbls.fit(X_train, y_train) mean = xgbls.predict(X_test) return pd.concat([ X_test.reset_index(drop=True), y_test.reset_index(drop=True), pd.DataFrame(upper, columns=['upper_bound']), pd.DataFrame(lower, columns=['lower_bound']), pd.DataFrame(mean, columns=['mean']), pd.DataFrame(median, columns=['median']) ], axis=1)
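# model() above depends on a _random_search helper and an XGBOOSTQUANTILE estimator
# that are not part of this collection. A minimal sketch of the _random_search
# contract assumed here: fit a RandomizedSearchCV over the given distributions and
# return the refit best estimator. The n_iter and cv values are illustrative only,
# and the estimator is assumed to follow the scikit-learn get_params/set_params API.
from sklearn.model_selection import RandomizedSearchCV

def _random_search(X_train, y_train, estimator, param_distributions,
                   n_iter=20, cv=3, random_state=0):
    search = RandomizedSearchCV(estimator, param_distributions, n_iter=n_iter,
                                cv=cv, random_state=random_state, n_jobs=-1)
    search.fit(X_train, y_train)
    return search.best_estimator_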
def xgb_regression(X_train,y_train,X_val, y_val,X_test,y_test,args): if y_test.shape[-1] == 1: model = XGBRegressor( learning_rate=0.1, max_depth=4, # 4 min_child_weight=10, gamma=1, # 1 subsample=0.8, colsample_bytree=0.8, reg_alpha=0.8, objective='reg:linear', n_estimators=2000, tree_method='gpu_hist', n_gpus=-1 ) model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='rmse', early_stopping_rounds=300) y_pred = model.predict(X_test) y_test = y_test.astype('float') MSE = mean_squared_error(y_test, y_pred) RMSE = MSE ** 0.5 return RMSE else: RMSEs = [] if len(y_train.shape) == 3: y_train = [x[0] for x in y_train] y_val = [x[0] for x in y_val] y_test = [x[0] for x in y_test] y_train = pd.DataFrame(y_train) y_val = pd.DataFrame(y_val) y_test = pd.DataFrame(y_test) for i in range(y_test.shape[1]): if float(max(y_val[i])) == 0 or float(max(y_train[i])) == 0 or float(max(y_test[i])) == 0: continue model = XGBRegressor( learning_rate=0.1, max_depth=4, # 4 min_child_weight=10, gamma=1, # 1 subsample=0.8, colsample_bytree=0.8, reg_alpha=0.8, objective='reg:linear', n_estimators=2000, tree_method='gpu_hist', n_gpus=-1 ) model.fit(X_train, [float(k) for k in y_train[i]], eval_set=[(X_val, [float(k) for k in y_val[i]])], eval_metric='rmse', early_stopping_rounds=300) y_pred = model.predict(X_test) y_test = y_test.astype('float') MSE = mean_squared_error(y_test[i], y_pred) RMSE = MSE ** 0.5 RMSEs.append(RMSE) return np.mean(RMSEs)
def run_xgb(**args): print("building xgb model:") xgb_model = XGBRegressor() xgb_model.fit(args["training_data"], args["training_label"]) output = xgb_model.predict(args["test_data"]) pickle.dump(xgb_model, open("xgb_testmodel.p", "wb")) output = list(map(lambda e: round(e), output)) print(output) pickle.dump(output, open("xgb_output.p", "wb")) return output
def fit_model(X, y, model_name): print('Model Fitting started: ', datetime.now()) start_time = time.time() classifier = XGBRegressor(objective='reg:squarederror', n_jobs=8, n_estimators= 1000, verbosity= 3) classifier.fit(X, y) pickle.dump(classifier, open("models/{}.pickle.dat".format(model_name), 'wb')) print('Duration Fitting: ', (time.time() - start_time)) return classifier
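# A hedged companion to the pickling fit_model above: reloading the saved regressor
# for later scoring. The path mirrors the format string used when saving;
# model_name is whatever was passed to fit_model.
import pickle

def load_model(model_name):
    with open("models/{}.pickle.dat".format(model_name), 'rb') as fh:
        return pickle.load(fh)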
def xgb(x_train, y_train, x_val, y_val): xgb = XGBRegressor(n_estimators=1000, max_depth=10, learning_rate=0.01, subsample=0.8, colsample_bytree=0.8, random_state=2000) xgb.fit(x_train, y_train) result = xgb.predict(x_val) score = mean_absolute_error(result, y_val) return score
class Xgb: def __init__(self, df, target_column='', id_column='', target_type='binary', categorical_columns=[], drop_columns=[], numeric_columns=[], num_training_rounds=500, verbose=1, sample_fraction=1.0, n_samples=1, early_stopping_rounds=None, prefix='xgb_model', scoring=None): """ input params: - df (DataFrame): dataframe of training data - target_column (string): name of target column - id_column (string): name of id column - target_type (string): 'linear' or 'binary' - categorical_columns (list): list of column names of categorical data. Will perform one-hot encoding - drop_columns (list): list of columns to drop - numeric_columns (list): list of columns to convert to numeric - verbose (bool): verbosity of printouts """ # checks for sampling sample_fraction = float(sample_fraction) if sample_fraction > 1: sample_fraction = 1.0 if sample_fraction * n_samples > 1: n_samples = round(1.0/sample_fraction) if sample_fraction <= 0: print('sample_fraction 0 or negative, switching to 0.1') sample_fraction = 0.1 # if sample_fraction is results in sample smaller than 1 if round(sample_fraction * len(df)) == 0: sample_fraction = 1.0/len(df) # check if data is dataframe if type(df) == pd.core.frame.DataFrame: self.df = df self.early_stopping_rounds = early_stopping_rounds if target_column: self.target_column = target_column self.id_column = id_column self.target_type = target_type self.categorical_columns = categorical_columns self.numeric_columns = numeric_columns self.drop_columns = drop_columns self.verbose = verbose self.sample_fraction = sample_fraction self.n_samples = n_samples self.num_training_rounds = num_training_rounds self.prefix = prefix # init the classifier: if self.target_type == 'binary': self.scoring = 'auc' self.clf = XGBClassifier( learning_rate =0.1, n_estimators = num_training_rounds, subsample = 0.8, colsample_bytree = 0.8, objective = 'binary:logistic', scale_pos_weight = 1, seed = 123) elif self.target_type == 'multiclass': self.scoring = 'merror' self.clf = XGBClassifier( learning_rate =0.1, n_estimators = num_training_rounds, subsample = 0.8, colsample_bytree = 0.8, objective = 'multi:softmax', scale_pos_weight = 1, seed = 123) elif self.target_type == 'linear': self.scoring = 'rmse' self.clf = XGBRegressor( n_estimators = num_training_rounds, objective = 'reg:linear' ) # if preferred scoring metric is stated: if scoring: self.scoring = scoring else: print('please provide target column name') else: print('please provide pandas dataframe') def train(self): print('#### preprocessing ####') self.df = self.preprocess(self.df) print('#### training ####') self.predictors = [x for x in self.df.columns if x not in [self.target_column, self.id_column]] xgb_param = self.clf.get_xgb_params() # if subsampling if self.sample_fraction == 1.0: df_list = [self.df] else: df_list = self.random_sample(df=self.df, fraction=self.sample_fraction, n_samples=self.n_samples) print(df_list) for idx, current_df in enumerate(df_list): print('ITERATION ' + str(idx) + ' of ' + str(self.n_samples) +', sample_fraction=' + str(self.sample_fraction)) xgtrain = xgb.DMatrix(current_df[self.predictors], label=current_df[self.target_column], missing=np.nan) try: cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5, metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds, show_progress=self.verbose) except: try: cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5, metrics=[self.scoring], 
early_stopping_rounds=self.early_stopping_rounds, verbose_eval=self.verbose) except: xgb_param['num_class'] = len(current_df[self.target_column].unique()) cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5, metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds) self.clf.set_params(n_estimators=cvresult.shape[0]) self.clf.fit(current_df[self.predictors], current_df[self.target_column], eval_metric=self.scoring) #Predict training set: train_df_predictions = self.clf.predict(current_df[self.predictors]) if self.target_type == 'binary' or self.target_type == 'multiclass': train_df_predprob = self.clf.predict_proba(current_df[self.predictors])[:,1] print("Accuracy : %.4g" % metrics.accuracy_score(current_df[self.target_column].values, train_df_predictions)) if self.target_type == 'binary': print("AUC Score (Train): %f" % metrics.roc_auc_score(current_df[self.target_column], train_df_predprob)) elif self.target_type == 'linear': print("Mean squared error: %f" % metrics.mean_squared_error(current_df[self.target_column].values, train_df_predictions)) print("Root mean squared error: %f" % np.sqrt(metrics.mean_squared_error(current_df[self.target_column].values, train_df_predictions))) filename = self.prefix + '_' + str(idx) + '.pkl' self.save(filename) def predict(self, test_df, return_multi_outputs=False, return_mean_std=False): print('### predicting ###') print('## preprocessing test set') if self.id_column in test_df: ids = test_df[self.id_column] if self.target_column in test_df.columns: targets = test_df[self.target_column] self.test_df = self.preprocess(test_df, train=False) if self.id_column in test_df: self.test_df[self.id_column] = ids if self.target_column in test_df.columns: self.test_df[self.target_column] = targets for col in self.predictors: if col not in self.test_df.columns: self.test_df[col] = np.nan # prediction print('## predicting from test set') output_list = [] output = None for idx, ns in enumerate(range(self.n_samples)): if self.n_samples == 1: xgb = self if self.target_type == 'binary': output = xgb.clf.predict_proba(self.test_df[self.predictors])[:,1] elif self.target_type == 'linear': output = xgb.clf.predict(self.test_df[self.predictors]) else: try: filename = self.prefix + '_' + str(idx) + '.pkl' xgb = self.load(filename) if self.target_type == 'binary': output = xgb.clf.predict_proba(self.test_df[self.predictors])[:,1] elif self.target_type == 'linear': output = xgb.clf.predict(self.test_df[self.predictors]) output_list.append(list(output)) except IOError: print('no file found, skipping') # average the outputs if n_samples is more than one if self.n_samples == 1: self.output = output try: self.multi_outputs = [list(output)] except: self.multi_outputs = None else: self.output = np.mean(output_list, axis=0) self.multi_outputs = output_list if return_multi_outputs: return self.multi_outputs elif return_mean_std: return (self.output, np.std(output_list, axis=0)) else: return self.output def feature_importance(self, num_print=10, display=True): feature_importance = sorted(list(self.clf.booster().get_fscore().items()), key = operator.itemgetter(1), reverse=True) impt = pd.DataFrame(feature_importance) impt.columns = ['feature', 'importance'] print(impt[:num_print]) if display: impt[:num_print].plot("feature", "importance", kind="barh", color=sns.color_palette("deep", 3)) def preprocess(self, df, train=True): # one hot encoding of categorical variables print('## one hot encoding of categorical variables') 
for col in self.categorical_columns: if self.verbose: print('one hot encoding: ', col) df = pd.concat([df, pd.get_dummies(df[col]).rename(columns=lambda x: col+'_'+str(x))], axis=1) df = df.drop([col], axis=1) # if training, determine columns to be removed if train: # drop columns that are too sparse to be informative self.cols_to_remove = [] print('## dropping columns below sparsity threshold') for col in df.columns: nan_cnt = 0 for x in df[col]: try: if np.isnan(x): nan_cnt += 1 except: pass if nan_cnt/float(len(df[col])) > 0.6: # arbitrary cutoff, if more than 60% missing then drop if self.verbose: print('will drop', col) self.cols_to_remove.append(col) # drop columns that have no standard deviation (not informative) print('## dropping columns with no variation') for col in df.columns: if df[col].dtype == 'int64' or df[col].dtype == 'float64': if df[col].std() == 0: print('will drop', col) self.cols_to_remove.append(col) if self.verbose and self.cols_to_remove: print('dropping the following columns:', self.cols_to_remove) df = df.drop(self.cols_to_remove, axis=1) if self.verbose: print('## DataFrame shape is now:', df.shape) # convert to numerical where possible #print('## converting numerical data to numeric dtype') #df = df.convert_objects(convert_numeric=True) # convert columns specified to be int and float for col in self.numeric_columns: if self.verbose: print('converting', col) df[col] = pd.to_numeric(df[col], errors='coerce') if self.verbose: print(df[col].dtype) df = df.drop(self.drop_columns, axis=1) # drop all those that are object type print('## dropping non-numerical columns') for col in df.columns: if df[col].dtype == 'int64' or df[col].dtype == 'float64' or df[col].dtype == 'bool': pass else: if self.verbose: print('dropping because not int, float, or bool:', col) df = df.drop([col], axis=1) return df def _to_int(self, num): try: return int(num) except: return def _to_float(self, num): try: return float(num) except: return def random_sample(self, df, fraction=0.2, n_samples=None): """ splits into random samples - n_samples: how many samples you want returned (default = All) - fraction : what fraction of data to include in the sample (default = 0.2) """ print('constructing random samples') num_rows = len(df) len_sample = round(fraction * num_rows) # create list of slice index lists indices = range(0,num_rows) slice_list = [] tmp_idx_list = [] while len(indices) > 0: while len(tmp_idx_list) < len_sample and len(indices) > 0: idx = indices.pop(random.randrange(len(indices))) tmp_idx_list.append(idx) slice_list.append(tmp_idx_list) tmp_idx_list = [] # get slices sample_list = [] for s in range(n_samples): try: sample_list.append(df.loc[slice_list[s],:]) except: pass return sample_list def write_csv(self, filename, include_actual=False): """ write results to csv - include actual: if actual values are known for test set, and we want to print them """ with open(filename, 'wb') as csvfile: writer = csv.writer(csvfile) headers = [self.id_column, self.target_column] if include_actual: headers.append('actual') writer.writerow(headers) try: for idx, value in enumerate(self.output): test_id = self.test_df[self.id_column][idx] test_output = self.output[idx] to_write = [test_id, test_output] if include_actual: to_write.append(self.test_df[self.target_column][idx]) writer.writerow(to_write) print('results written to ' + filename) except: print('write_csv failed') def save(self, filename='xgb.pkl'): joblib.dump(self, filename) def load(self, model_file='xgb.pkl'): xgb = 
joblib.load(model_file) return xgb
pd.Series(rf_age_valid_pred).isnull().sum() statistics.mean(error_df.sqerr) len(rf_age_valid_pred) len(age_valid_Y) pd.crosstab(pd.Series(rf_age_valid_pred).apply(lambda x: round(x)),age_valid_Y) ## XGB for age prediction from xgboost.sklearn import XGBRegressor xgb = XGBRegressor(max_depth=6, learning_rate=0.2, n_estimators=100, objective='reg:linear', subsample=0.5, colsample_bytree=0.5, seed=321) eval_set = [(mvt_train_X.drop(['age', 'gender'], axis=1), age_train_Y), (mvt_valid_X.drop(['age', 'gender'], axis=1),age_valid_Y)] xgb.fit(mvt_train_X.drop(['age', 'gender'], axis=1), age_train_Y, eval_set = eval_set, eval_metric= 'rmse',early_stopping_rounds= 10, verbose=1) xgb_age_valid_pred = xgb.predict(mvt_valid_X.drop(['age', 'gender'], axis=1)) ## ADAboost for age prediction from sklearn.ensemble import AdaBoostRegressor ada = AdaBoostRegressor(n_estimators=50,learning_rate=0.1,loss='linear', random_state=321) ada.fit(mvt_train_X.drop(['age', 'gender'], axis=1), age_train_Y.values,) ada_age_valid_pred = ada.predict(mvt_valid_X.drop(['age', 'gender'], axis=1)) len(ada_age_valid_pred) len(age_valid_Y) error_df = pd.DataFrame(pd.Series(ada_age_valid_pred), columns=['pred']) error_df.reset_index( inplace=True) act_df = pd.DataFrame(age_valid_Y)
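# The error_df.sqerr column read near the top of this fragment is never built in
# it. A hedged sketch of how it might be completed from the pieces assembled above
# (aligning predictions with the actual ages positionally and adding a
# squared-error column; the column names are assumptions):
import pandas as pd

error_df['actual'] = pd.Series(age_valid_Y).reset_index(drop=True)
error_df['sqerr'] = (error_df['pred'] - error_df['actual']) ** 2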
gsearch7.best_params_ #{'reg_alpha': 0.09} xgb1.set_params(reg_alpha= 0.09) #parameters of algorithm xgb1=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bytree=0.7, gamma=0, learning_rate=0.5, max_delta_step=0, max_depth=7, min_child_weight=4, missing=None, n_estimators=1000, n_jobs=1, nthread=4, objective='reg:gamma', random_state=0, reg_alpha=0.09, reg_lambda=1, scale_pos_weight=1, seed=1048, silent=True, subsample=0.86) ################### #Fit the algorithm on the data xgb1.fit(train_x, train_y,eval_metric='rmse') ### train vs test pre = xgb1.predict(test_x) plt.figure(figsize=(16,9)) plt.style.use('ggplot') plt.plot(pre,label='predict_load1') plt.plot(np.array(test_y),label='test_load1') plt.title('Test_prediction MAE=%s' % str(np.sum(abs(pre-test_y))/len(pre))) plt.legend(loc='upper left') plt.savefig("D:\\load Forecasting\\plot\\TraiVsTest.jpg") #####Newdata pre1=xgb1.predict(load1_test.drop('load1',1)) plt.figure(figsize=(16,9))
class Xgb: def __init__(self, df, target_column='', id_column='', target_type='binary', categorical_columns=[], drop_columns=[], numeric_columns=[], num_training_rounds=500, verbose=1, early_stopping_rounds=None): """ input params: - df (DataFrame): dataframe of training data - target_column (string): name of target column - id_column (string): name of id column - target_type (string): 'linear' or 'binary' - categorical_columns (list): list of column names of categorical data. Will perform one-hot encoding - drop_columns (list): list of columns to drop - numeric_columns (list): list of columns to convert to numeric - verbose (bool): verbosity of printouts """ if type(df) == pd.core.frame.DataFrame: self.df = df self.early_stopping_rounds = early_stopping_rounds if target_column: self.target_column = target_column self.id_column = id_column self.target_type = target_type self.categorical_columns = categorical_columns self.numeric_columns = numeric_columns self.drop_columns = drop_columns self.verbose = verbose self.num_training_rounds = num_training_rounds # init the classifier if self.target_type == 'binary': self.scoring = 'auc' self.clf = XGBClassifier( learning_rate =0.1, n_estimators = num_training_rounds, subsample = 0.8, colsample_bytree = 0.8, objective = 'binary:logistic', scale_pos_weight = 1, seed = 123) elif self.target_type == 'linear': self.scoring = 'rmse' self.clf = XGBRegressor( n_estimators = num_training_rounds, objective = 'reg:linear' ) else: print('please provide target column name') else: print('please provide pandas dataframe') def train(self): print('#### preprocessing ####') self.df = self.preprocess(self.df) print('#### training ####') self.predictors = [x for x in self.df.columns if x not in [self.target_column, self.id_column]] xgb_param = self.clf.get_xgb_params() xgtrain = xgb.DMatrix(self.df[self.predictors], label=self.df[self.target_column], missing=np.nan) try: cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5, metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds, show_progress=self.verbose) except: try: cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5, metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds, verbose_eval=self.verbose) except: cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5, metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds) self.clf.set_params(n_estimators=cvresult.shape[0]) self.clf.fit(self.df[self.predictors], self.df[self.target_column],eval_metric=self.scoring) #Predict training set: train_df_predictions = self.clf.predict(self.df[self.predictors]) if self.target_type == 'binary': train_df_predprob = self.clf.predict_proba(self.df[self.predictors])[:,1] print("Accuracy : %.4g" % metrics.accuracy_score(self.df[self.target_column].values, train_df_predictions)) print("AUC Score (Train): %f" % metrics.roc_auc_score(self.df[self.target_column], train_df_predprob)) elif self.target_type == 'linear': print("Mean squared error: %f" % metrics.mean_squared_error(self.df[self.target_column].values, train_df_predictions)) print("Root mean squared error: %f" % np.sqrt(metrics.mean_squared_error(self.df[self.target_column].values, train_df_predictions))) def predict(self, test_df): print('### predicting ###') print('## preprocessing test set') if self.id_column in test_df: ids = test_df[self.id_column] if self.target_column in 
test_df.columns: targets = test_df[self.target_column] self.test_df = self.preprocess(test_df, train=False) if self.id_column in test_df: self.test_df[self.id_column] = ids if self.target_column in test_df.columns: self.test_df[self.target_column] = targets for col in self.predictors: if col not in self.test_df.columns: self.test_df[col] = np.nan if self.target_type == 'binary': self.output = self.clf.predict_proba(self.test_df[self.predictors])[:,1] elif self.target_type == 'linear': self.output = self.clf.predict(self.test_df[self.predictors]) return self.output def feature_importance(self, num_print=10, display=True): feature_importance = sorted(list(self.clf.booster().get_fscore().items()), key = operator.itemgetter(1), reverse=True) impt = pd.DataFrame(feature_importance) impt.columns = ['feature', 'importance'] print(impt[:num_print]) if display: impt[:num_print].plot("feature", "importance", kind="barh", color=sns.color_palette("deep", 3)) def preprocess(self, df, train=True): # one hot encoding of categorical variables print('## one hot encoding of categorical variables') for col in self.categorical_columns: if self.verbose: print('one hot encoding: ', col) df = pd.concat([df, pd.get_dummies(df[col]).rename(columns=lambda x: col+'_'+str(x))], axis=1) df = df.drop([col], axis=1) # if training, determine columns to be removed if train: # drop columns that are too sparse to be informative self.cols_to_remove = [] print('## dropping columns below sparsity threshold') for col in df.columns: nan_cnt = 0 for x in df[col]: try: if np.isnan(x): nan_cnt += 1 except: pass if nan_cnt/float(len(df[col])) > 0.6: # arbitrary cutoff, if more than 60% missing then drop if self.verbose: print('will drop', col) self.cols_to_remove.append(col) # drop columns that have no standard deviation (not informative) print('## dropping columns with no variation') for col in df.columns: if df[col].dtype == 'int64' or df[col].dtype == 'float64': if df[col].std() == 0: print('will drop', col) self.cols_to_remove.append(col) if self.verbose and self.cols_to_remove: print('dropping the following columns:', self.cols_to_remove) df = df.drop(self.cols_to_remove, axis=1) if self.verbose: print('## DataFrame shape is now:', df.shape) # convert to numerical where possible #print('## converting numerical data to numeric dtype') #df = df.convert_objects(convert_numeric=True) # convert columns specified to be int and float for col in self.numeric_columns: if col not in self.cols_to_remove: if self.verbose: print('converting', col) df[col] = pd.to_numeric(df[col], errors='coerce') if self.verbose: print(df[col].dtype) # drop those marked for dropping df = df.drop(self.drop_columns, axis=1) # drop all those that are object type print('## dropping non-numerical columns') for col in df.columns: if df[col].dtype == 'int64' or df[col].dtype == 'float64' or df[col].dtype == 'bool': pass else: if self.verbose: print('dropping because not int, float, or bool:', col) df = df.drop([col], axis=1) return df def _to_int(self, num): try: return int(num) except: return def _to_float(self, num): try: return float(num) except: return def write_csv(self, filename, include_actual=False): """ write results to csv - include actual: if actual values are known for test set, and we want to print them """ with open(filename, 'wb') as csvfile: writer = csv.writer(csvfile) headers = [self.id_column, self.target_column] if include_actual: headers.append('actual') writer.writerow(headers) for idx, value in enumerate(self.output): test_id = 
self.test_df[self.id_column][idx] test_output = self.output[idx] to_write = [test_id, test_output] if include_actual: to_write.append(self.test_df[self.target_column][idx]) writer.writerow(to_write) def save(self, filename='xgb.pkl'): joblib.dump(self, filename)