    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}

print('train shape', x_train.shape)
# x_train.to_csv('../subs/naive2.csv', index=True)

dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test)

cv_output = xgb.cv(xgb_params, dtrain, num_boost_round=1000,
                   early_stopping_rounds=20, verbose_eval=20, show_stdv=False)
cv_output[['train-rmse-mean', 'test-rmse-mean']].plot()

# xgb.cv truncates its output at the early-stopping point, so the row count
# is the number of rounds to train for.
num_boost_rounds = len(cv_output)
print('num rounds:', num_boost_rounds)

model = xgb.train(xgb_params, dtrain, num_boost_round=num_boost_rounds)

fig, ax = plt.subplots(1, 1, figsize=(8, 13))
xgb.plot_importance(model, height=0.5, ax=ax)
plt.show()

y_predict = model.predict(dtest)
output = pd.DataFrame({'id': id_test, 'price_doc': y_predict})
output.to_csv('../subs/xgbSub_seed255.csv', index=False)
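# A minimal alternative sketch (assuming the same xgb_params/dtrain as above): the
# best round count can also be read off the cv table explicitly instead of via len().
best_rounds = int(cv_output['test-rmse-mean'].idxmin()) + 1  # equals len(cv_output) here
model = xgb.train(xgb_params, dtrain, num_boost_round=best_rounds)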
def train_helper(X_train, X_test, y_train, y_test, model_name):
    xg_train = xgboost.DMatrix(X_train, label=y_train)
    xg_test = xgboost.DMatrix(X_test, label=y_test)
    le = load_label_encoder(model_name)

    # multi-class classification with per-class probabilities (softprob)
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.002
    param['max_depth'] = 7
    param['nthread'] = 7
    param['num_class'] = len(le.classes_)
    param['eval_metric'] = 'merror'

    evals = [(xg_train, 'train'), (xg_test, 'eval')]

    # Train xgboost and time it
    print("Training classifier...")
    t1 = time.time()
    bst = xgboost.train(param, xg_train, 500, evals, early_stopping_rounds=10)
    xgboost.plot_importance(bst)
    t2 = time.time()
    print(t2 - t1)

    bst.save_model(classifier_filename(model_name))
    return bst
def test_importance_plot_lim(self):
    np.random.seed(1)
    dm = xgb.DMatrix(np.random.randn(100, 100), label=[0, 1] * 50)
    bst = xgb.train({}, dm)
    assert len(bst.get_fscore()) == 71

    ax = xgb.plot_importance(bst)
    assert ax.get_xlim() == (0., 11.)
    assert ax.get_ylim() == (-1., 71.)

    ax = xgb.plot_importance(bst, xlim=(0, 5), ylim=(10, 71))
    assert ax.get_xlim() == (0., 5.)
    assert ax.get_ylim() == (10., 71.)
def run_xgb(train, test, features, target, random_state=0):
    eta = 0.02
    max_depth = 5
    subsample = 0.75
    colsample_bytree = 0.7
    start_time = time.time()

    print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(
        eta, max_depth, subsample, colsample_bytree))
    params = {
        "objective": "multi:softprob",
        "num_class": 12,
        "booster": "gbtree",
        "eval_metric": "mlogloss",
        "eta": eta,
        "max_depth": max_depth,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "silent": 1,
        "seed": random_state,
    }
    num_boost_round = 500 * 2
    early_stopping_rounds = 50
    test_size = 0.3

    X_train, X_valid = train_test_split(train, test_size=test_size, random_state=random_state)
    print('Length train:', len(X_train.index))
    print('Length valid:', len(X_valid.index))
    y_train = X_train[target]
    y_valid = X_valid[target]
    dtrain = xgb.DMatrix(X_train[features], y_train)
    dvalid = xgb.DMatrix(X_valid[features], y_valid)

    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist,
                    early_stopping_rounds=early_stopping_rounds, verbose_eval=True)

    print("Feature importance:")
    xgb.plot_importance(gbm)
    plt.show()
    # time.sleep(60*5)

    print("Validating...")
    check = gbm.predict(xgb.DMatrix(X_valid[features]), ntree_limit=gbm.best_iteration)
    score = log_loss(y_valid.tolist(), check)

    print("Predict test set...")
    test_prediction = gbm.predict(xgb.DMatrix(test[features]), ntree_limit=gbm.best_iteration)

    print('Training time: {} minutes'.format(round((time.time() - start_time) / 60, 2)))
    return test_prediction.tolist(), score
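# Hedged side note on the snippet above, for pre-2.0 xgboost: best_iteration is the
# zero-based index of the best round, so ntree_limit=gbm.best_iteration leaves out the
# best tree. best_ntree_limit (best_iteration + 1) includes it:
check = gbm.predict(xgb.DMatrix(X_valid[features]), ntree_limit=gbm.best_ntree_limit)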
def run_train_validation(self):
    x_train, y_train, x_validation, y_validation = self.get_train_validationset()
    dtrain = xgb.DMatrix(x_train, label=y_train, feature_names=x_train.columns)
    dvalidation = xgb.DMatrix(x_validation, label=y_validation, feature_names=x_validation.columns)
    self.set_xgb_parameters()

    evals = [(dtrain, 'train'), (dvalidation, 'eval')]
    model = xgb.train(self.xgb_params, dtrain, evals=evals, **self.xgb_learning_params)
    xgb.plot_importance(model)
    plt.show()

    print("features used:\n {}".format(self.get_used_features()))
    return
def test_sklearn_plotting():
    tm._skip_if_no_sklearn()
    from sklearn.datasets import load_iris

    iris = load_iris()

    classifier = xgb.XGBClassifier()
    classifier.fit(iris.data, iris.target)

    import matplotlib
    matplotlib.use('Agg')

    from matplotlib.axes import Axes
    from graphviz import Digraph

    ax = xgb.plot_importance(classifier)
    assert isinstance(ax, Axes)
    assert ax.get_title() == 'Feature importance'
    assert ax.get_xlabel() == 'F score'
    assert ax.get_ylabel() == 'Features'
    assert len(ax.patches) == 4

    g = xgb.to_graphviz(classifier, num_trees=0)
    assert isinstance(g, Digraph)

    ax = xgb.plot_tree(classifier, num_trees=0)
    assert isinstance(ax, Axes)
def test_plotting(self):
    bst2 = xgb.Booster(model_file='xgb.model')
    # plotting
    import matplotlib
    matplotlib.use('Agg')

    from matplotlib.axes import Axes
    from graphviz import Digraph

    ax = xgb.plot_importance(bst2)
    assert isinstance(ax, Axes)
    assert ax.get_title() == 'Feature importance'
    assert ax.get_xlabel() == 'F score'
    assert ax.get_ylabel() == 'Features'
    assert len(ax.patches) == 4

    ax = xgb.plot_importance(bst2, color='r', title='t', xlabel='x', ylabel='y')
    assert isinstance(ax, Axes)
    assert ax.get_title() == 't'
    assert ax.get_xlabel() == 'x'
    assert ax.get_ylabel() == 'y'
    assert len(ax.patches) == 4
    for p in ax.patches:
        assert p.get_facecolor() == (1.0, 0, 0, 1.0)  # red

    ax = xgb.plot_importance(bst2, color=['r', 'r', 'b', 'b'],
                             title=None, xlabel=None, ylabel=None)
    assert isinstance(ax, Axes)
    assert ax.get_title() == ''
    assert ax.get_xlabel() == ''
    assert ax.get_ylabel() == ''
    assert len(ax.patches) == 4
    assert ax.patches[0].get_facecolor() == (1.0, 0, 0, 1.0)  # red
    assert ax.patches[1].get_facecolor() == (1.0, 0, 0, 1.0)  # red
    assert ax.patches[2].get_facecolor() == (0, 0, 1.0, 1.0)  # blue
    assert ax.patches[3].get_facecolor() == (0, 0, 1.0, 1.0)  # blue

    g = xgb.to_graphviz(bst2, num_trees=0)
    assert isinstance(g, Digraph)

    ax = xgb.plot_tree(bst2, num_trees=0)
    assert isinstance(ax, Axes)
def save_topn_features(self, fname="XGBRegressor_topn_features.txt", topn=-1):
    ax = xgb.plot_importance(self.model)
    yticklabels = ax.get_yticklabels()[::-1]
    if topn == -1:
        topn = len(yticklabels)
    else:
        topn = min(topn, len(yticklabels))
    with open(fname, "w") as f:
        for i in range(topn):
            f.write("%s\n" % yticklabels[i].get_text())
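# A minimal equivalent sketch that reads importances from the booster directly rather
# than scraping plot tick labels (assumes `model` is a trained xgb.Booster;
# save_topn_features_direct is a hypothetical name):
def save_topn_features_direct(model, fname="XGBRegressor_topn_features.txt", topn=-1):
    scores = model.get_score(importance_type='weight')  # feature -> split count
    names = sorted(scores, key=scores.get, reverse=True)
    if topn != -1:
        names = names[:topn]
    with open(fname, "w") as f:
        f.write("\n".join(names) + "\n")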
def plot_feat_importances():
    # XGBoost importances
    gbm = xgboost.XGBClassifier(silent=False, seed=8).fit(X_train, y_train)
    plot = xgboost.plot_importance(gbm)
    ticks = plot.set_yticklabels(df_xgb.columns)

    # Random forest importances with std-dev error bars
    importances = rf.feature_importances_
    std = np.std([tree.feature_importances_ for tree in rf.estimators_], axis=0)
    indices = np.argsort(importances)
    plt.barh(range(len(indices)), importances[indices], yerr=std[indices], color='lightblue')
    ticks = plt.yticks(range(len(indices)), df_xgb.columns)
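# Caution on the snippet above: plot_importance sorts bars by score, so relabeling the
# ticks with df_xgb.columns (dataset order) can mislabel bars. A sketch of a safer
# route, assuming df_xgb holds the training features: fit on the DataFrame itself so
# the booster keeps the real column names and no relabeling is needed.
gbm = xgboost.XGBClassifier(seed=8).fit(df_xgb, y_train)
xgboost.plot_importance(gbm)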
def plot_importance(self, ax=None, height=0.2, xlim=None, title='Feature importance',
                    xlabel='F score', ylabel='Features', grid=True, **kwargs):
    """Plot importance based on fitted trees.

    Parameters
    ----------
    ax : matplotlib Axes, default None
        Target axes instance. If None, new figure and axes will be created.
    height : float, default 0.2
        Bar height, passed to ax.barh()
    xlim : tuple, default None
        Tuple passed to axes.xlim()
    title : str, default "Feature importance"
        Axes title. To disable, pass None.
    xlabel : str, default "F score"
        X axis title label. To disable, pass None.
    ylabel : str, default "Features"
        Y axis title label. To disable, pass None.
    grid : bool, default True
        Whether to draw grid lines.
    kwargs :
        Other keywords passed to ax.barh()

    Returns
    -------
    ax : matplotlib Axes
    """
    import xgboost as xgb

    if not isinstance(self._df.estimator, xgb.XGBModel):
        raise ValueError('estimator must be XGBRegressor or XGBClassifier')
    # forward `grid` rather than hard-coding True, so the argument takes effect
    return xgb.plot_importance(self._df.estimator.booster(),
                               ax=ax, height=height, xlim=xlim, title=title,
                               xlabel=xlabel, ylabel=ylabel, grid=grid, **kwargs)
import xgboost as xgb

# Subset the data and set up model parameters
offset = 5000
num_round = 500
xgtest = xgb.DMatrix(test)

gb_params = {"objective": "reg:linear", "eta": 0.01, "min_child_weight": 6,
             "subsample": 0.7, "colsample_bytree": 0.7, "scale_pos_weight": 1,
             "silent": 1, "max_depth": 8}

# Create train and validation DMatrices
xgtrain = xgb.DMatrix(train[offset:, :], label=labels[offset:])
xgval = xgb.DMatrix(train[:offset, :], label=labels[:offset])

# Train model and predict test values
watchlist = [(xgtrain, 'train'), (xgval, 'val')]
model = xgb.train(gb_params, xgtrain, num_round, watchlist, early_stopping_rounds=4)
xg_preds = model.predict(xgtest, ntree_limit=model.best_iteration)

xgb.plot_importance(model)
fscore = [(v, k) for k, v in model.get_fscore().items()]
fscore.sort(reverse=True)

# Send predicted scores to csv file
submission = pd.DataFrame({"Id": test_ind, "Hazard": xg_preds})
submission = submission.set_index("Id")
submission.to_csv('/Users/btrani/Git/projects/Kaggle/Liberty_Mutual/sub_gb_7.csv')
params["colsample_bytree"] = 0.8 cv_results = xgb.cv(params,train_matrix, num_boost_round = params["num_rounds"], nfold = params.get('nfold',5), metrics = params['eval_metric'], early_stopping_rounds = params["early_stopping_rounds"], verbose_eval = True, seed = seed) n_best_trees = cv_results.shape[0] watchlist = [(train_matrix, 'train')] gbt = xgb.train(params, train_matrix, n_best_trees,watchlist) xgb.plot_importance(gbt) datas.groupby("age")['is_fraud'].agg(['count','mean']) ## false negative cost more ## false positive is acceptable ########### plot ROC on validation set Xtrain_only,Xvalid,ytrain_only,yvalid = train_test_split(Xtrain,ytrain,test_size=0.3,random_state=seed) onlytrain_matrix = xgb.DMatrix(Xtrain_only,ytrain_only) valid_matrix = xgb.DMatrix(Xvalid,yvalid) temp_gbt = xgb.train(params, onlytrain_matrix, n_best_trees,[(onlytrain_matrix,'train_only'),(valid_matrix,'validate')]) yvalid_proba_pred = temp_gbt.predict(valid_matrix,ntree_limit=n_best_trees) fpr,tpr,thresholds = roc_curve(yvalid,yvalid_proba_pred) roc = pd.DataFrame({'FPR':fpr,'TPR':tpr,'Threshold':thresholds})
# n_job = 7  time 0:00:00.030917

# 5. Visualization
import matplotlib.pyplot as plt
import numpy as np

'''
# def plot_feature_importances_dataset(model):
#     n_features = dataset.data.shape[1]
#     plt.barh(np.arange(n_features), model.feature_importances_, align='center')
#     plt.yticks(np.arange(n_features), dataset.feature_names)
#     plt.xlabel("Feature Importances")
#     plt.ylabel("Features")
#     plt.ylim(-1, n_features)
'''

plot_importance(model)  # xgboost's plot_importance
plt.show()  # inspect via F score

# DecisionTreeClassifier
# before column cleanup
# [0.0125026  0.         0.03213177 0.95536562]
# acc : 0.9333333333333333
# after column cleanup
# [0.44369011 0.53961888 0.01669101]
# acc after cleanup : 0.9

# RandomForestClassifier
# before column cleanup
# [0.08142011 0.02056809 0.41412333 0.48388846]
def fea_plot(xg_model, feature, label, type='weight',
             max_num_features=None, x_axis_label=None, ranks_dir='./'):
    fig, AX = plt.subplots(nrows=1, ncols=2)

    fscore = xg_model.get_score(importance_type=type)
    fscore = sorted(fscore.items(), key=itemgetter(1), reverse=True)  # sort scores
    fea_index = get_fea_index(fscore, max_num_features)

    # save ranks to files
    path_to_save = '../average_rank/ranks/' + ranks_dir
    if not os.path.isdir(path_to_save):
        os.mkdir(path_to_save)
    path_to_save = path_to_save + '/index_' + type + '.txt'
    save_rank_file = open(path_to_save, 'w')
    all_feat_index = get_fea_index(fscore, None)
    all_feat_index = [i + 1 for i in all_feat_index]
    print('fscore len')
    print(len(all_feat_index))
    if x_axis_label is not None:
        all_x_axis_label = get_axis_label(all_feat_index, x_axis_label)
    else:
        all_x_axis_label = all_feat_index
    for item in all_x_axis_label:
        save_rank_file.write("%s\n" % item)
    save_rank_file.close()

    if x_axis_label is not None:
        # map the default f0, f1, ... feature names to the provided labels
        mapper = {'f{0}'.format(i): v for i, v in enumerate(x_axis_label)}
        mapped = {mapper[k]: v for k, v in xg_model.get_score(importance_type=type).items()}
        xgb.plot_importance(mapped, xlabel=type, ax=AX[0], max_num_features=max_num_features)
    else:
        xgb.plot_importance(xg_model, xlabel=type, importance_type=type,
                            ax=AX[0], max_num_features=max_num_features)
    print(fea_index)
    print(max_num_features)

    # compare the selected features by their class-conditional means
    feature = feature[:, fea_index]
    dimension = len(fea_index)
    X = range(1, dimension + 1)
    Yp = np.mean(feature[np.where(label == 1)[0]], axis=0)
    Yn = np.mean(feature[np.where(label != 1)[0]], axis=0)
    for i in range(0, dimension):
        param = np.fmax(Yp[i], Yn[i])
        if param != 0:
            Yp[i] /= param
            Yn[i] /= param
        else:
            print('oops! seems wrong')
    p1 = AX[1].bar(X, +Yp, facecolor='#ff9999', edgecolor='white')
    p2 = AX[1].bar(X, -Yn, facecolor='#9999ff', edgecolor='white')
    AX[1].legend((p1, p2), ('Malware', 'Normal'))
    AX[1].set_title('Comparison of selected features by their means')
    AX[1].set_xlabel('Feature Index')
    AX[1].set_ylabel('Mean Value')
    AX[1].set_ylim(-1.1, 1.1)

    # update on 5/25/2017: add or remove this line according to the input data format
    fea_index = [i + 1 for i in fea_index]
    if x_axis_label is not None:
        tar_x_axis_label = get_axis_label(fea_index, x_axis_label)
    else:
        tar_x_axis_label = fea_index
    plt.xticks(X, tar_x_axis_label, rotation=80)
    plt.suptitle('Feature Selection results')

    # font sizes (seems to have no effect here)
    SMALL_SIZE = 8
    MEDIUM_SIZE = 10
    BIGGER_SIZE = 11
    plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
    plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
    plt.rc('axes', labelsize=BIGGER_SIZE)    # fontsize of the x and y labels
    plt.rc('xtick', labelsize=BIGGER_SIZE)   # fontsize of the tick labels
    plt.rc('ytick', labelsize=BIGGER_SIZE)   # fontsize of the tick labels
    plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
    plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title
def plot_feature_importances(self, model):
    plot_importance(model)
    plt.show()
def plot_importance_matrix(self, vars_names):
    pdb.set_trace()  # debugging breakpoint
    xgb.plot_importance(self.clf)
fscore_lo = np.percentile(fscore, 2.5, axis=0)
fscore_hi = np.percentile(fscore, 97.5, axis=0)

ind_sort = np.array(np.argsort(fscore_mean))
fscore_mean_sorted = fscore_mean[ind_sort]
# ci_sorted = fscore_ci[ind_sort]
fscore_lo_sorted = fscore_lo[ind_sort]
fscore_hi_sorted = fscore_hi[ind_sort]
feature_label_sorted = feature_label[ind_sort]

feature_label_short = []
for i in range(feature_label_sorted.size):
    feature_label_short.append(dic[feature_label_sorted[i]])

get_ipython().magic(u'matplotlib inline')
plt.figure(figsize=(4, 12))
axes = plt.gca()
# plt.barh(np.arange(val_sorted.size), val_sorted, xerr=ci_sorted, height=.7, color=(.4,.4,.8), align='center', ecolor=(0,0,0))
plt.barh(np.arange(fscore_mean_sorted.size), fscore_mean_sorted,
         xerr=np.array([fscore_mean_sorted - fscore_lo_sorted,
                        fscore_hi_sorted - fscore_mean_sorted]),
         height=.7, color=(.4, .4, .8), align='center', ecolor=(0, 0, 0))
plt.yticks(np.arange(len(feature_label_short)), feature_label_short, fontsize=12, color=(0, 0, 0))
# axes.set_ylim([3.5, len(feature_label_short)-9.5])
# axes.set_xlim([0, 0.04])
plt.box(on=False)
plt.xlabel('Gini Importance', fontsize=14)
plt.grid()

# In[ ]:

np.percentile(fscore, 2.5, axis=0)
# xgb.plot_importance() requires a booster or importance dict as its first argument;
# the bare call below would raise a TypeError.
# xgb.plot_importance()
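# The percentile calls above assume `fscore` is an (n_runs, n_features) array. A sketch
# of how it could be built by retraining with different seeds (params, dtrain and
# feature_label are assumed from earlier in the notebook):
fscore_rows = []
for seed in range(30):
    params['seed'] = seed
    bst = xgb.train(params, dtrain, num_boost_round=100)
    scores = bst.get_fscore()
    fscore_rows.append([scores.get(f, 0) for f in feature_label])
fscore = np.array(fscore_rows, dtype=float)
fscore_mean = fscore.mean(axis=0)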
cv_xgb = xgb.cv(params=our_params, dtrain=xgdmat, num_boost_round=3000, nfold=5,
                metrics=['error'],  # enter metrics inside a list or you may encounter issues
                early_stopping_rounds=100)  # look for early stopping that minimizes error
print('Tail:\n')
print(cv_xgb.tail(5))

our_params = {'eta': 0.1, 'seed': 0, 'subsample': 0.8, 'colsample_bytree': 0.8,
              'objective': 'binary:logistic', 'max_depth': 3, 'min_child_weight': 1}

print('Final Train: \n')
final_gb = xgb.train(our_params, xgdmat, num_boost_round=432)
xgb.plot_importance(final_gb)
plt.show()

# Predicting:
testdmat = xgb.DMatrix(X_pred)
y_pred = final_gb.predict(testdmat)
y_pred[y_pred > 0.5] = 1
y_pred[y_pred <= 0.5] = 0
y_pred = y_pred.astype(np.int64)

# Submission
submission = pd.DataFrame({
    "PassengerId": test_df["PassengerId"],
    "Survived": y_pred
})
# draw feature importance
plt.plot(RF_clf_Ab.feature_importances_)
plt.title("Feature importance by Random Forest Model")
plt.show()

#-----------------------------------------------------------------------------#
# 1.7 train the XGBClassifier on the training data
xgb_clf_Ab = xgb.XGBClassifier()
xgb_clf_Ab.fit(X_Ab_train, y_Ab_train)

# predict accuracy
xgb_acc_Ab = xgb_clf_Ab.score(X_Ab_test, y_Ab_test)
print("Accuracy of xgb: {:.4f}".format(xgb_acc_Ab))

# draw feature importance
xgb.plot_importance(xgb_clf_Ab, title='Feature importance', xlabel='F score',
                    ylabel='Features', grid=True)
plt.show()

#-----------------------------------------------------------------------------#
# 2. train the models on the training data with 5-fold cross validation and
#    check the stability of prediction accuracy
#-----------------------------------------------------------------------------#
# 2.1 train the logistic regression model on the training data with 5-fold cross validation
LR_cv = linear_model.LogisticRegression()
LR_scores = cross_validation.cross_val_score(LR_cv, X_Ab, y_Ab, cv=5)
print("Logistic Regression")
print(LR_scores)
print("Accuracy of LR: %0.2f (+/- %0.2f)" % (LR_scores.mean(), LR_scores.std() * 2))

#-----------------------------------------------------------------------------#
# 2.2 train SVM classifier with 5-fold cross validation
def plot_importance(self):
    ax = xgb.plot_importance(self.model)
    self.save_topn_features()
    return ax
y_pred = loaded_model.predict_proba(X_test)

# In[147]:

ll = log_loss(y_test, y_pred)
ll

# In[148]:

xgb.plot_importance(loaded_model)

# # XGBOOST PARAMETER TUNING

# In[149]:

HYPER_PARAMS = {
    'learning_rate': 0.20,
    'n_estimators': 0,
    'max_depth': 5,
    'subsample': 0.7,
    'colsample_bytree': 0.9,
    'max_delta_step': 1,
    'objective': 'multi:softmax',
ax.tick_params(axis='x', rotation=90)
ax.set_xlabel('GAME DATE')
ax.set_ylabel('POINTS SCORED')
ax.set_title('MAVS 2020 ACTUAL vs PREDICTED vs VEGAS\n BEAT VEGAS 13 out of 25 (DON\'T BET ON THIS!)')
ax.legend(loc=2)

dal_res = dal_pred - dal_y_test
vegas_res = dal_test_vegas - dal_y_test
abs(dal_res) < abs(vegas_res)

plt_x
plt_y = X_test.columns[plt_x]

fig, ax = plt.subplots(figsize=(30, 24))
# plt.scatter(y_test, bst1.predict(X_test) - y_test)
xgb.plot_importance(bst1, ax=ax)
ax.barh(plt_y, plt_x)
# dal_train_vegas = season[(season.SEASON_ID==i)].iloc[:int(len(season[(season.SEASON_ID==i)])*(2/3)), 5]

# DUMMY REGRESSOR:
dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(X_train, y_train)
# 21.982204049481744
mean_squared_error(y_test, dummy_regr.predict(X_test), squared=False)
                colsample_bytree=colsample, subsample=subsample)
m.fit(Xtr, ytr)
pp = m.predict_proba(Xts)[:, 1]

if FINAL_SUBMISSION:
    import datetime
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M')
    scores = np.c_[np.arange(len(pp)), pp]
    np.savetxt('../out/vilab-submission-%s.csv' % timestamp, scores,
               '%d,%.8f', ',', header='id,probability', comments='')
    toc()
else:
    toc('cs=%.2f md=%2d lr=%.2f mcw=%1d g=%d score=%.4f' % (
        colsample, max_depth, learning_rate, min_child_weight, gamma,
        roc_auc_score(yts, pp)))
    sys.stdout.flush()

import matplotlib.pyplot as plt
plt.ioff()
xgb.plot_importance(m, tick_label=names)
plt.savefig('xgb-features.pdf')
plt.show()

'''
xgb.plot_tree(m)
plt.savefig('xgb-tree.pdf', dpi=900)
plt.show()
'''
# test = []
pred2 = model.predict(dtest)

df2 = pd.DataFrame()
df2["Original"] = testDelay
df2["Predicted"] = pred2
df2.to_csv('compareDelay.csv', index=False)

import matplotlib.pyplot as plt
plt.style.use("ggplot")

# map the default f0, f1, ... names back to the training columns
mapper = {'f{0}'.format(i): v for i, v in enumerate(train.columns)}
mapped = {mapper[k]: v for k, v in model.get_fscore().items()}

# plot_importance accepts a dict of feature -> score; sorting into a list of
# tuples is only needed for the DataFrame below
xgb.plot_importance(mapped)
plt.show()

import operator
mapped = sorted(mapped.items(), key=operator.itemgetter(1))
df = pd.DataFrame(mapped, columns=['feature', 'fscore'])
df['fscore'] = df['fscore'] / df['fscore'].sum()
df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(25, 15))
plt.title('XGBoost Feature Importance')
plt.xlabel('relative importance')
plt.gcf().savefig('feature_importance_xgb.png')

xx = np.linspace(-10, 500)
yy = xx
h0 = plt.plot(xx, yy, 'k-', label="ideal Values")
plt.scatter(df2.Original, df2.Predicted, c='y')
plt.legend()
plt.show()
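# An alternative sketch to the f0 -> name mapper above: build the DMatrix with
# feature_names (or straight from a DataFrame) and plot_importance labels bars with the
# real column names. X and y are stand-ins for the training frame and target:
dtrain = xgb.DMatrix(X, label=y, feature_names=list(X.columns))
model = xgb.train(params, dtrain, num_boost_round=100)
xgb.plot_importance(model)
plt.show()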
## xgboost.train(params, dtrain, num_boost_round=10, evals=(), obj=None,
##               feval=None, maximize=False, early_stopping_rounds=None,
##               evals_result=None, verbose_eval=True, learning_rates=None,
##               xgb_model=None)
# evallist = [(dtest, 'eval'), (dtrain, 'train')]
watchlist = [(xg_train, 'train'), (xg_test, 'test')]
evals_result = {}
num_round = 10
bst = xgb.train(param, xg_train, num_round, evals_result=evals_result)

pred = bst.predict(xg_test)
print('predicting, classification error=%f' %
      (sum(int(pred[i]) != test_Y[i] for i in range(len(test_Y))) / float(len(test_Y))))

xgb.plot_importance(bst)
xgb.plot_tree(bst, num_trees=2)

# ============= Logistic Regression =============

# Define the sigmoid function
def sigmoid(z):
    return 1 / (1 + e ** (-z))

# Calculate the cost to be minimized -- using the sigmoid function
def cost(theta, X, y, l):
    m = X.shape[0]  # number of rows in the data
    z = X.dot(theta)
    O = (-1 / m) * (log(sigmoid(z)).T.dot(y) + log(1 - sigmoid(z)).T.dot(1 - y))
    # print(m)
    # print(theta)
def train(param, num_round=1000, early_stopping_rounds=20):
    exec_time = time.strftime("%Y%m%d%I%p%M", time.localtime())
    os.mkdir('{0}_{1}'.format(model_path, exec_time))
    os.mkdir('{0}_{1}'.format(submission_path, exec_time))

    train_params = param.copy()
    train_params['num_boost_round'] = num_round
    train_params['early_stopping_rounds'] = early_stopping_rounds
    json.dump(train_params, open('{0}_{1}{2}'.format(model_path, exec_time, model_params), 'w+'))

    print('get training data')
    train_features = pd.read_csv(train_path + 'train_features.csv').astype(float)
    train_labels = pd.read_csv(train_path + 'labels.csv').astype(float)
    validate_features = pd.read_csv(validate_path + 'train_features.csv').astype(float)
    validate_labels = pd.read_csv(validate_path + 'labels.csv').astype(float)
    predict_features = pd.read_csv(predict_path + 'train_features.csv').astype(float)

    create_feature_map(train_features.columns.tolist(),
                       '{0}_{1}{2}'.format(model_path, exec_time, model_fmap_file))
    train_matrix = xgboost.DMatrix(train_features.values, label=train_labels.values,
                                   feature_names=train_features.columns)
    val_matrix = xgboost.DMatrix(validate_features.values, label=validate_labels.values,
                                 feature_names=validate_features.columns)
    predict_matrix = xgboost.DMatrix(predict_features.values,
                                     feature_names=predict_features.columns)
    watchlist = [(train_matrix, 'train'), (val_matrix, 'eval')]

    print('model training')
    with open('{0}_{1}{2}'.format(model_path, exec_time, model_train_log), 'w+') as outf:
        sys.stdout = outf
        model = xgboost.train(param, train_matrix, num_boost_round=num_round,
                              evals=watchlist, early_stopping_rounds=early_stopping_rounds)
    sys.stdout = save_stdout
    print('model.best_score: {0}, model.best_iteration: {1}, model.best_ntree_limit: {2}'.format(
        model.best_score, model.best_iteration, model.best_ntree_limit))

    print('output offline model data')
    model.save_model('{0}_{1}{2}'.format(model_path, exec_time, model_file))
    model.dump_model('{0}_{1}{2}'.format(model_path, exec_time, model_dump_file))

    importance = model.get_fscore(fmap='{0}_{1}{2}'.format(model_path, exec_time, model_fmap_file))
    importance = sorted(importance.items(), key=operator.itemgetter(1))
    df = pd.DataFrame(importance, columns=['feature', 'fscore'])
    df['fscore'] = df['fscore'] / df['fscore'].sum()
    df.to_csv('{0}_{1}{2}'.format(model_path, exec_time, model_feature_importance_csv), index=False)

    xgboost.plot_importance(model)
    plt.gcf().set_size_inches(20, 16)
    plt.gcf().set_tight_layout(True)
    plt.gcf().savefig('{0}_{1}{2}'.format(model_path, exec_time, model_feature_importance_file))
    plt.close()

    train_pred_labels = model.predict(train_matrix, ntree_limit=model.best_ntree_limit)
    val_pred_labels = model.predict(val_matrix, ntree_limit=model.best_ntree_limit)
    train_pred_frame = pd.Series(train_pred_labels, index=train_features.index)
    train_pred_frame.name = probability_consumed_label
    val_pred_frame = pd.Series(val_pred_labels, index=validate_features.index)
    val_pred_frame.name = probability_consumed_label

    train_true_frame = pd.read_csv(train_path + 'labels.csv')['Label']
    val_true_frame = pd.read_csv(validate_path + 'labels.csv')['Label']
    train_coupons = pd.read_csv(train_path + 'dataset.csv')
    val_coupons = pd.read_csv(validate_path + 'dataset.csv')
    train_check_matrix = train_coupons[[coupon_label]].join(train_true_frame).join(train_pred_frame)
    val_check_matrix = val_coupons[[coupon_label]].join(val_true_frame).join(val_pred_frame)
    print('Average auc of train matrix:', check_average_auc(train_check_matrix))
    print('Average auc of validate matrix:', check_average_auc(val_check_matrix))

    val_coupons = val_coupons.join(val_pred_frame).join(
        val_pred_frame.map(lambda x: 0. if x < 0.5 else 1.).rename('map')).join(val_true_frame)
    val_coupons.to_csv('{0}_{1}{2}'.format(model_path, exec_time, val_diff_file), index=False)
    print(confusion_matrix(val_coupons['Label'], val_coupons['map']))

    labels = model.predict(predict_matrix, ntree_limit=model.best_ntree_limit)
    frame = pd.Series(labels, index=predict_features.index)
    frame.name = probability_consumed_label

    plt.figure()
    frame.hist(figsize=(10, 8))
    plt.title('results histogram')
    plt.xlabel('predict probability')
    plt.gcf().savefig('{0}_{1}{2}'.format(submission_path, exec_time, submission_hist_file))
    plt.close()

    submission = pd.read_csv(predict_path + 'dataset.csv')
    submission = submission[[user_label, coupon_label, date_received_label]].join(frame)
    submission.to_csv('{0}_{1}{2}'.format(submission_path, exec_time, submission_file), index=False)
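# check_average_auc is referenced above but not defined in this excerpt. A plausible
# sketch, assuming it averages per-coupon ROC AUC and skips coupons whose labels are
# all one class (where AUC is undefined):
from sklearn.metrics import roc_auc_score

def check_average_auc(check_matrix):
    aucs = []
    for _, group in check_matrix.groupby(coupon_label):
        if group['Label'].nunique() < 2:
            continue
        aucs.append(roc_auc_score(group['Label'], group[probability_consumed_label]))
    return np.mean(aucs)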
#%%
predictions = my_model.predict(val_X)
# print(predictions)

#%%
rmse = np.sqrt(mean_squared_error(val_y, predictions))
R = r2_score(val_y, predictions)
MAPE = np.mean(np.abs((val_y - predictions) / val_y)) * 100
print("RMSE: %f" % rmse)
print("R^2: ", R)
print("MAPE: ", MAPE)

#%%
xgb.plot_importance(my_model)
plt.rcParams['figure.figsize'] = [5, 5]
plt.show()

#%%
xgb.plot_tree(my_model, num_trees=0)
plt.rcParams['figure.figsize'] = [50, 10]
plt.show()
    model = xgb.train(params, dtrain, 200, watchlist, maximize=True,
                      early_stopping_rounds=25, verbose_eval=5)
    del dvalid
else:
    dtrain = xgb.DMatrix(train, y)
    del train, y
    gc.collect()
    watchlist = [(dtrain, 'train')]
    model = xgb.train(params, dtrain, 30, watchlist, maximize=True, verbose_eval=1)

del dtrain
gc.collect()

print('[{}] Finish XGBoost Training'.format(time.time() - start_time))

# Plot the feature importance from xgboost
plot_importance(model)
plt.gcf().savefig('feature_importance_xgb.png')

# Load the test set for prediction
test = pd.read_csv(path + "test.csv", usecols=test_columns, dtype=dtypes)
test = pd.merge(test, ip_count, on='ip', how='left', sort=False)
del ip_count
gc.collect()

sub['click_id'] = test['click_id'].astype('int')
test['clicks_by_ip'] = test['clicks_by_ip'].astype('uint16')
test = timeFeatures(test)
test.drop(['click_id', 'ip'], axis=1, inplace=True)
dtest = xgb.DMatrix(test)
del test
                          reg_alpha=0.05,
                          reg_lambda=2,
                          subsample=1.0,
                          colsample_bytree=1.0,
                          max_delta_step=1,
                          scale_pos_weight=1,
                          objective='multi:softprob',
                          nthread=8,
                          seed=0
                          # , silent=False
                          )

print('training...')
xgb_model.fit(training, label)

print('predicting...')
predicted = xgb_model.predict_proba(testing)
predicted = pandas.DataFrame(predicted)
predicted.columns = xgb_model.classes_
# Name index column.
predicted.index.name = 'Id'

# Write csv.
print('Saving prediction...')
predicted.to_csv('Prediction.csv')

# feature importance
feat_imp = pandas.Series(xgb_model.booster().get_fscore()).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Feature Importances')
matplotlib.pyplot.show()

plot_importance(xgb_model, title='Feature importance')
matplotlib.pyplot.show()

plot_tree(xgb_model, num_trees=0)
matplotlib.pyplot.show()
# best : {'gamma': 0.4, 'learning_rate': 0.05740649534056902, 'max_depth': 5, 'min_child_weight': 6, 'n_estimators': 166, 'subsample': 0.6}
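# A sketch of plugging the best parameters above into the sklearn wrapper; whether the
# task was regression or classification is not shown here, so XGBRegressor is an assumption:
best_model = xgb.XGBRegressor(gamma=0.4, learning_rate=0.05740649534056902, max_depth=5,
                              min_child_weight=6, n_estimators=166, subsample=0.6)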
# Train the model one last time with the optimized parameters, including those
# found from the grid search & early boosting
final_model = xgb.train(final_params, xgb_train, num_boost_round=best_iteration)

# Create an XGBoost data matrix for the testing data.
X_test_xgbmat = xgb.DMatrix(X_test)

# With the trained final model, predict the labels of our test set
y_pred = final_model.predict(X_test_xgbmat)
y_pred[y_pred > 0.5] = 1
y_pred[y_pred <= 0.5] = 0

# Produce accuracy score for the above prediction. The goal is to improve upon
# the MLE baseline estimate.
accuracy = accuracy_score(y_pred, y_test)
print("Test Set Accuracy: ", accuracy)
print("Baseline MLE Accuracy: ",
      max(1 - np.sum(y_train) / float(y_train.size), np.sum(y_train) / float(y_train.size)))

importance_dict = final_model.get_score(importance_type='gain')

# Select top 'n' features to show in the importance plot
n = 10
top_n_importance_dict = dict(sorted(importance_dict.items(),
                                    key=operator.itemgetter(1), reverse=True)[:n])

# Plot top 'n' features with respect to average gain per split
xgb.plot_importance(top_n_importance_dict, xlabel='Average Gain Per Split',
                    title="'" + y_string + "'\n" + 'Top {} Feature Importance'.format(n))
plt.show()
def XGBoost_regressor2():
    """
    Train an XGBoost model with the XGBoost lib.

    This method is mainly used to find the relative importance of the features.
    """
    train = xgb.DMatrix('train_libSVM.dat')
    all_train = xgb.DMatrix('all_train_libSVM.dat')
    test = xgb.DMatrix('test_libSVM.dat')
    validation = xgb.DMatrix('validate_libSVM.dat')
    param = {'max_depth': 11,
             'eta': 0.002,
             'silent': 1,
             'objective': 'reg:linear',
             'gamma': 2.2,
             'subsample': 0.8,
             'colsample_bytree': 0.7,
             'scale_pos_weight': 0.55,
             'min_child_weight': 5,
             'n_jobs': 4}

    # Tuning history (eta -> ntrees -> error):
    # 0.03 -> 900, 1600 without features of SVD similarity between
    #   search term and other columns
    # 0.03 -> 900 -> 0.2397 * 2 = 4795
    # 0.025 -> 1900 -> 4782
    # 0.06 -> 640 -> 0.2400 * 2 = 4801
    # common brand & SVD brand deleted:
    # 0.03 -> 900 -> 0.2397 * 2 = 4794
    # with KL distance added:
    # 0.03 -> 966 -> 0.2397
    # 0.03 -> 1102 -> 0.234 or so
    # with spell checking added (round = 200):
    # depth = 12 -> 0.235371
    # depth = 11, min_cw = 5 -> 0.235316  SELECTED
    # depth = 10 -> 0.235840
    # depth = 9 -> 0.235912
    # depth = 8 -> 0.236202
    # min_child_weight = 6 -> 0.235679
    # min_child_weight = 4 -> 0.235478
    # as of April 16:
    # 3500, 0.01, 0.238908
    # 1500, 0.03, 0.238893
    # 750, 0.06, 0.239034

    watchlist = [(validation, 'eval'), (train, 'train')]
    # TODO: do data cleaning again; add approximate matching; check KL distance
    # n = 1096
    num_round = 10000
    xgb_model = xgb.train(param, train, num_round, watchlist)
    # xgb_model = xgb.cv(param, all_train, num_round, nfold=5, metrics={'error'})

    prediction = xgb_model.predict(test)

    importance = xgb_model.get_fscore(fmap='xgb.fmap')
    print(importance)
    sorted_importance = sorted(importance.items(), key=operator.itemgetter(1))
    print(sorted_importance)
    with open('importance_of_feature_file', 'wb') as importance_of_feature_file:
        pickle.dump(sorted_importance, importance_of_feature_file)
    xgb.plot_importance(xgb_model)

    test_id = pd.read_pickle('id_test')
    prediction = prediction * 2 + 1
    prediction[prediction > 3] = 3
    prediction[prediction < 1] = 1
    clean_result(prediction)
    pd.DataFrame({"id": test_id.values, "relevance": prediction})\
        .to_csv('submission.csv', index=False)
def plot_importance(self):
    xgb.plot_importance(self.model._Booster)
del xgb_train, xgb_val
gc.collect()

cv_scores.append(roc_auc_score(y_val, bst.predict(xgb.DMatrix(X_val),
                                                  ntree_limit=bst.best_ntree_limit)))
print(cv_scores)

print('predicting...')
if i == 0:
    pred = bst.predict(xgb.DMatrix(np.array(test_x)), ntree_limit=bst.best_ntree_limit)
else:
    pred += bst.predict(xgb.DMatrix(np.array(test_x)), ntree_limit=bst.best_ntree_limit)

del train_x, train_y
gc.collect()

print('mean_score:', np.mean(cv_scores))
pred /= folds

df_test['is_churn'] = pred.clip(0.0000001, 0.999999)
df_test = df_test[['msno', 'is_churn']]
# df_test.to_csv(out_path + 'stack_submissions{}.csv'.format(datetime.now().strftime("%Y%m%d-%H%M%S")), index=False)
df_test = []

plt.rcParams['figure.figsize'] = (7.0, 7.0)
xgb.plot_importance(booster=bst)
plt.show()
# plt.savefig('./feature_importance.png', dpi=100)
def plotFeatureImportance(bst):
    plt.figure(figsize=(10, 10), dpi=200)
    xgb.plot_importance(bst, height=0.2)
    plt.gcf().savefig('feature_importance_xgb.png', dpi=200)
train.dtypes

# In[2]:

X_train = train.drop("count", axis=1)
Y_train = train['count']

T_train_xgb = xgb.DMatrix(X_train, Y_train)
params = {"objective": "reg:linear", 'bst:max_depth': 13, "booster": "gbtree"}
gbm = xgb.train(dtrain=T_train_xgb, params=params)

X_test = xgb.DMatrix(test)
Y_pred = gbm.predict(X_test)
print(Y_pred)

xgb.plot_importance(gbm)

# In[ ]:

plt.show()

# In[35]:

result['count'] = pd.DataFrame(Y_pred)
# result[result['count'] <= 0] = 0
result_final = result.set_index('datetime')
result_final[result_final['count'] <= 0] = 0
result_final
y = train.cardio
train.drop('cardio', axis=1, inplace=True)

z = pd.DataFrame()
z['id'] = test.id
z['y'] = 0

v = pd.DataFrame()
v['y'] = y

train2, y, test2 = cleanup_and_generate(train, y, test)

et1(train2.values, y, test2.values, v, z)
keras1(train2, y, test2, v, z)
rf1(train2.values, y, test2.values, v, z)
xgb1(train2, y, test2, v, z)
xgb2(train2, y, test2, v, z)

z.y = z.xgb1 * 0.4 + z.xgb2 * 0.4 + (z.keras1 + z.rf1 + z.et1) * (0.2 / 3)
z.y = prestore(z.y)
save_results(v, z)
print('done: %s.' % (now()))
#'''

clf = xgb.XGBClassifier(n_estimators=1000, learning_rate=.005)
clf.fit(train2, y)
for c in ['weight', 'gain', 'cover']:
    xgb.plot_importance(clf, title='Feature ' + c, importance_type=c)
#'''
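# Companion sketch to the loop above: the same three importance types as raw numbers
# instead of plots (get_booster() is the accessor on fitted sklearn-wrapper models):
for c in ['weight', 'gain', 'cover']:
    print(c, clf.get_booster().get_score(importance_type=c))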
# True values
y_predict = X_test_now[:, price_index]
X_test_now = np.delete(X_test_now, [house_pk_index, price_index], axis=1)

# Predicted values
dpredict = xgb.DMatrix(X_test_now)
ypred_with_evallist = bst_with_evallist.predict(dpredict)

RMSE = np.sqrt(((ypred_with_evallist - y_predict) ** 2).mean())
print('######################################')
print("RMSE of bst_with_evallist :", RMSE)
print('The r2 score for this Group %d is : %4f' % (test_group - 1,
      metrics.r2_score(y_predict, ypred_with_evallist)))
print('')

'''
# In[Importance plot & Trees plotted into .pdf]
# Attribute's importance plot
'''
xgb.plot_importance(bst_with_evallist)

# Tree plot, saved into pdf
num_trees = len(bst_with_evallist.get_dump())
for tree_index in range(num_trees):
    dot = xgb.to_graphviz(bst_with_evallist, num_trees=tree_index)
    dot.render("trees/tree{}".format(tree_index))
'''