def saving_trees(model_ehr, logs_file):
    """Visualize XGBoost trees. The ranges generated are compared with RENASCA."""
    from matplotlib.pylab import rcParams
    rcParams['figure.figsize'] = 60, 50
    xgb.plot_tree(model_ehr, ax=plt.gca())
    features_url = os.path.join(logs_file, 'plot_tree_.svg')
    plt.savefig(features_url)
def plot_BDT(bst):
    '''
    Produces two different plots:
        a) Plot of a specified tree
        b) Plot of feature importance based on the fitted trees.

    Parameters
    ----------
    bst : Booster
        Trained booster model.

    Returns
    -------
    None.
    '''
    xgb.plot_importance(bst)
    fig = plt.gcf()
    fig.set_size_inches(20, 10)
    fig.savefig('plotImportanceHiggs.pdf')
    plt.clf()

    xgb.plot_tree(bst, num_trees=4)
    fig = plt.gcf()
    fig.set_size_inches(15, 10)
    fig.savefig('plotTreeHiggs.pdf', dpi=300)
    plt.clf()
def boost(data: pd.DataFrame):
    X, y = data.iloc[:, :-1], data.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)
    xg_reg = xgb.XGBRegressor(
        objective='reg:squarederror',
        colsample_bytree=0.3,
        learning_rate=0.25,
        max_depth=40,
        alpha=50,
        n_estimators=100,
        reg_lambda=30,
    )
    xg_reg.fit(X_train, y_train)
    preds = xg_reg.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    print(f"[boost] RMSE: {rmse}")

    xgb.plot_tree(xg_reg, num_trees=0)
    plt.savefig(f"{data_root}/figs/tree_development.svg", format="svg")

    # set the figure size before plotting; rcParams only affects figures created afterwards
    plt.rcParams['figure.figsize'] = [5, 5]
    xgb.plot_importance(xg_reg)
    plt.savefig(f"{data_root}/figs/importance_development.svg", format="svg")

    bundle = data.copy().iloc[y_test.index]
    bundle['TARGET'] = preds
    bundle['DIFF'] = abs(bundle['TARGET'] - bundle['HOURSINDEVELOPMENT'])
    return bundle.sort_values(by='DIFF')
def multiple_run(model, dtrain, predictors, cv_folds=4, early_stopping_rounds=100):
    print("running a multiple fit with CV to check predictors")
    xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain['target'].values)
    cvresult = xgb.cv(model.get_xgb_params(), xgtrain,
                      num_boost_round=model.get_params()['n_estimators'],
                      nfold=cv_folds,
                      early_stopping_rounds=early_stopping_rounds,
                      verbose_eval=10,
                      feval=gini_xgb, maximize=True)
    gc.collect()

    # fit the algorithm on the data
    model.fit(dtrain[predictors], dtrain['target'])

    # plot feature importances
    fig, ax = plt.subplots(figsize=(20, 20))
    xgb.plot_importance(model, ax=ax)
    plt.savefig("importance.pdf")

    # plot tree
    fig, ax = plt.subplots(figsize=(20, 20))
    xgb.plot_tree(model, ax=ax)
    plt.savefig("tree.pdf")
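`gini_xgb` is referenced but not defined in this snippet. xgboost's `feval` callback receives the raw predictions and the evaluation `DMatrix` and must return an `(eval_name, eval_result)` pair; since `maximize=True` is passed, higher values are treated as better. A sketch of a plausible normalized-Gini implementation, an assumption rather than the author's actual code:

import numpy as np

def gini(actual, pred):
    # sort by predicted score descending, stable on the original order
    a = np.asarray(np.c_[actual, pred, np.arange(len(actual))], dtype=float)
    a = a[np.lexsort((a[:, 2], -1 * a[:, 1]))]
    total_losses = a[:, 0].sum()
    gini_sum = a[:, 0].cumsum().sum() / total_losses
    gini_sum -= (len(actual) + 1) / 2.0
    return gini_sum / len(actual)

def gini_xgb(preds, dtrain):
    # xgboost feval contract: (preds, DMatrix) -> (metric_name, metric_value)
    labels = dtrain.get_label()
    return 'gini', gini(labels, preds) / gini(labels, labels)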
def get_plot_tree(model, num_trees=2):
    from xgboost import plot_tree
    from matplotlib.pylab import rcParams
    # set up the plot parameters
    rcParams['figure.figsize'] = 20, 10
    plot_tree(model, num_trees=num_trees)
def plot_trees(self, step):
    """
    Plot the trees via an internal function.

    Arguments
    ---------
    step : int
        Number of steps between two plotted trees.

    Returns
    -------
    fig_trees : list
        List of figure objects containing the graphs of the trees.
    """
    fig_trees = []
    for i in range(len(self.trees)):
        if i % step == 0:
            print("Plotting tree {} / {}...".format(i, len(self.trees)))
            try:
                xgboost.plot_tree(self.model, num_trees=i)
                fig = matplotlib.pyplot.gcf()
                fig.set_size_inches(50, 25)
                ax = plt.gca()
                ax.set_title("Tree {}".format(i))
                fig_trees.append(fig)
            except ValueError:
                print("Bad tree {}".format(i))
    return fig_trees
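Since plot_trees returns the figure objects rather than saving them, a caller can persist them afterwards. A minimal usage sketch; the instance name and output paths are assumptions:

# Hypothetical usage: persist every returned figure to disk.
figs = booster_wrapper.plot_trees(step=10)  # 'booster_wrapper' is an assumed instance
for n, fig in enumerate(figs):
    fig.savefig("tree_{:03d}.png".format(n * 10), bbox_inches="tight")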
def train():
    # Read the csv file
    stpData = pd.read_csv('StudentsPerformance-encoded.csv')

    # split features and target
    X, y = stpData.iloc[:, :-3], stpData.iloc[:, -3]
    data_dmatrix = xgb.DMatrix(data=X, label=y)

    # train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

    # note: with the sklearn wrapper the number of boosting rounds is n_estimators;
    # num_boost_round is not an XGBRegressor argument
    xg_reg = xgb.XGBRegressor(objective='reg:squarederror', learning_rate=0.1,
                              max_depth=7, alpha=10, n_estimators=1000)

    # test the model accuracy
    xg_reg.fit(X_train, y_train)
    preds = xg_reg.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    print("RMSE: %f" % rmse)

    # Plot the feature importance and the tree
    xgb.plot_tree(xg_reg, num_trees=100)
    xgb.plot_importance(xg_reg)
    fig = matplotlib.pyplot.gcf()
    fig.set_size_inches(150, 100)
    fig.savefig('tree.png')
def showTree(bst, ntree):
    xgb.plot_tree(bst, num_trees=ntree, fontsize='24', rankdir='LR', size="7.75,10.25")
    plt.savefig('tree.png')
def train_predict_all(self, x, y, column_id, x_all, feature_names=None, column_names=None):
    if self.balance:
        ratio = float(np.sum(y == False)) / np.sum(y == True)
        print("weight ratio: " + str(ratio))
        self.params[column_id]['scale_pos_weight'] = ratio
    xgdmat = xgb.DMatrix(x, y, feature_names=feature_names)
    self.model[column_id] = xgb.train(self.params[column_id], xgdmat,
                                      num_boost_round=3000, verbose_eval=False)
    if feature_names is not None:
        all_trees = self.model[column_id].get_dump()
        print("number of trees: " + str(len(all_trees)))
        plot_tree(self.model[column_id])
        fig = plt.gcf()
        fig.set_size_inches(150, 100)
        plt.savefig('out/' + str(column_id) + "_" + column_names[column_id] + '.pdf')

    # predict
    all_records = xgb.DMatrix(x_all, feature_names=feature_names)
    probability_prediction = self.model[column_id].predict(all_records)
    class_prediction = (probability_prediction > 0.5)
    return probability_prediction, class_prediction
def viz_trees(self):
    self.xg_regressor = xgb.train(params=self.params, dtrain=self.Dmatrix, num_boost_round=10)
    # set the figure size before plotting; rcParams does not resize an existing figure
    plt.rcParams['figure.figsize'] = [50, 10]
    xgb.plot_tree(self.xg_regressor, num_trees=0)
    plt.show()
def xgb_plot():
    """
    Plot the first tree and the feature importance (how many times each
    feature appears in the trees) for the XGBoost classifier.
    """
    xg_clf.fit(X_train, y_train)

    plt.rcParams["figure.figsize"] = [40, 40]
    xgb.plot_tree(xg_clf, num_trees=0)
    plt.title("Plot of the first tree with XGBoost")
    plt.tight_layout()
    plt.savefig("Figures/xgb_tree.png")
    plt.show()

    fonts = {
        "font.size": 18,
        "legend.fontsize": "medium",
        "xtick.labelsize": 16,
        "ytick.labelsize": 16,
        "axes.titlesize": 18,
    }
    plt.rcParams.update(fonts)
    plt.rcParams["figure.figsize"] = [10, 10]
    xgb.plot_importance(xg_clf)
    plt.savefig("Figures/Importance.png")
    plt.show()
def plot_estimator(self, item):
    """
    Plot critical information of an XGB regressor.
    If item == "importance", plot feature importance;
    if item == "tree", plot the tree structures listed in tree_index.
    :param item: "importance" or "tree"
    :return: None; saves plots
    """
    if item == "importance":
        plot_importance(self.best_estimator, importance_type="gain",
                        xlabel="Feature contribution", ylabel=None,
                        title=None, grid=False, xlim=(0, 35000))
        plt.tight_layout()
        plt.savefig("feature_importance_by_gain.jpg", dpi=600)
        plt.show()
    else:
        tree_index = [0, 1, 100, 300, 400]  # tree indices used to plot structure
        for index in tree_index:
            fig, ax = plt.subplots()
            fig.set_size_inches(300, 150)
            plot_tree(self.best_estimator, num_trees=index, ax=ax)
            plt.savefig('./tree structure/tree_' + str(index) + ".jpg")
            plt.show()
def get_tree_plot(self):
    file_path = gu.get_target_path([self.local_folder, 'tree'], file_extension='png')
    plt.rcParams.update(plt.rcParamsDefault)
    plt.rcParams['figure.figsize'] = [50, 10]
    xgb.plot_tree(self.model, num_trees=0)
    plt.savefig(file_path, bbox_inches='tight')
    plt.close()
def XGModelExperiment():
    X_train, y_train, X_val, y_val, test_train, test_val = getAllCleanedDataExperiment(binning=1)
    X_train_dummies = pd.get_dummies(X_train)
    X_val_dummies = pd.get_dummies(X_val)
    test_train_dummies = pd.get_dummies(test_train)
    printFullDf(X_train.head())

    # DMatrix
    train = xgb.DMatrix(data=X_train_dummies, label=y_train)
    val = xgb.DMatrix(data=X_val_dummies, label=y_val)
    test = xgb.DMatrix(data=test_train_dummies, label=test_val)

    params = {
        'base_score': 0.5,
        'booster': 'gbtree',
        'colsample_bylevel': 1,
        'colsample_bynode': 1,
        'colsample_bytree': 0.4,
        'gamma': 0.1,
        'gpu_id': -1,
        'importance_type': 'gain',
        'interaction_constraints': '',
        'learning_rate': 0.2,
        'max_delta_step': 0,
        'max_depth': 5,
        'min_child_weight': 1,
        'monotone_constraints': '()',
        'n_estimators': 50,
        'n_jobs': 0,
        'num_parallel_tree': 1,
        'random_state': 0,
        'reg_alpha': 0,
        'reg_lambda': 1,
        'scale_pos_weight': 1,
        'subsample': 1,
        'tree_method': 'exact',
        'validate_parameters': 0,
        'verbosity': None,
        'objective': 'binary:hinge',
    }
    params['eval_metric'] = 'auc'

    evallist = [(val, 'eval'), (train, 'train')]
    num_rounds = 20
    bst = xgb.train(params, train, num_rounds, evallist, early_stopping_rounds=4)

    ypred = bst.predict(test, ntree_limit=bst.best_ntree_limit)
    print(f1_score(test_val, ypred))

    # print(bst.get_score(importance_type='gain'))
    # print(bst.get_score(importance_type='weight'))
    xgb.plot_importance(bst, importance_type='gain')
    xgb.plot_importance(bst, importance_type='weight')
    xgb.plot_tree(bst)
    plt.show()
def single_tree(cols=['乳酸脱氢酶', '淋巴细胞(%)', '超敏C反应蛋白']):
    print('single_tree:\n')
    # load the 375-patient (data_df_unna) and 110-patient (data_pre_df) data
    data_df_unna, data_pre_df = data_preprocess()
    # drop rows that are entirely empty; the 375 patients reduce to 351
    data_df_unna = data_df_unna.dropna(subset=cols, how='any')
    # copy before appending, so the mutable default argument is not modified in place
    cols = list(cols)
    cols.append('Type2')
    # outcome labels ('出院方式' = discharge outcome) for the 110 test patients
    test_Y = data_pre_df.reset_index()[['PATIENT_ID', '出院方式']].copy()
    # rename the dataframe columns
    test_Y = test_Y.rename(columns={'PATIENT_ID': 'ID', '出院方式': 'Y'})
    # label data for the 110 patients
    y_true = test_Y['Y'].values
    x_col = cols[:-1]
    y_col = cols[-1]
    # three-feature data for the 351 patients
    x_np = data_df_unna[x_col].values
    # label data for the 351 patients
    y_np = data_df_unna[y_col].values
    # three-feature data for the 110 patients
    x_test = data_pre_df[x_col].values
    # split the 351 patients into training and validation sets; the 110 serve as the test set
    X_train, X_val, y_train, y_val = train_test_split(x_np, y_np, test_size=0.3, random_state=6)
    # restrict the xgb model to a single tree
    model = xgb.XGBClassifier(
        max_depth=3,
        n_estimators=1,
    )
    model.fit(X_train, y_train)
    # training-set confusion matrix
    pred_train = model.predict(X_train)
    show_confusion_matrix(y_train, pred_train)
    print(classification_report(y_train, pred_train))
    # validation-set confusion matrix
    pred_val = model.predict(X_val)
    show_confusion_matrix(y_val, pred_val)
    print(classification_report(y_val, pred_val))
    # test-set confusion matrix
    pred_test = model.predict(x_test)
    print('True test label:', y_true)
    print('Predict test label:', pred_test.astype('int32'))
    show_confusion_matrix(y_true, pred_test)
    print(classification_report(y_true, pred_test))

    plt.figure(dpi=300, figsize=(8, 6))
    plot_tree(model)
    plt.show()
    graph = xgb.to_graphviz(model)
    graph.render(filename='single-tree.dot')
def tree_example(info_train, target_train):
    # Create and fit a classifier to plot an example of a tree
    classifier_xgb = xgboost.XGBClassifier(max_depth=3, random_state=1, use_label_encoder=False)
    classifier_xgb.fit(info_train, target_train)
    # Plot one of the resulting trees to examine what happens inside
    xgboost.plot_tree(classifier_xgb, rankdir='LR')
def viz_tree(model):
    rcParams['figure.figsize'] = 80, 120
    # xgb.plot_tree(xg_reg, rankdir='LR'); plt.show()
    xgb.plot_tree(model, num_trees=0, rankdir='LR')
    fig = plt.gcf()
    fig.set_size_inches(150, 100)
    fig.savefig('tree.png')
    plt.show()  # plt.show() takes no figure argument
def fig_fixing(xlf, fmp=None, save: bool = True):
    fig, ax = pyplot.subplots()
    fig.set_size_inches(100, 100)
    if fmp is not None:
        xgb.plot_tree(xlf, ax=ax, fmap="{}.fmap".format(fmp))
    else:
        xgb.plot_tree(xlf, ax=ax)
    # print(fmp)
    if save:
        fig.savefig("{}.png".format(fmp))
def classifi_plot(self):
    fig = plt.figure(figsize=(19.20, 10.80))
    ax1 = fig.add_subplot(224)
    ax2 = fig.add_subplot(211)
    ax3 = fig.add_subplot(223)
    cm = pd.DataFrame(metrics.confusion_matrix(self.test_y, self.predictions),
                      columns=self.target_names, index=self.target_names)
    sns.heatmap(cm, annot=True, ax=ax1)
    xgb.plot_tree(self.gbm, num_trees=0, ax=ax2)
    xgb.plot_importance(self.gbm, ax=ax3)
    plt.show()
def model_with_sklearn_api(self, cv_routine='grid_search', param_grid=None, plot=True,
                           use_lightgbm=False):
    self.logger.info('modelling with sklearn api...')

    # specify model using sklearn api
    if use_lightgbm:
        xg_reg = lgb.LGBMRegressor(objective='regression', random_state=self.random_state)
    else:
        xg_reg = xgb.XGBRegressor(objective='reg:linear', random_state=self.random_state)
    # note: at time of writing at least (2018-12), it seems that early
    # stopping (e.g. by adding early_stopping_rounds=50) is not supported
    # with sklearn's hyper-parameter optimisers like GridSearchCV
    # note: xgb.XGBRegressor and lgb.LGBMRegressor support many types of
    # regression (i.e. many different loss functions), e.g. Poisson
    # regression, via the 'objective' argument

    if param_grid is None:
        param_grid = self._get_default_param_gird(cv_routine=cv_routine,
                                                  use_lightgbm=use_lightgbm)
    xg_reg = self._fit_model(xg_reg=xg_reg, cv_routine=cv_routine, param_grid=param_grid)

    if cv_routine in ['grid_search', 'randomised_search']:
        # use xgboost's api to further optimise n_estimators via early stopping
        # note: no scaling here (but that is fine for tree weak learners)
        xg_reg = self._update_n_estimators(est=xg_reg, use_lightgbm=use_lightgbm)
    else:
        assert cv_routine in ['bayes_search']
        # note: Bayes searches should be capable of optimising
        # n_estimators well without early stopping

    # get predictions for test set
    preds = xg_reg.predict(self._X_test)

    # get root mean-square error in test set
    rmse = np.sqrt(mean_squared_error(y_true=self._y_test, y_pred=preds))
    self.logger.info('RMSE in test set: {}'.format(rmse))

    if plot:
        # visualise model (using xgboost functionality);
        # set the figure size before plotting so the new figures pick it up
        plt.rcParams['figure.figsize'] = [50, 10]
        xgb.plot_tree(xg_reg.named_steps['model'], num_trees=0)
        plt.show()

        plt.rcParams['figure.figsize'] = [5, 5]
        xgb.plot_importance(xg_reg.named_steps['model'])
        plt.show()
        # TODO: ensure names of features are used

    return xg_reg
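The comment above notes that early stopping does not combine with GridSearchCV; outside the search, the sklearn wrapper can still use it directly in fit(). A minimal standalone sketch on synthetic data, assuming an xgboost version (pre-2.0) whose fit() still accepts early_stopping_rounds:

import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Hypothetical standalone example, not the class method above.
rng = np.random.RandomState(0)
X = rng.rand(500, 10)
y = X[:, 0] * 3 + rng.randn(500) * 0.1

X_fit, X_val, y_fit, y_val = train_test_split(X, y, test_size=0.2, random_state=0)
reg = xgb.XGBRegressor(n_estimators=1000, learning_rate=0.1)
reg.fit(X_fit, y_fit,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=50,  # stop once the validation RMSE stops improving
        verbose=False)
print(reg.best_iteration)  # number of boosting rounds actually kept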
def optmodelcomp():
    """
    Compares the performance of optimal models, including ROC and PR curves,
    as well as the area under the curves. Also plots an example decision tree
    from the tree-based xgb models, along with the importance of the features.
    The optimization of the xgb models is performed in this function, while
    the NN model is previously optimized and the LogReg model has no
    hyperparameters.
    """
    loader = pulsardat()
    model1 = xgbtreeopter('dart')
    model2 = xgbtreeopter('gbtree')
    model3 = xgblinearopter()
    model4 = LogReg()
    model5 = NNmodel()
    model5.paramchanger('layers', [64, 2])  # selected values from NNopter
    model5.paramchanger('batch_size', 64)
    models = [model1, model2, model3, model4, model5]
    A = analyze(models, loader)
    Ks = [3, 4, 5]
    funcs = [PRcurve, metrics.roc_curve]
    figs = A.kfold_analysis(Ks, funcs)

    plt.figure(figs[0].number)
    plt.xlabel("Recall", fontsize=14)
    plt.ylabel("Precision", fontsize=14)
    plt.grid()
    plt.legend()
    plt.savefig("Auc_PR.png")

    plt.figure(figs[1].number)
    plt.xlabel("False positive ratio", fontsize=14)
    plt.ylabel("True positive ratio", fontsize=14)
    plt.grid()
    plt.legend()
    plt.savefig("Auc_ROC.png")

    plt.figure()
    A.models[1].model.fit(A.df[A.xlabels].values, A.df[A.ylabels].values)
    xgb.plot_tree(A.models[1].model)
    fig = plt.gcf()
    fig.set_size_inches(150, 100)
    plt.savefig("tree_plot.pdf")

    plt.figure()
    importances = A.models[1].feature_importances()
    plt.bar(np.arange(len(A.xlabels)), importances)
    plt.ylabel("Importance", fontsize=14)
    plt.xlabel("Predictor number", fontsize=14)
    plt.savefig("importance.png")
    plt.show()
def plot(self):
    fig, ax = plt.subplots()
    # importance_type can be 'weight', 'gain', or 'cover'
    xgb.plot_importance(self.model, ax=ax, fmap=feature_file)
    plt.savefig(plots_folder / 'XGB_importance_{}.png'.format(self.accesses))

    fig, ax = plt.subplots(constrained_layout=True, figsize=(150, 100))
    xgb.plot_tree(self.model, ax=ax, fmap=feature_file, rankdir='LR')
    plt.savefig(plots_folder / 'XGB_tree_{}.png'.format(self.accesses))
def visualize_model(self, model_filepath):
    """Visualizes one tree of a model for reporting purposes."""
    bst = xgb.Booster()
    bst.load_model(model_filepath)

    # plot the model; the keyword is rankdir, not rank_dir
    xgb.plot_tree(bst, num_trees=1, rankdir='LR')
    plt.show()
def XGBoost(self, X, yy, y):
    # Reshape from 3D to 2D: https://stackoverflow.com/questions/61573260/reshape-3d-numpy-array-of-images-to-2d-numpy-array-for-xgboost-dmatrix-input
    X = X.reshape(X.shape[0], -1)
    print(X.shape)

    # PCA dimensionality reduction
    #pca = PCA(n_components=2)
    #principalComponents = pca.fit_transform(X)
    #principalDf = pd.DataFrame(data=principalComponents, columns=['pc1', 'pc2'])
    #X = np.array(principalDf['pc1'].tolist())
    #X = pca.components_

    # Split the dataset
    x_train, x_test, y_train, y_test = train_test_split(X, yy, test_size=0.2,
                                                        random_state=42, stratify=y)
    print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

    dtrain = xgb.DMatrix(data=x_train, label=y_train)
    dtest = xgb.DMatrix(data=x_test, label=y_test)
    eval_list = [(dtest, 'eval')]

    # Train the model
    params = {
        'max_depth': 3,
        'objective': 'multi:softmax',  # error evaluation for multiclass training
        'num_class': 3,
        'tree_method': 'gpu_hist'
    }
    model = xgb.train(params, dtrain, evals=eval_list,
                      early_stopping_rounds=20, verbose_eval=True)

    # Evaluate predictions
    y_pred = model.predict(dtest)
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test.argmax(axis=1), predictions)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))

    # Plots
    xgb.plot_tree(model)
    fig = matplotlib.pyplot.gcf()
    fig.set_size_inches(150, 150)
    fig.savefig('xgboost/tree.png')

    # Confusion matrix
    #cm = confusion_matrix(y_train.argmax(axis=1), y_pred.argmax(axis=0))
    #self.plot_confusion_matrix(cm, self.target_names)

    # Save the model
    model.save_model('./saved_models/xgboost_audio_classifier.hdf5')
    print('Complete.')
def main(_):
    # create training data
    data = np.random.rand(5, 10)  # 5 entities, each with 10 features
    label = np.random.randint(2, size=5)  # binary target
    dtrain = xgb.DMatrix(data, label=label)
    #csr = scipy.sparse.csr_matrix((dat, (row, col)))  # data creation using scipy
    #dtrain = xgb.DMatrix(csr)

    # Booster parameters
    param = {
        'max_depth': 2,
        'eta': 1,
        'silent': 1,
        'objective': 'binary:logistic'
    }
    param['nthread'] = 4
    param['eval_metric'] = 'auc'

    # Evaluation parameters
    #param['eval_metric'] = ['auc', 'ams@0']
    # items() returns a view in Python 3; make it a list before appending
    plst = list(param.items())
    plst += [('eval_metric', 'ams@0')]

    # Testing data
    data = np.random.rand(7, 10)  # 7 entities, each with 10 features
    dtest = xgb.DMatrix(data)

    # Specify validation sets to watch performance
    evallist = [(dtest, 'eval'), (dtrain, 'train')]

    # Training
    num_round = 10
    bst = xgb.train(plst, dtrain, num_round, evallist)

    bst.save_model('0001.model')  # saving the model
    bst.dump_model('dump.raw.txt')  # dump model
    bst.dump_model('dump.raw.txt', 'featmap.txt')  # dump model with feature map

    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model('0001.model')  # load the model saved above

    # Testing
    ypred = bst.predict(dtest)
    #ypred = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)  # use this one only if early stopping is enabled in training

    # Plotting
    xgb.plot_importance(bst)
    xgb.plot_tree(bst, num_trees=2)
    xgb.to_graphviz(bst, num_trees=2)

    with open("results.txt", "w") as file:
        file.write(str(ypred))  # ypred is an ndarray; convert to str before writing
def train(self, dataset, model, logger=print):
    dtrain = xgb.DMatrix(dataset.data(), label=dataset.label())
    param = self.config['train'].get('param') or {}
    num_round = self.config['train']['epochs']
    evallist = [(dtrain, 'train')]
    model.set_model(xgb.train(param, dtrain, num_round, evallist))
    model.save()
    xgb.plot_importance(model.model())
    plt.savefig('importance.png')
    xgb.plot_tree(model.model(), num_trees=2)
    plt.savefig('tree.png')
def PlotModelTree(self, model, model_name):
    # plot tree
    plot_tree(model, rankdir='LR')
    fig = plt.gcf()
    fig.set_size_inches(25, 15)
    fig.savefig(self.getOutputFolder() + '/model_tree.png')

    # plot importances
    plot_importance(model)
    fig = plt.gcf()
    fig.set_size_inches(25, 25)
    fig.savefig(self.getOutputFolder() + '/feat_importances.png')
def tree_pic(self, features, fmap_filename, path_1):
    # write a feature-map file: one "<index>\t<name>\tq" line per feature
    with open(fmap_filename, 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))

    from xgboost import plot_tree
    plot_tree(self.model, num_trees=0, fmap=fmap_filename)
    fig = plt.gcf()
    fig.set_size_inches(15, 10)
    fig.savefig(path_1)
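The feature map written above follows xgboost's featmap convention: one tab-separated line of feature index, feature name, and type per feature, where 'q' marks a quantitative feature and 'i' a binary indicator. A minimal standalone sketch of writing and using such a file; the names and paths are illustrative:

# Illustrative feature names; 'i' marks 0/1 indicator features, 'q' quantitative ones.
features = [('age', 'q'), ('income', 'q'), ('is_member', 'i')]
with open('features.fmap', 'w') as f:
    for idx, (name, ftype) in enumerate(features):
        f.write('{0}\t{1}\t{2}\n'.format(idx, name, ftype))

# Any trained Booster can then be plotted with readable feature names:
# xgb.plot_tree(booster, fmap='features.fmap', num_trees=0)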
def model_plot():
    import matplotlib.pyplot as plt
    bst = xgb.Booster({"nthread": 4})  # init model
    bst.load_model("../data/model/xgb.model")  # load data
    # xgb.plot_importance(bst)
    # plt.show()

    # To plot the output tree via matplotlib, use plot_tree, specifying the
    # ordinal number of the target tree.
    xgb.plot_tree(bst, num_trees=2)
    plt.show()

    # When using IPython, you can use the to_graphviz function, which converts
    # the target tree to a graphviz instance. The graphviz instance is
    # automatically rendered in IPython.
    xgb.to_graphviz(bst, num_trees=2)
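Outside IPython the returned graphviz Digraph is not rendered automatically, but it can be written to disk with its render method. A small sketch, assuming the same model file as above:

import xgboost as xgb

bst = xgb.Booster()
bst.load_model("../data/model/xgb.model")  # path taken from the snippet above
g = xgb.to_graphviz(bst, num_trees=2)
g.render("tree_2", format="png", cleanup=True)  # writes tree_2.png via graphviz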
def feature_importance_xgboost():
    params = dict()
    params['eta'] = 0.3
    params['min_child_weight'] = 10
    params['colsample_bytree'] = 0.8  # fixed typo: 'cosample_bytree' would be silently ignored
    params['max_depth'] = 5
    params['subsample'] = 0.5
    params['gamma'] = 2.0
    params['alpha'] = 1.0
    config = {
        'eval_metric': 'rmse',
        'objective': 'reg:linear',
        'nthread': 4,
        'booster': 'gbtree',
        'tree_method': 'exact',
        'silent': 1
    }
    config = {**config, **params}

    xtrain, xtest, ytrain, ytest, fname = get_data_boston_with_fname()
    dtrain = xgb.DMatrix(xtrain, label=ytrain)
    dtest = xgb.DMatrix(xtest, label=ytest)
    evallist = [(dtrain, 'train'), (dtest, 'test')]
    num_boost_round = 10
    model = xgb.train(config, dtrain, num_boost_round, evals=evallist,
                      early_stopping_rounds=100, verbose_eval=True)

    # write a feature map so scores and plots use the real feature names
    fmap_fp = 'fmap.txt'
    with open(fmap_fp, 'w') as f:
        for i, feature in enumerate(fname):
            f.write('{0}\t{1}\tq\n'.format(i, feature))

    feature_weights = model.get_fscore(fmap=fmap_fp)
    feature_weights = sorted(feature_weights.items(), key=lambda x: x[1], reverse=True)
    print(feature_weights)

    model.save_model('model.bin')
    model.dump_model('desc.txt', fmap=fmap_fp)
    xgb.plot_tree(model, fmap=fmap_fp, num_trees=2)
    plt.show()
def plot_tree(self, num_trees=0, rankdir='UT', ax=None, **kwargs):
    """Plot the specified tree.

    Parameters
    ----------
    num_trees : int, default 0
        Ordinal number of the target tree.
    rankdir : str, default "UT"
        Passed to graphviz via graph_attr.
    ax : matplotlib Axes, default None
        Target axes instance. If None, a new figure and axes will be created.
    kwargs :
        Other keywords passed to to_graphviz.

    Returns
    -------
    ax : matplotlib Axes
    """
    import xgboost as xgb
    if not isinstance(self._df.estimator, xgb.XGBModel):
        raise ValueError('estimator must be XGBRegressor or XGBClassifier')
    return xgb.plot_tree(self._df.estimator.booster(), num_trees=num_trees,
                         rankdir=rankdir, **kwargs)
def test_sklearn_plotting():
    tm._skip_if_no_sklearn()
    from sklearn.datasets import load_iris

    iris = load_iris()
    classifier = xgb.XGBClassifier()
    classifier.fit(iris.data, iris.target)

    import matplotlib
    matplotlib.use('Agg')
    from matplotlib.axes import Axes
    from graphviz import Digraph

    ax = xgb.plot_importance(classifier)
    assert isinstance(ax, Axes)
    assert ax.get_title() == 'Feature importance'
    assert ax.get_xlabel() == 'F score'
    assert ax.get_ylabel() == 'Features'
    assert len(ax.patches) == 4

    g = xgb.to_graphviz(classifier, num_trees=0)
    assert isinstance(g, Digraph)

    ax = xgb.plot_tree(classifier, num_trees=0)
    assert isinstance(ax, Axes)
def test_plotting(self):
    bst2 = xgb.Booster(model_file='xgb.model')
    # plotting
    import matplotlib
    matplotlib.use('Agg')
    from matplotlib.axes import Axes
    from graphviz import Digraph

    ax = xgb.plot_importance(bst2)
    assert isinstance(ax, Axes)
    assert ax.get_title() == 'Feature importance'
    assert ax.get_xlabel() == 'F score'
    assert ax.get_ylabel() == 'Features'
    assert len(ax.patches) == 4

    ax = xgb.plot_importance(bst2, color='r', title='t', xlabel='x', ylabel='y')
    assert isinstance(ax, Axes)
    assert ax.get_title() == 't'
    assert ax.get_xlabel() == 'x'
    assert ax.get_ylabel() == 'y'
    assert len(ax.patches) == 4
    for p in ax.patches:
        assert p.get_facecolor() == (1.0, 0, 0, 1.0)  # red

    ax = xgb.plot_importance(bst2, color=['r', 'r', 'b', 'b'],
                             title=None, xlabel=None, ylabel=None)
    assert isinstance(ax, Axes)
    assert ax.get_title() == ''
    assert ax.get_xlabel() == ''
    assert ax.get_ylabel() == ''
    assert len(ax.patches) == 4
    assert ax.patches[0].get_facecolor() == (1.0, 0, 0, 1.0)  # red
    assert ax.patches[1].get_facecolor() == (1.0, 0, 0, 1.0)  # red
    assert ax.patches[2].get_facecolor() == (0, 0, 1.0, 1.0)  # blue
    assert ax.patches[3].get_facecolor() == (0, 0, 1.0, 1.0)  # blue

    g = xgb.to_graphviz(bst2, num_trees=0)
    assert isinstance(g, Digraph)

    ax = xgb.plot_tree(bst2, num_trees=0)
    assert isinstance(ax, Axes)
# plot decision tree
from numpy import loadtxt
from xgboost import XGBClassifier
from xgboost import plot_tree
from matplotlib import pyplot
# load data
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
# split data into X and y
X = dataset[:, 0:8]
y = dataset[:, 8]
# fit model on training data
model = XGBClassifier()
model.fit(X, y)
# plot single tree
plot_tree(model)
pyplot.show()
param['objective'] = 'multi:softprob'
param['eta'] = 1.3
param['max_depth'] = 6
param['silent'] = 1
param['nthread'] = 4
param['num_class'] = 3

watchlist = [(xg_train, 'train'), (xg_test, 'test')]
num_round = 100
bst = xgb.train(param, xg_train, num_round)
yprob = bst.predict(xg_test).reshape(y_test.shape[0], 3)
ylabel = np.argmax(yprob, axis=1)
print('predicting, classification error=%f'
      % (sum(int(ylabel[i]) != y_test[i] for i in range(len(y_test))) / float(len(y_test))))

fig, ax = plt.subplots(1, 1)
xgb.plot_tree(bst, ax=ax)
fig.savefig('analysis/output/xg.png', dpi=600)

ss = ShuffleSplit(n_splits=10, test_size=0.2, random_state=10)
total_score = 0.0
total_f1 = 0.0
runs = 0
for train, test in ss.split(X, y):
    X_train = np.array(X)[train]
    y_train = y[train]
    X_test = np.array(X)[test]
    y_test = y[test]
    count_vect = CountVectorizer(ngram_range=(1, 3))
testreader = csv.reader(open("../avtest.csv", 'r'), delimiter=",")
test = []
i = 0
for row in testreader:
    i += 1
    # convert strings to floats
    converted = []
    # trigger garbage collection every 5000 rows to free old 'converted' lists
    if i % 5000 == 0:
        print("clean up " + str(i))
        gc.collect()
    for j in row:
        if len(j) > 0:
            converted.append(float(j))
        else:
            converted.append(float("nan"))
    test.append(converted)
print("done looping")
test = numpy.array(test)
print(test[0])
print(test.shape)

dfintest = xgboost.DMatrix(test, missing=float("nan"))
finpred = model.predict(dfintest)
print(finpred)
to_output.to_output(finpred, "xpredictions.csv")

xgboost.plot_importance(model)
plt.show()
xgboost.plot_tree(model)
plt.show()
# Predict on the test set
X_test4_new = X_test[cols4]
dtest4_new = xgb.DMatrix(X_test4_new, label=y3_test, weight=wt4_test)
dtest4_new.save_binary('/Users/wangbruce/Google Drive/test4_new.buffer')
y4_pred_p = bst4.predict(dtest4_new)
y4_pred = [0 if x < 0.5 else 1 for x in y4_pred_p]
print((y4_pred == y3_test).mean())  # accuracy; print so the value is not discarded

#############################################################################
# plot
xgb.plot_importance(bst4)
plt.show()
xgb.plot_tree(bst4)
plt.show()

def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(range(0, 2)))
    plt.xticks(tick_marks, range(0, 2), rotation=45)
    plt.yticks(tick_marks, range(0, 2))
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    reg_alpha=0.05,
    reg_lambda=2,
    subsample=1.0,
    colsample_bytree=1.0,
    max_delta_step=1,
    scale_pos_weight=1,
    objective='multi:softprob',
    nthread=8,
    seed=0
    # ,
    # silent=False
)
print('training...')
xgb_model.fit(training, label)
print('predicting...')
predicted = xgb_model.predict_proba(testing)
predicted = pandas.DataFrame(predicted)
predicted.columns = xgb_model.classes_
# Name the index column.
predicted.index.name = 'Id'
# Write csv.
print('Saving prediction...')
predicted.to_csv('Prediction.csv')

# feature importance
feat_imp = pandas.Series(xgb_model.booster().get_fscore()).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Feature Importances')
matplotlib.pyplot.show()

plot_importance(xgb_model, title='Feature importance')
matplotlib.pyplot.show()

plot_tree(xgb_model, num_trees=0)
matplotlib.pyplot.show()
        colsample_bytree=colsample, subsample=subsample)
    m.fit(Xtr, ytr)
    pp = m.predict_proba(Xts)[:, 1]
    if FINAL_SUBMISSION:
        import datetime
        timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M')
        scores = np.c_[np.arange(len(pp)), pp]
        np.savetxt('../out/vilab-submission-%s.csv' % timestamp, scores,
                   '%d,%.8f', ',', header='id,probability', comments='')
        toc()
    else:
        toc('cs=%.2f md=%2d lr=%.2f mcw=%1d g=%d score=%.4f' % (
            colsample, max_depth, learning_rate, min_child_weight, gamma,
            roc_auc_score(yts, pp)))
        sys.stdout.flush()

import matplotlib.pyplot as plt
plt.ioff()
xgb.plot_importance(m, tick_label=names)
plt.savefig('xgb-features.pdf')
plt.show()
'''
xgb.plot_tree(m)
plt.savefig('xgb-tree.pdf', dpi=900)
plt.show()
'''
# plot decision tree
from numpy import loadtxt
from xgboost import XGBClassifier
from xgboost import plot_tree
from matplotlib import pyplot
# load data
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
# split data into X and y
X = dataset[:, 0:8]
y = dataset[:, 8]
# fit model on training data
model = XGBClassifier()
model.fit(X, y)
# plot single tree, left-to-right
plot_tree(model, num_trees=0, rankdir='LR')
pyplot.show()
#!/usr/bin/python
# -*- coding: utf-8 -*-

import numpy as np
from sklearn import datasets
from sklearn import tree
import graphviz
import xgboost as xgb
import matplotlib.axes as axes

iris = datasets.load_iris()
X = iris.data
y = iris.target

#clf = tree.DecisionTreeClassifier()
#clf = clf.fit(X, y)
#dot_data = tree.export_graphviz(clf, out_file=None)
#graph = graphviz.Source(dot_data)
#graph.render("iris")

clf = xgb.XGBClassifier()
clf = clf.fit(X, y)
xgb.plot_tree(clf)
## feval=None, maximize=False, early_stopping_rounds=None,
## evals_result=None, verbose_eval=True, learning_rates=None,
## xgb_model=None)
# evallist = [(dtest, 'eval'), (dtrain, 'train')]
watchlist = [(xg_train, 'train'), (xg_test, 'test')]
evals_result = {}
num_round = 10
# pass the watchlist as evals; evals_result is only populated for the supplied evals
bst = xgb.train(param, xg_train, num_round, watchlist, evals_result=evals_result)
pred = bst.predict(xg_test)
print('predicting, classification error=%f'
      % (sum(int(pred[i]) != test_Y[i] for i in range(len(test_Y))) / float(len(test_Y))))
xgb.plot_importance(bst)
xgb.plot_tree(bst, num_trees=2)

#=============Logistic Regression==============================================

# Define the sigmoid function
def sigmoid(z):
    return 1 / (1 + e**(-z))

# Calculate the cost to be minimized -- using the sigmoid function
def cost(theta, X, y, l):
    m = X.shape[0]  # number of rows in the data
    z = X.dot(theta)
    O = (-1 / m) * (log(sigmoid(z)).T.dot(y) + log(1 - sigmoid(z)).T.dot(1 - y))
    # print(m)
    # print(theta)
    # print(theta[1:])
# In[126]:

gbb = gb.booster()
gbb.dump_model('trees.txt')

# In[118]:

import matplotlib.pyplot as plt
# %matplotlib inline
fig = plt.figure(figsize=[10, 10])
ax = fig.gca()
gb = gbm[0]
xgb.plot_tree(gb, num_trees=13, ax=ax)

# In[ ]:

import matplotlib.pyplot as plt
import numpy as np

# mapping from short feature codes to human-readable labels ('no.' = number of)
dic = {'lgt mean': 'light intensity mean',
       'lgt std': 'light intensity variance',
       'lgt off': 'darkness duration',
       'lgt zcrossing': 'light change',
       'lgt skew': 'light intensity skewness',
       'lgt kurt': 'light intensity kurtosis',
       'aud mean': 'sound amplitude mean',
       'aud std': 'sound amplitude variance',
       'aud skew': 'sound amplitude skewness',
       'aud kurt': 'sound amplitude kurtosis',
       'aud frq mean': 'sound frequency mean',
       'aud frq std': 'sound frequency variance',
       'aud frq skew': 'sound frequency skewness',
       'aud frq kurt': 'sound frequency kurtosis',
       'scr frq': 'screen on/off frequency',
       'scr dur mean': 'screen on time',
       'scr dur std': 'screen on time variance',
       'still': 'stillness time',
       'tilting': 'tilting time',
       'walking': 'walking time',
       'unknown act': 'unknown activity time',
       'still-walking': 'still/walking transition',
       'still-tilting': 'still/tilting transition',
       'still-unknown': 'still/unknown transition',
       'walking-unknown': 'walking/unknown transition',
       'call in': 'no. incoming calls',
       'call out': 'no. outgoing calls',
       'sms in': 'no. incoming sms',
       'sms out': 'no. outgoing sms',
       'call missed': 'no. missed calls',
       'n wifi': 'no. wifi nets',
       'temperature': 'outside temperature',
       'dew point': 'outside windchill',
       'weather': 'outside weather',
       'lat mean': 'latitude mean',
       'lng mean': 'longitude mean',
       'loc var': 'location variance',
       'duration': 'visit timespan',
       'midtime': 'visit timestamp',
       'midhour': 'visit time of day',
       'dow start': 'arrive day of week',
       'dow end': 'leave day of week',
       'fsq 0': 'Foursquare Nightlife Spot',
       'fsq 1': 'Foursquare Outdoors & Recreation',
       'fsq 2': 'Foursquare Arts & Entertainment',
       'fsq 3': 'Foursquare Professional or Medical Office',
       'fsq 4': 'Foursquare Food',
       'fsq 5': 'Foursquare Home',
       'fsq 6': 'Foursquare Shop or Store',
       'fsq 7': 'Foursquare Travel or Transport',
       'fsq 8': 'Foursquare Unknown',
       'fsq distance': 'Foursquare distance',
       'LT frequency': 'visit frequency',
       'LT interval mean': 'mean time between visits',
       'n gps': 'visit duration'}

# extracting means and CIs
feature_label = x_train.columns
fscore = pd.DataFrame(index=np.arange(n_bootstrap), columns=feature_label)
for i in range(n_bootstrap):
    # list() so numpy gets a sequence rather than a Python 3 dict view
    keys = np.array(list(gbm[i].booster().get_fscore().keys()))