class TestNode(unittest.TestCase):
    """Check that ``bdt2cpp.Node`` parses lines of an XGBoost text dump."""

    def setUp(self):
        """Fit a 3-tree classifier on synthetic data and keep its text dump.

        ``self.model_dump`` holds one entry per tree; each entry is that
        tree's dump string split on newlines.
        """
        X, y = make_classification(n_features=5, random_state=1)
        self.classifier = XGBClassifier(n_estimators=3)
        self.classifier.fit(X, y)
        self.predictions = self.classifier.predict_proba(X)
        # ``booster()`` was renamed ``get_booster()``; on current xgboost
        # ``booster`` is a plain attribute and cannot be called. Prefer the
        # new accessor and fall back to the legacy method only if needed.
        get_booster = (getattr(self.classifier, 'get_booster', None)
                       or self.classifier.booster)
        self.model_dump = [
            tree.split('\n') for tree in get_booster().get_dump()
        ]

    def test_parse_root_node(self):
        """Parse the first line of the first tree as a root node.

        Returns the parsed node so dependent tests can attach children.
        """
        node = bdt2cpp.Node(self.model_dump[0][0])
        self.assertEqual(node.cut_value, -0.464102)
        self.assertFalse(node.weight)
        self.assertEqual(node.root, node)
        self.assertIsNone(node.parent)
        return node

    def test_parse_left_node(self):
        """Parse the second dump line as the left child of the root.

        Returns the parsed child node.
        """
        node = self.test_parse_root_node()
        root = node
        node.left = bdt2cpp.Node(self.model_dump[0][1], parent=node)
        node = node.left
        self.assertEqual(node.parent, root)
        self.assertFalse(node.cut_value)
        self.assertEqual(node.weight, -0.184906)
        return node
def get_xgb_feature_importance_plot(best_param_, experiment_, png_folder, png_fname, score_threshold=0.8):
    """Fit an XGBoost classifier and save a bar plot of its feature f-scores.

    Args:
        best_param_: mapping of ``XGBClassifier`` parameters. A
            ``'model_type'`` entry, if present, is ignored. The mapping is
            not modified.
        experiment_: object whose ``get_train_data()`` returns
            ``(train_X, train_y)``.
        png_folder: unused; the image is written below
            ``Config.get_string('data.path')/graph``.
        png_fname: file name of the image inside that directory.
        score_threshold: quantile of the positive scores. When more than 20
            features received a score, only features at or above this
            quantile are plotted.

    Returns:
        True after the figure has been saved and closed.
    """
    # 1. fit the model with the supplied parameters
    train_X, train_y = experiment_.get_train_data()
    params = {key: value for key, value in best_param_.items()
              if key != 'model_type'}
    clf = XGBClassifier()
    clf.set_params(**params)
    clf.fit(train_X, train_y)

    # ``booster()`` was renamed ``get_booster()``; fall back to the legacy
    # method only when the new accessor is missing.
    get_booster = getattr(clf, 'get_booster', None) or clf.booster
    index2feature = get_booster().get_fscore()

    # Materialise the dict views so pandas receives real sequences.
    fis = pd.DataFrame({'name': list(index2feature.keys()),
                        'score': list(index2feature.values())})
    # DataFrame.sort() no longer exists; sort_values() is the replacement.
    fis = fis.sort_values('score', ascending=False)
    if len(fis.index) > 20:
        cutoff = fis['score'][fis['score'] > 0.0].quantile(score_threshold)
        fis = fis[fis['score'] >= cutoff]

    # 2. plot the feature importances
    sns.barplot(x='score', y='name', data=fis, color="blue")
    plt.ylabel("Feature", fontsize=10)
    plt.xlabel("Feature_Importance : f-Score", fontsize=10)

    png_fname = os.path.join(Config.get_string('data.path'), 'graph', png_fname)
    plt.tight_layout()
    plt.savefig(png_fname)
    plt.close()
    return True
def xgb_train_offline():
    """Train an XGBoost model offline and report validation loss and f-scores.

    Loads data through ``get_data``/``fea_select``, drops the ID columns,
    fits the classifier with train and validation sets as eval sets, prints
    the validation log-loss, writes the per-feature scores to
    ``file_path + 'fea_score.csv'`` and shows them as a bar chart.
    """
    print('data process...')
    data = get_data()
    data = fea_select(data)
    id_fea = ['user_id', 'item_id', 'shop_id', 'context_id', 'context_page_id']
    data.drop(id_fea, axis=1, inplace=True)  # ID features are not used for now

    train, val, _ = gen_train_val_test(data, True)
    y_train = train['is_trade']
    X_train = train.drop(['is_trade'], axis=1)
    y_val = val['is_trade']
    X_val = val.drop(['is_trade'], axis=1)

    print('start training...')
    model = XGBClassifier(objective='binary:logistic',
                          learning_rate=0.01,
                          n_estimators=10,
                          max_depth=5,
                          subsample=0.7,
                          colsample_bytree=0.7,
                          reg_lambda=0.005,
                          nthread=4,
                          seed=128,
                          silent=10)
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_val, y_val)],
              eval_metric='logloss',
              early_stopping_rounds=50)

    y_prob = model.predict_proba(X_val, ntree_limit=model.best_ntree_limit)[:, 1]
    print('result log_loss = {0}'.format(log_loss(y_val, y_prob)))

    # ``booster()`` was renamed ``get_booster()``; fall back to the legacy
    # method only when the new accessor is missing.
    get_booster = getattr(model, 'get_booster', None) or model.booster
    fscore = get_booster().get_fscore()

    # get_fscore() only lists features that were used in a split, so look
    # each column up by name and give unused features a score of 0. This
    # keeps 'feature' and 'score' aligned and equally long.
    features = X_train.columns.tolist()
    fea_score = pd.DataFrame({
        'feature': features,
        'score': [fscore.get(name, 0) for name in features],
    })
    fea_score = fea_score.sort_values(by='score', ascending=False).reset_index(drop=True)
    fea_score.to_csv(file_path + 'fea_score.csv', index=None)

    fea_map = pd.Series(data=fea_score['score'].values, index=fea_score['feature'])
    fea_map.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.show()
    print('end...')
def XGBooster(Xtrain, Ytrain, Xtest, Ytest):
    """Fit a default ``XGBClassifier`` and evaluate it on the test split.

    Args:
        Xtrain: training features; must expose ``.columns`` (used as the
            index of the importance table).
        Ytrain: training labels.
        Xtest: test features.
        Ytest: test labels.

    Returns:
        A 6-tuple ``(importances, model, yPred, predictions, accuracy, None)``:
        a one-column ``'importance'`` DataFrame sorted descending, the fitted
        model, raw predictions, rounded predictions and the accuracy. The
        trailing ``None`` is kept for callers that unpack six values (it used
        to be the result of an inline ``print``).
    """
    # Train
    XG = XGBClassifier()
    XG.fit(Xtrain, Ytrain)

    # Feature importances.
    # ``booster()`` was renamed ``get_booster()``; fall back to the legacy
    # method only when the new accessor is missing.
    get_booster = getattr(XG, 'get_booster', None) or XG.booster
    fscore = get_booster().get_fscore()
    # Build the table from a Series keyed by feature name: passing the dict
    # straight to DataFrame(..., columns=['importance']) treats the feature
    # names as columns and does not produce the intended 'importance' column.
    # Features never used in a split get 0.
    XGFeature_importances = (
        pd.Series(fscore, dtype=float)
        .reindex(Xtrain.columns, fill_value=0.0)
        .to_frame('importance')
        .sort_values('importance', ascending=False)
    )

    # Test
    yPred = XG.predict(Xtest)
    XGPredictions = [round(value) for value in yPred]
    XGAccuracy = accuracy_score(Ytest, XGPredictions)
    print("Accuracy: %.2f%%" % (XGAccuracy * 100.0))
    return XGFeature_importances, XG, yPred, XGPredictions, XGAccuracy, None
# train
clf.fit(X, y, sample_weight=w)

# save results
if options.optimize:
    # ``clf`` is a parameter search here: persist the winning parameters and
    # the full cross-validation table.
    with open('%s/best_params.json' % options.out_dir, 'w+') as fout:
        fout.write(json.dumps(clf.best_params_))
    pd.DataFrame(clf.cv_results_).to_hdf('%s/cv_results.hd5' % options.out_dir, key='cv_results')
    if options.refit:
        clf = clf.best_estimator_
else:
    with open('%s/best_params.json' % options.out_dir, 'w+') as fout:
        fout.write(json.dumps(options.clf_params))

# Persist the model only when ``clf`` is a fitted estimator: either no search
# was run, or the search was refit and unwrapped above. The original condition
# read ``optimize.optimize`` (an undefined name) instead of
# ``options.optimize``; ``not A or (A and B)`` reduces to ``not A or B``.
if not options.optimize or options.refit:
    if options.save_pickle:
        # The ``with`` block closes the file; no explicit close() is needed.
        with gopen('%s/model.pkl.gz' % options.out_dir, 'w+') as fout:
            pickle.dump(clf, fout)
    # ``booster()`` was renamed ``get_booster()``; fall back to the legacy
    # method only when the new accessor is missing.
    get_booster = getattr(clf, 'get_booster', None) or clf.booster
    model = get_booster()
    model.save_model('%s/model.xgb' % options.out_dir)

##
##
##    # train it
##    clf.fit(X_train,y_train,w_train)
# Append the bag-of-words features and the zero-count column, then run the
# preprocessing pipeline over train and test rows together.
df_all = pd.concat([df_all, bow], axis=1)
df_all['num_zero'] = num_zero
df_all = pipeline.fit(df_all).transform(df_all)

# The first df_train.shape[0] rows are the training rows, the rest are test.
X_train = df_all.iloc[:df_train.shape[0], :]
X_test = df_all.iloc[df_train.shape[0]:, :]
y_train = df_target
ID_test = df_id

# best params so far using column/row subsampling, even longer training
learning_rate = 0.01
n_estimators = 800
max_depth = 6
subsample = 0.9
colsample_bytree = 0.85
min_child_weight = 1  # default

xgb = XGBClassifier(seed=0,
                    learning_rate=learning_rate,
                    n_estimators=n_estimators,
                    min_child_weight=min_child_weight,
                    max_depth=max_depth,
                    colsample_bytree=colsample_bytree,
                    subsample=subsample)
xgb = xgb.fit(X_train, y_train, eval_set=[(X_train, y_train)], eval_metric='auc')

# ``booster()`` was renamed ``get_booster()``; fall back to the legacy method
# only when the new accessor is missing.
importances = (xgb.get_booster() if hasattr(xgb, 'get_booster')
               else xgb.booster()).get_fscore()
# items() is materialised so DataFrame receives a list of (feature, score)
# pairs rather than a one-shot iterator.
df_importance = pd.DataFrame(list(importances.items()),
                             columns=['feature', 'importance'])
# print() with a single argument behaves the same on Python 2 and 3.
print(df_importance.sort_values('importance', ascending=False).reset_index(drop=True))

y_pred = xgb.predict_proba(X_test)
submission = pd.DataFrame({'ID': ID_test, 'TARGET': y_pred[:, 1]})
submission.to_csv(filename, index=False)
print('Wrote %s' % filename)
def get_xgb_feature_importance_plot(best_param_,
                                    experiment_,
                                    png_folder,
                                    png_fname,
                                    score_threshold=0.8):
    """Fit an XGBoost classifier and save a bar plot of its feature f-scores.

    Args:
        best_param_: mapping of ``XGBClassifier`` parameters. A
            ``'model_type'`` entry, if present, is ignored. The mapping is
            not modified.
        experiment_: object whose ``get_train_data()`` returns
            ``(train_X, train_y)``.
        png_folder: unused; the image is written below
            ``Config.get_string('data.path')/graph``.
        png_fname: file name of the image inside that directory.
        score_threshold: quantile of the positive scores. When more than 20
            features received a score, only features at or above this
            quantile are plotted.

    Returns:
        True after the figure has been saved and closed.
    """
    # 1. fit the model with the supplied parameters
    train_X, train_y = experiment_.get_train_data()
    params = {
        key: value
        for key, value in best_param_.items() if key != 'model_type'
    }
    clf = XGBClassifier()
    clf.set_params(**params)
    clf.fit(train_X, train_y)

    # ``booster()`` was renamed ``get_booster()``; fall back to the legacy
    # method only when the new accessor is missing.
    get_booster = getattr(clf, 'get_booster', None) or clf.booster
    index2feature = get_booster().get_fscore()

    # Materialise the dict views so pandas receives real sequences.
    fis = pd.DataFrame({
        'name': list(index2feature.keys()),
        'score': list(index2feature.values())
    })
    # DataFrame.sort() no longer exists; sort_values() is the replacement.
    fis = fis.sort_values('score', ascending=False)
    if len(fis.index) > 20:
        cutoff = fis['score'][fis['score'] > 0.0].quantile(score_threshold)
        fis = fis[fis['score'] >= cutoff]

    # 2. plot the feature importances
    sns.barplot(x='score', y='name', data=fis, color="blue")
    plt.ylabel("Feature", fontsize=10)
    plt.xlabel("Feature_Importance : f-Score", fontsize=10)

    png_fname = os.path.join(Config.get_string('data.path'), 'graph',
                             png_fname)
    plt.tight_layout()
    plt.savefig(png_fname)
    plt.close()
    return True
colsample_bytree=0.8, # 0.9 scale_pos_weight=14, # 10 objective="reg:logistic", nthread=-1, seed=random_seed) # analyse_n_estimators(model, train_x, train_y, est_list=[8000, 9000, 10000, 11000, 12000]) # params = {"max_depth": [3, 4, 5], # "min_child_weight": [1, 10, 100],} # params = {"min_child_weight": [5, 7, 9, 10, 11, 13, 15]} # params = {"subsample": [0.7, 0.8, 0.9, 1], # "colsample_bytree": [0.7, 0.8, 0.9, 1],} # params = {"reg_lambda": [0.1, 1, 10, 100]} # params = {"scale_pos_weight": [6, 8, 10, 12, 14, 16, 18]} # g_model = GridSearchCV(model, param_grid=params, scoring="f1", cv=5, n_jobs=-1, iid=False, verbose=0) # g_model.fit(train_x, train_y) # print(g_model.grid_scores_) # print(g_model.best_score_) # print(g_model.best_params_) # log.info("Best parameter is {}, with score {}".format(g_model.best_params_, g_model.best_score_)) model.fit(train_x, train_y) pred = model.predict(test_x) test_uid = test.iloc[:, 0] result = pd.DataFrame(columns=["uid", "label"]) result["uid"] = test_uid result["label"] = pred result.to_csv("result.csv", index=False) model.booster().save_model("2.model")
class classifier:
    """Wrap an ``XGBClassifier`` with one-parameter-at-a-time grid tuning.

    ``model`` is the wrapped estimator; ``progress`` counts the tuning steps
    finished so far and drives the console progress bar.
    """

    # (parameter, candidate values) pairs in the order tune() searches them.
    # Each result is written to the model before the next search starts.
    _TUNING_SCHEDULE = (
        ('max_depth', [2, 4, 6, 8]),
        ('min_child_weight', [4, 8, 12, 16]),
        ('gamma', [0, 0.1, 0.2, 0.4, 0.8, 1.6, 3.2, 6.4, 12.8]),
        ('max_delta_step', [0, 1, 2, 4]),
        ('colsample_bytree', [0.5, 0.6, 0.7]),
        ('reg_alpha', [0, 0.001, 0.01, 0.1, 10, 100]),
        ('reg_lambda', [0, 0.001, 0.01, 0.1, 10, 100]),
    )

    # Parameter names accepted by para_tuning().
    _TUNABLE = frozenset((
        'learning_rate', 'max_depth', 'min_child_weight', 'gamma',
        'max_delta_step', 'colsample_bytree', 'reg_alpha', 'reg_lambda',
    ))

    def __init__(self):
        self.model = XGBClassifier()
        self.progress = 0

    def _get_booster(self):
        """Return the fitted model's Booster.

        ``booster()`` was renamed ``get_booster()``; the legacy method is
        used only when the new accessor is missing.
        """
        getter = getattr(self.model, 'get_booster', None) or self.model.booster
        return getter()

    def para_tuning(self, X, y, para, grid, seed=0, verbose=False):
        """Grid-search a single parameter with 8-fold stratified CV.

        Args:
            X, y: training data passed to ``GridSearchCV.fit``.
            para: name of the parameter to tune; must be one of the
                supported names.
            grid: candidate values for ``para``.
            seed: random state of the stratified folds.
            verbose: 1 prints the per-value scores, 2 plots them, 3 does
                both; any other value only updates the progress bar.

        Returns:
            The best value found, or None when ``para`` is empty or not a
            supported parameter name.
        """
        if para == '':
            return None
        if para not in self._TUNABLE:
            print('WRONG PARAMETER.')
            return None

        kfold = StratifiedKFold(n_splits=8, shuffle=True, random_state=seed)
        grid_search = GridSearchCV(self.model, {para: grid},
                                   scoring='accuracy', n_jobs=-1, cv=kfold)
        grid_result = grid_search.fit(X, y)

        # summarize results
        means = grid_result.cv_results_['mean_test_score']
        stds = grid_result.cv_results_['std_test_score']
        params = grid_result.cv_results_['params']
        if verbose in (1, 3):
            for mean, stdev, param in zip(means, stds, params):
                print('{:.4f} ({:.4f}) WITH: {} = {}'.format(
                    mean, stdev, para, param[para]))
            print('-' * 63)

        # One more step done: redraw the 58-character progress bar in place.
        self.progress += 1
        steps = len(self._TUNING_SCHEDULE)
        progress = int(self.progress / steps * 100)
        progress_bar = int(self.progress / steps * 58)
        print('\r' + '█' * progress_bar + ' ' * (58 - progress_bar) +
              ' {:>3}%'.format(progress), end='')

        if verbose in (2, 3):
            plt.close()
            plt.figure(figsize=(20, 10))
            plt.errorbar(grid, means, yerr=stds)
            plt.title('XGBoost {} Tuning'.format(para))
            plt.xlabel(para)
            plt.ylabel('accuracy')
            plt.show()

        return grid_result.best_params_[para]

    def tune(self, X, y, verbose=False, seed=0):
        """Tune the model's parameters one after another on ``(X, y)``.

        Fixes ``n_estimators``, ``subsample`` and ``learning_rate``, then
        searches every entry of the tuning schedule in order, halves the
        learning rate and finally asks on stdin whether to print the model.
        """
        self.model.seed = seed
        # Restart the bar so a second tune() call does not run past 100%.
        self.progress = 0

        print('-' * 63)
        print('AUTO TUNING ON TRAINING DATASET.')
        self.model.n_estimators = 1024
        self.model.subsample = 0.6
        self.model.learning_rate = 0.01
        for name, grid in self._TUNING_SCHEDULE:
            best = self.para_tuning(X, y, name, grid, seed, verbose)
            setattr(self.model, name, best)
        self.model.learning_rate /= 2

        sleep(3)
        print('\rAUTO TUNING FINISHED.' + ' ' * 42)
        print('-' * 63)
        if input('MODEL REVIEWING? (Y/N) ') == 'Y':
            print(self.model)

    def train(self, data, early_stopping_rounds=None, verbose=True, seed=0):
        """Tune, fit and evaluate the model, optionally plotting the results.

        Args:
            data: object with ``train`` and ``test`` as ``(X, y)`` pairs and
                ``features`` as the list of feature names.
            early_stopping_rounds: forwarded to ``XGBClassifier.fit``.
            verbose: when exactly True, plot the metric curves and the
                feature importances after training.
            seed: forwarded to ``tune``.
        """
        X_train, y_train = data.train[0], data.train[1]
        X_test, y_test = data.test[0], data.test[1]

        # tune parameters using the training dataset
        self.tune(X_train, y_train, seed=seed)
        print('-' * 63)

        # train the model with optimized parameters
        print('MODEL TRAINING.')
        metric = ['error', 'logloss', 'auc']
        self.model.fit(X_train, y_train,
                       eval_metric=metric,
                       eval_set=[(X_train, y_train), (X_test, y_test)],
                       early_stopping_rounds=early_stopping_rounds,
                       verbose=False)

        # accuracy on the training data
        predictions = [round(value) for value in self.model.predict(X_train)]
        accuracy = accuracy_score(y_train, predictions)
        print('TRAINING FINISHED.')
        print('ACCURACY TRAINING: {:.2f}%'.format(accuracy * 100))

        # accuracy on the test data
        predictions = [round(value) for value in self.model.predict(X_test)]
        accuracy = accuracy_score(y_test, predictions)
        print('ACCURACY TESTING: {:.2f}%'.format(accuracy * 100))

        if verbose is True:
            # Plotting is best-effort, but report why it failed and let
            # KeyboardInterrupt/SystemExit propagate.
            try:
                self._plot_training(data.features, metric)
            except Exception as exc:
                print('PLOTTING ERROR.')
                print(repr(exc))

    def _plot_training(self, features, metric):
        """Plot the train/test curve of every metric and the feature scores."""
        results = self.model.evals_result()
        epochs = len(results['validation_0'][metric[0]])
        x_axis = range(0, epochs)
        plt.style.use('ggplot')
        plt.rcParams['font.size'] = 8
        plt.figure(figsize=(20, 10))

        # Left column: one row per metric.
        for row, m in enumerate(metric):
            ax = plt.subplot2grid((len(metric), 2), (row, 0))
            ax.plot(x_axis, results['validation_0'][m], label='Train')
            ax.plot(x_axis, results['validation_1'][m], label='Test')
            ax.legend()
            ax.set_ylabel(m)

        # Right column: feature importances. Generated names ('f0', 'f1', ...)
        # are translated to real names; keys that are already real names are
        # kept instead of being turned into NaN.
        mapFeat = {'f' + str(i): name for i, name in enumerate(features)}
        imp = pd.Series(self._get_booster().get_fscore())
        imp.index = [mapFeat.get(key, key) for key in imp.index]
        ax = plt.subplot2grid((len(metric), 2), (0, 1), rowspan=len(metric))
        imp.sort_values().plot(kind='barh')
        ax.set_ylabel('importance')
        plt.show()
class XGBoostModel:
    """XGBoost classifier, optionally stacked on a random-forest probability.

    When ``use_rfc`` is true, the probabilities of a pickled ``RFCModel`` are
    appended to the feature frame as the ``'rfc_proba'`` column.
    """

    def __init__(self, use_rfc=True):
        self.use_rfc = use_rfc
        if self.use_rfc:
            # Instantiate Random Forest Classifier
            self.rfc = RFCModel()
            self.rfc.unpickle()

    def _get_booster(self):
        """Return the fitted model's Booster.

        ``booster()`` was renamed ``get_booster()``; the legacy method is
        used only when the new accessor is missing.
        """
        getter = getattr(self.model, 'get_booster', None) or self.model.booster
        return getter()

    def load_train_data(self):
        """Load ``data/data.json`` and create a stratified 80/20 split.

        Sets ``df``, ``features`` and the ``X_train``/``X_test``/``y_train``/
        ``y_test`` attributes.
        """
        self.df, y, _ = clean_df('data/data.json', training=True)
        if self.use_rfc:
            # Include results from random forest classifier as new column
            self.df['rfc_proba'] = self.rfc.predict_proba_all()
        X = self.df.values
        self.features = self.df.columns
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.20, stratify=y, random_state=42)

    def load_test_data(self):
        """Load ``data/data_point.json``; return ``(feature values, oid)``."""
        self.df, _, oid = clean_df('data/data_point.json', training=False)
        if self.use_rfc:
            # Include results from random forest classifier as new column
            self.df['rfc_proba'] = self.rfc.predict_proba('data/data_point.json')
        return self.df.values, oid

    def load_one(self, one_json):
        """Clean a single JSON record; return ``(feature values, oid)``.

        ``one_json`` is wrapped in brackets before being passed to
        ``clean_df``.
        """
        self.df, _, oid = clean_df('[' + one_json + ']', training=False)
        if self.use_rfc:
            # Include results from random forest classifier as new column
            # NOTE(review): this scores 'data/data_point.json', not
            # ``one_json`` -- confirm that this is intended.
            self.df['rfc_proba'] = self.rfc.predict_proba('data/data_point.json')
        return self.df.values, oid

    def fit(self):
        """Fit the classifier on the split made by ``load_train_data``."""
        self.model = XGBClassifier(
            max_depth=8,
            # reg_alpha=.8,
            n_estimators=200,
            scale_pos_weight=10.13,
            learning_rate=0.1)
        self.model.fit(self.X_train, self.y_train)

    @property
    def feature_importances_(self):
        """F-scores of all booster features, normalised to sum to 1.

        Features missing from the f-score dict count as 0. When every score
        is 0 the zeros are returned as they are instead of dividing by zero.
        """
        b = self._get_booster()
        fs = b.get_fscore()
        all_features = np.array([fs.get(f, 0.) for f in b.feature_names],
                                dtype=np.float32)
        total = all_features.sum()
        if total == 0:
            return all_features
        return all_features / total

    def plot_features(self, save_img_dir=None, img_name_prefix='', ext='svg'):
        '''Plot the normalised f-score of every feature as a line and a bar chart.

        Use ext='svg' for the web.
        save_img_dir: directory to save the images in, WITHOUT a trailing
            slash (e.g. 'static/images'); nothing is saved when it is None.
        img_name_prefix: prepended to the image file names so several sets
            of images can be kept side by side.
        '''
        # this is needed to fix label clipping in saved files
        from matplotlib import rcParams
        rcParams.update({'figure.autolayout': True})

        b = self._get_booster()
        fs = b.get_fscore()
        # The f-score dict is looked up with generated names ('f0', 'f1',
        # ...); translate them to the real feature names by position.
        all_features = {
            self.features[i]: float(fs.get('f' + str(i), 0.))
            for i in range(len(b.feature_names))
        }
        importance = sorted(all_features.items(), key=itemgetter(1))
        ff = pd.DataFrame(importance, columns=['feature', 'fscore'])
        ff['fscore'] = ff['fscore'] / ff['fscore'].sum()

        # plot 1: line chart
        ax = ff.fscore.plot(xticks=ff.index, rot=65)
        ax.set_xticklabels(ff.feature)
        plt.title('XGBoost F-scores by feature')
        if save_img_dir is not None:
            plt.savefig('{}/{}feature_fscores.{}'.format(
                save_img_dir, img_name_prefix, ext))
        plt.show()

        # plot 2: horizontal bar chart
        ff.plot(kind='barh', x='feature', y='fscore', legend=False,
                figsize=(6, 10))
        plt.title('XGBoost Feature Importance')
        plt.xlabel('relative importance')
        if save_img_dir is not None:
            plt.savefig('{}/{}features_barh.{}'.format(
                save_img_dir, img_name_prefix, ext))
        plt.show()
        plt.close()

    def pickle(self):
        """Store the fitted model in ``data/XGBoostModel.pkl``."""
        _pickle(self.model, 'data/XGBoostModel.pkl')

    def unpickle(self):
        """Load the model from ``data/XGBoostModel.pkl``."""
        self.model = _unpickle('data/XGBoostModel.pkl')

    def score(self):
        """Print accuracy, F1 and the confusion matrix for the test split."""
        y_pred = self.model.predict(self.X_test)
        accuracy = accuracy_score(self.y_test, y_pred)
        f1 = f1_score(self.y_test, y_pred)
        print("Accuracy: %.2f%%" % (accuracy * 100.0))
        print("f1: %.2f" % f1)
        print('Confusion matrix')
        print(np.array([['TN', 'FP'], ['FN', 'TP']]))
        print(confusion_matrix(self.y_test, y_pred))

    def predict(self, X):
        """Return the model's class predictions for ``X``."""
        return self.model.predict(X)

    def predict_proba(self, X):
        """Return column 1 of the model's ``predict_proba`` output."""
        prob = self.model.predict_proba(X)
        return prob[:, 1]
def _drop_columns_matching(frame, patterns):
    """Return ``frame`` without the columns whose name matches any pattern.

    ``patterns`` are joined with ``|`` into one regex; an empty collection
    leaves ``frame`` unchanged.
    """
    pattern = '|'.join(patterns)
    if not pattern:
        return frame
    return frame[frame.columns.drop(list(frame.filter(regex=pattern)))]


def MLdecTree (learnData, picpath, endpoint="I9_STR_EXH", delCol=["I9_STR_SAH","I9_SEQULAE", "I9_STR", "IX_CIRCULATORY"], corrValue=0.995, binary=True):
    """Train an XGBoost model for one endpoint and save its diagnostics.

    Args:
        learnData: DataFrame of processed data. The target is the first
            column whose name contains both ``endpoint`` and "nevt"
            (case-insensitive).
        picpath: directory that receives ``confmatrix`` and
            ``modellbild1`` (PNG).
        endpoint: substring identifying the endpoint columns.
        delCol: regex patterns of columns to remove because they are too
            closely related to the endpoint. Never mutated here.
        corrValue: absolute Spearman correlation above which an "nevt"
            column (and its ``_AGE`` sibling) is removed.
        binary: exactly True selects the ``binary:logistic`` model, anything
            else the ``reg:tweedie`` model.

    Returns:
        ``(accuracy, model, corrDropCol)``: test accuracy, the fitted model
        and the sorted list of column names dropped for correlation.

    Raises:
        IndexError: no column contains both ``endpoint`` and "nevt".
    """
    learnColumn = learnData.columns

    # Columns that belong to the endpoint, and among them the event column.
    matching = [s for s in learnColumn if endpoint.lower() in s.lower()]
    endpointofInterest = [s for s in matching if "nevt" in s.lower()]

    # Collect every "nevt" column that is almost perfectly correlated with an
    # endpoint column, together with its "_AGE" sibling.
    corrDropCol = []
    for colName in learnColumn:
        if "nevt" not in colName.lower():
            continue
        coreName = colName.split('_NEVT')[0]
        for match in matching:
            corrCo = learnData[match].corr(learnData[colName], method='spearman')
            if (corrCo > corrValue) or (corrCo < -corrValue):
                corrDropCol.extend([colName, coreName + "_AGE"])

    # setting the y for endpoint of interest
    y = learnData[endpointofInterest[0]].copy().to_numpy()
    y = y.astype(int)

    # Drop all columns which are medically too closely related to the
    # endpoint. The helper returns the frame unchanged for an empty pattern
    # list, so an empty ``delCol`` no longer leaves learnData1 undefined.
    learnData1 = _drop_columns_matching(learnData, delCol)

    # Delete all strongly correlated columns; sorted so the returned list is
    # deterministic.
    corrDropCol = sorted(set(corrDropCol) - set(matching))
    learnData1 = _drop_columns_matching(learnData1, corrDropCol)

    # Split dependent and independent variables: the endpoint columns
    # themselves must not be used as features.
    X = _drop_columns_matching(learnData1, matching)

    # splitting data in train and test data set
    y = pd.Series(preprocessing.LabelEncoder().fit_transform(np.array(y)))
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

    # Settings shared by both model variants.
    common = dict(base_score=0.5, booster="gbtree", colsample_bylevel=1,
                  colsample_bynode=1, colsample_bytree=1, learning_rate=0.1,
                  max_delta_step=0, max_depth=6, min_child_weight=1,
                  missing=None, n_estimators=100, n_jobs=1, random_state=0,
                  reg_alpha=0, reg_lambda=1, seed=None, subsample=1,
                  verbosity=1)
    if binary is True:
        # to be modified: gamma, n_jobs (threads)
        # normal weight: scale_pos_weight = (y != 0).sum()/(y == 0).sum()
        model = XGBClassifier(gamma=0.25, objective="binary:logistic",
                              scale_pos_weight=30, **common)
        eval_metric = "auc"  # alternatives: aucpr, logloss
    else:
        model = XGBClassifier(gamma=0, objective="reg:tweedie",
                              scale_pos_weight=(y == 0).sum()/(y != 0).sum(),
                              **common)
        eval_metric = "tweedie-nloglik"  # alternative: rmse
    model.fit(X_train, y_train, verbose=True, eval_metric=eval_metric)

    # The accuracy of the model is calculated and printed
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))

    # Confusion plot (makes sense when the value is binary classified)
    plot_confusion_matrix(model, X_test, y_test, display_labels=["Have no stroke", "Have a stroke"])
    plt.savefig(picpath + '/confmatrix', format = "png")

    # The unused ``bst = model.booster()`` lookup was removed: the model is
    # built with booster="gbtree", so ``model.booster`` is that string and
    # calling it raises TypeError.

    # Visual adjustments for the rendered tree. The split-node colour had
    # only five hex digits ("#78cbe"), which is not a valid colour.
    node_params = {"shape": "box", "style": "filled, rounded", "fillcolor": "#78cbe4"}
    leaf_params = {"shape": "box", "style": "filled", "fillcolor": "#e48038"}

    # creates tree
    image = xgb.to_graphviz(model, num_trees=0, size="10,10", condition_node_params=node_params, leaf_node_params=leaf_params)
    # Set a different dpi (works only if format == 'png')
    image.graph_attr = {'dpi':'400'}
    # Saving the tree image below picpath
    image.render(picpath + '/modellbild1', format = "png")
    return accuracy, model, corrDropCol