def kfold_cv(X_train, y_train, idx, k):
    """Score an XGBClassifier on the first fold of a stratified k-fold split.

    Only the first fold is evaluated (the loop breaks after one iteration).

    Args:
        X_train: 2-D feature array, indexed as ``X_train[rows, :]``.
        y_train: label array aligned with ``X_train``.
        idx: array of row identifiers aligned with ``X_train``.
        k: number of folds.

    Returns:
        Tuple ``(ypred, yreal, idx)``: predicted positive-class probabilities,
        true labels and identifiers of the held-out fold.
    """
    # NOTE(review): uses the legacy scikit-learn API (labels passed to the
    # constructor, the splitter itself is iterable).
    kf = StratifiedKFold(y_train, n_folds=k)
    xx = []
    for train_index, test_index in kf:
        X_train_cv, X_test_cv = X_train[train_index, :], X_train[test_index, :]
        gc.collect()
        y_train_cv, y_test_cv = y_train[train_index], y_train[test_index]

        clf = XGBClassifier(max_depth=10, colsample_bytree=0.8, learning_rate=0.02,
                            n_estimators=500, nthread=-1)
        clf.fit(X_train_cv, y_train_cv, eval_metric="logloss",
                eval_set=[(X_test_cv, y_test_cv)])
        y_pred = clf.predict_proba(X_test_cv).T[1]
        print(y_pred.shape)
        xx.append(llfun(y_test_cv, y_pred))
        ypred = y_pred
        yreal = y_test_cv
        idx = idx[test_index]
        print(xx[-1])
        break  # deliberately stop after the first fold
    print("{} average: {} std {}".format(xx, np.mean(xx), np.std(xx)))
    return ypred, yreal, idx
def xgboostcv(max_depth, learning_rate, n_estimators, gamma, min_child_weight,
              max_delta_step, subsample, colsample_bytree,
              silent=True, nthread=-1, seed=1234):
    """Train an XGBClassifier on a 90/10 hold-out split and return its validation score.

    ``max_depth`` and ``n_estimators`` are cast to ``int`` before use; the
    remaining hyper-parameters are passed through unchanged. The data comes
    from the module-level ``train`` and ``train_labels``.

    Returns:
        ``auc(y_valid, y_pred)`` computed on the held-out 10%.
    """
    clf = XGBClassifier(max_depth=int(max_depth),
                        learning_rate=learning_rate,
                        n_estimators=int(n_estimators),
                        silent=silent,
                        nthread=nthread,
                        gamma=gamma,
                        min_child_weight=min_child_weight,
                        max_delta_step=max_delta_step,
                        subsample=subsample,
                        colsample_bytree=colsample_bytree,
                        seed=seed,
                        objective="binary:logistic")

    # Single hold-out split (not k-fold); the validation part drives early stopping.
    X_train, X_valid, y_train, y_valid = train_test_split(
        train, train_labels, test_size=0.1, random_state=seed)
    xgb_model = clf.fit(X_train, y_train, eval_metric="auc",
                        eval_set=[(X_valid, y_valid)], early_stopping_rounds=20)
    y_pred = xgb_model.predict_proba(X_valid)[:, 1]

    # NOTE(review): `auc` is defined outside this block. If it is
    # sklearn.metrics.auc (area under an x/y curve), this call should be
    # roc_auc_score instead; confirm the import.
    return auc(y_valid, y_pred)
def test_predict_sklearn_pickle(self):
    """Fit a GPU XGBClassifier, round-trip it through a pickle file and predict with it."""
    X, y = makeXy()
    Xtest = makeXtest()

    from xgboost import XGBClassifier
    kwargs = {}
    kwargs['tree_method'] = 'gpu_hist'
    kwargs['predictor'] = 'gpu_predictor'
    kwargs['silent'] = 0
    kwargs['objective'] = 'binary:logistic'

    model = XGBClassifier(**kwargs)
    model.fit(X, y)
    print(model)

    # pickle model
    save_obj(model, "model.pkl")

    # delete model so the prediction below can only use the reloaded copy
    del model

    # load model
    model = load_obj("model.pkl")
    os.remove("model.pkl")

    # continue as before
    print("Before model.predict")
    sys.stdout.flush()
    tmp = time.time()
    gpu_pred = model.predict(Xtest, output_margin=True)
    print(gpu_pred)
    print("E non-zeroes: %d:" % (np.count_nonzero(gpu_pred)))
    print("E GPU Time to predict = %g" % (time.time() - tmp))
def xgboost_classifier(self):
    """Cross-validate a default XGBClassifier, then fit it and report test score and elapsed time."""
    cls = XGBClassifier()
    print('xgboost cross validation score {}'.format(
        cross_val_score(cls, self.x_data, self.y_data)))
    start_time = time.time()
    cls.fit(self.x_train, self.y_train)
    print('score {}'.format(cls.score(self.x_test, self.y_test)))
    # The reported time covers both fitting and scoring.
    print('time cost {}'.format(time.time() - start_time))
def feature_selection(model, X_train, X_test, y_train, y_test, eval_metric='auc'):
    """Pick the importance threshold that maximises test ROC AUC and apply it.

    For every non-zero feature importance of the already fitted ``model``, a
    fresh XGBClassifier is trained on the features at or above that threshold
    and scored on the test set. The best selector is pickled and used to
    transform both sets.

    Args:
        model: fitted estimator exposing ``feature_importances_``.
        X_train, X_test: feature matrices.
        y_train, y_test: matching labels.
        eval_metric: passed to ``XGBClassifier.fit``.

    Returns:
        Tuple ``(X_train_trans_, X_test_trans_)`` restricted to the kept features.

    Raises:
        ValueError: if every feature importance is zero (no candidate threshold).
    """
    # Use features with > 0 importance only.
    thresholds = [thres for thres in sorted(model.feature_importances_) if thres != 0]
    roc_scores = {}
    for thresh in thresholds:
        # select features using threshold
        selection = SelectFromModel(model, threshold=thresh, prefit=True)
        select_X_train = selection.transform(X_train)

        # train model
        selection_model = XGBClassifier()
        selection_model.fit(select_X_train, y_train, eval_metric=eval_metric)

        # eval model
        # NOTE(review): scored on hard class predictions rather than
        # probabilities, which coarsens the ROC AUC; confirm this is intended.
        select_X_test = selection.transform(X_test)
        y_pred = selection_model.predict(select_X_test)
        roc_scores[selection.threshold] = roc_auc_score(y_test, y_pred)

    best_thresh = max(roc_scores, key=roc_scores.get)
    fs = SelectFromModel(model, threshold=best_thresh, prefit=True)
    # NOTE(review): the target name is an empty string in SOURCE; verify.
    pickle_model(fs, '')
    X_train_trans_ = fs.transform(X_train)
    X_test_trans_ = fs.transform(X_test)
    print('total features kept: {}'.format(X_train_trans_.shape[1]))
    return X_train_trans_, X_test_trans_
def train_model_xgb_meta(train_x, train_y, xgb_features):
    """Fit an XGBClassifier with early stopping on stratified 80/20 splits.

    Args:
        train_x: feature DataFrame.
        train_y: label Series aligned with ``train_x``.
        xgb_features: mapping with the keys ``max_depth``, ``learning_rate``,
            ``n_estimators``, ``subsample``, ``colsample_bytree`` and
            ``min_child_weight``.

    Returns:
        The classifier fitted on the last split produced by the splitter.
    """
    # NOTE(review): legacy scikit-learn API (labels in the constructor). A model
    # is trained for every split it yields, but only the last one is returned.
    train_ind = StratifiedShuffleSplit(train_y, random_state=1, test_size=0.2)

    for train_index, test_index in train_ind:
        # The splitter yields positions, so select by position (.iloc); the
        # former .ix lookup was label-based for integer indexes.
        x_train = train_x.iloc[train_index, :]
        y_train = train_y.iloc[train_index]
        x_eval = train_x.iloc[test_index, :]
        y_eval = train_y.iloc[test_index]

        # Classifier
        xgb = XGBClassifier(max_depth=xgb_features['max_depth'],
                            learning_rate=xgb_features['learning_rate'],
                            n_estimators=int(xgb_features['n_estimators']),
                            objective='binary:logistic',
                            subsample=xgb_features['subsample'],
                            colsample_bytree=xgb_features['colsample_bytree'],
                            min_child_weight=xgb_features['min_child_weight'])
        # gives 0.458
        xgb = xgb.fit(x_train, y_train, verbose=True, eval_metric='logloss',
                      eval_set=[(x_eval, y_eval)], early_stopping_rounds=10)

    return xgb
def cv(X_train, y_train, features_inner):
    """Print F1, precision and recall of a default XGBClassifier over 5 shuffled stratified folds.

    Args:
        X_train: feature DataFrame.
        y_train: label Series/DataFrame; its values are re-wrapped under the
            column name ``"tred_cutoff"``.
        features_inner: accepted for interface compatibility; not used here.
    """
    kfold = StratifiedKFold(n_splits=5, shuffle=True)
    scores_f = []
    scores_p = []
    scores_r = []
    for train, test in kfold.split(X_train, y_train):
        model = XGBClassifier()
        # Rebuild frames from raw values so each fold gets a fresh 0..n-1 index.
        X_train_cv = pd.DataFrame(X_train.values[train], columns=X_train.columns)
        y_train_cv = pd.DataFrame(y_train.values[train], columns=["tred_cutoff"])
        X_test_cv = pd.DataFrame(X_train.values[test], columns=X_train.columns)
        y_test_cv = pd.DataFrame(y_train.values[test], columns=["tred_cutoff"])

        model.fit(X_train_cv, y_train_cv)
        y_pred = model.predict(X_test_cv)

        s_f = f1_score(y_test_cv, y_pred)
        s_p = precision_score(y_test_cv, y_pred)
        s_r = recall_score(y_test_cv, y_pred)
        print("\tscores f1", (s_f))
        print("\tscores p", (s_p))
        print("\tscores r", (s_r))
        scores_f.append(s_f)
        scores_p.append(s_p)
        scores_r.append(s_r)

    print("mean scores f1", np.mean(scores_f))
    print("mean scores p", np.mean(scores_p))
    print("mean scores r", np.mean(scores_r))
def xgboostcv(max_depth, learning_rate, n_estimators, subsample, colsample_bytree,
              gamma, min_child_weight, silent=True, nthread=-1, seed=1234):
    """Fit an XGBClassifier with early stopping and return the negated validation log-loss.

    ``max_depth`` and ``n_estimators`` are cast to ``int``. Training and
    validation data come from module-level variables.

    Returns:
        ``-log_loss(y1, proba)``, so that larger is better for a maximiser.
    """
    clf = XGBClassifier(max_depth=int(max_depth),
                        learning_rate=learning_rate,
                        n_estimators=int(n_estimators),
                        silent=silent,
                        nthread=nthread,
                        subsample=subsample,
                        colsample_bytree=colsample_bytree,
                        gamma=gamma,
                        min_child_weight=min_child_weight,
                        seed=seed,
                        objective="binary:logistic")

    # NOTE(review): the training-feature name was lost in SOURCE; `x0` is
    # assumed by analogy with y0 / x1 / y1. Confirm against the module globals.
    clf.fit(x0, y0, eval_metric="logloss", eval_set=[(x1, y1)],
            early_stopping_rounds=25)

    ll = -log_loss(y1, clf.predict_proba(x1))
    return ll
def XGB_model(train, y):
    """Fit an XGBClassifier over 5 K-fold training subsets and predict the module-level ``test`` set.

    Args:
        train: feature DataFrame (indexed positionally with ``.iloc``).
        y: label Series aligned with ``train``.

    Returns:
        Predictions of the final fitted model for ``test``.
    """
    model = XGBClassifier(n_estimators=150, learning_rate=0.01)
    # Legacy scikit-learn module; kept as a local import like the original.
    from sklearn import cross_validation
    cv = cross_validation.KFold(len(train), n_folds=5, random_state=7)
    # NOTE(review): every fit() call retrains the same estimator, so presumably
    # only the last fold's training subset shapes the returned predictions.
    for traincv, testcv in cv:
        model.fit(train.iloc[traincv], y.iloc[traincv])
    y_XGB = model.predict(test)
    return y_XGB
def _encode_click_columns(frame):
    """Encode the countrycode, browserid and devid columns of *frame* in place."""
    frame['countrycode'] = frame['countrycode'].apply(lambda x: ord(x))
    # Missing browser/device ids are mapped to a fixed placeholder before encoding.
    frame['browserid'] = frame['browserid'].apply(
        lambda x: myfunc(x) if np.all(pd.notnull(x)) else myfunc("unknown"))
    frame['devid'] = frame['devid'].apply(
        lambda x: myfunc(x) if np.all(pd.notnull(x)) else myfunc("none"))


def main():
    """Train an XGBClassifier on the click data and write test-set probabilities to CSV."""
    # Set seed for reproducibility
    np.random.seed(0)

    print("Loading data...")
    # Load the data from the CSV files
    training_data = pd.read_csv('/home/vipin/Videos/train.csv', header=0)
    prediction_data = pd.read_csv('/home/vipin/Videos/test.csv', header=0)

    _encode_click_columns(training_data)
    _encode_click_columns(prediction_data)

    features = ['siteid', 'offerid', 'category', 'merchant', 'countrycode', 'browserid', 'devid']
    target = "click"
    X = training_data[features]
    x_prediction = prediction_data[features]
    Y = training_data[target]
    ids = prediction_data["ID"]

    model = XGBClassifier()

    print("Training...")
    # The model is trained on the full training data.
    model.fit(X, Y)

    print("Predicting...")
    seed = 7
    test_size = 0.33
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=seed)

    y_prediction = model.predict_proba(x_prediction)
    results = y_prediction[:, 1]
    results_df = pd.DataFrame(data={'probability': results})
    joined = pd.DataFrame(ids).join(results_df)

    # NOTE(review): X_test is a subset of the rows the model was fitted on, so
    # this accuracy is a training-set figure, not a held-out estimate.
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))

    print("Writing predictions to predictions.csv")
    # Save the predictions out to a CSV file
    joined.to_csv("/home/vipin/Videos/predictions.csv", index=False)
def test_xgboost():
    """Ensure that the TPOT xgboost method outputs the same as the xgboost classifier method"""
    tpot_obj = TPOT()
    result = tpot_obj._xgradient_boosting(training_testing_data, n_estimators=100,
                                          learning_rate=0, max_depth=3)
    result = result[result['group'] == 'testing']

    # NOTE(review): TPOT is called with learning_rate=0 while the reference uses
    # 0.0001; presumably TPOT clamps the rate to that minimum. Confirm.
    xgb = XGBClassifier(n_estimators=100, learning_rate=0.0001, max_depth=3, seed=42)
    xgb.fit(training_features, training_classes)

    assert np.array_equal(result['guess'].values, xgb.predict(testing_features))
def update_model(current_year):
    """Rebuild the matchup model: load data, select features, grid-search XGBoost, pickle the result.

    Pickles the fitted scaler as ``'standard.scaler'`` and the best estimator
    as ``'fantasy.predict'``; feature selection pickles its selector as well.

    Args:
        current_year: passed to ``unique_managers`` to obtain the managers to keep.
    """
    # Local import so the timestamp works whatever form of `datetime` the
    # module imports.
    from datetime import datetime

    print('Creating model...\nDate: {}'.format(datetime.now().strftime('%Y-%m-%d_%H:%M:%S')))
    managers = tuple(unique_managers(current_year))
    # Each game is listed twice, once from each manager's point of view.
    sql = ("select * from (select week, year, manager1_name, manager2_name, team1_points, "
           "team1_projected, team2_points, team2_projected, type "
           "from scoreboard_all WHERE team1_points > 0 and week<=13 "
           "UNION select week, year, manager2_name AS manager1_name, manager1_name as manager2_name, "
           "team2_points AS team1_points, "
           "team2_projected AS team1_projected, team1_points as team2_points, "
           "team1_projected AS team2_projected, type FROM scoreboard_all "
           "where team1_points>0 and week<=13) order by year, week, type;")
    ff1 = download_data(os.path.join(os.getcwd(), 'data/fantasy_football.db'), sql)

    data_features = custom_features(ff1)
    data_features = data_features[(data_features.manager1_name.isin(managers)) &
                                  (data_features.manager2_name.isin(managers))]
    X, y, managers, league_type = dummy_and_interaction(data_features)

    sc = StandardScaler()
    X_std = sc.fit_transform(X)
    pickle_model(sc, 'standard.scaler')

    # Select best features
    X_train, X_test, y_train, y_test = train_test_split(
        X_std, y, test_size=0.25, random_state=None)
    model = XGBClassifier()
    model.fit(X_train, y_train)
    X_train_trans, X_test_trans = feature_selection(
        model, X_train, X_test, y_train, y_test, eval_metric='auc')

    # Select best params
    model = XGBClassifier()
    learning_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
    n_estimators = [50, 100, 150, 200, 250, 300]
    param_grid = dict(n_estimators=n_estimators, learning_rate=learning_rate)
    grid_search = GridSearchCV(model, param_grid, scoring="log_loss", cv=10, verbose=1)
    result = grid_search.fit(X_train_trans, y_train)

    print("Best: {0} using {1}".format(result.best_score_, result.best_params_))
    print('Best params:  {}'.format(result.best_params_))
    best_est = result.best_estimator_

    validation = best_est.predict_proba(X_train_trans)
    print("Roc AUC Train:  {}".format(
        roc_auc_score(y_train, validation[:, 1], average='macro')))

    probs = best_est.predict_proba(X_test_trans)
    print("Roc AUC Validation:  {}".format(
        roc_auc_score(y_test, probs[:, 1], average='macro')))

    pickle_model(best_est, 'fantasy.predict')
def train(imgfile='img/segmentation', modelfile='segmentation.pkl'):
    """Train a multi-class XGBoost character classifier and pickle it.

    Args:
        imgfile: directory passed to ``getFiles`` to list the labelled images.
        modelfile: path the fitted model is pickled to.
    """
    import pickle
    from itertools import chain

    from xgboost import XGBClassifier

    filelabel = getFiles(imgfile)
    row = 120
    col = 40

    # One `imgmap` (a list of samples) per image; images without one are skipped.
    imgmaps = [Img(x[1], row, col, x[0]).imgmap for x in filelabel]
    # Flatten in linear time and drop samples whose first field is missing.
    data = [sample
            for sample in chain.from_iterable(m for m in imgmaps if m is not None)
            if sample[0] is not None]

    # Lists rather than lazy map/filter objects, so this also works on Python 3.
    label = np.array([CHARACTER.get(sample[0]) for sample in data])
    feature = np.array([np.array(sample[1]) for sample in data])

    xgb = XGBClassifier(objective='multi:softmax', reg_alpha=1.0, reg_lambda=0.0,
                        subsample=0.7, n_estimators=100, learning_rate=0.3)
    # The evaluation set is the training data itself, so the reported mlogloss
    # is a training loss.
    model = xgb.fit(feature, label, eval_set=[(feature, label)], eval_metric='mlogloss')

    # Pickle data is binary: text mode fails on Python 3.
    with open(modelfile, 'wb') as f:
        pickle.dump(model, f)
def runner():
    """Cross-validate on the model's frame, then fit on all of it and print held-out test scores."""
    m = Model()
    X = m.df.drop("tred_cutoff", axis=1)
    Y = m.df["tred_cutoff"]
    features_inner = m.features + m.features_2

    cv(X, Y, features_inner)

    # Final model: fitted on the full frame, evaluated on m.X_test / m.y_test.
    model = XGBClassifier()
    model.fit(X, Y)
    y_pred = model.predict(m.X_test)

    s_f = f1_score(m.y_test, y_pred)
    s_p = precision_score(m.y_test, y_pred)
    s_r = recall_score(m.y_test, y_pred)
    print("test f1", s_f)
    print("test precision", s_p)
    print("test recall", s_r)
def main():
    """Compare a decision tree and an XGBoost classifier on the Titanic data set."""
    titanic = pandas.read_csv('dataset/titanic.csv')
    # Copy the column slice so the in-place fillna below modifies an
    # independent frame instead of a view of `titanic`.
    x_set = titanic[['pclass', 'age', 'sex']].copy()
    y_set = titanic['survived']
    # Every missing value (in any of the three columns) becomes the mean age.
    x_set.fillna(x_set['age'].mean(), inplace=True)

    x_train, x_test, y_train, y_test = utils.prepare_train_and_test_sets(x_set, y_set)

    # 'records' is the documented orient; the abbreviation 'record' is rejected
    # by current pandas.
    dict_vectorizer = DictVectorizer(sparse=False)
    x_train = dict_vectorizer.fit_transform(x_train.to_dict(orient='records'))
    x_test = dict_vectorizer.transform(x_test.to_dict(orient='records'))

    decision_tree_classifier = DecisionTreeClassifier()
    utils.get_trained_result(decision_tree_classifier, x_test, x_train, y_test, y_train)

    xgb_classifier = XGBClassifier()
    xgb_classifier.fit(x_train, y_train)
    utils.get_trained_result(xgb_classifier, x_test, x_train, y_test, y_train)
def __make_sklearn_model(self):
    """Instantiate ``SGBClassifier`` from the stored parameters and keep it as the model."""
    settings = [self.__parameters[key]
                for key in ('estimators', 'learning_rate', 'max_depth', 'max_leaf_nodes')]
    n_trees, rate, tree_depth, leaf_nodes = settings
    # random_state is pinned to 0.
    self.__model = SGBClassifier(
        n_estimators=n_trees,
        learning_rate=rate,
        max_depth=tree_depth,
        max_leaf_nodes=leaf_nodes,
        random_state=0,
    )
def trainXGB(data_subset):
    """Train and evaluate XGBoost on one data subset for every configured parameter set.

    Reads the module-level ``data`` and ``params`` and writes a record per run
    to the module-level file handle ``f``: train time, predict time, accuracy,
    precision, recall, f-score, support.

    Args:
        data_subset: key into ``data``; ``'binary'`` selects the binary task.
    """
    f.write('\nTraining XGB:' + '\n')
    X_train = data[data_subset]['X_train']
    X_test = data[data_subset]['X_test']
    y_train = data[data_subset]['y_train']
    y_test = data[data_subset]['y_test']

    for p in params['xgboost']:
        # The objective must match the task type of the subset.
        if data_subset != 'binary' and p['objective'] == 'binary:logistic':
            print("Skip using non-binary data with XGB binary:logistic objective")
            continue
        if data_subset == 'binary' and p['objective'] != 'binary:logistic':
            print("Skip using binary data with XGB multi:* objective")
            continue

        header = "@ subset: {0}, params: {1}".format(data_subset, p)
        f.write('\n' + header + '\n')

        objective = p['objective']
        max_depth = p['max_depth']
        try:
            n_estimators = p['n_estimators']
        except KeyError:
            n_estimators = 100

        model = XGBClassifier(objective=objective, max_depth=max_depth,
                              n_estimators=n_estimators)

        start = time.time()
        model.fit(X_train, y_train)
        elapsed_train = time.time() - start

        # Restart the clock so the prediction time excludes training.
        start = time.time()
        y_pred = model.predict(X_test).astype(int)
        elapsed_predict = time.time() - start

        accuracy = accuracy_score(y_test, y_pred)
        precision, recall, fscore, support = precision_recall_fscore_support(
            y_test, y_pred, pos_label=2, average='weighted')

        print("\n{5}\nXGB with {0} objective, {6} max_depth, {7} n_estimators on data subset {1} trained in {2} seconds and predicted in {3} seconds with an accuracy of {4}\n".format(objective, data_subset, elapsed_train, elapsed_predict, accuracy, header, max_depth, n_estimators))

        # Every field is separated by ', ' (predict time and accuracy used to
        # be written fused together).
        f.write(str(elapsed_train) + ', ' + str(elapsed_predict) + ', ' + str(accuracy) + ', ' + str(precision) + ', ' + str(recall) + ', ' + str(fscore) + ', ' + str(support))
def train_model_xgb(train_x, train_y, xgb_features):
    """Fit an XGBClassifier with early stopping on stratified 90/10 splits.

    Args:
        train_x: feature DataFrame.
        train_y: label Series aligned with ``train_x``.
        xgb_features: mapping with the keys ``max_depth``, ``learning_rate``,
            ``n_estimators``, ``subsample``, ``colsample_bytree`` and
            ``min_child_weight``.

    Returns:
        Tuple ``(xgb, predictions)``: the classifier fitted on the last split
        and a Series named ``'PredictedProb'`` with its positive-class
        probabilities for that split's training rows.
    """
    # NOTE(review): legacy scikit-learn API (labels in the constructor). A model
    # is trained for every split it yields, but only the last one is returned.
    train_ind = StratifiedShuffleSplit(train_y, random_state=1, test_size=0.1)

    for train_index, test_index in train_ind:
        # The splitter yields positions, so select by position (.iloc); the
        # former .ix lookup was label-based for integer indexes.
        x_train = train_x.iloc[train_index, :]
        y_train = train_y.iloc[train_index]
        x_eval = train_x.iloc[test_index, :]
        y_eval = train_y.iloc[test_index]

        # Classifier
        xgb = XGBClassifier(max_depth=xgb_features['max_depth'],
                            learning_rate=xgb_features['learning_rate'],
                            n_estimators=int(xgb_features['n_estimators']),
                            objective='binary:logistic',
                            subsample=xgb_features['subsample'],
                            colsample_bytree=xgb_features['colsample_bytree'],
                            min_child_weight=xgb_features['min_child_weight'])
        # gives 0.458
        xgb = xgb.fit(x_train, y_train, verbose=True, eval_metric='logloss',
                      eval_set=[(x_eval, y_eval)], early_stopping_rounds=10)

    # best_iteration is a zero-based index while ntree_limit is a tree count, so
    # add one; passing the index dropped the best tree, and 0 meant "all trees".
    predictions = pd.Series(
        xgb.predict_proba(x_train, ntree_limit=xgb.best_iteration + 1)[:, 1],
        name='PredictedProb')

    return xgb, predictions
def test_predict_sklearn_pickle(self):
    """A pickled GPU classifier must give matching margins with the GPU and CPU predictors."""
    x, y = build_dataset()

    kwargs = {'tree_method': 'gpu_hist',
              'predictor': 'gpu_predictor',
              'verbosity': 2,
              'objective': 'binary:logistic',
              'n_estimators': 10}

    model = XGBClassifier(**kwargs)
    model.fit(x, y)

    save_pickle(model, "model.pkl")
    del model

    # load model
    model: xgb.XGBClassifier = load_pickle("model.pkl")
    os.remove("model.pkl")

    gpu_pred = model.predict(x, output_margin=True)

    # Switch to CPU predictor
    bst = model.get_booster()
    bst.set_param({'predictor': 'cpu_predictor'})
    cpu_pred = model.predict(x, output_margin=True)
    np.testing.assert_allclose(cpu_pred, gpu_pred, rtol=1e-5)
def get_xgb_feature_importance_plot(best_param_, experiment_, png_folder, png_fname, score_threshold=0.8):
    """Fit XGBoost with the given parameters and save a bar plot of its feature f-scores.

    Args:
        best_param_: estimator parameters; a ``'model_type'`` entry is ignored.
            The mapping itself is left unmodified.
        experiment_: object whose ``get_train_data()`` returns ``(train_X, train_y)``.
        png_folder: accepted for interface compatibility; not used here.
        png_fname: file name of the plot inside ``<data.path>/graph``.
        score_threshold: with more than 20 scored features, only those at or
            above this quantile of the positive scores are plotted.

    Returns:
        ``True`` once the figure is written.
    """
    # 1. fit the classifier on the experiment's training data
    train_X, train_y = experiment_.get_train_data()
    clf = XGBClassifier()
    # Work on a copy so the caller's dict keeps its 'model_type' entry.
    params = dict(best_param_)
    params.pop('model_type', None)
    clf.set_params(**params)
    clf.fit(train_X, train_y)

    # Newer xgboost exposes get_booster(); older releases only have booster().
    booster = clf.get_booster() if hasattr(clf, 'get_booster') else clf.booster()
    index2feature = booster.get_fscore()

    fis = pd.DataFrame({'name': list(index2feature.keys()),
                        'score': list(index2feature.values())})
    fis = fis.sort_values('score', ascending=False)
    if len(fis.index) > 20:
        # Turn the quantile fraction into an absolute score cut-off.
        score_threshold = fis['score'][fis['score'] > 0.0].quantile(score_threshold)
        where_str = 'score >= %f' % (score_threshold)
        fis = fis.query(where_str)

    # 2. plot the feature importance
    sns.barplot(x='score', y='name', data=fis, color="blue")
    plt.ylabel("Feature", fontsize=10)
    plt.xlabel("Feature_Importance : f-Score", fontsize=10)

    png_fname = os.path.join(Config.get_string('data.path'), 'graph', png_fname)
    plt.tight_layout()
    plt.savefig(png_fname)
    plt.close()
    return True
def __make_xgboost_model(self):
    """Instantiate ``XGBClassifier`` from the stored parameters and keep it as the model."""
    settings = [self.__parameters[key]
                for key in ('estimators', 'learning_rate', 'max_depth', 'max_leaf_nodes')]
    n_trees, rate, tree_depth, leaf_nodes = settings
    # Thread count, gamma and the two sampling ratios are fixed here; the rest
    # comes from the stored parameters.
    self.__model = XGBClassifier(
        nthread=4,
        learning_rate=rate,
        n_estimators=n_trees,
        max_depth=tree_depth,
        gamma=0,
        subsample=0.9,
        max_leaf_nodes=leaf_nodes,
        colsample_bytree=0.5,
    )
def get_thresh(model, train, test, label_test, label_train):
    """Return the feature-importance threshold that gives the best test ROC AUC.

    For each feature importance of the fitted ``model`` (ascending), a fresh
    XGBClassifier is trained on the features selected at that threshold and
    scored on ``test``.

    Args:
        model: fitted ``XGBClassifier``.
        train, test: feature matrices; ``test`` must not be longer than ``train``.
        label_test, label_train: single-column labels for the two sets.

    Returns:
        The first threshold reaching the highest score, or ``0`` if no score
        exceeds zero.

    Raises:
        TypeError: on inconsistent sizes, a non-XGBClassifier model, or
            multi-column labels.
    """
    if (len(test) > len(train)) or (len(label_test) > len(label_train)):
        raise TypeError('Invalid train and test size')
    if not isinstance(model, XGBClassifier):
        raise TypeError('Invalid model passed')
    if (pd.DataFrame(label_train).shape[1] > 1) or (pd.DataFrame(label_test).shape[1] > 1):
        raise TypeError('Multiple columns in label, Invalid shape.')

    max_score = 0
    thrsh = 0
    thresholds = np.sort(model.feature_importances_)
    for thresh in thresholds:
        selection = feature_selection.SelectFromModel(model, threshold=thresh, prefit=True)
        select_X_train = selection.transform(train)

        selection_model = XGBClassifier()
        selection_model.fit(select_X_train, label_train)

        select_X_test = selection.transform(test)
        y_pred = selection_model.predict(select_X_test)
        scr = metrics.roc_auc_score(label_test, y_pred)
        # Strict comparison keeps the smallest threshold among ties.
        if scr > max_score:
            max_score = scr
            thrsh = thresh
    return thrsh
def test_on_data(X, y):
    """Print accuracy of XGBoost and scikit-learn gradient boosting on a 50/50 split of (X, y)."""
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, train_size=0.5, random_state=2333)
    print("train set: {}, test set: {}".format(len(x_train), len(x_test)))

    cls = XGBClassifier()
    cls.fit(x_train, y_train)
    # on test
    pred = cls.predict(x_test)
    print("xgb accuracy score test {}".format(accuracy_score(y_test, pred)))
    # on all (includes the training half)
    pred = cls.predict(X)
    print("xgb accuracy score all {}".format(accuracy_score(y, pred)))

    # compare to gbrt in sklearn
    cls = GradientBoostingClassifier()
    cls.fit(x_train, y_train)
    # on test
    pred = cls.predict(x_test)
    print("sklearn accuracy score test {}".format(accuracy_score(y_test, pred)))
    # on all (includes the training half)
    pred = cls.predict(X)
    print("sklearn accuracy score all {}".format(accuracy_score(y, pred)))
def __init__(self):
    """Set up an empty state: no data loaded, freshly constructed model objects."""
    # Seed drawn once per instance from 1..9.
    self._seed = randint(1, 9)

    # Data placeholders, populated later.
    self._csvfile = ""
    self._titles = self._dataset = None
    self._X = self._y = None
    self._X_original = self._y_original = None
    self._dataset_original = None

    # Model objects.
    self._model = Sequential()
    self._sc = StandardScaler()
    self._vnum = 0  # Number of variables
    self._classifier = XGBClassifier()

    # Run settings and results.
    self._epochs = 10
    self._samplesize = 0
    self._clusters = None
import numpy as np import pandas as pd import operator from sklearn.linear_model import LogisticRegressionCV, LogisticRegression from sklearn.model_selection import cross_val_predict, cross_val_score, KFold from sklearn.pipeline import make_pipeline from sklearn.preprocessing import Imputer, StandardScaler, MinMaxScaler, PolynomialFeatures, OneHotEncoder from sklearn.metrics import log_loss, roc_auc_score, f1_score from xgboost import XGBClassifier from scipy.sparse import csr_matrix from sklearn.decomposition import TruncatedSVD from data_utils import * models = { "linear": make_pipeline(StandardScaler(), LogisticRegression()), "xgb": XGBClassifier(n_estimators=16, learning_rate=0.5, max_depth=5) } def train_none_models(): X = generate_none_features("train") y = generate_none_targets().loc[X.index] # make sure ordering is correct for name, model in models.items(): with open("../models/none_model_{}.pkl".format(name), "wb") as f: pkl.dump(, y), f) def predict_none_models(): X = generate_none_features("test") preds = {} for name, model in models.items():
def __init__(self):
    """Create a default XGBClassifier and start the progress counter at zero."""
    self.model, self.progress = XGBClassifier(), 0
def get_model(model_or_name, threads=-1, classification=False):
    """Resolve a model name to an estimator instance, or label an existing one.

    Args:
        model_or_name: a registry key (case-insensitive string) or an
            already-built model object.
        threads: thread/job count forwarded to estimators that accept one.
        classification: choose the classification registry instead of the
            regression one when a name is given.

    Returns:
        Tuple ``(model, name)``. For a model object, ``name`` is the first run
        of word characters in ``str(model)``.

    Raises:
        Exception: if a string is given that is not a key of the chosen registry.
    """
    # Local import keeps this block independent of the module's import list.
    import re

    if not isinstance(model_or_name, str):
        model = model_or_name
        name = re.search(r"\w+", str(model)).group(0)
        return model, name

    # Registries hold factories, so only the requested estimator is built.
    regression_models = {
        'xgboost': (lambda: XGBRegressor(max_depth=6, nthread=threads), 'XGBRegressor'),
        'randomforest': (lambda: RandomForestRegressor(n_estimators=100, n_jobs=threads), 'RandomForestRegressor'),
        'adaboost': (lambda: AdaBoostRegressor(), 'AdaBoostRegressor'),
        'linear': (lambda: LinearRegression(), 'LinearRegression'),
        'elasticnet': (lambda: ElasticNetCV(positive=True), 'ElasticNetCV'),
        'lasso': (lambda: LassoCV(positive=True), 'LassoCV'),
        'ridge': (lambda: Ridge(), 'Ridge'),
        'xgb.1k': (lambda: XGBRegressor(max_depth=6, n_estimators=1000, nthread=threads), 'XGBRegressor.1K'),
        'xgb.10k': (lambda: XGBRegressor(max_depth=6, n_estimators=10000, nthread=threads), 'XGBRegressor.10K'),
        'rf.1k': (lambda: RandomForestRegressor(n_estimators=1000, n_jobs=threads), 'RandomForestRegressor.1K'),
        'rf.10k': (lambda: RandomForestRegressor(n_estimators=10000, n_jobs=threads), 'RandomForestRegressor.10K'),
    }

    classification_models = {
        'xgboost': (lambda: XGBClassifier(nthread=threads), 'XGBClassifier'),
        'randomforest': (lambda: RandomForestClassifier(n_estimators=100, n_jobs=threads), 'RandomForestClassifier'),
        'adaboost': (lambda: AdaBoostClassifier(), 'AdaBoostClassifier'),
        'logistic': (lambda: LogisticRegression(), 'LogisticRegression'),
        'gaussian': (lambda: GaussianProcessClassifier(), 'GaussianProcessClassifier'),
        'knn': (lambda: KNeighborsClassifier(), 'KNeighborsClassifier'),
        'bayes': (lambda: GaussianNB(), 'GaussianNB'),
        'svm': (lambda: SVC(), 'SVC'),
        'xgb.1k': (lambda: XGBClassifier(n_estimators=1000, nthread=threads), 'XGBClassifier.1K'),
        'rf.1k': (lambda: RandomForestClassifier(n_estimators=1000, n_jobs=threads), 'RandomForestClassifier.1K'),
        'xgb.10k': (lambda: XGBClassifier(n_estimators=10000, nthread=threads), 'XGBClassifier.10K'),
        'rf.10k': (lambda: RandomForestClassifier(n_estimators=10000, n_jobs=threads), 'RandomForestClassifier.10K'),
    }

    # TODO: integrate neural network models into this framework

    registry = classification_models if classification else regression_models
    entry = registry.get(model_or_name.lower())
    if not entry:
        raise Exception("unrecognized model: '{}'".format(model_or_name))

    factory, name = entry
    return factory(), name
# Load the data; the path and separator placeholders must be filled in before running.
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.6774117647058825
# Three stacked estimators feed a final ExtraTreesClassifier.
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=ExtraTreesClassifier(bootstrap=False, criterion="gini", max_features=0.5, min_samples_leaf=7, min_samples_split=18, n_estimators=100)),
    StackingEstimator(estimator=BernoulliNB(alpha=1.0, fit_prior=True)),
    StackingEstimator(estimator=XGBClassifier(learning_rate=0.01, max_depth=9, min_child_weight=20, n_estimators=100, nthread=1, subsample=0.6000000000000001)),
    ExtraTreesClassifier(bootstrap=True, criterion="gini", max_features=0.7500000000000001, min_samples_leaf=5, min_samples_split=10, n_estimators=100)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
print(ada_score)
print(ada_cm)
print(ada_cr)


# --------------
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Parameter list
parameters = {
    'learning_rate': [0.1, 0.15, 0.2, 0.25, 0.3],
    'max_depth': range(1, 3)
}

# Code starts here

# Baseline: XGBoost with default hyper-parameters.
xgb_model = XGBClassifier(random_state=0)
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
xgb_score = accuracy_score(y_test, y_pred)
xgb_cm = confusion_matrix(y_test, y_pred)
xgb_cr = classification_report(y_test, y_pred)
print(xgb_score)
print(xgb_cm)
print(xgb_cr)

# Grid search over the parameter list, starting from the same estimator.
clf_model = GridSearchCV(estimator=xgb_model, param_grid=parameters)
clf_model.fit(X_train, y_train)
y_pred = clf_model.predict(X_test)
clf_score = accuracy_score(y_test, y_pred)
clf_cm = confusion_matrix(y_test, y_pred)
clf_cr = classification_report(y_test, y_pred)
print(clf_score)
# Numeric columns of the first frame, side by side with the second frame.
d_train_X_2 = pd.concat([d_train_X_0[numeric_features], d_train_X_1], axis=1)

# Columns to discard: the loan-sum aggregates plus every 'var_out' feature.
var_drop_1 = [
    'm1_loan_sum',
    'm3_loan_sum',
    'm6_loan_sum',
    'm12_loan_sum',
    'm18_loan_sum',
]
var_drop_tz = [i for i in numeric_features if 'var_out' in i]
var_drop = var_drop_1 + var_drop_tz
d_train_X_3 = d_train_X_2.drop(var_drop, axis=1)

# XGBoost scikit-learn interface
from xgboost import XGBClassifier

# Tune learning_rate and n_estimators first, then min_child_weight,
# colsample_bytree and subsample.
xgc = XGBClassifier(max_depth=2, objective='binary:logistic')
model_params = {
    'learning_rate': [0.05, 0.02],
    'n_estimators': [300],
    'colsample_bytree': [0.7],
    'min_child_weight': [5],
    'subsample': [0.7],
}
gs = GridSearchCV(
    estimator=xgc,
    param_grid=model_params,
    n_jobs=4,
    cv=5,
    verbose=1,
    scoring=ks.ks_scorer,
)
y = dataset['legitimate'].values

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# Feature Scaling: the scaler is fitted on the training set only.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting xgboost to the training set
classifier = XGBClassifier(max_depth=20, learning_rate=0.3, n_estimators=150, verbose=10)
classifier.fit(X_train, y_train)

# Predict the test results
y_pred = classifier.predict(X_test)

# Making the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Applying K-Fold cross validation
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
print(accuracies.mean())
print(accuracies.std())
# Report the previous search.
print('=============================================')
print('=============================================')
print(gsearch1.best_params_, gsearch1.best_score_)
print('=============================================')
print('=============================================')
print('=============================================')

param_test1 = {
    'max_depth': [3, 4, 5, 6, 7],
    'min_child_weight': [3, 5, 7],
    'gamma': [i / 10.0 for i in range(0, 5, 2)],
    'subsample': [i / 10.0 for i in range(5, 10, 2)],
    'colsample_bytree': [i / 10.0 for i in range(5, 10, 2)],
    'objective': ['binary:logistic']
}

model = XGBClassifier()
gsearch1 = GridSearchCV(estimator=model, param_grid=param_test1, scoring='accuracy',
                        n_jobs=-1, cv=5, verbose=1)
# NOTE(review): the feature-matrix name was lost in SOURCE; `train` is an
# assumption. Confirm it against the variable that pairs with `target`.
gsearch1.fit(train, target)

print('=============================================')
print('=============================================')
print('=============================================')
print(gsearch1.best_params_, gsearch1.best_score_)
print('=============================================')
print('=============================================')
def _bdt_accuracy_and_auc(model, features, labels):
    """Return ``(accuracy in percent, ROC AUC)`` of *model* on the given sample."""
    y_prob = model.predict_proba(features)
    y_pred = model.predict(features)
    prediction = [round(value) for value in y_pred]
    auc = roc_auc_score(labels, y_prob[:, 1])
    accuracy = accuracy_score(labels, prediction)
    return accuracy * 100, auc


def cv_BDT(input, output, params, show, channel, selection, names):
    """Run 5-fold stratified cross-validation of an XGBoost BDT and optionally plot the last fold.

    Args:
        input: feature array, indexed with fold index arrays.
        output: label array aligned with ``input``.
        params: keyword arguments for ``XGBClassifier``.
        show: when true, draw separation, ROC and importance plots for the
            model of the final fold.
        channel, selection: used to build the plot name.
        names: feature names assigned to the booster before plotting importance.
    """
    cvscores = []
    AUC = []
    cvscores_train = []
    AUC_train = []

    # Keywords instead of positional arguments: recent scikit-learn only
    # accepts shuffle and random_state by name.
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=3456)
    for train, test in kfold.split(input, output):
        model = XGBClassifier(**params)
        X_train, X_test, y_train, y_test = (
            input[train],
            input[test],
            output[train],
            output[test],
        )
        model.fit(X_train, y_train)

        accuracy_pct, auc = _bdt_accuracy_and_auc(model, X_test, y_test)
        print("Accuracy: %.2f%%; AUC = %.4f%%" % (accuracy_pct, auc))
        cvscores.append(accuracy_pct)
        AUC.append(auc)

        # Same metrics on the training part, to expose over-training.
        accuracy_pct, auc = _bdt_accuracy_and_auc(model, X_train, y_train)
        print("Accuracy train: %.2f%%; AUC = %.4f%%" % (accuracy_pct, auc))
        cvscores_train.append(accuracy_pct)
        AUC_train.append(auc)

    print("Accuracy test = %.2f%% (+/- %.2f%%); AUC = %.4f (+/- %.4f)" %
          (np.mean(cvscores), np.std(cvscores), np.mean(AUC), np.std(AUC)))
    print("Accuracy train = %.2f%% (+/- %.2f%%); AUC = %.4f (+/- %.4f)" % (
        np.mean(cvscores_train),
        np.std(cvscores_train),
        np.mean(AUC_train),
        np.std(AUC_train),
    ))

    if show:
        # model, X_test and y_test are those of the last fold.
        name = "channel_" + str(channel) + "_BDT"
        name = "%s_%s" % (name, selection)
        modelname = "models/%s.h5" % name
        # NOTE(review): the path is only printed; nothing is saved in this block.
        print("Save to %s" % modelname)
        plotter.plot_separation(model, X_test, y_test, name, False)
        plotter.plot_ROC(model, X_test, y_test, name, False)
        model.get_booster().feature_names = names
        mp.rc("figure", figsize=(5, 5))
        plot_importance(model.get_booster(), max_num_features=15, importance_type="gain")
        plt.subplots_adjust(left=0.3)
# Divide the data set into a training and testing sets, each time with a different RNG seed training_indices, testing_indices = next(iter(StratifiedShuffleSplit(input_data['class'].values, n_iter=1, train_size=0.75, test_size=0.25, random_state=dataset_repeat))) training_features = input_data.loc[training_indices].drop('class', axis=1).values training_classes = input_data.loc[training_indices, 'class'].values testing_features = input_data.loc[testing_indices].drop('class', axis=1).values testing_classes = input_data.loc[testing_indices, 'class'].values # Create and fit the model on the training data try: clf = XGBClassifier(learning_rate=learning_rate, n_estimators=n_estimators, max_depth=max_depth), training_classes) testing_score = clf.score(testing_features, testing_classes) except: continue param_string = '' param_string += 'learning_rate={},'.format(learning_rate) param_string += 'n_estimators={},'.format(n_estimators) param_string += 'max_depth={}'.format(max_depth) out_text = '\t'.join([dataset.split('/')[-1][:-7], 'XGBClassifier', param_string, str(testing_score)])
#XXX[1,i]=minmax[1] minmax=band.ComputeStatistics(1) XXX[0,i]=minmax[2] XXX[1,i]=minmax[3] return XXX Xtrain,Ytrain=gen_training(4000) XXX=gen_scale(sel) joblib.dump(XXX, path+'Mosquito-Modeling/Climate/data/XXX.pkl') for i in range(Xtrain.shape[1]): # Xtrain[:,i]=(Xtrain[:,i]-XXX[0,i])/(XXX[1,i]-XXX[0,i]) Xtrain[:,i]=(Xtrain[:,i]-XXX[0,i])/XXX[1,i] # define the cross validation from sklearn.model_selection import KFold from sklearn.model_selection import cross_val_score forest=XGBClassifier(learning_rate=0.01,max_depth=9,n_estimators=700) kfold=KFold(n_splits=10) scores = cross_val_score(forest, Xtrain, Ytrain, cv=kfold) print(scores) print('Accuracy: %.2f%% (%.2f%%)' % (scores.mean()*100,scores.std()*100)) clf = svm.SVC(kernel='rbf', C=1000) scores = cross_val_score(clf, Xtrain, Ytrain, cv=kfold) print(scores) print('Accuracy: %.2f%% (%.2f%%)' % (scores.mean()*100,scores.std()*100)) clf = svm.SVC(kernel='rbf', C=1000, probability=True), Ytrain) joblib.dump(clf,path+'Mosquito-Modeling/Climate/data/clf.pkl')
def hyperopt_xgb_score(params):
    """Score one XGBoost configuration for a minimising search.

    Reads the module-level ``X`` and ``y``.

    Args:
        params: Keyword arguments passed unchanged to ``XGBClassifier``.

    Returns:
        The negated mean of a 3-fold ``cross_val_score``, so that a lower
        return value means a better configuration.
    """
    model = XGBClassifier(**params)
    fold_scores = cross_val_score(model, X, y, cv=3)
    mean_score = fold_scores.mean()
    # Log each evaluated configuration next to the score it achieved.
    print(mean_score, params)
    return -mean_score
print("Shape of data after applying PCA: ", X.shape) # In[11]: from sklearn.model_selection import KFold from sklearn.metrics import accuracy_score kf = KFold(n_splits=3) kf.get_n_splits(X) print(kf) finalaccuracy=[] for train_index, test_index in kf.split(X): X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] xgmodel = XGBClassifier(), y_train) y_pred = xgmodel.predict(X_test) predictions = [round(value) for value in y_pred] accuracy = accuracy_score(y_test, predictions) print(accuracy*100.0) finalaccuracy.append(accuracy*100.0) print("Accuracy: ", sum(finalaccuracy)/float(len(finalaccuracy))) # In[12]: from sklearn.ensemble import AdaBoostClassifier finalaccuracy=[] for train_index, test_index in kf.split(X):
# Encode the categorical columns 1 and 2 as integer labels.
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
# NOTE(review): `categorical_features` is an old OneHotEncoder argument;
# confirm it is still accepted by the installed scikit-learn version.
onehotencoder = OneHotEncoder(categorical_features=[1])
X = onehotencoder.fit_transform(X).toarray()
# Drop the first column of the encoded matrix.
X = X[:, 1:]

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

# Fitting XGBoost to the Training set
from xgboost import XGBClassifier
classifier = XGBClassifier()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
# Keep the summary statistics in names instead of discarding them.
mean_accuracy = accuracies.mean()
std_accuracy = accuracies.std()
# Load the digits data set and hold out 20% of it for testing.
digits = datasets.load_digits()
x = digits.data
y = digits.target
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2)

# A parameter grid for XGBoost
params = {
    "min_child_weight": [1, 5, 10],
    "gamma": [0.5, 1, 1.5, 2, 5],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "max_depth": [3, 4, 5],
}

xgb = XGBClassifier(
    learning_rate=0.02,
    n_estimators=50,
    objective="binary:logistic",
    silent=True,
    nthread=1,
)

# Sample three configurations from the grid and fit the search on the
# training split.
digit_search = TuneSearchCV(
    xgb, param_distributions=params, n_iter=3, use_gpu=True)
digit_search.fit(x_train, y_train)

print(digit_search.best_params_)
print(digit_search.cv_results_)
def objective(space):
    """Return the loss of the estimator selected by ``model_name``.

    Builds the estimator from ``space``, scores it with a 3-fold
    ``cross_val_score`` and turns the mean score into a value to minimise.

    Reads ``model_name``, ``verbose``, ``x_train``, ``y_train`` and
    ``scoring`` from the surrounding scope.

    Args:
        space: Mapping of hyperparameter names to values; it is unpacked
            into the estimator constructor.

    Returns:
        ``1 - mean_cv_score``.

    Raises:
        ValueError: If ``model_name`` is not one of the supported keys.
    """
    ### MODEL SELECTION
    # Each import stays inside its branch so only the selected backend has
    # to be installed.
    if model_name == "lr":
        # logistic regression
        from sklearn.linear_model import LogisticRegression

        model = LogisticRegression(**space)
    elif model_name == "rf":
        from sklearn.ensemble import RandomForestClassifier

        model = RandomForestClassifier(**space, n_jobs=-1)
    elif model_name == "xgb":
        from xgboost import XGBClassifier

        model = XGBClassifier(**space, objective="binary:logistic", nthread=-1)
    elif model_name == "dt":
        from sklearn.tree import DecisionTreeClassifier

        model = DecisionTreeClassifier(**space)
    elif model_name == "catboost":
        from catboost import CatBoostClassifier

        model = CatBoostClassifier(**space)
    elif model_name == "extratrees":
        from sklearn.ensemble import ExtraTreesClassifier

        model = ExtraTreesClassifier(**space, n_jobs=-1)
    elif model_name == "svc":
        from sklearn.svm import SVC

        model = SVC(**space)
    elif model_name == "ann":
        from sklearn import neural_network

        model = neural_network.MLPClassifier(**space)
    elif model_name == "lgb":
        import lightgbm as lgb

        model = lgb.LGBMClassifier(**space, n_jobs=-1, random_state=42)
    elif model_name == "knn":
        from sklearn.neighbors import KNeighborsClassifier

        model = KNeighborsClassifier(**space)
    else:
        # ValueError is a subclass of Exception, so callers that catch
        # Exception keep working. The list now matches the keys handled above.
        raise ValueError(
            "Invalid model_name - Please specify one of the supported "
            "model_name: lr, rf, xgb, dt, catboost, extratrees, svc, ann, "
            "lgb or knn"
        )

    # Logged once for every model, including "lr".
    if verbose:
        print("Hyperparameters: ", space)

    score = cross_val_score(
        model, x_train, y_train, cv=3, scoring=scoring, verbose=False, n_jobs=-1
    ).mean()
    # Adjust for the minimisation function: a higher score is a lower loss.
    return 1 - score
def opt_BDT(input, output, params, show, names):
    """Train an XGBoost classifier with a cross-validated number of trees.

    The number of boosting rounds is chosen by ``xgb.cv`` with early
    stopping on AUC, the model is refitted on the training split with that
    number of estimators, and accuracy and AUC on the held-out 20% are
    printed.

    Reads ``channel``, ``selection`` and ``plotter`` from the surrounding
    scope when ``show`` is true.

    Args:
        input: Feature matrix.
        output: Target labels.
        params: Keyword arguments for ``XGBClassifier``.
        show: When true, also draw the separation, ROC and feature
            importance plots.
        names: Feature names assigned to the booster before plotting
            importances.

    Returns:
        None.
    """
    model = XGBClassifier(**params)
    xgb_param = model.get_xgb_params()

    X_train, X_test, y_train, y_test = train_test_split(
        input, output, test_size=0.2, random_state=42)

    # Find the number of boosting rounds with 5-fold CV and early stopping.
    matrix_train = xgb.DMatrix(X_train, label=y_train)
    cvresult = xgb.cv(
        xgb_param,
        matrix_train,
        num_boost_round=model.get_params()["n_estimators"],
        nfold=5,
        metrics="auc",
        early_stopping_rounds=30,
        verbose_eval=True,
    )
    # One row of the CV result per boosting round that was kept.
    model.set_params(n_estimators=cvresult.shape[0])
    model.fit(X_train, y_train, eval_metric="auc")

    y_prob = model.predict_proba(X_test)
    y_pred = model.predict(X_test)
    prediction = [round(value) for value in y_pred]

    auc = roc_auc_score(y_test, y_prob[:, 1])
    accuracy = accuracy_score(y_test, prediction)
    # The original format string ended in a lone "%", which raises
    # "ValueError: incomplete format" before anything is printed.
    print("Accuracy: %.2f%%; AUC = %.4f" % (accuracy * 100, auc))

    if show:
        name = "channel_" + str(channel) + "_BDT"
        name = "%s_%s" % (name, selection)
        modelname = "models/%s.h5" % name
        print("Save to %s" % modelname)

        plotter.plot_separation(model, X_test, y_test, name, False)
        plotter.plot_ROC(model, X_test, y_test, name, False)

        model.get_booster().feature_names = names
        mp.rc("figure", figsize=(5, 5))
        plot_importance(model.get_booster())
        plt.subplots_adjust(left=0.3)
from xgboost import XGBClassifier

# Replace the index with a fresh one and drop the column that reset_index()
# inserts for the old index.
X_train_no_last_crop = X_train_no_last_crop.reset_index()
X_train_no_last_crop = X_train_no_last_crop.iloc[:, 1:]

# Search for best xgb params.
# A parameter grid for XGBoost
#params = {
#        'min_child_weight': [1, 5, 10],
#        'gamma': [0.5, 1, 1.5, 2, 5],
#        'subsample': [0.6, 0.8, 1.0],
#        'colsample_bytree': [0.6, 0.8, 1.0],
#        'max_depth': [3, 4, 5]
#        }

# Define classifier.
# `scoring` is a search/cross-validation argument, not an XGBClassifier
# parameter, so it is no longer passed to the estimator.
XGB_clf = XGBClassifier(learning_rate=0.02, n_estimators=200, silent=True,
                        objective="multi:softmax")

## Create grid search.
#folds = 3
#param_comb = 5
#
#skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)
#
#random_search = RandomizedSearchCV(XGB_clf, param_distributions=params, n_iter=param_comb, scoring='roc_auc',
#                                   n_jobs=4, cv=skf.split(X_train_no_last_crop,y_train),
#                                   verbose=3, random_state=1001 )

# Here we go
# NOTE(review): the fit target was reconstructed; the random search above is
# commented out, so the bare classifier is the only estimator left to fit.
XGB_clf.fit(X_train_no_last_crop, y_train)

# Print the best estimator.
class classifier:
    """``XGBClassifier`` wrapper that tunes one hyperparameter at a time."""

    # Hyperparameters that para_tuning() is able to search.
    _TUNABLE = (
        'learning_rate',     # [0,0.1]
        'max_depth',         # int
        'min_child_weight',  # [0,1]
        'gamma',             # [0,1]
        'max_delta_step',    # int
        'colsample_bytree',  # [0,1]
        'reg_alpha',         # [0,1]
        'reg_lambda',        # [0,1]
    )
    # Number of para_tuning() calls made by tune(); scales the progress bar.
    _TUNING_STEPS = 7
    # Width of the textual progress bar, in characters.
    _BAR_WIDTH = 58

    def __init__(self):
        # Estimator whose attributes are overwritten by tune().
        self.model = XGBClassifier()
        # Count of finished para_tuning() searches.
        self.progress = 0

    def para_tuning(self, X, y, para, grid, seed=0, verbose=False):
        """Grid-search a single hyperparameter and return its best value.

        Args:
            X: Training features.
            y: Training labels.
            para: Name of the hyperparameter to tune.
            grid: Candidate values for ``para``.
            seed: Random state of the 8-fold stratified split.
            verbose: 1 prints the tuning log, 2 plots the scores, 3 does
                both.

        Returns:
            The best value found, or ``None`` when ``para`` is empty or is
            not a tunable parameter.
        """
        # determine which parameter to tune this time
        if para == '':
            return None
        if para not in self._TUNABLE:
            print('WRONG PARAMETER.')
            return None
        param_grid = {para: grid}

        kfold = StratifiedKFold(n_splits=8, shuffle=True, random_state=seed)
        grid_search = GridSearchCV(self.model,
                                   param_grid,
                                   scoring='accuracy',
                                   n_jobs=-1,
                                   cv=kfold)
        grid_result = grid_search.fit(X, y)

        # summarize results
        means = grid_result.cv_results_['mean_test_score']
        stds = grid_result.cv_results_['std_test_score']
        params = grid_result.cv_results_['params']

        if verbose in (1, 3):
            for mean, stdev, param in zip(means, stds, params):
                print('{:.4f} ({:.4f}) WITH: {} = {}'.format(
                    mean, stdev, para, list(param.values())[0]))
            print('-' * 63)

        # Redraw the progress bar in place ('\r' returns to line start).
        self.progress += 1
        progress = int(self.progress / self._TUNING_STEPS * 100)
        progress_bar = int(self.progress / self._TUNING_STEPS * self._BAR_WIDTH)
        print('\r' + '█' * progress_bar + ' ' * (self._BAR_WIDTH - progress_bar) +
              ' {:>3}%'.format(progress),
              end='')

        if verbose in (2, 3):
            # plot
            plt.close()
            plt.figure(figsize=(20, 10))
            plt.errorbar(grid, means, yerr=stds)
            plt.title('XGBoost {} Tuning'.format(para))
            plt.xlabel(para)
            plt.ylabel('accuracy')

        return list(grid_result.best_params_.values())[0]

    def tune(self, X, y, verbose=False, seed=0):
        """Tune the model's hyperparameters one after another.

        Each search runs with the values chosen by the previous ones already
        set on ``self.model``. The learning rate is halved at the end.

        Args:
            X: Training features.
            y: Training labels.
            verbose: Forwarded to ``para_tuning``.
            seed: Stored on the model and forwarded to ``para_tuning``.
        """
        self.model.seed = seed
        # fit model on training data
        print('-' * 63)
        print('AUTO TUNING ON TRAINING DATASET.')
        self.model.n_estimators = 1024
        self.model.subsample = 0.6
        self.model.learning_rate = 0.01
        self.model.max_depth = self.para_tuning(
            X, y, 'max_depth', [2, 4, 6, 8], seed, verbose)
        self.model.min_child_weight = self.para_tuning(
            X, y, 'min_child_weight', [4, 8, 12, 16], seed, verbose)
        self.model.gamma = self.para_tuning(
            X, y, 'gamma', [0, 0.1, 0.2, 0.4, 0.8, 1.6, 3.2, 6.4, 12.8],
            seed, verbose)
        self.model.max_delta_step = self.para_tuning(
            X, y, 'max_delta_step', [0, 1, 2, 4], seed, verbose)
        self.model.colsample_bytree = self.para_tuning(
            X, y, 'colsample_bytree', [0.5, 0.6, 0.7], seed, verbose)
        self.model.reg_alpha = self.para_tuning(
            X, y, 'reg_alpha', [0, 0.001, 0.01, 0.1, 10, 100], seed, verbose)
        self.model.reg_lambda = self.para_tuning(
            X, y, 'reg_lambda', [0, 0.001, 0.01, 0.1, 10, 100], seed, verbose)
        self.model.learning_rate /= 2
        sleep(3)
        print('\rAUTO TUNING FINISHED.' + ' ' * 42)
        print('-' * 63)
        if input('MODEL REVIEWING? (Y/N) ') == 'Y':
            print(self.model)

    def train(self, data, early_stopping_rounds=None, verbose=True, seed=0):
        """Tune, fit and evaluate the model, optionally plotting the results.

        Args:
            data: Object exposing ``train`` and ``test`` as ``(X, y)`` pairs
                and ``features`` as the list of feature names.
            early_stopping_rounds: Forwarded to ``XGBClassifier.fit``.
            verbose: When exactly ``True``, plot the evaluation curves and
                the feature importances.
            seed: Forwarded to ``tune``.
        """
        X_train, y_train = data.train[0], data.train[1]
        X_test, y_test = data.test[0], data.test[1]

        # tune parameters using training dataset
        self.tune(X_train, y_train, seed=seed)
        print('-' * 63)

        # train the model with optimized parameters
        print('MODEL TRAINING.')
        metric = ['error', 'logloss', 'auc']
        # self.model.min_child_weight = 4
        self.model.fit(X_train,
                       y_train,
                       eval_metric=metric,
                       eval_set=[(X_train, y_train), (X_test, y_test)],
                       early_stopping_rounds=early_stopping_rounds,
                       verbose=False)

        # make predictions for train data
        y_pred = self.model.predict(X_train)
        predictions = [round(value) for value in y_pred]
        # evaluate predictions
        accuracy = accuracy_score(y_train, predictions)
        print('TRAINING FINISHED.')
        print('ACCURACY TRAINING: {:.2f}%'.format(accuracy * 100))

        # make predictions for test data
        y_pred = self.model.predict(X_test)
        predictions = [round(value) for value in y_pred]
        # evaluate predictions
        accuracy = accuracy_score(y_test, predictions)
        print('ACCURACY TESTING: {:.2f}%'.format(accuracy * 100))

        if verbose is True:
            # Plotting is best effort: a failure is reported, not raised.
            try:
                # plot boosting results
                results = self.model.evals_result()
                epochs = len(results['validation_0'][metric[0]])
                x_axis = range(0, epochs)
                plt.style.use('ggplot')
                plt.rcParams['font.size'] = 8
                plt.figure(figsize=(20, 10))
                for row, m in enumerate(metric):
                    ax = plt.subplot2grid((len(metric), 2), (row, 0))
                    # validation_0 / validation_1 follow the eval_set order.
                    ax.plot(x_axis, results['validation_0'][m], label='Train')
                    ax.plot(x_axis, results['validation_1'][m], label='Test')
                    ax.legend()
                    ax.set_ylabel(m)

                # plot feature importances
                features = data.features
                mapFeat = dict(
                    zip(['f' + str(i) for i in range(len(features))],
                        features))
                # Newer xgboost exposes get_booster(); older releases only
                # had booster().
                if hasattr(self.model, 'get_booster'):
                    booster = self.model.get_booster()
                else:
                    booster = self.model.booster()
                imp = pd.Series(booster.get_fscore())
                imp.index = imp.reset_index()['index'].map(mapFeat)
                ax = plt.subplot2grid((len(metric), 2), (0, 1),
                                      rowspan=len(metric))
                imp.sort_values().plot(kind='barh')
                ax.set_ylabel('importance')
            except Exception:
                print('PLOTTING ERROR.')
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score print(classification_report(y_test, y_pred_svc)) print(confusion_matrix(y_test, y_pred_svc)) from sklearn.linear_model import LogisticRegression # Building pipeline text_clf_lr = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LogisticRegression())]) # Fitting and generating predictions, y_train) y_pred_lr = text_clf_lr.predict(X_test) print(classification_report(y_test, y_pred_lr)) print(confusion_matrix(y_test, y_pred_lr)) from xgboost import XGBClassifier # Building pipeline text_clf_xgb = Pipeline([('tfidf', TfidfVectorizer()), ('clf', XGBClassifier())]) # Fitting and generating predictions, y_train) y_pred_xgb = text_clf_xgb.predict(X_test) print(classification_report(y_test, y_pred_xgb)) print(confusion_matrix(y_test, y_pred_xgb)) from sklearn.ensemble import RandomForestClassifier # Building pipeline text_clf_rf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', RandomForestClassifier())]) # Fitting and generating predictions, y_train) y_pred_rf = text_clf_rf.predict(X_test) print(classification_report(y_test, y_pred_rf)) print(confusion_matrix(y_test, y_pred_rf)) model_performance = [
def print_results(dataset, set1, set2):
    """Compare classifiers on two patient sets with repeated stratified CV.

    Rows of ``dataset`` are split by ``patient_ID`` membership in ``set1``
    and ``set2``. A set-indicator column (0 or 1) is prepended to each part.
    For every fold of a 5-split, 10-repeat stratified CV, five feature and
    model combinations are evaluated. The per-fold result of each is
    printed, followed by a summary over all folds.

    Args:
        dataset: Table with a ``patient_ID`` column.
        set1: Patient IDs of the first set.
        set2: Patient IDs of the second set.
    """
    first_rows = dataset.loc[dataset['patient_ID'].isin(set1)]
    X_set1, y_set1 = prepare_full_dataset(first_rows)
    second_rows = dataset.loc[dataset['patient_ID'].isin(set2)]
    X_set2, y_set2 = prepare_full_dataset(second_rows)

    # Column 0 of the combined matrix marks the set of origin.
    X_genes_wf = np.concatenate(
        [add_one_features(X_set1, 0), add_one_features(X_set2, 1)])
    y_all = np.concatenate([y_set1, y_set2])

    kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10)

    print_order = [
        "genes", "genes_set", "genes_biased", "genes_double", "study"
    ]
    max_len_order = max(len(label) for label in print_order)

    rez = defaultdict(list)

    for train_index, test_index in kf.split(X_genes_wf, y_all):
        train_wf = X_genes_wf[train_index]
        test_wf = X_genes_wf[test_index]
        y_train = y_all[train_index]
        y_test = y_all[test_index]

        print("before balanced")
        print_count_two_sets(train_wf[:, 0], y_train)
        print_count_two_sets(test_wf[:, 0], y_test)

        # Both splits are balanced by random upsampling.
        train_wf, y_train = random_upsample_balance(train_wf, y_train)
        test_wf, y_test = random_upsample_balance(test_wf, y_test)

        print("after balanced")
        print_count_two_sets(train_wf[:, 0], y_train)
        print_count_two_sets(test_wf[:, 0], y_test)

        # (label, train features, test features, model factory), evaluated
        # in print order; a fresh model is built for each run.
        experiments = (
            ("genes", train_wf[:, 1:], test_wf[:, 1:], XGBClassifier),
            ("genes_set", train_wf, test_wf, XGBClassifier),
            ("genes_biased", train_wf, test_wf, BiasedXgboost),
            ("genes_double", train_wf, test_wf, DoubleXgboost),
            ("study", train_wf[:, :1], test_wf[:, :1], XGBClassifier),
        )
        for label, X_tr, X_te, make_model in experiments:
            rez[label].append(
                calc_results_simple(X_tr, X_te, y_train, y_test, make_model()))

        # Result of the fold that has just finished.
        for order in print_order:
            print(order, " " * (max_len_order - len(order)), ": ",
                  list_to_4g_str(rez[order][-1]))
        print("")

    # Summary over all folds.
    for order in print_order:
        print("==> ", order, " " * (max_len_order - len(order)), ": ",
              list2d_to_4g_str_pm(rez[order]))
for col in numeric_cols: plt.subplot(int(np.ceil(len(numeric_cols)/3)),3,nplot) sns.distplot(zeros[col],hist=False,label='Misses') sns.distplot(ones[col],hist=False,label='Hits') nplot+=1 plt.legend() plt.tight_layout() # %% Modelling ############################################### estimator = XGBClassifier() # estimator = LogisticRegression() param_grid={ 'max_depth':[3,5,12], 'n_estimators':[50,100,200], 'objective':['binary:logistic'] } # param_grid={} clf = GridSearchCV(estimator,param_grid,scoring='precision',cv=2,verbose=2,n_jobs=-1),y_train) preds_proba = clf.predict_proba(X_test)[:,1] threshold = 0.35 # preds = clf.predict(X_test)
lparams['bagging_freq'] = 6 #lparams['early_stopping_round'] = 20 cparams['n_estimators'] = 120 cparams['max_depth'] = 3 #cparams['l2_leaf_reg'] = 0.001 if use_gpu: xparams['tree_method'] = 'gpu_hist' xparams['predictor'] = 'gpu_predictor' xparams['objective'] = 'gpu:binary:logistic' n_jobs = 1 else: xparams['objective'] = 'binary:logistic' lparams['objective'] = 'binary' xgbm = XGBClassifier(**xparams) lgbm = LGBMClassifier(**lparams) cgbm = CatBoostClassifier(**cparams) rdf = RandomForestClassifier() classifiers = [rdf, xgbm, lgbm] classifiers = [xgbm, lgbm, cgbm] classifiers = [xgbm, lgbm] lr = LogisticRegression(C=0.1) grid = StackingClassifier(classifiers=classifiers, use_probas=True, average_probas=False, meta_classifier=lr) n_estimators = [100, 300] n_estimators = sp_randint(250, 500) max_depth = [2, 3]
cv = []
biter = []
for fold, (itr, icv) in enumerate(skf):
    # Parenthesised single-argument print works on Python 2 and 3 alike.
    print("------ Fold %d -----------\n" % (fold + 1))

    X_train = train_processed.iloc[itr]
    X_valid = train_processed.iloc[icv]
    Y_train = target[itr]
    Y_valid = target[icv]

    # Train with early stopping; the validation fold is the last eval_set
    # entry.
    gbm = XGBClassifier(max_depth=8,
                        learning_rate=0.01,
                        n_estimators=10000,
                        subsample=0.9,
                        colsample_bytree=0.45,
                        objective="binary:logistic",
                        silent=False,
                        min_child_weight=1,
                        nthread=-1).fit(X_train,
                                        Y_train,
                                        eval_metric="logloss",
                                        eval_set=[(X_train, Y_train),
                                                  (X_valid, Y_valid)],
                                        early_stopping_rounds=200,
                                        verbose=20)

    ll = gbm.best_score
    best_iter = gbm.best_iteration
    cv.append(ll)
crossval_splits = 5
# One slot per fold for each metric.
accuracy = numpy.zeros(crossval_splits)
sensitivity = numpy.zeros(crossval_splits)
specificity = numpy.zeros(crossval_splits)

skf = StratifiedKFold(n_splits=crossval_splits, shuffle=True, random_state=123)
skf.get_n_splits(data, labels)

# `cont` is the fold index. It used to stay at 0, so every fold was
# accumulated into accuracy[0].
for cont, (train_index, test_index) in enumerate(skf.split(data, labels)):
    train_data, test_data = data[train_index], data[test_index]
    train_labels, test_labels = labels[train_index], labels[test_index]

    #XGB Classifier
    model = XGBClassifier(use_label_encoder=False, booster='gbtree',
                          random_state=123)
    model.fit(train_data, train_labels)

    #Compute scores
    pred = model.predict(test_data)
    predictions = numpy.asarray([round(value) for value in pred])

    # Rows are true classes, columns are predicted classes.
    ConfussionMatrix = numpy.zeros((no_classes, no_classes))
    for i in range(pred.shape[0]):
        ConfussionMatrix[test_labels[i], predictions[i]] += 1.0

    # Accuracy is the diagonal mass over the total count.
    for i in range(no_classes):
        accuracy[cont] += ConfussionMatrix[i, i]
    accuracy[cont] /= ConfussionMatrix.sum()
    print('accuracy: ' + str(accuracy[cont]))
auc = roc_auc_score(y_actual, y_pred) accuracy = accuracy_score(y_actual, (y_pred > thresh)) recall = recall_score(y_actual, (y_pred > thresh)) precision = precision_score(y_actual, (y_pred > thresh)) specificity = calc_specificity(y_actual, y_pred, thresh) print('AUC:%.3f' % auc) print('accuracy:%.3f' % accuracy) print('recall:%.3f' % recall) print('precision:%.3f' % precision) print('specificity:%.3f' % specificity) print('prevalence:%.3f' % rate(y_actual)) print(' ') return auc, accuracy, recall, precision, specificity from xgboost import XGBClassifier import xgboost as xgb xgbc = XGBClassifier(), y_train) y_train_preds = xgbc.predict_proba(X_train_tf)[:, 1] y_valid_preds = xgbc.predict_proba(X_valid_tf)[:, 1] print('Xtreme Gradient Boosting Classifier ') print('Training: ') xgbc_train_auc, xgbc_train_accuracy, xgbc_train_recall, xgbc_train_precision, xgbc_train_specificity = print_report( y_train, y_train_preds, thresh) print('Validation: ') xgbc_valid_auc, xgbc_valid_accuracy, xgbc_valid_recall, xgbc_valid_precision, xgbc_valid_specificity = print_report( y_valid, y_valid_preds, thresh), Y_train) from sklearn.ensemble import RandomForestRegressor model = RandomForestRegressor(n_estimators=300, random_state=0), Y_train) from sklearn.tree import DecisionTreeRegressor model = DecisionTreeRegressor(random_state=0), Y_train) from sklearn.svm import SVR model = SVR(kernel='rbf'), Y_train) from xgboost import XGBClassifier model = XGBClassifier(), Y_train) from sklearn.neighbors import KNeighborsRegressor model = KNeighborsRegressor(n_neighbors=5, metric='manhattan'), Y_train) from sklearn.linear_model import Lasso model = Lasso(), y_train) from sklearn.linear_model import Ridge model = Ridge(), y_train) from sklearn.linear_model import ElasticNet
X = ohe.fit_transform(X).toarray()
# Drop the first column of the encoded matrix.
X = X[:, 1:]

#%%
# Split the data into training and test sets.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.33, random_state=0)

#%%
# Scaling the data:
# applying feature scaling is probably not needed here.

#%%
# XGBoost
from xgboost import XGBClassifier

classifier = XGBClassifier()
# After all this is a classifier, so XGBoost is a classification algorithm.
classifier.fit(x_train, y_train)

y_pred = classifier.predict(x_test)

from sklearn.metrics import confusion_matrix

# confusion_matrix expects (y_true, y_pred); the swapped order transposed
# the matrix.
cm = confusion_matrix(y_test, y_pred)
print(cm)
n_models = len(models_1) + len(models_2)
print('We have %d classes and %d models TOTAL so in resulting arrays '
      'we expect to see %d columns.' % (n_classes, n_models,
                                        n_classes * n_models))

# Create empty arrays
S_train_all = np.zeros((X_train.shape[0], 0))
S_test_all = np.zeros((X_test.shape[0], 0))

# Load results
# Each file holds a pair: entry 0 is appended to the train matrix and
# entry 1 to the test matrix, as extra columns.
for name in sorted(glob('*.npy')):
    print('Loading: %s' % name)
    S = np.load(name)
    S_train_all = np.c_[S_train_all, S[0]]
    S_test_all = np.c_[S_test_all, S[1]]

print('\nS_train_all shape:', S_train_all.shape)
print('S_test_all shape: ', S_test_all.shape)

# Initialize 2nd level model
model = XGBClassifier(random_state=0, n_jobs=-1, learning_rate=0.1,
                      n_estimators=100, max_depth=3)

# Fit 2nd level model
model = model.fit(S_train_all, y_train)

# Predict
y_pred = model.predict_proba(S_test_all)

# Final prediction score
print('Final prediction score: %.8f' % log_loss(y_test, y_pred))
def fit_xgboost(params, X, y):
    """Build an ``XGBClassifier`` from ``params`` and fit it.

    Args:
        params: Keyword arguments for the ``XGBClassifier`` constructor.
        X: Training features.
        y: Training labels.

    Returns:
        The fitted classifier.
    """
    clf = XGBClassifier(**params)
    clf.fit(X, y)
    return clf
_ = death_preds.actual.value_counts() ax=ax[0], rot=0, color=(sns.color_palette()[0], sns.color_palette()[2])).set(xticklabels=["Alive", "Deceased"]) _ = death_preds.actual.value_counts().plot.pie(labels = ("Alive", "Deceased"), autopct = "%.2f%%", label = "", fontsize = 13., ax = ax[1],\ colors = (sns.color_palette()[0], sns.color_palette()[2]), wedgeprops = {"linewidth": 1.5, "edgecolor": "#F7F7F7"}), ax[1].texts[1].set_color("#F7F7F7"), ax[1].texts[3].set_color("#F7F7F7") X = death_preds[death_preds.actual == 0].sample(350, random_state=62).append( death_preds[death_preds.actual == 1].sample( 350, random_state=62)).copy(deep=True).astype(np.float64) Y = X.actual.values tX = death_preds[~death_preds.index.isin(X.index)].copy(deep=True).astype( np.float64) tY = tX.actual.values X.drop(["SNo", "actual", "DateoFdeath"], 1, inplace=True) tX.drop(["SNo", "actual", "DateoFdeath"], 1, inplace=True) clf_xgb = XGBC(subsample=.8, colsample_bytree=.8, seed=14, max_depth=3).fit(X, Y) preds_xgb = clf_xgb.predict_proba(tX) ax = pd.DataFrame(list(clf_xgb.get_booster().get_fscore().items())).set_index(0)\ .sort_values(1).plot.barh(figsize = (12, 8)) _ = ax.set(frame_on=False, ylim=(0, len(clf_xgb.get_booster().get_fscore())), xticklabels="", xlabel="", ylabel=""), ax.legend("") _ = plt.title("XGB Feature Importance", fontsize=18.) logreg = LogisticRegression(random_state=14).fit(X, Y) preds_lr = logreg.predict_proba(tX) df = pd.DataFrame(list(zip(tX.columns, logreg.coef_[0]))) df = df.reindex(df[1].abs().sort_values().index).set_index(0) ax = df.plot.barh(width=.6, legend="", figsize=(12, 9))
# plot decision tree
from numpy import loadtxt
from xgboost import XGBClassifier
from xgboost import plot_tree
from matplotlib import pyplot

# load data
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")

# split data into X and y: the first eight columns are the features and
# column 8 is the label
X = dataset[:, 0:8]
y = dataset[:, 8]

# fit model on training data
model = XGBClassifier()
model.fit(X, y)

# plot single tree
plot_tree(model)
skf = StratifiedKFold(np.array(train["TARGET"]),
                      n_folds=10,
                      shuffle=True,
                      random_state=14)

cv = []
biter = []

for fold, (itr, icv) in enumerate(skf):
    # Parenthesised single-argument print works on Python 2 and 3 alike.
    print("------ Fold %d -----------\n" % (fold + 1))

    trainingSet = train.iloc[itr]
    validationSet = train.iloc[icv]
    # Build the label arrays once per fold instead of once per use.
    training_target = np.array(trainingSet["TARGET"])
    validation_target = np.array(validationSet["TARGET"])

    # Train with early stopping; the validation fold is the last eval_set
    # entry.
    gbm = XGBClassifier(max_depth=4,
                        learning_rate=0.01,
                        n_estimators=3000,
                        subsample=0.8,
                        colsample_bytree=0.5,
                        objective="binary:logistic",
                        silent=False,
                        min_child_weight=5,
                        nthread=-1).fit(
                            trainingSet[feature_names],
                            training_target,
                            eval_metric="auc",
                            eval_set=[(trainingSet[feature_names],
                                       training_target),
                                      (validationSet[feature_names],
                                       validation_target)],
                            early_stopping_rounds=200,
                            verbose=20)

    ll = gbm.best_score
    best_iter = gbm.best_iteration
    cv.append(ll)
    biter.append(best_iter)
    print("---auc : %0.6f\n" % ll)
#df_null_check.to_excel('Data/Null check.xlsx', index=False) _ = StandardScaler().fit_transform(X_test) X_test = pd.DataFrame(_, columns=X_test.columns) X_train = X_train.round(3) y_train = y_train.round(3) #X_train.to_excel(r'Linh Tinh/Linh tinh.xlsx', index = False) #, y_train) # df_new = rfecv.fit_transform(X_train,y_train) # print("Best Features:", rfecv.get_support) # print("Optimal number of features : %d" % rfecv.n_features_) print("Start Feature Selection") clf_feature_selection = XGBClassifier(colsample_bytree=0.5, gamma=0.1, learning_rate=0.15, max_depth=20, min_child_weight=5, n_estimators=400) clf = XGBClassifier() rfecv = RFECV(estimator=clf_feature_selection, step=1, cv=StratifiedKFold(2), scoring='roc_auc') params = { "learning_rate": [0.05, 0.15, 0.3], "max_depth": [5, 10, 20, 30, 50, 70], "min_child_weight": [5, 10, 20, 50, 100], "gamma": [0.0, 0.1, 0.2, 0.4, 0.5], "colsample_bytree": [0.2, 0.3, 0.5, 0.7], "n_estimators": [100, 200, 400, 500, 600]
# Separate the label column from the features.
target = df['TARGET']
del df['TARGET']
# del df['ID']
# NOTE(review): `id` shadows the builtin; left unchanged because code beyond
# this section may read it.
id = df_test['ID']
# del df_test['ID']

pca = PCA(n_components=250)
train_pcaed = pca.fit_transform(df, target)

# First-level models: each is fitted on the PCA features and then predicts
# class probabilities for those same rows.
random_forest = RandomForestClassifier(n_estimators=30, max_depth=5,
                                       max_features=20)
random_forest.fit(train_pcaed, target)
forested = random_forest.predict_proba(train_pcaed)
# pipe = Pipeline(steps=[('pca', pca), ('random_forest', random_forest)])

m2_xgb = XGBClassifier(n_estimators=110, nthread=1, max_depth=4)
m2_xgb.fit(train_pcaed, target)
m2_xgbed = m2_xgb.predict_proba(train_pcaed)

logistic_regression = LogisticRegression(penalty='l1')
logistic_regression.fit(train_pcaed, target)
logistic_regressioned = logistic_regression.predict_proba(train_pcaed)

# Second-level model on the concatenated first-level probabilities.
combined = np.concatenate([forested, m2_xgbed, logistic_regressioned], axis=1)

log_reg = LogisticRegression()
log_reg.fit(combined, target)
scores = cross_validation.cross_val_score(log_reg, combined, target, cv=5,
                                          scoring='roc_auc')
# split into training and test
from sklearn.model_selection import train_test_split

validation_size = 0.2
seed = 0
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=validation_size, random_state=seed)

# create instance of algorithm
from xgboost import XGBClassifier

model = XGBClassifier()

# fit algorithm to the training set (not required if using parameter tuning)
model.fit(X_train, y_train)

# predict
y_predicted = model.predict(X_test)