def load_pkl(name):
    """Load xgboost model from pickle and perform conversion from version
    0.90 if necessary.

    :return: XGBoost model
    """
    import pickle
    import xgboost
    with open(name, 'rb') as f:
        try:
            model = pickle.load(f)
            return model
        except xgboost.core.XGBoostError as e:
            if "Check failed: header == serialisation_header_" in str(e):
                import xgboost_prev  # pylint: disable=unused-import
                import tempfile

                class Unpickler(pickle.Unpickler):
                    def find_class(self, module, name):
                        if module.startswith("xgboost"):
                            return pickle.Unpickler.find_class(
                                self, module.replace("xgboost", "xgboost_prev"), name)
                        return pickle.Unpickler.find_class(self, module, name)

                f.seek(0)
                model = Unpickler(f).load()
                temp_file = tempfile.NamedTemporaryFile(
                    prefix='xgboost_migration', suffix='.model')
                model.save_model(temp_file.name)
                migrated_model = xgboost.XGBModel()
                migrated_model.load_model(temp_file.name)
                return migrated_model
            raise
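# Usage sketch (added for illustration, not part of the original module):
# 'legacy_model.pkl' is a hypothetical path to a pickle written with xgboost
# 0.90, and the 0.90 package is assumed to be importable as `xgboost_prev`.
# load_pkl() then returns either the original pickled model or a migrated
# xgboost.XGBModel:
#
#     model = load_pkl('legacy_model.pkl')
#     predictions = model.predict(features)  # `features` assumed defined elsewhere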
def train_xgb_model(subtrain_x, subtrain_y, validation_x, validation_y, best_model_fname):
    print(subtrain_x.shape)
    params = {
        'min_child_weight': 1,
        'learning_rate': 0.03,
        'colsample_bytree': 0.9,
        'subsample': 0.9,
        'gamma': 1,
        'silent': 0,
        'seed': 1234,
        # 'booster': 'gblinear',
        # 'booster': 'gbtree',
        'max_depth': 9,
        'objective': 'reg:linear',
        'nthread': 10,
        'n_estimators': 2000,
    }
    # xgsubtrain = xgb.DMatrix(subtrain_x, label=subtrain_y)
    # xgval = xgb.DMatrix(validation_x, label=validation_y)
    # rgs = xgb.train(params, xgsubtrain, early_stopping_rounds=10, eval=(xgval, 'eval'))
    rgs = xgb.XGBModel(**params)
    rgs.fit(
        subtrain_x,
        subtrain_y,
        eval_set=[(subtrain_x, subtrain_y), (validation_x, validation_y)],
        eval_metric='mae',
        early_stopping_rounds=30,
        verbose=True,
    )
    return rgs, mean_absolute_error(validation_y, rgs.predict(validation_x))
def __init__(self):
    self.preprocessor = joblib.load(os.getenv('PREPROCESSOR_PATH'))
    self.clf = xgb.XGBModel(**{
        'objective': 'binary:logistic',
        'n_estimators': 10
    })
    self.clf.load_model(os.getenv('MODEL_PATH'))
def test_save_load_model():
    tm._skip_if_no_sklearn()
    from sklearn.datasets import load_digits
    try:
        from sklearn.model_selection import KFold
    except ImportError:
        from sklearn.cross_validation import KFold

    digits = load_digits(2)
    y = digits['target']
    X = digits['data']
    try:
        kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
    except TypeError:
        # sklearn.model_selection.KFold uses n_splits
        kf = KFold(n_splits=2, shuffle=True,
                   random_state=rng).split(np.arange(y.shape[0]))

    with TemporaryDirectory() as tempdir:
        model_path = os.path.join(tempdir, 'digits.model')
        for train_index, test_index in kf:
            xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
            xgb_model.save_model(model_path)
            xgb_model = xgb.XGBModel()
            xgb_model.load_model(model_path)
            preds = xgb_model.predict(X[test_index])
            labels = y[test_index]
            err = sum(1 for i in range(len(preds))
                      if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
            assert err < 0.1
def __init__(self, **kwargs):
    self.n_estimators = kwargs['xgb_n_estimators']
    self.objective = kwargs['xgb_objective']
    self.eval_metric = kwargs['xgb_eval_metric']
    self.verbose = kwargs['xgb_verbose']
    self.model = xgb.XGBModel(n_estimators=self.n_estimators,
                              objective=self.objective)
def save_load_model(model_path):
    from sklearn.datasets import load_digits
    from sklearn.model_selection import KFold

    digits = load_digits(2)
    y = digits['target']
    X = digits['data']
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf.split(X, y):
        xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
        xgb_model.save_model(model_path)
        xgb_model = xgb.XGBClassifier()
        xgb_model.load_model(model_path)
        assert isinstance(xgb_model.classes_, np.ndarray)
        assert isinstance(xgb_model._Booster, xgb.Booster)
        assert isinstance(xgb_model._le, XGBoostLabelEncoder)
        assert isinstance(xgb_model._le.classes_, np.ndarray)
        preds = xgb_model.predict(X[test_index])
        labels = y[test_index]
        err = sum(1 for i in range(len(preds))
                  if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
        assert err < 0.1
        assert xgb_model.get_booster().attr('scikit_learn') is None

        # test native booster
        preds = xgb_model.predict(X[test_index], output_margin=True)
        booster = xgb.Booster(model_file=model_path)
        predt_1 = booster.predict(xgb.DMatrix(X[test_index]),
                                  output_margin=True)
        assert np.allclose(preds, predt_1)

        with pytest.raises(TypeError):
            xgb_model = xgb.XGBModel()
            xgb_model.load_model(model_path)
def search_cv(x_train, y_train, x_test):
    xgb_model = xgb.XGBModel()
    params = {
        'booster': ['gblinear'],
        'silent': [1],
        'learning_rate': [x for x in np.round(np.linspace(0.01, 1, 20), 2)],
        'reg_lambda': [lambd for lambd in np.logspace(0, 3, 50)],
        'objective': ['reg:linear']
    }
    print('begin')
    clf = GridSearchCV(xgb_model, params, scoring='neg_mean_squared_error', refit=True)
    clf.fit(x_train, y_train)
    preds = clf.predict(x_test)

    sub_df = pd.read_csv('raw_data/answer_sample_b_20180117.csv', header=None)
    sub_df['Value'] = preds
    sub_df.to_csv('result/xgboost4.csv', header=None, index=False)

    # grid_scores_ was removed from scikit-learn; best_params_ / best_score_
    # provide the same information.
    best_parameters, score = clf.best_params_, clf.best_score_
    print('Raw RMSE:', score)
    for param_name in sorted(best_parameters.keys()):
        print("%s: %r" % (param_name, best_parameters[param_name]))
def fit(self):
    # print(self.args, self.X.shape, self.y.shape)
    appFeatures = ["%s_%d_%d" % (x, i, t)
                   for t in [1, 0] for x in libdata.apprates for i in [0, 1]]
    phyFeatures = ["%s_0" % (x) for x in libdata.targets]
    feature_names = np.append(appFeatures, phyFeatures)
    X = self.X
    y = self.y
    # print(X)
    if self.m is not None and self.m < len(self.X):
        # sample m row indices without replacement
        sels = np.random.choice(X.shape[0], self.m, replace=False)
        X = X[sels]
        y = y[sels]
    self.nys = y.shape[1]
    print(len(X))
    for i in range(self.nys):
        self.models.append(xgb.XGBModel(**self.args))
        self.models[i].fit(X, y[:, i])
        if False:
            # plot_importance(self.models[i])
            feature_importance = self.models[i].feature_importances_
            feature_importance = 100.0 * (feature_importance / feature_importance.max())
            sorted_idx = np.argsort(feature_importance)
            sorted_idx = sorted_idx[-10:]
            pos = np.arange(sorted_idx.shape[0]) + .5
            plt.figure()
            plt.barh(pos, feature_importance[sorted_idx], align='center')
            print(sorted_idx)
            plt.yticks(pos, feature_names[sorted_idx])
            plt.xlabel('Relative Importance')
            plt.title('Feature Importance for Fan Power Prediction')
            plt.savefig("power_10r.eps", bbox_inches='tight')
            print(feature_importance)
def test_save_load_model():
    with TemporaryDirectory() as tempdir:
        model_path = os.path.join(tempdir, 'digits.model')
        save_load_model(model_path)

    with TemporaryDirectory() as tempdir:
        model_path = os.path.join(tempdir, 'digits.model.json')
        save_load_model(model_path)

    from sklearn.datasets import load_digits
    with TemporaryDirectory() as tempdir:
        model_path = os.path.join(tempdir, 'digits.model.json')
        digits = load_digits(2)
        y = digits['target']
        X = digits['data']
        booster = xgb.train({'tree_method': 'hist',
                             'objective': 'binary:logistic'},
                            dtrain=xgb.DMatrix(X, y),
                            num_boost_round=4)
        predt_0 = booster.predict(xgb.DMatrix(X))
        booster.save_model(model_path)

        cls = xgb.XGBClassifier()
        cls.load_model(model_path)
        predt_1 = cls.predict(X)
        assert np.allclose(predt_0, predt_1)

        cls = xgb.XGBModel()
        cls.load_model(model_path)
        predt_1 = cls.predict(X)
        assert np.allclose(predt_0, predt_1)
def XGBoost(self):
    self.Encoding()
    X_train, X_test, y_train, y_test = train_test_split(
        Train.X, Train.y, test_size=0.25, random_state=4)
    clf = xgb.XGBModel(max_depth=8, n_estimators=100,
                       objective="reg:linear", random_state=17, n_jobs=-1)
    clf.fit(X_train, y_train,
            eval_metric='rmse',
            verbose=True,
            eval_set=[(X_train, y_train), (X_test, y_test)])
    clf.save_model('./model/XGBoost.model')
    with open("XGBoost.pickle.dat", "wb") as f:
        pickle.dump(clf, f)
def predict_cy_young(data_path, model_name, labels_to_drop=None):
    bst = xgb.XGBModel()
    bst.load_model(model_name)
    data = pd.read_csv(data_path)
    to_predict = data.drop(labels=labels_to_drop, axis=1)
    ypreds = bst.predict(to_predict)
    return ypreds
def __init__(self, task: Task, scorer: Scorer,
             opt_logger: OptimizationLogger = VoidLogger(None)):
    if task.task == "classification":
        space = XGBoostOptimizer.Params.classification_space
    else:
        space = XGBoostOptimizer.Params.general_space
    super().__init__(xgb.XGBModel(), task, space, scorer, opt_logger)
def train_xgboost(data, avg={}):
    test_X, test_Y = load_data_no_cut(data, avg)
    bst = xgb.XGBModel(max_depth=6, learning_rate=0.1, silent=True,
                       objective='reg:linear', subsample=0.7,
                       reg_alpha=0.5, reg_lambda=0.3, n_estimators=80)
    # bst.set_params(**param)
    bst.fit(test_X, test_Y)
    return bst
def xgboosting(X_train, y_train, n_estimators, params):
    print("> Model type : XGBoost")
    power_lines = y_train.columns
    trained_models = {}
    for pl in power_lines:
        # build a fresh model per power line; reusing a single estimator would
        # leave every dict entry pointing at the same (last fitted) object
        model = xgb.XGBModel(objective='reg:squarederror', max_depth=11,
                             subsample=0.5, colsample_bytree=0.5,
                             learning_rate=0.1, n_estimators=n_estimators,
                             verbosity=0, seed=42)
        trained_models[pl] = model.fit(X_train, y_train[pl])
    save_model(trained_models, "xgboost", False, params)
def train_xgboost(train_x, train_y, test_x, test_y, data_c):
    param = {"booster": "gbtree", "max_depth": 2, "eta": 0.3,
             "objective": "binary:logistic", "nthread": 2}
    num_round = 100
    train_mat = xgb.DMatrix(train_x, train_y)
    test_mat = xgb.DMatrix(test_x, label=test_y)
    all_mat = xgb.DMatrix(data_c.drop(columns=["Prod1"]), label=data_c[["Prod1"]])
    evaluation = [(test_mat, "eval"), (train_mat, "train")]
    bst = xgb.train(param, train_mat, num_round, evaluation)

    clf3 = xgb.XGBModel(**param)
    clf3.fit(train_x, train_y,
             eval_set=[(train_x, train_y), (test_x, test_y)],
             eval_metric='logloss')

    print(roc_auc_score(test_y, bst.predict(test_mat)))
    print(roc_auc_score(test_y, clf3.predict(test_x)))
def test_save_load_model(self):
    self._init_ray()

    with TemporaryDirectory() as tempdir:
        model_path = os.path.join(tempdir, "digits.model")
        self.save_load_model(model_path)

    with TemporaryDirectory() as tempdir:
        model_path = os.path.join(tempdir, "digits.model.json")
        self.save_load_model(model_path)

    from sklearn.datasets import load_digits
    with TemporaryDirectory() as tempdir:
        model_path = os.path.join(tempdir, "digits.model.json")
        digits = load_digits(n_class=2)
        y = digits["target"]
        X = digits["data"]
        booster = xgb.train(
            {"tree_method": "hist", "objective": "binary:logistic"},
            dtrain=xgb.DMatrix(X, y),
            num_boost_round=4,
        )
        predt_0 = booster.predict(xgb.DMatrix(X))
        booster.save_model(model_path)

        cls = RayXGBClassifier()
        cls.load_model(model_path)

        proba = cls.predict_proba(X)
        assert proba.shape[0] == X.shape[0]
        assert proba.shape[1] == 2  # binary

        predt_1 = cls.predict_proba(X)[:, 1]
        assert np.allclose(predt_0, predt_1)

        cls = xgb.XGBModel()
        cls.load_model(model_path)
        predt_1 = cls.predict(X)
        assert np.allclose(predt_0, predt_1)
def test_save_load_model():
    from sklearn.datasets import load_digits
    from sklearn.model_selection import KFold

    digits = load_digits(2)
    y = digits['target']
    X = digits['data']
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    with TemporaryDirectory() as tempdir:
        model_path = os.path.join(tempdir, 'digits.model')
        for train_index, test_index in kf.split(X, y):
            xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
            xgb_model.save_model(model_path)
            xgb_model = xgb.XGBModel()
            xgb_model.load_model(model_path)
            preds = xgb_model.predict(X[test_index])
            labels = y[test_index]
            err = sum(1 for i in range(len(preds))
                      if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
            assert err < 0.1
def best_xgb(X_train, y_train, n_estimators, params):
    importance_tab = load("importance_per_w")
    power_lines = y_train.columns[y_train.columns.str.match("NPWD")]
    trained_models = {}
    for pl in power_lines:
        print(">> Fitting", pl)
        features = importance_tab[pl]
        # build a fresh model per power line; reusing a single estimator would
        # leave every dict entry pointing at the same (last fitted) object
        model = xgb.XGBModel(objective='reg:squarederror', max_depth=11,
                             subsample=0.5, colsample_bytree=0.5,
                             learning_rate=0.1, n_estimators=n_estimators,
                             verbosity=0, seed=42)
        model.fit(X_train[features[:40]], y_train[pl])
        trained_models[pl] = model
    save_model(trained_models, "xgboost", False, params)
def search(self, x_train, y_train, x_test):
    xgb_model = xgb.XGBModel()
    params = {
        'booster': ['gblinear'],
        'silent': [1],
        'learning_rate': [x for x in np.round(np.linspace(0.01, 1, 20), 2)],
        'reg_lambda': [lambd for lambd in np.logspace(0, 3, 50)],
        'objective': ['reg:linear']
    }
    print('begin')
    clf = GridSearchCV(xgb_model, params, scoring='neg_mean_squared_error', refit=True)
    clf.fit(x_train, y_train)
    preds = clf.predict(x_test)
    return preds
def get_estimator(estimator):
    if estimator == 'booster':
        e = xgb.XGBModel(objective='reg:squarederror', max_depth=11,
                         subsample=0.5, colsample_bytree=0.5,
                         learning_rate=0.1, n_estimators=500,
                         silent=1, seed=42)
    elif estimator == 'xtrees':
        e = ExtraTreesRegressor(n_estimators=500, random_state=0,
                                min_samples_leaf=20, n_jobs=-1)
    elif estimator == 'rf':
        e = RandomForestRegressor(n_estimators=500, random_state=1,
                                  min_samples_leaf=10, n_jobs=-1)
    return e
def train_with_xgboost(self, x_train, y_train, x_test, y_test):
    xgb_model = xgb.XGBModel()
    params = {
        'booster': ['gblinear'],
        'silent': [1],
        'learning_rate': [x for x in np.round(np.linspace(0.01, 1, 20), 2)],
        'reg_lambda': [lambd for lambd in np.logspace(0, 3, 50)],
        'objective': ['reg:linear']
    }
    print('begin')
    clf = GridSearchCV(xgb_model, params, scoring='neg_mean_squared_error', refit=True)
    clf.fit(x_train, y_train)
    preds = clf.predict(x_test)
    print('test mse:', self.cal_MSE(preds, y_test))

    # grid_scores_ was removed from scikit-learn; best_params_ / best_score_
    # provide the same information.
    best_parameters, score = clf.best_params_, clf.best_score_
    print('Raw RMSE:', score)
    for param_name in sorted(best_parameters.keys()):
        print("%s: %r" % (param_name, best_parameters[param_name]))
def test_multi_adboost_cart(data):
    test_X, test_Y = load_data(data)
    adaboost = MultiAdaBoostRegressor([
        DecisionTreeRegressor(max_depth=4),
        GradientBoostingRegressor(n_estimators=1, learning_rate=0.1,
                                  max_depth=4, random_state=0, loss='ls'),
        xgb.XGBModel(max_depth=4, learning_rate=0.6, silent=True,
                     objective='reg:linear', subsample=0.7,
                     reg_alpha=0.5, reg_lambda=0.3, n_estimators=1)
    ], loss="square", learning_rate=0.01, n_estimators=4)
    adaboost.fit(test_X, test_Y)
    return adaboost
# -*- coding: utf-8 -*-
# @Time        : 2019/3/15 15:36
# @Author      : lilong
# @File        : xgboost_intro.py
# @Description: brief introduction to xgboost
# For multi-class problems, set objective to 'multi:softmax'.
import numpy as np
import xgboost as xgb

xgb.XGBModel()

if __name__ == '__main__':
    data_train = xgb.DMatrix(r'E:\pyProject\python_trick\xgboost_practice\data\14.agaricus_train.txt')
    data_test = xgb.DMatrix(r'E:\pyProject\python_trick\xgboost_practice\data\14.agaricus_test.txt')
    print(data_train)
    print(data_test)

    # set parameters
    param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}  # logitraw
    # xgb.XGBClassifier()
    watchlist = [(data_test, 'eval'), (data_train, 'train')]
    bst = xgb.train(param, data_train, num_boost_round=3, evals=watchlist)
    # bst = xgb.train(param, data_train, num_boost_round=n_round, evals=watchlist, obj=log_reg, feval=error_rate)
    y_pred = bst.predict(data_test)
    y = data_test.get_label()
    error = sum(y != (y_pred > 0.5))
    err_rate = float(error) / len(y_pred)
    print('number of misclassified samples: %d' % error)
    print('error rate: %.5f%%' % (err_rate * 100))
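# Illustrative sketch (added, not part of the original script): the comment
# above notes that multi-class problems use objective 'multi:softmax'. A
# minimal example using scikit-learn's iris data (an assumption; the original
# script uses the agaricus files) could look like this:
import numpy as np
import xgboost as xgb
from sklearn.datasets import load_iris

iris = load_iris()
dtrain = xgb.DMatrix(iris.data, label=iris.target)
param = {'max_depth': 3, 'eta': 0.3, 'objective': 'multi:softmax', 'num_class': 3}
bst = xgb.train(param, dtrain, num_boost_round=10)
pred = bst.predict(dtrain)  # for 'multi:softmax', predict() returns class indices 0, 1, 2
print('train error: %.4f' % float(np.mean(pred != iris.target)))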
def main(args, path: str, prefix: str, model_path: str, date_str: str):
    input_col = "{0}/{1}_columns.npy".format(path, prefix)
    input_x_train = "{0}/{1}_x_train.npy".format(path, prefix)
    input_y_train = "{0}/{1}_y_train.npy".format(path, prefix)
    input_x_dev = "{0}/{1}_x_dev.npy".format(path, prefix)
    input_y_dev = "{0}/{1}_y_dev.npy".format(path, prefix)

    model_file = "{0}/{1}_({2}).model".format(model_path, date_str, prefix)
    config_file = "{0}/{1}_({2}).config".format(model_path, date_str, prefix)
    importances_file = "{0}/{1}_({2}).xlsx".format(model_path, date_str, prefix)

    if args.configuration is None:
        config = {
            "max_depth": 3,
            "n_estimators": 400,
            "min_child_weight": 1,
            "tree_method": "gpu_hist",
            "learning_rate": 0.07,
        }
    else:
        with open(file=args.configuration, mode='r') as json_file:
            config = json.load(json_file)

    print("Start training...")
    columns = np.load(input_col)
    X_train = np.load(input_x_train)
    Y_train = np.load(input_y_train)

    if args.split_train:
        X_train, X_dev, Y_train, Y_dev = train_test_split(
            X_train,
            Y_train,
            test_size=args.test_size,
            stratify=Y_train,
        )
    else:
        X_dev = np.load(input_x_dev)
        Y_dev = np.load(input_y_dev)

    # Imbalance ratio.
    # Allows to compensate the imbalance between the classes.
    imbalance_ratio = len(Y_train) / np.sum(Y_train)
    print("Imbalance:", imbalance_ratio)
    config.update({"scale_pos_weight": imbalance_ratio})

    # Train the classifier.
    # "rmse" for root mean squared error.
    # "mae" for mean absolute error.
    # "logloss" for binary logarithmic loss
    # and "mlogloss" for multi-class log loss (cross entropy).
    # "error" for classification error.
    # "auc" for area under ROC curve.
    with open(file=config_file, mode='w+') as cfg_file:
        json.dump(config, cfg_file)

    clf = xgb.XGBModel(**config)
    clf.fit(
        X_train,
        Y_train,
        eval_set=[(X_dev, Y_dev)],
        eval_metric=["auc"],
        early_stopping_rounds=400,
        verbose=True,
    )

    bst = clf.get_booster()
    bst.save_model(model_file)

    imp = clf.feature_importances_
    for feature_name, importance in zip(columns, imp):
        print(
            "Feature name: {0}, importance: {1}".format(
                feature_name,
                importance,
            ),
        )

    # Output the features importance in an Excel file
    df_importances = pd.DataFrame(
        np.hstack((
            np.array(
                sorted(
                    zip(columns, imp, imp / np.max(imp)),
                    key=lambda line: line[1],
                    reverse=True,
                ),
            ),
            np.array([np.cumsum(sorted(imp, reverse=True))]).transpose(),
        )),
        columns=[
            'Feature name',
            'Importance',
            'Normalised importance',
            'Cumulated importance',
        ],
    )
    df_importances['Importance'] = pd.to_numeric(df_importances['Importance'])
    df_importances['Normalised importance'] = pd.to_numeric(
        df_importances['Normalised importance'],
    )
    df_importances['Cumulated importance'] = pd.to_numeric(
        df_importances['Cumulated importance'],
    )

    with pd.ExcelWriter(  # pylint: disable=abstract-class-instantiated
        importances_file,
        date_format='YYYY-MM-DD',
        datetime_format='YYYY-MM-DD HH:MM:SS',
        engine='xlsxwriter',
    ) as writer:
        df_importances.to_excel(
            writer,
            sheet_name='Features importance',
            index=False,
        )
        # https://stackoverflow.com/a/40535454
        worksheet = writer.sheets['Features importance']
        for idx, col in enumerate(df_importances):  # loop through all columns
            series = df_importances[col]
            max_len = max((
                series.astype(str).map(len).max(),  # len of largest item
                len(str(series.name)),  # len of column name/header
            )) + 2  # adding a little extra space
            worksheet.set_column(idx, idx, max_len)  # set column width
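# Side note (added, not from the original script): the XGBoost parameter docs
# suggest scale_pos_weight = count(negative) / count(positive), whereas main()
# above computes len(Y_train) / sum(Y_train), i.e. total / positives, which for
# binary 0/1 labels equals that ratio plus one. A tiny check with toy labels:
import numpy as np

y = np.array([0, 0, 0, 0, 1])                  # 4 negatives, 1 positive
ratio_docs = np.sum(y == 0) / np.sum(y == 1)   # 4.0 -- documented heuristic
ratio_script = len(y) / np.sum(y)              # 5.0 -- what main() computes
print(ratio_docs, ratio_script)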
# @Time        : 2019/3/18 18:44
# @Author      : lilong
# @File        : xgboost_Model.py
# @Description:
import xgboost as xgb
from sklearn.datasets import load_iris

if __name__ == '__main__':
    data_train = xgb.DMatrix(r'E:\pyProject\python_trick\xgboost_practice\data\14.agaricus_train.txt')
    data_test = xgb.DMatrix(r'E:\pyProject\python_trick\xgboost_practice\data\14.agaricus_test.txt')
    iris_data = load_iris()
    X = iris_data.data
    y = iris_data.target

    # set parameters
    param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}  # logitraw
    bst = xgb.XGBModel(objective="reg:linear",
                       booster='gbtree',
                       max_depth=3,
                       learning_rate=1,
                       n_estimators=4)
    bst.fit(X, y)
    y_pred = bst.predict(X)
    # compare against the iris labels the model was fitted on
    # (the original reassigned y from data_test, whose length does not match y_pred)
    error = sum(y != (y_pred > 0.5))
    err_rate = float(error) / len(y_pred)
    print('number of misclassified samples: %d' % error)
    print('error rate: %.5f%%' % (err_rate * 100))
import pandas as pd
import numpy as np
from math import exp
from sklearn.preprocessing import normalize
import xgboost as xgb

norms = pd.read_excel('data.xlsx', sheet_name='normatives', index_col='Номер')

clf = xgb.XGBModel()
clf.load_model('xgb_class.json')


def to_float(x):
    try:
        x = x.replace(',', '.')
    except AttributeError:  # already numeric, nothing to replace
        pass
    return float(x)


for item in ['Зона опасности', 'Зона риска', 'Зона стабильности']:
    norms[item] = norms[item].apply(to_float)


def belong_f(x, param=1):
    """Return a tuple of membership-function values for each of the states."""
    # values separating the intervals
    splitters = [
        norms.at[param, 'Зона риска'],
        norms.at[param, 'Зона опасности'],
        norms.at[param, 'Зона стабильности']
dtest = xgb.DMatrix(X_test, label=y_test)

# param = [('max_depth', 2), ('objective', 'binary:logistic'), ('eval_metric', 'logloss'),
#          ('eval_metric', 'error'), ('n_estimators', 2)]
param = {'max_depth': 2, 'objective': 'binary:logistic',
         'eval_metric': ['logloss', 'error'], 'n_estimators': 2}

num_round = 2
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
evals_result = {}
bst = xgb.train(param, dtrain, num_round, watchlist, evals_result=evals_result)

print('Access logloss metric directly from evals_result:')
print(evals_result['eval']['logloss'])
print('')
print('Access complete dictionary:')
print(evals_result)

# the same evaluation history through the scikit-learn wrapper
param_dist = {'objective': 'binary:logistic', 'n_estimators': 2}
clf = xgb.XGBModel(**param_dist)
clf.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        eval_metric=['logloss', 'error'],
        verbose=True)
evals_result = clf.evals_result()
print(evals_result)
import os
import sys

import joblib
import xgboost as xgb
import pandas as pd
from flask import Flask
from flask import request
from flask.logging import default_handler

app = Flask(__name__)

preprocessor = joblib.load(os.getenv('PREPROCESSOR_PATH'))
clf = xgb.XGBModel(**{'objective': 'binary:logistic', 'n_estimators': 10})
clf.load_model(os.getenv('MODEL_PATH'))


@app.route('/predict', methods=['POST'])
def predict():
    requestJSON = request.get_json(force=True, cache=False)
    features = pd.DataFrame({
        'PassengerId': pd.Series([], dtype='int64'),
        'Survived': pd.Series([], dtype='int64'),
        'Pclass': pd.Series([], dtype='int64'),
        'Name': pd.Series([], dtype='str'),
        'Sex': pd.Series([], dtype='str'),
        'Age': pd.Series([], dtype='float64'),
        'SibSp': pd.Series([], dtype='int64'),
        'Parch': pd.Series([], dtype='int64'),
        'Ticket': pd.Series([], dtype='str'),
def run(k, j, filename, seednum=20, threshold=0.5, resultdir=None, graphdir=f'{treedir}/'):
    # classes = ["P1a1", "P1a2", "P2b", "P2c"]
    classes = ["P1a1", "P1a2", "P2b", "P2c", "H1"]
    # H1 H2 O (1) P1a1 (4) P1a2 (6) P2b P2c S1a (0) S1c S2 S3
    joind = gp.read_file(filename, layer=layers[j])
    print(f'\n------\n------{layers[j]}----\n-----\n')
    joind['area'] = joind['geometry'].area  # calculate the area of each object
    df1 = pd.DataFrame(joind.drop(columns='geometry'))
    df1 = df1.replace([np.inf, -np.inf], np.nan).dropna()
    Pcl = df1.loc[df1['geocode_2'].isin(classes)]  # filter only classes of interest
    print(Pcl['geocode_2'].value_counts())

    # regroup, geocode_2 from here on becomes binary!
    Pcl['geocode_2'] = np.where(Pcl['geocode_2'].str.contains(classes[k]), classes[k], 'Others')
    print(Pcl['geocode_2'].value_counts())
    minc = min(Pcl['geocode_2'].value_counts())

    # skip if less than 20 objects
    if minc < 20:
        print("minimum class less than 20")
        return (-1, -1)  # -1, -1 if not calculated
    else:
        print(f'total {len(df1)}, P_H1_classes: {len(Pcl)}, minimum class: {minc}')

        # bootstrap and get averaged accuracy
        avepre = np.zeros(1)  # store all the xgb+tree precisions in each CV
        averec = np.zeros(1)
        for seeds in range(seednum):
            np.random.seed(seeds)
            # 1. categorise the variable "area"; the variable "area" is kept in
            #    the data frame, strictly it can be removed.
            # 2. use groupby to sample the same amount for each area category;
            #    use 70% of each area category for training, get the index
            print(Pcl['area'].quantile([0, .25, .5, .75, 1]))
            Pcl['area_c'] = pd.cut(Pcl['area'],
                                   bins=Pcl['area'].quantile([0, .25, .5, .75, 1]).tolist(),
                                   labels=["q25", "q5", "q75", "Max"])
            print(Pcl["area_c"].value_counts())
            train_ind = Pcl.groupby('area_c').sample(
                n=int(min(Pcl["area_c"].value_counts()) * 0.7)).index
            test_ind = Pcl[~Pcl.index.isin(train_ind)].index
            Pcl.loc[train_ind, "geocode_2"].value_counts()

            X_train0 = Pcl.loc[train_ind].drop(columns=["geocode_2", "layer", "OBJECTID", "path", "area_c"])
            X_test0 = Pcl.loc[test_ind].drop(columns=["geocode_2", "layer", "OBJECTID", "path", "area_c"])
            Y_train0 = Pcl.filter(regex='geocode_2').loc[train_ind]
            Y_test0 = Pcl.filter(regex='geocode_2').loc[test_ind]
            print("after sampling by area: for 2 classes,", X_train0.shape[0], X_test0.shape[0])
            print(Pcl.loc[train_ind]["geocode_2"].value_counts())

            # if my pandas is lower and i can't use the above function,
            # grouped = Pcl.drop(columns=["geocode_2","layer","OBJECTID","path",'area']).groupby('area_c')
            # def fun1(x):
            #     y = x.drop(columns=["area_c"])
            #     return y.sample(n=int(minc/5*0.7)).index
            # train_ind = grouped.apply(fun1)
            # test_ind = Pcl[~Pcl.index.isin(train_ind)].index
            # need to ungroup train_ind
            # test data
            # grouped2 = Pcl[['geocode_2',"area_c"]].groupby('area_c')
            # y = grouped2.apply(fun1)

            # after getting the training x and y, undersample so that the two
            # classes (e.g. p1a1 and Others) are balanced
            rus = RandomUnderSampler(random_state=1)
            X_train, Y_train = rus.fit_resample(X_train0, Y_train0)
            X_test, Y_test = X_test0, Y_test0  # the test split is used as-is
            print("number of samples used for training:", X_train.shape[0] / 2)
            # y2 = y2.reshape(-1, 1)
            # y2_rus, y_rus = rus.fit_resample(y2, y)
            # y2_rus = y2_rus.flatten()
            # len(train)+len(test)

            # relabel
            label_all = [classes[k], "Others"]
            # classtype = [(j, "float32") for j in classes]
            # Pcl.geocode_2.unique()
            i = 0
            idx2class = {}
            class2idx = {}
            for tp in label_all:
                idx2class[i] = tp
                class2idx[tp] = i
                i += 1
            Y_trainnum = cl2idx(Y_train.values, class2idx).astype(int)
            Y_testnum = cl2idx(Y_test.values, class2idx).astype(int)
            np.unique(Y_trainnum)

            params = {'max_depth': 6, 'eta': 0.002,
                      'objective': 'binary:logistic', 'num_class': 1}
            clf = xgb.XGBModel(**params)
            clf.fit(X_train.values, Y_trainnum,
                    eval_set=[(X_train.values, Y_trainnum), (X_test.values, Y_testnum)],
                    eval_metric='logloss',
                    verbose=True)
            # for testing
            # clf = DecisionTreeClassifier(min_samples_split=30, max_depth=4, min_samples_leaf=20, random_state=1)

            yhat = clf.predict(X_test)
            # threshold 0.5, probability higher than 0.5 -> positive.
            yhat_labels = yhat > threshold
            yhat_labels = yhat_labels.astype(int)

            # area-weighted confusion-matrix entries
            TP = ((Y_testnum == 1) & (yhat_labels == 1)).astype(float) * X_test["area"]
            FP = ((Y_testnum == 0) & (yhat_labels == 1)).astype(float) * X_test["area"]
            TN = ((Y_testnum == 0) & (yhat_labels == 0)).astype(float) * X_test["area"]
            FN = ((Y_testnum == 1) & (yhat_labels == 0)).astype(float) * X_test["area"]
            precision = np.sum(TP) / np.sum(TP + FP)
            recall = np.sum(TP) / np.sum(TP + FN)  # recall = TP / (TP + FN)
            averec = np.append(averec, recall)  # store all of them
            avepre = np.append(avepre, precision)

        recall = averec.sum() / seednum  # get the mean; the leading 0 does not contribute
        precision = avepre.sum() / seednum
        print(averec, recall)

        if resultdir is not None:
            Y_testnum = Y_testnum.astype(int)
            plt.rcParams.update({'font.size': 8})
            ax = xgb.plot_importance(clf, grid=False, importance_type='gain',
                                     title='Feature importance')
            ax.set_title(f'xgboost importance {layers[j]} {classes[k]}')
            fname = f"{resultdir}/P_{layers[j]}_{classes[k]}_imp"
            plt.savefig(fname, dpi=1200)

        return (recall, precision)
# Boosted tree
print("XGB Tree")
import xgboost as xgb

param = {
    'n_estimators': 10000,
    'learning_rate': 0.1,
    'objective': 'reg:squarederror',
    'verbosity': 0
}
fit_param = {
    'eval_set': [(X_train, y_train), (X_test, y_test)],
    'early_stopping_rounds': 200,
    'verbose': False
}
BT = Regressor(xgb.XGBModel(**param))
BT.run(X_train, y_train, X_test, y_test, **fit_param)
X_train_new, X_test_new = BT.select(X_train, X_test)

fit_param = {
    'eval_set': [(X_train_new, y_train), (X_test_new, y_test)],
    'early_stopping_rounds': 100,
    'verbose': False
}
# BT.run(X_train_new, y_train, X_test_new, y_test, **fit_param)
xgb.plot_importance(BT.reg)
plt.show()

# RF
print("Random Forest")
RF = Regressor(ensemble.RandomForestRegressor(random_state=42))
RF.run(X_train, y_train, X_test, y_test)