Example #1
def load_pkl(name):
    """Load xgboost model from pickle and perform conversion from version
    0.90 if necessary.

    :return:
        XGBoost model
    """
    import pickle
    import xgboost
    with open(name, 'rb') as f:
        try:
            model = pickle.load(f)
            return model
        except xgboost.core.XGBoostError as e:
            if "Check failed: header == serialisation_header_" in str(e):
                import xgboost_prev  # pylint: disable=unused-import
                import tempfile

                class Unpickler(pickle.Unpickler):
                    def find_class(self, module, name):
                        if module.startswith("xgboost"):
                            return pickle.Unpickler.find_class(
                                self, module.replace("xgboost",
                                                     "xgboost_prev"), name)
                        return pickle.Unpickler.find_class(self, module, name)

                f.seek(0)
                model = Unpickler(f).load()
                temp_file = tempfile.NamedTemporaryFile(
                    prefix='xgboost_migration', suffix='.model')
                model.save_model(temp_file.name)
                migrated_model = xgboost.XGBModel()
                migrated_model.load_model(temp_file.name)
                return migrated_model
            raise
def train_xgb_model(subtrain_x, subtrain_y, validation_x, validation_y,
                    best_model_fname):
    print(subtrain_x.shape)
    params = {
        'min_child_weight': 1,
        'learning_rate': 0.03,
        'colsample_bytree': 0.9,
        'subsample': 0.9,
        'gamma': 1,
        'silent': 0,
        'seed': 1234,
        # 'booster': 'gblinear',
        # 'booster': 'gbtree',
        'max_depth': 9,
        'objective': 'reg:linear',
        'nthread': 10,
        'n_estimators': 2000,
    }

    # xgsubtrain = xgb.DMatrix(subtrain_x, label=subtrain_y, )
    # xgval = xgb.DMatrix(validation_x, label=validation_y)
    # rgs = xgb.train(params, xgsubtrain, early_stopping_rounds=10, eval=(xgval, 'eval'))

    rgs = xgb.XGBModel(**params)
    rgs.fit(
        subtrain_x,
        subtrain_y,
        eval_set=[(subtrain_x, subtrain_y), (validation_x, validation_y)],
        eval_metric='mae',
        early_stopping_rounds=30,
        verbose=True,
    )
    return rgs, mean_absolute_error(validation_y, rgs.predict(validation_x))
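A short, hypothetical usage sketch for the helpers above (the file names and data are assumptions, and xgb / mean_absolute_error are assumed to be imported as in the source module):

import numpy as np

# Synthetic data standing in for the real pre-split training/validation arrays.
X = np.random.rand(200, 5)
y = X.sum(axis=1)
rgs, val_mae = train_xgb_model(X[:150], y[:150], X[150:], y[150:], 'best_xgb.model')
print('validation MAE:', val_mae)
# model = load_pkl('legacy_xgb_0.90.pkl')  # hypothetical path; migrates a 0.90-era pickle if needed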
Example #3
 def __init__(self):
     self.preprocessor = joblib.load(os.getenv('PREPROCESSOR_PATH'))
     self.clf = xgb.XGBModel(**{
         'objective': 'binary:logistic',
         'n_estimators': 10
     })
     self.clf.load_model(os.getenv('MODEL_PATH'))
Example #4
def test_save_load_model():
    tm._skip_if_no_sklearn()
    from sklearn.datasets import load_digits
    try:
        from sklearn.model_selection import KFold
    except:
        from sklearn.cross_validation import KFold

    digits = load_digits(2)
    y = digits['target']
    X = digits['data']
    try:
        kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
    except TypeError:  # sklearn.model_selection.KFold uses n_split
        kf = KFold(n_splits=2, shuffle=True,
                   random_state=rng).split(np.arange(y.shape[0]))
    with TemporaryDirectory() as tempdir:
        model_path = os.path.join(tempdir, 'digits.model')
        for train_index, test_index in kf:
            xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
            xgb_model.save_model(model_path)
            xgb_model = xgb.XGBModel()
            xgb_model.load_model(model_path)
            preds = xgb_model.predict(X[test_index])
            labels = y[test_index]
            err = sum(1 for i in range(len(preds))
                      if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
            assert err < 0.1
Example #5
 def __init__(self, **kwargs):
     self.n_estimators = kwargs['xgb_n_estimators']
     self.objective = kwargs['xgb_objective']
     self.eval_metric = kwargs['xgb_eval_metric']
     self.verbose = kwargs['xgb_verbose']
     self.model = xgb.XGBModel(n_estimators=self.n_estimators,
                               objective=self.objective)
Example #6
def save_load_model(model_path):
    from sklearn.datasets import load_digits
    from sklearn.model_selection import KFold

    digits = load_digits(2)
    y = digits['target']
    X = digits['data']
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf.split(X, y):
        xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
        xgb_model.save_model(model_path)
        xgb_model = xgb.XGBClassifier()
        xgb_model.load_model(model_path)
        assert isinstance(xgb_model.classes_, np.ndarray)
        assert isinstance(xgb_model._Booster, xgb.Booster)
        assert isinstance(xgb_model._le, XGBoostLabelEncoder)
        assert isinstance(xgb_model._le.classes_, np.ndarray)
        preds = xgb_model.predict(X[test_index])
        labels = y[test_index]
        err = sum(1 for i in range(len(preds))
                  if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
        assert err < 0.1
        assert xgb_model.get_booster().attr('scikit_learn') is None

        # test native booster
        preds = xgb_model.predict(X[test_index], output_margin=True)
        booster = xgb.Booster(model_file=model_path)
        predt_1 = booster.predict(xgb.DMatrix(X[test_index]),
                                  output_margin=True)
        assert np.allclose(preds, predt_1)

        with pytest.raises(TypeError):
            xgb_model = xgb.XGBModel()
            xgb_model.load_model(model_path)
Example #7
def search_cv(x_train, y_train, x_test):
    xgb_model = xgb.XGBModel()
    params = {
        'booster': ['gblinear'],
        'silent': [1],
        'learning_rate': [x for x in np.round(np.linspace(0.01, 1, 20), 2)],
        'reg_lambda': [lambd for lambd in np.logspace(0, 3, 50)],
        'objective': ['reg:linear']
    }
    print('begin')
    clf = GridSearchCV(xgb_model,
                       params,
                       scoring='neg_mean_squared_error',
                       refit=True)

    clf.fit(x_train, y_train)

    preds = clf.predict(x_test)
    sub_df = pd.read_csv('raw_data/answer_sample_b_20180117.csv', header=None)
    sub_df['Value'] = preds
    sub_df.to_csv('result/xgboost4.csv', header=None, index=False)
    best_parameters, score, _ = max(clf.grid_scores_, key=lambda x: x[1])
    print('Raw RMSE:', score)
    for param_name in sorted(best_parameters.keys()):
        print("%s: %r" % (param_name, best_parameters[param_name]))
Example #8
 def fit(self):
     #print(self.args, self.X.shape, self.y.shape)
     appFeatures = ["%s_%d_%d" % (x, i, t) for t in [1, 0] for x in libdata.apprates for i in [0,1]]
     phyFeatures = ["%s_0" % (x) for x in libdata.targets]
     feature_names = np.append(appFeatures, phyFeatures)
     X = self.X
     y = self.y
     #print(X)
     if self.m is not None and self.m < len(self.X):
         # Sample self.m row indices without replacement.
         sels = np.random.choice(X.shape[0], self.m, replace=False)
         X = X[sels]
         y = y[sels]
     self.nys = y.shape[1]
     print(len(X))
     for i in range(self.nys):
         self.models.append(xgb.XGBModel(**self.args))
         self.models[i].fit(X, y[:,i])
         if False:
             #plot_importance(self.models[i])      
             feature_importance = self.models[i].feature_importances_
             feature_importance = 100.0 * (feature_importance / feature_importance.max())
             sorted_idx = np.argsort(feature_importance)
             sorted_idx = sorted_idx[-10:]
             pos = np.arange(sorted_idx.shape[0]) + .5
             plt.figure()
             plt.barh(pos, feature_importance[sorted_idx], align='center')
             print(sorted_idx)
             plt.yticks(pos, feature_names[sorted_idx])
             plt.xlabel('Relative Importance')
             plt.title('Feature Importance for Fan Power Prediction')
             plt.savefig("power_10r.eps", bbox_inches='tight')
             print(feature_importance)
Example #9
def test_save_load_model():
    with TemporaryDirectory() as tempdir:
        model_path = os.path.join(tempdir, 'digits.model')
        save_load_model(model_path)

    with TemporaryDirectory() as tempdir:
        model_path = os.path.join(tempdir, 'digits.model.json')
        save_load_model(model_path)

    from sklearn.datasets import load_digits
    with TemporaryDirectory() as tempdir:
        model_path = os.path.join(tempdir, 'digits.model.json')
        digits = load_digits(2)
        y = digits['target']
        X = digits['data']
        booster = xgb.train(
            {
                'tree_method': 'hist',
                'objective': 'binary:logistic'
            },
            dtrain=xgb.DMatrix(X, y),
            num_boost_round=4)
        predt_0 = booster.predict(xgb.DMatrix(X))
        booster.save_model(model_path)
        cls = xgb.XGBClassifier()
        cls.load_model(model_path)
        predt_1 = cls.predict(X)
        assert np.allclose(predt_0, predt_1)

        cls = xgb.XGBModel()
        cls.load_model(model_path)
        predt_1 = cls.predict(X)
        assert np.allclose(predt_0, predt_1)
Example #10
 def XGBoost(self):
     self.Encoding()
     X_train,X_test,y_train,y_test = train_test_split(Train.X,Train.y,test_size=0.25,random_state=4)
     clf = xgb.XGBModel(max_depth=8,n_estimators=100,objective="reg:linear", random_state=17,n_jobs=-1)
     clf.fit(X_train, y_train, eval_metric='rmse', verbose = True, eval_set = [(X_train,y_train),(X_test, y_test)])
     clf.save_model('./model/XGBoost.model')
     pickle.dump(clf, open("XGBosst.pickle.dat", "wb"))
Example #11
def predict_cy_young(data_path, model_name, labels_to_drop=None):
    bst = xgb.XGBModel()
    bst.load_model(model_name)

    data = pd.read_csv(data_path)
    to_predict = data.drop(labels=labels_to_drop, axis=1)
    ypreds = bst.predict(to_predict)

    return ypreds
Example #12
 def __init__(self,
              task: Task,
              scorer: Scorer,
              opt_logger: OptimizationLogger = VoidLogger(None)):
     if task.task == "classification":
         space = XGBoostOptimizer.Params.classification_space
     else:
         space = XGBoostOptimizer.Params.general_space
     super().__init__(xgb.XGBModel(), task, space, scorer, opt_logger)
Example #13
def train_xgboost(data, avg={}):
    test_X, test_Y = load_data_no_cut(data, avg)
    bst = xgb.XGBModel(max_depth=6,
                       learning_rate=0.1,
                       silent=True,
                       objective='reg:linear',
                       subsample=0.7,
                       reg_alpha=0.5,
                       reg_lambda=0.3,
                       n_estimators=80)
    # bst.set_params(**param)
    bst.fit(test_X, test_Y)

    return bst
Example #14
def xgboosting(X_train, y_train, n_estimators, params):
    print("> Model type : XGBoost")
    model = xgb.XGBModel(objective='reg:squarederror',
                         max_depth=11,
                         subsample=0.5,
                         colsample_bytree=0.5,
                         learning_rate=0.1,
                         n_estimators=n_estimators,
                         verbosity=0,
                         seed=42)
    power_lines = y_train.columns
    trained_models = {}
    for pl in power_lines:
        trained_model = model.fit(X_train, y_train[pl])
        trained_models[pl] = trained_model
    save_model(trained_models, "xgboost", False, params)
Example #15
def train_xgboost(train_x, train_y, test_x, test_y, data_c):

    param = {"booster":"gbtree", "max_depth": 2, "eta": 0.3, "objective": "binary:logistic", "nthread":2}
    num_round = 100
    train_mat = xgb.DMatrix(train_x, train_y)
    test_mat = xgb.DMatrix(test_x, label=test_y)
    all_mat = xgb.DMatrix(data_c.drop(columns=["Prod1"]), label=data_c[["Prod1"]])

    evaluation = [(test_mat, "eval"), (train_mat, "train")]

    bst = xgb.train(param, train_mat, num_round, evaluation)

    clf3 = xgb.XGBModel(**param)
    clf3.fit(train_x, train_y, eval_set=[(train_x, train_y), (test_x, test_y)], eval_metric='logloss')
    print(roc_auc_score(test_y, bst.predict(test_mat)))
    print(roc_auc_score(test_y, clf3.predict(test_x)))
Example #16
    def test_save_load_model(self):
        self._init_ray()

        with TemporaryDirectory() as tempdir:
            model_path = os.path.join(tempdir, "digits.model")
            self.save_load_model(model_path)

        with TemporaryDirectory() as tempdir:
            model_path = os.path.join(tempdir, "digits.model.json")
            self.save_load_model(model_path)

        from sklearn.datasets import load_digits

        with TemporaryDirectory() as tempdir:
            model_path = os.path.join(tempdir, "digits.model.json")
            digits = load_digits(n_class=2)
            y = digits["target"]
            X = digits["data"]
            booster = xgb.train(
                {
                    "tree_method": "hist",
                    "objective": "binary:logistic"
                },
                dtrain=xgb.DMatrix(X, y),
                num_boost_round=4,
            )
            predt_0 = booster.predict(xgb.DMatrix(X))
            booster.save_model(model_path)
            cls = RayXGBClassifier()
            cls.load_model(model_path)

            proba = cls.predict_proba(X)
            assert proba.shape[0] == X.shape[0]
            assert proba.shape[1] == 2  # binary

            predt_1 = cls.predict_proba(X)[:, 1]
            assert np.allclose(predt_0, predt_1)

            cls = xgb.XGBModel()
            cls.load_model(model_path)
            predt_1 = cls.predict(X)
            assert np.allclose(predt_0, predt_1)
Example #17
def test_save_load_model():
    from sklearn.datasets import load_digits
    from sklearn.model_selection import KFold

    digits = load_digits(2)
    y = digits['target']
    X = digits['data']
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    with TemporaryDirectory() as tempdir:
        model_path = os.path.join(tempdir, 'digits.model')
        for train_index, test_index in kf.split(X, y):
            xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
            xgb_model.save_model(model_path)
            xgb_model = xgb.XGBModel()
            xgb_model.load_model(model_path)
            preds = xgb_model.predict(X[test_index])
            labels = y[test_index]
            err = sum(1 for i in range(len(preds))
                      if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
            assert err < 0.1
Example #18
def best_xgb(X_train, y_train, n_estimators, params):
    models = {}
    importance_tab = load("importance_per_w")
    power_lines = y_train.columns[y_train.columns.str.match("NPWD")]
    model = xgb.XGBModel(objective='reg:squarederror',
                         max_depth=11,
                         subsample=0.5,
                         colsample_bytree=0.5,
                         learning_rate=0.1,
                         n_estimators=n_estimators,
                         verbosity=0,
                         seed=42)

    trained_models = {}
    for pl in power_lines:
        print(">> Fitting", pl)
        features = importance_tab[pl]
        model.fit(X_train[features[:40]], y_train[pl])
        trained_models[pl] = model
    save_model(trained_models, "xgboost", False, params)
Example #19
    def search(self, x_train, y_train, x_test):
        xgb_model = xgb.XGBModel()
        params = {
            'booster': ['gblinear'],
            'silent': [1],
            'learning_rate':
            [x for x in np.round(np.linspace(0.01, 1, 20), 2)],
            'reg_lambda': [lambd for lambd in np.logspace(0, 3, 50)],
            'objective': ['reg:linear']
        }
        print('begin')
        clf = GridSearchCV(xgb_model,
                           params,
                           scoring='neg_mean_squared_error',
                           refit=True)

        clf.fit(x_train, y_train)

        preds = clf.predict(x_test)
        return preds
Example #20
def get_estimator(estimator):
    if estimator == 'booster':
        e = xgb.XGBModel(objective='reg:squarederror',
                         max_depth=11,
                         subsample=0.5,
                         colsample_bytree=0.5,
                         learning_rate=0.1,
                         n_estimators=500,
                         silent=1,
                         seed=42)

    elif estimator == 'xtrees':
        e = ExtraTreesRegressor(n_estimators=500,
                                random_state=0,
                                min_samples_leaf=20,
                                n_jobs=-1)
    elif estimator == 'rf':
        e = RandomForestRegressor(n_estimators=500,
                                  random_state=1,
                                  min_samples_leaf=10,
                                  n_jobs=-1)
    return e
Example #21
 def train_with_xgboost(self, x_train, y_train, x_test, y_test):
     xgb_model = xgb.XGBModel()
     params = {
         'booster': ['gblinear'],
         'silent': [1],
         'learning_rate':
         [x for x in np.round(np.linspace(0.01, 1, 20), 2)],
         'reg_lambda': [lambd for lambd in np.logspace(0, 3, 50)],
         'objective': ['reg:linear']
     }
     print('begin')
     clf = GridSearchCV(xgb_model,
                        params,
                        scoring='neg_mean_squared_error',
                        refit=True)
     clf.fit(x_train, y_train)
     preds = clf.predict(x_test)
     print('test mse:', self.cal_MSE(preds, y_test))
     best_parameters, score, _ = max(clf.grid_scores_, key=lambda x: x[1])
     print('Raw RMSE:', score)
     for param_name in sorted(best_parameters.keys()):
         print("%s: %r" % (param_name, best_parameters[param_name]))
Example #22
def test_multi_adboost_cart(data):
    test_X, test_Y = load_data(data)
    adaboost = MultiAdaBoostRegressor([
        DecisionTreeRegressor(max_depth=4),
        GradientBoostingRegressor(n_estimators=1,
                                  learning_rate=0.1,
                                  max_depth=4,
                                  random_state=0,
                                  loss='ls'),
        xgb.XGBModel(max_depth=4,
                     learning_rate=0.6,
                     silent=True,
                     objective='reg:linear',
                     subsample=0.7,
                     reg_alpha=0.5,
                     reg_lambda=0.3,
                     n_estimators=1)
    ],
                                      loss="square",
                                      learning_rate=0.01,
                                      n_estimators=4)
    adaboost.fit(test_X, test_Y)
    return adaboost
Example #23
# -*- coding: utf-8 -*-
# @Time    : 2019/3/15 15:36
# @Author  : lilong
# @File    : xgboost_intro.py
# @Description: Introduction to xgboost
# For multi-class problems, set objective to 'multi:softmax'
import numpy as np
import xgboost as xgb
xgb.XGBModel()

if __name__ == '__main__':
    data_train = xgb.DMatrix(r'E:\pyProject\python_trick\xgboost_practice\data\14.agaricus_train.txt')
    data_test = xgb.DMatrix(r'E:\pyProject\python_trick\xgboost_practice\data\14.agaricus_test.txt')
    print(data_train)
    print(data_test)
    # Set parameters
    param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}  # logitraw
    # xgb.XGBClassifier()
    watchlist = [(data_test, 'eval'), (data_train, 'train')]
    bst = xgb.train(param, data_train, num_boost_round=3, evals=watchlist)
    # bst = xgb.train(param, data_train, num_boost_round=n_round, evals=watchlist, obj=log_reg, feval=error_rate)
    y_pred = bst.predict(data_test)
    y = data_test.get_label()

    error = sum(y != (y_pred > 0.5))
    err_rate = float(error) / len(y_pred)

    print('Number of errors: %d' % error)
    print('Error rate: %.5f%%' % (err_rate * 100))
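The header comment above notes that multi-class problems set objective to 'multi:softmax'; a minimal sketch of that configuration (the iris data and num_class value are illustrative, not part of the original script):

    # Hypothetical multi-class variant of the parameters above, using iris (3 classes).
    from sklearn.datasets import load_iris
    iris = load_iris()
    dtrain_multi = xgb.DMatrix(iris.data, label=iris.target)
    multi_param = {'max_depth': 2, 'eta': 1, 'objective': 'multi:softmax', 'num_class': 3}
    multi_bst = xgb.train(multi_param, dtrain_multi, num_boost_round=3)
    print(multi_bst.predict(dtrain_multi)[:5])  # predictions are class indices 0, 1 or 2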
Example #24
def main(args, path: str, prefix: str, model_path: str, date_str: str):
    input_col = "{0}/{1}_columns.npy".format(path, prefix)
    input_x_train = "{0}/{1}_x_train.npy".format(path, prefix)
    input_y_train = "{0}/{1}_y_train.npy".format(path, prefix)
    input_x_dev = "{0}/{1}_x_dev.npy".format(path, prefix)
    input_y_dev = "{0}/{1}_y_dev.npy".format(path, prefix)
    model_file = "{0}/{1}_({2}).model".format(model_path, date_str, prefix)
    config_file = "{0}/{1}_({2}).config".format(model_path, date_str, prefix)
    importances_file = "{0}/{1}_({2}).xlsx".format(model_path, date_str,
                                                   prefix)

    if args.configuration is None:
        config = {
            "max_depth": 3,
            "n_estimators": 400,
            "min_child_weight": 1,
            "tree_method": "gpu_hist",
            "learning_rate": 0.07,
        }

    else:
        with open(file=args.configuration, mode='r') as json_file:
            config = json.load(json_file)

    print("Start training...")
    columns = np.load(input_col)
    X_train = np.load(input_x_train)
    Y_train = np.load(input_y_train)

    if args.split_train:
        X_train, X_dev, Y_train, Y_dev = train_test_split(
            X_train,
            Y_train,
            test_size=args.test_size,
            stratify=Y_train,
        )

    else:
        X_dev = np.load(input_x_dev)
        Y_dev = np.load(input_y_dev)

    # Imbalance ratio.
    # Allows to compensate the imbalance between the classes.
    imbalance_ratio = len(Y_train) / np.sum(Y_train)
    print("Imbalance:", imbalance_ratio)
    config.update({"scale_pos_weight": imbalance_ratio})
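    # Note: len(Y_train) / np.sum(Y_train) is total / positives; xgboost's usual guidance for
    # scale_pos_weight is negatives / positives, which this value exceeds by exactly 1.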

    # Train the classifier
    # "rmse" for root mean squared error.
    # "mae" for mean absolute error.
    # "logloss" for binary logarithmic loss
    # and "mlogloss" for multi-class log loss (cross entropy).
    # "error" for classification error.
    # "auc" for area under ROC curve.
    with open(file=config_file, mode='w+') as cfg_file:
        json.dump(config, cfg_file)

    clf = xgb.XGBModel(**config)

    clf.fit(
        X_train,
        Y_train,
        eval_set=[(X_dev, Y_dev)],
        eval_metric=["auc"],
        early_stopping_rounds=400,
        verbose=True,
    )

    bst = clf.get_booster()

    bst.save_model(model_file)
    imp = clf.feature_importances_

    for feature_name, importance in zip(columns, imp):
        print(
            "Feature name: {0}, importance: {1}".format(
                feature_name,
                importance,
            ), )

    # Output the features importance in an Excel file
    df_importances = pd.DataFrame(
        np.hstack((
            np.array(
                sorted(
                    zip(columns, imp, imp / np.max(imp)),
                    key=lambda line: line[1],
                    reverse=True,
                ), ),
            np.array([np.cumsum(sorted(imp, reverse=True))]).transpose(),
        ), ),
        columns=[
            'Feature name',
            'Importance',
            'Normalised importance',
            'Cumulated importance',
        ],
    )

    df_importances['Importance'] = pd.to_numeric(df_importances['Importance'])
    df_importances['Normalised importance'] = pd.to_numeric(
        df_importances['Normalised importance'], )
    df_importances['Cumulated importance'] = pd.to_numeric(
        df_importances['Cumulated importance'], )

    with pd.ExcelWriter(  # pylint: disable=abstract-class-instantiated
            importances_file,
            date_format='YYYY-MM-DD',
            datetime_format='YYYY-MM-DD HH:MM:SS',
            engine='xlsxwriter',
    ) as writer:

        df_importances.to_excel(
            writer,
            sheet_name='Features importance',
            index=False,
        )

        # https://stackoverflow.com/a/40535454
        worksheet = writer.sheets['Features importance']

        for idx, col in enumerate(df_importances):  # loop through all columns
            series = df_importances[col]
            max_len = max(
                (
                    series.astype(str).map(len).max(),  # len of largest item
                    len(str(series.name)),  # len of column name/header
                ), ) + 2  # adding a little extra space

            worksheet.set_column(idx, idx, max_len)  # set column width
Example #25
# @Time    : 2019/3/18 18:44
# @Author  : lilong
# @File    : xgboost_Model.py
# @Description:
import xgboost as xgb
from sklearn.datasets import load_iris

if __name__ == '__main__':
    data_train = xgb.DMatrix(r'E:\pyProject\python_trick\xgboost_practice\data\14.agaricus_train.txt')
    data_test = xgb.DMatrix(r'E:\pyProject\python_trick\xgboost_practice\data\14.agaricus_test.txt')
    iris_data = load_iris()
    X = iris_data.data
    y = iris_data.target
    # Set parameters
    param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}  # logitraw
    bst = xgb.XGBModel(objective="reg:linear",
                       booster='gbtree',
                       max_depth=3,
                       learning_rate=1,
                       n_estimators=4
                       )
    bst.fit(X, y)
    y_pred = bst.predict(X)
    # Keep y as the iris targets loaded above (same length as y_pred).

    error = sum(y != (y_pred > 0.5))
    err_rate = float(error) / len(y_pred)

    print('Number of errors: %d' % error)
    print('Error rate: %.5f%%' % (err_rate * 100))
Example #26
import pandas as pd
import numpy as np
from math import exp
from sklearn.preprocessing import normalize
import xgboost as xgb

norms = pd.read_excel('data.xlsx', sheet_name='normatives', index_col='Номер')
clf = xgb.XGBModel()
clf.load_model('xgb_class.json')


def to_float(x):
    try:
        x = x.replace(',', '.')
    except:
        pass
    return float(x)


for item in ['Зона опасности', 'Зона риска', 'Зона стабильности']:
    norms[item] = norms[item].apply(to_float)


def belong_f(x, param=1):
    """
    Function that returns a tuple of membership-function values for each of the states
    """
    # values separating the intervals
    splitters = [
        norms.at[param, 'Зона риска'], norms.at[param, 'Зона опасности'],
        norms.at[param, 'Зона стабильности']
Example #27
dtest = xgb.DMatrix(X_test, label=y_test)

# param = [('max_depth', 2), ('objective', 'binary:logistic'), ('eval_metric', 'logloss'), ('eval_metric', 'error'), ('n_estimators',2)]
param = {'max_depth': 2, 'objective': 'binary:logistic', 'eval_metric': ['logloss', 'error'], 'n_estimators': 2}
num_round = 2
watchlist = [(dtest,'eval'), (dtrain,'train')]

evals_result = {}
bst = xgb.train(param, dtrain, num_round, watchlist, evals_result=evals_result)

print('Access logloss metric directly from evals_result:')
print(evals_result['eval']['logloss'])


print('')
print('Access complete dictionary:')
print(evals_result)

"""
param_dist = {'objective': 'binary:logistic', 'n_estimators': 2}

clf = xgb.XGBModel(**param_dist)

clf.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        eval_metric=['logloss', 'error'],
        verbose=True)

evals_result = clf.evals_result()
print(evals_result)
Example #28
import os
import sys
import joblib
import xgboost as xgb
import pandas as pd

from flask import Flask
from flask import request
from flask.logging import default_handler

app = Flask(__name__)

preprocessor = joblib.load(os.getenv('PREPROCESSOR_PATH'))
clf = xgb.XGBModel(**{'objective': 'binary:logistic', 'n_estimators': 10})
clf.load_model(os.getenv('MODEL_PATH'))


@app.route('/predict', methods=['POST'])
def predict():
    requestJSON = request.get_json(force=True, cache=False)

    features = pd.DataFrame({
        'PassengerId': pd.Series([], dtype='int64'),
        'Survived': pd.Series([], dtype='int64'),
        'Pclass': pd.Series([], dtype='int64'),
        'Name': pd.Series([], dtype='str'),
        'Sex': pd.Series([], dtype='str'),
        'Age': pd.Series([], dtype='float64'),
        'SibSp': pd.Series([], dtype='int64'),
        'Parch': pd.Series([], dtype='int64'),
        'Ticket': pd.Series([], dtype='str'),
Example #29
def run(k, j, filename, seednum=20, threshold = 0.5, resultdir=None, graphdir = f'{treedir}/'):
#    classes = ["P1a1" , "P1a2"  , "P2b"  , "P2c" ] 
    classes = ["P1a1" , "P1a2", "P2b", "P2c", "H1" ]
    # H1 H2  O (1) P1a1 (4)  P1a2 (6)   P2b   P2c   S1a (0)   S1c    S2    S3 
    joind = gp.read_file(filename, layer = layers[j])
    print(f'\n------\n------{layers[j]}----\n-----\n')
    joind['area']= joind['geometry'].area #calculate the area of each object
    df1 = pd.DataFrame(joind.drop(columns='geometry'))
    df1 = df1.replace([np.inf, -np.inf], np.nan).dropna()
    
    Pcl = df1.loc[df1['geocode_2'].isin(classes)] # filter only classes of interest
    print(Pcl['geocode_2'].value_counts())
    # regroup, geocode_2 from here on becomes binary!
    Pcl['geocode_2'] = np.where(Pcl['geocode_2'].str.contains(classes[k]),classes[k],'Others')
    print(Pcl['geocode_2'].value_counts())
    minc = min(Pcl['geocode_2'].value_counts() ) # skip if less than 20 objects 
    if minc< 20:
        print("minimum class less than 20")
        return (-1, -1) # -1 -1 if not calculated
    else:    
        print(f'total {len(df1)}, P_H1_classes: {len(Pcl)}, minimum class: {minc}')
        # bootstrap and get averaged accuracy
        avepre = np.zeros(1) # store all the xgb+tree precisions in each CV
        averec = np.zeros(1)
        for seeds in range(seednum):
            np.random.seed(seeds)
            #1. categorise the variable "area", the variable "area" is kept in the data frame, strictly it can be removed.  
            #2. use groupby to sample the same amount for each area category 
            # use 70% of area for training, get the index
            print (Pcl['area'].quantile([0, .25, .5, .75, 1]))
            Pcl['area_c'] = pd.cut(Pcl['area'],
                     bins=Pcl['area'].quantile([0, .25, .5, .75, 1]).tolist(),
                     labels=["q25", "q5", "q75", "Max"])
            
            print(Pcl["area_c"].value_counts())

            train_ind = Pcl.groupby('area_c').sample(n = int(min(Pcl["area_c"].value_counts())*0.7)).index 
            test_ind = Pcl[~Pcl.index.isin(train_ind)].index
            
            Pcl.loc [train_ind,"geocode_2" ].value_counts()
            X_train0 = Pcl.loc [train_ind ].drop(columns=["geocode_2","layer","OBJECTID","path", "area_c"])
            X_test0  = Pcl.loc [test_ind ].drop(columns=["geocode_2","layer","OBJECTID","path", "area_c"])
            
            Y_train0 = Pcl.filter(regex='geocode_2').loc[train_ind] 
            Y_test0  = Pcl.filter(regex='geocode_2').loc[test_ind] 
            print("after sampling by area: for 2 classes,", X_train0.shape[0], X_test.shape[0])
            print(Pcl.loc [train_ind ]["geocode_2"].value_counts())
            # If pandas is older and groupby().sample() above is unavailable, the commented-out code below is an alternative.
             
            # grouped = Pcl.drop(columns=["geocode_2","layer","OBJECTID","path",'area']).groupby('area_c')
            
            #def fun1(x):
            #    y = x.drop(columns=["area_c"]) 
            #    return( y.sample(n = int(minc/5*0.7)).index )
            #train_ind = grouped.apply(fun1) 
            #test_ind = Pcl[~Pcl.index.isin(train_ind)].index
            # need to ungroup train_ind
            
            # test data
            #grouped2 = Pcl[['geocode_2',"area_c"]].groupby('area_c')
            #y = grouped2.apply(fun1)
            
            #####
            # after getting x, y train, we will use undersample to sample from each classes, p1a1 and others
            
            rus = RandomUnderSampler(random_state  = 1)
            X_train, Y_train = rus.fit_resample(X_train0, Y_train0)
            # The test split is not resampled; alias it for the evaluation code below.
            X_test, Y_test = X_test0, Y_test0
            print("number of samples used for training:", X_train.shape[0]/2)
            #y2 = y2.reshape(-1, 1)
            #y2_rus, y_rus = rus.fit_resample(y2, y)
            #y2_rus= y2_rus.flatten()
           
            #len(train)+len(test)
            
            # relable
            label_all = [classes[k], "Others"]
            #classtype  =  [(j, "float32") for j in classes]
            
            #Pcl.geocode_2.unique()
            i = 0
            idx2class = {}
            class2idx = {}
            for tp in label_all:
                idx2class[i] = tp
                class2idx[tp] = i 
                i+= 1
           
             
            Y_trainnum = cl2idx(Y_train.values, class2idx).astype(int)
            Y_testnum = cl2idx(Y_test.values, class2idx).astype(int)
             
            np.unique(Y_trainnum)
            params = {'max_depth': 6, 'eta': 0.002, 
                      'objective':'binary:logistic', 'num_class': 1}
             
            clf = xgb.XGBModel(**params)

            clf.fit(X_train.values, Y_trainnum,
            eval_set=[(X_train.values, Y_trainnum), (X_test.values, Y_testnum)],
            eval_metric='logloss',
            verbose=True)
            
            #for testing
            #clf = DecisionTreeClassifier(min_samples_split= 30, max_depth= 4, min_samples_leaf=20, random_state=1)

            yhat = clf.predict(X_test)
                     
                    # threshold 0.5, probability higher than 0.5 -> positive. 
            yhat_labels = yhat>threshold
            yhat_labels = yhat_labels.astype(int)
            
 
            #TP
            TP = ((Y_testnum == 1) & (yhat_labels == 1)).astype(float) * X_test["area"]
            #FP
            FP = ((Y_testnum == 0) & (yhat_labels == 1)).astype(float) * X_test["area"]
            #TN
            TN = ((Y_testnum == 0) & (yhat_labels == 0)).astype(float) * X_test["area"]
            #FN
            FN =((Y_testnum == 1) & (yhat_labels == 0)).astype(float) * X_test["area"]
            precision = np.sum(TP)/np.sum(TP+FP)  # area-weighted precision
            recall = np.sum(TP)/np.sum(TP+FN)  # area-weighted recall
            

            averec = np.append(averec, recall) #store all of them
            avepre = np.append(avepre, precision)

        recall = averec.sum()/seednum #get the mean but exclude the first one (0)
        precision = avepre.sum()/seednum
        print(averec, recall)
        if resultdir is not None:
            Y_testnum =  Y_testnum.astype(int)
            plt.rcParams.update({'font.size': 8})
            ax = xgb.plot_importance(clf, grid=False, importance_type='gain', title='Feature importance')
            ax.set_title(f'xgboost importance {layers[j]} {classes[k]}')
            fname = f"{resultdir}/P_{layers[j]}_{classes[k]}_imp"
            plt.savefig(fname, dpi=1200)
        return (recall, precision)
Example #30
# Boosted tree
print("XGB Tree")
import xgboost as xgb

param = {
    'n_estimators': 10000,
    'learning_rate': 0.1,
    'objective': 'reg:squarederror',
    'verbosity': 0
}
fit_param = {
    'eval_set': [(X_train, y_train), (X_test, y_test)],
    'early_stopping_rounds': 200,
    'verbose': False
}
BT = Regressor(xgb.XGBModel(**param))
BT.run(X_train, y_train, X_test, y_test, **fit_param)
X_train_new, X_test_new = BT.select(X_train, X_test)
fit_param = {
    'eval_set': [(X_train_new, y_train), (X_test_new, y_test)],
    'early_stopping_rounds': 100,
    'verbose': False
}
# BT.run(X_train_new, y_train, X_test_new, y_test, **fit_param)
xgb.plot_importance(BT.reg)
plt.show()

# RF
print("Random Forest")
RF = Regressor(ensemble.RandomForestRegressor(random_state=42))
RF.run(X_train, y_train, X_test, y_test)