class XGBTransformer(ModelTransformer):
    def __init__(self, config_model_parameters, columns):
        super().__init__(config_model_parameters, columns)
        self.model = XGBRegressor(**self.model_parameters)
        self.name = "XGBoost"

    def features_importance(self, n=20):
        booster = self.model.booster()  # renamed get_booster() in xgboost >= 0.7
        fscore = booster.get_fscore()
        all_features = [fscore.get(f, 0.) for f in booster.feature_names]
        all_features = np.array(all_features, dtype=np.float32)
        importance = all_features / all_features.sum()
        logger.info("Feature importances")
        for k, (imp, feat) in enumerate(sorted(zip(importance, self.columns), reverse=True)):
            logger.info("%s -> %f", feat, imp)
            if k + 1 == n:  # stop after the top n features
                break
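
Note: xgboost 0.7 renamed the sklearn wrapper's booster() accessor to get_booster(), and later releases removed the old name, so the call above only runs on older versions. A minimal sketch of the same importance dump against the current accessor (assuming a fitted XGBRegressor named model whose booster carries feature names, plus a matching columns list):

import numpy as np

booster = model.get_booster()          # current accessor
fscore = booster.get_fscore()          # split counts per feature
weights = np.array([fscore.get(f, 0.0) for f in booster.feature_names],
                   dtype=np.float32)
importance = weights / weights.sum()   # normalize to fractions
for imp, name in sorted(zip(importance, columns), reverse=True)[:20]:
    print("%s -> %f" % (name, imp))
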
Example #3
    def test_tree_ensemble_regressor_xgboost(self):

        this = os.path.dirname(__file__)
        data_train = pandas.read_csv(os.path.join(
            this, "xgboost.model.xgb.n4.d3.train.txt"),
                                     header=None)

        X = data_train.iloc[:, 1:].values
        y = data_train.iloc[:, 0].values

        params = dict(n_estimators=4, max_depth=3)  # "n4.d3" in the data file name encodes these
        model = XGBRegressor(**params).fit(X, y)
        # Work around coremltools calling the removed booster() accessor;
        # see https://github.com/apple/coremltools/issues/51.
        model.booster = model.get_booster
        model_coreml = convert_xgb_to_coreml(model)
        model_onnx = convert_cml(model_coreml)
        assert model_onnx is not None
        if sys.version_info[0] >= 3:
            # python 2.7 returns TypeError: can't pickle instancemethod objects
            dump_data_and_model(X.astype(numpy.float32),
                                model,
                                model_onnx,
                                basename="CmlXGBoostRegressor-OneOff-Reshape",
                                allow_failure=True)
class PrudentialRegressorCVO(BaseEstimator, RegressorMixin):
    def __init__(self,
                objective='reg:linear',
                learning_rate=0.045,
                min_child_weight=50,
                subsample=0.8,
                colsample_bytree=0.7,
                max_depth=7,
                n_estimators=700,
                nthread=-1,
                seed=0,
                n_buckets=8,
                initial_params=[-1.5, -2.6, -3.6, -1.2, -0.8, 0.04, 0.7, 3.6,
                                #1., 2., 3., 4., 5., 6., 7.
                                ],
                minimizer='BFGS',
                scoring=NegQWKappaScorer):

        self.objective = objective
        self.learning_rate = learning_rate
        self.min_child_weight = min_child_weight
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.max_depth = max_depth
        self.n_estimators = n_estimators
        self.nthread = nthread
        self.seed = seed
        self.n_buckets = n_buckets
        self.initial_params = initial_params
        self.minimizer = minimizer
        self.scoring = scoring

        return


    def fit(self, X, y):
        from xgboost import XGBRegressor
        if not KAGGLE:
            from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor

        #from OptimizedOffsetRegressor import FullDigitizedOptimizedOffsetRegressor
        #self.off = FullDigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets,
        #               basinhopping=True,

        """
2 / 5
grid scores:
  mean: 0.65531, std: 0.00333, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65531

3 / 5
grid scores:
  mean: 0.65474, std: 0.00308, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65474

4 / 5
grid scores:
  mean: 0.65490, std: 0.00302, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65490


2 / 10
grid scores:
  mean: 0.65688, std: 0.00725, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65688

3 / 10
grid scores:
  mean: 0.65705, std: 0.00714, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65705

4 / 10
grid scores:
  mean: 0.65643, std: 0.00715, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65643

5 / 10
grid scores:
  mean: 0.65630, std: 0.00699, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65630

        """
        from sklearn.cross_validation import StratifiedKFold
        kf = StratifiedKFold(y, n_folds=2)
        print(kf)
        params = []
        for itrain, itest in kf:
            ytrain = y[itrain]
            Xtrain = X.iloc[list(itrain)]
            ytest = y[itest]
            Xtest = X.iloc[list(itest)]

            self.xgb = XGBRegressor(
                           objective=self.objective,
                           learning_rate=self.learning_rate,
                           min_child_weight=self.min_child_weight,
                           subsample=self.subsample,
                           colsample_bytree=self.colsample_bytree,
                           max_depth=self.max_depth,
                           n_estimators=self.n_estimators,
                           nthread=self.nthread,
                           missing=0.0,
                           seed=self.seed)
            self.xgb.fit(Xtrain, ytrain)
            te_y_hat = self.xgb.predict(Xtest,
                                        ntree_limit=self.xgb.booster().best_iteration)
            print('XGB Test score is:', -self.scoring(te_y_hat, ytest))

            self.off = DigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets,
                           initial_params=self.initial_params,
                           minimizer=self.minimizer,
                           scoring=self.scoring)
            self.off.fit(te_y_hat, ytest)
            print("Offsets:", self.off.params)
            params += [list(self.off.params)]

            pass

        from numpy import array
        self.off.params = array(params).mean(axis=0)
        print("Mean Offsets:", self.off.params)
        self.xgb.fit(X, y)

        return self


    def predict(self, X):
        from numpy import clip
        te_y_hat = self.xgb.predict(X, ntree_limit=self.xgb.booster().best_iteration)
        return clip(self.off.predict(te_y_hat), 1, 8)

    pass
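
sklearn.cross_validation was removed in scikit-learn 0.20; a minimal sketch of the two-fold split used in fit() above against the current model_selection API (same X and y):

from sklearn.model_selection import StratifiedKFold

kf = StratifiedKFold(n_splits=2)
for itrain, itest in kf.split(X, y):
    Xtrain, ytrain = X.iloc[list(itrain)], y[itrain]
    Xtest, ytest = X.iloc[list(itest)], y[itest]
    # per-fold fit and offset search proceed as in PrudentialRegressorCVO.fit
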
class PrudentialRegressorFO(BaseEstimator, RegressorMixin):
    def __init__(self,
                objective='reg:linear',
                learning_rate=0.045,
                min_child_weight=50,
                subsample=0.8,
                colsample_bytree=0.7,
                max_depth=7,
                n_estimators=700,
                nthread=-1,
                seed=0,
                n_buckets=8,
                initial_params=[-1.5, -2.6, -3.6, -1.2, -0.8, 0.04, 0.7, 3.6,
                                #1., 2., 3., 4., 5., 6., 7.
                                ],
                minimizer='BFGS',
                scoring=NegQWKappaScorer):

        self.objective = objective
        self.learning_rate = learning_rate
        self.min_child_weight = min_child_weight
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.max_depth = max_depth
        self.n_estimators = n_estimators
        self.nthread = nthread
        self.seed = seed
        self.n_buckets = n_buckets
        self.initial_params = initial_params
        self.minimizer = minimizer
        self.scoring = scoring

        return


    def fit(self, X, y):
        from xgboost import XGBRegressor
        if not KAGGLE:
            from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor

        self.xgb = XGBRegressor(
                       objective=self.objective,
                       learning_rate=self.learning_rate,
                       min_child_weight=self.min_child_weight,
                       subsample=self.subsample,
                       colsample_bytree=self.colsample_bytree,
                       max_depth=self.max_depth,
                       n_estimators=self.n_estimators,
                       nthread=self.nthread,
                       missing=0.0,
                       seed=self.seed)
        from OptimizedOffsetRegressor import FullDigitizedOptimizedOffsetRegressor
        self.off = FullDigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets,
#                       basinhopping=True,
                       initial_params=self.initial_params,
                       minimizer=self.minimizer,
                       scoring=self.scoring)

        self.xgb.fit(X, y)

        tr_y_hat = self.xgb.predict(X,
                                    ntree_limit=self.xgb.booster().best_iteration)
        print('Train score is:', -self.scoring(tr_y_hat, y))
        self.off.fit(tr_y_hat, y)
        print("Offsets:", self.off.params)

        return self


    def predict(self, X):
        from numpy import clip
        te_y_hat = self.xgb.predict(X, ntree_limit=self.xgb.booster().best_iteration)
        return clip(self.off.predict(te_y_hat), 1, 8)

    pass
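
The two wrappers differ only in how the offsets are fitted: PrudentialRegressorFO fits them on in-sample predictions, while PrudentialRegressorCVO fits them per fold on out-of-fold predictions and averages the fold offsets before refitting the booster on all of the data. A usage sketch (assuming NegQWKappaScorer and the offset regressors are importable):

reg = PrudentialRegressorCVO(n_estimators=700, max_depth=7)
reg.fit(X_train, y_train)    # trains the booster plus the averaged offsets
y_hat = reg.predict(X_test)  # offset-adjusted ratings clipped to [1, 8]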
Example #6
                     subsample=subsample,
                     colsample_bytree=colsample_bytree,
                     scale_pos_weight=1,
                     gamma=gamma,
                     reg_alpha=reg_alpha,
                     learning_rate=learning_rate)
pid_list = data_reader.pid_check(training_data)
model.fit(training_data, training_label)
training_output, predict_output = model.predict(training_data), model.predict(
    predict_data)
data_writer.data_write(folder_name,
                       training_output,
                       predict_output,
                       silent=True)

score = model.booster().get_fscore()
mapper = {'f{0}'.format(i): v for i, v in enumerate(names)}
mapped = {mapper[k]: v for k, v in score.items()}
fig, ax = plt.subplots(1, 1, figsize=(7, 25))
xgb.plot_importance(mapped, ax=ax)
#plt.show()
plt.savefig("graph.png")

all_output = 0
rem = []
for i in range(12):
    training_data, training_label, predict_data, predict_label = data_reader.test_data_read(
        i)
    # Remove the higher-indexed column first so index 118 still refers to
    # the same original column after 124 is gone.
    training_data = np.delete(training_data, 124, 1)
    predict_data = np.delete(predict_data, 124, 1)
    training_data = np.delete(training_data, 118, 1)
Example #7
# y_train=train_df["Y"]
y_train = np.log1p(train_df["Y"])
print("---1----")
train_df = train_df.drop(["Y"], axis=1)
print("---11----")
quantity = [attr for attr in train_df.columns if train_df.dtypes[attr] != 'object']  # numeric (non-object) columns
train_df = train_df[quantity]
# X_train = Imputer().fit_transform(train_df)
X_train = train_df
X_train = X_train.fillna(0)
print(np.isnan(X_train).any())
print("---111----")
xgb1 = XGBRegressor()
xgb1.fit(X_train, y_train)
print("---2----")
feat_imp = pd.Series(xgb1.booster().get_fscore()).sort_values(ascending=False)


def sort_weight_values(X, y, predictors):
    selector = SelectKBest(f_regression, k=5)
    selector.fit(X, y)
    scores = -np.log10(selector.pvalues_)
    dt = pd.DataFrame()
    dt["predictors"] = predictors
    dt["scores"] = scores
    dt = dt.sort_values(by='scores', axis=0, ascending=False)
    dt.to_csv(
        '/Users/jianjun.yue/PycharmGItHub/data/智能制造质量预测/特征重要性_SelectKBest.csv',
        header=False,
        index=False)
    print("------------------------------------------------")
clf_xgb = XGBRegressor(max_depth=3, n_estimators=1000)
clf_gbm = GBMRegressor(exec_path=path_to_exec,
                       num_iterations=1000,
                       learning_rate=0.01,
                       num_leaves=255,
                       min_data_in_leaf=1,
                       early_stopping_round=20,
                       verbose=False)

x_train, x_test, y_train, y_test = model_selection.train_test_split(
    X, Y, test_size=test_size, random_state=seed)

# Training the two models
clf_gbm.fit(x_train, y_train, test_data=[(x_test, y_test)])
clf_xgb.fit(x_train,
            y_train,
            eval_set=[(x_test, y_test)],
            eval_metric='rmse',
            early_stopping_rounds=20,
            verbose=False)

print("xgboost: feature importance")
dic_fi = clf_xgb.booster().get_fscore()
xgb_fi = [(feature_names[int(k[1:])], dic_fi[k]) for k in dic_fi]
xgb_fi = sorted(xgb_fi, key=lambda x: x[1], reverse=True)
print(xgb_fi)

print("lightgbm: feature importance")
gbm_fi = clf_gbm.feature_importance(feature_names)
print(gbm_fi)
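
GBMRegressor here is the older pyLightGBM binding, which drives a LightGBM executable through exec_path; a rough sketch of the same comparison with the current lightgbm package (assuming the same data and feature_names):

import lightgbm as lgb

clf_gbm = lgb.LGBMRegressor(n_estimators=1000, learning_rate=0.01, num_leaves=255)
clf_gbm.fit(x_train, y_train, eval_set=[(x_test, y_test)])
dic_fi = clf_xgb.get_booster().get_fscore()   # current xgboost accessor
gbm_fi = sorted(zip(feature_names, clf_gbm.feature_importances_),
                key=lambda t: t[1], reverse=True)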
Example #11
# regression model
x_train, x_test, y_train, y_test = train_test_split(x,y,train_size=0.8,
                                                    random_state=0)

scaler = RobustScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# XGBRFRegressor??????

model = XGBRegressor()

model.fit(x_train, y_train)
model.booster().get_score()  # feature split counts; the return value is discarded here


thres_holds = np.sort(model.feature_importances_)
print(thres_holds)

model1 = MultiOutputRegressor(XGBRegressor())
model1.fit(x_train,y_train)

score = model.score(x_test, y_test)  # note: scores the single-output model, not model1

print(f"r2 : {score}")

'''feature engineering'''
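
The sorted importances in thres_holds are typically fed to a SelectFromModel sweep; a minimal sketch under that assumption (and assuming a single-output y):

from sklearn.feature_selection import SelectFromModel

for thresh in thres_holds:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_x_train = selection.transform(x_train)
    selection_model = XGBRegressor()
    selection_model.fit(select_x_train, y_train)
    select_x_test = selection.transform(x_test)
    print("thresh=%.3f, n=%d, r2=%.4f"
          % (thresh, select_x_train.shape[1],
             selection_model.score(select_x_test, y_test)))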

Example #12
test = test.drop(["单核细胞%"], axis=1)
# X = X.drop(["淋巴细胞%"], axis=1)
# X = X.drop(["乙肝e抗原"], axis=1)
# X = X.drop(["乙肝表面抗体"], axis=1)
Y = data["血糖"]  # target: blood glucose
Y = np.log1p(Y)  # fit on log1p(target); inverted with expm1 below

clf = XGBRegressor()
print("---111----")
kfold = KFold(n_splits=5, shuffle=True, random_state=7)  # random_state requires shuffle=True in current scikit-learn
test_score = np.sqrt(
    -cross_val_score(clf, X, Y, cv=kfold, scoring='neg_mean_squared_error'))  # per-fold RMSE
print("------test_score--------")
print(test_score)
print(np.mean(test_score))
print("---2----")
clf.fit(X, Y)
FeatureImportances = pd.Series(
    clf.booster().get_fscore()).sort_values(ascending=False)
print(FeatureImportances)
print("---3----")
pred = np.expm1(clf.predict(test))  # invert the log1p target transform
pred_df = pd.DataFrame()
pred_df["pred"] = pred

pred_df.to_csv(
    '/Users/jianjun.yue/PycharmGItHub/data/人工智能辅助糖尿病遗传风险预测/sub_0107_XG_去掉负效果特征.csv',
    header=False,
    index=False,
    float_format='%.3f')