Example #1
    def def_model(self, parameters: dict = None):
        model = XGBRegressor()

        if parameters is not None:
            model.set_params(**parameters)

        self._model = model
Example #2
class XGBWrapper_regr(object):
    """
    A wrapper around the xgboost model so that we have a single API across various models.
    """

    def __init__(self):
        self.model = XGBRegressor()

    def fit(self, X_train, y_train, X_valid=None, y_valid=None, X_holdout=None, y_holdout=None, params=None):

        self.model = self.model.set_params(**params)
        
        eval_set = [(X_train, y_train)]
        if X_valid is not None:
            eval_set.append((X_valid, y_valid))
        if X_holdout is not None:
            eval_set.append((X_holdout, y_holdout))

        self.model.fit(X=X_train, y=y_train,
                       eval_set=eval_set, eval_metric='rmse',
                       verbose=params['verbose'], early_stopping_rounds=params['early_stopping_rounds'])

        scores = self.model.evals_result()
        self.best_score_ = {k: {m: m_v[-1] for m, m_v in v.items()} for k, v in scores.items()}
#         self.best_score_ = {k: {m: n if m != 'cappa' else -n for m, n in v.items()} for k, v in self.best_score_.items()}

        self.feature_importances_ = self.model.feature_importances_
    
    def predict(self, X_test):
        return self.model.predict(X_test)
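A minimal usage sketch for the wrapper above (not part of the original snippet). The toy data, the params dict carrying the verbose and early_stopping_rounds keys that fit() reads back, and an xgboost release (pre-2.0) whose fit() still accepts eval_metric/early_stopping_rounds are all assumptions.

# Hypothetical usage of XGBWrapper_regr; data and params are illustrative only.
import numpy as np
from sklearn.model_selection import train_test_split

X = np.random.rand(500, 10)
y = np.random.rand(500)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

params = {
    'n_estimators': 500,
    'learning_rate': 0.05,
    'max_depth': 4,
    'verbose': False,               # read back by fit() for verbosity
    'early_stopping_rounds': 50,    # read back by fit() for early stopping
}

wrapper = XGBWrapper_regr()
wrapper.fit(X_train, y_train, X_valid=X_valid, y_valid=y_valid, params=params)
preds = wrapper.predict(X_valid)
print(wrapper.best_score_)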
Example #3
def objective(params):
    (learning_rate, max_depth, n_estimators) = params
    if learner_choice == 'xgbR':
        learner = XGBRegressor()
    elif learner_choice == 'xgbC':
        learner = XGBClassifier()
    learner.set_params(booster='gbtree',
                       learning_rate=learning_rate,
                       max_depth=max_depth,
                       n_estimators=n_estimators,
                       subsample=0.75)
    return -mean(
        cross_val_score(learner,
                        X,
                        y,
                        cv=cv_folds,
                        n_jobs=-1,
                        scoring="neg_mean_absolute_error"))
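The driver for this objective is not shown; below is a hedged sketch using scikit-optimize's gp_minimize. The search-space bounds, the toy data, and the learner_choice, X, y, and cv_folds globals that the objective reads are assumptions.

# Hypothetical driver for the objective above, using scikit-optimize (an assumption).
from numpy import mean
from skopt import gp_minimize
from skopt.space import Real, Integer
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor, XGBClassifier

X, y = make_regression(n_samples=300, n_features=10, random_state=0)
learner_choice = 'xgbR'   # global read by objective()
cv_folds = 3              # global read by objective()

space = [Real(0.01, 0.3, name='learning_rate'),
         Integer(2, 8, name='max_depth'),
         Integer(50, 300, name='n_estimators')]

result = gp_minimize(objective, space, n_calls=20, random_state=0)
print('best MAE:', result.fun, 'best params:', result.x)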
Example #4
def xgb(X_train, X_test, y_train, y_test):

    mod = XGBRegressor(learning_rate=0.2, objective='reg:squarederror')
    estimators = np.arange(1, 200, 10)
    scores = []
    estim = []

    for n in estimators:
        mod.set_params(n_estimators=n)
        mod.fit(X_train, y_train)
        scores.append(mod.score(X_test, y_test))
        estim.append(n)

    xdf = pd.DataFrame({'Estimator': estim, 'Score': scores})
    best = next((x for x in xdf['Estimator'][xdf['Score'] == max(xdf['Score'])]), None)

    xgbr = XGBRegressor(n_estimators=best, learning_rate=0.2, objective='reg:squarederror')
    xgbr.fit(X_train, y_train)

    return xgbr
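A hypothetical call to the n_estimators sweep above; the synthetic regression data and the train/test split are illustrative only.

# Hypothetical driver for the sweep above; data is synthetic.
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=400, n_features=8, noise=0.1, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

best_model = xgb(X_train, X_test, y_train, y_test)
print('test R^2:', best_model.score(X_test, y_test))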
Example #5
def train_model(X_train, y_train, X_test, y_test, estimator):
    """
        This function performs the training of the model.

        :param df_train: The dataframe with the train data set.
        :param df_test: The dataframe with the test data set.
        :return: model: Returns the trained model which can be used to get predictions.
    """
    logger.info("Start train_model()")

    model = None
    if estimator == 'DecisionTreeRegressor':
        model = DecisionTreeRegressor()
        model.fit(X_train, y_train)
    if estimator == 'SGDRegressor':
        model = MultiOutputRegressor(SGDRegressor())
        model.fit(X_train, y_train)
    if estimator == 'GradientBoostingRegressor':
        model = MultiOutputRegressor(GradientBoostingRegressor())
        model.fit(X_train, y_train)
    if estimator == 'XGBRegressor':
        best_params = {
            'colsample_bytree': 0.5,
            'gamma': 0.0,
            'learning_rate': 0.1,
            'max_depth': 5,
            'min_child_weight': 5,
            'n_estimators': 50,
            'nthread': -1,
            #  'num_boost_round': 45,
            'objective': 'reg:squarederror'
        }
        model_xgb = XGBRegressor(n_jobs=-1)
        model_xgb.set_params(**best_params)

        model = MultiOutputRegressor(model_xgb)
        model.fit(X_train, y_train)

    return model
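A hedged call sketch for the XGBRegressor branch, assuming the snippet's own imports (MultiOutputRegressor, XGBRegressor, the logger) are in scope; the multi-output toy data is an assumption.

# Hypothetical call to train_model with the XGBRegressor branch.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=500, n_features=12, n_targets=3, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

model = train_model(X_train, y_train, X_test, y_test, estimator='XGBRegressor')
preds = model.predict(X_test)
print('prediction shape:', preds.shape)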
Example #6
class Xgb:
    def __init__(self, df, target_column='', id_column='', target_type='binary', categorical_columns=[], drop_columns=[], numeric_columns=[], num_training_rounds=500, verbose=1, early_stopping_rounds=None):
        """
        input params:
        - df (DataFrame): dataframe of training data
        - target_column (string): name of target column
        - id_column (string): name of id column
        - target_type (string): 'linear' or 'binary'
        - categorical_columns (list): list of column names of categorical data. Will perform one-hot encoding
        - drop_columns (list): list of columns to drop
        - numeric_columns (list): list of columns to convert to numeric
        - verbose (bool): verbosity of printouts
        """
        if type(df) == pd.core.frame.DataFrame:
            self.df = df
            self.early_stopping_rounds = early_stopping_rounds
            if target_column:
                self.target_column = target_column
                self.id_column = id_column
                self.target_type = target_type
                self.categorical_columns = categorical_columns
                self.numeric_columns = numeric_columns
                self.drop_columns = drop_columns
                self.verbose = verbose
                self.num_training_rounds = num_training_rounds
                # init the classifier
                if self.target_type == 'binary':
                    self.scoring = 'auc'
                    self.clf = XGBClassifier(
                        learning_rate =0.1,
                        n_estimators = num_training_rounds,
                        subsample = 0.8,
                        colsample_bytree = 0.8,
                        objective = 'binary:logistic',
                        scale_pos_weight = 1,
                        seed = 123)
                elif self.target_type == 'linear':
                    self.scoring = 'rmse'
                    self.clf = XGBRegressor(
                            n_estimators = num_training_rounds,
                            objective = 'reg:linear'
                            )
            else:
                print('please provide target column name')
        else:
            print('please provide pandas dataframe')

    def train(self):
        print('#### preprocessing ####')
        self.df = self.preprocess(self.df)

        print('#### training ####')
        self.predictors = [x for x in self.df.columns if x not in [self.target_column, self.id_column]]
        xgb_param = self.clf.get_xgb_params()

        xgtrain  = xgb.DMatrix(self.df[self.predictors], label=self.df[self.target_column], missing=np.nan)
        try:
            cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds, show_progress=self.verbose)
        except:
            try:
                cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                    metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds, verbose_eval=self.verbose)
            except:
                cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                    metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds)
        self.clf.set_params(n_estimators=cvresult.shape[0])
        self.clf.fit(self.df[self.predictors], self.df[self.target_column],eval_metric=self.scoring)

        #Predict training set:
        train_df_predictions = self.clf.predict(self.df[self.predictors])

        if self.target_type == 'binary':
            train_df_predprob = self.clf.predict_proba(self.df[self.predictors])[:,1]
            print("Accuracy : %.4g" % metrics.accuracy_score(self.df[self.target_column].values, train_df_predictions))
            print("AUC Score (Train): %f" % metrics.roc_auc_score(self.df[self.target_column], train_df_predprob))
        elif self.target_type == 'linear':
            print("Mean squared error: %f" % metrics.mean_squared_error(self.df[self.target_column].values, train_df_predictions))
            print("Root mean squared error: %f" % np.sqrt(metrics.mean_squared_error(self.df[self.target_column].values, train_df_predictions)))

    def predict(self, test_df):
        print('### predicting ###')
        print('## preprocessing test set')
        if self.id_column in test_df:
            ids = test_df[self.id_column]
        if self.target_column in test_df.columns:
            targets = test_df[self.target_column]
        self.test_df = self.preprocess(test_df, train=False)
        if self.id_column in test_df:
            self.test_df[self.id_column] = ids
        if self.target_column in test_df.columns:
            self.test_df[self.target_column] = targets
        for col in self.predictors:
            if col not in self.test_df.columns:
                self.test_df[col] = np.nan

        if self.target_type == 'binary':
            self.output = self.clf.predict_proba(self.test_df[self.predictors])[:,1]
        elif self.target_type == 'linear':
            self.output = self.clf.predict(self.test_df[self.predictors])
        return self.output

    def feature_importance(self, num_print=10, display=True):
        feature_importance = sorted(list(self.clf.booster().get_fscore().items()), key = operator.itemgetter(1), reverse=True)

        impt = pd.DataFrame(feature_importance)
        impt.columns = ['feature', 'importance']
        print(impt[:num_print])
        if display:
            impt[:num_print].plot("feature", "importance", kind="barh", color=sns.color_palette("deep", 3))


    def preprocess(self, df, train=True):
        # one hot encoding of categorical variables
        print('## one hot encoding of categorical variables')
        for col in self.categorical_columns:
            if self.verbose:
                print('one hot encoding: ', col)
            df = pd.concat([df, pd.get_dummies(df[col]).rename(columns=lambda x: col+'_'+str(x))], axis=1)
            df = df.drop([col], axis=1)

        # if training, determine columns to be removed
        if train:
            # drop columns that are too sparse to be informative
            self.cols_to_remove = []
            print('## dropping columns below sparsity threshold')
            for col in df.columns:
                nan_cnt = 0
                for x in df[col]:
                    try:
                        if np.isnan(x):
                            nan_cnt += 1
                    except:
                        pass
                if nan_cnt/float(len(df[col])) > 0.6: # arbitrary cutoff, if more than 60% missing then drop
                    if self.verbose:
                        print('will drop', col)
                    self.cols_to_remove.append(col)

            # drop columns that have no standard deviation (not informative)
            print('## dropping columns with no variation')
            for col in df.columns:
                if df[col].dtype == 'int64' or df[col].dtype == 'float64':
                    if df[col].std() == 0:
                        print('will drop', col)
                        self.cols_to_remove.append(col)
        if self.cols_to_remove:
            if self.verbose:
                print('dropping the following columns:', self.cols_to_remove)
            df = df.drop(self.cols_to_remove, axis=1)

        if self.verbose:
            print('## DataFrame shape is now:', df.shape)

        # convert to numerical where possible
        #print('## converting numerical data to numeric dtype')
        #df = df.convert_objects(convert_numeric=True)

        # convert columns specified to be int and float
        for col in self.numeric_columns:
            if col not in self.cols_to_remove:
                if self.verbose:
                    print('converting', col)
                df[col] = pd.to_numeric(df[col], errors='coerce')
                if self.verbose:
                    print(df[col].dtype)

        # drop those marked for dropping
        df = df.drop(self.drop_columns, axis=1)

        # drop all those that are object type
        print('## dropping non-numerical columns')
        for col in df.columns:
            if df[col].dtype == 'int64' or df[col].dtype == 'float64' or df[col].dtype == 'bool':
                pass
            else:
                if self.verbose:
                    print('dropping because not int, float, or bool:', col)
                df = df.drop([col], axis=1)
        return df

    def _to_int(self, num):
        try:
            return int(num)
        except:
            return

    def _to_float(self, num):
        try:
            return float(num)
        except:
            return

    def write_csv(self, filename, include_actual=False):
        """
        write results to csv
        - include actual: if actual values are known for test set, and we want to print them
        """
        with open(filename, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            headers = [self.id_column, self.target_column]
            if include_actual:
                headers.append('actual')
            writer.writerow(headers)
            for idx, value in enumerate(self.output):
                test_id = self.test_df[self.id_column][idx]
                test_output = self.output[idx]
                to_write = [test_id, test_output]
                if include_actual:
                    to_write.append(self.test_df[self.target_column][idx])
                writer.writerow(to_write)

    def save(self, filename='xgb.pkl'):
        joblib.dump(self, filename)
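An end-to-end sketch of driving the class above on a regression ('linear') target. The DataFrame and column names are hypothetical, and it assumes the snippet's own imports (xgboost as xgb, numpy, pandas, sklearn metrics, csv, joblib) plus an older xgboost release that still accepts reg:linear and eval_metric in fit().

# Hypothetical usage of the Xgb wrapper above on a regression target.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
train_df = pd.DataFrame({
    'id': range(200),
    'feat_a': rng.rand(200),
    'feat_b': rng.choice(['x', 'y', 'z'], 200),   # categorical column, one-hot encoded by preprocess()
    'target': rng.rand(200),
})
test_df = train_df.drop(columns=['target']).copy()

model = Xgb(train_df,
            target_column='target',
            id_column='id',
            target_type='linear',
            categorical_columns=['feat_b'],
            num_training_rounds=50,
            early_stopping_rounds=10)
model.train()                       # cross-validates the round count, then fits
preds = model.predict(test_df)
model.write_csv('predictions.csv')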
Example #7
 subsample=0.8,
 colsample_bytree=0.8,
 objective='reg:gamma',
 nthread=4,
 scale_pos_weight=1,
 seed=1024)
    
  


##### parameter 1: max_depth

xgb_param = xgb1.get_xgb_params()
cvresult = xgb.cv(xgb_param, Dtrain, num_boost_round=xgb1.get_params()['n_estimators'], nfold=5,
        metrics='rmse', early_stopping_rounds=50)
xgb1.set_params(n_estimators=cvresult.shape[0])






param_test1 = {
 'max_depth':[3,4,5,6,7],
 'min_child_weight':[3,4,5,6,7]
}
gsearch1 = GridSearchCV(estimator = XGBRegressor(
 learning_rate =0.1,
 n_estimators=1000,
 gamma=0,
 subsample=0.8,
Example #8
 gamma=0.15,
 reg_alpha=2.5,
 reg_lambda=10,
 subsample=1,
 colsample_bytree=0.1,
 colsample_bylevel=0.1,
 objective= 'reg:logistic',
 nthread=-1,
 scale_pos_weight=1,
 tree_method= 'gpu_exact',
 gpu_id= 0,
 seed=0)
print('start cv')
result=xgb.cv(params, xgb.DMatrix(train_X,label=train_y), num_boost_round=1000, nfold=8, stratified=False,  maximize=False, early_stopping_rounds=10,as_pandas=True, verbose_eval=None, show_stdv=True,
       seed=0, callbacks=None, shuffle=True)
model.set_params(n_estimators=int(result.shape[0]))
model.fit(train_X, train_y)
print('start predict')
y=[]
X=train_X[-1,:].reshape(1,train_X.shape[1])
X = np.concatenate((X, train_y[-1].reshape(1, 1)), axis=1)
for i in range(len(test)):
	X=np.concatenate((X[:,3:],test[i].reshape(1,2)), axis=1).reshape((1, train_X.shape[1]))
	y.append(model.predict(X)[0]*(1+i/len(test)))
	X = np.concatenate((X.reshape(1,train_X.shape[1]), y[-1].reshape(1, 1)), axis=1)
print(y)
# yhat = model.predict(test_X)
# print(np.shape(yhat))
# test_X = test_X.reshape((test_X.shape[0], test_X.shape[2]))
# # invert scaling for forecast
inv_yhat = scaler.inverse_transform(y)
Example #9
    def instantiate_model(self, params):
        model = XGBRegressor()
        model.set_params(**params)
        return model
Example #10
 'min_child_weight':range(4,11,1)
}
gsearch2 = GridSearchCV(estimator = xgb1, 
                        param_grid = param_test2, 
                        scoring='r2',
                        n_jobs=-1,
                        iid=False, 
                        cv=5)
gsearch2.fit(train[predictors],y)
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_
# max_depth: 2, min_child_weight: 9

param_test2b = {
 'min_child_weight':range(19, 31)
}
xgb1.set_params(max_depth = 2)
gsearch2b = GridSearchCV(estimator = xgb1, 
                        param_grid = param_test2b, 
                        scoring='r2',
                        n_jobs=-1,
                        iid=False, 
                        cv=5)
gsearch2b.fit(train[predictors],y)
gsearch2b.grid_scores_, gsearch2b.best_params_, gsearch2b.best_score_
# max_depth = 2, min_child_weight = 27

param_test2c = {
 'max_depth': [2,3,4,5],
 'min_child_weight':range(4,31)
}
gsearch2c = GridSearchCV(estimator = xgb1, 
Example #11
class Xgb:
    def __init__(self, df, target_column='', id_column='', target_type='binary', categorical_columns=[], drop_columns=[], numeric_columns=[], num_training_rounds=500, verbose=1, sample_fraction=1.0, n_samples=1, early_stopping_rounds=None, prefix='xgb_model', scoring=None):
        """
        input params:
        - df (DataFrame): dataframe of training data
        - target_column (string): name of target column
        - id_column (string): name of id column
        - target_type (string): 'linear' or 'binary'
        - categorical_columns (list): list of column names of categorical data. Will perform one-hot encoding
        - drop_columns (list): list of columns to drop
        - numeric_columns (list): list of columns to convert to numeric
        - verbose (bool): verbosity of printouts
        """
        # checks for sampling
        sample_fraction = float(sample_fraction)
        if sample_fraction > 1:
            sample_fraction = 1.0
        if sample_fraction * n_samples > 1:
            n_samples = round(1.0/sample_fraction)
        if sample_fraction <= 0:
            print('sample_fraction 0 or negative, switching to 0.1')
            sample_fraction = 0.1
        # if sample_fraction results in a sample smaller than one row
        if round(sample_fraction * len(df)) == 0:
            sample_fraction = 1.0/len(df)
        # check if data is dataframe
        if type(df) == pd.core.frame.DataFrame:
            self.df = df
            self.early_stopping_rounds = early_stopping_rounds
            if target_column:
                self.target_column = target_column
                self.id_column = id_column
                self.target_type = target_type
                self.categorical_columns = categorical_columns
                self.numeric_columns = numeric_columns
                self.drop_columns = drop_columns
                self.verbose = verbose
                self.sample_fraction = sample_fraction
                self.n_samples = n_samples
                self.num_training_rounds = num_training_rounds
                self.prefix = prefix
                # init the classifier:
                if self.target_type == 'binary':
                    self.scoring = 'auc'
                    self.clf = XGBClassifier(
                        learning_rate =0.1,
                        n_estimators = num_training_rounds,
                        subsample = 0.8,
                        colsample_bytree = 0.8,
                        objective = 'binary:logistic',
                        scale_pos_weight = 1,
                        seed = 123)
                elif self.target_type == 'multiclass':
                    self.scoring = 'merror'
                    self.clf = XGBClassifier(
                        learning_rate =0.1,
                        n_estimators = num_training_rounds,
                        subsample = 0.8,
                        colsample_bytree = 0.8,
                        objective = 'multi:softmax',
                        scale_pos_weight = 1,
                        seed = 123)
                elif self.target_type == 'linear':
                    self.scoring = 'rmse'
                    self.clf = XGBRegressor(
                            n_estimators = num_training_rounds,
                            objective = 'reg:linear'
                            )
                # if preferred scoring metric is stated:
                if scoring:
                    self.scoring = scoring
            else:
                print('please provide target column name')
        else:
            print('please provide pandas dataframe')

    def train(self):
        print('#### preprocessing ####')
        self.df = self.preprocess(self.df)

        print('#### training ####')
        self.predictors = [x for x in self.df.columns if x not in [self.target_column, self.id_column]]
        xgb_param = self.clf.get_xgb_params()

        # if subsampling
        if self.sample_fraction == 1.0:
            df_list = [self.df]
        else:
            df_list = self.random_sample(df=self.df, fraction=self.sample_fraction, n_samples=self.n_samples)
        print(df_list)
        for idx, current_df in enumerate(df_list):
            print('ITERATION ' + str(idx) + ' of ' + str(self.n_samples) +', sample_fraction=' + str(self.sample_fraction))
            xgtrain  = xgb.DMatrix(current_df[self.predictors], label=current_df[self.target_column], missing=np.nan)
            try:
                cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                    metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds, show_progress=self.verbose)
            except:
                try:
                    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                        metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds, verbose_eval=self.verbose)
                except:
                    xgb_param['num_class'] = len(current_df[self.target_column].unique())
                    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                        metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds)
            self.clf.set_params(n_estimators=cvresult.shape[0])
            self.clf.fit(current_df[self.predictors], current_df[self.target_column], eval_metric=self.scoring)

            # Predict training set:
            train_df_predictions = self.clf.predict(current_df[self.predictors])

            if self.target_type == 'binary' or self.target_type == 'multiclass':
                train_df_predprob = self.clf.predict_proba(current_df[self.predictors])[:,1]
                print("Accuracy : %.4g" % metrics.accuracy_score(current_df[self.target_column].values, train_df_predictions))
                if self.target_type == 'binary':
                    print("AUC Score (Train): %f" % metrics.roc_auc_score(current_df[self.target_column], train_df_predprob))
            elif self.target_type == 'linear':
                print("Mean squared error: %f" % metrics.mean_squared_error(current_df[self.target_column].values, train_df_predictions))
                print("Root mean squared error: %f" % np.sqrt(metrics.mean_squared_error(current_df[self.target_column].values, train_df_predictions)))
            filename = self.prefix + '_' + str(idx) + '.pkl'
            self.save(filename)

    def predict(self, test_df, return_multi_outputs=False, return_mean_std=False):
        print('### predicting ###')
        print('## preprocessing test set')
        if self.id_column in test_df:
            ids = test_df[self.id_column]
        if self.target_column in test_df.columns:
            targets = test_df[self.target_column]
        self.test_df = self.preprocess(test_df, train=False)
        if self.id_column in test_df:
            self.test_df[self.id_column] = ids
        if self.target_column in test_df.columns:
            self.test_df[self.target_column] = targets
        for col in self.predictors:
            if col not in self.test_df.columns:
                self.test_df[col] = np.nan

        # prediction
        print('## predicting from test set')
        output_list = []
        output = None
        for idx, ns in enumerate(range(self.n_samples)):
            if self.n_samples == 1:
                if self.target_type == 'binary':
                    output = self.clf.predict_proba(self.test_df[self.predictors])[:,1]
                elif self.target_type == 'linear':
                    output = self.clf.predict(self.test_df[self.predictors])
            else:
                try:
                    filename = self.prefix + '_' + str(idx) + '.pkl'
                    xgb_load = self.load(filename)
                    if self.target_type == 'binary':
                        output = xgb_load.clf.predict_proba(self.test_df[self.predictors])[:,1]
                    elif self.target_type == 'linear':
                        output = xgb_load.clf.predict(self.test_df[self.predictors])
                    output_list.append(list(output))
                except IOError:
                    print('no file found, skipping')
        # average the outputs if n_samples is more than one
        if self.n_samples == 1:
            self.output = output
            try:
                self.multi_outputs = [list(output)]
            except:
                self.multi_outputs = None
        else:
            self.output = np.mean(output_list, axis=0)
            self.multi_outputs = output_list
        if return_multi_outputs:
            return self.multi_outputs
        elif return_mean_std:
            return (self.output, np.std(output_list, axis=0))
        else:
            return self.output

    def feature_importance(self, num_print=10, display=True):
        feature_importance = sorted(list(self.clf.booster().get_fscore().items()), key = operator.itemgetter(1), reverse=True)

        impt = pd.DataFrame(feature_importance)
        impt.columns = ['feature', 'importance']
        print(impt[:num_print])
        if display:
            impt[:num_print].plot("feature", "importance", kind="barh", color=sns.color_palette("deep", 3))


    def preprocess(self, df, train=True):
        # one hot encoding of categorical variables
        print('## one hot encoding of categorical variables')
        for col in self.categorical_columns:
            if self.verbose:
                print('one hot encoding: ', col)
            df = pd.concat([df, pd.get_dummies(df[col]).rename(columns=lambda x: col+'_'+str(x))], axis=1)
            df = df.drop([col], axis=1)

        # if training, determine columns to be removed
        if train:
            # drop columns that are too sparse to be informative
            self.cols_to_remove = []
            print('## dropping columns below sparsity threshold')
            for col in df.columns:
                nan_cnt = 0
                for x in df[col]:
                    try:
                        if np.isnan(x):
                            nan_cnt += 1
                    except:
                        pass
                if nan_cnt/float(len(df[col])) > 0.6: # arbitrary cutoff, if more than 60% missing then drop
                    if self.verbose:
                        print('will drop', col)
                    self.cols_to_remove.append(col)

            # drop columns that have no standard deviation (not informative)
            print('## dropping columns with no variation')
            for col in df.columns:
                if df[col].dtype == 'int64' or df[col].dtype == 'float64':
                    if df[col].std() == 0:
                        print('will drop', col)
                        self.cols_to_remove.append(col)
        if self.cols_to_remove:
            if self.verbose:
                print('dropping the following columns:', self.cols_to_remove)
            df = df.drop(self.cols_to_remove, axis=1)

        if self.verbose:
            print('## DataFrame shape is now:', df.shape)

        # convert to numerical where possible
        #print('## converting numerical data to numeric dtype')
        #df = df.convert_objects(convert_numeric=True)

        # convert columns specified to be int and float
        for col in self.numeric_columns:
            if self.verbose:
                print('converting', col)
            df[col] = pd.to_numeric(df[col], errors='coerce')
            if self.verbose:
                print(df[col].dtype)
        df = df.drop(self.drop_columns, axis=1)

        # drop all those that are object type
        print('## dropping non-numerical columns')
        for col in df.columns:
            if df[col].dtype == 'int64' or df[col].dtype == 'float64' or df[col].dtype == 'bool':
                pass
            else:
                if self.verbose:
                    print('dropping because not int, float, or bool:', col)
                df = df.drop([col], axis=1)
        return df

    def _to_int(self, num):
        try:
            return int(num)
        except:
            return

    def _to_float(self, num):
        try:
            return float(num)
        except:
            return

    def random_sample(self, df, fraction=0.2, n_samples=None):
        """
        splits into random samples
        - n_samples: how many samples you want returned (default = All)
        - fraction : what fraction of data to include in the sample (default = 0.2)
        """
        print('constructing random samples')
        num_rows = len(df)
        len_sample = round(fraction * num_rows)
        # create list of slice index lists
        indices = list(range(0, num_rows))
        slice_list = []
        tmp_idx_list = []
        while len(indices) > 0:
            while len(tmp_idx_list) < len_sample and len(indices) > 0:
                idx = indices.pop(random.randrange(len(indices)))
                tmp_idx_list.append(idx)
            slice_list.append(tmp_idx_list)
            tmp_idx_list = []
        # get slices
        sample_list = []
        for s in range(n_samples):
            try:
                sample_list.append(df.loc[slice_list[s],:])
            except:
                pass
        return sample_list

    def write_csv(self, filename, include_actual=False):
        """
        write results to csv
        - include actual: if actual values are known for test set, and we want to print them
        """
        with open(filename, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            headers = [self.id_column, self.target_column]
            if include_actual:
                headers.append('actual')
            writer.writerow(headers)
            try:
                for idx, value in enumerate(self.output):
                    test_id = self.test_df[self.id_column][idx]
                    test_output = self.output[idx]
                    to_write = [test_id, test_output]
                    if include_actual:
                        to_write.append(self.test_df[self.target_column][idx])
                    writer.writerow(to_write)
                print('results written to ' + filename)
            except:
                print('write_csv failed')

    def save(self, filename='xgb.pkl'):
        joblib.dump(self, filename)

    def load(self, model_file='xgb.pkl'):
        xgb = joblib.load(model_file)
        return xgb
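Relative to Example #6, this variant trains on several random samples and averages their predictions. A hedged sketch, reusing the train_df/test_df built in the sketch after Example #6 and assuming joblib is importable for the per-sample .pkl files.

# Hypothetical usage of the sampling-aware variant; each sample's model is saved as <prefix>_<idx>.pkl.
model = Xgb(train_df,
            target_column='target',
            id_column='id',
            target_type='linear',
            categorical_columns=['feat_b'],
            sample_fraction=0.5,
            n_samples=2,
            num_training_rounds=50,
            prefix='xgb_sampled')
model.train()                                            # trains one model per random sample
mean_pred, std_pred = model.predict(test_df, return_mean_std=True)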
Example #12
def xgbm_model_fit(random_search_flag,
                   x_train,
                   y_train,
                   x_test,
                   y_test,
                   modeltype,
                   multi_label,
                   log_y,
                   num_boost_round=100):
    start_time = time.time()
    if multi_label and not random_search_flag:
        model = num_boost_round
    else:
        rand_params = {
            'learning_rate': sp.stats.uniform(scale=1),
            'gamma': sp.stats.randint(0, 100),
            'n_estimators': sp.stats.randint(100, 500),
            "max_depth": sp.stats.randint(3, 15),
        }

    if modeltype == 'Regression':
        objective = 'reg:squarederror'
        eval_metric = 'rmse'
        shuffle = False
        stratified = False
        num_class = 0
        score_name = 'Score'
        scale_pos_weight = 1
    else:
        if modeltype == 'Binary_Classification':
            objective = 'binary:logistic'
            eval_metric = 'error'  ## don't change to auc or aucpr, since it doesn't work when extracting feature importances later
            shuffle = True
            stratified = True
            num_class = 1
            score_name = 'Error Rate'
            scale_pos_weight = get_scale_pos_weight(y_train)
        else:
            objective = 'multi:softprob'
            eval_metric = 'merror'  ## don't change to auc or aucpr, since it doesn't work when extracting feature importances later
            shuffle = True
            stratified = True
            if multi_label:
                num_class = y_train.nunique().max()
            else:
                if isinstance(y_train, np.ndarray):
                    num_class = np.unique(y_train).max() + 1
                elif isinstance(y_train, pd.Series):
                    num_class = y_train.nunique()
                else:
                    num_class = y_train.nunique().max()
            score_name = 'Multiclass Error Rate'
            scale_pos_weight = 1  ### use sample_weights in multi-class settings ##
    ######################################################
    final_params = {
        'booster': 'gbtree',
        'colsample_bytree': 0.5,
        'alpha': 0.015,
        'gamma': 4,
        'learning_rate': 0.01,
        'max_depth': 8,
        'min_child_weight': 2,
        'reg_lambda': 0.5,
        'subsample': 0.7,
        'random_state': 99,
        'objective': objective,
        'eval_metric': eval_metric,
        'verbosity': 0,
        'n_jobs': -1,
        'scale_pos_weight': scale_pos_weight,
        'num_class': num_class,
        'silent': True
    }
    #######  This is where we split into single and multi label ############
    if multi_label:
        ######   This is for Multi_Label problems ############
        rand_params = {
            'estimator__learning_rate': [0.1, 0.5, 0.01, 0.05],
            'estimator__n_estimators': [50, 100, 150, 200, 250],
            'estimator__gamma': [2, 4, 8, 16, 32],
            'estimator__max_depth': [3, 5, 8, 12],
        }
        if random_search_flag:
            if modeltype == 'Regression':
                clf = XGBRegressor(n_jobs=-1, random_state=999, max_depth=6)
                clf.set_params(**final_params)
                model = MultiOutputRegressor(clf, n_jobs=-1)
            else:
                clf = XGBClassifier(n_jobs=-1, random_state=999, max_depth=6)
                clf.set_params(**final_params)
                model = MultiOutputClassifier(clf, n_jobs=-1)
            if modeltype == 'Regression':
                scoring = 'neg_mean_squared_error'
            else:
                scoring = 'precision'
            model = RandomizedSearchCV(model,
                                       param_distributions=rand_params,
                                       n_iter=15,
                                       return_train_score=True,
                                       random_state=99,
                                       n_jobs=-1,
                                       cv=3,
                                       refit=True,
                                       scoring=scoring,
                                       verbose=False)
            model.fit(x_train, y_train)
            print(
                'Time taken for Hyper Param tuning of multi_label XGBoost (in minutes) = %0.1f'
                % ((time.time() - start_time) / 60))
            cv_results = pd.DataFrame(model.cv_results_)
            print('Mean cross-validated test %s = %0.04f' %
                  (score_name, cv_results['mean_test_score'].mean()))
            ### In this case, there is no boost rounds so just return the default num_boost_round
            return model.best_estimator_
        else:
            try:
                model.fit(x_train, y_train)
            except:
                print(
                    'Multi_label XGBoost model is crashing during training. Please check your inputs and try again...'
                )
            return model
    else:
        #### This is for Single Label Problems #############
        if modeltype == 'Multi_Classification':
            wt_array = get_sample_weight_array(y_train)
            dtrain = xgb.DMatrix(x_train, label=y_train, weight=wt_array)
        else:
            dtrain = xgb.DMatrix(x_train, label=y_train)
        ########   Now let's perform randomized search to find best hyper parameters ######
        if random_search_flag:
            cv_results = xgb.cv(final_params,
                                dtrain,
                                num_boost_round=num_boost_round,
                                nfold=5,
                                stratified=stratified,
                                metrics=eval_metric,
                                early_stopping_rounds=10,
                                seed=999,
                                shuffle=shuffle)
            # Update best eval_metric
            best_eval = 'test-' + eval_metric + '-mean'
            mean_mae = cv_results[best_eval].min()
            boost_rounds = cv_results[best_eval].argmin()
            print("Cross-validated %s = %0.3f in num rounds = %s" %
                  (score_name, mean_mae, boost_rounds))
            print(
                'Time taken for Hyper Param tuning of XGBoost (in minutes) = %0.1f'
                % ((time.time() - start_time) / 60))
            return boost_rounds
        else:
            try:
                model = xgb.train(
                    final_params,
                    dtrain,
                    num_boost_round=num_boost_round,
                    verbose_eval=False,
                )
            except:
                print(
                    'XGBoost model is crashing. Please check your inputs and try again...'
                )
            return model
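For the single-label regression path, the function is apparently meant to be called twice: once with random_search_flag=True to get a cross-validated boosting-round count, then again to train the final booster with that count. A hedged sketch with toy arrays, assuming the snippet's own imports (xgboost as xgb, scipy as sp, time) are in scope.

# Hypothetical two-pass usage for single-label regression; arrays are toy data.
import numpy as np
import xgboost as xgb

x_train = np.random.rand(300, 10)
y_train = np.random.rand(300)
x_test = np.random.rand(100, 10)
y_test = np.random.rand(100)

# pass 1: cross-validate to find the boosting-round count
best_rounds = xgbm_model_fit(True, x_train, y_train, x_test, y_test,
                             modeltype='Regression', multi_label=False,
                             log_y=False, num_boost_round=200)
# pass 2: train the final booster with that count
booster = xgbm_model_fit(False, x_train, y_train, x_test, y_test,
                         modeltype='Regression', multi_label=False,
                         log_y=False, num_boost_round=int(best_rounds))
preds = booster.predict(xgb.DMatrix(x_test))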
Example #13
class Xgb:
    def __init__(self,
                 df,
                 target_column='',
                 id_column='',
                 target_type='binary',
                 categorical_columns=[],
                 drop_columns=[],
                 numeric_columns=[],
                 num_training_rounds=500,
                 verbose=1,
                 sample_fraction=1.0,
                 n_samples=1,
                 early_stopping_rounds=None,
                 prefix='xgb_model',
                 scoring=None):
        """
        input params:
        - df (DataFrame): dataframe of training data
        - target_column (string): name of target column
        - id_column (string): name of id column
        - target_type (string): 'linear' or 'binary'
        - categorical_columns (list): list of column names of categorical data. Will perform one-hot encoding
        - drop_columns (list): list of columns to drop
        - numeric_columns (list): list of columns to convert to numeric
        - verbose (bool): verbosity of printouts
        """
        # checks for sampling
        sample_fraction = float(sample_fraction)
        if sample_fraction > 1:
            sample_fraction = 1.0
        if sample_fraction * n_samples > 1:
            n_samples = round(1.0 / sample_fraction)
        if sample_fraction <= 0:
            print('sample_fraction 0 or negative, switching to 0.1')
            sample_fraction = 0.1
        # if sample_fraction results in a sample smaller than one row
        if round(sample_fraction * len(df)) == 0:
            sample_fraction = 1.0 / len(df)
        # check if data is dataframe
        if type(df) == pd.core.frame.DataFrame:
            self.df = df
            self.early_stopping_rounds = early_stopping_rounds
            if target_column:
                self.target_column = target_column
                self.id_column = id_column
                self.target_type = target_type
                self.categorical_columns = categorical_columns
                self.numeric_columns = numeric_columns
                self.drop_columns = drop_columns
                self.verbose = verbose
                self.sample_fraction = sample_fraction
                self.n_samples = n_samples
                self.num_training_rounds = num_training_rounds
                self.prefix = prefix
                # init the classifier:
                if self.target_type == 'binary':
                    self.scoring = 'auc'
                    self.clf = XGBClassifier(learning_rate=0.1,
                                             n_estimators=num_training_rounds,
                                             subsample=0.8,
                                             colsample_bytree=0.8,
                                             objective='binary:logistic',
                                             scale_pos_weight=1,
                                             seed=123)
                elif self.target_type == 'multiclass':
                    self.scoring = 'merror'
                    self.clf = XGBClassifier(learning_rate=0.1,
                                             n_estimators=num_training_rounds,
                                             subsample=0.8,
                                             colsample_bytree=0.8,
                                             objective='multi:softmax',
                                             scale_pos_weight=1,
                                             seed=123)
                elif self.target_type == 'linear':
                    self.scoring = 'rmse'
                    self.clf = XGBRegressor(n_estimators=num_training_rounds,
                                            objective='reg:linear')
                # if preferred scoring metric is stated:
                if scoring:
                    self.scoring = scoring
            else:
                print('please provide target column name')
        else:
            print('please provide pandas dataframe')

    def train(self):
        print('#### preprocessing ####')
        self.df = self.preprocess(self.df)

        print('#### training ####')
        self.predictors = [
            x for x in self.df.columns
            if x not in [self.target_column, self.id_column]
        ]
        xgb_param = self.clf.get_xgb_params()

        # if subsampling
        if self.sample_fraction == 1.0:
            df_list = [self.df]
        else:
            df_list = self.random_sample(df=self.df,
                                         fraction=self.sample_fraction,
                                         n_samples=self.n_samples)
        print(df_list)
        for idx, current_df in enumerate(df_list):
            print('ITERATION ' + str(idx + 1) + ' of ' + str(self.n_samples) +
                  ', sample_fraction=' + str(self.sample_fraction))
            xgtrain = xgb.DMatrix(current_df[self.predictors],
                                  label=current_df[self.target_column],
                                  missing=np.nan)
            try:
                cvresult = xgb.cv(
                    xgb_param,
                    xgtrain,
                    num_boost_round=self.clf.get_params()['n_estimators'],
                    nfold=5,
                    metrics=[self.scoring],
                    early_stopping_rounds=self.early_stopping_rounds,
                    show_progress=self.verbose)
            except:
                try:
                    cvresult = xgb.cv(
                        xgb_param,
                        xgtrain,
                        num_boost_round=self.clf.get_params()['n_estimators'],
                        nfold=5,
                        metrics=[self.scoring],
                        early_stopping_rounds=self.early_stopping_rounds,
                        verbose_eval=self.verbose)
                except:
                    xgb_param['num_class'] = len(
                        current_df[self.target_column].unique())
                    cvresult = xgb.cv(
                        xgb_param,
                        xgtrain,
                        num_boost_round=self.clf.get_params()['n_estimators'],
                        nfold=5,
                        metrics=[self.scoring],
                        early_stopping_rounds=self.early_stopping_rounds)
            self.clf.set_params(n_estimators=cvresult.shape[0])
            print('fitting model')
            self.clf.fit(current_df[self.predictors],
                         current_df[self.target_column],
                         eval_metric=self.scoring)

            #Predict training set:
            train_df_predictions = self.clf.predict(
                current_df[self.predictors])

            if self.target_type == 'binary' or self.target_type == 'multiclass':
                train_df_predprob = self.clf.predict_proba(
                    current_df[self.predictors])[:, 1]
                print("Accuracy : %.4g" % metrics.accuracy_score(
                    current_df[self.target_column].values,
                    train_df_predictions))
                if self.target_type == 'binary':
                    print("AUC Score (Train): %f" % metrics.roc_auc_score(
                        current_df[self.target_column], train_df_predprob))
            elif self.target_type == 'linear':
                print("Mean squared error: %f" % metrics.mean_squared_error(
                    current_df[self.target_column].values,
                    train_df_predictions))
                print("Root mean squared error: %f" % np.sqrt(
                    metrics.mean_squared_error(
                        current_df[self.target_column].values,
                        train_df_predictions)))
            filename = self.prefix + '_' + str(idx) + '.pkl'
            self.save(filename)

    def predict(self,
                test_df,
                return_multi_outputs=False,
                return_mean_std=False):
        print('### predicting ###')
        print('## preprocessing test set')
        if self.id_column in test_df:
            ids = test_df[self.id_column]
        if self.target_column in test_df.columns:
            targets = test_df[self.target_column]
        self.test_df = self.preprocess(test_df, train=False)
        if self.id_column in test_df:
            self.test_df[self.id_column] = ids
        if self.target_column in test_df.columns:
            self.test_df[self.target_column] = targets
        for col in self.predictors:
            if col not in self.test_df.columns:
                self.test_df[col] = np.nan

        # prediction
        print('## predicting from test set')
        output_list = []
        output = None
        for idx, ns in enumerate(range(self.n_samples)):
            if self.n_samples == 1:
                if self.target_type == 'binary':
                    output = self.clf.predict_proba(
                        self.test_df[self.predictors])[:, 1]
                elif self.target_type == 'linear':
                    output = self.clf.predict(self.test_df[self.predictors])
            else:
                try:
                    filename = self.prefix + '_' + str(idx) + '.pkl'
                    xgb_load = self.load(filename)
                    if self.target_type == 'binary':
                        output = xgb_load.clf.predict_proba(
                            self.test_df[self.predictors])[:, 1]
                    elif self.target_type == 'linear':
                        output = xgb_load.clf.predict(
                            self.test_df[self.predictors])
                    output_list.append(list(output))
                except IOError:
                    print('no file found, skipping')
        # average the outputs if n_samples is more than one
        if self.n_samples == 1:
            self.output = output
            try:
                self.multi_outputs = [list(output)]
            except:
                self.multi_outputs = None
        else:
            self.output = np.mean(output_list, axis=0)
            self.multi_outputs = output_list
        if return_multi_outputs:
            return self.multi_outputs
        elif return_mean_std:
            return (self.output, np.std(output_list, axis=0))
        else:
            return self.output

    def feature_importance(self, num_print=10, display=True):
        feature_importance = sorted(list(
            self.clf.booster().get_fscore().items()),
                                    key=operator.itemgetter(1),
                                    reverse=True)

        impt = pd.DataFrame(feature_importance)
        impt.columns = ['feature', 'importance']
        print(impt[:num_print])
        if display:
            impt[:num_print].plot("feature",
                                  "importance",
                                  kind="barh",
                                  color=sns.color_palette("deep", 3))

    def preprocess(self, df, train=True):
        # one hot encoding of categorical variables
        print('## one hot encoding of categorical variables')
        for col in self.categorical_columns:
            if self.verbose:
                print('one hot encoding: ', col)
            df = pd.concat([
                df,
                pd.get_dummies(
                    df[col]).rename(columns=lambda x: col + '_' + str(x))
            ],
                           axis=1)
            df = df.drop([col], axis=1)

        # if training, determine columns to be removed
        if train:
            # drop columns that are too sparse to be informative
            self.cols_to_remove = []
            print('## dropping columns below sparsity threshold')
            for col in df.columns:
                nan_cnt = 0
                for x in df[col]:
                    try:
                        if np.isnan(x):
                            nan_cnt += 1
                    except:
                        pass
                if nan_cnt / float(
                        len(df[col])
                ) > 0.6:  # arbitrary cutoff, if more than 60% missing then drop
                    if self.verbose:
                        print('will drop', col)
                    self.cols_to_remove.append(col)

            # drop columns that have no standard deviation (not informative)
            print('## dropping columns with no variation')
            for col in df.columns:
                if col is not self.target_column:
                    if df[col].dtype == 'int64' or df[col].dtype == 'float64':
                        if df[col].std() == 0:
                            print('will drop', col)
                            self.cols_to_remove.append(col)
        if self.cols_to_remove:
            if self.verbose:
                print('dropping the following columns:', self.cols_to_remove)
            df = df.drop(self.cols_to_remove, axis=1)

        if self.verbose:
            print('## DataFrame shape is now:', df.shape)

        # convert to numerical where possible
        #print('## converting numerical data to numeric dtype')
        #df = df.convert_objects(convert_numeric=True)

        # convert columns specified to be int and float
        for col in self.numeric_columns:
            if self.verbose:
                print('converting', col)
            df[col] = pd.to_numeric(df[col], errors='coerce')
            if self.verbose:
                print(df[col].dtype)
        df = df.drop(self.drop_columns, axis=1)

        # drop all those that are object type
        print('## dropping non-numerical columns')
        for col in df.columns:
            if df[col].dtype == 'int64' or df[col].dtype == 'float64' or df[
                    col].dtype == 'bool':
                pass
            else:
                if self.verbose:
                    print('dropping because not int, float, or bool:', col)
                df = df.drop([col], axis=1)
        return df

    def _to_int(self, num):
        try:
            return int(num)
        except:
            return

    def _to_float(self, num):
        try:
            return float(num)
        except:
            return

    def random_sample(self, df, fraction=0.2, n_samples=None):
        """
        splits into random samples
        - n_samples: how many samples you want returned (default = All)
        - fraction : what fraction of data to include in the sample (default = 0.2)
        """
        print('constructing random samples')
        num_rows = len(df)
        len_sample = round(fraction * num_rows)
        # create list of slice index lists
        indices = list(range(0, num_rows))
        print('INDICES', indices)
        slice_list = []
        tmp_idx_list = []
        while len(indices) > 0:
            while len(tmp_idx_list) < len_sample and len(indices) > 0:
                idx = indices.pop(random.randrange(len(indices)))
                tmp_idx_list.append(idx)
            slice_list.append(tmp_idx_list)
            tmp_idx_list = []
        # get slices
        sample_list = []
        for s in range(n_samples):
            try:
                sample_list.append(df.loc[slice_list[s], :])
            except:
                pass
        return sample_list

    def write_csv(self, filename, include_actual=False):
        """
        write results to csv
        - include actual: if actual values are known for test set, and we want to print them
        """
        with open(filename, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            headers = [self.id_column, self.target_column]
            if include_actual:
                headers.append('actual')
            writer.writerow(headers)
            try:
                for idx, value in enumerate(self.output):
                    test_id = self.test_df[self.id_column][idx]
                    test_output = self.output[idx]
                    to_write = [test_id, test_output]
                    if include_actual:
                        to_write.append(self.test_df[self.target_column][idx])
                    writer.writerow(to_write)
                print('results written to ' + filename)
            except:
                print('write_csv failed')

    def save(self, filename='xgb.pkl'):
        joblib.dump(self, filename)

    def load(self, model_file='xgb.pkl'):
        xgb = joblib.load(model_file)
        return xgb
Example #14
 'max_depth': [1,2,3,4],
 'min_child_weight':range(4,12,1)
}
gsearch2 = GridSearchCV(estimator = xgb1, 
                        param_grid = param_test2, 
                        scoring='r2',
                        n_jobs=-1,
                        iid=False, 
                        cv=5)
gsearch2.fit(train[predictors],train['y'])
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_

param_test2b = {
 'min_child_weight':range(11, 30)
}
xgb1.set_params(max_depth = 3)
gsearch2b = GridSearchCV(estimator = xgb1, 
                        param_grid = param_test2b, 
                        scoring='r2',
                        n_jobs=-1,
                        iid=False, 
                        cv=5)
gsearch2b.fit(train[predictors],train['y'])
gsearch2b.grid_scores_, gsearch2b.best_params_, gsearch2b.best_score_

# max_depth = 3, min_child_weight = 17

xgb1.set_params(min_child_weight = 17)
param_test3 = {
 'gamma': [x / 10 for x in range(11, 30)]
}
Example #15
# step 2.2: fine tune max_depth, min_child_weight
param_test2 = {'max_depth': [2, 3, 4], 'min_child_weight': range(4, 7, 1)}

gs2 = GridSearchCV(xgb1,
                   param_grid=param_test2,
                   scoring='neg_mean_squared_error',
                   n_jobs=-1,
                   iid=False,
                   cv=outer_cv)
gs2.fit(X, y)
print_grid_scores(gs2)
print('Best parameters: %r' % gs2.best_params_)
print('Best mean test RMSE: %.4f' % (np.sqrt(-gs2.best_score_)))

xgb1.set_params(max_depth=3, min_child_weight=4)

# step 3: tune gamma
#param_test3 = {
# 'gamma':[i/10.0 for i in range(0,5)]
#}

#param_test3 = {
# 'gamma':[i/100.0 for i in range(0,10)]
#}

param_test3 = {'gamma': range(6)}

gs3 = GridSearchCV(xgb1,
                   param_grid=param_test3,
                   scoring='neg_mean_squared_error',
Example #16
    reg_lambda=1,  # [default 1] L2 regularization term on weights
    max_depth=10,  # [default 6] maximum tree depth, also used to limit overfitting; typically 3-10
    min_child_weight=1,  # [default 1] minimum sum of instance weight in a child; larger values keep the model from learning overly local patterns, but too large a value underfits
    n_jobs=1)
"""
dtrain = xgb.DMatrix(X_train, y_train)
xgb_params = clf.get_xgb_params()
cvresult = xgb.cv(xgb_params, dtrain, nfold=5, num_boost_round=2000,
                      early_stopping_rounds=50)
#clf_xgb = xgb.train(xgb_params, dtrain, num_boost_round=cvresult.shape[0])
#fscore = clf_xgb.get_fscore()
#print(cvresult.shape[0], fscore)
print(cvresult.shape[0])
"""
clf.set_params(n_estimators=28)
"""
param_test1 = {
        'max_depth': [i for i in range(3, 12, 2)],
        'min_child_weight': [i for i in range(1, 10, 2)]
    }
best_max_depth = 5
best_min_child_weight = 1
param_test2 = {
        'max_depth': [best_max_depth-1,best_max_depth,best_max_depth+1],
        'min_child_weight': [best_min_child_weight,best_min_child_weight+1]
    }
"""
clf.set_params(max_depth=5, min_child_weight=1)
"""
param_test3 = {
Пример #17
0
 gamma=0.1,
 reg_alpha=2.5,
 reg_lambda=5,
 subsample=0.8,
 colsample_bytree=0.5,
 objective= 'reg:logistic',
 nthread=-1,
 scale_pos_weight=1,
 silent=True,
 tree_method= 'gpu_exact',
 gpu_id= 0,
 seed=0)
X_train,X_test,Y_train,Y_test=train_test_split(X,y,test_size=0.2, random_state=0)
result=xgb.cv(params, xgb.DMatrix(X,label=y), num_boost_round=1000, nfold=8, stratified=False, folds=None,maximize=False, early_stopping_rounds=10,as_pandas=True, verbose_eval=None, show_stdv=True,
       seed=0, callbacks=None, shuffle=True,feval=loss)
xgb1.set_params(n_estimators=result.shape[0])
xgb1.fit(X_train, Y_train)


preds=xgb1.predict(X_train)*(maxy-miny)+miny
# preds=preds+(preds-np.mean(preds))*0.5
cha=(preds - Y_train*(maxy-miny)-miny)
print('train',np.dot(cha,cha.T)/len(cha))


preds=xgb1.predict(X_test)*(maxy-miny)+miny
# preds=preds+(preds-np.mean(preds))*0.5
cha=(preds - Y_test*(maxy-miny)-miny)
print('test',np.dot(cha,cha.T)/len(cha))
print(np.min(preds),np.max(preds),np.mean(preds))
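The two blocks above undo the min-max scaling of the predictions before computing a mean squared error by hand. A minimal helper capturing that computation, shown only as a sketch (the name denorm_mse and this factoring are not part of the original example):

import numpy as np

def denorm_mse(preds_norm, y_norm, miny, maxy):
    # map min-max-scaled values back to the original range, then compute MSE,
    # mirroring the dot-product computation above
    preds = preds_norm * (maxy - miny) + miny
    y = y_norm * (maxy - miny) + miny
    diff = preds - y
    return np.dot(diff, diff.T) / len(diff)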
Пример #18
0
def run_find(x_train, y_train, i, x_predict):

    # find a suitable number of estimators before tuning the other parameters

    clf = XGBRegressor(
        objective='reg:linear',
        learning_rate=0.1,  # [default 0.3] smaller values reduce overfitting; typical range 0.01-0.2
        gamma=0,  # a node is split only if the split reduces the loss; gamma is the minimum loss reduction required, so larger values make the algorithm more conservative
        subsample=0.8,  # row subsampling ratio, 0.5-1; too small underfits, too large overfits
        colsample_bytree=0.8,  # fraction of features used to build each tree
        reg_alpha=1,  # [default 0] L1 regularization term on weights
        reg_lambda=1,  # [default 1] L2 regularization term on weights
        max_depth=10,  # [default 6] maximum tree depth, also used to control overfitting; typical range 3-10
        min_child_weight=1,  # [default 1] minimum sum of instance weight in a child; larger values keep the model from learning overly local patterns, but too large causes underfitting
    )
    nums, fscore = modelfit(clf,
                            x_train,
                            y_train,
                            cv_folds=5,
                            early_stopping_rounds=30,
                            feval=evalerror)
    print('test_estimators:', nums)
    clf.set_params(n_estimators=nums)

    # 1. First tune the two most important parameters: max_depth and min_child_weight
    ## coarse search:
    param_test1 = {
        'max_depth': [i for i in range(3, 12, 2)],
        'min_child_weight': [i for i in range(1, 10, 2)]
    }
    best_params, best_score = find_params(param_test1, clf, x_train, y_train)
    print('model', i, ':')
    print(best_params, ':best_score:', best_score)

    ## fine search:
    max_d = best_params['max_depth']
    min_cw = best_params['min_child_weight']
    param_test2 = {
        'max_depth': [max_d - 1, max_d, max_d + 1],
        'min_child_weight': [min_cw - 1, min_cw, min_cw + 1]
    }
    best_params, best_score = find_params(param_test2, clf, x_train, y_train)
    clf.set_params(max_depth=best_params['max_depth'],
                   min_child_weight=best_params['min_child_weight'])
    print('model', i, ':')
    print(best_params, ':best_score:', best_score)

    # 2. Tune gamma:
    ## coarse search:
    param_test3 = {'gamma': [i / 10.0 for i in range(0, 10, 2)]}
    best_params, best_score = find_params(param_test3, clf, x_train, y_train)
    print('model', i, ':')
    print(best_params, ':best_score:', best_score)

    ## fine search:
    b_gamma = best_params['gamma']
    param_test4 = {'gamma': [b_gamma, b_gamma + 0.1, b_gamma + 0.2]}
    best_params, best_score = find_params(param_test4, clf, x_train, y_train)
    clf.set_params(gamma=best_params['gamma'])
    print('model', i, ':')
    print(best_params, ':best_score:', best_score)

    # 3. Tune subsample and colsample_bytree
    ## coarse search
    param_test5 = {
        'subsample': [i / 10.0 for i in range(6, 10)],
        'colsample_bytree': [i / 10.0 for i in range(6, 10)]
    }
    best_params, best_score = find_params(param_test5, clf, x_train, y_train)
    print('model', i, ':')
    print(best_params, ':best_score:', best_score)

    ## fine search
    b_subsample = best_params['subsample']
    b_colsample_bytree = best_params['colsample_bytree']
    param_test6 = {
        'subsample': [b_subsample - 0.05, b_subsample, b_subsample + 0.05],
        'colsample_bytree': [
            b_colsample_bytree - 0.05, b_colsample_bytree,
            b_colsample_bytree + 0.05
        ]
    }
    best_params, best_score = find_params(param_test6, clf, x_train, y_train)
    clf.set_params(subsample=best_params['subsample'],
                   colsample_bytree=best_params['colsample_bytree'])
    print('model', i, ':')
    print(best_params, ':best_score:', best_score)

    # 4. Tune reg_alpha and reg_lambda
    ## coarse search
    param_test7 = {
        'reg_alpha': [1e-5, 1e-2, 0.1, 1, 2],
        'reg_lambda': [1e-5, 1e-2, 0.1, 1, 2]
    }
    best_params, best_score = find_params(param_test7, clf, x_train, y_train)
    print('model', i, ':')
    print(best_params, ':best_score:', best_score)

    ## fine search
    b_alp = best_params['reg_alpha']
    b_lam = best_params['reg_lambda']
    param_test8 = {
        'reg_alpha': [b_alp, 2 * b_alp, 3 * b_alp],
        'reg_lambda': [b_lam, 2 * b_lam, 3 * b_lam]
    }
    best_params, best_score = find_params(param_test8, clf, x_train, y_train)
    clf.set_params(reg_alpha=best_params['reg_alpha'],
                   reg_lambda=best_params['reg_lambda'])
    print('model', i, ':')
    print(best_params, ':best_score:', best_score)

    # 5. Lower learning_rate and increase the number of boosting rounds
    clf.set_params(learning_rate=0.01)
    nums, fscore = modelfit(clf,
                            x_train,
                            y_train,
                            cv_folds=5,
                            early_stopping_rounds=50,
                            feval=evalerror)
    clf.set_params(n_estimators=nums)

    clf.fit(x_train, y_train)
    y_predict = clf.predict(x_predict)

    return y_predict, fscore
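The example above relies on helpers find_params, modelfit and evalerror that are defined elsewhere in that project. A minimal sketch of what find_params and modelfit plausibly do, assuming grid search with negative-MSE scoring and xgb.cv-based early stopping (the real metric, signatures and return values are not shown in the excerpt):

import xgboost as xgb
from sklearn.model_selection import GridSearchCV


def find_params(param_grid, estimator, x_train, y_train, cv=5):
    # grid-search one group of parameters and return (best_params_, best_score_)
    gsearch = GridSearchCV(estimator=estimator,
                           param_grid=param_grid,
                           scoring='neg_mean_squared_error',
                           n_jobs=-1,
                           cv=cv)
    gsearch.fit(x_train, y_train)
    return gsearch.best_params_, gsearch.best_score_


def modelfit(estimator, x_train, y_train, cv_folds=5,
             early_stopping_rounds=30, feval=None):
    # use xgb.cv with early stopping to pick n_estimators, refit, and
    # return (best_rounds, feature-score dict)
    xgb_params = estimator.get_xgb_params()
    dtrain = xgb.DMatrix(x_train, label=y_train)
    cvresult = xgb.cv(xgb_params, dtrain,
                      num_boost_round=estimator.get_params()['n_estimators'],
                      nfold=cv_folds, feval=feval,
                      early_stopping_rounds=early_stopping_rounds)
    best_rounds = cvresult.shape[0]
    estimator.set_params(n_estimators=best_rounds)
    estimator.fit(x_train, y_train)
    return best_rounds, estimator.get_booster().get_fscore()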
Пример #19
0
 'min_child_weight':range(1,6,2)
}
grid_search(xgb1, param_test1)
# max_depth: 3, min_child_weight: 5

param_test2 = {
 'max_depth': [1,2,3,4],
 'min_child_weight':range(4,10)
}
grid_search(xgb1, param_test2)
# max_depth: 3, min_child_weight: 4

param_test2b = {
 'min_child_weight':range(19, 31)
}
xgb1.set_params(max_depth = 2)
gsearch2b = GridSearchCV(estimator = xgb1, 
                        param_grid = param_test2b, 
                        scoring='r2',
                        n_jobs=-1,
                        iid=False, 
                        cv=5)
gsearch2b.fit(train[predictors],y)
gsearch2b.grid_scores_, gsearch2b.best_params_, gsearch2b.best_score_
# max_depth = 2, min_child_weight = 27

param_test2c = {
 'max_depth': [2,3,4,5],
 'min_child_weight':range(4,31)
}
gsearch2c = GridSearchCV(estimator = xgb1, 
Пример #20
0
def get_XgbRegressor(train_data,
                     train_target,
                     test_data,
                     feature_names,
                     parameters,
                     early_stopping_rounds,
                     num_folds,
                     eval_metric,
                     model_name='model',
                     stratified=False):
    '''
    :param train_data: must be a numpy array
    :param train_target:
    :param parameters:
    :param early_stopping_rounds:
    :param num_folds:
    :param eval_metric: custom callable or a built-in metric string
    :return:
    '''
    reg = XGBRegressor()
    reg.set_params(**parameters)

    # define accumulators
    oof_preds = np.zeros((train_data.shape[0], ))
    sub_preds = np.zeros((test_data.shape[0], ))
    feature_importance_df = pd.DataFrame()
    cv_result = []

    # K-fold
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds,
                                shuffle=True,
                                random_state=1234)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=1234)
    X_train_newfeature = np.zeros((1, 1))
    for n_flod, (train_index,
                 val_index) in enumerate(folds.split(train_data,
                                                     train_target)):
        train_X = train_data[train_index]
        val_X = train_data[val_index]
        train_Y = train_target[train_index]
        val_Y = train_target[val_index]
        # After the initial parameters are set, hold out ~20% as a validation set, build a watchlist of the train and validation sets, and set num_round large enough (e.g. 100000)
        # so you can watch the validation result at every round; once the validation error starts rising, you can stop the run.
        watchlist = [(train_X, train_Y), (val_X, val_Y)]

        # early stopping watches whether the validation eval metric keeps improving; eval_set must be passed, and the last entry of eval_set is used for validation
        reg.fit(train_X,
                train_Y,
                early_stopping_rounds=early_stopping_rounds,
                eval_set=watchlist,
                eval_metric=eval_metric)

        ## generate new GBDT leaf-index features
        new_feature = reg.apply(val_X)
        if X_train_newfeature.shape[0] == 1:
            X_train_newfeature = mergeToOne(val_X, new_feature)
        else:
            X_train_newfeature = np.concatenate(
                (X_train_newfeature, mergeToOne(val_X, new_feature)), axis=0)
        print(X_train_newfeature)
        # store the out-of-fold predictions for this fold
        oof_preds[val_index] = reg.predict(val_X)
        # accumulate test predictions; sum now, divide by the number of folds later
        sub_preds += reg.predict(test_data)
        result = mean_absolute_error(val_Y, reg.predict(val_X))
        print('Fold %2d MAE : %.6f' % (n_flod + 1, result))
        cv_result.append(round(result, 5))
        gc.collect()
        # importance_type defaults to 'gain'; change it in the model parameters if needed
        # save feature importances
        gain = reg.feature_importances_
        fold_importance_df = pd.DataFrame({
            'feature': feature_names,
            'gain': 100 * gain / gain.sum(),
            'fold': n_flod,
        }).sort_values('gain', ascending=False)
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
    # save outputs
    sub_preds = sub_preds / folds.n_splits
    new_feature = reg.apply(test_data)
    X_test_newfeature = mergeToOne(test_data, new_feature)

    if not os.path.isdir('./sub'):
        os.makedirs('./sub')
    pd.DataFrame(oof_preds,
                 columns=['class'
                          ]).to_csv('./sub/val_{}.csv'.format(model_name),
                                    index=False)
    pd.DataFrame(sub_preds,
                 columns=['class'
                          ]).to_csv('./sub/test_{}.csv'.format(model_name),
                                    index=False)
    print('cv_result', cv_result)

    if not os.path.isdir('./gbdt_newfeature'):
        os.makedirs('./gbdt_newfeature')

    np.save("./gbdt_newfeature/train_newfeature.npy", X_train_newfeature)
    np.save("./gbdt_newfeature/test_newfeature.npy", X_test_newfeature)
    save_importances(feature_importance_df, model_name)
    return reg, sub_preds
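The function above also calls mergeToOne and save_importances, which are not included. A plausible one-line mergeToOne (an assumption, not the original code) simply appends the per-tree leaf indices returned by reg.apply() to the raw feature matrix:

import numpy as np

def mergeToOne(X, leaf_indices):
    # stack the original features with the leaf-index features column-wise
    return np.hstack([np.asarray(X), np.asarray(leaf_indices)])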
Пример #21
0
def cross_validation(dtrain, ytrain, predictors):
    # after each parameter is tuned, re-determine the best num_rounds
    dtrain = dtrain[predictors]
    xgb_model = XGBRegressor(
        learning_rate=0.5,
        max_depth=20,
        n_estimators=100,
        min_child_weight=1,
        gamma=0,
        objective='reg:linear',
        nthread=4,
    )
    modelfit(xgb_model, dtrain, ytrain)
    print('tuning learning rate...')
    params = {'learning_rate': [0.01, 0.015, 0.025, 0.05, 0.1]}
    gsearch = GridSearchCV(estimator=xgb_model,
                           param_grid=params,
                           scoring='neg_mean_squared_error',
                           n_jobs=4,
                           iid=False,
                           cv=5)
    gsearch.fit(dtrain.values, ytrain)
    xgb_model.set_params(learning_rate=gsearch.best_params_['learning_rate'])
    print(gsearch.best_params_)

    print('tuning max_depth...')
    params = {'max_depth': [3, 5, 7, 9]}
    print(xgb_model.get_params()['n_estimators'])
    gsearch = GridSearchCV(estimator=xgb_model,
                           param_grid=params,
                           scoring='neg_mean_squared_error',
                           n_jobs=4,
                           iid=False,
                           cv=5)
    gsearch.fit(dtrain.values, ytrain)
    xgb_model.set_params(max_depth=gsearch.best_params_['max_depth'])
    print(gsearch.best_params_)
    #choose best num_round
    modelfit(xgb_model, dtrain, ytrain)
    print(xgb_model.get_params()['n_estimators'])

    print('tuning min_child_weight...')
    param_child_weight = {'min_child_weight': [1, 3, 5, 7]}
    gsearch = GridSearchCV(estimator=xgb_model,
                           param_grid=param_child_weight,
                           scoring='neg_mean_squared_error',
                           n_jobs=4,
                           iid=False,
                           cv=5)
    gsearch.fit(dtrain.values, ytrain)
    xgb_model.set_params(
        min_child_weight=gsearch.best_params_['min_child_weight'])
    print(xgb_model.get_params())
    modelfit(xgb_model, dtrain.values, ytrain)
    print(xgb_model.get_params()['n_estimators'])

    print('tuning gamma...')
    param_gamma = {'gamma': [0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 1]}
    gsearch = GridSearchCV(estimator=xgb_model,
                           param_grid=param_gamma,
                           scoring='neg_mean_squared_error',
                           n_jobs=4,
                           iid=False,
                           cv=5)
    gsearch.fit(dtrain.values, ytrain)
    xgb_model.set_params(gamma=gsearch.best_params_['gamma'])
    print(xgb_model.get_params())
    modelfit(xgb_model, dtrain.values, ytrain)
    print(xgb_model.get_params()['n_estimators'])

    #print('tunning colsample_bylevel')
    #param_colsample_bylevel = {'colsample_bylevel':[0.6,0.8,1]}
    #gsearch = GridSearchCV(estimator = xgb_model,param_grid = param_colsample_bylevel, scoring='neg_mean_squared_error',n_jobs=4,iid=False, cv=5)
    #gsearch.fit(dtrain.values,ytrain)
    #xgb_model.set_params(colsample_bylevel = gsearch.best_params_['colsample_bylevel'])
    #tunning colsample_bytree
    print(xgb_model.get_params())
    modelfit(xgb_model, dtrain.values, ytrain)
    print('num_rounds after tuning colsample_bylevel:%f' %
          xgb_model.get_params()['n_estimators'])

    print('tuning colsample_bytree...')
    param_colsample_bytree = {'colsample_bytree': [0.6, 0.7, 0.8, 1]}
    gsearch = GridSearchCV(estimator=xgb_model,
                           param_grid=param_colsample_bytree,
                           scoring='neg_mean_squared_error',
                           n_jobs=4,
                           iid=False,
                           cv=5)
    gsearch.fit(dtrain.values, ytrain)
    xgb_model.set_params(
        colsample_bytree=gsearch.best_params_['colsample_bytree'])
    print(xgb_model.get_params())
    modelfit(xgb_model, dtrain.values, ytrain)
    print('num_rounds after tuning colsample_bytree:%f' %
          xgb_model.get_params()['n_estimators'])
    # save and return model
    cur_time = time.strftime("%Y-%m-%d-%H-%M", time.localtime())
    pickle.dump(
        xgb_model,
        open('../models/autogridsearch_xgb_' + cur_time + '.model', 'wb'))
    cv_score(xgb_model, dtrain.values, ytrain)
    return xgb_model
Пример #22
0
        reg_lambda=1,  # [default 1] L2 regularization term on weights
        max_depth=10,  # [default 6] maximum tree depth, also used to control overfitting; typical range 3-10
        min_child_weight=1,  # [default 1] minimum sum of instance weight in a child; larger values keep the model from learning overly local patterns, but too large causes underfitting
        n_jobs=1
)
"""
dtrain = xgb.DMatrix(X_train, y_train)
xgb_params = clf.get_xgb_params()
cvresult = xgb.cv(xgb_params, dtrain, nfold=5, num_boost_round=2000,
                      early_stopping_rounds=50)
#clf_xgb = xgb.train(xgb_params, dtrain, num_boost_round=cvresult.shape[0])
#fscore = clf_xgb.get_fscore()
#print(cvresult.shape[0], fscore)
print(cvresult.shape[0])
"""
clf.set_params(n_estimators=10)
"""
param_test1 = {
        'max_depth': [i for i in range(3, 12, 2)],
        'min_child_weight': [i for i in range(1, 10, 2)]
    }
best_max_depth = 3
best_min_child_weight = 9
param_test2 = {
        'max_depth': [i for i in range(3, 12, 2)],
        'min_child_weight': [i for i in range(1, 10, 2)]
    }
"""
clf.set_params(max_depth=3,min_child_weight=9)
"""
param_test3 = {
Пример #23
0
    def fit(self, inputs_train, labels_train, fit_options={}):
        xgb_reg = XGBRegressor(random_state=self.options['seed'])

        print('Starting with low learning rate and tuning: \
            max_depth, min_child_weight, n_estimators')

        params = {
            "learning_rate": [0.1],  # np.arange(0.05,0.45,0.05), #eta
            # np.arange(2,14,2),
            "max_depth": self.options['max_depth'],
            # np.arange(1,7,6),
            "min_child_weight": self.options['min_child_weight'],
            # np.arange(10,80,10),
            "n_estimators": self.options['n_estimators'],
            "colsample_bytree": [0.8],
            "subsample": [0.8],
            "gamma": [0],
        }

        GSCV = GridSearchCV(
            xgb_reg,  # , #np.arange(0.05,0.45,0.05), #eta),
            params,
            cv=self.options['cv'],
            scoring=self.options['scoring'],
            n_jobs=self.options['n_jobs'],
            verbose=self.options['verbose'],  # verbose,
            return_train_score=True)

        GSCV.fit(inputs_train, labels_train)

        print('best_params_:', GSCV.best_params_)  # ,
        print('best_score_:', GSCV.best_score_)

        print('Tuning: gamma')
        params = {
            "learning_rate": [0.1],  # np.arange(0.05,0.45,0.05), #eta
            "max_depth": [GSCV.best_params_['max_depth']],
            "min_child_weight": [GSCV.best_params_['min_child_weight']],
            "n_estimators": [GSCV.best_params_['n_estimators']],
            "colsample_bytree": [0.8],
            "subsample": [0.8],
            # np.arange(0.05,0.45,0.05),
            "gamma": self.options['gamma'],
        }

        GSCV = GridSearchCV(
            xgb_reg,  # , #np.arange(0.05,0.45,0.05), #eta),
            params,
            cv=self.options['cv'],
            scoring=self.options['scoring'],
            n_jobs=self.options['n_jobs'],
            verbose=self.options['verbose'],  # verbose,
            return_train_score=True)

        GSCV.fit(inputs_train, labels_train)

        print('best_params_:', GSCV.best_params_)  # ,
        print('best_score_:', GSCV.best_score_)

        print('Tuning: colsample_bytree, subsample')

        params = {
            "learning_rate": [0.1],  # np.arange(0.05,0.45,0.05), #eta
            "max_depth": [GSCV.best_params_['max_depth']],
            "min_child_weight": [GSCV.best_params_['min_child_weight']],
            "n_estimators": [GSCV.best_params_['n_estimators']],
            "gamma": [GSCV.best_params_['gamma']],

            # np.arange(0.60, 0.95, 0.05),
            "colsample_bytree": self.options['colsample_bytree'],
            # np.arange(0.60, 0.95, 0.05),
            "subsample": self.options['subsample'],
        }

        GSCV = GridSearchCV(
            xgb_reg,  # , #np.arange(0.05,0.45,0.05), #eta),
            params,
            cv=self.options['cv'],
            scoring=self.options['scoring'],
            n_jobs=self.options['n_jobs'],
            verbose=self.options['verbose'],  # verbose,
            return_train_score=True)

        GSCV.fit(inputs_train, labels_train)

        print('best_params_:', GSCV.best_params_)  # ,
        print('best_score_:', GSCV.best_score_)

        print('Tuning: reg_alpha, reg_lambda')

        params = {
            "learning_rate": [0.1],  # np.arange(0.05,0.45,0.05), #eta
            "max_depth": [GSCV.best_params_['max_depth']],
            "min_child_weight": [GSCV.best_params_['min_child_weight']],
            "n_estimators": [GSCV.best_params_['n_estimators']],
            "gamma": [GSCV.best_params_['gamma']],
            "colsample_bytree": [GSCV.best_params_['colsample_bytree']],
            "subsample": [GSCV.best_params_['subsample']],

            # ,[1e-5, 1e-2, 0.1, 1, 10], #alpha
            "reg_alpha": self.options['reg_alpha'],
            # [1e-5, 1e-2, 0.1, 1, 10],#lambda
            "reg_lambda": self.options['reg_lambda'],
        }

        GSCV = GridSearchCV(
            xgb_reg,  # , #np.arange(0.05,0.45,0.05), #eta),
            params,
            cv=self.options['cv'],
            scoring=self.options['scoring'],
            n_jobs=self.options['n_jobs'],
            verbose=self.options['verbose'],  # verbose,
            return_train_score=True)

        GSCV.fit(inputs_train, labels_train)

        print('best_params_:', GSCV.best_params_)  # ,
        print('best_score_:', GSCV.best_score_)

        print('Tuning: learning_rate')

        params = {
            # np.arange(0.025,0.150,0.025), #np.arange(0.05,0.45,0.05), #eta
            "learning_rate": self.options['learning_rate'],
            "max_depth": [GSCV.best_params_['max_depth']],
            "min_child_weight": [GSCV.best_params_['min_child_weight']],
            "n_estimators": [GSCV.best_params_['n_estimators']],
            "gamma": [GSCV.best_params_['gamma']],
            "colsample_bytree": [GSCV.best_params_['colsample_bytree']],
            "subsample": [GSCV.best_params_['subsample']],
            "reg_alpha": [GSCV.best_params_['reg_alpha']],  # alpha
            "reg_lambda": [GSCV.best_params_['reg_lambda']]  # lambda
        }

        GSCV = GridSearchCV(
            xgb_reg,  # , #np.arange(0.05,0.45,0.05), #eta),
            params,
            cv=self.options['cv'],
            scoring=self.options['scoring'],
            n_jobs=self.options['n_jobs'],
            verbose=self.options['verbose'],  # verbose,
            return_train_score=True)

        GSCV.fit(inputs_train, labels_train)

        print('best_params_:', GSCV.best_params_)  # ,
        print('best_score_:', GSCV.best_score_)

        print('Final model')

        # Regression
        regressor = XGBRegressor(random_state=self.options['seed'])  # seed)
        regressor.set_params(**GSCV.best_params_)
        trained_regressor = regressor.fit(inputs_train, labels_train)
        self.regressor = trained_regressor
        self.feature_importances_ = self.regressor.feature_importances_
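The staged search above reads every setting from self.options. The dict below lists the keys the method expects, with illustrative grids taken from the commented-out hints in the code; the concrete values are assumptions, not part of the original:

options = {
    'seed': 0,
    'cv': 5,
    'scoring': 'neg_mean_squared_error',
    'n_jobs': -1,
    'verbose': 1,
    'max_depth': list(range(2, 14, 2)),
    'min_child_weight': [1, 3, 5],
    'n_estimators': list(range(10, 80, 10)),
    'gamma': [0, 0.05, 0.1, 0.2],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9],
    'subsample': [0.6, 0.7, 0.8, 0.9],
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 10],
    'reg_lambda': [1e-5, 1e-2, 0.1, 1, 10],
    'learning_rate': [0.025, 0.05, 0.075, 0.1, 0.125],
}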
Пример #24
0
class XGBoost(BaseModel):
    """XGBoost Class."""
    def __init__(self,
                 XGBoost_objective,
                 tuning_metric,
                 trials='trials',
                 bottom_coding=None,
                 transform=None,
                 **kwargs):
        """Initialize hyperparameters."""
        super(XGBoost, self).__init__(bottom_coding=bottom_coding,
                                      transform=transform)

        self.model = XGBRegressor
        self.tuning_metric = tuning_metric
        self.objective = XGBoost_objective
        self.trials = Trials() \
            if trials == 'trials' \
            else MongoTrials('mongo://localhost:1234/foo_db/jobs',
                             exp_key='exp1')
        self.set_parameters()

    def set_parameters(self):
        self.space = {
            'n_estimators':
            hp.choice('n_estimators', list(range(100, 5000, 900))),
            'max_depth':
            hp.choice('max_depth', list(range(3, 10, 3))),
            'min_child_weight':
            hp.choice('min_child_weight', list(range(1, 10, 4))),
            'subsample':
            hp.choice('subsample', [i / 100.0 for i in range(75, 100, 10)]),
            'gamma':
            hp.choice('gamma', [i / 10.0 for i in range(0, 5, 2)]),
            'colsample_bytree':
            hp.quniform('colsample_bytree', 0.75, 1, 0.05),
            'objective':
            self.objective,
            'booster':
            'dart',
            'tree_method':
            'gpu_exact',
            'n_gpu':
            1,
            'silent':
            1,
            'learning_rate':
            0.1,
            'scale_pos_weight':
            1
        }

    def tune(self, training_set, logger=None, saver=None):
        self.training_set = training_set
        objective = generate_objective(self.training_set, self.model)
        best = space_eval(
            self.space,
            fmin(fn=objective,
                 space=self.space,
                 trials=self.trials,
                 algo=tpe.suggest,
                 max_evals=self.max_evals))
        print(f'Best hyperparams: {best}')

        self.model = XGBRegressor()
        self.model.set_params(**best)
        self.model.fit(training_set.X, training_set.y)

    def instantiate_model(self, params):
        model = XGBRegressor()
        model.set_params(**params)
        return model
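tune() above hands a generate_objective(training_set, model) callable to hyperopt's fmin. A minimal sketch of such a factory, assuming cross-validated negative MSE as the loss (the original implementation is not shown):

from hyperopt import STATUS_OK
from sklearn.model_selection import cross_val_score


def generate_objective(training_set, model_cls):
    # build a hyperopt objective: evaluate the sampled hyperparameters with CV
    # and return the loss to minimise
    def objective(params):
        model = model_cls()
        model.set_params(**params)
        score = cross_val_score(model, training_set.X, training_set.y,
                                scoring='neg_mean_squared_error', cv=5).mean()
        return {'loss': -score, 'status': STATUS_OK}
    return objective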
Пример #25
0
def cross_validation(dtrain,ytrain,predictors):
    # after each parameter is tuned, re-determine the best num_rounds
    dtrain = dtrain[predictors]
    xgb_model = XGBRegressor(
                learning_rate= 0.5,
                max_depth = 20,
                n_estimators = 100,
                min_child_weight = 1,
                gamma = 0,
                objective='reg:linear',
                nthread=4,
                )
    modelfit(xgb_model,dtrain,ytrain)
    print('tuning learning rate...')
    params = {'learning_rate':[0.01,0.015,0.025,0.05,0.1]}
    gsearch = GridSearchCV(estimator = xgb_model,param_grid = params, scoring = 'neg_mean_squared_error',n_jobs = 4,iid=False,cv=5)
    gsearch.fit(dtrain.values,ytrain)
    xgb_model.set_params(learning_rate = gsearch.best_params_['learning_rate'])
    print(gsearch.best_params_)

    print('tuning max_depth...')
    params = { 'max_depth':[3,5,7,9]}
    print(xgb_model.get_params()['n_estimators'])
    gsearch = GridSearchCV(estimator = xgb_model,param_grid = params, scoring='neg_mean_squared_error',n_jobs=4,iid=False, cv=5)
    gsearch.fit(dtrain.values,ytrain)
    xgb_model.set_params(max_depth = gsearch.best_params_['max_depth'])
    print(gsearch.best_params_)
    #choose best num_round
    modelfit(xgb_model,dtrain,ytrain)
    print(xgb_model.get_params()['n_estimators'])
    
    print('tuning min_child_weight...')
    param_child_weight = {'min_child_weight':[1,3,5,7]}
    gsearch = GridSearchCV(estimator = xgb_model,param_grid = param_child_weight, scoring='neg_mean_squared_error',n_jobs=4,iid=False, cv=5)
    gsearch.fit(dtrain.values,ytrain)
    xgb_model.set_params(min_child_weight = gsearch.best_params_['min_child_weight'])
    print(xgb_model.get_params())
    modelfit(xgb_model,dtrain.values,ytrain)
    print(xgb_model.get_params()['n_estimators'])

    print('tuning gamma...')
    param_gamma = {'gamma':[0.05,0.1,0.3,0.5,0.7,0.9,1]}
    gsearch = GridSearchCV(estimator = xgb_model,param_grid = param_gamma, scoring='neg_mean_squared_error',n_jobs=4,iid=False, cv=5)
    gsearch.fit(dtrain.values,ytrain)
    xgb_model.set_params(gamma = gsearch.best_params_['gamma'])
    print(xgb_model.get_params())
    modelfit(xgb_model,dtrain.values,ytrain)
    print(xgb_model.get_params()['n_estimators'])

    #print('tunning colsample_bylevel')
    #param_colsample_bylevel = {'colsample_bylevel':[0.6,0.8,1]}
    #gsearch = GridSearchCV(estimator = xgb_model,param_grid = param_colsample_bylevel, scoring='neg_mean_squared_error',n_jobs=4,iid=False, cv=5)
    #gsearch.fit(dtrain.values,ytrain)
    #xgb_model.set_params(colsample_bylevel = gsearch.best_params_['colsample_bylevel'])
    #tunning colsample_bytree
    print(xgb_model.get_params())
    modelfit(xgb_model,dtrain.values,ytrain)
    print('num_rounds after tuning colsample_bylevel:%f'%xgb_model.get_params()['n_estimators'])

    print('tuning colsample_bytree...')
    param_colsample_bytree = {'colsample_bytree':[0.6,0.7,0.8,1]}
    gsearch = GridSearchCV(estimator = xgb_model,param_grid = param_colsample_bytree, scoring='neg_mean_squared_error',n_jobs=4,iid=False, cv=5)
    gsearch.fit(dtrain.values,ytrain)
    xgb_model.set_params(colsample_bytree = gsearch.best_params_['colsample_bytree'])
    print(xgb_model.get_params())
    modelfit(xgb_model,dtrain.values,ytrain)
    print('num_rounds after tuning colsample_bytree:%f'%xgb_model.get_params()['n_estimators'])
    # save and return model
    cur_time = time.strftime("%Y-%m-%d-%H-%M",time.localtime())
    pickle.dump(xgb_model,open('../models/autogridsearch_xgb_'+cur_time+'.model','wb'))
    cv_score(xgb_model,dtrain.values,ytrain)
    return xgb_model
def train(x_train, y_train, x_valid, y_valid, n_estimators_0, objective,
          eval_metric, scoring, rmspe_xg, kfold, esr):
    # 1 - set initial parameter values
    print("1 - set initial parameter values")
    reg = XGBRegressor(
        # General Parameters
        booster="gbtree",
        silent=1,
        nthread=-1,
        n_jobs=-1,
        # Booster Parameters
        learning_rate=0.1,
        n_estimators=n_estimators_0,
        gamma=0,
        max_depth=7,
        min_child_weight=0.001,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_alpha=0,
        reg_lambda=1,
        max_delta_step=0,
        scale_pos_weight=1,
        # Learning Task Parameters
        objective=objective,
        eval_metric=eval_metric,
        seed=0)

    # 2 - find the optimal number of weak learners: n_estimators_1
    print("2 - find the optimal number of weak learners: n_estimators_1")
    xgb_param = reg.get_xgb_params()
    d_train = xgb.DMatrix(x_train, y_train)
    d_valid = xgb.DMatrix(x_valid, y_valid)
    watchlist = [(d_train, "train"), (d_valid, "valid")]

    t_begin = pd.Timestamp.now()
    xgb_cv = xgb.cv(
        params=xgb_param,
        dtrain=d_train,
        num_boost_round=xgb_param["n_estimators"],
        nfold=kfold,
        feval=rmspe_xg,
        #metrics=eval_metric,
        early_stopping_rounds=int(xgb_param["n_estimators"] / esr),
        verbose_eval=None)
    t1 = pd.Timestamp.now()
    n_estimators_1 = xgb_cv.shape[0]
    reg.set_params(n_estimators=n_estimators_1)
    xgb_param = reg.get_xgb_params()
    print("分类器个数:%s, 用时:%s" % (n_estimators_1, (t1 - t_begin)))

    # 3 - coarse search: learning_rate
    print("3 - coarse search: learning_rate")
    param = {"learning_rate": [0.1, 0.2, 0.3]}
    reg_gscv = GridSearchCV(estimator=reg,
                            param_grid=param,
                            scoring=scoring,
                            n_jobs=-1,
                            iid=False,
                            cv=kfold)

    t0 = pd.Timestamp.now()
    model_3 = reg_gscv.fit(x_train, y_train)
    t1 = pd.Timestamp.now()
    #model_3.grid_scores_; model_3.best_score_; model_3.best_estimator_
    best_param = model_3.best_params_["learning_rate"]
    reg.set_params(learning_rate=best_param)
    xgb_param = reg.get_xgb_params()
    print("learning_rate:%s, 用时:%s" % (best_param, (t1 - t0)))

    # 4 - coarse search: max_depth, min_child_weight
    print("4 - coarse search: max_depth, min_child_weight")
    param = {
        "max_depth": [3, 5, 7, 9, 11],
        "min_child_weight": [0.001, 0.01, 0.1, 1]
    }
    reg_gscv = GridSearchCV(estimator=reg,
                            param_grid=param,
                            scoring=scoring,
                            n_jobs=-1,
                            iid=False,
                            cv=kfold)

    t0 = pd.Timestamp.now()
    model_4 = reg_gscv.fit(x_train, y_train)
    t1 = pd.Timestamp.now()
    best_param_1 = model_4.best_params_["max_depth"]
    best_param_2 = model_4.best_params_["min_child_weight"]
    print("max_depth:%s,min_child_weight:%s,用时:%s" %
          (best_param_1, best_param_2, (t1 - t0)))

    # 5 - fine search: max_depth
    print("5 - fine search: max_depth")
    param = {"max_depth": [best_param_1 - 1, best_param_1, best_param_1 + 1]}
    reg_gscv = GridSearchCV(estimator=reg,
                            param_grid=param,
                            scoring=scoring,
                            n_jobs=-1,
                            iid=False,
                            cv=kfold)

    t0 = pd.Timestamp.now()
    model_5 = reg_gscv.fit(x_train, y_train)
    t1 = pd.Timestamp.now()
    best_param_1 = model_5.best_params_["max_depth"]
    reg.set_params(max_depth=best_param_1)
    xgb_param = reg.get_xgb_params()
    print("max_depth:%s,用时:%s" % (best_param_1, (t1 - t0)))

    # 6 - coarse search: gamma
    print("6 - coarse search: gamma")
    param = {"gamma": [0, 0.5, 1, 1.5, 2, 2.5]}
    reg_gscv = GridSearchCV(estimator=reg,
                            param_grid=param,
                            scoring=scoring,
                            n_jobs=-1,
                            iid=False,
                            cv=kfold)

    t0 = pd.Timestamp.now()
    model_6 = reg_gscv.fit(x_train, y_train)
    t1 = pd.Timestamp.now()
    best_param = model_6.best_params_["gamma"]
    print("gamma:%s,用时:%s" % (best_param, (t1 - t0)))

    # 7 - fine search: gamma
    print("7 - fine search: gamma")
    if best_param == 0:
        param = {"gamma": [0, 0.1, 0.2, 0.3, 0.4]}
    else:
        param = {"gamma": np.arange(best_param - 0.2, best_param + 0.3, 0.1)}
    reg_gscv = GridSearchCV(estimator=reg,
                            param_grid=param,
                            scoring=scoring,
                            n_jobs=-1,
                            iid=False,
                            cv=kfold)

    t0 = pd.Timestamp.now()
    model_7 = reg_gscv.fit(x_train, y_train)
    t1 = pd.Timestamp.now()
    best_param = model_7.best_params_["gamma"]
    reg.set_params(gamma=best_param)
    xgb_param = reg.get_xgb_params()
    print("gamma:%s,用时:%s" % (best_param, (t1 - t0)))

    # 8 - re-tune the optimal number of weak learners: n_estimators_2
    print("8 - re-tune the optimal number of weak learners: n_estimators_2")
    reg.set_params(n_estimators=n_estimators_0)
    xgb_param = reg.get_xgb_params()

    t0 = pd.Timestamp.now()
    xgb_cv = xgb.cv(
        params=xgb_param,
        dtrain=d_train,
        num_boost_round=xgb_param["n_estimators"],
        nfold=kfold,
        feval=rmspe_xg,
        #metrics=eval_metric,
        early_stopping_rounds=int(xgb_param["n_estimators"] / esr),
        verbose_eval=None)
    t1 = pd.Timestamp.now()
    n_estimators_2 = xgb_cv.shape[0]
    reg.set_params(n_estimators=n_estimators_2)
    xgb_param = reg.get_xgb_params()
    print("分类器个数:%s, 用时:%s" % (n_estimators_2, (t1 - t0)))

    # 9 - coarse search: subsample, colsample_bytree
    print("9 - coarse search: subsample, colsample_bytree")
    param = {
        "subsample": [0.6, 0.7, 0.8, 0.9],
        "colsample_bytree": [0.6, 0.7, 0.8, 0.9]
    }
    reg_gscv = GridSearchCV(estimator=reg,
                            param_grid=param,
                            scoring=scoring,
                            n_jobs=-1,
                            iid=False,
                            cv=kfold)

    t0 = pd.Timestamp.now()
    model_8 = reg_gscv.fit(x_train, y_train)
    t1 = pd.Timestamp.now()
    best_param_1 = model_8.best_params_["subsample"]
    best_param_2 = model_8.best_params_["colsample_bytree"]
    print("subsample:%s,colsample_bytree:%s,用时:%s" %
          (best_param_1, best_param_2, (t1 - t0)))

    # 10 - fine search: subsample, colsample_bytree
    print("10 - fine search: subsample, colsample_bytree")
    param = {
        "subsample": [best_param_1 - 0.05, best_param_1, best_param_1 + 0.05],
        "colsample_bytree":
        [best_param_2 - 0.05, best_param_2, best_param_2 + 0.05]
    }
    reg_gscv = GridSearchCV(estimator=reg,
                            param_grid=param,
                            scoring=scoring,
                            n_jobs=-1,
                            iid=False,
                            cv=kfold)

    t0 = pd.Timestamp.now()
    model_9 = reg_gscv.fit(x_train, y_train)
    t1 = pd.Timestamp.now()
    best_param_1 = model_9.best_params_["subsample"]
    best_param_2 = model_9.best_params_["colsample_bytree"]
    reg.set_params(subsample=best_param_1, colsample_bytree=best_param_2)
    xgb_param = reg.get_xgb_params()
    print("subsample:%s,colsample_bytree:%s,用时:%s" %
          (best_param_1, best_param_2, (t1 - t0)))

    # 11 - coarse search: reg_alpha
    print("11 - coarse search: reg_alpha")
    param = {"reg_alpha": [0, 1, 2, 3]}
    reg_gscv = GridSearchCV(estimator=reg,
                            param_grid=param,
                            scoring=scoring,
                            n_jobs=-1,
                            iid=False,
                            cv=kfold)

    t0 = pd.Timestamp.now()
    model_11 = reg_gscv.fit(x_train, y_train)
    t1 = pd.Timestamp.now()
    best_param = model_11.best_params_["reg_alpha"]
    reg.set_params(reg_alpha=best_param)
    xgb_param = reg.get_xgb_params()
    print("reg_alpha:%s,用时:%s" % (best_param, (t1 - t0)))

    # 12 - fine search: reg_alpha
    print("12 - fine search: reg_alpha")
    if best_param == 0:
        param = {"reg_alpha": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}
    else:
        param = {
            "reg_alpha": np.arange(best_param - 0.5, best_param + 0.5, 0.2)
        }
    reg_gscv = GridSearchCV(estimator=reg,
                            param_grid=param,
                            scoring=scoring,
                            n_jobs=-1,
                            iid=False,
                            cv=kfold)

    t0 = pd.Timestamp.now()
    model_12 = reg_gscv.fit(x_train, y_train)
    t1 = pd.Timestamp.now()
    best_param = model_12.best_params_["reg_alpha"]
    reg.set_params(reg_alpha=best_param)
    xgb_param = reg.get_xgb_params()
    print("reg_alpha:%s,用时:%s" % (best_param, (t1 - t0)))

    # 13 - coarse search: reg_lambda
    print("13 - coarse search: reg_lambda")
    param = {"reg_lambda": [1, 3, 5, 7]}
    reg_gscv = GridSearchCV(estimator=reg,
                            param_grid=param,
                            scoring=scoring,
                            n_jobs=-1,
                            iid=False,
                            cv=kfold)

    t0 = pd.Timestamp.now()
    model_13 = reg_gscv.fit(x_train, y_train)
    t1 = pd.Timestamp.now()
    best_param = model_13.best_params_["reg_lambda"]
    reg.set_params(reg_lambda=best_param)
    xgb_param = reg.get_xgb_params()
    print("reg_lambda:%s,用时:%s" % (best_param, (t1 - t0)))

    # 14 - fine search: reg_lambda
    print("14 - fine search: reg_lambda")
    param = {"reg_lambda": np.arange(best_param - 1, best_param + 1, 0.2)}
    reg_gscv = GridSearchCV(estimator=reg,
                            param_grid=param,
                            scoring=scoring,
                            n_jobs=-1,
                            iid=False,
                            cv=kfold)

    t0 = pd.Timestamp.now()
    model_14 = reg_gscv.fit(x_train, y_train)
    t1 = pd.Timestamp.now()
    best_param = model_14.best_params_["reg_lambda"]
    reg.set_params(reg_lambda=best_param)
    xgb_param = reg.get_xgb_params()
    print("reg_lambda:%s,用时:%s" % (best_param, (t1 - t0)))

    # 15 - fine search: max_delta_step, scale_pos_weight
    print("15 - fine search: max_delta_step, scale_pos_weight")
    param = {"max_delta_step": [0, 1, 3, 5], "scale_pos_weight": [1, 3, 5, 7]}
    reg_gscv = GridSearchCV(estimator=reg,
                            param_grid=param,
                            scoring=scoring,
                            n_jobs=-1,
                            iid=False,
                            cv=kfold)

    t0 = pd.Timestamp.now()
    model_12 = reg_gscv.fit(x_train, y_train)
    t1 = pd.Timestamp.now()
    best_param_1 = model_12.best_params_["max_delta_step"]
    best_param_2 = model_12.best_params_["scale_pos_weight"]
    reg.set_params(max_delta_step=best_param_1, scale_pos_weight=best_param_2)
    xgb_param = reg.get_xgb_params()
    print("max_delta_step:%s,scale_pos_weight:%s,用时:%s" %
          (best_param_1, best_param_2, (t1 - t0)))

    # 16 - re-tune the optimal number of weak learners: n_estimators_3
    print("16 - re-tune the optimal number of weak learners: n_estimators_3")
    reg.set_params(n_estimators=n_estimators_0)
    xgb_param = reg.get_xgb_params()

    t0 = pd.Timestamp.now()
    xgb_cv = xgb.cv(
        params=xgb_param,
        dtrain=d_train,
        num_boost_round=xgb_param["n_estimators"],
        nfold=kfold,
        feval=rmspe_xg,
        #metrics=eval_metric,
        early_stopping_rounds=int(xgb_param["n_estimators"] / esr),
        verbose_eval=None)
    t1 = pd.Timestamp.now()
    n_estimators_3 = xgb_cv.shape[0]
    reg.set_params(n_estimators=n_estimators_3)
    xgb_param = reg.get_xgb_params()
    print("分类器个数:%s, 用时:%s" % (n_estimators_3, (t1 - t0)))

    # 17 - fine search: learning_rate
    print("17 - fine search: learning_rate")
    lr = xgb_param["learning_rate"]
    param = {"learning_rate": [lr - 0.05, lr, lr + 0.05]}
    reg_gscv = GridSearchCV(estimator=reg,
                            param_grid=param,
                            scoring=scoring,
                            n_jobs=-1,
                            iid=False,
                            cv=kfold)

    t0 = pd.Timestamp.now()
    model_16 = reg_gscv.fit(x_train, y_train)
    t_1 = pd.Timestamp.now()
    best_param = model_16.best_params_["learning_rate"]
    reg.set_params(learning_rate=best_param)
    xgb_param = reg.get_xgb_params()
    print("learning_rate:%s,用时:%s" % (best_param, (t_1 - t0)))

    # 18 - final training
    print("18 - final training")
    model_res = xgb.train(params=xgb_param,
                          dtrain=d_train,
                          num_boost_round=xgb_param["n_estimators"],
                          evals=watchlist,
                          feval=rmspe_xg,
                          early_stopping_rounds=int(xgb_param["n_estimators"] /
                                                    esr))
    t_end = pd.Timestamp.now()
    print("参数训练完毕,总用时:%s" % (t_end - t_begin))
    return model_res, reg
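train() above takes a custom rmspe_xg eval function that is passed to xgb.cv and xgb.train. A typical definition, shown here only as a sketch since the original is not in the excerpt, computes root mean squared percentage error over non-zero targets:

import numpy as np

def rmspe_xg(preds, dtrain):
    # xgboost custom eval: return (metric_name, value); lower is better
    labels = dtrain.get_label()
    mask = labels != 0
    rmspe = np.sqrt(np.mean(((labels[mask] - preds[mask]) / labels[mask]) ** 2))
    return 'rmspe', rmspe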
Пример #27
0
    'max_depth': [4, 5, 6],  # initial best is 5, check around 5
    'min_child_weight': range(2, 5, 1)  # initial best is 3, check 2,3,4
}

gs2 = GridSearchCV(xgb1,
                   param_grid=param_test2,
                   scoring='neg_mean_squared_error',
                   n_jobs=-1,
                   iid=False,
                   cv=outer_cv)
gs2.fit(X, y)
print_grid_scores(gs2)
print('Best parameters: %r' % gs2.best_params_)
print('Best mean test RMSE: %.5f' % (np.sqrt(-gs2.best_score_)))

xgb1.set_params(max_depth=5, min_child_weight=3)

# step 3: tune gamma
#param_test3 = {
# 'gamma':[i/10.0 for i in range(0,5)]
#}

param_test3 = {'gamma': [i / 100.0 for i in range(0, 10)]}

#param_test3 = {
# 'gamma':range(6)
#}

gs3 = GridSearchCV(xgb1,
                   param_grid=param_test3,
                   scoring='neg_mean_squared_error',
Пример #28
0
                    colsample_bytree=0.8,
                    objective='reg:gamma',
                    nthread=4,
                    scale_pos_weight=1,
                    seed=1024)

#####parameter 1max_depth

xgb_param = xgb1.get_xgb_params()
cvresult = xgb.cv(xgb_param,
                  Dtrain,
                  num_boost_round=xgb1.get_params()['n_estimators'],
                  nfold=5,
                  metrics='rmse',
                  early_stopping_rounds=50)
xgb1.set_params(n_estimators=cvresult.shape[0])

param_test1 = {
    'max_depth': [3, 4, 5, 6, 7],
    'min_child_weight': [3, 4, 5, 6, 7]
}
gsearch1 = GridSearchCV(estimator=XGBRegressor(learning_rate=0.1,
                                               n_estimators=1000,
                                               gamma=0,
                                               subsample=0.8,
                                               colsample_bytree=0.8,
                                               objective='reg:gamma',
                                               nthread=4,
                                               scale_pos_weight=1,
                                               seed=27),
                        param_grid=param_test1,
Пример #29
0
    reg_lambda=1,  # [default 1] L2 regularization term on weights
    max_depth=10,  # [default 6] maximum tree depth, also used to control overfitting; typical range 3-10
    min_child_weight=1,  # [default 1] minimum sum of instance weight in a child; larger values keep the model from learning overly local patterns, but too large causes underfitting
    n_jobs=1)
"""
dtrain = xgb.DMatrix(X_train, y_train)
xgb_params = clf.get_xgb_params()
cvresult = xgb.cv(xgb_params, dtrain, nfold=5, num_boost_round=2000,
                      early_stopping_rounds=50)
#clf_xgb = xgb.train(xgb_params, dtrain, num_boost_round=cvresult.shape[0])
#fscore = clf_xgb.get_fscore()
#print(cvresult.shape[0], fscore)
print(cvresult.shape[0])
"""
clf.set_params(n_estimators=4)
"""
param_test1 = {
        'max_depth': [i for i in range(3, 17, 2)],
        'min_child_weight': [i for i in range(1, 10, 2)]
    }

best_max_depth = 13
best_min_child_weight = 1
param_test2 = {
        'max_depth': [best_max_depth-1,best_max_depth,best_max_depth+1],
        'min_child_weight': [best_min_child_weight,best_min_child_weight+1]
    }
"""
clf.set_params(max_depth=13, min_child_weight=1)
"""