Example #1
                    max_depth=5,
                    min_child_weight=1,
                    gamma=0,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective='reg:gamma',
                    nthread=4,
                    scale_pos_weight=1,
                    seed=1024)

##### Parameter 1: max_depth

xgb_param = xgb1.get_xgb_params()
cvresult = xgb.cv(xgb_param,
                  Dtrain,
                  num_boost_round=xgb1.get_params()['n_estimators'],
                  nfold=5,
                  metrics='rmse',
                  early_stopping_rounds=50)
xgb1.set_params(n_estimators=cvresult.shape[0])

param_test1 = {
    'max_depth': [3, 4, 5, 6, 7],
    'min_child_weight': [3, 4, 5, 6, 7]
}
gsearch1 = GridSearchCV(estimator=XGBRegressor(learning_rate=0.1,
                                               n_estimators=1000,
                                               gamma=0,
                                               subsample=0.8,
                                               colsample_bytree=0.8,
                                               objective='reg:gamma',
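
The pattern here: fix the learning rate, let xgb.cv with early stopping pick n_estimators, then grid-search the tree-structure parameters. Below is a minimal self-contained sketch of that pattern on synthetic data; the parameter values are illustrative assumptions, not the example's originals.

import numpy as np
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

X = np.random.rand(500, 10)
y = np.random.rand(500)

xgb1 = XGBRegressor(learning_rate=0.1, n_estimators=1000, max_depth=5,
                    subsample=0.8, colsample_bytree=0.8)

# Step 1: choose the number of boosting rounds by cross-validated early stopping.
Dtrain = xgb.DMatrix(X, label=y)
cvresult = xgb.cv(xgb1.get_xgb_params(), Dtrain,
                  num_boost_round=xgb1.get_params()['n_estimators'],
                  nfold=5, metrics='rmse', early_stopping_rounds=50)
xgb1.set_params(n_estimators=cvresult.shape[0])

# Step 2: grid-search max_depth / min_child_weight with the round count fixed.
param_test1 = {'max_depth': [3, 4, 5, 6, 7], 'min_child_weight': [3, 4, 5, 6, 7]}
gsearch1 = GridSearchCV(estimator=xgb1, param_grid=param_test1,
                        scoring='neg_mean_squared_error', cv=5)
gsearch1.fit(X, y)
print(gsearch1.best_params_, gsearch1.best_score_)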
Example #2
    d_train = xgb.DMatrix(x_train, y_train)
    d_test = xgb.DMatrix(x_test, y_test)
    watchlist = [(d_train, "train"), (d_test, "test")]
    reg = XGBRegressor(learning_rate=0.3,
                       n_estimators=100,
                       objective="reg:linear",
                       eval_metric="rmse",
                       min_child_weight=1,
                       subsample=0.8,
                       colsample_bytree=0.8,
                       reg_alpha=0,
                       reg_lambda=1,
                       n_jobs=-1,
                       nthread=-1,
                       seed=3)
    params = reg.get_params()
    evals_res = {}
    model_sklearn = xgb.train(params=params,
                              dtrain=d_train,
                              evals=watchlist,
                              evals_result=evals_res,
                              early_stopping_rounds=10,
                              verbose_eval=True)
    y_pred = model_sklearn.predict(d_test)
    df_evals = pd.DataFrame({
        "loss_train": evals_res.get("train").get("rmse"),
        "loss_test": evals_res.get("test").get("rmse")
    })
    df_evals.plot()
    model.loss_reg(y_test, y_pred)
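
Example #2 builds a scikit-learn XGBRegressor only to pass its get_params() dict to the native xgb.train, which may warn about or ignore sklearn-only keys such as n_estimators. A hedged sketch of the same train/evaluate loop written directly against the native API, on synthetic data ('reg:squarederror' assumes a recent xgboost; older releases used 'reg:linear'):

import numpy as np
import pandas as pd
import xgboost as xgb

x_train, y_train = np.random.rand(200, 5), np.random.rand(200)
x_test, y_test = np.random.rand(50, 5), np.random.rand(50)

d_train = xgb.DMatrix(x_train, label=y_train)
d_test = xgb.DMatrix(x_test, label=y_test)
watchlist = [(d_train, "train"), (d_test, "test")]

# Plain booster-parameter dict; the number of rounds goes to num_boost_round.
params = {"eta": 0.3, "objective": "reg:squarederror", "eval_metric": "rmse",
          "min_child_weight": 1, "subsample": 0.8, "colsample_bytree": 0.8}
evals_res = {}
booster = xgb.train(params, d_train, num_boost_round=100, evals=watchlist,
                    evals_result=evals_res, early_stopping_rounds=10,
                    verbose_eval=True)
y_pred = booster.predict(d_test)
df_evals = pd.DataFrame({"loss_train": evals_res["train"]["rmse"],
                         "loss_test": evals_res["test"]["rmse"]})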
Example #3
opt.optimize_n_estimators(model, train, predictors, labels)

param_test = {'max_depth': range(3, 10, 2), 'min_child_weight': range(1, 6, 2)}
opt.optimize_params(model, train, predictors, param_test)

model.fit(train[predictors], labels)

res = model.predict(test[predictors])

print(res)

with open("./run_num.txt", mode='r') as f:
    run_num = int(f.read())
run_num += 1
with open("./run_num.txt", mode='w') as f:
    f.write(str(run_num))

ids = range(1461, (1461 + len(res)))
result_df = pd.DataFrame({"Id": ids, "SalePrice": res})
result_df.to_csv("./submission_" + str(run_num) + ".csv", index=False)

with open("./params_" + str(run_num) + ".txt", mode='w') as f:
    f.write(str(model.get_params()))
    f.write("\n")
    f.write("tresh_deviation: " + str(_tresh_deviation) + "\n")
    f.write("tresh_importance: " + str(_tresh_importance) + "\n")
    f.write("tresh_similarity: " + str(_tresh_similarity) + "\n")
Example #4
def cross_validation(dtrain, ytrain, predictors):
    # After each parameter is tuned, re-determine the new num_rounds
    dtrain = dtrain[predictors]
    xgb_model = XGBRegressor(
                learning_rate= 0.5,
                max_depth = 20,
                n_estimators = 100,
                min_child_weight = 1,
                gamma = 0,
                objective='reg:linear',
                nthread=4,
                )
    modelfit(xgb_model,dtrain,ytrain)
    print('tuning learning rate...')
    params = {'learning_rate':[0.01,0.015,0.025,0.05,0.1]}
    gsearch = GridSearchCV(estimator = xgb_model,param_grid = params, scoring = 'neg_mean_squared_error',n_jobs = 4,iid=False,cv=5)
    gsearch.fit(dtrain.values,ytrain)
    xgb_model.set_params(learning_rate = gsearch.best_params_['learning_rate'])
    print(gsearch.best_params_)

    print('tuning max_depth...')
    params = { 'max_depth':[3,5,7,9]}
    print(xgb_model.get_params()['n_estimators'])
    gsearch = GridSearchCV(estimator = xgb_model,param_grid = params, scoring='neg_mean_squared_error',n_jobs=4,iid=False, cv=5)
    gsearch.fit(dtrain.values,ytrain)
    xgb_model.set_params(max_depth = gsearch.best_params_['max_depth'])
    print(gsearch.best_params_)
    #choose best num_round
    modelfit(xgb_model,dtrain,ytrain)
    print(xgb_model.get_params()['n_estimators'])
    
    print('tuning min_child_weight...')
    param_child_weight = {'min_child_weight':[1,3,5,7]}
    gsearch = GridSearchCV(estimator = xgb_model,param_grid = param_child_weight, scoring='neg_mean_squared_error',n_jobs=4,iid=False, cv=5)
    gsearch.fit(dtrain.values,ytrain)
    xgb_model.set_params(min_child_weight = gsearch.best_params_['min_child_weight'])
    print(xgb_model.get_params())
    modelfit(xgb_model,dtrain.values,ytrain)
    print(xgb_model.get_params()['n_estimators'])

    print('tuning gamma...')
    param_gamma = {'gamma':[0.05,0.1,0.3,0.5,0.7,0.9,1]}
    gsearch = GridSearchCV(estimator = xgb_model,param_grid = param_gamma, scoring='neg_mean_squared_error',n_jobs=4,iid=False, cv=5)
    gsearch.fit(dtrain.values,ytrain)
    xgb_model.set_params(gamma = gsearch.best_params_['gamma'])
    print(xgb_model.get_params())
    modelfit(xgb_model,dtrain.values,ytrain)
    print(xgb_model.get_params()['n_estimators'])

    #print('tuning colsample_bylevel')
    #param_colsample_bylevel = {'colsample_bylevel':[0.6,0.8,1]}
    #gsearch = GridSearchCV(estimator = xgb_model,param_grid = param_colsample_bylevel, scoring='neg_mean_squared_error',n_jobs=4,iid=False, cv=5)
    #gsearch.fit(dtrain.values,ytrain)
    #xgb_model.set_params(colsample_bylevel = gsearch.best_params_['colsample_bylevel'])
    #tuning colsample_bytree
    print(xgb_model.get_params())
    modelfit(xgb_model,dtrain.values,ytrain)
    print('num_rounds after tuning colsample_bylevel:%f' % xgb_model.get_params()['n_estimators'])

    print('tuning colsample_bytree...')
    param_colsample_bytree = {'colsample_bytree':[0.6,0.7,0.8,1]}
    gsearch = GridSearchCV(estimator = xgb_model,param_grid = param_colsample_bytree, scoring='neg_mean_squared_error',n_jobs=4,iid=False, cv=5)
    gsearch.fit(dtrain.values,ytrain)
    xgb_model.set_params(colsample_bytree = gsearch.best_params_['colsample_bytree'])
    print(xgb_model.get_params())
    modelfit(xgb_model,dtrain.values,ytrain)
    print('num_rounds after tuning colsample_bytree:%f' % xgb_model.get_params()['n_estimators'])
    # save and return model
    cur_time = time.strftime("%Y-%m-%d-%H-%M",time.localtime())
    pickle.dump(xgb_model,open('../models/autogridsearch_xgb_'+cur_time+'.model','wb'))
    cv_score(xgb_model,dtrain.values,ytrain)
    return xgb_model
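
The function above repeats the same GridSearchCV block once per hyperparameter. A compact sketch of that one-parameter-at-a-time strategy as a loop (illustrative only; it omits the modelfit()/num_rounds re-estimation that the original interleaves between searches):

from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

def sequential_grid_search(X, y, base_model, ordered_grids, cv=5):
    """Tune one parameter group at a time, freezing each best value."""
    for grid in ordered_grids:
        gsearch = GridSearchCV(estimator=base_model, param_grid=grid,
                               scoring='neg_mean_squared_error', cv=cv)
        gsearch.fit(X, y)
        base_model.set_params(**gsearch.best_params_)
        print(grid, '->', gsearch.best_params_)
    return base_model

# usage sketch (grids mirror the ones searched above)
# model = sequential_grid_search(X, y, XGBRegressor(learning_rate=0.5, n_estimators=100),
#                                [{'learning_rate': [0.01, 0.015, 0.025, 0.05, 0.1]},
#                                 {'max_depth': [3, 5, 7, 9]},
#                                 {'min_child_weight': [1, 3, 5, 7]},
#                                 {'gamma': [0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 1]},
#                                 {'colsample_bytree': [0.6, 0.7, 0.8, 1]}])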
Example #5
class Xgb:
    def __init__(self, df, target_column='', id_column='', target_type='binary', categorical_columns=[], drop_columns=[], numeric_columns=[], num_training_rounds=500, verbose=1, early_stopping_rounds=None):
        """
        input params:
        - df (DataFrame): dataframe of training data
        - target_column (string): name of target column
        - id_column (string): name of id column
        - target_type (string): 'linear' or 'binary'
        - categorical_columns (list): list of column names of categorical data. Will perform one-hot encoding
        - drop_columns (list): list of columns to drop
        - numeric_columns (list): list of columns to convert to numeric
        - verbose (bool): verbosity of printouts
        """
        if type(df) == pd.core.frame.DataFrame:
            self.df = df
            self.early_stopping_rounds = early_stopping_rounds
            if target_column:
                self.target_column = target_column
                self.id_column = id_column
                self.target_type = target_type
                self.categorical_columns = categorical_columns
                self.numeric_columns = numeric_columns
                self.drop_columns = drop_columns
                self.verbose = verbose
                self.num_training_rounds = num_training_rounds
                # init the classifier
                if self.target_type == 'binary':
                    self.scoring = 'auc'
                    self.clf = XGBClassifier(
                        learning_rate =0.1,
                        n_estimators = num_training_rounds,
                        subsample = 0.8,
                        colsample_bytree = 0.8,
                        objective = 'binary:logistic',
                        scale_pos_weight = 1,
                        seed = 123)
                elif self.target_type == 'linear':
                    self.scoring = 'rmse'
                    self.clf = XGBRegressor(
                            n_estimators = num_training_rounds,
                            objective = 'reg:linear'
                            )
            else:
                print('please provide target column name')
        else:
            print('please provide pandas dataframe')

    def train(self):
        print('#### preprocessing ####')
        self.df = self.preprocess(self.df)

        print('#### training ####')
        self.predictors = [x for x in self.df.columns if x not in [self.target_column, self.id_column]]
        xgb_param = self.clf.get_xgb_params()

        xgtrain  = xgb.DMatrix(self.df[self.predictors], label=self.df[self.target_column], missing=np.nan)
        try:
            cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds, show_progress=self.verbose)
        except:
            try:
                cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                    metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds, verbose_eval=self.verbose)
            except:
                cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                    metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds)
        self.clf.set_params(n_estimators=cvresult.shape[0])
        self.clf.fit(self.df[self.predictors], self.df[self.target_column],eval_metric=self.scoring)

        #Predict training set:
        train_df_predictions = self.clf.predict(self.df[self.predictors])

        if self.target_type == 'binary':
            train_df_predprob = self.clf.predict_proba(self.df[self.predictors])[:,1]
            print("Accuracy : %.4g" % metrics.accuracy_score(self.df[self.target_column].values, train_df_predictions))
            print("AUC Score (Train): %f" % metrics.roc_auc_score(self.df[self.target_column], train_df_predprob))
        elif self.target_type == 'linear':
            print("Mean squared error: %f" % metrics.mean_squared_error(self.df[self.target_column].values, train_df_predictions))
            print("Root mean squared error: %f" % np.sqrt(metrics.mean_squared_error(self.df[self.target_column].values, train_df_predictions)))

    def predict(self, test_df):
        print('### predicting ###')
        print('## preprocessing test set')
        if self.id_column in test_df:
            ids = test_df[self.id_column]
        if self.target_column in test_df.columns:
            targets = test_df[self.target_column]
        self.test_df = self.preprocess(test_df, train=False)
        if self.id_column in test_df:
            self.test_df[self.id_column] = ids
        if self.target_column in test_df.columns:
            self.test_df[self.target_column] = targets
        for col in self.predictors:
            if col not in self.test_df.columns:
                self.test_df[col] = np.nan

        if self.target_type == 'binary':
            self.output = self.clf.predict_proba(self.test_df[self.predictors])[:,1]
        elif self.target_type == 'linear':
            self.output = self.clf.predict(self.test_df[self.predictors])
        return self.output

    def feature_importance(self, num_print=10, display=True):
        feature_importance = sorted(list(self.clf.booster().get_fscore().items()), key = operator.itemgetter(1), reverse=True)

        impt = pd.DataFrame(feature_importance)
        impt.columns = ['feature', 'importance']
        print(impt[:num_print])
        if display:
            impt[:num_print].plot("feature", "importance", kind="barh", color=sns.color_palette("deep", 3))


    def preprocess(self, df, train=True):
        # one hot encoding of categorical variables
        print('## one hot encoding of categorical variables')
        for col in self.categorical_columns:
            if self.verbose:
                print('one hot encoding: ', col)
            df = pd.concat([df, pd.get_dummies(df[col]).rename(columns=lambda x: col+'_'+str(x))], axis=1)
            df = df.drop([col], axis=1)

        # if training, determine columns to be removed
        if train:
            # drop columns that are too sparse to be informative
            self.cols_to_remove = []
            print('## dropping columns below sparsity threshold')
            for col in df.columns:
                nan_cnt = 0
                for x in df[col]:
                    try:
                        if np.isnan(x):
                            nan_cnt += 1
                    except:
                        pass
                if nan_cnt/float(len(df[col])) > 0.6: # arbitrary cutoff, if more than 60% missing then drop
                    if self.verbose:
                        print('will drop', col)
                    self.cols_to_remove.append(col)

            # drop columns that have no standard deviation (not informative)
            print('## dropping columns with no variation')
            for col in df.columns:
                if df[col].dtype == 'int64' or df[col].dtype == 'float64':
                    if df[col].std() == 0:
                        print('will drop', col)
                        self.cols_to_remove.append(col)
        if self.cols_to_remove:
            if self.verbose:
                print('dropping the following columns:', self.cols_to_remove)
            df = df.drop(self.cols_to_remove, axis=1)

        if self.verbose:
            print('## DataFrame shape is now:', df.shape)

        # convert to numerical where possible
        #print('## converting numerical data to numeric dtype')
        #df = df.convert_objects(convert_numeric=True)

        # convert columns specified to be int and float
        for col in self.numeric_columns:
            if col not in self.cols_to_remove:
                if self.verbose:
                    print('converting', col)
                df[col] = pd.to_numeric(df[col], errors='coerce')
                if self.verbose:
                    print(df[col].dtype)

        # drop those marked for dropping
        df = df.drop(self.drop_columns, axis=1)

        # drop all those that are object type
        print('## dropping non-numerical columns')
        for col in df.columns:
            if df[col].dtype == 'int64' or df[col].dtype == 'float64' or df[col].dtype == 'bool':
                pass
            else:
                if self.verbose:
                    print('dropping because not int, float, or bool:', col)
                df = df.drop([col], axis=1)
        return df

    def _to_int(self, num):
        try:
            return int(num)
        except:
            return

    def _to_float(self, num):
        try:
            return float(num)
        except:
            return

    def write_csv(self, filename, include_actual=False):
        """
        write results to csv
        - include actual: if actual values are known for test set, and we want to print them
        """
        with open(filename, 'wb') as csvfile:
            writer = csv.writer(csvfile)
            headers = [self.id_column, self.target_column]
            if include_actual:
                headers.append('actual')
            writer.writerow(headers)
            for idx, value in enumerate(self.output):
                test_id = self.test_df[self.id_column][idx]
                test_output = self.output[idx]
                to_write = [test_id, test_output]
                if include_actual:
                    to_write.append(self.test_df[self.target_column][idx])
                writer.writerow(to_write)

    def save(self, filename='xgb.pkl'):
        joblib.dump(self, filename)
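
A hypothetical end-to-end usage sketch of the Xgb wrapper above, on a small synthetic DataFrame. The column names and settings are assumptions for illustration, and running it as-is presumes the xgboost/scikit-learn versions this class was written against:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
train_df = pd.DataFrame({'Id': range(200),
                         'feat_num': rng.rand(200),
                         'feat_cat': rng.choice(['a', 'b', 'c'], size=200),
                         'target': rng.rand(200)})
test_df = train_df.drop(columns=['target'])

model = Xgb(train_df,
            target_column='target',
            id_column='Id',
            target_type='linear',
            categorical_columns=['feat_cat'],
            num_training_rounds=50,
            early_stopping_rounds=10)
model.train()                    # cross-validates to pick n_estimators, then fits
preds = model.predict(test_df)
model.save('xgb.pkl')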
Example #6
                    tree_method= 'exact',
                    learning_rate=0.1,
                    n_estimators=100,
                    nthread=4,
                    scale_pos_weight=1,
                    reg_alpha=0.001,
                    seed=27),
                       param_grid=param_test1, scoring=None, n_jobs=4, iid=False, cv=5)
gsearch1.fit(train_X,train_Y)

print(gsearch1.best_params_, gsearch1.best_score_)


# fit model
all_set = xgb.DMatrix(feature_df, label=label_df)
XGBmodel = xgb.train(XGBmodel.get_params(), all_set, num_boost_round=10000)

# generate result
test_data = pd.read_csv("test.csv")
test_drID = test_data.drop(["id"], axis=1)
test_drID["penalty"] = le.transform(test_drID["penalty"])
test_drID = test_drID.fillna(0)
test_set = min_max_scaler.transform(test_drID)
test_set = varSel.transform(test_set)
test_set = poly.fit_transform(test_set)
test_set = xgb.DMatrix(test_set)
xgbResult = XGBmodel.predict(test_set)
for i in range(0, len(xgbResult)):
    if xgbResult[i] < 0:
        xgbResult[i] = 0.2
xgbResult = xgbResult * 1.25
Example #7
def cross_validation(dtrain, ytrain, predictors):
    # After each parameter is tuned, re-determine the new num_rounds
    dtrain = dtrain[predictors]
    xgb_model = XGBRegressor(
        learning_rate=0.5,
        max_depth=20,
        n_estimators=100,
        min_child_weight=1,
        gamma=0,
        objective='reg:linear',
        nthread=4,
    )
    modelfit(xgb_model, dtrain, ytrain)
    print('tuning learning rate...')
    params = {'learning_rate': [0.01, 0.015, 0.025, 0.05, 0.1]}
    gsearch = GridSearchCV(estimator=xgb_model,
                           param_grid=params,
                           scoring='neg_mean_squared_error',
                           n_jobs=4,
                           iid=False,
                           cv=5)
    gsearch.fit(dtrain.values, ytrain)
    xgb_model.set_params(learning_rate=gsearch.best_params_['learning_rate'])
    print(gsearch.best_params_)

    print('tuning max_depth...')
    params = {'max_depth': [3, 5, 7, 9]}
    print(xgb_model.get_params()['n_estimators'])
    gsearch = GridSearchCV(estimator=xgb_model,
                           param_grid=params,
                           scoring='neg_mean_squared_error',
                           n_jobs=4,
                           iid=False,
                           cv=5)
    gsearch.fit(dtrain.values, ytrain)
    xgb_model.set_params(max_depth=gsearch.best_params_['max_depth'])
    print(gsearch.best_params_)
    #choose best num_round
    modelfit(xgb_model, dtrain, ytrain)
    print(xgb_model.get_params()['n_estimators'])

    print('tuning min_child_weight...')
    param_child_weight = {'min_child_weight': [1, 3, 5, 7]}
    gsearch = GridSearchCV(estimator=xgb_model,
                           param_grid=param_child_weight,
                           scoring='neg_mean_squared_error',
                           n_jobs=4,
                           iid=False,
                           cv=5)
    gsearch.fit(dtrain.values, ytrain)
    xgb_model.set_params(
        min_child_weight=gsearch.best_params_['min_child_weight'])
    print(xgb_model.get_params())
    modelfit(xgb_model, dtrain.values, ytrain)
    print(xgb_model.get_params()['n_estimators'])

    print('tuning gamma...')
    param_gamma = {'gamma': [0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 1]}
    gsearch = GridSearchCV(estimator=xgb_model,
                           param_grid=param_gamma,
                           scoring='neg_mean_squared_error',
                           n_jobs=4,
                           iid=False,
                           cv=5)
    gsearch.fit(dtrain.values, ytrain)
    xgb_model.set_params(gamma=gsearch.best_params_['gamma'])
    print(xgb_model.get_params())
    modelfit(xgb_model, dtrain.values, ytrain)
    print(xgb_model.get_params()['n_estimators'])

    #print('tuning colsample_bylevel')
    #param_colsample_bylevel = {'colsample_bylevel':[0.6,0.8,1]}
    #gsearch = GridSearchCV(estimator = xgb_model,param_grid = param_colsample_bylevel, scoring='neg_mean_squared_error',n_jobs=4,iid=False, cv=5)
    #gsearch.fit(dtrain.values,ytrain)
    #xgb_model.set_params(colsample_bylevel = gsearch.best_params_['colsample_bylevel'])
    #tuning colsample_bytree
    print(xgb_model.get_params())
    modelfit(xgb_model, dtrain.values, ytrain)
    print('num_rounds after tuning colsample_bylevel:%f' %
          xgb_model.get_params()['n_estimators'])

    print('tuning colsample_bytree...')
    param_colsample_bytree = {'colsample_bytree': [0.6, 0.7, 0.8, 1]}
    gsearch = GridSearchCV(estimator=xgb_model,
                           param_grid=param_colsample_bytree,
                           scoring='neg_mean_squared_error',
                           n_jobs=4,
                           iid=False,
                           cv=5)
    gsearch.fit(dtrain.values, ytrain)
    xgb_model.set_params(
        colsample_bytree=gsearch.best_params_['colsample_bytree'])
    print(xgb_model.get_params())
    modelfit(xgb_model, dtrain.values, ytrain)
    print('num_rounds after tuning colsample_bytree:%f' %
          xgb_model.get_params()['n_estimators'])
    # save and return model
    cur_time = time.strftime("%Y-%m-%d-%H-%M", time.localtime())
    pickle.dump(
        xgb_model,
        open('../models/autogridsearch_xgb_' + cur_time + '.model', 'wb'))
    cv_score(xgb_model, dtrain.values, ytrain)
    return xgb_model
Example #8
class Xgb:
    def __init__(self,
                 df,
                 target_column='',
                 id_column='',
                 target_type='binary',
                 categorical_columns=[],
                 drop_columns=[],
                 numeric_columns=[],
                 num_training_rounds=500,
                 verbose=1,
                 sample_fraction=1.0,
                 n_samples=1,
                 early_stopping_rounds=None,
                 prefix='xgb_model',
                 scoring=None):
        """
        input params:
        - df (DataFrame): dataframe of training data
        - target_column (string): name of target column
        - id_column (string): name of id column
        - target_type (string): 'linear', 'binary' or 'multiclass'
        - categorical_columns (list): list of column names of categorical data. Will perform one-hot encoding
        - drop_columns (list): list of columns to drop
        - numeric_columns (list): list of columns to convert to numeric
        - verbose (bool): verbosity of printouts
        """
        # checks for sampling
        sample_fraction = float(sample_fraction)
        if sample_fraction > 1:
            sample_fraction = 1.0
        if sample_fraction * n_samples > 1:
            n_samples = round(1.0 / sample_fraction)
        if sample_fraction <= 0:
            print('sample_fraction 0 or negative, switching to 0.1')
            sample_fraction = 0.1
        # if sample_fraction results in a sample smaller than one row
        if round(sample_fraction * len(df)) == 0:
            sample_fraction = 1.0 / len(df)
        # check if data is dataframe
        if type(df) == pd.core.frame.DataFrame:
            self.df = df
            self.early_stopping_rounds = early_stopping_rounds
            if target_column:
                self.target_column = target_column
                self.id_column = id_column
                self.target_type = target_type
                self.categorical_columns = categorical_columns
                self.numeric_columns = numeric_columns
                self.drop_columns = drop_columns
                self.verbose = verbose
                self.sample_fraction = sample_fraction
                self.n_samples = n_samples
                self.num_training_rounds = num_training_rounds
                self.prefix = prefix
                # init the classifier:
                if self.target_type == 'binary':
                    self.scoring = 'auc'
                    self.clf = XGBClassifier(learning_rate=0.1,
                                             n_estimators=num_training_rounds,
                                             subsample=0.8,
                                             colsample_bytree=0.8,
                                             objective='binary:logistic',
                                             scale_pos_weight=1,
                                             seed=123)
                elif self.target_type == 'multiclass':
                    self.scoring = 'merror'
                    self.clf = XGBClassifier(learning_rate=0.1,
                                             n_estimators=num_training_rounds,
                                             subsample=0.8,
                                             colsample_bytree=0.8,
                                             objective='multi:softmax',
                                             scale_pos_weight=1,
                                             seed=123)
                elif self.target_type == 'linear':
                    self.scoring = 'rmse'
                    self.clf = XGBRegressor(n_estimators=num_training_rounds,
                                            objective='reg:linear')
                # if preferred scoring metric is stated:
                if scoring:
                    self.scoring = scoring
            else:
                print('please provide target column name')
        else:
            print('please provide pandas dataframe')

    def train(self):
        print('#### preprocessing ####')
        self.df = self.preprocess(self.df)

        print('#### training ####')
        self.predictors = [
            x for x in self.df.columns
            if x not in [self.target_column, self.id_column]
        ]
        xgb_param = self.clf.get_xgb_params()

        # if subsampling
        if self.sample_fraction == 1.0:
            df_list = [self.df]
        else:
            df_list = self.random_sample(df=self.df,
                                         fraction=self.sample_fraction,
                                         n_samples=self.n_samples)
        print(df_list)
        for idx, current_df in enumerate(df_list):
            print('ITERATION ' + str(idx + 1) + ' of ' + str(self.n_samples) +
                  ', sample_fraction=' + str(self.sample_fraction))
            xgtrain = xgb.DMatrix(current_df[self.predictors],
                                  label=current_df[self.target_column],
                                  missing=np.nan)
            try:
                cvresult = xgb.cv(
                    xgb_param,
                    xgtrain,
                    num_boost_round=self.clf.get_params()['n_estimators'],
                    nfold=5,
                    metrics=[self.scoring],
                    early_stopping_rounds=self.early_stopping_rounds,
                    show_progress=self.verbose)
            except:
                try:
                    cvresult = xgb.cv(
                        xgb_param,
                        xgtrain,
                        num_boost_round=self.clf.get_params()['n_estimators'],
                        nfold=5,
                        metrics=[self.scoring],
                        early_stopping_rounds=self.early_stopping_rounds,
                        verbose_eval=self.verbose)
                except:
                    xgb_param['num_class'] = len(
                        current_df[self.target_column].unique())
                    cvresult = xgb.cv(
                        xgb_param,
                        xgtrain,
                        num_boost_round=self.clf.get_params()['n_estimators'],
                        nfold=5,
                        metrics=[self.scoring],
                        early_stopping_rounds=self.early_stopping_rounds)
            self.clf.set_params(n_estimators=cvresult.shape[0])
            print('fitting model')
            self.clf.fit(current_df[self.predictors],
                         current_df[self.target_column],
                         eval_metric=self.scoring)

            #Predict training set:
            train_df_predictions = self.clf.predict(
                current_df[self.predictors])

            if self.target_type == 'binary' or self.target_type == 'multiclass':
                train_df_predprob = self.clf.predict_proba(
                    current_df[self.predictors])[:, 1]
                print("Accuracy : %.4g" % metrics.accuracy_score(
                    current_df[self.target_column].values,
                    train_df_predictions))
                if self.target_type == 'binary':
                    print("AUC Score (Train): %f" % metrics.roc_auc_score(
                        current_df[self.target_column], train_df_predprob))
            elif self.target_type == 'linear':
                print("Mean squared error: %f" % metrics.mean_squared_error(
                    current_df[self.target_column].values,
                    train_df_predictions))
                print("Root mean squared error: %f" % np.sqrt(
                    metrics.mean_squared_error(
                        current_df[self.target_column].values,
                        train_df_predictions)))
            filename = self.prefix + '_' + str(idx) + '.pkl'
            self.save(filename)

    def predict(self,
                test_df,
                return_multi_outputs=False,
                return_mean_std=False):
        print('### predicting ###')
        print('## preprocessing test set')
        if self.id_column in test_df:
            ids = test_df[self.id_column]
        if self.target_column in test_df.columns:
            targets = test_df[self.target_column]
        self.test_df = self.preprocess(test_df, train=False)
        if self.id_column in test_df:
            self.test_df[self.id_column] = ids
        if self.target_column in test_df.columns:
            self.test_df[self.target_column] = targets
        for col in self.predictors:
            if col not in self.test_df.columns:
                self.test_df[col] = np.nan

        # prediction
        print('## predicting from test set')
        output_list = []
        output = None
        for idx, ns in enumerate(range(self.n_samples)):
            if self.n_samples == 1:
                if self.target_type == 'binary':
                    output = self.clf.predict_proba(
                        self.test_df[self.predictors])[:, 1]
                elif self.target_type == 'linear':
                    output = self.clf.predict(self.test_df[self.predictors])
            else:
                try:
                    filename = self.prefix + '_' + str(idx) + '.pkl'
                    xgb_load = self.load(filename)
                    if self.target_type == 'binary':
                        output = xgb_load.clf.predict_proba(
                            self.test_df[self.predictors])[:, 1]
                    elif self.target_type == 'linear':
                        output = xgb_load.clf.predict(
                            self.test_df[self.predictors])
                    output_list.append(list(output))
                except IOError:
                    print('no file found, skipping')
        # average the outputs if n_samples is more than one
        if self.n_samples == 1:
            self.output = output
            try:
                self.multi_outputs = [list(output)]
            except:
                self.multi_outputs = None
        else:
            self.output = np.mean(output_list, axis=0)
            self.multi_outputs = output_list
        if return_multi_outputs:
            return self.multi_outputs
        elif return_mean_std:
            return (self.output, np.std(output_list, axis=0))
        else:
            return self.output

    def feature_importance(self, num_print=10, display=True):
        feature_importance = sorted(list(
            self.clf.booster().get_fscore().items()),
                                    key=operator.itemgetter(1),
                                    reverse=True)

        impt = pd.DataFrame(feature_importance)
        impt.columns = ['feature', 'importance']
        print(impt[:num_print])
        if display:
            impt[:num_print].plot("feature",
                                  "importance",
                                  kind="barh",
                                  color=sns.color_palette("deep", 3))

    def preprocess(self, df, train=True):
        # one hot encoding of categorical variables
        print('## one hot encoding of categorical variables')
        for col in self.categorical_columns:
            if self.verbose:
                print('one hot encoding: ', col)
            df = pd.concat([
                df,
                pd.get_dummies(
                    df[col]).rename(columns=lambda x: col + '_' + str(x))
            ],
                           axis=1)
            df = df.drop([col], axis=1)

        # if training, determine columns to be removed
        if train:
            # drop columns that are too sparse to be informative
            self.cols_to_remove = []
            print('## dropping columns below sparsity threshold')
            for col in df.columns:
                nan_cnt = 0
                for x in df[col]:
                    try:
                        if np.isnan(x):
                            nan_cnt += 1
                    except:
                        pass
                if nan_cnt / float(
                        len(df[col])
                ) > 0.6:  # arbitrary cutoff, if more than 60% missing then drop
                    if self.verbose:
                        print('will drop', col)
                    self.cols_to_remove.append(col)

            # drop columns that have no standard deviation (not informative)
            print('## dropping columns with no variation')
            for col in df.columns:
                if col != self.target_column:
                    if df[col].dtype == 'int64' or df[col].dtype == 'float64':
                        if df[col].std() == 0:
                            print('will drop', col)
                            self.cols_to_remove.append(col)
        if self.cols_to_remove:
            if self.verbose:
                print('dropping the following columns:', self.cols_to_remove)
            df = df.drop(self.cols_to_remove, axis=1)

        if self.verbose:
            print('## DataFrame shape is now:', df.shape)

        # convert to numerical where possible
        #print('## converting numerical data to numeric dtype')
        #df = df.convert_objects(convert_numeric=True)

        # convert columns specified to be int and float
        for col in self.numeric_columns:
            if self.verbose:
                print('converting', col)
            df[col] = pd.to_numeric(df[col], errors='coerce')
            if self.verbose:
                print(df[col].dtype)
        df = df.drop(self.drop_columns, axis=1)

        # drop all those that are object type
        print('## dropping non-numerical columns')
        for col in df.columns:
            if df[col].dtype == 'int64' or df[col].dtype == 'float64' or df[
                    col].dtype == 'bool':
                pass
            else:
                if self.verbose:
                    print('dropping because not int, float, or bool:', col)
                df = df.drop([col], axis=1)
        return df

    def _to_int(self, num):
        try:
            return int(num)
        except:
            return

    def _to_float(self, num):
        try:
            return float(num)
        except:
            return

    def random_sample(self, df, fraction=0.2, n_samples=None):
        """
        splits into random samples
        - n_samples: how many samples you want returned (default = All)
        - fraction : what fraction of data to include in the sample (default = 0.2)
        """
        print('constructing random samples')
        num_rows = len(df)
        len_sample = round(fraction * num_rows)
        # create list of slice index lists
        indices = list(range(0, num_rows))
        print('INDICES', indices)
        slice_list = []
        tmp_idx_list = []
        while len(indices) > 0:
            while len(tmp_idx_list) < len_sample and len(indices) > 0:
                idx = indices.pop(random.randrange(len(indices)))
                tmp_idx_list.append(idx)
            slice_list.append(tmp_idx_list)
            tmp_idx_list = []
        # get slices
        sample_list = []
        for s in range(n_samples):
            try:
                sample_list.append(df.loc[slice_list[s], :])
            except:
                pass
        return sample_list

    def write_csv(self, filename, include_actual=False):
        """
        write results to csv
        - include actual: if actual values are known for test set, and we want to print them
        """
        with open(filename, 'wb') as csvfile:
            writer = csv.writer(csvfile)
            headers = [self.id_column, self.target_column]
            if include_actual:
                headers.append('actual')
            writer.writerow(headers)
            try:
                for idx, value in enumerate(self.output):
                    test_id = self.test_df[self.id_column][idx]
                    test_output = self.output[idx]
                    to_write = [test_id, test_output]
                    if include_actual:
                        to_write.append(self.test_df[self.target_column][idx])
                    writer.writerow(to_write)
                print('results written to ' + filename)
            except:
                print('write_csv failed')

    def save(self, filename='xgb.pkl'):
        joblib.dump(self, filename)

    def load(self, model_file='xgb.pkl'):
        xgb = joblib.load(model_file)
        return xgb
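
This variant adds a sub-sampling ensemble mode: with sample_fraction < 1 and n_samples > 1, train() fits one model per random sub-sample (saved as '<prefix>_<i>.pkl') and predict() averages their outputs. A hypothetical usage sketch, kept as comments since the surrounding data and library versions are assumptions:

# model = Xgb(train_df,
#             target_column='label',
#             id_column='Id',
#             target_type='binary',
#             sample_fraction=0.25,    # each sub-model sees ~25% of the rows
#             n_samples=4,             # four sub-models: xgb_model_0.pkl ... xgb_model_3.pkl
#             prefix='xgb_model',
#             early_stopping_rounds=20)
# model.train()
# mean_pred, std_pred = model.predict(test_df, return_mean_std=True)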
Example #9
class Xgb:
    def __init__(self, df, target_column='', id_column='', target_type='binary', categorical_columns=[], drop_columns=[], numeric_columns=[], num_training_rounds=500, verbose=1, sample_fraction=1.0, n_samples=1, early_stopping_rounds=None, prefix='xgb_model', scoring=None):
        """
        input params:
        - df (DataFrame): dataframe of training data
        - target_column (string): name of target column
        - id_column (string): name of id column
        - target_type (string): 'linear', 'binary' or 'multiclass'
        - categorical_columns (list): list of column names of categorical data. Will perform one-hot encoding
        - drop_columns (list): list of columns to drop
        - numeric_columns (list): list of columns to convert to numeric
        - verbose (bool): verbosity of printouts
        """
        # checks for sampling
        sample_fraction = float(sample_fraction)
        if sample_fraction > 1:
            sample_fraction = 1.0
        if sample_fraction * n_samples > 1:
            n_samples = round(1.0/sample_fraction)
        if sample_fraction <= 0:
            print('sample_fraction 0 or negative, switching to 0.1')
            sample_fraction = 0.1
        # if sample_fraction results in a sample smaller than one row
        if round(sample_fraction * len(df)) == 0:
            sample_fraction = 1.0/len(df)
        # check if data is dataframe
        if type(df) == pd.core.frame.DataFrame:
            self.df = df
            self.early_stopping_rounds = early_stopping_rounds
            if target_column:
                self.target_column = target_column
                self.id_column = id_column
                self.target_type = target_type
                self.categorical_columns = categorical_columns
                self.numeric_columns = numeric_columns
                self.drop_columns = drop_columns
                self.verbose = verbose
                self.sample_fraction = sample_fraction
                self.n_samples = n_samples
                self.num_training_rounds = num_training_rounds
                self.prefix = prefix
                # init the classifier:
                if self.target_type == 'binary':
                    self.scoring = 'auc'
                    self.clf = XGBClassifier(
                        learning_rate =0.1,
                        n_estimators = num_training_rounds,
                        subsample = 0.8,
                        colsample_bytree = 0.8,
                        objective = 'binary:logistic',
                        scale_pos_weight = 1,
                        seed = 123)
                elif self.target_type == 'multiclass':
                    self.scoring = 'merror'
                    self.clf = XGBClassifier(
                        learning_rate =0.1,
                        n_estimators = num_training_rounds,
                        subsample = 0.8,
                        colsample_bytree = 0.8,
                        objective = 'multi:softmax',
                        scale_pos_weight = 1,
                        seed = 123)
                elif self.target_type == 'linear':
                    self.scoring = 'rmse'
                    self.clf = XGBRegressor(
                            n_estimators = num_training_rounds,
                            objective = 'reg:linear'
                            )
                # if preferred scoring metric is stated:
                if scoring:
                    self.scoring = scoring
            else:
                print('please provide target column name')
        else:
            print('please provide pandas dataframe')

    def train(self):
        print('#### preprocessing ####')
        self.df = self.preprocess(self.df)

        print('#### training ####')
        self.predictors = [x for x in self.df.columns if x not in [self.target_column, self.id_column]]
        xgb_param = self.clf.get_xgb_params()

        # if subsampling
        if self.sample_fraction == 1.0:
            df_list = [self.df]
        else:
            df_list = self.random_sample(df=self.df, fraction=self.sample_fraction, n_samples=self.n_samples)
        print(df_list)
        for idx, current_df in enumerate(df_list):
            print('ITERATION ' + str(idx) + ' of ' + str(self.n_samples) +', sample_fraction=' + str(self.sample_fraction))
            xgtrain  = xgb.DMatrix(current_df[self.predictors], label=current_df[self.target_column], missing=np.nan)
            try:
                cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                    metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds, show_progress=self.verbose)
            except:
                try:
                    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                        metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds, verbose_eval=self.verbose)
                except:
                    xgb_param['num_class'] = len(current_df[self.target_column].unique())
                    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                        metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds)
            self.clf.set_params(n_estimators=cvresult.shape[0])
            self.clf.fit(current_df[self.predictors], current_df[self.target_column], eval_metric=self.scoring)

            #Predict training set:
            train_df_predictions = self.clf.predict(current_df[self.predictors])

            if self.target_type == 'binary' or self.target_type == 'multiclass':
                train_df_predprob = self.clf.predict_proba(current_df[self.predictors])[:,1]
                print("Accuracy : %.4g" % metrics.accuracy_score(current_df[self.target_column].values, train_df_predictions))
                if self.target_type == 'binary':
                    print("AUC Score (Train): %f" % metrics.roc_auc_score(current_df[self.target_column], train_df_predprob))
            elif self.target_type == 'linear':
                print("Mean squared error: %f" % metrics.mean_squared_error(current_df[self.target_column].values, train_df_predictions))
                print("Root mean squared error: %f" % np.sqrt(metrics.mean_squared_error(current_df[self.target_column].values, train_df_predictions)))
            filename = self.prefix + '_' + str(idx) + '.pkl'
            self.save(filename)

    def predict(self, test_df, return_multi_outputs=False, return_mean_std=False):
        print('### predicting ###')
        print('## preprocessing test set')
        if self.id_column in test_df:
            ids = test_df[self.id_column]
        if self.target_column in test_df.columns:
            targets = test_df[self.target_column]
        self.test_df = self.preprocess(test_df, train=False)
        if self.id_column in test_df:
            self.test_df[self.id_column] = ids
        if self.target_column in test_df.columns:
            self.test_df[self.target_column] = targets
        for col in self.predictors:
            if col not in self.test_df.columns:
                self.test_df[col] = np.nan

        # prediction
        print('## predicting from test set')
        output_list = []
        output = None
        for idx, ns in enumerate(range(self.n_samples)):
            if self.n_samples == 1:
                if self.target_type == 'binary':
                    output = self.clf.predict_proba(self.test_df[self.predictors])[:,1]
                elif self.target_type == 'linear':
                    output = self.clf.predict(self.test_df[self.predictors])
            else:
                try:
                    filename = self.prefix + '_' + str(idx) + '.pkl'
                    xgb_load = self.load(filename)
                    if self.target_type == 'binary':
                        output = xgb_load.clf.predict_proba(self.test_df[self.predictors])[:,1]
                    elif self.target_type == 'linear':
                        output = xgb_load.clf.predict(self.test_df[self.predictors])
                    output_list.append(list(output))
                except IOError:
                    print('no file found, skipping')
        # average the outputs if n_samples is more than one
        if self.n_samples == 1:
            self.output = output
            try:
                self.multi_outputs = [list(output)]
            except:
                self.multi_outputs = None
        else:
            self.output = np.mean(output_list, axis=0)
            self.multi_outputs = output_list
        if return_multi_outputs:
            return self.multi_outputs
        elif return_mean_std:
            return (self.output, np.std(output_list, axis=0))
        else:
            return self.output

    def feature_importance(self, num_print=10, display=True):
        feature_importance = sorted(list(self.clf.booster().get_fscore().items()), key = operator.itemgetter(1), reverse=True)

        impt = pd.DataFrame(feature_importance)
        impt.columns = ['feature', 'importance']
        print(impt[:num_print])
        if display:
            impt[:num_print].plot("feature", "importance", kind="barh", color=sns.color_palette("deep", 3))


    def preprocess(self, df, train=True):
        # one hot encoding of categorical variables
        print('## one hot encoding of categorical variables')
        for col in self.categorical_columns:
            if self.verbose:
                print('one hot encoding: ', col)
            df = pd.concat([df, pd.get_dummies(df[col]).rename(columns=lambda x: col+'_'+str(x))], axis=1)
            df = df.drop([col], axis=1)

        # if training, determine columns to be removed
        if train:
            # drop columns that are too sparse to be informative
            self.cols_to_remove = []
            print('## dropping columns below sparsity threshold')
            for col in df.columns:
                nan_cnt = 0
                for x in df[col]:
                    try:
                        if np.isnan(x):
                            nan_cnt += 1
                    except:
                        pass
                if nan_cnt/float(len(df[col])) > 0.6: # arbitrary cutoff, if more than 60% missing then drop
                    if self.verbose:
                        print('will drop', col)
                    self.cols_to_remove.append(col)

            # drop columns that have no standard deviation (not informative)
            print('## dropping columns with no variation')
            for col in df.columns:
                if df[col].dtype == 'int64' or df[col].dtype == 'float64':
                    if df[col].std() == 0:
                        print('will drop', col)
                        self.cols_to_remove.append(col)
        if self.cols_to_remove:
            if self.verbose:
                print('dropping the following columns:', self.cols_to_remove)
            df = df.drop(self.cols_to_remove, axis=1)

        if self.verbose:
            print('## DataFrame shape is now:', df.shape)

        # convert to numerical where possible
        #print('## converting numerical data to numeric dtype')
        #df = df.convert_objects(convert_numeric=True)

        # convert columns specified to be int and float
        for col in self.numeric_columns:
            if self.verbose:
                print('converting', col)
            df[col] = pd.to_numeric(df[col], errors='coerce')
            if self.verbose:
                print(df[col].dtype)
        df = df.drop(self.drop_columns, axis=1)

        # drop all those that are object type
        print('## dropping non-numerical columns')
        for col in df.columns:
            if df[col].dtype == 'int64' or df[col].dtype == 'float64' or df[col].dtype == 'bool':
                pass
            else:
                if self.verbose:
                    print('dropping because not int, float, or bool:', col)
                df = df.drop([col], axis=1)
        return df

    def _to_int(self, num):
        try:
            return int(num)
        except:
            return

    def _to_float(self, num):
        try:
            return float(num)
        except:
            return

    def random_sample(self, df, fraction=0.2, n_samples=None):
        """
        splits into random samples
        - n_samples: how many samples you want returned (default = All)
        - fraction : what fraction of data to include in the sample (default = 0.2)
        """
        print('constructing random samples')
        num_rows = len(df)
        len_sample = round(fraction * num_rows)
        # create list of slice index lists
        indices = list(range(0, num_rows))
        slice_list = []
        tmp_idx_list = []
        while len(indices) > 0:
            while len(tmp_idx_list) < len_sample and len(indices) > 0:
                idx = indices.pop(random.randrange(len(indices)))
                tmp_idx_list.append(idx)
            slice_list.append(tmp_idx_list)
            tmp_idx_list = []
        # get slices
        sample_list = []
        for s in range(n_samples):
            try:
                sample_list.append(df.loc[slice_list[s],:])
            except:
                pass
        return sample_list

    def write_csv(self, filename, include_actual=False):
        """
        write results to csv
        - include actual: if actual values are known for test set, and we want to print them
        """
        with open(filename, 'wb') as csvfile:
            writer = csv.writer(csvfile)
            headers = [self.id_column, self.target_column]
            if include_actual:
                headers.append('actual')
            writer.writerow(headers)
            try:
                for idx, value in enumerate(self.output):
                    test_id = self.test_df[self.id_column][idx]
                    test_output = self.output[idx]
                    to_write = [test_id, test_output]
                    if include_actual:
                        to_write.append(self.test_df[self.target_column][idx])
                    writer.writerow(to_write)
                print('results written to ' + filename)
            except:
                print('write_csv failed')

    def save(self, filename='xgb.pkl'):
        joblib.dump(self, filename)

    def load(self, model_file='xgb.pkl'):
        xgb = joblib.load(model_file)
        return xgb
Example #10
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective='reg:gamma',
 nthread=4,
 scale_pos_weight=1,
 seed=1024)

##### Parameter 1: max_depth

xgb_param = xgb1.get_xgb_params()
cvresult = xgb.cv(xgb_param, Dtrain, num_boost_round=xgb1.get_params()['n_estimators'], nfold=5,
        metrics='rmse', early_stopping_rounds=50)
xgb1.set_params(n_estimators=cvresult.shape[0])

param_test1 = {
 'max_depth':[3,4,5,6,7],
 'min_child_weight':[3,4,5,6,7]
}
gsearch1 = GridSearchCV(estimator = XGBRegressor(
 learning_rate =0.1,
 n_estimators=1000,