Example No. 1
class my_model:
    def __init__(self, d):
        self.linear_reg = linear_model.Ridge()
        self.xgb_reg = XGBRegressor(max_depth=7)
        self.d = d

    def fit(self, X, y):
        self.linear_reg.fit(X[:, 0].reshape(-1, 1), y)
        self.l_reg_res = self.linear_reg.predict(X[:, 0].reshape(-1, 1))
        self.xgb_reg.fit(X[:, 1:], y - self.l_reg_res)
        X_nn = np.hstack([
            X,
            self.xgb_reg.predict(X[:, 1:]).reshape(-1, 1),
            self.l_reg_res.reshape(-1, 1)
        ])
        return X_nn

    def predict(self, X):
        if isinstance(X[0, -1], str):
            for i in range(X.shape[0]):
                X[i, -1] = self.d[X[i, -1]]
            X = X.astype(np.float64, copy=False)

        X_nn_final = np.hstack([
            X,
            self.xgb_reg.predict(X[:, 1:]).reshape(-1, 1),
            self.linear_reg.predict(X[:, 0].reshape(-1, 1)).reshape(-1, 1)  # same column the linear model was fitted on
        ])
        return X_nn_final
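A minimal usage sketch for the class above, assuming the imports it relies on (numpy, scikit-learn's linear_model, xgboost's XGBRegressor) and a hypothetical category-to-code mapping d; the data values are illustrative only:

# Hypothetical usage of my_model; the data and the mapping dict d are illustrative.
import numpy as np
from sklearn import linear_model
from xgboost import XGBRegressor

d = {'cat_a': 0.0, 'cat_b': 1.0}   # hypothetical mapping used by predict() for string categories
X = np.array([[1.0, 2.0, 0.0],
              [2.0, 3.0, 1.0],
              [3.0, 5.0, 0.0],
              [4.0, 4.0, 1.0]])
y = np.array([1.0, 2.0, 3.0, 4.0])

m = my_model(d)
X_nn = m.fit(X, y)          # Ridge on column 0, XGBoost on the residual; returns stacked features
X_nn_final = m.predict(X)   # same stacking at prediction time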
Example No. 2
def train_xg_boost(params):
    xg_model = XGBRegressor(n_estimators=int(params['n_estimators']), learning_rate=params['eta'], n_jobs=-1,
                            max_depth=int(params['max_depth']), gamma=params['gamma'], colsample_bytree=params['colsample_bytree'],
                            min_child_weight=params['min_child_weight'], reg_alpha=params['xg_reg_alpha'], subsample=params['subsample'],
                            reg_lambda=params['xg_reg_lambda']
                            )

    xg_model.fit(X_train.values, y_train.values)
    training_values = xg_model.predict(X_train.values)
    print(training_values)
    training_rmse = math.sqrt(mean_squared_error(y_train, training_values))
    print("training_rmse", training_rmse)
    validation_values = xg_model.predict(X_validtn.values)
    validation_rmse = math.sqrt(mean_squared_error(y_validtn, validation_values))
    print("validation_rmse", validation_rmse)
    """test_submission = pd.DataFrame()
    test_submission["Score"] = xg_model.predict(combined_test_data)
    test_submission.to_excel('submission4.xlsx', index=False)"""

    return {
        'loss': validation_rmse,
        'status': STATUS_OK,
        'eval_time': time.time(),
    }
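train_xg_boost above follows the hyperopt objective convention (it returns a dict with 'loss' and STATUS_OK, and reads the data splits from module-level globals). A sketch of how it might be wired into hyperopt's fmin; the search-space bounds are illustrative assumptions:

# Hypothetical hyperopt search over the parameters train_xg_boost expects.
from hyperopt import fmin, tpe, hp, Trials

space = {
    'n_estimators': hp.quniform('n_estimators', 100, 1000, 50),
    'eta': hp.uniform('eta', 0.01, 0.3),
    'max_depth': hp.quniform('max_depth', 3, 10, 1),
    'gamma': hp.uniform('gamma', 0, 5),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'xg_reg_alpha': hp.uniform('xg_reg_alpha', 0, 1),
    'xg_reg_lambda': hp.uniform('xg_reg_lambda', 0, 2),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
}

trials = Trials()
best = fmin(fn=train_xg_boost, space=space, algo=tpe.suggest,
            max_evals=50, trials=trials)
print(best)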
def xgb_regression(X_train,y_train,X_val, y_val,X_test,y_test,args):
    if y_test.shape[-1] == 1:
        model = XGBRegressor(
            learning_rate=0.1,
            max_depth=4,  # 4
            min_child_weight=10,
            gamma=1,  # 1
            subsample=0.8,
            colsample_bytree=0.8,
            reg_alpha=0.8,
            objective='reg:linear',
            n_estimators=2000,
            tree_method='gpu_hist',
            n_gpus=-1
        )
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='rmse',
                  early_stopping_rounds=300)
        y_pred = model.predict(X_test)
        y_test = y_test.astype('float')
        MSE = mean_squared_error(y_test, y_pred)
        RMSE = MSE ** 0.5
        return RMSE
    else:
        RMSEs = []
        if len(y_train.shape) == 3:
            y_train = [x[0] for x in y_train]
            y_val = [x[0] for x in y_val]
            y_test = [x[0] for x in y_test]
            y_train = pd.DataFrame(y_train)
            y_val = pd.DataFrame(y_val)
            y_test = pd.DataFrame(y_test)
        for i in range(y_test.shape[1]):
            if float(max(y_val[i])) == 0 or float(max(y_train[i])) == 0 or float(max(y_test[i])) == 0:
                continue
            model = XGBRegressor(
                learning_rate=0.1,
                max_depth=4,  # 4
                min_child_weight=10,
                gamma=1,  # 1
                subsample=0.8,
                colsample_bytree=0.8,
                reg_alpha=0.8,
                objective='reg:linear',
                n_estimators=2000,
                tree_method='gpu_hist',
                n_gpus=-1
            )
            model.fit(X_train, [float(k) for k in y_train[i]], eval_set=[(X_val, [float(k) for k in y_val[i]])], eval_metric='rmse',
                      early_stopping_rounds=300)
            y_pred = model.predict(X_test)
            y_test = y_test.astype('float')
            MSE = mean_squared_error(y_test[i], y_pred)
            RMSE = MSE ** 0.5
            RMSEs.append(RMSE)
        return np.mean(RMSEs)
Example No. 4
def main():
    qresult = connect_db('solar.db', 'dip')
    smiles, compounds, gaps = get_data(qresult)
    mols = get_mols(smiles)
    fps_morgan, failed_mols = get_fingerprints(mols)
    refine_compounds(compounds, mols, gaps, failed_mols)
    compound_array = np.array(compounds)
    gaps_array = np.array(gaps)
    train_id, test_id, y_train, y_test = train_test_split(compound_array,
                                                          gaps_array,
                                                          test_size=0.20,
                                                          random_state=0)
    train_fps = get_fp_from_id(compounds, fps_morgan, train_id)
    test_fps = get_fp_from_id(compounds, fps_morgan, test_id)
    xgb1 = XGBRegressor(n_estimators=2000,
                        learning_rate=0.03,
                        max_depth=7,
                        colsample_bytree=0.6,
                        nthread=8,
                        scale_pos_weight=1,
                        gamma=0,
                        random_state=0,
                        subsample=0.6,
                        min_child_weight=3,
                        early_stopping_rounds=10,
                        reg_alpha=1)
    modelfit(xgb1, train_fps, y_train)
    #xgb1 = joblib.load('gbdt_dip_xgb.joblib')
    #joblib.dump(xgb1, 'gbdt_dip_xgb2.joblib')
    y_pred_cv = cvp(xgb1, train_fps, y_train, cv=4, n_jobs=8)
    y_train_pred = xgb1.predict(train_fps)
    y_pred_test = xgb1.predict(test_fps)
    train_df = pd.DataFrame()
    test_df = pd.DataFrame()
    train_df['id'] = pd.Series(train_id)
    train_df['dip_exp'] = pd.Series(y_train)
    train_df['dip_cv'] = pd.Series(y_pred_cv)
    train_df['dip_gbdt'] = pd.Series(y_train_pred)
    train_df['Group'] = 'Train'
    test_df['id'] = pd.Series(test_id)
    test_df['dip_exp'] = pd.Series(y_test)
    test_df['dip_cv'] = pd.Series(y_pred_test)
    test_df['dip_gbdt'] = pd.Series(y_pred_test)
    test_df['Group'] = 'Test'
    result_df = pd.concat([train_df, test_df])

    result_df.to_csv('dip_xgb_train_test.csv')
    test_err = mean_squared_error(y_pred_test, y_test)
    print('Test error: {:.4f}'.format(np.sqrt(test_err)))
Example No. 5
class XGBRegressorMetaPrim(primitive):
    def __init__(self, random_state=0):
        super(XGBRegressorMetaPrim, self).__init__(name='XGBRegressorMeta')
        self.hyperparams = []
        self.type = 'ensemble'
        self.description = "XGBoost is an optimized distributed gradient boosting library designed to be highly efficient, flexible and portable."
        self.hyperparams_run = {'default': True}
        self.random_state = random_state
        self.model = XGBRegressor(random_state=self.random_state, n_jobs=5)
        self.accept_type = 'xgb'

    def can_accept(self, data):
        return self.can_accept_c(data, 'Regression')

    def is_needed(self, data):
        # data = handle_data(data)
        return True

    def fit(self, data):
        data = handle_data(data)
        self.model.fit(data['X'], data['Y'])

    def produce(self, data):
        output = handle_data(data)
        output['predictions'] = self.model.predict(output['X'])
        output['X'] = pd.DataFrame(output['predictions'], columns=[self.name+"Pred"])
        final_output = {0: output}
        return final_output
Example No. 6
def xgt_regressor(lr, max_d, estimators, X_train, X_test, y_train, y_test, obj):
    rmse = 10000
    for i in lr:
        for j in max_d:
            for k in estimators:

                clf=XGBRegressor(learning_rate=i,
                                n_estimators=k,
                                max_depth=j, 
                                min_child_weight=1, 
                                gamma=1,
                                subsample=0.5,
                                colsample_bytree=0.8,
                                objective=obj,
                                nthread=4,
                                scale_pos_weight=1, 
                                missing=np.nan)
                clf.fit(X_train,y_train)
                y_pred=clf.predict(X_test)
                
                a,b,c = pred_eval(y_pred,y_test)

                if a < rmse:
                    b_lr = i
                    b_d = j
                    b_e = k
                    
                    rmse = a
                    clf_b = clf
        
    return clf_b, (b_lr, b_d, b_e)
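A sketch of calling the grid search above; pred_eval and the train/test splits are assumed to exist in the surrounding code, and the candidate values are illustrative:

# Hypothetical call; pred_eval is assumed to return its first value as the RMSE to minimize.
lrs = [0.05, 0.1]
depths = [3, 5]
n_ests = [100, 300]
best_clf, (best_lr, best_depth, best_n) = xgt_regressor(
    lrs, depths, n_ests, X_train, X_test, y_train, y_test, obj='reg:squarederror')
print(best_lr, best_depth, best_n)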
Example No. 7
def xgboostmodel(self):
    # datafile is expected to be defined in the surrounding module
    df = pd.read_csv(datafile, encoding='utf-8', index_col=0)
    print(df.shape)
    traindata = df.iloc[:, :].values
    x = traindata[:, :-1]
    y = traindata[:, -1]
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, train_size=0.7)  # list
    if self.params is None:
        params = {'max_depth': 80, 'n_estimators': 512}
    else:
        params = self.params
    # note: params is assembled above but this snippet fits the model with fixed settings
    raw_model = XGBRegressor(max_depth=128,
                             n_estimators=768,
                             learning_rate=0.01,
                             silent=False)
    raw_model.fit(x_train, y_train)
    raw_model.save_model(self.model_file)
    pred = raw_model.predict(x_test)
    self.true = y_test
    self.pred = pred
    self.show_save_figure(fig_path=self.fig_path,
                          modelname=self.job_name,
                          detal_idx=500)
    t_mean = self.cal_mean(self.true)
    p_mean = self.cal_mean(self.pred)
    self.save_result(self.result_path, true_mean=t_mean, pred_mean=p_mean)
Example No. 8
def XGB_reg_evaluation(individual, evaluation_method='roll_win'):
    '''
    evaluation_method : can be roll_win, mse
    '''

    if evaluation_method == 'roll_win':
        trainNumber = individual[6]  # the train num
        param = {
            'eta': individual[0],
            'silent': True,
            'objective': "reg:linear",
            'nthread': -1,
            'min_child_weight': individual[1],
            'max_depth': individual[2],
            'subsample': individual[3],
            'colsample_bylevel': individual[4],
            'seed': 0
        }
        roll_win_mseValue = 0
        for i in range(N_validation):
            trainingX, trainingY = trainX[(trainNum - (i + 1) * window - trainNumber):(trainNum - (i + 1) * window),:],\
                                          trainY[(trainNum - (i + 1) * window - trainNumber):(trainNum - (i + 1) * window)]

            testingX, testingY= trainX[(trainNum - (i + 1) * window):(trainNum - i * window),:], \
                                       trainY[(trainNum - (i + 1) * window):(trainNum - i * window)]
            dtrain = xgb.DMatrix(data=trainingX, label=trainingY)
            bst = xgb.train(params=param,
                            dtrain=dtrain,
                            num_boost_round=individual[5])
            testingX = xgb.DMatrix(testingX)
            roll_win_mseValue += sum(
                (testingY - bst.predict(testingX))**2) / window
        roll_win_mseValue /= N_validation
        return (roll_win_mseValue, )

    if evaluation_method == 'mse':
        ### The cross validation evaluation
        N_SPLITS = N_splits
        kf = KFold(n_splits=N_SPLITS)
        cv_mseValue = 0
        fc = XGBRegressor(learning_rate=individual[0],
                          n_estimators=individual[5],
                          silent=True,
                          objective="reg:linear",
                          nthread=-1,
                          gamma=0,
                          min_child_weight=individual[1],
                          max_depth=individual[2],
                          subsample=individual[3],
                          colsample_bylevel=individual[4],
                          seed=0)
        for train, test in kf.split(trainX):
            fc.fit(trainX[train, :], trainY[train])
            cv_mseValue += sum(
                (trainY[test] - fc.predict(trainX[test, :]))**2) / len(test)
        cv_mseValue = cv_mseValue / N_SPLITS
        return (cv_mseValue, )

    print "There is no evaluation method for %s" % evaluation_method
    raise Exception("evaluation_method is not valid")
Example No. 9
def cbd_model(cbd_df,cbd_finalinput):
    '''
    function that creates model from the cbd dataframe and returns the predicted
    number of crimes for the next three days
    '''

    X_cbd=cbd_df[['year', 'month', 'day', 'tmax', 'tmin', 'consumer_price_index',
       'gdp_millions_2007', 'seasonally_adjusted_unemployment',
       'unadjusted_unemployment', 'Possession, cocaine ',
       'Heroin, possession ', 'Heroin Price Canada',
       'day_segment_1200pm-1159pm', 'day_of_week_Monday',
       'day_of_week_Saturday', 'day_of_week_Sunday', 'day_of_week_Thursday',
       'day_of_week_Tuesday', 'day_of_week_Wednesday']]
    y_cbd=cbd_df['number_of_crimes']


    scaler = StandardScaler()
    scaler.fit(X_cbd)  # Don't cheat - fit only on training data
    X_cbd = scaler.transform(X_cbd)
    cbd_input_scaled = scaler.transform(cbd_finalinput)
    xgb=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
    xgb.fit(X_cbd,y_cbd)
    predict_cbd=xgb.predict(cbd_input_scaled)

    return predict_cbd
def get_ntree():
    rmse_t_total, rmse_v_total = [], []
    for ntree in range(10, 500, 10):
        xgb_base = XGBRegressor(objective='reg:linear',
                                n_estimators=ntree,
                                random_state=1234,
                                silent=0,
                                booster='gbtree',
                                eval_metric='rmse')
        rmse_t_1, rmse_v_1 = [], []
        print('current ntree = %s' % ntree)
        for train, test in get_cv(y=y_train, n_splits=5, random_state=42):
            X_t, y_t = X_train[train], y_train[train]
            X_v, y_v = X_train[test], y_train[test]
            xgb_base.fit(X_t, y_t)
            y_t_pre = xgb_base.predict(X_t)
            y_v_pre = xgb_base.predict(X_v)
            rmse_t_each = np.sqrt(mean_squared_error(y_t, y_t_pre))
            rmse_v_each = np.sqrt(mean_squared_error(y_v, y_v_pre))
            rmse_t_1.append(rmse_t_each)
            rmse_v_1.append(rmse_v_each)
        rmse_t = np.mean(rmse_t_1)
        rmse_v = np.mean(rmse_v_1)
        rmse_t_total.append(rmse_t)
        rmse_v_total.append(rmse_v)

    return rmse_t_total, rmse_v_total
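The two lists returned above can be plotted against the n_estimators grid to pick the tree count where validation RMSE stops improving; a brief sketch, assuming matplotlib and the globals used by get_ntree (X_train, y_train, get_cv) are available:

# Plot train/validation RMSE versus the same n_estimators grid used in get_ntree.
import matplotlib.pyplot as plt

ntrees = list(range(10, 500, 10))
rmse_t_total, rmse_v_total = get_ntree()

plt.plot(ntrees, rmse_t_total, label='train RMSE')
plt.plot(ntrees, rmse_v_total, label='validation RMSE')
plt.xlabel('n_estimators')
plt.ylabel('RMSE')
plt.legend()
plt.show()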
Example No. 11
    def xgboost_single_pred(self):

        x_train = self.x_train
        y_train = self.y_train

        x_test = self.x_test
        y_test = self.y_test

        self.y_pred_all_xgb = []
        y_train = list(y_train)
        xgboost_clf = XGBRegressor(learning_rate=0.1, n_estimators=75)

        for i in range(len(x_test)):
            xgboost_clf.fit(x_train, y_train)
            x_test_one = x_test.iloc[i:i + 1]
            y_test_one = xgboost_clf.predict(x_test_one)
            self.y_pred_all_xgb.append(list(y_test_one)[0])
            x_train = pd.concat([x_train, x_test_one])
            y_train.append(y_test[i])

        xgboost_mse = mean_squared_error(self.y_test, self.y_pred_all_xgb)
        xgboost_rmse = np.sqrt(xgboost_mse)
        y_pred_all_xgb = pd.DataFrame(list(self.y_pred_all_xgb))
        ratio_single_xgb = pd.DataFrame(list(self.y_test)) / y_pred_all_xgb
        return xgboost_rmse, y_pred_all_xgb, ratio_single_xgb
Example No. 12
def subfeat_stacking(train1,train2,test,sub=0.75,repeat=20):
    predictors = [x for x in train1.columns if x not in ['ID', 'y']]
    y_train2 = np.zeros((train2.shape[0], repeat))
    y_test = np.zeros((test.shape[0], repeat))
    for i in range(repeat):
        import random
        random.seed(i)
        random.shuffle(predictors)
        predictors_sub = predictors[:int(len(predictors)*sub)]
        model = XGBRegressor(max_depth=4, learning_rate=0.0045, n_estimators=1250,
                             silent=True, objective='reg:linear', nthread=-1, min_child_weight=1,
                             max_delta_step=0, subsample=0.93, seed=27)
        model.fit(train1[predictors_sub], train1['y'])
        y_train2[:, i] = model.predict(train2[predictors_sub])
        y_test[:, i] = model.predict(test[predictors_sub])
    return y_train2,y_test
Example No. 13
def model_intrv3(Y_train, X_train, Y_test, X_test, Targ):
    global reslts
    global metrs
    import pandas as pd
    import numpy as np
    import datetime as dt
    import sklearn
    from sklearn.metrics import mean_squared_error
    from xgboost.sklearn import XGBRegressor
    model = XGBRegressor(n_estimators=200,
                         learning_rate=0.05,
                         max_depth=4,
                         random_state=0,
                         subsample=0.9,
                         colsample_bytree=1.0).fit(X_train, Y_train)
    model.score(X_test, Y_test)

    pred_Yxgb = model.predict(X_test)
    mse = mean_squared_error(Y_test, pred_Yxgb)
    nRMSE = np.sqrt(mse) / Targ.mean()
    # nRMSE=np.sqrt(mse)/max(Targ)
    Yts_pd = {'Yts': Y_test, 'Ypd': pred_Yxgb}
    Yts_pd = pd.DataFrame(Yts_pd)
    print(mse, nRMSE)
    metrs = {'mse': mse, 'nRMSE': nRMSE}
    reslts = {'Ypred': pred_Yxgb, 'Yts_pd': Yts_pd}
    return {'Yts_pd': Yts_pd, 'mse': mse, 'nRMSE': nRMSE}
Example No. 14
def over_sample(train, test, feat):
    predictors = [x for x in train.columns if x not in ['ID', 'y']]
    groups = list(train[feat].unique())
    result = None
    for name in groups:
        train_temp = pd.concat([train, train[train[feat] == name]])
        test_temp = test[test[feat] == name]
        model = XGBRegressor(max_depth=4,
                             learning_rate=0.0045,
                             n_estimators=1250,
                             silent=True,
                             objective='reg:linear',
                             nthread=-1,
                             min_child_weight=1,
                             max_delta_step=0,
                             subsample=0.93,
                             seed=27)
        model.fit(train_temp[predictors], train_temp['y'])
        pred = model.predict(test_temp[predictors])
        if result is None:
            result = pd.DataFrame({'ID': test_temp['ID'].values, 'y': pred})
        else:
            result = pd.concat([
                result,
                pd.DataFrame({
                    'ID': test_temp['ID'].values,
                    'y': pred
                })
            ])
    result.sort_values('ID', inplace=True)

    return result
class XGBWrapper_regr(object):
    """
    A wrapper for xgboost model so that we will have a single api for various models.
    """

    def __init__(self):
        self.model = XGBRegressor()

    def fit(self, X_train, y_train, X_valid=None, y_valid=None, X_holdout=None, y_holdout=None, params=None):

        self.model = self.model.set_params(**params)
        
        eval_set = [(X_train, y_train)]
        if X_valid is not None:
            eval_set.append((X_valid, y_valid))
        if X_holdout is not None:
            eval_set.append((X_holdout, y_holdout))

        self.model.fit(X=X_train, y=y_train,
                       eval_set=eval_set, eval_metric='rmse',
                       verbose=params['verbose'], early_stopping_rounds=params['early_stopping_rounds'])

        scores = self.model.evals_result()
        self.best_score_ = {k: {m: m_v[-1] for m, m_v in v.items()} for k, v in scores.items()}
#         self.best_score_ = {k: {m: n if m != 'cappa' else -n for m, n in v.items()} for k, v in self.best_score_.items()}

        self.feature_importances_ = self.model.feature_importances_
    
    def predict(self, X_test):
        return self.model.predict(X_test)
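A usage sketch for the wrapper above; fit() expects the params dict to carry 'verbose' and 'early_stopping_rounds' alongside any XGBRegressor parameters, and the data splits below are assumed (values illustrative):

# Hypothetical usage of XGBWrapper_regr; parameter values are illustrative, not tuned.
params = {
    'n_estimators': 500,
    'learning_rate': 0.05,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'verbose': 100,                  # read back by fit() and passed to XGBRegressor.fit
    'early_stopping_rounds': 50,     # likewise read back by fit()
}

wrapper = XGBWrapper_regr()
wrapper.fit(X_train, y_train, X_valid=X_valid, y_valid=y_valid, params=params)
preds = wrapper.predict(X_test)
print(wrapper.best_score_)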
Example No. 16
def go(data_dict, feats_to_use, params={"seed": 0, "silent": False, "n_jobs": -1},
       parameter_tuning=False):
    
    '''
    if with_gpu:
        xgb = XGBRegressor(seed=0, silent=False, tree_method='gpu_hist', n_gpus=-1)
    else:
        xgb = XGBRegressor(seed=0, silent=False, n_jobs=-1)
    '''
    X_train=data_dict['X_train'][feats_to_use].copy()
    y_train=data_dict['y_train'].copy()
    X_test=data_dict['X_test'][feats_to_use].copy()
    X_val=data_dict['X_val'][feats_to_use].copy()
    y_val=data_dict['y_val'].copy()

    
    
    if parameter_tuning:
        fit_params={
        "early_stopping_rounds":10, 
        "eval_metric" : "rmse", 
        "eval_set" : [(X_val,y_val)]}
        xgb=XGBRegressor() 
        train_val_features=pd.concat([X_train,X_val])
        train_val_labels=pd.concat([y_train,y_val])
        test_fold = np.zeros(train_val_features.shape[0])   # initialize all index to 0
        test_fold[:X_train.shape[0]] = -1   # set index of training set to -1, indicating not to use it in validation
        
        ps=PredefinedSplit(test_fold=test_fold)
        X_train=data_dict['X_train'][feats_to_use]
        y_train=data_dict['y_train']
        X_test=data_dict['X_test'][feats_to_use]
        grid=GridSearchCV(xgb,params,fit_params=fit_params,scoring=RMSE , cv=ps, verbose=32, n_jobs=-1)
        start=time.time()
        grid.fit(train_val_features,train_val_labels)
        elapsed=time.time()-start
        print (elapsed)
        print ('best params:',grid.best_params_)
        print ('best score:',grid.best_score_)

        return grid.best_params_, grid.best_estimator_
        
    else:
        xgb=XGBRegressor(**params)
        print (xgb)
    
        print ('start xgboost training')
        start=time.time()
        eval_set=[(X_val,y_val)]
        xgb.fit(X_train,y_train, eval_set=eval_set,eval_metric='rmse',early_stopping_rounds=30)
        elapsed=time.time()-start
        print (elapsed)
        data_dict['y_pred']=np.exp(xgb.predict(X_test))-1

        #generate submission
        data_dict['X_test']['item_cnt_month']=data_dict['y_pred']
        test=pd.read_csv('test.csv')
        submission=pd.merge(test,data_dict['X_test'], 
            on=['shop_id','item_id'],how='left')[['ID','item_cnt_month']]
        return submission, xgb
Example No. 17
def xgbt_base_rmse_mode(train_input, train_target, test_input, test_target):
    param = {
        'n_estimators': 10,
        'learning_rate': 0.01,
    }

    adj_params = {
        'n_estimators': [10, 50, 100, 200, 300, 400, 500, 1000],
        'learning_rate': [0.01, 0.1, 1]
    }

    xgbt = XGBRegressor(**param)
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    cscv = GridSearchCV(xgbt,
                        adj_params,
                        scoring='neg_mean_absolute_error',
                        cv=cv,
                        n_jobs=-1)
    cscv.fit(train_input, train_target)
    xgbt = XGBRegressor(**cscv.best_params_)
    xgbt.fit(train_input, train_target.ravel())
    predicted = xgbt.predict(test_input)
    xgbt_base_rmse = np.sqrt(metrics.mean_squared_error(
        test_target, predicted))
    print("xgbt_base_rmse: ", xgbt_base_rmse)
    #print ("RMSE:", np.sqrt(metrics.mean_squared_error(test_target, predicted)))
    return xgbt_base_rmse
Example No. 18
def build_model(train, test, pred, label, seed=1080, is_shuffle=True):
    train_pred = np.zeros((train.shape[0], ))
    test_pred = np.zeros((test.shape[0], ))
    n_splits = 10
    # Kfold
    fold = KFold(n_splits=n_splits, shuffle=is_shuffle, random_state=seed)
    kf_way = fold.split(train[pred])
    # params
    params = {
        'booster': 'gbtree',
        'objective': 'reg:gamma',
        'gamma': 0.1,
        'max_depth': 5,
        'lambda': 3,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'min_child_weight': 3,
        'silent': 1,
        'eta': 0.1,
        'seed': seed,
        'nthread': 8,
        'eval_metric': 'rmse'
    }
    # train
    for n_fold, (train_idx, valid_idx) in enumerate(kf_way, start=1):
        train_x, train_y = train[pred].iloc[train_idx], train[label].iloc[
            train_idx]
        valid_x, valid_y = train[pred].iloc[valid_idx], train[label].iloc[
            valid_idx]
        # data loading
        n_train = xgb.DMatrix(train_x, label=train_y)
        n_valid = xgb.DMatrix(valid_x, label=valid_y)

        xgbModel = XGBRegressor(max_depth=30,
                                learning_rate=0.1,
                                n_estimators=5,
                                objective='reg:logistic',
                                booster='gbtree',
                                gamma=0.1,
                                seed=seed)
        xgbModel.fit(train_x, train_y, verbose=True)
        train_pred[valid_idx] = xgbModel.predict(valid_x)
        test_pred += xgbModel.predict(test[pred]) / fold.n_splits

    test['label'] = test_pred

    return test[['loadingOrder', 'label']]
Example No. 19
def alternative_minimization_xgboost(indexes_listeners_train, x_data_users,
                                     y_data_users, number_points_user, params,
                                     llambda, f):
    old_obj_val = 1e6
    obj_val = -1e6
    ridge_target = np.copy(y_data_users)

    ### Stopping criterion
    start_time = time.time()
    number_loops = -1
    while (old_obj_val - obj_val) / old_obj_val > 1e-2:
        number_loops += 1

        xgb_model = XGBRegressor(n_estimators=params['n_estimators'],
                                 learning_rate=params['learning_rate'],
                                 max_depth=params['max_depth'],
                                 subsample=params['subsample'],
                                 colsample_bytree=params['colsample_bytree'])
        xgb_model.fit(x_data_users, ridge_target)

        EB_residuals = []
        aux = 0

        for i in range(len(indexes_listeners_train)):
            y_data_user = y_data_users[indexes_listeners_train[i]]
            x_data_user = x_data_users[indexes_listeners_train[i]]

            EB_residual_user = np.mean(
                np.array(y_data_user) -
                xgb_model.predict(np.array(x_data_user))) / float(
                    1 + llambda / float(len(y_data_user)))
            EB_residuals.append(EB_residual_user)

            for idx in indexes_listeners_train[i]:
                ridge_target[idx] = y_data_users[idx] - EB_residual_user

        old_obj_val = obj_val
        obj_val = np.linalg.norm(ridge_target - xgb_model.predict(
            x_data_users))**2 + llambda * np.linalg.norm(EB_residuals)**2
        #print 'Objval: '+str(obj_val), (old_obj_val- obj_val) / old_obj_val

    write_and_print('Number loops: ' + str(number_loops), f)
    write_and_print('Time alt-min: ' + str(time.time() - start_time), f)

    return xgb_model, EB_residuals
Example No. 20
def cv_test(train, cv=5):

    t0 = time.time()
    target = 'y'
    predictors = [x for x in train.columns if x not in ['ID', 'y']]
    train_X = train[predictors]
    train_Y = train[target]

    mean_r2 = []
    kf = KFold(n_splits=cv, shuffle=True, random_state=520)
    for i, (train_index, test_index) in enumerate(kf.split(train_X)):

        x_train = train_X.iloc[train_index]
        x_test = train_X.iloc[test_index]
        y_train = train_Y.iloc[train_index]
        y_test = train_Y.iloc[test_index]

        lgb_model = LGBMRegressor(boosting_type='gbdt',
                                  num_leaves=10,
                                  max_depth=4,
                                  learning_rate=0.005,
                                  n_estimators=675,
                                  max_bin=25,
                                  subsample_for_bin=50000,
                                  min_split_gain=0,
                                  min_child_weight=5,
                                  min_child_samples=10,
                                  subsample=0.995,
                                  subsample_freq=1,
                                  colsample_bytree=1,
                                  reg_alpha=0,
                                  reg_lambda=0,
                                  seed=0,
                                  nthread=-1,
                                  silent=True)
        xgb_model = XGBRegressor(max_depth=4,
                                 learning_rate=0.0045,
                                 n_estimators=1250,
                                 silent=True,
                                 objective='reg:linear',
                                 nthread=-1,
                                 min_child_weight=1,
                                 max_delta_step=0,
                                 subsample=0.93,
                                 seed=27)
        xgb_model.fit(x_train, y_train)

        pred = xgb_model.predict(x_test)
        from sklearn.metrics import r2_score
        score = r2_score(y_test, pred)
        mean_r2.append(score)
        print('{0}: r2:{1}\n\n'.format(i + 1, score))

    print(u'r2 mean: %s' % (np.array(mean_r2).mean()))
    print('Done in %.1fs!' % (time.time() - t0))

    return None
def run():
    # Load data set
    X_train, Y_train, X_test, submission_file_content = load_data()
    Y_train = np.log(Y_train + 200)

    # Cross validation
    cross_validation_iterator = ShuffleSplit(n_splits=CROSS_VALIDATION_NUM,
                                             test_size=0.1,
                                             random_state=0)
    for cross_validation_index, (train_index, valid_index) in enumerate(
            cross_validation_iterator.split(X_train), start=1):
        print("Working on {}/{} ...".format(cross_validation_index,
                                            CROSS_VALIDATION_NUM))

        submission_file_path = os.path.join(
            SUBMISSION_FOLDER_PATH,
            "submission_{}.csv".format(cross_validation_index))

        if os.path.isfile(submission_file_path):
            continue

        model = XGBRegressor(learning_rate=0.01,
                             max_depth=12,
                             n_estimators=N_ESTIMATORS,
                             silent=False,
                             objective="reg:linear",
                             gamma=1,
                             min_child_weight=1,
                             subsample=0.8,
                             colsample_bytree=0.5,
                             reg_alpha=1,
                             seed=cross_validation_index,
                             nthread=-1)

        model.fit(X_train[train_index],
                  Y_train[train_index],
                  eval_set=[(X_train[valid_index], Y_train[valid_index])],
                  eval_metric=lambda y_predicted, y_true:
                  ("actual_mae",
                   mean_absolute_error(np.exp(y_true.get_label()),
                                       np.exp(y_predicted))),
                  early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                  verbose=True)

        # Perform the testing procedure
        Y_test = model.predict(X_test)

        # Save submission to disk
        if not os.path.isdir(SUBMISSION_FOLDER_PATH):
            os.makedirs(SUBMISSION_FOLDER_PATH)
        submission_file_content[LABEL_COLUMN_NAME] = np.exp(Y_test) - 200
        submission_file_content.to_csv(submission_file_path, index=False)

    # Perform ensembling
    ensemble_predictions()

    print("All done!")
    def XGB_train(self,X_train, X_valid, labels_train, labels_valid, X_test, xgb_params_all):
        xgb_param_contrl = {'early_stopping_rounds': 100}
        xgb_params = xgb_params_all.copy()
        objective_type = xgb_params['objective_type']
        xgb_params.pop('objective_type')

        for k in xgb_param_contrl.keys():
            if k in xgb_params:
                xgb_param_contrl[k] = xgb_params[k]
                xgb_params.pop(k)

        if not self.config.retrain:
            # incremental training: reuse an existing model if one was saved
            model_load = self.load_model()
            if not model_load:
                print('Model {} does not exist, training from scratch'.format(self.modelName))
                if objective_type == 'regressor':
                    clf = XGBRegressor(**xgb_params)
                else:
                    clf = XGBClassifier(**xgb_params)
                clf.fit(X_train, labels_train, eval_set=[(X_valid, labels_valid)], eval_metric='rmse',
                        early_stopping_rounds=xgb_param_contrl['early_stopping_rounds'])
            else:
                clf = model_load.fit(X_train, labels_train, eval_set=[(X_valid, labels_valid)], eval_metric='rmse',
                        early_stopping_rounds=xgb_param_contrl['early_stopping_rounds'])
        else:
            if objective_type == 'regressor':
                clf = XGBRegressor(**xgb_params)
            else:
                clf = XGBClassifier(**xgb_params)


            clf.fit(X_train, labels_train, eval_set=[(X_valid, labels_valid)], eval_metric='rmse',
                    early_stopping_rounds=xgb_param_contrl['early_stopping_rounds'])

        val_xgb_pre = clf.predict(X_valid, ntree_limit=clf.best_iteration)
        test_xgb_pre = clf.predict(X_test, ntree_limit=clf.best_iteration)

        metrics_name = self.config.metrics_name
        myMetrics = defindMetrics.MyMetrics(metrics_name)
        score_xgb = myMetrics.metricsFunc(val_xgb_pre, labels_valid)
        self.save_model(clf, self.config.saveModel)
        return val_xgb_pre, test_xgb_pre, score_xgb
Example No. 23
def model(df, alpha):
    X = df
    y = df.pop('y')

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=.3,
                                                        random_state=42)

    upper = _random_search(
        X_train, y_train,
        XGBOOSTQUANTILE(quant_alpha=1 - alpha / 2,
                        quant_delta=1,
                        quant_thres=6,
                        quant_var=3.2), {
                            'quant_delta': uniform(.01, 12),
                            'quant_thres': uniform(1, 12),
                            'quant_var': uniform(1, 12)
                        }).predict(X_test)

    lower = _random_search(
        X_train, y_train,
        XGBOOSTQUANTILE(quant_alpha=alpha / 2,
                        quant_delta=1,
                        quant_thres=6,
                        quant_var=3.2), {
                            'quant_delta': uniform(.01, 12),
                            'quant_thres': uniform(1, 12),
                            'quant_var': uniform(1, 12)
                        }).predict(X_test)

    median = _random_search(
        X_train, y_train,
        XGBOOSTQUANTILE(quant_alpha=.5,
                        quant_delta=1,
                        quant_thres=6,
                        quant_var=3.2), {
                            'quant_delta': uniform(.01, 12),
                            'quant_thres': uniform(1, 12),
                            'quant_var': uniform(1, 12)
                        }).predict(X_test)

    xgbls = XGBRegressor()
    xgbls.fit(X_train, y_train)
    mean = xgbls.predict(X_test)

    return pd.concat([
        X_test.reset_index(drop=True),
        y_test.reset_index(drop=True),
        pd.DataFrame(upper, columns=['upper_bound']),
        pd.DataFrame(lower, columns=['lower_bound']),
        pd.DataFrame(mean, columns=['mean']),
        pd.DataFrame(median, columns=['median'])
    ],
                     axis=1)
Example No. 24
def xgbt_mode(train_input, train_target, test_input, test_target):
    param = {
        'n_estimators': 1000,
        'learning_rate': 0.01,
        'objective': 'reg:squarederror',
    }

    xgbt = XGBRegressor(**param)
    xgbt.fit(train_input, train_target.ravel())
    xgbt_predicted = xgbt.predict(test_input)
    return xgbt_predicted
Example No. 25
def stacking(Data, Test, Target, FoldNum):
    BaseModel = [RandomForestRegressor(), AdaBoostRegressor(), GradientBoostingRegressor(), ExtraTreesRegressor(),
                 SVR()]
    EnsembleModel = XGBRegressor()
    # initialize the models
    Scale = StandardScaler()
    Data = pd.DataFrame(Scale.fit_transform(Data))
    Test = test_process(Test)
    # standardize and normalize the data
    BaseTrainFold = []
    BaseTestFold = []
    BaseTargetFold = []
    KF = KFold(n_splits=FoldNum)
    for TrainIndex, TestIndex in KF.split(Data):
        BaseTrainFold.append(Data.iloc[TrainIndex].reset_index(drop=True))
        BaseTestFold.append(Data.iloc[TestIndex].reset_index(drop=True))
        BaseTargetFold.append(Target.iloc[TrainIndex].reset_index(drop=True))
    # split the data into folds for the base models
    EnsembleTrainFold = []
    EnsembleTestFold = []
    Mark = 0
    for Model in BaseModel:
        Mark += 1
        TrainFold = []
        TestFold = []
        for Num in range(FoldNum):
            Clf = Model
            Clf.fit(BaseTrainFold[Num], BaseTargetFold[Num])
            TrainFold.append(pd.DataFrame(data={"data" + str(Mark): Clf.predict(BaseTestFold[Num])}))
            TestFold.append(pd.DataFrame(data={"data" + str(Mark): Clf.predict(Test)}))
            if Num == FoldNum - 1:
                TrainTemp = TrainFold[0]
                TestTemp = TestFold[0]
                for Index in range(1, FoldNum):
                    TrainTemp = pd.concat([TrainTemp, TrainFold[Index]])
                    TestTemp = pd.concat([TestTemp, TestFold[Index]])
                TrainTemp.reset_index(inplace=True, drop=True)
                TestTemp.reset_index(inplace=True, drop=True)
                EnsembleTrainFold.append(TrainTemp)
                EnsembleTestFold.append(TestTemp)
    EnsembleTrain = EnsembleTrainFold[0]
    EnsembleTest = EnsembleTestFold[0]
    for Index in range(1, len(EnsembleTrainFold)):
        EnsembleTrain = pd.merge(EnsembleTrain, EnsembleTrainFold[Index], left_index=True, right_index=True)
        EnsembleTest = pd.merge(EnsembleTest, EnsembleTestFold[Index], left_index=True, right_index=True)
    # fit the ensemble model on the first-layer predictions
    EnsembleModel.fit(EnsembleTrain, Target)
    EnsembleResult = EnsembleModel.predict(EnsembleTest)
    Result = 0
    for Num in EnsembleResult:
        Result += Num
    Result = Result / len(EnsembleResult)
    # average the test-set predictions for the final output
    return Result
Example No. 26
def run_xgb(**args):
    print("building xgb model:")
    xgb_model = XGBRegressor()
    xgb_model.fit(args["training_data"], args["training_label"])
    output = xgb_model.predict(args["test_data"])
    pickle.dump(xgb_model, open("xgb_testmodel.p", "wb"))

    output = list(map(lambda e: round(e), output))
    print(output)
    pickle.dump(output, open("xgb_output.p", "wb"))
    return output
Example No. 27
def xgb(x_train, y_train, x_val, y_val):
    xgb = XGBRegressor(n_estimators=1000,
                       max_depth=10,
                       learning_rate=0.01,
                       subsample=0.8,
                       colsample_bytree=0.8,
                       random_state=2000)
    xgb.fit(x_train, y_train)
    result = xgb.predict(x_val)
    score = mean_absolute_error(result, y_val)
    return score
def xgboost_reg(train_df, target):

    if not os.path.isfile('Data/pickles/models/xgboost_model'):
        params = {
            'n_estimators': [10, 20, 30, 40, 50, 100, 250, 500, 1000],
            'max_depth': [1, 3, 5],
            'learning_rate': [
                0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.07, 0.08, 0.09, 0.1, 0.3,
                0.5, 0.7, 1
            ],
            'reg_alpha': [0, 0.001, 0.1, 0.5, 1, 2, 5],
            'reg_lambda': [0, 0.001, 0.1, 1, 2, 5],
            'n_jobs': [3],
            'early_stopping_rounds': [6]
        }

        model = XGBRegressor(objective='reg:linear')
        grid = GridSearchCV(estimator=model,
                            param_grid=params,
                            verbose=3,
                            cv=3,
                            scoring='neg_root_mean_squared_error')

        grid.fit(train_df, target)

        print(grid.best_params_)
        with open('Data/pickles/models/xgboost_model', 'wb') as file:
            boost_model = grid.best_estimator_
            pickle.dump(boost_model, file)

    else:
        with open('Data/pickles/models/xgboost_model', 'rb') as file:
            model = pickle.load(file)

    train_split_model = XGBRegressor(objective='reg:linear',
                                     learning_rate=0.08,
                                     max_depth=3,
                                     n_estimators=500,
                                     n_jobs=3,
                                     reg_alpha=0.001,
                                     reg_lambda=1)

    x_train, x_test, y_train, y_test = train_test_split(train_df, target)

    train_split_model.fit(x_train, y_train)

    y_pred = train_split_model.predict(x_test)
    '''best params: {'learning_rate': 0.08, 'max_depth': 3, 'n_estimators': 500, 'n_jobs': 3, 'reg_alpha': 0.001, 'reg_lambda': 1}'''

    print('RMSE:{}'.format(np.sqrt(mean_squared_error(y_test, y_pred))))

    return model
Example No. 29
def predict_tags(tag):
    training_dataset = pd.read_csv(CSV_DIR + 'train_' + tag)
    training_dataset = pd.get_dummies(training_dataset)

    test_dataset = pd.read_csv(CSV_DIR + 'test_' + tag)
    test_dataset = pd.get_dummies(test_dataset)

    X_train = training_dataset.drop(['outcome'], axis=1)
    y_train = training_dataset['outcome']

    X_test = test_dataset

    xgbcl = XGBRegressor()
    xgbcl.fit(X_train, y_train)

    return (list(xgbcl.predict(X_test)))
def run():
    # Load data set
    X_train, Y_train, X_test, submission_file_content = load_data()
    Y_train = np.log(Y_train + 200)

    # Cross validation
    cross_validation_iterator = ShuffleSplit(n_splits=CROSS_VALIDATION_NUM, test_size=0.1, random_state=0)
    for cross_validation_index, (train_index, valid_index) in enumerate(cross_validation_iterator.split(X_train), start=1):
        print("Working on {}/{} ...".format(cross_validation_index, CROSS_VALIDATION_NUM))

        submission_file_path = os.path.join(SUBMISSION_FOLDER_PATH, "submission_{}.csv".format(cross_validation_index))

        if os.path.isfile(submission_file_path):
            continue

        model = XGBRegressor(
            learning_rate=0.01,
            max_depth=12,
            n_estimators=N_ESTIMATORS,
            silent=False,
            objective="reg:linear",
            gamma=1,
            min_child_weight=1,
            subsample=0.8,
            colsample_bytree=0.5,
            reg_alpha=1,
            seed=cross_validation_index,
            nthread=-1)

        model.fit(X_train[train_index], Y_train[train_index], eval_set=[(X_train[valid_index], Y_train[valid_index])],
            eval_metric=lambda y_predicted, y_true:("actual_mae", mean_absolute_error(np.exp(y_true.get_label()), np.exp(y_predicted))),
            early_stopping_rounds=EARLY_STOPPING_ROUNDS, verbose=True)

        # Perform the testing procedure
        Y_test = model.predict(X_test)

        # Save submission to disk
        if not os.path.isdir(SUBMISSION_FOLDER_PATH):
            os.makedirs(SUBMISSION_FOLDER_PATH)
        submission_file_content[LABEL_COLUMN_NAME] = np.exp(Y_test) - 200
        submission_file_content.to_csv(submission_file_path, index=False)

    # Perform ensembling
    ensemble_predictions()

    print("All done!")
Example No. 31
class XGBoostRegressionModel():
    def __init__(self, name):
        self.model = XGBRegressor(n_estimators=1000,
                                  max_depth=10,
                                  learning_rate=0.001,
                                  random_state=0)

    def train(self, X, y, label, configs):

        X.reset_index()
        y.reset_index()
        distrs = [get_distribution(y)]
        index = ['Entire set']

        tprs = []
        aucs = []
        mean_fpr = np.linspace(0, 1, 10)  #CROSS VALIDATION CHANGE
        plt.figure(figsize=(10, 10))

        outcome_df = pd.DataFrame()

        kf = KFold(n_splits=5)

        for train_index, test_index in kf.split(X):
            training_X, testing_X = X.iloc[train_index], X.iloc[test_index]
            training_y, testing_y = y.iloc[train_index], y.iloc[test_index]

            # Train, predict and Plot
            self.model.fit(training_X, training_y)
            #y_pred_rt = self.model.predict_proba(testing_X)[:, 1]
            y_pred_rt = self.model.predict(testing_X)

            rmse = mean_squared_error(testing_y, y_pred_rt) ** 0.5

            performance_row = {"Root Mean Square Error": rmse}

            outcome_df = pd.concat([outcome_df, pd.DataFrame([performance_row])], ignore_index=True)
        outcome_df.to_csv("Outcomes/" + label + "RegressionStudent.csv")

        distr_df = pd.DataFrame(
            distrs,
            index=index,
            columns=[f'Label {l}' for l in range(np.max(y) + 1)])
        distr_df.to_csv(configs['model']['save_dir'] +
                        "-K-Fold-Distributions.csv",
                        index=True)
Example No. 32
def train_xgb_model(best_nodes, X_train_scaled, Y_train):

    rsg = XGBRegressor(gamma=best_nodes["gamma"],
                       max_depth=best_nodes["max_depth"],
                       learning_rate=best_nodes["learning_rate"],
                       min_child_weight=best_nodes["min_child_weight"],
                       subsample=best_nodes["subsample"],
                       colsample_bytree=best_nodes["colsample_bytree"],
                       reg_alpha=best_nodes["reg_alpha"],
                       reg_lambda=best_nodes["reg_lambda"],
                       n_estimators=int(best_nodes["n_estimators"]),
                       random_state=42)

    rsg.fit(X_train_scaled, Y_train)
    Y_pred = rsg.predict(X_train_scaled)
    print("mse:", np.mean((Y_pred - Y_train)**2))
    print("rmse:", np.sqrt(np.mean((Y_pred - Y_train)**2)))
    return rsg
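best_nodes is expected to map each hyperparameter named above to a single value (for example, the output of a hyperparameter search); a hypothetical example with illustrative values, assuming X_train_scaled and Y_train are already defined:

# Hypothetical best_nodes dict for train_xgb_model; values are illustrative only.
best_nodes = {
    'gamma': 0.1,
    'max_depth': 6,
    'learning_rate': 0.05,
    'min_child_weight': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.1,
    'reg_lambda': 1.0,
    'n_estimators': 400,
}

rsg = train_xgb_model(best_nodes, X_train_scaled, Y_train)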
Example No. 33
class Xgb:
    def __init__(self, df, target_column='', id_column='', target_type='binary',
                 categorical_columns=[], drop_columns=[], numeric_columns=[],
                 num_training_rounds=500, verbose=1, sample_fraction=1.0, n_samples=1,
                 early_stopping_rounds=None, prefix='xgb_model', scoring=None):
        """
        input params:
        - df (DataFrame): dataframe of training data
        - target_column (string): name of target column
        - id_column (string): name of id column
        - target_type (string): 'linear' or 'binary'
        - categorical_columns (list): list of column names of categorical data. Will perform one-hot encoding
        - drop_columns (list): list of columns to drop
        - numeric_columns (list): list of columns to convert to numeric
        - verbose (bool): verbosity of printouts
        """
        # checks for sampling
        sample_fraction = float(sample_fraction)
        if sample_fraction > 1:
            sample_fraction = 1.0
        if sample_fraction * n_samples > 1:
            n_samples = round(1.0/sample_fraction)
        if sample_fraction <= 0:
            print('sample_fraction 0 or negative, switching to 0.1')
            sample_fraction = 0.1
        # if sample_fraction results in a sample smaller than one row
        if round(sample_fraction * len(df)) == 0:
            sample_fraction = 1.0/len(df)
        # check if data is dataframe
        if type(df) == pd.core.frame.DataFrame:
            self.df = df
            self.early_stopping_rounds = early_stopping_rounds
            if target_column:
                self.target_column = target_column
                self.id_column = id_column
                self.target_type = target_type
                self.categorical_columns = categorical_columns
                self.numeric_columns = numeric_columns
                self.drop_columns = drop_columns
                self.verbose = verbose
                self.sample_fraction = sample_fraction
                self.n_samples = n_samples
                self.num_training_rounds = num_training_rounds
                self.prefix = prefix
                # init the classifier:
                if self.target_type == 'binary':
                    self.scoring = 'auc'
                    self.clf = XGBClassifier(
                        learning_rate =0.1,
                        n_estimators = num_training_rounds,
                        subsample = 0.8,
                        colsample_bytree = 0.8,
                        objective = 'binary:logistic',
                        scale_pos_weight = 1,
                        seed = 123)
                elif self.target_type == 'multiclass':
                    self.scoring = 'merror'
                    self.clf = XGBClassifier(
                        learning_rate =0.1,
                        n_estimators = num_training_rounds,
                        subsample = 0.8,
                        colsample_bytree = 0.8,
                        objective = 'multi:softmax',
                        scale_pos_weight = 1,
                        seed = 123)
                elif self.target_type == 'linear':
                    self.scoring = 'rmse'
                    self.clf = XGBRegressor(
                            n_estimators = num_training_rounds,
                            objective = 'reg:linear'
                            )
                # if preferred scoring metric is stated:
                if scoring:
                    self.scoring = scoring
            else:
                print('please provide target column name')
        else:
            print('please provide pandas dataframe')

    def train(self):
        print('#### preprocessing ####')
        self.df = self.preprocess(self.df)

        print('#### training ####')
        self.predictors = [x for x in self.df.columns if x not in [self.target_column, self.id_column]]
        xgb_param = self.clf.get_xgb_params()

        # if subsampling
        if self.sample_fraction == 1.0:
            df_list = [self.df]
        else:
            df_list = self.random_sample(df=self.df, fraction=self.sample_fraction, n_samples=self.n_samples)
        print(df_list)
        for idx, current_df in enumerate(df_list):
            print('ITERATION ' + str(idx) + ' of ' + str(self.n_samples) +', sample_fraction=' + str(self.sample_fraction))
            xgtrain  = xgb.DMatrix(current_df[self.predictors], label=current_df[self.target_column], missing=np.nan)
            try:
                cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                    metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds, show_progress=self.verbose)
            except:
                try:
                    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                        metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds, verbose_eval=self.verbose)
                except:
                    xgb_param['num_class'] = len(current_df[self.target_column].unique())
                    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                        metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds)
            self.clf.set_params(n_estimators=cvresult.shape[0])
            self.clf.fit(current_df[self.predictors], current_df[self.target_column], eval_metric=self.scoring)

            #Predict training set:
            train_df_predictions = self.clf.predict(current_df[self.predictors])

            if self.target_type == 'binary' or self.target_type == 'multiclass':
                train_df_predprob = self.clf.predict_proba(current_df[self.predictors])[:,1]
                print("Accuracy : %.4g" % metrics.accuracy_score(current_df[self.target_column].values, train_df_predictions))
                if self.target_type == 'binary':
                    print("AUC Score (Train): %f" % metrics.roc_auc_score(current_df[self.target_column], train_df_predprob))
            elif self.target_type == 'linear':
                print("Mean squared error: %f" % metrics.mean_squared_error(current_df[self.target_column].values, train_df_predictions))
                print("Root mean squared error: %f" % np.sqrt(metrics.mean_squared_error(current_df[self.target_column].values, train_df_predictions)))
            filename = self.prefix + '_' + str(idx) + '.pkl'
            self.save(filename)

    def predict(self, test_df, return_multi_outputs=False, return_mean_std=False):
        print('### predicting ###')
        print('## preprocessing test set')
        if self.id_column in test_df:
            ids = test_df[self.id_column]
        if self.target_column in test_df.columns:
            targets = test_df[self.target_column]
        self.test_df = self.preprocess(test_df, train=False)
        if self.id_column in test_df:
            self.test_df[self.id_column] = ids
        if self.target_column in test_df.columns:
            self.test_df[self.target_column] = targets
        for col in self.predictors:
            if col not in self.test_df.columns:
                self.test_df[col] = np.nan

        # prediction
        print('## predicting from test set')
        output_list = []
        output = None
        for idx, ns in enumerate(range(self.n_samples)):
            if self.n_samples == 1:
                xgb = self
                if self.target_type == 'binary':
                    output = xgb.clf.predict_proba(self.test_df[self.predictors])[:,1]
                elif self.target_type == 'linear':
                    output = xgb.clf.predict(self.test_df[self.predictors])
            else:
                try:
                    filename = self.prefix + '_' + str(idx) + '.pkl'
                    xgb = self.load(filename)
                    if self.target_type == 'binary':
                        output = xgb.clf.predict_proba(self.test_df[self.predictors])[:,1]
                    elif self.target_type == 'linear':
                        output = xgb.clf.predict(self.test_df[self.predictors])
                    output_list.append(list(output))
                except IOError:
                    print('no file found, skipping')
        # average the outputs if n_samples is more than one
        if self.n_samples == 1:
            self.output = output
            try:
                self.multi_outputs = [list(output)]
            except:
                self.multi_outputs = None
        else:
            self.output = np.mean(output_list, axis=0)
            self.multi_outputs = output_list
        if return_multi_outputs:
            return self.multi_outputs
        elif return_mean_std:
            return (self.output, np.std(output_list, axis=0))
        else:
            return self.output

    def feature_importance(self, num_print=10, display=True):
        feature_importance = sorted(list(self.clf.booster().get_fscore().items()), key = operator.itemgetter(1), reverse=True)

        impt = pd.DataFrame(feature_importance)
        impt.columns = ['feature', 'importance']
        print(impt[:num_print])
        if display:
            impt[:num_print].plot("feature", "importance", kind="barh", color=sns.color_palette("deep", 3))


    def preprocess(self, df, train=True):
        # one hot encoding of categorical variables
        print('## one hot encoding of categorical variables')
        for col in self.categorical_columns:
            if self.verbose:
                print('one hot encoding: ', col)
            df = pd.concat([df, pd.get_dummies(df[col]).rename(columns=lambda x: col+'_'+str(x))], axis=1)
            df = df.drop([col], axis=1)

        # if training, determine columns to be removed
        if train:
            # drop columns that are too sparse to be informative
            self.cols_to_remove = []
            print('## dropping columns below sparsity threshold')
            for col in df.columns:
                nan_cnt = 0
                for x in df[col]:
                    try:
                        if np.isnan(x):
                            nan_cnt += 1
                    except:
                        pass
                if nan_cnt/float(len(df[col])) > 0.6: # arbitrary cutoff, if more than 60% missing then drop
                    if self.verbose:
                        print('will drop', col)
                    self.cols_to_remove.append(col)

            # drop columns that have no standard deviation (not informative)
            print('## dropping columns with no variation')
            for col in df.columns:
                if df[col].dtype == 'int64' or df[col].dtype == 'float64':
                    if df[col].std() == 0:
                        print('will drop', col)
                        self.cols_to_remove.append(col)
        # drop the flagged columns whether or not verbose output is enabled
        if self.cols_to_remove:
            if self.verbose:
                print('dropping the following columns:', self.cols_to_remove)
            df = df.drop(self.cols_to_remove, axis=1)

        if self.verbose:
            print('## DataFrame shape is now:', df.shape)

        # convert to numerical where possible
        #print('## converting numerical data to numeric dtype')
        #df = df.convert_objects(convert_numeric=True)

        # convert columns specified to be int and float
        for col in self.numeric_columns:
            if self.verbose:
                print('converting', col)
            df[col] = pd.to_numeric(df[col], errors='coerce')
            if self.verbose:
                print(df[col].dtype)
        df = df.drop(self.drop_columns, axis=1)

        # drop all those that are object type
        print('## dropping non-numerical columns')
        for col in df.columns:
            if df[col].dtype == 'int64' or df[col].dtype == 'float64' or df[col].dtype == 'bool':
                pass
            else:
                if self.verbose:
                    print('dropping because not int, float, or bool:', col)
                df = df.drop([col], axis=1)
        return df

    def _to_int(self, num):
        try:
            return int(num)
        except (TypeError, ValueError):
            return None

    def _to_float(self, num):
        try:
            return float(num)
        except (TypeError, ValueError):
            return None

    def random_sample(self, df, fraction=0.2, n_samples=None):
        """
        splits into random samples
        - n_samples: how many samples you want returned (default = All)
        - fraction : what fraction of data to include in the sample (default = 0.2)
        """
        print('constructing random samples')
        num_rows = len(df)
        len_sample = round(fraction * num_rows)
        # create list of slice index lists
        indices = list(range(num_rows))  # list() so indices can be popped in Python 3
        slice_list = []
        tmp_idx_list = []
        while len(indices) > 0:
            while len(tmp_idx_list) < len_sample and len(indices) > 0:
                idx = indices.pop(random.randrange(len(indices)))
                tmp_idx_list.append(idx)
            slice_list.append(tmp_idx_list)
            tmp_idx_list = []
        # get slices; default to returning every sample
        if n_samples is None:
            n_samples = len(slice_list)
        sample_list = []
        for s in range(min(n_samples, len(slice_list))):
            sample_list.append(df.iloc[slice_list[s], :])  # positional row selection
        return sample_list

    def write_csv(self, filename, include_actual=False):
        """
        write results to csv
        - include actual: if actual values are known for test set, and we want to print them
        """
        with open(filename, 'w', newline='') as csvfile:  # text mode for the csv module in Python 3
            writer = csv.writer(csvfile)
            headers = [self.id_column, self.target_column]
            if include_actual:
                headers.append('actual')
            writer.writerow(headers)
            try:
                for idx, value in enumerate(self.output):
                    test_id = self.test_df[self.id_column][idx]
                    test_output = self.output[idx]
                    to_write = [test_id, test_output]
                    if include_actual:
                        to_write.append(self.test_df[self.target_column][idx])
                    writer.writerow(to_write)
                print('results written to ' + filename)
            except Exception as e:
                print('write_csv failed:', e)

    def save(self, filename='xgb.pkl'):
        joblib.dump(self, filename)

    def load(self, model_file='xgb.pkl'):
        xgb = joblib.load(model_file)
        return xgb
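# A minimal save/load round trip for the Xgb wrapper above (a hedged sketch, not
# part of the original example): `trained_model` stands in for an already-trained
# Xgb instance, `new_test_df` for a test DataFrame, and the filename is arbitrary.
trained_model.save('xgb_fold_0.pkl')               # joblib.dump of the whole wrapper
restored = trained_model.load('xgb_fold_0.pkl')    # joblib.load, returns an Xgb instance
preds = restored.predict(new_test_df)              # the saved preprocessing settings are reused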
## quick sanity checks on the age predictions (rf_age_valid_pred) against age_valid_Y
pd.Series(rf_age_valid_pred).isnull().sum()
statistics.mean(error_df.sqerr)

len(rf_age_valid_pred)
len(age_valid_Y)
pd.crosstab(pd.Series(rf_age_valid_pred).apply(lambda x: round(x)), age_valid_Y)

## XGB for age prediction
from xgboost.sklearn import XGBRegressor

xgb = XGBRegressor(max_depth=6, learning_rate=0.2, n_estimators=100,
                    objective='reg:linear', subsample=0.5, colsample_bytree=0.5, seed=321)

eval_set = [(mvt_train_X.drop(['age', 'gender'], axis=1), age_train_Y), (mvt_valid_X.drop(['age', 'gender'], axis=1), age_valid_Y)]
xgb.fit(mvt_train_X.drop(['age', 'gender'], axis=1), age_train_Y, eval_set=eval_set, eval_metric='rmse', early_stopping_rounds=10, verbose=1)
xgb_age_valid_pred = xgb.predict(mvt_valid_X.drop(['age', 'gender'], axis=1))
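# A quick validation check on the XGB age predictions above (a sketch, not part of
# the original notebook; assumes sklearn's mean_squared_error is available):
from sklearn.metrics import mean_squared_error
xgb_age_rmse = mean_squared_error(age_valid_Y, xgb_age_valid_pred) ** 0.5
print('XGB age validation RMSE:', xgb_age_rmse)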


## AdaBoost for age prediction
from sklearn.ensemble import AdaBoostRegressor
ada = AdaBoostRegressor(n_estimators=50, learning_rate=0.1, loss='linear', random_state=321)
ada.fit(mvt_train_X.drop(['age', 'gender'], axis=1), age_train_Y.values)
ada_age_valid_pred = ada.predict(mvt_valid_X.drop(['age', 'gender'], axis=1))

len(ada_age_valid_pred)
len(age_valid_Y)

error_df = pd.DataFrame(ada_age_valid_pred, columns=['pred'])  # build directly from the prediction array
error_df.reset_index(inplace=True)
act_df = pd.DataFrame(age_valid_Y)
act_df.reset_index(inplace=True)
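# A plausible continuation of the error analysis above (a sketch, not part of the
# original notebook): attach the actuals, compute the per-row squared error used
# earlier as error_df.sqerr, and report the AdaBoost validation RMSE.
error_df['actual'] = act_df.iloc[:, -1].values
error_df['sqerr'] = (error_df['pred'] - error_df['actual']) ** 2
print('AdaBoost age validation RMSE:', error_df['sqerr'].mean() ** 0.5)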
Example no. 35
class Xgb:
    def __init__(self, df, target_column='', id_column='', target_type='binary', categorical_columns=[], drop_columns=[], numeric_columns=[], num_training_rounds=500, verbose=1, early_stopping_rounds=None):
        """
        input params:
        - df (DataFrame): dataframe of training data
        - target_column (string): name of target column
        - id_column (string): name of id column
        - target_type (string): 'linear' or 'binary'
        - categorical_columns (list): list of column names of categorical data. Will perform one-hot encoding
        - drop_columns (list): list of columns to drop
        - numeric_columns (list): list of columns to convert to numeric
        - verbose (bool): verbosity of printouts
        """
        if isinstance(df, pd.DataFrame):
            self.df = df
            self.early_stopping_rounds = early_stopping_rounds
            if target_column:
                self.target_column = target_column
                self.id_column = id_column
                self.target_type = target_type
                self.categorical_columns = categorical_columns
                self.numeric_columns = numeric_columns
                self.drop_columns = drop_columns
                self.verbose = verbose
                self.num_training_rounds = num_training_rounds
                # init the classifier
                if self.target_type == 'binary':
                    self.scoring = 'auc'
                    self.clf = XGBClassifier(
                        learning_rate =0.1,
                        n_estimators = num_training_rounds,
                        subsample = 0.8,
                        colsample_bytree = 0.8,
                        objective = 'binary:logistic',
                        scale_pos_weight = 1,
                        seed = 123)
                elif self.target_type == 'linear':
                    self.scoring = 'rmse'
                    self.clf = XGBRegressor(
                            n_estimators = num_training_rounds,
                            objective = 'reg:linear'
                            )
            else:
                print('please provide target column name')
        else:
            print('please provide pandas dataframe')

    def train(self):
        print('#### preprocessing ####')
        self.df = self.preprocess(self.df)

        print('#### training ####')
        self.predictors = [x for x in self.df.columns if x not in [self.target_column, self.id_column]]
        xgb_param = self.clf.get_xgb_params()

        xgtrain  = xgb.DMatrix(self.df[self.predictors], label=self.df[self.target_column], missing=np.nan)
        try:
            cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds, show_progress=self.verbose)
        except TypeError:
            # different xgboost versions expose the progress flag under different names
            try:
                cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                    metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds, verbose_eval=self.verbose)
            except TypeError:
                cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                    metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds)
        self.clf.set_params(n_estimators=cvresult.shape[0])
        self.clf.fit(self.df[self.predictors], self.df[self.target_column], eval_metric=self.scoring)

        #Predict training set:
        train_df_predictions = self.clf.predict(self.df[self.predictors])

        if self.target_type == 'binary':
            train_df_predprob = self.clf.predict_proba(self.df[self.predictors])[:,1]
            print("Accuracy : %.4g" % metrics.accuracy_score(self.df[self.target_column].values, train_df_predictions))
            print("AUC Score (Train): %f" % metrics.roc_auc_score(self.df[self.target_column], train_df_predprob))
        elif self.target_type == 'linear':
            print("Mean squared error: %f" % metrics.mean_squared_error(self.df[self.target_column].values, train_df_predictions))
            print("Root mean squared error: %f" % np.sqrt(metrics.mean_squared_error(self.df[self.target_column].values, train_df_predictions)))

    def predict(self, test_df):
        print('### predicting ###')
        print('## preprocessing test set')
        if self.id_column in test_df:
            ids = test_df[self.id_column]
        if self.target_column in test_df.columns:
            targets = test_df[self.target_column]
        self.test_df = self.preprocess(test_df, train=False)
        if self.id_column in test_df:
            self.test_df[self.id_column] = ids
        if self.target_column in test_df.columns:
            self.test_df[self.target_column] = targets
        for col in self.predictors:
            if col not in self.test_df.columns:
                self.test_df[col] = np.nan

        if self.target_type == 'binary':
            self.output = self.clf.predict_proba(self.test_df[self.predictors])[:,1]
        elif self.target_type == 'linear':
            self.output = self.clf.predict(self.test_df[self.predictors])
        return self.output

    def feature_importance(self, num_print=10, display=True):
        # get_booster() replaces the booster() accessor removed in current xgboost versions
        feature_importance = sorted(self.clf.get_booster().get_fscore().items(), key=operator.itemgetter(1), reverse=True)

        impt = pd.DataFrame(feature_importance)
        impt.columns = ['feature', 'importance']
        print(impt[:num_print])
        if display:
            impt[:num_print].plot("feature", "importance", kind="barh", color=sns.color_palette("deep", 3))


    def preprocess(self, df, train=True):
        # one hot encoding of categorical variables
        print('## one hot encoding of categorical variables')
        for col in self.categorical_columns:
            if self.verbose:
                print('one hot encoding: ', col)
            df = pd.concat([df, pd.get_dummies(df[col]).rename(columns=lambda x: col+'_'+str(x))], axis=1)
            df = df.drop([col], axis=1)

        # if training, determine columns to be removed
        if train:
            # drop columns that are too sparse to be informative
            self.cols_to_remove = []
            print('## dropping columns below sparsity threshold')
            for col in df.columns:
                nan_cnt = 0
                for x in df[col]:
                    try:
                        if np.isnan(x):
                            nan_cnt += 1
                    except:
                        pass
                if nan_cnt/float(len(df[col])) > 0.6: # arbitrary cutoff, if more than 60% missing then drop
                    if self.verbose:
                        print('will drop', col)
                    self.cols_to_remove.append(col)

            # drop columns that have no standard deviation (not informative)
            print('## dropping columns with no variation')
            for col in df.columns:
                if df[col].dtype == 'int64' or df[col].dtype == 'float64':
                    if df[col].std() == 0:
                        print('will drop', col)
                        self.cols_to_remove.append(col)
        # drop the flagged columns whether or not verbose output is enabled
        if self.cols_to_remove:
            if self.verbose:
                print('dropping the following columns:', self.cols_to_remove)
            df = df.drop(self.cols_to_remove, axis=1)

        if self.verbose:
            print('## DataFrame shape is now:', df.shape)

        # convert to numerical where possible
        #print('## converting numerical data to numeric dtype')
        #df = df.convert_objects(convert_numeric=True)

        # convert columns specified to be int and float
        for col in self.numeric_columns:
            if col not in self.cols_to_remove:
                if self.verbose:
                    print('converting', col)
                df[col] = pd.to_numeric(df[col], errors='coerce')
                if self.verbose:
                    print(df[col].dtype)

        # drop those marked for dropping
        df = df.drop(self.drop_columns, axis=1)

        # drop all those that are object type
        print('## dropping non-numerical columns')
        for col in df.columns:
            if df[col].dtype == 'int64' or df[col].dtype == 'float64' or df[col].dtype == 'bool':
                pass
            else:
                if self.verbose:
                    print('dropping because not int, float, or bool:', col)
                df = df.drop([col], axis=1)
        return df

    def _to_int(self, num):
        try:
            return int(num)
        except (TypeError, ValueError):
            return None

    def _to_float(self, num):
        try:
            return float(num)
        except (TypeError, ValueError):
            return None

    def write_csv(self, filename, include_actual=False):
        """
        write results to csv
        - include actual: if actual values are known for test set, and we want to print them
        """
        with open(filename, 'w', newline='') as csvfile:  # text mode for the csv module in Python 3
            writer = csv.writer(csvfile)
            headers = [self.id_column, self.target_column]
            if include_actual:
                headers.append('actual')
            writer.writerow(headers)
            for idx, value in enumerate(self.output):
                test_id = self.test_df[self.id_column][idx]
                test_output = self.output[idx]
                to_write = [test_id, test_output]
                if include_actual:
                    to_write.append(self.test_df[self.target_column][idx])
                writer.writerow(to_write)

    def save(self, filename='xgb.pkl'):
        joblib.dump(self, filename)
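# A minimal usage sketch for the Xgb wrapper above (not part of the original
# example). The toy DataFrame is made up purely for illustration, and it assumes
# the example's own imports (xgboost as xgb, sklearn metrics, joblib, csv) plus an
# xgboost version that accepts the wrapper's defaults.
import numpy as np
import pandas as pd

toy = pd.DataFrame({
    'id': range(200),
    'feat_num': np.random.rand(200),
    'feat_cat': np.random.choice(['a', 'b', 'c'], size=200),
    'target': np.random.rand(200),
})
model = Xgb(toy, target_column='target', id_column='id', target_type='linear',
            categorical_columns=['feat_cat'], num_training_rounds=50)
model.train()                                       # cross-validates, then fits on all rows
preds = model.predict(toy.drop('target', axis=1))   # one-hot encodes and aligns columns first
model.feature_importance(num_print=5, display=False)
model.write_csv('toy_predictions.csv')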
Example no. 36
# parameters of the algorithm
xgb1=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7, gamma=0, learning_rate=0.5, max_delta_step=0,
       max_depth=7, min_child_weight=4, missing=None, n_estimators=1000,
       n_jobs=1, nthread=4, objective='reg:gamma', random_state=0,
       reg_alpha=0.09, reg_lambda=1, scale_pos_weight=1, seed=1048,
       silent=True, subsample=0.86)
###################

# Fit the algorithm on the data
xgb1.fit(train_x, train_y, eval_metric='rmse')

### train vs test
pre = xgb1.predict(test_x)
plt.figure(figsize=(16,9))
plt.style.use('ggplot')
plt.plot(pre,label='predict_load1')
plt.plot(np.array(test_y),label='test_load1')
plt.title('Test_prediction  MAE=%s' % str(np.sum(abs(pre-test_y))/len(pre)))
plt.legend(loc='upper left')
plt.savefig("D:\\load Forecasting\\plot\\TraiVsTest.jpg")
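# Optionally report RMSE alongside the MAE shown in the plot title (a sketch, not
# part of the original script; assumes numpy is imported as np):
test_rmse = float(np.sqrt(np.mean((pre - np.array(test_y)) ** 2)))
print('test RMSE:', test_rmse)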


##### New data
pre1 = xgb1.predict(load1_test.drop('load1', axis=1))
plt.figure(figsize=(16,9))
plt.style.use('ggplot')
plt.plot(pre1,label='predict_load1')
plt.plot(np.array(load1_test['load1']),label='real_load1')