Example #1
    validation_predict_dir = validation_predict_dir_prefix + str(week) + '\\'
    test_predict_dir = test_predict_dir_prefix + str(week) + '\\'
    validation_predicts = []
    test_predicts = []
    validation_paths = os.listdir(validation_predict_dir)
    for p in validation_paths:
        df = pd.read_csv(validation_predict_dir + p)
        validation_predicts.append(df['visitors_predict'].tolist())
        # the true 'visitors' column is assumed identical across prediction files, so overwriting it each iteration is harmless
        validation_predicts_y = np.log1p(np.array(df['visitors'].tolist()))

    test_paths = os.listdir(test_predict_dir)
    for p in test_paths:
        test = pd.read_csv(test_predict_dir + p)
        test_predicts.append(test['visitors'].tolist())

    validation_predicts = [np.array(predicts).reshape(-1, 1) for predicts in validation_predicts]
    test_predicts = [np.array(predicts).reshape(-1, 1) for predicts in test_predicts]

    train_merge = np.concatenate(validation_predicts, axis=1)
    test_merge = np.concatenate(test_predicts, axis=1)

    # lr = Ridge(random_state=1234, alpha=0.01, normalize=True)
    # lr = RandomForestRegressor(n_estimators=200, n_jobs=-1, max_depth=7, random_state=1234)
    lr = XGBRegressor(subsample=0.7, colsample_bytree=0.8, n_jobs=-1, random_state=1234, reg_lambda=0.01, reg_alpha=0.01, n_estimators=200, max_depth=7)
    # lr = SVR(C=10)
    lr.fit(train_merge, validation_predicts_y)
    predict = lr.predict(test_merge)
    temp = lr.predict(train_merge)
    print('RMSLE stacking: ', RMSLE(validation_predicts_y, temp))
    test['visitors'] = np.expm1(predict)
    test['visitors'] = test['visitors'].clip(lower=0.)
    test[['id', 'visitors']].to_csv(data_dir + 'submission_use_stacking3_' + str(week) + '.csv', index=False)
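The RMSLE helper called above is defined elsewhere in the script; since the targets here are already on the log1p scale, a minimal sketch consistent with that usage (an assumed stand-in, not necessarily the script's own helper) is:

import numpy as np

def RMSLE(y_true_log, y_pred_log):
    # both arguments are assumed to be log1p-transformed already,
    # so this reduces to a plain RMSE on the transformed values
    return np.sqrt(np.mean((np.asarray(y_true_log) - np.asarray(y_pred_log)) ** 2))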
Example #2
    def __init__(self):
        self.__xgb = XGBRegressor()
        logging.getLogger('XGBoost').setLevel(logging.ERROR)
        logging.basicConfig(filename='xgboost.log',
                            format='%(levelname)s %(asctime)s: %(message)s',
                            level=logging.DEBUG)
    'Id', 'groupId', 'matchId', 'assists', 'boosts', 'damageDealt', 'DBNOs',
    'headshotKills', 'heals', 'killPlace', 'killPoints', 'kills',
    'killStreaks', 'longestKill', 'matchDuration', 'maxPlace', 'numGroups',
    'rankPoints', 'revives', 'rideDistance', 'roadKills', 'swimDistance',
    'teamKills', 'vehicleDestroys', 'walkDistance', 'weaponsAcquired',
    'winPoints', 'matchType_duo', 'matchType_duo-fpp', 'matchType_solo',
    'matchType_solo-fpp', 'matchType_squad', 'matchType_squad-fpp',
    'winPlacePerc'
]]
# select the feature columns
X = clean_data.iloc[:, 3:33]
# select the regression target
Y = clean_data.iloc[:, 33]

# parameter grid to be passed to GridSearchCV
param_test = {
    'learning_rate': [0.02, 0.03, 0.1],
    'min_child_weight': [4, 6, 8],
    'max_depth': [8, 10],
    "subsample": [0.6, 0.4],
    "n_estimators": [300, 500]
}
# define the target model; the remaining hyperparameters are chosen by the grid search below
model = XGBRegressor(n_jobs=-1)
grid = GridSearchCV(model, param_test, cv=5, scoring='neg_mean_absolute_error')
# fit the data with the grid search
print("begin to find parameters")
grid.fit(X, Y)
# print the search statistics and the best parameters for the target model
print(grid.best_score_, grid.best_estimator_, grid.best_params_)
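Once the search finishes, the selected hyperparameters can be fed back into a fresh regressor; a short sketch of that follow-up step:

# refit a final model on the full data with the parameters chosen by the grid search
best_model = XGBRegressor(n_jobs=-1, **grid.best_params_)
best_model.fit(X, Y)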
Example #4
    """
    y_df = pd.DataFrame({'Id': test.Id, 'SalePrice': pred_y})
    return y_df.to_csv(file_name, index = False)

#Reading Data
iowa_data = pd.read_csv("Iowa Housing Prices.csv") 

#Setting independent (X) and dependent (y) variables
X = iowa_data.drop(labels = ["Id", "SalePrice"], axis = 1)
X_imputed = impute_extension(X)
X_OHE = OHE(X)
X = X_imputed.join(X_OHE)
y = iowa_data.SalePrice

#Model
XGBR_model = XGBRegressor(learning_rate=0.05, n_estimators= 1000)

#Cross Validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(XGBR_model, X, y, scoring = "neg_mean_squared_log_error", cv = 5)
rmse_log_score = np.sqrt(scores.mean()*-1)

#Feature Importance Plot
#plot_importance(XGBR_model.fit(X,y), importance_type = "gain", max_num_features = 10)

#Applying on Test data
#Data preprocessing
test = pd.read_csv("test.csv")
test_X_imputed = impute_extension(test)
test_X_OHE = OHE(test)
test_X = test_X_imputed.join(test_X_OHE)
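A plausible continuation for the final fit and submission, assuming the truncated helper at the top of this example simply writes an Id/SalePrice CSV ('submission.csv' is a placeholder file name):

# fit on the full training data and predict on the processed test features
XGBR_model.fit(X, y)
test_pred_y = XGBR_model.predict(test_X)

# same output shape as the truncated helper: Id + SalePrice
pd.DataFrame({'Id': test.Id, 'SalePrice': test_pred_y}).to_csv('submission.csv', index=False)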
    X = clean_training_data.drop(columns=target_variable).values
    y = clean_training_data[target_variable].values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=SPLIT_SEED)

    # Remove outliers from training set
    X_train, y_train = remove_outliers(X_train, y_train)
    ''' 2. Create and train model(s) '''
    models = [('GradientBoostingRegressor',
               GradientBoostingRegressor(n_estimators=400,
                                         max_depth=4,
                                         max_features=0.9,
                                         warm_start=True)),
              ('XGBRegressor',
               XGBRegressor(objective='reg:squarederror',
                            n_estimators=400,
                            max_depth=4,
                            subsample=0.9)),
              ('Deep Neural Network', create_nn_model(X_train.shape[1]))]

for name, model in models:
    # Train model on training dataset
    start_time = time.time()
    print('--- Starting Training ({}) ---'.format(name))
    train_model(model, X_train, y_train)
    training_time = time.time() - start_time
    print('--- Training took {} sec ---'.format(training_time))

    # Test model
    print('--- Testing ({}) ---'.format(name))
    train_accuracy, train_pred = test_model(model, X_train, y_train)
    test_accuracy, test_pred = test_model(model, X_test, y_test)
Example #6
File: ensemble.py  Project: yaoyeyi/JDD
# lgb params
lgb_params = {}
lgb_params['n_estimators'] = 120
# lgb_params['max_bin'] = 2
lgb_params['learning_rate'] = 0.01  # shrinkage_rate
lgb_params['metric'] = 'mae'  # mean absolute error
lgb_params['sub_feature'] = 0.6
lgb_params['subsample'] = 0.8  # sub_row
lgb_params['num_leaves'] = 11  # num_leaf
lgb_params['min_data'] = 5  # min_data_in_leaf
lgb_params['verbose'] = -1
lgb_params['feature_fraction_seed'] = 2
lgb_params['bagging_seed'] = 3

# XGB model
xgb_model = XGBRegressor(n_estimators=120)

gbdt_model = GradientBoostingRegressor()

# lgb model
lgb_model = LGBMRegressor(**lgb_params)

# RF model
rf_model = RandomForestRegressor(**rf_params)

# ET model
et_model = ExtraTreesRegressor()

# SVR model
# SVM is too slow on datasets with more than 10,000 samples
# svr_model = SVR(kernel='rbf', C=5.0, epsilon=0.005)
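The file name (ensemble.py) suggests these base learners are blended further down; a minimal averaging sketch, assuming rf_params, X_train/y_train and X_test are defined elsewhere in the file:

import numpy as np  # assumed already imported at the top of ensemble.py

# fit each tree-based base learner and average their predictions
base_models = [xgb_model, gbdt_model, lgb_model, rf_model, et_model]
for m in base_models:
    m.fit(X_train, y_train)
blend_pred = np.mean([m.predict(X_test) for m in base_models], axis=0)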
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:-15.21590991057142
exported_pipeline = make_pipeline(
    SelectFwe(score_func=f_regression, alpha=0.026000000000000002),
    StackingEstimator(estimator=LinearSVR(C=0.1,
                                          dual=False,
                                          epsilon=0.001,
                                          loss="squared_epsilon_insensitive",
                                          tol=0.01)),
    StackingEstimator(
        estimator=GradientBoostingRegressor(alpha=0.75,
                                            learning_rate=0.001,
                                            loss="quantile",
                                            max_depth=2,
                                            max_features=0.35000000000000003,
                                            min_samples_leaf=15,
                                            min_samples_split=17,
                                            n_estimators=100,
                                            subsample=0.7500000000000001)),
    Normalizer(norm="l1"),
    XGBRegressor(learning_rate=0.01,
                 max_depth=6,
                 min_child_weight=8,
                 n_estimators=100,
                 nthread=1,
                 subsample=0.1))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #8
x = x_data.values
y = y_data.values
x_pred = test.values

train_gap = np.load('./dacon/comp1/train_gap.npy')
test_gap = np.load('./dacon/comp1/test_gap.npy')

x = np.hstack((x, train_gap))
x_pred = np.hstack((x_pred, test_gap))

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    train_size=0.8,
                                                    random_state=33)

xgb = XGBRegressor()

model = MultiOutputRegressor(xgb)

model.fit(x_train, y_train)

y_pred1 = model.predict(x_test)

print('mae: ', mean_absolute_error(y_test, y_pred1))


## feature_importances
def plot_feature_importances(model):
    plt.figure(figsize=(10, 40))
    n_features = x_data.shape[1]  # n_features = number of columns
    plt.barh(
featimp.index = mergedassetmacrolag.columns[0:len(mergedassetmacrolag.columns) - 1]
featimp = featimp.sort_values(by=['FeatImp'])
featimp.plot(kind='bar')

rf_validationpred = rfreg.predict(
    X_validation.iloc[:, 0:len(X_validation.columns)].values)
rf_validationact = yval
valpreddiff = (rf_validationpred - rf_validationact)
valpreddiffpct = ((rf_validationpred - rf_validationact) / rf_validationpred)
print('act:%.3f, pred:%.3f, diff:%.3f, diffpct:%.3f' %
      (rf_validationact, rf_validationpred, valpreddiff, valpreddiffpct))

######## Run XGBoost Algo on Assets with macro data ##########
xgbreg = XGBRegressor(objective='reg:squarederror',
                      learning_rate=0.1,
                      max_depth=4,
                      seed=1)
xgbreg.fit(X_train, y_train)

# Predicting asset values with the XGBoost regressor
xgb_pred = xgbreg.predict(X_test)

# Check the importance of each feature
xgbimp = xgbreg.feature_importances_

xgbmse = mean_squared_error(y_test, xgb_pred)
xgbrmse = np.sqrt(xgbmse)
xgbmae = mean_absolute_error(y_test, xgb_pred)
xgbr2 = r2_score(y_test, xgb_pred)
print('MSE:%.4f, RMSE:%.4f, MAE:%.4f, R2:%.4f' %
      (xgbmse, xgbrmse, xgbmae, xgbr2))
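xgbimp is computed above but never visualized; a short sketch of a bar chart over the training columns, assuming X_train is a pandas DataFrame:

import pandas as pd
import matplotlib.pyplot as plt

# rank and plot the XGBoost feature importances computed above
imp = pd.Series(xgbimp, index=X_train.columns).sort_values()
imp.plot(kind='barh', figsize=(8, 10), title='XGBoost feature importances')
plt.tight_layout()
plt.show()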
Example #10
        
        select_x_train = selection.transform(x_train)
        select_x_test = selection.transform(x_test)
        select_x_pred = selection.transform(x_pred)
        print(select_x_train.shape[1])
        
        parameter = {
            'n_estimators': [100, 200, 400],
            'learning_rate' : [0.05, 0.07, 0.1],
            'colsample_bytree': [ 0.7, 0.8, 0.9],
            'colsample_bylevel':[ 0.7, 0.8, 0.9],
            'max_depth': [4, 5, 6]
        }
    
        # search = RandomizedSearchCV( XGBRegressor(gpu_id = 0, tree_method = 'gpu_hist'), parameter, cv =5)
        search = RandomizedSearchCV( XGBRegressor(), parameter, cv =5)
        multi_search = MultiOutputRegressor(search,n_jobs = -1)
        
        multi_search.fit(select_x_train, y_train )
        
        y_pred = multi_search.predict(select_x_test)
        mae = mean_absolute_error(y_test, y_pred)
        score =r2_score(y_test, y_pred)
        print("Thresh=%.3f, n = %d, R2 : %.2f%%, MAE : %.3f"%(thres, select_x_train.shape[1], score*100.0, mae))
        
        y_predict = multi_search.predict(select_x_pred)

        # submission
        a = np.arange(10000,20000)
        submission = pd.DataFrame(y_predict, a)
        submission.to_csv('./dacon/comp1/sub/select_XGB03_%i_%.5f.csv'%(i, mae),index = True, header=['hhb','hbo2','ca','na'],index_label='id')
Example #11
Y_train = Y_train.clip(0, 20)
Y_valid = Y_valid.clip(0, 20)

# In[27]:

from xgboost import XGBRegressor
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

# In[28]:

model = XGBRegressor(
    max_depth=10,
    n_estimators=1000,
    min_child_weight=0.5,
    colsample_bytree=0.8,
    subsample=0.8,
    eta=0.1,
    #     tree_method='gpu_hist',
    seed=42)

model.fit(X_train,
          Y_train,
          eval_metric="rmse",
          eval_set=[(X_train, Y_train), (X_valid, Y_valid)],
          verbose=True,
          early_stopping_rounds=20)

pickle.dump(model, open('model.pkl', 'wb'))

model = pickle.load(open('model.pkl', 'rb'))
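The reloaded model is not used further in the listing; a quick sanity check against the validation split used above (numpy assumed imported as np):

# verify the round-tripped model still scores the validation fold
valid_pred = model.predict(X_valid)
print('validation RMSE:', np.sqrt(np.mean((Y_valid - valid_pred) ** 2)))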
Example #12
def get_model_from_name(model_name, training_params=None):

    # For Keras
    epochs = 250
    if os.environ.get('is_test_suite', 0) == 'True' and model_name[:12] == 'DeepLearning':
        print('Heard that this is the test suite. Limiting number of epochs, which will increase training speed dramatically at the expense of model accuracy')
        epochs = 30

    all_model_params = {
        'LogisticRegression': {'n_jobs': -2},
        'RandomForestClassifier': {'n_jobs': -2},
        'ExtraTreesClassifier': {'n_jobs': -1},
        'AdaBoostClassifier': {'n_estimators': 10},
        'SGDClassifier': {'n_jobs': -1},
        'Perceptron': {'n_jobs': -1},
        'LinearSVC': {'dual': False},
        'LinearRegression': {'n_jobs': -2},
        'RandomForestRegressor': {'n_jobs': -2},
        'LinearSVR': {'dual': False, 'loss': 'squared_epsilon_insensitive'},
        'ExtraTreesRegressor': {'n_jobs': -1},
        'MiniBatchKMeans': {'n_clusters': 8},
        'GradientBoostingRegressor': {'presort': False, 'learning_rate': 0.05, 'warm_start': True},
        'GradientBoostingClassifier': {'presort': False, 'learning_rate': 0.05, 'warm_start': True},
        'SGDRegressor': {'shuffle': False},
        'PassiveAggressiveRegressor': {'shuffle': False},
        'AdaBoostRegressor': {'n_estimators': 10},
        'XGBRegressor': {'nthread':-1, 'n_estimators': 200},
        'XGBClassifier': {'nthread':-1, 'n_estimators': 200},
        'LGBMRegressor': {},
        'LGBMClassifier': {},
        'DeepLearningRegressor': {'epochs': epochs, 'batch_size': 50, 'verbose': 2},
        'DeepLearningClassifier': {'epochs': epochs, 'batch_size': 50, 'verbose': 2}
    }

    model_params = all_model_params.get(model_name, None)
    if model_params is None:
        model_params = {}

    if training_params is not None:
        print('Now using the model training_params that you passed in:')
        print(training_params)
        # Overwrite our stock params with what the user passes in (i.e., if the user wants 10,000 trees, we will let them do it)
        model_params.update(training_params)
        print('After overwriting our defaults with your values, here are the final params that will be used to initialize the model:')
        print(model_params)


    model_map = {
        # Classifiers
        'LogisticRegression': LogisticRegression(),
        'RandomForestClassifier': RandomForestClassifier(),
        'RidgeClassifier': RidgeClassifier(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),


        'SGDClassifier': SGDClassifier(),
        'Perceptron': Perceptron(),
        'PassiveAggressiveClassifier': PassiveAggressiveClassifier(),
        'LinearSVC': LinearSVC(),

        # Regressors
        'LinearRegression': LinearRegression(),
        'RandomForestRegressor': RandomForestRegressor(),
        'Ridge': Ridge(),
        'LinearSVR': LinearSVR(),
        'ExtraTreesRegressor': ExtraTreesRegressor(),
        'AdaBoostRegressor': AdaBoostRegressor(),
        'RANSACRegressor': RANSACRegressor(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),

        'Lasso': Lasso(),
        'ElasticNet': ElasticNet(),
        'LassoLars': LassoLars(),
        'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(),
        'BayesianRidge': BayesianRidge(),
        'ARDRegression': ARDRegression(),
        'SGDRegressor': SGDRegressor(),
        'PassiveAggressiveRegressor': PassiveAggressiveRegressor(),

        # Clustering
        'MiniBatchKMeans': MiniBatchKMeans()
    }

    if xgb_installed:
        model_map['XGBClassifier'] = XGBClassifier()
        model_map['XGBRegressor'] = XGBRegressor()

    if lgb_installed:
        model_map['LGBMRegressor'] = LGBMRegressor()
        model_map['LGBMClassifier'] = LGBMClassifier()

    if keras_installed:
        model_map['DeepLearningClassifier'] = KerasClassifier(build_fn=make_deep_learning_classifier)
        model_map['DeepLearningRegressor'] = KerasRegressor(build_fn=make_deep_learning_model)

    try:
        model_without_params = model_map[model_name]
    except KeyError as e:
        print('It appears you are trying to use a library that is not available when we try to import it, or using a value for model_names that we do not recognize')
        raise(e)
    model_with_params = model_without_params.set_params(**model_params)

    return model_with_params
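A brief usage sketch of the helper defined above (the parameter value is only illustrative):

# stock XGBoost regressor with the defaults from all_model_params
model = get_model_from_name('XGBRegressor')

# same model, but overriding a stock parameter via training_params
model = get_model_from_name('XGBRegressor', training_params={'n_estimators': 500})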
Example #13
# Shape of train and test data
print(X_train_temp.shape, X_val_temp.shape)

# --------------
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Code starts here

dt = DecisionTreeRegressor(random_state=5)
dt.fit(X_train, y_train)
accuracy = dt.score(X_val, y_val)
y_pred = dt.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_pred, y_val))
print(accuracy)
print(rmse)

# --------------
from xgboost import XGBRegressor

# Code starts here
xgb = XGBRegressor(max_depth=50, learning_rate=0.83, n_estimators=100)
xgb.fit(X_train, y_train)
accuracy = xgb.score(X_val, y_val)
y_pred = xgb.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(accuracy)
print(rmse)

# Code ends here
Example #14
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score, r2_score

# dataset = load_breast_cancer()
# x = dataset.data
# y = dataset.target

x, y = load_breast_cancer(return_X_y=True)

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    train_size=0.8,
                                                    shuffle=True,
                                                    random_state=66)

model = XGBRegressor(n_estimators=100,
                     learning_rate=0.1)  # the number of trees (n_estimators) acts like epochs

model.fit(x_train,
          y_train,
          verbose=True,
          eval_metric=["logloss", "error"],
          eval_set=[(x_train, y_train), (x_test, y_test)],
          early_stopping_rounds=100)
# in eval_set, validation_0 is (x_train, y_train) and validation_1 is (x_test, y_test)

# train / test / val: the validation metrics are what matter

# rmse, mae, logloss, error (error corresponds to 1 - accuracy), auc (another accuracy-style metric)

results = model.evals_result()
print("eval's results : ", results)
Example #15
    'learning_rate': [0.1, 0.3, 0.001, 0.01],
    'max_depth': [4, 5, 6]
}, {
    'n_estimators': [400, 600],
    'learning_rate': [0.1, 0.001, 0.5],
    'max_depth': [4, 5, 6],
    'colsample_bytree': [0.6, 0.9, 1],
    'colsample_bylevel': [0.6, 0.7, 0.9]
}]  # XGBRegressor: n_estimators sets how many boosting rounds to run

grid_random = [RandomizedSearchCV]  # or GridSearchCV
kfold = KFold(n_splits=5, shuffle=True)

for i in grid_random:
    model = i(XGBRegressor(n_jobs=8,
                           tree_method='gpu_hist',
                           predictor='gpu_predictor'),
              parameter,
              cv=kfold)
    #n_estimators == epochs

    model.fit(X_train,
              y_train,
              verbose=1,
              eval_metric=['rmse'],
              eval_set=[(X_train, y_train), (X_test, y_test)])
    filename = '../data/h5/model_XGB_person.sav'
    pickle.dump(model, open(filename, 'wb'))
    # model = pickle.load(open(filename, 'rb'))
    y_pred = model.predict(X_test)
    acc = model.score(X_test, y_test)
Example #16
param_grid = {
    'n_estimators': [150, 250, 350],
    'max_depth': [1, 2, 3],
    'min_samples_split': [5, 6, 7]
}

opt_models[model], cv_score, grid_results = train_model(opt_models[model],
                                                        param_grid=param_grid,
                                                        splits=splits,
                                                        repeats=1)

cv_score.name = model
score_models = score_models.append(cv_score)

model = 'XGB'
opt_models[model] = XGBRegressor()

param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [1, 2, 3],
}

opt_models[model], cv_score, grid_results = train_model(opt_models[model],
                                                        param_grid=param_grid,
                                                        splits=splits,
                                                        repeats=1)

cv_score.name = model
score_models = score_models.append(cv_score)

X, y = get_training_data()
#1. data
dataset = load_diabetes()
x = dataset.data
y = dataset.target

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    train_size=0.8,
                                                    random_state=66,
                                                    shuffle=True)

#2. model
# model = DecisionTreeRegressor(max_depth=4)
# model = RandomForestRegressor(max_depth=4)
# model = GradientBoostingRegressor(max_depth=4)
model = XGBRegressor(n_jobs=-1)

#3. train
model.fit(x_train, y_train)

#4. evaluate & predict
acc = model.score(x_test, y_test)
print(model.feature_importances_)
print("acc : ", acc)

#5. visualization
import matplotlib.pyplot as plt
import numpy as np
'''
def plot_feature_importances_dataset(model): 
    n_features = dataset.data.shape[1]
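The commented-out plotting helper is cut off above; a completion of the usual bar-chart pattern it begins, offered as a sketch rather than the script's exact code:

def plot_feature_importances_dataset(model):
    # horizontal bar chart of the fitted model's feature importances
    n_features = dataset.data.shape[1]
    plt.barh(np.arange(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), dataset.feature_names)
    plt.xlabel('Feature Importances')
    plt.ylabel('Features')
    plt.ylim(-1, n_features)

plot_feature_importances_dataset(model)
plt.show()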
Example #18
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

numerical_transformer = SimpleImputer(strategy='constant')

categorical_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='most_frequent')
            ), ('onehot', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(transformers=[(
    'num', numerical_transformer,
    numerical_cols), ('cat', categorical_transformer, categorical_cols)])

from xgboost import XGBRegressor

# Define the model
my_model_1 = XGBRegressor(random_state=0)  # Your code here

clf = Pipeline(steps=[('preprocessor', preprocessor), ('model', my_model_1)])
clf.fit(X_train, y_train)  # Your code here

from sklearn.metrics import mean_absolute_error

# Get predictions
predictions_1 = clf.predict(X_valid)  # Your code here

# Calculate MAE
mae_1 = mean_absolute_error(predictions_1, y_valid)  # Your code here

# Uncomment to print MAE
print("Mean Absolute Error:", mae_1)
Example #19
# d = np.argmax(cumsum >=0.95) + 1
# print('cumsum >=0.95 : ',cumsum >=0.95)
# print('d : ',d)

# import matplotlib.pyplot as plt

# plt.plot(cumsum)
# plt.grid()
# plt.show()

#2 build the model

# model = RandomForestRegressor()
model = XGBRegressor(n_jobs=-1,
                     use_label_encoder=False,
                     eval_metric='mlogloss')

score = cross_val_score(model, x_train, y_train, cv=KFold(n_splits=5, shuffle=True))
# model.fit(x_train,y_train,eval_metric='mlogloss')
# r2 = model.score(x_test,y_test)

# print(model.feature_importances_)
# print('r2 : ',r2)

print(score)

# (442, 7)
# (442,)
# [0.34307936 0.41395492 0.57884725 0.32314566 0.24108945]
Example #20
data = train_df.append(test_df)
#data = autoclean(data)
train, test = data[0:len(train_df)], data[len(train_df):]

# Organize our data for training
X = train.drop(["y"], axis=1)
Y = train["y"]
x_test = test.drop(["y"], axis=1)
X, X_Val, Y, Y_Val = train_test_split(X, Y)

# A parameter grid for XGBoost
params = {
    'min_child_weight': [4, 5],
    'gamma': [i / 10.0 for i in range(3, 6)],
    'subsample': [i / 10.0 for i in range(6, 11)],
    'colsample_bytree': [i / 10.0 for i in range(6, 11)],
    'max_depth': [2, 3, 4, 5]
}

# Initialize XGB and GridSearch
xgb = XGBRegressor(nthread=4) 

grid = GridSearchCV(xgb, params)
grid.fit(X, Y)

# Print the r2 score
print(r2_score(Y_Val, grid.best_estimator_.predict(X_Val))) 

# Save the file
y_test = grid.best_estimator_.predict(x_test)
results_df = pd.DataFrame(data={'y':y_test}) 
ids = test_df["ID"]
joined = pd.DataFrame(ids).join(results_df)
joined.to_csv("mercedes.csv", index=False)

# This scored 0.5563 for me on the LB
Example #21
parameters = [{
    'max_depth': [13, 14, 15, 16],
    'criterion': ['mse'],
    'n_estimators': [298, 299],
    'max_features': ['sqrt'],
}]
grid_search = GridSearchCV(estimator=rfr, param_grid=parameters, cv=10)
grid_search = grid_search.fit(X, y)
print("Best score:", grid_search.best_score_)
print("Best parameters:", grid_search.best_params_)

# =============================================================================
# STEP 7: BONUS Use XGBoost
# =============================================================================
# Fitting XGBoost to the dataset
xgbr = XGBRegressor()
xgbr.fit(X_train, y_train)

# Cross validation (10-fold validation)
xgbr_score = cross_val_score(estimator=xgbr, X=X_train, y=y_train, cv=10)
print("----------------------------------------------")
print("Step 7:")
print("XGBoost score:", xgbr_score)
print("Mean score:", xgbr_score.mean())
print("Standard Deviation:", xgbr_score.std())
print("----------------------------------------------")

# Show features' score by descending order
ftscrxgboost = sorted(zip(
    map(lambda x: round(x, 4), xgbr.feature_importances_),
    backupdataset.columns.values),
Example #22
rf = RandomForestRegressor(n_estimators=100, random_state=0)
rf.fit(x_train, y_train)

y_pred = rf.predict(x_test)
plt.scatter(y_test, y_pred)

rf.score(x_test, y_test)

print('MAE :', " ", metrics.mean_absolute_error(y_test, y_pred))
print('MSE :', " ", metrics.mean_squared_error(y_test, y_pred))
print('RMSE :', " ", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

# XGBoost

xgb = XGBRegressor(n_estimators=500,
                   max_depth=4,
                   learning_rate=0.1,
                   early_stopping_rounds=10)
xgb.fit(x_train, y_train, eval_set=[(x_test, y_test)], verbose=False)

y_pred = xgb.predict(x_test)
plt.scatter(y_test, y_pred)

xgb.score(x_test, y_test)

plot_importance(xgb)

print('MAE :', " ", metrics.mean_absolute_error(y_test, y_pred))
print('MSE :', " ", metrics.mean_squared_error(y_test, y_pred))
print('RMSE :', " ", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest=train_test_split(X,Y,test_size=0.3,random_state=0)


# In[79]:


from sklearn.model_selection import train_test_split
xtrains,xtests,ytrains,ytests=train_test_split(Xs,Y,test_size=0.3,random_state=0)


# In[56]:


from xgboost import XGBRegressor
model=XGBRegressor()
model.fit(xtrain,ytrain)


# In[80]:


from xgboost import XGBRegressor
model=XGBRegressor()
model.fit(xtrains,ytrains)


# In[ ]:


ypred=model.predict(xtest)
Example #24
logging.info('2. RandomForestRegressor- start predict')
rf.predict(train[col])
score_rf = test_model(rf, X_test, y_test)
print(score_rf)
#train_error_rf = round(mean_squared_error(y_train, rf.predict(X_train)), 3)
#test_error_rf = round(mean_squared_error(y_test, rf.predict(X_test)), 3)

logging.info('2.1 Feature importances...')
importances = rf.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]
dtrain = xgboost.DMatrix(train[col], label=y)

logging.info('3. Quick&Dirty XGBoost...')
xgb = XGBRegressor()
xgb.fit(X_train, y_train)
#train_error_xgb = round(mean_squared_error(y_train, xgb.predict(X_train)), 3)
#test_error_xgb = round(mean_squared_error(y_test, xgb.predict(X_test)), 3)
#print('XGBoost train error: {}'.format(train_error_xgb))
#print('XGBoost test error: {}'.format(test_error_xgb))
score_xgb = test_model(xgb, X_test, y_test)
logging.info('3. End XGBoost...')

params = dict(max_depth=list(range(5, 10)), n_estimators=[100, 700], learning_rate=np.arange(0, 1, 0.05))
logging.info('4. Start Grid Search')
grid_search = GridSearchCV(xgb, param_grid=params, n_jobs=-1).fit(X_train, y_train)
logging.info('4. End Grid Search')
# summarize the results of the grid search
print('Best Estimator: {}'.format(grid_search.best_estimator_))
print('Best Parameters: {}'.format(grid_search.best_params_))
Example #25
print(x_train.shape)        # (8000, 71)
print(x_test.shape)         # (2000, 71)
print(y_train.shape)        # (8000, 4)
print(y_test.shape)         # (2000, 4)

# # y_train1 = y_train[:, 0]
# # y_train2 = y_train[:, 1]
# # y_train3 = y_train[:, 2]
# # y_train4 = y_train[:, 3]

# # y_test1 = y_test[:, 0]
# # y_test2 = y_test[:, 1]
# # y_test3 = y_test[:, 2]
# # y_test4 = y_test[:, 3]

xgbr = XGBRegressor()
# model.fit(x_train, y_train)
# score = model.score(x_test, y_test)
# print('R2 :', score)

model = MultiOutputRegressor(xgbr)
model.fit(x_train,y_train)
# print(len(model.estimators_))
# print(model.estimators_[0].feature_importances_)

for i in range(len(model.estimators_)):
    threshold = np.sort(model.estimators_[i].feature_importances_)

    for thresh in threshold:
        selection = SelectFromModel(model.estimators_[i], threshold=thresh, prefit=True)
Example #26
y_test = y_test.iloc[1:]

# %%
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import plot_importance, plot_tree
plt.style.use('fivethirtyeight')

model = XGBRegressor(
    n_estimators=1000,
    #max_depth=8,
    #min_child_weight=300,
    #colsample_bytree=0.8,
    #subsample=0.8,
    #eta=0.3,
    #seed=42
)

model.fit(X_train,
          y_train,
          eval_metric="rmse",
          eval_set=[(X_train, y_train), (X_test, y_test)],
          verbose=False,
          early_stopping_rounds=100)

# %%
model.feature_importances_
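plot_importance is imported above but never called; a short usage sketch for the fitted model:

# visualize gain-based importances for the trained booster
plot_importance(model, importance_type='gain', max_num_features=20)
plt.show()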
Example #27
    def __init__(self, rfe_cv, *args, **kwargs):
        self.rfe = None
        self.rfe_cv = rfe_cv
        self.model = XGBRegressor(*args, **kwargs)
    def setClf(self):
        self.clf = XGBRegressor(max_depth=7,
                                learning_rate=0.01,
                                n_estimators=100)

        return
Example #29
    def __init__(self):
        self.classifier_param_list = [
            {
                "model": [LogisticRegression(fit_intercept=False)],
                "model__C": [1, 5, 10],
            },
            {
                "model": [DecisionTreeClassifier()],
                "model__min_samples_split": [0.25, 0.5, 1.0],
                "model__max_depth": [5, 10, 15],
            },
            {
                "model": [RandomForestClassifier()],
                "model__min_samples_split": [0.25, 0.5, 1.0],
                "model__max_depth": [5, 10, 15],
            },
            {
                "model": [BaggingClassifier()],
                "model__n_estimators": [5, 10, 15],
                "model__max_features": [0.25, 0.5, 1.0],
            },
            {
                "model": [AdaBoostClassifier()],
                "model__n_estimators": [5, 10, 15],
                "model__learning_rate": [0.001, 0.01, 0.1],
            },
            {
                "model": [MLPClassifier()],
                "model__activation": ["identity", "logistic", "tanh", "relu"],
                "model__alpha": [0.001, 0.01, 0.1],
            },
            {
                "model": [XGBClassifier()],
                "model__n_estimators": [5, 10, 15],
                "model__learning_rate": [0.001, 0.01, 0.1],
            },
            {
                "model": [lgb.LGBMClassifier()],
                "model__learning_rate": [0.001, 0.01, 0.1],
                "model__n_estimators": [5, 10, 15],
                "model__num_leaves": [5, 10, 15],
            },
            {
                "model": [CatBoostClassifier()],
                "model__learning_rate": [0.001, 0.01, 0.1],
                "model__depth": [5, 10, 15],
                "model__l2_leaf_reg": [5, 10, 15],
            },
        ]

        self.regressor_param_list = [
            {
                "model": [ElasticNet(fit_intercept=False)],
                "model__alpha": [0.001, 0.01, 0.1],
                "model__l1_ratio": [0.25, 0.5, 1.0],
            },
            {
                "model": [DecisionTreeRegressor()],
                "model__min_samples_split": [0.25, 0.5, 1.0],
                "model__max_depth": [5, 10, 15],
            },
            {
                "model": [RandomForestRegressor()],
                "model__min_samples_split": [0.25, 0.5, 1.0],
                "model__max_depth": [5, 10, 15],
            },
            {
                "model": [BaggingRegressor()],
                "model__n_estimators": [5, 10, 15],
                "model__max_features": [0.25, 0.5, 1.0],
            },
            {
                "model": [AdaBoostRegressor()],
                "model__n_estimators": [5, 10, 15],
                "model__learning_rate": [0.001, 0.01, 0.1],
            },
            {
                "model": [MLPRegressor()],
                "model__activation": ["identity", "logistic", "tanh", "relu"],
                "model__alpha": [0.001, 0.01, 0.1],
            },
            {
                "model": [XGBRegressor()],
                "model__n_estimators": [5, 10, 15],
                "model__learning_rate": [0.001, 0.01, 0.1],
            },
            {
                "model": [lgb.LGBMRegressor()],
                "model__learning_rate": [0.001, 0.01, 0.1],
                "model__n_estimators": [5, 10, 15],
                "model__num_leaves": [5, 10, 15],
            },
            {
                "model": [CatBoostRegressor()],
                "model__learning_rate": [0.001, 0.01, 0.1],
                "model__depth": [5, 10, 15],
                "model__l2_leaf_reg": [5, 10, 15],
            },
        ]
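The 'model__' prefixes imply these grids target a pipeline whose final step is named 'model'; a usage sketch for the regressor list, where param_list stands in for the attribute built above and X_train/y_train are assumed to exist in the caller:

from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# search every candidate model/parameter combination in the regressor list
pipe = Pipeline([('scaler', StandardScaler()), ('model', ElasticNet())])
search = GridSearchCV(pipe, param_list, cv=5, scoring='neg_mean_absolute_error')
search.fit(X_train, y_train)
print(search.best_params_)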
Example #30
X = df.drop(columns=['price'], axis=1)
Y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33)

#Encoding the regions
regions_df = np.asarray(X['region']).reshape(1, -1)
enc = OrdinalEncoder(encoding_method='ordered', variables=['region'])
enc.fit(X_train, y_train)

X_train_enc = enc.transform(X_train)
X_test_enc = enc.transform(X_test)

#fit model no training data
regressor = XGBRegressor(n_estimators=100, reg_lambda=1, gamma=0, max_depth=3)
regressor.fit(X_train_enc, y_train)

#make predictions for test data
y_pred = regressor.predict(X_test_enc)
predictions = [round(value) for value in y_pred]

#Convert normalized predictions back to actual prices by multiplying by 1,000,000
price_predictions = y_pred * 1000000
print(len(y_pred))
print(y_pred)
print(price_predictions)
price_pred_round = np.round(price_predictions, 2)
print(price_pred_round)
# evaluate predictions
mse = mean_squared_error(y_test, predictions)
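Because prices were scaled down by 1,000,000 earlier in the script, the MSE above is on the normalized scale; a sketch of reporting the error back in price units (assuming y_test uses the same normalization):

import numpy as np  # assumed already imported above

# convert the normalized-scale error back to original price units
rmse_normalized = np.sqrt(mse)
rmse_price = rmse_normalized * 1_000_000
print('RMSE (normalized):', rmse_normalized)
print('RMSE (price units):', round(rmse_price, 2))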