Пример #1
1
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}

# Report the dimensions of the training matrix before wrapping it for xgboost.
print('train shape', x_train.shape)
# x_train.to_csv('../subs/naive2.csv', index=True)

# DMatrix for the labelled training data and for the unlabelled test data.
dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test)



# Cross-validate with early stopping (patience of 20 rounds, at most 1000
# rounds), logging every 20th round, then plot the train/test RMSE curves.
cv_output = xgb.cv(xgb_params, dtrain, num_boost_round=1000, early_stopping_rounds=20,
    verbose_eval=20, show_stdv=False)
cv_output[['train-rmse-mean', 'test-rmse-mean']].plot()


# The number of rows in the CV history is reused as the round count of the
# final model, which is fitted on the full training matrix.
num_boost_rounds = len(cv_output)
print('num rounds:', num_boost_rounds)
model = xgb.train(xgb_params, dtrain, num_boost_round= num_boost_rounds)

# Draw the feature-importance chart on a tall figure so the labels fit.
fig, ax = plt.subplots(1, 1, figsize=(8, 13))
xgb.plot_importance(model, height=0.5, ax=ax)
plt.show()

# Predict on the test matrix and write the id/price_doc submission file.
y_predict = model.predict(dtest)
output = pd.DataFrame({'id': id_test, 'price_doc': y_predict})

output.to_csv('../subs/xgbSub_seed255.csv', index=False)
Пример #2
0
def train_helper(X_train, X_test, y_train, y_test, model_name):
    """Train a multi-class xgboost classifier, plot its importances and save it.

    Parameters
    ----------
    X_train, y_train :
        Features and labels used for fitting.
    X_test, y_test :
        Features and labels used as the evaluation set for early stopping.
    model_name :
        Key passed to ``load_label_encoder`` (to obtain the class count) and
        to ``classifier_filename`` (to obtain the output path).

    Returns
    -------
    The trained booster returned by ``xgboost.train``.
    """
    xg_train = xgboost.DMatrix(X_train, label=y_train)
    xg_test = xgboost.DMatrix(X_test, label=y_test)

    le = load_label_encoder(model_name)

    param = {
        # multi-class objective that outputs one probability per class
        'objective': 'multi:softprob',
        'eta': 0.002,
        'max_depth': 7,
        'nthread': 7,
        'num_class': len(le.classes_),
        'eval_metric': 'merror',
    }

    evals = [(xg_train, 'train'), (xg_test, 'eval')]

    # Train xgboost; only the training call is timed so that the reported
    # duration is not inflated by the plotting that follows.
    print("Training classifier...")
    t1 = time.time()
    bst = xgboost.train(param, xg_train, 500, evals, early_stopping_rounds=10)
    t2 = time.time()
    print(t2 - t1)

    xgboost.plot_importance(bst)
    bst.save_model(classifier_filename(model_name))
    return bst
Пример #3
0
    def test_importance_plot_lim(self):
        """Check the axis limits of plot_importance, default and user-supplied."""
        np.random.seed(1)
        features = np.random.randn(100, 100)
        labels = [0, 1] * 50
        booster = xgb.train({}, xgb.DMatrix(features, label=labels))
        assert len(booster.get_fscore()) == 71

        # No limits given: the axes are expected to span (0, 11) x (-1, 71).
        default_ax = xgb.plot_importance(booster)
        assert default_ax.get_xlim() == (0., 11.)
        assert default_ax.get_ylim() == (-1., 71.)

        # Explicit limits must be reflected by the returned axes.
        limited_ax = xgb.plot_importance(booster, xlim=(0, 5), ylim=(10, 71))
        assert limited_ax.get_xlim() == (0., 5.)
        assert limited_ax.get_ylim() == (10., 71.)
Пример #4
0
def run_xgb(train, test, features, target, random_state=0):
    """Fit a 12-class xgboost model on a hold-out split and predict the test set.

    Parameters
    ----------
    train :
        Labelled frame; it is split 70/30 into fit and validation parts and
        must support ``train[features]`` / ``train[target]`` selection.
    test :
        Frame to predict; only ``test[features]`` is read.
    features :
        Column selector for the model inputs.
    target :
        Name of the label column in ``train``.
    random_state : int, default 0
        Seed used both for the split and for xgboost.

    Returns
    -------
    (list, float)
        Test-set class probabilities as a nested list and the validation
        log-loss.
    """
    eta = 0.02
    max_depth = 5
    subsample = 0.75
    colsample_bytree = 0.7
    start_time = time.time()

    print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
    params = {
        "objective": "multi:softprob",
        "num_class": 12,
        "booster": "gbtree",
        "eval_metric": "mlogloss",
        "eta": eta,
        "max_depth": max_depth,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "silent": 1,
        "seed": random_state,
    }
    num_boost_round = 500 * 2
    early_stopping_rounds = 50
    test_size = 0.3

    X_train, X_valid = train_test_split(train, test_size=test_size, random_state=random_state)
    print('Length train:', len(X_train.index))
    print('Length valid:', len(X_valid.index))
    y_train = X_train[target]
    y_valid = X_valid[target]
    dtrain = xgb.DMatrix(X_train[features], y_train)
    dvalid = xgb.DMatrix(X_valid[features], y_valid)

    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=True)

    # print() call instead of the Python 2 statement, matching the rest of
    # the function (the statement form is a SyntaxError on Python 3).
    print("importance of feathure")
    xgb.plot_importance(gbm)
    show()

    # best_iteration is a 0-based round index while ntree_limit is a count,
    # so passing best_iteration would drop the best round itself;
    # best_ntree_limit is the matching count.
    best_limit = gbm.best_ntree_limit

    print("Validating...")
    check = gbm.predict(xgb.DMatrix(X_valid[features]), ntree_limit=best_limit)
    score = log_loss(y_valid.tolist(), check)

    print("Predict test set...")
    test_prediction = gbm.predict(xgb.DMatrix(test[features]), ntree_limit=best_limit)

    print('Training time: {} minutes'.format(round((time.time() - start_time)/60, 2)))
    return test_prediction.tolist(), score
 def run_train_validation(self):
     """Train on the train split while monitoring the validation split.

     Builds DMatrix objects from ``self.get_train_validationset()``, trains
     with ``self.xgb_params`` / ``self.xgb_learning_params`` (prepared by
     ``self.set_xgb_parameters()``), shows the feature-importance chart and
     prints the features in use. Returns None.
     """
     x_train, y_train, x_validation, y_validation = self.get_train_validationset()
     dtrain = xgb.DMatrix(x_train, label=y_train, feature_names=x_train.columns)
     dvalidation = xgb.DMatrix(x_validation, label=y_validation, feature_names=x_validation.columns)
     self.set_xgb_parameters()

     evals = [(dtrain, 'train'), (dvalidation, 'eval')]
     model = xgb.train(self.xgb_params, dtrain, evals=evals, **self.xgb_learning_params)
     xgb.plot_importance(model)
     plt.show()

     # Single-argument print() behaves the same on Python 2 and Python 3.
     print("features used:\n {}".format(self.get_used_features()))
def test_sklearn_plotting():
    """Smoke-test the plotting helpers with a fitted XGBClassifier."""
    tm._skip_if_no_sklearn()
    from sklearn.datasets import load_iris

    dataset = load_iris()

    clf = xgb.XGBClassifier()
    clf.fit(dataset.data, dataset.target)

    # Select the non-interactive backend before any figure is created.
    import matplotlib
    matplotlib.use('Agg')

    from matplotlib.axes import Axes
    from graphviz import Digraph

    # Importance chart: default title and labels, four bars.
    importance_ax = xgb.plot_importance(clf)
    assert isinstance(importance_ax, Axes)
    assert importance_ax.get_title() == 'Feature importance'
    assert importance_ax.get_xlabel() == 'F score'
    assert importance_ax.get_ylabel() == 'Features'
    assert len(importance_ax.patches) == 4

    # Graphviz export of the first tree.
    graph = xgb.to_graphviz(clf, num_trees=0)
    assert isinstance(graph, Digraph)

    # Matplotlib rendering of the first tree.
    tree_ax = xgb.plot_tree(clf, num_trees=0)
    assert isinstance(tree_ax, Axes)
Пример #7
0
    def test_plotting(self):
        """Exercise plot_importance, to_graphviz and plot_tree on a saved model."""
        booster = xgb.Booster(model_file='xgb.model')

        # Select the non-interactive backend before any figure is created.
        import matplotlib
        matplotlib.use('Agg')

        from matplotlib.axes import Axes
        from graphviz import Digraph

        red = (1.0, 0, 0, 1.0)
        blue = (0, 0, 1.0, 1.0)

        # Defaults: standard title and axis labels, four bars.
        axes = xgb.plot_importance(booster)
        assert isinstance(axes, Axes)
        assert axes.get_title() == 'Feature importance'
        assert axes.get_xlabel() == 'F score'
        assert axes.get_ylabel() == 'Features'
        assert len(axes.patches) == 4

        # Custom texts and a single colour applied to every bar.
        axes = xgb.plot_importance(booster, color='r',
                                   title='t', xlabel='x', ylabel='y')
        assert isinstance(axes, Axes)
        assert axes.get_title() == 't'
        assert axes.get_xlabel() == 'x'
        assert axes.get_ylabel() == 'y'
        assert len(axes.patches) == 4
        for bar in axes.patches:
            assert bar.get_facecolor() == red

        # Texts disabled with None and one colour per bar.
        axes = xgb.plot_importance(booster, color=['r', 'r', 'b', 'b'],
                                   title=None, xlabel=None, ylabel=None)
        assert isinstance(axes, Axes)
        assert axes.get_title() == ''
        assert axes.get_xlabel() == ''
        assert axes.get_ylabel() == ''
        assert len(axes.patches) == 4
        for position, expected in enumerate((red, red, blue, blue)):
            assert axes.patches[position].get_facecolor() == expected

        # Tree export and rendering.
        graph = xgb.to_graphviz(booster, num_trees=0)
        assert isinstance(graph, Digraph)
        axes = xgb.plot_tree(booster, num_trees=0)
        assert isinstance(axes, Axes)
Пример #8
0
 def save_topn_features(self, fname="XGBRegressor_topn_features.txt", topn=-1):
     """Write feature names taken from the importance plot to ``fname``.

     The y tick labels of ``xgb.plot_importance(self.model)`` are read in
     reverse order and written one per line. ``topn=-1`` writes all of
     them; otherwise at most ``topn`` labels are written.
     """
     importance_ax = xgb.plot_importance(self.model)
     # Reversed tick order -- presumably most important first; verify
     # against the plot.
     labels = importance_ax.get_yticklabels()[::-1]
     limit = len(labels) if topn == -1 else min(topn, len(labels))
     with open(fname, "w") as f:
         # max(..., 0): a negative limit writes nothing, as range() would.
         for tick in labels[:max(limit, 0)]:
             f.write("%s\n" % tick.get_text())
Пример #9
0
def plot_feat_importances():
    """Plot xgboost and random-forest feature importances.

    Reads the module-level ``X_train``, ``y_train``, ``df_xgb`` and the
    fitted forest ``rf``. Returns None.
    """
    gbm = xgboost.XGBClassifier(silent=False, seed=8).fit(X_train, y_train)
    plot = xgboost.plot_importance(gbm)
    # NOTE(review): the labels are applied in raw column order; if
    # plot_importance orders its bars by score this mislabels them -- confirm.
    plot.set_yticklabels(df_xgb.columns)

    importances = rf.feature_importances_
    # Spread of each feature's importance across the individual trees.
    std = np.std([tree.feature_importances_ for tree in rf.estimators_],
                 axis=0)
    indices = np.argsort(importances)
    positions = range(len(indices))
    # Horizontal bars extend along x, so the spread belongs in xerr (yerr
    # would draw the error bars along the category axis).
    plt.barh(positions, importances[indices], xerr=std[indices], color='lightblue')
    # The bars are sorted by importance, so the labels must be reordered
    # with the same permutation.
    plt.yticks(positions, df_xgb.columns[indices])
Пример #10
0
    def plot_importance(self, ax=None, height=0.2,
                        xlim=None, title='Feature importance',
                        xlabel='F score', ylabel='Features',
                        grid=True, **kwargs):

        """Plot importance based on fitted trees.

        Parameters
        ----------
        ax : matplotlib Axes, default None
            Target axes instance. If None, new figure and axes will be created.
        height : float, default 0.2
            Bar height, passed to ax.barh()
        xlim : tuple, default None
            Tuple passed to axes.xlim()
        title : str, default "Feature importance"
            Axes title. To disable, pass None.
        xlabel : str, default "F score"
            X axis title label. To disable, pass None.
        ylabel : str, default "Features"
            Y axis title label. To disable, pass None.
        grid : bool, default True
            Forwarded to xgb.plot_importance as its ``grid`` argument.
        kwargs :
            Other keywords passed to ax.barh()

        Returns
        -------
        ax : matplotlib Axes

        Raises
        ------
        ValueError
            If the wrapped estimator is not an ``xgb.XGBModel``.
        """

        import xgboost as xgb

        if not isinstance(self._df.estimator, xgb.XGBModel):
            raise ValueError('estimator must be XGBRegressor or XGBClassifier')
        # Forward the caller's grid choice; it used to be hard-coded to True,
        # which silently ignored grid=False.
        return xgb.plot_importance(self._df.estimator.booster(),
                                   ax=ax, height=height, xlim=xlim, title=title,
                                   xlabel=xlabel, ylabel=ylabel, grid=grid, **kwargs)
Пример #11
0
import xgboost as xgb

# Subset the data and set up model parameters
offset = 5000
num_round = 500
xgtest = xgb.DMatrix(test)
gb_params = {
    "objective": "reg:linear",
    "eta": 0.01,
    "min_child_weight": 6,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    "scale_pos_weight": 1,
    "silent": 1,
    "max_depth": 8,
}

# Create the train and validation DMatrices: the first `offset` rows are
# held out for validation, the remainder is used for fitting.
xgtrain = xgb.DMatrix(train[offset:, :], label=labels[offset:])
xgval = xgb.DMatrix(train[:offset, :], label=labels[:offset])

# Train the model and predict the test values
watchlist = [(xgtrain, 'train'), (xgval, 'val')]
model = xgb.train(gb_params, xgtrain, num_round, watchlist,
                  early_stopping_rounds=4)
# best_iteration is a 0-based round index, whereas ntree_limit is a count;
# best_ntree_limit is the matching count and keeps the best round itself.
xg_preds = model.predict(xgtest, ntree_limit=model.best_ntree_limit)

xgb.plot_importance(model)

# (score, feature) pairs ordered from highest to lowest score.
# .items() works on both Python 2 and 3, unlike .iteritems().
fscore = [(v, k) for k, v in model.get_fscore().items()]
fscore.sort(reverse=True)

# Send predicted scores to csv file
submission = pd.DataFrame({"Id": test_ind, "Hazard": xg_preds})
submission = submission.set_index("Id")
submission.to_csv('/Users/btrani/Git/projects/Kaggle/Liberty_Mutual/sub_gb_7.csv')

# Override the column subsampling ratio before cross-validation.
params["colsample_bytree"] = 0.8

# Cross-validate using the round count, fold count, metric and early-stopping
# patience stored in the params dict (nfold falls back to 5 when absent).
cv_results = xgb.cv(params,train_matrix,
                    num_boost_round = params["num_rounds"],
                    nfold = params.get('nfold',5),
                    metrics = params['eval_metric'],
                    early_stopping_rounds = params["early_stopping_rounds"],
                    verbose_eval = True,
                    seed = seed)

# The number of rows in the CV result is used as the tree count of the
# final model.
n_best_trees = cv_results.shape[0]

# Fit on the full training matrix, reporting the training metric only.
watchlist = [(train_matrix, 'train')]
gbt = xgb.train(params, train_matrix, n_best_trees,watchlist)

xgb.plot_importance(gbt)
# Row count and mean of is_fraud per age value (notebook-style expression;
# the result is not stored).
datas.groupby("age")['is_fraud'].agg(['count','mean'])

## false negatives cost more
## false positives are acceptable
########### plot ROC on validation set
# Hold out 30% of the training data as a validation set for the ROC curve.
Xtrain_only,Xvalid,ytrain_only,yvalid = train_test_split(Xtrain,ytrain,test_size=0.3,random_state=seed)
onlytrain_matrix = xgb.DMatrix(Xtrain_only,ytrain_only)
valid_matrix = xgb.DMatrix(Xvalid,yvalid)

# Temporary model fitted on the reduced training part, scored on the hold-out.
temp_gbt = xgb.train(params, onlytrain_matrix, n_best_trees,[(onlytrain_matrix,'train_only'),(valid_matrix,'validate')])
yvalid_proba_pred = temp_gbt.predict(valid_matrix,ntree_limit=n_best_trees)

# Collect the ROC points together with their thresholds in one frame.
fpr,tpr,thresholds = roc_curve(yvalid,yvalid_proba_pred)
roc = pd.DataFrame({'FPR':fpr,'TPR':tpr,'Threshold':thresholds})
Пример #13
0
# n_job = 7 time 0:00:00.030917

# 5. Visualization
import matplotlib.pyplot as plt
import numpy as np
'''
# def plot_feature_importances_dataset(model): 
#     n_features = dataset.data.shape[1]
#     plt.barh(np.arange(n_features), model.feature_importances_,
#         align='center')
#     plt.yticks(np.arange(n_features), dataset.feature_names)
#     plt.xlabel("Feature Improtances")
#     plt.ylabel("Features")
#     plt.ylim(-1, n_features)
'''
plot_importance(model)  # xgboost's plot_importance
plt.show()
# Inspect the importances via the F score

# DecisionTreeClassifier
# before column cleanup
# [0.0125026  0.         0.03213177 0.95536562]
# acc :  0.9333333333333333

# after column cleanup
# [0.44369011 0.53961888 0.01669101]
# acc after column cleanup :  0.9

# RandomForestClassifier
# before column cleanup
# [0.08142011 0.02056809 0.41412333 0.48388846]
Пример #14
0
def fea_plot(xg_model,
             feature,
             label,
             type='weight',
             max_num_features=None,
             x_axis_label=None,
             ranks_dir='./'):
    """Plot feature importances beside per-class feature means and save the ranking.

    Parameters
    ----------
    xg_model :
        Fitted model exposing ``get_score(importance_type=...)``.
    feature :
        2-D array indexed as ``feature[:, columns]`` and ``feature[rows]``.
    label :
        Array compared element-wise with 1; rows equal to 1 are plotted as
        'Malware', all other rows as 'Normal'.
    type : str, default 'weight'
        Importance type; also used in the rank file name and as the x label.
    max_num_features : int or None
        Limit forwarded to ``get_fea_index`` and ``xgb.plot_importance``.
    x_axis_label : sequence or None
        Optional display names; when given they replace the ``f<i>`` names.
    ranks_dir : str, default './'
        Sub-directory of ``../average_rank/ranks/`` receiving the rank file.

    Returns
    -------
    None. Draws on a new two-panel figure and writes ``index_<type>.txt``.
    """
    fig, AX = plt.subplots(nrows=1, ncols=2)
    raw_scores = xg_model.get_score(importance_type=type)
    fscore = sorted(raw_scores.items(), key=itemgetter(1),
                    reverse=True)  # highest score first
    fea_index = get_fea_index(fscore, max_num_features)

    # Save the complete ranking to a file.
    path_to_save = '../average_rank/ranks/' + ranks_dir
    if not os.path.isdir(path_to_save):
        # makedirs also creates missing parent directories, where a plain
        # mkdir would raise.
        os.makedirs(path_to_save)
    path_to_save = path_to_save + '/index_' + type + '.txt'

    all_feat_index = [i + 1 for i in get_fea_index(fscore, None)]
    print('fscore len')
    print(len(all_feat_index))
    # "is not None" rather than "!= None": the latter is evaluated
    # element-wise for array-like labels and cannot be used in an if.
    if x_axis_label is not None:
        all_x_axis_label = get_axis_label(all_feat_index, x_axis_label)
    else:
        all_x_axis_label = all_feat_index
    # The with-block closes the file even if a write fails.
    with open(path_to_save, 'w') as save_rank_file:
        for item in all_x_axis_label:
            save_rank_file.write("%s\n" % item)

    if x_axis_label is not None:
        # Translate the "f<i>" names to the supplied labels.
        mapper = {'f{0}'.format(i): v for i, v in enumerate(x_axis_label)}
        mapped = {mapper[k]: v for k, v in raw_scores.items()}
        xgb.plot_importance(mapped,
                            xlabel=type,
                            ax=AX[0],
                            max_num_features=max_num_features)
    else:
        xgb.plot_importance(xg_model,
                            xlabel=type,
                            importance_type=type,
                            ax=AX[0],
                            max_num_features=max_num_features)

    print(fea_index)
    print(max_num_features)

    feature = feature[:, fea_index]
    dimension = len(fea_index)
    X = range(1, dimension + 1)

    # Column means of the selected features for the two groups of rows.
    Yp = np.mean(feature[np.where(label == 1)[0]], axis=0)
    Yn = np.mean(feature[np.where(label != 1)[0]], axis=0)
    for i in range(0, dimension):
        # Scale both means of a feature by the larger of the two.
        param = np.fmax(Yp[i], Yn[i])
        if param != 0:
            Yp[i] /= param
            Yn[i] /= param
        else:
            print('oops!seems wrong')
    p1 = AX[1].bar(X, +Yp, facecolor='#ff9999', edgecolor='white')
    p2 = AX[1].bar(X, -Yn, facecolor='#9999ff', edgecolor='white')
    AX[1].legend((p1, p2), ('Malware', 'Normal'))
    AX[1].set_title('Comparison of selected features by their means')
    AX[1].set_xlabel('Feature Index')
    AX[1].set_ylabel('Mean Value')
    AX[1].set_ylim(-1.1, 1.1)
    # update on 5/25/2017, this line should be added or removed according to the input data format
    fea_index = [i + 1 for i in fea_index]
    if x_axis_label is not None:
        tar_x_axis_label = get_axis_label(fea_index, x_axis_label)
    else:
        tar_x_axis_label = fea_index
    plt.xticks(X, tar_x_axis_label, rotation=80)
    plt.suptitle('Feature Selection results')

    # seems useless
    SMALL_SIZE = 8
    MEDIUM_SIZE = 10
    BIGGER_SIZE = 11

    plt.rc('font', size=SMALL_SIZE)  # controls default text sizes
    plt.rc('axes', titlesize=SMALL_SIZE)  # fontsize of the axes title
    plt.rc('axes', labelsize=BIGGER_SIZE)  # fontsize of the x and y labels
    plt.rc('xtick', labelsize=BIGGER_SIZE)  # fontsize of the tick labels
    plt.rc('ytick', labelsize=BIGGER_SIZE)  # fontsize of the tick labels
    plt.rc('legend', fontsize=SMALL_SIZE)  # legend fontsize
    plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title
Пример #15
0
 def plot_feature_importances(self, model):
     """Draw the importance chart for ``model`` with ``plot_importance`` and show it."""
     plot_importance(model)
     plt.show()
Пример #16
0
 def plot_importance_matrix(self,vars_names):
     """Draw the xgboost feature-importance chart for ``self.clf``.

     ``vars_names`` is accepted for interface compatibility but is not used.
     The leftover ``pdb.set_trace()`` breakpoint was removed because it
     halted every call in an interactive debugger.
     """
     xgb.plot_importance(self.clf)
Пример #17
0
# 2.5th / 97.5th percentiles of fscore along axis 0, used below as the lower
# and upper ends of the error bars.
fscore_lo = np.percentile(fscore, 2.5, axis=0)
fscore_hi = np.percentile(fscore, 97.5, axis=0)
# Order every per-feature array by ascending mean score.
ind_sort = np.array(np.argsort(fscore_mean))
fscore_mean_sorted = fscore_mean[ind_sort]
# ci_sorted = fscore_ci[ind_sort]
fscore_lo_sorted = fscore_lo[ind_sort]
fscore_hi_sorted = fscore_hi[ind_sort]
feature_label_sorted = feature_label[ind_sort]
# Replace each sorted label by its entry in the `dic` lookup.
feature_label_short = []
for i in range(feature_label_sorted.size):
    feature_label_short.append(dic[feature_label_sorted[i]])
    
# Notebook-only: render matplotlib output inline.
get_ipython().magic(u'matplotlib inline')
plt.figure(figsize=(4,12))
axes = plt.gca()
# plt.barh(np.arange(val_sorted.size), val_sorted, xerr=ci_sorted, height=.7, color=(.4,.4,.8), align='center', ecolor=(0,0,0))
# Asymmetric error bars: distance from the mean down to the 2.5th percentile
# and up to the 97.5th percentile.
plt.barh(np.arange(fscore_mean_sorted.size), fscore_mean_sorted,          xerr=np.array([fscore_mean_sorted-fscore_lo_sorted,fscore_hi_sorted-fscore_mean_sorted]),          height=.7, color=(.4,.4,.8), align='center', ecolor=(0,0,0))
plt.yticks(np.arange(len(feature_label_short)), feature_label_short, fontsize=12, color=(0,0,0));
# axes.set_ylim([3.5, len(feature_label_short)-9.5])
# axes.set_xlim([0, 0.04])
plt.box(on=False)
plt.xlabel('Gini Importance',fontsize=14)
plt.grid()


# In[ ]:

np.percentile(fscore, 2.5, axis=0)
# NOTE(review): plot_importance is called without a model argument here;
# this looks like unfinished scratch code -- confirm which model was meant.
xgb.plot_importance()

Пример #18
0
# 5-fold cross-validation on the 'error' metric for up to 3000 rounds with an
# early-stopping patience of 100 rounds.
# NOTE(review): our_params is used here but assigned further down in this
# fragment -- presumably it was already defined in an earlier cell; confirm.
cv_xgb = xgb.cv(params = our_params, dtrain = xgdmat, num_boost_round = 3000, nfold = 5,
                metrics = ['error'], # Make sure you enter metrics inside a list or you may encounter issues!
                early_stopping_rounds = 100) # Look for early stopping that minimizes error

# Show the last five rows of the CV history.
print('Tail:\n')
print(cv_xgb.tail(5))


# Parameters of the final model.
our_params = {'eta': 0.1, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8,
             'objective': 'binary:logistic', 'max_depth':3, 'min_child_weight':1}

print('Final Train: \n')
# Fixed round count -- presumably taken from the CV run above; confirm.
final_gb = xgb.train(our_params, xgdmat, num_boost_round = 432)

xgb.plot_importance(final_gb)
plt.show()
# Predicting:

testdmat = xgb.DMatrix(X_pred)
y_pred = final_gb.predict(testdmat)

# Threshold the predicted values at 0.5 to obtain 0/1 labels.
y_pred[y_pred > 0.5] = 1
y_pred[y_pred <= 0.5] = 0

y_pred = y_pred.astype(np.int64)
#Submission

submission = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": y_pred
# draw feature importance
# Line plot of the random-forest importances, one point per feature.
plt.plot(RF_clf_Ab.feature_importances_)
plt.title("Feature importance by Random Forest Model")
plt.show()
#-----------------------------------------------------------------------------#
# 1.7 train the XGBClassifier on the training data
xgb_clf_Ab = xgb.XGBClassifier()
xgb_clf_Ab.fit(X_Ab_train,y_Ab_train)

# accuracy on the held-out test data
xgb_acc_Ab= xgb_clf_Ab.score(X_Ab_test,y_Ab_test)
print("Accuracy of xgb: {:.4f}".format(xgb_acc_Ab))

# draw feature importance
xgb.plot_importance(xgb_clf_Ab,title = 'Feature importance',xlabel = 'F score', ylabel = 'Features', grid = True )
plt.show()

#-----------------------------------------------------------------------------#
# 2. train the models on the training data with 5-fold cross validation and check the stability of prediction accuracy
#-----------------------------------------------------------------------------#
# 2.1 train the logistic regression model on the training data with 5-fold cross validation
LR_cv = linear_model.LogisticRegression()
LR_scores = cross_validation.cross_val_score(LR_cv, X_Ab,y_Ab, cv = 5)
print("Logistic Regression")
print(LR_scores)
# Report the mean score with a band of two standard deviations.
print("Accuracy of LR: %0.2f (+/- %0.2f)" %(LR_scores.mean(),LR_scores.std()*2))

#-----------------------------------------------------------------------------#
# 2.2 train SVM classifier with 5-fold cross validation
Пример #20
0
 def plot_importance(self):
     """Plot the importances of ``self.model``, save the top features, return the Axes.

     ``self.save_topn_features()`` is called with its default arguments.
     """
     ax = xgb.plot_importance(self.model)
     self.save_topn_features()
     return ax

# Class probabilities of the reloaded model on the test features.
y_pred=loaded_model.predict_proba(X_test)


# In[147]:


# Log-loss of the predicted probabilities; the bare name below displays the
# value in a notebook cell.
ll = log_loss(y_test,y_pred)
ll


# In[148]:


# Feature-importance chart of the reloaded model.
xgb.plot_importance(loaded_model)


# # XGBOOST PARAMETER TUNING

# In[149]:


HYPER_PARAMS = { 
 'learning_rate': 0.20,
 'n_estimators':0,
 'max_depth': 5,
 'subsample': 0.7,
 'colsample_bytree': 0.9,
 'max_delta_step': 1,
 'objective': 'multi:softmax',
Пример #22
0
# Label and title the actual-vs-predicted-vs-Vegas chart.
ax.tick_params(axis='x', rotation=90)
ax.set_xlabel('GAME DATE')
ax.set_ylabel('POINTS SCORED')
ax.set_title('MAVS 2020 ACTUAL vs PREDICTED vs VEGAS\n BEAT VEGAS 13 out of 25 (DON\'T BET ON THIS!)')
ax.legend(loc=2)

# Residuals of the model and of the Vegas line; the comparison below is a
# notebook-style expression showing where the model was closer.
dal_res = dal_pred - dal_y_test
vegas_res = dal_test_vegas - dal_y_test
abs(dal_res) < abs(vegas_res)

plt_x
# NOTE(review): the assignment was split across two lines ("plt_y = " with
# the expression on the next line), which is a syntax error; joined on the
# assumption that the following expression is the intended value -- confirm.
plt_y = X_test.columns[plt_x]
fig,ax = plt.subplots(figsize=(30,24))
# plt.scatter(y_test, bst1.predict(X_test)-y_test)
xgb.plot_importance(bst1, ax=ax)
ax.barh(plt_y, plt_x)



    # dal_train_vegas = season[(season.SEASON_ID==i)].iloc[:int(len(season[(season.SEASON_ID==i)])*(2/3)), 5]




# DUMMY REGRESSOR:
# (section heading turned into a comment; as bare text it was a syntax error)

dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(X_train, y_train)
#21.982204049481744
mean_squared_error(y_test, dummy_regr.predict(X_test), squared = False)
Пример #23
0
                                      colsample_bytree=colsample,
                                      subsample=subsample)
                m.fit(Xtr, ytr)
                pp = m.predict_proba(Xts)[:, 1]
                if FINAL_SUBMISSION:
                    import datetime
                    timestamp = datetime.datetime.now().strftime(
                        '%Y-%m-%d-%H:%M')
                    scores = np.c_[np.arange(len(pp)), pp]
                    np.savetxt('../out/vilab-submission-%s.csv' % timestamp,
                               scores, '%d,%.8f', ',', header='id,probability',
                               comments='')
                    toc()
                else:
                    toc('cs=%.2f md=%2d lr=%.2f mcw=%1d g=%d score=%.4f' % (
                        colsample, max_depth, learning_rate, min_child_weight,
                        gamma, roc_auc_score(yts, pp)))
                sys.stdout.flush()

import matplotlib.pyplot as plt
# Turn interactive mode off, then plot the importances of model `m`,
# save the figure as a PDF and display it.
plt.ioff()
xgb.plot_importance(m, tick_label=names)
plt.savefig('xgb-features.pdf')
plt.show()

# Disabled (kept as a string literal): render and save the first tree.
'''
xgb.plot_tree(m)
plt.savefig('xgb-tree.pdf', dpi=900)
plt.show()
'''
Пример #24
0
#test = []
pred2 = model.predict(dtest)

# Store the original and the predicted values side by side for comparison.
df2 = pd.DataFrame()
df2["Orginal"] = testDelay
df2["Predicted"] = pred2
df2.to_csv('compareDelay.csv', index = False)


import matplotlib.pyplot as plt
plt.style.use("ggplot")
# Translate the "f<i>" feature names to the training column names.
mapper = {'f{0}'.format(i): v for i, v in enumerate(train.columns)}
mapped = {mapper[k]: v for k, v in model.get_fscore().items()}
import operator
# plot_importance accepts a Booster, an XGBModel or a dict, so it must be
# given the dict before it is turned into a sorted list of pairs.
xgb.plot_importance(mapped)
plt.show()
mapped = sorted(mapped.items(), key=operator.itemgetter(1))
df = pd.DataFrame(mapped, columns=['feature', 'fscore'])
# Normalise the scores so that they sum to 1.
df['fscore'] = df['fscore'] / df['fscore'].sum()
df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(25, 15))
plt.title('XGBoost Feature Importance')
plt.xlabel('relative importance')
plt.gcf().savefig('feature_importance_xgb.png')


# Start a fresh figure; otherwise the scatter plot would be drawn on top of
# the importance bar chart that is still the current figure.
plt.figure()
xx = np.linspace(-10,500)
yy = xx
# Identity line: where the points would lie if predictions were exact.
h0 = plt.plot(xx, yy, 'k-', label="ideal Values")
plt.scatter(df2.Orginal, df2.Predicted, c = 'y')
plt.legend()
plt.show()
Пример #25
0
##xgboost.train(params, dtrain, num_boost_round=10, evals=(), obj=None, 
##              feval=None, maximize=False, early_stopping_rounds=None, 
##              evals_result=None, verbose_eval=True, learning_rates=None, 
##              xgb_model=None)
#
evallist  = [(dtest,'eval'), (dtrain,'train')]

watchlist = [ (xg_train,'train'), (xg_test, 'test') ]
evals_result = {}
num_round = 10
# evals_result is only filled for the datasets listed in `evals`; without
# the watchlist it stayed empty and the watchlist itself was unused.
bst = xgb.train(param, xg_train, num_round, evals=watchlist, evals_result=evals_result)
pred = bst.predict(xg_test)

# Share of test rows whose integer-truncated prediction differs from the label.
print ('predicting, classification error=%f' % (sum( int(pred[i]) != test_Y[i] for i in range(len(test_Y))) / float(len(test_Y)) ))

xgb.plot_importance(bst)
xgb.plot_tree(bst, num_trees=2)

#=============Logistic Regression==============================================================

#Define sigmoid function
def sigmoid(z):
    """Return the logistic function 1 / (1 + e^(-z)) evaluated at ``z``."""
    denominator = 1 + e ** (-z)
    return 1 / denominator

#Calcualte the cost to be minimized -- using the sigmoid function
def cost(theta, X, y, l):
    m = X.shape[0] #Number of rows in the data
    z = X.dot(theta)
    O = (-1 / m) * (log(sigmoid(z)).T.dot(y)  +  log(1-sigmoid(z)).T.dot((1-y)))
#    print(m)
#    print(theta)
Пример #26
0
def train(param, num_round=1000, early_stopping_rounds=20):
    """Train an xgboost model and write the model, reports and predictions.

    Parameters
    ----------
    param : dict
        xgboost parameters; a copy extended with the round settings is
        dumped as JSON next to the model.
    num_round : int, default 1000
        Maximum number of boosting rounds.
    early_stopping_rounds : int, default 20
        Early-stopping patience on the validation set.

    Returns
    -------
    None. All results are written under ``model_path`` and
    ``submission_path``, each suffixed with the run timestamp; these and the
    other path/label names are module-level settings defined outside this
    function.
    """
    exec_time = time.strftime("%Y%m%d%I%p%M", time.localtime())

    os.mkdir('{0}_{1}'.format(model_path, exec_time))
    os.mkdir('{0}_{1}'.format(submission_path, exec_time))

    train_params = param.copy()
    train_params['num_boost_round'] = num_round
    train_params['early_stopping_rounds'] = early_stopping_rounds
    # The with-block closes the handle (it used to be leaked); text mode lets
    # json.dump work on Python 3 as well as Python 2.
    with open('{0}_{1}{2}'.format(model_path, exec_time, model_params), 'w') as params_file:
        json.dump(train_params, params_file)

    print('get training data')

    train_features = pd.read_csv(train_path + 'train_features.csv').astype(float)
    train_labels = pd.read_csv(train_path + 'labels.csv').astype(float)

    validate_features = pd.read_csv(validate_path + 'train_features.csv').astype(float)
    validate_labels = pd.read_csv(validate_path + 'labels.csv').astype(float)

    predict_features = pd.read_csv(predict_path + 'train_features.csv').astype(float)

    create_feature_map(train_features.columns.tolist(), '{0}_{1}{2}'.format(model_path, exec_time, model_fmap_file))

    train_matrix = xgboost.DMatrix(train_features.values, label=train_labels.values, feature_names=train_features.columns)
    val_matrix = xgboost.DMatrix(validate_features.values, label=validate_labels.values, feature_names=validate_features.columns)
    predict_matrix = xgboost.DMatrix(predict_features.values, feature_names=predict_features.columns)

    watchlist = [(train_matrix, 'train'), (val_matrix, 'eval')]

    print('model training')
    with open('{0}_{1}{2}'.format(model_path, exec_time, model_train_log), 'w') as outf:
        # Redirect stdout into the log file for the duration of training.
        sys.stdout = outf
        try:
            model = xgboost.train(param, train_matrix, num_boost_round=num_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds)
        finally:
            # Always restore stdout, even if training raises; otherwise it
            # would keep pointing at the closed log file.
            sys.stdout = save_stdout

    print('model.best_score: {0}, model.best_iteration: {1}, model.best_ntree_limit: {2}'.format(model.best_score, model.best_iteration, model.best_ntree_limit))

    print('output offline model data')
    model.save_model('{0}_{1}{2}'.format(model_path, exec_time, model_file))
    model.dump_model('{0}_{1}{2}'.format(model_path, exec_time, model_dump_file))

    # Feature scores normalised to sum to 1, written in ascending order.
    importance = model.get_fscore(fmap='{0}_{1}{2}'.format(model_path, exec_time, model_fmap_file))
    importance = sorted(importance.items(), key=operator.itemgetter(1))
    df = pd.DataFrame(importance, columns=['feature', 'fscore'])
    df['fscore'] = df['fscore'] / df['fscore'].sum()
    df.to_csv('{0}_{1}{2}'.format(model_path, exec_time, model_feature_importance_csv), index=False)

    xgboost.plot_importance(model)
    plt.gcf().set_size_inches(20, 16)
    plt.gcf().set_tight_layout(True)
    plt.gcf().savefig('{0}_{1}{2}'.format(model_path, exec_time, model_feature_importance_file))
    plt.close()

    train_pred_labels = model.predict(train_matrix, ntree_limit=model.best_ntree_limit)
    val_pred_labels = model.predict(val_matrix, ntree_limit=model.best_ntree_limit)

    train_pred_frame = pd.Series(train_pred_labels, index=train_features.index)
    train_pred_frame.name = probability_consumed_label
    val_pred_frame = pd.Series(val_pred_labels, index=validate_features.index)
    val_pred_frame.name = probability_consumed_label

    train_true_frame = pd.read_csv(train_path + 'labels.csv')['Label']
    val_true_frame = pd.read_csv(validate_path + 'labels.csv')['Label']
    train_coupons = pd.read_csv(train_path + 'dataset.csv')
    val_coupons = pd.read_csv(validate_path + 'dataset.csv')
    train_check_matrix = train_coupons[[coupon_label]].join(train_true_frame).join(train_pred_frame)
    val_check_matrix = val_coupons[[coupon_label]].join(val_true_frame).join(val_pred_frame)
    # Single-argument print() keeps the output identical on Python 2 and 3;
    # the spacing reproduces what the former print statements emitted.
    print('Average auc of train matrix:  {0}'.format(check_average_auc(train_check_matrix)))
    print('Average auc of validate matrix {0}'.format(check_average_auc(val_check_matrix)))

    # Add the hard 0/1 decision (threshold 0.5) next to the probability.
    val_coupons = val_coupons.join(val_pred_frame).join(val_pred_frame.map(lambda x: 0. if x < 0.5 else 1.).rename('map')).join(val_true_frame)
    val_coupons.to_csv('{0}_{1}{2}'.format(model_path, exec_time, val_diff_file), index=False)
    print(confusion_matrix(val_coupons['Label'], val_coupons['map']))

    labels = model.predict(predict_matrix, ntree_limit=model.best_ntree_limit)
    frame = pd.Series(labels, index=predict_features.index)
    frame.name = probability_consumed_label

    plt.figure()
    frame.hist(figsize=(10, 8))
    plt.title('results histogram')
    plt.xlabel('predict probability')
    plt.gcf().savefig('{0}_{1}{2}'.format(submission_path, exec_time, submission_hist_file))
    plt.close()

    submission = pd.read_csv(predict_path + 'dataset.csv')
    submission = submission[[user_label, coupon_label, date_received_label]].join(frame)
    submission.to_csv('{0}_{1}{2}'.format(submission_path, exec_time, submission_file), index=False)
Пример #27
0
#%%

predictions = my_model.predict(val_X)
#print (predictions)

#%%

#print((mean_squared_error(val_y, )

# Error metrics on the validation set.
rmse = np.sqrt(mean_squared_error(val_y, predictions))
R = r2_score(val_y, predictions)
MAPE = np.mean(np.abs((val_y - predictions) / val_y)) * 100

print("RMSE: %f" % (rmse))

print("R^: ", R)

print("MAPE: ", MAPE)

#%%
# figure.figsize only applies to figures created afterwards, so it has to be
# set before the plotting call rather than after it.
plt.rcParams['figure.figsize'] = [5, 5]
xgb.plot_importance(my_model)
plt.show()
#%%
plt.rcParams['figure.figsize'] = [50, 10]
xgb.plot_tree(my_model, num_trees=0)
plt.show()
#xgb.plot_tree(my_model,num_trees=0)
#plt.rcParams['figure.figsize'] = [50, 10]
#plt.show()
Пример #28
0
    model = xgb.train(params, dtrain, 200, watchlist, maximize=True, early_stopping_rounds = 25, verbose_eval=5)
    del dvalid
else:
    dtrain = xgb.DMatrix(train, y)
    del train, y
    gc.collect()
    watchlist = [(dtrain, 'train')]
    model = xgb.train(params, dtrain, 30, watchlist, maximize=True, verbose_eval=1)

# Drop the training matrix and collect garbage before loading the test data.
del dtrain
gc.collect()

print('[{}] Finish XGBoost Training'.format(time.time() - start_time))

# Plot the feature importance from xgboost and save the current figure
plot_importance(model)
plt.gcf().savefig('feature_importance_xgb.png')

# Load the test set for prediction and attach the per-ip click counts
test = pd.read_csv(path+"test.csv", usecols=test_columns, dtype=dtypes)
test = pd.merge(test, ip_count, on='ip', how='left', sort=False)
del ip_count
gc.collect()

# Keep the click ids for the submission frame before dropping the column.
sub['click_id'] = test['click_id'].astype('int')

test['clicks_by_ip'] = test['clicks_by_ip'].astype('uint16')
test = timeFeatures(test)
# click_id and ip are removed before building the prediction matrix.
test.drop(['click_id', 'ip'], axis=1, inplace=True)
dtest = xgb.DMatrix(test)
del test
                              reg_alpha=0.05,
                              reg_lambda=2,
                              subsample=1.0,
                              colsample_bytree=1.0,
                              max_delta_step=1,
                              scale_pos_weight=1,
                              objective='multi:softprob',
                              nthread=8,
                              seed=0  # ,
                              # silent = False
                              )
    print('training...')
    xgb_model.fit(training, label)
    print('predicting...')
    predicted = xgb_model.predict_proba(testing)
    predicted = pandas.DataFrame(predicted)
    predicted.columns = xgb_model.classes_
    # Name index column.
    predicted.index.name = 'Id'
    # Write csv.
    print('Saving prediction...')
    predicted.to_csv('Prediction.csv')
    # feature importance
    feat_imp = pandas.Series(xgb_model.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    matplotlib.pyplot.show()
    plot_importance(xgb_model, title='Feature importance')
    matplotlib.pyplot.show()
    plot_tree(xgb_model, num_trees=0)
    matplotlib.pyplot.show()
Пример #30
0
# best : {'gamma': 0.4, 'learning_rate': 0.05740649534056902, 'max_depth': 5, 'min_child_weight': 6, 'n_estimators': 166, 'subsample': 0.6}

# Train the model one last time with all of the optimized parameters, including those found from the grid search & early stopping
final_model = xgb.train(final_params, xgb_train, num_boost_round = best_iteration)

# Create an XGBoost data matrix for the testing data.
X_test_xgbmat = xgb.DMatrix(X_test)

# With the trained final model, predict the labels of our test set
# (predictions are thresholded at 0.5 to obtain 0/1 labels)
y_pred = final_model.predict(X_test_xgbmat)
y_pred[y_pred > 0.5] = 1
y_pred[y_pred <= 0.5] = 0

# Produce accuracy score for the above prediction. The goal is to improve upon the MLE baseline estimate.
accuracy = accuracy_score(y_pred, y_test)


# Single-argument print() gives the same output on Python 2 and Python 3;
# the spacing reproduces what the former print statements emitted.
print("Test Set Accuracy:  {0}".format(accuracy))
print("Baseline MLE Accuracy:  {0}".format(max(1-np.sum(y_train)/float(y_train.size), np.sum(y_train)/float(y_train.size))))
importance_dict = final_model.get_score(importance_type= 'gain')


# Select top 'n' features to show in the importance plot
n = 10
# .items() works on both Python 2 and 3, unlike .iteritems().
top_n_importance_dict = dict(sorted(importance_dict.items(), key=operator.itemgetter(1), reverse=True)[:n])

# Plot top 'n' features with respect to average gain per split; the title is
# built from n so that it cannot disagree with the number of bars shown.
xgb.plot_importance(top_n_importance_dict, xlabel = 'Average Gain Per Split', title = "'" + y_string + "'\n" + 'Top {0} Feature Importance'.format(n))
plt.show()

def XGBoost_regressor2():
    """
    Train an XGBoost model with XGBoost lib.

    This method is mainly used to find relative importance of
    the features.

    Side effects:
        * reads 'train_libSVM.dat', 'all_train_libSVM.dat',
          'test_libSVM.dat', 'validate_libSVM.dat', 'xgb.fmap' and the
          pickled 'id_test';
        * prints the feature f-scores, unsorted and sorted ascending;
        * pickles the ascending (feature, score) list to
          'importance_of_feature_file';
        * draws the importance plot (it is not shown or saved here);
        * writes the rescaled test predictions to 'submission.csv'.

    Returns nothing.
    """
    train = xgb.DMatrix('train_libSVM.dat')
    # Only referenced by the disabled xgb.cv call further down; still loaded
    # so the function keeps failing early if the file is missing.
    all_train = xgb.DMatrix('all_train_libSVM.dat')
    test = xgb.DMatrix('test_libSVM.dat')
    validation = xgb.DMatrix('validate_libSVM.dat')
    param = {'max_depth': 11, 'eta': 0.002, 'silent': 1,
             'objective': 'reg:linear', 'gamma': 2.2,
             'subsample': 0.8, 'colsample_bytree': 0.7,
             'scale_pos_weight': 0.55, 'min_child_weight': 5,
             'n_jobs': 4}
    # 0.03-> 900, 1600 without features of SVD similarity between
    #                  search term and other columns
    # eta    ntrees   error
    # 0.03-> 900 ->  0.2397 * 2 = 4795
    # 0.025 -> 1900 ->            4782
    # 0.06 -> 640 -> 0.2400 * 2 = 4801
    ############# common brand & SVD brand deleted ############
    # 0.03-> 900 ->  0.2397 * 2 = 4794
    ############ add KL distance ########
    # 0.03 -> 966 -> 0.2397
    # 0.03 -> 1102 -> 0.234 or so
    ##### add spell checking
    # round = 200
    # depth = 12 -> 0.235371
    # depth = 11 min_cw = 5 -> 0.235316   SELECTED
    # depth = 10 -> 0.235840
    # depth = 9 -> 0.235912
    # depth = 8 -> 0.236202
    # min_child_weight = 6 -> 0.235679
    # min_child_weight = 4 -> 0.235478

    ###### as of April 16
    # 3500, 0.01, 0.238908
    # 1500, 0.03, 0.238893
    # 750, 0.06, 0.239034
    watchlist = [(validation, 'eval'), (train, 'train')]
    # TODO: do data cleaning again.
    # add approximate matching
    # check KL distance
    # n = 1096
    num_round = 10000
    xgb_model = xgb.train(param, train, num_round, watchlist)
    # xgb_model = xgb.cv(param, all_train, num_round, nfold=5,
    #                    metrics={'error'})
    # print xgb_model.head()
    # xgb_model.info()

    prediction = xgb_model.predict(test)
    importance = xgb_model.get_fscore(fmap='xgb.fmap')
    # Parenthesised single-argument prints behave the same on Python 2 and 3.
    print(importance)
    sorted_importance = sorted(importance.items(),
                               key=operator.itemgetter(1))
    print(sorted_importance)
    # Pickle streams must be written in binary mode, and the context manager
    # closes the handle even if dumping raises.
    with open('importance_of_feature_file', 'wb') as importance_file:
        pickle.dump(sorted_importance, importance_file)

    xgb.plot_importance(xgb_model)
    test_id = pd.read_pickle('id_test')
    # Rescale the raw output and clamp it to the [1, 3] range.
    # NOTE(review): presumably the training target was normalised as
    # (relevance - 1) / 2 -- confirm against the feature-building code.
    prediction = prediction * 2 + 1
    prediction[prediction > 3] = 3
    prediction[prediction < 1] = 1
    # NOTE(review): the return value is discarded, so this only has an effect
    # if clean_result modifies 'prediction' in place -- confirm.
    clean_result(prediction)
    submission = pd.DataFrame({"id": test_id.values, "relevance": prediction})
    submission.to_csv('submission.csv', index=False)
Пример #33
0
 def plot_importance(self):
     """Draw the feature-importance chart for the booster held by self.model."""
     booster = self.model._Booster
     xgb.plot_importance(booster)
Пример #34
0
    # Drop this fold's DMatrix objects and reclaim their memory right away.
    del xgb_train, xgb_val
    gc.collect()

    # Score the fold on its validation split, using only the best iteration.
    val_pred = bst.predict(xgb.DMatrix(X_val),
                           ntree_limit=bst.best_ntree_limit)
    cv_scores.append(roc_auc_score(y_val, val_pred))
    print(cv_scores)

    print('predicting...')
    # Accumulate the test predictions across folds: the first fold
    # initialises the running sum, later folds add to it in place.
    fold_pred = bst.predict(xgb.DMatrix(np.array(test_x)),
                            ntree_limit=bst.best_ntree_limit)
    if i == 0:
        pred = fold_pred
    else:
        pred += fold_pred

# The raw training arrays are no longer needed; free them.
del train_x, train_y
gc.collect()

mean_score = np.mean(cv_scores)
print('mean_score:', mean_score)

# Divide the accumulated predictions by the fold count, then clip them
# away from exactly 0 and 1 before storing them on the test frame.
pred /= folds
clipped_pred = pred.clip(0.0000001, 0.999999)
df_test['is_churn'] = clipped_pred
df_test = df_test[['msno', 'is_churn']]

# df_test.to_csv(out_path + 'stack_submissions{}.csv'.format(datetime.now().strftime("%Y%m%d-%H%M%S")), index=False)
# Rebind the name to an empty list so the frame can be garbage collected.
df_test = []

# Feature importance of the last trained booster on a 7x7 inch figure.
plt.rcParams['figure.figsize'] = (7.0, 7.0)
xgb.plot_importance(booster=bst)
plt.show()
# plt.savefig('./feature_importance.png', dpi=100)
Пример #35
0
def plotFeatureImportance(bst):
    """Plot the feature importance of ``bst`` and save it as a PNG.

    The chart is drawn on a 10x10 inch figure and written to
    'feature_importance_xgb.png' at 200 dpi.

    Args:
        bst: Trained booster (or any object accepted by
            ``xgb.plot_importance``).
    """
    # Create the axes explicitly and hand them to xgboost. Without ``ax``
    # plot_importance opens a figure of its own, which left the sized figure
    # empty and saved a default-sized one instead.
    fig, ax = plt.subplots(figsize=(10, 10), dpi=200)
    xgb.plot_importance(bst, height=0.2, ax=ax)
    fig.savefig('feature_importance_xgb.png', dpi=200)
Пример #36
0
# Notebook-style inspection of the column dtypes; a no-op when run as a script.
# (A dangling, incomplete "train." expression that followed was a syntax
# error and has been removed.)
train.dtypes


# In[2]:

# Split the training frame into the feature columns and the 'count' target.
# The axis is passed by keyword: the positional form is rejected by newer
# pandas releases.
X_train = train.drop("count", axis=1)
Y_train = train['count']
T_train_xgb = xgb.DMatrix(X_train, Y_train)
# NOTE(review): 'bst:max_depth' is the legacy prefixed spelling of
# 'max_depth' -- confirm that the installed xgboost still honours it.
params = {"objective": "reg:linear", 'bst:max_depth': 13, "booster": "gbtree"}
gbm = xgb.train(dtrain=T_train_xgb, params=params)
X_test = xgb.DMatrix(test)
Y_pred = gbm.predict(X_test)
print(Y_pred)
xgb.plot_importance(gbm)


# In[ ]:

plt.show()


# In[35]:

result['count'] = pd.DataFrame(Y_pred)
#result[result['count']<=0] = 0

result_final = result.set_index('datetime')
# Rows whose predicted count is not positive get every column set to 0.
result_final[result_final['count'] <= 0] = 0
result_final
    # Separate the 'cardio' target from the training features (dropped in place).
    y = train.cardio
    train.drop('cardio', axis=1, inplace=True)

    # z: one row per test id, with the final blended output 'y' initialised to 0.
    z = pd.DataFrame()
    z['id'] = test.id
    z['y'] = 0

    # v: frame holding the training targets under column 'y'.
    v = pd.DataFrame()
    v['y'] = y

    train2, y, test2 = cleanup_and_generate(train, y, test)

    # Run each base model. Presumably every helper stores its predictions in
    # a same-named column of z (z.et1, z.keras1, ...), since those columns are
    # read by the blend below -- confirm against the helper definitions.
    # et1 and rf1 receive plain arrays (.values); the others receive the frames.
    et1(train2.values, y, test2.values, v, z)
    keras1(train2, y, test2, v, z)
    rf1(train2.values, y, test2.values, v, z)
    xgb1(train2, y, test2, v, z)
    xgb2(train2, y, test2, v, z)

    # Weighted blend: 0.4 for each xgb model, the remaining 0.2 shared equally
    # by the keras, random-forest and extra-trees outputs.
    z.y = z.xgb1 * 0.4 + z.xgb2 * 0.4 + (z.keras1 + z.rf1 + z.et1) * (0.2 / 3)
    z.y = prestore(z.y)
    save_results(v, z)

    print('done: %s.'%(now()))

    #'''
    # Fit a standalone classifier and draw one importance plot per
    # importance type (weight, gain, cover).
    clf = xgb.XGBClassifier(n_estimators=1000, learning_rate=.005)
    clf.fit(train2, y)
    for c in ['weight', 'gain', 'cover']:
        xgb.plot_importance(clf, title = 'Feature ' + c, importance_type=c)
    #'''
# True value
y_predict = X_test_now[:,price_index]    
X_test_now = np.delete(X_test_now, [house_pk_index, price_index], axis = 1)

# Predicted value
dpredict = xgb.DMatrix(X_test_now)
ypred_with_evallist = bst_with_evallist.predict(dpredict)

RMSE = np.sqrt(((ypred_with_evallist - y_predict) ** 2).mean())

print('######################################')
print("RMSE of bst_with_evallist :", RMSE)
print('The r2 score for this Group %d is : %4f' % (test_group - 1, \
                                                   metrics.r2_score(y_predict, ypred_with_evallist)))
print('')
'''

# In[Importance plot & Trees plotted into .pdf]
# Attribute's importance plot
'''
xgb.plot_importance(bst_with_evallist)

# Tree plot and saved into pdf
num_trees = len(bst_with_evallist.get_dump())
for tree_index in range(num_trees):
    dot = xgb.to_graphviz(bst_with_evallist, num_trees = tree_index)
    dot.render("trees/tree{}".format(tree_index))
'''