def build_prediction_model(path, percentage, para_tuning_mark, last_mark):
    """Train an XGBoost classifier on one data split and print test metrics.

    The CSV files ``train_<percentage>``, ``dev_<percentage>`` and
    ``test_<percentage>`` are read from ``path``; when ``last_mark`` is
    truthy the same names with a ``_last`` suffix are read instead.

    Args:
        path: Prefix prepended verbatim to the file names (no separator
            is inserted).
        percentage: Value embedded in the file names via ``str()``.
        para_tuning_mark: If truthy, run ``para_tuning_0`` instead of
            fitting the fixed-parameter model.
        last_mark: If truthy, use the ``_last`` files. Nothing is done
            when ``percentage == 1.0`` in this mode.

    Returns:
        None. AUC, F1 and Cohen's kappa on the test set are printed.

    Raises:
        KeyError: If the ``label`` column holds a value other than 0 or 1.
        ZeroDivisionError: If the training data has no label equal to 1.
    """
    # Read data
    if last_mark and percentage == 1.0:
        return
    suffix = "_last" if last_mark else ""
    train = pandas.read_csv(path + "train_" + str(percentage) + suffix)
    dev = pandas.read_csv(path + "dev_" + str(percentage) + suffix)
    test = pandas.read_csv(path + "test_" + str(percentage) + suffix)

    # Keep only the columns that are not all zeros in the training data
    nonzero_colums = train.loc[:, (train != 0).any(axis=0)].columns

    # Ratio of negative to positive labels, passed as scale_pos_weight.
    # Iterating the Series directly avoids Series.iteritems(), which was
    # removed in pandas 2.0.
    scale_pos_weight = {0: 0, 1: 0}
    for value in train['label']:
        scale_pos_weight[value] += 1
    scale_value = scale_pos_weight[0] / float(scale_pos_weight[1])

    # Build prediction model
    predictors = [x for x in nonzero_colums if x != 'label']

    if para_tuning_mark:
        # Parameter tuning guides:
        # https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/
        # https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

        # Parameter: learning_rate
        para_tuning_0(train, dev, test, scale_value)
        # Further tuning stages, enabled one at a time:
        # para_tuning_1(train, dev, test, scale_value)
        # para_tuning_2(train, dev, test, scale_value)
        # para_tuning_3(train, dev, test, scale_value)
        # para_tuning_4(train, dev, test, scale_value)
        return

    xgb = XGBClassifier(learning_rate=0.015,
                        n_estimators=686,
                        max_depth=9,
                        min_child_weight=5,
                        gamma=0.0,
                        subsample=0.8,
                        colsample_bytree=0.8,
                        reg_alpha=0.01,
                        objective='binary:logistic',
                        nthread=4,
                        scale_pos_weight=scale_value,
                        seed=27)

    xgb.fit(train[predictors], train['label'], eval_metric='auc')
    dtest_predprob = xgb.predict_proba(test[predictors])[:, 1]

    # Hard predictions are obtained by rounding the probabilities.
    dtest_pred = dtest_predprob.round()
    print(
        "AUC/F1 Score/Kappa (Test):\t%f\t%f\t%f\t" %
        (metrics.roc_auc_score(test['label'], dtest_predprob),
         metrics.f1_score(test['label'], dtest_pred),
         metrics.cohen_kappa_score(test['label'], dtest_pred)))
Пример #2
0
def train(ite):
    """Fit the shared ``xgb`` model on a rebalanced sample and store test probabilities.

    700 rows are sampled from ``train_target_0`` and combined with all of
    ``train_target_1``; the module-level ``xgb`` model is refitted on that
    data and column 1 of its ``predict_proba`` output for ``test_data`` is
    written to ``res[ite]``.

    Args:
        ite: Key/index under which the predictions are stored in ``res``.
    """
    # Local import keeps this block self-contained; pandas.concat replaces
    # DataFrame.append, which was removed in pandas 2.0.
    import pandas as pd

    # Report the iteration actually being trained (the original printed an
    # unrelated name ``i`` instead of the parameter).
    print(ite)
    # Original note: the data shows 1:0 = 17:2 (> 0.5) -- presumably the
    # class ratio that motivates down-sampling class 0; TODO confirm.
    data = pd.concat([train_target_0.sample(700), train_target_1])
    y_ = data.target
    del data['target']
    xgb.fit(data, y_)
    #    train_p[ite] = xgb.predict(train_data)
    res[ite] = xgb.predict_proba(test_data)[:, 1]
Пример #3
0
def trainxgb(model_id,train_x,train_y,valid_x,valid_y,test_x):
    """Train one randomly-parameterised XGBoost model and keep it if it scores well.

    The training data is shuffled, a model with randomly drawn
    hyper-parameters is fitted, and if ``test(valid_y, predictions)`` is
    below 0.450 the validation and test probabilities are saved to CSV
    files named after ``model_id``.
    """
    train_x, train_y = shuffle(train_x, train_y)

    seed = random.randint(0, 1000000)
    print('random state: {state}'.format(state=seed))

    # The dict literal is evaluated top to bottom, so the random draws
    # happen in the same order as in a direct keyword call.
    settings = {
        'base_estimator': 'gbtree',
        'objective': 'multi:softprob',
        'metric': 'mlogloss',
        'num_classes': 9,
        'learning_rate': random.uniform(0.01, 0.05),
        'max_depth': random.randint(10, 20),
        'max_samples': random.uniform(0.0, 1.0),
        'max_features': random.uniform(0.0, 1.0),
        'max_delta_step': random.randint(1, 10),
        'min_child_weight': random.randint(1, 10),
        'min_loss_reduction': 1,
        'l1_weight': 0.0,
        'l2_weight': 0.0,
        'l2_on_bias': False,
        'gamma': 0.02,
        'inital_bias': random.uniform(0.0, 1.0),
        'random_state': seed,
        'watchlist': [[valid_x, valid_y]],
        'n_jobs': 30,
        'n_iter': 3000,
    }
    model = XGBoostClassifier(**settings)
    model.fit(train_x, train_y)

    valid_predictions = model.predict_proba(valid_x)

    # Only models whose validation score beats the threshold are persisted.
    if not (test(valid_y, valid_predictions) < 0.450):
        return

    test_predictions = model.predict_proba(test_x)
    suffix = str(model_id) + ".csv"
    data.saveData(valid_predictions, "../valid_results/valid_" + suffix)
    data.saveData(test_predictions, "../results/results_" + suffix)
Пример #4
0
def predict(data, slot):
    """Return column 1 of ``predict_proba`` from the stored XGBoost model for ``slot``.

    The model is loaded with joblib from the path ``MODELS_D % slot``.
    """
    model = joblib.load(MODELS_D % slot)

    # Blends with a KNN model (0.8 / 0.2) and an SGD model (0.9 / 0.1) were
    # tried earlier and are disabled; the XGBoost output is returned as is.
    return model.predict_proba(data)[:, 1]
Пример #5
0
def test_xgb(test_tbl, xgb_model, train_list):
    """Score ``xgb_model`` on a test table and print AUC and KS.

    Args:
        test_tbl: Table identifier passed to ``data_preprocess``; it is also
            used as the prefix of the KS detail output name.
        xgb_model: Fitted model exposing ``predict_proba``.
        train_list: Feature names used in training; the test features are
            restricted to these columns in this order.
    """
    df_test_x, df_test_y, f_list_test, df_median = data_preprocess(test_tbl)

    # Rebuild the feature frame so its columns match the training features.
    df_test = pd.DataFrame()
    for e in train_list:
        df_test[e] = df_test_x[e]
    df_test_x = df_test
    # df_test_x.fillna(-1, inplace=True)
    # The parenthesised single-argument form prints the same text under
    # both Python 2 and Python 3.
    print('Read test done')

    test_y = np.array(df_test_y)
    test_x = np.array(df_test_x)
    y_proba = xgb_model.predict_proba(test_x)

    # Column 1 of predict_proba is used as the score for both metrics.
    ks_dict = run_ks(test_y, y_proba[:, 1])
    auc = roc_auc_score(test_y, y_proba[:, 1])
    print("%f\t%f" % (auc, ks_dict['ks']))
    print_ks(ks_dict, test_tbl + '_score_ks_detail')
def xgboost_param_solution():
    """Fit a fixed-parameter XGBoost model on train.csv and write a submission.

    The ``target`` column is label-encoded, every column except ``target``
    and ``id`` is used as a feature, and the class probabilities for
    test.csv are passed to ``write_submission``.
    """
    model = XGBoostClassifier(
        alpha=0,
        booster='gbtree',
        colsample_bytree=0.459971793632,
        early_stopping_rounds=30,
        eta=0.0305648288294,
        eval_metric='mlogloss',
        gamma=0.0669039612464,
        l=0,
        lambda_bias=0,
        max_delta_step=4,
        max_depth=14,
        min_child_weight=8,
        nthread=4,
        ntree_limit=0,
        num_class=9,
        num_round=1000,
        objective='multi:softprob',
        seed=84425,
        silent=0,
        subsample=0.972607582489,
        use_buffer=True,
    )

    train_df = load_data('train.csv')
    test_df = load_data('test.csv')

    # Replace the raw target labels with integer codes.
    encoder = preprocessing.LabelEncoder()
    train_df['target'] = encoder.fit_transform(train_df['target'])

    excluded = ['target', 'id']
    feature_cols = [name for name in train_df.columns if name not in excluded]
    train_features = train_df[feature_cols]
    test_features = test_df[feature_cols]

    targets = train_df['target']
    submission_ids = test_df['id']

    model.fit(train_features, targets)
    probabilities = model.predict_proba(test_features)
    write_submission(submission_ids, probabilities,
                     'submissions/xgboost_param_solution_76.csv')
Пример #7
0
# Scores of the RF2 model: F1 on the validation set, ROC AUC on both sets.
print ('F1 Score',f1_score(y_val,RF2preds_val))
print ('ROC AUC Score',roc_auc_score(y_train,RF2predprob_train))
print ('ROC AUC Score',roc_auc_score(y_val,RF2predprob_val))

# As we can see, we got an improved score on the tuned datasets, and the
# validation set had a better F1 score than both our first RF and both LRs,
# but our AUC score went down somewhat. Next we will compare XGBoost to these
# models to see if it does better. First, let's look at the confusion matrix
# and the classification reports to figure out what still needs to improve.
print ('Training Confusion Matrix',confusion_matrix(y_train,RF2preds_train))
print ('Val Confusion Matrix',confusion_matrix(y_val,RF2preds_val))
print ('Training Classification report',classification_report(y_train,RF2preds_train))
print ('Val Classification Report',classification_report(y_val,RF2preds_val))

# Now that we have improved the model with the RF, we can see whether XGBoost
# gets us better numbers and then choose the best model. Start with the base
# model (default parameters).
xgb = XGBClassifier()
xgb.fit(x_train, y_train)

# Base predicted probabilities (column 1 of predict_proba) for the training
# and validation sets; these are used for the AUC scores.
xgbpredprob_train = xgb.predict_proba(x_train)[:, 1]
xgbpredprob_val = xgb.predict_proba(x_val)[:, 1]

# Hard class predictions, used for the F1 score and for recall/precision if
# we want them.
xgbpreds_train = xgb.predict(x_train)
xgbpreds_val = xgb.predict(x_val)

# Look at the error to assess fit and efficacy. The "aucpr" eval metric is
# used because we focus more on the F1 score for prediction; plain AUC is
# computed later. This re-runs the fit while evaluating against the
# validation set, without any tuning. The results computed above (without the
# evaluation step) serve as the general baseline below.
eval_set = [(x_val, y_val)]
eval_metric = ["aucpr","error"]
%time xgb.fit(x_train, y_train, eval_metric=eval_metric, eval_set=eval_set, verbose=2)

# We get a good base score with a low error to start; let's see if we can improve. (aucpr 0.913701, error 0.125)

# Results from the initial baseline model, computed without the evaluation step, that we want to improve.
print ('F1 Score',f1_score(y_train,xgbpreds_train))
Пример #8
0
# initial the model
xgb = xgb.XGBClassifier(parameters=xgb_parameters)

"""## Training and validation"""

# split validation set
X_train, X_val, y_train, y_val = train_test_split(train.drop(columns=['user_id','product_id','reordered']),
                                                  train['reordered'], 
                                                  test_size=0.3, random_state=42)

# fit the model
xgb.fit(X_train, y_train)

# make prediction
y_pred = (xgb.predict_proba(X_val)[:, 1] >= 0.21).astype('int') #setting a threshold

!pip install scikit-plot

# evaluation
from sklearn.metrics import f1_score, classification_report
from scikitplot.metrics import plot_confusion_matrix
from scikitplot.classifiers import plot_feature_importances
print('F1 Score: {}'.format(f1_score(y_pred, y_val)))
print(classification_report(y_pred, y_val))
# plot confusion matrix
plot_confusion_matrix(y_pred, y_val)

# plot importance
features = train.drop(columns=['user_id','product_id','reordered'])
plot_feature_importances(xgb, feature_names=features.columns, x_tick_rotation=90, max_num_features=20, figsize=(10,8))
def alation_test(path):
    """Run a feature-group ablation study and print test metrics per group.

    ``train_1.0`` and ``test_1.0`` are read from ``path``. For each of the
    eight named non-linguistic feature groups, and then for three slices of
    the remaining columns, an XGBoost model is trained WITHOUT that group
    and its AUC, F1 and Cohen's kappa on the test set are printed.

    Args:
        path: Prefix prepended verbatim to the file names.

    Raises:
        KeyError: If the ``label`` column holds a value other than 0 or 1.
        ZeroDivisionError: If the training data has no label equal to 1.
    """
    # Read data
    train = pandas.read_csv(path + "train_1.0")
    test = pandas.read_csv(path + "test_1.0")

    # Keep only the columns that are not all zeros in the training data
    nonzero_colums = train.loc[:, (train != 0).any(axis=0)].columns

    # Ratio of negative to positive labels, passed as scale_pos_weight.
    # Iterating the Series directly avoids Series.iteritems(), which was
    # removed in pandas 2.0.
    scale_pos_weight = {0: 0, 1: 0}
    for value in train['label']:
        scale_pos_weight[value] += 1
    scale_value = scale_pos_weight[0] / float(scale_pos_weight[1])

    # Build prediction model
    non_linguistic_features = [
        'duration', 'utterance_tutor', 'utterance_student', 'words_tutor',
        'words_student', 'unique_words_tutor', 'unique_words_student',
        'unique_concepts_tutor', 'unique_concepts_student', 'new_words_tutor',
        'new_words_student', 'new_concepts_tutor', 'new_concepts_student',
        'wait_time', 'responsiveness_mean', 'alignment_all',
        'alignment_concept', 'complexity_tutor', 'complexity_student',
        'questions_tutor', 'questions_student', 'sentiment_tutor',
        'sentiment_student', 'tutor_experience', 'student_experience'
    ]

    features_groups = [
        ['duration', 'utterance_tutor', 'utterance_student', 'words_tutor',
         'words_student'],
        ['unique_words_tutor', 'unique_words_student',
         'unique_concepts_tutor', 'unique_concepts_student',
         'new_words_tutor', 'new_words_student',
         'new_concepts_tutor', 'new_concepts_student'],
        ['wait_time', 'responsiveness_mean'],
        ['alignment_all', 'alignment_concept'],
        ['complexity_tutor', 'complexity_student'],
        ['questions_tutor', 'questions_student'],
        ['sentiment_tutor', 'sentiment_student'],
        ['tutor_experience', 'student_experience'],
    ]

    # Columns outside the named groups; computed once because it does not
    # change between iterations.
    remaining_columns = [
        x for x in train.columns if x not in non_linguistic_features
    ]
    n_groups = len(features_groups)

    # Feature groups
    for i in range(n_groups + 3):
        if i < n_groups:
            print(features_groups[i])
            # Copy into a set so the group definition itself is not mutated.
            excluded = set(features_groups[i])
        else:
            # k-th slice of the remaining columns: the first two slices are
            # 100 columns wide (offset by one), the last takes the rest.
            k = i - n_groups
            start = 100 * k + 1
            if i < n_groups + 2:
                excluded = set(remaining_columns[start:start + 100])
            else:
                # Trigrams
                excluded = set(remaining_columns[start:])
        excluded.add('label')

        # The same predictor columns are used for training and testing.
        predictors = [x for x in nonzero_colums if x not in excluded]

        xgb = XGBClassifier(learning_rate=0.015,
                            n_estimators=686,
                            max_depth=9,
                            min_child_weight=5,
                            gamma=0.0,
                            subsample=0.8,
                            colsample_bytree=0.8,
                            reg_alpha=0.01,
                            objective='binary:logistic',
                            nthread=4,
                            scale_pos_weight=scale_value,
                            seed=27)

        xgb.fit(train[predictors], train['label'], eval_metric='auc')
        dtest_predprob = xgb.predict_proba(test[predictors])[:, 1]

        # Hard predictions are obtained by rounding the probabilities.
        dtest_pred = dtest_predprob.round()
        print(
            "AUC/F1 Score/Kappa (Test):\t%f\t%f\t%f\t" %
            (metrics.roc_auc_score(test['label'], dtest_predprob),
             metrics.f1_score(test['label'], dtest_pred),
             metrics.cohen_kappa_score(test['label'], dtest_pred)))
        print('')
        print('')
# Run the Bayesian hyper-parameter search on the selected features.
result = bayes_cv_tuner.fit(train_mod_std[selected_features].values,
                            target_mod.values,
                            callback=status_print)

#best_params_mod = {
#
#}

# Unpack the tuned parameters into keyword arguments; passing the dict
# positionally would bind it to the first constructor parameter (or raise
# TypeError on releases with keyword-only arguments).
# NOTE: this rebinds `xgb` from the xgboost module to the estimator instance.
xgb = xgb.XGBClassifier(**best_params)

print("3.1 model development")
# NOTE(review): the buffers are sized from `train`/`test` while the folds
# index `train_mod`/`test_mod` -- confirm these have matching lengths.
oof_xgb = np.zeros(len(train))
predictions_xgb = np.zeros(len(test))

for fold_, (trn_idx, val_idx) in enumerate(
        folds.split(train_mod.values, target_mod.values)):
    print("Fold {}".format(fold_ + 1))
    xgb.fit(train_mod.iloc[trn_idx][selected_features],
            target_mod.iloc[trn_idx])
    # Out-of-fold predictions for this fold's validation rows
    # (was the undefined name `selectedfeatures`).
    oof_xgb[val_idx] = xgb.predict_proba(
        train_mod.iloc[val_idx][selected_features])[:, 1]

    # Average the test predictions over all folds.
    predictions_xgb += xgb.predict_proba(
        test_mod[selected_features])[:, 1] / folds.n_splits

    # Running score: rows of folds not yet processed are still zero here.
    print("CV score: {:<8.5f}".format(roc_auc_score(target_mod, oof_xgb)))

sub_df = pd.DataFrame({"ID_code": test["ID_code"].values})
sub_df["target"] = predictions_xgb
sub_df.to_csv("../result/submission_xgb_mod.csv", index=False)
Пример #11
0
# One-hot encode every feature listed in ohe_feats: the original column is
# dropped and replaced by its dummy columns (prefixed with the feature name).
for f in ohe_feats:
    df_all_dummy = pd.get_dummies(df_all[f], prefix=f)
    df_all = df_all.drop([f], axis=1)
    df_all = pd.concat((df_all, df_all_dummy), axis=1)

#dtrain = xgb.DMatrix(df_all.values, label=labels, missing=np.nan)

# Split train and test: the first piv_train rows are the training set,
# the remaining rows are the test set.
vals = df_all.values
X = vals[:piv_train]
le = LabelEncoder()
y = le.fit_transform(labels)
X_test = vals[piv_train:]

# Classifier
xgb = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25,
                    objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0)
xgb.fit(X, y)
y_pred = xgb.predict_proba(X_test)

# Take the 5 classes with the highest probabilities for every test id:
# each id is repeated 5 times and paired with its top-5 decoded labels,
# ordered from most to least probable.
ids = []  #list of ids
cts = []  #list of countries
for i in range(len(id_test)):
    idx = id_test[i]
    ids += [idx] * 5
    cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()

# Generate submission
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
sub.to_csv('../out/submission.csv',index=False)
## Find optimal weights for bagging (using both geometric and arithmetic progressions):
## ap1 * [XGBOOST^gp1 * NN^gp2] + ap2 * [ET]


# In[ ]:


X_train,X_test,y_train,y_test=train_test_split(train[predictors].values,train[target].values.ravel(),test_size=0.3)


# In[ ]:


# `params` must be unpacked with ** -- a bare positional argument after
# keyword arguments is a SyntaxError.
xgb=XGBoostClassifier(num_class=2,num_boost_round=148,**params)
xgb.fit(X_train,y_train)
probs1=xgb.predict_proba(X_test)
probs1=probs1[:,1]
# Hard labels at a 0.4 probability threshold.
XGBOOST=(probs1>0.4).astype('int')


# In[ ]:


nn = KerasClassifier(build_fn=base_model, nb_epoch=25, batch_size=64, verbose=2) ## tune the model? No, it takes too long.
nn.fit(X_train,y_train)
probs2=nn.predict_proba(X_test)
probs2=probs2[:,1]
# Hard labels at a 0.4 probability threshold.
NN=(probs2>0.4).astype('int')


# In[ ]:
Пример #13
0
def train_pred(Xtrain, ytrain, Xvalid, isfindSTOP=False):
    """Train an XGBoost classifier and either predict or report Gini scores.

    With ``isfindSTOP`` false the model is fitted on all of ``Xtrain`` and
    column 1 of ``predict_proba(Xvalid)`` is returned. With ``isfindSTOP``
    true a stratified 10% hold-out is used for early stopping, the train and
    validation Gini coefficients are printed, and nothing is returned.
    """
    from sklearn.model_selection import train_test_split

    print('%%%%%%%%%%%%%%%%%%%% Start train model. %%%%%%%%%%%%%%%%%%%%')

    # Fixed settings; the tree count (previously found with early stopping)
    # is 1000 in both modes.
    n_trees = 1000

    hyper_params = dict(
        objective='binary:logistic',
        seed=np.random.randint(0, 1000000),
        learning_rate=0.07,
        n_estimators=n_trees,
        subsample=0.8,
        colsample_bytree=0.8,
        max_depth=7,
        min_child_weight=10,
        gamma=20,
        reg_alpha=0.0,
        reg_lambda=2.0,
        scale_pos_weight=1.3,
    )
    model = XGBClassifier(**hyper_params)

    if not isfindSTOP:
        # Plain mode: fit on everything and score the given validation data.
        model.fit(Xtrain, ytrain)
        return model.predict_proba(Xvalid)[:, 1]

    # Early-stopping mode: stratify the split because the label classes are
    # very imbalanced.
    fit_X, holdout_X, fit_y, holdout_y = train_test_split(
        Xtrain, ytrain, test_size=0.1, stratify=ytrain)

    model.fit(fit_X,
              fit_y,
              eval_set=[(fit_X, fit_y), (holdout_X, holdout_y)],
              early_stopping_rounds=50,
              eval_metric=gini_xgb_min)

    gini_train = gini_normalized(fit_y, model.predict_proba(fit_X)[:, 1])
    gini_val = gini_normalized(holdout_y,
                               model.predict_proba(holdout_X)[:, 1])
    print("Val, Train Gini coef : %.5f %.5f" % (gini_val, gini_train))
Пример #14
0
# Import third-party packages
import xgboost
import numpy as np
# Build the XGBoost classifier
# NOTE: this rebinds `xgboost` from the module to the estimator instance.
xgboost = xgboost.XGBClassifier()
# Fit the model on the resampled data
xgboost.fit(over_samples_X, over_samples_y)
# Apply the model to the test data set
resample_pred = xgboost.predict(np.array(X_test))

# Report the model's predictive performance (accuracy and classification report)
print('模型的准确率为:\n', metrics.accuracy_score(y_test, resample_pred))
print('模型的评估报告:\n', metrics.classification_report(y_test, resample_pred))

# Compute the predicted probability of a fraudulent transaction, used to build the ROC curve data
y_score = xgboost.predict_proba(np.array(X_test))[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, y_score)
# Compute the AUC value
roc_auc = metrics.auc(fpr, tpr)

# Draw the area plot
plt.stackplot(fpr, tpr, color='steelblue', alpha=0.5, edgecolor='black')
# Add the outline of the curve
plt.plot(fpr, tpr, color='black', lw=1)
# Add the diagonal reference line
plt.plot([0, 1], [0, 1], color='red', linestyle='--')
# Add the text annotation
plt.text(0.5, 0.3, 'ROC curve (area = %0.2f)' % roc_auc)
# Add the x-axis and y-axis labels
plt.xlabel('1-Specificity')
plt.ylabel('Sensitivity')
Пример #15
0
# Fit an XGBoost classifier with default parameters.
# NOTE: the assignment rebinds `xgb` from the xgboost module to the
# estimator instance.
import xgboost as xgb
xgb=xgb.XGBClassifier()
xgb.fit(X_train, y_train)


# In[73]:


# Bar chart of the fitted model's feature importances, one bar per feature index.
plt.bar(range(len(xgb.feature_importances_)), xgb.feature_importances_)
plt.show()


# In[74]:


# Class probabilities on the held-out set, reported via print_metrics with 0.5 as its third argument.
probabilities = xgb.predict_proba(X_test)
print_metrics(y_test, probabilities, 0.5)  


# In[75]:


# Predict on test1 and write the submission file keyed by CustomerID.
solution=xgb.predict(test1)
my_submission=pd.DataFrame({'CustomerID':test.CustomerID,'BikeBuyer': solution})
my_submission.to_csv('XgboostClassifierMicrosoft01.csv', index=False)


# In[76]:


from sklearn.neural_network import MLPClassifier
Пример #16
0
                            n_jobs=1,
                            nthread=None,
                            objective='binary:logistic',
                            random_state=0,
                            reg_alpha=0,
                            reg_lambda=1,
                            scale_pos_weight=1,
                            seed=None,
                            subsample=0.9,
                            verbosity=1)

        # Select this fold's training and validation rows.
        X_train_, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train_, y_valid = y.iloc[train_index], y.iloc[valid_index]
        xgb.fit(X_train_, y_train_)
        # Large intermediates are deleted as soon as they are no longer used.
        del X_train_, y_train_
        # Column-1 probabilities for the test set and for the validation fold.
        pred = xgb.predict_proba(test_X)[:, 1]
        val = xgb.predict_proba(X_valid)[:, 1]
        del xgb, X_valid
        # The printed value is the ROC AUC on the validation fold.
        print('ROC accuracy: {}'.format(roc_auc_score(y_valid, val)))
        del val, y_valid
        # Accumulate this fold's share of the averaged test prediction.
        xgb_sub['isFraud'] = xgb_sub['isFraud'] + pred / n_fold
        del pred
        gc.collect()

    # Write the fold-averaged predictions.
    xgb_sub.to_csv('sub_xgb.csv', index=False)

# ### ensemble

# In[63]:

if PREDICT:
Пример #17
0
# Disabled randomized hyper-parameter search, kept for reference:
# random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=4, cv=skf.split(X_train,Y_train), verbose=3, random_state=27)
# start_time = timer(None) # timing starts from this point for "start_time" variable
# search = random_search.fit(X_train, Y_train)
# timer(start_time)
# print(search.best_params_)


# Train the XGBoost model; timer(None) returns a start marker that a later
# timer(start_time) call consumes.
print('Start training XGB')
start_time = timer(None)
xgb.fit(X_train, Y_train, eval_metric='auc')
timer(start_time)
print("Start predicting XGB")
start_time = timer(None)
predictions = xgb.predict(X_test)
timer(start_time)
# Column-1 probabilities feed the AUC; hard predictions feed the F1 score.
pred_proba = xgb.predict_proba(X_test)[:, 1]
print('Statistics')
print("AUC : %f" % metrics.roc_auc_score(Y_test, pred_proba))
print("F1 Score: %f" % metrics.f1_score(Y_test, predictions))
print('**********************************')

# Logistic regression for comparison; C=1e5 is passed as the inverse
# regularisation strength.
clf = LogisticRegression(C=1e5)
print('Start training log regression')
start_time = timer(None)
clf.fit(X_train, Y_train)
timer(start_time)
print("Score: ", clf.score(X_test, Y_test))
start_time = timer(None)
print("Start predicting log regression")
y_pred = clf.predict(X_test)
timer(start_time)
 'n_estimators': 27,
 'subsample': 0.45}
'''

xgb.best_score_  # 0.83585339132974634

xgb = XGBClassifier(max_depth=6,
                    learning_rate=0.1,
                    n_estimators=27,
                    objective='multi:softprob',
                    subsample=0.4,
                    colsample_bytree=0.5,
                    seed=0)

xgb.fit(X, y)
xgb_predictions = xgb.predict_proba(X_test)

# Flatten the (rows x classes) probability matrix row by row so it can be
# paired with the class and id vectors built below.
xgb_predictions = xgb_predictions.ravel()

# Class labels and ids must line up with the flattened probabilities:
# the class vector is tiled once per test row and every id is repeated once
# per class. The class count comes from the fitted model instead of a
# hard-coded 12, so the three vectors always have the same length.
classes = np.tile(xgb.classes_, X_test.shape[0])
ids = np.repeat(test["id"].values, len(xgb.classes_))

print(xgb_predictions.shape)
print(classes.shape)
print(ids.shape)
print(test_users['id'].shape)
print(test['id'].shape)
# We want to make this a list of most likely occurances
Пример #19
0
                    objective='multi:softmax',
                    sub_sample=1,
                    num_class=4,
                    n_gpus=0)
# error evaluation for multiclass training
# Fit the multiclass model configured above.
xgb.fit(X_train, y_train)

# In[ ]:

from sklearn.preprocessing import LabelEncoder

# Fit a label encoder on the training targets; only its classes_ attribute
# is used below, to name the probability columns.
labels = LabelEncoder()
y_train_labels_fit = labels.fit(y_train)
y_train_lables_trf = labels.transform(y_train)

# One probability column per class, named after the encoder's classes.
test_pred = pd.DataFrame(xgb.predict_proba(X_test), columns=labels.classes_)

# In[ ]:

#test_pred = pd.DataFrame(bst.predict(X_test1), columns=labels.classes_)
# Map the class columns 0..3 to the named output columns.
# NOTE(review): indexing test_pred by 0..3 assumes the target classes are the
# integers 0-3 -- confirm against y_train.
q = {
    'ID': test_data["ID"],
    'no_financial_services': test_pred[0],
    'other_only': test_pred[1],
    'mm_only': test_pred[2],
    'mm_plus': test_pred[3]
}
df_pred1 = pd.DataFrame(data=q)
# Fix the column order of the output frame.
df_pred1 = df_pred1[[
    'ID', 'no_financial_services', 'other_only', 'mm_only', 'mm_plus'
]]
Пример #20
0
                         random_state=20,
                         n_jobs=4)

lr = LogisticRegression()

lr.fit(train_x, train_y)
lr_pred = lr.predict_proba(test_x)[:, 1]

lgb.fit(train_x, train_y)
lgb_pred = lgb.predict_proba(test_x)[:, 1]

gbdt.fit(train_x, train_y)
gbdt_pred = gbdt.predict_proba(test_x)[:, 1]

xgb.fit(train_x, train_y)
xgb_pred = xgb.predict_proba(test_x)[:, 1]

y_pred = 0.7 * lgb_pred + 0.15 * xgb_pred + 0.15 * gbdt_pred

auc = roc_auc_score(test_y, y_pred)
print("xgboost+lightgbm+gbdt的加权auc是{}".format(auc))

mine = MINE()
mine.compute_score(lr_pred, xgb_pred)

print("lr和xgb的mic:{}".format(mine.mic()))
"""

class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
Пример #21
0
ans = svc.predict_proba(xvalid_tfv)

# print(...) with a single argument behaves the same on Python 2 and 3.
print("The log loss using tfidVectorizer for SVM is " + str(multiclass_logloss(yvalid,ans)))

# The arguments (C, kernel, degree, probability) belong to SVC, and the
# message below reports an SVM result; MultinomialNB rejects them.
from sklearn.svm import SVC
svc = SVC(C=1.0, kernel='rbf', degree=3, probability=True)
svc.fit(xtrain_cv, ytrain)
ans = svc.predict_proba(xvalid_cv)

print("The log loss using CountVectorizer for SVM is " + str(multiclass_logloss(yvalid,ans)))


''' Predicting using xgboost'''

# NOTE: this rebinds `xgb` from the xgboost module to the estimator instance.
xgb = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, subsample=0.8, nthread=10, learning_rate=0.1)
xgb.fit(xtrain_tfv, ytrain)
ans = xgb.predict_proba(xvalid_tfv)

print("The log loss using tfidVectorizer for xgboost is " + str(multiclass_logloss(yvalid,ans)))

# Refit the same estimator on the count-vectorised features.
xgb.fit(xtrain_cv, ytrain)
ans = xgb.predict_proba(xvalid_cv)

print("The log loss using CountVectorizer for xgboost is " + str(multiclass_logloss(yvalid,ans)))

# Grid Search

# Scorer wrapping multiclass_logloss: lower is better and it needs
# probability predictions.
mll_scorer = metrics.make_scorer(multiclass_logloss, greater_is_better=False, needs_proba=True)

svd = TruncatedSVD()
scl = preprocessing.StandardScaler()
lr_model = LogisticRegression()
Пример #22
0
    name = SLOT_FMT % i
    ans.__setattr__(name, ans.__getattr__(name).astype(float))

# Load the per-session feature files 046..075 and stack them.
feature_frames = []
for i in range(46, 76):
    data = pd.read_csv('../sess/%03d.csv' % i, index_col='user_id')
    feature_frames.append(data)
features = pd.concat(feature_frames)

# Split 'user_create_time' at the first '-' into two integer columns.
# `n` is passed by keyword because pandas 2.0 made it keyword-only.
user = pd.read_csv('../public/user_create_time.csv', index_col='user_id')
user2 = user['user_create_time'].str.split('-', n=1, expand=True)
user2 = user2.astype('int')
# 'created' = (first part - 2016) * 12 + second part, per user.
# presumably months since January 2016 -- TODO confirm the date format.
for index in features.index:
    user_row = user2.loc[index]
    features.at[index, 'created'] = (user_row[0] - 2016) * 12 + user_row[1]

guess = {}
for slot in range(0, 28):
    # The context manager closes the model file after loading.
    # NOTE: pickle must only be used on trusted model files.
    with open('../xgb-models/%d' % slot, 'rb') as model_file:
        xgb = pickle.load(model_file)
    pred_xgb = xgb.predict_proba(features)[:, 1]

    # sgd = pickle.load(open('../sgd-models/%d' % slot, 'rb'))
    # pred_sgd = sgd.predict_proba(features)[:, 1]

    # proba = 0.9 * pred_xgb + 0.1 * pred_sgd
    guess[SLOT_FMT % slot] = pred_xgb

# Copy every slot's predictions into the answer frame, cell by cell.
for i in range(0, len(features)):
    for slot in range(0, 28):
        ans.iat[i, slot] = guess[SLOT_FMT % slot][i]
# data_parent = pd.read_csv('septic_patients_data.csv')
data_parent = pd.read_csv('sample_test_data.csv')

#data_parent = pd.read_csv('results.csv')
# Model inputs, scaled with the previously saved scaler.
datax_temp = data_parent[[
    'tissue_extraction', 'temp_fin', 'ph', 'hb', 'lactate'
]]
scaler = joblib.load("scaler.save")
datax = scaler.transform(datax_temp)

# Load the stored models. Each file is opened in a context manager so the
# handle is closed after loading; the handle is deliberately not named `f`
# because `f` is the row-classification function applied further down.
# NOTE: pickle must only be used on trusted model files.
with open("xgboost.dat", "rb") as model_file:
    xgb = pickle.load(model_file)
with open("svm.dat", "rb") as model_file:
    svm = pickle.load(model_file)
with open("lr.dat", "rb") as model_file:
    lr = pickle.load(model_file)
with open("randomforest.dat", "rb") as model_file:
    randomforest = pickle.load(model_file)

# Column-1 probabilities from the XGBoost and random-forest models.
xgb_pred = pd.DataFrame(xgb.predict_proba(datax)[:, 1])
rf_pred = pd.DataFrame(randomforest.predict_proba(datax)[:, 1])

# Average the two models' probabilities per row.
temp = pd.concat([xgb_pred, rf_pred], axis=1)
temp['avg'] = temp.mean(axis=1)
combined_df = pd.concat([
    pd.DataFrame(data_parent[['subject_id', 'datetime']]),
    pd.DataFrame(datax_temp), temp['avg']
],
                        axis=1)

# `f` maps each row to its patient category.
combined_df['patient_category'] = combined_df.apply(f, axis=1)

# critical_patients = combined_df.loc[combined_df['patient_category'].isin(['very-critical', 'critical', 'moderate-critical'])]
combined_df.to_csv('critical_patients_records.csv')
# DMatrix versions of the sample data (built while `xgb` still refers to the
# xgboost module).
T_train_sample_xgb = xgb.DMatrix(X_train_sample, Y_train_sample)
X_test_sample_xgb = xgb.DMatrix(X_test_sample)


# NOTE: from here on `xgb` refers to the classifier, not the xgboost module.
xgb = XGBClassifier(max_depth=6, learning_rate=0.1, n_estimators=200,
                    objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0)
#scores:  XGBClassifier(max_depth=6, learning_rate=0.1, n_estimators=50
#0.8183974444336767 121 rounds
# Encode the test-sample destinations as numbers via country_num_dic.
Y_test_sample = test_sample["country_destination"]
Y_test_sample = Y_test_sample.map(country_num_dic)

# Per-column count of missing values (the result is not stored).
X_train_sample.isnull().sum()

# Fit with early stopping: mlogloss is evaluated on both the training and the
# test sample, and training stops after 10 rounds without improvement.
eval_set  = [(X_train_sample, Y_train_sample), (X_test_sample, Y_test_sample)]
xgb.fit(X_train_sample, Y_train_sample, eval_set = eval_set, eval_metric = 'mlogloss', early_stopping_rounds= 10)
Y_pred_sample = xgb.predict_proba(X_test_sample)


# Numeric destination labels as plain arrays.
y_le_train_sample = (train_sample['country_destination'].map(country_num_dic)).values
y_le_test_sample = (test_sample['country_destination'].map(country_num_dic)).values
y_le_train = (train['country_destination'].map(country_num_dic)).values

# Id arrays for each data set.
id_train = train['id'].values
id_train_sample = train_sample['id'].values
id_test_sample = test_sample['id'].values
id_test = test['id'].values


#------------- TRAIN SAMPLE PREDICTION --------------------------
ids = []  #list of ids
cts = []  #list of countries
Пример #25
0
def forest_model(test=True, grid_cv=False, save_final_results=True):
    ''' Execute the final boosted-trees model.

    Encodes the module-level ``train_full`` / ``target_full`` data, splits
    it into train and test parts, optionally runs a grid search, and
    optionally fits the final model and predicts ``final_X_test``. Results
    are published through module-level globals (``X_train``, ``X_test``,
    ``Y_train``, ``Y_test``, ``GS_CV``, ``f_pred``, ``accuracies``).

    Args:
        test: If true (and ``save_final_results`` is true), log accuracy,
            log loss, ranking metrics and NDCG on the held-out split and
            fill ``accuracies``.
        grid_cv: If true, grid-search ``max_depth`` and ``subsample`` and
            store the best parameters in ``GS_CV``. If false, ``GS_CV``
            must already be defined at module level.
        save_final_results: If true, fit the final model and store the
            class probabilities for ``final_X_test`` in ``f_pred``.
    '''
    global train_full
    global target_full
    global X_train
    global X_test
    global Y_train
    global Y_test
    global final_X_test
    global GS_CV
    global f_pred
    global accuracies

    # logging.warning replaces the deprecated alias logging.warn
    # (removed in Python 3.13).
    logging.warning('Create boosted trees model with training data')
    ## Encode categories ##
    le = LabelEncoder()
    lb = LabelBinarizer()
    cat_full = le.fit_transform(np.array(target_full).ravel())

    mcl = MultiColumnLabelEncoder()
    ohe = OneHotEncoder()
    im = Imputer(strategy='most_frequent')
    im2 = Imputer(strategy='mean')
    p = Pipeline([('mcl', mcl), ('im', im), ('ohe', ohe)])

    ## full dataset ##
    # Encoded categorical columns next to mean-imputed numeric columns.
    X = np.concatenate(
        (p.fit_transform(train_full[CAT_COLS]).todense(),
         im2.fit_transform(np.array(train_full[NUM_COLS]))),
        axis=1)
    Y = cat_full

    ## Set up X,Y data for modeling ##
    X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
        X, Y, test_size=TEST_SIZE, random_state=0)

    if grid_cv:
        ## Run grid search to find optimal parameters ##
        params_grid = {
            'max_depth': [15, 20, 25],
            'subsample': [0.25, 0.5],
            #  'colsample_bytree':[ 0.25, 0.5, 0.75 ] ,
        }
        logging.warning(
            'Running grid search CV with params: {}'.format(params_grid))
        ndcg = make_scorer(ndcg_score, needs_proba=True, k=5)
        xgb = XGBClassifier(n_estimators=50,
                            objective='multi:softprob',
                            seed=0)
        cv = GridSearchCV(xgb, params_grid, scoring=ndcg).fit(X, Y)
        logging.warning('Best XGB params: {}'.format(cv.best_params_))
        GS_CV = cv.best_params_

    ## Run model and save ##
    if save_final_results:
        # NOTE: sorting is not done here.
        logging.warning('Make predictions for final test set')
        xgb = XGBClassifier(learning_rate=0.1,
                            n_estimators=500,
                            objective='multi:softprob',
                            seed=0,
                            **GS_CV)
        # NOTE(review): the final model is fitted on the training split
        # only, not on the full data set -- confirm this is intended.
        xgb.fit(X_train, Y_train)
        if test:
            logging.warning('Test prediction accuracy')
            p_pred = xgb.predict(X_test)
            p_pred_p = xgb.predict_proba(X_test)
            # NOTE(review): the binarizer is refitted on Y_test, so its
            # columns match p_pred_p only if every class occurs in Y_test.
            cat_tst_lb = lb.fit_transform(Y_test)
            logging.warning('Accuracy: ' + str(np.mean(p_pred == Y_test)))
            # classification_report expects (y_true, y_pred).
            logging.warning('\n' + classification_report(Y_test, p_pred))
            logging.warning('Log Loss: {}'.format(log_loss(Y_test, p_pred_p)))
            logging.warning('Label Ranking Precision score: {}'.format(
                label_ranking_average_precision_score(cat_tst_lb, p_pred_p)))
            logging.warning('Label Ranking loss: {}'.format(
                label_ranking_loss(cat_tst_lb, p_pred_p)))
            logging.warning('NDCG score: {}'.format(
                ndcg_score(cat_tst_lb, p_pred_p, k=5)))

            # Per-class precision, indexed by encoded class label. The array
            # is sized by the number of known classes so every label is a
            # valid index even when some classes are missing from Y_test.
            accuracies = np.zeros(len(le.classes_))
            for c in set(Y_test):
                predicted_c = p_pred == c
                n_predicted = p_pred[predicted_c].shape[0]
                if n_predicted == 0:
                    # Class never predicted: precision is undefined.
                    accuracies[c] = np.nan
                    continue
                n_correct = np.sum(p_pred[predicted_c] == Y_test[predicted_c])
                accuracies[c] = n_correct * 1.0 / n_predicted

        # Encode the final test set with the already-fitted transformers.
        X = np.concatenate(
            (p.transform(final_X_test[CAT_COLS]).todense(),
             im2.transform(np.array(final_X_test[NUM_COLS]))),
            axis=1)
        f_pred = xgb.predict_proba(X)
{'colsample_bytree': 0.5,
 'gamma': 0.15,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 6,
 'n_estimators': 27,
 'subsample': 0.45}
'''

xgb.best_score_ # 0.83585339132974634

xgb = XGBClassifier(max_depth=6, learning_rate=0.1, n_estimators=27,
                    objective='multi:softprob', subsample=0.4, colsample_bytree=0.5, seed=0)

xgb.fit(X, y)
xgb_predictions = xgb.predict_proba(X_test)

# Flatten the (rows x classes) probability matrix row by row so it can be
# paired with the class and id vectors built below.
xgb_predictions = xgb_predictions.ravel()

# Class labels and ids must line up with the flattened probabilities:
# the class vector is tiled once per test row and every id is repeated once
# per class. The class count comes from the fitted model instead of a
# hard-coded 12, so the three vectors always have the same length.
classes = np.tile(xgb.classes_, X_test.shape[0])
ids = np.repeat(test["id"].values, len(xgb.classes_))

print(xgb_predictions.shape)
print(classes.shape)
print(ids.shape)
print(test_users['id'].shape)
print(test['id'].shape)
# We want to make this a list of most likely occurances
Пример #27
0
# Encode the labels as integers; rows from piv_train onwards form the test set.
y = le.fit_transform(labels)
X_test = vals[piv_train:]
print("PRIV", piv_train)
print()
# Classifier
xgb = XGBClassifier(max_depth=6,
                    learning_rate=0.3,
                    n_estimators=25,
                    objective='multi:softprob',
                    subsample=0.5,
                    colsample_bytree=0.5,
                    seed=0)

print("X", X.shape, "   Y", y.shape)
xgb.fit(X, y)
y_pred = xgb.predict_proba(X_test)

# Take the 5 classes with the highest probabilities for every test id:
# each id is repeated 5 times and paired with its top-5 decoded labels,
# ordered from most to least probable.
ids = []  #list of ids
cts = []  #list of countries
for i in range(len(id_test)):
    idx = id_test[i]
    ids += [idx] * 5
    cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()

# Generate submission
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
sub.to_csv('../output/sub.csv', index=False)

#myfunc()