def fit(self, X, y):
    dtrain = xgb.DMatrix(X, label=y, missing=np.NaN)
    self.model = xgb.train(self.learner_params,
                           dtrain,
                           self.boosting_rounds,
                           xgb_model=self.model)
for c in x_test.columns:
    # label-encode string (object) columns in the test set
    if x_test[c].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(x_test[c].values))
        x_test[c] = lbl.transform(list(x_test[c].values))

xgb_params = {
    'eta': 0.05,
    'max_depth': 5,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}

dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test)

# cv_output = xgb.cv(xgb_params, dtrain, num_boost_round=1000, early_stopping_rounds=20, verbose_eval=25, show_stdv=False)
# print('best num_boost_rounds = ', len(cv_output))
# num_boost_rounds = len(cv_output) # 382

num_boost_rounds = 385
model = xgb.train(dict(xgb_params, silent=0),
                  dtrain,
                  num_boost_round=num_boost_rounds)
y_predict = model.predict(dtest)
output = pd.DataFrame({'id': id_test, 'price_doc': y_predict})
output.to_csv('output.csv', index=False)
print('ok')
Example #3
        print('Train score is:', scr[i])
    print(log_loss(y, oob_pred))
    print(oob_pred[1:10])
    sub_pred = sub_pred.mean(axis=1)
    oob_pred_filename = '../output/oob_pred_rfentropy_' + str(np.mean(scr))
    sub_pred_filename = '../output/sub_pred_rfentropy_' + str(np.mean(scr))
    pkl.dump(oob_pred, open(oob_pred_filename + '.p', 'wb'))
    pkl.dump(sub_pred, open(sub_pred_filename + '.p', 'wb'))
    preds = pd.DataFrame({"ID": ids, "PredictedProb": sub_pred})
    preds.to_csv(sub_pred_filename + '.csv', index=False)
    '''
    fraction_of_positives, mean_predicted_value = calibration_curve(y, oob_pred, n_bins=10)
    plt.plot(mean_predicted_value, fraction_of_positives, "s-", label="calibration")
    plt.show()
    '''
else:
    # Train on full data
    dtrain = xgb.DMatrix(X, y)
    dtest = xgb.DMatrix(X_sub)
    clf = xgb.train(xgb_param, dtrain, m_params['n_rounds'])

    pred = clf.predict(dtrain)
    print('Train score is:', log_loss(y, np.array(pred)))

    #clf.save_model(model_name + '.model')
    pred = clf.predict(dtest)

    print("Saving Results.")
    preds = pd.DataFrame({"ID": ids, "target": pred})
    preds.to_csv(model_name + '.csv', index=False)
def xgb_model(X_train, X_valid, y_train, y_valid, X_test_id, X_test):
    """
    xgb 模型
    :param X_train:
    :param X_valid:
    :param y_train:
    :param y_valid:
    :param X_test_id:
    :return:
    """
    import pandas as pd
    import numpy as np
    import xgboost as xgb

    dtrain = xgb.DMatrix(X_train, label=y_train.values)
    dvalid = xgb.DMatrix(X_valid, label=y_valid.values)

    # ########################################## Tuning Paramters ##########################################
    xgb_best_params = {}
    params = {'booster': 'gbtree',
              'objective': 'reg:squarederror',
              'max_depth': 6,
              'learning_rate': 1,
              'gamma': 0,
              'min_child_weight': 1,
              'subsample': 1,
              'colsample_bytree': 1,
              'reg_alpha': 0,
              'reg_lambda': 1,
              'random_state': 23,
              'gpu_id': 0,
              'max_bin': 16,
              'tree_method': 'gpu_exact'
              }

    # ########################################### n_estimators  ############################################
    min_merror = np.inf
    for n_estimators in range(10, 1000, 10):
        params['n_estimators'] = n_estimators  # note: the native xgb.cv API ignores n_estimators; num_boost_round controls the rounds
        cv_results = xgb.cv(params, dtrain, nfold=3, num_boost_round=1000, early_stopping_rounds=30, feval=rmspe_xg,
                            seed=23)
        mean_error = min(cv_results['test-rmspe-mean'])

        if mean_error < min_merror:
            min_merror = mean_error
            xgb_best_params["n_estimators"] = n_estimators

    params["n_estimators"] = xgb_best_params["n_estimators"]

    # ########################################### max_depth & min_child_weight #############################
    min_merror = np.inf
    for max_depth in range(6, 11, 1):
        for min_child_weight in range(4, 10, 1):
            params['max_depth'] = max_depth
            params['min_child_weight'] = min_child_weight
            cv_results = xgb.cv(params, dtrain, nfold=3, num_boost_round=1000, early_stopping_rounds=50, feval=rmspe_xg,
                                seed=23)
            mean_error = min(cv_results['test-rmspe-mean'])

            if mean_error < min_merror:
                min_merror = mean_error
                xgb_best_params["max_depth"] = max_depth
                xgb_best_params["min_child_weight"] = min_child_weight

    params['max_depth'] = xgb_best_params['max_depth']
    params["min_child_weight"] = xgb_best_params["min_child_weight"]

    # ########################################### gamma #####################################################
    min_merror = np.inf
    for gamma in [i / 10.0 for i in range(0, 1)]:  # note: this range tries only gamma = 0.0
        params['gamma'] = gamma
        cv_results = xgb.cv(params, dtrain, nfold=3, early_stopping_rounds=50, feval=rmspe_xg, seed=23)
        mean_error = min(cv_results['test-rmspe-mean'])

        if mean_error < min_merror:
            min_merror = mean_error
            xgb_best_params["gamma"] = gamma

    params["gamma"] = xgb_best_params["gamma"]

    # ############################################# subsample & colsample_bytree ############################
    min_merror = np.inf
    for subsample in [i / 10.0 for i in range(6, 10)]:
        for colsample_bytree in [i / 10.0 for i in range(6, 10)]:
            params['subsample'] = subsample
            params['colsample_bytree'] = colsample_bytree
            cv_results = xgb.cv(params, dtrain, nfold=3, early_stopping_rounds=50, feval=rmspe_xg, seed=23)
            mean_error = min(cv_results['test-rmspe-mean'])

            if mean_error < min_merror:
                min_merror = mean_error
                xgb_best_params["subsample"] = subsample
                xgb_best_params["colsample_bytree"] = colsample_bytree

    params["subsample"] = xgb_best_params["subsample"]
    params["colsample_bytree"] = xgb_best_params["colsample_bytree"]

    # ############################################# reg_alpha ################################################
    min_merror = np.inf
    for reg_alpha in [0.8, 0.9, 1, 1.1, 1.2]:
        params['reg_alpha'] = reg_alpha
        cv_results = xgb.cv(params, dtrain, nfold=3, early_stopping_rounds=50, feval=rmspe_xg, seed=23)
        mean_error = min(cv_results['test-rmspe-mean'])

        if mean_error < min_merror:
            min_merror = mean_error
            xgb_best_params["reg_alpha"] = reg_alpha

    params["reg_alpha"] = xgb_best_params["reg_alpha"]

    # ############################################# reg_lambda ################################################
    min_merror = np.inf
    for reg_lambda in [0.8, 0.9, 1, 1.1, 1.2]:
        params['reg_lambda'] = reg_lambda
        cv_results = xgb.cv(params, dtrain, nfold=3, early_stopping_rounds=50, feval=rmspe_xg, seed=23)
        mean_error = min(cv_results['test-rmspe-mean'])

        if mean_error < min_merror:
            min_merror = mean_error
            xgb_best_params["reg_lambda"] = reg_lambda

    params["reg_lambda"] = xgb_best_params["reg_lambda"]

    # ############################################# learning_rate ################################################
    min_merror = np.inf
    for learning_rate in [0.001, 0.005, 0.01, 0.03, 0.05]:
        params['learning_rate'] = learning_rate
        cv_results = xgb.cv(params, dtrain, nfold=3, early_stopping_rounds=50, feval=rmspe_xg, seed=23)
        mean_error = min(cv_results['test-rmspe-mean'])

        if mean_error < min_merror:
            min_merror = mean_error
            xgb_best_params["learning_rate"] = learning_rate

    params["learning_rate"] = xgb_best_params["learning_rate"]

    print(params)
    bst_params = {
        "eta": 0.3,
        "alpha": 1,
        "silent": 1,
        "seed": 42,
        "objective": params['objective'],
        "booster": params['booster'],
        "max_depth": params['max_depth'],
        'min_child_weight': params['min_child_weight'],
        "subsample": params['subsample'],
        "colsample_bytree": params['colsample_bytree'],
        "reg_alpha": params['reg_alpha'],
        "gpu_id": params['gpu_id'],
        "max_bin": params['max_bin'],
        "tree_method": params['tree_method'],
        "n_estimators": params['n_estimators']
    }

    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    xgb_model = xgb.train(bst_params, dtrain, num_boost_round=1000, evals=watchlist, early_stopping_rounds=100,
                          feval=rmspe_xg, verbose_eval=True)
    print("Validating")
    yhat = xgb_model.predict(xgb.DMatrix(X_valid))
    error = rmspe(np.expm1(y_valid.values), np.expm1(yhat))
    print('RMSPE: {:.6f}'.format(error))

    xgb_test_prod = xgb_model.predict(xgb.DMatrix(X_test))
    xgb_test_prod = np.expm1(xgb_test_prod)
    sub_df = pd.DataFrame(data=list(X_test_id), columns=['id'])
    sub_df["forecastVolum"] = [int(i) for i in xgb_test_prod]
    sub_df.to_csv(DefaultConfig.project_path + "/data/submit/" + DefaultConfig.select_model + "_submission.csv",
                  index=False,
                  encoding='utf-8')
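
# rmspe and rmspe_xg are used by xgb_model above but are not defined in this excerpt.
# A minimal sketch of the usual RMSPE metric pair, assuming the targets were
# log1p-transformed before training (suggested by the np.expm1 calls above):
import numpy as np


def rmspe(y, yhat):
    # root mean squared percentage error; rows with a zero target are ignored
    w = np.zeros(y.shape, dtype=float)
    w[y != 0] = 1.0 / (y[y != 0] ** 2)
    return np.sqrt(np.mean(w * (y - yhat) ** 2))


def rmspe_xg(yhat, dtrain):
    # custom feval for xgb.train / xgb.cv: undo the log1p transform, then score
    y = np.expm1(dtrain.get_label())
    yhat = np.expm1(yhat)
    return 'rmspe', rmspe(y, yhat)
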
        line = ser.readline()
        total_byte = total_byte + len(line.decode('utf-8'))
        #print("                                 byte:",len(line.decode('utf-8'))," total_byte:", total_byte)
        line_str = (line.decode('utf-8')).replace('\n', '')  # convert bytes to str, then strip the newline
        lines = line_str.split(',')
        print(lines)
        with open(test_data, 'a',newline="") as f:
            writer = csv.writer(f)
            writer.writerow(lines)

        ### predict immediately
        # build a pandas DataFrame
        df = pd.read_csv(test_data)
        data = pd.DataFrame(df, columns=['bothfoot_L','swing_L','bothfoot_R','swing_R','stand_L','stand_R']) # assumes the csv has a header row
        # predict on the last row only
        dtest = xgb.DMatrix(data.tail(1))
        pred = ypred = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)
        #print(pred[0])  # cluster number
        cpred = int(pred[0])
        # the cluster-number-to-label mapping has to be redefined for each run
        if cpred == 4:
            print("Normal")
        elif cpred == 0:
            print("Tired")
        elif cpred == 8:
            print("RUN")
        else:
            print("Stop")

except KeyboardInterrupt:
    # for checking
Example #6
    
    df_train=train1[train1.shop_id.notnull()]
    df_test=train1[train1.shop_id.isnull()]
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(df_train['shop_id'].values))
    df_train['label'] = lbl.transform(list(df_train['shop_id'].values))    
    num_class=df_train['label'].max()+1    

    params = {
            'objective': 'multi:softmax',
            'eta': 0.1,
            'max_depth': 9,
            'eval_metric': 'merror',
            'seed': 0,
            'missing': -999,
            'num_class':num_class,
            'silent' : 1
            }
            
    feature=[x for x in train1.columns if x not in ['user_id','label','shop_id','time_stamp','mall_id','wifi_infos','hours','weekday']]    
    xgbtrain = xgb.DMatrix(df_train[feature], df_train['label'])
    xgbtest = xgb.DMatrix(df_test[feature])
    watchlist = [ (xgbtrain,'train'), (xgbtrain, 'test') ]
    num_rounds=60
    model = xgb.train(params, xgbtrain, num_rounds, watchlist, early_stopping_rounds=15)
    df_test['label']=model.predict(xgbtest)
    df_test['shop_id']=df_test['label'].apply(lambda x:lbl.inverse_transform(int(x)))
    r=df_test[['row_id','shop_id']]
    result=pd.concat([result,r])
    result['row_id']=result['row_id'].astype('int')
    result.to_csv(path+'sub.csv',index=False)
Example #7
    def run(self):
        """
        Run the whole training and predicting process.
        Returns:

        """
        logger.info('Predicting week 10...')
        logger.info('PCA with week 10...')
        reg_train_10 = pd.read_csv('processed/reg_train_10.csv', dtype=DTYPES)
        reg_train_10 = self.set_clusters(reg_train_10)

        for cluster_no in range(0, self._n_clusters):
            tmp = reg_train_10[reg_train_10.cluster == cluster_no]
            tmp_feature = tmp[FEATURES].values
            dtest = xgb.DMatrix(tmp_feature,
                                label=None,
                                feature_names=FEATURES)
            del tmp_feature
            y_test = self._xgb_boosters[cluster_no].predict(dtest)
            tmp['Demanda_uni_equil'] = np.exp(y_test) - 1

            if cluster_no == 0:
                sub_10 = tmp[['id', 'Demanda_uni_equil']]
                train_10 = tmp[[
                    'Semana', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK',
                    'Cliente_ID', 'Producto_ID', 'Demanda_uni_equil'
                ]]
            else:
                sub_10 = pd.concat([sub_10, tmp[['id', 'Demanda_uni_equil']]])
                train_10 = pd.concat([
                    train_10, tmp[[
                        'Semana', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK',
                        'Cliente_ID', 'Producto_ID', 'Demanda_uni_equil'
                    ]]
                ])

        logger.info('Predicting week 11...')
        raw_train = pd.concat([RAW_TRAIN, train_10], axis=0)
        reg_train_11 = pp.extract_lag_features(data_frame=raw_train,
                                               test_set=RAW_TEST,
                                               week=11)

        logger.info('PCA with week 11...')
        reg_train_11 = self.set_clusters(reg_train_11)

        for cluster_no in range(0, self._n_clusters):
            tmp = reg_train_11[reg_train_11.cluster == cluster_no]
            tmp_feature = tmp[FEATURES].values
            dtest = xgb.DMatrix(tmp_feature,
                                label=None,
                                feature_names=FEATURES)
            del tmp_feature
            y_test = self._xgb_boosters[cluster_no].predict(dtest)
            tmp['Demanda_uni_equil'] = np.exp(y_test) - 1

            if cluster_no == 0:
                sub_11 = tmp[['id', 'Demanda_uni_equil']]
                train_11 = tmp[[
                    'Semana', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK',
                    'Cliente_ID', 'Producto_ID', 'Demanda_uni_equil'
                ]]
            else:
                sub_11 = pd.concat([sub_11, tmp[['id', 'Demanda_uni_equil']]])
                train_11 = pd.concat([
                    train_11, tmp[[
                        'Semana', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK',
                        'Cliente_ID', 'Producto_ID', 'Demanda_uni_equil'
                    ]]
                ])

        sub = pd.concat([sub_10, sub_11])
        sub = sub.sort_values(by='id')
        sub.to_csv('submission/submission_cluster_xgb.csv', index=False)
        logger.info('Done with submission.')
folds = StratifiedKFold(n_splits=5, shuffle=False)
oof = np.zeros(len(train_df))
predictions = np.zeros(len(test_df))
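
# `params` for the fold loop below is not defined in this excerpt. A plausible
# configuration for a binary task evaluated by AUC (an assumption, not the original settings):
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'eta': 0.02,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 2019,
}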

for i, (trn, val) in enumerate(folds.split(train_df.values,target.values)):
    print(i+1, "fold.    AUC")
    
    trn_x = train_df.iloc[trn][features]
    trn_y = target.iloc[trn]
    val_x = train_df.iloc[val][features]
    val_y = target.iloc[val]

    

    model = xgb.train(params
                      , xgb.DMatrix(trn_x, trn_y)
                      , 100000
                      , [(xgb.DMatrix(trn_x, trn_y), 'train'), (xgb.DMatrix(val_x, val_y), 'valid')]
                      , verbose_eval=5000
                      , early_stopping_rounds=3000
                      )

    oof[val] = model.predict(xgb.DMatrix(val_x), ntree_limit=model.best_ntree_limit)
    predictions += model.predict(xgb.DMatrix(test_df[features]), ntree_limit=model.best_ntree_limit) / folds.n_splits

print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))
cv_auc = roc_auc_score(target, oof)
cv_auc = cv_auc.round(6)


OUTPUT_FILE = 'xgb_result_50w/' + output_name +'-'+ str(cv_auc) + '.csv'
def _model_predict(all_feature, predict_feature, predict_col, num_boost_round=1000):
    k_v = {}
    if predict_col in enum_col or predict_col in ext_enum_col:
        # drop categories that occur too rarely
        def func_count(df):
            df['value_count'] = df[predict_col].count()
            return df
        if predict_col in large_limit_col.keys():
            number_limit = large_limit_col[predict_col]
        else:
            number_limit = 10
        all_feature = all_feature.groupby(predict_col).apply(func_count)
        del_test_size = len(all_feature[(all_feature[test_label_col] == 1) & (all_feature["value_count"] < number_limit)])
        print(predict_col, "del_test_size:", del_test_size)

        # keep the full, original test set
        test_feature_org = all_feature[all_feature[test_label_col] == 1]
        test_feature_org.drop(["value_count"], axis=1, inplace=True)
        test_y_org = np.array(test_feature_org[predict_col])
        test_x_org = np.array(test_feature_org.drop([predict_col, test_label_col], axis=1))
        print("test_x_org", test_x_org.shape)

        all_feature = all_feature[all_feature["value_count"] >= number_limit]
        all_feature.drop(["value_count"], axis=1, inplace=True)

        # map raw values to class indices
        label = all_feature[predict_col]
        all_y = sorted(list(set(label)))

        if len(all_y) == 1:
            # only one distinct value: return it as the prediction for every row
            print("only one value!")
            return np.array([all_y[0]] * len(predict_feature)), 1

        v_k = {}
        for k, v in enumerate(all_y):
            v_k[v] = k
            k_v[k] = v
        label = np.array([v_k[i] for i in label])
        all_feature[predict_col] = label

    train_feature = all_feature[all_feature[test_label_col] == 0]
    train_y = np.array(train_feature[predict_col])
    train_x = np.array(train_feature.drop([predict_col, test_label_col], axis=1))
    test_feature = all_feature[all_feature[test_label_col] == 1]
    test_y = np.array(test_feature[predict_col])
    test_x = np.array(test_feature.drop([predict_col, test_label_col], axis=1))
    predict_x = np.array(predict_feature.drop([predict_col, test_label_col], axis=1))
    print("train_x:", train_x.shape, "test_x:", test_x.shape, "predict_x", predict_x.shape)

    params = {'booster': 'gbtree',
              'eta': 0.02,
              'max_depth': 8,  # 5 4 3
              'colsample_bytree': 0.9,  # 0.8 0.7
              'subsample': 0.8,
              'min_child_weight': 40,  # 2 3
              'silent': 1,
              'nthread': 4,
              'tree_method': 'gpu_hist',
              "gpu_id": 0,
              "seed": 0
              }
    if predict_col in bool_col:
        params["objective"] = "binary:logistic"
        params["eval_metric"] = "error"
        params["is_unbalance"] = True
        eval_metric = None
    elif predict_col in enum_col or predict_col in ext_enum_col:
        params["objective"] = "multi:softmax"
        params["eval_metric"] = "merror"
        params["num_class"] = max(label) + 1
        eval_metric = None
    else:
        params["objective"] = "reg:linear"
        eval_metric = tool.xgb_metric

    train_set = xgb.DMatrix(train_x, label=train_y)
    valid_set = xgb.DMatrix(test_x, label=test_y)
    temp_model = xgb.train(params, train_set, num_boost_round=num_boost_round, evals=[(valid_set, "validate")],
                           feval=eval_metric, maximize=True, early_stopping_rounds=200, verbose_eval=False)
    test_pred = temp_model.predict(valid_set)

    # convert predicted probabilities to labels
    if predict_col in bool_col:
        test_pred = np.where(test_pred > 0.5, 1, 0)
    elif predict_col in enum_col or predict_col in ext_enum_col:
        # use the original, full test set
        if del_test_size > 0:
            valid_set = xgb.DMatrix(test_x_org)
            test_pred = temp_model.predict(valid_set)
        test_y = test_y_org
        # map class indices back to the original values
        test_pred = np.array([k_v[i] for i in test_pred])

    if predict_col in category_col:
        test_s = tool.label_score(test_y, test_pred)
    else:
        test_s = tool.regression_score(test_y, test_pred)

    # keeping 2 or 1 decimal places, or rounding to an integer, may score better
    if_round = False
    test_pred2 = np.round(test_pred, 2)
    test_s2 = tool.regression_score(test_y, test_pred2)
    if test_s < test_s2 - threshold:
        if_round = 2
        test_s = test_s2
    test_pred2 = np.round(test_pred, 1)
    test_s2 = tool.regression_score(test_y, test_pred2)
    if test_s < test_s2 - threshold:
        if_round = 1
        test_s = test_s2
    test_pred2 = np.round(test_pred, 0)
    test_s2 = tool.regression_score(test_y, test_pred2)
    if test_s < test_s2 - threshold:
        if_round = 0
        test_s = test_s2

    print("best iteration: ", temp_model.best_iteration)
    print("test score: ", test_s)

    predict_set = xgb.DMatrix(predict_x)
    predict_target = temp_model.predict(predict_set)
    predict_target = np.array(predict_target)
    if predict_col in enum_col or predict_col in ext_enum_col:
        predict_target = np.array([k_v[i] for i in predict_target])
    elif predict_col in bool_col:
        predict_target = np.where(predict_target > 0.5, 1, 0)

    if if_round is not False:  # 0 (round to integer) is a valid setting, so do not test truthiness
        predict_target = np.round(predict_target, if_round)

    return predict_target, test_s
                     colsample_bytree=0.8,
                     objective='binary:logistic',
                     nthread=4,
                     scale_pos_weight=1,
                     seed=27)
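
# `modelfit` (called below) is not defined in this excerpt. A minimal sketch of a typical
# helper, assuming the target column is named 'cancer' (both the name and the behaviour are assumptions):
import xgboost as xgb


def modelfit(alg, dtrain, predictors, target='cancer', cv_folds=5, early_stopping_rounds=50):
    # use xgb.cv to pick the number of boosting rounds, then refit the sklearn wrapper
    xgb_param = alg.get_xgb_params()
    xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                      nfold=cv_folds, metrics='auc', early_stopping_rounds=early_stopping_rounds)
    alg.set_params(n_estimators=cvresult.shape[0])
    alg.fit(dtrain[predictors], dtrain[target])
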
modelfit(xgb1, train, predictors)

# In[79]:

df = test
df = df.drop('id', axis=1)
df = df.drop('scan_folder', axis=1)

# In[80]:

xgtest = xgb.DMatrix(df[predictors].values)

# In[81]:

preds = xgb1.predict_proba(df)

# In[82]:

preds

# In[110]:

data = []
cols = ['id', 'cancer']
df = test
for i, row in tqdm(df.iterrows(), total=len(df)):
# =============================================================================
# wait
# =============================================================================
while True:
    if os.path.isfile('SUCCESS_803'):
        break
    else:
        sleep(60 * 1)

utils.send_line('{} start'.format(__file__))
# =============================================================================
# load train
# =============================================================================

dtrain = xgb.DMatrix('../data/dtrain.mt')
gc.collect()

# =============================================================================
# xgboost
# =============================================================================

param = {
    'colsample_bylevel': 0.8,
    'subsample': 0.1,
    'eta': 0.1,
    'eval_metric': 'auc',
    'max_depth': 4,
    'objective': 'binary:logistic',
    'silent': 1,
    'tree_method': 'hist',
Example #12
train_columns = x_train.columns

for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)

del df_train
gc.collect()

#split = 80000
split = int(len(x_train) * 0.88)
print("split: " + str(split))
x_train, y_train, x_valid, y_valid = x_train[:split], y_train[:split], x_train[
    split:], y_train[split:]

print('Building DMatrix...')
d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

del x_train, x_valid
gc.collect()

print('Training ...')
params = {}
params['seed'] = 21
params['eta'] = 0.02
params['objective'] = 'reg:linear'
params['eval_metric'] = 'mae'
params['silent'] = 1
params['max_depth'] = 4
params['min_child_weight'] = 1
params['gamma'] = 0.0
Example #13
print(feature_names)
from sklearn.model_selection import KFold
del train['filename']
del train['classification']
k = 3
kfold = KFold(n_splits=k)
features_T = train.drop(['pathologie'], axis=1).values
pathologies_T = train.pathologie
print("Data ready for Kfold")
print("Kfold test starting")
for i, (train_index,
        test_index) in enumerate(kfold.split(features_T, pathologies_T)):
    print('[Fold %d/%d]' % (i + 1, 3))
    X_train, X_test = features_T[train_index], features_T[test_index]
    y_train, y_test = pathologies_T[train_index], pathologies_T[test_index]
    d_train = xgb.DMatrix(X_train, label=y_train, feature_names=feature_names)
    d_test = xgb.DMatrix(X_test, label=y_test, feature_names=feature_names)
    watchlist = [(d_test, 'eval'), (d_train, 'train')]
    print("Training")
    evals_result = {}
    model = xgb.train(params,
                      d_train,
                      num_round,
                      watchlist,
                      evals_result=evals_result)
    print("Trained")
    xgb.plot_importance(model, max_num_features=25)
    from matplotlib import pyplot
    pyplot.show()
    print()
def predict(self, X):
    if self.model is None:
        raise XgbLearnerException("Xgboost model is None")
    dtrain = xgb.DMatrix(X, missing=np.NaN)
    return self.model.predict(dtrain)
Example #15
import xgboost as xgb

# read in data
dtrain = xgb.DMatrix('../../data/data_20170722_01/train_data.txt')
dtest = xgb.DMatrix('../../data/data_20170722_01/test_data.txt')

# specify parameters via map, definition are same as c++ version
param = {'max_depth':22, 'eta':0.1, 'silent':0, 'objective':'binary:logistic','min_child_weight':3,'gamma':14 }

# specify validations set to watch performance
watchlist  = [(dtest,'eval'), (dtrain,'train')]
num_round = 60
bst = xgb.train(param, dtrain, num_round, watchlist)

# this is prediction
preds = bst.predict(dtest)
labels = dtest.get_label()

positive_threshold_list = [0.50, 0.67, 0.80, 0.90, 0.95]

for positive_threshold in positive_threshold_list:
    print('positive_threshold: ' + str(positive_threshold))
    num_correct = sum(1 for i in range(len(preds)) if int(preds[i] > positive_threshold) == labels[i])
    num_pred = len(preds)
    num_error = num_pred - num_correct
    print('error=%d/%d=%f' % (num_error, num_pred, num_error / float(num_pred)))

    print('accuracy=%d/%d=%f' % (num_correct, num_pred, num_correct / float(num_pred)))

    num_true_positive = sum(1 for i in range(len(preds)) if int(preds[i] > positive_threshold) == labels[i] and labels[i] == 1)
    num_positive_pred = sum(1 for i in range(len(preds)) if preds[i] > positive_threshold)
Example #16
def get_dmat(self):
    return xgb.DMatrix(self.X, self.y)
Example #17
def runModel(modelName, num):

    train_data_X,train_data_Y,test_data_X,test_data_Y = getData_old()

    dtrain = xgb.DMatrix(train_data_X, train_data_Y)
    dtest = xgb.DMatrix(test_data_X, test_data_Y)

    # print(dtrain.get_label())
    # print(ca)



    # specify parameters via map
    # params = {'max_depth':10, 'eta':1, 'silent':1, 'objective':'binary:logistic' }

    watchlist = [ (dtrain,'train'), (dtest, 'test') ]  

    params = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'gamma': 0.8,  # minimum loss reduction required for a further split on a leaf; larger is more conservative, range [0, inf)
        'max_depth': 6,  # maximum tree depth, range [1, inf)
        'lambda': 100,  # L2 regularization weight
        'subsample': 0.5,  # sample half of the training instances for each tree, range (0, 1]
        'colsample_bytree': 1,  # column subsampling ratio when building each tree, range (0, 1]
        'min_child_weight': 12,  # minimum sum of instance weight (hessian) needed in a child
        'silent': 1,
        'eta': 0.1,  # step size shrinkage, i.e. the learning rate
        'seed': 30,
        'nthread': 4,  # number of CPU threads; adjust to your machine
    }

    # number of boosting rounds
    num_round = num

    bst = xgb.train(params, dtrain, num_round, watchlist)  # dtrain is the training set


    train_preds = bst.predict(dtrain)    #
    # print ("train_preds",train_preds)
    
    train_predictions = [round(value) for value in train_preds]
    # print ("train_predictions",train_predictions)
    
    y_train = dtrain.get_label()
    # print ("y_train",y_train)
    
    train_accuracy = accuracy_score(y_train, train_predictions)
    # log.info ("Train Accuary: %.2f%%" % (train_accuracy * 100.0))
    

    # make prediction
    preds = bst.predict(dtest)
    predictions = [round(value) for value in preds]
    # log.info ("preds:"+str(preds)) 
    # log.info ("predictions:"+str(predictions))

    y_test = dtest.get_label()
    # log.info ("y_test:"+str(y_test))
    
    test_accuracy = accuracy_score(y_test, predictions)
    # log.info("Test Accuracy: "+str(test_accuracy * 100.0)+"%")

    #save model
    with open(PATH_CURR + '/modelSave/' + modelName + '.pik','wb')as f:  
        pickle.dump(bst,f,-1)
Example #18
def get_test_dmat(self, num_rows):
    rs = np.random.RandomState(432)
    return xgb.DMatrix(
        self.X[rs.randint(0, self.X.shape[0], size=num_rows), :])
Example #19
    def train(self, data=None, log_demand=True, normalize=False):
        """
        Train
        Args:
            data: a DataFrame object, default None. If None, 'processed/reg_train.csv' will be loaded.
            log_demand: a boolean object, default True. If True, the target demand will be log-transformed, i.e.
                        y => log(y + 1).
            normalize: a boolean object, default False. If True, the train data will be normalized.

        Returns: An xgb booster.

        """
        # load data
        logger.info('Loading train data...')
        if data is None:
            data = pd.read_csv('processed/reg_train.csv', dtype=DTYPES)

        logger.info('Setting features for training...')

        # sampling
        logger.info('Sampling...')
        data = data.take(
            np.random.permutation(len(data))[:int(self._batch_size *
                                                  len(data))])

        # prepare training set.
        logger.info("Preparing training set...")
        if log_demand:
            data.loc[:, 'Demanda_uni_equil'] = np.log(
                data.Demanda_uni_equil.values + 1)

        x_data = data[FEATURES]
        feature_names = x_data.columns.tolist()
        x_data = x_data.values
        if normalize:
            x_data = x_data / x_data.max(axis=0)
        y_data = data['Demanda_uni_equil'].values

        # free memory
        del data

        # make cross-validation set.
        logger.info('Cross-validation...')
        x_train, x_test, y_train, y_test = train_test_split(
            x_data, y_data, test_size=self._cv_size)
        dtrain = xgb.DMatrix(data=x_train,
                             label=y_train,
                             feature_names=feature_names)
        dtest = xgb.DMatrix(data=x_test,
                            label=y_test,
                            feature_names=feature_names)

        # free memory
        del x_data, x_train, x_test, y_train, y_test

        print('parameters: \n', self._xgb_params)
        watchlist = [(dtrain, 'train'), (dtest, 'eval')]
        logger.info('Training...')
        xgb_reg = xgb.train(params=self._xgb_params,
                            dtrain=dtrain,
                            num_boost_round=40,
                            evals=watchlist,
                            early_stopping_rounds=5)
        logger.info('Done with training.')
        return xgb_reg
Example #20
File: main.py  Project: qsz13/otto-group
                                               verbose=0,
                                               warm_start=False)
    logistic.fit(features, labels)
    print(cross_val_score(logistic, features, labels, cv=5))


if __name__ == "__main__":
    #read sample
    sample = pd.read_csv('sampleSubmission.csv')

    # Import Data
    tests = pd.read_csv('test.csv')
    tests = tests.drop('id', axis=1)
    scaler = StandardScaler(copy=False, with_mean=True, with_std=True)
    tests = scaler.fit_transform(tests)
    tests_xgb = xgb.DMatrix(tests)

    #features = pd.read_csv('../input/train.csv')
    features = pd.read_csv('train.csv')
    features = features.drop('id', axis=1)

    # Extract target and Encode it to make it manageable by ML algo
    labels = features.target.values
    labels = LabelEncoder().fit_transform(labels)
    #print labels

    # Remove target from train, else it's too easy ...
    features = features.drop('target', axis=1)

    #features = preprocessing.normalize(features)
    scaler = StandardScaler(copy=False, with_mean=True, with_std=True)
Example #21
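
# compute_score (called below) is not defined in this excerpt; a minimal sketch,
# assuming it simply reports classification accuracy (an assumption, not the original helper):
from sklearn.metrics import accuracy_score


def compute_score(y_pred, y_true):
    print('accuracy: {:.4f}'.format(accuracy_score(y_true, y_pred)))
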
params = {
    'booster': 'gbtree',
    'objective': 'multi:softmax',
    'num_class': 2,
    'gamma': 0.2,
    'max_depth': 15,
    'lambda': 2,
    'subsample': 0.8,
    'colsample_bytree': 0.5,
    'min_child_weight': 2,
    'silent': 1,
    'eta': 0.1,
    'seed': 1000
}

d_train = xgb.DMatrix(x_train, y_train)
num_rounds = 100
model = xgb.train(params, d_train, num_rounds)

dtest = xgb.DMatrix(x_test)
y_pred = model.predict(dtest)

compute_score(y_pred, y_test)

fig, ax = plt.subplots(figsize=(15, 15))
plot_importance(model, height=0.5, ax=ax, max_num_features=64)
plt.show()
""" 3. AdaboostClassifier """
print("************ AdaboostClassifier ************")
clf = AdaBoostClassifier(n_estimators=120, learning_rate=0.9)
clf.fit(x_train, y_train)
Example #22
# -*- coding: utf-8 -*-
import xgboost as xgb
import matplotlib.pyplot as plt
import os
import sklearn as sk
import common

work_dir = 'E:/krzys/informatyka-studia/sem-16-2017L/msi2/proj/SmogDetector/SmogDetector.Python/'
os.chdir(work_dir)
f1 = open('./res/res_7_cv.txt', 'w+')
# read in data
dtrain = xgb.DMatrix(work_dir + 'data/' + 'data_reg2_train.txt')
dtest = xgb.DMatrix(work_dir + 'data/' + 'data_reg2_test.txt')
# specify parameters via map
param = [
    ('max_depth', 6),  # depth of tree
    ('booster', 'dart'),
    ('eta', 0.1),  #learning rate, prevents overfitting
    ('silent', 1),  # 1 suppresses log messages
    ('gamma', 1.0),  #  bigger -> more conservative
    #('min_child_weight',1),
    ('objective', 'reg:linear'),
    ('eval_metric', 'rmse'),
    #('eval_metric', 'merror')  # 2 metrics
]

watchlist = [(dtrain, 'train'), (dtest, 'eval')]
num_round = 1000
evals_result = {}
f1.write("Regresja_dart-idx,Regresja_dart-precyzja,Regresja_dart-std\n")
for i_max_depth in range(3, 4):
# missing values
data_set.isnull().sum()  # no missing values
# missingno.matrix(data_set)
# distribution of labels
sns.countplot(target)  # balanced labels
# correlation analysis
sns.heatmap(data_set.corr())  # some features are highly correlated

# split dataset into train and test set
ind = np.random.rand(len(data_set)) < 0.8
train_set_origin = data_set[ind]
test_set_origin = data_set[~ind]
# reset index
train_set = train_set_origin.reset_index(drop=True)
test_set = test_set_origin.reset_index(drop=True)
# train label and test label
train_label = target[ind]
test_label = target[~ind]

# Train model
xgb_train = xgb.DMatrix(train_set, label=train_label)
params = {
    "objective": "multi:softmax",
    "eta": 0.1,
    "max_depth": 5,
    "num_class": 3
}
num_round = 50
xgb_test = xgb.DMatrix(test_set, label=test_label)
watchlist = [(xgb_train, 'train'), (xgb_test, 'test')]
xgb_model = xgb.train(params, xgb_train, num_round, watchlist)
Example #24
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 1000)

    def train(self,
              x_train,
              y_train,
              x_valid=None,
              y_valid=None,
              sample_weights=None):
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    # pred_leaf=True => getting leaf indices
    def predict(self, x):
        return self.gbdt.predict(xgb.DMatrix(x), pred_leaf=True).astype(int)


x_train, y_train, x_test = get_data()

dtrain = xgb.DMatrix(x_train, label=y_train)

xg = XgbWrapper(seed=SEED, params=xgb_params)
xg_cat_embedding_train, xg_cat_embedding_test = get_oof(
    xg, x_train, y_train, x_test)

xg_cat_embedding_ohe_train, xg_cat_embedding_ohe_test = get_sparse_ohe(
    xg_cat_embedding_train, xg_cat_embedding_test)

print("OneHotEncoded XG-Embeddings: {},{}".format(
    xg_cat_embedding_ohe_train.shape, xg_cat_embedding_ohe_test.shape))
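
# get_oof and get_sparse_ohe are used above but not defined in this excerpt. get_oof is
# the usual out-of-fold stacking wrapper; a minimal sketch of get_sparse_ohe, assuming it
# one-hot encodes the leaf-index "embeddings" returned by XgbWrapper.predict (an assumption):
import numpy as np
from sklearn.preprocessing import OneHotEncoder


def get_sparse_ohe(train_leaves, test_leaves):
    # fit one encoder on train and test leaf indices together so both share the same columns
    ohe = OneHotEncoder(handle_unknown='ignore')
    ohe.fit(np.vstack([train_leaves, test_leaves]))
    return ohe.transform(train_leaves), ohe.transform(test_leaves)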
Example #25
import xgboost as xgb
from xgboost import XGBRegressor

hit = pd.read_csv("Hitters.csv")
df = hit.copy()
df = df.dropna()
dms = pd.get_dummies(df[['League', 'Division', 'NewLeague']])
y = df["Salary"]
X_ = df.drop(["Salary", "League", "Division", "NewLeague"],
             axis=1).astype("float64")
X = pd.concat([X_, dms[["League_N", "Division_W", "NewLeague_N"]]], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=42)
DM_train = xgb.DMatrix(data=X_train, label=y_train)
DM_test = xgb.DMatrix(data=X_test, label=y_test)

xgb_model = XGBRegressor().fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
print(np.sqrt(mean_squared_error(y_test, y_pred)))
xgb_grid = {
    'colsample_bytree': [0.4, 0.5, 0.6, 0.9, 1],
    'n_estimators': [100, 200, 500, 1000],
    'max_depth': [2, 3, 4, 5, 6],
    'learning_rate': [0.1, 0.01, 0.5]
}
xgb_reg = XGBRegressor()  # avoid shadowing the imported xgb module
xgb_cv = GridSearchCV(xgb_reg, param_grid=xgb_grid, cv=10, verbose=2)
xgb_cv.fit(X_train, y_train)
print(xgb_cv.best_params_)
Example #26
def predict(self, x):
    return self.gbdt.predict(xgb.DMatrix(x), pred_leaf=True).astype(int)
def arlines_test():
    if sys.version.startswith("2"):
        print("native XGBoost tests only supported on python3")
        return
    import xgboost as xgb
    assert H2OXGBoostEstimator.available() is True

    # Artificial data to be used throughout the test, very simple
    raw_data = {
        'wealthy': [1, 1, 1, 0, 0],
        'ownsTesla': [False, False, False, True, True]
    }
    train_frame = pd.DataFrame(data=raw_data)

    data = train_frame.as_matrix(['wealthy'])
    label = train_frame.as_matrix(['ownsTesla'])

    # Native XGBoost model trained first
    dtrain = xgb.DMatrix(data=data, label=label)
    watchlist = [(dtrain, 'train')]
    param = {
        'eta': 0.7,
        'silent': 1,
        'objective': 'binary:logistic',
        'booster': 'gbtree',
        'max_depth': 2,
        'seed': 1,
        'max_delta_step': 0,
        'alpha': 0,
        'nround': 5
    }
    bst = xgb.train(params=param,
                    dtrain=dtrain,
                    num_boost_round=2,
                    evals=watchlist)
    native_prediction = bst.predict(data=dtrain)
    print(native_prediction)
    assert len(native_prediction) == 5

    # H2O XGBoost model trained
    frame = h2o.H2OFrame(train_frame)
    # Force factor variables, even if recognized correctly
    frame['ownsTesla'] = frame['ownsTesla'].asfactor()
    frame['wealthy'] = frame['wealthy'].asfactor()
    # The ntrees parameter in H2O corresponds to the number of boosting rounds (num_boost_round)
    h2o_model = H2OXGBoostEstimator(training_frame=frame,
                                    learn_rate=0.7,
                                    booster='gbtree',
                                    seed=1,
                                    ntrees=2)
    h2o_model.train(x=['ownsTesla'], y='wealthy', training_frame=frame)
    h2o_prediction = h2o_model.predict(frame['ownsTesla'])
    print(h2o_prediction)

    assert len(h2o_prediction['p0']) == 5

    assert round(h2o_prediction['p0'][0, 0],
                 5) == round(native_prediction[0].item(), 5)
    assert round(h2o_prediction['p0'][1, 0],
                 5) == round(native_prediction[1].item(), 5)
    assert round(h2o_prediction['p0'][2, 0],
                 5) == round(native_prediction[2].item(), 5)
    assert round(h2o_prediction['p0'][3, 0],
                 5) == round(native_prediction[3].item(), 5)
    assert round(h2o_prediction['p0'][4, 0],
                 5) == round(native_prediction[4].item(), 5)
train_X = combined_data.iloc[:train_length, 1:]
train_Y = train_data['SalePrice']
train_Id = combined_data.iloc[:train_length, 0]

test_X = combined_data.iloc[train_length:, 1:]
test_Id = combined_data.iloc[train_length:, 0]

# Price comparison: original sale price vs. log of sale price
fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(10, 5))
axis1.hist(train_Y)
train_Y = np.log1p(train_Y)
axis2.hist(train_Y)

# formatting DMatrix to train xgb
dtrain = xgb.DMatrix(train_X, label=train_Y)

# The error metric: RMSE on the log of the sale prices.
from sklearn.metrics import mean_squared_error
import math
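
# A small helper matching the comment above: train_Y is already log1p of the sale
# price, so RMSE on the log-scale values is the competition metric (the name rmse_log is illustrative):
def rmse_log(y_true_log, y_pred_log):
    return math.sqrt(mean_squared_error(y_true_log, y_pred_log))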


# UDF: a range function that supports decimal steps
def common_num_range(start, stop, step):
    startlen = stoplen = steplen = 0
    if '.' in str(start):
        startlen = len(str(start)) - str(start).index('.') - 1
    if '.' in str(stop):
        stoplen = len(str(stop)) - str(stop).index('.') - 1
    if '.' in str(step):
        steplen = len(str(step)) - str(step).index('.') - 1
with open(root_folder + model_object_folder + features_xgboost_file,
          "rb") as pickle_features_file:
    features = pickle.load(pickle_features_file)

# save the model to disk
with open(root_folder + model_object_folder + xgboost_reference_price_model,
          "rb") as pickle_output_file:
    xgb_model = pickle.load(pickle_output_file)

features_importance =\
    pd.read_csv(root_folder + model_validation_folder + features_importance_xgboost_csv, sep=';', decimal=',')
features_importance =\
    features_importance.sort_values(by=['Importance'], ascending=False)

predictions = xgb_model.predict(xgb.DMatrix(X_test, label=y_test))
df_measures_knn =\
    pd.DataFrame(columns=['k','n_components','mean_dist_25_75','median_dist_25_75',
                          'total_dist_25_75','mean_variance','median_variance',
                          'total_variance','mean_ratio_variance','median_ratio_variance',
                          'total_ratio_variance','mean_ratio_interquartile','median_ratio_interquartile',
                          'total_ratio_interquartile','df_coverage_real_spread','df_coverage_predicted_spread',
                          'df_dist_prediction_quartiles_to_mean','df_dist_real_quartiles_to_mean',
                          'mean_dist_q_prediction_to_mean','median_dist_q_prediction_to_mean',
                          'total_dist_q_prediction_to_mean','mean_dist_q_real_to_mean',
                          'median_dist_q_real_to_mean','total_dist_q_real_to_mean',
                          'mean_dist_prediction_to_mean', 'median_dist_prediction_to_mean',
                          'total_dist_prediction_to_mean', 'mean_dist_real_to_mean',
                          'median_dist_real_to_mean', 'total_dist_real_to_mean',
                          'mean_dist_25_75_including_pred','median_dist_25_75_including_pred',
                          'total_dist_25_75_including_pred','mean_increase_including_pred'])
Example #30
def get_tranformer_score(tranformer):
    xrf = tranformer
    dpredict = xgb.DMatrix(X_test)
    prediction = xrf.predict(dpredict, ntree_limit=xrf.best_ntree_limit)

    return mean_squared_error(y_test, prediction)