Пример #1
0
def runXGBoost(x_train, y_train, x_test, y_test, p):
    """Fit an XGBClassifier configured by ``p`` and report its accuracy.

    Args:
        x_train: Training features passed to ``fit``.
        y_train: Training labels.
        x_test: Held-out features to predict on.
        y_test: Held-out labels compared against the predictions.
        p: Mapping of keyword parameters applied with ``set_params``.

    Returns:
        Tuple ``(train_score, test_score)`` with the accuracy on the
        training data and on the test data.
    """
    clf = XGBClassifier()
    clf.set_params(**p)
    clf.fit(x_train, y_train)

    # Predict the test set once; both log lines below report this score.
    test_predictions = clf.predict(x_test)
    test_score = accuracy_score(y_test, test_predictions)
    print("XGB classification accuracy on test data is " + str(test_score),
          file=sys.stderr)
    print("accuracy score on test data: " + str(test_score), file=sys.stderr)

    train_score = accuracy_score(y_train, clf.predict(x_train))
    print("accuracy score on training data: " + str(train_score),
          file=sys.stderr)

    return (train_score, test_score)
Пример #2
0
def modelfit(useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Train an XGBClassifier and print evaluation metrics.

    Relies on names defined outside this function: ``params``, ``data``,
    ``xgtrain``, ``X`` and ``y``. When cross-validation is enabled,
    ``params['n_estimators']`` is overwritten with the selected round count.

    Args:
        useTrainCV: When true, run ``xgb.cv`` first and use the number of
            rows in its result as ``n_estimators``.
        cv_folds: Number of folds passed to ``xgb.cv``.
        early_stopping_rounds: Early-stopping patience passed to ``xgb.cv``.

    Returns:
        The fitted classifier.
    """
    alg = XGBClassifier(**params)
    # Metrics are reported on a random 30% sample of ``data``.
    df = data.sample(frac=0.3)
    pX = df.drop('LABEL', axis=1)
    py = df['LABEL']
    if useTrainCV:
        print("start use cv")
        cvresult = xgb.cv(alg.get_xgb_params(),
                          xgtrain,
                          # Read the round budget from the sklearn parameters,
                          # where ``n_estimators`` is always present.
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds,
                          metrics='auc',
                          early_stopping_rounds=early_stopping_rounds)
        best_rounds = cvresult.shape[0]
        print(best_rounds)
        alg.set_params(n_estimators=best_rounds)
        params['n_estimators'] = best_rounds
        print("best tree size is {}".format(best_rounds))
    # Fit the algorithm on the data
    alg.fit(X, y, eval_metric='auc')
    y_pred = alg.predict(pX)
    # AUC is a ranking metric: score it on the positive-class probability,
    # not on the thresholded labels.
    # NOTE(review): assumes a binary problem (column 1 = positive class).
    y_score = alg.predict_proba(pX)[:, 1]
    accuracy = metrics.accuracy_score(py, y_pred)
    print("精确率Accuracy: %.2f%%" % (accuracy * 100.0))
    print('auc:', metrics.roc_auc_score(py, y_score))
    train_report = metrics.classification_report(py, y_pred)
    print(train_report)
    feat_imp = pd.Series(
        alg.get_booster().get_fscore()).sort_values(ascending=False)
    print(feat_imp)
    return alg
Пример #3
0
def Create_Model(X_train, X_test, y_train, y_test, learning_rate, n_estimators,
                 max_depth, min_child_weight, gamma, subsample,
                 colsample_bytree, reg_alpha, eval_metric):
    """Build, cross-validate and fit a binary XGBClassifier.

    ``X_test`` and ``y_test`` are accepted but not used here.

    Args:
        X_train: Training features; ``.values`` is read for the DMatrix.
        y_train: Training labels; ``.values`` is read for the DMatrix.
        learning_rate, n_estimators, max_depth, min_child_weight, gamma,
        subsample, colsample_bytree, reg_alpha: Forwarded unchanged to the
            ``XGBClassifier`` constructor.
        eval_metric: Metric name(s) handed to ``xgb.cv`` as ``metrics``.

    Returns:
        The classifier fitted on ``X_train`` / ``y_train``.
    """
    hyper_params = dict(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_child_weight=min_child_weight,
        gamma=gamma,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        reg_alpha=reg_alpha,
        objective='binary:logistic',
        nthread=4,
        seed=12,
    )
    classifier = XGBClassifier(**hyper_params)

    booster_params = classifier.get_xgb_params()
    boosting_rounds = classifier.get_params()['n_estimators']
    dtrain = xgb.DMatrix(X_train.values, label=y_train.values)

    # NOTE(review): no early stopping is requested, so the CV history
    # presumably has one row per boosting round -- confirm this is intended.
    cv_history = xgb.cv(booster_params,
                        dtrain,
                        num_boost_round=boosting_rounds,
                        nfold=5,
                        metrics=eval_metric)

    classifier.set_params(n_estimators=cv_history.shape[0])
    classifier.fit(X_train, y_train)
    return classifier
Пример #4
0
def predict_xgboost(param_grid, train, test, col_type, nthread=3, seed=1):
    """Fit an XGBClassifier on ``train`` and score the rows of ``test``.

    Args:
        param_grid: Mapping of parameter name to a sequence of candidates;
            only the first candidate of each entry is used.
        train: DataFrame holding the feature and target columns.
        test: DataFrame holding the feature and ID columns.
        col_type: Mapping with the keys ``'target'``, ``'features'`` and
            ``'ID'`` naming the corresponding columns.
        nthread: Value stored under the ``nthread`` parameter.
        seed: Value stored under both ``random_state`` and ``seed``.

    Returns:
        DataFrame with the ID column of ``test`` and a ``pred_xgboost``
        column holding the second column of ``predict_proba``.
    """
    target = col_type['target']
    features = col_type['features']
    ID = col_type['ID']

    params = {key: value[0] for key, value in param_grid.items()}
    params.update(
        objective='binary:logistic',
        nthread=nthread,
        random_state=seed,
        seed=seed,
        silent=True,
    )

    xgb_model = XGBClassifier()
    xgb_model.set_params(**params)

    X_train = train[features].values
    y_train = train[target].values
    X_test = test[features].values

    # Fit the algorithm on train data
    xgb_model.fit(X_train, y_train, eval_metric='auc')

    # Predict on test data
    scores = xgb_model.predict_proba(X_test)[:, 1]

    # ``pd.concat(axis=1)`` aligns on index labels, so the scores must carry
    # the index of ``test``; otherwise a filtered or shuffled ``test`` frame
    # would produce misaligned rows filled with NaN.
    pred = pd.concat([test.loc[:, [ID]],
                      pd.Series(scores, index=test.index,
                                name='pred_xgboost')],
                     axis=1)

    return pred
 def eval_fn(params):
     """Cross-validated objective for one hyper-parameter candidate.

     Uses names from the enclosing scope: ``n_estimators_max``,
     ``learning_rate``, ``seed``, ``skf``, ``X_train``, ``y_train``,
     ``n_folds``, ``n_estimators_lst``, ``X_valid``, ``y_valid``,
     ``score_valid`` and ``verbose``.

     Args:
         params: Mapping of parameters applied to the model with
             ``set_params``.

     Returns:
         The model's ``best_score`` averaged over the folds.
     """
     model = XGBClassifier(n_estimators=n_estimators_max, learning_rate=learning_rate, seed=seed)
     score = 0
     n_estimators = 0
     for tr, va in skf:
         X_tr, y_tr = X_train[tr], y_train[tr]
         X_va, y_va = X_train[va], y_train[va]
         model.set_params(**params)
         model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], eval_metric='logloss',
                   early_stopping_rounds=50, verbose=False)
         score += model.best_score
         n_estimators += model.best_iteration
     score /= n_folds
     # Floor division keeps the averaged tree count an integer on Python 2
     # and Python 3 alike; it is assigned to ``model.n_estimators`` below.
     n_estimators //= n_folds
     n_estimators_lst.append(n_estimators)
     result_str = "train:%.4f ntree:%5d  " % (score, n_estimators)
     if X_valid is not None:
         # Refit on the full training data with the averaged tree count and
         # record the log-loss on the validation data.
         model.n_estimators = n_estimators
         model.fit(X_train, y_train)
         pr = model.predict_proba(X_valid)[:,1]
         sc_valid = log_loss(y_valid, pr)
         score_valid.append(sc_valid)
         result_str += "valid:%.4f" % sc_valid
     if verbose:
         # Call form works as a statement on Python 2 and a function on 3.
         print(result_str)
     return score
Пример #6
0
def xgb_model(x1, y1):
    """Fit a class-weighted XGBClassifier and list its important features.

    Reads the module-level names ``SEED``, ``subsample`` and ``smote``.

    Args:
        x1: Feature table; split 70/30 into train and test parts.
        y1: Label table containing a ``psych_hosp`` column.

    Returns:
        Tuple ``(imp, feats)``: the index-reset result of ``importances``
        on the test split, and the list of ``Feature`` values whose
        ``Importance`` is at least 0.0001.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        x1, y1, test_size=0.3, random_state=SEED
    )

    # Optional re-balancing of the training split (down-sampling / SMOTE).
    if subsample is True:
        X_train, y_train = subsample_df(X_train, y_train)
    if smote is True:
        X_train, y_train = smote_sample(X_train, y_train)

    feature_columns = X_train.columns
    labels = y_train["psych_hosp"].values

    # Negative-to-positive ratio, used as ``scale_pos_weight``.
    ratio = float(np.sum(labels == 0) / np.sum(labels == 1))

    classifier = XGBClassifier(
        learning_rate=0.1,
        n_estimators=500,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="binary:logistic",
        nthread=4,
        scale_pos_weight=ratio,
        seed=SEED,
    )

    # Choose the number of boosting rounds by early-stopped 5-fold CV.
    dtrain = xgb.DMatrix(X_train[feature_columns].values, label=labels)
    cv_history = xgb.cv(
        classifier.get_xgb_params(),
        dtrain,
        num_boost_round=classifier.get_params()["n_estimators"],
        nfold=5,
        metrics="auc",
        early_stopping_rounds=50,
    )
    classifier.set_params(n_estimators=cv_history.shape[0])

    classifier.fit(X_train, np.ravel(y_train), eval_metric="auc")

    imp = importances(classifier, X_test, y_test).reset_index()
    feats = list(imp.loc[imp["Importance"] >= 0.0001, "Feature"])

    return imp, feats
Пример #7
0
def XGBoost_training_single(Training, label, XGBoost_param):
    """Fit a single XGBClassifier with fixed row/column subsampling.

    Args:
        Training: Training features passed to ``fit``.
        label: Training labels passed to ``fit``.
        XGBoost_param: Mapping of classifier parameters. ``subsample`` and
            ``colsample_bytree`` are always forced to 0.8. The mapping
            itself is not modified.

    Returns:
        The fitted classifier.
    """
    # Work on a copy so the caller's dictionary is left untouched.
    params = dict(XGBoost_param, subsample=0.8, colsample_bytree=0.8)

    bst = XGBClassifier()
    bst.set_params(**params)
    bst.fit(Training, label)

    return bst
Пример #8
0
def test_param(params, X_train, y_train, X_test, y_test, seed, verbose=True):
    """Train a 9-class softmax XGBClassifier with ``params`` and score it.

    Args:
        params: Mapping of parameters applied with ``set_params``.
        X_train, y_train: Data handed to ``modelfit`` for training.
        X_test, y_test: Data handed to ``evaluate`` for scoring.
        seed: Seed passed to the classifier constructor.
        verbose: Forwarded to ``modelfit``.

    Returns:
        Tuple ``(model, score)`` where ``score`` is whatever ``evaluate``
        returns for the test data.
    """
    # Build a model with the requested parameters.
    classifier = XGBClassifier(objective='multi:softmax', num_class=9,
                               seed=seed)
    classifier.set_params(**params)

    # Train the model on the training portion of the dataset.
    modelfit(classifier, X_train, y_train, verbose=verbose)

    # Score the model on the data passed as X_test / y_test.
    return classifier, evaluate(classifier, X_test, y_test)
Пример #9
0
def grid_search(xgb_model, param_search, dtrain):
    """Run a 5-fold grid search seeded with the parameters of ``xgb_model``.

    Args:
        xgb_model: Estimator whose ``get_params()`` are copied onto the
            estimator used for the search.
        param_search: Parameter grid handed to ``GridSearchCV``.
        dtrain: Table whose ``.values`` hold the label in column 0 and the
            features in the remaining columns.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    search_estimator = XGBClassifier(
        booster='gbtree', learning_rate=0.1, n_estimators=300, max_depth=6,
        reg_alpha=0.05, min_child_weight=1, gamma=0, subsample=0.8,
        colsample_bytree=1, objective='multi:softmax', num_class=10,
        scale_pos_weight=1)
    # Every parameter reported by ``xgb_model`` overrides the values above.
    search_estimator.set_params(**xgb_model.get_params())

    table = dtrain.values
    gsearch = GridSearchCV(estimator=search_estimator,
                           param_grid=param_search,
                           cv=5)
    gsearch.fit(table[:, 1:], table[:, 0])

    for report in (gsearch.cv_results_, gsearch.best_score_,
                   gsearch.best_params_):
        print(report)
    return gsearch
def Blend(Number_of_algo, L, Phi, Chi, N, X, y):
    '''Train the blended XGBClassifier on a dataset assembled over L levels.

    For every level a random 70% of the rows is used to build models with
    ``Build_Models``; the remaining rows are extended with the G and F
    matrices computed from those models. The per-level datasets are stacked
    row-wise and an ``XGBClassifier`` configured with ``Phi['params']`` is
    fitted on the result.

    Args:
        Number_of_algo: Forwarded to ``G`` and ``F``.
        L: Number of levels (sampling rounds); must be at least 1.
        Phi: Mapping whose ``'params'`` entry configures the blender.
        Chi: Forwarded to ``Build_Models``.
        N: Forwarded to ``Build_Models``.
        X: Feature array indexable by a list of row indices.
        y: Label array indexable by a list of row indices.

    Returns:
        The fitted ``XGBClassifier``.

    Raises:
        ValueError: If ``L`` is smaller than 1 (there is nothing to stack).
    '''
    if L < 1:
        raise ValueError("L must be at least 1, got %r" % (L,))

    rho = 0.7
    total_rows = len(X)
    sample_size = int(rho * total_rows)  # rows used to build models per level
    all_indices = range(total_rows)

    Levels_D_fw = []
    for _ in range(L):
        new_indices = random.sample(all_indices, sample_size)
        D_dash = X[new_indices]
        Labels_dash = y[new_indices]

        # Rows that were not sampled; sorted so the row order does not
        # depend on set iteration order.
        remaining_indices = sorted(set(all_indices) - set(new_indices))
        D_complement = X[remaining_indices]
        # Column vector so it can be concatenated next to the features.
        Labels_complement = y[remaining_indices].reshape(-1, 1)

        M = Build_Models(N, Chi, D_dash, Labels_dash)
        Models_count = len(M)
        # NOTE(review): the meaning of the literal 3 is defined by ``G``,
        # which is not visible here.
        G_Matrix = G(M, D_complement, Models_count, Number_of_algo, 3)
        F_Matrix = F(G_Matrix, D_complement, Models_count, Number_of_algo)

        # Original author's note: the column count is d (data columns)
        # + N*c (models * classes) + d*N*c + 1 (label column).
        Levels_D_fw.append(
            np.concatenate(
                (D_complement, G_Matrix, F_Matrix, Labels_complement),
                axis=1))

    # Stack the per-level datasets row-wise.
    D_fw = np.concatenate(Levels_D_fw, axis=0)

    Object = XGBClassifier()
    Object.set_params(**Phi['params'])
    # The last column holds the labels; pass them as a 1-D vector rather
    # than as an (n, 1) column.
    Object.fit(D_fw[:, :-1], D_fw[:, -1])
    return Object
def main():
    """Train on ``args.train_dataset`` and predict ``args.test_dataset``.

    Reads the module-level names ``args`` and ``best_dicts``.

    Returns:
        Tuple ``(clf, results)``: the fitted classifier, and a DataFrame
        with the test ``Id`` column followed by one ``PredictionN`` column
        per column of ``predict_proba``.
    """
    data_train = pd.read_csv(args.train_dataset)
    X_train = data_train.drop(['Id', 'Class'], axis=1)
    y_train = data_train.loc[:, 'Class']

    data_test = pd.read_csv(args.test_dataset)
    X_test = data_test.drop(['Id'], axis=1)
    Id = data_test.loc[:, 'Id']

    clf = XGBClassifier()
    clf.set_params(**best_dicts)
    clf.fit(X_train, y_train)

    prediction = clf.predict_proba(X_test)
    # Name one column per predicted class instead of assuming exactly nine.
    columns = ['Prediction' + str(i)
               for i in range(1, prediction.shape[1] + 1)]
    prediction = pd.DataFrame(prediction, columns=columns)
    results = pd.concat([Id, prediction], axis=1)
    return (clf, results)
Пример #12
0
def learn_xgboost(train, predictors, class_target):
    """Cross-validate the boosting-round count and fit an XGBClassifier.

    Args:
        train: DataFrame containing the predictor and target columns.
        predictors: Column names used as features.
        class_target: Name of the target column.

    Returns:
        The fitted classifier.
    """
    # Tuning parameters
    cv_folds = 10
    early_stopping_rounds = 20

    params = {
        'reg_alpha': 0,
        'colsample_bytree': 0.6,
        'silent': 1,
        'colsample_bylevel': 1,
        'scale_pos_weight': 1,
        'learning_rate': 0.1,
        'missing': -1,
        'max_delta_step': 0,
        'nthread': 4,
        'base_score': 0.5,
        'n_estimators': 500,
        'subsample': 0.7,
        'reg_lambda': 1,
        'seed': 0,
        'min_child_weight': 1,
        'objective': 'multi:softprob',
        'max_depth': 6,
        'gamma': 0,
        'num_class': 2
    }

    # Cross-validation
    xgb_train = xgb.DMatrix(train[predictors].values,
                            label=train[class_target].values)
    cvresult = xgb.cv(params,
                      xgb_train,
                      num_boost_round=params['n_estimators'],
                      nfold=cv_folds,
                      metrics=['mlogloss'],
                      early_stopping_rounds=early_stopping_rounds)

    xgb1 = XGBClassifier()
    # The round count was selected for the hyper-parameters above, so the
    # final model must use them as well instead of the library defaults.
    # Skipped on purpose:
    #   * n_estimators      -- replaced by the cross-validated value below;
    #   * objective/num_class -- left to the classifier's own choice
    #     (NOTE(review): confirm against the installed xgboost version);
    #   * missing           -- the DMatrix used for CV was built without a
    #     custom missing marker, so the classifier keeps its default too.
    skipped = {'n_estimators', 'objective', 'num_class', 'missing'}
    supported = set(xgb1.get_params())
    shared = {name: value for name, value in params.items()
              if name in supported and name not in skipped}
    xgb1.set_params(n_estimators=cvresult.shape[0], **shared)

    xgb1.fit(train[predictors], train[class_target], eval_metric=['mlogloss'])
    return xgb1
Пример #13
0
def XGBoost_training(Training,
                     label,
                     XGBoost_param,
                     cv,
                     threads=6):  # Classification method: XGBoost
    """Grid-search an XGBClassifier by ROC AUC and return the best model.

    Args:
        Training: Training features.
        label: Training labels.
        XGBoost_param: Parameter grid handed to ``GridSearchCV``.
        cv: Cross-validation specification handed to ``GridSearchCV``.
        threads: Number of parallel jobs for the search.

    Returns:
        The best estimator, refitted on the full training data.
    """
    XGBoost_gs = GridSearchCV(estimator=XGBClassifier(subsample=0.8,
                                                      colsample_bytree=0.8),
                              param_grid=XGBoost_param,
                              scoring='roc_auc',
                              n_jobs=threads,
                              iid=False,
                              cv=cv)
    XGBoost_gs.fit(Training, label)

    # With the default ``refit=True`` the search already holds the winning
    # configuration fitted on all of ``Training``. Returning it keeps the
    # base settings (subsample / colsample_bytree = 0.8) that the search was
    # scored with, which a fresh default classifier would drop.
    return XGBoost_gs.best_estimator_
Пример #14
0
def main():
    """Tune an XGBClassifier on the agaricus data in three grid stages.

    Each stage calls ``tune`` on a parameter grid, copies the selected
    values (read from index 1 of the ``tune`` result) onto the classifier,
    and re-runs ``modelfit``. ``base``, ``tune`` and ``modelfit`` are
    defined elsewhere.
    """
    # Load data for XGBClassifier
    dtrain = load_svmlight_file("../data/agaricus.txt.train")
    dtest = load_svmlight_file("../data/agaricus.txt.test")

    # Load the training data again as a DMatrix for the native xgboost API.
    xgtrain = xgb.DMatrix("../data/agaricus.txt.train")

    xgb0 = XGBClassifier(learning_rate=0.1,
                         n_estimators=2,
                         max_depth=5,
                         min_child_weight=1,
                         gamma=0,
                         subsample=0.8,
                         colsample_bytree=0.8,
                         objective='binary:logistic',
                         nthread=4,
                         scale_pos_weight=1,
                         seed=27)

    base(xgb0, dtrain, dtest, xgtrain)

    # Stage 1: tree depth and minimum child weight.
    param_test1 = {
        'max_depth': list(range(3, 10, 2)),
        'min_child_weight': list(range(1, 6, 2))
    }
    gridSearch_res1 = tune(xgb0, param_test=param_test1, dtrain=dtrain)
    xgb0.set_params(max_depth=gridSearch_res1[1]['max_depth'],
                    min_child_weight=gridSearch_res1[1]['min_child_weight'])
    res1 = modelfit(xgb0, dtrain, dtest, xgtrain)
    print(res1)

    # Stage 2: gamma. Dividing by a float keeps the grid at 0.0 .. 0.4 even
    # where ``/`` is integer division, matching the stage-3 grid below.
    param_test2 = {'gamma': [i / 10.0 for i in range(0, 5)]}
    gridSearch_res2 = tune(xgb0, param_test=param_test2, dtrain=dtrain)
    xgb0.set_params(gamma=gridSearch_res2[1]['gamma'])
    modelfit(xgb0, dtrain, dtest, xgtrain)

    # Stage 3: row and column subsampling.
    param_test3 = {
        'subsample': [i / 10.0 for i in range(6, 10)],
        'colsample_bytree': [i / 10.0 for i in range(6, 10)]
    }
    gridSearch_res3 = tune(xgb0, param_test=param_test3, dtrain=dtrain)
    xgb0.set_params(subsample=gridSearch_res3[1]['subsample'],
                    colsample_bytree=gridSearch_res3[1]['colsample_bytree'])
    modelfit(xgb0, dtrain, dtest, xgtrain)
Пример #15
0
    def _collect_features(self, dataset, cache_file):
        """Return ``(features, labels)`` for ``dataset``, caching features.

        If ``cache_file`` exists, the features are loaded from it and the
        labels are taken from one ``get_minibatch`` call over the whole
        dataset. Otherwise the model's ``features`` tensor is evaluated
        batch by batch and the result is saved to ``cache_file``.
        """
        if os.path.exists(cache_file):
            features = np.load(cache_file)
            _, _, labels, _, _ = self.get_minibatch(dataset, 0, len(dataset))
            return features, labels

        feature_chunks = []
        labels = []
        # Integer ceil-division: plain ``/`` yields a float on Python 3,
        # which ``range`` rejects.
        total_batch = (len(dataset) - 1) // self.batch_size + 1
        for i in tqdm(range(total_batch)):
            (premise_vectors, hypothesis_vectors, batch_labels,
             prem_dep, hypo_dep) = self.get_minibatch(
                 dataset, i * self.batch_size, (i + 1) * self.batch_size)
            feed_dict = {
                self.model.premise_x: premise_vectors,
                self.model.hypothesis_x: hypothesis_vectors,
                self.model.y: batch_labels,
                self.model.keep_rate_ph: 1.0
            }
            if 'dep_avg' in self.model_type:
                feed_dict[self.model.prem_dep] = prem_dep
                feed_dict[self.model.hypo_dep] = hypo_dep
            batch_features = self.sess.run([self.model.features], feed_dict)
            feature_chunks.append(batch_features[0])
            labels += batch_labels

        # Concatenate once at the end instead of re-copying the growing
        # array after every batch.
        features = np.concatenate(feature_chunks) if feature_chunks else None
        np.save(cache_file, features)
        return features, labels

    def train(self, train_set, dev_set):
        """Fit XGBClassifiers on model features; keep the best on dev.

        Features for both sets come from ``_collect_features`` using the
        module-level cache paths ``train_features_file`` and
        ``dev_features_file``. Every combination of ``max_depth`` and
        ``n_estimators`` in the grid below is fitted on the training
        features and scored on the dev features; the classifier with the
        highest score is stored in ``self.clf``.
        """
        logger.log('Get features from training set')
        train_features, train_labels = self._collect_features(
            train_set, train_features_file)

        logger.log('Get features from dev set')
        dev_features, dev_labels = self._collect_features(
            dev_set, dev_features_file)

        tuned_parameters = {'max_depth': [4, 6, 8], 'n_estimators': [100, 200]}

        best_score = 0.
        best_params = []
        for g in ParameterGrid(tuned_parameters):
            clf = XGBClassifier(nthread=24)
            clf.set_params(**g)
            clf.fit(train_features, train_labels)
            score = clf.score(dev_features, dev_labels)
            logger.log('%s: %f' % (str(g), score))
            if best_score < score:
                best_score = score
                best_params = g
                self.clf = clf

        logger.log('Best score: %s %f' % (str(best_params), best_score))
Пример #16
0
from sklearn.ensemble import RandomForestClassifier as rfc

# Random forest: the alias ``rfc`` is rebound from the class to a
# configured instance.
rfc = rfc()
rfc.set_params(n_estimators=1000, min_samples_split=5, min_samples_leaf=1, max_features='sqrt', 
               max_depth=60, random_state=42, class_weight={0:.2, 1:.8})

# Features are every column except the target; target values 2 and 1 are
# remapped to 1 and 0 respectively.
FinalTrainX = FinalTrain.drop('readmitted', axis=1)
FinalTrainY = FinalTrain['readmitted'].replace([2,1], [1,0])

rfc.fit(FinalTrainX, FinalTrainY)

from xgboost.sklearn import XGBClassifier as xgb

# Gradient boosting: as above, ``xgb`` ends up naming the instance.
xgb = xgb()
xgb.set_params(n_estimators=500, min_child_weight=10, max_depth=5, gamma=5, colsample_bytree=0.6, max_delta_step=5,
              random_state=42, scale_pos_weight=1)

xgb.fit(FinalTrainX, FinalTrainY)


# Now, import the new cleaned test set and run it through our model.
# (``pd.read_csv`` is the pandas reader; ``pd.read.csv`` does not exist.)
TestDF = pd.read_csv(filename)
TestDF = TestDF.drop('IsTrain', axis=1)

# Columns removed before scoring with ``lgr`` (defined elsewhere).
TestDFLR = TestDF.drop(['diabfeat_neurologic', 'race_AfricanAmerican', 'A1Cresult_>7', 'primarydiag_injury', 'number_diagnoses', 
    'med_glimepiride', 'med_insulin', 'diag_infection', 'medical_specialty_Orthopedics', 'med_nateglinide', 'discharge_disposition_leftAMA', 
    'admission_source_id_3', 'change_Ch', 'diag_circulatory', 'medical_specialty_Gastroenterology', 'medical_specialty_Surgery',
    'primarydiag_infection', 'primarydiag_mentaldis'], axis=1)

predictprobsLR = lgr.predict_proba(TestDFLR)[:,1]
Пример #17
0
# Hold out a random 30% of the data for validation (test_size=0.3).
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train, y_train, test_size=0.3, random_state=1981)

gc.collect()
print("loaded data")

# Hyper-parameters applied to the classifier below.
parameters = dict(
    learning_rate=0.02,
    max_depth=4,
    min_child_weight=1,
    subsample=0.9,
    colsample_bytree=0.9,
    objective='binary:logistic',
    scale_pos_weight=1,
    random_state=1987,
    silent=True,
)

model = XGBClassifier()
model.set_params(**parameters)

# Every column of ``train_df`` except the target and the id is a predictor.
predictors = [x for x in train_df.columns if x not in ('target', 'id')]

# Alternative entry points (single_run / multiple_run) are not used here.
grid_search(model, train_df, predictors)
Пример #18
0
def xgb_model2(x1, y1, ft):
    """Fit, evaluate and plot an XGBClassifier on the features ``ft``.

    Reads the module-level names ``SEED``, ``subsample``, ``smote`` and
    ``outfile``. Writes three files under ``output/`` (feature-dependence
    heatmap, importance plot, ROC curve), prints evaluation reports and
    shows the ROC figure.

    Args:
        x1: Feature table; only the columns listed in ``ft`` are used.
        y1: Label table containing a ``psych_hosp`` column.
        ft: Feature names to keep.

    Returns:
        Tuple ``(imp, feats)``: the index-reset result of ``importances``
        on the test split, and the list of ``Feature`` values whose
        ``Importance`` is negative.
    """
    ## Remove features that negatively impact the model - Used after xgb_mode2 is already run once
    ##Copy results from XGB2_FEATS into 'unwanted'
    # unwanted = {'fever_unknown', 'other_ext_injury', 'med_angiotensin_ii_i', 'wbc_disease', 'proc_124', 'acute_bronch', 'hemorrhoid', 'chf', 'poison_psycho', 'eye_inflam', 'lower_limb_fract', 'biliary_tract', 'other_bone_disease', 'med_antifungal', 'spondylosis', 'secndry_malig', 'other_joint', 'neoplasm_unspec', 'chest_pain_nos', 'acq_foot_deform', 'mood', 'nonmalig_breast', 'schizo', 'suicide', 'osteo_arth', 'other_connective', 'medical_eval'}
    # ft = [e for e in ft if e not in unwanted]
    print("XGB Features:\n", ft, "\n")

    x1 = x1.loc[:, ft]
    X_train, X_test, y_train, y_test = train_test_split(
        x1, y1, test_size=0.3, random_state=SEED
    )

    # Down-sample controls in training set, [1:1] case:control
    if subsample is True:
        X_train, y_train = subsample_df(X_train, y_train)
    # Implement SMOTE to balance training set, [1:1] case:control
    if smote is True:
        X_train, y_train = smote_sample(X_train, y_train)

    columns = X_train.columns

    # Negative-to-positive ratio, used as ``scale_pos_weight``.
    ratio = float(
        np.sum(y_train["psych_hosp"].values == 0)
        / np.sum(y_train["psych_hosp"].values == 1)
    )

    # Instantiate the XGBClassifier and specify parameters
    xgb1 = XGBClassifier(
        learning_rate=0.1,
        n_estimators=500,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="binary:logistic",
        nthread=4,
        scale_pos_weight=ratio,
        seed=SEED,
    )

    # Choose the number of boosting rounds by early-stopped 5-fold CV.
    xgb_param = xgb1.get_xgb_params()
    xgtrain = xgb.DMatrix(X_train[columns].values, label=y_train["psych_hosp"].values)
    cvresult = xgb.cv(
        xgb_param,
        xgtrain,
        num_boost_round=xgb1.get_params()["n_estimators"],
        nfold=5,
        metrics="auc",
        early_stopping_rounds=50,
    )
    xgb1.set_params(n_estimators=cvresult.shape[0])
    xgb_cv_score = cross_val_score(
        xgb1, X_train, np.ravel(y_train), cv=10, scoring="roc_auc"
    )

    # Fit the algorithm on the data
    xgb1.fit(X_train, np.ravel(y_train), eval_metric="auc")

    D = feature_dependence_matrix(X_train)
    viz1 = plot_dependence_heatmap(D, figsize=(11, 10))
    viz1.save("output/Psych_XGB_feat_depend_" + outfile)

    xgb_predict = xgb1.predict(X_test)
    # Positive-class probabilities. AUC and the ROC curve are both computed
    # from these so the reported area matches the plotted curve; scoring
    # the thresholded labels would collapse the curve to a single point.
    xgb_proba = xgb1.predict_proba(X_test)[:, 1]
    xgb_roc_auc = roc_auc_score(y_test, xgb_proba)

    print("=== All AUC Scores [CV - Train] ===")
    print(xgb_cv_score, "\n")
    print("=== Mean AUC Score [CV - Train] ===")
    print(xgb_cv_score.mean(), "\n")
    print("=== Confusion Matrix [Test] ===")
    print(confusion_matrix(y_test, xgb_predict), "\n")
    print("=== Classification Report [Test] ===")
    print(classification_report(y_test, xgb_predict), "\n")
    print("=== AUC Score [Test] ===")
    print(xgb_roc_auc, "\n")

    imp = importances(xgb1, X_test, y_test)  # permutation
    viz2 = plot_importances(imp)
    viz2.save("output/Psych_XGB_feat_imp_" + outfile)
    imp = imp.reset_index()
    # Features with negative importance are reported back to the caller.
    imp_ = imp[imp["Importance"] < 0.00000]
    feats = list(imp_["Feature"])

    fpr, tpr, _ = roc_curve(y_test, xgb_proba)
    plt.figure()
    plt.plot(fpr, tpr, label="XGB Classifier (area = %0.3f)" % xgb_roc_auc)
    plt.plot([0, 1], [0, 1], "r--")
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver operating characteristic: Clinical Data Only [XGB]")
    plt.legend(loc="lower right")
    plt.savefig("output/ROC_Psych_XGB_" + outfile)
    plt.show()

    return imp, feats
    # NOTE(review): fragment of a larger routine whose header is not visible
    # here; XGBoost_model, cv_inner, train_data/train_target,
    # test_data/test_target, the *_idx indices and the prob_*/aucs_* arrays
    # are all defined outside this span.

    # Hyper-parameter grid searched for the XGBoost model.
    Parmt_XGBoost = {
        'n_estimators': [50, 100],
        'max_depth': [3, 5],
        'learning_rate': [0.01, 0.1, 0.3],
        'colsample_bytree': [0.5, 1],
        'gamma': [0],
    }
    # Grid search scored by ROC AUC over the ``cv_inner`` splits, fitted on
    # the training data.
    Parmt_model_XGBoost = GridSearchCV(estimator=XGBoost_model,
                                       param_grid=Parmt_XGBoost,
                                       scoring='roc_auc',
                                       n_jobs=-1,
                                       cv=cv_inner).fit(
                                           train_data, train_target)
    best_parameters = Parmt_model_XGBoost.best_params_
    # Set best parameters to XGBoost model
    XGBoost_model.set_params(**best_parameters)

    # Train optimized XGBoost model on train data
    XGBoost_model.fit(train_data, train_target)

    # Train data results: store the second predict_proba column and its AUC
    # in column ``ncv_idx``.
    prob_train[train_idx, ncv_idx] = XGBoost_model.predict_proba(train_data)[:,
                                                                             1]
    aucs_train[ncv_idx] = metrics.roc_auc_score(train_target,
                                                prob_train[train_idx, ncv_idx])

    # Test data results: same bookkeeping for the held-out rows.
    prob_test[test_idx, ncv_idx] = XGBoost_model.predict_proba(test_data)[:, 1]
    aucs_test[ncv_idx] = metrics.roc_auc_score(test_target, prob_test[test_idx,
                                                                      ncv_idx])
# NOTE(review): ``x`` is defined before this section -- presumably the
# column(s) to discard from both frames.
train.drop(x, axis=1, inplace=True)
test.drop(x, axis=1, inplace=True)

y_train = train['TARGET'].values
X_train = train.drop(['ID','TARGET'], axis=1).values

# Despite its name, ``y_test`` holds the test IDs used for the submission.
y_test = test['ID']
X_test = test.drop(['ID'], axis=1).values

xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=600,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.6815,
 colsample_bytree=0.701,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)

# Choose the number of boosting rounds by early-stopped 5-fold CV.
# ``verbose_eval=False`` silences per-round output; the older
# ``show_progress`` keyword is not accepted by current xgboost releases.
xgtrain = xgb.DMatrix(X_train, label=y_train)
cvresult = xgb.cv(xgb1.get_xgb_params(), xgtrain, num_boost_round=xgb1.get_params()['n_estimators'], nfold=5,
metrics=['auc'], early_stopping_rounds=50, verbose_eval=False)
xgb1.set_params(n_estimators=cvresult.shape[0])
xgb1.fit(X_train, y_train, eval_metric='auc')
output = xgb1.predict_proba(X_test)[:,1]

submission = pd.DataFrame({"ID":y_test, "TARGET":output})
submission.to_csv("submission.csv", index=False)
class ParamTuner:
    """Step-wise hyper-parameter tuner around a single XGBClassifier.

    The wrapped classifier is updated in place by each tuning step.
    ``tune_num_boost_round`` and ``grid_search`` rely on the module-level
    constants ``NUM_BOOST_ROUND``, ``CV_FOLDS`` and
    ``EARLY_STOPPING_ROUNDS``.
    """

    def __init__(self, X_train, y_train):
        """Create the baseline classifier and keep the training data."""
        self._clf = XGBClassifier(learning_rate=0.01,
                                  n_estimators=1000,
                                  max_depth=5,
                                  min_child_weight=1,
                                  gamma=0,
                                  subsample=0.8,
                                  colsample_bytree=0.8,
                                  objective='binary:logistic',
                                  scale_pos_weight=1,
                                  seed=0)
        self._dtrain = xgb.DMatrix(X_train, label=y_train)
        self._X_train = X_train
        self._y_train = y_train

    @property
    def clf(self):
        """The wrapped classifier."""
        return self._clf

    def show_params(self):
        """Log the classifier's current parameters."""
        logging.info("-" * 40)
        logging.info("current params:\n" + str(self._clf.get_params()))
        logging.info("-" * 40)

    def get_param(self, name):
        """Return the current value of the parameter ``name``."""
        return self._clf.get_params()[name]

    def set_param(self, name, value):
        """Set a single classifier parameter."""
        self._clf.set_params(**{name: value})

    def set_params(self, params):
        """Set several classifier parameters from a mapping."""
        self._clf.set_params(**params)

    def tune_num_boost_round(self):
        """Pick ``n_estimators`` with early-stopped cross-validation."""
        logging.info("turn num_boost_round")
        # ``get_xgb_params`` yields the booster-level parameters expected by
        # ``xgb.cv``; ``get_params`` would also pass sklearn-only entries
        # such as ``n_estimators``.
        history = xgb.cv(self._clf.get_xgb_params(),
                         dtrain=self._dtrain,
                         num_boost_round=NUM_BOOST_ROUND,
                         nfold=CV_FOLDS,
                         metrics='auc',
                         early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                         show_stdv=True)
        logging.info("tail of history:\n" + str(history.tail(1)))
        logging.info("learning rate: %f, best boosting num: %d" %
                     (self.get_param('learning_rate'), history.shape[0]))
        self.set_param('n_estimators', history.shape[0])
        self.show_params()

    def grid_search(self, param_grid):
        """Grid-search ``param_grid`` by ROC AUC and adopt the best values."""
        logging.info("grid search on %s" % param_grid.keys())
        gs = GridSearchCV(estimator=self._clf,
                          param_grid=param_grid,
                          scoring='roc_auc',
                          n_jobs=-1,
                          iid=False,
                          cv=CV_FOLDS)
        gs.fit(X=self._X_train, y=self._y_train)
        # ``grid_scores_`` is gone from scikit-learn; build the same
        # per-candidate summary from ``cv_results_``.
        results = gs.cv_results_
        score_lines = [
            "mean: %.5f, std: %.5f, params: %s" % (mean, std, candidate)
            for mean, std, candidate in zip(results['mean_test_score'],
                                            results['std_test_score'],
                                            results['params'])
        ]
        logging.info("grid_scores:\n" + '\n'.join(score_lines))
        logging.info("best_params: " + str(gs.best_params_))
        logging.info("best_score: " + str(gs.best_score_))
        self.set_params(gs.best_params_)
        self.show_params()
Пример #22
0
def train_model_xgb_cv(
        X_train,
        X_test,
        y_train,
        y_test,
        figure_path='C:/Users/Administrator.USER-20161227PQ/Desktop/paper figure/figure5.png'):
    """Tune, fit and evaluate a binary XGBoost classifier, then plot feature scores.

    Steps:
      1. Pick the number of boosting rounds with 5-fold ``xgb.cv`` (AUC,
         early stopping after 5 rounds) and fit the classifier with it.
      2. Print the test AUC and the test error (``1 - accuracy``).
      3. Run a 2-fold grid search over ``max_depth`` and print its results.
      4. Plot the fitted model's feature scores and save the figure.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels used for evaluation.
        figure_path: Where the feature-score figure is saved.  Defaults to
            the path that was previously hard-coded.
    """
    # Hyper-parameters shared by the tuned model and the grid-search base
    # estimator (``max_depth`` is the only one that differs between them).
    base_params = dict(learning_rate=0.1,
                       n_estimators=300,
                       min_child_weight=1,
                       gamma=0.3,
                       subsample=0.6,
                       colsample_bytree=0.7,
                       objective='binary:logistic',
                       nthread=4,
                       seed=27,
                       reg_lambda=0.01)

    xgb_sklearn = XGBClassifier(max_depth=3, **base_params)

    # get_xgb_params() returns just the booster parameters; get_params()
    # would also hand scikit-learn wrapper settings to xgb.cv.
    dtrain = xgb.DMatrix(X_train, label=y_train)
    cvresult = xgb.cv(xgb_sklearn.get_xgb_params(),
                      dtrain,
                      num_boost_round=base_params['n_estimators'],
                      nfold=5,
                      metrics='auc',
                      early_stopping_rounds=5)
    n_estimators = cvresult.shape[0]
    print("n_estimators: ", n_estimators)
    xgb_sklearn.set_params(n_estimators=n_estimators)
    xgb_sklearn.fit(np.array(X_train), np.array(y_train), eval_metric='auc')

    # auc
    pred_y_prob = xgb_sklearn.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, pred_y_prob)
    print('AUC: ', auc)
    # error
    score = xgb_sklearn.score(X_test, y_test)
    print('error: ', 1 - score)

    # grid search (starts again from the untuned n_estimators=300)
    params = {'max_depth': [2, 3, 4, 5, 6, 7, 8]}
    model = GridSearchCV(estimator=XGBClassifier(**base_params),
                         param_grid=params,
                         cv=2)
    model.fit(np.array(X_train), np.array(y_train), eval_metric='auc')
    print(model.cv_results_, model.best_params_, model.best_score_)

    # NOTE(review): get_fscore reads the feature map file 'xgb.fmap' from
    # the current working directory -- confirm it exists where this runs.
    feat_imp = pd.Series(xgb_sklearn.get_booster().get_fscore(
        fmap='xgb.fmap')).sort_values(ascending=True)
    feat_imp.plot(kind='barh', color='black', legend=False, figsize=(10, 6))
    plt.ylabel('Feature name')
    plt.xlabel('Feature score')
    plt.savefig(figure_path, dpi=300)
    plt.show()
Пример #23
0
#        Makes the model more robust by shrinking the weights on each step
#        Typical final values to be used: 0.01-0.2
xgb1 = XGBClassifier(  # baseline (starting-point) model
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=-1,
    scale_pos_weight=1,
    seed=27)
best_iter = modelfit(xgb1, x, y)  # plots, finds the best iteration count, etc.
xgb1.set_params(n_estimators=best_iter)  # switch to the best iteration count

#%% In the cells below, each one tunes one or two parameters; they all follow the same format
# Some param-grid ranges are very narrow because this is the final code. The ranges started out wide; I narrowed them according to the results and re-ran, so what remains is the final narrow-range code.

#%% Tune max_depth and min_child_weight
gsearch = []
# This variable records the whole tuning process. Each tuned parameter appends one (xgb, para) entry, so the final best model is gsearch[-1][0].best_estimator_. This makes it easy to reuse the model obtained at any earlier step.

#    min_child_weight [default=1]
#        Defines the minimum sum of weights of all observations required in a child.
#        This is similar to min_child_leaf in GBM but not exactly. This refers to min “sum of weights” of observations while GBM has min “number of observations”.
#        Used to control over-fitting. Higher values prevent a model from learning relations which might be highly specific to the particular sample selected for a tree.
#        Too high values can lead to under-fitting hence, it should be tuned using CV.
#    max_depth [default=6]
#        The maximum depth of a tree, same as GBM.
class XGBoostModel:
    """Wrapper around an ``XGBClassifier`` pre-configured for a known data set.

    Supported configurations are ``data_str='voice'`` with more than two
    classes, ``data_str='heart'`` and ``data_str='ads'``.  Any other
    combination raises ``ValueError`` at construction time; previously it
    only printed a message and left ``self.model`` unset, so the first
    ``fit``/``predict`` call failed with an ``AttributeError``.

    Attributes:
        model: The configured ``XGBClassifier`` instance.
    """

    # Parameters found by tuning on voice data with 400 points.  These are
    # the values the model ended up with after the former
    # construct-then-``set_params`` sequence.
    _VOICE_MULTICLASS_PARAMS = {
        "base_score": 0.5,
        "booster": "gbtree",
        "colsample_bylevel": 1,
        "colsample_bynode": 1,
        "colsample_bytree": 0.25,
        "gamma": 0.0,
        "learning_rate": 0.01,
        "max_delta_step": 0,
        "max_depth": 2,
        "min_child_weight": 1,
        "n_estimators": 3600,
        "nthread": 4,
        "objective": "multi:softprob",
        "reg_alpha": 1e-05,
        "reg_lambda": 1,
        "scale_pos_weight": 1,
        "seed": 0,
        "subsample": 0.75,
        "verbosity": 1,
    }

    _HEART_PARAMS = {
        "learning_rate": 0.1,
        "n_estimators": 20,
        "max_depth": 2,
        "min_child_weight": 0.3,
        "gamma": 0,
        "subsample": 0.65,
        "colsample_bytree": 0.15,
        "reg_alpha": 0,
        "objective": "binary:logistic",
        "nthread": 4,
        "scale_pos_weight": 1,
    }

    # TODO: tune model on heart and ads data too
    _ADS_PARAMS = {
        "learning_rate": 0.1,
        "n_estimators": 1000,
        "max_depth": 5,
        "min_child_weight": 1,
        "gamma": 0,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "objective": "binary:logistic",
        "nthread": 4,
        "scale_pos_weight": 1,
        "seed": 0,
    }

    def __init__(self, n_classes=2, data_str=None):
        """Build the classifier for the given data set.

        Args:
            n_classes: Number of target classes; only consulted for the
                ``'voice'`` data set, which requires more than two.
            data_str: Data set name (``'voice'``, ``'heart'`` or ``'ads'``).

        Raises:
            ValueError: If no parameters exist for the requested combination.
        """
        # TODO: decide what nthread should be?
        if n_classes > 2 and data_str == 'voice':
            chosen = self._VOICE_MULTICLASS_PARAMS
        elif data_str == 'heart':
            chosen = self._HEART_PARAMS
        elif data_str == 'ads':
            print('TODO: tune xgboost on ads! not done yet?')
            chosen = self._ADS_PARAMS
        else:
            raise ValueError(
                "XGBoostModel has no tuned parameters for data_str=%r with "
                "n_classes=%r; supported: 'voice' (n_classes > 2), 'heart', "
                "'ads'. Please add tuning parameters to file" %
                (data_str, n_classes))
        self.model = XGBClassifier(**chosen)

    def fit(self, x, y, with_tuning=False):
        """Fit the wrapped classifier on ``x``/``y``.

        ``with_tuning`` is accepted for interface compatibility but is not
        used.
        """
        self.model.fit(x, y)

    def predict(self, x):
        """Return the wrapped classifier's class predictions for ``x``."""
        return self.model.predict(x)

    def predict_proba(self, x):
        """Return the wrapped classifier's class probabilities for ``x``."""
        return self.model.predict_proba(x)
Пример #25
0
def set_parameters(set_name, golden_set, input_file):
    """Run the repeated XGBoost host / non-host classification and save results.

    The run configuration is read from ``<working_dir>/<set_name>/params.yaml``.
    For each of ``N_iterations`` rounds a balanced training set
    (``N_samples`` hosts above ``gas_giant_mass`` plus ``N_samples``
    non-hosts) is drawn at random, the number of boosting rounds is chosen
    with ``xgb.cv``, a classifier is fitted, and every star outside the
    training set is predicted.  Per-star prediction counts, feature scores,
    training AUC / precision and a row-normalised confusion matrix are
    written to paths that start with ``set_name``.

    Args:
        set_name: Name of the run directory.  Used to locate ``params.yaml``
            under ``working_dir`` and as the prefix of every output path.
        golden_set: Flag converted with ``str_to_bool``.  When true, 10
            randomly chosen hosts above ``gas_giant_mass`` are relabelled as
            non-hosts before training and their results are saved to a
            separate CSV.
        input_file: Path of the CSV file to load.  The code reads the
            columns ``HIP``, ``Exo``, ``Multi``, ``MaxPMass`` and the
            configured feature columns.
    """
    golden = str_to_bool(golden_set)

    #-------------------------------------------------------------------------

    # directory of the run
    data_dir = set_name

    # read in the parameters file; the handle is closed as soon as it is parsed
    full_path = os.path.join(working_dir, "{0}".format(data_dir),
                             'params.yaml')
    with open(full_path, 'r') as stream:
        parameters = yaml.load(stream, Loader=yaml.FullLoader)

    # read in the input data as a pandas dataframe
    df = pd.read_csv(input_file)

    #-------------------------------------------------------------------------

    if golden:
        # Relabel 10 random hosts above the mass cut as non-hosts and
        # remember which rows (and HIP numbers) were changed.
        df2 = df.copy()
        hidden_hosts = df2[(df2['Exo'] == 1)
                           & (df2['MaxPMass'] > parameters['gas_giant_mass'])
                           ].sample(10,
                                    random_state=np.random.RandomState()).index
        df2.loc[hidden_hosts, 'Exo'] = 0
        non_hosts_after = df2.loc[df2['Exo'] == 0].index
        non_hosts_before = set(df.loc[df['Exo'] == 0].index)
        changed = [
            ind for ind in non_hosts_after if ind not in non_hosts_before
        ]
        changedhips = [df['HIP'][ind] for ind in changed]
        df = df2
    #-------------------------------------------------------------------------

    df.index = df['HIP']
    df['Exo'] = df['Exo'].astype('category')  #category = limited possibilities
    df['Multi'] = df['Multi'].astype('category')
    # np.number is an abstract type; use a concrete float dtype instead
    df['MaxPMass'] = df['MaxPMass'].astype(float)
    df['Sampled'] = np.zeros((df.shape[0]))
    df['Predicted'] = np.zeros((df.shape[0]))
    df = df.drop(['HIP'], axis=1)

    # Print a bunch of stuff in terminal
    print('Parameters used in simulation:')
    print('------------------------------')
    print('')

    for key, value in parameters.items():
        print('{0} = {1}'.format(key, value))

    cv_folds = parameters['cv_folds']
    early_stopping_rounds = parameters['early_stopping_rounds']
    N_iterations = parameters['N_iterations']
    N_samples = parameters['N_samples']
    gas_giant_mass = parameters['gas_giant_mass']
    features = parameters['features']

    relevant_columns = features + ['Exo', 'MaxPMass', 'Sampled', 'Predicted']

    #Redefine dataframe with the "relevant columns" and remove nans if dropnans==True in yaml
    if parameters['dropnans']:
        df = df[relevant_columns].dropna()

    print('Number of samples used in simulation: {0}'.format(df.shape[0]))

    print('')

    #Define the confusion matrix and other arrays
    cfm = np.zeros((2, 2))

    auc_score_train = []
    precision_score_train = []
    feat_imp_train = pd.DataFrame(columns=features)
    # Bound up front so the save step below still works with zero iterations.
    feat_imp_train_normal = pd.DataFrame(columns=features)
    probabilities_total = pd.DataFrame(index=df.index)

    print('iteration \t estimators')
    print('---------------------------')

    #---------------------------XGBOOST LOOP----------------------------------------------

    # Loop for all of the iterations (defined in yaml)
    for iteration in range(0, N_iterations):

        # random sample of N_samples hosts above the mass cut
        df_iter_with_exo = df[(df['Exo'] == 1)
                              & (df['MaxPMass'] > gas_giant_mass)].sample(
                                  N_samples,
                                  random_state=np.random.RandomState())
        # random sample of N_samples non hosts
        df_iter_none_exo = df[df['Exo'] == 0].sample(
            N_samples, random_state=np.random.RandomState())

        # training set = both samples together
        df_train = pd.concat([df_iter_with_exo, df_iter_none_exo], axis=0)
        # stars NOT in the training set (to predict on)
        df_predict = df[~df.index.isin(df_train.index)]

        # The train dataframe with everything but the Exo column
        X = df_train.drop(['Exo'], axis=1)
        # The Exo column (indexed by HIP)
        Y = df_train.Exo

        # Note: Using gbtree booster
        alg = XGBClassifier(
            learning_rate=0.1,  # shrinkage applied at each boosting step
            n_estimators=1000,  # upper bound on boosted trees; reduced by CV below
            max_depth=6,  # max depth of a tree
            min_child_weight=1,  # min weight needed to continue leaf partitioning
            gamma=0,  # minimum loss reduction required to split a leaf
            subsample=0.8,  # subsample ratio of the training set
            colsample_bytree=0.8,  # subsample ratio of columns per tree
            objective='binary:logistic',  # binary classification, outputs probability
            nthread=1,  # originally 8, reduced because of issues on a laptop
            scale_pos_weight=1,  # balance of positive and negative weights
            seed=27)  # random number seed

        #get input parameters of algorithm
        xgb_param = alg.get_xgb_params()

        #construct training set matrix
        xgtrain = xgb.DMatrix(X[features].values, label=Y)

        #cross validation (CV) of xgboost to choose the number of rounds
        cvresult = xgb.cv(xgb_param,
                          xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds,
                          metrics='auc',
                          early_stopping_rounds=early_stopping_rounds)

        alg.set_params(n_estimators=cvresult.shape[0])
        print(iteration, '\t \t', cvresult.shape[0])

        alg.fit(X[features], Y, eval_metric='auc')

        dtrain_predictions = alg.predict(X[features])
        dtrain_predprob = alg.predict_proba(X[features])[:, 1]

        feat_imp = alg.get_booster().get_fscore()
        # See how the algorithm performs on the training data
        auc_score = metrics.roc_auc_score(Y, dtrain_predprob)
        precision_score = metrics.precision_score(Y, dtrain_predictions)
        metric_score = metrics.confusion_matrix(Y, dtrain_predictions)

        # Feature scores weighted by (1 - null count / non-null count) of
        # each feature in the training set
        normalized_features = pd.DataFrame(
            (1 -
             df_train[features].isnull().sum() / df_train[features].count()) *
            pd.Series(feat_imp),
            columns=[iteration]).T

        # accumulate the per-iteration feature scores
        feat_imp_train = pd.concat([
            feat_imp_train,
            pd.DataFrame(feat_imp, columns=features, index=[iteration])
        ])
        # NOTE(review): this is rebuilt on every iteration, so only the
        # latest iteration's normalised row is kept -- confirm intended.
        feat_imp_train_normal = pd.concat(
            [feat_imp_train, normalized_features])
        auc_score_train.append(auc_score)
        precision_score_train.append(precision_score)
        cfm += metric_score

        # Predict the held-out stars once and reuse the result.
        held_out_predictions = alg.predict(df_predict[features])
        df.loc[df_predict.index, 'Sampled'] += 1
        df.loc[df_predict.index, 'Predicted'] += held_out_predictions
        # NOTE(review): 'Prob' holds predict() output here, not
        # predict_proba(); rows in this round's training set keep whatever
        # value an earlier round wrote -- confirm intended.
        df.loc[df_predict.index, 'Prob'] = held_out_predictions

        values = df['Prob']
        probabilities_total = pd.concat(
            [probabilities_total,
             pd.Series(values, name=str(iteration))],
            axis=1)

        # checkpoint every 10th iteration
        if iteration % 10 == 0:
            probabilities_total.to_pickle(
                '{0}/probabilities_total.pkl'.format(data_dir))

    #-------------------------------------------------------------------------

    # Average the confusion matrix, then normalise each row to sum to 1
    cfm /= N_iterations
    cfm[0] /= cfm[0].sum()
    cfm[1] /= cfm[1].sum()

    # Print confusion matrix
    print(np.round(cfm, 3))
    df['Prob'] = df['Predicted'] / df['Sampled']

    ###########-------------------Output List of Planets------------------------#########

    # One date/time tag shared by all CSV outputs of this run
    time_tag = str(datetime.today().strftime('-%h%d-%H%M'))

    #Find the stars with >90% probability of hosting a planet, with the Sampled, Predicted, and Prob columns
    planets = df[(df.Prob > .90)
                 & (df.Exo == 0)][['Sampled', 'Predicted', 'Prob']]
    print('Number of most probable planet hosts: {0}'.format(planets.shape[0]))

    #Sort the stars with predicted planets and save that file
    planetprobs = planets.sort_values(by='Prob', ascending=False)
    name = data_dir + '/figures/planet_probabilities' + time_tag + '.csv'
    planetprobs.to_csv(name)

    #Create a second list with every non-host star whose probability is above zero
    planets2 = df[(df.Prob > .0)
                  & (df.Exo == 0)][['Sampled', 'Predicted', 'Prob']]
    if golden:  #if 10 stars were randomly taken out
        result_columns = ['Sampled', 'Predicted', 'Prob']
        golden_rows = []
        for star in changedhips:  #loop over the relabelled hosts (defined at top)
            found = planets2.loc[planets2.index == star]
            if found.empty:
                # the star is not in planets2: emit a blank row carrying
                # the star name as its index
                found = pd.DataFrame(np.nan,
                                     index=pd.Index(
                                         [star], name=planets2.index.name),
                                     columns=result_columns)
            golden_rows.append(found)
        changeddf = pd.concat(golden_rows) if golden_rows else pd.DataFrame([])
        #Save golden set as a separate file with the date and time as a tag
        filename = data_dir + '/figures/goldenSetProbabilities' + time_tag + '.csv'
        changeddf.to_csv(filename, na_rep=" ")

    #Save the file with all of the probabilities
    planetprobs2 = planets2.sort_values(by='Prob', ascending=False)
    name2 = data_dir + '/figures/planet_probabilitiesAll' + time_tag + '.csv'
    planetprobs2.to_csv(name2)

    ###########------------------------Save Files------------------------##########
    print('Saving data files')

    #Save files
    feat_imp_train.to_pickle('{0}/features_train.pkl'.format(data_dir))
    feat_imp_train_normal.to_pickle(
        '{0}/features_train_normal.pkl'.format(data_dir))
    probabilities_total.to_pickle(
        '{0}/probabilities_total.pkl'.format(data_dir))
    df.to_pickle('{0}/df_info_all.pkl'.format(data_dir))

    np.save('{0}/auc_score_train.npy'.format(data_dir),
            np.array(auc_score_train))
    np.save('{0}/precision_score_train.npy'.format(data_dir),
            np.array(precision_score_train))
    np.save('{0}/cfm.npy'.format(data_dir), cfm)

    print('Simulation completed successfully.')
    if golden:
        print("Changed indices and HIP numbers:")
        print(changed)
        print(changedhips)
# To stay compatible with scikit-learn's GridSearchCV interface, xgboost
# provides model classes whose parameters are essentially the same as the
# native ones, except that three of them are named differently.
params['n_estimators'] = boost_num
for native_name, sklearn_name in (('eta', 'learning_rate'),
                                  ('alpha', 'reg_alpha'),
                                  ('lambda', 'reg_lambda')):
    params[sklearn_name] = params.pop(native_name)

# Parameters to search and their candidate values
search_param = {
    'max_depth': [3, 5],
    'min_child_weight': [2, 4]
}
# Drop the searched parameters from the fixed parameter set (a key that is
# not present is simply skipped)
for key in search_param:
    params.pop(key, None)

model = XGBClassifier()
model.set_params(**params)
# ``iid`` is no longer passed: scikit-learn removed the argument, and the
# un-weighted fold average requested by ``iid=False`` is now the only
# behaviour.
gridsearch = GridSearchCV(estimator=model,
                          param_grid=search_param,
                          scoring='roc_auc',
                          n_jobs=4,
                          cv=5)
gridsearch.fit(train_data, train_label)
print(gridsearch.cv_results_)
print(gridsearch.best_params_)
print(gridsearch.best_score_)

with open('best_param.pkl', 'wb') as f:
    pickle.dump(gridsearch.best_params_, f)


                                    train_size=n_train, random_state=123)
 # NOTE(review): truncated Python 2 fragment (print statement below); the
 # statement that builds ``sss_train`` and the end of step 3 are not visible.
 for idx, ignore in sss_train:
     # Training subset for this outer split.
     X_train = X[train_idx][idx]
     y_train = target[train_idx][idx]
     #
     # 2. For every parameter set, estimate the number of boosting rounds:
     #    fit with early stopping on each inner split and sum the best
     #    iterations, then divide the sum by n_iter_cv.
     sss_train_inner = StratifiedShuffleSplit(y_train, n_iter=n_iter_cv, test_size=.1,
                                              random_state=456)
     model = XGBClassifier(n_estimators=1000, max_depth=10, subsample=.8, seed=987)
     params_lst_optimized = []
     for params in xgb_params_lst:
         n_estimators = 0  # running sum of best_iteration over the inner splits
         for tr, va in sss_train_inner:
             X_tr, y_tr = X_train[tr], y_train[tr]
             X_va, y_va = X_train[va], y_train[va]
             model.set_params(**params)
             model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], eval_metric="mlogloss",
                       early_stopping_rounds=50, verbose=False)
             n_estimators += model.best_iteration
         # Copy so the caller's parameter dict is not modified.
         sc = params.copy()
         # NOTE(review): under Python 2 this is integer division when both
         # operands are ints -- confirm that is intended.
         sc.update({'n_estimators':n_estimators / n_iter_cv})
         params_lst_optimized.append(sc)
     print 'Step 2 Done.', datetime.now() - t0
     # 3. Refit on the full training subset for each optimised parameter set
     #    and each of n_iter_pred seeds, predicting class probabilities on
     #    X_test.
     model = XGBClassifier(max_depth=10, subsample=.8)
     for params in params_lst_optimized:
         for seed_train in range(100, 100+n_iter_pred):
             params.update({'seed':seed_train})
             model.set_params(**params)
             model.fit(X_train, y_train)
             pr = model.predict_proba(X_test)
Пример #28
0
def get_XgbClassifer(train_data,
                     train_target,
                     test_data,
                     feature_names,
                     parameters,
                     early_stopping_rounds,
                     num_folds,
                     eval_metric,
                     model_name='model',
                     stratified=True):
    '''
    Train an XGBoost classifier with K-fold cross-validation and save its
    out-of-fold and test predictions.

    For every fold the classifier is refitted with early stopping, the
    held-out rows receive their out-of-fold class probabilities, and the
    test-set probabilities are accumulated and finally averaged over the
    folds.  Probabilities are written to ``./cv`` and arg-max class labels
    to ``./sub``; per-fold feature importances are passed to
    ``save_importances``.

    :param train_data: training features; must be a numpy array (rows are
        selected with integer index arrays)
    :param train_target: training labels, indexable the same way
    :param test_data: test features
    :param feature_names: one name per feature column, used in the
        importance table
    :param parameters: dict of parameters applied with ``set_params``
    :param early_stopping_rounds: passed to ``fit``
    :param num_folds: number of CV folds
    :param eval_metric: built-in metric name or custom callable, passed to
        ``fit``
    :param model_name: tag used in the output file names
    :param stratified: use ``StratifiedKFold`` when true, else ``KFold``
    :return: the classifier as fitted on the last fold
    '''

    # Original author's note: setting num_class through ``parameters``
    # raised an unexplained "parameter does not exist" error, hence it is
    # set in the constructor.
    clf = XGBClassifier(num_class=n_class)
    clf.set_params(**parameters)

    # Accumulators
    oof_preds = np.zeros((train_data.shape[0], n_class))
    sub_preds = np.zeros((test_data.shape[0], n_class))
    fold_importances = []

    # K-fold
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds,
                                shuffle=True,
                                random_state=1234)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=1234)
    for n_flod, (train_index,
                 val_index) in enumerate(folds.split(train_data,
                                                     train_target)):
        train_X = train_data[train_index]
        val_X = train_data[val_index]
        train_Y = train_target[train_index]
        val_Y = train_target[val_index]

        # Early stopping watches the LAST entry of eval_set, so the held-out
        # fold must come last.  With only the training fold in the list the
        # stopping criterion was evaluated on the training data itself.
        watchlist = [(train_X, train_Y), (val_X, val_Y)]
        clf.fit(train_X,
                train_Y,
                early_stopping_rounds=early_stopping_rounds,
                eval_set=watchlist,
                eval_metric=eval_metric)
        # Out-of-fold probabilities for the held-out rows
        oof_preds[val_index] = clf.predict_proba(val_X)
        # Sum the test probabilities here; divided by the fold count below
        sub_preds += clf.predict_proba(test_data)
        # NOTE(review): the value is a mean absolute error although the
        # message labels it "macro-f1" -- confirm which one is intended.
        result = mean_absolute_error(val_Y, clf.predict(val_X))
        print('Fold %2d macro-f1 : %.6f' % (n_flod + 1, result))
        gc.collect()

        # Feature importance of this fold, as a percentage of the total.
        # Original author's note: the default type is gain; change
        # importance_type in the parameters to use another one.
        gain = clf.feature_importances_
        fold_importances.append(
            pd.DataFrame({
                'feature': feature_names,
                'gain': 100 * gain / gain.sum(),
                'fold': n_flod,
            }).sort_values('gain', ascending=False))

    feature_importance_df = pd.concat(fold_importances, axis=0)

    # Save the class probabilities
    sub_preds = sub_preds / folds.n_splits
    os.makedirs('./cv', exist_ok=True)
    class_columns = ['class_' + str(i) for i in range(n_class)]
    pd.DataFrame(oof_preds, columns=class_columns).to_csv(
        './cv/val_prob_{}.csv'.format(model_name),
        index=False,
        float_format='%.4f')
    pd.DataFrame(sub_preds, columns=class_columns).to_csv(
        './cv/test_prob_{}.csv'.format(model_name),
        index=False,
        float_format='%.4f')

    # Save the arg-max class labels
    oof_labels = np.argmax(oof_preds, axis=1)
    sub_labels = np.argmax(sub_preds, axis=1)
    os.makedirs('./sub', exist_ok=True)
    pd.DataFrame(oof_labels, columns=['class']).to_csv(
        './sub/val_{}.csv'.format(model_name), index=False)
    pd.DataFrame(sub_labels, columns=['class']).to_csv(
        './sub/test_{}.csv'.format(model_name), index=False)

    save_importances(feature_importance_df, model_name)
    return clf
Пример #29
0
# %%
# 4.2. tuning parameters
# Every column except the target column 'label' is used as a predictor.
predictors = [x for x in df.columns if x != 'label']
clf = XGBClassifier(learning_rate=0.1,
                    n_estimators=1000,
                    max_depth=5,
                    min_child_weight=1,
                    gamma=0,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective='binary:logistic',
                    scale_pos_weight=1)
modelfit(clf, df2, predictors, cv_folds=10)

# %%
# Hard-coded number of boosting rounds for the following searches.
clf.set_params(n_estimators=193)

# %%
# Grid search over tree depth and minimum child weight.  ``iid`` is no
# longer passed: scikit-learn removed the argument, and the un-weighted fold
# average requested by ``iid=False`` is now the only behaviour.
param1 = {'max_depth': range(3, 12, 2), 'min_child_weight': range(1, 6, 2)}
gs1 = GridSearchCV(estimator=clf,
                   param_grid=param1,
                   scoring='f1_weighted',
                   n_jobs=-1,
                   cv=10)
gs1.fit(X, y)
print(gs1.best_params_)
print(gs1.best_score_)

# %%
# Hard-coded values for the two parameters searched above.
clf.set_params(max_depth=7, min_child_weight=1)
Пример #30
0
def fit_xgboost(param_grid, param_table, train, col_type, find_n_estimator=False,
                cv_iterations=5, cv_folds=5, nthread=3, seed=1, verbose=0):
    """Score every row of ``param_table`` with repeated stratified K-fold CV.

    Each row of ``param_table`` is one parameter combination.  For every
    combination the model is cross-validated ``cv_iterations`` times with
    ``cv_folds`` folds; the out-of-fold probabilities are averaged per ID
    and scored with ROC-AUC.  ``param_table`` gains the columns ``Score``,
    ``Score_Std`` and ``Score_Weighted`` (``Score - 0.1 * Score_Std``), and
    every entry of ``param_grid`` is replaced by a one-element list holding
    the value from the best-weighted row.

    Args:
        param_grid: Dict of parameter name -> candidate values; modified in
            place and returned.
        param_table: DataFrame with one column per parameter; modified in
            place.
        train: Training DataFrame.
        col_type: Dict with the keys ``'target'``, ``'features'`` and
            ``'ID'`` naming the respective column(s) of ``train``.
        find_n_estimator: When true, first choose ``n_estimators`` with
            ``xgb.cv`` early stopping and store it in ``param_table``.
        cv_iterations: Number of CV repetitions (the repetition number is
            used as the shuffle seed).
        cv_folds: Number of folds per repetition.
        nthread: Thread count passed to the model.
        seed: Random seed passed to the model and to ``xgb.cv``.
        verbose: When 1, print each scored row.

    Returns:
        Tuple ``(param_grid, pred_return)`` where ``pred_return`` maps
        ``'Pred_<row label>'`` to the per-ID mean out-of-fold probability.
    """
    target = col_type['target']
    features = col_type['features']
    ID = col_type['ID']

    pred_return = {}
    for row in param_table.itertuples(index=True, name='NamedTuple'):
        params = row._asdict()
        index = params.pop('Index')  # row label; not a model parameter

        params['objective'] = 'binary:logistic'
        params['nthread'] = nthread
        params['random_state'] = seed
        params['seed'] = seed
        params['silent'] = True

        xgb_model = XGBClassifier()
        xgb_model.set_params(**params)

        if find_n_estimator:
            xgb_train = xgb.DMatrix(train[features], label=train[target])
            cv_result = xgb.cv(
                xgb_model.get_xgb_params(),
                xgb_train,
                num_boost_round=int(params['n_estimators']),
                nfold=cv_folds,
                metrics='auc',
                early_stopping_rounds=50,
                seed=seed)

            best_n_estimator = cv_result.shape[0]
            param_table.at[index, 'n_estimators'] = best_n_estimator
            xgb_model.set_params(n_estimators=best_n_estimator)

        scores = []
        repeat_preds = []
        for cv_index in range(cv_iterations):
            # ID column plus an empty prediction column for this repetition
            pred = train.loc[:, [ID]].copy()
            pred['Pred'] = np.nan
            pred_col = pred.columns.get_loc('Pred')
            # k-fold cross validation
            skf = StratifiedKFold(n_splits=cv_folds, random_state=cv_index, shuffle=True)

            for train_index, dev_index in skf.split(train[features].values, train[target].values):
                X_train = train[features].iloc[train_index].values
                y_train = train[target].iloc[train_index].values

                X_dev = train[features].iloc[dev_index].values
                y_dev = train[target].iloc[dev_index].values

                # Fit the algorithm on train folds
                xgb_model.fit(X_train, y_train, eval_metric='auc')

                # Predict on dev fold.  ``dev_index`` holds row POSITIONS,
                # so write with iloc; ``.at`` is a scalar, label-based
                # accessor and is not meant for arrays of positions.
                pred_dev = xgb_model.predict_proba(X_dev)[:, 1]
                pred.iloc[dev_index, pred_col] = pred_dev

                # Compute the score
                scores.append(metrics.roc_auc_score(y_dev, pred_dev))

            repeat_preds.append(pred)

        pred_all = pd.concat(repeat_preds, axis=0)

        pred_mean = pred_all.groupby(ID)['Pred'].mean()  # avg predict_proba for each ID
        # groupby sorts by ID, so the labels are sorted by ID as well
        score = metrics.roc_auc_score(train.sort_values(ID)[target].values,
                                      pred_mean)  # use avg pred to compute auc score
        pred_return['Pred_' + str(index)] = pred_mean  # store the pred result for use in stacking

        param_table.at[index, 'Score'] = score
        param_table.at[index, 'Score_Std'] = np.std(scores)

        if verbose == 1:
            # ``index`` is a row label, so look it up with loc
            print('{} : {}'.format(index, param_table.loc[index, :]))

    param_table["Score_Weighted"] = param_table["Score"] - 0.1 * param_table["Score_Std"]

    # update_param_grid
    best_param_index = param_table["Score_Weighted"].idxmax()
    print("Param_grid size: {}".format(param_table.shape[0]))
    print("Current Score: {},  Score_Std: {}".format(param_table.loc[best_param_index, "Score"],
                                                     param_table.loc[best_param_index, "Score_Std"]))
    print("--------------------------")
    for param in param_grid:
        best_param = param_table.loc[best_param_index, param]
        if isinstance(param_grid[param], list):
            # only report parameters whose candidate list actually changed
            if len(param_grid[param]) > 1 or (len(param_grid[param]) == 1 and param_grid[param][0] != best_param):
                print("{}: tuned to {}".format(param, best_param))
        else:
            print("{}: tuned to {}".format(param, best_param))
        param_grid[param] = [best_param]

    return param_grid, pred_return
Пример #31
0
def modelfit(train,
             labels,
             test,
             features,
             useTrainCV=True,
             cv_folds=5,
             early_stopping_rounds=50):
    """Fit an XGBoost binary classifier and return test-set probabilities.

    ``train``/``labels`` are split 80/20 into a fitting part and a hold-out
    part.  The number of boosting rounds is chosen with ``xgb.cv`` on the
    fitting part, the model is fitted with that number, the hold-out ROC-AUC
    is printed, and the feature scores are drawn as a bar plot.

    Args:
        train: Training DataFrame.
        labels: Training labels aligned with ``train``.
        test: DataFrame to predict.
        features: Columns used for cross-validation, fitting and
            prediction.
        useTrainCV: Accepted for interface compatibility; currently ignored
            (the cross-validation step always runs).
        cv_folds: Number of folds for ``xgb.cv``.
        early_stopping_rounds: Early-stopping patience for ``xgb.cv``.

    Returns:
        Array with the predicted probability of the positive class for
        every row of ``test``.
    """
    model = XGBClassifier(learning_rate=0.2,
                          n_estimators=1000,
                          max_depth=5,
                          min_child_weight=1,
                          gamma=0,
                          subsample=0.8,
                          colsample_bytree=0.8,
                          objective='binary:logistic',
                          scale_pos_weight=1,
                          seed=27)

    X_train, X_test, y_train, y_test = train_test_split(train,
                                                        labels,
                                                        test_size=0.2,
                                                        random_state=23)

    # Choose the number of boosting rounds by cross-validation.
    xgtrain = xgb.DMatrix(X_train[features], y_train)
    cvresult = xgb.cv(model.get_xgb_params(),
                      xgtrain,
                      num_boost_round=model.get_params()['n_estimators'],
                      nfold=cv_folds,
                      metrics='auc',
                      early_stopping_rounds=early_stopping_rounds)
    print("n_estimators=")
    print(cvresult.shape[0])
    model.set_params(n_estimators=cvresult.shape[0])

    # Fit on the same feature columns the cross-validation used; fitting on
    # every column would make the tuned round count refer to different data.
    model.fit(X_train[features], y_train)

    # hold-out predictions
    proba = model.predict_proba(X_test[features])
    preds = proba[:, 1]
    score = roc_auc_score(y_test, preds)
    print("Area under ROC {0}".format(score))

    # ``booster()`` is no longer a method of the scikit-learn wrapper;
    # ``get_booster()`` returns the underlying Booster.
    feat_imp = pd.Series(
        model.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

    # test predictions
    test_proba = model.predict_proba(test[features])
    test_preds = test_proba[:, 1]

    return test_preds
Пример #32
0
            'colsample_bytree': 0.7,  # 生成树时进行的列采样
            'min_child_weight': 3,
            # 这个参数默认是 1,是每个叶子里面 h 的和至少是多少,对正负样本不均衡时的 0-1 分类而言
            # ,假设 h 在 0.01 附近,min_child_weight 为 1 意味着叶子节点中最少需要包含 100 个样本。
            # 这个参数非常影响结果,控制叶子节点中二阶导的和的最小值,该参数值越小,越容易 overfitting。
            'silent': 0,  # 设置成1则没有运行信息输出,最好是设置为0.
            # //无效'eta': 0.007, # 如同学习率
            'seed': 1000,
            'nthread': 7,  # cpu 线程数
            # 'eval_metric': 'auc'
        }

        # Train the model
        model = XGBClassifier()  # build the model
        model.get_params()  # fetch the parameters (the return value is discarded)
        model.set_params(**params)  # apply the parameters
        # Start training
        model.fit(aTrain_X, aTrain_Y, eval_metric='auc')

        # Save the model
        score0 = 0  # model.score(aTrain_X, aTrain_Y)
        score1 = model.score(aTest_X, aTest_Y)
        # Only models scoring above 0.745 on the test split are pickled; the
        # rounded score becomes part of the file name.
        if score1 > 0.745:
            # NOTE(review): the file object passed to pickle.dump is never
            # closed explicitly.
            pickle.dump(
                model,
                open(
                    '{}/qa_data/pre_trained_models/xgboost_qaquality_21_60dz_s{}.pkl'
                    .format(cur_dir, round(score1, 3)), 'wb'))
            print('====> yes found good xgboost model')
        # print(i+1, score)  # print the accuracy of each training round
        # Print accuracy and recall
                     colsample_bytree=0.8,
                     seed=1)

# Choose the number of boosting rounds for xgb1 with 5-fold CV on AUC.
xgb_param = xgb1.get_xgb_params()
xgtrain = xgb.DMatrix(x_train_ss, label=y_train)
cvresult = xgb.cv(xgb_param,
                  xgtrain,
                  num_boost_round=xgb1.get_params()['n_estimators'],
                  nfold=5,
                  metrics='auc',
                  early_stopping_rounds=50,
                  verbose_eval=10)

best_rounds = cvresult.shape[0]
print('n_estimators', best_rounds)
# Select the test metric by column name: the column order of the xgb.cv
# result differs between xgboost versions, so positional column 0 is not
# guaranteed to be the test AUC.
print('test-auc:', cvresult['test-auc-mean'].iloc[-1])
xgb1.set_params(n_estimators=best_rounds)
print('model', xgb1)

#n_estimators 137
#test-auc: 0.8731962

# Grid of tree-depth / child-weight candidates for the next tuning step.
tuned_parameters = {
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
    'min_child_weight': [1, 2, 3, 4, 5, 6]
}
xgb2 = XGBClassifier(learning_rate=0.1,
                     n_estimators=137,
                     max_depth=5,
                     min_child_weight=1,
                     gamma=0,
                     subsample=0.8,
Пример #34
0
    xgb_param,
    xgtrain,
    num_boost_round=1000,  #model.get_params()['n_estimators'], 
    nfold=5,
    metrics='merror',
    early_stopping_rounds=50,
    stratified=True)

# Show the per-round mean classification error from the xgb.cv history.
print('\ntraining error')
print(cvresult['train-merror-mean'])
print('\nvalidation error')
print(cvresult['test-merror-mean'])

# Plot both error curves against the boosting round.
cvresult[['train-merror-mean', 'test-merror-mean']].plot()

# Use the number of rounds in the CV history as n_estimators.
model.set_params(n_estimators=cvresult.shape[0])

# Fit the algorithm on the data
model.fit(X_train, y_train, eval_metric='merror')

# Predict the test set (labels, and the probability in column 1)
predictions = model.predict(X_test)
predprob = model.predict_proba(X_test)[:, 1]

# Print model report:
print("\nModel Report")
print("Training Accuracy : %.4g" %
      metrics.accuracy_score(y_train, model.predict(X_train)))
print("Testing Accuracy : %.4g" %
      metrics.accuracy_score(y_test, model.predict(X_test)))