Example #1
def load_architecture():

    ada_params_filename = logger.config_dict['BEST_ADA_L']
    logger.log(
        "Loading params for ADA from {} ...".format(ada_params_filename))
    with open(logger.get_model_file(ada_params_filename, "large")) as fp:
        ada_best_params = json.load(fp)

    ada_model = AdaBoostClassifier(DecisionTreeClassifier())
    ada_model.set_params(**ada_best_params)

    xgb_params_filename = logger.config_dict['BEST_XGB_L']
    logger.log(
        "Loading params for XGB from {} ...".format(xgb_params_filename))
    with open(logger.get_model_file(xgb_params_filename, "large")) as fp:
        xgb_best_params = json.load(fp)

    xgb_model = XGBClassifier()
    xgb_model.set_params(**xgb_best_params)

    ensemble_weights = [0.5, 0.5]

    comb_model = VotingClassifier(estimators=[('ADA', ada_model),
                                              ('XGB', xgb_model)],
                                  voting='soft',
                                  weights=ensemble_weights,
                                  n_jobs=-1)

    logger.log("Finish loading best architecture {}".format(comb_model))

    return comb_model
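
A minimal usage sketch (not part of the original example): assuming X_train, y_train and X_test already exist, the returned soft-voting ensemble behaves like any scikit-learn classifier.

ensemble = load_architecture()
ensemble.fit(X_train, y_train)            # fits both the ADA and XGB members
proba = ensemble.predict_proba(X_test)    # soft voting: weighted average of member probabilities
labels = ensemble.predict(X_test)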
Example #2
def xgmethod(X,Y):
  
    # split data into train and test sets
    seed = 7
    test_size = 0.3
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

    # standardise features; keep the transform result and apply the same scaler to both splits
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    # XGBoost DMatrix for the native cv interface
    xgtrain = xgb.DMatrix(X_train, label=y_train)
    
   
    model = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=100, objective='binary:logistic')
    xgb_param = model.get_xgb_params()

    print('Start cross validation')
    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=500, nfold=10, metrics=['auc'],
                      early_stopping_rounds=50, stratified=True, seed=1301)
    print('Best number of trees = {}'.format(cvresult.shape[0]))
    
    model.set_params(n_estimators=cvresult.shape[0])
    print('Fit on the training data')
    model.fit(X_train, y_train, eval_metric='auc')
   
    pred = model.predict(X_test, ntree_limit=cvresult.shape[0])
    
  
    # make predictions for test data
    predictions = [round(value) for value in pred]
   
    # evaluate predictions
    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    return accuracy
Example #3
def xgboost(X_train, X_test, y_train, y_test, **kwargs):
    model = XGBClassifier(random_state=9)
    model.set_params(**kwargs)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy
Example #4
def xgb_classifier(X_train, X_test, y_train, y_test, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    alg = XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=5,
                        min_child_weight=3, gamma=0.2, subsample=0.6, colsample_bytree=1.0,
                        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27)
    cvresult = None
    if useTrainCV:
        print("Start Feeding Data")
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X_train.values, label=y_train.values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          early_stopping_rounds=early_stopping_rounds)
        display(cvresult)
        # keep the optimal number of boosting rounds found by cv
        alg.set_params(n_estimators=cvresult.shape[0])

    print('Start Training')
    alg.fit(X_train, y_train, eval_metric='auc')
    print("Start Predicting")
    predictions = alg.predict(X_test)
    pred_proba = alg.predict_proba(X_test)[:, 1]

    # Model performance
    print("\nModel statistics")
    print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
    print("AUC score (test set): %f" % metrics.roc_auc_score(y_test, pred_proba))
    print("F1 Score (test set): %f" % metrics.f1_score(y_test, predictions))

    feat_imp = alg.feature_importances_
    feat = X_train.columns.tolist()
    res_df = pd.DataFrame({'Features': feat, 'Importance': feat_imp}).sort_values(by='Importance', ascending=False)
    res_df.plot('Features', 'Importance', kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.show()
    print(res_df)
    print(res_df["Features"].tolist())
    return cvresult, alg
Example #5
def return_classifier(classifier, classifier_params):
    """
    Returns classifier object based on name
    """
    # Max Features parameter for RandomForest and DecisionTree
    cp = classifier_params.copy()
    if classifier in ['LogisticRegression', 'KNeighborsClassifier','RandomForest']:
        cp['n_jobs'] = -1

    if classifier == 'LinearSVC':
        cv_generator = cp['cv_generator']
    else:
        cv_generator = None

    if classifier == 'XGBoost':
        from xgboost import XGBClassifier
        clf = XGBClassifier()
    elif classifier == 'LogisticRegression':
        clf = linear_model.LogisticRegression()
    elif classifier == 'KNeighborsClassifier':
        del cp['random_state']
        clf = neighbors.KNeighborsClassifier()
    elif classifier == 'RandomForest':
        clf = ensemble.RandomForestClassifier()
    elif classifier == 'DecisionTree':
        clf = tree.DecisionTreeClassifier()
    elif classifier == 'AdaBoost':
        clf = ensemble.AdaBoostClassifier()
    elif classifier == 'LinearSVC':
        del cp['cv_generator']
        clf = svm.LinearSVC()

    clf.set_params(**cp)
    return clf, cv_generator
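
A short, hypothetical call for illustration (the parameter values below are made up; only keys accepted by the chosen estimator should be passed):

clf, cv_generator = return_classifier('XGBoost',
                                      {'n_estimators': 200,
                                       'max_depth': 4,
                                       'random_state': 0})
# cv_generator is only set for 'LinearSVC'; it is None here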
Example #6
def get_xgb_feature_importance_plot(best_param_, experiment_,
                                    png_folder,
                                    png_fname,
                                    score_threshold=0.8):

    # 1. fit a classifier with the best parameters on the training data
    train_X, train_y = experiment_.get_train_data()
    clf = XGBClassifier()
    best_param_.pop('model_type', None)
    clf.set_params(**best_param_)
    clf.fit(train_X, train_y)
    index2feature = clf.get_booster().get_fscore()
    fis = pd.DataFrame({'name': list(index2feature.keys()),
                        'score': list(index2feature.values())})
    fis = fis.sort_values('score', ascending=False)
    if len(fis.index) > 20:
        score_threshold = fis['score'][fis['score'] > 0.0].quantile(score_threshold)
        #where_str = 'score > %f & score > %f' % (score_threshold, 0.0)
        where_str = 'score >= %f' % (score_threshold)
        fis = fis.query(where_str)

    # 2. plot
    #gs = GridSpec(2,2)
    #ax1 = plt.subplot(gs[:,0])
    #ax2 = plt.subplot(gs[0,1])
    #ax3 = plt.subplot(gs[1,1])

    # 3.1 feature importance
    sns.barplot(x = 'score', y = 'name',
                data = fis,
                #ax=ax1,
                color="blue")
    #plt.title("Feature_Importance", fontsize=10)
    plt.ylabel("Feature", fontsize=10)
    plt.xlabel("Feature_Importance : f-Score", fontsize=10)

    """
    # 3.2 PDF
    confidence_score = clf.oob_decision_function_[:,1]
    sns.distplot(confidence_score, kde=False, rug=False, ax=ax2)
    ax2.set_title("PDF")

    # 3.3 CDF
    num_bins = min(best_param_.get('n_estimators',1), 100)
    counts, bin_edges = np.histogram(confidence_score, bins=num_bins, normed=True)
    cdf = np.cumsum(counts)
    ax3.plot(bin_edges[1:], cdf / cdf.max())
    ax3.set_title("CDF")
    ax3.set_xlabel("Oob_Decision_Function:Confidence_Score", fontsize=10)
    """

    png_fname = os.path.join(Config.get_string('data.path'), 'graph', png_fname)
    plt.tight_layout()
    plt.savefig(png_fname)#, bbox_inches='tight', pad_inches=1)
    plt.close()

    return True
Example #7
def xgbclf(params, X_train, y_train, X_test, y_test):

    eval_set = [(X_train, y_train), (X_test, y_test)]

    model = XGBClassifier(**params)
    model.fit(X_train, y_train, eval_set=eval_set,
              eval_metric='auc', early_stopping_rounds=100, verbose=100)

    # refit on the full training set with the best number of trees found by early stopping
    model.set_params(n_estimators=model.best_ntree_limit)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test, ntree_limit=model.best_ntree_limit)

    abclf_cm = confusion_matrix(y_test, y_pred)
    print(abclf_cm)
    print(abclf_cm[0][0] / (abclf_cm[0][0] + abclf_cm[1][0]))
    print(classification_report(y_test, y_pred))
    print('\n')
    print("Model Final Generalization Accuracy: %.6f" %
          accuracy_score(y_test, y_pred))

    y_pred_proba = model.predict_proba(X_test,
                                       ntree_limit=model.best_ntree_limit)[:,
                                                                           1]
    get_roc(y_test, y_pred_proba)
    return model
Example #8
def xgboost(X_train, X_test, y_train, y_test, **kwargs):
    xgb1 = XGBClassifier(seed=9)
    if kwargs:
        xgb1.set_params(**kwargs)
    xgb1.fit(X_train, y_train)

    y_pred = xgb1.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy
Example #9
def xgboost(X_train, X_test, y_train, y_test, **kwargs):
    model = XGBClassifier(seed=9)
    model.set_params(**kwargs)
    #ac,bst=myXGBoost(X_train, X_test, y_train, y_test,model,param_grid1,KFold=3)
    #h,j=param2(X_train, X_test, y_train, y_test,model,param_grid2)
    #return h,j
    #print model
    model.fit(X_train, y_train)
    d = model.predict(X_test)
    a = accuracy_score(y_test, d)
    return a
Example #10
def train_xgb(X, y, params, save_path=None, save_path_booster=None):

    # the threshold is not handled by XGB interface
    params, binary_threshold = _parse_param_and_delete(params,
                                                       'binary_threshold', .5)

    # n_jobs is handled by XGB SKL interface
    params = _parse_param_and_keep(params,
                                   name='n_jobs',
                                   default=min(max_cpu_count(), 24))

    X = np.asarray(X)
    y = np.asarray(y).flatten()

    if not tuple(np.sort(np.unique(y))) == (0, 1):
        raise NotImplementedError(
            'XGB Wrapper currently only supports binary classification.')

    # Fit the model
    model = XGBClassifier(use_label_encoder=False, )
    model = clone(model)
    model.set_params(**params)

    logging.info('Training...')
    model.fit(
        X,
        y,
        # early_stopping_rounds=10,
        verbose=True,
    )
    # Save and re-load (feature-agnostic model)
    temp_file = f'temp-{time.time()}-{random.random()}.bin'
    model.get_booster().save_model(temp_file)
    booster = Booster(model_file=temp_file)
    os.remove(temp_file)

    if binary_threshold == 'auto':
        p_ = booster.predict(DMatrix(X))
        p_ = np.sort(p_)
        binary_threshold = p_[int((y == 0).sum())]

    logging.info(f'Using a binary_threshold = {binary_threshold}')

    # Wrap
    model = XGBClassifierSKLWrapper(booster,
                                    features=X.shape[1],
                                    threshold=binary_threshold)

    # Save
    if save_path is not None:
        save_pickle(model, save_path)
    if save_path_booster is not None:
        save_pickle(model.get_booster(), save_path_booster)
    return model
Example #11
class Hyperopt_xbc:
    def __init__(self, X, y, seed):
        self.name = 'XGBoost'
        self.name_short = 'XBC'
        self.X = X
        self.y = y
        self.seed = seed
        self.clf = None
        self.best_acc = 0
        self.space = {
            'objective': 'binary:logistic',
            'max_depth': hp.choice('max_depth', range(5, 30, 1)),
            'learning_rate': hp.quniform('learning_rate', 0.01, 0.5, 0.01),
            'n_estimators': hp.choice('n_estimators', range(10, 500, 10)),
            'booster': hp.choice('booster', ['gbtree', 'gblinear', 'dart']),
            'gamma': hp.quniform('gamma', 0, 0.50, 0.01),
            'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
            'subsample': hp.quniform('subsample', 0.1, 1, 0.01),
            'colsample_bytree': hp.quniform('colsample_bytree', 0.1, 1.0, 0.01)
        }
        self.max_evals = 50

    def train_test(self, params):
        warnings.filterwarnings(action='ignore', category=DeprecationWarning)
        self.clf = XGBClassifier(**params)
        self.clf.fit(self.X, self.y)
        return cross_val_score(self.clf,
                               self.X,
                               self.y,
                               scoring='roc_auc',
                               cv=10).mean()

    def f(self, params):
        acc = self.train_test(params)
        if acc > self.best_acc:
            self.best_acc = acc
        return {'loss': -acc, 'status': STATUS_OK}

    def best(self):
        trials = Trials()
        best = fmin(self.f,
                    self.space,
                    algo=tpe.suggest,
                    max_evals=self.max_evals,
                    rstate=np.random.RandomState(self.seed),
                    trials=trials)
        best_params = space_eval(self.space, best)
        self.clf.set_params(**best_params)
        return self.clf, self.name, self.name_short, best_params, self.best_acc
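
A minimal sketch of driving this tuner (hyperopt installed; X and y assumed to be the training features and labels):

tuner = Hyperopt_xbc(X, y, seed=42)
clf, name, name_short, best_space, best_acc = tuner.best()
print(name, best_space, best_acc)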
Example #12
    def training(self):
        """
        Training is done at each max_depth loop.
        XGBoost's cv is used to find the optimal number of trees (estimators) at each depth, up to 1000 trees.
        Once the training result has not improved for 50 rounds, training stops. The number of trees from the
        last round is then used to refit the model on the train and test sets, and metrics are measured against
        this XGB model.
        """

        max_depth = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        best_depth = 0
        best_estimator = 0
        max_score = 0
        for md in max_depth:
            model = XGBClassifier(learning_rate=0.3, n_estimators=1000, max_depth=md, min_child_weight=1,
                                  gamma=1, subsample=1, colsample_bytree=0.1, reg_lambda=0, reg_alpha=1,
                                  random_state=42)
            xgb_param = model.get_xgb_params()
            xgtrain = xgboost.DMatrix(self.Xtrain.values, label=self.ytrain.values)

            cvresult = xgboost.cv(xgb_param, xgtrain, num_boost_round=1000, early_stopping_rounds=50,
                                  nfold=8, metrics='auc', stratified=True, shuffle=True, seed=42,
                                  verbose_eval=False)
            print("There are {} trees in the XGB model. CV-mean: {:.4f}, CV-std: {:.4f}.".format(
                cvresult.shape[0], cvresult.iloc[cvresult.shape[0] - 1, 0],
                cvresult.iloc[cvresult.shape[0] - 1, 1]))
            n = cvresult.shape[0]
            model.set_params(n_estimators=n)
            model.fit(self.Xtrain,
                      self.ytrain,
                      eval_metric=self._metric,
                      eval_set=[(self.Xtrain, self.ytrain), (self.Xtest, self.ytest)],
                      verbose=False)
            y_pred = model.predict(self.Xtest)
            score = accuracy_score(self.ytest, y_pred)
            mse = mean_squared_error(self.ytest, y_pred)

            if score > max_score:
                max_score = score
                min_mse = mse
                best_depth = md
                best_estimator = n
                self.best_xgb = model
            print("Accuracy score: " + str(round(score, 4)) + " at depth: " + str(md) + " and estimator " + str(n))
            print("Mean square error: " + str(round(mse, 4)) + " at depth: " + str(md) + " and estimator " + str(n))
        print("Best score: " + str(round(max_score, 4)) + " Best MSE: " + str(round(min_mse, 4)) + " at depth: " + str(
            best_depth) + " and estimator of " + str(best_estimator))
Example #13
def generateXGBoostPrediction(train, test):
    print('\n##################\nXGBoost\n##################')
    features = [
        'orderfrequency', 'dayfrequency', 'days_without_product_order',
        'department_id', 'aisle_id', 'eval_days_since_prior_order',
        'numproductorders', 'totaluserorders', 'user_id', 'product_id'
    ]
    param = {}
    #param['booster'] = 'gbtree'
    param['objective'] = 'binary:logistic'
    # param["eval_metric"] = "error"
    # param['eta'] = 0.3
    # param['gamma'] = 0
    param['max_depth'] = 4
    param['n_estimators'] = 80
    param['learning_rate'] = 0.1
    # param['min_child_weight'] = 1
    # param['max_delta_step'] = 0
    #param['subsample'] = 1
    # param['colsample_bytree'] = 1
    # param['silent'] = 1
    # param['seed'] = 0
    #param['base_score'] = 0.4

    X_train = train[features]
    test = test[features]

    y_train = train['reordered']

    estimator = XGBClassifier()
    estimator.set_params(**param)
    metLearn = CalibratedClassifierCV(estimator, method='sigmoid', cv=5)
    metLearn.fit(X_train, y_train)
    y_pred = metLearn.predict(test)

    # estimator.fit(X_train, y_train)
    # y_pred = estimator.predict(test)
    print('Predict counter : %s' % (Counter(y_pred)))

    df = pd.DataFrame(columns=('user_id', 'product_id', 'predy'))
    df['user_id'] = test['user_id']
    df['product_id'] = test['product_id']
    df['predy'] = y_pred
    return df
Example #14
def final_xgb(X_train, y_train, X_test, y_test, scale_pos_weight, best_params,
              analysis):

    xgb = XGBClassifier(**best_params)
    xgb.set_params(n_jobs=4,
                   random_state=0,
                   objective='binary:logistic',
                   scale_pos_weight=scale_pos_weight)

    eval_set = [(X_train, y_train), (X_test, y_test)]
    eval_metric = ["error", "auc"]

    xgb.fit(X_train,
            y_train,
            eval_metric=eval_metric,
            eval_set=eval_set,
            verbose=0)

    results = xgb.evals_result()

    fig1, axes1 = plt.subplots(figsize=(10, 8), nrows=1, ncols=2)
    axes1[0].plot(results['validation_0']['error'], label='Train Error')
    axes1[0].plot(results['validation_1']['error'], label='Validation Error')
    axes1[0].set_title("Final XGBoost Error")
    axes1[0].set_xlabel("Iteration")
    axes1[0].set_ylabel("Error")
    axes1[0].legend()

    axes1[1].plot(results['validation_0']['auc'], label='Train AUC-ROC')
    axes1[1].plot(results['validation_1']['auc'], label='Validation AUC-ROC')
    axes1[1].set_title("Final XGBoost AUC-ROC")
    axes1[1].set_xlabel("Iteration")
    axes1[1].set_ylabel("AUC")
    axes1[1].legend()

    fig1.tight_layout()

    fig1.savefig(fig_dir + '/{}_final_xgb_model.png'.format(analysis),
                 format='png',
                 dpi=300,
                 transparent=False)

    return xgb
Example #15
def opt_BDT(input, output, params, show, names):

    model = XGBClassifier(**params)
    xgb_param = model.get_xgb_params()
    cvscores = []
    AUC = []
    X_train, X_test, y_train, y_test = train_test_split(input,
                                                        output,
                                                        test_size=0.2,
                                                        random_state=42)
    matrix_train = xgb.DMatrix(X_train, label=y_train)
    cvresult = xgb.cv(
        xgb_param,
        matrix_train,
        num_boost_round=model.get_params()["n_estimators"],
        nfold=5,
        metrics="auc",
        early_stopping_rounds=30,
        verbose_eval=True,
    )
    model.set_params(n_estimators=cvresult.shape[0])
    model.fit(X_train, y_train, eval_metric="auc")
    y_prob = model.predict_proba(X_test)
    y_pred = model.predict(X_test)
    prediction = [round(value) for value in y_pred]
    auc = roc_auc_score(y_test, y_prob[:, 1])
    accuracy = accuracy_score(y_test, prediction)

    print("Accuracy: %.2f%%; AUC = %.4f%" % (accuracy * 100, auc))
    if show:

        name = "channel_" + str(channel) + "_BDT"
        name = "%s_%s" % (name, selection)
        modelname = "models/%s.h5" % name
        print("Save to %s" % modelname)

        plotter.plot_separation(model, X_test, y_test, name, False)
        plotter.plot_ROC(model, X_test, y_test, name, False)
        model.get_booster().feature_names = names
        mp.rc("figure", figsize=(5, 5))
        plot_importance(model.get_booster())
        plt.subplots_adjust(left=0.3)
        plt.show()
Example #16
File: oofs.py  Project: Jie-Yuan/tql-Python
    def fit_predict(self, X_train, y_train, X_valid, y_valid, X_test,
                    **kwargs):
        clf = XGBClassifier()
        if self.params is not None:
            clf.set_params(**self.params)
            # print(clf.get_params())

        eval_set = [(X_train, y_train), (X_valid, y_valid)]
        self.clf = clf.fit(X_train,
                           y_train,
                           eval_set=eval_set,
                           eval_metric=None,
                           verbose=100,
                           early_stopping_rounds=100)
        # evals_result = self.clf.evals_result()

        valid_predict = clf.predict_proba(X_valid)
        test_predict = clf.predict_proba(X_test)
        return valid_predict, test_predict
Example #17
class ClassificationLearner:
    def __init__(self, **kwargs):
        self.estimator = XGBClassifier(**kwargs)
        self.fit_info = None

    # noinspection PyPep8Naming
    # pylint: disable-msg=too-many-arguments
    # pylint: disable-msg=too-many-locals
    # pylint: disable-msg=invalid-name
    def fit(self, X, y):
        # If there is no evaluation data, split some.
        x_train, x_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.1,
                                                            random_state=42)

        if X.shape[0] < 10000:
            best_param = search_parameters(self.estimator, x_train, y_train)
            self.estimator.set_params(**best_param)

        self.estimator.fit(x_train,
                           y_train,
                           eval_set=[(x_test, y_test)],
                           early_stopping_rounds=10,
                           verbose=False)

        y_train_pred = self.predict_proba(x_train)[:, 1]
        train_auc = sklearn.metrics.roc_auc_score(y_train, y_train_pred)
        y_test_pred = self.predict_proba(x_test)[:, 1]
        test_auc = sklearn.metrics.roc_auc_score(y_test, y_test_pred)

        self.fit_info = 'Train/Test AUC: {:.2f}/{:.2f}'.format(
            train_auc, test_auc)

        return self

    def predict_proba(self, x):
        return self.estimator.predict_proba(x)

    def predict(self, x):
        return self.estimator.predict(x)
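
A minimal usage sketch (X, y assumed to be a feature matrix and binary label vector; search_parameters is an external helper not shown above):

learner = ClassificationLearner(n_estimators=300, max_depth=4)
learner.fit(X, y)
print(learner.fit_info)                   # e.g. 'Train/Test AUC: 0.93/0.88'
scores = learner.predict_proba(X)[:, 1]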
Example #18
def train_evaluate(training_dataset_path, validation_dataset_path, max_depth,
                   n_estimators, output_dir):

    df_train = pd.read_csv(training_dataset_path)
    df_validation = pd.read_csv(validation_dataset_path)
    df = pd.concat([df_train, df_validation])

    categorical_features = ['workclass', 'occupation']
    target = 'income_bracket'

    # One-hot encode categorical variables
    df = pd.get_dummies(df, columns=categorical_features)

    # Change label to 0 if <=50K, 1 if >50K
    df[target] = df[target].apply(lambda x: 0 if x == ' <=50K' else 1)

    # Split features and labels into 2 different vars
    X_train = df.loc[:, df.columns != target]
    y_train = np.array(df[target])

    # Normalize features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)

    grid = {'max_depth': int(max_depth), 'n_estimators': int(n_estimators)}

    model = XGBClassifier()
    model.set_params(**grid)
    model.fit(X_train, y_train)

    model_filename = 'xgb_model.pkl'
    pickle.dump(model, open(model_filename, "wb"))

    EXPORT_PATH = os.path.join(
        output_dir,
        datetime.datetime.now().strftime("%Y%m%d%H%M%S"))

    gcs_model_path = '{}/{}'.format(EXPORT_PATH, model_filename)
    subprocess.check_call(['gsutil', 'cp', model_filename, gcs_model_path])
    print('Saved model in: {}'.format(gcs_model_path))
Example #19
 def return_model_assessment(self, args):
     curr_model_hyper_params = [
         'colsample_bylevel', 'colsample_bytree', 'gamma', 'learning_rate',
         'max_delta_step', 'max_depth', 'min_child_weight', 'n_estimators',
         'reg_alpha', 'reg_lambda', 'subsample'
     ]
     params = dict(zip(curr_model_hyper_params, args))
     model = XGBClassifier(random_state=self.seed, seed=self.seed)
     model.set_params(**params)
     fitted_model = model.fit(self.X_train,
                              self.y_train,
                              sample_weight=None)
     self.models.append(fitted_model)
     train_predictions = model.predict(self.X_train)
     test_predictions = model.predict(self.X_test)
     train_score = f1_score(self.y_train, train_predictions)
     test_score = f1_score(self.y_test, test_predictions)
     self.train_scores.append(train_score)
     self.test_scores.append(test_score)
     return 1 - test_score
Example #20
def get_default_xgb_model(df):

    final_X, final_y = hs.get_final_data(df, hs.get_data_transformer())

    parameters = {
        'nthread': 1,
        'objective': 'binary:logistic',
        'learning_rate': 0.01,
        'max_depth': 8,
        'min_child_weight': 3,
        'silent': 1,
        'subsample': 0.8,
        'colsample_bytree': 0.5,
        'n_estimators': 1000,
        'missing': -999,
        'seed': 1337
    }

    xgb_model = XGBClassifier(verbosity=0)
    xgb_model.set_params(**parameters)
    xgb_model.fit(final_X, final_y)

    return xgb_model
Example #21
 def modelXGBClassifier(self, trial: optuna.trial.Trial):
     opt_params = dict(
         max_depth=trial.suggest_int("max_depth", 2, 2**4),
         learning_rate=trial.suggest_discrete_uniform(
             'learning_rate', 0.001, 1, 0.001),
         n_estimators=trial.suggest_int("n_estimators", 2, 2**10, log=True),
         gamma=trial.suggest_loguniform('gamma', 1e-8, 1),
         min_child_weight=trial.suggest_loguniform('min_child_weight', 1e-8,
                                                   2**10),
         subsample=trial.suggest_uniform('subsample', 0.1, 1),
         colsample_bytree=trial.suggest_uniform('colsample_bytree', 0.1, 1),
         colsample_bylevel=trial.suggest_uniform('colsample_bylevel', 0.1,
                                                 1),
         reg_alpha=trial.suggest_loguniform('reg_alpha', 1e-8, 10),
         reg_lambda=trial.suggest_loguniform('reg_lambda', 1e-8, 10),
     )
     clf = XGBClassifier(max_depth=3,
                         learning_rate=0.1,
                         n_estimators=100,
                         silent=True,
                         objective="binary:logistic",
                         booster='gbtree',
                         n_jobs=1,
                         gamma=0,
                         min_child_weight=1,
                         max_delta_step=0,
                         subsample=1,
                         colsample_bytree=1,
                         colsample_bylevel=1,
                         reg_alpha=0,
                         reg_lambda=1,
                         scale_pos_weight=1,
                         base_score=0.5,
                         random_state=0,
                         missing=None)
     clf.set_params(**{**opt_params, **self.params})
     return clf
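
A minimal sketch of wiring the builder above into an Optuna study, for illustration only; tuner stands for the surrounding object, and X, y are assumed training data:

import optuna
from sklearn.model_selection import cross_val_score

def objective(trial):
    clf = tuner.modelXGBClassifier(trial)
    return cross_val_score(clf, X, y, scoring='roc_auc', cv=5).mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)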
Example #22
def log_xgboost(params, train_X, train_Y, test_X, test_Y):

    with mlflow.start_run() as ml_run:
        for k, v in params.items():
            mlflow.log_param(k, v)
        mlflow.set_tag("state", "dev")
        xgc = XGBClassifier(objective="binary:logistic")
        xgc.set_params(**params)
        model = xgc.fit(train_X,
                        train_Y.values.ravel(),
                        eval_set=[(train_X, train_Y.values.ravel()),
                                  (test_X, test_Y.values.ravel())],
                        eval_metric=['error', 'logloss'],
                        verbose=0)
        predictions = model.predict(test_X)
        acc = accuracy_score(test_Y.values.ravel(), predictions)
        loss = log_loss(test_Y.values.ravel(), predictions)

        ## Plots
        error_plot = plot_learning(model, "error")
        error_plot.savefig("temp/error_plot.png")
        mlflow.log_artifact("temp/error_plot.png")
        loss_plot = plot_learning(model, "logloss")
        loss_plot.savefig("temp/logloss.png")
        mlflow.log_artifact("temp/logloss.png")
        conf_mat = confusion_matrix(test_Y, predictions)
        conf_mat_plot = sns.heatmap(conf_mat, annot=True, fmt='g')
        conf_mat_plot.figure.savefig("temp/confmat.png")
        mlflow.log_artifact("temp/confmat.png")
        mlflow.log_metrics({'log_loss': loss, 'accuracy': acc})

        mlflow.xgboost.log_model(model, "model")

        print(f"Model trained with parameters: {params}")

        return model, predictions, acc, loss
Example #23
        logger.info('test col: %s' % (add_col))
        for train_idx, test_idx in list(cv)[:1]:
            train_omit_idx = numpy.intersect1d(train_idx, omit_idx)
            logger.info('omit size: %s %s' %
                        (train_idx.shape[0], len(train_omit_idx)))

            ans = []
            insample_ans = []
            for i in ['']:  #
                logger.info('model: %s' % i)
                cols = data.columns.values  # [col for col in feature_column if 'L%s' % i in col]
                logger.info('model xg: %s' % i)
                model = XGBClassifier(seed=0)
                #model = RandomForestClassifier(n_jobs=-1, random_state=0)
                gc.collect()
                model.set_params(**params)
                model.fit(data.ix[train_idx, cols], target[train_idx])

                ans = model.predict_proba(data.ix[test_idx, cols])[:, 1]
                insample_ans = model.predict_proba(data.ix[train_idx, cols])[:,
                                                                             1]

            logger.info('train_end')
            """
            if all_ans is None:
                all_ans = ans
                all_target = target[test_idx]
                all_ids = ids.ix[test_idx].values
            else:
                all_ans = numpy.r_[all_ans, ans]
                all_target = numpy.r_[all_target, target[test_idx]]
        subsample=0.9,
        colsample_bytree=0.7,
        objective='multi:softprob',
        scale_pos_weight=1,
        seed=0,
    )

    xgb_enc = OneHotEncoder(handle_unknown='ignore')
    xgb_enc.fit(X)  # since I am working mostly on categorical features

    estimate_nround = False
    if estimate_nround:
        logger.info('estimating the n_estimators...')
        best_n_rounds = estimate_xgb_nround(xgb_model, X, y)
        logger.info('complete estimating the n_estimators')
        xgb_model.set_params(n_estimators=best_n_rounds)
        xgb_model.fit(xgb_enc.transform(X), y)
        plot_importance_matrix(xgb_model, csv_path)
        sys.exit()

    # start tunning
    param_grid = {
        ### step 1 ###
        #                'max_depth': [3, 5, 7, 9],
        #                'min_child_weight': [1, 3, 5]
        ### best parameter for round 1: max_depth = 5, min_child_weight = 1 ###
        ### step 2 ###
        #                'max_depth': [4, 5, 6],
        #                'min_child_weight': [1, 2]
        ### best parameter for round 2: max_depth = 5, min_child_weight = 1 ###
        ### step 3 ###
Example #25
					objective= 'multi:softprob', 
					max_depth = 7, 
					gamma= .2)

# use the xgb interface
xgb_param = clf.get_xgb_params()
xgb_param['num_class'] = 5
xgb_param['eval_metric'] = 'mlogloss'
Xg_train = xgb.DMatrix(X_train, label=y_train, missing=np.nan)
cvresult = xgb.cv(xgb_param, 
				  Xg_train, 
 				  num_boost_round = clf.get_params()['n_estimators'],
 				  nfold = 5,
 				  show_progress = True,
				  early_stopping_rounds = 100)
clf.set_params(n_estimators=cvresult.shape[0])
clf.fit(X_train, y_train)
best_outcome_params = clf.get_params()
best_outcome_score = cvresult.min()

try:
	# predict the outcome probabilities
	y_pred = grid.predict_proba(X_test)
except:
	# predict the outcome probabilities
	y_pred = clf.predict_proba(X_test)


# Create a data frame
column_names = possible_outcomes[:]
idx = pd.Int64Index(np.arange(1,11457, dtype='int64'))
Example #26
                  nfold=5,
                  metrics='auc', 
                  early_stopping_rounds=50,
                  seed=42
                  )

cvresult.head()

cvresult.shape

xgb_best_param = {'n_estimators': cvresult.shape[0]}
xgb_best_param
# best n_estimators value to be used in the stack model

# update xgb with the optimal n_estimators
xgb.set_params(**xgb_best_param)

# #### 2. Tune max_depth and min_child_weight

parameter_grid = {
                    'max_depth': np.arange(2, 4),
                    'min_child_weight': np.arange(1, 4)
                 }
grid_xgb = GridSearchCV(xgb, parameter_grid, cv=cv_splitter, n_jobs=-1)
grid_xgb.fit(X_1, y)

grid_xgb.best_params_

xgb_best_param.update(grid_xgb.best_params_)
xgb_best_param
# best parameter values to be used in the stack model
Example #27
    no_test=False)
var_kin, var_geo = data.variables_list()
skf = StratifiedKFold(n_splits=n, shuffle=True)

params = {
    'learning_rate': 0.05,
    'n_estimators': 100,
    'max_depth': 4,
    'subsample': 0.5,
    'n_jobs': 4,
    'min_child_weight': 15
}

train_params = {'early_stopping_rounds': 10, 'verbose': 0}
xgb = XGBClassifier(**params)
xgb.set_params(**train_params)

xgb_kin = clf.Classifier(model=xgb,
                         cv=skf,
                         variables=var_kin,
                         model_name='XGBoost',
                         var_name='kinetic',
                         fig_name='xgb',
                         train_params=train_params)
xgb_kin.fit(train)
xgb_kin.check_ks_and_cvm(train,
                         check_agreement=check_agreement,
                         check_correlation=check_correlation)
xgb_kin.predict(data=test)
params = {
    'learning_rate': 0.05,
Example #28
dsample = LabelEncoding(dsample)
dtest = LabelEncoding(test)

X_train = dsample.drop('AB_NICU', axis=1)
y_train = dsample['AB_NICU']
X_test = dtest.drop('AB_NICU', axis=1)
y_test = dtest['AB_NICU']

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

#XGBoost initial fit 
xgb = XGBClassifier()
xgb.set_params(random_state=0)
xgb.fit(X_train, y_train)
print("The training error is: %.5f" % (1 - xgb.score(X_train, y_train)))
print("The test error is: %.5f" % (1 - xgb.score(X_test, y_test)))

# Commented out IPython magic to ensure Python compatibility.
# set the parameter grid
xgb_param_grid ={'learning_rate': [0.01,0.05,0.1],
                 'max_depth': [3,4,5,6],
                 'min_child_weight': [4,5,6],
                 'n_estimators': [100,200,300,400]}

#grid search
grid_search_xgb = GridSearchCV(xgb, xgb_param_grid, scoring='accuracy', cv= 5, n_jobs=-1, return_train_score = True)
# %time grid_search_xgb.fit(X_train, y_train)
Example #29
    max_score = -100
    best_thresh = None
    pg = list(ParameterGrid(all_params))
    for i in range(data.shape[1]):
        thresh, score = mcc_optimize(data[:, i], target)
        logger.info('model:%s, thresh: %s, total score: %s, max_score: %s' %
                    (i, thresh, score, max_score))

    for i, params in enumerate(pg):
        logger.info('%s/%s param: %s' % (i + 1, len(pg), params))
        pred_proba_all = []
        y_true = []
        for train_idx, test_idx in cv:
            model = XGBClassifier(seed=0)
            #model = LogisticRegression(n_jobs=-1, class_weight='balanced')
            model.set_params(**params)

            model.fit(data[train_idx],
                      target[train_idx],
                      eval_metric=evalmcc_xgb_min,
                      verbose=False)

            #pred_proba = data[test_idx, -1]
            pred_proba = model.predict_proba(data[test_idx])[:, 1]
            pred_proba_all = numpy.r_[pred_proba_all, pred_proba]
            y_true = numpy.r_[y_true, target[test_idx]]
            score = roc_auc_score(target[test_idx], pred_proba)
            #logger.info('    score: %s' % score)
            #thresh, score = mcc_scoring(model, data[test_idx], target[test_idx])
            list_score.append(score)
            #logger.info('    thresh: %s' % thresh)
Example #30
    def rvs(self, random_state):
        return random_state.choice(self.support)


def search(param_dict,
           cv_obj,
           X,
           y,
           n_iter=1_000,
           skeleton=None,
           scoring='neg_log_loss',
           **kwargs):
    if skeleton is None:
        skeleton = XGBClassifier(n_jobs=1, random_state=SEED)
        if 'early_stopping_rounds' in kwargs:
            skeleton.set_params(n_estimators=1_000)

    dist = {k: Uniform(v) for k, v in param_dict.items()}

    optim = RandomizedSearchCV(
        estimator=skeleton,
        param_distributions=dist,
        n_iter=n_iter,
        scoring=scoring,
        cv=cv_obj,
        return_train_score=True,
        verbose=1,
        n_jobs=4,
        random_state=SEED,
    )
Example #31
Fitting the final XGBoost with parameters found on grid_cv.
Use all training data.
Test on test data.
#######################################################################
"""
params = best_params
# params = {'colsample_bytree': 0.6,
#           'learning_rate': 0.01,
#           'max_depth': 3,
#           'n_estimators': 250,
#           'subsample': 1.0}

xgb = XGBClassifier(**params)
xgb.set_params(silent=True,
               verbosity=0,
               n_jobs=4,
               random_state=0,
               objective='binary:logistic',
               scale_pos_weight=scale_pos_weight)

eval_set = [(X_train, y_train), (X_test, y_test)]
eval_metric = ["error", "auc"]

xgb.fit(X_train, y_train,
        eval_metric=eval_metric,
        eval_set=eval_set,
        verbose=False)

results = xgb.evals_result()

fig1, ax1 = plt.subplots()
ax1.plot(results['validation_0']['error'], label='Train Error')
Example #32
def get_model(PARAMS):
    '''Get model according to parameters'''
    model = XGBClassifier()
    model.set_params(**PARAMS)
    return model
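
For illustration, a hypothetical parameter dict (as a hyper-parameter tuner might report it) and how the returned model would be used; X_train and y_train are assumptions:

PARAMS = {'max_depth': 6, 'learning_rate': 0.1, 'n_estimators': 200}
model = get_model(PARAMS)
model.fit(X_train, y_train)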