Example No. 1
def kfold_cv(X_train, y_train,idx,k):

    kf = StratifiedKFold(y_train,n_folds=k)
    xx=[]
    count=0
    for train_index, test_index in kf:
        count+=1
        X_train_cv, X_test_cv = X_train[train_index,:],X_train[test_index,:]
        gc.collect()
        y_train_cv, y_test_cv = y_train[train_index],y_train[test_index]
        y_pred=np.zeros(X_test_cv.shape[0])
        m=0
         
        for j in range(m):
            clf=xgb_classifier(eta=0.05,min_child_weight=20,col=0.5,subsample=0.7,depth=7,num_round=400,seed=j*77,gamma=0.1)
            y_pred+=clf.train_predict(X_train_cv,(y_train_cv),X_test_cv,y_test=(y_test_cv))
            yqq=y_pred*(1.0/(j+1))

            print(j, llfun(y_test_cv, yqq))

        #y_pred/=m;
        clf=XGBClassifier(max_depth=10,colsample_bytree=0.8,learning_rate=0.02,n_estimators=500,nthread=-1)
        #clf=RandomForestClassifier(n_jobs=-1,n_estimators=100,max_depth=100)
        clf.fit(X_train_cv,(y_train_cv),eval_metric="logloss",eval_set=[(X_test_cv, y_test_cv)])
        y_pred=clf.predict_proba(X_test_cv).T[1]
        print(y_pred.shape)
        xx.append(llfun(y_test_cv,(y_pred)))
        ypred=y_pred
        yreal=y_test_cv
        idx=idx[test_index]
        print(xx[-1])  # ,y_pred.shape
        break

    print(xx, 'average:', np.mean(xx), 'std', np.std(xx))
    return ypred,yreal,idx#np.mean(xx)
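The llfun metric used throughout Example No. 1 is not defined in the snippet; it is evidently a log-loss helper taking true labels and predicted probabilities. A minimal sketch under that assumption:

import numpy as np

def llfun(y_true, y_prob, eps=1e-15):
    # Hypothetical log-loss helper: clip probabilities away from 0/1 and
    # average the negative log-likelihood of the true binary labels.
    p = np.clip(np.asarray(y_prob, dtype=float), eps, 1 - eps)
    y = np.asarray(y_true, dtype=float)
    return -np.mean(y * np.log(p) + (1 - y) * np.log(1 - p))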
Example No. 2
def xgboostcv(max_depth,
              learning_rate,
              n_estimators,
              gamma,
              min_child_weight,
              max_delta_step,
              subsample,
              colsample_bytree,
              silent=True,
              nthread=-1,
              seed=1234):

    clf = XGBClassifier(max_depth=int(max_depth),
                        learning_rate=learning_rate,
                        n_estimators=int(n_estimators),
                        silent=silent,
                        nthread=nthread,
                        gamma=gamma,
                        min_child_weight=min_child_weight,
                        max_delta_step=max_delta_step,
                        subsample=subsample,
                        colsample_bytree=colsample_bytree,
                        seed=seed,
                        objective="binary:logistic")

    # Hold out a validation split (with early stopping) to guard against over-fitting
    X_train, X_valid, y_train, y_valid = train_test_split(train,
                                                          train_labels,
                                                          test_size=0.1,
                                                          random_state=seed)
    xgb_model = clf.fit(X_train, y_train, eval_metric="auc", eval_set=[(X_valid, y_valid)], early_stopping_rounds=20)
    y_pred = xgb_model.predict_proba(X_valid)[:,1]

    return auc(y_valid, y_pred)
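A function with this signature and return value is typically used as the objective of a Bayesian hyper-parameter search. A hedged wiring sketch, assuming the bayes_opt package and illustrative bounds (neither is part of the original snippet):

from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(
    f=xgboostcv,
    pbounds={'max_depth': (3, 10),
             'learning_rate': (0.01, 0.3),
             'n_estimators': (100, 1000),
             'gamma': (0.0, 1.0),
             'min_child_weight': (1, 20),
             'max_delta_step': (0, 10),
             'subsample': (0.5, 1.0),
             'colsample_bytree': (0.5, 1.0)},
)
optimizer.maximize(init_points=5, n_iter=25)
print(optimizer.max)  # best AUC found and the parameters that produced it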
Example No. 3
    def test_predict_sklearn_pickle(self):
        X,y = makeXy()
        Xtest = makeXtest()

        from xgboost import XGBClassifier
        kwargs={}
        kwargs['tree_method'] = 'gpu_hist'
        kwargs['predictor'] = 'gpu_predictor'
        kwargs['silent'] = 0
        kwargs['objective'] = 'binary:logistic'

        model = XGBClassifier(**kwargs)
        model.fit(X,y)
        print(model)

        # pickle model
        save_obj(model,"model.pkl")
        # delete model
        del model
        # load model
        model = load_obj("model.pkl")
        os.remove("model.pkl")

        # continue as before
        print("Before model.predict")
        sys.stdout.flush()
        tmp = time.time()
        gpu_pred = model.predict(Xtest, output_margin=True)
        print(gpu_pred)
        print("E non-zeroes: %d:" % (np.count_nonzero(gpu_pred)))
        print("E GPU Time to predict = %g" % (time.time() - tmp))
Example No. 4
 def xgboost_classifier(self):
     cls = XGBClassifier()
     print('xgboost cross validation score', cross_val_score(cls, self.x_data, self.y_data))
     start_time = time.time()
     cls.fit(self.x_train, self.y_train)
     print('score', cls.score(self.x_test, self.y_test))
     print('time cost', time.time() - start_time)
Example No. 5
def feature_selection(model, X_train, X_test, y_train, y_test, eval_metric='auc'):
    thresholds = [thres for thres in sorted(model.feature_importances_) if thres != 0]  # Use feat. with >0 importance

    roc_scores = {}
    for thresh in thresholds:  # select features using threshold

        selection = SelectFromModel(model, threshold=thresh, prefit=True)
        select_X_train = selection.transform(X_train)

        selection_model = XGBClassifier()  # train model
        selection_model.fit(select_X_train, y_train, eval_metric=eval_metric)

        select_X_test = selection.transform(X_test)  # eval model
        y_pred = selection_model.predict(select_X_test)

        roc = roc_auc_score(y_test, y_pred)
        roc_scores[selection.threshold] = roc

    best_thresh = max(roc_scores, key=roc_scores.get)

    fs = SelectFromModel(model, threshold=best_thresh, prefit=True)
    pickle_model(fs, 'feature.select')
    X_train_trans_ = fs.transform(X_train)
    X_test_trans_ = fs.transform(X_test)
    print('total features kept: {}'.format(X_train_trans_.shape[1]))

    return X_train_trans_, X_test_trans_
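pickle_model is not defined in this snippet (it also appears in Example No. 12); presumably it just pickles the fitted selector under the given name. A minimal sketch of such a helper:

import pickle

def pickle_model(model, name):
    # Hypothetical helper: persist a fitted estimator or transformer to disk.
    with open(name, 'wb') as f:
        pickle.dump(model, f)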
Example No. 6
def train_model_xgb_meta(train_x, train_y, xgb_features):
    train_ind = StratifiedShuffleSplit(train_y, random_state=1, test_size=0.2)

    for train_index, test_index in train_ind:
        x_train = train_x.ix[train_index, :]
        y_train = train_y.ix[train_index]

        x_eval = train_x.ix[test_index, :]
        y_eval = train_y.ix[test_index]


    #Classifier
    xgb = XGBClassifier(max_depth=xgb_features['max_depth'], learning_rate=xgb_features['learning_rate'], n_estimators=int(xgb_features['n_estimators']), objective='binary:logistic',
                        subsample=xgb_features['subsample'], colsample_bytree=xgb_features['colsample_bytree'], min_child_weight=xgb_features['min_child_weight'])
    # gives 0.458

    #  bag_clf = BaggingClassifier(xgb, max_samples=10, warm_start=True, verbose=10)
    #  x_train = pd.DataFrame(x_train, dtype=float)
    #  bag_clf.fit(x_train, y_train)
    xgb = xgb.fit(x_train, y_train, verbose=True, eval_metric='logloss',  eval_set=[(x_eval, y_eval)], early_stopping_rounds=10)

    #  cv_score = cross_val_score(xgb, x_train, y_train, cv=4, n_jobs=1, pre_dispatch=1, verbose=10, scoring='log_loss')
    #  print(cv_score)
    #  print(np.mean(cv_score))

    #  predictions = pd.Series(xgb.predict_proba(x_train, ntree_limit=xgb.best_iteration)[:, 1], name='PredictedProb')

    return xgb  #  , predictions
Example No. 7
def cv(X_train, y_train, features_inner):

    kfold = StratifiedKFold(n_splits=5, shuffle=True)

    scores_f = []
    scores_p = []
    scores_r = []

    for train, test in kfold.split(X_train, y_train):

        model = XGBClassifier()
        X_train_cv = pd.DataFrame(X_train.values[train], columns=X_train.columns)
        y_train_cv = pd.DataFrame(y_train.values[train], columns=["tred_cutoff"])
        X_test_cv = pd.DataFrame(X_train.values[test], columns=X_train.columns)
        y_test_cv = pd.DataFrame(y_train.values[test], columns=["tred_cutoff"])
        model.fit(X_train_cv, y_train_cv)

        y_pred = model.predict(X_test_cv)

        s_f = f1_score(y_test_cv, y_pred)
        s_p = precision_score(y_test_cv, y_pred)
        s_r = recall_score(y_test_cv, y_pred)
        print("\tscores f1", (s_f))
        print("\tscores p", (s_p))
        print("\tscores r", (s_r))
        scores_f.append(s_f)
        scores_p.append(s_p)
        scores_r.append(s_r)

    print("mean scores f1", np.mean(scores_f))
    print("mean scores p", np.mean(scores_p))
    print("mean scores r", np.mean(scores_r))
Example No. 8
def xgboostcv(max_depth,
              learning_rate,
              n_estimators,
              subsample,
              colsample_bytree,
              gamma,
              min_child_weight,
              silent=True,
              nthread=-1,
              seed=1234):

    clf = XGBClassifier(max_depth=int(max_depth),
                        learning_rate=learning_rate,
                        n_estimators=int(n_estimators),
                        silent=silent,
                        nthread=nthread,
                        subsample=subsample,
                        colsample_bytree=colsample_bytree,
                        gamma=gamma,
                        min_child_weight = min_child_weight,
                        seed=seed,
                        objective="binary:logistic")

    clf.fit(x0, y0, eval_metric="logloss", eval_set=[(x1, y1)],early_stopping_rounds=25)
    ll = -log_loss(y1, clf.predict_proba(x1))
    return ll
Example No. 9
def XGB_model(train,y):
	model=XGBClassifier(n_estimators=150, learning_rate=0.01)
	from sklearn import cross_validation
	cv = cross_validation.KFold(len(train), n_folds=5,random_state=7)
	for traincv,testcv in cv:
	    model.fit(train.iloc[traincv],y.iloc[traincv])
	y_XGB=model.predict(test)
	return y_XGB
Example No. 10
def main():
    # Set seed for reproducibility
    np.random.seed(0)

    print("Loading data...")
    # Load the data from the CSV files
    
    training_data = pd.read_csv('/home/vipin/Videos/train.csv', header=0)
    prediction_data = pd.read_csv('/home/vipin/Videos/test.csv', header=0)
     
     
    training_data['countrycode']=training_data['countrycode'].apply(lambda x:ord(x))
    training_data['browserid']=training_data['browserid'].apply(lambda x: myfunc (x) if np.all(pd.notnull(x)) else myfunc("unknown") )
    training_data['devid']=training_data['devid'].apply(lambda x: myfunc (x) if np.all(pd.notnull(x)) else myfunc("none"))
    
    
    #pd.to_csv('/home/vipin/Videos/train11.csv', sep=',', encoding='utf-8')
    #exit(0)
    prediction_data['countrycode']=prediction_data['countrycode'].apply(lambda x:ord(x))
    prediction_data['browserid']=prediction_data['browserid'].apply(lambda x:myfunc (x) if np.all(pd.notnull(x)) else myfunc("unknown") )
    prediction_data['devid']=prediction_data['devid'].apply(lambda x:myfunc (x) if np.all(pd.notnull(x)) else myfunc("none") )
    
    
    features=['siteid','offerid','category','merchant','countrycode','browserid','devid']
    target="click"
    X = training_data[features]
    x_prediction = prediction_data[features]
    Y= training_data[target]
    ids = prediction_data["ID"]
    model = XGBClassifier()
    # linear_model.LogisticRegression(n_jobs=-1)

    print("Training...")
    # Your model is trained on the training_data
    model.fit(X, Y)

    print("Predicting...")
    
    seed =7
    test_size=0.33
    X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=test_size,random_state=seed)
    y_prediction = model.predict_proba(x_prediction)
    results = y_prediction[:, 1]
    results_df = pd.DataFrame(data={'probability':results})
    joined = pd.DataFrame(ids).join(results_df)
        
    y_pred=model.predict(X_test)
    accuracy=accuracy_score(y_test,y_pred)
    

    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    print("Writing predictions to predictions.csv")
        # Save the predictions out to a CSV file
    joined.to_csv("/home/vipin/Videos/predictions.csv", index=False)
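myfunc is not shown; given how it is applied to browserid and devid, it presumably maps a category string to an integer code. One possible stand-in (an assumption, not the original):

def myfunc(value):
    # Hypothetical encoder: reduce a category string to a deterministic
    # integer by summing its character codes; any stable mapping would do.
    return sum(ord(ch) for ch in str(value))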
Example No. 11
def test_xgboost():
    """Ensure that the TPOT xgboost method outputs the same as the xgboost classfier method"""

    tpot_obj = TPOT()
    result = tpot_obj._xgradient_boosting(training_testing_data, n_estimators=100, learning_rate=0, max_depth=3)
    result = result[result['group'] == 'testing']

    xgb = XGBClassifier(n_estimators=100, learning_rate=0.0001, max_depth=3, seed=42)
    xgb.fit(training_features, training_classes)

    assert np.array_equal(result['guess'].values, xgb.predict(testing_features))
Example No. 12
def update_model(current_year):
    print('Creating model...\nDate: {}'.format(datetime.now().strftime('%Y-%m-%d_%H:%M:%S')))

    managers = tuple(unique_managers(current_year))

    sql = "select * from (select week, year, manager1_name, manager2_name, team1_points, team1_projected, team2_points, team2_projected, type \
         from scoreboard_all WHERE team1_points > 0 and week<=13 \
        UNION select week, year, manager2_name AS manager1_name, manager1_name as manager2_name, team2_points AS team1_points, \
        team2_projected AS team1_projected, team1_points as team2_points, team1_projected AS team2_projected, type FROM scoreboard_all \
        where team1_points>0 and week<=13) order by year, week, type;"

    ff1 = download_data(os.path.join(os.getcwd(), 'data/fantasy_football.db'), sql)

    data_features = custom_features(ff1)
    data_features = data_features[(data_features.manager1_name.isin(managers)) & (data_features.manager2_name.isin(managers))]
    X, y, managers, league_type = dummy_and_interaction(data_features)
    # feats = X.columns.tolist()
    sc = StandardScaler()
    X_std = sc.fit_transform(X)
    pickle_model(sc, 'standard.scaler')

    # Select best features
    X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.25, random_state=None)

    model = XGBClassifier()
    model.fit(X_train, y_train)
    # imports = model.feature_importances_.tolist()
    # g = zip(feats, imports)
    # feat_importance = sorted(g, key=lambda x: x[1], reverse=True)
    # print feat_importance
    X_train_trans, X_test_trans = feature_selection(model, X_train, X_test, y_train, y_test, eval_metric='auc')

    # Select best params
    model = XGBClassifier()
    learning_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
    n_estimators = [50, 100, 150, 200, 250, 300]
    param_grid = dict(n_estimators=n_estimators, learning_rate=learning_rate)

    grid_search = GridSearchCV(model, param_grid, scoring="log_loss", cv=10, verbose=1)
    result = grid_search.fit(X_train_trans, y_train)

    print("Best: {0} using {1}".format(result.best_score_, result.best_params_))
    print('Best params: ', result.best_params_)
    best_est = result.best_estimator_
    validation = best_est.predict_proba(X_train_trans)
    print("Roc AUC Train: ", roc_auc_score(y_train, validation[:, 1], average='macro'))

    probs = best_est.predict_proba(X_test_trans)
    print("Roc AUC Validation: ", roc_auc_score(y_test, probs[:, 1], average='macro'))

    pickle_model(best_est, 'fantasy.predict')
Example No. 13
def train(imgfile='img/segmentation', modelfile='segmentation.pkl'):
    
    filelabel = getFiles(imgfile)
    row = 120
    col=40
    data = list(filter(lambda z: z is not None, map(lambda x: Img(x[1], row, col, x[0]).imgmap, filelabel)))
    data = list(filter(lambda x: x[0] is not None, sum(data, [])))
    label = np.array(list(map(lambda x: CHARACTER.get(x[0]), data)))
    feature = np.array(list(map(lambda x: np.array(x[1]), data)))
    from xgboost import XGBClassifier
    xgb = XGBClassifier(objective='multi:softmax',reg_alpha=1.0,reg_lambda=0.0,subsample=0.7,n_estimators=100,learning_rate=0.3)
    model = xgb.fit(feature,label,eval_set=[(feature,label)],eval_metric='mlogloss')
    import pickle
    fn = modelfile
    with open(fn, 'wb') as f:                     # open file in binary write mode (required for pickle)
        pickle.dump(model, f)
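To reuse the pickled segmentation model later, it can be loaded back with pickle and applied to new feature vectors (a usage sketch based on the function above):

import pickle

with open('segmentation.pkl', 'rb') as f:
    model = pickle.load(f)
# model is the fitted XGBClassifier; e.g. predictions = model.predict(feature)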
Example No. 14
def runner ():
    m = Model()
    X = m.df.drop("tred_cutoff", axis=1)
    Y = m.df["tred_cutoff"]
    features_inner = m.features + m.features_2
    cv(X, Y, features_inner)

    model = XGBClassifier()
    model.fit(X, Y)

    y_pred = model.predict(m.X_test)
    s_f = f1_score(m.y_test, y_pred)
    s_p = precision_score(m.y_test, y_pred)
    s_r = recall_score(m.y_test, y_pred)
    print("test f1", s_f)
    print("test precision", s_p)
    print("test recall", s_r)
Example No. 15
def main():
    titanic = pandas.read_csv('dataset/titanic.csv')

    x_set = titanic[['pclass', 'age', 'sex']]
    y_set = titanic['survived']
    x_set.fillna(x_set['age'].mean(), inplace=True)
    x_train, x_test, y_train, y_test = utils.prepare_train_and_test_sets(x_set, y_set)

    dict_vectorizer = DictVectorizer(sparse=False)
    x_train = dict_vectorizer.fit_transform(x_train.to_dict(orient='records'))
    x_test = dict_vectorizer.transform(x_test.to_dict(orient='records'))

    decision_tree_classifier = DecisionTreeClassifier()
    utils.get_trained_result(decision_tree_classifier, x_test, x_train, y_test, y_train)

    xgb_classifier = XGBClassifier()
    xgb_classifier.fit(x_train, y_train)
    utils.get_trained_result(xgb_classifier, x_test, x_train, y_test, y_train)
Example No. 16
 def __make_sklearn_model(self):
     estimators = self.__parameters['estimators']
     lrate = self.__parameters['learning_rate']
     depth = self.__parameters['max_depth']
     leaf_bodes = self.__parameters['max_leaf_nodes']
     self.__model = SGBClassifier(n_estimators=estimators,
                                  learning_rate=lrate,
                                  max_depth=depth,
                                  max_leaf_nodes=leaf_bodes,
                                  random_state=0)
Example No. 17
  def trainXGB(data_subset):
    f.write('\nTraining XGB:'+'\n')

    X_train = data[data_subset]['X_train']
    X_test = data[data_subset]['X_test']
    y_train = data[data_subset]['y_train']
    y_test = data[data_subset]['y_test']

    for p in params['xgboost']:
      if data_subset != 'binary' and p['objective'] == 'binary:logistic':
        print("Skip using non-binary data with XGB binary:logistic objective")
        continue
      if data_subset == 'binary' and p['objective'] != 'binary:logistic':
        print("Skip using binary data with XGB multi:* objective")
        continue

      header = "@ subset: {0}, params: {1}".format(data_subset, p)
      f.write('\n'+header+'\n')

      objective = p['objective']
      max_depth = p['max_depth']
      try:
        n_estimators= p['n_estimators']
      except KeyError as e:
        n_estimators= 100

      model = XGBClassifier(objective=objective, max_depth=max_depth,
        n_estimators=n_estimators)

      start = time.time()
      model.fit(X_train, y_train)
      elapsed_train = time.time() - start

      y_pred = model.predict(X_test).astype(int)
      elapsed_predict = time.time() - start

      accuracy = accuracy_score(y_test, y_pred)
      precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, pos_label=2, average='weighted')

      print("\n{5}\nXGB with {0} objective, {6} max_depth, {7} n_estimators on data subset {1} trained in {2} seconds and predicted in {3} seconds with an accuracy of {4}\n".format(objective, data_subset, elapsed_train, elapsed_predict, accuracy, header, max_depth, n_estimators))

      f.write(str(elapsed_train) + ', ' + str(elapsed_predict) + ', ' + str(accuracy) + ', ' + str(precision) + ', ' + str(recall) + ', ' + str(fscore) + ', ' + str(support))
Example No. 18
def train_model_xgb(train_x, train_y, xgb_features):

    train_ind = StratifiedShuffleSplit(train_y, random_state=1, test_size=0.1)

    for train_index, test_index in train_ind:
        x_train = train_x.ix[train_index, :]
        y_train = train_y.ix[train_index]

        x_eval = train_x.ix[test_index, :]
        y_eval = train_y.ix[test_index]

    #Classifier
    xgb = XGBClassifier(max_depth=xgb_features['max_depth'], learning_rate=xgb_features['learning_rate'], n_estimators=int(xgb_features['n_estimators']), objective='binary:logistic',
                        subsample=xgb_features['subsample'], colsample_bytree=xgb_features['colsample_bytree'], min_child_weight=xgb_features['min_child_weight'])
    # gives 0.458
    xgb = xgb.fit(x_train, y_train, verbose=True, eval_metric='logloss',  eval_set=[(x_eval, y_eval)], early_stopping_rounds=10)

    predictions = pd.Series(xgb.predict_proba(x_train, ntree_limit=xgb.best_iteration)[:, 1], name='PredictedProb')

    return xgb, predictions
Example No. 19
    def test_predict_sklearn_pickle(self):
        x, y = build_dataset()

        kwargs = {'tree_method': 'gpu_hist',
                  'predictor': 'gpu_predictor',
                  'verbosity': 2,
                  'objective': 'binary:logistic',
                  'n_estimators': 10}

        model = XGBClassifier(**kwargs)
        model.fit(x, y)

        save_pickle(model, "model.pkl")
        del model

        # load model
        model: xgb.XGBClassifier = load_pickle("model.pkl")
        os.remove("model.pkl")

        gpu_pred = model.predict(x, output_margin=True)

        # Switch to CPU predictor
        bst = model.get_booster()
        bst.set_param({'predictor': 'cpu_predictor'})
        cpu_pred = model.predict(x, output_margin=True)
        np.testing.assert_allclose(cpu_pred, gpu_pred, rtol=1e-5)
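As in Example No. 3, build_dataset, save_pickle and load_pickle are test utilities that are not shown here; they are presumably a toy-dataset builder and thin pickle wrappers analogous to the sketch given after that example.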
Example No. 20
def get_xgb_feature_importance_plot(best_param_, experiment_,
                                    png_folder,
                                    png_fname,
                                    score_threshold=0.8):

    # 1. 
    train_X, train_y = experiment_.get_train_data()
    clf = XGBClassifier()
    try:
        del best_param_['model_type']
    except:
        pass
    clf.set_params(**best_param_)
    clf.fit(train_X, train_y)
    index2feature = clf.booster().get_fscore()
    fis = pd.DataFrame({'name':index2feature.keys(),
                        'score':index2feature.values()})
    fis = fis.sort('score', ascending=False)
    if len(fis.index) > 20:
        score_threshold = fis['score'][fis['score'] > 0.0].quantile(score_threshold)
        #where_str = 'score > %f & score > %f' % (score_threshold, 0.0)
        where_str = 'score >= %f' % (score_threshold)
        fis = fis.query(where_str)

    # 2. plot
    #gs = GridSpec(2,2)
    #ax1 = plt.subplot(gs[:,0])
    #ax2 = plt.subplot(gs[0,1])
    #ax3 = plt.subplot(gs[1,1])

    # 3.1 feature importance
    sns.barplot(x = 'score', y = 'name',
                data = fis,
                #ax=ax1,
                color="blue")
    #plt.title("Feature_Importance", fontsize=10)
    plt.ylabel("Feature", fontsize=10)
    plt.xlabel("Feature_Importance : f-Score", fontsize=10)

    """
    # 3.2 PDF
    confidence_score = clf.oob_decision_function_[:,1]
    sns.distplot(confidence_score, kde=False, rug=False, ax=ax2)
    ax2.set_title("PDF")

    # 3.3 CDF
    num_bins = min(best_param_.get('n_estimators',1), 100)
    counts, bin_edges = np.histogram(confidence_score, bins=num_bins, normed=True)
    cdf = np.cumsum(counts)
    ax3.plot(bin_edges[1:], cdf / cdf.max())
    ax3.set_title("CDF")
    ax3.set_xlabel("Oob_Decision_Function:Confidence_Score", fontsize=10)
    """

    png_fname = os.path.join(Config.get_string('data.path'), 'graph', png_fname)
    plt.tight_layout()
    plt.savefig(png_fname)#, bbox_inches='tight', pad_inches=1)
    plt.close()

    return True
Example No. 21
 def __make_xgboost_model(self):
     estimators = self.__parameters['estimators']
     lrate = self.__parameters['learning_rate']
     depth = self.__parameters['max_depth']
     leaf_bodes = self.__parameters['max_leaf_nodes']
     self.__model = XGBClassifier(nthread=4,
                                  learning_rate=lrate,
                                  n_estimators=estimators,
                                  max_depth=depth,
                                  gamma=0,
                                  subsample=0.9,
                                  max_leaf_nodes=leaf_bodes,
                                  colsample_bytree=0.5)
Example No. 22
def get_thresh(model,train,test,label_test,label_train):
    if (len(test)>len(train)) or (len(label_test)>len(label_train)):
        raise TypeError('Invalid train and test size')
    model1 = XGBClassifier()
    if type(model)!=type(XGBClassifier()):
        raise TypeError('Invalid model passed')
    if (pd.DataFrame(label_train).shape[1]>1) or (pd.DataFrame(label_test).shape[1]>1):
    	raise TypeError('Multiple columns in label, Invalid shape.')
    max_score=0
    thrsh=0
    thresholds = np.sort(model.feature_importances_)
    for thresh in thresholds:
        selection = feature_selection.SelectFromModel(model, threshold=thresh,prefit=True)
        select_X_train = selection.transform(train)
        selection_model = XGBClassifier()
        selection_model.fit(select_X_train, label_train)
        select_X_test = selection.transform(test)
        y_pred = selection_model.predict(select_X_test)
        scr=metrics.roc_auc_score(label_test,y_pred)
        if(scr>max_score):
            max_score=scr
            thrsh=thresh
    return thrsh
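A hedged usage sketch for get_thresh: fit a model, find the best importance threshold, then keep only the selected features (all variable names below are illustrative):

model = XGBClassifier()
model.fit(train, label_train)
best_thresh = get_thresh(model, train, test, label_test, label_train)
selector = feature_selection.SelectFromModel(model, threshold=best_thresh, prefit=True)
train_selected = selector.transform(train)
test_selected = selector.transform(test)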
Example No. 23
def test_on_data(X, y):

    x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=0.5, random_state=2333)
    print "train set: {}, test set: {}".format(len(x_train), len(x_test))
    cls = XGBClassifier()
    cls.fit(x_train, y_train)
    # on test
    pred = cls.predict(x_test)
    print "xgb accuracy score test", accuracy_score(y_test, pred)

    # on all
    pred = cls.predict(X)
    print "xgb accuracy score all", accuracy_score(y, pred)

    # compare to gbrt in sklearn
    cls = GradientBoostingClassifier()
    cls.fit(x_train, y_train)
    # on test
    pred = cls.predict(x_test)
    print "sklearn accuracy score test", accuracy_score(y_test, pred)

    # on all
    pred = cls.predict(X)
    print "sklearn accuracy score all", accuracy_score(y, pred)
Example No. 24
 def __init__(self):
     self._seed = randint(1, 9)
     self._csvfile = ""
     self._titles = None
     self._dataset = None
     self._X = None
     self._y = None
     self._X_original = None
     self._y_original = None
     self._dataset_original = None
     self._model = Sequential()
     self._sc = StandardScaler()
     self._vnum = 0  # Number of variables
     self._classifier = XGBClassifier()
     self._epochs = 10
     self._samplesize = 0
     self._clusters = None
Example No. 25
import numpy as np
import pandas as pd
import operator
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import cross_val_predict, cross_val_score, KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer, StandardScaler, MinMaxScaler, PolynomialFeatures, OneHotEncoder
from sklearn.metrics import log_loss, roc_auc_score, f1_score
from xgboost import XGBClassifier
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
import pickle as pkl
from data_utils import *

models = {
    "linear": make_pipeline(StandardScaler(), LogisticRegression()),
    "xgb": XGBClassifier(n_estimators=16, learning_rate=0.5, max_depth=5)
}


def train_none_models():
    X = generate_none_features("train")
    y = generate_none_targets().loc[X.index]  # make sure ordering is correct
    for name, model in models.items():
        with open("../models/none_model_{}.pkl".format(name), "wb") as f:
            pkl.dump(model.fit(X, y), f)


def predict_none_models():
    X = generate_none_features("test")
    preds = {}
    for name, model in models.items():
Example No. 26
 def __init__(self):
     self.model = XGBClassifier()
     self.progress = 0
Example No. 27
def get_model(model_or_name, threads=-1, classification=False):
    regression_models = {
        'xgboost': (XGBRegressor(max_depth=6,
                                 nthread=threads), 'XGBRegressor'),
        'randomforest':
        (RandomForestRegressor(n_estimators=100,
                               n_jobs=threads), 'RandomForestRegressor'),
        'adaboost': (AdaBoostRegressor(), 'AdaBoostRegressor'),
        'linear': (LinearRegression(), 'LinearRegression'),
        'elasticnet': (ElasticNetCV(positive=True), 'ElasticNetCV'),
        'lasso': (LassoCV(positive=True), 'LassoCV'),
        'ridge': (Ridge(), 'Ridge'),
        'xgb.1k': (XGBRegressor(max_depth=6,
                                n_estimators=1000,
                                nthread=threads), 'XGBRegressor.1K'),
        'xgb.10k': (XGBRegressor(max_depth=6,
                                 n_estimators=10000,
                                 nthread=threads), 'XGBRegressor.10K'),
        'rf.1k':
        (RandomForestRegressor(n_estimators=1000,
                               n_jobs=threads), 'RandomForestRegressor.1K'),
        'rf.10k': (RandomForestRegressor(n_estimators=10000, n_jobs=threads),
                   'RandomForestRegressor.10K')
    }

    classification_models = {
        'xgboost': (XGBClassifier(nthread=threads), 'XGBClassifier'),
        'randomforest':
        (RandomForestClassifier(n_estimators=100,
                                n_jobs=threads), 'RandomForestClassifier'),
        'adaboost': (AdaBoostClassifier(), 'AdaBoostClassifier'),
        'logistic': (LogisticRegression(), 'LogisticRegression'),
        'gaussian': (GaussianProcessClassifier(), 'GaussianProcessClassifier'),
        'knn': (KNeighborsClassifier(), 'KNeighborsClassifier'),
        'bayes': (GaussianNB(), 'GaussianNB'),
        'svm': (SVC(), 'SVC'),
        'xgb.1k': (XGBClassifier(n_estimators=1000,
                                 nthread=threads), 'XGBClassifier.1K'),
        'rf.1k':
        (RandomForestClassifier(n_estimators=1000,
                                n_jobs=threads), 'RandomForestClassifier.1K'),
        'xgb.10k': (XGBClassifier(n_estimators=10000,
                                  nthread=threads), 'XGBClassifier.10K'),
        'rf.10k': (RandomForestClassifier(n_estimators=10000, n_jobs=threads),
                   'RandomForestClassifier.10K')
    }

    neural_network_model = {
    }  # TODO: integrate neural network models into this framework

    if isinstance(model_or_name, str):
        if classification:
            model_and_name = classification_models.get(model_or_name.lower())
        else:
            model_and_name = regression_models.get(model_or_name.lower())
        if not model_and_name:
            raise Exception("unrecognized model: '{}'".format(model_or_name))
        else:
            model, name = model_and_name
    else:
        model = model_or_name
        name = re.search(r"\w+", str(model)).group(0)

    return model, name
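Usage is a lookup by short name followed by the usual scikit-learn fit/score calls, for example (X_train, y_train, X_test and y_test are assumed to exist):

model, name = get_model('xgboost', threads=4, classification=True)
model.fit(X_train, y_train)
print(name, model.score(X_test, y_test))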
Example No. 28
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.6774117647058825
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=ExtraTreesClassifier(bootstrap=False,
                                                     criterion="gini",
                                                     max_features=0.5,
                                                     min_samples_leaf=7,
                                                     min_samples_split=18,
                                                     n_estimators=100)),
    StackingEstimator(estimator=BernoulliNB(alpha=1.0, fit_prior=True)),
    StackingEstimator(estimator=XGBClassifier(learning_rate=0.01,
                                              max_depth=9,
                                              min_child_weight=20,
                                              n_estimators=100,
                                              nthread=1,
                                              subsample=0.6000000000000001)),
    ExtraTreesClassifier(bootstrap=True,
                         criterion="gini",
                         max_features=0.7500000000000001,
                         min_samples_leaf=5,
                         min_samples_split=10,
                         n_estimators=100))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example No. 29
print(ada_score)
print(ada_cm)
print(ada_cr)

# --------------
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

#Parameter list
parameters = {
    'learning_rate': [0.1, 0.15, 0.2, 0.25, 0.3],
    'max_depth': range(1, 3)
}

# Code starts here
xgb_model = XGBClassifier(random_state=0)
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
xgb_score = accuracy_score(y_test, y_pred)
xgb_cm = confusion_matrix(y_test, y_pred)
xgb_cr = classification_report(y_test, y_pred)
print(xgb_score)
print(xgb_cm)
print(xgb_cr)
clf_model = GridSearchCV(estimator=xgb_model, param_grid=parameters)
clf_model.fit(X_train, y_train)
y_pred = clf_model.predict(X_test)
clf_score = accuracy_score(y_test, y_pred)
clf_cm = confusion_matrix(y_test, y_pred)
clf_cr = classification_report(y_test, y_pred)
print(clf_score)
Example No. 30
d_train_X_2 = pd.concat((d_train_X_0[numeric_features], d_train_X_1), axis=1)

var_drop_1 = [
    'm1_loan_sum', 'm3_loan_sum', 'm6_loan_sum', 'm12_loan_sum', 'm18_loan_sum'
]
var_drop_tz = [i for i in numeric_features if 'var_out' in i]
var_drop = var_drop_1 + var_drop_tz

d_train_X_3 = d_train_X_2.drop(var_drop, axis=1)

# XGBoost scikit-learn interface
from xgboost import XGBClassifier

# Tune learning_rate and n_estimators first, then min_child_weight, colsample_bytree and subsample

xgc = XGBClassifier(max_depth=2, objective='binary:logistic')

model_params = {
    'learning_rate': [0.05, 0.02],
    'n_estimators': [300],
    'colsample_bytree': [0.7],
    'min_child_weight': [5],
    'subsample': [0.7]
}

gs = GridSearchCV(estimator=xgc,
                  param_grid=model_params,
                  n_jobs=4,
                  cv=5,
                  verbose=1,
                  scoring=ks.ks_scorer)
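ks.ks_scorer is a project-specific scorer that is not shown. A Kolmogorov-Smirnov scorer for a binary classifier can be sketched with scikit-learn's make_scorer (an assumption about what the original module provides):

import numpy as np
from sklearn.metrics import make_scorer, roc_curve

def ks_statistic(y_true, y_prob):
    # KS statistic: the maximum gap between the TPR and FPR curves.
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    return np.max(tpr - fpr)

ks_scorer = make_scorer(ks_statistic, needs_proba=True)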
Example No. 31
y = dataset['legitimate'].values

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.20,
                                                    random_state=0)

# Feature Scaling
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting XGBoost to the training set
classifier = XGBClassifier(max_depth=20,
                           learning_rate=0.3,
                           n_estimators=150,
                           verbose=10)
classifier.fit(X_train, y_train)

# predict the test results
y_pred = classifier.predict(X_test)

# Making the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Applying K-Fold cross validation
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
print(accuracies.mean())
print(accuracies.std())
Example No. 32
    print('=============================================')
    print('=============================================')
    print(gsearch1.best_params_, gsearch1.best_score_)
    print('=============================================')
    print('=============================================')
    print('=============================================')

    param_test1 = {
        'max_depth': [3, 4, 5, 6, 7],
        'min_child_weight': [3, 5, 7],
        'gamma': [i / 10.0 for i in range(0, 5, 2)],
        'subsample': [i / 10.0 for i in range(5, 10, 2)],
        'colsample_bytree': [i / 10.0 for i in range(5, 10, 2)],
        'objective': ['binary:logistic']
    }
    model = XGBClassifier()
    gsearch1 = GridSearchCV(estimator=XGBClassifier(),
                            param_grid=param_test1,
                            scoring='accuracy',
                            n_jobs=-1,
                            cv=5,
                            verbose=1)

    gsearch1.fit(train, target)

    print('=============================================')
    print('=============================================')
    print('=============================================')
    print(gsearch1.best_params_, gsearch1.best_score_)
    print('=============================================')
    print('=============================================')
Example No. 33
def cv_BDT(input, output, params, show, channel, selection, names):

    # model = XGBClassifier()

    cvscores = []
    AUC = []

    cvscores_train = []
    AUC_train = []
    kfold = StratifiedKFold(5, True, 3456)
    for train, test in kfold.split(input, output):
        model = XGBClassifier(**params)
        X_train, X_test, y_train, y_test = (
            input[train],
            input[test],
            output[train],
            output[test],
        )
        model.fit(X_train, y_train)

        y_prob = model.predict_proba(X_test)
        y_pred = model.predict(X_test)
        prediction = [round(value) for value in y_pred]
        auc = roc_auc_score(y_test, y_prob[:, 1])
        accuracy = accuracy_score(y_test, prediction)
        print("Accuracy: %.2f%%; AUC = %.4f%%" % (accuracy * 100, auc))
        cvscores.append(accuracy * 100)
        AUC.append(auc)

        y_prob = model.predict_proba(X_train)
        y_pred = model.predict(X_train)
        prediction = [round(value) for value in y_pred]
        auc = roc_auc_score(y_train, y_prob[:, 1])
        accuracy = accuracy_score(y_train, prediction)
        print("Accuracy train: %.2f%%; AUC = %.4f%%" % (accuracy * 100, auc))
        cvscores_train.append(accuracy * 100)
        AUC_train.append(auc)

    print("Accuracy test = %.2f%% (+/- %.2f%%); AUC = %.4f (+/- %.4f)" %
          (np.mean(cvscores), np.std(cvscores), np.mean(AUC), np.std(AUC)))
    print("Accuracy train = %.2f%% (+/- %.2f%%); AUC = %.4f (+/- %.4f)" % (
        np.mean(cvscores_train),
        np.std(cvscores_train),
        np.mean(AUC_train),
        np.std(AUC_train),
    ))
    if show:

        name = "channel_" + str(channel) + "_BDT"
        name = "%s_%s" % (name, selection)
        modelname = "models/%s.h5" % name
        print("Save to %s" % modelname)
        plotter.plot_separation(model, X_test, y_test, name, False)
        plotter.plot_ROC(model, X_test, y_test, name, False)
        model.get_booster().feature_names = names
        mp.rc("figure", figsize=(5, 5))

        plot_importance(model.get_booster(),
                        max_num_features=15,
                        importance_type="gain")
        plt.subplots_adjust(left=0.3)
        plt.show()
Example No. 34
     # Divide the data set into a training and testing sets, each time with a different RNG seed
     training_indices, testing_indices = next(iter(StratifiedShuffleSplit(input_data['class'].values,
                                                                          n_iter=1,
                                                                          train_size=0.75,
                                                                          test_size=0.25,
                                                                          random_state=dataset_repeat)))
 
     training_features = input_data.loc[training_indices].drop('class', axis=1).values
     training_classes = input_data.loc[training_indices, 'class'].values
 
     testing_features = input_data.loc[testing_indices].drop('class', axis=1).values
     testing_classes = input_data.loc[testing_indices, 'class'].values
 
     # Create and fit the model on the training data
     try:
         clf = XGBClassifier(learning_rate=learning_rate, n_estimators=n_estimators, max_depth=max_depth)
         clf.fit(training_features, training_classes)
         testing_score = clf.score(testing_features, testing_classes)
     except:
         continue
 
     param_string = ''
     param_string += 'learning_rate={},'.format(learning_rate)
     param_string += 'n_estimators={},'.format(n_estimators)
     param_string += 'max_depth={}'.format(max_depth)
 
     out_text = '\t'.join([dataset.split('/')[-1][:-7],
                           'XGBClassifier',
                           param_string,
                           str(testing_score)])
 
Example No. 35
                        #XXX[1,i]=minmax[1]  
                        minmax=band.ComputeStatistics(1)
                        XXX[0,i]=minmax[2]
                        XXX[1,i]=minmax[3] 
    return XXX
    
Xtrain,Ytrain=gen_training(4000)
XXX=gen_scale(sel)
joblib.dump(XXX, path+'Mosquito-Modeling/Climate/data/XXX.pkl')
 
for i in range(Xtrain.shape[1]):
    # Xtrain[:,i]=(Xtrain[:,i]-XXX[0,i])/(XXX[1,i]-XXX[0,i])
    Xtrain[:,i]=(Xtrain[:,i]-XXX[0,i])/XXX[1,i]
    
# define the cross validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
forest=XGBClassifier(learning_rate=0.01,max_depth=9,n_estimators=700)
kfold=KFold(n_splits=10)
scores = cross_val_score(forest, Xtrain, Ytrain, cv=kfold)
print(scores)
print('Accuracy: %.2f%% (%.2f%%)' % (scores.mean()*100,scores.std()*100))
clf = svm.SVC(kernel='rbf', C=1000)
scores = cross_val_score(clf, Xtrain, Ytrain, cv=kfold)
print(scores)
print('Accuracy: %.2f%% (%.2f%%)' % (scores.mean()*100,scores.std()*100))
clf = svm.SVC(kernel='rbf', C=1000, probability=True)
clf.fit(Xtrain, Ytrain)
joblib.dump(clf,path+'Mosquito-Modeling/Climate/data/clf.pkl')

Example No. 36
def hyperopt_xgb_score(params):
    clf = XGBClassifier(**params)
    current_score = cross_val_score(clf, X, y, cv=3).mean()
    print(current_score, params)
    return -current_score
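hyperopt_xgb_score returns a value to minimise, i.e. it is shaped as an objective for hyperopt's fmin. A hedged wiring sketch with an illustrative search space (the original space definition is not shown):

from hyperopt import fmin, hp, tpe

space = {
    'max_depth': hp.choice('max_depth', [3, 5, 7, 9]),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'n_estimators': hp.choice('n_estimators', [100, 200, 400]),
    'subsample': hp.uniform('subsample', 0.6, 1.0),
}
best = fmin(fn=hyperopt_xgb_score, space=space, algo=tpe.suggest, max_evals=25)
print(best)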
print("Shape of data after applying PCA: ", X.shape)


# In[11]:


from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
kf = KFold(n_splits=3)
kf.get_n_splits(X)
print(kf)
finalaccuracy=[]
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    xgmodel = XGBClassifier()
    xgmodel.fit(X_train, y_train)
    y_pred = xgmodel.predict(X_test)
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)
    print(accuracy*100.0)
    finalaccuracy.append(accuracy*100.0)
print("Accuracy: ", sum(finalaccuracy)/float(len(finalaccuracy)))


# In[12]:


from sklearn.ensemble import AdaBoostClassifier
finalaccuracy=[]
for train_index, test_index in kf.split(X):
Example No. 38
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
onehotencoder = OneHotEncoder(categorical_features=[1])
X = onehotencoder.fit_transform(X).toarray()
X = X[:, 1:]

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

# Fitting XGBoost to the Training set
from xgboost import XGBClassifier
classifier = XGBClassifier()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
accuracies.mean()
accuracies.std()
Example No. 39
digits = datasets.load_digits()
x = digits.data
y = digits.target
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2)

# A parameter grid for XGBoost
params = {
    "min_child_weight": [1, 5, 10],
    "gamma": [0.5, 1, 1.5, 2, 5],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "max_depth": [3, 4, 5],
}

xgb = XGBClassifier(
    learning_rate=0.02,
    n_estimators=50,
    objective="binary:logistic",
    silent=True,
    nthread=1,
)

digit_search = TuneSearchCV(xgb,
                            param_distributions=params,
                            n_iter=3,
                            use_gpu=True)

digit_search.fit(x_train, y_train)
print(digit_search.best_params_)
print(digit_search.cv_results_)
Example No. 40
    def objective(space):
        ### MODEL SELECTION

        if model_name == "lr":
            # logistic regression
            from sklearn.linear_model import LogisticRegression
            model = LogisticRegression(**space)

        elif model_name == "rf":
            # print("Setting model as RandomForestClassifier")
            from sklearn.ensemble import RandomForestClassifier

            model = RandomForestClassifier(**space, n_jobs=-1)
            if verbose:
                print("Hyperparameters: ", space)

        elif model_name == "xgb":
            # print("Setting model as XGBClassifier")
            from xgboost import XGBClassifier

            model = XGBClassifier(**space,
                                  objective="binary:logistic",
                                  nthread=-1)
            if verbose:
                print("Hyperparameters: ", space)

        elif model_name == "dt":
            # print("Setting model as DecisionTreeClassifier")
            from sklearn.tree import DecisionTreeClassifier

            model = DecisionTreeClassifier(**space)
            if verbose:
                print("Hyperparameters: ", space)

        elif model_name == "catboost":
            # print("Setting model as CatBoost")
            from catboost import CatBoostClassifier

            model = CatBoostClassifier(**space)
            if verbose:
                print("Hyperparameters: ", space)

        elif model_name == "extratrees":
            # print("Setting model as CatBoost")
            from sklearn.ensemble import ExtraTreesClassifier

            model = ExtraTreesClassifier(**space, n_jobs=-1)
            if verbose:
                print("Hyperparameters: ", space)

        elif model_name == "svc":
            from sklearn.svm import SVC

            model = SVC(**space)
            if verbose:
                print("Hyperparameters: ", space)

        elif model_name == "ann":
            # print("Setting model as ANN")
            from sklearn import neural_network

            model = neural_network.MLPClassifier(**space)
            if verbose:
                print("Hyperparameters: ", space)

        elif model_name == "lgb":
            import lightgbm as lgb

            model = lgb.LGBMClassifier(**space, n_jobs=-1, random_state=42)

            if verbose:
                print("Hyperparameters: ", space)

        elif model_name == "knn":
            from sklearn.neighbors import KNeighborsClassifier

            model = KNeighborsClassifier(**space)

            if verbose:
                print("Hyperparameters: ", space)

        else:
            # print("ERRO: Especifique um nome valido para model_name: rf, xgb, dt ou catboost")
            raise Exception(
                "Invalid model_name - Please specify one of the supported model_name values: lr, rf, xgb, dt, catboost, extratrees, svc, ann, lgb or knn"
            )
        score = cross_val_score(model,
                                x_train,
                                y_train,
                                cv=3,
                                scoring=scoring,
                                verbose=False,
                                n_jobs=-1).mean()
        score = 1 - score  # invert so the score can be minimised

        return score
Example No. 41
def opt_BDT(input, output, params, show, names):

    model = XGBClassifier(**params)
    xgb_param = model.get_xgb_params()
    cvscores = []
    AUC = []
    X_train, X_test, y_train, y_test = train_test_split(input,
                                                        output,
                                                        test_size=0.2,
                                                        random_state=42)
    matrix_train = xgb.DMatrix(X_train, label=y_train)
    cvresult = xgb.cv(
        xgb_param,
        matrix_train,
        num_boost_round=model.get_params()["n_estimators"],
        nfold=5,
        metrics="auc",
        early_stopping_rounds=30,
        verbose_eval=True,
    )
    model.set_params(n_estimators=cvresult.shape[0])
    model.fit(X_train, y_train, eval_metric="auc")
    y_prob = model.predict_proba(X_test)
    y_pred = model.predict(X_test)
    prediction = [round(value) for value in y_pred]
    auc = roc_auc_score(y_test, y_prob[:, 1])
    accuracy = accuracy_score(y_test, prediction)

    print("Accuracy: %.2f%%; AUC = %.4f%" % (accuracy * 100, auc))
    if show:

        name = "channel_" + str(channel) + "_BDT"
        name = "%s_%s" % (name, selection)
        modelname = "models/%s.h5" % name
        print("Save to %s" % modelname)

        plotter.plot_separation(model, X_test, y_test, name, False)
        plotter.plot_ROC(model, X_test, y_test, name, False)
        model.get_booster().feature_names = names
        mp.rc("figure", figsize=(5, 5))
        plot_importance(model.get_booster())
        plt.subplots_adjust(left=0.3)
        plt.show()
Example No. 42
from xgboost import XGBClassifier

X_train_no_last_crop = X_train_no_last_crop.reset_index()
X_train_no_last_crop = X_train_no_last_crop.iloc[:, 1:]

# Search for best xgb params.
# A parameter grid for XGBoost
#params = {
#        'min_child_weight': [1, 5, 10],
#        'gamma': [0.5, 1, 1.5, 2, 5],
#        'subsample': [0.6, 0.8, 1.0],
#        'colsample_bytree': [0.6, 0.8, 1.0],
#        'max_depth': [3, 4, 5]
#        }
# Define classifier.
XGB_clf = XGBClassifier(learning_rate=0.02, n_estimators=200, silent=True,
                        objective="multi:softmax", scoring="roc_auc")

## Create grid search.
#folds = 3
#param_comb = 5
#
#skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)
#
#random_search = RandomizedSearchCV(XGB_clf, param_distributions=params, n_iter=param_comb, scoring='roc_auc', 
#                                   n_jobs=4, cv=skf.split(X_train_no_last_crop,y_train), 
#                                   verbose=3, random_state=1001 )

# Here we go
#random_search.fit(X_train_no_last_crop, y_train)

# Print the best estimator.
Example No. 43
class classifier:
    def __init__(self):
        self.model = XGBClassifier()
        self.progress = 0

    def para_tuning(
        self,
        X,
        y,
        para,
        grid,
        seed=0,
        verbose=False
    ):  # verbose = 1 for tuning log, verbose = 2 for plotting, verbose = 3 for both

        # determine which to parameter to tune this time
        if para == '':
            return None
        elif para == 'learning_rate':
            param_grid = dict(learning_rate=grid)  # [0,0.1]
        elif para == 'max_depth':
            param_grid = dict(max_depth=grid)  # int
        elif para == 'min_child_weight':
            param_grid = dict(min_child_weight=grid)  # [0,1]
        elif para == 'gamma':
            param_grid = dict(gamma=grid)  # [0,1]
        elif para == 'max_delta_step':
            param_grid = dict(max_delta_step=grid)  # int
        elif para == 'colsample_bytree':
            param_grid = dict(colsample_bytree=grid)  # [0,1]
        elif para == 'reg_alpha':
            param_grid = dict(reg_alpha=grid)  # [0,1]
        elif para == 'reg_lambda':
            param_grid = dict(reg_lambda=grid)  # [0,1]
        else:
            print('WRONG PARAMETER.')
            return None
        kfold = StratifiedKFold(n_splits=8, shuffle=True, random_state=seed)
        grid_search = GridSearchCV(self.model,
                                   param_grid,
                                   scoring='accuracy',
                                   n_jobs=-1,
                                   cv=kfold)
        grid_result = grid_search.fit(X, y)
        # summarize results
        means = grid_result.cv_results_['mean_test_score']
        stds = grid_result.cv_results_['std_test_score']
        params = grid_result.cv_results_['params']
        if verbose == 1 or verbose == 3:
            for mean, stdev, param in zip(means, stds, params):
                print('{:.4f} ({:.4f}) WITH: {} = {}'.format(
                    mean, stdev, para,
                    list(param.values())[0]))
            print('-' * 63)
        self.progress += 1
        progress = int(self.progress / 7 * 100)
        progress_bar = int(self.progress / 7 * 58)
        print('\r' + '█' * progress_bar + ' ' * (58 - progress_bar) +
              ' {:>3}%'.format(progress),
              end='')
        if verbose == 2 or verbose == 3:
            # plot
            plt.close()
            plt.figure(figsize=(20, 10))
            plt.errorbar(grid, means, yerr=stds)
            plt.title('XGBoost {} Tuning'.format(para))
            plt.xlabel(para)
            plt.ylabel('accuracy')
            plt.show()
        return list(grid_result.best_params_.values())[0]

    def tune(self, X, y, verbose=False, seed=0):
        self.model.seed = seed
        # fit model no training data
        print('-' * 63)
        print('AUTO TUNING ON TRAINING DATASET.')
        self.model.n_estimators = 1024
        self.model.subsample = 0.6
        self.model.learning_rate = 0.01

        self.model.max_depth = self.para_tuning(X, y, 'max_depth',
                                                [2, 4, 6, 8], seed, verbose)
        self.model.min_child_weight = self.para_tuning(X, y,
                                                       'min_child_weight',
                                                       [4, 8, 12, 16], seed,
                                                       verbose)
        self.model.gamma = self.para_tuning(
            X, y, 'gamma', [0, 0.1, 0.2, 0.4, 0.8, 1.6, 3.2, 6.4, 12.8], seed,
            verbose)
        self.model.max_delta_step = self.para_tuning(X, y, 'max_delta_step',
                                                     [0, 1, 2, 4], seed,
                                                     verbose)
        self.model.colsample_bytree = self.para_tuning(X, y,
                                                       'colsample_bytree',
                                                       [0.5, 0.6, 0.7], seed,
                                                       verbose)
        self.model.reg_alpha = self.para_tuning(X, y, 'reg_alpha',
                                                [0, 0.001, 0.01, 0.1, 10, 100],
                                                seed, verbose)
        self.model.reg_lambda = self.para_tuning(
            X, y, 'reg_lambda', [0, 0.001, 0.01, 0.1, 10, 100], seed, verbose)
        self.model.learning_rate /= 2

        sleep(3)
        print('\rAUTO TUNING FINISHED.' + ' ' * 42)
        print('-' * 63)
        if input('MODEL REVIEWING? (Y/N) ') == 'Y':
            print(self.model)

    def train(self, data, early_stopping_rounds=None, verbose=True, seed=0):
        X_train, y_train = data.train[0], data.train[1]
        X_test, y_test = data.test[0], data.test[1]

        # tune paramters using trainging dataset
        self.tune(X_train, y_train, seed=seed)
        print('-' * 63)
        # train the model with optimized parameters
        print('MODEL TRAINING.')
        metric = ['error', 'logloss', 'auc']
        #         self.model.min_child_weight = 4
        self.model.fit(X_train,
                       y_train,
                       eval_metric=metric,
                       eval_set=[(X_train, y_train), (X_test, y_test)],
                       early_stopping_rounds=early_stopping_rounds,
                       verbose=False)

        # make predictions for train data
        y_pred = self.model.predict(X_train)
        predictions = [round(value) for value in y_pred]
        # evaluate predictions
        accuracy = accuracy_score(y_train, predictions)
        print('TRAINING FINISHED.')
        print('ACCURACY TRAINING: {:.2f}%'.format(accuracy * 100))

        # make predictions for test data
        y_pred = self.model.predict(X_test)
        predictions = [round(value) for value in y_pred]
        # evaluate predictions
        accuracy = accuracy_score(y_test, predictions)
        print('ACCURACY TESTING: {:.2f}%'.format(accuracy * 100))

        if verbose is True:
            try:
                # plot boosting results
                results = self.model.evals_result()
                epochs = len(results['validation_0'][metric[0]])
                x_axis = range(0, epochs)
                plt.style.use('ggplot')
                plt.rcParams['font.size'] = 8
                plt.figure(figsize=(20, 10))
                i = 0
                for m in metric:
                    ax = plt.subplot2grid((len(metric), 2), (i, 0))
                    i += 1
                    ax.plot(x_axis, results['validation_0'][m], label='Train')
                    ax.plot(x_axis, results['validation_1'][m], label='Test')
                    ax.legend()
                    ax.set_ylabel(m)
                # plot feature importances
                features = data.features
                mapFeat = dict(
                    zip(['f' + str(i) for i in range(len(features))],
                        features))
                imp = pd.Series(self.model.get_booster().get_fscore())
                imp.index = imp.reset_index()['index'].map(mapFeat)
                ax = plt.subplot2grid((len(metric), 2), (0, 1),
                                      rowspan=len(metric))
                imp.sort_values().plot(kind='barh')
                ax.set_ylabel('importance')
                plt.show()
            except Exception:
                print('PLOTTING ERROR.')
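The para_tuning helper called throughout tune() is not shown in this example. A minimal sketch of what that method might look like, assuming it simply cross-validates one candidate value at a time and returns the best one (everything below except the name and call signature is an assumption):

from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

def para_tuning(self, X, y, name, values, seed=0, verbose=True):
    # Try each candidate value for a single parameter and keep the one with
    # the best cross-validated log loss; all other parameters are unchanged.
    best_value, best_score = values[0], -float('inf')
    for v in values:
        params = self.model.get_params()
        params[name] = v
        params['random_state'] = seed
        score = cross_val_score(XGBClassifier(**params), X, y,
                                scoring='neg_log_loss', cv=3).mean()
        if verbose:
            print('\rTUNING {} = {} ({:.5f})'.format(name, v, score), end='')
        if score > best_score:
            best_value, best_score = v, score
    return best_value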
Exemplo n.º 44
0
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
print(classification_report(y_test, y_pred_svc))
print(confusion_matrix(y_test, y_pred_svc))
from sklearn.linear_model import LogisticRegression
# Building pipeline
text_clf_lr = Pipeline([('tfidf', TfidfVectorizer()),
                        ('clf', LogisticRegression())])
# Fitting and generating predictions
text_clf_lr.fit(X_train, y_train)
y_pred_lr = text_clf_lr.predict(X_test)
print(classification_report(y_test, y_pred_lr))
print(confusion_matrix(y_test, y_pred_lr))
from xgboost import XGBClassifier
# Building pipeline
text_clf_xgb = Pipeline([('tfidf', TfidfVectorizer()),
                         ('clf', XGBClassifier())])
# Fitting and generating predictions
text_clf_xgb.fit(X_train, y_train)
y_pred_xgb = text_clf_xgb.predict(X_test)
print(classification_report(y_test, y_pred_xgb))
print(confusion_matrix(y_test, y_pred_xgb))
from sklearn.ensemble import RandomForestClassifier
# Building pipeline
text_clf_rf = Pipeline([('tfidf', TfidfVectorizer()),
                        ('clf', RandomForestClassifier())])
# Fitting and generating predictions
text_clf_rf.fit(X_train, y_train)
y_pred_rf = text_clf_rf.predict(X_test)
print(classification_report(y_test, y_pred_rf))
print(confusion_matrix(y_test, y_pred_rf))
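For reference, the four pipelines above can be lined up by test accuracy. This is only a sketch; the list name and layout are assumptions and not the contents of the truncated model_performance line that follows:

from sklearn.metrics import accuracy_score

model_accuracy_sketch = [
    ('SVC', accuracy_score(y_test, y_pred_svc)),
    ('LogisticRegression', accuracy_score(y_test, y_pred_lr)),
    ('XGBClassifier', accuracy_score(y_test, y_pred_xgb)),
    ('RandomForestClassifier', accuracy_score(y_test, y_pred_rf)),
]
for name, acc in sorted(model_accuracy_sketch, key=lambda t: -t[1]):
    print('{:<24s} {:.4f}'.format(name, acc))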
model_performance = [
def print_results(dataset, set1, set2):
    X_set1, y_set1 = prepare_full_dataset(
        dataset.loc[dataset['patient_ID'].isin(set1)])
    X_set2, y_set2 = prepare_full_dataset(
        dataset.loc[dataset['patient_ID'].isin(set2)])

    #    X_set1 = np.random.rand(*X_set1.shape)
    #    X_set2 = np.random.rand(*X_set2.shape)

    X_set1_wf = add_one_features(X_set1, 0)
    X_set2_wf = add_one_features(X_set2, 1)

    X_genes_wf = np.concatenate([X_set1_wf, X_set2_wf])
    y_all = np.concatenate([y_set1, y_set2])

    kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10)
    print_order = [
        "genes", "genes_set", "genes_biased", "genes_double", "study"
    ]

    max_len_order = max(map(len, print_order))

    rez = defaultdict(list)

    for i, (train_index, test_index) in enumerate(kf.split(X_genes_wf, y_all)):
        X_genes_wf_train, X_genes_wf_test = X_genes_wf[
            train_index], X_genes_wf[test_index]
        y_train, y_test = y_all[train_index], y_all[test_index]

        print("before balanced")
        print_count_two_sets(X_genes_wf_train[:, 0], y_train)
        print_count_two_sets(X_genes_wf_test[:, 0], y_test)
        #        print("counter before balance", Counter(X_genes_wf_train[:,0]), Counter(X_genes_wf_test[:,0]), Counter(y_train), Counter(y_test))
        X_genes_wf_train, y_train = random_upsample_balance(
            X_genes_wf_train, y_train)
        X_genes_wf_test, y_test = random_upsample_balance(
            X_genes_wf_test, y_test)
        #        print("counter after balance", Counter(X_genes_wf_train[:,0]), Counter(X_genes_wf_test[:,0]), Counter(y_train), Counter(y_test))
        print("after balanced")
        print_count_two_sets(X_genes_wf_train[:, 0], y_train)
        print_count_two_sets(X_genes_wf_test[:, 0], y_test)

        X_genes_train = X_genes_wf_train[:, 1:]
        X_genes_test = X_genes_wf_test[:, 1:]

        Xs_train = X_genes_wf_train[:, :1]
        Xs_test = X_genes_wf_test[:, :1]

        rez["genes"].append(
            calc_results_simple(X_genes_train, X_genes_test, y_train, y_test,
                                XGBClassifier()))
        rez["genes_set"].append(
            calc_results_simple(X_genes_wf_train, X_genes_wf_test, y_train,
                                y_test, XGBClassifier()))
        rez["genes_biased"].append(
            calc_results_simple(X_genes_wf_train, X_genes_wf_test, y_train,
                                y_test, BiasedXgboost()))
        rez["genes_double"].append(
            calc_results_simple(X_genes_wf_train, X_genes_wf_test, y_train,
                                y_test, DoubleXgboost()))
        rez["study"].append(
            calc_results_simple(Xs_train, Xs_test, y_train, y_test,
                                XGBClassifier()))

        for order in print_order:
            print(order, " " * (max_len_order - len(order)), ": ",
                  list_to_4g_str(rez[order][-1]))
        print("")

    for order in print_order:
        print("==> ", order, " " * (max_len_order - len(order)), ": ",
              list2d_to_4g_str_pm(rez[order]))
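calc_results_simple, BiasedXgboost, DoubleXgboost and the other helpers above are defined elsewhere. As a rough guide only, calc_results_simple might look like the following sketch, assuming it fits the supplied classifier and returns a short list of test metrics:

from sklearn.metrics import accuracy_score, roc_auc_score

def calc_results_simple(X_tr, X_te, y_tr, y_te, clf):
    # Fit the given classifier and report accuracy and AUC on the test fold.
    clf.fit(X_tr, y_tr)
    proba = clf.predict_proba(X_te)[:, 1]
    pred = (proba >= 0.5).astype(int)
    return [accuracy_score(y_te, pred), roc_auc_score(y_te, proba)]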
Exemplo n.º 46
0
nplot = 1  # running subplot index; assumed starting value (not shown in the original fragment)
for col in numeric_cols:
    plt.subplot(int(np.ceil(len(numeric_cols)/3)),3,nplot)
    sns.distplot(zeros[col],hist=False,label='Misses')
    sns.distplot(ones[col],hist=False,label='Hits')
    nplot+=1
    plt.legend()
plt.tight_layout()
plt.show()




# %% Modelling
###############################################

estimator = XGBClassifier()
# estimator = LogisticRegression()
param_grid={
            'max_depth':[3,5,12],
            'n_estimators':[50,100,200],
            'objective':['binary:logistic']
            }
# param_grid={}

clf = GridSearchCV(estimator,param_grid,scoring='precision',cv=2,verbose=2,n_jobs=-1)

clf.fit(X_train,y_train)

preds_proba = clf.predict_proba(X_test)[:,1]
threshold = 0.35
# preds = clf.predict(X_test)
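A hedged follow-up, not part of the original snippet: apply the chosen threshold to the predicted probabilities and inspect the winning grid point (y_test is assumed to be the matching test labels):

from sklearn.metrics import precision_score, recall_score

preds = (preds_proba >= threshold).astype(int)
print('best params:', clf.best_params_)
print('precision  : %.3f' % precision_score(y_test, preds))
print('recall     : %.3f' % recall_score(y_test, preds))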
Exemplo n.º 47
0
lparams['bagging_freq'] = 6
#lparams['early_stopping_round'] = 20
cparams['n_estimators'] = 120
cparams['max_depth'] = 3
#cparams['l2_leaf_reg'] = 0.001

if use_gpu:
    xparams['tree_method'] = 'gpu_hist'
    xparams['predictor'] = 'gpu_predictor'
    xparams['objective'] = 'gpu:binary:logistic'
    n_jobs = 1
else:
    xparams['objective'] = 'binary:logistic'
    lparams['objective'] = 'binary'

xgbm = XGBClassifier(**xparams)
lgbm = LGBMClassifier(**lparams)
cgbm = CatBoostClassifier(**cparams)
rdf = RandomForestClassifier()
classifiers = [rdf, xgbm, lgbm]
classifiers = [xgbm, lgbm, cgbm]
classifiers = [xgbm, lgbm]
lr = LogisticRegression(C=0.1)
grid = StackingClassifier(classifiers=classifiers,
                          use_probas=True,
                          average_probas=False,
                          meta_classifier=lr)

n_estimators = [100, 300]
n_estimators = sp_randint(250, 500)
max_depth = [2, 3]
    cv = []
    biter = []
    for fold, (itr, icv) in enumerate(skf):

        print "------ Fold %d -----------\n" % (fold + 1)
        X_train = train_processed.iloc[itr]
        X_valid = train_processed.iloc[icv]
        Y_train = target[itr]
        Y_valid = target[icv]

        gbm = XGBClassifier(max_depth=8,
                            learning_rate=0.01,
                            n_estimators=10000,
                            subsample=0.9,
                            colsample_bytree=0.45,
                            objective="binary:logistic",
                            silent=False,
                            min_child_weight=1,
                            nthread=-1)

        gbm.fit(X_train,
                Y_train,
                eval_metric="logloss",
                eval_set=[(X_train, Y_train), (X_valid, Y_valid)],
                early_stopping_rounds=200,
                verbose=20)

        ll = gbm.best_score
        best_iter = gbm.best_iteration
        cv.append(ll)
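The fold loop above stops before the scores are summarized. One plausible continuation, a sketch under the assumption that the model is refit on the full training frame near the last recorded best iteration:

import numpy as np
from xgboost import XGBClassifier

print("fold logloss :", ["%.5f" % v for v in cv])
print("mean logloss : %.5f +/- %.5f" % (np.mean(cv), np.std(cv)))
# Refit on all training data with roughly the number of rounds found above.
final_gbm = XGBClassifier(max_depth=8, learning_rate=0.01,
                          n_estimators=int(best_iter * 1.1), subsample=0.9,
                          colsample_bytree=0.45, min_child_weight=1,
                          objective="binary:logistic", nthread=-1)
final_gbm.fit(train_processed, target)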
Exemplo n.º 49
0
crossval_splits = 5
accuracy = numpy.zeros(crossval_splits)
sensitivity = numpy.zeros(crossval_splits)
specificity = numpy.zeros(crossval_splits)
cont = 0

skf = StratifiedKFold(n_splits=crossval_splits, shuffle=True, random_state=123)
skf.get_n_splits(data, labels)

for train_index, test_index in skf.split(data, labels):
    train_data, test_data = data[train_index], data[test_index]
    train_labels, test_labels = labels[train_index], labels[test_index]
    #XGB Classifier
    model = XGBClassifier(use_label_encoder=False,
                          booster='gbtree',
                          random_state=123)
    model.fit(train_data, train_labels)

    #Compute scores
    pred = model.predict(test_data)
    predictions = [round(value) for value in pred]
    predictions = numpy.asarray(predictions)
    ConfusionMatrix = numpy.zeros((no_classes, no_classes))
    for i in range(pred.shape[0]):
        ConfusionMatrix[test_labels[i], predictions[i]] += 1.0

    for i in range(no_classes):
        accuracy[cont] += ConfusionMatrix[i, i]
    accuracy[cont] /= ConfusionMatrix.sum()
    print('accuracy: ' + str(accuracy[cont]))

def print_report(y_actual, y_pred, thresh):
    # function header assumed from the print_report(...) calls further below
    auc = roc_auc_score(y_actual, y_pred)
    accuracy = accuracy_score(y_actual, (y_pred > thresh))
    recall = recall_score(y_actual, (y_pred > thresh))
    precision = precision_score(y_actual, (y_pred > thresh))
    specificity = calc_specificity(y_actual, y_pred, thresh)
    print('AUC:%.3f' % auc)
    print('accuracy:%.3f' % accuracy)
    print('recall:%.3f' % recall)
    print('precision:%.3f' % precision)
    print('specificity:%.3f' % specificity)
    print('prevalence:%.3f' % rate(y_actual))
    print(' ')
    return auc, accuracy, recall, precision, specificity


from xgboost import XGBClassifier
import xgboost as xgb
xgbc = XGBClassifier()
xgbc.fit(X_train_tf, y_train)

y_train_preds = xgbc.predict_proba(X_train_tf)[:, 1]
y_valid_preds = xgbc.predict_proba(X_valid_tf)[:, 1]

print('eXtreme Gradient Boosting Classifier')
print('Training: ')
xgbc_train_auc, xgbc_train_accuracy, xgbc_train_recall, xgbc_train_precision, xgbc_train_specificity = print_report(
    y_train, y_train_preds, thresh)
print('Validation: ')
xgbc_valid_auc, xgbc_valid_accuracy, xgbc_valid_recall, xgbc_valid_precision, xgbc_valid_specificity = print_report(
    y_valid, y_valid_preds, thresh)
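calc_specificity and rate are used by print_report but not defined in this fragment. Minimal sketches, assuming specificity is the true-negative rate at the given threshold and rate is the class-1 prevalence:

import numpy as np

def calc_specificity(y_actual, y_pred, thresh):
    # True-negative rate of the thresholded probabilities.
    y_true = np.asarray(y_actual)
    y_hat = (np.asarray(y_pred) >= thresh).astype(int)
    tn = np.sum((y_hat == 0) & (y_true == 0))
    fp = np.sum((y_hat == 1) & (y_true == 0))
    return tn / float(tn + fp)

def rate(y_actual):
    # Fraction of positive cases (prevalence).
    return float(np.mean(np.asarray(y_actual) == 1))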
Exemplo n.º 51
0
model.fit(X_train, Y_train)

from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=300, random_state=0)
model.fit(X_train, Y_train)

from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor(random_state=0)
model.fit(X_train, Y_train)

from sklearn.svm import SVR
model = SVR(kernel='rbf')
model.fit(X_train, Y_train)

from xgboost import XGBClassifier
model = XGBClassifier()
model.fit(X_train, Y_train)

from sklearn.neighbors import KNeighborsRegressor
model = KNeighborsRegressor(n_neighbors=5, metric='manhattan')
model.fit(X_train, Y_train)

from sklearn.linear_model import Lasso
model = Lasso()
model.fit(x_train, y_train)

from sklearn.linear_model import Ridge
model = Ridge()
model.fit(x_train, y_train)

from sklearn.linear_model import ElasticNet
X=ohe.fit_transform(X).toarray()
X = X[:,1:]

#%%
# split the data into training and test sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=0)

#%%
# scaling the data
""" I DON'T THINK FEATURE SCALING NEEDS TO BE APPLIED HERE """

#%% 
""" XGBoost """
from xgboost import XGBClassifier
classifier = XGBClassifier()
classifier.fit(x_train, y_train)  # after all, this is a classifier, so XGBoost is a classification algorithm too

y_pred = classifier.predict(x_test)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)
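A small hedged addition, not in the original fragment: the confusion matrix above can be summarized as an overall accuracy, computed either with sklearn or from the matrix itself:

import numpy as np
from sklearn.metrics import accuracy_score

print('accuracy (accuracy_score):', accuracy_score(y_test, y_pred))
print('accuracy (from cm)       :', np.trace(cm) / cm.sum())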
Exemplo n.º 53
0
print('We have %d classes and %d models TOTAL so in resulting arrays \
we expect to see %d columns.' % (n_classes, len(models_1) + len(models_2), 
                                 n_classes * (len(models_1) + len(models_2))))

# Create empty arrays
S_train_all = np.zeros((X_train.shape[0], 0))
S_test_all = np.zeros((X_test.shape[0], 0))

# Load results
for name in sorted(glob('*.npy')):
    print('Loading: %s' % name)
    S = np.load(name)
    S_train_all = np.c_[S_train_all, S[0]]
    S_test_all = np.c_[S_test_all, S[1]]
    
print('\nS_train_all shape:', S_train_all.shape)
print('S_test_all shape: ', S_test_all.shape)

# Initialize 2nd level model
model = XGBClassifier(random_state=0, n_jobs=-1, learning_rate=0.1, 
                      n_estimators=100, max_depth=3)
    
# Fit 2nd level model
model = model.fit(S_train_all, y_train)

# Predict
y_pred = model.predict_proba(S_test_all)

# Final prediction score
print('Final prediction score: %.8f' % log_loss(y_test, y_pred))
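Each '*.npy' file loaded above is assumed to hold a (train, test) pair of level-1 probability matrices. A hedged sketch of how one such file might be produced with out-of-fold predictions; the helper name and file name are made up for illustration:

import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_predict

def save_level1_predictions(model, name):
    # Out-of-fold probabilities for the train set, full-fit for the test set.
    s_train = cross_val_predict(model, X_train, y_train, cv=5,
                                method='predict_proba')
    s_test = model.fit(X_train, y_train).predict_proba(X_test)
    np.save(name, np.array([s_train, s_test], dtype=object))

save_level1_predictions(XGBClassifier(n_estimators=100), 'xgb_level1.npy')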
Exemplo n.º 54
0
def fit_xgboost(params, X, y):
    clf = XGBClassifier(**params)
    clf.fit(X, y)
    return clf
_ = death_preds.actual.value_counts().plot.bar(
    ax=ax[0], rot=0,
    color=(sns.color_palette()[0],
           sns.color_palette()[2])).set(xticklabels=["Alive", "Deceased"])
_ = death_preds.actual.value_counts().plot.pie(
    labels=("Alive", "Deceased"), autopct="%.2f%%", label="", fontsize=13.,
    ax=ax[1], colors=(sns.color_palette()[0], sns.color_palette()[2]),
    wedgeprops={"linewidth": 1.5, "edgecolor": "#F7F7F7"})
ax[1].texts[1].set_color("#F7F7F7")
ax[1].texts[3].set_color("#F7F7F7")
X = death_preds[death_preds.actual == 0].sample(350, random_state=62).append(
    death_preds[death_preds.actual == 1].sample(
        350, random_state=62)).copy(deep=True).astype(np.float64)
Y = X.actual.values
tX = death_preds[~death_preds.index.isin(X.index)].copy(deep=True).astype(
    np.float64)
tY = tX.actual.values
X.drop(["SNo", "actual", "DateoFdeath"], 1, inplace=True)
tX.drop(["SNo", "actual", "DateoFdeath"], 1, inplace=True)
clf_xgb = XGBC(subsample=.8, colsample_bytree=.8, seed=14,
               max_depth=3).fit(X, Y)
preds_xgb = clf_xgb.predict_proba(tX)
ax = pd.DataFrame(list(clf_xgb.get_booster().get_fscore().items())).set_index(0)\
.sort_values(1).plot.barh(figsize = (12, 8))
_ = ax.set(frame_on=False,
           ylim=(0, len(clf_xgb.get_booster().get_fscore())),
           xticklabels="",
           xlabel="",
           ylabel=""), ax.legend("")
_ = plt.title("XGB Feature Importance", fontsize=18.)
logreg = LogisticRegression(random_state=14).fit(X, Y)
preds_lr = logreg.predict_proba(tX)

df = pd.DataFrame(list(zip(tX.columns, logreg.coef_[0])))
df = df.reindex(df[1].abs().sort_values().index).set_index(0)
ax = df.plot.barh(width=.6, legend="", figsize=(12, 9))
# plot decision tree
from numpy import loadtxt
from xgboost import XGBClassifier
from xgboost import plot_tree
from matplotlib import pyplot
# load data
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
# split data into X and y
X = dataset[:,0:8]
y = dataset[:,8]
# fit model on training data
model = XGBClassifier()
model.fit(X, y)
# plot single tree
plot_tree(model)
pyplot.show()
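plot_tree draws the first tree by default; to inspect a later boosting round or keep the figure, the num_trees and rankdir arguments can be passed (the output file name here is just an example):

# Plot the fifth boosting round left-to-right and save it to disk.
plot_tree(model, num_trees=4, rankdir='LR')
fig = pyplot.gcf()
fig.set_size_inches(30, 15)
fig.savefig('tree_round4.png')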
    skf = StratifiedKFold(np.array(train["TARGET"]), n_folds = 10, shuffle = True, random_state = 14) 

    cv = []
    biter = []    
    for fold, (itr, icv) in enumerate(skf):
    
        print "------ Fold %d -----------\n" %(fold+1)
        
        trainingSet = train.iloc[itr]
        validationSet = train.iloc[icv]
        
        gbm = XGBClassifier(max_depth=4,
                            learning_rate = 0.01,
                            n_estimators=3000,
                            subsample=0.8,
                            colsample_bytree=0.5,
                            objective="binary:logistic",
                            silent = False,
                            min_child_weight=5,                       
                            nthread=-1)
                            
        gbm.fit(trainingSet[feature_names], np.array(trainingSet["TARGET"]),
                eval_metric="auc",
                eval_set=[(trainingSet[feature_names], np.array(trainingSet["TARGET"])), (validationSet[feature_names], np.array(validationSet["TARGET"]))],
                early_stopping_rounds=200, verbose=20)
                          
        ll = gbm.best_score
        best_iter = gbm.best_iteration
        cv.append(ll)
        biter.append(best_iter)
        print "---auc : %0.6f\n" %ll
Exemplo n.º 58
0
#df_null_check.to_excel('Data/Null check.xlsx', index=False)
_ = StandardScaler().fit_transform(X_test)
X_test = pd.DataFrame(_, columns=X_test.columns)

X_train = X_train.round(3)
y_train = y_train.round(3)
#X_train.to_excel(r'Linh Tinh/Linh tinh.xlsx', index = False)

# rfecv.fit(X_train, y_train)
# df_new = rfecv.fit_transform(X_train,y_train)
# print("Best Features:", rfecv.get_support)
# print("Optimal number of features : %d" % rfecv.n_features_)
print("Start Feature Selection")
clf_feature_selection = XGBClassifier(colsample_bytree=0.5,
                                      gamma=0.1,
                                      learning_rate=0.15,
                                      max_depth=20,
                                      min_child_weight=5,
                                      n_estimators=400)
clf = XGBClassifier()
rfecv = RFECV(estimator=clf_feature_selection,
              step=1,
              cv=StratifiedKFold(2),
              scoring='roc_auc')

params = {
    "learning_rate": [0.05, 0.15, 0.3],
    "max_depth": [5, 10, 20, 30, 50, 70],
    "min_child_weight": [5, 10, 20, 50, 100],
    "gamma": [0.0, 0.1, 0.2, 0.4, 0.5],
    "colsample_bytree": [0.2, 0.3, 0.5, 0.7],
    "n_estimators": [100, 200, 400, 500, 600]
Exemplo n.º 59
0
target = df['TARGET']
del df['TARGET']
# del df['ID']
id = df_test['ID']
# del df_test['ID']

pca = PCA(n_components=250)
train_pcaed = pca.fit_transform(df, target)

random_forest = RandomForestClassifier(n_estimators=30, max_depth=5, max_features=20)
random_forest.fit(train_pcaed, target)
forested = random_forest.predict_proba(train_pcaed)
# pipe = Pipeline(steps=[('pca', pca), ('random_forest', random_forest)])

m2_xgb = XGBClassifier(n_estimators=110, nthread=1, max_depth=4)
m2_xgb.fit(train_pcaed, target)
m2_xgbed = m2_xgb.predict_proba(train_pcaed)

logistic_regression = LogisticRegression(penalty='l1')
logistic_regression.fit(train_pcaed, target)
logistic_regressioned = logistic_regression.predict_proba(train_pcaed)

combined = np.concatenate([forested, m2_xgbed, logistic_regressioned], axis=1)


log_reg = LogisticRegression()
log_reg.fit(combined, target)

scores = cross_validation.cross_val_score(log_reg, combined, target,
                                              cv=5, scoring='roc_auc')
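Because the three level-1 models above predict on the very rows they were fitted on, the stacked score can be optimistic. A hedged variant, not in the original, uses out-of-fold probabilities from the same estimators instead:

import numpy as np
from sklearn.model_selection import cross_val_predict, cross_val_score

oof_parts = [
    cross_val_predict(m, train_pcaed, target, cv=5, method='predict_proba')
    for m in (random_forest, m2_xgb, logistic_regression)
]
combined_oof = np.concatenate(oof_parts, axis=1)
scores_oof = cross_val_score(log_reg, combined_oof, target,
                             cv=5, scoring='roc_auc')
print('out-of-fold stacked AUC: %.5f' % scores_oof.mean())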
# split into training and test
from sklearn.model_selection import train_test_split

validation_size = 0.2
seed = 0

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = validation_size, random_state = seed)





# create instance of algorithm
from xgboost import XGBClassifier
model = XGBClassifier()




# fit algorithm to the training set (not required if using parameter tuning)   
model.fit(X_train, y_train)




# predict
y_predicted = model.predict(X_test)
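The example stops at prediction; a short hedged follow-up scoring the held-out set:

from sklearn.metrics import accuracy_score, classification_report

print('accuracy: {:.4f}'.format(accuracy_score(y_test, y_predicted)))
print(classification_report(y_test, y_predicted))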