Python XGBClassifier示例，xgboost.sklearn.XGBClassifier Python示例

示例#1

0

显示文件

文件： module6_boost_cv.py 项目： mircean/ML

def job_function(params):
    """Cross-validate one hyper-parameter combination and return its mean score.

    ``params`` is read positionally (indices 0-6): learning_rate, max_depth,
    ss_cs (applied to both ``subsample`` and ``colsample_bytree``), gamma,
    min_child_weight, reg_lambda, reg_alpha.  The first
    ``iterations_per_job`` folds of the module-level ``Xy`` are each trained
    with early stopping and scored with ``calculate_score``.

    The mean score is printed together with ``params`` and then returned.
    """
    (learning_rate, max_depth, ss_cs, gamma,
     min_child_weight, reg_lambda, reg_alpha) = (params[k] for k in range(7))

    # Early-stopping patience depends on the learning rate.
    if learning_rate <= 0.03:
        early_stopping_rounds = 50
    elif learning_rate >= 0.3:
        early_stopping_rounds = 5
    else:
        early_stopping_rounds = 25

    model_kwargs = dict(
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=5000,
        objective='multi:softprob',
        subsample=ss_cs,
        colsample_bytree=ss_cs,
        gamma=gamma,
        min_child_weight=min_child_weight,
        seed=0,
        silent=True,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
    )

    fold_scores = []
    for fold_no in range(iterations_per_job):
        fold = Xy[fold_no]
        X_train, X_test, y_train, y_test = fold[0], fold[1], fold[2], fold[3]

        # NOTE(review): the encoded training labels are computed but fit()
        # below receives the raw y_train -- confirm this is intended.
        y_train2 = le.transform(y_train)
        y_test2 = le.transform(y_test)

        clf = XGBClassifier(**model_kwargs)
        clf.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test2)],
            eval_metric=calculate_score_2,
            early_stopping_rounds=early_stopping_rounds,
            verbose=False,
        )
        # Predict with the tree count selected by early stopping.
        y_predicted = clf.predict_proba(X_test, ntree_limit=clf.booster().best_ntree_limit)
        fold_scores.append(calculate_score(y_predicted, y_test2))

    avg_score = np.array(fold_scores).mean()
    print(avg_score, params)
    return avg_score

示例#2

0

显示文件

文件： model.py 项目： kalleon/custom

    def _distributor(self, label, cv, param, eval_metric, early_stopping_rounds=50):
        """Fit an XGBoost estimator on a 60/40 split and pickle the result registry.

        Args:
            label: Ignored on input; the result key is always derived from
                ``self.is_classifier`` ('XGBClassifier' or 'XGBRegressor').
            cv: Unused; kept so existing callers keep working.
            param: Mapping of keyword arguments for the estimator constructor.
            eval_metric: Metric forwarded to ``fit`` as ``eval_metric``.
            early_stopping_rounds: Patience forwarded to ``fit``.

        Side effects:
            Stores the fitted estimator and elapsed time under
            ``self.result[label]``, pretty-prints that entry, and pickles the
            whole ``self.result`` dict to ``self.result_address``.
        """
        start = time()

        # The hyper-parameters must be unpacked as keywords; passed
        # positionally the whole mapping would bind to the constructor's
        # first parameter.
        if self.is_classifier:
            label = 'XGBClassifier'
            rs = XGBClassifier(**param)
        else:
            label = 'XGBRegressor'
            rs = XGBRegressor(**param)

        X_visible, X_blind, y_visible, y_blined = \
            train_test_split(
                self.X_train, self.y_train, random_state=1301, stratify=self.y_train, test_size=0.4)

        # Train on the "visible" part of the split computed above and pass
        # eval_metric by keyword: the third positional argument of fit() is
        # sample_weight, not the metric.
        rs.fit(X_visible, y_visible,
               eval_metric=eval_metric,
               early_stopping_rounds=early_stopping_rounds,
               eval_set=[(X_visible, y_visible), (X_blind, y_blined)])

        self.result[label] = {}
        self.result[label]['clf'] = rs
        self.result[label]['time'] = time() - start

        pprint.pprint(self.result[label])

        # The context manager closes the file even if pickling fails.
        with open(self.result_address, 'wb') as out_result:
            pickle.dump(self.result, out_result)

示例#3

0

显示文件

文件： util.py 项目： samcrosoft/Amazon_Review_Helpfulness_Prediction

def extract_leaf_feature(features, targets, train_indexes, params):
    """Return one-hot encoded leaf indices for every row of ``features``.

    An ``XGBClassifier`` built from ``params`` is fitted on the rows selected
    by ``train_indexes``; its booster then predicts the leaf index per tree
    for all rows, and those indices are one-hot encoded.
    """
    classifier = XGBClassifier(**params)
    classifier.fit(features[train_indexes], targets[train_indexes])

    # pred_leaf=True makes the booster return leaf indices instead of scores.
    leaf_indices = classifier.booster().predict(xgb.DMatrix(features), pred_leaf=True)

    return sklearn.preprocessing.OneHotEncoder().fit_transform(leaf_indices)

示例#4

0

显示文件

文件： airbnb xgboost model.py 项目： paperparrot/Kaggle-scripts

def main(training_data, test_data):
    """Train an XGBoost model and return the five most likely countries per test row.

    Args:
        training_data: DataFrame containing a 'country_destination' column.
            A 'source' marker column is added to it in place.
        test_data: DataFrame to predict for. A 'source' marker column is
            added to it in place.

    Returns:
        DataFrame with columns ['id', 'country']: for every index value of the
        cleaned test frame, five rows holding the classes with the highest
        predicted probability, most likely first.
    """
    # Merging data to ensure consistent cleaning. The marker column lets the
    # two sets be separated again afterwards.
    training_data['source'] = 'training'
    test_data['source'] = 'test'
    merged_data = pd.concat([training_data, test_data])

    cleaned_data = data_cleaner(merged_data)

    # Separating data, removing marker
    pred_df = cleaned_data[cleaned_data['source'] == 'training'].copy()
    test_pred = cleaned_data[cleaned_data['source'] == 'test'].copy()

    pred_df.drop('source', axis=1, inplace=True)
    test_pred.drop('source', axis=1, inplace=True)

    # Transforming target into ints, keeping the encoder to map back to names
    labels = LabelEncoder().fit(training_data['country_destination'])
    target_df = pd.Series(labels.transform(training_data['country_destination']), index=training_data.index)

    # Training model (.values replaces DataFrame.as_matrix(), which newer
    # pandas releases no longer provide)
    xgb_model = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25, objective='multi:softprob',
                              subsample=0.5, colsample_bytree=0.5, seed=0)
    xgb_model.fit(pred_df.values, target_df.tolist())

    # Running the model
    preds = xgb_model.predict_proba(test_pred.values)
    model_probs = pd.DataFrame(preds, index=test_pred.index, columns=labels.classes_)

    # Collect the per-row top-5 frames and concatenate once at the end;
    # concatenating inside the loop copies the accumulated result every time.
    top5_frames = []
    for i in model_probs.index:
        top5 = model_probs.loc[i, :].sort_values(ascending=False)[:5]
        top5_frames.append(pd.DataFrame({'country': top5.index, 'id': i}, columns=['id', 'country']))

    if not top5_frames:
        # No test rows: return an empty frame with the expected columns.
        return pd.DataFrame(columns=['id', 'country'])

    stacked_probs = pd.concat(top5_frames)

    # Cleaning output and returning it
    output = stacked_probs[['id', 'country']]
    return output

示例#5

0

显示文件

文件： utility.py 项目： tks0123456789/ParamTune_experiments

 def eval_fn(params):
     """Evaluate ``params`` over the folds in ``skf`` and return the mean best score.

     For every fold the classifier is fitted with early stopping (patience 50,
     'logloss'); the fold's ``best_score`` and ``best_iteration`` are
     accumulated and then divided by ``n_folds``.  The averaged iteration
     count is appended to ``n_estimators_lst``.  When ``X_valid`` is not None
     the model is refitted on all of ``X_train`` with that tree count and its
     log loss on the validation set is appended to ``score_valid``.
     """
     model = XGBClassifier(n_estimators=n_estimators_max, learning_rate=learning_rate, seed=seed)
     score = 0
     n_estimators = 0
     for train_idx, valid_idx in skf:
         X_tr, y_tr = X_train[train_idx], y_train[train_idx]
         X_va, y_va = X_train[valid_idx], y_train[valid_idx]
         model.set_params(**params)
         model.fit(
             X_tr,
             y_tr,
             eval_set=[(X_va, y_va)],
             eval_metric='logloss',
             early_stopping_rounds=50,
             verbose=False,
         )
         score = score + model.best_score
         n_estimators = n_estimators + model.best_iteration
     score = score / n_folds
     n_estimators = n_estimators / n_folds
     n_estimators_lst.append(n_estimators)

     result_str = "train:%.4f ntree:%5d  " % (score, n_estimators)
     if X_valid is not None:
         # Refit on the full training set with the averaged tree count.
         model.n_estimators = n_estimators
         model.fit(X_train, y_train)
         sc_valid = log_loss(y_valid, model.predict_proba(X_valid)[:, 1])
         score_valid.append(sc_valid)
         result_str += "valid:%.4f" % sc_valid
     if verbose:
         print(result_str)
     return score

示例#6

0

显示文件

文件： xgbModel.py 项目： SeanBE/numerai

def objective(space):
    """Objective for a hyper-parameter search: validation log loss of one trial.

    ``space`` supplies 'n_estimators' (cast to int), 'learning_rate',
    'max_depth', 'min_child_weight', 'colsample_bytree' and 'subsample'.
    The model is fitted on ``xTrain``/``yTrain`` and scored on
    ``xValid``/``yValid``.

    Returns a dict with the 'loss' and a 'status' of ``STATUS_OK``.
    """
    hyper = dict(
        n_estimators=int(space['n_estimators']),
        objective='binary:logistic',
        seed=37,
        learning_rate=space['learning_rate'],
        max_depth=space['max_depth'],
        min_child_weight=space['min_child_weight'],
        colsample_bytree=space['colsample_bytree'],
        subsample=space['subsample'],
    )
    clf = XGBClassifier(**hyper)
    clf.fit(xTrain, yTrain, eval_metric="logloss")

    # Column 1 of predict_proba is the probability of the positive class.
    positive_prob = clf.predict_proba(xValid)[:, 1]
    return {'loss': log_loss(yValid, positive_prob), 'status': STATUS_OK}

示例#7

0

显示文件

文件： Baseline.py 项目： Ewen2015/Kaggle

    def GBDT(self, report=False):
        """Fit a gradient-boosted tree classifier and return it.

        The model is stored on ``self.gbdt`` and trained on
        ``self.train_prep[self.features]`` against
        ``self.train_prep[self.target]``.

        Args:
            report: when truthy, build a ``Report`` for the fitted model and
                run its ``ALL()`` method before returning.
        Returns:
            The fitted ``XGBClassifier``."""
        from xgboost.sklearn import XGBClassifier

        settings = dict(
            objective='binary:logistic',
            booster='gbtree',
            learning_rate=0.01,
            n_estimators=5000,
            max_depth=3,
            subsample=0.75,
            colsample_bytree=0.75,
            n_jobs=4,
            random_state=2018,
        )
        self.gbdt = XGBClassifier(**settings)
        self.gbdt.fit(self.train_prep[self.features], self.train_prep[self.target])

        if not report:
            return self.gbdt

        # Imported lazily so the report module is only needed when requested.
        from Report import Report
        Report(self.gbdt, self.train, self.valid, self.target, self.features).ALL()
        return self.gbdt

示例#8

0

显示文件

文件： Ilwar.py 项目： MacLunch/MacLunch

    def fit(self, json_train, n_estimators = 10, is_xgb = True):
        """Pre-process ``json_train``, add bag-of-words text features and fit ``self.model``.

        The fitted vectorizer is kept on ``self.text_model``.  The target is
        the 'istroll' column; the raw text columns are dropped and the
        remaining columns are renamed to their positional index (as strings)
        before fitting.

        Args:
            json_train: raw training data handed to ``self.pre_process``.
            n_estimators: number of estimators for the chosen model.
            is_xgb: a RandomForestClassifier is used when this equals False,
                an XGBClassifier (max_depth 10) otherwise.
        """
        train = self.pre_process(json_train, istrain = True)

        bow_vectorizer = BagOfWordsVectorizer()
        # Still constructed although the author (word2vec), title and tag
        # features are currently switched off.
        word2vec_model = Word2VecModel()
        tag_counter_model = TagCounterModel()

        # Only the text bag-of-words features are active.
        bow_vectorizer.fit(train["text_pos_sentences"], 1000)
        text_features = bow_vectorizer.transform(train["text_pos_sentences"], "text")
        self.text_model = bow_vectorizer.get_vectorizer()

        train = pd.concat([train, text_features], axis = 1)

        label = train['istroll']
        train = train.drop('istroll', axis=1).drop(
            ['text', 'text_pos', 'text_pos_sentences'], axis=1)

        print(train.columns)

        # Replace the column names by their position, as strings.
        train.columns = list(map(str, range(len(train.columns))))

        # The explicit comparison is deliberate: only a value equal to False
        # selects the random forest.
        self.model = (
            RandomForestClassifier(n_estimators, n_jobs=-1)
            if is_xgb == False
            else XGBClassifier(n_estimators = n_estimators, max_depth = 10)
        )

        print(train.shape)
        self.model.fit(train, label)

示例#9

0

显示文件

文件： ensemble.py 项目： BabelTower/kaggle_airbnb

def apply_xgb_ens(y_valid, valid_folder='Valid', test_folder='Test'):
    """
    XGBoost-based ensembler.

    Loads the feature matrices via ``get_X_X_Test(valid_folder, test_folder)``,
    fits a multi-class gradient-boosting model on the first matrix against
    ``y_valid`` and returns the class probabilities for the second one.
    """
    # The loader returns four values; only the two matrices are used here.
    X, X_test, _n_preds, _n_class = get_X_X_Test(valid_folder, test_folder)

    booster_settings = dict(
        max_depth=4,
        learning_rate=0.05,
        n_estimators=200,
        objective='multi:softprob',
        gamma=0.,
        max_delta_step=0.,
        subsample=0.9,
        colsample_bytree=0.9,
        seed=0,
    )
    ensembler = XGBClassifier(**booster_settings)
    ensembler.fit(X, y_valid)
    return ensembler.predict_proba(X_test)

示例#10

0

显示文件

文件： prediction.py 项目： Zhongjiong/kaggle_airbnb_new_user_bookings

def perform_prediction(training, labels, testing, xgb_votes, rf_votes):
    """Sum the class probabilities of several XGBoost and RandomForest fits.

    ``xgb_votes`` XGBoost models and ``rf_votes`` random forests are each
    fitted on ``training``/``labels``; their ``predict_proba`` outputs for
    ``testing`` are accumulated into one array of shape
    (len(testing), number of distinct labels), which is returned unnormalised.
    """
    n_classes = len(set(labels))
    predictions = np.zeros((len(testing), n_classes))

    # XGBoost votes.
    for vote in range(xgb_votes):
        print('XGB vote %d' % vote)
        booster = XGBClassifier(
            max_depth=DEPTH_XGB,
            learning_rate=LEARNING_XGB,
            n_estimators=ESTIMATORS_XGB,
            objective='multi:softprob',
            subsample=SUBSAMPLE_XGB,
            colsample_bytree=COLSAMPLE_XGB,
        )
        booster.fit(training, labels)
        predictions += booster.predict_proba(testing)

    # RandomForest votes.
    for vote in range(rf_votes):
        print('RandomForest vote %d' % vote)
        forest = RandomForestClassifier(
            n_estimators=ESTIMATORS_RF,
            criterion=CRITERION_RF,
            n_jobs=JOBS_RF,
            max_depth=DEPTH_RF,
            min_samples_leaf=MIN_LEAF_RF,
            bootstrap=True,
        )
        forest.fit(training, labels)
        predictions += forest.predict_proba(testing)

    return predictions

示例#11

0

显示文件

文件： bnp_xgb_init.py 项目： paperparrot/BNP_kaggle

def xgboostinitial_predictor(train_path, test_path, eval_path):
    """Train a binary XGBoost model from CSV files and predict two other CSV sets.

    Args:
        train_path: CSV with an index in column 0 and a 'target' column.
        test_path: CSV of rows to predict, index in column 0.
        eval_path: CSV of evaluation rows to predict, index in column 0.

    Returns:
        Tuple ``(test_output, eval_output)`` of Series named 'PredictedProb'
        holding the predicted probability of class 1, indexed like the
        corresponding input file.
    """
    # Loading the data
    print('Loading the data...')
    train = pd.read_csv(train_path, index_col=0)
    test = pd.read_csv(test_path, index_col=0)
    eval_df = pd.read_csv(eval_path, index_col=0)
    target = train['target'].copy()
    train.drop('target', axis=1, inplace=True)

    # Training model (.values replaces DataFrame.as_matrix(), which newer
    # pandas releases no longer provide)
    print('Model training begins...')
    xgb_model = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25, objective='binary:logistic',
                              subsample=0.5, colsample_bytree=0.5, seed=0)
    xgb_model.fit(train.values, target.tolist())

    # Running the model
    print('Making predictions....')
    test_preds = xgb_model.predict_proba(test.values)
    eval_preds = xgb_model.predict_proba(eval_df.values)

    print('Cleaning predictions to match expected format....')
    test_output = pd.DataFrame(test_preds, index=test.index)
    print(test_output.columns)
    # Selecting column 1 yields a Series; a Series is labelled through
    # ``name`` (assigning ``columns`` on it has no effect on the label).
    test_output = test_output[1]
    test_output.name = 'PredictedProb'

    eval_output = pd.DataFrame(eval_preds, index=eval_df.index)
    eval_output = eval_output[1]
    eval_output.name = 'PredictedProb'

    return test_output, eval_output

示例#12

0

显示文件

文件： prediction.py 项目： Chouffe/kaggle_airbnb_new_user_booking

def train_classifier(X, y, clf_name='xgb'):
    """Fit and return a classifier configured from the module-level constants.

    ``clf_name == 'xgb'`` selects an ``XGBClassifier``; any other value
    selects a ``RandomForestClassifier``.
    """
    if clf_name == 'xgb':
        factory = XGBClassifier
        settings = dict(
            n_estimators=ESTIMATORS_XG,
            objective=OBJECTIVE_XG,
            max_depth=DEPTH_XG,
            learning_rate=LEARNING_RATE_XG,
            subsample=SUBSAMPLE_XG,
            colsample_bytree=COLSAMPLE_BYTREE_XG,
            seed=0,
        )
    else:
        factory = RandomForestClassifier
        settings = dict(
            n_estimators=ESTIMATORS_RF,
            criterion=CRITERION_RF,
            n_jobs=JOBS_RF,
            max_depth=DEPTH_RF,
            min_samples_leaf=MIN_LEAF_RF,
            min_samples_split=MIN_SPLIT_RF,
            max_features=MAX_FEATURES_RF,
            bootstrap=True,
        )

    clf = factory(**settings)
    clf.fit(X, y)
    return clf

示例#13

0

显示文件

文件： analysis_basic.py 项目： joostgp/kaggle_ad_detection

def get_xgboost_classifier(X_train, y_train, X_val, y_val,params=None, tag=""):
    """Return a fitted XGBoost classifier, grid-searching when no params are given.

    With ``params`` the classifier is built from them, fitted while tracking
    AUC on the training and validation sets, and optionally plotted
    (controlled by the module-level ``plot_cv_curves`` and
    ``plot_feature_importance`` names).  Without ``params`` a grid search over
    max_depth, min_child_weight and n_estimators is run with 10 shuffle
    splits and 'roc_auc' scoring, and the fitted search object is returned.
    """
    param_grid = {'max_depth':[3,5,7], 'min_child_weight': [1,3,5], 'n_estimators': [50]}

    if params is not None:
        clf = XGBClassifier(**params)
        clf.fit(X_train, y_train,
                eval_set=[(X_train, y_train), (X_val, y_val)],
                eval_metric='auc', verbose=False)

        if plot_cv_curves:
            # validation_0 / validation_1 follow the order of eval_set above.
            history = clf.evals_result()
            plot_cv_curve(history['validation_0']['auc'], history['validation_1']['auc'], tag)

        if plot_feature_importance:
            plot_feature_importance(clf, tag)

        return clf

    base_model = XGBClassifier(
        learning_rate=0.2,
        objective='binary:logistic',
        seed=27)

    t = start("training xgboost ")
    splitter = cross_validation.ShuffleSplit(X_train.shape[0], n_iter=10,
                                             test_size=0.2, random_state=123)
    search = grid_search.GridSearchCV(base_model, param_grid, cv=splitter,
                                      n_jobs=1, scoring='roc_auc')
    clf = search.fit(X_train, y_train)
    report(t, nitems=10*len(param_grid))

    print("Best score:{} with scorer {}".format(clf.best_score_, clf.scorer_))
    print("With parameters:")

    best_parameters = clf.best_estimator_.get_params()
    for param_name in sorted(param_grid.keys()):
        print('\t%s: %r' % (param_name, best_parameters[param_name]))

    return clf

示例#14

0

显示文件

文件： final_predictor.py 项目： Chris19920210/Microsoft_malware

def main():
    """Fit the tuned classifier on the training CSV and predict the test CSV.

    Paths come from ``args.train_dataset`` / ``args.test_dataset`` and the
    hyper-parameters from ``best_dicts``.

    Returns a tuple of the fitted classifier and a DataFrame holding the test
    'Id' column followed by the probability columns 'Prediction1' ..
    'Prediction9'.
    """
    train_frame = pd.read_csv(args.train_dataset)
    X_train = train_frame.drop(['Id', 'Class'], axis=1)
    y_train = train_frame.loc[:, 'Class']

    test_frame = pd.read_csv(args.test_dataset)
    X_test = test_frame.drop(['Id'], axis=1)
    Id = test_frame.loc[:, 'Id']

    clf = XGBClassifier()
    clf.set_params(**best_dicts)
    clf.fit(X_train, y_train)

    # One probability column per class, numbered from 1.
    column_names = ['Prediction{}'.format(class_no) for class_no in range(1, 10)]
    probabilities = pd.DataFrame(clf.predict_proba(X_test), columns=column_names)

    results = pd.concat([Id, probabilities], axis=1)
    return clf, results

示例#15

0

显示文件

文件： module3_python_v2.py 项目： mircean/ML

def myThreadFunc(ThreadID):
    """Train and score fold ``ThreadID``; store the score in ``train_and_test_scores``.

    The fold is taken from the module-level ``Xy`` and the hyper-parameters
    from module-level names.  The score and the tree count chosen by early
    stopping are printed.
    """
    fold = Xy[ThreadID]
    X_train, X_test, y_train, y_test = fold[0], fold[1], fold[2], fold[3]

    # NOTE(review): the encoded training labels are computed but fit() below
    # receives the raw y_train -- confirm this is intended.
    y_train2 = le.transform(y_train)
    y_test2 = le.transform(y_test)

    model_kwargs = dict(
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=5000,
        objective='multi:softprob',
        subsample=ss_cs,
        colsample_bytree=ss_cs,
        gamma=gamma,
        min_child_weight=min_child_weight,
        seed=0,
        silent=True,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
    )
    clf = XGBClassifier(**model_kwargs)
    clf.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test2)],
        eval_metric=calculate_score_2,
        early_stopping_rounds=early_stopping_rounds,
        verbose=False,
    )

    # Predict with the tree count selected by early stopping.
    y_predicted = clf.predict_proba(X_test, ntree_limit=clf.booster().best_ntree_limit)
    score = calculate_score(y_predicted, y_test2)
    print(score, clf.booster().best_ntree_limit)

    train_and_test_scores[ThreadID] = score

示例#16

0

显示文件

文件： Airbnb prediction.py 项目： prashant8488/AirBnb-Predict-New-User-Booking

# In[91]:

# Random Forests: fit on (X, y), record the accuracy on the same training
# data, and predict class labels for X_test.

random_forest = RandomForestClassifier(random_state=1, n_estimators=45, min_samples_split=3, min_samples_leaf=2)

random_forest.fit(X, y)
score=random_forest.score(X, y)

# Hard class labels from the forest (capital "Y_pred"; the XGBoost cell below
# writes its probabilities to lower-case "y_pred").
Y_pred = random_forest.predict(X_test)


# In[14]:

# XGBoost classifier: same data, multi-class probability objective.
xgb = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25,
                    objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0)
xgb.fit(X, y)
# Overwrites the random-forest value stored in "score" above; this is again
# accuracy measured on the training data.
score = xgb.score(X,y)
# Per-class probabilities for the test rows.
y_pred = xgb.predict_proba(X_test)



# In[15]:

# Prints whichever model's training accuracy was assigned to "score" last.
print (score)


# In[21]:

# for Random forest

示例#17

0

显示文件

文件： fenchai.py 项目： yubo1993/UnionPay-Business-Algorithm-Competition

# Split the combined frame back into the labelled part and the part to
# predict.
# NOTE(review): .ix slices by label when the index is integer-typed and by
# position otherwise, so whether row 11016 is included depends on the index
# of data1_new -- confirm.
data1_new_train = data1_new.ix[0:11016, :]
data1_new_test = data1_new.ix[11017:, :]
# Hold out 30% of the labelled rows for evaluation.
X_train, X_test, Y_train, Y_test = train_test_split(data1_new_train,
                                                    Y,
                                                    test_size=0.3,
                                                    random_state=42)

# Train the xgboost model
# Set the initial parameters
xgb_train = XGBClassifier(booster="gbtree",
                          learning_rate=0.02,
                          n_estimators=1000,
                          max_depth=6,
                          min_child_weight=5,
                          gamma=0,
                          reg_alpha=65,
                          reg_lambda=10,
                          subsample=0.81,
                          colsample_bytree=0.81,
                          objective='binary:logistic',
                          nthread=8,
                          scale_pos_weight=3.9,
                          seed=27)
xgb_train.fit(X_train, Y_train, eval_metric="auc")
# predict() returns class labels, so the ROC curve below is built from hard
# labels rather than from probabilities.
pred = xgb_train.predict(X_test)
from sklearn import metrics

fpr, tpr, thresholds = metrics.roc_curve(Y_test, pred)
# The next two expressions compute the AUC and the mean per-class F1 score
# but do not store or print the results.
metrics.auc(fpr, tpr)
np.mean(f1_score(Y_test, pred, average=None))
# Predict labels for the unlabelled rows and pair them with user_id.
y_prediction = xgb_train.predict(data1_new_test)
predict_result = DataFrame({"user_id": user_id, "y_prediction": y_prediction})

示例#18

0

显示文件

文件： module8_boost_2models.py 项目： mircean/ML

def model1(df_train, df_test):
	"""Engineer features on train and test together, fit an XGBClassifier and predict df_test.

	Training rows whose 'HasSessions' is 0 are dropped first. The label is
	'country_destination'. Train and test are stacked so that date parsing,
	age clean-up and one-hot encoding are applied identically, then split
	again by row count.

	Returns the class-probability array from ``predict_proba`` for the
	df_test rows.
	"""
	print('model1')

	print('rows', df_train.shape[0]) 

	#remove rows with no sessions data
	hassessions = df_train['HasSessions']
	df_train = df_train.drop(hassessions[hassessions == 0].index)

	#remove rows older than 1/1/2014
	#dac2 = df_train.date_account_created.apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
	#print('removing rows', len(dac2[dac2 < datetime.strptime('20140101', '%Y%m%d')].index))
	#df_train = df_train.drop(dac2[dac2 < datetime.strptime('20140101', '%Y%m%d')].index)

	print('rows', df_train.shape[0]) 

	# Separate the labels; piv_train (number of training rows) is the split
	# point used later to cut the stacked frame back into train and test.
	labels = df_train['country_destination'].values
	df_train = df_train.drop(['country_destination'], axis=1)
	piv_train = df_train.shape[0]

	#Creating a DataFrame with train+test data
	df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
	#Removing id, date_first_booking and the session bookkeeping columns
	df_all = df_all.drop(['id', 'date_first_booking', 'sessions_count', 'HasSessions'], axis=1)

	#Filling nan
	df_all = df_all.fillna(-1)

	#####Feature engineering#######
	print('features in the csv', df_all.shape[1])

	#date_account_created: split the string on '-' into three integer columns
	print('dac', datetime.now())
	dac = np.vstack(df_all.date_account_created.astype(str).apply(lambda x: list(map(int, x.split('-')))).values)
	df_all['dac_year'] = dac[:,0]
	df_all['dac_month'] = dac[:,1]
	df_all['dac_day'] = dac[:,2]

	#day of week, season
	print('dac2', datetime.now())
	dac2 = df_all.date_account_created.apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
	df_all['dac_weekday'] = dac2.apply(lambda x: x.weekday())
	df_all['dac_season'] = dac2.apply(calculate_season)

	df_all = df_all.drop(['date_account_created'], axis=1)

	#timestamp_first_active: the string is cut into six fixed-width integer
	#fields (4,2,2,2,2,2 characters); only the first three are kept as features
	print('tfa', datetime.now())
	tfa = np.vstack(df_all.timestamp_first_active.astype(str).apply(lambda x: list(map(int, [x[:4],x[4:6],x[6:8],x[8:10],x[10:12],x[12:14]]))).values)
	df_all['tfa_year'] = tfa[:,0]
	df_all['tfa_month'] = tfa[:,1]
	df_all['tfa_day'] = tfa[:,2]
	df_all = df_all.drop(['timestamp_first_active'], axis=1)

	#Age: values below 14 or above 100 are replaced by -1
	print('age', datetime.now())
	av = df_all.age.values
	df_all['age'] = np.where(np.logical_or(av<14, av>100), -1, av)

	#remove features: 'Sessions0' and 'SessionsD0' .. 'SessionsD455'
	print('remove features', datetime.now())
	df_all = df_all.drop(['Sessions' + str(i) for i in [0]], axis=1)
	df_all = df_all.drop(['SessionsD' + str(i) for i in range(456)], axis=1)

	print('features in the model', df_all.shape[1])

	#One-hot-encoding features
	print('one-hot', datetime.now())
	ohe_feats = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser', 'dac_season', 'sessions_preferred_device'] 

	# Each categorical column is replaced by its dummy columns (prefixed with
	# the original column name).
	for f in ohe_feats:
		df_all_dummy = pd.get_dummies(df_all[f], prefix=f)
		df_all = df_all.drop([f], axis=1)
		df_all = pd.concat((df_all, df_all_dummy), axis=1)

	#Splitting train and test
	vals = df_all.values
	X = vals[:piv_train]
	y = labels
	X_predict = vals[piv_train:]

	#learning_rate, max_depth, ss_cs, gamma, min_child_weight, reg_lambda, reg_alpha =  0.03, 6, 0.5, 2, 2, 2, 1
	learning_rate, max_depth, ss_cs, gamma, min_child_weight, reg_lambda, reg_alpha =  0.03, 8, 0.5, 2, 1, 2, 0

	# NOTE(review): early_stopping_rounds is computed here but never passed to
	# fit() below, so it has no effect in this function.
	early_stopping_rounds = 25
	if learning_rate <= 0.03:
		early_stopping_rounds = 50

	print(learning_rate, max_depth, ss_cs, gamma, min_child_weight, reg_lambda, reg_alpha)

	#n_estimators = 455
	n_estimators = 350
	#n_estimators = 1
	print(n_estimators)

	# ss_cs is used for both subsample and colsample_bytree.
	print('fit start', datetime.now())
	clf2 = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators, objective='multi:softprob', subsample=ss_cs, colsample_bytree=ss_cs, gamma=gamma, min_child_weight=min_child_weight, seed=0, silent=True, reg_lambda=reg_lambda, reg_alpha=reg_alpha, nthread=-1)      
	clf2.fit(X, y)
	y_predicted2 = clf2.predict_proba(X_predict)  

	return y_predicted2

示例#19

0

显示文件

    # Evaluate the previously fitted grid search on the held-out set.
    y_pred = clf_grid.predict(X_test)

    print('best parameter:\n', clf_grid.best_params_)
    print('Best score is {}'.format(clf_grid.best_score_))
    # 'f-score' must come from f1_score; accuracy is already reported first.
    print('accuracy:', metrics.accuracy_score(y_test, y_pred), 'precision:', metrics.precision_score(y_test, y_pred),
          'recall:', metrics.recall_score(y_test, y_pred), 'f-score:', metrics.f1_score(y_test, y_pred),
          'cm:', metrics.confusion_matrix(y_test, y_pred))

    # XGBoost
    # Grid search with step-by-step parameter tuning:
    #   1. fix learning rate and number of estimators for tuning tree-based parameters;
    #   2. tune max_depth and min_child_weight;
    #   3. tune gamma;
    #   4. tune subsample and colsample_bytree;
    #   5. tune regularization parameters;
    #   6. tune learning rate and number of estimators.
    # Reference guide: https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
    # Step 6 (learning rate and number of estimators) is shown as an example.
    # Imported at the surrounding indentation so the enclosing body stays
    # syntactically valid.
    from xgboost.sklearn import XGBClassifier

    param_test5 = {'n_estimators': range(50,500,50),'learning_rate':[i/100.0 for i in range(1,10)]}
    gsearch5 = GridSearchCV(estimator=XGBClassifier(max_depth=5,min_child_weight=1, gamma=0.0, subsample=0.9, colsample_bytree=0.7,reg_alpha=0.1,
                                                    objective= 'binary:logistic',nthread=4, scale_pos_weight=1, seed=2),param_grid = param_test5, scoring='accuracy',n_jobs=-1,iid=False, cv=5)

    gsearch5.fit(X_train, y_train)
    y_pred = gsearch5.predict(X_test)

    print('best parameter:\n', gsearch5.best_params_)
    print('Best score is {}'.format(gsearch5.best_score_))
    print('accuracy:', metrics.accuracy_score(y_test, y_pred), 'precision:', metrics.precision_score(y_test, y_pred),
          'recall:', metrics.recall_score(y_test, y_pred), 'f-score:', metrics.f1_score(y_test, y_pred),
          'cm:', metrics.confusion_matrix(y_test, y_pred))

    # Refit a single XGBoost model with fixed parameters and score it on the
    # test set (the returned accuracy is not stored).
    clf = XGBClassifier(max_depth=5, min_child_weight=1, gamma=0.0, subsample=0.9, colsample_bytree=0.7,
                        reg_alpha=0.1, n_estimators=350, learning_rate=0.05)
    clf.fit(X_train, y_train)
    clf.score(X_test, y_test)

示例#20

0

显示文件

class XGBoostModel(BaseModel):
    """XGBoost classifier exposed through the project's ``BaseModel`` interface."""

    def __init__(self,
                 max_depth=3,
                 learning_rate=0.1,
                 n_estimators=100,
                 objective="binary:logistic",
                 booster='gbtree',
                 silent=True,
                 n_jobs=1,
                 gamma=0,
                 min_child_weight=1,
                 max_delta_step=0,
                 subsample=1,
                 colsample_bytree=1,
                 colsample_bylevel=1,
                 reg_alpha=0,
                 reg_lambda=1,
                 scale_pos_weight=1,
                 base_score=0.5,
                 random_state=0,
                 missing=None):
        """Create the wrapped ``XGBClassifier``; every argument is forwarded to it unchanged."""
        # One-argument super(XGBoostModel) yields an unbound proxy, so the
        # base initializer would never run; the zero-argument form does.
        super().__init__()
        self.model = XGBClassifier(max_depth=max_depth,
                                   learning_rate=learning_rate,
                                   n_estimators=n_estimators,
                                   silent=silent,
                                   objective=objective,
                                   booster=booster,
                                   n_jobs=n_jobs,
                                   gamma=gamma,
                                   min_child_weight=min_child_weight,
                                   max_delta_step=max_delta_step,
                                   subsample=subsample,
                                   colsample_bytree=colsample_bytree,
                                   colsample_bylevel=colsample_bylevel,
                                   reg_alpha=reg_alpha,
                                   reg_lambda=reg_lambda,
                                   scale_pos_weight=scale_pos_weight,
                                   base_score=base_score,
                                   random_state=random_state,
                                   missing=missing)

    def predict(self, features):
        """Return the predicted class labels for ``features``."""
        super().predict(features)
        labels = self.model.predict(features)
        return labels

    def predict_prob(self, features):
        """Return the predicted class probabilities for ``features``."""
        super().predict_prob(features)
        probs = self.model.predict_proba(features)
        return probs

    def predict_log_prob(self, features):
        """Return the natural log of the predicted class probabilities."""
        super().predict_log_prob(features)
        probs = self.model.predict_proba(features)
        # Zero probabilities map to -inf, as with scikit-learn's
        # predict_log_proba.
        return np.log(probs)

    def train(self, features, targets):
        """Fit the wrapped model and print the elapsed wall-clock time."""
        super().train(features, targets)
        start = time.time()
        self.model.fit(X=features, y=targets)
        print('Finished, time %s' % (time.time() - start))

    def accuracy_score(self, features, targets):
        """Return the mean accuracy of the model on ``features``/``targets``."""
        super().accuracy_score(features, targets)
        # score()'s third argument is a per-sample weight array; the scalar
        # scale_pos_weight is not a valid value for it, so it is not passed.
        score = self.model.score(features, targets)
        return score

    def abs_errors(self, features, targets):
        """Return the element-wise absolute difference between predictions and ``targets``."""
        targets_pred = self.predict(features)
        result = abs(targets_pred - targets)
        return result

    def rmse_score(self, y_pred, y_true):
        """
        Compute a weighted-RMSE based score.

        To reflect the different importance of the true labels 0, 1 and 2,
        errors on samples whose true label is 1 or 2 are penalised more:
        the squared errors are averaged with weights 1, 2 and 2.5 for the
        true labels 0, 1 and 2 respectively, i.e.
        np.average((y_true - y_pred) ** 2, axis=0, weights=weights).

        :param y_pred: predicted labels
        :param y_true: true labels (each must be 0, 1 or 2)
        :return: 1 / (1 + sqrt(weighted MSE)); 1.0 for a perfect prediction
        """
        weight_dict = {0: 1, 1: 2, 2: 2.5}  # misclassification penalty per true class
        weights = [weight_dict[label] for label in y_true]
        mse = np.average((y_true - y_pred)**2, axis=0, weights=weights)
        score = 1 / (1 + np.sqrt(mse))
        return score

示例#21

0

显示文件

文件： xgboost调参-2.py 项目： yanheluke/Machine_learning

    #
    #feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    #feat_imp.plot(kind='bar', title='Feature Importances')
    #plt.ylabel('Feature Importance Score')


### Build the model
print('### 建立XGBClassifier')
xgbc = XGBClassifier(
    learning_rate=0.1,  # learning rate
    silent=1,  # verbosity switch (original note: "print intermediate progress"; NOTE(review): in xgboost's legacy API silent=1 suppresses messages -- confirm)
    n_estimators=150,  # number of trees
    max_depth=5,  # tree depth
    min_child_weight=1,  # minimum sum of instance weights in a leaf (marked as uncertain in the original note)
    gamma=0,  # penalty-term coefficient
    subsample=0.8,  # fraction of the rows used to train each tree
    colsample_bytree=0.8,  # fraction of the features used to train each tree
    objective='binary:logistic',  # loss function
    nthread=4,  # number of threads
    scale_pos_weight=1,  # class-imbalance weighting
    eval_metric='logloss',  # evaluation metric
    reg_alpha=0.03,  # regularization coefficient
    seed=27)  # random seed

### Grid search
## step1: number of trees (n_estimators)
print("### 调参:决策树个数")
#modelfit(xgbc, train, test, predictors)

## step2: tree parameters max_depth/min_child_weight/gamma/subsample/colsample_bytree
print("### 调参:决策树参数")

示例#22

0

显示文件

文件： module7_boost_submit.py 项目： mircean/ML

		# Store this split as [X_train, X_test, y_train, y_test].
		Xy.append([X_train, X_test, y_train, y_test])

	# Fit one classifier per stored split, evaluating on that split's test part.
	for iter in range(iterations):
#		if iter < 5:
#			continue
		X_train = Xy[iter][0]
		X_test = Xy[iter][1]
		y_train = Xy[iter][2]
		y_test = Xy[iter][3]

		# NOTE(review): y_train2 is computed but fit() below receives the raw
		# y_train, while the eval_set uses the encoded y_test2 -- confirm.
		y_train2 = le.transform(y_train)   
		y_test2 = le.transform(y_test)   

		print('fit start', datetime.now())

		# ss_cs is used for both subsample and colsample_bytree.
		clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators, objective='multi:softprob', subsample=ss_cs, colsample_bytree=ss_cs, gamma=gamma, min_child_weight=min_child_weight, seed=0, silent=True, reg_lambda=reg_lambda, reg_alpha=reg_alpha, nthread=nthread)      
		clf.fit(X_train, y_train, eval_set=[(X_test, y_test2)], eval_metric=calculate_score_2)

# Submission switch: the block below only runs when submit is set to 1.
submit = 0
if submit == 1:
#	n_estimators = 395
	n_estimators = 349
	#n_estimators = clf.booster().best_ntree_limit 
	print(n_estimators)

	# Refit on X, y with the fixed tree count and predict X_predict.
	print('fit start', datetime.now())
	clf2 = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators, objective='multi:softprob', subsample=ss_cs, colsample_bytree=ss_cs, gamma=gamma, min_child_weight=min_child_weight, seed=0, silent=True, reg_lambda=reg_lambda, reg_alpha=reg_alpha, nthread=nthread)      
	clf2.fit(X, y)
	#clf2.fit(X, y, eval_set=[(X, y2)], eval_metric=calculate_score_dummy, early_stopping_rounds=n_estimators)

	y_predicted = clf2.predict_proba(X_predict)

示例#23

0

显示文件

文件： XGboost+ClusterCentroids.py 项目： non27/The-final-assignment

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


# Split the frame into features (every column except 'state') and target ('state').
# NOTE(review): the target is selected with a boolean column mask followed by
# .values, which presumably yields a 2-D (n, 1) array rather than a 1-D vector —
# confirm the calls below accept that shape.
X, y = data.loc[:,data.columns != 'state'].values, data.loc[:,data.columns == 'state'].values
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Resample the training split only with ClusterCentroids; the test split is left as is.
# NOTE(review): newer imbalanced-learn releases name this method fit_resample —
# confirm against the installed version.
cc = ClusterCentroids(random_state=0)
os_X,os_y = cc.fit_sample(X_train,y_train)

# Fit XGBoost on the resampled data, tracking AUC on both the resampled training
# set and the untouched test set.
# NOTE(review): scale_pos_weight=1000 is applied on top of the resampling above —
# confirm that combination is intended.
clf_XG = XGBClassifier(learning_rate= 0.3, min_child_weight=1,
                       max_depth=6,gamma=0,subsample=1, max_delta_step=0, colsample_bytree=1,
                       reg_lambda=1, n_estimators=100, seed=1000, scale_pos_weight=1000)  
clf_XG.fit(os_X, os_y,eval_set=[(os_X, os_y), (X_test, y_test)],eval_metric='auc',verbose=False)  
# Per-round metric history for the two eval sets (not used again in this snippet).
evals_result = clf_XG.evals_result()  
y_true, y_pred = y_test, clf_XG.predict(X_test)  

# Report F1 score, recall and precision on the test split.
print "F1_score : %.4g" % metrics.f1_score(y_true, y_pred)  
print "Recall : %.4g" % metrics.recall_score(y_true, y_pred)
# Recall is computed a second time and kept in a variable; it is not used in the visible code.
recall = metrics.recall_score(y_true, y_pred)  
print "Precision : %.4g" % metrics.precision_score(y_true, y_pred)
 
# Compute the confusion matrix and derive specificity from its first row:
# cnf_matrix[0,0] / (cnf_matrix[0,0] + cnf_matrix[0,1]).
# NOTE(review): this is TN / (TN + FP) only if row 0 is the negative class — confirm label order.
cnf_matrix = confusion_matrix(y_test,y_pred)
np.set_printoptions(precision=2)
print "Specifity: " , float(cnf_matrix[0,0])/(cnf_matrix[0,0]+cnf_matrix[0,1])

示例#24

0

显示文件

文件： voting.py 项目： ajmal017/python_base

from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.svm import SVC
import numpy as np
# Prepare three base models up front. The XGBoost and random-forest
# hyperparameters below were already selected through cross-validation.
clf1 = XGBClassifier(learning_rate=0.1,
                     n_estimators=140,
                     max_depth=1,
                     min_child_weight=2,
                     gamma=0,
                     subsample=0.7,
                     colsample_bytree=0.6,
                     objective='binary:logistic',
                     nthread=4,
                     scale_pos_weight=1)
clf2 = RandomForestClassifier(n_estimators=50,
                              max_depth=1,
                              min_samples_split=4,
                              min_samples_leaf=54,
                              oob_score=True)
# probability=True is set on the SVC even though the ensemble below votes on
# hard labels.
clf3 = SVC(C=0.1, probability=True)

from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
# Majority-vote ensemble over the three base models (voting='hard').
eclf = VotingClassifier(estimators=[('xgb', clf1), ('rf', clf2),
                                    ('svc', clf3)],
                        voting='hard')
# 5-fold cross-validated accuracy for each base model and for the ensemble.
# `x` and `y` are defined outside this snippet. `scores` is overwritten on every
# iteration and `label` is not used in the visible code.
for clf, label in zip([clf1, clf2, clf3, eclf],
                      ['XGBBoosting', 'Random Forest', 'SVM', 'Ensemble']):
    scores = cross_val_score(clf, x, y, cv=5, scoring='accuracy')

示例#25

0

显示文件

文件： xgboost.py 项目： Helen-n/kaggle

# Load the shot log. Rows whose shot_made_flag is missing are the ones to be
# predicted; every other row is labelled training data.
data = pd.read_csv("../data/data.csv")

# Columns that are one-hot encoded in addition to action_type.
cols = ["combined_shot_type", "game_event_id", "period", "playoffs",
        "shot_type", "shot_zone_area", "shot_zone_basic", "shot_zone_range",
        "matchup", "opponent", "game_date", "shot_distance",
        "minutes_remaining", "seconds_remaining",
        "loc_x", "loc_y"]

# Build every dummy block first and concatenate once. Calling pd.concat inside
# the loop would copy the ever-growing frame on each iteration.
dummy_blocks = [pd.get_dummies(data.action_type, prefix="action_type")]
dummy_blocks.extend(pd.get_dummies(data[col], prefix=col) for col in cols)
data_x = pd.concat(dummy_blocks, axis=1)

# Boolean mask of labelled rows; `~` is the supported way to invert it.
has_label = data.shot_made_flag.notnull()
train_x = data_x[has_label]
test_x = data_x[~has_label]
train_y = data.shot_made_flag[has_label]

clf = XGBClassifier(max_depth=6, learning_rate=0.01, n_estimators=550,
                    subsample=0.5, colsample_bytree=0.5, seed=0)
clf.fit(train_x, train_y)
# Predictions are made on the same rows the model was fitted on, so the count
# printed below is a training error, not a held-out estimate.
y_pred = clf.predict(train_x)
print("Number of mislabeled points out of a total %d points : %d"  % (train_x.shape[0],(train_y != y_pred).sum()))

def logloss(act, pred):
    """Print and return the mean binary cross-entropy of *pred* against *act*.

    Args:
        act: true labels (0 or 1); any array-like of numbers.
        pred: predicted probabilities of the positive class, same length as
            *act*. Values are clipped to [1e-15, 1 - 1e-15] so the logarithm
            stays finite for predictions of exactly 0 or 1.

    Returns:
        The average of -(act*log(pred) + (1-act)*log(1-pred)) over all samples.

    Raises:
        ValueError: if the inputs are empty or differ in shape.
    """
    # Local import keeps the function self-contained. The original relied on
    # the NumPy aliases in the top-level scipy namespace (sp.log, sp.maximum,
    # ...), which SciPy deprecated.
    import numpy as np

    epsilon = 1e-15
    act = np.asarray(act, dtype=float)
    pred = np.asarray(pred, dtype=float)
    if act.shape != pred.shape:
        raise ValueError("act and pred must have the same shape")
    if act.size == 0:
        raise ValueError("logloss is undefined for empty input")

    pred = np.clip(pred, epsilon, 1 - epsilon)
    ll = -np.mean(act * np.log(pred) + (1 - act) * np.log(1 - pred))
    print(ll)
    return ll
    
logloss(train_y,clf.predict_proba(train_x)[:,1])

示例#26

0

显示文件

文件： python.py 项目： andositopu/Data-Science-Lengkap-Analisis-Kartu-Kredit-Customers

from sklearn.ensemble.forest import ExtraTreesClassifier
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

# In[29]:

# Shared preprocessing: keep the 10 features ranked best by f_classif, then
# expand them with degree-2 polynomial terms.
preprocessor = make_pipeline(SelectKBest(f_classif, k=10),
                             PolynomialFeatures(2))

# One pipeline per candidate model. SVM and GBoost additionally standardise the
# expanded features before the estimator.
# NOTE(review): the same `preprocessor` object is passed to every pipeline, so
# it is presumably shared (and refitted) rather than copied — verify that this
# is acceptable if several pipelines are kept fitted at once.
AdaBoost = make_pipeline(preprocessor, AdaBoostClassifier(random_state=0))
SVM = make_pipeline(preprocessor, StandardScaler(), SVC(random_state=0))
GBoost = make_pipeline(preprocessor, StandardScaler(),
                       GradientBoostingClassifier())
RandomForest = make_pipeline(preprocessor, RandomForestClassifier())
XGB = make_pipeline(preprocessor, XGBClassifier())
Extree = make_pipeline(preprocessor, ExtraTreesClassifier())

# Name -> pipeline lookup for the candidate models defined above.
dict_of_models = {
    'AdaBoost': AdaBoost,
    'SVM': SVM,
    'GBoost': GBoost,
    'RandomForest': RandomForest,
    'XGB': XGB,
    'Extree': Extree
}

# In[30]:

from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.model_selection import learning_curve

示例#27

0

显示文件

文件： xgboost02.py 项目： haborta/AILab

from xgboost.sklearn import XGBClassifier

# Load the CSV: the first 8 columns are the features, column 8 is the label.
dataset = loadtxt("pima-indians-diabetes.csv", delimiter=",")
X = dataset[:, 0:8]
Y = dataset[:, 8]

# Fixed seed so the train/test split is reproducible.
seed = 7

# Hold out 33% of the rows as the test/validation split.
test_size = 0.33
X_train, x_test, Y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=test_size,
                                                    random_state=seed)

model = XGBClassifier()
eval_set = [(x_test, y_test)]
# early_stopping_rounds: stop training when the metric has not improved for N consecutive rounds
# eval_metric: metric evaluated on eval_set (log loss here)
# eval_set: A list of (X, y) pairs to use as a validation set for early-stopping
# verbose: print the evaluation results while training
# NOTE(review): the held-out split is used both for early stopping here and for
# the predictions below — confirm that reuse is intended.
model.fit(X_train,
          Y_train,
          early_stopping_rounds=10,
          eval_metric="logloss",
          eval_set=eval_set,
          verbose=True)

# make predictions for test data
y_pred = model.predict(x_test)
# Round each predicted value to the nearest integer.
predictions = [round(value) for value in y_pred]

示例#28

0

显示文件

文件： 1_simple_xgb.py 项目： shubhampachori12110095/patient2vec

# Train and score one XGBoost model per (data file, months_before) pair and
# write the AUC and log loss of each to the log.
for data_file in data_files:
    if exists_in_log_file(data_file):
        # Skip if already trained and tested
        print('Skipping {}'.format(data_file))
        continue

    print('Training on {}'.format(data_file))

    # Load the serialized splits. The `with` block closes the file handle even
    # if deserialization fails (the original left it open).
    # NOTE(review): dill.load can execute arbitrary code embedded in the file;
    # only point DATA_DIR at trusted data.
    with open(os.path.join(DATA_DIR, data_file), 'rb') as data_fh:
        data = dill.load(data_fh)

    for months_before in data.keys():
        split = data[months_before]
        train_x = split["TRAIN"]["X"]
        train_y = split["TRAIN"]["y"]
        test_x = split["TEST"]["X"]
        test_y = split["TEST"]["y"]

        # Creating and training model
        # NOTE(review): `verbose=1` is passed to the constructor as in the
        # original — confirm the installed xgboost accepts it there.
        clf = XGBClassifier(n_estimators=N_ESTIMATORS,random_state=1,
                            verbose=1, n_jobs=N_JOBS)
        clf.fit(train_x, train_y, verbose=True)

        # Scoring: AUC uses the second probability column, log loss uses the
        # full probability matrix.
        pred_y = clf.predict_proba(test_x)

        auc_score = roc_auc_score(test_y, pred_y[:,1])
        log_score = log_loss(test_y, pred_y)

        logging.info('{}, {}, {}, {}'.format(data_file, months_before, auc_score, log_score))

示例#29

0

显示文件

    def train(self, train_set, dev_set):
        """Grid-search an XGBoost classifier on features produced by the model.

        Features for both splits are loaded from ``train_features_file`` /
        ``dev_features_file`` when those files exist; otherwise they are
        computed batch by batch through ``self.sess`` and saved there. Every
        combination of ``max_depth`` and ``n_estimators`` is fitted on the
        training features and scored on the dev features, and the classifier
        with the highest dev score is stored in ``self.clf``.

        Args:
            train_set: examples used to fit each candidate classifier.
            dev_set: examples used to score the candidates.
        """
        logger.log('Get features from training set')
        train_features, train_labels = self._load_or_extract_features(
            train_set, train_features_file)

        logger.log('Get features from dev set')
        dev_features, dev_labels = self._load_or_extract_features(
            dev_set, dev_features_file)

        tuned_parameters = {'max_depth': [4, 6, 8], 'n_estimators': [100, 200]}

        # self.clf is only assigned when a candidate scores strictly above 0.
        best_score = 0.
        best_params = []
        for g in ParameterGrid(tuned_parameters):
            clf = XGBClassifier(nthread=24)
            clf.set_params(**g)
            clf.fit(train_features, train_labels)
            score = clf.score(dev_features, dev_labels)
            logger.log('%s: %f' % (str(g), score))
            if best_score < score:
                best_score = score
                best_params = g
                self.clf = clf

        logger.log('Best score: %s %f' % (str(best_params), best_score))

    def _load_or_extract_features(self, dataset, features_file):
        """Return ``(features, labels)`` for *dataset*, using *features_file* as a cache.

        If *features_file* exists the features are loaded from it and the
        labels are taken from a single ``get_minibatch`` call over the whole
        dataset. Otherwise the dataset is run through the model in batches of
        ``self.batch_size``, the per-batch features are concatenated, and the
        result is saved to *features_file*.
        """
        if os.path.exists(features_file):
            features = np.load(features_file)
            _, _, labels, _, _ = self.get_minibatch(dataset, 0, len(dataset))
            return features, labels

        features = None
        labels = []
        # Floor division keeps the batch count an int; with true division
        # (Python 3 `/`) range() would be handed a float and raise TypeError.
        total_batch = (len(dataset) - 1) // self.batch_size + 1
        for i in tqdm(range(total_batch)):
            minibatch_premise_vectors, minibatch_hypothesis_vectors, minibatch_labels, \
                minibatch_prem_dep, minibatch_hypo_dep = \
                self.get_minibatch(dataset, i * self.batch_size, (i + 1) * self.batch_size)
            feed_dict = {
                self.model.premise_x: minibatch_premise_vectors,
                self.model.hypothesis_x: minibatch_hypothesis_vectors,
                self.model.y: minibatch_labels,
                # presumably a keep rate of 1.0 disables dropout — TODO confirm
                self.model.keep_rate_ph: 1.0
            }
            if 'dep_avg' in self.model_type:
                feed_dict[self.model.prem_dep] = minibatch_prem_dep
                feed_dict[self.model.hypo_dep] = minibatch_hypo_dep
            minibatch_features = self.sess.run([self.model.features],
                                               feed_dict)
            features = minibatch_features[0] if features is None \
                else np.concatenate((features, minibatch_features[0]))
            labels += minibatch_labels

        np.save(features_file, features)
        return features, labels

示例#30

0

显示文件

    params = {
        'learning_rate': 0.1,
        'n_estimators': 100,
        'seed': 0,
        'subsample': 1,
        'colsample_bytree': 1,
        'objective': 'binary:logistic',
        'max_depth': 3
    }

    # log model params
    for key in params:
        mlflow.log_param(key, params[key])

    # train XGBoost model
    gbtree = XGBClassifier(**params)
    gbtree.fit(train_features, train_labels)

    importances = gbtree.get_booster().get_fscore()
    print(importances)

    # get predictions
    y_pred = gbtree.predict(test_features)

    accuracy = accuracy_score(test_labels, y_pred)
    print("Accuracy: %.1f%%" % (accuracy * 100.0))

    # log accuracy metric
    mlflow.log_metric("accuracy", accuracy)

    sns.set(font_scale=1.5)

示例#31

0

显示文件

文件： para2.py 项目： yuanyuanshi/kaggle_competition_scripts

# Stage 1: train a booster on all features with the native xgboost API to
# obtain per-feature fscore values.
params={'max_depth':6, 'eta':0.05,'objective':'multi:softprob', 'subsample':0.8, 'colsample_bytree':1,'min_child_weight':1,'num_class':3}
num_rounds=206
# `z` is not used anywhere in the visible snippet.
z=[]
dtrain=xgb.DMatrix(train[features],label=y)
clf=xgb.train(params,dtrain,num_rounds)

# Feature scores, using the feature map file 'xgb.fmap', sorted ascending and
# normalised so that they sum to 1.
importance=clf.get_fscore(fmap='xgb.fmap')
importance=sorted(importance.items(),key=operator.itemgetter(1))
df = pd.DataFrame(importance, columns=['feature', 'fscore'])
df['fscore'] = df['fscore'] / df['fscore'].sum()

# Keep only the features whose normalised score exceeds 0.001.
bst=list(df['feature'][df.fscore>0.001])
#df.to_csv('select.csv',index=False)
# Stage 2: refit on the selected features with a validation split.
# NOTE(review): test_size=0.6 puts 60% of the rows in the validation split —
# confirm that ratio is intended.
X_train,X_valid,y_train,y_valid=train_test_split(train[bst],y,test_size=0.6,random_state=10)
print ('start xgboost learning...')
alg = XGBClassifier(max_depth=6, learning_rate=0.05, n_estimators=1210, objective='multi:softprob', subsample=0.8, colsample_bytree=1,min_child_weight=1)                    
# mlogloss is tracked on both the training and validation splits, with
# early_stopping_rounds=10.
alg.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)],eval_metric='mlogloss',early_stopping_rounds=10,verbose=True)


#plt.figure()
#df.plot()
#df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(6, 10))
#plt.title('XGBoost Feature Importance')
#plt.xlabel('relative importance')
#plt.gcf().savefig('feature_importance_xgb.png')
# Class probabilities for the test rows, one column per class, plus the row id.
y_pred = alg.predict_proba(test[bst])
result=pd.DataFrame(y_pred,columns=['predict_0','predict_1','predict_2'])
result['id']=test.id.values.copy()
#result.to_csv('xgb10.csv',index=False)

示例#32

0

显示文件

文件： sample329.py 项目： tetherless-world/CodeGraph

# *Selection of ML algorithm*: A first approach to deal with imbalanced data is to balance it by discarding the majority class before applying an ML algorithm. The disadvantage of undersampling is that a model trained in this way will not perform well on real-world skewed test data since almost all the information was discarded. A better approach might be to oversample the minority class, say by the synthetic minority oversampling technique (SMOTE) contained in the 'imblearn' library. Motivated by this, I tried a variety of anomaly-detection and supervised learning approaches. I find, however, that the best result is obtained on the original dataset by using an ML algorithm based on ensembles of decision trees that intrinsically performs well on imbalanced data. Such algorithms not only allow for constructing a model that can cope with the missing values in our data, but they also naturally allow for speedup via parallel processing. Among these algorithms, the extreme gradient-boosted (XGBoost) algorithm used below slightly outperforms random forest. Finally, XGBoost, like several other ML algorithms, allows the positive class to be weighted more heavily than the negative class --- a setting that also makes it possible to account for the skew in the data.

# Split the data into training and test sets in a 80:20 ratio

# In[ ]:


trainX, testX, trainY, testY = train_test_split(X, Y, test_size = 0.2, \
                                                random_state = randomState)

# In[ ]:

# Long computation in this cell (~1.8 minutes)
# Ratio of negative (Y == 0) to positive (Y == 1) samples, passed to XGBoost as
# scale_pos_weight.
# NOTE(review): the ratio is computed on the full Y rather than on trainY —
# confirm that is intended.
weights = (Y == 0).sum() / (1.0 * (Y == 1).sum())
clf = XGBClassifier(max_depth = 3, scale_pos_weight = weights, \
                n_jobs = 4)
# Fit on the training split and predict class probabilities for the test split.
probabilities = clf.fit(trainX, trainY).predict_proba(testX)
# Average precision, computed from the second probability column, is reported as AUPRC.
print('AUPRC = {}'.format(average_precision_score(testY, \
                                              probabilities[:, 1])))

# <a href='#top'>back to top</a>

# <a id='importance'></a>
# ##### 6.1. What are the important features for the ML model?
# The figure below shows that the new feature *errorBalanceOrig* that we created is the most relevant feature for the model. The features are ordered based on the number of samples affected by splits on those features.

# In[ ]:

fig = plt.figure(figsize=(14, 9))
ax = fig.add_subplot(111)

示例#33

0

显示文件

文件： LoanPrediction2_XGB.py 项目： Paliking/ML_examples

#  gamma=0,
#  subsample=0.6,
#  colsample_bytree=0.7,
#  objective= 'binary:logistic',
#  scale_pos_weight=1,
#  reg_alpha=0.1,
#  seed=27)
# modelfit(xgb1, df_train, predictors, targetname, early_stopping_rounds=50)


xgb1 = XGBClassifier(
 learning_rate=0.01,
 n_estimators=700,
 max_depth=5,
 min_child_weight=8,
 gamma=0.3,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 scale_pos_weight=1,
 seed=27)



xgb1.fit(df_train[predictors], df_train[targetname])
df_test['target'] = xgb1.predict(df_test[predictors])




df_test['target'] = df_test['target'].apply(lambda x: 'Y' if x==1 else 'N')

示例#34

0

显示文件

文件： xgboost_zillow_home_value.py 项目： johnnychiuchiu/Machine-Learning

        y_train, dtrain_predprob)

    feat_imp = pd.Series(
        alg.get_booster().get_fscore()).sort_values(ascending=False)
    print feat_imp
    # feat_imp.plot(kind='bar', title='Feature Importances')
    # plt.ylabel('Feature Importance Score')


# predictors = [x for x in train.columns if x not in [target, IDcol]]
xgb1 = XGBClassifier(learning_rate=0.2,
                     n_estimators=100,
                     max_depth=5,
                     min_child_weight=1,
                     gamma=0,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     objective='reg:linear',
                     n_jobs=4,
                     scale_pos_weight=1,
                     random_state=27)

# modelfit(xgb1, x_train, y_train)

##### Step 2: Tune max_depth and min_child_weight
### description
# We tune these first as they will have the highest impact on model outcome. To start with, let's set wider ranges
# and then we will perform another iteration for smaller ranges.
### note
# GridSearchCV documentation -> http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
# scoring parameters -> http://scikit-learn.org/stable/modules/model_evaluation.html

示例#35

0

显示文件

文件： ipinyouregression.py 项目： marcincuber/WebEconomics

data_test = large_set.tail(545421)
data_test = pd.DataFrame(pd.concat([data_test, tag_test_set], axis=1))
print 'Finished Reconstructing Train/Test Sets'
print data_train.shape
print data_test.shape


print 'Started Computing train set labels'
label_set = np.sign(label_set['Click'])
label_set[label_set == -1] = 0
print 'Finished computing train set labels'

# fit estimator
print "start XGBClassifier"
n_samples = data_train.shape[0]
est=XGBClassifier(n_estimators=200, learning_rate=0.1, silent= False)

print "start fitting"
est.fit(data_train, label_set)
# predict class labels
probs = est.predict_proba(data_test)

print "cross validation start"
cv = cross_validation.ShuffleSplit(n_samples, n_iter=10, random_state=0)
scores = cross_validation.cross_val_score(est, data_train, label_set, cv=cv)
mean = np.mean(probs[:, 1])
std = np.std(probs[:, 1])
print "Test predicted Mean:", mean
print "Test predicted STD:", std
df = pd.DataFrame(probs[:, 1])
df.columns = ["Prediction"]

示例#36

0

显示文件

    def dviz_classification_visualization(data_train, target_train,
                                          classifierName):
        clf = tree.DecisionTreeClassifier(max_depth=5, random_state=666)
        clf.fit(data_train, target_train)
        svg_tree = dtreeviz(
            clf,
            data_train,
            target_train,
            target_name=classifierName,
            feature_names=data_train.columns,
            orientation="TD",
            class_names=[classifierName, 'not_' + classifierName],
            fancy=True,
            histtype='strip',
            X=None,
            label_fontsize=12,
            ticks_fontsize=8,
            fontname="Arial")
        fname = os.path.join(
            tree_evaluations_out,
            str(classifierName) + 'fancy_decision_tree_example.svg')
        svg_tree.save(fname)

        print(rounds)
        # READ IN DATA FOLDER AND REMOVE ALL NON-FEATURE VARIABLES (POP DLC COORDINATE DATA AND TARGET DATA)
        print('Reading in ' + str(len(os.listdir(data_folder))) +
              ' annotated files...')
        for i in os.listdir(data_folder):
            if i.__contains__(".csv"):
                currentFn = os.path.join(data_folder, i)
                df = pd.read_csv(currentFn, index_col=0)
                features = features.append(df, ignore_index=True)
                print(features)
        features = features.loc[:, ~features.columns.str.contains('^Unnamed')]
        features = features.drop(["scorer"], axis=1, errors='ignore')
        totalTargetframes = features[classifierName].sum()
        try:
            targetFrame = features.pop(classifierName).values
        except KeyError:
            print(
                'Error: the dataframe does not contain any target annotations. Please check the csv files in the project_folder/csv/target_inserted folder'
            )
        features = features.fillna(0)
        features = drop_bp_cords(features, inifile)
        target_names = []
        loop = 1
        for i in range(model_nos):
            currentModelNames = 'target_name_' + str(loop)
            currentModelNames = config.get('SML settings', currentModelNames)
            if currentModelNames != classifierName:
                target_names.append(currentModelNames)
            loop += 1
        print('# of models to be created: 1')

        for i in range(len(target_names)):
            currentModelName = target_names[i]
            features.pop(currentModelName).values
        class_names = class_names = ['Not_' + classifierName, classifierName]
        feature_list = list(features)
        print('# of features in dataset: ' + str(len(feature_list)))

        # IF SET BY USER - PERFORM UNDERSAMPLING AND OVERSAMPLING IF SET BY USER
        data_train, data_test, target_train, target_test = train_test_split(
            features, targetFrame, test_size=train_test_size)
        under_sample_setting = config.get('create ensemble settings',
                                          'under_sample_setting')
        over_sample_setting = config.get('create ensemble settings',
                                         'over_sample_setting')
        trainDf = data_train
        trainDf[classifierName] = target_train
        targetFrameRows = trainDf.loc[trainDf[classifierName] == 1]
        print('# of ' + str(classifierName) + ' frames in dataset: ' +
              str(totalTargetframes))
        trainDf = trainDf.sample(frac=1).reset_index(drop=True)
        if under_sample_setting == 'Random undersample':
            print('Performing undersampling...')
            under_sample_ratio = config.getfloat('create ensemble settings',
                                                 'under_sample_ratio')
            nonTargetFrameRows = trainDf.loc[trainDf[classifierName] == 0]
            nontargetFrameRowsSize = int(
                len(targetFrameRows) * under_sample_ratio)
            nonTargetFrameRows = nonTargetFrameRows.sample(
                nontargetFrameRowsSize, replace=False)
            trainDf = pd.concat([targetFrameRows, nonTargetFrameRows])
            target_train = trainDf.pop(classifierName).values
            data_train = trainDf
        if under_sample_setting != 'Random undersample':
            target_train = trainDf.pop(classifierName).values
            under_sample_ratio = 'NaN'
        if over_sample_setting == 'SMOTEENN':
            print('Performing SMOTEEN oversampling...')
            over_sample_ratio = config.getfloat('create ensemble settings',
                                                'over_sample_ratio')
            smt = SMOTEENN(sampling_strategy=over_sample_ratio)
            data_train, target_train = smt.fit_sample(data_train, target_train)
        if over_sample_setting == 'SMOTE':
            print('Performing SMOTE oversampling...')
            over_sample_ratio = config.getfloat('create ensemble settings',
                                                'over_sample_ratio')
            smt = SMOTE(sampling_strategy=over_sample_ratio)
            data_train, target_train = smt.fit_sample(data_train, target_train)
        if (over_sample_setting != 'SMOTEENN') or (over_sample_setting !=
                                                   'SMOTE'):
            over_sample_ratio = 'NaN'
        data_train = data_train.sample(frac=1).reset_index(drop=True)
        #target_train = np.random.shuffle(target_train)

        # RUN THE DECISION ENSEMBLE SET BY THE USER
        # run random forest
        if model_to_run == 'RF':
            print('Training model ' + str(classifierName) + '...')
            RF_n_estimators = config.getint('create ensemble settings',
                                            'RF_n_estimators')
            RF_max_features = config.get('create ensemble settings',
                                         'RF_max_features')
            RF_criterion = config.get('create ensemble settings',
                                      'RF_criterion')
            RF_min_sample_leaf = config.getint('create ensemble settings',
                                               'RF_min_sample_leaf')
            clf = RandomForestClassifier(n_estimators=RF_n_estimators,
                                         max_features=RF_max_features,
                                         n_jobs=-1,
                                         criterion=RF_criterion,
                                         min_samples_leaf=RF_min_sample_leaf,
                                         bootstrap=True,
                                         verbose=1)
            try:
                clf.fit(data_train, target_train)
            except ValueError:
                print(
                    'ERROR: The model contains a faulty array. This may happen when trying to train a model with 0 examples of the behavior of interest'
                )

            # predictions = clf.predict_proba(data_test)
            # data_test['probability'] = predictions[:, 1]
            # data_test['prediction'] = np.where(data_test['probability'] > 0.499999, 1, 0)
            # print(data_test['prediction'].sum())

            scoring = ['precision', 'recall', 'f1']
            newDataTargets = np.concatenate((target_train, target_test),
                                            axis=0)
            # #newDataTargets = np.where((newDataTargets == 0) | (newDataTargets == 1), newDataTargets ** 1, newDataTargets)
            # newDataFeatures = np.concatenate((data_train, data_test), axis=0)
            # #newDataFeatures = np.where((newDataFeatures == 0) | (newDataFeatures == 1), newDataFeatures ** 1, newDataFeatures)
            # cv = ShuffleSplit(n_splits=5, test_size=train_test_size)
            # results = cross_validate(clf, newDataFeatures, newDataTargets, cv=cv, scoring=scoring)
            # results = pd.DataFrame.from_dict(results)
            # crossValresultsFname = os.path.join(tree_evaluations_out, str(classifierName) + '_cross_val_100.csv')
            # results.to_csv(crossValresultsFname)

            # #RUN RANDOM FOREST EVALUATIONS
            # compute_permutation_importance = config.get('create ensemble settings', 'compute_permutation_importance')
            # if compute_permutation_importance == 'yes':
            #     print('Calculating permutation importances...')
            #     computePermutationImportance(data_test, target_test, clf)
            #
            # generate_learning_curve = config.get('create ensemble settings', 'generate_learning_curve')
            # if generate_learning_curve == 'yes':
            #     shuffle_splits = config.getint('create ensemble settings', 'LearningCurve_shuffle_k_splits')
            #     dataset_splits = config.getint('create ensemble settings', 'LearningCurve_shuffle_data_splits')
            #     print('Calculating learning curves...')
            #     LearningCurve(features, targetFrame, shuffle_splits, dataset_splits)
            # if generate_learning_curve != 'yes':
            #     shuffle_splits = 'NaN'
            #     dataset_splits = 'NaN'

            # generate_precision_recall_curve = config.get('create ensemble settings', 'generate_precision_recall_curve')
            # if generate_precision_recall_curve == 'yes':
            #     print('Calculating precision recall curve...')
            #     precisionRecallDf = pd.DataFrame()
            #     probabilities = clf.predict_proba(data_test)[:, 1]
            #     precision, recall, thresholds = precision_recall_curve(target_test, probabilities, pos_label=1)
            #     precisionRecallDf['precision'] = precision
            #     precisionRecallDf['recall'] = recall
            #     thresholds = list(thresholds)
            #     thresholds.insert(0, 0.00)
            #     precisionRecallDf['thresholds'] = thresholds
            #     PRCpath = os.path.join(tree_evaluations_out, str(classifierName) + '_precision_recall.csv')
            #     precisionRecallDf.to_csv(PRCpath)
            #
            # generate_example_decision_tree = config.get('create ensemble settings', 'generate_example_decision_tree')
            # if generate_example_decision_tree == 'yes':
            #     print('Generating example decision tree using graphviz...')
            #     estimator = clf.estimators_[3]
            #     generateExampleDecisionTree(estimator)

            generate_classification_report = config.get(
                'create ensemble settings', 'generate_classification_report')
            if generate_classification_report == 'yes':
                print('Generating yellowbrick classification report...')
                generateClassificationReport(clf, class_names, rounds)

            # generate_features_importance_log = config.get('create ensemble settings', 'generate_features_importance_log')
            # if generate_features_importance_log == 'yes':
            #     print('Generating feature importance log...')
            #     importances = list(clf.feature_importances_)
            #     log_df = generateFeatureImportanceLog(importances)
            #
            # generate_features_importance_bar_graph = config.get('create ensemble settings', 'generate_features_importance_bar_graph')
            # if generate_features_importance_bar_graph == 'yes':
            #     N_feature_importance_bars = config.getint('create ensemble settings', 'N_feature_importance_bars')
            #     print('Generating feature importance bar graph...')
            #     generateFeatureImportanceBarGraph(log_df, N_feature_importance_bars)
            # if generate_features_importance_bar_graph != 'yes':
            #     N_feature_importance_bars = 'NaN'

            # generate_example_decision_tree_fancy = config.get('create ensemble settings','generate_example_decision_tree_fancy')
            # if generate_example_decision_tree_fancy == 'yes':
            #     print('Generating fancy decision tree example...')
            #     dviz_classification_visualization(data_train, target_train, classifierName)

            # SAVE MODEL META DATA
            RF_meta_data = config.get('create ensemble settings',
                                      'RF_meta_data')
            if RF_meta_data == 'yes':
                metaDataList = [
                    classifierName, RF_criterion, RF_max_features,
                    RF_min_sample_leaf, RF_n_estimators,
                    compute_permutation_importance,
                    generate_classification_report,
                    generate_example_decision_tree,
                    generate_features_importance_bar_graph,
                    generate_features_importance_log,
                    generate_precision_recall_curve, RF_meta_data,
                    generate_learning_curve, dataset_splits, shuffle_splits,
                    N_feature_importance_bars, over_sample_ratio,
                    over_sample_setting, train_test_size, under_sample_ratio,
                    under_sample_ratio
                ]
                generateMetaData(metaDataList)

        # run gradient boost model
        if model_to_run == 'GBC':
            GBC_n_estimators = config.getint('create ensemble settings',
                                             'GBC_n_estimators')
            GBC_max_features = config.get('create ensemble settings',
                                          'GBC_max_features')
            GBC_max_depth = config.getint('create ensemble settings',
                                          'GBC_max_depth')
            GBC_learning_rate = config.getfloat('create ensemble settings',
                                                'GBC_learning_rate')
            GBC_min_sample_split = config.getint('create ensemble settings',
                                                 'GBC_min_sample_split')
            clf = GradientBoostingClassifier(
                max_depth=GBC_max_depth,
                n_estimators=GBC_n_estimators,
                learning_rate=GBC_learning_rate,
                max_features=GBC_max_features,
                min_samples_split=GBC_min_sample_split,
                verbose=1)
            clf.fit(data_train, target_train)
            clf_pred = clf.predict(data_test)
            print(
                str(classifierName) + str(" Accuracy train: ") +
                str(clf.score(data_train, target_train)))

            generate_example_decision_tree = config.get(
                'create ensemble settings', 'generate_example_decision_tree')
            if generate_example_decision_tree == 'yes':
                estimator = clf.estimators_[3, 0]
                generateExampleDecisionTree(estimator)

            generate_classification_report = config.get(
                'create ensemble settings', 'generate_classification_report')
            if generate_classification_report == 'yes':
                generateClassificationReport(clf, class_names)

            generate_features_importance_log = config.get(
                'create ensemble settings', 'generate_features_importance_log')
            if generate_features_importance_log == 'yes':
                importances = list(clf.feature_importances_)
                log_df = generateFeatureImportanceLog(importances)

            generate_features_importance_bar_graph = config.get(
                'create ensemble settings',
                'generate_features_importance_bar_graph')
            N_feature_importance_bars = config.getint(
                'create ensemble settings', 'N_feature_importance_bars')
            if generate_features_importance_bar_graph == 'yes':
                generateFeatureImportanceBarGraph(log_df,
                                                  N_feature_importance_bars)

        # run XGboost
        if model_to_run == 'XGB':
            XGB_n_estimators = config.getint('create ensemble settings',
                                             'XGB_n_estimators')
            XGB_max_depth = config.getint('create ensemble settings',
                                          'GBC_max_depth')
            XGB_learning_rate = config.getfloat('create ensemble settings',
                                                'XGB_learning_rate')
            clf = XGBClassifier(max_depth=XGB_max_depth,
                                min_child_weight=1,
                                learning_rate=XGB_learning_rate,
                                n_estimators=XGB_n_estimators,
                                silent=0,
                                objective='binary:logistic',
                                max_delta_step=0,
                                subsample=1,
                                colsample_bytree=1,
                                colsample_bylevel=1,
                                reg_alpha=0,
                                reg_lambda=0,
                                scale_pos_weight=1,
                                seed=1,
                                missing=None,
                                verbosity=3)
            clf.fit(data_train, target_train, verbose=True)

        # SAVE MODEL
        modelfn = str(classifierName) + '.sav'
        modelPath = os.path.join(modelDir_out, modelfn)
        pickle.dump(clf, open(modelPath, 'wb'))
        print('Classifier ' + str(classifierName) + ' saved @ ' +
              str('models/generated_models ') + 'folder')
        print(
            'Evaluation files are in models/generated_models/model_evaluations folders'
        )

示例#37

0

显示文件

文件： airbnb xgboost model.py 项目： paperparrot/Kaggle-scripts

def main(training_data, test_data):
    """Train an XGBoost model and return the top-5 predicted countries per test row.

    Both input frames are tagged in place with a ``'source'`` marker column,
    concatenated, passed through ``data_cleaner`` together so they are cleaned
    consistently, and then split apart again.

    Args:
        training_data: DataFrame holding the training rows; must contain a
            ``'country_destination'`` column. A ``'source'`` column is added
            to it as a side effect.
        test_data: DataFrame holding the rows to predict. A ``'source'``
            column is added to it as a side effect.

    Returns:
        DataFrame with columns ``['id', 'country']``: for every index value of
        the cleaned test frame, up to five rows listing the classes with the
        highest predicted probability, most likely first.
    """
    # Merging data to ensure consistent cleaning. Putting marker variable to separate later.
    training_data['source'] = 'training'
    test_data['source'] = 'test'
    merged_data = pd.concat([training_data, test_data])

    # Cleaning data
    cleaned_data = data_cleaner(merged_data)

    # Separating data, removing marker
    pred_df = cleaned_data[cleaned_data['source'] == 'training'].copy()
    test_pred = cleaned_data[cleaned_data['source'] == 'test'].copy()

    pred_df.drop('source', axis=1, inplace=True)
    test_pred.drop('source', axis=1, inplace=True)

    # Transforming target into ints, saving the key for later transformation
    # NOTE(review): the target is taken from the *uncleaned* training frame;
    # this assumes data_cleaner keeps every training row in order - confirm.
    labels = LabelEncoder().fit(training_data['country_destination'])
    target_df = pd.Series(labels.transform(
        training_data['country_destination']),
                          index=training_data.index)

    # Training model
    xgb_model = XGBClassifier(max_depth=6,
                              learning_rate=0.3,
                              n_estimators=25,
                              objective='multi:softprob',
                              subsample=0.5,
                              colsample_bytree=0.5,
                              seed=0)
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is available
    # in every pandas version and returns the same ndarray.
    xgb_model.fit(pred_df.values, target_df.tolist())

    # Running the model
    preds = xgb_model.predict_proba(test_pred.values)

    # Selecting the top 5 most likely for each respondent and stacking.
    model_probs = pd.DataFrame(preds,
                               index=test_pred.index,
                               columns=labels.classes_)

    # Collect the per-row frames and concatenate once at the end: growing a
    # frame with pd.concat inside the loop copies everything on each pass.
    top_frames = []
    for row_id, row_probs in model_probs.iterrows():
        temp_sort = pd.DataFrame(
            row_probs.sort_values(ascending=False)[:5].index)

        temp_sort['id'] = row_id
        temp_sort.columns = ['country', 'id']

        top_frames.append(temp_sort)

    if not top_frames:
        # No test rows: return an empty frame with the expected columns
        # instead of failing on the column selection below.
        return pd.DataFrame(columns=['id', 'country'])

    stacked_probs = pd.concat(top_frames)

    # Cleaning output and returning it
    output = stacked_probs[['id', 'country']]
    return output

示例#38

0

显示文件

    'Property_Area_Urban', 'Loan_tot_income_ratio', 'coapplicant_True'
]
X = dfr_train[col]
y = dfr_train['Loan_Status']
'''
# RandomForest 
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators  = 300, max_features=None,criterion = 'entropy',random_state = 0)
RF.fit(X, y)
'''

#Xgboost
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

classifier = XGBClassifier(learning_rate=0.1, n_estimators=10)
classifier.fit(X, y)

#validation
from sklearn.model_selection import cross_val_score

accuracies = cross_val_score(estimator=classifier, X=X, y=y, cv=10)
print(accuracies.mean())
print(accuracies.std())
'''
#ensemble
from sklearn import model_selection

seed = 7
kfold = model_selection.KFold(n_splits=10, random_state=seed)

示例#39

0

显示文件

文件： test1.py 项目： mircean/ML

def do_cell(task):
    """Train a classifier on one cell's training rows and predict places for its test rows.

    ``task`` is indexed positionally: ``task[0]`` is the training frame,
    ``task[1]`` the test frame, ``task[2]``/``task[3]`` the cell's
    ``x_start``/``y_start`` (only used in the progress prints).

    The function reads many module-level settings that are not defined here
    (``n_places``, ``n_places_th``, ``n_places_percentage``, ``xgb``,
    ``xgb_calculate_n_estimators``, ``train_test``, ``n_folds``, ``n_topx``,
    the XGBoost hyper-parameters, ...).

    Returns:
        ``[score, row_ids, labels_predict]`` where ``row_ids`` is the test
        frame's index and ``labels_predict`` holds, per test row, the
        ``n_topx`` place ids with the highest predicted probability.
    """
    df_train, df_test, x_start, y_start = task[0], task[1], task[2], task[3]
    #print('do_cell', df_train.shape, df_test.shape, x_start, y_start)

    #train
    n_places_th_local = n_places_th
    n_places_local = n_places

    if n_places != 0:
        # Keep only rows belonging to the n_places most frequent place_ids.
        tmp = df_train.shape[0]
        value_counts = df_train.place_id.value_counts()[0:n_places]
        df_train = pd.merge(df_train, pd.DataFrame(value_counts), left_on='place_id', right_index=True)[df_train.columns]
        # Count of the least frequent place that was kept.
        n_places_th_local = value_counts.values[n_places - 1]
        # Fraction of the original training rows that survived the filter.
        percentage = df_train.shape[0]/tmp

    elif n_places_th != 0:
        # Keep only rows whose place_id occurs at least n_places_th times.
        value_counts = df_train.place_id.value_counts()
        n_places_local = value_counts[value_counts >= n_places_th_local].count()
        mask = value_counts[df_train.place_id.values] >= n_places_th_local
        percentage = mask.value_counts()[True]/df_train.shape[0]
        df_train = df_train.loc[mask.values]

    else:
        # No explicit limit: search for a count threshold, starting at 2,
        # driven by the fraction of rows kept versus n_places_percentage.
        n_places_th_local = 2

        value_counts = df_train.place_id.value_counts()
        n_places_local = value_counts[value_counts >= n_places_th_local].count()
        mask = value_counts[df_train.place_id.values] >= n_places_th_local
        percentage = mask.value_counts()[True]/df_train.shape[0]

        # Raise the threshold while more than n_places_percentage of rows remain.
        while percentage > n_places_percentage:
            n_places_th_local += 1
            n_places_local = value_counts[value_counts >= n_places_th_local].count()
            mask = value_counts[df_train.place_id.values] >= n_places_th_local
            percentage = mask.value_counts()[True]/df_train.shape[0]

        # Step back one threshold and recompute the mask for that value.
        n_places_th_local -= 1
        n_places_local = value_counts[value_counts >= n_places_th_local].count()
        mask = value_counts[df_train.place_id.values] >= n_places_th_local
        percentage = mask.value_counts()[True]/df_train.shape[0]

        df_train = df_train.loc[mask.values]


    #print(x_start, y_start, n_places_local, n_places_th_local, percentage)
        
    #test
    row_ids = df_test.index
    if 'place_id' in df_test.columns:
        df_test = df_test.drop(['place_id'], axis=1)

    # Encode place ids as consecutive integer class labels.
    le = LabelEncoder()
    y = le.fit_transform(df_train.place_id.values)
    
    X = df_train.drop(['place_id'], axis=1).values
    X_predict = df_test.values

    score = 0
    n_estimators = 0
    if xgb == 1:    
        if xgb_calculate_n_estimators == True:
            # Large n_estimators cap; the actual number is chosen by early stopping below.
            clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=5000, objective='multi:softprob', subsample=ss, colsample_bytree=cs, gamma=gamma, min_child_weight=min_child_weight, reg_lambda=reg_lambda, reg_alpha=reg_alpha)

            if train_test == 1:
                # Single 80/20 hold-out split used as the early-stopping eval set.
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
   
                clf.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric=calculate_score, early_stopping_rounds=early_stopping_rounds, verbose=10 if one_cell == 1 else False)
                score = round(1 - clf.booster().best_score, 6)
                n_estimators = clf.booster().best_ntree_limit
            else:
                # NOTE(review): `abc` is never assigned in this function, so this
                # line raises UnboundLocalError when reached. It looks like a
                # deliberate tripwire disabling the CV path - confirm.
                abc += 1
                xgb_options = clf.get_xgb_params()
                xgb_options['num_class'] = n_places + 1
                train_dmatrix = DMatrix(X, label=y)

                #some of the classes have less than n_folds, cannot use stratified KFold
                #folds = StratifiedKFold(y, n_folds=n_folds, shuffle=True)
                folds = KFold(len(y), n_folds=n_folds, shuffle=True)
                cv_results = cv(xgb_options, train_dmatrix, clf.n_estimators, early_stopping_rounds=early_stopping_rounds, verbose_eval=10 if one_cell == 1 else False, show_stdv=False, folds=folds, feval=calculate_score)

                # One result row per boosting round that was kept.
                n_estimators = cv_results.shape[0]
                score = round(1 - cv_results.values[-1][0], 6)
                # `std` is computed but not used anywhere else in this function.
                std = round(cv_results.values[-1][1], 6)
        else:
            n_estimators = n_estimators_fixed

        # Fresh classifier with the chosen number of trees; fitted on all data below.
        clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators, objective='multi:softprob', subsample=ss, colsample_bytree=cs, gamma=gamma, min_child_weight=min_child_weight, reg_lambda=reg_lambda, reg_alpha=reg_alpha)
    else:
        clf = RandomForestClassifier(n_estimators = 300, n_jobs = -1)
        if rf_calculate_score == True:
            if train_test == 1:
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
                # NOTE(review): y was already produced by le.fit_transform above,
                # so these calls encode already-encoded labels - verify this is intended.
                y_train2 = le.transform(y_train)
                y_test2 = le.transform(y_test)
    
                clf.fit(X_train, y_train2)
                y_predict = clf.predict_proba(X_test)

                # Score each held-out row individually, then average.
                scores_local = []
                for i in range(X_test.shape[0]):
                    score = calculate_score_per_row(y_predict[i], y_test2[i])
                    scores_local.append(score)

                score = np.array(scores_local).mean()
            else:
                #some of the classes have less than n_folds, cannot use stratified KFold
                #folds = StratifiedKFold(y, n_folds=n_folds, shuffle=True)
                folds = KFold(len(y), n_folds=n_folds, shuffle=True)
                scores_cv = []
                for train, test in folds:
                    X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]

                    # NOTE(review): same double-encoding concern as in the hold-out branch.
                    y_train2 = le.transform(y_train)
                    y_test2 = le.transform(y_test)
    
                    clf.fit(X_train, y_train2)
                    y_predict = clf.predict_proba(X_test)

                    scores_local = []
                    for i in range(X_test.shape[0]):
                        score = calculate_score_per_row(y_predict[i], y_test2[i])
                        scores_local.append(score)

                    # Mean per-row score of this fold.
                    score = np.array(scores_local).mean()
                    print('  ', x_start, y_start, score)
                    scores_cv.append(score)

                # Final score is the mean over folds.
                score = np.array(scores_cv).mean()
    
    #if few_cells == 1 or grid_search == 1:
    #    return [score, None, None]

    # Fit on the full (filtered) training set and predict the test rows.
    clf.fit(X, y)
    y_predict = clf.predict_proba(X_predict)
    ##1
    # Sort class indices by descending probability, keep the first n_topx
    # per row, and map them back to the original place ids.
    labels_predict = le.inverse_transform(np.argsort(y_predict, axis=1)[:,::-1][:,:n_topx])    

    print(x_start, y_start, score, n_estimators, n_places_local, n_places_th_local, percentage)

    return [score, row_ids, labels_predict]

示例#40

0

显示文件

# Seed both NumPy's and the standard library's random generators.
seed = 100
np.random.seed(seed)
random.seed(seed)

X, y = utils.importar_datos()

# ### Final metrics

# Two-step pipeline: the project's PreprocessingOHE step followed by an
# XGBoost binary classifier.
pipeline = Pipeline([("preprocessor", pp.PreprocessingOHE()),
                     ("model",
                      XGBClassifier(use_label_encoder=False,
                                    scale_pos_weight=1,
                                    subsample=0.8,
                                    colsample_bytree=0.8,
                                    objective="binary:logistic",
                                    n_estimators=1000,
                                    learning_rate=0.01,
                                    n_jobs=-1,
                                    eval_metric="logloss",
                                    min_child_weight=6,
                                    max_depth=6,
                                    reg_alpha=0.05))])

# The helper's return value replaces `pipeline` and is used for the holdout prediction below.
pipeline = utils.entrenar_y_realizar_prediccion_final_con_metricas(
    X, y, pipeline)

# The target metric, AUC-ROC, gives a result similar to the one obtained when using LE. However, the False Negative rate increased relative to the other model, so its Recall (and therefore its F1 score) dropped (by 0.09). In turn, the True Negative rate improved slightly.

# ### Holdout prediction

utils.predecir_holdout_y_generar_csv(pipeline,
                                     'Predicciones/4-XGBoost-OHE.csv')

示例#41

0

显示文件

文件： permutation_test.py 项目： liuhongwei2018/GDM

from sklearn.model_selection import train_test_split, StratifiedKFold, permutation_test_score
from xgboost.sklearn import XGBClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

# Load the full dataset; 'OGTTgroup1' is the target column.
data_all = pd.read_csv("G:/GDM/DATA/GDM.csv")

# Feature set for XGBoost: every column except the target.
X ,y= data_all.drop(['OGTTgroup1'],axis=1),data_all.OGTTgroup1
# Reduced feature set for the logistic regression: additionally drops six columns.
X_log ,y_log= data_all.drop(['OGTTgroup1','weight_gain','income','education','DBP',
                             'parity','multi_pregnancy'],axis=1),data_all.OGTTgroup1

# Same random_state, test_size and stratification for both feature sets.
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state = 1,test_size=0.3,stratify=y)
X_train_log,X_test_log,y_train_log,y_test_log = train_test_split(X_log,y_log,random_state = 1,test_size=0.3,stratify=y_log)

clf = XGBClassifier(random_state=5361,scale_pos_weight=12.026280323450134,n_estimators=200,max_depth=2,
                    min_child_weight=29,colsample_bytree=0.7,subsample=1,gamma=0,
                    reg_alpha=5,reg_lambda=5,learning_rate=0.1, n_jobs=-1).fit(X_train, y_train)
# NOTE: this rebinds the name `cv` to a 5-fold stratified splitter shared by both tests below.
cv = StratifiedKFold(5)

# Permutation test of the XGBoost model's ROC AUC on the training split (1000 permutations).
score, permutation_scores, pvalue = permutation_test_score(
    clf, X_train, y_train, scoring="roc_auc", cv=cv, n_permutations=1000, n_jobs=-1)

#print(pvalue)

clf_log = LogisticRegression(random_state=0,fit_intercept=True, C=1e9,solver = 'newton-cg').fit(X_train_log, y_train_log)

# Same permutation test for the logistic regression on its reduced feature set.
score_log, permutation_scores_log, pvalue_log = permutation_test_score(
    clf_log, X_train_log, y_train_log, scoring="roc_auc", cv=cv, n_permutations=1000, n_jobs=-1)
#print(pvalue_log)

示例#42

0

显示文件

文件： ModelTraining.py 项目： the-spectator/QuickQuote

def model_making_main(file):
    """Build the text feature column and train every candidate classifier.

    Reads the preprocessed CSV named by ``config.preprocessed_csv``, joins the
    selected columns into one comma-separated text field, cleans and tokenizes
    it, writes the result to ``config.nlp_processed_csv`` and reads it back.
    The resulting ``'Lemmitize'`` column is the model input; the target is
    the ``'Offer_noise_free'`` column of the CSV at ``file``.

    Args:
        file: Path of the CSV that contains the ``'Offer_noise_free'`` target
            column.
            NOTE(review): rows are assumed to line up one-to-one with the
            preprocessed CSV - confirm against the caller.

    Returns:
        None. Each model is handed to ``model_making`` together with the
        train/test split.
    """
    logger.info(">> Start - Model making")

    df = pd.read_csv(config.preprocessed_csv, encoding='UTF-8')
    # If getting an error remove .astype(str)
    select_columns = [
        'recepientemail', 'Gender', 'Age(years)', 'Product Type', 'Weight',
        'Height', 'Habit', 'Face Amount', 'Medication', 'Property',
        'Medical Data', 'Family'
    ]
    # One comma-joined string per row, skipping missing values.
    df['ColumnA'] = df[select_columns].apply(
        lambda x: ','.join(x.dropna().astype(str)), axis=1)

    # Use the module logger like the rest of this function, not the root logger.
    logger.info("Remove puncutation, tokenize")
    df['Lemmitize'] = df['ColumnA'].apply(rem_punt).apply(tokenize)

    df['Lemmitize'] = df['Lemmitize'].apply(conversion)
    df.to_csv(config.nlp_processed_csv, index=False, encoding="utf-8")

    # Round-trip through CSV so the column is read back as plain text.
    df = pd.read_csv(config.nlp_processed_csv)

    X = df['Lemmitize']
    of = pd.read_csv(file, encoding='UTF-8')
    y = of['Offer_noise_free']

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=4)
    # Vectorizer passed to every model: unigrams and bigrams.
    vect = TfidfVectorizer(max_df=0.5,
                           max_features=10000,
                           min_df=1,
                           use_idf=True,
                           ngram_range=(1, 2),
                           lowercase=True)
    # Unigram-only vectorizer; its output is only consumed by the
    # commented-out visualize() call below.
    represent = TfidfVectorizer(max_df=0.5,
                                max_features=10000,
                                min_df=1,
                                use_idf=True,
                                ngram_range=(1, 1),
                                lowercase=True)
    matrix = represent.fit_transform(X.values)
    # visualize(represent,matrix,X,y)
    # print(matrix)

    # for i, feature in enumerate(vect.get_feature_names()):
    #    print(i, feature)

    #va = raw_input()

    model1 = XGBClassifier(nthread=4, n_estimators=1000)
    # "auto" was removed from RandomForestClassifier in scikit-learn 1.3;
    # "sqrt" is the value it meant for classifiers.
    model3 = RandomForestClassifier(n_estimators=60,
                                    n_jobs=3,
                                    max_features="sqrt",
                                    min_samples_leaf=50)
    model4 = SVC(kernel='rbf', C=1, gamma=10)
    model5 = LogisticRegression()
    model7 = SGDClassifier(alpha=.0001)
    model_making("XGBOOST", vect, model1, X_train, y_train, X_test, y_test)
    model_making("Random Forest", vect, model3, X_train, y_train, X_test,
                 y_test)
    model_making("SVM", vect, model4, X_train, y_train, X_test, y_test)
    model_making("Logistic Regression", vect, model5, X_train, y_train, X_test,
                 y_test)
    model_making("SGDClassifier", vect, model7, X_train, y_train, X_test,
                 y_test)
    # model_with_SVD(vect,X_train,X_test,y_train,y_test)
    logger.info("<< End - Model making")

示例#43

0

显示文件

文件： Train_Model.py 项目： mimcomp/proactive-examples

    model = GaussianNB(**vars)  
  elif alg.name == 'LogisticRegression':
    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression(**vars)
  elif alg.name == 'AdaBoost' and alg.type == 'classification':
    from sklearn.ensemble import AdaBoostClassifier
    model = AdaBoostClassifier(**vars)
  elif alg.name == 'GradientBoosting' and alg.type == 'classification':
    from sklearn.ensemble import GradientBoostingClassifier
    model = GradientBoostingClassifier(**vars)
  elif alg.name == 'RandomForest' and alg.type == 'classification':
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(**vars)
  elif alg.name == 'XGBoost' and alg.type == 'classification':
    from xgboost.sklearn import XGBClassifier
    model = XGBClassifier(**vars)   
  elif alg.name == 'CatBoost' and alg.type == 'classification':
    from catboost import CatBoostClassifier
    model = CatBoostClassifier(**vars)       
 
  #-------------------------------------------------------------
  # Regression algorithms   
  elif alg.name == 'TPOT_Regressor':
    from tpot import TPOTRegressor
    model = TPOTRegressor(
        generations=alg.generations,
        cv=alg.cv,
        scoring=alg.scoring,
        verbosity=alg.verbosity
    )
  elif alg.name == 'AutoSklearn_Regressor':

示例#44

0

显示文件

        )),
        ('scale', MaxAbsScaler()),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])


TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

# Import the hashing vectorizer

# Second pipeline: numeric features are imputed, text features are hashed
# (unigrams + bigrams) and reduced with chi-squared selection; the union is
# expanded with pairwise interactions, scaled, and fed to XGBoost.
# NOTE(review): the original statement had unbalanced closing parentheses on
# the 'clf' line; they are balanced here around a bare XGBClassifier().
# Confirm whether a OneVsRestClassifier wrapper was intended instead.
p2 = Pipeline([
        ('union', FeatureUnion(
            transformer_list = [
                ('numeric_features', Pipeline([
                    ('selector', get_numeric_data),
                    ('imputer', Imputer())
                ])),
                ('text_features', Pipeline([
                    ('selector', get_text_data),
                    ('vectorizer', HashingVectorizer(token_pattern=TOKENS_ALPHANUMERIC,
                                                     non_negative=True, norm=None, binary=False,
                                                     ngram_range=(1,2))),
                    ('dim_red', SelectKBest(chi2, chi_k))
                ]))
             ]
        )),
        ('int', SparseInteractions(degree=2)),
        ('scale', MaxAbsScaler()),
        ('clf', XGBClassifier())
    ])

示例#45

0

显示文件

文件： ensemble_methods.py 项目： parthrparekh93/AML

def xgboost_algorithm(XTrain,YTrain,XTest):
    """Fit a multi-class XGBoost model on the training data and return the
    predicted class probabilities for ``XTest``."""
    hyper_params = dict(
        max_depth=6,
        learning_rate=0.3,
        n_estimators=25,
        objective='multi:softprob',
        subsample=0.5,
        colsample_bytree=0.5,
        seed=0,
    )
    model = XGBClassifier(**hyper_params)
    model.fit(XTrain, YTrain)
    return model.predict_proba(XTest)

示例#46

0

显示文件

    feat_imp = pd.Series(
        alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Impxortances')
    plt.ylabel('Feature Importance Score')

    #Choose all predictors except target & IDcols


#%% Step 1: Fix learning rate and number of estimators for tuning tree-based parameters
predictors = [x for x in train.columns if x not in [target, IDcol]]
xgb1 = XGBClassifier(learning_rate=0.1,
                     n_estimators=1000,
                     max_depth=5,
                     min_child_weight=1,
                     gamma=0,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     objective='binary:logistic',
                     nthread=4,
                     scale_pos_weight=1,
                     seed=27)
modelfit(xgb1, train, predictors)

param_test1 = {
    'max_depth': list(range(3, 13, 2)),
    'min_child_weight': list(range(1, 7, 2))
}
gsearch1 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1,
                                                n_estimators=140,
                                                max_depth=5,
                                                min_child_weight=1,

示例#47

0

显示文件

文件： main.py 项目： TimeMachine/Airbnb-New-User-Bookings

df_all = pd.merge(df_all, df_sess_features, how='left', left_on='id', right_on='id')
df_all = df_all.drop(['id'], axis=1)
#release memory
del df_sessions
del device_freq
del action_freq

#Splitting train and test
vals = df_all.values
X = vals[:piv_train]
le = LabelEncoder()
y = le.fit_transform(labels)   
X_test = vals[piv_train:]

#Classifier
xgb = XGBClassifier(max_depth=6, learning_rate=0.25, n_estimators=43,
                    objective='multi:softprob', subsample=0.6, colsample_bytree=0.6, seed=0)                  

print('scores:', NDCG.cross_validation_score(X, labels,xgb,5))
'''
xgb.fit(X, y)
y_pred = xgb.predict_proba(X_test)  

#Taking the 5 classes with highest probabilities
ids = []  #list of ids
cts = []  #list of countries
for i in range(len(id_test)):
    idx = id_test[i]
    ids += [idx] * 5
    cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()

#Generate submission

示例#48

0

显示文件

# df_All_stat_9 = pd.read_csv("mchnt_ana.csv", sep=',')
# df_All = pd.merge(left=df_All, right=df_All_stat_9, how='left', left_on='certid', right_on='certid')
#########################

label_df = pd.read_csv("train_label_encrypt.csv", sep=",", low_memory=False, error_bad_lines=False)
df_All = pd.merge(left=df_All, right=label_df, how='left', left_on='certid', right_on='certid')

df_All = df_All.fillna(-1)

df_All_train = df_All[(df_All["label"] == 0) | (df_All["label"] == 1)]
df_All_test = df_All[(df_All["label"] != 0) & (df_All["label"] != 1)]


# The unlabeled rows and their ids do not change between runs, so build the
# prediction inputs once instead of on every iteration.
X_test = df_All_test.drop(["certid", "label"], axis=1, inplace=False)
cerid_arr = np.array(df_All_test["certid"]).T

# Train twice on differently shuffled training data and write one
# "certid,prediction" CSV per run.
for i in range(2):
    savename = "xgboost_results_1120_" + str(i) + ".csv"
    # Function-call form works on both Python 2 and Python 3.
    print(savename)
    df_All_train = shuffle(df_All_train)
    X_train = df_All_train.drop(["certid", "label"], axis=1, inplace=False)
    y_train = df_All_train["label"]
    clf = XGBClassifier(learning_rate =0.1,n_estimators=1000,max_depth=5,gamma=0.01,subsample=0.8,colsample_bytree=0.8,objective= 'binary:logistic', reg_alpha=0.1, reg_lambda=0.1,seed=27)
    clf = clf.fit(X_train, y_train)
    pred = clf.predict(X_test).T
    # Stack ids and predictions as two rows, then transpose to two columns.
    result = np.vstack((cerid_arr,pred))
    np.savetxt(savename,result.T,delimiter=',', fmt = "%s")

示例#49

0

显示文件

文件： train.py 项目： guohuiGH/kaggle

def xgbost(x,y,targetx):
    """Fit an XGBoost classifier on ``(x, y)`` and return, for each row of
    ``targetx``, the predicted probability in column 1 of ``predict_proba``."""
    model = XGBClassifier(
        n_estimators=1000,
        max_depth=6,
        learning_rate=0.0075,
        subsample=0.7,
        colsample_bytree=0.7,
        seed=4,
    )
    model.fit(x, y)
    class_probabilities = model.predict_proba(targetx)
    return class_probabilities[:, 1]

示例#50

0

显示文件

文件： XGboost+ClusterCentroids.py 项目： non27/The-final-assignment

# Hold out 30% of the data for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)

#ClusterCentroids
# Resample the training split only; the test split is left untouched.
# NOTE(review): newer imbalanced-learn releases name this method
# fit_resample - confirm which version this script targets.
cc = ClusterCentroids(random_state=0)
os_X, os_y = cc.fit_sample(X_train, y_train)

#XGboost
clf_XG = XGBClassifier(learning_rate=0.3,
                       min_child_weight=1,
                       max_depth=6,
                       gamma=0,
                       subsample=1,
                       max_delta_step=0,
                       colsample_bytree=1,
                       reg_lambda=1,
                       n_estimators=100,
                       seed=1000,
                       scale_pos_weight=1000)
# Track AUC on both the resampled training data and the held-out test data.
clf_XG.fit(os_X,
           os_y,
           eval_set=[(os_X, os_y), (X_test, y_test)],
           eval_metric='auc',
           verbose=False)
evals_result = clf_XG.evals_result()
y_true, y_pred = y_test, clf_XG.predict(X_test)

#F1_score, precision, recall, specifity, G score
# Function-call form of print works on both Python 2 and Python 3.
print("F1_score : %.4g" % metrics.f1_score(y_true, y_pred))

示例#51

0

显示文件

文件： script.py 项目： vcte/CS412_Final_Project

def build_model(X, y):
    """Announce the fit on stdout, train a multi-class XGBoost classifier on
    ``(X, y)`` and return the fitted model."""
    print("Fitting classifier")
    settings = {
        'max_depth': 4,
        'learning_rate': 0.25,
        'n_estimators': 25,
        'objective': 'multi:softprob',
        'subsample': 0.6,
        'colsample_bytree': 0.6,
    }
    classifier = XGBClassifier(**settings)
    classifier.fit(X, y)
    return classifier

示例#52

0

显示文件

    #建模
    alg.fit(dtrain[predictors], dtrain['AKI'], eval_metric='auc')

    #对训练集预测
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]

    #输出模型的一些结果
    print("Stopped at iteration: {0}".format(cvresult.shape[0]))
    print("\n关于现在这个模型")
    print("准确率 : %.4g" %
          metrics.accuracy_score(dtrain['AKI'].values, dtrain_predictions))
    print("AUC 得分 (训练集): %f" %
          metrics.roc_auc_score(dtrain['AKI'], dtrain_predprob))


# Find the best number of decision trees
# NOTE(review): `re` is used here as an object with a `.columns` attribute,
# so it shadows the standard-library `re` module if that is imported in this file.
# Every column except the target is used as a predictor.
predictors = [x for x in re.columns if x not in [target]]
xgb1 = XGBClassifier(learning_rate=0.1,
                     n_estimators=1000,
                     max_depth=5,
                     min_child_weight=1,
                     gamma=0,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     objective='binary:logistic',
                     nthread=4,
                     scale_pos_weight=1,
                     seed=27)
modelfit(xgb1, re, data, predictors)

示例#53

0

显示文件

文件： stacking.py 项目： river1020/rong360

def run():
    """Train a two-level blended ensemble and write its outputs to CSV.

    Level 1: every base classifier is fitted once per stratified fold.  Its
    positive-class probabilities on the held-out fold fill one column of
    ``dataset_blend_train``; its probabilities on the submission rows are
    averaged over the folds into the matching column of
    ``dataset_blend_test``.

    Level 2: a ``LogisticRegression`` is fitted on ``dataset_blend_train``
    and predicts on ``dataset_blend_test``.  The result is min-max
    stretched to [0, 1] and written, together with the intermediate
    tables, under a hard-coded output directory.

    Takes no arguments and returns None.
    """
    np.random.seed(0)  # seed to shuffle the train set
    n_folds = 4
    shuffle = False

    X, y = get_train_data()
    X_submission = mf.get_test_data()

    if shuffle:
        idx = np.random.permutation(y.size)
        # X and y are pandas objects (see the row selection and to_csv calls
        # below), so reorder rows by position; plain X[idx] would select
        # columns on a DataFrame.
        X = X.iloc[idx]
        y = y.iloc[idx]

    skf = list(StratifiedKFold(y, n_folds))

    # The parameters here can be changed to generate more models.
    clfs = [RandomForestClassifier(n_estimators=500,
                                   max_features=0.8,
                                   bootstrap=True,
                                   min_samples_leaf=50,
                                   oob_score=True,
                                   criterion='gini',
                                   n_jobs=-1),
            RandomForestClassifier(n_estimators=500,
                                   max_features=0.5,
                                   bootstrap=True,
                                   min_samples_leaf=50,
                                   oob_score=True,
                                   criterion='entropy',
                                   n_jobs=-1),
            ExtraTreesClassifier(n_estimators=500,
                                 min_samples_leaf=50,
                                 criterion='gini',
                                 n_jobs=-1),
            ExtraTreesClassifier(n_estimators=500,
                                 min_samples_leaf=50,
                                 criterion='entropy',
                                 n_jobs=-1),
            GradientBoostingClassifier(learning_rate=0.05,
                                       n_estimators=500,
                                       max_depth=3,
                                       max_features=0.65,
                                       subsample=0.7,
                                       random_state=10,
                                       min_samples_split=350,
                                       min_samples_leaf=70),
            GradientBoostingClassifier(learning_rate=0.01,
                                       n_estimators=1000,
                                       max_depth=4,
                                       max_features=0.7,
                                       subsample=0.8,
                                       random_state=10,
                                       min_samples_split=350,
                                       min_samples_leaf=70),
            XGBClassifier(learning_rate=0.05,
                          n_estimators=350,
                          gamma=0,
                          min_child_weight=5,
                          max_depth=5,
                          subsample=0.8,
                          scale_pos_weight=1,
                          colsample_bytree=0.8,
                          objective='binary:logistic',
                          nthread=8,
                          eval_metric='auc',
                          seed=10),
            XGBClassifier(learning_rate=0.02,
                          n_estimators=500,
                          gamma=0,
                          min_child_weight=5,
                          max_depth=5,
                          subsample=0.7,
                          scale_pos_weight=1,
                          colsample_bytree=0.7,
                          objective='binary:logistic',
                          nthread=8,
                          eval_metric='auc',
                          seed=10)
            ]

    print("Creating train and test sets for blending.")

    # One column per base classifier.
    dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
    dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))

    for j, clf in enumerate(clfs):
        print(j, clf)
        # One column per fold; averaged after the fold loop.
        dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
        for i, (train, test) in enumerate(skf):
            print("Fold", i)
            # The fold indices are row positions (they also index the NumPy
            # array dataset_blend_train below), so select positionally.
            X_train = X.iloc[train, :]
            y_train = y.iloc[train]
            X_test = X.iloc[test, :]
            y_test = y.iloc[test]
            clf.fit(X_train, y_train)
            y_submission = clf.predict_proba(X_test)[:, 1]
            dataset_blend_train[test, j] = y_submission
            # Scored on the held-out fold, despite the "train" label.
            print("train ks_score: ", ks.ks_score(y_submission, y_test))
            dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:, 1]
        dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)

    # Blank separator line; a bare `print` is a no-op on Python 3.
    print("")
    print("Blending.")
    clf = LogisticRegression()

    clf.fit(dataset_blend_train, y)
    y_submission = clf.predict_proba(dataset_blend_test)[:, 1]

    out_dir = 'H:\\ML\\DC\\user_loan_risk_predict\\predict/'
    # dataset_blend_train and y_submission are NumPy arrays, which have no
    # to_csv method; wrap them in pandas objects before writing.
    pd.DataFrame(dataset_blend_train).to_csv(
        out_dir + 'dataset_blend_train.csv', index=False)
    y.to_csv(out_dir + 'y.csv', index=False)
    pd.Series(y_submission).to_csv(out_dir + 'y_submission.csv', index=False)
    # NOTE(review): X_user_id is not defined inside this function;
    # presumably a module-level variable -- confirm.
    X_user_id.to_csv(out_dir + 'X_user_id.csv', index=False)

    print("Linear stretch of predictions to [0,1]")
    y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())

    test_pre = pd.DataFrame({u'userid': X_user_id, u'probability': y_submission})
    test_pre = test_pre[['userid', 'probability']]
    print(test_pre.head())
    test_pre.to_csv(out_dir + 'pre_blending.csv', index=False)

示例#54

0

显示文件

# Script-level constants.
nclass = 3
nf = 10

# Wall-clock time taken before the data is loaded.
stime = time.time()

# Load the train / test / target tables; the first CSV column is the index.
trainc = pd.read_csv('./data/train_lon_lat_predicted.csv', index_col=0)
testc = pd.read_csv('./data/test_lon_lat_predicted.csv', index_col=0)
target = pd.read_csv('./data/target.csv', index_col=0)

outcome = target['status_group']

# First multi-class XGBoost configuration.
cclf1 = XGBClassifier(
    objective='multi:softprob',
    nthread=8,
    max_depth=14,
    n_estimators=250,
    learning_rate=0.0588,
    gamma=0.6890,
    min_child_weight=7.6550,
    subsample=0.8,
    colsample_bytree=0.8,
)

# Second configuration: one level deeper, more estimators, lower learning rate.
cclf2 = XGBClassifier(
    objective='multi:softprob',
    nthread=8,
    max_depth=15,
    n_estimators=385,
    learning_rate=0.03599,
    gamma=0.6836,
    min_child_weight=4.3704,
    subsample=0.8,
    colsample_bytree=0.8,
)

示例#55

0

显示文件

文件： Airbnb_analysis.py 项目： richardjcool/KaggleComps

    "first_affiliate_tracked",
    "signup_app",
    "first_device_type",
    "first_browser",
]
# Build the feature table, then pull the target column out of it and
# encode its labels as integers.
X = split_categorical_variables(train, categorical_variables)
label_table = LabelEncoder()
y = label_table.fit_transform(X.pop("country_destination").values)


# # Let's try a gradient boost classifier

# In[56]:

xgb_model = XGBClassifier(n_estimators=10, max_depth=3, learning_rate=0.1)
xgb_model.fit(X, y)


# ## How did we do?
#
# * To start, let's look at how well we did just predicting the final outcome


pred = xgb_model.predict_proba(X)

# For every row, class indices ordered from most to least probable
best_country = []  # Not used for now
bestId = [np.argsort(row_probs)[::-1] for row_probs in pred]

示例#56

0

显示文件

文件： 74_XGBOOST.py 项目： sabah83/Machine-Learning-Toturials

# Standardise the features
X = StandardScaler().fit_transform(X)

# Hold out 20% of the rows as the test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2017)

num_rounds = 100
kfold = cross_validation.StratifiedKFold(
    y=y_train, n_folds=5, random_state=2017)

clf_XGB = XGBClassifier(
    seed=2017,
    objective='binary:logistic',
    n_estimators=num_rounds,
)

# Stop boosting early once the score on eval_set has not improved for
# 20 consecutive rounds
clf_XGB.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=20,
    verbose=False,
)

results = cross_validation.cross_val_score(
    clf_XGB, X_train, y_train, cv=kfold)

print("\nxgBoost - CV Train : %.2f" % results.mean())
print("xgBoost - Train : %.2f" % metrics.accuracy_score(
    clf_XGB.predict(X_train), y_train))
print("xgBoost - Test : %.2f" %

示例#57

0

显示文件

文件： Airbnb_zihe_final.py 项目： joey-wang/Kaggle-airbnb

# In[ ]:

# Split the combined value matrix back into train rows and test rows
vals = df_all.values
X = vals[:piv_train]
X_test = vals[piv_train:]
le = LabelEncoder()
y = le.fit_transform(labels)


# In[ ]:


# Classifier
xgb = XGBClassifier(
    seed=0,
    objective='multi:softprob',
    max_depth=6,
    learning_rate=0.3,
    n_estimators=25,
    subsample=0.5,
    colsample_bytree=0.5,
)
xgb.fit(X, y)
y_pred = xgb.predict_proba(X_test)


# In[ ]:

ids = []  # each test id, repeated five times
cts = []  # the five most probable countries for each id, best first
for i in range(len(id_test)):
    idx = id_test[i]
    ranking = np.argsort(y_pred[i])[::-1]
    top_five = le.inverse_transform(ranking)[:5].tolist()
    ids.extend([idx] * 5)
    cts.extend(top_five)

# Generate submission
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])

示例#58

0

显示文件

文件： solution.py 项目： deepakgupta1/IndiaHacks-Predict-the-Customer-Segment

                          verbose_eval=True)
        alg.set_params(n_estimators=cvresult.shape[0])

    alg.fit(dtrain[predictors], dtrain['segment'], eval_metric='auc')

    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]

    print '\nModel Report:'
    print 'AUC (Train): ', metrics.roc_auc_score(dtrain['segment'],
                                                 dtrain_predprob)

    return alg


# Parenthesised single-argument print calls run unchanged on Python 2 and 3
# (the original print statements are a SyntaxError on Python 3).
print('Training model_1...')
xgb1 = XGBClassifier(learning_rate=0.1,
                     n_estimators=10000,
                     max_depth=4,
                     gamma=0,
                     objective='binary:logistic',
                     seed=27)
model_1 = modelfit(xgb1, train, predictors)

print('Predictions in progress...')
# Submission table: the test IDs plus the probability taken from column 1
# of predict_proba.
submit = pd.DataFrame()
submit['ID'] = test['ID']
pred_1 = model_1.predict_proba(test[predictors])[:, 1]
submit['segment'] = pred_1
submit.to_csv('submit.csv', index=False)

示例#59

0

显示文件

文件： lw_preprocess.py 项目： tfaatfcn/lw-mlearn-rogerluo

def pipe_main(pipe=None):
    '''pipeline construction using sklearn estimators, final step support only
    classifiers currently

    .. note::
        data flows through a pipeline consisting of steps as below:
            raw data --> clean --> encoding --> scaling --> feature construction
            --> feature selection --> resampling --> final estimator
            see scikit-learn preprocess & estimators
    parameter
    ----
    pipe - str
        - in the format of 'xx_xx' of which 'xx' means steps in pipeline;
          steps are added in the order the keys appear, default None
    return
    ----
        1) pipeline instance of chosen steps
        2) if pipe is None, a dict indicating possible choice of 'steps'
    raise
    ----
    KeyError - a key in `pipe` does not match any known step
    ValueError - `pipe` is neither None nor a str
    '''
    clean = {
        'clean':
        Split_cls(dtype_filter='not_datetime', na1='null', na2=-999),
        'cleanNA':
        Split_cls(dtype_filter='not_datetime', na1=None, na2=None),
        'cleanMean':
        Split_cls(dtype_filter='not_datetime', na1='most_frequent',
                  na2='mean'),
    }
    #
    encode = {
        'woe': Woe_encoder(max_leaf_nodes=5),
        'oht': Oht_encoder(),
        'ordi': Ordi_encoder(),
    }

    resample = {

        # over_sampling
        'rover':
        RandomOverSampler(),
        'smote':
        SMOTE(),
        'bsmote':
        BorderlineSMOTE(),
        'adasyn':
        ADASYN(),

        # under sampling controlled methods
        'runder':
        RandomUnderSampler(),
        'nearmiss':
        NearMiss(version=3),
        'pcart':
        InstanceHardnessThreshold(),

        # under sampling cleaning methods
        'tlinks':
        TomekLinks(n_jobs=-1),
        'oside':
        OneSidedSelection(n_jobs=-1),
        'cleanNN':
        NeighbourhoodCleaningRule(n_jobs=-1),
        'enn':
        EditedNearestNeighbours(n_jobs=-1),
        'ann':
        AllKNN(n_jobs=-1),
        'cnn':
        CondensedNearestNeighbour(n_jobs=-1),

        # clean outliers
        'inlierForest':
        FunctionSampler(outlier_rejection,
                        kw_args={'method': 'IsolationForest'}),
        'inlierLocal':
        FunctionSampler(outlier_rejection,
                        kw_args={'method': 'LocalOutlierFactor'}),
        'inlierEllip':
        FunctionSampler(outlier_rejection,
                        kw_args={'method': 'EllipticEnvelope'}),
        'inlierOsvm':
        FunctionSampler(outlier_rejection, kw_args={'method': 'OneClassSVM'}),
        # combine
        'smoteenn':
        SMOTEENN(),
        'smotelink':
        SMOTETomek(),
    }

    scale = {
        'stdscale': StandardScaler(),
        'maxscale': MinMaxScaler(),
        'rscale': RobustScaler(quantile_range=(10, 90)),
        # NOTE: the key is misspelled ('qauntile'); kept as-is because
        # callers select steps by this exact string.
        'qauntile': QuantileTransformer(),  # uniform distribution
        'power': PowerTransformer(),  # Gaussian distribution
        'norm': Normalizer(),  # default L2 norm

        # scale sparse data
        'maxabs': MaxAbsScaler(),
        'stdscalesp': StandardScaler(with_mean=False),
    }
    # feature construction
    feature_c = {
        'pca': PCA(whiten=True),
        # NOTE(review): `normalize_components` appears to have been
        # deprecated and later removed from SparsePCA in newer
        # scikit-learn -- confirm against the pinned version.
        'spca': SparsePCA(normalize_components=True, n_jobs=-1),
        'ipca': IncrementalPCA(whiten=True),
        'kpca': KernelPCA(kernel='rbf', n_jobs=-1),
        'poly': PolynomialFeatures(degree=2),
        'rtembedding': RandomTreesEmbedding(n_estimators=10),
        'LDA': LinearDiscriminantAnalysis(),
        'QDA': QuadraticDiscriminantAnalysis(),
    }
    # select from model
    feature_m = {
        'fwoe':
        SelectFromModel(Woe_encoder(max_leaf_nodes=5)),
        'flog':
        SelectFromModel(
            LogisticRegressionCV(penalty='l1',
                                 solver='saga',
                                 scoring='roc_auc')),
        'fsgd':
        SelectFromModel(SGDClassifier(penalty="l1")),
        'fsvm':
        SelectFromModel(LinearSVC(penalty='l1', dual=False, C=1e-2)),
        'fxgb':
        SelectFromModel(XGBClassifier(n_jobs=-1)),
        'frf':
        SelectFromModel(ExtraTreesClassifier(n_estimators=100, max_depth=5)),
        'fRFExgb':
        RFE(XGBClassifier(n_jobs=-1), step=0.1, n_features_to_select=20),
        'fRFErf':
        RFE(ExtraTreesClassifier(n_estimators=100, max_depth=5),
            step=0.3,
            n_features_to_select=20),
        'fRFElog':
        RFE(LogisticRegressionCV(penalty='l1',
                                 solver='saga',
                                 scoring='roc_auc'),
            step=0.3,
            n_features_to_select=20)
    }
    # Univariate feature selection.
    # `mode` and `param` are passed by keyword: they are keyword-only in
    # current scikit-learn, and the keyword form is also accepted by older
    # releases.
    feature_u = {
        'fchi2':
        GenericUnivariateSelect(chi2, mode='percentile', param=25),
        'fMutualclf':
        GenericUnivariateSelect(mutual_info_classif,
                                mode='percentile',
                                param=25),
        'fFclf':
        GenericUnivariateSelect(f_classif, mode='percentile', param=25),
    }
    # sklearn estimator
    estimator = {}
    for name, cls in all_estimators(type_filter=['classifier']):
        try:
            estimator[name] = cls()
        except Exception:
            # Deliberate best effort: skip any classifier that cannot be
            # instantiated without arguments.
            continue

    # Explicit entries override same-named defaults collected above.
    estimator.update(
        dummy=DummyClassifier(),
        XGBClassifier=XGBClassifier(n_jobs=-1),
        LogisticRegressionCV=LogisticRegressionCV(scoring='roc_auc'),
        EasyEnsembleClassifier=EasyEnsembleClassifier(),
        BalancedRandomForestClassifier=BalancedRandomForestClassifier(),
        RUSBoostClassifier=RUSBoostClassifier(),
        SVC=SVC(C=0.01, gamma='auto'))

    if pipe is None:
        feature_s = {}
        feature_s.update(**feature_m, **feature_u)
        return {
            'clean': clean.keys(),
            'encoding': encode.keys(),
            'resample': resample.keys(),
            'scale': scale.keys(),
            'feature_c': feature_c.keys(),
            'feature_s': feature_s.keys(),
            'classifier': estimator.keys()
        }
    elif isinstance(pipe, str):
        all_keys_dict = {}
        all_keys_dict.update(**clean, **encode, **scale, **feature_c,
                             **feature_m, **feature_u, **estimator, **resample)
        steps = []
        for key in pipe.split('_'):
            # Single lookup; no registered step object is None, so None
            # means the key is unknown.
            step = all_keys_dict.get(key)
            if step is None:
                raise KeyError(
                    "'{}' invalid key for sklearn estimators".format(key))
            steps.append((key, step))
        return Pipeline(steps)

    else:
        raise ValueError("input pipe must be a string in format 'xx[_xx]'")

示例#60

0

显示文件

from sklearn.tree import DecisionTreeClassifier
from xgboost.sklearn import XGBClassifier

# Hold out 30% of the rows for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    data, y, test_size=0.3, random_state=0)

# Decision tree with default settings; the original author notes that
# max_depth has to be defined to prevent overfitting (it is not set here)
clf = DecisionTreeClassifier().fit(X_train, y_train)
print("Train Accuracy of clf:", clf.score(X_train, y_train))
print("Test Accuracy of clf", clf.score(X_test, y_test))

# XGBoost classifier with default settings, scored the same way
xgb = XGBClassifier().fit(X_train, y_train)
print("Train Accuracy of xgb:", xgb.score(X_train, y_train))
print("Test Accuracy of xgb:", xgb.score(X_test, y_test))

#%%
from sklearn.model_selection import GridSearchCV

# Grid-search candidates for the XGBoost classifier.
# The candidates are listed explicitly: range(2, 3, 4) and range(1, 2, 6)
# are (start, stop, step) and each yield a single value (2 and 1), which
# left max_depth and min_child_weight fixed in the grid.
param_dict = {
    'max_depth': [2, 3, 4],
    'min_child_weight': [1, 2, 6],
    'learning_rate': [0.00001, 0.001, 0.01, 0.1],
    'n_estimators': [10, 50, 100]
}