Example #1
def job_function(params):
	learning_rate = params[0]
	max_depth = params[1]
	ss_cs = params[2]
	gamma = params[3]
	min_child_weight = params[4]
	reg_lambda = params[5]
	reg_alpha = params[6]

	early_stopping_rounds = 25
	if learning_rate >= 0.3:
		early_stopping_rounds = 5
	if learning_rate <= 0.03:
		early_stopping_rounds = 50

	scores = []
	for i in range(iterations_per_job):
		X_train = Xy[i][0]
		X_test = Xy[i][1]
		y_train = Xy[i][2]
		y_test = Xy[i][3]
		
		y_train2 = le.transform(y_train)   
		y_test2 = le.transform(y_test)   

		clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=5000, objective='multi:softprob', subsample=ss_cs, colsample_bytree=ss_cs, gamma=gamma, min_child_weight=min_child_weight, seed=0, silent=True, reg_lambda=reg_lambda, reg_alpha=reg_alpha)      
		clf.fit(X_train, y_train, eval_set=[(X_test, y_test2)], eval_metric=calculate_score_2, early_stopping_rounds=early_stopping_rounds, verbose=False)
		y_predicted = clf.predict_proba(X_test, ntree_limit=clf.booster().best_ntree_limit)
		score = calculate_score(y_predicted, y_test2)
		scores.append(score)

	avg_score = np.array(scores).mean()
	print(avg_score, params)
	return avg_score
Example #2
File: model.py Project: kalleon/custom
    def _distributor(self, label, cv, param, eval_metric, early_stopping_rounds=50):
        start = time()

        if self.is_classifier:
            label = 'XGBClassifier'
            rs = XGBClassifier(**param)
        else:
            label = 'XGBRegressor'
            rs = XGBRegressor(**param)

        X_visible, X_blind, y_visible, y_blind = \
            train_test_split(
                self.X_train, self.y_train, random_state=1301, stratify=self.y_train, test_size=0.4)

        rs.fit(X_visible, y_visible, eval_metric=eval_metric, early_stopping_rounds=early_stopping_rounds,
               eval_set=[(X_visible, y_visible), (X_blind, y_blind)])

        self.result[label] = {}
        self.result[label]['clf'] = rs
        # self.result[label]['score'] = rs.best_score_
        self.result[label]['time'] = time() - start
        # self.result[label]['set'] = ('n_iter: %s cv: %s' % (n_iter, cv))

        pprint.pprint(self.result[label])
        # pprint.pprint(rs.grid_scores_)

        out_result = open(self.result_address, 'wb')
        pickle.dump(self.result, out_result)
        out_result.close()
def extract_leaf_feature(features, targets, train_indexes, params):
    model = XGBClassifier(**params)
    model.fit(features[train_indexes], targets[train_indexes])
    booster = model.booster()
    dmatrix = xgb.DMatrix(features)
    leaf = booster.predict(dmatrix, pred_leaf=True)
    encoder = sklearn.preprocessing.OneHotEncoder()
    leaf_feature = encoder.fit_transform(leaf)
    return leaf_feature
def main(training_data, test_data):
    # Merging data to ensure consistent cleaning. Putting marker variable to separate later.
    training_data['source'] = 'training'
    test_data['source'] = 'test'
    merged_data = pd.concat([training_data, test_data])

    # Cleaning data
    cleaned_data = data_cleaner(merged_data)

    # Separating data, removing marker
    pred_df = cleaned_data[cleaned_data['source'] == 'training'].copy()
    test_pred = cleaned_data[cleaned_data['source'] == 'test'].copy()

    pred_df.drop('source', axis=1, inplace=True)
    test_pred.drop('source', axis=1, inplace=True)

    # Transforming target into ints, saving the key for later transformation
    labels = LabelEncoder().fit(training_data['country_destination'])
    target_df = pd.Series(labels.transform(training_data['country_destination']), index=training_data.index)

    # Training model
    xgb_model = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25, objective='multi:softprob',
                              subsample=0.5, colsample_bytree=0.5, seed=0)
    xgb_model.fit(pred_df.as_matrix(), target_df.tolist())

    # Running the model
    preds = xgb_model.predict_proba(test_pred.as_matrix())

    # Selecting the top 5 most likely for each respondent and stacking. 
    # This section is VERY slow and could be optimized
    model_probs = pd.DataFrame(preds, index=test_pred.index, columns=labels.classes_)

    stacked_probs = pd.DataFrame()
    for i in model_probs.index:
        temp = model_probs.loc[i, :]
        temp_sort = pd.DataFrame(temp.sort_values(ascending=False)[:5].index)

        temp_sort['id'] = i
        temp_sort.columns = ['country', 'id']

        stacked_probs = pd.concat([stacked_probs, temp_sort])

    # # Selecting classes with highest probabilities, compiling into list
    # ids = []
    # cts = []
    # test_ids = pd.Series(test_data.index)
    # for i in range(len(test_ids)):
    #     idx = test_data.index[i]
    #     ids += [idx] * 5
    #     cts += labels.inverse_transform(np.argsort(model_probs[i])[::-1])[:5].tolist()
    #
    # predictions = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])

    # Cleaning output and returning it
    output = stacked_probs[['id', 'country']]
    return output
 def eval_fn(params):
     model = XGBClassifier(n_estimators=n_estimators_max, learning_rate=learning_rate, seed=seed)
     score = 0
     n_estimators = 0
     for tr, va in skf:
         X_tr, y_tr = X_train[tr], y_train[tr]
         X_va, y_va = X_train[va], y_train[va]
         model.set_params(**params)
         model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], eval_metric='logloss',
                   early_stopping_rounds=50, verbose=False)
         score += model.best_score
         n_estimators += model.best_iteration
     score /= n_folds
     n_estimators /= n_folds
     n_estimators_lst.append(n_estimators)
     result_str = "train:%.4f ntree:%5d  " % (score, n_estimators)
     if X_valid is not None:
         model.n_estimators = n_estimators
         model.fit(X_train, y_train)
         pr = model.predict_proba(X_valid)[:,1]
         sc_valid = log_loss(y_valid, pr)
         score_valid.append(sc_valid)
         result_str += "valid:%.4f" % sc_valid
     if verbose:
         print(result_str)
     return score
Example #6
def objective(space):

    clf = XGBClassifier(n_estimators=int(space['n_estimators']),
                        objective='binary:logistic',
                        seed=37,
                        learning_rate=space['learning_rate'],
                        max_depth=space['max_depth'],
                        min_child_weight=space['min_child_weight'],
                        colsample_bytree=space['colsample_bytree'],
                        subsample=space['subsample'])

    clf.fit(xTrain, yTrain, eval_metric="logloss")
    pred = clf.predict_proba(xValid)[:, 1]
    loss = log_loss(yValid, pred)
    return {'loss': loss, 'status': STATUS_OK}
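
# Usage sketch (not part of the original example): an objective of this shape is
# typically minimized with hyperopt's fmin/tpe. The search-space bounds below are
# illustrative only; xTrain/yTrain/xValid/yValid are assumed from the surrounding code.
from hyperopt import Trials, fmin, hp, tpe

space = {
    'n_estimators': hp.quniform('n_estimators', 100, 1000, 25),
    'learning_rate': hp.loguniform('learning_rate', -5, -1),
    'max_depth': hp.choice('max_depth', [3, 4, 5, 6, 7]),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
}
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=50, trials=trials)
print(best)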
Example #7
    def GBDT(self, report=False):
        """Gradient Boosting Decision Tree.

        Args:
            report: whether print out the model analysis report.
        Returns:
            Decision tree model generated from Gradient Boosting Decision Tree."""
        from xgboost.sklearn import XGBClassifier

        self.gbdt = XGBClassifier(objective='binary:logistic',
                                  booster='gbtree',
                                  learning_rate=0.01,
                                  n_estimators=5000,
                                  max_depth=3,
                                  subsample=0.75,
                                  colsample_bytree=0.75,
                                  n_jobs=4,
                                  random_state=2018)

        self.gbdt.fit(self.train_prep[self.features], self.train_prep[self.target])
        
        if report:
            from Report import Report
            rpt = Report(self.gbdt, self.train, self.valid, self.target, self.features)
            rpt.ALL()

        return self.gbdt
Example #8
    def fit(self, json_train, n_estimators = 10, is_xgb = True):

        train = self.pre_process(json_train, istrain = True)
        
        bow_vectorizer = BagOfWordsVectorizer()
        word2vec_model = Word2VecModel()
        tag_counter_model = TagCounterModel()

        # word2vec_model.fit(train["author_pos_sentences"], 500)
        # author_features = word2vec_model.transform(train["author_pos_sentences"], "author")
        # self.author_model = word2vec_model.get_model()

#        bow_vectorizer.fit(train["title_pos_sentences"], 1000)
#        title_features = bow_vectorizer.transform(train["title_pos_sentences"], "title")
#        self.title_model = bow_vectorizer.get_vectorizer()

        bow_vectorizer.fit(train["text_pos_sentences"], 1000)
        text_features = bow_vectorizer.transform(train["text_pos_sentences"], "text")
        self.text_model = bow_vectorizer.get_vectorizer()

#        tag_features = tag_counter_model.fit_transform(train["text"])
#        self.tag_model = tag_counter_model.get_col()

        train = pd.concat([train, text_features], axis = 1)

        #le = preprocessing.LabelEncoder()

        # train["forumid"] = le.fit_transform(train["forumid"])
        
        label = train['istroll']
        train = train.drop('istroll', axis=1)
        train = train.drop(['text', 'text_pos', 'text_pos_sentences'], axis=1)
        
        print(train.columns)

        train.columns = [str(x) for x in range(len(train.columns))]
        
        if is_xgb == False:
            self.model = RandomForestClassifier(n_estimators, n_jobs=-1)
        else:
            self.model = XGBClassifier(n_estimators = n_estimators, max_depth = 10)

        print(train.shape)
        self.model.fit(train, label)
Example #9
def apply_xgb_ens(y_valid, valid_folder='Valid', test_folder='Test'):
    """
    Ensembler based on xgboost Gradient boosting.
    """
    #Loading data
    X, X_test, n_preds, n_class = get_X_X_Test(valid_folder, test_folder)
    y = y_valid
    
    #Defining classifier
    xgb = XGBClassifier(max_depth=4, learning_rate=0.05, n_estimators=200,
                        objective='multi:softprob', gamma=0., 
                        max_delta_step=0., subsample=0.9, colsample_bytree=0.9,
                        seed=0)  
    xgb.fit(X, y)   
    y_pred = xgb.predict_proba(X_test)
    return y_pred      
    
    
    
def perform_prediction(training, labels, testing, xgb_votes, rf_votes):
    """ Perform prediction using a combination of XGB and RandomForests. """
    predictions = np.zeros((len(testing), len(set(labels))))
    # Predictions using xgboost.
    for i in range(xgb_votes):
        print('XGB vote %d' % i)
        xgb = XGBClassifier(
            max_depth=DEPTH_XGB, learning_rate=LEARNING_XGB,
            n_estimators=ESTIMATORS_XGB, objective='multi:softprob',
            subsample=SUBSAMPLE_XGB, colsample_bytree=COLSAMPLE_XGB)
        xgb.fit(training, labels)
        predictions += xgb.predict_proba(testing)
    # Predictions using RandomForestClassifier.
    for i in range(rf_votes):
        print('RandomForest vote %d' % i)
        rand_forest = RandomForestClassifier(
            n_estimators=ESTIMATORS_RF, criterion=CRITERION_RF, n_jobs=JOBS_RF,
            max_depth=DEPTH_RF, min_samples_leaf=MIN_LEAF_RF, bootstrap=True)
        rand_forest.fit(training, labels)
        predictions += rand_forest.predict_proba(testing)
    return predictions
Example #11
def xgboostinitial_predictor(train_path, test_path, eval_path):
    # Loading the data
    print('Loading the data...')
    train = pd.read_csv(train_path, index_col=0)
    test = pd.read_csv(test_path, index_col=0)
    eval_df = pd.read_csv(eval_path, index_col=0)
    target = train['target'].copy()
    train.drop('target', axis=1, inplace=True)

    # Training model
    print('Model training begins...')
    # xgtrain = xgb.DMatrix(train.values, target.values, missing=np.nan)
    # xgboost_params = {'objective': 'binary:logistic', 'booster': 'gbtree', 'eval_metric': 'logloss', 'eta': 0.01,
    #                   'subsample': 0.5, 'colsample_bytree': 0.5, 'max_depth': 10, 'silent': 0}
    #
    # xgb_model = xgb.train(xgboost_params, xgtrain, learning_rates=0.3)

    xgb_model = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25, objective='binary:logistic',
                              subsample=0.5, colsample_bytree=0.5, seed=0)
    xgb_model.fit(train.as_matrix(), target.tolist())

    # Running the model
    print('Making predictions....')
    # xgtest = xgb.DMatrix(test.values)
    # xgeval = xgb.DMatrix(eval_df)

    test_preds = xgb_model.predict_proba(test.as_matrix())
    eval_preds = xgb_model.predict_proba(eval_df.as_matrix())

    print('Cleaning predictions to match expected format....')
    test_output = pd.DataFrame(test_preds, index=test.index)
    print(test_output.columns)
    test_output = test_output[[1]]
    test_output.columns = ['PredictedProb']

    eval_output = pd.DataFrame(eval_preds, index=eval_df.index)
    eval_output = eval_output[[1]]
    eval_output.columns = ['PredictedProb']

    return test_output, eval_output
def train_classifier(X, y, clf_name='xgb'):
    if clf_name == 'xgb':
        clf = XGBClassifier(
            n_estimators=ESTIMATORS_XG,
            objective=OBJECTIVE_XG,
            max_depth=DEPTH_XG,
            learning_rate=LEARNING_RATE_XG,
            subsample=SUBSAMPLE_XG,
            colsample_bytree=COLSAMPLE_BYTREE_XG,
            seed=0,
        )
    else:
        clf = RandomForestClassifier(
            n_estimators=ESTIMATORS_RF,
            criterion=CRITERION_RF,
            n_jobs=JOBS_RF,
            max_depth=DEPTH_RF,
            min_samples_leaf=MIN_LEAF_RF,
            min_samples_split=MIN_SPLIT_RF,
            max_features=MAX_FEATURES_RF,
            bootstrap=True,
        )
    clf.fit(X, y)
    return clf
def get_xgboost_classifier(X_train, y_train, X_val, y_val,params=None, tag=""):
    
    param_grid = {'max_depth':[3,5,7], 'min_child_weight': [1,3,5], 'n_estimators': [50]}
    
    if params is None:
        xgb = XGBClassifier(
                 learning_rate =0.2,
                 objective= 'binary:logistic',
                 seed=27)
                 
        t = start("training xgboost ")
        cv = cross_validation.ShuffleSplit(X_train.shape[0], n_iter=10,test_size=0.2, random_state=123)
        clf = grid_search.GridSearchCV(xgb, param_grid, cv=cv, n_jobs=1, scoring='roc_auc')
        clf = clf.fit(X_train,y_train)
        report(t, nitems=10*len(param_grid))
        
        print("Best score:{} with scorer {}".format(clf.best_score_, clf.scorer_))
        print "With parameters:"
    
        best_parameters = clf.best_estimator_.get_params()
        for param_name in sorted(param_grid.keys()):
            print('\t%s: %r' % (param_name, best_parameters[param_name]))
    else:
        clf = XGBClassifier(**params)
        clf.fit(X_train, y_train, eval_set =  [(X_train,y_train),(X_val,y_val)], eval_metric='auc', verbose=False)
        
        if plot_cv_curves:
            train = clf.evals_result()['validation_0']['auc']
            val = clf.evals_result()['validation_1']['auc']
        
            plot_cv_curve(train, val, tag)
        
        if plot_feature_importance:
            plot_feature_importance(clf, tag)

    return clf
def main():
    data_train = pd.read_csv(args.train_dataset)
    X_train = data_train.drop(['Id', 'Class'], axis=1)
    y_train = data_train.loc[:, 'Class']
    data_test = pd.read_csv(args.test_dataset)
    X_test = data_test.drop(['Id'], axis=1)
    Id = data_test.loc[:, 'Id']
    clf = XGBClassifier()
    clf.set_params(**best_dicts)
    clf.fit(X_train, y_train)
    prediction = clf.predict_proba(X_test)
    columns = ['Prediction'+str(i) for i in range(1, 10)]
    prediction = pd.DataFrame(prediction, columns=columns)
    results = pd.concat([Id, prediction], axis=1)
    return (clf, results)
Example #15
def myThreadFunc(ThreadID):
	X_train = Xy[ThreadID][0]
	X_test = Xy[ThreadID][1]
	y_train = Xy[ThreadID][2]
	y_test = Xy[ThreadID][3]
		
	y_train2 = le.transform(y_train)   
	y_test2 = le.transform(y_test)   

	clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=5000, objective='multi:softprob', subsample=ss_cs, colsample_bytree=ss_cs, gamma=gamma, min_child_weight=min_child_weight, seed=0, silent=True, reg_lambda=reg_lambda, reg_alpha=reg_alpha)      
	clf.fit(X_train, y_train, eval_set=[(X_test, y_test2)], eval_metric=calculate_score_2, early_stopping_rounds=early_stopping_rounds, verbose=False)
	y_predicted = clf.predict_proba(X_test, ntree_limit=clf.booster().best_ntree_limit)
	score = calculate_score(y_predicted, y_test2)
	print(score, clf.booster().best_ntree_limit)
	
	train_and_test_scores[ThreadID] = score
# In[91]:

# Random Forests

random_forest = RandomForestClassifier(random_state=1, n_estimators=45, min_samples_split=3, min_samples_leaf=2)

random_forest.fit(X, y)
score=random_forest.score(X, y)

Y_pred = random_forest.predict(X_test)


# In[14]:

#Classifier
xgb = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25,
                    objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0)                  
xgb.fit(X, y)
score = xgb.score(X,y)
y_pred = xgb.predict_proba(X_test)  



# In[15]:

print (score)


# In[21]:

# for Random forest
data1_new_train = data1_new.ix[0:11016, :]
data1_new_test = data1_new.ix[11017:, :]
X_train, X_test, Y_train, Y_test = train_test_split(data1_new_train,
                                                    Y,
                                                    test_size=0.3,
                                                    random_state=42)

#Train the XGBoost model
#Set initial parameters
xgb_train = XGBClassifier(booster="gbtree",
                          learning_rate=0.02,
                          n_estimators=1000,
                          max_depth=6,
                          min_child_weight=5,
                          gamma=0,
                          reg_alpha=65,
                          reg_lambda=10,
                          subsample=0.81,
                          colsample_bytree=0.81,
                          objective='binary:logistic',
                          nthread=8,
                          scale_pos_weight=3.9,
                          seed=27)
xgb_train.fit(X_train, Y_train, eval_metric="auc")
pred = xgb_train.predict(X_test)
from sklearn import metrics

fpr, tpr, thresholds = metrics.roc_curve(Y_test, pred)
metrics.auc(fpr, tpr)
np.mean(f1_score(Y_test, pred, average=None))
y_prediction = xgb_train.predict(data1_new_test)
predict_result = DataFrame({"user_id": user_id, "y_prediction": y_prediction})
Example #18
def model1(df_train, df_test):
	print('model1')

	print('rows', df_train.shape[0]) 

	#remove rows with no sessions data
	hassessions = df_train['HasSessions']
	df_train = df_train.drop(hassessions[hassessions == 0].index)

	#remove rows older than 1/1/2014
	#dac2 = df_train.date_account_created.apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
	#print('removing rows', len(dac2[dac2 < datetime.strptime('20140101', '%Y%m%d')].index))
	#df_train = df_train.drop(dac2[dac2 < datetime.strptime('20140101', '%Y%m%d')].index)

	print('rows', df_train.shape[0]) 

	labels = df_train['country_destination'].values
	df_train = df_train.drop(['country_destination'], axis=1)
	piv_train = df_train.shape[0]

	#Creating a DataFrame with train+test data
	df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
	#Removing id and date_first_booking
	df_all = df_all.drop(['id', 'date_first_booking', 'sessions_count', 'HasSessions'], axis=1)

	#Filling nan
	df_all = df_all.fillna(-1)

	#####Feature engineering#######
	print('features in the csv', df_all.shape[1])

	#date_account_created
	print('dac', datetime.now())
	dac = np.vstack(df_all.date_account_created.astype(str).apply(lambda x: list(map(int, x.split('-')))).values)
	df_all['dac_year'] = dac[:,0]
	df_all['dac_month'] = dac[:,1]
	df_all['dac_day'] = dac[:,2]

	#day of week, season
	print('dac2', datetime.now())
	dac2 = df_all.date_account_created.apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
	df_all['dac_weekday'] = dac2.apply(lambda x: x.weekday())
	df_all['dac_season'] = dac2.apply(calculate_season)

	df_all = df_all.drop(['date_account_created'], axis=1)

	#timestamp_first_active
	print('tfa', datetime.now())
	tfa = np.vstack(df_all.timestamp_first_active.astype(str).apply(lambda x: list(map(int, [x[:4],x[4:6],x[6:8],x[8:10],x[10:12],x[12:14]]))).values)
	df_all['tfa_year'] = tfa[:,0]
	df_all['tfa_month'] = tfa[:,1]
	df_all['tfa_day'] = tfa[:,2]
	df_all = df_all.drop(['timestamp_first_active'], axis=1)

	#Age
	print('age', datetime.now())
	av = df_all.age.values
	df_all['age'] = np.where(np.logical_or(av<14, av>100), -1, av)

	#remove features
	print('remove features', datetime.now())
	df_all = df_all.drop(['Sessions' + str(i) for i in [0]], axis=1)
	df_all = df_all.drop(['SessionsD' + str(i) for i in range(456)], axis=1)

	print('features in the model', df_all.shape[1])

	#One-hot-encoding features
	print('one-hot', datetime.now())
	ohe_feats = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser', 'dac_season', 'sessions_preferred_device'] 

	for f in ohe_feats:
		df_all_dummy = pd.get_dummies(df_all[f], prefix=f)
		df_all = df_all.drop([f], axis=1)
		df_all = pd.concat((df_all, df_all_dummy), axis=1)

	#Splitting train and test
	vals = df_all.values
	X = vals[:piv_train]
	y = labels
	X_predict = vals[piv_train:]

	#learning_rate, max_depth, ss_cs, gamma, min_child_weight, reg_lambda, reg_alpha =  0.03, 6, 0.5, 2, 2, 2, 1
	learning_rate, max_depth, ss_cs, gamma, min_child_weight, reg_lambda, reg_alpha =  0.03, 8, 0.5, 2, 1, 2, 0

	early_stopping_rounds = 25
	if learning_rate <= 0.03:
		early_stopping_rounds = 50

	print(learning_rate, max_depth, ss_cs, gamma, min_child_weight, reg_lambda, reg_alpha)

	#n_estimators = 455
	n_estimators = 350
	#n_estimators = 1
	print(n_estimators)

	print('fit start', datetime.now())
	clf2 = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators, objective='multi:softprob', subsample=ss_cs, colsample_bytree=ss_cs, gamma=gamma, min_child_weight=min_child_weight, seed=0, silent=True, reg_lambda=reg_lambda, reg_alpha=reg_alpha, nthread=-1)      
	clf2.fit(X, y)
	y_predicted2 = clf2.predict_proba(X_predict)  

	return y_predicted2
Example #19
    y_pred = clf_grid.predict(X_test)

    print('best parameter:\n', clf_grid.best_params_)
    print('Best score is {}'.format(clf_grid.best_score_))
    print('accuracy:', metrics.accuracy_score(y_test, y_pred), 'precision:', metrics.precision_score(y_test, y_pred),
          'recall:', metrics.recall_score(y_test, y_pred), 'f-score:', metrics.f1_score(y_test, y_pred),
          'cm:', metrics.confusion_matrix(y_test, y_pred))

    # XGBoost
    # Grid search with step-by-step parameter tuning:
    #   1. fix learning rate and number of estimators for tuning the tree-based parameters;
    #   2. tune max_depth and min_child_weight;
    #   3. tune gamma;
    #   4. tune subsample and colsample_bytree;
    #   5. tune the regularization parameters;
    #   6. tune learning rate and number of estimators.
    # Reference guide: https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
    # Tuning learning rate and number of estimators as an example
    from xgboost.sklearn import XGBClassifier

    param_test5 = {'n_estimators': range(50, 500, 50), 'learning_rate': [i/100.0 for i in range(1, 10)]}
    gsearch5 = GridSearchCV(estimator=XGBClassifier(max_depth=5, min_child_weight=1, gamma=0.0, subsample=0.9, colsample_bytree=0.7, reg_alpha=0.1,
                                                    objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=2),
                            param_grid=param_test5, scoring='accuracy', n_jobs=-1, iid=False, cv=5)

    gsearch5.fit(X_train, y_train)
    y_pred = gsearch5.predict(X_test)

    print('best parameter:\n', gsearch5.best_params_)
    print('Best score is {}'.format(gsearch5.best_score_))
    print('accuracy:', metrics.accuracy_score(y_test, y_pred), 'precision:', metrics.precision_score(y_test, y_pred),
          'recall:', metrics.recall_score(y_test, y_pred), 'f-score:', metrics.f1_score(y_test, y_pred),
          'cm:', metrics.confusion_matrix(y_test, y_pred))

    # XGBoost feature importance score with the best parameters
    clf = XGBClassifier(max_depth=5, min_child_weight=1, gamma=0.0, subsample=0.9, colsample_bytree=0.7,
                        reg_alpha=0.1, n_estimators=350, learning_rate=0.05)
    clf.fit(X_train, y_train)
    clf.score(X_test, y_test)
Example #20
class XGBoostModel(BaseModel):
    """RandomForest classifier."""
    def __init__(self,
                 max_depth=3,
                 learning_rate=0.1,
                 n_estimators=100,
                 objective="binary:logistic",
                 booster='gbtree',
                 silent=True,
                 n_jobs=1,
                 gamma=0,
                 min_child_weight=1,
                 max_delta_step=0,
                 subsample=1,
                 colsample_bytree=1,
                 colsample_bylevel=1,
                 reg_alpha=0,
                 reg_lambda=1,
                 scale_pos_weight=1,
                 base_score=0.5,
                 random_state=0,
                 missing=None):
        """"""
        super(XGBoostModel, self).__init__()
        self.model = XGBClassifier(max_depth=max_depth,
                                   learning_rate=learning_rate,
                                   n_estimators=n_estimators,
                                   silent=silent,
                                   objective=objective,
                                   booster=booster,
                                   n_jobs=n_jobs,
                                   gamma=gamma,
                                   min_child_weight=min_child_weight,
                                   max_delta_step=max_delta_step,
                                   subsample=subsample,
                                   colsample_bytree=colsample_bytree,
                                   colsample_bylevel=colsample_bylevel,
                                   reg_alpha=reg_alpha,
                                   reg_lambda=reg_lambda,
                                   scale_pos_weight=scale_pos_weight,
                                   base_score=base_score,
                                   random_state=random_state,
                                   missing=missing)

    def predict(self, features):
        super().predict(features)
        labels = self.model.predict(features)
        return labels

    def predict_prob(self, features):
        super().predict_prob(features)
        probs = self.model.predict_proba(features)
        return probs

    def predict_log_prob(self, features):
        super().predict_log_prob(features)
        probs = np.log(self.model.predict_proba(features))  # log-probabilities
        return probs

    def train(self, features, targets):
        super().train(features, targets)
        start = time.time()
        self.model.fit(X=features, y=targets)
        print('Finished, time %s' % (time.time() - start))

    def accuracy_score(self, features, targets):
        super().accuracy_score(features, targets)
        score = self.model.score(features, targets)
        return score

    def abs_errors(self, features, targets):
        targets_pred = self.predict(features)
        result = abs(targets_pred - targets)
        return result

    def rmse_score(self, y_pred, y_true):
        """
        计算RMSE评分,为了体现预测结果0、1、2不同的重要性,增加对1,2预测错误的惩罚度,
        在评分计算时对不同行为分别乘以1,2,2.5的权重因子。
        np.average((y_true - y_pred) ** 2, axis=0, weights=weights)
        :param y_pred: 预测标签
        :param y_true: 真实标签
        :return: 评分
        """
        weight_dict = {0: 1, 1: 2, 2: 2.5}  # 不同类别的误判惩罚权重
        weights = [weight_dict[l] for l in y_true]
        mse = np.average((y_true - y_pred)**2, axis=0, weights=weights)
        score = 1 / (1 + np.sqrt(mse))
        return score
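
    # Worked example for rmse_score (not in the original): with
    # y_true = np.array([0, 1, 2, 2]) and y_pred = np.array([0, 1, 1, 2]),
    # the weights are [1, 2, 2.5, 2.5], the weighted MSE is 2.5 / 8 = 0.3125,
    # and the score is 1 / (1 + sqrt(0.3125)) ≈ 0.64.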
    #
    #feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    #feat_imp.plot(kind='bar', title='Feature Importances')
    #plt.ylabel('Feature Importance Score')


### Build the model
print('### Building XGBClassifier')
xgbc = XGBClassifier(
    learning_rate=0.1,  # learning rate
    silent=1,  # whether to print intermediate output
    n_estimators=150,  # number of trees
    max_depth=5,  # tree depth
    min_child_weight=1,  # minimum sum of instance weights in a child node
    gamma=0,  # penalty-term coefficient (minimum loss reduction for a split)
    subsample=0.8,  # fraction of the data used to grow each tree
    colsample_bytree=0.8,  # fraction of the features used to grow each tree
    objective='binary:logistic',  # loss function
    nthread=4,  # number of threads
    scale_pos_weight=1,  # class-imbalance weighting
    eval_metric='logloss',  # evaluation metric
    reg_alpha=0.03,  # regularization coefficient
    seed=27)  # random seed

### Grid search
## Step 1: number of trees (n_estimators)
print("### Tuning: number of trees")
#modelfit(xgbc, train, test, predictors)
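
# A minimal sketch (not the original modelfit helper, which is not shown here):
# use xgb.cv with early stopping to choose n_estimators for the xgbc defined above.
# `train`, `predictors` and the target column name `target` are assumed from the surrounding project.
import xgboost as xgb

xgb_param = xgbc.get_xgb_params()
cvresult = xgb.cv(xgb_param,
                  xgb.DMatrix(train[predictors].values, label=train[target].values),
                  num_boost_round=xgbc.get_params()['n_estimators'],
                  nfold=5, metrics='logloss', early_stopping_rounds=50)
xgbc.set_params(n_estimators=cvresult.shape[0])
print("Suggested n_estimators:", cvresult.shape[0])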

## Step 2: tree parameters max_depth/min_child_weight/gamma/subsample/colsample_bytree
print("### Tuning: tree parameters")
Example #22
		Xy.append([X_train, X_test, y_train, y_test])

	for iter in range(iterations):
#		if iter < 5:
#			continue
		X_train = Xy[iter][0]
		X_test = Xy[iter][1]
		y_train = Xy[iter][2]
		y_test = Xy[iter][3]

		y_train2 = le.transform(y_train)   
		y_test2 = le.transform(y_test)   

		print('fit start', datetime.now())

		clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators, objective='multi:softprob', subsample=ss_cs, colsample_bytree=ss_cs, gamma=gamma, min_child_weight=min_child_weight, seed=0, silent=True, reg_lambda=reg_lambda, reg_alpha=reg_alpha, nthread=nthread)      
		clf.fit(X_train, y_train, eval_set=[(X_test, y_test2)], eval_metric=calculate_score_2)

submit = 0
if submit == 1:
#	n_estimators = 395
	n_estimators = 349
	#n_estimators = clf.booster().best_ntree_limit 
	print(n_estimators)

	print('fit start', datetime.now())
	clf2 = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators, objective='multi:softprob', subsample=ss_cs, colsample_bytree=ss_cs, gamma=gamma, min_child_weight=min_child_weight, seed=0, silent=True, reg_lambda=reg_lambda, reg_alpha=reg_alpha, nthread=nthread)      
	clf2.fit(X, y)
	#clf2.fit(X, y, eval_set=[(X, y2)], eval_metric=calculate_score_dummy, early_stopping_rounds=n_estimators)

	y_predicted = clf2.predict_proba(X_predict)  
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


#define X y
X, y = data.loc[:,data.columns != 'state'].values, data.loc[:,data.columns == 'state'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

#ClusterCentroids
cc = ClusterCentroids(random_state=0)
os_X,os_y = cc.fit_sample(X_train,y_train)

#XGboost
clf_XG = XGBClassifier(learning_rate= 0.3, min_child_weight=1,
                       max_depth=6,gamma=0,subsample=1, max_delta_step=0, colsample_bytree=1,
                       reg_lambda=1, n_estimators=100, seed=1000, scale_pos_weight=1000)  
clf_XG.fit(os_X, os_y,eval_set=[(os_X, os_y), (X_test, y_test)],eval_metric='auc',verbose=False)  
evals_result = clf_XG.evals_result()  
y_true, y_pred = y_test, clf_XG.predict(X_test)  

#F1_score, precision, recall, specificity, G score
print("F1_score : %.4g" % metrics.f1_score(y_true, y_pred))
print("Recall : %.4g" % metrics.recall_score(y_true, y_pred))
recall = metrics.recall_score(y_true, y_pred)
print("Precision : %.4g" % metrics.precision_score(y_true, y_pred))
 
#Compute confusion matrix
cnf_matrix = confusion_matrix(y_test,y_pred)
np.set_printoptions(precision=2)
print "Specifity: " , float(cnf_matrix[0,0])/(cnf_matrix[0,0]+cnf_matrix[0,1])
Example #24
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.svm import SVC
import numpy as np
# Prepare three models in advance; good parameters for the xgb and rf models have already been found via cross_validation
clf1 = XGBClassifier(learning_rate=0.1,
                     n_estimators=140,
                     max_depth=1,
                     min_child_weight=2,
                     gamma=0,
                     subsample=0.7,
                     colsample_bytree=0.6,
                     objective='binary:logistic',
                     nthread=4,
                     scale_pos_weight=1)
clf2 = RandomForestClassifier(n_estimators=50,
                              max_depth=1,
                              min_samples_split=4,
                              min_samples_leaf=54,
                              oob_score=True)
clf3 = SVC(C=0.1, probability=True)

from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
eclf = VotingClassifier(estimators=[('xgb', clf1), ('rf', clf2),
                                    ('svc', clf3)],
                        voting='hard')
for clf, label in zip([clf1, clf2, clf3, eclf],
                      ['XGBBoosting', 'Random Forest', 'SVM', 'Ensemble']):
    scores = cross_val_score(clf, x, y, cv=5, scoring='accuracy')
Example #25
File: xgboost.py Project: Helen-n/kaggle
data=pd.read_csv("../data/data.csv")
data.lon.unique().shape

data_x=pd.get_dummies(data.action_type,prefix="action_type")
cols=["combined_shot_type","game_event_id","period","playoffs",
      "shot_type","shot_zone_area","shot_zone_basic","shot_zone_range",
      "matchup","opponent","game_date","shot_distance","minutes_remaining","seconds_remaining",
      "loc_x","loc_y"]
for col in cols:
    data_x=pd.concat([data_x,pd.get_dummies(data[col],prefix=col),],axis=1)
train_x=data_x[~pd.isnull(data.shot_made_flag)]
test_x=data_x[pd.isnull(data.shot_made_flag)]
train_y=data.shot_made_flag[~pd.isnull(data.shot_made_flag)]

clf = XGBClassifier(max_depth=6, learning_rate=0.01, n_estimators=550,
                     subsample=0.5, colsample_bytree=0.5, seed=0)
clf.fit(train_x, train_y)
y_pred = clf.predict(train_x)
print("Number of mislabeled points out of a total %d points : %d"  % (train_x.shape[0],(train_y != y_pred).sum()))

def logloss(act, pred):
    epsilon = 1e-15
    pred = sp.maximum(epsilon, pred)
    pred = sp.minimum(1-epsilon, pred)
    ll = sum(act*sp.log(pred) + sp.subtract(1,act)*sp.log(sp.subtract(1,pred)))
    ll = ll * -1.0/len(act)
    print(ll)
    return ll
    
logloss(train_y,clf.predict_proba(train_x)[:,1])
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

# In[29]:

preprocessor = make_pipeline(SelectKBest(f_classif, k=10),
                             PolynomialFeatures(2))

AdaBoost = make_pipeline(preprocessor, AdaBoostClassifier(random_state=0))
SVM = make_pipeline(preprocessor, StandardScaler(), SVC(random_state=0))
GBoost = make_pipeline(preprocessor, StandardScaler(),
                       GradientBoostingClassifier())
RandomForest = make_pipeline(preprocessor, RandomForestClassifier())
XGB = make_pipeline(preprocessor, XGBClassifier())
Extree = make_pipeline(preprocessor, ExtraTreesClassifier())

dict_of_models = {
    'AdaBoost': AdaBoost,
    'SVM': SVM,
    'GBoost': GBoost,
    'RandomForest': RandomForest,
    'XGB': XGB,
    'Extree': Extree
}

# In[30]:

from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.model_selection import learning_curve
Example #27
from xgboost.sklearn import XGBClassifier

# load data
dataset = loadtxt("pima-indians-diabetes.csv", delimiter=",")
X = dataset[:, 0:8]
Y = dataset[:, 8]

seed = 7

test_size = 0.33
X_train, x_test, Y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=test_size,
                                                    random_state=seed)

model = XGBClassifier()
eval_set = [(x_test, y_test)]
# early_stopping_rounds: stop if the result has not improved for N consecutive rounds
# eval_metric: the loss / evaluation metric
# eval_set: A list of (X, y) pairs to use as a validation set for early-stopping
# verbose: print the evaluation results
model.fit(X_train,
          Y_train,
          early_stopping_rounds=10,
          eval_metric="logloss",
          eval_set=eval_set,
          verbose=True)

# make predictions for test data
y_pred = model.predict(x_test)
predictions = [round(value) for value in y_pred]
for data_file in data_files:
    if exists_in_log_file(data_file):
        # Skip if already trained and tested
        print('Skipping {}'.format(data_file))
        continue

    print('Training on {}'.format(data_file))

    # Loading data
    data = dill.load(open(os.path.join(DATA_DIR, data_file), 'rb'))

    for months_before in data.keys():
        train_x = data[months_before]["TRAIN"]["X"]
        train_y = data[months_before]["TRAIN"]["y"]
        test_x = data[months_before]["TEST"]["X"]
        test_y = data[months_before]["TEST"]["y"]

        # Creating and training model
        clf = XGBClassifier(n_estimators=N_ESTIMATORS,random_state=1,
                            verbose=1, n_jobs=N_JOBS)
        clf.fit(train_x, train_y, verbose=True)

        # Scoring
        pred_y = clf.predict_proba(test_x)

        auc_score = roc_auc_score(test_y, pred_y[:,1])
        log_score = log_loss(test_y, pred_y)

        logging.info('{}, {}, {}, {}'.format(data_file, months_before, auc_score, log_score))
Example #29
    def train(self, train_set, dev_set):
        logger.log('Get features from training set')
        if os.path.exists(train_features_file):
            train_features = np.load(train_features_file)
            _, _, train_labels, _, _ = self.get_minibatch(
                train_set, 0, len(train_set))
        else:
            train_features = None
            train_labels = []
            total_batch = (len(train_set) - 1) // self.batch_size + 1
            for i in tqdm(range(total_batch)):
                minibatch_premise_vectors, minibatch_hypothesis_vectors, minibatch_labels, \
                    minibatch_prem_dep, minibatch_hypo_dep = \
                    self.get_minibatch(train_set, i * self.batch_size, (i+1) * self.batch_size)
                feed_dict = {
                    self.model.premise_x: minibatch_premise_vectors,
                    self.model.hypothesis_x: minibatch_hypothesis_vectors,
                    self.model.y: minibatch_labels,
                    self.model.keep_rate_ph: 1.0
                }
                if 'dep_avg' in self.model_type:
                    feed_dict[self.model.prem_dep] = minibatch_prem_dep
                    feed_dict[self.model.hypo_dep] = minibatch_hypo_dep
                minibatch_features = self.sess.run([self.model.features],
                                                   feed_dict)
                train_features = minibatch_features[0] if train_features is None \
                    else np.concatenate((train_features, minibatch_features[0]))
                train_labels += minibatch_labels

            np.save(train_features_file, train_features)

        logger.log('Get features from dev set')
        if os.path.exists(dev_features_file):
            dev_features = np.load(dev_features_file)
            _, _, dev_labels, _, _ = self.get_minibatch(
                dev_set, 0, len(dev_set))
        else:
            dev_features = None
            dev_labels = []
            total_batch = (len(dev_set) - 1) // self.batch_size + 1
            for i in tqdm(range(total_batch)):
                minibatch_premise_vectors, minibatch_hypothesis_vectors, minibatch_labels, \
                    minibatch_prem_dep, minibatch_hypo_dep = \
                    self.get_minibatch(dev_set, i * self.batch_size, (i+1) * self.batch_size)
                feed_dict = {
                    self.model.premise_x: minibatch_premise_vectors,
                    self.model.hypothesis_x: minibatch_hypothesis_vectors,
                    self.model.y: minibatch_labels,
                    self.model.keep_rate_ph: 1.0
                }
                if 'dep_avg' in self.model_type:
                    feed_dict[self.model.prem_dep] = minibatch_prem_dep
                    feed_dict[self.model.hypo_dep] = minibatch_hypo_dep
                minibatch_features = self.sess.run([self.model.features],
                                                   feed_dict)
                dev_features = minibatch_features[0] if dev_features is None \
                    else np.concatenate((dev_features, minibatch_features[0]))
                dev_labels += minibatch_labels

            np.save(dev_features_file, dev_features)

        tuned_parameters = {'max_depth': [4, 6, 8], 'n_estimators': [100, 200]}

        best_score = 0.
        best_params = []
        for g in ParameterGrid(tuned_parameters):
            clf = XGBClassifier(nthread=24)
            clf.set_params(**g)
            clf.fit(train_features, train_labels)
            score = clf.score(dev_features, dev_labels)
            logger.log('%s: %f' % (str(g), score))
            if best_score < score:
                best_score = score
                best_params = g
                self.clf = clf

        logger.log('Best score: %s %f' % (str(best_params), best_score))
Example #30
    params = {
        'learning_rate': 0.1,
        'n_estimators': 100,
        'seed': 0,
        'subsample': 1,
        'colsample_bytree': 1,
        'objective': 'binary:logistic',
        'max_depth': 3
    }

    # log model params
    for key in params:
        mlflow.log_param(key, params[key])

    # train XGBoost model
    gbtree = XGBClassifier(**params)
    gbtree.fit(train_features, train_labels)

    importances = gbtree.get_booster().get_fscore()
    print(importances)

    # get predictions
    y_pred = gbtree.predict(test_features)

    accuracy = accuracy_score(test_labels, y_pred)
    print("Accuracy: %.1f%%" % (accuracy * 100.0))

    # log accuracy metric
    mlflow.log_metric("accuracy", accuracy)

    sns.set(font_scale=1.5)
params={'max_depth':6, 'eta':0.05,'objective':'multi:softprob', 'subsample':0.8, 'colsample_bytree':1,'min_child_weight':1,'num_class':3}
num_rounds=206
z=[]
dtrain=xgb.DMatrix(train[features],label=y)
clf=xgb.train(params,dtrain,num_rounds)

importance=clf.get_fscore(fmap='xgb.fmap')
importance=sorted(importance.items(),key=operator.itemgetter(1))
df = pd.DataFrame(importance, columns=['feature', 'fscore'])
df['fscore'] = df['fscore'] / df['fscore'].sum()

bst=list(df['feature'][df.fscore>0.001])
#df.to_csv('select.csv',index=False)
X_train,X_valid,y_train,y_valid=train_test_split(train[bst],y,test_size=0.6,random_state=10)
print ('start xgboost learning...')
alg = XGBClassifier(max_depth=6, learning_rate=0.05, n_estimators=1210, objective='multi:softprob', subsample=0.8, colsample_bytree=1,min_child_weight=1)                    
alg.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)],eval_metric='mlogloss',early_stopping_rounds=10,verbose=True)


#plt.figure()
#df.plot()
#df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(6, 10))
#plt.title('XGBoost Feature Importance')
#plt.xlabel('relative importance')
#plt.gcf().savefig('feature_importance_xgb.png')
y_pred = alg.predict_proba(test[bst])
result=pd.DataFrame(y_pred,columns=['predict_0','predict_1','predict_2'])
result['id']=test.id.values.copy()
#result.to_csv('xgb10.csv',index=False)

Example #32
# *Selection of ML algorithm*: A first approach to deal with imbalanced data is to balance it by discarding the majority class before applying an ML algorithm. The disadvantage of undersampling is that a model trained in this way will not perform well on real-world skewed test data since almost all the information was discarded. A better approach might be to oversample the minority class, say by the synthetic minority oversampling technique (SMOTE) contained in the 'imblearn' library. Motivated by this, I tried a variety of anomaly-detection and supervised learning approaches. I find, however, that the best result is obtained on the original dataset by using an ML algorithm based on ensembles of decision trees that intrinsically performs well on imbalanced data. Such algorithms not only allow for constructing a model that can cope with the missing values in our data, but they naturally allow for speedup via parallel-processing. Among these algorithms, the extreme gradient-boosted (XGBoost) algorithm used below slightly outperforms random-forest. Finally, XGBoost, like several other ML algorithms, allows for weighting the positive class more compared to the negative class --- a setting that also helps account for the skew in the data.

# Split the data into training and test sets in a 80:20 ratio

# In[ ]:


trainX, testX, trainY, testY = train_test_split(X, Y, test_size = 0.2, \
                                                random_state = randomState)
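
# In[ ]:

# Aside (not run in the original notebook): the SMOTE alternative mentioned in the
# markdown above would be applied to the training split only, roughly as sketched
# below. It is left commented out because SMOTE requires fully numeric features
# with no missing values, which this dataset does not satisfy as-is.
# from imblearn.over_sampling import SMOTE
# smote = SMOTE(random_state=randomState)
# trainX_res, trainY_res = smote.fit_resample(trainX, trainY)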

# In[ ]:

# Long computation in this cell (~1.8 minutes)
weights = (Y == 0).sum() / (1.0 * (Y == 1).sum())
clf = XGBClassifier(max_depth = 3, scale_pos_weight = weights, \
                n_jobs = 4)
probabilities = clf.fit(trainX, trainY).predict_proba(testX)
print('AUPRC = {}'.format(average_precision_score(testY, \
                                              probabilities[:, 1])))

# <a href='#top'>back to top</a>

# <a id='importance'></a>
# ##### 6.1. What are the important features for the ML model?
# The figure below shows that the new feature *errorBalanceOrig* that we created is the most relevant feature for the model. The features are ordered based on the number of samples affected by splits on those features.

# In[ ]:

fig = plt.figure(figsize=(14, 9))
ax = fig.add_subplot(111)
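
# A plausible completion of this cell (not from the original notebook): plot the
# fitted classifier's importances with importance_type='cover', which orders
# features by the number of samples affected by splits on them, as described above.
# Assumes `clf`, `ax` and `plt` from the cells above.
from xgboost import plot_importance

plot_importance(clf, ax=ax, importance_type='cover', max_num_features=20)
plt.show()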
Example #33
#  gamma=0,
#  subsample=0.6,
#  colsample_bytree=0.7,
#  objective= 'binary:logistic',
#  scale_pos_weight=1,
#  reg_alpha=0.1,
#  seed=27)
# modelfit(xgb1, df_train, predictors, targetname, early_stopping_rounds=50)


xgb1 = XGBClassifier(
 learning_rate=0.01,
 n_estimators=700,
 max_depth=5,
 min_child_weight=8,
 gamma=0.3,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 scale_pos_weight=1,
 seed=27)



xgb1.fit(df_train[predictors], df_train[targetname])
df_test['target'] = xgb1.predict(df_test[predictors])




df_test['target'] = df_test['target'].apply(lambda x: 'Y' if x==1 else 'N')
        y_train, dtrain_predprob)

    feat_imp = pd.Series(
        alg.get_booster().get_fscore()).sort_values(ascending=False)
    print(feat_imp)
    # feat_imp.plot(kind='bar', title='Feature Importances')
    # plt.ylabel('Feature Importance Score')


# predictors = [x for x in train.columns if x not in [target, IDcol]]
xgb1 = XGBClassifier(learning_rate=0.2,
                     n_estimators=100,
                     max_depth=5,
                     min_child_weight=1,
                     gamma=0,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     objective='reg:linear',
                     n_jobs=4,
                     scale_pos_weight=1,
                     random_state=27)

# modelfit(xgb1, x_train, y_train)

##### Step 2: Tune max_depth and min_child_weight
### description
# We tune these first as they will have the highest impact on model outcome. To start with, let's set wider ranges
# and then we will perform another iteration for smaller ranges.
### note
# GridSearchCV documentation -> http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
# scoring parameters -> http://scikit-learn.org/stable/modules/model_evaluation.html
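
# A minimal sketch of the Step 2 search described above (not from the original code):
# grid-search max_depth and min_child_weight around the xgb1 baseline. Assumes
# x_train / y_train from the surrounding project; the ranges and scoring are illustrative.
from sklearn.model_selection import GridSearchCV

param_test1 = {'max_depth': range(3, 10, 2), 'min_child_weight': range(1, 6, 2)}
gsearch1 = GridSearchCV(estimator=xgb1, param_grid=param_test1,
                        scoring='roc_auc', n_jobs=4, cv=5)
gsearch1.fit(x_train, y_train)
print(gsearch1.best_params_, gsearch1.best_score_)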
data_test = large_set.tail(545421)
data_test = pd.DataFrame(pd.concat([data_test, tag_test_set], axis=1))
print('Finished Reconstructing Train/Test Sets')
print(data_train.shape)
print(data_test.shape)


print('Started Computing train set labels')
label_set = np.sign(label_set['Click'])
label_set[label_set == -1] = 0
print('Finished computing train set labels')

# fit estimator
print "start XGBClassifier"
n_samples = data_train.shape[0]
est=XGBClassifier(n_estimators=200, learning_rate=0.1, silent= False)

print "start fitting"
est.fit(data_train, label_set)
# predict class labels
probs = est.predict_proba(data_test)

print "cross validation start"
cv = cross_validation.ShuffleSplit(n_samples, n_iter=10, random_state=0)
scores = cross_validation.cross_val_score(est, data_train, label_set, cv=cv)
mean = np.mean(probs[:, 1])
std = np.std(probs[:, 1])
print "Test predicted Mean:", mean
print "Test predicted STD:", std
df = pd.DataFrame(probs[:, 1])
df.columns = ["Prediction"]
Example #36
    def dviz_classification_visualization(data_train, target_train,
                                          classifierName):
        clf = tree.DecisionTreeClassifier(max_depth=5, random_state=666)
        clf.fit(data_train, target_train)
        svg_tree = dtreeviz(
            clf,
            data_train,
            target_train,
            target_name=classifierName,
            feature_names=data_train.columns,
            orientation="TD",
            class_names=[classifierName, 'not_' + classifierName],
            fancy=True,
            histtype='strip',
            X=None,
            label_fontsize=12,
            ticks_fontsize=8,
            fontname="Arial")
        fname = os.path.join(
            tree_evaluations_out,
            str(classifierName) + 'fancy_decision_tree_example.svg')
        svg_tree.save(fname)

        print(rounds)
        # READ IN DATA FOLDER AND REMOVE ALL NON-FEATURE VARIABLES (POP DLC COORDINATE DATA AND TARGET DATA)
        print('Reading in ' + str(len(os.listdir(data_folder))) +
              ' annotated files...')
        for i in os.listdir(data_folder):
            if i.__contains__(".csv"):
                currentFn = os.path.join(data_folder, i)
                df = pd.read_csv(currentFn, index_col=0)
                features = features.append(df, ignore_index=True)
                print(features)
        features = features.loc[:, ~features.columns.str.contains('^Unnamed')]
        features = features.drop(["scorer"], axis=1, errors='ignore')
        totalTargetframes = features[classifierName].sum()
        try:
            targetFrame = features.pop(classifierName).values
        except KeyError:
            print(
                'Error: the dataframe does not contain any target annotations. Please check the csv files in the project_folder/csv/target_inserted folder'
            )
        features = features.fillna(0)
        features = drop_bp_cords(features, inifile)
        target_names = []
        loop = 1
        for i in range(model_nos):
            currentModelNames = 'target_name_' + str(loop)
            currentModelNames = config.get('SML settings', currentModelNames)
            if currentModelNames != classifierName:
                target_names.append(currentModelNames)
            loop += 1
        print('# of models to be created: 1')

        for i in range(len(target_names)):
            currentModelName = target_names[i]
            features.pop(currentModelName).values
        class_names = ['Not_' + classifierName, classifierName]
        feature_list = list(features)
        print('# of features in dataset: ' + str(len(feature_list)))

        # IF SET BY USER - PERFORM UNDERSAMPLING AND OVERSAMPLING
        data_train, data_test, target_train, target_test = train_test_split(
            features, targetFrame, test_size=train_test_size)
        under_sample_setting = config.get('create ensemble settings',
                                          'under_sample_setting')
        over_sample_setting = config.get('create ensemble settings',
                                         'over_sample_setting')
        trainDf = data_train
        trainDf[classifierName] = target_train
        targetFrameRows = trainDf.loc[trainDf[classifierName] == 1]
        print('# of ' + str(classifierName) + ' frames in dataset: ' +
              str(totalTargetframes))
        trainDf = trainDf.sample(frac=1).reset_index(drop=True)
        if under_sample_setting == 'Random undersample':
            print('Performing undersampling...')
            under_sample_ratio = config.getfloat('create ensemble settings',
                                                 'under_sample_ratio')
            nonTargetFrameRows = trainDf.loc[trainDf[classifierName] == 0]
            nontargetFrameRowsSize = int(
                len(targetFrameRows) * under_sample_ratio)
            nonTargetFrameRows = nonTargetFrameRows.sample(
                nontargetFrameRowsSize, replace=False)
            trainDf = pd.concat([targetFrameRows, nonTargetFrameRows])
            target_train = trainDf.pop(classifierName).values
            data_train = trainDf
        if under_sample_setting != 'Random undersample':
            target_train = trainDf.pop(classifierName).values
            under_sample_ratio = 'NaN'
        if over_sample_setting == 'SMOTEENN':
            print('Performing SMOTEENN oversampling...')
            over_sample_ratio = config.getfloat('create ensemble settings',
                                                'over_sample_ratio')
            smt = SMOTEENN(sampling_strategy=over_sample_ratio)
            data_train, target_train = smt.fit_sample(data_train, target_train)
        if over_sample_setting == 'SMOTE':
            print('Performing SMOTE oversampling...')
            over_sample_ratio = config.getfloat('create ensemble settings',
                                                'over_sample_ratio')
            smt = SMOTE(sampling_strategy=over_sample_ratio)
            data_train, target_train = smt.fit_sample(data_train, target_train)
        if (over_sample_setting != 'SMOTEENN') and (over_sample_setting !=
                                                    'SMOTE'):
            over_sample_ratio = 'NaN'
        data_train = data_train.sample(frac=1).reset_index(drop=True)
        #target_train = np.random.shuffle(target_train)

        # RUN THE DECISION ENSEMBLE SET BY THE USER
        # run random forest
        if model_to_run == 'RF':
            print('Training model ' + str(classifierName) + '...')
            RF_n_estimators = config.getint('create ensemble settings',
                                            'RF_n_estimators')
            RF_max_features = config.get('create ensemble settings',
                                         'RF_max_features')
            RF_criterion = config.get('create ensemble settings',
                                      'RF_criterion')
            RF_min_sample_leaf = config.getint('create ensemble settings',
                                               'RF_min_sample_leaf')
            clf = RandomForestClassifier(n_estimators=RF_n_estimators,
                                         max_features=RF_max_features,
                                         n_jobs=-1,
                                         criterion=RF_criterion,
                                         min_samples_leaf=RF_min_sample_leaf,
                                         bootstrap=True,
                                         verbose=1)
            try:
                clf.fit(data_train, target_train)
            except ValueError:
                print(
                    'ERROR: The model contains a faulty array. This may happen when trying to train a model with 0 examples of the behavior of interest'
                )

            # predictions = clf.predict_proba(data_test)
            # data_test['probability'] = predictions[:, 1]
            # data_test['prediction'] = np.where(data_test['probability'] > 0.499999, 1, 0)
            # print(data_test['prediction'].sum())

            scoring = ['precision', 'recall', 'f1']
            newDataTargets = np.concatenate((target_train, target_test),
                                            axis=0)
            # #newDataTargets = np.where((newDataTargets == 0) | (newDataTargets == 1), newDataTargets ** 1, newDataTargets)
            # newDataFeatures = np.concatenate((data_train, data_test), axis=0)
            # #newDataFeatures = np.where((newDataFeatures == 0) | (newDataFeatures == 1), newDataFeatures ** 1, newDataFeatures)
            # cv = ShuffleSplit(n_splits=5, test_size=train_test_size)
            # results = cross_validate(clf, newDataFeatures, newDataTargets, cv=cv, scoring=scoring)
            # results = pd.DataFrame.from_dict(results)
            # crossValresultsFname = os.path.join(tree_evaluations_out, str(classifierName) + '_cross_val_100.csv')
            # results.to_csv(crossValresultsFname)

            # #RUN RANDOM FOREST EVALUATIONS
            # compute_permutation_importance = config.get('create ensemble settings', 'compute_permutation_importance')
            # if compute_permutation_importance == 'yes':
            #     print('Calculating permutation importances...')
            #     computePermutationImportance(data_test, target_test, clf)
            #
            # generate_learning_curve = config.get('create ensemble settings', 'generate_learning_curve')
            # if generate_learning_curve == 'yes':
            #     shuffle_splits = config.getint('create ensemble settings', 'LearningCurve_shuffle_k_splits')
            #     dataset_splits = config.getint('create ensemble settings', 'LearningCurve_shuffle_data_splits')
            #     print('Calculating learning curves...')
            #     LearningCurve(features, targetFrame, shuffle_splits, dataset_splits)
            # if generate_learning_curve != 'yes':
            #     shuffle_splits = 'NaN'
            #     dataset_splits = 'NaN'

            # generate_precision_recall_curve = config.get('create ensemble settings', 'generate_precision_recall_curve')
            # if generate_precision_recall_curve == 'yes':
            #     print('Calculating precision recall curve...')
            #     precisionRecallDf = pd.DataFrame()
            #     probabilities = clf.predict_proba(data_test)[:, 1]
            #     precision, recall, thresholds = precision_recall_curve(target_test, probabilities, pos_label=1)
            #     precisionRecallDf['precision'] = precision
            #     precisionRecallDf['recall'] = recall
            #     thresholds = list(thresholds)
            #     thresholds.insert(0, 0.00)
            #     precisionRecallDf['thresholds'] = thresholds
            #     PRCpath = os.path.join(tree_evaluations_out, str(classifierName) + '_precision_recall.csv')
            #     precisionRecallDf.to_csv(PRCpath)
            #
            # generate_example_decision_tree = config.get('create ensemble settings', 'generate_example_decision_tree')
            # if generate_example_decision_tree == 'yes':
            #     print('Generating example decision tree using graphviz...')
            #     estimator = clf.estimators_[3]
            #     generateExampleDecisionTree(estimator)

            generate_classification_report = config.get(
                'create ensemble settings', 'generate_classification_report')
            if generate_classification_report == 'yes':
                print('Generating yellowbrick classification report...')
                generateClassificationReport(clf, class_names, rounds)

            # generate_features_importance_log = config.get('create ensemble settings', 'generate_features_importance_log')
            # if generate_features_importance_log == 'yes':
            #     print('Generating feature importance log...')
            #     importances = list(clf.feature_importances_)
            #     log_df = generateFeatureImportanceLog(importances)
            #
            # generate_features_importance_bar_graph = config.get('create ensemble settings', 'generate_features_importance_bar_graph')
            # if generate_features_importance_bar_graph == 'yes':
            #     N_feature_importance_bars = config.getint('create ensemble settings', 'N_feature_importance_bars')
            #     print('Generating feature importance bar graph...')
            #     generateFeatureImportanceBarGraph(log_df, N_feature_importance_bars)
            # if generate_features_importance_bar_graph != 'yes':
            #     N_feature_importance_bars = 'NaN'

            # generate_example_decision_tree_fancy = config.get('create ensemble settings','generate_example_decision_tree_fancy')
            # if generate_example_decision_tree_fancy == 'yes':
            #     print('Generating fancy decision tree example...')
            #     dviz_classification_visualization(data_train, target_train, classifierName)

            # SAVE MODEL META DATA
            RF_meta_data = config.get('create ensemble settings',
                                      'RF_meta_data')
            if RF_meta_data == 'yes':
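                # NOTE: several entries below (e.g. compute_permutation_importance,
                # generate_learning_curve, shuffle_splits, dataset_splits,
                # N_feature_importance_bars) are only assigned inside the
                # evaluation blocks above, which are commented out here; re-enable
                # those blocks or set defaults before building this list.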
                metaDataList = [
                    classifierName, RF_criterion, RF_max_features,
                    RF_min_sample_leaf, RF_n_estimators,
                    compute_permutation_importance,
                    generate_classification_report,
                    generate_example_decision_tree,
                    generate_features_importance_bar_graph,
                    generate_features_importance_log,
                    generate_precision_recall_curve, RF_meta_data,
                    generate_learning_curve, dataset_splits, shuffle_splits,
                    N_feature_importance_bars, over_sample_ratio,
                    over_sample_setting, train_test_size, under_sample_ratio,
                    under_sample_setting
                ]
                generateMetaData(metaDataList)

        # run gradient boost model
        if model_to_run == 'GBC':
            GBC_n_estimators = config.getint('create ensemble settings',
                                             'GBC_n_estimators')
            GBC_max_features = config.get('create ensemble settings',
                                          'GBC_max_features')
            GBC_max_depth = config.getint('create ensemble settings',
                                          'GBC_max_depth')
            GBC_learning_rate = config.getfloat('create ensemble settings',
                                                'GBC_learning_rate')
            GBC_min_sample_split = config.getint('create ensemble settings',
                                                 'GBC_min_sample_split')
            clf = GradientBoostingClassifier(
                max_depth=GBC_max_depth,
                n_estimators=GBC_n_estimators,
                learning_rate=GBC_learning_rate,
                max_features=GBC_max_features,
                min_samples_split=GBC_min_sample_split,
                verbose=1)
            clf.fit(data_train, target_train)
            clf_pred = clf.predict(data_test)
            print(
                str(classifierName) + str(" Accuracy train: ") +
                str(clf.score(data_train, target_train)))

            generate_example_decision_tree = config.get(
                'create ensemble settings', 'generate_example_decision_tree')
            if generate_example_decision_tree == 'yes':
                estimator = clf.estimators_[3, 0]
                generateExampleDecisionTree(estimator)

            generate_classification_report = config.get(
                'create ensemble settings', 'generate_classification_report')
            if generate_classification_report == 'yes':
                generateClassificationReport(clf, class_names)

            generate_features_importance_log = config.get(
                'create ensemble settings', 'generate_features_importance_log')
            if generate_features_importance_log == 'yes':
                importances = list(clf.feature_importances_)
                log_df = generateFeatureImportanceLog(importances)

            generate_features_importance_bar_graph = config.get(
                'create ensemble settings',
                'generate_features_importance_bar_graph')
            N_feature_importance_bars = config.getint(
                'create ensemble settings', 'N_feature_importance_bars')
            if generate_features_importance_bar_graph == 'yes':
                generateFeatureImportanceBarGraph(log_df,
                                                  N_feature_importance_bars)

        # run XGboost
        if model_to_run == 'XGB':
            XGB_n_estimators = config.getint('create ensemble settings',
                                             'XGB_n_estimators')
            XGB_max_depth = config.getint('create ensemble settings',
                                          'XGB_max_depth')
            XGB_learning_rate = config.getfloat('create ensemble settings',
                                                'XGB_learning_rate')
            clf = XGBClassifier(max_depth=XGB_max_depth,
                                min_child_weight=1,
                                learning_rate=XGB_learning_rate,
                                n_estimators=XGB_n_estimators,
                                silent=0,
                                objective='binary:logistic',
                                max_delta_step=0,
                                subsample=1,
                                colsample_bytree=1,
                                colsample_bylevel=1,
                                reg_alpha=0,
                                reg_lambda=0,
                                scale_pos_weight=1,
                                seed=1,
                                missing=None,
                                verbosity=3)
            clf.fit(data_train, target_train, verbose=True)

        # SAVE MODEL
        modelfn = str(classifierName) + '.sav'
        modelPath = os.path.join(modelDir_out, modelfn)
        pickle.dump(clf, open(modelPath, 'wb'))
        print('Classifier ' + str(classifierName) + ' saved @ ' +
              str('models/generated_models ') + 'folder')
        print(
            'Evaluation files are in models/generated_models/model_evaluations folders'
        )
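For completeness, a short sketch (an assumption, not part of the original) of how the pickled classifier saved above could be reloaded later for inference, reusing the modelPath built in the save step:

# Hedged sketch: reload the saved classifier from models/generated_models
with open(modelPath, 'rb') as model_file:
    loaded_clf = pickle.load(model_file)
# loaded_clf.predict_proba(data_test)   # use like the in-memory clf above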
def main(training_data, test_data):
    # Merging data to ensure consistent cleaning. Putting marker variable to separate later.
    training_data['source'] = 'training'
    test_data['source'] = 'test'
    merged_data = pd.concat([training_data, test_data])

    # Cleaning data
    cleaned_data = data_cleaner(merged_data)

    # Separating data, removing marker
    pred_df = cleaned_data[cleaned_data['source'] == 'training'].copy()
    test_pred = cleaned_data[cleaned_data['source'] == 'test'].copy()

    pred_df.drop('source', axis=1, inplace=True)
    test_pred.drop('source', axis=1, inplace=True)

    # Transforming target into ints, saving the key for later transformation
    labels = LabelEncoder().fit(training_data['country_destination'])
    target_df = pd.Series(labels.transform(
        training_data['country_destination']),
                          index=training_data.index)

    # Training model
    xgb_model = XGBClassifier(max_depth=6,
                              learning_rate=0.3,
                              n_estimators=25,
                              objective='multi:softprob',
                              subsample=0.5,
                              colsample_bytree=0.5,
                              seed=0)
    xgb_model.fit(pred_df.values, target_df.tolist())

    # Running the model
    preds = xgb_model.predict_proba(test_pred.values)

    # Selecting the top 5 most likely for each respondent and stacking.
    # This section is VERY slow and could use being optimized
    model_probs = pd.DataFrame(preds,
                               index=test_pred.index,
                               columns=labels.classes_)

    stacked_probs = pd.DataFrame()
    for i in model_probs.index:
        temp = model_probs.loc[i, :]
        temp_sort = pd.DataFrame(temp.sort_values(ascending=False)[:5].index)

        temp_sort['id'] = i
        temp_sort.columns = ['country', 'id']

        stacked_probs = pd.concat([stacked_probs, temp_sort])

    # # Selecting classes with highest probabilities, compiling into list
    # ids = []
    # cts = []
    # test_ids = pd.Series(test_data.index)
    # for i in range(len(test_ids)):
    #     idx = test_data.index[i]
    #     ids += [idx] * 5
    #     cts += labels.inverse_transform(np.argsort(model_probs[i])[::-1])[:5].tolist()
    #
    # predictions = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])

    # Cleaning output and returning it
    output = stacked_probs[['id', 'country']]
    return output
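The loop above builds the top-5 table by concatenating one small DataFrame per test row, which is what makes it slow. A vectorized sketch (an assumption, not part of the original) that produces the same (id, country) pairs from the model_probs layout used above:

import numpy as np
import pandas as pd

def stack_top_countries(model_probs, k=5):
    # column positions of the k largest probabilities per row, highest first
    top_idx = np.argsort(model_probs.values, axis=1)[:, ::-1][:, :k]
    countries = model_probs.columns.values[top_idx].ravel()   # map positions to class labels
    ids = np.repeat(model_probs.index.values, k)              # each respondent id repeated k times
    return pd.DataFrame({'id': ids, 'country': countries})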
示例#38
0
    'Property_Area_Urban', 'Loan_tot_income_ratio', 'coapplicant_True'
]
X = dfr_train[col]
y = dfr_train['Loan_Status']
'''
# RandomForest 
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators  = 300, max_features=None,criterion = 'entropy',random_state = 0)
RF.fit(X, y)
'''

#Xgboost
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

classifier = XGBClassifier(learning_rate=0.1, n_estimators=10)
classifier.fit(X, y)

#validation
from sklearn.model_selection import cross_val_score

accuracies = cross_val_score(estimator=classifier, X=X, y=y, cv=10)
print(accuracies.mean())
print(accuracies.std())
'''
#ensemble
from sklearn import model_selection

seed = 7
kfold = model_selection.KFold(n_splits=10, random_state=seed)
示例#39
0
File: test1.py Project: mircean/ML
def do_cell(task):
    df_train, df_test, x_start, y_start = task[0], task[1], task[2], task[3]
    #print('do_cell', df_train.shape, df_test.shape, x_start, y_start)

    #train
    n_places_th_local = n_places_th
    n_places_local = n_places

    if n_places != 0:
        tmp = df_train.shape[0]
        value_counts = df_train.place_id.value_counts()[0:n_places]
        df_train = pd.merge(df_train, pd.DataFrame(value_counts), left_on='place_id', right_index=True)[df_train.columns]
        n_places_th_local = value_counts.values[n_places - 1]
        percentage = df_train.shape[0]/tmp

    elif n_places_th != 0:
        value_counts = df_train.place_id.value_counts()
        n_places_local = value_counts[value_counts >= n_places_th_local].count()
        mask = value_counts[df_train.place_id.values] >= n_places_th_local
        percentage = mask.value_counts()[True]/df_train.shape[0]
        df_train = df_train.loc[mask.values]

    else:
        n_places_th_local = 2

        value_counts = df_train.place_id.value_counts()
        n_places_local = value_counts[value_counts >= n_places_th_local].count()
        mask = value_counts[df_train.place_id.values] >= n_places_th_local
        percentage = mask.value_counts()[True]/df_train.shape[0]

        while percentage > n_places_percentage:
            n_places_th_local += 1
            n_places_local = value_counts[value_counts >= n_places_th_local].count()
            mask = value_counts[df_train.place_id.values] >= n_places_th_local
            percentage = mask.value_counts()[True]/df_train.shape[0]

        n_places_th_local -= 1
        n_places_local = value_counts[value_counts >= n_places_th_local].count()
        mask = value_counts[df_train.place_id.values] >= n_places_th_local
        percentage = mask.value_counts()[True]/df_train.shape[0]

        df_train = df_train.loc[mask.values]


    #print(x_start, y_start, n_places_local, n_places_th_local, percentage)
        
    #test
    row_ids = df_test.index
    if 'place_id' in df_test.columns:
        df_test = df_test.drop(['place_id'], axis=1)

    le = LabelEncoder()
    y = le.fit_transform(df_train.place_id.values)
    
    X = df_train.drop(['place_id'], axis=1).values
    X_predict = df_test.values

    score = 0
    n_estimators = 0
    if xgb == 1:    
        if xgb_calculate_n_estimators == True:
            clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=5000, objective='multi:softprob', subsample=ss, colsample_bytree=cs, gamma=gamma, min_child_weight=min_child_weight, reg_lambda=reg_lambda, reg_alpha=reg_alpha)

            if train_test == 1:
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
   
                clf.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric=calculate_score, early_stopping_rounds=early_stopping_rounds, verbose=10 if one_cell == 1 else False)
                score = round(1 - clf.booster().best_score, 6)
                n_estimators = clf.booster().best_ntree_limit
            else:
                abc += 1
                xgb_options = clf.get_xgb_params()
                xgb_options['num_class'] = n_places + 1
                train_dmatrix = DMatrix(X, label=y)

                #some of the classes have less than n_folds, cannot use stratified KFold
                #folds = StratifiedKFold(y, n_folds=n_folds, shuffle=True)
                folds = KFold(len(y), n_folds=n_folds, shuffle=True)
                cv_results = cv(xgb_options, train_dmatrix, clf.n_estimators, early_stopping_rounds=early_stopping_rounds, verbose_eval=10 if one_cell == 1 else False, show_stdv=False, folds=folds, feval=calculate_score)

                n_estimators = cv_results.shape[0]
                score = round(1 - cv_results.values[-1][0], 6)
                std = round(cv_results.values[-1][1], 6)
        else:
            n_estimators = n_estimators_fixed

        clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators, objective='multi:softprob', subsample=ss, colsample_bytree=cs, gamma=gamma, min_child_weight=min_child_weight, reg_lambda=reg_lambda, reg_alpha=reg_alpha)
    else:
        clf = RandomForestClassifier(n_estimators = 300, n_jobs = -1)
        if rf_calculate_score == True:
            if train_test == 1:
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
                y_train2 = le.transform(y_train)
                y_test2 = le.transform(y_test)
    
                clf.fit(X_train, y_train2)
                y_predict = clf.predict_proba(X_test)

                scores_local = []
                for i in range(X_test.shape[0]):
                    score = calculate_score_per_row(y_predict[i], y_test2[i])
                    scores_local.append(score)

                score = np.array(scores_local).mean()
            else:
                #some of the classes have less than n_folds, cannot use stratified KFold
                #folds = StratifiedKFold(y, n_folds=n_folds, shuffle=True)
                folds = KFold(len(y), n_folds=n_folds, shuffle=True)
                scores_cv = []
                for train, test in folds:
                    X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]

                    y_train2 = le.transform(y_train)
                    y_test2 = le.transform(y_test)
    
                    clf.fit(X_train, y_train2)
                    y_predict = clf.predict_proba(X_test)

                    scores_local = []
                    for i in range(X_test.shape[0]):
                        score = calculate_score_per_row(y_predict[i], y_test2[i])
                        scores_local.append(score)

                    score = np.array(scores_local).mean()
                    print('  ', x_start, y_start, score)
                    scores_cv.append(score)

                score = np.array(scores_cv).mean()
    
    #if few_cells == 1 or grid_search == 1:
    #    return [score, None, None]

    clf.fit(X, y)
    y_predict = clf.predict_proba(X_predict)
    ##1
    labels_predict = le.inverse_transform(np.argsort(y_predict, axis=1)[:,::-1][:,:n_topx])    

    print(x_start, y_start, score, n_estimators, n_places_local, n_places_th_local, percentage)

    return [score, row_ids, labels_predict]
示例#40
0
seed = 100
np.random.seed(seed)
random.seed(seed)

X, y = utils.importar_datos()

# ### Final metrics

pipeline = Pipeline([("preprocessor", pp.PreprocessingOHE()),
                     ("model",
                      XGBClassifier(use_label_encoder=False,
                                    scale_pos_weight=1,
                                    subsample=0.8,
                                    colsample_bytree=0.8,
                                    objective="binary:logistic",
                                    n_estimators=1000,
                                    learning_rate=0.01,
                                    n_jobs=-1,
                                    eval_metric="logloss",
                                    min_child_weight=6,
                                    max_depth=6,
                                    reg_alpha=0.05))])

pipeline = utils.entrenar_y_realizar_prediccion_final_con_metricas(
    X, y, pipeline)

# The target metric, AUC-ROC, gives a result similar to the one obtained with LE. However, the false-negative rate increased compared with the other model, so its Recall (and therefore its F1 score) dropped (by 0.09). In turn, the true-negative rate improved slightly.

# ### HoldOut prediction

utils.predecir_holdout_y_generar_csv(pipeline,
                                     'Predicciones/4-XGBoost-OHE.csv')
示例#41
0
from sklearn.model_selection import train_test_split, StratifiedKFold, permutation_test_score
from xgboost.sklearn import XGBClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

data_all = pd.read_csv("G:/GDM/DATA/GDM.csv")

X ,y= data_all.drop(['OGTTgroup1'],axis=1),data_all.OGTTgroup1
X_log ,y_log= data_all.drop(['OGTTgroup1','weight_gain','income','education','DBP',
                             'parity','multi_pregnancy'],axis=1),data_all.OGTTgroup1

X_train,X_test,y_train,y_test = train_test_split(X,y,random_state = 1,test_size=0.3,stratify=y)
X_train_log,X_test_log,y_train_log,y_test_log = train_test_split(X_log,y_log,random_state = 1,test_size=0.3,stratify=y_log)

clf = XGBClassifier(random_state=5361,scale_pos_weight=12.026280323450134,n_estimators=200,max_depth=2,
                    min_child_weight=29,colsample_bytree=0.7,subsample=1,gamma=0,
                    reg_alpha=5,reg_lambda=5,learning_rate=0.1, n_jobs=-1).fit(X_train, y_train)
cv = StratifiedKFold(5)

score, permutation_scores, pvalue = permutation_test_score(
    clf, X_train, y_train, scoring="roc_auc", cv=cv, n_permutations=1000, n_jobs=-1)

#print(pvalue)

clf_log = LogisticRegression(random_state=0,fit_intercept=True, C=1e9,solver = 'newton-cg').fit(X_train_log, y_train_log)

score_log, permutation_scores_log, pvalue_log = permutation_test_score(
    clf_log, X_train_log, y_train_log, scoring="roc_auc", cv=cv, n_permutations=1000, n_jobs=-1)
#print(pvalue_log)
示例#42
0
def model_making_main(file):
    logger.info(">> Start - Model making")

    df = pd.read_csv(config.preprocessed_csv, encoding='UTF-8')
    # If getting an error remove .astype(str)
    select_columns = [
        'recepientemail', 'Gender', 'Age(years)', 'Product Type', 'Weight',
        'Height', 'Habit', 'Face Amount', 'Medication', 'Property',
        'Medical Data', 'Family'
    ]
    df['ColumnA'] = df[select_columns].apply(
        lambda x: ','.join(x.dropna().astype(str)), axis=1)

    logging.info("Remove puncutation, tokenize")
    df['Lemmitize'] = df['ColumnA'].apply(rem_punt).apply(tokenize)

    df['Lemmitize'] = df['Lemmitize'].apply(conversion)
    df.to_csv(config.nlp_processed_csv, index=False, encoding="utf-8")

    df = pd.read_csv(config.nlp_processed_csv)

    X = df['Lemmitize']
    of = pd.read_csv(file, encoding='UTF-8')
    y = of['Offer_noise_free']

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=4)
    vect = TfidfVectorizer(max_df=0.5,
                           max_features=10000,
                           min_df=1,
                           use_idf=True,
                           ngram_range=(1, 2),
                           lowercase=True)
    represent = TfidfVectorizer(max_df=0.5,
                                max_features=10000,
                                min_df=1,
                                use_idf=True,
                                ngram_range=(1, 1),
                                lowercase=True)
    matrix = represent.fit_transform(X.values)
    # visualize(represent,matrix,X,y)
    # print(matrix)

    # for i, feature in enumerate(vect.get_feature_names()):
    #    print(i, feature)

    #va = raw_input()

    model1 = XGBClassifier(nthread=4, n_estimators=1000)
    model3 = RandomForestClassifier(n_estimators=60,
                                    n_jobs=3,
                                    max_features="auto",
                                    min_samples_leaf=50)
    model4 = SVC(kernel='rbf', C=1, gamma=10)
    model5 = LogisticRegression()
    model7 = SGDClassifier(alpha=.0001)
    model_making("XGBOOST", vect, model1, X_train, y_train, X_test, y_test)
    model_making("Random Forest", vect, model3, X_train, y_train, X_test,
                 y_test)
    model_making("SVM", vect, model4, X_train, y_train, X_test, y_test)
    model_making("Logistic Regression", vect, model5, X_train, y_train, X_test,
                 y_test)
    model_making("SGDClassifier", vect, model7, X_train, y_train, X_test,
                 y_test)
    # model_with_SVD(vect,X_train,X_test,y_train,y_test)
    logger.info("<< End - Model making")
示例#43
0
    model = GaussianNB(**vars)  
  elif alg.name == 'LogisticRegression':
    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression(**vars)
  elif alg.name == 'AdaBoost' and alg.type == 'classification':
    from sklearn.ensemble import AdaBoostClassifier
    model = AdaBoostClassifier(**vars)
  elif alg.name == 'GradientBoosting' and alg.type == 'classification':
    from sklearn.ensemble import GradientBoostingClassifier
    model = GradientBoostingClassifier(**vars)
  elif alg.name == 'RandomForest' and alg.type == 'classification':
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(**vars)
  elif alg.name == 'XGBoost' and alg.type == 'classification':
    from xgboost.sklearn import XGBClassifier
    model = XGBClassifier(**vars)   
  elif alg.name == 'CatBoost' and alg.type == 'classification':
    from catboost import CatBoostClassifier
    model = CatBoostClassifier(**vars)       
 
  #-------------------------------------------------------------
  # Regression algorithms   
  elif alg.name == 'TPOT_Regressor':
    from tpot import TPOTRegressor
    model = TPOTRegressor(
        generations=alg.generations,
        cv=alg.cv,
        scoring=alg.scoring,
        verbosity=alg.verbosity
    )
  elif alg.name == 'AutoSklearn_Regressor':
示例#44
0
        )),
        ('scale', MaxAbsScaler()),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])


TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

# Import the hashing vectorizer

p2 = Pipeline([
        ('union', FeatureUnion(
            transformer_list = [
                ('numeric_features', Pipeline([
                    ('selector', get_numeric_data),
                    ('imputer', Imputer())
                ])),
                ('text_features', Pipeline([
                    ('selector', get_text_data),
                    ('vectorizer', HashingVectorizer(token_pattern=TOKENS_ALPHANUMERIC,
                                                     non_negative=True, norm=None, binary=False,
                                                     ngram_range=(1,2))),
                    ('dim_red', SelectKBest(chi2, chi_k))
                ]))
             ]
        )),
        ('int', SparseInteractions(degree=2)),
        ('scale', MaxAbsScaler()),
        ('clf', XGBClassifier())
    ])
示例#45
0
def xgboost_algorithm(XTrain,YTrain,XTest):
    xgb = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25,
                    objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0)                  
    xgb.fit(XTrain, YTrain)
    y_pred_xgboost = xgb.predict_proba(XTest) 
    return y_pred_xgboost
示例#46
0
    feat_imp = pd.Series(
        alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

    #Choose all predictors except target & IDcols


#%% Step 1: Fix learning rate and number of estimators for tuning tree-based parameters
predictors = [x for x in train.columns if x not in [target, IDcol]]
xgb1 = XGBClassifier(learning_rate=0.1,
                     n_estimators=1000,
                     max_depth=5,
                     min_child_weight=1,
                     gamma=0,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     objective='binary:logistic',
                     nthread=4,
                     scale_pos_weight=1,
                     seed=27)
modelfit(xgb1, train, predictors)

param_test1 = {
    'max_depth': list(range(3, 13, 2)),
    'min_child_weight': list(range(1, 7, 2))
}
gsearch1 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1,
                                                n_estimators=140,
                                                max_depth=5,
                                                min_child_weight=1,
df_all = pd.merge(df_all, df_sess_features, how='left', left_on='id', right_on='id')
df_all = df_all.drop(['id'], axis=1)
#release memory
del df_sessions
del device_freq
del action_freq

#Splitting train and test
vals = df_all.values
X = vals[:piv_train]
le = LabelEncoder()
y = le.fit_transform(labels)   
X_test = vals[piv_train:]

#Classifier
xgb = XGBClassifier(max_depth=6, learning_rate=0.25, n_estimators=43,
                    objective='multi:softprob', subsample=0.6, colsample_bytree=0.6, seed=0)                  

print('scores:', NDCG.cross_validation_score(X, labels,xgb,5))
'''
xgb.fit(X, y)
y_pred = xgb.predict_proba(X_test)  

#Taking the 5 classes with highest probabilities
ids = []  #list of ids
cts = []  #list of countries
for i in range(len(id_test)):
    idx = id_test[i]
    ids += [idx] * 5
    cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()

#Generate submission
示例#48
0
# df_All_stat_9 = pd.read_csv("mchnt_ana.csv", sep=',')
# df_All = pd.merge(left=df_All, right=df_All_stat_9, how='left', left_on='certid', right_on='certid')
#########################

label_df = pd.read_csv("train_label_encrypt.csv", sep=",", low_memory=False, error_bad_lines=False)
df_All = pd.merge(left=df_All, right=label_df, how='left', left_on='certid', right_on='certid')

df_All = df_All.fillna(-1)

df_All_train = df_All[(df_All["label"] == 0) | (df_All["label"] == 1)]
df_All_test = df_All[(df_All["label"] != 0) & (df_All["label"] != 1)]


for i in range(2):
    savename = "xgboost_results_1120_" + str(i) + ".csv"
    print(savename)
    df_All_train = shuffle(df_All_train)
    X_train = df_All_train.drop(["certid", "label"], axis=1, inplace=False)
    y_train = df_All_train["label"]
    clf = XGBClassifier(learning_rate =0.1,n_estimators=1000,max_depth=5,gamma=0.01,subsample=0.8,colsample_bytree=0.8,objective= 'binary:logistic', reg_alpha=0.1, reg_lambda=0.1,seed=27)
    clf = clf.fit(X_train, y_train)
    X_test = df_All_test.drop(["certid", "label"], axis=1, inplace=False)
    pred = clf.predict(X_test).T
    cerid_arr = np.array(df_All_test["certid"]).T
    result = np.vstack((cerid_arr,pred))
    np.savetxt(savename,result.T,delimiter=',', fmt = "%s")




示例#49
0
File: train.py Project: guohuiGH/kaggle
def xgbost(x,y,targetx):
    clf_xgb = XGBClassifier(n_estimators=1000,max_depth=6, learning_rate=0.0075,subsample=0.7,colsample_bytree=0.7,seed=4)
    clf_xgb.fit(x,y)
    return clf_xgb.predict_proba(targetx)[:,1]
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)

#ClusterCentroids
cc = ClusterCentroids(random_state=0)
os_X, os_y = cc.fit_sample(X_train, y_train)

#XGboost
clf_XG = XGBClassifier(learning_rate=0.3,
                       min_child_weight=1,
                       max_depth=6,
                       gamma=0,
                       subsample=1,
                       max_delta_step=0,
                       colsample_bytree=1,
                       reg_lambda=1,
                       n_estimators=100,
                       seed=1000,
                       scale_pos_weight=1000)
clf_XG.fit(os_X,
           os_y,
           eval_set=[(os_X, os_y), (X_test, y_test)],
           eval_metric='auc',
           verbose=False)
evals_result = clf_XG.evals_result()
y_true, y_pred = y_test, clf_XG.predict(X_test)

#F1_score, precision, recall, specificity, G score
print("F1_score : %.4g" % metrics.f1_score(y_true, y_pred))
示例#51
0
def build_model(X, y):
    print("Fitting classifier")
    xgb = XGBClassifier(max_depth = 4, learning_rate = 0.25, n_estimators = 25,
                            objective = 'multi:softprob', subsample = 0.6, colsample_bytree = 0.6)
    xgb.fit(X, y)
    return xgb
示例#52
0
    # fit the model
    alg.fit(dtrain[predictors], dtrain['AKI'], eval_metric='auc')

    # predict on the training set
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]

    # print some results for the fitted model
    print("Stopped at iteration: {0}".format(cvresult.shape[0]))
    print("\nAbout the current model")
    print("Accuracy : %.4g" %
          metrics.accuracy_score(dtrain['AKI'].values, dtrain_predictions))
    print("AUC score (train): %f" %
          metrics.roc_auc_score(dtrain['AKI'], dtrain_predprob))


# find the optimal number of trees
predictors = [x for x in re.columns if x not in [target]]
xgb1 = XGBClassifier(learning_rate=0.1,
                     n_estimators=1000,
                     max_depth=5,
                     min_child_weight=1,
                     gamma=0,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     objective='binary:logistic',
                     nthread=4,
                     scale_pos_weight=1,
                     seed=27)
modelfit(xgb1, re, data, predictors)
示例#53
0
def run():
    np.random.seed(0)  # seed to shuffle the train set
    n_folds = 4
#    verbose = True
    shuffle = False

    X,y = get_train_data()
    X_submission = mf.get_test_data()

    if shuffle:
        idx = np.random.permutation(y.size)
        X = X[idx]
        y = y[idx]

    skf = list(StratifiedKFold(y, n_folds))
# parameters can be varied here to generate multiple base models

    clfs = [RandomForestClassifier(n_estimators=500,
                                   max_features=0.8,
                                   bootstrap=True,
                                   min_samples_leaf=50,
                                   oob_score=True,
                                   criterion='gini',
                                   n_jobs=-1),
            RandomForestClassifier(n_estimators=500,
                                   max_features=0.5,
                                   bootstrap=True,
                                   min_samples_leaf=50,
                                   oob_score=True,
                                   criterion='entropy',
                                   n_jobs=-1),
            ExtraTreesClassifier(n_estimators=500,
                                   min_samples_leaf=50,
                                   criterion='gini',
                                   n_jobs=-1),
            ExtraTreesClassifier(n_estimators=500,
                                   min_samples_leaf=50,
                                   criterion='entropy',
                                   n_jobs=-1),
            GradientBoostingClassifier(learning_rate=0.05, 
                                       n_estimators=500,
                                       max_depth=3, 
                                       max_features=0.65, 
                                       subsample=0.7,
                                       random_state=10,
                                       min_samples_split=350,
                                       min_samples_leaf=70),
            GradientBoostingClassifier(learning_rate=0.01, 
                                       n_estimators=1000,
                                       max_depth=4, 
                                       max_features=0.7, 
                                       subsample=0.8,
                                       random_state=10,
                                       min_samples_split=350,
                                       min_samples_leaf=70),
            XGBClassifier(learning_rate=0.05,
                                      n_estimators=350,
                                      gamma=0,
                                      min_child_weight=5,
                                      max_depth=5,
                                      subsample=0.8,
                                      scale_pos_weight=1,
                                      colsample_bytree=0.8,
                                      objective='binary:logistic',
                                      nthread=8,
                                      eval_metric= 'auc',
                                      seed=10),
            XGBClassifier(learning_rate=0.02,
                                      n_estimators=500,
                                      gamma=0,
                                      min_child_weight=5,
                                      max_depth=5,
                                      subsample=0.7,
                                      scale_pos_weight=1,
                                      colsample_bytree=0.7,
                                      objective='binary:logistic',
                                      nthread=8,
                                      eval_metric= 'auc',
                                      seed=10) 
                                      ]

    print ("Creating train and test sets for blending.")

    dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
    dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))

    for j, clf in enumerate(clfs):
        print (j, clf)
        dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
        for i, (train, test) in enumerate(skf):
            print ("Fold", i)
            X_train = X.ix[train,:]
            y_train = y[train]
            X_test = X.ix[test,:]
            y_test = y[test]
            clf.fit(X_train, y_train)
            y_submission = clf.predict_proba(X_test)[:, 1]
            dataset_blend_train[test, j] = y_submission
            print ("train ks_score: ",ks.ks_score(y_submission,y_test))
            dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:, 1]
        dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)

    print()
    print("Blending.")
    clf = LogisticRegression()
    
    clf.fit(dataset_blend_train, y)
    y_submission = clf.predict_proba(dataset_blend_test)[:, 1]

    # dataset_blend_train and y_submission are numpy arrays (and y may be one),
    # so wrap them in pandas objects before writing to csv
    pd.DataFrame(dataset_blend_train).to_csv('H:\\ML\\DC\\user_loan_risk_predict\\predict/dataset_blend_train.csv',index=False)
    pd.Series(y).to_csv('H:\\ML\\DC\\user_loan_risk_predict\\predict/y.csv',index=False)
    pd.Series(y_submission).to_csv('H:\\ML\\DC\\user_loan_risk_predict\\predict/y_submission.csv',index=False)
    X_user_id.to_csv('H:\\ML\\DC\\user_loan_risk_predict\\predict/X_user_id.csv',index=False)
	
    print ("Linear stretch of predictions to [0,1]")
    y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())

    test_pre = pd.DataFrame({u'userid':X_user_id,u'probability':y_submission})
    test_pre = test_pre[['userid','probability']]    
    print (test_pre.head())
    test_pre.to_csv('H:\\ML\\DC\\user_loan_risk_predict\\predict/pre_blending.csv',index=False)
示例#54
0
nclass=3

stime = time.time()

trainc=pd.read_csv('./data/train_lon_lat_predicted.csv',index_col=0)
testc=pd.read_csv('./data/test_lon_lat_predicted.csv',index_col=0)
target=pd.read_csv('./data/target.csv',index_col=0)
nf=10

outcome=target['status_group']       
cclf1=XGBClassifier(max_depth=14,
                    learning_rate=0.0588,
                    n_estimators=250,
                    objective='multi:softprob',
                    nthread=8,
                    gamma=0.6890,
                    min_child_weight=7.6550,
                    subsample=0.8, 
                    colsample_bytree=0.8)

              
cclf2=XGBClassifier(max_depth=15,
                    learning_rate=0.03599,
                    n_estimators=385,
                    objective='multi:softprob',
                    nthread=8,
                    gamma=0.6836,
                    min_child_weight= 4.3704,
                    subsample=0.8, 
                    colsample_bytree=0.8)
示例#55
0
    "first_affiliate_tracked",
    "signup_app",
    "first_device_type",
    "first_browser",
]
X = split_categorical_variables(train, categorical_variables)
y = X.pop("country_destination")
label_table = LabelEncoder()
y = label_table.fit_transform(y.values)


# # Let's try a gradiant boost classifier

# In[56]:

xgb_model = XGBClassifier(max_depth=3, n_estimators=10, learning_rate=0.1)
xgb_model.fit(X, y)


# ## How did we do?
#
# * To start, let's look at how well we did just predicting the final outcome


pred = xgb_model.predict_proba(X)

# Find the most probable country
best_country = []  # Not used for now
bestId = []
for i in range(len(pred)):
    bestId.append(np.argsort(pred[i])[::-1])
#Normalize
X = StandardScaler().fit_transform(X)

# evaluate the model by splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=2017)

kfold = cross_validation.StratifiedKFold(y=y_train,
                                         n_folds=5,
                                         random_state=2017)
num_rounds = 100

clf_XGB = XGBClassifier(n_estimators=num_rounds,
                        objective='binary:logistic',
                        seed=2017)

# use early_stopping_rounds to stop the cv when there is no score imporovement
clf_XGB.fit(X_train,
            y_train,
            early_stopping_rounds=20,
            eval_set=[(X_test, y_test)],
            verbose=False)

results = cross_validation.cross_val_score(clf_XGB, X_train, y_train, cv=kfold)

print("\nxgBoost - CV Train : %.2f" % results.mean())
print("xgBoost - Train : %.2f" %
      metrics.accuracy_score(clf_XGB.predict(X_train), y_train))
print("xgBoost - Test : %.2f" %
# In[ ]:

#Splitting train and test
vals = df_all.values
X = vals[:piv_train]
le = LabelEncoder()
y = le.fit_transform(labels)   
X_test = vals[piv_train:]


# In[ ]:


#Classifier
xgb = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25,
                    objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0)                  
xgb.fit(X, y)
y_pred = xgb.predict_proba(X_test)  


# In[ ]:

ids = []  #list of ids
cts = []  #list of countries
for i in range(len(id_test)):
    idx = id_test[i]
    ids += [idx] * 5
    cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()

#Generate submission
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
                          verbose_eval=True)
        alg.set_params(n_estimators=cvresult.shape[0])

    alg.fit(dtrain[predictors], dtrain['segment'], eval_metric='auc')

    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]

    print('\nModel Report:')
    print('AUC (Train): ', metrics.roc_auc_score(dtrain['segment'],
                                                  dtrain_predprob))

    return alg


print('Training model_1...')
xgb1 = XGBClassifier(learning_rate=0.1,
                     n_estimators=10000,
                     max_depth=4,
                     gamma=0,
                     objective='binary:logistic',
                     seed=27)
model_1 = modelfit(xgb1, train, predictors)

print('Predictions in progress...')
submit = pd.DataFrame()
submit['ID'] = test['ID']
pred_1 = model_1.predict_proba(test[predictors])[:, 1]
submit['segment'] = pred_1
submit.to_csv('submit.csv', index=False)
def pipe_main(pipe=None):
    '''Construct a pipeline from sklearn estimators; the final step currently
    supports only classifiers.

    .. note::
        data flows through a pipeline consisting of the steps below:
            raw data --> clean --> encoding --> scaling --> feature construction
            --> feature selection --> resampling --> final estimator
            see scikit-learn preprocessing & estimators
    parameter
    ----
    pipe - str
        - a string in the format 'xx_xx', where each 'xx' names one step of the
          pipeline; default None
    return
    ----
        1) a Pipeline instance assembled from the chosen steps
        2) if pipe is None, a dict listing the possible choices for each step
    '''
    clean = {
        'clean':
        Split_cls(dtype_filter='not_datetime', na1='null', na2=-999),
        'cleanNA':
        Split_cls(dtype_filter='not_datetime', na1=None, na2=None),
        'cleanMean':
        Split_cls(dtype_filter='not_datetime', na1='most_frequent',
                  na2='mean'),
    }
    #
    encode = {
        'woe': Woe_encoder(max_leaf_nodes=5),
        'oht': Oht_encoder(),
        'ordi': Ordi_encoder(),
    }

    resample = {

        # over_sampling
        'rover':
        RandomOverSampler(),
        'smote':
        SMOTE(),
        'bsmote':
        BorderlineSMOTE(),
        'adasyn':
        ADASYN(),

        # under sampling controlled methods
        'runder':
        RandomUnderSampler(),
        'nearmiss':
        NearMiss(version=3),
        'pcart':
        InstanceHardnessThreshold(),

        # under sampling cleaning methods
        'tlinks':
        TomekLinks(n_jobs=-1),
        'oside':
        OneSidedSelection(n_jobs=-1),
        'cleanNN':
        NeighbourhoodCleaningRule(n_jobs=-1),
        'enn':
        EditedNearestNeighbours(n_jobs=-1),
        'ann':
        AllKNN(n_jobs=-1),
        'cnn':
        CondensedNearestNeighbour(n_jobs=-1),

        # clean outliers
        'inlierForest':
        FunctionSampler(outlier_rejection,
                        kw_args={'method': 'IsolationForest'}),
        'inlierLocal':
        FunctionSampler(outlier_rejection,
                        kw_args={'method': 'LocalOutlierFactor'}),
        'inlierEllip':
        FunctionSampler(outlier_rejection,
                        kw_args={'method': 'EllipticEnvelope'}),
        'inlierOsvm':
        FunctionSampler(outlier_rejection, kw_args={'method': 'OneClassSVM'}),
        # combine
        'smoteenn':
        SMOTEENN(),
        'smotelink':
        SMOTETomek(),
    }

    scale = {
        'stdscale': StandardScaler(),
        'maxscale': MinMaxScaler(),
        'rscale': RobustScaler(quantile_range=(10, 90)),
        'qauntile': QuantileTransformer(),  # uniform distribution
        'power': PowerTransformer(),  # Gaussian distribution
        'norm': Normalizer(),  # default L2 norm

        # scale sparse data
        'maxabs': MaxAbsScaler(),
        'stdscalesp': StandardScaler(with_mean=False),
    }
    # feature construction
    feature_c = {
        'pca': PCA(whiten=True),
        'spca': SparsePCA(normalize_components=True, n_jobs=-1),
        'ipca': IncrementalPCA(whiten=True),
        'kpca': KernelPCA(kernel='rbf', n_jobs=-1),
        'poly': PolynomialFeatures(degree=2),
        'rtembedding': RandomTreesEmbedding(n_estimators=10),
        'LDA': LinearDiscriminantAnalysis(),
        'QDA': QuadraticDiscriminantAnalysis(),
    }
    # select from model
    feature_m = {
        'fwoe':
        SelectFromModel(Woe_encoder(max_leaf_nodes=5)),
        'flog':
        SelectFromModel(
            LogisticRegressionCV(penalty='l1',
                                 solver='saga',
                                 scoring='roc_auc')),
        'fsgd':
        SelectFromModel(SGDClassifier(penalty="l1")),
        'fsvm':
        SelectFromModel(LinearSVC(penalty='l1', dual=False, C=1e-2)),
        'fxgb':
        SelectFromModel(XGBClassifier(n_jobs=-1)),
        'frf':
        SelectFromModel(ExtraTreesClassifier(n_estimators=100, max_depth=5)),
        'fRFExgb':
        RFE(XGBClassifier(n_jobs=-1), step=0.1, n_features_to_select=20),
        'fRFErf':
        RFE(ExtraTreesClassifier(n_estimators=100, max_depth=5),
            step=0.3,
            n_features_to_select=20),
        'fRFElog':
        RFE(LogisticRegressionCV(penalty='l1',
                                 solver='saga',
                                 scoring='roc_auc'),
            step=0.3,
            n_features_to_select=20)
    }
    # Univariate feature selection
    feature_u = {
        'fchi2':
        GenericUnivariateSelect(chi2, 'percentile', 25),
        'fMutualclf':
        GenericUnivariateSelect(mutual_info_classif, 'percentile', 25),
        'fFclf':
        GenericUnivariateSelect(f_classif, 'percentile', 25),
    }
    # sklearn estimator
    t = all_estimators(type_filter=['classifier'])
    estimator = {}
    for i in t:
        try:
            estimator.update({i[0]: i[1]()})
        except Exception:
            continue

    estimator.update(
        dummy=DummyClassifier(),
        XGBClassifier=XGBClassifier(n_jobs=-1),
        LogisticRegressionCV=LogisticRegressionCV(scoring='roc_auc'),
        EasyEnsembleClassifier=EasyEnsembleClassifier(),
        BalancedRandomForestClassifier=BalancedRandomForestClassifier(),
        RUSBoostClassifier=RUSBoostClassifier(),
        SVC=SVC(C=0.01, gamma='auto'))

    if pipe is None:
        feature_s = {}
        feature_s.update(**feature_m, **feature_u)
        return {
            'clean': clean.keys(),
            'encoding': encode.keys(),
            'resample': resample.keys(),
            'scale': scale.keys(),
            'feature_c': feature_c.keys(),
            'feature_s': feature_s.keys(),
            'classifier': estimator.keys()
        }
    elif isinstance(pipe, str):
        l = pipe.split('_')
        all_keys_dict = {}
        all_keys_dict.update(**clean, **encode, **scale, **feature_c,
                             **feature_m, **feature_u, **estimator, **resample)
        steps = []
        for i in l:
            if all_keys_dict.get(i) is not None:
                steps.append((i, all_keys_dict.get(i)))
            else:
                raise KeyError(
                    "'{}' invalid key for sklearn estimators".format(i))
        return Pipeline(steps)

    else:
        raise ValueError("input pipe must be a string in format 'xx[_xx]'")
示例#60
0
from sklearn.tree import DecisionTreeClassifier
from xgboost.sklearn import XGBClassifier

X_train, X_test, y_train, y_test = train_test_split(data,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)

clf = DecisionTreeClassifier()
#we have to define max_depth to prevent overfitting
clf.fit(X_train, y_train)
print("Train Accuracy of clf:", clf.score(X_train, y_train))
print("Test Accuracy of clf", clf.score(X_test, y_test))

xgb = XGBClassifier()
xgb.fit(X_train, y_train)
print("Train Accuracy of xgb:", xgb.score(X_train, y_train))
print("Test Accuracy of xgb:", xgb.score(X_test, y_test))

#%%
from sklearn.model_selection import GridSearchCV

#GridSearch on Xgboost Classifier
param_dict = {
    'max_depth': range(2, 3, 4),
    'min_child_weight': range(1, 2, 6),
    'learning_rate': [0.00001, 0.001, 0.01, 0.1],
    'n_estimators': [10, 50, 100]
}
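The GridSearch call itself is cut off here; a hedged sketch of how param_dict could be wired into GridSearchCV follows (the scoring and cv values are assumptions, not from the original). Note that range(2, 3, 4) and range(1, 2, 6) each yield a single value, so only learning_rate and n_estimators are effectively searched.

grid = GridSearchCV(estimator=XGBClassifier(),
                    param_grid=param_dict,
                    scoring='accuracy',      # assumed; the original does not show it
                    cv=5,
                    n_jobs=-1)
grid.fit(X_train, y_train)
print("best params:", grid.best_params_)
print("best CV accuracy:", grid.best_score_)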