Example #1
import operator

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize, scale, Binarizer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier


def model_xgb_grid_search(features_train, labels_train, features_test):
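	"""Train an XGBoost classifier with grid search; return test-set predictions.

	Expects features_train and features_test as pandas DataFrames and
	labels_train as a binary target aligned with features_train.
	"""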
	
	# Feature normalization
	f_train_normalized = normalize(features_train, axis=0)
	f_test_normalized = normalize(features_test, axis=0)

	# PCA: project the normalized features onto two components and append them as extra columns
	pca = PCA(n_components=2)
	f_train_pca = pca.fit_transform(f_train_normalized)
	features_train['PCA1'] = f_train_pca[:,0]
	features_train['PCA2'] = f_train_pca[:,1]
	
	# Reuse the PCA fitted on the training data; refitting on the test set
	# would yield different components and leak test information
	f_test_pca = pca.transform(f_test_normalized)
	features_test['PCA1'] = f_test_pca[:,0]
	features_test['PCA2'] = f_test_pca[:,1]

	# Feature selection
	#p = 75, AUC = 0.822000
	p = 70 # AUC = 0.833136
	#p = 65, AUC = 0.832403
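	# chi2 requires non-negative inputs, so the zero-mean scaled features
	# are binarized (threshold 0) before the chi-squared test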
	f_train_binarized = Binarizer().fit_transform(scale(features_train))
	select = SelectPercentile(chi2, percentile=p).fit(f_train_binarized, labels_train)
	selected = select.get_support() # boolean mask: True where a feature was kept
	selected_features = features_train.columns[selected].tolist()
	#print (selected_features)

	features_train = features_train[selected_features]
	features_test = features_test[selected_features]

	# xgboost with GridSearch
	
	xgb_model = XGBClassifier()
	params = {
		"max_depth": [2, 5, 10],
		"min_child_weight": [1, 2, 6]
	}
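	# 3 x 3 = 9 candidate settings, each evaluated with 5-fold CV on ROC AUC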
	clf = GridSearchCV(estimator=xgb_model, param_grid=params, scoring='roc_auc', n_jobs=4, cv=5)
	clf.fit(features_train, labels_train)
	print(clf.cv_results_['mean_test_score'], clf.best_params_, clf.best_score_)

	# Get feature importances from the best estimator's booster, as {feature: fscore} pairs
	importance = clf.best_estimator_.get_booster().get_fscore()

	# Sort features by importance, and return the top features only
	# 'key' parameter specifies a function to be called on each list element prior to making comparisons
	# itemgetter(1) returns importances, itemgetter(0) returns features
	sorted_importance = sorted(importance.items(), key=operator.itemgetter(1))[-15:]
	print(sorted_importance)

	# Put pairs of features and their importances into a DataFrame for plotting
	df_importance = pd.DataFrame(sorted_importance, columns=['feature', 'fscore'])

	# Plot the importance of features, which is useful for data exploration phase
	df_importance.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(20, 6))
	plt.title('XGBoost Feature Importance')
	plt.xlabel('feature importance')
	plt.gcf().savefig('feature_importance_xgb.png')
	#plt.show() # calling show() before gcf().savefig() clears the figure, so nothing would be saved

	return clf.predict(features_test)
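
A minimal usage sketch for context: the file names and the 'label' column below are hypothetical placeholders, not from the original; the function only assumes pandas DataFrames for the features and a binary target for the labels.

import pandas as pd

# Hypothetical dataset layout: a train.csv with a binary 'label' column
# and a test.csv with the same feature columns
df_train = pd.read_csv('train.csv')
labels_train = df_train['label']
features_train = df_train.drop(columns=['label'])
features_test = pd.read_csv('test.csv')

predictions = model_xgb_grid_search(features_train, labels_train, features_test)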