Пример #1
0
def rf_model(read_csv=True):
	"""Run a randomized hyper-parameter search for a random forest classifier.

	Loads the train/test frames through get_train_test_data (with session
	features included), uses every training column not listed in
	EXCLUDE_COLS as a feature and 'country_destination' as the target, then
	fits a RandomizedSearchCV over a 20-tree RandomForestClassifier scored
	with ndcg_scorer. The elapsed time is printed and the search scores are
	handed to report().

	Args:
		read_csv: forwarded as the ``cache`` argument of
			get_train_test_data. NOTE(review): presumably toggles reading
			a cached CSV -- confirm against that helper.

	Returns:
		None. Results are only printed / reported.
	"""
	# Call-style print matches the timing print below and yields the same
	# output on Python 2 while also being valid Python 3 syntax.
	print('rf_model')
	train_df, test_df = get_train_test_data(cache=read_csv, include_sessions=True)

	cols = [i for i in train_df.columns if i not in EXCLUDE_COLS]
	X = train_df[cols]
	y = train_df['country_destination']

	clf = RandomForestClassifier(n_estimators=20)

	# Search space. sp_randint(low, high) excludes `high`, so the integer
	# distributions below draw from low..10.
	param_dist = {
		"max_depth": [3, None],
		"max_features": sp_randint(1, 11),
		# An integer min_samples_split must be at least 2: a node cannot be
		# split with fewer samples, and scikit-learn >= 0.18 raises
		# ValueError for 1, which would abort the whole search.
		"min_samples_split": sp_randint(2, 11),
		"min_samples_leaf": sp_randint(1, 11),
		"bootstrap": [True, False],
	}
	n_iter_search = 20
	random_search = RandomizedSearchCV(
		clf,
		param_distributions=param_dist,
		n_iter=n_iter_search,
		scoring=ndcg_scorer,
	)

	start = time()
	random_search.fit(X, y)
	print("RandomizedSearchCV took %.2f seconds for %d candidates"
		  " parameter settings." % ((time() - start), n_iter_search))
	# NOTE(review): grid_scores_ only exists in older scikit-learn releases;
	# confirm the installed version before upgrading.
	report(random_search.grid_scores_)

	# NOTE(review): interactive breakpoint kept on purpose so the fitted
	# search (and test_df) can be inspected; remove it for unattended runs.
	py.test.set_trace()
	return
Пример #2
0
def xgb_model(read_csv=True):
	print 'xgb_model randomcv yr > 2013'
	train_df, test_df = get_train_test_data(cache=read_csv, include_sessions=False)

	train_df = train_df[train_df['tfa_year'] > 2013]
	cols = [i for i in train_df.columns if i not in EXCLUDE_COLS]
	X = train_df[cols]
	y = train_df['country_destination']

	#start classifier
	bst = xgb.XGBClassifier(nthread=4)
	# bst = xgb.XGBClassifier(max_depth=2, nthread=4,
	# 	n_estimators=50,subsample=0.4,learning_rate=0.0.05)
	train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=10000)
	# bst.fit(test_X, test_y)



	param_dist = {"max_depth": [2,4,6],
			  "learning_rate": [0.05, 0.1, 0.15, 0.2],
			  "n_estimators": [30, 50, 70],
			  # "min_samples_leaf": sp_randint(1, 11),
			  # "min_samples_split": sp_randint(1,11),
			  'subsample': [0.4, 0.5, 0.6]
			  # 'max_features': [20, 50, 100]
			  }
	n_iter_search = 20
	random_search = RandomizedSearchCV(bst, param_distributions=param_dist,
								   n_iter=n_iter_search, scoring=ndcg_scorer)
	start = time()
	random_search.fit(test_X, test_y)
	print("RandomizedSearchCV took %.2f seconds for %d candidates"
		  " parameter settings." % ((time() - start), n_iter_search))
	report(random_search.grid_scores_)


	# bagging
	# clfbag = BaggingClassifier(bst, n_estimators=5, max_samples=5000)
	# clfbag.fit(train_X, train_y)
	# y_pred = clfbag.predict_proba(test_X)
	# print 'predicted prob'

	# score = ndcg_score(test_y, y_pred)


	# apply learned model


	# bst.fit(X.values, y)
	# py.test.set_trace()

	test_data = test_df[[i for i in test_df.columns if i not in EXCLUDE_COLS]]
	y_pred = random_search.predict_proba(test_data)
	py.test.set_trace()
	# kaggle_test = pd.read_csv('test.csv')


	sub = create_kaggle_submission(y_pred, test_df['id'], 0.841)
	py.test.set_trace()
	# print 'created kaggle sub'

	# kf = KFold(len(X), n_folds=10, random_state=42)
	# score = cross_val_score(bst, X, y, cv=kf, scoring=ndcg_scorer)

	# param_dist = {"max_depth": [3, None],
	# 		  "max_features": sp_randint(1, 11),
	# 		  "min_samples_split": sp_randint(1, 11),
	# 		  "min_samples_leaf": sp_randint(1, 11),
	# 		  "bootstrap": [True, False],
	# 		  "criterion": ["gini", "entropy"]}
	# n_iter_search = 20
	# random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
	# 							   n_iter=n_iter_search, scoring=ndcg_scorer)
	# start = time()
	# random_search.fit(X, y)
	# print("RandomizedSearchCV took %.2f seconds for %d candidates"
	# 	  " parameter settings." % ((time() - start), n_iter_search))
	# report(random_search.grid_scores_)

	# py.test.set_trace()

	# sub = create_kaggle_submission(y_pred, test_df['id'], np.mean(score))

	# py.test.set_trace()

	#end classifier

	""" trying cross valid