def train_random_forest(X, y):
    """Grid-search and fit a random-forest classifier, report NDCG on a held-out split.

    Args:
        X: feature matrix (array-like, n_samples x n_features).
        y: target labels aligned with X.

    Returns:
        The fitted GridSearchCV wrapper around a RandomForestClassifier
        (use .best_estimator_ / .predict as usual).
    """
    from sklearn.ensemble import RandomForestClassifier
    # sklearn.grid_search / sklearn.cross_validation were removed in
    # scikit-learn 0.20; model_selection is the supported module.
    from sklearn.model_selection import GridSearchCV, train_test_split

    # train - test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)

    # TODO: separate feature selection step

    # fit random forest classifier
    rfc = RandomForestClassifier(n_estimators=2000, n_jobs=-1, oob_score=True)
    # 'auto' was identical to 'sqrt' for classifiers and is removed in
    # sklearn >= 1.3, so searching it only duplicated the 'sqrt' sweep.
    param_grid = {
        'max_features': ['sqrt', 'log2'],
        'min_samples_leaf': [20, 35, 50]
    }
    CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)
    CV_rfc.fit(X_train, y_train)
    print('Random Forest classifier best parameters:', CV_rfc.best_params_)

    # normalized discounted cumulative gain
    predictions = format_predictions(X_test, CV_rfc)
    ndcg = ndcg_n(predictions, y_test)
    print('Random Forest classifier NDCG (on test set): {0:.2f}'.format(ndcg))

    return CV_rfc
def train_logistic_regression(X, y):
    """Fit a multinomial logistic-regression classifier, report NDCG on a held-out split.

    Args:
        X: feature matrix (array-like, n_samples x n_features).
        y: target labels aligned with X.

    Returns:
        The fitted LogisticRegression estimator.
    """
    from sklearn.linear_model import LogisticRegression
    # sklearn.cross_validation was removed in scikit-learn 0.20;
    # model_selection is the supported module.
    from sklearn.model_selection import train_test_split

    # train - test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)

    # TODO: separate feature selection step

    # fit logistic regression (lbfgs supports the multinomial loss + l2 penalty)
    lr = LogisticRegression(multi_class='multinomial', solver='lbfgs', penalty='l2')
    lr.fit(X_train, y_train)

    # normalized discounted cumulative gain
    predictions = format_predictions(X_test, lr)
    ndcg = ndcg_n(predictions, y_test)
    # message previously contained a duplicated colon (": :")
    print('Logistic regression classifier NDCG (on test set): {0:.4f}'.format(ndcg))

    return lr
def train_decision_tree(X, y):
    """Grid-search and fit a decision-tree classifier, report NDCG on a held-out split.

    Args:
        X: feature matrix (array-like, n_samples x n_features).
        y: target labels aligned with X.

    Returns:
        The fitted GridSearchCV wrapper around a DecisionTreeClassifier
        (use .best_estimator_ / .predict as usual).
    """
    from sklearn import tree
    # sklearn.grid_search / sklearn.cross_validation were removed in
    # scikit-learn 0.20; model_selection is the supported module.
    from sklearn.model_selection import GridSearchCV, train_test_split

    # train - test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)

    # TODO: separate feature selection step

    # fit decision tree; named `dtc` so it no longer shadows the
    # imported `tree` module
    dtc = tree.DecisionTreeClassifier()
    param_grid = {'max_depth': [3, 5, 7, 9]}
    CV_tree = GridSearchCV(estimator=dtc, param_grid=param_grid, cv=5)
    CV_tree.fit(X_train, y_train)
    print('Decision tree classifier best parameters:', CV_tree.best_params_)

    # normalized discounted cumulative gain
    predictions = format_predictions(X_test, CV_tree)
    ndcg = ndcg_n(predictions, y_test)
    print('Decision tree classifier NDCG (on test set): {0:.2f}'.format(ndcg))

    return CV_tree