# Example 1
def train_random_forest(X, y):
    """Train a grid-searched random forest classifier and report test NDCG.

    Splits (X, y) into a 90/10 train/test split, grid-searches
    ``max_features`` and ``min_samples_leaf`` with 5-fold CV, prints the
    best parameters and the NDCG on the held-out split, and returns the
    fitted ``GridSearchCV`` wrapper.

    Relies on module-level helpers ``format_predictions`` and ``ndcg_n``.
    """
    from sklearn.ensemble import RandomForestClassifier
    # sklearn.grid_search / sklearn.cross_validation were deprecated in 0.18
    # and removed in 0.20; model_selection is the supported location.
    from sklearn.model_selection import GridSearchCV, train_test_split

    # train - test split (10% held out for evaluation)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)

    # TODO: separate feature selection step

    # fit random forest classifier; oob_score gives a free generalization
    # estimate, n_jobs=-1 parallelizes across all cores
    rfc = RandomForestClassifier(n_estimators=2000, n_jobs=-1, oob_score=True)
    # 'auto' was an alias for 'sqrt' on classifiers (removed in sklearn 1.3),
    # so only the two distinct options are worth searching.
    param_grid = {
        'max_features': ['sqrt', 'log2'],
        'min_samples_leaf': [20, 35, 50]
    }
    CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)
    CV_rfc.fit(X_train, y_train)
    print('Random Forest classifier best parameters:', CV_rfc.best_params_)

    # normalized discounted cumulative gain on the held-out split
    predictions = format_predictions(X_test, CV_rfc)
    ndcg = ndcg_n(predictions, y_test)
    print('Random Forest classifier NDCG (on test set): {0:.2f}'.format(ndcg))

    return CV_rfc
# Example 2
def train_logistic_regression(X, y):
    """Train a multinomial logistic regression and report test NDCG.

    Splits (X, y) into a 90/10 train/test split, fits an L2-penalized
    multinomial logistic regression (lbfgs solver), prints the NDCG on the
    held-out split, and returns the fitted model.

    Relies on module-level helpers ``format_predictions`` and ``ndcg_n``.
    """
    from sklearn.linear_model import LogisticRegression
    # sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
    # model_selection is the supported location.
    from sklearn.model_selection import train_test_split

    # train - test split (10% held out for evaluation)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)

    # TODO: separate feature selection step

    # fit logistic regression; lbfgs is required for multinomial loss
    lr = LogisticRegression(multi_class='multinomial', solver='lbfgs', penalty='l2')
    lr.fit(X_train, y_train)

    # normalized discounted cumulative gain on the held-out split
    predictions = format_predictions(X_test, lr)
    ndcg = ndcg_n(predictions, y_test)
    # (fixed doubled colon in the original message)
    print('Logistic regression classifier NDCG (on test set): {0:.4f}'.format(ndcg))

    return lr
# Example 3
def train_decision_tree(X, y):
    """Train a grid-searched decision tree classifier and report test NDCG.

    Splits (X, y) into a 90/10 train/test split, grid-searches
    ``max_depth`` with 5-fold CV, prints the best parameters and the NDCG
    on the held-out split, and returns the fitted ``GridSearchCV`` wrapper.

    Relies on module-level helpers ``format_predictions`` and ``ndcg_n``.
    """
    # Import the class directly so no local variable shadows the
    # ``sklearn.tree`` module (the original rebound the name ``tree``).
    from sklearn.tree import DecisionTreeClassifier
    # sklearn.grid_search / sklearn.cross_validation were deprecated in 0.18
    # and removed in 0.20; model_selection is the supported location.
    from sklearn.model_selection import GridSearchCV, train_test_split

    # train - test split (10% held out for evaluation)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)

    # TODO: separate feature selection step

    # fit decision tree, tuning only its depth
    dtc = DecisionTreeClassifier()
    param_grid = {'max_depth': [3, 5, 7, 9]}
    CV_tree = GridSearchCV(estimator=dtc, param_grid=param_grid, cv=5)
    CV_tree.fit(X_train, y_train)
    print('Decision tree classifier best parameters:', CV_tree.best_params_)

    # normalized discounted cumulative gain on the held-out split
    predictions = format_predictions(X_test, CV_tree)
    ndcg = ndcg_n(predictions, y_test)
    print('Decision tree classifier NDCG (on test set): {0:.2f}'.format(ndcg))

    return CV_tree