Example #1
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier

# prepare_data and build_base_features are project-local helpers
# (prepare_data is imported from the extraction module in the later examples).
def build_extratrees_features():
    X, y, X_holdout, _ = prepare_data("./data", drop_categorical=False)

    print "Getting OOB predictions from ExtraTreesClassifier"
    clf = ExtraTreesClassifier(n_estimators=500, max_features=50, criterion='entropy',
                               min_samples_split=5, max_depth=50, min_samples_leaf=5, n_jobs=4)
    X_1, X_2 = build_base_features(clf, X, X_holdout, y, 10)
    np.vstack((X_1, X_2)).tofile('./features/extra_trees_oob.npy')
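Examples #1, #3 and #5 all pass an estimator to build_base_features with a fold count of 10 and stack the two returned arrays. The helper itself does not appear in this listing; the following is a minimal sketch of what it presumably does, assuming a binary target (cf. the binary:logistic objective in Example #7): out-of-fold positive-class probabilities for the training rows, and fold-averaged probabilities for the holdout rows.

import numpy as np
from sklearn.cross_validation import KFold

def build_base_features(clf, X, X_holdout, y, n_folds):
    # Out-of-fold predictions for the training rows: each row is predicted
    # by a model that never saw it during fitting.
    oof = np.zeros(X.shape[0])
    holdout_preds = np.zeros((n_folds, X_holdout.shape[0]))
    for i, (train_idx, test_idx) in enumerate(KFold(X.shape[0], n_folds=n_folds, shuffle=True)):
        clf.fit(X[train_idx], y[train_idx])
        oof[test_idx] = clf.predict_proba(X[test_idx])[:, 1]
        holdout_preds[i] = clf.predict_proba(X_holdout)[:, 1]
    # Average the per-fold holdout predictions into a single feature column.
    return oof, holdout_preds.mean(axis=0)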
Example #2
from sklearn.preprocessing import OneHotEncoder

def get_sparse_onehot_features():
    X, y, X_holdout, ids = prepare_data("./data", drop_categorical=False)
    cat_idx = get_cat_columns()  # project-local helper: indices of the categorical columns
    encoder = OneHotEncoder(categorical_features=cat_idx, sparse=True, handle_unknown="ignore")

    # OneHotEncoder expects non-negative integer categories, so shift the
    # categorical columns up by one (e.g. a -1 "missing" code becomes 0).
    X[:, cat_idx] = X[:, cat_idx] + 1
    X_holdout[:, cat_idx] = X_holdout[:, cat_idx] + 1
    X = encoder.fit_transform(X)
    X_holdout = encoder.transform(X_holdout)

    return X.tocsr(), y, X_holdout.tocsr(), ids
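Since the return values are CSR matrices, they can go straight into any estimator that accepts sparse input; a short usage sketch (LogisticRegression here is just an illustration, not the project's choice):

from sklearn.linear_model import LogisticRegression

X, y, X_holdout, ids = get_sparse_onehot_features()
clf = LogisticRegression(C=0.1)
clf.fit(X, y)  # sparse CSR input works without densifying
probs = clf.predict_proba(X_holdout)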
Example #3
import numpy as np
from sklearn.naive_bayes import MultinomialNB

def build_multinomial_nb_features():
    X, y, X_holdout, _ = prepare_data("./data", drop_categorical=False)

    # MultinomialNB requires non-negative features, so keep only the
    # categorical columns and shift them up by one.
    cat_idx = get_cat_columns()
    X, X_holdout = X[:, cat_idx], X_holdout[:, cat_idx]
    X = X + 1
    X_holdout = X_holdout + 1

    print "Getting OOB predictions from mNB"
    clf = MultinomialNB(alpha=1)
    X_1, X_2 = build_base_features(clf, X, X_holdout, y, 10)
    np.vstack((X_1, X_2)).tofile('./features/NB_oob.npy')
Example #4
import numpy as np
from time import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss

def build_rf_features():
    X, y, X_holdout, ids = prepare_data("./data/", drop_categorical=False)

    # Append the precomputed extra features (see the extraction module).
    X1, X2 = load_extra_features()
    X = np.hstack((X, X1))
    X_holdout = np.hstack((X_holdout, X2))

    X, X_test, y, y_test = train_test_split(X, y, test_size=0.2)

    for max_depth in range(1, 15):
        print "max depth {}".format(max_depth)
        t0 = time()  # time each fit separately
        rf_clf = RandomForestClassifier(n_estimators=200, max_depth=max_depth, criterion="entropy", n_jobs=-1)
        rf_clf.fit(X, y)

        print "Done in %0.3fs" % (time() - t0)
        print log_loss(y_test, rf_clf.predict_proba(X_test))
Example #5
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

def build_knn_features():
    X, y, X_holdout, _ = prepare_data("./data", drop_categorical=True)

    n_rows = X.shape[0]

    # Scale train and holdout together so both use the same statistics,
    # then split the stacked matrix back apart.
    scaler = StandardScaler()
    Z = scaler.fit_transform(np.vstack((X, X_holdout)))

    X = Z[:n_rows]
    X_test = Z[n_rows:]

    for k in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]:
        print "Getting OOB from KNN for k={}".format(k)
        clf = KNeighborsClassifier(k, n_jobs=-1)
        X_1, X_2 = build_base_features(clf, X, X_test, y, 10)

        M = np.vstack((X_1, X_2))
        M.tofile('./features/knn_oob_{}.npy'.format(k))
Example #6
from time import time
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss

# find_cv_rf_model and util are project-local helpers.
def build_rf_submission():
    X, y, X_holdout, ids = prepare_data("./data/", drop_categorical=False)
    # Hold out an extra test split to sanity-check the CV estimates.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42)
    print "Running Random Forest with {} data points and {} features.".format(X_train.shape[0], X_train.shape[1])
    t0 = time()
    grid_cv = find_cv_rf_model(X_train, y_train, grid=False)  # randomized rather than grid search
    best_clf = grid_cv.best_estimator_
    y_pred = best_clf.predict_proba(X_test)
    print "Done in %0.3fs" % (time() - t0)
    print "Best params: {}".format(grid_cv.best_params_)
    print "Best CV score: {}".format(grid_cv.best_score_)
    print "Training log-loss: {}".format(log_loss(y_train, best_clf.predict_proba(X_train)))
    print "Training accuracy: {}".format(best_clf.score(X_train, y_train))
    print "Test log-loss: {}".format(log_loss(y_test, y_pred))
    print "Test accuracy: {}".format(best_clf.score(X_test, y_test))

    submission_name = "submission_{}.csv".format(time())
    util.note_submission_info("Model: {}".format(best_clf), submission_name)
    util.build_submission(best_clf, X_holdout, ids, submission_name)
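find_cv_rf_model is not part of this listing. Given the grid=False flag and the "stochastic search" remark, a plausible sketch of its randomized branch using sklearn's RandomizedSearchCV; the search space below is an illustrative assumption, not the project's actual one:

from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import RandomizedSearchCV

def find_cv_rf_model(X, y, grid=False):
    # Illustrative hyperparameter distributions; only the randomized
    # (grid=False) branch is sketched here.
    param_dist = {
        "max_depth": randint(3, 20),
        "min_samples_leaf": randint(1, 10),
        "max_features": ["sqrt", "log2", None],
    }
    search = RandomizedSearchCV(RandomForestClassifier(n_estimators=200, n_jobs=-1),
                                param_distributions=param_dist, n_iter=20,
                                scoring="log_loss", cv=5)
    search.fit(X, y)
    return search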
Example #7
from extraction import prepare_data, load_extra_features
import xgboost as xgb
import numpy as np

if __name__ == "__main__":
    X, y, X_holdout, ids = prepare_data("./data/", drop_categorical=False)
    X_extra, X_holdout_extra = load_extra_features()

    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "eta": 0.005,  # 0.01
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "min_child_weight": 1,
        "max_depth": 10
    }

    xg_train = xgb.DMatrix(np.hstack((X, X_extra)), label=y)
    xg_test = xgb.DMatrix(np.hstack((X_holdout, X_holdout_extra)))

    #xg_train = xgb.DMatrix(X, label=y)

    xgb_clf = xgb.train(params, xg_train, num_boost_round=2500, verbose_eval=True, maximize=False)

    y_pred = xgb_clf.predict(xg_test)  # , ntree_limit=xgb_clf.best_iteration
    #cv_scores = xgb.cv(params, xg_train, num_boost_round=100, nfold=5, metrics="logloss", seed=42, early_stopping_rounds=5)
    #print cv_scores
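The commented-out xgb.cv call above hints at how the number of boosting rounds could be chosen instead of hard-coding 2500; a runnable version of that step, where the fold count and early-stopping patience are assumptions:

# Cross-validate to pick num_boost_round; xgb.cv stops adding rounds once
# test logloss has not improved for 50 consecutive rounds.
cv_scores = xgb.cv(params, xg_train, num_boost_round=2500, nfold=5,
                   metrics="logloss", seed=42, early_stopping_rounds=50)
print "Rounds kept after early stopping: {}".format(len(cv_scores))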
Example #8
from sklearn.decomposition import TruncatedSVD
import numpy as np
from extraction import prepare_data

if __name__ == "__main__":
    X, _, X_holdout, _ = prepare_data("./data/", drop_categorical=False)

    # Fit the SVD on train and holdout stacked together so both are
    # projected into the same 20-dimensional space.
    A = np.vstack((X, X_holdout))

    print "Applying SVD"
    svd = TruncatedSVD(20)
    B = svd.fit_transform(A)
    print B.shape

    for col in xrange(B.shape[1]):
        B[:, col].tofile("./features/svd_{}.npy".format(col))
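Note that ndarray.tofile writes raw bytes with no dtype or shape header, despite the .npy extension used throughout these examples, so np.load will not read these files back. A sketch of loading one saved component, assuming float64 (numpy's default floating-point dtype):

import numpy as np

# tofile() stored raw float64 values without the .npy header, so use
# np.fromfile with a matching dtype rather than np.load.
col = np.fromfile("./features/svd_0.npy", dtype=np.float64)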
Example #9
from time import time

import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

from extraction import prepare_data, get_int_feature_columns
from visualization.learning_curve import plot_learning_curve

if __name__ == "__main__":
    X, y, _, _ = prepare_data("../data", drop_categorical=False)

    cat_idx = get_int_feature_columns()
    encoder = OneHotEncoder(categorical_features=cat_idx, sparse=True)
    X = encoder.fit_transform(X)

    plt = plot_learning_curve(estimator=LogisticRegression(C=0.1, penalty='l1'),
                              title="Learning Curves of LogReg with logloss",
                              X=X, y=y,
                              cv=5,
                              n_jobs=7,
                              scoring="log_loss")

    plt.savefig("../images/learning_curve_logreg_{}.png".format(time()))
    plt.show()