예제 #1
0
import logging
import csv
from sklearn import linear_model
from sklearn.grid_search import GridSearchCV

if __name__ == "__main__":
    # Every log record carries a timestamp, the level, the emitting line
    # number and the message.
    logging.basicConfig(
        format='--%(asctime)s:[%(levelname)s]:%(lineno)d:%(message)s',
        datefmt='%Y/%m/%d %H:%M:%S',
        level=logging.INFO,
    )
    logging.info("train_sgdc_cv.py Start")
    p = PreProcess()

    # Input file; the commented-out alternatives are other data sets that can
    # be swapped in by hand.
    out_filepath = 'data/train_1M.csv.out'
    #out_filepath = 'data/train_s404_100K.out.1vs1'
    #out_filepath = 'data/train_1000.csv.out'

    # Load data: load_train_data returns the feature container X and labels y.
    X, y = p.load_train_data(out_filepath)
    logging.info("Shape X = %r, y =%r", X.shape, y.shape)
    logging.info("example X = %s\ny =%r", X[0], y[0])
    logging.info("classes: %r", list(np.unique(y)))

    # Sampling
    # Original author's note on POWER: "At least 3".
    POWER = 6
    CONST = 1
    # Original author's note on the (disabled) CV setting: "At least 2".
    #CV = 5
    n_subsamples = CONST*10**POWER
    n_size = y.shape[0]
    if n_subsamples < n_size:
        # Keep the first and the last n_subsamples/2 rows. Floor division keeps
        # the bounds integral on Python 3 (where "/" yields a float) and gives
        # the same value as the old integer "/" on Python 2.
        half = n_subsamples // 2
        tail_start = n_size - half
        logging.info("second index : %d to %d", tail_start, n_size)
        # range objects cannot be concatenated with "+" on Python 3, so build
        # explicit lists before joining the head and tail row indices.
        subsample_idx = list(range(half)) + list(range(tail_start, n_size))
        X_small_train = X[subsample_idx]
예제 #2
0
if __name__ == "__main__":
    # Root logger: INFO level, each record prefixed with a timestamp, the
    # level name and the emitting line number.
    log_format = "--%(asctime)s:[%(levelname)s]:%(lineno)d:%(message)s"
    logging.basicConfig(format=log_format, datefmt="%Y/%m/%d %H:%M:%S", level=logging.INFO)
    logging.info("train_test.py Start")
    SAMPLE = 50000  # upper bound on the number of training rows kept
    p = PreProcess()

    train_filepath = "data/train_1M.csv.out"
    # train_filepath = 'data/train_1000.csv.out'
    # test_filepath = 'data/test.csv.out'
    test_filepattern = "data/test_%d_M.out"

    # Read the training features and labels from disk.
    logging.info("Loading train set...")
    X_train, y_train = p.load_train_data(train_filepath)

    # Cap the training set at SAMPLE rows; when fewer rows were loaded,
    # SAMPLE is lowered to the actual row count instead.
    n_loaded = y_train.shape[0]
    if n_loaded <= SAMPLE:
        SAMPLE = n_loaded
    else:
        X_train, y_train = X_train[:SAMPLE], y_train[:SAMPLE]
    logging.info("Sampling %d", SAMPLE)
    logging.info("Shape X_train = %r, y_train =%r", X_train.shape, y_train.shape)
    logging.info("example X_train = %s\ny_train =%r", X_train[0], y_train[0])
    logging.info("classes: %r", list(np.unique(y_train)))

    # Training
    # C is set to 1.0, which the original author marks as the default value.
    C = 1.0
예제 #3
0
from sklearn.neighbors import KNeighborsClassifier
from sklearn.grid_search import GridSearchCV

if __name__ == "__main__":
    # Every log record carries a timestamp, the level, the emitting line
    # number and the message.
    logging.basicConfig(
        format='--%(asctime)s:[%(levelname)s]:%(lineno)d:%(message)s',
        datefmt='%Y/%m/%d %H:%M:%S',
        level=logging.INFO,
    )
    logging.info("train_knn_cv.py Start")
    p = PreProcess()

    # Input file; the commented-out alternatives are other data sets that can
    # be swapped in by hand.
    #out_filepath = 'data/train_1M.csv.out'
    #out_filepath = 'data/train_s404_10K.out'
    out_filepath = 'data/train_1000.csv.out'

    # Load data
    #X, y = p.load_train_data(out_filepath)
    # Load data with category: with category=True the loader returns two
    # extra values besides X and y (presumably a fitted encoder and a
    # category mapping -- confirm against PreProcess.load_train_data).
    X, y, enc, map_dict = p.load_train_data(out_filepath, category = True)
    logging.info("Shape X = %r, y =%r", X.shape, y.shape)
    logging.info("example X = %s\ny =%r", X[0], y[0])
    logging.info("classes: %r", list(np.unique(y)))

    # Sampling
    # Original author's note on POWER: "At least 3".
    POWER = 6
    CONST = 1
    # Original author's note (setting not visible here): "At least 2".
    n_subsamples = CONST*10**POWER
    n_size = y.shape[0]
    if n_subsamples < n_size:
        # Keep the first and the last n_subsamples/2 rows. Floor division keeps
        # the bounds integral on Python 3 (where "/" yields a float) and gives
        # the same value as the old integer "/" on Python 2.
        half = n_subsamples // 2
        tail_start = n_size - half
        logging.info("second index : %d to %d", tail_start, n_size)
        # range objects cannot be concatenated with "+" on Python 3, so build
        # explicit lists; the same index list selects rows from both X and y
        # so features and labels stay aligned.
        subsample_idx = list(range(half)) + list(range(tail_start, n_size))
        X_small_train = X[subsample_idx]
        y_small_train = y[subsample_idx]