import logging
import csv

from sklearn import linear_model

try:
    from sklearn.grid_search import GridSearchCV
except ImportError:
    # sklearn.grid_search was removed in scikit-learn 0.20; GridSearchCV
    # lives in sklearn.model_selection in current releases.
    from sklearn.model_selection import GridSearchCV

# NOTE(review): `np` and `PreProcess` are used below but are not imported in
# the visible part of this script -- confirm they are brought into scope.

if __name__ == "__main__":
    # Script entry point: load the pre-processed training set, log a short
    # summary of it, then carve out a head+tail subsample for later use.
    logging.basicConfig(
        format='--%(asctime)s:[%(levelname)s]:%(lineno)d:%(message)s',
        datefmt='%Y/%m/%d %H:%M:%S',
        level=logging.INFO,
    )
    logging.info("train_sgdc_cv.py Start")

    p = PreProcess()

    out_filepath = 'data/train_1M.csv.out'
    # Alternative inputs kept for reference:
    #   out_filepath = 'data/train_s404_100K.out.1vs1'
    #   out_filepath = 'data/train_1000.csv.out'

    # Load data
    X, y = p.load_train_data(out_filepath)

    logging.info("Shape X = %r, y =%r", X.shape, y.shape)
    logging.info("example X = %s\ny =%r", X[0], y[0])
    logging.info("classes: %r", list(np.unique(y)))

    # Sampling: the subsample size is CONST * 10**POWER rows.
    POWER = 6  # at least 3
    CONST = 1
    # CV = 5  (at least 2) -- currently disabled

    n_subsamples = CONST * 10 ** POWER
    n_size = y.shape[0]

    if n_subsamples < n_size:
        # Floor division keeps `half` an int on Python 3 as well; with `/`
        # it becomes a float there and range() raises TypeError.
        half = n_subsamples // 2
        logging.info("second index : %d to %d", n_size - half, n_size)

        # Take the first `half` rows plus the last `half` rows. The ranges
        # are materialised as lists because Python 3 range objects cannot
        # be concatenated with `+`.
        subsample_idx = list(range(half)) + list(range(n_size - half, n_size))
        X_small_train = X[subsample_idx]
if __name__ == "__main__":
    # Script entry point: configure logging, load the training set and cap
    # it at SAMPLE rows before the training step that follows.
    logging.basicConfig(
        level=logging.INFO,
        datefmt="%Y/%m/%d %H:%M:%S",
        format="--%(asctime)s:[%(levelname)s]:%(lineno)d:%(message)s",
    )
    logging.info("train_test.py Start")

    SAMPLE = 50000  # maximum number of training rows to keep
    p = PreProcess()

    train_filepath = "data/train_1M.csv.out"
    # Alternative inputs kept for reference:
    #   train_filepath = 'data/train_1000.csv.out'
    #   test_filepath = 'data/test.csv.out'
    test_filepattern = "data/test_%d_M.out"

    # Load train data
    logging.info("Loading train set...")
    X_train, y_train = p.load_train_data(train_filepath)

    # Sampling: keep only the first SAMPLE rows when more are available;
    # otherwise record the real row count in SAMPLE so the log is accurate.
    n_rows = y_train.shape[0]
    if n_rows <= SAMPLE:
        SAMPLE = n_rows
    else:
        X_train, y_train = X_train[:SAMPLE], y_train[:SAMPLE]
    logging.info("Sampling %d", SAMPLE)

    # Summarise what will be fed to the model.
    logging.info(
        "Shape X_train = %r, y_train =%r", X_train.shape, y_train.shape
    )
    logging.info("example X_train = %s\ny_train =%r", X_train[0], y_train[0])
    logging.info("classes: %r", list(np.unique(y_train)))

    # Training
    # Default C = 1.0
from sklearn.neighbors import KNeighborsClassifier

try:
    from sklearn.grid_search import GridSearchCV
except ImportError:
    # sklearn.grid_search was removed in scikit-learn 0.20; GridSearchCV
    # lives in sklearn.model_selection in current releases.
    from sklearn.model_selection import GridSearchCV

if __name__ == "__main__":
    # Script entry point: load the pre-processed training set together with
    # its category encoder, log a short summary, then carve out a head+tail
    # subsample when the data set is larger than the requested size.
    logging.basicConfig(
        format='--%(asctime)s:[%(levelname)s]:%(lineno)d:%(message)s',
        datefmt='%Y/%m/%d %H:%M:%S',
        level=logging.INFO,
    )
    logging.info("train_knn_cv.py Start")

    p = PreProcess()

    # Alternative inputs kept for reference:
    #   out_filepath = 'data/train_1M.csv.out'
    #   out_filepath = 'data/train_s404_10K.out'
    out_filepath = 'data/train_1000.csv.out'

    # Load data with category. The plain variant was:
    #   X, y = p.load_train_data(out_filepath)
    X, y, enc, map_dict = p.load_train_data(out_filepath, category=True)

    logging.info("Shape X = %r, y =%r", X.shape, y.shape)
    logging.info("example X = %s\ny =%r", X[0], y[0])
    logging.info("classes: %r", list(np.unique(y)))

    # Sampling: the subsample size is CONST * 10**POWER rows.
    POWER = 6  # at least 3
    CONST = 1
    # (original note here read "At least 2"; presumably it referred to a
    # CV fold count that is no longer defined in this script -- confirm)

    n_subsamples = CONST * 10 ** POWER
    n_size = y.shape[0]

    if n_subsamples < n_size:
        # Floor division keeps `half` an int on Python 3 as well; with `/`
        # it becomes a float there and range() raises TypeError.
        half = n_subsamples // 2
        logging.info("second index : %d to %d", n_size - half, n_size)

        # First `half` rows plus last `half` rows. The ranges are
        # materialised as lists because Python 3 range objects cannot be
        # concatenated with `+`; the same index list is used for X and y
        # so features and labels stay aligned.
        subsample_idx = list(range(half)) + list(range(n_size - half, n_size))
        X_small_train = X[subsample_idx]
        y_small_train = y[subsample_idx]