def test_ensure_unit_variance(self):
    data = np.linspace(1, 50).reshape((10, 5))
    stds = [np.std(column) for column in data.T]
    uv_data = dp.ensure_unit_variance(data, [0, 2, 3])
    data[:, 0] = (data[:, 0] - np.mean(data[:, 0])) / stds[0]
    data[:, 2] = (data[:, 2] - np.mean(data[:, 2])) / stds[2]
    data[:, 3] = (data[:, 3] - np.mean(data[:, 3])) / stds[3]
    for target, actual in zip(data.T, uv_data.T):
        self.assertAlmostEqual(np.mean(actual), np.mean(target))
        self.assertAlmostEqual(np.std(actual), np.std(target))
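
# For reference, a minimal sketch of the behaviour this test assumes from
# data_processing.ensure_unit_variance (hypothetical helper, not the actual implementation):
# every listed column is shifted to zero mean and scaled to unit variance, all other
# columns are left untouched, and a new array is returned.
def _ensure_unit_variance_sketch(data, columns_to_normalize):
    result = np.array(data, dtype=float)  # work on a copy
    for col in columns_to_normalize:
        result[:, col] = (result[:, col] - np.mean(result[:, col])) / np.std(result[:, col])
    return result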
########## Training phase ##########
# In this phase the original (labeled) training data is split into a new set of training and test data.
# These sets are used to determine the optimal parameters (number of neighbors) for a NN classifier.
# For each tested classifier, the score is calculated and printed. Additionally, the log loss as applied
# by kaggle is calculated and printed for the best classifier.

import importer
import data_processing as dapo
import sklearn.cross_validation as cv

train_path = 'data/train.csv'
predictions_path = 'data/predictions.csv'

# Load training data
data = importer.read_labeled(train_path, 3000)  # Read at most 3000 data points
data = dapo.vectorize(data, 1, features=[('latitude', 7), ('longitude', 8), ('day', 0),
                                         ('day_of_week', 0), ('time', 0), ('streets', 6)])
crime_to_id_dict = data.next()
data = importer.to_numpy_array(data)  # Collect data in array
data = dapo.ensure_unit_variance(data, columns_to_normalize=(0, 1, 2, 3, 4))  # Ensure unit variance in appropriate columns

# Separate labels from data
crime_ids = data[:, -1].astype(int)  # Crime ids are in the last column, and are integer values
locations = data[:, :-1]             # The rest is data

# Calculate ranges for the modulo used on circular quantities
modulo_for_day = abs(min(locations[:, 2]) - max(locations[:, 2]))
modulo_for_day_of_week = abs(min(locations[:, 3]) - max(locations[:, 3]))
modulo_for_time = abs(min(locations[:, 4]) - max(locations[:, 4]))
modulae = (modulo_for_day, modulo_for_day_of_week, modulo_for_time)

# Split into train and test set
loc_train, loc_test, crime_ids_train, crime_ids_test = cv.train_test_split(locations, crime_ids, test_size=0.33)

# Train and evaluate
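# A minimal sketch of the train-and-evaluate step described in the comment block above,
# assuming a plain scikit-learn KNeighborsClassifier and sklearn's log_loss as a stand-in
# for the kaggle metric; the actual code presumably uses the modulae for a custom circular
# distance, which is omitted here.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import log_loss

best_loss, best_k = None, None
for k in [1, 5, 10, 20, 50]:  # candidate numbers of neighbors
    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(loc_train, crime_ids_train)
    score = clf.score(loc_test, crime_ids_test)  # mean accuracy on the held-out split
    loss = log_loss(crime_ids_test, clf.predict_proba(loc_test), labels=clf.classes_)
    print "k = {0}: score = {1}, log loss = {2}".format(k, score, loss)
    if best_loss is None or loss < best_loss:
        best_loss, best_k = loss, k
print "Best classifier: k = {0} with log loss = {1}".format(best_k, best_loss)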
import numpy as np
import importer as im
import evaluation as ev
import data_processing as dp
from sklearn.svm import SVC
import sklearn.cross_validation as cv

# load data
path = "../data/train.csv"
data = im.read_labeled(path, 10000)
data = dp.vectorize(data, 1, features=[('latitude', 7), ('longitude', 8), ('day', 0),
                                       ('day_of_week', 0), ('time', 0)])
crime_to_id_dict = data.next()
data = im.to_numpy_array(data)
data = dp.ensure_unit_variance(data, columns_to_normalize=(0, 1, 2, 3, 4))

# separate data in features and labels
Y = data[:, 5].astype(int)
X = data[:, :5]

# split data in training data and test data
train_X, test_X, train_Y, test_Y = cv.train_test_split(X, Y, test_size=0.33)

# run the SVM for several values of C
for c in [0.1, 1, 1.5, 10, 20, 50, 100, 200]:
    print "C = {0}".format(c)
    # create SVM
    clf = SVC(C=c, kernel='rbf', gamma=1000)
    # fit SVM
    clf.fit(train_X, train_Y)
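    # The loop ends after fitting; a minimal sketch of how each classifier could be
    # evaluated on the held-out split (an assumption, not part of the original script):
    print "  training accuracy = {0}".format(clf.score(train_X, train_Y))
    print "  test accuracy = {0}".format(clf.score(test_X, test_Y))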