def test_kblr(self):
    """Fit ``ml.kblr_model`` with the ``ml.BGD`` solver on three data sets.

    For each data set the model is trained on the first ``N`` training
    samples and evaluated on the first ``int(.25*N)`` validation samples;
    fewer than 10% of those may be misclassified.
    """
    N = 300
    # number of validation samples scored per data set
    holdout = int(.25*N)
    for i in range(3):
        gk = ml.gaussian_kernel(.3)
        (train_data, train_labels,
         validation_data, validation_labels) = test_util.read_data(str(i))
        model = ml.kblr_model(gk, .5)
        solver = ml.BGD(train_data[:N], train_labels[:N], model)
        solver.fit(.001, 'step_precision', .0000001)

        label_guess = model.predict(validation_data[:holdout]).flatten()
        expected = validation_labels[:holdout]
        # count every position where the guess disagrees with the label
        missed = sum(1 for hit in (label_guess == expected) if not hit)
        self.assertTrue(missed < .1*expected.size)
def test_stochastic_kblr(self):
    """Train ``ml.sklr_model`` via ``self.train`` on three data sets and check accuracy.

    For each data set the model is trained on the first ``self.N`` training
    samples, then the whole validation set is classified by thresholding
    the predicted values at 0.5.  Fewer than 15% of the validation samples
    may be misclassified.
    """
    # settings stored on the test case; presumably consumed by self.train
    self.weights = 10
    self.epochs = 20
    self.N = 400
    self.step_size = 0.3
    for i in range(3):
        k = ml.gaussian_kernel(.4)
        m = ml.sklr_model(k, 1e-9, .0005)
        train_data, train_labels, validation_data, validation_labels = test_util.read_data(str(i))
        # train() is called for its effect on m; its return value is not needed here
        self.train(10, m, (train_data[:self.N], train_labels[:self.N]))

        label_probs = m.predict(validation_data).flatten()
        predictions = label_probs > .5
        correct = predictions == validation_labels
        missed = sum(1 for c in correct if not c)

        # report against the same sample count the threshold below uses
        total = validation_labels.size
        error = "Data set: {}. Missed: {}/{}".format(i, missed, total)
        self.assertTrue(missed < .15*total, error)
def train_POLK(step_size, sigma, eps, epochs, data):
    """Train an ``ml.sklr_model`` with ``ml.SGD`` and record statistics after every step.

    The caller's arrays are left untouched: shuffling and label reshaping
    are done on local copies/views only.

    Args:
        step_size: step size forwarded to ``sgd.fit``.
        sigma: argument of ``ml.gaussian_kernel``.
        eps: third argument of ``ml.sklr_model`` (reported as the
            "error threshold" in the log output).
        epochs: number of passes over the training data.
        data: tuple ``(train_data, train_labels, test_data, test_labels)``
            of numpy arrays.

    Returns:
        Tuple ``(train_losses, test_losses, train_errors, test_errors,
        step_times, model_orders)``; each is a list with one entry per
        mini-batch step.

    Raises:
        Exception: ``'eps too low'`` when, within one epoch, the model order
            has grown by a full mini-batch more than five times.
    """
    train_data, train_labels, test_data, test_labels = data
    train_losses = []
    test_losses = []
    train_errors = []
    test_errors = []
    model_orders = []
    step_times = []
    BS = 10  # mini-batch size
    kernel = ml.gaussian_kernel(sigma)
    model = ml.sklr_model(kernel, 1e-9, eps)
    sgd = ml.SGD(model)
    print('*********************')
    print(
        'training POLK with sigma={} error threshold={} step size={}. '.format(
            sigma, eps, step_size))
    for e in range(epochs):
        print('epoch: ', e)
        # Seed with the epoch number so the sample order is reproducible.
        # Indexing with a permutation yields new arrays, so samples and
        # labels stay aligned without shuffling the caller's data in place.
        # NOTE(review): this still reseeds numpy's global RNG, as before.
        np.random.seed(e)
        order = np.random.permutation(train_data.shape[0])
        train_data = train_data[order]
        train_labels = train_labels[order]
        flag = 0
        prev_size = 0
        for i in range(0, train_data.shape[0], BS):
            start = time.time()
            sgd.fit(step_size, train_data[i:i + BS], train_labels[i:i + BS])
            end = time.time()
            # add the time to compute sgd
            step_times.append(end - start)

            # calculate training and test loss
            train_losses.append(model.loss(train_data, train_labels))
            test_losses.append(model.loss(test_data, test_labels))

            # calculate training error rate; reshape (a local rebind, not an
            # in-place shape assignment) so labels compare elementwise
            predictions = model.predict(train_data) >= .5
            train_labels = train_labels.reshape(predictions.shape)
            correct = (predictions == train_labels).sum()
            train_errors.append(1 - (correct / (train_labels.shape[0])))

            # calculate test error rate
            predictions = model.predict(test_data) >= .5
            test_labels = test_labels.reshape(predictions.shape)
            correct = (predictions == test_labels).sum()
            test_errors.append(1 - (correct / (test_labels.shape[0])))

            # add the current model order
            model_order = model.dictionary().shape[0]
            model_orders.append(model_order)

            # if the model is accepting all of the values we gave it
            # epsilon is too low - terminate early
            if prev_size + BS == model_order:
                if flag > 4:
                    raise Exception('eps too low')
                print('model order: ', model_order)
                flag += 1
            prev_size = model_order
        print('model order: ', model.dictionary().shape[0])
    return train_losses, test_losses, train_errors, test_errors, step_times, model_orders
# split point: 90% of `samples` (defined earlier in the script)
split_ind = int(.9 * samples)
data = np.load('data.npy')
# X takes the first two columns of the loaded array, Y the third
X = data[:, :2]
Y = data[:, 2]
train_x, test_x = X[:split_ind], X[split_ind:]
train_y, test_y = Y[:split_ind], Y[split_ind:]

# hyperparameter sweep: one model per error threshold, fixed kernel width
errs = [.01, .001, .0008, .0007, .0006]
sigma = .2

# per-hyperparameter statistics
stats = {}

# training budget
epochs = 10
batch_size = 10
step_size = .3

kernel = ml.gaussian_kernel(sigma)
for err in errs:
    # collected for this threshold:
    # - loss on the training set and on the test set
    # - model order
    # - error rate on the test set
    train_losses, test_losses, test_errors, model_orders = [], [], [], []
    model = ml.sklr_model(kernel, 1e-9, err)
    sgd = ml.SGD(model)
    print('*********************')
    print('error threshold: ', err)
    for e in range(epochs):
        print('epoch: ', e)
class TestKernels(unittest.TestCase):
    """Tests for kernels.

    Gram matrices produced by the ``ml`` kernels are compared against the
    reference functions ``linear_kernel``, ``polynomial_kernel`` and
    ``rbf_kernel`` (presumably sklearn's pairwise kernels), and inputs with
    mismatched feature counts are expected to raise ``ValueError``.
    """

    # constants for filters and kernels
    # polynomial
    pa = 1.5
    pc = .2
    pd = 3
    # rbf/gaussian: g is the value handed to rbf_kernel for the width gs
    gs = .5
    g = 1 / (2 * gs * gs)

    # my kernels
    lk = ml.linear_kernel(0)
    pk = ml.polynomial_kernel(pa, pc, pd)
    gk = ml.gaussian_kernel(gs)

    def compare_to_sklearn(self, x, y):
        """Assert that all three kernels match their reference Gram matrices for (x, y)."""
        sk_lk = linear_kernel(x, y)
        my_lk = self.lk.gram_matrix(x, y)
        self.assertTrue(np.allclose(my_lk, sk_lk),
                        "linear kernel Gram matrix differs from reference")

        sk_pk = polynomial_kernel(x, y, self.pd, self.pa, self.pc)
        my_pk = self.pk.gram_matrix(x, y)
        self.assertTrue(np.allclose(my_pk, sk_pk),
                        "polynomial kernel Gram matrix differs from reference")

        sk_gk = rbf_kernel(x, y, self.g)
        my_gk = self.gk.gram_matrix(x, y)
        self.assertTrue(np.allclose(my_gk, sk_gk),
                        "gaussian kernel Gram matrix differs from reference")

    def check_exceptions(self, x, y):
        """Assert that every kernel rejects the incompatible pair (x, y) with ValueError.

        Only the exception type is checked, not its message.  Each kernel
        runs in its own sub-test so one failure does not hide the others.
        """
        kernels = (("linear", self.lk),
                   ("polynomial", self.pk),
                   ("gaussian", self.gk))
        for name, kern in kernels:
            with self.subTest(kernel=name):
                with self.assertRaises(ValueError):
                    kern.gram_matrix(x, y)

    def test_2vectors_same_samples(self):
        x = np.random.rand(3, 1)
        y = np.random.rand(3, 1)
        self.compare_to_sklearn(x, y)

    def test_2vectors_diff_samples(self):
        x = np.random.rand(3, 1)
        y = np.random.rand(4, 1)
        self.compare_to_sklearn(x, y)

    def test_2vectors_diff_features(self):
        x = np.random.rand(3, 1)
        y = np.random.rand(3, 2)
        self.check_exceptions(x, y)

    def test_1vector_1matrix_good(self):
        x = np.random.rand(1, 5)
        y = np.random.rand(5, 5)
        self.compare_to_sklearn(x, y)

    def test_1vector_1matrix_bad(self):
        x = np.random.rand(5, 1)
        y = np.random.rand(5, 5)
        self.check_exceptions(x, y)

    def test_2same_samples_matrices_good(self):
        x = np.random.rand(5, 5)
        y = np.random.rand(5, 5)
        self.compare_to_sklearn(x, y)

    def test_2matrices_diff_features_bad(self):
        x = np.random.rand(2, 2)
        y = np.random.rand(2, 3)
        self.check_exceptions(x, y)

    def test_2diff_samples_matrices_good(self):
        x = np.random.rand(3, 3)
        y = np.random.rand(2, 3)
        self.compare_to_sklearn(x, y)

    def test_2diff_samples_and_features_matrices_bad(self):
        x = np.random.rand(3, 3)
        y = np.random.rand(2, 4)
        self.check_exceptions(x, y)

    def test_large_input(self):
        x = np.random.rand(2000, 2000)
        y = np.random.rand(2000, 2000)
        self.compare_to_sklearn(x, y)

    def test_call_kernel_gram_matrix(self):
        """Calling gram_matrix on a bare ``ml.kernel()`` must raise."""
        k = ml.kernel()
        a = np.zeros(1)
        with self.assertRaises(Exception):
            k.gram_matrix(a, a)