def test_tensor(self):
    """Check flatten() on a loader whose dataset yields single tensors."""
    batches = DataLoader(TensorDataset(), batch_size=256)

    # flatten() returns one element per dataset field; exactly one here.
    (flat,) = dl_utils.flatten(batches)

    expected_shape = torch.Size([DATASET_SIZE, DATA_SIZE, DATA_SIZE])
    assert torch.is_tensor(flat)
    assert flat.shape == expected_shape
def find_best_k(ds_train: Dataset, k_choices, num_folds):
    """
    Use cross validation to find the best K for the kNN model.

    The dataset is cut into ``num_folds`` contiguous folds. Each fold is
    used once for validation while a fresh classifier is trained on the
    rest of the data.

    :param ds_train: Training dataset.
    :param k_choices: A sequence of possible value of k for the kNN model.
    :param num_folds: Number of folds for cross-validation.
    :return: tuple (best_k, accuracies) where:
        best_k: the value of k with the highest mean accuracy across folds
        accuracies: The accuracies per fold for each k (list of lists).
    """
    batch_size = 1024
    n_samples = len(ds_train)

    # Fold j covers [fold_bounds[j], fold_bounds[j + 1]). Deriving both ends
    # from the same formula makes the folds partition the dataset exactly,
    # so no trailing sample is left out of validation when n_samples is not
    # a multiple of num_folds.
    fold_bounds = [n_samples * j // num_folds for j in range(num_folds + 1)]

    accuracies = []
    for k in k_choices:
        # ====== YOUR CODE: ======
        accuracies_for_k = []
        for fold_start, fold_end in zip(fold_bounds, fold_bounds[1:]):
            fold_len = fold_end - fold_start

            # NOTE(review): WithoutSubsetDataset is defined elsewhere; it is
            # assumed to expose every sample except the given range.
            ds_actual_train = WithoutSubsetDataset(
                ds_train, fold_len, offset=fold_start)
            ds_actual_valid = datasets.SubsetDataset(
                ds_train, fold_len, offset=fold_start)

            # A new classifier per fold, so no state carries over.
            knn_classifier = KNNClassifier(k=k)
            knn_classifier.train(
                torch.utils.data.DataLoader(ds_actual_train, batch_size))

            x_valid, y_valid = dataloader_utils.flatten(
                torch.utils.data.DataLoader(ds_actual_valid, batch_size))
            y_pred = knn_classifier.predict(x_valid)

            # Calculate accuracy
            accuracies_for_k.append(accuracy(y_valid, y_pred))

        accuracies.append(accuracies_for_k)
        # ========================

    best_k_idx = np.argmax([np.mean(acc) for acc in accuracies])
    best_k = k_choices[best_k_idx]

    return best_k, accuracies
def test_two_tuple(self):
    """Check flatten() on a loader whose dataset yields 2-tuples."""
    batches = DataLoader(TensorTwoTupleDataset(), batch_size=256)
    first, second = dl_utils.flatten(batches)

    # Both fields must come back as tensors.
    assert torch.is_tensor(first)
    assert torch.is_tensor(second)

    # Each field keeps its own per-sample shape behind the dataset axis.
    first_shape = torch.Size([DATASET_SIZE, DATA_SIZE, DATA_SIZE])
    second_shape = torch.Size([DATASET_SIZE, DATA_SIZE, 1])
    assert first.shape == first_shape
    assert second.shape == second_shape
def test_three_tuple(self):
    """Check flatten() on a loader whose dataset yields 3-tuples."""
    batches = DataLoader(TensorThreeTupleDataset(), batch_size=128)
    first, second, third = dl_utils.flatten(batches)

    # Every field must come back as a tensor.
    assert torch.is_tensor(first)
    assert torch.is_tensor(second)
    assert torch.is_tensor(third)

    # The first two fields share a shape; the third has a trailing dim of 1.
    square_shape = torch.Size([DATASET_SIZE, DATA_SIZE, DATA_SIZE])
    column_shape = torch.Size([DATASET_SIZE, DATA_SIZE, 1])
    assert first.shape == square_shape
    assert first.shape == second.shape
    assert third.shape == column_shape
def implementKNN(classifier: KNNClassifier, dl_train, dl_test):
    """
    Train a kNN classifier and score it on a test set.

    :param classifier: The classifier to train and evaluate.
    :param dl_train: Loader handed to ``classifier.train``.
    :param dl_test: Loader whose flattened samples and labels are used
        for evaluation.
    :return: The result of ``accuracy`` on the true and predicted test
        labels.
    """
    # Pull the whole test set out of the loader so it is predicted in one go.
    test_samples, test_labels = dataloader_utils.flatten(dl_test)

    classifier.train(dl_train)
    predicted_labels = classifier.predict(test_samples)

    return accuracy(test_labels, predicted_labels)
def train(self, dl_train: DataLoader):
    """
    Fit the kNN model by memorizing the training set.

    There is nothing to optimize: the flattened samples and labels are
    stored on the instance and act as the model parameters.

    :param dl_train: A DataLoader with labeled training sample (should
        return tuples).
    :return: self
    """
    samples, labels = dataloader_utils.flatten(dl_train)

    self.x_train = samples
    self.y_train = labels

    # The class count is the number of distinct label values seen.
    distinct_labels = set(labels.numpy())
    self.n_classes = len(distinct_labels)

    return self
def find_best_k(ds_train: Dataset, k_choices, num_folds):
    """
    Pick the best K for the kNN model by repeated train/validation splits.

    For every candidate k the model is trained and scored ``num_folds``
    times, each time on a split produced by
    ``dataloaders.create_train_validation_loaders``.
    NOTE(review): presumably each call yields a different random split, so
    this is repeated hold-out rather than strict k-fold CV; confirm against
    the splitter.

    :param ds_train: Training dataset.
    :param k_choices: A sequence of possible value of k for the kNN model.
    :param num_folds: Number of folds for cross-validation.
    :return: tuple (best_k, accuracies) where:
        best_k: the value of k with the highest mean accuracy across folds
        accuracies: The accuracies per fold for each k (list of lists).
    """
    # Hold out one fold's worth of the data in every round.
    holdout_fraction = 1 / num_folds

    accuracies = []
    for k in k_choices:
        model = KNNClassifier(k)

        fold_scores = []
        for _ in range(num_folds):
            dl_train, dl_valid = dataloaders.create_train_validation_loaders(
                ds_train, holdout_fraction)
            x_valid, y_valid = dataloader_utils.flatten(dl_valid)

            # Training replaces whatever the model memorized last round.
            model.train(dl_train)
            fold_scores.append(accuracy(y_valid, model.predict(x_valid)))

        accuracies.append(fold_scores)

    mean_scores = [np.mean(fold_scores) for fold_scores in accuracies]
    best_k = k_choices[np.argmax(mean_scores)]

    return best_k, accuracies
# Notebook-style check of the linear classifier `lin_cls` on `dl_test`
# (both are defined outside this fragment).

# Sum the accuracy reported for each batch, then divide by len(dl_test).
# NOTE(review): if dl_test is a DataLoader, len() is the number of batches,
# so this is an unweighted mean over batches - confirm.
mean_acc = 0
for (x,y) in dl_test:
    y_pred, _ = lin_cls.predict(x)
    mean_acc += lin_cls.evaluate_accuracy(y, y_pred)
mean_acc /= len(dl_test)
# The value is printed with a '%' suffix; presumably evaluate_accuracy
# already returns a percentage - verify.
print(f"Accuracy: {mean_acc:.1f}%")

import cs236605.dataloader_utils as dl_utils
from hw1.losses import SVMHingeLoss

# Create a hinge-loss function
loss_fn = SVMHingeLoss(delta=1)

# Classify all samples in the test set (because it doesn't depend on initialization)
# Note: x, y and y_pred from the loop above are rebound here.
x, y = dl_utils.flatten(dl_test)
y_pred, x_scores = lin_cls.predict(x)
loss = loss_fn(x, y, x_scores, y_pred)

# Compare to pre-computed expected value as a test
expected_loss = 8.9579
print("loss =", loss.item())
print('diff =', abs(loss.item()-expected_loss))
# Passes when the loss is within 0.1 of the expected value.
test.assertAlmostEqual(loss.item(), expected_loss, delta=1e-1)

from hw1.losses import SVMHingeLoss

# Create a hinge-loss function
# This rebinds loss_fn to a new instance (delta given as a float here).
loss_fn = SVMHingeLoss(delta=1.)

# Compute loss and gradient
def train(self, dl_train: DataLoader, dl_valid: DataLoader,
          loss_fn: ClassifierLoss,
          learn_rate=0.1, weight_decay=0.001, max_epochs=100):
    """
    Train the linear classifier with mini-batch gradient descent.

    Each epoch iterates over ``dl_train`` batch by batch, taking one
    gradient step per batch, and then evaluates once on the whole
    validation set.

    The training objective is the data loss plus an L2 penalty
    ``weight_decay / 2 * sum(W ** 2)``; the matching gradient term
    ``weight_decay * W`` is added to the data-loss gradient.

    :param dl_train: Loader yielding ``(x, y)`` training batches.
    :param dl_valid: Loader over the validation set; it is flattened and
        evaluated in one pass per epoch.
    :param loss_fn: Loss object providing ``loss(x, y, x_scores, y_pred)``
        and ``grad()``; ``grad()`` is called right after ``loss``.
    :param learn_rate: Step size of each gradient update.
    :param weight_decay: Strength of the L2 regularization.
    :param max_epochs: Number of passes over the training set.
    :return: tuple (train_res, valid_res) of ``Result(accuracy, loss)``
        named tuples whose fields are per-epoch lists. Training values are
        means over batches and the training loss includes the L2 penalty;
        validation loss is the data loss only.
    :raises ZeroDivisionError: if ``dl_train`` yields no batches.
    """
    # Kept as a function-local import like the original, but executed once
    # rather than on every epoch.
    import cs236605.dataloader_utils as dataloader_utils

    Result = namedtuple('Result', 'accuracy loss')
    train_res = Result(accuracy=[], loss=[])
    valid_res = Result(accuracy=[], loss=[])

    print('Training', end='')
    for epoch_idx in range(max_epochs):
        total_correct = 0
        average_loss = 0
        num_of_batches = 0

        # ====== YOUR CODE: ======
        # Iterate through train batches and do a GD step for each batch.
        for x_train, y_train in dl_train:
            num_of_batches += 1

            # Batch loss and accuracy, measured before this batch's update.
            y_predicted, x_scores = self.predict(x_train)
            batch_accuracy = self.evaluate_accuracy(y_train, y_predicted)
            data_loss = loss_fn.loss(x_train, y_train, x_scores,
                                     y_predicted)
            reg_loss = weight_decay / 2 * torch.sum(self.weights ** 2)

            average_loss += data_loss + reg_loss
            total_correct += batch_accuracy

            # The regularization gradient depends on the weights, not on
            # the data gradient. Building a new tensor also avoids mutating
            # whatever loss_fn.grad() returned.
            grad = loss_fn.grad() + weight_decay * self.weights
            self.weights = self.weights - learn_rate * grad

        # Calculate accuracy and loss on validation set.
        x_valid, y_valid = dataloader_utils.flatten(dl_valid)
        y_predicted_valid, x_scores_valid = self.predict(x_valid)
        accuracy_valid = self.evaluate_accuracy(y_valid, y_predicted_valid)
        valid_loss = loss_fn.loss(x_valid, y_valid, x_scores_valid,
                                  y_predicted_valid)

        # Average the accumulated training values over the batches.
        average_loss = average_loss / num_of_batches
        total_correct = total_correct / num_of_batches

        train_res.loss.append(average_loss)
        train_res.accuracy.append(total_correct)
        valid_res.loss.append(valid_loss)
        valid_res.accuracy.append(accuracy_valid)
        # ========================
        print('.', end='')

    print('')
    return train_res, valid_res
def find_best_k(ds_train: Dataset, k_choices, num_folds):
    """
    Use cross validation to find the best K for the kNN model.

    The sample indices are divided into ``num_folds`` contiguous folds of
    ``n_samples // num_folds`` indices each. The last fold is anchored to
    the end of the dataset, so when the size is not a multiple of
    ``num_folds`` the leftover indices just before it are used for training
    only. Each fold is used once for validation.

    :param ds_train: Training dataset.
    :param k_choices: A sequence of possible value of k for the kNN model.
    :param num_folds: Number of folds for cross-validation.
    :return: tuple (best_k, accuracies) where:
        best_k: the value of k with the highest mean accuracy across folds
        accuracies: The accuracies per fold for each k (list of lists).
    """
    # Use the dataset's own ``subset_len`` when it has one (the original
    # behaviour); otherwise fall back to len() so any sized Dataset works.
    n_samples = getattr(ds_train, 'subset_len', None)
    if n_samples is None:
        n_samples = len(ds_train)
    fold_size = n_samples // num_folds

    # The fold layout does not depend on k, so build it once.
    fold_indices = []
    for j in range(num_folds):
        is_last = j == num_folds - 1
        start = n_samples - fold_size if is_last else j * fold_size
        stop = start + fold_size
        valid_inds = list(range(start, stop))
        train_inds = list(range(start)) + list(range(stop, n_samples))
        fold_indices.append((train_inds, valid_inds))

    accuracies = []
    for k in k_choices:
        # ====== YOUR CODE: ======
        acc = []
        for train_inds, valid_inds in fold_indices:
            # Both loaders read the same dataset; the samplers restrict
            # each one to its own index set.
            dl_train = torch.utils.data.DataLoader(
                ds_train, batch_size=100, shuffle=False, num_workers=1,
                sampler=SubsetRandomSampler(train_inds))
            dl_valid = torch.utils.data.DataLoader(
                ds_train, batch_size=100, shuffle=False, num_workers=1,
                sampler=SubsetRandomSampler(valid_inds))

            # Train & calculate the accuracy on the current folds division.
            knn_classifier = KNNClassifier(k=k)
            knn_classifier.train(dl_train)

            x_valid, y_valid = dataloader_utils.flatten(dl_valid)
            y_pred = knn_classifier.predict(x_valid)

            # Calculate accuracy
            acc.append(accuracy(y_valid, y_pred))

        accuracies.append(acc)
        # ========================

    best_k_idx = np.argmax([np.mean(acc) for acc in accuracies])
    best_k = k_choices[best_k_idx]

    return best_k, accuracies
def find_best_k(ds_train: Dataset, k_choices, num_folds):
    """
    Use cross validation to find the best K for the kNN model.

    For every candidate k the data is randomly re-split ``num_folds``
    times into a training part and a validation part holding
    ``len(ds_train) // num_folds`` samples. Because each round draws a new
    random split, this is repeated hold-out validation rather than strict
    k-fold CV.

    :param ds_train: Training dataset.
    :param k_choices: A sequence of possible value of k for the kNN model.
    :param num_folds: Number of folds for cross-validation.
    :return: tuple (best_k, accuracies) where:
        best_k: the value of k with the highest mean accuracy across folds
        accuracies: The accuracies per fold for each k (list of lists).
    """
    # Only affects loading speed; the flattened data is the same whatever
    # the batch size.
    batch_size = 1024

    # One fold's worth of samples is held out for validation and the rest
    # is used for training. Integer division keeps the lengths exact and
    # guarantees that they sum to the dataset size.
    n_samples = len(ds_train)
    valid_len = n_samples // num_folds
    train_len = n_samples - valid_len

    accuracies = []
    for k in k_choices:
        model = KNNClassifier(k)

        # ====== YOUR CODE: ======
        accuracy_fold_list = []
        # we need to split and train num_folds times
        for _ in range(num_folds):
            # random_split returns the subsets in the order of the lengths,
            # so the training subset must be listed first.
            temp_ds_train, temp_ds_valid = torch.utils.data.random_split(
                ds_train, [train_len, valid_len])

            dl_train = torch.utils.data.DataLoader(
                temp_ds_train, batch_size=batch_size, shuffle=True)
            dl_valid = torch.utils.data.DataLoader(
                temp_ds_valid, batch_size=batch_size, shuffle=True)

            # Training replaces whatever the model memorized last round.
            model.train(dl_train)

            # Separate validation data from its labels, then predict.
            data_valid, labels_valid = dataloader_utils.flatten(dl_valid)
            pred = model.predict(data_valid)

            accuracy_fold_list.append(accuracy(labels_valid, pred))

        accuracies.append(accuracy_fold_list)
        # ========================

    best_k_idx = np.argmax([np.mean(acc) for acc in accuracies])
    best_k = k_choices[best_k_idx]

    return best_k, accuracies
def train(self, dl_train: DataLoader, dl_valid: DataLoader,
          loss_fn: ClassifierLoss,
          learn_rate=0.1, weight_decay=0.001, max_epochs=100):
    """
    Train the linear classifier with mini-batch gradient descent.

    Every epoch takes one gradient step per training batch and then
    evaluates once on the flattened validation set. The recorded training
    loss is the data loss plus ``weight_decay / 2 * sum(W * W)``. Training
    loss and accuracy are averaged over samples (each batch weighted by
    its size). The validation loss is the data loss only.

    :param dl_train: Loader yielding ``(x, y)`` training batches.
    :param dl_valid: Loader over the validation set.
    :param loss_fn: Callable loss object; ``loss_fn.grad()`` is called
        right after the loss has been evaluated on a batch.
    :param learn_rate: Step size of each gradient update.
    :param weight_decay: Strength of the L2 regularization.
    :param max_epochs: Number of passes over the training set.
    :return: tuple (train_res, valid_res) of ``Result(accuracy, loss)``
        named tuples whose fields are per-epoch lists.
    """
    Result = namedtuple('Result', 'accuracy loss')
    train_res = Result(accuracy=[], loss=[])
    valid_res = Result(accuracy=[], loss=[])

    print('Training', end='')
    for epoch_idx in range(max_epochs):
        # ====== YOUR CODE: ======
        epoch_loss = 0
        epoch_accuracy = 0
        samples_seen = 0

        for x_batch, y_batch in dl_train:
            n_in_batch = x_batch.shape[0]
            y_batch_pred, batch_scores = self.predict(x_batch)

            # Squared magnitude of the weights, taken before the update.
            weights_sq_sum = (self.weights * self.weights).sum().item()
            data_loss = loss_fn(x_batch, y_batch, batch_scores,
                                y_batch_pred).item()
            batch_loss = data_loss + weight_decay / 2.0 * weights_sq_sum
            batch_accuracy = self.evaluate_accuracy(y_batch, y_batch_pred)

            # Weight each batch by its size so a short final batch does not
            # skew the epoch averages.
            samples_seen += n_in_batch
            epoch_loss += batch_loss * float(n_in_batch)
            epoch_accuracy += batch_accuracy * float(n_in_batch)

            # Gradient step on the regularized objective.
            step_direction = loss_fn.grad() + weight_decay * self.weights
            self.weights -= learn_rate * step_direction

        epoch_loss /= samples_seen
        epoch_accuracy /= samples_seen
        train_res.loss.append(epoch_loss)
        train_res.accuracy.append(epoch_accuracy)
        print('Epoch', epoch_idx, 'training loss', epoch_loss,
              'training accuracy', epoch_accuracy)

        # Validate on the whole validation set in a single pass.
        x_valid, y_valid = dl_utils.flatten(dl_valid)
        y_valid_pred, valid_scores = self.predict(x_valid)
        valid_res.loss.append(
            loss_fn(x_valid, y_valid, valid_scores, y_valid_pred).item())
        valid_res.accuracy.append(
            self.evaluate_accuracy(y_valid, y_valid_pred))
        # ========================
        print('.', end='')

    print('')
    return train_res, valid_res