def test_tensor(self):
    """Flattening a loader over a single-tensor dataset yields one tensor covering the whole dataset."""
    loader = DataLoader(TensorDataset(), batch_size=256)
    (x,) = dl_utils.flatten(loader)
    assert torch.is_tensor(x)
    assert x.shape == torch.Size([DATASET_SIZE, DATA_SIZE, DATA_SIZE])
def test_two_tuple(self):
    """Flattening a loader over a (x, y) tuple dataset yields two tensors with the expected shapes."""
    loader = DataLoader(TensorTwoTupleDataset(), batch_size=256)
    x, y = dl_utils.flatten(loader)
    for t in (x, y):
        assert torch.is_tensor(t)
    assert x.shape == torch.Size([DATASET_SIZE, DATA_SIZE, DATA_SIZE])
    assert y.shape == torch.Size([DATASET_SIZE, DATA_SIZE, 1])
def test_three_tuple(self):
    """Flattening a loader over a (x, y, z) tuple dataset yields three tensors with the expected shapes."""
    loader = DataLoader(TensorThreeTupleDataset(), batch_size=128)
    x, y, z = dl_utils.flatten(loader)
    for t in (x, y, z):
        assert torch.is_tensor(t)
    assert x.shape == torch.Size([DATASET_SIZE, DATA_SIZE, DATA_SIZE])
    assert y.shape == x.shape
    assert z.shape == torch.Size([DATASET_SIZE, DATA_SIZE, 1])
def train(self, dl_train: DataLoader):
    """
    Trains the KNN model. KNN training is memorizing the training data.
    Or, equivalently, the model parameters are the training data itself.
    :param dl_train: A DataLoader with labeled training sample (should
        return tuples).
    :return: self
    """
    # "Training" is just storing the full (flattened) training set.
    self.x_train, self.y_train = dataloader_utils.flatten(dl_train)
    # Number of distinct labels present in the training data.
    self.n_classes = len(set(self.y_train.numpy()))
    return self
def find_best_k(ds_train: Dataset, k_choices, num_folds):
    """
    Use cross validation to find the best K for the kNN model.

    :param ds_train: Training dataset.
    :param k_choices: A sequence of possible value of k for the kNN model.
    :param num_folds: Number of folds for cross-validation.
    :return: tuple (best_k, accuracies) where:
        best_k: the value of k with the highest mean accuracy across folds
        accuracies: The accuracies per fold for each k (list of lists).
    """
    accuracies = []

    # Fold sizing is identical for every k, so compute it once outside the loop.
    ds_size = len(ds_train)
    fold_size = int(np.ceil(ds_size / num_folds))

    for i, k in enumerate(k_choices):
        model = KNNClassifier(k)
        acc_fold = np.zeros(num_folds)
        for fold_idx in range(num_folds):
            # Validation fold is [start, end); the rest is training data.
            # Clamp `end` to the dataset size: with ceil-sized folds the last
            # fold would otherwise index past the end of the dataset whenever
            # ds_size is not divisible by num_folds.
            start = fold_idx * fold_size
            end = min((fold_idx + 1) * fold_size, ds_size)
            ind_vl = list(range(start, end))
            ind_tr = list(range(start)) + list(range(end, ds_size))

            ds_tr = torch.utils.data.Subset(ds_train, ind_tr)
            ds_vl = torch.utils.data.Subset(ds_train, ind_vl)
            dl_tr = torch.utils.data.DataLoader(ds_tr, 1024)
            dl_vl = torch.utils.data.DataLoader(ds_vl, 1024)
            x_vl, y_vl = dataloader_utils.flatten(dl_vl)

            # Train on everything outside the fold, evaluate on the fold.
            model.train(dl_tr)
            y_pred = model.predict(x_vl)
            acc_fold[fold_idx] = accuracy(y_vl, y_pred)
        accuracies.append(acc_fold)

    best_k_idx = np.argmax([np.mean(acc) for acc in accuracies])
    best_k = k_choices[best_k_idx]

    return best_k, accuracies
def find_best_k(ds_train: Dataset, k_choices, num_folds):
    """
    Use cross validation to find the best K for the kNN model.

    :param ds_train: Training dataset.
    :param k_choices: A sequence of possible value of k for the kNN model.
    :param num_folds: Number of folds for cross-validation.
    :return: tuple (best_k, accuracies) where:
        best_k: the value of k with the highest mean accuracy across folds
        accuracies: The accuracies per fold for each k (list of lists).
    """
    accuracies = []

    # Contiguous index folds; the last fold absorbs the remainder so that
    # every sample is used and the fold lengths always sum to len(ds_train).
    ds_size = len(ds_train)
    indices = list(range(ds_size))
    fold_size = ds_size // num_folds

    for i, k in enumerate(k_choices):
        model = KNNClassifier(k)
        acc = []
        for fold_idx in range(num_folds):
            # Each fold serves as the validation set exactly once; the model
            # is trained on ALL remaining samples (proper k-fold CV, rather
            # than a single fixed validation split reused for every fold).
            start = fold_idx * fold_size
            end = ds_size if fold_idx == num_folds - 1 else start + fold_size
            val_idx = indices[start:end]
            train_idx = indices[:start] + indices[end:]

            dl_train = DataLoader(torch.utils.data.Subset(ds_train, train_idx))
            dl_val = DataLoader(torch.utils.data.Subset(ds_train, val_idx))
            val_X, val_y = dataloader_utils.flatten(dl_val)

            model.train(dl_train)
            y_pred = model.predict(val_X)
            acc.append(accuracy(val_y, y_pred))
        accuracies.append(acc)

    best_k_idx = np.argmax([np.mean(acc) for acc in accuracies])
    best_k = k_choices[best_k_idx]

    return best_k, accuracies
def find_best_k(ds_train: Dataset, k_choices, num_folds):
    """
    Use cross validation to find the best K for the kNN model.

    :param ds_train: Training dataset.
    :param k_choices: A sequence of possible value of k for the kNN model.
    :param num_folds: Number of folds for cross-validation.
    :return: tuple (best_k, accuracies) where:
        best_k: the value of k with the highest mean accuracy across folds
        accuracies: The accuracies per fold for each k (list of lists).
    """
    accuracies = []
    ds_size = len(ds_train)
    fold_size = ds_size // num_folds
    indices = list(range(ds_size))

    for i, k in enumerate(k_choices):
        model = KNNClassifier(k=k)
        cur_accuracies = []
        for j_fold in range(num_folds):
            # Use the j-th contiguous slice as the validation set and the
            # remaining samples as the training set.  The last fold absorbs
            # the remainder so no samples are silently dropped when ds_size
            # is not divisible by num_folds.
            split1 = fold_size * j_fold
            split2 = ds_size if j_fold == num_folds - 1 else fold_size * (j_fold + 1)
            train = indices[:split1] + indices[split2:]
            validation = indices[split1:split2]

            train_smp = sampler.SubsetRandomSampler(train)
            validation_smp = sampler.SubsetRandomSampler(validation)
            dl_train = torch.utils.data.DataLoader(
                ds_train, shuffle=False, sampler=train_smp)
            dl_validation = torch.utils.data.DataLoader(
                ds_train, shuffle=False, sampler=validation_smp)
            x_validation, y_validation = dataloader_utils.flatten(
                dl_validation)

            # Train on the current training set, then evaluate on the fold.
            model.train(dl_train)
            y_pred = model.predict(x_validation)
            cur_accuracies.append(accuracy(y_validation, y_pred))
        accuracies.append(cur_accuracies)

    best_k_idx = np.argmax([np.mean(acc) for acc in accuracies])
    best_k = k_choices[best_k_idx]

    return best_k, accuracies