def test_tensor(self):
        loader = DataLoader(TensorDataset(), batch_size=256)

        x, = dl_utils.flatten(loader)

        assert torch.is_tensor(x)
        assert x.shape == torch.Size([DATASET_SIZE, DATA_SIZE, DATA_SIZE])

    def test_two_tuple(self):
        loader = DataLoader(TensorTwoTupleDataset(), batch_size=256)

        x, y = dl_utils.flatten(loader)

        assert torch.is_tensor(x)
        assert torch.is_tensor(y)
        assert x.shape == torch.Size([DATASET_SIZE, DATA_SIZE, DATA_SIZE])
        assert y.shape == torch.Size([DATASET_SIZE, DATA_SIZE, 1])

    def test_three_tuple(self):
        loader = DataLoader(TensorThreeTupleDataset(), batch_size=128)

        x, y, z = dl_utils.flatten(loader)

        assert torch.is_tensor(x)
        assert torch.is_tensor(y)
        assert torch.is_tensor(z)
        assert x.shape == torch.Size([DATASET_SIZE, DATA_SIZE, DATA_SIZE])
        assert x.shape == y.shape
        assert z.shape == torch.Size([DATASET_SIZE, DATA_SIZE, 1])
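# Note: dl_utils.flatten is not shown on this page. Below is a minimal
# sketch of what such a helper might look like, assuming each batch is
# either a tensor or a tuple of tensors; the exact semantics of the real
# helper are an assumption here, not taken from the original code.
import torch

def flatten(dataloader):
    """Concatenate all batches from a DataLoader along the batch
    dimension, preserving the per-position tuple structure."""
    batches = [b if isinstance(b, (tuple, list)) else (b,) for b in dataloader]
    # Group same-position tensors across batches, then concatenate each group.
    return tuple(torch.cat(parts, dim=0) for parts in zip(*batches))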
Example #4
    def train(self, dl_train: DataLoader):
        """
        Trains the KNN model. KNN training is memorizing the training data.
        Or, equivalently, the model parameters are the training data itself.
        :param dl_train: A DataLoader with labeled training samples (should
            return tuples).
        :return: self
        """

        x_train, y_train = dataloader_utils.flatten(dl_train)
        self.x_train = x_train
        self.y_train = y_train
        self.n_classes = len(torch.unique(y_train))
        return self
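# Hypothetical usage of the classifier above; dl_train, dl_test and the
# choice of k are illustrative assumptions, not from the original code.
knn = KNNClassifier(k=3)
knn.train(dl_train)                     # "training" just memorizes the data
x_test, y_test = dataloader_utils.flatten(dl_test)
y_pred = knn.predict(x_test)            # majority vote among 3 nearest neighbors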
Example #5
def find_best_k(ds_train: Dataset, k_choices, num_folds):
    """
    Use cross-validation to find the best K for the kNN model.

    :param ds_train: Training dataset.
    :param k_choices: A sequence of possible values of k for the kNN model.
    :param num_folds: Number of folds for cross-validation.
    :return: tuple (best_k, accuracies) where:
        best_k: the value of k with the highest mean accuracy across folds
        accuracies: The accuracies per fold for each k (list of lists).
    """

    accuracies = []

    for i, k in enumerate(k_choices):
        model = KNNClassifier(k)

        # TODO: Train model num_folds times with different train/val data.
        # Don't use any third-party libraries.
        # You can use your train/validation splitter from part 1 (even if
        # that means that it's not really k-fold CV since it will be a
        # different split each iteration), or implement something else.

        # ====== YOUR CODE: ======
        ds_size = len(ds_train)
        fold_size = int(np.ceil(ds_size / num_folds))
        acc_fold = np.zeros(num_folds)
        for fold_idx in range(num_folds):
            # Use fold fold_idx as validation and the rest as training.
            # Clamp the upper bound so the last fold's indices never
            # exceed the dataset size when it does not divide evenly.
            vl_end = min((fold_idx + 1) * fold_size, ds_size)
            ind_vl = list(range(fold_idx * fold_size, vl_end))
            ind_tr = list(range(fold_idx * fold_size)) + list(range(vl_end, ds_size))
            ds_tr = torch.utils.data.Subset(ds_train, ind_tr)
            ds_vl = torch.utils.data.Subset(ds_train, ind_vl)
            dl_tr = torch.utils.data.DataLoader(ds_tr, batch_size=1024)
            dl_vl = torch.utils.data.DataLoader(ds_vl, batch_size=1024)
            x_vl, y_vl = dataloader_utils.flatten(dl_vl)

            # train model.
            model.train(dl_tr)
            # get validation predictions.
            y_pred = model.predict(x_vl)
            # check accuracy.
            acc_fold[fold_idx] = accuracy(y_vl, y_pred)
        accuracies.append(acc_fold)
        # ========================

    best_k_idx = np.argmax([np.mean(acc) for acc in accuracies])
    best_k = k_choices[best_k_idx]

    return best_k, accuracies
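# Illustrative call; the k grid and fold count are assumptions.
k_choices = [1, 3, 5, 7, 11]
best_k, accuracies = find_best_k(ds_train, k_choices, num_folds=4)
print("best k:", best_k)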
Example #6
def find_best_k(ds_train: Dataset, k_choices, num_folds):
    """
    Use cross-validation to find the best K for the kNN model.

    :param ds_train: Training dataset.
    :param k_choices: A sequence of possible values of k for the kNN model.
    :param num_folds: Number of folds for cross-validation.
    :return: tuple (best_k, accuracies) where:
        best_k: the value of k with the highest mean accuracy across folds
        accuracies: The accuracies per fold for each k (list of lists).
    """

    accuracies = []

    for i, k in enumerate(k_choices):
        model = KNNClassifier(k)

        # TODO: Train model num_folds times with different train/val data.
        # Don't use any third-party libraries.
        # You can use your train/validation splitter from part 1 (even if
        # that means that it's not really k-fold CV since it will be a
        # different split each iteration), or implement something else.

        # ====== YOUR CODE: ======
        # Split the training set into num_folds random parts.
        # random_split requires the lengths to sum to len(ds_train),
        # so fold any remainder into the last part.
        acc = []
        fold_len = len(ds_train) // num_folds
        lengths = [fold_len] * (num_folds - 1)
        lengths.append(len(ds_train) - fold_len * (num_folds - 1))
        split = torch.utils.data.random_split(ds_train, lengths)

        # The last part serves as a fixed validation set; each remaining
        # part trains its own model (a simplification of true k-fold CV,
        # which the TODO above permits).
        val_X, val_y = dataloader_utils.flatten(DataLoader(split[-1]))

        # Train on each remaining part and evaluate on the held-out part.
        for index in range(len(split) - 1):
            train_dl = DataLoader(split[index])
            model.train(train_dl)
            y_pred = model.predict(val_X)
            acc.append(accuracy(val_y, y_pred))
        accuracies.append(acc)
        # ========================

    best_k_idx = np.argmax([np.mean(acc) for acc in accuracies])
    best_k = k_choices[best_k_idx]

    return best_k, accuracies
Example #7
def find_best_k(ds_train: Dataset, k_choices, num_folds):
    """
    Use cross-validation to find the best K for the kNN model.

    :param ds_train: Training dataset.
    :param k_choices: A sequence of possible values of k for the kNN model.
    :param num_folds: Number of folds for cross-validation.
    :return: tuple (best_k, accuracies) where:
        best_k: the value of k with the highest mean accuracy across folds
        accuracies: The accuracies per fold for each k (list of lists).
    """
    print("best k")

    accuracies = []
    fold_size = int(np.floor(len(ds_train) / num_folds))
    indices = list(range(len(ds_train)))

    for i, k in enumerate(k_choices):
        model = KNNClassifier(k=k)

        # TODO: Train model num_folds times with different train/val data.
        # Don't use any third-party libraries.
        # You can use your train/validation splitter from part 1 (even if
        # that means that it's not really k-fold CV since it will be a
        # different split each iteration), or implement something else.

        cur_accuracies = []

        for j_fold in range(num_folds):
            # Use the j-th chunk of indices as the validation set and
            # the remaining chunks as the training set. The last fold
            # absorbs the remainder so no sample is left out.
            split1 = fold_size * j_fold
            split2 = fold_size * (j_fold + 1) if j_fold < num_folds - 1 \
                else len(ds_train)

            train = indices[:split1] + indices[split2:]
            validation = indices[split1:split2]

            train_smp = sampler.SubsetRandomSampler(train)
            validation_smp = sampler.SubsetRandomSampler(validation)

            dl_train = torch.utils.data.DataLoader(ds_train,
                                                   shuffle=False,
                                                   sampler=train_smp)
            dl_validation = torch.utils.data.DataLoader(ds_train,
                                                        shuffle=False,
                                                        sampler=validation_smp)

            x_validation, y_validation = dataloader_utils.flatten(
                dl_validation)

            # train on the current training set
            model.train(dl_train)
            # evaluate current accuracy
            y_pred = model.predict(x_validation)
            cur_accuracies.append(accuracy(y_validation, y_pred))

        accuracies.append(cur_accuracies)

    best_k_idx = np.argmax([np.mean(acc) for acc in accuracies])
    best_k = k_choices[best_k_idx]

    return best_k, accuracies
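# Hypothetical final step once best_k is known: retrain on the full
# training set and evaluate on a held-out loader (dl_test is assumed).
final_model = KNNClassifier(k=best_k)
final_model.train(torch.utils.data.DataLoader(ds_train, batch_size=1024))
x_test, y_test = dataloader_utils.flatten(dl_test)
print("test accuracy:", accuracy(y_test, final_model.predict(x_test)))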