    def test_tensor(self):
        loader = DataLoader(TensorDataset(), batch_size=256)

        x, = dl_utils.flatten(loader)

        assert torch.is_tensor(x)
        assert x.shape == torch.Size([DATASET_SIZE, DATA_SIZE, DATA_SIZE])
def find_best_k(ds_train: Dataset, k_choices, num_folds):
    """
    Use cross validation to find the best K for the kNN model.

    :param ds_train: Training dataset.
    :param k_choices: A sequence of possible values of k for the kNN model.
    :param num_folds: Number of folds for cross-validation.
    :return: tuple (best_k, accuracies) where:
        best_k: the value of k with the highest mean accuracy across folds
        accuracies: The accuracies per fold for each k (list of lists).
    """

    accuracies = []

    for i, k in enumerate(k_choices):

        # TODO: Train model num_folds times with different train/val data.
        # Don't use any third-party libraries.
        # You can use your train/validation splitter from part 1 (even if
        # that means that it's not really k-fold CV since it will be a
        # different split each iteration), or implement something else.

        # ====== YOUR CODE: ======
        accuracies_for_k = []
        for j in range(num_folds):
            removed_offset = len(ds_train) * j // num_folds
            removed_len = min(
                len(ds_train) // num_folds,
                len(ds_train) - removed_offset)
            ds_actual_train = WithoutSubsetDataset(ds_train,
                                                   removed_len,
                                                   offset=removed_offset)
            ds_actual_valid = datasets.SubsetDataset(ds_train,
                                                     removed_len,
                                                     offset=removed_offset)

            knn_classifier = KNNClassifier(k=k)

            batch_size = 1024
            knn_classifier.train(
                torch.utils.data.DataLoader(ds_actual_train, batch_size))

            x_valid, y_valid = dataloader_utils.flatten(
                torch.utils.data.DataLoader(ds_actual_valid, batch_size))

            y_pred = knn_classifier.predict(x_valid)

            # Calculate accuracy
            valid_accuracy = accuracy(y_valid, y_pred)
            accuracies_for_k.append(valid_accuracy)

        accuracies.append(accuracies_for_k)
        # ========================

    best_k_idx = np.argmax([np.mean(acc) for acc in accuracies])
    best_k = k_choices[best_k_idx]

    return best_k, accuracies
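A minimal usage sketch of find_best_k (illustrative only: ds_train is an assumed labeled Dataset, and the k_choices/num_folds values are arbitrary placeholders, not values from the assignment):

# Hypothetical usage; assumes numpy is imported as np, as in the function above.
k_choices = [1, 3, 5, 10, 20, 50]
num_folds = 4

best_k, accuracies = find_best_k(ds_train, k_choices, num_folds)
print(f'best k = {best_k}')
for k, acc in zip(k_choices, accuracies):
    print(f'k={k:3d}: mean accuracy = {np.mean(acc):.3f}')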
    def test_two_tuple(self):
        loader = DataLoader(TensorTwoTupleDataset(), batch_size=256)

        x, y = dl_utils.flatten(loader)

        assert torch.is_tensor(x)
        assert torch.is_tensor(y)
        assert x.shape == torch.Size([DATASET_SIZE, DATA_SIZE, DATA_SIZE])
        assert y.shape == torch.Size([DATASET_SIZE, DATA_SIZE, 1])
    def test_three_tuple(self):
        loader = DataLoader(TensorThreeTupleDataset(), batch_size=128)

        x, y, z = dl_utils.flatten(loader)

        assert torch.is_tensor(x)
        assert torch.is_tensor(y)
        assert torch.is_tensor(z)
        assert x.shape == torch.Size([DATASET_SIZE, DATA_SIZE, DATA_SIZE])
        assert x.shape == y.shape
        assert z.shape == torch.Size([DATASET_SIZE, DATA_SIZE, 1])
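The tests above pin down the contract of dl_utils.flatten: it collects every batch from a loader and concatenates each tuple element along dimension 0, returning one tensor per element. A plausible minimal sketch of such a helper (an assumption for illustration, not the actual cs236605 implementation):

import torch

def flatten_sketch(loader):
    # Collect each tuple element from every batch, then concatenate along
    # the batch dimension. Non-tuple batches are wrapped in a one-tuple so
    # a plain-tensor dataset also works (as in test_tensor above).
    columns = None
    for batch in loader:
        if not isinstance(batch, (list, tuple)):
            batch = (batch,)
        if columns is None:
            columns = [[] for _ in batch]
        for parts, t in zip(columns, batch):
            parts.append(t)
    return tuple(torch.cat(parts, dim=0) for parts in columns)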
Example #5
def implementKNN(classifier: KNNClassifier, dl_train, dl_test):
    # Get all test data to predict in one go
    x_test, y_test = dataloader_utils.flatten(dl_test)

    # Test kNN Classifier
    classifier.train(dl_train)
    y_pred = classifier.predict(x_test)

    # Calculate accuracy
    acc = accuracy(y_test, y_pred)

    return acc
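A hedged usage example (dl_train and dl_test are assumed DataLoaders over a labeled dataset; k=3 is an arbitrary choice for illustration):

# Hypothetical call: the loaders and k value are placeholders.
acc = implementKNN(KNNClassifier(k=3), dl_train, dl_test)
print('kNN test accuracy:', acc)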
Example #6
    def train(self, dl_train: DataLoader):
        """
        Trains the KNN model. KNN training is memorizing the training data.
        Or, equivalently, the model parameters are the training data itself.
        :param dl_train: A DataLoader with labeled training samples (should
            return tuples).
        :return: self
        """

        x_train, y_train = dataloader_utils.flatten(dl_train)
        self.x_train = x_train
        self.y_train = y_train
        self.n_classes = len(set(y_train.numpy()))
        return self
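Since train only memorizes the data, a quick sanity sketch (an assumption for illustration: dl_train is an unshuffled labeled DataLoader) is that the stored parameters equal a direct flatten of the loader:

# Sanity sketch: with shuffle=False the flatten order is deterministic,
# so the memorized tensors should match a second flatten of the loader.
model = KNNClassifier(k=1).train(dl_train)
x_train, y_train = dataloader_utils.flatten(dl_train)
assert torch.equal(model.x_train, x_train)
assert torch.equal(model.y_train, y_train)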
Example #7
def find_best_k(ds_train: Dataset, k_choices, num_folds):
    """
    Use cross validation to find the best K for the kNN model.

    :param ds_train: Training dataset.
    :param k_choices: A sequence of possible values of k for the kNN model.
    :param num_folds: Number of folds for cross-validation.
    :return: tuple (best_k, accuracies) where:
        best_k: the value of k with the highest mean accuracy across folds
        accuracies: The accuracies per fold for each k (list of lists).
    """

    accuracies = []
    validation_ratio = 1 / num_folds

    for i, k in enumerate(k_choices):
        model = KNNClassifier(k)

        # TODO: Train model num_folds times with different train/val data.
        # Don't use any third-party libraries.
        # You can use your train/validation splitter from part 1 (even if
        # that means that it's not really k-fold CV since it will be a
        # different split each iteration), or implement something else.

        # ====== YOUR CODE: ======
        acc_model = []

        for _ in range(num_folds):
            dl_train, dl_valid = dataloaders.create_train_validation_loaders(
                ds_train, validation_ratio)
            x_valid, y_valid = dataloader_utils.flatten(dl_valid)
            model.train(dl_train)
            y_pred = model.predict(x_valid)

            # Calculate accuracy
            acc_model.append(accuracy(y_valid, y_pred))

        accuracies.append(acc_model)
        # ========================

    best_k_idx = np.argmax([np.mean(acc) for acc in accuracies])
    best_k = k_choices[best_k_idx]

    return best_k, accuracies
Example #8
mean_acc = 0
for (x, y) in dl_test:
    y_pred, _ = lin_cls.predict(x)
    mean_acc += lin_cls.evaluate_accuracy(y, y_pred)
mean_acc /= len(dl_test)

print(f"Accuracy: {mean_acc:.1f}%")

import cs236605.dataloader_utils as dl_utils
from hw1.losses import SVMHingeLoss

# Create a hinge-loss function
loss_fn = SVMHingeLoss(delta=1)

# Classify all samples in the test set (because it doesn't depend on initialization)
x, y = dl_utils.flatten(dl_test)
y_pred, x_scores = lin_cls.predict(x)
loss = loss_fn(x, y, x_scores, y_pred)

# Compare to pre-computed expected value as a test
expected_loss = 8.9579
print("loss =", loss.item())
print('diff =', abs(loss.item()-expected_loss))
test.assertAlmostEqual(loss.item(), expected_loss, delta=1e-1)

from hw1.losses import SVMHingeLoss

# Create a hinge-loss function
loss_fn = SVMHingeLoss(delta=1.)

# Compute loss and gradient (reusing the same calls as the previous cell)
x, y = dl_utils.flatten(dl_test)
y_pred, x_scores = lin_cls.predict(x)
loss = loss_fn(x, y, x_scores, y_pred)
grad = loss_fn.grad()
Example #9
    def train(self,
              dl_train: DataLoader,
              dl_valid: DataLoader,
              loss_fn: ClassifierLoss,
              learn_rate=0.1,
              weight_decay=0.001,
              max_epochs=100):

        Result = namedtuple('Result', 'accuracy loss')
        train_res = Result(accuracy=[], loss=[])
        valid_res = Result(accuracy=[], loss=[])

        print('Training', end='')
        for epoch_idx in range(max_epochs):

            # TODO: Implement model training loop.
            # At each epoch, evaluate the model on the entire training set
            # (batch by batch) and update the weights.
            # Each epoch, also evaluate on the validation set.
            # Accumulate average loss and total accuracy for both sets.
            # The train/valid_res variables should hold the average loss and
            # accuracy per epoch.
            #
            # Don't forget to add a regularization term to the loss, using the
            # weight_decay parameter.

            total_correct = 0
            average_loss = 0

            # ====== YOUR CODE: ======
            import cs236605.dataloader_utils as dataloader_utils

            # Iterate through the train batches and do a GD step for each one
            num_of_batches = 0
            for (x_train, y_train) in dl_train:
                num_of_batches += 1

                # Calc batch loss and accuracy and accumulate them.
                y_predicted, x_scores = self.predict(x_train)
                batch_accuracy = self.evaluate_accuracy(y_train, y_predicted)
                # Include the L2 regularization term in the reported loss,
                # matching the weight_decay used in the gradient step below.
                w_norm = torch.sum(self.weights ** 2)
                batch_loss = loss_fn.loss(x_train, y_train, x_scores,
                                          y_predicted) + weight_decay / 2 * w_norm
                average_loss += batch_loss
                total_correct += batch_accuracy

                # Compute the loss gradient, add the L2 regularization
                # gradient (weight_decay * W, the derivative of
                # (weight_decay/2) * ||W||^2), and take a GD step.
                loss_grad = loss_fn.grad() + weight_decay * self.weights
                self.weights = self.weights - learn_rate * loss_grad

            # Calculate accuracy and loss on validation set
            x_valid, y_valid = dataloader_utils.flatten(dl_valid)
            y_predicted_valid, x_scores_valid = self.predict(x_valid)
            accuracy_valid = self.evaluate_accuracy(y_valid, y_predicted_valid)
            valid_loss = loss_fn.loss(x_valid, y_valid, x_scores_valid,
                                      y_predicted_valid)

            # Calc avg loss and acc across all train batches.
            # Append train/valid loss and acc to lists.
            average_loss = average_loss / num_of_batches
            total_correct = total_correct / num_of_batches
            train_res.loss.append(average_loss)
            train_res.accuracy.append(total_correct)
            valid_res.loss.append(valid_loss)
            valid_res.accuracy.append(accuracy_valid)
            # ========================
            print('.', end='')

        print('')
        return train_res, valid_res
Example #10
def find_best_k(ds_train: Dataset, k_choices, num_folds):
    """
    Use cross validation to find the best K for the kNN model.

    :param ds_train: Training dataset.
    :param k_choices: A sequence of possible values of k for the kNN model.
    :param num_folds: Number of folds for cross-validation.
    :return: tuple (best_k, accuracies) where:
        best_k: the value of k with the highest mean accuracy across folds
        accuracies: The accuracies per fold for each k (list of lists).
    """

    accuracies = []

    for i, k in enumerate(k_choices):

        # TODO: Train model num_folds times with different train/val data.
        # Don't use any third-party libraries.
        # You can use your train/validation splitter from part 1 (even if
        # that means that it's not really k-fold CV since it will be a
        # different split each iteration), or implement something else.

        # ====== YOUR CODE: ======
        # Precompute the starting index of each fold to simplify the
        # train/validation splits below.
        n_samples = ds_train.subset_len
        fold_size = n_samples // num_folds

        fold_starting_inds = np.array(
            [j * fold_size for j in range(num_folds)])

        acc = []
        for j in range(num_folds):
            # Separate the training & validation folds
            if j == 0:
                train_inds = np.arange(fold_size, n_samples)
                valid_inds = np.arange(fold_size)

            elif j == num_folds - 1:
                train_inds = np.arange((n_samples - fold_size))
                valid_inds = np.arange((n_samples - fold_size), n_samples)

            else:
                train_inds_1 = np.arange(0, fold_starting_inds[j])
                train_inds_2 = np.arange((fold_starting_inds[j] + fold_size),
                                         n_samples)

                train_inds = np.concatenate((train_inds_1, train_inds_2))
                valid_inds = np.arange(fold_starting_inds[j],
                                       (fold_starting_inds[j] + fold_size))

            train_samp = SubsetRandomSampler(train_inds.tolist())
            valid_samp = SubsetRandomSampler(valid_inds.tolist())

            dl_train = torch.utils.data.DataLoader(ds_train,
                                                   batch_size=100,
                                                   shuffle=False,
                                                   num_workers=1,
                                                   sampler=train_samp)

            dl_valid = torch.utils.data.DataLoader(ds_train,
                                                   batch_size=100,
                                                   shuffle=False,
                                                   num_workers=1,
                                                   sampler=valid_samp)

            # Train & calculate the accuracy on the current fold split
            knn_classifier = KNNClassifier(k=k)
            knn_classifier.train(dl_train)

            x_valid, y_valid = dataloader_utils.flatten(dl_valid)
            y_pred = knn_classifier.predict(x_valid)

            # Calculate accuracy
            tmp_acc = accuracy(y_valid, y_pred)

            acc.append(tmp_acc)

        accuracies.append(acc)
        # ========================

    best_k_idx = np.argmax([np.mean(acc) for acc in accuracies])
    best_k = k_choices[best_k_idx]

    return best_k, accuracies
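The three-way branch above just removes one contiguous fold from the index range. A compact equivalent using np.setdiff1d (illustrative only, on a toy size) prints the resulting splits:

import numpy as np

# Toy illustration of the fold indexing above: n_samples=10, num_folds=5.
n_samples, num_folds = 10, 5
fold_size = n_samples // num_folds
for j in range(num_folds):
    valid_inds = np.arange(j * fold_size, (j + 1) * fold_size)
    train_inds = np.setdiff1d(np.arange(n_samples), valid_inds)
    print(f'fold {j}: valid={valid_inds}, train={train_inds}')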
Example #11
def find_best_k(ds_train: Dataset, k_choices, num_folds):
    """
    Use cross validation to find the best K for the kNN model.

    :param ds_train: Training dataset.
    :param k_choices: A sequence of possible values of k for the kNN model.
    :param num_folds: Number of folds for cross-validation.
    :return: tuple (best_k, accuracies) where:
        best_k: the value of k with the highest mean accuracy across folds
        accuracies: The accuracies per fold for each k (list of lists).
    """

    accuracies = []

    for i, k in enumerate(k_choices):
        model = KNNClassifier(k)

        # TODO: Train model num_folds times with different train/val data.
        # Don't use any third-party libraries.
        # You can use your train/validation splitter from part 1 (even if
        # that means that it's not really k-fold CV since it will be a
        # different split each iteration), or implement something else.

        # ====== YOUR CODE: ======

        # Hold out 1/num_folds of the data for validation in each fold.
        validation_ratio = 1 / num_folds

        len_valid = int(len(ds_train) * validation_ratio)
        len_train = len(ds_train) - len_valid

        accuracy_fold_list = list()
        # we need to split and train num_folds times
        for fold in range(num_folds):

            temp_ds_train, temp_ds_valid = torch.utils.data.random_split(
                ds_train, [len_train, len_valid])

            dl_train = torch.utils.data.DataLoader(temp_ds_train, shuffle=True)
            dl_valid = torch.utils.data.DataLoader(temp_ds_valid, shuffle=True)

            # now we need to train the model
            model.train(dl_train)
            # now validate:
            # predict validation data

            # get the ground-truth labels; separate the data from the labels
            data_valid, labels_valid = dataloader_utils.flatten(dl_valid)

            pred = model.predict(data_valid)
            curr_accuracy = accuracy(labels_valid, pred)
            accuracy_fold_list.append(curr_accuracy)

        accuracies.append(accuracy_fold_list)
        # ========================

    best_k_idx = np.argmax([np.mean(acc) for acc in accuracies])
    best_k = k_choices[best_k_idx]

    return best_k, accuracies
Example #12
    def train(self,
              dl_train: DataLoader,
              dl_valid: DataLoader,
              loss_fn: ClassifierLoss,
              learn_rate=0.1,
              weight_decay=0.001,
              max_epochs=100):

        Result = namedtuple('Result', 'accuracy loss')
        train_res = Result(accuracy=[], loss=[])
        valid_res = Result(accuracy=[], loss=[])

        print('Training', end='')
        for epoch_idx in range(max_epochs):

            # TODO: Implement model training loop.
            # At each epoch, evaluate the model on the entire training set
            # (batch by batch) and update the weights.
            # Each epoch, also evaluate on the validation set.
            # Accumulate average loss and total accuracy for both sets.
            # The train/valid_res variables should hold the average loss and
            # accuracy per epoch.
            #
            # Don't forget to add a regularization term to the loss, using the
            # weight_decay parameter.

            total_correct = 0
            average_loss = 0

            # ====== YOUR CODE: ======

            train_loss = 0
            train_accuracy = 0
            n_samples_total = 0

            for idx, (x_train, y_train) in enumerate(dl_train):
                y_train_pred, train_class_scores = self.predict(x_train)

                w_norm = torch.sum(torch.mul(self.weights,
                                             self.weights)).item()
                train_loss_batch = loss_fn(
                    x_train, y_train, train_class_scores,
                    y_train_pred).item() + weight_decay / 2.0 * w_norm
                train_accuracy_batch = self.evaluate_accuracy(
                    y_train, y_train_pred)

                n_samples_total += x_train.shape[0]
                train_loss += train_loss_batch * float(x_train.shape[0])
                train_accuracy += train_accuracy_batch * float(
                    x_train.shape[0])

                grad = loss_fn.grad() + weight_decay * self.weights
                self.weights -= learn_rate * grad

            train_loss /= n_samples_total
            train_accuracy /= n_samples_total
            train_res.loss.append(train_loss)
            train_res.accuracy.append(train_accuracy)

            print('Epoch', epoch_idx, 'training loss', train_loss,
                  'training accuracy', train_accuracy)

            x_valid, y_valid = dl_utils.flatten(dl_valid)
            y_valid_pred, valid_class_scores = self.predict(x_valid)

            valid_loss = loss_fn(x_valid, y_valid, valid_class_scores,
                                 y_valid_pred).item()
            valid_accuracy = self.evaluate_accuracy(y_valid, y_valid_pred)

            valid_res.loss.append(valid_loss)
            valid_res.accuracy.append(valid_accuracy)

            # ========================
            print('.', end='')

        print('')
        return train_res, valid_res
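A hypothetical end-to-end call (lin_cls, dl_train, dl_valid and SVMHingeLoss are assumed to exist as in the earlier snippets; the hyperparameter values are placeholders):

# Illustrative training run; all names and values here are assumptions.
train_res, valid_res = lin_cls.train(dl_train, dl_valid,
                                     loss_fn=SVMHingeLoss(delta=1.),
                                     learn_rate=0.1, weight_decay=0.001,
                                     max_epochs=30)
print('final validation accuracy:', valid_res.accuracy[-1])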