Exemplo n.º 1
0
    def fit(self, dl_train: DataLoader, dl_test: DataLoader,
            num_epochs, checkpoints: str = None,
            early_stopping: int = None,
            print_every=1, **kw) -> FitResult:
        """
        Trains the model for multiple epochs with a given training set,
        and evaluates it on a given test set after every epoch.
        :param dl_train: Dataloader for the training set.
        :param dl_test: Dataloader for the test set.
        :param num_epochs: Maximal number of epochs to train for.
        :param checkpoints: If not None, the model is saved under this name
            every time the test set accuracy improves.
        :param early_stopping: If not None, stop training once the test
            accuracy has not improved for this many consecutive epochs.
        :param print_every: Print progress every this number of epochs.
        :param kw: Extra keyword arguments forwarded to train_epoch and
            test_epoch.
        :return: A FitResult object containing the number of epochs that
            were actually run and the per-epoch train/test losses and
            accuracies.
        """
        actual_num_epochs = 0
        train_loss, train_acc, test_loss, test_acc = [], [], [], []

        best_acc = None
        epochs_without_improvement = 0

        for epoch in range(num_epochs):
            # Be verbose on every print_every-th epoch and on the last one.
            verbose = epoch % print_every == 0 or epoch == num_epochs - 1
            self._print(f'--- EPOCH {epoch+1}/{num_epochs} ---', verbose)

            # Forward the verbosity flag; an explicit 'verbose' supplied by
            # the caller through **kw still takes precedence.
            epoch_kw = {'verbose': verbose, **kw}

            losses, acc = self.train_epoch(dl_train, **epoch_kw)
            train_loss.append(average(losses))
            train_acc.append(acc)

            losses, acc = self.test_epoch(dl_test, **epoch_kw)
            test_loss.append(average(losses))
            test_acc.append(acc)

            # This epoch is complete and recorded, so count it.
            actual_num_epochs += 1

            if best_acc is None or acc > best_acc:
                best_acc = acc
                epochs_without_improvement = 0
                if checkpoints is not None:
                    # NOTE(review): relies on the model object exposing a
                    # save() method - confirm; a plain torch.nn.Module does
                    # not provide one.
                    self.model.save(checkpoints)
            else:
                epochs_without_improvement += 1
                # Stop only once the patience budget is exhausted, not on
                # the very first epoch without improvement.
                if (early_stopping is not None
                        and epochs_without_improvement >= early_stopping):
                    break

        return FitResult(actual_num_epochs,
                         train_loss, train_acc, test_loss, test_acc)
Exemplo n.º 2
0
def load_experiment(filename):
    """
    Read a saved experiment from a JSON file.
    :param filename: Path of the JSON file to read. It must hold a 'config'
        entry and a 'results' entry whose items are the keyword arguments
        of FitResult.
    :return: A tuple (config, fit_res) with the stored configuration and
        the FitResult rebuilt from the stored results.
    """
    with open(filename, 'r') as fp:
        payload = json.load(fp)

    return payload['config'], FitResult(**payload['results'])
Exemplo n.º 3
0
    def fit(self,
            dl_train: DataLoader,
            dl_test: DataLoader,
            num_epochs,
            checkpoints: str = None,
            early_stopping: int = None,
            print_every=1,
            post_epoch_fn=None,
            **kw) -> FitResult:
        """
        Trains the model for multiple epochs with a given training set,
        and evaluates it on a given test set after every epoch.
        :param dl_train: Dataloader for the training set.
        :param dl_test: Dataloader for the test set.
        :param num_epochs: Maximal number of epochs to train for.
        :param checkpoints: If not None, a filename without extension. The
            model state is loaded from '<checkpoints>.pt' if that file
            exists, and saved there whenever the test loss improves.
        :param early_stopping: If not None, stop training once the test
            loss has not improved (by more than a fixed margin) for this
            many consecutive epochs.
        :param print_every: Print progress every this number of epochs.
        :param post_epoch_fn: A function to call after each epoch completes.
            It receives (epoch, train_result, test_result, verbose).
        :param kw: Extra keyword arguments forwarded to train_epoch and
            test_epoch.
        :return: A FitResult object containing the number of epochs that
            were actually run, the recorded train/test losses and the
            per-epoch accuracies.
        """
        actual_num_epochs = 0
        train_loss, train_acc, test_loss, test_acc = [], [], [], []

        best_acc = None
        epochs_without_improvement = 0
        low_loss = None
        # The test loss must drop by more than this margin below the best
        # loss seen so far to count as an improvement.
        min_delta = 0.05

        checkpoint_filename = None
        if checkpoints is not None:
            checkpoint_filename = f'{checkpoints}.pt'
            # parents=True so that nested checkpoint directories work too.
            Path(os.path.dirname(checkpoint_filename)).mkdir(parents=True,
                                                             exist_ok=True)
            if os.path.isfile(checkpoint_filename):
                print(f'*** Loading checkpoint file {checkpoint_filename}')
                saved_state = torch.load(checkpoint_filename,
                                         map_location=self.device)
                best_acc = saved_state.get('best_acc', best_acc)
                epochs_without_improvement =\
                    saved_state.get('ewi', epochs_without_improvement)
                self.model.load_state_dict(saved_state['model_state'])

        for epoch in range(num_epochs):
            save_checkpoint = False
            # Be verbose on every print_every-th epoch and on the last one.
            verbose = epoch % print_every == 0 or epoch == num_epochs - 1
            self._print(f'--- EPOCH {epoch+1}/{num_epochs} ---', verbose)

            # Forward the verbosity flag; an explicit 'verbose' supplied by
            # the caller through **kw still takes precedence.
            epoch_kw = {'verbose': verbose, **kw}

            train_result = self.train_epoch(dl_train, **epoch_kw)
            train_loss.extend(train_result.losses)
            train_acc.append(train_result.accuracy)

            test_result = self.test_epoch(dl_test, **epoch_kw)
            test_loss.extend(test_result.losses)
            test_acc.append(test_result.accuracy)

            # This epoch is complete and recorded, so count it.
            actual_num_epochs += 1

            # NOTE(review): test_loss[-1] is the last loss value recorded
            # for this epoch, not an epoch average - confirm this is the
            # intended quantity to monitor.
            if low_loss is None:
                # First epoch of this run: use it as the baseline.
                best_acc = test_acc[-1]
                low_loss = test_loss[-1]
                save_checkpoint = True
            elif test_loss[-1] + min_delta > low_loss:
                # Loss did not drop by more than min_delta: no improvement.
                epochs_without_improvement += 1
            else:
                epochs_without_improvement = 0
                low_loss = test_loss[-1]
                best_acc = test_acc[-1]
                save_checkpoint = True

            # Early stopping is optional; comparing against None would
            # raise a TypeError, so check for it explicitly.
            if (early_stopping is not None
                    and epochs_without_improvement >= early_stopping):
                break

            # Save model checkpoint if requested
            if save_checkpoint and checkpoint_filename is not None:
                saved_state = dict(best_acc=best_acc,
                                   ewi=epochs_without_improvement,
                                   model_state=self.model.state_dict())
                torch.save(saved_state, checkpoint_filename)
                print(f'*** Saved checkpoint {checkpoint_filename} '
                      f'at epoch {epoch+1}')

            if post_epoch_fn:
                post_epoch_fn(epoch, train_result, test_result, verbose)

        return FitResult(actual_num_epochs, train_loss, train_acc, test_loss,
                         test_acc)
Exemplo n.º 4
0
    def fit(self,
            dl_train: DataLoader,
            dl_test: DataLoader,
            num_epochs,
            checkpoints: str = None,
            early_stopping: int = None,
            print_every=1,
            post_epoch_fn=None,
            **kw) -> FitResult:
        """
        Trains the model for multiple epochs with a given training set,
        and evaluates it on a given test set after every epoch.
        :param dl_train: Dataloader for the training set.
        :param dl_test: Dataloader for the test set.
        :param num_epochs: Maximal number of epochs to train for.
        :param checkpoints: If not None, a filename without extension. The
            model state is loaded from '<checkpoints>.pt' if that file
            exists, and saved there every time the test set accuracy
            improves.
        :param early_stopping: If a positive number, stop training once the
            test accuracy has not improved for this many consecutive epochs.
        :param print_every: Print progress every this number of epochs.
        :param post_epoch_fn: A function to call after each epoch completes.
            It receives (epoch, train_result, test_result, verbose).
        :param kw: Extra keyword arguments forwarded to train_epoch and
            test_epoch.
        :return: A FitResult object containing the number of epochs that
            were actually run, the recorded train/test losses and the
            per-epoch accuracies.
        """
        actual_num_epochs = 0
        train_loss, train_acc, test_loss, test_acc = [], [], [], []

        best_acc = None
        epochs_without_improvement = 0

        checkpoint_filename = None
        if checkpoints is not None:
            checkpoint_filename = f'{checkpoints}.pt'
            # parents=True so that nested checkpoint directories work too.
            Path(os.path.dirname(checkpoint_filename)).mkdir(parents=True,
                                                             exist_ok=True)
            if os.path.isfile(checkpoint_filename):
                print(f'*** Loading checkpoint file {checkpoint_filename}')
                saved_state = torch.load(checkpoint_filename,
                                         map_location=self.device)
                best_acc = saved_state.get('best_acc', best_acc)
                epochs_without_improvement =\
                    saved_state.get('ewi', epochs_without_improvement)
                self.model.load_state_dict(saved_state['model_state'])

        for epoch in range(num_epochs):
            save_checkpoint = False
            # Be verbose on every print_every-th epoch and on the last one.
            verbose = epoch % print_every == 0 or epoch == num_epochs - 1
            self._print(f'--- EPOCH {epoch+1}/{num_epochs} ---', verbose)

            # Forward the verbosity flag and the caller's extra arguments
            # (they were previously dropped); an explicit 'verbose' in **kw
            # takes precedence.
            epoch_kw = {'verbose': verbose, **kw}

            epoch_train_loss, epoch_train_acc = \
                self.train_epoch(dl_train, **epoch_kw)
            train_loss.extend(epoch_train_loss)
            train_acc.append(epoch_train_acc)

            epoch_test_loss, epoch_test_acc = \
                self.test_epoch(dl_test, **epoch_kw)
            test_loss.extend(epoch_test_loss)
            test_acc.append(epoch_test_acc)
            actual_num_epochs += 1

            if best_acc is None or best_acc < epoch_test_acc:
                best_acc = epoch_test_acc
                epochs_without_improvement = 0
                # Only an improved model may overwrite the checkpoint;
                # saving unconditionally would replace the best model with
                # a worse one.
                save_checkpoint = True
            else:
                epochs_without_improvement += 1

            if early_stopping is not None and early_stopping > 0:
                if epochs_without_improvement >= early_stopping:
                    break

            # Re-pack the per-epoch results for the post-epoch callback.
            train_result = EpochResult(losses=epoch_train_loss,
                                       accuracy=epoch_train_acc)
            test_result = EpochResult(losses=epoch_test_loss,
                                      accuracy=epoch_test_acc)

            # Save model checkpoint if requested
            if save_checkpoint and checkpoint_filename is not None:
                saved_state = dict(best_acc=best_acc,
                                   ewi=epochs_without_improvement,
                                   model_state=self.model.state_dict())
                torch.save(saved_state, checkpoint_filename)
                print(f'*** Saved checkpoint {checkpoint_filename} '
                      f'at epoch {epoch+1}')

            if post_epoch_fn:
                post_epoch_fn(epoch, train_result, test_result, verbose)

        return FitResult(actual_num_epochs, train_loss, train_acc, test_loss,
                         test_acc)
Exemplo n.º 5
0
    def fit(self,
            dl_train: DataLoader,
            dl_test: DataLoader,
            num_epochs,
            checkpoints: str = None,
            early_stopping: int = None,
            print_every=1,
            **kw) -> FitResult:
        """
        Trains the model for multiple epochs with a given training set,
        and evaluates it on a given test set after every epoch.
        :param dl_train: Dataloader for the training set.
        :param dl_test: Dataloader for the test set.
        :param num_epochs: Maximal number of epochs to train for.
        :param checkpoints: Accepted for interface compatibility; this
            implementation does not save checkpoints.
        :param early_stopping: If not None, stop training once the mean
            test loss has not improved on the best value seen so far for
            this many consecutive epochs.
        :param print_every: Print progress every this number of epochs.
        :param kw: Extra keyword arguments; only 'max_batches' is read and
            forwarded to train_epoch and test_epoch.
        :return: A FitResult object containing the number of epochs that
            were actually run and the per-epoch mean train/test losses and
            accuracies.
        """
        actual_num_epochs = 0
        train_loss, train_acc, test_loss, test_acc = [], [], [], []

        # Lowest mean test loss seen so far; the early-stopping reference.
        best_loss = None
        epochs_without_improvement = 0

        # Loop-invariant: None when the caller did not limit the batches.
        batches = kw.get("max_batches")

        for epoch in range(num_epochs):
            # Be verbose on every print_every-th epoch and on the last one.
            verbose = epoch % print_every == 0 or epoch == num_epochs - 1
            self._print(f'--- EPOCH {epoch+1}/{num_epochs} ---', verbose)

            actual_num_epochs += 1
            train_res = self.train_epoch(dl_train,
                                         verbose=verbose,
                                         max_batches=batches)
            test_res = self.test_epoch(dl_test,
                                       verbose=verbose,
                                       max_batches=batches)

            # Record the mean loss over the batches of this epoch.
            train_loss.append(sum(train_res.losses) / len(train_res.losses))
            train_acc.append(train_res.accuracy)
            test_loss.append(sum(test_res.losses) / len(test_res.losses))
            test_acc.append(test_res.accuracy)

            # Compare against the best loss so far rather than the previous
            # epoch: an oscillating loss that never beats the best must not
            # keep resetting the patience counter.
            if best_loss is None or test_loss[-1] < best_loss:
                best_loss = test_loss[-1]
                epochs_without_improvement = 0
            else:
                epochs_without_improvement += 1
                if (early_stopping is not None
                        and epochs_without_improvement >= early_stopping):
                    break

        return FitResult(actual_num_epochs, train_loss, train_acc, test_loss,
                         test_acc)
Exemplo n.º 6
0
def run_experiment(
        run_name,
        out_dir='./results',
        seed=None,
        # Training params
        bs_train=128,
        bs_test=None,
        batches=100,
        epochs=100,
        early_stopping=3,
        checkpoints=None,
        lr=1e-3,
        reg=1e-3,
        # Model params
        filters_per_layer=[64],
        layers_per_block=2,
        pool_every=2,
        hidden_dims=[1024],
        ycn=False,
        **kw):
    """
    Execute a single run of experiment 1 with a single configuration.
    :param run_name: The name of the run and output file to create.
    :param out_dir: Where to write the output to.
    :param seed: Seed for torch's random number generator. A random seed is
        drawn when None.
    :param bs_train: Batch size of the training data loader.
    :param bs_test: Batch size of the test data loader. Defaults to a
        quarter of bs_train (at least 1).
    :param batches: Maximal number of batches per train/test epoch.
    :param epochs: Maximal number of epochs to train for.
    :param early_stopping: If not None, stop training once the mean test
        loss has not improved on the best value seen so far for this many
        consecutive epochs.
    :param checkpoints: If not None, path to which the model state dict is
        saved after every epoch.
    :param lr: Learning rate of the SGD optimizer.
    :param reg: Weight decay of the SGD optimizer.
    :param filters_per_layer: Number of filters of the conv layers; each
        entry is repeated layers_per_block times.
    :param layers_per_block: How many consecutive conv layers use each
        entry of filters_per_layer.
    :param pool_every: Passed to the model as pool_every.
    :param hidden_dims: Passed to the model as hidden_dims.
    :param ycn: Use models.YourCodeNet instead of models.ConvClassifier.
    :param kw: Extra keyword arguments forwarded to the trainer's
        train_epoch and test_epoch.
    """
    # Only draw a random seed when none was given; an explicit seed of 0
    # is a valid choice and must be respected.
    if seed is None:
        seed = random.randint(0, 2**31)
    torch.manual_seed(seed)
    if not bs_test:
        bs_test = max([bs_train // 4, 1])
    # Snapshot of the run configuration; taken before any further locals
    # are created so that it holds the parameters only.
    cfg = locals()

    tf = torchvision.transforms.ToTensor()
    ds_train = CIFAR10(root=DATA_DIR, download=True, train=True, transform=tf)
    ds_test = CIFAR10(root=DATA_DIR, download=True, train=False, transform=tf)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Select model class (experiment 1 or 2)
    model_cls = models.ConvClassifier if not ycn else models.YourCodeNet

    # Repeat every filter count once per layer of its block.
    filters_per_block = [
        filters
        for filters in filters_per_layer
        for _ in range(layers_per_block)
    ]

    model = model_cls((3, 32, 32),
                      10,
                      filters=filters_per_block,
                      pool_every=pool_every,
                      hidden_dims=hidden_dims)
    loss_fn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=lr,
                                momentum=0.9,
                                weight_decay=reg)

    dl_train = torch.utils.data.DataLoader(ds_train, bs_train, shuffle=True)
    dl_test = torch.utils.data.DataLoader(ds_test, bs_test, shuffle=True)

    trainer = training.TorchTrainer(model, loss_fn, optimizer, device)

    actual_num_epochs = 0
    train_loss, train_acc, test_loss, test_acc = [], [], [], []

    # Lowest mean test loss seen so far; the early-stopping reference.
    best_loss = None
    epochs_without_improvement = 0

    for epoch in range(epochs):
        epoch_res_train = trainer.train_epoch(dl_train,
                                              max_batches=batches,
                                              **kw)
        epoch_res_test = trainer.test_epoch(dl_test, max_batches=batches, **kw)

        train_losses_mean = sum(epoch_res_train.losses) / len(
            epoch_res_train.losses)
        test_losses_mean = sum(epoch_res_test.losses) / len(
            epoch_res_test.losses)
        actual_num_epochs += 1

        train_loss.append(train_losses_mean)
        train_acc.append(epoch_res_train.accuracy)
        test_loss.append(test_losses_mean)
        test_acc.append(epoch_res_test.accuracy)

        if checkpoints is not None:
            torch.save(model.state_dict(), checkpoints)

        # Track the patience counter; without this update the early
        # stopping condition below could never become true.
        if best_loss is None or test_losses_mean < best_loss:
            best_loss = test_losses_mean
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1

        if (early_stopping is not None
                and epochs_without_improvement >= early_stopping):
            break

    fit_res = FitResult(actual_num_epochs, train_loss, train_acc, test_loss,
                        test_acc)

    save_experiment(run_name, out_dir, cfg, fit_res)
Exemplo n.º 7
0
    def fit(self, dl_train: DataLoader, dl_test: DataLoader,
            num_epochs, checkpoints: str = None,
            early_stopping: int = None,
            print_every=1, **kw) -> FitResult:
        """
        Trains the model for multiple epochs with a given training set,
        and evaluates it on a given test set after every epoch.
        :param dl_train: Dataloader for the training set.
        :param dl_test: Dataloader for the test set.
        :param num_epochs: Maximal number of epochs to train for.
        :param checkpoints: If not None, path of the file to which the
            model is saved every time the test set accuracy improves.
        :param early_stopping: If not None, stop training once the test
            accuracy has not improved for this many consecutive epochs.
        :param print_every: Print progress every this number of epochs.
        :param kw: Extra keyword arguments forwarded to train_epoch and
            test_epoch.
        :return: A FitResult object containing the number of epochs that
            were actually run, the recorded train/test losses and the
            per-epoch accuracies.
        """
        actual_num_epochs = 0
        train_loss, train_acc, test_loss, test_acc = [], [], [], []

        best_acc = None
        epochs_without_improvement = 0

        for epoch in range(num_epochs):
            # Be verbose on every print_every-th epoch and on the last one.
            verbose = epoch % print_every == 0 or epoch == num_epochs - 1
            self._print(f'--- EPOCH {epoch+1}/{num_epochs} ---', verbose)

            # Forward the verbosity flag and the caller's extra arguments
            # (they were previously dropped); an explicit 'verbose' in **kw
            # takes precedence.
            epoch_kw = {'verbose': verbose, **kw}

            # Train the model for one epoch and record the results.
            train_res = self.train_epoch(dl_train=dl_train, **epoch_kw)
            train_loss += train_res.losses
            train_acc.append(train_res.accuracy)

            # Evaluate the model and record the results.
            test_res = self.test_epoch(dl_test=dl_test, **epoch_kw)
            test_loss += test_res.losses
            test_acc.append(test_res.accuracy)

            # This epoch is complete and recorded, so count it.
            actual_num_epochs += 1

            # NaN is the only value that differs from itself.
            if test_loss[-1] != test_loss[-1]:
                print("Loss is NaN.\nBreaking training.")
                break

            if best_acc is None or test_res.accuracy > best_acc:
                best_acc = test_res.accuracy
                epochs_without_improvement = 0
                save_checkpoint = True
            else:
                epochs_without_improvement += 1
                save_checkpoint = False

            # Save the model whenever the test accuracy improved. The
            # context manager closes the file even if torch.save raises.
            if checkpoints is not None and save_checkpoint:
                with open(checkpoints, 'wb') as file:
                    torch.save(obj=self.model, f=file, pickle_protocol=3)

            # Stop once there was no improvement for the last
            # 'early_stopping' epochs.
            if (early_stopping is not None
                    and epochs_without_improvement >= early_stopping):
                print("Reached the Early Stop condition.\nStopping the training.")
                break

        return FitResult(actual_num_epochs,
                         train_loss, train_acc, test_loss, test_acc)