Example #1
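    # With batch_sampler set, PyTorch's DataLoader forbids passing batch_size,
    # shuffle, sampler, or drop_last, so the train loader omits them.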
    train_loader = AudioDataLoader(train_dataset,
                                   num_workers=args.num_workers, batch_sampler=train_sampler)
    test_loader = AudioDataLoader(test_dataset, batch_size=args.batch_size,
                                  num_workers=args.num_workers, shuffle=True)



    if (not args.no_shuffle and start_epoch != 0) or args.no_sorta_grad:
        print("Shuffling batches for the following epochs")
        train_sampler.shuffle(start_epoch)

    if args.tensorboard and generate_graph:  # TODO: log some audio samples as well
        with torch.no_grad():  # not sure it's needed, but just in case
            inputs, targets, input_percentages, target_sizes = next(iter(train_loader))
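            # input_percentages holds each sample's fraction of the padded time
            # axis (dim 3); multiplying by the padded length recovers the true
            # sequence lengths.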
            input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
            tensorboard_logger.add_image(inputs, input_sizes, targets, network=model)  # add_graph fails if the model is on the GPU

    if freeze_conv:
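        # Fine-tuning setup: freeze the whole conv front-end first, then
        # re-enable its batch norm layers so their running statistics keep
        # adapting to the new data.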
        model.conv.requires_grad_(requires_grad=False)
        # Keep the batch norm layers trainable so their running statistics keep updating
        model.conv.seq_module[1].requires_grad_(requires_grad=True)
        model.conv.seq_module[4].requires_grad_(requires_grad=True)

    if freeze_rnns:
        model.rnns.requires_grad_(requires_grad=False)
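        # The loop starts at 1 because the first RNN layer is assumed to have
        # no batch_norm module in this architecture.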
        for i in range(1, len(model.rnns)):
            model.rnns[i].batch_norm.requires_grad_(requires_grad=True)

    if remove_bn_conv:
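        # Swapping in Identity() removes the batch norm layers without
        # shifting the indices of the remaining modules in seq_module.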
        model.conv.seq_module[1] = Identity()
        model.conv.seq_module[4] = Identity()
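
As a quick sanity check after freezing (a minimal sketch; model is the network configured above):

    frozen = [name for name, p in model.named_parameters() if not p.requires_grad]
    print('%d parameter tensors are frozen' % len(frozen))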
Example #2
    def train(self, **kwargs):
        """
        Run optimization to train the model.

        Parameters
        ----------
        world_size : int, optional
            Number of distributed processes (default: 1).
        gpu_rank : int, optional
            GPU index this process should use (default: 0).
        rank : int, optional
            Global rank of this process; rank 0 saves models (default: 0).
        dist_backend : str, optional
            torch.distributed backend to use (default: 'nccl').
        dist_url : str, optional
            URL used to initialize the process group (default: None).
        """
        world_size = kwargs.pop('world_size', 1)
        gpu_rank = kwargs.pop('gpu_rank', 0)
        rank = kwargs.pop('rank', 0)
        dist_backend = kwargs.pop('dist_backend', 'nccl')
        dist_url = kwargs.pop('dist_url', None)

        os.environ['MASTER_ADDR'] = '127.0.0.1'
        os.environ['MASTER_PORT'] = '1234'
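        # The address and port above form the rendezvous point that
        # torch.distributed uses to coordinate processes; hardcoded here for
        # single-node runs.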

        main_proc = True
        self.distributed = world_size > 1

        if self.distributed:
            if gpu_rank:
                torch.cuda.set_device(int(gpu_rank))
            dist.init_process_group(backend=dist_backend,
                                    init_method=dist_url,
                                    world_size=world_size,
                                    rank=rank)
            print('Initiated process group')
            main_proc = rank == 0  # Only the first proc should save models

        if main_proc and self.tensorboard:
            tensorboard_logger = TensorBoardLogger(self.id,
                                                   self.log_dir,
                                                   self.log_params,
                                                   comment=self.sufix)

        if self.distributed:
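            # The distributed sampler shards batches across the world_size
            # replicas, so each rank sees a different subset of the data.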
            train_sampler = DistributedBucketingSampler(
                self.data_train,
                batch_size=self.batch_size,
                num_replicas=world_size,
                rank=rank)
        else:
            if self.sampler_type == 'bucketing':
                train_sampler = BucketingSampler(self.data_train,
                                                 batch_size=self.batch_size,
                                                 shuffle=True)
            elif self.sampler_type == 'random':
                train_sampler = RandomBucketingSampler(
                    self.data_train, batch_size=self.batch_size)

        print("Shuffling batches for the following epochs..")
        train_sampler.shuffle(self.start_epoch)
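        # shuffle() receives the epoch number, presumably as a seed so that
        # resumed runs reproduce the same batch order.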

        train_loader = AudioDataLoader(self.data_train,
                                       num_workers=self.num_workers,
                                       batch_sampler=train_sampler)
        val_loader = AudioDataLoader(self.data_val,
                                     batch_size=self.batch_size_val,
                                     num_workers=self.num_workers,
                                     shuffle=True)

        if self.tensorboard and self.generate_graph:  # TODO: log some audio samples as well
            with torch.no_grad():
                inputs, targets, input_percentages, target_sizes = next(
                    iter(train_loader))
                input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
                tensorboard_logger.add_image(inputs,
                                             input_sizes,
                                             targets,
                                             network=self.model)

        self.model = self.model.to(self.device)
        parameters = self.model.parameters()

        if self.update_rule == 'adam':
            optimizer = torch.optim.Adam(parameters,
                                         lr=self.lr,
                                         weight_decay=self.reg)
        elif self.update_rule == 'sgd':
            optimizer = torch.optim.SGD(parameters,
                                        lr=self.lr,
                                        weight_decay=self.reg)

        self.model, self.optimizer = amp.initialize(
            self.model,
            optimizer,
            opt_level=self.opt_level,
            keep_batchnorm_fp32=self.keep_batchnorm_fp32,
            loss_scale=self.loss_scale)
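        # amp.initialize above is NVIDIA apex's mixed-precision setup: it
        # patches the model and optimizer according to opt_level (e.g. 'O1'
        # casts selected ops to fp16).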

        if self.optim_state is not None:
            self.optimizer.load_state_dict(self.optim_state)

        if self.amp_state is not None:
            amp.load_state_dict(self.amp_state)

        if self.distributed:
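            # Wrap the model so gradients are averaged across processes; given
            # apex above, this is presumably apex.parallel.DistributedDataParallel.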
            self.model = DistributedDataParallel(self.model)

        print(self.model)

        if self.criterion_type == 'cross_entropy_loss':
            self.criterion = torch.nn.CrossEntropyLoss()

        # Not saved anywhere yet; kept only for in-run inspection.
        accuracies_train_iters = []
        losses_iters = []

        avg_loss = 0
        batch_time = AverageMeter()
        epoch_time = AverageMeter()
        losses = AverageMeter()

        start_training = time.time()
        for epoch in range(self.start_epoch, self.num_epochs):
            print("Start epoch..")

            # Put model in train mode
            self.model.train()

            y_true_train_epoch = np.array([])
            y_pred_train_epoch = np.array([])

            start_epoch = time.time()
            for i, data in enumerate(train_loader):
                start_batch = time.time()

                print('Start batch..')

                if i == len(train_sampler):  # guard against the loader yielding more batches than the sampler reports
                    break

                inputs, targets, input_percentages, _ = data

                input_sizes = input_percentages.mul_(int(inputs.size(3))).int()

                inputs = inputs.to(self.device)
                targets = targets.to(self.device)

                output, loss_value = self._step(inputs, input_sizes, targets)
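                # _step is defined elsewhere in the class; it presumably runs
                # the forward and backward passes and returns the raw output
                # together with the loss value.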

                print('Step finished.')

                avg_loss += loss_value

                with torch.no_grad():
                    y_pred = self.decoder.decode(output.detach()).cpu().numpy()

                    y_true_train_epoch = np.concatenate(
                        (y_true_train_epoch,
                         targets.cpu().numpy()))  # maybe this should be done with tensors?
                    y_pred_train_epoch = np.concatenate(
                        (y_pred_train_epoch, y_pred))

                inputs_size = inputs.size(0)
                del output, inputs, input_percentages

                if self.intra_epoch_sanity_check:
                    with torch.no_grad():
                        acc, _ = self.check_accuracy(targets.cpu().numpy(),
                                                     y_pred=y_pred)
                        accuracies_train_iters.append(acc)
                        losses_iters.append(loss_value)

                        cm = confusion_matrix(targets.cpu().numpy(),
                                              y_pred,
                                              labels=self.labels)
                        print('[it %i/%i] Confusion matrix train step:' %
                              (i + 1, len(train_sampler)))
                        print(pd.DataFrame(cm))

                        if self.tensorboard:
                            tensorboard_logger.update(
                                len(train_loader) * epoch + i + 1, {
                                    'Loss/through_iterations': loss_value,
                                    'Accuracy/train_through_iterations': acc
                                })

                del targets

                batch_time.update(time.time() - start_batch)

            epoch_time.update(time.time() - start_epoch)
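            # Note: only the final batch's loss and size reach this meter,
            # updated once per epoch.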
            losses.update(loss_value, inputs_size)

            # Write elapsed time (and loss) to terminal
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Batch {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Epoch {epoch_time.val:.3f} ({epoch_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                      (epoch + 1), (i + 1),
                      len(train_sampler),
                      batch_time=batch_time,
                      epoch_time=epoch_time,
                      loss=losses))

            # Loss log
            avg_loss /= len(train_sampler)
            self.loss_epochs.append(avg_loss)

            # Accuracy train log
            acc_train, _ = self.check_accuracy(y_true_train_epoch,
                                               y_pred=y_pred_train_epoch)
            self.accuracy_train_epochs.append(acc_train)

            # Accuracy val log
            with torch.no_grad():
                y_pred_val = np.array([])
                targets_val = np.array([])
                for data in val_loader:
                    inputs, targets, input_percentages, _ = data
                    input_sizes = input_percentages.mul_(int(
                        inputs.size(3))).int()
                    _, y_pred_val_batch = self.check_accuracy(
                        targets.cpu().numpy(),
                        inputs=inputs,
                        input_sizes=input_sizes)
                    y_pred_val = np.concatenate((y_pred_val, y_pred_val_batch))
                    targets_val = np.concatenate(
                        (targets_val,
                         targets.cpu().numpy()))  # TODO: think of a smarter way to do this later
                    del inputs, targets, input_percentages

            acc_val, y_pred_val = self.check_accuracy(targets_val,
                                                      y_pred=y_pred_val)
            self.accuracy_val_epochs.append(acc_val)
            cm = confusion_matrix(targets_val, y_pred_val, labels=self.labels)
            print('Confusion matrix validation:')
            print(pd.DataFrame(cm))

            # Write epoch stuff to tensorboard
            if self.tensorboard:
                tensorboard_logger.update(
                    epoch + 1, {'Loss/through_epochs': avg_loss},
                    parameters=self.model.named_parameters)

                tensorboard_logger.update(epoch + 1, {
                    'train': acc_train,
                    'validation': acc_val
                },
                                          together=True,
                                          name='Accuracy/through_epochs')

            # Keep track of the best model
            if acc_val > self.best_acc_val:
                self.best_acc_val = acc_val
                self.best_params = {}
                for k, v in self.model.named_parameters():
                    # TODO: actually copy the whole model and save it later?
                    self.best_params[k] = v.clone()

            # Anneal the learning rate. TODO: find a better way to do this per parameter, as cs231n does.
            for g in self.optimizer.param_groups:
                g['lr'] = g['lr'] / self.learning_anneal
            print('Learning rate annealed to: {lr:.6f}'.format(lr=g['lr']))

            # Shuffle batches order
            print("Shuffling batches...")
            train_sampler.shuffle(epoch)

            # Rechoose batches elements
            if self.sampler_type == 'random':
                train_sampler.recompute_bins()

        end_training = time.time()

        if self.tensorboard:
            tensorboard_logger.close()

        print('Elapsed time in training: %.02f minutes' %
              ((end_training - start_training) / 60.0))
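
For context, a minimal sketch of how train() might be invoked; Trainer is a hypothetical stand-in for the class that owns this method, and the keyword arguments are the ones popped at the top:

    trainer = Trainer(...)          # hypothetical owner of train()
    trainer.train()                 # single process: world_size defaults to 1
    trainer.train(world_size=4,     # distributed: one process per GPU
                  gpu_rank=0, rank=0,
                  dist_backend='nccl',
                  dist_url='tcp://127.0.0.1:1234')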