Exemplo n.º 1
0
class TensorBoard(Callback):
    """Callback that mirrors training progress to TensorBoard.

    Logs the model graph at train start, then every ``update_frequency``
    epochs logs scalar metrics and parameter histograms.

    Args:
        log_dir (str): root directory for event files; a timestamped
            subdirectory is created per run.
        update_frequency (int): log epoch metrics/histograms every N epochs.
    """

    # TODO: add option to write images; find fix for graph

    def __init__(self, log_dir, update_frequency=10):
        # BUG FIX: the original called super(Callback, self).__init__(),
        # which resolves past Callback in the MRO and silently skips
        # Callback.__init__; super(TensorBoard, self) is the correct form.
        super(TensorBoard, self).__init__()
        self.log_dir = log_dir
        self.writer = None
        self.update_frequency = update_frequency

    def on_train_begin(self, **_):
        """Create the writer and trace the model graph with a random input."""
        self.writer = SummaryWriter(os.path.join(self.log_dir, str(datetime.datetime.now())))
        # add_graph needs a dummy forward input to trace the model.
        # NOTE(review): assumes self.model.input_shape excludes the batch
        # dimension — confirm against the Callback framework.
        rndm_input = torch.rand(1, *self.model.input_shape, requires_grad=True).to(self.logger['device'])
        self.writer.add_graph(self.model, rndm_input)
        return self

    def on_epoch_end(self, **_):
        """Log scalar metrics and parameter histograms every `update_frequency` epochs."""
        if (self.logger['epoch'] % self.update_frequency) == 0:
            epoch_metrics = self.logger['epoch_metrics'][self.logger['epoch']]
            # BUG FIX: dict.iteritems() does not exist on Python 3; use items().
            for e_metric, e_metric_dct in epoch_metrics.items():
                for e_metric_split, e_metric_val in e_metric_dct.items():
                    self.writer.add_scalar('{}/{}'.format(e_metric_split, e_metric), e_metric_val, self.logger['epoch'])
            for name, param in self.model.named_parameters():
                # TensorBoard groups tags by '/', so 'layer.weight' -> 'layer/weight'.
                self.writer.add_histogram(name.replace('.', '/'), param.clone().cpu().data.numpy(), self.logger['epoch'])
        return self

    def on_train_end(self, **_):
        """Flush and close the writer."""
        return self.writer.close()
Exemplo n.º 2
0
def learn(learning_rate, iterations, x, y, validation=None, stop_early=False, run_comment=''):
    """Train a single linear layer with BCE-with-logits loss, logging to TensorBoard.

    Args:
        learning_rate: Adam learning rate.
        iterations: number of full-batch gradient steps.
        x, y: training inputs and binary targets (tensors).
        validation: optional (inputs, targets) pair for validation loss.
        stop_early: if True, roll back and stop when validation loss rises.
        run_comment: suffix for the SummaryWriter run directory.

    Returns:
        The trained model (rolled back to the previous snapshot if early
        stopping triggered).
    """
    # Define a neural network using high-level modules.
    writer = SummaryWriter(comment=run_comment)
    model = Sequential(
        Linear(len(x[0]), len(y[0]), bias=True)  # n inputs -> len(y[0]) outputs
    )
    loss_fn = BCEWithLogitsLoss(reduction='sum')  # reduction=mean converges slower.
    # TODO: Add an option to twiddle pos_weight, which lets us trade off precision and recall. Maybe also graph using add_pr_curve(), which can show how that tradeoff is going.
    optimizer = Adam(model.parameters(), lr=learning_rate)

    if validation:
        validation_ins, validation_outs = validation
        previous_validation_loss = None
    t = -1  # BUG FIX: keeps the post-loop histogram guard valid when iterations == 0.
    with progressbar(range(iterations)) as bar:
        for t in bar:
            y_pred = model(x)  # Make predictions.
            loss = loss_fn(y_pred, y)
            writer.add_scalar('loss', loss, t)
            if validation:
                validation_loss = loss_fn(model(validation_ins), validation_outs)
                if stop_early:
                    if previous_validation_loss is not None and previous_validation_loss < validation_loss:
                        print('Stopping early at iteration {t} because validation error rose.'.format(t=t))
                        model.load_state_dict(previous_model)
                        break
                    else:
                        previous_validation_loss = validation_loss
                        # BUG FIX: state_dict() returns references to the live
                        # parameter tensors, which optimizer.step() mutates in
                        # place — the original "snapshot" therefore always
                        # equaled the current weights and rollback was a no-op.
                        # Save detached copies instead.
                        previous_model = {k: v.detach().clone() for k, v in model.state_dict().items()}
                writer.add_scalar('validation_loss', validation_loss, t)
            writer.add_scalar('training_accuracy_per_tag', accuracy_per_tag(model, x, y), t)
            optimizer.zero_grad()  # Zero the gradients.
            loss.backward()  # Compute gradients.
            optimizer.step()

    if t >= 0:
        # Horizontal axis is what confidence. Vertical is how many samples were that confidence.
        writer.add_histogram('confidence', confidences(model, x), t)
    writer.close()
    return model
Exemplo n.º 3
0
    # NOTE(review): fragment of a larger training loop — `epoch`,
    # `loss_normal`, `loss_wdecay`, the two optimizers/nets, `writer`,
    # `disp_interval`, `test_x`, `train_x`/`train_y` and `plt` are all
    # defined outside this excerpt. `optim_normal.zero_grad()` is
    # presumably called above the excerpt as well — confirm.
    optim_reglar.zero_grad()

    loss_normal.backward()
    loss_wdecay.backward()

    optim_normal.step()
    optim_reglar.step()

    if (epoch + 1) % disp_interval == 0:

        # Switch both nets to eval mode before visualization/inference.
        net_prob_0.eval()
        net_prob_05.eval()

        # Visualization: log gradient and weight histograms for both nets.
        for name, layer in net_prob_0.named_parameters():
            writer.add_histogram(name + '_grad_normal', layer.grad, epoch)
            writer.add_histogram(name + '_data_normal', layer, epoch)

        for name, layer in net_prob_05.named_parameters():
            writer.add_histogram(name + '_grad_regularization', layer.grad,
                                 epoch)
            writer.add_histogram(name + '_data_regularization', layer, epoch)

        test_pred_prob_0, test_pred_prob_05 = net_prob_0(test_x), net_prob_05(
            test_x)

        # Plot the training data scatter (the call continues past this excerpt).
        plt.scatter(train_x.data.numpy(),
                    train_y.data.numpy(),
                    c='blue',
                    s=50,
Exemplo n.º 4
0
class UNet3DTrainer:
    """3D UNet trainer.

    Args:
        model (Unet3D): UNet 3D model to be trained
        optimizer (nn.optim.Optimizer): optimizer used for training
        lr_scheduler (torch.optim.lr_scheduler._LRScheduler): learning rate scheduler
            WARN: bear in mind that lr_scheduler.step() is invoked after every validation step
            (i.e. validate_after_iters) not after every epoch. So e.g. if one uses StepLR with step_size=30
            the learning rate will be adjusted after every 30 * validate_after_iters iterations.
        loss_criterion (callable): loss function
        eval_criterion (callable): used to compute training/validation metric (such as Dice, IoU, AP or Rand score)
            saving the best checkpoint is based on the result of this function on the validation set
        device (torch.device): device to train on
        loaders (dict): 'train' and 'val' loaders
        checkpoint_dir (string): dir for saving checkpoints and tensorboard logs
        max_num_epochs (int): maximum number of epochs
        max_num_iterations (int): maximum number of iterations
        validate_after_iters (int): validate after that many iterations
        log_after_iters (int): number of iterations before logging to tensorboard
        validate_iters (int): number of validation iterations, if None validate
            on the whole validation set
        eval_score_higher_is_better (bool): if True higher eval scores are considered better
        best_eval_score (float): best validation score so far (higher better)
        num_iterations (int): useful when loading the model from the checkpoint
        num_epoch (int): useful when loading the model from the checkpoint
        tensorboard_formatter (callable): converts a given batch of input/output/target image to a series of images
            that can be displayed in tensorboard
        skip_train_validation (bool): if True eval_criterion is not evaluated on the training set (used mostly when
            evaluation is expensive)
    """

    def __init__(self, model, optimizer, lr_scheduler, loss_criterion,
                 eval_criterion, device, loaders, checkpoint_dir,
                 max_num_epochs=100, max_num_iterations=1e5,
                 validate_after_iters=100, log_after_iters=100,
                 validate_iters=None, num_iterations=1, num_epoch=0,
                 eval_score_higher_is_better=True, best_eval_score=None,
                 tensorboard_formatter=None, skip_train_validation=False):

        self.model = model
        self.optimizer = optimizer
        self.scheduler = lr_scheduler
        self.loss_criterion = loss_criterion
        self.eval_criterion = eval_criterion
        self.device = device
        self.loaders = loaders
        self.checkpoint_dir = checkpoint_dir
        self.max_num_epochs = max_num_epochs
        self.max_num_iterations = max_num_iterations
        self.validate_after_iters = validate_after_iters
        self.log_after_iters = log_after_iters
        self.validate_iters = validate_iters
        self.eval_score_higher_is_better = eval_score_higher_is_better

        logger.info(model)
        logger.info(f'eval_score_higher_is_better: {eval_score_higher_is_better}')

        if best_eval_score is not None:
            self.best_eval_score = best_eval_score
        else:
            # initialize the best_eval_score
            # Seeded with the worst possible value so the first validation
            # result always becomes the new best.
            if eval_score_higher_is_better:
                self.best_eval_score = float('-inf')
            else:
                self.best_eval_score = float('+inf')

        # TensorBoard event files go under <checkpoint_dir>/logs.
        self.writer = SummaryWriter(log_dir=os.path.join(checkpoint_dir, 'logs'))

        assert tensorboard_formatter is not None, 'TensorboardFormatter must be provided'
        self.tensorboard_formatter = tensorboard_formatter

        self.num_iterations = num_iterations
        self.num_epoch = num_epoch
        self.skip_train_validation = skip_train_validation

    @classmethod
    def from_checkpoint(cls, checkpoint_path, model, optimizer, lr_scheduler, loss_criterion, eval_criterion, loaders,
                        tensorboard_formatter=None, skip_train_validation=False):
        """Alternate constructor: resume training from a saved checkpoint.

        Restores model/optimizer weights via utils.load_checkpoint and all
        training counters/settings from the checkpoint's state dict; the
        checkpoint's parent directory becomes the checkpoint_dir.
        """
        logger.info(f"Loading checkpoint '{checkpoint_path}'...")
        state = utils.load_checkpoint(checkpoint_path, model, optimizer)
        logger.info(
            f"Checkpoint loaded. Epoch: {state['epoch']}. Best val score: {state['best_eval_score']}. Num_iterations: {state['num_iterations']}")
        checkpoint_dir = os.path.split(checkpoint_path)[0]
        return cls(model, optimizer, lr_scheduler,
                   loss_criterion, eval_criterion,
                   torch.device(state['device']),
                   loaders, checkpoint_dir,
                   eval_score_higher_is_better=state['eval_score_higher_is_better'],
                   best_eval_score=state['best_eval_score'],
                   num_iterations=state['num_iterations'],
                   num_epoch=state['epoch'],
                   max_num_epochs=state['max_num_epochs'],
                   max_num_iterations=state['max_num_iterations'],
                   validate_after_iters=state['validate_after_iters'],
                   log_after_iters=state['log_after_iters'],
                   validate_iters=state['validate_iters'],
                   tensorboard_formatter=tensorboard_formatter,
                   skip_train_validation=skip_train_validation)

    @classmethod
    def from_pretrained(cls, pre_trained, model, optimizer, lr_scheduler, loss_criterion, eval_criterion,
                        device, loaders,
                        max_num_epochs=100, max_num_iterations=1e5,
                        validate_after_iters=100, log_after_iters=100,
                        validate_iters=None, num_iterations=1, num_epoch=0,
                        eval_score_higher_is_better=True, best_eval_score=None,
                        tensorboard_formatter=None, skip_train_validation=False):
        """Alternate constructor: start fresh training from pre-trained weights.

        Unlike from_checkpoint, only the model weights are loaded (optimizer
        state is not restored) and the training settings come from the
        arguments, not the checkpoint.
        """
        logger.info(f"Logging pre-trained model from '{pre_trained}'...")
        utils.load_checkpoint(pre_trained, model, None)
        checkpoint_dir = os.path.split(pre_trained)[0]
        return cls(model, optimizer, lr_scheduler,
                   loss_criterion, eval_criterion,
                   device, loaders, checkpoint_dir,
                   eval_score_higher_is_better=eval_score_higher_is_better,
                   best_eval_score=best_eval_score,
                   num_iterations=num_iterations,
                   num_epoch=num_epoch,
                   max_num_epochs=max_num_epochs,
                   max_num_iterations=max_num_iterations,
                   validate_after_iters=validate_after_iters,
                   log_after_iters=log_after_iters,
                   validate_iters=validate_iters,
                   tensorboard_formatter=tensorboard_formatter,
                   skip_train_validation=skip_train_validation)

    def fit(self):
        """Run the main training loop until an epoch/iteration/LR stopping
        criterion fires (see train() and should_stop())."""
        for _ in range(self.num_epoch, self.max_num_epochs):
            # train for one epoch
            should_terminate = self.train(self.loaders['train'])

            if should_terminate:
                logger.info('Stopping criterion is satisfied. Finishing training')
                return

            self.num_epoch += 1
        logger.info(f"Reached maximum number of epochs: {self.max_num_epochs}. Finishing training...")

    def train(self, train_loader):
        """Trains the model for 1 epoch.

        Args:
            train_loader (torch.utils.data.DataLoader): training data loader

        Returns:
            True if the training should be terminated immediately, False otherwise
        """
        train_losses = utils.RunningAverage()
        train_eval_scores = utils.RunningAverage()

        # sets the model in training mode
        self.model.train()
        for i, t in enumerate(train_loader):
            logger.info(
                f'Training iteration {self.num_iterations}. Batch {i}. Epoch [{self.num_epoch}/{self.max_num_epochs - 1}]')
            input, target, weight = self._split_training_batch(t)

            output, loss = self._forward_pass(input, target, weight)
            train_losses.update(loss.item(), self._batch_size(input))

            # compute gradients and update parameters
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            if self.num_iterations % self.validate_after_iters == 0:
                # set the model in eval mode
                self.model.eval()
                # evaluate on validation set
                eval_score = self.validate(self.loaders['val'])
                # set the model back to training mode
                self.model.train()

                # adjust learning rate if necessary
                # NOTE(review): validate() returns val_scores.avg and it is
                # indexed with [1] here and below — this assumes the running
                # average is a per-class array (class 1 = foreground).
                # Confirm against utils.RunningAverage.
                if isinstance(self.scheduler, ReduceLROnPlateau):
                    self.scheduler.step(eval_score[1])
                else:
                    self.scheduler.step()
                # log current learning rate in tensorboard
                self._log_lr()
                # remember best validation metric
                is_best = self._is_best_eval_score(eval_score[1])

                # save checkpoint
                self._save_checkpoint(is_best)

            if self.num_iterations % self.log_after_iters == 0:
                # if model contains final_activation layer for normalizing logits apply it, otherwise both
                # the evaluation metric as well as images in tensorboard will be incorrectly computed
                if hasattr(self.model, 'final_activation') and self.model.final_activation is not None:
                    output = self.model.final_activation(output)

                # compute eval criterion
                if not self.skip_train_validation:
                    # NOTE(review): .numpy() requires the criterion to return
                    # a CPU tensor with no grad — confirm eval_criterion.
                    eval_score = self.eval_criterion(output, target)
                    train_eval_scores.update(eval_score.numpy(), self._batch_size(input))

                # log stats, params and images
                logger.info(
                    f'Training stats. Loss: {train_losses.avg}. Evaluation score: {train_eval_scores.avg}')
                # loss avg is wrapped in a list because _log_stats expects
                # len()-able values (per-class metrics are sequences).
                self._log_stats('train', [train_losses.avg], train_eval_scores.avg)
                self._log_params()
                self._log_images(input, target, output, 'train_')

            if self.should_stop():
                return True

            self.num_iterations += 1

        return False

    def should_stop(self):
        """
        Training will terminate if maximum number of iterations is exceeded or the learning rate drops below
        some predefined threshold (1e-6 in our case)
        """
        if self.max_num_iterations < self.num_iterations:
            logger.info(f'Maximum number of iterations {self.max_num_iterations} exceeded.')
            return True

        min_lr = 1e-6
        lr = self.optimizer.param_groups[0]['lr']
        if lr < min_lr:
            logger.info(f'Learning rate below the minimum {min_lr}.')
            return True

        return False

    def validate(self, val_loader):
        """Run one pass over the validation loader (no gradients) and return
        the running-average evaluation score; also logs losses/images."""
        logger.info('Validating...')

        val_losses = utils.RunningAverage()
        val_scores = utils.RunningAverage()

        with torch.no_grad():
            for i, t in enumerate(val_loader):
                logger.info(f'Validation iteration {i}')

                input, target, weight = self._split_training_batch(t)

                output, loss = self._forward_pass(input, target, weight)
                val_losses.update(loss.item(), self._batch_size(input))

                # if model contains final_activation layer for normalizing logits apply it, otherwise
                # the evaluation metric will be incorrectly computed
                if hasattr(self.model, 'final_activation') and self.model.final_activation is not None:
                    output = self.model.final_activation(output)

                # only log images for every 100th validation batch
                if i % 100 == 0:
                    self._log_images(input, target, output, 'val_')

                eval_score = self.eval_criterion(output, target)
                val_scores.update(eval_score.numpy(), self._batch_size(input))

                if self.validate_iters is not None and self.validate_iters <= i:
                    # stop validation
                    break

            self._log_stats('val', [val_losses.avg], val_scores.avg)
            logger.info(f'Validation finished. Loss: {val_losses.avg}. Evaluation score: {val_scores.avg}')
            return val_scores.avg

    def _split_training_batch(self, t):
        """Move a loader batch to self.device and unpack it into
        (input, target, weight); weight is None for 2-tuples."""
        def _move_to_device(input):
            # Recursively handles nested tuples/lists of tensors.
            if isinstance(input, tuple) or isinstance(input, list):
                return tuple([_move_to_device(x) for x in input])
            else:
                return input.to(self.device)

        t = _move_to_device(t)
        weight = None
        if len(t) == 2:
            input, target = t
        else:
            input, target, weight = t
        return input, target, weight

    def _forward_pass(self, input, target, weight=None):
        """Run the model and compute the loss; returns (output, loss).

        The weight (if any) is forwarded to the loss criterion as a third
        positional argument.
        """
        # forward pass
        output = self.model(input)

        # compute the loss
        if weight is None:
            loss = self.loss_criterion(output, target)
        else:
            loss = self.loss_criterion(output, target, weight)

        return output, loss

    def _is_best_eval_score(self, eval_score):
        """Compare eval_score against the best so far (direction depends on
        eval_score_higher_is_better), updating the best if improved."""
        if self.eval_score_higher_is_better:
            is_best = eval_score > self.best_eval_score
        else:
            is_best = eval_score < self.best_eval_score

        if is_best:
            logger.info(f'Saving new best evaluation metric: {eval_score}')
            self.best_eval_score = eval_score

        return is_best

    def _save_checkpoint(self, is_best):
        """Persist the full training state via utils.save_checkpoint."""
        # remove `module` prefix from layer names when using `nn.DataParallel`
        # see: https://discuss.pytorch.org/t/solved-keyerror-unexpected-key-module-encoder-embedding-weight-in-state-dict/1686/20
        if isinstance(self.model, nn.DataParallel):
            state_dict = self.model.module.state_dict()
        else:
            state_dict = self.model.state_dict()

        utils.save_checkpoint({
            'epoch': self.num_epoch + 1,
            'num_iterations': self.num_iterations,
            'model_state_dict': state_dict,
            'best_eval_score': self.best_eval_score,
            'eval_score_higher_is_better': self.eval_score_higher_is_better,
            'optimizer_state_dict': self.optimizer.state_dict(),
            'device': str(self.device),
            'max_num_epochs': self.max_num_epochs,
            'max_num_iterations': self.max_num_iterations,
            'validate_after_iters': self.validate_after_iters,
            'log_after_iters': self.log_after_iters,
            'validate_iters': self.validate_iters
        }, is_best, checkpoint_dir=self.checkpoint_dir,
            logger=logger)

    def _log_lr(self):
        """Log the current learning rate of the first param group."""
        lr = self.optimizer.param_groups[0]['lr']
        self.writer.add_scalar('learning_rate', lr, self.num_iterations)

    def _log_stats(self, phase, loss_avg, eval_score_avg):
        """Log loss/eval averages; sequences with >1 entry are logged as
        per-class scalar groups, otherwise as single scalars."""
        tag_value = {
            f'{phase}_loss_avg': loss_avg,
            f'{phase}_eval_score_avg': eval_score_avg
        }

        for tag, value in tag_value.items():
            # NOTE(review): len(value) requires eval_score_avg to be a
            # sized sequence/array — a plain float here would raise.
            if len(value)>1:
                value_dict = {}
                for i in range(len(value)):
                    value_dict[tag+'_class'+str(i)] = value[i]
                self.writer.add_scalars(tag, value_dict, self.num_iterations)
            else:
                self.writer.add_scalar(tag, value, self.num_iterations)

    def _log_params(self):
        """Log histograms of all parameters and their gradients."""
        logger.info('Logging model parameters and gradients')
        for name, value in self.model.named_parameters():
            self.writer.add_histogram(name, value.data.cpu().numpy(), self.num_iterations)
            self.writer.add_histogram(name + '/grad', value.grad.data.cpu().numpy(), self.num_iterations)

    def _log_images(self, input, target, prediction, prefix=''):
        """Convert input/target/prediction batches to images via the
        tensorboard_formatter and log them under the given tag prefix."""
        inputs_map = {
            'inputs': input,
            'targets': target,
            'predictions': prediction
        }
        img_sources = {}
        for name, batch in inputs_map.items():
            if isinstance(batch, list) or isinstance(batch, tuple):
                # Multi-part batches get a numeric suffix per component.
                for i, b in enumerate(batch):
                    img_sources[f'{name}{i}'] = b.data.cpu().numpy()
            else:
                img_sources[name] = batch.data.cpu().numpy()

        for name, batch in img_sources.items():
            for tag, image in self.tensorboard_formatter(name, batch):
                self.writer.add_image(prefix + tag, image, self.num_iterations, dataformats='CHW')

    @staticmethod
    def _batch_size(input):
        """Return the batch size; for multi-input batches, use the first tensor."""
        if isinstance(input, list) or isinstance(input, tuple):
            return input[0].size(0)
        else:
            return input.size(0)
Exemplo n.º 5
0
class TBCallback(TrainingCallback):
    """Training callback that mirrors losses/metrics, parameter histograms
    and per-sample gradient histograms to TensorBoard.

    Args:
        log_dir (str): directory for SummaryWriter event files.
        input_dim: optional input shape; when given, the model graph is
            exported via ONNX before training.
    """

    def __init__(self, log_dir, input_dim=None):
        self.log_dir = log_dir
        self.input_dim = input_dim
        self.writer = SummaryWriter(log_dir)
        super().__init__()

    def before_training(self, model_trainer):
        """Export the model graph via ONNX (if input_dim was provided)."""
        if self.input_dim is not None:
            import os  # local import keeps this block self-contained
            dummy_input = cuda_move(Variable(torch.zeros(self.input_dim)))
            # BUG FIX: plain concatenation dropped the path separator
            # (e.g. 'runs/exponnx_model.proto'); join the path properly.
            model_file = os.path.join(self.log_dir, 'onnx_model.proto')
            torch.onnx.export(model_trainer.model,
                              dummy_input,
                              model_file,
                              verbose=True)
            # NOTE(review): add_graph_onnx was removed from recent
            # tensorboardX/torch releases — confirm the writer in use
            # still provides it.
            self.writer.add_graph_onnx(model_file)

    def after_epoch(self, model_trainer, train_data, validation_data):
        """Log the latest train/val loss and metric; periodically log
        parameter and gradient histograms."""
        n_iter = model_trainer.global_step
        train_loss, train_metric = model_trainer.train_losses[
            -1], model_trainer.train_metrics[-1]
        val_loss, val_metric = model_trainer.val_losses[
            -1], model_trainer.val_metrics[-1]

        # data grouping by `slash`
        self.writer.add_scalar('data/train_loss', train_loss, n_iter)
        self.writer.add_scalar('data/train_metric', train_metric, n_iter)
        self.writer.add_scalar('data/val_loss', val_loss, n_iter)
        self.writer.add_scalar('data/val_metric', val_metric, n_iter)

        if n_iter % model_trainer.validation_steps == 0:
            # self.writer.add_text('Text', 'text logged at step:' + str(n_iter), n_iter)
            for name, param in model_trainer.model.named_parameters():
                self.writer.add_histogram('param/' + name,
                                          param.clone().cpu().data.numpy(),
                                          n_iter,
                                          bins='sturges')
            self._save_gradient_histograms(model_trainer, train_data)

    def after_training(self, model_trainer):
        """ Export scalar data to JSON for external processing and
            save final weights as images.
        """
        # for name, param in model_trainer.model.named_parameters():
        #     param = param.data.clone().cpu()
        #     if len(param.size()) == 2:  # images should have size (width, height, channel)
        #         param = param.unsqueeze(2)
        #     elif len(param.size()) == 1:
        #         param = param.unsqueeze(1)
        #         param = param.unsqueeze(2)
        #     self.writer.add_image(name, param, model_trainer.global_step)

        self.writer.export_scalars_to_json("./all_scalars.json")
        self.writer.close()

    def _save_gradient_histograms(self, model_trainer, train_data):
        """Collect per-sample gradients over a random subset of the training
        data and log a histogram per parameter."""
        # Add gradient norm histogram
        n_iter = model_trainer.global_step
        random_shuffle = list(train_data.get_one_hot_list())
        random.shuffle(random_shuffle)
        for par in model_trainer.model.parameters():
            par.accumulated_grad = []

        n_samples = 100
        for X_i, y_i in random_shuffle[:n_samples]:
            X_data, y_data = cuda_move(X_i), cuda_move(y_i)
            # TODO: backprop through thousand of time steps
            y_out = model_trainer.model.forward(X_data, logits=True)
            loss = F.binary_cross_entropy_with_logits(y_out, y_data)
            model_trainer.model.zero_grad()
            loss.backward()

            for par in model_trainer.model.parameters():
                # BUG FIX: par.grad is the same tensor object on every
                # iteration (zero_grad() zeroes it in place), so appending
                # the raw reference made every saved entry alias the final
                # gradient; snapshot a detached copy instead.
                par.accumulated_grad.append(par.grad.detach().clone())

        for name, par in model_trainer.model.named_parameters():
            t = torch.stack(par.accumulated_grad, 0)
            self.writer.add_histogram('grad/' + name,
                                      t.clone().cpu().data.numpy(),
                                      n_iter,
                                      bins='sturges')
            # Drop the snapshots so we don't pin gradient memory between epochs.
            par.accumulated_grad = None

    def __str__(self):
        return "TBCallback(logdir={})".format(self.log_dir)
Exemplo n.º 6
0
class DeepQTrainer:
    """
    A Deep Q Network trainer. Supports TD learning, Q learning, and Double Q learning
    """
    def __init__(self, data, batch_size, epoch_limit, criterion, save_loc,
                 name, log_dir, gpu_device, lr, clip, on_policy, double_q,
                 num_workers, reset_rate, validate_rate):
        """
        :param data: A pytorch Dataset
        :param batch_size: training/validation batch size
        :param epoch_limit: number of training epochs
        :param criterion: training loss
        :param save_loc: save directory
        :param name: experiment name
        :param log_dir: save directory for tensorboard log files
        :param gpu_device: numbered gpu device on which to run
        :param lr: learning rate for ADAM optimizer
        :param clip: parameter for gradient clipping
        :param on_policy: boolean switch to use TD learning instead of Q learning
        :param double_q: boolean switch to use double Q learning instead of Q learning
        :param num_workers: number of data-loading threads to use
        :param reset_rate: rate to cache target network
        :param validate_rate: rate to check validation performance
        """
        self.writer = SummaryWriter(log_dir='{}/{}'.format(log_dir, name))
        # Bulk-assign the constructor arguments as attributes of the same name.
        attribute_dict = {
            'epoch_limit': epoch_limit,
            'save_loc': save_loc,
            'batch_size': batch_size,
            'criterion': criterion,
            'name': name,
            'gpu_device': gpu_device,
            'lr': lr,
            'clip': clip,
            'data': data,
            'on_policy': on_policy,
            'double_q': double_q,
            'reset_rate': reset_rate,
            'validate_rate': validate_rate
        }
        for key, value in attribute_dict.items():
            setattr(self, key, value)
        self.total_steps = 0
        # Current and cached ("target") networks are set later, in train().
        self.model = None
        self.old_model = None

        # Split the dataset into train/validation/test via index samplers.
        sampler = torch.utils.data.sampler.SubsetRandomSampler
        train_sampler = sampler(data.train_ind)
        valid_sampler = sampler(data.valid_ind)
        test_sampler = sampler(data.test_ind)

        # BUG FIX: self.train_data used to be constructed twice with
        # identical arguments; the redundant duplicate was removed.
        self.train_data = DataLoader(data,
                                     batch_size=batch_size,
                                     sampler=train_sampler,
                                     num_workers=num_workers)
        self.valid_data = DataLoader(data,
                                     batch_size=batch_size,
                                     sampler=valid_sampler,
                                     num_workers=num_workers)
        self.test_data = DataLoader(data,
                                    batch_size=batch_size,
                                    sampler=test_sampler,
                                    num_workers=num_workers)

    def time_to_reset(self):
        return self.total_steps % self.reset_rate == 0

    def time_to_validate(self):
        return self.total_steps % self.validate_rate == 0

    def train(self, model, gamma, optimizer=None):
        """
        Main training loop.

        Saves an initial checkpoint, iterates over ``self.train_data`` for
        ``self.epoch_limit`` epochs while periodically validating and
        re-caching the target network (``self.old_model``), logs the mean
        epoch loss, and saves a final checkpoint.

        :param model: pytorch model to be trained
        :param gamma: discount factor
        :param optimizer: training optimizer, defaults to ADAM
        :return: None
        """
        print('starting train')
        # Snapshot of the untrained weights.
        torch.save(model.state_dict(),
                   '{}/{}_start.pt'.format(self.save_loc, self.name))
        self.model = to_cuda(model, self.gpu_device)
        # Target network: a frozen copy used for value targets until the next reset.
        self.old_model = copy.deepcopy(model)
        if optimizer is None:
            optimizer = torch.optim.Adam(model.parameters(), lr=self.lr)

        for epoch_num in range(self.epoch_limit):
            print('epoch {}'.format(epoch_num))
            running_train_loss = 0
            for state, action, reward, next_state, next_action, feasible_mask in self.train_data:
                if self.time_to_validate():
                    print('validating')
                    self.validate(gamma=gamma)
                if self.time_to_reset():
                    # Re-cache the target network and checkpoint current weights.
                    print('saving model at epoch {}, step {}'.format(
                        epoch_num, self.total_steps))
                    self.old_model = copy.deepcopy(self.model)
                    torch.save(
                        self.model.state_dict(),
                        '{}/{}_epoch_{}_step_{}.pt'.format(
                            self.save_loc, self.name, epoch_num,
                            self.total_steps))

                self.total_steps += 1
                train_loss = self.optimize(state=state,
                                           action=action,
                                           reward=reward,
                                           next_state=next_state,
                                           next_action=next_action,
                                           gamma=gamma,
                                           optimizer=optimizer,
                                           feasible_mask=feasible_mask)
                running_train_loss += train_loss
            # Log the mean training loss over the epoch.
            self.writer.add_scalar(tag='data/train_epoch_loss',
                                   scalar_value=running_train_loss /
                                   len(self.train_data),
                                   global_step=self.total_steps)
        torch.save(self.model.state_dict(),
                   '{}/{}_final.pt'.format(self.save_loc, self.name))

    def validate(self, gamma):
        """
        Run one pass over the validation set and log the mean loss.

        Puts the model in eval mode for the pass and restores train mode
        afterwards. Used to monitor training performance.
        :param gamma: discount factor
        :return: None
        """
        self.model.eval()
        total_loss = 0
        for batch in self.valid_data:
            state, action, reward, next_state, next_action, feasible_mask = batch
            total_loss += self.optimize(state=state,
                                        action=action,
                                        reward=reward,
                                        next_state=next_state,
                                        next_action=next_action,
                                        gamma=gamma,
                                        optimizer=None,
                                        feasible_mask=feasible_mask,
                                        valid=True)
        mean_loss = total_loss / len(self.valid_data)
        self.writer.add_scalar(tag='data/valid_epoch_loss',
                               scalar_value=mean_loss,
                               global_step=self.total_steps)
        print('Validation loss: {:.2f}'.format(mean_loss))
        self.model.train()

    def _q_state_value_estimate(self, state, nonterminal_mask,
                                non_final_states, feasible_mask):
        """
        Helper function which calculates the target state value using a Q value estimation procedure
        :param state: a state tensor (used only for its batch size via ``state[0].size(0)``)
        :param nonterminal_mask: a boolean mask denoting the terminal states
        :param non_final_states: the next state for all non-terminal states
        :param feasible_mask: a boolean mask indicating which actions are allowed in the current state
        :return: An estimate of state value, using the maximal Q values derived from self.old_model
        """
        # Terminal states keep a value of 0; nonterminal entries are filled below.
        next_state_values = to_variable(
            to_cuda(torch.zeros(state[0].size(0)).float(), self.gpu_device))
        # Restrict the feasibility mask to rows corresponding to nonterminal states.
        nonterminal_feasible_mask = feasible_mask[
            nonterminal_mask.nonzero().view(-1)]
        # Q values come from the cached target network.
        predictions = self.old_model(to_cuda(non_final_states,
                                             self.gpu_device))
        # modifying predictions by adjusted to ensure max value is within feasible action set:
        # shift all Q values down by a constant, then shift only the feasible
        # actions back up, so the row-wise max is intended to land on a feasible action.
        adjuster = 2 * max(abs(predictions.min().data[0]),
                           predictions.max().data[0])
        adjusted_predictions = predictions - adjuster
        adjusted_predictions[nonterminal_feasible_mask] += adjuster
        next_state_values[nonterminal_mask] = adjusted_predictions.max(1)[0]
        # NOTE(review): `.data[0]` and `.volatile` are pre-0.4 PyTorch idioms;
        # this code appears to target a legacy PyTorch version.
        next_state_values.volatile = False
        return next_state_values

    def _double_q_state_value_estimate(self, state, nonterminal_mask,
                                       non_final_states, feasible_mask):
        """
        Helper function which calculates the target state value using a Double Q value estimation procedure.
        Essentially the same as _q_state_value_estimate, except we use the current network to choose the actions
        which inform next state value.
        :param state: a state tensor (used only for its batch size via ``state[0].size(0)``)
        :param nonterminal_mask: a boolean mask denoting the terminal states
        :param non_final_states: the next state for all non-terminal states
        :param feasible_mask: a boolean mask indicating which actions are allowed in the current state
        :return: An estimate of state value, using the maximal Q values derived from self.old_model
        """
        # Terminal states keep a value of 0; nonterminal entries are filled below.
        next_state_values = to_variable(
            to_cuda(torch.zeros(state[0].size(0)).float(), self.gpu_device))
        # Restrict the feasibility mask to rows corresponding to nonterminal states.
        nonterminal_feasible_mask = feasible_mask[
            nonterminal_mask.nonzero().view(-1)]
        # Action SELECTION uses the current network (self.model)...
        predictions_dq = self.model(to_cuda(non_final_states, self.gpu_device))
        # modifying predictions by adjusted to ensure max value is within feasible action set:
        # shift all Q values down, then shift only feasible actions back up, so
        # the argmax is intended to land on a feasible action.
        adjuster = 2 * max(abs(predictions_dq.min().data[0]),
                           predictions_dq.max().data[0])
        adjusted_predictions_dq = predictions_dq - adjuster
        adjusted_predictions_dq[nonterminal_feasible_mask] += adjuster
        max_vals_dq, max_inds_dq = adjusted_predictions_dq.max(1)
        # ...while action EVALUATION uses the cached target network (Double Q).
        predictions = self.old_model(to_cuda(non_final_states,
                                             self.gpu_device))
        next_state_values[nonterminal_mask] = predictions.gather(
            1, max_inds_dq.view(-1, 1))
        # NOTE(review): `.data[0]` and `.volatile` are pre-0.4 PyTorch idioms;
        # this code appears to target a legacy PyTorch version.
        next_state_values.volatile = False
        return next_state_values

    def _on_policy_state_value_estimate(self, state, next_action,
                                        nonterminal_mask, non_final_states):
        """
        Helper function which calculates the target state value using a TD value estimation procedure.
        Essentially the same as _q_state_value_estimate, except we use the observed next action to choose the actions
        which inform next state value. Note this corresponds to on-policy TD value estimation.
        :param state: a state tensor (used only for its batch size via ``state[0].size(0)``)
        :param next_action: the next action taken at non-terminal states
        :param nonterminal_mask: a boolean mask denoting the terminal states
        :param non_final_states: the next state for all non-terminal states
        :return: An estimate of state value, using the Q values of the observed next actions from self.old_model
        """
        # Terminal states keep a value of 0; nonterminal entries are filled below.
        next_state_values = Variable(
            to_cuda(torch.zeros(state[0].size(0)).float(), self.gpu_device))
        # Q values come from the cached target network; the observed next
        # action (not an argmax) selects which Q value feeds the target.
        predictions = self.old_model(to_cuda(non_final_states,
                                             self.gpu_device))
        next_state_values[nonterminal_mask] = predictions.gather(
            1,
            Variable(
                to_cuda(next_action,
                        self.gpu_device)[nonterminal_mask].view(-1, 1)))
        # NOTE(review): `.volatile` is a pre-0.4 PyTorch idiom; this code
        # appears to target a legacy PyTorch version.
        next_state_values.volatile = False
        return next_state_values

    def log_q_diff(self, pred):
        """
        Log histograms of the gaps between consecutive Q-value quartiles.
        Can be helpful in diagnosing value collapse (all gaps shrinking to 0).
        :param pred: Predicted Q values
        :return: None
        """
        q_values = pred.data.cpu().numpy()
        quartiles = np.percentile(q_values, [0, 25, 50, 75, 100], axis=1)
        # Differences between adjacent quartiles, per batch element.
        gaps = np.diff(quartiles, axis=0)
        for label, gap in zip(['0_25', '25_50', '50_75', '75_100'], gaps):
            self.writer.add_histogram(tag='predictions/q_diff_{}'.format(label),
                                      values=gap,
                                      global_step=self.total_steps,
                                      bins='auto')

    def log_values_and_advantages(self, state):
        """
        Convenience function for logging histograms of predicted state values
        and action-advantage values.
        :param state: vector of states
        :return: None
        """
        # Log advantages first, then values, mirroring the dueling-head split.
        for tag, head in (('predictions/advantages', self.model.get_advantage),
                          ('predictions/values', self.model.get_value)):
            prediction = head(to_variable(to_cuda(state, self.gpu_device)))
            self.writer.add_histogram(tag=tag,
                                      values=prediction.view(-1),
                                      global_step=self.total_steps,
                                      bins='auto')

    def optimize(self,
                 state,
                 action,
                 reward,
                 next_state,
                 next_action,
                 gamma,
                 optimizer,
                 feasible_mask,
                 valid=False):
        """
        Runs a step of optimization in the training/validation loops
        :param state: batch of current states; tuple of (spatial, flat) tensors
        :param action: batch of action indices taken in ``state``
        :param reward: batch of observed rewards
        :param next_state: batch of successor states; same (spatial, flat) structure
        :param next_action: batch of next actions (used only for on-policy TD targets)
        :param gamma: discount factor
        :param optimizer: optimizer to step; unused (may be None) when ``valid`` is True
        :param feasible_mask: boolean mask of allowed actions per state
        :param valid: when True, only computes the loss (no backward pass, step, or logging)
        :return: scalar loss value for this batch
        """
        # each state is a tuple of spatial and flat information
        next_state_court = next_state[0]
        next_state_flat = next_state[1]
        # get non final states
        # A next state is considered terminal when the minimum over its spatial
        # tensor is <= 0 — presumably an all-zero padding state; confirm
        # against the data pipeline.
        nonterminal_mask = []
        for batch_id in range(next_state_court.shape[0]):
            nonterminal_mask.append(np.min(next_state_court[batch_id].numpy()))
        nonterminal_mask = to_cuda(
            torch.from_numpy(np.array(nonterminal_mask)) > 0, self.gpu_device)
        # Stack the spatial parts of nonterminal next states into one batch.
        # NOTE(review): `volatile=True` is a pre-0.4 PyTorch idiom (no
        # gradients through the target computation).
        non_final_states_court = Variable(torch.cat([
            next_state_court[batch_id].view(1,
                                            next_state_court[batch_id].size(0),
                                            next_state_court[batch_id].size(1),
                                            next_state_court[batch_id].size(2))
            for batch_id in range(len(next_state_court))
            if nonterminal_mask[batch_id]
        ],
                                                    dim=0),
                                          volatile=True)
        # Same for the flat parts of the nonterminal next states.
        non_final_states_flat = Variable(torch.cat([
            next_state_flat[batch_id].view(1,
                                           next_state_flat[batch_id].size(0))
            for batch_id in range(len(next_state_flat))
            if nonterminal_mask[batch_id]
        ],
                                                   dim=0),
                                         volatile=True)
        non_final_states = [non_final_states_court, non_final_states_flat]
        # get q values for observed actions
        predictions = self.model(to_variable(to_cuda(state, self.gpu_device)))
        state_action_values = predictions.gather(
            1, Variable(to_cuda(action, self.gpu_device)))
        # for non-final states, get V(s'), 0 for terminal states
        feasible_mask = to_cuda(feasible_mask, self.gpu_device)
        # Choose the target-value estimator: on-policy TD, Double Q, or plain Q.
        if self.on_policy:
            next_state_values = self._on_policy_state_value_estimate(
                state=state,
                next_action=next_action,
                nonterminal_mask=nonterminal_mask,
                non_final_states=non_final_states)
        elif self.double_q:
            next_state_values = self._double_q_state_value_estimate(
                state=state,
                nonterminal_mask=nonterminal_mask,
                non_final_states=non_final_states,
                feasible_mask=feasible_mask)
        else:
            next_state_values = self._q_state_value_estimate(
                state=state,
                nonterminal_mask=nonterminal_mask,
                non_final_states=non_final_states,
                feasible_mask=feasible_mask)
        # combine discounted next state values with reward to get expected state value
        expected_state_action_values = (next_state_values * gamma) + Variable(
            to_cuda(reward.view(-1), self.gpu_device))
        # loss is between expected and predicted state-action values
        loss = self.criterion(state_action_values,
                              expected_state_action_values)
        if not valid:
            optimizer.zero_grad()
            loss.backward()
            # gradient clipping
            total_grad = 0
            for param in self.model.parameters():
                param.grad.data.clamp_(-1 * self.clip, self.clip)
                total_grad += np.sum(np.abs(to_np(param.grad)))
            # Periodic scalar logging of the loss and total gradient magnitude.
            if self.total_steps % 10 == 0:
                self.writer.add_scalar(tag='data/train_loss',
                                       scalar_value=to_np(loss),
                                       global_step=self.total_steps)
                self.writer.add_scalar(tag='data/gradient',
                                       scalar_value=total_grad,
                                       global_step=self.total_steps)
            # Less frequent histogram logging of Q-value distributions.
            if self.total_steps % 1000 == 0:
                self.log_q_diff(predictions)
                self.log_values_and_advantages(state)
                self.writer.add_histogram(tag='predictions/q_taken',
                                          values=state_action_values,
                                          global_step=self.total_steps,
                                          bins='auto')
                self.writer.add_histogram(tag='predictions/qs',
                                          values=predictions.view(-1),
                                          global_step=self.total_steps,
                                          bins='auto')
            optimizer.step()
        # NOTE(review): `loss.cpu().data[0]` is pre-0.4 PyTorch; newer
        # versions would use `loss.item()`.
        return loss.cpu().data[0]
        if args.ckpt:
            pass
        else:
            # save graph and clips_order samples
            for data in train_dataloader:
                tuple_clips, tuple_orders = data
                for i in range(args.tl):
                    writer.add_video('train/tuple_clips', tuple_clips[:, i, :, :, :, :], i, fps=8)
                    writer.add_text('train/tuple_orders', str(tuple_orders[:, i].tolist()), i)
                tuple_clips = tuple_clips.to(device)
                writer.add_graph(vcopn, tuple_clips)
                break
            # save init params at step 0
            for name, param in vcopn.named_parameters():
                writer.add_histogram('params/{}'.format(name), param, 0)

        ### loss funciton, optimizer and scheduler ###
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(vcopn.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.wd)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', min_lr=1e-5, patience=50, factor=0.1)

        prev_best_val_loss = float('inf')
        prev_best_model_path = None
        for epoch in range(args.start_epoch, args.start_epoch+args.epochs):
            time_start = time.time()
            train(args, vcopn, criterion, optimizer, device, train_dataloader, writer, epoch)
            print('Epoch time: {:.2f} s.'.format(time.time() - time_start))
            val_loss = validate(args, vcopn, criterion, device, val_dataloader, writer, epoch)
            # scheduler.step(val_loss)         
            writer.add_scalar('train/lr', optimizer.param_groups[0]['lr'], epoch)
Exemplo n.º 8
0
def EnsembleTrain():
    """Train the base Resnet model of the ensemble.

    Loads the train/validation splits from CSV, builds augmented data
    loaders, trains a Resnet with NLL loss and Adam, logs losses, AUCs and
    parameter histograms to TensorBoard, reduces the learning rate on
    plateau, and early-stops on validation loss.
    """
    torch.autograd.set_detect_anomaly(True)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    input_shape = (192, 192)
    total_epoch = 10000
    batch_size = 24
    model_folder = MakeFolder(model_root + '/Resnet')

    # Random-augmentation parameter distributions for the data pipeline.
    param_config = {
        RotateTransform.name: {
            'theta': ['uniform', -10, 10]
        },
        ShiftTransform.name: {
            'horizontal_shift': ['uniform', -0.05, 0.05],
            'vertical_shift': ['uniform', -0.05, 0.05]
        },
        ZoomTransform.name: {
            'horizontal_zoom': ['uniform', 0.95, 1.05],
            'vertical_zoom': ['uniform', 0.95, 1.05]
        },
        FlipTransform.name: {
            'horizontal_flip': ['choice', True, False]
        },
        BiasTransform.name: {
            'center': ['uniform', -1., 1., 2],
            'drop_ratio': ['uniform', 0., 1.]
        },
        NoiseTransform.name: {
            'noise_sigma': ['uniform', 0., 0.03]
        },
        ContrastTransform.name: {
            'factor': ['uniform', 0.8, 1.2]
        },
        GammaTransform.name: {
            'gamma': ['uniform', 0.8, 1.2]
        },
        ElasticTransform.name: ['elastic', 1, 0.1, 256]
    }

    sub_train_path = data_root + '/train_name_basemodel.csv'
    sub_val_path = data_root + '/val_name_basemodel.csv'
    sub_train = pd.read_csv(sub_train_path).values.tolist()[0]
    sub_val = pd.read_csv(sub_val_path).values.tolist()[0]
    train_loader, train_batches = _GetLoader(sub_train, param_config,
                                             input_shape, batch_size, True)
    val_loader, val_batches = _GetLoader(sub_val, param_config, input_shape,
                                         batch_size, True)

    model = Resnet(3, 2).to(device)
    model.apply(HeWeightInit)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    cr = torch.nn.NLLLoss()

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode='min',
                                                           patience=10,
                                                           factor=0.5,
                                                           verbose=True)
    early_stopping = EarlyStopping(store_path=os.path.join(
        model_folder, '{}-{:.6f}.pt'),
                                   patience=50,
                                   verbose=True)
    writer = SummaryWriter(log_dir=os.path.join(model_folder, 'log'),
                           comment='Net')

    for epoch in range(total_epoch):
        train_loss, val_loss = 0., 0.

        # ---- training pass ----
        model.train()
        pred_list, label_list = [], []
        for ind, (inputs, outputs) in enumerate(train_loader):
            optimizer.zero_grad()

            inputs = MoveTensorsToDevice(inputs, device)
            outputs = MoveTensorsToDevice(outputs, device)

            preds = model(*inputs)

            loss = cr(preds, outputs.long())

            loss.backward()
            optimizer.step()

            train_loss += loss.item()

            # Column 1 holds the positive-class score used for AUC.
            pred_list.extend(preds[:, 1].cpu().data.numpy().tolist())
            label_list.extend(outputs.cpu().data.numpy().tolist())

        train_auc = roc_auc_score(label_list, pred_list)

        # ---- validation pass ----
        model.eval()
        pred_list, label_list = [], []
        with torch.no_grad():
            for ind, (inputs, outputs) in enumerate(val_loader):
                inputs = MoveTensorsToDevice(inputs, device)
                outputs = MoveTensorsToDevice(outputs, device)

                preds = model(*inputs)

                loss = cr(preds, outputs.long())

                val_loss += loss.item()

                pred_list.extend(preds[:, 1].cpu().data.numpy().tolist())
                label_list.extend(outputs.cpu().data.numpy().tolist())

            val_auc = roc_auc_score(label_list, pred_list)

        # Save Tensor Board
        for name, param in model.named_parameters():
            if 'bn' not in name:
                writer.add_histogram(name + '_data',
                                     param.cpu().data.numpy(), epoch + 1)

        writer.add_scalars(
            'Loss', {
                'train_loss': train_loss / train_batches,
                'val_loss': val_loss / val_batches
            }, epoch + 1)
        writer.add_scalars('Auc', {
            'train_auc': train_auc,
            'val_auc': val_auc
        }, epoch + 1)

        print(
            'Epoch {}: loss: {:.3f}, val-loss: {:.3f}, auc: {:.3f}, val-auc: {:.3f}'
            .format(epoch + 1, train_loss / train_batches,
                    val_loss / val_batches, train_auc, val_auc))

        scheduler.step(val_loss)
        early_stopping(val_loss, model, (epoch + 1, val_loss))

        if early_stopping.early_stop:
            print("Early stopping")
            break

        writer.flush()

    # BUG FIX: writer.close() used to run inside the epoch loop, closing the
    # SummaryWriter after the first epoch; close once, after training ends.
    writer.close()
Exemplo n.º 9
0
                        X=x_axis.unsqueeze(0).expand(y_axis.size(1), x_axis.size(0)).transpose(0, 1),  # Visdom fix
                        Y=y_axis,
                        win=viz_window,
                        update='replace',
                    )
            if args.tensorboard and main_proc:
                values = {
                    'Avg Train Loss': avg_loss,
                    'Avg WER': wer,
                    'Avg CER': cer
                }
                tensorboard_writer.add_scalars(args.id, values, epoch + 1)
                if args.log_params:
                    for tag, value in model.named_parameters():
                        tag = tag.replace('.', '/')
                        tensorboard_writer.add_histogram(tag, to_np(value), epoch + 1)
                        tensorboard_writer.add_histogram(tag + '/grad', to_np(value.grad), epoch + 1)
            if args.checkpoint and main_proc:
                file_path = '%s/deepspeech_%d.pth' % (save_folder, epoch + 1)
                torch.save(DeepSpeech.serialize(model, optimizer=optimizer, epoch=epoch, loss_results=loss_results,
                                                wer_results=wer_results, cer_results=cer_results),
                           file_path)
                # anneal lr
                optim_state = optimizer.state_dict()
                optim_state['param_groups'][0]['lr'] = optim_state['param_groups'][0]['lr'] / args.learning_anneal
                optimizer.load_state_dict(optim_state)
                print('Learning rate annealed to: {lr:.6f}'.format(lr=optim_state['param_groups'][0]['lr']))

            if (best_wer is None or best_wer > wer) and main_proc:
                print("Found better validated model, saving to %s" % args.model_path)
                torch.save(DeepSpeech.serialize(model, optimizer=optimizer, epoch=epoch, loss_results=loss_results,
Exemplo n.º 10
0
            landmarks = landmarks.cuda()

        out = net(images)
        # backprop
        optimizer.zero_grad()
        loss = criterion(out, landmarks)
        loss.backward()
        optimizer.step()
        if iteration % 100 == 0:
            image = images.cpu().data.numpy()[0]
            gt = landmarks.cpu().data.numpy()
            pr = out.cpu().data.numpy()
            # 绿色的真实landmark
            image = draw_landmarks(image, gt[0], (0, 255, 0))
            # 红色的预测landmark
            image = draw_landmarks(image, pr[0], (0, 0, 255))
            image = image[::-1, ...]
            nme = metrics.nme.update(np.reshape(gt, (-1, gt.shape[1]//2, 2)), np.reshape(pr, (-1, gt.shape[1]//2, 2)))
            metrics.auc.update(nme)
            metrics.loss.update(loss)
            writer.add_scalar("watch/NME", metrics.nme.value * 100, iteration)
            writer.add_scalar("watch/AUC", metrics.auc.value * 100, iteration)
            writer.add_scalar("watch/loss", metrics.loss.value, iteration)
            writer.add_scalar("watch/learning_rate", lr, iteration)

            writer.add_image("result", image, iteration)
            writer.add_histogram("predictionx", out.cpu().data.numpy()[:, 0:212:2], iteration)
            state = net.state_dict()
            saver.save(state, iteration)

Exemplo n.º 11
0
 def write_tensorboard_histograms(self, model:nn.Module, iter_count:int, tbwriter:SummaryWriter):
     """Write one histogram per named parameter of *model* under '/weights/'."""
     for param_name, param_tensor in model.named_parameters():
         tag = '/weights/' + param_name
         tbwriter.add_histogram(tag, param_tensor, iter_count)
Exemplo n.º 12
0
            observation = get_policy_observation(observation)

            expert_loss_grad = []
            expert_loss = []
            for expert_trajectory in expert_trajectories:
                expert_loss_grad.append(
                    sum(2 * (observation - expert_trajectory[0]["state"])) /
                    state_dim)
                expert_loss.append(
                    sum((observation - expert_trajectory[0]["state"])**2) /
                    state_dim)
            loss_grad.append(sum(expert_loss_grad) / num_experts)

            writer.add_histogram(
                f"Imitation loss {timestep}",
                np.array(expert_loss),
                global_step=(max_timestep - timestep) * num_iterations +
                iteration)
            writer.add_scalar(
                f"Imitation loss mean {timestep}",
                sum(expert_loss) / num_experts,
                global_step=(max_timestep - timestep) * num_iterations +
                iteration)

            observation = torch.Tensor(observation)
            episode_reward += reward
            timestep += 1

        writer.add_scalar("Episode reward",
                          episode_reward,
                          global_step=max_timestep * num_iterations +
Exemplo n.º 13
0
def main():
    """Train an RNN language model end-to-end.

    Loads the YAML config (or restarts from a checkpoint directory), builds
    train/dev/eval datasets, constructs the RNNLM, then runs the training
    loop with periodic dev evaluation, LR decay, early stopping and
    TensorBoard logging.

    Returns:
        str: the save path of the trained model (``model.module.save_path``).

    Note: reads the module-level ``args`` namespace; raises
    NotImplementedError when ``args.resume_model`` is set.
    """

    # Load a config file
    if args.resume_model is None:
        config = load_config(args.config)
    else:
        # Restart from the last checkpoint
        config = load_config(os.path.join(args.resume_model, 'config.yml'))

    # Check differences between args and yaml configuration
    for k, v in vars(args).items():
        if k not in config.keys():
            warnings.warn("key %s is automatically set to %s" % (k, str(v)))

    # Merge config with args (config values win)
    for k, v in config.items():
        setattr(args, k, v)

    # Load dataset
    train_set = Dataset(csv_path=args.train_set,
                        dict_path=args.dict,
                        label_type=args.label_type,
                        batch_size=args.batch_size * args.ngpus,
                        bptt=args.bptt,
                        eos=args.eos,
                        max_epoch=args.num_epochs,
                        shuffle=True)
    dev_set = Dataset(csv_path=args.dev_set,
                      dict_path=args.dict,
                      label_type=args.label_type,
                      batch_size=args.batch_size * args.ngpus,
                      bptt=args.bptt,
                      eos=args.eos,
                      shuffle=True)
    eval_sets = []
    # NOTE: renamed loop variable so the builtin `set` is not shadowed.
    for set_path in args.eval_sets:
        eval_sets += [Dataset(csv_path=set_path,
                              dict_path=args.dict,
                              label_type=args.label_type,
                              batch_size=1,
                              bptt=args.bptt,
                              eos=args.eos,
                              is_test=True)]

    args.num_classes = train_set.num_classes

    # Model setting: encode the hyperparameters in the model name
    model = RNNLM(args)
    model.name = args.rnn_type
    model.name += str(args.num_units) + 'H'
    model.name += str(args.num_projs) + 'P'
    model.name += str(args.num_layers) + 'L'
    model.name += '_emb' + str(args.emb_dim)
    model.name += '_' + args.optimizer
    model.name += '_lr' + str(args.learning_rate)
    model.name += '_bs' + str(args.batch_size)
    if args.tie_weights:
        model.name += '_tie'
    if args.residual:
        model.name += '_residual'
    if args.backward:
        model.name += '_bwd'

    if args.resume_model is None:
        # Set save path
        save_path = mkdir_join(args.model, '_'.join(os.path.basename(args.train_set).split('.')[:-1]), model.name)
        model.set_save_path(save_path)  # avoid overwriting

        # Save the config file as a yaml file
        save_config(vars(args), model.save_path)

        # Save the dictionary & wp_model
        shutil.copy(args.dict, os.path.join(save_path, 'dict.txt'))
        if args.label_type == 'wordpiece':
            shutil.copy(args.wp_model, os.path.join(save_path, 'wp.model'))

        # Setting for logging
        logger = set_logger(os.path.join(model.save_path, 'train.log'), key='training')

        for k, v in sorted(vars(args).items(), key=lambda x: x[0]):
            logger.info('%s: %s' % (k, str(v)))

        # Count total parameters
        for name in sorted(list(model.num_params_dict.keys())):
            num_params = model.num_params_dict[name]
            logger.info("%s %d" % (name, num_params))
        logger.info("Total %.2f M parameters" % (model.total_parameters / 1000000))

        # Set optimizer
        model.set_optimizer(optimizer=args.optimizer,
                            learning_rate_init=float(args.learning_rate),
                            weight_decay=float(args.weight_decay),
                            clip_grad_norm=args.clip_grad_norm,
                            lr_schedule=False,
                            factor=args.decay_rate,
                            patience_epoch=args.decay_patient_epoch)

        epoch, step = 1, 0
        learning_rate = float(args.learning_rate)
        metric_dev_best = 10000

    else:
        raise NotImplementedError()

    train_set.epoch = epoch - 1

    # GPU setting
    if args.ngpus >= 1:
        model = CustomDataParallel(model,
                                   device_ids=list(range(0, args.ngpus, 1)),
                                   deterministic=True,
                                   benchmark=False)
        model.cuda()

    logger.info('PID: %s' % os.getpid())
    logger.info('USERNAME: %s' % os.uname()[1])

    # Set process name
    # setproctitle(args.job_name)

    # Set learning rate controller
    lr_controller = Controller(learning_rate_init=learning_rate,
                               decay_type=args.decay_type,
                               decay_start_epoch=args.decay_start_epoch,
                               decay_rate=args.decay_rate,
                               decay_patient_epoch=args.decay_patient_epoch,
                               lower_better=True,
                               best_value=metric_dev_best)

    # Set reporter
    reporter = Reporter(model.module.save_path, max_loss=10)

    # Set the updater
    updater = Updater(args.clip_grad_norm)

    # Setting for tensorboard
    tf_writer = SummaryWriter(model.module.save_path)

    start_time_train = time.time()
    start_time_epoch = time.time()
    start_time_step = time.time()
    not_improved_epoch = 0
    loss_train_mean, acc_train_mean = 0., 0.
    pbar_epoch = tqdm(total=len(train_set))
    pbar_all = tqdm(total=len(train_set) * args.num_epochs)
    while True:
        # Compute loss in the training set (including parameter update)
        ys_train, is_new_epoch = train_set.next()
        model, loss_train, acc_train = updater(model, ys_train, args.bptt)
        loss_train_mean += loss_train
        acc_train_mean += acc_train
        pbar_epoch.update(np.sum([len(y) for y in ys_train]))

        if (step + 1) % args.print_step == 0:
            # Compute loss in the dev set
            ys_dev = dev_set.next()[0]
            model, loss_dev, acc_dev = updater(model, ys_dev, args.bptt, is_eval=True)

            loss_train_mean /= args.print_step
            acc_train_mean /= args.print_step
            reporter.step(step, loss_train_mean, loss_dev, acc_train_mean, acc_dev)

            # Logging by tensorboard
            tf_writer.add_scalar('train/loss', loss_train_mean, step + 1)
            tf_writer.add_scalar('dev/loss', loss_dev, step + 1)
            for n, p in model.module.named_parameters():
                n = n.replace('.', '/')
                if p.grad is not None:
                    tf_writer.add_histogram(n, p.data.cpu().numpy(), step + 1)
                    tf_writer.add_histogram(n + '/grad', p.grad.data.cpu().numpy(), step + 1)

            duration_step = time.time() - start_time_step
            logger.info("...Step:%d(ep:%.2f) loss:%.2f(%.2f)/acc:%.2f(%.2f)/ppl:%.2f(%.2f)/lr:%.5f/bs:%d (%.2f min)" %
                        (step + 1, train_set.epoch_detail,
                         loss_train_mean, loss_dev, acc_train_mean, acc_dev,
                         math.exp(loss_train_mean), math.exp(loss_dev),
                         learning_rate, len(ys_train), duration_step / 60))
            start_time_step = time.time()
            loss_train_mean, acc_train_mean = 0., 0.
        step += args.ngpus

        # Save checkpoint and evaluate model per epoch
        if is_new_epoch:
            duration_epoch = time.time() - start_time_epoch
            logger.info('===== EPOCH:%d (%.2f min) =====' % (epoch, duration_epoch / 60))

            # Save figures of loss and accuracy
            reporter.epoch()

            if epoch < args.eval_start_epoch:
                # Save the model
                model.module.save_checkpoint(model.module.save_path, epoch, step,
                                             learning_rate, metric_dev_best)
            else:
                start_time_eval = time.time()
                # dev
                ppl_dev = eval_ppl([model.module], dev_set, args.bptt)
                logger.info(' PPL (%s): %.3f' % (dev_set.set, ppl_dev))

                if ppl_dev < metric_dev_best:
                    metric_dev_best = ppl_dev
                    not_improved_epoch = 0
                    logger.info('||||| Best Score |||||')

                    # Update learning rate
                    # NOTE(review): decay_lr is invoked even on the improved
                    # path — presumably Controller only decays per its own
                    # policy; confirm against Controller.decay_lr.
                    model.module.optimizer, learning_rate = lr_controller.decay_lr(
                        optimizer=model.module.optimizer,
                        learning_rate=learning_rate,
                        epoch=epoch,
                        value=ppl_dev)

                    # Save the model
                    model.module.save_checkpoint(model.module.save_path, epoch, step,
                                                 learning_rate, metric_dev_best)

                    # test
                    ppl_test_mean = 0.
                    for eval_set in eval_sets:
                        ppl_test = eval_ppl([model.module], eval_set, args.bptt)
                        logger.info(' PPL (%s): %.3f' % (eval_set.set, ppl_test))
                        ppl_test_mean += ppl_test
                    if len(eval_sets) > 0:
                        logger.info(' PPL (mean): %.3f' % (ppl_test_mean / len(eval_sets)))
                else:
                    # Update learning rate
                    model.module.optimizer, learning_rate = lr_controller.decay_lr(
                        optimizer=model.module.optimizer,
                        learning_rate=learning_rate,
                        epoch=epoch,
                        value=ppl_dev)

                    not_improved_epoch += 1

                duration_eval = time.time() - start_time_eval
                logger.info('Evaluation time: %.2f min' % (duration_eval / 60))

                # Early stopping
                if not_improved_epoch == args.not_improved_patient_epoch:
                    break

                if epoch == args.convert_to_sgd_epoch:
                    # Convert to fine-tuning stage
                    model.module.set_optimizer(
                        'sgd',
                        learning_rate_init=float(args.learning_rate),  # TODO: ?
                        weight_decay=float(args.weight_decay),
                        clip_grad_norm=args.clip_grad_norm,
                        lr_schedule=False,
                        factor=args.decay_rate,
                        patience_epoch=args.decay_patient_epoch)
                    logger.info('========== Convert to SGD ==========')

            pbar_epoch = tqdm(total=len(train_set))
            pbar_all.update(len(train_set))

            # BUGFIX: was `args.num_epoch`, but the attribute is
            # `args.num_epochs` everywhere else (Dataset max_epoch, pbar_all),
            # so the loop could never terminate normally.
            if epoch == args.num_epochs:
                break

            start_time_step = time.time()
            start_time_epoch = time.time()
            epoch += 1

    duration_train = time.time() - start_time_train
    logger.info('Total time: %.2f hour' % (duration_train / 3600))

    tf_writer.close()
    pbar_epoch.close()
    pbar_all.close()

    return model.module.save_path
Exemplo n.º 14
0
                axis=1)).mean()
        log = 'epoch: %d, train loss: %.6f, test loss: %.6f, train acc: %.6f, test acc: %.6f' % \
              (ep, train_loss.item(), test_loss.item(), train_acc, test_acc)
        print(log)
        if use_tensorboard:
            writer.add_text('training log', log, ep)
            writer.add_scalars('data/loss', {
                'train loss': train_loss.data,
                'test loss': test_loss.data
            }, ep)
            writer.add_scalars('data/accuracy', {
                'train accuracy': train_acc,
                'test accuracy': test_acc
            }, ep)
            for name, param in dnn.named_parameters():
                writer.add_histogram(name,
                                     param.clone().data.cpu().numpy(), ep)
    y_final_test = dnn(data_final_test).detach().numpy().argmax(1)
    data_save['Accept'] = y_final_test
    data_save.to_csv('result.csv')
    y_true = y_test.numpy().argmax(1)
    y_score = dnn(X_test).detach().numpy()[:, 1].squeeze()
    print(y_score)
    y_pred = dnn(X_test).detach().numpy().argmax(1)
    label = ['not accepted', 'accepted']
    print('precision:', metrics.precision_score(y_true, y_pred))
    print('recall:', metrics.recall_score(y_true, y_pred))
    print('f1 score:', metrics.f1_score(y_true, y_pred))
    print('confusion matrix: ', metrics.confusion_matrix(y_true, y_pred))
    print(metrics.classification_report(y_true, y_pred, target_names=label))

    cm = metrics.confusion_matrix(y_true, y_pred)
Exemplo n.º 15
0
def main(args):
    """Train a (optionally pruned) character language model.

    Builds train/dev/test batches, optionally inserts HardConcrete gates for
    structured pruning (with a Lagrangian sparsity penalty and adversarial
    lambda optimizer), trains with Adam (optionally Noam LR schedule), logs
    to TensorBoard, and finally reports dev/test bits-per-character of the
    best checkpoint.

    Args:
        args: parsed command-line namespace (data paths, model and pruning
            hyperparameters).
    """
    log_path = "{}_{}".format(args.log, random.randint(1, 100))
    train_writer = SummaryWriter(log_dir=log_path + "/train")
    dev_writer = SummaryWriter(log_dir=log_path + "/dev")

    train, dev, test, words = read_corpus(args.data)
    dev_, test_ = dev, test
    train = create_batches(train, args.batch_size)
    dev = create_batches(dev, args.batch_size)
    test = create_batches(test, args.batch_size)

    model = Model(words, args)
    if args.load:
        model.load_state_dict(torch.load(args.load))
    model.cuda()
    print(model)
    print("vocab size: {}".format(model.n_V))

    # Noam schedule starts from a warmup-scaled base LR
    lr = 1.0 if not args.noam else 1.0 / (args.n_d**0.5) / (args.warmup_steps**
                                                            1.5)
    if args.prune:
        # in place substituion of linear ops in SRU
        flop.make_hard_concrete(model.rnn, in_place=True)
        model.cuda()
        print("model after inserting hardconcrete:")
        print(model)
        hc_modules = flop.get_hardconcrete_modules(model)
        hc_parameters = [
            p for m in hc_modules for p in m.parameters() if p.requires_grad
        ]
        optimizer_hc = Adam(hc_parameters,
                            lr=lr * args.prune_lr,
                            weight_decay=0)
        num_hardconcrete_params = sum(x.numel() for x in hc_parameters)
        print("num of hardconcrete paramters: {}".format(
            num_hardconcrete_params))
        # lambdas are maximized (negative LR) against the sparsity constraint
        lambda_1 = nn.Parameter(torch.tensor(0.).cuda())
        lambda_2 = nn.Parameter(torch.tensor(0.).cuda())
        optimizer_max = Adam([lambda_1, lambda_2], lr=lr, weight_decay=0)
        optimizer_max.param_groups[0]['lr'] = -lr * args.prune_lr
        hc_linear_modules = flop.get_hardconcrete_linear_modules(model)
        num_prunable_params = sum(m.num_prunable_parameters()
                                  for m in hc_linear_modules)
        print("num of prunable paramters: {}".format(num_prunable_params))
    else:
        # disable pruning by pushing its start past the last epoch
        args.prune_start_epoch = args.max_epoch

    m_parameters = [
        i[1] for i in model.named_parameters()
        if i[1].requires_grad and 'log_alpha' not in i[0]
    ]
    optimizer = Adam(m_parameters,
                     lr=lr * args.lr,
                     weight_decay=args.weight_decay)
    num_params = sum(x.numel() for x in m_parameters if x.requires_grad)
    print("num of parameters: {}".format(num_params))

    nbatch = 1
    niter = 1
    best_dev = 1e+8
    unroll_size = args.unroll_size
    batch_size = args.batch_size
    N = (len(train[0]) - 1) // unroll_size + 1
    criterion = nn.CrossEntropyLoss()
    # ROBUSTNESS: ensure `checkpoint` is defined even if dev PPL never
    # improves (previously a NameError at the final load_state_dict).
    checkpoint = copy_model(model)

    model.zero_grad()
    if args.prune:
        optimizer_max.zero_grad()
        optimizer_hc.zero_grad()

    for epoch in range(args.max_epoch):
        start_time = time.time()
        model.train()
        total_loss = 0.0
        hidden = model.init_hidden(batch_size)
        start_prune = epoch >= args.prune_start_epoch

        for i in range(N):
            x = train[0][i * unroll_size:(i + 1) * unroll_size]
            y = train[1][i * unroll_size:(i + 1) * unroll_size].view(-1)
            hidden.detach_()

            # language model forward and backward
            output, hidden = model(x, hidden)
            loss = criterion(output, y)
            (loss / args.update_param_freq).backward()
            loss = loss.item()
            lagrangian_loss = 0
            target_sparsity = 0
            expected_sparsity = 0

            # add lagrangian loss (regularization) when pruning
            if start_prune:
                # compute target sparsity with (optionally) linear warmup
                target_sparsity = args.prune_sparsity
                if args.prune_warmup > 0:
                    niter_ = niter - args.prune_start_epoch * N
                    target_sparsity *= min(1.0, niter_ / args.prune_warmup)

                # compute expected model size and sparsity
                expected_size = sum(
                    m.num_parameters(train=True) for m in hc_linear_modules)
                expected_sparsity = 1.0 - expected_size / num_prunable_params

                # compute lagrangian loss
                lagrangian_loss = lambda_1 * (expected_sparsity - target_sparsity) + \
                                  lambda_2 * (expected_sparsity - target_sparsity)**2
                (lagrangian_loss / args.update_param_freq).backward()
                expected_sparsity = expected_sparsity.item()
                lagrangian_loss = lagrangian_loss.item()

            #  log training stats
            if (niter - 1) % 100 == 0 and nbatch % args.update_param_freq == 0:
                if args.prune:
                    train_writer.add_scalar('sparsity/expected_sparsity',
                                            expected_sparsity, niter)
                    train_writer.add_scalar('sparsity/target_sparsity',
                                            target_sparsity, niter)
                    train_writer.add_scalar('loss/lagrangian_loss',
                                            lagrangian_loss, niter)
                    train_writer.add_scalar('lambda/1', lambda_1.item(), niter)
                    train_writer.add_scalar('lambda/2', lambda_2.item(), niter)
                    if (niter - 1) % 3000 == 0:
                        for index, layer in enumerate(hc_modules):
                            train_writer.add_histogram(
                                'log_alpha/{}'.format(index),
                                layer.log_alpha,
                                niter,
                                bins='sqrt',
                            )
                sys.stderr.write("\r{:.4f} {:.2f} {:.2f}".format(
                    loss,
                    lagrangian_loss,
                    expected_sparsity,
                ))
                train_writer.add_scalar('loss/lm_loss', loss, niter)
                train_writer.add_scalar('loss/total_loss',
                                        loss + lagrangian_loss, niter)
                train_writer.add_scalar(
                    'parameter_norm',
                    calc_norm([x.data for x in m_parameters]), niter)
                train_writer.add_scalar(
                    'gradient_norm',
                    calc_norm(
                        [x.grad for x in m_parameters if x.grad is not None]),
                    niter)

            #  perform gradient decent every few number of backward()
            if nbatch % args.update_param_freq == 0:
                if args.clip_grad > 0:
                    # BUGFIX: use the in-place clip_grad_norm_; the old
                    # clip_grad_norm alias is deprecated/removed in PyTorch.
                    torch.nn.utils.clip_grad_norm_(m_parameters, args.clip_grad)
                optimizer.step()
                if start_prune:
                    optimizer_max.step()
                    optimizer_hc.step()
                #  clear gradient
                model.zero_grad()
                if args.prune:
                    optimizer_max.zero_grad()
                    optimizer_hc.zero_grad()
                niter += 1

            if nbatch % args.log_period == 0 or i == N - 1:
                elapsed_time = (time.time() - start_time) / 60.0
                dev_ppl, dev_loss = eval_model(model, dev)
                dev_writer.add_scalar('loss/lm_loss', dev_loss, niter)
                dev_writer.add_scalar('bpc', np.log2(dev_ppl), niter)
                sparsity = 0
                if args.prune:
                    pruned_size = sum(
                        m.num_parameters(train=False)
                        for m in hc_linear_modules)
                    sparsity = 1.0 - pruned_size / num_prunable_params
                    dev_writer.add_scalar('sparsity/hard_sparsity', sparsity,
                                          niter)
                    dev_writer.add_scalar('model_size/total_prunable',
                                          num_prunable_params, niter)
                    dev_writer.add_scalar('model_size/current_prunable',
                                          pruned_size, niter)
                    dev_writer.add_scalar('model_size/total', num_params,
                                          niter)
                    dev_writer.add_scalar(
                        'model_size/current',
                        num_params - num_prunable_params + pruned_size, niter)
                sys.stdout.write(
                    "\rIter={}  lr={:.5f}  train_loss={:.4f}  dev_loss={:.4f}"
                    "  dev_bpc={:.2f}  sparsity={:.2f}\teta={:.1f}m\t[{:.1f}m]\n"
                    .format(niter, optimizer.param_groups[0]['lr'], loss,
                            dev_loss, np.log2(dev_ppl), sparsity,
                            elapsed_time * N / (i + 1), elapsed_time))
                # only keep a checkpoint once the sparsity target is (nearly) met
                if dev_ppl < best_dev:
                    if (not args.prune
                        ) or sparsity > args.prune_sparsity - 0.02:
                        best_dev = dev_ppl
                        checkpoint = copy_model(model)
                sys.stdout.write("\n")
                sys.stdout.flush()

            nbatch += 1
            if args.noam:
                lr = min(1.0 / (niter**0.5), niter / (args.warmup_steps**1.5))
                optimizer.param_groups[0]['lr'] = lr * args.lr / (args.n_d**
                                                                  0.5)
            if args.noam and start_prune:
                niter_ = niter - args.prune_start_epoch * N
                lr = min(1.0 / (niter_**0.5),
                         niter_ / (args.warmup_steps**1.5))
                optimizer_max.param_groups[0]['lr'] = -lr * args.prune_lr / (
                    args.n_d**0.5)
                optimizer_hc.param_groups[0]['lr'] = lr * args.lr / (args.n_d**
                                                                     0.5)

        if args.save and (epoch + 1) % 10 == 0:
            torch.save(
                copy_model(model),
                "{}.{}.{:.3f}.pt".format(args.save, epoch + 1, sparsity))

    train_writer.close()
    dev_writer.close()

    # Reload the best checkpoint and report final dev/test bpc at batch size 1
    model.load_state_dict(checkpoint)
    model.cuda()
    dev = create_batches(dev_, 1)
    test = create_batches(test_, 1)
    dev_ppl, dev_loss = eval_model(model, dev)
    test_ppl, test_loss = eval_model(model, test)
    sys.stdout.write("dev_bpc={:.3f}  test_bpc={:.3f}\n".format(
        np.log2(dev_ppl), np.log2(test_ppl)))
class Train:
    """Training harness for the CNNDriver steering-angle model.

    __init__ builds the model and a TensorBoard writer; train() wires up the
    LMDB dataset, MSE loss, Adam and a StepLR schedule, then runs the
    training loop with per-iteration TensorBoard logging and per-epoch
    checkpointing.
    """
    # Class-level placeholders; each one is rebound to a real object in
    # __init__/train(), so the empty lists are never used as lists.
    __device = []
    __writer = []
    __model = []
    __transformations = []
    __dataset_train = []
    __train_loader = []
    __loss_func = []
    __optimizer = []
    __exp_lr_scheduler = []

    def __init__(self, gpu='0'):
        """Create the model on GPU `gpu` (falls back to CPU) and open a
        TensorBoard writer under ./logs, logging the model graph once."""
        # Device configuration
        self.__device = torch.device('cuda:'+gpu if torch.cuda.is_available() else 'cpu')
        self.__writer = SummaryWriter('logs')
        self.__model = CNNDriver()
        # Set model to train mode
        self.__model.train()
        print(self.__model)
        # Trace the graph with a dummy batch of 10 RGB 66x200 images
        # (presumably the network's expected input size — confirm vs CNNDriver).
        self.__writer.add_graph(self.__model, torch.rand(10, 3, 66, 200))
        # Put model on GPU
        self.__model = self.__model.to(self.__device)

    def train(self, num_epochs=100, batch_size=400, lr=0.0001, l2_norm=0.001, save_dir='./save', input='./DataLMDB'):
        """Run the full training loop.

        Args:
            num_epochs: number of passes over the dataset.
            batch_size: samples per batch.
            lr: Adam learning rate.
            l2_norm: weight decay passed to Adam.
            save_dir: directory for per-epoch .pkl checkpoints.
            input: LMDB dataset path. NOTE(review): shadows the builtin
                `input`; kept for caller compatibility.
        """
        # Create log/save directory if it does not exist
        if not os.path.exists('./logs'):
            os.makedirs('./logs')
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        self.__transformations = transforms.Compose([AugmentDrivingTransform(), 
                                                     RandomBrightness(), ConvertToGray(), 
                                                     ConvertToSepia(), AddNoise(), DrivingDataToTensor(),])
        self.__dataset_train = DriveData_LMDB(input, self.__transformations)
        self.__train_loader = DataLoader(self.__dataset_train, batch_size=batch_size, shuffle=True, num_workers=4)

        # Loss and Optimizer
        self.__loss_func = nn.MSELoss()
        # self.__loss_func = nn.SmoothL1Loss()
        self.__optimizer = torch.optim.Adam(self.__model.parameters(), lr=lr, weight_decay=l2_norm)

        # Decay LR by a factor of 0.1 every 10 epochs
        # NOTE(review): StepLR is constructed with step_size=15, not 10 —
        # the comment and code disagree; confirm which is intended.
        self.__exp_lr_scheduler = lr_scheduler.StepLR(self.__optimizer, step_size=15, gamma=0.1)

        print('Train size:', len(self.__dataset_train), 'Batch size:', batch_size)
        print('Batches per epoch:', len(self.__dataset_train) // batch_size)

        # Train the Model
        iteration_count = 0
        for epoch in range(num_epochs):
            for batch_idx, samples in enumerate(self.__train_loader):

                # Send inputs/labels to GPU
                images = samples['image'].to(self.__device)
                labels = samples['label'].to(self.__device)

                self.__optimizer.zero_grad()

                # Forward + Backward + Optimize
                outputs = self.__model(images)
                # labels are per-image scalars; unsqueeze to match (N, 1) output
                loss = self.__loss_func(outputs, labels.unsqueeze(dim=1))

                loss.backward()
                self.__optimizer.step()
                # NOTE(review): scheduler stepped once per *batch* with the
                # epoch index; StepLR is conventionally stepped once per
                # epoch — confirm this is intended.
                self.__exp_lr_scheduler.step(epoch)

                # Send loss to tensorboard
                self.__writer.add_scalar('loss/', loss.item(), iteration_count)
                self.__writer.add_histogram('steering_out', outputs.clone().detach().cpu().numpy(), iteration_count, bins='doane')
                self.__writer.add_histogram('steering_in', 
                                            labels.unsqueeze(dim=1).clone().detach().cpu().numpy(), iteration_count, bins='doane')

                # Get current learning rate (To display on Tensorboard)
                for param_group in self.__optimizer.param_groups:
                    curr_learning_rate = param_group['lr']
                    self.__writer.add_scalar('learning_rate/', curr_learning_rate, iteration_count)

                # Display on each epoch
                if batch_idx == 0:
                    # Send image to tensorboard
                    # NOTE(review): `images` is a whole batch; add_image
                    # expects a single CHW image (make_grid is the usual
                    # companion) — confirm this renders as intended.
                    self.__writer.add_image('Image', images, epoch)
                    self.__writer.add_text('Steering', 'Steering:' + str(outputs[batch_idx].item()), epoch)
                    # Print Epoch and loss
                    print('Epoch [%d/%d] Loss: %.4f' % (epoch + 1, num_epochs, loss.item()))
                    # Save the Trained Model parameters
                    torch.save(self.__model.state_dict(), save_dir+'/cnn_' + str(epoch) + '.pkl')

                iteration_count += 1
Exemplo n.º 17
0
            'imagebox_label',
            torch.ones(3, 240, 240) * 0.5,
            torch.Tensor([[10, 10, 100, 100], [101, 101, 200, 200]]),
            n_iter,
            labels=['abcde' + str(n_iter), 'fgh' + str(n_iter)])
        x = torch.zeros(sample_rate * 2)
        for i in range(x.size(0)):
            # sound amplitude should in [-1, 1]
            x[i] = np.cos(freqs[n_iter // 10] * np.pi * float(i) /
                          float(sample_rate))
        writer.add_audio('myAudio', x, n_iter)
        writer.add_text('Text', 'text logged at step:' + str(n_iter), n_iter)
        writer.add_text('markdown Text', '''a|b\n-|-\nc|d''', n_iter)
        for name, param in resnet18.named_parameters():
            if 'bn' not in name:
                writer.add_histogram(name, param, n_iter)
        writer.add_pr_curve('xoxo', np.random.randint(2, size=100),
                            np.random.rand(100),
                            n_iter)  # needs tensorboard 0.4RC or later
        writer.add_pr_curve_raw('prcurve with raw data', true_positive_counts,
                                false_positive_counts, true_negative_counts,
                                false_negative_counts, precision, recall,
                                n_iter)
# export scalar data to JSON for external processing
writer.export_scalars_to_json("./all_scalars.json")

dataset = datasets.MNIST('mnist', train=False, download=True)
images = dataset.test_data[:100].float()
label = dataset.test_labels[:100]
features = images.view(100, 784)
writer.add_embedding(features, metadata=label, label_img=images.unsqueeze(1))
Exemplo n.º 18
0
class SADQ_GQF(object):
    """Adaptive agent implementing the SADQ algorithm with generalized
    value-function (GVF) feature prediction.

    Maintains an eval/target pair of ``feature_q_model`` networks, a
    decomposed replay buffer, and linear annealing schedules for the
    exploration epsilon and the prioritized-replay beta.  Training progress
    is logged to TensorBoard through ``SummaryWriter``.
    """

    def __init__(self,
                 name,
                 state_length,
                 network_config,
                 reinforce_config,
                 feature_len,
                 combine_decomposed_func,
                 is_sigmoid=False,
                 memory_resotre=True):
        # NOTE: the misspelled parameter name "memory_resotre" is kept for
        # backward compatibility with existing callers.
        # ``combine_decomposed_func`` and ``is_sigmoid`` are accepted for
        # interface compatibility but not used in this class.
        super(SADQ_GQF, self).__init__()
        self.name = name
        self.network_config = network_config
        self.reinforce_config = reinforce_config

        self.memory = ReplayBuffer_decom(self.reinforce_config.memory_size)

        self.learning = True
        self.explanation = False
        self.state_length = state_length

        self.feature_len = feature_len
        self.features = None  # GVF feature vector supplied via passFeatures()

        # Global training bookkeeping.
        self.steps = 0
        self.reward_history = []
        self.episode_time_history = []
        self.best_reward_mean = -maxsize
        self.episode = 0

        self.reset()
        self.memory_resotre = memory_resotre
        reinforce_summary_path = self.reinforce_config.summaries_path + "/" + self.name

        # Either start with a clean summary directory or resume from the
        # previously saved state (steps, episode, replay memory).
        if not self.network_config.restore_network:
            clear_summary_path(reinforce_summary_path)
        else:
            self.restore_state()

        self.summary = SummaryWriter(log_dir=reinforce_summary_path)
        self.eval_model = feature_q_model(name, state_length, self.feature_len,
                                          self.network_config.output_shape,
                                          network_config)
        self.target_model = feature_q_model(name, state_length,
                                            self.feature_len,
                                            self.network_config.output_shape,
                                            network_config)

        # Anneal the importance-sampling beta and the exploration epsilon
        # linearly over the configured number of steps.
        self.beta_schedule = LinearSchedule(
            self.reinforce_config.beta_timesteps,
            initial_p=self.reinforce_config.beta_initial,
            final_p=self.reinforce_config.beta_final)

        self.epsilon_schedule = LinearSchedule(
            self.reinforce_config.epsilon_timesteps,
            initial_p=self.reinforce_config.starting_epsilon,
            final_p=self.reinforce_config.final_epsilon)

    def should_explore(self):
        """Return True when an epsilon-greedy random action should be taken.

        Side effect: updates ``self.epsilon`` from the schedule and logs it.
        """
        self.epsilon = self.epsilon_schedule.value(self.steps)
        self.summary.add_scalar(tag='%s/Epsilon' % self.name,
                                scalar_value=self.epsilon,
                                global_step=self.steps)

        return random.random() < self.epsilon

    def predict(self, state, isGreedy=False, is_random=False):
        """Choose an action for ``state`` (one row per candidate action).

        Also records the pending transition into replay memory, periodically
        syncs the target network, and triggers ``update()`` on schedule.

        Returns:
            (choice, fv): the selected index and its predicted feature
            vector (``fv`` is None when the action was chosen randomly).
        """
        if self.learning:
            self.steps += 1
        # Store the transition completed by arriving at this state.
        if self.previous_state is not None and self.learning and self.current_reward is not None:
            state_crr = np.unique(state, axis=0)
            self.memory.add(self.previous_state, None, self.current_reward,
                            state_crr.reshape(-1, self.state_length), 0,
                            self.features)
        if self.learning and self.should_explore() and not isGreedy:
            # Epsilon-greedy random action; no feature vector available.
            q_values = None
            fv = None
            choice = random.choice(list(range(len(state))))
            action = choice
        else:
            # Greedy action: argmax over the eval model's Q-values.
            with torch.no_grad():
                features_vector, q_values = self.eval_model.predict_batch(
                    Tensor(state))
                q_values = FloatTensor(q_values).view(-1)

            _, choice = q_values.max(0)
            action = choice
            fv = features_vector[choice]
        # Periodically refresh the target network: hard replace on a
        # multi-step schedule, soft (Polyak-style) replace every step.
        if self.learning and self.steps % self.reinforce_config.replace_frequency == 0:
            logger.debug("Replacing target model for %s" % self.name)
            if self.reinforce_config.replace_frequency != 1:
                self.target_model.replace(self.eval_model)
            else:
                self.target_model.replace_soft(self.eval_model)

        if (self.learning and self.steps > self.reinforce_config.update_start
                and self.steps % self.reinforce_config.update_steps == 0):
            # Accumulate wall-clock time spent inside update().
            self.update_time -= time.time()
            self.update()
            self.update_time += time.time()

        self.current_reward = 0
        self.previous_state = state[action]

        return choice, fv

    def disable_learning(self, is_save=False):
        """Freeze learning (optionally force-saving the current networks)."""
        logger.info("Disabled Learning for %s agent" % self.name)
        if is_save:
            self.save(force=True)
        self.learning = False
        self.episode = 0

    def enable_learning(self):
        """Re-enable learning and reset per-episode state."""
        logger.info("enabled Learning for %s agent" % self.name)
        self.learning = True
        self.reset()

    def end_episode(self, state):
        """Close out the current episode: log stats, store the terminal
        transition, save (on schedule), and reset per-episode state."""
        if not self.learning:
            return
        episode_time = time.time() - self.episode_time

        self.reward_history.append(self.total_reward)
        self.episode_time_history.append(episode_time)
        total_time = sum(self.episode_time_history)
        avg_time = total_time / len(self.episode_time_history)

        logger.info("End of Episode %d, "
                    "Total reward %.2f, "
                    "Epsilon %.2f" %
                    (self.episode + 1, self.total_reward, self.epsilon))

        logger.debug(
            "Episode Time: %.2fs (%.2fs), "
            "Prediction Time: %.2f, "
            "Update Time %.2f" %
            (episode_time, avg_time, self.prediction_time, self.update_time))

        self.episode += 1
        self.summary.add_scalar(tag='%s/Episode Reward' % self.name,
                                scalar_value=self.total_reward,
                                global_step=self.episode)

        # Terminal transition: is_terminal flag set to 1.
        self.memory.add(self.previous_state, None, self.current_reward,
                        state.reshape(-1, self.state_length), 1, self.features)
        self.save()
        self.reset()

    def reset(self):
        """Reset all per-episode counters and cached state."""
        self.episode_time = time.time()
        self.current_reward = 0
        self.total_reward = 0
        self.previous_state = None
        self.previous_action = None
        self.prediction_time = 0
        self.update_time = 0
        self.features = None

    def restore_state(self):
        """Restore step/episode counters and replay memory from disk, if a
        saved ``adaptive.info`` exists and memory restoration is enabled."""
        restore_path = self.network_config.network_path + "/adaptive.info"
        if self.network_config.network_path and os.path.exists(
                restore_path) and self.memory_resotre:
            logger.info("Restoring state from %s" %
                        self.network_config.network_path)

            with open(restore_path, "rb") as file:
                info = pickle.load(file)

            self.steps = info["steps"]
            self.episode = info["episode"]
            self.memory.load(self.network_config.network_path)
            print("length of memory: ", len(self.memory))

    def save(self, force=False, appendix=""):
        """Persist networks, counters, and replay memory.

        Saving happens on the ``save_steps`` episode schedule, or always
        when ``force`` is True.
        """
        info = {
            "steps": self.steps,
            "best_reward_mean": self.best_reward_mean,
            "episode": self.episode
        }

        if (len(self.reward_history) >= self.network_config.save_steps and
                self.episode % self.network_config.save_steps == 0) or force:

            total_reward = sum(
                self.reward_history[-self.network_config.save_steps:])
            current_reward_mean = total_reward / self.network_config.save_steps

            if force:
                print("*************saved*****************",
                      current_reward_mean, self.best_reward_mean)
                # NOTE(review): unreachable — ``force`` is always True in this
                # branch, so best_reward_mean is never actually updated here.
                if not force:
                    self.best_reward_mean = current_reward_mean
                logger.info("Saving network. Found new best reward (%.2f)" %
                            total_reward)
                self.eval_model.save_network(appendix=appendix)
                self.target_model.save_network(appendix=appendix)
                with open(self.network_config.network_path + "/adaptive.info",
                          "wb") as file:
                    pickle.dump(info, file, protocol=pickle.HIGHEST_PROTOCOL)
                self.memory.save(self.network_config.network_path)
                print("length of memory: ", len(self.memory))
            else:
                logger.info("The best reward is still %.2f. Not saving" %
                            self.best_reward_mean)

    def reward(self, r):
        """Accumulate reward ``r`` into the episode and transition totals."""
        self.total_reward += r
        self.current_reward += r

    def passFeatures(self, features):
        """Store a copy of the externally computed GVF feature vector."""
        self.features = features.copy()
        return

    def summary_test(self, reward, epoch):
        """Log an evaluation reward (x-axis scaled by 40 steps per epoch)."""
        self.summary.add_scalar(tag='%s/eval reward' % self.name,
                                scalar_value=reward,
                                global_step=epoch * 40)

    def summary_GVFs_loss(self, loss, epoch):
        """Log the GVF prediction loss (x-axis scaled by 40 steps per epoch)."""
        self.summary.add_scalar(tag='%s/GVFs loss' % self.name,
                                scalar_value=loss,
                                global_step=epoch * 40)

    def update(self):
        """Run one gradient update from a replay-memory minibatch.

        Computes double-head targets (Q-value and GVF feature vector) from
        the target network and fits the eval network against them.
        """
        if len(self.memory._storage) <= self.reinforce_config.batch_size:
            return
        beta = self.beta_schedule.value(self.steps)
        self.summary.add_scalar(tag='%s/Beta' % self.name,
                                scalar_value=beta,
                                global_step=self.steps)
        if self.reinforce_config.use_prior_memory:
            # Prioritized replay: sample with importance weights.
            # NOTE(review): this branch does not unpack ``features_vector``,
            # so the FloatTensor conversion below would raise NameError when
            # use_prior_memory is enabled — verify the sampler's return shape.
            batch = self.memory.sample(self.reinforce_config.batch_size, beta)
            (states, actions, reward, next_states, is_terminal, weights,
             batch_idxes) = batch
            self.summary.add_histogram(tag='%s/Batch Indices' % self.name,
                                       values=Tensor(batch_idxes),
                                       global_step=self.steps)
        else:
            batch = self.memory.sample(self.reinforce_config.batch_size)
            (states, actions, reward, next_states, is_terminal,
             features_vector) = batch

        states = FloatTensor(states)
        terminal = FloatTensor([1 if t else 0 for t in is_terminal])
        reward = FloatTensor(reward)
        features_vector = FloatTensor(features_vector)
        # Current Q-values and feature predictions from the eval network.
        feature_values, q_values = self.eval_model.predict_batch(states)
        q_values = q_values.flatten()
        q_max = []
        f_max = []
        # Each next_state holds a variable number of candidate actions, so
        # the target max is computed per-sample rather than batched.
        for i, ns in enumerate(next_states):
            feature_n, q_n = self.target_model.predict_batch(
                FloatTensor(ns).view(-1, self.state_length))
            q_value_max, idx = q_n.max(0)
            features_max = feature_n[idx]

            q_max.append(q_value_max)
            if self.network_config.version in ["v10", "v11"]:
                # Renormalize feature groups by the ratio of the chosen
                # next-state scaling entries (columns 63-66) to the current
                # state's — presumably unit-count normalization; any inf
                # from a zero denominator is zeroed out below.
                features_max[:, :3] = (features_max[:, :3] *
                                       ns[idx, 65]) / states[i, 65]
                features_max[:, 3:6] = (features_max[:, 3:6] *
                                        ns[idx, 66]) / states[i, 66]
                features_max[:, 6:9] = (features_max[:, 6:9] *
                                        ns[idx, 63]) / states[i, 63]
                features_max[:, 9:12] = (features_max[:, 9:12] *
                                         ns[idx, 64]) / states[i, 64]
                features_max[features_max == float('inf')] = 0
            f_max.append(features_max.view(-1))

        q_max = torch.stack(q_max, dim=1).view(-1)
        f_max = torch.stack(f_max)
        # Zero out bootstrap terms for terminal transitions.
        q_max = (1 - terminal) * q_max

        f_max = (1 - terminal.view(-1, 1)) * f_max

        q_target = reward + self.reinforce_config.discount_factor * q_max

        f_target = features_vector + self.reinforce_config.discount_factor * f_max

        # Guard against NaNs in either predictions or targets
        # (x != x is the NaN test); zero the offending target entries.
        if (torch.sum(feature_values != feature_values).item() +
                torch.sum(f_target != f_target)).item() > 0:
            f_target[f_target != f_target] = 0
        self.eval_model.fit(q_values, q_target, feature_values, f_target)

        # Update replay priorities from the TD error.
        if self.reinforce_config.use_prior_memory:
            td_errors = q_values - q_target
            new_priorities = torch.abs(
                td_errors) + 1e-6  # prioritized_replay_eps
            self.memory.update_priorities(batch_idxes, new_priorities.data)

    def load_model(self, model):
        """Replace the eval model's weights with those of ``model``."""
        self.eval_model.replace(model)

    def load_weight(self, new_feature_weights, new_q_weights):
        """Load state dicts into the feature and Q sub-models of eval_model."""
        self.eval_model.feautre_model.load_state_dict(new_feature_weights)
        self.eval_model.q_model.load_state_dict(new_q_weights)
Exemplo n.º 19
0
    val_losses.append(val_epoch_loss)
    val_accuracy.append(val_epoch_accuracy)

    writer_train.add_scalars("losses", {
        'train_ln': epoch_loss,
        'val_ln': val_epoch_loss
    }, int(epoch))
    writer_train.add_scalars("accuracies", {
        'train_ln': epoch_accuracy,
        'val_ln': val_epoch_accuracy
    }, int(epoch))

    # Learning rate scheduler update
    scheduler.step(val_epoch_loss)

# Log the distribution of per-epoch training losses after training finishes.
writer_train.add_histogram("error_ln", np.array(train_losses))

# Wall-clock training duration (``start`` presumably captured via clock()
# before the training loop — confirm against the earlier part of the script).
elapsed = clock() - start

print(elapsed)

# -----------------------------------------------------------------------------------
# Model classification metrics

# The 29 class labels (A-Z plus 'del', 'nothing', 'space'), listed in the
# dataset's alphabetical folder order so indices match model outputs.
classes = [
    'A', 'B', 'C', 'D', 'del', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'nothing', 'O', 'P', 'Q', 'R', 'S', 'space', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z'
]

# Running count of correct predictions for the metrics pass below.
correct = 0
Exemplo n.º 20
0
            loss.backward()
            optimizer.step()

        if epoch % 30 == 0:
            writeIntermediateState(0, model, epoch, nx, ny, log_writer,
                                   coordinateSystem)
            writeIntermediateState(100, model, epoch, nx, ny, log_writer,
                                   coordinateSystem)
            writeIntermediateState(200, model, epoch, nx, ny, log_writer,
                                   coordinateSystem)
            writeIntermediateState(250, model, epoch, nx, ny, log_writer,
                                   coordinateSystem)
            writeIntermediateState(300, model, epoch, nx, ny, log_writer,
                                   coordinateSystem)
            writeIntermediateState(400, model, epoch, nx, ny, log_writer,
                                   coordinateSystem)
            writeIntermediateState(500, model, epoch, nx, ny, log_writer,
                                   coordinateSystem)
            writeValidationLoss(model, log_writer, 250, epoch,
                                coordinateSystem)
            writeValidationLoss(model, log_writer, 500, epoch,
                                coordinateSystem)
            sys.stdout.flush()

            print("PDE Loss at Epoch: ", epoch + 1, loss.item())
            if log_writer:
                log_writer.add_histogram(
                    'First Layer Grads',
                    model.lin_layers[0].weight.grad.view(-1, 1), epoch)
                save_checkpoint(model, optimizer, modelPath, epoch)
Exemplo n.º 21
0
def main():
    """Train SSLNET (C3D backbone) on untrimmed-video clips.

    Builds train/val splits, logs sample clips and parameter histograms to
    TensorBoard, then runs the train/validation loop, checkpointing the
    best-accuracy and best-loss models plus a snapshot every 20 epochs.

    Raises:
        ValueError: if ``params['data']`` names an unsupported dataset.
    """
    base = c3d.C3D(with_classifier=False)
    model = ssl_net.SSLNET(base, with_classifier=True, num_classes=12)

    start_epoch = 1

    # Hold out a fixed-size validation split from the training set.
    train_dataset = UntrimmedVideoDataset(params['root'], mode="train")
    if params['data'] == 'UCF-101':
        val_size = 800
    elif params['data'] == 'hmdb':
        val_size = 400
    elif params['data'] == 'Thumos14':
        val_size = 400
    else:
        # Previously an unknown dataset name left val_size unbound and
        # crashed later with a NameError; fail fast with a clear message.
        raise ValueError("Unsupported dataset: {!r}".format(params['data']))
    train_dataset, val_dataset = random_split(
        train_dataset, (len(train_dataset) - val_size, val_size))

    print("num_works:{:d}".format(params['num_workers']))
    print("batch_size:{:d}".format(params['batch_size']))
    train_loader = DataLoader(train_dataset,
                              batch_size=params['batch_size'],
                              shuffle=True,
                              num_workers=params['num_workers'])
    val_loader = DataLoader(val_dataset,
                            batch_size=params['batch_size'],
                            shuffle=True,
                            num_workers=params['num_workers'])
    model = nn.DataParallel(model)  # multi-gpu
    model = model.cuda()
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = optim.SGD(model.parameters(),
                          lr=params['learning_rate'],
                          momentum=params['momentum'],
                          weight_decay=params['weight_decay'])

    # Reduce LR on validation-loss plateaus rather than on a fixed schedule.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     'min',
                                                     min_lr=1e-5,
                                                     patience=50,
                                                     factor=0.1)

    model_save_dir = os.path.join(save_path,
                                  '_' + time.strftime('%m-%d-%H-%M'))
    # Create the save directory before handing it to SummaryWriter and
    # torch.save (previously the existence check ran after the writer was
    # already constructed on the path).
    if not os.path.exists(model_save_dir):
        os.makedirs(model_save_dir)
    writer = SummaryWriter(model_save_dir)

    # Log one sample batch (video + labels) for visual sanity checking.
    for data in train_loader:
        clip, label = data
        writer.add_video('train/clips', clip, 0, fps=8)
        writer.add_text('train/idx', str(label.tolist()), 0)
        clip = clip.cuda()
        break
    # Initial parameter histograms (step 0) as a training baseline.
    for name, param in model.named_parameters():
        writer.add_histogram('params/{}'.format(name), param, 0)

    prev_best_val_loss = float('inf')
    prev_best_loss_model_path = None
    prev_best_acc_model_path = None
    best_acc = 0
    best_epoch = 0
    for epoch in tqdm(range(start_epoch, start_epoch + params['epoch_num'])):
        train(train_loader, model, criterion, optimizer, epoch, writer)
        val_loss, top1_avg = validation(val_loader, model, criterion,
                                        optimizer, epoch)
        # Track the best top-1 accuracy checkpoint.
        if top1_avg >= best_acc:
            best_acc = top1_avg
            best_epoch = epoch
            model_path = os.path.join(
                model_save_dir, 'best_acc_model_{}.pth.tar'.format(epoch))
            torch.save(model.state_dict(), model_path)

            prev_best_acc_model_path = model_path
        # Track the best validation-loss checkpoint separately.
        if val_loss < prev_best_val_loss:
            model_path = os.path.join(
                model_save_dir, 'best_loss_model_{}.pth.tar'.format(epoch))
            torch.save(model.state_dict(), model_path)
            prev_best_val_loss = val_loss

            prev_best_loss_model_path = model_path
        scheduler.step(val_loss)
        # Unconditional periodic snapshot for resuming/inspection.
        if epoch % 20 == 0:
            checkpoints = os.path.join(model_save_dir, str(epoch) + ".pth.tar")
            torch.save(model.state_dict(), checkpoints)
            print("save_to:", checkpoints)
    print("best is :", best_acc, best_epoch)
Exemplo n.º 22
0
def main():
    """Entry point for Invariance Propagation training.

    Parses CLI arguments, builds the dataset/network/optimizer, optionally
    resumes from a pretrained checkpoint, then trains with per-epoch kNN
    evaluation, checkpointing each epoch plus the best model and selected
    milestone epochs.  KeyboardInterrupt saves results before exiting.

    Raises:
        ValueError: if the dataset, network, or ramp_up argument names an
            unsupported option.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--stage', default='train', type=str)

    parser.add_argument('--gpus', default='0,1,2,3', type=str)
    parser.add_argument('--max_epoch', default=200, type=int)
    parser.add_argument('--lr_decay_steps', default='160,190,200', type=str)
    parser.add_argument('--exp', default='', type=str)
    parser.add_argument('--res_path', default='', type=str)
    parser.add_argument('--resume_path', default='', type=str)
    parser.add_argument('--pretrain_path', default='', type=str)

    parser.add_argument('--dataset', default='imagenet', type=str)
    parser.add_argument('--lr', default=0.03, type=float)
    parser.add_argument('--lr_decay_rate', default=0.1, type=float)
    parser.add_argument('--batch_size', default=128, type=int)
    parser.add_argument('--weight_decay', default=5e-4, type=float)
    parser.add_argument('--n_workers', default=32, type=int)
    parser.add_argument('--n_background', default=4096, type=int)
    parser.add_argument('--t', default=0.07, type=float)
    parser.add_argument('--m', default=0.5, type=float)
    parser.add_argument('--dropout', action='store_true')
    parser.add_argument('--blur', action='store_true')
    parser.add_argument('--cos', action='store_true')

    parser.add_argument('--network', default='resnet18', type=str)
    parser.add_argument('--mix', action='store_true')
    parser.add_argument('--not_hardpos', action='store_true')
    parser.add_argument('--InvP', type=int, default=1)
    parser.add_argument('--ramp_up', default='binary', type=str)
    parser.add_argument('--lam_inv', default=0.6, type=float)
    parser.add_argument('--lam_mix', default=1.0, type=float)
    # for cifar 10 the best diffusion_layer is 3 and cifar 100 is 4
    # for imagenet only diffusion_layer = 3 has been tested
    parser.add_argument('--diffusion_layer', default=3, type=int)
    parser.add_argument('--K_nearst', default=4, type=int)
    # for cifar10 the best n_pos is 20, for cifar 100 the best is 10 or 20
    parser.add_argument('--n_pos', default=50, type=int)
    # exclusive best to be 0
    parser.add_argument('--exclusive', default=1, type=int)
    parser.add_argument('--nonlinearhead', default=0, type=int)

    global args
    args = parser.parse_args()
    # Build a run identifier from the hyperparameters that matter.
    exp_identifier = get_expidentifier([
        'mix', 'network', 'lam_inv', 'lam_mix', 'diffusion_layer', 'K_nearst',
        'n_pos', 'exclusive', 'max_epoch', 'ramp_up', 'nonlinearhead', 't',
        'weight_decay'
    ], args)
    if not args.InvP: exp_identifier = 'hard'
    args.exp = os.path.join(args.exp, exp_identifier)

    # Create the experiment directory layout: runs/ (tensorboard),
    # models/ (checkpoints), logs/ (text logs).
    if not os.path.exists(args.exp):
        os.makedirs(args.exp)
    if not os.path.exists(os.path.join(args.exp, 'runs')):
        os.makedirs(os.path.join(args.exp, 'runs'))
    if not os.path.exists(os.path.join(args.exp, 'models')):
        os.makedirs(os.path.join(args.exp, 'models'))
    if not os.path.exists(os.path.join(args.exp, 'logs')):
        os.makedirs(os.path.join(args.exp, 'logs'))

    logger = getLogger(args.exp)

    device_ids = list(map(lambda x: int(x), args.gpus.split(',')))
    # Was 'cuda: 0' — torch device strings must not contain spaces.
    device = torch.device('cuda:0')

    # Select the dataloaders; fail fast on an unknown dataset name
    # (previously train_loader was left unbound and crashed later).
    if args.dataset.startswith('cifar'):
        train_loader, val_loader, train_ordered_labels, train_dataset, val_dataset = cifar.get_dataloader(
            args)
    elif args.dataset.startswith('imagenet'):
        train_loader, val_loader, train_ordered_labels, train_dataset, val_dataset = imagenet.get_instance_dataloader(
            args)
    elif args.dataset == 'svhn':
        train_loader, val_loader, train_ordered_labels, train_dataset, val_dataset = svhn.get_dataloader(
            args)
    else:
        raise ValueError('Unsupported dataset: {!r}'.format(args.dataset))

    # Create the backbone. Previously this chain mixed a leading `if` with a
    # second `if`/`elif` chain and had no else, so an unknown network name
    # crashed later with a NameError; now it is one chain with a clear error.
    if args.network == 'alexnet':
        network = alexnet(128)
    elif args.network == 'alexnet_cifar':
        network = AlexNet_cifar(128)
    elif args.network == 'resnet18_cifar':
        network = ResNet18_cifar(128,
                                 dropout=args.dropout,
                                 non_linear_head=args.nonlinearhead)
    elif args.network == 'resnet50_cifar':
        network = ResNet50_cifar(128, dropout=args.dropout)
    elif args.network == 'wide_resnet28':
        network = WideResNetInstance(28, 2)
    elif args.network == 'resnet18':
        network = resnet18(non_linear_head=args.nonlinearhead)
    elif args.network == 'pre-resnet18':
        network = PreActResNet18(128)
    elif args.network == 'resnet50':
        network = resnet50(non_linear_head=args.nonlinearhead)
    elif args.network == 'pre-resnet50':
        network = PreActResNet50(128)
    else:
        raise ValueError('Unsupported network: {!r}'.format(args.network))
    network = nn.DataParallel(network, device_ids=device_ids)
    network.to(device)

    # Create the optimizer; pre-act ResNets exclude BN params from weight
    # decay, following LocalAggregation (Zhuang et al., ICCV 2019).
    if args.network == 'pre-resnet18' or args.network == 'pre-resnet50':
        logging.info(
            colorful(
                'Exclude bns from weight decay, copied from LocalAggregation proposed by Zhuang et al [ICCV 2019]'
            ))
        parameters = exclude_bn_weight_bias_from_weight_decay(
            network, weight_decay=args.weight_decay)
    else:
        parameters = network.parameters()

    optimizer = torch.optim.SGD(
        parameters,
        lr=args.lr,
        momentum=0.9,
        weight_decay=args.weight_decay,
    )

    cudnn.benchmark = True

    # Create the memory bank shared by the loss and kNN evaluation.
    global writer
    writer = SummaryWriter(comment='InvariancePropagation',
                           logdir=os.path.join(args.exp, 'runs'))
    memory_bank = objective.MemoryBank_v1(len(train_dataset),
                                          train_ordered_labels,
                                          writer,
                                          device,
                                          m=args.m)

    # Create the criteria and the ramp-up weighting schedule.
    criterionA = objective.InvariancePropagationLoss(
        args.t,
        diffusion_layer=args.diffusion_layer,
        k=args.K_nearst,
        n_pos=args.n_pos,
        exclusive=args.exclusive,
        InvP=args.InvP,
        hard_pos=(not args.not_hardpos))
    criterionB = objective.MixPointLoss(args.t)
    if args.ramp_up == 'binary':
        ramp_up = lambda i_epoch: objective.BinaryRampUp(i_epoch, 30)
    elif args.ramp_up == 'gaussian':
        ramp_up = lambda i_epoch: objective.GaussianRampUp(i_epoch, 30, 5)
    elif args.ramp_up == 'zero':
        ramp_up = lambda i_epoch: 1
    else:
        raise ValueError('Unsupported ramp_up: {!r}'.format(args.ramp_up))

    logging.info(beautify(args))
    start_epoch = 0
    # Optionally resume network/optimizer/memory-bank state from a checkpoint.
    if args.pretrain_path != '' and args.pretrain_path != 'none':
        logging.info('loading pretrained file from {}'.format(
            args.pretrain_path))
        checkpoint = torch.load(args.pretrain_path)
        network.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        _memory_bank = checkpoint['memory_banks']
        try:
            _neigh = checkpoint['neigh']
            memory_bank.neigh = _neigh
        except KeyError:  # was a bare except; only a missing key is expected
            logging.info(
                colorful(
                    'The Pretrained Path has No NEIGH and require a epoch to re-calculate'
                ))
        memory_bank.points = _memory_bank
        start_epoch = checkpoint['epoch']
    else:
        initialize_memorybank(network, train_loader, device, memory_bank)
    logging.info('start training')
    best_acc = 0.0

    # Bind i_epoch before the loop so the KeyboardInterrupt handler cannot
    # hit a NameError when interrupted before the first iteration completes.
    i_epoch = start_epoch
    try:
        for i_epoch in range(start_epoch, args.max_epoch):
            adjust_learning_rate(args.lr,
                                 args.lr_decay_steps,
                                 optimizer,
                                 i_epoch,
                                 lr_decay_rate=args.lr_decay_rate,
                                 cos=args.cos,
                                 max_epoch=args.max_epoch)
            train(i_epoch, network, criterionA, criterionB, optimizer,
                  train_loader, device, memory_bank, ramp_up)

            # Rolling checkpoint, overwritten every epoch.
            save_name = 'checkpoint.pth'
            checkpoint = {
                'epoch': i_epoch + 1,
                'state_dict': network.state_dict(),
                'optimizer': optimizer.state_dict(),
                'memory_banks': memory_bank.points,
                'neigh': memory_bank.neigh,
            }
            torch.save(checkpoint, os.path.join(args.exp, 'models', save_name))

            # Evaluate with weighted kNN over the memory bank.
            acc = kNN(i_epoch,
                      network,
                      memory_bank,
                      val_loader,
                      train_ordered_labels,
                      K=200,
                      sigma=0.07)
            if acc >= best_acc:
                best_acc = acc
                torch.save(checkpoint,
                           os.path.join(args.exp, 'models', 'best.pth'))
            # Keep snapshots at selected milestone epochs.
            if i_epoch in [30, 60, 120, 160, 200, 400, 600]:
                torch.save(
                    checkpoint,
                    os.path.join(args.exp, 'models',
                                 '{}.pth'.format(i_epoch + 1)))

            args.y_best_acc = best_acc
            logging.info(
                colorful('[Epoch: {}] val acc: {:.4f}'.format(i_epoch, acc)))
            logging.info(
                colorful('[Epoch: {}] best acc: {:.4f}'.format(
                    i_epoch, best_acc)))
            writer.add_scalar('acc', acc, i_epoch + 1)

            # Log non-BN parameter histograms each epoch.
            with torch.no_grad():
                for name, param in network.named_parameters():
                    if 'bn' not in name:
                        writer.add_histogram(name, param, i_epoch)

    except KeyboardInterrupt as e:
        logging.info('KeyboardInterrupt at {} Epochs'.format(i_epoch))
        save_result(args)
        exit()

    save_result(args)
Exemplo n.º 23
0
class DQNAdaptive(object):
    """Adaptive agent that learns a policy with the DQN algorithm.

    Maintains an eval/target network pair, a prioritized experience-replay
    buffer, and linear annealing schedules for the exploration rate
    (epsilon) and the importance-sampling exponent (beta).  Training
    progress is logged to TensorBoard through a ``SummaryWriter``.
    """

    def __init__(self, name, choices, network_config, reinforce_config):
        super(DQNAdaptive, self).__init__()
        self.name = name
        self.choices = choices  # list of selectable actions; index = action id
        self.network_config = network_config
        self.reinforce_config = reinforce_config

        # Prioritized replay buffer; 0.6 is the prioritization exponent alpha.
        self.memory = PrioritizedReplayBuffer(self.reinforce_config.memory_size, 0.6)
        self.learning = True
        self.explanation = False

        # Global counters, persisted via save()/restore_state().
        self.steps = 0
        self.reward_history = []
        self.episode_time_history = []
        self.best_reward_mean = -maxsize
        self.episode = 0

        self.reset()

        reinforce_summary_path = self.reinforce_config.summaries_path + "/" + self.name

        # Start with fresh summaries unless we are resuming a saved network.
        if not self.network_config.restore_network:
            clear_summary_path(reinforce_summary_path)
        else:
            self.restore_state()

        self.summary = SummaryWriter(log_dir=reinforce_summary_path)

        self.target_model = DQNModel(self.name + "_target", self.network_config, use_cuda)
        self.eval_model = DQNModel(self.name + "_eval", self.network_config, use_cuda)

        # Linear schedules annealed against the global step counter.
        self.beta_schedule = LinearSchedule(self.reinforce_config.beta_timesteps,
                                            initial_p=self.reinforce_config.beta_initial,
                                            final_p=self.reinforce_config.beta_final)

        self.epsilon_schedule = LinearSchedule(self.reinforce_config.epsilon_timesteps,
                                               initial_p=self.reinforce_config.starting_epsilon,
                                               final_p=self.reinforce_config.final_epsilon)

    def __del__(self):
        # NOTE(review): relying on __del__ for persistence is fragile --
        # it is not guaranteed to run at interpreter shutdown; consider an
        # explicit shutdown/save call at the end of training.
        self.save()
        self.summary.close()

    def should_explore(self):
        """Return True when a random (exploratory) action should be taken."""
        self.epsilon = self.epsilon_schedule.value(self.steps)
        self.summary.add_scalar(tag='%s/Epsilon' % self.name,
                                scalar_value=self.epsilon,
                                global_step=self.steps)

        return random.random() < self.epsilon

    def predict(self, state):
        """Choose an action for *state*; also records the previous transition.

        Returns a (choice, q_values) tuple; ``q_values`` is None when the
        action was chosen by exploration.
        """
        self.steps += 1

        # add to experience (terminal flag 0: the episode continues)
        if self.previous_state is not None:
            self.memory.add(self.previous_state,
                            self.previous_action,
                            self.current_reward,
                            state, 0)

        if self.learning and self.should_explore():
            q_values = None
            choice = random.choice(self.choices)
            action = self.choices.index(choice)
        else:
            # Greedy action from the eval network; accumulate wall time spent.
            self.prediction_time -= time.time()
            _state = Tensor(state).unsqueeze(0)
            action, q_values = self.eval_model.predict(_state,
                                                       self.steps,
                                                       self.learning)
            choice = self.choices[action]
            self.prediction_time += time.time()

        # Periodically sync the target network with the eval network.
        if self.learning and self.steps % self.reinforce_config.replace_frequency == 0:
            logger.debug("Replacing target model for %s" % self.name)
            self.target_model.replace(self.eval_model)

        # Run a learning update once warm-up is over, at a fixed cadence.
        if (self.learning and
            self.steps > self.reinforce_config.update_start and
                self.steps % self.reinforce_config.update_steps == 0):
            self.update_time -= time.time()
            self.update()
            self.update_time += time.time()

        self.current_reward = 0
        self.previous_state = state
        self.previous_action = action

        return choice, q_values

    def disable_learning(self, is_save = True):
        """Stop learning updates; optionally persist current state first."""
        logger.info("Disabled Learning for %s agent" % self.name)
        if is_save:
            self.save()
        self.learning = False
        self.episode = 0

    def end_episode(self, state):
        """Close out the episode that ended in *state*: log, store, save, reset."""
        if not self.learning:
            return

        episode_time = time.time() - self.episode_time

        self.reward_history.append(self.total_reward)
        self.episode_time_history.append(episode_time)
        total_time = sum(self.episode_time_history)
        avg_time = total_time / len(self.episode_time_history)

        logger.info("End of Episode %d, "
                    "Total reward %.2f, "
                    "Epsilon %.2f" % (self.episode + 1,
                                      self.total_reward,
                                      self.epsilon))

        logger.debug("Episode Time: %.2fs (%.2fs), "
                     "Prediction Time: %.2f, "
                     "Update Time %.2f" % (episode_time,
                                           avg_time,
                                           self.prediction_time,
                                           self.update_time))

        self.episode += 1
        self.summary.add_scalar(tag='%s/Episode Reward' % self.name,
                                scalar_value=self.total_reward,
                                global_step=self.episode)

        # Store the final transition of the episode (terminal flag 1).
        self.memory.add(self.previous_state,
                        self.previous_action,
                        self.current_reward,
                        state, 1)
        self.save()
        self.reset()

    def reset(self):
        """Reset per-episode bookkeeping (rewards, timers, last transition)."""
        self.episode_time = time.time()
        self.current_reward = 0
        self.total_reward = 0
        self.previous_state = None
        self.previous_action = None
        self.prediction_time = 0
        self.update_time = 0

    def restore_state(self):
        """Reload step/episode counters saved alongside the network, if any."""
        restore_path = self.network_config.network_path + "/adaptive.info"
        if self.network_config.network_path and os.path.exists(restore_path):
            logger.info("Restoring state from %s" % self.network_config.network_path)

            with open(restore_path, "rb") as file:
                info = pickle.load(file)

            self.steps = info["steps"]
            self.best_reward_mean = info["best_reward_mean"]
            self.episode = info["episode"]

    def save(self, force=False):
        """Persist networks and counters when a new best mean reward is seen.

        NOTE(review): the ``force`` flag is currently ignored -- saving only
        happens on the ``save_steps`` cadence and only when the recent mean
        reward improves; confirm whether force-saving was intended.
        """
        info = {
            "steps": self.steps,
            "best_reward_mean": self.best_reward_mean,
            "episode": self.episode
        }

        if (len(self.reward_history) >= self.network_config.save_steps and
                self.episode % self.network_config.save_steps == 0):

            # Mean reward over the most recent save window.
            total_reward = sum(self.reward_history[-self.network_config.save_steps:])
            current_reward_mean = total_reward / self.network_config.save_steps

            if current_reward_mean >= self.best_reward_mean:
                self.best_reward_mean = current_reward_mean
                logger.info("Saving network. Found new best reward (%.2f)" % current_reward_mean)
                self.eval_model.save_network()
                self.target_model.save_network()
                with open(self.network_config.network_path + "/adaptive.info", "wb") as file:
                    pickle.dump(info, file, protocol=pickle.HIGHEST_PROTOCOL)
            else:
                logger.info("The best reward is still %.2f. Not saving" % self.best_reward_mean)

    def reward(self, r):
        """Accumulate reward *r* into both episode and step-level totals."""
        self.total_reward += r
        self.current_reward += r

    def update(self):
        """Sample a prioritized batch and take one DQN learning step."""
        # Not enough experience collected yet to fill a batch.
        if self.steps <= self.reinforce_config.batch_size:
            return

        beta = self.beta_schedule.value(self.steps)
        self.summary.add_scalar(tag='%s/Beta' % self.name,
                                scalar_value=beta, global_step=self.steps)

        batch = self.memory.sample(self.reinforce_config.batch_size, beta)

        (states, actions, reward, next_states,
         is_terminal, weights, batch_idxes) = batch

        self.summary.add_histogram(tag='%s/Batch Indices' % self.name,
                                   values=Tensor(batch_idxes),
                                   global_step=self.steps)

        states = FloatTensor(states)
        next_states = FloatTensor(next_states)
        terminal = FloatTensor([1 if t else 0 for t in is_terminal])
        reward = FloatTensor(reward)
        batch_index = torch.arange(self.reinforce_config.batch_size,
                                   dtype=torch.long)

        # Current Q Values for the actions actually taken.
        q_actions, q_values = self.eval_model.predict_batch(states)
        q_values = q_values[batch_index, actions]

        # Calculate target: r + gamma * max_a' Q_target(s', a'), zeroed at terminals.
        actions, q_next = self.target_model.predict_batch(next_states)
        q_max = q_next.max(1)[0].detach()
        q_max = (1 - terminal) * q_max

        q_target = reward + self.reinforce_config.discount_factor * q_max

        # update model
        self.eval_model.fit(q_values, q_target, self.steps)
        # Update priorities proportionally to the TD error (plus epsilon
        # so no transition gets zero sampling probability).
        td_errors = q_values - q_target
        new_priorities = torch.abs(td_errors) + 1e-6  # prioritized_replay_eps
        self.memory.update_priorities(batch_idxes, new_priorities.data)
Exemplo n.º 24
0
def train_model(model,
                datasets,
                batch_size,
                epochs,
                learning_rate,
                weight_decay=0,
                metadata=None,
                weights=None,
                checkpoint=None):
    """Train a sequence model on the Emoji Dataset.

    Args:
        model (torch.nn.Module): the model to be trained
        datasets (tuple): contains 3 datasets (TweetsBaseDataset)
            corresponding to train, dev and test splits
        batch_size (int): mini-batch size for training
        epochs (int): number of iterations over the training set
        learning_rate (float): used in the optimizer
        weight_decay (float): regularization factor for the optimizer
        metadata (dict): contains keys and values of any type with a valid
            string representation, which are saved for visualization in
            TensorBoard. Use to log model name and hyperparameters
        weights (dict): maps strings to weights (torch.tensor) to be
            visualized as histograms in TensorBoard
        checkpoint (str): path of an existing checkpoint (.pt) file

    Returns:
        tuple, containing best validation F1 score and test F1 score
    """
    train_set, dev_set, test_set = datasets
    train_loader = DataLoader(train_set,
                              batch_size,
                              shuffle=True,
                              num_workers=4,
                              collate_fn=TweetsBaseDataset.collate_fn)

    model.to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = optim.Adam(model.parameters(),
                           lr=learning_rate,
                           weight_decay=weight_decay)

    # Optionally resume model/optimizer state from an existing checkpoint.
    if checkpoint is not None:
        load_training_state(model, optimizer, checkpoint, eval_model=False)

    # A writer to save TensorBoard events; checkpoints are stored in its logdir.
    writer = SummaryWriter()
    logdir = writer.file_writer.get_logdir()

    # Write hyperparameters to summary
    if metadata is None:
        metadata = {}
    metadata['Batch size'] = batch_size
    metadata['Learning rate'] = learning_rate
    text_summary = _build_text_summary(metadata)
    writer.add_text('metadata', text_summary)

    best_score = 0
    test_f1 = 0
    best_ckpt_link = os.path.join(logdir, 'best-ckpt.pt')

    try:
        steps = 0
        for epoch in range(1, epochs + 1):
            model.train()
            print('Epoch {:d}/{:d}'.format(epoch, epochs))
            n_batches = 0
            for inputs, labels, lengths, indices in train_loader:
                steps += 1
                n_batches += 1

                inputs = inputs.to(device)
                labels = labels.to(device)
                lengths = lengths.to(device)

                # Initialize the gradients to zero
                optimizer.zero_grad()

                # Run the model
                outputs = model(inputs, lengths)
                loss = criterion(outputs, labels)

                # Optimize
                loss.backward()
                optimizer.step()

                # Log scores on training set every 100 mini-batches
                if n_batches % 100 == 0:
                    f1 = _get_score(outputs, labels)
                    print("\r{}/{}: loss = {:.4f}, f1_score = {:.4f}".format(
                        n_batches, len(train_loader), loss, f1),
                          end='',
                          flush=True)

                    # Write metrics to TensorBoard
                    writer.add_scalar('training/loss', loss, steps)
                    writer.add_scalar('training/f1_score', f1, steps)
                    # Write histograms
                    if weights is not None:
                        for name, data in weights.items():
                            writer.add_histogram('weights/' + name, data,
                                                 steps)

            # Evaluate on dev set
            eval_loss, eval_f1 = evaluate(model, criterion, dev_set)
            print("\nvalidation loss = {:.4f}, validation f1_score = {:.4f}".
                  format(eval_loss, eval_f1))

            # Write to Tensorboard
            writer.add_scalar('validation/loss', eval_loss, steps)
            writer.add_scalar('validation/f1_score', eval_f1, steps)

            # Save the checkpoint
            ckpt_path = os.path.join(logdir, 'ckpt-{:d}.pt'.format(epoch))
            save_model(model, optimizer, epoch, ckpt_path)

            # Create a symbolic link to the best model so far
            if eval_f1 > best_score:
                best_score = eval_f1
                if os.path.islink(best_ckpt_link):
                    os.unlink(best_ckpt_link)
                os.symlink(os.path.basename(ckpt_path), best_ckpt_link)

        print("Training Completed. Evaluating on test set...")

        # Evaluate on test set
        test_loss, test_f1 = evaluate(model, criterion, test_set)
        _, test_precision = evaluate(model,
                                     criterion,
                                     test_set,
                                     score="precision")
        _, test_recall = evaluate(model, criterion, test_set, score="recall")
        print(
            "\ntest loss = {:.4f}, test f1_score = {:.4f}, test precision = {:.4f}, test recall = {:.4f}"
            .format(test_loss, test_f1, test_precision, test_recall))

        # Write to Tensorboard
        writer.add_scalar('test/loss', test_loss, 0)
        writer.add_scalar('test/f1_score', test_f1, 0)

    except KeyboardInterrupt:
        print('Interrupted training.')
    finally:
        # Fix: the SummaryWriter was never closed on any path, so buffered
        # events could be lost; always flush and release the event file.
        writer.close()

    return best_score, test_f1
Exemplo n.º 25
0
# For testing just do everything in one giant batch
testloader = torch.utils.data.DataLoader(
    dataset_test, batch_size=len(dataset_test), shuffle=False, num_workers=0,
)

# Autoencoder-style network: input and output share the dataset dimension.
model = FeedForward(dim=dataset_train.dim, hidden_size=args.hidden_size, output_size=dataset_train.dim)

# Open a tensorboard writer if a logging directory is given
if args.logdir != '':
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    save_dir = osp.join(args.logdir, current_time)
    writer = SummaryWriter(log_dir=save_dir)
    if args.weight_histogram:
        # Log the initial parameters at global step 0, before any training
        for name, param in model.named_parameters():
            writer.add_histogram('parameters/' + name, param.clone().cpu().data.numpy(), 0)

# Two criteria tracked side by side: MSE plus a cosine-embedding loss
# (presumably a denoising task -- the loader yields (noisy, clean) pairs).
mse_criterion = nn.MSELoss()
cosine_criterion = nn.CosineEmbeddingLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

for e in range(args.epochs):
    print('Epoch: {0}'.format(e + 1))

    # Running totals for per-epoch average losses.
    avg_mse_loss = 0
    avg_cosine_loss = 0
    n_batches = 0
    for i, data in enumerate(trainloader):

        # Each sample is a (noisy, clean) signal pair.
        noisy, clean = data
Exemplo n.º 26
0
            single_mt.eval()
            eval_x, eval_y = dataset.slide_seq2seq_batch(
                2, config.max_seq, 'eval')
            eval_x = torch.from_numpy(eval_x).contiguous().to(config.device,
                                                              dtype=torch.int)
            eval_y = torch.from_numpy(eval_y).contiguous().to(config.device,
                                                              dtype=torch.int)

            eval_preiction, weights = single_mt.forward(eval_x)

            eval_metrics = eval_metric_set(eval_preiction, eval_y)
            torch.save(single_mt.state_dict(),
                       args.model_dir + '/train-{}.pth'.format(e))
            if b == 0:
                train_summary_writer.add_histogram("target_analysis",
                                                   batch_y,
                                                   global_step=e)
                train_summary_writer.add_histogram("source_analysis",
                                                   batch_x,
                                                   global_step=e)
                for i, weight in enumerate(weights):
                    attn_log_name = "attn/layer-{}".format(i)
                    utils.attention_image_summary(attn_log_name,
                                                  weight,
                                                  step=idx,
                                                  writer=eval_summary_writer)

            eval_summary_writer.add_scalar('loss',
                                           eval_metrics['loss'],
                                           global_step=idx)
            eval_summary_writer.add_scalar('accuracy',
Exemplo n.º 27
0
class TBXLogger(Logger):
    """TensorBoardX Logger.

    Note that hparams will be written only after a trial has terminated.
    This logger automatically flattens nested dicts to show on TensorBoard:

        {"a": {"b": 1, "c": 2}} -> {"a/b": 1, "a/c": 2}
    """
    def _init(self):
        # Import lazily so users without tensorboardX installed get an
        # actionable install hint instead of a bare import failure.
        try:
            from tensorboardX import SummaryWriter
        except ImportError:
            logger.error("pip install 'ray[tune]' to see TensorBoard files.")
            raise
        self._file_writer = SummaryWriter(self.logdir, flush_secs=30)
        self.last_result = None

    def on_result(self, result):
        """Write one trial result to the event file as scalars/histograms."""
        # Prefer total timesteps as the x-axis; fall back to the iteration.
        step = result.get(TIMESTEPS_TOTAL) or result[TRAINING_ITERATION]

        tmp = result.copy()
        for k in [
                "config", "pid", "timestamp", TIME_TOTAL_S, TRAINING_ITERATION
        ]:
            if k in tmp:
                del tmp[k]  # not useful to log these

        # Flatten nested dicts into "a/b" keys so TB groups them as sections.
        flat_result = flatten_dict(tmp, delimiter="/")
        path = ["ray", "tune"]
        valid_result = {}

        for attr, value in flat_result.items():
            full_attr = "/".join(path + [attr])
            if type(value) in VALID_SUMMARY_TYPES:
                # Plain numeric value -> scalar summary.
                valid_result[full_attr] = value
                self._file_writer.add_scalar(full_attr,
                                             value,
                                             global_step=step)
            elif type(value) is list and len(value) > 0:
                # Non-empty list -> histogram summary (best effort).
                valid_result[full_attr] = value
                try:
                    self._file_writer.add_histogram(full_attr,
                                                    value,
                                                    global_step=step)
                # In case TensorboardX still doesn't think it's a valid value
                # (e.g. `[[]]`), warn and move on.
                except (ValueError, TypeError):
                    if log_once("invalid_tbx_value"):
                        logger.warning(
                            "You are trying to log an invalid value ({}={}) "
                            "via {}!".format(full_attr, value,
                                             type(self).__name__))

        # Remember what was loggable so close() can attach it to hparams.
        self.last_result = valid_result
        self._file_writer.flush()

    def flush(self):
        if self._file_writer is not None:
            self._file_writer.flush()

    def close(self):
        # Hparams can only be written once the trial has finished (see
        # the class docstring), hence they are emitted here.
        if self._file_writer is not None:
            if self.trial and self.trial.evaluated_params and self.last_result:
                self._try_log_hparams(self.last_result)
            self._file_writer.close()

    def _try_log_hparams(self, result):
        # TBX currently errors if the hparams value is None.
        scrubbed_params = {
            k: v
            for k, v in self.trial.evaluated_params.items() if v is not None
        }
        from tensorboardX.summary import hparams
        experiment_tag, session_start_tag, session_end_tag = hparams(
            hparam_dict=scrubbed_params, metric_dict=result)
        self._file_writer.file_writer.add_summary(experiment_tag)
        self._file_writer.file_writer.add_summary(session_start_tag)
        self._file_writer.file_writer.add_summary(session_end_tag)
Exemplo n.º 28
0
class GAN(object):
    """GAN class."""
    def __init__(self, opt, dataset_load=None, exp_dir=None):
        """Constructor.

        Args:
            opt: options namespace; mutated here (out_dir, sampling params).
            dataset_load: dataset provider used to build the data loader.
            exp_dir (str): experiment output directory, stored in opt.out_dir.
        """
        # Save variables
        self.opt = opt
        self.dataset_load = dataset_load
        self.opt.out_dir = exp_dir

        # Define other variables: labels for the discriminator targets.
        self.real_label = 1
        self.fake_label = 0

        # Losses file (plain-text log of loss values).
        file_name = os.path.join(self.opt.out_dir, 'losses.txt')
        self.output_loss_file = open(file_name, "wt")

        # Sampling mode is mutually exclusive: full-sphere sampling uses
        # axis/angle (so phi/theta are cleared), otherwise theta/phi ranges
        # are used (so angle/axis are cleared).  TODO: Add comment
        if self.opt.full_sphere_sampling:
            self.opt.phi = None
            self.opt.theta = None
            self.opt.cam_dist = self.opt.cam_dist + 0.2
        else:
            self.opt.angle = None
            self.opt.axis = None

        # TensorboardX
        self.writer = SummaryWriter(self.opt.vis_monitoring)
        print(self.opt.vis_monitoring)
        print(self.opt.out_dir)
        # Create dataset loader
        self.create_dataset_loader()

        # Create the networks

        # Create create_tensors
        self.create_tensors()

        # Create criterion

        # Create create optimizers

        # Create splats rendering scene
        self.create_scene()

    def create_dataset_loader(self, ):
        """Create the dataset and its loader via ``dataset_load``."""
        # Define camera positions: with same_view, every item in the batch
        # shares one fixed camera at (3, 3, 3).
        if self.opt.same_view:
            # self.cam_pos = uniform_sample_sphere(radius=self.opt.cam_dist,
            #                                      num_samples=1)
            arrays = [
                np.asarray([3., 3., 3.]) for _ in range(self.opt.batchSize)
            ]  # TODO: Magic numbers
            self.cam_pos = np.stack(arrays, axis=0)

        # Create dataset loader
        self.dataset_load.initialize_dataset()
        self.dataset = self.dataset_load.get_dataset()
        self.dataset_load.initialize_dataset_loader(1)  # TODO: Hack
        self.dataset_loader = self.dataset_load.get_dataset_loader()

    def create_networks(self, ):
        """Create the generator/discriminator and normal-estimation nets."""
        self.netG, self.netG2, self.netD, self.netD2 = create_networks(
            self.opt, verbose=True, depth_only=True)  # TODO: Remove D2 and G2
        # Create the normal estimation network which takes pointclouds in the
        # camera space and outputs the normals
        assert self.netG2 is None
        self.sph_normals = True
        self.netG2 = NEstNetV1_2(sph=self.sph_normals)
        print(self.netG2)
        # Move networks to GPU unless CUDA is disabled.
        if not self.opt.no_cuda:
            self.netD = self.netD.cuda()
            self.netG = self.netG.cuda()
            self.netG2 = self.netG2.cuda()

    def create_scene(self, ):
        """Create a semi-empty scene with camera parameters."""
        opts = self.opt
        self.scene = create_scene(opts.splats_img_size,
                                  opts.splats_img_size,
                                  opts.fovy,
                                  opts.focal_length,
                                  opts.n_splats)

    def create_tensors(self, ):
        """Create the tensors."""
        # Create tensors
        # Rendered image batch (batchSize x channels x H x W).
        self.input = torch.FloatTensor(self.opt.batchSize,
                                       self.opt.render_img_nc,
                                       self.opt.render_img_size,
                                       self.opt.render_img_size)
        # Single-channel depth and normal buffers of the same spatial size.
        self.input_depth = torch.FloatTensor(self.opt.batchSize, 1,
                                             self.opt.render_img_size,
                                             self.opt.render_img_size)
        self.input_normal = torch.FloatTensor(self.opt.batchSize, 1,
                                              self.opt.render_img_size,
                                              self.opt.render_img_size)
        # 3-vector conditioning input per batch item (presumably the camera
        # position -- TODO confirm against get_real_samples()).
        self.input_cond = torch.FloatTensor(self.opt.batchSize, 3)

        # Latent noise: per-batch sampled and a fixed vector for monitoring.
        self.noise = torch.FloatTensor(self.opt.batchSize, int(self.opt.nz), 1,
                                       1)
        self.fixed_noise = torch.FloatTensor(self.opt.batchSize,
                                             int(self.opt.nz), 1,
                                             1).normal_(0, 1)

        # Labels sized for a combined real+fake batch; +/-1 scalars used as
        # gradient directions (WGAN-style -- TODO confirm in training loop).
        self.label = torch.FloatTensor(2 * self.opt.batchSize)
        self.one = torch.FloatTensor([1])
        self.mone = self.one * -1

        # Move them to the GPU
        if not self.opt.no_cuda:
            self.input = self.input.cuda()
            self.input_depth = self.input_depth.cuda()
            self.input_normal = self.input_normal.cuda()
            self.input_cond = self.input_cond.cuda()

            self.label = self.label.cuda()
            self.noise = self.noise.cuda()
            self.fixed_noise = self.fixed_noise.cuda()

            self.one = self.one.cuda()
            self.mone = self.mone.cuda()

        # NOTE(review): torch.autograd.Variable is a deprecated no-op wrapper
        # on modern PyTorch; tensors are differentiable directly.
        self.fixed_noise = Variable(self.fixed_noise)  # TODO: Why?

    def create_criterion(self, ):
        """Create criterion."""
        self.criterion = nn.BCELoss()
        if not self.opt.no_cuda:
            self.criterion = self.criterion.cuda()

    def create_optimizers(self, ):
        """Create optimizers and LR schedulers for D, G and the normal net.

        Raises:
            ValueError: if the optimizer or scheduler type is unknown.
        """
        if self.opt.optimizer == 'adam':
            self.optimizerD = optim.Adam(self.netD.parameters(),
                                         lr=self.opt.lr,
                                         betas=(self.opt.beta1, 0.999))
            self.optimizerG = optim.Adam(self.netG.parameters(),
                                         lr=self.opt.lr,
                                         betas=(self.opt.beta1, 0.999))
            self.optimizerG2 = optim.Adam(self.netG2.parameters(),
                                          lr=self.opt.lr,
                                          betas=(self.opt.beta1, 0.999))
        elif self.opt.optimizer == 'rmsprop':
            self.optimizerD = optim.RMSprop(self.netD.parameters(),
                                            lr=self.opt.lr)
            self.optimizerG = optim.RMSprop(self.netG.parameters(),
                                            lr=self.opt.lr)
            self.optimizerG2 = optim.RMSprop(self.netG2.parameters(),
                                             lr=self.opt.lr)
        else:
            raise ValueError('Unknown optimizer: ' + self.opt.optimizer)

        # Create the schedulers
        # NOTE(review): two latent bugs here -- (1) when lr_sched_type is
        # None, LR_fn is None and the calls below raise TypeError; (2)
        # ExponentialLR does not accept a step_size kwarg, so the 'exp'
        # branch would also fail.  Only 'step' works as written; fixing
        # changes the attributes callers see, so flagged rather than changed.
        if self.opt.lr_sched_type == 'step':
            LR_fn = optim.lr_scheduler.StepLR
        elif self.opt.lr_sched_type == 'exp':
            LR_fn = optim.lr_scheduler.ExponentialLR
        elif self.opt.lr_sched_type is None:
            LR_fn = None
        else:
            raise ValueError('Unknown scheduler')

        self.optG_z_lr_scheduler = LR_fn(self.optimizerG,
                                         step_size=self.opt.z_lr_sched_step,
                                         gamma=self.opt.z_lr_sched_gamma)
        self.optG2_normal_lr_scheduler = LR_fn(
            self.optimizerG2,
            step_size=self.opt.normal_lr_sched_step,
            gamma=self.opt.normal_lr_sched_gamma)
        # Parallel lists so the training loop can step schedulers/optimizers
        # for both generators together.
        self.LR_SCHED_MAP = [
            self.optG_z_lr_scheduler, self.optG2_normal_lr_scheduler
        ]
        self.OPT_MAP = [self.optimizerG, self.optimizerG2]

    def get_samples(self):
        """Get samples."""
        try:
            samples = self.data_iter.next()
        except StopIteration:
            del self.data_iter
            self.data_iter = iter(self.dataset_loader)
            samples = self.data_iter.next()
        except AttributeError:
            self.data_iter = iter(self.dataset_loader)
            samples = self.data_iter.next()
        return samples

    def get_real_samples(self):
        """Get a real sample."""
        # Define the camera poses
        if not self.opt.same_view:
            if self.opt.full_sphere_sampling:
                self.cam_pos = uniform_sample_sphere(
                    radius=self.opt.cam_dist,
                    num_samples=self.opt.batchSize,
                    axis=self.opt.axis,
                    angle=np.deg2rad(self.opt.angle),
                    theta_range=self.opt.theta,
                    phi_range=self.opt.phi)
            else:
                self.cam_pos = uniform_sample_sphere(
                    radius=self.opt.cam_dist,
                    num_samples=self.opt.batchSize,
                    axis=self.opt.axis,
                    angle=self.opt.angle,
                    theta_range=np.deg2rad(self.opt.theta),
                    phi_range=np.deg2rad(self.opt.phi))
        if self.opt.full_sphere_sampling_light:
            self.light_pos1 = uniform_sample_sphere(
                radius=self.opt.cam_dist,
                num_samples=self.opt.batchSize,
                axis=self.opt.axis,
                angle=np.deg2rad(44),
                theta_range=self.opt.theta,
                phi_range=self.opt.phi)
            # self.light_pos2 = uniform_sample_sphere(radius=self.opt.cam_dist, num_samples=self.opt.batchSize,
            #                                      axis=self.opt.axis, angle=np.deg2rad(40),
            #                                      theta_range=self.opt.theta, phi_range=self.opt.phi)
        else:
            print("inbox")
            light_eps = 0.15
            self.light_pos1 = np.random.rand(self.opt.batchSize,
                                             3) * self.opt.cam_dist + light_eps
            self.light_pos2 = np.random.rand(self.opt.batchSize,
                                             3) * self.opt.cam_dist + light_eps

            # TODO: deg2rad in all the angles????

        # Create a splats rendering scene
        large_scene = create_scene(self.opt.width, self.opt.height,
                                   self.opt.fovy, self.opt.focal_length,
                                   self.opt.n_splats)
        lookat = self.opt.at if self.opt.at is not None else [
            0.0, 0.0, 0.0, 1.0
        ]
        large_scene['camera']['at'] = tch_var_f(lookat)

        # Render scenes
        data, data_depth, data_normal, data_cond = [], [], [], []
        inpath = self.opt.vis_images + '/'
        inpath2 = self.opt.vis_input + '/'
        for idx in range(self.opt.batchSize):
            # Save the splats into the rendering scene
            if self.opt.use_mesh:
                if 'sphere' in large_scene['objects']:
                    del large_scene['objects']['sphere']
                if 'disk' in large_scene['objects']:
                    del large_scene['objects']['disk']
                if 'triangle' not in large_scene['objects']:
                    large_scene['objects'] = {
                        'triangle': {
                            'face': None,
                            'normal': None,
                            'material_idx': None
                        }
                    }
                samples = self.get_samples()

                large_scene['objects']['triangle']['material_idx'] = tch_var_l(
                    np.zeros(samples['mesh']['face'][0].shape[0],
                             dtype=int).tolist())
                large_scene['objects']['triangle']['face'] = Variable(
                    samples['mesh']['face'][0].cuda(), requires_grad=False)
                large_scene['objects']['triangle']['normal'] = Variable(
                    samples['mesh']['normal'][0].cuda(), requires_grad=False)
            else:
                if 'sphere' in large_scene['objects']:
                    del large_scene['objects']['sphere']
                if 'triangle' in large_scene['objects']:
                    del large_scene['objects']['triangle']
                if 'disk' not in large_scene['objects']:
                    large_scene['objects'] = {
                        'disk': {
                            'pos': None,
                            'normal': None,
                            'material_idx': None
                        }
                    }
                large_scene['objects']['disk']['radius'] = tch_var_f(
                    np.ones(self.opt.n_splats) * self.opt.splats_radius)
                large_scene['objects']['disk']['material_idx'] = tch_var_l(
                    np.zeros(self.opt.n_splats, dtype=int).tolist())
                large_scene['objects']['disk']['pos'] = Variable(
                    samples['splats']['pos'][idx].cuda(), requires_grad=False)
                large_scene['objects']['disk']['normal'] = Variable(
                    samples['splats']['normal'][idx].cuda(),
                    requires_grad=False)

            # Set camera position
            if not self.opt.same_view:
                large_scene['camera']['eye'] = tch_var_f(self.cam_pos[idx])
            else:
                large_scene['camera']['eye'] = tch_var_f(self.cam_pos[0])

            large_scene['lights']['pos'][0, :3] = tch_var_f(
                self.light_pos1[idx])
            #large_scene['lights']['pos'][1,:3]=tch_var_f(self.light_pos2[idx])

            # Render scene
            res = render(large_scene,
                         norm_depth_image_only=self.opt.norm_depth_image_only,
                         double_sided=True,
                         use_quartic=self.opt.use_quartic)

            # Get rendered output
            if self.opt.render_img_nc == 1:
                depth = res['depth']
                im_d = depth.unsqueeze(0)
            else:
                depth = res['depth']
                im_d = depth.unsqueeze(0)
                im = res['image'].permute(2, 0, 1)
                im_ = get_data(res['image'])
                #im_img_ = get_normalmap_image(im_)
                target_normal_ = get_data(res['normal'])
                target_normalmap_img_ = get_normalmap_image(target_normal_)
                im_n = tch_var_f(target_normalmap_img_).view(
                    im.shape[1], im.shape[2], 3).permute(2, 0, 1)

            # Add depth image to the output structure
            file_name = inpath2 + str(self.iterationa_no) + "_" + str(
                self.critic_iter) + 'input_{:05d}.txt'.format(idx)
            text_file = open(file_name, "w")
            text_file.write('%s\n' % (str(large_scene['camera']['eye'].data)))
            text_file.close()
            out_file_name = inpath2 + str(self.iterationa_no) + "_" + str(
                self.critic_iter) + 'input_{:05d}.npy'.format(idx)
            np.save(out_file_name, self.cam_pos[idx])
            out_file_name2 = inpath2 + str(self.iterationa_no) + "_" + str(
                self.critic_iter) + 'input_light{:05d}.npy'.format(idx)
            np.save(out_file_name2, self.light_pos1[idx])
            out_file_name3 = inpath2 + str(self.iterationa_no) + "_" + str(
                self.critic_iter) + 'input_im{:05d}.npy'.format(idx)
            np.save(out_file_name3, get_data(res['image']))
            out_file_name4 = inpath2 + str(self.iterationa_no) + "_" + str(
                self.critic_iter) + 'input_depth{:05d}.npy'.format(idx)
            np.save(out_file_name4, get_data(res['depth']))
            out_file_name5 = inpath2 + str(self.iterationa_no) + "_" + str(
                self.critic_iter) + 'input_normal{:05d}.npy'.format(idx)
            np.save(out_file_name5, get_data(res['normal']))

            if self.iterationa_no % (self.opt.save_image_interval * 5) == 0:
                imsave((inpath + str(self.iterationa_no) +
                        'real_normalmap_{:05d}.png'.format(idx)),
                       target_normalmap_img_)
                imsave((inpath + str(self.iterationa_no) +
                        'real_depth_{:05d}.png'.format(idx)), get_data(depth))
                # imsave(inpath + str(self.iterationa_no) + 'real_depthmap_{:05d}.png'.format(idx), im_d)
                # imsave(inpath + str(self.iterationa_no) + 'world_normalmap_{:05d}.png'.format(idx), target_worldnormalmap_img_)
            data.append(im)
            data_depth.append(im_d)
            data_normal.append(im_n)
            data_cond.append(large_scene['camera']['eye'])
        # Stack real samples
        real_samples = torch.stack(data)
        real_samples_depth = torch.stack(data_depth)
        real_samples_normal = torch.stack(data_normal)
        real_samples_cond = torch.stack(data_cond)
        self.batch_size = real_samples.size(0)
        if not self.opt.no_cuda:
            real_samples = real_samples.cuda()
            real_samples_depth = real_samples_depth.cuda()
            real_samples_normal = real_samples_normal.cuda()
            real_samples_cond = real_samples_cond.cuda()

        # Set input/output variables

        self.input.resize_as_(real_samples.data).copy_(real_samples.data)
        self.input_depth.resize_as_(real_samples_depth.data).copy_(
            real_samples_depth.data)
        self.input_normal.resize_as_(real_samples_normal.data).copy_(
            real_samples_normal.data)
        self.input_cond.resize_as_(real_samples_cond.data).copy_(
            real_samples_cond.data)
        self.label.resize_(self.batch_size).fill_(self.real_label)
        # TODO: Remove Variables
        self.inputv = Variable(self.input)
        self.inputv_depth = Variable(self.input_depth)
        self.inputv_normal = Variable(self.input_normal)
        self.inputv_cond = Variable(self.input_cond)
        self.labelv = Variable(self.label)

    def generate_noise_vector(self, ):
        """Draw a fresh batch of latent vectors from N(0, 1).

        Reuses the persistent ``self.noise`` buffer, reshaped to
        ``(batch_size, nz, 1, 1)``, and exposes it as ``self.noisev``.
        """
        latent_dim = int(self.opt.nz)
        self.noise.resize_(self.batch_size, latent_dim, 1, 1).normal_(0, 1)
        # Wrap for autograd; Variable kept for legacy-PyTorch parity.
        self.noisev = Variable(self.noise)  # TODO: Add volatile=True???

    def generate_normals(self, z_batch, cam_pos, camera):
        """Predict per-pixel normals from a batch of depth maps.

        For each (depth, eye) pair the depth map is back-projected to a
        camera-centred point cloud, fed through ``netG2``, and the result
        reshaped to a flat list of 3-vectors.
        """
        width, height = camera['viewport'][2:]
        predictions = []
        for depth, eye in zip(z_batch, cam_pos):
            camera['eye'] = eye
            # Back-project depth to a point cloud in camera coordinates.
            pcl = z_to_pcl_CC(depth.squeeze(), camera)
            # netG2 expects a (1, 3, H, W) "image" of 3D points.
            pred = self.netG2(
                pcl.view(height, width, 3).permute(2, 0, 1)[np.newaxis, ...])
            predictions.append(
                pred.squeeze().permute(1, 2, 0).view(-1, 3).contiguous())
        return torch.stack(predictions)

    def tensorboard_pos_hook(self, grad):
        """Backward hook: log statistics of the splat-position gradient.

        Writes, at the current iteration: the per-pixel gradient L2-norm
        as an image, per-channel mean-|grad| scalars, the overall mean,
        and per-channel / norm histograms.
        """
        # Hoist the gradient L2-norm; the original recomputed it twice.
        grad_norm = torch.sqrt(torch.sum(grad**2, dim=-1))
        self.writer.add_image("position_gradient_im", grad_norm,
                              self.iterationa_no)
        # BUG FIX: channel 1's tag was "position_mean_channel1", breaking the
        # "position_gradient_mean_channelN" convention of channels 2 and 3.
        for ch in range(3):
            self.writer.add_scalar(
                "position_gradient_mean_channel{}".format(ch + 1),
                get_data(torch.mean(torch.abs(grad[:, :, ch]))),
                self.iterationa_no)
        self.writer.add_scalar("position_gradient_mean",
                               get_data(torch.mean(grad)), self.iterationa_no)
        for ch in range(3):
            self.writer.add_histogram(
                "position_gradient_hist_channel{}".format(ch + 1),
                grad[:, :, ch].clone().cpu().data.numpy(),
                self.iterationa_no)
        self.writer.add_histogram(
            "position_gradient_hist_norm",
            grad_norm.clone().cpu().data.numpy(),
            self.iterationa_no)

    def tensorboard_normal_hook(self, grad):
        """Backward hook: log statistics of the normal-map gradient.

        Writes the per-pixel gradient L2-norm as an image, per-channel
        mean-|grad| scalars, the overall mean, and per-channel / norm
        histograms, all stamped with the current iteration.
        """
        norm_image = torch.sqrt(torch.sum(grad**2, dim=-1))
        self.writer.add_image("normal_gradient_im", norm_image,
                              self.iterationa_no)
        scalar_tags = ("normal_gradient_mean_channel1",
                       "normal_gradient_mean_channel2",
                       "normal_gradient_mean_channel3")
        for ch, tag in enumerate(scalar_tags):
            self.writer.add_scalar(
                tag, get_data(torch.mean(torch.abs(grad[:, :, ch]))),
                self.iterationa_no)
        self.writer.add_scalar("normal_gradient_mean",
                               get_data(torch.mean(grad)), self.iterationa_no)
        hist_tags = ("normal_gradient_hist_channel1",
                     "normal_gradient_hist_channel2",
                     "normal_gradient_hist_channel3")
        for ch, tag in enumerate(hist_tags):
            self.writer.add_histogram(
                tag, grad[:, :, ch].clone().cpu().data.numpy(),
                self.iterationa_no)
        self.writer.add_histogram(
            "normal_gradient_hist_norm",
            norm_image.clone().cpu().data.numpy(),
            self.iterationa_no)

    def tensorboard_z_hook(self, grad):
        """Backward hook: log mean, histogram, and image of the z gradient."""
        mean_abs_grad = torch.mean(torch.abs(grad))
        self.writer.add_scalar("z_gradient_mean", get_data(mean_abs_grad),
                               self.iterationa_no)
        self.writer.add_histogram("z_gradient_hist_channel",
                                  grad.clone().cpu().data.numpy(),
                                  self.iterationa_no)
        self.writer.add_image("z_gradient_im", grad, self.iterationa_no)

    def tensorboard_hook(self, grad):
        """Backward hook: log stats for the first sample's z gradient.

        Only ``grad[0]`` is logged; the image view assumes a square
        ``splats_img_size`` x ``splats_img_size`` gradient.
        """
        first_grad = grad[0]
        self.writer.add_scalar("z_gradient_mean",
                               get_data(torch.mean(first_grad)),
                               self.iterationa_no)
        self.writer.add_histogram("z_gradient_hist_channel",
                                  first_grad.clone().cpu().data.numpy(),
                                  self.iterationa_no)
        side = self.opt.splats_img_size
        self.writer.add_image("z_gradient_im", first_grad.view(side, side),
                              self.iterationa_no)

    def train(self, ):
        """Train the network.

        Optionally restores all four sub-networks from checkpoints, then
        runs the outer iteration loop, training the discriminator
        ``opt.critic_iters`` times per iteration on real samples.
        """
        # Load pretrained models if required.
        # BUG FIX: the original passed `open(path, 'rb')` straight into
        # torch.load, leaking four file handles; use context managers.
        if self.opt.gen_model_path is not None:
            print("Reloading networks from")
            print(' > Generator', self.opt.gen_model_path)
            with open(self.opt.gen_model_path, 'rb') as f:
                self.netG.load_state_dict(torch.load(f))
            print(' > Generator2', self.opt.gen_model_path2)
            with open(self.opt.gen_model_path2, 'rb') as f:
                self.netG2.load_state_dict(torch.load(f))
            print(' > Discriminator', self.opt.dis_model_path)
            with open(self.opt.dis_model_path, 'rb') as f:
                self.netD.load_state_dict(torch.load(f))
            print(' > Discriminator2', self.opt.dis_model_path2)
            with open(self.opt.dis_model_path2, 'rb') as f:
                self.netD2.load_state_dict(torch.load(f))

        # Start training; the L2 log file is created even if never written.
        file_name = os.path.join(self.opt.out_dir, 'L2.txt')
        with open(file_name, 'wt') as l2_file:
            curr_generator_idx = 0
            for iteration in range(self.opt.n_iter):
                self.iterationa_no = iteration
                self.critic_iter = 0
                # Train the discriminator critic_iters times.
                for j in range(self.opt.critic_iters):
                    # Train with real samples.
                    self.in_critic = 1
                    self.get_real_samples()
                    self.critic_iter += 1

    def save_networks(self, epoch):
        """Save all four sub-networks' state dicts to ``opt.out_dir``.

        Filenames are tagged with the network name and epoch number,
        e.g. ``netG_epoch_5.pth``.
        """
        checkpoints = (('netG', self.netG), ('netG2', self.netG2),
                       ('netD', self.netD), ('netD2', self.netD2))
        for tag, net in checkpoints:
            target = '%s/%s_epoch_%d.pth' % (self.opt.out_dir, tag, epoch)
            torch.save(net.state_dict(), target)

    def save_images(self, epoch, input, output):
        """Write this epoch's input/output image pair as PNGs.

        Images are assumed to hold values in [0, 1] and are scaled to
        uint8.  Multi-channel tensors are transposed CHW -> HWC for the
        image writer.
        """
        out_dir = self.opt.out_dir
        if self.opt.render_img_nc == 1:
            # Single channel: drop singleton dims before scaling.
            in_img = np.uint8(255. * input.cpu().data.numpy().squeeze())
            out_img = np.uint8(255. * output.cpu().data.numpy().squeeze())
            imsave(out_dir + '/input2' + str(epoch) + '.png', in_img)
            imsave(out_dir + '/fz' + str(epoch) + '.png', out_img)
        else:
            in_img = np.uint8(
                255. * input.cpu().data.numpy().transpose((1, 2, 0)))
            out_img = np.uint8(
                255. * output.cpu().data.numpy().transpose((1, 2, 0)))
            imsave(out_dir + '/input2' + str(epoch) + '.png', in_img)
            imsave(out_dir + '/output2' + str(epoch) + '.png', out_img)
Exemplo n.º 29
0
        writer.add_image('Image', x, n_iter)

        dummy_audio = torch.zeros(sample_rate * 2)
        for i in range(x.size(0)):
            # amplitude of sound should in [-1, 1]
            dummy_audio[i] = np.cos(freqs[n_iter // 10] * np.pi * float(i) /
                                    float(sample_rate))
        writer.add_audio('myAudio',
                         dummy_audio,
                         n_iter,
                         sample_rate=sample_rate)

        writer.add_text('Text', 'text logged at step:' + str(n_iter), n_iter)

        for name, param in resnet18.named_parameters():
            writer.add_histogram(name,
                                 param.clone().cpu().data.numpy(), n_iter)

        # needs tensorboard 0.4RC or later
        writer.add_pr_curve('xoxo', np.random.randint(2, size=100),
                            np.random.rand(100), n_iter)

# Build a 100-sample MNIST embedding for the Tensorboard projector.
# NOTE(review): `datasets` and `writer` are defined earlier in the original
# script (torchvision.datasets and a SummaryWriter) — confirm against caller.
dataset = datasets.MNIST('mnist', train=False, download=True)
# test_data/test_labels are the legacy torchvision accessors for raw tensors.
images = dataset.test_data[:100].float()
label = dataset.test_labels[:100]

# Flatten 28x28 images into 784-d feature vectors; label_img supplies the
# per-point thumbnail shown in the projector (needs an explicit channel dim).
features = images.view(100, 784)
writer.add_embedding(features, metadata=label, label_img=images.unsqueeze(1))

# export scalar data to JSON for external processing
writer.export_scalars_to_json("./all_scalars.json")
writer.close()
Exemplo n.º 30
0
class UNet3DTrainer:
    """3D UNet trainer.

    Args:
        model (Unet3D): UNet 3D model to be trained
        optimizer (nn.optim.Optimizer): optimizer used for training
        lr_scheduler (torch.optim.lr_scheduler._LRScheduler): learning rate scheduler
            WARN: bear in mind that lr_scheduler.step() is invoked after every validation step
            (i.e. validate_after_iters) not after every epoch. So e.g. if one uses StepLR with step_size=30
            the learning rate will be adjusted after every 30 * validate_after_iters iterations.
        loss_criterion (callable): loss function
        eval_criterion (callable): used to compute training/validation metric (such as Dice, IoU, AP or Rand score)
            saving the best checkpoint is based on the result of this function on the validation set
        device (torch.device): device to train on
        loaders (dict): 'train' and 'val' loaders
        checkpoint_dir (string): dir for saving checkpoints and tensorboard logs
        max_num_epochs (int): maximum number of epochs
        max_num_iterations (int): maximum number of iterations
        validate_after_iters (int): validate after that many iterations
        log_after_iters (int): number of iterations before logging to tensorboard
        validate_iters (int): number of validation iterations, if None validate
            on the whole validation set
        eval_score_higher_is_better (bool): if True higher eval scores are considered better
        best_eval_score (float): best validation score so far (higher better)
        num_iterations (int): useful when loading the model from the checkpoint
        num_epoch (int): useful when loading the model from the checkpoint
    """
    def __init__(self,
                 model,
                 optimizer,
                 lr_scheduler,
                 loss_criterion,
                 eval_criterion,
                 device,
                 loaders,
                 checkpoint_dir,
                 max_num_epochs=100,
                 max_num_iterations=1e5,
                 validate_after_iters=100,
                 log_after_iters=100,
                 validate_iters=None,
                 num_iterations=1,
                 num_epoch=0,
                 eval_score_higher_is_better=True,
                 best_eval_score=None,
                 logger=None):
        if logger is None:
            self.logger = utils.get_logger('UNet3DTrainer',
                                           level=logging.DEBUG)
        else:
            self.logger = logger

        self.logger.info(model)
        self.model = model
        self.optimizer = optimizer
        self.scheduler = lr_scheduler
        self.loss_criterion = loss_criterion
        self.eval_criterion = eval_criterion
        self.device = device
        self.loaders = loaders
        self.checkpoint_dir = checkpoint_dir
        self.max_num_epochs = max_num_epochs
        self.max_num_iterations = max_num_iterations
        self.validate_after_iters = validate_after_iters
        self.log_after_iters = log_after_iters
        self.validate_iters = validate_iters
        self.eval_score_higher_is_better = eval_score_higher_is_better
        # BUG FIX: was `logger.info(...)`, which raises AttributeError when
        # the default logger=None is used; always log via self.logger.
        self.logger.info(
            f'eval_score_higher_is_better: {eval_score_higher_is_better}')

        if best_eval_score is not None:
            self.best_eval_score = best_eval_score
        else:
            # initialize the best_eval_score with the worst possible value
            if eval_score_higher_is_better:
                self.best_eval_score = float('-inf')
            else:
                self.best_eval_score = float('+inf')

        self.writer = SummaryWriter(
            log_dir=os.path.join(checkpoint_dir, 'logs'))

        self.num_iterations = num_iterations
        self.num_epoch = num_epoch

    @classmethod
    def from_checkpoint(cls,
                        checkpoint_path,
                        model,
                        optimizer,
                        lr_scheduler,
                        loss_criterion,
                        eval_criterion,
                        loaders,
                        logger=None):
        """Alternate constructor: resume training state from a checkpoint."""
        # Guard: logger defaults to None, so don't call it unconditionally.
        if logger is not None:
            logger.info(f"Loading checkpoint '{checkpoint_path}'...")
        state = utils.load_checkpoint(checkpoint_path, model, optimizer)
        if logger is not None:
            logger.info(
                f"Checkpoint loaded. Epoch: {state['epoch']}. Best val score: {state['best_eval_score']}. Num_iterations: {state['num_iterations']}"
            )
        # Checkpoints live directly inside the checkpoint dir.
        checkpoint_dir = os.path.split(checkpoint_path)[0]
        return cls(
            model,
            optimizer,
            lr_scheduler,
            loss_criterion,
            eval_criterion,
            torch.device(state['device']),
            loaders,
            checkpoint_dir,
            eval_score_higher_is_better=state['eval_score_higher_is_better'],
            best_eval_score=state['best_eval_score'],
            num_iterations=state['num_iterations'],
            num_epoch=state['epoch'],
            max_num_epochs=state['max_num_epochs'],
            max_num_iterations=state['max_num_iterations'],
            validate_after_iters=state['validate_after_iters'],
            log_after_iters=state['log_after_iters'],
            validate_iters=state['validate_iters'],
            logger=logger)

    @classmethod
    def from_pretrained(cls,
                        pre_trained,
                        model,
                        optimizer,
                        lr_scheduler,
                        loss_criterion,
                        eval_criterion,
                        device,
                        loaders,
                        max_num_epochs=100,
                        max_num_iterations=1e5,
                        validate_after_iters=100,
                        log_after_iters=100,
                        validate_iters=None,
                        num_iterations=1,
                        num_epoch=0,
                        eval_score_higher_is_better=True,
                        best_eval_score=None,
                        logger=None):
        """Alternate constructor: load only model weights, fresh trainer state."""
        # Guard: logger defaults to None, so don't call it unconditionally.
        if logger is not None:
            logger.info(f"Logging pre-trained model from '{pre_trained}'...")
        # optimizer=None: only the model weights are restored.
        utils.load_checkpoint(pre_trained, model, None)
        checkpoint_dir = os.path.split(pre_trained)[0]
        return cls(model,
                   optimizer,
                   lr_scheduler,
                   loss_criterion,
                   eval_criterion,
                   device,
                   loaders,
                   checkpoint_dir,
                   eval_score_higher_is_better=eval_score_higher_is_better,
                   best_eval_score=best_eval_score,
                   num_iterations=num_iterations,
                   num_epoch=num_epoch,
                   max_num_epochs=max_num_epochs,
                   max_num_iterations=max_num_iterations,
                   validate_after_iters=validate_after_iters,
                   log_after_iters=log_after_iters,
                   validate_iters=validate_iters,
                   logger=logger)

    def fit(self):
        """Run training epochs until max_num_epochs or early termination."""
        for _ in range(self.num_epoch, self.max_num_epochs):
            # train for one epoch
            should_terminate = self.train(self.loaders['train'])

            if should_terminate:
                break

            self.num_epoch += 1

    def train(self, train_loader):
        """Trains the model for 1 epoch.

        Args:
            train_loader (torch.utils.data.DataLoader): training data loader

        Returns:
            True if the training should be terminated immediately, False otherwise
        """
        train_losses = utils.RunningAverage()
        train_eval_scores = utils.RunningAverage()

        # sets the model in training mode
        self.model.train()

        for i, t in enumerate(train_loader):
            self.logger.info(
                f'Training iteration {self.num_iterations}. Batch {i}. Epoch [{self.num_epoch}/{self.max_num_epochs - 1}]'
            )

            input, target, weight = self._split_training_batch(t)

            output, loss = self._forward_pass(input, target, weight)

            train_losses.update(loss.item(), self._batch_size(input))

            # compute gradients and update parameters
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            if self.num_iterations % self.validate_after_iters == 0:
                # evaluate on validation set
                eval_score = self.validate(self.loaders['val'])
                # adjust learning rate if necessary; ReduceLROnPlateau needs
                # the metric, other schedulers step unconditionally
                if isinstance(self.scheduler, ReduceLROnPlateau):
                    self.scheduler.step(eval_score)
                else:
                    self.scheduler.step()
                # log current learning rate in tensorboard
                self._log_lr()
                # remember best validation metric
                is_best = self._is_best_eval_score(eval_score)

                # save checkpoint
                self._save_checkpoint(is_best)

            if self.num_iterations % self.log_after_iters == 0:
                # if model contains final_activation layer for normalizing logits apply it, otherwise both
                # the evaluation metric as well as images in tensorboard will be incorrectly computed
                if hasattr(self.model, 'final_activation'):
                    output = self.model.final_activation(output)

                # compute eval criterion
                eval_score = self.eval_criterion(output, target)
                train_eval_scores.update(eval_score.item(),
                                         self._batch_size(input))

                # log stats, params and images
                self.logger.info(
                    f'Training stats. Loss: {train_losses.avg}. Evaluation score: {train_eval_scores.avg}'
                )
                self._log_stats('train', train_losses.avg,
                                train_eval_scores.avg)
                self._log_params()
                self._log_images(input, target, output)

            if self.max_num_iterations < self.num_iterations:
                self.logger.info(
                    f'Maximum number of iterations {self.max_num_iterations} exceeded. Finishing training...'
                )
                return True

            self.num_iterations += 1

        return False

    def validate(self, val_loader):
        """Run validation and return the average evaluation score."""
        self.logger.info('Validating...')

        val_losses = utils.RunningAverage()
        val_scores = utils.RunningAverage()

        try:
            # set the model in evaluation mode; final_activation doesn't need to be called explicitly
            self.model.eval()
            with torch.no_grad():
                for i, t in enumerate(val_loader):
                    self.logger.info(f'Validation iteration {i}')

                    input, target, weight = self._split_training_batch(t)

                    output, loss = self._forward_pass(input, target, weight)
                    val_losses.update(loss.item(), self._batch_size(input))

                    eval_score = self.eval_criterion(output, target)
                    val_scores.update(eval_score.item(),
                                      self._batch_size(input))

                    if self.validate_iters is not None and self.validate_iters <= i:
                        # stop validation
                        break

                self._log_stats('val', val_losses.avg, val_scores.avg)
                self.logger.info(
                    f'Validation finished. Loss: {val_losses.avg}. Evaluation score: {val_scores.avg}'
                )
                return val_scores.avg
        finally:
            # set back in training mode
            self.model.train()

    def _split_training_batch(self, t):
        """Move a loader batch to self.device and unpack it.

        Returns (input, target, weight); weight is None for 2-tuples.
        """
        def _move_to_device(input):
            # Recursively move nested tuples/lists of tensors.
            if isinstance(input, tuple) or isinstance(input, list):
                return tuple([_move_to_device(x) for x in input])
            else:
                return input.to(self.device)

        t = _move_to_device(t)
        weight = None
        if len(t) == 2:
            input, target = t
        else:
            input, target, weight = t
        return input, target, weight

    def _forward_pass(self, input, target, weight=None):
        """Run the model and compute the (optionally weighted) loss."""
        # forward pass
        output = self.model(input)

        # compute the loss
        if weight is None:
            loss = self.loss_criterion(output, target)
        else:
            loss = self.loss_criterion(output, target, weight)

        return output, loss

    def _is_best_eval_score(self, eval_score):
        """Update and report whether eval_score beats the best so far."""
        if self.eval_score_higher_is_better:
            is_best = eval_score > self.best_eval_score
        else:
            is_best = eval_score < self.best_eval_score

        if is_best:
            self.logger.info(
                f'Saving new best evaluation metric: {eval_score}')
            self.best_eval_score = eval_score

        return is_best

    def _save_checkpoint(self, is_best):
        """Persist full trainer state; marks the best checkpoint separately."""
        utils.save_checkpoint(
            {
                'epoch': self.num_epoch + 1,
                'num_iterations': self.num_iterations,
                'model_state_dict': self.model.state_dict(),
                'best_eval_score': self.best_eval_score,
                'eval_score_higher_is_better':
                self.eval_score_higher_is_better,
                'optimizer_state_dict': self.optimizer.state_dict(),
                'device': str(self.device),
                'max_num_epochs': self.max_num_epochs,
                'max_num_iterations': self.max_num_iterations,
                'validate_after_iters': self.validate_after_iters,
                'log_after_iters': self.log_after_iters,
                'validate_iters': self.validate_iters
            },
            is_best,
            checkpoint_dir=self.checkpoint_dir,
            logger=self.logger)

    def _log_lr(self):
        """Log the current learning rate of the first param group."""
        lr = self.optimizer.param_groups[0]['lr']
        self.writer.add_scalar('learning_rate', lr, self.num_iterations)

    def _log_stats(self, phase, loss_avg, eval_score_avg):
        """Log averaged loss and eval score for a phase ('train'/'val')."""
        tag_value = {
            f'{phase}_loss_avg': loss_avg,
            f'{phase}_eval_score_avg': eval_score_avg
        }

        for tag, value in tag_value.items():
            self.writer.add_scalar(tag, value, self.num_iterations)

    def _log_params(self):
        """Log histograms of all parameters and their gradients."""
        self.logger.info('Logging model parameters and gradients')
        for name, value in self.model.named_parameters():
            self.writer.add_histogram(name,
                                      value.data.cpu().numpy(),
                                      self.num_iterations)
            self.writer.add_histogram(name + '/grad',
                                      value.grad.data.cpu().numpy(),
                                      self.num_iterations)

    def _log_images(self, input, target, prediction):
        """Log middle-slice images of inputs, targets and predictions."""
        inputs_map = {
            'inputs': input,
            'targets': target,
            'predictions': prediction
        }
        img_sources = {}
        for name, batch in inputs_map.items():
            if isinstance(batch, list) or isinstance(batch, tuple):
                for i, b in enumerate(batch):
                    img_sources[f'{name}{i}'] = b.data.cpu().numpy()
            else:
                img_sources[name] = batch.data.cpu().numpy()

        for name, batch in img_sources.items():
            for tag, image in self._images_from_batch(name, batch):
                self.writer.add_image(tag,
                                      image,
                                      self.num_iterations,
                                      dataformats='HW')

    def _images_from_batch(self, name, batch):
        """Extract (tag, normalized middle-slice image) pairs from a batch.

        Handles both NCDHW (5-dim) and NDHW (4-dim, no channel) batches.
        """
        tag_template = '{}/batch_{}/channel_{}/slice_{}'

        tagged_images = []

        if batch.ndim == 5:
            # NCDHW
            slice_idx = batch.shape[2] // 2  # get the middle slice
            for batch_idx in range(batch.shape[0]):
                for channel_idx in range(batch.shape[1]):
                    tag = tag_template.format(name, batch_idx, channel_idx,
                                              slice_idx)
                    img = batch[batch_idx, channel_idx, slice_idx, ...]
                    tagged_images.append((tag, self._normalize_img(img)))
        else:
            # batch has no channel dim: NDHW
            slice_idx = batch.shape[1] // 2  # get the middle slice
            for batch_idx in range(batch.shape[0]):
                tag = tag_template.format(name, batch_idx, 0, slice_idx)
                img = batch[batch_idx, slice_idx, ...]
                tagged_images.append((tag, self._normalize_img(img)))

        return tagged_images

    @staticmethod
    def _normalize_img(img):
        """Min-max normalize an image to [0, 1].

        ROBUSTNESS FIX: a constant image has np.ptp == 0 and previously
        produced NaNs via division by zero; return all-zeros instead.
        """
        span = np.ptp(img)
        if span == 0:
            return np.zeros_like(img, dtype=np.float64)
        return (img - np.min(img)) / span

    @staticmethod
    def _batch_size(input):
        """Return batch size; for multi-input tuples use the first tensor."""
        if isinstance(input, list) or isinstance(input, tuple):
            return input[0].size(0)
        else:
            return input.size(0)
Exemplo n.º 31
0
class TensorboardWriter(FromParams):
    """
    Class that handles Tensorboard (and other) logging.

    Parameters
    ----------
    get_batch_num_total : Callable[[], int]
        A thunk that returns the number of batches so far. Most likely this will
        be a closure around an instance variable in your ``Trainer`` class.
    serialization_dir : str, optional (default = None)
        If provided, this is where the Tensorboard logs will be written.
    summary_interval : int, optional (default = 100)
        Most statistics will be written out only every this many batches.
    histogram_interval : int, optional (default = None)
        If provided, activation histograms will be written out every this many batches.
        If None, activation histograms will not be written out.
    should_log_parameter_statistics : bool, optional (default = True)
        Whether to log parameter statistics.
    should_log_learning_rate : bool, optional (default = False)
        Whether to log learning rate.
    """
    def __init__(self,
                 get_batch_num_total: Callable[[], int],
                 serialization_dir: Optional[str] = None,
                 summary_interval: int = 100,
                 histogram_interval: Optional[int] = None,
                 should_log_parameter_statistics: bool = True,
                 should_log_learning_rate: bool = False) -> None:
        if serialization_dir is not None:
            # Separate writers so train and validation curves show up as two
            # runs that tensorboard can overlay on the same chart.
            self._train_log = SummaryWriter(os.path.join(serialization_dir, "log", "train"))
            self._validation_log = SummaryWriter(os.path.join(serialization_dir, "log", "validation"))
        else:
            # Logging disabled: every add_* method below no-ops.
            self._train_log = self._validation_log = None

        self._summary_interval = summary_interval
        self._histogram_interval = histogram_interval
        self._should_log_parameter_statistics = should_log_parameter_statistics
        self._should_log_learning_rate = should_log_learning_rate
        self._get_batch_num_total = get_batch_num_total

    @staticmethod
    def _item(value: Any):
        """Convert a 0-dim tensor (anything exposing ``.item()``) to a plain scalar."""
        if hasattr(value, 'item'):
            val = value.item()
        else:
            val = value
        return val

    def should_log_this_batch(self) -> bool:
        """Return True on every ``summary_interval``-th batch."""
        return self._get_batch_num_total() % self._summary_interval == 0

    def should_log_histograms_this_batch(self) -> bool:
        """Return True on every ``histogram_interval``-th batch (never when disabled)."""
        return self._histogram_interval is not None and self._get_batch_num_total() % self._histogram_interval == 0

    def add_train_scalar(self, name: str, value: float, timestep: Optional[int] = None) -> None:
        """Write ``value`` to the train log at ``timestep`` (batch count by default)."""
        # Check against None rather than using ``timestep or ...``: a
        # caller-supplied timestep of 0 (e.g. epoch 0 from log_metrics) is
        # falsy and was previously replaced by the batch counter.
        if timestep is None:
            timestep = self._get_batch_num_total()
        # get the scalar
        if self._train_log is not None:
            self._train_log.add_scalar(name, self._item(value), timestep)

    def add_train_histogram(self, name: str, values: torch.Tensor) -> None:
        """Write a histogram of ``values`` to the train log; non-tensors are ignored."""
        if self._train_log is not None:
            if isinstance(values, torch.Tensor):
                values_to_write = values.cpu().data.numpy().flatten()
                self._train_log.add_histogram(name, values_to_write, self._get_batch_num_total())

    def add_graph(self, model, inputs) -> None:
        """Trace ``model`` with ``inputs`` and write the graph to the train log."""
        if self._train_log is not None:
            self._train_log.add_graph(model, inputs)

    def add_validation_scalar(self, name: str, value: float, timestep: Optional[int] = None) -> None:
        """Write ``value`` to the validation log at ``timestep`` (batch count by default)."""
        # Same None check as add_train_scalar: timestep 0 must be honored.
        if timestep is None:
            timestep = self._get_batch_num_total()
        if self._validation_log is not None:
            self._validation_log.add_scalar(name, self._item(value), timestep)

    def log_parameter_and_gradient_statistics(self, # pylint: disable=invalid-name
                                              model: Model,
                                              batch_grad_norm: float) -> None:
        """
        Send the mean and std of all parameters and gradients to tensorboard, as well
        as logging the average gradient norm.
        """
        if self._should_log_parameter_statistics:
            # Log parameter values to Tensorboard
            for name, param in model.named_parameters():
                self.add_train_scalar("parameter_mean/" + name, param.data.mean())
                # std is undefined for a single element; skip it there.
                if param.data.numel() > 1:
                    self.add_train_scalar("parameter_std/" + name, param.data.std())
                if param.grad is not None:
                    if param.grad.is_sparse:
                        # pylint: disable=protected-access
                        grad_data = param.grad.data._values()
                    else:
                        grad_data = param.grad.data

                    # skip empty gradients
                    if torch.prod(torch.tensor(grad_data.shape)).item() > 0: # pylint: disable=not-callable
                        self.add_train_scalar("gradient_mean/" + name, grad_data.mean())
                        if grad_data.numel() > 1:
                            self.add_train_scalar("gradient_std/" + name, grad_data.std())
                    else:
                        # no gradient for a parameter with sparse gradients
                        logger.info("No gradient for %s, skipping tensorboard logging.", name)
            # norm of gradients
            if batch_grad_norm is not None:
                self.add_train_scalar("gradient_norm", batch_grad_norm)

    def log_learning_rates(self,
                           model: Model,
                           optimizer: torch.optim.Optimizer):
        """
        Send current parameter specific learning rates to tensorboard
        """
        if self._should_log_learning_rate:
            # optimizer stores lr info keyed by parameter tensor
            # we want to log with parameter name
            names = {param: name for name, param in model.named_parameters()}
            for group in optimizer.param_groups:
                if 'lr' not in group:
                    continue
                rate = group['lr']
                for param in group['params']:
                    # check whether params has requires grad or not
                    # frozen parameters are reported with an effective rate of 0
                    effective_rate = rate * float(param.requires_grad)
                    self.add_train_scalar("learning_rate/" + names[param], effective_rate)

    def log_histograms(self, model: Model, histogram_parameters: Set[str]) -> None:
        """
        Send histograms of parameters to tensorboard.
        """
        for name, param in model.named_parameters():
            if name in histogram_parameters:
                self.add_train_histogram("parameter_histogram/" + name, param)

    def log_metrics(self,
                    train_metrics: dict,
                    val_metrics: dict = None,
                    epoch: int = None,
                    log_to_console: bool = False) -> None:
        """
        Sends all of the train metrics (and validation metrics, if provided) to tensorboard.
        """
        metric_names = set(train_metrics.keys())
        if val_metrics is not None:
            metric_names.update(val_metrics.keys())
        val_metrics = val_metrics or {}

        # For logging to the console
        if log_to_console:
            dual_message_template = "%s |  %8.3f  |  %8.3f"
            no_val_message_template = "%s |  %8.3f  |  %8s"
            no_train_message_template = "%s |  %8s  |  %8.3f"
            header_template = "%s |  %-10s"
            # default=0 keeps this from raising ValueError when both metric
            # dicts are empty (max() over an empty sequence).
            name_length = max((len(x) for x in metric_names), default=0)
            logger.info(header_template, "Training".rjust(name_length + 13), "Validation")

        for name in metric_names:
            # Log to tensorboard
            train_metric = train_metrics.get(name)
            if train_metric is not None:
                self.add_train_scalar(name, train_metric, timestep=epoch)
            val_metric = val_metrics.get(name)
            if val_metric is not None:
                self.add_validation_scalar(name, val_metric, timestep=epoch)

            # And maybe log to console
            if log_to_console and val_metric is not None and train_metric is not None:
                logger.info(dual_message_template, name.ljust(name_length), train_metric, val_metric)
            elif log_to_console and val_metric is not None:
                logger.info(no_train_message_template, name.ljust(name_length), "N/A", val_metric)
            elif log_to_console and train_metric is not None:
                logger.info(no_val_message_template, name.ljust(name_length), train_metric, "N/A")

    def enable_activation_logging(self, model: Model) -> None:
        """Register forward hooks that log activation histograms on opted-in modules."""
        if self._histogram_interval is not None:
            # To log activation histograms to the forward pass, we register
            # a hook on forward to capture the output tensors.
            # This uses a closure to determine whether to log the activations,
            # since we don't want them on every call.
            for _, module in model.named_modules():
                if not getattr(module, 'should_log_activations', False):
                    # skip it
                    continue

                def hook(module_, inputs, outputs):
                    # pylint: disable=unused-argument,cell-var-from-loop
                    log_prefix = 'activation_histogram/{0}'.format(module_.__class__)
                    if self.should_log_histograms_this_batch():
                        self.log_activation_histogram(outputs, log_prefix)
                module.register_forward_hook(hook)

    def log_activation_histogram(self, outputs, log_prefix: str) -> None:
        """Log histogram(s) for a module's outputs (tensor, list/tuple, or dict)."""
        if isinstance(outputs, torch.Tensor):
            log_name = log_prefix
            self.add_train_histogram(log_name, outputs)
        elif isinstance(outputs, (list, tuple)):
            for i, output in enumerate(outputs):
                log_name = "{0}_{1}".format(log_prefix, i)
                self.add_train_histogram(log_name, output)
        elif isinstance(outputs, dict):
            for k, tensor in outputs.items():
                log_name = "{0}_{1}".format(log_prefix, k)
                self.add_train_histogram(log_name, tensor)
        else:
            # skip it
            pass

    def close(self) -> None:
        """
        Calls the ``close`` method of the ``SummaryWriter`` s which makes sure that pending
        scalars are flushed to disk and the tensorboard event files are closed properly.
        """
        if self._train_log is not None:
            self._train_log.close()
        if self._validation_log is not None:
            self._validation_log.close()
Exemplo n.º 32
0
def train(opts):
    """Train an LPSRNN on paired log-power-spectrum data with early stopping.

    Reads all hyper-parameters from the ``opts`` namespace (lr, loss, dataset,
    batch_size, epoch, patience, save_path, save_freq, maxlen, num_workers,
    dropout, seed, no_cuda). Writes TensorBoard summaries under
    ``opts.save_path/train`` and saves a checkpoint whenever validation loss
    improves; stops when ``opts.patience`` epochs pass without improvement.
    """
    CUDA = torch.cuda.is_available() and not opts.no_cuda
    device = 'cuda' if CUDA else 'cpu'

    # seed initialization: seed every RNG source for reproducibility
    random.seed(opts.seed)
    np.random.seed(opts.seed)
    torch.manual_seed(opts.seed)
    if CUDA:
        torch.cuda.manual_seed_all(opts.seed)

    # model build
    model = LPSRNN(dropout=opts.dropout)
    model.to(device)
    print(model)
    writer = SummaryWriter(os.path.join(opts.save_path,
                                        'train'))
    opt = optim.Adam(model.parameters(), lr=opts.lr)
    if opts.loss == 'l2':
        criterion = nn.MSELoss()
    elif opts.loss == 'l1':
        criterion = nn.L1Loss()
    else:
        raise TypeError('Loss function {} not understood'.format(opts.loss))
    # Train/valid datasets of waveform pairs, transformed to log-power STFTs.
    dset = WavPairDataset(opts.dataset, transform=wav2stft(logpower=True))
    va_dset = WavPairDataset(opts.dataset, split='valid',
                             transform=wav2stft(logpower=True))
    collater = SeqLPSCollater(maxlen=opts.maxlen)
    dloader = DataLoader(dset, batch_size=opts.batch_size,
                         shuffle=True, num_workers=opts.num_workers,
                         collate_fn=collater)
    va_dloader = DataLoader(va_dset, batch_size=opts.batch_size,
                            shuffle=False, num_workers=opts.num_workers,
                            collate_fn=collater)
    timings = []           # per-batch wall-clock times (for the mbtime report)
    global_step = 0        # monotonically increasing batch counter for TensorBoard
    patience = opts.patience
    min_va_loss = np.inf   # best (lowest) mean validation loss seen so far
    for epoch in range(opts.epoch):
        model.train()
        beg_t = timeit.default_timer()
        for bidx, batch in enumerate(dloader, start=1):
            # split into (X, Y) pairs
            # Each tensor packs two components along dim 3; the second chunk
            # (``*_pha``) is discarded — presumably the phase half of the
            # STFT, with only the magnitude trained on. TODO confirm against
            # SeqLPSCollater's output layout.
            lps_x, lps_y = batch
            lps_x, lps_x_pha = torch.chunk(lps_x, 2, dim=3)
            lps_x = lps_x.squeeze(3)
            lps_y, lps_y_pha = torch.chunk(lps_y, 2, dim=3)
            lps_y = lps_y.squeeze(3)
            lps_x = lps_x.to(device)
            lps_y = lps_y.to(device)
            opt.zero_grad()
            # Model also returns its recurrent state; only the prediction is used.
            y_, state = model(lps_x)
            loss = criterion(y_, lps_y)
            loss.backward()
            opt.step()
            end_t = timeit.default_timer()
            timings.append(end_t - beg_t)
            beg_t = timeit.default_timer()
            # Log on every save_freq-th batch and on the last batch of the epoch.
            if bidx % opts.save_freq == 0 or bidx >= len(dloader):
                print('Batch {}/{} (epoch {}) loss: {:.3f} '
                      'btime: {:.3f} s, mbtime: {:.3f}'
                      ''.format(bidx, len(dloader), epoch, loss.item(),
                                timings[-1], np.mean(timings)))
                writer.add_scalar('training/loss', loss.item(), global_step)
                writer.add_histogram('training/lps_x', lps_x.cpu().data,
                                     global_step, bins='sturges')
                writer.add_histogram('training/lps_y', lps_y.cpu().data,
                                     global_step, bins='sturges')
                writer.add_histogram('training/pred_y', y_.cpu().data,
                                     global_step, bins='sturges')
            global_step += 1
        # Validation pass; returns the per-batch losses for this epoch.
        va_losses = eval_epoch(va_dloader, model, criterion, epoch, writer,
                               opts.save_freq, device)
        mva_loss = np.mean(va_losses)
        if min_va_loss > mva_loss:
            print('Val loss improved {:.3f} --> {:.3f}'.format(min_va_loss,
                                                               mva_loss))
            min_va_loss = mva_loss
            # Checkpoint only on improvement; filename encodes the epoch.
            torch.save(model.state_dict(),
                       os.path.join(opts.save_path,
                                    'model-e{}.ckpt'.format(epoch)))
            patience = opts.patience
        else:
            patience -= 1
            print('Val loss did not improve. Curr patience'
                  '{}/{}'.format(patience, opts.patience))
            if patience <= 0:
                print('Finishing training, out of patience')
                break
Exemplo n.º 33
0
class MSGAN:
    '''
        Multi-scale GAN trainer: holds the generator/discriminator pair, all
        optimisation parameters, and the training/validation/bookkeeping loops.
        Saves the model after each epoch.
    '''
    def __init__(self,
                 data_folder,
                 Nepochs=1000,
                 SlopLRelu=0.2,
                 use_cuda=True):
        '''
            data_folder: Location of the CIFAR10 data used to build the loaders
            Nepochs: Number of epochs
            SlopLRelu: Negative slope of the LeakyReLU activations
            use_cuda: Move models, losses and batches to the GPU when True
        '''
        self.use_cuda = use_cuda

        # Dimension of the per-scale latent code fed to the generator.
        self.latent_dim = 10

        self.Nepochs = Nepochs
        self.StartEpochs = 0  #Different from zeros if the optimisation is resuming

        self.SlopLRelu = SlopLRelu

        # Network topology: base feature count, depths, and the image
        # resolutions (scales) produced/consumed at each level.
        self.NF = 32
        self.Ndepth = 5
        self.Ndepth_max = 5
        self.scales = [4, 8, 16, 32]
        self.Nfeatures = [16, 32, 64, 128]
        # self.len_ohe = 10

        self.depths = [i for i in range(self.Ndepth)]
        self.Nscales = len(self.scales)

        self.G = MSGenerator(NF=self.NF,
                             scales=self.scales,
                             depths=self.depths,
                             Ndepth_max=self.Ndepth_max,
                             SlopLRelu=self.SlopLRelu,
                             latent_dim=self.latent_dim)
        # len_ohe=self.len_ohe)

        self.D = MSDiscriminator(NF=self.NF,
                                 scales=self.scales,
                                 depths=self.depths,
                                 Ndepth_max=self.Ndepth_max,
                                 SlopLRelu=self.SlopLRelu,
                                 Nfeatures=self.Nfeatures)
        # len_ohe=self.len_ohe)

        # Standard WGAN-style Adam settings (beta1=0.5).
        self.optim_G = optim.Adam(self.G.parameters(),
                                  lr=0.0002,
                                  betas=(0.5, 0.999))
        self.optim_D = optim.Adam(self.D.parameters(),
                                  lr=0.0002,
                                  betas=(0.5, 0.999))

        self.batchsize = 8

        self.compute_adv_loss = WGANLoss()  #.cuda()

        if self.use_cuda:
            self.G = self.G.cuda()
            self.D = self.D.cuda()
            self.compute_adv_loss = self.compute_adv_loss.cuda()

        self.reg_param = 10.  #regularization parameter

        #Init Logger for the tensorboard
        # self.logger = Logger('./logs'  + "/") #Logger for the scalar & histograms
        self.writer = SummaryWriter('./logs/')  #Writter for image saving

        # Per-phase loss histories, cleared at the start of every epoch.
        self.g_loss_record = {key: [] for key in ['train', 'val']}
        self.g_adv_loss_record = {key: [] for key in ['train', 'val']}
        self.d_loss_record = {key: [] for key in ['train', 'val']}
        self.d_adv_loss_fake_record = {key: [] for key in ['train', 'val']}
        self.d_adv_loss_real_record = {key: [] for key in ['train', 'val']}

        #Generator/Discriminator loss mean as indicator for the optimal model
        self.best_g_loss = np.power(10, 20)  #np.inf
        self.best_d_loss = np.power(10, 20)  #np.inf

        self.get_training_images = True
        self.ImagesDir = './TrainingImages/'
        if not os.path.exists(self.ImagesDir): os.makedirs(self.ImagesDir)
        self.ModelDir = './TrainingModels/'
        if not os.path.exists(self.ModelDir): os.makedirs(self.ModelDir)

        #Define dataloader
        train_dataset, test_dataset = load_CIFAR10_datasets(
            data_folder, self.latent_dim, self.Nscales)
        self.train_loader = DataLoader(train_dataset,
                                       batch_size=self.batchsize,
                                       shuffle=True,
                                       num_workers=30,
                                       pin_memory=True)

        self.test_loader = DataLoader(test_dataset,
                                      batch_size=self.batchsize,
                                      shuffle=True,
                                      num_workers=30,
                                      pin_memory=True)

        self.train_len = len(train_dataset)
        self.test_len = len(test_dataset)

        #Load label names
        self.label_names = load_CIFAR10_label_names(data_folder)

        #Fixed latent code to generate always the same image
        # to follow the evolution of the training
        self.fixed_LC = [
            np.random.randn(1, self.latent_dim).astype(np.float32)
            for _ in range(self.Nscales)
        ]

        print("Initialize the networks weigths...")
        self.G.apply(self.he_init)
        self.D.apply(self.he_init)

    def train(self):
        """Run the full optimisation: per-epoch D/G updates, then
        validation, TensorBoard logging, and checkpointing."""

        for self.epoch in tqdm(range(self.StartEpochs, self.Nepochs)):

            self.G.train()
            self.D.train()
            self.phase = 'train'

            self.clear_loss_records()
            total_iter = np.ceil(self.train_len / self.batchsize)
            for self.counter, (self.X, _, self.latent_codes) in enumerate(
                    tqdm(self.train_loader, total=total_iter, desc='train')):

                # requires_grad=True on the real batch so the R1-style gradient
                # penalty in backward_D can differentiate w.r.t. the input.
                self.X = Variable(self.X, requires_grad=True)  #.cuda()
                self.latent_codes = [
                    Variable(latent_code) for latent_code in self.latent_codes
                ]

                if self.use_cuda:
                    self.X = self.X.cuda()
                    self.latent_codes = [
                        latent_code.cuda() for latent_code in self.latent_codes
                    ]

                #Pooling Real image to fit generator Output
                factors = [
                    int(self.scales[-1] / scale) for scale in self.scales
                ]
                self.Img_real = [
                    nn.AvgPool2d(factor, stride=factor, padding=0)(self.X)
                    for factor in factors
                ]

                # ===update D===
                self.optim_D.zero_grad()
                self.forward_D()
                self.backward_D()

                # ===update G===
                # forward_D is re-run so d_fake/d_real reflect the just-updated D.
                self.forward_D()
                self.optim_G.zero_grad()
                self.forward_G()
                self.backward_G()

                # print 'record loss'
                self.record_loss()
                self.StartEpochs = self.epoch

                # if self.counter > 1000:
                #     break

            # ===validation===
            self.validate()
            # ===tensorboard visualization===
            self.tensorboard()

            # ===save model===
            self.save()

    def forward_D(self):
        """Generate fakes and score real + (detached) fake pyramids with D.

        Sets self.Img_fake, self.d_real and self.d_fake.
        """

        self.Img_fake = self.G(self.latent_codes)
        _, self.d_real = self.D(self.Img_real)
        _, self.d_fake = self.D([Img.detach() for Img in self.Img_fake])
        #detach means that the netork is using HRfake but block the optimization
        #to the network that generated it, ie clone the variable as it's a new one

    def forward_G(self):
        """Re-score the (non-detached) fakes so G can receive gradients."""
        _, self.d_fake = self.D(self.Img_fake)

    def backward_G(self):
        """Compute the generator loss, backpropagate, and step optim_G."""
        self.g_loss = self.compute_G_loss()
        self.g_loss.backward()
        self.optim_G.step()

    def backward_D(self):
        """Compute the discriminator loss plus a gradient penalty on the real
        images, backpropagate both, and step optim_D."""
        self.d_loss = self.compute_D_loss()

        #retain_graph=False because the loss function
        # for the gradient and the discriminator are not
        # the same and therefore the gradients are differents

        # self.d_loss.backward(retain_graph=False)
        self.d_loss.backward(retain_graph=True)
        # R1-style penalty: reg_param * E[||grad_x D(x_real)||^2].
        self.d_real_reg = sum([
            self.reg_param * compute_grad2(d_real_i, Img_real_i).mean()
            for d_real_i, Img_real_i in zip(self.d_real, self.Img_real)
        ])
        self.d_real_reg.backward()
        # R1_reg(dloss_real,d_real,x_real)
        self.optim_D.step()

    def compute_G_loss(self):
        """Adversarial generator loss summed over all discriminator scales."""
        #Make the discr find True when fake
        Nlayers = len(self.d_fake)
        # learning_factors = [numpy2var((Nlayers-(i+1))*((self.epoch+1)/self.Nmax_epoch),use_cuda=True) for i in range((Nlayers))]
        # self.g_adv_loss = [self.compute_adv_loss(self.d_fake[i], True)*learning_factors[i] for i in range(len(self.d_fake))]
        self.g_adv_loss = [
            self.compute_adv_loss(d_fake_i, True) for d_fake_i in self.d_fake
        ]

        #Concatenate all losses
        self.g_adv_loss = sum(self.g_adv_loss)

        return self.g_adv_loss

    def compute_D_loss(self):
        """Discriminator loss: mean of real + fake adversarial terms over scales."""

        # Nlayers = len(self.d_real)
        # self.d_adv_loss = []
        # for i in range(len(self.d_real)):
        #     self.d_adv_loss_real = self.compute_adv_loss(self.d_real[i], True)
        #     self.d_adv_loss_fake = self.compute_adv_loss(self.d_fake[i], False)
        #     # learning_factor = numpy2var((Nlayers-(i+1))*((self.epoch+1)/self.Nmax_epoch),use_cuda=True)
        #     # self.d_adv_loss.append((self.d_adv_loss_real + self.d_adv_loss_fake)*learning_factor)
        #     self.d_adv_loss.append((self.d_adv_loss_real + self.d_adv_loss_fake))
        #
        # #Concatenate all losses
        # self.d_adv_loss = sum(self.d_adv_loss)
        self.d_adv_loss_real = sum([
            self.compute_adv_loss(d_real_i, True) for d_real_i in self.d_real
        ])
        self.d_adv_loss_fake = sum([
            self.compute_adv_loss(d_fake_i, False) for d_fake_i in self.d_fake
        ])

        #Regularization on the gradient of real samples
        # self.d_adv_loss_real = self.d_adv_loss_real + self.d_real_reg

        self.d_adv_loss = (self.d_adv_loss_real +
                           self.d_adv_loss_fake) / (2. * len(self.d_real))

        return self.d_adv_loss

    def record_loss(self):
        """Append the current batch losses to the records of the active phase."""
        p = self.phase
        self.g_loss_record[p].append(
            var2numpy(self.g_loss.mean(), use_cuda=self.use_cuda))
        self.d_loss_record[p].append(
            var2numpy(self.d_loss.mean(), use_cuda=self.use_cuda))
        self.d_adv_loss_fake_record[p].append(
            var2numpy(self.d_adv_loss_fake.mean(), use_cuda=self.use_cuda))
        self.d_adv_loss_real_record[p].append(
            var2numpy(self.d_adv_loss_real.mean(), use_cuda=self.use_cuda))

    def clear_loss_records(self):
        """Reset the per-phase loss histories (g_adv_loss_record is untouched)."""
        for p in ['train', 'val']:
            self.g_loss_record[p] = []
            self.d_loss_record[p] = []
            self.d_adv_loss_fake_record[p] = []
            self.d_adv_loss_real_record[p] = []

    def validate(self):
        """Evaluate G and D on the test loader without updating weights;
        losses are recorded under the 'val' phase."""

        self.G.eval()
        self.D.eval()
        self.phase = 'val'
        total_iter = np.ceil(self.test_len / self.batchsize)

        for self.counter, (self.X, _, self.latent_codes) in enumerate(
                tqdm(self.test_loader, total=total_iter, desc='validation')):

            #Generate latent code
            self.X = Variable(self.X, requires_grad=True)  #.cuda()
            # self.X = Variable(self.X)#.cuda()
            self.latent_codes = [
                Variable(latent_code) for latent_code in self.latent_codes
            ]

            if self.use_cuda:
                self.X = self.X.cuda()
                self.latent_codes = [
                    latent_code.cuda() for latent_code in self.latent_codes
                ]

            with torch.no_grad():
                #Pooling Real image to fit generator Output
                # factors = [int(self.ImgSizes[-1]/imgsize) for imgsize in self.ImgSizes]
                factors = [
                    int(self.scales[-1] / scale) for scale in self.scales
                ]
                self.Img_real = [
                    nn.AvgPool2d(factor, stride=factor, padding=0)(self.X)
                    for factor in factors
                ]

                self.forward_D()
                self.forward_G()
                self.g_loss = self.compute_G_loss()
                self.d_loss = self.compute_D_loss()
                self.record_loss()

            # if self.counter > 1000:
            #     break

    def predict(self, img, labels, batchsize=1):
        """Run the discriminator over `img`/`labels` and return its per-scale
        feature outputs, stacked as one numpy array per scale."""

        dataset = dataset_h5(img, labels, self.latent_dim, self.Nscales)
        data_loader = DataLoader(dataset,
                                 batch_size=batchsize,
                                 shuffle=False,
                                 num_workers=30,
                                 pin_memory=True)

        self.D.eval()
        total_iter = np.ceil(img.shape[0] / batchsize)

        features = [[] for _ in self.scales]

        for counter, (X, _, _) in enumerate(
                tqdm(data_loader, total=total_iter, desc='prediction')):

            X = Variable(X)
            if self.use_cuda:
                X = X.cuda()

            with torch.no_grad():
                #Pooling Real image to fit generator Output
                factors = [
                    int(self.scales[-1] / scale) for scale in self.scales
                ]
                Img_real = [
                    nn.AvgPool2d(factor, stride=factor, padding=0)(X)
                    for factor in factors
                ]
                # NOTE(review): indexing [0] keeps only the first item of each
                # batch; with batchsize > 1 the remaining items are dropped —
                # confirm intended.
                for i, feat in enumerate(self.D(Img_real)[0]):
                    if self.use_cuda:
                        feat = feat.cpu()
                    features[i].append(feat.data.numpy()[0])

        features = [np.vstack(feat) for feat in features]

        return features

    def generate(self, Nimages=1, latent_codes=None):
        """Sample `Nimages` image pyramids from G; draws fresh latent codes
        unless `latent_codes` is supplied (e.g. self.fixed_LC)."""

        self.G.eval()

        get_latent = True if latent_codes == None else False

        gen_image = [[] for i in range(Nimages)]
        for i in range(Nimages):
            #Generate latent code
            # Reseed from the clock so successive calls differ; np.random.seed
            # returns None, so `seed` is unused.
            seed = np.random.seed(i + datetime.now().second +
                                  datetime.now().microsecond)

            if get_latent:
                latent_codes = [
                    np.random.randn(1, self.latent_dim).astype(np.float32)
                    for _ in range(self.Nscales)
                ]

            latent_codes = [
                Variable(torch.from_numpy(latent_code))
                for latent_code in latent_codes
            ]
            if self.use_cuda:
                latent_codes = [
                    latent_code.cuda() for latent_code in latent_codes
                ]

            with torch.no_grad():
                #Pooling Real image to fit generator Output
                # factors = [int(self.ImgSizes[-1]/imgsize) for imgsize in self.ImgSizes]

                # NOTE(review): images are appended only inside the use_cuda
                # branch — on CPU this returns empty lists. Looks like the
                # append should be outside the `if`; confirm.
                for X in self.G(latent_codes):
                    if self.use_cuda:
                        X = X.cpu()
                        gen_image[i].append(X.data.numpy()[0].transpose(
                            (1, 2, 0)))

        return gen_image

    def save(self):
        """Checkpoint G and D (state dict + optimizer + epoch) for this epoch.

        The loss-improvement gates are currently disabled (`if True:`), so
        both networks are saved every epoch.
        """
        file_name = os.path.join(self.ModelDir, 'Epoch%d' % (self.epoch))
        g_file = file_name + '-G.pth'
        d_file = file_name + '-D.pth'

        g_loss_mean = np.array(self.g_loss_record['val']).mean()
        d_loss_mean = np.array(self.d_loss_record['val']).mean()

        # if g_loss_mean<self.best_g_loss:
        if True:
            state = {
                'state_dict': self.G.state_dict(),
                'optimizer': self.optim_G.state_dict(),
                'epoch': self.epoch,
            }
            torch.save(state, g_file)
            self.best_g_loss = g_loss_mean

        # if d_loss_mean<self.best_d_loss:
        if True:
            state = {
                'state_dict': self.D.state_dict(),
                'optimizer': self.optim_D.state_dict(),
                'epoch': self.epoch,
            }
            torch.save(state, d_file)
            self.best_d_loss = d_loss_mean

    def load(self, Gpath, Dpath):
        """Restore G and D (and their optimizers) from checkpoint files."""
        state_g = torch.load(Gpath)
        self.G.load_state_dict(state_g['state_dict'])
        self.optim_G.load_state_dict(state_g['optimizer'])

        state_d = torch.load(Dpath)
        self.D.load_state_dict(state_d['state_dict'])
        self.optim_D.load_state_dict(state_d['optimizer'])

        #Reset the best loss for the generator and discriminator
        self.best_g_loss = np.power(10, 20)  #np.inf
        self.best_d_loss = np.power(10, 20)  #np.inf

    def tensorboard(self):
        """Write epoch-level scalars, parameter/gradient histograms and
        (optionally) sample images for the current epoch."""
        # ===Add scalar losses===
        for p in ['train', 'val']:
            prefix = p + '/'
            info = {
                prefix + 'G_loss':
                np.array(self.g_loss_record[p]).mean(),
                prefix + 'D_loss':
                np.array(self.d_loss_record[p]).mean(),
                prefix + 'D_adv_loss_fake':
                np.array(self.d_adv_loss_fake_record[p]).mean(),
                prefix + 'D_adv_loss_real':
                np.array(self.d_adv_loss_real_record[p]).mean()
            }

            # self.writer.add_scalars(p, info, self.epoch)
            for tag, value in info.items():
                self.writer.add_scalars(tag, {tag: value}, self.epoch)
                # self.logger.scalar_summary(tag, value, self.epoch)

        # ===Add gradien histogram===
        # NOTE(review): `prefix` leaks out of the loop above, so all histogram
        # tags below are prefixed with 'val/' — confirm that is intended.
        for tag, value in self.G.named_parameters():
            tag = tag.replace('.', '/')
            self.writer.add_histogram('G/' + prefix + tag, var2numpy(value),
                                      self.epoch)
            # self.logger.histo_summary('G/' + prefix +tag, var2numpy(value), self.epoch)
            if value.grad is not None:
                self.writer.add_histogram('G/' + prefix + tag + '/grad',
                                          var2numpy(value.grad), self.epoch)
                # self.logger.histo_summary('G/' + prefix +tag + '/grad', var2numpy(value.grad), self.epoch)

        for tag, value in self.D.named_parameters():
            tag = tag.replace('.', '/')
            self.writer.add_histogram('D/' + prefix + tag, var2numpy(value),
                                      self.epoch)
            # self.logger.histo_summary('D/' + prefix + tag, var2numpy(value), self.epoch)
            if value.grad is not None:
                self.writer.add_histogram('D/' + prefix + tag + '/grad',
                                          var2numpy(value.grad), self.epoch)
                # self.logger.histo_summary('D/' + prefix + tag + '/grad',var2numpy(value.grad), self.epoch)

        #===generate sample images===
        if self.get_training_images == True:
            #
            # K = np.random.randint(self.batchsize)
            #
            # f,ax = plt.subplots(1,len(self.Img_fake),figsize=(int(5*len(self.Img_fake)),5))
            # for i in range(len(self.Img_fake)):
            #     if self.use_cuda:
            #         img = self.Img_fake[i].cpu()
            #     else:
            #         img = self.Img_fake[i]
            #
            #     img = (img.data.numpy()[K]).transpose((1,2,0))
            #

            # One pyramid from the fixed latent code, plotted side by side so
            # training progress is comparable across epochs.
            gen_img = self.generate(Nimages=1, latent_codes=self.fixed_LC)[0]
            f, ax = plt.subplots(1,
                                 len(gen_img),
                                 figsize=(5 * len(gen_img), 5))
            for i, img in enumerate(gen_img):
                ax[i].imshow(img)

            plt.savefig(os.path.join(self.ImagesDir,
                                     'Img-Epoch%d.png' % (self.epoch)),
                        format='png')

    def he_init(self, layer, nonlinearity='conv2d'):
        """Kaiming (He) initialisation hook, applied via ``net.apply``."""

        classname = layer.__class__.__name__

        # Check if the leayer is a convolution.
        # If True, apply Kaiming normalization
        if classname.find('Conv') != -1:
            nonlinearity = nonlinearity.lower()
            # NOTE(review): kaiming_normal is only reached in the final else
            # branch (default nonlinearity); the other branches compute `gain`
            # but never use it, so those layers keep their default init —
            # confirm whether the call should be outside the if/elif/else.
            if nonlinearity not in [
                    'linear', 'conv1d', 'conv2d', 'conv3d', 'relu',
                    'leaky_relu', 'sigmoid', 'tanh'
            ]:
                if not hasattr(layer, 'gain') or layer.gain is None:
                    gain = 0  # default
                else:
                    gain = layer.gain
            elif nonlinearity == 'leaky_relu':
                # assert param is not None, 'Negative_slope(param) should be given.'
                gain = calculate_gain(nonlinearity, self.SlopLRelu)
            else:
                gain = calculate_gain(nonlinearity)
                kaiming_normal(layer.weight, a=gain)

    def copy(self, model):
        '''
            Allow to get paramters from another pretrained model
        '''
        # Shallow-copies every attribute (networks, optimizers, loaders, ...)
        # from `model` onto this instance.
        for key in model.__dict__.keys():
            self.__dict__[key] = model.__dict__[key]
Exemplo n.º 34
0
class SummaryWorker(multiprocessing.Process):
    """Asynchronous TensorBoard writer.

    Runs as a separate process so that serializing summaries (scalars,
    images, histograms) never blocks the training loop. The trainer calls
    the instance (``__call__``) with a summary name; when the matching
    rate-limit timer fires, tensors are detached and copied to CPU numpy on
    the caller's side (``copy_*``) and pushed through a queue. The worker
    process pops the queue and dispatches to the matching ``summary_*``
    method until it receives the ``None`` sentinel from :meth:`stop`.
    """

    def __init__(self, env):
        super(SummaryWorker, self).__init__()
        self.env = env
        self.config = env.config
        self.queue = multiprocessing.Queue()
        # One rate-limiting timer per summary kind; a missing config option
        # disables that kind entirely (the timer never fires).
        try:
            self.timer_scalar = utils.train.Timer(env.config.getfloat('summary', 'scalar'))
        except configparser.NoOptionError:
            self.timer_scalar = lambda: False
        try:
            self.timer_image = utils.train.Timer(env.config.getfloat('summary', 'image'))
        except configparser.NoOptionError:
            self.timer_image = lambda: False
        try:
            self.timer_histogram = utils.train.Timer(env.config.getfloat('summary', 'histogram'))
        except configparser.NoOptionError:
            self.timer_histogram = lambda: False
        # Regex whitelist deciding which named parameters get histograms.
        with open(os.path.expanduser(os.path.expandvars(env.config.get('summary_histogram', 'parameters'))), 'r') as f:
            self.histogram_parameters = utils.RegexList([line.rstrip() for line in f])
        self.draw_bbox = utils.visualize.DrawBBox(env.config, env.category)
        self.draw_iou = utils.visualize.DrawIou(env.config)

    def __call__(self, name, **kwargs):
        """Queue a summary of kind *name* ('scalar'/'image'/'histogram') if
        its timer fires; data is copied off the GPU before being queued."""
        if getattr(self, 'timer_' + name)():
            kwargs = getattr(self, 'copy_' + name)(**kwargs)
            self.queue.put((name, kwargs))

    def stop(self):
        """Ask the worker process to exit by sending the ``None`` sentinel."""
        self.queue.put((None, {}))

    def run(self):
        """Worker main loop: create the writer, then drain the queue until
        the ``None`` sentinel arrives."""
        self.writer = SummaryWriter(os.path.join(self.env.model_dir, self.env.args.run))
        while True:
            name, kwargs = self.queue.get()
            if name is None:
                break
            func = getattr(self, 'summary_' + name)
            try:
                func(**kwargs)
            except Exception:
                # A failing summary must not kill the worker. This was a bare
                # ``except:``, which also swallowed SystemExit and
                # KeyboardInterrupt and could prevent a clean shutdown.
                traceback.print_exc()

    def copy_scalar(self, **kwargs):
        """Detach scalar losses to CPU numpy so they can safely cross the
        process boundary."""
        step, loss_total, loss, loss_hparam = (kwargs[key] for key in 'step, loss_total, loss, loss_hparam'.split(', '))
        loss_total = loss_total.data.clone().cpu().numpy()
        loss = {key: loss[key].data.clone().cpu().numpy() for key in loss}
        loss_hparam = {key: loss_hparam[key].data.clone().cpu().numpy() for key in loss_hparam}
        return dict(
            step=step,
            loss_total=loss_total,
            loss=loss, loss_hparam=loss_hparam,
        )

    def summary_scalar(self, **kwargs):
        """Write the per-component losses, the (optional) hyper-parameter
        weighted losses, and the total loss for this step."""
        step, loss_total, loss, loss_hparam = (kwargs[key] for key in 'step, loss_total, loss, loss_hparam'.split(', '))
        for key in loss:
            self.writer.add_scalar('loss/' + key, loss[key][0], step)
        if self.config.getboolean('summary_scalar', 'loss_hparam'):
            self.writer.add_scalars('loss_hparam', {key: loss_hparam[key][0] for key in loss_hparam}, step)
        self.writer.add_scalar('loss_total', loss_total[0], step)

    def copy_image(self, **kwargs):
        """Detach images, ground-truth boxes and predictions to CPU numpy;
        fold the positive/negative debug masks into a single matching map in
        [0, 1] (1 positive, 0 negative, 0.5 neither)."""
        step, height, width, rows, cols, data, pred, debug = (kwargs[key] for key in 'step, height, width, rows, cols, data, pred, debug'.split(', '))
        data = {key: data[key].clone().cpu().numpy() for key in 'image, yx_min, yx_max, cls'.split(', ')}
        pred = {key: pred[key].data.clone().cpu().numpy() for key in 'yx_min, yx_max, iou, logits'.split(', ') if key in pred}
        matching = (debug['positive'].float() - debug['negative'].float() + 1) / 2
        matching = matching.data.clone().cpu().numpy()
        return dict(
            step=step, height=height, width=width, rows=rows, cols=cols,
            data=data, pred=pred,
            matching=matching,
        )

    def summary_image(self, **kwargs):
        """Render bbox and IoU visualizations for (at most ``limit``) images
        of the batch and write them as image grids."""
        step, height, width, rows, cols, data, pred, matching = (kwargs[key] for key in 'step, height, width, rows, cols, data, pred, matching'.split(', '))
        image = data['image']
        limit = min(self.config.getint('summary_image', 'limit'), image.shape[0])
        image = image[:limit, :, :, :]
        yx_min, yx_max, iou = (pred[key] for key in 'yx_min, yx_max, iou'.split(', '))
        # Predicted boxes live in grid coordinates; rescale to pixels.
        scale = [height / rows, width / cols]
        yx_min, yx_max = (a * scale for a in (yx_min, yx_max))
        if 'logits' in pred:
            cls = np.argmax(F.softmax(torch.autograd.Variable(torch.from_numpy(pred['logits'])), -1).data.cpu().numpy(), -1)
        else:
            # np.int was removed in NumPy 1.24; it was always an alias of int.
            cls = np.zeros(iou.shape, int)
        if self.config.getboolean('summary_image', 'bbox'):
            # data
            canvas = np.copy(image)
            canvas = pybenchmark.profile('bbox/data')(self.draw_bbox_data)(canvas, *(data[key] for key in 'yx_min, yx_max, cls'.split(', ')))
            self.writer.add_image('bbox/data', torchvision.utils.make_grid(torch.from_numpy(np.stack(canvas)).permute(0, 3, 1, 2).float(), normalize=True, scale_each=True), step)
            # pred
            canvas = np.copy(image)
            canvas = pybenchmark.profile('bbox/pred')(self.draw_bbox_pred)(canvas, yx_min, yx_max, cls, iou, nms=True)
            self.writer.add_image('bbox/pred', torchvision.utils.make_grid(torch.from_numpy(np.stack(canvas)).permute(0, 3, 1, 2).float(), normalize=True, scale_each=True), step)
        if self.config.getboolean('summary_image', 'iou'):
            # bbox
            canvas = np.copy(image)
            canvas_data = self.draw_bbox_data(canvas, *(data[key] for key in 'yx_min, yx_max, cls'.split(', ')), colors=['g'])
            # data
            for i, canvas in enumerate(pybenchmark.profile('iou/data')(self.draw_bbox_iou)(list(map(np.copy, canvas_data)), yx_min, yx_max, cls, matching, rows, cols, colors=['w'])):
                canvas = np.stack(canvas)
                canvas = torch.from_numpy(canvas).permute(0, 3, 1, 2)
                canvas = torchvision.utils.make_grid(canvas.float(), normalize=True, scale_each=True)
                self.writer.add_image('iou/data%d' % i, canvas, step)
            # pred
            for i, canvas in enumerate(pybenchmark.profile('iou/pred')(self.draw_bbox_iou)(list(map(np.copy, canvas_data)), yx_min, yx_max, cls, iou, rows, cols, colors=['w'])):
                canvas = np.stack(canvas)
                canvas = torch.from_numpy(canvas).permute(0, 3, 1, 2)
                canvas = torchvision.utils.make_grid(canvas.float(), normalize=True, scale_each=True)
                self.writer.add_image('iou/pred%d' % i, canvas, step)

    def draw_bbox_data(self, canvas, yx_min, yx_max, cls, colors=None):
        """Draw ground-truth boxes onto each image of the batch."""
        batch_size = len(canvas)
        if len(cls.shape) == len(yx_min.shape):
            # One-hot classes: reduce to the class index.
            cls = np.argmax(cls, -1)
        yx_min, yx_max, cls = ([a[b] for b in range(batch_size)] for a in (yx_min, yx_max, cls))
        # astype(int) replaces the removed np.int alias (same behavior).
        return [self.draw_bbox(canvas, yx_min.astype(int), yx_max.astype(int), cls, colors=colors) for canvas, yx_min, yx_max, cls in zip(canvas, yx_min, yx_max, cls)]

    def draw_bbox_pred(self, canvas, yx_min, yx_max, cls, iou, colors=None, nms=False):
        """Draw predicted boxes above the detection threshold, optionally
        suppressing overlapping boxes with NMS."""
        batch_size = len(canvas)
        mask = iou > self.config.getfloat('detect', 'threshold')
        yx_min, yx_max = (np.reshape(a, [a.shape[0], -1, 2]) for a in (yx_min, yx_max))
        cls, iou, mask = (np.reshape(a, [a.shape[0], -1]) for a in (cls, iou, mask))
        yx_min, yx_max, cls, iou, mask = ([a[b] for b in range(batch_size)] for a in (yx_min, yx_max, cls, iou, mask))
        yx_min, yx_max, cls, iou = ([a[m] for a, m in zip(l, mask)] for l in (yx_min, yx_max, cls, iou))
        if nms:
            overlap = self.config.getfloat('detect', 'overlap')
            keep = [pybenchmark.profile('nms')(utils.postprocess.nms)(torch.Tensor(iou), torch.Tensor(yx_min), torch.Tensor(yx_max), overlap) if iou.shape[0] > 0 else [] for yx_min, yx_max, iou in zip(yx_min, yx_max, iou)]
            keep = [np.array(k, int) for k in keep]
            yx_min, yx_max, cls = ([a[k] for a, k in zip(l, keep)] for l in (yx_min, yx_max, cls))
        return [self.draw_bbox(canvas, yx_min.astype(int), yx_max.astype(int), cls, colors=colors) for canvas, yx_min, yx_max, cls in zip(canvas, yx_min, yx_max, cls)]

    def draw_bbox_iou(self, canvas_share, yx_min, yx_max, cls, iou, rows, cols, colors=None):
        """For each anchor slot, draw its thresholded boxes on a copy of the
        shared canvases and overlay the per-cell IoU heatmap; returns one
        list of canvases per anchor."""
        batch_size = len(canvas_share)
        yx_min, yx_max = ([np.squeeze(a, -2) for a in np.split(a, a.shape[-2], -2)] for a in (yx_min, yx_max))
        cls, iou = ([np.squeeze(a, -1) for a in np.split(a, a.shape[-1], -1)] for a in (cls, iou))
        results = []
        for i, (yx_min, yx_max, cls, iou) in enumerate(zip(yx_min, yx_max, cls, iou)):
            mask = iou > self.config.getfloat('detect', 'threshold')
            yx_min, yx_max = (np.reshape(a, [a.shape[0], -1, 2]) for a in (yx_min, yx_max))
            cls, iou, mask = (np.reshape(a, [a.shape[0], -1]) for a in (cls, iou, mask))
            yx_min, yx_max, cls, iou, mask = ([a[b] for b in range(batch_size)] for a in (yx_min, yx_max, cls, iou, mask))
            yx_min, yx_max, cls = ([a[m] for a, m in zip(l, mask)] for l in (yx_min, yx_max, cls))
            canvas = [self.draw_bbox(canvas, yx_min.astype(int), yx_max.astype(int), cls, colors=colors) for canvas, yx_min, yx_max, cls in zip(np.copy(canvas_share), yx_min, yx_max, cls)]
            iou = [np.reshape(a, [rows, cols]) for a in iou]
            canvas = [self.draw_iou(_canvas, iou) for _canvas, iou in zip(canvas, iou)]
            results.append(canvas)
        return results

    def copy_histogram(self, **kwargs):
        """Detach step/dnn payload to CPU numpy where they are tensors."""
        return {key: kwargs[key].data.clone().cpu().numpy() if torch.is_tensor(kwargs[key]) else kwargs[key] for key in 'step, dnn'.split(', ')}

    def summary_histogram(self, **kwargs):
        """Write a histogram for every parameter matched by the configured
        regex whitelist."""
        step, dnn = (kwargs[key] for key in 'step, dnn'.split(', '))
        for name, param in dnn.named_parameters():
            if self.histogram_parameters(name):
                self.writer.add_histogram(name, param, step)
Exemplo n.º 35
0
def train(model_path, epochs):
    trans = DataUtill.get_ImageNet_transform(random_horizontal_flip=True)
    train_data = DataUtill.Placesdataset(data_path, transforms=trans)
    train_data_loader = datas.DataLoader(train_data,
                                         batch_size,
                                         shuffle=True,
                                         num_workers=8)

    # 可视化数据
    # torchvision.utils.save_image(valid_batch["img"], "pic.png", normalize=True)

    encoder = alexnet(True)
    num_fea = encoder.classifier[6].in_features
    features = list(encoder.classifier.children())[:-1]
    ofc = nn.Linear(num_fea, 200)
    nn.init.normal(ofc.weight, 0, 0.01)
    features.append(ofc)
    encoder.classifier = nn.Sequential(*features)

    encoder = encoder.cuda()
    global_step = 0

    optimizer = optim.SGD(encoder.parameters(),
                          learning_rate,
                          0.9,
                          weight_decay=0.0005)
    optimizer = optim.lr_scheduler.ExponentialLR(optimizer, 0.998)
    critizen = nn.CrossEntropyLoss()

    Writer = SummaryWriter(log_dir=model_path)

    # 先计算一次当前acc
    max_acc, min_eval_loss = eval(trans, encoder, critizen)
    print("初始准确率为{}%".format(max_acc))
    Writer.add_scalar("/eval/eval_loss", min_eval_loss, global_step)
    Writer.add_scalar("/eval/accuracy", max_acc, global_step)

    for epoch in range(epochs):
        for step, batch in enumerate(train_data_loader):
            global_step = global_step + 1
            input = batch["img"]
            label = batch["class"]

            # torchvision.utils.save_image(input, "pic.png", normalize=True)

            input = autograd.Variable(input)
            label = autograd.Variable(label)

            input = input.cuda()
            label = label.cuda()

            encoder.zero_grad()
            output = encoder(input)
            loss = critizen(output, label.squeeze())
            loss.backward(retain_graph=True)
            optimizer.step()

            if global_step % 100 == 0:
                Writer.add_scalar("train_loss", loss, global_step)

            if global_step % 1000 == 0:
                Writer.add_histogram("/conv1/grad",
                                     encoder.features[0].weight.grad,
                                     global_step)
                Writer.add_histogram("/conv1/weight",
                                     encoder.features[0].weight, global_step)
                Writer.add_histogram("/fc6/grad",
                                     encoder.classifier[6].weight.grad,
                                     global_step)
                Writer.add_histogram("/fc6/weight",
                                     encoder.classifier[6].weight, global_step)
                acc, eval_loss = eval(trans, encoder, critizen)
                Writer.add_scalar("/eval/accuracy", acc, global_step)
                Writer.add_scalar("/eval/eval_loss", eval_loss, global_step)
                if acc > max_acc:
                    max_acc = max(acc, max_acc)
                    DataUtill.save_param(encoder, model_path + "alexnet.pkl")
                    print(
                        "save params in {} epoch {} step with accuracy {}% , and the loss is {}"
                        .format(epoch, step, acc, eval_loss))