class ParseLogCallback(object):
    """Batch-end callback that feeds two kinds of data to TensorBoard.

    1. Layer value distributions (as histograms): mxnet's "monitor" module
       writes the values of selected layers to a log file **asynchronously**
       during training; this callback tails that file and forwards each
       parsed value to TensorBoard.
    2. Training loss (as scalars), read from ``param.eval_metric``.

    Currently does not support resuming training (the log-file cursor and
    the batch counter always restart from zero).
    """
    def __init__(self, dist_logging_dir=None, scalar_logging_dir=None,
                 logfile_path=None, batch_size=None, iter_monitor=0,
                 frequent=None, prefix='ssd'):
        self.scalar_logging_dir = scalar_logging_dir
        self.dist_logging_dir = dist_logging_dir
        self.logfile_path = logfile_path          # monitor-module output log to tail
        self.batch_size = batch_size
        self.iter_monitor = iter_monitor          # parse the log every N batches (0 = disabled)
        self.frequent = frequent                  # log loss every N batches (None = disabled)
        self.prefix = prefix                      # prepended to scalar metric names
        self.batch = 0                            # running batch counter
        self.line_idx = 0                         # cursor: log lines consumed so far
        try:
            from tensorboard import SummaryWriter
            self.dist_summary_writer = SummaryWriter(dist_logging_dir)
            self.scalar_summary_writer = SummaryWriter(scalar_logging_dir)
        except ImportError:
            logging.error('You can install tensorboard via `pip install tensorboard`.')

    def __call__(self, param):
        """Callback to parse a log file and add params to TensorBoard."""

        # Save distributions from the monitor output log.
        if self.iter_monitor != 0 and self.batch % self.iter_monitor == 0:
            with open(self.logfile_path) as fp:
                # Skip lines already consumed on previous calls.
                for _ in range(self.line_idx):
                    next(fp)  # bugfix: `fp.next()` is Python 2 only
                for line in fp:
                    if line.startswith('Batch'):
                        # Monitor lines look like: "Batch: <n> <layer_name> <value>\t..."
                        fields = [x for x in line.split(' ') if x]
                        layer_name = fields[2]
                        layer_value = np.array(float(fields[3].split('\t')[0])).flatten()
                        if np.isfinite(layer_value):
                            self.dist_summary_writer.add_histogram(layer_name, layer_value)
                    self.line_idx += 1

        # Save training loss (guard against frequent=None, which previously
        # raised TypeError on the modulo).
        if self.frequent is not None and self.batch % self.frequent == 0:
            if param.eval_metric is None:
                return
            name_value = param.eval_metric.get_name_value()
            for name, value in name_value:
                if self.prefix is not None:
                    name = '%s-%s' % (self.prefix, name)
                self.scalar_summary_writer.add_scalar(name, value, global_step=self.batch)
        self.batch += 1
示例#2
0
class LogMetricsCallback(object):
    """Write evaluation metrics to a TensorBoard event file.

    Behaves much like ``callback.Speedometer`` but emits TensorBoard events
    instead of console output (see https://github.com/dmlc/tensorboard).

    Parameters
    ----------
    logging_dir : str
        Directory for the TensorBoard event file; visualize afterwards with
        ``tensorboard --logdir=path/to/logs``.
    prefix : str, optional
        Prepended (as ``prefix-name``) to every scalar tag, so that e.g.
        train and eval curves logged under different directories can be
        told apart while sharing one plot.

    Examples
    --------
    >>> # log train and eval metrics under different directories.
    >>> training_log = 'logs/train'
    >>> evaluation_log = 'logs/eval'
    >>> batch_end_callbacks = [mx.contrib.tensorboard.LogMetricsCallback(training_log)]
    >>> eval_end_callbacks = [mx.contrib.tensorboard.LogMetricsCallback(evaluation_log)]
    >>> model.fit(train,
    >>>     ...
    >>>     batch_end_callback = batch_end_callbacks,
    >>>     eval_end_callback  = eval_end_callbacks)
    >>> # Then use `tensorboard --logdir=logs/` to launch TensorBoard visualization.
    """
    def __init__(self, logging_dir, prefix=None):
        self.prefix = prefix
        try:
            from tensorboard import SummaryWriter
        except ImportError:
            logging.error(
                'You can install tensorboard via `pip install tensorboard`.')
        else:
            self.summary_writer = SummaryWriter(logging_dir)

    def __call__(self, param):
        """Callback to log training speed and metrics in TensorBoard."""
        metric = param.eval_metric
        if metric is None:
            return
        for metric_name, metric_value in metric.get_name_value():
            tag = metric_name if self.prefix is None \
                else '%s-%s' % (self.prefix, metric_name)
            self.summary_writer.add_scalar(tag,
                                           metric_value,
                                           global_step=param.epoch)
示例#3
0
class LogMetricsCallback(object):
    """Write evaluation metrics to a TensorBoard event file (no explicit step).

    Works like ``callback.Speedometer`` but produces TensorBoard events
    (see https://github.com/dmlc/tensorboard).

    Parameters
    ----------
    logging_dir : str
        TensorBoard event file directory; launch visualization later with
        ``tensorboard --logdir=path/to/logs``.
    prefix : str, optional
        Optional tag prefix (``prefix-name``) so that metrics logged from
        several callbacks can be distinguished in one TensorBoard plot.

    Examples
    --------
    >>> # log train and eval metrics under different directories.
    >>> training_log = 'logs/train'
    >>> evaluation_log = 'logs/eval'
    >>> batch_end_callbacks = [mx.contrib.tensorboard.LogMetricsCallback(training_log)]
    >>> eval_end_callbacks = [mx.contrib.tensorboard.LogMetricsCallback(evaluation_log)]
    >>> model.fit(train,
    >>>     ...
    >>>     batch_end_callback = batch_end_callbacks,
    >>>     eval_end_callback  = eval_end_callbacks)
    >>> # Then use `tensorboard --logdir=logs/` to launch TensorBoard visualization.
    """
    def __init__(self, logging_dir, prefix=None):
        self.prefix = prefix
        try:
            from tensorboard import SummaryWriter
        except ImportError:
            logging.error('You can install tensorboard via `pip install tensorboard`.')
        else:
            self.summary_writer = SummaryWriter(logging_dir)

    def __call__(self, param):
        """Callback to log training speed and metrics in TensorBoard."""
        metric = param.eval_metric
        if metric is None:
            return
        for metric_name, metric_value in metric.get_name_value():
            tag = metric_name if self.prefix is None \
                else '%s-%s' % (self.prefix, metric_name)
            # No global_step: the writer decides how to place the event.
            self.summary_writer.add_scalar(tag, metric_value)
示例#4
0
class Logger():
    """Thin wrapper around ``SummaryWriter`` that auto-increments a
    per-key step index when the caller does not supply one."""

    def __init__(self, root):
        # Event files are written under `root`.
        self.writer = SummaryWriter(root)
        # Next auto-index for each scalar key (defaults to 0).
        self.last_indexes = defaultdict(int)

    def scalar(self, key, value, index=None):
        """Log one scalar; falls back to the tracked index for `key`."""
        if index is None:
            index = self.last_indexes[key]
        self.last_indexes[key] += 1
        self.writer.add_scalar(key, to_numeric(value), index)

    def from_stats(self, key_value_dictionary, index=None):
        """Log every (key, value) pair of a stats dictionary."""
        for key, value in key_value_dictionary.items():
            self.scalar(key, value, index)
class LogMetricsCallback(object):
    """Log (name, value) metric pairs to TensorBoard, keeping an internal
    step counter that advances once per invocation."""

    def __init__(self, logging_dir, prefix=None):
        self.prefix = prefix
        self.itr = 0  # step counter, incremented on every non-None call
        try:
            from tensorboard import SummaryWriter
        except ImportError:
            logging.error(
                'You can install tensorboard via `pip install tensorboard`.')
        else:
            self.summary_writer = SummaryWriter(logging_dir)

    def __call__(self, name_value):
        """Callback to log training speed and metrics in TensorBoard."""
        if name_value is None:
            return
        step = self.itr
        for metric_name, metric_value in name_value:
            tag = metric_name if self.prefix is None \
                else '%s-%s' % (self.prefix, metric_name)
            self.summary_writer.add_scalar(tag, metric_value, step)
        self.itr = step + 1
示例#6
0
class LogMetricsCallback(object):
    """Write evaluation metrics to a TensorBoard event file at a fixed step.

    Works like ``callback.Speedometer`` but produces TensorBoard events
    (see https://github.com/dmlc/tensorboard).

    Parameters
    ----------
    logging_dir : str
        TensorBoard event file directory; launch visualization later with
        ``tensorboard --logdir=path/to/logs``.
    prefix : str, optional
        Optional tag prefix (``prefix-name``) so that metrics from several
        callbacks can share one TensorBoard plot yet remain distinguishable.
    global_step : int, optional
        Step value passed to every ``add_scalar`` call. Note it is constant
        for the lifetime of the callback, so repeated calls log at the same
        step.
    """
    def __init__(self, logging_dir, prefix=None, global_step=100):
        self.prefix = prefix
        self.global_step = global_step
        try:
            from tensorboard import SummaryWriter
        except ImportError:
            logging.error(
                'You can install tensorboard via `pip install tensorboard`.')
        else:
            self.summary_writer = SummaryWriter(logging_dir)

    def __call__(self, param):
        """Callback to log training speed and metrics in TensorBoard."""
        metric = param.eval_metric
        if metric is None:
            return
        for metric_name, metric_value in metric.get_name_value():
            tag = metric_name if self.prefix is None \
                else '%s-%s' % (self.prefix, metric_name)
            self.summary_writer.add_scalar(tag,
                                           metric_value,
                                           global_step=self.global_step)
示例#7
0
    def train(self) -> None:
        """Train the model, optionally validating and checkpointing each epoch.

        Resumes from ``self._serialization_dir`` when it holds a checkpoint,
        logs parameter/gradient statistics and losses to TensorBoard, applies
        gradient clipping and norm limiting, early-stops on
        ``self._validation_metric`` after ``self._patience`` epochs without
        improvement, and steps the LR scheduler once per epoch.

        Bugfix: when the scheduler was a ``ReduceLROnPlateau`` it was stepped
        twice per epoch — once with the validation metric and then again with
        ``epoch`` passed in the metric position. The second call is now an
        ``else`` branch.
        """
        epoch_counter = 0
        # Resume from serialization path if it contains a saved model.
        if self._serialization_dir is not None:
            # Set up tensorboard logging.
            train_log = SummaryWriter(os.path.join(self._serialization_dir, "log", "train"))
            validation_log = SummaryWriter(os.path.join(self._serialization_dir, "log", "validation"))
            if any("model_state_epoch_" in x
                   for x in os.listdir(self._serialization_dir)):
                logger.info("Loading model from checkpoint.")
                epoch_counter = self._restore_checkpoint()

        if self._grad_clipping is not None:
            # Pylint is unable to tell that we're in the case that _grad_clipping is not None...
            # pylint: disable=invalid-unary-operand-type
            clip_function = lambda grad: grad.clamp(-self._grad_clipping, self._grad_clipping)
            for parameter in self._model.parameters():
                if parameter.requires_grad:
                    parameter.register_hook(clip_function)

        logger.info("Beginning training.")
        num_training_batches = self._iterator.get_num_batches(self._train_dataset)
        if self._validation_dataset is not None:
            num_validation_batches = self._iterator.get_num_batches(self._validation_dataset)
        validation_metric_per_epoch: List[float] = []
        for epoch in range(epoch_counter, self._num_epochs):
            logger.info("Epoch %d/%d", epoch + 1, self._num_epochs)
            train_loss = 0.0
            val_loss = 0.0
            # Set the model to "train" mode.
            self._model.train()
            train_generator = self._iterator(self._train_dataset, num_epochs=1)

            train_generator_tqdm = tqdm.tqdm(train_generator,
                                             disable=self._no_tqdm,
                                             total=num_training_batches)
            last_log = time.time()
            batch_num = 0
            logger.info("Training")
            for batch in train_generator_tqdm:
                batch_num += 1
                self._optimizer.zero_grad()
                output_dict = self._forward(batch, for_training=True)
                try:
                    loss = output_dict["loss"]
                    loss.backward()
                    # Make sure Variable is on the cpu before converting to numpy.
                    # .cpu() is a no-op if you aren't using GPUs.
                    train_loss += loss.data.cpu().numpy()
                except KeyError:
                    raise ConfigurationError("The model you are trying to optimize does not contain a"
                                             " 'loss' key in the output of model.forward(inputs).")

                if self._grad_norm:
                    clip_grad_norm(self._model.parameters(), self._grad_norm)
                self._optimizer.step()
                metrics = self._model.get_metrics()
                metrics["loss"] = float(train_loss / batch_num)
                description = self._description_from_metrics(metrics)
                train_generator_tqdm.set_description(description)

                # Periodically log parameter/gradient statistics to TensorBoard.
                batch_num_total = num_training_batches * epoch + batch_num
                if self._serialization_dir and batch_num_total % self._summary_interval == 0:
                    for name, param in self._model.named_parameters():
                        train_log.add_scalar("PARAMETER_MEAN/" + name, param.data.mean(), batch_num_total)
                        train_log.add_scalar("PARAMETER_STD/" + name, param.data.std(), batch_num_total)
                        if param.grad is not None:
                            train_log.add_scalar("GRAD_MEAN/" + name, param.grad.data.mean(), batch_num_total)
                            train_log.add_scalar("GRAD_STD/" + name, param.grad.data.std(), batch_num_total)
                    train_log.add_scalar("LOSS/loss_train", metrics["loss"], batch_num_total)
                if self._no_tqdm and time.time() - last_log > self._log_interval:
                    logger.info("Batch %d/%d: %s", batch_num, num_training_batches, description)
                    last_log = time.time()
            metrics = self._model.get_metrics(reset=True)
            metrics["loss"] = float(train_loss / batch_num)

            if self._validation_dataset is not None:
                logger.info("Validating")
                # Switch to evaluation mode.
                self._model.eval()
                val_generator = self._iterator(self._validation_dataset, num_epochs=1)
                val_generator_tqdm = tqdm.tqdm(val_generator,
                                               disable=self._no_tqdm,
                                               total=num_validation_batches)
                batch_num = 0
                for batch in val_generator_tqdm:
                    batch_num += 1
                    val_output_dict = self._forward(batch, for_training=False)
                    loss = val_output_dict["loss"]
                    val_loss += loss.data.cpu().numpy()
                    val_metrics = self._model.get_metrics()
                    val_metrics["loss"] = float(val_loss / batch_num)
                    description = self._description_from_metrics(val_metrics)
                    val_generator_tqdm.set_description(description)
                    if self._no_tqdm and time.time() - last_log > self._log_interval:
                        logger.info("Batch %d/%d: %s", batch_num, num_validation_batches, description)
                        last_log = time.time()
                val_metrics = self._model.get_metrics(reset=True)
                val_metrics["loss"] = float(val_loss / batch_num)
                message_template = "Training %s : %3f    Validation %s : %3f "
                for name, value in metrics.items():
                    logger.info(message_template, name, value, name, val_metrics[name])
                    if self._serialization_dir:
                        train_log.add_scalar(name, value, epoch)
                        validation_log.add_scalar(name, val_metrics[name], epoch)

                this_epoch_val_metric = val_metrics[self._validation_metric]
                if len(validation_metric_per_epoch) > self._patience:
                    # Is the worst validation performance in past self._patience
                    # epochs is better than current value?
                    if self._validation_metric_decreases:
                        should_stop = max(validation_metric_per_epoch[-self._patience:]) < this_epoch_val_metric
                    else:
                        should_stop = min(validation_metric_per_epoch[-self._patience:]) > this_epoch_val_metric
                    if should_stop:
                        logger.info("Ran out of patience.  Stopping training.")
                        break
                validation_metric_per_epoch.append(this_epoch_val_metric)

                if self._validation_metric_decreases:
                    is_best_so_far = this_epoch_val_metric == min(validation_metric_per_epoch)
                else:
                    is_best_so_far = this_epoch_val_metric == max(validation_metric_per_epoch)
                if self._serialization_dir:
                    self._save_checkpoint(epoch, is_best=is_best_so_far)

                if self._learning_rate_scheduler:
                    # Grim hack to determine whether the validation metric we are recording
                    # needs to be passed to the scheduler. This is required because the
                    # step() function of the different schedulers are (understandably)
                    # different to ReduceLROnPlateau.
                    if isinstance(self._learning_rate_scheduler,
                                  torch.optim.lr_scheduler.ReduceLROnPlateau):
                        self._learning_rate_scheduler.step(this_epoch_val_metric, epoch)
                    else:
                        # Bugfix: previously this ran unconditionally, stepping
                        # ReduceLROnPlateau a second time with `epoch` as the metric.
                        self._learning_rate_scheduler.step(epoch)
            else:
                message_template = "Training %s : %3f "
                for name, value in metrics.items():
                    logger.info(message_template, name, value)
                    if self._serialization_dir:
                        train_log.add_scalar(name, value, epoch)
                if self._serialization_dir:
                    self._save_checkpoint(epoch)
                if self._learning_rate_scheduler:
                    if isinstance(self._learning_rate_scheduler,
                                  torch.optim.lr_scheduler.ReduceLROnPlateau):
                        raise ConfigurationError("The reduce_on_plateau learning rate scheduler requires "
                                                 "a validation metric to compute the schedule and therefore "
                                                 "must be used with a validation dataset.")
                    self._learning_rate_scheduler.step(epoch)
示例#8
0
        optimizer.zero_grad()

        # Forward pass: compute the network output for this batch.
        out = net(data_batch)
        # Only the final output is needed here.
        #out = out[0]
        # Loss for this batch.
        loss_value = loss(out, labels_batch)
        # Batch accuracy.
        accuracy_value = classification_accuracy(out, labels_batch)
        # Backpropagate.
        loss_value.backward()
        # Gradients are now available, so update the weights.
        optimizer.step()
        # LOGGING: progress bar with current loss/accuracy/epoch.
        progress.update(progress.value + 1,
                        loss=loss_value.data.cpu().numpy()[0],
                        accuracy=accuracy_value,
                        epoch=i + 1)

        if j % logging_step == 0:
            # LOSS / ACCURACY scalars.
            # NOTE(review): `loss_value.data[0]` is legacy PyTorch (<0.4)
            # scalar indexing; modern versions need `.item()` — confirm the
            # installed torch version.
            writer.add_scalar('loss', loss_value.data[0], step)
            writer.add_scalar('accuracy', accuracy_value, step)
            step += 1
            # PARAMS (parameter histograms, currently disabled).
            #for name, param in net.named_parameters():
            #   writer.add_histogram(name, param.clone().cpu().data.numpy(), i*batch_number+j)

    progress.finish()
示例#9
0
# Optimizer for the backbone (feature-extractor) parameters only; the
# segmentation head is stepped via `optimizer_seg`, created elsewhere.
optimizer_feat = torch.optim.Adam(res101.parameters(), lr=1e-4)

for t in range(10):  # epochs
    for i, (img, label) in enumerate(loader):
        img = img.cuda()
        label = label[0].cuda()
        label = Variable(label)
        input = Variable(img)

        # Forward: backbone features, then segmentation head.
        feats = res101(input)
        output = seg(feats)

        # Zero both modules' gradients, backprop, and step both optimizers.
        seg.zero_grad()
        res101.zero_grad()
        loss = criterion(output, label)
        loss.backward()
        optimizer_feat.step()
        optimizer_seg.step()

        ## Visualization: grids of the input image, ground truth and prediction.
        input = make_image_grid(img, mean, std)
        label = make_label_grid(label.data)
        label = Colorize()(label).type(torch.FloatTensor)
        output = make_label_grid(torch.max(output, dim=1)[1].data)
        output = Colorize()(output).type(torch.FloatTensor)
        # NOTE(review): the step value `i` resets every epoch, so later epochs
        # overwrite earlier events in TensorBoard — confirm intent.
        writer.add_image('image', input, i)
        writer.add_image('label', label, i)
        writer.add_image('pred', output, i)
        writer.add_scalar('loss', loss.data[0], i)

        # Bugfix: converted the Python 2 `print` statement to the print()
        # function (a SyntaxError under Python 3).
        print("epoch %d step %d, loss=%.4f" % (t, i, loss.data.cpu()[0]))
示例#10
0
def train_loop(thread_id, env_name, shared_model, opt, phi, board_path):
    """Worker loop for one A3C thread: act in its own gym env, train the
    shared model, and log episode statistics to TensorBoard.

    Bugfix: ``step_last`` was only assigned at the *end* of an episode, so
    the thread-0 logging line raised UnboundLocalError when the first
    episode finished; it is now initialized to 0 up front.
    """
    logger = logging.getLogger(__name__)
    agent = Agent_a3c(shared_model=shared_model, optimizer=opt, phi=phi)
    agent.generagte_local_model(thread_id)

    done = False
    episode = 0
    r = 0
    step = 0
    step_last = 0  # bugfix: previously unassigned until after the first episode
    global_step = 0
    local_r_sum = 0
    env = gym.make(env_name)
    obs = env.reset()

    # set writer
    writer = SummaryWriter(board_path)

    while True:
        if done or step == MAX_EPISODE_LEN:
            # Episode finished (or hit the length cap): reset env and log.
            # NOTE(review): `step` is cumulative across episodes, so the
            # MAX_EPISODE_LEN equality check can fire at most once — confirm
            # whether a per-episode reset was intended.
            obs = env.reset()
            global_step += step
            if thread_id == 0:
                logger.info(
                    'episode: {}, r_sum: {}, total_step:{}, step len in episode: {}'
                    .format(episode, local_r_sum, step, step - step_last))
                if episode % EVAL_INTERVAL == 0:
                    evaluate(env, agent)
            writer.add_scalar('reward_sum_{}'.format(thread_id), local_r_sum,
                              episode)
            writer.add_scalar('V_{}'.format(thread_id),
                              agent.shared_V_out.data, episode)
            writer.add_scalar('A_{}'.format(thread_id), agent.A.data, episode)
            writer.add_scalar('loss_v_{}'.format(thread_id), agent.V_loss.data,
                              episode)
            writer.add_scalar('loss_pi_{}'.format(thread_id),
                              agent.pi_loss.data, episode)
            writer.add_scalar('loss_entropy_{}'.format(thread_id),
                              agent.entropy_loss.data, episode)
            writer.add_all_parameter_histograms([agent.pi_loss], episode)
            writer.add_all_parameter_histograms([agent.V_loss], episode)
            writer.add_all_parameter_histograms([agent.entropy_loss], episode)

            r = 0
            step_last = step
            local_r_sum = 0
            done = False
            episode += 1

        else:
            # Take one environment step with the current policy.
            a = agent.act_and_train(obs, r, is_state_terminal=done)
            obs, r, done, info = env.step(a)
            r = 0.01 * r  # reduce reward scale
            if thread_id == 0:
                logger.debug(
                    'step: {}, r: {}, a: {}, s: {}, done: {}, info: {}'.format(
                        step, r, a, obs, done, info))
            local_r_sum += r
            step += 1
示例#11
0
def main(args):
    """End-to-end training entry point for the RANet model.

    Sets up logging and seeds, builds data loaders, model, criterion and
    optimizer, trains for ``args.epochs`` epochs with per-epoch
    validation/testing, TensorBoard logging and checkpointing, then
    evaluates the best saved model.
    """
    writer = SummaryWriter(args.logs_dir)

    # Mirror stdout into a log file alongside the TensorBoard events.
    sys.stdout = Logger(osp.join(args.logs_dir, 'train_log.txt'))
    print(args)

    # Fix seeds for reproducibility; cudnn.benchmark trades determinism for speed.
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.benchmark = True

    # Create data loaders
    data_dir = osp.join(args.data_dir, args.dataset)
    dataset, num_classes, dim_featx, dim_featy, train_loader, val_loader, test_loader = \
        get_data(
            args.dataset, data_dir, args.data_type,
            args.batch_size, args.workers, args.combine_trainval,
            head_feat_dir=args.head_feat_dir,
            face_feat_dir=args.face_feat_dir,
            body_feat_dir=args.body_feat_dir,
            upperbody_feat_dir=args.upperbody_feat_dir)

    # Create model.
    # NOTE(review): the leading 4 presumably matches the four feature streams
    # (head/face/body/upperbody) passed to get_data above — confirm.
    model = RANet(4, num_features=dim_featx)
    # model = torch.nn.DataParallel(model).cuda()
    model = model.cuda()

    # load from checkpoint
    if args.resume:
        checkpoint = load_checkpoint(args.resume)
        model.load_state_dict(checkpoint['state_dict'])
        args.start_epoch = checkpoint['epoch']
        best_top1 = checkpoint['best_top1']
        print("=> start epoch {}  best top1 {:.1%}".format(
            args.start_epoch, best_top1))
    else:
        best_top1 = 0

    # Criterion: OIM loss, lookup table initialised from the train set.
    criterion = OIM4bLoss(dim_featy,
                          num_classes,
                          scalar=args.oim_scalar,
                          momentum=args.oim_momentum)
    criterion.init_lut(train_loader)
    criterion.cuda()

    # Optimizer
    if args.optimizer == 'sgd':
        param_groups = model.parameters()
        optimizer = torch.optim.SGD(param_groups,
                                    lr=args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay,
                                    nesterov=True)
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=args.lr,
                                     weight_decay=args.weight_decay)
    else:
        raise ValueError("Cannot recognize optimizer type:", args.optimizer)

    # Evaluator and Trainer
    evaluator = RAEvaluator(model)
    trainer = RATrainer(model, criterion)

    # Schedule learning rate:
    # SGD decays 10x every 20 epochs; Adam stays constant until epoch 50,
    # then decays. Per-group 'lr_mult' is respected.
    def adjust_lr(epoch):
        if args.optimizer == 'sgd':
            lr = args.lr * (0.1**(epoch // 20))
        elif args.optimizer == 'adam':
            lr = args.lr if epoch <= 50 else \
                args.lr * (0.01 ** (epoch - 50) / 30)
        else:
            raise ValueError("Cannot recognize optimizer type:",
                             args.optimizer)
        for g in optimizer.param_groups:
            g['lr'] = lr * g.get('lr_mult', 1)

    # start training — initial evaluation before any epoch; both results are
    # printed and then overwritten inside the loop.
    top1 = evaluator.evaluate(val_loader, print_summary=True)
    test_top1 = evaluator.test(test_loader,
                               dataset.gallery,
                               dataset.query,
                               print_summary=True)
    for epoch in range(args.start_epoch, args.epochs):
        adjust_lr(epoch)
        loss, prec = trainer.train(epoch,
                                   train_loader,
                                   optimizer,
                                   print_freq=1)
        writer.add_scalar('Train loss', loss, epoch + 1)
        writer.add_scalar('Train accuracy', prec, epoch + 1)

        top1 = evaluator.evaluate(val_loader, print_summary=False)
        writer.add_scalar('Val accuracy', top1, epoch + 1)
        # NOTE(review): test() is called twice with gallery/query swapped and
        # only the second result is logged — confirm this is intentional.
        test_top1 = evaluator.test(test_loader,
                                   dataset.gallery,
                                   dataset.query,
                                   print_summary=True)
        test_top1 = evaluator.test(test_loader,
                                   dataset.query,
                                   dataset.gallery,
                                   print_summary=True)
        writer.add_scalar('Test accuracy', test_top1, epoch + 1)

        # Checkpoint every epoch; tag the best model by validation top-1.
        is_best = top1 > best_top1
        best_top1 = max(top1, best_top1)
        save_checkpoint(
            {
                'state_dict': model.state_dict(),
                'epoch': epoch + 1,
                'best_top1': best_top1,
            },
            is_best,
            fpath=osp.join(args.logs_dir, 'checkpoint.pth.tar'))

        print('\n * Finished epoch {:3d}  top1: {:5.1%}  best: {:5.1%}{}\n'.
              format(epoch, top1, best_top1, ' *' if is_best else ''))

    # final test
    print('Test with best model:')
    checkpoint = load_checkpoint(osp.join(args.logs_dir, 'model_best.pth.tar'))
    model.load_state_dict(checkpoint['state_dict'])
    evaluator.test(test_loader, dataset.gallery, dataset.query)
示例#12
0
def do_training(num_epoch, optimizer, kvstore, learning_rate, model_prefix, decay):
    """Train the (module-level) mxnet `module` for `num_epoch` epochs.

    Each epoch runs a training pass and a validation pass over the
    module-level iterators, logs accuracy/loss/reconstruction error to
    TensorBoard, saves a checkpoint, and decays the learning rate.
    """
    summary_writer = SummaryWriter(args.tblog_dir)
    lr_scheduler = SimpleLRScheduler(learning_rate)
    module.init_params()
    module.init_optimizer(kvstore=kvstore,
                          optimizer=optimizer,
                          optimizer_params={'lr_scheduler': lr_scheduler})
    for n_epoch in range(num_epoch):
        train_iter.reset()
        val_iter.reset()

        # Training pass.
        loss_metric.reset()
        for n_batch, data_batch in enumerate(train_iter):
            module.forward_backward(data_batch)
            module.update()
            module.update_metric(loss_metric, data_batch.label)
            loss_metric.get_batch_log(n_batch)
        train_acc, train_loss, train_recon_err = loss_metric.get_name_value()

        # Validation pass (forward only).
        loss_metric.reset()
        for n_batch, data_batch in enumerate(val_iter):
            module.forward(data_batch)
            module.update_metric(loss_metric, data_batch.label)
            loss_metric.get_batch_log(n_batch)
        val_acc, val_loss, val_recon_err = loss_metric.get_name_value()

        epoch_scalars = (('train_acc', train_acc),
                         ('train_loss', train_loss),
                         ('train_recon_err', train_recon_err),
                         ('val_acc', val_acc),
                         ('val_loss', val_loss),
                         ('val_recon_err', val_recon_err))
        for tag, value in epoch_scalars:
            summary_writer.add_scalar(tag, value, n_epoch)

        print('Epoch[%d] train acc: %.4f loss: %.6f recon_err: %.6f' % (n_epoch, train_acc, train_loss, train_recon_err))
        print('Epoch[%d] val acc: %.4f loss: %.6f recon_err: %.6f' % (n_epoch, val_acc, val_loss, val_recon_err))
        print('SAVE CHECKPOINT')

        module.save_checkpoint(prefix=model_prefix, epoch=n_epoch)
        # Exponential LR decay for the next epoch.
        lr_scheduler.learning_rate = learning_rate * (decay ** (n_epoch + 1))
def test_log_scalar_summary():
    """Smoke-test scalar logging: write 10 increasing values under ./experiment/scalar.

    Each point is written with an explicit global step `i`; without it
    every value would be recorded at the same default step, so the
    10 points would collapse onto a single x-position in TensorBoard.
    """
    logdir = './experiment/scalar'
    writer = SummaryWriter(logdir)
    for i in range(10):
        writer.add_scalar('test_scalar', i + 1, i)
    writer.close()
# Example #14 (示例#14) — snippet separator, score 0
        ###########################
        netG.zero_grad()
        labelv = Variable(
            label.fill_(real_label))  # fake labels are real for generator cost
        output = netD(fake)
        errG = criterion(output, labelv)
        errG.backward()
        D_G_z2 = output.data.mean()
        optimizerG.step()

        print(
            '[%d/%d][%d/%d] Loss_D: %.4f Loss_G: %.4f D(x): %.4f D(G(z)): %.4f / %.4f'
            % (epoch, opt.niter, i, len(dataloader), errD.data[0],
               errG.data[0], D_x, D_G_z1, D_G_z2))
        niter = epoch * len(dataloader) + i
        writer.add_scalar('Loss/D', errD.data[0], niter)
        writer.add_scalar('Loss/G', errG.data[0], niter)
        writer.add_scalar('D(x)', D_x, niter)
        writer.add_scalar('D(G(z1))', D_G_z1, niter)
        writer.add_scalar('D(G(z2))', D_G_z2, niter)

        if i % 100 == 0:
            vutils.save_image(real_cpu,
                              '%s/real_samples.png' % opt.outf,
                              normalize=True)
            writer.add_image('real_samples',
                             vutils.make_grid(real_cpu, normalize=True), niter)
            fake = netG(fixed_noise)
            vutils.save_image(fake.data,
                              '%s/fake_samples_epoch_%03d.png' %
                              (opt.outf, epoch),
# Example #15 (示例#15) — snippet separator, score 0
    def train(self) -> None:
        """Run the main training loop.

        Trains ``self._model`` for up to ``self._num_epochs`` epochs,
        resuming from the latest checkpoint found under
        ``self._serialization_prefix`` (if any).  When a validation
        dataset is configured, validates after every epoch, applies
        early stopping based on ``self._validation_metric`` and
        ``self._patience`` (higher metric assumed better), and
        checkpoints each epoch, flagging the best one.  Metrics are
        mirrored to TensorBoard when a serialization prefix is set.
        """
        epoch_counter = 0
        # Resume from serialization path if it contains a saved model.
        if self._serialization_prefix is not None:
            # Set up tensorboard logging.
            train_log = SummaryWriter(
                os.path.join(self._serialization_prefix, "log", "train"))
            validation_log = SummaryWriter(
                os.path.join(self._serialization_prefix, "log", "validation"))
            if any([
                    "model_state_epoch_" in x
                    for x in os.listdir(self._serialization_prefix)
            ]):
                logger.info("Loading model from checkpoint.")
                epoch_counter = self._restore_checkpoint()

        if self._grad_clipping is not None:
            # Pylint is unable to tell that we're in the case that _glad_clipping is not None...
            # pylint: disable=invalid-unary-operand-type
            # Clamp each gradient element into [-clip, +clip] via a backward hook.
            clip_function = lambda grad: grad.clamp(-self._grad_clipping, self.
                                                    _grad_clipping)
            for parameter in self._model.parameters():
                if parameter.requires_grad:
                    parameter.register_hook(clip_function)

        logger.info("Beginning training.")
        num_training_batches = self._iterator.get_num_batches(
            self._train_dataset)
        if self._validation_dataset is not None:
            num_validation_batches = self._iterator.get_num_batches(
                self._validation_dataset)
        # BUG FIX: this history must persist across epochs.  It was
        # previously re-created inside the epoch loop, so the patience
        # window never accumulated: early stopping could never trigger
        # and every epoch was trivially marked "best so far".
        validation_metric_per_epoch = []  # type: List[float]
        for epoch in range(epoch_counter, self._num_epochs):
            logger.info("Epoch %d/%d", epoch + 1, self._num_epochs)
            train_loss = 0.0
            val_loss = 0.0
            # Set the model to "train" mode.
            self._model.train()
            train_generator = self._iterator(self._train_dataset, num_epochs=1)

            train_generator_tqdm = tqdm.tqdm(train_generator,
                                             total=num_training_batches)
            batch_num = 0
            for batch in train_generator_tqdm:
                batch_num += 1
                tensor_batch = arrays_to_variables(batch, self._cuda_device)
                self._optimizer.zero_grad()
                output_dict = self._model.forward(**tensor_batch)
                try:
                    loss = output_dict["loss"]
                    loss.backward()
                    # Make sure Variable is on the cpu before converting to numpy.
                    # .cpu() is a no-op if you aren't using GPUs.
                    train_loss += loss.data.cpu().numpy()
                except KeyError:
                    raise ConfigurationError(
                        "The model you are trying to optimize does not contain a"
                        " 'loss' key in the output of model.forward(inputs).")

                if self._grad_norm:
                    clip_grad_norm(self._model.parameters(), self._grad_norm)
                self._optimizer.step()
                # Show a running average of the loss next to model metrics.
                metrics = self._model.get_metrics()
                metrics["loss"] = float(train_loss / batch_num)
                train_generator_tqdm.set_description(
                    self._description_from_metrics(metrics))
            metrics = self._model.get_metrics(reset=True)
            metrics["loss"] = float(train_loss / batch_num)

            if self._validation_dataset is not None:
                # Switch to evaluation mode.
                self._model.eval()
                val_generator = self._iterator(self._validation_dataset,
                                               num_epochs=1)
                val_generator_tqdm = tqdm.tqdm(val_generator,
                                               total=num_validation_batches)
                batch_num = 0
                for batch in val_generator_tqdm:
                    batch_num += 1
                    tensor_batch = arrays_to_variables(batch,
                                                       self._cuda_device,
                                                       for_training=False)
                    val_output_dict = self._model.forward(**tensor_batch)
                    loss = val_output_dict["loss"]
                    val_loss += loss.data.cpu().numpy()
                    val_metrics = self._model.get_metrics()
                    val_metrics["loss"] = float(val_loss / batch_num)
                    val_generator_tqdm.set_description(
                        self._description_from_metrics(val_metrics))
                val_metrics = self._model.get_metrics(reset=True)
                val_metrics["loss"] = float(val_loss / batch_num)
                message_template = "Training %s : %3f    Validation %s : %3f "
                for name, value in metrics.items():
                    logger.info(message_template, name, value, name,
                                val_metrics[name])
                    if self._serialization_prefix:
                        train_log.add_scalar(name, value, epoch)
                        validation_log.add_scalar(name, val_metrics[name],
                                                  epoch)

                # Early stopping: stop when the tracked metric has not
                # improved over the best of the last `patience` epochs.
                this_epoch = val_metrics[self._validation_metric]
                if len(validation_metric_per_epoch) > self._patience:
                    if max(validation_metric_per_epoch[-self._patience:]
                           ) > this_epoch:
                        logger.info("Ran out of patience.  Stopping training.")
                        break
                validation_metric_per_epoch.append(this_epoch)
                is_best_so_far = this_epoch == max(validation_metric_per_epoch)
                if self._serialization_prefix:
                    self._save_checkpoint(epoch, is_best=is_best_so_far)
            else:
                message_template = "Training %s : %3f "
                for name, value in metrics.items():
                    logger.info(message_template, name, value)
                    if self._serialization_prefix:
                        train_log.add_scalar(name, value, epoch)
                if self._serialization_prefix:
                    self._save_checkpoint(epoch)
# Example #16 (示例#16) — snippet separator, score 0
def main():
    """Command-line entry point: train a U-net on the satellite dataset.

    Parses CLI arguments, builds dataset samplers and the network (fresh
    or restored via NManager), then runs the optimization loop on GPU,
    logging metrics/images to TensorBoard and checkpointing every
    1000 steps, with periodic validation every ``--validation_freq``
    steps.
    """
    parser = argparse.ArgumentParser(description="Train U-net")

    parser.add_argument('--name', type=str, default='unknown',
                        help='network name')

    parser.add_argument('--model_dir', type=str, required=True,
                        help='Where network will be saved and restored')

    parser.add_argument("--lr",
                        type=float,
                        default=1e-4,
                        help="Adam learning rate")

    parser.add_argument("--batch_size",
                        type=int,
                        default=5,
                        help="Batch size")

    parser.add_argument("--input_size",
                        type=int,
                        default=324,
                        help="Input size of the image will fed into network. Input_size = 16*n + 4, Default: 324")

    parser.add_argument("--output_size",
                        type=int,
                        default=116,
                        help="size of the image produced by network. Default: 116")

    parser.add_argument("--tb_log_dir",
                        type=str,
                        required=True,
                        help="Tensorboard log dir")

    parser.add_argument("--n_steps",
                        type=int,
                        default=0,
                        help="Number of the steps. Default: 0 means infinity steps.")

    parser.add_argument("--dataset_dir",
                        type=str,
                        default="../dataset/trainset")

    parser.add_argument("--pretrained_vgg",
                        type=str,
                        choices=['yes', 'no'],
                        default="yes",
                        help="Use pretrained vgg weigth")

    parser.add_argument("--fix_vgg",
                        type=str,
                        choices=['yes', 'no'],
                        default="yes",
                        help="Fix vgg weights while learning")

    parser.add_argument("--validation_freq",
                        type=int,
                        default=100,
                        help="Validation freq. Default 100")

    parser.add_argument("--validation_set_size",
                        type=int,
                        default=20,
                        help="metrics will be averaged by validation_set_size. Default 20")

    parser.add_argument("--channel",
                        type=str,
                        choices=['rgb', 'gray'],
                        default="rgb",
                        help="channel. Default: rgb")

    args = parser.parse_args()

    net_name = args.name
    model_dir = args.model_dir
    learning_rate = args.lr
    batch_size = args.batch_size
    net_input_size = args.input_size
    net_output_size = args.output_size
    tb_log_dir = args.tb_log_dir
    n_steps = args.n_steps
    dataset_dir = args.dataset_dir
    pretrained_vgg = args.pretrained_vgg == 'yes'
    fix_vgg = args.fix_vgg == 'yes'
    validation_freq = args.validation_freq
    validation_set_size = args.validation_set_size
    channel = args.channel

    print("Load dataset")
    dataset = DS.DataSet(dataset_dir)

    print("Initialize network manager")
    network_manager = NManager(model_dir, net_name)
    if network_manager.registered:
        net = network_manager.get_net()
    else:
        print("Use pretrained weihts %s" % str(pretrained_vgg))
        net = U.Unet(vgg_pretrained=pretrained_vgg)
        network_manager.register_net(net)

    print("Move to GPU")
    net.cuda()

    # Feature extraction depends on the requested channel layout: either
    # the pan-sharpened RGB bands, or the single PAN band replicated to
    # 3 channels so the network input shape is unchanged.
    if channel == "rgb":
        def get_features(x):
            return x.get_ndarray([DS.ChannelRGB_PanSharpen])
    else:
        def get_features(x):
            img0 = x.get_ndarray([DS.ChannelPAN])[0]
            img = np.array([img0, img0, img0])
            return img

    def get_target(x):
        # Target mask: building interiors.
        return x.get_interior_mask()

    train_sampler = S.Sampler(dataset.train_images(), get_features, get_target,
                                         net_input_size, net_output_size, rotate_amplitude=20,
                                         random_crop=True, reflect=True)()

    test_sampler = S.Sampler(dataset.test_images(), get_features, get_target,
                                         net_input_size, net_output_size, rotate_amplitude=20,
                                         random_crop=True, reflect=True)()

    if fix_vgg:
        # Optimize only the non-VGG parts; the VGG encoder stays frozen.
        parameters = list(net.bn.parameters()) + list(net.decoder.parameters()) + list(net.conv1x1.parameters())
    else:
        parameters = net.parameters()

    print("LR: %f" % learning_rate)
    optimizer = torch.optim.Adam(parameters, lr=learning_rate)

    logger = SummaryWriter(tb_log_dir + "/" + net_name)

    # The learning rate is constant, so compute log10(lr) once instead of
    # recomputing np.log(lr)/np.log(10) on every step of the hot loop.
    lr_log10 = np.log10(learning_rate)

    print("Start learning")
    with network_manager.session(n_steps) as (iterator, initial_step):
        for step in tqdm.tqdm(iterator, initial=initial_step):
            batch_features, batch_target = batch_generator(train_sampler, batch_size)

            batch_features = Variable(FloatTensor(batch_features)).cuda()
            batch_target = Variable(FloatTensor(batch_target)).cuda()

            predicted = net.forward(batch_features)

            train_metrics = eval_base_metrics(predicted, batch_target)
            train_metrics = eval_precision_recall_f1(**train_metrics)

            loss = train_metrics['loss']

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            log_metrics(logger, '', train_metrics, step)
            logger.add_scalar('lr', lr_log10, step)

            if step % 1000 == 0:
                network_manager.save()

            if step % validation_freq == 0:
                test_metrics = average_metrics(net, test_sampler, batch_size, validation_set_size)
                log_metrics(logger, 'val', test_metrics, step)

                avg_train_metrics = average_metrics(net, train_sampler, batch_size, validation_set_size)
                log_metrics(logger, 'avg_train', avg_train_metrics, step)

                generate_image(logger, net, 'val', dataset.test_images(), get_features, get_target,
                               net_input_size, net_output_size, step)

                generate_image(logger, net, 'train', dataset.train_images(), get_features, get_target,
                               net_input_size, net_output_size, step)
# Example #17 (示例#17) — snippet separator, score 0
def do_training(num_epoch, optimizer, kvstore, learning_rate, model_prefix,
                decay):
    """Run `num_epoch` training/validation epochs over the global `module`.

    Uses module-level globals `args`, `module`, `train_iter`, `val_iter`
    and `loss_metric`.  Metrics are written to TensorBoard under
    `args.tblog_dir`, a checkpoint is saved each epoch, and the learning
    rate decays by `decay` per epoch.
    """
    summary_writer = SummaryWriter(args.tblog_dir)
    lr_scheduler = SimpleLRScheduler(learning_rate)
    module.init_params()
    module.init_optimizer(kvstore=kvstore,
                          optimizer=optimizer,
                          optimizer_params={'lr_scheduler': lr_scheduler})

    def run_epoch(data_iter, training):
        # One full pass over `data_iter`; returns (acc, loss, recon_err).
        loss_metric.reset()
        for n_batch, data_batch in enumerate(data_iter):
            if training:
                module.forward_backward(data_batch)
                module.update()
            else:
                module.forward(data_batch)
            module.update_metric(loss_metric, data_batch.label)
            loss_metric.get_batch_log(n_batch)
        return loss_metric.get_name_value()

    n_epoch = 0
    while n_epoch < num_epoch:
        train_iter.reset()
        val_iter.reset()
        train_acc, train_loss, train_recon_err = run_epoch(train_iter, True)
        val_acc, val_loss, val_recon_err = run_epoch(val_iter, False)

        summary_writer.add_scalar('train_acc', train_acc, n_epoch)
        summary_writer.add_scalar('train_loss', train_loss, n_epoch)
        summary_writer.add_scalar('train_recon_err', train_recon_err, n_epoch)
        summary_writer.add_scalar('val_acc', val_acc, n_epoch)
        summary_writer.add_scalar('val_loss', val_loss, n_epoch)
        summary_writer.add_scalar('val_recon_err', val_recon_err, n_epoch)

        print('Epoch[%d] train acc: %.4f loss: %.6f recon_err: %.6f' %
              (n_epoch, train_acc, train_loss, train_recon_err))
        print('Epoch[%d] val acc: %.4f loss: %.6f recon_err: %.6f' %
              (n_epoch, val_acc, val_loss, val_recon_err))
        print('SAVE CHECKPOINT')

        module.save_checkpoint(prefix=model_prefix, epoch=n_epoch)
        n_epoch += 1
        # Geometric learning-rate decay for the next epoch.
        lr_scheduler.learning_rate = learning_rate * (decay ** n_epoch)
# Example #18 (示例#18) — snippet separator, score 0
def main():
    """Train DispNet and PoseExpNet jointly (SfMLearner-style).

    Parses CLI args, builds train/validation sequence loaders, creates
    (or restores) both networks, then alternates training and validation
    per epoch, logging to TensorBoard and tab-separated CSV files and
    checkpointing, flagging the model with the lowest validation
    photometric loss as best.
    """
    global args, best_photo_loss, n_iter
    args = parser.parse_args()
    # Choose the dataset class matching the on-disk layout.
    if args.dataset_format == 'stacked':
        from datasets.stacked_sequence_folders import SequenceFolder
    elif args.dataset_format == 'sequential':
        from datasets.sequence_folders import SequenceFolder
    # Checkpoints land under checkpoints/<hyperparam-summary>/<timestamp>.
    save_path = Path('{}epochs{},b{},lr{}'.format(
        args.epochs,
        ',epochSize' + str(args.epoch_size) if args.epoch_size > 0 else '',
        args.batch_size, args.lr))
    timestamp = datetime.datetime.now().strftime("%m-%d-%H:%M")
    args.save_path = 'checkpoints' / save_path / timestamp
    print('=> will save everything to {}'.format(args.save_path))
    args.save_path.makedirs_p()
    torch.manual_seed(args.seed)

    train_writer = SummaryWriter(args.save_path / 'train')
    valid_writer = SummaryWriter(args.save_path / 'valid')
    output_writers = []
    if args.log_output:
        # One extra writer per visualized validation sample.
        for i in range(3):
            output_writers.append(
                SummaryWriter(args.save_path / 'valid' / str(i)))

    # Data loading code
    normalize = custom_transforms.Normalize(mean=[0.5, 0.5, 0.5],
                                            std=[0.2, 0.2, 0.2])
    # Augmentations apply to training only; validation just tensorizes
    # and normalizes.
    input_transform = custom_transforms.Compose([
        custom_transforms.RandomHorizontalFlip(),
        custom_transforms.RandomScaleCrop(),
        custom_transforms.ArrayToTensor(), normalize
    ])

    print("=> fetching scenes in '{}'".format(args.data))
    train_set = SequenceFolder(args.data,
                               transform=input_transform,
                               seed=args.seed,
                               train=True)
    val_set = SequenceFolder(args.data,
                             transform=custom_transforms.Compose([
                                 custom_transforms.ArrayToTensor(), normalize
                             ]),
                             seed=args.seed,
                             train=False)
    print('{} samples found in {} train scenes'.format(len(train_set),
                                                       len(train_set.scenes)))
    print('{} samples found in {} valid scenes'.format(len(val_set),
                                                       len(val_set.scenes)))
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_set,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    # epoch_size == 0 means "one full pass over the training loader".
    if args.epoch_size == 0:
        args.epoch_size = len(train_loader)

    # create model
    print("=> creating model")

    disp_net = models.DispNetS().cuda()
    pose_exp_net = models.PoseExpNet(nb_ref_imgs=args.sequence_length -
                                     1).cuda()

    if args.pretrained_exp_pose:
        print("=> using pre-trained weights for explainabilty and pose net")
        a = torch.load(args.pretrained_exp_pose)
        pose_exp_net.load_state_dict(a['state_dict'])
    else:
        pose_exp_net.init_weights()

    if args.pretrained_disp:
        print("=> using pre-trained weights for Dispnet")
        a = torch.load(args.pretrained_disp)
        disp_net.load_state_dict(a['state_dict'])
    else:
        disp_net.init_weights()

    cudnn.benchmark = True
    print('=> setting adam solver')

    # One Adam optimizer over the union of both networks' parameters.
    parameters = set()
    for net_ in [disp_net, pose_exp_net]:
        parameters |= set(net_.parameters())
    optimizer = torch.optim.Adam(parameters,
                                 args.lr,
                                 betas=(args.momentum, args.beta),
                                 weight_decay=args.weight_decay)

    # 'w' mode truncates: CSV logs restart from scratch on every run.
    with open(args.save_path / args.log_summary, 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter='\t')
        writer.writerow(['train_loss', 'validation_loss'])

    with open(args.save_path / args.log_full, 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter='\t')
        writer.writerow(
            ['train_loss', 'photo_loss', 'explainability_loss', 'smooth_loss'])

    logger = TermLogger(n_epochs=args.epochs,
                        train_size=min(len(train_loader), args.epoch_size),
                        valid_size=len(val_loader))
    logger.epoch_bar.start()

    for epoch in range(args.epochs):
        logger.epoch_bar.update(epoch)

        # train for one epoch
        logger.reset_train_bar()
        train_loss = train(train_loader, disp_net, pose_exp_net, optimizer,
                           args.epoch_size, logger, train_writer)
        logger.train_writer.write(' * Avg Loss : {:.3f}'.format(train_loss))

        # evaluate on validation set
        logger.reset_valid_bar()
        valid_photo_loss, valid_exp_loss, valid_total_loss = validate(
            val_loader, disp_net, pose_exp_net, epoch, logger, output_writers)
        logger.valid_writer.write(
            ' * Avg Photo Loss : {:.3f}, Valid Loss : {:.3f}, Total Loss : {:.3f}'
            .format(valid_photo_loss, valid_exp_loss, valid_total_loss))
        valid_writer.add_scalar(
            'photometric_error', valid_photo_loss * 4, n_iter
        )  # Loss is multiplied by 4 because it's only one scale, instead of 4 during training
        valid_writer.add_scalar('explanability_loss', valid_exp_loss * 4,
                                n_iter)
        valid_writer.add_scalar('total_loss', valid_total_loss * 4, n_iter)

        # Seed the best-loss tracker on the first run (it starts < 0).
        # NOTE(review): seeding here makes is_best False on the seeding
        # epoch, so the first checkpoint is never flagged best — confirm
        # this is intended.
        if best_photo_loss < 0:
            best_photo_loss = valid_photo_loss

        # remember lowest error and save checkpoint
        is_best = valid_photo_loss < best_photo_loss
        best_photo_loss = min(valid_photo_loss, best_photo_loss)
        save_checkpoint(args.save_path, {
            'epoch': epoch + 1,
            'state_dict': disp_net.state_dict()
        }, {
            'epoch': epoch + 1,
            'state_dict': pose_exp_net.state_dict()
        }, is_best)

        # Append this epoch's summary row (file was created above in 'w' mode).
        with open(args.save_path / args.log_summary, 'a') as csvfile:
            writer = csv.writer(csvfile, delimiter='\t')
            writer.writerow([train_loss, valid_total_loss])
    logger.epoch_bar.finish()
# Example #19 (示例#19) — snippet separator, score 0
        deconv.zero_grad()
        feature.zero_grad()

        loss.backward()

        optimizer_feature.step()
        optimizer_deconv.step()

        # visulize
        image = make_image_grid(inputs.data[:, :3], mean, std)
        writer.add_image('Image', torchvision.utils.make_grid(image), ib)
        msk = functional.sigmoid(msk)
        mask1 = msk.data
        mask1 = mask1.repeat(1, 3, 1, 1)
        writer.add_image('Image2', torchvision.utils.make_grid(mask1), ib)
        print('loss: %.4f (epoch: %d, step: %d)' % (loss.data[0], it, ib))
        writer.add_scalar('M_global', loss.data[0], istep)
        istep += 1

        del inputs, msk, lbl, loss, feats, mask1, image
        gc.collect()
        if ib % 1000 == 0:
            filename = ('%s/deconv-epoch-%d-step-%d.pth' %
                        (check_root, it, ib))
            torch.save(deconv.state_dict(), filename)
            filename = ('%s/feature-epoch-%d-step-%d.pth' %
                        (check_root, it, ib))
            torch.save(feature.state_dict(), filename)
            print('save: (epoch: %d, step: %d)' % (it, ib))
    validation(val_loader, '%s/%d' % (val_output_root, it), feature, deconv)
# Example #20 (示例#20) — snippet separator, score 0
class LogMetricsCallback(object):
    """Log metrics periodically in TensorBoard.
    This callback works almost same as `callback.Speedometer`, but write TensorBoard event file
    for visualization. For more usage, please refer https://github.com/dmlc/tensorboard

    Parameters
    ----------
    logging_dir : str
        TensorBoard event file directory.
        After that, use `tensorboard --logdir=path/to/logs` to launch TensorBoard visualization.
    prefix : str
        Prefix for a metric name of `scalar` value.
        You might want to use this param to leverage TensorBoard plot feature,
        where TensorBoard plots different curves in one graph when they have same `name`.
        The follow example shows the usage(how to compare a train and eval metric in a same graph).

    Examples
    --------
    >>> # log train and eval metrics under different directories.
    >>> training_log = 'logs/train'
    >>> evaluation_log = 'logs/eval'
    >>> # in this case, each training and evaluation metric pairs has same name, you can add a prefix
    >>> # to make it separate.
    >>> batch_end_callbacks = [mx.tensorboard.LogMetricsCallback(training_log)]
    >>> eval_end_callbacks = [mx.tensorboard.LogMetricsCallback(evaluation_log)]
    >>> # run
    >>> model.fit(train,
    >>>     ...
    >>>     batch_end_callback = batch_end_callbacks,
    >>>     eval_end_callback  = eval_end_callbacks)
    >>> # Then use `tensorboard --logdir=logs/` to launch TensorBoard visualization.
    """
    def __init__(self, logging_dir, score_store=False, prefix=None):
        # prefix: optional string prepended to every scalar tag as "<prefix>-<name>".
        self.prefix = prefix
        # step: number of times __call__ has been invoked; drives the
        # every-20 / every-1000 logging cadence below.
        self.step = 0
        # score_store: when True, also log facenet score stats at image-log steps.
        self.score_store = score_store
        # NOTE(review): SummaryWriter(...) is unlikely to raise ImportError
        # here unless SummaryWriter itself performs a lazy import — if the
        # module-level tensorboard import failed, this line would raise
        # NameError instead; confirm intent against the original pattern of
        # importing SummaryWriter inside the try block.
        try:
            self.summary_writer = SummaryWriter(logging_dir)
        except ImportError:
            logging.error(
                'You can install tensorboard via `pip install tensorboard`.')

    def __call__(self, param):
        """Callback to log training speed and metrics in TensorBoard."""
        self.step += 1
        # Nothing to log without an evaluation metric on this batch.
        if param.eval_metric is None:
            return
        name_value = param.eval_metric.get_name_value()
        # Scalars every 20 invocations.
        if self.step % 20 == 0:
            for name, value in name_value:
                if self.prefix is not None:
                    name = '%s-%s' % (self.prefix, name)
                self.summary_writer.add_scalar(name, value, self.step)
        # Images (original vs. reconstruction) every 1000 invocations.
        # NOTE(review): relies on `param.locals` exposing 'data_batch',
        # 'rec_img' and (optionally) 'facenet_scores' from the training
        # loop — confirm against the caller.
        if self.step % 1000 == 0:
            im_ori = param.locals['data_batch'].label[0].asnumpy()
            im_rec = (param.locals['rec_img'])[0].asnumpy()
            im_ori = imageFromTensor(im_ori)
            im_rec = imageFromTensor(im_rec)
            self.summary_writer.add_image('im_ori', im_ori, self.step)
            self.summary_writer.add_image('im_rec', im_rec, self.step)

            if self.score_store:
                facenet_scores = param.locals['facenet_scores']
                # Mean as a scalar plus the full distribution as a histogram.
                self.summary_writer.add_scalar('scores_mean',
                                               facenet_scores.mean(),
                                               self.step)
                self.summary_writer.add_histogram('facenet_scores',
                                                  facenet_scores, self.step)
# Example #21 (示例#21) — snippet separator, score 0
def main():
    """Train and evaluate a model on CIFAR-10/100.

    Builds the data pipeline, constructs (or resumes) the model, then runs
    the train/eval loop, logging to TensorBoard and checkpointing the last
    and the best model under a timestamped ``logs/`` directory.

    Side effects: downloads the dataset into ``./data``, writes event files,
    a ``commandline_args.txt`` dump, and ``ckpt.t7`` checkpoints.
    """
    global args
    args = parser.parse_args()

    # Data preprocessing.
    print('==> Preparing data......')
    assert (args.dataset == 'cifar10' or args.dataset
            == 'cifar100'), "Only support cifar10 or cifar100 dataset"
    # The two datasets differ only in the dataset class, number of classes
    # and normalization statistics; select those once and share the rest
    # of the pipeline construction (the original duplicated ~30 lines).
    if args.dataset == 'cifar10':
        print('To train and eval on cifar10 dataset......')
        num_classes = 10
        dataset_cls = torchvision.datasets.CIFAR10
        mean, std = mean_cifar10, std_cifar10
    else:
        print('To train and eval on cifar100 dataset......')
        num_classes = 100
        dataset_cls = torchvision.datasets.CIFAR100
        mean, std = mean_cifar100, std_cifar100

    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])
    train_set = dataset_cls(root='./data',
                            train=True,
                            download=True,
                            transform=transform_train)
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=4)
    test_set = dataset_cls(root='./data',
                           train=False,
                           download=True,
                           transform=transform_test)
    test_loader = torch.utils.data.DataLoader(test_set,
                                              batch_size=100,
                                              shuffle=False,
                                              num_workers=4)

    # Model: resume from a checkpoint or build from scratch.
    best_acc = 0  # best test accuracy seen so far
    if args.resume:
        print('==> Resuming from checkpoint..')
        assert os.path.isdir(
            args.ckpt_path), 'Error: checkpoint directory not exists!'
        checkpoint = torch.load(os.path.join(args.ckpt_path, 'ckpt.t7'))
        model = checkpoint['model']
        # BUGFIX: the original reset best_acc to 0 *after* loading it here,
        # so a resumed run could overwrite a better "best" checkpoint.
        best_acc = checkpoint['best_acc']
        start_epoch = checkpoint['epoch']
    else:
        print('==> Building model..')
        model = models.__dict__[args.arch](num_classes)
        start_epoch = args.start_epoch

    print('Number of model parameters: {}'.format(
        sum(p.data.nelement() for p in model.parameters())))

    # Use GPUs if available.
    if torch.cuda.is_available():
        model.cuda()
        model = torch.nn.DataParallel(model,
                                      device_ids=range(
                                          torch.cuda.device_count()))
        cudnn.benchmark = True

    # Define loss function and optimizer.
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr,
                          momentum=args.momentum,
                          nesterov=args.nesterov,
                          weight_decay=args.weight_decay)

    log_dir = 'logs/' + datetime.now().strftime('%B%d  %H:%M:%S')
    train_writer = SummaryWriter(os.path.join(log_dir, 'train'))
    test_writer = SummaryWriter(os.path.join(log_dir, 'test'))

    # Save argparse commandline to a file.
    # NOTE(review): relies on SummaryWriter having created log_dir above.
    with open(os.path.join(log_dir, 'commandline_args.txt'), 'w') as f:
        f.write('\n'.join(sys.argv[1:]))

    for epoch in range(start_epoch, args.epochs):
        # Learning rate schedule.
        lr = adjust_learning_rate(optimizer, epoch + 1)
        train_writer.add_scalar('lr', lr, epoch)

        # Train for one epoch.
        train(train_loader, model, criterion, optimizer, train_writer, epoch)

        # Eval on test set.
        num_iter = (epoch + 1) * len(train_loader)
        acc = eval(test_loader, model, criterion, test_writer, epoch, num_iter)

        # BUGFIX: update best_acc *before* building the checkpoint state so
        # the saved 'best_acc' field is never one epoch stale.
        is_best = acc > best_acc
        if is_best:
            best_acc = acc

        # Save checkpoint.
        print('Saving Checkpoint......')
        state = {
            'model': model.module if torch.cuda.is_available() else model,
            'best_acc': best_acc,
            'epoch': epoch,
        }
        if not os.path.isdir(os.path.join(log_dir, 'last_ckpt')):
            os.mkdir(os.path.join(log_dir, 'last_ckpt'))
        torch.save(state, os.path.join(log_dir, 'last_ckpt', 'ckpt.t7'))
        if is_best:
            if not os.path.isdir(os.path.join(log_dir, 'best_ckpt')):
                os.mkdir(os.path.join(log_dir, 'best_ckpt'))
            torch.save(state, os.path.join(log_dir, 'best_ckpt', 'ckpt.t7'))

        train_writer.add_scalar('best_acc', best_acc, epoch)

    train_writer.close()
    test_writer.close()
示例#22
0
        d_real = L_Df(xR)
        d_fake = L_G#L_Df(xG)
        L_D = d_real-kt*d_fake
        L_D.backward()
        optimD.step()

        L_D_val = L_D.data[0]
        L_G_val = L_G.data[0]


        kt = kt+lamk*(opt.gamma*L_D_val-L_G_val)
        if kt<0:
            kt = 0
        M_global = L_D_val + math.fabs(opt.gamma*L_D_val-L_G_val)

        writer.add_scalar('misc/M_global', M_global, n_iter)
        writer.add_scalar('misc/kt', kt, n_iter)
        writer.add_scalar('loss/L_D', L_D_val, n_iter)
        writer.add_scalar('loss/L_G', L_G_val, n_iter)
        writer.add_scalar('loss/d_real', d_real.data[0], n_iter)
        writer.add_scalar('loss/d_fake', d_fake.data[0], n_iter)

        LD_LG = L_D_val-L_G_val
        log_variable(M_global, L_D_val, L_G_val, kt, LD_LG)
        if n_iter%10000==0:
            opt.lr = opt.lr/2
            for param_group in optimD.param_groups:
                param_group['lr'] = opt.lr#param_group['lr']/2
            for param_group in optimG.param_groups:
                param_group['lr'] = opt.lr#param_group['lr']/2
            
示例#23
0
        err_d_fake.backward()
        optimizerD.step()
        err_d = err_d_fake + err_d_real

        ############################
        # (2) Update G network: maximize log(D(G(z)))
        ###########################
        output = net_d(input_fake)
        output = F.sigmoid(output)
        label.fill_(1)

        err_g = criterion(output, Variable(label))
        net_g.zero_grad()
        err_g.backward()
        optimizerG.step()
        if i % 100 == 0:
            ##########################
            # Visualization
            ##########################
            images = make_grid((input_fake.data[:8] + 1) / 2)
            writer.add_image('images', images, i)
            writer.add_scalar('error D', err_d.data[0], i)
            writer.add_scalar('error G', err_g.data[0], i)

        print 'epoch %d step %d, err_d=%.4f, err_g=%.4f' % (
            epoch, i, err_d.data[0], err_g.data[0])
    torch.save(net_g.state_dict(),
               '%s/NetG-epoch-%d-step-%d.pth' % (check_root, epoch, i))
    torch.save(net_d.state_dict(),
               '%s/NetD-epoch-%d-step-%d.pth' % (check_root, epoch, i))
示例#24
0
import torch
import torchvision.utils as vutils
import numpy as np
import torchvision.models as models
from datetime import datetime
from tensorboard import SummaryWriter

# Demo script: exercises the main SummaryWriter channels (scalar, image,
# audio, histogram, text) with random data for 100 iterations.
resnet18 = models.resnet18(True)  # pretrained ResNet-18, used only for its parameters
writer = SummaryWriter('runs/' + datetime.now().strftime('%B%d  %H:%M:%S'))
sample_rate = 44100
freqs = [262, 294, 330, 349, 392, 440, 440, 440, 440, 440, 440]

for n_iter in range(100):
    M_global = torch.rand(1)  # value to keep
    writer.add_scalar('M_global', M_global[0], n_iter)
    batch = torch.rand(32, 3, 64, 64)  # output from network
    if n_iter % 10 != 0:
        continue
    grid = vutils.make_grid(batch, normalize=True, scale_each=True)
    writer.add_image('Image', grid, n_iter)
    # Two seconds of a pure cosine tone; sound amplitude should be in [-1, 1].
    tone = torch.zeros(sample_rate * 2)
    freq = freqs[n_iter // 10]
    for t in range(tone.size(0)):
        tone[t] = np.cos(freq * np.pi * float(t) / float(sample_rate))
    writer.add_audio('Audio', tone, n_iter)
    for name, param in resnet18.named_parameters():
        writer.add_histogram(name, param.clone().cpu().data.numpy(), n_iter)
    writer.add_text('Text', 'text logged at step:' + str(n_iter), n_iter)
    writer.add_text('another Text',
                    'another text logged at step:' + str(n_iter), n_iter)

writer.close()
示例#25
0
def do_training(args, module, data_train, data_val, begin_epoch=0):
    """Run the speech-to-text training loop with MXNet.

    Trains *module* on *data_train*, validating on *data_val* after every
    epoch, logging CER/loss to TensorBoard and saving MXNet checkpoints.

    Args:
        args: parsed arguments carrying a ConfigParser-style ``args.config``
            with 'common', 'train', 'optimizer' and 'arch' sections.
        module: an MXNet module, or a ``sym_gen`` callable when bucketing.
        data_train: resettable training data iterator.
        data_val: resettable validation data iterator.
        begin_epoch: epoch index to start counting from (default 0).
    """
    from distutils.dir_util import mkpath
    from log_util import LogUtil

    log = LogUtil().getlogger()
    # Make sure the checkpoint directory exists before any save below.
    mkpath(os.path.dirname(get_checkpoint_path(args)))

    #seq_len = args.config.get('arch', 'max_t_count')
    batch_size = args.config.getint('common', 'batch_size')
    save_checkpoint_every_n_epoch = args.config.getint(
        'common', 'save_checkpoint_every_n_epoch')
    save_checkpoint_every_n_batch = args.config.getint(
        'common', 'save_checkpoint_every_n_batch')
    enable_logging_train_metric = args.config.getboolean(
        'train', 'enable_logging_train_metric')
    enable_logging_validation_metric = args.config.getboolean(
        'train', 'enable_logging_validation_metric')

    contexts = parse_contexts(args)
    num_gpu = len(contexts)
    # Two metric instances: one for end-of-epoch validation and one for
    # in-epoch training loss (distinguished by is_epoch_end).
    eval_metric = STTMetric(batch_size=batch_size,
                            num_gpu=num_gpu,
                            is_logging=enable_logging_validation_metric,
                            is_epoch_end=True)
    # tensorboard setting
    loss_metric = STTMetric(batch_size=batch_size,
                            num_gpu=num_gpu,
                            is_logging=enable_logging_train_metric,
                            is_epoch_end=False)

    optimizer = args.config.get('optimizer', 'optimizer')  # optimizer *name*, resolved by MXNet
    learning_rate = args.config.getfloat('train', 'learning_rate')
    learning_rate_annealing = args.config.getfloat('train',
                                                   'learning_rate_annealing')

    mode = args.config.get('common', 'mode')
    num_epoch = args.config.getint('train', 'num_epoch')
    clip_gradient = args.config.getfloat('optimizer', 'clip_gradient')
    weight_decay = args.config.getfloat('optimizer', 'weight_decay')
    save_optimizer_states = args.config.getboolean('train',
                                                   'save_optimizer_states')
    show_every = args.config.getint('train', 'show_every')
    # Extra optimizer kwargs are stored as a JSON dictionary in the config.
    optimizer_params_dictionary = json.loads(
        args.config.get('optimizer', 'optimizer_params_dictionary'))
    kvstore_option = args.config.get('common', 'kvstore_option')
    n_epoch = begin_epoch
    is_bucketing = args.config.getboolean('arch', 'is_bucketing')

    # A configured clip of 0 means "no gradient clipping".
    if clip_gradient == 0:
        clip_gradient = None
    if is_bucketing and mode == 'load':
        # Rebuild a bucketing module and load weights from the checkpoint
        # named in the config; 'model_file' ends with a 4-digit epoch number.
        model_file = args.config.get('common', 'model_file')
        model_name = os.path.splitext(model_file)[0]
        model_num_epoch = int(model_name[-4:])

        model_path = 'checkpoints/' + str(model_name[:-5])
        symbol, data_names, label_names = module(1600)
        model = STTBucketingModule(
            sym_gen=module,
            default_bucket_key=data_train.default_bucket_key,
            context=contexts)
        data_train.reset()

        model.bind(data_shapes=data_train.provide_data,
                   label_shapes=data_train.provide_label,
                   for_training=True)
        _, arg_params, aux_params = mx.model.load_checkpoint(
            model_path, model_num_epoch)
        model.set_params(arg_params, aux_params)
        # From here on the loaded bucketing model replaces the passed module.
        module = model
    else:
        module.bind(data_shapes=data_train.provide_data,
                    label_shapes=data_train.provide_label,
                    for_training=True)

    if begin_epoch == 0 and mode == 'train':
        module.init_params(initializer=get_initializer(args))

    lr_scheduler = SimpleLRScheduler(learning_rate=learning_rate)

    def reset_optimizer(force_init=False):
        # (Re)create the optimizer; extra params come from the config JSON.
        optimizer_params = {
            'lr_scheduler': lr_scheduler,
            'clip_gradient': clip_gradient,
            'wd': weight_decay
        }
        optimizer_params.update(optimizer_params_dictionary)
        module.init_optimizer(kvstore=kvstore_option,
                              optimizer=optimizer,
                              optimizer_params=optimizer_params,
                              force_init=force_init)

    if mode == "train":
        reset_optimizer(force_init=True)
    else:
        reset_optimizer(force_init=False)
        data_train.reset()
        data_train.is_first_epoch = True

    #tensorboard setting
    tblog_dir = args.config.get('common', 'tensorboard_log_dir')
    summary_writer = SummaryWriter(tblog_dir)

    while True:

        if n_epoch >= num_epoch:
            break
        loss_metric.reset()
        log.info('---------train---------')
        for nbatch, data_batch in enumerate(data_train):
            module.forward_backward(data_batch)
            module.update()
            # tensorboard setting
            # Training metric is only sampled every `show_every` batches.
            if (nbatch + 1) % show_every == 0:
                module.update_metric(loss_metric, data_batch.label)
            #summary_writer.add_scalar('loss batch', loss_metric.get_batch_loss(), nbatch)
            if (nbatch + 1) % save_checkpoint_every_n_batch == 0:
                log.info('Epoch[%d] Batch[%d] SAVE CHECKPOINT', n_epoch,
                         nbatch)
                module.save_checkpoint(
                    prefix=get_checkpoint_path(args) + "n_epoch" +
                    str(n_epoch) + "n_batch",
                    epoch=(int(
                        (nbatch + 1) / save_checkpoint_every_n_batch) - 1),
                    save_optimizer_states=save_optimizer_states)
        # commented for Libri_sample data set to see only train cer
        log.info('---------validation---------')
        data_val.reset()
        eval_metric.reset()
        for nbatch, data_batch in enumerate(data_val):
            # when is_train = False it leads to high cer when batch_norm
            module.forward(data_batch, is_train=True)
            module.update_metric(eval_metric, data_batch.label)

        # tensorboard setting
        val_cer, val_n_label, val_l_dist, _ = eval_metric.get_name_value()
        log.info("Epoch[%d] val cer=%f (%d / %d)", n_epoch, val_cer,
                 int(val_n_label - val_l_dist), val_n_label)
        curr_acc = val_cer
        summary_writer.add_scalar('CER validation', val_cer, n_epoch)
        assert curr_acc is not None, 'cannot find Acc_exclude_padding in eval metric'

        data_train.reset()
        data_train.is_first_epoch = False

        # tensorboard setting
        train_cer, train_n_label, train_l_dist, train_ctc_loss = loss_metric.get_name_value(
        )
        summary_writer.add_scalar('loss epoch', train_ctc_loss, n_epoch)
        summary_writer.add_scalar('CER train', train_cer, n_epoch)

        # save checkpoints
        if n_epoch % save_checkpoint_every_n_epoch == 0:
            log.info('Epoch[%d] SAVE CHECKPOINT', n_epoch)
            module.save_checkpoint(prefix=get_checkpoint_path(args),
                                   epoch=n_epoch,
                                   save_optimizer_states=save_optimizer_states)

        n_epoch += 1

        # NOTE(review): the scheduler rate is reset from the *initial*
        # learning_rate each epoch rather than compounded, so annealing
        # divides the base rate by a constant once — confirm this is intended.
        lr_scheduler.learning_rate = learning_rate / learning_rate_annealing

    log.info('FINISH')
示例#26
0
File: main.py  Project: cfh3c/Pytorch-NCE
    if args.train:
        # At any point you can hit Ctrl + C to break out of training early.
        try:
            # Loop over epochs.
            for epoch in range(1, args.epochs + 1):
                epoch_start_time = time.time()
                train(model,
                      corpus.train,
                      lr=lr,
                      weight_decay=args.weight_decay)
                if args.prof:
                    break
                val_ppl = evaluate(model, corpus.valid)
                if args.tb_name:
                    writer.add_scalar('valid_PPL', val_ppl, epoch)
                print('-' * 89)
                print('| end of epoch {:3d} | time: {:5.2f}s |'
                      'valid ppl {:8.2f}'.format(
                          epoch, (time.time() - epoch_start_time), val_ppl))
                print('-' * 89)
                with open(args.save + '.epoch_{}'.format(epoch), 'wb') as f:
                    torch.save(model, f)
                # Save the model if the validation loss is the best we've seen so far.
                if not best_val_ppl or val_ppl < best_val_ppl:
                    with open(args.save, 'wb') as f:
                        torch.save(model, f)
                    best_val_ppl = val_ppl
                else:
                    # Anneal the learning rate if no improvement has been seen in the
                    # validation dataset.
示例#27
0
        # accuracy
        accuracy_discriminator = classification_accuracy(out_cat, labels_cat)
        # BACKPROP
        optimizer_discriminator.zero_grad()
        net.zero_grad()
        loss_discriminator.backward(retain_graph=True)
        optimizer_discriminator.step()

        # LOGGING
        progress.update(progress.value + 1,
                        loss_discriminator=loss_discriminator.data.cpu().numpy()[0],
                        accuracy_discriminator=accuracy_discriminator.data.cpu().numpy()[0],
                        )

        # LOSS ACCURACY
        writer.add_scalar('pretrain_loss_discriminator', loss_discriminator.data[0], i)
        writer.add_scalar('pretrain_accuracy_discriminator', accuracy_discriminator.data[0], i)
    progress.finish()

#print "JOINT TRAIN"
for i in xrange(num_epochs):
    progress = progressbar.ProgressBar(min_value=0, max_value=batch_number, initial_value=0, widgets=widgets).start()

    for j, (data_batch, labels_batch) in enumerate(loader):
        net.train(True)
        # REAL
        net.batch_real = True
        # trasformo in variabili
        data_batch = Variable(data_batch, requires_grad=True).cuda()
        # calcolo uscita
        out_real = net(data_batch)
                            tfboard_writer=tfboard_writer)
                else :
                    pass

                niter += 1 # global iteration counter #
                pass
            pass

        # info_header = ['set', 'loss', 'loss core', 'loss bern end', 'acc bern end']
        # info_table = []
        # logger.info("Epoch %d -- lrate %f -- time %.2fs"%(ee+1, opts['lrate'], time.time() - start_time))
        # for set_name in mloss.keys() :
            # mloss[set_name] /= mcount[set_name]
            # mloss_core[set_name] /= mcount[set_name]
            # mloss_bernend[set_name] /= mcount[set_name]
            # macc_bernend[set_name] /= mcount[set_name]
            # info_table.append([set_name, mloss[set_name], mloss_core[set_name], mloss_bernend[set_name], macc_bernend[set_name]])
        # logger.info('\n'+tab.tabulate(info_table, headers=info_header, floatfmt='.3f', tablefmt='rst'))

        # serialized best dev model #
        save_model(model_gen_a2b, 'gen_a2b', ee)
        save_model(model_gen_b2a, 'gen_b2a', ee)

        # increase step scheduler #
        scheduler_coeff_gan.step()
        if tfboard_writer is not None :
            tfboard_writer.add_scalar('coeff/coeff_gan', scheduler_coeff_gan.value, ee)
        pass

    pass
示例#29
0
def train():
    """Train a DeepLab-ResNet-DUC model on Cityscapes.

    Builds train/test/val loaders, optionally resumes from a checkpoint,
    then runs the epoch loop with gradient accumulation (`update_batches`),
    optional polynomial LR decay, TensorBoard logging and best-IoU
    checkpointing. Relies on many module-level settings (batch_size,
    num_workers, base_data_folder, opt, l_rate, resume, epochs, ...).
    """
    # Training data is read from a hard-coded cropped-Cityscapes path;
    # test/val use base_data_folder.
    loader_train = CityscapesLoader('/home/cattaneod/CITYSCAPES_crop/',
                                    split='train',
                                    is_transform=True,
                                    img_size=None,
                                    transforms=data_augmentation)
    trainloader = data.DataLoader(loader_train,
                                  batch_size=batch_size,
                                  num_workers=num_workers,
                                  shuffle=True,
                                  pin_memory=True)
    loader_test = CityscapesLoader(base_data_folder,
                                   split='test',
                                   is_transform=True,
                                   img_size=None,
                                   transforms=data_augmentation)
    test_loader = data.DataLoader(loader_test,
                                  batch_size=batch_size,
                                  num_workers=num_workers,
                                  shuffle=False,
                                  pin_memory=True)
    # Val loader also returns the original (untransformed) image for overlays.
    loader_val = CityscapesLoader(base_data_folder,
                                  split='val',
                                  is_transform=True,
                                  img_size=image_shape,
                                  return_original=True)
    valloader = data.DataLoader(loader_val,
                                batch_size=batch_size,
                                num_workers=num_workers,
                                shuffle=False,
                                pin_memory=True)

    model = deeplab_resnet_DUC.Res_Deeplab_DUC(num_classes)

    if TBWriter:
        writer = SummaryWriter()
    '''
    if resume:
        print("Loading from: ", resume_filename)
        saved_state_dict = torch.load(resume_filename)
        if num_classes != 21:
            for i in saved_state_dict:
                # Scale.layer5.conv2d_list.3.weight
                i_parts = i.split('.')
                if i_parts[1] == 'layer5':
                    saved_state_dict[i] = model.state_dict()[i]

        model.load_state_dict(saved_state_dict)
    '''

    if torch.cuda.is_available():
        print("Using GPU")
        model.cuda(0)
    else:
        print("Using CPU")

    model.train()

    # Backbone params get the base LR; the new (10x) head gets 10*l_rate.
    # With Adam the backbone LR is deliberately zeroed (0 * l_rate).
    if opt == "SGD":
        optimizer = torch.optim.SGD([{
            'params': get_1x_lr_params_NOscale(model),
            'lr': l_rate
        }, {
            'params': get_10x_lr_params(model),
            'lr': 10 * l_rate
        }],
                                    lr=l_rate,
                                    momentum=0.9,
                                    weight_decay=5e-4)
    elif opt == "Adam":
        optimizer = torch.optim.Adam([{
            'params': get_1x_lr_params_NOscale(model),
            'lr': 0 * l_rate
        }, {
            'params': get_10x_lr_params(model),
            'lr': 10 * l_rate
        }],
                                     lr=l_rate,
                                     weight_decay=5e-4)

    if resume:
        print("Resuming From ", resume_filename)
        checkpoint = torch.load(resume_filename)
        saved_state_dict = checkpoint['state_dict']
        if reset_layer5:
            # Keep freshly initialized weights for layer5 (and any key the
            # checkpoint is missing) instead of the checkpointed ones.
            for i in model.state_dict():
                # Scale.layer5.conv2d_list.3.weight
                i_parts = i.split('.')
                if i not in saved_state_dict or i_parts[1] == 'layer5':
                    saved_state_dict[i] = model.state_dict()[i]
        model.load_state_dict(saved_state_dict)
        starting_epoch = checkpoint['epoch'] + 1
        if poly_lr:
            # Fast-forward the polynomial LR schedule to the resumed epoch,
            # rebuilding the optimizer with the decayed rate.
            lr_ = poly_lr2(l_rate,
                           len(trainloader) * starting_epoch,
                           lr_decay_iter=1,
                           max_iter=len(trainloader) * epochs)
            if lr_:
                if opt == "SGD":
                    optimizer = torch.optim.SGD(
                        [{
                            'params': get_1x_lr_params_NOscale(model),
                            'lr': lr_
                        }, {
                            'params': get_10x_lr_params(model),
                            'lr': 10 * lr_
                        }],
                        lr=lr_,
                        momentum=0.9,
                        weight_decay=5e-4)
                elif opt == "Adam":
                    optimizer = torch.optim.Adam(
                        [{
                            'params': get_1x_lr_params_NOscale(model),
                            'lr': 0 * lr_
                        }, {
                            'params': get_10x_lr_params(model),
                            'lr': 10 * lr_
                        }],
                        lr=lr_,
                        weight_decay=5e-4)

    best_metric = 0
    old_file = ""
    train_acc = AverageMeter()
    train_IoU = AverageMeter()
    train_loss = AverageMeter()
    # NOTE(review): starting_epoch is only assigned above when resume is
    # True; otherwise it must come from module scope — verify it exists.
    for epoch in range(starting_epoch, epochs):
        train_acc.reset()
        train_IoU.reset()
        train_loss.reset()
        train_cfmatrix = np.zeros((num_classes, num_classes))

        print("\nEpoch: ", epoch)

        # Periodically dump prediction overlays for the first 15 val images.
        if overlay_during_training and epoch % 1 == 0:
            for i in range(15):
                print("Overlaying image ", i)
                names, original_img, test_img, _ = loader_val[i]
                test_img = test_img.unsqueeze(0)
                original_img = original_img.unsqueeze(0)
                original_img = Variable(original_img.cuda())
                model.eval()
                test_pred = model(
                    Variable(test_img.cuda(0), requires_grad=True))
                test_img = Variable(test_img.cuda(0), requires_grad=True)
                #if TBWriter and i==0:
                #    writer.add_graph(model, test_pred)
                test_pred = F.upsample_bilinear(test_pred, (1024, 2048))
                overlay_images(names,
                               original_img,
                               test_pred,
                               epoch,
                               str(i) + '_',
                               convert_id=False)
                del test_pred
                del test_img

        model.train()
        optimizer.zero_grad()
        with tqdm.tqdm(trainloader, ncols=150) as t:
            lr_ = l_rate
            for i, (images, labels) in enumerate(t):
                if torch.cuda.is_available():
                    images = Variable(images.cuda(0))
                    labels = Variable(labels.cuda(0))
                else:
                    images = Variable(images)
                    labels = Variable(labels)

                iter = len(trainloader) * epoch + i  # global step (shadows the builtin `iter`)

                outputs = model(images)
                #g = make_dot(outputs)
                #g.save('./t.dot')

                # Loss is divided by update_batches because gradients are
                # accumulated over that many batches before each step.
                loss = misc.cross_entropy2d(outputs, labels, ignore_index=255)
                loss = loss / update_batches

                loss.backward()

                t.set_description('Loss: %8.4f - LR = %f' %
                                  (update_batches * loss.data[0], lr_))

                train_loss.update(update_batches * loss.data[0])
                acc, IoU, cf_matrix = accuracy_IoU(
                    outputs, labels, np.array(range(num_classes)))
                if acc is not None:
                    train_acc.update(acc)
                    train_IoU.update(np.nanmean(IoU))
                    train_cfmatrix = train_cfmatrix + cf_matrix

                # Step (and optionally decay LR) once every update_batches.
                if i % update_batches == 0:
                    optimizer.step()
                    if poly_lr:
                        lr_ = poly_lr2(l_rate,
                                       iter,
                                       lr_decay_iter=1,
                                       max_iter=len(trainloader) * epochs)
                        if lr_:
                            t.set_description(
                                'Step: %8.4f - LR = %f' %
                                (update_batches * loss.data[0], lr_))
                            # The optimizer is rebuilt from scratch to apply
                            # the new LR (momentum buffers are discarded).
                            if opt == "SGD":
                                optimizer = torch.optim.SGD(
                                    [{
                                        'params':
                                        get_1x_lr_params_NOscale(model),
                                        'lr': lr_
                                    }, {
                                        'params': get_10x_lr_params(model),
                                        'lr': 10 * lr_
                                    }],
                                    lr=lr_,
                                    momentum=0.9,
                                    weight_decay=5e-4)
                            elif opt == "Adam":
                                optimizer = torch.optim.Adam(
                                    [{
                                        'params':
                                        get_1x_lr_params_NOscale(model),
                                        'lr': 0 * lr_
                                    }, {
                                        'params': get_10x_lr_params(model),
                                        'lr': 10 * lr_
                                    }],
                                    lr=lr_,
                                    weight_decay=5e-4)

                    #print("%8.2f %%  ->  Loss: %8.6f " % (i / len(trainloader) * 100, loss.data[0]), end='\r')
                    optimizer.zero_grad()

                if i > 0 and i % TBUpdate == 0 and TBWriter:
                    writer.add_scalar('Train Accuracy', train_acc.avg, iter)
                    writer.add_scalar('Train IoU', train_IoU.avg, iter)
                    writer.add_scalar('Train Loss', train_loss.avg, iter)

                del outputs
                del loss
                del images
                del labels

                t.update(1)

                # NOTE(review): this line sits inside the batch loop (extra
                # indent) while `cols` below is outside it; the final value
                # is the same, but the row sums are recomputed every batch.
                rows = train_cfmatrix.sum(axis=1)
        cols = train_cfmatrix.sum(axis=0)
        # Per-class IoU from the accumulated confusion matrix:
        # IoU_i = TP_i / (row_i + col_i - TP_i); NaN for absent classes.
        IoU = np.ndarray(train_cfmatrix.shape[0])
        for i in range(train_cfmatrix.shape[0]):
            if rows[i] + cols[i] > 0.:
                IoU[i] = train_cfmatrix[i][i] / (rows[i] + cols[i] -
                                                 train_cfmatrix[i][i])
            else:
                IoU[i] = np.nan
        print("\nTrain Accuracy: ", train_acc.avg)
        print("Train Loss: ", train_loss.avg)
        print("Micro IoU: ", train_IoU.avg, "\n")
        print("Macro IoU: ", np.nanmean(IoU), "\n")

        if check_validation:
            #VALIDATION!!!
            val_acc = AverageMeter()
            val_IoU = AverageMeter()
            val_loss = AverageMeter()
            val_cfmatrix = np.zeros((num_classes, num_classes))
            model.eval()
            for i, (images, labels) in enumerate(valloader):
                if torch.cuda.is_available():
                    images = Variable(images.cuda(0))
                    labels = Variable(labels.cuda(0))
                else:
                    images = Variable(images)
                    labels = Variable(labels)
                iter = len(trainloader) * epoch + i
                #poly_lr_scheduler(optimizer, l_rate, iter)

                outputs = model(images)

                loss = cross_entropy2d(outputs, labels, ignore_index=255)

                val_loss.update(loss.data[0])
                acc, IoU, cf_matrix = accuracy_IoU(
                    outputs, labels, np.array(range(num_classes)))
                if acc is not None:
                    val_acc.update(acc)
                    val_IoU.update(np.nanmean(IoU))
                    val_cfmatrix = val_cfmatrix + cf_matrix

                del outputs
                del loss
                del images
                del labels
            print("\nVal Accuracy: ", val_acc.avg)
            print("Val Loss: ", val_loss.avg)
            print("Val IoU: ", val_IoU.avg, "\n")
            if TBWriter:
                writer.add_scalar('Val Accuracy', val_acc.avg, epoch)
                writer.add_scalar('Val IoU', val_IoU.avg, epoch)
                writer.add_scalar('Val Loss', val_loss.avg, epoch)

        # Checkpoint on validation IoU when available, else training IoU.
        save_metric = train_IoU.avg
        if check_validation:
            save_metric = val_IoU.avg

        if best_metric < save_metric:
            best_metric = save_metric
            print("New Best IoU!")
            if save:
                # Keep only the newest best checkpoint; delete the previous.
                torch.save(
                    {
                        'epoch': epoch,
                        'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                    }, base_save_folder + "/checkpoint_" + str(epoch) + "_" +
                    str(save_metric) + ".pth.tar")
                print("Model Saves As " + base_save_folder + "/checkpoint_" +
                      str(epoch) + "_" + str(save_metric) + ".pth.tar")
                if os.path.isfile(old_file):
                    os.remove(old_file)
                old_file = base_save_folder + "/checkpoint_" + str(
                    epoch) + "_" + str(save_metric) + ".pth.tar"

        print("Best IoU So Far: ", best_metric)

    if TBWriter:
        writer.close()
    print("End Of Training")
示例#30
0
def train():
    """Train an FCN segmentation network on Cityscapes.

    Logs per-epoch train/validation accuracy, IoU and loss to TensorBoard,
    periodically overlays predictions on a few fixed test images, and keeps
    only the best-IoU checkpoint on disk.

    NOTE(review): relies on module-level globals (base_data_folder,
    batch_size, image_shape, num_classes, opt, l_rate, epochs,
    starting_epoch, resume, resume_filename, freeze_layers, poly_lr,
    overlay_during_training, check_validation, save, base_save_folder) --
    confirm they are all defined before calling.
    """
    # NOTE(review): this augmentation object is constructed but never used --
    # the train loader below is created with transforms=None.  Confirm
    # whether augmentation was meant to be enabled.
    data_augmentation = DataAugmentationTransform_old(translation_range=(0.0, 0.15),
                                                      rotation_range=10,
                                                      zoom_range=(0.8, 1.0),
                                                      flip_p=0.5,
                                                      brightness_range=(-0.2, 0.2),
                                                      gamma_range=(0.5, 1.5),
                                                      saturation_range=(-0.3, 0.3))
    loader_train = CityscapesLoader(base_data_folder, split='train', is_transform=True,
                                    img_size=image_shape, transforms=None)
    trainloader = data.DataLoader(loader_train, batch_size=batch_size, num_workers=4,
                                  shuffle=True, pin_memory=True)
    if overlay_during_training:
        loader_test = CityscapesLoader(base_data_folder, split='test', is_transform=True,
                                       img_size=image_shape)
        test_loader = data.DataLoader(loader_test, batch_size=batch_size, num_workers=4,
                                      shuffle=False, pin_memory=True)
    if check_validation:
        loader_val = CityscapesLoader(base_data_folder, split='val', is_transform=True,
                                      img_size=image_shape)
        valloader = data.DataLoader(loader_val, batch_size=batch_size, num_workers=4,
                                    shuffle=False, pin_memory=True)
    model = get_model('fcn1s', num_classes)

    writer = SummaryWriter()

    if resume:
        print("Resuming From ", resume_filename)
        checkpoint = torch.load(resume_filename)
        model.load_state_dict(checkpoint['state_dict'])
        # Epoch/optimizer restore intentionally disabled in the original:
        #starting_epoch = checkpoint['epoch']
        #optimizer.load_state_dict(checkpoint['optimizer'])

    for param in model.parameters():
        param.requires_grad = True

    if freeze_layers:
        # Freeze the VGG encoder blocks so only the decoder head trains.
        print("Freezing VGG layers")
        for block in (model.conv_block1, model.conv_block2, model.conv_block3,
                      model.conv_block4, model.conv_block5):
            for param in block.parameters():
                param.requires_grad = False

    if torch.cuda.is_available():
        print("Using GPU")
        model.cuda(0)
    else:
        print("Using CPU")

    model.train()

    # Only optimize parameters that were not frozen above.
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    if opt == "SGD":
        optimizer = torch.optim.SGD(parameters, lr=l_rate, momentum=0.9, weight_decay=5e-4)
    elif opt == "Adam":
        optimizer = torch.optim.Adam(parameters, lr=l_rate, weight_decay=5e-4)
    else:
        # Fail fast; previously an unknown `opt` left `optimizer` unbound and
        # crashed later with UnboundLocalError.
        raise ValueError("Unsupported optimizer: %s (expected 'SGD' or 'Adam')" % opt)

    best_metric = 0
    old_file = ""
    for epoch in range(starting_epoch, epochs):
        train_acc = 0
        train_IoU = 0
        train_loss = 0
        train_count = 0

        print("\nEpoch: ", epoch)

        if overlay_during_training and epoch % 5 == 0:
            # Render the same three test samples every 5 epochs so progress
            # is visually comparable across epochs.
            model.eval()
            for sample_idx, tag in ((67, '67_'), (88, '88_'), (175, '175_')):
                test_img = loader_test[sample_idx].unsqueeze(0)
                test_pred = model(Variable(test_img.cuda(0), requires_grad=True))
                test_img = Variable(test_img.cuda(0), requires_grad=True)
                overlay_images(test_img, test_pred, epoch, tag)
                if sample_idx == 67:
                    # Graph is only exported once (matches original behavior).
                    writer.add_graph(model, test_pred)
                del test_pred
                del test_img

        model.train()
        with tqdm.tqdm(trainloader, ncols=100) as t:
            for i, (images, labels) in enumerate(t):
                if torch.cuda.is_available():
                    images = Variable(images.cuda(0))
                    labels = Variable(labels.cuda(0))
                else:
                    images = Variable(images)
                    labels = Variable(labels)

                # Global step index (renamed from `iter`, which shadowed the
                # builtin).
                step = len(trainloader) * epoch + i
                if poly_lr:
                    poly_lr_scheduler(optimizer, l_rate, step, lr_decay_iter=10)

                optimizer.zero_grad()
                outputs = model(images)

                # 255 is the Cityscapes "ignore" label.
                loss = cross_entropy2d(outputs, labels, ignore_index=255)

                loss.backward()
                optimizer.step()

                t.set_description('Loss: %8.6f' % loss.data[0])
                t.update(1)

                train_loss = train_loss + loss.data[0]
                acc, IoU = accuracy_IoU(outputs, labels, np.array(range(num_classes)))
                train_acc = train_acc + acc
                train_IoU = train_IoU + IoU.mean()
                train_count = train_count + 1

                # Drop references promptly to reduce GPU memory pressure.
                del outputs
                del loss
                del images
                del labels

        train_acc = train_acc / train_count
        train_IoU = train_IoU / train_count
        train_loss = train_loss / train_count
        print("\nTrain Accuracy: ", train_acc)
        print("Train Loss: ", train_loss)
        print("Train IoU: ", train_IoU, "\n")
        writer.add_scalar('Train Accuracy', train_acc, epoch)
        writer.add_scalar('Train IoU', train_IoU, epoch)
        # Fixed TensorBoard tag typo (was 'Train Los').
        writer.add_scalar('Train Loss', train_loss, epoch)

        if check_validation:
            val_acc = 0
            val_IoU = 0
            val_loss = 0
            val_count = 0
            model.eval()
            for i, (images, labels) in enumerate(valloader):
                if torch.cuda.is_available():
                    images = Variable(images.cuda(0))
                    labels = Variable(labels.cuda(0))
                else:
                    images = Variable(images)
                    labels = Variable(labels)

                outputs = model(images)

                loss = cross_entropy2d(outputs, labels, ignore_index=255)

                val_loss = val_loss + loss.data[0]
                acc, IoU = accuracy_IoU(outputs, labels, np.array(range(num_classes)))
                val_acc = val_acc + acc
                val_IoU = val_IoU + IoU.mean()
                val_count = val_count + 1

                del outputs
                del loss
                del images
                del labels
            val_acc = val_acc / val_count
            val_IoU = val_IoU / val_count
            val_loss = val_loss / val_count
            print("\nVal Accuracy: ", val_acc)
            print("Val Loss: ", val_loss)
            print("Val IoU: ", val_IoU, "\n")
            writer.add_scalar('Val Accuracy', val_acc, epoch)
            writer.add_scalar('Val IoU', val_IoU, epoch)
            writer.add_scalar('Val Loss', val_loss, epoch)

        # BUGFIX: this previously read `val_IoU` unconditionally, which
        # raises NameError when check_validation is False (val_IoU is never
        # bound in that case).  Fall back to the training IoU, matching the
        # pattern used elsewhere in this file.
        save_metric = train_IoU
        if check_validation:
            save_metric = val_IoU

        if best_metric < save_metric:
            best_metric = save_metric
            print("New Best IoU!")
            if save:
                ckpt_path = (base_save_folder + "/checkpoint_" + str(epoch) +
                             "_" + str(save_metric) + ".pth.tar")
                torch.save({
                    'epoch': epoch,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                }, ckpt_path)
                print("Model Saves As " + ckpt_path)
                # Keep only the single best checkpoint on disk.
                if os.path.isfile(old_file):
                    os.remove(old_file)
                old_file = ckpt_path

        print("Best IoU So Far: ", best_metric)

    writer.close()
    print("End Of Training")
示例#31
0
        # re-parameterize
        std = logvar.mul(0.5).exp_()
        noise.resize_(bsize_now, nz).normal_(0, 1)

        output = decoder(
            (Variable(noise).mul(std).add_(mu)).view(bsize_now, nz, 1, 1))
        loss = loss_function(output, Variable(input), mu, logvar, bsize,
                             img_size)

        encoder.zero_grad()
        decoder.zero_grad()
        loss.backward()
        optimizer_de.step()
        optimizer_en.step()
        print 'epoch %d step %d, err_d=%.4f' % (epoch, i, loss.data[0])

        if i % 100 == 0:
            # ##########################
            # # Visualization
            # ##########################
            images = make_grid(output.data[:8])
            writer.add_image('output', images, i)
            images = make_grid(input[:8])
            writer.add_image('images', images, i)
            writer.add_scalar('error', loss.data[0], i)
        del mu, logvar, std, output, loss
        gc.collect()
    torch.save(decoder.state_dict(),
               '%s/decoder-epoch-%d-step-%d.pth' % (check_root, epoch, i))
    torch.save(encoder.state_dict(),
               '%s/encoder-epoch-%d-step-%d.pth' % (check_root, epoch, i))
示例#32
0
        accuracy_value = classification_accuracy(out, labels_batch)
        # BACKPROP
        #optimizer.zero_grad()
        #net.zero_grad()
        loss_value.backward()
        optimizer.step()

        # LOGGING
        progress.update(progress.value + 1,
                        loss=loss_value.data.cpu().numpy()[0],
                        accuracy=accuracy_value.data.cpu().numpy()[0],
                        epoch=i + 1)

        if j % logging_step == 0:
            # LOSS ACCURACY
            writer.add_scalar('loss', loss_value.data[0], i * batch_number + j)
            writer.add_scalar('accuracy', accuracy_value.data[0],
                              i * batch_number + j)
            # PARAMS
            for name, param in net.named_parameters():
                writer.add_histogram(name,
                                     param.clone().cpu().data.numpy(),
                                     i * batch_number + j)

        if j % logging_text_step == 0:
            net.train(False)
            # STEP
            s = "non sopporto i giocatori di biliardo, i soprannomi, gli indecisi, i no"[
                0:75]
            s_final = s
            s = numpy.asarray([
示例#33
0
def do_training(args, module, data_train, data_val, begin_epoch=0):
    """Run the train/validate loop for a CTC speech-recognition model.

    Parameters
    ----------
    args : parsed-arguments object whose ``config`` (ConfigParser-style,
        sections 'common'/'train'/'arch') holds all experiment settings.
    module : MXNet Module-like object to bind, initialize and train.
    data_train, data_val : data iterators exposing provide_data/provide_label.
    begin_epoch : int, epoch index to resume from (0 starts fresh).

    Side effects: saves checkpoints under ``get_checkpoint_path(args)`` and
    writes TensorBoard scalars (CER, CTC loss) to the configured log dir.
    """
    from distutils.dir_util import mkpath
    from log_util import LogUtil

    log = LogUtil().getlogger()
    mkpath(os.path.dirname(get_checkpoint_path(args)))

    # NOTE(review): config.get returns a string here; STTMetric presumably
    # accepts a string seq_length -- confirm.
    seq_len = args.config.get('arch', 'max_t_count')
    batch_size = args.config.getint('common', 'batch_size')
    save_checkpoint_every_n_epoch = args.config.getint('common', 'save_checkpoint_every_n_epoch')
    save_checkpoint_every_n_batch = args.config.getint('common', 'save_checkpoint_every_n_batch')
    enable_logging_train_metric = args.config.getboolean('train', 'enable_logging_train_metric')
    enable_logging_validation_metric = args.config.getboolean('train', 'enable_logging_validation_metric')

    contexts = parse_contexts(args)
    num_gpu = len(contexts)
    # Separate metric instances: one evaluated at epoch end (validation),
    # one sampled during training batches.
    eval_metric = STTMetric(batch_size=batch_size, num_gpu=num_gpu, seq_length=seq_len,
                            is_logging=enable_logging_validation_metric, is_epoch_end=True)
    loss_metric = STTMetric(batch_size=batch_size, num_gpu=num_gpu, seq_length=seq_len,
                            is_logging=enable_logging_train_metric, is_epoch_end=False)

    optimizer = args.config.get('train', 'optimizer')
    momentum = args.config.getfloat('train', 'momentum')
    learning_rate = args.config.getfloat('train', 'learning_rate')
    learning_rate_annealing = args.config.getfloat('train', 'learning_rate_annealing')

    mode = args.config.get('common', 'mode')
    num_epoch = args.config.getint('train', 'num_epoch')
    clip_gradient = args.config.getfloat('train', 'clip_gradient')
    weight_decay = args.config.getfloat('train', 'weight_decay')
    save_optimizer_states = args.config.getboolean('train', 'save_optimizer_states')
    show_every = args.config.getint('train', 'show_every')
    n_epoch = begin_epoch

    # A configured value of 0 means "no gradient clipping".
    if clip_gradient == 0:
        clip_gradient = None

    module.bind(data_shapes=data_train.provide_data,
                label_shapes=data_train.provide_label,
                for_training=True)

    # Only initialize fresh parameters when starting a new training run;
    # resumed runs keep their loaded parameters.
    if begin_epoch == 0 and mode == 'train':
        module.init_params(initializer=get_initializer(args))

    lr_scheduler = SimpleLRScheduler(learning_rate=learning_rate)

    def reset_optimizer(force_init=False):
        # (Re)create the module's optimizer; momentum only applies to SGD
        # (Adam tracks its own moment estimates).
        if optimizer == "sgd":
            module.init_optimizer(kvstore='device',
                                  optimizer=optimizer,
                                  optimizer_params={'lr_scheduler': lr_scheduler,
                                                    'momentum': momentum,
                                                    'clip_gradient': clip_gradient,
                                                    'wd': weight_decay},
                                  force_init=force_init)
        elif optimizer == "adam":
            module.init_optimizer(kvstore='device',
                                  optimizer=optimizer,
                                  optimizer_params={'lr_scheduler': lr_scheduler,
                                                    #'momentum': momentum,
                                                    'clip_gradient': clip_gradient,
                                                    'wd': weight_decay},
                                  force_init=force_init)
        else:
            raise Exception('Supported optimizers are sgd and adam. If you want to implement others define them in train.py')
    if mode == "train":
        reset_optimizer(force_init=True)
    else:
        reset_optimizer(force_init=False)

    # TensorBoard setup.
    tblog_dir = args.config.get('common', 'tensorboard_log_dir')
    summary_writer = SummaryWriter(tblog_dir)
    while True:

        if n_epoch >= num_epoch:
            break

        loss_metric.reset()
        log.info('---------train---------')
        for nbatch, data_batch in enumerate(data_train):

            module.forward_backward(data_batch)
            module.update()
            # Sample the training metric only every `show_every` batches to
            # keep overhead low.
            if (nbatch + 1) % show_every == 0:
                module.update_metric(loss_metric, data_batch.label)
            #summary_writer.add_scalar('loss batch', loss_metric.get_batch_loss(), nbatch)
            if (nbatch+1) % save_checkpoint_every_n_batch == 0:
                log.info('Epoch[%d] Batch[%d] SAVE CHECKPOINT', n_epoch, nbatch)
                module.save_checkpoint(prefix=get_checkpoint_path(args)+"n_epoch"+str(n_epoch)+"n_batch", epoch=(int((nbatch+1)/save_checkpoint_every_n_batch)-1), save_optimizer_states=save_optimizer_states)
        # commented for Libri_sample data set to see only train cer
        log.info('---------validation---------')
        data_val.reset()
        eval_metric.reset()
        for nbatch, data_batch in enumerate(data_val):
            # when is_train = False it leads to high cer when batch_norm
            module.forward(data_batch, is_train=True)
            module.update_metric(eval_metric, data_batch.label)

        val_cer, val_n_label, val_l_dist, _ = eval_metric.get_name_value()
        log.info("Epoch[%d] val cer=%f (%d / %d)", n_epoch, val_cer, int(val_n_label - val_l_dist), val_n_label)
        curr_acc = val_cer
        summary_writer.add_scalar('CER validation', val_cer, n_epoch)
        assert curr_acc is not None, 'cannot find Acc_exclude_padding in eval metric'

        data_train.reset()

        train_cer, train_n_label, train_l_dist, train_ctc_loss = loss_metric.get_name_value()
        summary_writer.add_scalar('loss epoch', train_ctc_loss, n_epoch)
        summary_writer.add_scalar('CER train', train_cer, n_epoch)

        # Save a checkpoint on the configured epoch cadence.
        if n_epoch % save_checkpoint_every_n_epoch == 0:
            log.info('Epoch[%d] SAVE CHECKPOINT', n_epoch)
            module.save_checkpoint(prefix=get_checkpoint_path(args), epoch=n_epoch, save_optimizer_states=save_optimizer_states)

        n_epoch += 1

        # BUGFIX: the learning rate is annealed geometrically each epoch.
        # Previously this assigned `learning_rate / learning_rate_annealing`
        # every epoch, which pinned the LR to one constant after the first
        # anneal instead of continuing to decay.
        lr_scheduler.learning_rate /= learning_rate_annealing

    log.info('FINISH')