def test(config):
    """Test point cloud data loader.
  """
    from torch.utils.data import DataLoader
    from lib.utils import Timer
    timer = Timer()
    DatasetClass = StanfordVoxelization2cmDataset
    transformations = [
        t.RandomHorizontalFlip(DatasetClass.ROTATION_AXIS,
                               DatasetClass.IS_TEMPORAL),
        t.ChromaticAutoContrast(),
        t.ChromaticTranslation(config.data_aug_color_trans_ratio),
        t.ChromaticJitter(config.data_aug_color_jitter_std),
        t.HueSaturationTranslation(config.data_aug_hue_max,
                                   config.data_aug_saturation_max),
    ]

    dataset = DatasetClass(config,
                           input_transform=t.Compose(transformations),
                           augment_data=True,
                           cache=True,
                           elastic_distortion=True)

    data_loader = DataLoader(
        dataset=dataset,
        collate_fn=t.cfl_collate_fn_factory(limit_numpoints=False),
        batch_size=4,
        shuffle=True)

    # Start from index 1
    iter = data_loader.__iter__()
    for i in range(100):
        timer.tic()
        data = iter.next()
        print(timer.toc())
示例#2
0
def test(config, intensity=False):
    """Test point cloud data loader.
  """
    from torch.utils.data import DataLoader
    from lib.utils import Timer
    import open3d as o3d

    def make_pcd(coords, feats):
        pcd = o3d.geometry.PointCloud()
        pcd.points = o3d.utility.Vector3dVector(coords[:, :3].float().numpy())
        pcd.colors = o3d.utility.Vector3dVector(feats[:, :3].numpy() / 255)
        if intensity:
            pcd.intensities = o3d.utility.Vector3dVector(feats[:, 3:3].numpy())
        return pcd

    timer = Timer()
    DatasetClass = FacilityArea5Dataset
    transformations = [
        t.RandomHorizontalFlip(DatasetClass.ROTATION_AXIS,
                               DatasetClass.IS_TEMPORAL),
        t.ChromaticAutoContrast(),
        t.ChromaticTranslation(config.data_aug_color_trans_ratio),
        t.ChromaticJitter(config.data_aug_color_jitter_std),
    ]

    dataset = DatasetClass(config,
                           prevoxel_transform=t.ElasticDistortion(
                               DatasetClass.ELASTIC_DISTORT_PARAMS),
                           input_transform=t.Compose(transformations),
                           augment_data=True,
                           cache=True,
                           elastic_distortion=True)

    data_loader = DataLoader(
        dataset=dataset,
        collate_fn=t.cfl_collate_fn_factory(limit_numpoints=False),
        batch_size=1,
        shuffle=True)

    # Start from index 1
    iter = data_loader.__iter__()
    for i in range(100):
        timer.tic()
        coords, feats, labels = iter.next()
        pcd = make_pcd(coords, feats)
        o3d.visualization.draw_geometries([pcd])
        print(timer.toc())
示例#3
0
    def test(self):
        from torch.utils.data import DataLoader
        from lib.utils import Timer
        from config import get_config
        config = get_config()

        dataset = SynthiaVoxelizationDataset(config)
        timer = Timer()

        data_loader = DataLoader(
            dataset=dataset,
            collate_fn=cfl_collate_fn_factory(limit_numpoints=False),
            num_workers=0,
            batch_size=4,
            shuffle=True)

        # Start from index 1
        # for i, batch in enumerate(data_loader, 1):
        iter = data_loader.__iter__()
        for i in range(100):
            timer.tic()
            batch = iter.next()
            print(batch, timer.toc())
示例#4
0
class Solver(object):
    """Solver for generic networks.
    """
    def __init__(self, net, graph, is_training):
        self.net = net
        self.graph = graph
        self.is_training = is_training
        self.num_epochs = cfg.TRAIN.NUM_EPOCHS
        self.train_timer = Timer()
        self.data_timer = Timer()

        self.global_step = slim.get_or_create_global_step()

        # Build basic ops and tensors
        if self.is_training:
            self.net.build_train_ops(self.global_step)

        if isinstance(net, Classifier):
            self.saver = tf.train.Saver(var_list=net.vars_to_restore,
                                        name='saver')
        else:
            self.saver = tf.train.Saver(max_to_keep=None,
                                        name='saver_all_var')  # Save all vars
        self.init_ops = self.build_init_ops()
        self.val_loss_ph = tf.placeholder(tf.float32,
                                          shape=(),
                                          name='val_loss_ph')
        self.net.build_summary_ops(self.graph)
        self.val_loss_summary = tf.summary.scalar(name='val_loss',
                                                  tensor=self.val_loss_ph)

        print('saver variables:')
        print_tensor_shapes(net.vars_to_restore, prefix='-->')

    def build_init_ops(self):
        """Builds the init ops.

        Returns:
            init_op: Initialization op.
            ready_op: Initialization op.
            local_init_op: Initialization op.
        """
        with tf.name_scope('init_ops'):
            init_op = tf.global_variables_initializer()
            ready_op = tf.report_uninitialized_variables()
            local_init_op = tf.group(tf.local_variables_initializer(),
                                     tf.tables_initializer())
        return init_op, ready_op, local_init_op

    def restore_checkpoint(self, sess):
        """Restores the network to a previously saved checkpoint if a path is provided from the
        config.

        Args:
            sess: Current session.
        """
        if cfg.DIR.CKPT_PATH is not None:
            tf.logging.info('Restoring checkpoint.')
            self.saver.restore(sess, cfg.DIR.CKPT_PATH)
        else:
            tf.logging.info('Using network with random weights.')

    def train_step(self, sess, train_queue, step):
        """Executes a train step, including saving the summaries if appropriate.

        Args:
            sess: Current session.
            train_queue: Data queue containing train set minibatches.
            step: The current training iteration (0-based indexing).

        Returns:
            print_dict: Dictionary of items such as losses (for just one minibatch) to print.
        """
        print_dict = self.net.train_step(sess,
                                         train_queue,
                                         step,
                                         data_timer=self.data_timer)
        return print_dict

    def val_step(self, sess, val_queue):
        """Executes a validation step, which simply computes the loss.
        Args:
            sess: Current session.
            val_queue: Data queue containing validation set minibatches.
        Returns:
            val_loss: Loss for a single minibatch of validation data.
        """
        raise NotImplementedError('Must be implemented by a subclass.')

    def validate(self, sess, val_queue, step, num_val_iter):
        raise NotImplementedError('Must be implemented by a subclass.')

    def train(self,
              train_iters_per_epoch,
              train_queue,
              val_iters_per_epoch=None,
              val_queue=None):
        """Train the network, computing the validation loss if val_iters_per_epoch and val_queue are
        provided.

        Args:
            train_iters_per_epoch: Number of iterations in a single epoch of train data, as computed
                by the data process.
            train_queue: Data queue containing minibatches of train data.
            val_iters_per_epoch: Optional input describing the number of iterations in a single
                epoch of validation data, as computed by the data process.
            val_queue: Optional input representing the data queue containing minibatches of
                validation data.
        """
        if (val_iters_per_epoch is None and val_queue is not None) or \
                (val_iters_per_epoch is not None and val_queue is None):
            raise ValueError('Need to input both val size and val queue.')
        if val_iters_per_epoch is not None and val_queue is not None:
            run_validation = True
        else:
            run_validation = False

        print('-------------- BEGIN TRAINING --------------')
        num_train_iter = get_num_iterations(train_iters_per_epoch,
                                            num_epochs=cfg.TRAIN.NUM_EPOCHS,
                                            disp=True)
        num_val_iter = 20000 // cfg.CONST.BATCH_SIZE  # Evaluate on roughly 20000 samples
        if val_iters_per_epoch is not None:
            num_val_iter = min(num_val_iter, val_iters_per_epoch)

        with tf.Session() as sess:
            sess.run(self.init_ops)
            self.restore_checkpoint(sess)

            # Train loop
            for step in range(num_train_iter):
                # For randomized model
                # self.save(sess, step)
                # break

                self.train_timer.tic()
                print_dict = self.train_step(sess, train_queue, step)
                self.train_timer.toc()

                if (step + 1) % cfg.CONST.PRINT_FREQ == 0:
                    print_dict['queue size'] = (str(train_queue.qsize()) +
                                                '/' +
                                                str(cfg.CONST.QUEUE_CAPACITY))
                    print_dict[
                        'data fetch (sec/step)'] = '%.2f' % self.data_timer.average_time
                    print_dict[
                        'train step (sec/step)'] = '%.2f' % self.train_timer.average_time
                    print_train_step_data(print_dict, step)

                    # Reset timers
                    self.data_timer.reset()
                    self.train_timer.reset()

                if (run_validation is True) and (
                    (step + 1) % cfg.TRAIN.VALIDATION_FREQ == 0):
                    validation_val = self.validate(sess, val_queue, step,
                                                   num_val_iter)
                    if validation_val == -1:  # Training termination flag
                        tf.logging.info(
                            'Terminating train loop due to decreasing validation performance.'
                        )
                        break
                    else:
                        val_summary = sess.run(
                            self.val_loss_summary,
                            feed_dict={self.val_loss_ph: validation_val})
                        self.net.summary_writer.add_summary(
                            val_summary, (step + 1))

                if (step + 1) % cfg.TRAIN.CKPT_FREQ == 0:
                    self.save(sess, step)

            # save model after training
            self.save(sess, step)

    def forward_pass_batches(self, sess, minibatch_generator):
        """Forward pass a series of minibatches from the minibatch generator.
        """
        minibatch_list = []
        outputs_list = []
        for step, minibatch in enumerate(minibatch_generator):
            np.random.seed(1234)
            try:
                outputs = self.net.forward_pass(sess, minibatch)
            except KeyError:
                outputs = self.net.forward_pass(sess,
                                                minibatch,
                                                full_feed_dict=True)
            # Reduce size of minibatch so we can pass through entire val set
            if isinstance(self.net, LBA):
                minibatch_save = {
                    'raw_embedding_batch': minibatch['raw_embedding_batch'],
                    'caption_label_batch': minibatch['caption_label_batch'],
                    'category_list': minibatch['category_list'],
                    'model_list': minibatch['model_list'],
                }
                minibatch = minibatch_save
            if isinstance(self.net, Classifier):
                minibatch_save = {
                    'class_label_batch': minibatch['class_label_batch'],
                    'model_id_list': minibatch['model_id_list'],
                }
                minibatch = minibatch_save
            minibatch_list.append(minibatch)
            outputs_list.append(outputs)

            if (step + 1) % 100 == 0:
                tf.logging.info('%s  Step: %d' %
                                (str(datetime.now()), step + 1))

        return minibatch_list, outputs_list

    def val_phase_minibatch_generator(self, val_queue, num_val_iter):
        """Return a minibatch generator for the test phase.
        """
        for step in range(num_val_iter):
            minibatch = val_queue.get()
            minibatch['test_queue'] = True
            yield minibatch

    def evaluate(self, minibatch_list, outputs_list):
        """Do some evaluation of the outputs.
        """
        pass

    def test(self,
             test_process,
             test_queue,
             num_minibatches=None,
             save_outputs=False):
        """Compute (and optionally save) the outputs for the test set. This function only computes
        the outputs for num_minibatches minibatches.

        Args:
            test_process: Data process for the test data.
            test_queue: Queue containing minibatches of test data.
            num_minibatches: Number of minibatches to compute the outputs for.
            save_outputs: Boolean flag for whether or not to save the outputs.
        """
        with tf.Session() as sess:
            if cfg.DIR.CKPT_PATH is None:
                raise ValueError('Please provide a checkpoint.')
                sess.run(self.init_ops)
            else:
                self.restore_checkpoint(sess)

            def test_phase_minibatch_generator():
                for step, minibatch in enumerate(
                        get_while_running(test_process, test_queue)):
                    if (num_minibatches is not None) and (step
                                                          == num_minibatches):
                        break
                    yield minibatch

            minibatch_generator = test_phase_minibatch_generator()
            minibatch_list, outputs_list = self.forward_pass_batches(
                sess, minibatch_generator)
            self.evaluate(minibatch_list, outputs_list)

        if save_outputs:
            self.save_outputs(minibatch_list, outputs_list)

    def save(self, sess, step):
        """Save a checkpoint.
        """
        ckpt_path = os.path.join(cfg.DIR.LOG_PATH, 'model.ckpt')
        tf.logging.info('Saving checkpoint (step %d).', (step + 1))
        self.saver.save(sess, ckpt_path, global_step=(step + 1))

    def save_outputs(self, minibatch_list, outputs_list, filename=None):
        """Save the outputs (from the self.test).
        """
        raise NotImplementedError('Must be implemented by a subclass.')
示例#5
0
    def train(self, train_loader, val_loader=None):
        ''' Given data queues, train the network '''
        # Parameter directory
        save_dir = os.path.join(cfg.DIR.OUT_PATH)
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        # Timer for the training op and parallel data loading op.
        train_timer = Timer()
        data_timer = Timer()
        training_losses = []

        # Setup learning rates
        lr_steps = [int(k) for k in cfg.TRAIN.LEARNING_RATES.keys()]

        #Setup the lr_scheduler
        self.lr_scheduler = lr_scheduler.MultiStepLR(self.optimizer,
                                                     lr_steps,
                                                     gamma=0.1)

        start_iter = 0
        # Resume training
        if cfg.TRAIN.RESUME_TRAIN:
            self.load(cfg.CONST.WEIGHTS)
            start_iter = cfg.TRAIN.INITIAL_ITERATION

        # Main training loop
        train_loader_iter = iter(train_loader)
        for train_ind in range(start_iter, cfg.TRAIN.NUM_ITERATION + 1):
            self.lr_scheduler.step()

            data_timer.tic()
            try:
                batch_img, batch_voxel = train_loader_iter.next()
            except StopIteration:
                train_loader_iter = iter(train_loader)
                batch_img, batch_voxel = train_loader_iter.next()

            data_timer.toc()

            if self.net.is_x_tensor4:
                batch_img = batch_img[0]

            # Apply one gradient step
            train_timer.tic()
            loss = self.train_loss(batch_img, batch_voxel)
            train_timer.toc()

            training_losses.append(loss.item())

            # Decrease learning rate at certain points
            if train_ind in lr_steps:
                #for pytorch optimizer, learning rate can only be set when the optimizer is created
                #or using torch.optim.lr_scheduler
                print('Learing rate decreased to %f: ' %
                      cfg.TRAIN.LEARNING_RATES[str(train_ind)])

            # Debugging modules
            #
            # Print status, run validation, check divergence, and save model.
            if train_ind % cfg.TRAIN.PRINT_FREQ == 0:
                # Print the current loss
                print('%s Iter: %d Loss: %f' %
                      (datetime.now(), train_ind, loss))

            if train_ind % cfg.TRAIN.VALIDATION_FREQ == 0 and val_loader is not None:
                # Print test loss and params to check convergence every N iterations

                val_losses = 0
                val_num_iter = min(cfg.TRAIN.NUM_VALIDATION_ITERATIONS,
                                   len(val_loader))
                val_loader_iter = iter(val_loader)
                for i in range(val_num_iter):
                    batch_img, batch_voxel = val_loader_iter.next()
                    val_loss = self.train_loss(batch_img, batch_voxel)
                    val_losses += val_loss
                var_losses_mean = val_losses / val_num_iter
                print('%s Test loss: %f' % (datetime.now(), var_losses_mean))

            if train_ind % cfg.TRAIN.NAN_CHECK_FREQ == 0:
                # Check that the network parameters are all valid
                nan_or_max_param = max_or_nan(self.net.parameters())
                if has_nan(nan_or_max_param):
                    print('NAN detected')
                    break

            if (train_ind % cfg.TRAIN.SAVE_FREQ == 0 and not train_ind == 0) or \
                    (train_ind == cfg.TRAIN.NUM_ITERATION):
                # Save the checkpoint every a few iterations or at the end.
                self.save(training_losses, save_dir, train_ind)

            #loss is a Variable containing torch.FloatTensor of size 1
            if loss.item() > cfg.TRAIN.LOSS_LIMIT:
                print("Cost exceeds the threshold. Stop training")
                break
示例#6
0
def train(model, data_loader, val_data_loader, config, transform_data_fn=None):

    device = config.device_id
    distributed = get_world_size() > 1

    # Set up the train flag for batch normalization
    model.train()

    # Configuration
    writer = SummaryWriter(log_dir=config.log_dir)
    data_timer, iter_timer = Timer(), Timer()
    fw_timer, bw_timer, ddp_timer = Timer(), Timer(), Timer()

    data_time_avg, iter_time_avg = AverageMeter(), AverageMeter()
    fw_time_avg, bw_time_avg, ddp_time_avg = AverageMeter(), AverageMeter(
    ), AverageMeter()

    losses, scores = AverageMeter(), AverageMeter()

    optimizer = initialize_optimizer(model.parameters(), config)
    scheduler = initialize_scheduler(optimizer, config)
    criterion = nn.CrossEntropyLoss(ignore_index=config.ignore_label)

    writer = SummaryWriter(log_dir=config.log_dir)

    # Train the network
    logging.info('===> Start training on {} GPUs, batch-size={}'.format(
        get_world_size(), config.batch_size * get_world_size()))
    best_val_miou, best_val_iter, curr_iter, epoch, is_training = 0, 0, 1, 1, True

    if config.resume:
        checkpoint_fn = config.resume + '/weights.pth'
        if osp.isfile(checkpoint_fn):
            logging.info("=> loading checkpoint '{}'".format(checkpoint_fn))
            state = torch.load(
                checkpoint_fn,
                map_location=lambda s, l: default_restore_location(s, 'cpu'))
            curr_iter = state['iteration'] + 1
            epoch = state['epoch']
            load_state(model, state['state_dict'])

            if config.resume_optimizer:
                scheduler = initialize_scheduler(optimizer,
                                                 config,
                                                 last_step=curr_iter)
                optimizer.load_state_dict(state['optimizer'])
            if 'best_val' in state:
                best_val_miou = state['best_val']
                best_val_iter = state['best_val_iter']
            logging.info("=> loaded checkpoint '{}' (epoch {})".format(
                checkpoint_fn, state['epoch']))
        else:
            raise ValueError(
                "=> no checkpoint found at '{}'".format(checkpoint_fn))

    data_iter = data_loader.__iter__()  # (distributed) infinite sampler
    while is_training:
        for iteration in range(len(data_loader) // config.iter_size):
            optimizer.zero_grad()
            data_time, batch_loss, batch_score = 0, 0, 0
            iter_timer.tic()

            # set random seed for every iteration for trackability
            _set_seed(config, curr_iter)

            for sub_iter in range(config.iter_size):
                # Get training data
                data_timer.tic()
                coords, input, target = data_iter.next()

                # For some networks, making the network invariant to even, odd coords is important
                coords[:, :3] += (torch.rand(3) * 100).type_as(coords)

                # Preprocess input
                color = input[:, :3].int()
                if config.normalize_color:
                    input[:, :3] = input[:, :3] / 255. - 0.5
                sinput = SparseTensor(input, coords).to(device)

                data_time += data_timer.toc(False)

                # Feed forward
                fw_timer.tic()

                inputs = (sinput, ) if config.wrapper_type == 'None' else (
                    sinput, coords, color)
                # model.initialize_coords(*init_args)
                soutput = model(*inputs)
                # The output of the network is not sorted
                target = target.long().to(device)

                loss = criterion(soutput.F, target.long())

                # Compute and accumulate gradient
                loss /= config.iter_size

                pred = get_prediction(data_loader.dataset, soutput.F, target)
                score = precision_at_one(pred, target)

                fw_timer.toc(False)
                bw_timer.tic()

                # bp the loss
                loss.backward()

                bw_timer.toc(False)

                # gather information
                logging_output = {
                    'loss': loss.item(),
                    'score': score / config.iter_size
                }

                ddp_timer.tic()
                if distributed:
                    logging_output = all_gather_list(logging_output)
                    logging_output = {
                        w: np.mean([a[w] for a in logging_output])
                        for w in logging_output[0]
                    }

                batch_loss += logging_output['loss']
                batch_score += logging_output['score']
                ddp_timer.toc(False)

            # Update number of steps
            optimizer.step()
            scheduler.step()

            data_time_avg.update(data_time)
            iter_time_avg.update(iter_timer.toc(False))
            fw_time_avg.update(fw_timer.diff)
            bw_time_avg.update(bw_timer.diff)
            ddp_time_avg.update(ddp_timer.diff)

            losses.update(batch_loss, target.size(0))
            scores.update(batch_score, target.size(0))

            if curr_iter >= config.max_iter:
                is_training = False
                break

            if curr_iter % config.stat_freq == 0 or curr_iter == 1:
                lrs = ', '.join(
                    ['{:.3e}'.format(x) for x in scheduler.get_lr()])
                debug_str = "===> Epoch[{}]({}/{}): Loss {:.4f}\tLR: {}\t".format(
                    epoch, curr_iter,
                    len(data_loader) // config.iter_size, losses.avg, lrs)
                debug_str += "Score {:.3f}\tData time: {:.4f}, Forward time: {:.4f}, Backward time: {:.4f}, DDP time: {:.4f}, Total iter time: {:.4f}".format(
                    scores.avg, data_time_avg.avg, fw_time_avg.avg,
                    bw_time_avg.avg, ddp_time_avg.avg, iter_time_avg.avg)
                logging.info(debug_str)
                # Reset timers
                data_time_avg.reset()
                iter_time_avg.reset()
                # Write logs
                writer.add_scalar('training/loss', losses.avg, curr_iter)
                writer.add_scalar('training/precision_at_1', scores.avg,
                                  curr_iter)
                writer.add_scalar('training/learning_rate',
                                  scheduler.get_lr()[0], curr_iter)
                losses.reset()
                scores.reset()

            # Save current status, save before val to prevent occational mem overflow
            if curr_iter % config.save_freq == 0:
                checkpoint(model, optimizer, epoch, curr_iter, config,
                           best_val_miou, best_val_iter)

            # Validation
            if curr_iter % config.val_freq == 0:
                val_miou = validate(model, val_data_loader, writer, curr_iter,
                                    config, transform_data_fn)
                if val_miou > best_val_miou:
                    best_val_miou = val_miou
                    best_val_iter = curr_iter
                    checkpoint(model, optimizer, epoch, curr_iter, config,
                               best_val_miou, best_val_iter, "best_val")
                logging.info("Current best mIoU: {:.3f} at iter {}".format(
                    best_val_miou, best_val_iter))

                # Recover back
                model.train()

            if curr_iter % config.empty_cache_freq == 0:
                # Clear cache
                torch.cuda.empty_cache()

            # End of iteration
            curr_iter += 1

        epoch += 1

    # Explicit memory cleanup
    if hasattr(data_iter, 'cleanup'):
        data_iter.cleanup()

    # Save the final model
    checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou,
               best_val_iter)
    val_miou = validate(model, val_data_loader, writer, curr_iter, config,
                        transform_data_fn)
    if val_miou > best_val_miou:
        best_val_miou = val_miou
        best_val_iter = curr_iter
        checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou,
                   best_val_iter, "best_val")
    logging.info("Current best mIoU: {:.3f} at iter {}".format(
        best_val_miou, best_val_iter))
示例#7
0
def test(model,
         data_loader,
         config,
         transform_data_fn=None,
         has_gt=True,
         validation=None,
         epoch=None):
    device = get_torch_device(config.is_cuda)
    dataset = data_loader.dataset
    num_labels = dataset.NUM_LABELS
    global_timer, data_timer, iter_timer = Timer(), Timer(), Timer()
    criterion = nn.CrossEntropyLoss(ignore_index=config.ignore_label)
    alpha, gamma, eps = 1, 2, 1e-6  # Focal Loss parameters
    losses, scores, ious = AverageMeter(), AverageMeter(), 0
    aps = np.zeros((0, num_labels))
    hist = np.zeros((num_labels, num_labels))

    if not config.is_train:
        checkpoint_fn = config.resume + '/weights.pth'
        if osp.isfile(checkpoint_fn):
            logging.info("=> loading checkpoint '{}'".format(checkpoint_fn))
            state = torch.load(checkpoint_fn)
            model.load_state_dict(state['state_dict'])
            logging.info("=> loaded checkpoint '{}' (epoch {})".format(
                checkpoint_fn, state['epoch']))
        else:
            raise ValueError(
                "=> no checkpoint found at '{}'".format(checkpoint_fn))
    if validation:
        logging.info('===> Start validating')
    else:
        logging.info('===> Start testing')

    global_timer.tic()
    data_iter = data_loader.__iter__()
    max_iter = len(data_loader)
    max_iter_unique = max_iter

    all_preds, all_labels, batch_losses, batch_loss = [], [], {}, 0

    # Fix batch normalization running mean and std
    model.eval()

    # Clear cache (when run in val mode, cleanup training cache)
    torch.cuda.empty_cache()

    if config.save_prediction or config.test_original_pointcloud:
        if config.save_prediction:
            save_pred_dir = config.save_pred_dir
            os.makedirs(save_pred_dir, exist_ok=True)
        else:
            save_pred_dir = tempfile.mkdtemp()
        if os.listdir(save_pred_dir):
            raise ValueError(f'Directory {save_pred_dir} not empty. '
                             'Please remove the existing prediction.')

    with torch.no_grad():
        for iteration in range(max_iter):
            data_timer.tic()
            if config.return_transformation:
                coords, input, target, transformation = data_iter.next()
            else:
                coords, input, target = data_iter.next()
                transformation = None
            data_time = data_timer.toc(False)

            # Preprocess input
            iter_timer.tic()

            if config.wrapper_type != 'None':
                color = input[:, :3].int()
            if config.normalize_color:
                input[:, :3] = input[:, :3] / 255. - 0.5
            sinput = SparseTensor(input, coords).to(device)

            # Feed forward
            inputs = (sinput, ) if config.wrapper_type == 'None' else (sinput,
                                                                       coords,
                                                                       color)
            soutput = model(*inputs)
            output = soutput.F

            pred = get_prediction(dataset, output, target).int()
            iter_time = iter_timer.toc(False)

            all_preds.append(pred.cpu().detach().numpy())
            all_labels.append(target.cpu().detach().numpy())

            if config.save_prediction or config.test_original_pointcloud:
                save_predictions(coords, pred, transformation, dataset, config,
                                 iteration, save_pred_dir)

            if has_gt:
                if config.evaluate_original_pointcloud:
                    raise NotImplementedError('pointcloud')
                    output, pred, target = permute_pointcloud(
                        coords, pointcloud, transformation, dataset.label_map,
                        output, pred)

                target_np = target.numpy()
                num_sample = target_np.shape[0]
                target = target.to(device)
                """# focal loss
        input_soft = nn.functional.softmax(output, dim=1) + eps
        focal_weight = torch.pow(-input_soft + 1., gamma)
        loss = (-alpha * focal_weight * torch.log(input_soft)).mean()"""

                loss = criterion(output, target.long())

                batch_loss += loss

                losses.update(float(loss), num_sample)
                scores.update(precision_at_one(pred, target), num_sample)
                hist += fast_hist(pred.cpu().numpy().flatten(),
                                  target_np.flatten(), num_labels)
                ious = per_class_iu(hist) * 100

                prob = torch.nn.functional.softmax(output, dim=1)
                ap = average_precision(prob.cpu().detach().numpy(), target_np)
                aps = np.vstack((aps, ap))
                # Due to heavy bias in class, there exists class with no test label at all
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore", category=RuntimeWarning)
                    ap_class = np.nanmean(aps, 0) * 100.

            if iteration % config.test_stat_freq == 0 and iteration > 0:
                preds = np.concatenate(all_preds)
                targets = np.concatenate(all_labels)
                to_ignore = [
                    i for i in range(len(targets)) if targets[i] == 255
                ]
                preds_trunc = [
                    preds[i] for i in range(len(preds)) if i not in to_ignore
                ]
                targets_trunc = [
                    targets[i] for i in range(len(targets))
                    if i not in to_ignore
                ]
                cm = confusion_matrix(targets_trunc,
                                      preds_trunc,
                                      normalize='true')
                np.savetxt(config.log_dir + '/cm_epoch_{0}.txt'.format(epoch),
                           cm)

                reordered_ious = dataset.reorder_result(ious)
                reordered_ap_class = dataset.reorder_result(ap_class)
                class_names = dataset.get_classnames()
                print_info(iteration,
                           max_iter_unique,
                           data_time,
                           iter_time,
                           has_gt,
                           losses,
                           scores,
                           reordered_ious,
                           hist,
                           reordered_ap_class,
                           class_names=class_names)

            if iteration % config.empty_cache_freq == 0:
                # Clear cache
                torch.cuda.empty_cache()

            batch_losses[epoch] = batch_loss

    global_time = global_timer.toc(False)

    reordered_ious = dataset.reorder_result(ious)
    reordered_ap_class = dataset.reorder_result(ap_class)
    class_names = dataset.get_classnames()
    print_info(iteration,
               max_iter_unique,
               data_time,
               iter_time,
               has_gt,
               losses,
               scores,
               reordered_ious,
               hist,
               reordered_ap_class,
               class_names=class_names)

    if not config.is_train:
        preds = np.concatenate(all_preds)
        targets = np.concatenate(all_labels)
        to_ignore = [i for i in range(len(targets)) if targets[i] == 255]
        preds_trunc = [
            preds[i] for i in range(len(preds)) if i not in to_ignore
        ]
        targets_trunc = [
            targets[i] for i in range(len(targets)) if i not in to_ignore
        ]
        cm = confusion_matrix(targets_trunc, preds_trunc, normalize='true')
        np.savetxt(config.log_dir + '/cm.txt', cm)

    if config.test_original_pointcloud:
        logging.info('===> Start testing on original pointcloud space.')
        dataset.test_pointcloud(save_pred_dir)
    logging.info("Finished test. Elapsed time: {:.4f}".format(global_time))

    if validation:
        loss_file_name = "/val_loss.txt"
        with open(config.log_dir + loss_file_name, 'a') as val_loss_file:
            for key in batch_losses:
                val_loss_file.writelines('{0}, {1}\n'.format(
                    batch_losses[key], key))
        val_loss_file.close()
        return losses.avg, scores.avg, np.nanmean(ap_class), np.nanmean(
            per_class_iu(hist)) * 100, batch_losses

    else:
        return losses.avg, scores.avg, np.nanmean(ap_class), np.nanmean(
            per_class_iu(hist)) * 100
示例#8
0
    def train(self, train_queue, val_queue=None):
        ''' Given data queues, train the network '''
        # Parameter directory
        save_dir = os.path.join(cfg.DIR.OUT_PATH)
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        # Timer for the training op and parallel data loading op.
        train_timer = Timer()
        data_timer = Timer()
        training_losses = []

        # Setup learning rates
        lr_steps = [int(k) for k in cfg.TRAIN.LEARNING_RATES.keys()]

        #Setup the lr_scheduler
        self.lr_scheduler = lr_scheduler.MultiStepLR(self.optimizer,
                                                     lr_steps,
                                                     gamma=0.1)  # gamma为下降系数

        start_iter = 0
        # Resume training
        if cfg.TRAIN.RESUME_TRAIN:
            self.load(cfg.CONST.WEIGHTS)
            start_iter = cfg.TRAIN.INITIAL_ITERATION

        if cfg.TRAIN.SHOW_LOSS:  # 要动态打印
            import matplotlib.pyplot as plot
            plot.figure(1, figsize=(12, 5))
            plot.ion()

        # Main training loop
        for train_ind in range(start_iter, cfg.TRAIN.NUM_ITERATION + 1):
            self.lr_scheduler.step()

            data_timer.tic()
            batch_img, batch_voxel = train_queue.get()
            data_timer.toc()

            if self.net.is_x_tensor4:
                batch_img = batch_img[0]

            # Apply one gradient step
            train_timer.tic()
            loss = self.train_loss(batch_img, batch_voxel)
            train_timer.toc()

            #             print(loss)
            # training_losses.append(loss.data)  转换为numpy数组
            # print(type(loss))
            # print(loss.data.numpy())
            # print(loss.data.numpy().shape)
            # print(type(loss.data.numpy()))
            if (torch.cuda.is_available()):
                training_losses.append(loss.cpu().data.numpy())
            else:
                training_losses.append(loss.data.numpy())

            # Decrease learning rate at certain points
            if train_ind in lr_steps:
                #for pytorch optimizer, learning rate can only be set when the optimizer is created
                #or using torch.optim.lr_scheduler
                print('Learing rate decreased to %f: ' %
                      cfg.TRAIN.LEARNING_RATES[str(train_ind)])

            # '''
            # Debugging modules
            # '''

            # Print status, run validation, check divergence, and save model.
            if train_ind % cfg.TRAIN.PRINT_FREQ == 0:  #40
                # Print the current loss
                print('%s Iter: %d Loss: %f' %
                      (datetime.now(), train_ind, loss))
                '''
                @TODO(dingyadong): loss dynamic Visualization
                '''
                # plot
                if (train_ind != 0):
                    steps = np.linspace(0,
                                        train_ind,
                                        train_ind + 1,
                                        dtype=np.float32)
                    if cfg.TRAIN.SHOW_LOSS:  # 要动态打印
                        plot.plot(steps, training_losses, 'b-')
                        plot.draw()
                # plot.pause(0.05)

            if train_ind % cfg.TRAIN.VALIDATION_FREQ == 0 and val_queue is not None:
                # Print test loss and params to check convergence every N iterations

                val_losses = 0
                for i in range(cfg.TRAIN.NUM_VALIDATION_ITERATIONS):
                    batch_img, batch_voxel = val_queue.get()
                    val_loss = self.train_loss(batch_img, batch_voxel)
                    val_losses += val_loss
                var_losses_mean = val_losses / cfg.TRAIN.NUM_VALIDATION_ITERATIONS
                print('%s Test loss: %f' % (datetime.now(), var_losses_mean))

            if train_ind % cfg.TRAIN.NAN_CHECK_FREQ == 0:
                # Check that the network parameters are all valid
                nan_or_max_param = max_or_nan(self.net.parameters())
                if has_nan(nan_or_max_param):
                    print('NAN detected')
                    break

            if train_ind % cfg.TRAIN.SAVE_FREQ == 0 and not train_ind == 0:
                self.save(training_losses, save_dir, train_ind)

            #loss is a Variable containing torch.FloatTensor of size 1
            if loss.data > cfg.TRAIN.LOSS_LIMIT:
                print("Cost exceeds the threshold. Stop training")
                break

        if cfg.TRAIN.SHOW_LOSS:  # 要动态打印ƒ
            plot.ioff()
            plot.show()
示例#9
0
def test(model, data_loader, config, transform_data_fn=None, has_gt=True):
    device = get_torch_device(config.is_cuda)
    dataset = data_loader.dataset
    num_labels = dataset.NUM_LABELS
    global_timer, data_timer, iter_timer = Timer(), Timer(), Timer()
    criterion = nn.CrossEntropyLoss(ignore_index=config.ignore_label)
    losses, scores, ious = AverageMeter(), AverageMeter(), 0
    aps = np.zeros((0, num_labels))
    hist = np.zeros((num_labels, num_labels))

    logging.info('===> Start testing')

    global_timer.tic()
    data_iter = data_loader.__iter__()
    max_iter = len(data_loader)
    max_iter_unique = max_iter

    # Fix batch normalization running mean and std
    model.eval()

    # Clear cache (when run in val mode, cleanup training cache)
    torch.cuda.empty_cache()

    if config.save_prediction or config.test_original_pointcloud:
        if config.save_prediction:
            save_pred_dir = config.save_pred_dir
            os.makedirs(save_pred_dir, exist_ok=True)
        else:
            save_pred_dir = tempfile.mkdtemp()
        if os.listdir(save_pred_dir):
            raise ValueError(f'Directory {save_pred_dir} not empty. '
                             'Please remove the existing prediction.')

    with torch.no_grad():
        for iteration in range(max_iter):
            data_timer.tic()
            if config.return_transformation:
                coords, input, target, transformation = data_iter.next()
            else:
                coords, input, target = data_iter.next()
                transformation = None
            data_time = data_timer.toc(False)

            # Preprocess input
            iter_timer.tic()

            if config.wrapper_type != 'None':
                color = input[:, :3].int()
            if config.normalize_color:
                input[:, :3] = input[:, :3] / 255. - 0.5
            sinput = SparseTensor(input, coords).to(device)

            # Feed forward
            inputs = (sinput, ) if config.wrapper_type == 'None' else (sinput,
                                                                       coords,
                                                                       color)
            soutput = model(*inputs)
            output = soutput.F

            pred = get_prediction(dataset, output, target).int()
            iter_time = iter_timer.toc(False)

            if config.save_prediction or config.test_original_pointcloud:
                save_predictions(coords, pred, transformation, dataset, config,
                                 iteration, save_pred_dir)

            if has_gt:
                if config.evaluate_original_pointcloud:
                    raise NotImplementedError('pointcloud')
                    output, pred, target = permute_pointcloud(
                        coords, pointcloud, transformation, dataset.label_map,
                        output, pred)

                target_np = target.numpy()

                num_sample = target_np.shape[0]

                target = target.to(device)

                cross_ent = criterion(output, target.long())
                losses.update(float(cross_ent), num_sample)
                scores.update(precision_at_one(pred, target), num_sample)
                hist += fast_hist(pred.cpu().numpy().flatten(),
                                  target_np.flatten(), num_labels)
                ious = per_class_iu(hist) * 100

                prob = torch.nn.functional.softmax(output, dim=1)
                ap = average_precision(prob.cpu().detach().numpy(), target_np)
                aps = np.vstack((aps, ap))
                # Due to heavy bias in class, there exists class with no test label at all
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore", category=RuntimeWarning)
                    ap_class = np.nanmean(aps, 0) * 100.

            if iteration % config.test_stat_freq == 0 and iteration > 0:
                reordered_ious = dataset.reorder_result(ious)
                reordered_ap_class = dataset.reorder_result(ap_class)
                class_names = dataset.get_classnames()
                print_info(iteration,
                           max_iter_unique,
                           data_time,
                           iter_time,
                           has_gt,
                           losses,
                           scores,
                           reordered_ious,
                           hist,
                           reordered_ap_class,
                           class_names=class_names)

            if iteration % config.empty_cache_freq == 0:
                # Clear cache
                torch.cuda.empty_cache()

    global_time = global_timer.toc(False)

    reordered_ious = dataset.reorder_result(ious)
    reordered_ap_class = dataset.reorder_result(ap_class)
    class_names = dataset.get_classnames()
    print_info(iteration,
               max_iter_unique,
               data_time,
               iter_time,
               has_gt,
               losses,
               scores,
               reordered_ious,
               hist,
               reordered_ap_class,
               class_names=class_names)

    if config.test_original_pointcloud:
        logging.info('===> Start testing on original pointcloud space.')
        dataset.test_pointcloud(save_pred_dir)

    logging.info("Finished test. Elapsed time: {:.4f}".format(global_time))

    return losses.avg, scores.avg, np.nanmean(ap_class), np.nanmean(
        per_class_iu(hist)) * 100
def train(model, data_loader, val_data_loader, config, transform_data_fn=None):
    all_losses = []
    device = get_torch_device(config.is_cuda)
    # Set up the train flag for batch normalization
    model.train()

    # Configuration
    writer = SummaryWriter(log_dir=config.log_dir)
    data_timer, iter_timer = Timer(), Timer()
    data_time_avg, iter_time_avg = AverageMeter(), AverageMeter()
    losses, scores, batch_losses = AverageMeter(), AverageMeter(), {}

    optimizer = initialize_optimizer(model.parameters(), config)
    scheduler = initialize_scheduler(optimizer, config)
    criterion = nn.CrossEntropyLoss(ignore_index=config.ignore_label)
    alpha, gamma, eps = 1, 2, 1e-6

    writer = SummaryWriter(log_dir=config.log_dir)

    # Train the network
    logging.info('===> Start training')
    best_val_miou, best_val_iter, curr_iter, epoch, is_training = 0, 0, 1, 1, True

    if config.resume:
        checkpoint_fn = config.resume + '/weights.pth'
        if osp.isfile(checkpoint_fn):
            logging.info("=> loading checkpoint '{}'".format(checkpoint_fn))
            state = torch.load(checkpoint_fn)
            curr_iter = state['iteration'] + 1
            epoch = state['epoch']
            model.load_state_dict(state['state_dict'])
            if config.resume_optimizer:
                scheduler = initialize_scheduler(optimizer,
                                                 config,
                                                 last_step=curr_iter)
                optimizer.load_state_dict(state['optimizer'])
            if 'best_val' in state:
                best_val_miou = state['best_val']
                best_val_iter = state['best_val_iter']
            logging.info("=> loaded checkpoint '{}' (epoch {})".format(
                checkpoint_fn, state['epoch']))
        else:
            raise ValueError(
                "=> no checkpoint found at '{}'".format(checkpoint_fn))

    data_iter = data_loader.__iter__()

    while is_training:
        print(
            "********************************** epoch N° {0} ************************"
            .format(epoch))
        for iteration in range(len(data_loader) // config.iter_size):
            print("####### Iteration N° {0}".format(iteration))
            optimizer.zero_grad()
            data_time, batch_loss = 0, 0
            iter_timer.tic()
            for sub_iter in range(config.iter_size):
                print("------------- Sub_iteration N° {0}".format(sub_iter))
                # Get training data
                data_timer.tic()
                coords, input, target = data_iter.next()
                print("len of coords : {0}".format(len(coords)))

                # For some networks, making the network invariant to even, odd coords is important
                coords[:, :3] += (torch.rand(3) * 100).type_as(coords)

                # Preprocess input
                color = input[:, :3].int()

                if config.normalize_color:
                    input[:, :3] = input[:, :3] / 255. - 0.5
                sinput = SparseTensor(input, coords).to(device)

                data_time += data_timer.toc(False)

                # Feed forward
                inputs = (sinput, ) if config.wrapper_type == 'None' else (
                    sinput, coords, color)
                # model.initialize_coords(*init_args)
                soutput = model(*inputs)
                # The output of the network is not sorted
                target = target.long().to(device)
                print("count of classes : {0}".format(
                    np.unique(target.cpu().numpy(), return_counts=True)))
                print("target : {0}\ntarget_len : {1}".format(
                    target, len(target)))
                print("target [0]: {0}".format(target[0]))
                input_soft = nn.functional.softmax(soutput.F, dim=1) + eps
                print("input_soft[0] : {0}".format(input_soft[0]))
                focal_weight = torch.pow(-input_soft + 1., gamma)
                print("focal_weight : {0}\nweight[0] : {1}".format(
                    focal_weight, focal_weight[0]))
                focal_loss = (-alpha * focal_weight *
                              torch.log(input_soft)).mean()
                loss = criterion(soutput.F, target.long())
                print("focal_loss :{0}\nloss : {1}".format(focal_loss, loss))

                # Compute and accumulate gradient
                loss /= config.iter_size
                #batch_loss += loss
                batch_loss += loss.item()
                print("batch_loss : {0}".format(batch_loss))
                loss.backward()

            # Update number of steps
            optimizer.step()
            scheduler.step()

            data_time_avg.update(data_time)
            iter_time_avg.update(iter_timer.toc(False))

            pred = get_prediction(data_loader.dataset, soutput.F, target)
            score = precision_at_one(pred, target)
            losses.update(batch_loss, target.size(0))
            scores.update(score, target.size(0))

            if curr_iter >= config.max_iter:
                is_training = False
                break

            if curr_iter % config.stat_freq == 0 or curr_iter == 1:
                lrs = ', '.join(
                    ['{:.3e}'.format(x) for x in scheduler.get_lr()])
                debug_str = "===> Epoch[{}]({}/{}): Loss {:.4f}\tLR: {}\t".format(
                    epoch, curr_iter,
                    len(data_loader) // config.iter_size, losses.avg, lrs)
                debug_str += "Score {:.3f}\tData time: {:.4f}, Total iter time: {:.4f}".format(
                    scores.avg, data_time_avg.avg, iter_time_avg.avg)
                logging.info(debug_str)
                # Reset timers
                data_time_avg.reset()
                iter_time_avg.reset()
                # Write logs
                writer.add_scalar('training/loss', losses.avg, curr_iter)
                writer.add_scalar('training/precision_at_1', scores.avg,
                                  curr_iter)
                writer.add_scalar('training/learning_rate',
                                  scheduler.get_lr()[0], curr_iter)
                losses.reset()
                scores.reset()

            # Save current status, save before val to prevent occational mem overflow
            if curr_iter % config.save_freq == 0:
                checkpoint(model, optimizer, epoch, curr_iter, config,
                           best_val_miou, best_val_iter)

            # Validation
            if curr_iter % config.val_freq == 0:
                val_miou, val_losses = validate(model, val_data_loader, writer,
                                                curr_iter, config,
                                                transform_data_fn, epoch)

                if val_miou > best_val_miou:
                    best_val_miou = val_miou
                    best_val_iter = curr_iter
                    checkpoint(model, optimizer, epoch, curr_iter, config,
                               best_val_miou, best_val_iter, "best_val")
                logging.info("Current best mIoU: {:.3f} at iter {}".format(
                    best_val_miou, best_val_iter))

                # Recover back
                model.train()

            if curr_iter % config.empty_cache_freq == 0:
                # Clear cache
                torch.cuda.empty_cache()

            batch_losses[epoch] = batch_loss
            # End of iteration
            curr_iter += 1
        with open(config.log_dir + "/train_loss.txt", 'a') as train_loss_log:
            train_loss_log.writelines('{0}, {1}\n'.format(
                batch_losses[epoch], epoch))
        train_loss_log.close()
        epoch += 1

    # Explicit memory cleanup
    if hasattr(data_iter, 'cleanup'):
        data_iter.cleanup()

    # Save the final model
    checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou,
               best_val_iter)
    val_miou = validate(model, val_data_loader, writer, curr_iter, config,
                        transform_data_fn, epoch)[0]
    if val_miou > best_val_miou:
        best_val_miou = val_miou
        best_val_iter = curr_iter
        checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou,
                   best_val_iter, "best_val")
    logging.info("Current best mIoU: {:.3f} at iter {}".format(
        best_val_miou, best_val_iter))
示例#11
0
def train_worker(gpu,
                 num_devices,
                 NetClass,
                 data_loader,
                 val_data_loader,
                 config,
                 transform_data_fn=None):
    if gpu is not None:
        print("Use GPU: {} for training".format(gpu))
        rank = gpu
    addr = 23491
    dist.init_process_group(backend="nccl",
                            init_method="tcp://127.0.0.1:{}".format(addr),
                            world_size=num_devices,
                            rank=rank)

    # replace with DistributedSampler
    if config.multiprocess:
        from lib.dataloader_dist import InfSampler
        sampler = InfSampler(data_loader.dataset)
        data_loader = DataLoader(dataset=data_loader.dataset,
                                 num_workers=data_loader.num_workers,
                                 batch_size=data_loader.batch_size,
                                 collate_fn=data_loader.collate_fn,
                                 worker_init_fn=data_loader.worker_init_fn,
                                 sampler=sampler)

    if data_loader.dataset.NUM_IN_CHANNEL is not None:
        num_in_channel = data_loader.dataset.NUM_IN_CHANNEL
    else:
        num_in_channel = 3
    num_labels = data_loader.dataset.NUM_LABELS

    # load model
    if config.pure_point:
        model = NetClass(num_class=config.num_labels,
                         N=config.num_points,
                         normal_channel=config.num_in_channel)
    else:
        if config.model == 'MixedTransformer':
            model = NetClass(config,
                             num_class=num_labels,
                             N=config.num_points,
                             normal_channel=num_in_channel)
        elif config.model == 'MinkowskiVoxelTransformer':
            model = NetClass(config, num_in_channel, num_labels)
        elif config.model == 'MinkowskiTransformerNet':
            model = NetClass(config, num_in_channel, num_labels)
        elif "Res" in config.model:
            model = NetClass(num_in_channel, num_labels, config)
        else:
            model = NetClass(num_in_channel, num_labels, config)

    if config.weights == 'modelzoo':
        model.preload_modelzoo()
    elif config.weights.lower() != 'none':
        state = torch.load(config.weights)
        # delete the keys containing the attn since it raises size mismatch
        d = {k: v for k, v in state['state' '_dict'].items() if 'map' not in k}
        if config.weights_for_inner_model:
            model.model.load_state_dict(d)
        else:
            if config.lenient_weight_loading:
                matched_weights = load_state_with_same_shape(
                    model, state['state_dict'])
                model_dict = model.state_dict()
                model_dict.update(matched_weights)
                model.load_state_dict(model_dict)
            else:
                model.load_state_dict(d, strict=False)

    torch.cuda.set_device(gpu)
    model.cuda(gpu)
    # use model with DDP
    model = torch.nn.parallel.DistributedDataParallel(
        model, device_ids=[gpu], find_unused_parameters=False)
    # Synchronized batch norm
    model = ME.MinkowskiSyncBatchNorm.convert_sync_batchnorm(model)

    # Set up the train flag for batch normalization
    model.train()

    # Configuration
    data_timer, iter_timer = Timer(), Timer()
    data_time_avg, iter_time_avg = AverageMeter(), AverageMeter()
    regs, losses, scores = AverageMeter(), AverageMeter(), AverageMeter()

    optimizer = initialize_optimizer(model.parameters(), config)
    scheduler = initialize_scheduler(optimizer, config)
    criterion = nn.CrossEntropyLoss(ignore_index=config.ignore_label)

    # Train the network
    if rank == 0:
        setup_logger(config)
        logging.info('===> Start training')

    best_val_miou, best_val_iter, curr_iter, epoch, is_training = 0, 0, 1, 1, True

    if config.resume:
        # Test loaded ckpt first
        v_loss, v_score, v_mAP, v_mIoU = test(model, val_data_loader, config)

        checkpoint_fn = config.resume + '/weights.pth'
        if osp.isfile(checkpoint_fn):
            logging.info("=> loading checkpoint '{}'".format(checkpoint_fn))
            state = torch.load(checkpoint_fn)
            curr_iter = state['iteration'] + 1
            epoch = state['epoch']
            # we skip attention maps because the shape won't match because voxel number is different
            # e.g. copyting a param with shape (23385, 8, 4) to (43529, 8, 4)
            d = {
                k: v
                for k, v in state['state_dict'].items() if 'map' not in k
            }
            # handle those attn maps we don't load from saved dict
            for k in model.state_dict().keys():
                if k in d.keys(): continue
                d[k] = model.state_dict()[k]
            model.load_state_dict(d)
            if config.resume_optimizer:
                scheduler = initialize_scheduler(optimizer,
                                                 config,
                                                 last_step=curr_iter)
                optimizer.load_state_dict(state['optimizer'])
            if 'best_val' in state:
                best_val_miou = state['best_val']
                best_val_iter = state['best_val_iter']
            logging.info("=> loaded checkpoint '{}' (epoch {})".format(
                checkpoint_fn, state['epoch']))
        else:
            raise ValueError(
                "=> no checkpoint found at '{}'".format(checkpoint_fn))

    data_iter = data_loader.__iter__()
    device = gpu  # multitrain fed in the device
    if config.dataset == "SemanticKITTI":
        num_class = 19
        config.normalize_color = False
        config.xyz_input = False
        val_freq_ = config.val_freq
        config.val_freq = config.val_freq * 10  # origianl val_freq_
    elif config.dataset == 'S3DIS':
        num_class = 13
        config.normalize_color = False
        config.xyz_input = False
        val_freq_ = config.val_freq
    elif config.dataset == "Nuscenes":
        num_class = 16
        config.normalize_color = False
        config.xyz_input = False
        val_freq_ = config.val_freq
        config.val_freq = config.val_freq * 50
    else:
        val_freq_ = config.val_freq
        num_class = 20

    while is_training:

        total_correct_class = torch.zeros(num_class, device=device)
        total_iou_deno_class = torch.zeros(num_class, device=device)

        for iteration in range(len(data_loader) // config.iter_size):

            optimizer.zero_grad()
            data_time, batch_loss = 0, 0
            iter_timer.tic()

            if curr_iter >= config.max_iter:
                # if curr_iter >= max(config.max_iter, config.epochs*(len(data_loader) // config.iter_size):
                is_training = False
                break
            elif curr_iter >= config.max_iter * (2 / 3):
                config.val_freq = val_freq_ * 2  # valid more freq on lower half

            for sub_iter in range(config.iter_size):

                # Get training data
                data_timer.tic()
                if config.return_transformation:
                    coords, input, target, _, _, pointcloud, transformation = data_iter.next(
                    )
                else:
                    coords, input, target, _, _ = data_iter.next(
                    )  # ignore unique_map and inverse_map

                if config.use_aux:
                    assert target.shape[1] == 2
                    aux = target[:, 1]
                    target = target[:, 0]
                else:
                    aux = None

                # For some networks, making the network invariant to even, odd coords is important
                coords[:, 1:] += (torch.rand(3) * 100).type_as(coords)

                # Preprocess input
                if config.normalize_color:
                    input[:, :3] = input[:, :3] / input[:, :3].max() - 0.5
                    coords_norm = coords[:, 1:] / coords[:, 1:].max() - 0.5

                # cat xyz into the rgb feature
                if config.xyz_input:
                    input = torch.cat([coords_norm, input], dim=1)
                # print(device)

                sinput = SparseTensor(input, coords, device=device)

                # d = {}
                # d['coord'] = sinput.C
                # d['feat'] = sinput.F
                # torch.save(d, 'voxel.pth')
                # import ipdb; ipdb.set_trace()

                data_time += data_timer.toc(False)
                # model.initialize_coords(*init_args)
                if aux is not None:
                    soutput = model(sinput, aux)
                elif config.enable_point_branch:
                    soutput = model(sinput,
                                    iter_=curr_iter / config.max_iter,
                                    enable_point_branch=True)
                else:
                    soutput = model(
                        sinput, iter_=curr_iter / config.max_iter
                    )  # feed in the progress of training for annealing inside the model
                    # soutput = model(sinput)
                # The output of the network is not sorted
                target = target.view(-1).long().to(device)

                loss = criterion(soutput.F, target.long())

                # ====== other loss regs =====
                cur_loss = torch.tensor([0.], device=device)
                if hasattr(model, 'module.block1'):
                    cur_loss = torch.tensor([0.], device=device)

                    if hasattr(model.module.block1[0], 'vq_loss'):
                        if model.block1[0].vq_loss is not None:
                            cur_loss = torch.tensor([0.], device=device)
                            for n, m in model.named_children():
                                if 'block' in n:
                                    cur_loss += m[
                                        0].vq_loss  # m is the nn.Sequential obj, m[0] is the TRBlock
                            logging.info(
                                'Cur Loss: {}, Cur vq_loss: {}'.format(
                                    loss, cur_loss))
                            loss += cur_loss

                    if hasattr(model.module.block1[0], 'diverse_loss'):
                        if model.block1[0].diverse_loss is not None:
                            cur_loss = torch.tensor([0.], device=device)
                            for n, m in model.named_children():
                                if 'block' in n:
                                    cur_loss += m[
                                        0].diverse_loss  # m is the nn.Sequential obj, m[0] is the TRBlock
                            logging.info(
                                'Cur Loss: {}, Cur diverse _loss: {}'.format(
                                    loss, cur_loss))
                            loss += cur_loss

                    if hasattr(model.module.block1[0], 'label_reg'):
                        if model.block1[0].label_reg is not None:
                            cur_loss = torch.tensor([0.], device=device)
                            for n, m in model.named_children():
                                if 'block' in n:
                                    cur_loss += m[
                                        0].label_reg  # m is the nn.Sequential obj, m[0] is the TRBlock
                            # logging.info('Cur Loss: {}, Cur diverse _loss: {}'.format(loss, cur_loss))
                            loss += cur_loss

                # Compute and accumulate gradient
                loss /= config.iter_size
                batch_loss += loss.item()
                if not config.use_sam:
                    loss.backward()
                else:
                    with model.no_sync():
                        loss.backward()

            # Update number of steps
            if not config.use_sam:
                optimizer.step()
            else:
                optimizer.first_step(zero_grad=True)
                soutput = model(sinput,
                                iter_=curr_iter / config.max_iter,
                                aux=starget)
                criterion(soutput.F, target.long()).backward()
                optimizer.second_step(zero_grad=True)

            if config.lr_warmup is None:
                scheduler.step()
            else:
                if curr_iter >= config.lr_warmup:
                    scheduler.step()
                else:
                    for g in optimizer.param_groups:
                        g['lr'] = config.lr * (iteration +
                                               1) / config.lr_warmup

            # CLEAR CACHE!
            torch.cuda.empty_cache()

            data_time_avg.update(data_time)
            iter_time_avg.update(iter_timer.toc(False))

            pred = get_prediction(data_loader.dataset, soutput.F, target)
            score = precision_at_one(pred, target, ignore_label=-1)

            regs.update(cur_loss.item(), target.size(0))
            losses.update(batch_loss, target.size(0))
            scores.update(score, target.size(0))

            # calc the train-iou
            for l in range(num_class):
                total_correct_class[l] += ((pred == l) & (target == l)).sum()
                total_iou_deno_class[l] += (((pred == l) & (target != -1)) |
                                            (target == l)).sum()

            if curr_iter % config.stat_freq == 0 or curr_iter == 1:
                lrs = ', '.join(
                    ['{:.3e}'.format(g['lr']) for g in optimizer.param_groups])
                IoU = ((total_correct_class) /
                       (total_iou_deno_class + 1e-6)).mean() * 100.
                debug_str = "===> Epoch[{}]({}/{}): Loss {:.4f}\tLR: {}\t".format(
                    epoch, curr_iter,
                    len(data_loader) // config.iter_size, losses.avg, lrs)
                debug_str += "Score {:.3f}\tIoU {:.3f}\tData time: {:.4f}, Iter time: {:.4f}".format(
                    scores.avg, IoU.item(), data_time_avg.avg,
                    iter_time_avg.avg)
                if regs.avg > 0:
                    debug_str += "\n Additional Reg Loss {:.3f}".format(
                        regs.avg)

                if rank == 0:
                    logging.info(debug_str)
                # Reset timers
                data_time_avg.reset()
                iter_time_avg.reset()
                # Write logs
                losses.reset()
                scores.reset()

            # only save status on the 1st gpu
            if rank == 0:

                # Save current status, save before val to prevent occational mem overflow
                if curr_iter % config.save_freq == 0:
                    checkpoint(model,
                               optimizer,
                               epoch,
                               curr_iter,
                               config,
                               best_val_miou,
                               best_val_iter,
                               save_inter=True)

                # Validation
                if curr_iter % config.val_freq == 0:
                    val_miou = validate(model, val_data_loader, None,
                                        curr_iter, config, transform_data_fn
                                        )  # feedin None for SummaryWriter args
                    if val_miou > best_val_miou:
                        best_val_miou = val_miou
                        best_val_iter = curr_iter
                        checkpoint(model,
                                   optimizer,
                                   epoch,
                                   curr_iter,
                                   config,
                                   best_val_miou,
                                   best_val_iter,
                                   "best_val",
                                   save_inter=True)
                    if rank == 0:
                        logging.info(
                            "Current best mIoU: {:.3f} at iter {}".format(
                                best_val_miou, best_val_iter))

                    # Recover back
                    model.train()

            # End of iteration
            curr_iter += 1

        IoU = (total_correct_class) / (total_iou_deno_class + 1e-6)
        if rank == 0:
            logging.info('train point avg class IoU: %f' %
                         ((IoU).mean() * 100.))

        epoch += 1

    # Explicit memory cleanup
    if hasattr(data_iter, 'cleanup'):
        data_iter.cleanup()

    # Save the final model
    if rank == 0:
        checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou,
                   best_val_iter)
        v_loss, v_score, v_mAP, val_mIoU = test(model, val_data_loader, config)

        if val_miou > best_val_miou and rank == 0:
            best_val_miou = val_miou
            best_val_iter = curr_iter
            logging.info("Final best miou: {}  at iter {} ".format(
                val_miou, curr_iter))
            checkpoint(model, optimizer, epoch, curr_iter, config,
                       best_val_miou, best_val_iter, "best_val")

            logging.info("Current best mIoU: {:.3f} at iter {}".format(
                best_val_miou, best_val_iter))
def test(model, data_loader, config, transform_data_fn=None, has_gt=True, save_pred=False, split=None, submit_dir=None):
    device = get_torch_device(config.is_cuda)
    dataset = data_loader.dataset
    num_labels = dataset.NUM_LABELS
    global_timer, data_timer, iter_timer = Timer(), Timer(), Timer()
    criterion = nn.CrossEntropyLoss(ignore_index=config.ignore_label)
    losses, scores, ious = AverageMeter(), AverageMeter(), 0
    aps = np.zeros((0, num_labels))
    hist = np.zeros((num_labels, num_labels))

    # some cfgs concerning the usage of instance-level information
    config.save_pred = save_pred
    if split is not None:
        assert save_pred
    if config.save_pred:
        save_dict = {}
        save_dict['pred'] = []
        save_dict['coord'] = []

    logging.info('===> Start testing')

    global_timer.tic()
    data_iter = data_loader.__iter__()
    max_iter = len(data_loader)
    max_iter_unique = max_iter

    # Fix batch normalization running mean and std
    model.eval()

    # Clear cache (when run in val mode, cleanup training cache)
    torch.cuda.empty_cache()

    # semantic kitti label inverse mapping
    if config.submit:
        remap_lut = Remap().getRemapLUT()

    with torch.no_grad():

        # Calc of the iou
        total_correct = np.zeros(num_labels)
        total_seen = np.zeros(num_labels)
        total_positive = np.zeros(num_labels)
        point_nums = np.zeros([19])

        for iteration in range(max_iter):
            data_timer.tic()
            if config.return_transformation:
                coords, input, target, unique_map_list, inverse_map_list, pointcloud, transformation, filename = data_iter.next()
            else:
                coords, input, target, unique_map_list, inverse_map_list, filename = data_iter.next()
            data_time = data_timer.toc(False)

            if config.use_aux:
                assert target.shape[1] == 2
                aux = target[:,1]
                target = target[:,0]
            else:
                aux = None

            # Preprocess input
            iter_timer.tic()

            if config.normalize_color:
                input[:, :3] = input[:, :3] / input[:,:3].max() - 0.5
                coords_norm = coords[:,1:] / coords[:,1:].max() - 0.5

            XYZ_INPUT = config.xyz_input
            # cat xyz into the rgb feature
            if XYZ_INPUT:
                input = torch.cat([coords_norm, input], dim=1)

            sinput = ME.SparseTensor(input, coords, device=device)

            # Feed forward
            if aux is not None:
                soutput = model(sinput)
            else:
                soutput = model(sinput, iter_ = iteration / max_iter, enable_point_branch=config.enable_point_branch)
            output = soutput.F
            if torch.isnan(output).sum() > 0:
                import ipdb; ipdb.set_trace()

            pred = get_prediction(dataset, output, target).int()
            assert sum([int(t.shape[0]) for t in unique_map_list]) == len(pred), "number of points in unique_map doesn't match predition, do not enable preprocessing"
            iter_time = iter_timer.toc(False)

            if config.save_pred or config.submit:
                # troublesome processing for splitting each batch's data, and export
                batch_ids = sinput.C[:,0]
                splits_at = torch.stack([torch.where(batch_ids == i)[0][-1] for i in torch.unique(batch_ids)]).int()
                splits_at = splits_at + 1
                splits_at_leftshift_one = splits_at.roll(shifts=1)
                splits_at_leftshift_one[0] = 0
                # len_per_batch = splits_at - splits_at_leftshift_one
                len_sum = 0
                batch_id = 0
                for start, end in zip(splits_at_leftshift_one, splits_at):
                    len_sum += len(pred[int(start):int(end)])
                    pred_this_batch = pred[int(start):int(end)]
                    coord_this_batch = pred[int(start):int(end)]
                    if config.save_pred:
                        save_dict['pred'].append(pred_this_batch[inverse_map_list[batch_id]])
                    else: # save submit result
                        submission_path = filename[batch_id].replace(config.semantic_kitti_path, submit_dir).replace('velodyne', 'predictions').replace('.bin', '.label')
                        parent_dir = Path(submission_path).parent.absolute()
                        if not os.path.exists(parent_dir):
                            os.makedirs(parent_dir)
                        label_pred = pred_this_batch[inverse_map_list[batch_id]].cpu().numpy()
                        label_pred = remap_lut[label_pred].astype(np.uint32)
                        label_pred.tofile(submission_path)
                        print(submission_path)
                    batch_id += 1
                assert len_sum == len(pred)

            # Unpack it to original length
            REVERT_WHOLE_POINTCLOUD = True
            # print('{}/{}'.format(iteration, max_iter))
            if REVERT_WHOLE_POINTCLOUD:
                whole_pred = []
                whole_target = []
                for batch_ in range(config.batch_size):
                    batch_mask_ = (soutput.C[:,0] == batch_).cpu().numpy()
                    if batch_mask_.sum() == 0: # for empty batch, skip em 
                        continue
                    try:
                        whole_pred_ = soutput.F[batch_mask_][inverse_map_list[batch_]]
                    except:
                        import ipdb; ipdb.set_trace()
                    whole_target_ = target[batch_mask_][inverse_map_list[batch_]]
                    whole_pred.append(whole_pred_)
                    whole_target.append(whole_target_)
                whole_pred = torch.cat(whole_pred, dim=0)
                whole_target = torch.cat(whole_target, dim=0)

                pred = get_prediction(dataset, whole_pred, whole_target).int()
                output = whole_pred
                target = whole_target

            if has_gt:
                target_np = target.numpy()
                num_sample = target_np.shape[0]
                target = target.to(device)
                output = output.to(device)

                cross_ent = criterion(output, target.long())
                losses.update(float(cross_ent), num_sample)
                scores.update(precision_at_one(pred, target), num_sample)
                hist += fast_hist(pred.cpu().numpy().flatten(), target_np.flatten(), num_labels) # within fast hist, mark label should >=0 & < num_label to filter out 255 / -1
                ious = per_class_iu(hist) * 100
                prob = torch.nn.functional.softmax(output, dim=-1)

                pred = pred[target != -1]
                target = target[target != -1]

                # for _ in range(num_labels): # debug for SemKITTI: spvnas way of calc miou
                    # total_seen[_] += torch.sum(target == _)
                    # total_correct[_] += torch.sum((pred == target) & (target == _))
                    # total_positive[_] += torch.sum(pred == _)

                # ious_ = []
                # for _ in range(num_labels):
                    # if total_seen[_] == 0:
                        # ious_.append(1)
                    # else:
                        # ious_.append(total_correct[_]/(total_seen[_] + total_positive[_] - total_correct[_]))
                # ious_ = torch.stack(ious_, dim=-1).cpu().numpy()*100
                # print(np.nanmean(per_class_iu(hist)), np.nanmean(ious_))
                # ious = np.array(ious_)*100


                # calc the ratio of total points
                # for i_ in range(19):
                    # point_nums[i_] += (target == i_).sum().detach()

                # skip calculating aps
                ap = average_precision(prob.cpu().detach().numpy(), target_np)
                aps = np.vstack((aps, ap))
                # Due to heavy bias in class, there exists class with no test label at all
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore", category=RuntimeWarning)
                    ap_class = np.nanmean(aps, 0) * 100.

            if iteration % config.test_stat_freq == 0 and iteration > 0 and not config.submit:
                reordered_ious = dataset.reorder_result(ious)
                reordered_ap_class = dataset.reorder_result(ap_class)
                # dirty fix for semnaticcKITTI has no getclassnames
                if hasattr(dataset, "class_names"):
                    class_names = dataset.get_classnames()
                else: # semnantic KITTI
                    class_names = None
                print_info(
                        iteration,
                        max_iter_unique,
                        data_time,
                        iter_time,
                        has_gt,
                        losses,
                        scores,
                        reordered_ious,
                        hist,
                        reordered_ap_class,
                        class_names=class_names)

            if iteration % 5 == 0:
                # Clear cache
                torch.cuda.empty_cache()

    if config.save_pred:
        # torch.save(save_dict, os.path.join(config.log_dir, 'preds_{}_with_coord.pth'.format(split)))
        torch.save(save_dict, os.path.join(config.log_dir, 'preds_{}.pth'.format(split)))
        print("===> saved prediction result")

    global_time = global_timer.toc(False)

    save_map(model, config)

    reordered_ious = dataset.reorder_result(ious)
    reordered_ap_class = dataset.reorder_result(ap_class)
    if hasattr(dataset, "class_names"):
        class_names = dataset.get_classnames()
    else:
        class_names = None
    print_info(
            iteration,
            max_iter_unique,
            data_time,
            iter_time,
            has_gt,
            losses,
            scores,
            reordered_ious,
            hist,
            reordered_ap_class,
            class_names=class_names)

    logging.info("Finished test. Elapsed time: {:.4f}".format(global_time))

    # Explicit memory cleanup
    if hasattr(data_iter, 'cleanup'):
        data_iter.cleanup()

    return losses.avg, scores.avg, np.nanmean(ap_class), np.nanmean(per_class_iu(hist)) * 100
示例#13
0
    def train(self, train_queue, val_queue=None):
        ''' Given data queues, train the network '''
        # Parameter directory
        save_dir = os.path.join(cfg.DIR.OUT_PATH)
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        # Timer for the training op and parallel data loading op.
        train_timer = Timer()
        data_timer = Timer()
        training_losses = []

        start_iter = 0
        # Resume training
        if cfg.TRAIN.RESUME_TRAIN:
            self.net.load(cfg.CONST.WEIGHTS)
            start_iter = cfg.TRAIN.INITIAL_ITERATION

        # Setup learning rates
        lr = cfg.TRAIN.DEFAULT_LEARNING_RATE
        lr_steps = [int(k) for k in cfg.TRAIN.LEARNING_RATES.keys()]

        print('Set the learning rate to %f.' % lr)
        self.set_lr(lr)

        # Main training loop
        for train_ind in range(start_iter, cfg.TRAIN.NUM_ITERATION + 1):
            data_timer.tic()
            batch_img, batch_voxel = train_queue.get()
            data_timer.toc()

            if self.net.is_x_tensor4:
                batch_img = batch_img[0]

            # Apply one gradient step
            train_timer.tic()
            loss = self.train_loss(batch_img, batch_voxel)
            train_timer.toc()

            training_losses.append(loss)

            # Decrease learning rate at certain points
            if train_ind in lr_steps:
                # edict only takes string for key. Hacky way
                self.set_lr(np.float(cfg.TRAIN.LEARNING_RATES[str(train_ind)]))
                print('Learing rate decreased to %f: ' % self.lr.get_value())

            # Debugging modules
            #
            # Print status, run validation, check divergence, and save model.
            if train_ind % cfg.TRAIN.PRINT_FREQ == 0:
                # Print the current loss
                print('%s Iter: %d Loss: %f' %
                      (datetime.now(), train_ind, loss))

            if train_ind % cfg.TRAIN.VALIDATION_FREQ == 0 and val_queue is not None:
                # Print test loss and params to check convergence every N iterations
                val_losses = []
                for i in range(cfg.TRAIN.NUM_VALIDATION_ITERATIONS):
                    batch_img, batch_voxel = val_queue.get()
                    _, val_loss, _ = self.test_output(batch_img, batch_voxel)
                    val_losses.append(val_loss)
                print('%s Test loss: %f' %
                      (datetime.now(), np.mean(val_losses)))

            if train_ind % cfg.TRAIN.NAN_CHECK_FREQ == 0:
                # Check that the network parameters are all valid
                max_param = max_or_nan(self.net.params)
                if np.isnan(max_param):
                    print('NAN detected')
                    break

            if train_ind % cfg.TRAIN.SAVE_FREQ == 0 and not train_ind == 0:
                self.save(training_losses, save_dir, train_ind)

            if loss > cfg.TRAIN.LOSS_LIMIT:
                print("Cost exceeds the threshold. Stop training")
                break
def train(model, data_loader, val_data_loader, config, transform_data_fn=None):
    device = get_torch_device(config.is_cuda)
    # Set up the train flag for batch normalization
    model.train()

    # Configuration
    writer = SummaryWriter(log_dir=config.log_dir)
    data_timer, iter_timer = Timer(), Timer()
    data_time_avg, iter_time_avg = AverageMeter(), AverageMeter()
    losses, scores = AverageMeter(), AverageMeter()

    optimizer = initialize_optimizer(model.parameters(), config)
    scheduler = initialize_scheduler(optimizer, config)
    criterion = nn.CrossEntropyLoss(ignore_index=config.ignore_label)

    writer = SummaryWriter(log_dir=config.log_dir)

    # Train the network
    logging.info('===> Start training')
    best_val_miou, best_val_iter, curr_iter, epoch, is_training = 0, 0, 1, 1, True

    if config.resume:
        checkpoint_fn = config.resume + '/weights.pth'
        if osp.isfile(checkpoint_fn):
            logging.info("=> loading checkpoint '{}'".format(checkpoint_fn))
            state = torch.load(checkpoint_fn)
            curr_iter = state['iteration'] + 1
            epoch = state['epoch']
            model.load_state_dict(state['state_dict'])
            if config.resume_optimizer:
                scheduler = initialize_scheduler(optimizer,
                                                 config,
                                                 last_step=curr_iter)
                optimizer.load_state_dict(state['optimizer'])
            if 'best_val' in state:
                best_val_miou = state['best_val']
                best_val_iter = state['best_val_iter']
            logging.info("=> loaded checkpoint '{}' (epoch {})".format(
                checkpoint_fn, state['epoch']))
        else:
            raise ValueError(
                "=> no checkpoint found at '{}'".format(checkpoint_fn))

    data_iter = data_loader.__iter__()
    while is_training:
        for iteration in range(len(data_loader) // config.iter_size):
            optimizer.zero_grad()
            data_time, batch_loss = 0, 0
            iter_timer.tic()

            for sub_iter in range(config.iter_size):
                # Get training data
                data_timer.tic()
                if config.return_transformation:
                    coords, input, target, pointcloud, transformation = data_iter.next(
                    )
                else:
                    coords, input, target = data_iter.next()

                # For some networks, making the network invariant to even, odd coords is important
                coords[:, 1:] += (torch.rand(3) * 100).type_as(coords)

                # Preprocess input
                if config.normalize_color:
                    input[:, :3] = input[:, :3] / 255. - 0.5
                sinput = SparseTensor(input, coords).to(device)

                data_time += data_timer.toc(False)

                # model.initialize_coords(*init_args)
                soutput = model(sinput)
                # The output of the network is not sorted
                target = target.long().to(device)

                loss = criterion(soutput.F, target.long())

                # Compute and accumulate gradient
                loss /= config.iter_size
                batch_loss += loss.item()
                loss.backward()

            # Update number of steps
            optimizer.step()
            scheduler.step()

            data_time_avg.update(data_time)
            iter_time_avg.update(iter_timer.toc(False))

            pred = get_prediction(data_loader.dataset, soutput.F, target)
            score = precision_at_one(pred, target)
            losses.update(batch_loss, target.size(0))
            scores.update(score, target.size(0))

            if curr_iter >= config.max_iter:
                is_training = False
                break

            if curr_iter % config.stat_freq == 0 or curr_iter == 1:
                lrs = ', '.join(
                    ['{:.3e}'.format(x) for x in scheduler.get_lr()])
                debug_str = "===> Epoch[{}]({}/{}): Loss {:.4f}\tLR: {}\t".format(
                    epoch, curr_iter,
                    len(data_loader) // config.iter_size, losses.avg, lrs)
                debug_str += "Score {:.3f}\tData time: {:.4f}, Iter time: {:.4f}".format(
                    scores.avg, data_time_avg.avg, iter_time_avg.avg)
                logging.info(debug_str)
                # Reset timers
                data_time_avg.reset()
                iter_time_avg.reset()
                # Write logs
                writer.add_scalar('training/loss', losses.avg, curr_iter)
                writer.add_scalar('training/precision_at_1', scores.avg,
                                  curr_iter)
                writer.add_scalar('training/learning_rate',
                                  scheduler.get_lr()[0], curr_iter)
                losses.reset()
                scores.reset()

            # Save current status, save before val to prevent occational mem overflow
            if curr_iter % config.save_freq == 0:
                checkpoint(model, optimizer, epoch, curr_iter, config,
                           best_val_miou, best_val_iter)

            # Validation
            if curr_iter % config.val_freq == 0:
                val_miou = validate(model, val_data_loader, writer, curr_iter,
                                    config, transform_data_fn)
                if val_miou > best_val_miou:
                    best_val_miou = val_miou
                    best_val_iter = curr_iter
                    checkpoint(model, optimizer, epoch, curr_iter, config,
                               best_val_miou, best_val_iter, "best_val")
                logging.info("Current best mIoU: {:.3f} at iter {}".format(
                    best_val_miou, best_val_iter))

                # Recover back
                model.train()

            # End of iteration
            curr_iter += 1

        epoch += 1

    # Explicit memory cleanup
    if hasattr(data_iter, 'cleanup'):
        data_iter.cleanup()

    # Save the final model
    checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou,
               best_val_iter)
    val_miou = validate(model, val_data_loader, writer, curr_iter, config,
                        transform_data_fn)
    if val_miou > best_val_miou:
        best_val_miou = val_miou
        best_val_iter = curr_iter
        checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou,
                   best_val_iter, "best_val")
    logging.info("Current best mIoU: {:.3f} at iter {}".format(
        best_val_miou, best_val_iter))
示例#15
0
def train(model, data_loader, val_data_loader, config, transform_data_fn=None):

    device = get_torch_device(config.is_cuda)
    # Set up the train flag for batch normalization
    model.train()

    # Configuration
    data_timer, iter_timer = Timer(), Timer()
    data_time_avg, iter_time_avg = AverageMeter(), AverageMeter()
    regs, losses, scores = AverageMeter(), AverageMeter(), AverageMeter()

    optimizer = initialize_optimizer(model.parameters(), config)
    scheduler = initialize_scheduler(optimizer, config)
    criterion = nn.CrossEntropyLoss(ignore_index=config.ignore_label)

    # Train the network
    logging.info('===> Start training')
    best_val_miou, best_val_iter, curr_iter, epoch, is_training = 0, 0, 1, 1, True

    if config.resume:
        # Test loaded ckpt first
        v_loss, v_score, v_mAP, v_mIoU = test(model, val_data_loader, config)

        checkpoint_fn = config.resume + '/weights.pth'
        if osp.isfile(checkpoint_fn):
            logging.info("=> loading checkpoint '{}'".format(checkpoint_fn))
            state = torch.load(checkpoint_fn)
            curr_iter = state['iteration'] + 1
            epoch = state['epoch']
            # we skip attention maps because the shape won't match because voxel number is different
            # e.g. copyting a param with shape (23385, 8, 4) to (43529, 8, 4)
            d = {
                k: v
                for k, v in state['state_dict'].items() if 'map' not in k
            }
            # handle those attn maps we don't load from saved dict
            for k in model.state_dict().keys():
                if k in d.keys(): continue
                d[k] = model.state_dict()[k]
            model.load_state_dict(d)
            if config.resume_optimizer:
                scheduler = initialize_scheduler(optimizer,
                                                 config,
                                                 last_step=curr_iter)
                optimizer.load_state_dict(state['optimizer'])
            if 'best_val' in state:
                best_val_miou = state['best_val']
                best_val_iter = state['best_val_iter']
            logging.info("=> loaded checkpoint '{}' (epoch {})".format(
                checkpoint_fn, state['epoch']))
        else:
            raise ValueError(
                "=> no checkpoint found at '{}'".format(checkpoint_fn))

    data_iter = data_loader.__iter__()
    if config.dataset == "SemanticKITTI":
        num_class = 19
        config.normalize_color = False
        config.xyz_input = False
        val_freq_ = config.val_freq
        config.val_freq = config.val_freq * 10
    elif config.dataset == "S3DIS":
        num_class = 13
        config.normalize_color = False
        config.xyz_input = False
        val_freq_ = config.val_freq
        config.val_freq = config.val_freq
    elif config.dataset == "Nuscenes":
        num_class = 16
        config.normalize_color = False
        config.xyz_input = False
        val_freq_ = config.val_freq
        config.val_freq = config.val_freq * 50
    else:
        num_class = 20
        val_freq_ = config.val_freq

    while is_training:
        total_correct_class = torch.zeros(num_class, device=device)
        total_iou_deno_class = torch.zeros(num_class, device=device)

        for iteration in range(len(data_loader) // config.iter_size):
            optimizer.zero_grad()
            data_time, batch_loss = 0, 0
            iter_timer.tic()

            if curr_iter >= config.max_iter:
                # if curr_iter >= max(config.max_iter, config.epochs*(len(data_loader) // config.iter_size):
                is_training = False
                break
            elif curr_iter >= config.max_iter * (2 / 3):
                config.val_freq = val_freq_ * 2  # valid more freq on lower half

            for sub_iter in range(config.iter_size):
                # Get training data
                data_timer.tic()
                pointcloud = None

                if config.return_transformation:
                    coords, input, target, _, _, pointcloud, transformation, _ = data_iter.next(
                    )
                else:
                    coords, input, target, _, _, _ = data_iter.next(
                    )  # ignore unique_map and inverse_map

                if config.use_aux:
                    assert target.shape[1] == 2
                    aux = target[:, 1]
                    target = target[:, 0]
                else:
                    aux = None

                # For some networks, making the network invariant to even, odd coords is important
                coords[:, 1:] += (torch.rand(3) * 100).type_as(coords)

                # Preprocess input
                if config.normalize_color:
                    input[:, :3] = input[:, :3] / input[:, :3].max() - 0.5
                    coords_norm = coords[:, 1:] / coords[:, 1:].max() - 0.5

                # cat xyz into the rgb feature
                if config.xyz_input:
                    input = torch.cat([coords_norm, input], dim=1)
                sinput = SparseTensor(input, coords, device=device)
                starget = SparseTensor(
                    target.unsqueeze(-1).float(),
                    coordinate_map_key=sinput.coordinate_map_key,
                    coordinate_manager=sinput.coordinate_manager,
                    device=device
                )  # must share the same coord-manager to align for sinput

                data_time += data_timer.toc(False)
                # model.initialize_coords(*init_args)

                # d = {}
                # d['c'] = sinput.C
                # d['l'] = starget.F
                # torch.save('./plot/test-label.pth')
                # import ipdb; ipdb.set_trace()

                # Set up profiler
                # memory_profiler = CUDAMemoryProfiler(
                # [model, criterion],
                # filename="cuda_memory.profile"
                # )
                # sys.settrace(memory_profiler)
                # threading.settrace(memory_profiler)

                # with torch.autograd.profiler.profile(enabled=True, use_cuda=True, record_shapes=False, profile_memory=True) as prof0:
                if aux is not None:
                    soutput = model(sinput, aux)
                elif config.enable_point_branch:
                    soutput = model(sinput,
                                    iter_=curr_iter / config.max_iter,
                                    enable_point_branch=True)
                else:
                    # label-aux, feed it in as additional reg
                    soutput = model(
                        sinput, iter_=curr_iter / config.max_iter, aux=starget
                    )  # feed in the progress of training for annealing inside the model

                # The output of the network is not sorted
                target = target.view(-1).long().to(device)
                loss = criterion(soutput.F, target.long())

                # ====== other loss regs =====
                if hasattr(model, 'block1'):
                    cur_loss = torch.tensor([0.], device=device)

                    if hasattr(model.block1[0], 'vq_loss'):
                        if model.block1[0].vq_loss is not None:
                            cur_loss = torch.tensor([0.], device=device)
                            for n, m in model.named_children():
                                if 'block' in n:
                                    cur_loss += m[
                                        0].vq_loss  # m is the nn.Sequential obj, m[0] is the TRBlock
                            logging.info(
                                'Cur Loss: {}, Cur vq_loss: {}'.format(
                                    loss, cur_loss))
                            loss += cur_loss

                    if hasattr(model.block1[0], 'diverse_loss'):
                        if model.block1[0].diverse_loss is not None:
                            cur_loss = torch.tensor([0.], device=device)
                            for n, m in model.named_children():
                                if 'block' in n:
                                    cur_loss += m[
                                        0].diverse_loss  # m is the nn.Sequential obj, m[0] is the TRBlock
                            logging.info(
                                'Cur Loss: {}, Cur diverse _loss: {}'.format(
                                    loss, cur_loss))
                            loss += cur_loss

                    if hasattr(model.block1[0], 'label_reg'):
                        if model.block1[0].label_reg is not None:
                            cur_loss = torch.tensor([0.], device=device)
                            for n, m in model.named_children():
                                if 'block' in n:
                                    cur_loss += m[
                                        0].label_reg  # m is the nn.Sequential obj, m[0] is the TRBlock
                            # logging.info('Cur Loss: {}, Cur diverse _loss: {}'.format(loss, cur_loss))
                            loss += cur_loss

                # Compute and accumulate gradient
                loss /= config.iter_size
                batch_loss += loss.item()
                loss.backward()

                # soutput = model(sinput)

            # Update number of steps
            if not config.use_sam:
                optimizer.step()
            else:
                optimizer.first_step(zero_grad=True)
                soutput = model(sinput,
                                iter_=curr_iter / config.max_iter,
                                aux=starget)
                criterion(soutput.F, target.long()).backward()
                optimizer.second_step(zero_grad=True)

            if config.lr_warmup is None:
                scheduler.step()
            else:
                if curr_iter >= config.lr_warmup:
                    scheduler.step()
                for g in optimizer.param_groups:
                    g['lr'] = config.lr * (iteration + 1) / config.lr_warmup

            # CLEAR CACHE!
            torch.cuda.empty_cache()

            data_time_avg.update(data_time)
            iter_time_avg.update(iter_timer.toc(False))

            pred = get_prediction(data_loader.dataset, soutput.F, target)
            score = precision_at_one(pred, target, ignore_label=-1)

            regs.update(cur_loss.item(), target.size(0))
            losses.update(batch_loss, target.size(0))
            scores.update(score, target.size(0))

            # calc the train-iou
            for l in range(num_class):
                total_correct_class[l] += ((pred == l) & (target == l)).sum()
                total_iou_deno_class[l] += (((pred == l) & (target != -1)) |
                                            (target == l)).sum()

            if curr_iter % config.stat_freq == 0 or curr_iter == 1:
                lrs = ', '.join(
                    ['{:.3e}'.format(x) for x in scheduler.get_lr()])
                IoU = ((total_correct_class) /
                       (total_iou_deno_class + 1e-6)).mean() * 100.
                debug_str = "[{}] ===> Epoch[{}]({}/{}): Loss {:.4f}\tLR: {}\t".format(
                    config.log_dir.split('/')[-2], epoch, curr_iter,
                    len(data_loader) // config.iter_size, losses.avg, lrs)
                debug_str += "Score {:.3f}\tIoU {:.3f}\tData time: {:.4f}, Iter time: {:.4f}".format(
                    scores.avg, IoU.item(), data_time_avg.avg,
                    iter_time_avg.avg)
                if regs.avg > 0:
                    debug_str += "\n Additional Reg Loss {:.3f}".format(
                        regs.avg)
                # print(debug_str)
                logging.info(debug_str)
                # Reset timers
                data_time_avg.reset()
                iter_time_avg.reset()
                # Write logs
                losses.reset()
                scores.reset()

            # Save current status, save before val to prevent occational mem overflow
            if curr_iter % config.save_freq == 0:
                checkpoint(model,
                           optimizer,
                           epoch,
                           curr_iter,
                           config,
                           best_val_miou,
                           best_val_iter,
                           save_inter=True)

            # Validation
            if curr_iter % config.val_freq == 0:
                val_miou = validate(model, val_data_loader, None, curr_iter,
                                    config, transform_data_fn)
                if val_miou > best_val_miou:
                    best_val_miou = val_miou
                    best_val_iter = curr_iter
                    checkpoint(model,
                               optimizer,
                               epoch,
                               curr_iter,
                               config,
                               best_val_miou,
                               best_val_iter,
                               "best_val",
                               save_inter=True)
                logging.info("Current best mIoU: {:.3f} at iter {}".format(
                    best_val_miou, best_val_iter))
                # print("Current best mIoU: {:.3f} at iter {}".format(best_val_miou, best_val_iter))

                # Recover back
                model.train()

            # End of iteration
            curr_iter += 1

        IoU = (total_correct_class) / (total_iou_deno_class + 1e-6)
        logging.info('train point avg class IoU: %f' % ((IoU).mean() * 100.))

        epoch += 1

    # Explicit memory cleanup
    if hasattr(data_iter, 'cleanup'):
        data_iter.cleanup()

    # Save the final model
    checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou,
               best_val_iter)
    v_loss, v_score, v_mAP, val_miou = test(model, val_data_loader, config)
    if val_miou > best_val_miou:
        best_val_miou = val_miou
        best_val_iter = curr_iter
        checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou,
                   best_val_iter, "best_val")
    logging.info("Current best mIoU: {:.3f} at iter {}".format(
        best_val_miou, best_val_iter))
示例#16
0
def train(pipeline_model, data_loader, val_data_loader, config):
    # Set up the train flag for batch normalization
    pipeline_model.train()

    num_devices = torch.cuda.device_count()
    num_devices = min(config.max_ngpu, num_devices)
    devices = list(range(num_devices))
    target_device = devices[0]
    pipeline_model.to(target_device)
    if num_devices > 1:
        pipeline_model = ME.MinkowskiSyncBatchNorm.convert_sync_batchnorm(
            pipeline_model, devices)

    # Configuration
    writer = SummaryWriter(logdir=config.log_dir)
    data_timer, iter_timer = Timer(), Timer()
    data_time_avg, iter_time_avg = AverageMeter(), AverageMeter()
    meters = collections.defaultdict(AverageMeter)
    hists = pipeline_model.initialize_hists()

    optimizer = pipeline_model.initialize_optimizer(config)
    scheduler = pipeline_model.initialize_scheduler(optimizer, config)

    writer = SummaryWriter(logdir=config.log_dir)

    # Train the network
    logging.info('===> Start training')
    best_val, best_val_iter, curr_iter, epoch, is_training = 0, 0, 1, 1, True

    if config.resume:
        if osp.isfile(config.resume):
            logging.info("=> loading checkpoint '{}'".format(config.resume))
            state = torch.load(config.resume)
            curr_iter = state['iteration'] + 1
            epoch = state['epoch']
            pipeline_model.load_state_dict(state['state_dict'])
            if config.resume_optimizer:
                curr_iter = state['iteration'] + 1
                scheduler = pipeline_model.initialize_scheduler(
                    optimizer, config, last_step=curr_iter)
                pipeline_model.load_optimizer(optimizer, state['optimizer'])
            if 'best_val' in state:
                best_val = state['best_val']
                best_val_iter = state['best_val_iter']
            logging.info("=> loaded checkpoint '{}' (epoch {})".format(
                config.resume, state['epoch']))
        else:
            logging.info("=> no checkpoint found at '{}'".format(
                config.resume))

    data_iter = data_loader.__iter__()
    while is_training:
        for iteration in range(len(data_loader)):
            pipeline_model.reset_gradient(optimizer)
            iter_timer.tic()

            pipelines = parallel.replicate(pipeline_model, devices)

            # Get training data
            data_timer.tic()
            inputs = []
            for pipeline, device in zip(pipelines, devices):
                with torch.cuda.device(device):
                    while True:
                        datum = pipeline.load_datum(data_iter, has_gt=True)
                        num_boxes = sum(box.shape[0]
                                        for box in datum['bboxes_coords'])
                        if config.skip_empty_boxes and num_boxes == 0:
                            continue
                        break
                    inputs.append(datum)
            data_time_avg.update(data_timer.toc(False))

            outputs = parallel.parallel_apply(pipelines,
                                              [(x, True) for x in inputs],
                                              devices=devices)
            losses = parallel.parallel_apply(
                [pipeline.loss for pipeline in pipelines],
                tuple(zip(inputs, outputs)),
                devices=devices)
            losses = parallel.gather(losses, target_device)
            losses = dict([(k, v.mean()) for k, v in losses.items()])

            meters, hists = pipeline_model.update_meters(meters, hists, losses)

            # Compute and accumulate gradient
            losses['loss'].backward()

            # Update number of steps
            pipeline_model.step_optimizer(losses, optimizer, scheduler,
                                          iteration)

            iter_time_avg.update(iter_timer.toc(False))

            if curr_iter >= config.max_iter:
                is_training = False
                break

            if curr_iter % config.stat_freq == 0 or curr_iter == 1:
                lrs = ', '.join([
                    '{:.3e}'.format(x) for x in scheduler['default'].get_lr()
                ])
                debug_str = "===> Epoch[{}]({}/{}): LR: {}\n".format(
                    epoch, curr_iter, len(data_loader), lrs)
                debug_str += log_meters(meters, log_perclass_meters=False)
                debug_str += f"\n    data time: {data_time_avg.avg:.3f}"
                debug_str += f"    iter time: {iter_time_avg.avg:.3f}"
                logging.info(debug_str)

                # Reset timers
                data_time_avg.reset()
                iter_time_avg.reset()

                # Write logs
                update_writer(writer, meters, curr_iter, 'training')
                writer.add_scalar('training/learning_rate',
                                  scheduler['default'].get_lr()[0], curr_iter)

                # Reset meters
                reset_meters(meters, hists)

            # Save current status, save before val to prevent occational mem overflow
            if curr_iter % config.save_freq == 0:
                checkpoint(pipeline_model, optimizer, epoch, curr_iter, config,
                           best_val, best_val_iter)

            if config.heldout_save_freq > 0 and curr_iter % config.heldout_save_freq == 0:
                checkpoint(pipeline_model,
                           optimizer,
                           epoch,
                           curr_iter,
                           config,
                           best_val,
                           best_val_iter,
                           heldout_save=True)

            # Validation
            if curr_iter % config.val_freq == 0:
                if num_devices > 1:
                    unconvert_sync_batchnorm(pipeline_model)
                best_val, best_val_iter = validate(pipeline_model,
                                                   val_data_loader, config,
                                                   writer, curr_iter, best_val,
                                                   best_val_iter, optimizer,
                                                   epoch)
                if num_devices > 1:
                    pipeline_model = ME.MinkowskiSyncBatchNorm.convert_sync_batchnorm(
                        pipeline_model, devices)

            if curr_iter % config.empty_cache_freq == 0:
                # Clear cache
                torch.cuda.empty_cache()

            # End of iteration
            curr_iter += 1

        epoch += 1

    # Explicit memory cleanup
    if hasattr(data_iter, 'cleanup'):
        data_iter.cleanup()

    # Save the final model
    if num_devices > 1:
        unconvert_sync_batchnorm(pipeline_model)
    validate(pipeline_model, val_data_loader, config, writer, curr_iter,
             best_val, best_val_iter, optimizer, epoch)
    if num_devices > 1:
        pipeline_model = ME.MinkowskiSyncBatchNorm.convert_sync_batchnorm(
            pipeline_model, devices)
    checkpoint(pipeline_model, optimizer, epoch, curr_iter, config, best_val,
               best_val_iter)
示例#17
0
def train_distill(model,
                  data_loader,
                  val_data_loader,
                  config,
                  transform_data_fn=None):
    '''
    the distillation training
    some cfgs here
    '''

    # distill_lambda = 1
    # distill_lambda = 0.33
    distill_lambda = 0.67

    # TWO_STAGE=True: Transformer is first trained with L2 loss to match ResNet's activation, and then it fintunes like normal training on the second stage.
    # TWO_STAGE=False: Transformer trains with combined loss

    TWO_STAGE = False
    # STAGE_PERCENTAGE = 0.7

    device = get_torch_device(config.is_cuda)
    # Set up the train flag for batch normalization
    model.train()

    # Configuration
    data_timer, iter_timer = Timer(), Timer()
    data_time_avg, iter_time_avg = AverageMeter(), AverageMeter()
    losses, scores = AverageMeter(), AverageMeter()

    optimizer = initialize_optimizer(model.parameters(), config)
    scheduler = initialize_scheduler(optimizer, config)
    criterion = nn.CrossEntropyLoss(ignore_index=config.ignore_label)

    # Train the network
    logging.info('===> Start training')
    best_val_miou, best_val_iter, curr_iter, epoch, is_training = 0, 0, 1, 1, True

    # TODO:
    # load the sub-model only
    # FIXME: some dirty hard-written stuff, only supporting current state

    tch_model_cls = load_model('Res16UNet18A')
    tch_model = tch_model_cls(3, 20, config).to(device)

    # checkpoint_fn = "/home/zhaotianchen/project/point-transformer/SpatioTemporalSegmentation-ScanNet/outputs/ScannetSparseVoxelizationDataset/Res16UNet18A/resnet_base/weights.pth"
    checkpoint_fn = "/home/zhaotianchen/project/point-transformer/SpatioTemporalSegmentation-ScanNet/outputs/ScannetSparseVoxelizationDataset/Res16UNet18A/Res18A/weights.pth"  # voxel-size: 0.05
    assert osp.isfile(checkpoint_fn)
    logging.info("=> loading checkpoint '{}'".format(checkpoint_fn))
    state = torch.load(checkpoint_fn)
    d = {k: v for k, v in state['state_dict'].items() if 'map' not in k}
    tch_model.load_state_dict(d)
    if 'best_val' in state:
        best_val_miou = state['best_val']
        best_val_iter = state['best_val_iter']
    logging.info("=> loaded checkpoint '{}' (epoch {})".format(
        checkpoint_fn, state['epoch']))

    if config.resume:
        raise NotImplementedError
        # Test loaded ckpt first

        # checkpoint_fn = config.resume + '/weights.pth'
        # if osp.isfile(checkpoint_fn):
        # logging.info("=> loading checkpoint '{}'".format(checkpoint_fn))
        # state = torch.load(checkpoint_fn)
        # curr_iter = state['iteration'] + 1
        # epoch = state['epoch']
        # d = {k:v for k,v in state['state_dict'].items() if 'map' not in k }
        # model.load_state_dict(d)
        # if config.resume_optimizer:
        # scheduler = initialize_scheduler(optimizer, config, last_step=curr_iter)
        # optimizer.load_state_dict(state['optimizer'])
        # if 'best_val' in state:
        # best_val_miou = state['best_val']
        # best_val_iter = state['best_val_iter']
        # logging.info("=> loaded checkpoint '{}' (epoch {})".format(checkpoint_fn, state['epoch']))
        # else:
        # raise ValueError("=> no checkpoint found at '{}'".format(checkpoint_fn))

    # test after loading the ckpt
    v_loss, v_score, v_mAP, v_mIoU = test(tch_model, val_data_loader, config)
    logging.info('Tch model tested, bes_miou: {}'.format(v_mIoU))

    data_iter = data_loader.__iter__()
    while is_training:

        num_class = 20
        total_correct_class = torch.zeros(num_class, device=device)
        total_iou_deno_class = torch.zeros(num_class, device=device)

        total_iteration = len(data_loader) // config.iter_size
        for iteration in range(total_iteration):

            # NOTE: for single stage distillation, L2 loss might be too large at first
            # so we added a warmup training that don't use L2 loss
            if iteration < 0:
                use_distill = False
            else:
                use_distill = True

            # Stage 1 / Stage 2 boundary
            if TWO_STAGE:
                stage_boundary = int(total_iteration * STAGE_PERCENTAGE)

            optimizer.zero_grad()
            data_time, batch_loss = 0, 0
            iter_timer.tic()

            for sub_iter in range(config.iter_size):
                # Get training data
                data_timer.tic()
                if config.return_transformation:
                    coords, input, target, _, _, pointcloud, transformation = data_iter.next(
                    )
                else:
                    coords, input, target, _, _ = data_iter.next(
                    )  # ignore unique_map and inverse_map

                if config.use_aux:
                    assert target.shape[1] == 2
                    aux = target[:, 1]
                    target = target[:, 0]
                else:
                    aux = None

                # For some networks, making the network invariant to even, odd coords is important
                coords[:, 1:] += (torch.rand(3) * 100).type_as(coords)

                # Preprocess input
                if config.normalize_color:
                    input[:, :3] = input[:, :3] / 255. - 0.5
                    coords_norm = coords[:, 1:] / coords[:, 1:].max() - 0.5

                # cat xyz into the rgb feature
                if config.xyz_input:
                    input = torch.cat([coords_norm, input], dim=1)

                sinput = SparseTensor(input, coords, device=device)

                # TODO: return both-models
                # in order to not breaking the valid interface, use a get_loss to get the regsitered loss

                data_time += data_timer.toc(False)
                # model.initialize_coords(*init_args)
                if aux is not None:
                    raise NotImplementedError

                # flatten ground truth tensor
                target = target.view(-1).long().to(device)

                if TWO_STAGE:
                    if iteration < stage_boundary:
                        # Stage 1: train transformer on L2 loss
                        soutput, anchor = model(sinput, save_anchor=True)
                        # Make sure gradient don't flow to teacher model
                        with torch.no_grad():
                            _, tch_anchor = tch_model(sinput, save_anchor=True)
                        loss = DistillLoss(tch_anchor, anchor)
                    else:
                        # Stage 2: finetune transformer on Cross-Entropy
                        soutput = model(sinput)
                        loss = criterion(soutput.F, target.long())
                else:
                    if use_distill:  # after warm up
                        soutput, anchor = model(sinput, save_anchor=True)
                        # if pretrained teacher, do not let the grad flow to teacher to update its params
                        with torch.no_grad():
                            tch_soutput, tch_anchor = tch_model(
                                sinput, save_anchor=True)

                    else:  # warming up
                        soutput = model(sinput)
                    # The output of the network is not sorted
                    loss = criterion(soutput.F, target.long())
                    #  Add L2 loss if use distillation
                    if use_distill:
                        distill_loss = DistillLoss(tch_anchor,
                                                   anchor) * distill_lambda
                        loss += distill_loss

                # Compute and accumulate gradient
                loss /= config.iter_size
                batch_loss += loss.item()
                loss.backward()

            # Update number of steps
            optimizer.step()
            scheduler.step()

            # CLEAR CACHE!
            torch.cuda.empty_cache()

            data_time_avg.update(data_time)
            iter_time_avg.update(iter_timer.toc(False))

            pred = get_prediction(data_loader.dataset, soutput.F, target)
            score = precision_at_one(pred, target, ignore_label=-1)
            losses.update(batch_loss, target.size(0))
            scores.update(score, target.size(0))

            # calc the train-iou
            for l in range(num_class):
                total_correct_class[l] += ((pred == l) & (target == l)).sum()
                total_iou_deno_class[l] += (((pred == l) & (target != -1)) |
                                            (target == l)).sum()

            if curr_iter >= config.max_iter:
                is_training = False
                break

            if curr_iter % config.stat_freq == 0 or curr_iter == 1:
                lrs = ', '.join(
                    ['{:.3e}'.format(x) for x in scheduler.get_lr()])
                debug_str = "[{}] ===> Epoch[{}]({}/{}): Loss {:.4f}\tLR: {}\t".format(
                    config.log_dir, epoch, curr_iter,
                    len(data_loader) // config.iter_size, losses.avg, lrs)
                debug_str += "Score {:.3f}\tData time: {:.4f}, Iter time: {:.4f}".format(
                    scores.avg, data_time_avg.avg, iter_time_avg.avg)
                logging.info(debug_str)
                if use_distill and not TWO_STAGE:
                    logging.info('Loss {} Distill Loss:{}'.format(
                        loss, distill_loss))
                # Reset timers
                data_time_avg.reset()
                iter_time_avg.reset()
                losses.reset()
                scores.reset()

            # Save current status, save before val to prevent occational mem overflow
            if curr_iter % config.save_freq == 0:
                checkpoint(model,
                           optimizer,
                           epoch,
                           curr_iter,
                           config,
                           best_val_miou,
                           best_val_iter,
                           save_inter=True)

            # Validation
            if curr_iter % config.val_freq == 0:
                val_miou = validate(model, val_data_loader, None, curr_iter,
                                    config, transform_data_fn)
                if val_miou > best_val_miou:
                    best_val_miou = val_miou
                    best_val_iter = curr_iter
                    checkpoint(model,
                               optimizer,
                               epoch,
                               curr_iter,
                               config,
                               best_val_miou,
                               best_val_iter,
                               "best_val",
                               save_inter=True)
                logging.info("Current best mIoU: {:.3f} at iter {}".format(
                    best_val_miou, best_val_iter))

                # Recover back
                model.train()

            # End of iteration
            curr_iter += 1

        IoU = (total_correct_class) / (total_iou_deno_class + 1e-6)
        logging.info('train point avg class IoU: %f' % ((IoU).mean() * 100.))

        epoch += 1

    # Explicit memory cleanup
    if hasattr(data_iter, 'cleanup'):
        data_iter.cleanup()

    # Save the final model
    checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou,
               best_val_iter)
    v_loss, v_score, v_mAP, val_miou = test(model, val_data_loader, config)
    if val_miou > best_val_miou:
        best_val_miou = val_miou
        best_val_iter = curr_iter
        checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou,
                   best_val_iter, "best_val")
    logging.info("Current best mIoU: {:.3f} at iter {}".format(
        best_val_miou, best_val_iter))
示例#18
0
def test(pipeline_model, data_loader, config, has_gt=True):
    global_timer, data_timer, iter_timer = Timer(), Timer(), Timer()
    meters = collections.defaultdict(AverageMeter)
    hists = pipeline_model.initialize_hists()

    logging.info('===> Start testing')

    global_timer.tic()
    data_iter = data_loader.__iter__()
    max_iter = len(data_loader)

    # Fix batch normalization running mean and std
    pipeline_model.eval()

    # Clear cache (when run in val mode, cleanup training cache)
    torch.cuda.empty_cache()

    if config.save_prediction or config.test_original_pointcloud:
        if config.save_prediction:
            save_pred_dir = config.save_pred_dir
            os.makedirs(save_pred_dir, exist_ok=True)
        else:
            save_pred_dir = tempfile.mkdtemp()
        if os.listdir(save_pred_dir):
            raise ValueError(f'Directory {save_pred_dir} not empty. '
                             'Please remove the existing prediction.')

    with torch.no_grad():
        for iteration in range(max_iter):
            iter_timer.tic()
            data_timer.tic()
            datum = pipeline_model.load_datum(data_iter, has_gt=has_gt)
            data_time = data_timer.toc(False)

            output_dict = pipeline_model(datum, False)
            iter_time = iter_timer.toc(False)

            if config.save_prediction or config.test_original_pointcloud:
                pipeline_model.save_prediction(datum, output_dict,
                                               save_pred_dir, iteration)

            if config.visualize and iteration % config.visualize_freq == 0:
                pipeline_model.visualize_predictions(datum, output_dict,
                                                     iteration)

            if has_gt:
                loss_dict = pipeline_model.loss(datum, output_dict)
                if config.visualize and iteration % config.visualize_freq == 0:
                    pipeline_model.visualize_groundtruth(datum, iteration)
                loss_dict.update(pipeline_model.evaluate(datum, output_dict))

                meters, hists = pipeline_model.update_meters(
                    meters, hists, loss_dict)

            if iteration % config.test_stat_freq == 0 and iteration > 0:
                debug_str = "===> {}/{}\n".format(iteration, max_iter)
                debug_str += log_meters(meters, log_perclass_meters=True)
                debug_str += f"\n    data time: {data_time:.3f}    iter time: {iter_time:.3f}"
                logging.info(debug_str)

            if iteration % config.empty_cache_freq == 0:
                # Clear cache
                torch.cuda.empty_cache()

    global_time = global_timer.toc(False)

    debug_str = "===> Final test results:\n"
    debug_str += log_meters(meters, log_perclass_meters=True)
    logging.info(debug_str)

    if config.test_original_pointcloud:
        pipeline_model.test_original_pointcloud(save_pred_dir)

    logging.info('Finished test. Elapsed time: {:.4f}'.format(global_time))

    # Explicit memory cleanup
    if hasattr(data_iter, 'cleanup'):
        data_iter.cleanup()

    return meters
示例#19
0
文件: solver.py 项目: zjyrr/McRecon
    def train(self, generator_queue, discriminator_queue):
        ''' Given data queues, train the network '''
        # Parameter directory
        save_dir = os.path.join(cfg.DIR.OUT_PATH)
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        # Timer for the training op and parallel data loading op.
        train_timer = Timer()
        data_timer = Timer()

        start_iter = 0
        # Resume training
        if cfg.TRAIN.RESUME_TRAIN:
            self.net.load(cfg.CONST.WEIGHTS)
            start_iter = cfg.TRAIN.INITIAL_ITERATION

        # Setup learning rates
        lr = cfg.TRAIN.DEFAULT_LEARNING_RATE
        lr_steps = [int(k) for k in cfg.TRAIN.LEARNING_RATES.keys()]

        print('Set the learning rate to %f.' % lr)
        gen_lr = lr

        discriminator_losses = []
        generator_losses = []
        mask_losses = []
        generator_idx = 0
        voxel_loss = 0
        generator_loss = 0
        mask_loss = 0
        discriminator_loss = 0

        # Main training loop
        for train_ind in range(start_iter, cfg.TRAIN.NUM_ITERATION + 1):
            self.net.noise.set_value(max(1 - float(train_ind) / 20000., 1e-8))
            data_timer.tic()
            gen_img, gen_camera, _ = generator_queue.get()
            data_timer.toc()
            data_timer.tic()
            _, _, disc_voxel = discriminator_queue.get()
            data_timer.toc()

            # Apply one gradient step
            train_timer.tic()
            if self.net.discriminator_loss is not None:
                error_F, error_R = self.evaluate_discriminator(
                    gen_img, gen_camera, disc_voxel)
                if error_F > 0.2 or error_R > 0.2:
                    self.set_lr(gen_lr / 100.)
                    discriminator_loss = self.discriminator_train_loss(
                        gen_img, gen_camera, disc_voxel)
                    discriminator_losses.append(discriminator_loss)
            self.set_lr(gen_lr)
            results = self.generator_train_loss(gen_img, gen_camera)
            generator_loss = results[0]
            generator_losses.append(generator_loss)
            generator_idx += 1
            mask_loss = results[1]
            mask_losses.append(mask_loss)
            activations = results[2:]
            train_timer.toc()

            # Decrease learning rate at certain points
            if train_ind in lr_steps:
                # edict only takes string for key. Hacky way
                gen_lr = np.float(cfg.TRAIN.LEARNING_RATES[str(train_ind)])
                print('Learing rate decreased to %f: ' % gen_lr)

            # Debugging modules
            #
            # Print status, run validation, check divergence, and save model.
            if train_ind % cfg.TRAIN.PRINT_FREQ == 0:
                # Print the current loss
                get_mean = lambda x, y: np.mean(x) if x else y
                print("""%s Iter %d: Discriminator loss %f, Generator """
                      """loss %f, Mask loss %f""" %
                      (datetime.now(), train_ind,
                       get_mean(discriminator_losses, discriminator_loss),
                       get_mean(generator_losses, generator_loss),
                       get_mean(mask_losses, mask_loss)))
                discriminator_losses = []
                generator_losses = []
                mask_losses = []

            if train_ind % cfg.TRAIN.NAN_CHECK_FREQ == 0:
                # Check that the network parameters are all valid
                max_param = max_or_nan(self.net.all_params)
                if np.isnan(max_param):
                    print('NAN detected')
                    break

            if train_ind % cfg.TRAIN.SAVE_FREQ == 0 and not train_ind == 0:
                self.save(mask_losses, save_dir, train_ind)