def main_worker(gpu, save_dir, args):
    """Single-GPU training worker for the HyperRegression toy example.

    Builds the model on ``gpu``, resumes from the latest checkpoint in
    ``save_dir`` if one exists, then runs the training loop with periodic
    scatter-plot visualizations and checkpoint saves.

    Args:
        gpu: CUDA device index used for this worker.
        save_dir: directory holding checkpoints and an ``images/`` subfolder.
        args: parsed command-line argument namespace.
    """
    # basic setup
    cudnn.benchmark = True
    args.gpu = gpu
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    model = HyperRegression(args)

    torch.cuda.set_device(args.gpu)
    model = model.cuda(args.gpu)
    start_epoch = 0
    optimizer = model.make_optimizer(args)
    # When no explicit checkpoint is requested, fall back to the most recent
    # one in save_dir so interrupted runs resume automatically.
    if args.resume_checkpoint is None and os.path.exists(
            os.path.join(save_dir, 'checkpoint-latest.pt')):
        args.resume_checkpoint = os.path.join(
            save_dir, 'checkpoint-latest.pt')  # use the latest checkpoint
    if args.resume_checkpoint is not None:
        if args.resume_optimizer:
            model, optimizer, start_epoch = resume(
                args.resume_checkpoint,
                model,
                optimizer,
                strict=(not args.resume_non_strict))
        else:
            # Restore weights only; keep the freshly created optimizer.
            model, _, start_epoch = resume(args.resume_checkpoint,
                                           model,
                                           optimizer=None,
                                           strict=(not args.resume_non_strict))
        print('Resumed from: ' + args.resume_checkpoint)

    # main training loop
    start_time = time.time()
    point_nats_avg_meter = AverageValueMeter()
    if args.distributed:
        print("[Rank %d] World size : %d" % (args.rank, dist.get_world_size()))

    print("Start epoch: %d End epoch: %d" % (start_epoch, args.epochs))
    for epoch in range(start_epoch, args.epochs):
        print("Epoch starts:")
        # NOTE(review): the dataset and loader are rebuilt every epoch; kept
        # as-is in case ExampleData() resamples fresh data each time — confirm.
        data = ExampleData()
        train_loader = torch.utils.data.DataLoader(dataset=data,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   num_workers=0,
                                                   pin_memory=True)
        for bidx, data in enumerate(train_loader):
            x, y = data
            x = x.float().to(args.gpu).unsqueeze(1)
            y = y.float().to(args.gpu).unsqueeze(1).unsqueeze(2)
            # Global step counts batches across all epochs.
            step = bidx + len(train_loader) * epoch
            model.train()
            # The model performs its own optimization step internally and
            # returns the reconstruction loss (nats per point).
            recon_nats = model(x, y, optimizer, step, None)
            point_nats_avg_meter.update(recon_nats.item())
            if step % args.log_freq == 0:
                duration = time.time() - start_time
                start_time = time.time()
                print(
                    "[Rank %d] Epoch %d Batch [%2d/%2d] Time [%3.2fs] PointNats %2.5f"
                    % (args.rank, epoch, bidx, len(train_loader), duration,
                       point_nats_avg_meter.avg))
        # save visualizations
        kk = 3  # upper bound of the x-range shown in the scatter plot
        if (epoch + 1) % args.viz_freq == 0:
            # reconstructions: decode 100 samples for each of 100 x positions
            model.eval()
            x = torch.from_numpy(np.linspace(0, kk, num=100)).float().to(
                args.gpu).unsqueeze(1)
            _, y = model.decode(x, 100)
            x = x.cpu().detach().numpy()
            y = y.cpu().detach().numpy()
            # Repeat each x 100 times so x/y pair up one-to-one when flattened.
            x = np.expand_dims(x, 1).repeat(100, axis=1).flatten()
            y = y.flatten()
            fig, _ = plt.subplots(1, 1, figsize=(12, 12))
            plt.xlim([0, kk])
            plt.ylim([-2, 2])
            plt.scatter(x, y)
            plt.savefig(
                os.path.join(
                    save_dir, 'images',
                    'tr_vis_sampled_epoch%d-gpu%s.png' % (epoch, args.gpu)))
            # Close the figure instead of plt.clf(): clf() only clears the
            # axes, so creating a new figure every viz epoch leaked memory.
            plt.close(fig)
        if (epoch + 1) % args.save_freq == 0:
            save(model, optimizer, epoch + 1,
                 os.path.join(save_dir, 'checkpoint-%d.pt' % epoch))
            save(model, optimizer, epoch + 1,
                 os.path.join(save_dir, 'checkpoint-latest.pt'))
# ===== Example 2 (separate snippet; extraction artifact removed) =====
def main_worker(gpu, save_dir, ngpus_per_node, args):
    """Per-process training worker for PointFlow.

    Sets up optionally-distributed training on a single GPU (or DataParallel
    across all GPUs), restores the latest checkpoint if present, builds the
    data loaders and LR scheduler, then runs the train / visualize / save loop.

    Args:
        gpu: local GPU index for this process; ``None`` selects
            DataParallel over all visible GPUs.
        save_dir: directory for checkpoints and an ``images/`` subfolder.
        ngpus_per_node: GPUs per node; used to derive the global rank and to
            split the per-node batch size in distributed mode.
        args: parsed command-line argument namespace.
    """
    # basic setup
    cudnn.benchmark = True
    args.gpu = gpu
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        # Global rank = node rank * GPUs per node + local GPU index.
        # (The original code re-checked args.distributed here; the check was
        # redundant inside this branch and has been removed.)
        args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    if args.log_name is not None:
        log_dir = "runs/%s" % args.log_name
    else:
        log_dir = "runs/time-%d" % time.time()

    # Only the first process per node writes TensorBoard summaries.
    if not args.distributed or (args.rank % ngpus_per_node == 0):
        writer = SummaryWriter(logdir=log_dir)
    else:
        writer = None

    if not args.use_latent_flow:  # auto-encoder only
        args.prior_weight = 0
        args.entropy_weight = 0

    # multi-GPU setup
    model = PointFlow(args)
    if args.distributed:  # Multiple processes, single GPU per process
        if args.gpu is not None:

            def _transform_(m):
                return nn.parallel.DistributedDataParallel(
                    m,
                    device_ids=[args.gpu],
                    output_device=args.gpu,
                    check_reduction=True)

            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            model.multi_gpu_wrapper(_transform_)
            # Split the per-node batch across the processes on this node.
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = 0
        else:
            assert 0, "DistributedDataParallel constructor should always set the single device scope"
    elif args.gpu is not None:  # Single process, single GPU per process
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:  # Single process, multiple GPUs per process

        def _transform_(m):
            return nn.DataParallel(m)

        model = model.cuda()
        model.multi_gpu_wrapper(_transform_)

    # resume checkpoints
    start_epoch = 0
    optimizer = model.make_optimizer(args)
    # When no explicit checkpoint is requested, fall back to the most recent
    # one in save_dir so interrupted runs resume automatically.
    if args.resume_checkpoint is None and os.path.exists(
            os.path.join(save_dir, 'checkpoint-latest.pt')):
        args.resume_checkpoint = os.path.join(
            save_dir, 'checkpoint-latest.pt')  # use the latest checkpoint
    if args.resume_checkpoint is not None:
        if args.resume_optimizer:
            model, optimizer, start_epoch = resume(
                args.resume_checkpoint,
                model,
                optimizer,
                strict=(not args.resume_non_strict))
        else:
            # Restore weights only; keep the freshly created optimizer.
            model, _, start_epoch = resume(args.resume_checkpoint,
                                           model,
                                           optimizer=None,
                                           strict=(not args.resume_non_strict))
        print('Resumed from: ' + args.resume_checkpoint)

    # initialize datasets and loaders
    tr_dataset = MyDataset(args.data_dir, istest=False)
    te_dataset = MyDataset(args.data_dir, istest=True)
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            tr_dataset)
    else:
        train_sampler = None

    # shuffle and sampler are mutually exclusive in DataLoader.
    train_loader = torch.utils.data.DataLoader(dataset=tr_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=0,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               drop_last=True,
                                               worker_init_fn=init_np_seed)
    test_loader = torch.utils.data.DataLoader(dataset=te_dataset,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              num_workers=0,
                                              pin_memory=True,
                                              drop_last=False,
                                              worker_init_fn=init_np_seed)

    # save dataset statistics (disabled)
    # if not args.distributed or (args.rank % ngpus_per_node == 0):
    #     np.save(os.path.join(save_dir, "train_set_mean.npy"), tr_dataset.all_points_mean)
    #     np.save(os.path.join(save_dir, "train_set_std.npy"), tr_dataset.all_points_std)
    #     np.save(os.path.join(save_dir, "train_set_idx.npy"), np.array(tr_dataset.shuffle_idx))
    #     np.save(os.path.join(save_dir, "val_set_mean.npy"), te_dataset.all_points_mean)
    #     np.save(os.path.join(save_dir, "val_set_std.npy"), te_dataset.all_points_std)
    #     np.save(os.path.join(save_dir, "val_set_idx.npy"), np.array(te_dataset.shuffle_idx))

    # load classification dataset if needed
    if args.eval_classification:
        from datasets import get_clf_datasets

        def _make_data_loader_(dataset):
            return torch.utils.data.DataLoader(dataset=dataset,
                                               batch_size=args.batch_size,
                                               shuffle=False,
                                               num_workers=0,
                                               pin_memory=True,
                                               drop_last=False,
                                               worker_init_fn=init_np_seed)

        clf_datasets = get_clf_datasets(args)
        clf_loaders = {
            k: [_make_data_loader_(ds) for ds in ds_lst]
            for k, ds_lst in clf_datasets.items()
        }
    else:
        clf_loaders = None

    # initialize the learning rate scheduler
    if args.scheduler == 'exponential':
        scheduler = optim.lr_scheduler.ExponentialLR(optimizer, args.exp_decay)
    elif args.scheduler == 'step':
        scheduler = optim.lr_scheduler.StepLR(optimizer,
                                              step_size=args.epochs // 2,
                                              gamma=0.1)
    elif args.scheduler == 'linear':

        def lambda_rule(ep):
            # Constant LR for the first half of training, then a linear
            # decay towards 0 over the second half.
            lr_l = 1.0 - max(0, ep - 0.5 * args.epochs) / float(
                0.5 * args.epochs)
            return lr_l

        scheduler = optim.lr_scheduler.LambdaLR(optimizer,
                                                lr_lambda=lambda_rule)
    else:
        assert 0, "args.schedulers should be either 'exponential' or 'linear'"

    # main training loop
    start_time = time.time()
    entropy_avg_meter = AverageValueMeter()
    latent_nats_avg_meter = AverageValueMeter()
    point_nats_avg_meter = AverageValueMeter()
    if args.distributed:
        print("[Rank %d] World size : %d" % (args.rank, dist.get_world_size()))

    print("Start epoch: %d End epoch: %d" % (start_epoch, args.epochs))
    for epoch in range(start_epoch, args.epochs):
        if args.distributed:
            # Reseed the sampler so each epoch sees a different shard order.
            train_sampler.set_epoch(epoch)

        # adjust the learning rate
        if (epoch + 1) % args.exp_decay_freq == 0:
            scheduler.step(epoch=epoch)
            if writer is not None:
                writer.add_scalar('lr/optimizer', scheduler.get_lr()[0], epoch)

        # train for one epoch
        for bidx, data in enumerate(train_loader):
            idx_batch, tr_batch, te_batch = data['idx'], data[
                'train_points'], data['test_points']
            # Global step counts batches across all epochs.
            step = bidx + len(train_loader) * epoch
            model.train()
            inputs = tr_batch.cuda(args.gpu, non_blocking=True)
            # The model performs its own optimization step internally and
            # returns its loss components.
            out = model(inputs, optimizer, step, writer)
            entropy, prior_nats, recon_nats = out['entropy'], out[
                'prior_nats'], out['recon_nats']
            entropy_avg_meter.update(entropy)
            point_nats_avg_meter.update(recon_nats)
            latent_nats_avg_meter.update(prior_nats)
            if step % args.log_freq == 0:
                duration = time.time() - start_time
                start_time = time.time()
                print(
                    "[Rank %d] Epoch %d Batch [%2d/%2d] Time [%3.2fs] Entropy %2.5f LatentNats %2.5f PointNats %2.5f"
                    % (args.rank, epoch, bidx, len(train_loader), duration,
                       entropy_avg_meter.avg, latent_nats_avg_meter.avg,
                       point_nats_avg_meter.avg))

        # evaluate on the validation set (disabled)
        # if not args.no_validation and (epoch + 1) % args.val_freq == 0:
        #     from utils import validate
        #     validate(test_loader, model, epoch, writer, save_dir, args, clf_loaders=clf_loaders)

        # save visualizations
        if (epoch + 1) % args.viz_freq == 0:
            # reconstructions of the last training batch
            model.eval()
            samples = model.reconstruct(inputs)
            results = []
            for idx in range(min(10, inputs.size(0))):
                res = visualize_point_clouds(samples[idx], inputs[idx], idx)
                results.append(res)
            res = np.concatenate(results, axis=1)
            # NOTE(review): scipy.misc.imsave was removed in scipy >= 1.2;
            # this code requires an old scipy (plus Pillow) — confirm pin.
            scipy.misc.imsave(
                os.path.join(
                    save_dir, 'images',
                    'tr_vis_conditioned_epoch%d-gpu%s.png' %
                    (epoch, args.gpu)), res.transpose((1, 2, 0)))
            if writer is not None:
                writer.add_image('tr_vis/conditioned', torch.as_tensor(res),
                                 epoch)

            # samples drawn from the latent prior
            if args.use_latent_flow:
                num_samples = min(10, inputs.size(0))
                num_points = inputs.size(1)
                _, samples = model.sample(num_samples, num_points)
                results = []
                for idx in range(num_samples):
                    res = visualize_point_clouds(samples[idx], inputs[idx],
                                                 idx)
                    results.append(res)
                res = np.concatenate(results, axis=1)
                # Bug fix: this image previously reused the
                # 'tr_vis_conditioned_...' filename and overwrote the
                # reconstruction image saved above.
                scipy.misc.imsave(
                    os.path.join(
                        save_dir, 'images',
                        'tr_vis_sampled_epoch%d-gpu%s.png' %
                        (epoch, args.gpu)), res.transpose((1, 2, 0)))
                if writer is not None:
                    writer.add_image('tr_vis/sampled', torch.as_tensor(res),
                                     epoch)

        # save checkpoints (only from the first process per node)
        if not args.distributed or (args.rank % ngpus_per_node == 0):
            if (epoch + 1) % args.save_freq == 0:
                save(model, optimizer, epoch + 1,
                     os.path.join(save_dir, 'checkpoint-%d.pt' % epoch))
                save(model, optimizer, epoch + 1,
                     os.path.join(save_dir, 'checkpoint-latest.pt'))
# ===== Example 3 (separate snippet; extraction artifact removed) =====
        extrinsic = extrinsic.cuda()

        # Split the camera extrinsic into rotation (Bx3x3) and translation
        # (Bx1x3, unsqueezed so it broadcasts over the point dimension).
        R = extrinsic[:, 0:3, 0:3]
        t = extrinsic[:, 0:3, 3].unsqueeze(1)

        # Forward pass
        R_pred, t_pred = network(image)

        # Loss computation: compare the point cloud transformed by the
        # predicted pose against the same cloud transformed by the
        # ground-truth pose, rather than comparing pose parameters directly.
        xyz_rot = transformation(xyz, R_pred, t_pred)
        xyz_rot_gt = transformation(xyz, R, t)

        batch_loss = loss(xyz_rot, xyz_rot_gt)

        # NOTE(review): no optimizer.zero_grad() is visible in this excerpt;
        # presumably it happens earlier in the loop — confirm upstream.
        batch_loss.backward()
        train_loss.update(batch_loss.item())
        # Gradients are clipped after backward() and before step(), as required.
        torch.nn.utils.clip_grad_norm_(network.parameters(),
                                       0.1)  # Clip gradients
        optimizer.step()  # gradient update

        print('[%d: %d/%d] train loss:  %f' %
              (epoch, i, len_dataset / opt.batch_size, batch_loss.item()))

        # VALIDATION
    test_loss.reset()
    network.eval()
    with torch.no_grad():  # disable autograd bookkeeping during evaluation
        for i, data in enumerate(dataloader_test, 0):
            # Load data
            points, image, _, extrinsic, _ = data
# ===== Example 4 (separate snippet; extraction artifact removed) =====
def main_worker(gpu, save_dir, ngpus_per_node, args):
    """Single-GPU training worker for HyperRegression on the SDD dataset.

    Builds the model on ``gpu``, resumes from the latest checkpoint in
    ``save_dir`` if one exists, then trains with periodic hypothesis
    visualizations on the test split and periodic checkpoint saves.

    Args:
        gpu: CUDA device index used for this worker.
        save_dir: directory for checkpoints and an ``images/`` subfolder.
        ngpus_per_node: accepted for signature compatibility; not used here.
        args: parsed command-line argument namespace.
    """
    # basic setup
    cudnn.benchmark = True
    normalize = False  # passed through to SDDData and draw_hyps below
    args.gpu = gpu
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    model = HyperRegression(args)

    torch.cuda.set_device(args.gpu)
    model = model.cuda(args.gpu)
    start_epoch = 0
    optimizer = model.make_optimizer(args)
    # When no explicit checkpoint is requested, fall back to the most recent
    # one in save_dir so interrupted runs resume automatically.
    if args.resume_checkpoint is None and os.path.exists(
            os.path.join(save_dir, 'checkpoint-latest.pt')):
        args.resume_checkpoint = os.path.join(
            save_dir, 'checkpoint-latest.pt')  # use the latest checkpoint
    if args.resume_checkpoint is not None:
        if args.resume_optimizer:
            model, optimizer, start_epoch = resume(
                args.resume_checkpoint,
                model,
                optimizer,
                strict=(not args.resume_non_strict))
        else:
            # Restore weights only; keep the freshly created optimizer.
            model, _, start_epoch = resume(args.resume_checkpoint,
                                           model,
                                           optimizer=None,
                                           strict=(not args.resume_non_strict))
        print('Resumed from: ' + args.resume_checkpoint)

    # initialize datasets and loaders

    # initialize the learning rate scheduler
    if args.scheduler == 'exponential':
        scheduler = optim.lr_scheduler.ExponentialLR(optimizer, args.exp_decay)
    elif args.scheduler == 'step':
        scheduler = optim.lr_scheduler.StepLR(optimizer,
                                              step_size=args.epochs // 2,
                                              gamma=0.1)
    elif args.scheduler == 'linear':

        def lambda_rule(ep):
            # Constant LR for the first half of training, then a linear
            # decay towards 0 over the second half.
            lr_l = 1.0 - max(0, ep - 0.5 * args.epochs) / float(
                0.5 * args.epochs)
            return lr_l

        scheduler = optim.lr_scheduler.LambdaLR(optimizer,
                                                lr_lambda=lambda_rule)
    else:
        assert 0, "args.schedulers should be either 'exponential' or 'linear'"

    # main training loop
    start_time = time.time()
    entropy_avg_meter = AverageValueMeter()
    latent_nats_avg_meter = AverageValueMeter()
    point_nats_avg_meter = AverageValueMeter()
    if args.distributed:
        print("[Rank %d] World size : %d" % (args.rank, dist.get_world_size()))

    print("Start epoch: %d End epoch: %d" % (start_epoch, args.epochs))
    data = SDDData(split='train', normalize=normalize, root=args.data_dir)
    data_test = SDDData(split='test', normalize=normalize, root=args.data_dir)
    train_loader = torch.utils.data.DataLoader(dataset=data,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=0,
                                               pin_memory=True)
    # batch_size=1 and shuffle=False: the visualization loop below relies on
    # bidx matching the index into the test scene's sequence list.
    test_loader = torch.utils.data.DataLoader(dataset=data_test,
                                              batch_size=1,
                                              shuffle=False,
                                              num_workers=0,
                                              pin_memory=True)
    for epoch in range(start_epoch, args.epochs):
        # adjust the learning rate
        if (epoch + 1) % args.exp_decay_freq == 0:
            scheduler.step(epoch=epoch)

        # train for one epoch
        print("Epoch starts:")
        for bidx, data in enumerate(train_loader):
            # if bidx < 2:
            x, y = data
            #y = y.float().to(args.gpu).unsqueeze(1).repeat(1, 10).unsqueeze(2)
            x = x.float().to(args.gpu)
            y = y.float().to(args.gpu).unsqueeze(1)
            # Replicate each target 20 times and jitter with unit Gaussian
            # noise — presumably to train the hypothesis distribution on a
            # spread around the ground truth; confirm intent with authors.
            y = y.repeat(1, 20, 1)
            y += torch.randn(y.shape[0], y.shape[1], y.shape[2]).to(args.gpu)
            # Global step counts batches across all epochs.
            step = bidx + len(train_loader) * epoch
            model.train()
            # The model performs its own optimization step internally and
            # returns the reconstruction loss (nats per point).
            recon_nats = model(x, y, optimizer, step, None)
            point_nats_avg_meter.update(recon_nats.item())
            if step % args.log_freq == 0:
                duration = time.time() - start_time
                start_time = time.time()
                print(
                    "[Rank %d] Epoch %d Batch [%2d/%2d] Time [%3.2fs] PointNats %2.5f"
                    % (args.rank, epoch, bidx, len(train_loader), duration,
                       point_nats_avg_meter.avg))
                # print("Memory")
                # print(process.memory_info().rss / (1024.0 ** 3))
        # save visualizations
        if (epoch + 1) % args.viz_freq == 0:
            # reconstructions: draw 100 predicted hypotheses per test image
            model.eval()
            for bidx, data in enumerate(test_loader):
                x, _ = data
                x = x.float().to(args.gpu)
                _, y_pred = model.decode(x, 100)
                y_pred = y_pred.cpu().detach().numpy().squeeze()
                # y_pred[y_pred < 0] = 0
                # y_pred[y_pred >= 0.98] = 0.98
                # Look up the raw test sequence corresponding to this loader
                # index (relies on shuffle=False, batch_size=1 above).
                testing_sequence = data_test.dataset.scenes[
                    data_test.test_id].sequences[bidx]
                objects_list = []
                # First 3 objects are context; the last one is ground truth.
                for k in range(3):
                    objects_list.append(
                        decode_obj(testing_sequence.objects[k],
                                   testing_sequence.id))
                objects = np.stack(objects_list, axis=0)
                gt_object = decode_obj(testing_sequence.objects[-1],
                                       testing_sequence.id)
                drawn_img_hyps = draw_hyps(testing_sequence.imgs[-1], y_pred,
                                           gt_object, objects, normalize)
                cv2.imwrite(
                    os.path.join(save_dir, 'images',
                                 str(bidx) + '-' + str(epoch) + '-hyps.jpg'),
                    drawn_img_hyps)
        if (epoch + 1) % args.save_freq == 0:
            save(model, optimizer, epoch + 1,
                 os.path.join(save_dir, 'checkpoint-%d.pt' % epoch))
            save(model, optimizer, epoch + 1,
                 os.path.join(save_dir, 'checkpoint-latest.pt'))
# ===== Example 5 (separate snippet; extraction artifact removed) =====
    mean_dist1 = 0
    # Average dist1 only over each sample's masked (valid) points, since the
    # number of valid points can differ per batch element; dividing by B_size
    # inside the loop yields the batch mean incrementally.
    for k in range(B_size):
      dist1_per_batch = dist1[k]
      mask_per_batch = mask_per_pts[k, :]
      mean_dist1 += torch.mean(dist1_per_batch[mask_per_batch]) / B_size

    # Finally here is the final loss computation:
    # both sides of Chamfer + BCE on occupancy grids
    loss_ch = torch.mean(dist2) + mean_dist1
    loss_occ = F.binary_cross_entropy(occupancy, target_occupancy)
    # Occupancy term weighted 100x relative to Chamfer — presumably a tuned
    # hyperparameter; confirm before changing.
    loss_net = loss_ch + 100.0 * loss_occ

    loss_net.backward()
    optimizer.step()  # gradient update

    # Track running averages of the total loss and its two components.
    total_train_loss.update(loss_net.item())
    chd_train_loss.update(loss_ch.item())
    occ_train_loss.update(loss_occ.item())

    # VISUALIZE
    # For non-negative i, `i % 200 <= 0` is equivalent to `i % 200 == 0`,
    # i.e. dump point clouds every 200th iteration.
    if i % 200 <= 0:
      print("Storing to file...")
      save_pointcloud(points[0].data.cpu(),
                      os.path.join(output_folder, f'train_GT_{epoch}_{i}.ply'))
      save_pointcloud(points_flat[0][mask_per_pts[0]].data.cpu(),
                      os.path.join(output_folder, f'train_output_{epoch}_{i}.ply'))
      save_image(img[0], os.path.join(output_folder, f'train_input_{epoch}_{i}.png'))

    print('[%d: %d/%d] Train Chamfer Loss:  %f, Train Occupancy Loss:  %f  ' % (
      epoch, i, len_dataset / opt.batch_size, loss_ch.item(), loss_occ.item()))
# ===== Example 6 (separate snippet; extraction artifact removed) =====
    # Prevent from reaching 0 (otherwise cannot take log)
    z_fake = torch.clamp(z_fake, min=0.001, max=1.)

    # Compute losses
    depth_loss = depth_criterion(z_fake, z)

    # Image-gradient maps (y/x direction) of ground-truth and predicted depth.
    grad_real, grad_fake = imgrad_yx(z), imgrad_yx(z_fake)
    # The boolean factors (epoch>3) / (epoch>7) multiply by 0 or 1, disabling
    # the gradient and normal terms during the respective warm-up epochs.
    grad_loss = grad_criterion(grad_fake, grad_real)     * grad_factor * (epoch>3)
    normal_loss = normal_criterion(grad_fake, grad_real) * normal_factor * (epoch>7)

    loss = depth_loss + grad_loss + normal_loss
    loss.backward()
    optimizer.step()  # gradient update

    # Track running averages of the total loss and its components.
    train_total.update(loss.item())
    train_logRMSE.update(depth_loss.item())
    train_grad.update(grad_loss.item())
    train_normal.update(normal_loss.item())

    # Print info
    print("[epoch %2d][iter %4d] loss: %.4f , RMSElog: %.4f , grad_loss: %.4f , normal_loss: %.4f" \
          % (epoch, i, loss, depth_loss, grad_loss, normal_loss))

    # VISUALIZE
    # On the first batch of each epoch, save the first and last sample's
    # input image, ground-truth depth, and predicted depth.
    if i == 0:
      for idx in [0, img.shape[0]-1]:
        save_image(img[idx],    os.path.join(output_folder, f'train_input_{epoch}_{idx}.png'))
        save_image(z[idx],      os.path.join(output_folder, f'train_GT_{epoch}_{idx}.png'))
        save_image(z_fake[idx], os.path.join(output_folder, f'train_pred_{epoch}_{idx}.png'))
# ===== Example 7 (separate snippet; extraction artifact removed) =====
        # Then map back to the scaling used in DISN
        # assumes T[0, 0, 0] holds a uniform scale factor — TODO confirm
        scale = T[0, 0, 0]
        pointsReconstructed = pointsReconstructed / scale
        gt_points = gt_points / scale

        # Flatten any intermediate dimensions into a (B, N, 3) point cloud.
        points_flat = pointsReconstructed.reshape(B_size, -1, 3)

        # In case the output is empty, just randomly put 100 points
        # (uniform in [-0.5, 0.5]^3) so the metric calls below don't crash.
        if points_flat.shape[1] == 0:
            print(f'Error: for shape {mesh_name}, the output is empty')
            points_flat = torch.rand(1, 100, 3).cuda() - 0.5

        ##### f-score computation
        # Each metric is tracked both overall and per category.
        f_score_value = test_f_score(points_flat, gt_points,
                                     opt.pts_for_fscore).item()
        overall_f_score_5_percent.update(f_score_value)
        per_cat_f_score_5_percent[cat].update(f_score_value)

        ##### Chamfer loss
        chd_value = test_chamfer(points_flat, gt_points,
                                 opt.pts_for_chd).item()
        overall_chd_loss.update(chd_value)
        per_cat_chd_loss[cat].update(chd_value)

        ##### IoU computation
        iou_value = test_shellIoU(points_flat, gt_points,
                                  opt.pts_for_IoU).item()
        overall_iou_loss.update(iou_value)
        per_cat_iou_loss[cat].update(iou_value)

        # Save output point clouds for the first 10 objects per category