Example #1
0
 def get_preds_dist(self, dataset='valid', with_target=False):
     """Run inference over the chosen dataset across all distributed ranks.

     Each rank evaluates an ordered shard of the dataset, the per-rank
     results are all-gathered, and only the main process receives the
     concatenated predictions (other ranks return None).

     :param dataset: 'train' to predict on the training set; any other
         value predicts on the validation set.
     :param with_target: when True, also gather and return the targets.
     :return: on the main process, ``all_outputs`` or
         ``(all_outputs, all_targets)``; ``None`` on the other ranks.
     """
     self.model.eval()
     # Pick the source DataLoader, then rebuild it with an ordered
     # distributed sampler so every rank sees a deterministic shard.
     src_dl = self.train_dl if dataset == 'train' else self.valid_dl
     sampler = OrderedDistributedSampler(src_dl.dataset,
                                         get_world_size(),
                                         rank=get_rank())
     ordered_dl = DataLoader(src_dl.dataset,
                             src_dl.batch_size,
                             shuffle=False,
                             sampler=sampler,
                             num_workers=src_dl.num_workers,
                             collate_fn=src_dl.collate_fn,
                             pin_memory=src_dl.pin_memory,
                             timeout=src_dl.timeout,
                             worker_init_fn=src_dl.worker_init_fn)
     # Only the main process shows a progress bar.
     bar = tqdm(ordered_dl) if is_main_process() else ordered_dl
     outputs = []
     targets = []
     for batch in bar:
         x, y = batch_gpu(batch)
         output = to_cpu(self.model(x))
         outputs.append(output)
         if with_target:
             targets.append(to_cpu(y))
     outputs = torch.cat(outputs)
     all_outputs = all_gather(outputs)
     if with_target:
         targets = torch.cat(targets)
         all_targets = all_gather(targets)
     if not is_main_process():
         return
     # The ordered sampler pads the last shard, so truncate to the real
     # dataset length. BUGFIX: the original always truncated to
     # len(self.valid_dl.dataset), even when dataset == 'train'.
     n = len(src_dl.dataset)
     all_outputs = torch.cat(all_outputs, dim=0).cpu()[:n]
     if with_target:
         all_targets = torch.cat(all_targets, dim=0).cpu()[:n]
         return all_outputs, all_targets
     else:
         return all_outputs
Example #2
0
 def on_epoch_end(self, epoch: int, **kwargs) -> None:
     "Compare the value monitored to its best score and maybe save the model."
     if self.every == "epoch":
         # Unconditional snapshot every epoch.
         self.learn.save(f'{self.name}_{epoch}')
         return
     # every="improvement": save only when the monitored value improves.
     c = self.get_monitor_value()
     world_size = get_world_size()
     if world_size == 1:
         current = c
         if current is not None and self.operator(current, self.best):
             print(
                 f'Better model found at epoch {epoch} with {self.monitor} value: {current}.'
             )
             self.best = current
             self.learn.save(f'{self.name}')
     else:
         # Average the monitored value over all ranks; only rank 0
         # decides whether to save.
         with torch.no_grad():
             c = torch.tensor(c).cuda()
             dist.reduce(c, dst=0)
             if get_rank() == 0:
                 current = (c / world_size).data
                 # BUGFIX: use the configured comparison operator, as in
                 # the single-GPU path above, instead of hard-coding `<`
                 # (which is wrong for monitors that should be maximized).
                 if current is not None and self.operator(
                         current, self.best):
                     print(
                         f'Better model found at epoch {epoch} with {self.monitor} value: {current}.'
                     )
                     self.best = current
                     self.learn.save(f'{self.name}')
Example #3
0
def to_cuda(x):
    """Recursively move *x* to the CUDA device indexed by the local rank.

    Anything exposing a ``.cuda`` method (tensors, modules) is moved;
    lists, tuples and dicts are traversed recursively. Any other value is
    returned unchanged — the original implementation fell through and
    returned None, silently dropping e.g. ints or strings nested inside
    containers.
    """
    if hasattr(x, 'cuda'):
        return x.cuda(device=get_rank())
    if isinstance(x, (list, tuple)):
        # NOTE: tuples intentionally come back as lists, matching the
        # original behavior.
        return [to_cuda(xi) for xi in x]
    if isinstance(x, dict):
        return {k: to_cuda(v) for k, v in x.items()}
    return x
Example #4
0
def main():
    """Training entry point: parse CLI arguments, initialize distributed
    mode when launched with multiple processes, load and freeze the
    config, set up logging, and launch training."""
    arg_parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Training")
    arg_parser.add_argument(
        "--config-file",
        type=str,
        metavar="FILE",
        default="configs/kitti/e2e_disp_rcnn_R_101_FPN_mf_2d.yaml",
        help="path to config file",
    )
    arg_parser.add_argument("--local_rank", type=int, default=0)
    arg_parser.add_argument(
        "opts",
        nargs=argparse.REMAINDER,
        default=None,
        help="Modify config options using the command-line",
    )
    args = arg_parser.parse_args()

    # The launcher (torch.distributed.launch) exports WORLD_SIZE.
    gpu_count = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = gpu_count > 1
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

    # File config first, then command-line overrides, then freeze.
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    out_dir = cfg.OUTPUT_DIR
    if out_dir:
        mkdir(out_dir)

    logger = setup_logger("disprcnn", out_dir, get_rank())
    logger.info("Using {} GPUs".format(gpu_count))
    logger.info(args)
    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as fh:
        logger.info("\n" + fh.read())
    logger.info("Running with config:\n{}".format(cfg))

    model = train(cfg, args.local_rank, args.distributed)
Example #5
0
def train(cfg, local_rank, distributed):
    """Build model, optimizer, scheduler and data loader, restore the
    latest checkpoint, then run the training loop.

    :param cfg: frozen experiment configuration.
    :param local_rank: this process's GPU index.
    :param distributed: whether to wrap the model in DDP.
    :return: the trained model.
    """
    torch.autograd.set_detect_anomaly(True)

    device = torch.device(cfg.MODEL.DEVICE)
    model = build_detection_model(cfg)
    model.to(device)

    optimizer, uncert = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)
    model = fix_parameters(model, cfg)

    if distributed:
        model = DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
            find_unused_parameters=True)

    arguments = {"iteration": 0}

    # Only rank 0 actually writes checkpoints to disk.
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         cfg.OUTPUT_DIR,
                                         get_rank() == 0)
    extra_checkpoint_data = checkpointer.load(
        cfg.MODEL.WEIGHT,
        load_optimizer=cfg.SOLVER.LOAD_OPTIMIZER,
        load_scheduler=cfg.SOLVER.LOAD_SCHEDULER)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    do_train(model, data_loader, optimizer, scheduler, checkpointer,
             torch.device(cfg.MODEL.DEVICE), cfg.SOLVER.CHECKPOINT_PERIOD,
             arguments, uncert, cfg)

    return model
Example #6
0
    def _setup_logger(self):
        """Create a DEBUG-level logger named after the concrete class.

        Non-master processes (rank > 0) get a handler-less logger so they
        stay silent; the master process logs to stdout and to
        ``<output_dir>/log.txt``.

        :return: the configured ``logging.Logger`` instance.
        """
        logger = logging.getLogger(self.__class__.__name__)
        logger.setLevel(logging.DEBUG)
        # don't log results for the non-master process
        if get_rank() > 0:
            return logger
        # BUGFIX: logging.getLogger returns a shared instance, so calling
        # this method twice used to attach duplicate handlers and print/
        # write every message multiple times. Reuse an already-configured
        # logger instead.
        if logger.handlers:
            return logger
        formatter = logging.Formatter(
            "%(asctime)s %(name)s %(levelname)s: %(message)s")

        ch = logging.StreamHandler(stream=sys.stdout)
        ch.setLevel(logging.DEBUG)
        ch.setFormatter(formatter)
        logger.addHandler(ch)

        fh = logging.FileHandler(os.path.join(self.output_dir, 'log.txt'))
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(formatter)
        logger.addHandler(fh)
        return logger
Example #7
0
def main():
    """Inference entry point: parse arguments, initialize (optionally
    distributed) runtime, restore a checkpoint, and evaluate the model on
    every configured test dataset."""
    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Inference")
    parser.add_argument(
        "--config-file",
        metavar="FILE",
        default=
        "/private/home/fmassa/github/detectron.pytorch_v2/configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml",
        help="path to config file",
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "--ckpt",
        default=None,
        help=
        "The path to the checkpoint for test, default is the latest checkpoint.",
    )
    parser.add_argument('--no_force_recompute', action='store_true')
    parser.add_argument(
        "opts",
        nargs=argparse.REMAINDER,
        default=None,
        help="Modify config options using the command-line",
    )
    args = parser.parse_args()

    num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    distributed = num_gpus > 1
    if distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    logger = setup_logger("disprcnn", "", get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(cfg)

    model = build_detection_model(cfg)
    model.to(cfg.MODEL.DEVICE)

    # Restore weights: an explicit --ckpt wins, otherwise load the
    # configured weight (falling back to the latest checkpoint on disk).
    checkpointer = DetectronCheckpointer(cfg, model, save_dir=cfg.OUTPUT_DIR)
    weight_path = cfg.MODEL.WEIGHT if args.ckpt is None else args.ckpt
    _ = checkpointer.load(weight_path, use_latest=args.ckpt is None)

    # Always evaluate boxes; add segmentation IoU when masks are enabled.
    iou_types = ("bbox", )
    if cfg.MODEL.MASK_ON:
        iou_types = iou_types + ("segm", )

    # One output folder per test dataset (None when OUTPUT_DIR is unset).
    dataset_names = cfg.DATASETS.TEST
    output_folders = [None] * len(dataset_names)
    if cfg.OUTPUT_DIR:
        for idx, dataset_name in enumerate(dataset_names):
            folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name)
            mkdir(folder)
            output_folders[idx] = folder

    data_loaders_val = make_data_loader(cfg,
                                        is_train=False,
                                        is_distributed=distributed)
    for dlv, dn, of in zip(data_loaders_val, dataset_names, output_folders):
        inference(
            model,
            dlv,
            dataset_name=dn,
            iou_types=iou_types,
            box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY,
            device=cfg.MODEL.DEVICE,
            expected_results=cfg.TEST.EXPECTED_RESULTS,
            expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
            output_folder=of,
            force_recompute=not args.no_force_recompute,
        )
        synchronize()
Example #8
0
 def __init__(self, learn):
     """Attach to *learn*; only the master process (rank 0) opens a
     TensorBoard SummaryWriter in the learner's model directory."""
     super().__init__(learn)
     if get_rank() != 0:
         return
     # flush_secs=10 so events appear in TensorBoard promptly
     self.tb_writer = SummaryWriter(learn.model_dir, flush_secs=10)
Example #9
0
def do_train(model, data_loader, optimizer, scheduler, checkpointer, device,
             checkpoint_period, arguments, uncert, cfg):
    """Run the main training loop.

    Iterates once over ``data_loader`` starting from the checkpointed
    iteration, performing forward/backward/step per batch, logging
    metrics (plus TensorBoard scalars on rank 0), and periodically saving
    checkpoints.

    :param model: the network (possibly DistributedDataParallel-wrapped).
    :param data_loader: yields (images, targets, other_fields) batches.
    :param optimizer: optimizer stepped once per successful iteration.
    :param scheduler: LR scheduler; rebuilt below when it is a
        OneCycleScheduler so it resumes from the checkpointed iteration.
    :param checkpointer: saves "model_{iter}" / "model_final" snapshots.
    :param device: device batch tensors are moved to.
    :param checkpoint_period: iterations between checkpoint saves.
    :param arguments: mutable dict persisted with checkpoints; its
        "iteration" entry is both the resume point and updated in-loop.
    :param uncert: per-loss uncertainty weights consumed by
        compute_losses (and logged to TensorBoard).
    :param cfg: frozen experiment config.
    """
    # Only rank 0 creates a TensorBoard writer.
    if get_rank() == 0:
        tb_writer = SummaryWriter(cfg.OUTPUT_DIR, flush_secs=20)
    logger = logging.getLogger("disprcnn.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    fix_model_training(model, cfg)
    start_training_time = time.time()
    end = time.time()
    grad_norm_clip = cfg.SOLVER.GRAD_CLIP
    # Recreate the one-cycle scheduler so its internal counter resumes
    # from the checkpointed iteration (last_epoch=start_iter).
    if isinstance(scheduler, OneCycleScheduler):
        scheduler = OneCycleScheduler(optimizer,
                                      cfg.SOLVER.BASE_LR,
                                      cfg.SOLVER.MAX_ITER,
                                      last_epoch=start_iter)
    # Counts iterations that completed without a tolerated exception
    # (decremented again in the except branch below).
    valid_iter = start_iter
    for it, (images, targets,
             other_fields) in enumerate(data_loader, start_iter):
        if cfg.SOLVER.PRINT_ITERATION:
            print('iteration', it)
        # Skip batches whose targets fail the sanity check.
        if not check_forward(targets):
            logger.info('check forward failed, not forwarding this iteration.')
            # NOTE(review): this decrement has no effect -- `it` is
            # rebound by enumerate at the top of the next iteration.
            it -= 1
            continue
        data_time = time.time() - end
        iteration = it + 1
        valid_iter += 1
        arguments["iteration"] = iteration
        try:
            # Move the batch onto the training device.
            images = {k: v.to(device) for k, v in images.items()}
            targets = {
                k: [t.to(device) for t in v]
                for k, v in targets.items()
            }
            if cfg.SOLVER.OFFLINE_2D_PREDICTIONS == '':
                # return idx only
                loss_dict = model(images, targets)
            else:
                # Feed precomputed offline 2D predictions alongside the batch.
                _, preds2d = other_fields
                preds2d = {
                    k: [t.to(device) for t in v]
                    for k, v in preds2d.items()
                }
                loss_dict = model(images, preds2d, targets)
            # torch.cuda.synchronize()
            # print('forward cost', time.time() - begin)
            # losses = sum(loss for loss in loss_dict.values())
            losses = compute_losses(loss_dict, cfg, uncert)
            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = reduce_loss_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            meters.update(loss=losses_reduced, **loss_dict_reduced)
            optimizer.zero_grad()
            losses.backward()
            if cfg.SOLVER.DO_GRAD_CLIP:
                clip_grad_norm_(model.parameters(), grad_norm_clip)
            optimizer.step()
            scheduler.step()
        except KeyboardInterrupt as e:
            raise e
        except Exception as e:
            # Optionally tolerate per-batch failures (e.g. bad samples)
            # instead of aborting the whole run.
            if cfg.SOLVER.ALLOW_EXCEPTION:
                print(e)
                valid_iter -= 1
            else:
                raise e

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        # ETA from the global average iteration time so far.
        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % cfg.SOLVER.PRINT_INTERVAL == 0 or iteration == max_iter:
            if get_rank() == 0:
                # The loss locals are unbound when every iteration so far
                # raised under ALLOW_EXCEPTION, hence the locals() guards.
                if 'loss_dict_reduced' in locals():
                    for k, v in loss_dict_reduced.items():
                        tb_writer.add_scalar(k,
                                             v.data.cpu().numpy(), iteration)
                if 'losses_reduced' in locals():
                    tb_writer.add_scalar('losses_reduced',
                                         losses_reduced.item(), iteration)
                tb_writer.add_scalar('lr', optimizer.param_groups[0]["lr"],
                                     iteration)
                tb_writer.add_scalar('batch_time', batch_time, iteration)

                if cfg.SOLVER.UNCERT_LOSS_WEIGHT != 0:
                    for i, a in enumerate(uncert.data.cpu().numpy()):
                        tb_writer.add_scalar('uncert' + str(i), a, iteration)
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "valid_iter: {valid_iter}",
                    "{meters}",
                    "lr: {lr:.8f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    valid_iter=valid_iter,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))
        if iteration % checkpoint_period == 0 and iteration != 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
        if iteration == max_iter:
            checkpointer.save("model_final", **arguments)
    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))