def get_preds_dist(self, dataset='valid', with_target=False):
    """Run the model over `dataset` ('train' or 'valid') under DDP and
    gather the predictions (and optionally the targets) on the main process."""
    self.model.eval()
    # Rebuild the loader with an OrderedDistributedSampler so each rank sees
    # a deterministic, non-shuffled shard of the dataset.
    dl = self.train_dl if dataset == 'train' else self.valid_dl
    sampler = OrderedDistributedSampler(dl.dataset, get_world_size(),
                                        rank=get_rank())
    ordered_dist_dl = DataLoader(dl.dataset,
                                 dl.batch_size,
                                 shuffle=False,
                                 sampler=sampler,
                                 num_workers=dl.num_workers,
                                 collate_fn=dl.collate_fn,
                                 pin_memory=dl.pin_memory,
                                 timeout=dl.timeout,
                                 worker_init_fn=dl.worker_init_fn)
    # Only the main process shows a progress bar.
    bar = tqdm(ordered_dist_dl) if is_main_process() else ordered_dist_dl
    outputs = []
    targets = []
    for batch in bar:
        x, y = batch_gpu(batch)
        output = self.model(x)
        outputs.append(to_cpu(output))
        if with_target:
            targets.append(to_cpu(y))
    outputs = torch.cat(outputs)
    all_outputs = all_gather(outputs)
    if with_target:
        targets = torch.cat(targets)
        all_targets = all_gather(targets)
    if not is_main_process():
        return
    # The sampler pads the dataset so it divides evenly across ranks;
    # truncate to the true length of the evaluated dataset to drop the
    # padding samples. (The original hard-coded len(self.valid_dl.dataset)
    # here, which was wrong for dataset='train'.)
    all_outputs = torch.cat(all_outputs, dim=0).cpu()[:len(dl.dataset)]
    if with_target:
        all_targets = torch.cat(all_targets, dim=0).cpu()[:len(dl.dataset)]
        return all_outputs, all_targets
    return all_outputs
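# --- Illustrative sketch (not part of the trainer) of the gather-then-truncate
# pattern used by get_preds_dist above, assuming OrderedDistributedSampler
# assigns each rank a contiguous, in-order shard (which the simple
# cat-then-truncate logic implies) and pads the dataset so it divides evenly
# across ranks. All names below are hypothetical.
import torch


def merge_ordered_shards(shards, dataset_len):
    """Concatenate per-rank output shards (already in rank order) and drop
    the padded tail samples."""
    return torch.cat(shards, dim=0)[:dataset_len]


# e.g. 2 ranks over a 5-sample dataset padded to 6: rank 1's last item is padding.
shard_rank0 = torch.arange(3)          # outputs for samples 0, 1, 2
shard_rank1 = torch.tensor([3, 4, 0])  # outputs for samples 3, 4, <pad>
assert merge_ordered_shards([shard_rank0, shard_rank1], 5).tolist() == [0, 1, 2, 3, 4]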
def on_epoch_end(self, epoch: int, **kwargs) -> None:
    "Compare the value monitored to its best score and maybe save the model."
    if self.every == "epoch":
        self.learn.save(f'{self.name}_{epoch}')
        return
    # every="improvement"
    c = self.get_monitor_value()
    world_size = get_world_size()
    if world_size == 1:
        current = c
        if current is not None and self.operator(current, self.best):
            print(f'Better model found at epoch {epoch} '
                  f'with {self.monitor} value: {current}.')
            self.best = current
            self.learn.save(f'{self.name}')
    else:
        # Average the monitored value across ranks: dist.reduce sums the
        # per-rank values onto rank 0, so dividing by world_size gives the mean.
        with torch.no_grad():
            c = torch.tensor(c).cuda()
            dist.reduce(c, dst=0)
        if get_rank() == 0:
            current = (c / world_size).data
            # NOTE: unlike the single-GPU branch, this comparison hard-codes
            # "lower is better" (e.g. a loss) instead of using self.operator.
            if current is not None and current < self.best:
                print(f'Better model found at epoch {epoch} '
                      f'with {self.monitor} value: {current}.')
                self.best = current
                self.learn.save(f'{self.name}')
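# A minimal sketch of the cross-rank averaging performed above: dist.reduce
# with its default op (SUM) accumulates the per-rank metric onto rank 0, so
# dividing by the world size yields the mean. Assumes an initialized process
# group and one CUDA device per process; the helper name is hypothetical.
import torch
import torch.distributed as dist


def mean_metric_on_rank0(value):
    """Return the mean of a scalar metric across ranks (meaningful on rank 0)."""
    t = torch.tensor(float(value), device='cuda')
    dist.reduce(t, dst=0)  # default op is ReduceOp.SUM
    return t / dist.get_world_size()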
def to_cuda(x):
    """Recursively move tensors (and anything with a .cuda method) in nested
    lists/tuples/dicts to this process's GPU.

    NOTE: device=get_rank() assumes one GPU per process on a single node,
    i.e. the global rank doubles as the local device index."""
    if hasattr(x, 'cuda'):
        return x.cuda(device=get_rank())
    elif isinstance(x, (list, tuple)):
        return [to_cuda(xi) for xi in x]
    elif isinstance(x, dict):
        return {k: to_cuda(v) for k, v in x.items()}
    # Plain values (ints, strings, ...) pass through unchanged; the original
    # fell off the end here and silently returned None for them.
    return x
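# Example use of to_cuda on a nested batch (hypothetical data): tensors are
# moved to this process's GPU, containers are rebuilt, and plain values such
# as strings pass through unchanged.
#
#   batch = {'img': torch.randn(2, 3), 'meta': [torch.zeros(1), 'kitti']}
#   batch = to_cuda(batch)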
def main():
    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Training")
    parser.add_argument(
        "--config-file",
        default="configs/kitti/e2e_disp_rcnn_R_101_FPN_mf_2d.yaml",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = parser.parse_args()

    # torch.distributed.launch sets WORLD_SIZE; fall back to single-GPU mode.
    num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("disprcnn", output_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)
    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    model = train(cfg, args.local_rank, args.distributed)
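# Typical invocation (the script path is illustrative; adjust to this repo's
# layout). torch.distributed.launch sets WORLD_SIZE and passes --local_rank
# to each worker, which is exactly what main() above relies on to detect and
# initialize distributed training:
#
#   python -m torch.distributed.launch --nproc_per_node=4 tools/train_net.py \
#       --config-file configs/kitti/e2e_disp_rcnn_R_101_FPN_mf_2d.yaml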
def train(cfg, local_rank, distributed):
    torch.autograd.set_detect_anomaly(True)
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer, uncert = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)
    model = fix_parameters(model, cfg)

    if distributed:
        model = DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
            find_unused_parameters=True)

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR
    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(
        cfg.MODEL.WEIGHT,
        load_optimizer=cfg.SOLVER.LOAD_OPTIMIZER,
        load_scheduler=cfg.SOLVER.LOAD_SCHEDULER)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(model, data_loader, optimizer, scheduler, checkpointer, device,
             checkpoint_period, arguments, uncert, cfg)

    return model
def _setup_logger(self):
    logger = logging.getLogger(self.__class__.__name__)
    logger.setLevel(logging.DEBUG)
    # Don't attach handlers on non-master processes, so only rank 0 logs.
    if get_rank() > 0:
        return logger
    formatter = logging.Formatter(
        "%(asctime)s %(name)s %(levelname)s: %(message)s")
    ch = logging.StreamHandler(stream=sys.stdout)
    ch.setLevel(logging.DEBUG)
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    fh = logging.FileHandler(os.path.join(self.output_dir, 'log.txt'))
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    return logger
def main():
    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Inference")
    parser.add_argument(
        "--config-file",
        default="/private/home/fmassa/github/detectron.pytorch_v2/configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "--ckpt",
        help="The path to the checkpoint for test, default is the latest checkpoint.",
        default=None,
    )
    parser.add_argument('--no_force_recompute', action='store_true')
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = parser.parse_args()

    num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    distributed = num_gpus > 1
    if distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    save_dir = ""
    logger = setup_logger("disprcnn", save_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(cfg)

    model = build_detection_model(cfg)
    model.to(cfg.MODEL.DEVICE)

    output_dir = cfg.OUTPUT_DIR
    checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir)
    ckpt = cfg.MODEL.WEIGHT if args.ckpt is None else args.ckpt
    _ = checkpointer.load(ckpt, use_latest=args.ckpt is None)

    iou_types = ("bbox",)
    if cfg.MODEL.MASK_ON:
        iou_types = iou_types + ("segm",)

    output_folders = [None] * len(cfg.DATASETS.TEST)
    dataset_names = cfg.DATASETS.TEST
    if cfg.OUTPUT_DIR:
        for idx, dataset_name in enumerate(dataset_names):
            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference",
                                         dataset_name)
            mkdir(output_folder)
            output_folders[idx] = output_folder

    data_loaders_val = make_data_loader(cfg,
                                        is_train=False,
                                        is_distributed=distributed)
    for dlv, dn, of in zip(data_loaders_val, dataset_names, output_folders):
        inference(
            model,
            dlv,
            dataset_name=dn,
            iou_types=iou_types,
            box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY,
            device=cfg.MODEL.DEVICE,
            expected_results=cfg.TEST.EXPECTED_RESULTS,
            expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
            output_folder=of,
            force_recompute=not args.no_force_recompute,
        )
        synchronize()
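# The inference entry point follows the same launch convention as training
# (script path illustrative; --ckpt falls back to the latest checkpoint when
# omitted, as handled above):
#
#   python -m torch.distributed.launch --nproc_per_node=4 tools/test_net.py \
#       --config-file configs/kitti/e2e_disp_rcnn_R_101_FPN_mf_2d.yaml \
#       --ckpt /path/to/checkpoint.pth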
def __init__(self, learn):
    super().__init__(learn)
    # Only rank 0 writes TensorBoard events; on other ranks self.tb_writer
    # is never created, so callbacks must not touch it there.
    if get_rank() == 0:
        self.tb_writer = SummaryWriter(learn.model_dir, flush_secs=10)
def do_train(model, data_loader, optimizer, scheduler, checkpointer, device,
             checkpoint_period, arguments, uncert, cfg):
    if get_rank() == 0:
        tb_writer = SummaryWriter(cfg.OUTPUT_DIR, flush_secs=20)
    logger = logging.getLogger("disprcnn.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    fix_model_training(model, cfg)
    start_training_time = time.time()
    end = time.time()
    grad_norm_clip = cfg.SOLVER.GRAD_CLIP
    if isinstance(scheduler, OneCycleScheduler):
        # Rebuild the one-cycle schedule so it resumes at the right step.
        scheduler = OneCycleScheduler(optimizer,
                                      cfg.SOLVER.BASE_LR,
                                      cfg.SOLVER.MAX_ITER,
                                      last_epoch=start_iter)
    valid_iter = start_iter
    for it, (images, targets, other_fields) in enumerate(data_loader, start_iter):
        if cfg.SOLVER.PRINT_ITERATION:
            print('iteration', it)
        if not check_forward(targets):
            logger.info('check forward failed, not forwarding this iteration.')
            continue
        data_time = time.time() - end
        iteration = it + 1
        valid_iter += 1
        arguments["iteration"] = iteration
        try:
            images = {k: v.to(device) for k, v in images.items()}
            targets = {k: [t.to(device) for t in v] for k, v in targets.items()}
            if cfg.SOLVER.OFFLINE_2D_PREDICTIONS == '':
                loss_dict = model(images, targets)
            else:
                # Use cached 2D predictions instead of running the 2D branch.
                _, preds2d = other_fields
                preds2d = {k: [t.to(device) for t in v]
                           for k, v in preds2d.items()}
                loss_dict = model(images, preds2d, targets)
            losses = compute_losses(loss_dict, cfg, uncert)
            # Reduce losses over all GPUs for logging purposes.
            loss_dict_reduced = reduce_loss_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            meters.update(loss=losses_reduced, **loss_dict_reduced)
            optimizer.zero_grad()
            losses.backward()
            if cfg.SOLVER.DO_GRAD_CLIP:
                clip_grad_norm_(model.parameters(), grad_norm_clip)
            optimizer.step()
            scheduler.step()
        except KeyboardInterrupt:
            raise
        except Exception as e:
            if cfg.SOLVER.ALLOW_EXCEPTION:
                print(e)
                valid_iter -= 1
            else:
                raise
        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)
        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
        if iteration % cfg.SOLVER.PRINT_INTERVAL == 0 or iteration == max_iter:
            if get_rank() == 0:
                # The reduced losses may not exist if the first iterations
                # all raised (with ALLOW_EXCEPTION on); guard with locals().
                if 'loss_dict_reduced' in locals():
                    for k, v in loss_dict_reduced.items():
                        tb_writer.add_scalar(k, v.data.cpu().numpy(), iteration)
                if 'losses_reduced' in locals():
                    tb_writer.add_scalar('losses_reduced',
                                         losses_reduced.item(), iteration)
                tb_writer.add_scalar('lr', optimizer.param_groups[0]["lr"],
                                     iteration)
                tb_writer.add_scalar('batch_time', batch_time, iteration)
                if cfg.SOLVER.UNCERT_LOSS_WEIGHT != 0:
                    for i, a in enumerate(uncert.data.cpu().numpy()):
                        tb_writer.add_scalar('uncert' + str(i), a, iteration)
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "valid_iter: {valid_iter}",
                    "{meters}",
                    "lr: {lr:.8f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    valid_iter=valid_iter,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))
        if iteration % checkpoint_period == 0 and iteration != 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
        if iteration == max_iter:
            checkpointer.save("model_final", **arguments)
    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / max_iter))
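# For reference, a sketch of what reduce_loss_dict typically does in
# maskrcnn-benchmark-derived code such as this (the exact implementation in
# this repo may differ): stack the per-GPU losses, SUM-reduce them onto
# rank 0, and divide by the world size so the logged values are cross-GPU
# means. Gradients are unaffected; this is for logging only.
import torch
import torch.distributed as dist


def reduce_loss_dict_sketch(loss_dict, world_size):
    """Average a dict of scalar loss tensors across ranks (meaningful on rank 0)."""
    if world_size < 2:
        return loss_dict
    with torch.no_grad():
        names = sorted(loss_dict.keys())
        stacked = torch.stack([loss_dict[k] for k in names], dim=0)
        dist.reduce(stacked, dst=0)  # default op: SUM
        if dist.get_rank() == 0:
            stacked /= world_size
        return dict(zip(names, stacked))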