def benchmark_train(args):
    cfg = setup(args)
    model = build_model(cfg)
    logger.info("Model:\n{}".format(model))
    if comm.get_world_size() > 1:
        model = DistributedDataParallel(
            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
        )
    optimizer = build_optimizer(cfg, model)
    checkpointer = DetectionCheckpointer(model, optimizer=optimizer)
    checkpointer.load(cfg.MODEL.WEIGHTS)

    cfg.defrost()
    cfg.DATALOADER.NUM_WORKERS = 0
    data_loader = build_detection_train_loader(cfg)
    dummy_data = list(itertools.islice(data_loader, 100))

    def f():
        data = DatasetFromList(dummy_data, copy=False)
        while True:
            yield from data

    max_iter = 400
    trainer = SimpleTrainer(model, f(), optimizer)
    trainer.register_hooks(
        [hooks.IterationTimer(), hooks.PeriodicWriter([CommonMetricPrinter(max_iter)])]
    )
    trainer.train(1, max_iter)
def __init__(self, size: int):
    """
    Args:
        size (int): the total number of data of the underlying dataset to sample from
    """
    self._size = size
    assert size > 0
    self._rank = comm.get_rank()
    self._world_size = comm.get_world_size()

    shard_size = (self._size - 1) // self._world_size + 1
    begin = shard_size * self._rank
    end = min(shard_size * (self._rank + 1), self._size)
    self._local_indices = range(begin, end)
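# Illustrative sketch (not part of the sampler above): the shard arithmetic
# `shard_size = (size - 1) // world_size + 1` gives contiguous, non-overlapping
# per-rank index ranges that together cover the whole dataset, e.g. for
# size=10 and world_size=3 the ranks get [0, 4), [4, 8) and [8, 10).
def _shard_ranges(size, world_size):
    shard_size = (size - 1) // world_size + 1
    return [
        range(shard_size * rank, min(shard_size * (rank + 1), size))
        for rank in range(world_size)
    ]


assert [list(r) for r in _shard_ranges(10, 3)] == [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]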
def default_setup(cfg, args):
    """
    Perform some basic common setups at the beginning of a job, including:

    1. Set up the mydl logger
    2. Log basic information about environment, cmdline arguments, and config
    3. Backup the config to the output directory

    Args:
        cfg (CfgNode): the full config to be used
        args (argparse.Namespace): the command line arguments to be logged
    """
    output_dir = cfg.OUTPUT_DIR
    if comm.is_main_process() and output_dir:
        PathManager.mkdirs(output_dir)

    rank = comm.get_rank()
    setup_logger(output_dir, distributed_rank=rank, name="fvcore")
    logger = setup_logger(output_dir, distributed_rank=rank)

    logger.info("Rank of current process: {}. World size: {}".format(rank, comm.get_world_size()))
    logger.info("Environment info:\n" + collect_env_info())

    logger.info("Command line arguments: " + str(args))
    if hasattr(args, "config_file") and args.config_file != "":
        logger.info(
            "Contents of args.config_file={}:\n{}".format(
                args.config_file, PathManager.open(args.config_file, "r").read()
            )
        )

    logger.info("Running with full config:\n{}".format(cfg))
    if comm.is_main_process() and output_dir:
        # Note: some of our scripts may expect the existence of
        # config.yaml in output directory
        path = os.path.join(output_dir, "config.yaml")
        with PathManager.open(path, "w") as f:
            f.write(cfg.dump())
        logger.info("Full config saved to {}".format(os.path.abspath(path)))

    if cfg.VERSION == 2:
        # make sure each worker has a different, yet deterministic seed if specified
        seed_all_rng(None if cfg.SEED < 0 else cfg.SEED + rank)

    # cudnn benchmark has large overhead. It shouldn't be used considering the small size of
    # typical validation set.
    if not (hasattr(args, "eval_only") and args.eval_only):
        torch.backends.cudnn.benchmark = cfg.CUDNN_BENCHMARK
def main(args):
    cfg = setup(args)

    model = build_model(cfg)
    logger.info("Model:\n{}".format(model))
    if args.eval_only:
        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
            cfg.MODEL.WEIGHTS, resume=args.resume
        )
        return do_test(cfg, model)

    distributed = comm.get_world_size() > 1
    if distributed:
        model = DistributedDataParallel(
            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
        )

    do_train(cfg, model)
    return do_test(cfg, model)
def __init__(self, size: int, shuffle: bool = True, seed: Optional[int] = None):
    """
    Args:
        size (int): the total number of data of the underlying dataset to sample from
        shuffle (bool): whether to shuffle the indices or not
        seed (int): the initial seed of the shuffle. Must be the same across all
            workers. If None, will use a random seed shared among workers
            (requires synchronization among all workers).
    """
    self._size = size
    assert size > 0
    self._shuffle = shuffle
    if seed is None:
        seed = comm.shared_random_seed()
    self._seed = int(seed)

    self._rank = comm.get_rank()
    self._world_size = comm.get_world_size()
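# A minimal sketch, assuming the fields set in the __init__ above, of how such a sampler
# can turn a shared seed into an infinite, deterministic stream of indices. This is an
# illustration of the idea, not the library's verbatim implementation: every rank draws
# the same permutations from the shared seed, then keeps every world_size-th index
# starting at its own rank, so the per-rank streams stay disjoint.
import itertools

import torch


def _infinite_indices(size, shuffle, seed):
    g = torch.Generator()
    g.manual_seed(seed)
    while True:
        if shuffle:
            yield from torch.randperm(size, generator=g).tolist()
        else:
            yield from range(size)


def _local_indices(size, shuffle, seed, rank, world_size):
    # islice(..., rank, None, world_size) shards the stream while keeping it infinite
    return itertools.islice(_infinite_indices(size, shuffle, seed), rank, None, world_size)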
def reduce_loss_dict(loss_dict):
    """
    Reduce the loss dictionary from all processes so that process with rank 0
    has the averaged results. Returns a dict with the same fields as
    loss_dict, after reduction.
    """
    world_size = get_world_size()
    if world_size < 2:
        return loss_dict
    with torch.no_grad():
        loss_names = []
        all_losses = []
        for k in sorted(loss_dict.keys()):
            loss_names.append(k)
            all_losses.append(loss_dict[k])
        all_losses = torch.stack(all_losses, dim=0)
        dist.reduce(all_losses, dst=0)
        if dist.get_rank() == 0:
            # only main process gets accumulated, so only divide by
            # world_size in this case
            all_losses /= world_size
        reduced_losses = {k: v for k, v in zip(loss_names, all_losses)}
    return reduced_losses
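# Hedged usage note (illustrative names, not library code): the reduced dict is meant
# for logging only, so the backward pass still uses the local, un-reduced losses:
#
#     loss_dict = model(images, targets)                 # per-process losses
#     losses = sum(loss for loss in loss_dict.values())  # drives backward()
#     loss_dict_reduced = reduce_loss_dict(loss_dict)    # rank 0 now holds the average
#     meters.update(loss=sum(loss_dict_reduced.values()), **loss_dict_reduced)
#
# Only rank 0 is guaranteed to receive the reduced result from dist.reduce, which is
# why the division by world_size happens only on that rank.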
def __init__(self, dataset_dicts, repeat_thresh, shuffle=True, seed=None):
    """
    Args:
        dataset_dicts (list[dict]): annotations in mydl dataset format.
        repeat_thresh (float): frequency threshold below which data is repeated.
        shuffle (bool): whether to shuffle the indices or not
        seed (int): the initial seed of the shuffle. Must be the same across all
            workers. If None, will use a random seed shared among workers
            (requires synchronization among all workers).
    """
    self._shuffle = shuffle
    if seed is None:
        seed = comm.shared_random_seed()
    self._seed = int(seed)

    self._rank = comm.get_rank()
    self._world_size = comm.get_world_size()

    # Get fractional repeat factors and split into whole number (_int_part)
    # and fractional (_frac_part) parts.
    rep_factors = self._get_repeat_factors(dataset_dicts, repeat_thresh)
    self._int_part = torch.trunc(rep_factors)
    self._frac_part = rep_factors - self._int_part
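# Hedged sketch of the repeat-factor computation that _get_repeat_factors is expected to
# perform, following the repeat-factor sampling formulation used for long-tailed datasets:
# a category appearing in fraction f(c) of the images gets r(c) = max(1, sqrt(t / f(c)))
# for threshold t = repeat_thresh, and each image's repeat factor is the max over its
# categories. The helper below is illustrative only, not the library implementation.
import math
from collections import defaultdict


def _repeat_factors(dataset_dicts, repeat_thresh):
    # 1. For each category c, count the number of images that contain it
    category_freq = defaultdict(int)
    for d in dataset_dicts:
        for cat_id in {ann["category_id"] for ann in d["annotations"]}:
            category_freq[cat_id] += 1
    num_images = len(dataset_dicts)

    # 2. Per-category repeat factor: r(c) = max(1, sqrt(t / f(c)))
    category_rep = {
        cat_id: max(1.0, math.sqrt(repeat_thresh / (freq / num_images)))
        for cat_id, freq in category_freq.items()
    }

    # 3. Per-image repeat factor: max over the categories in the image (1.0 if none)
    return [
        max((category_rep[ann["category_id"]] for ann in d["annotations"]), default=1.0)
        for d in dataset_dicts
    ]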
def forward(self, input):
    if comm.get_world_size() == 1 or not self.training:
        return super().forward(input)

    assert input.shape[0] > 0, "SyncBatchNorm does not support empty inputs"
    C = input.shape[1]
    mean = torch.mean(input, dim=[0, 2, 3])
    meansqr = torch.mean(input * input, dim=[0, 2, 3])

    vec = torch.cat([mean, meansqr], dim=0)
    vec = AllReduce.apply(vec) * (1.0 / dist.get_world_size())

    mean, meansqr = torch.split(vec, C)
    var = meansqr - mean * mean
    self.running_mean += self.momentum * (mean.detach() - self.running_mean)
    self.running_var += self.momentum * (var.detach() - self.running_var)

    invstd = torch.rsqrt(var + self.eps)
    scale = self.weight * invstd
    bias = self.bias - mean * scale
    scale = scale.reshape(1, -1, 1, 1)
    bias = bias.reshape(1, -1, 1, 1)
    return input * scale + bias
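# Sanity-check sketch (not library code) for the identity the forward pass relies on:
# the per-channel variance equals E[x^2] - E[x]^2, i.e. the biased variance over the
# (N, H, W) dimensions, which is what a single-process BatchNorm would compute.
import torch

x = torch.randn(4, 3, 8, 8)
mean = torch.mean(x, dim=[0, 2, 3])
meansqr = torch.mean(x * x, dim=[0, 2, 3])
var = meansqr - mean * mean
assert torch.allclose(var, x.var(dim=[0, 2, 3], unbiased=False), atol=1e-5)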
def __init__(self, cfg):
    """
    Args:
        cfg (CfgNode):
    """
    logger = logging.getLogger("mydl")
    if not logger.isEnabledFor(logging.INFO):  # setup_logger is not called for d2
        setup_logger()
    # Assume these objects must be constructed in this order.
    model = self.build_model(cfg)
    optimizer = self.build_optimizer(cfg, model)
    data_loader = self.build_train_loader(cfg)

    # For training, wrap with DDP. But don't need this for inference.
    if comm.get_world_size() > 1:
        model = DistributedDataParallel(
            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
        )
    super().__init__(model, data_loader, optimizer)

    self.scheduler = self.build_lr_scheduler(cfg, optimizer)
    # Assume no other objects need to be checkpointed.
    # We can later make it checkpoint the stateful hooks
    self.checkpointer = DetectionCheckpointer(
        # Assume you want to save checkpoints together with logs/statistics
        model,
        cfg.OUTPUT_DIR,
        optimizer=optimizer,
        scheduler=self.scheduler,
    )
    self.start_iter = 0
    self.max_iter = cfg.SOLVER.MAX_ITER
    self.cfg = cfg

    self.register_hooks(self.build_hooks())
def do_train(
    cfg,
    model,
    data_loader,
    data_loader_val,
    optimizer,
    scheduler,
    checkpointer,
    device,
    checkpoint_period,
    test_period,
    arguments,
):
    logger = logging.getLogger("mydl.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter=" ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()

    iou_types = ("bbox",)
    if cfg.MODEL.MASK_ON:
        iou_types = iou_types + ("segm",)
    if cfg.MODEL.KEYPOINT_ON:
        iou_types = iou_types + ("keypoints",)
    dataset_names = cfg.DATASETS.TEST

    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        if any(len(target) < 1 for target in targets):
            logger.error(
                f"Iteration={iteration + 1} || Image Ids used for training {_} || targets Length={[len(target) for target in targets]}"
            )
            continue
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        # Note: If mixed precision is not used, this ends up doing nothing
        # Otherwise apply loss scaling for mixed-precision recipe
        with amp.scale_loss(losses, optimizer) as scaled_losses:
            scaled_losses.backward()
        optimizer.step()
        scheduler.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join(
                    [
                        "eta: {eta}",
                        "iter: {iter}",
                        "{meters}",
                        "lr: {lr:.6f}",
                        "max mem: {memory:.0f}",
                    ]
                ).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                )
            )
        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
        if data_loader_val is not None and test_period > 0 and iteration % test_period == 0:
            meters_val = MetricLogger(delimiter=" ")
            synchronize()
            _ = inference(  # The result can be used for additional logging, e.g. for TensorBoard
                model,
                # The method changes the segmentation mask format in a data loader,
                # so every time a new data loader is created:
                make_data_loader(cfg, is_train=False, is_distributed=(get_world_size() > 1), is_for_period=True),
                dataset_name="[Validation]",
                iou_types=iou_types,
                box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY,
                device=cfg.MODEL.DEVICE,
                expected_results=cfg.TEST.EXPECTED_RESULTS,
                expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
                output_folder=None,
            )
            synchronize()
            model.train()
            with torch.no_grad():
                # Should be one image for each GPU:
                for iteration_val, (images_val, targets_val, _) in enumerate(tqdm(data_loader_val)):
                    images_val = images_val.to(device)
                    targets_val = [target.to(device) for target in targets_val]
                    loss_dict = model(images_val, targets_val)
                    losses = sum(loss for loss in loss_dict.values())
                    loss_dict_reduced = reduce_loss_dict(loss_dict)
                    losses_reduced = sum(loss for loss in loss_dict_reduced.values())
                    meters_val.update(loss=losses_reduced, **loss_dict_reduced)
            synchronize()
            logger.info(
                meters_val.delimiter.join(
                    [
                        "[Validation]: ",
                        "eta: {eta}",
                        "iter: {iter}",
                        "{meters}",
                        "lr: {lr:.6f}",
                        "max mem: {memory:.0f}",
                    ]
                ).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters_val),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                )
            )
        if iteration == max_iter:
            checkpointer.save("model_final", **arguments)

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info(
        "Total training time: {} ({:.4f} s / it)".format(
            total_time_str, total_training_time / (max_iter)
        )
    )
def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0, is_for_period=False):
    num_gpus = get_world_size()
    if is_train:
        images_per_batch = cfg.SOLVER.IMS_PER_BATCH
        assert (
            images_per_batch % num_gpus == 0
        ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of GPUs ({}) used.".format(
            images_per_batch, num_gpus
        )
        images_per_gpu = images_per_batch // num_gpus
        shuffle = True
        num_iters = cfg.SOLVER.MAX_ITER
    else:
        images_per_batch = cfg.TEST.IMS_PER_BATCH
        assert (
            images_per_batch % num_gpus == 0
        ), "TEST.IMS_PER_BATCH ({}) must be divisible by the number of GPUs ({}) used.".format(
            images_per_batch, num_gpus
        )
        images_per_gpu = images_per_batch // num_gpus
        shuffle = False if not is_distributed else True
        num_iters = None
        start_iter = 0

    if images_per_gpu > 1:
        logger = logging.getLogger(__name__)
        logger.warning(
            "When using more than one image per GPU you may encounter "
            "an out-of-memory (OOM) error if your GPU does not have "
            "sufficient memory. If this happens, you can reduce "
            "SOLVER.IMS_PER_BATCH (for training) or "
            "TEST.IMS_PER_BATCH (for inference). For training, you must "
            "also adjust the learning rate and schedule length according "
            "to the linear scaling rule. See for example: "
            "https://github.com/facebookresearch/Detectron/blob/master/configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml#L14"
        )

    # group images which have similar aspect ratio. In this case, we only
    # group in two cases: those with width / height > 1, and the other way around,
    # but the code supports more general grouping strategy
    aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else []

    paths_catalog = import_file("mydl.config.paths_catalog", cfg.PATHS_CATALOG, True)
    DatasetCatalog = paths_catalog.DatasetCatalog
    dataset_list = cfg.DATASETS.TRAIN if is_train else cfg.DATASETS.TEST

    # If bbox aug is enabled in testing, simply set transforms to None and we will apply transforms later
    transforms = None if not is_train and cfg.TEST.BBOX_AUG.ENABLED else build_transforms(cfg, is_train)
    datasets = build_dataset(dataset_list, transforms, DatasetCatalog, is_train or is_for_period)

    if is_train:
        # save category_id to label name mapping
        save_labels(datasets, cfg.OUTPUT_DIR)

    data_loaders = []
    for dataset in datasets:
        sampler = make_data_sampler(dataset, shuffle, is_distributed)
        batch_sampler = make_batch_data_sampler(
            dataset, sampler, aspect_grouping, images_per_gpu, num_iters, start_iter
        )
        collator = BBoxAugCollator() if not is_train and cfg.TEST.BBOX_AUG.ENABLED else \
            BatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY)
        num_workers = cfg.DATALOADER.NUM_WORKERS
        data_loader = torch.utils.data.DataLoader(
            dataset,
            num_workers=num_workers,
            batch_sampler=batch_sampler,
            collate_fn=collator,
        )
        data_loaders.append(data_loader)
    if is_train or is_for_period:
        # during training, a single (possibly concatenated) data_loader is returned
        assert len(data_loaders) == 1
        return data_loaders[0]
    return data_loaders
def build_detection_train_loader(cfg, mapper=None):
    """
    A data loader is created by the following steps:

    1. Use the dataset names in config to query :class:`DatasetCatalog`, and obtain a list of dicts.
    2. Start workers to work on the dicts. Each worker will:

       * Map each metadata dict into another format to be consumed by the model.
       * Batch them by simply putting dicts into a list.

    The batched ``list[mapped_dict]`` is what this dataloader will return.

    Args:
        cfg (CfgNode): the config
        mapper (callable): a callable which takes a sample (dict) from dataset and
            returns the format to be consumed by the model.
            By default it will be `DatasetMapper(cfg, True)`.

    Returns:
        an infinite iterator of training data
    """
    num_workers = get_world_size()
    images_per_batch = cfg.SOLVER.IMS_PER_BATCH
    assert (
        images_per_batch % num_workers == 0
    ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({}).".format(
        images_per_batch, num_workers
    )
    assert (
        images_per_batch >= num_workers
    ), "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({}).".format(
        images_per_batch, num_workers
    )
    images_per_worker = images_per_batch // num_workers

    # Setting FILTER_EMPTY_ANNOTATIONS to False keeps images without annotations in training.
    # Config VERSION 1 never filtered them and has no such option.
    filter_empty_annotations = True
    if cfg.VERSION == 1:
        filter_empty_annotations = False
    else:
        filter_empty_annotations = cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS

    dataset_dicts = get_detection_dataset_dicts(
        cfg.DATASETS.TRAIN,
        filter_empty=filter_empty_annotations,
        min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
        if cfg.VERSION != 1 and cfg.MODEL.KEYPOINT_ON
        else 0,
        proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None,
    )
    dataset = DatasetFromList(dataset_dicts, copy=False)

    if mapper is None:
        mapper = DatasetMapper(cfg, True)
    dataset = MapDataset(dataset, mapper)

    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    logger = logging.getLogger(__name__)
    logger.info("Using training sampler {}".format(sampler_name))
    if sampler_name == "TrainingSampler":
        sampler = samplers.TrainingSampler(len(dataset))
    elif sampler_name == "RepeatFactorTrainingSampler":
        sampler = samplers.RepeatFactorTrainingSampler(
            dataset_dicts, cfg.DATALOADER.REPEAT_THRESHOLD
        )
    else:
        raise ValueError("Unknown training sampler: {}".format(sampler_name))

    if cfg.DATALOADER.ASPECT_RATIO_GROUPING:
        data_loader = torch.utils.data.DataLoader(
            dataset,
            sampler=sampler,
            num_workers=cfg.DATALOADER.NUM_WORKERS,
            batch_sampler=None,
            collate_fn=operator.itemgetter(0),  # don't batch, but yield individual elements
            worker_init_fn=worker_init_reset_seed,
        )  # yield individual mapped dict
        data_loader = AspectRatioGroupedDataset(data_loader, images_per_worker)
    else:
        batch_sampler = torch.utils.data.sampler.BatchSampler(
            sampler, images_per_worker, drop_last=True
        )  # drop_last so that batches always have the same size
        data_loader = torch.utils.data.DataLoader(
            dataset,
            num_workers=cfg.DATALOADER.NUM_WORKERS,
            batch_sampler=batch_sampler,
            collate_fn=trivial_batch_collator,
            worker_init_fn=worker_init_reset_seed,
        )

    return data_loader
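# Hedged usage sketch (illustrative helper, not library code): the returned loader is an
# *infinite* iterator of batches, each batch being a list of per-image dicts, so callers
# take a bounded slice or zip it with an iteration counter rather than looping over epochs.
import itertools


def peek_batches(cfg, num_batches=2):
    data_loader = build_detection_train_loader(cfg)
    for batch in itertools.islice(data_loader, num_batches):
        # each element of the batch is one mapped dataset dict
        print(len(batch), sorted(batch[0].keys()))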
def load_cityscapes_instances(image_dir, gt_dir, from_json=True, to_polygons=True):
    """
    Args:
        image_dir (str): path to the raw dataset. e.g., "~/cityscapes/leftImg8bit/train".
        gt_dir (str): path to the raw annotations. e.g., "~/cityscapes/gtFine/train".
        from_json (bool): whether to read annotations from the raw json file or the png files.
        to_polygons (bool): whether to represent the segmentation as polygons
            (COCO's format) instead of masks (cityscapes's format).

    Returns:
        list[dict]: a list of dicts in mydl standard format. (See
        `Using Custom Datasets </tutorials/datasets.html>`_ )
    """
    if from_json:
        assert to_polygons, (
            "Cityscapes's json annotations are in polygon format. "
            "Converting to mask format is not supported now."
        )

    files = []
    for image_file in glob.glob(os.path.join(image_dir, "**/*.png")):
        suffix = "leftImg8bit.png"
        assert image_file.endswith(suffix)
        prefix = image_dir

        instance_file = gt_dir + image_file[len(prefix):-len(suffix)] + "gtFine_instanceIds.png"
        assert os.path.isfile(instance_file), instance_file

        label_file = gt_dir + image_file[len(prefix):-len(suffix)] + "gtFine_labelIds.png"
        assert os.path.isfile(label_file), label_file

        json_file = gt_dir + image_file[len(prefix):-len(suffix)] + "gtFine_polygons.json"
        files.append((image_file, instance_file, label_file, json_file))
    assert len(files), "No images found in {}".format(image_dir)

    logger = logging.getLogger(__name__)
    logger.info("Preprocessing cityscapes annotations ...")
    # This is still not fast: all workers will do duplicated work and it can take
    # up to 10 minutes on an 8-GPU server.
    pool = mp.Pool(processes=max(mp.cpu_count() // get_world_size() // 2, 4))

    ret = pool.map(
        functools.partial(cityscapes_files_to_dict, from_json=from_json, to_polygons=to_polygons),
        files,
    )
    logger.info("Loaded {} images from {}".format(len(ret), image_dir))

    # Map cityscape ids to contiguous ids
    from cityscapesscripts.helpers.labels import labels

    labels = [l for l in labels if l.hasInstances and not l.ignoreInEval]
    dataset_id_to_contiguous_id = {l.id: idx for idx, l in enumerate(labels)}
    for dict_per_image in ret:
        for anno in dict_per_image["annotations"]:
            anno["category_id"] = dataset_id_to_contiguous_id[anno["category_id"]]
    return ret