def __init__(self, cfg):
    """
    Args:
        cfg (CfgNode):
    """
    super().__init__()
    logger = logging.getLogger("detectron2")
    if not logger.isEnabledFor(logging.INFO):  # setup_logger is not called for d2
        setup_logger()
    cfg = DefaultTrainer.auto_scale_workers(cfg, comm.get_world_size())

    # Assume these objects must be constructed in this order.
    model = self.build_model(cfg)
    optimizer = self.build_optimizer(cfg, model)
    data_loader = self.build_train_loader(cfg)

    # For training, wrap with DDP. But don't need this for inference.
    if comm.get_world_size() > 1:
        model = DistributedDataParallel(model,
                                        device_ids=[comm.get_local_rank()],
                                        broadcast_buffers=False)
    self._trainer = (AMPTrainer if cfg.SOLVER.AMP.ENABLED else SimpleTrainer)(
        model, data_loader, optimizer)

    self.scheduler = self.build_lr_scheduler(cfg, optimizer)
    # Assume no other objects need to be checkpointed.
    # We can later make it checkpoint the stateful hooks
    self.checkpointer = DetectionCheckpointer(
        # Assume you want to save checkpoints together with logs/statistics
        model,
        cfg.OUTPUT_DIR,
        optimizer=optimizer,
        scheduler=self.scheduler,
    )
    self.start_iter = 0
    self.max_iter = cfg.SOLVER.MAX_ITER
    self.cfg = cfg

    self.register_hooks(self.build_hooks())
def build_batch_data_loader(  # type: ignore[no-untyped-def]
    dataset,
    sampler,
    total_batch_size: int,
    *,
    aspect_ratio_grouping: bool = False,
    num_workers: int = 0,
    drop_last: bool = True,
) -> Union[torch.utils.data.DataLoader, AspectRatioGroupedDataset]:
    """
    Build a batched dataloader for training.

    Modified from detectron2 to expose the `drop_last` option.

    Args:
        dataset (torch.utils.data.Dataset): map-style PyTorch dataset. Can be indexed.
        sampler (torch.utils.data.sampler.Sampler): a sampler that produces indices
        total_batch_size, aspect_ratio_grouping, num_workers): see
            :func:`build_detection_train_loader`.

    Returns:
        iterable[list]. Length of each list is the batch size of the current
        GPU. Each element in the list comes from the dataset.
    """
    world_size = get_world_size()
    assert (
        total_batch_size > 0 and total_batch_size % world_size == 0
    ), "Total batch size ({}) must be divisible by the number of gpus ({}).".format(
        total_batch_size, world_size)

    batch_size = total_batch_size // world_size
    if aspect_ratio_grouping:
        data_loader = torch.utils.data.DataLoader(
            dataset,
            sampler=sampler,
            num_workers=num_workers,
            batch_sampler=None,
            collate_fn=operator.itemgetter(0),  # don't batch, but yield individual elements
            worker_init_fn=worker_init_reset_seed,
        )  # yield individual mapped dict
        return AspectRatioGroupedDataset(data_loader, batch_size)
    else:
        batch_sampler = torch.utils.data.sampler.BatchSampler(
            sampler, batch_size,
            drop_last=drop_last)  # srnet: expose drop_last to caller
        return torch.utils.data.DataLoader(
            dataset,
            num_workers=num_workers,
            batch_sampler=batch_sampler,
            collate_fn=trivial_batch_collator,
            worker_init_fn=worker_init_reset_seed,
        )
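A minimal usage sketch for the variant above, assuming detectron2's TrainingSampler and a toy map-style dataset of dicts; the dataset contents and batch size below are illustrative only, not part of the snippet's codebase:

from detectron2.data.samplers import TrainingSampler

# Toy map-style dataset: a list of already-mapped dicts is enough for this sketch.
dataset = [{"width": 640, "height": 480, "idx": i} for i in range(100)]
sampler = TrainingSampler(len(dataset), shuffle=True)

loader = build_batch_data_loader(
    dataset,
    sampler,
    total_batch_size=8,           # must be divisible by the number of GPUs
    aspect_ratio_grouping=False,
    num_workers=0,
    drop_last=False,              # the option this variant exposes
)
batch = next(iter(loader))        # a list of dicts of length 8 // world_size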
def build_batch_data_loader(dataset,
                            sampler,
                            total_batch_size,
                            *,
                            aspect_ratio_grouping=False,
                            num_workers=0):
    """
    Build a batched dataloader. The main differences from `torch.utils.data.DataLoader` are:
    1. support aspect ratio grouping options
    2. use no "batch collation", because this is common for detection training

    Args:
        dataset (torch.utils.data.Dataset): a pytorch map-style or iterable dataset.
        sampler (torch.utils.data.sampler.Sampler or None): a sampler that produces indices.
            Must be provided iff. ``dataset`` is a map-style dataset.
        total_batch_size, aspect_ratio_grouping, num_workers): see
            :func:`build_detection_train_loader`.

    Returns:
        iterable[list]. Length of each list is the batch size of the current
        GPU. Each element in the list comes from the dataset.
    """
    world_size = get_world_size()
    assert (
        total_batch_size > 0 and total_batch_size % world_size == 0
    ), "Total batch size ({}) must be divisible by the number of gpus ({}).".format(
        total_batch_size, world_size)
    batch_size = total_batch_size // world_size

    if isinstance(dataset, torchdata.IterableDataset):
        assert sampler is None, "sampler must be None if dataset is IterableDataset"
    else:
        dataset = ToIterableDataset(dataset, sampler)

    if aspect_ratio_grouping:
        data_loader = torchdata.DataLoader(
            dataset,
            num_workers=num_workers,
            collate_fn=operator.itemgetter(0),  # don't batch, but yield individual elements
            worker_init_fn=worker_init_reset_seed,
        )  # yield individual mapped dict
        return AspectRatioGroupedDataset(data_loader, batch_size)
    else:
        return torchdata.DataLoader(
            dataset,
            batch_size=batch_size,
            drop_last=True,
            num_workers=num_workers,
            collate_fn=trivial_batch_collator,
            worker_init_fn=worker_init_reset_seed,
        )
def fcos_losses(self, instances):
    num_classes = instances.logits_pred.size(1)
    assert num_classes == self.num_classes

    labels = instances.labels.flatten()

    pos_inds = torch.nonzero(labels != num_classes).squeeze(1)
    num_pos_local = pos_inds.numel()
    num_gpus = get_world_size()
    total_num_pos = reduce_sum(pos_inds.new_tensor([num_pos_local])).item()
    num_pos_avg = max(total_num_pos / num_gpus, 1.0)

    # prepare one_hot
    class_target = torch.zeros_like(instances.logits_pred)
    class_target[pos_inds, labels[pos_inds]] = 1

    class_loss = sigmoid_focal_loss_jit(
        instances.logits_pred,
        class_target,
        alpha=self.focal_loss_alpha,
        gamma=self.focal_loss_gamma,
        reduction="sum",
    ) / num_pos_avg

    instances = instances[pos_inds]
    instances.pos_inds = pos_inds

    ctrness_targets = compute_ctrness_targets(instances.reg_targets)
    ctrness_targets_sum = ctrness_targets.sum()
    loss_denorm = max(reduce_sum(ctrness_targets_sum).item() / num_gpus, 1e-6)
    instances.gt_ctrs = ctrness_targets

    if pos_inds.numel() > 0:
        reg_loss = self.loc_loss_func(instances.reg_pred,
                                      instances.reg_targets,
                                      ctrness_targets) / loss_denorm

        ctrness_loss = F.binary_cross_entropy_with_logits(
            instances.ctrness_pred, ctrness_targets,
            reduction="sum") / num_pos_avg
    else:
        reg_loss = instances.reg_pred.sum() * 0
        ctrness_loss = instances.ctrness_pred.sum() * 0

    losses = {
        "loss_fcos_cls": class_loss,
        "loss_fcos_loc": reg_loss,
        "loss_fcos_ctr": ctrness_loss
    }
    extras = {"instances": instances, "loss_denorm": loss_denorm}
    return extras, losses
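This snippet relies on compute_ctrness_targets, which is assumed here to follow the centerness definition from the FCOS paper: for a positive location with regression targets (l, t, r, b),

    centerness = sqrt( (min(l, r) / max(l, r)) * (min(t, b) / max(t, b)) )

Under that assumption, the classification loss is normalized by num_pos_avg, the number of positive locations averaged over GPUs and clamped to at least 1 so a GPU that sees no positives does not divide by zero, while the regression loss is normalized by loss_denorm, the all-reduced sum of the centerness targets.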
def default_setup(cfg, args=None):
    """
    Perform some basic common setups at the beginning of a job, including:

    1. Set up the detectron2 logger
    2. Log basic information about environment, cmdline arguments, and config
    3. Backup the config to the output directory

    Args:
        cfg (CfgNode): the full config to be used
        args (argparse.Namespace): the command line arguments to be logged
    """
    output_dir = cfg.OUTPUT_DIR
    if comm.is_main_process() and output_dir:
        PathManager.mkdirs(output_dir)

    rank = comm.get_rank()
    setup_logger(output_dir, distributed_rank=rank, name="fvcore")
    logger = setup_logger(output_dir, distributed_rank=rank)
    handlers = []
    handlers.append(logger.handlers[-2])
    handlers.append(logger.handlers[-1])
    logger.handlers = handlers

    logger.info("Rank of current process: {}. World size: {}".format(
        rank, comm.get_world_size()))
    logger.info("Environment info:\n" + collect_env_info())

    logger.info("Command line arguments: " + str(args))
    if hasattr(args, "config_file"):
        logger.info("Contents of args.config_file={}:\n{}".format(
            args.config_file,
            PathManager.open(args.config_file, "r").read()))

    logger.info("Running with full config:\n{}".format(cfg))
    if comm.is_main_process() and output_dir:
        # Note: some of our scripts may expect the existence of
        # config.yaml in output directory
        path = os.path.join(output_dir, "config.yaml")
        with PathManager.open(path, "w") as f:
            f.write(cfg.dump())
        logger.info("Full config saved to {}".format(os.path.abspath(path)))

    # make sure each worker has a different, yet deterministic seed if specified
    seed_all_rng(None if cfg.SEED < 0 else cfg.SEED + rank)

    # cudnn benchmark has large overhead. It shouldn't be used considering the small size of
    # typical validation set.
    if not (hasattr(args, "eval_only") and args.eval_only):
        torch.backends.cudnn.benchmark = cfg.CUDNN_BENCHMARK
def wrap_model_with_ddp(self, cfg, model):
    """
    Returns:
        torch.nn.Module:

    Overwrite this function if you'd like to implement more with
    `torch.nn.parallel.DistributedDataParallel`, such as adding
    `find_unused_parameters=True`.
    """
    if comm.get_world_size() > 1:
        model = DistributedDataParallel(model,
                                        device_ids=[comm.get_local_rank()],
                                        broadcast_buffers=False)
    return model
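Following the docstring's suggestion, here is a sketch of an override that enables find_unused_parameters; the surrounding trainer class is assumed, and the flag itself is a standard torch.nn.parallel.DistributedDataParallel argument:

def wrap_model_with_ddp(self, cfg, model):
    # Same as the base implementation, but asks DDP to tolerate parameters
    # that receive no gradient in some iterations.
    if comm.get_world_size() > 1:
        model = DistributedDataParallel(
            model,
            device_ids=[comm.get_local_rank()],
            broadcast_buffers=False,
            find_unused_parameters=True,
        )
    return model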
def _get_config(self):
    cfg = get_cfg()
    add_deeplab_config(cfg)
    defaultConfig = optionsHelper.get_hierarchical_value(
        self.options, ['options', 'model', 'config', 'value', 'id'])
    configFile = os.path.join(os.getcwd(),
                              'ai/models/detectron2/_functional/configs',
                              defaultConfig)
    cfg.merge_from_file(configFile)

    # disable SyncBatchNorm if not running on distributed system
    if comm.get_world_size() <= 1:
        cfg.MODEL.RESNETS.NORM = 'BN'
        cfg.MODEL.SEM_SEG_HEAD.NORM = 'BN'
    return cfg
def _compute_num_images_per_worker(cfg: CfgNode):
    num_workers = get_world_size()
    images_per_batch = cfg.SOLVER.IMS_PER_BATCH
    assert (
        images_per_batch % num_workers == 0
    ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({}).".format(
        images_per_batch, num_workers)
    assert (
        images_per_batch >= num_workers
    ), "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({}).".format(
        images_per_batch, num_workers)
    images_per_worker = images_per_batch // num_workers
    return images_per_worker
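For example, with cfg.SOLVER.IMS_PER_BATCH = 16 on 4 workers (GPUs) this returns 16 // 4 = 4 images per worker, while IMS_PER_BATCH = 16 on 3 workers fails the first assertion because 16 is not divisible by 3.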
def __init__(self, size: int):
    """
    Args:
        size (int): the total number of data of the underlying dataset to sample from
    """
    self._size = size
    assert size > 0
    self._rank = comm.get_rank()
    self._world_size = comm.get_world_size()

    shard_size = (self._size - 1) // self._world_size + 1
    self.begin = shard_size * self._rank
    self.end = min(shard_size * (self._rank + 1), self._size)
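As a worked example, with size = 10 and a world size of 4, shard_size = (10 - 1) // 4 + 1 = 3, so ranks 0 through 3 receive the index ranges [0, 3), [3, 6), [6, 9) and [9, 10); the last rank simply gets a smaller shard.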
def load_kitti2cityscapes_instances(image_dir,
                                    gt_dir,
                                    from_json=True,
                                    to_polygons=True,
                                    istest=False):
    """
    Args:
        image_dir (str): path to the raw dataset. e.g., "~/cityscapes/leftImg8bit/train".
        gt_dir (str): path to the raw annotations. e.g., "~/cityscapes/gtFine/train".
        from_json (bool): whether to read annotations from the raw json file or the png files.
        to_polygons (bool): whether to represent the segmentation as polygons
            (COCO's format) instead of masks (cityscapes's format).

    Returns:
        list[dict]: a list of dicts in Detectron2 standard format. (See
        `Using Custom Datasets </tutorials/datasets.html>`_ )
    """
    if from_json:
        assert to_polygons, (
            "Cityscapes's json annotations are in polygon format. "
            "Converting to mask format is not supported now.")
    files = _get_kitti2cityscapes_files(image_dir, gt_dir, istest)

    logger.info("Preprocessing kitti2cityscapes annotations ...")
    # This is still not fast: all workers will execute duplicate work and will
    # take up to 10m on a 8GPU server.
    pool = mp.Pool(processes=max(mp.cpu_count() // get_world_size() // 2, 4))

    ret = pool.map(
        functools.partial(_kitti2cityscapes_files_to_dict,
                          from_json=from_json,
                          to_polygons=to_polygons,
                          istest=istest),
        files,
    )
    logger.info("Loaded {} images from {}".format(len(ret), image_dir))

    if not istest:
        # Map cityscape ids to contiguous ids
        from .kitti2cityscapes_label import labels
        labels = [l for l in labels if l.hasInstances and not l.ignoreInEval]
        dataset_id_to_contiguous_id = {
            l.id: idx
            for idx, l in enumerate(labels)
        }
        for dict_per_image in ret:
            for anno in dict_per_image["annotations"]:
                anno["category_id"] = dataset_id_to_contiguous_id[
                    anno["category_id"]]
    return ret
def build_semisup_batch_data_loader_two_crop(dataset,
                                             sampler,
                                             total_batch_size_label,
                                             total_batch_size_unlabel,
                                             *,
                                             aspect_ratio_grouping=False,
                                             num_workers=0):
    world_size = get_world_size()
    assert (
        total_batch_size_label > 0 and total_batch_size_label % world_size == 0
    ), "Total label batch size ({}) must be divisible by the number of gpus ({}).".format(
        total_batch_size_label, world_size)

    assert (
        total_batch_size_unlabel > 0 and total_batch_size_unlabel % world_size == 0
    ), "Total unlabel batch size ({}) must be divisible by the number of gpus ({}).".format(
        total_batch_size_unlabel, world_size)

    batch_size_label = total_batch_size_label // world_size
    batch_size_unlabel = total_batch_size_unlabel // world_size

    label_dataset, unlabel_dataset = dataset
    label_sampler, unlabel_sampler = sampler

    if aspect_ratio_grouping:
        label_data_loader = torch.utils.data.DataLoader(
            label_dataset,
            sampler=label_sampler,
            num_workers=num_workers,
            batch_sampler=None,
            collate_fn=operator.itemgetter(0),  # don't batch, but yield individual elements
            worker_init_fn=worker_init_reset_seed,
        )  # yield individual mapped dict
        unlabel_data_loader = torch.utils.data.DataLoader(
            unlabel_dataset,
            sampler=unlabel_sampler,
            num_workers=num_workers,
            batch_sampler=None,
            collate_fn=operator.itemgetter(0),  # don't batch, but yield individual elements
            worker_init_fn=worker_init_reset_seed,
        )  # yield individual mapped dict
        return AspectRatioGroupedSemiSupDatasetTwoCrop(
            (label_data_loader, unlabel_data_loader),
            (batch_size_label, batch_size_unlabel),
        )
    else:
        raise NotImplementedError(
            "ASPECT_RATIO_GROUPING = False is not supported yet")
def build_batch_data_loader(dataset,
                            sampler,
                            total_batch_size,
                            *,
                            aspect_ratio_grouping=False,
                            num_workers=0):
    """
    Build a batched dataloader for training.

    Args:
        dataset (torch.utils.data.Dataset): map-style PyTorch dataset. Can be indexed.
        sampler (torch.utils.data.sampler.Sampler): a sampler that produces indices
        total_batch_size, aspect_ratio_grouping, num_workers): see
            :func:`build_detection_train_loader`.

    Returns:
        iterable[list]. Length of each list is the batch size of the current
        GPU. Each element in the list comes from the dataset.
    """
    world_size = get_world_size()
    assert (
        total_batch_size > 0 and total_batch_size % world_size == 0
    ), "Total batch size ({}) must be divisible by the number of gpus ({}).".format(
        total_batch_size, world_size)

    # effective per-GPU batch size
    batch_size = total_batch_size // world_size
    if aspect_ratio_grouping:
        data_loader = torch.utils.data.DataLoader(
            dataset,
            sampler=sampler,
            num_workers=num_workers,
            batch_sampler=None,
            # operator.itemgetter(0) extracts the single element of each one-item batch
            collate_fn=operator.itemgetter(0),  # don't batch, but yield individual elements
            worker_init_fn=worker_init_reset_seed,
        )  # yield individual mapped dict
        # group elements into batches of similar aspect ratio
        return AspectRatioGroupedDataset(data_loader, batch_size)
    else:
        # batch_sampler yields a whole batch of indices at a time
        batch_sampler = torch.utils.data.sampler.BatchSampler(
            sampler, batch_size,
            drop_last=True)  # drop_last so the batch always have the same size
        return torch.utils.data.DataLoader(
            dataset,
            num_workers=num_workers,
            # trivial collator: keep the list of dicts instead of merging into tensors
            batch_sampler=batch_sampler,
            collate_fn=trivial_batch_collator,
            worker_init_fn=worker_init_reset_seed,
        )
def forward(self, outputs, targets):
    """ This performs the loss computation.
    Parameters:
         outputs: dict of tensors, see the output specification of the model for the format
         targets: list of dicts, such that len(targets) == batch_size.
                  The expected keys in each dict depend on the losses applied, see each loss' doc
    """
    outputs_without_aux = {
        k: v
        for k, v in outputs.items()
        if k != 'aux_outputs' and k != 'enc_outputs'
    }

    # Retrieve the matching between the outputs of the last layer and the targets
    indices = self.matcher(outputs_without_aux, targets)

    # Compute the average number of target boxes across all nodes, for normalization purposes
    num_boxes = sum(len(t["labels"]) for t in targets)
    num_boxes = torch.as_tensor([num_boxes],
                                dtype=torch.float,
                                device=next(iter(outputs.values())).device)
    if is_dist_avail_and_initialized():
        torch.distributed.all_reduce(num_boxes)
    num_boxes = torch.clamp(num_boxes / comm.get_world_size(), min=1).item()

    # Compute all the requested losses
    losses = {}
    for loss in self.losses:
        losses.update(
            self.get_loss(loss, outputs, targets, indices, num_boxes))

    # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
    if 'aux_outputs' in outputs:
        for i, aux_outputs in enumerate(outputs['aux_outputs']):
            indices = self.matcher(aux_outputs, targets)
            for loss in self.losses:
                if loss == 'masks':
                    # Intermediate masks losses are too costly to compute, we ignore them.
                    continue
                kwargs = {}
                if loss == 'labels':
                    # Logging is enabled only for the last layer
                    kwargs = {'log': False}
                l_dict = self.get_loss(loss, aux_outputs, targets, indices,
                                       num_boxes, **kwargs)
                l_dict = {k + f'_{i}': v for k, v in l_dict.items()}
                losses.update(l_dict)

    return losses
def fcos_losses(self, labels, reg_targets, logits_pred, reg_pred,
                ctrness_pred, gt_inds, mask_centers_targets):
    num_classes = logits_pred.size(1)
    assert num_classes == self.num_classes
    labels = labels.flatten()

    pos_inds = torch.nonzero(labels != num_classes).squeeze(1)
    num_pos_local = pos_inds.numel()
    num_gpus = get_world_size()
    total_num_pos = reduce_sum(pos_inds.new_tensor([num_pos_local])).item()
    num_pos_avg = max(total_num_pos / num_gpus, 1.0)

    # prepare one_hot
    class_target = torch.zeros_like(logits_pred)
    class_target[pos_inds, labels[pos_inds]] = 1

    class_loss = sigmoid_focal_loss_jit(
        logits_pred,
        class_target,
        alpha=self.focal_loss_alpha,
        gamma=self.focal_loss_gamma,
        reduction="sum",
    ) / num_pos_avg

    reg_pred = reg_pred[pos_inds]
    reg_targets = reg_targets[pos_inds]
    ctrness_pred = ctrness_pred[pos_inds]
    gt_inds = gt_inds[pos_inds]
    mask_center = mask_centers_targets[pos_inds]

    # TODO: needs to be revised
    # ctrness_targets = compute_ctrness_targets(reg_targets)
    # ctrness_targets_sum = ctrness_targets.sum()
    # loss_denorm = max(reduce_sum(ctrness_targets_sum).item() / num_gpus, 1e-6)

    if pos_inds.numel() > 0:
        reg_loss = self.loc_loss_func(
            reg_pred,
            reg_targets,
            ctrness_pred,
            mask_center,
        )
    else:
        reg_loss = reg_pred.sum() * 0

    losses = {"loss_fcos_cls": class_loss, "loss_fcos_loc": reg_loss}
    extras = {
        "pos_inds": pos_inds,
        "gt_inds": gt_inds,
    }
    return losses, extras
def build_ss_batch_data_loader(dataset,
                               sampler,
                               total_batch_size_label,
                               total_batch_size_unlabel,
                               *,
                               aspect_ratio_grouping=True,
                               num_workers=0):
    """Instantiates two data loaders based on provided metadata and wraps them into a single loader.

    Code is largely taken from `detectron2.data.build.build_batch_data_loader`.
    """
    world_size = get_world_size()

    # Check that batch sizes are divisible by the #GPUs
    assert (
        total_batch_size_label > 0 and total_batch_size_label % world_size == 0
    ), "Total label batch size ({}) must be divisible by the number of gpus ({}).".format(
        total_batch_size_label, world_size)
    assert (
        total_batch_size_unlabel > 0 and total_batch_size_unlabel % world_size == 0
    ), "Total unlabel batch size ({}) must be divisible by the number of gpus ({}).".format(
        total_batch_size_unlabel, world_size)

    # Calculate per-GPU batch sizes
    batch_size_label = total_batch_size_label // world_size
    batch_size_unlabel = total_batch_size_unlabel // world_size

    label_dataset, unlabel_dataset = dataset
    label_sampler, unlabel_sampler = sampler

    assert aspect_ratio_grouping, "ASPECT_RATIO_GROUPING = False is not supported yet"

    # Wrapper for DataLoader instantiation to avoid duplicate code
    create_data_loader = lambda dataset, sampler: torch.utils.data.DataLoader(
        dataset,
        sampler=sampler,
        num_workers=num_workers,
        batch_sampler=None,
        collate_fn=operator.itemgetter(0),  # don't batch, but yield individual elements
        worker_init_fn=worker_init_reset_seed,
    )  # yield individual mapped dict

    label_data_loader = create_data_loader(label_dataset, label_sampler)
    unlabel_data_loader = create_data_loader(unlabel_dataset, unlabel_sampler)

    return AspectRatioGroupedSSDataset(
        (label_data_loader, unlabel_data_loader),
        (batch_size_label, batch_size_unlabel),
    )
def build_batch_data_loader(dataset,
                            sampler,
                            total_batch_size,
                            *,
                            aspect_ratio_grouping=False,
                            num_workers=0):
    """
    Build a batched dataloader for training.

    Args:
        dataset (torch.utils.data.Dataset): map-style PyTorch dataset. Can be indexed.
        sampler (torch.utils.data.sampler.Sampler): a sampler that produces indices
        total_batch_size (int): total batch size across GPUs.
        aspect_ratio_grouping (bool): whether to group images with similar
            aspect ratio for efficiency. When enabled, it requires each
            element in dataset be a dict with keys "width" and "height".
        num_workers (int): number of parallel data loading workers

    Returns:
        iterable[list]. Length of each list is the batch size of the current
        GPU. Each element in the list comes from the dataset.
    """
    world_size = get_world_size()
    assert (
        total_batch_size > 0 and total_batch_size % world_size == 0
    ), "Total batch size ({}) must be divisible by the number of gpus ({}).".format(
        total_batch_size, world_size)

    batch_size = total_batch_size // world_size
    if aspect_ratio_grouping:
        data_loader = torch.utils.data.DataLoader(
            dataset,
            sampler=sampler,
            num_workers=num_workers,
            batch_sampler=None,
            collate_fn=operator.itemgetter(0),  # don't batch, but yield individual elements
            worker_init_fn=worker_init_reset_seed,
        )  # yield individual mapped dict
        return AspectRatioGroupedDataset(data_loader, batch_size)
    else:
        batch_sampler = torch.utils.data.sampler.BatchSampler(
            sampler, batch_size,
            drop_last=True)  # drop_last so the batch always have the same size
        return torch.utils.data.DataLoader(
            dataset,
            num_workers=num_workers,
            batch_sampler=batch_sampler,
            collate_fn=trivial_batch_collator,
            worker_init_fn=worker_init_reset_seed,
        )
def benchmark_data_advanced(args):
    # benchmark dataloader with more details to help analyze performance bottleneck
    cfg = setup(args)
    benchmark = create_data_benchmark(cfg, args)

    if comm.get_rank() == 0:
        benchmark.benchmark_dataset(100)
        benchmark.benchmark_mapper(100)
        benchmark.benchmark_workers(100, warmup=10)
        benchmark.benchmark_IPC(100, warmup=10)
    if comm.get_world_size() > 1:
        benchmark.benchmark_distributed(100)
        logger.info("Rerun ...")
        benchmark.benchmark_distributed(100)
def reset(self):
    self._working_dir = tempfile.TemporaryDirectory(prefix="cityscapes_eval_")
    self._temp_dir = self._working_dir.name
    # All workers will write to the same results directory
    # TODO this does not work in distributed training
    assert (
        comm.get_local_size() == comm.get_world_size()
    ), "CityscapesEvaluator currently does not work with multiple machines."
    self._temp_dir = comm.all_gather(self._temp_dir)[0]
    if self._temp_dir != self._working_dir.name:
        self._working_dir.cleanup()
    self._logger.info(
        "Writing cityscapes results to temporary directory {} ...".format(
            self._temp_dir))
def fcos_losses(
    labels,
    reg_targets,
    logits_pred,
    reg_pred,
    ctrness_pred,
    focal_loss_alpha,
    focal_loss_gamma,
    iou_loss,
):
    num_classes = logits_pred.size(1)
    labels = labels.flatten()

    pos_inds = torch.nonzero(labels != num_classes).squeeze(1)
    num_pos_local = pos_inds.numel()
    num_gpus = get_world_size()
    total_num_pos = reduce_sum(pos_inds.new_tensor([num_pos_local])).item()
    num_pos_avg = max(total_num_pos / num_gpus, 1.0)

    # prepare one_hot
    class_target = torch.zeros_like(logits_pred)
    class_target[pos_inds, labels[pos_inds]] = 1

    class_loss = sigmoid_focal_loss_jit(
        logits_pred,
        class_target,
        alpha=focal_loss_alpha,
        gamma=focal_loss_gamma,
        reduction="sum",
    ) / num_pos_avg

    reg_pred = reg_pred[pos_inds]
    reg_targets = reg_targets[pos_inds]
    ctrness_pred = ctrness_pred[pos_inds]

    ctrness_targets = compute_ctrness_targets(reg_targets)
    ctrness_targets_sum = ctrness_targets.sum()
    ctrness_norm = max(reduce_sum(ctrness_targets_sum).item() / num_gpus, 1e-6)

    reg_loss = iou_loss(reg_pred, reg_targets, ctrness_targets) / ctrness_norm

    ctrness_loss = F.binary_cross_entropy_with_logits(
        ctrness_pred, ctrness_targets, reduction="sum") / num_pos_avg

    losses = {
        "loss_fcos_cls": class_loss,
        "loss_fcos_loc": reg_loss,
        "loss_fcos_ctr": ctrness_loss
    }
    return losses, {}
def build_batch_data_loader(dataset,
                            sampler,
                            total_batch_size,
                            *,
                            aspect_ratio_grouping=False,
                            num_workers=0):
    """
    Build a batched dataloader. The main differences from `torch.utils.data.DataLoader` are:
    1. support aspect ratio grouping options
    2. use no "batch collation", because this is common for detection training

    Args:
        dataset (torch.utils.data.Dataset): map-style PyTorch dataset. Can be indexed.
        sampler (torch.utils.data.sampler.Sampler): a sampler that produces indices
        total_batch_size, aspect_ratio_grouping, num_workers): see
            :func:`build_detection_train_loader`.

    Returns:
        iterable[list]. Length of each list is the batch size of the current
        GPU. Each element in the list comes from the dataset.
    """
    world_size = get_world_size()
    assert (
        total_batch_size > 0 and total_batch_size % world_size == 0
    ), "Total batch size ({}) must be divisible by the number of gpus ({}).".format(
        total_batch_size, world_size)

    batch_size = total_batch_size // world_size
    if aspect_ratio_grouping:
        data_loader = torch.utils.data.DataLoader(
            dataset,
            sampler=sampler,
            num_workers=num_workers,
            batch_sampler=None,
            collate_fn=operator.itemgetter(0),  # don't batch, but yield individual elements
            worker_init_fn=worker_init_reset_seed,
        )  # yield individual mapped dict
        return AspectRatioGroupedDataset(data_loader, batch_size)
    else:
        batch_sampler = torch.utils.data.sampler.BatchSampler(
            sampler, batch_size,
            drop_last=True)  # drop_last so the batch always have the same size
        return torch.utils.data.DataLoader(
            dataset,
            num_workers=num_workers,
            batch_sampler=batch_sampler,
            collate_fn=trivial_batch_collator,
            worker_init_fn=worker_init_reset_seed,
        )
def setup_loaders(cfg):
    loaders = {}
    loaders["train"] = build_data_loader(
        cfg, "MeshVox", "train", multigpu=comm.get_world_size() > 1)

    # Since sampling the mesh is now coupled with the data loader, we need to
    # make two different Dataset / DataLoaders for the training set: one for
    # training which uses precomputed samples, and one for evaluation which uses
    # more samples and computes them on the fly. This is sort of gross.
    loaders["train_eval"] = build_data_loader(cfg,
                                              "MeshVox",
                                              "train_eval",
                                              multigpu=False)

    loaders["val"] = build_data_loader(cfg, "MeshVox", "val", multigpu=False)
    return loaders
def setup_myargs_for_multiple_processing(myargs):
    from detectron2.utils import comm
    distributed = comm.get_world_size() > 1
    if distributed and comm.is_main_process():
        # setup logging in the project
        logfile = myargs.args.logfile
        logging_utils.get_logger(filename=logfile,
                                 logger_names=['template_lib', 'tl'],
                                 stream=True)
        logger = logging.getLogger('tl')
        myargs.logger = logger
        myargs.stdout = sys.stdout
        myargs.stderr = sys.stderr
        logging_utils.redirect_print_to_logger(logger=logger)
    return myargs
def main(args):
    cfg = setup(args)

    model = build_model(cfg)
    logger.info("Model:\n{}".format(model))

    DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
        cfg.MODEL.WEIGHTS, resume=args.resume)

    distributed = comm.get_world_size() > 1
    if distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[comm.get_local_rank()],
                                        broadcast_buffers=False)

    do_infer(cfg, args, model)
def forward(self, input):
    if comm.get_world_size() == 1 or not self.training:
        return super().forward(input)

    B, C = input.shape[0], input.shape[1]

    mean = torch.mean(input, dim=[0, 2, 3])
    meansqr = torch.mean(input * input, dim=[0, 2, 3])

    if self._stats_mode == "":
        assert B > 0, 'SyncBatchNorm(stats_mode="") does not support zero batch size.'
        vec = torch.cat([mean, meansqr], dim=0)
        vec = differentiable_all_reduce(vec) * (1.0 / dist.get_world_size())
        mean, meansqr = torch.split(vec, C)
        momentum = self.momentum
    else:
        if B == 0:
            vec = torch.zeros([2 * C + 1], device=mean.device, dtype=mean.dtype)
            vec = vec + input.sum()  # make sure there is gradient w.r.t input
        else:
            vec = torch.cat([
                mean, meansqr,
                torch.ones([1], device=mean.device, dtype=mean.dtype)
            ], dim=0)
        vec = differentiable_all_reduce(vec * B)

        total_batch = vec[-1].detach()
        momentum = total_batch.clamp(max=1) * self.momentum  # no update if total_batch is 0
        total_batch = torch.max(total_batch,
                                torch.ones_like(total_batch))  # avoid div-by-zero
        mean, meansqr, _ = torch.split(vec / total_batch, C)

    var = meansqr - mean * mean
    invstd = torch.rsqrt(var + self.eps)
    scale = self.weight * invstd
    bias = self.bias - mean * scale
    scale = scale.reshape(1, -1, 1, 1)
    bias = bias.reshape(1, -1, 1, 1)

    self.running_mean += momentum * (mean.detach() - self.running_mean)
    self.running_var += momentum * (var.detach() - self.running_var)
    return input * scale + bias
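A note on the arithmetic above: only the first and second moments are all-reduced, and the variance is recovered afterwards from the identity var = E[x^2] - (E[x])^2 (the line var = meansqr - mean * mean). In the non-default stats mode, each process contributes vec * B, so the trailing ones([1]) entry sums to the global batch size; dividing by vec[-1] turns the summed moments back into global means, and momentum = total_batch.clamp(max=1) * self.momentum zeroes the running-stat update when every process had an empty batch.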
def main(args):
    train_name, num_class = regist_coco_dataset(args.train_annotation,
                                                args.thing_classes)
    val_name, _ = regist_coco_dataset(args.val_annotation, args.thing_classes)
    test_name, _ = regist_coco_dataset(args.test_annotation,
                                       args.thing_classes)
    cfg, hyperparameters = setup(args, train_name, val_name, test_name,
                                 num_class)
    dest_dir = os.path.join(cfg.OUTPUT_DIR, 'sample_compare_result')

    if not args.resume:
        if os.path.isdir(cfg.OUTPUT_DIR):
            shutil.rmtree(cfg.OUTPUT_DIR)
        os.mkdir(cfg.OUTPUT_DIR)
        os.mkdir(dest_dir)

    if hasattr(args, 'opts'):
        mlflow.log_params(hyperparameters)

    model = build_model(cfg)
    logger.info("Model:\n{}".format(model))
    if args.eval_only:
        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
            cfg.MODEL.WEIGHTS, resume=args.resume)
        return do_evaluate(cfg, model)

    distributed = comm.get_world_size() > 1
    if distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[comm.get_local_rank()],
                                        broadcast_buffers=False)

    model = do_train(cfg, model, resume=args.resume)
    # mlflow.pytorch.log_model(pytorch_model=model,
    #                          artifact_path='model_best',
    #                          conda_env=mlflow.pytorch.get_default_conda_env())
    mlflow.log_artifact(
        os.path.join(cfg.OUTPUT_DIR,
                     f'model_{os.getenv("MLFLOW_EXPERIMENT_NAME")}.pth'))

    results = do_evaluate(cfg, model)
    mlflow.log_metrics({k + '_bbox': v for k, v in results['bbox'].items()})
    mlflow.log_metrics({k + '_segm': v for k, v in results['segm'].items()})

    experiment_name = os.getenv('MLFLOW_EXPERIMENT_NAME')
    compare_gt_coco(cfg,
                    annotation_file=args.test_annotation,
                    dest_dir=dest_dir,
                    weight=os.path.join(cfg.OUTPUT_DIR,
                                        f'model_{experiment_name}.pth'),
                    score_thres_test=0.7,
                    num_sample=num_class)
    mlflow.log_artifacts(dest_dir)
def __init__(self, dataset, num_replicas=None, rank=None):
    """
    Arguments:
        - dataset (:obj:`dataset`): instance of dataset object
    """
    if num_replicas is None:
        num_replicas = comm.get_world_size()
    if rank is None:
        rank = comm.get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    self.num_samples = len(range(rank, len(self.dataset), num_replicas))
    self.total_size = len(self.dataset)
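For instance, with len(dataset) = 10 and num_replicas = 4, rank 1 gets num_samples = len(range(1, 10, 4)) = 3 (indices 1, 5, 9) while rank 3 gets only 2, so shard sizes can differ by one across ranks.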
def __init__(self, cfg):
    """Initializes the CSDTrainer.

    Most of the code is from `super.__init__()`, the only change is that for `self._trainer`
    the `CSDTrainer` is used and weight scheduling parameters are injected into it, look for
    "CSD: ..." comments.
    """
    TrainerBase.__init__(self)  # CSD: don't call `super`'s init as we are overriding it
    logger = logging.getLogger("detectron2")
    if not logger.isEnabledFor(logging.INFO):  # setup_logger is not called for d2
        setup_logger()
    cfg = DefaultTrainer.auto_scale_workers(cfg, comm.get_world_size())

    # Assume these objects must be constructed in this order.
    model = self.build_model(cfg)
    optimizer = self.build_optimizer(cfg, model)
    data_loader = self.build_train_loader(cfg)

    model = create_ddp_model(model, broadcast_buffers=False)
    self._trainer = CSDTrainer(model, data_loader, optimizer)  # CSD: use a CSD-specific trainer

    # CSD: inject weight scheduling parameters into trainer
    (
        self._trainer.solver_csd_beta,
        self._trainer.solver_csd_t0,
        self._trainer.solver_csd_t1,
        self._trainer.solver_csd_t2,
        self._trainer.solver_csd_t,
    ) = (
        cfg.SOLVER.CSD_WEIGHT_SCHEDULE_RAMP_BETA,
        cfg.SOLVER.CSD_WEIGHT_SCHEDULE_RAMP_T0,
        cfg.SOLVER.CSD_WEIGHT_SCHEDULE_RAMP_T1,
        cfg.SOLVER.CSD_WEIGHT_SCHEDULE_RAMP_T2,
        cfg.SOLVER.CSD_WEIGHT_SCHEDULE_RAMP_T,
    )

    self.scheduler = self.build_lr_scheduler(cfg, optimizer)
    self.checkpointer = WandbDetectionCheckpointer(  # CSD: use custom checkpointer (only few lines are added there)
        # Assume you want to save checkpoints together with logs/statistics
        model,
        cfg.OUTPUT_DIR,
        trainer=weakref.proxy(self),
    )
    self.start_iter = 0
    self.max_iter = cfg.SOLVER.MAX_ITER
    self.cfg = cfg

    self.register_hooks(self.build_hooks())
def __init__(self, dataset_dicts, seed: Optional[int] = None):
    """
    Args:
        dataset_dicts (list[dict]): dataset dicts of the underlying dataset to sample
            from; its length is the total number of data.
        seed (int): the initial seed of the shuffle. Must be the same
            across all workers. If None, will use a random seed shared
            among workers (require synchronization among all workers).
    """
    self._size = len(dataset_dicts)
    assert self._size > 0
    if seed is None:
        seed = comm.shared_random_seed()
    self._seed = int(seed)

    self._rank = comm.get_rank()
    self._world_size = comm.get_world_size()
    self.weights = self._get_class_balance_factor(dataset_dicts)
def load_cornell_instances(image_dir, to_polygons=True):
    """
    Args:
        image_dir (str): path to the raw dataset. e.g., "~/cityscapes/leftImg8bit/train".
        to_polygons (bool): whether to represent the segmentation as polygons
            (COCO's format) instead of masks (cityscapes's format).

    Returns:
        list[dict]: a list of dicts in Detectron2 standard format. (See
        `Using Custom Datasets </tutorials/datasets.html>`_ )
    """
    files = []
    for grasps_file in glob.glob(os.path.join(image_dir, "*cpos.txt")):
        assert os.path.isfile(grasps_file), grasps_file
        cat_id = int(re.search(r"pcd(\d+)cpos.txt", grasps_file).group(1))
        image_file = grasps_file.replace("cpos.txt", "r.png")
        # image_file = grasps_file.replace("cpos.txt", "d.tiff")  # TODO: using depth
        assert os.path.isfile(image_file), image_file

        neg_grasps_file = grasps_file.replace("cpos.txt", "cneg.txt")
        assert os.path.isfile(neg_grasps_file), neg_grasps_file

        files.append((cat_id, image_file, grasps_file, neg_grasps_file))
    assert len(files), "No images found in {}".format(image_dir)

    logger = logging.getLogger(__name__)
    logger.info("Preprocessing cornell annotations ...")
    # This is still not fast: all workers will execute duplicate work and will
    # take up to 10m on a 8GPU server.
    pool = mp.Pool(processes=max(mp.cpu_count() // get_world_size() // 2, 4))

    ret = pool.map(
        functools.partial(cornell_files_to_dict, to_polygons=to_polygons),
        files,
    )
    logger.info("Loaded {} images from {}".format(len(ret), image_dir))

    # Map ids to contiguous ids
    # dataset_id_to_contiguous_id = {l.id: idx for idx, l in enumerate(os.listdir(image_dir))}
    # for dict_per_image in ret:
    #     for anno in dict_per_image["annotations"]:
    #         anno["category_id"] = dataset_id_to_contiguous_id[anno["category_id"]]
    return ret
def _guess_batch_size():
    # Inputs are meta-arch dependent, the most general solution will be
    # adding a function like `get_batch_size()` to each meta arch
    ret = 1
    try:
        model_input_shapes = model_data(model)["input_shapes"]
        assert isinstance(model_input_shapes, list)
        assert len(model_input_shapes) > 0
        # assuming the first input is a list of images
        ret = len(model_input_shapes[0])
    except Exception:
        ret = cfg.SOLVER.IMS_PER_BATCH // comm.get_world_size()
        logger.warning("Could not get batch size, compute from"
                       f" `cfg.SOLVER.IMS_PER_BATCH`={ret}")
    return ret