def __call__(self, embedding, targets):
    embedding = F.normalize(embedding, dim=1)

    # For distributed training, gather embeddings and labels from all processes.
    if comm.get_world_size() > 1:
        all_embedding = concat_all_gather(embedding)
        all_targets = concat_all_gather(targets)
    else:
        all_embedding = embedding
        all_targets = targets

    # Cosine similarity between local embeddings and the (gathered) global set.
    dist_mat = torch.matmul(embedding, all_embedding.t())

    N, M = dist_mat.size()
    is_pos = targets.view(N, 1).expand(N, M).eq(
        all_targets.view(M, 1).expand(M, N).t())
    is_neg = targets.view(N, 1).expand(N, M).ne(
        all_targets.view(M, 1).expand(M, N).t())

    # Similarities to positive / negative pairs for each anchor.
    s_p = dist_mat[is_pos].contiguous().view(N, -1)
    s_n = dist_mat[is_neg].contiguous().view(N, -1)

    # Self-paced weights and margins of circle loss.
    alpha_p = F.relu(-s_p.detach() + 1 + self.m)
    alpha_n = F.relu(s_n.detach() + self.m)
    delta_p = 1 - self.m
    delta_n = self.m

    logit_p = -self.s * alpha_p * (s_p - delta_p)
    logit_n = self.s * alpha_n * (s_n - delta_n)

    loss = F.softplus(
        torch.logsumexp(logit_p, dim=1) +
        torch.logsumexp(logit_n, dim=1)).mean()

    return loss * self._scale
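
# The distributed branch above uses a `concat_all_gather` helper that is not shown
# in this section. A minimal sketch, assuming the usual torch.distributed
# all-gather pattern; no gradient flows through the gathered copies, which is
# acceptable here because gradients only need to flow through the local `embedding`.
@torch.no_grad()
def concat_all_gather(tensor):
    """Gather a tensor from every process and concatenate along dim 0."""
    world_size = torch.distributed.get_world_size()
    tensors_gather = [torch.ones_like(tensor) for _ in range(world_size)]
    torch.distributed.all_gather(tensors_gather, tensor, async_op=False)
    return torch.cat(tensors_gather, dim=0)
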
def __init__(self, data_source: list, batch_size: int, num_instances: int,
             seed: Optional[int] = None):
    # `data_source` is a list of (img_path, pid, camid, ...) items.
    self.data_source = data_source
    self.batch_size = batch_size
    self.num_instances = num_instances
    self.num_pids_per_batch = batch_size // self.num_instances

    # Lookup tables: dataset index -> pid, pid -> camera ids, pid -> dataset indices.
    self.index_pid = defaultdict(list)
    self.pid_cam = defaultdict(list)
    self.pid_index = defaultdict(list)

    for index, info in enumerate(data_source):
        pid = info[1]
        camid = info[2]
        self.index_pid[index] = pid
        self.pid_cam[pid].append(camid)
        self.pid_index[pid].append(index)

    self.pids = list(self.pid_index.keys())
    self.num_identities = len(self.pids)

    if seed is None:
        seed = comm.shared_random_seed()
    self._seed = int(seed)
    self._rank = comm.get_rank()
    self._world_size = comm.get_world_size()
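
# A small, self-contained illustration of the lookup tables built above; the
# `toy_source` items and their (img_path, pid, camid) layout are illustrative only.
from collections import defaultdict

toy_source = [("a.jpg", 0, 1), ("b.jpg", 0, 2), ("c.jpg", 1, 1)]
pid_index = defaultdict(list)
for index, (_, pid, _camid) in enumerate(toy_source):
    pid_index[pid].append(index)
# pid_index == {0: [0, 1], 1: [2]}: each identity maps to the dataset indices of
# its images, which is what the sampler draws from when forming P x K batches
# (batch_size // num_instances identities, num_instances images each).
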
def fcos_losses(
    labels,
    reg_targets,
    logits_pred,
    reg_pred,
    ctrness_pred,
    focal_loss_alpha,
    focal_loss_gamma,
    iou_loss,
):
    num_classes = logits_pred.size(1)
    labels = labels.flatten()

    # Foreground locations are those whose label is not the background index.
    pos_inds = torch.nonzero(labels != num_classes).squeeze(1)
    num_pos_local = pos_inds.numel()
    num_gpus = get_world_size()
    total_num_pos = reduce_sum(pos_inds.new_tensor([num_pos_local])).item()
    num_pos_avg = max(total_num_pos / num_gpus, 1.0)

    # prepare one_hot
    class_target = torch.zeros_like(logits_pred)
    class_target[pos_inds, labels[pos_inds]] = 1

    class_loss = sigmoid_focal_loss_jit(
        logits_pred,
        class_target,
        alpha=focal_loss_alpha,
        gamma=focal_loss_gamma,
        reduction="sum",
    ) / num_pos_avg

    # Regression and center-ness losses are computed on foreground locations only.
    reg_pred = reg_pred[pos_inds]
    reg_targets = reg_targets[pos_inds]
    ctrness_pred = ctrness_pred[pos_inds]

    ctrness_targets = compute_ctrness_targets(reg_targets)
    ctrness_targets_sum = ctrness_targets.sum()
    ctrness_norm = max(reduce_sum(ctrness_targets_sum).item() / num_gpus, 1e-6)

    reg_loss = iou_loss(reg_pred, reg_targets, ctrness_targets) / ctrness_norm

    ctrness_loss = F.binary_cross_entropy_with_logits(
        ctrness_pred, ctrness_targets, reduction="sum") / num_pos_avg

    losses = {
        "loss_fcos_cls": class_loss,
        "loss_fcos_loc": reg_loss,
        "loss_fcos_ctr": ctrness_loss
    }
    return losses, {}
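
# `compute_ctrness_targets` is referenced above but not shown in this section. A
# minimal sketch following the center-ness definition from the FCOS paper,
# sqrt((min(l,r)/max(l,r)) * (min(t,b)/max(t,b))), assuming `reg_targets` holds
# (left, top, right, bottom) distances per location:
def compute_ctrness_targets(reg_targets):
    if len(reg_targets) == 0:
        return reg_targets.new_zeros(len(reg_targets))
    left_right = reg_targets[:, [0, 2]]
    top_bottom = reg_targets[:, [1, 3]]
    ctrness = (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * \
              (top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0])
    return torch.sqrt(ctrness)
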
def __init__(self, cfg):
    """
    Args:
        cfg (CfgNode):
    """
    logger = logging.getLogger("core")
    if not logger.isEnabledFor(logging.INFO):  # setup_logger is not called for core
        setup_logger()

    # Assume these objects must be constructed in this order.
    data_loader = self.build_train_loader(cfg)
    cfg = self.auto_scale_hyperparams(cfg, data_loader)
    model = self.build_model(cfg)
    optimizer = self.build_optimizer(cfg, model)

    # For training, wrap with DDP. This is not needed for inference.
    if comm.get_world_size() > 1:
        # See https://github.com/pytorch/pytorch/issues/22049: set
        # `find_unused_parameters=True` if some parameters do not receive gradients.
        model = DistributedDataParallel(
            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False)

    super().__init__(model, data_loader, optimizer)

    self.scheduler = self.build_lr_scheduler(cfg, optimizer)
    # Assume no other objects need to be checkpointed.
    # We can later make it checkpoint the stateful hooks.
    self.checkpointer = Checkpointer(
        # Assume you want to save checkpoints together with logs/statistics
        model,
        cfg.OUTPUT_DIR,
        save_to_disk=comm.is_main_process(),
        optimizer=optimizer,
        scheduler=self.scheduler,
    )
    self.start_iter = 0
    if cfg.SOLVER.SWA.ENABLED:
        self.max_iter = cfg.SOLVER.MAX_ITER + cfg.SOLVER.SWA.ITER
    else:
        self.max_iter = cfg.SOLVER.MAX_ITER

    self.cfg = cfg

    self.register_hooks(self.build_hooks())
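
# Hedged usage sketch of a trainer built from this __init__; the `Trainer` class
# name, the `setup` helper, the `args` fields and the resume flow are illustrative
# assumptions, not taken from this section.
def train_entry(args):
    cfg = setup(args)                       # hypothetical helper returning a CfgNode
    trainer = Trainer(cfg)                  # the class whose __init__ is shown above
    trainer.resume_or_load(resume=args.resume)
    return trainer.train()
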
def default_setup(cfg, args):
    """
    Perform some basic common setups at the beginning of a job, including:

    1. Set up the logger
    2. Log basic information about the environment, cmdline arguments, and config
    3. Back up the config to the output directory

    Args:
        cfg (CfgNode): the full config to be used
        args (argparse.Namespace): the command line arguments to be logged
    """
    output_dir = cfg.OUTPUT_DIR
    if comm.is_main_process() and output_dir:
        PathManager.mkdirs(output_dir)

    rank = comm.get_rank()
    setup_logger(output_dir, distributed_rank=rank, name="fvcore")
    logger = setup_logger(output_dir, distributed_rank=rank)

    logger.info("Rank of current process: {}. World size: {}".format(
        rank, comm.get_world_size()))
    logger.info("Environment info:\n" + collect_env_info())

    logger.info("Command line arguments: " + str(args))
    if hasattr(args, "config_file") and args.config_file != "":
        logger.info("Contents of args.config_file={}:\n{}".format(
            args.config_file,
            PathManager.open(args.config_file, "r").read()))

    logger.info("Running with full config:\n{}".format(cfg))
    if comm.is_main_process() and output_dir:
        # Note: some of our scripts may expect the existence of
        # config.yaml in the output directory
        path = os.path.join(output_dir, "config.yaml")
        with PathManager.open(path, "w") as f:
            f.write(cfg.dump())
        logger.info("Full config saved to {}".format(os.path.abspath(path)))

    # make sure each worker has a different, yet deterministic seed if specified
    seed_all_rng()

    # cudnn benchmark has large overhead. It shouldn't be used considering the
    # small size of typical validation sets.
    if not (hasattr(args, "eval_only") and args.eval_only):
        torch.backends.cudnn.benchmark = cfg.CUDNN_BENCHMARK
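
# Hedged sketch of where `default_setup` is usually called from; `get_cfg` and the
# argument names follow the common yacs/detectron2-style entry point and are an
# assumption, not part of this section.
def setup(args):
    cfg = get_cfg()
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()
    default_setup(cfg, args)
    return cfg
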
def __init__(self, size: int, shuffle: bool = True, seed: Optional[int] = None):
    """
    Args:
        size (int): the total number of data of the underlying dataset to sample from
        shuffle (bool): whether to shuffle the indices or not
        seed (int): the initial seed of the shuffle. Must be the same
            across all workers. If None, will use a random seed shared
            among workers (require synchronization among all workers).
    """
    self._size = size
    assert size > 0
    self._shuffle = shuffle

    if seed is None:
        seed = comm.shared_random_seed()
    self._seed = int(seed)
    self._rank = comm.get_rank()
    self._world_size = comm.get_world_size()
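
# The iteration side of this sampler is not shown in this section. A minimal sketch
# of the usual infinite-stream pattern (detectron2-style) that pairs with the fields
# initialised above, assuming each rank consumes every world_size-th index of a
# shared, seeded permutation:
def __iter__(self):
    start = self._rank
    yield from itertools.islice(
        self._infinite_indices(), start, None, self._world_size)

def _infinite_indices(self):
    g = torch.Generator()
    g.manual_seed(self._seed)
    while True:
        if self._shuffle:
            yield from torch.randperm(self._size, generator=g).tolist()
        else:
            yield from torch.arange(self._size).tolist()
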
def auto_scale_hyperparams(cfg, data_loader):
    r"""
    Automatically compute the actual number of training iterations.
    Some hyper-parameters, such as MAX_ITER, are given in epochs rather than
    iterations, so they need to be converted to iteration counts here.
    """
    cfg = cfg.clone()
    frozen = cfg.is_frozen()
    cfg.defrost()

    iters_per_epoch = len(data_loader.dataset) // (
        cfg.SOLVER.IMS_PER_BATCH * comm.get_world_size())
    cfg.MODEL.HEADS.NUM_CLASSES = data_loader.dataset.num_classes
    cfg.SOLVER.MAX_ITER *= iters_per_epoch
    cfg.SOLVER.WARMUP_ITERS *= iters_per_epoch
    cfg.SOLVER.FREEZE_ITERS *= iters_per_epoch
    cfg.SOLVER.DELAY_ITERS *= iters_per_epoch
    for i in range(len(cfg.SOLVER.STEPS)):
        cfg.SOLVER.STEPS[i] *= iters_per_epoch
    cfg.SOLVER.SWA.ITER *= iters_per_epoch
    cfg.SOLVER.SWA.PERIOD *= iters_per_epoch
    cfg.SOLVER.CHECKPOINT_PERIOD *= iters_per_epoch

    # Evaluation period must be divisible by 200 for writing into tensorboard.
    num_mode = 200 - (cfg.TEST.EVAL_PERIOD * iters_per_epoch) % 200
    cfg.TEST.EVAL_PERIOD = cfg.TEST.EVAL_PERIOD * iters_per_epoch + num_mode

    logger = logging.getLogger(__name__)
    logger.info(
        f"Auto-scaling the config to num_classes={cfg.MODEL.HEADS.NUM_CLASSES}, "
        f"max_Iter={cfg.SOLVER.MAX_ITER}, warmup_Iter={cfg.SOLVER.WARMUP_ITERS}, "
        f"freeze_Iter={cfg.SOLVER.FREEZE_ITERS}, delay_Iter={cfg.SOLVER.DELAY_ITERS}, "
        f"step_Iter={cfg.SOLVER.STEPS}, ckpt_Iter={cfg.SOLVER.CHECKPOINT_PERIOD}, "
        f"eval_Iter={cfg.TEST.EVAL_PERIOD}.")

    if frozen:
        cfg.freeze()

    return cfg
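
# Worked example of the scaling above; the dataset size and config values here are
# illustrative, not defaults.
iters_per_epoch = 12936 // (64 * 1)              # 202 iterations per epoch (1 GPU, IMS_PER_BATCH=64)
max_iter = 120 * iters_per_epoch                 # 24240: MAX_ITER interpreted as 120 epochs
eval_raw = 30 * iters_per_epoch                  # 6060: EVAL_PERIOD of 30 epochs, in iterations
eval_period = eval_raw + (200 - eval_raw % 200)  # 6200: rounded up to a multiple of 200
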
def reduce_loss_dict(loss_dict):
    """
    Reduce the loss dictionary from all processes so that process with rank 0
    has the averaged results. Returns a dict with the same fields as
    loss_dict, after reduction.
    """
    world_size = get_world_size()
    if world_size < 2:
        return loss_dict
    with torch.no_grad():
        loss_names = []
        all_losses = []
        for k in sorted(loss_dict.keys()):
            loss_names.append(k)
            all_losses.append(loss_dict[k])
        all_losses = torch.stack(all_losses, dim=0)
        dist.reduce(all_losses, dst=0)
        if dist.get_rank() == 0:
            # only main process gets accumulated, so only divide by
            # world_size in this case
            all_losses /= world_size
        reduced_losses = {k: v for k, v in zip(loss_names, all_losses)}
    return reduced_losses
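
# Hedged usage sketch: `model`, `images` and `targets` are illustrative names. The
# reduced dict is typically only used for logging; backprop still runs on the local,
# unreduced losses so every rank computes gradients from its own batch.
loss_dict = model(images, targets)                  # e.g. {"loss_cls": ..., "loss_reg": ...}
losses = sum(loss for loss in loss_dict.values())
loss_dict_reduced = reduce_loss_dict(loss_dict)
losses_reduced = sum(loss for loss in loss_dict_reduced.values())
# log losses_reduced / loss_dict_reduced; call losses.backward() to train
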
def build_reid_train_loader(cfg):
    train_transforms = build_transforms(cfg, is_train=True)

    train_items = list()
    for d in cfg.DATASETS.NAMES:
        dataset = DATASET_REGISTRY.get(d)(root=_root, combineall=cfg.DATASETS.COMBINEALL)
        if comm.is_main_process():
            dataset.show_train()
        train_items.extend(dataset.train)

    train_set = CommDataset(train_items, train_transforms, relabel=True)

    num_workers = cfg.DATALOADER.NUM_WORKERS
    mini_batch_size = cfg.SOLVER.IMS_PER_BATCH
    num_instance = cfg.DATALOADER.NUM_INSTANCE
    global_batch_size = mini_batch_size * comm.get_world_size()

    if cfg.DATALOADER.PK_SAMPLER:
        if cfg.DATALOADER.NAIVE_WAY:
            data_sampler = samplers.NaiveIdentitySampler(
                train_set.img_items, global_batch_size, num_instance)
        else:
            data_sampler = samplers.BalancedIdentitySampler(
                train_set.img_items, global_batch_size, num_instance)
    else:
        data_sampler = samplers.TrainingSampler(len(train_set))

    batch_sampler = torch.utils.data.sampler.BatchSampler(
        data_sampler, mini_batch_size, True)

    train_loader = torch.utils.data.DataLoader(
        train_set,
        num_workers=num_workers,
        batch_sampler=batch_sampler,
        collate_fn=fast_batch_collator,
    )
    return train_loader
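
# Note on the two batch sizes above, with illustrative numbers only: the identity
# samplers are built with the global batch size (mini_batch_size * world_size),
# while BatchSampler slices mini_batch_size items per rank. For example:
mini_batch_size = 64          # cfg.SOLVER.IMS_PER_BATCH, per rank
world_size = 4                # number of training processes
num_instance = 4              # cfg.DATALOADER.NUM_INSTANCE
global_batch_size = mini_batch_size * world_size                  # 256 images per global step
identities_per_global_batch = global_batch_size // num_instance  # 64 identities, 4 images each
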
def __call__(self, embedding, targets):
    if self._normalize_feature:
        embedding = normalize(embedding, axis=-1)

    # For distributed training, gather all features from the other processes.
    if comm.get_world_size() > 1:
        all_embedding = concat_all_gather(embedding)
        all_targets = concat_all_gather(targets)
    else:
        all_embedding = embedding
        all_targets = targets

    dist_mat = euclidean_dist(embedding, all_embedding)

    N, M = dist_mat.size()
    is_pos = targets.view(N, 1).expand(N, M).eq(
        all_targets.view(M, 1).expand(M, N).t())
    is_neg = targets.view(N, 1).expand(N, M).ne(
        all_targets.view(M, 1).expand(M, N).t())

    if self._hard_mining:
        dist_ap, dist_an = hard_example_mining(dist_mat, is_pos, is_neg)
    else:
        dist_ap, dist_an = weighted_example_mining(dist_mat, is_pos, is_neg)

    y = dist_an.new().resize_as_(dist_an).fill_(1)

    if self._margin > 0:
        loss = F.margin_ranking_loss(dist_an, dist_ap, y, margin=self._margin)
    else:
        loss = F.soft_margin_loss(dist_an - dist_ap, y)
        # soft margin can overflow to inf for badly separated batches;
        # fall back to a fixed-margin ranking loss in that case.
        if loss == float('Inf'):
            loss = F.margin_ranking_loss(dist_an, dist_ap, y, margin=0.3)

    return loss * self._scale
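
# `hard_example_mining` is referenced above but not shown in this section. A
# minimal sketch of the standard batch-hard rule (hardest positive = farthest
# same-identity sample, hardest negative = closest different-identity sample);
# this is an assumption about the helper, not its exact implementation here.
def hard_example_mining(dist_mat, is_pos, is_neg):
    # distances are non-negative, so masking negatives with 0 keeps the max over positives
    dist_ap, _ = torch.max(dist_mat * is_pos.float(), dim=1)
    # push positives to a huge value so the min is taken over negatives only
    dist_an, _ = torch.min(dist_mat * is_neg.float() + is_pos.float() * 1e9, dim=1)
    return dist_ap, dist_an
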
def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0):
    num_gpus = get_world_size()
    if is_train:
        images_per_batch = cfg.SOLVER.IMS_PER_BATCH
        assert images_per_batch % num_gpus == 0, (
            "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number "
            "of GPUs ({}) used.".format(images_per_batch, num_gpus))
        images_per_gpu = images_per_batch // num_gpus
        shuffle = True
        num_iters = cfg.SOLVER.MAX_ITER
    else:
        images_per_batch = cfg.TEST.IMS_PER_BATCH
        assert images_per_batch % num_gpus == 0, (
            "TEST.IMS_PER_BATCH ({}) must be divisible by the number "
            "of GPUs ({}) used.".format(images_per_batch, num_gpus))
        images_per_gpu = images_per_batch // num_gpus
        shuffle = False if not is_distributed else True
        num_iters = None
        start_iter = 0

    if images_per_gpu > 1:
        logger = logging.getLogger(__name__)
        logger.warning(
            "When using more than one image per GPU you may encounter "
            "an out-of-memory (OOM) error if your GPU does not have "
            "sufficient memory. If this happens, you can reduce "
            "SOLVER.IMS_PER_BATCH (for training) or "
            "TEST.IMS_PER_BATCH (for inference). For training, you must "
            "also adjust the learning rate and schedule length according "
            "to the linear scaling rule. See for example: "
            "https://github.com/facebookresearch/Detectron/blob/master/configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml#L14"
        )

    # group images which have similar aspect ratio. In this case, we only
    # group in two cases: those with width / height > 1, and the other way around,
    # but the code supports more general grouping strategy
    aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else []

    paths_catalog = import_file("core.config.paths_catalog", cfg.PATHS_CATALOG, True)
    DatasetCatalog = paths_catalog.DatasetCatalog
    dataset_list = cfg.DATASETS.TRAIN if is_train else cfg.DATASETS.TEST

    # If bbox aug is enabled in testing, simply set transforms to None and we will apply transforms later
    transforms = None if not is_train and cfg.TEST.BBOX_AUG.ENABLED else \
        build_transforms(cfg, is_train)
    datasets = build_dataset(dataset_list, transforms, DatasetCatalog, is_train)

    data_loaders = []
    for dataset in datasets:
        sampler = make_data_sampler(dataset, shuffle, is_distributed)
        batch_sampler = make_batch_data_sampler(
            dataset, sampler, aspect_grouping, images_per_gpu, num_iters, start_iter)
        collator = BBoxAugCollator() if not is_train and cfg.TEST.BBOX_AUG.ENABLED else \
            BatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY)
        num_workers = cfg.DATALOADER.NUM_WORKERS
        data_loader = torch.utils.data.DataLoader(
            dataset,
            num_workers=num_workers,
            batch_sampler=batch_sampler,
            collate_fn=collator,
        )
        data_loaders.append(data_loader)
    if is_train:
        # during training, a single (possibly concatenated) data_loader is returned
        assert len(data_loaders) == 1
        return data_loaders[0]
    return data_loaders
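
# Hedged usage sketch; the `arguments["iteration"]` resume bookkeeping follows a
# common maskrcnn-benchmark-style training script and is illustrative here.
num_gpus = get_world_size()
train_loader = make_data_loader(
    cfg, is_train=True, is_distributed=(num_gpus > 1), start_iter=arguments["iteration"])
test_loaders = make_data_loader(cfg, is_train=False, is_distributed=(num_gpus > 1))
# train_loader is a single loader; test_loaders is a list, one per dataset in DATASETS.TEST.
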