def build_detection_train_loader(cfg: CfgNode, mapper=None): """ A data loader is created in a way similar to that of Detectron2. The main differences are: - it allows to combine datasets with different but compatible object category sets The data loader is created by the following steps: 1. Use the dataset names in config to query :class:`DatasetCatalog`, and obtain a list of dicts. 2. Start workers to work on the dicts. Each worker will: * Map each metadata dict into another format to be consumed by the model. * Batch them by simply putting dicts into a list. The batched ``list[mapped_dict]`` is what this dataloader will return. Args: cfg (CfgNode): the config mapper (callable): a callable which takes a sample (dict) from dataset and returns the format to be consumed by the model. By default it will be `DatasetMapper(cfg, True)`. Returns: an infinite iterator of training data """ _add_category_whitelists_to_metadata(cfg) _add_category_maps_to_metadata(cfg) dataset_dicts = combine_detection_dataset_dicts( cfg.DATASETS.TRAIN, keep_instance_predicate=_get_train_keep_instance_predicate(cfg), proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None, ) dataset = DatasetFromList(dataset_dicts, copy=False) if mapper is None: mapper = DatasetMapper(cfg, True) dataset = MapDataset(dataset, mapper) sampler_name = cfg.DATALOADER.SAMPLER_TRAIN logger = logging.getLogger(__name__) logger.info("Using training sampler {}".format(sampler_name)) if sampler_name == "TrainingSampler": sampler = TrainingSampler(len(dataset)) elif sampler_name == "RepeatFactorTrainingSampler": repeat_factors = RepeatFactorTrainingSampler.repeat_factors_from_category_frequency( dataset_dicts, cfg.DATALOADER.REPEAT_THRESHOLD) sampler = RepeatFactorTrainingSampler(repeat_factors) else: raise ValueError("Unknown training sampler: {}".format(sampler_name)) return build_batch_data_loader( dataset, sampler, cfg.SOLVER.IMS_PER_BATCH, aspect_ratio_grouping=cfg.DATALOADER.ASPECT_RATIO_GROUPING, num_workers=cfg.DATALOADER.NUM_WORKERS, )
def build_detection_train_loader(cfg, mapper=None): """ A data loader is created by the following steps: 1. Use the dataset names in config to query :class:`DatasetCatalog`, and obtain a list of dicts. 2. Coordinate a random shuffle order shared among all processes (all GPUs) 3. Each process spawn another few workers to process the dicts. Each worker will: * Map each metadata dict into another format to be consumed by the model. * Batch them by simply putting dicts into a list. The batched ``list[mapped_dict]`` is what this dataloader will yield. Args: cfg (CfgNode): the config mapper (callable): a callable which takes a sample (dict) from dataset and returns the format to be consumed by the model. By default it will be ``DatasetMapper(cfg, True)``. Returns: an infinite iterator of training data """ dataset_dicts = get_detection_dataset_dicts( cfg.DATASETS.TRAIN, filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS, min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE if cfg.MODEL.KEYPOINT_ON else 0, proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None, ) dataset = DatasetFromList(dataset_dicts, copy=False) if mapper is None: mapper = DatasetMapper(cfg, True) dataset = MapDataset(dataset, mapper) sampler_name = cfg.DATALOADER.SAMPLER_TRAIN logger = logging.getLogger(__name__) logger.info("Using training sampler {}".format(sampler_name)) # TODO avoid if-else? if sampler_name == "TrainingSampler": sampler = TrainingSampler(len(dataset)) elif sampler_name == "RepeatFactorTrainingSampler": repeat_factors = RepeatFactorTrainingSampler.repeat_factors_from_category_frequency( dataset_dicts, cfg.DATALOADER.REPEAT_THRESHOLD ) sampler = RepeatFactorTrainingSampler(repeat_factors) else: raise ValueError("Unknown training sampler: {}".format(sampler_name)) return build_batch_data_loader( dataset, sampler, cfg.SOLVER.IMS_PER_BATCH, aspect_ratio_grouping=cfg.DATALOADER.ASPECT_RATIO_GROUPING, num_workers=cfg.DATALOADER.NUM_WORKERS, )
def build_detection_semisup_train_loader(cfg, mapper=None): dataset_dicts = get_detection_dataset_dicts( cfg.DATASETS.TRAIN, filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS, min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE if cfg.MODEL.KEYPOINT_ON else 0, proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None, ) # Divide into labeled and unlabeled sets according to supervision percentage label_dicts, unlabel_dicts = divide_label_unlabel( dataset_dicts, cfg.DATALOADER.SUP_PERCENT, cfg.DATALOADER.RANDOM_DATA_SEED, cfg.DATALOADER.RANDOM_DATA_SEED_PATH, ) dataset = DatasetFromList(label_dicts, copy=False) if mapper is None: mapper = DatasetMapper(cfg, True) dataset = MapDataset(dataset, mapper) sampler_name = cfg.DATALOADER.SAMPLER_TRAIN logger = logging.getLogger(__name__) logger.info("Using training sampler {}".format(sampler_name)) if sampler_name == "TrainingSampler": sampler = TrainingSampler(len(dataset)) elif sampler_name == "RepeatFactorTrainingSampler": repeat_factors = ( RepeatFactorTrainingSampler.repeat_factors_from_category_frequency( label_dicts, cfg.DATALOADER.REPEAT_THRESHOLD)) sampler = RepeatFactorTrainingSampler(repeat_factors) else: raise ValueError("Unknown training sampler: {}".format(sampler_name)) # list num of labeled and unlabeled logger.info("Number of training samples " + str(len(dataset))) logger.info("Supervision percentage " + str(cfg.DATALOADER.SUP_PERCENT)) return build_batch_data_loader( dataset, sampler, cfg.SOLVER.IMS_PER_BATCH, aspect_ratio_grouping=cfg.DATALOADER.ASPECT_RATIO_GROUPING, num_workers=cfg.DATALOADER.NUM_WORKERS, )
def test_repeat_factors_from_category_frequency(self): repeat_thresh = 0.5 dataset_dicts = [ { "annotations": [{ "category_id": 0 }, { "category_id": 1 }] }, { "annotations": [{ "category_id": 0 }] }, { "annotations": [] }, ] rep_factors = RepeatFactorTrainingSampler.repeat_factors_from_category_frequency( dataset_dicts, repeat_thresh) expected_rep_factors = torch.tensor([math.sqrt(3 / 2), 1.0, 1.0]) self.assertTrue(torch.allclose(rep_factors, expected_rep_factors))
def build_custom_train_loader(cfg, mapper=None): """ Modified from detectron2.data.build.build_custom_train_loader, but supports different samplers """ dataset_dicts = get_detection_dataset_dicts( cfg.DATASETS.TRAIN, filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS, min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE if cfg.MODEL.KEYPOINT_ON else 0, proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None, ) dataset = DatasetFromList(dataset_dicts, copy=False) if mapper is None: mapper = DatasetMapper(cfg, True) dataset = MapDataset(dataset, mapper) sampler_name = cfg.DATALOADER.SAMPLER_TRAIN logger = logging.getLogger(__name__) logger.info("Using training sampler {}".format(sampler_name)) # TODO avoid if-else? if sampler_name == "TrainingSampler": sampler = TrainingSampler(len(dataset)) elif sampler_name == "RepeatFactorTrainingSampler": repeat_factors = RepeatFactorTrainingSampler.repeat_factors_from_category_frequency( dataset_dicts, cfg.DATALOADER.REPEAT_THRESHOLD) sampler = RepeatFactorTrainingSampler(repeat_factors) elif sampler_name == "ClassAwareSampler": sampler = ClassAwareSampler(dataset_dicts) else: raise ValueError("Unknown training sampler: {}".format(sampler_name)) return build_batch_data_loader( dataset, sampler, cfg.SOLVER.IMS_PER_BATCH, aspect_ratio_grouping=cfg.DATALOADER.ASPECT_RATIO_GROUPING, num_workers=cfg.DATALOADER.NUM_WORKERS, )
def _train_loader_from_config(cfg, mapper=None, *, dataset=None, sampler=None): if dataset is None: dataset = get_detection_dataset_dicts( cfg.DATASETS.TRAIN, filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS, min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE if cfg.MODEL.KEYPOINT_ON else 0, proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None, ) if mapper is None: mapper = DatasetMapper(cfg, True) if sampler is None: sampler_name = cfg.DATALOADER.SAMPLER_TRAIN logger = logging.getLogger("detectron2.trainer") logger.info("Using training sampler {}".format(sampler_name)) if sampler_name == "TrainingSampler": sampler = TrainingSampler(len(dataset)) elif sampler_name == "RepeatFactorTrainingSampler": repeat_factors = RepeatFactorTrainingSampler.repeat_factors_from_category_frequency( dataset, cfg.DATALOADER.REPEAT_THRESHOLD ) sampler = RepeatFactorTrainingSampler(repeat_factors) else: raise ValueError( "Unknown training sampler: {}".format(sampler_name)) return { "dataset": dataset, "sampler": sampler, "mapper": mapper, "total_batch_size": cfg.SOLVER.IMS_PER_BATCH, "aspect_ratio_grouping": cfg.DATALOADER.ASPECT_RATIO_GROUPING, "num_workers": cfg.DATALOADER.NUM_WORKERS, }
def build_weighted_detection_train_loader(cfg: CfgNode, mapper=None): dataset_repeat_factors = get_train_datasets_repeat_factors(cfg) # OrderedDict to guarantee order of values() consistent with repeat factors dataset_name_to_dicts = OrderedDict( { name: get_detection_dataset_dicts( [name], filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS, min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE if cfg.MODEL.KEYPOINT_ON else 0, proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None, ) for name in cfg.DATASETS.TRAIN } ) # Repeat factor for every sample in the dataset repeat_factors = [ [dataset_repeat_factors[dsname]] * len(dataset_name_to_dicts[dsname]) for dsname in cfg.DATASETS.TRAIN ] repeat_factors = list(itertools.chain.from_iterable(repeat_factors)) dataset_dicts = dataset_name_to_dicts.values() dataset_dicts = list(itertools.chain.from_iterable(dataset_dicts)) dataset = DatasetFromList(dataset_dicts, copy=False) if mapper is None: mapper = DatasetMapper(cfg, True) dataset = MapDataset(dataset, mapper) logger.info( "Using WeightedTrainingSampler with repeat_factors={}".format( cfg.DATASETS.TRAIN_REPEAT_FACTOR ) ) sampler = RepeatFactorTrainingSampler(torch.tensor(repeat_factors)) return build_batch_data_loader( dataset, sampler, cfg.SOLVER.IMS_PER_BATCH, aspect_ratio_grouping=cfg.DATALOADER.ASPECT_RATIO_GROUPING, num_workers=cfg.DATALOADER.NUM_WORKERS, )
# %% -------------------- # https://detectron2.readthedocs.io/en/latest/modules/data.html#detectron2.data.samplers.RepeatFactorTrainingSampler # https://github.com/facebookresearch/detectron2/issues/2139 dataset_dicts_1 = [ {"annotations": [{"category_id": 0}, {"category_id": 1}]}, {"annotations": [{"category_id": 0}]}, {"annotations": [{"category_id": 0}]}, {"annotations": [{"category_id": 0}]}, {"annotations": [{"category_id": 0}, {"category_id": 2}]}, {"annotations": [{"category_id": 0}, {"category_id": 2}]}, {"annotations": [{"category_id": 0}]}, {"annotations": [{"category_id": 0}]}, {"annotations": [{"category_id": 0}]}, {"annotations": [{"category_id": 0}]}, {"annotations": [{"category_id": 0}]}, {"annotations": [{"category_id": 0}]}, {"annotations": [{"category_id": 0}]}, {"annotations": [{"category_id": 0}]}, {"annotations": [{"category_id": 0}]}, {"annotations": [{"category_id": 0}]}, {"annotations": [{"category_id": 0}]}, {"annotations": [{"category_id": 0}]}, {"annotations": [{"category_id": 0}]}, ] sampler = RepeatFactorTrainingSampler.repeat_factors_from_category_frequency(dataset_dicts_1, 3) # Inherit the Sampler, based on the threshold to reduce the problem of sample imbalance, # categories with fewer samples are more likely to be sampled, which is suitable for the LVIS # data set print(sampler)