def build_detection_test_loader_with_attributes(cfg, dataset_name, mapper=None):
    """
    Build an inference data loader (batch size 1) for a single dataset.

    Args:
        cfg: config node. Reads ``MODEL.LOAD_PROPOSALS``, ``DATASETS.TEST``,
            ``DATASETS.PROPOSAL_FILES_TEST`` and ``DATALOADER.NUM_WORKERS``.
        dataset_name (str): name of the dataset to load.
        mapper (callable): maps one dataset dict to the model input format.
            Defaults to ``AttributeDatasetMapper(cfg, False)``.

    Returns:
        torch.utils.data.DataLoader: loader over the mapped dataset, batched
        by ``trivial_batch_collator``.
    """
    if cfg.MODEL.LOAD_PROPOSALS:
        # The proposal file is looked up at the dataset's position in DATASETS.TEST.
        position = list(cfg.DATASETS.TEST).index(dataset_name)
        proposal_files = [cfg.DATASETS.PROPOSAL_FILES_TEST[position]]
    else:
        proposal_files = None

    records = get_detection_dataset_dicts(
        [dataset_name],
        filter_empty=False,
        proposal_files=proposal_files,
    )

    if mapper is None:
        mapper = AttributeDatasetMapper(cfg, False)
    mapped = MapDataset(DatasetFromList(records), mapper)

    index_sampler = samplers.InferenceSampler(len(mapped))
    single_image_batches = torch.utils.data.sampler.BatchSampler(
        index_sampler, 1, drop_last=False
    )
    return torch.utils.data.DataLoader(
        mapped,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
        batch_sampler=single_image_batches,
        collate_fn=trivial_batch_collator,
    )
def build_detection_val_loader(cfg, dataset_name: str, mapper=None):
    """
    Build a batch-size-1 loader over one dataset, mapped in training mode by default.

    Args:
        cfg: config node. Reads ``MODEL.LOAD_PROPOSALS``, ``DATASETS.TEST``,
            ``DATASETS.PROPOSAL_FILES_TEST`` and ``DATALOADER.NUM_WORKERS``.
        dataset_name (str): name of the dataset to load.
        mapper (callable): maps one dataset dict to the model input format.
            Defaults to ``DatasetMapper(cfg, True)``.

    Returns:
        torch.utils.data.DataLoader: loader over the mapped dataset, batched
        by ``trivial_batch_collator``.
    """
    proposal_files = None
    if cfg.MODEL.LOAD_PROPOSALS:
        # The proposal file is looked up at the dataset's position in DATASETS.TEST.
        test_names = list(cfg.DATASETS.TEST)
        proposal_files = [cfg.DATASETS.PROPOSAL_FILES_TEST[test_names.index(dataset_name)]]

    records = get_detection_dataset_dicts(
        [dataset_name],
        filter_empty=False,
        proposal_files=proposal_files,
    )
    base = DatasetFromList(records)

    if mapper is None:
        mapper = DatasetMapper(cfg, True)
    mapped = MapDataset(base, mapper)

    batches = torch.utils.data.sampler.BatchSampler(
        InferenceSampler(len(mapped)), 1, drop_last=False
    )
    loader = torch.utils.data.DataLoader(
        mapped,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
        batch_sampler=batches,
        collate_fn=trivial_batch_collator,
    )
    return loader
def build_detection_meta_loader(cfg, mapper=None):
    """
    Build the meta set loader from the training data using class-balanced sampling.

    Args:
        cfg: config node. Reads ``DATASETS.TRAIN``, the ``DATALOADER`` options,
            ``SOLVER.IMS_PER_BATCH`` and the keypoint / proposal model options.
        mapper (callable): maps one dataset dict to the model input format.
            Defaults to ``DatasetMapper(cfg, True)``.

    Returns:
        The loader produced by ``build_batch_data_loader``.
    """
    keypoint_floor = (
        cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE if cfg.MODEL.KEYPOINT_ON else 0
    )
    train_proposals = (
        cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None
    )
    records = get_detection_dataset_dicts(
        cfg.DATASETS.TRAIN,
        filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
        min_keypoints=keypoint_floor,
        proposal_files=train_proposals,
    )

    base = DatasetFromList(records, copy=False)
    if mapper is None:
        mapper = DatasetMapper(cfg, True)
    mapped = MapDataset(base, mapper)

    log = logging.getLogger(__name__)
    log.info("Using training sampler Class Balanced Sampler")

    # Per-image repeat factors are derived from the raw (unmapped) dataset dicts.
    factors = ClassBalancedTrainingSampler.repeat_factors_by_inverse_category_frequency(
        records
    )
    balanced_sampler = ClassBalancedTrainingSampler(factors)

    return build_batch_data_loader(
        mapped,
        balanced_sampler,
        cfg.SOLVER.IMS_PER_BATCH,
        aspect_ratio_grouping=cfg.DATALOADER.ASPECT_RATIO_GROUPING,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
    )
def build_detection_test_loader_for_images(cfg, dataset_path, mapper=None):
    """
    Build a batch-size-1 inference loader over the image files in a directory.

    Files matching ``*.jpg`` are collected first, then those matching ``*.png``.
    Each image becomes a dict with ``file_name`` (the globbed path) and
    ``image_id`` (the file's base name without extension).

    Args:
        cfg: config node. Reads ``DATALOADER.NUM_WORKERS``.
        dataset_path (str): directory that is globbed for images.
        mapper (callable): maps one dataset dict to the model input format.
            Defaults to ``TestDatasetMapper(cfg, False)``.

    Returns:
        torch.utils.data.DataLoader: loader over the mapped images, batched
        by ``trivial_batch_collator``.
    """
    image_paths = []
    for pattern in ("*.jpg", "*.png"):
        image_paths.extend(glob.glob(os.path.join(dataset_path, pattern)))

    records = []
    for path in image_paths:
        stem, _ext = os.path.splitext(os.path.basename(path))
        records.append({"file_name": path, "image_id": stem})

    if mapper is None:
        mapper = TestDatasetMapper(cfg, False)
    mapped = MapDataset(DatasetFromList(records), mapper)

    index_sampler = samplers.InferenceSampler(len(mapped))
    batches = torch.utils.data.sampler.BatchSampler(index_sampler, 1, drop_last=False)
    return torch.utils.data.DataLoader(
        mapped,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
        batch_sampler=batches,
        collate_fn=trivial_batch_collator,
    )
def build_detection_train_loader_with_attributes(cfg, mapper=None):
    """
    Build the training data loader, using the attribute mapper by default.

    ``cfg.SOLVER.IMS_PER_BATCH`` is divided by ``get_world_size()`` to get the
    number of images each process batches together.

    Args:
        cfg: config node. Reads ``SOLVER.IMS_PER_BATCH``, ``DATASETS.TRAIN``,
            the ``DATALOADER`` options and the keypoint / proposal model options.
        mapper (callable): maps one dataset dict to the model input format.
            Defaults to ``AttributeDatasetMapper(cfg, True)``.

    Returns:
        An ``AspectRatioGroupedDataset`` wrapping a DataLoader when
        ``cfg.DATALOADER.ASPECT_RATIO_GROUPING`` is set, otherwise a plain
        ``torch.utils.data.DataLoader``.

    Raises:
        AssertionError: if ``IMS_PER_BATCH`` is not divisible by, or is smaller
            than, the world size.
        ValueError: if ``cfg.DATALOADER.SAMPLER_TRAIN`` is not a known sampler.
    """
    world_size = get_world_size()
    total_batch = cfg.SOLVER.IMS_PER_BATCH
    assert (
        total_batch % world_size == 0
    ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({}).".format(
        total_batch, world_size)
    assert (
        total_batch >= world_size
    ), "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({}).".format(
        total_batch, world_size)
    per_process_batch = total_batch // world_size

    keypoint_floor = (
        cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE if cfg.MODEL.KEYPOINT_ON else 0
    )
    train_proposals = (
        cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None
    )
    records = get_detection_dataset_dicts(
        cfg.DATASETS.TRAIN,
        filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
        min_keypoints=keypoint_floor,
        proposal_files=train_proposals,
    )

    if mapper is None:
        mapper = AttributeDatasetMapper(cfg, True)
    mapped = MapDataset(DatasetFromList(records, copy=False), mapper)

    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    logging.getLogger(__name__).info("Using training sampler {}".format(sampler_name))
    if sampler_name == "TrainingSampler":
        sampler = samplers.TrainingSampler(len(mapped))
    elif sampler_name == "RepeatFactorTrainingSampler":
        sampler = samplers.RepeatFactorTrainingSampler(
            records, cfg.DATALOADER.REPEAT_THRESHOLD)
    else:
        raise ValueError("Unknown training sampler: {}".format(sampler_name))

    if not cfg.DATALOADER.ASPECT_RATIO_GROUPING:
        fixed_size_batches = torch.utils.data.sampler.BatchSampler(
            sampler, per_process_batch, drop_last=True)
        return torch.utils.data.DataLoader(
            mapped,
            num_workers=cfg.DATALOADER.NUM_WORKERS,
            batch_sampler=fixed_size_batches,
            collate_fn=trivial_batch_collator,
            worker_init_fn=worker_init_reset_seed,
        )

    # No batching in the DataLoader: the collate function unwraps each
    # one-element batch, and the grouping wrapper forms the batches.
    unbatched = torch.utils.data.DataLoader(
        mapped,
        sampler=sampler,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
        batch_sampler=None,
        collate_fn=operator.itemgetter(0),
        worker_init_fn=worker_init_reset_seed,
    )
    return AspectRatioGroupedDataset(unbatched, per_process_batch)
def build_custom_train_loader(cfg, mapper=None):
    """
    Build the training data loader with a configurable sampler.

    Supported values of ``cfg.DATALOADER.SAMPLER_TRAIN`` are
    ``"TrainingSampler"``, ``"MultiDatasetSampler"`` (requires
    ``cfg.DATALOADER.SOURCE_AWARE``), ``"RepeatFactorTrainingSampler"`` and
    ``"ClassAwareSampler"``.

    Args:
        cfg: config node. Reads ``DATASETS.TRAIN``, the ``DATALOADER`` options,
            ``SOLVER.IMS_PER_BATCH`` and the keypoint / proposal model options.
        mapper (callable): maps one dataset dict to the model input format.
            Required; there is no default mapper.

    Returns:
        The loader produced by ``build_batch_data_loader``.

    Raises:
        AssertionError: if ``mapper`` is None, or if ``MultiDatasetSampler`` is
            requested without ``cfg.DATALOADER.SOURCE_AWARE``.
        ValueError: if ``cfg.DATALOADER.SAMPLER_TRAIN`` is not a known sampler.
    """
    # Raised explicitly (not via `assert`) so the check survives `python -O`,
    # and checked first so no dataset is loaded for an unusable call.
    if mapper is None:
        raise AssertionError("build_custom_train_loader requires an explicit mapper")

    source_aware = cfg.DATALOADER.SOURCE_AWARE
    load_dicts = (
        get_detection_dataset_dicts_with_source if source_aware else get_detection_dataset_dicts
    )
    dataset_dicts = load_dicts(
        cfg.DATASETS.TRAIN,
        filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
        min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
        if cfg.MODEL.KEYPOINT_ON
        else 0,
        proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None,
    )

    if source_aware:
        # Count images per source; 'dataset_source' is used as an index into
        # the list of training dataset names.
        sizes = [0] * len(cfg.DATASETS.TRAIN)
        for d in dataset_dicts:
            sizes[d['dataset_source']] += 1
        print('dataset sizes', sizes)

    dataset = DatasetFromList(dataset_dicts, copy=False)
    dataset = MapDataset(dataset, mapper)

    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    logger = logging.getLogger(__name__)
    logger.info("Using training sampler {}".format(sampler_name))
    if sampler_name == "TrainingSampler":
        sampler = TrainingSampler(len(dataset))
    elif sampler_name == "MultiDatasetSampler":
        # `sizes` only exists in source-aware mode; fail with a clear message
        # instead of a NameError when asserts are disabled.
        if not source_aware:
            raise AssertionError(
                "MultiDatasetSampler requires cfg.DATALOADER.SOURCE_AWARE to be enabled"
            )
        sampler = MultiDatasetSampler(cfg, sizes, dataset_dicts)
    elif sampler_name == "RepeatFactorTrainingSampler":
        repeat_factors = RepeatFactorTrainingSampler.repeat_factors_from_category_frequency(
            dataset_dicts, cfg.DATALOADER.REPEAT_THRESHOLD)
        sampler = RepeatFactorTrainingSampler(repeat_factors)
    elif sampler_name == "ClassAwareSampler":
        sampler = ClassAwareSampler(dataset_dicts)
    else:
        raise ValueError("Unknown training sampler: {}".format(sampler_name))

    return build_batch_data_loader(
        dataset,
        sampler,
        cfg.SOLVER.IMS_PER_BATCH,
        aspect_ratio_grouping=cfg.DATALOADER.ASPECT_RATIO_GROUPING,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
    )
def build_detection_val_loader(cfg, dataset_name, total_batch_size, mapper=None):
    """
    Build a validation loader over one dataset with a configurable batch size.

    The per-process batch size is ``total_batch_size // comm.get_world_size()``.
    Indices come from ``DistributedSampler(dataset, shuffle=False)`` and the
    last, possibly smaller, batch is kept (``drop_last=False``).

    Args:
        cfg: config node. Reads ``MODEL.LOAD_PROPOSALS``, ``DATASETS.TEST``,
            ``DATASETS.PROPOSAL_FILES_TEST`` and ``DATALOADER.NUM_WORKERS``.
        dataset_name (str): name of the dataset to load.
        total_batch_size (int): batch size summed over all processes; must be
            positive and divisible by the world size.
        mapper (callable): maps one dataset dict to the model input format.
            Defaults to ``DatasetMapper(cfg, True)``.

    Returns:
        torch.utils.data.DataLoader: loader over the mapped dataset, batched
        by ``trivial_batch_collator``.

    Raises:
        AssertionError: if ``total_batch_size`` is not positive or not
            divisible by the world size.
    """
    num_processes = comm.get_world_size()
    assert (
        total_batch_size > 0 and total_batch_size % num_processes == 0
    ), "Total batch size ({}) must be divisible by the number of gpus ({}).".format(
        total_batch_size, num_processes)
    per_process_batch = total_batch_size // num_processes

    proposal_files = None
    if cfg.MODEL.LOAD_PROPOSALS:
        # The proposal file is looked up at the dataset's position in DATASETS.TEST.
        position = list(cfg.DATASETS.TEST).index(dataset_name)
        proposal_files = [cfg.DATASETS.PROPOSAL_FILES_TEST[position]]

    records = get_detection_dataset_dicts(
        [dataset_name],
        filter_empty=False,
        proposal_files=proposal_files,
    )

    if mapper is None:
        mapper = DatasetMapper(cfg, True)
    mapped = MapDataset(DatasetFromList(records), mapper)

    ordered_sampler = DistributedSampler(mapped, shuffle=False)
    batches = torch.utils.data.sampler.BatchSampler(
        ordered_sampler, per_process_batch, drop_last=False
    )
    return torch.utils.data.DataLoader(
        mapped,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
        batch_sampler=batches,
        collate_fn=trivial_batch_collator,
    )
def build_detection_train_loader(cfg: CfgNode, mapper=None):
    """
    Build the training data loader over datasets combined across category sets.

    Category whitelists and category maps are added to the dataset metadata
    first; the datasets named in ``cfg.DATASETS.TRAIN`` are then combined with
    ``combine_detection_dataset_dicts``, which allows datasets with different
    but compatible object category sets to be mixed.

    Args:
        cfg (CfgNode): the config
        mapper (callable): a callable which takes a sample (dict) from dataset
            and returns the format to be consumed by the model.
            By default it will be ``DatasetMapper(cfg, True)``.

    Returns:
        an infinite iterator of training data

    Raises:
        ValueError: if ``cfg.DATALOADER.SAMPLER_TRAIN`` is not a known sampler.
    """
    _add_category_whitelists_to_metadata(cfg)
    _add_category_maps_to_metadata(cfg)

    keep_predicate = _get_train_keep_instance_predicate(cfg)
    train_proposals = (
        cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None
    )
    records = combine_detection_dataset_dicts(
        cfg.DATASETS.TRAIN,
        keep_instance_predicate=keep_predicate,
        proposal_files=train_proposals,
    )

    if mapper is None:
        mapper = DatasetMapper(cfg, True)
    mapped = MapDataset(DatasetFromList(records, copy=False), mapper)

    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    logging.getLogger(__name__).info("Using training sampler {}".format(sampler_name))
    if sampler_name == "TrainingSampler":
        sampler = TrainingSampler(len(mapped))
    elif sampler_name == "RepeatFactorTrainingSampler":
        factors = RepeatFactorTrainingSampler.repeat_factors_from_category_frequency(
            records, cfg.DATALOADER.REPEAT_THRESHOLD
        )
        sampler = RepeatFactorTrainingSampler(factors)
    else:
        raise ValueError("Unknown training sampler: {}".format(sampler_name))

    return build_batch_data_loader(
        mapped,
        sampler,
        cfg.SOLVER.IMS_PER_BATCH,
        aspect_ratio_grouping=cfg.DATALOADER.ASPECT_RATIO_GROUPING,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
    )
def build_detection_test_query_loader(cfg, dataset_name, mapper=None):
    """
    Build an inference loader for one dataset, batched per process.

    Unlike a batch-size-1 test loader, each process batches
    ``cfg.SOLVER.IMS_PER_BATCH // get_world_size()`` images together; the last,
    possibly smaller, batch is kept.

    Args:
        cfg: a detectron2 CfgNode
        dataset_name (str): a name of the dataset that's available in the DatasetCatalog
        mapper (callable): a callable which takes a sample (dict) from dataset
            and returns the format to be consumed by the model.
            By default it will be ``DatasetMapper(cfg, False)``.

    Returns:
        DataLoader: a torch DataLoader over the mapped dataset, batched by
        ``trivial_batch_collator``.

    Raises:
        AssertionError: if ``IMS_PER_BATCH`` is not divisible by, or is smaller
            than, the world size.
    """
    world_size = get_world_size()
    total_batch = cfg.SOLVER.IMS_PER_BATCH
    assert (
        total_batch % world_size == 0
    ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({}).".format(
        total_batch, world_size)
    assert (
        total_batch >= world_size
    ), "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({}).".format(
        total_batch, world_size)
    per_process_batch = total_batch // world_size

    if cfg.MODEL.LOAD_PROPOSALS:
        # The proposal file is looked up at the dataset's position in DATASETS.TEST.
        position = list(cfg.DATASETS.TEST).index(dataset_name)
        proposal_files = [cfg.DATASETS.PROPOSAL_FILES_TEST[position]]
    else:
        proposal_files = None

    records = get_detection_dataset_dicts(
        [dataset_name],
        filter_empty=False,
        proposal_files=proposal_files,
    )

    if mapper is None:
        mapper = DatasetMapper(cfg, False)
    mapped = MapDataset(DatasetFromList(records), mapper)

    index_sampler = samplers.InferenceSampler(len(mapped))
    batches = torch.utils.data.sampler.BatchSampler(
        index_sampler, per_process_batch, drop_last=False
    )
    return torch.utils.data.DataLoader(
        mapped,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
        batch_sampler=batches,
        collate_fn=trivial_batch_collator,
    )
def build_detection_train_loader(cfg, mapper=None):
    """
    Build the training data loader from the datasets named in the config.

    Steps:

    1. Load the dataset dicts for ``cfg.DATASETS.TRAIN``.
    2. Wrap them so that every dict is passed through ``mapper``.
    3. Pick the sampler named by ``cfg.DATALOADER.SAMPLER_TRAIN``.
    4. Hand dataset and sampler to ``build_batch_data_loader``.

    Args:
        cfg (CfgNode): the config
        mapper (callable): a callable which takes a sample (dict) from dataset
            and returns the format to be consumed by the model.
            By default it will be ``DatasetMapper(cfg, True)``.

    Returns:
        an infinite iterator of training data

    Raises:
        ValueError: if ``cfg.DATALOADER.SAMPLER_TRAIN`` is not a known sampler.
    """
    keypoint_floor = (
        cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE if cfg.MODEL.KEYPOINT_ON else 0
    )
    train_proposals = (
        cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None
    )
    records = get_detection_dataset_dicts(
        cfg.DATASETS.TRAIN,
        filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
        min_keypoints=keypoint_floor,
        proposal_files=train_proposals,
    )

    base = DatasetFromList(records, copy=False)
    if mapper is None:
        mapper = DatasetMapper(cfg, True)
    mapped = MapDataset(base, mapper)

    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    log = logging.getLogger(__name__)
    log.info("Using training sampler {}".format(sampler_name))

    if sampler_name == "TrainingSampler":
        sampler = TrainingSampler(len(mapped))
    elif sampler_name == "RepeatFactorTrainingSampler":
        factors = RepeatFactorTrainingSampler.repeat_factors_from_category_frequency(
            records, cfg.DATALOADER.REPEAT_THRESHOLD
        )
        sampler = RepeatFactorTrainingSampler(factors)
    else:
        raise ValueError("Unknown training sampler: {}".format(sampler_name))

    return build_batch_data_loader(
        mapped,
        sampler,
        cfg.SOLVER.IMS_PER_BATCH,
        aspect_ratio_grouping=cfg.DATALOADER.ASPECT_RATIO_GROUPING,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
    )
def build_detection_semisup_train_loader(cfg, mapper=None):
    """
    Build a training loader over the labeled part of the training data only.

    The training dicts are split by ``divide_label_unlabel`` according to
    ``cfg.DATALOADER.SUP_PERCENT``; only the labeled split is used here and
    the unlabeled split is discarded.

    Args:
        cfg: config node. Reads ``DATASETS.TRAIN``, the ``DATALOADER`` options
            (including ``SUP_PERCENT``, ``RANDOM_DATA_SEED`` and
            ``RANDOM_DATA_SEED_PATH``), ``SOLVER.IMS_PER_BATCH`` and the
            keypoint / proposal model options.
        mapper (callable): maps one dataset dict to the model input format.
            Defaults to ``DatasetMapper(cfg, True)``.

    Returns:
        The loader produced by ``build_batch_data_loader``.

    Raises:
        ValueError: if ``cfg.DATALOADER.SAMPLER_TRAIN`` is not a known sampler.
    """
    keypoint_floor = (
        cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE if cfg.MODEL.KEYPOINT_ON else 0
    )
    train_proposals = (
        cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None
    )
    all_records = get_detection_dataset_dicts(
        cfg.DATASETS.TRAIN,
        filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
        min_keypoints=keypoint_floor,
        proposal_files=train_proposals,
    )

    # Split into labeled / unlabeled sets; the unlabeled part is not used here.
    labeled_records, _unlabeled_records = divide_label_unlabel(
        all_records,
        cfg.DATALOADER.SUP_PERCENT,
        cfg.DATALOADER.RANDOM_DATA_SEED,
        cfg.DATALOADER.RANDOM_DATA_SEED_PATH,
    )

    if mapper is None:
        mapper = DatasetMapper(cfg, True)
    mapped = MapDataset(DatasetFromList(labeled_records, copy=False), mapper)

    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    log = logging.getLogger(__name__)
    log.info("Using training sampler {}".format(sampler_name))
    if sampler_name == "TrainingSampler":
        sampler = TrainingSampler(len(mapped))
    elif sampler_name == "RepeatFactorTrainingSampler":
        factors = RepeatFactorTrainingSampler.repeat_factors_from_category_frequency(
            labeled_records, cfg.DATALOADER.REPEAT_THRESHOLD
        )
        sampler = RepeatFactorTrainingSampler(factors)
    else:
        raise ValueError("Unknown training sampler: {}".format(sampler_name))

    # Report how much labeled data is actually used.
    log.info("Number of training samples " + str(len(mapped)))
    log.info("Supervision percentage " + str(cfg.DATALOADER.SUP_PERCENT))

    return build_batch_data_loader(
        mapped,
        sampler,
        cfg.SOLVER.IMS_PER_BATCH,
        aspect_ratio_grouping=cfg.DATALOADER.ASPECT_RATIO_GROUPING,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
    )
def build_detection_test_loader(cfg, dataset_name, mapper=None):
    """
    Build a batch-size-1 inference loader for a multi-frame dataset.

    The number of frames passed to the dataset loader is
    ``cfg.MODEL.SPATIOTEMPORAL.NUM_FRAMES``. With
    ``cfg.MODEL.SPATIOTEMPORAL.FORWARD_AGGREGATION`` it becomes
    ``2 * NUM_FRAMES + 1``, i.e.
    (f_{t-NUM_FRAMES}, ..., f_{t-1}, f_t, f_{t+1}, ..., f_{t+NUM_FRAMES}).

    Args:
        cfg: a detectron2 CfgNode
        dataset_name (str): a name of the dataset that's available in the DatasetCatalog
        mapper (callable): a callable which takes a sample (dict) from dataset
            and returns the format to be consumed by the model.
            By default it will be ``DatasetMapper(cfg, False)``.

    Returns:
        DataLoader: a torch DataLoader over the mapped dataset, batched by
        ``trivial_batch_collator``.
    """
    frame_count = cfg.MODEL.SPATIOTEMPORAL.NUM_FRAMES
    if cfg.MODEL.SPATIOTEMPORAL.FORWARD_AGGREGATION:
        frame_count = (2 * frame_count) + 1

    if cfg.MODEL.LOAD_PROPOSALS:
        # The proposal file is looked up at the dataset's position in DATASETS.TEST.
        position = list(cfg.DATASETS.TEST).index(dataset_name)
        proposal_files = [cfg.DATASETS.PROPOSAL_FILES_TEST[position]]
    else:
        proposal_files = None

    records = get_detection_dataset_dicts(
        cfg,
        [dataset_name],
        frame_count,
        train=False,
        proposal_files=proposal_files,
    )

    if mapper is None:
        mapper = DatasetMapper(cfg, False)
    mapped = MapDataset(DatasetFromList(records), mapper)

    # One image per batch: the standard setting when reporting inference time.
    index_sampler = samplers.InferenceSampler(len(mapped))
    batches = torch.utils.data.sampler.BatchSampler(index_sampler, 1, drop_last=False)
    return torch.utils.data.DataLoader(
        mapped,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
        batch_sampler=batches,
        collate_fn=trivial_batch_collator,
    )
def build_detection_test_loader(cfg, dataset_name, mapper=None):
    """
    Build a batch-size-1 inference loader using the combined-category dataset loader.

    Category whitelists and category maps are added to the dataset metadata
    first; the dataset is then loaded through ``combine_detection_dataset_dicts``
    with the test-time keep-instance predicate.

    Args:
        cfg: a detectron2 CfgNode
        dataset_name (str): a name of the dataset that's available in the DatasetCatalog
        mapper (callable): a callable which takes a sample (dict) from dataset
            and returns the format to be consumed by the model.
            By default it will be ``DatasetMapper(cfg, False)``.

    Returns:
        DataLoader: a torch DataLoader over the mapped dataset, batched by
        ``trivial_batch_collator``.
    """
    _add_category_whitelists_to_metadata(cfg)
    _add_category_maps_to_metadata(cfg)

    keep_predicate = _get_test_keep_instance_predicate(cfg)
    if cfg.MODEL.LOAD_PROPOSALS:
        # The proposal file is looked up at the dataset's position in DATASETS.TEST.
        position = list(cfg.DATASETS.TEST).index(dataset_name)
        proposal_files = [cfg.DATASETS.PROPOSAL_FILES_TEST[position]]
    else:
        proposal_files = None

    records = combine_detection_dataset_dicts(
        [dataset_name],
        keep_instance_predicate=keep_predicate,
        proposal_files=proposal_files,
    )

    if mapper is None:
        mapper = DatasetMapper(cfg, False)
    mapped = MapDataset(DatasetFromList(records), mapper)

    # One image per batch: the standard setting when reporting inference time.
    index_sampler = samplers.InferenceSampler(len(mapped))
    batches = torch.utils.data.sampler.BatchSampler(index_sampler, 1, drop_last=False)
    return torch.utils.data.DataLoader(
        mapped,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
        batch_sampler=batches,
        collate_fn=trivial_batch_collator,
    )
def build_weighted_detection_train_loader(cfg: CfgNode, mapper=None):
    """
    Build a training loader in which each dataset is sampled with its own repeat factor.

    Every dataset named in ``cfg.DATASETS.TRAIN`` is loaded separately; each of
    its images gets the dataset-level repeat factor returned by
    ``get_train_datasets_repeat_factors(cfg)``. The per-image factors drive a
    ``RepeatFactorTrainingSampler``.

    Args:
        cfg (CfgNode): the config. Reads ``DATASETS.TRAIN``,
            ``DATASETS.TRAIN_REPEAT_FACTOR``, the ``DATALOADER`` options,
            ``SOLVER.IMS_PER_BATCH`` and the keypoint / proposal model options.
        mapper (callable): maps one dataset dict to the model input format.
            Defaults to ``DatasetMapper(cfg, True)``.

    Returns:
        The loader produced by ``build_batch_data_loader``.
    """
    dataset_repeat_factors = get_train_datasets_repeat_factors(cfg)

    # OrderedDict to guarantee order of values() consistent with repeat factors
    dataset_name_to_dicts = OrderedDict()
    for idx, name in enumerate(cfg.DATASETS.TRAIN):
        dataset_name_to_dicts[name] = get_detection_dataset_dicts(
            [name],
            filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
            min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
            if cfg.MODEL.KEYPOINT_ON
            else 0,
            # Each dataset is loaded on its own, so hand over only the proposal
            # file at the same position instead of the whole list.
            proposal_files=[cfg.DATASETS.PROPOSAL_FILES_TRAIN[idx]]
            if cfg.MODEL.LOAD_PROPOSALS
            else None,
        )

    # Repeat factor for every sample in the dataset
    repeat_factors = [
        [dataset_repeat_factors[dsname]] * len(dataset_name_to_dicts[dsname])
        for dsname in cfg.DATASETS.TRAIN
    ]
    repeat_factors = list(itertools.chain.from_iterable(repeat_factors))

    dataset_dicts = list(itertools.chain.from_iterable(dataset_name_to_dicts.values()))
    dataset = DatasetFromList(dataset_dicts, copy=False)
    if mapper is None:
        mapper = DatasetMapper(cfg, True)
    dataset = MapDataset(dataset, mapper)

    logger.info(
        "Using WeightedTrainingSampler with repeat_factors={}".format(
            cfg.DATASETS.TRAIN_REPEAT_FACTOR
        )
    )
    sampler = RepeatFactorTrainingSampler(torch.tensor(repeat_factors))

    return build_batch_data_loader(
        dataset,
        sampler,
        cfg.SOLVER.IMS_PER_BATCH,
        aspect_ratio_grouping=cfg.DATALOADER.ASPECT_RATIO_GROUPING,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
    )
def build_detection_train_loader(cfg):
    """
    Build the data loader for the baseline trainer, optionally on a data subset.

    Most of the code follows ``d2.data.build.build_detection_train_loader()``;
    the additions are a check that the configured sampler is supported and a
    call to ``check_subsample_dataset`` to subsample the loaded dicts.

    Args:
        cfg: config node. Reads ``DATASETS.TRAIN``, the ``DATALOADER`` options,
            ``SOLVER.IMS_PER_BATCH`` and the keypoint / proposal model options.

    Returns:
        The loader produced by ``build_batch_data_loader``.

    Raises:
        AssertionError: if ``cfg.DATALOADER.SAMPLER_TRAIN`` is not
            ``"TrainingSampler"``.
    """
    # CSD: only the plain training sampler is supported.
    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    assert sampler_name == "TrainingSampler", "Unsupported training sampler: {}".format(
        cfg.DATALOADER.SAMPLER_TRAIN)

    keypoint_floor = (
        cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE if cfg.MODEL.KEYPOINT_ON else 0
    )
    train_proposals = (
        cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None
    )
    records = get_detection_dataset_dicts(
        cfg.DATASETS.TRAIN,
        filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
        min_keypoints=keypoint_floor,
        proposal_files=train_proposals,
    )

    # CSD: subsample the dataset if needed.
    records = check_subsample_dataset(records, cfg)

    if comm.is_main_process():
        log = setup_logger(name=__name__)
        log.debug("Number of images in the dataset: {}".format(
            len(records)))

    _log_api_usage("dataset." + cfg.DATASETS.TRAIN[0])

    mapper = DatasetMapper(cfg, True)
    # NOTE(review): the sampler is constructed twice and this first instance is
    # discarded. Kept as-is in case construction has side effects; confirm and
    # remove.
    sampler = TrainingSampler(len(records))
    mapped = MapDataset(DatasetFromList(records, copy=False), mapper)
    sampler = TrainingSampler(len(mapped))
    assert isinstance(sampler, torch.utils.data.sampler.Sampler)

    return build_batch_data_loader(
        mapped,
        sampler,
        cfg.SOLVER.IMS_PER_BATCH,
        aspect_ratio_grouping=cfg.DATALOADER.ASPECT_RATIO_GROUPING,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
    )
def build_batch_test_loader(dataset, *, mapper, sampler=None, num_workers=0, batch_size=4):
    """
    Build an inference data loader that yields batches of ``batch_size`` samples.

    Similar to a batch-size-1 test loader, but several samples are grouped per
    batch. The last batch is kept even when it is smaller (``drop_last=False``).

    This interface is experimental.

    Args:
        dataset (list or torch.utils.data.Dataset): a list of dataset dicts,
            or a map-style pytorch dataset. A list is wrapped in
            ``DatasetFromList(dataset, copy=False)``.
        mapper (callable or None): a callable which takes a sample (dict) from
            dataset and returns the format to be consumed by the model. When it
            is None the dataset is used unmapped.
        sampler (torch.utils.data.sampler.Sampler or None): a sampler that
            produces indices to be applied on ``dataset``. Defaults to
            ``InferenceSampler(len(dataset))``.
        num_workers (int): number of parallel data loading workers
        batch_size (int): number of samples per batch. Defaults to 4, the value
            that was previously hard-coded.

    Returns:
        DataLoader: a torch DataLoader over the (mapped) dataset, batched by
        ``trivial_batch_collator``.

    Examples:
    ::
        data_loader = build_batch_test_loader(
            dataset_dicts, mapper=my_mapper, batch_size=8)
    """
    if isinstance(dataset, list):
        dataset = DatasetFromList(dataset, copy=False)
    if mapper is not None:
        dataset = MapDataset(dataset, mapper)
    if sampler is None:
        sampler = InferenceSampler(len(dataset))

    batch_sampler = torch.utils.data.sampler.BatchSampler(
        sampler, batch_size, drop_last=False
    )
    data_loader = torch.utils.data.DataLoader(
        dataset,
        num_workers=num_workers,
        batch_sampler=batch_sampler,
        collate_fn=trivial_batch_collator,
    )
    return data_loader
def build_detection_test_loader(cfg, dataset_name, mapper=None):
    """
    Build an inference loader whose mini-batch size can be greater than 1.

    Modeled on detectron2's own ``build_detection_test_loader``, except that
    each batch holds ``cfg.SOLVER.IMS_PER_BATCH`` samples; the last, possibly
    smaller, batch is kept.

    Args:
        cfg: a detectron2 CfgNode
        dataset_name (str): a name of the dataset that's available in the DatasetCatalog
        mapper (callable): a callable which takes a sample (dict) from dataset
            and returns the format to be consumed by the model.
            By default it will be ``DatasetMapper(cfg, False)``.

    Returns:
        DataLoader: a torch DataLoader over the mapped dataset, batched by
        ``trivial_batch_collator``.
    """
    proposal_files = None
    if cfg.MODEL.LOAD_PROPOSALS:
        # The proposal file is looked up at the dataset's position in DATASETS.TEST.
        test_names = list(cfg.DATASETS.TEST)
        proposal_files = [cfg.DATASETS.PROPOSAL_FILES_TEST[test_names.index(dataset_name)]]

    records = get_detection_dataset_dicts(
        [dataset_name],
        filter_empty=False,
        proposal_files=proposal_files,
    )

    base = DatasetFromList(records)
    if mapper is None:
        mapper = DatasetMapper(cfg, False)
    mapped = MapDataset(base, mapper)

    index_sampler = samplers.InferenceSampler(len(mapped))
    batches = torch.utils.data.sampler.BatchSampler(
        index_sampler, cfg.SOLVER.IMS_PER_BATCH, drop_last=False
    )
    return torch.utils.data.DataLoader(
        mapped,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
        batch_sampler=batches,
        collate_fn=trivial_batch_collator,
    )
def build_players_loader(json_file, image_root, batch_size):
    """
    Build a batch-size-1 loader over a COCO-format json file, mapped by ``PlayerMapper``.

    Args:
        json_file: passed to ``load_coco_json`` as the annotation file.
        image_root: passed to ``load_coco_json`` as the image root.
        batch_size (int): used as the DataLoader's ``num_workers``; the batch
            size itself is fixed to 1.

    Returns:
        torch.utils.data.DataLoader: loader over the mapped dataset, batched
        by ``trivial_batch_collator``.
    """
    records = load_coco_json(json_file, image_root)
    mapped = MapDataset(DatasetFromList(records), PlayerMapper())

    # One image per batch: the standard setting when reporting inference time.
    index_sampler = samplers.InferenceSampler(len(mapped))
    batches = torch.utils.data.sampler.BatchSampler(index_sampler, 1, drop_last=False)

    # NOTE(review): `batch_size` sets the number of worker processes rather
    # than the batch size; this looks unintended. Confirm with callers before
    # changing it.
    return torch.utils.data.DataLoader(
        mapped,
        num_workers=batch_size,
        batch_sampler=batches,
        collate_fn=trivial_batch_collator,
    )
def build_train_loader(cfg, mapper=None):
    """
    Build the training data loader with a dataset-specific default mapper.

    Supported values of ``cfg.DATALOADER.SAMPLER_TRAIN`` are
    ``"TrainingSampler"``, ``"RepeatFactorTrainingSampler"`` (factors from
    category frequency) and ``"RatioFactorTrainingSampler"`` (factors from
    ``repeat_factors_from_ratios``).

    Args:
        cfg: config node. Reads ``DATASETS.TRAIN``, the ``DATALOADER`` options,
            ``SOLVER.IMS_PER_BATCH`` and the keypoint / proposal model options.
        mapper (callable): maps one dataset dict to the model input format.
            Defaults to the mapper class that ``get_dataset_mapper`` returns
            for the first training dataset, instantiated with ``(cfg, True)``.

    Returns:
        The loader produced by ``build_batch_data_loader``.

    Raises:
        ValueError: if ``cfg.DATALOADER.SAMPLER_TRAIN`` is not a known sampler.
    """
    if mapper is None:
        mapper_cls = get_dataset_mapper(cfg.DATASETS.TRAIN[0])
        mapper = mapper_cls(cfg, True)

    keypoint_floor = (
        cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE if cfg.MODEL.KEYPOINT_ON else 0
    )
    train_proposals = (
        cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None
    )
    records = get_detection_dataset_dicts(
        cfg.DATASETS.TRAIN,
        filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
        min_keypoints=keypoint_floor,
        proposal_files=train_proposals,
    )
    mapped = MapDataset(DatasetFromList(records, copy=False), mapper)

    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    logging.getLogger(__name__).info("Using training sampler {}".format(sampler_name))

    if sampler_name == "TrainingSampler":
        sampler = TrainingSampler(len(mapped))
    elif sampler_name == "RepeatFactorTrainingSampler":
        factors = RepeatFactorTrainingSampler.repeat_factors_from_category_frequency(
            records, cfg.DATALOADER.REPEAT_THRESHOLD)
        sampler = RepeatFactorTrainingSampler(factors)
    elif sampler_name == "RatioFactorTrainingSampler":
        factors = repeat_factors_from_ratios(records)
        sampler = RepeatFactorTrainingSampler(factors)
    else:
        raise ValueError("Unknown training sampler: {}".format(sampler_name))

    return build_batch_data_loader(
        mapped,
        sampler,
        cfg.SOLVER.IMS_PER_BATCH,
        aspect_ratio_grouping=cfg.DATALOADER.ASPECT_RATIO_GROUPING,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
    )
def build_classification_test_loader(cfg, dataset_name, mapper=None):
    """
    Build a batch-size-1 inference loader for one classification dataset.

    Args:
        cfg: a detectron2 CfgNode. Reads ``DATALOADER.NUM_WORKERS``.
        dataset_name (str): a name of the dataset that's available in the DatasetCatalog
        mapper (callable): a callable which takes a sample (dict) from dataset
            and returns the format to be consumed by the model.
            By default it will be ``DatasetMapper(cfg, False)``.

    Returns:
        DataLoader: a torch DataLoader over the mapped dataset, batched by
        ``trivial_batch_collator``.
    """
    records = get_classification_dataset_dicts([dataset_name])

    if mapper is None:
        mapper = DatasetMapper(cfg, False)
    mapped = MapDataset(DatasetFromList(records), mapper)

    # One image per batch: the standard setting when reporting inference time.
    index_sampler = samplers.InferenceSampler(len(mapped))
    batches = torch.utils.data.sampler.BatchSampler(index_sampler, 1, drop_last=False)
    return torch.utils.data.DataLoader(
        mapped,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
        batch_sampler=batches,
        collate_fn=trivial_batch_collator,
    )
def build_detection_train_loader_drop_ids(cfg, drop_image_ids, mapper=None):
    """
    Build the training data loader while dropping images with the given ids.

    A rewrite of ``detectron2.data.build.build_detection_train_loader`` that
    removes every dataset dict whose ``image_id`` is in ``drop_image_ids``
    before the dataset and the sampler are built.

    Args:
        cfg: config node. Reads ``SOLVER.IMS_PER_BATCH``, ``DATASETS.TRAIN``,
            the ``DATALOADER`` options and the keypoint / proposal model options.
        drop_image_ids: container of image ids to exclude; tested with ``in``.
        mapper (callable): maps one dataset dict to the model input format.
            Defaults to ``DatasetMapper(cfg, True)``.

    Returns:
        an infinite iterator of training data

    Raises:
        AssertionError: if ``IMS_PER_BATCH`` is not divisible by, or is smaller
            than, the world size.
        ValueError: if ``cfg.DATALOADER.SAMPLER_TRAIN`` is not a known sampler.
    """
    num_workers = get_world_size()
    images_per_batch = cfg.SOLVER.IMS_PER_BATCH
    assert (
        images_per_batch % num_workers == 0
    ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({}).".format(
        images_per_batch, num_workers)
    assert (
        images_per_batch >= num_workers
    ), "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({}).".format(
        images_per_batch, num_workers)
    images_per_worker = images_per_batch // num_workers

    dataset_dicts = get_detection_dataset_dicts(
        cfg.DATASETS.TRAIN,
        filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
        min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
        if cfg.MODEL.KEYPOINT_ON
        else 0,
        proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None,
    )

    # Filter once and reuse the filtered list for both the dataset and the
    # sampler, so that sampler indices always refer to the filtered dataset.
    kept_dicts = [dd for dd in dataset_dicts if dd['image_id'] not in drop_image_ids]
    dataset = DatasetFromList(kept_dicts, copy=False)

    if mapper is None:
        mapper = DatasetMapper(cfg, True)
    dataset = MapDataset(dataset, mapper)

    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    logger = logging.getLogger(__name__)
    logger.info("Using training sampler {}".format(sampler_name))
    if sampler_name == "TrainingSampler":
        sampler = samplers.TrainingSampler(len(dataset))
    elif sampler_name == "RepeatFactorTrainingSampler":
        # Built from the filtered dicts: using the unfiltered list would yield
        # indices that do not match (or exceed) the filtered dataset.
        sampler = samplers.RepeatFactorTrainingSampler(
            kept_dicts, cfg.DATALOADER.REPEAT_THRESHOLD)
    else:
        raise ValueError("Unknown training sampler: {}".format(sampler_name))

    if cfg.DATALOADER.ASPECT_RATIO_GROUPING:
        data_loader = torch.utils.data.DataLoader(
            dataset,
            sampler=sampler,
            num_workers=cfg.DATALOADER.NUM_WORKERS,
            batch_sampler=None,
            # don't batch, but yield individual elements
            collate_fn=operator.itemgetter(0),
            worker_init_fn=worker_init_reset_seed,
        )  # yield individual mapped dict
        data_loader = AspectRatioGroupedDataset(data_loader, images_per_worker)
    else:
        # drop_last so the batch always have the same size
        batch_sampler = torch.utils.data.sampler.BatchSampler(
            sampler, images_per_worker, drop_last=True)
        data_loader = torch.utils.data.DataLoader(
            dataset,
            num_workers=cfg.DATALOADER.NUM_WORKERS,
            batch_sampler=batch_sampler,
            collate_fn=trivial_batch_collator,
            worker_init_fn=worker_init_reset_seed,
        )

    return data_loader
def my_build_detection_train_loader(cfg,
                                    mapper=None,
                                    isShuffleData=True,
                                    curriculum_fraction=0):
    """
    Build the training data loader, optionally sampling from only a fraction
    of the dataset.

    A data loader is created by the following steps:

    1. Use the dataset names in config to query :class:`DatasetCatalog`, and
       obtain a list of dicts.
    2. Start workers to work on the dicts. Each worker will:

       * Map each metadata dict into another format to be consumed by the model.
       * Batch them by simply putting dicts into a list.

    The batched ``list[mapped_dict]`` is what this dataloader will return.

    Args:
        cfg (CfgNode): the config
        mapper (callable): a callable which takes a sample (dict) from dataset
            and returns the format to be consumed by the model.
            By default it will be `DatasetMapper(cfg, True)`.
        isShuffleData (bool): forwarded as ``shuffle`` to ``TrainingSampler``.
            Not used by ``RepeatFactorTrainingSampler``.
        curriculum_fraction (float): ``0`` (default) samples from the whole
            dataset. Otherwise it must lie in ``(0, 1]`` and the
            ``TrainingSampler`` is created with size
            ``round(len(dataset) * curriculum_fraction)`` (at least 1).
            Presumably this restricts sampling to the first entries of the
            dataset list; confirm against ``TrainingSampler``.
            Not used by ``RepeatFactorTrainingSampler``.

    Returns:
        an infinite iterator of training data

    Raises:
        ValueError: if the sampler name is unknown, or if a non-zero
            ``curriculum_fraction`` is outside ``(0, 1]``.
    """
    num_workers = get_world_size()
    images_per_batch = cfg.SOLVER.IMS_PER_BATCH
    assert (
        images_per_batch % num_workers == 0
    ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({}).".format(
        images_per_batch, num_workers)
    assert (
        images_per_batch >= num_workers
    ), "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({}).".format(
        images_per_batch, num_workers)
    images_per_worker = images_per_batch // num_workers

    dataset_dicts = get_detection_dataset_dicts(
        cfg.DATASETS.TRAIN,
        filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
        min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
        if cfg.MODEL.KEYPOINT_ON else 0,
        proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN
        if cfg.MODEL.LOAD_PROPOSALS else None,
    )
    dataset = DatasetFromList(dataset_dicts, copy=False)

    if mapper is None:
        mapper = DatasetMapper(cfg, True)
    dataset = MapDataset(dataset, mapper)

    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    logger = logging.getLogger(__name__)
    logger.info("Using training sampler {}".format(sampler_name))
    if sampler_name == "TrainingSampler":
        if curriculum_fraction == 0:
            # Default: use the whole dataset.
            sampler_size = len(dataset)
        else:
            # A fraction outside (0, 1] would give a sampler size that is
            # negative or larger than the dataset, so reject it up front.
            if not 0 < curriculum_fraction <= 1:
                raise ValueError(
                    "curriculum_fraction must be 0 or in (0, 1], got {}".
                    format(curriculum_fraction))
            # Very small fractions can round down to 0; keep at least one
            # sample so the sampler is never empty.
            sampler_size = max(
                1, int(round(len(dataset) * curriculum_fraction)))
        sampler = samplers.TrainingSampler(sampler_size,
                                           shuffle=isShuffleData)
    elif sampler_name == "RepeatFactorTrainingSampler":
        sampler = samplers.RepeatFactorTrainingSampler(
            dataset_dicts, cfg.DATALOADER.REPEAT_THRESHOLD)
    else:
        raise ValueError("Unknown training sampler: {}".format(sampler_name))

    if cfg.DATALOADER.ASPECT_RATIO_GROUPING:
        data_loader = torch.utils.data.DataLoader(
            dataset,
            sampler=sampler,
            num_workers=cfg.DATALOADER.NUM_WORKERS,
            batch_sampler=None,
            collate_fn=operator.itemgetter(
                0),  # don't batch, but yield individual elements
            worker_init_fn=worker_init_reset_seed,
        )  # yield individual mapped dict
        data_loader = AspectRatioGroupedDataset(data_loader, images_per_worker)
    else:
        batch_sampler = torch.utils.data.sampler.BatchSampler(
            sampler, images_per_worker, drop_last=True)
        # drop_last so the batch always have the same size
        data_loader = torch.utils.data.DataLoader(
            dataset,
            num_workers=cfg.DATALOADER.NUM_WORKERS,
            batch_sampler=batch_sampler,
            collate_fn=trivial_batch_collator,
            worker_init_fn=worker_init_reset_seed,
        )

    return data_loader
def build_detection_train_loader(cfg: CfgNode, mapper=None):
    """
    Build a training data loader over datasets combined from the config.

    Similar to the Detectron2 loader, except that the dataset dicts come from
    ``combine_detection_dataset_dicts`` after category whitelists and category
    maps have been added to the metadata, which allows combining datasets
    with different but compatible object category sets.

    Steps:

    1. Query the dataset names listed in ``cfg.DATASETS.TRAIN`` and obtain a
       list of dicts.
    2. Wrap the list so that each dict is passed through ``mapper``.
    3. Pick a sampler from ``cfg.DATALOADER.SAMPLER_TRAIN``.
    4. Batch either by aspect-ratio grouping or with a fixed-size batch
       sampler; each batch is a ``list[mapped_dict]``.

    Args:
        cfg (CfgNode): the config
        mapper (callable): takes a sample (dict) from the dataset and returns
            the format consumed by the model. Defaults to
            `DatasetMapper(cfg, True)`.

    Returns:
        an infinite iterator of training data
    """
    images_per_worker = _compute_num_images_per_worker(cfg)

    _add_category_whitelists_to_metadata(cfg)
    _add_category_maps_to_metadata(cfg)

    train_names = cfg.DATASETS.TRAIN
    keep_instance = _get_train_keep_instance_predicate(cfg)
    if cfg.MODEL.LOAD_PROPOSALS:
        proposal_files = cfg.DATASETS.PROPOSAL_FILES_TRAIN
    else:
        proposal_files = None
    dataset_dicts = combine_detection_dataset_dicts(
        train_names,
        keep_instance_predicate=keep_instance,
        proposal_files=proposal_files,
    )

    raw_dataset = DatasetFromList(dataset_dicts, copy=False)
    if mapper is None:
        mapper = DatasetMapper(cfg, True)
    dataset = MapDataset(raw_dataset, mapper)

    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    logging.getLogger(__name__).info(
        "Using training sampler {}".format(sampler_name))
    if sampler_name == "RepeatFactorTrainingSampler":
        sampler = samplers.RepeatFactorTrainingSampler(
            dataset_dicts, cfg.DATALOADER.REPEAT_THRESHOLD)
    elif sampler_name == "TrainingSampler":
        sampler = samplers.TrainingSampler(len(dataset))
    else:
        raise ValueError("Unknown training sampler: {}".format(sampler_name))

    if not cfg.DATALOADER.ASPECT_RATIO_GROUPING:
        # Fixed-size batches; the last incomplete batch is dropped so every
        # batch has the same size.
        fixed_batches = torch.utils.data.sampler.BatchSampler(
            sampler, images_per_worker, drop_last=True)
        return torch.utils.data.DataLoader(
            dataset,
            num_workers=cfg.DATALOADER.NUM_WORKERS,
            batch_sampler=fixed_batches,
            collate_fn=trivial_batch_collator,
            worker_init_fn=worker_init_reset_seed,
        )

    # Yield individual mapped dicts (no batching here); grouping into batches
    # is done by AspectRatioGroupedDataset.
    single_item_loader = torch.utils.data.DataLoader(
        dataset,
        sampler=sampler,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
        batch_sampler=None,
        collate_fn=operator.itemgetter(0),
        worker_init_fn=worker_init_reset_seed,
    )
    return AspectRatioGroupedDataset(single_item_loader, images_per_worker)
def build_detection_train_loader(cfg, mapper=None):
    """
    Build the training data loader for the spatio-temporal setup.

    ``SOLVER.IMS_PER_BATCH`` must be 1; the effective per-batch frame count is
    taken from ``cfg.MODEL.SPATIOTEMPORAL.NUM_FRAMES`` (or
    ``2 * NUM_FRAMES + 1`` when ``FORWARD_AGGREGATION`` is enabled) and is
    passed to ``get_detection_dataset_dicts``, so that frames from different
    videos are not loaded in the same batch. The returned loader does no
    batching of its own: it yields individual mapped dicts.

    Args:
        cfg (CfgNode): the config
        mapper (callable): takes a sample (dict) from the dataset and returns
            the format consumed by the model. Defaults to
            `DatasetMapper(cfg, True)`.

    Returns:
        an infinite iterator of training data
    """
    world_size = get_world_size()
    images_per_batch = cfg.SOLVER.IMS_PER_BATCH

    num_frames = cfg.MODEL.SPATIOTEMPORAL.NUM_FRAMES
    if cfg.MODEL.SPATIOTEMPORAL.FORWARD_AGGREGATION:
        # Window is (f_{t-NUM_FRAMES}, ..., f_t, ..., f_{t+NUM_FRAMES}).
        num_frames = 2 * num_frames + 1

    assert images_per_batch == 1, (
        "SOLVER.IMS_PER_BATCH ({}) must be 1. Actual batch size in spatio-temporal dataset must be set to num_frames({})."
        .format(images_per_batch, num_frames))

    # From here on the "batch size" being validated is the frame count.
    images_per_batch = num_frames
    assert images_per_batch % world_size == 0, (
        "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({})."
        .format(images_per_batch, world_size))
    assert images_per_batch >= world_size, (
        "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({})."
        .format(images_per_batch, world_size))
    # Kept from the original; the value is not used further in this function.
    images_per_worker = images_per_batch // world_size

    if cfg.MODEL.LOAD_PROPOSALS:
        proposal_files = cfg.DATASETS.PROPOSAL_FILES_TRAIN
    else:
        proposal_files = None
    dataset_dicts = get_detection_dataset_dicts(
        cfg,
        cfg.DATASETS.TRAIN,
        num_frames,
        proposal_files=proposal_files,
        long_term=cfg.MODEL.SPATIOTEMPORAL.LONG_TERM)

    frame_dataset = DatasetFromList(dataset_dicts, copy=False)
    if mapper is None:
        mapper = DatasetMapper(cfg, True)
    frame_dataset = MapDataset(frame_dataset, mapper)

    frame_sampler = samplers.TrainingSampler(len(frame_dataset), shuffle=True)
    # Don't batch: the collate function unwraps each element, so the loader
    # yields individual mapped dicts.
    return torch.utils.data.DataLoader(
        frame_dataset,
        sampler=frame_sampler,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
        batch_sampler=None,
        collate_fn=operator.itemgetter(0),
        worker_init_fn=worker_init_reset_seed,
    )
def build_classification_train_loader(cfg, mapper=None):
    """
    Build the training data loader for classification datasets.

    Steps:

    1. Load the dicts of ``cfg.DATASETS.TRAIN`` through
       ``get_classification_dataset_dicts``.
    2. Map every dict with ``mapper``.
    3. Pick a sampler from ``cfg.DATALOADER.SAMPLER_TRAIN``.
    4. Batch with a fixed-size batch sampler; each batch is a
       ``list[mapped_dict]``.

    Args:
        cfg (CfgNode): the config
        mapper (callable): takes a sample (dict) from the dataset and returns
            the format consumed by the model. Defaults to
            `ClassificationDatasetMapper(cfg, True)`.

    Returns:
        an infinite iterator of training data
    """
    world_size = get_world_size()
    images_per_batch = cfg.SOLVER.IMS_PER_BATCH
    assert images_per_batch % world_size == 0, (
        "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({})."
        .format(images_per_batch, world_size))
    assert images_per_batch >= world_size, (
        "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({})."
        .format(images_per_batch, world_size))
    images_per_worker = images_per_batch // world_size

    dataset_dicts = get_classification_dataset_dicts(cfg.DATASETS.TRAIN)
    dataset = DatasetFromList(dataset_dicts, copy=False)
    if mapper is None:
        mapper = ClassificationDatasetMapper(cfg, True)
    dataset = MapDataset(dataset, mapper)

    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    logging.getLogger(__name__).info(
        "Using training sampler {}".format(sampler_name))
    if sampler_name == "RepeatFactorTrainingSampler":
        sampler = samplers.RepeatFactorTrainingSampler(
            dataset_dicts, cfg.DATALOADER.REPEAT_THRESHOLD)
    elif sampler_name == "TrainingSampler":
        sampler = samplers.TrainingSampler(len(dataset))
    else:
        raise ValueError("Unknown training sampler: {}".format(sampler_name))

    # Aspect-ratio grouping is not used by this loader; batches always come
    # from a plain BatchSampler. drop_last keeps every batch the same size.
    fixed_batches = torch.utils.data.sampler.BatchSampler(
        sampler, images_per_worker, drop_last=True)
    return torch.utils.data.DataLoader(
        dataset,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
        batch_sampler=fixed_batches,
        collate_fn=trivial_batch_collator,
        worker_init_fn=worker_init_reset_seed,
    )
def build_detection_train_loader(cfg, mapper=None):
    """
    Build the training data loader using ``build_batch_data_sampler``.

    Steps:

    1. Load the dicts of ``cfg.DATASETS.TRAIN`` from the dataset catalog.
    2. Compute the height/width aspect ratio of every entry.
    3. Map every dict with ``mapper``.
    4. Pick a sampler from ``cfg.DATALOADER.SAMPLER_TRAIN``
       (``TrainingSampler``, ``RepeatFactorTrainingSampler`` or
       ``InferenceSampler``).
    5. Build the batch sampler from the sampler, the per-worker batch size,
       the bin edges and the aspect ratios; each batch is a
       ``list[mapped_dict]``.

    Args:
        cfg (CfgNode): the config
        mapper (callable): takes a sample (dict) from the dataset and returns
            the format consumed by the model. Defaults to
            `DatasetMapper(cfg, True)`.

    Returns:
        a torch DataLoader object
    """
    world_size = get_world_size()
    images_per_batch = cfg.SOLVER.IMS_PER_BATCH
    assert images_per_batch % world_size == 0, (
        "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({})."
        .format(images_per_batch, world_size))
    assert images_per_batch >= world_size, (
        "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({})."
        .format(images_per_batch, world_size))
    images_per_worker = images_per_batch // world_size

    if cfg.MODEL.KEYPOINT_ON:
        min_keypoints = cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
    else:
        min_keypoints = 0
    if cfg.MODEL.LOAD_PROPOSALS:
        proposal_files = cfg.DATASETS.PROPOSAL_FILES_TRAIN
    else:
        proposal_files = None
    dataset_dicts = get_detection_dataset_dicts(
        cfg.DATASETS.TRAIN,
        filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
        min_keypoints=min_keypoints,
        proposal_files=proposal_files,
    )
    dataset = DatasetFromList(dataset_dicts, copy=False)

    # With aspect-ratio grouping enabled there is a single bin edge at
    # height / width = 1, i.e. two bins; otherwise there are no edges.
    if cfg.DATALOADER.ASPECT_RATIO_GROUPING:
        group_bin_edges = [1]
    else:
        group_bin_edges = []
    aspect_ratios = []
    for img in dataset:
        aspect_ratios.append(float(img["height"]) / float(img["width"]))

    if mapper is None:
        mapper = DatasetMapper(cfg, True)
    dataset = MapDataset(dataset, mapper)

    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    logging.getLogger(__name__).info(
        "Using training sampler {}".format(sampler_name))
    if sampler_name == "RepeatFactorTrainingSampler":
        sampler = samplers.RepeatFactorTrainingSampler(
            dataset_dicts, cfg.DATALOADER.REPEAT_THRESHOLD)
    elif sampler_name == "TrainingSampler":
        sampler = samplers.TrainingSampler(len(dataset))
    elif sampler_name == "InferenceSampler":
        sampler = samplers.InferenceSampler(len(dataset))
    else:
        raise ValueError("Unknown training sampler: {}".format(sampler_name))

    grouped_batches = build_batch_data_sampler(sampler, images_per_worker,
                                               group_bin_edges, aspect_ratios)
    return torch.utils.data.DataLoader(
        dataset,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
        batch_sampler=grouped_batches,
        collate_fn=trivial_batch_collator,
        worker_init_fn=worker_init_reset_seed,
    )
def build_detection_semisup_train_loader_two_crops(cfg, mapper=None):
    """
    Build the semi-supervised training loader over a labeled and an
    unlabeled dataset.

    When ``cfg.DATASETS.CROSS_DATASET`` is set, the labeled and unlabeled
    dicts are loaded from ``cfg.DATASETS.TRAIN_LABEL`` and
    ``cfg.DATASETS.TRAIN_UNLABEL`` (the unlabeled set is loaded with
    ``filter_empty=False``). Otherwise ``cfg.DATASETS.TRAIN`` is loaded once
    and split by ``divide_label_unlabel`` according to the supervision
    percentage and random-seed settings in ``cfg.DATALOADER``.

    Args:
        cfg (CfgNode): the config
        mapper (callable): applied to both datasets. Defaults to
            `DatasetMapper(cfg, True)`.

    Returns:
        whatever ``build_semisup_batch_data_loader_two_crop`` returns for the
        two datasets and samplers.

    Raises:
        NotImplementedError: for ``RepeatFactorTrainingSampler``.
        ValueError: for any other unknown sampler name.
    """

    def _load_dicts(names, filter_empty):
        # Shared loading call; only the dataset names and the empty-filter
        # flag differ between the call sites below.
        return get_detection_dataset_dicts(
            names,
            filter_empty=filter_empty,
            min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
            if cfg.MODEL.KEYPOINT_ON else 0,
            proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN
            if cfg.MODEL.LOAD_PROPOSALS else None,
        )

    if cfg.DATASETS.CROSS_DATASET:
        # Cross-dataset (e.g., coco-additional).
        label_dicts = _load_dicts(cfg.DATASETS.TRAIN_LABEL,
                                  cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS)
        unlabel_dicts = _load_dicts(cfg.DATASETS.TRAIN_UNLABEL, False)
    else:
        # Different degree of supervision (e.g., COCO-supervision): split one
        # dataset into labeled and unlabeled parts.
        all_dicts = _load_dicts(cfg.DATASETS.TRAIN,
                                cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS)
        label_dicts, unlabel_dicts = divide_label_unlabel(
            all_dicts,
            cfg.DATALOADER.SUP_PERCENT,
            cfg.DATALOADER.RANDOM_DATA_SEED,
            cfg.DATALOADER.RANDOM_DATA_SEED_PATH,
        )

    label_dataset = DatasetFromList(label_dicts, copy=False)
    # The unlabeled dataset is built from the unlabeled dicts only, so the
    # labeled set is excluded from it.
    unlabel_dataset = DatasetFromList(unlabel_dicts, copy=False)

    if mapper is None:
        mapper = DatasetMapper(cfg, True)
    label_dataset = MapDataset(label_dataset, mapper)
    unlabel_dataset = MapDataset(unlabel_dataset, mapper)

    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    logging.getLogger(__name__).info(
        "Using training sampler {}".format(sampler_name))
    if sampler_name == "RepeatFactorTrainingSampler":
        raise NotImplementedError("{} not yet supported.".format(sampler_name))
    if sampler_name != "TrainingSampler":
        raise ValueError("Unknown training sampler: {}".format(sampler_name))
    label_sampler = TrainingSampler(len(label_dataset))
    unlabel_sampler = TrainingSampler(len(unlabel_dataset))

    return build_semisup_batch_data_loader_two_crop(
        (label_dataset, unlabel_dataset),
        (label_sampler, unlabel_sampler),
        cfg.SOLVER.IMG_PER_BATCH_LABEL,
        cfg.SOLVER.IMG_PER_BATCH_UNLABEL,
        aspect_ratio_grouping=cfg.DATALOADER.ASPECT_RATIO_GROUPING,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
    )
def _classifier_split_record_by_category(record):
    """Return ``{category_id: copy_of_record}`` where each copy keeps only
    the annotations of that category."""
    per_label_record = {}
    for ann in record['annotations']:
        category_id = ann['category_id']
        if category_id in per_label_record:
            per_label_record[category_id]['annotations'].append(ann)
        else:
            # First annotation of this category: start a fresh copy of the
            # record that carries only this annotation.
            record_copy = copy.deepcopy(record)
            record_copy['annotations'] = [ann]
            per_label_record[category_id] = record_copy
    return per_label_record


def _classifier_choice_with_fallback(population, size):
    """Draw ``size`` items without replacement, falling back to drawing with
    replacement when ``np.random.choice`` rejects the request."""
    try:
        return np.random.choice(population, size=size, replace=False)
    except ValueError:
        # Raised e.g. when ``size`` exceeds the population size.
        return np.random.choice(population, size=size, replace=True)


def build_classification_train_loader(cfg, mapper=None, multiplier=1):
    """
    Build the classifier training loader, optionally on a per-class sample.

    When ``cfg.DATASETS.WEAK_CLASSIFIER_SAMPLE_NUM`` is positive, the datasets
    in ``cfg.DATASETS.CLASSIFIER_TRAIN`` are split into one record per
    (image, category), sampled per class according to the
    ``BASE_MULTIPLIER`` / ``NOVEL_MULTIPLER`` / ``OVER_SAMPLE`` /
    ``SAMPLE_WITH_REPLACEMENT`` settings, and registered in
    ``DatasetCatalog`` under ``"classifier_train_sampled"``. Otherwise the
    configured datasets are used directly.

    Side effects in the sampling path: seeds NumPy's global RNG with
    ``cfg.DATASETS.SAMPLE_SEED``, prints progress messages, and registers a
    dataset plus metadata under a fixed name.
    NOTE(review): registering under a fixed name presumably fails if this
    function runs twice in one process; confirm against ``DatasetCatalog``.

    Args:
        cfg (CfgNode): the config
        mapper (callable): takes a sample (dict) from the dataset and returns
            the format consumed by the model. Defaults to
            `DatasetMapper(cfg, True)`.
        multiplier: scales the per-worker batch size
            (``int(images_per_worker * multiplier)``).

    Returns:
        an infinite iterator of training data

    Raises:
        ValueError: if the sampler name is unknown, or if with-replacement
            sampling also fails (for example on an empty class).
    """
    num_workers = get_world_size()
    images_per_batch = cfg.SOLVER.IMS_PER_BATCH
    sample_num = cfg.DATASETS.WEAK_CLASSIFIER_SAMPLE_NUM
    assert (
        images_per_batch % num_workers == 0
    ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({}).".format(
        images_per_batch, num_workers)
    assert (
        images_per_batch >= num_workers
    ), "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({}).".format(
        images_per_batch, num_workers)
    images_per_worker = images_per_batch // num_workers
    images_per_worker = int(images_per_worker * multiplier)

    if sample_num > 0:
        np.random.seed(cfg.DATASETS.SAMPLE_SEED)
        print("Setting sampling seed:", cfg.DATASETS.SAMPLE_SEED)

        dataset_names = cfg.DATASETS.CLASSIFIER_TRAIN
        if isinstance(dataset_names, str):
            dataset_names = [dataset_names]
        dataset_dicts = [
            DatasetCatalog.get(dataset_name) for dataset_name in dataset_names
        ]
        dataset_dicts = list(itertools.chain.from_iterable(dataset_dicts))

        # Group per-category records by class id.
        label_to_annotation_dict = {
            e: []
            for e in range(cfg.MODEL.ROI_HEADS.NUM_CLASSES)
        }
        for e in dataset_dicts:
            per_label_record = _classifier_split_record_by_category(e)
            for key, record in per_label_record.items():
                label_to_annotation_dict[key].append(record)

        label_to_annotation_dict_sampled = {}
        for id_class, ann_list in label_to_annotation_dict.items():
            if id_class in cfg.DATASETS.FEWSHOT.BASE_CLASSES_ID:
                # Base class.
                if not cfg.DATASETS.OVER_SAMPLE:
                    if cfg.DATASETS.BASE_MULTIPLIER > 0:
                        ann_list_sampled = _classifier_choice_with_fallback(
                            ann_list,
                            int(sample_num * cfg.DATASETS.BASE_MULTIPLIER))
                    else:
                        ann_list_sampled = ann_list
                else:
                    print("BASE OVER SAMPLING")
                    ann_list_sampled = ann_list
                label_to_annotation_dict_sampled[id_class] = ann_list_sampled
            else:
                # Novel class.
                if not cfg.DATASETS.OVER_SAMPLE:
                    # NOTE(review): this branch is gated on BASE_MULTIPLIER
                    # even for novel classes; confirm this is intended.
                    if cfg.DATASETS.BASE_MULTIPLIER > 0:
                        ann_list_sampled = _classifier_choice_with_fallback(
                            ann_list, sample_num)
                        if cfg.DATASETS.NOVEL_MULTIPLER > 0:
                            ann_list_sampled = np.repeat(
                                ann_list_sampled,
                                cfg.DATASETS.NOVEL_MULTIPLER)
                    else:
                        ann_list_sampled = []
                else:
                    try:
                        ann_list_sampled_temp = np.random.choice(
                            ann_list, size=sample_num, replace=False)
                        if not cfg.DATASETS.SAMPLE_WITH_REPLACEMENT:
                            print("OVER SAMPLING")
                            ann_list_sampled = np.random.choice(
                                ann_list_sampled_temp,
                                size=len(ann_list),
                                replace=True)
                        else:
                            # The few-shot subset is drawn a second time
                            # here, as in the original; kept so the RNG
                            # stream is unchanged.
                            ann_list_sampled_temp = np.random.choice(
                                ann_list, size=sample_num, replace=False)
                            num_repeat = len(ann_list) // len(
                                ann_list_sampled_temp)
                            num_remainder = len(ann_list) % len(
                                ann_list_sampled_temp)
                            ann_list_sampled = np.repeat(
                                ann_list_sampled_temp, num_repeat)
                            if num_remainder > 0:
                                ann_list_sampled = np.hstack(
                                    (ann_list_sampled,
                                     np.random.choice(ann_list_sampled_temp,
                                                      size=num_remainder,
                                                      replace=True)))
                            print("OVER SAMPLING FIXED NEW",
                                  len(ann_list_sampled_temp),
                                  len(ann_list_sampled))
                    except ValueError:
                        # The class cannot supply ``sample_num`` distinct
                        # records; use everything it has.
                        ann_list_sampled = ann_list
                label_to_annotation_dict_sampled[id_class] = ann_list_sampled

        dataset_dicts = []
        for sampled in label_to_annotation_dict_sampled.values():
            dataset_dicts.extend(sampled)

        # The lambda closes over the local name ``dataset_dicts``, which is
        # rebound below after ``get_detection_dataset_dicts`` returns.
        DatasetCatalog.register("classifier_train_sampled",
                                lambda: dataset_dicts)
        MetadataCatalog.get("classifier_train_sampled").set(
            thing_classes=MetadataCatalog.get(dataset_names[0]).thing_classes,
            evaluator_type='pascal_voc')
        dataset_name = ('classifier_train_sampled', )
    else:
        dataset_name = cfg.DATASETS.CLASSIFIER_TRAIN

    dataset_dicts = get_detection_dataset_dicts(
        dataset_name,
        filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
        min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
        if cfg.MODEL.KEYPOINT_ON else 0,
        proposal_files=cfg.DATASETS.PROPOSAL_FILES_CLASSIFIER_TRAIN
        if cfg.MODEL.LOAD_PROPOSALS else None)
    dataset = DatasetFromList(dataset_dicts, copy=False)

    if mapper is None:
        mapper = DatasetMapper(cfg, True)
    dataset = MapDataset(dataset, mapper)

    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    logger = logging.getLogger(__name__)
    logger.info("Using training sampler {}".format(sampler_name))
    if sampler_name == "TrainingSampler":
        sampler = samplers.TrainingSampler(len(dataset))
    elif sampler_name == "RepeatFactorTrainingSampler":
        sampler = samplers.RepeatFactorTrainingSampler(
            dataset_dicts, cfg.DATALOADER.REPEAT_THRESHOLD)
    else:
        raise ValueError("Unknown training sampler: {}".format(sampler_name))

    if cfg.DATALOADER.ASPECT_RATIO_GROUPING:
        data_loader = torch.utils.data.DataLoader(
            dataset,
            sampler=sampler,
            num_workers=cfg.DATALOADER.NUM_WORKERS,
            batch_sampler=None,
            collate_fn=operator.itemgetter(
                0),  # don't batch, but yield individual elements
            worker_init_fn=worker_init_reset_seed,
        )  # yield individual mapped dict
        data_loader = AspectRatioGroupedDataset(data_loader, images_per_worker)
    else:
        batch_sampler = torch.utils.data.sampler.BatchSampler(
            sampler, images_per_worker, drop_last=True)
        # drop_last so the batch always have the same size
        data_loader = torch.utils.data.DataLoader(
            dataset,
            num_workers=cfg.DATALOADER.NUM_WORKERS,
            batch_sampler=batch_sampler,
            collate_fn=trivial_batch_collator,
            worker_init_fn=worker_init_reset_seed,
        )

    return data_loader
def build_classification_train_loader(cfg, mapper=None):
    """
    Build the training data loader for classification datasets.

    Steps:

    1. Load the dicts of ``cfg.DATASETS.TRAIN`` through
       ``get_classification_dataset_dicts``.
    2. Map every dict with ``mapper``.
    3. Sample with ``TrainingSampler``. The configured sampler name is
       logged, but this function always uses ``TrainingSampler``.
    4. Batch either by aspect-ratio grouping or with a fixed-size batch
       sampler; each batch is a ``list[mapped_dict]``.

    Args:
        cfg (CfgNode): the config
        mapper (callable): takes a sample (dict) from the dataset and returns
            the format consumed by the model. Defaults to
            `DatasetMapper(cfg, True)`.

    Returns:
        an infinite iterator of training data
    """
    world_size = get_world_size()
    images_per_batch = cfg.SOLVER.IMS_PER_BATCH
    assert images_per_batch % world_size == 0, (
        "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({})."
        .format(images_per_batch, world_size))
    assert images_per_batch >= world_size, (
        "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({})."
        .format(images_per_batch, world_size))
    images_per_worker = images_per_batch // world_size

    dataset_dicts = get_classification_dataset_dicts(cfg.DATASETS.TRAIN)
    dataset = DatasetFromList(dataset_dicts, copy=False)
    if mapper is None:
        mapper = DatasetMapper(cfg, True)
    dataset = MapDataset(dataset, mapper)

    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    logging.getLogger(__name__).info(
        "Using training sampler {}".format(sampler_name))
    sampler = samplers.TrainingSampler(len(dataset))

    if not cfg.DATALOADER.ASPECT_RATIO_GROUPING:
        # Fixed-size batches; drop_last keeps every batch the same size.
        fixed_batches = torch.utils.data.sampler.BatchSampler(
            sampler, images_per_worker, drop_last=True)
        return torch.utils.data.DataLoader(
            dataset,
            num_workers=cfg.DATALOADER.NUM_WORKERS,
            batch_sampler=fixed_batches,
            collate_fn=trivial_batch_collator,
            worker_init_fn=worker_init_reset_seed,
        )

    # Yield individual mapped dicts (no batching here); grouping into batches
    # is done by AspectRatioGroupedDataset.
    single_item_loader = torch.utils.data.DataLoader(
        dataset,
        sampler=sampler,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
        batch_sampler=None,
        collate_fn=operator.itemgetter(0),
        worker_init_fn=worker_init_reset_seed,
    )
    return AspectRatioGroupedDataset(single_item_loader, images_per_worker)
def build_detection_query_loader(cfg,
                                 dataset_names_tuple,
                                 dataset_proposal_files_tuple,
                                 mapper=None,
                                 is_train=True):
    """
    Build the query-set data loader.

    Modified from ``detectron2.data.build_detection_train_loader``.

    Args:
        cfg (CfgNode): the config
        dataset_names_tuple: dataset names to load. They are passed
            explicitly because the meta-setup provides them through
            different variables and the config is not modified here.
        dataset_proposal_files_tuple: proposal files, used only when
            ``cfg.MODEL.LOAD_PROPOSALS`` is set.
        mapper (callable): takes a sample (dict) from the dataset and returns
            the format consumed by the model. Defaults to
            `DatasetMapper(cfg, is_train=is_train)`.
        is_train (bool): when true, entries are duplicated according to their
            annotations via ``duplicate_data_acc_to_annotation_categories``,
            so an image with five annotations appears in the dataset five
            times with different annotations.

    Returns:
        the data loader (aspect-ratio grouped or fixed-size batched).

    Raises:
        ValueError: if ``cfg.DATALOADER.SAMPLER_TRAIN`` names an unknown
            sampler.
    """
    world_size = get_world_size()
    images_per_batch = cfg.SOLVER.IMS_PER_BATCH
    assert images_per_batch % world_size == 0, (
        "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({})."
        .format(images_per_batch, world_size))
    assert images_per_batch >= world_size, (
        "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({})."
        .format(images_per_batch, world_size))
    images_per_worker = images_per_batch // world_size

    if cfg.MODEL.KEYPOINT_ON:
        min_keypoints = cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
    else:
        min_keypoints = 0
    if cfg.MODEL.LOAD_PROPOSALS:
        proposal_files = dataset_proposal_files_tuple
    else:
        proposal_files = None
    dataset_dicts = get_detection_dataset_dicts(
        dataset_names_tuple,
        filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
        min_keypoints=min_keypoints,
        proposal_files=proposal_files,
    )

    logger = logging.getLogger(__name__)

    if is_train:
        # Training: split annotations class-wise by duplicating entries.
        print(
            "Query dataset num instances before annotation-wise duplication: {}"
            .format(len(dataset_dicts)))
        dataset_dicts = duplicate_data_acc_to_annotation_categories(
            dataset_dicts)
        print(
            "Query dataset num instances after annotation-wise duplication: {}"
            .format(len(dataset_dicts)))

    dataset = DatasetFromList(dataset_dicts, copy=False)
    if mapper is None:
        mapper = DatasetMapper(cfg, is_train=is_train)
    dataset = MapDataset(dataset, mapper)

    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    logger.info("Using training sampler {}".format(sampler_name))
    if sampler_name == "RepeatFactorTrainingSampler":
        sampler = samplers.RepeatFactorTrainingSampler(
            dataset_dicts, cfg.DATALOADER.REPEAT_THRESHOLD)
    elif sampler_name == "TrainingSampler":
        sampler = samplers.TrainingSampler(len(dataset))
    else:
        raise ValueError("Unknown training sampler: {}".format(sampler_name))

    if not cfg.DATALOADER.ASPECT_RATIO_GROUPING:
        # Fixed-size batches; drop_last keeps every batch the same size.
        fixed_batches = torch.utils.data.sampler.BatchSampler(
            sampler, images_per_worker, drop_last=True)
        return torch.utils.data.DataLoader(
            dataset,
            num_workers=cfg.DATALOADER.NUM_WORKERS,
            batch_sampler=fixed_batches,
            collate_fn=trivial_batch_collator,
            worker_init_fn=worker_init_reset_seed,
        )

    # Yield individual mapped dicts (no batching here); grouping into batches
    # is done by AspectRatioGroupedDataset.
    single_item_loader = torch.utils.data.DataLoader(
        dataset,
        sampler=sampler,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
        batch_sampler=None,
        collate_fn=operator.itemgetter(0),
        worker_init_fn=worker_init_reset_seed,
    )
    return AspectRatioGroupedDataset(single_item_loader, images_per_worker)