def build_detection_train_loader_with_attributes(cfg, mapper=None):
    num_workers = get_world_size()
    images_per_batch = cfg.SOLVER.IMS_PER_BATCH
    assert (
        images_per_batch % num_workers == 0
    ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({}).".format(
        images_per_batch, num_workers)
    assert (
        images_per_batch >= num_workers
    ), "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({}).".format(
        images_per_batch, num_workers)
    images_per_worker = images_per_batch // num_workers

    dataset_dicts = get_detection_dataset_dicts(
        cfg.DATASETS.TRAIN,
        filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
        min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
        if cfg.MODEL.KEYPOINT_ON else 0,
        proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN
        if cfg.MODEL.LOAD_PROPOSALS else None,
    )
    dataset = DatasetFromList(dataset_dicts, copy=False)

    if mapper is None:
        mapper = AttributeDatasetMapper(cfg, True)
    dataset = MapDataset(dataset, mapper)

    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    logger = logging.getLogger(__name__)
    logger.info("Using training sampler {}".format(sampler_name))
    if sampler_name == "TrainingSampler":
        sampler = samplers.TrainingSampler(len(dataset))
    elif sampler_name == "RepeatFactorTrainingSampler":
        sampler = samplers.RepeatFactorTrainingSampler(
            dataset_dicts, cfg.DATALOADER.REPEAT_THRESHOLD)
    else:
        raise ValueError("Unknown training sampler: {}".format(sampler_name))

    if cfg.DATALOADER.ASPECT_RATIO_GROUPING:
        data_loader = torch.utils.data.DataLoader(
            dataset,
            sampler=sampler,
            num_workers=cfg.DATALOADER.NUM_WORKERS,
            batch_sampler=None,
            collate_fn=operator.itemgetter(0),  # don't batch, but yield individual elements
            worker_init_fn=worker_init_reset_seed,
        )  # yield individual mapped dict
        data_loader = AspectRatioGroupedDataset(data_loader, images_per_worker)
    else:
        batch_sampler = torch.utils.data.sampler.BatchSampler(
            sampler, images_per_worker,
            drop_last=True)  # drop_last so the batch always has the same size
        data_loader = torch.utils.data.DataLoader(
            dataset,
            num_workers=cfg.DATALOADER.NUM_WORKERS,
            batch_sampler=batch_sampler,
            collate_fn=trivial_batch_collator,
            worker_init_fn=worker_init_reset_seed,
        )
    return data_loader

def my_build_batch_data_loader(dataset,
                               sampler,
                               total_batch_size,
                               *,
                               aspect_ratio_grouping=False,
                               num_workers=0):
    """Build a batched dataloader for training.

    Args:
        dataset (torch.utils.data.Dataset): map-style PyTorch dataset. Can be indexed.
        sampler (torch.utils.data.sampler.Sampler): a sampler that produces indices
        total_batch_size, aspect_ratio_grouping, num_workers: see
            :func:`build_detection_train_loader`.

    Returns:
        iterable[list]. Length of each list is the batch size of the current
        GPU. Each element in the list comes from the dataset.
    """
    world_size = comm.get_world_size()
    assert (
        total_batch_size > 0 and total_batch_size % world_size == 0
    ), "Total batch size ({}) must be divisible by the number of gpus ({}).".format(
        total_batch_size, world_size)
    batch_size = total_batch_size // world_size

    # Horovod: limit # of CPU threads to be used per worker.
    if num_workers > 0:
        torch.set_num_threads(num_workers)
    kwargs = {"num_workers": num_workers}
    # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent
    # issues with Infiniband implementations that are not fork-safe
    # https://github.com/horovod/horovod/blob/master/examples/pytorch/pytorch_imagenet_resnet50.py
    # if (num_workers > 0 and hasattr(mp, '_supports_context') and
    #         mp._supports_context and 'forkserver' in mp.get_all_start_methods()):
    #     kwargs['multiprocessing_context'] = 'forkserver'

    if aspect_ratio_grouping:
        data_loader = torch.utils.data.DataLoader(
            dataset,
            sampler=sampler,
            batch_sampler=None,
            collate_fn=operator.itemgetter(0),  # don't batch, but yield individual elements
            worker_init_fn=worker_init_reset_seed,
            **kwargs,
        )  # yield individual mapped dict
        return AspectRatioGroupedDataset(data_loader, batch_size)
    else:
        batch_sampler = torch.utils.data.sampler.BatchSampler(
            sampler, batch_size,
            drop_last=True)  # drop_last so the batch always has the same size
        return torch.utils.data.DataLoader(
            dataset,
            batch_sampler=batch_sampler,
            collate_fn=trivial_batch_collator,
            worker_init_fn=worker_init_reset_seed,
            **kwargs,
        )

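# Illustrative sketch (not part of the original code): the commented-out
# 'forkserver' block in my_build_batch_data_loader can be expressed through
# PyTorch's `multiprocessing_context` DataLoader argument. The helper below is
# hypothetical and only shows how such kwargs could be assembled and passed.
def _example_forkserver_loader(dataset, sampler, num_workers=4):
    import multiprocessing as mp

    kwargs = {"num_workers": num_workers}
    # Prefer 'forkserver' over 'fork' when available, to avoid fork-unsafe
    # libraries (e.g. some Infiniband stacks) inside dataloader workers.
    if num_workers > 0 and "forkserver" in mp.get_all_start_methods():
        kwargs["multiprocessing_context"] = "forkserver"
    return torch.utils.data.DataLoader(dataset, sampler=sampler, **kwargs)
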
def build_batch_data_loader(  # type: ignore[no-untyped-def]
    dataset,
    sampler,
    total_batch_size: int,
    *,
    aspect_ratio_grouping: bool = False,
    num_workers: int = 0,
    drop_last: bool = True,
) -> Union[torch.utils.data.DataLoader, AspectRatioGroupedDataset]:
    """
    Build a batched dataloader for training.

    Modified from detectron2 to expose the `drop_last` option.

    Args:
        dataset (torch.utils.data.Dataset): map-style PyTorch dataset. Can be indexed.
        sampler (torch.utils.data.sampler.Sampler): a sampler that produces indices
        total_batch_size, aspect_ratio_grouping, num_workers: see
            :func:`build_detection_train_loader`.
        drop_last (bool): when aspect ratio grouping is disabled, whether to
            drop the last incomplete batch (passed to ``BatchSampler``).

    Returns:
        iterable[list]. Length of each list is the batch size of the current
        GPU. Each element in the list comes from the dataset.
    """
    world_size = get_world_size()
    assert (
        total_batch_size > 0 and total_batch_size % world_size == 0
    ), "Total batch size ({}) must be divisible by the number of gpus ({}).".format(
        total_batch_size, world_size)
    batch_size = total_batch_size // world_size

    if aspect_ratio_grouping:
        data_loader = torch.utils.data.DataLoader(
            dataset,
            sampler=sampler,
            num_workers=num_workers,
            batch_sampler=None,
            collate_fn=operator.itemgetter(0),  # don't batch, but yield individual elements
            worker_init_fn=worker_init_reset_seed,
        )  # yield individual mapped dict
        return AspectRatioGroupedDataset(data_loader, batch_size)
    else:
        batch_sampler = torch.utils.data.sampler.BatchSampler(
            sampler, batch_size,
            drop_last=drop_last)  # srnet: expose drop_last to caller
        return torch.utils.data.DataLoader(
            dataset,
            num_workers=num_workers,
            batch_sampler=batch_sampler,
            collate_fn=trivial_batch_collator,
            worker_init_fn=worker_init_reset_seed,
        )

def build_batch_data_loader(dataset,
                            sampler,
                            total_batch_size,
                            *,
                            aspect_ratio_grouping=False,
                            num_workers=0):
    """
    Build a batched dataloader for training.

    Args:
        dataset (torch.utils.data.Dataset): map-style PyTorch dataset. Can be indexed.
        sampler (torch.utils.data.sampler.Sampler): a sampler that produces indices
        total_batch_size (int): total batch size across GPUs.
        aspect_ratio_grouping (bool): whether to group images with similar
            aspect ratio for efficiency. When enabled, it requires each
            element in dataset be a dict with keys "width" and "height".
        num_workers (int): number of parallel data loading workers

    Returns:
        iterable[list]. Length of each list is the batch size of the current
        GPU. Each element in the list comes from the dataset.
    """
    world_size = get_world_size()
    assert (
        total_batch_size > 0 and total_batch_size % world_size == 0
    ), "Total batch size ({}) must be divisible by the number of gpus ({}).".format(
        total_batch_size, world_size)
    batch_size = total_batch_size // world_size

    if aspect_ratio_grouping:
        data_loader = torch.utils.data.DataLoader(
            dataset,
            sampler=sampler,
            num_workers=num_workers,
            batch_sampler=None,
            collate_fn=operator.itemgetter(0),  # don't batch, but yield individual elements
            worker_init_fn=worker_init_reset_seed,
        )  # yield individual mapped dict
        return AspectRatioGroupedDataset(data_loader, batch_size)
    else:
        batch_sampler = torch.utils.data.sampler.BatchSampler(
            sampler, batch_size,
            drop_last=True)  # drop_last so the batch always has the same size
        return torch.utils.data.DataLoader(
            dataset,
            num_workers=num_workers,
            batch_sampler=batch_sampler,
            collate_fn=trivial_batch_collator,
            worker_init_fn=worker_init_reset_seed,
        )

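# Illustrative usage sketch (assumed, not from the original source): feeds a
# small in-memory list of dicts through build_batch_data_loader above. The
# sampler comes from detectron2's TrainingSampler; any Sampler that yields
# indices would work.
def _example_build_batch_data_loader():
    from detectron2.data.samplers import TrainingSampler

    # With aspect_ratio_grouping=True, each element must carry "width"/"height".
    dataset = [{"width": w, "height": h, "idx": i}
               for i, (w, h) in enumerate([(640, 480), (480, 640)] * 8)]
    sampler = TrainingSampler(len(dataset), shuffle=True)
    loader = build_batch_data_loader(
        dataset,
        sampler,
        total_batch_size=4,  # split across all GPUs / processes
        aspect_ratio_grouping=True,
        num_workers=0,
    )
    return loader  # infinite iterable of list[dict]
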
def test_reiter_leak(self):
    data = [(1, 0), (0, 1), (1, 0), (0, 1)]
    data = [{"width": a, "height": b} for (a, b) in data]
    batchsize = 2
    dataset = AspectRatioGroupedDataset(data, batchsize)

    for _ in range(5):
        for idx, __ in enumerate(dataset):
            if idx == 1:
                # manually break, so the iterator does not stop by itself
                break
        # check that bucket sizes are valid
        for bucket in dataset._buckets:
            self.assertLess(len(bucket), batchsize)

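# Companion sketch (assumed, not part of the original tests): the property the
# buckets enforce is that a yielded batch never mixes orientations. Written as
# a plain function with asserts, independent of the unittest class above.
def _example_aspect_ratio_grouping():
    data = [{"width": w, "height": h}
            for w, h in [(2, 1), (1, 2), (2, 1), (1, 2), (2, 1), (1, 2)]]
    grouped = AspectRatioGroupedDataset(data, 2)
    for batch in grouped:
        orientations = {d["width"] > d["height"] for d in batch}
        assert len(orientations) == 1  # all-landscape or all-portrait
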
def build_batch_data_loader(dataset,
                            sampler,
                            total_batch_size,
                            *,
                            aspect_ratio_grouping=False,
                            num_workers=0):
    """
    Build a batched dataloader for training.

    Args:
        dataset (torch.utils.data.Dataset): map-style PyTorch dataset. Can be indexed.
        sampler (torch.utils.data.sampler.Sampler): a sampler that produces indices
        total_batch_size, aspect_ratio_grouping, num_workers: see
            :func:`build_detection_train_loader`.

    Returns:
        iterable[list]. Length of each list is the batch size of the current
        GPU. Each element in the list comes from the dataset.
    """
    world_size = get_world_size()
    assert (
        total_batch_size > 0 and total_batch_size % world_size == 0
    ), "Total batch size ({}) must be divisible by the number of gpus ({}).".format(
        total_batch_size, world_size)
    batch_size = total_batch_size // world_size

    if aspect_ratio_grouping:
        data_loader = torch.utils.data.DataLoader(
            dataset,
            sampler=sampler,
            num_workers=num_workers,
            batch_sampler=None,
            collate_fn=operator.itemgetter(0),  # don't batch, but yield individual elements
            worker_init_fn=worker_init_reset_seed,
        )  # yield individual mapped dict
        return AspectRatioGroupedDataset(data_loader, batch_size)
    else:
        batch_sampler = torch.utils.data.sampler.BatchSampler(
            sampler, batch_size,
            drop_last=True)  # drop_last so the batch always has the same size
        return torch.utils.data.DataLoader(
            dataset,
            num_workers=num_workers,
            batch_sampler=batch_sampler,
            collate_fn=trivial_batch_collator,
            worker_init_fn=worker_init_reset_seed,
        )

def build_detection_train_loader(cfg: CfgNode, mapper=None):
    """
    A data loader is created in a way similar to that of Detectron2.
    The main differences are:
    - it allows combining datasets with different but compatible object
      category sets

    The data loader is created by the following steps:
    1. Use the dataset names in config to query :class:`DatasetCatalog`, and
       obtain a list of dicts.
    2. Start workers to work on the dicts. Each worker will:
       * Map each metadata dict into another format to be consumed by the model.
       * Batch them by simply putting dicts into a list.
    The batched ``list[mapped_dict]`` is what this dataloader will return.

    Args:
        cfg (CfgNode): the config
        mapper (callable): a callable which takes a sample (dict) from dataset
            and returns the format to be consumed by the model.
            By default it will be `DatasetMapper(cfg, True)`.

    Returns:
        an infinite iterator of training data
    """
    images_per_worker = _compute_num_images_per_worker(cfg)

    _add_category_whitelists_to_metadata(cfg)
    _add_category_maps_to_metadata(cfg)
    dataset_dicts = combine_detection_dataset_dicts(
        cfg.DATASETS.TRAIN,
        keep_instance_predicate=_get_train_keep_instance_predicate(cfg),
        proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN
        if cfg.MODEL.LOAD_PROPOSALS else None,
    )
    dataset = DatasetFromList(dataset_dicts, copy=False)

    if mapper is None:
        mapper = DatasetMapper(cfg, True)
    dataset = MapDataset(dataset, mapper)

    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    logger = logging.getLogger(__name__)
    logger.info("Using training sampler {}".format(sampler_name))
    if sampler_name == "TrainingSampler":
        sampler = samplers.TrainingSampler(len(dataset))
    elif sampler_name == "RepeatFactorTrainingSampler":
        sampler = samplers.RepeatFactorTrainingSampler(
            dataset_dicts, cfg.DATALOADER.REPEAT_THRESHOLD)
    else:
        raise ValueError("Unknown training sampler: {}".format(sampler_name))

    if cfg.DATALOADER.ASPECT_RATIO_GROUPING:
        data_loader = torch.utils.data.DataLoader(
            dataset,
            sampler=sampler,
            num_workers=cfg.DATALOADER.NUM_WORKERS,
            batch_sampler=None,
            collate_fn=operator.itemgetter(0),  # don't batch, but yield individual elements
            worker_init_fn=worker_init_reset_seed,
        )  # yield individual mapped dict
        data_loader = AspectRatioGroupedDataset(data_loader, images_per_worker)
    else:
        batch_sampler = torch.utils.data.sampler.BatchSampler(
            sampler, images_per_worker,
            drop_last=True)  # drop_last so the batch always has the same size
        data_loader = torch.utils.data.DataLoader(
            dataset,
            num_workers=cfg.DATALOADER.NUM_WORKERS,
            batch_sampler=batch_sampler,
            collate_fn=trivial_batch_collator,
            worker_init_fn=worker_init_reset_seed,
        )
    return data_loader

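# Illustrative usage sketch (assumed, not from the original source): wiring the
# loader above into a config-driven training setup. The dataset names and
# config values mentioned below are placeholders.
def _example_build_detection_train_loader(cfg):
    # e.g. cfg.DATASETS.TRAIN = ("my_dataset_train",) registered in DatasetCatalog,
    #      cfg.SOLVER.IMS_PER_BATCH = 16, cfg.DATALOADER.NUM_WORKERS = 4
    data_loader = build_detection_train_loader(cfg)
    batch = next(iter(data_loader))  # list[dict]: one mapped dict per image on this GPU
    return batch
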
def my_build_detection_train_loader(cfg,
                                    mapper=None,
                                    isShuffleData=True,
                                    curriculum_fraction=0):
    """
    A data loader is created by the following steps:
    1. Use the dataset names in config to query :class:`DatasetCatalog`, and
       obtain a list of dicts.
    2. Start workers to work on the dicts. Each worker will:
       * Map each metadata dict into another format to be consumed by the model.
       * Batch them by simply putting dicts into a list.
    The batched ``list[mapped_dict]`` is what this dataloader will return.

    Args:
        cfg (CfgNode): the config
        mapper (callable): a callable which takes a sample (dict) from dataset
            and returns the format to be consumed by the model.
            By default it will be `DatasetMapper(cfg, True)`.

    Returns:
        an infinite iterator of training data
    """
    num_workers = get_world_size()
    images_per_batch = cfg.SOLVER.IMS_PER_BATCH
    assert (
        images_per_batch % num_workers == 0
    ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({}).".format(
        images_per_batch, num_workers)
    assert (
        images_per_batch >= num_workers
    ), "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({}).".format(
        images_per_batch, num_workers)
    images_per_worker = images_per_batch // num_workers

    dataset_dicts = get_detection_dataset_dicts(
        cfg.DATASETS.TRAIN,
        filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
        min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
        if cfg.MODEL.KEYPOINT_ON else 0,
        proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN
        if cfg.MODEL.LOAD_PROPOSALS else None,
    )
    dataset = DatasetFromList(dataset_dicts, copy=False)

    if mapper is None:
        mapper = DatasetMapper(cfg, True)
    dataset = MapDataset(dataset, mapper)

    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    logger = logging.getLogger(__name__)
    logger.info("Using training sampler {}".format(sampler_name))
    if sampler_name == "TrainingSampler":
        # If the fraction is the default 0, use the whole dataset
        if curriculum_fraction == 0:
            sampler = samplers.TrainingSampler(len(dataset),
                                               shuffle=isShuffleData)
        # If the fraction is not 0, take that fraction of the dataset as a subset
        else:
            new_len = int(round(len(dataset) * curriculum_fraction))
            sampler = samplers.TrainingSampler(new_len, shuffle=isShuffleData)
    elif sampler_name == "RepeatFactorTrainingSampler":
        sampler = samplers.RepeatFactorTrainingSampler(
            dataset_dicts, cfg.DATALOADER.REPEAT_THRESHOLD)
    else:
        raise ValueError("Unknown training sampler: {}".format(sampler_name))

    if cfg.DATALOADER.ASPECT_RATIO_GROUPING:
        data_loader = torch.utils.data.DataLoader(
            dataset,
            sampler=sampler,
            num_workers=cfg.DATALOADER.NUM_WORKERS,
            batch_sampler=None,
            collate_fn=operator.itemgetter(0),  # don't batch, but yield individual elements
            worker_init_fn=worker_init_reset_seed,
        )  # yield individual mapped dict
        data_loader = AspectRatioGroupedDataset(data_loader, images_per_worker)
    else:
        batch_sampler = torch.utils.data.sampler.BatchSampler(
            sampler, images_per_worker,
            drop_last=True)  # drop_last so the batch always has the same size
        data_loader = torch.utils.data.DataLoader(
            dataset,
            num_workers=cfg.DATALOADER.NUM_WORKERS,
            batch_sampler=batch_sampler,
            collate_fn=trivial_batch_collator,
            worker_init_fn=worker_init_reset_seed,
        )
    return data_loader

def build_classification_train_loader(cfg, mapper=None, multiplier=1):
    """
    A data loader is created by the following steps:
    1. Use the dataset names in config to query :class:`DatasetCatalog`, and
       obtain a list of dicts.
    2. Start workers to work on the dicts. Each worker will:
       * Map each metadata dict into another format to be consumed by the model.
       * Batch them by simply putting dicts into a list.
    The batched ``list[mapped_dict]`` is what this dataloader will return.

    Args:
        cfg (CfgNode): the config
        mapper (callable): a callable which takes a sample (dict) from dataset
            and returns the format to be consumed by the model.
            By default it will be `DatasetMapper(cfg, True)`.

    Returns:
        an infinite iterator of training data
    """
    num_workers = get_world_size()
    images_per_batch = cfg.SOLVER.IMS_PER_BATCH
    sample_num = cfg.DATASETS.WEAK_CLASSIFIER_SAMPLE_NUM
    assert (
        images_per_batch % num_workers == 0
    ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({}).".format(
        images_per_batch, num_workers)
    assert (
        images_per_batch >= num_workers
    ), "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({}).".format(
        images_per_batch, num_workers)
    images_per_worker = images_per_batch // num_workers
    images_per_worker = int(images_per_worker * multiplier)

    if sample_num > 0:
        np.random.seed(cfg.DATASETS.SAMPLE_SEED)
        print("Setting sampling seed:", cfg.DATASETS.SAMPLE_SEED)
        dataset_names = cfg.DATASETS.CLASSIFIER_TRAIN
        if isinstance(dataset_names, str):
            dataset_names = [dataset_names]
        dataset_dicts = [
            DatasetCatalog.get(dataset_name) for dataset_name in dataset_names
        ]
        dataset_dicts = list(itertools.chain.from_iterable(dataset_dicts))

        # Group annotations class-wise: one record per (image, category).
        label_to_annotation_dict = {
            e: []
            for e in range(cfg.MODEL.ROI_HEADS.NUM_CLASSES)
        }
        for e in dataset_dicts:
            per_label_record = {}
            for ann in e['annotations']:
                if ann['category_id'] in per_label_record:
                    per_label_record[ann['category_id']]['annotations'].append(
                        ann)
                else:
                    record = copy.deepcopy(e)
                    # filter annotations
                    annotations_filtered = [ann]
                    record['annotations'] = annotations_filtered
                    per_label_record[ann['category_id']] = record
            for key in per_label_record.keys():
                label_to_annotation_dict[key].append(per_label_record[key])

        # Sample the per-class record lists according to the few-shot settings.
        label_to_annotation_dict_sampled = {}
        for id_class, ann_list in label_to_annotation_dict.items():
            if id_class in cfg.DATASETS.FEWSHOT.BASE_CLASSES_ID:
                if not cfg.DATASETS.OVER_SAMPLE:
                    if cfg.DATASETS.BASE_MULTIPLIER > 0:
                        try:
                            ann_list_sampled = np.random.choice(
                                ann_list,
                                size=int(sample_num *
                                         cfg.DATASETS.BASE_MULTIPLIER),
                                replace=False)
                        except:
                            ann_list_sampled = np.random.choice(
                                ann_list,
                                size=int(sample_num *
                                         cfg.DATASETS.BASE_MULTIPLIER),
                                replace=True)
                    else:
                        ann_list_sampled = ann_list
                else:
                    print("BASE OVER SAMPLING")
                    ann_list_sampled = ann_list
                label_to_annotation_dict_sampled[id_class] = ann_list_sampled
            else:
                if not cfg.DATASETS.OVER_SAMPLE:
                    if cfg.DATASETS.BASE_MULTIPLIER > 0:
                        try:
                            ann_list_sampled = np.random.choice(
                                ann_list, size=sample_num, replace=False)
                        except:
                            ann_list_sampled = np.random.choice(
                                ann_list, size=sample_num, replace=True)
                        if cfg.DATASETS.NOVEL_MULTIPLER > 0:
                            ann_list_sampled = np.repeat(
                                ann_list_sampled, cfg.DATASETS.NOVEL_MULTIPLER)
                    else:
                        ann_list_sampled = []
                else:
                    try:
                        ann_list_sampled_temp = np.random.choice(
                            ann_list, size=sample_num, replace=False)
                        if not cfg.DATASETS.SAMPLE_WITH_REPLACEMENT:
                            print("OVER SAMPLING")
                            ann_list_sampled = np.random.choice(
                                ann_list_sampled_temp,
                                size=len(ann_list),
                                replace=True)
                        else:
                            ann_list_sampled_temp = np.random.choice(
                                ann_list, size=sample_num, replace=False)
                            num_repeat = len(ann_list) // len(
                                ann_list_sampled_temp)
                            num_remainder = len(ann_list) % len(
                                ann_list_sampled_temp)
                            ann_list_sampled = np.repeat(
                                ann_list_sampled_temp, num_repeat)
                            if num_remainder > 0:
                                ann_list_sampled = np.hstack(
                                    (ann_list_sampled,
                                     np.random.choice(ann_list_sampled_temp,
                                                      size=num_remainder,
                                                      replace=True)))
                            print("OVER SAMPLING FIXED NEW",
                                  len(ann_list_sampled_temp),
                                  len(ann_list_sampled))
                    except:
                        ann_list_sampled = ann_list
                label_to_annotation_dict_sampled[id_class] = ann_list_sampled

        dataset_dicts = []
        for k, v in label_to_annotation_dict_sampled.items():
            dataset_dicts.extend(v)

        DatasetCatalog.register("classifier_train_sampled",
                                lambda: dataset_dicts)
        MetadataCatalog.get("classifier_train_sampled").set(
            thing_classes=MetadataCatalog.get(dataset_names[0]).thing_classes,
            evaluator_type='pascal_voc')
        dataset_name = ('classifier_train_sampled', )
        # print([(x['image_id'], len(x['annotations'])) for x in dataset_dicts[:50]])
        # print_instances_class_histogram_1(dataset_dicts, MetadataCatalog.get(dataset_names[0]).thing_classes)
    else:
        dataset_name = cfg.DATASETS.CLASSIFIER_TRAIN

    dataset_dicts = get_detection_dataset_dicts(
        dataset_name,
        filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
        min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
        if cfg.MODEL.KEYPOINT_ON else 0,
        proposal_files=cfg.DATASETS.PROPOSAL_FILES_CLASSIFIER_TRAIN
        if cfg.MODEL.LOAD_PROPOSALS else None)

    dataset = DatasetFromList(dataset_dicts, copy=False)
    # # filtering
    # dataset_filtered = []
    # for sample in dataset:
    #     e_class_ids = set([e['category_id'] for e in sample['annotations']])
    #     for e_class_ids_ in e_class_ids:
    #         if e_class_ids_ in cfg.DATASETS.FEWSHOT.NOVEL_CLASSES_ID:
    #             dataset_filtered.append(sample)
    #             break
    # dataset = dataset_filtered
    if mapper is None:
        mapper = DatasetMapper(cfg, True)
    dataset = MapDataset(dataset, mapper)

    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    logger = logging.getLogger(__name__)
    logger.info("Using training sampler {}".format(sampler_name))
    if sampler_name == "TrainingSampler":
        sampler = samplers.TrainingSampler(len(dataset))
    elif sampler_name == "RepeatFactorTrainingSampler":
        sampler = samplers.RepeatFactorTrainingSampler(
            dataset_dicts, cfg.DATALOADER.REPEAT_THRESHOLD)
    else:
        raise ValueError("Unknown training sampler: {}".format(sampler_name))

    if cfg.DATALOADER.ASPECT_RATIO_GROUPING:
        data_loader = torch.utils.data.DataLoader(
            dataset,
            sampler=sampler,
            num_workers=cfg.DATALOADER.NUM_WORKERS,
            batch_sampler=None,
            collate_fn=operator.itemgetter(0),  # don't batch, but yield individual elements
            worker_init_fn=worker_init_reset_seed,
        )  # yield individual mapped dict
        data_loader = AspectRatioGroupedDataset(data_loader, images_per_worker)
    else:
        batch_sampler = torch.utils.data.sampler.BatchSampler(
            sampler, images_per_worker,
            drop_last=True)  # drop_last so the batch always has the same size
        data_loader = torch.utils.data.DataLoader(
            dataset,
            num_workers=cfg.DATALOADER.NUM_WORKERS,
            batch_sampler=batch_sampler,
            collate_fn=trivial_batch_collator,
            worker_init_fn=worker_init_reset_seed,
        )
    return data_loader

def build_detection_query_loader(cfg,
                                 dataset_names_tuple,
                                 dataset_proposal_files_tuple,
                                 mapper=None,
                                 is_train=True):
    """
    Modified from detectron2.data.build_detection_train_loader.

    - `dataset_names_tuple`: needed because the dataset names are provided
      through different variables (meta-setup) and cfg cannot be modified
      (CfgNode is immutable).
    - `is_train`: will create duplicated entries according to annotations,
      so if an image contains five annotations, it will appear in the dataset
      five times with different annotations.
    """
    num_workers = get_world_size()
    images_per_batch = cfg.SOLVER.IMS_PER_BATCH
    assert (
        images_per_batch % num_workers == 0
    ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({}).".format(
        images_per_batch, num_workers)
    assert (
        images_per_batch >= num_workers
    ), "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({}).".format(
        images_per_batch, num_workers)
    images_per_worker = images_per_batch // num_workers

    dataset_dicts = get_detection_dataset_dicts(
        dataset_names_tuple,
        filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
        min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
        if cfg.MODEL.KEYPOINT_ON else 0,
        proposal_files=dataset_proposal_files_tuple
        if cfg.MODEL.LOAD_PROPOSALS else None,
    )
    logger = logging.getLogger(__name__)

    # Train: split annotations class-wise
    if is_train:
        print(
            "Query dataset num instances before annotation-wise duplication: {}"
            .format(len(dataset_dicts)))
        dataset_dicts = duplicate_data_acc_to_annotation_categories(
            dataset_dicts)
        print(
            "Query dataset num instances after annotation-wise duplication: {}"
            .format(len(dataset_dicts)))

    dataset = DatasetFromList(dataset_dicts, copy=False)
    if mapper is None:
        mapper = DatasetMapper(cfg, is_train=is_train)
    dataset = MapDataset(dataset, mapper)

    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    logger.info("Using training sampler {}".format(sampler_name))
    if sampler_name == "TrainingSampler":
        sampler = samplers.TrainingSampler(len(dataset))
    elif sampler_name == "RepeatFactorTrainingSampler":
        sampler = samplers.RepeatFactorTrainingSampler(
            dataset_dicts, cfg.DATALOADER.REPEAT_THRESHOLD)
    else:
        raise ValueError("Unknown training sampler: {}".format(sampler_name))

    if cfg.DATALOADER.ASPECT_RATIO_GROUPING:
        data_loader = torch.utils.data.DataLoader(
            dataset,
            sampler=sampler,
            num_workers=cfg.DATALOADER.NUM_WORKERS,
            batch_sampler=None,
            collate_fn=operator.itemgetter(0),  # don't batch, but yield individual elements
            worker_init_fn=worker_init_reset_seed,
        )  # yield individual mapped dict
        data_loader = AspectRatioGroupedDataset(data_loader, images_per_worker)
    else:
        batch_sampler = torch.utils.data.sampler.BatchSampler(
            sampler, images_per_worker,
            drop_last=True)  # drop_last so the batch always has the same size
        data_loader = torch.utils.data.DataLoader(
            dataset,
            num_workers=cfg.DATALOADER.NUM_WORKERS,
            batch_sampler=batch_sampler,
            collate_fn=trivial_batch_collator,
            worker_init_fn=worker_init_reset_seed,
        )
    return data_loader

def build_detection_train_loader_drop_ids(cfg, drop_image_ids, mapper=None):
    """
    A rewrite of detectron2.data.build.build_detection_train_loader that
    supports dropping images whose ids are listed in `drop_image_ids`.

    Returns:
        an infinite iterator of training data
    """
    num_workers = get_world_size()
    images_per_batch = cfg.SOLVER.IMS_PER_BATCH
    assert (
        images_per_batch % num_workers == 0
    ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({}).".format(
        images_per_batch, num_workers)
    assert (
        images_per_batch >= num_workers
    ), "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({}).".format(
        images_per_batch, num_workers)
    images_per_worker = images_per_batch // num_workers

    dataset_dicts = get_detection_dataset_dicts(
        cfg.DATASETS.TRAIN,
        filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
        min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
        if cfg.MODEL.KEYPOINT_ON else 0,
        proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN
        if cfg.MODEL.LOAD_PROPOSALS else None,
    )
    # Drop the requested image ids before wrapping the dicts in a dataset.
    dataset = DatasetFromList(
        [dd for dd in dataset_dicts if dd['image_id'] not in drop_image_ids],
        copy=False)

    if mapper is None:
        mapper = DatasetMapper(cfg, True)
    dataset = MapDataset(dataset, mapper)

    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    logger = logging.getLogger(__name__)
    logger.info("Using training sampler {}".format(sampler_name))
    if sampler_name == "TrainingSampler":
        sampler = samplers.TrainingSampler(len(dataset))
    elif sampler_name == "RepeatFactorTrainingSampler":
        sampler = samplers.RepeatFactorTrainingSampler(
            dataset_dicts, cfg.DATALOADER.REPEAT_THRESHOLD)
    else:
        raise ValueError("Unknown training sampler: {}".format(sampler_name))

    if cfg.DATALOADER.ASPECT_RATIO_GROUPING:
        data_loader = torch.utils.data.DataLoader(
            dataset,
            sampler=sampler,
            num_workers=cfg.DATALOADER.NUM_WORKERS,
            batch_sampler=None,
            collate_fn=operator.itemgetter(0),  # don't batch, but yield individual elements
            worker_init_fn=worker_init_reset_seed,
        )  # yield individual mapped dict
        data_loader = AspectRatioGroupedDataset(data_loader, images_per_worker)
    else:
        batch_sampler = torch.utils.data.sampler.BatchSampler(
            sampler, images_per_worker,
            drop_last=True)  # drop_last so the batch always has the same size
        data_loader = torch.utils.data.DataLoader(
            dataset,
            num_workers=cfg.DATALOADER.NUM_WORKERS,
            batch_sampler=batch_sampler,
            collate_fn=trivial_batch_collator,
            worker_init_fn=worker_init_reset_seed,
        )
    return data_loader

def build_classification_train_loader(cfg, mapper=None):
    """
    A data loader is created by the following steps:
    1. Use the dataset names in config to query :class:`DatasetCatalog`, and
       obtain a list of dicts.
    2. Coordinate a random shuffle order shared among all processes (all GPUs)
    3. Each process spawns another few workers to process the dicts. Each
       worker will:
       * Map each metadata dict into another format to be consumed by the model.
       * Batch them by simply putting dicts into a list.
    The batched ``list[mapped_dict]`` is what this dataloader will yield.

    Args:
        cfg (CfgNode): the config
        mapper (callable): a callable which takes a sample (dict) from dataset
            and returns the format to be consumed by the model.
            By default it will be `DatasetMapper(cfg, True)`.

    Returns:
        an infinite iterator of training data
    """
    num_workers = get_world_size()
    images_per_batch = cfg.SOLVER.IMS_PER_BATCH
    assert (
        images_per_batch % num_workers == 0
    ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({}).".format(
        images_per_batch, num_workers)
    assert (
        images_per_batch >= num_workers
    ), "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({}).".format(
        images_per_batch, num_workers)
    images_per_worker = images_per_batch // num_workers

    dataset_dicts = get_classification_dataset_dicts(cfg.DATASETS.TRAIN)
    dataset = DatasetFromList(dataset_dicts, copy=False)

    if mapper is None:
        mapper = DatasetMapper(cfg, True)
    dataset = MapDataset(dataset, mapper)

    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    logger = logging.getLogger(__name__)
    logger.info("Using training sampler {}".format(sampler_name))
    sampler = samplers.TrainingSampler(len(dataset))

    if cfg.DATALOADER.ASPECT_RATIO_GROUPING:
        data_loader = torch.utils.data.DataLoader(
            dataset,
            sampler=sampler,
            num_workers=cfg.DATALOADER.NUM_WORKERS,
            batch_sampler=None,
            collate_fn=operator.itemgetter(0),  # don't batch, but yield individual elements
            worker_init_fn=worker_init_reset_seed,
        )  # yield individual mapped dict
        data_loader = AspectRatioGroupedDataset(data_loader, images_per_worker)
    else:
        batch_sampler = torch.utils.data.sampler.BatchSampler(
            sampler, images_per_worker,
            drop_last=True)  # drop_last so the batch always has the same size
        data_loader = torch.utils.data.DataLoader(
            dataset,
            num_workers=cfg.DATALOADER.NUM_WORKERS,
            batch_sampler=batch_sampler,
            collate_fn=trivial_batch_collator,
            worker_init_fn=worker_init_reset_seed,
        )
    return data_loader

def build_classification_train_loader(cfg, mapper=None):
    """
    Build a classification data loader from cfg.

    Returns:
        list[dict]: Each dict contains:

        * image: Tensor, image in (C, H, W) format.
        * label (optional): int, groundtruth class
    """
    num_workers = get_world_size()
    images_per_batch = cfg.SOLVER.IMS_PER_BATCH
    assert (
        images_per_batch % num_workers == 0
    ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({}).".format(
        images_per_batch, num_workers)
    assert (
        images_per_batch >= num_workers
    ), "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({}).".format(
        images_per_batch, num_workers)
    images_per_worker = images_per_batch // num_workers

    dataset_dicts = get_classification_dataset_dicts(cfg.DATASETS.TRAIN)
    dataset = DatasetFromList(dataset_dicts, copy=False)

    if mapper is None:
        mapper = ClsDatasetMapper(cfg, True)
    dataset = MapDataset(dataset, mapper)

    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    logger = logging.getLogger(__name__)
    logger.info("Using training sampler {}".format(sampler_name))
    if sampler_name == "TrainingSampler":
        sampler = samplers.TrainingSampler(len(dataset))
    elif sampler_name == "RepeatFactorTrainingSampler":
        sampler = samplers.RepeatFactorTrainingSampler(
            dataset_dicts, cfg.DATALOADER.REPEAT_THRESHOLD)
    else:
        raise ValueError("Unknown training sampler: {}".format(sampler_name))

    if cfg.DATALOADER.ASPECT_RATIO_GROUPING:
        data_loader = torch.utils.data.DataLoader(
            dataset,
            sampler=sampler,
            num_workers=cfg.DATALOADER.NUM_WORKERS,
            batch_sampler=None,
            collate_fn=operator.itemgetter(0),  # don't batch, but yield individual elements
            worker_init_fn=worker_init_reset_seed,
        )  # yield individual mapped dict
        data_loader = AspectRatioGroupedDataset(data_loader, images_per_worker)
    else:
        batch_sampler = torch.utils.data.sampler.BatchSampler(
            sampler, images_per_worker,
            drop_last=True)  # drop_last so the batch always has the same size
        data_loader = torch.utils.data.DataLoader(
            dataset,
            num_workers=cfg.DATALOADER.NUM_WORKERS,
            batch_sampler=batch_sampler,
            collate_fn=trivial_batch_collator,
            worker_init_fn=worker_init_reset_seed,
        )
    return data_loader