예제 #1
0
    def test_training_sampler_seed(self):
        seed_all_rng(42)
        sampler = TrainingSampler(30)
        data = list(itertools.islice(sampler, 65))

        seed_all_rng(42)
        sampler = TrainingSampler(30)
        seed_all_rng(999)  # should be ineffective
        data2 = list(itertools.islice(sampler, 65))
        self.assertEqual(data, data2)
예제 #2
0
def build_inference_based_loader(
        cfg: CfgNode, dataset_cfg: CfgNode,
        model: torch.nn.Module) -> InferenceBasedLoader:
    """
    Constructs data loader based on inference results of a model.
    """
    dataset = build_bootstrap_dataset(dataset_cfg.DATASET,
                                      dataset_cfg.IMAGE_LOADER)
    training_sampler = TrainingSampler(len(dataset))
    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=dataset_cfg.IMAGE_LOADER.BATCH_SIZE,
        sampler=training_sampler,
        num_workers=dataset_cfg.IMAGE_LOADER.NUM_WORKERS,
        collate_fn=trivial_batch_collator,
    )
    return InferenceBasedLoader(
        model,
        data_loader=data_loader,
        data_sampler=build_data_sampler(dataset_cfg.DATA_SAMPLER),
        data_filter=build_data_filter(dataset_cfg.FILTER),
        shuffle=True,
        batch_size=dataset_cfg.INFERENCE.OUTPUT_BATCH_SIZE,
        inference_batch_size=dataset_cfg.INFERENCE.INPUT_BATCH_SIZE,
    )
예제 #3
0
def build_custom_train_loader(cfg, mapper=None):
    """
    Modified from detectron2.data.build.build_custom_train_loader, but supports
    different samplers
    """
    source_aware = cfg.DATALOADER.SOURCE_AWARE
    if source_aware:
        dataset_dicts = get_detection_dataset_dicts_with_source(
            cfg.DATASETS.TRAIN,
            filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
            min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
            if cfg.MODEL.KEYPOINT_ON else 0,
            proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN
            if cfg.MODEL.LOAD_PROPOSALS else None,
        )
        sizes = [0 for _ in range(len(cfg.DATASETS.TRAIN))]
        for d in dataset_dicts:
            sizes[d['dataset_source']] += 1
        print('dataset sizes', sizes)
    else:
        dataset_dicts = get_detection_dataset_dicts(
            cfg.DATASETS.TRAIN,
            filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
            min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
            if cfg.MODEL.KEYPOINT_ON else 0,
            proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN
            if cfg.MODEL.LOAD_PROPOSALS else None,
        )
    dataset = DatasetFromList(dataset_dicts, copy=False)

    if mapper is None:
        assert 0
        # mapper = DatasetMapper(cfg, True)
    dataset = MapDataset(dataset, mapper)

    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    logger = logging.getLogger(__name__)
    logger.info("Using training sampler {}".format(sampler_name))
    # TODO avoid if-else?
    if sampler_name == "TrainingSampler":
        sampler = TrainingSampler(len(dataset))
    elif sampler_name == "MultiDatasetSampler":
        assert source_aware
        sampler = MultiDatasetSampler(cfg, sizes, dataset_dicts)
    elif sampler_name == "RepeatFactorTrainingSampler":
        repeat_factors = RepeatFactorTrainingSampler.repeat_factors_from_category_frequency(
            dataset_dicts, cfg.DATALOADER.REPEAT_THRESHOLD)
        sampler = RepeatFactorTrainingSampler(repeat_factors)
    elif sampler_name == "ClassAwareSampler":
        sampler = ClassAwareSampler(dataset_dicts)
    else:
        raise ValueError("Unknown training sampler: {}".format(sampler_name))

    return build_batch_data_loader(
        dataset,
        sampler,
        cfg.SOLVER.IMS_PER_BATCH,
        aspect_ratio_grouping=cfg.DATALOADER.ASPECT_RATIO_GROUPING,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
    )
예제 #4
0
def build_inference_based_loader(
    cfg: CfgNode,
    dataset_cfg: CfgNode,
    model: torch.nn.Module,
    embedder: Optional[torch.nn.Module] = None,
) -> InferenceBasedLoader:
    """
    Constructs data loader based on inference results of a model.
    """
    dataset = build_bootstrap_dataset(dataset_cfg.DATASET, dataset_cfg.IMAGE_LOADER)
    meta = MetadataCatalog.get(dataset_cfg.DATASET)
    training_sampler = TrainingSampler(len(dataset))
    data_loader = torch.utils.data.DataLoader(
        dataset,  # pyre-ignore[6]
        batch_size=dataset_cfg.IMAGE_LOADER.BATCH_SIZE,
        sampler=training_sampler,
        num_workers=dataset_cfg.IMAGE_LOADER.NUM_WORKERS,
        collate_fn=trivial_batch_collator,
        worker_init_fn=worker_init_reset_seed,
    )
    return InferenceBasedLoader(
        model,
        data_loader=data_loader,
        data_sampler=build_data_sampler(cfg, dataset_cfg.DATA_SAMPLER, embedder),
        data_filter=build_data_filter(dataset_cfg.FILTER),
        shuffle=True,
        batch_size=dataset_cfg.INFERENCE.OUTPUT_BATCH_SIZE,
        inference_batch_size=dataset_cfg.INFERENCE.INPUT_BATCH_SIZE,
        category_to_class_mapping=meta.category_to_class_mapping,
    )
예제 #5
0
def build_detection_train_loader(cfg):
    """Builds a data loader for the baseline trainer with support of training on the subset of labeled data only.

    Most of code comes from `d2.data.build.build_detection_train_loader()`, see it for more details.
    """

    # CSD: check config is supported
    assert cfg.DATALOADER.SAMPLER_TRAIN == "TrainingSampler", "Unsupported training sampler: {}".format(
        cfg.DATALOADER.SAMPLER_TRAIN)

    # Original code
    dataset = get_detection_dataset_dicts(
        cfg.DATASETS.TRAIN,
        filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
        min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
        if cfg.MODEL.KEYPOINT_ON else 0,
        proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN
        if cfg.MODEL.LOAD_PROPOSALS else None,
    )

    # CSD: subsample the dataset if needed
    dataset = check_subsample_dataset(dataset, cfg)

    if comm.is_main_process():  # Log counts
        logger = setup_logger(name=__name__)
        logger.debug("Number of images in the dataset: {}".format(
            len(dataset)))
        _log_api_usage("dataset." + cfg.DATASETS.TRAIN[0])

    # Original code
    mapper = DatasetMapper(cfg, True)

    sampler = TrainingSampler(len(dataset))

    dataset = DatasetFromList(dataset, copy=False)
    dataset = MapDataset(dataset, mapper)
    sampler = TrainingSampler(len(dataset))
    assert isinstance(sampler, torch.utils.data.sampler.Sampler)

    return build_batch_data_loader(
        dataset,
        sampler,
        cfg.SOLVER.IMS_PER_BATCH,
        aspect_ratio_grouping=cfg.DATALOADER.ASPECT_RATIO_GROUPING,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
    )
예제 #6
0
def build_detection_train_loader(cfg: CfgNode, mapper=None):
    """
    A data loader is created in a way similar to that of Detectron2.
    The main differences are:
     - it allows to combine datasets with different but compatible object category sets

    The data loader is created by the following steps:
    1. Use the dataset names in config to query :class:`DatasetCatalog`, and obtain a list of dicts.
    2. Start workers to work on the dicts. Each worker will:
        * Map each metadata dict into another format to be consumed by the model.
        * Batch them by simply putting dicts into a list.
    The batched ``list[mapped_dict]`` is what this dataloader will return.

    Args:
        cfg (CfgNode): the config
        mapper (callable): a callable which takes a sample (dict) from dataset and
            returns the format to be consumed by the model.
            By default it will be `DatasetMapper(cfg, True)`.

    Returns:
        an infinite iterator of training data
    """

    _add_category_whitelists_to_metadata(cfg)
    _add_category_maps_to_metadata(cfg)
    dataset_dicts = combine_detection_dataset_dicts(
        cfg.DATASETS.TRAIN,
        keep_instance_predicate=_get_train_keep_instance_predicate(cfg),
        proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None,
    )
    dataset = DatasetFromList(dataset_dicts, copy=False)

    if mapper is None:
        mapper = DatasetMapper(cfg, True)
    dataset = MapDataset(dataset, mapper)

    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    logger = logging.getLogger(__name__)
    logger.info("Using training sampler {}".format(sampler_name))
    if sampler_name == "TrainingSampler":
        sampler = TrainingSampler(len(dataset))
    elif sampler_name == "RepeatFactorTrainingSampler":
        repeat_factors = RepeatFactorTrainingSampler.repeat_factors_from_category_frequency(
            dataset_dicts, cfg.DATALOADER.REPEAT_THRESHOLD
        )
        sampler = RepeatFactorTrainingSampler(repeat_factors)
    else:
        raise ValueError("Unknown training sampler: {}".format(sampler_name))

    return build_batch_data_loader(
        dataset,
        sampler,
        cfg.SOLVER.IMS_PER_BATCH,
        aspect_ratio_grouping=cfg.DATALOADER.ASPECT_RATIO_GROUPING,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
    )
예제 #7
0
def build_detection_train_loader(cfg, mapper=None):
    """
    A data loader is created by the following steps:

    1. Use the dataset names in config to query :class:`DatasetCatalog`, and obtain a list of dicts.
    2. Coordinate a random shuffle order shared among all processes (all GPUs)
    3. Each process spawn another few workers to process the dicts. Each worker will:
       * Map each metadata dict into another format to be consumed by the model.
       * Batch them by simply putting dicts into a list.

    The batched ``list[mapped_dict]`` is what this dataloader will yield.

    Args:
        cfg (CfgNode): the config
        mapper (callable): a callable which takes a sample (dict) from dataset and
            returns the format to be consumed by the model.
            By default it will be ``DatasetMapper(cfg, True)``.

    Returns:
        an infinite iterator of training data
    """
    dataset_dicts = get_detection_dataset_dicts(
        cfg.DATASETS.TRAIN,
        filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
        min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
        if cfg.MODEL.KEYPOINT_ON
        else 0,
        proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None,
    )
    dataset = DatasetFromList(dataset_dicts, copy=False)

    if mapper is None:
        mapper = DatasetMapper(cfg, True)
    dataset = MapDataset(dataset, mapper)
    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    logger = logging.getLogger(__name__)
    logger.info("Using training sampler {}".format(sampler_name))
    # TODO avoid if-else?
    if sampler_name == "TrainingSampler":
        sampler = TrainingSampler(len(dataset))
    elif sampler_name == "RepeatFactorTrainingSampler":
        repeat_factors = RepeatFactorTrainingSampler.repeat_factors_from_category_frequency(
            dataset_dicts, cfg.DATALOADER.REPEAT_THRESHOLD
        )
        sampler = RepeatFactorTrainingSampler(repeat_factors)
    else:
        raise ValueError("Unknown training sampler: {}".format(sampler_name))
    return build_batch_data_loader(
        dataset,
        sampler,
        cfg.SOLVER.IMS_PER_BATCH,
        aspect_ratio_grouping=cfg.DATALOADER.ASPECT_RATIO_GROUPING,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
    )
예제 #8
0
def build_detection_semisup_train_loader(cfg, mapper=None):

    dataset_dicts = get_detection_dataset_dicts(
        cfg.DATASETS.TRAIN,
        filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
        min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
        if cfg.MODEL.KEYPOINT_ON else 0,
        proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN
        if cfg.MODEL.LOAD_PROPOSALS else None,
    )

    # Divide into labeled and unlabeled sets according to supervision percentage
    label_dicts, unlabel_dicts = divide_label_unlabel(
        dataset_dicts,
        cfg.DATALOADER.SUP_PERCENT,
        cfg.DATALOADER.RANDOM_DATA_SEED,
        cfg.DATALOADER.RANDOM_DATA_SEED_PATH,
    )

    dataset = DatasetFromList(label_dicts, copy=False)

    if mapper is None:
        mapper = DatasetMapper(cfg, True)
    dataset = MapDataset(dataset, mapper)

    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    logger = logging.getLogger(__name__)
    logger.info("Using training sampler {}".format(sampler_name))

    if sampler_name == "TrainingSampler":
        sampler = TrainingSampler(len(dataset))
    elif sampler_name == "RepeatFactorTrainingSampler":
        repeat_factors = (
            RepeatFactorTrainingSampler.repeat_factors_from_category_frequency(
                label_dicts, cfg.DATALOADER.REPEAT_THRESHOLD))
        sampler = RepeatFactorTrainingSampler(repeat_factors)
    else:
        raise ValueError("Unknown training sampler: {}".format(sampler_name))

    # list num of labeled and unlabeled
    logger.info("Number of training samples " + str(len(dataset)))
    logger.info("Supervision percentage " + str(cfg.DATALOADER.SUP_PERCENT))

    return build_batch_data_loader(
        dataset,
        sampler,
        cfg.SOLVER.IMS_PER_BATCH,
        aspect_ratio_grouping=cfg.DATALOADER.ASPECT_RATIO_GROUPING,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
    )
예제 #9
0
def build_simple_dataloader(dataset_name: list, batch_size):
    dataset_dicts = get_detection_dataset_dicts(dataset_name)
    dataset = DatasetFromList(dataset_dicts, copy=False)

    cfg = get_cfg()
    cfg["aug_kwargs"] = {}

    dataset = MapDataset(dataset, AlbumentationsMapper(cfg, False))

    # set the shuffle to False in debugging mode
    sampler = TrainingSampler(len(dataset), shuffle=False, seed=42)
    dataloader = build_batch_data_loader(dataset=dataset, sampler=sampler,
                                         total_batch_size=batch_size)

    return dataloader
예제 #10
0
def build_hand_train_loader(cfg):
    dataset_dicts, num_per_epoch = load_hand(cfg.HAND_PROJECT.DATA.MODE, cfg.HAND_PROJECT.DATA.ANNOT_SUBSET_TRAIN, cfg.HAND_PROJECT.DATA.BASE_PATH, selects=cfg.HAND_PROJECT.DATA.SELECTS)
    
    # pdb.set_trace()
    dataset = DatasetFromList(dataset_dicts, copy=False)
    mapper = DatasetMapper(cfg, True)
    dataset = MapDataset(dataset, mapper)

    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    logger = logging.getLogger(__name__)
    logger.info("Using training sampler {}".format(sampler_name))
    
    sampler = TrainingSampler(len(dataset))
    
    return build_batch_data_loader(dataset, sampler, cfg.SOLVER.IMS_PER_BATCH, num_workers=cfg.DATALOADER.NUM_WORKERS), num_per_epoch
예제 #11
0
def build_detection_train_loader_with_train_sampler(cfg, mapper, seed=42, shuffle=True):
    dataset_dicts = get_detection_dataset_dicts(cfg.DATASETS.TRAIN)
    dataset = DatasetFromList(dataset_dicts, copy=False)
    dataset = MapDataset(dataset, mapper)

    logger = logging.getLogger(__name__)
    logger.info("Using training sampler TrainingSampler with shuffle=False")
    sampler = TrainingSampler(len(dataset), shuffle=shuffle, seed=seed)

    return build_batch_data_loader(
        dataset,
        sampler,
        cfg.SOLVER.IMS_PER_BATCH,
        aspect_ratio_grouping=cfg.DATALOADER.ASPECT_RATIO_GROUPING,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
    )
예제 #12
0
def build_x_train_loader(
    dataset, *, mapper, sampler=None, total_batch_size, aspect_ratio_grouping=True, num_workers=0
):
    if isinstance(dataset, list):
        dataset = DatasetFromList(dataset, copy=False)
    if mapper is not None:
        dataset = MapDataset(dataset, mapper)
    if sampler is None:
        sampler = TrainingSampler(len(dataset))
    assert isinstance(sampler, torch.utils.data.sampler.Sampler)
    return build_batch_data_loader(
        dataset,
        sampler,
        total_batch_size,
        aspect_ratio_grouping=aspect_ratio_grouping,
        num_workers=num_workers,
    )
예제 #13
0
    def test_to_iterable(self):
        sampler = TrainingSampler(100, seed=10)
        dataset = DatasetFromList(list(range(100)))
        dataset = ToIterableDataset(dataset, sampler)
        data_loader = data.DataLoader(dataset, num_workers=0, collate_fn=operator.itemgetter(0))

        output = list(itertools.islice(data_loader, 100))
        self.assertEqual(set(output), set(range(100)))

        data_loader = data.DataLoader(
            dataset,
            num_workers=2,
            collate_fn=operator.itemgetter(0),
            worker_init_fn=worker_init_reset_seed,
            # reset seed should not affect behavior of TrainingSampler
        )
        output = list(itertools.islice(data_loader, 100))
        # multiple workers should not lead to duplicate or different data
        self.assertEqual(set(output), set(range(100)))
예제 #14
0
def build_train_loader(cfg, mapper=None):
    if mapper is None:
        mapper = get_dataset_mapper(cfg.DATASETS.TRAIN[0])(cfg, True)

    dataset_dicts = get_detection_dataset_dicts(
        cfg.DATASETS.TRAIN,
        filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
        min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
        if cfg.MODEL.KEYPOINT_ON else 0,
        proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN
        if cfg.MODEL.LOAD_PROPOSALS else None,
    )
    dataset = DatasetFromList(dataset_dicts, copy=False)

    dataset = MapDataset(dataset, mapper)

    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    logger = logging.getLogger(__name__)
    logger.info("Using training sampler {}".format(sampler_name))
    # TODO avoid if-else?
    if sampler_name == "TrainingSampler":
        sampler = TrainingSampler(len(dataset))
    elif sampler_name == "RepeatFactorTrainingSampler":
        repeat_factors = RepeatFactorTrainingSampler.repeat_factors_from_category_frequency(
            dataset_dicts, cfg.DATALOADER.REPEAT_THRESHOLD)
        sampler = RepeatFactorTrainingSampler(repeat_factors)
    elif sampler_name == "RatioFactorTrainingSampler":
        repeat_factors = repeat_factors_from_ratios(dataset_dicts)
        sampler = RepeatFactorTrainingSampler(repeat_factors)

    else:
        raise ValueError("Unknown training sampler: {}".format(sampler_name))
    return build_batch_data_loader(
        dataset,
        sampler,
        cfg.SOLVER.IMS_PER_BATCH,
        aspect_ratio_grouping=cfg.DATALOADER.ASPECT_RATIO_GROUPING,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
    )
예제 #15
0
def _train_loader_from_config(cfg, mapper=None, *, dataset=None, sampler=None):
    if dataset is None:
        dataset = get_detection_dataset_dicts(
            cfg.DATASETS.TRAIN,
            filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
            min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
            if cfg.MODEL.KEYPOINT_ON
            else 0,
            proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None,
        )

    if mapper is None:
        mapper = DatasetMapper(cfg, True)

    if sampler is None:
        sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
        logger = logging.getLogger("detectron2.trainer")
        logger.info("Using training sampler {}".format(sampler_name))
        if sampler_name == "TrainingSampler":
            sampler = TrainingSampler(len(dataset))
        elif sampler_name == "RepeatFactorTrainingSampler":
            repeat_factors = RepeatFactorTrainingSampler.repeat_factors_from_category_frequency(
                dataset, cfg.DATALOADER.REPEAT_THRESHOLD
            )
            sampler = RepeatFactorTrainingSampler(repeat_factors)
        else:
            raise ValueError(
                "Unknown training sampler: {}".format(sampler_name))

    return {
        "dataset": dataset,
        "sampler": sampler,
        "mapper": mapper,
        "total_batch_size": cfg.SOLVER.IMS_PER_BATCH,
        "aspect_ratio_grouping": cfg.DATALOADER.ASPECT_RATIO_GROUPING,
        "num_workers": cfg.DATALOADER.NUM_WORKERS,
    }
예제 #16
0
def build_train_dataloader(cfg):  # like 'build_detection_train_loader'
    if 'coco_2017_train' in cfg.DATASETS.TRAIN:
        descs_train: List[Dict] = DatasetCatalog.get("coco_2017_train")
        ds_train = DatasetFromList(descs_train, copy=False)
        mapper = DatasetMapper(cfg, True)
    else:  # Open-Image-Dataset
        if 'get_detection_dataset_dicts':
            all_descs_train: List[Dict] = DatasetCatalog.get("oid_train")
        if 'rebalancing':
            image_id_vs_idx = {}
            for idx, desc in enumerate(all_descs_train):
                image_id_vs_idx[desc['image_id']] = idx
            descs_train = list(map(lambda img_id: all_descs_train[image_id_vs_idx[img_id]], sample_image_ids()))
            print('_' * 50 + f'train dataset len: {len(descs_train)}')

        ds_train = DatasetFromList(descs_train, copy=False)

        if 'DatasetMapper':
            augs = [RandomContrast(0.8, 1.2),
                    RandomBrightness(0.8, 1.2),
                    RandomSaturation(0.8, 1.2)]
            augs.extend(build_augmentation(cfg, is_train=True))
            mapper = make_mapper('oid_train', is_train=True, augmentations=T.AugmentationList(augs))
    ds_train = MapDataset(ds_train, mapper)

    sampler = TrainingSampler(len(ds_train))
    data_loader = build_batch_data_loader(
        ds_train,
        sampler,
        cfg.SOLVER.IMS_PER_BATCH,
        aspect_ratio_grouping=cfg.DATALOADER.ASPECT_RATIO_GROUPING,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
    )
    global DATA_LOADER
    DATA_LOADER = data_loader
    return data_loader
예제 #17
0
def build_ss_train_loader(cfg, mapper):
    """Builds a semi-supervised data loader that yields both labeled and unlabeled images.

    Data can be loaded in two modes (defined in `cfg.DATASETS.MODE`):
      - "CROSS_DATASET": labeled and unlabeled images come from two disparate datasets, e.g.
      VOCtrain and VOCtest
      - "RANDOM_SPLIT": labeled and unlabeled images come from the same dataset by splitting it
      into the labeled and unlabeled parts
    For more details see `build_ss_datasets()`.

    Each batch consists of `cfg.SOLVER.IMS_PER_BATCH_LABELED` labeled and
    `cfg.SOLVER.IMS_PER_BATCH_UNLABELED` unlabeled images, which can be modified
    in `csd/config/config.py` or in a custom `configs/*.yaml` config file
    supplied to your training script.

    The actual x-flips happen inside `AspectRatioGroupedSSDataset` that is instantiated by
    `build_ss_batch_data_loader`

    The returned tuple contains (1) a tuple of lists with dicts for labeled and unlabeled images
    and (2) a DataLoader with infinite sampling yielding a pair of batches with labeled and unlabeled
    images with the same aspect ratio within batch.

    Specifically, the returned DataLoader yields a tuple of lists:
    ([labeled_img, labeled_img_xflip], [unlabeled_im, unlabeled_img_xflip]).
    """

    # Load labeled and unlabeled dataset dicts (either use two separate ones or perform a random split)
    labeled_dataset_dicts, unlabeled_dataset_dicts = build_ss_datasets(cfg)

    # Log the datasets sizes
    if comm.is_main_process():
        logger = setup_logger(name=__name__)
        logger.debug(
            "Number of images in the labeled and unlabeled datasets: {}, {}".
            format(len(labeled_dataset_dicts), len(unlabeled_dataset_dicts)))

        # Print updated metadata counts
        print_instances_class_histogram(
            labeled_dataset_dicts,
            MetadataCatalog.get(cfg.DATASETS.TRAIN[0]).thing_classes)

    # Map metadata into actual objects (note: data augmentations also take place here)
    labeled_dataset = MapDataset(labeled_dataset_dicts, mapper)
    unlabeled_dataset = MapDataset(unlabeled_dataset_dicts, mapper)

    # Define data samplers
    assert cfg.DATALOADER.SAMPLER_TRAIN == "TrainingSampler", "Unsupported training sampler: {}".format(
        cfg.DATALOADER.SAMPLER_TRAIN)
    labeled_sampler = TrainingSampler(len(labeled_dataset))
    unlabeled_sampler = TrainingSampler(len(unlabeled_dataset))

    return (
        labeled_dataset_dicts,
        unlabeled_dataset_dicts,
    ), build_ss_batch_data_loader(  # Initialize actual dataloaders
        (labeled_dataset, unlabeled_dataset),
        (labeled_sampler, unlabeled_sampler),
        cfg.SOLVER.IMS_PER_BATCH_LABELED,
        cfg.SOLVER.IMS_PER_BATCH_UNLABELED,
        aspect_ratio_grouping=cfg.DATALOADER.ASPECT_RATIO_GROUPING,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
    )
예제 #18
0
                cfg.DATASETS.TRAIN,
                filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
                min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
                if cfg.MODEL.KEYPOINT_ON
                else 0,
                proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None,
            )
            dataset = DatasetFromList(dataset_dicts, copy=False)
            dataset = MapDataset(dataset, DatasetMapper(cfg, True))

            sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
            logger = logging.getLogger(__name__)
            logger.info("Using training sampler {}".format(sampler_name))
            # TODO avoid if-else?
            if sampler_name == "TrainingSampler":
                sampler = TrainingSampler(len(dataset))
            else:
                raise ValueError("Unknown training sampler: {}".format(sampler_name))
            data_loader = build_batch_data_loader(
                dataset,
                sampler,
                cfg.SOLVER.IMS_PER_BATCH,
                aspect_ratio_grouping=cfg.DATALOADER.ASPECT_RATIO_GROUPING,
                num_workers=cfg.DATALOADER.NUM_WORKERS,
            )

        # logger.info("Starting training from iteration {}".format(start_iter))
        # with EventStorage(start_iter) as storage:
        #     for data, iteration in zip(data_loader, range(start_iter, max_iter)):
        #         iteration = iteration + 1
        #         storage.step()
예제 #19
0
 def test_build_iterable_dataloader_train(self):
     kwargs = self._get_kwargs()
     ds = DatasetFromList(kwargs.pop("dataset"))
     ds = ToIterableDataset(ds, TrainingSampler(len(ds)))
     dl = build_detection_train_loader(dataset=ds, **kwargs)
     next(iter(dl))
예제 #20
0
def build_detection_semisup_train_loader_two_crops(cfg, mapper=None):
    if cfg.DATASETS.CROSS_DATASET:  # cross-dataset (e.g., coco-additional)
        label_dicts = get_detection_dataset_dicts(
            cfg.DATASETS.TRAIN_LABEL,
            filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
            min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
            if cfg.MODEL.KEYPOINT_ON else 0,
            proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN
            if cfg.MODEL.LOAD_PROPOSALS else None,
        )
        unlabel_dicts = get_detection_dataset_dicts(
            cfg.DATASETS.TRAIN_UNLABEL,
            filter_empty=False,
            min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
            if cfg.MODEL.KEYPOINT_ON else 0,
            proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN
            if cfg.MODEL.LOAD_PROPOSALS else None,
        )
    else:  # different degree of supervision (e.g., COCO-supervision)
        dataset_dicts = get_detection_dataset_dicts(
            cfg.DATASETS.TRAIN,
            filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
            min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
            if cfg.MODEL.KEYPOINT_ON else 0,
            proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN
            if cfg.MODEL.LOAD_PROPOSALS else None,
        )

        # Divide into labeled and unlabeled sets according to supervision percentage
        label_dicts, unlabel_dicts = divide_label_unlabel(
            dataset_dicts,
            cfg.DATALOADER.SUP_PERCENT,
            cfg.DATALOADER.RANDOM_DATA_SEED,
            cfg.DATALOADER.RANDOM_DATA_SEED_PATH,
        )

    label_dataset = DatasetFromList(label_dicts, copy=False)
    # exclude the labeled set from unlabeled dataset
    unlabel_dataset = DatasetFromList(unlabel_dicts, copy=False)
    # include the labeled set in unlabel dataset
    # unlabel_dataset = DatasetFromList(dataset_dicts, copy=False)

    if mapper is None:
        mapper = DatasetMapper(cfg, True)
    label_dataset = MapDataset(label_dataset, mapper)
    unlabel_dataset = MapDataset(unlabel_dataset, mapper)

    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    logger = logging.getLogger(__name__)
    logger.info("Using training sampler {}".format(sampler_name))
    if sampler_name == "TrainingSampler":
        label_sampler = TrainingSampler(len(label_dataset))
        unlabel_sampler = TrainingSampler(len(unlabel_dataset))
    elif sampler_name == "RepeatFactorTrainingSampler":
        raise NotImplementedError("{} not yet supported.".format(sampler_name))
    else:
        raise ValueError("Unknown training sampler: {}".format(sampler_name))
    return build_semisup_batch_data_loader_two_crop(
        (label_dataset, unlabel_dataset),
        (label_sampler, unlabel_sampler),
        cfg.SOLVER.IMG_PER_BATCH_LABEL,
        cfg.SOLVER.IMG_PER_BATCH_UNLABEL,
        aspect_ratio_grouping=cfg.DATALOADER.ASPECT_RATIO_GROUPING,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
    )