Example #1
    def inject_fake_data(self, tmpdir, config):
        root = pathlib.Path(tmpdir) / "caltech101"
        images = root / "101_ObjectCategories"
        annotations = root / "Annotations"

        categories = (("Faces", "Faces_2"), ("helicopter", "helicopter"),
                      ("ying_yang", "ying_yang"))
        num_images_per_category = 2

        for image_category, annotation_category in categories:
            datasets_utils.create_image_folder(
                root=images,
                name=image_category,
                file_name_fn=lambda idx: f"image_{idx + 1:04d}.jpg",
                num_examples=num_images_per_category,
            )
            self._create_annotation_folder(
                root=annotations,
                name=annotation_category,
                file_name_fn=lambda idx: f"annotation_{idx + 1:04d}.mat",
                num_examples=num_images_per_category,
            )

        # This is included in the original archive, but is removed by the dataset. Thus, an empty directory suffices.
        os.makedirs(images / "BACKGROUND_Google")

        return num_images_per_category * len(categories)
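All of the examples on this page lean on a create_image_folder helper from the test utilities. As a rough sketch of what such a helper plausibly does (the PIL-based implementation and the fixed 32x32 size are assumptions, not the actual utility):

import pathlib

import numpy as np
from PIL import Image


def create_image_folder(root, name, file_name_fn, num_examples, size=(32, 32)):
    # Create root/name and fill it with small random RGB images.
    folder = pathlib.Path(root) / name
    folder.mkdir(parents=True, exist_ok=True)

    paths = []
    for idx in range(num_examples):
        path = folder / file_name_fn(idx)
        # Random noise is enough for tests that only check counts and decodability.
        pixels = np.random.randint(0, 256, size=(*size, 3), dtype=np.uint8)
        Image.fromarray(pixels).save(path)
        paths.append(path)
    return paths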
Example #2
def country211(info, root, config):
    split_name_mapper = {
        "train": "train",
        "val": "valid",
        "test": "test",
    }
    split_folder = pathlib.Path(root, "country211", split_name_mapper[config["split"]])
    split_folder.mkdir(parents=True, exist_ok=True)

    num_examples = {
        "train": 3,
        "val": 4,
        "test": 5,
    }[config["split"]]

    classes = ("AD", "BS", "GR")
    for cls in classes:
        create_image_folder(
            split_folder,
            name=cls,
            file_name_fn=lambda idx: f"{idx}.jpg",
            num_examples=num_examples,
        )
    make_tar(root, f"{split_folder.parent.name}.tgz", split_folder.parent, compression="gz")
    return num_examples * len(classes)
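make_tar is the companion helper that packs the fake files into the archive layout the dataset loader expects. A minimal sketch consistent with the calls on this page, assuming that when no source is passed the archive stem names a sibling folder under root:

import pathlib
import tarfile


def make_tar(root, name, *sources, compression=None):
    root = pathlib.Path(root)
    archive = root / name
    if not sources:
        # e.g. "country211.tgz" packs root / "country211"
        sources = (root / name.split(".")[0],)
    mode = f"w:{compression}" if compression else "w"
    with tarfile.open(archive, mode) as fh:
        for source in sources:
            source = pathlib.Path(source)
            # tarfile adds directories recursively by default.
            fh.add(source, arcname=source.name)
    return archive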
Example #3
    def inject_fake_data(self, tmpdir, config):
        year, is_test_set = (
            ("2007", True)
            if config["year"] == "2007-test" or config["image_set"] == "test"
            else (config["year"], False)
        )
        image_set = config["image_set"]

        base_dir = pathlib.Path(tmpdir)
        if year == "2011":
            base_dir /= "TrainVal"
        base_dir = base_dir / "VOCdevkit" / f"VOC{year}"
        os.makedirs(base_dir)

        num_images, num_images_per_image_set = self._create_image_set_files(
            base_dir, "ImageSets", is_test_set)
        datasets_utils.create_image_folder(base_dir, "JPEGImages",
                                           lambda idx: f"{idx:06d}.jpg",
                                           num_images)

        datasets_utils.create_image_folder(base_dir, "SegmentationClass",
                                           lambda idx: f"{idx:06d}.png",
                                           num_images)
        annotation = self._create_annotation_files(base_dir, "Annotations",
                                                   num_images)

        return dict(num_examples=num_images_per_image_set[image_set],
                    annotation=annotation)
Example #4
def imagenet(info, root, config):
    from scipy.io import savemat

    categories = info.categories
    wnids = [info.extra.category_to_wnid[category] for category in categories]
    if config.split == "train":
        num_samples = len(wnids)
        archive_name = "ILSVRC2012_img_train.tar"

        files = []
        for wnid in wnids:
            create_image_folder(
                root=root,
                name=wnid,
                file_name_fn=lambda image_idx: f"{wnid}_{image_idx:04d}.JPEG",
                num_examples=1,
            )
            files.append(make_tar(root, f"{wnid}.tar"))
    elif config.split == "val":
        num_samples = 3
        archive_name = "ILSVRC2012_img_val.tar"
        files = [
            create_image_file(root, f"ILSVRC2012_val_{idx + 1:08d}.JPEG")
            for idx in range(num_samples)
        ]

        devkit_root = root / "ILSVRC2012_devkit_t12"
        data_root = devkit_root / "data"
        data_root.mkdir(parents=True)

        with open(data_root / "ILSVRC2012_validation_ground_truth.txt", "w") as file:
            for label in torch.randint(0, len(wnids), (num_samples,)).tolist():
                file.write(f"{label}\n")

        num_children = 0
        synsets = [
            (idx, wnid, category, "", num_children, [], 0, 0)
            for idx, (category, wnid) in enumerate(zip(categories, wnids), 1)
        ]
        num_children = 1
        synsets.extend(
            (0, "", "", "", num_children, [], 0, 0) for _ in range(5))
        savemat(data_root / "meta.mat", dict(synsets=synsets))

        make_tar(root,
                 devkit_root.with_suffix(".tar.gz").name,
                 compression="gz")
    else:  # config.split == "test"
        num_samples = 5
        archive_name = "ILSVRC2012_img_test_v10102019.tar"
        files = [
            create_image_file(root, f"ILSVRC2012_test_{idx + 1:08d}.JPEG")
            for idx in range(num_samples)
        ]

    make_tar(root, archive_name, *files)

    return num_samples
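If you want to inspect the devkit metadata written in the val branch, scipy can read it back; the relative path below assumes the current working directory is root:

from scipy.io import loadmat

meta = loadmat("ILSVRC2012_devkit_t12/data/meta.mat", squeeze_me=True)
print(meta["synsets"])  # records of (idx, wnid, category, ..., num_children, ...)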
Example #5
    def generate(self, root):
        classification_anns_meta = (
            dict(cls="Abyssinian", label=0, species="cat"),
            dict(cls="Keeshond", label=18, species="dog"),
            dict(cls="Yorkshire Terrier", label=36, species="dog"),
        )
        split_and_classification_anns = [
            self._meta_to_split_and_classification_ann(meta, idx)
            for meta, idx in itertools.product(classification_anns_meta, (1, 2, 10))
        ]
        image_ids, *_ = zip(*split_and_classification_anns)

        image_files = create_image_folder(
            root,
            "images",
            file_name_fn=lambda idx: f"{image_ids[idx]}.jpg",
            num_examples=len(image_ids))

        anns_folder = root / "annotations"
        anns_folder.mkdir()
        random.shuffle(split_and_classification_anns)
        splits = ("trainval", "test")
        num_samples_map = {}
        for offset, split in enumerate(splits):
            split_and_classification_anns_in_split = split_and_classification_anns[
                offset::len(splits)]
            with open(anns_folder / f"{split}.txt", "w") as file:
                writer = csv.writer(file, delimiter=" ")
                for split_and_classification_ann in split_and_classification_anns_in_split:
                    writer.writerow(split_and_classification_ann)

            num_samples_map[split] = len(
                split_and_classification_anns_in_split)

        segmentation_files = create_image_folder(
            anns_folder,
            "trimaps",
            file_name_fn=lambda idx: f"{image_ids[idx]}.png",
            num_examples=len(image_ids))

        # The dataset has some rogue files
        for path in image_files[:3]:
            path.with_suffix(".mat").touch()
        for path in segmentation_files:
            path.with_name(f".{path.name}").touch()

        make_tar(root, "images.tar.gz", compression="gz")
        make_tar(root,
                 anns_folder.with_suffix(".tar.gz").name,
                 compression="gz")

        return num_samples_map
Example #6
def caltech101(info, root, config):
    def create_ann_file(root, name):
        import scipy.io

        box_coord = make_tensor((1, 4), dtype=torch.int32,
                                low=0).numpy().astype(np.uint16)
        obj_contour = make_tensor((2, int(torch.randint(3, 6, size=()))),
                                  dtype=torch.float64,
                                  low=0).numpy()

        scipy.io.savemat(str(pathlib.Path(root) / name),
                         dict(box_coord=box_coord, obj_contour=obj_contour))

    def create_ann_folder(root, name, file_name_fn, num_examples):
        root = pathlib.Path(root) / name
        root.mkdir(parents=True)

        for idx in range(num_examples):
            create_ann_file(root, file_name_fn(idx))

    images_root = root / "101_ObjectCategories"
    anns_root = root / "Annotations"

    ann_category_map = {
        "Faces_2": "Faces",
        "Faces_3": "Faces_easy",
        "Motorbikes_16": "Motorbikes",
        "Airplanes_Side_2": "airplanes",
    }

    num_images_per_category = 2
    for category in info.categories:
        create_image_folder(
            root=images_root,
            name=category,
            file_name_fn=lambda idx: f"image_{idx + 1:04d}.jpg",
            num_examples=num_images_per_category,
        )
        create_ann_folder(
            root=anns_root,
            name=ann_category_map.get(category, category),
            file_name_fn=lambda idx: f"annotation_{idx + 1:04d}.mat",
            num_examples=num_images_per_category,
        )

    (images_root / "BACKGROUND_Google").mkdir()
    make_tar(root, f"{images_root.name}.tar.gz", images_root, compression="gz")

    make_tar(root, f"{anns_root.name}.tar", anns_root)

    return num_images_per_category * len(info.categories)
Example #7
    def generate(cls, root):
        archive_folder = root / "benchmark_RELEASE"
        dataset_folder = archive_folder / "dataset"
        dataset_folder.mkdir(parents=True, exist_ok=True)

        ids, num_samples_map = cls._make_split_files(defaultdict(lambda: dataset_folder, {"train_noval": root}))
        sizes = cls._make_anns_folder(dataset_folder, "cls", ids)
        create_image_folder(
            dataset_folder, "img", lambda idx: f"{ids[idx]}.jpg", num_examples=len(ids), size=lambda idx: sizes[idx]
        )

        make_tar(root, "benchmark.tgz", archive_folder, compression="gz")

        return num_samples_map
Example #8
def imagenet(info, root, config):
    wnids = tuple(info.extra.wnid_to_category.keys())
    if config.split == "train":
        images_root = root / "ILSVRC2012_img_train"

        num_samples = len(wnids)

        for wnid in wnids:
            files = create_image_folder(
                root=images_root,
                name=wnid,
                file_name_fn=lambda image_idx: f"{wnid}_{image_idx:04d}.JPEG",
                num_examples=1,
            )
            make_tar(images_root, f"{wnid}.tar", files[0].parent)
    elif config.split == "val":
        num_samples = 3
        files = create_image_folder(
            root=root,
            name="ILSVRC2012_img_val",
            file_name_fn=lambda image_idx: f"ILSVRC2012_val_{image_idx + 1:08d}.JPEG",
            num_examples=num_samples,
        )
        images_root = files[0].parent
    else:  # config.split == "test"
        images_root = root / "ILSVRC2012_img_test_v10102019"

        num_samples = 3

        create_image_folder(
            root=images_root,
            name="test",
            file_name_fn=lambda image_idx: f"ILSVRC2012_test_{image_idx + 1:08d}.JPEG",
            num_examples=num_samples,
        )
    make_tar(root, f"{images_root.name}.tar", images_root)

    devkit_root = root / "ILSVRC2012_devkit_t12"
    devkit_root.mkdir()
    data_root = devkit_root / "data"
    data_root.mkdir()
    with open(data_root / "ILSVRC2012_validation_ground_truth.txt", "w") as file:
        for label in torch.randint(0, len(wnids), (num_samples,)).tolist():
            file.write(f"{label}\n")
    make_tar(root, f"{devkit_root}.tar.gz", devkit_root, compression="gz")

    return num_samples
Example #9
def eurosat(info, root, config):
    data_folder = pathlib.Path(root, "eurosat", "2750")
    data_folder.mkdir(parents=True)

    num_examples_per_class = 3
    classes = ("AnnualCrop", "Forest")
    for cls in classes:
        create_image_folder(
            root=data_folder,
            name=cls,
            file_name_fn=lambda idx: f"{cls}_{idx}.jpg",
            num_examples=num_examples_per_class,
        )
    make_zip(root, "EuroSAT.zip", data_folder)
    return len(classes) * num_examples_per_class
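make_zip plays the same role as make_tar for zip-based datasets. A hypothetical sketch matching the call above, which keeps the archived folder as the top-level entry in the archive:

import pathlib
import zipfile


def make_zip(root, name, folder):
    root = pathlib.Path(root)
    folder = pathlib.Path(folder)
    archive = root / name
    with zipfile.ZipFile(archive, "w") as fh:
        for path in sorted(folder.rglob("*")):
            # Keep the folder itself (here "2750/...") as the archive prefix.
            fh.write(path, arcname=path.relative_to(folder.parent))
    return archive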
Example #10
    def _create_lmdb(self, root, cls):
        lmdb = datasets_utils.lazy_importer.lmdb
        hexdigits_lowercase = string.digits + string.ascii_lowercase[:6]

        folder = f"{cls}_lmdb"

        num_images = torch.randint(1, 4, size=()).item()
        format = "webp"
        files = datasets_utils.create_image_folder(
            root, folder, lambda idx: f"{idx}.{format}", num_images)

        with lmdb.open(str(root / folder)) as env, env.begin(write=True) as txn:
            for file in files:
                key = "".join(
                    random.choice(hexdigits_lowercase)
                    for _ in range(40)).encode()

                buffer = io.BytesIO()
                Image.open(file).save(buffer, format)
                buffer.seek(0)
                value = buffer.read()

                txn.put(key, value)

                os.remove(file)

        return num_images
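To sanity-check the resulting database, the LMDB environment can be read back with a plain cursor. The folder name below is a hypothetical instance of the f"{cls}_lmdb" pattern used above:

import io

import lmdb
from PIL import Image

with lmdb.open("church_outdoor_train_lmdb", readonly=True) as env:
    with env.begin() as txn:
        for key, value in txn.cursor():
            # Each value holds the raw webp bytes written by the snippet above.
            image = Image.open(io.BytesIO(value))
            print(key.decode(), image.format, image.size)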
Example #11
    def inject_fake_data(self, tmpdir, config):
        base_folder = pathlib.Path(tmpdir) / "celeba"
        os.makedirs(base_folder)

        num_images, num_images_per_split = self._create_split_txt(base_folder)

        datasets_utils.create_image_folder(base_folder, "img_align_celeba",
                                           lambda idx: f"{idx + 1:06d}.jpg",
                                           num_images)
        attr_names = self._create_attr_txt(base_folder, num_images)
        self._create_identity_txt(base_folder, num_images)
        self._create_bbox_txt(base_folder, num_images)
        self._create_landmarks_txt(base_folder, num_images)

        return dict(num_examples=num_images_per_split[config["split"]],
                    attr_names=attr_names)
Example #12
    def inject_fake_data(self, tmpdir, config):
        tmpdir = pathlib.Path(tmpdir) / "caltech256" / "256_ObjectCategories"

        categories = ((1, "ak47"), (127, "laptop-101"), (257, "clutter"))
        num_images_per_category = 2

        for idx, category in categories:
            datasets_utils.create_image_folder(
                tmpdir,
                name=f"{idx:03d}.{category}",
                file_name_fn=lambda image_idx: f"{idx:03d}_{image_idx + 1:04d}.jpg",
                num_examples=num_images_per_category,
            )

        return num_images_per_category * len(categories)
Example #13
    def _make_images_archive(cls, root, name, *, num_samples):
        image_paths = create_image_folder(
            root, name, file_name_fn=lambda idx: f"{idx:012d}.jpg", num_examples=num_samples
        )

        images_meta = []
        for path in image_paths:
            with PIL.Image.open(path) as image:
                width, height = image.size
            images_meta.append(dict(file_name=path.name, id=int(path.stem), width=width, height=height))

        make_zip(root, f"{name}.zip")

        return images_meta
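The images_meta returned above is already shaped for the images section of a COCO-style annotation file. A hedged sketch of how it could be consumed (the file name and the empty annotations list are assumptions):

import json
import pathlib


def make_instances_file(root, name, images_meta):
    # images_meta: list of dicts with file_name, id, width, height as above.
    with open(pathlib.Path(root) / name, "w") as file:
        json.dump(dict(images=images_meta, annotations=[]), file)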
Example #14
def dtd(info, root, _):
    data_folder = root / "dtd"

    num_images_per_class = 3
    image_folder = data_folder / "images"
    categories = {"banded", "marbled", "zigzagged"}
    image_ids_per_category = {
        category: [
            str(path.relative_to(path.parents[1]).as_posix())
            for path in create_image_folder(
                image_folder,
                category,
                file_name_fn=lambda idx: f"{category}_{idx:04d}.jpg",
                num_examples=num_images_per_class,
            )
        ]
        for category in categories
    }

    meta_folder = data_folder / "labels"
    meta_folder.mkdir()

    with open(meta_folder / "labels_joint_anno.txt", "w") as file:
        for cls, image_ids in image_ids_per_category.items():
            for image_id in image_ids:
                joint_categories = random.choices(
                    list(categories - {cls}),
                    k=int(torch.randint(len(categories) - 1, ())))
                file.write(
                    " ".join([image_id, *sorted([cls, *joint_categories])]) +
                    "\n")

    image_ids = list(itertools.chain(*image_ids_per_category.values()))
    splits = ("train", "val", "test")
    num_samples_map = {}
    for fold in range(1, 11):
        random.shuffle(image_ids)
        for offset, split in enumerate(splits):
            image_ids_in_config = image_ids[offset::len(splits)]
            with open(meta_folder / f"{split}{fold}.txt", "w") as file:
                file.write("\n".join(image_ids_in_config) + "\n")

            num_samples_map[info.make_config(
                split=split, fold=str(fold))] = len(image_ids_in_config)

    make_tar(root, "dtd-r1.0.1.tar.gz", data_folder, compression="gz")

    return num_samples_map
Example #15
def caltech256(info, root, config):
    dir = root / "256_ObjectCategories"
    num_images_per_category = 2

    for idx, category in enumerate(info.categories, 1):
        files = create_image_folder(
            dir,
            name=f"{idx:03d}.{category}",
            file_name_fn=lambda image_idx: f"{idx:03d}_{image_idx + 1:04d}.jpg",
            num_examples=num_images_per_category,
        )
        if category == "spider":
            open(files[0].parent / "RENAME2", "w").close()

    make_tar(root, f"{dir.name}.tar", dir)

    return num_images_per_category * len(info.categories)
Example #16
    def _make_images(cls, images_folder):
        image_files = []
        for category_idx, category in [
            (1, "Black_footed_Albatross"),
            (100, "Brown_Pelican"),
            (200, "Common_Yellowthroat"),
        ]:
            image_files.extend(
                create_image_folder(
                    images_folder,
                    cls._category_folder(category, category_idx),
                    lambda image_idx: f"{cls._file_stem(category, image_idx)}.jpg",
                    num_examples=5,
                ))

        return image_files
Example #17
    def generate(cls, root):
        image_file_names, num_samples_map = cls._make_split_file(root)

        image_files = create_image_folder(
            root, "img_align_celeba", file_name_fn=lambda idx: image_file_names[idx], num_examples=len(image_file_names)
        )
        make_zip(root, image_files[0].parent.with_suffix(".zip").name)

        for make_ann_file_fn in (
            cls._make_identity_file,
            cls._make_attributes_file,
            cls._make_bounding_boxes_file,
            cls._make_landmarks_file,
        ):
            make_ann_file_fn(root, image_file_names)

        return num_samples_map
Example #18
def clevr(info, root, config):
    data_folder = root / "CLEVR_v1.0"

    num_samples_map = {
        "train": 3,
        "val": 2,
        "test": 1,
    }

    images_folder = data_folder / "images"
    image_files = {
        split: create_image_folder(
            images_folder,
            split,
            file_name_fn=lambda idx: f"CLEVR_{split}_{idx:06d}.jpg",
            num_examples=num_samples,
        )
        for split, num_samples in num_samples_map.items()
    }

    scenes_folder = data_folder / "scenes"
    scenes_folder.mkdir()
    for split in ["train", "val"]:
        with open(scenes_folder / f"CLEVR_{split}_scenes.json", "w") as file:
            json.dump(
                {
                    "scenes": [
                        {
                            "image_filename": image_file.name,
                            # We currently only return the number of objects in a scene.
                            # Thus, it is sufficient for now to only mock the number of elements.
                            "objects": [None] * int(torch.randint(1, 5, ())),
                        } for image_file in image_files[split]
                    ]
                },
                file,
            )

    make_zip(root, f"{data_folder.name}.zip")

    return {
        config_: num_samples_map[config_.split]
        for config_ in info._configs
    }
Example #19
    def inject_fake_data(self, tmpdir, config):
        tmpdir = pathlib.Path(tmpdir)

        num_images = 3
        num_annotations_per_image = 2

        image_folder = tmpdir / "images"
        files = datasets_utils.create_image_folder(
            tmpdir,
            name="images",
            file_name_fn=lambda idx: f"{idx:012d}.jpg",
            num_examples=num_images)
        file_names = [file.relative_to(image_folder) for file in files]

        annotation_folder = tmpdir / "annotations"
        os.makedirs(annotation_folder)
        annotation_file, info = self._create_annotation_file(
            annotation_folder, file_names, num_annotations_per_image)

        info["num_examples"] = num_images
        return (str(image_folder), str(annotation_file)), info
Example #20
    def inject_fake_data(self, tmpdir, config):
        tmpdir = pathlib.Path(tmpdir)

        num_images = 3
        num_annotations_per_image = 2

        files = datasets_utils.create_image_folder(
            tmpdir,
            name=self._IMAGE_FOLDER,
            file_name_fn=lambda idx: f"{idx:012d}.jpg",
            num_examples=num_images)
        file_names = [
            file.relative_to(tmpdir / self._IMAGE_FOLDER) for file in files
        ]

        annotation_folder = tmpdir / self._ANNOTATIONS_FOLDER
        os.makedirs(annotation_folder)
        info = self._create_annotation_file(annotation_folder,
                                            self._ANNOTATIONS_FILE, file_names,
                                            num_annotations_per_image)

        info["num_examples"] = num_images
        return info
Example #21
def gtsrb(info, root, config):
    num_examples_per_class = 5 if config.split == "train" else 3
    classes = ("00000", "00042", "00012")
    num_examples = num_examples_per_class * len(classes)

    csv_columns = [
        "Filename", "Width", "Height", "Roi.X1", "Roi.Y1", "Roi.X2", "Roi.Y2",
        "ClassId"
    ]

    def _make_ann_file(path, num_examples, class_idx):
        if class_idx == "random":
            class_idx = torch.randint(1, len(classes) + 1, size=(1, )).item()

        with open(path, "w") as csv_file:
            writer = csv.DictWriter(csv_file,
                                    fieldnames=csv_columns,
                                    delimiter=";")
            writer.writeheader()
            for image_idx in range(num_examples):
                writer.writerow({
                    "Filename": f"{image_idx:05d}.ppm",
                    "Width": torch.randint(1, 100, size=()).item(),
                    "Height": torch.randint(1, 100, size=()).item(),
                    "Roi.X1": torch.randint(1, 100, size=()).item(),
                    "Roi.Y1": torch.randint(1, 100, size=()).item(),
                    "Roi.X2": torch.randint(1, 100, size=()).item(),
                    "Roi.Y2": torch.randint(1, 100, size=()).item(),
                    "ClassId": class_idx,
                })

    if config["split"] == "train":
        train_folder = root / "GTSRB" / "Training"
        train_folder.mkdir(parents=True)

        for class_idx in classes:
            create_image_folder(
                train_folder,
                name=class_idx,
                file_name_fn=lambda image_idx: f"{class_idx}_{image_idx:05d}.ppm",
                num_examples=num_examples_per_class,
            )
            _make_ann_file(
                path=train_folder / class_idx / f"GT-{class_idx}.csv",
                num_examples=num_examples_per_class,
                class_idx=int(class_idx),
            )
        make_zip(root, "GTSRB-Training_fixed.zip", train_folder)
    else:
        test_folder = root / "GTSRB" / "Final_Test"
        test_folder.mkdir(parents=True)

        create_image_folder(
            test_folder,
            name="Images",
            file_name_fn=lambda image_idx: f"{image_idx:05d}.ppm",
            num_examples=num_examples,
        )

        make_zip(root, "GTSRB_Final_Test_Images.zip", test_folder)

        _make_ann_file(
            path=root / "GT-final_test.csv",
            num_examples=num_examples,
            class_idx="random",
        )

        make_zip(root, "GTSRB_Final_Test_GT.zip", "GT-final_test.csv")

    return num_examples
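The generated annotation files can be read back with the same ";" delimiter, e.g. for a quick assertion in a test (the path is the ground-truth file written above, before it is zipped):

import csv

with open("GT-final_test.csv", newline="") as csv_file:
    rows = list(csv.DictReader(csv_file, delimiter=";"))

assert all(row["Filename"].endswith(".ppm") for row in rows)
print(len(rows), "annotations")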