예제 #1
0
 def _prepare(self):
     """Build the cached ``data.p`` pickle for this dataset when needed.

     ``self.root`` and ``self._data_path`` are set on every call. The
     remaining steps run only when ``edu.is_prepared`` reports the root as
     not prepared: each value of ``self.FILES`` is joined onto ``self.URL``,
     the resulting mapping is handed to ``edu.download_urls``, every returned
     entry is passed through ``read_mnist_file``, and the collected dict is
     pickled to ``self._data_path``. Finally the root is flagged via
     ``edu.mark_prepared``.
     """
     self.root = edu.get_root(self.NAME)
     self._data_path = Path(self.root).joinpath("data.p")
     if edu.is_prepared(self.root):
         # Root is already flagged as prepared; only the attributes above
         # needed to be set.
         return

     message = "Preparing dataset {} in {}".format(self.NAME, self.root)
     self.logger.info(message)

     # Each URL is keyed by the FILES value it was built from, so the
     # mapping returned by download_urls can be looked up the same way.
     remote = {
         fname: urllib.parse.urljoin(self.URL, fname)
         for _key, fname in self.FILES.items()
     }
     fetched = edu.download_urls(remote, target_dir=Path(self.root))

     parsed = {key: read_mnist_file(path) for key, path in fetched.items()}
     with open(self._data_path, "wb") as out:
         pickle.dump(parsed, out)

     # Flag the root last, after the pickle has been written.
     edu.mark_prepared(self.root)
예제 #2
0
    def _prepare(self):
        """Build the cached ``data.p`` pickle for the CIFAR-10 archive.

        ``self.root`` and ``self._data_path`` are set on every call. When
        ``edu.is_prepared`` reports the root as not prepared, the method:

        1. joins each value of ``self.FILES`` onto ``self.URL`` and passes
           the mapping to ``edu.download_urls``;
        2. calls ``edu.unpack`` on the ``cifar-10-python.tar.gz`` entry;
        3. reads ``data_batch_1`` .. ``data_batch_5``, ``test_batch`` and
           ``batches.meta`` from the ``cifar-10-batches-py`` folder;
        4. reshapes the image rows to ``(N, 3, 32, 32)`` and transposes them
           to channel-last ``(N, 32, 32, 3)``;
        5. pickles a dict with ``"train"``, ``"test"`` and ``"meta"`` entries
           to ``self._data_path`` and flags the root via
           ``edu.mark_prepared``.
        """
        self.root = edu.get_root(self.NAME)
        self._data_path = Path(self.root).joinpath("data.p")
        if edu.is_prepared(self.root):
            # Root is already flagged as prepared; nothing else to do.
            return

        def load_pickle(folder, name):
            # NOTE(review): this unpickles files that came from a download;
            # pickle can execute code on load, so confirm the source is
            # trusted.
            with open(os.path.join(folder, name), "rb") as handle:
                return pickle.load(handle, encoding="bytes")

        def field(record, key):
            # Loading with encoding="bytes" leaves the dict keys as bytes.
            return record[key.encode()]

        def decoded(items):
            return [item.decode() for item in items]

        def channels_last(flat):
            # (N, 3*32*32) rows -> (N, 3, 32, 32) -> (N, 32, 32, 3).
            # The original note describes the result as RGB uint8; the dtype
            # is not changed here.
            stacked = np.reshape(flat, [-1, 3, 32, 32])
            return np.transpose(stacked, [0, 2, 3, 1])

        message = "Preparing dataset {} in {}".format(self.NAME, self.root)
        self.logger.info(message)

        remote = {
            fname: urllib.parse.urljoin(self.URL, fname)
            for _key, fname in self.FILES.items()
        }
        fetched = edu.download_urls(remote, target_dir=Path(self.root))
        edu.unpack(fetched["cifar-10-python.tar.gz"])
        batch_dir = os.path.join(self.root, "cifar-10-batches-py")

        # Training split: five batch files accumulated in order.
        train_labels = []
        train_names = []
        train_chunks = []
        for index in range(1, 6):
            batch = load_pickle(batch_dir, "data_batch_{}".format(index))
            train_labels.extend(field(batch, "labels"))
            train_names.extend(decoded(field(batch, "filenames")))
            train_chunks.append(field(batch, "data"))

        # Test split: a single batch file.
        held_out = load_pickle(batch_dir, "test_batch")
        test_labels = field(held_out, "labels")
        test_names = decoded(field(held_out, "filenames"))
        test_flat = field(held_out, "data")

        raw_meta = load_pickle(batch_dir, "batches.meta")
        meta = {
            "label_names": decoded(field(raw_meta, "label_names")),
            "num_vis": field(raw_meta, "num_vis"),
            "num_cases_per_batch": field(raw_meta, "num_cases_per_batch"),
        }

        train_images = channels_last(np.concatenate(train_chunks, axis=0))
        test_images = channels_last(test_flat)

        train_names = np.array(train_names)
        test_names = np.array(test_names)
        train_labels = np.array(train_labels)
        test_labels = np.array(test_labels)

        data = {
            "train": dict(
                images=train_images,
                filenames=train_names,
                labels=train_labels,
            ),
            "test": dict(
                images=test_images,
                filenames=test_names,
                labels=test_labels,
            ),
            "meta": meta,
        }
        with open(self._data_path, "wb") as out:
            pickle.dump(data, out)

        # Flag the root last, after the pickle has been written.
        edu.mark_prepared(self.root)