Example #1
    def download(self):
        if self._check_integrity():
            return
        super(FC100ClassDataset, self).download()

        subfolder = os.path.join(self.root, self.subfolder)
        if not os.path.exists(subfolder):
            os.makedirs(subfolder)

        filename_fine_names = os.path.join(self.root, self.filename_fine_names)
        with open(filename_fine_names, 'r') as f:
            fine_names = json.load(f)

        for split in ['train', 'val', 'test']:
            split_filename_labels = os.path.join(
                subfolder, self.filename_labels.format(split))
            if os.path.isfile(split_filename_labels):
                continue

            data = get_asset(self.folder,
                             self.subfolder,
                             '{0}.json'.format(split),
                             dtype='json')
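            # The split asset lists coarse class names; expand each into
            # [coarse_name, fine_name] pairs using the fine-names mapping.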
            with open(split_filename_labels, 'w') as f:
                labels = [[coarse_name, fine_name] for coarse_name in data
                          for fine_name in fine_names[coarse_name]]
                json.dump(labels, f)
Example #2
    def download(self):
        import zipfile
        import shutil

        if self._check_integrity():
            return

        for name in self.zips_md5:
            zip_filename = '{0}.zip'.format(name)
            filename = os.path.join(self.root, zip_filename)
            if os.path.isfile(filename):
                continue

            url = '{0}/{1}'.format(self.download_url_prefix, zip_filename)
            download_url(url, self.root, zip_filename, self.zips_md5[name])

            with zipfile.ZipFile(filename, 'r') as f:
                f.extractall(self.root)

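        # Pack the extracted images into a single HDF5 file: one group per
        # archive name and one (num_images, 105, 105) uint8 dataset per character.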
        filename = os.path.join(self.root, self.filename)
        with h5py.File(filename, 'w') as f:
            for name in self.zips_md5:
                group = f.create_group(name)

                alphabets = list_dir(os.path.join(self.root, name))
                characters = [(name, alphabet, character) for alphabet in alphabets
                    for character in list_dir(os.path.join(self.root, name, alphabet))]

                split = 'train' if name == 'images_background' else 'test'
                labels_filename = os.path.join(self.root,
                    self.filename_labels.format('', split))
                with open(labels_filename, 'w') as f_labels:
                    labels = sorted(characters)
                    json.dump(labels, f_labels)

                for _, alphabet, character in characters:
                    filenames = glob.glob(os.path.join(self.root, name,
                        alphabet, character, '*.png'))
                    dataset = group.create_dataset('{0}/{1}'.format(alphabet,
                        character), (len(filenames), 105, 105), dtype='uint8')

                    for i, char_filename in enumerate(filenames):
                        image = Image.open(char_filename, mode='r').convert('L')
                        dataset[i] = ImageOps.invert(image)

                shutil.rmtree(os.path.join(self.root, name))

        for split in ['train', 'val', 'test']:
            filename = os.path.join(self.root, self.filename_labels.format(
                'vinyals_', split))
            data = get_asset(self.folder, '{0}.json'.format(split), dtype='json')

            with open(filename, 'w') as f:
                labels = sorted([('images_{0}'.format(name), alphabet, character)
                    for (name, alphabets) in data.items()
                    for (alphabet, characters) in alphabets.items()
                    for character in characters])
                json.dump(labels, f)
Example #3
    def download(self):
        import zipfile
        import shutil
        import glob
        from tqdm import tqdm

        if self._check_integrity():
            return

        zip_filename = os.path.join(self.root, self.zip_filename)
        if not os.path.isfile(zip_filename):
            download_file_from_google_drive(self.gdrive_id,
                                            self.root,
                                            self.zip_filename,
                                            md5=self.zip_md5)

        zip_foldername = os.path.join(self.root, self.image_folder)
        if not os.path.isdir(zip_foldername):
            with zipfile.ZipFile(zip_filename, 'r') as f:
                for member in tqdm(f.infolist(), desc='Extracting '):
                    try:
                        f.extract(member, self.root)
                    except zipfile.BadZipFile:
                        print('Error: Zip file is corrupted')

        for split in ['train', 'val', 'test']:
            filename = os.path.join(self.root, self.filename.format(split))
            if os.path.isfile(filename):
                continue

            labels = get_asset(self.folder, '{0}.json'.format(split))
            labels_filename = os.path.join(self.root,
                                           self.filename_labels.format(split))
            with open(labels_filename, 'w') as f:
                json.dump(labels, f)

            image_folder = os.path.join(zip_foldername, split)

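            # Store each class as a variable-length uint8 dataset of raw
            # (encoded) PNG bytes under the 'datasets' group, rather than
            # decoded image arrays.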
            with h5py.File(filename, 'w') as f:
                group = f.create_group('datasets')
                dtype = h5py.special_dtype(vlen=np.uint8)
                for i, label in enumerate(tqdm(labels, desc=filename)):
                    images = glob.glob(
                        os.path.join(image_folder, label, '*.png'))
                    images.sort()
                    dataset = group.create_dataset(label, (len(images), ),
                                                   dtype=dtype)
                    for j, image in enumerate(images):
                        with open(image, 'rb') as image_file:
                            array = bytearray(image_file.read())
                            dataset[j] = np.asarray(array, dtype=np.uint8)

        if os.path.isdir(zip_foldername):
            shutil.rmtree(zip_foldername)
Example #4
    def download(self):
        import tarfile
        import shutil
        import glob
        from tqdm import tqdm

        if self._check_integrity():
            return

        tgz_filename = os.path.join(self.root, self.tgz_filename)
        if not os.path.isfile(tgz_filename):
            download_file_from_google_drive(self.gdrive_id,
                                            self.root,
                                            self.tgz_filename,
                                            md5=self.tgz_md5)

        with tarfile.open(tgz_filename, 'r') as f:
            f.extractall(self.root)
        image_folder = os.path.join(self.root, self.image_folder)

        for split in ['train', 'val', 'test']:
            filename = os.path.join(self.root, self.filename.format(split))
            if os.path.isfile(filename):
                continue

            labels = get_asset(self.folder, '{0}.json'.format(split))
            labels_filename = os.path.join(self.root,
                                           self.filename_labels.format(split))
            with open(labels_filename, 'w') as f:
                json.dump(labels, f)

            with h5py.File(filename, 'w') as f:
                group = f.create_group('datasets')
                dtype = h5py.special_dtype(vlen=np.uint8)
                for i, label in enumerate(tqdm(labels, desc=filename)):
                    images = glob.glob(
                        os.path.join(image_folder, label, '*.jpg'))
                    images.sort()
                    dataset = group.create_dataset(label, (len(images), ),
                                                   dtype=dtype)
                    for j, image in enumerate(images):
                        with open(image, 'rb') as image_file:
                            array = bytearray(image_file.read())
                            dataset[j] = np.asarray(array, dtype=np.uint8)

        tar_folder, _ = os.path.splitext(tgz_filename)
        if os.path.isdir(tar_folder):
            shutil.rmtree(tar_folder)

        attributes_filename = os.path.join(self.root, 'attributes.txt')
        if os.path.isfile(attributes_filename):
            os.remove(attributes_filename)
Example #5
    def download(self):

        if self._check_integrity():
            return

        from sklearn.datasets import fetch_openml

        data = fetch_openml(data_id=self.open_ml_id)
        features = data.data
        targets = data.target

        os.makedirs(self.root, exist_ok=True)

        # For each meta-data split, load its label list, build a mask selecting the
        # data points whose target belongs to that split, then write the selected
        # features and targets to an HDF5 file.
        for s, split in enumerate(['train', 'val', 'test']):
            labels_assets_split = get_asset(self.folder,
                                            '{0}.json'.format(split))

            is_in_split = [t in labels_assets_split for t in targets]
            features_split = features.loc[is_in_split]
            targets_split = targets.loc[is_in_split]
            assert targets_split.shape[0] == features_split.shape[0]

            unique_targets_split = np.unique(targets_split)
            if len(labels_assets_split) > unique_targets_split.shape[0]:
                print(
                    f"unique set of labels ({(unique_targets_split.shape[0])}) is smaller than set of labels "
                    f"given by assets ({len(labels_assets_split)}). Proceeding with unique set of labels."
                )

            # write unique targets to json file.
            labels_filename = os.path.join(self.root,
                                           self.filename_labels.format(split))
            with open(labels_filename, 'w') as f:
                json.dump(unique_targets_split.tolist(), f)

            # write data (features and class labels)
            filename = os.path.join(self.root, self.filename.format(split))
            with h5py.File(filename, 'w') as f:
                group = f.create_group('datasets')

                for i, label in enumerate(
                        tqdm(unique_targets_split, desc=filename)):
                    data_class = features_split.loc[targets_split == label]
                    group.create_dataset(label, data=data_class)
Example #6
    def download(self):
        if self._check_integrity():
            return
        super(CIFARFSClassDataset, self).download()

        subfolder = os.path.join(self.root, self.subfolder)
        if not os.path.exists(subfolder):
            os.makedirs(subfolder)

        for split in ['train', 'val', 'test']:
            split_filename_labels = os.path.join(
                subfolder, self.filename_labels.format(split))
            if os.path.isfile(split_filename_labels):
                continue

            data = get_asset(self.folder,
                             self.subfolder,
                             '{0}.json'.format(split),
                             dtype='json')
            with open(split_filename_labels, 'w') as f:
                json.dump(data, f)
Example #7
def get_task_id_splits(meta_split):
    return get_asset(TCGA.folder, '{}.json'.format(meta_split), dtype='json')
Example #8
def get_task_variables():
    return get_asset(TCGA.folder, 'task_variables.json', dtype='json')
Example #9
def get_cancers():
    return get_asset(TCGA.folder, 'cancers.json', dtype='json')
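Examples #7-#9 are thin wrappers around the same call. For reference, the sketch below shows a minimal get_asset that would satisfy these call sites; it assumes the per-dataset JSON files ship in an assets/ directory next to the module and only handles JSON, and the helper name get_asset_path is illustrative, so the real implementation may differ in details.

import json
import os


def get_asset_path(*args):
    # Hypothetical helper: resolve a path under the module-local `assets/` directory.
    basedir = os.path.dirname(__file__)
    return os.path.join(basedir, 'assets', *args)


def get_asset(*args, dtype=None):
    # Infer the asset type from the file extension when `dtype` is not given,
    # then load the file (this sketch only supports JSON).
    filename = get_asset_path(*args)
    if not os.path.isfile(filename):
        raise IOError('{} not found'.format(filename))
    if dtype is None:
        _, dtype = os.path.splitext(filename)
        dtype = dtype[1:]
    if dtype == 'json':
        with open(filename, 'r') as f:
            return json.load(f)
    raise NotImplementedError('Unknown asset type: {}'.format(dtype))

Under that assumption, get_asset(TCGA.folder, 'cancers.json', dtype='json') would resolve to assets/<TCGA.folder>/cancers.json.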
Example #10
    def download(self, chunksize=100):
        try:
            import gzip
            import shutil
            import pandas as pd
            from six.moves import urllib
            import academictorrents as at
        except ImportError as exception:
            raise ImportError('{0}. To use the TCGA dataset, you need to '
                              'install the necessary dependencies with '
                              '`pip install torchmeta[tcga]`.'.format(exception))

        clinical_matrices_folder = os.path.join(self.root, 'clinicalMatrices')
        if not os.path.exists(clinical_matrices_folder):
            os.makedirs(clinical_matrices_folder)

        for cancer in self.cancers:
            filename = self.clinical_matrix_filename.format(cancer)
            rawpath = os.path.join(clinical_matrices_folder,
                                   '{0}.gz'.format(filename))
            filepath = os.path.join(clinical_matrices_folder,
                                    '{0}.tsv'.format(filename))

            if os.path.isfile(filepath):
                continue

            if not os.path.exists(rawpath):
                print('Downloading `{0}.gz`...'.format(filename))
                url = self.clinical_matrix_url.format(cancer)
                urllib.request.urlretrieve(url, rawpath)

            print('Extracting `{0}.gz`...'.format(filename))
            with gzip.open(rawpath, 'rb') as gzf:
                with open(filepath, 'wb') as f:
                    shutil.copyfileobj(gzf, f)

        gene_expression_file = os.path.join(self.root,
                                            self.gene_expression_filename)
        if not os.path.isfile(gene_expression_file):
            from tqdm import tqdm
            print('Downloading `{0}` using `academictorrents`...'.format(
                self.gene_expression_filename))
            csv_file = at.get(self.gene_expression_torrent,
                              datastore=self.root)
            print('Downloaded to: `{0}`'.format(csv_file))

            print(
                'Converting TCGA CSV dataset to HDF5. This may take a while, '
                'but only happens on the first run.')
            reader = pd.read_csv(csv_file,
                                 compression='gzip',
                                 sep='\t',
                                 header=0,
                                 index_col=0,
                                 chunksize=chunksize)
            shape = (10459, 20530)

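            # The CSV stores genes as rows and samples as columns; each chunk of
            # rows is written transposed into the (samples, genes) HDF5 dataset,
            # so the full matrix never has to fit in memory at once.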
            with tqdm(total=shape[1]) as pbar:
                with h5py.File(gene_expression_file, 'w') as f:
                    dataset = f.create_dataset('expression_data',
                                               shape=shape,
                                               dtype='f4')
                    gene_ids = []
                    for idx, chunk in enumerate(reader):
                        slice_ = slice(idx * chunksize, (idx + 1) * chunksize)
                        dataset[:, slice_] = chunk.T
                        gene_ids.extend(chunk.index)
                        pbar.update(chunk.shape[0])
                    all_sample_ids = chunk.columns.tolist()

            gene_ids_file = os.path.join(self.root, 'gene_ids.json')
            with open(gene_ids_file, 'w') as f:
                json.dump(gene_ids, f)

            all_sample_ids_file = os.path.join(self.root,
                                               'all_sample_ids.json')
            with open(all_sample_ids_file, 'w') as f:
                json.dump(all_sample_ids, f)

            if os.path.isfile(csv_file):
                os.remove(csv_file)

            print('Done')

        self._process_clinical_matrices()

        # Create label files
        for split in ['train', 'val', 'test']:
            filename = os.path.join(self.root,
                                    self.filename_tasks.format(split))
            data = get_asset(self.folder,
                             '{0}.json'.format(split),
                             dtype='json')

            with open(filename, 'w') as f:
                labels = sorted([key.split('|', 1) for key in data])
                json.dump(labels, f)

        # Clean up
        for cancer in self.cancers:
            filename = self.clinical_matrix_filename.format(cancer)
            rawpath = os.path.join(clinical_matrices_folder,
                                   '{0}.gz'.format(filename))
            if os.path.isfile(rawpath):
                os.remove(rawpath)
Example #11
    def download(self, normalize):

        if self._check_integrity():
            return

        from sklearn.datasets import fetch_openml

        data = fetch_openml(data_id=self.open_ml_id)
        features = data.data
        targets = data.target

        os.makedirs(self.root, exist_ok=True)

        # For each meta-data split, load its label list, build a mask selecting the
        # data points whose target belongs to that split, then write the selected
        # features and targets to an HDF5 file.
        for s, split in enumerate(['train', 'val', 'test']):
            targets_assets_split = get_asset(self.folder,
                                             '{0}.json'.format(split))

            is_in_split = [t in targets_assets_split for t in targets]
            features_split = features.loc[is_in_split]
            targets_split = targets.loc[is_in_split]
            assert targets_split.shape[0] == features_split.shape[0]

            unique_targets_split = np.unique(targets_split)
            if len(targets_assets_split) > unique_targets_split.shape[0]:
                print(
                    f"unique set of labels ({(unique_targets_split.shape[0])}) is smaller than set of labels "
                    f"given by assets ({len(targets_assets_split)}). Proceeding with unique set of labels."
                )

            # write unique targets to json file.
            labels_filename = os.path.join(self.root,
                                           self.filename_labels.format(split))
            with open(labels_filename, 'w') as f:
                json.dump(unique_targets_split.tolist(), f)

            # normalize between 0 and 1 with stats from 'train' split only
            if split == 'train':
                lower, upper = np.zeros(features.shape[1]), np.ones(
                    features.shape[1])
                if normalize:
                    lower = np.min(features_split, axis=0)
                    upper = np.max(features_split, axis=0)
                self._lower_upper = {
                    'lower': lower.tolist(),
                    'upper': upper.tolist()
                }
                lower_upper_filename = os.path.join(self.root,
                                                    self.filename_lower_upper)
                with open(lower_upper_filename, 'w') as f:
                    json.dump(self._lower_upper, f)

            lower_upper = self.lower_upper
            lower = np.array(lower_upper['lower'])
            upper = np.array(lower_upper['upper'])
            features_split = np.true_divide((features_split - lower),
                                            (upper - lower))

            # write data (features and class labels)
            filename = os.path.join(self.root, self.filename.format(split))
            with h5py.File(filename, 'w') as f:
                group = f.create_group('datasets')

                for i, label in enumerate(
                        tqdm(unique_targets_split, desc=filename)):
                    data_class = features_split.loc[targets_split == label]
                    group.create_dataset(label, data=data_class)
Example #12
    def download(self):
        import tarfile

        if self._check_integrity():
            return

        chunkSize = 1024
        r = requests.get(self.train_tar_url, stream=True)
        with open(self.root + '/cars_train.tgz', 'wb') as f:
            pbar = tqdm(unit="B", total=int(r.headers['Content-Length']))
            for chunk in r.iter_content(chunk_size=chunkSize):
                if chunk:  # filter out keep-alive new chunks
                    pbar.update(len(chunk))
                    f.write(chunk)

        r = requests.get(self.devkit_tar_url, stream=True)
        with open(self.root + '/car_devkit.tgz', 'wb') as f:
            pbar = tqdm(unit="B", total=int(r.headers['Content-Length']))
            for chunk in r.iter_content(chunk_size=chunkSize):
                if chunk:  # filter out keep-alive new chunks
                    pbar.update(len(chunk))
                    f.write(chunk)

        filename = os.path.join(self.root, 'cars_train.tgz')
        with tarfile.open(filename, 'r') as f:
            f.extractall(self.root)

        filename = os.path.join(self.root, 'car_devkit.tgz')
        with tarfile.open(filename, 'r') as f:
            f.extractall(self.root)

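        # Read the devkit annotations: a bounding box and a class label for every
        # training image, then group image filenames by class name.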
        annos_path = os.path.join(self.root, 'devkit', 'cars_train_annos.mat')
        cars_meta_path = os.path.join(self.root, 'devkit', 'cars_meta.mat')

        annos = loadmat(annos_path)['annotations'][0]
        cars_meta = loadmat(cars_meta_path)['class_names'][0]
        cars_meta = [c[0] for c in cars_meta]

        names_to_bboxes = {}
        clss_to_names = collections.defaultdict(list)

        for xmin, ymin, xmax, ymax, label, filename in annos:
            bbox = (int(xmin[0][0]), int(ymin[0][0]), int(xmax[0][0]),
                    int(ymax[0][0]))
            label = int(label[0][0]) - 1
            filename = str(filename[0])
            names_to_bboxes[filename] = bbox
            clss_to_names[cars_meta[label]].append(filename)

        for split in ['train', 'val', 'test']:
            filename = os.path.join(self.root, self.filename.format(split))
            labels = get_asset(self.folder, '{}.json'.format(split))
            labels_filename = os.path.join(self.root,
                                           self.filename_labels.format(split))

            with open(labels_filename, 'w') as f:
                json.dump(labels, f)

            with h5py.File(filename, 'w') as f:
                group = f.create_group('datasets')
                for i, label in enumerate(tqdm(labels, desc=filename)):
                    images = []
                    for file in clss_to_names[label]:
                        file_path = os.path.join(self.root, 'cars_train', file)
                        img = Image.open(file_path).convert('RGB')
                        bbox = names_to_bboxes[file]
                        img = np.asarray(img.crop(bbox).resize((84, 84)),
                                         dtype=np.uint8)
                        images.append(img)

                    dataset = group.create_dataset(label,
                                                   (len(images), 84, 84, 3))

                    for j, image in enumerate(images):
                        dataset[j] = image
Example #13
    def download(self, normalize):

        if self._check_integrity():
            return

        from sklearn.datasets import fetch_openml

        data = fetch_openml(data_id=self.open_ml_id)
        features = data.data
        targets = data.target

        os.makedirs(self.root, exist_ok=True)

        # For each meta-data split, load its label list, build a mask selecting the
        # data points whose target belongs to that split, then write the selected
        # features and targets to an HDF5 file.
        for s, split in enumerate(['train', 'val', 'test']):
            targets_assets_split = get_asset(self.folder,
                                             '{0}.json'.format(split))

            is_in_split = [t in targets_assets_split for t in targets]
            features_split = features.loc[is_in_split]
            targets_split = targets.loc[is_in_split]
            assert targets_split.shape[0] == features_split.shape[0]

            unique_targets_split = np.sort(np.unique(targets_split))
            if len(targets_assets_split) > unique_targets_split.shape[0]:
                print(
                    f"unique set of labels ({(unique_targets_split.shape[0])}) is smaller than set of labels "
                    f"given by assets ({len(targets_assets_split)}). Proceeding with unique set of labels."
                )

            # write unique targets to json file.
            labels_filename = os.path.join(self.root,
                                           self.filename_labels.format(split))
            with open(labels_filename, 'w') as f:
                json.dump(unique_targets_split.tolist(), f)

            # normalize to zero mean and standard deviation 1 with stats from 'train' split only
            if split == 'train':
                mean, std = np.zeros(features.shape[1]), np.ones(
                    features.shape[1])
                if normalize:
                    mean = np.mean(features_split, axis=0)
                    std = np.std(features_split, axis=0)

                self._mean_std = {'mean': mean.tolist(), 'std': std.tolist()}
                mean_std_filename = os.path.join(self.root,
                                                 self.filename_mean_std)
                with open(mean_std_filename, 'w') as f:
                    json.dump(self._mean_std, f)

            mean_std = self.mean_std
            mean = np.array(mean_std['mean'])
            std = np.array(mean_std['std'])
            features_split = (features_split - mean) / (std + 1e-10)

            # write data (features and class labels)
            filename = os.path.join(self.root, self.filename.format(split))
            with h5py.File(filename, 'w') as f:
                group = f.create_group('datasets')

                for i, label in enumerate(
                        tqdm(unique_targets_split, desc=filename)):
                    data_class = features_split.loc[targets_split == label]
                    group.create_dataset(label, data=data_class)
Example #14
    def download(self):
        import tarfile

        if self._check_integrity():
            return

        chunkSize = 1024
        r = requests.get(self.tar_url, stream=True)
        with open(self.root + '/fgvc-aircraft-2013b.tar.gz', 'wb') as f:
            pbar = tqdm(unit="B", total=int(r.headers['Content-Length']))
            for chunk in r.iter_content(chunk_size=chunkSize):
                if chunk:  # filter out keep-alive new chunks
                    pbar.update(len(chunk))
                    f.write(chunk)

        filename = os.path.join(self.root, 'fgvc-aircraft-2013b.tar.gz')
        with tarfile.open(filename, 'r') as f:
            f.extractall(self.root)

        # Cropping images with bounding box same as meta-dataset.
        bboxes_path = os.path.join(self.root, 'fgvc-aircraft-2013b', 'data',
                                   'images_box.txt')
        with open(bboxes_path, 'r') as f:
            names_to_bboxes = [
                line.split('\n')[0].split(' ') for line in f.readlines()
            ]
            names_to_bboxes = dict(
                (name, tuple(map(int, (xmin, ymin, xmax, ymax))))
                for name, xmin, ymin, xmax, ymax in names_to_bboxes)

        # Retrieve mapping from filename to cls
        cls_trainval_path = os.path.join(self.root, 'fgvc-aircraft-2013b',
                                         'data', 'images_variant_trainval.txt')
        with open(cls_trainval_path, 'r') as f:
            filenames_to_clsnames = [
                line.split('\n')[0].split(' ', 1) for line in f.readlines()
            ]

        cls_test_path = os.path.join(self.root, 'fgvc-aircraft-2013b', 'data',
                                     'images_variant_test.txt')
        with open(cls_test_path, 'r') as f:
            filenames_to_clsnames += [
                line.split('\n')[0].split(' ', 1) for line in f.readlines()
            ]

        filenames_to_clsnames = dict(filenames_to_clsnames)
        clss_to_names = collections.defaultdict(list)
        for filename, cls in filenames_to_clsnames.items():
            clss_to_names[cls].append(filename)

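        # For each split, dump its class list to JSON and write the
        # bounding-box-cropped, 32x32-resized images of every class into an
        # HDF5 group.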
        for split in ['train', 'val', 'test']:
            filename = os.path.join(self.root, self.filename.format(split))
            labels = get_asset(self.folder, '{}.json'.format(split))
            labels_filename = os.path.join(self.root,
                                           self.filename_labels.format(split))

            with open(labels_filename, 'w') as f:
                json.dump(labels, f)

            with h5py.File(filename, 'w') as f:
                group = f.create_group('datasets')
                for i, label in enumerate(tqdm(labels, desc=filename)):
                    images = []
                    for file in clss_to_names[label]:
                        file_path = os.path.join(self.root,
                                                 'fgvc-aircraft-2013b', 'data',
                                                 'images',
                                                 '{}.jpg'.format(file))
                        img = Image.open(file_path)
                        bbox = names_to_bboxes[file]
                        img = np.asarray(img.crop(bbox).resize((32, 32)),
                                         dtype=np.uint8)
                        images.append(img)

                    dataset = group.create_dataset(label,
                                                   (len(images), 32, 32, 3))

                    for j, image in enumerate(images):
                        dataset[j] = image