Example #1
    def load_datasets(self):
        """
        GSC specifies which files are to be used for training, validation, and testing.

        We assume the data has already been processed using the pre-processing scripts
        here: https://github.com/numenta/nupic.torch/tree/master/examples/gsc
        """
        validation_dataset = PreprocessedDataset(
            cachefilepath=self.data_dir,
            basename="gsc_valid",
            qualifiers=[""],
        )

        test_dataset = PreprocessedDataset(
            cachefilepath=self.data_dir,
            basename="gsc_test_noise",
            qualifiers=["{:02d}".format(int(100 * n)) for n in self.noise_values],
        )
        train_dataset = PreprocessedDataset(
            cachefilepath=self.data_dir,
            basename="gsc_train",
            qualifiers=range(30),
        )

        self.train_loader = DataLoader(
            train_dataset, batch_size=self.batch_size, shuffle=True
        )

        self.validation_loader = DataLoader(
            validation_dataset, batch_size=self.batch_size, shuffle=False
        )

        self.test_loader = DataLoader(
            test_dataset, batch_size=self.batch_size, shuffle=False
        )
Example #2
    def load_datasets(self):
        """
        GSC specifies which files are to be used for training, validation, and testing.

        We assume the data has already been processed using the pre-processing scripts
        here: https://github.com/numenta/nupic.torch/tree/master/examples/gsc
        """
        validation_dataset = ClasswiseDataset(cachefilepath=self.data_dir,
                                              basename="data_valid",
                                              qualifiers=[""])
        self.validation_loader = DataLoader(
            validation_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            drop_last=True,
        )

        self.gen_test_dataset = PreprocessedDataset(
            cachefilepath=self.test_data_dir,
            basename="gsc_test_noise",
            qualifiers=["00"],
            transform=self.subtract_label_transform(),
        )

        self.gen_test_loader = DataLoader(
            self.gen_test_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            drop_last=True,
        )

        self.train_dataset = PreprocessedDataset(
            cachefilepath=self.test_data_dir,
            basename="gsc_train",
            qualifiers=range(30),
            transform=self.subtract_label_transform(),
        )

        self.full_train_loader = DataLoader(self.train_dataset,
                                            batch_size=self.batch_size,
                                            shuffle=True,
                                            drop_last=True)

        self.test_loader = []

        # Build one test loader per class; class files are indexed 1-12
        for class_ in range(12):
            test_dataset = ClasswiseDataset(
                cachefilepath=self.data_dir,
                basename="data_test_0noise",
                qualifiers=[class_ + 1],
            )

            self.test_loader.append(
                DataLoader(
                    test_dataset,
                    batch_size=self.batch_size,
                    shuffle=False,
                    drop_last=True,
                ))
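
Because `test_loader` here is a list with one DataLoader per class, per-class accuracy follows from a simple loop over it. A minimal evaluation sketch, where `model` and `device` are hypothetical stand-ins for a trained network and its device:

import torch

def evaluate_per_class(model, test_loaders, device):
    """Return one accuracy value per class loader."""
    model.eval()
    accuracies = []
    with torch.no_grad():
        for loader in test_loaders:
            correct, total = 0, 0
            for data, target in loader:
                data, target = data.to(device), target.to(device)
                pred = model(data).argmax(dim=1)  # predicted class per sample
                correct += (pred == target).sum().item()
                total += target.numel()
            accuracies.append(correct / max(total, 1))
    return accuracies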
Example #3
def preprocessed_gsc(root, train=True, qualifiers=None, download=True):
    """
    Create train or test dataset from preprocessed GSC data, downloading if
    necessary.

    Warning: Be sure to call dataset.load_next() following each epoch of training.
    Otherwise, no new augmentations will be loaded, and the same exact samples
    will be reused.

    .. note:: To load the preprocessed noise dataset, use noise levels
              `["05", "10", ..., "50"]` as qualifiers on the test dataset.
              Noise level "00" is equivalent to the regular "test" dataset.
    .. seealso:: PreprocessedDataset

    :param root: directory to store or load downloaded data
    :param train: whether to load train or test data
    :param qualifiers: List of qualifiers for each preprocessed file in this dataset.
           If None, `range(30)` will be used for training and "00" for testing.
    :param download: whether to download the data
    """

    root = os.path.expanduser(root)
    if download:
        download_gsc_data(root)

    if train:
        basename = "gsc_train"
        if qualifiers is None:
            qualifiers = range(30)
    else:
        # Load the noisy test dataset; qualifier "00" is the noise-free test set
        basename = "gsc_test_noise"
        if qualifiers is None:
            qualifiers = ["00"]

    dataset = PreprocessedDataset(
        cachefilepath=root,
        basename=basename,
        qualifiers=qualifiers,
    )

    return dataset
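
Per the warning in the docstring, `load_next()` should be called after every epoch so the next cached set of augmentations is loaded. A minimal usage sketch, where `model`, `optimizer`, and `train_one_epoch` are hypothetical stand-ins for an ordinary PyTorch training setup:

train_dataset = preprocessed_gsc("~/data/gsc", train=True)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

for epoch in range(10):
    train_one_epoch(model, train_loader, optimizer)
    # Swap in the next preprocessed augmentation file for the coming epoch
    train_dataset.load_next()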
Example #4
def process_gsc_by_class(data_dir=None):

    if data_dir is None:
        data_dir = "/home/ec2-user/nta/data/"

    out_dir = os.path.join(data_dir, "data_classes")
    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)

    class_min = 1
    class_max = 12

    # Split the 30 preprocessed training files into one file per class
    for k in range(class_min, class_max + 1):
        data_tensor = torch.zeros(0, 1, 32, 32)
        for j in range(30):
            dataset = PreprocessedDataset(cachefilepath=data_dir,
                                          basename="gsc_train",
                                          qualifiers=[j])

            class_indices = np.where(dataset.tensors[1] == k)[0]

            if len(class_indices) > 0:
                data_tensor = torch.cat((data_tensor, torch.Tensor(
                    dataset.tensors[0][class_indices, :, :, :])))

        # Shift labels to the 0-based range used downstream
        labels_tensor = torch.Tensor(data_tensor.shape[0] * [k - 1]).long()

        out_tensor = [data_tensor, labels_tensor]
        with open(os.path.join(out_dir, "data_train_{}.npz".format(k)), "wb") as f:
            torch.save(out_tensor, f)

    # The validation split is saved as a single file covering all classes,
    # matching the single "data_valid" file the loaders expect
    dataset = PreprocessedDataset(cachefilepath=data_dir,
                                  basename="gsc_valid",
                                  qualifiers=[""])

    data_tensor = torch.zeros(0, 1, 32, 32)
    labels = []
    for k in range(class_min, class_max + 1):
        class_indices = np.where(dataset.tensors[1] == k)[0]

        if len(class_indices) > 0:
            data_tensor = torch.cat((data_tensor, torch.Tensor(
                dataset.tensors[0][class_indices, :, :, :])))
            labels.extend(len(class_indices) * [k - 1])

    labels_tensor = torch.Tensor(labels).long()

    out_tensor = [data_tensor, labels_tensor]
    with open(os.path.join(out_dir, "data_valid.npz"), "wb") as f:
        torch.save(out_tensor, f)

    noise_values = [0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]

    # One file per (class, noise level) pair from the preprocessed test data
    for k in range(class_min, class_max + 1):
        for n in noise_values:
            data_tensor = torch.zeros(0, 1, 32, 32)
            dataset = PreprocessedDataset(cachefilepath=data_dir,
                                          basename="gsc_test_noise",
                                          qualifiers=["{:02d}".format(int(100 * n))])

            class_indices = np.where(dataset.tensors[1] == k)[0]

            if len(class_indices) > 0:
                data_tensor = torch.cat((data_tensor, torch.Tensor(
                    dataset.tensors[0][class_indices, :, :, :])))

            if k == 1:
                # Hardcoded sample count for class 1 in the preprocessed test files
                labels_tensor = torch.Tensor(55860 * [k - 1]).long()
            else:
                labels_tensor = torch.Tensor(data_tensor.shape[0] * [k - 1]).long()

            out_tensor = [data_tensor, labels_tensor]
            if n == 0.0:
                tensor_string = "data_test_0noise{}.npz".format(k)
            else:
                tensor_string = "data_test_{}_{}.npz".format(k, n)

            with open(os.path.join(out_dir, tensor_string), "wb") as f:
                torch.save(out_tensor, f)
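
Each file written above holds a [data_tensor, labels_tensor] pair saved with torch.save, so it can be read back directly. A minimal sketch for the class-1 training file (the shape comment is illustrative):

with open(os.path.join(data_dir, "data_classes", "data_train_1.npz"), "rb") as f:
    data_tensor, labels_tensor = torch.load(f)

print(data_tensor.shape)   # torch.Size([N, 1, 32, 32])
print(labels_tensor[:5])   # labels are shifted down by one, so all zeros for class 1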