Пример #1
0
    def getDataShape(self, subset):
        '''
        Returns the shape of the input data for the given subset

        Parameters
        ----------
        subset : int
            The subset indicator. Integer assigned by global variables in opendeep.data.dataset.py

        Returns
        -------
        tuple
            Return the list of shapes of this dataset's subset sequences. This will separate out the shapes for each
            sequence individually as items in the list, while the dataset is still concatenated into a single matrix.
            Returns None (after logging an error) when the subset is not recognized.
        '''
        # Compare by value rather than identity: the subset indicators are integers, and `is` only
        # agrees with `==` for them by accident of interpreter caching.
        if subset == datasets.TRAIN:
            return self.train_shapes
        if subset == datasets.VALID:
            return self.valid_shapes
        if subset == datasets.TEST:
            return self.test_shapes
        # nothing matched, so the subset is unknown
        log.error('Subset %s not recognized!',
                  datasets.get_subset_strings(subset))
        return None
Пример #2
0
    def __init__(self,
                 dataset=None,
                 subset=None,
                 batch_size=1,
                 minimum_batch_size=1,
                 rng=None):
        """
        Work out the sequence of batch sizes needed to walk once over a subset of a dataset.

        Parameters
        ----------
        dataset : object
            The dataset to iterate over. Its getDataShape(subset) result is indexed with [0] to get the
            number of examples.
        subset : int
            The subset indicator (datasets.TRAIN, datasets.VALID, or datasets.TEST).
        batch_size : int
            The number of examples in every full batch.
        minimum_batch_size : int
            The smallest leftover batch that is still worth iterating over; a smaller remainder is dropped.
        rng : object
            Not used by this constructor.
        """
        # make sure the subset is recognized (an unknown subset is only logged, not rejected)
        if subset not in [datasets.TRAIN, datasets.VALID, datasets.TEST]:
            log.error(
                'Dataset subset %s not recognized, try TRAIN, VALID, or TEST',
                datasets.get_subset_strings(subset))
        self.dataset = dataset
        self.subset = subset
        self.batch_size = batch_size
        self.minimum_batch_size = minimum_batch_size

        # determine the number of possible iterations given the batch size, minimum batch size, dataset, and subset
        self.data_len = self.dataset.getDataShape(self.subset)[0]
        # divmod keeps the batch count an integer on both Python 2 and 3; plain `/` yields a float on
        # Python 3, which cannot be used to repeat a list.
        full_batches, remainder = divmod(self.data_len, self.batch_size)
        self.iterations = full_batches * [batch_size]

        # keep the trailing partial batch only when it is large enough
        if remainder >= self.minimum_batch_size:
            self.iterations.append(remainder)

        self.iteration_index = 0
Пример #3
0
 def getDataShape(self, subset):
     '''
     :param subset: integer
     The subset indicator to look up (datasets.TRAIN, datasets.VALID, or datasets.TEST)
     :return:
     The stored train_shapes, valid_shapes, or test_shapes attribute for the subset,
     or None (after logging an error) when the subset is not recognized
     '''
     # Compare by value rather than identity: `is` on integers only agrees with `==` by accident
     # of interpreter caching.
     if subset == datasets.TRAIN:
         return self.train_shapes
     if subset == datasets.VALID:
         return self.valid_shapes
     if subset == datasets.TEST:
         return self.test_shapes
     # nothing matched, so the subset is unknown
     log.error('Subset %s not recognized!', datasets.get_subset_strings(subset))
     return None
Пример #4
0
 def getDataShape(self, subset):
     '''
     :param subset: integer
     The subset indicator to look up (datasets.TRAIN, datasets.VALID, or datasets.TEST)
     :return: tuple
     Return the shape of this dataset's subset in a NxD tuple where N=#examples and D=dimensionality,
     or None (after logging an error) when the subset is not recognized
     '''
     if subset not in [datasets.TRAIN, datasets.VALID, datasets.TEST]:
         log.error('Subset %s not recognized!', datasets.get_subset_strings(subset))
         return None
     # Compare by value so the dispatch agrees with the equality-based membership test above;
     # `is` on integers only works by accident of interpreter caching.
     if subset == datasets.TRAIN:
         return self._train_shape
     elif subset == datasets.VALID:
         return self._valid_shape
     elif subset == datasets.TEST:
         return self._test_shape
     else:
         # defensive fallback: not expected to be reached once the guard above has passed
         log.critical('No getDataShape method implemented for %s for subset %s!',
                      str(type(self)),
                      datasets.get_subset_strings(subset))
         raise NotImplementedError()
Пример #5
0
 def getDataShape(self, subset):
     '''
     :param subset: integer
     The subset indicator to look up (datasets.TRAIN, datasets.VALID, or datasets.TEST)
     :return: tuple
     Return the shape of this dataset's subset in a NxD tuple where N=#examples and D=dimensionality,
     or None (after logging an error) when the subset is not recognized
     '''
     if subset not in [datasets.TRAIN, datasets.VALID, datasets.TEST]:
         log.error('Subset %s not recognized!', datasets.get_subset_strings(subset))
         return None
     # Value comparison keeps this dispatch in step with the membership guard, which also tests
     # equality; identity checks on integers depend on interpreter caching.
     if subset == datasets.TRAIN:
         return self._train_shape
     elif subset == datasets.VALID:
         return self._valid_shape
     elif subset == datasets.TEST:
         return self._test_shape
     else:
         # defensive fallback: not expected to be reached once the guard above has passed
         log.critical('No getDataShape method implemented for %s for subset %s!',
                      str(type(self)),
                      datasets.get_subset_strings(subset))
         raise NotImplementedError()
Пример #6
0
 def hasSubset(self, subset):
     '''
     :param subset: integer
     The subset indicator to check (datasets.TRAIN, datasets.VALID, or datasets.TEST)
     :return: boolean
     True when the subset is one of the three known indicators; otherwise an error is logged
     and False is returned
     '''
     known_subsets = (datasets.TRAIN, datasets.VALID, datasets.TEST)
     if subset in known_subsets:
         # every recognized subset is available
         return True
     log.error('Subset %s not recognized!', datasets.get_subset_strings(subset))
     return False
Пример #7
0
 def hasSubset(self, subset):
     '''
     :param subset: integer
     One of datasets.TRAIN, datasets.VALID, or datasets.TEST
     :return: boolean
     False (with an error logged) for an unknown indicator, True for any of the three known ones
     '''
     recognized = subset in (datasets.TRAIN, datasets.VALID, datasets.TEST)
     if not recognized:
         log.error('Subset %s not recognized!', datasets.get_subset_strings(subset))
     # all three known subsets are present, so recognition is the only condition
     return recognized
Пример #8
0
 def __init__(self,
              dataset,
              subset=datasets.TRAIN,
              batch_size=1,
              minimum_batch_size=1,
              rng=None):
     '''
     Build the iterator by handing every argument to the parent constructor, and log at debug
     level how long construction took.

     :param dataset: the dataset to iterate over
     :param subset: the subset indicator; defaults to datasets.TRAIN
     :param batch_size: forwarded unchanged to the parent constructor
     :param minimum_batch_size: forwarded unchanged to the parent constructor
     :param rng: forwarded unchanged to the parent constructor
     '''
     started = time.time()
     log.debug('Initializing a %s sequential iterator over %s',
               str(type(dataset)), datasets.get_subset_strings(subset))
     # NOTE(review): super(self.__class__, self) recurses endlessly if this class is ever
     # subclassed; naming the class explicitly would avoid that - confirm and fix where the
     # class name is known.
     super(self.__class__, self).__init__(dataset, subset, batch_size,
                                          minimum_batch_size, rng)
     elapsed = time.time() - started
     log.debug('iterator took %s to make' %
               make_time_units_string(elapsed))
Пример #9
0
    def getDataShape(self, subset):
        '''
        Returns the shape of the input data for the given subset

        Parameters
        ----------
        subset : int
            The subset indicator. Integer assigned by global variables in opendeep.data.dataset.py

        Returns
        -------
        tuple
            Return the shape of this dataset's subset in a (N, D) tuple where N=#examples and D=dimensionality.
            Returns None (after logging an error) when the subset is not recognized.
        '''
        # Compare by value rather than identity: the subset indicators are integers, and `is` only
        # agrees with `==` for them by accident of interpreter caching.
        if subset == datasets.TRAIN:
            return self._train_shape
        elif subset == datasets.VALID:
            return self._valid_shape
        elif subset == datasets.TEST:
            return self._test_shape
        else:
            log.error('Subset %s not recognized!', datasets.get_subset_strings(subset))
            return None
Пример #10
0
    def getSubset(self, subset):
        """
        Returns the (x, y) pair of shared variables for the given train, validation, or test subset.

        Parameters
        ----------
        subset : int
            The subset indicator. Integer assigned by global variables in opendeep.data.dataset.py

        Returns
        -------
        tuple
            (x, y) tuple of shared variables holding the dataset input and label, or (None, None) if the
            subset is not recognized (an error is logged in that case).
        """
        # Compare by value rather than identity: the subset indicators are integers, and `is` only
        # agrees with `==` for them by accident of interpreter caching.
        if subset == datasets.TRAIN:
            return self.train_X, self.train_Y
        elif subset == datasets.VALID:
            return self.valid_X, self.valid_Y
        elif subset == datasets.TEST:
            return self.test_X, self.test_Y
        else:
            log.error('Subset %s not recognized!', datasets.get_subset_strings(subset))
            return None, None
Пример #11
0
    def _get_givens_subset(self, subset, batch_slice):
        """
        This translates a batch slice of start and end indices into the actual data from the given subset.

        Parameters
        ----------
        subset : int
            The subset to use - determined in opendeep.data.datasets as TRAIN, VALID, or TEST attributes.
        batch_slice : symbolic slice
            The symbolic slice to grab from the data.

        Returns
        -------
        OrderedDict
            The givens to provide to a function where it sets the input variable to the actual batch representation
            of data from the dataset: (input_variable: data[batch]). None is returned (and a warning logged)
            when the dataset has no data for the subset.

        Raises
        ------
        AssertionError
            If the model declares targets but the dataset provides no labels for the subset.
        """
        # translate the data_idx into the givens for the model
        # first get the lists of input variables the model requires - inputs and targets
        model_inputs = raise_to_list(self.model.get_inputs())
        model_targets = raise_to_list(self.model.get_targets())

        # fetch the subset once and reuse it for both the existence check and the slicing
        data, labels = self.dataset.getSubset(subset)
        if data is None:
            log.warning("Dataset doesn't have subset %s" % get_subset_strings(subset))
            return None

        # create the givens for the input function as pairs of (input_variable: sliced_data)
        givens = OrderedDict(zip(model_inputs, [data[batch_slice]]))
        # include labels as well if they are required by the model
        if model_targets is not None and len(model_targets) > 0:
            if labels is None:
                log.error("No labels in the dataset!")
                # call-style raise is valid on both Python 2 and Python 3
                raise AssertionError("No labels in the dataset!")
            givens.update(OrderedDict(zip(model_targets, [labels[batch_slice]])))

        return givens
Пример #12
0
    def getDataShape(self, subset):
        '''
        Returns the shape of the input data for the given subset

        Parameters
        ----------
        subset : int
            The subset indicator. Integer assigned by global variables in opendeep.data.dataset.py

        Returns
        -------
        tuple
            Return the list of shapes of this dataset's subset sequences. This will separate out the shapes for each
            sequence individually as items in the list, while the dataset is still concatenated into a single matrix.
            Returns None (after logging an error) when the subset is not recognized.
        '''
        # Value comparison replaces the identity checks: `is` on integers depends on interpreter
        # caching, and a value that is equal but not identical would otherwise match no branch.
        if subset == datasets.TRAIN:
            return self.train_shapes
        if subset == datasets.VALID:
            return self.valid_shapes
        if subset == datasets.TEST:
            return self.test_shapes
        # nothing matched, so the subset is unknown
        log.error('Subset %s not recognized!', datasets.get_subset_strings(subset))
        return None
Пример #13
0
    def getDataShape(self, subset):
        '''
        Returns the shape of the input data for the given subset

        Parameters
        ----------
        subset : int
            The subset indicator. Integer assigned by global variables in opendeep.data.dataset.py

        Returns
        -------
        tuple
            Return the shape of this dataset's subset in a (N, D) tuple where N=#examples and D=dimensionality.
            Returns None (after logging an error) when the subset is not recognized.
        '''
        # Value comparison replaces the identity checks: `is` on integers depends on interpreter
        # caching rather than on the values being equal.
        if subset == datasets.TRAIN:
            return self._train_shape
        elif subset == datasets.VALID:
            return self._valid_shape
        elif subset == datasets.TEST:
            return self._test_shape
        else:
            log.error('Subset %s not recognized!',
                      datasets.get_subset_strings(subset))
            return None
Пример #14
0
    def getSubset(self, subset):
        """
        Returns the (x, y) pair of shared variables for the given train, validation, or test subset.

        Parameters
        ----------
        subset : int
            The subset indicator. Integer assigned by global variables in opendeep.data.dataset.py

        Returns
        -------
        tuple
            (x, y) tuple of shared variables holding the dataset input and label, or (None, None) if the
            subset is not recognized (an error is logged in that case).
        """
        # Value comparison replaces the identity checks: `is` on integers depends on interpreter
        # caching rather than on the values being equal.
        if subset == datasets.TRAIN:
            return self.train_X, self.train_Y
        elif subset == datasets.VALID:
            return self.valid_X, self.valid_Y
        elif subset == datasets.TEST:
            return self.test_X, self.test_Y
        else:
            log.error('Subset %s not recognized!',
                      datasets.get_subset_strings(subset))
            return None, None
Пример #15
0
    def __init__(self,
                 dataset,
                 subset=datasets.TRAIN,
                 batch_size=1,
                 minimum_batch_size=1,
                 rng=None):
        """
        Build an iterator that visits the examples of a subset in shuffled order.

        Parameters
        ----------
        dataset : object
            The dataset to iterate over.
        subset : int
            The subset indicator; defaults to datasets.TRAIN.
        batch_size : int
            Forwarded unchanged to the parent constructor.
        minimum_batch_size : int
            Forwarded unchanged to the parent constructor.
        rng : object
            Source of randomness providing shuffle(). When omitted, the `random` module in scope is
            seeded with 123 and used instead.
        """
        if rng is not None:
            self.rng = rng
        else:
            # NOTE(review): this seeds the shared `random` module itself rather than a private
            # generator, so other users of that module are presumably affected too - confirm.
            random.seed(123)
            self.rng = random

        started = time.time()
        log.debug('Initializing a %s random iterator over %s',
                  str(type(dataset)), datasets.get_subset_strings(subset))
        # NOTE(review): super(self.__class__, self) recurses endlessly if this class is ever
        # subclassed; naming the class explicitly would avoid that - confirm and fix where the
        # class name is known.
        super(self.__class__, self).__init__(dataset, subset, batch_size,
                                             minimum_batch_size)

        # shuffle the example indices in place; self.data_len is read after the parent constructor ran
        self.indices = numpy.arange(self.data_len)
        self.rng.shuffle(self.indices)
        elapsed = time.time() - started
        log.debug('iterator took %s to make' %
                  make_time_units_string(elapsed))
Пример #16
0
 def hasSubset(self, subset):
     '''
     :param subset: integer
     The subset indicator to check (datasets.TRAIN, datasets.VALID, or datasets.TEST)
     :return: boolean
     True for any of the three known indicators; False (after logging an error) for anything else
     '''
     if subset not in [datasets.TRAIN, datasets.VALID, datasets.TEST]:
         log.error('Subset %s not recognized!', datasets.get_subset_strings(subset))
         # answer with an explicit False instead of falling off the end and returning None
         return False
     # it has train valid and test
     return True