def getDataShape(self, subset):
    '''
    Returns the shape of the input data for the given subset

    Parameters
    ----------
    subset : int
        The subset indicator. Integer assigned by global variables in opendeep.data.dataset.py

    Returns
    -------
    tuple
        Return the list of shapes of this dataset's subset sequences. This will separate out
        the shapes for each sequence individually as items in the list, while the dataset is
        still concatenated into a single matrix. Returns None if the subset is not recognized.
    '''
    # Use == rather than `is` — the subset markers are plain ints, and identity
    # comparison on ints only works by accident of CPython's small-int caching.
    if subset == datasets.TRAIN:
        return self.train_shapes
    elif subset == datasets.VALID:
        return self.valid_shapes
    elif subset == datasets.TEST:
        return self.test_shapes
    else:
        log.error('Subset %s not recognized!', datasets.get_subset_strings(subset))
        return None
def __init__(self, dataset=None, subset=None, batch_size=1, minimum_batch_size=1, rng=None):
    '''
    Initializes the iterator over a dataset subset, precomputing the list of
    batch sizes (one entry per iteration) from the subset length.

    Parameters
    ----------
    dataset : Dataset
        The dataset object to iterate over (must provide getDataShape).
    subset : int
        Subset indicator (datasets.TRAIN, datasets.VALID, or datasets.TEST).
    batch_size : int
        Number of examples per batch.
    minimum_batch_size : int
        Smallest allowed final (remainder) batch; smaller remainders are dropped.
    rng : random number generator, optional
        Unused here; accepted for subclass compatibility.
    '''
    # make sure the subset is recognized (log-only: initialization continues regardless)
    if subset not in [datasets.TRAIN, datasets.VALID, datasets.TEST]:
        log.error('Dataset subset %s not recognized, try TRAIN, VALID, or TEST',
                  datasets.get_subset_strings(subset))
    self.dataset = dataset
    self.subset = subset
    self.batch_size = batch_size
    self.minimum_batch_size = minimum_batch_size
    # determine the number of possible iterations given the batch size, minimum batch size,
    # dataset, and subset
    self.data_len = self.dataset.getDataShape(self.subset)[0]
    # floor division: plain `/` is float division under Python 3, which would make
    # `batches * [batch_size]` a TypeError
    batches = self.data_len // self.batch_size
    self.iterations = batches * [self.batch_size]
    # the last, partial batch is only kept if it meets the minimum size
    remainder = numpy.remainder(self.data_len, self.batch_size)
    if remainder >= self.minimum_batch_size:
        self.iterations.append(remainder)
    self.iteration_index = 0
def getDataShape(self, subset):
    '''
    Returns the shapes of this dataset's given subset.

    Parameters
    ----------
    subset : int
        Subset indicator (datasets.TRAIN, datasets.VALID, or datasets.TEST).

    Returns
    -------
    object
        The stored shapes for the requested subset, or None if the subset is
        not recognized.
    '''
    if subset not in [datasets.TRAIN, datasets.VALID, datasets.TEST]:
        log.error('Subset %s not recognized!', datasets.get_subset_strings(subset))
        return None
    # == rather than `is`: the subset markers are plain ints and identity
    # comparison on ints depends on CPython's small-int caching.
    if subset == datasets.TRAIN:
        return self.train_shapes
    elif subset == datasets.VALID:
        return self.valid_shapes
    elif subset == datasets.TEST:
        return self.test_shapes
def getDataShape(self, subset):
    '''
    :return: tuple
    Return the shape of this dataset's subset in a NxD tuple where N=#examples
    and D=dimensionality, or None if the subset is not recognized.
    '''
    # == rather than `is`: the subset markers are plain ints.
    if subset == datasets.TRAIN:
        return self._train_shape
    elif subset == datasets.VALID:
        return self._valid_shape
    elif subset == datasets.TEST:
        return self._test_shape
    else:
        # The original also had a `log.critical` + NotImplementedError branch here,
        # but it was unreachable: the membership guard above it already returned
        # None for every unrecognized subset. Dead code removed; behavior unchanged.
        log.error('Subset %s not recognized!', datasets.get_subset_strings(subset))
        return None
def hasSubset(self, subset):
    '''
    :param subset: integer
    The integer representing the subset of the data to consider dataset.(TRAIN, VALID, or TEST)

    :return: boolean
    Whether or not this dataset has the given subset split
    '''
    recognized = subset in (datasets.TRAIN, datasets.VALID, datasets.TEST)
    if not recognized:
        log.error('Subset %s not recognized!', datasets.get_subset_strings(subset))
    # this dataset provides all three splits, so any recognized subset exists
    return recognized
def __init__(self, dataset, subset=datasets.TRAIN, batch_size=1, minimum_batch_size=1, rng=None):
    '''
    Sequential iterator over the given dataset subset. All real setup is
    delegated to the parent iterator's __init__; this wrapper only adds
    debug timing around construction.

    Parameters
    ----------
    dataset : Dataset
        The dataset object to iterate over.
    subset : int
        Subset indicator (datasets.TRAIN, datasets.VALID, or datasets.TEST).
    batch_size : int
        Number of examples per batch.
    minimum_batch_size : int
        Smallest allowed final batch.
    rng : random number generator, optional
        Forwarded to the parent iterator.
    '''
    _t = time.time()
    log.debug('Initializing a %s sequential iterator over %s', str(type(dataset)), datasets.get_subset_strings(subset))
    # NOTE(review): super(self.__class__, ...) recurses infinitely if this class is
    # ever subclassed — consider naming the class explicitly (class name not
    # visible in this chunk, so left as-is).
    super(self.__class__, self).__init__(dataset, subset, batch_size, minimum_batch_size, rng)
    log.debug('iterator took %s to make' % make_time_units_string(time.time() - _t))
def getDataShape(self, subset):
    '''
    Returns the shape of the input data for the given subset

    Parameters
    ----------
    subset : int
        The subset indicator. Integer assigned by global variables in opendeep.data.dataset.py

    Returns
    -------
    tuple
        Return the shape of this dataset's subset in a (N, D) tuple where N=#examples
        and D=dimensionality, or None if the subset is not recognized.
    '''
    # == rather than `is`: the subset markers are plain ints and identity
    # comparison on ints depends on CPython's small-int caching.
    if subset == datasets.TRAIN:
        return self._train_shape
    elif subset == datasets.VALID:
        return self._valid_shape
    elif subset == datasets.TEST:
        return self._test_shape
    else:
        log.error('Subset %s not recognized!', datasets.get_subset_strings(subset))
        return None
def getSubset(self, subset):
    """
    Returns the (x, y) pair of shared variables for the given train, validation, or test subset.

    Parameters
    ----------
    subset : int
        The subset indicator. Integer assigned by global variables in opendeep.data.dataset.py

    Returns
    -------
    tuple
        (x, y) tuple of shared variables holding the dataset input and label,
        or (None, None) if the subset doesn't exist.
    """
    # == rather than `is`: the subset markers are plain ints and identity
    # comparison on ints depends on CPython's small-int caching.
    if subset == datasets.TRAIN:
        return self.train_X, self.train_Y
    elif subset == datasets.VALID:
        return self.valid_X, self.valid_Y
    elif subset == datasets.TEST:
        return self.test_X, self.test_Y
    else:
        log.error('Subset %s not recognized!', datasets.get_subset_strings(subset))
        return None, None
def _get_givens_subset(self, subset, batch_slice):
    """
    This translates a batch slice of start and end indices into the actual data from the
    given subset.

    Parameters
    ----------
    subset : int
        The subset to use - determined in opendeep.data.datasets as TRAIN, VALID, or TEST
        attributes.
    batch_slice : symbolic slice
        The symbolic slice to grab from the data.

    Returns
    -------
    OrderedDict
        The givens to provide to a function where it sets the input variable to the actual
        batch representation of data from the dataset: (input_variable: data[batch]),
        or None if the dataset lacks the requested subset.

    Raises
    ------
    AssertionError
        If the model requires targets but the subset provides no labels.
    """
    # translate the data_idx into the givens for the model -
    # first get the lists of input variables the model requires: inputs and targets
    model_inputs = raise_to_list(self.model.get_inputs())
    model_targets = raise_to_list(self.model.get_targets())
    givens = None
    # grab the data and labels once (the original called getSubset twice)
    data, labels = self.dataset.getSubset(subset)
    if data is not None:
        # create the givens for the input function as pairs of (input_variable: sliced_data)
        givens = OrderedDict(zip(model_inputs, [data[batch_slice]]))
        # include labels as well if they are required by the model
        if model_targets is not None and len(model_targets) > 0:
            if labels is None:
                log.error("No labels in the dataset!")
                # Python 3-compatible raise form (the old `raise E, msg` syntax is a
                # SyntaxError under Python 3); also fixes the "lables" typo.
                raise AssertionError("No labels in the dataset!")
            givens.update(OrderedDict(zip(model_targets, [labels[batch_slice]])))
    else:
        log.warning("Dataset doesn't have subset %s" % get_subset_strings(subset))
    return givens
def __init__(self, dataset, subset=datasets.TRAIN, batch_size=1, minimum_batch_size=1, rng=None):
    '''
    Random-order iterator over the given dataset subset: delegates batch setup
    to the parent iterator, then shuffles the index order.

    Parameters
    ----------
    dataset : Dataset
        The dataset object to iterate over.
    subset : int
        Subset indicator (datasets.TRAIN, datasets.VALID, or datasets.TEST).
    batch_size : int
        Number of examples per batch.
    minimum_batch_size : int
        Smallest allowed final batch.
    rng : random number generator, optional
        Object providing shuffle(); a seeded numpy RandomState is created when None.
    '''
    # initialize an rng if one is not provided. Use a private RandomState rather
    # than seeding the global `random` module, which silently reset random state
    # for all other code in the process.
    if rng is None:
        self.rng = numpy.random.RandomState(123)
    else:
        self.rng = rng
    _t = time.time()
    log.debug('Initializing a %s random iterator over %s', str(type(dataset)), datasets.get_subset_strings(subset))
    # NOTE(review): super(self.__class__, ...) recurses infinitely if subclassed;
    # also `rng` is not forwarded to the parent — presumably intentional since the
    # shuffle happens here, but verify against the base class.
    super(self.__class__, self).__init__(dataset, subset, batch_size, minimum_batch_size)
    # randomize the indices to access
    self.indices = numpy.arange(self.data_len)
    self.rng.shuffle(self.indices)
    log.debug('iterator took %s to make' % make_time_units_string(time.time() - _t))
def hasSubset(self, subset):
    '''
    :param subset: integer
    The integer representing the subset of the data to consider dataset.(TRAIN, VALID, or TEST)

    :return: boolean
    Whether or not this dataset has the given subset split
    '''
    if subset not in [datasets.TRAIN, datasets.VALID, datasets.TEST]:
        log.error('Subset %s not recognized!', datasets.get_subset_strings(subset))
        # return an explicit False instead of falling through and implicitly
        # returning None (the original bug); keeps the boolean contract and
        # matches the sibling hasSubset implementation.
        return False
    # it has train, valid, and test
    return True