def next(self):
    """
    Return the next batch of the dataset during iteration.

    The indices produced by the subset iterator are turned into a boolean
    mask, because boolean selections are what HDF5 datasets support. Every
    raw data source is indexed with that mask and then passed through its
    conversion function, if it has one.

    Returns
    -------
    rval : tuple or single batch
        One batch per source. When there is exactly one source and
        `self._return_tuple` is false, that batch is returned unwrapped.
    """
    batch_indices = self._subset_iterator.next()

    # Boolean mask with True at the positions selected for this batch.
    mask = np.zeros(self.num_examples, dtype=bool)
    mask[batch_indices] = True

    batches = []
    for source_data, convert_fn in safe_izip(self._raw_data, self._convert):
        try:
            batch = source_data[mask]
        except TypeError:
            # Retry with an explicit full slice over the second axis.
            batch = source_data[mask, :]
        if convert_fn:
            batch = convert_fn(batch)
        assert not contains_nan(batch)
        batches.append(batch)

    if not self._return_tuple and len(batches) == 1:
        return batches[0]
    return tuple(batches)
def next(self):
    """
    Return the next batch, running each source through the preprocessor.

    The batch indices from the subset iterator are converted to a boolean
    selection, each raw source is indexed with it and converted by its
    conversion function (if any). When `self._preprocessor` is set, the
    result is wrapped in a `DenseDesignMatrix`, the preprocessor is applied
    to that wrapper, and the design matrix is read back.

    Returns
    -------
    rval : tuple or single batch
        One batch per source, or the lone batch itself when there is a
        single source and `self._return_tuple` is false.
    """
    indices = self._subset_iterator.next()

    # convert to boolean selection
    selection = np.zeros(self.num_examples, dtype=bool)
    selection[indices] = True

    collected = []
    for raw, convert in safe_izip(self._raw_data, self._convert):
        try:
            piece = raw[selection]
        except TypeError:
            # Fall back to an explicit slice over the second axis.
            piece = raw[selection, :]

        if convert:
            piece = convert(piece)

        if self._preprocessor is not None:
            wrapped = DenseDesignMatrix(X=piece)
            self._preprocessor.apply(wrapped)
            piece = wrapped.get_design_matrix()

        assert not np.any(np.isnan(piece))
        collected.append(piece)

    result = tuple(collected)
    if self._return_tuple or len(result) != 1:
        return result
    (only_batch,) = result
    return only_batch
def next(self):
    """
    Get the next subset of the dataset during dataset iteration.

    Converts index selections for batches to boolean selections that are
    supported by HDF5 datasets.

    Returns
    -------
    rval : tuple or single batch
        One batch per source. A single batch is returned unwrapped when
        there is only one source and `self._return_tuple` is false.

    Raises
    ------
    TypeError
        Re-raised when indexing a source with the boolean selection fails
        and the source does not have more than one dimension, so there is
        no fallback to try.
    """
    selected = self._subset_iterator.next()

    # convert to boolean selection
    mask = np.zeros(self.num_examples, dtype=bool)
    mask[selected] = True

    batches = []
    for dataset, converter in safe_izip(self._raw_data, self._convert):
        try:
            batch = dataset[mask]
        except TypeError:
            # Earlier review discussion (FB/FV): this TypeError was reported
            # when the shape of the selection is incompatible with the shape
            # of the dataset, e.g. in test_hdf5_topo_view() with a selection
            # of shape (10,) against an HDF5 dataset of shape (10, 3).
            # Do not hide the original error if we cannot fall back.
            # TODO(review): it was suggested to check the compatibility
            # explicitly (e.g. compare the selection's ndim with data.ndim)
            # instead of relying on the exception.
            if not dataset.ndim > 1:
                raise
            batch = dataset[mask, :]

        # NOTE(review): a previous comment here described turning vector
        # data into a one-column matrix (for the later format_as call of
        # the Space), but no such conversion is performed in this method.
        if converter:
            batch = converter(batch)
        assert not contains_nan(batch)
        batches.append(batch)

    if not self._return_tuple and len(batches) == 1:
        return batches[0]
    return tuple(batches)
def next(self):
    """
    Get the next subset of the dataset during dataset iteration.

    Converts index selections for batches to boolean selections that are
    supported by HDF5 datasets, then moves axis 1 of the first source's
    batch to the front with `np.rollaxis`.

    Returns
    -------
    rval : tuple or single batch
        One batch per source, the first one with its axis 1 rolled to
        position 0. When there is exactly one source and
        `self._return_tuple` is false, that batch is returned unwrapped.
    """
    next_index = self._subset_iterator.next()

    # convert to boolean selection
    # NOTE(review): the mask length comes from the module-level
    # NUM_TRAINING_SAMPLES rather than self.num_examples; presumably it
    # must equal the number of rows stored in each source -- confirm.
    sel = np.zeros(NUM_TRAINING_SAMPLES, dtype=bool)
    sel[next_index] = True
    next_index = sel

    rval = []
    for data, fn in safe_izip(self._raw_data, self._convert):
        try:
            this_data = data[next_index]
        except TypeError:
            # Retry with an explicit full slice over the second axis.
            this_data = data[next_index, :]
        if fn:
            this_data = fn(this_data)
        assert not np.any(np.isnan(this_data))
        rval.append(this_data)

    # Roll the first source while rval is still a list of per-source
    # batches. Doing this after the single-source unpacking below would
    # index into the batch array itself, and rebuilding a fixed 2-tuple
    # would drop any source beyond the second.
    if rval:
        rval[0] = np.rollaxis(rval[0], 1)

    rval = tuple(rval)
    if not self._return_tuple and len(rval) == 1:
        rval, = rval
    return rval
def __init__(self, dataset, subset_iterator, data_specs=None,
             return_tuple=False, convert=None):
    """
    Set up iteration over the requested sources of `dataset`.

    Parameters
    ----------
    dataset : object
        Must provide `get_data_specs()` and `get_data()`.
    subset_iterator : object
        Stored as `self._subset_iterator`.
    data_specs : (space, source) pair
        Flat specs (checked with `is_flat_specs`) naming the sources to
        iterate over and the spaces they should be formatted in. Although
        the parameter defaults to None, a pair is required because it is
        unpacked as `space, source`.
    return_tuple : bool, optional
        Stored as `self._return_tuple`.
    convert : sequence of callables or None, optional
        One entry per requested source. A non-None entry is used as that
        source's conversion function unchanged; a None entry is replaced
        by a function that formats batches from the dataset's space into
        the requested space. The sequence is copied, so the caller's
        object is never modified.
    """
    self._data_specs = data_specs
    self._dataset = dataset
    self._subset_iterator = subset_iterator
    self._return_tuple = return_tuple

    # Keep only the needed sources in self._raw_data.
    # Remember what source they correspond to in self._source
    assert is_flat_specs(data_specs)

    dataset_space, dataset_source = self._dataset.get_data_specs()
    assert is_flat_specs((dataset_space, dataset_source))

    # the dataset's data spec is either a single (space, source) pair,
    # or a pair of (non-nested CompositeSpace, non-nested tuple).
    # We could build a mapping and call flatten(..., return_tuple=True)
    # but simply putting spaces, sources and data in tuples is simpler.
    if not isinstance(dataset_source, tuple):
        dataset_source = (dataset_source,)

    if isinstance(dataset_space, CompositeSpace):
        dataset_sub_spaces = dataset_space.components
    else:
        dataset_sub_spaces = (dataset_space,)
    assert len(dataset_source) == len(dataset_sub_spaces)

    all_data = self._dataset.get_data()
    if not isinstance(all_data, tuple):
        all_data = (all_data,)

    space, source = data_specs
    if not isinstance(source, tuple):
        source = (source,)
    if isinstance(space, CompositeSpace):
        sub_spaces = space.components
    else:
        sub_spaces = (space,)
    assert len(source) == len(sub_spaces)

    self._raw_data = tuple(all_data[dataset_source.index(s)]
                           for s in source)
    self._source = source

    if convert is None:
        self._convert = [None] * len(source)
    else:
        assert len(convert) == len(source)
        # Copy: entries are overwritten below, which must neither leak
        # into the caller's sequence nor fail when a tuple is passed.
        self._convert = list(convert)

    for i, (so, sp, _) in enumerate(safe_izip(source,
                                              sub_spaces,
                                              self._raw_data)):
        # If there is a conversion function, it is supposed to take
        # care of the formatting, and it should be an error
        # if it does not. If there is none, the iterator will try to
        # format using the generic space-formatting functions.
        if self._convert[i] is not None:
            continue

        dspace = dataset_sub_spaces[dataset_source.index(so)]

        # "dspace" and "sp" have to be passed as default parameters
        # in order to capture their current value, otherwise they
        # would change in the next iteration of the loop.
        def fn(batch, dspace=dspace, sp=sp):
            try:
                return dspace.np_format_as(batch, sp)
            except ValueError as e:
                msg = str(e) + ('\nMake sure that the model and '
                                'dataset have been initialized with '
                                'correct values.')
                raise ValueError(msg)

        self._convert[i] = fn