def forward(self, data):
    """Map data from input to output space.

    Parameters
    ----------
    data : Dataset-like, (at least 2D)-array-like
      Typically this is a `Dataset`, but it might also be a plain data
      array, or even something completely different(TM) that is supported
      by a subclass' implementation. If such an object is Dataset-like,
      it is handled by a dedicated method that also transforms dataset
      attributes if necessary. If an array-like is passed, it has to be
      at least two-dimensional, with the first axis separating samples
      or observations. For single samples `forward1()` might be more
      appropriate.
    """
    if is_datasetlike(data):
        if __debug__:
            debug('MAP', "Forward-map %s-shaped dataset through '%s'."
                         % (data.shape, self))
        return self._forward_dataset(data)
    else:
        if hasattr(data, 'ndim') and data.ndim < 2:
            raise ValueError(
                'Mapper.forward() only supports mapping of data with '
                'at least two dimensions, where the first axis '
                'separates samples/observations. Consider using '
                'Mapper.forward1() instead.')
        if __debug__:
            debug('MAP', "Forward-map data through '%s'." % (self))
        return self._forward_data(data)
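# --- Usage sketch (added illustration, not part of the original source).
# A minimal hypothetical Mapper subclass: only _forward_data() is
# overridden, and forward() above dispatches on the input type. Assumes
# PyMVPA's `Mapper` base class and its `forward1()` convenience method.
import numpy as np
from mvpa2.mappers.base import Mapper  # assumed import path

class ScaleMapper(Mapper):
    """Toy mapper multiplying samples by a constant factor."""
    def __init__(self, factor, **kwargs):
        Mapper.__init__(self, **kwargs)
        self.factor = factor

    def _forward_data(self, data):
        return np.asanyarray(data) * self.factor

sm = ScaleMapper(3.0)
sm.forward(np.arange(6).reshape(2, 3))  # 2D input: mapped sample-wise
sm.forward1(np.arange(3))               # single-sample convenience wrapper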
def compute(self, ds1, ds2=None):
    """Generic computation of any kernel.

    Assumptions:

    - ds1, ds2 are either datasets or arrays,
    - presumably 2D (neither checked nor enforced here),
    - _compute takes ndarrays. If your kernel needs datasets,
      override compute().
    """
    if is_datasetlike(ds1):
        ds1 = ds1.samples
    if ds2 is None:
        ds2 = ds1
    elif is_datasetlike(ds2):
        ds2 = ds2.samples
    # TODO: assure 2D shape
    self._compute(ds1, ds2)
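# --- Illustration (hedged sketch, not from the original source).
# The contract compute() establishes: _compute() always receives plain
# 2D ndarrays, even when datasets were passed in. A hypothetical linear
# kernel can therefore stay this small (assumes the `Kernel` base class
# of this module caches the result, e.g. in `self._k`):
import numpy as np

class LinearKernelSketch(Kernel):
    def _compute(self, d1, d2):
        # Gram matrix between all pairs of samples
        self._k = np.dot(d1, d2.T)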
def hstack(datasets):
    """Stacks datasets horizontally (appending features).

    Sample attribute collections are merged incrementally; attributes with
    identical keys overwrite previous ones in the stacked dataset. All
    datasets must have an identical set of feature attributes (matching
    keys, not values), otherwise a ValueError will be raised.
    No dataset attributes from any source dataset will be transferred
    into the stacked dataset.

    Parameters
    ----------
    datasets : tuple
      Sequence of datasets to be stacked.

    Returns
    -------
    AttrDataset (or respective subclass)
    """
    #
    # XXX Use CombinedMapper in here whenever it comes back
    #

    # fall back to numpy if it is not a dataset
    if not is_datasetlike(datasets[0]):
        # we might get a list of 1Ds that would yield wrong results when
        # turned into a dict (would run along samples-axis)
        return AttrDataset(np.atleast_2d(np.hstack(datasets)))

    if __debug__:
        target = sorted(datasets[0].fa.keys())
        if not np.all([sorted(ds.fa.keys()) == target for ds in datasets]):
            raise ValueError("Feature attribute collections of the datasets "
                             "to be stacked have varying attributes.")
    # will puke if not equal number of samples
    stacked_samp = np.concatenate([ds.samples for ds in datasets], axis=1)

    stacked_fa = {}
    for attr in datasets[0].fa:
        stacked_fa[attr] = np.concatenate(
            [ds.fa[attr].value for ds in datasets], axis=0)

    # create the dataset
    merged = datasets[0].__class__(stacked_samp, fa=stacked_fa)

    for ds in datasets:
        merged.sa.update(ds.sa)

    return merged
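# --- Usage sketch (assumes the `Dataset` class and `np` as imported
# elsewhere in this module). Two datasets sharing the samples axis gain
# features side by side; sample attributes with identical keys are
# merged, later ones winning.
ds1 = Dataset(np.zeros((4, 2)), sa={'targets': [0, 0, 1, 1]})
ds2 = Dataset(np.ones((4, 3)), sa={'targets': [0, 0, 1, 1]})
merged = hstack((ds1, ds2))
assert merged.shape == (4, 5)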
def train(self, ds):
    """
    The default implementation calls ``_pretrain()``, ``_train()``, and
    finally ``_posttrain()``.

    Parameters
    ----------
    ds : Dataset
      Training dataset.

    Returns
    -------
    None
    """
    got_ds = is_datasetlike(ds)

    # TODO remove first condition if all Learners get only datasets
    if got_ds and (ds.nfeatures == 0 or len(ds) == 0):
        raise DegenerateInputError(
            "Cannot train classifier on degenerate data %s" % ds)
    if __debug__:
        debug("LRN", "Training learner %(lrn)s on dataset %(dataset)s",
              msgargs={"lrn": self, "dataset": ds})

    self._pretrain(ds)

    # remember the time when started training
    t0 = time.time()

    if got_ds:
        # things might have happened during pretraining
        if ds.nfeatures > 0:
            result = self._train(ds)
        else:
            warning("Trying to train on dataset with no features present")
            if __debug__:
                debug("LRN", "No features present for training, no actual "
                             "training is called")
            result = None
    else:
        # in this case we claim to have no idea and simply try to train
        result = self._train(ds)

    # store timing
    self.ca.training_time = time.time() - t0

    # and post-proc
    result = self._posttrain(ds)

    # finally flag as trained
    self._set_trained()
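# --- Typical call pattern (hedged sketch; `clf` stands for any concrete
# Learner subclass, `ds` for a training dataset):
clf.train(ds)                    # runs _pretrain -> _train -> _posttrain
print(clf.ca.training_time)      # wall-clock training time recorded above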
def reverse(self, data):
    """Reverse-map data from output back into input space.

    Parameters
    ----------
    data : Dataset-like, anything
      Typically this is a `Dataset`, but it might also be a plain data
      array, or even something completely different(TM) that is supported
      by a subclass' implementation. If such an object is Dataset-like,
      it is handled by a dedicated method that also transforms dataset
      attributes if necessary.
    """
    if is_datasetlike(data):
        return self._reverse_dataset(data)
    else:
        return self._reverse_data(data)
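# --- Usage sketch (hypothetical `mapper` instance, not from the original
# source). reverse() is the counterpart of forward(): round-tripping
# through a lossless mapper should reproduce the input.
mapped = mapper.forward(ds)
restored = mapper.reverse(mapped)   # back in input space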
def vstack(datasets):
    """Stacks datasets vertically (appending samples).

    Feature attribute collections are merged incrementally; attributes
    with identical keys overwrite previous ones in the stacked dataset.
    All datasets must have an identical set of sample attributes (matching
    keys, not values), otherwise a ValueError will be raised.
    No dataset attributes from any source dataset will be transferred into
    the stacked dataset. If all input datasets have common dataset
    attributes that are also valid for the stacked dataset, they can be
    moved into the output dataset like this::

      ds_merged = vstack((ds1, ds2, ds3))
      ds_merged.a.update(ds1.a)

    Parameters
    ----------
    datasets : tuple
      Sequence of datasets to be stacked.

    Returns
    -------
    AttrDataset (or respective subclass)
    """
    # fall back to numpy if it is not a dataset
    if not is_datasetlike(datasets[0]):
        return AttrDataset(np.vstack(datasets))

    if __debug__:
        target = sorted(datasets[0].sa.keys())
        if not np.all([sorted(ds.sa.keys()) == target for ds in datasets]):
            raise ValueError("Sample attribute collections of the datasets "
                             "to be stacked have varying attributes.")
    # will puke if not equal number of features
    stacked_samp = np.concatenate([ds.samples for ds in datasets], axis=0)

    stacked_sa = {}
    for attr in datasets[0].sa:
        stacked_sa[attr] = np.concatenate(
            [ds.sa[attr].value for ds in datasets], axis=0)

    # create the dataset
    merged = datasets[0].__class__(stacked_samp, sa=stacked_sa)

    for ds in datasets:
        merged.fa.update(ds.fa)

    return merged
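# --- Usage sketch (assumes the `Dataset` class and `np` as imported
# elsewhere in this module). Datasets sharing the feature axis are stacked
# along the samples axis; the docstring's idiom carries common dataset
# attributes over explicitly.
ds1 = Dataset(np.zeros((2, 3)), sa={'targets': [0, 1]})
ds2 = Dataset(np.ones((3, 3)), sa={'targets': [0, 1, 1]})
ds_merged = vstack((ds1, ds2))
assert ds_merged.shape == (5, 3)
ds_merged.a.update(ds1.a)  # optionally adopt ds1's dataset attributes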
def p(self, x, return_tails=False, **kwargs):
    """Returns the p-value for values of `x`.

    Returned values are determined from the left, the right, or any tail,
    depending on the constructor setting.

    In case a `FeaturewiseMeasure` was used to estimate the distribution,
    the method returns an array. In that case `x` can be a scalar value
    or an array of a matching shape.
    """
    peas = _pvalue(x, self.cdf, self.__tail, return_tails=return_tails,
                   **kwargs)
    if is_datasetlike(x):
        # return the p-values in a dataset as well and assign the input
        # dataset attributes to the return dataset too
        pds = x.copy(deep=False)
        if return_tails:
            pds.samples = peas[0]
            return pds, peas[1]
        else:
            pds.samples = peas
            return pds
    return peas
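# --- Usage sketch (hedged; `null_dist` stands for any estimated
# distribution object exposing this p() method, `scores` for a Dataset
# or array of measure values):
pvals = null_dist.p(scores)                       # p-values only
pvals, tails = null_dist.p(scores, return_tails=True)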
def wrap_samples(obj, data, *args, **kwargs):
    # pass Dataset input through unchanged; wrap plain arrays into a
    # Dataset so the decorated method can rely on a uniform interface
    if is_datasetlike(data):
        return fx(obj, data, *args, **kwargs)
    else:
        return fx(obj, Dataset(data), *args, **kwargs)
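# --- Context sketch (assumption, not from the original source):
# wrap_samples() reads as the inner closure of a method decorator along
# these lines, with `fx` being the wrapped method. The decorator name
# below is hypothetical.
def accepts_samples_as_dataset(fx):
    """Guarantee that the wrapped method always receives a Dataset."""
    def wrap_samples(obj, data, *args, **kwargs):
        if is_datasetlike(data):
            return fx(obj, data, *args, **kwargs)
        else:
            return fx(obj, Dataset(data), *args, **kwargs)
    return wrap_samples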
def test_from_wizard():
    samples = np.arange(12).reshape((4, 3)).view(myarray)
    labels = range(4)
    chunks = [1, 1, 2, 2]

    ds = Dataset(samples, sa={'targets': labels, 'chunks': chunks})
    ds.init_origids('both')
    first = ds.sa.origids
    # now do again and check that they get regenerated
    ds.init_origids('both')
    assert_false(first is ds.sa.origids)
    assert_array_equal(first, ds.sa.origids)

    ok_(is_datasetlike(ds))
    ok_(not is_datasetlike(labels))

    # array subclass survives
    ok_(isinstance(ds.samples, myarray))

    ## XXX stuff that needs thought:
    # ds.sa (empty) has this in the public namespace:
    #   add, get, getvalue, has_key, is_set, items, listing, name, names,
    #   owner, remove, reset, setvalue, which_set
    # maybe we need some form of lightweightCollection?

    assert_array_equal(ds.samples, samples)
    assert_array_equal(ds.sa.targets, labels)
    assert_array_equal(ds.sa.chunks, chunks)

    # same should work for shortcuts
    assert_array_equal(ds.targets, labels)
    assert_array_equal(ds.chunks, chunks)

    ok_(sorted(ds.sa.keys()) == ['chunks', 'origids', 'targets'])
    ok_(sorted(ds.fa.keys()) == ['origids'])
    # add some more
    ds.a['random'] = 'blurb'

    # check stripping attributes from a copy
    cds = ds.copy()  # full copy
    ok_(sorted(cds.sa.keys()) == ['chunks', 'origids', 'targets'])
    ok_(sorted(cds.fa.keys()) == ['origids'])
    ok_(sorted(cds.a.keys()) == ['random'])
    cds = ds.copy(sa=[], fa=[], a=[])  # plain copy
    ok_(cds.sa.keys() == [])
    ok_(cds.fa.keys() == [])
    ok_(cds.a.keys() == [])
    cds = ds.copy(sa=['targets'], fa=None, a=['random'])  # partial copy
    ok_(cds.sa.keys() == ['targets'])
    ok_(cds.fa.keys() == ['origids'])
    ok_(cds.a.keys() == ['random'])

    # there is not necessarily a mapper present
    ok_(not ds.a.has_key('mapper'))

    # has to complain about misshaped samples attributes
    assert_raises(ValueError, Dataset.from_wizard, samples, labels + labels)

    # check that we actually have attributes of the expected type
    ok_(isinstance(ds.sa['targets'], ArrayCollectable))

    # the dataset will take care of not adding stupid stuff
    assert_raises(ValueError, ds.sa.__setitem__, 'stupid', np.arange(3))
    assert_raises(ValueError, ds.fa.__setitem__, 'stupid', np.arange(4))
    # or change proper attributes to stupid shapes
    try:
        ds.sa.targets = np.arange(3)
    except ValueError:
        pass
    else:
        ok_(False, msg="Assigning value with improper shape to attribute "
                       "did not raise exception.")