def plot_samples_distance(dataset, sortbyattr=None):
    """Plot the euclidean distances between all samples of a dataset.

    Parameters
    ----------
    dataset : Dataset
      Providing the samples.
    sortbyattr : None or str
      If None, the samples distances will be in the same order as their
      appearance in the dataset. Alternatively, the name of a samples
      attribute can be given, which will then be used to sort/group the
      samples, e.g. to investigate the similarity of samples by label or
      by chunks.
    """
    if sortbyattr is None:
        samples = dataset.samples
    else:
        # group sample indices by unique attribute value, so that samples
        # sharing a value appear contiguously in the distance matrix
        order = []
        for value in dataset.sa[sortbyattr].unique:
            order.extend(
                get_samples_by_attr(dataset, sortbyattr, value).tolist())
        samples = dataset.samples[order]
    distances = np.sqrt(squared_euclidean_distance(samples))
    pl.imshow(distances)
    pl.colorbar()
def _train(self, ds):
    """Determine the normalization parameters from a training dataset.

    Fills ``self.__params_dict`` with a mapping from chunk value (or the
    special key ``'__all__'``) to the parameter tuple produced by
    ``self._compute_params`` — either taken verbatim from the configured
    ``params``, or estimated from (a subset of) the samples in ``ds``.

    Parameters
    ----------
    ds : Dataset
      Training dataset providing ``.samples`` and the samples attributes
      referenced by the configured ``chunks_attr``/``param_est``.
    """
    # local bindings of the configuration attributes
    chunks_attr = self.__chunks_attr
    params = self.__params
    param_est = self.__param_est

    # populate a dictionary with parameter tuples (e.g. (mean, std)) for all
    # chunks, or a global value that is used for the whole data
    if params is not None:
        # we got the parameters already
        if not isinstance(params, dict):
            # turn into dict, otherwise assume that we have parameters per
            # chunk
            params = {'__all__': params}
    else:
        # no parameters given, need to estimate
        if param_est is not None:
            est_attr, est_attr_values = param_est
            # which samples to use for estimation
            est_ids = set(get_samples_by_attr(ds, est_attr,
                                              est_attr_values))
        else:
            # no restriction: estimate from all samples
            est_ids = slice(None)
        # now we can either do it once for all, or per chunk
        if chunks_attr is not None:
            # per chunk estimate
            params = {}
            for c in ds.sa[chunks_attr].unique:
                slicer = np.where(ds.sa[chunks_attr].value == c)[0]
                if not isinstance(est_ids, slice):
                    # restrict to the estimation samples within this chunk
                    slicer = list(est_ids.intersection(set(slicer)))
                params[c] = self._compute_params(ds.samples[slicer])
        else:
            # global estimate
            params = {'__all__': self._compute_params(ds.samples[est_ids])}
    self.__params_dict = params