class PDistTargetSimilarity(Measure):
    """Calculate the correlations of PDist measures with a target

    Target dissimilarity correlation `Measure`. Computes the correlation
    between the dissimilarity matrix defined over the pairwise distances
    between the samples of dataset and the target dissimilarity matrix.
    """

    is_trained = True
    """Indicate that this measure is always trained."""

    pairwise_metric = Parameter('correlation', constraints='str', doc="""\
          Distance metric to use for calculating pairwise vector distances for
          dissimilarity matrix (DSM).  See scipy.spatial.distance.pdist for
          all possible metrics.""")

    comparison_metric = Parameter('pearson',
                                  constraints=EnsureChoice('pearson',
                                                           'spearman'),
                                  doc="""\
          Similarity measure to be used for comparing dataset DSM with the
          target DSM.""")

    center_data = Parameter(False, constraints='bool', doc="""\
          If True then center each column of the data matrix by subtracting
          the column mean from each element. This is recommended especially
          when using pairwise_metric='correlation'.""")

    corrcoef_only = Parameter(False, constraints='bool', doc="""\
          If True, return only the correlation coefficient (rho), otherwise
          return rho and probability, p.""")

    def __init__(self, target_dsm, **kwargs):
        """
        Parameters
        ----------
        target_dsm : array (length N*(N-1)/2)
          Target dissimilarity matrix

        Returns
        -------
        Dataset
          If ``corrcoef_only`` is True, contains one feature: the correlation
          coefficient (rho); otherwise two features: rho plus p.
        """
        # init base classes first
        Measure.__init__(self, **kwargs)
        self.target_dsm = target_dsm
        if self.params.comparison_metric == 'spearman':
            self.target_dsm = rankdata(target_dsm)

    def _call(self, dataset):
        data = dataset.samples
        if self.params.center_data:
            data = data - np.mean(data, 0)
        dsm = pdist(data, self.params.pairwise_metric)
        if self.params.comparison_metric == 'spearman':
            dsm = rankdata(dsm)
        rho, p = pearsonr(dsm, self.target_dsm)
        if self.params.corrcoef_only:
            return Dataset([rho], fa={'metrics': ['rho']})
        else:
            return Dataset([[rho, p]], fa={'metrics': ['rho', 'p']})
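
# Usage sketch (illustrative only, not part of the original module): compare a
# neural DSM with a model DSM.  The random data and the helper name
# `_demo_pdist_target_similarity` are assumptions for the example.
def _demo_pdist_target_similarity():
    import numpy as np
    from scipy.spatial.distance import pdist as sp_pdist
    from mvpa2.datasets import Dataset

    nconditions, nfeatures = 8, 50
    ds = Dataset(np.random.randn(nconditions, nfeatures))
    # model DSM in condensed (vector) form, length N*(N-1)/2 = 28
    target_dsm = sp_pdist(np.random.randn(nconditions, 3), 'euclidean')
    tsim = PDistTargetSimilarity(target_dsm, comparison_metric='pearson')
    # one-sample result dataset with features 'rho' and 'p'
    return tsim(ds)
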
class PDistConsistency(Measure):
    """Calculate the correlations of PDist measures across chunks

    This measures the consistency in similarity structure across runs
    within individuals, or across individuals if the target dataset is made
    from several subjects in some common space and where the sample
    attribute specified as the chunks_attr codes for subject identity.

    @author: ACC Aug 2013
    """
    is_trained = True
    """Indicate that this measure is always trained."""

    chunks_attr = Parameter('chunks', constraints='str', doc="""\
          Chunks attribute to use for chunking dataset. Can be any samples
          attribute.""")

    pairwise_metric = Parameter('correlation', constraints='str', doc="""\
          Distance metric to use for calculating dissimilarity matrices from
          the set of samples in each chunk specified. See
          spatial.distance.pdist for all possible metrics.""")

    consistency_metric = Parameter('pearson',
                                   constraints=EnsureChoice('pearson',
                                                            'spearman'),
                                   doc="""\
          Correlation measure to use for the correlation between dissimilarity
          matrices.""")

    center_data = Parameter(False, constraints='bool', doc="""\
          If True then center each column of the data matrix by subtracting
          the column mean from each element. This is recommended especially
          when using pairwise_metric='correlation'.""")

    square = Parameter(False, constraints='bool', doc="""\
          If True return the square distance matrix, if False, returns the
          flattened upper triangle.""")

    def __init__(self, **kwargs):
        """
        Returns
        -------
        Dataset
          Contains the pairwise correlations between the DSMs computed from
          each chunk of the input dataset. If square is False, this is a
          column vector of length N(N-1)/2 for N chunks. If square is True,
          this is a square matrix of size NxN for N chunks.
        """
        # TODO: Another metric for consistency metric could be the "Rv"
        #       coefficient...  (ac)
        # init base classes first
        Measure.__init__(self, **kwargs)

    def _call(self, dataset):
        """Computes the average correlation in similarity structure across
        chunks."""

        chunks_attr = self.params.chunks_attr
        nchunks = len(dataset.sa[chunks_attr].unique)
        if nchunks < 2:
            raise StandardError("This measure calculates similarity consistency across "
                                "chunks and is not meaningful for datasets with only "
                                "one chunk.")
        dsms = []
        chunks = []
        for chunk in dataset.sa[chunks_attr].unique:
            data = np.atleast_2d(
                dataset.samples[dataset.sa[chunks_attr].value == chunk, :])
            if self.params.center_data:
                data = data - np.mean(data, 0)
            dsm = pdist(data, self.params.pairwise_metric)
            dsms.append(dsm)
            chunks.append(chunk)
        dsms = np.vstack(dsms)

        if self.params.consistency_metric == 'spearman':
            dsms = np.apply_along_axis(rankdata, 1, dsms)
        corrmat = np.corrcoef(dsms)
        if self.params.square:
            ds = Dataset(corrmat, sa={self.params.chunks_attr: chunks})
        else:
            ds = Dataset(squareform(corrmat, checks=False),
                         sa=dict(pairs=list(combinations(chunks, 2))))
        return ds
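
# Usage sketch (illustrative only): consistency of DSMs across chunks, e.g.
# runs.  Random data and the helper name are assumptions for the example.
def _demo_pdist_consistency():
    import numpy as np
    from mvpa2.datasets import Dataset

    # 3 chunks (runs) x 6 conditions, 20 features each
    samples = np.random.randn(18, 20)
    ds = Dataset(samples, sa={'chunks': np.repeat([0, 1, 2], 6)})
    consistency = PDistConsistency(consistency_metric='pearson')
    # vector of pairwise chunk-to-chunk DSM correlations (sa.pairs labels them)
    return consistency(ds)
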
class GDA(Classifier):
    """Gaussian Discriminant Analysis -- base for LDA and QDA
    """

    __tags__ = ['binary', 'multiclass', 'oneclass']

    prior = Parameter('laplacian_smoothing',
             constraints=EnsureChoice('laplacian_smoothing', 'uniform',
                                      'ratio'),
             doc="""How to compute prior distribution.""")

    allow_pinv = Parameter(
        True, constraints='bool',
        doc="""Allow pseudo-inverse in case of degenerate covariance(s).""")

    def __init__(self, **kwargs):
        """Initialize a GDA classifier.
        """
        # init base class first
        Classifier.__init__(self, **kwargs)

        # pylint friendly initializations
        self.means = None
        """Means of features per class"""
        self.cov = None
        """Co-variances per class, but "vars" is taken ;)"""
        self.ulabels = None
        """Labels classifier was trained on"""
        self.priors = None
        """Class probabilities"""
        self.nsamples_per_class = None
        """Number of samples per class - used by derived classes"""

        # Define internal state of classifier
        self._norm_weight = None

    def _get_priors(self, nlabels, nsamples, nsamples_per_class):
        """Return prior probabilities given data
        """
        prior = self.params.prior
        if prior == 'uniform':
            priors = np.ones((nlabels,)) / nlabels
        elif prior == 'laplacian_smoothing':
            priors = (1 + np.squeeze(nsamples_per_class)) \
                     / (float(nsamples) + nlabels)
        elif prior == 'ratio':
            priors = np.squeeze(nsamples_per_class) / float(nsamples)
        else:
            raise ValueError, \
                  "No idea on how to handle '%s' way to compute priors" \
                  % self.params.prior
        return np.atleast_1d(priors)

    def _train(self, dataset):
        """Train the classifier using `dataset` (`Dataset`).
        """
        params = self.params
        targets_sa_name = self.get_space()
        targets_sa = dataset.sa[targets_sa_name]

        # get the dataset information into easy vars
        X = dataset.samples
        labels = targets_sa.value
        self.ulabels = ulabels = targets_sa.unique
        nlabels = len(ulabels)
        label2index = dict((l, il) for il, l in enumerate(ulabels))

        # set the feature dimensions
        nsamples = len(X)
        nfeatures = dataset.nfeatures

        self.means = means = \
                     np.zeros((nlabels, nfeatures))
        # degenerate dimensions are added for easy broadcasting later on
        # XXX might want to remove -- for now taken from GNB as is
        self.nsamples_per_class = nsamples_per_class \
                                  = np.zeros((nlabels, 1))
        self.cov = cov = \
                   np.zeros((nlabels, nfeatures, nfeatures))

        # Estimate cov
        # better loop than repmat! ;)
        for l, il in label2index.iteritems():
            Xl = X[labels == l]
            nsamples_per_class[il] = len(Xl)
            # TODO: degenerate case... no samples for known label for
            #       some reason?
            means[il] = np.mean(Xl, axis=0)
            # since we have means already lets do manually cov here
            Xldm = Xl - means[il]
            cov[il] = np.dot(Xldm.T, Xldm)
            # scaling will be done correspondingly in LDA or QDA

        # Store prior probabilities
        self.priors = self._get_priors(nlabels, nsamples, nsamples_per_class)

        if __debug__ and 'GDA' in debug.active:
            debug('GDA', "training finished on data.shape=%s " % (X.shape, )
                  + "min:max(data)=%f:%f" % (np.min(X), np.max(X)))

    def _untrain(self):
        """Untrain classifier and reset all learnt params
        """
        self.means = None
        self.cov = None
        self.ulabels = None
        self.priors = None
        super(GDA, self)._untrain()

    @accepts_dataset_as_samples
    def _predict(self, data):
        """Predict the output for the provided data.
        """
        params = self.params

        self.ca.estimates = prob_cs_cp = self._g_k(data)

        # Take the class with maximal (log)probability
        # XXX in GNB it is axis=0, i.e. classes were first
        winners = prob_cs_cp.argmax(axis=1)
        predictions = [self.ulabels[c] for c in winners]

        if __debug__ and 'GDA' in debug.active:
            debug('GDA',
                  "predict on data.shape=%s min:max(data)=%f:%f "
                  % (data.shape, np.min(data), np.max(data)))

        return predictions

    def _inv(self, cov):
        try:
            return np.linalg.inv(cov)
        except Exception, e:
            if self.params.allow_pinv:
                try:
                    return np.linalg.pinv(cov)
                except Exception, e:
                    pass
            raise DegenerateInputError, \
                  "Data is probably singular, since inverse fails. Got %s" \
                  % (e,)
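
# Usage sketch (illustrative only): GDA itself provides no `_g_k` and is meant
# as a base class.  The sketch assumes the LDA subclass is importable from
# mvpa2.clfs.gda; random data and the helper name are assumptions.
def _demo_gda_lda():
    import numpy as np
    from mvpa2.clfs.gda import LDA
    from mvpa2.datasets import Dataset

    samples = np.random.randn(40, 5)
    targets = np.repeat(['a', 'b'], 20)
    ds = Dataset(samples, sa={'targets': targets})
    clf = LDA(prior='uniform')
    clf.train(ds)
    return clf.predict(ds.samples)
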
class ProcrusteanMapper(ProjectionMapper):
    """Mapper to project from one space to another using Procrustean
    transformation (shift + scaling + rotation).

    Training this mapper requires data for both source and target space to be
    present in the training dataset. The source space data is taken from the
    training dataset's ``samples``, while the target space is taken from a
    sample attribute corresponding to the ``space`` setting of the
    ProcrusteanMapper.

    See: http://en.wikipedia.org/wiki/Procrustes_transformation
    """

    scaling = Parameter(True, constraints='bool',
              doc="""Estimate a global scaling factor for the transformation
                  (no longer rigid body)""")
    reflection = Parameter(True, constraints='bool',
              doc="""Allow for the data to be reflected (so it might not be a
                  rotation). Effective only for non-oblique transformations.
                  """)
    reduction = Parameter(True, constraints='bool',
              doc="""If true, it is allowed to map into lower-dimensional
                  space. Forward transformation might be suboptimal then and
                  reverse transformation might not recover all original
                  variance.""")
    oblique = Parameter(False, constraints='bool',
              doc="""Whether to allow non-orthogonal transformation -- might
                  heavily overfit the data if there are fewer samples than
                  dimensions. Use `oblique_rcond`.""")
    oblique_rcond = Parameter(-1, constraints='float',
              doc="""Cutoff for 'small' singular values to regularize the
                  inverse. See :class:`~numpy.linalg.lstsq` for more
                  information.""")
    svd = Parameter('numpy',
              constraints=EnsureChoice('numpy', 'scipy', 'dgesvd'),
              doc="""Implementation of SVD to use. dgesvd requires ctypes to
                  be available.""")

    def __init__(self, space='targets', **kwargs):
        ProjectionMapper.__init__(self, space=space, **kwargs)

        self._scale = None
        """Estimated scale"""
        if self.params.svd == 'dgesvd' and not externals.exists('liblapack.so'):
            warning("Reverting choice of svd for ProcrusteanMapper to be default "
                    "'numpy' since liblapack.so seems not to be available for "
                    "'dgesvd'")
            self.params.svd = 'numpy'

    def _train(self, source):
        params = self.params
        # Since it is unsupervised, we don't care about labels
        datas = ()
        odatas = ()
        means = ()
        shapes = ()

        assess_residuals = __debug__ and 'MAP_' in debug.active

        target = source.sa[self.get_space()].value

        for i, ds in enumerate((source, target)):
            if is_datasetlike(ds):
                data = np.asarray(ds.samples)
            else:
                data = ds
            if assess_residuals:
                odatas += (data,)
            if self._demean:
                if i == 0:
                    mean = self._offset_in
                else:
                    mean = data.mean(axis=0)
                data = data - mean
            else:
                # no demeaning === zero means
                mean = np.zeros(shape=data.shape[1:])
            means += (mean,)
            datas += (data,)
            shapes += (data.shape,)

        # shortcuts for sizes
        sn, sm = shapes[0]
        tn, tm = shapes[1]

        # Check the sizes
        if sn != tn:
            raise ValueError, "Data for both spaces should have the same " \
                  "number of samples. Got %d in source and %d in target space" \
                  % (sn, tn)

        # Sums of squares
        ssqs = [np.sum(d**2, axis=0) for d in datas]

        # XXX check for being invariant?
        #     needs to be tuned up properly and not raise but handle
        for i in xrange(2):
            if np.all(ssqs[i] <= np.abs((np.finfo(datas[i].dtype).eps
                                         * sn * means[i])**2)):
                raise ValueError, "For now do not handle invariant in time datasets"

        norms = [np.sqrt(np.sum(ssq)) for ssq in ssqs]
        normed = [data / norm for (data, norm) in zip(datas, norms)]

        # add new blank dimensions to source space if needed
        if sm < tm:
            normed[0] = np.hstack((normed[0], np.zeros((sn, tm - sm))))

        if sm > tm:
            if params.reduction:
                normed[1] = np.hstack((normed[1], np.zeros((sn, sm - tm))))
            else:
                raise ValueError, "reduction=False, so mapping from " \
                      "higher dimensionality " \
                      "source space is not supported. Source space had %d " \
                      "while target %d dimensions (features)" % (sm, tm)

        source, target = normed
        if params.oblique:
            # Just do silly linear system of equations ;) or naive
            # inverse problem
            if sn == sm and tm == 1:
                T = np.linalg.solve(source, target)
            else:
                T = np.linalg.lstsq(source, target,
                                    rcond=params.oblique_rcond)[0]
            ss = 1.0
        else:
            # Orthogonal transformation
            # figure out optimal rotation
            if params.svd == 'numpy':
                U, s, Vh = np.linalg.svd(np.dot(target.T, source),
                                         full_matrices=False)
            elif params.svd == 'scipy':
                # would raise exception if not present
                externals.exists('scipy', raise_=True)
                import scipy
                U, s, Vh = scipy.linalg.svd(np.dot(target.T, source),
                                            full_matrices=False)
            elif params.svd == 'dgesvd':
                from mvpa2.support.lapack_svd import svd as dgesvd
                U, s, Vh = dgesvd(np.dot(target.T, source),
                                  full_matrices=True, algo='svd')
            else:
                raise ValueError('Unknown type of svd %r' % (params.svd))
            T = np.dot(Vh.T, U.T)

            if not params.reflection:
                # then we need to assure that it is only rotation
                # "recipe" from
                # http://en.wikipedia.org/wiki/Orthogonal_Procrustes_problem
                # for more info and original references, see
                # http://dx.doi.org/10.1007%2FBF02289451
                s_new = np.ones_like(s)
                s_new[-1] = np.linalg.det(T)
                T = np.dot(Vh.T * s_new, U.T)

            # figure out scale and final translation
            if not params.reflection:
                ss = np.sum(s_new * s)
            else:
                ss = np.sum(s)

        # if we were to collect standardized distance
        # std_d = 1 - sD**2

        # select out only relevant dimensions
        if sm != tm:
            T = T[:sm, :tm]

        self._scale = scale = ss * norms[1] / norms[0]

        # Assign projection
        if self.params.scaling:
            proj = scale * T
        else:
            proj = T
        self._proj = proj

        if self._demean:
            self._offset_out = means[1]

        if __debug__ and 'MAP_' in debug.active:
            # compute the residuals
            res_f = self.forward(odatas[0])
            d_f = np.linalg.norm(odatas[1] - res_f) / np.linalg.norm(odatas[1])
            res_r = self.reverse(odatas[1])
            d_r = np.linalg.norm(odatas[0] - res_r) / np.linalg.norm(odatas[0])
            debug('MAP_', "%s, residuals are forward: %g,"
                  " reverse: %g" % (repr(self), d_f, d_r))

    def _compute_recon(self):
        """For Procrustean mapper, inverse is transpose.
        So, let's skip computing inverse in the super class.
        """
        # XXX Change pinv to superclass compute_recon?
        if self.params.oblique:
            # return ProjectionMapper._compute_recon(self)
            return np.linalg.pinv(self._proj)
        else:
            return np.transpose(self._proj / self._scale**2) \
                if self.params.scaling else np.transpose(self._proj)
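
# Usage sketch (illustrative only): align a point cloud to a rotated and
# scaled copy of itself.  Data and the helper name are assumptions.
def _demo_procrustean_mapper():
    import numpy as np
    from mvpa2.datasets import Dataset

    rng = np.random.RandomState(0)
    src = rng.randn(30, 3)
    # a random rotation (QR of a random matrix) plus scaling as the target space
    rot, _ = np.linalg.qr(rng.randn(3, 3))
    tgt = 2.5 * np.dot(src, rot)
    ds = Dataset(src, sa={'targets': tgt})
    pm = ProcrusteanMapper(space='targets')
    pm.train(ds)
    # forward-mapped source should approximate the target coordinates
    return pm.forward(src)
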
class GroupClusterThreshold_NN3(Learner):
    """Statistical evaluation of group-level average accuracy maps

    This algorithm can be used to perform cluster-thresholding of
    searchlight-based group analyses. It implements a two-stage procedure that
    uses the results of within-subject permutation analyses, estimates a per
    feature cluster forming threshold (via bootstrap), and uses the
    thresholded bootstrap samples to estimate the distribution of cluster
    sizes in group-average accuracy maps under the NULL hypothesis, as
    described in [1]_.

    Note: this class implements a modified version of that algorithm. The
    present implementation differs in, at least, four aspects from the
    description in that paper.

    1) Cluster p-values refer to the probability of observing a particular
       cluster size or a larger one (original paper: probability to observe a
       larger cluster only). Consequently, probabilities reported by this
       implementation will have a tendency to be higher in comparison.

    2) Clusters found in the original (unpermuted) accuracy map are always
       included in the NULL distribution estimate of cluster sizes. This
       provides an explicit lower bound for probabilities, as there will
       always be at least one observed cluster for every cluster size found
       in the original accuracy map. Consequently, it is impossible to get a
       probability of zero for clusters of any size (see [2]_ for more
       information).

    3) Bootstrap accuracy maps that contain no clusters are counted in a
       dedicated size-zero bin in the NULL distribution of cluster sizes.
       This change yields reliable cluster-probabilities even for very low
       featurewise threshold probabilities, where (some portion of) the
       bootstrap accuracy maps do not contain any clusters.

    4) The method for FWE-correction used by the original authors is not
       provided. Instead, a range of alternatives implemented by the
       statsmodels package are available.

    Moreover, this implementation minimizes the required memory demands and
    allows for computing large numbers of bootstrap samples without
    significant increase in memory demand (CPU time trade-off).

    Instances of this class must be trained before they can be used to
    threshold accuracy maps. The training dataset must match the following
    criteria:

    1) For every subject in the group, it must contain multiple accuracy maps
       that are the result of a within-subject classification analysis based
       on permuted class labels. One map must correspond to one fixed
       permutation for all features in the map, as described in [1]_. The
       original authors recommend 100 accuracy maps per subject for a typical
       searchlight analysis.

    2) It must contain a sample attribute indicating which sample is
       associated with which subject, because bootstrapping average accuracy
       maps is implemented by randomly drawing one map from each subject. The
       name of the attribute can be configured via the ``chunk_attr``
       parameter.

    After training, an instance can be called with a dataset to perform
    threshold and statistical evaluation. Unless a single-sample dataset is
    passed, all samples in the input dataset will be averaged prior to
    thresholding.

    Returns
    -------
    Dataset
       This is a shallow copy of the input dataset (after a potential
       averaging), hence contains the same data and attributes. In addition
       it includes the following attributes:

       ``fa.featurewise_thresh``
          Vector with feature-wise cluster-forming thresholds.

       ``fa.clusters_featurewise_thresh``
          Vector with labels for clusters after thresholding the input data
          with the desired feature-wise probability.
          Each unique non-zero element corresponds to an individual
          super-threshold cluster. Cluster values are sorted by cluster size
          (number of features). The largest cluster is always labeled with
          ``1``.

       ``fa.clusters_fwe_thresh``
          Vector with labels for super-threshold clusters after correction
          for multiple comparisons. The attribute is derived from
          ``fa.clusters_featurewise_thresh`` by removing all clusters that do
          not pass the threshold when controlling for the family-wise error
          rate.

       ``a.clusterstats``
          Record array with information on all detected clusters. The array
          is sorted according to cluster size, starting with the largest
          cluster in terms of number of features. The array contains the
          fields ``size`` (number of features comprising the cluster),
          ``mean``, ``median``, ``min``, ``max``, ``std`` (respective
          descriptive statistics for all clusters), and ``prob_raw``
          (probability of observing a cluster of this size or larger under
          the NULL hypothesis). If correction for multiple comparisons is
          enabled an additional field ``prob_corrected`` (probability after
          correction) is added.

       ``a.clusterlocations``
          Record array with information on the location of all detected
          clusters. The array is sorted according to cluster size (same order
          as ``a.clusterstats``). The array contains the fields ``max``
          (feature coordinate of the maximum score within the cluster), and
          ``center_of_mass`` (coordinate of the center of mass; weighted by
          the feature values within the cluster).

    References
    ----------
    .. [1] Johannes Stelzer, Yi Chen and Robert Turner (2013). Statistical
       inference and multiple testing correction in classification-based
       multi-voxel pattern analysis (MVPA): Random permutations and cluster
       size control. NeuroImage, 65, 69--82.
    .. [2] Smyth, G. K., & Phipson, B. (2010). Permutation P-values Should
       Never Be Zero: Calculating Exact P-values When Permutations Are
       Randomly Drawn. Statistical Applications in Genetics and Molecular
       Biology, 9, 1--12.
    """

    n_bootstrap = Parameter(
        100000, constraints=EnsureInt() & EnsureRange(min=1),
        doc="""Number of bootstrap samples to be generated from the training
            dataset. For each sample, an average map will be computed from a
            set of randomly drawn samples (one from each chunk). Bootstrap
            samples will be used to estimate a featurewise NULL distribution
            of accuracy values for initial thresholding, and to estimate the
            NULL distribution of cluster sizes under the NULL hypothesis. A
            larger number of bootstrap samples reduces the lower bound of
            probabilities, which may be beneficial for multiple comparison
            correction.""")

    feature_thresh_prob = Parameter(
        0.001, constraints=EnsureFloat() & EnsureRange(min=0.0, max=1.0),
        doc="""Feature-wise probability threshold. The value corresponding to
            this probability in the NULL distribution of accuracies will be
            used as threshold for cluster forming. Given that the NULL
            distribution is estimated per feature, the actual threshold value
            will vary across features, yielding a threshold vector. The
            number of bootstrap samples needs to be adequate for a desired
            probability.
            A ``ValueError`` is raised otherwise.""")

    chunk_attr = Parameter(
        'chunks',
        doc="""Name of the attribute indicating the individual chunks from
            which a single sample each is drawn for averaging into a bootstrap
            sample.""")

    fwe_rate = Parameter(
        0.05, constraints=EnsureFloat() & EnsureRange(min=0.0, max=1.0),
        doc="""Family-wise error rate for multiple comparison correction of
            cluster size probabilities.""")

    multicomp_correction = Parameter(
        'fdr_bh', constraints=EnsureChoice('bonferroni', 'sidak',
                                           'holm-sidak', 'holm',
                                           'simes-hochberg', 'hommel',
                                           'fdr_bh', 'fdr_by', None),
        doc="""Strategy for multiple comparison correction of cluster
            probabilities. All methods supported by statsmodels' ``multitest``
            are available. In addition, ``None`` can be specified to disable
            correction.""")

    n_blocks = Parameter(
        1, constraints=EnsureInt() & EnsureRange(min=1),
        doc="""Number of segments used to compute the feature-wise NULL
            distributions. This parameter determines the peak memory demand.
            In case of a single segment a matrix of size
            (n_bootstrap x nfeatures) will be allocated. Increasing the number
            of segments reduces the peak memory demand roughly by that
            factor.""")

    n_proc = Parameter(
        1, constraints=EnsureInt() & EnsureRange(min=1),
        doc="""Number of parallel processes to use for computation.
            Requires `joblib` external module.""")

    def __init__(self, **kwargs):
        # force disable auto-train: would make no sense
        Learner.__init__(self, auto_train=False, **kwargs)
        if 1. / (self.params.n_bootstrap + 1) > self.params.feature_thresh_prob:
            raise ValueError('number of bootstrap samples is insufficient for'
                             ' the desired threshold probability')
        self.untrain()

    def _untrain(self):
        self._thrmap = None
        self._null_cluster_sizes = None

    @due.dcite(
        Doi("10.1016/j.neuroimage.2012.09.063"),
        description="Statistical assessment of (searchlight) MVPA results",
        tags=['implementation'])
    def _train(self, ds):
        # shortcuts
        chunk_attr = self.params.chunk_attr
        #
        # Step 0: bootstrap maps by drawing one for each chunk and average
        # them (do N iterations)
        # this could take a lot of memory, hence instead of computing the maps
        # we compute the source maps they can be computed from and then
        # (re)build the matrix of bootstrapped maps either row-wise or
        # column-wise (as needed) to save memory by a factor of (close to)
        # `n_bootstrap`

        # which samples belong to which chunk
        chunk_samples = dict([(c, np.where(ds.sa[chunk_attr].value == c)[0])
                              for c in ds.sa[chunk_attr].unique])
        # pre-build the bootstrap combinations
        bcombos = [[random.sample(v, 1)[0] for v in chunk_samples.values()]
                   for i in xrange(self.params.n_bootstrap)]
        bcombos = np.array(bcombos, dtype=int)
        #
        # Step 1: find the per-feature threshold that corresponds to some p
        # in the NULL
        segwidth = ds.nfeatures / self.params.n_blocks
        # speed things up by operating on an array not a dataset
        ds_samples = ds.samples
        if __debug__:
            debug('GCTHR',
                  'Compute per-feature thresholds in %i blocks of %i features'
                  % (self.params.n_blocks, segwidth))
        # Execution can be done in parallel as the estimation is independent
        # across features

        def featuresegment_producer(ncols):
            for segstart in xrange(0, ds.nfeatures, ncols):
                # one average map for every stored bcombo
                # this also slices the input data into feature subsets
                # for the compute blocks
                yield [np.mean(
                       # get a view to a subset of the features
                       # -- should be somewhat efficient as feature axis is
                       # sliced
                       ds_samples[sidx, segstart:segstart + ncols],
                       axis=0)
                       for sidx in bcombos]

        if self.params.n_proc == 1:
            # Serial execution
            thrmap = np.hstack(  # merge across compute blocks
                [get_thresholding_map(d, self.params.feature_thresh_prob)
                 # compute a partial threshold map for as many features
                 # as fit into a compute block
                 for d in featuresegment_producer(segwidth)])
        else:
            # Parallel execution
            verbose_level_parallel = 50 \
                if (__debug__ and 'GCTHR' in debug.active) else 0
            # local import as only parallel execution needs this
            from joblib import Parallel, delayed
            # same code as above, just in parallel with joblib's Parallel
            thrmap = np.hstack(
                Parallel(n_jobs=self.params.n_proc,
                         pre_dispatch=self.params.n_proc,
                         verbose=verbose_level_parallel)(
                             delayed(get_thresholding_map)
                             (d, self.params.feature_thresh_prob)
                             for d in featuresegment_producer(segwidth)))
        # store for later thresholding of input data
        self._thrmap = thrmap
        #
        # Step 2: threshold all NULL maps and build distribution of NULL
        # cluster sizes
        #
        cluster_sizes = Counter()
        # recompute the bootstrap average maps to threshold them and determine
        # cluster sizes
        dsa = dict(mapper=ds.a.mapper) if 'mapper' in ds.a else {}
        if __debug__:
            debug('GCTHR', 'Estimating NULL distribution of cluster sizes')
        # this step can be computed in parallel chunks to speed things up
        if self.params.n_proc == 1:
            # Serial execution
            for sidx in bcombos:
                avgmap = np.mean(ds_samples[sidx], axis=0)[None]
                # apply threshold
                clustermap = avgmap > thrmap
                # wrap into a throw-away dataset to get the reverse mapping
                # right
                bds = Dataset(clustermap, a=dsa)
                # this function reverse-maps every sample one-by-one, hence no
                # need to collect chunks of bootstrapped maps
                cluster_sizes = get_cluster_sizes(bds, cluster_sizes)
        else:
            # Parallel execution
            # same code as above, just restructured for joblib's Parallel
            for jobres in Parallel(n_jobs=self.params.n_proc,
                                   pre_dispatch=self.params.n_proc,
                                   verbose=verbose_level_parallel)(
                                       delayed(get_cluster_sizes)
                                       (Dataset(np.mean(ds_samples[sidx],
                                                        axis=0)[None] > thrmap,
                                                a=dsa))
                                       for sidx in bcombos):
                # aggregate
                cluster_sizes += jobres
        # store cluster size histogram for later p-value evaluation
        # use a sparse matrix for easy consumption (max dim is the number of
        # features, i.e. biggest possible cluster)
        scl = dok_matrix((1, ds.nfeatures + 1), dtype=int)
        for s in cluster_sizes:
            scl[0, s] = cluster_sizes[s]
        self._null_cluster_sizes = scl

    def _call(self, ds):
        if len(ds) > 1:
            # average all samples into one, assuming we got something like one
            # sample per subject as input
            avgr = mean_sample()
            ds = avgr(ds)

        # threshold input; at this point we only have one sample left
        thrd = ds.samples[0] > self._thrmap

        # mapper default
        mapper = IdentityMapper()
        # overwrite if possible
        if hasattr(ds, 'a') and 'mapper' in ds.a:
            mapper = ds.a.mapper

        # reverse-map input
        othrd = _verified_reverse1(mapper, thrd)
        # TODO: what is your purpose in life osamp? ;-)
        osamp = _verified_reverse1(mapper, ds.samples[0])

        # prep output dataset
        outds = ds.copy(deep=False)
        outds.fa['featurewise_thresh'] = self._thrmap

        # determine clusters
        labels, num = measurements.label(othrd,
                                         structure=np.ones([3, 3, 3]))
        area = measurements.sum(othrd,
                                labels,
                                index=np.arange(1, num + 1)).astype(int)
        com = measurements.center_of_mass(
            osamp, labels=labels, index=np.arange(1, num + 1))
        maxpos = measurements.maximum_position(
            osamp, labels=labels, index=np.arange(1, num + 1))
        # for the rest we need the labels flattened
        labels = mapper.forward1(labels)
        # relabel clusters starting with the biggest and increase index with
        # decreasing size
        ordered_labels = np.zeros(labels.shape, dtype=int)
        ordered_area = np.zeros(area.shape, dtype=int)
        ordered_com = np.zeros((num, len(osamp.shape)), dtype=float)
        ordered_maxpos = np.zeros((num, len(osamp.shape)), dtype=float)
        for i, idx in enumerate(np.argsort(area)):
            ordered_labels[labels == idx + 1] = num - i
            # kinda ugly, but we are looping anyway
            ordered_area[i] = area[idx]
            ordered_com[i] = com[idx]
            ordered_maxpos[i] = maxpos[idx]
        labels = ordered_labels
        area = ordered_area[::-1]
        com = ordered_com[::-1]
        maxpos = ordered_maxpos[::-1]
        del ordered_labels  # this one can be big
        # store cluster labels after forward-mapping
        outds.fa['clusters_featurewise_thresh'] = labels.copy()
        # location info
        outds.a['clusterlocations'] = \
            np.rec.fromarrays(
                [com, maxpos], names=('center_of_mass', 'max'))

        # update cluster size histogram with the actual result to get a
        # proper lower bound for p-values
        # this will make a copy, because the original matrix is int
        cluster_probs_raw = _transform_to_pvals(
            area, self._null_cluster_sizes.astype('float'))

        clusterstats = (
            [area, cluster_probs_raw],
            ['size', 'prob_raw']
        )
        # evaluate a bunch of stats for all clusters
        morestats = {}
        for cid in xrange(len(area)):
            # keep clusters on outer loop, because selection is more expensive
            clvals = ds.samples[0, labels == cid + 1]
            for id_, fx in (
                    ('mean', np.mean),
                    ('median', np.median),
                    ('min', np.min),
                    ('max', np.max),
                    ('std', np.std)):
                stats = morestats.get(id_, [])
                stats.append(fx(clvals))
                morestats[id_] = stats

        for k, v in morestats.items():
            clusterstats[0].append(v)
            clusterstats[1].append(k)

        if self.params.multicomp_correction is not None:
            # do a local import as only this tiny portion needs statsmodels
            import statsmodels.stats.multitest as smm
            rej, probs_corr = smm.multipletests(
                cluster_probs_raw,
                alpha=self.params.fwe_rate,
                method=self.params.multicomp_correction)[:2]
            # store corrected per-cluster probabilities
            clusterstats[0].append(probs_corr)
            clusterstats[1].append('prob_corrected')
            # remove cluster labels that did not pass the FWE threshold
            for i, r in enumerate(rej):
                if not r:
                    labels[labels == i + 1] = 0
            outds.fa['clusters_fwe_thresh'] = labels
        outds.a['clusterstats'] = \
            np.rec.fromarrays(clusterstats[0], names=clusterstats[1])
        return outds
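
# Usage sketch (illustrative only): layout of a training dataset for the
# thresholder.  Real use would pass searchlight accuracy maps computed from
# permuted labels, and thresholding via __call__ would need a volumetric
# dataset (e.g. from fmri_dataset) whose mapper reverse-maps to 3-D so that
# clusterization with a 3x3x3 structure works.  All names and sizes below are
# assumptions for the example.
def _demo_group_cluster_threshold_training_data():
    import numpy as np
    from mvpa2.datasets import Dataset

    nsubjects, nperms, nfeatures = 5, 20, 100
    # one permutation accuracy map per row; 'chunks' codes subject identity
    perm_maps = np.random.rand(nsubjects * nperms, nfeatures)
    subject_ids = np.repeat(np.arange(nsubjects), nperms)
    ds = Dataset(perm_maps, sa={'chunks': subject_ids})
    clthr = GroupClusterThreshold_NN3(n_bootstrap=10000,
                                      feature_thresh_prob=0.001)
    clthr.train(ds)  # estimates featurewise thresholds and NULL cluster sizes
    return clthr
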
class GNB(Classifier):
    """Gaussian Naive Bayes `Classifier`.

    `GNB` is a probabilistic classifier relying on Bayes rule to estimate
    posterior probabilities of labels given the data. Its naive assumption
    is the independence of the features, which allows it to combine
    per-feature likelihoods by a simple product across likelihoods of
    "independent" features. See http://en.wikipedia.org/wiki/Naive_bayes
    for more information.

    The implementation provided here is "naive" on its own -- various
    aspects could be improved, but it has its own advantages:

    - implementation is simple and straightforward
    - no data copying while considering samples of specific class
    - provides alternative ways to assess prior distribution of the classes
      in the case of unbalanced sets of samples (see parameter `prior`)
    - makes use of NumPy broadcasting mechanism, so should be relatively
      efficient
    - should work for any dimensionality of samples

    `GNB` is listed both as linear and non-linear classifier, since specifics
    of separating boundary depends on the data and/or parameters: linear
    separation is achieved whenever samples are balanced (or
    ``prior='uniform'``) and features have the same variance across different
    classes (i.e. if ``common_variance=True`` to enforce this).

    Whenever decisions are made based on log-probabilities (parameter
    ``logprob=True``, which is the default), then conditional attribute
    `values`, if enabled, would also contain log-probabilities. Also note
    that normalization by the evidence (P(data)) is disabled by default
    since it has no impact per se on classification decision. You might
    like to set parameter normalize to True if you want to access properly
    scaled probabilities in `values` conditional attribute.
    """
    # XXX decide when should we set corresponding internal,
    #     since it depends actually on the data -- no clear way,
    #     so set both linear and non-linear
    __tags__ = ['gnb', 'linear', 'non-linear',
                'binary', 'multiclass']

    common_variance = Parameter(False, constraints='bool',
             doc="""Use the same variance across all classes.""")

    prior = Parameter('laplacian_smoothing',
             constraints=EnsureChoice('laplacian_smoothing', 'uniform',
                                      'ratio'),
             doc="""How to compute prior distribution.""")

    logprob = Parameter(True, constraints='bool',
             doc="""Operate on log probabilities.  Preferable to avoid unneeded
                 exponentiation and losing precision.
                 If set, logprobs are stored in `values`""")

    normalize = Parameter(False, constraints='bool',
             doc="""Normalize (log)prob by P(data).  Requires probabilities
                 thus for `logprob` case would require exponentiation of
                 'logprob's, thus disabled by default since does not impact
                 classification output.
                 """)

    def __init__(self, **kwargs):
        """Initialize a GNB classifier.
""" # init base class first Classifier.__init__(self, **kwargs) # pylint friendly initializations self.means = None """Means of features per class""" self.variances = None """Variances per class, but "vars" is taken ;)""" self.ulabels = None """Labels classifier was trained on""" self.priors = None """Class probabilities""" # Define internal state of classifier self._norm_weight = None def _get_priors(self, nlabels, nsamples, nsamples_per_class): """Return prior probabilities given data """ # helper function - squash all dimensions but 1 squash = lambda x: np.atleast_1d(x.squeeze()) prior = self.params.prior if prior == 'uniform': priors = np.ones((nlabels,))/nlabels elif prior == 'laplacian_smoothing': priors = (1+squash(nsamples_per_class)) \ / (float(nsamples) + nlabels) elif prior == 'ratio': priors = squash(nsamples_per_class) / float(nsamples) else: raise ValueError( "No idea on how to handle '%s' way to compute priors" % self.params.prior) return priors def _train(self, dataset): """Train the classifier using `dataset` (`Dataset`). """ params = self.params targets_sa_name = self.get_space() targets_sa = dataset.sa[targets_sa_name] # get the dataset information into easy vars X = dataset.samples labels = targets_sa.value self.ulabels = ulabels = targets_sa.unique nlabels = len(ulabels) label2index = dict((l, il) for il, l in enumerate(ulabels)) # set the feature dimensions nsamples = len(X) s_shape = X.shape[1:] # shape of a single sample self.means = means = \ np.zeros((nlabels, ) + s_shape) self.variances = variances = \ np.zeros((nlabels, ) + s_shape) # degenerate dimension are added for easy broadcasting later on nsamples_per_class = np.zeros((nlabels,) + (1,)*len(s_shape)) # Estimate means and number of samples per each label for s, l in zip(X, labels): il = label2index[l] # index of the label nsamples_per_class[il] += 1 means[il] += s # helper function - squash all dimensions but 1 squash = lambda x: np.atleast_1d(x.squeeze()) ## Actually compute the means non0labels = (squash(nsamples_per_class) != 0) means[non0labels] /= nsamples_per_class[non0labels] # Store prior probabilities self.priors = self._get_priors(nlabels, nsamples, nsamples_per_class) # Estimate variances # better loop than repmat! ;) for s, l in zip(X, labels): il = label2index[l] # index of the label variances[il] += (s - means[il])**2 ## Actually compute the variances if params.common_variance: # we need to get global std cvar = np.sum(variances, axis=0)/nsamples # sum across labels # broadcast the same variance across labels variances[:] = cvar else: variances[non0labels] /= nsamples_per_class[non0labels] # Precompute and store weighting coefficient for Gaussian if params.logprob: # it would be added to exponent self._norm_weight = -0.5 * np.log(2*np.pi*variances) else: self._norm_weight = 1.0/np.sqrt(2*np.pi*variances) if __debug__ and 'GNB' in debug.active: debug('GNB', "training finished on data.shape=%s " % (X.shape, ) + "min:max(data)=%f:%f" % (np.min(X), np.max(X))) def _untrain(self): """Untrain classifier and reset all learnt params """ self.means = None self.variances = None self.ulabels = None self.priors = None super(GNB, self)._untrain() @accepts_dataset_as_samples def _predict(self, data): """Predict the output for the provided data. 
""" params = self.params # argument of exponentiation scaled_distances = \ -0.5 * (((data - self.means[:, np.newaxis, ...])**2) \ / self.variances[:, np.newaxis, ...]) if params.logprob: # if self.params.common_variance: # XXX YOH: # For decision there is no need to actually compute # properly scaled p, ie 1/sqrt(2pi * sigma_i) could be # simply discarded since it is common across features AND # classes # For completeness -- computing everything now even in logprob lprob_csfs = self._norm_weight[:, np.newaxis, ...] \ + scaled_distances # XXX for now just cut/paste with different operators, but # could just bind them and reuse in the same equations # Naive part -- just a product of probabilities across features ## First we need to reshape to get class x samples x features lprob_csf = lprob_csfs.reshape( lprob_csfs.shape[:2] + (-1,)) ## Now -- sum across features lprob_cs = lprob_csf.sum(axis=2) # Incorporate class probabilities: prob_cs_cp = lprob_cs + np.log(self.priors[:, np.newaxis]) else: # Just a regular Normal distribution with per # feature/class mean and variances prob_csfs = \ self._norm_weight[:, np.newaxis, ...] \ * np.exp(scaled_distances) # Naive part -- just a product of probabilities across features ## First we need to reshape to get class x samples x features prob_csf = prob_csfs.reshape( prob_csfs.shape[:2] + (-1,)) ## Now -- product across features prob_cs = prob_csf.prod(axis=2) # Incorporate class probabilities: prob_cs_cp = prob_cs * self.priors[:, np.newaxis] # Normalize by evidence P(data) if params.normalize: if params.logprob: prob_cs_cp_real = np.exp(prob_cs_cp) else: prob_cs_cp_real = prob_cs_cp prob_s_cp_marginals = np.sum(prob_cs_cp_real, axis=0) if params.logprob: prob_cs_cp -= np.log(prob_s_cp_marginals) else: prob_cs_cp /= prob_s_cp_marginals # Take the class with maximal (log)probability winners = prob_cs_cp.argmax(axis=0) predictions = [self.ulabels[c] for c in winners] # set to the probabilities per class self.ca.estimates = prob_cs_cp.T if __debug__ and 'GNB' in debug.active: debug('GNB', "predict on data.shape=%s min:max(data)=%f:%f " % (data.shape, np.min(data), np.max(data))) return predictions
class Regression(Measure):
    """
    Given a dataset, compute regularized regression (Ridge or Lasso) on the
    computed neural dissimilarity matrix using an arbitrary number of
    predictors (model dissimilarity matrices).

    Requires scikit-learn.
    """

    is_trained = True
    """Indicate that this measure is always trained."""

    # copied from PDist class XXX: ok or pass it in kwargs?
    pairwise_metric = Parameter('correlation', constraints='str', doc="""\
          Distance metric to use for calculating pairwise vector distances for
          dissimilarity matrix (DSM).  See scipy.spatial.distance.pdist for
          all possible metrics.""")

    center_data = Parameter(False, constraints='bool', doc="""\
          If True then center each column of the data matrix by subtracting
          the column mean from each element. This is recommended especially
          when using pairwise_metric='correlation'.""")

    method = Parameter('ridge', constraints=EnsureChoice('ridge', 'lasso'),
                       doc='Compute Ridge (l2) or Lasso (l1) regression')

    alpha = Parameter(1.0, constraints='float',
                      doc='alpha parameter for lasso regression')

    fit_intercept = Parameter(True, constraints='bool',
                              doc='whether to fit the intercept')

    rank_data = Parameter(True, constraints='bool',
                          doc='whether to rank the neural dsm and the '
                              'predictor dsms before running the regression '
                              'model')

    normalize = Parameter(False, constraints='bool',
                          doc='if True the predictors and neural dsm will be '
                              'normalized (z-scored) prior to the regression '
                              '(and after the data ranking, if '
                              'rank_data=True)')

    def __init__(self, predictors, keep_pairs=None, **kwargs):
        """
        Parameters
        ----------
        predictors : array (N*(N-1)/2, n_predictors)
            array containing the upper triangular matrix in vector form of the
            predictor Dissimilarity Matrices. Each column is a predictor dsm.

        keep_pairs : None or list or array
            indices in range(N*(N-1)/2) to keep before running the regression.
            All other elements will be removed. If None, the regression is run
            on the entire DSM.

        Returns
        -------
        Dataset
            a dataset with n_predictors samples and one feature. If
            fit_intercept is True, the last sample is the intercept.
        """
        super(Regression, self).__init__(**kwargs)

        if len(predictors.shape) == 1:
            raise ValueError('predictors have shape {0}. Make sure the array '
                             'is at least 2d and transposed correctly'.format(
                                 predictors.shape))
        self.predictors = predictors
        self.keep_pairs = keep_pairs

    def _call(self, dataset):
        externals.exists('skl', raise_=True)
        from sklearn.linear_model import Lasso, Ridge
        from sklearn.preprocessing import scale

        # first run PDist
        compute_dsm = PDist(pairwise_metric=self.params.pairwise_metric,
                            center_data=self.params.center_data)
        dsm = compute_dsm(dataset)
        dsm_samples = dsm.samples

        if self.params.rank_data:
            dsm_samples = rankdata(dsm_samples)
            predictors = np.apply_along_axis(rankdata, 0, self.predictors)
        else:
            predictors = self.predictors

        if self.params.normalize:
            predictors = scale(predictors, axis=0)
            dsm_samples = scale(dsm_samples, axis=0)

        # keep only the pairs we want
        if self.keep_pairs is not None:
            dsm_samples = dsm_samples[self.keep_pairs]
            predictors = predictors[self.keep_pairs, :]

        # check that predictors and samples have the correct dimensions
        if dsm_samples.shape[0] != predictors.shape[0]:
            raise ValueError('computed dsm has {0} rows, while predictors '
                             'have {1} rows. Check that predictors have the '
                             'right shape'.format(dsm_samples.shape[0],
                                                  predictors.shape[0]))

        # now fit the regression
        if self.params.method == 'lasso':
            reg = Lasso
        elif self.params.method == 'ridge':
            reg = Ridge
        else:
            raise ValueError('I do not know method {0}'.format(
                self.params.method))
        reg_ = reg(alpha=self.params.alpha,
                   fit_intercept=self.params.fit_intercept)
        reg_.fit(predictors, dsm_samples)

        coefs = reg_.coef_.reshape(-1, 1)

        sa = ['coef' + str(i) for i in range(len(coefs))]

        if self.params.fit_intercept:
            coefs = np.vstack((coefs, reg_.intercept_))
            sa += ['intercept']

        return Dataset(coefs, sa={'coefs': sa})
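
# Usage sketch (illustrative only, requires scikit-learn): ridge regression of
# a neural DSM on two model DSMs.  Random data and the helper name are
# assumptions for the example.
def _demo_rsa_regression():
    import numpy as np
    from scipy.spatial.distance import pdist as sp_pdist
    from mvpa2.datasets import Dataset

    nconditions, nfeatures = 10, 40
    ds = Dataset(np.random.randn(nconditions, nfeatures))
    # two predictor DSMs as columns, each of length N*(N-1)/2 = 45
    predictors = np.column_stack(
        [sp_pdist(np.random.randn(nconditions, 3)) for _ in range(2)])
    reg = Regression(predictors, method='ridge', alpha=1.0)
    # result samples: coef0, coef1 and (by default) the intercept
    return reg(ds)
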
class IIRFilterMapper(Mapper):
    """Mapper using IIR filters for data transformation.

    This mapper is able to perform any IIR-based low-pass, high-pass, or
    band-pass frequency filtering. This is a front-end for SciPy's filtfilt(),
    hence its usage looks almost exactly identical, and any of SciPy's IIR
    filters can be used with this mapper:

    >>> from scipy import signal
    >>> b, a = signal.butter(8, 0.125)
    >>> mapper = IIRFilterMapper(b, a, padlen=150)
    """

    axis = Parameter(
        0, constraints='int',
        doc="""The axis of `x` to which the filter is applied. By default the
            filter is applied to all features along the samples axis""")

    padtype = Parameter(
        'odd',
        constraints=EnsureChoice('odd', 'even', 'constant') | EnsureNone(),
        doc="""Must be 'odd', 'even', 'constant', or None. This determines the
            type of extension to use for the padded signal to which the filter
            is applied. If `padtype` is None, no padding is used. The default
            is 'odd'""")

    padlen = Parameter(
        None, constraints=EnsureInt() | EnsureNone(),
        doc="""The number of elements by which to extend `x` at both ends of
            `axis` before applying the filter. This value must be less than
            `x.shape[axis]-1`. `padlen=0` implies no padding. The default
            value is 3*max(len(a),len(b))""")

    def __init__(self, b, a, **kwargs):
        """
        All constructor parameters are analogs of filtfilt() or are passed
        on to the Mapper base class.

        Parameters
        ----------
        b : (N,) array_like
            The numerator coefficient vector of the filter.
        a : (N,) array_like
            The denominator coefficient vector of the filter. If a[0] is
            not 1, then both a and b are normalized by a[0].
        """
        Mapper.__init__(self, auto_train=True, **kwargs)
        self.__iir_num = b
        self.__iir_denom = a

    def _forward_data(self, data):
        params = self.params
        try:
            mapped = filtfilt(self.__iir_num,
                              self.__iir_denom,
                              data,
                              axis=params.axis,
                              padtype=params.padtype,
                              padlen=params.padlen)
        except TypeError:
            # we have an ancient scipy, do manually
            # but it will only support 2d arrays
            if params.axis == 0:
                data = data.T
            if params.axis > 1:
                raise ValueError("this version of scipy does not "
                                 "support nd-arrays for filtfilt()")
            if not (params['padlen'].is_default
                    and params['padtype'].is_default):
                warning("this version of scipy.signal.filtfilt() does not "
                        "support `padlen` and `padtype` arguments -- ignoring "
                        "them")
            mapped = [filtfilt(self.__iir_num,
                               self.__iir_denom,
                               x)
                      for x in data]
            mapped = np.array(mapped)
            if params.axis == 0:
                mapped = mapped.T
        return mapped
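
# Usage sketch (illustrative only): low-pass filter a noisy two-channel signal
# along the samples axis.  The signal and the helper name are assumptions.
def _demo_iir_filter_mapper():
    import numpy as np
    from scipy import signal
    from mvpa2.datasets import Dataset

    t = np.linspace(0, 1, 500)
    # slow sinusoid plus high-frequency noise in two channels (features)
    clean = np.column_stack((np.sin(2 * np.pi * 2 * t),
                             np.cos(2 * np.pi * 2 * t)))
    data = clean + 0.2 * np.random.randn(500, 2)
    b, a = signal.butter(4, 0.05)
    mapper = IIRFilterMapper(b, a)
    # auto-trains on call; returns a dataset with zero-phase filtered samples
    return mapper(Dataset(data))
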
class PCorrTargetSimilarity(Measure):
    """Calculate the partial correlations of a neural RDM with more than two
    target RDMs.

    This measure can be used for example when comparing a neural RDM with
    more than one behavioral RDM, and one desires to look only at the
    correlations of the residuals.

    NOTA BENE: this measure computes a distance internally through
    scipy.spatial.pdist, thus you should make sure that the predictors are in
    the correct direction, that is, smaller values imply higher similarity!
    """

    is_trained = True
    """Indicate that this measure is always trained."""

    pairwise_metric = Parameter('correlation', constraints='str', doc="""\
          Distance metric to use for calculating pairwise vector distances for
          the neural dissimilarity matrix (DSM).  See
          scipy.spatial.distance.pdist for all possible metrics.""")

    correlation_type = Parameter('spearman',
                                 constraints=EnsureChoice('spearman',
                                                          'pearson'),
                                 doc="""\
          Type of correlation to use between the computed neural RDM and the
          target RDMs. If spearman, the residuals are ranked prior to the
          correlation.""")

    normalize_rdms = Parameter(True, constraints='bool', doc="""\
          If True then center and normalize each column of the neural RDM and
          of the predictor RDMs by subtracting the column mean from each
          element and imposing unit L2 norm.""")

    def __init__(self, target_rdms, **kwargs):
        """
        Parameters
        ----------
        target_rdms : array (length N*(N-1)/2, n_predictors)
          Target dissimilarity matrices
        """
        # init base classes first
        super(PCorrTargetSimilarity, self).__init__(**kwargs)
        self.target_rdms = target_rdms
        self.corrfx = CORRFX[self.params.correlation_type]
        self.normalize_rdms = self.params.normalize_rdms

    def _call(self, dataset):
        """
        Parameters
        ----------
        dataset : input dataset

        Returns
        -------
        Dataset
          Each sample `i` corresponds to the partial correlation between the
          neural RDM and the `target_dsms[:, i]` partialling out
          `target_dsms[:, j]` with `j != i`.
        """
        data = dataset.samples
        dsm = pdist(data, self.params.pairwise_metric)
        rp = pcorr(dsm[:, None],
                   self.target_rdms,
                   corrfx=self.corrfx,
                   normalize=self.normalize_rdms)
        return Dataset(rp[:, None])
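
# Usage sketch (illustrative only): partial correlation of a neural RDM with
# two target RDMs.  Random data and the helper name are assumptions.
def _demo_pcorr_target_similarity():
    import numpy as np
    from scipy.spatial.distance import pdist as sp_pdist
    from mvpa2.datasets import Dataset

    nconditions, nfeatures = 12, 30
    ds = Dataset(np.random.randn(nconditions, nfeatures))
    # two target RDMs as columns (condensed form, length N*(N-1)/2 = 66)
    target_rdms = np.column_stack(
        [sp_pdist(np.random.randn(nconditions, 4)) for _ in range(2)])
    tsim = PCorrTargetSimilarity(target_rdms, correlation_type='spearman')
    # one sample per target RDM: its partial correlation with the neural RDM
    return tsim(ds)
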