def handle_arg(arg):
    """Return `arg` as a 3D volume, loading it from a file if necessary.

    Parameters
    ----------
    arg : None, str, SpatialImage or array-like
      ``None`` is passed through untouched.  A string is taken as a
      filename and loaded via ``nb.load``.  Anything else has to provide a
      ``shape`` attribute.

    Returns
    -------
    None, SpatialImage or array-like
      The (possibly reduced) 3D volume.  For 4D input only the first
      volume along the last axis is kept.

    Raises
    ------
    ValueError
      If the data is neither 3D nor 4D.
    """
    if arg is None:
        return arg

    if isinstance(arg, basestring):
        arg = nb.load(arg)
        argshape = arg.shape
        # Assure that we have 3D (at least)
        if len(argshape) < 3:
            # Keep `argshape` in sync with the padded image; otherwise the
            # dimensionality check below would reject what we just fixed.
            argshape = argshape + (1,) * (3 - len(argshape))
            arg = nb.Nifti1Image(arg.get_data().reshape(argshape),
                                 arg.affine, arg.header)
    else:
        argshape = arg.shape

    if len(argshape) == 4:
        if argshape[-1] > 1:
            warning("For now plot_lightbox can handle only 3d, 4d data was provided."
                    " Plotting only the first volume")
        if isinstance(arg, SpatialImage):
            arg = nb.Nifti1Image(arg.get_data()[..., 0],
                                 arg.affine, arg.header)
        else:
            arg = arg[..., 0]
    elif len(argshape) != 3:
        raise ValueError("For now just handling 3D volumes")
    return arg
def _check_cosmo_dataset(cosmo):
    '''Validate a cosmo structure before it is turned into a dataset.

    Two checks are made:

    (1) a missing ``samples`` field raises an error;
    (2) a warning is issued if the largest absolute finite, non-zero sample
        is extremely large or extremely small (e.g. MEEG data with values
        in the order of 1e-25, which affects some classifiers).

    Parameters
    ----------
    cosmo : dict-like
      Must provide ``get()``; the ``'samples'`` entry is inspected.

    Raises
    ------
    KeyError
      If there is no ``'samples'`` entry (or it is None).
    '''
    samples = cosmo.get('samples')
    if samples is None:
        raise KeyError("Missing field .samples in %s" % cosmo)

    # order of magnitude (in either direction) beyond which we complain
    magnitude_limit = 10

    # NaNs, infinities and zeros carry no information about the scale
    usable = np.logical_and(np.isfinite(samples), samples != 0)
    if not np.any(usable):
        return

    largest = np.max(np.abs(samples[usable]))
    # decimal order of magnitude of the largest absolute value
    magnitude = np.log10(largest)

    if abs(magnitude) > magnitude_limit:
        warning(
            'Samples have extreme values, maximum absolute value is %s; '
            'This may affect some analyses. Considering scaling the samples, '
            'e.g. by a factor of 10**%d ' % (largest, -magnitude))
def stability_assurance(cdf):
    """Warn if `cdf` values leave [0, 1] while 'CHECK_STABILITY' debugging is on.

    Relies on ``name`` and ``cdf_func`` being available from the enclosing
    scope; they are only used to compose the warning message.
    """
    if not (__debug__ and 'CHECK_STABILITY' in debug.active):
        return
    lowest, highest = np.min(cdf), np.max(cdf)
    if lowest < 0 or highest > 1.0:
        suffix = ' for %s' % name if name is not None else ''
        warning('Stability check of cdf %s failed%s. Min=%s, max=%s'
                % (cdf_func, suffix, lowest, highest))
def label_voxel(self, c, levels=None):
    """Label voxel `c` via the reference atlas.

    The closest referenced point is looked up in ``self._data``.  If it lies
    within ``self.distance`` (with a 1e-3 tolerance) the label of that point
    is reported, otherwise the label of `c` itself.

    Parameters
    ----------
    c : sequence
      Voxel coordinate; it is passed through ``self._check_range`` first.
    levels : optional
      Forwarded unchanged to the reference atlas' ``label_voxel``.

    Returns
    -------
    dict-like
      Result of the reference atlas' ``label_voxel`` extended with the
      'voxel_referenced' and 'distance' entries.
    """
    if self.__referenceLevel is None:
        warning("You did not provide what level to use "
                "for reference. Assigning 0th level -- '%s'"
                % (self._levels[0],))
        self.set_reference_level(0)

    c = self._check_range(c)

    # coordinates of the closest referenced voxel and its distance to `c`
    level_indexes = self.__referenceLevel.indexes
    cref = self._data[level_indexes, c[0], c[1], c[2]]
    dist = norm((cref - c) * self.voxdim)
    if __debug__:
        debug('ATL__', "Closest referenced point for %r is "
              "%r at distance %3.2f" % (c, cref, dist))

    # differences below 1e-3 are neglected
    if (self.distance - dist) >= 1e-3:
        result = self.__referenceAtlas.label_voxel(cref, levels)
        # NOTE(review): stores `c`, not `cref` -- confirm this is intended
        referenced, reported_distance = c, dist
    else:
        result = self.__referenceAtlas.label_voxel(c, levels)
        if __debug__:
            debug('ATL__', "Closest referenced point is "
                  "further than desired distance %.2f" % self.distance)
        referenced, reported_distance = None, 0

    result['voxel_referenced'] = referenced
    result['distance'] = reported_distance
    return result
def __init__(self, **kwargs):
    """Initialize an SMLR classifier.

    All keyword arguments are handed over to ``Classifier.__init__``.  If the
    'C' implementation is requested while the compiled module is missing, a
    warning is issued and the pure Python implementation is selected.
    """
    # TODO:
    #  - Add in likelihood calculation
    #  - Add kernels, not just direct methods.

    # init base class first
    Classifier.__init__(self, **kwargs)

    c_requested = self.params.implementation == 'C'
    if _cStepwiseRegression is None and c_requested:
        warning('SMLR: C implementation is not available.'
                ' Using pure Python one')
        self.params.implementation = 'Python'

    # pylint friendly initializations

    # Unique labels from the training set.
    self._ulabels = None

    # Contains all weights including bias values
    self.__weights_all = None

    # Just the weights, without the biases
    self.__weights = None

    # The biases, will remain None if has_bias is False
    self.__biases = None
def _SLcholesky_autoreg(C, nsteps=None, **kwargs):
    """Cholesky decomposition with incremental diagonal regularization.

    Starting from the machine epsilon of ``C.dtype``, the diagonal boost is
    increased 10-fold per step until ``SLcholesky`` succeeds.

    Parameters
    ----------
    C : ndarray
      Square matrix to decompose.
    nsteps : int, optional
      Maximal number of attempts.  If None, it is derived from the float
      epsilon so that the boost would proceed until reaching 1.
    **kwargs
      Accepted for interface compatibility; not used here.

    Returns
    -------
    ndarray or None
      Lower-triangular factor of the first successful attempt, or None if
      no attempt was made (``nsteps`` of 0).

    Raises
    ------
    SLAError
      Re-raised if the last attempt fails too.
    """
    if nsteps is None:
        nsteps = -int(np.floor(np.log10(np.finfo(float).eps)))

    result = None
    for step in range(nsteps):
        epsilon_value = (10 ** step) * np.finfo(C.dtype).eps
        epsilon = epsilon_value * np.eye(C.shape[0])
        try:
            result = SLcholesky(C + epsilon, lower=True)
        except SLAError as e:
            warning("Cholesky decomposition lead to failure: %s. "
                    "As requested, performing auto-regularization but "
                    "for better control you might prefer to regularize "
                    "yourself by providing lm parameter to GPR" % e)
            if step >= nsteps - 1:
                # ran out of attempts -- let the caller see the failure
                raise
            if __debug__:
                debug("GPR", "Failed to obtain cholesky on "
                      "auto-regularization step %d value %g. Got %s."
                      " Boosting lambda more to reg. C."
                      % (step, epsilon_value, e))
            continue
        # decomposition succeeded -- no reason to regularize any further
        break

    return result
def to_npz(self, filename, compress=True):
    """Save dataset to a .npz file storing all fa/sa/a which are ndarrays

    Attributes whose value is not an ndarray are left out, and a single
    warning listing them is issued.

    Parameters
    ----------
    filename : str
      Target file; '.npz' is appended if it is not the extension already.
    compress : bool, optional
      If True, savez_compressed is used, otherwise savez.

    Returns
    -------
    Whatever the chosen numpy save function returns.
    """
    if not filename.endswith('.npz'):
        filename += '.npz'
    writer = np.savez_compressed if compress else np.savez

    payload = {'samples': self.samples}
    rejected = []
    for prefix in ('a', 'fa', 'sa'):
        collection = getattr(self, prefix)
        for key in collection:
            value = collection[key].value
            label = '%s.%s' % (prefix, key)
            if not isinstance(value, np.ndarray):
                rejected.append(label)
                continue
            payload[label] = value

    if rejected:
        warning("Skipping %s since not ndarrays" % (', '.join(rejected)))
    return writer(filename, **payload)
def run(args):
    """Execute user-provided code and optionally store resulting variables.

    Parameters
    ----------
    args
      Parsed command line arguments.  The attributes read here are
      ``store``, ``output``, ``data``, ``eval`` and ``hdf5_compression``.

    Raises
    ------
    ValueError
      If variables should be stored but no output was specified.
    """
    # NOTE: local variable names in this function are visible to the executed
    # code and to the ``locals()`` lookup at the end -- do not rename them.
    if not args.store is None and args.output is None:
        raise ValueError("--output is require for result storage")
    if not args.data is None:
        dss = [arg2ds(d) for d in args.data]
        if len(dss):
            # convenience short-cut
            ds = dss[0]
    # best effort: offer nose's helpers as `nt` to the executed code
    try:
        import nose.tools as nt
    except ImportError:
        pass
    # NOTE(review): arbitrary code is executed here by design; each entry of
    # args.eval is either '-' (read code from stdin), a path to a file, or a
    # code string.
    for expr in args.eval:
        if expr == '-':
            exec sys.stdin
        elif os.path.isfile(expr):
            execfile(expr, globals(), locals())
        else:
            exec expr
    if not args.store is None:
        out = {}
        # pick the requested variables from the local name space
        for var in args.store:
            try:
                out[var] = locals()[var]
            except KeyError:
                warning("'%s' not found in local name space -- skipped." % var)
        # only write a file if there is something to store
        if len(out):
            ds2hdf5(out, args.output, compression=args.hdf5_compression)
def _SLcholesky_autoreg(C, nsteps=None, **kwargs):
    """Cholesky decomposition with incremental diagonal regularization.

    Starting from the machine epsilon of ``C.dtype``, the diagonal boost is
    increased 10-fold per step until ``SLcholesky`` succeeds.

    Parameters
    ----------
    C : ndarray
      Square matrix to decompose.
    nsteps : int, optional
      Maximal number of attempts.  If None, it is derived from the float
      epsilon so that the boost would proceed until reaching 1.
    **kwargs
      Accepted for interface compatibility; not used here.

    Returns
    -------
    ndarray or None
      Lower-triangular factor of the first successful attempt, or None if
      no attempt was made (``nsteps`` of 0).

    Raises
    ------
    SLAError
      Re-raised if the last attempt fails too.
    """
    if nsteps is None:
        nsteps = -int(np.floor(np.log10(np.finfo(float).eps)))

    result = None
    for step in range(nsteps):
        epsilon_value = (10 ** step) * np.finfo(C.dtype).eps
        epsilon = epsilon_value * np.eye(C.shape[0])
        try:
            result = SLcholesky(C + epsilon, lower=True)
        except SLAError as e:
            warning("Cholesky decomposition lead to failure: %s. "
                    "As requested, performing auto-regularization but "
                    "for better control you might prefer to regularize "
                    "yourself by providing lm parameter to GPR" % e)
            if step >= nsteps - 1:
                # ran out of attempts -- let the caller see the failure
                raise
            if __debug__:
                debug("GPR", "Failed to obtain cholesky on "
                      "auto-regularization step %d value %g. Got %s."
                      " Boosting lambda more to reg. C."
                      % (step, epsilon_value, e))
            continue
        # decomposition succeeded -- no reason to regularize any further
        break

    return result
def run(args):
    """Execute user-provided code and optionally store resulting variables.

    Parameters
    ----------
    args
      Parsed command line arguments.  The attributes read here are
      ``store``, ``output``, ``data``, ``eval`` and ``hdf5_compression``.

    Raises
    ------
    ValueError
      If variables should be stored but no output was specified.
    """
    # NOTE: local variable names in this function are visible to the executed
    # code and to the ``locals()`` lookup at the end -- do not rename them.
    if args.store is not None and args.output is None:
        raise ValueError("--output is require for result storage")
    if args.data is not None:
        dss = [arg2ds(d) for d in args.data]
        if len(dss):
            # convenience short-cut
            ds = dss[0]
    # best effort: offer nose's helpers as `nt` to the executed code
    try:
        import nose.tools as nt
    except ImportError:
        pass
    # NOTE(review): arbitrary code is executed here by design; each entry of
    # args.eval is either '-' (read code from stdin), a path to a file, or a
    # code string.
    for expr in args.eval:
        if expr == '-':
            exec sys.stdin
        elif os.path.isfile(expr):
            execfile(expr, globals(), locals())
        else:
            exec expr
    if args.store is not None:
        out = {}
        # pick the requested variables from the local name space
        for var in args.store:
            try:
                out[var] = locals()[var]
            except KeyError:
                warning("'%s' not found in local name space -- skipped." % var)
        # only write a file if there is something to store
        if len(out):
            ds2hdf5(out, args.output, compression=args.hdf5_compression)
def _check(self):
    '''Ensure that the different fields are sort of consistent.

    Hard inconsistencies (missing fields, wrong array shapes) raise an
    exception.  Vertices that are not referenced by any face do not
    invalidate the surface, hence they only trigger warnings.

    Raises
    ------
    Exception
      If a required field is missing, or if vertices or faces do not have
      the shape ``(count, 3)``.
    '''
    fields = ['_v', '_f', '_nv', '_nf']
    if not all(hasattr(self, field) for field in fields):
        raise Exception("Incomplete surface!")

    if self._v.shape != (self._nv, 3):
        raise Exception("Wrong shape for vertices")

    if self._f.shape != (self._nf, 3):
        raise Exception("Wrong shape for faces")

    # see if all faces have a corresponding node.
    # actually this would not invalidate the surface, so
    # we only give a warning
    unqf = np.unique(self._f)
    if unqf.size != self._nv:
        from mvpa2.base import warning
        warning("Count mismatch for face range (%d!=%d), "
                "faces without node: %r"
                % (unqf.size, self._nv,
                   len(set(range(self._nv)) - set(unqf))))

    # array_equal() copes with differently sized arrays, whereas an
    # elementwise `!=` raises for them in recent numpy versions and would
    # turn this soft check into a crash
    if not np.array_equal(unqf, np.arange(self._nv)):
        from mvpa2.base import warning
        warning("Missing values in faces")
def test_confusion_based_error(self, l_clf):
    """ConfusionBasedError has to agree with a TransferMeasure on train data."""
    train_ds = datasets['uni2medium']
    train_ds = train_ds[train_ds.sa.train == 1]

    # to check if we fail to classify for 3 labels
    three_label_ds = datasets['uni3medium']
    three_label_ds = three_label_ds[three_label_ds.sa.train == 1]

    confusion_err = ConfusionBasedError(clf=l_clf)
    transfer_err = TransferMeasure(
        l_clf,
        Splitter('train', attr_values=[1, 1]),
        postproc=BinaryFxNode(mean_mismatch_error, 'targets'))

    # Shouldn't be able to access the state yet
    self.assertRaises(UnknownStateError, confusion_err, None)

    l_clf.train(train_ds)
    e = confusion_err(None)
    te = transfer_err(train_ds)
    # NOTE(review): np.asscalar is deprecated in recent numpy releases
    te = np.asscalar(te)
    message = ("ConfusionBasedError (%.2g) should be equal to TransferError "
               "(%.2g) on traindataset" % (e, te))
    self.assertTrue(abs(e - te) < 1e-10, msg=message)

    # this will print nasty WARNING but it is ok -- it is just checking code
    # NB warnings are not printed while doing whole testing
    warning("Don't worry about the following warning.")
    if 'multiclass' in l_clf.__tags__:
        self.assertFalse(transfer_err(three_label_ds) is None)

    # try copying the beast
    terr_copy = copy(transfer_err)
def handle_arg(arg):
    """Return `arg` as a 3D volume, loading it from a file if necessary.

    Parameters
    ----------
    arg : None, str, SpatialImage or array-like
      ``None`` is passed through untouched.  A string is taken as a
      filename and loaded via ``nb.load``.  Anything else has to provide a
      ``shape`` attribute.

    Returns
    -------
    None, SpatialImage or array-like
      The (possibly reduced) 3D volume.  For 4D input only the first
      volume along the last axis is kept.

    Raises
    ------
    ValueError
      If the data is neither 3D nor 4D.
    """
    if arg is None:
        return arg

    if isinstance(arg, basestring):
        arg = nb.load(arg)
        argshape = arg.shape
        # Assure that we have 3D (at least)
        if len(argshape) < 3:
            # Keep `argshape` in sync with the padded image; otherwise the
            # dimensionality check below would reject what we just fixed.
            argshape = argshape + (1,) * (3 - len(argshape))
            arg = nb.Nifti1Image(arg.get_data().reshape(argshape),
                                 arg.affine, arg.header)
    else:
        argshape = arg.shape

    if len(argshape) == 4:
        if argshape[-1] > 1:
            warning("For now plot_lightbox can handle only 3d, 4d data was provided."
                    " Plotting only the first volume")
        if isinstance(arg, SpatialImage):
            arg = nb.Nifti1Image(arg.get_data()[..., 0],
                                 arg.affine, arg.header)
        else:
            arg = arg[..., 0]
    elif len(argshape) != 3:
        raise ValueError("For now just handling 3D volumes")
    return arg
def _forward_data(self, data):
    """Filter `data` with ``filtfilt`` using the stored IIR coefficients.

    If ``filtfilt`` rejects the ``axis``/``padtype``/``padlen`` arguments
    with a TypeError, every item of the (at most 2D) data is filtered
    separately without padding arguments.

    Raises
    ------
    ValueError
      In the fallback mode, if ``params.axis`` is larger than 1.
    """
    params = self.params
    try:
        return filtfilt(self.__iir_num, self.__iir_denom, data,
                        axis=params.axis,
                        padtype=params.padtype,
                        padlen=params.padlen)
    except TypeError:
        # we have an ancient scipy, do manually
        # but it will only support 2d arrays
        pass

    if params.axis > 1:
        raise ValueError("this version of scipy does not "
                         "support nd-arrays for filtfilt()")

    along_first_axis = params.axis == 0
    if along_first_axis:
        data = data.T

    custom_padding = not (params['padlen'].is_default
                          and params['padtype'].is_default)
    if custom_padding:
        warning("this version of scipy.signal.filtfilt() does not "
                "support `padlen` and `padtype` arguments -- ignoring "
                "them")

    filtered = []
    for row in data:
        filtered.append(filtfilt(self.__iir_num, self.__iir_denom, row))
    mapped = np.array(filtered)
    if along_first_axis:
        mapped = mapped.T
    return mapped
def _verified_reverse1(mapper, onesample):
    """Replacement of Mapper.reverse1 with safety net

    Use this instead of calling a mapper's ``reverse1()`` directly.  The
    sample gets a dummy leading axis, is passed through ``reverse()``, and the
    result is checked to contain exactly one item along its first axis.  If
    it does not, a warning is issued, since the flexible nature of mappers
    does not guarantee a one-to-one mapping of samples.

    Parameters
    ----------
    mapper : Mapper instance
    onesample : array-like
      Single sample (in terms of the supplied mapper).

    Returns
    -------
    array
      First item of the reverse-mapped data, i.e. a single sample in terms
      of the mapper's input space.
    """
    batch_of_one = np.asanyarray(onesample)[None]
    reversed_batch = mapper.reverse(batch_of_one)
    if len(reversed_batch) != 1:
        warning("Reverse mapping single sample yielded multiple -- can lead to unintended behavior!")
    return reversed_batch[0]
def _get_increments(self, ndim):
    """Create the coordinate increments for a given dimensionality

    All integer offsets within the bounding box of the radius are
    enumerated; the ones whose distance from the center (scaled by the
    element sizes) lies in ``(inner_radius, radius]`` are kept.

    Parameters
    ----------
    ndim : int
      Dimensionality of the queried coordinate.

    Returns
    -------
    array
      Selected increments, one per row.

    Raises
    ------
    ValueError
      If element sizes were provided and their number differs from `ndim`.
    """
    sizes = self._element_sizes
    if sizes is None:
        sizes = np.ones(ndim)
    elif ndim != len(sizes):
        raise ValueError(
            "Dimensionality mismatch: element_sizes %s provided "
            "to constructor had %i dimensions, whenever queried "
            "coordinate had %i"
            % (sizes, len(sizes), ndim))

    origin = np.zeros(ndim)
    sizes = np.asanyarray(sizes)

    # how many elements to go in each direction along each dimension
    half_extent = np.ceil(self._radius / sizes).astype(int)
    box_offsets = np.array(list(np.ndindex(tuple(half_extent * 2 + 1))))
    candidates = box_offsets - half_extent

    # keep only the candidates within the desired range of distances
    selected = []
    for offset in candidates:
        distance = self._distance_func(offset * sizes, origin)
        if self._inner_radius < distance <= self._radius:
            selected.append(offset)
    res = array(selected)

    if not len(res):
        warning("%s defines no neighbors" % self)
    return res
def _get_increments(self, ndim):
    """Create the coordinate increments for a given dimensionality

    All integer offsets within the bounding box of the radius are
    enumerated; the ones whose distance from the center (scaled by the
    element sizes) lies in ``(inner_radius, radius]`` are kept.

    Parameters
    ----------
    ndim : int
      Dimensionality of the queried coordinate.

    Returns
    -------
    array
      Selected increments, one per row.

    Raises
    ------
    ValueError
      If element sizes were provided and their number differs from `ndim`.
    """
    sizes = self._element_sizes
    if sizes is None:
        sizes = np.ones(ndim)
    elif ndim != len(sizes):
        raise ValueError(
            "Dimensionality mismatch: element_sizes %s provided "
            "to constructor had %i dimensions, whenever queried "
            "coordinate had %i"
            % (sizes, len(sizes), ndim))

    origin = np.zeros(ndim)
    sizes = np.asanyarray(sizes)

    # how many elements to go in each direction along each dimension
    half_extent = np.ceil(self._radius / sizes).astype(int)
    box_offsets = np.array(list(np.ndindex(tuple(half_extent * 2 + 1))))
    candidates = box_offsets - half_extent

    # keep only the candidates within the desired range of distances
    selected = []
    for offset in candidates:
        distance = self._distance_func(offset * sizes, origin)
        if self._inner_radius < distance <= self._radius:
            selected.append(offset)
    res = array(selected)

    if not len(res):
        warning("%s defines no neighbors" % self)
    return res
def _forward_data(self, data):
    """Filter `data` with ``filtfilt`` using the stored IIR coefficients.

    If ``filtfilt`` rejects the ``axis``/``padtype``/``padlen`` arguments
    with a TypeError, every item of the (at most 2D) data is filtered
    separately without padding arguments.

    Raises
    ------
    ValueError
      In the fallback mode, if ``params.axis`` is larger than 1.
    """
    params = self.params
    try:
        return filtfilt(self.__iir_num, self.__iir_denom, data,
                        axis=params.axis,
                        padtype=params.padtype,
                        padlen=params.padlen)
    except TypeError:
        # we have an ancient scipy, do manually
        # but it will only support 2d arrays
        pass

    if params.axis > 1:
        raise ValueError("this version of scipy does not "
                         "support nd-arrays for filtfilt()")

    along_first_axis = params.axis == 0
    if along_first_axis:
        data = data.T

    custom_padding = not (params['padlen'].is_default
                          and params['padtype'].is_default)
    if custom_padding:
        warning("this version of scipy.signal.filtfilt() does not "
                "support `padlen` and `padtype` arguments -- ignoring "
                "them")

    filtered = []
    for row in data:
        filtered.append(filtfilt(self.__iir_num, self.__iir_denom, row))
    mapped = np.array(filtered)
    if along_first_axis:
        mapped = mapped.T
    return mapped
def test_confusion_based_error(self, l_clf):
    """ConfusionBasedError has to agree with a TransferMeasure on train data."""
    train_ds = datasets['uni2medium']
    train_ds = train_ds[train_ds.sa.train == 1]

    # to check if we fail to classify for 3 labels
    three_label_ds = datasets['uni3medium']
    three_label_ds = three_label_ds[three_label_ds.sa.train == 1]

    confusion_err = ConfusionBasedError(clf=l_clf)
    transfer_err = TransferMeasure(
        l_clf,
        Splitter('train', attr_values=[1, 1]),
        postproc=BinaryFxNode(mean_mismatch_error, 'targets'))

    # Shouldn't be able to access the state yet
    self.assertRaises(UnknownStateError, confusion_err, None)

    l_clf.train(train_ds)
    e = confusion_err(None)
    te = transfer_err(train_ds)
    # NOTE(review): np.asscalar is deprecated in recent numpy releases
    te = np.asscalar(te)
    message = ("ConfusionBasedError (%.2g) should be equal to TransferError "
               "(%.2g) on traindataset" % (e, te))
    self.assertTrue(abs(e - te) < 1e-10, msg=message)

    # this will print nasty WARNING but it is ok -- it is just checking code
    # NB warnings are not printed while doing whole testing
    warning("Don't worry about the following warning.")
    if 'multiclass' in l_clf.__tags__:
        self.assertFalse(transfer_err(three_label_ds) is None)

    # try copying the beast
    terr_copy = copy(transfer_err)
def append(self, other):
    """This method should not be used and will be removed in the future

    Appends the samples and sample attributes of `other` to this dataset
    in place, after issuing a deprecation warning.

    Raises
    ------
    DatasetError
      If the number of features or the set of sample attribute names of
      both datasets differ.
    """
    warning("AttrDataset.append() is deprecated and will be removed. "
            "Instead of ds.append(x) use: ds = vstack((ds, x), a=0)")

    if self.nfeatures != other.nfeatures:
        raise DatasetError("Cannot merge datasets, because the number of "
                           "features does not match.")

    own_keys = sorted(self.sa.keys())
    other_keys = sorted(other.sa.keys())
    if own_keys != other_keys:
        raise DatasetError("Cannot merge dataset. This datasets samples "
                           "attributes %s cannot be mapped into the other "
                           "set %s" % (self.sa.keys(), other.sa.keys()))

    # concat the samples as well
    stacked = np.concatenate((self.samples, other.samples), axis=0)
    self.samples = stacked

    # tell the collection the new desired length of all attributes
    self.sa.set_length_check(len(self.samples))

    # concat all samples attributes
    for name, attr in other.sa.iteritems():
        own_attr = self.sa[name]
        own_attr.value = np.concatenate((own_attr.value, attr.value), axis=0)
def label_voxel(self, c, levels=None):
    """Label voxel `c` via the reference atlas.

    The closest referenced point is looked up in ``self._data``.  If it lies
    within ``self.distance`` (with a 1e-3 tolerance) the label of that point
    is reported, otherwise the label of `c` itself.

    Parameters
    ----------
    c : sequence
      Voxel coordinate; it is passed through ``self._check_range`` first.
    levels : optional
      Forwarded unchanged to the reference atlas' ``label_voxel``.

    Returns
    -------
    dict-like
      Result of the reference atlas' ``label_voxel`` extended with the
      'voxel_referenced' and 'distance' entries.
    """
    if self.__referenceLevel is None:
        warning("You did not provide what level to use "
                "for reference. Assigning 0th level -- '%s'"
                % (self._levels[0],))
        self.set_reference_level(0)

    c = self._check_range(c)

    # coordinates of the closest referenced voxel and its distance to `c`
    level_indexes = self.__referenceLevel.indexes
    cref = self._data[level_indexes, c[0], c[1], c[2]]
    dist = norm((cref - c) * self.voxdim)
    if __debug__:
        debug('ATL__', "Closest referenced point for %r is "
              "%r at distance %3.2f" % (c, cref, dist))

    # differences below 1e-3 are neglected
    if (self.distance - dist) >= 1e-3:
        result = self.__referenceAtlas.label_voxel(cref, levels)
        # NOTE(review): stores `c`, not `cref` -- confirm this is intended
        referenced, reported_distance = c, dist
    else:
        result = self.__referenceAtlas.label_voxel(c, levels)
        if __debug__:
            debug('ATL__', "Closest referenced point is "
                  "further than desired distance %.2f" % self.distance)
        referenced, reported_distance = None, 0

    result['voxel_referenced'] = referenced
    result['distance'] = reported_distance
    return result
def _check_cosmo_dataset(cosmo):
    '''Validate a cosmo structure before it is turned into a dataset.

    Two checks are made:

    (1) a missing ``samples`` field raises an error;
    (2) a warning is issued if the largest absolute finite, non-zero sample
        is extremely large or extremely small (e.g. MEEG data with values
        in the order of 1e-25, which affects some classifiers).

    Parameters
    ----------
    cosmo : dict-like
      Must provide ``get()``; the ``'samples'`` entry is inspected.

    Raises
    ------
    KeyError
      If there is no ``'samples'`` entry (or it is None).
    '''
    samples = cosmo.get('samples')
    if samples is None:
        raise KeyError("Missing field .samples in %s" % cosmo)

    # order of magnitude (in either direction) beyond which we complain
    magnitude_limit = 10

    # NaNs, infinities and zeros carry no information about the scale
    usable = np.logical_and(np.isfinite(samples), samples != 0)
    if not np.any(usable):
        return

    largest = np.max(np.abs(samples[usable]))
    # decimal order of magnitude of the largest absolute value
    magnitude = np.log10(largest)

    if abs(magnitude) > magnitude_limit:
        warning(
            'Samples have extreme values, maximum absolute value is %s; '
            'This may affect some analyses. Considering scaling the samples, '
            'e.g. by a factor of 10**%d ' % (largest, -magnitude))
def _predict(self, data):
    """Predict using the skl learner

    The prediction is optionally reshaped to ``self.enforce_dim``
    dimensions, stored in ``ca.estimates`` and returned.  If the
    'probabilities' conditional attribute is enabled it is filled via the
    learner's ``predict_proba()``, when available.

    Raises
    ------
    FailedToPredictError
      If the learner's ``predict()`` raises any exception.
    """
    learner = self._skl_learner
    try:
        res = learner.predict(data)
    except Exception as e:
        raise FailedToPredictError(
            "Failed to predict %s on data of shape %s. Got '%s' during"
            " call to predict()." % (self, data.shape, e))

    if self.enforce_dim:
        ndim_out = len(res.shape)
        if ndim_out > self.enforce_dim:
            # would throw meaningful exception if not possible
            res = res.reshape(res.shape[:self.enforce_dim])
        elif ndim_out < self.enforce_dim:
            # append singleton axes up to the desired dimensionality
            padding = (1, ) * (self.enforce_dim - ndim_out)
            res = res.reshape(res.shape + padding)

    # Probabilities are dealt with after predict(), so failures above are
    # covered by the exception handling
    if self.ca.is_enabled('probabilities'):
        if not hasattr(self._skl_learner, 'predict_proba'):
            warning("%s has no predict_proba() defined, so no probability"
                    " estimates could be extracted" % self._skl_learner)
        else:
            # Duplication of computation, since in many scenarios
            # predict() calls predict_proba()
            self.ca.probabilities = self._skl_learner.predict_proba(data)

    self.ca.estimates = res
    return res
def _pvalue(x, cdf_func, tail, return_tails=False, name=None):
    """Helper function to return p-value(x) given cdf and tail

    Parameters
    ----------
    x : scalar or array-like
      Value(s) to compute p-values for.
    cdf_func : callable
      Function to be used to derive cdf values for x
    tail : str ('left', 'right', 'any', 'both')
      Which tail of the distribution to report. For 'any' and 'both'
      it chooses the tail it belongs to based on the comparison to
      p=0.5. In the case of 'any' significance is taken like in a
      one-tailed test.
    return_tails : bool
      If True, a tuple return (pvalues, tails), where tails contain
      1s if value was from the right tail, and 0 if the value was
      from the left tail.
    name : optional
      Only used in the message of the stability warning.
    """
    scalar_input = np.isscalar(x)
    if scalar_input:
        x = [x]

    pvals = cdf_func(x)

    if __debug__ and "CHECK_STABILITY" in debug.active:
        lowest, highest = np.min(pvals), np.max(pvals)
        if lowest < 0 or highest > 1.0:
            suffix = " for %s" % name if name is not None else ""
            warning("Stability check of cdf %s failed%s. Min=%s, max=%s"
                    % (cdf_func, suffix, lowest, highest))

    # no escape but to assure that CDF is in the right range. Some
    # distributions from scipy tend to jump away from [0,1]
    pvals = np.clip(pvals, 0, 1.0)

    if tail in ("any", "both"):
        right_tail = pvals >= 0.5
        pvals[right_tail] = 1.0 - pvals[right_tail]
        if tail == "both":
            # we need report the area under both tails
            # XXX this is only meaningful for symmetric distributions
            pvals *= 2
    elif tail == "right":
        pvals = 1 - pvals
        if return_tails:
            right_tail = np.ones(pvals.shape, dtype=bool)
    elif tail == "left":
        if return_tails:
            right_tail = np.zeros(pvals.shape, dtype=bool)

    # Assure that NaNs didn't get significant value
    pvals[np.isnan(x)] = 1.0

    res = pvals[0] if scalar_input else pvals
    if return_tails:
        return (res, right_tail)
    return res
def train(self, ds):
    """Train the learner on a dataset.

    The default implementation calls ``_pretrain()``, ``_train()``, and
    finally ``_posttrain()``.  The time spent between the end of
    pretraining and the end of training is stored in ``ca.training_time``.

    Parameters
    ----------
    ds: Dataset
      Training dataset.

    Returns
    -------
    None

    Raises
    ------
    DegenerateInputError
      If a dataset-like input has no features or no samples.
    """
    dataset_like = is_datasetlike(ds)

    # TODO remove first condition if all Learners get only datasets
    if dataset_like and (ds.nfeatures == 0 or len(ds) == 0):
        raise DegenerateInputError(
            "Cannot train learner on degenerate data %s" % ds)
    if __debug__:
        debug("LRN", "Training learner %(lrn)s on dataset %(dataset)s",
              msgargs={'lrn': self, 'dataset': ds})

    self._pretrain(ds)

    # remember the time when started training
    started = time.time()

    # For non-datasets we claim to have no idea and simply try to train;
    # for datasets things might have happened during pretraining
    if not dataset_like or ds.nfeatures > 0:
        self._train(ds)
    else:
        warning("Trying to train on dataset with no features present")
        if __debug__:
            debug("LRN",
                  "No features present for training, no actual training "
                  "is called")

    # store timing
    self.ca.training_time = time.time() - started

    # and post-proc
    self._posttrain(ds)

    # finally flag as trained
    self._set_trained()

    if __debug__:
        debug("LRN", "Finished training learner %(lrn)s on dataset %(dataset)s",
              msgargs={'lrn': self, 'dataset': ds})
def __init__(self, generator, queryengine, errorfx=mean_mismatch_error,
             indexsum=None, reuse_neighbors=False, splitter=None, **kwargs):
    """Initialize the base class for "naive" searchlight classifiers

    Parameters
    ----------
    generator : `Generator`
      Some `Generator` to prepare partitions for cross-validation.
      It must not change "targets", thus e.g. no AttributePermutator's
    queryengine
      Passed on to ``BaseSearchlight.__init__`` together with `kwargs`.
    errorfx : func, optional
      Functor that computes a scalar error value from the vectors of
      desired and predicted values (e.g. subclass of `ErrorFunction`).
    indexsum : ('sparse', 'fancy'), optional
      What use to compute sums over arbitrary columns.  'fancy'
      corresponds to regular fancy indexing over columns, whenever
      in 'sparse', product of sparse matrices is used (usually
      faster, so is default if `scipy` is available).
    reuse_neighbors : bool, optional
      Compute neighbors information only once, thus allowing for
      efficient reuse on subsequent calls where dataset's feature
      attributes remain the same (e.g. during permutation testing)
    splitter : Splitter, optional
      Which will be used to split partitioned datasets.  If None
      specified then standard one operating on partitions will be
      used

    Raises
    ------
    NotImplementedError
      If ``nproc`` is neither None nor 1.
    """
    # init base class first
    BaseSearchlight.__init__(self, queryengine, **kwargs)

    self._errorfx = errorfx
    self._generator = generator
    self._splitter = splitter

    # TODO: move into _call since resetting over default None
    #       obscures __repr__
    if indexsum is None:
        indexsum = 'sparse' if externals.exists('scipy') else 'fancy'
    elif indexsum == 'sparse' and not externals.exists('scipy'):
        warning("Scipy.sparse isn't available so taking 'fancy' as "
                "'indexsum' method.")
        indexsum = 'fancy'
    self._indexsum = indexsum

    if self.nproc not in (None, 1):
        raise NotImplementedError(
            "For now only nproc=1 (or None for "
            "autodetection) is supported by GNBSearchlight")

    # statistics per each block/label
    self.__pb = None
    self.__reuse_neighbors = reuse_neighbors

    # Storage to be used for neighborhood information
    self.__roi_fids = None
def _level3(self, datasets):
    """Train one mapper per dataset against ``self.commonspace``.

    Runs serially for ``nproc == 1`` and through joblib otherwise.  An
    ``nproc`` of 0, or a parallel request without joblib being available,
    is reset to 1 with a warning.  If the 'residual_errors' conditional
    attribute is enabled, the residuals are stored in it.

    Parameters
    ----------
    datasets : sequence of datasets
      The original input datasets.

    Returns
    -------
    list
      One mapper per dataset.
    """
    params = self.params            # for quicker access ;)
    # create a mapper per dataset
    mappers = [deepcopy(params.alignment) for _ in datasets]

    # key different from level-2; the common space is uniform

    # Fixing nproc=0
    if params.nproc == 0:
        from mvpa2.base import warning
        warning("nproc of 0 doesn't make sense. Setting nproc to 1.")
        params.nproc = 1

    # Checking for joblib, if not, set nproc to 1
    if params.nproc != 1:
        from mvpa2.base import externals, warning
        if not externals.exists('joblib'):
            warning("Setting nproc different from 1 requires joblib package, which "
                    "does not seem to exist. Setting nproc to 1.")
            params.nproc = 1

    # start from original input datasets again
    if params.nproc == 1:
        residuals = []
        for index, (mapper, dataset) in enumerate(zip(mappers, datasets)):
            if __debug__:
                debug('HPAL_', "Level 3: ds #%i" % index)
            mapper, residual = get_trained_mapper(
                dataset, self.commonspace, mapper,
                self.ca['residual_errors'].enabled)
            if self.ca['residual_errors'].enabled:
                residuals.append(residual)
    else:
        if __debug__:
            debug('HPAL_', "Level 3: Using joblib with nproc = %d "
                  % params.nproc)
        if __debug__ and 'HPAL' in debug.active:
            verbose_level_parallel = 20
        else:
            verbose_level_parallel = 0
        from joblib import Parallel, delayed
        import sys
        # joblib's 'multiprocessing' backend has known issues of failure on OSX
        # Tested with MacOS 10.12.13, python 2.7.13, joblib v0.10.3
        if params.joblib_backend is None:
            if sys.platform == 'darwin':
                params.joblib_backend = 'threading'
            else:
                params.joblib_backend = 'multiprocessing'
        runner = Parallel(n_jobs=params.nproc,
                          pre_dispatch=params.nproc,
                          backend=params.joblib_backend,
                          verbose=verbose_level_parallel)
        res = runner(
            delayed(get_trained_mapper)(
                ds, self.commonspace, mapper,
                self.ca['residual_errors'].enabled)
            for ds, mapper in zip(datasets, mappers))
        mappers = [trained for trained, _residual in res]
        if self.ca['residual_errors'].enabled:
            residuals = [residual for _trained, residual in res]

    if self.ca['residual_errors'].enabled:
        self.ca.residual_errors = Dataset(
            samples=np.array(residuals)[None, :])

    return mappers
def seed(random_seed):
    """Seed shogun's random number generator.

    Any exception raised while seeding is reported as a warning instead of
    being propagated.

    Parameters
    ----------
    random_seed
      Seed value handed over to shogun.
    """
    if __debug__:
        debug('SG', "Seeding shogun's RNG with %s" % random_seed)
    try:
        # reuse the same seed for shogun
        shogun.Library.Math_init_random(random_seed)
    except Exception as exc:
        warning('Shogun cannot be seeded due to %s' % (exc,))
def corr_error_prob(predicted, target):
    """Placeholder for the p-value of the correlation of target and predicted.

    This variant does not compute anything: it issues a warning and returns
    a bogus value.

    Parameters
    ----------
    predicted, target
      Ignored.

    Returns
    -------
    float
      Always -1.0.
    """
    from mvpa2.base import warning

    message = ("p-value for correlation is implemented only when scipy is "
               "available. Bogus value -1.0 is returned otherwise")
    warning(message)
    bogus = -1.0
    return bogus
def _extract_boxcar_events(ds, events=None, time_attr=None, match="prev",
                           eprefix="event", event_mapper=None):
    """see eventrelated_dataset() for docs

    In short: if `time_attr` is given, event onsets and durations are
    converted from time into sample indices/counts; a BoxcarMapper built
    from the events is applied to `ds`; the result is flattened (or mapped
    with `event_mapper`) and the event properties are attached via
    ``_evvars2ds``.

    Raises
    ------
    ValueError
      If the events lack an 'onset' or a 'duration' property.
    KeyError
      If `match` is not one of 'prev', 'next', 'closest'.
    """
    # relabel argument
    strategies = {"prev": "floor", "next": "ceil", "closest": "round"}
    conv_strategy = strategies[match]

    if time_attr is None:
        descr_events = events
    else:
        tvec = ds.sa[time_attr].value
        # we are asked to convert onset time into sample ids
        descr_events = []
        for event in events:
            # do not mess with the input data
            converted = copy.deepcopy(event)
            # best matching sample
            idx = value2idx(converted["onset"], tvec, conv_strategy)
            # store offset of sample time and real onset
            converted["orig_offset"] = converted["onset"] - tvec[idx]
            # rescue the real onset into a new attribute
            converted["orig_onset"] = converted["onset"]
            converted["orig_duration"] = converted["duration"]
            # figure out how many samples we need
            tail = tvec[idx:]
            event_end = converted["onset"] + converted["duration"]
            converted["duration"] = len(tail[tail < event_end])
            # new onset is sample index
            converted["onset"] = idx
            descr_events.append(converted)

    # convert the event specs into the format expected by BoxcarMapper
    # take the first event as an example of contained keys
    evvars = _events2dict(descr_events)

    # checks
    for required in ["onset", "duration"]:
        if required not in evvars:
            raise ValueError("'%s' is a required property for all events."
                             % required)

    boxlength = max(evvars["duration"])
    if __debug__:
        if max(evvars["duration"]) != min(evvars["duration"]):
            warning("Boxcar mapper will use maximum boxlength (%i) of all "
                    "provided Events." % boxlength)

    # finally create, train and use the boxcar mapper
    bcm = BoxcarMapper(evvars["onset"], boxlength, space=eprefix)
    bcm.train(ds)
    ds = ds.get_mapped(bcm)

    if event_mapper is not None:
        ds = ds.get_mapped(event_mapper)
    else:
        # at last reflatten the dataset
        # could we add some meaningful attribute during this mapping, i.e. would
        # assigning 'inspace' do something good?
        ds = ds.get_mapped(FlattenMapper(shape=ds.samples.shape[1:]))

    # add samples attributes for the events, simply dump everything as a samples
    # attribute
    # special case onset and duration in case of conversion into discrete time
    if time_attr is not None:
        for attr in ("onset", "duration"):
            evvars[attr] = [original[attr] for original in events]

    ds = _evvars2ds(ds, evvars, eprefix)
    return ds
def _call(self, dataset):
    """Perform the ROI search.

    Trains the query engine, determines the ROI center ids, delegates the
    actual work to ``_sl_call()`` and stores the raw results (and ROI
    sizes, if any) in the conditional attributes.

    Parameters
    ----------
    dataset : Dataset

    Returns
    -------
    Raw results as provided by ``_sl_call()``; if `dataset` has a mapper, a
    (possibly extended) copy of it is attached to the results.

    Raises
    ------
    IndexError
      If explicitly provided ROI ids exceed the number of features (only
      checked when ``__debug__`` is on).
    """
    # local binding
    nproc = self.nproc

    if nproc is None and externals.exists('pprocess'):
        import pprocess
        try:
            nproc = pprocess.get_number_of_cores() or 1
        except AttributeError:
            warning("pprocess version %s has no API to figure out maximal "
                    "number of cores. Using 1"
                    % externals.versions['pprocess'])
            nproc = 1

    # train the queryengine
    self._queryengine.train(dataset)

    # decide whether to run on all possible center coords or just a provided
    # subset
    if isinstance(self.__roi_ids, str):
        # name of a feature attribute: take the indices of its non-zeros
        roi_ids = dataset.fa[self.__roi_ids].value.nonzero()[0]
    elif self.__roi_ids is not None:
        roi_ids = self.__roi_ids
        # safeguard against stupidity
        if __debug__ and max(roi_ids) >= dataset.nfeatures:
            raise IndexError(
                "Maximal center_id found is %s whenever given "
                "dataset has only %d features"
                % (max(roi_ids), dataset.nfeatures))
    else:
        roi_ids = np.arange(dataset.nfeatures)

    # pass to subclass
    results, roi_sizes = self._sl_call(dataset, roi_ids, nproc)

    if roi_sizes is not None:
        self.ca.roi_sizes = roi_sizes

    if 'mapper' in dataset.a:
        # since we know the space we can stick the original mapper into the
        # results as well
        mapper = copy.copy(dataset.a.mapper)
        if self.__roi_ids is not None:
            # there is an additional selection step that needs to be
            # expressed by another mapper
            mapper.append(StaticFeatureSelection(roi_ids,
                                                 dshape=dataset.shape[1:]))
        results.a['mapper'] = mapper

    # charge state
    self.ca.raw_results = results

    # return raw results, base-class will take care of transformations
    return results
def _forward_dataset(self, ds):
    """Z-score the samples of `ds` with the previously estimated parameters.

    Unless the in-place mode is active, a shallow copy of the dataset with
    copied samples is modified and returned.

    Raises
    ------
    RuntimeError
      If the mapper has no parameters yet, or none for one of the chunks.
    """
    # local binding
    chunks_attr = self.__chunks_attr
    dtype = self.__dtype

    if __debug__ and chunks_attr is not None:
        nsamples_per_chunk = get_nsamples_per_attr(ds, chunks_attr)
        smallest_chunk = np.min(nsamples_per_chunk.values())
        if smallest_chunk in range(3, 6):
            warning("Z-scoring chunk-wise having a chunk with only "
                    "%d samples is 'discouraged'. "
                    "You have chunks with following number of samples: %s"
                    % (smallest_chunk, nsamples_per_chunk,))
        if smallest_chunk <= 2:
            warning("Z-scoring chunk-wise having a chunk with less "
                    "than three samples will set features in these "
                    "samples to either zero (with 1 sample in a chunk) "
                    "or -1/+1 (with 2 samples in a chunk). "
                    "You have chunks with following number of samples: %s"
                    % (nsamples_per_chunk,))

    params = self.__params_dict
    if params is None:
        raise RuntimeError(
            "ZScoreMapper needs to be trained before call to forward")

    if not self._secret_inplace_zscore:
        # shallow copy to put the new stuff in
        mds = ds.copy(deep=False)
        # but deepcopy the samples since _zscore would modify inplace
        mds.samples = mds.samples.copy()
    else:
        mds = ds

    # cast the data to float, since in-place operations below do not upcast!
    if np.issubdtype(mds.samples.dtype, np.integer):
        mds.samples = mds.samples.astype(dtype)

    if '__all__' in params:
        # we have a global parameter set
        mds.samples = self._zscore(mds.samples, *params['__all__'])
        return mds

    # per chunk z-scoring
    for chunk in mds.sa[chunks_attr].unique:
        if chunk not in params:
            raise RuntimeError(
                "%s has no parameters for chunk '%s'. It probably "
                "wasn't present in the training dataset!?"
                % (self.__class__.__name__, chunk))
        rows = np.where(mds.sa[chunks_attr].value == chunk)[0]
        mds.samples[rows] = self._zscore(mds.samples[rows], *params[chunk])

    return mds
def _predict(self, data):
    """Predict values for the data

    Besides returning the predictions of ``self.model`` for every sample,
    the 'estimates' and 'probabilities' conditional attributes are filled
    in whenever they are enabled.
    """
    # libsvm needs doubles
    ls_samples = _data2ls(data)
    ca = self.ca

    predictions = [self.model.predict(sample) for sample in ls_samples]

    if ca.is_enabled('estimates'):
        if self.__is_regression__:
            estimates = [self.model.predict_values_raw(sample)[0]
                         for sample in ls_samples]
        else:
            # if 'trained_targets' are literal they have to be mapped
            targets = self.ca.trained_targets
            is_literal = (np.issubdtype(targets.dtype, 'c')
                          or np.issubdtype(targets.dtype, 'U'))
            if is_literal:
                targets = self._attrmap.to_numeric(targets)

            # XXX We do duplicate work. model.predict calls
            # predict_values_raw internally and then does voting or
            # thresholding. So if speed becomes a factor we might
            # want to move out logic from libsvm over here to base
            # predictions on obtained values, or adjust libsvm to
            # spit out values from predict() as well
            if len(targets) != 2:
                # In multiclass we return dictionary for all pairs
                # of labels, since libsvm does 1-vs-1 pairs
                estimates = [self.model.predict_values(sample)
                             for sample in ls_samples]
            else:
                # Apparently libsvm reorders labels so we need to
                # track (1,0) values instead of (0,1) thus just
                # lets take negative reverse
                estimates = []
                for sample in ls_samples:
                    pairwise = self.model.predict_values(sample)
                    estimates.append(pairwise[(targets[1], targets[0])])
                if len(estimates) > 0:
                    if __debug__:
                        debug("SVM",
                              "Forcing estimates to be ndarray and reshaping"
                              " them into 1D vector")
                    estimates = np.asarray(estimates).reshape(len(estimates))
        ca.estimates = estimates

    if ca.is_enabled("probabilities"):
        # XXX Is this really necessary? yoh don't think so since
        # assignment to ca is doing the same
        try:
            ca.probabilities = [self.model.predict_probability(sample)
                                for sample in ls_samples]
        except TypeError:
            warning("Current SVM %s doesn't support probability " % self
                    + " estimation.")

    return predictions
def run(args):
    """Build a dataset from the source selected in `args` and store it.

    The samples come from a TXT file, an NPY file or MRI image(s), tried in
    that order; otherwise the dataset is loaded from ``args.data``.
    Attribute options are applied afterwards and the result is written
    via ``h5save`` to ``args.output`` (with '.hdf5' appended if missing).
    """
    from mvpa2.base.hdf5 import h5save

    ds = None
    if args.txt_data is not None:
        verbose(1, "Load data from TXT file '%s'" % args.txt_data)
        ds = Dataset(_load_from_txt(args.txt_data))
    elif args.npy_data is not None:
        verbose(1, "Load data from NPY file '%s'" % args.npy_data)
        ds = Dataset(_load_from_npy(args.npy_data))
    elif args.mri_data is not None:
        verbose(1, "Load data from MRI image(s) %s" % args.mri_data)
        from mvpa2.datasets.mri import fmri_dataset
        vol_attr = dict()
        if args.add_vol_attr is not None:
            # XXX add a way to use the mapper of an existing dataset to
            # add a volume attribute without having to load the entire
            # mri data again
            vol_attr = dict(args.add_vol_attr)
            # a shorter dict means some attribute name was given twice
            if len(args.add_vol_attr) != len(vol_attr):
                warning("--vol-attr option with duplicate attribute name: "
                        "check arguments!")
            verbose(2, "Add volumetric feature attributes: %s" % vol_attr)
        ds = fmri_dataset(args.mri_data, mask=args.mask, add_fa=vol_attr)

    if ds is not None:
        if args.data is not None:
            verbose(1, 'ignoring dataset input in favor of other data source -- remove either one to disambiguate')
    elif args.data is None:
        raise RuntimeError('no data source specific')
    else:
        ds = hdf2ds(args.data)[0]

    # act on all attribute options
    ds = process_common_dsattr_opts(ds, args)

    if args.add_fsl_mcpar is not None:
        from mvpa2.misc.fsl.base import McFlirtParams
        mc_par = McFlirtParams(args.add_fsl_mcpar)
        for param in mc_par:
            sa_name = 'mc_' + param
            verbose(2, "Add motion regressor as sample attribute '%s'"
                       % (sa_name))
            ds.sa[sa_name] = mc_par[param]

    verbose(3, "Dataset summary %s" % (ds.summary()))

    # and store
    outfilename = args.output
    if not outfilename.endswith('.hdf5'):
        outfilename = outfilename + '.hdf5'
    verbose(1, "Save dataset to '%s'" % outfilename)
    h5save(outfilename, ds, mkdir=True, compression=args.hdf5_compression)
def __init__(self, generator, queryengine, errorfx=mean_mismatch_error,
             indexsum=None, reuse_neighbors=False, **kwargs):
    """Initialize the base class for "naive" searchlight classifiers

    Parameters
    ----------
    generator : `Generator`
      Some `Generator` to prepare partitions for cross-validation.
      It must not change "targets", thus e.g. no AttributePermutator's
    queryengine
      Passed on unchanged to ``BaseSearchlight.__init__``.
    errorfx : func, optional
      Functor that computes a scalar error value from the vectors of
      desired and predicted values (e.g. subclass of `ErrorFunction`).
    indexsum : ('sparse', 'fancy'), optional
      What use to compute sums over arbitrary columns.  'fancy'
      corresponds to regular fancy indexing over columns, whenever
      in 'sparse', product of sparse matrices is used (usually
      faster, so is default if `scipy` is available).
    reuse_neighbors : bool, optional
      Compute neighbors information only once, thus allowing for
      efficient reuse on subsequent calls where dataset's feature
      attributes remain the same (e.g. during permutation testing)
    **kwargs
      Passed on unchanged to ``BaseSearchlight.__init__``.

    Raises
    ------
    NotImplementedError
      If ``self.nproc`` is anything but None or 1.
    """
    # init base class first
    BaseSearchlight.__init__(self, queryengine, **kwargs)

    self._errorfx = errorfx
    self._generator = generator

    # TODO: move into _call since resetting over default None
    #       obscures __repr__
    if indexsum is None:
        # pick the default depending on availability of scipy
        indexsum = 'sparse' if externals.exists('scipy') else 'fancy'
    elif indexsum == 'sparse' and not externals.exists('scipy'):
        warning("Scipy.sparse isn't available so taking 'fancy' as "
                "'indexsum' method.")
        indexsum = 'fancy'
    self._indexsum = indexsum

    if self.nproc not in (None, 1):
        # call-style raise is valid on both Python 2 and Python 3
        raise NotImplementedError(
            "For now only nproc=1 (or None for "
            "autodetection) is supported by GNBSearchlight")

    self.__pb = None            # statistics per each block/label
    self.__reuse_neighbors = reuse_neighbors

    # Storage to be used for neighborhood information
    self.__roi_fids = None
def Atlas(filename=None, name=None, *args, **kwargs):
    """A convenience factory for the atlases

    Parameters
    ----------
    filename : str, optional
      Path to the XML atlas file.  Mutually exclusive with `name`.
    name : str, optional
      Key into ``KNOWN_ATLASES``, used to derive the filename.
    *args, **kwargs
      Passed on to the constructor of the specific atlas class.

    Returns
    -------
    An instance of the atlas class matching the source and type found in
    the file; a plain `XMLBasedAtlas` (loaded without maps) if the source
    or the type is not recognized.

    Raises
    ------
    ValueError
      If neither or both of `filename` and `name` are given.
    IOError
      If the file derived from `name` does not exist.
    XMLAtlasException
      Re-raised (after printing a message) if the file is not a valid
      XML based atlas.
    """
    if filename is None:
        if name is None:
            raise ValueError(
                "Please provide either path or name of the atlas to be used")
        atlaspath = KNOWN_ATLASES[name]
        filename = atlaspath % ({'name': name})
        if not os.path.exists(filename):
            raise IOError("File %s for atlas %s was not found"
                          % (filename, name))
    else:
        if name is not None:
            raise ValueError("Provide only filename or name")

    try:
        # Just to guesstimate what atlas that is
        tempAtlas = XMLBasedAtlas(filename=filename, load_maps=False)
        version = tempAtlas.version
        atlas_source = None
        for cls in [PyMVPAAtlas, FSLAtlas]:
            if cls._check_version(version):
                atlas_source = cls.source
                break
        if atlas_source is None:
            if __debug__:
                debug('ATL_', "Unknown atlas " + filename)
            return tempAtlas

        atlasTypes = {
            'PyMVPA': {
                "Label": LabelsAtlas,
                "Reference": ReferencesAtlas
            },
            'FSL': {
                "Label": FSLLabelsAtlas,
                # misspelled variant is a runtime key and is kept as is
                "Probabalistic": FSLProbabilisticAtlas,
                "Probabilistic": FSLProbabilisticAtlas,
            }
        }[atlas_source]
        atlasType = tempAtlas.header.type.text
        if atlasType in atlasTypes:
            if __debug__:
                debug('ATL_', "Creating %s Atlas" % atlasType)
            return atlasTypes[atlasType](filename=filename, *args, **kwargs)
        else:
            warning(
                "Unknown %s type '%s' of atlas in %s."
                " Known are %s"
                % (atlas_source, atlasType, filename,
                   list(atlasTypes.keys())), 2)
            return tempAtlas
    except XMLAtlasException as e:
        print("File %s is not a valid XML based atlas due to %s"
              % (filename, repr(e)))
        # bare raise re-raises the active exception with its original
        # traceback
        raise
def _set_retrainable(self, value, force=False):
    """Assign value of retrainable parameter

    If retrainable flag is to be changed, classifier has to be
    untrained.  Also internal attributes such as _changedData,
    __changedData_isset, and __idhashes should be initialized if
    it becomes retrainable

    Parameters
    ----------
    value
      New value of the 'retrainable' parameter (used as a boolean).
    force : bool, optional
      Apply the change even if `value` equals the current value.

    Nothing is done unless 'retrainable' is listed in ``self.__tags__``.
    """
    pretrainable = self.params["retrainable"]
    if (force or value != pretrainable.value) \
            and "retrainable" in self.__tags__:
        if __debug__:
            debug("CLF_", "Setting retrainable to %s" % value)
        if "meta" in self.__tags__:
            warning("Retrainability is not yet crafted/tested for "
                    "meta classifiers. Unpredictable behavior might occur")
        # assure that we don't drag anything behind
        if self.trained:
            self.untrain()
        ca = self.ca
        # membership test instead of has_key(), which is gone from
        # Python 3 mappings
        if not value and "retrained" in ca:
            ca.pop("retrained")
            ca.pop("repredicted")
        if value:
            # NOTE(review): appears unreachable, since the enclosing
            # condition already requires the 'retrainable' tag
            if "retrainable" not in self.__tags__:
                warning("Setting of flag retrainable for %s has no effect"
                        " since classifier has no such capability. It would"
                        " just lead to resources consumption and slowdown"
                        % self)
            ca["retrained"] = ConditionalAttribute(
                enabled=True,
                doc="Either retrainable classifier was retrained")
            ca["repredicted"] = ConditionalAttribute(
                enabled=True,
                doc="Either retrainable classifier was repredicted")

        pretrainable.value = value

        # if retrainable we need to keep track of things
        if value:
            self.__idhashes = {"traindata": None, "targets": None,
                               "testdata": None}  # , 'testtraindata': None}
            if __debug__ and "CHECK_RETRAIN" in debug.active:
                # ??? it is not clear though if idhash is faster than
                # simple comparison of (dataset != __traineddataset).any(),
                # but if we like to get rid of __traineddataset then we
                # should use idhash anyways
                self.__trained = self.__idhashes.copy()  # just same Nones
            self.__reset_changed_data()
            self.__invalidatedChangedData = {}
        elif "retrainable" in self.__tags__:
            # self.__reset_changed_data()
            self.__changedData_isset = False
            self._changedData = None
            self.__idhashes = None
            if __debug__ and "CHECK_RETRAIN" in debug.active:
                self.__trained = None
def __init__(self, space='targets', **kwargs):
    """Initialize the mapper on top of `ProjectionMapper`.

    `space` and any further keyword arguments are handed over to
    ``ProjectionMapper.__init__``.  If 'dgesvd' was requested as SVD
    implementation but 'liblapack.so' is not reported as available, the
    ``svd`` parameter is reset to 'numpy' with a warning.
    """
    ProjectionMapper.__init__(self, space=space, **kwargs)

    self._scale = None
    """Estimated scale"""

    if self.params.svd == 'dgesvd':
        # only this implementation depends on the external library
        if not externals.exists('liblapack.so'):
            warning("Reverting choice of svd for ProcrusteanMapper to be default "
                    "'numpy' since liblapack.so seems not to be available for "
                    "'dgesvd'")
            self.params.svd = 'numpy'
def _set_retrainable(self, value, force=False):
    """Assign value of retrainable parameter

    If retrainable flag is to be changed, classifier has to be
    untrained.  Also internal attributes such as _changedData,
    __changedData_isset, and __idhashes should be initialized if
    it becomes retrainable

    Parameters
    ----------
    value
      New value of the 'retrainable' parameter (used as a boolean).
    force : bool, optional
      Apply the change even if `value` equals the current value.

    Nothing is done unless 'retrainable' is listed in ``self.__tags__``.
    """
    pretrainable = self.params['retrainable']
    if (force or value != pretrainable.value) \
           and 'retrainable' in self.__tags__:
        if __debug__:
            debug("CLF_", "Setting retrainable to %s" % value)
        if 'meta' in self.__tags__:
            warning("Retrainability is not yet crafted/tested for "
                    "meta classifiers. Unpredictable behavior might occur")
        # assure that we don't drag anything behind
        if self.trained:
            self.untrain()
        ca = self.ca
        # has_key() no longer exists on Python 3 mappings; `in` works on
        # both major versions
        if not value and 'retrained' in ca:
            ca.pop('retrained')
            ca.pop('repredicted')
        if value:
            # NOTE(review): appears unreachable, since the enclosing
            # condition already requires the 'retrainable' tag
            if 'retrainable' not in self.__tags__:
                warning("Setting of flag retrainable for %s has no effect"
                        " since classifier has no such capability. It would"
                        " just lead to resources consumption and slowdown"
                        % self)
            ca['retrained'] = ConditionalAttribute(enabled=True,
                    doc="Either retrainable classifier was retrained")
            ca['repredicted'] = ConditionalAttribute(enabled=True,
                    doc="Either retrainable classifier was repredicted")

        pretrainable.value = value

        # if retrainable we need to keep track of things
        if value:
            self.__idhashes = {'traindata': None, 'targets': None,
                               'testdata': None} #, 'testtraindata': None}
            if __debug__ and 'CHECK_RETRAIN' in debug.active:
                # ??? it is not clear though if idhash is faster than
                # simple comparison of (dataset != __traineddataset).any(),
                # but if we like to get rid of __traineddataset then we
                # should use idhash anyways
                self.__trained = self.__idhashes.copy() # just same Nones
            self.__reset_changed_data()
            self.__invalidatedChangedData = {}
        elif 'retrainable' in self.__tags__:
            #self.__reset_changed_data()
            self.__changedData_isset = False
            self._changedData = None
            self.__idhashes = None
            if __debug__ and 'CHECK_RETRAIN' in debug.active:
                self.__trained = None
def fit(self, measure, ds):
    """Fit the distribution by performing multiple cycles which repeatedly
    permuted labels in the training dataset.

    Parameters
    ----------
    measure: Measure or None
      A measure used to compute the results from shuffled data. Can be None
      if a measure instance has been provided to the constructor.
    ds: `Dataset` which gets permuted and used to compute the
      measure/transfer error multiple times.

    Permutations for which the measure raises `LearnerError` are skipped
    with a warning.
    """
    # TODO: place exceptions separately so we could avoid circular imports
    from mvpa2.base.learner import LearnerError

    # prefer the already assigned measure over anything the was passed to
    # the function.
    # XXX that is a bit awkward but is necessary to keep the code changes
    # in the rest of PyMVPA minimal till this behavior become mandatory
    if self._measure is not None:
        measure = self._measure

    measure.untrain()

    # Holds the values for randomized labels.
    dist_samples = []

    # estimate null-distribution
    # TODO this really needs to be more clever! If data samples are
    # shuffled within a class it really makes no difference for the
    # classifier, hence the number of permutations to estimate the
    # null-distribution of transfer errors can be reduced dramatically
    # when the *right* permutations (the ones that matter) are done.
    skipped = 0                     # # of skipped permutations
    for p, permuted_ds in enumerate(self.__permutator.generate(ds)):
        # new permutation all the time
        # but only permute the training data and keep the testdata constant
        #
        if __debug__:
            debug("STATMC", "Doing %i permutations: %i"
                  % (self.__permutator.nruns, p + 1), cr=True)

        # compute and store the measure of this permutation
        # assume it has `TransferError` interface
        try:
            res = measure(permuted_ds)
            dist_samples.append(res.samples)
        except LearnerError as e:
            # `except X as e` is accepted by Python 2.6+ and Python 3,
            # unlike the former `except X, e`
            if __debug__:
                debug("STATMC", " skipped", cr=True)
            warning("Failed to obtain value from %s due to %s. Measurement"
                    " was skipped, which could lead to unstable and/or"
                    " incorrect assessment of the null_dist" % (measure, e))
            skipped += 1
            continue
def _forward_dataset(self, ds):
    """Apply the stored z-scoring parameters to the samples of `ds`.

    Parameters
    ----------
    ds
      Dataset to be z-scored.  It is processed in place only if
      ``self._secret_inplace_zscore`` is set; otherwise a shallow copy
      with its own copy of the samples is used.

    Returns
    -------
    The dataset holding the z-scored samples.

    Raises
    ------
    RuntimeError
      If the parameter dictionary is None (mapper not trained), or a
      chunk present in `ds` has no stored parameters.
    """
    # local binding
    chunks_attr = self.__chunks_attr
    dtype = self.__dtype

    if __debug__ and chunks_attr is not None:
        nsamples_per_chunk = get_nsamples_per_attr(ds, chunks_attr)
        # np.min() cannot reduce over a Python 3 dict view, the builtin
        # min() handles views and lists alike
        min_nsamples_per_chunk = min(nsamples_per_chunk.values())
        if min_nsamples_per_chunk in range(3, 6):
            warning(
                "Z-scoring chunk-wise having a chunk with only "
                "%d samples is 'discouraged'. "
                "You have chunks with following number of samples: %s"
                % (min_nsamples_per_chunk, nsamples_per_chunk)
            )
        if min_nsamples_per_chunk <= 2:
            warning(
                "Z-scoring chunk-wise having a chunk with less "
                "than three samples will set features in these "
                "samples to either zero (with 1 sample in a chunk) "
                "or -1/+1 (with 2 samples in a chunk). "
                "You have chunks with following number of samples: %s"
                % (nsamples_per_chunk,)
            )

    params = self.__params_dict
    if params is None:
        # the `raise Class, "message"` form is a syntax error on Python 3
        raise RuntimeError(
            "ZScoreMapper needs to be trained before call to forward")

    if self._secret_inplace_zscore:
        mds = ds
    else:
        # shallow copy to put the new stuff in
        mds = ds.copy(deep=False)
        # but deepcopy the samples since _zscore would modify inplace
        mds.samples = mds.samples.copy()

    # cast the data to float, since in-place operations below do not upcast!
    if np.issubdtype(mds.samples.dtype, np.integer):
        mds.samples = mds.samples.astype(dtype)

    if "__all__" in params:
        # we have a global parameter set
        mds.samples = self._zscore(mds.samples, *params["__all__"])
    else:
        # per chunk z-scoring
        for c in mds.sa[chunks_attr].unique:
            if c not in params:
                raise RuntimeError(
                    "%s has no parameters for chunk '%s'. It probably "
                    "wasn't present in the training dataset!?"
                    % (self.__class__.__name__, c)
                )
            slicer = np.where(mds.sa[chunks_attr].value == c)[0]
            mds.samples[slicer] = self._zscore(mds.samples[slicer], *params[c])

    return mds
def fit(self, measure, ds):
    """Fit the distribution by performing multiple cycles which repeatedly
    permuted labels in the training dataset.

    Parameters
    ----------
    measure: Measure or None
      A measure used to compute the results from shuffled data. Can be None
      if a measure instance has been provided to the constructor.
    ds: `Dataset` which gets permuted and used to compute the
      measure/transfer error multiple times.

    A permutation whose evaluation raises `LearnerError` is reported via
    a warning and skipped.
    """
    # TODO: place exceptions separately so we could avoid circular imports
    from mvpa2.base.learner import LearnerError

    # prefer the already assigned measure over anything the was passed to
    # the function.
    # XXX that is a bit awkward but is necessary to keep the code changes
    # in the rest of PyMVPA minimal till this behavior become mandatory
    if self._measure is not None:
        measure = self._measure

    measure.untrain()

    # Holds the values for randomized labels.
    dist_samples = []

    # estimate null-distribution
    # TODO this really needs to be more clever! If data samples are
    # shuffled within a class it really makes no difference for the
    # classifier, hence the number of permutations to estimate the
    # null-distribution of transfer errors can be reduced dramatically
    # when the *right* permutations (the ones that matter) are done.
    skipped = 0                     # # of skipped permutations
    for p, permuted_ds in enumerate(self.__permutator.generate(ds)):
        # new permutation all the time
        # but only permute the training data and keep the testdata constant
        #
        if __debug__:
            debug('STATMC', "Doing %i permutations: %i"
                  % (self.__permutator.count, p+1), cr=True)

        # compute and store the measure of this permutation
        # assume it has `TransferError` interface
        try:
            res = measure(permuted_ds)
            dist_samples.append(res.samples)
        except LearnerError as e:
            # the comma form `except X, e` is rejected by Python 3
            if __debug__:
                debug('STATMC', " skipped", cr=True)
            warning('Failed to obtain value from %s due to %s. Measurement'
                    ' was skipped, which could lead to unstable and/or'
                    ' incorrect assessment of the null_dist' % (measure, e))
            skipped += 1
            continue
def _call(self, dataset):
    """Perform the ROI search.

    Trains the query engine on `dataset`, determines the ROI center ids
    and hands the computation over to ``self._sl_call``.

    Parameters
    ----------
    dataset
      Input dataset; its ``nfeatures``, ``fa``, ``a`` and ``shape``
      attributes are accessed here.

    Returns
    -------
    The results object returned by ``self._sl_call`` (also stored in
    ``self.ca.raw_results``), carrying a 'mapper' attribute whenever
    `dataset` has one.

    Raises
    ------
    IndexError
      In debug mode, if an explicitly provided ROI id is not smaller
      than the number of features in `dataset`.
    """
    # local binding
    nproc = self.nproc

    if nproc is None and externals.exists('pprocess'):
        import pprocess
        try:
            nproc = pprocess.get_number_of_cores() or 1
        except AttributeError:
            warning("pprocess version %s has no API to figure out maximal "
                    "number of cores. Using 1"
                    % externals.versions['pprocess'])
            nproc = 1

    # train the queryengine
    self._queryengine.train(dataset)

    # decide whether to run on all possible center coords or just a provided
    # subset
    if isinstance(self.__roi_ids, str):
        # name of a feature attribute: its non-zero entries are the centers
        roi_ids = dataset.fa[self.__roi_ids].value.nonzero()[0]
    elif self.__roi_ids is not None:
        roi_ids = self.__roi_ids
        # safeguard against stupidity
        if __debug__:
            largest_id = max(roi_ids)
            if largest_id >= dataset.nfeatures:
                # the `raise Class, "message"` form is a syntax error on
                # Python 3
                raise IndexError(
                    "Maximal center_id found is %s whenever given "
                    "dataset has only %d features"
                    % (largest_id, dataset.nfeatures))
    else:
        roi_ids = np.arange(dataset.nfeatures)

    # pass to subclass
    results = self._sl_call(dataset, roi_ids, nproc)

    if 'mapper' in dataset.a:
        # since we know the space we can stick the original mapper into the
        # results as well
        if self.__roi_ids is None:
            results.a['mapper'] = copy.copy(dataset.a.mapper)
        else:
            # there is an additional selection step that needs to be
            # expressed by another mapper
            mapper = copy.copy(dataset.a.mapper)
            mapper.append(
                StaticFeatureSelection(roi_ids, dshape=dataset.shape[1:]))
            results.a['mapper'] = mapper

    # charge state
    self.ca.raw_results = results

    # return raw results, base-class will take care of transformations
    return results
def get_bold_run_model(self, model, subj, run):
    """Return the stimulation design for a particular subject/task/run.

    Parameters
    ----------
    model : int
      Model identifier.
    subj : int
      Subject identifier.
    run : int
      Run ID.

    Returns
    -------
    list
      One item per event in the run, sorted by onset. All items are
      dictionaries with the following keys: 'condition', 'onset',
      'duration', 'intensity', 'run', 'task', 'onset_idx', 'conset_idx'.
      'condition' is the name of the condition, 'onset', 'duration' and
      'intensity' are taken from the loaded onset definitions.
      'onset_idx' is the index of the event after sorting all events of
      the run by onset, 'conset_idx' is the index of the event within the
      onset definitions of its own condition.
      Conditions whose onset definition cannot be loaded (IOError) are
      skipped with a warning.
    """
    ev_fields = ('onset', 'duration', 'intensity')
    events = []
    # get onset info for specific subject/task/run combo
    for cond in self.get_model_conditions(model):
        task_id = cond['task']
        try:
            loaded = self._load_model_task_run_onsets(
                subj, model, task_id, run, cond['id'])
            evdata = np.atleast_1d(loaded)
        except IOError:
            warning("onset definition file not found; no information "
                    "about condition '%s' for run %i"
                    % (cond['name'], run))
            continue
        for cond_idx, ev in enumerate(evdata):
            record = dict((field, ev[field]) for field in ev_fields)
            record['task'] = task_id
            record['condition'] = cond['name']
            record['run'] = run
            record['conset_idx'] = cond_idx
            events.append(record)
    # order by time across the whole run and number the events accordingly
    events.sort(key=lambda x: x['onset'])
    for run_idx, record in enumerate(events):
        record['onset_idx'] = run_idx
    return events
def get_bold_run_model(self, model, subj, run):
    """Return the stimulation design for a particular subject/task/run.

    Parameters
    ----------
    model : int
      Model identifier.
    subj : int
      Subject identifier.
    run : int
      Run ID.

    Returns
    -------
    list
      One item per event in the run, sorted by onset. All items are
      dictionaries with the following keys: 'condition', 'onset',
      'duration', 'intensity', 'run', 'task', 'onset_idx', 'conset_idx'.
      'condition' is a literal label; 'onset', 'duration' and 'intensity'
      are copied from the loaded onset definitions.
      'onset_idx' is the index of the event specification sorted by time
      across the entire run, 'conset_idx' is analog but contains the
      onset index per condition, i.e. the nth entry of the respective
      condition in a run.
      Conditions for which loading the onsets raises IOError are skipped
      with a warning.
    """
    conditions = self.get_model_conditions(model)
    events = []
    ev_fields = ('onset', 'duration', 'intensity')

    # get onset info for specific subject/task/run combo
    for cond in conditions:
        task_id = cond['task']
        try:
            evdata = np.atleast_1d(
                self._load_model_task_run_onsets(subj, model, task_id, run,
                                                 cond['id']))
        except IOError:
            warning("onset definition file not found; no information "
                    "about condition '%s' for run %i"
                    % (cond['name'], run))
            continue
        for i, ev in enumerate(evdata):
            # direct dict construction, no intermediate list of pairs
            evdict = {field: ev[field] for field in ev_fields}
            evdict['task'] = task_id
            evdict['condition'] = cond['name']
            evdict['run'] = run
            evdict['conset_idx'] = i
            events.append(evdict)

    events = sorted(events, key=lambda x: x['onset'])
    for i, ev in enumerate(events):
        ev['onset_idx'] = i
    return events
def _check_range(self, c):
    """Validate voxel coordinates against ``self.extent``.

    Returns `c` unchanged if ``check_range`` accepts it; otherwise a
    warning is issued and ``[0, 0, 0]`` is returned instead.
    """
    if __debug__:
        debug('ATL__', "Querying for voxel %r" % (c,))

    if check_range(c, self.extent):
        return c

    warning("Coordinates %r are not within the extent %r." \
            " Reseting to (0,0,0)" % (c, self.extent))
    # assume that voxel [0,0,0] is blank, i.e. carries
    # no labels which could possibly result in evil outcome
    return [0, 0, 0]
def _check_range(self, c):
    """Check the voxel coordinates and adjust them if necessary.

    Coordinates rejected by ``check_range`` for ``self.extent`` trigger a
    warning and are replaced by the origin ``[0, 0, 0]``.
    """
    # check range
    if __debug__:
        debug('ATL__', "Querying for voxel %r" % (c, ))

    outside = not check_range(c, self.extent)
    if outside:
        warning("Coordinates %r are not within the extent %r." \
                " Reseting to (0,0,0)" % (c, self.extent))
        # assume that voxel [0,0,0] is blank, i.e. carries
        # no labels which could possibly result in evil outcome
        c = [0 for _ in range(3)]
    return c
def run(args):
    """Create a dataset from the given command line options and save it.

    Sources are checked in the order TXT file, NPY file, MRI image(s);
    if none is given the dataset is read from ``args.data``.  Afterwards
    the common attribute options are processed, optional motion
    parameters are added as sample attributes, and the dataset is saved
    with ``h5save`` under ``args.output`` ('.hdf5' is appended when the
    name does not end with it).
    """
    from mvpa2.base.hdf5 import h5save

    ds = None
    if args.txt_data is not None:
        verbose(1, "Load data from TXT file '%s'" % args.txt_data)
        samples = _load_from_txt(args.txt_data)
        ds = Dataset(samples)
    elif args.npy_data is not None:
        verbose(1, "Load data from NPY file '%s'" % args.npy_data)
        samples = _load_from_npy(args.npy_data)
        ds = Dataset(samples)
    elif args.mri_data is not None:
        verbose(1, "Load data from MRI image(s) %s" % args.mri_data)
        from mvpa2.datasets.mri import fmri_dataset
        vol_attr = {}
        requested = args.add_vol_attr
        if requested is not None:
            # XXX add a way to use the mapper of an existing dataset to
            # add a volume attribute without having to load the entire
            # mri data again
            vol_attr = dict(requested)
            # duplicates collapse in the dict, hence the length check
            if len(requested) != len(vol_attr):
                warning("--vol-attr option with duplicate attribute name: "
                        "check arguments!")
            verbose(2, "Add volumetric feature attributes: %s" % vol_attr)
        ds = fmri_dataset(args.mri_data, mask=args.mask, add_fa=vol_attr)

    have_hdf_input = args.data is not None
    if ds is None:
        if not have_hdf_input:
            raise RuntimeError('no data source specific')
        ds = hdf2ds(args.data)[0]
    elif have_hdf_input:
        verbose(1, 'ignoring dataset input in favor of other data source -- remove either one to disambiguate')

    # act on all attribute options
    ds = process_common_dsattr_opts(ds, args)

    if args.add_fsl_mcpar is not None:
        from mvpa2.misc.fsl.base import McFlirtParams
        mc_par = McFlirtParams(args.add_fsl_mcpar)
        for param in mc_par:
            target = 'mc_' + param
            verbose(2, "Add motion regressor as sample attribute '%s'"
                       % (target))
            ds.sa[target] = mc_par[param]

    verbose(3, "Dataset summary %s" % (ds.summary()))

    # and store
    outfilename = args.output
    if not outfilename.endswith('.hdf5'):
        outfilename = '%s.hdf5' % outfilename
    verbose(1, "Save dataset to '%s'" % outfilename)
    h5save(outfilename, ds, mkdir=True, compression=args.hdf5_compression)
def _forward_dataset_grouped(self, ds):
    """Apply the mapper function per combination of unique attribute values.

    For every combination of unique values of the attributes listed in
    ``self.__uattrs`` the matching samples (or features, depending on
    ``self.__axis``) are selected and passed through
    ``self.__smart_apply_along_axis``.  If ``self.__attrfx`` is set, it is
    applied to the selected values of every attribute in the collection.

    Returns
    -------
    tuple
      ``(mdata, attrs)``: the stacked processed samples and a dict
      mapping every attribute name to the list of processed attribute
      values (lists stay empty if ``self.__attrfx`` is None).

    Raises
    ------
    RuntimeError
      If ``self.__axis`` is neither 'samples' nor 'features'.
    """
    # reduce() is not a builtin on Python 3; functools provides it on
    # Python 2.6+ as well
    from functools import reduce

    mdata = []  # list of samples array pieces
    if self.__axis == 'samples':
        col = ds.sa
        axis = 0
    elif self.__axis == 'features':
        col = ds.fa
        axis = 1
    else:
        raise RuntimeError("This should not have happened!")

    attrs = dict(zip(col.keys(), [[] for i in col]))

    # create a dictionary for all unique elements in all attribute this
    # mapper should operate on
    self.__attrcombs = dict(
        zip(self.__uattrs, [col[attr].unique for attr in self.__uattrs]))
    # let it generate all combinations of unique elements in any attr
    for comb in _orthogonal_permutations(self.__attrcombs):
        # element-wise product of the per-attribute matches; items()
        # replaces the Python-2-only iteritems()
        selector = reduce(np.multiply,
                          [array_whereequal(col[attr].value, value)
                           for attr, value in comb.items()])

        # process the samples
        if axis == 0:
            samples = ds.samples[selector]
        else:
            samples = ds.samples[:, selector]

        # check if there were any samples for such a combination,
        # if not -- warning and skip the rest of the loop body
        # NOTE(review): for axis == 1 len() looks at the first dimension
        # of the column selection -- confirm this detects an empty
        # selection as intended
        if not len(samples):
            warning(
                'There were no samples for combination %s. It might be '
                'a sign of a disbalanced dataset %s.' % (comb, ds))
            continue

        fxed_samples = self.__smart_apply_along_axis(samples)
        mdata.append(fxed_samples)
        if self.__attrfx is not None:
            # and now all samples attributes
            fxed_attrs = [self.__attrfx(col[attr].value[selector])
                          for attr in col]
            for i, attr in enumerate(col):
                attrs[attr].append(fxed_attrs[i])

    if axis == 0:
        mdata = np.vstack(mdata)
    else:
        mdata = np.vstack(np.transpose(mdata))
    return mdata, attrs
def predict(self, dataset):
    """Predict classifier on data

    Shouldn't be overridden in subclasses unless explicitly needed
    to do so. Also subclasses trying to call super class's predict
    should call _predict if within _predict instead of predict()
    since otherwise it would loop

    Returns the predictions produced by ``self._predict`` (converted to
    literal labels if an attribute map is in use), or a list of None, one
    per sample, if the classifier was trained on no features.
    """
    ## ??? yoh: changed to asany from as without exhaustive check
    data = np.asanyarray(dataset.samples)
    if __debug__:
        # Verify that we have no NaN/Inf's which we do not "support" ATM
        all_finite = np.all(np.isfinite(data))
        if not all_finite:
            raise ValueError(
                "Some input data for predict is not finite (NaN or Inf)")
        debug("CLF", "Predicting classifier %s on ds %s", (self, dataset))

    # remember the time when started computing predictions
    started = time.time()

    ca = self.ca
    # to assure that those are reset (could be set due to testing
    # post-training)
    ca.reset(['estimates', 'predictions'])

    self._prepredict(dataset)

    if not (self.__trainednfeatures > 0
            or 'notrain2predict' in self.__tags__):
        warning(
            "Trying to predict using classifier trained on no features")
        if __debug__:
            debug("CLF",
                  "No features were present for training, prediction is " \
                  "bogus")
        result = [None] * data.shape[0]
    else:
        result = self._predict(dataset)

    ca.predicting_time = time.time() - started

    # with labels mapping in-place, we also need to go back to the
    # literal labels
    if self._attrmap:
        try:
            result = self._attrmap.to_literal(result)
        except KeyError as e:
            raise FailedToPredictError("Failed to convert predictions from numeric into " \
                                       "literals: %s" % e)

    self._postpredict(dataset, result)
    return result
def __assign_nibabel_version():
    """Try to import nibabel, translating a known import-time failure.

    Raises
    ------
    ImportError
      If importing nibabel fails with the message about an unexpected
      ``numpy.float128`` long double type.  Any other exception raised
      by the import is re-raised unchanged.
    """
    try:
        import nibabel
    except Exception as e:
        # `except X as e` replaces the Python-2-only `except X, e`.
        # FloatingError is defined in the same module which precludes
        # its specific except
        e_str = str(e)
        if "We had not expected long double type <type 'numpy.float128'>" in e_str:
            warning("Must be running under valgrind? Available nibabel experiences "
                    "difficulty with float128 upon import and fails to work, thus is "
                    "report as N/A")
            raise ImportError("Fail to import nibabel due to %s" % e_str)
        raise
def _warn_if_fmri_dataset(ds):
    """Warn if `ds` carries dataset attributes typical for volumetric data.

    `ds` must be an `AttrDataset` (asserted).  A warning is issued when
    any of 'imgaffine', 'imgtype' or 'imghdr' is among the keys of
    ``ds.a``.
    """
    assert isinstance(ds, AttrDataset)

    fmri_fields = set(['imgaffine', 'imgtype', 'imghdr'])
    found = set(ds.a.keys()).intersection(fmri_fields)
    if not found:
        return

    warning('dataset attribute .a has fields %s, which suggest it is an '
            'volumetric dataset. Converting this dataset to GIFTI '
            'format will most likely result in unvisualiable '
            '(and potentially, un-analysable) data. Consider using '
            'map2nifti instead' % (', '.join(found)))