def _do_subject_slice_timing(subject_data, ref_slice=0, slice_order="ascending", interleaved=False, caching=True, write_output_images=2, func_prefix=None, func_basenames=None, ext=None): if func_prefix is None: func_prefix = PREPROC_OUTPUT_IMAGE_PREFICES['STC'] if func_basenames is None: func_basenames = [get_basenames(func) for func in subject_data.func] # prepare for smart caching if caching: mem = Memory(cachedir=os.path.join( subject_data.output_dir, 'cache_dir'), verbose=100) runner = lambda handle: mem.cache(handle) if caching else handle stc_output = [] original_bold = subject_data.func for sess_func, sess_id in zip(subject_data.func, range(subject_data.n_sessions)): fmristc = runner(fMRISTC(slice_order=slice_order, ref_slice=ref_slice, interleaved=interleaved, verbose=True).fit)( raw_data=sess_func) stc_output.append(runner(fmristc.transform)( sess_func, output_dir=subject_data.tmp_output_dir if ( write_output_images > 0) else None, basenames=func_basenames[sess_id], prefix=func_prefix, ext=ext)) subject_data.func = stc_output del original_bold, fmristc if write_output_images > 1: subject_data.hardlink_output_files() return subject_data
def _delete_orientation(self): """ Delete orientation metadata. Garbage orientation metadata can lead to severe mis-registration trouble. """ # prepare for smart caching if self.scratch is None: self.scratch = self.output_dir cache_dir = os.path.join(self.scratch, 'cache_dir') if not os.path.exists(cache_dir): os.makedirs(cache_dir) mem = Memory(cachedir=cache_dir, verbose=5) # deleteorient for func for attr in ['n_sessions', 'session_output_dirs']: if getattr(self, attr) is None: warnings.warn("'%s' attribute of is None! Skipping" % attr) break else: self.func = [mem.cache(delete_orientation)( self.func[sess], self.session_output_dirs[sess]) for sess in range(self.n_sessions)] # deleteorient for anat if self.anat is not None: self.anat = mem.cache(delete_orientation)( self.anat, self.anat_output_dir)
def test_multilabel(self): cache = Memory(cachedir=tempfile.gettempdir()) cached_func = cache.cache( sklearn.datasets.make_multilabel_classification ) X, Y = cached_func( n_samples=150, n_features=20, n_classes=5, n_labels=2, length=50, allow_unlabeled=True, sparse=False, return_indicator=True, return_distributions=False, random_state=1 ) X_train = X[:100, :] Y_train = Y[:100, :] X_test = X[101:, :] Y_test = Y[101:, ] data = {'X_train': X_train, 'Y_train': Y_train, 'X_test': X_test, 'Y_test': Y_test} dataset_properties = {'multilabel': True} cs = SimpleClassificationPipeline(dataset_properties=dataset_properties).\ get_hyperparameter_search_space() self._test_configurations(configurations_space=cs, data=data)
def fit(self, X, y=None): """ Compute agglomerative clustering. Parameters ---------- X : array-like, shape=(n_samples, n_features) Returns ------- self """ memory = self.memory if isinstance(memory, six.string_types): memory = Memory(cachedir=memory, verbose=0) if self.n_landmarks is None: distances = memory.cache(pdist)(X, self.metric) else: if self.landmark_strategy == 'random': land_indices = check_random_state(self.random_state).randint(len(X), size=self.n_landmarks) else: land_indices = np.arange(len(X))[::(len(X)//self.n_landmarks)][:self.n_landmarks] distances = memory.cache(pdist)(X[land_indices], self.metric) tree = memory.cache(linkage)(distances, method=self.linkage) self.landmark_labels_ = fcluster(tree, criterion='maxclust', t=self.n_clusters) - 1 if self.n_landmarks is None: self.landmarks_ = X else: self.landmarks_ = X[land_indices] return self
def fetch_asirra(image_count=1000): partial_path = check_fetch_asirra() m = Memory(cachedir=partial_path, compress=6, verbose=0) load_func = m.cache(_fetch_asirra) images, target = load_func(partial_path, image_count=image_count) return Bunch(data=images.reshape(len(images), -1), images=images, target=target, DESCR="Asirra cats and dogs dataset")
def _cache(self, func, memory_level=1, **kwargs): """ Return a joblib.Memory object if necessary. The memory_level determines the level above which the wrapped function output is cached. By specifying a numeric value for this level, the user can to control the amount of cache memory used. This function will cache the function call or not depending on the cache level. Parameters ---------- func: python function The function which output is to be cached. memory_level: integer The memory_level from which caching must be enabled for the wrapped function. Returns ------- Either the original function, if there is no need to cache it (because the requested level is lower than the value given to _cache()) or a joblib.Memory object that wraps the function func. """ # Creates attributes if they don't exist # This is to make creating them in __init__() optional. if not hasattr(self, "memory_level"): self.memory_level = 0 if not hasattr(self, "memory"): self.memory = Memory(cachedir=None) # If cache level is 0 but a memory object has been provided, set # memory_level to 1 with a warning. if self.memory_level == 0: if (isinstance(self.memory, basestring) or self.memory.cachedir is not None): warnings.warn("memory_level is currently set to 0 but " "a Memory object has been provided. " "Setting memory_level to 1.") self.memory_level = 1 if self.memory_level < memory_level: mem = Memory(cachedir=None) return mem.cache(func, **kwargs) else: memory = self.memory if isinstance(memory, basestring): memory = Memory(cachedir=memory) if not isinstance(memory, Memory): raise TypeError("'memory' argument must be a string or a " "joblib.Memory object.") if memory.cachedir is None: warnings.warn("Caching has been enabled (memory_level = %d) but no" " Memory object or path has been provided (parameter" " memory). Caching deactivated for function %s." % (self.memory_level, func.func_name)) return memory.cache(func, **kwargs)
def get_all_metadata(config=None, args=None): if config == None and args == None: raise Exception('Either config or args need to be not None') if config == None: config = get_config(args) class_meta = read_class_meta(config.dataset.class_meta_file) attrib_meta_with_name = read_attribute_meta(config.dataset.attrib_meta_file) attrib_meta = attrib_meta_with_name.drop('class_name',axis=1) train_annos = read_image_annotations(config.dataset.train_annos_file) test_annos = read_image_annotations(config.dataset.test_annos_file, has_class_id=False) domain_meta = read_domain_meta(config.dataset.domain_meta_file) train_annos['class_name'] = np.array([class_meta.class_name[class_index] for class_index in train_annos.class_index]) # test_annos['class_name'] = np.array([class_meta.class_name[class_index] for # class_index in # test_annos.class_index]) # Prepand path to the dataset to each img_path train_annos.img_path = train_annos.img_path.apply(lambda x: config.dataset.main_path.joinpath(x).abspath()) test_annos.img_path = test_annos.img_path.apply(lambda x: config.dataset.main_path.joinpath(x).abspath()) # Filter the class meta and train/test annotations to just use the # domains defined in config class_meta = class_meta[class_meta.domain_index.isin(config.dataset.domains)] train_annos = train_annos[train_annos.domain_index.isin(config.dataset.domains)] test_annos = test_annos[test_annos.domain_index.isin(config.dataset.domains)] # Create dev set dev_annos_train, dev_annos_test = create_dev_set(train_annos, config) # Should we use the dev set as the test set if config.dataset.dev_set.use: train_used, test_used = dev_annos_train, dev_annos_test else: train_used, test_used = train_annos, test_annos if config.flip_images: memory = Memory(cachedir=config.cache_dir, verbose=config.logging.verbose) flip_func = memory.cache(create_flipped_images) train_used = flip_func(train_used, config) return ({'real_train_annos': train_annos, 'real_test_annos': test_annos, 'train_annos': train_used, 'test_annos': test_used, 'validation_annos': dev_annos_test, 'class_meta': class_meta, 'domain_meta': domain_meta, 'attrib_meta': attrib_meta, 'attrib_meta_with_name': attrib_meta_with_name}, config)
def cache(func, memory, ref_memory_level=2, memory_level=1, **kwargs): """ Return a joblib.Memory object. The memory_level determines the level above which the wrapped function output is cached. By specifying a numeric value for this level, the user can to control the amount of cache memory used. This function will cache the function call or not depending on the cache level. Parameters ---------- func: function The function which output is to be cached. memory: instance of joblib.Memory or string Used to cache the function call. ref_memory_level: int The reference memory_level used to determine if function call must be cached or not (if memory_level is larger than ref_memory_level the function is cached) memory_level: int The memory_level from which caching must be enabled for the wrapped function. kwargs: keyword arguments The keyword arguments passed to memory.cache Returns ------- mem: joblib.MemorizedFunc object that wraps the function func. This object may be a no-op, if the requested level is lower than the value given to _cache()). For consistency, a joblib.Memory object is always returned. """ if ref_memory_level <= memory_level or memory is None: memory = Memory(cachedir=None) else: memory = memory if isinstance(memory, basestring): memory = Memory(cachedir=memory) if not isinstance(memory, memory_classes): raise TypeError("'memory' argument must be a string or a " "joblib.Memory object. " "%s %s was given." % (memory, type(memory))) if memory.cachedir is None: warnings.warn("Caching has been enabled (memory_level = %d) " "but no Memory object or path has been provided" " (parameter memory). Caching deactivated for " "function %s." % (ref_memory_level, func.func_name), stacklevel=2) return memory.cache(func, **kwargs)
def _do_subject_coregister( subject_data, coreg_func_to_anat=True, caching=True, ext=None, write_output_images=2, func_basenames=None, func_prefix="", anat_basename=None, anat_prefix="", report=True, verbose=True): ref_brain = 'func' src_brain = 'anat' ref = subject_data.func[0] src = subject_data.anat if coreg_func_to_anat: ref_brain, src_brain = src_brain, ref_brain ref, src = src, ref # prepare for smart caching if caching: mem = Memory(cachedir=os.path.join( subject_data.output_dir, 'cache_dir'), verbose=100) runner = lambda handle: mem.cache(handle) if caching else handle # estimate realignment (affine) params for coreg coreg = runner(Coregister(verbose=verbose).fit)(ref, src) # apply coreg if coreg_func_to_anat: if func_basenames is None: func_basenames = [get_basenames(func) for func in subject_data.func] coreg_func = [] for sess_func, sess_id in zip(subject_data.func, range( subject_data.n_sessions)): coreg_func.append(runner(coreg.transform)( sess_func, output_dir=subject_data.tmp_output_dir if ( write_output_images == 2) else None, basenames=func_basenames[sess_id] if coreg_func_to_anat else anat_basename, prefix=func_prefix)) subject_data.func = coreg_func src = load_vols(subject_data.func[0])[0] else: if anat_basename is None: anat_basename = get_basenames(subject_data.anat) subject_data.anat = runner(coreg.transform)( subject_data.anat, basename=anat_basename, output_dir=subject_data.tmp_output_dir if ( write_output_images == 2) else None, prefix=anat_prefix, ext=ext) src = subject_data.anat # generate coregistration QA thumbs if report: subject_data.generate_coregistration_thumbnails( coreg_func_to_anat=coreg_func_to_anat, nipype=False) del coreg if write_output_images > 1: subject_data.hardlink_output_files() return subject_data
def fit(self, niimgs, y=None): """Compute the mask corresponding to the data Parameters ---------- niimgs: list of filenames or NiImages Data on which the mask must be calculated. If this is a list, the affine is considered the same for all. """ memory = self.memory if isinstance(memory, basestring): memory = Memory(cachedir=memory) # Load data (if filenames are given, load them) if self.verbose > 0: print "[%s.fit] Loading data from %s" % ( self.__class__.__name__, utils._repr_niimgs(niimgs)[:200]) data = [] for niimg in niimgs: # Note that data is not loaded into memory at this stage # if niimg is a string data.append(utils.check_niimgs(niimg, accept_3d=True)) # Compute the mask if not given by the user if self.mask is None: if self.verbose > 0: print "[%s.fit] Computing the mask" % self.__class__.__name__ mask = memory.cache(masking.compute_multi_epi_mask, ignore=['verbose'])( niimgs, connected=self.mask_connected, opening=self.mask_opening, lower_cutoff=self.mask_lower_cutoff, upper_cutoff=self.mask_upper_cutoff, n_jobs=self.n_jobs, verbose=(self.verbose - 1)) self.mask_img_ = Nifti1Image(mask.astype(np.int), data[0].get_affine()) else: self.mask_img_ = utils.check_niimg(self.mask) # If resampling is requested, resample also the mask # Resampling: allows the user to change the affine, the shape or both if self.verbose > 0: print "[%s.transform] Resampling mask" % self.__class__.__name__ self.mask_img_ = memory.cache(resampling.resample_img)( self.mask_img_, target_affine=self.target_affine, target_shape=self.target_shape, copy=(self.target_affine is not None and self.target_shape is not None)) return self
def get_multilabel(self): cache = Memory(cachedir=tempfile.gettempdir()) cached_func = cache.cache(make_multilabel_classification) return cached_func( n_samples=100, n_features=10, n_classes=5, n_labels=5, return_indicator=True, random_state=1 )
def _check_memory(memory, verbose=0): """Function to ensure an instance of a joblib.Memory object. Parameters ---------- memory: None or instance of joblib.Memory or str Used to cache the masking process. If a str is given, it is the path to the caching directory. verbose : int, optional (default 0) Verbosity level. Returns ------- instance of joblib.Memory. """ if memory is None: memory = Memory(cachedir=None, verbose=verbose) if isinstance(memory, _basestring): cache_dir = memory if nilearn.EXPAND_PATH_WILDCARDS: cache_dir = os.path.expanduser(cache_dir) # Perform some verifications on given path. split_cache_dir = os.path.split(cache_dir) if (len(split_cache_dir) > 1 and (not os.path.exists(split_cache_dir[0]) and split_cache_dir[0] != '')): if (not nilearn.EXPAND_PATH_WILDCARDS and cache_dir.startswith("~")): # Maybe the user want to enable expanded user path. error_msg = ("Given cache path parent directory doesn't " "exists, you gave '{0}'. Enabling " "nilearn.EXPAND_PATH_WILDCARDS could solve " "this issue.".format(split_cache_dir[0])) elif memory.startswith("~"): # Path built on top of expanded user path doesn't exist. error_msg = ("Given cache path parent directory doesn't " "exists, you gave '{0}' which was expanded " "as '{1}' but doesn't exist either. Use " "nilearn.EXPAND_PATH_WILDCARDS to deactivate " "auto expand user path (~) behavior." .format(split_cache_dir[0], os.path.dirname(memory))) else: # The given cache base path doesn't exist. error_msg = ("Given cache path parent directory doesn't " "exists, you gave '{0}'." .format(split_cache_dir[0])) raise ValueError(error_msg) memory = Memory(cachedir=cache_dir, verbose=verbose) return memory
def cache(self, func, func_memory_level, **kwargs): """ Return a joblib.Memory object if necessary (depends on memory_level) The memory_level is a rough estimator of the amount of memory necessary to cache a function call. By specifying a numeric value for this level, the user will be able to control more or less the memory used on his computer. This function will cache the function call or not depending on the memory level. This is an helper to avoid code pasting. Parameters ---------- self: python object The object containing information about caching. It must have a memory attribute (used if caching is necessary) and an integer memory_level attribute to determine if the function must be cached or not. func: python function The function that may be cached func_memory_level: integer The memory_level from which caching must be enabled. Returns ------- Either the original function (if there is no need to cache it) or a joblib.Memory object that will be used to cache the function call. """ # if memory level is 0 but a memory object is provided, put memory_level # to 1 with a warning if self.memory_level == 0: if hasattr(self, 'memory') and self.memory is not None \ and (isinstance(self.memory, basestring) or self.memory.cachedir is not None): warnings.warn("memory_level is set to 0 but a Memory object has" " been provided. Setting memory_level to 1.") self.memory_level = 1 if self.memory_level < func_memory_level: return func else: memory = self.memory if isinstance(memory, basestring): memory = Memory(cachedir=memory) if memory.cachedir is None: warnings.warn("Caching has been enabled (memory_level = %d) but no" " Memory object or path has been provided (parameter" " memory). Caching canceled for function %s." % (self.memory_level, func.func_name)) return memory.cache(func, **kwargs)
def _fit(self, X, y=None, **fit_params): self._validate_steps() # Setup the memory memory = self.memory if memory is None: memory = Memory(cachedir=None, verbose=0) elif isinstance(memory, six.string_types): memory = Memory(cachedir=memory, verbose=0) elif not isinstance(memory, Memory): raise ValueError("'memory' should either be a string or" " a joblib.Memory instance, got" " 'memory={!r}' instead.".format(memory)) fit_transform_one_cached = memory.cache(_fit_transform_one) fit_sample_one_cached = memory.cache(_fit_sample_one) fit_params_steps = dict((name, {}) for name, step in self.steps if step is not None) for pname, pval in six.iteritems(fit_params): step, param = pname.split('__', 1) fit_params_steps[step][param] = pval Xt = X yt = y for step_idx, (name, transformer) in enumerate(self.steps[:-1]): if transformer is None: pass else: if memory.cachedir is None: # we do not clone when caching is disabled to preserve # backward compatibility cloned_transformer = transformer else: cloned_transformer = clone(transformer) # Fit or load from cache the current transfomer if (hasattr(cloned_transformer, "transform") or hasattr(cloned_transformer, "fit_transform")): Xt, fitted_transformer = fit_transform_one_cached( cloned_transformer, None, Xt, yt, **fit_params_steps[name]) elif hasattr(cloned_transformer, "sample"): Xt, yt, fitted_transformer = fit_sample_one_cached( cloned_transformer, Xt, yt, **fit_params_steps[name]) # Replace the transformer of the step with the fitted # transformer. This is necessary when loading the transformer # from the cache. self.steps[step_idx] = (name, fitted_transformer) if self._final_estimator is None: return Xt, yt, {} return Xt, yt, fit_params_steps[self.steps[-1][0]]
def _niigz2nii(self): """ Convert .nii.gz to .nii (crucial for SPM). """ cache_dir = os.path.join(self.scratch, 'cache_dir') mem = Memory(cache_dir, verbose=100) self._sanitize_session_output_dirs() if not None in [self.func, self.n_sessions, self.session_output_dirs]: self.func = [mem.cache(do_niigz2nii)( self.func[sess], output_dir=self.session_output_dirs[sess]) for sess in range(self.n_sessions)] if not self.anat is None: self.anat = mem.cache(do_niigz2nii)( self.anat, output_dir=self.anat_output_dir)
def fit(self, niimgs, y=None): """Compute the mask corresponding to the data Parameters ---------- niimgs: list of filenames or NiImages Data on which the mask must be calculated. If this is a list, the affine is considered the same for all. """ memory = self.memory if isinstance(memory, basestring): memory = Memory(cachedir=memory) # Load data (if filenames are given, load them) if self.verbose > 0: print "[%s.fit] Loading data" % self.__class__.__name__ niimgs = utils.check_niimgs(niimgs, accept_3d=True) # Compute the mask if not given by the user if self.mask is None: if self.verbose > 0: print "[%s.fit] Computing the mask" % self.__class__.__name__ mask = memory.cache(masking.compute_epi_mask)( niimgs.get_data(), connected=self.mask_connected, opening=self.mask_opening, lower_cutoff=self.mask_lower_cutoff, upper_cutoff=self.mask_upper_cutoff, verbose=(self.verbose - 1), ) self.mask_ = Nifti1Image(mask.astype(np.int), niimgs.get_affine()) else: self.mask_ = utils.check_niimg(self.mask) # If resampling is requested, resample also the mask # Resampling: allows the user to change the affine, the shape or both if self.verbose > 0: print "[%s.transform] Resampling mask" % self.__class__.__name__ self.mask_ = memory.cache(resampling.resample_img)( self.mask_, target_affine=self.target_affine, target_shape=self.target_shape, copy=(self.target_affine is not None and self.target_shape is not None), ) return self
def fit(self, X, y=None, get_rhos=False): ''' Sets up for divergence estimation "from" new data "to" X. Builds FLANN indices for each bag, and maybe gets within-bag distances. Parameters ---------- X : list of arrays or :class:`skl_groups.features.Features` The bags to search "to". get_rhos : boolean, optional, default False Compute within-bag distances :attr:`rhos_`. These are only needed for some divergence functions or if do_sym is passed, and they'll be computed (and saved) during :meth:`transform` if they're not computed here. If you're using Jensen-Shannon divergence, a higher max_K may be needed once it sees the number of points in the transformed bags, so the computation here might be wasted. ''' self.features_ = X = as_features(X, stack=True, bare=True) # if we're using a function that needs to pick its K vals itself, # then we need to set max_K here. when we transform(), might have to # re-do this :| Ks = self._get_Ks() _, _, _, max_K, save_all_Ks, _ = _choose_funcs( self.div_funcs, Ks, X.dim, X.n_pts, None, self.version) if max_K >= X.n_pts.min(): msg = "asked for K = {}, but there's a bag with only {} points" raise ValueError(msg.format(max_K, X.n_pts.min())) memory = self.memory if isinstance(memory, string_types): memory = Memory(cachedir=memory, verbose=0) self.indices_ = id = memory.cache(_build_indices)(X, self._flann_args()) if get_rhos: self.rhos_ = _get_rhos(X, id, Ks, max_K, save_all_Ks, self.min_dist) elif hasattr(self, 'rhos_'): del self.rhos_ return self
def transform(self, X): r''' Computes the divergences from X to :attr:`features_`. Parameters ---------- X : list of bag feature arrays or :class:`skl_groups.features.Features` The bags to search "from". Returns ------- divs : array of shape ``[len(div_funcs), len(Ks), len(X), len(features_)] + ([2] if do_sym else [])`` The divergences from X to :attr:`features_`. ``divs[d, k, i, j]`` is the ``div_funcs[d]`` divergence from ``X[i]`` to ``fetaures_[j]`` using a K of ``Ks[k]``. If ``do_sym``, ``divs[d, k, i, j, 0]`` is :math:`D_{d,k}( X_i \| \texttt{features_}_j)` and ``divs[d, k, i, j, 1]`` is :math:`D_{d,k}(\texttt{features_}_j \| X_i)`. ''' X = as_features(X, stack=True, bare=True) Y = self.features_ Ks = np.asarray(self.Ks) if X.dim != Y.dim: msg = "incompatible dimensions: fit with {}, transform with {}" raise ValueError(msg.format(Y.dim, X.dim)) memory = self.memory if isinstance(memory, string_types): memory = Memory(cachedir=memory, verbose=0) # ignore Y_indices to avoid slow pickling of them # NOTE: if the indices are approximate, then might not get the same # results! est = memory.cache(_est_divs, ignore=['n_jobs', 'Y_indices', 'Y_rhos']) output, self.rhos_ = est( X, Y, self.indices_, getattr(self, 'rhos_', None), self.div_funcs, Ks, self.do_sym, self.clamp, self.version, self.min_dist, self._flann_args(), self._n_jobs) return output
def _do_subject_realign(subject_data, reslice=True, register_to_mean=False, caching=True, hardlink_output=True, ext=None, func_basenames=None, write_output_images=2, report=True, func_prefix=None): if register_to_mean: raise NotImplementedError("Feature pending...") if func_prefix is None: func_prefix = PREPROC_OUTPUT_IMAGE_PREFICES['MC'] if func_basenames is None: func_basenames = [get_basenames(func) for func in subject_data.func] # prepare for smart caching if caching: mem = Memory(cachedir=os.path.join( subject_data.output_dir, 'cache_dir'), verbose=100) runner = lambda handle: mem.cache(handle) if caching else handle mrimc = runner(MRIMotionCorrection( n_sessions=subject_data.n_sessions, verbose=True).fit)( [sess_func for sess_func in subject_data.func]) mrimc_output = runner(mrimc.transform)( reslice=reslice, output_dir=subject_data.tmp_output_dir if ( write_output_images == 2) else None, ext=ext, prefix=func_prefix, basenames=func_basenames) subject_data.func = mrimc_output['realigned_images'] subject_data.realignment_parameters = mrimc_output[ 'realignment_parameters'] # generate realignment thumbs if report: subject_data.generate_realignment_thumbnails(nipype=False) # garbage collection del mrimc if write_output_images > 1: subject_data.hardlink_output_files() return subject_data
def _do_subject_smooth(subject_data, fwhm, prefix=None, write_output_images=2, func_basenames=None, concat=False, caching=True): if prefix is None: prefix = PREPROC_OUTPUT_IMAGE_PREFICES['smoothing'] if func_basenames is None: func_basenames = [get_basenames(func) for func in subject_data.func] if caching: mem = Memory(cachedir=os.path.join( subject_data.output_dir, 'cache_dir'), verbose=100) sfunc = [] for sess in range(subject_data.n_sessions): sess_func = subject_data.func[sess] _tmp = mem.cache(smooth_image)(sess_func, fwhm) if write_output_images == 2: _tmp = mem.cache(save_vols)( _tmp, subject_data.output_dir, basenames=func_basenames[sess], prefix=prefix, concat=concat) sfunc.append(_tmp) subject_data.func = sfunc return subject_data
def fetch_asirra(image_count=1000): """ Parameters ---------- image_count : positive integer Returns ------- data : Bunch Dictionary-like object with the following attributes : 'images', the sample images, 'data', the flattened images, 'target', the label for the image (0 for cat, 1 for dog), and 'DESCR' the full description of the dataset. """ partial_path = check_fetch_asirra() m = Memory(cachedir=partial_path, compress=6, verbose=0) load_func = m.cache(_fetch_asirra) images, target = load_func(partial_path, image_count=image_count) return Bunch(data=images.reshape(len(images), -1), images=images, target=target, DESCR="Asirra cats and dogs dataset")
def fit(self, data, Y=None): if hasattr(data, 'copy'): # It's an array data = data.copy() else: # Probably a list data = copy.deepcopy(data) memory = self.memory if isinstance(memory, basestring): memory = Memory(cachedir=memory) pcas = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)( delayed(subject_pca)(subject_data, n_components=self.n_components, mem=memory) for subject_data in data) pcas = np.concatenate(pcas, axis=1) if self.kurtosis_thr is None: group_maps = memory.cache(randomized_svd)( pcas, self.n_components)[0] group_maps = group_maps[:, :self.n_components] ica_maps = memory.cache(fastica)(group_maps, whiten=False, fun='cube', random_state=self.random_state)[2] ica_maps = ica_maps.T else: ica_maps = self._find_high_kurtosis(pcas, memory) del pcas self.maps_ = ica_maps if not self.maps_only: # Relearn the time series self.learn_from_maps(data) return self
def __init__(self, mask_img=None, smoothing_fwhm=None, memory=Memory(None), memory_level=1, verbose=0, n_jobs=1, minimize_memory=True): self.mask_img = mask_img self.smoothing_fwhm = smoothing_fwhm if isinstance(memory, _basestring): self.memory = Memory(memory) else: self.memory = memory self.memory_level = memory_level self.verbose = verbose self.n_jobs = n_jobs self.minimize_memory = minimize_memory self.second_level_input_ = None self.confounds_ = None
def __init__(self, t_r=None, slice_time_ref=0., hrf_model='glover', drift_model='cosine', period_cut=128, drift_order=1, fir_delays=[0], min_onset=-24, mask_img=None, target_affine=None, target_shape=None, smoothing_fwhm=None, memory=Memory(None), memory_level=1, standardize=False, signal_scaling=0, noise_model='ar1', verbose=0, n_jobs=1, minimize_memory=True, subject_label=None): # design matrix parameters self.t_r = t_r self.slice_time_ref = slice_time_ref self.hrf_model = hrf_model self.drift_model = drift_model self.period_cut = period_cut self.drift_order = drift_order self.fir_delays = fir_delays self.min_onset = min_onset # glm parameters self.mask_img = mask_img self.target_affine = target_affine self.target_shape = target_shape self.smoothing_fwhm = smoothing_fwhm if isinstance(memory, _basestring): self.memory = Memory(memory) else: self.memory = memory self.memory_level = memory_level self.standardize = standardize if signal_scaling is False: self.signal_scaling = signal_scaling elif signal_scaling in [0, 1, (0, 1)]: self.scaling_axis = signal_scaling self.signal_scaling = True self.standardize = False else: raise ValueError('signal_scaling must be "False", "0", "1"' ' or "(0, 1)"') self.noise_model = noise_model self.verbose = verbose self.n_jobs = n_jobs self.minimize_memory = minimize_memory # attributes self.labels_ = None self.results_ = None self.subject_label = subject_label
@pytest.fixture def app_notest(): tapp = fd_app(cache_dir) tapp.config['TESTING'] = False client = tapp.test_client() client.post_check = app_call_wrapper(client.post) client.get_check = app_call_wrapper(client.get) client.delete_check = app_call_wrapper(client.delete) return client memory = Memory(cachedir=os.path.join(cache_dir, '_joblib_cache', str(os.getpid())), verbose=0) #=============================================================================# # # Feature extraction # #=============================================================================# def get_features(app, hashed=True, metadata_fields='data_dir'): method = V01 + "/feature-extraction/" pars = {"use_hashing": hashed} if metadata_fields == 'data_dir': pars["data_dir"] = data_dir elif metadata_fields == 'dataset_definition':
plt.title("%s / precision" % title) # Fetching datasets ########################################################### print("-- Fetching datasets ...") from nilearn import datasets msdl_atlas_dataset = datasets.fetch_msdl_atlas() adhd_dataset = datasets.fetch_adhd(n_subjects=1) # Extracting region signals ################################################### import nilearn.image import nilearn.input_data from sklearn.externals.joblib import Memory mem = Memory('nilearn_cache') masker = nilearn.input_data.NiftiMapsMasker( msdl_atlas_dataset.maps, resampling_target="maps", detrend=True, low_pass=None, high_pass=0.01, t_r=2.5, standardize=True, memory=mem, memory_level=1, verbose=2) masker.fit() fmri_filename = adhd_dataset.func[0] confound_filename = adhd_dataset.confounds[0] # Computing some confounds hv_confounds = mem.cache(nilearn.image.high_variance_confounds)( fmri_filename) time_series = masker.transform(fmri_filename,
Synopsis: Demo for coregistration in pure python It demos coregistration on a variety of datasets including: SPM single-subject auditory, NYU rest, ABIDE, etc. """ import os import glob import matplotlib.pyplot as plt from pypreprocess.datasets import fetch_spm_auditory, fetch_nyu_rest from pypreprocess.reporting.check_preprocessing import plot_registration from pypreprocess.coreg import Coregister from sklearn.externals.joblib import Memory # misc mem = Memory("demos_cache") def _run_demo(func, anat): # fit coreg = Coregister().fit(anat, func) # apply coreg VFk = coreg.transform(func) # QA plot_registration(anat, VFk, title="before coreg") plot_registration(VFk, anat, title="after coreg") plt.show()
(redirects_url, redirects_filename), (page_links_url, page_links_filename), ] for url, filename in resources: if not os.path.exists(filename): import urllib print "Downloading data from '%s', please wait..." % url opener = urllib.urlopen(url) open(filename, 'wb').write(opener.read()) print ############################################################################### # Loading the redirect files memory = Memory(cachedir=".") def index(redirects, index_map, k): """Find the index of an article name after redirect resolution""" k = redirects.get(k, k) return index_map.setdefault(k, len(index_map)) DBPEDIA_RESOURCE_PREFIX_LEN = len("http://dbpedia.org/resource/") SHORTNAME_SLICE = slice(DBPEDIA_RESOURCE_PREFIX_LEN + 1, -1) def short_name(nt_uri): """Remove the < and > URI markers and the common URI prefix""" return nt_uri[SHORTNAME_SLICE]
def boo(subject_idx=0, cut_coords=None, n_components=20, n_clusters=2000, memory='nilearn_cache'): mem = Memory(cachedir='nilearn_cache') # ## Load the data ################################################### print("Fetch the data files from Internet") haxby_dataset = datasets.fetch_haxby(n_subjects=subject_idx + 1) print("Second, load the labels") haxby_labels = np.genfromtxt(haxby_dataset.session_target[0], skip_header=1, usecols=[0], dtype=basestring) # ## Find voxels of interest ############################################## print("Load the data.") anat_filename = haxby_dataset.anat[subject_idx] anat_img = nibabel.load(anat_filename) fmri_filename = haxby_dataset.func[subject_idx] fmri_raw_img = nibabel.load(fmri_filename) print("Build a mask based on the activations.") epi_masker = NiftiMasker(mask_strategy='epi', detrend=True, standardize=True) epi_masker = mem.cache(epi_masker.fit)(fmri_raw_img) plot_roi(epi_masker.mask_img_, bg_img=anat_img, title='EPI mask', cut_coords=cut_coords) print( "Normalize the (transformed) data") # zscore per pixel, over examples. fmri_masked_vectors = epi_masker.transform(fmri_raw_img) fmri_normed_vectors = mem.cache(stats.mstats.zscore)(fmri_masked_vectors, axis=0) fmri_normed_img = epi_masker.inverse_transform(fmri_normed_vectors) print("Smooth the (spatial) data.") fmri_smooth_img = mem.cache(image.smooth_img)(fmri_normed_img, fwhm=1) print("Mask the MRI data.") masked_fmri_vectors = mem.cache(epi_masker.transform)(fmri_smooth_img) fmri_masked_img = epi_masker.inverse_transform(masked_fmri_vectors) # ## Compute mean values based on condition matrix ########################################## condition_names = list(np.unique(haxby_labels)) n_conditions = len(condition_names) n_good_voxels = masked_fmri_vectors.shape[1] mean_vectors = np.empty((n_conditions, n_good_voxels)) for ci, condition in enumerate(condition_names): condition_vectors = masked_fmri_vectors[haxby_labels == condition, :] mean_vectors[ci, :] = condition_vectors.mean(axis=0) # ## Use similarity across conditions as the 4th dimension ########################################## n_conds = len(condition_names) n_compares = n_conds * (n_conds - 1) / 2 p_vectors = np.zeros((n_compares, masked_fmri_vectors.shape[1])) comparison_text = [] comparison_img = [] idx = 0 for i, cond in enumerate(condition_names): for j, cond2 in enumerate(condition_names[i + 1:]): print("Computing ttest for %s vs. %s." % (cond, cond2)) _, p_vector = stats.ttest_ind( masked_fmri_vectors[haxby_labels == cond, :], masked_fmri_vectors[haxby_labels == cond2, :], axis=0) p_vector /= p_vector.max() # normalize p_vector = -np.log10(p_vector) p_vector[np.isnan(p_vector)] = 0. p_vector[p_vector > 10.] = 10. p_img = epi_masker.inverse_transform(p_vector) comparison_img.append(p_img) comparison_text.append('%s vs. 
%s' % (cond, cond2)) p_vectors[idx, :] = p_vector idx += 1 #n_comparisons = n_conditions * (n_conditions-1) / 2 #similarity_vectors = np.empty((n_good_voxels, n_comparisons)) #for vi in np.arange(n_good_voxels): # similarity_vectors[vi, :] = pdist(mean_vectors[:, vi]) # Compute a connectivity matrix (for constraining the clustering) mask_data = epi_masker.mask_img_.get_data().astype(np.bool) connectivity = sk_image.grid_to_graph(n_x=mask_data.shape[0], n_y=mask_data.shape[1], n_z=mask_data.shape[2], mask=mask_data) # Cluster (#2) start = time.time() ward = WardAgglomeration(n_clusters=n_clusters, connectivity=connectivity, memory=memory) ward.fit(p_vectors) print("Ward agglomeration %d clusters: %.2fs" % (n_clusters, time.time() - start)) # Compute an image with one ROI per label, and save to disk labels = ward.labels_ + 1 # Avoid 0 label - 0 means mask. labels_img = epi_masker.inverse_transform(labels) labels_img.to_filename('parcellation.nii') # Plot image with len(labels) ROIs, and store # the cut coordinates to reuse for all plots # and the figure for plotting all to a common axis first_plot = plot_roi(labels_img, title="Ward parcellation", bg_img=anat_img) plt.show()
def __init__(self, n_components=20, n_epochs=1, alpha=0., dict_init=None, random_state=None, l1_ratio=1, batch_size=20, replacement=False, reduction=1, projection='partial', learning_rate=1, offset=0, var_red='weight_based', shelve=True, mask=None, smoothing_fwhm=None, standardize=True, detrend=True, low_pass=None, high_pass=None, t_r=None, target_affine=None, target_shape=None, mask_strategy='epi', mask_args=None, memory=Memory(cachedir=None), memory_level=0, n_jobs=1, backend='python', verbose=0, trace_folder=None): BaseDecomposition.__init__( self, n_components=n_components, random_state=random_state, mask=mask, smoothing_fwhm=smoothing_fwhm, standardize=standardize, detrend=detrend, low_pass=low_pass, high_pass=high_pass, t_r=t_r, target_affine=target_affine, target_shape=target_shape, mask_strategy=mask_strategy, mask_args=mask_args, memory=memory, memory_level=memory_level, n_jobs=n_jobs, verbose=verbose, ) self.l1_ratio = l1_ratio self.alpha = alpha self.dict_init = dict_init self.n_epochs = n_epochs self.batch_size = batch_size self.reduction = reduction self.projection = projection self.var_red = var_red self.replacement = replacement self.backend = backend self.shelve = shelve self.trace_folder = trace_folder self.learning_rate = learning_rate self.offset = offset
class SecondLevelModel(BaseEstimator, TransformerMixin, CacheMixin): """ Implementation of the General Linear Model for multiple subject fMRI data Parameters ---------- mask_img: Niimg-like, NiftiMasker or MultiNiftiMasker object, optional, Mask to be used on data. If an instance of masker is passed, then its mask will be used. If no mask is given, it will be computed automatically by a MultiNiftiMasker with default parameters. Automatic mask computation assumes first level imgs have already been masked. smoothing_fwhm: float, optional If smoothing_fwhm is not None, it gives the size in millimeters of the spatial smoothing to apply to the signal. memory: string, optional Path to the directory used to cache the masking process and the glm fit. By default, no caching is done. Creates instance of joblib.Memory. memory_level: integer, optional Rough estimator of the amount of memory used by caching. Higher value means more memory for caching. verbose : integer, optional Indicate the level of verbosity. By default, nothing is printed. If 0 prints nothing. If 1 prints final computation time. If 2 prints masker computation details. n_jobs : integer, optional The number of CPUs to use to do the computation. -1 means 'all CPUs', -2 'all CPUs but one', and so on. minimize_memory : boolean, optional Gets rid of some variables on the model fit results that are not necessary for contrast computation and would only be useful for further inspection of model details. This has an important impact on memory consumption. True by default. """ @replace_parameters({'mask': 'mask_img'}, end_version='next') def __init__(self, mask_img=None, smoothing_fwhm=None, memory=Memory(None), memory_level=1, verbose=0, n_jobs=1, minimize_memory=True): self.mask_img = mask_img self.smoothing_fwhm = smoothing_fwhm if isinstance(memory, _basestring): self.memory = Memory(memory) else: self.memory = memory self.memory_level = memory_level self.verbose = verbose self.n_jobs = n_jobs self.minimize_memory = minimize_memory self.second_level_input_ = None self.confounds_ = None def fit(self, second_level_input, confounds=None, design_matrix=None): """ Fit the second-level GLM 1. create design matrix 2. do a masker job: fMRI_data -> Y 3. fit regression to (Y, X) Parameters ---------- second_level_input: list of `FirstLevelModel` objects or pandas DataFrame or list of Niimg-like objects. Giving FirstLevelModel objects will allow to easily compute the second level contast of arbitrary first level contrasts thanks to the first_level_contrast argument of the compute_contrast method. Effect size images will be computed for each model to contrast at the second level. If a pandas DataFrame, then they have to contain subject_label, map_name and effects_map_path. It can contain multiple maps that would be selected during contrast estimation with the argument first_level_contrast of the compute_contrast function. The DataFrame will be sorted based on the subject_label column to avoid order inconsistencies when extracting the maps. So the rows of the automatically computed design matrix, if not provided, will correspond to the sorted subject_label column. If list of Niimg-like objects then this is taken literally as Y for the model fit and design_matrix must be provided. confounds: pandas DataFrame, optional Must contain a subject_label column. All other columns are considered as confounds and included in the model. If design_matrix is provided then this argument is ignored. 
The resulting second level design matrix uses the same column names as in the given DataFrame for confounds. At least two columns are expected, "subject_label" and at least one confound. design_matrix: pandas DataFrame, optional Design matrix to fit the GLM. The number of rows in the design matrix must agree with the number of maps derived from second_level_input. Ensure that the order of maps given by a second_level_input list of Niimgs matches the order of the rows in the design matrix. """ # Check parameters # check first level input if isinstance(second_level_input, list): if len(second_level_input) < 2: raise ValueError('A second level model requires a list with at' 'least two first level models or niimgs') # Check FirstLevelModel objects case if isinstance(second_level_input[0], FirstLevelModel): models_input = enumerate(second_level_input) for model_idx, first_level_model in models_input: if (first_level_model.labels_ is None or first_level_model.results_ is None): raise ValueError( 'Model %s at index %i has not been fit yet' '' % (first_level_model.subject_label, model_idx)) if not isinstance(first_level_model, FirstLevelModel): raise ValueError(' object at idx %d is %s instead of' ' FirstLevelModel object' % (model_idx, type(first_level_model))) if confounds is not None: if first_level_model.subject_label is None: raise ValueError( 'In case confounds are provided, first level ' 'objects need to provide the attribute ' 'subject_label to match rows appropriately.' 'Model at idx %d does not provide it. ' 'To set it, you can do ' 'first_level_model.subject_label = "01"' '' % (model_idx)) # Check niimgs case elif isinstance(second_level_input[0], (str, Nifti1Image)): if design_matrix is None: raise ValueError('List of niimgs as second_level_input' ' require a design matrix to be provided') for model_idx, niimg in enumerate(second_level_input): if not isinstance(niimg, (str, Nifti1Image)): raise ValueError(' object at idx %d is %s instead of' ' Niimg-like object' % (model_idx, type(niimg))) # Check pandas dataframe case elif isinstance(second_level_input, pd.DataFrame): for col in ['subject_label', 'map_name', 'effects_map_path']: if col not in second_level_input.columns: raise ValueError('second_level_input DataFrame must have' ' columns subject_label, map_name and' ' effects_map_path') # Make sure subject_label contain strings second_level_columns = second_level_input.columns.tolist() labels_index = second_level_columns.index('subject_label') labels_dtype = second_level_input.dtypes[labels_index] if not isinstance(labels_dtype, np.object): raise ValueError('subject_label column must be of dtype ' 'object instead of dtype %s' % labels_dtype) elif isinstance(second_level_input, (str, Nifti1Image)): if design_matrix is None: raise ValueError('List of niimgs as second_level_input' ' require a design matrix to be provided') second_level_input = check_niimg(niimg=second_level_input, ensure_ndim=4) else: raise ValueError('second_level_input must be a list of' ' `FirstLevelModel` objects, a pandas DataFrame' ' or a list Niimg-like objects. 
Instead %s ' 'was provided' % type(second_level_input)) # check confounds if confounds is not None: if not isinstance(confounds, pd.DataFrame): raise ValueError('confounds must be a pandas DataFrame') if 'subject_label' not in confounds.columns: raise ValueError('confounds DataFrame must contain column' '"subject_label"') if len(confounds.columns) < 2: raise ValueError('confounds should contain at least 2 columns' 'one called "subject_label" and the other' 'with a given confound') # Make sure subject_label contain strings labels_index = confounds.columns.tolist().index('subject_label') labels_dtype = confounds.dtypes[labels_index] if not isinstance(labels_dtype, np.object): raise ValueError('subject_label column must be of dtype ' 'object instead of dtype %s' % labels_dtype) # check design matrix if design_matrix is not None: if not isinstance(design_matrix, pd.DataFrame): raise ValueError('design matrix must be a pandas DataFrame') # sort a pandas dataframe by subject_label to avoid inconsistencies # with the design matrix row order when automatically extracting maps if isinstance(second_level_input, pd.DataFrame): columns = second_level_input.columns.tolist() column_index = columns.index('subject_label') sorted_matrix = sorted( second_level_input.values, key=lambda x: x[column_index]) sorted_input = pd.DataFrame(sorted_matrix, columns=columns) second_level_input = sorted_input self.second_level_input_ = second_level_input self.confounds_ = confounds # Report progress t0 = time.time() if self.verbose > 0: sys.stderr.write("Fitting second level model. " "Take a deep breath\r") # Select sample map for masker fit and get subjects_label for design if isinstance(second_level_input, pd.DataFrame): sample_map = second_level_input['effects_map_path'][0] labels = second_level_input['subject_label'] subjects_label = labels.values.tolist() elif isinstance(second_level_input, Nifti1Image): sample_map = mean_img(second_level_input) elif isinstance(second_level_input[0], FirstLevelModel): sample_model = second_level_input[0] sample_condition = sample_model.design_matrices_[0].columns[0] sample_map = sample_model.compute_contrast( sample_condition, output_type='effect_size') labels = [model.subject_label for model in second_level_input] subjects_label = labels else: # In this case design matrix had to be provided sample_map = mean_img(second_level_input) # Create and set design matrix, if not given if design_matrix is None: design_matrix = make_second_level_design_matrix(subjects_label, confounds) self.design_matrix_ = design_matrix # Learn the mask. Assume the first level imgs have been masked. 
if not isinstance(self.mask_img, NiftiMasker): self.masker_ = NiftiMasker( mask_img=self.mask_img, smoothing_fwhm=self.smoothing_fwhm, memory=self.memory, verbose=max(0, self.verbose - 1), memory_level=self.memory_level) else: self.masker_ = clone(self.mask_img) for param_name in ['smoothing_fwhm', 'memory', 'memory_level']: our_param = getattr(self, param_name) if our_param is None: continue if getattr(self.masker_, param_name) is not None: warn('Parameter %s of the masker overriden' % param_name) setattr(self.masker_, param_name, our_param) self.masker_.fit(sample_map) # Report progress if self.verbose > 0: sys.stderr.write("\nComputation of second level model done in " "%i seconds\n" % (time.time() - t0)) return self def compute_contrast( self, second_level_contrast=None, first_level_contrast=None, second_level_stat_type=None, output_type='z_score'): """Generate different outputs corresponding to the contrasts provided e.g. z_map, t_map, effects and variance. Parameters ---------- second_level_contrast: str or array of shape (n_col), optional Where ``n_col`` is the number of columns of the design matrix, The string can be a formula compatible with the linear constraint of the Patsy library. Basically one can use the name of the conditions as they appear in the design matrix of the fitted model combined with operators /\*+- and numbers. Please check the patsy documentation for formula examples: http://patsy.readthedocs.io/en/latest/API-reference.html#patsy.DesignInfo.linear_constraint The default (None) is accepted if the design matrix has a single column, in which case the only possible contrast array([1]) is applied; when the design matrix has multiple columns, an error is raised. first_level_contrast: str or array of shape (n_col) with respect to FirstLevelModel, optional In case a list of FirstLevelModel was provided as second_level_input, we have to provide a contrast to apply to the first level models to get the corresponding list of images desired, that would be tested at the second level. In case a pandas DataFrame was provided as second_level_input this is the map name to extract from the pandas dataframe map_name column. It has to be a 't' contrast. second_level_stat_type: {'t', 'F'}, optional Type of the second level contrast output_type: str, optional Type of the output map. Can be 'z_score', 'stat', 'p_value', 'effect_size' or 'effect_variance' Returns ------- output_image: Nifti1Image The desired output image """ if self.second_level_input_ is None: raise ValueError('The model has not been fit yet') # first_level_contrast check if isinstance(self.second_level_input_[0], FirstLevelModel): if first_level_contrast is None: raise ValueError('If second_level_input was a list of ' 'FirstLevelModel, then first_level_contrast ' 'is mandatory. 
It corresponds to the ' 'second_level_contrast argument of the ' 'compute_contrast method of FirstLevelModel') # check contrast definition if second_level_contrast is None: if self.design_matrix_.shape[1] == 1: second_level_contrast = np.ones([1]) else: raise ValueError('No second-level contrast is specified.') if isinstance(second_level_contrast, np.ndarray): con_val = second_level_contrast if np.all(con_val == 0): raise ValueError('Contrast is null') else: design_info = DesignInfo(self.design_matrix_.columns.tolist()) constraint = design_info.linear_constraint(second_level_contrast) con_val = constraint.coefs # check output type if isinstance(output_type, _basestring): if output_type not in ['z_score', 'stat', 'p_value', 'effect_size', 'effect_variance']: raise ValueError( 'output_type must be one of "z_score", "stat"' ', "p_value", "effect_size" or "effect_variance"') else: raise ValueError('output_type must be one of "z_score", "stat",' ' "p_value", "effect_size" or "effect_variance"') # Get effect_maps appropriate for chosen contrast effect_maps = _infer_effect_maps(self.second_level_input_, first_level_contrast) # Check design matrix X and effect maps Y agree on number of rows if len(effect_maps) != self.design_matrix_.shape[0]: raise ValueError( 'design_matrix does not match the number of maps considered. ' '%i rows in design matrix do not match with %i maps' % (self.design_matrix_.shape[0], len(effect_maps))) # Fit an Ordinary Least Squares regression for parametric statistics Y = self.masker_.transform(effect_maps) if self.memory: mem_glm = self.memory.cache(run_glm, ignore=['n_jobs']) else: mem_glm = run_glm labels, results = mem_glm(Y, self.design_matrix_.values, n_jobs=self.n_jobs, noise_model='ols') # We save memory if inspecting model details is not necessary if self.minimize_memory: for key in results: results[key] = SimpleRegressionResults(results[key]) self.labels_ = labels self.results_ = results # We compute contrast object if self.memory: mem_contrast = self.memory.cache(compute_contrast) else: mem_contrast = compute_contrast contrast = mem_contrast(self.labels_, self.results_, con_val, second_level_stat_type) # We get desired output from contrast object estimate_ = getattr(contrast, output_type)() # Prepare the returned images output = self.masker_.inverse_transform(estimate_) contrast_name = str(con_val) output.header['descrip'] = ( '%s of contrast %s' % (output_type, contrast_name)) return output
aforementioned settings. In general, speed up is increasing as the index size grows. """ from __future__ import division import numpy as np from tempfile import gettempdir from time import time from sklearn.neighbors import NearestNeighbors from sklearn.neighbors.approximate import LSHForest from sklearn.datasets import make_blobs from sklearn.externals.joblib import Memory m = Memory(cachedir=gettempdir()) @m.cache() def make_data(n_samples, n_features, n_queries, random_state=0): """Create index and query data.""" print('Generating random blob-ish data') X, _ = make_blobs(n_samples=n_samples + n_queries, n_features=n_features, centers=100, shuffle=True, random_state=random_state) # Keep the last samples as held out query vectors: note since we used # shuffle=True we have ensured that index and query vectors are # samples from the same distribution (a mixture of 100 gaussians in this
""" import os from collections import namedtuple import matplotlib.pyplot as plt from sklearn.externals.joblib import Memory from pypreprocess.realign import MRIMotionCorrection from pypreprocess.reporting.check_preprocessing import ( plot_spm_motion_parameters) from pypreprocess.datasets import (fetch_fsl_feeds, fetch_spm_multimodal_fmri, fetch_spm_auditory) from nilearn.datasets import fetch_nyu_rest # data structure for subject data SubjectData = namedtuple('SubjectData', 'subject_id func output_dir') mem = Memory("demos_cache") def _demo_runner(subjects, dataset_id, **spm_realign_kwargs): """Demo runner. Parameters ---------- subjects: iterable for subject data each subject data can be anything, with a func (string or list of strings; existing file path(s)) and an output_dir (string, existing dir path) field dataset_id: string a short string describing the data being processed (e.g. "HAXBY!") Notes
def boo(subject_idx=0, cut_coords=None): mem = Memory(cachedir='nilearn_cache') # ## Load the data ################################################### print("Fetch the data files from Internet") haxby_dataset = datasets.fetch_haxby(n_subjects=subject_idx + 1) print("Second, load the labels") haxby_labels = np.genfromtxt(haxby_dataset.session_target[0], skip_header=1, usecols=[0], dtype=basestring) # ## Find voxels of interest ############################################## print("Load the data.") anat_filename = haxby_dataset.anat[subject_idx] anat_img = nibabel.load(anat_filename) fmri_filename = haxby_dataset.func[subject_idx] fmri_raw_img = nibabel.load(fmri_filename) shared_affine = fmri_raw_img.get_affine() print("Build a mask based on the activations.") epi_masker = NiftiMasker(mask_strategy='epi', detrend=True, standardize=True) epi_masker = mem.cache(epi_masker.fit)(fmri_raw_img) plot_roi(epi_masker.mask_img_, title='EPI mask', cut_coords=cut_coords) from nipy.labs.viz import plot_map # plot_map(epi_masker.mask_img_.get_data(), epi_masker.mask_img_.get_affine()) # plt.show() # exit() #print("Normalize the (transformed) data") # zscore per pixel, over examples. #fmri_masked_vectors = epi_masker.transform(fmri_raw_img) #fmri_normed_vectors = mem.cache(stats.mstats.zscore)(fmri_masked_vectors, axis=0) fmri_normed_img = fmri_raw_img #epi_masker.inverse_transform(fmri_normed_vectors) print("Smooth the (spatial) data.") fmri_smooth_img = mem.cache(image.smooth_img)(fmri_normed_img, fwhm=1) print("Mask the MRI data.") masked_fmri_vectors = mem.cache(epi_masker.transform)(fmri_smooth_img) fmri_masked_img = epi_masker.inverse_transform(masked_fmri_vectors) # ## Compute a similarity matrix ########################################## condition_names = list(np.unique(haxby_labels)) n_cond_img = (haxby_labels == condition_names[0]).sum() n_conds = len(condition_names) n_compares = n_conds * (n_conds - 1) / 2 p_vectors = np.zeros((n_compares, masked_fmri_vectors.shape[1])) idx = 0 for i, cond in enumerate(condition_names): for j, cond2 in enumerate(condition_names[i+1:]): print("Computing ttest for %s vs. %s." % (cond, cond2)) _, p_vectors[idx, :] = stats.ttest_ind( masked_fmri_vectors[haxby_labels == cond, :], masked_fmri_vectors[haxby_labels == cond2, :], axis=0) idx += 1 p_vectors_normd = p_vectors / p_vectors.max(axis=0) log_p_vectors = -np.log10(p_vectors) log_p_vectors[np.isnan(log_p_vectors)] = 0. log_p_vectors[log_p_vectors > 10.] = 10. #log_p_normd_vectors = log_p_vectors / log_p_vectors.sum(axis=0) plt.figure(); plt.hist(p_vectors_normd.max(axis=0), 100); plt.show() idx = 0 for i, cond in enumerate(condition_names): for j, cond2 in enumerate(condition_names[i+1:]): if cond != 'face' and cond2 != 'face': continue print("Plotting compares for %s vs. %s." % (cond, cond2)) log_p_img = epi_masker.inverse_transform(1/p_vectors[idx, :]) log_p_normd_img = epi_masker.inverse_transform(1. - p_vectors_normd[idx, :]) plot_two_maps(plot_stat_map, (log_p_img, "%s vs. %s." % (cond, cond2)), (log_p_normd_img, "%s vs. %s. (norm'd)" % (cond, cond2)), bg_img=anat_img) import pdb; pdb.set_trace() plt.show() idx += 1
def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0, metric='minkowski', p=2, leaf_size=40, algorithm='best', memory=Memory(cachedir=None, verbose=0), approx_min_span_tree=True, gen_min_span_tree=False, core_dist_n_jobs=4, allow_single_cluster=False, **kwargs): """Perform HDBSCAN clustering from a vector array or distance matrix. Parameters ---------- X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \ array of shape (n_samples, n_samples) A feature array, or array of distances between samples if ``metric='precomputed'``. min_cluster_size : int optional The minimum number of samples in a group for that group to be considered a cluster; groupings smaller than this size will be left as noise. min_samples : int, optional The number of samples in a neighborhood for a point to be considered as a core point. This includes the point itself. defaults to the min_cluster_size. alpha : float, optional A distance scaling parameter as used in robust single linkage. See (K. Chaudhuri and S. Dasgupta "Rates of convergence for the cluster tree."). (default 1.0) metric : string, or callable, optional The metric to use when calculating distance between instances in a feature array. If metric is a string or callable, it must be one of the options allowed by metrics.pairwise.pairwise_distances for its metric parameter. If metric is "precomputed", X is assumed to be a distance matrix and must be square. (default minkowski) p : int, optional p value to use if using the minkowski metric. (default 2) leaf_size : int, optional Leaf size for trees responsible for fast nearest neighbour queries. (default 40) algorithm : string, optional Exactly which algorithm to use; hdbscan has variants specialised for different characteristics of the data. By default this is set to ``best`` which chooses the "best" algorithm given the nature of the data. You can force other options if you believe you know better. Options are: * ``best`` * ``generic`` * ``prims_kdtree`` * ``prims_balltree`` * ``boruvka_kdtree`` * ``boruvka_balltree`` memory : Instance of joblib.Memory or string (optional) Used to cache the output of the computation of the tree. By default, no caching is done. If a string is given, it is the path to the caching directory. approx_min_span_tree : Bool, optional Whether to accept an only approximate minimum spanning tree. For some algorithms this can provide a significant speedup, but the resulting clustering may be of marginally lower quality. If you are willing to sacrifice speed for correctness you may want to explore this; in general this should be left at the default True. (default True) gen_min_span_tree : bool, optional Whether to generate the minimum spanning tree for later analysis. (default False) core_dist_n_jobs : int, optional Number of parallel jobs to run in core distance computations (if supported by the specific algorithm). (default 4) allow_single_cluster : boolean By default HDBSCAN* will not produce a single cluster, setting this to t=True will override this and allow single cluster results in the case that you feel this is a valid result for your dataset. (default False) **kwargs : optional Arguments passed to the distance metric Returns ------- labels : array [n_samples] Cluster labels for each point. Noisy samples are given the label -1. probabilities : array [n_samples] Cluster membership strengths for each point. Noisy samples are assigned 0. cluster_persistence : array, shape = [n_clusters] A score of how persistent each cluster is. 
        A score of 1.0 represents a perfectly stable cluster that persists
        over all distance scales, while a score of 0.0 represents a
        perfectly ephemeral cluster. These scores can be used to gauge the
        relative coherence of the clusters output by the algorithm.

    condensed_tree : record array
        The condensed cluster hierarchy used to generate clusters.

    single_linkage_tree : array [n_samples - 1, 4]
        The single linkage tree produced during clustering in scipy
        hierarchical clustering format
        (see http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html).

    min_spanning_tree : array [n_samples - 1, 3]
        The minimum spanning tree as an edgelist. If gen_min_span_tree was
        False this will be None.

    References
    ----------
    R. Campello, D. Moulavi, and J. Sander, "Density-Based Clustering Based on
    Hierarchical Density Estimates"
    In: Advances in Knowledge Discovery and Data Mining, Springer, pp 160-172.
    2013
    """
    if min_samples is None:
        min_samples = min_cluster_size

    if type(min_samples) is not int or type(min_cluster_size) is not int:
        raise ValueError('Min samples and min cluster size must be integers!')

    if min_samples <= 0 or min_cluster_size <= 0:
        raise ValueError('Min samples and min cluster size must be positive'
                         ' integers')

    if alpha <= 0.0 or type(alpha) is int:
        raise ValueError('Alpha must be a positive float value greater'
                         ' than 0!')

    if leaf_size < 1:
        raise ValueError('Leaf size must be greater than 0!')

    # Checks input and converts to an nd-array where possible
    X = check_array(X, accept_sparse='csr')
    # Python 2 and 3 compliant string_type checking
    if isinstance(memory, six.string_types):
        memory = Memory(cachedir=memory, verbose=0)

    if algorithm != 'best':
        if algorithm == 'generic':
            (single_linkage_tree, result_min_span_tree) = \
                memory.cache(_hdbscan_generic)(X, min_samples, alpha, metric,
                                               p, leaf_size,
                                               gen_min_span_tree, **kwargs)
        elif algorithm == 'prims_kdtree':
            if metric not in KDTree.valid_metrics:
                raise ValueError("Cannot use Prim's with KDTree for this"
                                 " metric!")
            (single_linkage_tree, result_min_span_tree) = \
                memory.cache(_hdbscan_prims_kdtree)(X, min_samples, alpha,
                                                    metric, p, leaf_size,
                                                    gen_min_span_tree,
                                                    **kwargs)
        elif algorithm == 'prims_balltree':
            if metric not in BallTree.valid_metrics:
                raise ValueError("Cannot use Prim's with BallTree for this"
                                 " metric!")
            (single_linkage_tree, result_min_span_tree) = \
                memory.cache(_hdbscan_prims_balltree)(X, min_samples, alpha,
                                                      metric, p, leaf_size,
                                                      gen_min_span_tree,
                                                      **kwargs)
        elif algorithm == 'boruvka_kdtree':
            if metric not in BallTree.valid_metrics:
                raise ValueError("Cannot use Boruvka with KDTree for this"
                                 " metric!")
            (single_linkage_tree, result_min_span_tree) = \
                memory.cache(_hdbscan_boruvka_kdtree)(X, min_samples, alpha,
                                                      metric, p, leaf_size,
                                                      approx_min_span_tree,
                                                      gen_min_span_tree,
                                                      core_dist_n_jobs,
                                                      **kwargs)
        elif algorithm == 'boruvka_balltree':
            if metric not in BallTree.valid_metrics:
                raise ValueError("Cannot use Boruvka with BallTree for this"
                                 " metric!")
            (single_linkage_tree, result_min_span_tree) = \
                memory.cache(_hdbscan_boruvka_balltree)(X, min_samples, alpha,
                                                        metric, p, leaf_size,
                                                        approx_min_span_tree,
                                                        gen_min_span_tree,
                                                        core_dist_n_jobs,
                                                        **kwargs)
        else:
            raise TypeError('Unknown algorithm type %s specified' % algorithm)
    else:
        if issparse(X) or metric not in FAST_METRICS:
            # We can't do much with sparse matrices ...
            (single_linkage_tree, result_min_span_tree) = \
                memory.cache(_hdbscan_generic)(X, min_samples, alpha, metric,
                                               p, leaf_size,
                                               gen_min_span_tree, **kwargs)
        elif metric in KDTree.valid_metrics:
            # TO DO: Need heuristic to decide when to go to boruvka;
            # still debugging for now
            if X.shape[1] > 60:
                (single_linkage_tree, result_min_span_tree) = \
                    memory.cache(_hdbscan_prims_kdtree)(X, min_samples, alpha,
                                                        metric, p, leaf_size,
                                                        gen_min_span_tree,
                                                        **kwargs)
            else:
                (single_linkage_tree, result_min_span_tree) = \
                    memory.cache(_hdbscan_boruvka_kdtree)(X, min_samples,
                                                          alpha, metric, p,
                                                          leaf_size,
                                                          approx_min_span_tree,
                                                          gen_min_span_tree,
                                                          core_dist_n_jobs,
                                                          **kwargs)
        else:  # Metric is a valid BallTree metric
            # TO DO: Need heuristic to decide when to go to boruvka;
            # still debugging for now
            if X.shape[1] > 60:
                # Use the BallTree-based Prim's variant here; the KDTree
                # variant would not support metrics that only BallTree handles.
                (single_linkage_tree, result_min_span_tree) = \
                    memory.cache(_hdbscan_prims_balltree)(X, min_samples,
                                                          alpha, metric, p,
                                                          leaf_size,
                                                          gen_min_span_tree,
                                                          **kwargs)
            else:
                (single_linkage_tree, result_min_span_tree) = \
                    memory.cache(_hdbscan_boruvka_balltree)(X, min_samples,
                                                            alpha, metric, p,
                                                            leaf_size,
                                                            approx_min_span_tree,
                                                            gen_min_span_tree,
                                                            core_dist_n_jobs,
                                                            **kwargs)

    return _tree_to_labels(X, single_linkage_tree, min_cluster_size,
                           allow_single_cluster) + (result_min_span_tree,)
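A minimal, hedged example of calling hdbscan() with on-disk caching; the toy data and the cache directory name below are arbitrary placeholders, not values from the original source.

import numpy as np

points = np.random.RandomState(0).randn(200, 2)  # toy 2D data, illustration only
(labels, probabilities, persistence,
 condensed_tree, single_linkage_tree, min_span_tree) = hdbscan(
    points, min_cluster_size=10, gen_min_span_tree=True,
    memory='hdbscan_cache')  # the expensive tree computation is memoized here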
def test_selectors(filepath='features_final.csv', scaler='minMax'): from sklearn.externals.joblib import Memory from tempfile import mkdtemp from shutil import rmtree X, y, genre_mapping = loadFeatures_NoSplit(filepath) X = normalize(X, scaler) pipe = Pipeline([('reduce_dim', PCA()), ('classify', LinearSVC())]) N_FEATURES_OPTIONS = [2, 10, 20, 40, 60, 120] C_OPTIONS = [1, 10] param_grid = [ { 'reduce_dim': [PCA(iterated_power=7), NMF()], 'reduce_dim__n_components': N_FEATURES_OPTIONS, 'classify__C': C_OPTIONS }, { 'reduce_dim': [SelectKBest(chi2)], 'reduce_dim__k': N_FEATURES_OPTIONS, 'classify__C': C_OPTIONS }, ] reducer_labels = ['PCA', 'NMF', 'KBest(chi2)'] # Create a temporary folder to store the transformers of the pipeline cachedir = mkdtemp() memory = Memory(cachedir=cachedir, verbose=10) cached_pipe = Pipeline([('reduce_dim', PCA()), ('classify', LinearSVC())], memory=memory) # This time, a cached pipeline will be used within the grid search grid = GridSearchCV(cached_pipe, cv=2, n_jobs=1, param_grid=param_grid) #grid = GridSearchCV(pipe, cv=3, n_jobs=1, param_grid=param_grid) grid.fit(X, y) # Delete the temporary cache before exiting rmtree(cachedir) mean_scores = np.array(grid.cv_results_['mean_test_score']) # scores are in the order of param_grid iteration, which is alphabetical mean_scores = mean_scores.reshape(len(C_OPTIONS), -1, len(N_FEATURES_OPTIONS)) # select score for best C mean_scores = mean_scores.max(axis=0) bar_offsets = (np.arange(len(N_FEATURES_OPTIONS)) * (len(reducer_labels) + 1) + .5) plt.figure() COLORS = 'bgrcmyk' for i, (label, reducer_scores) in enumerate(zip(reducer_labels, mean_scores)): plt.bar(bar_offsets + i, reducer_scores, label=label, color=COLORS[i]) plt.title("Comparing feature reduction techniques") plt.xlabel('Reduced number of features') plt.xticks(bar_offsets + len(reducer_labels) / 2, N_FEATURES_OPTIONS) plt.ylabel('Classification accuracy') plt.ylim((0, 1)) plt.legend(loc='upper left') plt.show()
"""Utility functions for autoreject.""" # Authors: Mainak Jas <*****@*****.**> # Denis A. Engemann <*****@*****.**> from collections import defaultdict import warnings import mne from mne.utils import check_version as version_is_greater_equal from mne import pick_types, pick_channels, pick_info from mne.channels.interpolation import _do_interp_dots from sklearn.externals.joblib import Memory mem = Memory(cachedir='cachedir') def _get_ch_type_from_picks(picks, info): """Get the channel types from picks.""" keys = list() for pp in picks: key = mne.io.pick.channel_type(info=info, idx=pp) if key not in keys: keys.append(key) return keys def _check_data(epochs, picks, ch_constraint='data_channels',
import tempfile

import pandas as pd

from sklearn.cluster import FeatureAgglomeration
from sklearn.feature_extraction.image import grid_to_graph
from sklearn.linear_model import BayesianRidge
from sklearn.pipeline import Pipeline
from sklearn.externals.joblib import Memory
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

hyper_data = pd.read_csv("../Data/headers3mgperml.csv", sep=',')
X = hyper_data.values[:, 16:]
y = hyper_data.values[:, 2]
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=75)

cv = KFold(2)
ridge = BayesianRidge()
cachedir = tempfile.mkdtemp()
mem = Memory(cachedir=cachedir, verbose=1)

# The spatial connectivity graph is required by FeatureAgglomeration below,
# so it must actually be built rather than left commented out.
connectivity = grid_to_graph(n_x=240, n_y=34)
ward = FeatureAgglomeration(n_clusters=10, connectivity=connectivity,
                            memory=mem)

clf = Pipeline([('ward', ward), ('ridge', ridge)])
clf = GridSearchCV(clf, {'ward__n_clusters': [10, 20, 30]}, n_jobs=1, cv=cv)
clf.fit(X, y)

# coef_ = clf.best_estimator_.steps[-1][1].coef_
# coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_)
# coef_agglomeration_ = coef_.reshape(240, 34)
def evaluate_model(dataset, pipeline_components, pipeline_parameters): input_data = pd.read_csv(dataset, compression='gzip', sep='\t') features = input_data.drop('class', axis=1).values.astype(float) labels = input_data['class'].values pipelines = [ dict(zip(pipeline_parameters.keys(), list(parameter_combination))) for parameter_combination in itertools.product( *pipeline_parameters.values()) ] # Create a temporary folder to store the transformers of the pipeline cachedir = mkdtemp() memory = Memory(cachedir=cachedir, verbose=0) with warnings.catch_warnings(): # Squash warning messages. Turn this off when debugging! warnings.simplefilter('ignore') for pipe_parameters in pipelines: pipeline = [] for component in pipeline_components: if component in pipe_parameters: args = pipe_parameters[component] pipeline.append(component(**args)) else: pipeline.append(component()) try: clf = make_pipeline(*pipeline, memory=memory) cv_predictions = cross_val_predict(estimator=clf, X=features, y=labels, cv=StratifiedKFold( n_splits=10, shuffle=True, random_state=90483257)) accuracy = accuracy_score(labels, cv_predictions) macro_f1 = f1_score(labels, cv_predictions, average='macro') balanced_accuracy = balanced_accuracy_score( labels, cv_predictions) except KeyboardInterrupt: sys.exit(1) # This is a catch-all to make sure that the evaluation won't crash due to a bad parameter # combination or bad data. Turn this off when debugging! except Exception as e: continue preprocessor_class = pipeline_components[0] preprocessor_param_string = 'default' if preprocessor_class in pipe_parameters: preprocessor_param_string = ','.join([ '{}={}'.format( parameter, '|'.join([x.strip() for x in str(value).split(',')])) for parameter, value in pipe_parameters[preprocessor_class].items() ]) classifier_class = pipeline_components[-1] param_string = ','.join([ '{}={}'.format(parameter, value) for parameter, value in pipe_parameters[classifier_class].items() ]) out_text = '\t'.join([ dataset.split('/')[-1][:-7], preprocessor_class.__name__, preprocessor_param_string, classifier_class.__name__, param_string, str(accuracy), str(macro_f1), str(balanced_accuracy) ]) print(out_text) sys.stdout.flush() # Delete the temporary cache before exiting rmtree(cachedir)
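A hedged sketch of how evaluate_model might be invoked; the dataset path, component classes and parameter grid below are illustrative placeholders, not values from the original source.

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# One pipeline per parameter combination is built and its transformers cached during CV.
evaluate_model('datasets/example_data.tsv.gz',
               pipeline_components=[StandardScaler, LogisticRegression],
               pipeline_parameters={LogisticRegression: [{'C': 0.1}, {'C': 1.0}]})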
msdl_atlas_dataset = datasets.fetch_atlas_msdl() adhd_dataset = datasets.fetch_adhd(n_subjects=n_subjects) # print basic information on the dataset print('First subject functional nifti image (4D) is at: %s' % adhd_dataset.func[0]) # 4D data ############################################################################## # Extracting region signals # -------------------------- from nilearn import image from nilearn import input_data # A "memory" to avoid recomputation from sklearn.externals.joblib import Memory mem = Memory('nilearn_cache') masker = input_data.NiftiMapsMasker(msdl_atlas_dataset.maps, resampling_target="maps", detrend=True, low_pass=None, high_pass=0.01, t_r=2.5, standardize=True, memory='nilearn_cache', memory_level=1, verbose=2) masker.fit() subject_time_series = [] func_filenames = adhd_dataset.func
def test_pipeline_memory_sampler(): X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=0) cachedir = mkdtemp() try: memory = Memory(cachedir=cachedir, verbose=10) # Test with Transformer + SVC clf = SVC(probability=True, random_state=0) transf = DummySampler() pipe = Pipeline([('transf', clone(transf)), ('svc', clf)]) cached_pipe = Pipeline([('transf', transf), ('svc', clf)], memory=memory) # Memoize the transformer at the first fit cached_pipe.fit(X, y) pipe.fit(X, y) # Get the time stamp of the tranformer in the cached pipeline ts = cached_pipe.named_steps['transf'].timestamp_ # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) assert_array_equal(pipe.named_steps['transf'].means_, cached_pipe.named_steps['transf'].means_) assert_false(hasattr(transf, 'means_')) # Check that we are reading the cache while fitting # a second time cached_pipe.fit(X, y) # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) assert_array_equal(pipe.named_steps['transf'].means_, cached_pipe.named_steps['transf'].means_) assert_equal(ts, cached_pipe.named_steps['transf'].timestamp_) # Create a new pipeline with cloned estimators # Check that even changing the name step does not affect the cache hit clf_2 = SVC(probability=True, random_state=0) transf_2 = DummySampler() cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)], memory=memory) cached_pipe_2.fit(X, y) # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe_2.predict_proba(X)) assert_array_equal(pipe.predict_log_proba(X), cached_pipe_2.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y)) assert_array_equal(pipe.named_steps['transf'].means_, cached_pipe_2.named_steps['transf_2'].means_) assert_equal(ts, cached_pipe_2.named_steps['transf_2'].timestamp_) finally: shutil.rmtree(cachedir)
def mask_and_reduce(masker, imgs, confounds=None, reduction_ratio='auto', n_components=None, random_state=None, memory_level=0, memory=Memory(cachedir=None), n_jobs=1): """Mask and reduce provided 4D images with given masker. Uses a PCA (randomized for small reduction ratio) or a range finding matrix on time series to reduce data size in time direction. For multiple images, the concatenation of data is returned, either as an ndarray or a memorymap (useful for big datasets that do not fit in memory). Parameters ---------- masker: NiftiMasker or MultiNiftiMasker Instance used to mask provided data. imgs: list of 4D Niimg-like objects See http://nilearn.github.io/manipulating_images/input_output.html List of subject data to mask, reduce and stack. confounds: CSV file path or 2D matrix, optional This parameter is passed to signal.clean. Please see the corresponding documentation for details. reduction_ratio: 'auto' or float between 0. and 1. - Between 0. or 1. : controls data reduction in the temporal domain , 1. means no reduction, < 1. calls for an SVD based reduction. - if set to 'auto', estimator will set the number of components per reduced session to be n_components. n_components: integer, optional Number of components per subject to be extracted by dimension reduction random_state: int or RandomState Pseudo number generator state used for random sampling. memory_level: integer, optional Integer indicating the level of memorization. The higher, the more function calls are cached. memory: joblib.Memory Used to cache the function calls. Returns ------ data: ndarray or memorymap Concatenation of reduced data. """ if not hasattr(imgs, '__iter__'): imgs = [imgs] if reduction_ratio == 'auto': if n_components is None: # Reduction ratio is 1 if # neither n_components nor ratio is provided reduction_ratio = 1 else: if reduction_ratio is None: reduction_ratio = 1 else: reduction_ratio = float(reduction_ratio) if not 0 <= reduction_ratio <= 1: raise ValueError('Reduction ratio should be between 0. and 1.,' 'got %.2f' % reduction_ratio) if confounds is None: confounds = itertools.repeat(confounds) if reduction_ratio == 'auto': n_samples = n_components reduction_ratio = None else: # We'll let _mask_and_reduce_single decide on the number of # samples based on the reduction_ratio n_samples = None data_list = Parallel(n_jobs=n_jobs)( delayed(_mask_and_reduce_single)(masker, img, confound, reduction_ratio=reduction_ratio, n_samples=n_samples, memory=memory, memory_level=memory_level, random_state=random_state) for img, confound in zip(imgs, confounds)) subject_n_samples = [subject_data.shape[0] for subject_data in data_list] n_samples = np.sum(subject_n_samples) n_voxels = int(np.sum(_safe_get_data(masker.mask_img_))) data = np.empty((n_samples, n_voxels), order='F', dtype='float64') current_position = 0 for i, next_position in enumerate(np.cumsum(subject_n_samples)): data[current_position:next_position] = data_list[i] current_position = next_position # Clear memory as fast as possible: remove the reference on # the corresponding block of data data_list[i] = None return data
            depth=4, num_round=100)  # good!
        y_pred = clf.multi(X_train_cv, y_train_cv, X_test_cv, 3,
                           y_test=y_test_cv)
        xx.append(multiclass_log_loss(y_test_cv, y_pred))
        print(xx[-1])  # ,y_pred.shape,zz[-1]
        ypred[test_index] = y_pred
    print(xx)
    print('average:', np.mean(xx), 'std', np.std(xx))
    return ypred, np.mean(xx)


mem = Memory("./mycache")


@mem.cache
def get_data(name):
    # Cached on disk: reloading the same svmlight file is served from ./mycache
    data = load_svmlight_file(name)
    return data[0], data[1]


X, _ = get_data('../sparse/rebuild1.svm')
X1, _ = get_data('../sparse/rebuild2.svm')
X2, _ = get_data('../sparse/rebuild3.svm')
X3, _ = get_data('../sparse/rebuild4.svm')
X4, _ = get_data('../sparse/rebuild5.svm')
X = sparse.hstack([X, X1, X2, X3, X4], format='csr').todense()

train = pd.read_csv('../explore/train1.csv')
def first_level_models_from_bids(dataset_path, task_label, space_label, img_filters=None, t_r=None, slice_time_ref=0., hrf_model='glover', drift_model='cosine', period_cut=128, drift_order=1, fir_delays=[0], min_onset=-24, mask=None, target_affine=None, target_shape=None, smoothing_fwhm=None, memory=Memory(None), memory_level=1, standardize=False, signal_scaling=0, noise_model='ar1', verbose=0, n_jobs=1, minimize_memory=True, derivatives_folder='derivatives'): """Create FirstLevelModel objects and fit arguments from a BIDS dataset. It t_r is not specified this function will attempt to load it from a bold.json file alongside slice_time_ref. Otherwise t_r and slice_time_ref are taken as given. Parameters ---------- dataset_path: str Directory of the highest level folder of the BIDS dataset. Should contain subject folders and a derivatives folder. task_label: str Task_label as specified in the file names like _task-<task_label>_. space_label: str, optional Specifies the space label of the preproc.nii images. As they are specified in the file names like _space-<space_label>_. img_filters: list of tuples (str, str), optional (default: None) Filters are of the form (field, label). Only one filter per field allowed. A file that does not match a filter will be discarded. Possible filters are 'acq', 'rec', 'run', 'res' and 'variant'. Filter examples would be (variant, smooth), (acq, pa) and (res, 1x1x1). derivatives_folder: str, optional derivatives and app folder path containing preprocessed files. Like "derivatives/FMRIPREP". default is simply "derivatives". All other parameters correspond to a `FirstLevelModel` object, which contains their documentation. The subject label of the model will be determined directly from the BIDS dataset. Returns ------- models: list of `FirstLevelModel` objects Each FirstLevelModel object corresponds to a subject. All runs from different sessions are considered together for the same subject to run a fixed effects analysis on them. models_run_imgs: list of list of Niimg-like objects, Items for the FirstLevelModel fit function of their respective model. models_events: list of list of pandas DataFrames, Items for the FirstLevelModel fit function of their respective model. models_confounds: list of list of pandas DataFrames or None, Items for the FirstLevelModel fit function of their respective model. """ # check arguments img_filters = img_filters if img_filters else [] if not isinstance(dataset_path, str): raise TypeError('dataset_path must be a string, instead %s was given' % type(task_label)) if not os.path.exists(dataset_path): raise ValueError('given path do not exist: %s' % dataset_path) if not isinstance(task_label, str): raise TypeError('task_label must be a string, instead %s was given' % type(task_label)) if not isinstance(space_label, str): raise TypeError('space_label must be a string, instead %s was given' % type(space_label)) if not isinstance(img_filters, list): raise TypeError('img_filters must be a list, instead %s was given' % type(img_filters)) for img_filter in img_filters: if (not isinstance(img_filter[0], str) or not isinstance(img_filter[1], str)): raise TypeError('filters in img filters must be (str, str), ' 'instead %s was given' % type(img_filter)) if img_filter[0] not in ['acq', 'rec', 'run', 'res', 'variant']: raise ValueError("field %s is not a possible filter. Only " "'acq', 'rec', 'run', 'res' and 'variant' " "are allowed." 
% type(img_filter[0])) # check derivatives folder is present derivatives_path = os.path.join(dataset_path, derivatives_folder) if not os.path.exists(derivatives_path): raise ValueError('derivatives folder does not exist in given dataset') # Get acq specs for models. RepetitionTime and SliceTimingReference. # Throw warning if no bold.json is found if t_r is not None: warn('RepetitionTime given in model_init as %d' % t_r) warn('slice_time_ref is %d percent of the repetition ' 'time' % slice_time_ref) else: filters = [('task', task_label)] for img_filter in img_filters: if img_filter[0] in ['acq', 'rec', 'run']: filters.append(img_filter) img_specs = get_bids_files(derivatives_path, modality_folder='func', file_tag='preproc', file_type='json', filters=filters) # If we dont find the parameter information in the derivatives folder # we try to search in the raw data folder if not img_specs: img_specs = get_bids_files(dataset_path, modality_folder='func', file_tag='bold', file_type='json', filters=filters) if not img_specs: warn('No preproc.json found in derivatives folder and no bold.json' ' in dataset folder. t_r can not be inferred and will need to' ' be set manually in the list of models, otherwise their fit ' 'will throw an exception') else: specs = json.load(open(img_specs[0], 'r')) if 'RepetitionTime' in specs: t_r = float(specs['RepetitionTime']) else: warn('RepetitionTime not found in file %s. t_r can not be ' 'inferred and will need to be set manually in the ' 'list of models. Otherwise their fit will throw an ' ' exception' % img_specs[0]) if 'SliceTimingRef' in specs: slice_time_ref = float(specs['SliceTimingRef']) else: warn('SliceTimingRef not found in file %s. It will be assumed' ' that the slice timing reference is 0.0 percent of the ' 'repetition time. If it is not the case it will need to ' 'be set manually in the generated list of models' % img_specs[0]) # Infer subjects in dataset sub_folders = glob.glob(os.path.join(derivatives_path, 'sub-*/')) sub_labels = [os.path.basename(s[:-1]).split('-')[1] for s in sub_folders] sub_labels = sorted(list(set(sub_labels))) # Build fit_kwargs dictionaries to pass to their respective models fit # Events and confounds files must match number of imgs (runs) models = [] models_run_imgs = [] models_events = [] models_confounds = [] for sub_label in sub_labels: # Create model model = FirstLevelModel(t_r=t_r, slice_time_ref=slice_time_ref, hrf_model=hrf_model, drift_model=drift_model, period_cut=period_cut, drift_order=drift_order, fir_delays=fir_delays, min_onset=min_onset, mask=mask, target_affine=target_affine, target_shape=target_shape, smoothing_fwhm=smoothing_fwhm, memory=memory, memory_level=memory_level, standardize=standardize, signal_scaling=signal_scaling, noise_model=noise_model, verbose=verbose, n_jobs=n_jobs, minimize_memory=minimize_memory, subject_label=sub_label) models.append(model) # Get preprocessed imgs filters = [('task', task_label), ('space', space_label)] + img_filters imgs = get_bids_files(derivatives_path, modality_folder='func', file_tag='preproc', file_type='nii*', sub_label=sub_label, filters=filters) # If there is more than one file for the same (ses, run), likely we # have an issue of underspecification of filters. run_check_list = [] # If more than one run is present the run field is mandatory in BIDS # as well as the ses field if more than one session is present. 
if len(imgs) > 1: for img in imgs: img_dict = parse_bids_filename(img) if ('_ses-' in img_dict['file_basename'] and '_run-' in img_dict['file_basename']): if (img_dict['ses'], img_dict['run']) in run_check_list: raise ValueError( 'More than one nifti image found for the same run ' '%s and session %s. Please verify that the ' 'preproc_variant and space_label labels ' 'corresponding to the BIDS spec ' 'were correctly specified.' % (img_dict['run'], img_dict['ses'])) else: run_check_list.append( (img_dict['ses'], img_dict['run'])) elif '_ses-' in img_dict['file_basename']: if img_dict['ses'] in run_check_list: raise ValueError( 'More than one nifti image found for the same ses ' '%s, while no additional run specification present' '. Please verify that the preproc_variant and ' 'space_label labels ' 'corresponding to the BIDS spec ' 'were correctly specified.' % img_dict['ses']) else: run_check_list.append(img_dict['ses']) elif '_run-' in img_dict['file_basename']: if img_dict['run'] in run_check_list: raise ValueError( 'More than one nifti image found for the same run ' '%s. Please verify that the preproc_variant and ' 'space_label labels ' 'corresponding to the BIDS spec ' 'were correctly specified.' % img_dict['run']) else: run_check_list.append(img_dict['run']) models_run_imgs.append(imgs) # Get events and extra confounds filters = [('task', task_label)] for img_filter in img_filters: if img_filter[0] in ['acq', 'rec', 'run']: filters.append(img_filter) # Get events files events = get_bids_files(dataset_path, modality_folder='func', file_tag='events', file_type='tsv', sub_label=sub_label, filters=filters) if events: if len(events) != len(imgs): raise ValueError('%d events.tsv files found for %d bold ' 'files. Same number of event files as ' 'the number of runs is expected' % (len(events), len(imgs))) events = [ pd.read_csv(event, sep='\t', index_col=None) for event in events ] models_events.append(events) else: raise ValueError('No events.tsv files found') # Get confounds. If not found it will be assumed there are none. # If there are confounds, they are assumed to be present for all runs. confounds = get_bids_files(derivatives_path, modality_folder='func', file_tag='confounds', file_type='tsv', sub_label=sub_label, filters=filters) if confounds: if len(confounds) != len(imgs): raise ValueError('%d confounds.tsv files found for %d bold ' 'files. Same number of confound files as ' 'the number of runs is expected' % (len(events), len(imgs))) confounds = [ pd.read_csv(c, sep='\t', index_col=None) for c in confounds ] models_confounds.append(confounds) return models, models_run_imgs, models_events, models_confounds
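A hedged usage sketch of first_level_models_from_bids on a hypothetical BIDS layout; the dataset path, task label, space label and derivatives folder below are placeholders.

models, models_run_imgs, models_events, models_confounds = \
    first_level_models_from_bids(
        '/data/my_bids_dataset', task_label='stopsignal',
        space_label='MNI152NLin2009cAsym',
        derivatives_folder='derivatives/fmriprep')
first_model = models[0]  # a FirstLevelModel for the first subject, not yet fitted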
print('RMSE %s: %.3f' % (estimator, score)) if output_dir is not None: with open(join(debug_folder, 'score'), 'w+') as f: f.write('score : %.4f' % score) return score output_dir = expanduser(join('~/output/dl_recommender/', datetime.datetime.now().strftime('%Y-%m-%d_%H' '-%M-%S'))) os.makedirs(output_dir) random_state = check_random_state(0) mem = Memory(cachedir=expanduser("~/cache"), verbose=10) X_csr = mem.cache(fetch_ml_10m)(expanduser('~/data/own/ml-10M100K'), remove_empty=True) permutation = random_state.permutation(X_csr.shape[0]) X_csr = X_csr[permutation] X, y = array_to_fm_format(X_csr) uniform_split = ShuffleSplit(n_iter=4, test_size=.25, random_state=random_state) fm_decoder = FMDecoder(n_samples=X_csr.shape[0], n_features=X_csr.shape[1]) base_estimator = BaseRecommender(fm_decoder)
class FirstLevelModel(BaseEstimator, TransformerMixin, CacheMixin): """ Implementation of the General Linear Model for single session fMRI data Parameters ---------- t_r : float This parameter indicates repetition times of the experimental runs. In seconds. It is necessary to correctly consider times in the design matrix. This parameter is also passed to nilearn.signal.clean. Please see the related documentation for details. slice_time_ref : float, optional (default 0.) This parameter indicates the time of the reference slice used in the slice timing preprocessing step of the experimental runs. It is expressed as a percentage of the t_r (time repetition), so it can have values between 0. and 1. hrf_model : {'spm', 'spm + derivative', 'spm + derivative + dispersion', 'glover', 'glover + derivative', 'glover + derivative + dispersion', 'fir', None} String that specifies the hemodynamic response function. Defaults to 'glover'. drift_model : string, optional This parameter specifies the desired drift model for the design matrices. It can be 'polynomial', 'cosine' or None. period_cut : float, optional This parameter specifies the cut period of the low-pass filter in seconds for the design matrices. drift_order : int, optional This parameter specifices the order of the drift model (in case it is polynomial) for the design matrices. fir_delays : array of shape(n_onsets) or list, optional In case of FIR design, yields the array of delays used in the FIR model, in seconds. min_onset : float, optional This parameter specifies the minimal onset relative to the design (in seconds). Events that start before (slice_time_ref * t_r + min_onset) are not considered. mask : Niimg-like, NiftiMasker object or False, optional Mask to be used on data. If an instance of masker is passed, then its mask will be used. If no mask is given, it will be computed automatically by a NiftiMasker with default parameters. If False is given then the data will not be masked. target_affine : 3x3 or 4x4 matrix, optional This parameter is passed to nilearn.image.resample_img. Please see the related documentation for details. target_shape : 3-tuple of integers, optional This parameter is passed to nilearn.image.resample_img. Please see the related documentation for details. smoothing_fwhm : float, optional If smoothing_fwhm is not None, it gives the size in millimeters of the spatial smoothing to apply to the signal. memory : string, optional Path to the directory used to cache the masking process and the glm fit. By default, no caching is done. Creates instance of joblib.Memory. memory_level : integer, optional Rough estimator of the amount of memory used by caching. Higher value means more memory for caching. standardize : boolean, optional If standardize is True, the time-series are centered and normed: their variance is put to 1 in the time dimension. signal_scaling : False, int or (int, int), optional, If not False, fMRI signals are scaled to the mean value of scaling_axis given, which can be 0, 1 or (0, 1). 0 refers to mean scaling each voxel with respect to time, 1 refers to mean scaling each time point with respect to all voxels and (0, 1) refers to scaling with respect to voxels and time, which is known as grand mean scaling. Incompatible with standardize (standardize=False is enforced when signal_scaling is not False). noise_model : {'ar1', 'ols'}, optional The temporal variance model. Defaults to 'ar1' verbose : integer, optional Indicate the level of verbosity. By default, nothing is printed. If 0 prints nothing. 
If 1 prints progress by computation of each run. If 2 prints timing details of masker and GLM. If 3 prints masker computation details. n_jobs : integer, optional The number of CPUs to use to do the computation. -1 means 'all CPUs', -2 'all CPUs but one', and so on. minimize_memory : boolean, optional Gets rid of some variables on the model fit results that are not necessary for contrast computation and would only be useful for further inspection of model details. This has an important impact on memory consumption. True by default. subject_label : string, optional This id will be used to identify a `FirstLevelModel` when passed to a `SecondLevelModel` object. Attributes ---------- labels : array of shape (n_voxels,), a map of values on voxels used to identify the corresponding model results : dict, with keys corresponding to the different labels values values are RegressionResults instances corresponding to the voxels """ def __init__(self, t_r=None, slice_time_ref=0., hrf_model='glover', drift_model='cosine', period_cut=128, drift_order=1, fir_delays=[0], min_onset=-24, mask=None, target_affine=None, target_shape=None, smoothing_fwhm=None, memory=Memory(None), memory_level=1, standardize=False, signal_scaling=0, noise_model='ar1', verbose=0, n_jobs=1, minimize_memory=True, subject_label=None): # design matrix parameters self.t_r = t_r self.slice_time_ref = slice_time_ref self.hrf_model = hrf_model self.drift_model = drift_model self.period_cut = period_cut self.drift_order = drift_order self.fir_delays = fir_delays self.min_onset = min_onset # glm parameters self.mask = mask self.target_affine = target_affine self.target_shape = target_shape self.smoothing_fwhm = smoothing_fwhm if isinstance(memory, _basestring): self.memory = Memory(memory) else: self.memory = memory self.memory_level = memory_level self.standardize = standardize if signal_scaling is False: self.signal_scaling = signal_scaling elif signal_scaling in [0, 1, (0, 1)]: self.scaling_axis = signal_scaling self.signal_scaling = True self.standardize = False else: raise ValueError('signal_scaling must be "False", "0", "1"' ' or "(0, 1)"') self.noise_model = noise_model self.verbose = verbose self.n_jobs = n_jobs self.minimize_memory = minimize_memory # attributes self.labels_ = None self.results_ = None self.subject_label = subject_label def fit(self, run_imgs, events=None, confounds=None, design_matrices=None): """ Fit the GLM For each run: 1. create design matrix X 2. do a masker job: fMRI_data -> Y 3. fit regression to (Y, X) Parameters ---------- run_imgs: Niimg-like object or list of Niimg-like objects, See http://nilearn.github.io/manipulating_images/input_output.html#inputing-data-file-names-or-image-objects Data on which the GLM will be fitted. If this is a list, the affine is considered the same for all. events: pandas Dataframe or string or list of pandas DataFrames or strings fMRI events used to build design matrices. One events object expected per run_img. Ignored in case designs is not None. If string, then a path to a csv file is expected. confounds: pandas Dataframe or string or list of pandas DataFrames or strings Each column in a DataFrame corresponds to a confound variable to be included in the regression model of the respective run_img. The number of rows must match the number of volumes in the respective run_img. Ignored in case designs is not None. If string, then a path to a csv file is expected. design_matrices: pandas DataFrame or list of pandas DataFrames, Design matrices that will be used to fit the GLM. 
If given it takes precedence over events and confounds. """ # Check arguments # Check imgs type if events is not None: _check_events_file_uses_tab_separators(events_files=events) if not isinstance(run_imgs, (list, tuple)): run_imgs = [run_imgs] if design_matrices is None: if events is None: raise ValueError('events or design matrices must be provided') if self.t_r is None: raise ValueError('t_r not given to FirstLevelModel object' ' to compute design from events') else: design_matrices = _check_run_tables(run_imgs, design_matrices, 'design_matrices') # Check that number of events and confound files match number of runs # Also check that events and confound files can be loaded as DataFrame if events is not None: events = _check_run_tables(run_imgs, events, 'events') if confounds is not None: confounds = _check_run_tables(run_imgs, confounds, 'confounds') # Learn the mask if self.mask is False: # We create a dummy mask to preserve functionality of api ref_img = check_niimg(run_imgs[0]) self.mask = Nifti1Image(np.ones(ref_img.shape[:3]), ref_img.affine) if not isinstance(self.mask, NiftiMasker): self.masker_ = NiftiMasker(mask_img=self.mask, smoothing_fwhm=self.smoothing_fwhm, target_affine=self.target_affine, standardize=self.standardize, mask_strategy='epi', t_r=self.t_r, memory=self.memory, verbose=max(0, self.verbose - 2), target_shape=self.target_shape, memory_level=self.memory_level) self.masker_.fit(run_imgs[0]) else: if self.mask.mask_img_ is None and self.masker_ is None: self.masker_ = clone(self.mask) for param_name in [ 'target_affine', 'target_shape', 'smoothing_fwhm', 't_r', 'memory', 'memory_level' ]: our_param = getattr(self, param_name) if our_param is None: continue if getattr(self.masker_, param_name) is not None: warn('Parameter %s of the masker' ' overriden' % param_name) setattr(self.masker_, param_name, our_param) self.masker_.fit(run_imgs[0]) else: self.masker_ = self.mask # For each run fit the model and keep only the regression results. self.labels_, self.results_, self.design_matrices_ = [], [], [] n_runs = len(run_imgs) t0 = time.time() for run_idx, run_img in enumerate(run_imgs): # Report progress if self.verbose > 0: percent = float(run_idx) / n_runs percent = round(percent * 100, 2) dt = time.time() - t0 # We use a max to avoid a division by zero if run_idx == 0: remaining = 'go take a coffee, a big one' else: remaining = (100. 
- percent) / max(0.01, percent) * dt remaining = '%i seconds remaining' % remaining sys.stderr.write("Computing run %d out of %d runs (%s)\n" % (run_idx + 1, n_runs, remaining)) # Build the experimental design for the glm run_img = check_niimg(run_img, ensure_ndim=4) if design_matrices is None: n_scans = run_img.get_data().shape[3] if confounds is not None: confounds_matrix = confounds[run_idx].values if confounds_matrix.shape[0] != n_scans: raise ValueError('Rows in confounds does not match' 'n_scans in run_img at index %d' % (run_idx, )) confounds_names = confounds[run_idx].columns.tolist() else: confounds_matrix = None confounds_names = None start_time = self.slice_time_ref * self.t_r end_time = (n_scans - 1 + self.slice_time_ref) * self.t_r frame_times = np.linspace(start_time, end_time, n_scans) design = make_first_level_design_matrix( frame_times, events[run_idx], self.hrf_model, self.drift_model, self.period_cut, self.drift_order, self.fir_delays, confounds_matrix, confounds_names, self.min_onset) else: design = design_matrices[run_idx] self.design_matrices_.append(design) # Mask and prepare data for GLM if self.verbose > 1: t_masking = time.time() sys.stderr.write('Starting masker computation \r') Y = self.masker_.transform(run_img) if self.verbose > 1: t_masking = time.time() - t_masking sys.stderr.write('Masker took %d seconds \n' % t_masking) if self.signal_scaling: Y, _ = mean_scaling(Y, self.scaling_axis) if self.memory: mem_glm = self.memory.cache(run_glm, ignore=['n_jobs']) else: mem_glm = run_glm # compute GLM if self.verbose > 1: t_glm = time.time() sys.stderr.write('Performing GLM computation\r') labels, results = mem_glm(Y, design.values, noise_model=self.noise_model, bins=100, n_jobs=self.n_jobs) if self.verbose > 1: t_glm = time.time() - t_glm sys.stderr.write('GLM took %d seconds \n' % t_glm) self.labels_.append(labels) # We save memory if inspecting model details is not necessary if self.minimize_memory: for key in results: results[key] = SimpleRegressionResults(results[key]) self.results_.append(results) del Y # Report progress if self.verbose > 0: sys.stderr.write( "\nComputation of %d runs done in %i seconds\n\n" % (n_runs, time.time() - t0)) return self def compute_contrast(self, contrast_def, stat_type=None, output_type='z_score'): """Generate different outputs corresponding to the contrasts provided e.g. z_map, t_map, effects and variance. In multi-session case, outputs the fixed effects map. Parameters ---------- contrast_def : str or array of shape (n_col) or list of (string or array of shape (n_col)) where ``n_col`` is the number of columns of the design matrix, (one array per run). If only one array is provided when there are several runs, it will be assumed that the same contrast is desired for all runs. The string can be a formula compatible with the linear constraint of the Patsy library. Basically one can use the name of the conditions as they appear in the design matrix of the fitted model combined with operators /\*+- and numbers. Please checks the patsy documentation for formula examples: http://patsy.readthedocs.io/en/latest/API-reference.html#patsy.DesignInfo.linear_constraint stat_type : {'t', 'F'}, optional type of the contrast output_type : str, optional Type of the output map. Can be 'z_score', 'stat', 'p_value', 'effect_size', 'effect_variance' or 'all' Returns ------- output : Nifti1Image or dict The desired output image(s). If ``output_type == 'all'``, then the output is a dictionary of images, keyed by the type of image. 
""" if self.labels_ is None or self.results_ is None: raise ValueError('The model has not been fit yet') if isinstance(contrast_def, (np.ndarray, str)): con_vals = [contrast_def] elif isinstance(contrast_def, (list, tuple)): con_vals = contrast_def else: raise ValueError('contrast_def must be an array or str or list of' ' (array or str)') # Translate formulas to vectors with patsy design_info = DesignInfo(self.design_matrices_[0].columns.tolist()) for cidx, con in enumerate(con_vals): if not isinstance(con, np.ndarray): con_vals[cidx] = design_info.linear_constraint(con).coefs n_runs = len(self.labels_) if len(con_vals) != n_runs: warn('One contrast given, assuming it for all %d runs' % n_runs) con_vals = con_vals * n_runs # 'all' is assumed to be the final entry; if adding more, place before 'all' valid_types = [ 'z_score', 'stat', 'p_value', 'effect_size', 'effect_variance', 'all' ] if output_type not in valid_types: raise ValueError( 'output_type must be one of {}'.format(valid_types)) contrast = _fixed_effect_contrast(self.labels_, self.results_, con_vals, stat_type) output_types = valid_types[:-1] if output_type == 'all' else [ output_type ] outputs = {} for output_type_ in output_types: estimate_ = getattr(contrast, output_type_)() # Prepare the returned images output = self.masker_.inverse_transform(estimate_) contrast_name = str(con_vals) output.header['descrip'] = ('%s of contrast %s' % (output_type_, contrast_name)) outputs[output_type_] = output return outputs if output_type == 'all' else output
def robust_single_linkage(X, cut, k=5, alpha=1.4142135623730951, gamma=5, metric='minkowski', p=2, algorithm='best', memory=Memory(cachedir=None, verbose=0)): """Perform robust single linkage clustering from a vector array or distance matrix. Parameters ---------- X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \ array of shape (n_samples, n_samples) A feature array, or array of distances between samples if ``metric='precomputed'``. cut : float The reachability distance value to cut the cluster heirarchy at to derive a flat cluster labelling. k : int, optional Reachability distances will be computed with regard to the `k` nearest neighbors. (default 5) alpha : float, optional Distance scaling for reachability distance computation. Reachability distance is computed as $max \{ core_k(a), core_k(b), 1/\alpha d(a,b) \}$. (default sqrt(2)) gamma : int, optional Ignore any clusters in the flat clustering with size less than gamma, and declare points in such clusters as noise points. (default 5) metric : string, or callable, optional The metric to use when calculating distance between instances in a feature array. If metric is a string or callable, it must be one of the options allowed by metrics.pairwise.pairwise_distances for its metric parameter. If metric is "precomputed", X is assumed to be a distance matrix and must be square. algorithm : string, optional Exactly which algorithm to use; hdbscan has variants specialised for different characteristics of the data. By default this is set to ``best`` which chooses the "best" algorithm given the nature of the data. You can force other options if you believe you know better. Options are: * ``generic`` * ``best`` * ``prims_kdtree`` * ``prims_balltree`` * ``boruvka_kdtree`` * ``boruvka_balltree`` memory : Instance of joblib.Memory or string (optional) Used to cache the output of the computation of the tree. By default, no caching is done. If a string is given, it is the path to the caching directory. Returns ------- labels : array [n_samples] Cluster labels for each point. Noisy samples are given the label -1. single_linkage_tree : array [n_samples - 1, 4] The single linkage tree produced during clustering in scipy hierarchical clustering format (see http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html). References ---------- K. Chaudhuri and S. Dasgupta. "Rates of convergence for the cluster tree." In Advances in Neural Information Processing Systems, 2010. 
""" if type(k) is not int or k < 1: raise ValueError('k must be an integer greater than zero!') if type(alpha) is not float or alpha < 1.0: raise ValueError('alpha must be a float greater than or equal to 1.0!') if type(gamma) is not int or gamma < 1: raise ValueError('gamma must be an integer greater than zero!') X = check_array(X, accept_sparse='csr') if isinstance(memory, six.string_types): memory = Memory(cachedir=memory, verbose=0) if algorithm != 'best': if algorithm == 'generic': single_linkage_tree = \ memory.cache(_rsl_generic)(X, k, alpha, metric, p) elif algorithm == 'prims_kdtree': single_linkage_tree = \ memory.cache(_rsl_prims_kdtree)(X, k, alpha, metric, p) elif algorithm == 'prims_balltree': single_linkage_tree = \ memory.cache(_rsl_prims_balltree)(X, k, alpha, metric, p) elif algorithm == 'boruvka_kdtree': single_linkage_tree = \ memory.cache(_rsl_boruvka_kdtree)(X, k, alpha, metric, p) elif algorithm == 'boruvka_balltree': single_linkage_tree = \ memory.cache(_rsl_boruvka_balltree)(X, k, alpha, metric, p) else: raise TypeError('Unknown algorithm type %s specified' % algorithm) else: if issparse(X) or metric not in FAST_METRICS: # We can't do much with sparse matrices ... single_linkage_tree = \ memory.cache(_rsl_generic)(X, k, alpha, metric, p) elif metric in KDTree.valid_metrics: # Need heuristic to decide when to go to boruvka; still debugging for now if X.shape[1] > 128: single_linkage_tree = \ memory.cache(_rsl_prims_kdtree)(X, k, alpha, metric, p) else: single_linkage_tree = \ memory.cache(_rsl_boruvka_kdtree)(X, k, alpha, metric, p) else: # Metric is a valid BallTree metric # Need heuristic to decide when to go to boruvka; still debugging for now if X.shape[1] > 128: single_linkage_tree = \ memory.cache(_rsl_prims_kdtree)(X, k, alpha, metric, p) else: single_linkage_tree = \ memory.cache(_rsl_boruvka_balltree)(X, k, alpha, metric, p) labels = single_linkage_tree.get_clusters(cut, gamma) return labels, single_linkage_tree
for x in X:  # smooth data
    x[:] = ndimage.gaussian_filter(x.reshape(size, size), sigma=1.0).ravel()
X -= X.mean(axis=0)
X /= X.std(axis=0)

y = np.dot(X, coef.ravel())
noise = np.random.randn(y.shape[0])
noise_coef = (linalg.norm(y, 2) / np.exp(snr / 20.)) / linalg.norm(noise, 2)
y += noise_coef * noise  # add noise

# #############################################################################
# Compute the coefs of a Bayesian Ridge with GridSearch
cv = KFold(2)  # cross-validation generator for model selection
ridge = BayesianRidge()
cachedir = tempfile.mkdtemp()
mem = Memory(cachedir=cachedir, verbose=1)

# Ward agglomeration followed by BayesianRidge
connectivity = grid_to_graph(n_x=size, n_y=size)
ward = FeatureAgglomeration(n_clusters=10, connectivity=connectivity,
                            memory=mem)
clf = Pipeline([('ward', ward), ('ridge', ridge)])
# Select the optimal number of parcels with grid search
clf = GridSearchCV(clf, {'ward__n_clusters': [10, 20, 30]}, n_jobs=1, cv=cv)
clf.fit(X, y)  # set the best parameters
coef_ = clf.best_estimator_.steps[-1][1].coef_
coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_)
coef_agglomeration_ = coef_.reshape(size, size)

# Anova univariate feature selection followed by BayesianRidge
f_regression = mem.cache(feature_selection.f_regression)  # caching function
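# The snippet above stops right after caching f_regression. Purely as a
# sketch of how the cached scorer is typically consumed, one plausible
# continuation plugs it into SelectKBest and tunes the number of selected
# pixels with the same grid-search machinery; the parameter grid below is
# illustrative.
anova = feature_selection.SelectKBest(f_regression)
clf = Pipeline([('anova', anova), ('ridge', ridge)])
# Select the optimal number of pixels with grid search
clf = GridSearchCV(clf, {'anova__k': [10, 20, 30]}, n_jobs=1, cv=cv)
clf.fit(X, y)  # set the best parameters
coef_ = clf.best_estimator_.steps[-1][1].coef_
coef_ = clf.best_estimator_.steps[0][1].inverse_transform(
    coef_.reshape(1, -1))
coef_selection_ = coef_.reshape(size, size)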
class FirstLevelModel(BaseEstimator, TransformerMixin, CacheMixin):
    """ Implementation of the General Linear Model for single session
    fMRI data.

    Parameters
    ----------
    t_r : float
        This parameter indicates repetition times of the experimental runs.
        In seconds. It is necessary to correctly consider times in the design
        matrix. This parameter is also passed to nilearn.signal.clean.
        Please see the related documentation for details.

    slice_time_ref : float, optional (default 0.)
        This parameter indicates the time of the reference slice used in the
        slice timing preprocessing step of the experimental runs. It is
        expressed as a percentage of the t_r (time repetition), so it can
        have values between 0. and 1.

    hrf_model : string, optional
        This parameter specifies the hemodynamic response function (HRF) for
        the design matrices. It can be 'canonical', 'canonical with
        derivative' or 'fir'.

    drift_model : string, optional
        This parameter specifies the desired drift model for the design
        matrices. It can be 'polynomial', 'cosine' or 'blank'.

    period_cut : float, optional
        This parameter specifies the cut period of the low-pass filter in
        seconds for the design matrices.

    drift_order : int, optional
        This parameter specifies the order of the drift model (in case it is
        polynomial) for the design matrices.

    fir_delays : array of shape(n_onsets) or list, optional
        In case of FIR design, yields the array of delays used in the FIR
        model, in seconds.

    min_onset : float, optional
        This parameter specifies the minimal onset relative to the design
        (in seconds). Events that start before
        (slice_time_ref * t_r + min_onset) are not considered.

    mask : Niimg-like, NiftiMasker or MultiNiftiMasker object, optional
        Mask to be used on data. If an instance of masker is passed, then its
        mask will be used. If no mask is given, it will be computed
        automatically by a MultiNiftiMasker with default parameters.

    target_affine : 3x3 or 4x4 matrix, optional
        This parameter is passed to nilearn.image.resample_img. Please see
        the related documentation for details.

    target_shape : 3-tuple of integers, optional
        This parameter is passed to nilearn.image.resample_img. Please see
        the related documentation for details.

    smoothing_fwhm : float, optional
        If smoothing_fwhm is not None, it gives the size in millimeters of
        the spatial smoothing to apply to the signal.

    memory : string, optional
        Path to the directory used to cache the masking process and the glm
        fit. By default, no caching is done. Creates instance of
        joblib.Memory.

    memory_level : integer, optional
        Rough estimator of the amount of memory used by caching. Higher
        value means more memory for caching.

    standardize : boolean, optional
        If standardize is True, the time-series are centered and normed:
        their variance is put to 1 in the time dimension.

    signal_scaling : False, int or (int, int), optional
        If not False, fMRI signals are scaled to the mean value of
        scaling_axis given, which can be 0, 1 or (0, 1). 0 refers to mean
        scaling each voxel with respect to time, 1 refers to mean scaling
        each time point with respect to all voxels and (0, 1) refers to
        scaling with respect to voxels and time, which is known as grand
        mean scaling. Incompatible with standardize (standardize=False is
        enforced when signal_scaling is not False).

    noise_model : {'ar1', 'ols'}, optional
        The temporal variance model. Defaults to 'ar1'.

    verbose : integer, optional
        Indicate the level of verbosity. By default, nothing is printed.

    n_jobs : integer, optional
        The number of CPUs to use to do the computation.
        -1 means 'all CPUs', -2 means 'all CPUs but one', and so on.

    minimize_memory : boolean, optional
        Gets rid of some variables on the model fit results that are not
        necessary for contrast computation and would only be useful for
        further inspection of model details. This has an important impact on
        memory consumption. True by default.

    Attributes
    ----------
    labels_ : array of shape (n_voxels,)
        A map of values on voxels used to identify the corresponding model.

    results_ : dict
        With keys corresponding to the different labels values; values are
        RegressionResults instances corresponding to the voxels.
    """

    def __init__(self, t_r=None, slice_time_ref=0., hrf_model='glover',
                 drift_model='cosine', period_cut=128, drift_order=1,
                 fir_delays=[0], min_onset=-24, mask=None, target_affine=None,
                 target_shape=None, smoothing_fwhm=None,
                 memory=Memory(None), memory_level=1, standardize=False,
                 signal_scaling=0, noise_model='ar1', verbose=1, n_jobs=1,
                 minimize_memory=True):
        # design matrix parameters
        self.t_r = t_r
        self.slice_time_ref = slice_time_ref
        self.hrf_model = hrf_model
        self.drift_model = drift_model
        self.period_cut = period_cut
        self.drift_order = drift_order
        self.fir_delays = fir_delays
        self.min_onset = min_onset
        # glm parameters
        self.mask = mask
        self.target_affine = target_affine
        self.target_shape = target_shape
        self.smoothing_fwhm = smoothing_fwhm
        if isinstance(memory, _basestring):
            self.memory = Memory(memory)
        else:
            self.memory = memory
        self.memory_level = memory_level
        self.standardize = standardize
        if signal_scaling in [0, 1, (0, 1)]:
            self.scaling_axis = signal_scaling
            self.signal_scaling = True
            self.standardize = False
        elif signal_scaling is False:
            self.signal_scaling = signal_scaling
        else:
            raise ValueError('signal_scaling must be "False", "0", "1"'
                             ' or "(0, 1)"')
        self.noise_model = noise_model
        self.verbose = verbose
        self.n_jobs = n_jobs
        self.minimize_memory = minimize_memory
        # attributes
        self.labels_ = None
        self.results_ = None

    def fit(self, run_imgs, paradigms=None, confounds=None,
            design_matrices=None):
        """ Fit the GLM

        For each run:
        1. create design matrix X
        2. do a masker job: fMRI_data -> Y
        3. fit regression to (Y, X)

        Parameters
        ----------
        run_imgs : Niimg-like object or list of Niimg-like objects,
            See http://nilearn.github.io/building_blocks/manipulating_mr_images.html#niimg.
            Data on which the GLM will be fitted. If this is a list, the
            affine is considered the same for all.

        paradigms : pandas DataFrame or string, or list of pandas DataFrames
            or strings,
            fMRI paradigms used to build design matrices. One paradigm
            expected per run_img. Ignored in case designs is not None.

        confounds : pandas DataFrame or string, or list of pandas DataFrames
            or strings,
            Each column in a DataFrame corresponds to a confound variable to
            be included in the regression model of the respective run_img.
            The number of rows must match the number of volumes in the
            respective run_img. Ignored in case designs is not None.

        design_matrices : pandas DataFrame or list of pandas DataFrames,
            Design matrices that will be used to fit the GLM.
""" # Check arguments # Check imgs type if not isinstance(run_imgs, (list, tuple)): run_imgs = [run_imgs] for rimg in run_imgs: if not isinstance(rimg, (_basestring, Nifti1Image)): raise ValueError('run_imgs must be Niimg-like object or list' ' of Niimg-like objects') # check all information necessary to build design matrices is available if design_matrices is None: if paradigms is None: raise ValueError('paradigms or design matrices must be provided') if self.t_r is None: raise ValueError('t_r not given to FirstLevelModel object' ' to compute design from paradigm') else: design_matrices = _check_run_tables(run_imgs, design_matrices, 'design_matrices') # check the number of paradigm and confound files match number of runs # Also check paradigm and confound files can be loaded as DataFrame if paradigms is not None: paradigms = _check_run_tables(run_imgs, paradigms, 'paradigms') if confounds is not None: confounds = _check_run_tables(run_imgs, confounds, 'confounds') # Learn the mask if not isinstance(self.mask, NiftiMasker): self.masker_ = NiftiMasker( mask_img=self.mask, smoothing_fwhm=self.smoothing_fwhm, target_affine=self.target_affine, standardize=self.standardize, mask_strategy='epi', t_r=self.t_r, memory=self.memory, verbose=max(0, self.verbose - 1), target_shape=self.target_shape, memory_level=self.memory_level) else: self.masker_ = clone(self.mask) for param_name in ['target_affine', 'target_shape', 'smoothing_fwhm', 'low_pass', 'high_pass', 't_r', 'memory', 'memory_level']: our_param = getattr(self, param_name) if our_param is None: continue if getattr(self.masker_, param_name) is not None: warn('Parameter %s of the masker overriden' % param_name) setattr(self.masker_, param_name, our_param) self.masker_.fit(run_imgs[0]) # For each run fit the model and keep only the regression results. self.labels_, self.results_, self.design_matrices_ = [], [], [] n_runs = len(run_imgs) t0 = time.time() for run_idx, run_img in enumerate(run_imgs): # Report progress if self.verbose > 0: percent = float(run_idx) / n_runs percent = round(percent * 100, 2) dt = time.time() - t0 # We use a max to avoid a division by zero if run_idx == 0: remaining = 'go take a coffee, a big one' else: remaining = (100. 
                    remaining = '%i seconds remaining' % remaining
                sys.stderr.write(" " * 100 + "\r")
                sys.stderr.write(
                    "Computing run %d out of %d runs (%s)\r"
                    % (run_idx, n_runs, remaining))

            # Build the experimental design for the glm
            run_img = check_niimg(run_img, ensure_ndim=4)
            if design_matrices is None:
                n_scans = run_img.get_data().shape[3]
                if confounds is not None:
                    confounds_matrix = confounds[run_idx].values
                    if confounds_matrix.shape[0] != n_scans:
                        raise ValueError('Rows in confounds does not match'
                                         ' n_scans in run_img at index %d'
                                         % (run_idx,))
                    confounds_names = confounds[run_idx].columns
                else:
                    confounds_matrix = None
                    confounds_names = None
                start_time = self.slice_time_ref * self.t_r
                end_time = (n_scans - 1 + self.slice_time_ref) * self.t_r
                frame_times = np.linspace(start_time, end_time, n_scans)
                design = make_design_matrix(frame_times, paradigms[run_idx],
                                            self.hrf_model, self.drift_model,
                                            self.period_cut, self.drift_order,
                                            self.fir_delays, confounds_matrix,
                                            confounds_names, self.min_onset)
            else:
                design = design_matrices[run_idx]
            self.design_matrices_.append(design)

            # Compute GLM
            Y = self.masker_.transform(run_img)
            if self.signal_scaling:
                Y, _ = mean_scaling(Y, self.scaling_axis)
            if self.memory is not None:
                mem_glm = self.memory.cache(run_glm)
            else:
                mem_glm = run_glm
            labels, results = mem_glm(Y, design,
                                      noise_model=self.noise_model,
                                      bins=100, n_jobs=self.n_jobs)
            self.labels_.append(labels)
            # We save memory if inspecting model details is not necessary
            if self.minimize_memory:
                for key in results:
                    results[key] = SimpleRegressionResults(results[key])
            self.results_.append(results)
            del Y

        # Report progress
        if self.verbose > 0:
            sys.stderr.write("\nComputation of %d runs done in %i seconds\n"
                             % (n_runs, time.time() - t0))

        return self

    def compute_contrast(self, contrast_def, contrast_name=None,
                         stat_type=None, output_type='z_score'):
        """Generate different outputs corresponding to the contrasts
        provided, e.g. z_map, t_map, effects and variance.
        In the multi-session case, outputs the fixed effects map.

        Parameters
        ----------
        contrast_def : array or list of arrays of shape (n_col) or
            (n_run, n_col), where ``n_col`` is the number of columns of the
            design matrix (one array per run). If only one array is provided
            when there are several runs, it will be assumed that the same
            contrast is desired for all runs.

        contrast_name : str, optional
            Name of the contrast.

        stat_type : {'t', 'F'}, optional
            Type of the contrast.

        output_type : str, optional
            Type of the output map.
            Can be 'z_score', 'stat', 'p_value', 'effect_size' or
            'effect_variance'.

        Returns
        -------
        output_image : Nifti1Image
            The desired output image.
        """
        if self.labels_ is None or self.results_ is None:
            raise ValueError('The model has not been fit yet')

        if isinstance(contrast_def, np.ndarray):
            con_vals = [contrast_def]
        elif isinstance(contrast_def, (list, tuple)):
            con_vals = contrast_def
            for cidx, con in enumerate(contrast_def):
                if not isinstance(con, np.ndarray):
                    raise ValueError('contrast_def at index %i is not an'
                                     ' array' % cidx)
        else:
            raise ValueError('contrast_def must be an array or list of'
                             ' arrays')
        n_runs = len(self.labels_)
        if len(con_vals) != n_runs:
            warn('One contrast given, assuming it for all %d runs' % n_runs)
            con_vals = con_vals * n_runs

        if isinstance(output_type, _basestring):
            if output_type not in ['z_score', 'stat', 'p_value',
                                   'effect_size', 'effect_variance']:
                raise ValueError('output_type must be one of "z_score",'
                                 ' "stat", "p_value", "effect_size" or'
                                 ' "effect_variance"')
        else:
            raise ValueError('output_type must be one of "z_score", "stat",'
                             ' "p_value", "effect_size" or'
                             ' "effect_variance"')

        if self.memory is not None:
            arg_ignore = ['labels', 'results']
            mem_contrast = self.memory.cache(_fixed_effect_contrast,
                                             ignore=arg_ignore)
        else:
            mem_contrast = _fixed_effect_contrast
        contrast = mem_contrast(self.labels_, self.results_, con_vals,
                                stat_type)

        estimate_ = getattr(contrast, output_type)()

        # Prepare the returned images
        output = self.masker_.inverse_transform(estimate_)
        if contrast_name is None:
            contrast_name = str(con_vals)
        output.get_header()['descrip'] = (
            '%s of contrast %s' % (output_type, contrast_name))
        return output
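# A minimal usage sketch of the estimator above, showing how the memory
# parameters hook into caching; the BOLD file name, the paradigm columns
# and values, and the '/tmp/nistats_cache' directory are hypothetical.
import numpy as np
import pandas as pd

run_img = 'sub-01_task-listening_bold.nii.gz'  # hypothetical 4D image
paradigm = pd.DataFrame({'onset': [0., 10., 20.],
                         'duration': [5., 5., 5.],
                         'trial_type': ['listen', 'rest', 'listen']})

# Masking and GLM fits are cached on disk, so refitting with the same
# inputs reads results back instead of recomputing them.
model = FirstLevelModel(t_r=2., noise_model='ar1', smoothing_fwhm=5.,
                        memory='/tmp/nistats_cache', memory_level=1)
model.fit(run_img, paradigms=[paradigm])

# Contrast over the design matrix columns; its length must equal n_col.
n_col = model.design_matrices_[0].shape[1]
contrast = np.zeros(n_col)
contrast[0] = 1.
z_map = model.compute_contrast(contrast, output_type='z_score')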
# Author: Nelle Varoquaux <*****@*****.**>
# License: BSD

import numpy as np
import scipy as sp

from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from sklearn.cluster.mean_shift_ import MeanShift, estimate_bandwidth
from sklearn.externals.joblib import Memory

from skimage.data import camera

mem = Memory(cachedir='.')


def calculate_cluster(camera, camera_mat, quantile):
    bandwidth = estimate_bandwidth(camera_mat, quantile=quantile,
                                   n_samples=500)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(camera_mat)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_

    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)

    camera_clustered = camera.copy()
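# The function above is truncated before it does anything with
# camera_clustered. Purely as an assumption about how such a script is
# typically driven, a caller might build a (row, col, intensity) feature
# matrix from the image and cache the whole clustering step with the
# Memory instance created above; the feature construction and the
# quantile value are illustrative.
camera_img = camera()
rows, cols = np.indices(camera_img.shape)
camera_mat = np.column_stack([rows.ravel(), cols.ravel(),
                              camera_img.ravel()]).astype(float)

cached_calculate_cluster = mem.cache(calculate_cluster)
# cached_calculate_cluster(camera_img, camera_mat, quantile=0.1)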