class BaseMIEstimator(object):

    """Base class for mutual-information estimators.

    Parameters
    ----------
    mi_type : {'cc', 'cd', 'ccd', 'ccc'}
        Mutual information type :

            * 'cc' : MI between two continuous variables
            * 'cd' : MI between a continuous and a discrete variable
            * 'ccd' : MI between two continuous variables conditioned by a
              third discrete one
            * 'ccc' : MI between two continuous variables conditioned by a
              third continuous one
    """

    def __init__(self, mi_type='cc', verbose=None):
        """Init."""
        set_log_level(verbose)
        desc = CONFIG['MI_REPR'][mi_type]
        settings = {'description': desc}
        self.settings = Attributes(attrs=settings, section_name='Settings')
        self._kwargs = dict()
        assert hasattr(self, 'name')
        logger.info(f"{self.name} ({mi_type})")

    def __repr__(self):
        """Overall representation."""
        return '*** ' + self.name + ' ***\n' + self.settings.__repr__()

    def _repr_html_(self):
        """IPython representation."""
        title = f"<h3><br>{self.name}</br></h3>"
        return title + self.settings._repr_html_()

    def estimate(self, x, y, z=None, categories=None):
        """Estimate the (possibly conditional) mutual-information."""
        raise NotImplementedError()

    def get_function(self):
        """Get the function to execute.

        The returned function should have the following signature :

            * fcn(x, y, z=None, categories=None)

        and should return an array of shape (n_categories, n_var).
        """
        raise NotImplementedError()
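
# Minimal sketch (hypothetical, not part of frites) of a concrete estimator
# built on BaseMIEstimator. The `name` attribute must be set before calling
# the parent __init__ (the assert above requires it). The correlation-based
# computation and the assumed input shapes are placeholders, not the actual
# frites gaussian-copula implementation.
class CorrMIEstimator(BaseMIEstimator):
    """Hypothetical correlation-based estimator (illustration only)."""

    def __init__(self, mi_type='cc', verbose=None):
        self.name = 'Correlation-based estimator (sketch)'
        super().__init__(mi_type=mi_type, verbose=verbose)

    def get_function(self):
        def estimator(x, y, z=None, categories=None):
            # assumed shapes : x=(n_var, 1, n_samples), y=(n_samples,)
            x_2d = x[:, 0, :]
            # gaussian approximation of MI : -0.5 * log(1 - r ** 2),
            # computed over a single category
            r = np.array([np.corrcoef(k, y)[0, 1] for k in x_2d])
            mi = -.5 * np.log(1. - np.clip(r ** 2, 0., .999))
            return mi.reshape(1, -1)  # (n_categories=1, n_var)
        return estimator

    def estimate(self, x, y, z=None, categories=None):
        return self.get_function()(x, y, z=z, categories=categories)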
class WfBase(object):

    """Base class for workflows."""

    def __init__(self, attrs=None):  # noqa
        self.attrs = Attributes(attrs=attrs)

    def __getitem__(self, key):
        return self.attrs[key]

    def __setitem__(self, key, value):
        self.attrs[key] = value

    def __repr__(self):
        return self.attrs.__repr__()

    def _repr_html_(self):
        return self.attrs._repr_html_()

    def fit(self):  # noqa
        raise NotImplementedError()

    def clean(self):  # noqa
        raise NotImplementedError()
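
# Minimal sketch (hypothetical, not part of frites) of the contract imposed
# by WfBase: concrete workflows store metadata through `attrs` (exposed via
# __getitem__ / __setitem__) and implement `fit` and `clean`.
class DummyWorkflow(WfBase):
    """Hypothetical workflow illustrating the WfBase interface."""

    def fit(self, data):
        # compute a trivial "result" and flag the fit in the attributes
        self._result = sum(data) / len(data)
        self['fitted'] = True  # WfBase.__setitem__ writes into self.attrs
        return self._result

    def clean(self):
        # drop intermediate results
        self._result = None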
class DatasetEphy(object):

    """Multi-subjects electrophysiological data container.

    This class is a container used to represent the neurophysiological data
    coming from multiple subjects. In addition to passing the data, this
    container also needs the anatomical information as well as the
    task-related variables (continuous or discrete). Then, the created
    object can be used to compute the mutual information (MI) between the
    data and the task variables at the group level.

    Parameters
    ----------
    x : list
        List of length (n_subjects,) where each element of the list should be
        the neural data of a single subject. Supported types for each element
        of the list are the same as in :class:`SubjectEphy`
    y, z : list, string | None
        List of length (n_subjects,) of continuous or discrete task-related
        variables per subject. Supported types for each element of the list
        are the same as in :class:`SubjectEphy`
    roi : list | None
        List of length (n_subjects,) where each element represents the
        anatomical information of each channel. Supported types for each
        element of the list are the same as in :class:`SubjectEphy`
    agg_ch : bool | True
        If multiple channels belong to the same ROI, specify whether the
        data should be aggregated across channels (True) or if the
        information per channel has to be taken into account (False -
        conditional mutual information)
    times : array_like | None
        The time vector to use. Supported types are listed in
        :class:`SubjectEphy`
    multivariate : bool | False
        If a 4d input is provided, this parameter specifies whether the
        extra axis should be considered as multivariate (True) or univariate
        (False)
    nb_min_suj : int | None
        The minimum number of subjects per roi. Roi with n_suj < nb_min_suj
        are going to be skipped. Use None to force selecting all of the
        subjects
    attrs : dict | {}
        Dictionary of additional attributes about the data
    """

    def __init__(self, x, y=None, z=None, roi=None, agg_ch=True, times=None,
                 multivariate=False, nb_min_suj=False, attrs=None,
                 verbose=None):
        """Init."""
        set_log_level(verbose)
        self.attrs = Attributes(attrs=attrs)
        assert isinstance(x, (list, tuple))
        self._agg_ch = agg_ch
        self._multivariate = multivariate

        logger.info('Definition of an electrophysiological dataset')
        logger.info(f' Dataset composed of {len(x)} subjects / sessions')

        # ========================= Multi-conditions =========================

        # remapping group y and z
        if isinstance(y, (list, tuple)):
            y = multi_to_uni_conditions(y, var_name='y', verbose=verbose)
        if isinstance(z, (list, tuple)):
            z = multi_to_uni_conditions(z, var_name='z', verbose=verbose)

        # ==================== Multi-subjects conversion =====================

        # force converting the data (latest task-related variables)
        n_subjects = len(x)
        y = [y] * n_subjects if not isinstance(y, list) else y
        z = [z] * n_subjects if not isinstance(z, list) else z
        roi = [roi] * n_subjects if not isinstance(roi, list) else roi
        for k in range(n_subjects):
            x[k] = SubjectEphy(
                x[k], y=y[k], z=z[k], roi=roi[k], agg_ch=True, times=times,
                multivariate=multivariate, verbose=verbose)
        self._x = x

        # minimum number of subjects / roi
        nb_min_suj = -np.inf if not isinstance(nb_min_suj, int) else nb_min_suj
        self._nb_min_suj = nb_min_suj
        logger.info(f" At least {self._nb_min_suj} subjects / roi required")

        # merge attributes
        self.attrs.merge([k.attrs for k in self._x])
        self._y_dtype = self.attrs['y_dtype']
        self._z_dtype = self.attrs['z_dtype']
        self._mi_type = self.attrs['mi_type']
        mi_repr = self.attrs['mi_repr']
        logger.info(f" Supported MI definition {mi_repr} ({self._mi_type})")

        # ===================== Additional dimensions ========================

        # Subject dimension
        for n_k, k in enumerate(range(len(self._x))):
            self._x[k].name = f'subject_{n_k}'
            self._x[k] = self._x[k].assign_coords(
                subject=('trials', [n_k] * self._x[k].shape[0]))

        # channel aggregation
        if not agg_ch:
            # split into sections of unique integers
            n_trials_s = [k.shape[1] for k in self._x]
            agg_ch_num = np.arange(np.sum(n_trials_s))
            agg_split = np.split(agg_ch_num, np.cumsum(n_trials_s)[0:-1])
            # add additional dimension
            for k in range(len(self._x)):
                self._x[k] = self._x[k].assign_coords(
                    agg_ch=('roi', agg_split[k]))

        # final mi dimension
        dims = list(self._x[0].dims)
        self._mi_dims = [k for k in dims if k not in ['trials', 'mv']]

        # ============================ Attributes ============================

        # update internal parameters
        self._update_internals()
        # update internal attributes
        self.attrs.update({
            'nb_min_suj': nb_min_suj,
            'n_subjects': len(self._x),
            'agg_ch': agg_ch,
            'multivariate': multivariate,
            'dtype': "DatasetEphy",
            '__version__': frites.__version__
        })

    ###########################################################################
    ###########################################################################
    #                               INTERNALS
    ###########################################################################
    ###########################################################################

    def __repr__(self):
        """String representation."""
        dt = []
        for k in range(len(self._x)):
            da = xr.DataArray(dims=self._x[k].dims, coords=self._x[k].coords,
                              name=self._x[k].name)
            dt.append(da)
        return repr(self.attrs.wrap_xr(xr.merge(dt, compat='override')))

    def _update_internals(self):
        """Update internal variables."""
        # build a unique list of unsorted roi names
        merged_roi = np.r_[tuple([k['roi'].data for k in self._x])]
        roi_names = nonsorted_unique(merged_roi)
        # dataframe made of unique roi per subject and subject id
        suj_r, roi_r = [], []
        for k in range(len(self._x)):
            _roi = np.unique(self._x[k]['roi'].data).tolist()
            roi_r += _roi
            suj_r += [k] * len(_roi)
        df_rs = pd.DataFrame({'roi': roi_r, '#subjects': suj_r})
        # get number and id of subjects per roi
        gp_roi = df_rs.groupby('roi')
        groups = gp_roi.indices
        u_suj = [list(df_rs.loc[groups[k], '#subjects']) for k in roi_names]
        df_rs = gp_roi.count().reindex(roi_names)
        df_rs['subjects'] = u_suj
        df_rs['keep'] = df_rs['#subjects'] >= self._nb_min_suj
        self._df_rs = df_rs

        self._times = self._x[0]['times'].data
        self._n_times = len(self._times)
        self._n_subjects = len(self._x)
        self._roi_names = list(df_rs.index[df_rs['keep']])
        self._n_roi = len(self._roi_names)

    ###########################################################################
    ###########################################################################
    #                                METHODS
    ###########################################################################
    ###########################################################################

    def get_roi_data(self, roi, groupby='subjects', mi_type='cc',
                     copnorm=True, gcrn_per_suj=True):
        """Get the data of a single brain region.

        Parameters
        ----------
        roi : string
            ROI name to get
        groupby : {'subjects'}
            Specify whether the data across subjects have to be concatenated
        mi_type : {'cc', 'cd', 'ccd'}
            The type of mutual-information that is then going to be used.
            This is going to have an influence on how the data are organized
            and how the copnorm is going to be applied
        copnorm : bool | True
            Apply the gaussian copula rank normalization
        gcrn_per_suj : bool | True
            Specify whether the gaussian copula rank normalization has to be
            applied per subject (True - RFX) or across subjects (False - FFX)

        Returns
        -------
        da : xr.DataArray
            The data of the single brain region
        """
        # list of subjects present in the desired roi
        suj_list = self._df_rs.loc[roi, 'subjects']

        # group data across subjects
        if groupby == 'subjects':
            x_r_ms = []
            for s in suj_list:
                # roi (possibly multi-sites) selection
                x_roi = self._x[s].sel(roi=self._x[s]['roi'].data == roi)
                # stack roi and trials
                x_roi = x_roi.stack(rtr=('roi', 'trials'))
                x_r_ms.append(x_roi)
            x_r_ms = xr.concat(x_r_ms, 'rtr')

        # 4d or multivariate
        if self._multivariate:
            x_r_ms = x_r_ms.transpose('times', 'mv', 'rtr')
        else:
            x_r_ms = x_r_ms.expand_dims('mv', axis=-2)
        x_coords = list(x_r_ms.coords)

        # channels aggregation
        if not self._agg_ch and ('y' in x_coords):
            # shortcuts
            ch_id = x_r_ms['agg_ch'].data
            y = x_r_ms['y'].data
            # transformation depends on mi_type
            if mi_type == 'cd':
                # I(C; D) where D=[y, ch_id]
                ysub = np.c_[y, ch_id]
                x_r_ms['y'].data = multi_to_uni_conditions([ysub], False)[0]
            elif mi_type == 'ccd' and ('z' not in x_coords):
                # I(C; C; D) where D=ch_id. In that case z=D
                x_r_ms = x_r_ms.assign_coords(z=('rtr', ch_id))
            elif mi_type == 'ccd' and ('z' in x_coords):
                # I(C; C; D) where D=[z, ch_id]
                zsub = np.c_[x_r_ms['z'].data, ch_id]
                x_r_ms['z'].data = multi_to_uni_conditions([zsub], False)[0]
            else:
                raise ValueError("Can't avoid aggregating channels")

        # gaussian copula rank normalization
        if copnorm:
            if gcrn_per_suj:  # gcrn per subject
                logger.debug("copnorm applied per subjects")
                suj = x_r_ms['subject'].data
                x_r_ms.data = copnorm_cat_nd(x_r_ms.data, suj, axis=-1)
                if (mi_type in ['cc', 'ccd']) and ('y' in x_coords):
                    x_r_ms['y'].data = copnorm_cat_nd(
                        x_r_ms['y'].data, suj, axis=0)
            else:  # gcrn across subjects
                logger.debug("copnorm applied across subjects")
                x_r_ms.data = copnorm_nd(x_r_ms.data, axis=-1)
                if (mi_type in ['cc', 'ccd']) and ('y' in x_coords):
                    x_r_ms['y'].data = copnorm_nd(x_r_ms['y'].data, axis=0)

        return x_r_ms

    def sel(self, **kwargs):
        """Coordinate-based data slicing.

        Slice the entire dataset based on coordinate values.

        Parameters
        ----------
        kwargs : {} | None
            Additional inputs are sent to the method `xr.DataArray.sel` of
            the data of each subject.

        Returns
        -------
        inst : instance of DatasetEphy
            The sliced object.

        Examples
        --------
        >>> # define the dataset
        >>> ds = DatasetEphy(...)
        >>> # temporal slicing of the data between (-100ms, 1500ms)
        >>> ds.sel(times=slice(-0.1, 1.5))
        """
        self._x = [k.sel(**kwargs) for k in self._x]
        self._update_internals()
        return self

    def isel(self, **kwargs):
        """Index-based data slicing.

        Slice the entire dataset based on indexes.

        Parameters
        ----------
        kwargs : {} | None
            Additional inputs are sent to the method `xr.DataArray.isel` of
            the data of each subject.

        Returns
        -------
        inst : instance of DatasetEphy
            The sliced object.

        Examples
        --------
        >>> # define the dataset
        >>> ds = DatasetEphy(...)
        >>> # temporal slicing of the data between time points (100, 2500)
        >>> ds.isel(times=slice(100, 2500))
        """
        self._x = [k.isel(**kwargs) for k in self._x]
        self._update_internals()
        return self

    def savgol_filter(self, h_freq, edges=None, verbose=None):
        """Filter the data using the Savitzky-Golay polynomial method.

        This method is an adaptation of the mne-python one. Note that this
        smoothing operation is performed inplace to avoid a data copy.

        Parameters
        ----------
        h_freq : float
            Approximate high cut-off frequency in Hz. Note that this is not
            an exact cutoff, since Savitzky-Golay filtering is done using
            polynomial fits instead of FIR/IIR filtering. This parameter is
            thus used to determine the length of the window over which a
            5th-order polynomial smoothing is used.
        edges : int, float | None
            Edge compensation. Use either an integer to drop a specific
            number of time points (e.g. edges=100 removes 100 time points at
            the beginning and at the end) or a float to drop a period (e.g.
            edges=0.2 drops 200ms at the beginning and at the end)

        Returns
        -------
        inst : instance of DatasetEphy
            The object with the filtering applied.

        Notes
        -----
        For Savitzky-Golay low-pass approximation, see:
        https://gist.github.com/larsoner/bbac101d50176611136b
        """
        set_log_level(verbose)
        # perform smoothing
        for n_s in range(len(self._x)):
            self._x[n_s] = savgol_filter(
                self._x[n_s], h_freq, axis='times',
                sfreq=self.attrs['sfreq'], verbose=verbose)
        # edge effect compensation
        if isinstance(edges, CONFIG['FLOAT_DTYPE']):
            t = self._times
            self.sel(times=slice(t[0] + edges, t[-1] - edges))
        elif isinstance(edges, CONFIG['INT_DTYPE']):
            self.isel(times=slice(edges, -edges))
        return self

    def get_connectivity_pairs(self, as_blocks=False, directed=False,
                               verbose=None):
        """Get the connectivity pairs for this dataset.

        This method can be used to get the possible connectivity pairs, i.e.
        (sources, targets), for directed connectivity (or not). In addition,
        some pairs are going to be ignored because their number of subjects
        is too low.

        Parameters
        ----------
        as_blocks : bool | False
            If True, group the resulting pairs by sources
        directed : bool | False
            Get either directed (True) or non-directed (False) pairs

        Returns
        -------
        df_conn : pd.DataFrame
            The table describing the connectivity information per pair of
            brain regions
        df_conn_suj : pd.DataFrame
            The table describing the connectivity information per subject
        """
        rois = [k['roi'].data.astype(str) for k in self.x]
        # get the dataframes for connectivity
        self.df_conn, self.df_conn_suj = conn_get_pairs(
            rois, directed=directed, nb_min_suj=self._nb_min_suj,
            verbose=verbose)
        # filter both dataframes
        df_conn = self.df_conn.loc[self.df_conn['keep']]
        df_conn = df_conn.drop(columns='keep')
        df_conn = df_conn.reset_index(drop=True)
        df_conn_suj = self.df_conn_suj.loc[self.df_conn_suj['keep_suj']]
        df_conn_suj = df_conn_suj.drop(columns='keep_suj')
        df_conn_suj = df_conn_suj.reset_index(drop=True)
        # group by sources
        if as_blocks:
            df_conn = df_conn.groupby('sources').agg(list).reset_index()

        return df_conn, df_conn_suj

    ###########################################################################
    ###########################################################################
    #                              PROPERTIES
    ###########################################################################
    ###########################################################################

    @property
    def x(self):
        """Multi-subjects electrophysiological data (DataArray)."""
        return self._x

    @property
    def df_rs(self):
        """Pandas DataFrame of cross-subjects anatomical repartition."""
        return self._df_rs

    @property
    def times(self):
        """Time vector."""
        return self._times

    @property
    def roi_names(self):
        """List of brain regions to keep across subjects."""
        return self._roi_names
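
# Usage sketch (random data for illustration only; not part of the library
# code, and assuming numpy is imported as np as in the rest of this module):
# build a DatasetEphy from arrays following the (n_trials, n_roi, n_times)
# convention, slice it in time, then extract the data of a single region.
def _example_dataset_usage():
    n_subjects, n_trials, n_times = 3, 50, 100
    times_vec = np.linspace(-0.5, 1.5, n_times)
    x_all = [np.random.rand(n_trials, 2, n_times) for _ in range(n_subjects)]
    y_all = [np.random.rand(n_trials) for _ in range(n_subjects)]
    roi_all = [['roi_0', 'roi_1'] for _ in range(n_subjects)]
    ds = DatasetEphy(x_all, y=y_all, roi=roi_all, times=times_vec)
    # keep the (-100ms, 1s) window
    ds.sel(times=slice(-0.1, 1.))
    # concatenated, copnormed data of a single roi; dims should be
    # ('times', 'mv', 'rtr') with rtr = stacked (roi, trials) across subjects
    da_roi = ds.get_roi_data('roi_0', mi_type='cc', copnorm=True)
    return da_roi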
class SubjectEphy(object):

    def __new__(self, x, y=None, z=None, roi=None, times=None, agg_ch=True,
                multivariate=False, name=None, attrs=None, sfreq=None,
                verbose=None):
        """Init."""
        set_log_level(verbose)
        attrs = Attributes(attrs=attrs)
        _supp_dim = []

        # ========================= Data extraction ==========================

        # ___________________________ extraction _____________________________
        if isinstance(x, xr.DataArray):  # xr -> xr
            # get data, name and attributes
            attrs.update(x.attrs)
            name = x.name if name is None else name
            data = x.data
            # get y / z regressors
            y = x[y].data if isinstance(y, str) else y
            z = x[z].data if isinstance(z, str) else z
            # get spatial information (roi)
            roi = x[roi].data if isinstance(roi, str) else roi
            # build 4d (possibly multivariate) coordinate
            if x.ndim == 4:
                if multivariate:
                    _supp_dim = ('mv', np.full((x.shape[2]), np.nan))
                else:
                    _supp_dim = (x.dims[2], x[x.dims[2]].data)
            # get the temporal vector
            times = x[times].data if isinstance(times, str) else times
        if 'mne' in str(type(x)):  # mne -> xr
            times = x.times if times is None else times
            roi = x.info['ch_names'] if roi is None else roi
            sfreq = x.info['sfreq'] if sfreq is None else sfreq
            if isinstance(x, CONFIG["MNE_EPOCHS_TYPE"]):
                data = x.get_data()
            elif isinstance(x, CONFIG["MNE_EPOCHSTFR_TYPE"]):
                data = x.data
                if multivariate:
                    _supp_dim = ('mv', np.full((data.shape[2]), np.nan))
                else:
                    _supp_dim = ('freqs', x.freqs)
        if isinstance(x, np.ndarray):  # numpy -> xr
            data = x
            if data.ndim == 4:
                if multivariate:
                    _supp_dim = ('mv', np.full((data.shape[2]), np.nan))
                else:
                    _supp_dim = ('supp', np.arange(data.shape[2]))
        assert data.ndim <= 4, "Data up to 4-dimensions are supported"

        # ___________________________ Y/Z dtypes _____________________________
        # infer dtypes
        y_dtype = self._infer_dtypes(y, 'y')
        z_dtype = self._infer_dtypes(z, 'z')
        # infer supported mi_type
        mi_type = CONFIG['MI_TABLE'][y_dtype][z_dtype]
        mi_repr = CONFIG['MI_REPR'][mi_type]
        # multi to uni condition remapping
        y = multi_to_uni_conditions([y], var_name='y', verbose=verbose)[0]
        z = multi_to_uni_conditions([z], var_name='z', verbose=verbose)[0]

        # _________________________ Sampling rate ____________________________
        # infer the sampling frequency (if needed)
        if sfreq is None:
            if (times is not None) and (len(times) >= 2):
                sfreq = 1. / (times[1] - times[0])
            else:
                logger.warning("Impossible to infer the sampling frequency. "
                               "You should consider providing a time vector")
                sfreq = 1.
        sfreq = float(sfreq)

        # ============================ DataArray =============================

        # __________________________ Dims & Coords ___________________________
        dims, coords = [], OrderedDict()
        n_trials, n_roi, n_times = np.array(list(data.shape))[[0, 1, -1]]
        # don't break if empty time vector or missing roi
        if times is None:
            logger.warning("No time vector. A default one is created")
            times = np.arange(n_times) / sfreq
        if roi is None:
            logger.warning("No regions of interest are provided (roi). "
                           "Default ones are created")
            roi = [f"roi_{k}" for k in range(n_roi)]
        # build trials (potentially) multi-coordinates
        coords['trials'] = ('trials', np.arange(n_trials))
        if (y is not None) and (len(y) == n_trials):
            coords['y'] = ('trials', y)
        if (z is not None) and (len(z) == n_trials):
            coords['z'] = ('trials', z)
        if name is not None:
            coords['subject'] = ('trials', [name] * n_trials)
        dims += ['trials']
        # build space (potentially) multi-coordinates
        coords['roi'] = ('roi', roi)
        if agg_ch:
            coords['agg_ch'] = ('roi', [0] * n_roi)
        else:
            coords['agg_ch'] = ('roi', np.arange(n_roi))
        dims += ['roi']
        if _supp_dim:
            coords[_supp_dim[0]] = _supp_dim[1]
            dims += [_supp_dim[0]]
        # build temporal coordinate
        if (times is not None) and (len(times) == n_times):
            coords['times'] = ('times', times)
            dims += ['times']

        # ____________________________ Attributes ____________________________
        attrs.update({
            '__version__': frites.__version__,
            'modality': "electrophysiology",
            'dtype': 'SubjectEphy',
            'y_dtype': y_dtype,
            'z_dtype': z_dtype,
            'mi_type': mi_type,
            'mi_repr': mi_repr,
            'sfreq': sfreq,
            'agg_ch': agg_ch,
            'multivariate': multivariate
        })

        # ____________________________ DataArray _____________________________
        # For several reasons, DataArray is not easy to subclass (see #706,
        # #728, #3980). Therefore, for the moment, it's just easier to simply
        # return a DataArray
        da = xr.DataArray(data, dims=dims, coords=coords, name=name,
                          attrs=attrs)

        return da