def test_save_load_FlattenMapper(f):
    """Round-trip a FlattenMapper through saveload() and verify its shape."""
    from mvpa2.mappers.flatten import FlattenMapper
    mapper = FlattenMapper()
    ds = datasets['3dsmall']
    # forward/reverse once so the mapper is trained before serialization
    flat_ds = mapper(ds)
    reversed_ds = mapper.reverse(flat_ds)
    reloaded = saveload(mapper, f)
    assert_equal(reloaded.shape, mapper.shape)
def eeglab_dataset(samples):
    '''Make a Dataset instance from EEGLAB input data

    Parameters
    ----------
    samples: str
        Filename of EEGLAB text file, or the raw contents of such a file

    Returns
    -------
    ds: mvpa2.base.dataset.Dataset
        Dataset with the contents of the input file

    Raises
    ------
    ValueError
        If the input is not a string, the file does not exist, or the
        file contents are malformed (wrong number of values per line,
        inconsistent time points across samples, non-uniform time deltas).
    '''
    if not isinstance(samples, basestring):
        raise ValueError("Samples should be a string")

    # input may be either a filename or the file contents themselves
    if _looks_like_filename(samples):
        if not os.path.exists(samples):
            raise ValueError("Input looks like a filename, but file"
                             " %s does not exist" % samples)
        with open(samples) as f:
            samples = f.read()

    lines = samples.split('\n')

    samples = []
    cur_sample = None
    for i, line in enumerate(lines):
        if not line:
            continue
        if i == 0:
            # first line contains the channel names
            channel_labels = line.split()
            n_channels = len(channel_labels)
        else:
            # first value is the time point, the remainder the values
            # for each channel
            values = map(float, line.split())
            t = values[0]  # time
            eeg = values[1:]  # values for each electrode
            if len(eeg) != n_channels:
                # BUG FIX: original passed only two arguments to a
                # three-specifier format string, which raised TypeError
                # instead of the intended ValueError, and never reported
                # the offending line number
                raise ValueError("Line %d: expected %d values but found %d"
                                 % (i + 1, n_channels, len(eeg)))

            if cur_sample is None or t < prev_t:
                # time jumped back to the start: a new sample begins
                cur_sample = []
                samples.append(cur_sample)

            cur_sample.append((t, eeg))
            prev_t = t

    # get and verify number of elements in each dimension
    n_samples = len(samples)
    n_timepoints_all = map(len, samples)
    n_timepoints_unique = set(n_timepoints_all)
    if len(n_timepoints_unique) != 1:
        # BUG FIX: original implicit string concatenation lacked a space,
        # producing the message "...in differentsamples..."
        raise ValueError("Different number of time points in different "
                         "samples: found %d different lengths"
                         % len(n_timepoints_unique))
    n_timepoints = n_timepoints_all[0]
    shape = (n_samples, n_timepoints, n_channels)

    # allocate space for data
    data = np.zeros(shape)

    # make a list of all channels and timepoints
    channel_array = np.asarray(channel_labels)
    timepoint_array = np.asarray([samples[0][i][0]
                                  for i in xrange(n_timepoints)])
    dts = timepoint_array[1:] - timepoint_array[:-1]
    if not np.all(dts == dts[0]):
        raise ValueError("Delta time points are different")

    # put the values in the data array
    for i, sample in enumerate(samples):
        for j, (t, values) in enumerate(sample):
            # check that the time is the same as in the first sample
            if i > 0 and timepoint_array[j] != t:
                raise ValueError("Sample %d, time point %s is different "
                                 "than the first sample (%s)"
                                 % (i, t, timepoint_array[j]))
            for k, value in enumerate(values):
                data[i, j, k] = value

    samples = None  # and let gc do it's job

    # make a Dataset instance with the data
    ds = Dataset(data)

    # append a flatten_mapper to go from 3D (sample X time X channel)
    # to 2D (sample X (time X channel))
    flatten_mapper = FlattenMapper(shape=shape[1:],
                                   space='time_channel_indices')
    ds = ds.get_mapped(flatten_mapper)

    # make this a 3D array of the proper size
    channel_array_3D = np.tile(channel_array, (1, n_timepoints, 1))
    timepoint_array_3D = np.tile(np.reshape(timepoint_array, (-1, 1)),
                                 (1, 1, n_channels))

    # for consistency use the flatten_mapper defined above to
    # flatten channel and timepoint names as well
    ds.fa['channelids'] = flatten_mapper.forward(channel_array_3D).ravel()
    ds.fa['timepoints'] = flatten_mapper.forward(timepoint_array_3D).ravel()

    # make some dynamic properties
    # XXX at the moment we don't have proper 'protection' in case
    # the feature space is sliced in a way so that some channels and/or
    # timepoints occur more often than others
    _eeglab_set_attributes(ds)
    return ds
def from_wizard(cls, samples, targets=None, chunks=None, mask=None,
                mapper=None, flatten=None, space=None):
    """Convenience constructor for datasets from N-dimensional samples.

    Data arrays with more than two dimensions are flattened (first axis
    kept as the sample axis, all remaining axes concatenated into the
    feature axis), unless instructed otherwise via ``flatten``/``mapper``.

    Parameters
    ----------
    samples : ndarray
        N-dimensional samples array; the first axis separates samples.
    targets : scalar or ndarray, optional
        Labels for all samples; a scalar is broadcast to every sample.
    chunks : scalar or ndarray, optional
        Chunk definition for all samples; a scalar is broadcast likewise.
    mask : ndarray, optional
        Array shaped like a single sample; its non-zero elements select
        which input elements are kept.
    mapper : Mapper instance, optional
        Trained mapper used to forward-map the (possibly flattened and
        masked) samples. It must produce a simple samples x features
        output space; use a `ChainMapper` to achieve that if necessary.
    flatten : None or bool, optional
        If None (default) and no mapper is provided, multi-dimensional
        data gets flattened. A bool explicitly enables/disables the
        flattening performed when no mask is given.
    space : str, optional
        Assigned to the mapper performing the initial flattening.

    Returns
    -------
    instance : Dataset
    """
    # anything that is not an ndarray goes through the constructor path
    samples = np.asanyarray(samples)

    # assemble sample attributes, broadcasting scalars where needed
    sa_items = {}
    if targets is not None:
        sa_items['targets'] = _expand_attribute(targets,
                                                samples.shape[0],
                                                'targets')
    if chunks is not None:
        # no magic chunk invention: absent means absent
        sa_items['chunks'] = _expand_attribute(chunks,
                                               samples.shape[0],
                                               'chunks')

    # common checks should go into __init__
    ds = cls(samples, sa=sa_items)

    if mask is not None:
        # masking implies mapping through a mask mapper
        mm = mask_mapper(mask, space=space)
        mm.train(ds)
        ds = ds.get_mapped(mm)
    elif len(samples.shape) > 2 \
            and (flatten  # explicitly requested
                 or (flatten is None and mapper is None)):  # auto case
        # multi-dim data without a mask: flatten unless told not to
        fm = FlattenMapper(shape=samples.shape[1:], space=space)
        ds = ds.get_mapped(fm)

    # finally apply any user-provided mapper
    if mapper is not None:
        ds = ds.get_mapped(mapper)
    return ds
def test_flatten():
    """Exercise FlattenMapper forward/reverse on arrays and Datasets."""
    samples_shape = (2, 2, 4)
    data_shape = (4,) + samples_shape
    # data counts 0..63; .view(myarray) checks ndarray-subclass survival
    data = np.arange(np.prod(data_shape)).reshape(data_shape).view(myarray)
    pristinedata = data.copy()
    target = [[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
              [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31],
              [32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47],
              [48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]]
    target = np.array(target).view(myarray)
    # expected per-feature index coordinates after flattening (2, 2, 4)
    index_target = np.array([[0, 0, 0], [0, 0, 1], [0, 0, 2], [0, 0, 3],
                             [0, 1, 0], [0, 1, 1], [0, 1, 2], [0, 1, 3],
                             [1, 0, 0], [1, 0, 1], [1, 0, 2], [1, 0, 3],
                             [1, 1, 0], [1, 1, 1], [1, 1, 2], [1, 1, 3]])

    # test only flattening the first two dimensions
    fm_max = FlattenMapper(maxdims=2)
    fm_max.train(data)
    assert_equal(fm_max(data).shape, (4, 4, 4))

    # array subclass survives
    ok_(isinstance(data, myarray))

    # actually, there should be no difference between a plain FlattenMapper and
    # a chain that only has a FlattenMapper as the one element
    for fm in [FlattenMapper(space='voxel'),
               ChainMapper([FlattenMapper(space='voxel'),
                            StaticFeatureSelection(slice(None))])]:
        # not working if untrained
        assert_raises(RuntimeError,
                      fm.forward1,
                      np.arange(np.sum(samples_shape) + 1))

        fm.train(data)

        ok_(isinstance(fm.forward(data), myarray))
        ok_(isinstance(fm.forward1(data[2]), myarray))
        assert_array_equal(fm.forward(data), target)
        assert_array_equal(fm.forward1(data[2]), target[2])
        assert_raises(ValueError, fm.forward, np.arange(4))

        # all of that leaves that data unmodified
        assert_array_equal(data, pristinedata)

        # reverse mapping
        ok_(isinstance(fm.reverse(target), myarray))
        ok_(isinstance(fm.reverse1(target[0]), myarray))
        ok_(isinstance(fm.reverse(target[1:2]), myarray))
        assert_array_equal(fm.reverse(target), data)
        assert_array_equal(fm.reverse1(target[0]), data[0])
        assert_array_equal(fm.reverse1(target[0]),
                           _verified_reverse1(fm, target[0]))
        assert_array_equal(fm.reverse(target[1:2]), data[1:2])
        assert_raises(ValueError, fm.reverse, np.arange(14))

        # check one dimensional data, treated as scalar samples
        oned = np.arange(5)
        fm.train(Dataset(oned))
        # needs 2D
        assert_raises(ValueError, fm.forward, oned)
        # doesn't match mapper, since Dataset turns `oned` into (5,1)
        assert_raises(ValueError, fm.forward, oned)
        assert_equal(Dataset(oned).nfeatures, 1)

        # try dataset mode, with some feature attribute
        fattr = np.arange(np.prod(samples_shape)).reshape(samples_shape)
        ds = Dataset(data, fa={'awesome': fattr.copy()})
        assert_equal(ds.samples.shape, data_shape)
        fm.train(ds)
        dsflat = fm.forward(ds)
        ok_(isinstance(dsflat, Dataset))
        ok_(isinstance(dsflat.samples, myarray))
        assert_array_equal(dsflat.samples, target)
        assert_array_equal(dsflat.fa.awesome, np.arange(np.prod(samples_shape)))
        assert_true(isinstance(dsflat.fa['awesome'], ArrayCollectable))

        # test index creation
        assert_array_equal(index_target, dsflat.fa.voxel)

        # and back
        revds = fm.reverse(dsflat)
        ok_(isinstance(revds, Dataset))
        ok_(isinstance(revds.samples, myarray))
        assert_array_equal(revds.samples, data)
        assert_array_equal(revds.fa.awesome, fattr)
        assert_true(isinstance(revds.fa['awesome'], ArrayCollectable))
        assert_false('voxel' in revds.fa)
def test_chainmapper():
    """Verify ChainMapper container behavior, training, repr and reverse."""
    # the chain needs at least one mapper
    assert_raises(ValueError, ChainMapper, [])
    # a typical first mapper is to flatten
    cm = ChainMapper([FlattenMapper()])

    # few container checks
    assert_equal(len(cm), 1)
    assert_true(isinstance(cm[0], FlattenMapper))

    # now training
    # come up with data
    samples_shape = (2, 2, 4)
    data_shape = (4,) + samples_shape
    data = np.arange(np.prod(data_shape)).reshape(data_shape)
    target = [[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
              [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31],
              [32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47],
              [48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]]
    target = np.array(target)

    # if it is not trained it knows nothing
    cm.train(data)

    # a new mapper should appear when doing feature selection
    cm.append(StaticFeatureSelection(list(range(1, 16))))
    assert_equal(cm.forward1(data[0]).shape, (15,))
    assert_equal(len(cm), 2)
    # multiple slicing
    cm.append(StaticFeatureSelection([9, 14]))
    assert_equal(cm.forward1(data[0]).shape, (2,))
    assert_equal(len(cm), 3)

    # check reproduction
    if __debug__:
        # debug mode needs special test as it enhances the repr output
        # with module info and id() appendix for objects
        import mvpa2
        cm_clone = eval(repr(cm))
        assert_equal('#'.join(repr(cm_clone).split('#')[:-1]),
                     '#'.join(repr(cm).split('#')[:-1]))
    else:
        cm_clone = eval(repr(cm))
        assert_equal(repr(cm_clone), repr(cm))

    # what happens if we retrain the whole beast on same data as before
    cm.train(data)
    assert_equal(cm.forward1(data[0]).shape, (2,))
    assert_equal(len(cm), 3)

    # let's map something
    mdata = cm.forward(data)
    # the two chained selections [1:16] then [9, 14] pick columns 10 and 15
    assert_array_equal(mdata, target[:, [10, 15]])
    # and back
    rdata = cm.reverse(mdata)
    # original shape
    assert_equal(rdata.shape, data.shape)
    # content as far it could be restored
    assert_array_equal(rdata[rdata > 0], data[rdata > 0])
    assert_equal(np.sum(rdata > 0), 8)

    # Lets construct a dataset with mapper assigned and see
    # if sub-selecting a feature adjusts trailing StaticFeatureSelection
    # appropriately
    ds_subsel = Dataset.from_wizard(data, mapper=cm)[:, 1]
    tail_sfs = ds_subsel.a.mapper[-1]
    assert_equal(repr(tail_sfs), 'StaticFeatureSelection(slicearg=array([14]))')
def _extract_boxcar_events(
        ds, events=None, time_attr=None, match='prev',
        eprefix='event', event_mapper=None):
    """see eventrelated_dataset() for docs

    Converts (possibly real-time) event specifications into sample
    indices, extracts one boxcar of samples per event via BoxcarMapper,
    flattens (or custom-maps) the result, and dumps all event attributes
    as sample attributes of the returned dataset.
    """
    # relabel argument: map matching strategy onto value2idx conversion mode
    conv_strategy = {'prev': 'floor',
                     'next': 'ceil',
                     'closest': 'round'}[match]

    if not time_attr is None:
        tvec = ds.sa[time_attr].value
        # we are asked to convert onset time into sample ids
        descr_events = []
        for ev in events:
            # do not mess with the input data
            ev = copy.deepcopy(ev)
            # best matching sample
            idx = value2idx(ev['onset'], tvec, conv_strategy)
            # store offset of sample time and real onset
            ev['orig_offset'] = ev['onset'] - tvec[idx]
            # rescue the real onset into a new attribute
            ev['orig_onset'] = ev['onset']
            ev['orig_duration'] = ev['duration']
            # figure out how many samples we need
            ev['duration'] = \
                    len(tvec[idx:][tvec[idx:] < ev['onset'] + ev['duration']])
            # new onset is sample index
            ev['onset'] = idx
            descr_events.append(ev)
    else:
        descr_events = events
    # convert the event specs into the format expected by BoxcarMapper
    # take the first event as an example of contained keys
    evvars = _events2dict(descr_events)
    # checks
    for p in ['onset', 'duration']:
        if not p in evvars:
            raise ValueError("'%s' is a required property for all events."
                             % p)
    boxlength = max(evvars['duration'])
    if __debug__:
        if not max(evvars['duration']) == min(evvars['duration']):
            warning('Boxcar mapper will use maximum boxlength (%i) of all '
                    'provided Events.' % boxlength)

    # finally create, train and use the boxcar mapper
    bcm = BoxcarMapper(evvars['onset'], boxlength, space=eprefix)
    bcm.train(ds)
    ds = ds.get_mapped(bcm)
    if event_mapper is None:
        # at last reflatten the dataset
        # could we add some meaningful attribute during this mapping, i.e. would
        # assigning 'inspace' do something good?
        ds = ds.get_mapped(FlattenMapper(shape=ds.samples.shape[1:]))
    else:
        ds = ds.get_mapped(event_mapper)
    # add samples attributes for the events, simply dump everything as a samples
    # attribute
    # special case onset and duration in case of conversion into discrete time:
    # restore the original (real-time) values from the input events
    if not time_attr is None:
        for attr in ('onset', 'duration'):
            evvars[attr] = [e[attr] for e in events]
    ds = _evvars2ds(ds, evvars, eprefix)

    return ds
def test_flatten():
    """Exercise FlattenMapper forward/reverse on arrays and Datasets.

    NOTE(review): near-duplicate of another test_flatten in this chunk
    (this copy lacks the _verified_reverse1 check); if both end up in one
    module the later definition shadows the earlier — confirm they
    originate from different files.
    """
    samples_shape = (2, 2, 4)
    data_shape = (4,) + samples_shape
    # data counts 0..63; .view(myarray) checks ndarray-subclass survival
    data = np.arange(np.prod(data_shape)).reshape(data_shape).view(myarray)
    pristinedata = data.copy()
    target = [[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
              [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31],
              [32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47],
              [48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]]
    target = np.array(target).view(myarray)
    # expected per-feature index coordinates after flattening (2, 2, 4)
    index_target = np.array([[0, 0, 0], [0, 0, 1], [0, 0, 2], [0, 0, 3],
                             [0, 1, 0], [0, 1, 1], [0, 1, 2], [0, 1, 3],
                             [1, 0, 0], [1, 0, 1], [1, 0, 2], [1, 0, 3],
                             [1, 1, 0], [1, 1, 1], [1, 1, 2], [1, 1, 3]])

    # test only flattening the first two dimensions
    fm_max = FlattenMapper(maxdims=2)
    fm_max.train(data)
    assert_equal(fm_max(data).shape, (4, 4, 4))

    # array subclass survives
    ok_(isinstance(data, myarray))

    # actually, there should be no difference between a plain FlattenMapper and
    # a chain that only has a FlattenMapper as the one element
    for fm in [FlattenMapper(space='voxel'),
               ChainMapper([FlattenMapper(space='voxel'),
                            StaticFeatureSelection(slice(None))])]:
        # not working if untrained
        assert_raises(RuntimeError,
                      fm.forward1,
                      np.arange(np.sum(samples_shape) + 1))

        fm.train(data)

        ok_(isinstance(fm.forward(data), myarray))
        ok_(isinstance(fm.forward1(data[2]), myarray))
        assert_array_equal(fm.forward(data), target)
        assert_array_equal(fm.forward1(data[2]), target[2])
        assert_raises(ValueError, fm.forward, np.arange(4))

        # all of that leaves that data unmodified
        assert_array_equal(data, pristinedata)

        # reverse mapping
        ok_(isinstance(fm.reverse(target), myarray))
        ok_(isinstance(fm.reverse1(target[0]), myarray))
        ok_(isinstance(fm.reverse(target[1:2]), myarray))
        assert_array_equal(fm.reverse(target), data)
        assert_array_equal(fm.reverse1(target[0]), data[0])
        assert_array_equal(fm.reverse(target[1:2]), data[1:2])
        assert_raises(ValueError, fm.reverse, np.arange(14))

        # check one dimensional data, treated as scalar samples
        oned = np.arange(5)
        fm.train(Dataset(oned))
        # needs 2D
        assert_raises(ValueError, fm.forward, oned)
        # doesn't match mapper, since Dataset turns `oned` into (5,1)
        assert_raises(ValueError, fm.forward, oned)
        assert_equal(Dataset(oned).nfeatures, 1)

        # try dataset mode, with some feature attribute
        fattr = np.arange(np.prod(samples_shape)).reshape(samples_shape)
        ds = Dataset(data, fa={'awesome': fattr.copy()})
        assert_equal(ds.samples.shape, data_shape)
        fm.train(ds)
        dsflat = fm.forward(ds)
        ok_(isinstance(dsflat, Dataset))
        ok_(isinstance(dsflat.samples, myarray))
        assert_array_equal(dsflat.samples, target)
        assert_array_equal(dsflat.fa.awesome, np.arange(np.prod(samples_shape)))
        assert_true(isinstance(dsflat.fa['awesome'], ArrayCollectable))

        # test index creation
        assert_array_equal(index_target, dsflat.fa.voxel)

        # and back
        revds = fm.reverse(dsflat)
        ok_(isinstance(revds, Dataset))
        ok_(isinstance(revds.samples, myarray))
        assert_array_equal(revds.samples, data)
        assert_array_equal(revds.fa.awesome, fattr)
        assert_true(isinstance(revds.fa['awesome'], ArrayCollectable))
        assert_false('voxel' in revds.fa)
def extract_boxcar_event_samples(
        ds, events=None, time_attr=None, match='prev',
        event_offset=None, event_duration=None,
        eprefix='event', event_mapper=None):
    """Segment a dataset by extracting boxcar events

    (Multiple) consecutive samples are extracted for each event, and are
    either returned in a flattened shape, or subject to further processing.

    Events are specified as a list of dictionaries
    (see :class:`~mvpa2.misc.support.Event`) for a helper class. Each
    dictionary contains all relevant attributes to describe an event. This is
    at least the ``onset`` time of an event, but can also comprise of
    ``duration``, ``amplitude``, and arbitrary other attributes.

    Boxcar event model details
    --------------------------

    For each event all samples covering that particular event are used to form
    a corresponding sample. One sample for each event is returned. Event
    specification dictionaries must contain an ``onset`` attribute (as sample
    index in the input dataset), ``duration`` (as number of consecutive samples
    after the onset). Any number of additional attributes can be present in an
    event specification. Those attributes are included as sample attributes in
    the returned dataset.

    Alternatively, ``onset`` and ``duration`` may also be given in a
    non-discrete time specification. In this case a dataset attribute needs to
    be specified that contains time-stamps for each input data sample, and is
    used to convert times into discrete sample indices (see ``match``
    argument).

    A mapper instance can be provided (see ``event_mapper``) to implement
    further processing of each event sample, for example in order to yield
    average samples.

    Parameters
    ----------
    ds : Dataset
      The samples of this input dataset have to be in whatever ascending order.
    events : list
      Each event definition has to specify ``onset`` and ``duration``. All
      other attributes will be passed on to the sample attributes collection of
      the returned dataset.
    time_attr : str or None
      Attribute with dataset sample time-stamps.
      If not None, the ``onset`` and ``duration`` specs
      from the event list will be converted using information from this sample
      attribute. Its values will be treated as in-the-same-unit and are used to
      determine corresponding samples from real-value onset and duration
      definitions.
      For HRF modeling this argument is mandatory.
    match : {'prev', 'next', 'closest'}
      Strategy used to match real-value onsets to sample indices. 'prev'
      chooses the closest preceding samples, 'next' the closest following
      sample and 'closest' to absolute closest sample.
    event_offset : None or float
      If not None, all event ``onset`` specifications will be offset by this
      value before boxcar modeling is performed.
    event_duration : None or float
      If not None, all event ``duration`` specifications will be set to this
      value before boxcar modeling is done.
    eprefix : str or None
      If not None, this prefix is used to name additional attributes generated
      by the underlying `~mvpa2.mappers.boxcar.BoxcarMapper`. If it is set to
      None, no additional attributes will be created.
    event_mapper : Mapper
      This mapper is used to forward-map the dataset containing the boxcar
      event samples. If None (default) a FlattenMapper is employed to convert
      multi-dimensional sample matrices into simple one-dimensional sample
      vectors. This option can be used to implement temporal compression, by
      e.g. averaging samples within an event boxcar using an FxMapper.
      Any mapper needs to keep the sample axis unchanged, i.e. number and
      order of samples remain the same.

    Returns
    -------
    Dataset
      One sample per each event definition that has been passed to the
      function. Additional event attributes are included as sample attributes.

    Examples
    --------
    The documentation also contains an :ref:`example script
    <example_eventrelated>` showing a spatio-temporal analysis of fMRI data
    that involves this function.

    >>> from mvpa2.datasets import Dataset
    >>> ds = Dataset(np.random.randn(10, 25))
    >>> events = [{'onset': 2, 'duration': 4},
    ...           {'onset': 4, 'duration': 4}]
    >>> eds = eventrelated_dataset(ds, events)
    >>> len(eds)
    2
    >>> eds.nfeatures == ds.nfeatures * 4
    True
    >>> 'mapper' in ds.a
    False
    >>> print eds.a.mapper
    <Chain: <Boxcar: bl=4>-<Flatten>>

    And now the same conversion, but with events specified as real time. This
    is only possible if the input dataset contains a sample attribute with the
    necessary information about the input samples.

    >>> ds.sa['record_time'] = np.linspace(0, 5, len(ds))
    >>> rt_events = [{'onset': 1.05, 'duration': 2.2},
    ...              {'onset': 2.3, 'duration': 2.12}]
    >>> rt_eds = eventrelated_dataset(ds, rt_events, time_attr='record_time',
    ...                               match='closest')
    >>> np.all(eds.samples == rt_eds.samples)
    True
    >>> # returned dataset e.g. has info from original samples
    >>> rt_eds.sa.record_time
    array([[ 1.11111111,  1.66666667,  2.22222222,  2.77777778],
           [ 2.22222222,  2.77777778,  3.33333333,  3.88888889]])
    """
    # relabel argument: map matching strategy onto value2idx conversion mode
    conv_strategy = {'prev': 'floor',
                     'next': 'ceil',
                     'closest': 'round'}[match]

    if not (event_offset is None and event_duration is None):
        # apply global offset/duration overrides before any conversion
        descr_events = []
        for ev in events:
            # do not mess with the input data
            ev = copy.deepcopy(ev)
            if event_offset is not None:
                ev['onset'] += event_offset
            if event_duration is not None:
                ev['duration'] = event_duration
            descr_events.append(ev)
        events = descr_events

    if time_attr is not None:
        tvec = ds.sa[time_attr].value
        # we are asked to convert onset time into sample ids
        descr_events = []
        for ev in events:
            # do not mess with the input data
            ev = copy.deepcopy(ev)
            # best matching sample
            idx = value2idx(ev['onset'], tvec, conv_strategy)
            # store offset of sample time and real onset
            ev['orig_offset'] = ev['onset'] - tvec[idx]
            # rescue the real onset into a new attribute
            ev['orig_onset'] = ev['onset']
            ev['orig_duration'] = ev['duration']
            # figure out how many samples we need
            ev['duration'] = \
                    len(tvec[idx:][tvec[idx:] < ev['onset'] + ev['duration']])
            # new onset is sample index
            ev['onset'] = idx
            descr_events.append(ev)
    else:
        descr_events = events
    # convert the event specs into the format expected by BoxcarMapper
    # take the first event as an example of contained keys
    evvars = _events2dict(descr_events)
    # checks
    for p in ['onset', 'duration']:
        if not p in evvars:
            raise ValueError("'%s' is a required property for all events."
                             % p)
    boxlength = max(evvars['duration'])
    if __debug__:
        if not max(evvars['duration']) == min(evvars['duration']):
            warning('Boxcar mapper will use maximum boxlength (%i) of all '
                    'provided Events.' % boxlength)

    # finally create, train and use the boxcar mapper
    bcm = BoxcarMapper(evvars['onset'], boxlength, space=eprefix)
    bcm.train(ds)
    ds = ds.get_mapped(bcm)
    if event_mapper is None:
        # at last reflatten the dataset
        # could we add some meaningful attribute during this mapping, i.e. would
        # assigning 'inspace' do something good?
        ds = ds.get_mapped(FlattenMapper(shape=ds.samples.shape[1:]))
    else:
        ds = ds.get_mapped(event_mapper)
    # add samples attributes for the events, simply dump everything as a samples
    # attribute
    # special case onset and duration in case of conversion into discrete time:
    # restore the original (real-time) values from the input events
    if time_attr is not None:
        for attr in ('onset', 'duration'):
            evvars[attr] = [e[attr] for e in events]
    ds = _evvars2ds(ds, evvars, eprefix)

    return ds
def test_datasetmapping():
    """Check BoxcarMapper forward/reverse on a Dataset incl. attributes."""
    # 6 samples, 4X2 features
    data = np.arange(48).reshape(6, 4, 2)
    ds = Dataset(data,
                 sa={'timepoints': np.arange(6),
                     'multidim': data.copy()},
                 fa={'fid': np.arange(4)})
    # with overlapping and non-overlapping boxcars
    startpoints = [0, 1, 4]
    boxlength = 2
    bm = BoxcarMapper(startpoints, boxlength, space='boxy')
    # train is critical
    bm.train(ds)
    mds = bm.forward(ds)
    assert_equal(len(mds), len(startpoints))
    assert_equal(mds.nfeatures, boxlength)
    # all samples attributes remain, but they can be rotated/compressed into
    # multidimensional attributes
    assert_equal(sorted(mds.sa.keys()),
                 ['boxy_onsetidx'] + sorted(ds.sa.keys()))
    assert_equal(mds.sa.multidim.shape,
                 (len(startpoints), boxlength) + ds.shape[1:])
    assert_equal(mds.sa.timepoints.shape, (len(startpoints), boxlength))
    assert_array_equal(mds.sa.timepoints.flatten(),
                       np.array([(s, s + 1) for s in startpoints]).flatten())
    assert_array_equal(mds.sa.boxy_onsetidx, startpoints)
    # feature attributes also get rotated and broadcasted
    assert_array_equal(mds.fa.fid, [ds.fa.fid, ds.fa.fid])
    # and finally there is a new one
    assert_array_equal(mds.fa.boxy_offsetidx, list(range(boxlength)))

    # now see how it works on reverse()
    rds = bm.reverse(mds)
    # we got at least something of all original attributes back
    assert_equal(sorted(rds.sa.keys()), sorted(ds.sa.keys()))
    assert_equal(sorted(rds.fa.keys()), sorted(ds.fa.keys()))
    # it is not possible to reconstruct the full samples array
    # some samples even might show up multiple times (when there are overlapping
    # boxcars)
    assert_array_equal(rds.samples,
                       np.array([[[0, 1], [2, 3], [4, 5], [6, 7]],
                                 [[8, 9], [10, 11], [12, 13], [14, 15]],
                                 [[8, 9], [10, 11], [12, 13], [14, 15]],
                                 [[16, 17], [18, 19], [20, 21], [22, 23]],
                                 [[32, 33], [34, 35], [36, 37], [38, 39]],
                                 [[40, 41], [42, 43], [44, 45], [46, 47]]]))
    assert_array_equal(rds.sa.timepoints, [0, 1, 1, 2, 4, 5])
    assert_array_equal(rds.sa.multidim, ds.sa.multidim[rds.sa.timepoints])
    # but feature attributes should be fully recovered
    assert_array_equal(rds.fa.fid, ds.fa.fid)

    # popular dataset configuration (double flatten + boxcar)
    cm = ChainMapper([FlattenMapper(), bm, FlattenMapper()])
    cm.train(ds)
    bflat = ds.get_mapped(cm)
    assert_equal(bflat.shape,
                 (len(startpoints), boxlength * np.prod(ds.shape[1:])))
    # add attributes
    bflat.fa['testfa'] = np.arange(bflat.nfeatures)
    bflat.sa['testsa'] = np.arange(bflat.nsamples)
    # now try to go back
    bflatrev = bflat.mapper.reverse(bflat)
    # data should be same again, as far as the boxcars match
    assert_array_equal(ds.samples[:2], bflatrev.samples[:2])
    assert_array_equal(ds.samples[-2:], bflatrev.samples[-2:])
    # feature axis should match
    assert_equal(ds.shape[1:], bflatrev.shape[1:])
def fmri_dataset(samples, targets=None, chunks=None, mask=None,
                 sprefix='voxel', tprefix='time', add_fa=None,):
    """Create a dataset from an fMRI timeseries image.

    The timeseries image serves as the samples data, with each volume becoming
    a sample. All 3D volume samples are flattened into one-dimensional feature
    vectors, optionally being masked (i.e. subset of voxels corresponding to
    non-zero elements in a mask image).

    In addition to (optional) samples attributes for targets and chunks the
    returned dataset contains a number of additional attributes:

    Samples attributes (per each volume):

      * volume index (time_indices)
      * volume acquisition time (time_coord)

    Feature attributes (per each voxel):

      * voxel indices (voxel_indices), sometimes referred to as ijk

    Dataset attributes:

      * dump of the image (e.g. NIfTI) header data (imghdr)
      * class of the image (e.g. Nifti1Image) (imgtype)
      * volume extent (voxel_dim)
      * voxel extent (voxel_eldim)

    The default attribute name is listed in parenthesis, but may be altered by
    the corresponding prefix arguments. The validity of the attribute values
    relies on correct settings in the NIfTI image header.

    Parameters
    ----------
    samples : str or NiftiImage or list
      fMRI timeseries, specified either as a filename (single file 4D image),
      an image instance (4D image), or a list of filenames or image instances
      (each list item corresponding to a 3D volume).
    targets : scalar or sequence
      Label attribute for each volume in the timeseries, or a scalar value
      that is assigned to all samples.
    chunks : scalar or sequence
      Chunk attribute for each volume in the timeseries, or a scalar value
      that is assigned to all samples.
    mask : str or NiftiImage
      Filename or image instance of a 3D volume mask. Voxels corresponding to
      non-zero elements in the mask will be selected. The mask has to be in
      the same space (orientation and dimensions) as the timeseries image
    sprefix : str or None
      Prefix for attribute names describing spatial properties of the
      timeseries. If None, no such attributes are stored in the dataset.
    tprefix : str or None
      Prefix for attribute names describing temporal properties of the
      timeseries. If None, no such attributes are stored in the dataset.
    add_fa : dict or None
      Optional dictionary with additional volumetric data that shall be stored
      as feature attributes in the dataset. The dictionary key serves as the
      feature attribute name. Each value might be of any type supported by the
      'mask' argument of this function.

    Returns
    -------
    Dataset
    """
    # load the samples
    imgdata, imghdr, img = _load_anyimg(samples, ensure=True, enforce_dim=4)

    # figure out what the mask is, but only handle known cases, the rest
    # goes directly into the mapper which maybe knows more
    maskimg = _load_anyimg(mask)
    if maskimg is None:
        pass
    else:
        # take just data and ignore the header
        mask = maskimg[0]

    # compile the samples attributes
    sa = {}
    if targets is not None:
        sa['targets'] = _expand_attribute(targets, imgdata.shape[0], 'targets')
    if chunks is not None:
        sa['chunks'] = _expand_attribute(chunks, imgdata.shape[0], 'chunks')

    # create a dataset
    ds = Dataset(imgdata, sa=sa)
    if sprefix is None:
        space = None
    else:
        space = sprefix + '_indices'
    # flatten each 3D volume into a 1D feature vector
    ds = ds.get_mapped(FlattenMapper(shape=imgdata.shape[1:], space=space))

    # now apply the mask if any
    if mask is not None:
        flatmask = ds.a.mapper.forward1(mask)
        # direct slicing is possible, and it is potentially more efficient,
        # so let's use it
        #mapper = StaticFeatureSelection(flatmask)
        #ds = ds.get_mapped(StaticFeatureSelection(flatmask))
        ds = ds[:, flatmask != 0]

    # load and store additional feature attributes
    if add_fa is not None:
        for fattr in add_fa:
            value = _load_anyimg(add_fa[fattr], ensure=True)[0]
            ds.fa[fattr] = ds.a.mapper.forward1(value)

    # store interesting NIfTI props in the dataset in a more portable way
    ds.a['imgaffine'] = img.affine
    ds.a['imgtype'] = img.__class__.__name__
    # stick the header instance in as is, and ...
    ds.a['imghdr'] = imghdr
    # ... let strip_nibabel() be the central place to take care of any header
    # conversion into non-NiBabel dtypes
    strip_nibabel(ds)

    # If there is a space assigned, store the extent of that space
    if sprefix is not None:
        ds.a[sprefix + '_dim'] = imgdata.shape[1:]
        # 'voxdim' is (x,y,z) while 'samples' are (t,z,y,x)
        ds.a[sprefix + '_eldim'] = _get_voxdim(imghdr)
        # TODO extend with the unit
    if tprefix is not None:
        ds.sa[tprefix + '_indices'] = np.arange(len(ds), dtype='int')
        ds.sa[tprefix + '_coords'] = \
            np.arange(len(ds), dtype='float') * _get_dt(imghdr)
        # TODO extend with the unit

    return ds
def eventrelated_dataset(ds, events=None, time_attr=None, match='prev',
                         eprefix='event'):
    """Segment a dataset into a set of events.

    This function can be used to extract event-related samples from any
    time-series based dataset (actually, it doesn't have to be a time series,
    but could also be any other type of ordered samples). Boxcar-shaped event
    samples, potentially spanning multiple input samples can be automatically
    extracted using :class:`~mvpa2.misc.support.Event` definition lists.  For
    each event all samples covering that particular event are used to form the
    corresponding sample.

    An event definition is a dictionary that contains ``onset`` (as sample
    index in the input dataset), ``duration`` (as number of consecutive
    samples after the onset), as well as an arbitrary number of additional
    attributes.

    Alternatively, ``onset`` and ``duration`` may also be given as real time
    stamps (or durations). In this case a to be specified samples attribute
    in the input dataset will be used to convert these into sample indices.

    Parameters
    ----------
    ds : Dataset
      The samples of this input dataset have to be in whatever ascending order.
    events : list
      Each event definition has to specify ``onset`` and ``duration``. All
      other attributes will be passed on to the sample attributes collection
      of the returned dataset.
    time_attr : str or None
      If not None, the ``onset`` and ``duration`` specs from the event list
      will be converted using information from this sample attribute. Its
      values will be treated as in-the-same-unit and are used to determine
      corresponding samples from real-value onset and duration definitions.
    match : {'prev', 'next', 'closest'}
      Strategy used to match real-value onsets to sample indices. 'prev'
      chooses the closest preceding sample, 'next' the closest following
      sample and 'closest' the absolute closest sample.
    eprefix : str or None
      If not None, this prefix is used to name additional attributes generated
      by the underlying `~mvpa2.mappers.boxcar.BoxcarMapper`. If it is set to
      None, no additional attributes will be created.

    Returns
    -------
    Dataset
      The returned dataset has one sample per each event definition that has
      been passed to the function.

    Examples
    --------
    The documentation also contains an :ref:`example script
    <example_eventrelated>` showing a spatio-temporal analysis of fMRI data
    that involves this function.

    >>> from mvpa2.datasets import Dataset
    >>> ds = Dataset(np.random.randn(10, 25))
    >>> events = [{'onset': 2, 'duration': 4},
    ...           {'onset': 4, 'duration': 4}]
    >>> eds = eventrelated_dataset(ds, events)
    >>> len(eds)
    2
    >>> eds.nfeatures == ds.nfeatures * 4
    True
    >>> 'mapper' in ds.a
    False
    >>> print eds.a.mapper
    <Chain: <Boxcar: bl=4>-<Flatten>>

    And now the same conversion, but with events specified as real time. This
    is only possible if the input dataset contains a sample attribute with the
    necessary information about the input samples.

    >>> ds.sa['record_time'] = np.linspace(0, 5, len(ds))
    >>> rt_events = [{'onset': 1.05, 'duration': 2.2},
    ...              {'onset': 2.3, 'duration': 2.12}]
    >>> rt_eds = eventrelated_dataset(ds, rt_events, time_attr='record_time',
    ...                               match='closest')
    >>> np.all(eds.samples == rt_eds.samples)
    True
    >>> # returned dataset e.g. has info from original samples
    >>> rt_eds.sa.record_time
    array([[ 1.11111111,  1.66666667,  2.22222222,  2.77777778],
           [ 2.22222222,  2.77777778,  3.33333333,  3.88888889]])
    """
    # relabel the matching strategy into the conversion mode understood by
    # value2idx()
    conv_strategy = {'prev': 'floor',
                     'next': 'ceil',
                     'closest': 'round'}[match]

    if time_attr is not None:
        tvec = ds.sa[time_attr].value
        # we are asked to convert onset time into sample ids
        descr_events = []
        for ev in events:
            # do not mess with the input data
            ev = copy.deepcopy(ev)
            # best matching sample
            idx = value2idx(ev['onset'], tvec, conv_strategy)
            # store offset of sample time and real onset
            ev['orig_offset'] = ev['onset'] - tvec[idx]
            # rescue the real onset into a new attribute
            ev['orig_onset'] = ev['onset']
            ev['orig_duration'] = ev['duration']
            # figure out how many samples we need
            ev['duration'] = \
                len(tvec[idx:][tvec[idx:] < ev['onset'] + ev['duration']])
            # new onset is sample index
            ev['onset'] = idx
            descr_events.append(ev)
    else:
        descr_events = events
    # convert the event specs into the format expected by BoxcarMapper
    # take the first event as an example of contained keys
    evvars = {}
    for k in descr_events[0]:
        try:
            evvars[k] = [e[k] for e in descr_events]
        except KeyError:
            raise ValueError("Each event property must be present for all "
                             "events (could not find '%s')" % k)
    # checks
    for p in ['onset', 'duration']:
        if p not in evvars:
            raise ValueError("'%s' is a required property for all events."
                             % p)
    boxlength = max(evvars['duration'])
    if __debug__:
        if not max(evvars['duration']) == min(evvars['duration']):
            warning('Boxcar mapper will use maximum boxlength (%i) of all '
                    'provided Events.' % boxlength)

    # finally create, train and use the boxcar mapper
    bcm = BoxcarMapper(evvars['onset'], boxlength, space=eprefix)
    bcm.train(ds)
    ds = ds.get_mapped(bcm)
    # at last reflatten the dataset
    # could we add some meaningful attribute during this mapping, i.e. would
    # assigning 'inspace' do something good?
    ds = ds.get_mapped(FlattenMapper(shape=ds.samples.shape[1:]))
    # add samples attributes for the events, simply dump everything as a
    # samples attribute
    for a in evvars:
        if eprefix is not None and a in ds.sa:
            # if there is already a samples attribute like this, it got mapped
            # by BoxcarMapper (i.e. is multi-dimensional). We move it aside
            # under new `eprefix` name
            ds.sa[eprefix + '_' + a] = ds.sa[a]
        if a in ['onset', 'duration']:
            # special case: we want the non-discrete, original onset and
            # duration
            if time_attr is not None:
                # but only if there was a conversion happening, since
                # otherwise we get the same info from BoxcarMapper
                ds.sa[a] = [e[a] for e in events]
        else:
            ds.sa[a] = evvars[a]
    return ds
def simple_sim1(shape, dissims,
                rois_arrangement='circle',
                roi_neighborhood=None,
                nruns=1, nsubjects=1,
                # noise components -- we just add normal for now also with
                # spatial smoothing to possibly create difference in noise
                # characteristics across different kinds
                #
                # "Instrumental noise" -- generic nuisance
                noise_independent_std=0.4, noise_independent_smooth=3.,
                # "Intrinsic signal", specific per each subject (due to
                # motion, whatever) -- might be fun for someone to cluster,
                # but irrelevant for us
                noise_subject_n=1, noise_subject_std=0.4,
                noise_subject_smooth=1.5,
                # "Intrinsic common signal" -- probably generalizes across
                # subjects and fun for someone studying veins to get those
                # reproducible clusters. It will be mixed in also with
                # different weights per each run.
                # Again -- might be fun for someone to cluster, but not for
                # us since it would not be representative of the original
                # signal
                noise_common_n=1, noise_common_std=0.4,
                noise_common_smooth=2.):
    """Simulate "data" containing similarity matrices with 3 noise components
    for multiple subjects

    Noise components are:

    - random normal noise, also spatially smoothed (should have smaller
      sigma for smoothing probably than for intrinsic noise)

    - intrinsic noise which is composed from a set of random fields,
      generated by random normal noise with subsequent spatial filtering,
      which are then mixed into each run data with random weights.  They
      are to simulate subject-specific intrinsic signals such as artifacts
      due to motion, possible subject-specific physiological processes

    - intrinsic common noise across subjects intrinsic noise (e.g. all of
      them have similar blood distribution networks and other physiological
      parameters, and some intrinsic networks, which although similar in
      space would have different mix-in coefficients across subject/runs)

    Theoretically, decomposition methods (such as ICA, PCA, etc) should
    help to identify such common noise components and filter them out.
    Also methods which iteratively remove non-informative projections (such
    as GLMdenoise) should be effective to identify those mix-ins.

    `roi_neighborhood` defaults to ``Sphere(5)`` (resolved lazily so no
    shared instance is created at import time).

    TODO: now mix-in happens with purely normal random weights, ideally we
    should color those as well

    Returns
    -------
    tuple
      ``(signal_clean, cluster_truth, dss)`` -- the clean signal (back in
      correlation space), the integer ground-truth cluster map, and a list
      of one stacked Dataset per subject.
    """
    # resolve the default neighborhood lazily instead of evaluating
    # Sphere(5) once at function-definition time
    if roi_neighborhood is None:
        roi_neighborhood = Sphere(5)

    ndissims = len(dissims)

    # first we fisher transform so we can add normal noise
    # check first that we don't have extreme values that might give infinity
    dissims = np.array(dissims)
    dissims = 1. - dissims
    dissims[dissims == 1] = 0.99
    dissims[dissims == -1] = -0.99
    # fisher z-transform
    dissims = np.arctanh(dissims)

    # generate target clean "picture"
    d = np.asanyarray(dissims[0])
    signal_clean = np.zeros(shape + (len(vector_form(d)),))
    # number of per-run samples == number of unique dissimilarity pairs;
    # computed once here instead of relying on the `dissim` loop variable
    # leaking out of the ROI-arrangement loop below
    n_pairs = signal_clean.shape[-1]

    # generate ground truth for clustering
    cluster_truth = np.zeros(shape, dtype='int')

    if rois_arrangement == 'circle':
        radius = min(shape[:2]) / 4.
        center = np.array((radius * 2,) * len(shape)).astype(int)
        # arrange at quarter distance from center
        for i, dissim in enumerate(dissims):
            dissim = vector_form(dissim)
            # that is kinda boring -- the same dissimilarity to each
            # voxel???
            #
            # TODO: come up with a better arrangement/idea, e.g. to
            # generate an MVPA pattern which would satisfy the
            # dissimilarity (not exactly but at least close).  That
            # would make more sense
            roi_center = center.copy()
            roi_center[0] += int(radius * np.cos(2 * np.pi * i / ndissims))
            roi_center[1] += int(radius * np.sin(2 * np.pi * i / ndissims))
            for coords in roi_neighborhood(roi_center):
                acoords = np.asanyarray(coords)
                # only write voxels that fall inside the volume
                if np.all(acoords >= [0] * len(coords)) and \
                   np.all(acoords < signal_clean.shape[:len(coords)]):
                    signal_clean.__setitem__(coords, dissim)
                    cluster_truth.__setitem__(coords, i + 1)
    else:
        raise ValueError("I know only circle")

    # generated randomly and will be mixed into subjects with different
    # weights
    # TODO: static across runs within subject?? if so -- would be no
    # different from having RSAs?
    common_noises = get_intrinsic_noises(signal_clean.shape,
                                         std=noise_common_std,
                                         sigma=noise_common_smooth,
                                         n=noise_common_n)
    assert common_noises[0].ndim == 3, "There should be no time comp"

    # Now lets generate per subject and per run data by adding some noise(s)
    dss = []
    for isubject in xrange(nsubjects):
        # Interesting noise, simulating some underlying process which has
        # nothing to do with original design/similarity but having spatial
        # structure which repeats through runs with random weights (consider
        # it to be a principal component)

        # generated randomly for each subject separately, but they should
        # have common structure across runs
        subj_specific_noises = get_intrinsic_noises(signal_clean.shape,
                                                    std=noise_subject_std,
                                                    sigma=noise_subject_smooth,
                                                    n=noise_subject_n)
        assert subj_specific_noises[0].ndim == 3, \
            "There should be no time comp"

        dss_subject = []
        # common noise enters each subject with its own random weight
        subj_common_noises = [noise * np.random.normal()
                              for noise in common_noises]

        subj_specific_mixins = generate_mixins(nruns)
        subj_common_mixins = generate_mixins(nruns)

        for run in range(nruns):
            signal_run = signal_clean.copy()
            for noise in subj_specific_noises:
                signal_run += noise * subj_specific_mixins[run]
            for noise in subj_common_noises:
                signal_run += noise * subj_common_mixins[run]
            # "Instrumental" noise -- the most banal: generic, no common
            # structure across subjects/runs
            signal_run += filter_each_2d(
                np.random.normal(size=signal_clean.shape)
                * noise_independent_std,
                noise_independent_smooth)

            # go back to correlations with inverse of fisher
            signal_run = np.tanh(signal_run)

            # rollaxis to bring similarities into leading dimension
            ds = Dataset(np.rollaxis(signal_run, 2, 0))
            ds.sa['chunks'] = [run]
            ds.sa['dissimilarity'] = np.arange(n_pairs)  # Lame one for now
            ds_flat = ds.get_mapped(FlattenMapper(shape=ds.shape[1:],
                                                  space='pixel_indices'))
            dss_subject.append(ds_flat)

        ds = dsvstack(dss_subject)
        # .a are not transferred by vstack
        ds.a['mapper'] = dss_subject[0].a.mapper
        dss.append(ds)

    # sanity checks (a duplicated assertion was removed here)
    assert len(dss) == nsubjects
    assert len(dss[0]) == nruns * n_pairs

    return np.tanh(signal_clean), cluster_truth, dss