Example #1
def test_save_load_FlattenMapper(f):
    from mvpa2.mappers.flatten import FlattenMapper
    fm = FlattenMapper()
    ds = datasets['3dsmall']
    ds_ = fm(ds)
    ds_r = fm.reverse(ds_)
    fm_ = saveload(fm, f)
    assert_equal(fm_.shape, fm.shape)
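
A minimal round-trip sketch of what the test above exercises, assuming PyMVPA's mvpa2 package is available: flatten 3D samples into 2D feature vectors and map them back.

import numpy as np
from mvpa2.mappers.flatten import FlattenMapper

data = np.arange(24).reshape(2, 3, 4)  # 2 samples, each of shape (3, 4)
fm = FlattenMapper()
fm.train(data)                         # learns the per-sample shape (3, 4)
flat = fm.forward(data)                # -> shape (2, 12)
restored = fm.reverse(flat)            # -> shape (2, 3, 4)
assert np.array_equal(restored, data)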
Example #2
def eeglab_dataset(samples):
    '''Make a Dataset instance from EEGLAB input data

    Parameters
    ----------
    samples: str
        Filename of EEGLAB text file

    Returns
    -------
    ds: mvpa2.base.dataset.Dataset
        Dataset with the contents of the input file
    '''
    if not isinstance(samples, basestring):
        raise ValueError("Samples should be a string")

    if _looks_like_filename(samples):
        if not os.path.exists(samples):
            raise ValueError("Input looks like a filename, but file"
                                " %s does not exist" % samples)
        with open(samples) as f:
            samples = f.read()

    lines = samples.split('\n')
    samples = []
    cur_sample = None

    for i, line in enumerate(lines):
        if not line:
            continue
        if i == 0:
            # first line contains the channel names
            channel_labels = line.split()
            n_channels = len(channel_labels)
        else:
            # first value is the time point, the remaining are the values
            # for each channel
            values = map(float, line.split())
            t = values[0]  # time 
            eeg = values[1:] # values for each electrode

            if len(eeg) != n_channels:
                raise ValueError("Line %d: expected %d values but found %d" %
                                    (i + 1, n_channels, len(eeg)))

            if cur_sample is None or t < prev_t:
                # new sample
                cur_sample = []
                samples.append(cur_sample)

            cur_sample.append((t, eeg))
            prev_t = t

    # get and verify number of elements in each dimension
    n_samples = len(samples)
    n_timepoints_all = map(len, samples)

    n_timepoints_unique = set(n_timepoints_all)
    if len(n_timepoints_unique) != 1:
        raise ValueError("Different number of time points in different"
                            "samples: found %d different lengths" %
                            len(n_timepoints_unique))

    n_timepoints = n_timepoints_all[0]

    shape = (n_samples, n_timepoints, n_channels)

    # allocate space for data
    data = np.zeros(shape)

    # make a list of all channels and timepoints
    channel_array = np.asarray(channel_labels)
    timepoint_array = np.asarray([samples[0][i][0]
                                  for i in xrange(n_timepoints)])

    dts = timepoint_array[1:] - timepoint_array[:-1]
    if not np.all(dts == dts[0]):
        raise ValueError("Delta time points are different")

    # put the values in the data array
    for i, sample in enumerate(samples):
        for j, (t, values) in enumerate(sample):
            # check that the time is the same
            if i > 0 and timepoint_array[j] != t:
                raise ValueError("Sample %d, time point %s is different "
                                 "than the first sample (%s)" %
                                 (i, t, timepoint_array[j]))

            for k, value in enumerate(values):
                data[i, j, k] = value

    samples = None # and let gc do its job

    # make a Dataset instance with the data
    ds = Dataset(data)

    # append a flatten_mapper to go from 3D (sample X time X channel)
    # to 2D (sample X (time X channel))
    flatten_mapper = FlattenMapper(shape=shape[1:], space='time_channel_indices')
    ds = ds.get_mapped(flatten_mapper)

    # make this a 3D array of the proper size
    channel_array_3D = np.tile(channel_array, (1, n_timepoints, 1))
    timepoint_array_3D = np.tile(np.reshape(timepoint_array, (-1, 1)),
                                            (1, 1, n_channels))

    # for consistency use the flatten_mapper defined above to
    # flatten channel and timepoint names as well
    ds.fa['channelids'] = flatten_mapper.forward(channel_array_3D).ravel()
    ds.fa['timepoints'] = flatten_mapper.forward(timepoint_array_3D).ravel()

    # make some dynamic properties
    # XXX at the moment we don't have proper 'protection' in case
    # the feature space is sliced in a way so that some channels and/or
    # timepoints occur more often than others 
    _eeglab_set_attributes(ds)

    return ds
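
An illustrative sketch of calling the parser above with an inline EEGLAB-style text block (channel names on the first line, then "time value-per-channel" rows); two epochs are detected because the time stamps restart. The import path is an assumption based on PyMVPA's layout.

from mvpa2.datasets.eeglab import eeglab_dataset  # assumed import path

samples_text = '\n'.join(['Fz Cz Pz',
                          '0 0.1 0.2 0.3',
                          '1 0.4 0.5 0.6',
                          '0 0.7 0.8 0.9',
                          '1 1.0 1.1 1.2'])
ds = eeglab_dataset(samples_text)
print(ds.shape)          # (2, 6): 2 epochs x (2 timepoints x 3 channels)
print(ds.fa.channelids)  # each channel label repeated for every timepoint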
Example #3
    def from_wizard(cls,
                    samples,
                    targets=None,
                    chunks=None,
                    mask=None,
                    mapper=None,
                    flatten=None,
                    space=None):
        """Convenience method to create dataset.

        Datasets can be created from N-dimensional samples. Data arrays with
        more than two dimensions are going to be flattened, while preserving
        the first axis (separating the samples) and concatenating all other as
        the second axis. Optionally, it is possible to specify targets and
        chunk attributes for all samples, and masking of the input data (only
        selecting elements corresponding to non-zero mask elements).

        Parameters
        ----------
        samples : ndarray
          N-dimensional samples array. The first axis separates individual
          samples.
        targets : scalar or ndarray, optional
          Labels for all samples. If a scalar is provided its value is assigned
          as label to all samples.
        chunks : scalar or ndarray, optional
          Chunks definition for all samples. If a scalar is provided its value
          is assigned as chunk for all samples.
        mask : ndarray, optional
          The shape of the array has to correspond to the shape of a single
          sample (shape(samples)[1:] == shape(mask)). Its non-zero elements
          are used to mask the input data.
        mapper : Mapper instance, optional
          A trained mapper instance that is used to forward-map
          possibly already flattened (see flatten) and masked samples
          upon construction of the dataset. The mapper must have a
          simple feature space (samples x features) as output. Use a
          `ChainMapper` to achieve that, if necessary.
        flatten : None or bool, optional
          If None (default) and no mapper is provided, the data gets flattened.
          A boolean value explicitly instructs whether to flatten the data before
          it is possibly passed into the mapper; this only applies if no mask is
          given.
        space : str, optional
          If provided it is assigned to the mapper instance that performs the
          initial flattening of the data.

        Returns
        -------
        instance : Dataset
        """
        # for all non-ndarray samples you need to go with the constructor
        samples = np.asanyarray(samples)

        # compile the necessary samples attributes collection
        sa_items = {}

        if not targets is None:
            sa_items['targets'] = _expand_attribute(targets, samples.shape[0],
                                                    'targets')

        if not chunks is None:
            # unlike previous implementation, we do not do magic to do chunks
            # if there are none, there are none
            sa_items['chunks'] = _expand_attribute(chunks, samples.shape[0],
                                                   'chunks')

        # common checks should go into __init__
        ds = cls(samples, sa=sa_items)
        # apply mask through mapper
        if mask is None:
            # if we have multi-dim data
            if len(samples.shape) > 2 and \
                   ((flatten is None and mapper is None) # auto case
                    or flatten):                         # bool case
                fm = FlattenMapper(shape=samples.shape[1:], space=space)
                ds = ds.get_mapped(fm)
        else:
            mm = mask_mapper(mask, space=space)
            mm.train(ds)
            ds = ds.get_mapped(mm)

        # apply generic mapper
        if not mapper is None:
            ds = ds.get_mapped(mapper)
        return ds
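
A minimal sketch of the wizard path above, assuming mvpa2 is installed: multi-dimensional samples are flattened automatically and a mask keeps only part of the feature space ('myspace' is just an illustrative space name).

import numpy as np
from mvpa2.datasets import Dataset

samples = np.random.randn(5, 3, 3)   # 5 samples with 3x3 features each
mask = np.zeros((3, 3))
mask[0, :] = 1                       # keep only the first row of features
ds = Dataset.from_wizard(samples, targets=1, chunks=0,
                         mask=mask, space='myspace')
print(ds.shape)        # (5, 3): masked feature space
print(ds.fa.myspace)   # original (row, column) index of every kept feature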
Example #4
def test_flatten():
    samples_shape = (2, 2, 4)
    data_shape = (4,) + samples_shape
    data = np.arange(np.prod(data_shape)).reshape(data_shape).view(myarray)
    pristinedata = data.copy()
    target = [[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
              [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31],
              [32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47],
              [48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]]
    target = np.array(target).view(myarray)
    index_target = np.array([[0, 0, 0], [0, 0, 1], [0, 0, 2], [0, 0, 3],
                            [0, 1, 0], [0, 1, 1], [0, 1, 2], [0, 1, 3],
                            [1, 0, 0], [1, 0, 1], [1, 0, 2], [1, 0, 3],
                            [1, 1, 0], [1, 1, 1], [1, 1, 2], [1, 1, 3]])

    # test only flattening the first two dimensions
    fm_max = FlattenMapper(maxdims=2)
    fm_max.train(data)
    assert_equal(fm_max(data).shape, (4, 4, 4))

    # array subclass survives
    ok_(isinstance(data, myarray))

    # actually, there should be no difference between a plain FlattenMapper and
    # a chain that only has a FlattenMapper as the one element
    for fm in [FlattenMapper(space='voxel'),
               ChainMapper([FlattenMapper(space='voxel'),
                            StaticFeatureSelection(slice(None))])]:
        # not working if untrained
        assert_raises(RuntimeError,
                      fm.forward1,
                      np.arange(np.sum(samples_shape) + 1))

        fm.train(data)

        ok_(isinstance(fm.forward(data), myarray))
        ok_(isinstance(fm.forward1(data[2]), myarray))
        assert_array_equal(fm.forward(data), target)
        assert_array_equal(fm.forward1(data[2]), target[2])
        assert_raises(ValueError, fm.forward, np.arange(4))

        # all of that leaves that data unmodified
        assert_array_equal(data, pristinedata)

        # reverse mapping
        ok_(isinstance(fm.reverse(target), myarray))
        ok_(isinstance(fm.reverse1(target[0]), myarray))
        ok_(isinstance(fm.reverse(target[1:2]), myarray))
        assert_array_equal(fm.reverse(target), data)
        assert_array_equal(fm.reverse1(target[0]), data[0])
        assert_array_equal(fm.reverse1(target[0]),
                           _verified_reverse1(fm, target[0]))
        assert_array_equal(fm.reverse(target[1:2]), data[1:2])
        assert_raises(ValueError, fm.reverse, np.arange(14))

        # check one dimensional data, treated as scalar samples
        oned = np.arange(5)
        fm.train(Dataset(oned))
        # needs 2D
        assert_raises(ValueError, fm.forward, oned)
        # doesn't match mapper, since Dataset turns `oned` into (5,1)
        assert_raises(ValueError, fm.forward, oned)
        assert_equal(Dataset(oned).nfeatures, 1)

        # try dataset mode, with some feature attribute
        fattr = np.arange(np.prod(samples_shape)).reshape(samples_shape)
        ds = Dataset(data, fa={'awesome': fattr.copy()})
        assert_equal(ds.samples.shape, data_shape)
        fm.train(ds)
        dsflat = fm.forward(ds)
        ok_(isinstance(dsflat, Dataset))
        ok_(isinstance(dsflat.samples, myarray))
        assert_array_equal(dsflat.samples, target)
        assert_array_equal(dsflat.fa.awesome, np.arange(np.prod(samples_shape)))
        assert_true(isinstance(dsflat.fa['awesome'], ArrayCollectable))
        # test index creation
        assert_array_equal(index_target, dsflat.fa.voxel)

        # and back
        revds = fm.reverse(dsflat)
        ok_(isinstance(revds, Dataset))
        ok_(isinstance(revds.samples, myarray))
        assert_array_equal(revds.samples, data)
        assert_array_equal(revds.fa.awesome, fattr)
        assert_true(isinstance(revds.fa['awesome'], ArrayCollectable))
        assert_false('voxel' in revds.fa)
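
A short sketch of the maxdims behaviour tested at the top of the example, assuming mvpa2: only the first two feature axes are collapsed, the trailing axis is preserved.

import numpy as np
from mvpa2.mappers.flatten import FlattenMapper

data = np.arange(64).reshape(4, 2, 2, 4)  # 4 samples of shape (2, 2, 4)
fm = FlattenMapper(maxdims=2)
fm.train(data)
print(fm.forward(data).shape)             # (4, 4, 4): (2, 2) collapsed into 4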
Example #5
def test_chainmapper():
    # the chain needs at least one mapper
    assert_raises(ValueError, ChainMapper, [])
    # a typical first mapper is to flatten
    cm = ChainMapper([FlattenMapper()])

    # few container checks
    assert_equal(len(cm), 1)
    assert_true(isinstance(cm[0], FlattenMapper))

    # now training
    # come up with data
    samples_shape = (2, 2, 4)
    data_shape = (4,) + samples_shape
    data = np.arange(np.prod(data_shape)).reshape(data_shape)
    target = [[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
              [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31],
              [32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47],
              [48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]]
    target = np.array(target)

    # if it is not trained it knows nothing
    cm.train(data)

    # a new mapper should appear when doing feature selection
    cm.append(StaticFeatureSelection(list(range(1, 16))))
    assert_equal(cm.forward1(data[0]).shape, (15,))
    assert_equal(len(cm), 2)
    # multiple slicing
    cm.append(StaticFeatureSelection([9, 14]))
    assert_equal(cm.forward1(data[0]).shape, (2,))
    assert_equal(len(cm), 3)

    # check reproduction
    if __debug__:
        # debug mode needs special test as it enhances the repr output
        # with module info and id() appendix for objects
        import mvpa2
        cm_clone = eval(repr(cm))
        assert_equal('#'.join(repr(cm_clone).split('#')[:-1]),
                     '#'.join(repr(cm).split('#')[:-1]))
    else:
        cm_clone = eval(repr(cm))
        assert_equal(repr(cm_clone), repr(cm))

    # what happens if we retrain the whole beast on the same data as before
    cm.train(data)
    assert_equal(cm.forward1(data[0]).shape, (2,))
    assert_equal(len(cm), 3)

    # let's map something
    mdata = cm.forward(data)
    assert_array_equal(mdata, target[:, [10, 15]])
    # and back
    rdata = cm.reverse(mdata)
    # original shape
    assert_equal(rdata.shape, data.shape)
    # content as far it could be restored
    assert_array_equal(rdata[rdata > 0], data[rdata > 0])
    assert_equal(np.sum(rdata > 0), 8)

    # Lets construct a dataset with mapper assigned and see
    # if sub-selecting a feature adjusts trailing StaticFeatureSelection
    # appropriately
    ds_subsel = Dataset.from_wizard(data, mapper=cm)[:, 1]
    tail_sfs = ds_subsel.a.mapper[-1]
    assert_equal(repr(tail_sfs), 'StaticFeatureSelection(slicearg=array([14]))')
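
A minimal sketch of the flatten-then-select chain exercised above, assuming mvpa2: a FlattenMapper feeds a StaticFeatureSelection that keeps two of the flattened features.

import numpy as np
from mvpa2.mappers.base import ChainMapper
from mvpa2.mappers.flatten import FlattenMapper
from mvpa2.featsel.base import StaticFeatureSelection

data = np.arange(64).reshape(4, 2, 2, 4)    # 4 samples of shape (2, 2, 4)
cm = ChainMapper([FlattenMapper()])
cm.train(data)
cm.append(StaticFeatureSelection([9, 14]))  # keep 2 of the 16 flat features
print(cm.forward1(data[0]).shape)           # (2,)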
Example #6
def _extract_boxcar_events(ds,
                           events=None,
                           time_attr=None,
                           match='prev',
                           eprefix='event',
                           event_mapper=None):
    """see eventrelated_dataset() for docs"""
    # relabel argument
    conv_strategy = {
        'prev': 'floor',
        'next': 'ceil',
        'closest': 'round'
    }[match]

    if not time_attr is None:
        tvec = ds.sa[time_attr].value
        # we are asked to convert onset time into sample ids
        descr_events = []
        for ev in events:
            # do not mess with the input data
            ev = copy.deepcopy(ev)
            # best matching sample
            idx = value2idx(ev['onset'], tvec, conv_strategy)
            # store offset of sample time and real onset
            ev['orig_offset'] = ev['onset'] - tvec[idx]
            # rescue the real onset into a new attribute
            ev['orig_onset'] = ev['onset']
            ev['orig_duration'] = ev['duration']
            # figure out how many samples we need
            ev['duration'] = \
                    len(tvec[idx:][tvec[idx:] < ev['onset'] + ev['duration']])
            # new onset is sample index
            ev['onset'] = idx
            descr_events.append(ev)
    else:
        descr_events = events
    # convert the event specs into the format expected by BoxcarMapper
    # take the first event as an example of contained keys
    evvars = _events2dict(descr_events)
    # checks
    for p in ['onset', 'duration']:
        if not p in evvars:
            raise ValueError("'%s' is a required property for all events." % p)
    boxlength = max(evvars['duration'])
    if __debug__:
        if not max(evvars['duration']) == min(evvars['duration']):
            warning('Boxcar mapper will use maximum boxlength (%i) of all '
                    'provided Events.' % boxlength)

    # finally create, train and use the boxcar mapper
    bcm = BoxcarMapper(evvars['onset'], boxlength, space=eprefix)
    bcm.train(ds)
    ds = ds.get_mapped(bcm)
    if event_mapper is None:
        # at last reflatten the dataset
        # could we add some meaningful attribute during this mapping, i.e. would
        # assigning 'inspace' do something good?
        ds = ds.get_mapped(FlattenMapper(shape=ds.samples.shape[1:]))
    else:
        ds = ds.get_mapped(event_mapper)
    # add samples attributes for the events, simply dump everything as a samples
    # attribute
    # special case onset and duration in case of conversion into discrete time
    if not time_attr is None:
        for attr in ('onset', 'duration'):
            evvars[attr] = [e[attr] for e in events]
    ds = _evvars2ds(ds, evvars, eprefix)

    return ds
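
A rough sketch of what the helper above assembles internally, assuming mvpa2: a BoxcarMapper cuts out event windows and a FlattenMapper turns each (boxlength x nfeatures) window into one flat event sample.

import numpy as np
from mvpa2.datasets import Dataset
from mvpa2.mappers.boxcar import BoxcarMapper
from mvpa2.mappers.flatten import FlattenMapper

ds = Dataset(np.random.randn(10, 25))
bcm = BoxcarMapper([2, 4], 4, space='event')  # onsets 2 and 4, boxlength 4
bcm.train(ds)
ds_ev = ds.get_mapped(bcm)                    # 2 samples of shape (4, 25)
ds_ev = ds_ev.get_mapped(FlattenMapper(shape=ds_ev.samples.shape[1:]))
print(ds_ev.shape)                            # (2, 100)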
Example #7
def test_flatten():
    samples_shape = (2, 2, 4)
    data_shape = (4,) + samples_shape
    data = np.arange(np.prod(data_shape)).reshape(data_shape).view(myarray)
    pristinedata = data.copy()
    target = [[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
              [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31],
              [32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47],
              [48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]]
    target = np.array(target).view(myarray)
    index_target = np.array([[0, 0, 0], [0, 0, 1], [0, 0, 2], [0, 0, 3],
                            [0, 1, 0], [0, 1, 1], [0, 1, 2], [0, 1, 3],
                            [1, 0, 0], [1, 0, 1], [1, 0, 2], [1, 0, 3],
                            [1, 1, 0], [1, 1, 1], [1, 1, 2], [1, 1, 3]])

    # test only flattening the first two dimensions
    fm_max = FlattenMapper(maxdims=2)
    fm_max.train(data)
    assert_equal(fm_max(data).shape, (4, 4, 4))

    # array subclass survives
    ok_(isinstance(data, myarray))

    # actually, there should be no difference between a plain FlattenMapper and
    # a chain that only has a FlattenMapper as the one element
    for fm in [FlattenMapper(space='voxel'),
               ChainMapper([FlattenMapper(space='voxel'),
                            StaticFeatureSelection(slice(None))])]:
        # not working if untrained
        assert_raises(RuntimeError,
                      fm.forward1,
                      np.arange(np.sum(samples_shape) + 1))

        fm.train(data)

        ok_(isinstance(fm.forward(data), myarray))
        ok_(isinstance(fm.forward1(data[2]), myarray))
        assert_array_equal(fm.forward(data), target)
        assert_array_equal(fm.forward1(data[2]), target[2])
        assert_raises(ValueError, fm.forward, np.arange(4))

        # all of that leaves that data unmodified
        assert_array_equal(data, pristinedata)

        # reverse mapping
        ok_(isinstance(fm.reverse(target), myarray))
        ok_(isinstance(fm.reverse1(target[0]), myarray))
        ok_(isinstance(fm.reverse(target[1:2]), myarray))
        assert_array_equal(fm.reverse(target), data)
        assert_array_equal(fm.reverse1(target[0]), data[0])
        assert_array_equal(fm.reverse(target[1:2]), data[1:2])
        assert_raises(ValueError, fm.reverse, np.arange(14))

        # check one dimensional data, treated as scalar samples
        oned = np.arange(5)
        fm.train(Dataset(oned))
        # needs 2D
        assert_raises(ValueError, fm.forward, oned)
        # doesn't match mapper, since Dataset turns `oned` into (5,1)
        assert_raises(ValueError, fm.forward, oned)
        assert_equal(Dataset(oned).nfeatures, 1)

        # try dataset mode, with some feature attribute
        fattr = np.arange(np.prod(samples_shape)).reshape(samples_shape)
        ds = Dataset(data, fa={'awesome': fattr.copy()})
        assert_equal(ds.samples.shape, data_shape)
        fm.train(ds)
        dsflat = fm.forward(ds)
        ok_(isinstance(dsflat, Dataset))
        ok_(isinstance(dsflat.samples, myarray))
        assert_array_equal(dsflat.samples, target)
        assert_array_equal(dsflat.fa.awesome, np.arange(np.prod(samples_shape)))
        assert_true(isinstance(dsflat.fa['awesome'], ArrayCollectable))
        # test index creation
        assert_array_equal(index_target, dsflat.fa.voxel)

        # and back
        revds = fm.reverse(dsflat)
        ok_(isinstance(revds, Dataset))
        ok_(isinstance(revds.samples, myarray))
        assert_array_equal(revds.samples, data)
        assert_array_equal(revds.fa.awesome, fattr)
        assert_true(isinstance(revds.fa['awesome'], ArrayCollectable))
        assert_false('voxel' in revds.fa)
Example #8
def extract_boxcar_event_samples(ds,
                                 events=None,
                                 time_attr=None,
                                 match='prev',
                                 event_offset=None,
                                 event_duration=None,
                                 eprefix='event',
                                 event_mapper=None):
    """Segment a dataset by extracting boxcar events

    (Multiple) consecutive samples are extracted for each event, and are either
    returned in a flattened shape, or subject to further processing.

    Events are specified as a list of dictionaries (see
    :class:`~mvpa2.misc.support.Event` for a helper class). Each dictionary
    contains all relevant attributes to describe an event. This is at least the
    ``onset`` time of an event, but can also comprise ``duration``,
    ``amplitude``, and arbitrary other attributes.

    Boxcar event model details
    --------------------------

    For each event all samples covering that particular event are used to form
    a corresponding sample. One sample for each event is returned. Event
    specification dictionaries must contain an ``onset`` attribute (as sample
    index in the input dataset), ``duration`` (as number of consecutive samples
    after the onset). Any number of additional attributes can be present in an
    event specification. Those attributes are included as sample attributes in
    the returned dataset.

    Alternatively, ``onset`` and ``duration`` may also be given in a
    non-discrete time specification. In this case a dataset attribute needs to
    be specified that contains time-stamps for each input data sample, and is
    used to convert times into discrete sample indices (see ``match``
    argument).

    A mapper instance can be provided (see ``event_mapper``) to implement
    further processing of each event sample, for example in order to yield
    average samples.

    Parameters
    ----------
    ds : Dataset
      The samples of this input dataset have to be in whatever ascending order.
    events : list
      Each event definition has to specify ``onset`` and ``duration``. All
      other attributes will be passed on to the sample attributes collection of
      the returned dataset.
    time_attr : str or None
      Attribute with dataset sample time-stamps.
      If not None, the ``onset`` and ``duration`` specs
      from the event list will be converted using information from this sample
      attribute. Its values will be treated as in-the-same-unit and are used to
      determine corresponding samples from real-value onset and duration
      definitions.
      For HRF modeling this argument is mandatory.
    match : {'prev', 'next', 'closest'}
      Strategy used to match real-value onsets to sample
      indices. 'prev' chooses the closest preceding sample, 'next' the closest
      following sample, and 'closest' the absolute closest sample.
    event_offset : None or float
      If not None, all event ``onset`` specifications will be offset by this
      value before boxcar modeling is performed.
    event_duration : None or float
      If not None, all event ``duration`` specifications will be set to this
      value before boxcar modeling is done.
    eprefix : str or None
      If not None, this prefix is used to name additional
      attributes generated by the underlying
      `~mvpa2.mappers.boxcar.BoxcarMapper`. If it is set to None, no additional
      attributes will be created.
    event_mapper : Mapper
      This mapper is used to forward-map the dataset containing the boxcar event
      samples. If None (default) a FlattenMapper is employed to convert
      multi-dimensional sample matrices into simple one-dimensional sample
      vectors. This option can be used to implement temporal compression, by
      e.g. averaging samples within an event boxcar using an FxMapper. Any
      mapper needs to keep the sample axis unchanged, i.e. number and order of
      samples remain the same.

    Returns
    -------
    Dataset
      One sample per each event definition that has been passed to the
      function. Additional event attributes are included as sample attributes.

    Examples
    --------
    The documentation also contains an :ref:`example script
    <example_eventrelated>` showing a spatio-temporal analysis of fMRI data
    that involves this function.

    >>> from mvpa2.datasets import Dataset
    >>> ds = Dataset(np.random.randn(10, 25))
    >>> events = [{'onset': 2, 'duration': 4},
    ...           {'onset': 4, 'duration': 4}]
    >>> eds = eventrelated_dataset(ds, events)
    >>> len(eds)
    2
    >>> eds.nfeatures == ds.nfeatures * 4
    True
    >>> 'mapper' in ds.a
    False
    >>> print eds.a.mapper
    <Chain: <Boxcar: bl=4>-<Flatten>>

    And now the same conversion, but with events specified as real time. This is
    only possible if the input dataset contains a sample attribute with the
    necessary information about the input samples.

    >>> ds.sa['record_time'] = np.linspace(0, 5, len(ds))
    >>> rt_events = [{'onset': 1.05, 'duration': 2.2},
    ...              {'onset': 2.3, 'duration': 2.12}]
    >>> rt_eds = eventrelated_dataset(ds, rt_events, time_attr='record_time',
    ...                               match='closest')
    >>> np.all(eds.samples == rt_eds.samples)
    True
    >>> # returned dataset e.g. has info from original samples
    >>> rt_eds.sa.record_time
    array([[ 1.11111111,  1.66666667,  2.22222222,  2.77777778],
           [ 2.22222222,  2.77777778,  3.33333333,  3.88888889]])
    """
    # relabel argument
    conv_strategy = {
        'prev': 'floor',
        'next': 'ceil',
        'closest': 'round'
    }[match]

    if not (event_offset is None and event_duration is None):
        descr_events = []
        for ev in events:
            # do not mess with the input data
            ev = copy.deepcopy(ev)
            if event_offset is not None:
                ev['onset'] += event_offset
            if event_duration is not None:
                ev['duration'] = event_duration
            descr_events.append(ev)
        events = descr_events

    if time_attr is not None:
        tvec = ds.sa[time_attr].value
        # we are asked to convert onset time into sample ids
        descr_events = []
        for ev in events:
            # do not mess with the input data
            ev = copy.deepcopy(ev)
            # best matching sample
            idx = value2idx(ev['onset'], tvec, conv_strategy)
            # store offset of sample time and real onset
            ev['orig_offset'] = ev['onset'] - tvec[idx]
            # rescue the real onset into a new attribute
            ev['orig_onset'] = ev['onset']
            ev['orig_duration'] = ev['duration']
            # figure out how many samples we need
            ev['duration'] = \
                    len(tvec[idx:][tvec[idx:] < ev['onset'] + ev['duration']])
            # new onset is sample index
            ev['onset'] = idx
            descr_events.append(ev)
    else:
        descr_events = events
    # convert the event specs into the format expected by BoxcarMapper
    # take the first event as an example of contained keys
    evvars = _events2dict(descr_events)
    # checks
    for p in ['onset', 'duration']:
        if not p in evvars:
            raise ValueError("'%s' is a required property for all events." % p)
    boxlength = max(evvars['duration'])
    if __debug__:
        if not max(evvars['duration']) == min(evvars['duration']):
            warning('Boxcar mapper will use maximum boxlength (%i) of all '
                    'provided Events.' % boxlength)

    # finally create, train and use the boxcar mapper
    bcm = BoxcarMapper(evvars['onset'], boxlength, space=eprefix)
    bcm.train(ds)
    ds = ds.get_mapped(bcm)
    if event_mapper is None:
        # at last reflatten the dataset
        # could we add some meaningful attribute during this mapping, i.e. would
        # assigning 'inspace' do something good?
        ds = ds.get_mapped(FlattenMapper(shape=ds.samples.shape[1:]))
    else:
        ds = ds.get_mapped(event_mapper)
    # add samples attributes for the events, simply dump everything as a samples
    # attribute
    # special case onset and duration in case of conversion into discrete time
    if time_attr is not None:
        for attr in ('onset', 'duration'):
            evvars[attr] = [e[attr] for e in events]
    ds = _evvars2ds(ds, evvars, eprefix)

    return ds
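
A usage sketch for the function above (the import path is assumed from PyMVPA's layout): every event's duration is overridden via event_duration before boxcar modeling.

import numpy as np
from mvpa2.datasets import Dataset
from mvpa2.datasets.eventrelated import extract_boxcar_event_samples  # assumed path

ds = Dataset(np.random.randn(10, 25))
events = [{'onset': 2, 'duration': 1}, {'onset': 5, 'duration': 1}]
eds = extract_boxcar_event_samples(ds, events=events, event_duration=3)
print(len(eds), eds.nfeatures)  # 2 event samples, 3 * 25 = 75 features each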
Example #9
def test_datasetmapping():
    # 6 samples, 4X2 features
    data = np.arange(48).reshape(6, 4, 2)
    ds = Dataset(data,
                 sa={
                     'timepoints': np.arange(6),
                     'multidim': data.copy()
                 },
                 fa={'fid': np.arange(4)})
    # with overlapping and non-overlapping boxcars
    startpoints = [0, 1, 4]
    boxlength = 2
    bm = BoxcarMapper(startpoints, boxlength, space='boxy')
    # train is critical
    bm.train(ds)
    mds = bm.forward(ds)
    assert_equal(len(mds), len(startpoints))
    assert_equal(mds.nfeatures, boxlength)
    # all samples attributes remain, but they can be rotated/compressed into
    # multidimensional attributes
    assert_equal(sorted(mds.sa.keys()),
                 ['boxy_onsetidx'] + sorted(ds.sa.keys()))
    assert_equal(mds.sa.multidim.shape,
                 (len(startpoints), boxlength) + ds.shape[1:])
    assert_equal(mds.sa.timepoints.shape, (len(startpoints), boxlength))
    assert_array_equal(mds.sa.timepoints.flatten(),
                       np.array([(s, s + 1) for s in startpoints]).flatten())
    assert_array_equal(mds.sa.boxy_onsetidx, startpoints)
    # feature attributes also get rotated and broadcasted
    assert_array_equal(mds.fa.fid, [ds.fa.fid, ds.fa.fid])
    # and finally there is a new one
    assert_array_equal(mds.fa.boxy_offsetidx, list(range(boxlength)))

    # now see how it works on reverse()
    rds = bm.reverse(mds)
    # we got at least something of all original attributes back
    assert_equal(sorted(rds.sa.keys()), sorted(ds.sa.keys()))
    assert_equal(sorted(rds.fa.keys()), sorted(ds.fa.keys()))
    # it is not possible to reconstruct the full samples array
    # some samples even might show up multiple times (when there are overlapping
    # boxcars)
    assert_array_equal(
        rds.samples,
        np.array([[[0, 1], [2, 3], [4, 5], [6, 7]],
                  [[8, 9], [10, 11], [12, 13], [14, 15]],
                  [[8, 9], [10, 11], [12, 13], [14, 15]],
                  [[16, 17], [18, 19], [20, 21], [22, 23]],
                  [[32, 33], [34, 35], [36, 37], [38, 39]],
                  [[40, 41], [42, 43], [44, 45], [46, 47]]]))
    assert_array_equal(rds.sa.timepoints, [0, 1, 1, 2, 4, 5])
    assert_array_equal(rds.sa.multidim, ds.sa.multidim[rds.sa.timepoints])
    # but feature attributes should be fully recovered
    assert_array_equal(rds.fa.fid, ds.fa.fid)

    # popular dataset configuration (double flatten + boxcar)
    cm = ChainMapper([FlattenMapper(), bm, FlattenMapper()])
    cm.train(ds)
    bflat = ds.get_mapped(cm)
    assert_equal(bflat.shape,
                 (len(startpoints), boxlength * np.prod(ds.shape[1:])))
    # add attributes
    bflat.fa['testfa'] = np.arange(bflat.nfeatures)
    bflat.sa['testsa'] = np.arange(bflat.nsamples)
    # now try to go back
    bflatrev = bflat.mapper.reverse(bflat)
    # data should be same again, as far as the boxcars match
    assert_array_equal(ds.samples[:2], bflatrev.samples[:2])
    assert_array_equal(ds.samples[-2:], bflatrev.samples[-2:])
    # feature axis should match
    assert_equal(ds.shape[1:], bflatrev.shape[1:])
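
A compact sketch of the "double flatten + boxcar" configuration from the end of the test above, assuming mvpa2: flatten the feature space, cut boxcars, then flatten the boxcar axis as well.

import numpy as np
from mvpa2.datasets import Dataset
from mvpa2.mappers.base import ChainMapper
from mvpa2.mappers.boxcar import BoxcarMapper
from mvpa2.mappers.flatten import FlattenMapper

ds = Dataset(np.arange(48).reshape(6, 4, 2))
cm = ChainMapper([FlattenMapper(), BoxcarMapper([0, 1, 4], 2), FlattenMapper()])
cm.train(ds)
print(ds.get_mapped(cm).shape)  # (3, 16): 3 boxcars x (2 x 4 x 2) features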
Example #10
def fmri_dataset(
    samples,
    targets=None,
    chunks=None,
    mask=None,
    sprefix='voxel',
    tprefix='time',
    add_fa=None,
):
    """Create a dataset from an fMRI timeseries image.

    The timeseries image serves as the samples data, with each volume becoming
    a sample. All 3D volume samples are flattened into one-dimensional feature
    vectors, optionally being masked (i.e. subset of voxels corresponding to
    non-zero elements in a mask image).

    In addition to (optional) samples attributes for targets and chunks the
    returned dataset contains a number of additional attributes:

    Samples attributes (per each volume):

      * volume index (time_indices)
      * volume acquisition time (time_coords)

    Feature attributes (per each voxel):

      * voxel indices (voxel_indices), sometimes referred to as ijk

    Dataset attributes:

      * dump of the image (e.g. NIfTI) header data (imghdr)
      * class of the image (e.g. Nifti1Image) (imgtype)
      * volume extent (voxel_dim)
      * voxel extent (voxel_eldim)

    The default attribute names are listed in parentheses, but may be altered by
    the corresponding prefix arguments. The validity of the attribute values
    relies on correct settings in the NIfTI image header.

    Parameters
    ----------
    samples : str or NiftiImage or list
      fMRI timeseries, specified either as a filename (single file 4D image),
      an image instance (4D image), or a list of filenames or image instances
      (each list item corresponding to a 3D volume).
    targets : scalar or sequence
      Label attribute for each volume in the timeseries, or a scalar value that
      is assigned to all samples.
    chunks : scalar or sequence
      Chunk attribute for each volume in the timeseries, or a scalar value that
      is assigned to all samples.
    mask : str or NiftiImage
      Filename or image instance of a 3D volume mask. Voxels corresponding to
      non-zero elements in the mask will be selected. The mask has to be in the
      same space (orientation and dimensions) as the timeseries image.
    sprefix : str or None
      Prefix for attribute names describing spatial properties of the
      timeseries. If None, no such attributes are stored in the dataset.
    tprefix : str or None
      Prefix for attribute names describing temporal properties of the
      timeseries. If None, no such attributes are stored in the dataset.
    add_fa : dict or None
      Optional dictionary with additional volumetric data that shall be stored
      as feature attributes in the dataset. The dictionary key serves as the
      feature attribute name. Each value might be of any type supported by the
      'mask' argument of this function.

    Returns
    -------
    Dataset
    """
    # load the samples
    imgdata, imghdr, img = _load_anyimg(samples, ensure=True, enforce_dim=4)

    # figure out what the mask is, but only handle known cases, the rest
    # goes directly into the mapper which maybe knows more
    maskimg = _load_anyimg(mask)
    if maskimg is not None:
        # take just data and ignore the header
        mask = maskimg[0]

    # compile the samples attributes
    sa = {}
    if targets is not None:
        sa['targets'] = _expand_attribute(targets, imgdata.shape[0], 'targets')
    if chunks is not None:
        sa['chunks'] = _expand_attribute(chunks, imgdata.shape[0], 'chunks')

    # create a dataset
    ds = Dataset(imgdata, sa=sa)
    if sprefix is None:
        space = None
    else:
        space = sprefix + '_indices'
    ds = ds.get_mapped(FlattenMapper(shape=imgdata.shape[1:], space=space))

    # now apply the mask if any
    if mask is not None:
        flatmask = ds.a.mapper.forward1(mask)
        # direct slicing is possible, and it is potentially more efficient,
        # so let's use it
        #mapper = StaticFeatureSelection(flatmask)
        #ds = ds.get_mapped(StaticFeatureSelection(flatmask))
        ds = ds[:, flatmask != 0]

    # load and store additional feature attributes
    if add_fa is not None:
        for fattr in add_fa:
            value = _load_anyimg(add_fa[fattr], ensure=True)[0]
            ds.fa[fattr] = ds.a.mapper.forward1(value)

    # store interesting NIfTI props in the dataset in a more portable way
    ds.a['imgaffine'] = img.affine
    ds.a['imgtype'] = img.__class__.__name__
    # stick the header instance in as is, and ...
    ds.a['imghdr'] = imghdr
    # ... let strip_nibabel() be the central place to take care of any header
    # conversion into non-NiBabel dtypes
    strip_nibabel(ds)

    # If there is a space assigned, store the extent of that space
    if sprefix is not None:
        ds.a[sprefix + '_dim'] = imgdata.shape[1:]
        # 'voxdim' is (x,y,z) while 'samples' are (t,z,y,x)
        ds.a[sprefix + '_eldim'] = _get_voxdim(imghdr)
        # TODO extend with the unit
    if tprefix is not None:
        ds.sa[tprefix + '_indices'] = np.arange(len(ds), dtype='int')
        ds.sa[tprefix + '_coords'] = \
            np.arange(len(ds), dtype='float') * _get_dt(imghdr)
        # TODO extend with the unit

    return ds
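
A hypothetical usage sketch, assuming mvpa2 with nibabel and placeholder file names that would exist on disk: each volume becomes one flat sample, restricted to voxels inside the mask.

from mvpa2.datasets.mri import fmri_dataset

ds = fmri_dataset('bold.nii.gz',             # placeholder 4D timeseries image
                  targets='rest',
                  chunks=0,
                  mask='brain_mask.nii.gz')  # placeholder 3D mask image
print(ds.shape)                 # (n_volumes, n_voxels_in_mask)
print(ds.fa.voxel_indices[:3])  # ijk coordinates of the first three voxels
print(ds.sa.time_coords[:3])    # acquisition time of the first three volumes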
Example #11
def eventrelated_dataset(ds,
                         events=None,
                         time_attr=None,
                         match='prev',
                         eprefix='event'):
    """Segment a dataset into a set of events.

    This function can be used to extract event-related samples from any
    time-series based dataset (actually, it doesn't have to be a time series, but
    could also be any other type of ordered samples). Boxcar-shaped event
    samples, potentially spanning multiple input samples can be automatically
    extracted using :class:`~mvpa2.misc.support.Event` definition lists.  For
    each event all samples covering that particular event are used to form the
    corresponding sample.

    An event definition is a dictionary that contains ``onset`` (as sample index
    in the input dataset), ``duration`` (as number of consecutive samples after
    the onset), as well as an arbitrary number of additional attributes.

    Alternatively, ``onset`` and ``duration`` may also be given as real time
    stamps (or durations). In this case a samples attribute of the input dataset
    (specified via ``time_attr``) will be used to convert these into sample indices.

    Parameters
    ----------
    ds : Dataset
      The samples of this input dataset have to be in whatever ascending order.
    events : list
      Each event definition has to specify ``onset`` and ``duration``. All other
      attributes will be passed on to the sample attributes collection of the
      returned dataset.
    time_attr : str or None
      If not None, the ``onset`` and ``duration`` specs from the event list will
      be converted using information from this sample attribute. Its values will
      be treated as in-the-same-unit and are used to determine corresponding
      samples from real-value onset and duration definitions.
    match : {'prev', 'next', 'closest'}
      Strategy used to match real-value onsets to sample indices. 'prev' chooses
      the closest preceding sample, 'next' the closest following sample, and
      'closest' the absolute closest sample.
    eprefix : str or None
      If not None, this prefix is used to name additional attributes generated
      by the underlying `~mvpa2.mappers.boxcar.BoxcarMapper`. If it is set to
      None, no additional attributes will be created.

    Returns
    -------
    Dataset
      The returned dataset has one sample per each event definition that has
      been passed to the function.

    Examples
    --------
    The documentation also contains an :ref:`example script
    <example_eventrelated>` showing a spatio-temporal analysis of fMRI data
    that involves this function.

    >>> from mvpa2.datasets import Dataset
    >>> ds = Dataset(np.random.randn(10, 25))
    >>> events = [{'onset': 2, 'duration': 4},
    ...           {'onset': 4, 'duration': 4}]
    >>> eds = eventrelated_dataset(ds, events)
    >>> len(eds)
    2
    >>> eds.nfeatures == ds.nfeatures * 4
    True
    >>> 'mapper' in ds.a
    False
    >>> print eds.a.mapper
    <Chain: <Boxcar: bl=4>-<Flatten>>

    And now the same conversion, but with events specified as real time. This is
    only possible if the input dataset contains a sample attribute with the
    necessary information about the input samples.

    >>> ds.sa['record_time'] = np.linspace(0, 5, len(ds))
    >>> rt_events = [{'onset': 1.05, 'duration': 2.2},
    ...              {'onset': 2.3, 'duration': 2.12}]
    >>> rt_eds = eventrelated_dataset(ds, rt_events, time_attr='record_time',
    ...                               match='closest')
    >>> np.all(eds.samples == rt_eds.samples)
    True
    >>> # returned dataset e.g. has info from original samples
    >>> rt_eds.sa.record_time
    array([[ 1.11111111,  1.66666667,  2.22222222,  2.77777778],
           [ 2.22222222,  2.77777778,  3.33333333,  3.88888889]])
    """
    # relabel argument
    conv_strategy = {
        'prev': 'floor',
        'next': 'ceil',
        'closest': 'round'
    }[match]

    if not time_attr is None:
        tvec = ds.sa[time_attr].value
        # we are asked to convert onset time into sample ids
        descr_events = []
        for ev in events:
            # do not mess with the input data
            ev = copy.deepcopy(ev)
            # best matching sample
            idx = value2idx(ev['onset'], tvec, conv_strategy)
            # store offset of sample time and real onset
            ev['orig_offset'] = ev['onset'] - tvec[idx]
            # rescue the real onset into a new attribute
            ev['orig_onset'] = ev['onset']
            ev['orig_duration'] = ev['duration']
            # figure out how many samples we need
            ev['duration'] = \
                    len(tvec[idx:][tvec[idx:] < ev['onset'] + ev['duration']])
            # new onset is sample index
            ev['onset'] = idx
            descr_events.append(ev)
    else:
        descr_events = events
    # convert the event specs into the format expected by BoxcarMapper
    # take the first event as an example of contained keys
    evvars = {}
    for k in descr_events[0]:
        try:
            evvars[k] = [e[k] for e in descr_events]
        except KeyError:
            raise ValueError("Each event property must be present for all "
                             "events (could not find '%s')" % k)
    # checks
    for p in ['onset', 'duration']:
        if not p in evvars:
            raise ValueError("'%s' is a required property for all events." % p)
    boxlength = max(evvars['duration'])
    if __debug__:
        if not max(evvars['duration']) == min(evvars['duration']):
            warning('Boxcar mapper will use maximum boxlength (%i) of all '
                    'provided Events.' % boxlength)

    # finally create, train and use the boxcar mapper
    bcm = BoxcarMapper(evvars['onset'], boxlength, space=eprefix)
    bcm.train(ds)
    ds = ds.get_mapped(bcm)
    # at last reflatten the dataset
    # could we add some meaningful attribute during this mapping, i.e. would
    # assigning 'inspace' do something good?
    ds = ds.get_mapped(FlattenMapper(shape=ds.samples.shape[1:]))
    # add samples attributes for the events, simply dump everything as a samples
    # attribute
    for a in evvars:
        if not eprefix is None and a in ds.sa:
            # if there is already a samples attribute like this, it got mapped
            # by BoxcarMapper (i.e. is multi-dimensional). We move it aside
            # under new `eprefix` name
            ds.sa[eprefix + '_' + a] = ds.sa[a]
        if a in ['onset', 'duration']:
            # special case: we want the non-discrete, original onset and
            # duration
            if not time_attr is None:
                # but only if there was a conversion happening, since otherwise
                # we get the same info from BoxcarMapper
                ds.sa[a] = [e[a] for e in events]
        else:
            ds.sa[a] = evvars[a]
    return ds
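
A sketch continuing the doctest above, assuming mvpa2: the chain mapper stored in eds.a.mapper ends with the FlattenMapper, so a flat event sample can be reversed back into its (boxlength x features) window.

import numpy as np
from mvpa2.datasets import Dataset
from mvpa2.datasets.eventrelated import eventrelated_dataset

ds = Dataset(np.random.randn(10, 25))
eds = eventrelated_dataset(ds, [{'onset': 2, 'duration': 4}])
window = eds.a.mapper[-1].reverse1(eds.samples[0])  # last mapper is the Flatten
print(window.shape)  # (4, 25): boxlength x original number of features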
Example #12
def simple_sim1(
        shape,
        dissims,
        rois_arrangement='circle',
        roi_neighborhood=Sphere(5),
        nruns=1,
        nsubjects=1,
        # noise components -- we just add normal for now also with
        # spatial smoothing to possibly create difference in noise
        # characteristics across different kinds
        #
        # "Instrumental noise" -- generic nuisance
        noise_independent_std=0.4,
        noise_independent_smooth=3.,
        # "Intrinsic signal", specific per each subject (due to
        # motion, whatever) -- might be fun for someone to cluster,
        # but irrelevant for us
        noise_subject_n=1,
        noise_subject_std=0.4,
        noise_subject_smooth=1.5,
        # "Intrinsic common signal" -- probably generalizes across
        # subjects and fun for someone studying veins to get those
        # reproducible clusters.  It will be mixed in also with
        # different weights per each run.
        # Again -- might be fun for someone to cluster, but not for us
        # since it would not be representative of the original signal
        noise_common_n=1,
        noise_common_std=0.4,
        noise_common_smooth=2.):
    """Simulate "data" containing similarity matrices with 3 noise
    components for multiple subjects

    Noise components are:

    - random normal noise, also spatially smoothed (should probably have a
      smaller smoothing sigma than the intrinsic noise)

    - intrinsic noise which is composed from a set of random fields,
      generated by random normal noise with subsequent spatial filtering,
      which are then mixed into each run's data with random weights.  They
      are to simulate subject-specific intrinsic signals such as artifacts
      due to motion, or possible subject-specific physiological processes

    - intrinsic noise common across subjects (e.g. all of them have similar
      blood distribution networks and other physiological parameters, and
      some intrinsic networks, which although similar in space would have
      different mix-in coefficients across subjects/runs)

    Theoretically, decomposition methods (such as ICA, PCA, etc) should help to
    identify such common noise components and filter them out.  Also methods
    which iteratively remove non-informative projections (such as GLMdenoise)
    should be effective at identifying those mix-ins.

    TODO: now mix-in happens with purely normal random weights,  ideally we
    should color those as well
    """
    ndissims = len(dissims)

    # first we fisher transform so we can add normal noise
    # check first that we don't have extreme values that might give infinity
    dissims = np.array(dissims)
    dissims = 1. - dissims
    dissims[dissims == 1] = 0.99
    dissims[dissims == -1] = -0.99
    # fisher
    dissims = np.arctanh(dissims)

    # generate target clean "picture"
    d = np.asanyarray(dissims[0])
    signal_clean = np.zeros(shape + (len(vector_form(d)), ))

    # generate ground truth for clustering
    cluster_truth = np.zeros(shape, dtype='int')

    if rois_arrangement == 'circle':
        radius = min(shape[:2]) / 4.
        center = np.array((radius * 2, ) * len(shape)).astype(int)
        # arrange at quarter distance from center
        for i, dissim in enumerate(dissims):
            dissim = vector_form(dissim)
            # that is kinda boring -- the same dissimilarity to each
            # voxel???
            #
            # TODO: come up with a better arrangement/idea, e.g. to
            # generate an MVPA pattern which would satisfy the
            # dissimilarity (not exactly but at least close).  That
            # would make more sense
            roi_center = center.copy()
            roi_center[0] += int(radius * np.cos(2 * np.pi * i / ndissims))
            roi_center[1] += int(radius * np.sin(2 * np.pi * i / ndissims))
            for coords in roi_neighborhood(roi_center):
                acoords = np.asanyarray(coords)
                if np.all(acoords >= [0]*len(coords)) and \
                   np.all(acoords < signal_clean.shape[:len(coords)]):
                    signal_clean.__setitem__(coords, dissim)
                    cluster_truth.__setitem__(coords, i + 1)
    else:
        raise ValueError("I know only circle")

    # generated randomly and will be mixed into subjects with different weights
    # TODO: static across runs within subject??  if so -- would be no different
    #       from having RSAs?
    common_noises = get_intrinsic_noises(signal_clean.shape,
                                         std=noise_common_std,
                                         sigma=noise_common_smooth,
                                         n=noise_common_n)
    assert common_noises[0].ndim == 3, "There should be no time comp"

    # Now lets generate per subject and per run data by adding some noise(s)
    # all_signals = []
    dss = []
    for isubject in xrange(nsubjects):
        # Interesting noise, simulating some underlying process which has nothing
        # to do with original design/similarity but having spatial structure which
        # repeats through runs with random weights (consider it to be a principal component)

        # generated randomly for each subject separately, but they should have
        # common structure across runs
        subj_specific_noises = get_intrinsic_noises(signal_clean.shape,
                                                    std=noise_subject_std,
                                                    sigma=noise_subject_smooth,
                                                    n=noise_subject_n)
        assert subj_specific_noises[
            0].ndim == 3, "There should be no time comp"
        # subject_signals = []
        dss_subject = []
        subj_common_noises = [
            noise * np.random.normal() for noise in common_noises
        ]

        subj_specific_mixins = generate_mixins(nruns)
        subj_common_mixins = generate_mixins(nruns)

        for run in range(nruns):
            signal_run = signal_clean.copy()
            for noise in subj_specific_noises:
                signal_run += noise * subj_specific_mixins[run]
            for noise in subj_common_noises:
                signal_run += noise * subj_common_mixins[run]
            # generic noise -- no common structure across subjects/runs
            signal_run += filter_each_2d(
                np.random.normal(size=signal_clean.shape) *
                noise_independent_std, noise_independent_smooth)

            # go back to correlations with inverse of fisher
            signal_run = np.tanh(signal_run)
            # rollaxis to bring similarities into leading dimension
            ds = Dataset(np.rollaxis(signal_run, 2, 0))
            ds.sa['chunks'] = [run]
            ds.sa['dissimilarity'] = np.arange(len(dissim))  # Lame one for now
            ds_flat = ds.get_mapped(
                FlattenMapper(shape=ds.shape[1:], space='pixel_indices'))
            dss_subject.append(ds_flat)
            #subject_signals.append(signal_run)
        #all_signals.append(subject_signals)
        ds = dsvstack(dss_subject)
        ds.a['mapper'] = dss_subject[
            0].a.mapper  # .a are not transferred by vstack
        dss.append(ds)

    # Instrumental noise -- the most banal
    assert (len(dss) == nsubjects)
    assert (len(dss[0]) == nruns * len(dissim))

    return np.tanh(signal_clean), cluster_truth, dss
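
A hypothetical call of the simulator defined above, assuming its helper functions (Sphere, vector_form, get_intrinsic_noises, generate_mixins, filter_each_2d, dsvstack) are importable from the same module: two small correlation-distance matrices, simulated for 2 subjects with 2 runs each.

import numpy as np

dissims = [1 - np.corrcoef(np.random.randn(4, 10)),
           1 - np.corrcoef(np.random.randn(4, 10))]
signal_clean, cluster_truth, dss = simple_sim1((32, 32), dissims,
                                               nruns=2, nsubjects=2)
print(len(dss), dss[0].shape)  # 2 subject datasets, each (nruns * n_pairs, 32 * 32)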