Example #1
    def _transform_multi_channel_data(self, X, y):
        # Data partitioning
        parted_X, parted_y = self._partition_data(
            X=X, y=y, partition_size=self.window_size)
        transposed_X = np.transpose(parted_X, [0, 2, 1])
        converted_X = np.reshape(transposed_X,
                                 (transposed_X.shape[0], transposed_X.shape[1],
                                  1, transposed_X.shape[2]))

        # Create view converter
        view_converter = DefaultViewConverter(shape=self.sample_shape,
                                              axes=('b', 0, 1, 'c'))

        # Convert data into a design matrix
        view_converted_X = view_converter.topo_view_to_design_mat(converted_X)
        assert np.all(converted_X == view_converter.design_mat_to_topo_view(
            view_converted_X))

        # Format the target into proper format
        sum_y = np.sum(parted_y, axis=1)
        sum_y[sum_y > 0] = 1
        one_hot_formatter = OneHotFormatter(max_labels=self.n_classes)
        hot_y = one_hot_formatter.format(sum_y)

        return view_converted_X, hot_y, view_converter
Example #2
class ConditionalGeneratorTestCase(unittest.TestCase):
    def setUp(self):
        self.noise_dim = 10
        self.num_labels = 10

        self.condition_dtype = 'uint8'
        self.condition_space = VectorSpace(dim=self.num_labels,
                                           dtype=self.condition_dtype)
        self.condition_formatter = OneHotFormatter(self.num_labels,
                                                   dtype=self.condition_dtype)
        self.condition_distribution = OneHotDistribution(self.condition_space)

        # TODO this nvis stuff is dirty. The ConditionalGenerator should handle it
        self.mlp_nvis = self.noise_dim + self.num_labels
        self.mlp_nout = 1

        # Set up model
        self.mlp = MLP(nvis=self.mlp_nvis,
                       layers=[Linear(self.mlp_nout, 'out', irange=0.1)])
        self.G = ConditionalGenerator(
            input_condition_space=self.condition_space,
            condition_distribution=self.condition_distribution,
            noise_dim=self.noise_dim,
            mlp=self.mlp)

    def test_conditional_generator_input_setup(self):
        """Check that conditional generator correctly sets up composite
        input layer."""

        # Feedforward: We want the net to ignore the noise and simply
        # convert the one-hot vector to a number
        weights = np.concatenate([
            np.zeros((self.mlp_nout, self.noise_dim)),
            np.array(range(self.num_labels)).reshape(
                (1, -1)).repeat(self.mlp_nout, axis=0)
        ],
                                 axis=1).T.astype(theano.config.floatX)
        self.mlp.layers[0].set_weights(weights)

        inp = (T.matrix(), T.matrix(dtype=self.condition_dtype))
        f = theano.function(inp, self.G.mlp.fprop(inp))

        assert_array_equal(
            f(
                np.random.rand(self.num_labels,
                               self.noise_dim).astype(theano.config.floatX),
                self.condition_formatter.format(
                    np.array(range(self.num_labels)))),
            np.array(range(self.num_labels)).reshape(self.num_labels, 1))

    def test_sample_noise(self):
        """Test barebones noise sampling."""

        n = T.iscalar()
        cond_inp = self.condition_distribution.sample(n)
        sample_and_noise = theano.function([n],
                                           self.G.sample_and_noise(
                                               cond_inp, all_g_layers=True)[1])

        print sample_and_noise(15)
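The weight construction in test_conditional_generator_input_setup above is easier to see in isolation: the block of zeros silences the noise inputs and the arange block carries the label index through, so the linear layer maps one_hot(i) to i regardless of the noise. A minimal standalone sketch with the same dimensions (not part of the original test):

# Standalone sketch of the weight matrix used in the test above.
import numpy as np

noise_dim, num_labels, n_out = 10, 10, 1
weights = np.concatenate([
    np.zeros((n_out, noise_dim)),                                 # ignore noise inputs
    np.arange(num_labels).reshape(1, -1).repeat(n_out, axis=0)    # pass label index through
], axis=1).T                                                      # shape (20, 1)

one_hot_3 = np.eye(num_labels)[3]
x = np.concatenate([np.random.rand(noise_dim), one_hot_3])
assert x.dot(weights)[0] == 3.0                                   # label 3 comes out as 3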
Example #3
def test_dtype_errors():
    # Try to call theano_expr with a bad label dtype.
    raised = False
    fmt = OneHotFormatter(max_labels=50)
    try:
        fmt.theano_expr(theano.tensor.vector(dtype=theano.config.floatX))
    except TypeError:
        raised = True
    assert raised

    # Try to call format with a bad label dtype.
    raised = False
    try:
        fmt.format(numpy.zeros(10, dtype='float64'))
    except TypeError:
        raised = True
    assert raised
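For contrast with the failure cases above, a minimal sketch of calls that OneHotFormatter accepts (integer label dtypes, both symbolic and NumPy; the import path assumes pylearn2's usual layout):

# Integer label dtypes are accepted both symbolically and on arrays.
import numpy
import theano
from pylearn2.format.target_format import OneHotFormatter

fmt = OneHotFormatter(max_labels=50)
symbolic_one_hot = fmt.theano_expr(theano.tensor.lvector())    # int64 labels are fine
dense_one_hot = fmt.format(numpy.zeros(10, dtype='int64'))     # shape (10, 50)
assert dense_one_hot.shape == (10, 50)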
Example #4
def test_dtype_errors():
    # Try to call theano_expr with a bad label dtype.
    raised = False
    fmt = OneHotFormatter(max_labels=50)
    try:
        fmt.theano_expr(theano.tensor.vector(dtype=theano.config.floatX))
    except TypeError:
        raised = True
    assert raised

    # Try to call format with a bad label dtype.
    raised = False
    try:
        fmt.format(numpy.zeros(10, dtype='float64'))
    except TypeError:
        raised = True
    assert raised
Example #5
    def check_one_hot_formatter(seed, max_labels, dtype, ncases):
        rng = numpy.random.RandomState(seed)
        fmt = OneHotFormatter(max_labels=max_labels, dtype=dtype)
        integer_labels = rng.random_integers(0, max_labels - 1, size=ncases)
        one_hot_labels = fmt.format(integer_labels)
        assert len(list(zip(*one_hot_labels.nonzero()))) == ncases
        for case, label in enumerate(integer_labels):
            assert one_hot_labels[case, label] == 1
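A minimal invocation of the helper above (parameter values chosen arbitrarily):

# Example call: 20 random labels in [0, 4], formatted as float32 one-hot rows.
check_one_hot_formatter(seed=0, max_labels=5, dtype='float32', ncases=20)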
Example #6
def format_targets(y):
    # matlab has one-based indexing and one-based labels
    # have to convert to zero-based labels so subtract 1...
    y = y - 1
    # we need only a 1d-array of integers
    # squeeze in case of 2 dimensions, make sure it is still 1d in case of
    # a single number (can happen for test runs with just one trial)
    y = np.atleast_1d(y.squeeze())
    y = y.astype(int)
    target_formatter = OneHotFormatter(4)
    y = target_formatter.format(y)
    return y
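As a concrete illustration (hypothetical input), MATLAB-style one-based labels come back as zero-indexed one-hot rows:

# Hypothetical input: a MATLAB-style column vector of one-based labels 1..4.
y_matlab = np.array([[1], [2], [3], [4]])
y_one_hot = format_targets(y_matlab)
# y_one_hot has shape (4, 4); row i contains a single 1 at column i.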
Example #7
def test_bad_arguments():
    # Make sure an invalid max_labels raises an error.
    raised = False
    try:
        fmt = OneHotFormatter(max_labels=-10)
    except ValueError:
        raised = True
    assert raised

    raised = False
    try:
        fmt = OneHotFormatter(max_labels='10')
    except ValueError:
        raised = True
    assert raised

    # Make sure an invalid dtype identifier raises an error.
    raised = False
    try:
        fmt = OneHotFormatter(max_labels=10, dtype='invalid')
    except TypeError:
        raised = True
    assert raised

    # Make sure an invalid ndim raises an error for format().
    fmt = OneHotFormatter(max_labels=10)
    raised = False
    try:
        fmt.format(numpy.zeros((2, 3), dtype='int32'))
    except ValueError:
        raised = True
    assert raised

    # Make sure an invalid ndim raises an error for theano_expr().
    raised = False
    try:
        fmt.theano_expr(theano.tensor.imatrix())
    except ValueError:
        raised = True
    assert raised
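The same assertions read more compactly with pytest.raises; this is only an alternative sketch mirroring the checks above, not the style used in the original tests:

# Equivalent checks written with pytest.raises (alternative formulation).
import pytest

def test_bad_arguments_pytest_style():
    with pytest.raises(ValueError):
        OneHotFormatter(max_labels=-10)
    with pytest.raises(ValueError):
        OneHotFormatter(max_labels='10')
    with pytest.raises(TypeError):
        OneHotFormatter(max_labels=10, dtype='invalid')
    fmt = OneHotFormatter(max_labels=10)
    with pytest.raises(ValueError):
        fmt.format(numpy.zeros((2, 3), dtype='int32'))
    with pytest.raises(ValueError):
        fmt.theano_expr(theano.tensor.imatrix())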
Example #8
def test_bad_arguments():
    # Make sure an invalid max_labels raises an error.
    raised = False
    try:
        fmt = OneHotFormatter(max_labels=-10)
    except ValueError:
        raised = True
    assert raised

    raised = False
    try:
        fmt = OneHotFormatter(max_labels='10')
    except ValueError:
        raised = True
    assert raised

    # Make sure an invalid dtype identifier raises an error.
    raised = False
    try:
        fmt = OneHotFormatter(max_labels=10, dtype='invalid')
    except TypeError:
        raised = True
    assert raised

    # Make sure an invalid ndim raises an error for format().
    fmt = OneHotFormatter(max_labels=10)
    raised = False
    try:
        fmt.format(numpy.zeros((2, 3, 4), dtype='int32'))
    except ValueError:
        raised = True
    assert raised

    # Make sure an invalid ndim raises an error for theano_expr().
    raised = False
    try:
        fmt.theano_expr(theano.tensor.itensor3())
    except ValueError:
        raised = True
    assert raised
Example #9
def generate_datasets(inputs):
    targets = np.zeros(inputs.shape[0]).astype('int')
    targets[::2] = 1 # every second target is class 1 others class 0
    inputs[targets == 1] = inputs[targets == 1] + 1
    target_formatter = OneHotFormatter(2)
    targets_one_hot = target_formatter.format(targets)
    train_set = VolumetricDenseDesignMatrix(topo_view=inputs[0:50], 
        y=targets_one_hot[0:50], axes=('b', 0, 1, 2, 'c'))
    valid_set = VolumetricDenseDesignMatrix(topo_view=inputs[50:75], 
        y=targets_one_hot[50:75], axes=('b', 0, 1, 2, 'c'))
    test_set = VolumetricDenseDesignMatrix(topo_view=inputs[75:100], 
        y=targets_one_hot[75:100], axes=('b', 0, 1, 2, 'c'))
    return train_set, valid_set, test_set
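A hedged usage sketch: generate_datasets expects a topological batch with at least 100 cases and a trailing channel axis, e.g. a hypothetical stack of 8x8x8 single-channel volumes:

# Hypothetical usage: 100 random 8x8x8 single-channel volumes.
inputs = np.random.rand(100, 8, 8, 8, 1).astype('float32')
train_set, valid_set, test_set = generate_datasets(inputs)
# 50/25/25 split; every second volume is shifted by +1 and labelled class 1.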
Example #10
    def _transform_single_channel_data(self, X, y):
        windowed_X = np.reshape(X, (-1, self.window_size))
        windowed_y = np.reshape(y, (-1, self.window_size))

        # Format the target into proper format
        sum_y = np.sum(windowed_y, axis=1)
        sum_y[sum_y > 0] = 1

        # Duplicate the labels for all channels
        dup_y = np.tile(sum_y, self.n_channels)

        one_hot_formatter = OneHotFormatter(max_labels=self.n_classes)
        hot_y = one_hot_formatter.format(dup_y)

        return windowed_X, hot_y, None
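To make the shapes concrete, here is a standalone sketch of the same steps with assumed values window_size=4, n_channels=2, n_classes=2 (mirroring the attributes used above):

# Standalone shape walk-through of the transform above (assumed parameters).
import numpy as np
from pylearn2.format.target_format import OneHotFormatter

window_size, n_channels, n_classes = 4, 2, 2
X = np.arange(16, dtype='float32')                      # 4 windows of 4 samples
y = np.array([0, 0, 0, 0, 1, 0, 0, 0,
              0, 0, 0, 0, 0, 0, 0, 1])

windowed_X = np.reshape(X, (-1, window_size))           # (4, 4)
windowed_y = np.reshape(y, (-1, window_size))           # (4, 4)
sum_y = np.sum(windowed_y, axis=1)
sum_y[sum_y > 0] = 1                                    # per-window label: [0, 1, 0, 1]
dup_y = np.tile(sum_y, n_channels)                      # (8,): labels repeated per channel
hot_y = OneHotFormatter(max_labels=n_classes).format(dup_y)   # (8, 2)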
Example #11
def test_one_hot_formatter_simple():
    def check_one_hot_formatter(seed, max_labels, dtype, ncases):
        rng = numpy.random.RandomState(seed)
        fmt = OneHotFormatter(max_labels=max_labels, dtype=dtype)
        integer_labels = rng.random_integers(0, max_labels - 1, size=ncases)
        one_hot_labels = fmt.format(integer_labels)
        assert len(list(zip(*one_hot_labels.nonzero()))) == ncases
        for case, label in enumerate(integer_labels):
            assert one_hot_labels[case, label] == 1

    rng = numpy.random.RandomState(0)
    for seed, dtype in enumerate(all_types):
        yield (check_one_hot_formatter, seed, rng.random_integers(1, 30), dtype, rng.random_integers(1, 100))
    fmt = OneHotFormatter(max_labels=10)
    assert fmt.format(numpy.zeros((1, 1), dtype="uint8")).shape == (1, 1, 10)
Example #12
def test_one_hot_formatter_simple():
    def check_one_hot_formatter(seed, max_labels, dtype, ncases):
        rng = numpy.random.RandomState(seed)
        fmt = OneHotFormatter(max_labels=max_labels, dtype=dtype)
        integer_labels = rng.random_integers(0, max_labels - 1, size=ncases)
        one_hot_labels = fmt.format(integer_labels)
        assert len(list(zip(*one_hot_labels.nonzero()))) == ncases
        for case, label in enumerate(integer_labels):
            assert one_hot_labels[case, label] == 1
    rng = numpy.random.RandomState(0)
    for seed, dtype in enumerate(all_types):
        yield (check_one_hot_formatter, seed, rng.random_integers(1, 30),
               dtype, rng.random_integers(1, 100))
    fmt = OneHotFormatter(max_labels=10)
    assert fmt.format(numpy.zeros((1, 1), dtype='uint8')).shape == (1, 1, 10)
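The final assert above covers the sequence case: a 2-D integer input of shape (batch, sequence_length) is formatted position by position into a 3-D array of shape (batch, sequence_length, max_labels), e.g.:

# Sequence (2-D) input yields one one-hot vector per position.
seq_labels = numpy.array([[2, 0, 1]], dtype='uint8')            # (1, 3)
seq_one_hot = OneHotFormatter(max_labels=4).format(seq_labels)  # (1, 3, 4)
# seq_one_hot[0, 0] == [0, 0, 1, 0]; seq_one_hot[0, 1] == [1, 0, 0, 0]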
Example #13
    def _transform_single_channel_data(self, X, y):
        windowed_X = np.reshape(X, (-1, self.window_size))
        windowed_y = np.reshape(y, (-1, self.window_size))

        # Format the target into proper format
        sum_y = np.sum(windowed_y, axis=1)
        sum_y[sum_y > 0] = 1

        # Duplicate the labels for all channels
        dup_y = np.tile(sum_y, self.n_channels)

        one_hot_formatter = OneHotFormatter(max_labels=self.n_classes)
        hot_y = one_hot_formatter.format(dup_y)

        return windowed_X, hot_y, None
Example #14
class ConditionalGeneratorTestCase(unittest.TestCase):
    def setUp(self):
        self.noise_dim = 10
        self.num_labels = 10

        self.condition_dtype = 'uint8'
        self.condition_space = VectorSpace(dim=self.num_labels, dtype=self.condition_dtype)
        self.condition_formatter = OneHotFormatter(self.num_labels, dtype=self.condition_dtype)
        self.condition_distribution = OneHotDistribution(self.condition_space)

        # TODO this nvis stuff is dirty. The ConditionalGenerator should handle it
        self.mlp_nvis = self.noise_dim + self.num_labels
        self.mlp_nout = 1

        # Set up model
        self.mlp = MLP(nvis=self.mlp_nvis, layers=[Linear(self.mlp_nout, 'out', irange=0.1)])
        self.G = ConditionalGenerator(input_condition_space=self.condition_space,
                                      condition_distribution=self.condition_distribution,
                                      noise_dim=self.noise_dim,
                                      mlp=self.mlp)

    def test_conditional_generator_input_setup(self):
        """Check that conditional generator correctly sets up composite
        input layer."""

        # Feedforward: We want the net to ignore the noise and simply
        # convert the one-hot vector to a number
        weights = np.concatenate([np.zeros((self.mlp_nout, self.noise_dim)),
                                  np.array(range(self.num_labels)).reshape((1, -1)).repeat(self.mlp_nout, axis=0)],
                                 axis=1).T.astype(theano.config.floatX)
        self.mlp.layers[0].set_weights(weights)

        inp = (T.matrix(), T.matrix(dtype=self.condition_dtype))
        f = theano.function(inp, self.G.mlp.fprop(inp))

        assert_array_equal(
            f(np.random.rand(self.num_labels, self.noise_dim).astype(theano.config.floatX),
              self.condition_formatter.format(np.array(range(self.num_labels)))),
            np.array(range(self.num_labels)).reshape(self.num_labels, 1))

    def test_sample_noise(self):
        """Test barebones noise sampling."""

        n = T.iscalar()
        cond_inp = self.condition_distribution.sample(n)
        sample_and_noise = theano.function([n], self.G.sample_and_noise(cond_inp, all_g_layers=True)[1])

        print sample_and_noise(15)
Example #15
def generate_datasets(inputs):
    targets = np.zeros(inputs.shape[0]).astype('int')
    targets[::2] = 1  # every second target is class 1 others class 0
    inputs[targets == 1] = inputs[targets == 1] + 1
    target_formatter = OneHotFormatter(2)
    targets_one_hot = target_formatter.format(targets)
    train_set = VolumetricDenseDesignMatrix(topo_view=inputs[0:50],
                                            y=targets_one_hot[0:50],
                                            axes=('b', 0, 1, 2, 'c'))
    valid_set = VolumetricDenseDesignMatrix(topo_view=inputs[50:75],
                                            y=targets_one_hot[50:75],
                                            axes=('b', 0, 1, 2, 'c'))
    test_set = VolumetricDenseDesignMatrix(topo_view=inputs[75:100],
                                           y=targets_one_hot[75:100],
                                           axes=('b', 0, 1, 2, 'c'))
    return train_set, valid_set, test_set
Example #16
    def check_one_hot_formatter(seed, max_labels, dtype, ncases, nmultis):
        rng = numpy.random.RandomState(seed)
        fmt = OneHotFormatter(max_labels=max_labels, dtype=dtype)
        integer_labels = rng.random_integers(0, max_labels - 1, size=ncases * nmultis).reshape(ncases, nmultis)

        one_hot_labels = fmt.format(integer_labels, mode="merge")
        # The number of ones would equal ncases * nmultis if integer_labels
        # contained no duplicated tags (e.g., labels like [1, 2, 2, 3, 5, 6]).
        # Since such duplicates are allowed (different cases may belong to
        # different numbers of classes) and a duplicated tag activates only
        # one unit in the k-hot representation, numpy.unique() is used here
        # to drop duplicates before counting the ones in the final k-hot
        # representation.
        n_ones = numpy.concatenate([numpy.unique(l) for l in integer_labels])
        assert len(list(zip(*one_hot_labels.nonzero()))) == len(n_ones)
        for case, label in enumerate(integer_labels):
            assert numpy.sum(one_hot_labels[case, label]) == nmultis
Example #17
    def check_one_hot_formatter(seed, max_labels, dtype, ncases, nmultis):
        rng = numpy.random.RandomState(seed)
        fmt = OneHotFormatter(max_labels=max_labels, dtype=dtype)
        integer_labels = rng.random_integers(
            0, max_labels - 1, size=ncases*nmultis
        ).reshape(ncases, nmultis)

        one_hot_labels = fmt.format(integer_labels, mode='merge')
        # The number of ones would equal ncases * nmultis if integer_labels
        # contained no duplicated tags (e.g., labels like [1, 2, 2, 3, 5, 6]).
        # Since such duplicates are allowed (different cases may belong to
        # different numbers of classes) and a duplicated tag activates only
        # one unit in the k-hot representation, numpy.unique() is used here
        # to drop duplicates before counting the ones in the final k-hot
        # representation.
        n_ones = numpy.concatenate([numpy.unique(l) for l in integer_labels])
        assert len(list(zip(*one_hot_labels.nonzero()))) == len(n_ones)
        for case, label in enumerate(integer_labels):
            assert numpy.sum(one_hot_labels[case, label]) == nmultis
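A small worked example of the 'merge' (k-hot) behaviour discussed in the comment above, using hypothetical labels that contain a duplicate:

# k-hot ('merge') formatting: a duplicated tag activates its unit only once.
fmt = OneHotFormatter(max_labels=6)
labels = numpy.array([[1, 2, 2],
                      [0, 3, 5]])
k_hot = fmt.format(labels, mode='merge')
# k_hot[0] == [0, 1, 1, 0, 0, 0]   (the repeated 2 contributes a single 1)
# k_hot[1] == [1, 0, 0, 1, 0, 1]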
Example #18
    def _transform_multi_channel_data(self, X, y):
        # Data partitioning
        parted_X, parted_y = self._partition_data(X=X, y=y, partition_size=self.window_size)
        transposed_X = np.transpose(parted_X, [0, 2, 1])
        converted_X = np.reshape(transposed_X, (transposed_X.shape[0],
                                                transposed_X.shape[1],
                                                1,
                                                transposed_X.shape[2]))

        # Create view converter
        view_converter = DefaultViewConverter(shape=self.sample_shape,
                                              axes=('b', 0, 1, 'c'))

        # Convert data into a design matrix
        view_converted_X = view_converter.topo_view_to_design_mat(converted_X)
        assert np.all(converted_X == view_converter.design_mat_to_topo_view(view_converted_X))

        # Format the target into proper format
        sum_y = np.sum(parted_y, axis=1)
        sum_y[sum_y > 0] = 1
        one_hot_formatter = OneHotFormatter(max_labels=self.n_classes)
        hot_y = one_hot_formatter.format(sum_y)

        return view_converted_X, hot_y, view_converter
Example #19
# Samples per condition
sample_cols = 5

# Generate conditional information
conditional_batch = model.generator.condition_space.make_theano_batch()
formatter = OneHotFormatter(rows,
                            dtype=model.generator.condition_space.dtype)
conditional = formatter.theano_expr(conditional_batch, mode='concatenate')

# Now sample from generator
# For some reason format_as from VectorSpace is not working right
topo_samples_batch = model.generator.sample(conditional)
topo_sample_f = theano.function([conditional], topo_samples_batch)
conditional_data = formatter.format(np.concatenate([np.repeat(i, sample_cols) for i in range(rows)])
                                      .reshape((rows * sample_cols, 1)),
                                    mode='concatenate')
topo_samples = topo_sample_f(conditional_data)

samples = dataset.get_design_matrix(topo_samples)
dataset.axes = ['b', 0, 1, 'c']
dataset.view_converter.axes = ['b', 0, 1, 'c']
topo_samples = dataset.get_topological_view(samples)

pv = PatchViewer(grid_shape=(rows, sample_cols + 1), patch_shape=(32, 32),
                 is_color=True)
scale = np.abs(samples).max()

X = dataset.X
topo = dataset.get_topological_view()
index = 0
Example #20
    def __init__(self, which_set, onehot_dtype='uint8',
                 center=False, rescale=False, gcn=None,
                 start=None, stop=None, axes=('b', 0, 1, 'c'),
                 toronto_prepro=False, preprocessor=None):
        """Modified version of the CIFAR10 constructor which creates Y
        as one-hot vectors rather than simple indexes. This is super
        hacky. Sorry, Guido.."""

        # note: there is no such thing as the cifar10 validation set;
        # pylearn1 defined one but really it should be user-configurable
        # (as it is here)

        self.axes = axes

        # we define here:
        dtype = 'uint8'
        ntrain = 50000
        nvalid = 0  # artefact, we won't use it
        ntest = 10000

        # we also expose the following details:
        self.img_shape = (3, 32, 32)
        self.img_size = numpy.prod(self.img_shape)
        self.n_classes = 10
        self.label_names = ['airplane', 'automobile', 'bird', 'cat', 'deer',
                            'dog', 'frog', 'horse', 'ship', 'truck']

        # prepare loading
        fnames = ['data_batch_%i' % i for i in range(1, 6)]
        datasets = {}
        datapath = os.path.join(
            string_utils.preprocess('${PYLEARN2_DATA_PATH}'),
            'cifar10', 'cifar-10-batches-py')
        for name in fnames + ['test_batch']:
            fname = os.path.join(datapath, name)
            if not os.path.exists(fname):
                raise IOError(fname + " was not found. You probably need to "
                              "download the CIFAR-10 dataset by using the "
                              "download script in "
                              "pylearn2/scripts/datasets/download_cifar10.sh "
                              "or manually from "
                              "http://www.cs.utoronto.ca/~kriz/cifar.html")
            datasets[name] = cache.datasetCache.cache_file(fname)

        lenx = int(numpy.ceil((ntrain + nvalid) / 10000.) * 10000)
        x = numpy.zeros((lenx, self.img_size), dtype=dtype)
        y = numpy.zeros((lenx, 1), dtype=dtype)

        # load train data
        nloaded = 0
        for i, fname in enumerate(fnames):
            _logger.info('loading file %s' % datasets[fname])
            data = serial.load(datasets[fname])
            x[i * 10000:(i + 1) * 10000, :] = data['data']
            y[i * 10000:(i + 1) * 10000, 0] = data['labels']
            nloaded += 10000
            if nloaded >= ntrain + nvalid + ntest:
                break

        # load test data
        _logger.info('loading file %s' % datasets['test_batch'])
        data = serial.load(datasets['test_batch'])

        # process this data
        Xs = {'train': x[0:ntrain],
              'test': data['data'][0:ntest]}

        Ys = {'train': y[0:ntrain],
              'test': data['labels'][0:ntest]}

        X = numpy.cast['float32'](Xs[which_set])

        y = Ys[which_set]
        if isinstance(y, list):
            y = numpy.asarray(y).astype(dtype)
        if which_set == 'test':
            assert y.shape[0] == 10000
            y = y.reshape((y.shape[0], 1))

        formatter = OneHotFormatter(self.n_classes, dtype=onehot_dtype)
        y = formatter.format(y, mode='concatenate')

        if center:
            X -= 127.5
        self.center = center

        if rescale:
            X /= 127.5
        self.rescale = rescale

        if toronto_prepro:
            assert not center
            assert not gcn
            X = X / 255.
            if which_set == 'test':
                other = CIFAR10(which_set='train')
                oX = other.X
                oX /= 255.
                X = X - oX.mean(axis=0)
            else:
                X = X - X.mean(axis=0)
        self.toronto_prepro = toronto_prepro

        self.gcn = gcn
        if gcn is not None:
            gcn = float(gcn)
            X = global_contrast_normalize(X, scale=gcn)

        if start is not None:
            # This needs to come after the prepro so that it doesn't
            # change the pixel means computed above for toronto_prepro
            assert start >= 0
            assert stop > start
            assert stop <= X.shape[0]
            X = X[start:stop, :]
            y = y[start:stop, :]
            assert X.shape[0] == y.shape[0]

        if which_set == 'test':
            assert X.shape[0] == 10000

        view_converter = dense_design_matrix.DefaultViewConverter((32, 32, 3),
                                                                  axes)

        super(CIFAR10, self).__init__(X=X, y=y, view_converter=view_converter)
        # y_labels=self.n_classes

        assert not contains_nan(self.X)

        if preprocessor:
            preprocessor.apply(self)

        # Another hack: rename 'targets' to match model expectations
        space, (X_source, y_source) = self.data_specs
        self.data_specs = (space, (X_source, 'condition'))
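The key change relative to the stock CIFAR10 class is the target formatting: with mode='concatenate', the (N, 1) integer label column becomes an (N, n_classes) one-hot matrix. A tiny sketch of just that step (hypothetical N=3):

# What the formatter call above does to the label column.
labels = numpy.array([[3], [0], [9]], dtype='uint8')
one_hot = OneHotFormatter(10, dtype='uint8').format(labels, mode='concatenate')
# one_hot.shape == (3, 10); row 0 has its single 1 in column 3, and so on.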
Example #21
    def __init__(
        self,
        path,
        name='',  # optional name

        # selectors
        subjects='all',  # optional selector (list) or 'all'
        trial_types='all',  # optional selector (list) or 'all'
        trial_numbers='all',  # optional selector (list) or 'all'
        conditions='all',  # optional selector (list) or 'all'     
        partitioner=None,
        channel_filter=NoChannelFilter(),  # optional channel filter, default: keep all
        channel_names=None,  # optional channel names (for metadata)
        label_map=None,  # optional conversion of labels
        remove_dc_offset=False,  # optional subtraction of channel mean, usually done already earlier
        resample=None,  # optional down-sampling

        # optional sub-sequences selection
        start_sample=0,
        stop_sample=None,  # optional for selection of sub-sequences

        # optional signal filter to by applied before spitting the signal
        signal_filter=None,

        # windowing parameters
        frame_size=-1,
        hop_size=-1,  # values > 0 will lead to windowing
        hop_fraction=None,  # alternative to specifying absolute hop_size

        # optional spectrum parameters, n_fft = 0 keeps raw data
        n_fft=0,
        n_freq_bins=None,
        spectrum_log_amplitude=False,
        spectrum_normalization_mode=None,
        include_phase=False,
        flatten_channels=False,
        layout='tf',  # (0,1)-axes layout tf=time x features or ft=features x time
        save_matrix_path=None,
        keep_metadata=False,
    ):
        '''
        Constructor
        '''

        # save params
        self.params = locals().copy()
        del self.params['self']
        # print self.params

        # TODO: get the whole filtering into an extra class

        datafiles_metadata, metadb = load_datafiles_metadata(path)

        #         print datafiles_metadata

        def apply_filters(filters, node):
            if isinstance(node, dict):
                filtered = []
                keepkeys = filters[0]
                for key, value in node.items():
                    if keepkeys == 'all' or key in keepkeys:
                        filtered.extend(apply_filters(filters[1:], value))
                return filtered
            else:
                return node  # [node]

        # keep only files that match the metadata filters
        self.datafiles = apply_filters(
            [subjects, trial_types, trial_numbers, conditions],
            datafiles_metadata)

        # copy metadata for retained files
        self.metadb = {}
        for datafile in self.datafiles:
            self.metadb[datafile] = metadb[datafile]

#         print self.datafiles
#         print self.metadb

        self.name = name

        if partitioner is not None:
            self.datafiles = partitioner.get_partition(self.name, self.metadb)

        self.include_phase = include_phase
        self.spectrum_normalization_mode = spectrum_normalization_mode
        self.spectrum_log_amplitude = spectrum_log_amplitude

        self.sequence_partitions = []  # used to keep track of original sequences

        # metadata: [subject, trial_no, stimulus, channel, start, ]
        self.metadata = []

        sequences = []
        labels = []
        n_sequences = 0

        if frame_size > 0 and hop_size == -1 and hop_fraction is not None:
            hop_size = np.ceil(frame_size / hop_fraction)

        for i in xrange(len(self.datafiles)):
            with log_timing(log,
                            'loading data from {}'.format(self.datafiles[i])):

                # save start of next sequence
                self.sequence_partitions.append(n_sequences)

                data, metadata = load(os.path.join(path, self.datafiles[i]))

                label = metadata['label']
                if label_map is not None:
                    label = label_map[label]

                multi_channel_frames = []

                # process 1 channel at a time
                for channel in xrange(data.shape[1]):
                    # filter channels
                    if not channel_filter.keep_channel(channel):
                        continue

                    samples = data[:, channel]

                    # subtract channel mean
                    if remove_dc_offset:
                        samples -= samples.mean()

                    # down-sample if requested
                    if resample is not None and resample[0] != resample[1]:
                        samples = librosa.resample(samples, resample[0],
                                                   resample[1])

                    # apply optional signal filter after down-sampling -> requires lower order
                    if signal_filter is not None:
                        samples = signal_filter.process(samples)

                    # get sub-sequence in resampled space
                    # log.info('using samples {}..{} of {}'.format(start_sample,stop_sample, samples.shape))
                    samples = samples[start_sample:stop_sample]

                    if n_fft is not None and n_fft > 0:  # Optionally:
                        ### frequency spectrum branch ###

                        # transform to spectogram
                        hop_length = n_fft / 4
                        '''
                        from http://theremin.ucsd.edu/~bmcfee/librosadoc/librosa.html
                        >>> # Get a power spectrogram from a waveform y
                        >>> S       = np.abs(librosa.stft(y)) ** 2
                        >>> log_S   = librosa.logamplitude(S)
                        '''

                        S = librosa.core.stft(samples,
                                              n_fft=n_fft,
                                              hop_length=hop_length)
                        # mag = np.abs(S)        # magnitude spectrum
                        mag = np.abs(S)**2  # power spectrum

                        # include phase information if requested
                        if self.include_phase:
                            # phase = np.unwrap(np.angle(S))
                            phase = np.angle(S)

                        # Optionally: cut off high bands
                        if n_freq_bins is not None:
                            mag = mag[0:n_freq_bins, :]
                            if self.include_phase:
                                phase = phase[0:n_freq_bins, :]

                        if self.spectrum_log_amplitude:
                            mag = librosa.logamplitude(mag)

                        s = mag  # for normalization
                        '''
                        NOTE on normalization:
                        It depends on the structure of the neural network and (even more)
                        on the properties of the data. There is no single best normalization
                        algorithm; if there were one, it would be used everywhere by default.

                        In theory, the data does not need to be normalized at all. This is
                        a purely practical matter: in practice, convergence can take forever
                        if the input is spread out too much. The simplest approach is to
                        scale the data to (-1, 1) (or (0, 1), depending on the activation
                        function), and in most cases that works. If the algorithm converges
                        well, that is the answer; if not, there are too many possible
                        problems and methods to outline here without knowing the actual data.
                        '''

                        ## normalize to mean 0, std 1
                        if self.spectrum_normalization_mode == 'mean0_std1':
                            # s = preprocessing.scale(s, axis=0);
                            mean = np.mean(s)
                            std = np.std(s)
                            s = (s - mean) / std

                        ## normalize by linear transform to [0,1]
                        elif self.spectrum_normalization_mode == 'linear_0_1':
                            s = s / np.max(s)

                        ## normalize by linear transform to [-1,1]
                        elif self.spectrum_normalization_mode == 'linear_-1_1':
                            s = -1 + 2 * (s - np.min(s)) / (np.max(s) -
                                                            np.min(s))

                        elif self.spectrum_normalization_mode is not None:
                            raise ValueError(
                                'unsupported spectrum normalization mode {}'.
                                format(self.spectrum_normalization_mode))

                        #print s.mean(axis=0)
                        #print s.std(axis=0)

                        # include phase information if requested
                        if self.include_phase:
                            # normalize phase to [-1.1]
                            phase = phase / np.pi
                            s = np.vstack([s, phase])

                        # transpose to fit pylearn2 layout
                        s = np.transpose(s)
                        # print s.shape

                        ### end of frequency spectrum branch ###
                    else:
                        ### raw waveform branch ###

                        # normalize to max amplitude 1
                        s = librosa.util.normalize(samples)

                        # add 2nd data dimension
                        s = s.reshape(s.shape[0], 1)
                        # print s.shape

                        ### end of raw waveform branch ###

                    s = np.asfarray(s, dtype='float32')

                    if frame_size > 0 and hop_size > 0:
                        s = s.copy(
                        )  # FIXME: THIS IS NECESSARY IN MultiChannelEEGSequencesDataset - OTHERWISE, THE FOLLOWING OP DOES NOT WORK!!!!
                        frames = frame(s,
                                       frame_length=frame_size,
                                       hop_length=hop_size)
                    else:
                        frames = s
                    del s
                    # print frames.shape

                    if flatten_channels:
                        # add artificial channel dimension
                        frames = frames.reshape(
                            (frames.shape[0], frames.shape[1], frames.shape[2],
                             1))
                        # print frames.shape

                        sequences.append(frames)

                        # increment counter by new number of frames
                        n_sequences += frames.shape[0]

                        if keep_metadata:
                            # determine channel name
                            channel_name = None
                            if channel_names is not None:
                                channel_name = channel_names[channel]
                            elif 'channels' in metadata:
                                channel_name = metadata['channels'][channel]

                            self.metadata.append({
                                'subject': metadata['subject'],
                                'trial_type': metadata['trial_type'],
                                'trial_no': metadata['trial_no'],
                                'condition': metadata['condition'],
                                'channel': channel,
                                'channel_name': channel_name,
                                'start': self.sequence_partitions[-1],
                                'stop': n_sequences
                            })

                        for _ in xrange(frames.shape[0]):
                            labels.append(label)
                    else:
                        multi_channel_frames.append(frames)

                    ### end of channel iteration ###

                if not flatten_channels:
                    # turn list into array
                    multi_channel_frames = np.asfarray(multi_channel_frames,
                                                       dtype='float32')
                    # [channels x frames x time x freq] -> cb01
                    # [channels x frames x time x 1] -> cb0.

                    # move channel dimension to end
                    multi_channel_frames = np.rollaxis(multi_channel_frames, 0,
                                                       4)
                    # print multi_channel_frames.shape
                    # log.debug(multi_channel_frames.shape)

                    sequences.append(multi_channel_frames)

                    # increment counter by new number of frames
                    n_sequences += multi_channel_frames.shape[0]

                    if keep_metadata:
                        self.metadata.append({
                            'subject': metadata['subject'],
                            'trial_type': metadata['trial_type'],
                            'trial_no': metadata['trial_no'],
                            'condition': metadata['condition'],
                            'channel': 'all',
                            'start': self.sequence_partitions[-1],
                            'stop': n_sequences
                        })

                    for _ in xrange(multi_channel_frames.shape[0]):
                        labels.append(label)

                ### end of datafile iteration ###

        # turn into numpy arrays
        sequences = np.vstack(sequences)
        # print sequences.shape;

        labels = np.hstack(labels)

        # one_hot_y = one_hot(labels)
        one_hot_formatter = OneHotFormatter(labels.max() + 1)  # FIXME!
        one_hot_y = one_hot_formatter.format(labels)

        self.labels = labels

        if layout == 'ft':  # swap axes to (batch, feature, time, channels)
            sequences = sequences.swapaxes(1, 2)

        log.debug('final dataset shape: {} (b,0,1,c)'.format(sequences.shape))
        super(MultiChannelEEGDataset, self).__init__(topo_view=sequences,
                                                     y=one_hot_y,
                                                     axes=['b', 0, 1, 'c'])

        log.info(
            'generated dataset "{}" with shape X={}={} y={} labels={} '.format(
                self.name, self.X.shape, sequences.shape, self.y.shape,
                self.labels.shape))

        if save_matrix_path is not None:
            matrix = DenseDesignMatrix(topo_view=sequences,
                                       y=one_hot_y,
                                       axes=['b', 0, 1, 'c'])
            with log_timing(
                    log,
                    'saving DenseDesignMatrix to {}'.format(save_matrix_path)):
                serial.save(save_matrix_path, matrix)
Example #22
def svd_accuracy(file_name, ec, kwargs, folds=10, max_svs=10, max_init=15):
    """
    Classify data based on svd features.
    """
    kwargs['condense'] = False
    ds = ec.ECoG(file_name, which_set='train', **kwargs)
    n_classes = int(np.around(ds.y.max() + 1))
    max_svs = min(max_svs, n_classes)

    init_list = np.arange(0, n_classes - max_svs + 1)
    init_list = init_list[init_list < max_init]
    nsvs_list = np.arange(1, max_svs + 1)

    pa = np.inf * np.ones((folds, len(nsvs_list), len(init_list)))
    ma = np.inf * np.ones((folds, len(nsvs_list), len(init_list)))
    va = np.inf * np.ones((folds, len(nsvs_list), len(init_list)))
    u_s = np.zeros((folds, n_classes, n_classes))
    s_s = np.zeros((folds, n_classes))
    v_s = np.zeros((folds, n_classes, ds.X.shape[1]))
    ohf = OneHotFormatter(n_classes)

    for fold in range(folds):
        kwargs_copy = copy.deepcopy(kwargs)
        print('fold: {}'.format(fold))
        ds = ec.ECoG(file_name,
                     which_set='train',
                     fold=fold,
                     center=False,
                     **kwargs_copy)
        # CV
        ts = ds.get_test_set()
        vs = ds.get_valid_set()
        train_X = np.concatenate((ds.X, vs.X), axis=0)
        train_mean = train_X.mean(axis=0)
        train_X = train_X - train_mean
        train_y = np.concatenate((ds.y, vs.y), axis=0)
        test_X = ts.X - train_mean
        test_y = ts.y
        y_oh = ohf.format(train_y, mode='concatenate')
        c_yx = (y_oh - y_oh.mean(axis=0)).T.dot(train_X) / train_X.shape[0]
        u, s, v = np.linalg.svd(c_yx, full_matrices=False)
        u_s[fold] = u
        s_s[fold] = s
        v_s[fold] = v
        for ii, n_svs in enumerate(nsvs_list):
            for jj, sv_init in enumerate(init_list):
                vp = v[sv_init:sv_init + n_svs]
                train_proj = train_X.dot(vp.T)
                test_proj = test_X.dot(vp.T)
                cl = LR(solver='lbfgs',
                        multi_class='multinomial').fit(train_proj,
                                                       train_y.ravel())
                y_hat = cl.predict(test_proj)
                p_results = []
                m_results = []
                v_results = []
                for y, yh in zip(test_y.ravel(), y_hat.ravel()):
                    pr = place_equiv(y, yh)
                    if pr is not None:
                        p_results.append(pr)
                    mr = manner_equiv(y, yh)
                    if mr is not None:
                        m_results.append(mr)
                    vr = vowel_equiv(y, yh)
                    if vr is not None:
                        v_results.append(vr)
                pa[fold, ii, jj] = np.array(p_results).mean()
                ma[fold, ii, jj] = np.array(m_results).mean()
                va[fold, ii, jj] = np.array(v_results).mean()
    return pa, ma, va, u_s, s_s, v_s, init_list, nsvs_list
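The feature extraction at the heart of the fold loop above is an SVD of the class/feature cross-covariance; a minimal sketch of just that step in isolation (shapes are hypothetical):

# Core of the SVD-feature step above, in isolation.
import numpy as np
from pylearn2.format.target_format import OneHotFormatter

n, d, n_classes = 200, 32, 4
train_X = np.random.randn(n, d)
train_y = np.random.randint(0, n_classes, size=(n, 1))

y_oh = OneHotFormatter(n_classes).format(train_y, mode='concatenate')   # (n, 4)
c_yx = (y_oh - y_oh.mean(axis=0)).T.dot(train_X) / n                    # (4, d)
u, s, v = np.linalg.svd(c_yx, full_matrices=False)                      # v: (4, d)
train_proj = train_X.dot(v[:2].T)    # project onto the top 2 right singular vectors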
Example #23
    def load_data(self):
        # Get the directory of the patient data
        patient_dir = os.path.join(self.data_dir, self.patient_id)

        # Load metadata about dataset form MAT file
        metadata_fname = os.path.join(patient_dir, 'trainset.mat')
        metadata_mat = loadmat(metadata_fname)

        # Get number of seizures
        self.n_seizures = metadata_mat.get('ictals').size

        # Get detail of the segment
        self.sampling_rate = metadata_mat['sampling_rate'][0][0]
        self.segment_sec = metadata_mat['segment_sec'][0][0]
        self.segment_samples = self.sampling_rate * self.segment_sec

        self.preictal_samples = 0
        self.nonictal_samples = 0

        # Examples of indexing through MAT file
        # mat['nonictals'][i][0]['filename'][0][0][0][j][0]
        # mat['nonictals'][i][0]['idx'][0][0][0][j][0]
        # mat['nonictals'][i][0]['n_segments'][0][0][0][0]

        # Balanced classes
        if self.which_set == 'train' or self.which_set == 'valid_train':

            if self.which_set == 'train':
                select_idx = np.setdiff1d(
                    range(metadata_mat['preictals'].size),
                    np.asarray([
                        self.leave_out_seizure_idx_valid,
                        self.leave_out_seizure_idx_test
                    ]))
            else:
                select_idx = np.asarray([self.leave_out_seizure_idx_valid])

            X = None
            y = None
            for i in select_idx:
                print '====== Seizure', i, '======'

                # Non-ictal data
                temp_nonictal_X = self.load_segment(part='nonictals',
                                                    seizure_idx=i,
                                                    metadata_mat=metadata_mat,
                                                    patient_dir=patient_dir)

                # Pre-ictal
                temp_preictal_X = self.load_segment(part='preictals',
                                                    seizure_idx=i,
                                                    metadata_mat=metadata_mat,
                                                    patient_dir=patient_dir)

                # Concatenate preictal and nonictal data
                temp_X = np.concatenate((temp_preictal_X, temp_nonictal_X),
                                        axis=0)
                temp_y = np.zeros(temp_X.shape[0], dtype=int)
                temp_y[range(temp_preictal_X.shape[0])] = 1

                # Sanity check
                # if not (temp_preictal_X.shape[0] == temp_nonictal_X.shape[0]):
                #     raise Exception('Unbalanced classes.')
                print 'Preictal samples: {0}, Nonictal samples: {1}'.format(
                    temp_preictal_X.shape[0], temp_nonictal_X.shape[0])
                if not np.all(
                        np.arange(temp_preictal_X.shape[0]) == np.where(temp_y)
                    [0]):
                    raise Exception(
                        'There is a mismatch between the number of preictal data and labels.'
                    )

                self.preictal_samples += temp_preictal_X.shape[0]
                self.nonictal_samples += temp_nonictal_X.shape[0]

                if not (X is None) and not (y is None):
                    X = np.concatenate((X, temp_X), axis=0)
                    y = np.append(y, temp_y)
                else:
                    X = temp_X
                    y = temp_y

        # Unbalanced classes
        elif self.which_set == 'valid' or self.which_set == 'test':

            if self.which_set == 'valid':
                select_idx = self.leave_out_seizure_idx_valid
            else:
                select_idx = self.leave_out_seizure_idx_test

            print '====== Seizure', select_idx, '======'

            # Get metadata of all blocks
            block_df = pd.read_table(os.path.join(patient_dir,
                                                  'block_metadata.txt'),
                                     sep='\t')

            # Get block index of the selected seizure
            select_sz_fname = metadata_mat['preictals'][select_idx][0][
                'filename'][0][0][0][0][0]
            block_idx = np.where(block_df.filename == select_sz_fname)[0][0]

            n_padded_block = 2
            start_block_idx = block_idx - n_padded_block
            end_block_idx = block_idx + n_padded_block + 1

            if start_block_idx < 0:
                start_block_idx = 0
            if end_block_idx > block_df.shape[0]:
                end_block_idx = block_df.shape[0]

            select_block_idx = np.arange(start_block_idx, end_block_idx)
            filenames = block_df.filename[select_block_idx].values

            X = None
            y = None
            y_select_idx = None
            ictal_labels = None
            for b_idx, fname in enumerate(filenames):
                # Name of the MAT files that store EEG data
                data_fname = fname.replace('.data', '.mat')

                # Name of the MAT file that stores indices of flat (i.e., false) segments
                fname_flat = fname.replace('.data',
                                           '_flat_signal_segment_idx.mat')

                # Get all good indices (i.e., remove segments of flat signals)
                flat_mat = loadmat(os.path.join(patient_dir, fname_flat))
                flat_idx = np.empty(0, dtype=int)
                for j in range(flat_mat['flat_signal_segment_idx'].shape[0]):
                    flat_idx = np.append(
                        flat_idx,
                        np.squeeze(flat_mat['flat_signal_segment_idx'][j][0]))
                flat_idx = flat_idx - 1  # Change from MATLAB to python index system

                data_mat = loadmat(os.path.join(patient_dir, data_fname))

                if data_mat['signals'].shape[1] != block_df.samples[
                        select_block_idx[b_idx]]:
                    raise Exception(
                        'There is a mismatch between the number of samples specified in '
                        'the metadata and the provided signal data.')

                n_segments = np.ceil(data_mat['signals'].shape[1] /
                                     (self.segment_samples * 1.0))
                all_idx = np.arange(n_segments, dtype=int)
                good_idx = np.setdiff1d(all_idx, flat_idx)

                # Get indicies of scalp EEG channels
                elec_names = np.asarray(
                    [ename[0][0] for ename in data_mat['elec_names']])
                scalp_channels_idx = np.empty(0, dtype=int)
                for ch in self.scalp_channel_labels:
                    scalp_channels_idx = np.append(
                        scalp_channels_idx,
                        np.where(elec_names == ch)[0][0])

                print 'Load', self.which_set, 'data from', fname

                if good_idx.size > 0:
                    temp_X = None
                    for idx in range(good_idx.size):
                        g_idx = good_idx[idx]
                        start_sample_idx = np.uint32(
                            g_idx) * self.segment_samples
                        end_sample_idx = np.uint32(g_idx +
                                                   1) * self.segment_samples
                        if end_sample_idx > data_mat['signals'].shape[1]:
                            # Zero-padding if the window size is not compatible
                            extra = end_sample_idx - data_mat['signals'].shape[
                                1]
                            assert (data_mat['signals'].shape[1] +
                                    extra) % self.segment_samples == 0
                            if extra > 0:
                                data_mat['signals'] = np.concatenate(
                                    (data_mat['signals'],
                                     np.zeros(
                                         (data_mat['signals'].shape[0], extra),
                                         dtype=float)),
                                    axis=1)
                            assert data_mat['signals'].shape[
                                1] % self.segment_samples == 0
                        temp_sample_idx = np.arange(start_sample_idx,
                                                    end_sample_idx)

                        if not (temp_X is None):
                            temp = data_mat['signals'][:, temp_sample_idx]
                            temp_X = np.concatenate(
                                (temp_X,
                                 np.asarray([temp[scalp_channels_idx, :]])),
                                axis=0)
                        else:
                            temp = data_mat['signals'][:, temp_sample_idx]
                            temp_X = np.asarray([temp[scalp_channels_idx, :]])

                    # If this record contains preictal data, get preictal labels
                    temp_preictal_meta_idx = -1
                    temp_preictal_fname_idx = -1
                    for preictal_meta_idx, preictal_meta in enumerate(
                            metadata_mat['preictals']):
                        for preictal_fname_idx, preictal_fname in enumerate(
                                preictal_meta[0]['filename'][0][0][0]):
                            if preictal_fname == fname:
                                temp_preictal_meta_idx = preictal_meta_idx
                                temp_preictal_fname_idx = preictal_fname_idx
                                break
                    if temp_preictal_meta_idx != -1 and temp_preictal_fname_idx != -1:
                        # Preictal indices
                        preictal_idx = metadata_mat['preictals'][
                            temp_preictal_meta_idx][0]['idx'][0][0][0][
                                temp_preictal_fname_idx][0]
                        preictal_idx = preictal_idx - 1  # Change from MATLAB to python index system

                        temp_y = np.zeros(n_segments, dtype=int)
                        temp_y[preictal_idx] = 1

                        # Sanity check
                        if not (preictal_idx.size == np.intersect1d(
                                good_idx, preictal_idx).size):
                            raise Exception(
                                'Good indices and preictal indices are mismatched.')

                        # Remove segment of flat signals from labels
                        temp_y = temp_y[good_idx]

                        self.preictal_samples = self.preictal_samples + preictal_idx.size
                        self.nonictal_samples = self.nonictal_samples + (
                            temp_y.size - preictal_idx.size)
                    else:
                        temp_y = np.zeros(temp_X.shape[0], dtype=int)
                        self.nonictal_samples = self.nonictal_samples + temp_y.size

                    # If this record contains preictal data of the leave-out-seizure index, get preictal labels
                    if temp_preictal_meta_idx == select_idx:
                        temp_y_select_idx = temp_y
                    else:
                        temp_y_select_idx = np.zeros(temp_X.shape[0],
                                                     dtype=int)

                    # If this record contains ictal data, get ictal labels
                    temp_ictal_meta_idx = -1
                    temp_ictal_fname_idx = -1
                    for ictal_meta_idx, ictal_meta in enumerate(
                            metadata_mat['ictals']):
                        for ictal_fname_idx, ictal_fname in enumerate(
                                ictal_meta[0]['filename'][0][0][0]):
                            if ictal_fname == fname:
                                temp_ictal_meta_idx = ictal_meta_idx
                                temp_ictal_fname_idx = ictal_fname_idx
                                break
                    if temp_ictal_meta_idx != -1 and temp_ictal_fname_idx != -1:
                        # Ictal indices
                        ictal_idx = metadata_mat['ictals'][
                            temp_ictal_meta_idx][0]['idx'][0][0][0][
                                temp_ictal_fname_idx][0]
                        ictal_idx = ictal_idx - 1  # Change from MATLAB to python index system

                        temp_ictal_labels = np.zeros(n_segments, dtype=int)
                        temp_ictal_labels[ictal_idx] = 1

                        # Sanity check
                        if not (ictal_idx.size == np.intersect1d(
                                good_idx, ictal_idx).size):
                            raise Exception(
                                'Good indices and ictal indices are mismatched.')

                        # Remove segment of flat signals from labels
                        temp_ictal_labels = temp_ictal_labels[good_idx]
                    else:
                        temp_ictal_labels = np.zeros(temp_X.shape[0],
                                                     dtype=int)

                    # Sanity check
                    if not (temp_X.shape[0] == temp_y.size):
                        raise Exception(
                            'Number of feature data and labels are not equal.')
                    if not (temp_X.shape[0] == temp_ictal_labels.size):
                        raise Exception(
                            'Number of feature data and labels are not equal.')

                    if not (X is None) and not (y is None) and not (
                            ictal_labels is None):
                        X = np.concatenate((X, temp_X), axis=0)
                        y = np.append(y, temp_y)
                        y_select_idx = np.append(y_select_idx,
                                                 temp_y_select_idx)
                        ictal_labels = np.append(ictal_labels,
                                                 temp_ictal_labels)
                    else:
                        X = temp_X
                        y = temp_y
                        y_select_idx = temp_y_select_idx
                        ictal_labels = temp_ictal_labels
                else:
                    print 'There is no good segment during this seizure'

            # Store preictal labels that are from the leave-out-seizure index (used to compute accuracy)
            # Note: this property will exist when which_set=='valid' or which_set=='test'
            #       as there is no need for ictal to be imported.
            self.y_select_idx = y_select_idx

            # Sanity check
            if np.where(y_select_idx == 1)[0].size > np.where(y == 1)[0].size:
                raise Exception(
                    'There is an error in collecting preictal labels only from the leave-out-seizure index.'
                )
            elif np.where(y_select_idx == 1)[0].size == np.where(
                    y == 1)[0].size:
                print 'There is only one preictal period, and this period is from the leave-out-seizure index.'
                if not np.all(
                        np.where(y_select_idx == 1)[0] == np.where(y == 1)[0]):
                    raise Exception(
                        'There is a mismatch between y_select_idx and y.')
            elif np.where(y_select_idx == 1)[0].size < np.where(
                    y == 1)[0].size:
                print 'There is more than one preictal period.'
                if not np.all(
                        np.intersect1d(
                            np.where(y == 1)[0],
                            np.where(y_select_idx == 1)[0]) == np.where(
                                y_select_idx == 1)[0]):
                    raise Exception(
                        'There is a mismatch between y_select_idx and y in the preictal labels of the leave-out-seizure index.'
                    )

            # Store ictal labels
            # Note: this property will exist when which_set=='valid' or which_set=='test'
            #       as there is no need for ictal to be imported.
            self.ictal_labels = ictal_labels
        else:
            raise Exception('Invalid dataset selection')

        X = np.transpose(X, [0, 2, 1])
        one_hot_formatter = OneHotFormatter(max_labels=2)
        y = one_hot_formatter.format(y)

        # Sanity check
        if not (X.shape[0] == self.preictal_samples + self.nonictal_samples):
            raise Exception(
                'There is a mismatch in the number of training samples.')
        if not (np.where(np.argmax(y, axis=1) == 1)[0].size
                == self.preictal_samples):
            raise Exception(
                'There is a mismatch in the number of preictal samples and its labels.'
            )
        if not (X.shape[0] == y.shape[0]):
            raise Exception(
                'There is a mismatch in the number of training samples and its labels.'
            )

        return X, y
Example #24
class IndexSpace(Space):
    """
    A space representing indices, for example MNIST labels (0-9) or the
    indices of words in a dictionary for NLP tasks. A single space can
    contain multiple indices, for example the word indices of an n-gram.

    IndexSpaces can be converted to VectorSpaces in two ways: either the
    labels are converted into one-hot vectors which are then concatenated,
    or they are converted into a single vector in which 1s mark the labels
    present, i.e. for 4 possible labels we have [0, 2] -> [1 0 1 0] (merged)
    or [0, 2] -> [1 0 0 0 0 0 1 0] (concatenated). See the short usage
    sketch after this class.
    """
    def __init__(self, max_labels, dim, **kwargs):
        """
        Initialize an IndexSpace.

        Parameters
        ----------
        max_labels : int
            The number of possible classes/labels. This means that
            all labels should be < max_labels. Example: For MNIST
            there are 10 numbers and hence max_labels = 10.
        dim : int
            The number of indices in one space e.g. for MNIST there is
            one target label and hence dim = 1. If we have an n-gram
            of word indices as input to a neural net language model, dim = n.
        kwargs : passed on to the superclass constructor
        """

        super(IndexSpace, self).__init__(**kwargs)

        self.max_labels = max_labels
        self.dim = dim
        self.formatter = OneHotFormatter(self.max_labels)

    def __str__(self):
        """
        Return a string representation.
        """
        return '%(classname)s(dim=%(dim)s, max_labels=%(max_labels)s)' % \
               dict(classname=self.__class__.__name__,
                    dim=self.dim,
                    max_labels=self.max_labels)

    def __eq__(self, other):
        return (type(self) == type(other) and
                self.max_labels == other.max_labels and
                self.dim == other.dim)

    def __ne__(self, other):
        return (not self == other)

    @functools.wraps(Space.get_total_dimension)
    def get_total_dimension(self):
        return self.dim

    @functools.wraps(Space.np_format_as)
    def np_format_as(self, batch, space):
        if isinstance(space, VectorSpace):
            if self.max_labels == space.dim:
                rval = self.formatter.format(batch, sparse=space.sparse,
                                             mode='merge')
            elif self.dim * self.max_labels == space.dim:
                rval = self.formatter.format(batch, sparse=space.sparse,
                                             mode='concatenate')
            else:
                raise ValueError("Can't convert %s to %s"
                                 % (self, space))
            return rval
        else:
            raise ValueError("Can't convert %s to %s"
                             % (self, space))

    @functools.wraps(Space._format_as)
    def _format_as(self, batch, space):
        """
        Supports formatting to a VectorSpace where indices are represented
        by ones in a binary vector.
        """
        if isinstance(space, VectorSpace):
            if self.max_labels == space.dim:
                rval = self.formatter.theano_expr(batch, sparse=space.sparse,
                                                  mode='merge')
            elif self.dim * self.max_labels == space.dim:
                rval = self.formatter.theano_expr(batch, sparse=space.sparse,
                                                  mode='concatenate')
            else:
                raise ValueError("Can't convert %s to %s"
                                 % (self, space))
            return rval
        else:
            raise ValueError("Can't convert %s to %s"
                             % (self, space))

    @functools.wraps(Space.make_theano_batch)
    def make_theano_batch(self, name=None, dtype=None, batch_size=None):
        if batch_size == 1:
            rval = T.lrow(name=name)
        else:
            rval = T.lmatrix(name=name)
        return rval

    @functools.wraps(Space.batch_size)
    def batch_size(self, batch):
        self.validate(batch)
        return batch.shape[0]

    @functools.wraps(Space.np_batch_size)
    def np_batch_size(self, batch):
        self.np_validate(batch)
        return batch.shape[0]

    @functools.wraps(Space._validate)
    def _validate(self, batch):
        """
        .. todo::

            WRITEME
        """
        if not isinstance(batch, theano.gof.Variable):
            raise TypeError("IndexSpace batch should be a theano Variable, "
                            "got " + str(type(batch)))
        if not isinstance(batch.type, (theano.tensor.TensorType,
                                       CudaNdarrayType)):
            raise TypeError("VectorSpace batch should be TensorType or "
                            "CudaNdarrayType, got "+str(batch.type))
        if batch.ndim != 2:
            raise ValueError('IndexSpace batches must be 2D, got %d '
                             'dimensions' % batch.ndim)
        for val in get_debug_values(batch):
            self.np_validate(val)

    @functools.wraps(Space._np_validate)
    def _np_validate(self, batch):
        # Use the 'CudaNdarray' string to avoid importing theano.sandbox.cuda
        # when it is not available
        if (not isinstance(batch, np.ndarray)
            and str(type(batch)) != "<type 'CudaNdarray'>"):
            raise TypeError("The value of a IndexSpace batch should be a "
                            "numpy.ndarray, or CudaNdarray, but is %s."
                            % str(type(batch)))
        if batch.ndim != 2:
            raise ValueError("The value of a IndexSpace batch must be "
                             "2D, got %d dimensions for %s." % (batch.ndim,
                                                                batch))
        if batch.shape[1] != self.dim:
            raise ValueError("The width of a IndexSpace batch must match "
                             "with the space's dimension, but batch has shape "
                             "%s and dim = %d." % (str(batch.shape), self.dim))
Example #25
    def load_data(self):
        # Get the directory of the patient data
        patient_dir = os.path.join(self.data_dir, self.patient_id)

        # Load metadata about the dataset from a MAT file
        metadata_fname = os.path.join(
            patient_dir, 'trainset_' + str(self.preictal_sec) + '.mat')
        metadata_mat = loadmat(metadata_fname)

        # Get number of seizures
        self.n_seizures = metadata_mat.get('ictals').size

        # Get detail of the segment
        self.sampling_rate = metadata_mat['sampling_rate'][0][0]
        self.segment_sec = metadata_mat['segment_sec'][0][0]
        self.segment_samples = self.sampling_rate * self.segment_sec

        # Get the number of blocks to extend from the withheld seizure
        self.n_extended_blocks_test = metadata_mat['n_extended_blocks_test'][
            0][0]

        self.preictal_samples = 0
        self.nonictal_samples = 0
        self.nan_non_flat_samples = 0

        # Examples of indexing through MAT file
        # mat['nonictals'][i][0]['filename'][0][0][0][j][0]
        # mat['nonictals'][i][0]['idx'][0][0][0][j][0]
        # mat['nonictals'][i][0]['n_segments'][0][0][0][0]

        # Load shuffled data
        if self.which_set == 'train' or self.which_set == 'valid_train':

            if self.which_set == 'train':
                select_idx = np.setdiff1d(
                    range(metadata_mat['preictals'].size),
                    np.asarray([
                        self.leave_out_seizure_idx_valid,
                        self.leave_out_seizure_idx_test
                    ]))
            else:
                select_idx = np.asarray([self.leave_out_seizure_idx_valid])

            X = None
            y = None

            if self.use_all_nonictals:
                temp_preictal_X = None
                for i in select_idx:
                    print '====== Seizure', i, '======'

                    # Pre-ictal
                    temp_X = self.load_feature(
                        part='preictals',
                        list_features=self.list_features,
                        seizure_idx=i,
                        metadata_mat=metadata_mat,
                        patient_dir=patient_dir)

                    if not (temp_preictal_X is None):
                        temp_preictal_X = np.concatenate(
                            (temp_preictal_X, temp_X), axis=1)
                    else:
                        temp_preictal_X = temp_X

                self.preictal_samples = temp_preictal_X.shape[1]

                # Non-ictal data
                temp_nonictal_X = self.load_feature(
                    part='nonictals_all',
                    list_features=self.list_features,
                    seizure_idx=self.leave_out_seizure_idx_test,
                    metadata_mat=metadata_mat,
                    patient_dir=patient_dir)
                X = np.concatenate((temp_preictal_X, temp_nonictal_X), axis=1)
                y = np.zeros(X.shape[1], dtype=int)
                y[range(self.preictal_samples)] = 1

                self.nonictal_samples = temp_nonictal_X.shape[1]

                print 'Preictal samples: {0}, Nonictal samples: {1}'.format(
                    self.preictal_samples, self.nonictal_samples)
                if not np.all(
                        np.arange(self.preictal_samples) == np.where(y)[0]):
                    raise Exception(
                        'There is a mismatch between the number of preictal data and labels.'
                    )

            else:
                for i in select_idx:
                    print '====== Seizure', i, '======'

                    # Non-ictal data
                    temp_nonictal_X = self.load_feature(
                        part='nonictals',
                        list_features=self.list_features,
                        seizure_idx=i,
                        metadata_mat=metadata_mat,
                        patient_dir=patient_dir)

                    # Pre-ictal
                    temp_preictal_X = self.load_feature(
                        part='preictals',
                        list_features=self.list_features,
                        seizure_idx=i,
                        metadata_mat=metadata_mat,
                        patient_dir=patient_dir)

                    # Concatenate preictal and nonictal data
                    temp_X = np.concatenate((temp_preictal_X, temp_nonictal_X),
                                            axis=1)
                    temp_y = np.zeros(temp_X.shape[1], dtype=int)
                    temp_y[range(temp_preictal_X.shape[1])] = 1

                    # Sanity check
                    # if not (temp_preictal_X.shape[1] == temp_nonictal_X.shape[1]):
                    #     raise Exception('Unbalanced classes.')
                    print 'Preictal samples: {0}, Nonictal samples: {1}'.format(
                        temp_preictal_X.shape[1], temp_nonictal_X.shape[1])
                    if not np.all(
                            np.arange(temp_preictal_X.shape[1]) == np.where(
                                temp_y)[0]):
                        raise Exception(
                            'There is a mismatch between the number of preictal data and labels.'
                        )

                    self.preictal_samples = self.preictal_samples + temp_preictal_X.shape[
                        1]
                    self.nonictal_samples = self.nonictal_samples + temp_nonictal_X.shape[
                        1]

                    if not (X is None) and not (y is None):
                        X = np.concatenate((X, temp_X), axis=1)
                        y = np.append(y, temp_y)
                    else:
                        X = temp_X
                        y = temp_y

        # Load continuous data
        elif self.which_set == 'valid' or self.which_set == 'test':

            if self.which_set == 'valid':
                select_idx = self.leave_out_seizure_idx_valid
            else:
                select_idx = self.leave_out_seizure_idx_test

            print '====== Seizure', select_idx, '======'

            # Get metadata of all blocks
            block_df = pd.read_table(os.path.join(patient_dir,
                                                  'block_metadata.txt'),
                                     sep='\t')

            # Get block index of the selected seizure
            select_sz_fname = metadata_mat['preictals'][select_idx][0][
                'filename'][0][0][0][0][0]
            block_idx = np.where(block_df.filename == select_sz_fname)[0][0]

            start_block_idx = block_idx - self.n_extended_blocks_test
            end_block_idx = block_idx + self.n_extended_blocks_test + 1

            if start_block_idx < 0:
                start_block_idx = 0
            if end_block_idx > block_df.shape[0]:
                end_block_idx = block_df.shape[0]

            select_block_idx = np.arange(start_block_idx, end_block_idx)
            filenames = block_df.filename[select_block_idx].values

            X = None
            y = None
            y_label_all = None
            ictal_labels = None
            for b_idx, fname in enumerate(filenames):
                # Name of the MAT file that stores indices of flat (i.e., false) segments
                fname_flat = fname.replace('.data',
                                           '_flat_signal_segment_idx.mat')

                # Get all good indices (i.e., remove segments of flat signals)
                flat_mat = loadmat(os.path.join(patient_dir, fname_flat))
                flat_idx = np.empty(0, dtype=int)
                for j in range(flat_mat['flat_signal_segment_idx'].shape[0]):
                    flat_idx = np.append(
                        flat_idx,
                        np.squeeze(flat_mat['flat_signal_segment_idx'][j][0]))
                flat_idx = flat_idx - 1  # Change from MATLAB to python index system

                n_segments = np.ceil(
                    block_df.samples[select_block_idx[b_idx]] /
                    (self.segment_samples * 1.0))
                all_idx = np.arange(n_segments, dtype=int)
                good_idx = np.setdiff1d(all_idx, flat_idx)

                print 'Load', self.which_set, 'data from', fname

                if good_idx.size > 0:
                    # Features with shape [n_features, n_samples]
                    temp_X = self.load_list_feature(
                        list_features=self.list_features,
                        sample_idx=good_idx,
                        fname=fname,
                        patient_dir=patient_dir)

                    # If this record contains preictal data in the withheld seizures, get preictal labels
                    temp_y_withheld = self.get_labels(
                        label_type='preictals',
                        filename=fname,
                        good_idx=good_idx,
                        metadata_mat=metadata_mat,
                        n_all_segments=n_segments,
                        n_data_segments=temp_X.shape[1],
                        select_meta_idx=select_idx)

                    # If this record contains preictal data in the selected seizures, get preictal labels
                    temp_y_select = self.get_labels(
                        label_type='preictals',
                        filename=fname,
                        good_idx=good_idx,
                        metadata_mat=metadata_mat,
                        n_all_segments=n_segments,
                        n_data_segments=temp_X.shape[1])

                    # If this record contains preictal data in all seizures, get preictal labels
                    temp_y_rm = self.get_labels(
                        label_type='all_preictals',
                        filename=fname,
                        good_idx=good_idx,
                        metadata_mat=metadata_mat,
                        n_all_segments=n_segments,
                        n_data_segments=temp_X.shape[1])

                    tmp_preictal_withheld_idx = np.where(
                        temp_y_withheld == 1)[0]
                    tmp_preictal_select_idx = np.where(temp_y_select == 1)[0]
                    tmp_preictal_rm_idx = np.where(temp_y_rm == 1)[0]
                    tmp_preictal_select_idx = np.setdiff1d(
                        tmp_preictal_select_idx, tmp_preictal_withheld_idx)
                    tmp_preictal_rm_idx = np.setdiff1d(
                        tmp_preictal_rm_idx, tmp_preictal_withheld_idx)
                    tmp_preictal_rm_idx = np.setdiff1d(
                        tmp_preictal_rm_idx, tmp_preictal_select_idx)

                    self.preictal_samples = self.preictal_samples + np.where(
                        temp_y_withheld == 1)[0].size
                    self.nonictal_samples = self.nonictal_samples + np.where(
                        temp_y_withheld == 0)[0].size

                    if tmp_preictal_withheld_idx.size > 0:
                        print ' Load preictal data from the withheld seizure from this file.'
                        print ' Size:', tmp_preictal_withheld_idx.size, tmp_preictal_withheld_idx
                    if tmp_preictal_select_idx.size > 0:
                        print ' Load preictal data from selected seizures in addition to the withheld seizure from this file.'
                        print ' Size:', tmp_preictal_select_idx.size, tmp_preictal_select_idx
                    if tmp_preictal_rm_idx.size > 0:
                        print ' Load preictal data from removed seizures in addition to the withheld seizure from this file.'
                        print ' Size:', tmp_preictal_rm_idx.size, tmp_preictal_rm_idx

                    # Sanity check
                    if np.intersect1d(tmp_preictal_withheld_idx,
                                      tmp_preictal_select_idx).size > 0:
                        raise Exception(
                            'There is an overlap of labels between the withheld seizures and the selected seizures.'
                        )
                    if np.intersect1d(tmp_preictal_select_idx,
                                      tmp_preictal_rm_idx).size > 0:
                        raise Exception(
                            'There is an overlap of labels between the selected seizures and the removed seizures.'
                        )
                    if np.intersect1d(tmp_preictal_withheld_idx,
                                      tmp_preictal_rm_idx).size > 0:
                        raise Exception(
                            'There is an overlap of labels between the withheld seizures and the removed seizures.'
                        )

                    temp_y_all = np.zeros(temp_X.shape[1], dtype=int)
                    temp_y_all[
                        tmp_preictal_withheld_idx] = 1  # Labels for the withheld seizure
                    temp_y_all[
                        tmp_preictal_select_idx] = 2  # Labels for the selected seizure (that is not from withheld seizures)
                    temp_y_all[
                        tmp_preictal_rm_idx] = 3  # Labels for the removed seizure (that is not from withheld seizures)

                    # If this record contains ictal data, get ictal labels
                    temp_ictal_labels = self.get_labels(
                        label_type='all_ictals',
                        filename=fname,
                        good_idx=good_idx,
                        metadata_mat=metadata_mat,
                        n_all_segments=n_segments,
                        n_data_segments=temp_X.shape[1])

                    tmp_ictal_idx = np.where(temp_ictal_labels == 1)[0]
                    if tmp_ictal_idx.size > 0:
                        print ' Ictal label:', tmp_ictal_idx.size, tmp_ictal_idx

                    # Deal with NaN features that remain after filtering out flat segments;
                    # these NaNs come from noise in the data, not from flat segments
                    # (a standalone sketch of this step follows this function).
                    nan_sample_idx = np.where(np.isnan(np.sum(temp_X, 0)))[0]
                    nan_feature_idx = np.where(np.isnan(np.sum(temp_X, 1)))[0]
                    if nan_sample_idx.size > 0 or nan_feature_idx.size > 0:
                        print self.which_set, 'contains NaN at:'
                        print ' sample_idx:', good_idx[
                            nan_sample_idx], ' feature_idx:', nan_feature_idx
                        print ' shape before removing NaN:', temp_X.shape
                        tmp_preictal_idx = np.where(temp_y_withheld == 1)[0]
                        tmp_nonictal_idx = np.where(temp_y_withheld == 0)[0]
                        nan_preictal_sample_idx = np.intersect1d(
                            tmp_preictal_idx, nan_sample_idx)
                        nan_nonictal_sample_idx = np.intersect1d(
                            tmp_nonictal_idx, nan_sample_idx)
                        if nan_preictal_sample_idx.size > 0:
                            print ' NaN are in preictal index:', good_idx[
                                nan_preictal_sample_idx]
                        if nan_nonictal_sample_idx.size > 0:
                            print ' NaN are in nonictal index:', good_idx[
                                nan_nonictal_sample_idx]
                        all_idx = np.arange(temp_X.shape[1])
                        good_idx_1 = np.setdiff1d(all_idx, nan_sample_idx)
                        temp_X = temp_X[:, good_idx_1]
                        temp_y_all = temp_y_all[good_idx_1]
                        temp_y_withheld = temp_y_withheld[good_idx_1]
                        temp_ictal_labels = temp_ictal_labels[good_idx_1]
                        print ' shape after removing NaN:', temp_X.shape
                        self.nan_non_flat_samples = self.nan_non_flat_samples + nan_sample_idx.size

                    # Sanity check
                    tmp_nan_sample_idx = np.where(np.isnan(np.sum(temp_X,
                                                                  0)))[0]
                    if tmp_nan_sample_idx.size > 0:
                        raise Exception('There is an error in removing NaN')
                    if not (temp_X.shape[1] == temp_y_all.size):
                        raise Exception(
                            'Number of feature data and labels [temp_y_all] are not equal.'
                        )
                    if not (temp_X.shape[1] == temp_y_withheld.size):
                        raise Exception(
                            'Number of feature data and labels [temp_y_withheld] are not equal.'
                        )
                    if not (temp_X.shape[1] == temp_ictal_labels.size):
                        raise Exception(
                            'Number of feature data and labels [ictal_labels] are not equal.'
                        )

                    if not (X is None) and not (y is None) and not (
                            ictal_labels is None):
                        X = np.concatenate((X, temp_X), axis=1)
                        y = np.append(y, temp_y_withheld)
                        y_label_all = np.append(y_label_all, temp_y_all)
                        ictal_labels = np.append(ictal_labels,
                                                 temp_ictal_labels)
                    else:
                        X = temp_X
                        y = temp_y_withheld
                        y_label_all = temp_y_all
                        ictal_labels = temp_ictal_labels
                else:
                    print 'There is no good segment during this seizure'

            # Store preictal labels from the withheld seizure index (used to compute accuracy), the selected seizure index,
            # and the removed seizure index.
            # Note: this property will exist when which_set=='valid' or which_set=='test'
            #       as there is no need for ictal to be imported.
            self.y_label_all = y_label_all

            # Sanity check
            if np.where(y == 1)[0].size > np.where(y_label_all > 0)[0].size:
                raise Exception(
                    'There is an error in collecting preictal labels only from the leave-out-seizure index.'
                )
            if np.where(y == 1)[0].size == np.where(y_label_all == 1)[0].size:
                print 'There is only one preictal period, and this period is from the leave-out-seizure index.'
                if not np.all(
                        np.where(y == 1)[0] == np.where(y_label_all == 1)[0]):
                    raise Exception(
                        'There is a mismatch between y and y_label_all.')
            if np.where(y == 1)[0].size < np.where(y_label_all > 0)[0].size:
                print 'There is more than one preictal period.'
                if not np.all(
                        np.where(y == 1)[0] == np.where(y_label_all == 1)[0]):
                    raise Exception(
                        'There is a mismatch between y_select_idx and y in the preictal labels of the leave-out-seizure index.'
                    )

            # Store ictal labels
            # Note: this property will exist when which_set=='valid' or which_set=='test'
            #       as there is no need for ictal to be imported.
            self.ictal_labels = ictal_labels
        else:
            raise Exception('Invalid dataset selection')

        print 'There are {0} samples that have been removed due to NaN, in addition to the flat-signal segments.'.format(
            self.nan_non_flat_samples)

        X = np.transpose(X, [1, 0])
        one_hot_formatter = OneHotFormatter(max_labels=2)
        y = one_hot_formatter.format(y)

        # Sanity check
        # Note: We ignore nan_non_flat_samples when loading shuffled data, as the labels are specified after the NaNs have been removed.
        #       In contrast, when loading continuous data the labels are specified before removing NaNs, so the NaN samples have to be subtracted in the check.
        if self.which_set == 'train' or self.which_set == 'valid_train':
            if not (X.shape[0]
                    == self.preictal_samples + self.nonictal_samples):
                raise Exception(
                    'There is a mismatch in the number of training samples ({0} != {1}).'
                    .format(X.shape[0],
                            self.preictal_samples + self.nonictal_samples))
            if not (np.where(np.argmax(y, axis=1) == 1)[0].size
                    == self.preictal_samples):
                raise Exception(
                    'There is a mismatch in the number of preictal samples and its labels ({0} != {1}).'
                    .format(
                        np.where(np.argmax(y, axis=1) == 1)[0].size,
                        self.preictal_samples))
            if not (X.shape[0] == y.shape[0]):
                raise Exception(
                    'There is a mismatch in the number of training samples and its labels ({0} != {1}).'
                    .format(X.shape[0], y.shape[0]))
        elif self.which_set == 'valid' or self.which_set == 'test':
            if not (X.shape[0] == self.preictal_samples +
                    self.nonictal_samples - self.nan_non_flat_samples):
                raise Exception(
                    'There is a mismatch in the number of training samples ({0} != {1}).'
                    .format(
                        X.shape[0], self.preictal_samples +
                        self.nonictal_samples - self.nan_non_flat_samples))
            if not ((np.where(np.argmax(y, axis=1) == 1)[0].size +
                     np.where(np.argmax(y, axis=1) == 0)[0].size)
                    == self.preictal_samples + self.nonictal_samples -
                    self.nan_non_flat_samples):
                raise Exception(
                    'There is a mismatch in the number of samples and its labels ({0} != {1}).'
                    .format(
                        np.where(np.argmax(y, axis=1) == 1)[0].size +
                        np.where(np.argmax(y, axis=1) == 0)[0].size,
                        self.preictal_samples + self.nonictal_samples -
                        self.nan_non_flat_samples))
            if not (X.shape[0] == y.shape[0]):
                raise Exception(
                    'There is a mismatch in the number of training samples and its labels ({0} != {1}).'
                    .format(X.shape[0], y.shape[0]))

        return X, y
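The NaN handling inside load_data() above boils down to dropping every sample (column) that contains a NaN feature while keeping all label arrays aligned. Below is a minimal standalone sketch of just that step, using made-up arrays rather than the original data.

import numpy as np

# Hypothetical features with shape [n_features, n_samples] and aligned labels.
temp_X = np.array([[0.1, np.nan, 0.3, 0.4],
                   [0.5, 0.6,    0.7, 0.8]])
temp_y = np.array([0, 1, 0, 1])

# Columns whose feature sum is NaN, i.e. columns with at least one NaN feature.
nan_sample_idx = np.where(np.isnan(np.sum(temp_X, axis=0)))[0]

# Keep only the clean columns and the matching labels.
good_idx = np.setdiff1d(np.arange(temp_X.shape[1]), nan_sample_idx)
temp_X = temp_X[:, good_idx]
temp_y = temp_y[good_idx]

assert not np.isnan(temp_X).any()
assert temp_X.shape[1] == temp_y.size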
Example #26
    def load_data(self, which_set, sample_size_second, batch_size, scaler_path):
        raw_data, raw_labels, channel_labels, \
        seizure_range_idx, seizure_range_second, seizure_seconds, \
        n_channels, sample_size, sampling_rate = self.load_source_data(sample_size_second)

        self.channel_labels = channel_labels
        self.seizure_seconds_src = seizure_seconds
        self.sampling_rate = sampling_rate
        self.raw_data = raw_data

        # Generate seizure index (rounded to be divisible by the sampling rate)
        seizure_round_sample_idx = np.empty(seizure_range_second.size, dtype=object)
        for r in range(seizure_range_second.size):
            start_idx = seizure_range_second[r][0] * sampling_rate
            end_idx = seizure_range_second[r][-1] * sampling_rate
            seizure_round_sample_idx[r] = np.arange(start_idx, end_idx)

        # Generate non-seizure index
        non_seizure_round_sample_idx = np.arange(raw_data.shape[1])
        for s_idx in seizure_round_sample_idx:
            non_seizure_round_sample_idx = np.setdiff1d(non_seizure_round_sample_idx,
                                                        s_idx)

        # Partition non-seizure data into segments
        # Then randomly choose for training, cv and test sets
        n_segments = 10
        segment_size = non_seizure_round_sample_idx.size / n_segments
        segment_size = segment_size - (segment_size % sampling_rate)
        segment_idx = np.empty(n_segments, dtype=object)
        for i in range(n_segments):
            start_segment_idx = i * segment_size
            end_segment_idx = (i+1) * segment_size
            if end_segment_idx > non_seizure_round_sample_idx.size:
                end_segment_idx = non_seizure_round_sample_idx.size
            segment_idx[i] = np.arange(start_segment_idx, end_segment_idx)

        # Select test seizure index
        test_seizure_idx = self.leave_one_out_seizure
        np.random.seed(test_seizure_idx)

        # Leave-one-out cross-validation - seizure
        n_seizures = seizure_range_idx.shape[0]
        rest_seizure_idx = np.setdiff1d(np.arange(n_seizures), test_seizure_idx)
        perm_rest_seizure_idx = np.random.permutation(rest_seizure_idx)
        train_seizure_idx = perm_rest_seizure_idx
        cv_seizure_idx = perm_rest_seizure_idx

        # Leave-one-out cross-validation - non-seizure
        n_train_segments = int(n_segments * 0.6)
        n_cv_segments = int(n_segments * 0.2)
        non_seizure_segment_idx = np.arange(n_segments)
        perm_non_seizure_segment_idx = np.random.permutation(non_seizure_segment_idx)
        train_sample_segments = perm_non_seizure_segment_idx[:n_train_segments]
        cv_sample_segments = perm_non_seizure_segment_idx[n_train_segments:n_train_segments+n_cv_segments]
        test_sample_segments = perm_non_seizure_segment_idx[n_train_segments+n_cv_segments:]
        train_sample_idx = np.empty(0, dtype=int)
        for s in train_sample_segments:
            train_sample_idx = np.append(train_sample_idx, segment_idx[s])
        cv_sample_idx = np.empty(0, dtype=int)
        for s in cv_sample_segments:
            cv_sample_idx = np.append(cv_sample_idx, segment_idx[s])
        test_sample_idx = np.empty(0, dtype=int)
        for s in test_sample_segments:
            test_sample_idx = np.append(test_sample_idx, segment_idx[s])

        print 'Segment index for train, cv and test sets:', \
              train_sample_segments, cv_sample_segments, test_sample_segments

        print 'Seizure index for train, cv and test sets:', \
              train_seizure_idx, cv_seizure_idx, [test_seizure_idx]

        if which_set == 'train':
            print("Loading training data...")
            data = raw_data[:,non_seizure_round_sample_idx[train_sample_idx]]
            labels = raw_labels[non_seizure_round_sample_idx[train_sample_idx]]
            select_seizure = train_seizure_idx
        elif which_set == 'valid':
            print("Loading validation data...")
            data = raw_data[:,non_seizure_round_sample_idx[cv_sample_idx]]
            labels = raw_labels[non_seizure_round_sample_idx[cv_sample_idx]]
            select_seizure = cv_seizure_idx
        elif which_set == 'test':
            print("Loading test data...")
            data = raw_data[:,non_seizure_round_sample_idx[test_sample_idx]]
            labels = raw_labels[non_seizure_round_sample_idx[test_sample_idx]]
            select_seizure = [test_seizure_idx]
        elif which_set == 'all':
            print("Loading all data...")
            data = raw_data
            labels = raw_labels
            select_seizure = []
        else:
            raise ValueError('Invalid set.')

        # Add seizure data
        for sz in select_seizure:
            data = np.concatenate((data, raw_data[:, seizure_round_sample_idx[sz]]), axis=1)
            labels = np.concatenate((labels, raw_labels[seizure_round_sample_idx[sz]]), axis=1)

        # No filtering

        # Preprocessing
        if which_set == 'train':
            scaler = preprocessing.StandardScaler()
            scaler = scaler.fit(data.transpose())

            with open(scaler_path, 'w') as f:
                pickle.dump(scaler, f)

            data = scaler.transform(data.transpose()).transpose()
        else:
            with open(scaler_path) as f:
                scaler = pickle.load(f)

            data = scaler.transform(data.transpose()).transpose()

        # Input transformation
        X = np.reshape(data, (-1, sample_size))
        y = np.reshape(labels, (-1, sample_size))
        y = np.sum(y, 1).transpose()
        y[y > 0] = 1

        print 'Seizure index after transform:', np.where(y)[0]
        self.seizure_seconds = np.where(y)[0]

        # Duplicate the labels for all channels
        y = np.tile(y, n_channels)

        # Format the target into proper format
        n_classes = 2
        one_hot_formatter = OneHotFormatter(max_labels=n_classes)
        y = one_hot_formatter.format(y)

        # Check batch size
        cut_off = X.shape[0] % batch_size
        if cut_off > 0:
            X = X[:-cut_off,:]
            y = y[:-cut_off,:]

        return X, y, n_channels, sample_size
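The input-transformation step in load_data() above collapses the per-sample labels into a single binary label per window and then one-hot encodes it. A small standalone sketch of that collapse (illustrative sizes and names only, not the original data) could look like this:

import numpy as np
from pylearn2.format.target_format import OneHotFormatter

sample_size = 4                        # hypothetical samples per window
labels = np.array([0, 0, 0, 0,         # window 0: no seizure sample
                   0, 1, 1, 0])        # window 1: contains seizure samples

# One label per window: 1 if any sample in the window is marked as seizure.
y = np.sum(np.reshape(labels, (-1, sample_size)), axis=1)
y[y > 0] = 1                           # -> array([0, 1])

# Two-class one-hot targets, as in load_data() above.
y = OneHotFormatter(max_labels=2).format(y)   # shape (2, 2)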
Example #27
def _build_frames_w_phn(dataset, subset, wav_seqs, seqs_to_phns,
                        in_samples, out_samples, shift,
                        win_width, shuffle):
        
    #import pdb; pdb.set_trace()
    norm_seqs = utils.standardize(wav_seqs)
    #norm_seqs = utils.normalize(wav_seqs)
    
    frame_len = in_samples + out_samples
    overlap = frame_len - shift
    
    samples = []
    seqs_phn_info = []
    seqs_phn_shift = []

        
    # CAUTION: the reduced phone set is used here.
    # We could also try the full set, but then we must store phn+1
    # because 0 would no longer refer to 'h#' (no speech).

    for ind in range(len(norm_seqs)):
        #import pdb; pdb.set_trace()
        wav_seq = norm_seqs[ind]
        phn_seq = seqs_to_phns[ind]
        phn_start_end = dataset.__dict__[subset+"_phn"][phn_seq[0]:phn_seq[1]]

        # create a matrix with consecutive windows
        # phones are padded with h#, because each window will be shifted once
        # the first phone's samples have passed

        phones = np.append(phn_start_end[:,2].astype('int16'),
                           np.zeros((1,),dtype='int16'))
        # phones = np.append(phn_start_end[:,2],
        #                    np.zeros((1,)))

        phn_windows = segment_axis(phones, win_width, win_width-1)

        # array that has endings of each phone
        phn_ends = phn_start_end[:,1]
        # extend the last phone till the end, this is not wrong as long as the
        # last phone is the no-speech phone (h#)
        phn_ends[-1] = wav_seq.shape[0]-1

        # create a mapping from each sample to phn_window
        phn_win_shift = np.zeros_like(wav_seq,dtype='int16')
        phn_win_shift[phn_ends] = 1
        phn_win = phn_win_shift.cumsum(dtype='int16')
        # minor correction!
        phn_win[-1] = phn_win[-2]

        # Segment samples into frames
        samples.append(segment_axis(wav_seq, frame_len, overlap))

        # for phones we care only about one value to mark the start of a new window.
        # the start of a phone window in a frame is when all samples of the previous
        # phone have passed, so we use the 'min' function to choose the current phone
        # of the frame
        phn_frames = segment_axis(phn_win, frame_len, overlap).min(axis=1)
        # replace the window index with the window itself
        win_frames = phn_windows[phn_frames]
        seqs_phn_info.append(win_frames)

        #import pdb; pdb.set_trace()
        # create a window shift for each frame
        shift_frames_aux = np.roll(phn_frames,1)
        shift_frames_aux[0] = 0
        shift_frames = phn_frames - shift_frames_aux
        # to mark the ending of the sequence - countering the first correction!
        shift_frames[-1] = 1
        seqs_phn_shift.append(shift_frames)
        #import pdb; pdb.set_trace()
    
        
    #import pdb; pdb.set_trace()
    # stack all data in one matrix, each row is a frame
    samples_data = np.vstack(samples[:])
    phn_data = np.vstack(seqs_phn_info[:])
    shift_data = np.hstack(seqs_phn_shift[:])

    
    #convert phone data to one-hot
    from pylearn2.format.target_format import OneHotFormatter
    fmt = OneHotFormatter(max_labels=39, dtype='float32')
    
    phn_data = fmt.format(phn_data)
    phn_data = phn_data.reshape(phn_data.shape[0],
                                phn_data.shape[1]*phn_data.shape[2])
    
    full_data = np.hstack([samples_data[:,:in_samples], phn_data, #input
                           samples_data[:,in_samples:], #out1
                           shift_data.reshape(shift_data.shape[0],1)]) #out2
    
    if shuffle:
        np.random.seed(123)
        full_data = np.random.permutation(full_data)

    
    data_x = full_data[:,:in_samples+win_width*39]
    data_y1 = full_data[:,in_samples+win_width*39:-1]
    data_y2 = full_data[:,-1]
    
        
    print 'Done'
    print 'There are %d examples in %s set'%(data_x.shape[0],subset)

    print "--------------"
    print 'data_x.shape', data_x.shape
    print 'data_y1.shape', data_y1.shape
    
    return utils.shared_dataset(data_x), \
           utils.shared_dataset(data_y1),\
           utils.shared_dataset(data_y2)
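In _build_frames_w_phn() above, each frame's phone window is one-hot encoded per position and then flattened, so a frame ends up contributing a single win_width * 39 vector to the input. A toy sketch of that encode-and-flatten step (hypothetical values, reduced phone set assumed as in the original):

import numpy as np
from pylearn2.format.target_format import OneHotFormatter

win_width, n_phones = 3, 39
# Hypothetical phone windows for 2 frames; indices lie in [0, 39).
phn_data = np.array([[ 5, 12,  0],
                     [12,  0,  0]], dtype='int16')

fmt = OneHotFormatter(max_labels=n_phones, dtype='float32')
phn_data = fmt.format(phn_data)                   # (2, 3, 39), default 'stack' mode
phn_data = phn_data.reshape(phn_data.shape[0],
                            phn_data.shape[1] * phn_data.shape[2])   # (2, 117)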
Example #28
import numpy
from pylearn2_timit.timitlpc import TIMITlpc
from pylearn2.space import CompositeSpace, VectorSpace, IndexSpace
from pylearn2.format.target_format import OneHotFormatter

valid = TIMITlpc("valid", frame_length=160, overlap=159, start=10, stop=11)

valid._iter_data_specs = (CompositeSpace((IndexSpace(dim=3,max_labels=61), VectorSpace(dim=10),)), ('phones', 'lpc_features'))

formatter = OneHotFormatter(max_labels=62)

f = lambda x: formatter.format(numpy.asarray(x, dtype=int), mode='merge')

#valid._iter_convert = [f, None]

it = valid.iterator(mode='random_uniform', batch_size=100, num_batches=100)
Example #29
    def load_data(self, which_set, sample_size_second, batch_size,
                  scaler_path):
        raw_data, raw_labels, channel_labels, \
        seizure_range_idx, seizure_range_second, seizure_seconds, \
        n_channels, sample_size, sampling_rate = self.load_source_data(sample_size_second)

        self.channel_labels = channel_labels
        self.seizure_seconds_src = seizure_seconds
        self.sampling_rate = sampling_rate
        self.raw_data = raw_data

        # Generate seizure index (rounded to be divisible by the sampling rate)
        seizure_round_sample_idx = np.empty(seizure_range_second.size,
                                            dtype=object)
        for r in range(seizure_range_second.size):
            start_idx = seizure_range_second[r][0] * sampling_rate
            end_idx = seizure_range_second[r][-1] * sampling_rate
            seizure_round_sample_idx[r] = np.arange(start_idx, end_idx)

        # Generate non-seizure index
        non_seizure_round_sample_idx = np.arange(raw_data.shape[1])
        for s_idx in seizure_round_sample_idx:
            non_seizure_round_sample_idx = np.setdiff1d(
                non_seizure_round_sample_idx, s_idx)

        # Partition non-seizure data into segments
        # Then randomly choose for training, cv and test sets
        n_segments = 10
        segment_size = non_seizure_round_sample_idx.size / n_segments
        segment_size = segment_size - (segment_size % sampling_rate)
        segment_idx = np.empty(n_segments, dtype=object)
        for i in range(n_segments):
            start_segment_idx = i * segment_size
            end_segment_idx = (i + 1) * segment_size
            if end_segment_idx > non_seizure_round_sample_idx.size:
                end_segment_idx = non_seizure_round_sample_idx.size
            segment_idx[i] = np.arange(start_segment_idx, end_segment_idx)

        # Select test seizure index
        test_seizure_idx = self.leave_one_out_seizure
        np.random.seed(test_seizure_idx)

        # Leave-one-out cross-validation - seizure
        n_seizures = seizure_range_idx.shape[0]
        rest_seizure_idx = np.setdiff1d(np.arange(n_seizures),
                                        test_seizure_idx)
        perm_rest_seizure_idx = np.random.permutation(rest_seizure_idx)
        train_seizure_idx = perm_rest_seizure_idx
        cv_seizure_idx = perm_rest_seizure_idx

        # Leave-one-out cross-validation - non-seizure
        n_train_segments = int(n_segments * 0.6)
        n_cv_segments = int(n_segments * 0.2)
        non_seizure_segment_idx = np.arange(n_segments)
        perm_non_seizure_segment_idx = np.random.permutation(
            non_seizure_segment_idx)
        train_sample_segments = perm_non_seizure_segment_idx[:n_train_segments]
        cv_sample_segments = perm_non_seizure_segment_idx[
            n_train_segments:n_train_segments + n_cv_segments]
        test_sample_segments = perm_non_seizure_segment_idx[n_train_segments +
                                                            n_cv_segments:]
        train_sample_idx = np.empty(0, dtype=int)
        for s in train_sample_segments:
            train_sample_idx = np.append(train_sample_idx, segment_idx[s])
        cv_sample_idx = np.empty(0, dtype=int)
        for s in cv_sample_segments:
            cv_sample_idx = np.append(cv_sample_idx, segment_idx[s])
        test_sample_idx = np.empty(0, dtype=int)
        for s in test_sample_segments:
            test_sample_idx = np.append(test_sample_idx, segment_idx[s])

        print 'Segment index for train, cv and test sets:', \
              train_sample_segments, cv_sample_segments, test_sample_segments

        print 'Seizure index for train, cv and test sets:', \
              train_seizure_idx, cv_seizure_idx, [test_seizure_idx]

        if which_set == 'train':
            print("Loading training data...")
            data = raw_data[:, non_seizure_round_sample_idx[train_sample_idx]]
            labels = raw_labels[non_seizure_round_sample_idx[train_sample_idx]]
            select_seizure = train_seizure_idx
        elif which_set == 'valid':
            print("Loading validation data...")
            data = raw_data[:, non_seizure_round_sample_idx[cv_sample_idx]]
            labels = raw_labels[non_seizure_round_sample_idx[cv_sample_idx]]
            select_seizure = cv_seizure_idx
        elif which_set == 'test':
            print("Loading test data...")
            data = raw_data[:, non_seizure_round_sample_idx[test_sample_idx]]
            labels = raw_labels[non_seizure_round_sample_idx[test_sample_idx]]
            select_seizure = [test_seizure_idx]
        elif which_set == 'all':
            print("Loading all data...")
            data = raw_data
            labels = raw_labels
            select_seizure = []
        else:
            raise ValueError('Invalid set.')

        # Add seizure data
        for sz in select_seizure:
            data = np.concatenate(
                (data, raw_data[:, seizure_round_sample_idx[sz]]), axis=1)
            labels = np.concatenate(
                (labels, raw_labels[seizure_round_sample_idx[sz]]), axis=1)

        # No filtering

        # Preprocessing
        if which_set == 'train':
            scaler = preprocessing.StandardScaler()
            scaler = scaler.fit(data.transpose())

            with open(scaler_path, 'w') as f:
                pickle.dump(scaler, f)

            data = scaler.transform(data.transpose()).transpose()
        else:
            with open(scaler_path) as f:
                scaler = pickle.load(f)

            data = scaler.transform(data.transpose()).transpose()

        # Input transformation
        X = np.reshape(data, (-1, sample_size))
        y = np.reshape(labels, (-1, sample_size))
        y = np.sum(y, 1).transpose()
        y[y > 0] = 1

        print 'Seizure index after transform:', np.where(y)[0]
        self.seizure_seconds = np.where(y)[0]

        # Duplicate the labels for all channels
        y = np.tile(y, n_channels)

        # Format the target into proper format
        n_classes = 2
        one_hot_formatter = OneHotFormatter(max_labels=n_classes)
        y = one_hot_formatter.format(y)

        # Check batch size
        cut_off = X.shape[0] % batch_size
        if cut_off > 0:
            X = X[:-cut_off, :]
            y = y[:-cut_off, :]

        return X, y, n_channels, sample_size
Example #30
    def __init__(self,
                 which_set,
                 onehot_dtype='uint8',
                 center=False,
                 rescale=False,
                 gcn=None,
                 start=None,
                 stop=None,
                 axes=('b', 0, 1, 'c'),
                 toronto_prepro=False,
                 preprocessor=None):
        """Modified version of the CIFAR10 constructor which creates Y
        as one-hot vectors rather than simple indexes. This is super
        hacky. Sorry, Guido.."""

        # note: there is no such thing as the cifar10 validation set;
        # pylearn1 defined one but really it should be user-configurable
        # (as it is here)

        self.axes = axes

        # we define here:
        dtype = 'uint8'
        ntrain = 50000
        nvalid = 0  # artefact, we won't use it
        ntest = 10000

        # we also expose the following details:
        self.img_shape = (3, 32, 32)
        self.img_size = numpy.prod(self.img_shape)
        self.n_classes = 10
        self.label_names = [
            'airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog',
            'horse', 'ship', 'truck'
        ]

        # prepare loading
        fnames = ['data_batch_%i' % i for i in range(1, 6)]
        datasets = {}
        datapath = os.path.join(
            string_utils.preprocess('${PYLEARN2_DATA_PATH}'), 'cifar10',
            'cifar-10-batches-py')
        for name in fnames + ['test_batch']:
            fname = os.path.join(datapath, name)
            if not os.path.exists(fname):
                raise IOError(fname + " was not found. You probably need to "
                              "download the CIFAR-10 dataset by using the "
                              "download script in "
                              "pylearn2/scripts/datasets/download_cifar10.sh "
                              "or manually from "
                              "http://www.cs.utoronto.ca/~kriz/cifar.html")
            datasets[name] = cache.datasetCache.cache_file(fname)

        lenx = int(numpy.ceil((ntrain + nvalid) / 10000.) * 10000)
        x = numpy.zeros((lenx, self.img_size), dtype=dtype)
        y = numpy.zeros((lenx, 1), dtype=dtype)

        # load train data
        nloaded = 0
        for i, fname in enumerate(fnames):
            _logger.info('loading file %s' % datasets[fname])
            data = serial.load(datasets[fname])
            x[i * 10000:(i + 1) * 10000, :] = data['data']
            y[i * 10000:(i + 1) * 10000, 0] = data['labels']
            nloaded += 10000
            if nloaded >= ntrain + nvalid + ntest:
                break

        # load test data
        _logger.info('loading file %s' % datasets['test_batch'])
        data = serial.load(datasets['test_batch'])

        # process this data
        Xs = {'train': x[0:ntrain], 'test': data['data'][0:ntest]}

        Ys = {'train': y[0:ntrain], 'test': data['labels'][0:ntest]}

        X = numpy.cast['float32'](Xs[which_set])

        y = Ys[which_set]
        if isinstance(y, list):
            y = numpy.asarray(y).astype(dtype)
        if which_set == 'test':
            assert y.shape[0] == 10000
            y = y.reshape((y.shape[0], 1))

        formatter = OneHotFormatter(self.n_classes, dtype=onehot_dtype)
        y = formatter.format(y, mode='concatenate')

        if center:
            X -= 127.5
        self.center = center

        if rescale:
            X /= 127.5
        self.rescale = rescale

        if toronto_prepro:
            assert not center
            assert not gcn
            X = X / 255.
            if which_set == 'test':
                other = CIFAR10(which_set='train')
                oX = other.X
                oX /= 255.
                X = X - oX.mean(axis=0)
            else:
                X = X - X.mean(axis=0)
        self.toronto_prepro = toronto_prepro

        self.gcn = gcn
        if gcn is not None:
            gcn = float(gcn)
            X = global_contrast_normalize(X, scale=gcn)

        if start is not None:
            # This needs to come after the prepro so that it doesn't
            # change the pixel means computed above for toronto_prepro
            assert start >= 0
            assert stop > start
            assert stop <= X.shape[0]
            X = X[start:stop, :]
            y = y[start:stop, :]
            assert X.shape[0] == y.shape[0]

        if which_set == 'test':
            assert X.shape[0] == 10000

        view_converter = dense_design_matrix.DefaultViewConverter((32, 32, 3),
                                                                  axes)

        super(CIFAR10, self).__init__(
            X=X,
            y=y,
            view_converter=view_converter,
        )  #y_labels=self.n_classes)

        assert not contains_nan(self.X)

        if preprocessor:
            preprocessor.apply(self)

        # Another hack: rename 'targets' to match model expectations
        space, (X_source, y_source) = self.data_specs
        self.data_specs = (space, (X_source, 'condition'))
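The formatting step in this constructor takes the (N, 1) column of class indices and produces an (N, 10) one-hot matrix; because every row holds exactly one index, 'concatenate' mode reduces to plain one-hot encoding here. A minimal sketch with toy labels (not CIFAR-10 data):

import numpy as np
from pylearn2.format.target_format import OneHotFormatter

y = np.array([[3], [0], [9]], dtype='uint8')      # (3, 1) column of class indices
formatter = OneHotFormatter(10, dtype='uint8')
y = formatter.format(y, mode='concatenate')       # (3, 10) one-hot rows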
Example #31
    def __init__(self,
                 db,                # data source
                 name = '',         # optional name

                 selectors = dict(),

                 partitioner = None,

                 meta_sources = [],     # optional sources other than 'features' and 'targets' from metadata

                 channel_filter = NoChannelFilter(),   # optional channel filter, default: keep all
                 channel_names = None,  # optional channel names (for metadata)

                 label_attribute = 'label', # metadata attribute to be used as label
                 label_map = None,      # optional conversion of labels
                 use_targets = True,    # use targets if provided, otherwise labels are used

                 remove_dc_offset = False,  # optional subtraction of channel mean, usually done already earlier
                 resample = None,       # optional down-sampling
                 normalize = True,      # normalize to max=1

                 # optional sub-sequences selection
                 start_sample = 0,
                 stop_sample  = None,   # optional for selection of sub-sequences
                 zero_padding = True,   # if True (default), trials that are too short will be padded with zeros,
                                        # otherwise they will be rejected.

                 # optional signal filter to by applied before splitting the signal
                 signal_filter = None,

                 trial_processors = [],     # optional processing of the trials
                 target_processor = None,   # optional processing of the targets, e.g. zero-padding
                 transformers = [],         # optional transformations of the dataset

                 layout='tf',       # (0,1)-axes layout tf=time x features or ft=features x time

                 debug=False,
                 ):
        '''
        Constructor
        '''

        # save params
        self.params = locals().copy()
        del self.params['self']
        # print self.params

        self.name = name
        self.debug = debug

        metadb = DatasetMetaDB(db.metadata, selectors.keys())

        if partitioner is not None:
            pass # FIXME

        selected_trial_ids = metadb.select(selectors)
        log.info('selectors: {}'.format(selectors))
        log.info('selected trials: {}'.format(selected_trial_ids))

        if normalize:
            log.info('Data will be normalized to max amplitude 1 per channel (normalize=True).')

        trials = list()
        labels = list()
        targets = list()
        meta = list()

        if stop_sample == 'auto-min':
            stop_sample = np.min([db.data[trial_i].shape[-1] for trial_i in selected_trial_ids])
            log.info('Using minimum trial length. stop_sample={}'.format(stop_sample))
        elif stop_sample ==  'auto-max':
            stop_sample = np.max([db.data[trial_i].shape[-1] for trial_i in selected_trial_ids])
            log.info('Using maximum trial length. stop_sample={}'.format(stop_sample))

        for trial_i in selected_trial_ids:

            trial_meta = db.metadata[trial_i]

            if use_targets:
                if db.targets is None:  # data source provides no targets
                    target = None
                else:
                    target = db.targets[trial_i]
                    assert not np.isnan(np.sum(target))

                if target_processor is not None:
                    target = target_processor.process(target, trial_meta)

                    assert not np.isnan(np.sum(target))
            else:
                # get and process label
                label = db.metadata[trial_i][label_attribute]
                if label_map is not None:
                    label = label_map[label]

            processed_trial = []

            trial = db.data[trial_i]

            if np.isnan(np.sum(trial)):
                print trial_i, trial

            assert not np.isnan(np.sum(trial))

            rejected = False # flag for trial rejection

            trial = np.atleast_2d(trial)

            # process 1 channel at a time
            for channel in xrange(trial.shape[0]):
                # filter channels
                if not channel_filter.keep_channel(channel):
                    continue

                samples = trial[channel, :]

                # subtract channel mean
                if remove_dc_offset:
                    samples -= samples.mean()

                # down-sample if requested
                if resample is not None and resample[0] != resample[1]:
                    samples = librosa.resample(samples, resample[0], resample[1], res_type='sinc_best')

                # apply optional signal filter after down-sampling -> requires lower order
                if signal_filter is not None:
                    samples = signal_filter.process(samples)

                # get sub-sequence in resampled space
                # log.info('using samples {}..{} of {}'.format(start_sample,stop_sample, samples.shape))

                if stop_sample is not None and stop_sample > len(samples):
                    if zero_padding:
                        tmp = np.zeros(stop_sample)
                        tmp[:len(samples)] = samples
                        samples = tmp
                    else:
                        rejected = True
                        break # stop processing this trial

                s = samples[start_sample:stop_sample]

                # TODO optional channel processing

                # normalize to max amplitude 1
                if normalize:
                    s = librosa.util.normalize(s)

                # add 2nd data dimension
                s = s.reshape(s.shape[0], 1)
                # print s.shape

                s = np.asfarray(s, dtype=theano.config.floatX)

                processed_trial.append(s)

                ### end of channel iteration ###

            if rejected:
                continue    # next trial

            processed_trial = np.asfarray([processed_trial], dtype=theano.config.floatX)

            # processed_trial = processed_trial.reshape((1, processed_trial.shape))
            processed_trial = np.rollaxis(processed_trial, 1, 4)

            # optional (external) trial processing, e.g. windowing
            # trials will be in b01c format with tf layout for 01-axes
            for trial_processor in trial_processors:
                processed_trial = trial_processor.process(processed_trial, trial_meta)

            trials.append(processed_trial)

            for k in range(len(processed_trial)):
                meta.append(trial_meta)

                if use_targets:
                    targets.append(target)
                else:
                    labels.append(label)

        ### end of datafile iteration ###

        # turn into numpy arrays
        self.trials = np.vstack(trials)

        assert not np.isnan(np.sum(self.trials))

        # prepare targets / labels
        if use_targets:
            self.targets = np.vstack(targets)
            assert not np.isnan(np.sum(self.targets))
        else:
            labels = np.hstack(labels)
            if label_map is None:
                one_hot_formatter = OneHotFormatter(max(labels) + 1)
            else:
                one_hot_formatter = OneHotFormatter(max(label_map.values()) + 1)
            one_hot_y = one_hot_formatter.format(labels)
            self.targets = one_hot_y

        self.metadata = meta

        if layout == 'ft': # swap axes to (batch, feature, time, channels)
            self.trials = self.trials.swapaxes(1, 2)

        # transform after finalizing the data structure
        for transformer in transformers:
            self.trials, self.targets = transformer.process(self.trials, self.targets)

        self.trials = np.asarray(self.trials, dtype=theano.config.floatX)

        log.debug('final dataset shape: {} (b,0,1,c)'.format(self.trials.shape))
        # super(EEGEpochsDataset, self).__init__(topo_view=self.trials, y=self.targets, axes=['b', 0, 1, 'c'])

        self.X = self.trials.reshape(self.trials.shape[0], np.prod(self.trials.shape[1:]))
        self.y = self.targets
        log.info('generated dataset "{}" with shape X={}={} y={} targets={} '.
                 format(self.name, self.X.shape, self.trials.shape, self.y.shape, self.targets.shape))


        # determine data specs
        features_space = Conv2DSpace(
            shape=[self.trials.shape[1], self.trials.shape[2]],
            num_channels=self.trials.shape[3]
        )
        features_source = 'features'

        targets_space = VectorSpace(dim=self.targets.shape[-1])
        targets_source = 'targets'

        space_components = [features_space, targets_space]
        source_components = [features_source, targets_source]

        # additional support for meta information
        self.meta_maps = dict()
        for meta_source in meta_sources:
            self.meta_maps[meta_source] = sorted(list(set([m[meta_source] for m in self.metadata])))
            space_components.extend([VectorSpace(dim=1)])
            source_components.extend([meta_source])
            log.info('Generated meta-source "{}" with value map: {}'
                     .format(meta_source, self.meta_maps[meta_source]))

        space = CompositeSpace(space_components)
        source = tuple(source_components)
        self.data_specs = (space, source)
        log.debug('data specs: {}'.format(self.data_specs))
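A side note on the axis handling in the constructor above (an added sketch, not part of the original listing): wrapping the per-channel list in another list and rolling the channel axis to the end is what produces the b01c layout. The following minimal, standalone check with made-up shapes illustrates the effect; only numpy is assumed.

import numpy as np

# four channels, 100 samples each, with the trailing singleton "feature" axis
n_channels, n_samples = 4, 100
processed_trial = [np.zeros((n_samples, 1)) for _ in range(n_channels)]

arr = np.asfarray([processed_trial])   # shape (1, 4, 100, 1): (b, c, 0, 1)
arr = np.rollaxis(arr, 1, 4)           # shape (1, 100, 1, 4): (b, 0, 1, c)
print(arr.shape)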
Example #32
    def __init__(self, 
                 path,
                 name = '',         # optional name
                 
                 # selectors
                 subjects='all',        # optional selector (list) or 'all'
                 trial_types='all',     # optional selector (list) or 'all'
                 trial_numbers='all',   # optional selector (list) or 'all'
                 conditions='all',      # optional selector (list) or 'all'     
                 
                 partitioner = None,            
                 
                 channel_filter = NoChannelFilter(),   # optional channel filter, default: keep all
                 channel_names = None,  # optional channel names (for metadata)
                 
                 label_map = None,      # optional conversion of labels

                 remove_dc_offset = False,  # optional subtraction of channel mean, usually done already earlier
                 resample = None,       # optional down-sampling

                 # optional sub-sequences selection
                 start_sample = 0,
                 stop_sample  = None,   # optional for selection of sub-sequences

                 # optional signal filter to be applied before splitting the signal
                 signal_filter = None,

                 # windowing parameters
                 frame_size = -1,
                 hop_size   = -1,       # values > 0 will lead to windowing
                 hop_fraction = None,   # alternative to specifying absolute hop_size
                 
                 # optional spectrum parameters, n_fft = 0 keeps raw data
                 n_fft = 0,
                 n_freq_bins = None,
                 spectrum_log_amplitude = False,
                 spectrum_normalization_mode = None,
                 include_phase = False,

                 flatten_channels=False,
                 layout='tf',       # (0,1)-axes layout tf=time x features or ft=features x time

                 save_matrix_path = None,
                 keep_metadata = False,
                 ):
        '''
        Constructor
        '''

        # save params
        self.params = locals().copy()
        del self.params['self']
        # print self.params
        
        # TODO: get the whole filtering into an extra class
        
        datafiles_metadata, metadb = load_datafiles_metadata(path)
        
#         print datafiles_metadata
        
        def apply_filters(filters, node):            
            if isinstance(node, dict):            
                filtered = []
                keepkeys = filters[0]
                for key, value in node.items():
                    if keepkeys == 'all' or key in keepkeys:
                        filtered.extend(apply_filters(filters[1:], value))
                return filtered
            else:
                return node # [node]
            
        
        # keep only files that match the metadata filters
        self.datafiles = apply_filters([subjects,trial_types,trial_numbers,conditions], datafiles_metadata)
        
        # copy metadata for retained files
        self.metadb = {}
        for datafile in self.datafiles:
            self.metadb[datafile] = metadb[datafile]
        
#         print self.datafiles
#         print self.metadb
        
        self.name = name

        if partitioner is not None:
            self.datafiles = partitioner.get_partition(self.name, self.metadb)
        
        self.include_phase = include_phase
        self.spectrum_normalization_mode = spectrum_normalization_mode
        self.spectrum_log_amplitude = spectrum_log_amplitude

        self.sequence_partitions = [] # used to keep track of original sequences
        
        # metadata: [subject, trial_no, stimulus, channel, start, ]
        self.metadata = []
        
        sequences = []
        labels = []
        n_sequences = 0

        if frame_size > 0 and hop_size == -1 and hop_fraction is not None:
            hop_size = int(np.ceil(frame_size / float(hop_fraction)))

        for i in xrange(len(self.datafiles)):        
            with log_timing(log, 'loading data from {}'.format(self.datafiles[i])): 

                # save start of next sequence
                self.sequence_partitions.append(n_sequences)

                data, metadata = load(os.path.join(path, self.datafiles[i]))

                label = metadata['label']
                if label_map is not None:
                    label = label_map[label]

                multi_channel_frames = []

                # process 1 channel at a time
                for channel in xrange(data.shape[1]):
                    # filter channels
                    if not channel_filter.keep_channel(channel):
                        continue

                    samples = data[:, channel]

                    # subtract channel mean
                    if remove_dc_offset:
                        samples -= samples.mean()

                    # down-sample if requested
                    if resample is not None and resample[0] != resample[1]:
                        samples = librosa.resample(samples, resample[0], resample[1])

                    # apply optional signal filter after down-sampling -> requires lower order
                    if signal_filter is not None:
                        samples = signal_filter.process(samples)

                    # get sub-sequence in resampled space
                    # log.info('using samples {}..{} of {}'.format(start_sample,stop_sample, samples.shape))
                    samples = samples[start_sample:stop_sample]

                    if n_fft is not None and n_fft > 0: # Optionally:
                        ### frequency spectrum branch ###

                        # transform to spectogram
                        hop_length = n_fft // 4
            
                        '''
                        from http://theremin.ucsd.edu/~bmcfee/librosadoc/librosa.html
                        >>> # Get a power spectrogram from a waveform y
                        >>> S       = np.abs(librosa.stft(y)) ** 2
                        >>> log_S   = librosa.logamplitude(S)
                        '''                                     
                             
                        S = librosa.core.stft(samples, n_fft=n_fft, hop_length=hop_length)
                        # mag = np.abs(S)        # magnitude spectrum
                        mag = np.abs(S)**2       # power spectrum
                        
                        # include phase information if requested
                        if self.include_phase:
                            # phase = np.unwrap(np.angle(S))
                            phase = np.angle(S)

                        # Optionally: cut off high bands
                        if n_freq_bins is not None:
                            mag = mag[0:n_freq_bins, :]
                            if self.include_phase:
                                phase = phase[0:n_freq_bins, :]
                                                  
                        if self.spectrum_log_amplitude:      
                            mag = librosa.logamplitude(mag)
                            
                        s = mag # for normalization
                                                    
                        '''
                        NOTE on normalization:
                        It depends on the structure of the neural network and (even more)
                        on the properties of the data. There is no single best normalization
                        algorithm; if there were one, it would be used everywhere by default.
                    
                        In theory, there is no requirement for the data to be normalized at all. 
                        This is a purely practical thing because in practice convergence could 
                        take forever if your input is spread out too much. The simplest would be 
                        to just normalize it by scaling your data to (-1,1) (or (0,1) depending 
                        on activation function), and in most cases it does work. If your 
                        algorithm converges well, then this is your answer. If not, there are 
                        too many possible problems and methods to outline here without knowing 
                        the actual data.
                        '''
    
                        ## normalize to mean 0, std 1
                        if self.spectrum_normalization_mode == 'mean0_std1':
                            # s = preprocessing.scale(s, axis=0);
                            mean = np.mean(s)
                            std = np.std(s)
                            s = (s - mean) / std
                        
                        ## normalize by linear transform to [0,1]
                        elif self.spectrum_normalization_mode == 'linear_0_1':
                            s = s / np.max(s)
                        
                        ## normalize by linear transform to [-1,1]
                        elif self.spectrum_normalization_mode == 'linear_-1_1':
                            s = -1 + 2 * (s - np.min(s)) / (np.max(s) - np.min(s))
                            
                        elif self.spectrum_normalization_mode is not None:
                            raise ValueError(
                                'unsupported spectrum normalization mode {}'.format(
                                    self.spectrum_normalization_mode)
                             )
                        
                        #print s.mean(axis=0)
                        #print s.std(axis=0)
    
                        # include phase information if requested
                        if self.include_phase:
                            # normalize phase to [-1,1]
                            phase = phase / np.pi
                            s = np.vstack([s, phase])
                        
                        # transpose to fit pylearn2 layout
                        s = np.transpose(s)
                        # print s.shape

                        ### end of frequency spectrum branch ###
                    else:
                        ### raw waveform branch ###

                        # normalize to max amplitude 1
                        s = librosa.util.normalize(samples)

                        # add 2nd data dimension
                        s = s.reshape(s.shape[0], 1)
                        # print s.shape

                        ### end of raw waveform branch ###

                    s = np.asfarray(s, dtype='float32')

                    if frame_size > 0 and hop_size > 0:
                        s = s.copy() # FIXME: THIS IS NECESSARY IN MultiChannelEEGSequencesDataset - OTHERWISE, THE FOLLOWING OP DOES NOT WORK!!!!
                        frames = frame(s, frame_length=frame_size, hop_length=hop_size)
                    else:
                        frames = s
                    del s
                    # print frames.shape

                    if flatten_channels:
                        # add artificial channel dimension
                        frames = frames.reshape((frames.shape[0], frames.shape[1], frames.shape[2], 1))
                        # print frames.shape

                        sequences.append(frames)

                        # increment counter by new number of frames
                        n_sequences += frames.shape[0]

                        if keep_metadata:
                            # determine channel name
                            channel_name = None
                            if channel_names is not None:
                                channel_name = channel_names[channel]
                            elif 'channels' in metadata:
                                channel_name = metadata['channels'][channel]

                            self.metadata.append({
                                        'subject'   : metadata['subject'],            # subject
                                        'trial_type': metadata['trial_type'],         # trial_type
                                        'trial_no'  : metadata['trial_no'],           # trial_no
                                        'condition' : metadata['condition'],          # condition
                                        'channel'   : channel,                        # channel
                                        'channel_name' : channel_name,
                                        'start'     : self.sequence_partitions[-1],   # start
                                        'stop'      : n_sequences                     # stop
                                    })

                        for _ in xrange(frames.shape[0]):
                            labels.append(label)
                    else:
                        multi_channel_frames.append(frames)

                    ### end of channel iteration ###


                if not flatten_channels:
                    # turn list into array
                    multi_channel_frames = np.asfarray(multi_channel_frames, dtype='float32')
                    # [channels x frames x time x freq] -> cb01
                    # [channels x frames x time x 1] -> cb0.

                    # move channel dimension to end
                    multi_channel_frames = np.rollaxis(multi_channel_frames, 0, 4)
                    # print multi_channel_frames.shape
                    # log.debug(multi_channel_frames.shape)

                    sequences.append(multi_channel_frames)

                    # increment counter by new number of frames
                    n_sequences += multi_channel_frames.shape[0]

                    if keep_metadata:
                        self.metadata.append({
                                    'subject'   : metadata['subject'],            # subject
                                    'trial_type': metadata['trial_type'],         # trial_type
                                    'trial_no'  : metadata['trial_no'],           # trial_no
                                    'condition' : metadata['condition'],          # condition
                                    'channel'   : 'all',                          # channel
                                    'start'     : self.sequence_partitions[-1],   # start
                                    'stop'      : n_sequences                     # stop
                                })

                    for _ in xrange(multi_channel_frames.shape[0]):
                        labels.append(label)

                ### end of datafile iteration ###
      
        # turn into numpy arrays
        sequences = np.vstack(sequences)
        # print sequences.shape;
        
        labels = np.hstack(labels)
        
        # one_hot_y = one_hot(labels)
        one_hot_formatter = OneHotFormatter(labels.max() + 1) # FIXME!
        one_hot_y = one_hot_formatter.format(labels)
                
        self.labels = labels

        if layout == 'ft': # swap axes to (batch, feature, time, channels)
            sequences = sequences.swapaxes(1, 2)
            
        log.debug('final dataset shape: {} (b,0,1,c)'.format(sequences.shape))
        super(MultiChannelEEGDataset, self).__init__(topo_view=sequences, y=one_hot_y, axes=['b', 0, 1, 'c'])
        
        log.info('generated dataset "{}" with shape X={}={} y={} labels={} '.
                 format(self.name, self.X.shape, sequences.shape, self.y.shape, self.labels.shape))

        if save_matrix_path is not None:
            matrix = DenseDesignMatrix(topo_view=sequences, y=one_hot_y, axes=['b', 0, 1, 'c'])
            with log_timing(log, 'saving DenseDesignMatrix to {}'.format(save_matrix_path)):
                serial.save(save_matrix_path, matrix)
Example #33
class IndexSpace(Space):
    """
    A space representing indices, for example MNIST labels (0-9) or the
    indices of words in a dictionary for NLP tasks. A single space can
    contain multiple indices, for example the word indices of an n-gram.

    IndexSpaces can be converted to VectorSpaces in two ways: Either the
    labels are converted into one-hot vectors which are then concatenated,
    or they are converted into a single vector where 1s indicate labels
    present i.e. for 4 possible labels we have [0, 2] -> [1 0 1 0] or
    [0, 2] -> [1 0 0 0 0 0 1 0].
    """
    def __init__(self, max_labels, dim, **kwargs):
        """
        Initialize an IndexSpace.

        Parameters
        ----------
        max_labels : int
            The number of possible classes/labels. This means that
            all labels should be < max_labels. Example: For MNIST
            there are 10 numbers and hence max_labels = 10.
        dim : int
            The number of indices in one space, e.g. for MNIST there is
            one target label and hence dim = 1. If we have an n-gram
            of word indices as input to a neural net language model, dim = n.
        kwargs : dict
            Passed on to the superclass constructor.
        """

        super(IndexSpace, self).__init__(**kwargs)

        self.max_labels = max_labels
        self.dim = dim
        self.formatter = OneHotFormatter(self.max_labels)

    def __str__(self):
        """
        Return a string representation.
        """
        return '%(classname)s(dim=%(dim)s, max_labels=%(max_labels)s)' % \
               dict(classname=self.__class__.__name__,
                    dim=self.dim,
                    max_labels=self.max_labels)

    @functools.wraps(Space.get_total_dimension)
    def get_total_dimension(self):
        return self.dim

    @functools.wraps(Space.np_format_as)
    def np_format_as(self, batch, space):
        if isinstance(space, VectorSpace):
            if self.max_labels == space.dim:
                rval = self.formatter.format(batch, sparse=space.sparse,
                                             mode='merge')
            elif self.dim * self.max_labels == space.dim:
                rval = self.formatter.format(batch, sparse=space.sparse,
                                             mode='concatenate')
            else:
                raise ValueError("Can't convert IndexSpace to"
                                 "VectorSpace (%d labels to %d dimensions)"
                                 % (self.dim, space.dim))
            return rval
        else:
            raise ValueError("Can't convert IndexSpace to %(space)s"
                             % (space.__class__.__name__))

    @functools.wraps(Space._format_as)
    def _format_as(self, batch, space):
        """
        Supports formatting to a VectorSpace where indices are represented
        by ones in a binary vector.
        """
        if isinstance(space, VectorSpace):
            if self.max_labels == space.dim:
                rval = self.formatter.theano_expr(batch, sparse=space.sparse,
                                                  mode='merge')
            elif self.dim * self.max_labels == space.dim:
                rval = self.formatter.theano_expr(batch, sparse=space.sparse,
                                                  mode='concatenate')
            else:
                raise ValueError("Can't convert IndexSpace to"
                                 "VectorSpace (%d labels to %d dimensions)"
                                 % (self.dim, space.dim))
            return rval
        else:
            raise ValueError("Can't convert IndexSpace to %(space)s"
                             % (space.__class__.__name__))

    @functools.wraps(Space.make_theano_batch)
    def make_theano_batch(self, name=None, dtype=None, batch_size=None):
        if batch_size == 1:
            rval = T.lrow(name=name)
        else:
            rval = T.lmatrix(name=name)
        return rval

    @functools.wraps(Space.batch_size)
    def batch_size(self, batch):
        self.validate(batch)
        return batch.shape[0]

    @functools.wraps(Space.np_batch_size)
    def np_batch_size(self, batch):
        self.np_validate(batch)
        return batch.shape[0]

    @functools.wraps(Space._validate)
    def _validate(self, batch):
        """
        .. todo::

            WRITEME
        """
        if not isinstance(batch, theano.gof.Variable):
            raise TypeError("IndexSpace batch should be a theano Variable, "
                            "got " + str(type(batch)))
        if not isinstance(batch.type, (theano.tensor.TensorType,
                                       CudaNdarrayType)):
            raise TypeError("VectorSpace batch should be TensorType or "
                            "CudaNdarrayType, got "+str(batch.type))
        if batch.ndim != 2:
            raise ValueError('IndexSpace batches must be 2D, got %d '
                             'dimensions' % batch.ndim)
        for val in get_debug_values(batch):
            self.np_validate(val)

    @functools.wraps(Space._np_validate)
    def _np_validate(self, batch):
        # Use the 'CudaNdarray' string to avoid importing theano.sandbox.cuda
        # when it is not available
        if (not isinstance(batch, np.ndarray)
            and str(type(batch)) != "<type 'CudaNdarray'>"):
            raise TypeError("The value of a IndexSpace batch should be a "
                            "numpy.ndarray, or CudaNdarray, but is %s."
                            % str(type(batch)))
        if batch.ndim != 2:
            raise ValueError("The value of a IndexSpace batch must be "
                             "2D, got %d dimensions for %s." % (batch.ndim,
                                                                batch))
        if batch.shape[1] != self.dim:
            raise ValueError("The width of a IndexSpace batch must match "
                             "with the space's dimension, but batch has shape "
                             "%s and dim = %d." % (str(batch.shape), self.dim))
Example #34
    def __init__(
        self,
        db,  # data source
        name='',  # optional name
        selectors=dict(),
        partitioner=None,
        meta_sources=[],  # optional sources other than 'features' and 'targets' from metadata
        channel_filter=NoChannelFilter(
        ),  # optional channel filter, default: keep all
        channel_names=None,  # optional channel names (for metadata)
        label_attribute='label',  # metadata attribute to be used as label
        label_map=None,  # optional conversion of labels
        use_targets=True,  # use targets if provided, otherwise labels are used
        remove_dc_offset=False,  # optional subtraction of channel mean, usually done already earlier
        resample=None,  # optional down-sampling
        normalize=True,  # normalize to max=1

        # optional sub-sequences selection
        start_sample=0,
        stop_sample=None,  # optional for selection of sub-sequences
        zero_padding=True,  # if True (default), trials that are too short will be zero-padded,
        # otherwise they will be rejected.

        # optional signal filter to be applied before splitting the signal
        signal_filter=None,
        trial_processors=[],  # optional processing of the trials
        target_processor=None,  # optional processing of the targets, e.g. zero-padding
        transformers=[],  # optional transformations of the dataset
        layout='tf',  # (0,1)-axes layout tf=time x features or ft=features x time
        debug=False,
    ):
        '''
        Constructor
        '''

        # save params
        self.params = locals().copy()
        del self.params['self']
        # print self.params

        self.name = name
        self.debug = debug

        metadb = DatasetMetaDB(db.metadata, selectors.keys())

        if partitioner is not None:
            pass  # FIXME

        selected_trial_ids = metadb.select(selectors)
        log.info('selectors: {}'.format(selectors))
        log.info('selected trials: {}'.format(selected_trial_ids))

        if normalize:
            log.info(
                'Data will be normalized to max amplitude 1 per channel (normalize=True).'
            )

        trials = list()
        labels = list()
        targets = list()
        meta = list()

        if stop_sample == 'auto-min':
            stop_sample = np.min(
                [db.data[trial_i].shape[-1] for trial_i in selected_trial_ids])
            log.info('Using minimum trial length. stop_sample={}'.format(
                stop_sample))
        elif stop_sample == 'auto-max':
            stop_sample = np.max(
                [db.data[trial_i].shape[-1] for trial_i in selected_trial_ids])
            log.info('Using maximum trial length. stop_sample={}'.format(
                stop_sample))

        for trial_i in selected_trial_ids:

            trial_meta = db.metadata[trial_i]

            if use_targets:
                if db.targets is None:  # data source provides no targets
                    target = None
                else:
                    target = db.targets[trial_i]
                    assert not np.isnan(np.sum(target))

                if target_processor is not None:
                    target = target_processor.process(target, trial_meta)

                    assert not np.isnan(np.sum(target))
            else:
                # get and process label
                label = db.metadata[trial_i][label_attribute]
                if label_map is not None:
                    label = label_map[label]

            processed_trial = []

            trial = db.data[trial_i]

            if np.isnan(np.sum(trial)):
                print trial_i, trial

            assert not np.isnan(np.sum(trial))

            rejected = False  # flag for trial rejection

            trial = np.atleast_2d(trial)

            # process 1 channel at a time
            for channel in xrange(trial.shape[0]):
                # filter channels
                if not channel_filter.keep_channel(channel):
                    continue

                samples = trial[channel, :]

                # subtract channel mean
                if remove_dc_offset:
                    samples -= samples.mean()

                # down-sample if requested
                if resample is not None and resample[0] != resample[1]:
                    samples = librosa.resample(samples,
                                               resample[0],
                                               resample[1],
                                               res_type='sinc_best')

                # apply optional signal filter after down-sampling -> requires lower order
                if signal_filter is not None:
                    samples = signal_filter.process(samples)

                # get sub-sequence in resampled space
                # log.info('using samples {}..{} of {}'.format(start_sample,stop_sample, samples.shape))

                if stop_sample is not None and stop_sample > len(samples):
                    if zero_padding:
                        tmp = np.zeros(stop_sample)
                        tmp[:len(samples)] = samples
                        samples = tmp
                    else:
                        rejected = True
                        break  # stop processing this trial

                s = samples[start_sample:stop_sample]

                # TODO optional channel processing

                # normalize to max amplitude 1
                if normalize:
                    s = librosa.util.normalize(s)

                # add 2nd data dimension
                s = s.reshape(s.shape[0], 1)
                # print s.shape

                s = np.asfarray(s, dtype=theano.config.floatX)

                processed_trial.append(s)

                ### end of channel iteration ###

            if rejected:
                continue  # next trial

            processed_trial = np.asfarray([processed_trial],
                                          dtype=theano.config.floatX)

            # processed_trial = processed_trial.reshape((1, processed_trial.shape))
            processed_trial = np.rollaxis(processed_trial, 1, 4)

            # optional (external) trial processing, e.g. windowing
            # trials will be in b01c format with tf layout for 01-axes
            for trial_processor in trial_processors:
                processed_trial = trial_processor.process(
                    processed_trial, trial_meta)

            trials.append(processed_trial)

            for k in range(len(processed_trial)):
                meta.append(trial_meta)

                if use_targets:
                    targets.append(target)
                else:
                    labels.append(label)

        ### end of datafile iteration ###

        # turn into numpy arrays
        self.trials = np.vstack(trials)

        assert not np.isnan(np.sum(self.trials))

        # prepare targets / labels
        if use_targets:
            self.targets = np.vstack(targets)
            assert not np.isnan(np.sum(self.targets))
        else:
            labels = np.hstack(labels)
            if label_map is None:
                one_hot_formatter = OneHotFormatter(max(labels) + 1)
            else:
                one_hot_formatter = OneHotFormatter(
                    max(label_map.values()) + 1)
            one_hot_y = one_hot_formatter.format(labels)
            self.targets = one_hot_y

        self.metadata = meta

        if layout == 'ft':  # swap axes to (batch, feature, time, channels)
            self.trials = self.trials.swapaxes(1, 2)

        # transform after finalizing the data structure
        for transformer in transformers:
            self.trials, self.targets = transformer.process(
                self.trials, self.targets)

        self.trials = np.asarray(self.trials, dtype=theano.config.floatX)

        log.debug('final dataset shape: {} (b,0,1,c)'.format(
            self.trials.shape))
        # super(EEGEpochsDataset, self).__init__(topo_view=self.trials, y=self.targets, axes=['b', 0, 1, 'c'])

        self.X = self.trials.reshape(self.trials.shape[0],
                                     np.prod(self.trials.shape[1:]))
        self.y = self.targets
        log.info('generated dataset "{}" with shape X={}={} y={} targets={} '.
                 format(self.name, self.X.shape, self.trials.shape,
                        self.y.shape, self.targets.shape))

        # determine data specs
        features_space = Conv2DSpace(
            shape=[self.trials.shape[1], self.trials.shape[2]],
            num_channels=self.trials.shape[3])
        features_source = 'features'

        targets_space = VectorSpace(dim=self.targets.shape[-1])
        targets_source = 'targets'

        space_components = [features_space, targets_space]
        source_components = [features_source, targets_source]

        # additional support for meta information
        self.meta_maps = dict()
        for meta_source in meta_sources:
            self.meta_maps[meta_source] = sorted(
                list(set([m[meta_source] for m in self.metadata])))
            space_components.extend([VectorSpace(dim=1)])
            source_components.extend([meta_source])
            log.info('Generated meta-source "{}" with value map: {}'.format(
                meta_source, self.meta_maps[meta_source]))

        space = CompositeSpace(space_components)
        source = tuple(source_components)
        self.data_specs = (space, source)
        log.debug('data specs: {}'.format(self.data_specs))
Example #35
    def load_data(self):
        # Get the directory of the patient data
        patient_dir = os.path.join(self.data_dir, self.patient_id)

        # Load metadata about the dataset from a MAT file
        metadata_fname = os.path.join(patient_dir, 'trainset_' + str(self.preictal_sec) + '.mat')
        metadata_mat = loadmat(metadata_fname)

        # Get number of seizures
        self.n_seizures = metadata_mat.get('ictals').size

        # Get detail of the segment
        self.sampling_rate = metadata_mat['sampling_rate'][0][0]
        self.segment_sec = metadata_mat['segment_sec'][0][0]
        self.segment_samples = self.sampling_rate * self.segment_sec

        # Get the number of blocks to extend from the withheld seizure
        self.n_extended_blocks_test = metadata_mat['n_extended_blocks_test'][0][0]

        self.preictal_samples = 0
        self.nonictal_samples = 0
        self.nan_non_flat_samples = 0

        # Examples of indexing through MAT file
        # mat['nonictals'][i][0]['filename'][0][0][0][j][0]
        # mat['nonictals'][i][0]['idx'][0][0][0][j][0]
        # mat['nonictals'][i][0]['n_segments'][0][0][0][0]

        # Load shuffled data
        if self.which_set == 'train' or self.which_set == 'valid_train':

            if self.which_set == 'train':
                select_idx = np.setdiff1d(range(metadata_mat['preictals'].size),
                                          np.asarray([self.leave_out_seizure_idx_valid,
                                                      self.leave_out_seizure_idx_test]))
            else:
                select_idx = np.asarray([self.leave_out_seizure_idx_valid])

            X = None
            y = None

            if self.use_all_nonictals:
                temp_preictal_X = None
                for i in select_idx:
                    print '====== Seizure', i, '======'

                    # Pre-ictal
                    temp_X = self.load_feature(part='preictals',
                                               list_features=self.list_features,
                                               seizure_idx=i,
                                               metadata_mat=metadata_mat,
                                               patient_dir=patient_dir)

                    if not (temp_preictal_X is None):
                        temp_preictal_X = np.concatenate((temp_preictal_X, temp_X), axis=1)
                    else:
                        temp_preictal_X = temp_X

                self.preictal_samples = temp_preictal_X.shape[1]

                # Non-ictal data
                temp_nonictal_X = self.load_feature(part='nonictals_all',
                                                    list_features=self.list_features,
                                                    seizure_idx=self.leave_out_seizure_idx_test,
                                                    metadata_mat=metadata_mat,
                                                    patient_dir=patient_dir)
                X = np.concatenate((temp_preictal_X, temp_nonictal_X), axis=1)
                y = np.zeros(X.shape[1], dtype=int)
                y[range(self.preictal_samples)] = 1

                self.nonictal_samples = temp_nonictal_X.shape[1]

                print 'Preictal samples: {0}, Nonictal samples: {1}'.format(self.preictal_samples,
                                                                            self.nonictal_samples)
                if not np.all(np.arange(self.preictal_samples) == np.where(y)[0]):
                    raise Exception('There is a mismatch between the number of preictal data and labels.')

            else:
                for i in select_idx:
                    print '====== Seizure', i, '======'

                    # Non-ictal data
                    temp_nonictal_X = self.load_feature(part='nonictals',
                                                        list_features=self.list_features,
                                                        seizure_idx=i,
                                                        metadata_mat=metadata_mat,
                                                        patient_dir=patient_dir)

                    # Pre-ictal
                    temp_preictal_X = self.load_feature(part='preictals',
                                                        list_features=self.list_features,
                                                        seizure_idx=i,
                                                        metadata_mat=metadata_mat,
                                                        patient_dir=patient_dir)

                    # Concatenate preictal and nonictal data
                    temp_X = np.concatenate((temp_preictal_X, temp_nonictal_X), axis=1)
                    temp_y = np.zeros(temp_X.shape[1], dtype=int)
                    temp_y[range(temp_preictal_X.shape[1])] = 1

                    # Sanity check
                    # if not (temp_preictal_X.shape[1] == temp_nonictal_X.shape[1]):
                    #     raise Exception('Unbalanced classes.')
                    print 'Preictal samples: {0}, Nonictal samples: {1}'.format(temp_preictal_X.shape[1],
                                                                                temp_nonictal_X.shape[1])
                    if not np.all(np.arange(temp_preictal_X.shape[1]) == np.where(temp_y)[0]):
                        raise Exception('There is a mismatch between the number of preictal data and labels.')

                    self.preictal_samples = self.preictal_samples + temp_preictal_X.shape[1]
                    self.nonictal_samples = self.nonictal_samples + temp_nonictal_X.shape[1]

                    if not (X is None) and not (y is None):
                        X = np.concatenate((X, temp_X), axis=1)
                        y = np.append(y, temp_y)
                    else:
                        X = temp_X
                        y = temp_y

        # Load continuous data
        elif self.which_set == 'valid' or self.which_set == 'test':

            if self.which_set == 'valid':
                select_idx = self.leave_out_seizure_idx_valid
            else:
                select_idx = self.leave_out_seizure_idx_test

            print '====== Seizure', select_idx, '======'

            # Get metadata of all blocks
            block_df = pd.read_table(os.path.join(patient_dir, 'block_metadata.txt'), sep='\t')

            # Get block index of the selected seizure
            select_sz_fname = metadata_mat['preictals'][select_idx][0]['filename'][0][0][0][0][0]
            block_idx = np.where(block_df.filename == select_sz_fname)[0][0]

            start_block_idx = block_idx - self.n_extended_blocks_test
            end_block_idx = block_idx + self.n_extended_blocks_test + 1

            if start_block_idx < 0:
                start_block_idx = 0
            if end_block_idx > block_df.shape[0]:
                end_block_idx = block_df.shape[0]

            select_block_idx = np.arange(start_block_idx, end_block_idx)
            filenames = block_df.filename[select_block_idx].values

            X = None
            y = None
            y_label_all = None
            ictal_labels = None
            for b_idx, fname in enumerate(filenames):
                # Name of the MAT file that stores indices of flat (i.e., false) segments
                fname_flat = fname.replace('.data', '_flat_signal_segment_idx.mat')

                # Get all good indices (i.e., remove segments of flat signals)
                flat_mat = loadmat(os.path.join(patient_dir, fname_flat))
                flat_idx = np.empty(0, dtype=int)
                for j in range(flat_mat['flat_signal_segment_idx'].shape[0]):
                    flat_idx = np.append(flat_idx, np.squeeze(flat_mat['flat_signal_segment_idx'][j][0]))
                flat_idx = flat_idx - 1 # Change from MATLAB to python index system

                n_segments = np.ceil(block_df.samples[select_block_idx[b_idx]] / (self.segment_samples * 1.0))
                all_idx = np.arange(n_segments, dtype=int)
                good_idx = np.setdiff1d(all_idx, flat_idx)

                print 'Load', self.which_set, 'data from', fname

                if good_idx.size > 0:
                    # Features with shape [n_features, n_samples]
                    temp_X = self.load_list_feature(list_features=self.list_features,
                                                    sample_idx=good_idx,
                                                    fname=fname,
                                                    patient_dir=patient_dir)

                    # If this record contains preictal data in the withheld seizures, get preictal labels
                    temp_y_withheld = self.get_labels(label_type='preictals',
                                                      filename=fname,
                                                      good_idx=good_idx,
                                                      metadata_mat=metadata_mat,
                                                      n_all_segments=n_segments,
                                                      n_data_segments=temp_X.shape[1],
                                                      select_meta_idx=select_idx)

                    # If this record contains preictal data in the selected seizures, get preictal labels
                    temp_y_select = self.get_labels(label_type='preictals',
                                                    filename=fname,
                                                    good_idx=good_idx,
                                                    metadata_mat=metadata_mat,
                                                    n_all_segments=n_segments,
                                                    n_data_segments=temp_X.shape[1])

                    # If this record contains preictal data in all seizures, get preictal labels
                    temp_y_rm = self.get_labels(label_type='all_preictals',
                                                filename=fname,
                                                good_idx=good_idx,
                                                metadata_mat=metadata_mat,
                                                n_all_segments=n_segments,
                                                n_data_segments=temp_X.shape[1])

                    tmp_preictal_withheld_idx = np.where(temp_y_withheld == 1)[0]
                    tmp_preictal_select_idx = np.where(temp_y_select == 1)[0]
                    tmp_preictal_rm_idx = np.where(temp_y_rm == 1)[0]
                    tmp_preictal_select_idx = np.setdiff1d(tmp_preictal_select_idx, tmp_preictal_withheld_idx)
                    tmp_preictal_rm_idx = np.setdiff1d(tmp_preictal_rm_idx, tmp_preictal_withheld_idx)
                    tmp_preictal_rm_idx = np.setdiff1d(tmp_preictal_rm_idx, tmp_preictal_select_idx)

                    self.preictal_samples = self.preictal_samples + np.where(temp_y_withheld == 1)[0].size
                    self.nonictal_samples = self.nonictal_samples + np.where(temp_y_withheld == 0)[0].size

                    if tmp_preictal_withheld_idx.size > 0:
                        print ' Load preictal data from the withheld seizure from this file.'
                        print ' Size:', tmp_preictal_withheld_idx.size, tmp_preictal_withheld_idx
                    if tmp_preictal_select_idx.size > 0:
                        print ' Load preictal data from selected seizures in addition to the withheld seizure from this file.'
                        print ' Size:', tmp_preictal_select_idx.size, tmp_preictal_select_idx
                    if tmp_preictal_rm_idx.size > 0:
                        print ' Load preictal data from removed seizures in addition to the withheld seizure from this file.'
                        print ' Size:', tmp_preictal_rm_idx.size, tmp_preictal_rm_idx

                    # Sanity check
                    if np.intersect1d(tmp_preictal_withheld_idx, tmp_preictal_select_idx).size > 0:
                        raise Exception('There is an overlap of labels between the withheld seizures and the selected seizures.')
                    if np.intersect1d(tmp_preictal_select_idx, tmp_preictal_rm_idx).size > 0:
                        raise Exception('There is an overlap of labels between the selected seizures and the removed seizures.')
                    if np.intersect1d(tmp_preictal_withheld_idx, tmp_preictal_rm_idx).size > 0:
                        raise Exception('There is an overlap of labels between the withheld seizures and the removed seizures.')

                    temp_y_all = np.zeros(temp_X.shape[1], dtype=int)
                    temp_y_all[tmp_preictal_withheld_idx] = 1   # Labels for the withheld seizure
                    temp_y_all[tmp_preictal_select_idx] = 2     # Labels for the selected seizure (that is not from withheld seizures)
                    temp_y_all[tmp_preictal_rm_idx] = 3         # Labels for the removed seizure (that is not from withheld seizures)

                    # If this record contains ictal data, get ictal labels
                    temp_ictal_labels = self.get_labels(label_type='all_ictals',
                                                        filename=fname,
                                                        good_idx=good_idx,
                                                        metadata_mat=metadata_mat,
                                                        n_all_segments=n_segments,
                                                        n_data_segments=temp_X.shape[1])

                    tmp_ictal_idx = np.where(temp_ictal_labels == 1)[0]
                    if tmp_ictal_idx.size > 0:
                        print ' Ictal label:', tmp_ictal_idx.size, tmp_ictal_idx

                    # Deal with NaN features that remain after filtering out flat segments; these NaNs
                    # are caused by noise in the data, not by flat segments
                    nan_sample_idx = np.where(np.isnan(np.sum(temp_X, 0)))[0]
                    nan_feature_idx = np.where(np.isnan(np.sum(temp_X, 1)))[0]
                    if nan_sample_idx.size > 0 or nan_feature_idx.size > 0:
                        print self.which_set, 'contains NaN at:'
                        print ' sample_idx:', good_idx[nan_sample_idx], ' feature_idx:', nan_feature_idx
                        print ' shape before removing NaN:', temp_X.shape
                        tmp_preictal_idx = np.where(temp_y_withheld == 1)[0]
                        tmp_nonictal_idx = np.where(temp_y_withheld == 0)[0]
                        nan_preictal_sample_idx = np.intersect1d(tmp_preictal_idx, nan_sample_idx)
                        nan_nonictal_sample_idx = np.intersect1d(tmp_nonictal_idx, nan_sample_idx)
                        if nan_preictal_sample_idx.size > 0:
                            print ' NaN are in preictal index:', good_idx[nan_preictal_sample_idx]
                        if nan_nonictal_sample_idx.size > 0:
                            print ' NaN are in nonictal index:', good_idx[nan_nonictal_sample_idx]
                        all_idx = np.arange(temp_X.shape[1])
                        good_idx_1 = np.setdiff1d(all_idx, nan_sample_idx)
                        temp_X = temp_X[:, good_idx_1]
                        temp_y_all = temp_y_all[good_idx_1]
                        temp_y_withheld = temp_y_withheld[good_idx_1]
                        temp_ictal_labels = temp_ictal_labels[good_idx_1]
                        print ' shape after removing NaN:', temp_X.shape
                        self.nan_non_flat_samples = self.nan_non_flat_samples + nan_sample_idx.size

                    # Sanity check
                    tmp_nan_sample_idx = np.where(np.isnan(np.sum(temp_X, 0)))[0]
                    if tmp_nan_sample_idx.size > 0:
                        raise Exception('There is an error in removing NaN')
                    if not (temp_X.shape[1] == temp_y_all.size):
                        raise Exception('Number of feature data and labels [temp_y_all] are not equal.')
                    if not (temp_X.shape[1] == temp_y_withheld.size):
                        raise Exception('Number of feature data and labels [temp_y_withheld] are not equal.')
                    if not (temp_X.shape[1] == temp_ictal_labels.size):
                        raise Exception('Number of feature data and labels [ictal_labels] are not equal.')

                    if not (X is None) and not (y is None) and not (ictal_labels is None):
                        X = np.concatenate((X, temp_X), axis=1)
                        y = np.append(y, temp_y_withheld)
                        y_label_all = np.append(y_label_all, temp_y_all)
                        ictal_labels = np.append(ictal_labels, temp_ictal_labels)
                    else:
                        X = temp_X
                        y = temp_y_withheld
                        y_label_all = temp_y_all
                        ictal_labels = temp_ictal_labels
                else:
                    print 'There is no good segment during this seizure'

            # Store preictal labels from the withheld seizure index (used to compute accuracy), the selected
            #  seizure indices, and the removed seizure indices.
            # Note: this property only exists when which_set == 'valid' or which_set == 'test',
            #       as there is no need for ictal data to be imported otherwise.
            self.y_label_all = y_label_all

            # Sanity check
            if np.where(y == 1)[0].size > np.where(y_label_all > 0)[0].size:
                raise Exception('There is an error in collecting preictal labels only from the leave-out-seizure index.')
            if np.where(y == 1)[0].size == np.where(y_label_all == 1)[0].size:
                print 'There is only one preictal period, and it is from the leave-out-seizure index.'
                if not np.all(np.where(y == 1)[0] == np.where(y_label_all == 1)[0]):
                    raise Exception('There is a mismatch between y and y_label_all.')
            if np.where(y == 1)[0].size < np.where(y_label_all > 0)[0].size:
                print 'There is more than one preictal period.'
                if not np.all(np.where(y == 1)[0] == np.where(y_label_all == 1)[0]):
                    raise Exception('There is a mismatch between y_select_idx and y in the preictal labels of the leave-out-seizure index.')

            # Store ictal labels
            # Note: this property only exists when which_set=='valid' or which_set=='test',
            #       as there is no need for ictal data to be imported.
            self.ictal_labels = ictal_labels
        else:
            raise Exception('Invalid dataset selection')

        print 'There are {0} samples that have been removed due to NaN, in addition to the flat-signal samples.'.format(self.nan_non_flat_samples)

        X = np.transpose(X, [1, 0])
        one_hot_formatter = OneHotFormatter(max_labels=2)
        y = one_hot_formatter.format(y)

        # Sanity check
        # Note: We ignore nan_non_flat_samples when loading shuffled data, as the labels are specified after the NaN samples have been removed.
        #       In contrast, when loading continuous data the labels are specified before removing NaN, so the NaN samples have to be subtracted for this check.
        if self.which_set == 'train' or self.which_set == 'valid_train':
            if not (X.shape[0] == self.preictal_samples + self.nonictal_samples):
                raise Exception('There is a mismatch in the number of training samples ({0} != {1}).'.format(X.shape[0],
                                                                                                             self.preictal_samples + self.nonictal_samples))
            if not (np.where(np.argmax(y, axis=1) == 1)[0].size == self.preictal_samples):
                raise Exception('There is a mismatch in the number of preictal samples and its labels ({0} != {1}).'.format(np.where(np.argmax(y, axis=1) == 1)[0].size,
                                                                                                                            self.preictal_samples))
            if not (X.shape[0] == y.shape[0]):
                raise Exception('There is a mismatch in the number of training samples and its labels ({0} != {1}).'.format(X.shape[0],
                                                                                                                            y.shape[0]))
        elif self.which_set == 'valid' or self.which_set == 'test':
            if not (X.shape[0] == self.preictal_samples + self.nonictal_samples - self.nan_non_flat_samples):
                raise Exception('There is a mismatch in the number of samples ({0} != {1}).'.format(X.shape[0],
                                                                                                     self.preictal_samples + self.nonictal_samples - self.nan_non_flat_samples))
            if not ((np.where(np.argmax(y, axis=1) == 1)[0].size + np.where(np.argmax(y, axis=1) == 0)[0].size) ==
                        self.preictal_samples + self.nonictal_samples - self.nan_non_flat_samples):
                raise Exception('There is a mismatch in the number of samples and its labels ({0} != {1}).'.format(np.where(np.argmax(y, axis=1) == 1)[0].size + np.where(np.argmax(y, axis=1) == 0)[0].size,
                                                                                                                   self.preictal_samples + self.nonictal_samples - self.nan_non_flat_samples))
            if not (X.shape[0] == y.shape[0]):
                raise Exception('There is a mismatch in the number of samples and its labels ({0} != {1}).'.format(X.shape[0],
                                                                                                                   y.shape[0]))

        return X, y
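
The snippet above ends by converting the binary label vector y into one-hot rows with OneHotFormatter(max_labels=2) before returning. A minimal standalone sketch of that step, assuming only numpy and pylearn2 are available (the toy labels below are illustrative and not taken from the dataset):

import numpy as np
from pylearn2.format.target_format import OneHotFormatter

labels = np.array([0, 1, 1, 0])                 # toy binary labels: 1 = preictal, 0 = nonictal
formatter = OneHotFormatter(max_labels=2)
one_hot = formatter.format(labels)              # shape (4, 2), one row per sample

# np.argmax along axis=1 recovers the original labels, which is what the sanity checks above rely on
assert np.all(np.argmax(one_hot, axis=1) == labels)
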
Example #36
0
    def __init__(self, 
                 path, suffix='',   # required data file parameters
                 subjects='all',    # optional selector (list) or 'all'
                 start_sample = 0,
                 stop_sample  = None, # optional for selection of sub-sequences
                 frame_size = -1, 
                 hop_size   = -1,   # values > 0 will lead to windowing
                 label_mode='tempo',
                 name = '',         # optional name
                 n_fft = 0,
                 n_freq_bins = None,
                 save_matrix_path = None,
                 channels = None,
                 resample = None,
                 stimulus_id_filter = None,
                 keep_metadata = False,
                 spectrum_log_amplitude = False,
                 spectrum_normalization_mode = None,
                 include_phase = False,
                 layout = 'tf'     # 2D axes layout tf=time x features or ft= features x time
                 ):
        '''
        Constructor
        '''

        # save params
        self.params = locals().copy()
        del self.params['self']
        # print self.params
        
        self.name = name;
        
        self.include_phase = include_phase;
        self.spectrum_normalization_mode = spectrum_normalization_mode;
        self.spectrum_log_amplitude = spectrum_log_amplitude;
        
        self.datafiles = [];
        subject_paths = glob.glob(os.path.join(path, 'Sub*'));
        for path in subject_paths:
            dataset_filename = os.path.join(path, 'dataset'+suffix+'.pklz');
            if os.path.isfile(dataset_filename):   
                log.debug('adding {}'.format(dataset_filename));
                self.datafiles.append(dataset_filename);
            else:
                log.warn('file does not exist: {}'.format(dataset_filename));
        self.datafiles.sort();
        
        if subjects == 'all':
            subjects = np.arange(0,len(self.datafiles));            
        assert subjects is not None and len(subjects) > 0;
        
        self.label_mode = label_mode;
        self.label_converter = LabelConverter();
        
        if stimulus_id_filter is None:
            stimulus_id_filter = [];
        self.stimulus_id_filter = stimulus_id_filter;
        
        self.subject_partitions = []; # used to keep track of original subjects
        self.sequence_partitions = []; # used to keep track of original sequences
        self.trial_partitions = []; # keeps track of original trials
        
        # metadata: [subject, trial_no, stimulus, channel, start, ]
        self.metadata = [];
        
        sequences = [];
        labels = [];
        n_sequences = 0;
        last_raw_label = -1;
        for i in xrange(len(self.datafiles)):
            if i in subjects:
                with log_timing(log, 'loading data from {}'.format(self.datafiles[i])): 
                    self.subject_partitions.append(n_sequences);                            # save start of next subject
                    
                    subject_sequences, subject_labels, channel_meta = load(self.datafiles[i]);
                    
                    subject_trial_no = -1;
                    
                    for j in xrange(len(subject_sequences)):
                        l = subject_labels[j];                                              # get raw label

                        if l in stimulus_id_filter:
#                             log.debug('skipping stimulus {}'.format(l));
                            continue;

                        c = channel_meta[j][0];
                        
                        if channels is not None and not c in channels:                      # apply optional channel filter
                            log.debug('skipping channel {}'.format(c));
                            continue;
                        
                        self.sequence_partitions.append(n_sequences);                       # save start of next sequence                        
                        
                        if l != last_raw_label:                                             # if raw label changed...
                            self.trial_partitions.append(n_sequences);                      # ...save start of next trial
                            subject_trial_no += 1;                                          # increment subject_trial_no counter
                            last_raw_label = l;
                        
                        l = self.label_converter.get_label(l[0], self.label_mode);          # convert to label_mode view
                        
                        s = subject_sequences[j];                        
                        s = s[start_sample:stop_sample];                                    # get sub-sequence in original space

                        # down-sample if requested
                        if resample is not None and resample[0] != resample[1]:
                            s = librosa.resample(s, resample[0], resample[1]);
                                                                                                
                        if n_fft is not None and n_fft > 0:                          # Optionally:
                                                                                            #     transform to spectrogram
                            hop_length = n_fft / 4;
        
                            '''
                            from http://theremin.ucsd.edu/~bmcfee/librosadoc/librosa.html
                            >>> # Get a power spectrogram from a waveform y
                            >>> S       = np.abs(librosa.stft(y)) ** 2
                            >>> log_S   = librosa.logamplitude(S)
                            '''                                     
#                             s = np.abs(librosa.core.stft(s, 
#                                                             n_fft=n_fft, 
#                                                             hop_length=hop_length)
#                                           )**2;      
                            
                            S = librosa.core.stft(s, n_fft=n_fft, hop_length=hop_length);
#                             mag = np.abs(S);        # magnitude spectrum
                            mag = np.abs(S)**2;       # power spectrum  
                            # phase = np.unwrap(np.angle(S));
                            phase = np.angle(S);
                            
                            if n_freq_bins is not None:                               # Optionally:
                                mag = mag[0:n_freq_bins, :];                          #    cut off high bands
                                phase = phase[0:n_freq_bins, :];
                                                      
                            if self.spectrum_log_amplitude:      
                                mag = librosa.logamplitude(mag);
                                
                            s = mag; # for normalization
                                                        
                            '''
                            NOTE on normalization:
                            It depends on the structure of the neural network and (even more)
                            on the properties of the data. There is no single best normalization
                            algorithm; if there were one, it would be used everywhere by default.
                        
                            In theory, there is no requirement for the data to be normalized at all. 
                            This is a purely practical thing because in practice convergence could 
                            take forever if your input is spread out too much. The simplest would be 
                            to just normalize it by scaling your data to (-1,1) (or (0,1) depending 
                            on activation function), and in most cases it does work. If your 
                            algorithm converges well, then this is your answer. If not, there are 
                            too many possible problems and methods to outline here without knowing 
                            the actual data.
                            '''
    
                            ## normalize to mean 0, std 1
                            if self.spectrum_normalization_mode == 'mean0_std1':
#                                 s = preprocessing.scale(s, axis=0);
                                mean = np.mean(s);
                                std = np.std(s);
                                s = (s - mean) / std;
                            
                            ## normalize by linear transform to [0,1]
                            elif self.spectrum_normalization_mode == 'linear_0_1':
                                s = s / np.max(s);
                            
                            ## normalize by linear transform to [-1,1]
                            elif self.spectrum_normalization_mode == 'linear_-1_1':
                                s = -1 + 2 * (s - np.min(s)) / (np.max(s) - np.min(s));
                                
                            elif self.spectrum_normalization_mode is not None:
                                raise ValueError(
                                                 'unsupported spectrum normalization mode {}'.format(
                                                self.spectrum_normalization_mode)
                                                 );     
                            
                            #print s.mean(axis=0)
                            #print s.std(axis=0)

                            # include phase information if requested
                            if self.include_phase:
                                # normalize phase to [-1,1]
                                phase = phase / np.pi
                                s = np.vstack([s, phase]);                                       
                            
                            # transpose to fit pylearn2 layout
                            s = np.transpose(s);
                        else:
                            # normalize to max amplitude 1
                            s = librosa.util.normalize(s);
                        
                        s = np.asfarray(s, dtype='float32');
                        
                        if frame_size > 0 and hop_size > 0:
                            s, l = self._split_sequence(s, l, frame_size, hop_size);
                        
#                         print s.shape
                        n_sequences += len(s);
                         
                        sequences.append(s);
                        labels.extend(l);
                        
                        if keep_metadata:
                            self.metadata.append({
                                        'subject'   : i,                              # subject index
                                        'trial_no'  : subject_trial_no,               # trial number within the subject
                                        'stimulus'  : last_raw_label[0],              # raw stimulus label
                                        'channel'   : c,                              # channel id
                                        'start'     : self.sequence_partitions[-1],   # index of the first frame of this sequence
                                        'stop'      : n_sequences                     # index one past the last frame
                                    });
      
        # turn into numpy arrays
        sequences = np.vstack(sequences);
        print sequences.shape;
        
        labels = np.hstack(labels);        
        
        # one_hot_y = one_hot(labels)
        one_hot_formatter = OneHotFormatter(labels.max() + 1)
        one_hot_y = one_hot_formatter.format(labels)
                
        self.labels = labels; # save for later
        
        if n_fft > 0:
            sequences = np.array([sequences]);
            
            # re-arrange dimensions
            sequences = sequences.swapaxes(0,1).swapaxes(1,2).swapaxes(2,3);

            if layout == 'ft':
                sequences = sequences.swapaxes(1,2)
            
            log.debug('final dataset shape: {} (b,0,1,c)'.format(sequences.shape));
            print 'final dataset shape: {} (b,0,1,c)'.format(sequences.shape)
            super(EEGDataset, self).__init__(topo_view=sequences, y=one_hot_y, axes=['b', 0, 1, 'c']);
        else:
            # if layout == 'ft':
            #     sequences = sequences.swapaxes(1,2)

            super(EEGDataset, self).__init__(X=sequences, y=one_hot_y, axes=['b', 0, 1, 'c']);
        
        log.debug('generated dataset "{}" with shape X={} y={} labels={} '.format(self.name, self.X.shape, self.y.shape, self.labels.shape));
        
        if save_matrix_path is not None:
            matrix = DenseDesignMatrix(X=sequences, y=one_hot_y);
            with log_timing(log, 'saving DenseDesignMatrix to {}'.format(save_matrix_path)):
                serial.save(save_matrix_path, matrix);
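
The constructor above normalizes each spectrogram with one of three modes ('mean0_std1', 'linear_0_1', 'linear_-1_1'). A minimal numpy-only sketch of the same transforms, pulled out into a standalone helper whose name is illustrative and not part of the original code:

import numpy as np

def normalize_spectrum(s, mode=None):
    # Global (whole-array) normalizations, mirroring the branches in the constructor above.
    if mode == 'mean0_std1':
        return (s - np.mean(s)) / np.std(s)                              # zero mean, unit std
    elif mode == 'linear_0_1':
        return s / np.max(s)                                             # scale so the maximum is 1
    elif mode == 'linear_-1_1':
        return -1 + 2 * (s - np.min(s)) / (np.max(s) - np.min(s))        # map [min, max] -> [-1, 1]
    elif mode is None:
        return s
    raise ValueError('unsupported spectrum normalization mode {}'.format(mode))

mag = np.abs(np.random.randn(64, 128)) ** 2                              # toy power spectrogram
print normalize_spectrum(mag, 'mean0_std1').std()                        # approximately 1.0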