def _transform_multi_channel_data(self, X, y):
    # Data partitioning
    parted_X, parted_y = self._partition_data(X=X, y=y,
                                              partition_size=self.window_size)
    transposed_X = np.transpose(parted_X, [0, 2, 1])
    converted_X = np.reshape(transposed_X,
                             (transposed_X.shape[0],
                              transposed_X.shape[1],
                              1,
                              transposed_X.shape[2]))

    # Create view converter
    view_converter = DefaultViewConverter(shape=self.sample_shape,
                                          axes=('b', 0, 1, 'c'))

    # Convert data into a design matrix
    view_converted_X = view_converter.topo_view_to_design_mat(converted_X)
    assert np.all(converted_X == view_converter.design_mat_to_topo_view(
        view_converted_X))

    # Format the target into proper format
    sum_y = np.sum(parted_y, axis=1)
    sum_y[sum_y > 0] = 1
    one_hot_formatter = OneHotFormatter(max_labels=self.n_classes)
    hot_y = one_hot_formatter.format(sum_y)

    return view_converted_X, hot_y, view_converter
class ConditionalGeneratorTestCase(unittest.TestCase):
    def setUp(self):
        self.noise_dim = 10
        self.num_labels = 10

        self.condition_dtype = 'uint8'
        self.condition_space = VectorSpace(dim=self.num_labels,
                                           dtype=self.condition_dtype)
        self.condition_formatter = OneHotFormatter(
            self.num_labels, dtype=self.condition_dtype)
        self.condition_distribution = OneHotDistribution(self.condition_space)

        # TODO: this nvis stuff is dirty. The ConditionalGenerator should
        # handle it.
        self.mlp_nvis = self.noise_dim + self.num_labels
        self.mlp_nout = 1

        # Set up model
        self.mlp = MLP(nvis=self.mlp_nvis,
                       layers=[Linear(self.mlp_nout, 'out', irange=0.1)])
        self.G = ConditionalGenerator(
            input_condition_space=self.condition_space,
            condition_distribution=self.condition_distribution,
            noise_dim=self.noise_dim,
            mlp=self.mlp)

    def test_conditional_generator_input_setup(self):
        """Check that the conditional generator correctly sets up its
        composite input layer."""

        # Feedforward: we want the net to ignore the noise and simply
        # convert the one-hot vector to a number.
        weights = np.concatenate([
            np.zeros((self.mlp_nout, self.noise_dim)),
            np.array(range(self.num_labels)).reshape(
                (1, -1)).repeat(self.mlp_nout, axis=0)
        ], axis=1).T.astype(theano.config.floatX)
        self.mlp.layers[0].set_weights(weights)

        inp = (T.matrix(), T.matrix(dtype=self.condition_dtype))
        f = theano.function(inp, self.G.mlp.fprop(inp))

        assert_array_equal(
            f(np.random.rand(self.num_labels,
                             self.noise_dim).astype(theano.config.floatX),
              self.condition_formatter.format(
                  np.array(range(self.num_labels)))),
            np.array(range(self.num_labels)).reshape(self.num_labels, 1))

    def test_sample_noise(self):
        """Test barebones noise sampling."""
        n = T.iscalar()
        cond_inp = self.condition_distribution.sample(n)
        sample_and_noise = theano.function(
            [n], self.G.sample_and_noise(cond_inp, all_g_layers=True)[1])

        print(sample_and_noise(15))
def test_dtype_errors():
    # Try to call theano_expr with a bad label dtype.
    raised = False
    fmt = OneHotFormatter(max_labels=50)
    try:
        fmt.theano_expr(theano.tensor.vector(dtype=theano.config.floatX))
    except TypeError:
        raised = True
    assert raised

    # Try to call format with a bad label dtype.
    raised = False
    try:
        fmt.format(numpy.zeros(10, dtype='float64'))
    except TypeError:
        raised = True
    assert raised
def check_one_hot_formatter(seed, max_labels, dtype, ncases):
    rng = numpy.random.RandomState(seed)
    fmt = OneHotFormatter(max_labels=max_labels, dtype=dtype)
    integer_labels = rng.random_integers(0, max_labels - 1, size=ncases)
    one_hot_labels = fmt.format(integer_labels)
    assert len(zip(*one_hot_labels.nonzero())) == ncases
    for case, label in enumerate(integer_labels):
        assert one_hot_labels[case, label] == 1
def format_targets(y):
    # matlab has one-based indexing and one-based labels
    # have to convert to zero-based labels so subtract 1...
    y = y - 1
    # we need only a 1d-array of integers
    # squeeze in case of 2 dimensions, make sure it is still 1d in case of
    # a single number (can happen for test runs with just one trial)
    y = np.atleast_1d(y.squeeze())
    y = y.astype(int)
    target_formatter = OneHotFormatter(4)
    y = target_formatter.format(y)
    return y
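# A minimal sketch (my own illustration, not from the snippets above) of what
# the format() calls produce. The import path assumes the usual pylearn2
# layout (pylearn2/format/target_format.py); the label values are made up.
import numpy as np
from pylearn2.format.target_format import OneHotFormatter

labels = np.array([0, 2, 3, 1])              # zero-based class indices
formatter = OneHotFormatter(max_labels=4)
one_hot = formatter.format(labels)
# one_hot has shape (n_cases, max_labels); row i holds a single 1 in
# column labels[i], e.g.
# [[1, 0, 0, 0],
#  [0, 0, 1, 0],
#  [0, 0, 0, 1],
#  [0, 1, 0, 0]]
assert one_hot.shape == (4, 4)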
def test_bad_arguments():
    # Make sure an invalid max_labels raises an error.
    raised = False
    try:
        fmt = OneHotFormatter(max_labels=-10)
    except ValueError:
        raised = True
    assert raised

    raised = False
    try:
        fmt = OneHotFormatter(max_labels='10')
    except ValueError:
        raised = True
    assert raised

    # Make sure an invalid dtype identifier raises an error.
    raised = False
    try:
        fmt = OneHotFormatter(max_labels=10, dtype='invalid')
    except TypeError:
        raised = True
    assert raised

    # Make sure an invalid ndim raises an error for format().
    fmt = OneHotFormatter(max_labels=10)
    raised = False
    try:
        fmt.format(numpy.zeros((2, 3), dtype='int32'))
    except ValueError:
        raised = True
    assert raised

    # Make sure an invalid ndim raises an error for theano_expr().
    raised = False
    try:
        fmt.theano_expr(theano.tensor.imatrix())
    except ValueError:
        raised = True
    assert raised
def test_bad_arguments():
    # Make sure an invalid max_labels raises an error.
    raised = False
    try:
        fmt = OneHotFormatter(max_labels=-10)
    except ValueError:
        raised = True
    assert raised

    raised = False
    try:
        fmt = OneHotFormatter(max_labels='10')
    except ValueError:
        raised = True
    assert raised

    # Make sure an invalid dtype identifier raises an error.
    raised = False
    try:
        fmt = OneHotFormatter(max_labels=10, dtype='invalid')
    except TypeError:
        raised = True
    assert raised

    # Make sure an invalid ndim raises an error for format().
    fmt = OneHotFormatter(max_labels=10)
    raised = False
    try:
        fmt.format(numpy.zeros((2, 3, 4), dtype='int32'))
    except ValueError:
        raised = True
    assert raised

    # Make sure an invalid ndim raises an error for theano_expr().
    raised = False
    try:
        fmt.theano_expr(theano.tensor.itensor3())
    except ValueError:
        raised = True
    assert raised
def generate_datasets(inputs):
    targets = np.zeros(inputs.shape[0]).astype('int')
    targets[::2] = 1  # every second target is class 1, others class 0
    inputs[targets == 1] = inputs[targets == 1] + 1

    target_formatter = OneHotFormatter(2)
    targets_one_hot = target_formatter.format(targets)

    train_set = VolumetricDenseDesignMatrix(topo_view=inputs[0:50],
                                            y=targets_one_hot[0:50],
                                            axes=('b', 0, 1, 2, 'c'))
    valid_set = VolumetricDenseDesignMatrix(topo_view=inputs[50:75],
                                            y=targets_one_hot[50:75],
                                            axes=('b', 0, 1, 2, 'c'))
    test_set = VolumetricDenseDesignMatrix(topo_view=inputs[75:100],
                                           y=targets_one_hot[75:100],
                                           axes=('b', 0, 1, 2, 'c'))
    return train_set, valid_set, test_set
def _transform_single_channel_data(self, X, y):
    windowed_X = np.reshape(X, (-1, self.window_size))
    windowed_y = np.reshape(y, (-1, self.window_size))

    # Format the target into proper format
    sum_y = np.sum(windowed_y, axis=1)
    sum_y[sum_y > 0] = 1

    # Duplicate the labels for all channels
    dup_y = np.tile(sum_y, self.n_channels)

    one_hot_formatter = OneHotFormatter(max_labels=self.n_classes)
    hot_y = one_hot_formatter.format(dup_y)

    return windowed_X, hot_y, None
def test_one_hot_formatter_simple():
    def check_one_hot_formatter(seed, max_labels, dtype, ncases):
        rng = numpy.random.RandomState(seed)
        fmt = OneHotFormatter(max_labels=max_labels, dtype=dtype)
        integer_labels = rng.random_integers(0, max_labels - 1, size=ncases)
        one_hot_labels = fmt.format(integer_labels)
        assert len(list(zip(*one_hot_labels.nonzero()))) == ncases
        for case, label in enumerate(integer_labels):
            assert one_hot_labels[case, label] == 1

    rng = numpy.random.RandomState(0)
    for seed, dtype in enumerate(all_types):
        yield (check_one_hot_formatter, seed,
               rng.random_integers(1, 30), dtype,
               rng.random_integers(1, 100))

    fmt = OneHotFormatter(max_labels=10)
    assert fmt.format(numpy.zeros((1, 1), dtype='uint8')).shape == (1, 1, 10)
def check_one_hot_formatter(seed, max_labels, dtype, ncases, nmultis):
    rng = numpy.random.RandomState(seed)
    fmt = OneHotFormatter(max_labels=max_labels, dtype=dtype)
    integer_labels = rng.random_integers(
        0, max_labels - 1, size=ncases * nmultis).reshape(ncases, nmultis)
    one_hot_labels = fmt.format(integer_labels, mode='merge')

    # The number of ones would equal ncases * nmultis if no row of
    # integer_labels contained duplicated tags (e.g. labels like
    # [1, 2, 2, 3, 5, 6]). Such duplicates are not rejected -- this allows
    # different cases to belong to different numbers of classes -- and a
    # duplicated tag activates only a single unit in the k-hot
    # representation, so numpy.unique() is used to drop duplicates before
    # counting the ones in the final k-hot representation.
    n_ones = numpy.concatenate([numpy.unique(l) for l in integer_labels])
    assert len(list(zip(*one_hot_labels.nonzero()))) == len(n_ones)

    for case, label in enumerate(integer_labels):
        assert numpy.sum(one_hot_labels[case, label]) == nmultis
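# A minimal sketch (my own illustration, not part of the test above) of the
# 'merge' mode being exercised: each row of integer labels collapses into one
# k-hot vector over max_labels classes. Values here are made up.
import numpy as np
from pylearn2.format.target_format import OneHotFormatter

fmt = OneHotFormatter(max_labels=6)
labels = np.array([[0, 2],
                   [1, 1]])                  # second row has a duplicated tag
k_hot = fmt.format(labels, mode='merge')
# Expected shape: (2, 6). Row 0 has ones at columns 0 and 2; row 1 activates
# only column 1, since a duplicated tag still drives a single unit.
assert k_hot.shape == (2, 6)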
# Samples per condition
sample_cols = 5

# Generate conditional information
conditional_batch = model.generator.condition_space.make_theano_batch()
formatter = OneHotFormatter(rows,
                            dtype=model.generator.condition_space.dtype)
conditional = formatter.theano_expr(conditional_batch, mode='concatenate')

# Now sample from generator
# For some reason format_as from VectorSpace is not working right
topo_samples_batch = model.generator.sample(conditional)
topo_sample_f = theano.function([conditional], topo_samples_batch)
conditional_data = formatter.format(
    np.concatenate([np.repeat(i, sample_cols) for i in range(rows)])
    .reshape((rows * sample_cols, 1)),
    mode='concatenate')
topo_samples = topo_sample_f(conditional_data)

samples = dataset.get_design_matrix(topo_samples)
dataset.axes = ['b', 0, 1, 'c']
dataset.view_converter.axes = ['b', 0, 1, 'c']
topo_samples = dataset.get_topological_view(samples)

pv = PatchViewer(grid_shape=(rows, sample_cols + 1),
                 patch_shape=(32, 32),
                 is_color=True)
scale = np.abs(samples).max()

X = dataset.X
topo = dataset.get_topological_view()
index = 0
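# A minimal sketch (assumed example, not from the sampling script above) of
# the 'concatenate' mode used to build conditional_data: an (n, d) array of
# integer labels becomes an (n, d * max_labels) matrix, with the d one-hot
# blocks laid side by side. With d = 1, as above, it is a plain one-hot matrix.
import numpy as np
from pylearn2.format.target_format import OneHotFormatter

fmt = OneHotFormatter(max_labels=3)
labels = np.array([[0], [2], [1]])                 # shape (3, 1)
conditional = fmt.format(labels, mode='concatenate')
assert conditional.shape == (3, 3)                 # one block of width 3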
def __init__(self, which_set, onehot_dtype='uint8', center=False, rescale=False, gcn=None, start=None, stop=None, axes=('b', 0, 1, 'c'), toronto_prepro=False, preprocessor=None): """Modified version of the CIFAR10 constructor which creates Y as one-hot vectors rather than simple indexes. This is super hacky. Sorry, Guido..""" # note: there is no such thing as the cifar10 validation set; # pylearn1 defined one but really it should be user-configurable # (as it is here) self.axes = axes # we define here: dtype = 'uint8' ntrain = 50000 nvalid = 0 # artefact, we won't use it ntest = 10000 # we also expose the following details: self.img_shape = (3, 32, 32) self.img_size = numpy.prod(self.img_shape) self.n_classes = 10 self.label_names = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck'] # prepare loading fnames = ['data_batch_%i' % i for i in range(1, 6)] datasets = {} datapath = os.path.join( string_utils.preprocess('${PYLEARN2_DATA_PATH}'), 'cifar10', 'cifar-10-batches-py') for name in fnames + ['test_batch']: fname = os.path.join(datapath, name) if not os.path.exists(fname): raise IOError(fname + " was not found. You probably need to " "download the CIFAR-10 dataset by using the " "download script in " "pylearn2/scripts/datasets/download_cifar10.sh " "or manually from " "http://www.cs.utoronto.ca/~kriz/cifar.html") datasets[name] = cache.datasetCache.cache_file(fname) lenx = numpy.ceil((ntrain + nvalid) / 10000.) * 10000 x = numpy.zeros((lenx, self.img_size), dtype=dtype) y = numpy.zeros((lenx, 1), dtype=dtype) # load train data nloaded = 0 for i, fname in enumerate(fnames): _logger.info('loading file %s' % datasets[fname]) data = serial.load(datasets[fname]) x[i * 10000:(i + 1) * 10000, :] = data['data'] y[i * 10000:(i + 1) * 10000, 0] = data['labels'] nloaded += 10000 if nloaded >= ntrain + nvalid + ntest: break # load test data _logger.info('loading file %s' % datasets['test_batch']) data = serial.load(datasets['test_batch']) # process this data Xs = {'train': x[0:ntrain], 'test': data['data'][0:ntest]} Ys = {'train': y[0:ntrain], 'test': data['labels'][0:ntest]} X = numpy.cast['float32'](Xs[which_set]) y = Ys[which_set] if isinstance(y, list): y = numpy.asarray(y).astype(dtype) if which_set == 'test': assert y.shape[0] == 10000 y = y.reshape((y.shape[0], 1)) formatter = OneHotFormatter(self.n_classes, dtype=onehot_dtype) y = formatter.format(y, mode='concatenate') if center: X -= 127.5 self.center = center if rescale: X /= 127.5 self.rescale = rescale if toronto_prepro: assert not center assert not gcn X = X / 255. if which_set == 'test': other = CIFAR10(which_set='train') oX = other.X oX /= 255. 
X = X - oX.mean(axis=0) else: X = X - X.mean(axis=0) self.toronto_prepro = toronto_prepro self.gcn = gcn if gcn is not None: gcn = float(gcn) X = global_contrast_normalize(X, scale=gcn) if start is not None: # This needs to come after the prepro so that it doesn't # change the pixel means computed above for toronto_prepro assert start >= 0 assert stop > start assert stop <= X.shape[0] X = X[start:stop, :] y = y[start:stop, :] assert X.shape[0] == y.shape[0] if which_set == 'test': assert X.shape[0] == 10000 view_converter = dense_design_matrix.DefaultViewConverter((32, 32, 3), axes) super(CIFAR10, self).__init__(X=X, y=y, view_converter=view_converter, )#y_labels=self.n_classes) assert not contains_nan(self.X) if preprocessor: preprocessor.apply(self) # Another hack: rename 'targets' to match model expectations space, (X_source, y_source) = self.data_specs self.data_specs = (space, (X_source, 'condition'))
def __init__( self, path, name='', # optional name # selectors subjects='all', # optional selector (list) or 'all' trial_types='all', # optional selector (list) or 'all' trial_numbers='all', # optional selector (list) or 'all' conditions='all', # optional selector (list) or 'all' partitioner=None, channel_filter=NoChannelFilter( ), # optional channel filter, default: keep all channel_names=None, # optional channel names (for metadata) label_map=None, # optional conversion of labels remove_dc_offset=False, # optional subtraction of channel mean, usually done already earlier resample=None, # optional down-sampling # optional sub-sequences selection start_sample=0, stop_sample=None, # optional for selection of sub-sequences # optional signal filter to by applied before spitting the signal signal_filter=None, # windowing parameters frame_size=-1, hop_size=-1, # values > 0 will lead to windowing hop_fraction=None, # alternative to specifying absolute hop_size # optional spectrum parameters, n_fft = 0 keeps raw data n_fft=0, n_freq_bins=None, spectrum_log_amplitude=False, spectrum_normalization_mode=None, include_phase=False, flatten_channels=False, layout='tf', # (0,1)-axes layout tf=time x features or ft=features x time save_matrix_path=None, keep_metadata=False, ): ''' Constructor ''' # save params self.params = locals().copy() del self.params['self'] # print self.params # TODO: get the whole filtering into an extra class datafiles_metadata, metadb = load_datafiles_metadata(path) # print datafiles_metadata def apply_filters(filters, node): if isinstance(node, dict): filtered = [] keepkeys = filters[0] for key, value in node.items(): if keepkeys == 'all' or key in keepkeys: filtered.extend(apply_filters(filters[1:], value)) return filtered else: return node # [node] # keep only files that match the metadata filters self.datafiles = apply_filters( [subjects, trial_types, trial_numbers, conditions], datafiles_metadata) # copy metadata for retained files self.metadb = {} for datafile in self.datafiles: self.metadb[datafile] = metadb[datafile] # print self.datafiles # print self.metadb self.name = name if partitioner is not None: self.datafiles = partitioner.get_partition(self.name, self.metadb) self.include_phase = include_phase self.spectrum_normalization_mode = spectrum_normalization_mode self.spectrum_log_amplitude = spectrum_log_amplitude self.sequence_partitions = [ ] # used to keep track of original sequences # metadata: [subject, trial_no, stimulus, channel, start, ] self.metadata = [] sequences = [] labels = [] n_sequences = 0 if frame_size > 0 and hop_size == -1 and hop_fraction is not None: hop_size = np.ceil(frame_size / hop_fraction) for i in xrange(len(self.datafiles)): with log_timing(log, 'loading data from {}'.format(self.datafiles[i])): # save start of next sequence self.sequence_partitions.append(n_sequences) data, metadata = load(os.path.join(path, self.datafiles[i])) label = metadata['label'] if label_map is not None: label = label_map[label] multi_channel_frames = [] # process 1 channel at a time for channel in xrange(data.shape[1]): # filter channels if not channel_filter.keep_channel(channel): continue samples = data[:, channel] # subtract channel mean if remove_dc_offset: samples -= samples.mean() # down-sample if requested if resample is not None and resample[0] != resample[1]: samples = librosa.resample(samples, resample[0], resample[1]) # apply optional signal filter after down-sampling -> requires lower order if signal_filter is not None: samples = 
signal_filter.process(samples) # get sub-sequence in resampled space # log.info('using samples {}..{} of {}'.format(start_sample,stop_sample, samples.shape)) samples = samples[start_sample:stop_sample] if n_fft is not None and n_fft > 0: # Optionally: ### frequency spectrum branch ### # transform to spectogram hop_length = n_fft / 4 ''' from http://theremin.ucsd.edu/~bmcfee/librosadoc/librosa.html >>> # Get a power spectrogram from a waveform y >>> S = np.abs(librosa.stft(y)) ** 2 >>> log_S = librosa.logamplitude(S) ''' S = librosa.core.stft(samples, n_fft=n_fft, hop_length=hop_length) # mag = np.abs(S) # magnitude spectrum mag = np.abs(S)**2 # power spectrum # include phase information if requested if self.include_phase: # phase = np.unwrap(np.angle(S)) phase = np.angle(S) # Optionally: cut off high bands if n_freq_bins is not None: mag = mag[0:n_freq_bins, :] if self.include_phase: phase = phase[0:n_freq_bins, :] if self.spectrum_log_amplitude: mag = librosa.logamplitude(mag) s = mag # for normalization ''' NOTE on normalization: It depends on the structure of a neural network and (even more) on the properties of data. There is no best normalization algorithm because if there would be one, it would be used everywhere by default... In theory, there is no requirement for the data to be normalized at all. This is a purely practical thing because in practice convergence could take forever if your input is spread out too much. The simplest would be to just normalize it by scaling your data to (-1,1) (or (0,1) depending on activation function), and in most cases it does work. If your algorithm converges well, then this is your answer. If not, there are too many possible problems and methods to outline here without knowing the actual data. ''' ## normalize to mean 0, std 1 if self.spectrum_normalization_mode == 'mean0_std1': # s = preprocessing.scale(s, axis=0); mean = np.mean(s) std = np.std(s) s = (s - mean) / std ## normalize by linear transform to [0,1] elif self.spectrum_normalization_mode == 'linear_0_1': s = s / np.max(s) ## normalize by linear transform to [-1,1] elif self.spectrum_normalization_mode == 'linear_-1_1': s = -1 + 2 * (s - np.min(s)) / (np.max(s) - np.min(s)) elif self.spectrum_normalization_mode is not None: raise ValueError( 'unsupported spectrum normalization mode {}'. format(self.spectrum_normalization_mode)) #print s.mean(axis=0) #print s.std(axis=0) # include phase information if requested if self.include_phase: # normalize phase to [-1.1] phase = phase / np.pi s = np.vstack([s, phase]) # transpose to fit pylearn2 layout s = np.transpose(s) # print s.shape ### end of frequency spectrum branch ### else: ### raw waveform branch ### # normalize to max amplitude 1 s = librosa.util.normalize(samples) # add 2nd data dimension s = s.reshape(s.shape[0], 1) # print s.shape ### end of raw waveform branch ### s = np.asfarray(s, dtype='float32') if frame_size > 0 and hop_size > 0: s = s.copy( ) # FIXME: THIS IS NECESSARY IN MultiChannelEEGSequencesDataset - OTHERWISE, THE FOLLOWING OP DOES NOT WORK!!!! 
frames = frame(s, frame_length=frame_size, hop_length=hop_size) else: frames = s del s # print frames.shape if flatten_channels: # add artificial channel dimension frames = frames.reshape( (frames.shape[0], frames.shape[1], frames.shape[2], 1)) # print frames.shape sequences.append(frames) # increment counter by new number of frames n_sequences += frames.shape[0] if keep_metadata: # determine channel name channel_name = None if channel_names is not None: channel_name = channel_names[channel] elif 'channels' in metadata: channel_name = metadata['channels'][channel] self.metadata.append({ 'subject': metadata['subject'], # subject 'trial_type': metadata['trial_type'], # trial_type 'trial_no': metadata['trial_no'], # trial_no 'condition': metadata['condition'], # condition 'channel': channel, # channel 'channel_name': channel_name, 'start': self.sequence_partitions[-1], # start 'stop': n_sequences # stop }) for _ in xrange(frames.shape[0]): labels.append(label) else: multi_channel_frames.append(frames) ### end of channel iteration ### if not flatten_channels: # turn list into array multi_channel_frames = np.asfarray(multi_channel_frames, dtype='float32') # [channels x frames x time x freq] -> cb01 # [channels x frames x time x 1] -> cb0. # move channel dimension to end multi_channel_frames = np.rollaxis(multi_channel_frames, 0, 4) # print multi_channel_frames.shape # log.debug(multi_channel_frames.shape) sequences.append(multi_channel_frames) # increment counter by new number of frames n_sequences += multi_channel_frames.shape[0] if keep_metadata: self.metadata.append({ 'subject': metadata['subject'], # subject 'trial_type': metadata['trial_type'], # trial_type 'trial_no': metadata['trial_no'], # trial_no 'condition': metadata['condition'], # condition 'channel': 'all', # channel 'start': self.sequence_partitions[-1], # start 'stop': n_sequences # stop }) for _ in xrange(multi_channel_frames.shape[0]): labels.append(label) ### end of datafile iteration ### # turn into numpy arrays sequences = np.vstack(sequences) # print sequences.shape; labels = np.hstack(labels) # one_hot_y = one_hot(labels) one_hot_formatter = OneHotFormatter(labels.max() + 1) # FIXME! one_hot_y = one_hot_formatter.format(labels) self.labels = labels if layout == 'ft': # swap axes to (batch, feature, time, channels) sequences = sequences.swapaxes(1, 2) log.debug('final dataset shape: {} (b,0,1,c)'.format(sequences.shape)) super(MultiChannelEEGDataset, self).__init__(topo_view=sequences, y=one_hot_y, axes=['b', 0, 1, 'c']) log.info( 'generated dataset "{}" with shape X={}={} y={} labels={} '.format( self.name, self.X.shape, sequences.shape, self.y.shape, self.labels.shape)) if save_matrix_path is not None: matrix = DenseDesignMatrix(topo_view=sequences, y=one_hot_y, axes=['b', 0, 1, 'c']) with log_timing( log, 'saving DenseDesignMatrix to {}'.format(save_matrix_path)): serial.save(save_matrix_path, matrix)
def svd_accuracy(file_name, ec, kwargs, folds=10, max_svs=10, max_init=15):
    """
    Classify data based on svd features.
    """
    kwargs['condense'] = False
    ds = ec.ECoG(file_name, which_set='train', **kwargs)
    n_classes = int(np.around(ds.y.max() + 1))
    max_svs = min(max_svs, n_classes)
    init_list = np.arange(0, n_classes - max_svs + 1)
    init_list = init_list[init_list < max_init]
    nsvs_list = np.arange(1, max_svs + 1)
    pa = np.inf * np.ones((folds, len(nsvs_list), len(init_list)))
    ma = np.inf * np.ones((folds, len(nsvs_list), len(init_list)))
    va = np.inf * np.ones((folds, len(nsvs_list), len(init_list)))
    u_s = np.zeros((folds, n_classes, n_classes))
    s_s = np.zeros((folds, n_classes))
    v_s = np.zeros((folds, n_classes, ds.X.shape[1]))
    ohf = OneHotFormatter(n_classes)

    for fold in range(folds):
        kwargs_copy = copy.deepcopy(kwargs)
        print('fold: {}'.format(fold))
        ds = ec.ECoG(file_name, which_set='train', fold=fold,
                     center=False, **kwargs_copy)
        # CV
        ts = ds.get_test_set()
        vs = ds.get_valid_set()
        train_X = np.concatenate((ds.X, vs.X), axis=0)
        train_mean = train_X.mean(axis=0)
        train_X = train_X - train_mean
        train_y = np.concatenate((ds.y, vs.y), axis=0)
        test_X = ts.X - train_mean
        test_y = ts.y
        y_oh = ohf.format(train_y, mode='concatenate')
        c_yx = (y_oh - y_oh.mean(axis=0)).T.dot(train_X) / train_X.shape[0]
        u, s, v = np.linalg.svd(c_yx, full_matrices=False)
        u_s[fold] = u
        s_s[fold] = s
        v_s[fold] = v
        for ii, n_svs in enumerate(nsvs_list):
            for jj, sv_init in enumerate(init_list):
                vp = v[sv_init:sv_init + n_svs]
                train_proj = train_X.dot(vp.T)
                test_proj = test_X.dot(vp.T)
                cl = LR(solver='lbfgs',
                        multi_class='multinomial').fit(train_proj,
                                                       train_y.ravel())
                y_hat = cl.predict(test_proj)
                p_results = []
                m_results = []
                v_results = []
                for y, yh in zip(test_y.ravel(), y_hat.ravel()):
                    pr = place_equiv(y, yh)
                    if pr is not None:
                        p_results.append(pr)
                    mr = manner_equiv(y, yh)
                    if mr is not None:
                        m_results.append(mr)
                    vr = vowel_equiv(y, yh)
                    if vr is not None:
                        v_results.append(vr)
                pa[fold, ii, jj] = np.array(p_results).mean()
                ma[fold, ii, jj] = np.array(m_results).mean()
                va[fold, ii, jj] = np.array(v_results).mean()
    return pa, ma, va, u_s, s_s, v_s, init_list, nsvs_list
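# A small self-contained check (my own illustration, with made-up data) of the
# projection step used above: the cross-covariance between centered one-hot
# labels and features has one row per class, and its right singular vectors
# give the label-predictive directions that train_X is projected onto.
import numpy as np
from pylearn2.format.target_format import OneHotFormatter

rng = np.random.RandomState(0)
X = rng.randn(100, 5)                                        # features
y = rng.randint(0, 3, size=(100, 1))                         # integer labels
y_oh = OneHotFormatter(3).format(y, mode='concatenate')      # (100, 3)
c_yx = (y_oh - y_oh.mean(axis=0)).T.dot(X) / X.shape[0]      # (3, 5)
u, s, v = np.linalg.svd(c_yx, full_matrices=False)
proj = X.dot(v[:1].T)                                        # top-1 projection
assert proj.shape == (100, 1)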
def load_data(self): # Get the directory of the patient data patient_dir = os.path.join(self.data_dir, self.patient_id) # Load metadata about dataset form MAT file metadata_fname = os.path.join(patient_dir, 'trainset.mat') metadata_mat = loadmat(metadata_fname) # Get number of seizures self.n_seizures = metadata_mat.get('ictals').size # Get detail of the segment self.sampling_rate = metadata_mat['sampling_rate'][0][0] self.segment_sec = metadata_mat['segment_sec'][0][0] self.segment_samples = self.sampling_rate * self.segment_sec self.preictal_samples = 0 self.nonictal_samples = 0 # Examples of indexing through MAT file # mat['nonictals'][i][0]['filename'][0][0][0][j][0] # mat['nonictals'][i][0]['idx'][0][0][0][j][0] # mat['nonictals'][i][0]['n_segments'][0][0][0][0] # Balanced classes if self.which_set == 'train' or self.which_set == 'valid_train': if self.which_set == 'train': select_idx = np.setdiff1d( range(metadata_mat['preictals'].size), np.asarray([ self.leave_out_seizure_idx_valid, self.leave_out_seizure_idx_test ])) else: select_idx = np.asarray([self.leave_out_seizure_idx_valid]) X = None y = None for i in select_idx: print '====== Seizure', i, '======' # Non-ictal data temp_nonictal_X = self.load_segment(part='nonictals', seizure_idx=i, metadata_mat=metadata_mat, patient_dir=patient_dir) # Pre-ictal temp_preictal_X = self.load_segment(part='preictals', seizure_idx=i, metadata_mat=metadata_mat, patient_dir=patient_dir) # Concatenate preictal and nonictal data temp_X = np.concatenate((temp_preictal_X, temp_nonictal_X), axis=0) temp_y = np.zeros(temp_X.shape[0], dtype=int) temp_y[range(temp_preictal_X.shape[0])] = 1 # Sanity check # if not (temp_preictal_X.shape[0] == temp_nonictal_X.shape[0]): # raise Exception('Unbalanced classes.') print 'Preictal samples: {0}, Nonictal samples: {1}'.format( temp_preictal_X.shape[0], temp_nonictal_X.shape[0]) if not np.all( np.arange(temp_preictal_X.shape[0]) == np.where(temp_y) [0]): raise Exception( 'There is a mismatch between the number of preictal data and labels.' 
) self.preictal_samples = self.preictal_samples + temp_preictal_X.shape[ 0] self.nonictal_samples = self.nonictal_samples + temp_nonictal_X.shape[ 0] if not (X is None) and not (y is None): X = np.concatenate((X, temp_X), axis=0) y = np.append(y, temp_y) else: X = temp_X y = temp_y # Unbalanced classes elif self.which_set == 'valid' or self.which_set == 'test': if self.which_set == 'valid': select_idx = self.leave_out_seizure_idx_valid else: select_idx = self.leave_out_seizure_idx_test print '====== Seizure', select_idx, '======' # Get metadata of all blocks block_df = pd.read_table(os.path.join(patient_dir, 'block_metadata.txt'), sep='\t') # Get block index of the selected seizure select_sz_fname = metadata_mat['preictals'][select_idx][0][ 'filename'][0][0][0][0][0] block_idx = np.where(block_df.filename == select_sz_fname)[0][0] n_padded_block = 2 start_block_idx = block_idx - n_padded_block end_block_idx = block_idx + n_padded_block + 1 if start_block_idx < 0: start_block_idx = 0 if end_block_idx > block_df.shape[0]: end_block_idx = block_df.shape[0] select_block_idx = np.arange(start_block_idx, end_block_idx) filenames = block_df.filename[select_block_idx].values X = None y = None y_select_idx = None ictal_labels = None for b_idx, fname in enumerate(filenames): # Name of the MAT files that store EEG data data_fname = fname.replace('.data', '.mat') # Name of the MAT file that stores indices of flat (i.e., false) segments fname_flat = fname.replace('.data', '_flat_signal_segment_idx.mat') # Get all good indices (i.e., remove segments of flat signals) flat_mat = loadmat(os.path.join(patient_dir, fname_flat)) flat_idx = np.empty(0, dtype=int) for j in range(flat_mat['flat_signal_segment_idx'].shape[0]): flat_idx = np.append( flat_idx, np.squeeze(flat_mat['flat_signal_segment_idx'][j][0])) flat_idx = flat_idx - 1 # Change from MATLAB to python index system data_mat = loadmat(os.path.join(patient_dir, data_fname)) if data_mat['signals'].shape[1] != block_df.samples[ select_block_idx[b_idx]]: raise Exception( 'There is a mismatch between the number samples specified in the metadata and ' 'the provided signal data') n_segments = np.ceil(data_mat['signals'].shape[1] / (self.segment_samples * 1.0)) all_idx = np.arange(n_segments, dtype=int) good_idx = np.setdiff1d(all_idx, flat_idx) # Get indicies of scalp EEG channels elec_names = np.asarray( [ename[0][0] for ename in data_mat['elec_names']]) scalp_channels_idx = np.empty(0, dtype=int) for ch in self.scalp_channel_labels: scalp_channels_idx = np.append( scalp_channels_idx, np.where(elec_names == ch)[0][0]) print 'Load', self.which_set, 'data from', fname if good_idx.size > 0: temp_X = None for idx in range(good_idx.size): g_idx = good_idx[idx] start_sample_idx = np.uint32( g_idx) * self.segment_samples end_sample_idx = np.uint32(g_idx + 1) * self.segment_samples if end_sample_idx > data_mat['signals'].shape[1]: # Zero-padding if the window size is not compatible extra = end_sample_idx - data_mat['signals'].shape[ 1] assert (data_mat['signals'].shape[1] + extra) % self.segment_samples == 0 if extra > 0: data_mat['signals'] = np.concatenate( (data_mat['signals'], np.zeros( (data_mat['signals'].shape[0], extra), dtype=float)), axis=1) assert data_mat['signals'].shape[ 1] % self.segment_samples == 0 temp_sample_idx = np.arange(start_sample_idx, end_sample_idx) if not (temp_X is None): temp = data_mat['signals'][:, temp_sample_idx] temp_X = np.concatenate( (temp_X, np.asarray([temp[scalp_channels_idx, :]])), axis=0) else: temp = 
data_mat['signals'][:, temp_sample_idx] temp_X = np.asarray([temp[scalp_channels_idx, :]]) # If this record contains preictal data, get preictal labels temp_preictal_meta_idx = -1 temp_preictal_fname_idx = -1 for preictal_meta_idx, preictal_meta in enumerate( metadata_mat['preictals']): for preictal_fname_idx, preictal_fname in enumerate( preictal_meta[0]['filename'][0][0][0]): if preictal_fname == fname: temp_preictal_meta_idx = preictal_meta_idx temp_preictal_fname_idx = preictal_fname_idx break if temp_preictal_meta_idx != -1 and temp_preictal_fname_idx != -1: # Preictal indices preictal_idx = metadata_mat['preictals'][ temp_preictal_meta_idx][0]['idx'][0][0][0][ temp_preictal_fname_idx][0] preictal_idx = preictal_idx - 1 # Change from MATLAB to python index system temp_y = np.zeros(n_segments, dtype=int) temp_y[preictal_idx] = 1 # Sanity check if not (preictal_idx.size == np.intersect1d( good_idx, preictal_idx).size): raise Exception( 'Good indices and preictal indices are mismatch.' ) # Remove segment of flat signals from labels temp_y = temp_y[good_idx] self.preictal_samples = self.preictal_samples + preictal_idx.size self.nonictal_samples = self.nonictal_samples + ( temp_y.size - preictal_idx.size) else: temp_y = np.zeros(temp_X.shape[0], dtype=int) self.nonictal_samples = self.nonictal_samples + temp_y.size # If this record contains preictal data of the leave-out-seizure index, get preictal labels if temp_preictal_meta_idx == select_idx: temp_y_select_idx = temp_y else: temp_y_select_idx = np.zeros(temp_X.shape[0], dtype=int) # If this record contains ictal data, get ictal labels temp_ictal_meta_idx = -1 temp_ictal_fname_idx = -1 for ictal_meta_idx, ictal_meta in enumerate( metadata_mat['ictals']): for ictal_fname_idx, ictal_fname in enumerate( ictal_meta[0]['filename'][0][0][0]): if ictal_fname == fname: temp_ictal_meta_idx = ictal_meta_idx temp_ictal_fname_idx = ictal_fname_idx break if temp_ictal_meta_idx != -1 and temp_ictal_fname_idx != -1: # Ictal indices ictal_idx = metadata_mat['ictals'][ temp_ictal_meta_idx][0]['idx'][0][0][0][ temp_ictal_fname_idx][0] ictal_idx = ictal_idx - 1 # Change from MATLAB to python index system temp_ictal_labels = np.zeros(n_segments, dtype=int) temp_ictal_labels[ictal_idx] = 1 # Sanity check if not (ictal_idx.size == np.intersect1d( good_idx, ictal_idx).size): raise Exception( 'Good indices and ictal indices are mismatch.') # Remove segment of flat signals from labels temp_ictal_labels = temp_ictal_labels[good_idx] else: temp_ictal_labels = np.zeros(temp_X.shape[0], dtype=int) # Sanity check if not (temp_X.shape[0] == temp_y.size): raise Exception( 'Number of feature data and labels are not equal.') if not (temp_X.shape[0] == temp_ictal_labels.size): raise Exception( 'Number of feature data and labels are not equal.') if not (X is None) and not (y is None) and not ( ictal_labels is None): X = np.concatenate((X, temp_X), axis=0) y = np.append(y, temp_y) y_select_idx = np.append(y_select_idx, temp_y_select_idx) ictal_labels = np.append(ictal_labels, temp_ictal_labels) else: X = temp_X y = temp_y y_select_idx = temp_y_select_idx ictal_labels = temp_ictal_labels else: print 'There is no good segment for during this seizure' # Store preictal labels that are from the leave-out-seizure index (use for compute accuracy) # Note: this property will exist when which_set=='valid' or which_set=='test' # as there is no need for ictal to be imported. 
self.y_select_idx = y_select_idx # Sanity check if np.where(y_select_idx == 1)[0].size > np.where(y == 1)[0].size: raise Exception( 'There is an error in collecting preictal labels only from the leave-out-seizure index.' ) elif np.where(y_select_idx == 1)[0].size == np.where( y == 1)[0].size: print 'There is only one preictal periods, and this period is from the leave-out-seizure index.' if not np.all( np.where(y_select_idx == 1)[0] == np.where(y == 1)[0]): raise Exception( 'There is a mismatch between y_select_idx and y.') elif np.where(y_select_idx == 1)[0].size < np.where( y == 1)[0].size: print 'There are more than one preictal periods.' if not np.all( np.intersect1d( np.where(y == 1)[0], np.where(y_select_idx == 1)[0]) == np.where( y_select_idx == 1)[0]): raise Exception( 'There is a mismatch between y_select_idx and y in the preictal labels of the leave-out-seizure index.' ) # Store ictal labels # Note: this property will exist when which_set=='valid' or which_set=='test' # as there is no need for ictal to be imported. self.ictal_labels = ictal_labels else: raise Exception('Invalid dataset selection') X = np.transpose(X, [0, 2, 1]) one_hot_formatter = OneHotFormatter(max_labels=2) y = one_hot_formatter.format(y) # Sanity check if not (X.shape[0] == self.preictal_samples + self.nonictal_samples): raise Exception( 'There is a mismatch in the number of training samples.') if not (np.where(np.argmax(y, axis=1) == 1)[0].size == self.preictal_samples): raise Exception( 'There is a mismatch in the number of preictal samples and its labels.' ) if not (X.shape[0] == y.shape[0]): raise Exception( 'There is a mismatch in the number of training samples and its labels.' ) return X, y
class IndexSpace(Space):
    """
    A space representing indices, for example MNIST labels (0-9) or the
    indices of words in a dictionary for NLP tasks. A single space can
    contain multiple indices, for example the word indices of an n-gram.

    IndexSpaces can be converted to VectorSpaces in two ways: either the
    labels are converted into one-hot vectors which are then concatenated,
    or they are converted into a single vector where 1s indicate the labels
    present, i.e. for 4 possible labels we have
    [0, 2] -> [1 0 1 0] or [0, 2] -> [1 0 0 0 0 0 1 0].
    """
    def __init__(self, max_labels, dim, **kwargs):
        """
        Initialize an IndexSpace.

        Parameters
        ----------
        max_labels : int
            The number of possible classes/labels. This means that
            all labels should be < max_labels. Example: For MNIST
            there are 10 numbers and hence max_labels = 10.
        dim : int
            The number of indices in one space, e.g. for MNIST there is
            one target label and hence dim = 1. If we have an n-gram
            of word indices as input to a neural net language model,
            dim = n.
        kwargs : passed on to the superclass constructor
        """
        super(IndexSpace, self).__init__(**kwargs)

        self.max_labels = max_labels
        self.dim = dim
        self.formatter = OneHotFormatter(self.max_labels)

    def __str__(self):
        """
        Return a string representation.
        """
        return '%(classname)s(dim=%(dim)s, max_labels=%(max_labels)s)' % \
            dict(classname=self.__class__.__name__,
                 dim=self.dim,
                 max_labels=self.max_labels)

    def __eq__(self, other):
        return (type(self) == type(other) and
                self.max_labels == other.max_labels and
                self.dim == other.dim)

    def __ne__(self, other):
        return not self == other

    @functools.wraps(Space.get_total_dimension)
    def get_total_dimension(self):
        return self.dim

    @functools.wraps(Space.np_format_as)
    def np_format_as(self, batch, space):
        if isinstance(space, VectorSpace):
            if self.max_labels == space.dim:
                rval = self.formatter.format(batch, sparse=space.sparse,
                                             mode='merge')
            elif self.dim * self.max_labels == space.dim:
                rval = self.formatter.format(batch, sparse=space.sparse,
                                             mode='concatenate')
            else:
                raise ValueError("Can't convert %s to %s" % (self, space))
            return rval
        else:
            raise ValueError("Can't convert %s to %s" % (self, space))

    @functools.wraps(Space._format_as)
    def _format_as(self, batch, space):
        """
        Supports formatting to a VectorSpace where indices are represented
        by ones in a binary vector.
        """
        if isinstance(space, VectorSpace):
            if self.max_labels == space.dim:
                rval = self.formatter.theano_expr(batch, sparse=space.sparse,
                                                  mode='merge')
            elif self.dim * self.max_labels == space.dim:
                rval = self.formatter.theano_expr(batch, sparse=space.sparse,
                                                  mode='concatenate')
            else:
                raise ValueError("Can't convert %s to %s" % (self, space))
            return rval
        else:
            raise ValueError("Can't convert %s to %s" % (self, space))

    @functools.wraps(Space.make_theano_batch)
    def make_theano_batch(self, name=None, dtype=None, batch_size=None):
        if batch_size == 1:
            rval = T.lrow(name=name)
        else:
            rval = T.lmatrix(name=name)
        return rval

    @functools.wraps(Space.batch_size)
    def batch_size(self, batch):
        self.validate(batch)
        return batch.shape[0]

    @functools.wraps(Space.np_batch_size)
    def np_batch_size(self, batch):
        self.np_validate(batch)
        return batch.shape[0]

    @functools.wraps(Space._validate)
    def _validate(self, batch):
        """
        .. todo::

            WRITEME
        """
        if not isinstance(batch, theano.gof.Variable):
            raise TypeError("IndexSpace batch should be a theano Variable, "
                            "got " + str(type(batch)))
        if not isinstance(batch.type, (theano.tensor.TensorType,
                                       CudaNdarrayType)):
            raise TypeError("IndexSpace batch should be TensorType or "
                            "CudaNdarrayType, got " + str(batch.type))
        if batch.ndim != 2:
            raise ValueError('IndexSpace batches must be 2D, got %d '
                             'dimensions' % batch.ndim)
        for val in get_debug_values(batch):
            self.np_validate(val)

    @functools.wraps(Space._np_validate)
    def _np_validate(self, batch):
        # Use the 'CudaNdarray' string to avoid importing
        # theano.sandbox.cuda when it is not available
        if (not isinstance(batch, np.ndarray) and
                str(type(batch)) != "<type 'CudaNdarray'>"):
            raise TypeError("The value of an IndexSpace batch should be a "
                            "numpy.ndarray, or CudaNdarray, but is %s." %
                            str(type(batch)))
        if batch.ndim != 2:
            raise ValueError("The value of an IndexSpace batch must be "
                             "2D, got %d dimensions for %s." %
                             (batch.ndim, batch))
        if batch.shape[1] != self.dim:
            raise ValueError("The width of an IndexSpace batch must match "
                             "with the space's dimension, but batch has "
                             "shape %s and dim = %d." %
                             (str(batch.shape), self.dim))
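# A minimal sketch (my own example, not part of the class above) of how an
# IndexSpace batch is formatted into a VectorSpace; the import path assumes
# the usual pylearn2 layout and the values are made up.
import numpy as np
from pylearn2.space import IndexSpace, VectorSpace

index_space = IndexSpace(max_labels=4, dim=2)        # e.g. a pair of labels
batch = np.array([[0, 2]])                           # shape (batch, dim)

merged = index_space.np_format_as(batch, VectorSpace(dim=4))
# -> [[1, 0, 1, 0]]: a single vector with 1s at the labels present

concatenated = index_space.np_format_as(batch, VectorSpace(dim=8))
# -> [[1, 0, 0, 0, 0, 0, 1, 0]]: the two one-hot vectors laid side by side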
def load_data(self): # Get the directory of the patient data patient_dir = os.path.join(self.data_dir, self.patient_id) # Load metadata about dataset form MAT file metadata_fname = os.path.join( patient_dir, 'trainset_' + str(self.preictal_sec) + '.mat') metadata_mat = loadmat(metadata_fname) # Get number of seizures self.n_seizures = metadata_mat.get('ictals').size # Get detail of the segment self.sampling_rate = metadata_mat['sampling_rate'][0][0] self.segment_sec = metadata_mat['segment_sec'][0][0] self.segment_samples = self.sampling_rate * self.segment_sec # Get the number blocks to extend from the withheld seizure self.n_extended_blocks_test = metadata_mat['n_extended_blocks_test'][ 0][0] self.preictal_samples = 0 self.nonictal_samples = 0 self.nan_non_flat_samples = 0 # Examples of indexing through MAT file # mat['nonictals'][i][0]['filename'][0][0][0][j][0] # mat['nonictals'][i][0]['idx'][0][0][0][j][0] # mat['nonictals'][i][0]['n_segments'][0][0][0][0] # Load shuffle data if self.which_set == 'train' or self.which_set == 'valid_train': if self.which_set == 'train': select_idx = np.setdiff1d( range(metadata_mat['preictals'].size), np.asarray([ self.leave_out_seizure_idx_valid, self.leave_out_seizure_idx_test ])) else: select_idx = np.asarray([self.leave_out_seizure_idx_valid]) X = None y = None if self.use_all_nonictals: temp_preictal_X = None for i in select_idx: print '====== Seizure', i, '======' # Pre-ictal temp_X = self.load_feature( part='preictals', list_features=self.list_features, seizure_idx=i, metadata_mat=metadata_mat, patient_dir=patient_dir) if not (temp_preictal_X is None): temp_preictal_X = np.concatenate( (temp_preictal_X, temp_X), axis=1) else: temp_preictal_X = temp_X self.preictal_samples = temp_preictal_X.shape[1] # Non-ictal data temp_nonictal_X = self.load_feature( part='nonictals_all', list_features=self.list_features, seizure_idx=self.leave_out_seizure_idx_test, metadata_mat=metadata_mat, patient_dir=patient_dir) X = np.concatenate((temp_preictal_X, temp_nonictal_X), axis=1) y = np.zeros(X.shape[1], dtype=int) y[range(self.preictal_samples)] = 1 self.nonictal_samples = temp_nonictal_X.shape[1] print 'Preictal samples: {0}, Nonictal samples: {1}'.format( self.preictal_samples, self.nonictal_samples) if not np.all( np.arange(self.preictal_samples) == np.where(y)[0]): raise Exception( 'There is a mismatch between the number of preictal data and labels.' ) else: for i in select_idx: print '====== Seizure', i, '======' # Non-ictal data temp_nonictal_X = self.load_feature( part='nonictals', list_features=self.list_features, seizure_idx=i, metadata_mat=metadata_mat, patient_dir=patient_dir) # Pre-ictal temp_preictal_X = self.load_feature( part='preictals', list_features=self.list_features, seizure_idx=i, metadata_mat=metadata_mat, patient_dir=patient_dir) # Concatenate preictal and nonictal data temp_X = np.concatenate((temp_preictal_X, temp_nonictal_X), axis=1) temp_y = np.zeros(temp_X.shape[1], dtype=int) temp_y[range(temp_preictal_X.shape[1])] = 1 # Sanity check # if not (temp_preictal_X.shape[1] == temp_nonictal_X.shape[1]): # raise Exception('Unbalanced classes.') print 'Preictal samples: {0}, Nonictal samples: {1}'.format( temp_preictal_X.shape[1], temp_nonictal_X.shape[1]) if not np.all( np.arange(temp_preictal_X.shape[1]) == np.where( temp_y)[0]): raise Exception( 'There is a mismatch between the number of preictal data and labels.' 
) self.preictal_samples = self.preictal_samples + temp_preictal_X.shape[ 1] self.nonictal_samples = self.nonictal_samples + temp_nonictal_X.shape[ 1] if not (X is None) and not (y is None): X = np.concatenate((X, temp_X), axis=1) y = np.append(y, temp_y) else: X = temp_X y = temp_y # Load continuous data elif self.which_set == 'valid' or self.which_set == 'test': if self.which_set == 'valid': select_idx = self.leave_out_seizure_idx_valid else: select_idx = self.leave_out_seizure_idx_test print '====== Seizure', select_idx, '======' # Get metadata of all blocks block_df = pd.read_table(os.path.join(patient_dir, 'block_metadata.txt'), sep='\t') # Get block index of the selected seizure select_sz_fname = metadata_mat['preictals'][select_idx][0][ 'filename'][0][0][0][0][0] block_idx = np.where(block_df.filename == select_sz_fname)[0][0] start_block_idx = block_idx - self.n_extended_blocks_test end_block_idx = block_idx + self.n_extended_blocks_test + 1 if start_block_idx < 0: start_block_idx = 0 if end_block_idx > block_df.shape[0]: end_block_idx = block_df.shape[0] select_block_idx = np.arange(start_block_idx, end_block_idx) filenames = block_df.filename[select_block_idx].values X = None y = None y_label_all = None ictal_labels = None for b_idx, fname in enumerate(filenames): # Name of the MAT file that stores indices of flat (i.e., false) segments fname_flat = fname.replace('.data', '_flat_signal_segment_idx.mat') # Get all good indices (i.e., remove segments of flat signals) flat_mat = loadmat(os.path.join(patient_dir, fname_flat)) flat_idx = np.empty(0, dtype=int) for j in range(flat_mat['flat_signal_segment_idx'].shape[0]): flat_idx = np.append( flat_idx, np.squeeze(flat_mat['flat_signal_segment_idx'][j][0])) flat_idx = flat_idx - 1 # Change from MATLAB to python index system n_segments = np.ceil( block_df.samples[select_block_idx[b_idx]] / (self.segment_samples * 1.0)) all_idx = np.arange(n_segments, dtype=int) good_idx = np.setdiff1d(all_idx, flat_idx) print 'Load', self.which_set, 'data from', fname if good_idx.size > 0: # Features with shape [n_features, n_samples] temp_X = self.load_list_feature( list_features=self.list_features, sample_idx=good_idx, fname=fname, patient_dir=patient_dir) # If this record contains preictal data in the withheld seizures, get preictal labels temp_y_withheld = self.get_labels( label_type='preictals', filename=fname, good_idx=good_idx, metadata_mat=metadata_mat, n_all_segments=n_segments, n_data_segments=temp_X.shape[1], select_meta_idx=select_idx) # If this record contains preictal data in the selected seizures, get preictal labels temp_y_select = self.get_labels( label_type='preictals', filename=fname, good_idx=good_idx, metadata_mat=metadata_mat, n_all_segments=n_segments, n_data_segments=temp_X.shape[1]) # If this record contains preictal data in all seizures, get preictal labels temp_y_rm = self.get_labels( label_type='all_preictals', filename=fname, good_idx=good_idx, metadata_mat=metadata_mat, n_all_segments=n_segments, n_data_segments=temp_X.shape[1]) tmp_preictal_withheld_idx = np.where( temp_y_withheld == 1)[0] tmp_preictal_select_idx = np.where(temp_y_select == 1)[0] tmp_preictal_rm_idx = np.where(temp_y_rm == 1)[0] tmp_preictal_select_idx = np.setdiff1d( tmp_preictal_select_idx, tmp_preictal_withheld_idx) tmp_preictal_rm_idx = np.setdiff1d( tmp_preictal_rm_idx, tmp_preictal_withheld_idx) tmp_preictal_rm_idx = np.setdiff1d( tmp_preictal_rm_idx, tmp_preictal_select_idx) self.preictal_samples = self.preictal_samples + np.where( temp_y_withheld == 
def load_data(self, which_set, sample_size_second, batch_size, scaler_path): raw_data, raw_labels, channel_labels, \ seizure_range_idx, seizure_range_second, seizure_seconds, \ n_channels, sample_size, sampling_rate = self.load_source_data(sample_size_second) self.channel_labels = channel_labels self.seizure_seconds_src = seizure_seconds self.sampling_rate = sampling_rate self.raw_data = raw_data # Generate seiuzre index (rounded to be divided by the sampling rate) seizure_round_sample_idx = np.empty(seizure_range_second.size, dtype=object) for r in range(seizure_range_second.size): start_idx = seizure_range_second[r][0] * sampling_rate end_idx = seizure_range_second[r][-1] * sampling_rate seizure_round_sample_idx[r] = np.arange(start_idx, end_idx) # Generate non-seizure index non_seizure_round_sample_idx = np.arange(raw_data.shape[1]) for s_idx in seizure_round_sample_idx: non_seizure_round_sample_idx = np.setdiff1d(non_seizure_round_sample_idx, s_idx) # Partition non-seizure data into segments # Then randomly choose for training, cv and test sets n_segments = 10 segment_size = non_seizure_round_sample_idx.size / n_segments segment_size = segment_size - (segment_size % sampling_rate) segment_idx = np.empty(n_segments, dtype=object) for i in range(n_segments): start_segment_idx = i * segment_size end_segment_idx = (i+1) * segment_size if end_segment_idx > non_seizure_round_sample_idx.size: end_segment_idx = non_seizure_round_sample_idx.size segment_idx[i] = np.arange(start_segment_idx, end_segment_idx) # Select test seizure index test_seizure_idx = self.leave_one_out_seizure np.random.seed(test_seizure_idx) # Leave-one-out cross-validation - seizure n_seizures = seizure_range_idx.shape[0] rest_seizure_idx = np.setdiff1d(np.arange(n_seizures), test_seizure_idx) perm_rest_seizure_idx = np.random.permutation(rest_seizure_idx) train_seizure_idx = perm_rest_seizure_idx cv_seizure_idx = perm_rest_seizure_idx # Leave-one-out cross-validation - non-seizure n_train_segments = int(n_segments * 0.6) n_cv_segments = int(n_segments * 0.2) non_seizure_segment_idx = np.arange(n_segments) perm_non_seizure_segment_idx = np.random.permutation(non_seizure_segment_idx) train_sample_segments = perm_non_seizure_segment_idx[:n_train_segments] cv_sample_segments = perm_non_seizure_segment_idx[n_train_segments:n_train_segments+n_cv_segments] test_sample_segments = perm_non_seizure_segment_idx[n_train_segments+n_cv_segments:] train_sample_idx = np.empty(0, dtype=int) for s in train_sample_segments: train_sample_idx = np.append(train_sample_idx, segment_idx[s]) cv_sample_idx = np.empty(0, dtype=int) for s in cv_sample_segments: cv_sample_idx = np.append(cv_sample_idx, segment_idx[s]) test_sample_idx = np.empty(0, dtype=int) for s in test_sample_segments: test_sample_idx = np.append(test_sample_idx, segment_idx[s]) print 'Segment index for train, cv and test sets:', \ train_sample_segments, cv_sample_segments, test_sample_segments print 'Seizure index for train, cv and test sets:', \ train_seizure_idx, cv_seizure_idx, [test_seizure_idx] if which_set == 'train': print("Loading training data...") data = raw_data[:,non_seizure_round_sample_idx[train_sample_idx]] labels = raw_labels[non_seizure_round_sample_idx[train_sample_idx]] select_seizure = train_seizure_idx elif which_set == 'valid': print("Loading validation data...") data = raw_data[:,non_seizure_round_sample_idx[cv_sample_idx]] labels = raw_labels[non_seizure_round_sample_idx[cv_sample_idx]] select_seizure = cv_seizure_idx elif which_set == 'test': 
print("Loading test data...") data = raw_data[:,non_seizure_round_sample_idx[test_sample_idx]] labels = raw_labels[non_seizure_round_sample_idx[test_sample_idx]] select_seizure = [test_seizure_idx] elif which_set == 'all': print("Loading all data...") data = raw_data labels = raw_labels select_seizure = [] else: raise('Invalid set.') # Add seizure data for sz in select_seizure: data = np.concatenate((data, raw_data[:, seizure_round_sample_idx[sz]]), axis=1) labels = np.concatenate((labels, raw_labels[seizure_round_sample_idx[sz]]), axis=1) # No filtering # Preprocessing if which_set == 'train': scaler = preprocessing.StandardScaler() scaler = scaler.fit(data.transpose()) with open(scaler_path, 'w') as f: pickle.dump(scaler, f) data = scaler.transform(data.transpose()).transpose() else: with open(scaler_path) as f: scaler = pickle.load(f) data = scaler.transform(data.transpose()).transpose() # Input transformation X = np.reshape(data, (-1, sample_size)) y = np.reshape(labels, (-1, sample_size)) y = np.sum(y, 1).transpose() y[y > 0] = 1 print 'Seizure index after transform:', np.where(y)[0] self.seizure_seconds = np.where(y)[0] # Duplicate the labels for all channels y = np.tile(y, n_channels) # Format the target into proper format n_classes = 2 one_hot_formatter = OneHotFormatter(max_labels=n_classes) y = one_hot_formatter.format(y) # Check batch size cut_off = X.shape[0] % batch_size if cut_off > 0: X = X[:-cut_off,:] y = y[:-cut_off,:] return X, y, n_channels, sample_size
def _build_frames_w_phn(dataset, subset, wav_seqs, seqs_to_phns, in_samples, out_samples, shift, win_width, shuffle): #import pdb; pdb.set_trace() norm_seqs = utils.standardize(wav_seqs) #norm_seqs = utils.normalize(wav_seqs) frame_len = in_samples + out_samples overlap = frame_len - shift samples = [] seqs_phn_info = [] seqs_phn_shift = [] # CAUTION!: I am using here reduced phone set # we can also try using the full set but we must store phn+1 # because 0 no more refers to 'h#' (no speech) for ind in range(len(norm_seqs)): #import pdb; pdb.set_trace() wav_seq = norm_seqs[ind] phn_seq = seqs_to_phns[ind] phn_start_end = dataset.__dict__[subset+"_phn"][phn_seq[0]:phn_seq[1]] # create a matrix with consecutive windows # phones are padded by h#, because each window will be shifted once # the first phone samples has passed phones = np.append(phn_start_end[:,2].astype('int16'), np.zeros((1,),dtype='int16')) # phones = np.append(phn_start_end[:,2], # np.zeros((1,))) phn_windows = segment_axis(phones, win_width, win_width-1) # array that has endings of each phone phn_ends = phn_start_end[:,1] # extend the last phone till the end, this is not wrong as long as the # last phone is no speech phone (h#) phn_ends[-1] = wav_seq.shape[0]-1 # create a mapping from each sample to phn_window phn_win_shift = np.zeros_like(wav_seq,dtype='int16') phn_win_shift[phn_ends] = 1 phn_win = phn_win_shift.cumsum(dtype='int16') # minor correction! phn_win[-1] = phn_win[-2] # Segment samples into frames samples.append(segment_axis(wav_seq, frame_len, overlap)) # for phones we care only about one value to mark the start of a new window. # the start of a phone window in a frame is when all samples of previous # phone hav passed, so we use 'min' function to choose the current phone # of the frame phn_frames = segment_axis(phn_win, frame_len, overlap).min(axis=1) # replace the window index with the window itself win_frames = phn_windows[phn_frames] seqs_phn_info.append(win_frames) #import pdb; pdb.set_trace() # create a window shift for each frame shift_frames_aux = np.roll(phn_frames,1) shift_frames_aux[0] = 0 shift_frames = phn_frames - shift_frames_aux # to mark the ending of the sequence - countering the first correction! shift_frames[-1] = 1 seqs_phn_shift.append(shift_frames) #import pdb; pdb.set_trace() #import pdb; pdb.set_trace() # stack all data in one matrix, each row is a frame samples_data = np.vstack(samples[:]) phn_data = np.vstack(seqs_phn_info[:]) shift_data = np.hstack(seqs_phn_shift[:]) #convert phone data to one-hot from pylearn2.format.target_format import OneHotFormatter fmt = OneHotFormatter(max_labels=39, dtype='float32') phn_data = fmt.format(phn_data) phn_data = phn_data.reshape(phn_data.shape[0], phn_data.shape[1]*phn_data.shape[2]) full_data = np.hstack([samples_data[:,:in_samples], phn_data, #input samples_data[:,in_samples:], #out1 shift_data.reshape(shift_data.shape[0],1)]) #out2 if shuffle: np.random.seed(123) full_data = np.random.permutation(full_data) data_x = full_data[:,:in_samples+win_width*39] data_y1 = full_data[:,in_samples+win_width*39:-1] data_y2 = full_data[:,-1] print 'Done' print 'There are %d examples in %s set'%(data_x.shape[0],subset) print "--------------" print 'data_x.shape', data_x.shape print 'data_y1.shape', data_y1.shape return utils.shared_dataset(data_x), \ utils.shared_dataset(data_y1),\ utils.shared_dataset(data_y2)
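# A minimal sketch (toy sizes, not from the function above) of what the
# OneHotFormatter call in _build_frames_w_phn produces: formatting an
# (n_frames, win_width) integer matrix with the default mode yields an
# (n_frames, win_width, max_labels) array, which the code reshapes to
# (n_frames, win_width * max_labels); mode='concatenate' gives that flat
# layout directly. win_width=3 and the phone ids below are assumptions.
import numpy as np
from pylearn2.format.target_format import OneHotFormatter

win_width, n_phones = 3, 39
fmt = OneHotFormatter(max_labels=n_phones, dtype='float32')
toy_phn = np.array([[0, 5, 38], [1, 1, 2]])            # (2, win_width) phone ids
dense = fmt.format(toy_phn)                            # (2, win_width, 39)
flat = dense.reshape(dense.shape[0], win_width * n_phones)
assert np.array_equal(flat, fmt.format(toy_phn, mode='concatenate'))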
import numpy from pylearn2_timit.timitlpc import TIMITlpc from pylearn2.space import CompositeSpace, VectorSpace, IndexSpace from pylearn2.format.target_format import OneHotFormatter valid = TIMITlpc("valid", frame_length=160, overlap=159, start=10, stop=11) valid._iter_data_specs = (CompositeSpace((IndexSpace(dim=3,max_labels=61), VectorSpace(dim=10),)), ('phones', 'lpc_features')) formatter = OneHotFormatter(max_labels=62) f = lambda x: formatter.format(numpy.asarray(x, dtype=int), mode='merge') #valid._iter_convert = [f, None] it = valid.iterator(mode='random_uniform', batch_size=100, num_batches=100)
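# A hedged toy example (values assumed, not TIMIT data) of the mode='merge'
# formatting used by the snippet above: each row of index labels collapses
# into a single max_labels-wide vector with ones at the listed positions,
# instead of one one-hot vector per index (cf. the IndexSpace docstring
# further below).
import numpy
from pylearn2.format.target_format import OneHotFormatter

toy_formatter = OneHotFormatter(max_labels=4)
toy_batch = numpy.array([[0, 2], [1, 3]])
print(toy_formatter.format(toy_batch, mode='merge'))
# rows come out as [1 0 1 0] and [0 1 0 1]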
def __init__(self, which_set, onehot_dtype='uint8', center=False, rescale=False, gcn=None, start=None, stop=None, axes=('b', 0, 1, 'c'), toronto_prepro=False, preprocessor=None): """Modified version of the CIFAR10 constructor which creates Y as one-hot vectors rather than simple indexes. This is super hacky. Sorry, Guido..""" # note: there is no such thing as the cifar10 validation set; # pylearn1 defined one but really it should be user-configurable # (as it is here) self.axes = axes # we define here: dtype = 'uint8' ntrain = 50000 nvalid = 0 # artefact, we won't use it ntest = 10000 # we also expose the following details: self.img_shape = (3, 32, 32) self.img_size = numpy.prod(self.img_shape) self.n_classes = 10 self.label_names = [ 'airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck' ] # prepare loading fnames = ['data_batch_%i' % i for i in range(1, 6)] datasets = {} datapath = os.path.join( string_utils.preprocess('${PYLEARN2_DATA_PATH}'), 'cifar10', 'cifar-10-batches-py') for name in fnames + ['test_batch']: fname = os.path.join(datapath, name) if not os.path.exists(fname): raise IOError(fname + " was not found. You probably need to " "download the CIFAR-10 dataset by using the " "download script in " "pylearn2/scripts/datasets/download_cifar10.sh " "or manually from " "http://www.cs.utoronto.ca/~kriz/cifar.html") datasets[name] = cache.datasetCache.cache_file(fname) lenx = numpy.ceil((ntrain + nvalid) / 10000.) * 10000 x = numpy.zeros((lenx, self.img_size), dtype=dtype) y = numpy.zeros((lenx, 1), dtype=dtype) # load train data nloaded = 0 for i, fname in enumerate(fnames): _logger.info('loading file %s' % datasets[fname]) data = serial.load(datasets[fname]) x[i * 10000:(i + 1) * 10000, :] = data['data'] y[i * 10000:(i + 1) * 10000, 0] = data['labels'] nloaded += 10000 if nloaded >= ntrain + nvalid + ntest: break # load test data _logger.info('loading file %s' % datasets['test_batch']) data = serial.load(datasets['test_batch']) # process this data Xs = {'train': x[0:ntrain], 'test': data['data'][0:ntest]} Ys = {'train': y[0:ntrain], 'test': data['labels'][0:ntest]} X = numpy.cast['float32'](Xs[which_set]) y = Ys[which_set] if isinstance(y, list): y = numpy.asarray(y).astype(dtype) if which_set == 'test': assert y.shape[0] == 10000 y = y.reshape((y.shape[0], 1)) formatter = OneHotFormatter(self.n_classes, dtype=onehot_dtype) y = formatter.format(y, mode='concatenate') if center: X -= 127.5 self.center = center if rescale: X /= 127.5 self.rescale = rescale if toronto_prepro: assert not center assert not gcn X = X / 255. if which_set == 'test': other = CIFAR10(which_set='train') oX = other.X oX /= 255. 
X = X - oX.mean(axis=0) else: X = X - X.mean(axis=0) self.toronto_prepro = toronto_prepro self.gcn = gcn if gcn is not None: gcn = float(gcn) X = global_contrast_normalize(X, scale=gcn) if start is not None: # This needs to come after the prepro so that it doesn't # change the pixel means computed above for toronto_prepro assert start >= 0 assert stop > start assert stop <= X.shape[0] X = X[start:stop, :] y = y[start:stop, :] assert X.shape[0] == y.shape[0] if which_set == 'test': assert X.shape[0] == 10000 view_converter = dense_design_matrix.DefaultViewConverter((32, 32, 3), axes) super(CIFAR10, self).__init__( X=X, y=y, view_converter=view_converter, ) #y_labels=self.n_classes) assert not contains_nan(self.X) if preprocessor: preprocessor.apply(self) # Another hack: rename 'targets' to match model expectations space, (X_source, y_source) = self.data_specs self.data_specs = (space, (X_source, 'condition'))
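# A small standalone sketch (toy labels, not the CIFAR-10 files) of the label
# handling above: y is reshaped to (N, 1) and formatted with mode='concatenate',
# so the targets become an (N, n_classes) one-hot matrix whose argmax recovers
# the original class indices.
import numpy
from pylearn2.format.target_format import OneHotFormatter

n_classes = 10
labels = numpy.array([3, 0, 9], dtype='uint8').reshape((-1, 1))
onehot = OneHotFormatter(n_classes, dtype='uint8').format(labels, mode='concatenate')
assert onehot.shape == (3, n_classes)
assert (onehot.argmax(axis=1) == labels.ravel()).all()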
def __init__(self, db, # data source name = '', # optional name selectors = dict(), partitioner = None, meta_sources = [], # optional sources other than 'features' and 'targets' from metadata channel_filter = NoChannelFilter(), # optional channel filter, default: keep all channel_names = None, # optional channel names (for metadata) label_attribute = 'label', # metadata attribute to be used as label label_map = None, # optional conversion of labels use_targets = True, # use targets if provides, otherwise labels are used remove_dc_offset = False, # optional subtraction of channel mean, usually done already earlier resample = None, # optional down-sampling normalize = True, # normalize to max=1 # optional sub-sequences selection start_sample = 0, stop_sample = None, # optional for selection of sub-sequences zero_padding = True, # if True (default) trials that are too short will be padded with # otherwise they will rejected. # optional signal filter to by applied before splitting the signal signal_filter = None, trial_processors = [], # optional processing of the trials target_processor = None, # optional processing of the targets, e.g. zero-padding transformers = [], # optional transformations of the dataset layout='tf', # (0,1)-axes layout tf=time x features or ft=features x time debug=False, ): ''' Constructor ''' # save params self.params = locals().copy() del self.params['self'] # print self.params self.name = name self.debug = debug metadb = DatasetMetaDB(db.metadata, selectors.keys()) if partitioner is not None: pass # FIXME selected_trial_ids = metadb.select(selectors) log.info('selectors: {}'.format(selectors)) log.info('selected trials: {}'.format(selected_trial_ids)) if normalize: log.info('Data will be normalized to max amplitude 1 per channel (normalize=True).') trials = list() labels = list() targets = list() meta = list() if stop_sample == 'auto-min': stop_sample = np.min([db.data[trial_i].shape[-1] for trial_i in selected_trial_ids]) log.info('Using minimum trial length. stop_sample={}'.format(stop_sample)) elif stop_sample == 'auto-max': stop_sample = np.max([db.data[trial_i].shape[-1] for trial_i in selected_trial_ids]) log.info('Using maximum trial length. 
stop_sample={}'.format(stop_sample)) for trial_i in selected_trial_ids: trial_meta = db.metadata[trial_i] if use_targets: if targets is None: target = None else: target = db.targets[trial_i] assert not np.isnan(np.sum(target)) if target_processor is not None: target = target_processor.process(target, trial_meta) assert not np.isnan(np.sum(target)) else: # get and process label label = db.metadata[trial_i][label_attribute] if label_map is not None: label = label_map[label] processed_trial = [] trial = db.data[trial_i] if np.isnan(np.sum(trial)): print trial_i, trial assert not np.isnan(np.sum(trial)) rejected = False # flag for trial rejection trial = np.atleast_2d(trial) # process 1 channel at a time for channel in xrange(trial.shape[0]): # filter channels if not channel_filter.keep_channel(channel): continue samples = trial[channel, :] # subtract channel mean if remove_dc_offset: samples -= samples.mean() # down-sample if requested if resample is not None and resample[0] != resample[1]: samples = librosa.resample(samples, resample[0], resample[1], res_type='sinc_best') # apply optional signal filter after down-sampling -> requires lower order if signal_filter is not None: samples = signal_filter.process(samples) # get sub-sequence in resampled space # log.info('using samples {}..{} of {}'.format(start_sample,stop_sample, samples.shape)) if stop_sample is not None and stop_sample > len(samples): if zero_padding: tmp = np.zeros(stop_sample) tmp[:len(samples)] = samples samples = tmp else: rejected = True break # stop processing this trial s = samples[start_sample:stop_sample] # TODO optional channel processing # normalize to max amplitude 1 if normalize: s = librosa.util.normalize(s) # add 2nd data dimension s = s.reshape(s.shape[0], 1) # print s.shape s = np.asfarray(s, dtype=theano.config.floatX) processed_trial.append(s) ### end of channel iteration ### if rejected: continue # next trial processed_trial = np.asfarray([processed_trial], dtype=theano.config.floatX) # processed_trial = processed_trial.reshape((1, processed_trial.shape)) processed_trial = np.rollaxis(processed_trial, 1, 4) # optional (external) trial processing, e.g. 
windowing # trials will be in b01c format with tf layout for 01-axes for trial_processor in trial_processors: processed_trial = trial_processor.process(processed_trial, trial_meta) trials.append(processed_trial) for k in range(len(processed_trial)): meta.append(trial_meta) if use_targets: targets.append(target) else: labels.append(label) ### end of datafile iteration ### # turn into numpy arrays self.trials = np.vstack(trials) assert not np.isnan(np.sum(self.trials)) # prepare targets / labels if use_targets: self.targets = np.vstack(targets) assert not np.isnan(np.sum(self.targets)) else: labels = np.hstack(labels) if label_map is None: one_hot_formatter = OneHotFormatter(max(labels) + 1) else: one_hot_formatter = OneHotFormatter(max(label_map.values()) + 1) one_hot_y = one_hot_formatter.format(labels) self.targets = one_hot_y self.metadata = meta if layout == 'ft': # swap axes to (batch, feature, time, channels) self.trials = self.trials.swapaxes(1, 2) # transform after finalizing the data structure for transformer in transformers: self.trials, self.targets = transformer.process(self.trials, self.targets) self.trials = np.asarray(self.trials, dtype=theano.config.floatX) log.debug('final dataset shape: {} (b,0,1,c)'.format(self.trials.shape)) # super(EEGEpochsDataset, self).__init__(topo_view=self.trials, y=self.targets, axes=['b', 0, 1, 'c']) self.X = self.trials.reshape(self.trials.shape[0], np.prod(self.trials.shape[1:])) self.y = self.targets log.info('generated dataset "{}" with shape X={}={} y={} targets={} '. format(self.name, self.X.shape, self.trials.shape, self.y.shape, self.targets.shape)) # determine data specs features_space = Conv2DSpace( shape=[self.trials.shape[1], self.trials.shape[2]], num_channels=self.trials.shape[3] ) features_source = 'features' targets_space = VectorSpace(dim=self.targets.shape[-1]) targets_source = 'targets' space_components = [features_space, targets_space] source_components = [features_source, targets_source] # additional support for meta information self.meta_maps = dict() for meta_source in meta_sources: self.meta_maps[meta_source] = sorted(list(set([m[meta_source] for m in self.metadata]))) space_components.extend([VectorSpace(dim=1)]) source_components.extend([meta_source]) log.info('Generated meta-source "{}" with value map: {}' .format(meta_source, self.meta_maps[meta_source])) space = CompositeSpace(space_components) source = tuple(source_components) self.data_specs = (space, source) log.debug('data specs: {}'.format(self.data_specs))
def __init__(self, path, name = '', # optional name # selectors subjects='all', # optional selector (list) or 'all' trial_types='all', # optional selector (list) or 'all' trial_numbers='all', # optional selector (list) or 'all' conditions='all', # optional selector (list) or 'all' partitioner = None, channel_filter = NoChannelFilter(), # optional channel filter, default: keep all channel_names = None, # optional channel names (for metadata) label_map = None, # optional conversion of labels remove_dc_offset = False, # optional subtraction of channel mean, usually done already earlier resample = None, # optional down-sampling # optional sub-sequences selection start_sample = 0, stop_sample = None, # optional for selection of sub-sequences # optional signal filter to by applied before spitting the signal signal_filter = None, # windowing parameters frame_size = -1, hop_size = -1, # values > 0 will lead to windowing hop_fraction = None, # alternative to specifying absolute hop_size # optional spectrum parameters, n_fft = 0 keeps raw data n_fft = 0, n_freq_bins = None, spectrum_log_amplitude = False, spectrum_normalization_mode = None, include_phase = False, flatten_channels=False, layout='tf', # (0,1)-axes layout tf=time x features or ft=features x time save_matrix_path = None, keep_metadata = False, ): ''' Constructor ''' # save params self.params = locals().copy() del self.params['self'] # print self.params # TODO: get the whole filtering into an extra class datafiles_metadata, metadb = load_datafiles_metadata(path) # print datafiles_metadata def apply_filters(filters, node): if isinstance(node, dict): filtered = [] keepkeys = filters[0] for key, value in node.items(): if keepkeys == 'all' or key in keepkeys: filtered.extend(apply_filters(filters[1:], value)) return filtered else: return node # [node] # keep only files that match the metadata filters self.datafiles = apply_filters([subjects,trial_types,trial_numbers,conditions], datafiles_metadata) # copy metadata for retained files self.metadb = {} for datafile in self.datafiles: self.metadb[datafile] = metadb[datafile] # print self.datafiles # print self.metadb self.name = name if partitioner is not None: self.datafiles = partitioner.get_partition(self.name, self.metadb) self.include_phase = include_phase self.spectrum_normalization_mode = spectrum_normalization_mode self.spectrum_log_amplitude = spectrum_log_amplitude self.sequence_partitions = [] # used to keep track of original sequences # metadata: [subject, trial_no, stimulus, channel, start, ] self.metadata = [] sequences = [] labels = [] n_sequences = 0 if frame_size > 0 and hop_size == -1 and hop_fraction is not None: hop_size = np.ceil(frame_size / hop_fraction) for i in xrange(len(self.datafiles)): with log_timing(log, 'loading data from {}'.format(self.datafiles[i])): # save start of next sequence self.sequence_partitions.append(n_sequences) data, metadata = load(os.path.join(path, self.datafiles[i])) label = metadata['label'] if label_map is not None: label = label_map[label] multi_channel_frames = [] # process 1 channel at a time for channel in xrange(data.shape[1]): # filter channels if not channel_filter.keep_channel(channel): continue samples = data[:, channel] # subtract channel mean if remove_dc_offset: samples -= samples.mean() # down-sample if requested if resample is not None and resample[0] != resample[1]: samples = librosa.resample(samples, resample[0], resample[1]) # apply optional signal filter after down-sampling -> requires lower order if signal_filter is not 
None: samples = signal_filter.process(samples) # get sub-sequence in resampled space # log.info('using samples {}..{} of {}'.format(start_sample,stop_sample, samples.shape)) samples = samples[start_sample:stop_sample] if n_fft is not None and n_fft > 0: # Optionally: ### frequency spectrum branch ### # transform to spectogram hop_length = n_fft / 4; ''' from http://theremin.ucsd.edu/~bmcfee/librosadoc/librosa.html >>> # Get a power spectrogram from a waveform y >>> S = np.abs(librosa.stft(y)) ** 2 >>> log_S = librosa.logamplitude(S) ''' S = librosa.core.stft(samples, n_fft=n_fft, hop_length=hop_length) # mag = np.abs(S) # magnitude spectrum mag = np.abs(S)**2 # power spectrum # include phase information if requested if self.include_phase: # phase = np.unwrap(np.angle(S)) phase = np.angle(S) # Optionally: cut off high bands if n_freq_bins is not None: mag = mag[0:n_freq_bins, :] if self.include_phase: phase = phase[0:n_freq_bins, :] if self.spectrum_log_amplitude: mag = librosa.logamplitude(mag) s = mag # for normalization ''' NOTE on normalization: It depends on the structure of a neural network and (even more) on the properties of data. There is no best normalization algorithm because if there would be one, it would be used everywhere by default... In theory, there is no requirement for the data to be normalized at all. This is a purely practical thing because in practice convergence could take forever if your input is spread out too much. The simplest would be to just normalize it by scaling your data to (-1,1) (or (0,1) depending on activation function), and in most cases it does work. If your algorithm converges well, then this is your answer. If not, there are too many possible problems and methods to outline here without knowing the actual data. ''' ## normalize to mean 0, std 1 if self.spectrum_normalization_mode == 'mean0_std1': # s = preprocessing.scale(s, axis=0); mean = np.mean(s) std = np.std(s) s = (s - mean) / std ## normalize by linear transform to [0,1] elif self.spectrum_normalization_mode == 'linear_0_1': s = s / np.max(s) ## normalize by linear transform to [-1,1] elif self.spectrum_normalization_mode == 'linear_-1_1': s = -1 + 2 * (s - np.min(s)) / (np.max(s) - np.min(s)) elif self.spectrum_normalization_mode is not None: raise ValueError( 'unsupported spectrum normalization mode {}'.format( self.spectrum_normalization_mode) ) #print s.mean(axis=0) #print s.std(axis=0) # include phase information if requested if self.include_phase: # normalize phase to [-1.1] phase = phase / np.pi s = np.vstack([s, phase]) # transpose to fit pylearn2 layout s = np.transpose(s) # print s.shape ### end of frequency spectrum branch ### else: ### raw waveform branch ### # normalize to max amplitude 1 s = librosa.util.normalize(samples) # add 2nd data dimension s = s.reshape(s.shape[0], 1) # print s.shape ### end of raw waveform branch ### s = np.asfarray(s, dtype='float32') if frame_size > 0 and hop_size > 0: s = s.copy() # FIXME: THIS IS NECESSARY IN MultiChannelEEGSequencesDataset - OTHERWISE, THE FOLLOWING OP DOES NOT WORK!!!! 
frames = frame(s, frame_length=frame_size, hop_length=hop_size) else: frames = s del s # print frames.shape if flatten_channels: # add artificial channel dimension frames = frames.reshape((frames.shape[0], frames.shape[1], frames.shape[2], 1)) # print frames.shape sequences.append(frames) # increment counter by new number of frames n_sequences += frames.shape[0] if keep_metadata: # determine channel name channel_name = None if channel_names is not None: channel_name = channel_names[channel] elif 'channels' in metadata: channel_name = metadata['channels'][channel] self.metadata.append({ 'subject' : metadata['subject'], # subject 'trial_type': metadata['trial_type'], # trial_type 'trial_no' : metadata['trial_no'], # trial_no 'condition' : metadata['condition'], # condition 'channel' : channel, # channel 'channel_name' : channel_name, 'start' : self.sequence_partitions[-1], # start 'stop' : n_sequences # stop }) for _ in xrange(frames.shape[0]): labels.append(label) else: multi_channel_frames.append(frames) ### end of channel iteration ### if not flatten_channels: # turn list into array multi_channel_frames = np.asfarray(multi_channel_frames, dtype='float32') # [channels x frames x time x freq] -> cb01 # [channels x frames x time x 1] -> cb0. # move channel dimension to end multi_channel_frames = np.rollaxis(multi_channel_frames, 0, 4) # print multi_channel_frames.shape # log.debug(multi_channel_frames.shape) sequences.append(multi_channel_frames) # increment counter by new number of frames n_sequences += multi_channel_frames.shape[0] if keep_metadata: self.metadata.append({ 'subject' : metadata['subject'], # subject 'trial_type': metadata['trial_type'], # trial_type 'trial_no' : metadata['trial_no'], # trial_no 'condition' : metadata['condition'], # condition 'channel' : 'all', # channel 'start' : self.sequence_partitions[-1], # start 'stop' : n_sequences # stop }) for _ in xrange(multi_channel_frames.shape[0]): labels.append(label) ### end of datafile iteration ### # turn into numpy arrays sequences = np.vstack(sequences) # print sequences.shape; labels = np.hstack(labels) # one_hot_y = one_hot(labels) one_hot_formatter = OneHotFormatter(labels.max() + 1) # FIXME! one_hot_y = one_hot_formatter.format(labels) self.labels = labels if layout == 'ft': # swap axes to (batch, feature, time, channels) sequences = sequences.swapaxes(1, 2) log.debug('final dataset shape: {} (b,0,1,c)'.format(sequences.shape)) super(MultiChannelEEGDataset, self).__init__(topo_view=sequences, y=one_hot_y, axes=['b', 0, 1, 'c']) log.info('generated dataset "{}" with shape X={}={} y={} labels={} '. format(self.name, self.X.shape, sequences.shape, self.y.shape, self.labels.shape)) if save_matrix_path is not None: matrix = DenseDesignMatrix(topo_view=sequences, y=one_hot_y, axes=['b', 0, 1, 'c']) with log_timing(log, 'saving DenseDesignMatrix to {}'.format(save_matrix_path)): serial.save(save_matrix_path, matrix)
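# A hedged note on the "FIXME!" above: sizing the formatter with
# labels.max() + 1 ties the one-hot width to whichever labels happen to occur
# in the loaded subset, so two partitions of the same corpus can end up with
# different target dimensions. A minimal sketch of the safer variant, assuming
# the total number of classes (n_classes below) is known in advance:
import numpy as np
from pylearn2.format.target_format import OneHotFormatter

n_classes = 4                              # assumed total number of stimulus labels
labels = np.array([0, 2, 2, 1])            # toy labels; note that class 3 is absent
one_hot_y = OneHotFormatter(max_labels=n_classes).format(labels)
assert one_hot_y.shape[-1] == n_classes    # width no longer depends on labels.max()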
class IndexSpace(Space): """ A space representing indices, for example MNIST labels (0-10) or the indices of words in a dictionary for NLP tasks. A single space can contain multiple indices, for example the word indices of an n-gram. IndexSpaces can be converted to VectorSpaces in two ways: Either the labels are converted into one-hot vectors which are then concatenated, or they are converted into a single vector where 1s indicate labels present i.e. for 4 possible labels we have [0, 2] -> [1 0 1 0] or [0, 2] -> [1 0 0 0 0 0 1 0]. """ def __init__(self, max_labels, dim, **kwargs): """ Initialize an IndexSpace. Parameters ---------- max_labels : int The number of possible classes/labels. This means that all labels should be < max_labels. Example: For MNIST there are 10 numbers and hence max_labels = 10. dim : int The number of indices in one space e.g. for MNIST there is one target label and hence dim = 1. If we have an n-gram of word indices as input to a neurel net language model, dim = n. kwargs: passes on to superclass constructor """ super(IndexSpace, self).__init__(**kwargs) self.max_labels = max_labels self.dim = dim self.formatter = OneHotFormatter(self.max_labels) def __str__(self): """ Return a string representation. """ return '%(classname)s(dim=%(dim)s, max_labels=%(max_labels)s' % \ dict(classname=self.__class__.__name__, dim=self.dim, max_labels=self.max_labels) @functools.wraps(Space.get_total_dimension) def get_total_dimension(self): return self.dim @functools.wraps(Space.np_format_as) def np_format_as(self, batch, space): if isinstance(space, VectorSpace): if self.max_labels == space.dim: rval = self.formatter.format(batch, sparse=space.sparse, mode='merge') elif self.dim * self.max_labels == space.dim: rval = self.formatter.format(batch, sparse=space.sparse, mode='concatenate') else: raise ValueError("Can't convert IndexSpace to" "VectorSpace (%d labels to %d dimensions)" % (self.dim, space.dim)) return rval else: raise ValueError("Can't convert IndexSpace to %(space)s" % (space.__class__.__name__)) @functools.wraps(Space._format_as) def _format_as(self, batch, space): """ Supports formatting to a VectorSpace where indices are represented by ones in a binary vector. """ if isinstance(space, VectorSpace): if self.max_labels == space.dim: rval = self.formatter.theano_expr(batch, sparse=space.sparse, mode='merge') elif self.dim * self.max_labels == space.dim: rval = self.formatter.theano_expr(batch, sparse=space.sparse, mode='concatenate') else: raise ValueError("Can't convert IndexSpace to" "VectorSpace (%d labels to %d dimensions)" % (self.dim, space.dim)) return rval else: raise ValueError("Can't convert IndexSpace to %(space)s" % (space.__class__.__name__)) @functools.wraps(Space.make_theano_batch) def make_theano_batch(self, name=None, dtype=None, batch_size=None): if batch_size == 1: rval = T.lrow(name=name) else: rval = T.lmatrix(name=name) return rval @functools.wraps(Space.batch_size) def batch_size(self, batch): self.validate(batch) return batch.shape[0] @functools.wraps(Space.np_batch_size) def np_batch_size(self, batch): self.np_validate(batch) return batch.shape[0] @functools.wraps(Space._validate) def _validate(self, batch): """ .. 
todo:: WRITEME """ if not isinstance(batch, theano.gof.Variable): raise TypeError("IndexSpace batch should be a theano Variable, " "got " + str(type(batch))) if not isinstance(batch.type, (theano.tensor.TensorType, CudaNdarrayType)): raise TypeError("IndexSpace batch should be TensorType or " "CudaNdarrayType, got " + str(batch.type)) if batch.ndim != 2: raise ValueError('IndexSpace batches must be 2D, got %d ' 'dimensions' % batch.ndim) for val in get_debug_values(batch): self.np_validate(val) @functools.wraps(Space._np_validate) def _np_validate(self, batch): # Use the 'CudaNdarray' string to avoid importing theano.sandbox.cuda # when it is not available if (not isinstance(batch, np.ndarray) and str(type(batch)) != "<type 'CudaNdarray'>"): raise TypeError("The value of an IndexSpace batch should be a " "numpy.ndarray, or CudaNdarray, but is %s." % str(type(batch))) if batch.ndim != 2: raise ValueError("The value of an IndexSpace batch must be " "2D, got %d dimensions for %s." % (batch.ndim, batch)) if batch.shape[1] != self.dim: raise ValueError("The width of an IndexSpace batch must match " "the space's dimension, but batch has shape " "%s and dim = %d." % (str(batch.shape), self.dim))
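# A hedged usage sketch for the class above (toy numbers): converting an index
# batch into the two VectorSpace layouts described in the docstring, with the
# one-hot vectors either merged into a single vector or concatenated side by side.
import numpy as np
from pylearn2.space import IndexSpace, VectorSpace

index_space = IndexSpace(max_labels=4, dim=2)
batch = np.array([[0, 2]])                                    # one example, two indices
merged = index_space.np_format_as(batch, VectorSpace(dim=4))  # [[1 0 1 0]]
concat = index_space.np_format_as(batch, VectorSpace(dim=8))  # [[1 0 0 0 0 0 1 0]]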
def load_data(self): # Get the directory of the patient data patient_dir = os.path.join(self.data_dir, self.patient_id) # Load metadata about dataset form MAT file metadata_fname = os.path.join(patient_dir, 'trainset_' + str(self.preictal_sec) + '.mat') metadata_mat = loadmat(metadata_fname) # Get number of seizures self.n_seizures = metadata_mat.get('ictals').size # Get detail of the segment self.sampling_rate = metadata_mat['sampling_rate'][0][0] self.segment_sec = metadata_mat['segment_sec'][0][0] self.segment_samples = self.sampling_rate * self.segment_sec # Get the number blocks to extend from the withheld seizure self.n_extended_blocks_test = metadata_mat['n_extended_blocks_test'][0][0] self.preictal_samples = 0 self.nonictal_samples = 0 self.nan_non_flat_samples = 0 # Examples of indexing through MAT file # mat['nonictals'][i][0]['filename'][0][0][0][j][0] # mat['nonictals'][i][0]['idx'][0][0][0][j][0] # mat['nonictals'][i][0]['n_segments'][0][0][0][0] # Load shuffle data if self.which_set == 'train' or self.which_set == 'valid_train': if self.which_set == 'train': select_idx = np.setdiff1d(range(metadata_mat['preictals'].size), np.asarray([self.leave_out_seizure_idx_valid, self.leave_out_seizure_idx_test])) else: select_idx = np.asarray([self.leave_out_seizure_idx_valid]) X = None y = None if self.use_all_nonictals: temp_preictal_X = None for i in select_idx: print '====== Seizure', i, '======' # Pre-ictal temp_X = self.load_feature(part='preictals', list_features=self.list_features, seizure_idx=i, metadata_mat=metadata_mat, patient_dir=patient_dir) if not (temp_preictal_X is None): temp_preictal_X = np.concatenate((temp_preictal_X, temp_X), axis=1) else: temp_preictal_X = temp_X self.preictal_samples = temp_preictal_X.shape[1] # Non-ictal data temp_nonictal_X = self.load_feature(part='nonictals_all', list_features=self.list_features, seizure_idx=self.leave_out_seizure_idx_test, metadata_mat=metadata_mat, patient_dir=patient_dir) X = np.concatenate((temp_preictal_X, temp_nonictal_X), axis=1) y = np.zeros(X.shape[1], dtype=int) y[range(self.preictal_samples)] = 1 self.nonictal_samples = temp_nonictal_X.shape[1] print 'Preictal samples: {0}, Nonictal samples: {1}'.format(self.preictal_samples, self.nonictal_samples) if not np.all(np.arange(self.preictal_samples) == np.where(y)[0]): raise Exception('There is a mismatch between the number of preictal data and labels.') else: for i in select_idx: print '====== Seizure', i, '======' # Non-ictal data temp_nonictal_X = self.load_feature(part='nonictals', list_features=self.list_features, seizure_idx=i, metadata_mat=metadata_mat, patient_dir=patient_dir) # Pre-ictal temp_preictal_X = self.load_feature(part='preictals', list_features=self.list_features, seizure_idx=i, metadata_mat=metadata_mat, patient_dir=patient_dir) # Concatenate preictal and nonictal data temp_X = np.concatenate((temp_preictal_X, temp_nonictal_X), axis=1) temp_y = np.zeros(temp_X.shape[1], dtype=int) temp_y[range(temp_preictal_X.shape[1])] = 1 # Sanity check # if not (temp_preictal_X.shape[1] == temp_nonictal_X.shape[1]): # raise Exception('Unbalanced classes.') print 'Preictal samples: {0}, Nonictal samples: {1}'.format(temp_preictal_X.shape[1], temp_nonictal_X.shape[1]) if not np.all(np.arange(temp_preictal_X.shape[1]) == np.where(temp_y)[0]): raise Exception('There is a mismatch between the number of preictal data and labels.') self.preictal_samples = self.preictal_samples + temp_preictal_X.shape[1] self.nonictal_samples = self.nonictal_samples + 
temp_nonictal_X.shape[1] if not (X is None) and not (y is None): X = np.concatenate((X, temp_X), axis=1) y = np.append(y, temp_y) else: X = temp_X y = temp_y # Load continuous data elif self.which_set == 'valid' or self.which_set == 'test': if self.which_set == 'valid': select_idx = self.leave_out_seizure_idx_valid else: select_idx = self.leave_out_seizure_idx_test print '====== Seizure', select_idx, '======' # Get metadata of all blocks block_df = pd.read_table(os.path.join(patient_dir, 'block_metadata.txt'), sep='\t') # Get block index of the selected seizure select_sz_fname = metadata_mat['preictals'][select_idx][0]['filename'][0][0][0][0][0] block_idx = np.where(block_df.filename == select_sz_fname)[0][0] start_block_idx = block_idx - self.n_extended_blocks_test end_block_idx = block_idx + self.n_extended_blocks_test + 1 if start_block_idx < 0: start_block_idx = 0 if end_block_idx > block_df.shape[0]: end_block_idx = block_df.shape[0] select_block_idx = np.arange(start_block_idx, end_block_idx) filenames = block_df.filename[select_block_idx].values X = None y = None y_label_all = None ictal_labels = None for b_idx, fname in enumerate(filenames): # Name of the MAT file that stores indices of flat (i.e., false) segments fname_flat = fname.replace('.data', '_flat_signal_segment_idx.mat') # Get all good indices (i.e., remove segments of flat signals) flat_mat = loadmat(os.path.join(patient_dir, fname_flat)) flat_idx = np.empty(0, dtype=int) for j in range(flat_mat['flat_signal_segment_idx'].shape[0]): flat_idx = np.append(flat_idx, np.squeeze(flat_mat['flat_signal_segment_idx'][j][0])) flat_idx = flat_idx - 1 # Change from MATLAB to python index system n_segments = np.ceil(block_df.samples[select_block_idx[b_idx]] / (self.segment_samples * 1.0)) all_idx = np.arange(n_segments, dtype=int) good_idx = np.setdiff1d(all_idx, flat_idx) print 'Load', self.which_set, 'data from', fname if good_idx.size > 0: # Features with shape [n_features, n_samples] temp_X = self.load_list_feature(list_features=self.list_features, sample_idx=good_idx, fname=fname, patient_dir=patient_dir) # If this record contains preictal data in the withheld seizures, get preictal labels temp_y_withheld = self.get_labels(label_type='preictals', filename=fname, good_idx=good_idx, metadata_mat=metadata_mat, n_all_segments=n_segments, n_data_segments=temp_X.shape[1], select_meta_idx=select_idx) # If this record contains preictal data in the selected seizures, get preictal labels temp_y_select = self.get_labels(label_type='preictals', filename=fname, good_idx=good_idx, metadata_mat=metadata_mat, n_all_segments=n_segments, n_data_segments=temp_X.shape[1]) # If this record contains preictal data in all seizures, get preictal labels temp_y_rm = self.get_labels(label_type='all_preictals', filename=fname, good_idx=good_idx, metadata_mat=metadata_mat, n_all_segments=n_segments, n_data_segments=temp_X.shape[1]) tmp_preictal_withheld_idx = np.where(temp_y_withheld == 1)[0] tmp_preictal_select_idx = np.where(temp_y_select == 1)[0] tmp_preictal_rm_idx = np.where(temp_y_rm == 1)[0] tmp_preictal_select_idx = np.setdiff1d(tmp_preictal_select_idx, tmp_preictal_withheld_idx) tmp_preictal_rm_idx = np.setdiff1d(tmp_preictal_rm_idx, tmp_preictal_withheld_idx) tmp_preictal_rm_idx = np.setdiff1d(tmp_preictal_rm_idx, tmp_preictal_select_idx) self.preictal_samples = self.preictal_samples + np.where(temp_y_withheld == 1)[0].size self.nonictal_samples = self.nonictal_samples + np.where(temp_y_withheld == 0)[0].size if tmp_preictal_withheld_idx.size > 0: 
                        print ' Load preictal data from the withheld seizure from this file.'
                        print ' Size:', tmp_preictal_withheld_idx.size, tmp_preictal_withheld_idx
                    if tmp_preictal_select_idx.size > 0:
                        print ' Load preictal data from selected seizures in addition to the withheld seizure from this file.'
                        print ' Size:', tmp_preictal_select_idx.size, tmp_preictal_select_idx
                    if tmp_preictal_rm_idx.size > 0:
                        print ' Load preictal data from removed seizures in addition to the withheld seizure from this file.'
                        print ' Size:', tmp_preictal_rm_idx.size, tmp_preictal_rm_idx

                    # Sanity check: the three groups of preictal labels must not overlap
                    if np.intersect1d(tmp_preictal_withheld_idx, tmp_preictal_select_idx).size > 0:
                        raise Exception('There is an overlap of the labels between the withheld seizures and the selected seizures.')
                    if np.intersect1d(tmp_preictal_select_idx, tmp_preictal_rm_idx).size > 0:
                        raise Exception('There is an overlap of the labels between the selected seizures and the removed seizures.')
                    if np.intersect1d(tmp_preictal_withheld_idx, tmp_preictal_rm_idx).size > 0:
                        raise Exception('There is an overlap of the labels between the withheld seizures and the removed seizures.')

                    temp_y_all = np.zeros(temp_X.shape[1], dtype=int)
                    temp_y_all[tmp_preictal_withheld_idx] = 1   # Labels for the withheld seizure
                    temp_y_all[tmp_preictal_select_idx] = 2     # Labels for the selected seizures (not from the withheld seizures)
                    temp_y_all[tmp_preictal_rm_idx] = 3         # Labels for the removed seizures (not from the withheld seizures)

                    # If this record contains ictal data, get ictal labels
                    temp_ictal_labels = self.get_labels(label_type='all_ictals',
                                                        filename=fname,
                                                        good_idx=good_idx,
                                                        metadata_mat=metadata_mat,
                                                        n_all_segments=n_segments,
                                                        n_data_segments=temp_X.shape[1])
                    tmp_ictal_idx = np.where(temp_ictal_labels == 1)[0]
                    if tmp_ictal_idx.size > 0:
                        print ' Ictal label:', tmp_ictal_idx.size, tmp_ictal_idx

                    # Deal with NaN features that remain after filtering out flat segments.
                    # These NaNs are caused by noise in the data, not by flat segments.
                    nan_sample_idx = np.where(np.isnan(np.sum(temp_X, 0)))[0]
                    nan_feature_idx = np.where(np.isnan(np.sum(temp_X, 1)))[0]
                    if nan_sample_idx.size > 0 or nan_feature_idx.size > 0:
                        print self.which_set, 'contains NaN at:'
                        print ' sample_idx:', good_idx[nan_sample_idx], ' feature_idx:', nan_feature_idx
                        print ' shape before removing NaN:', temp_X.shape

                        tmp_preictal_idx = np.where(temp_y_withheld == 1)[0]
                        tmp_nonictal_idx = np.where(temp_y_withheld == 0)[0]
                        nan_preictal_sample_idx = np.intersect1d(tmp_preictal_idx, nan_sample_idx)
                        nan_nonictal_sample_idx = np.intersect1d(tmp_nonictal_idx, nan_sample_idx)
                        if nan_preictal_sample_idx.size > 0:
                            print ' NaN are in preictal index:', good_idx[nan_preictal_sample_idx]
                        if nan_nonictal_sample_idx.size > 0:
                            print ' NaN are in nonictal index:', good_idx[nan_nonictal_sample_idx]

                        # Remove the NaN samples from the data and from all label arrays
                        all_idx = np.arange(temp_X.shape[1])
                        good_idx_1 = np.setdiff1d(all_idx, nan_sample_idx)
                        temp_X = temp_X[:, good_idx_1]
                        temp_y_all = temp_y_all[good_idx_1]
                        temp_y_withheld = temp_y_withheld[good_idx_1]
                        temp_ictal_labels = temp_ictal_labels[good_idx_1]
                        print ' shape after removing NaN:', temp_X.shape

                        self.nan_non_flat_samples = self.nan_non_flat_samples + nan_sample_idx.size

                        # Sanity check
                        tmp_nan_sample_idx = np.where(np.isnan(np.sum(temp_X, 0)))[0]
                        if tmp_nan_sample_idx.size > 0:
                            raise Exception('There is an error in removing NaN.')

                    if not (temp_X.shape[1] == temp_y_all.size):
                        raise Exception('Numbers of feature samples and labels [temp_y_all] are not equal.')
                    if not (temp_X.shape[1] == temp_y_withheld.size):
                        raise Exception('Numbers of feature samples and labels [temp_y_withheld] are not equal.')
                    if not (temp_X.shape[1] == temp_ictal_labels.size):
                        raise Exception('Numbers of feature samples and labels [ictal_labels] are not equal.')

                    if not (X is None) and not (y is None) and not (ictal_labels is None):
                        X = np.concatenate((X, temp_X), axis=1)
                        y = np.append(y, temp_y_withheld)
                        y_label_all = np.append(y_label_all, temp_y_all)
                        ictal_labels = np.append(ictal_labels, temp_ictal_labels)
                    else:
                        X = temp_X
                        y = temp_y_withheld
                        y_label_all = temp_y_all
                        ictal_labels = temp_ictal_labels
                else:
                    print 'There is no good segment during this seizure.'

            # Store preictal labels that come from the withheld seizure index (used to compute accuracy),
            # the selected seizure index, and the removed seizure index.
            # Note: this property will exist when which_set=='valid' or which_set=='test'
            #       as there is no need for ictal data to be imported.
            self.y_label_all = y_label_all

            # Sanity check
            if np.where(y == 1)[0].size > np.where(y_label_all > 0)[0].size:
                raise Exception('There is an error in collecting preictal labels only from the leave-out-seizure index.')
            if np.where(y == 1)[0].size == np.where(y_label_all == 1)[0].size:
                print 'There is only one preictal period, and it is from the leave-out-seizure index.'
                if not np.all(np.where(y == 1)[0] == np.where(y_label_all == 1)[0]):
                    raise Exception('There is a mismatch between y and y_label_all.')
            if np.where(y == 1)[0].size < np.where(y_label_all > 0)[0].size:
                print 'There is more than one preictal period.'
                if not np.all(np.where(y == 1)[0] == np.where(y_label_all == 1)[0]):
                    raise Exception('There is a mismatch between y_select_idx and y in the preictal labels of the leave-out-seizure index.')

            # Store ictal labels
            # Note: this property will exist when which_set=='valid' or which_set=='test'
            #       as there is no need for ictal data to be imported.
            self.ictal_labels = ictal_labels
        else:
            raise Exception('Invalid dataset selection')

        print 'There are {0} samples that have been removed due to NaN, in addition to the flat-signal segments.'.format(self.nan_non_flat_samples)

        X = np.transpose(X, [1, 0])
        one_hot_formatter = OneHotFormatter(max_labels=2)
        y = one_hot_formatter.format(y)

        # Sanity check
        # Note: we ignore nan_non_flat_samples when loading shuffled data, as the labels are specified after
        #       the NaNs have been removed. In contrast, when loading continuous data the labels are specified
        #       before removing NaNs, so the NaN samples have to be subtracted for this check.
        if self.which_set == 'train' or self.which_set == 'valid_train':
            if not (X.shape[0] == self.preictal_samples + self.nonictal_samples):
                raise Exception('There is a mismatch in the number of training samples ({0} != {1}).'.format(
                    X.shape[0], self.preictal_samples + self.nonictal_samples))
            if not (np.where(np.argmax(y, axis=1) == 1)[0].size == self.preictal_samples):
                raise Exception('There is a mismatch in the number of preictal samples and its labels ({0} != {1}).'.format(
                    np.where(np.argmax(y, axis=1) == 1)[0].size, self.preictal_samples))
            if not (X.shape[0] == y.shape[0]):
                raise Exception('There is a mismatch in the number of training samples and its labels ({0} != {1}).'.format(
                    X.shape[0], y.shape[0]))
        elif self.which_set == 'valid' or self.which_set == 'test':
            if not (X.shape[0] == self.preictal_samples + self.nonictal_samples - self.nan_non_flat_samples):
                raise Exception('There is a mismatch in the number of samples ({0} != {1}).'.format(
                    X.shape[0], self.preictal_samples + self.nonictal_samples - self.nan_non_flat_samples))
            if not ((np.where(np.argmax(y, axis=1) == 1)[0].size +
                     np.where(np.argmax(y, axis=1) == 0)[0].size) ==
                    self.preictal_samples + self.nonictal_samples - self.nan_non_flat_samples):
                raise Exception('There is a mismatch in the number of samples and its labels ({0} != {1}).'.format(
                    np.where(np.argmax(y, axis=1) == 1)[0].size + np.where(np.argmax(y, axis=1) == 0)[0].size,
                    self.preictal_samples + self.nonictal_samples - self.nan_non_flat_samples))
            if not (X.shape[0] == y.shape[0]):
                raise Exception('There is a mismatch in the number of samples and its labels ({0} != {1}).'.format(
                    X.shape[0], y.shape[0]))

        return X, y
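# --- Hedged sketch (not part of the original loader): label round-trip used by the sanity checks above.
# A minimal, self-contained illustration assuming pylearn2's OneHotFormatter; the toy label values are
# made up. It shows why counting np.argmax(y, axis=1) == 1 is equivalent to counting the original
# preictal labels after one-hot formatting with max_labels=2.
import numpy as np
from pylearn2.format.target_format import OneHotFormatter

toy_labels = np.array([0, 1, 1, 0, 1])                      # 1 = preictal, 0 = nonictal (toy values)
toy_y = OneHotFormatter(max_labels=2).format(toy_labels)    # shape (5, 2), one-hot rows
assert np.all(np.argmax(toy_y, axis=1) == toy_labels)       # argmax inverts the one-hot encoding
assert np.where(np.argmax(toy_y, axis=1) == 1)[0].size == np.where(toy_labels == 1)[0].size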
    def __init__(self,
                 path, suffix='',                   # required data file parameters
                 subjects='all',                    # optional selector (list) or 'all'
                 start_sample=0, stop_sample=None,  # optional selection of sub-sequences
                 frame_size=-1, hop_size=-1,        # values > 0 will lead to windowing
                 label_mode='tempo',
                 name='',                           # optional name
                 n_fft=0,
                 n_freq_bins=None,
                 save_matrix_path=None,
                 channels=None,
                 resample=None,
                 stimulus_id_filter=None,
                 keep_metadata=False,
                 spectrum_log_amplitude=False,
                 spectrum_normalization_mode=None,
                 include_phase=False,
                 layout='tf'                        # 2D axes layout: tf = time x features, ft = features x time
                 ):
        '''
        Constructor
        '''

        # save params
        self.params = locals().copy()
        del self.params['self']
        # print self.params

        self.name = name

        self.include_phase = include_phase
        self.spectrum_normalization_mode = spectrum_normalization_mode
        self.spectrum_log_amplitude = spectrum_log_amplitude

        self.datafiles = []
        subject_paths = glob.glob(os.path.join(path, 'Sub*'))
        for path in subject_paths:
            dataset_filename = os.path.join(path, 'dataset' + suffix + '.pklz')
            if os.path.isfile(dataset_filename):
                log.debug('adding {}'.format(dataset_filename))
                self.datafiles.append(dataset_filename)
            else:
                log.warn('file does not exist: {}'.format(dataset_filename))
        self.datafiles.sort()

        if subjects == 'all':
            subjects = np.arange(0, len(self.datafiles))
        assert subjects is not None and len(subjects) > 0

        self.label_mode = label_mode
        self.label_converter = LabelConverter()

        if stimulus_id_filter is None:
            stimulus_id_filter = []
        self.stimulus_id_filter = stimulus_id_filter

        self.subject_partitions = []    # used to keep track of original subjects
        self.sequence_partitions = []   # used to keep track of original sequences
        self.trial_partitions = []      # keeps track of original trials

        # metadata: [subject, trial_no, stimulus, channel, start, stop]
        self.metadata = []

        sequences = []
        labels = []
        n_sequences = 0
        last_raw_label = -1
        for i in xrange(len(self.datafiles)):
            if i in subjects:
                with log_timing(log, 'loading data from {}'.format(self.datafiles[i])):
                    self.subject_partitions.append(n_sequences)     # save start of next subject
                    subject_sequences, subject_labels, channel_meta = load(self.datafiles[i])
                    subject_trial_no = -1

                    for j in xrange(len(subject_sequences)):
                        l = subject_labels[j]                       # get raw label
                        if l in stimulus_id_filter:
                            # log.debug('skipping stimulus {}'.format(l))
                            continue

                        c = channel_meta[j][0]
                        if channels is not None and c not in channels:  # apply optional channel filter
                            log.debug('skipping channel {}'.format(c))
                            continue

                        self.sequence_partitions.append(n_sequences)    # save start of next sequence

                        if l != last_raw_label:                         # if raw label changed...
                            self.trial_partitions.append(n_sequences)   # ...save start of next trial
                            subject_trial_no += 1                       # ...and increment subject_trial_no counter
                            last_raw_label = l

                        l = self.label_converter.get_label(l[0], self.label_mode)  # convert to label_mode view

                        s = subject_sequences[j]
                        s = s[start_sample:stop_sample]                 # get sub-sequence in original space

                        # down-sample if requested
                        if resample is not None and resample[0] != resample[1]:
                            s = librosa.resample(s, resample[0], resample[1])

                        if n_fft is not None and n_fft > 0:             # optionally transform to spectrogram
                            hop_length = n_fft / 4
                            '''
                            from http://theremin.ucsd.edu/~bmcfee/librosadoc/librosa.html
                            >>> # Get a power spectrogram from a waveform y
                            >>> S = np.abs(librosa.stft(y)) ** 2
                            >>> log_S = librosa.logamplitude(S)
                            '''
                            # s = np.abs(librosa.core.stft(s,
                            #                              n_fft=n_fft,
                            #                              hop_length=hop_length))**2
                            S = librosa.core.stft(s, n_fft=n_fft, hop_length=hop_length)
                            # mag = np.abs(S)       # magnitude spectrum
                            mag = np.abs(S)**2      # power spectrum
                            # phase = np.unwrap(np.angle(S))
                            phase = np.angle(S)

                            if n_freq_bins is not None:                 # optionally cut off high bands
                                mag = mag[0:n_freq_bins, :]
                                phase = phase[0:n_freq_bins, :]

                            if self.spectrum_log_amplitude:
                                mag = librosa.logamplitude(mag)

                            s = mag                                     # for normalization

                            '''
                            NOTE on normalization:
                            It depends on the structure of the neural network and (even more) on the properties
                            of the data. There is no single best normalization algorithm; if there were one, it
                            would be used everywhere by default. In theory, there is no requirement for the data
                            to be normalized at all; it is a purely practical matter, because in practice
                            convergence can take forever if the input is spread out too much. The simplest
                            approach is to scale the data to (-1,1) (or (0,1), depending on the activation
                            function), and in most cases this works. If the algorithm converges well, this is
                            your answer. If not, there are too many possible problems and methods to outline
                            here without knowing the actual data.
                            '''

                            ## normalize to mean 0, std 1
                            if self.spectrum_normalization_mode == 'mean0_std1':
                                # s = preprocessing.scale(s, axis=0)
                                mean = np.mean(s)
                                std = np.std(s)
                                s = (s - mean) / std

                            ## normalize by linear transform to [0,1]
                            elif self.spectrum_normalization_mode == 'linear_0_1':
                                s = s / np.max(s)

                            ## normalize by linear transform to [-1,1]
                            elif self.spectrum_normalization_mode == 'linear_-1_1':
                                s = -1 + 2 * (s - np.min(s)) / (np.max(s) - np.min(s))

                            elif self.spectrum_normalization_mode is not None:
                                raise ValueError(
                                    'unsupported spectrum normalization mode {}'.format(
                                        self.spectrum_normalization_mode))

                            # print s.mean(axis=0)
                            # print s.std(axis=0)

                            # include phase information if requested
                            if self.include_phase:
                                # normalize phase to [-1,1]
                                phase = phase / np.pi
                                s = np.vstack([s, phase])

                            # transpose to fit pylearn2 layout
                            s = np.transpose(s)
                        else:
                            # normalize to max amplitude 1
                            s = librosa.util.normalize(s)

                        s = np.asfarray(s, dtype='float32')

                        if frame_size > 0 and hop_size > 0:
                            s, l = self._split_sequence(s, l, frame_size, hop_size)

                        # print s.shape
                        n_sequences += len(s)
                        sequences.append(s)
                        labels.extend(l)

                        if keep_metadata:
                            self.metadata.append({
                                'subject': i,
                                'trial_no': subject_trial_no,
                                'stimulus': last_raw_label[0],
                                'channel': c,
                                'start': self.sequence_partitions[-1],
                                'stop': n_sequences
                            })

        # turn into numpy arrays
        sequences = np.vstack(sequences)
        print sequences.shape
        labels = np.hstack(labels)

        # one_hot_y = one_hot(labels)
        one_hot_formatter = OneHotFormatter(labels.max() + 1)
        one_hot_y = one_hot_formatter.format(labels)

        self.labels = labels    # save for later

        if n_fft > 0:
            sequences = np.array([sequences])

            # re-arrange dimensions to (b, 0, 1, c)
            sequences = sequences.swapaxes(0, 1).swapaxes(1, 2).swapaxes(2, 3)

            if layout == 'ft':
                sequences = sequences.swapaxes(1, 2)

            log.debug('final dataset shape: {} (b,0,1,c)'.format(sequences.shape))
            print 'final dataset shape: {} (b,0,1,c)'.format(sequences.shape)
            super(EEGDataset, self).__init__(topo_view=sequences, y=one_hot_y, axes=['b', 0, 1, 'c'])
        else:
            # if layout == 'ft':
            #     sequences = sequences.swapaxes(1, 2)
            super(EEGDataset, self).__init__(X=sequences, y=one_hot_y, axes=['b', 0, 1, 'c'])

        log.debug('generated dataset "{}" with shape X={} y={} labels={}'.format(
            self.name, self.X.shape, self.y.shape, self.labels.shape))

        if save_matrix_path is not None:
            matrix = DenseDesignMatrix(X=sequences, y=one_hot_y)
            with log_timing(log, 'saving DenseDesignMatrix to {}'.format(save_matrix_path)):
                serial.save(save_matrix_path, matrix)
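# --- Hedged shape sketch (not from the original dataset class): verifies the axis re-arrangement above.
# Toy sizes only; it shows that np.array([sequences]) prepends a channel axis and that the chained
# swapaxes calls move it to the end, producing the (b, 0, 1, c) topo_view layout with a single channel.
import numpy as np

b, d0, d1 = 4, 128, 12                      # toy batch size and 2-D frame dimensions
seqs = np.zeros((b, d0, d1))                # stacked 2-D frames, as after np.vstack(sequences)
topo = np.array([seqs])                     # shape (1, b, d0, d1): channel axis in front
topo = topo.swapaxes(0, 1).swapaxes(1, 2).swapaxes(2, 3)
assert topo.shape == (b, d0, d1, 1)         # (b, 0, 1, c) with one channel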