def convert_to_dataset(X, y):
    X = np.vstack(X)
    y = np.vstack(y)

    # convert labels
    y = self.label_converter.get_labels(y, self.label_mode)
    y = np.hstack(y)

    one_hot_y = one_hot(y)

    dataset = DenseDesignMatrix(X=X, y=one_hot_y)
    dataset.labels = y  # for confusion matrix
    return dataset
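# Hedged sketch (added for illustration, not part of the original module): shows
# on toy arrays what convert_to_dataset produces -- per-trial arrays are stacked,
# the converted label vector is one-hot encoded, and the raw labels are attached
# for the confusion matrix. The toy shapes and values below are assumptions.
def _demo_convert_to_dataset_sketch():
    import numpy as np
    from pylearn2.utils.one_hot import one_hot
    from pylearn2.datasets.dense_design_matrix import DenseDesignMatrix

    X_list = [np.ones((2, 4)), np.zeros((3, 4))]  # two trials, 4 features each
    y = np.array([0, 0, 1, 1, 1])                 # one converted label per stacked row

    X = np.vstack(X_list)                         # shape (5, 4)
    one_hot_y = one_hot(y)                        # shape (5, 2)

    dataset = DenseDesignMatrix(X=X, y=one_hot_y)
    dataset.labels = y                            # kept for the confusion matrix
    return dataset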
def on_monitor(self, model, dataset, algorithm):
    d = model.get_param_vector() - self.origin

    data = list(dataset.get_batch_design(self.batch_size, include_labels=True))
    from pylearn2.utils.one_hot import one_hot
    data[1] = one_hot(data[1])

    cost_values = []
    for scale in self.scales:
        print "Evaluating cost at scale ", scale
        model.set_param_vector(self.origin + scale * d)
        model.enforce_constraints()
        cost_values.append(self.cost_fn(*data))

    print 'Scales searched: ', self.scales
    print 'Cost values: ', cost_values

    best_scale = self.scales[cost_values.index(min(cost_values))]
    print "best_scale: ", best_scale
    model.set_param_vector(self.origin + best_scale * d)
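# Hedged sketch (illustration only): the same pick-the-best-scale logic used by
# on_monitor above, applied to a toy 1-D quadratic cost instead of a compiled
# Theano cost function. The cost function and the scale grid are assumptions.
def _demo_scale_line_search_sketch():
    import numpy as np

    origin = np.zeros(3)                      # parameters recorded at the last step
    d = np.array([1.0, -2.0, 0.5])            # update direction (new params - origin)
    scales = [0.0, 0.5, 1.0, 2.0]             # candidate step sizes along d

    def cost_fn(params):
        # toy quadratic cost whose minimum lies at 0.7 * d
        return float(np.sum((params - 0.7 * d) ** 2))

    cost_values = [cost_fn(origin + scale * d) for scale in scales]
    best_scale = scales[cost_values.index(min(cost_values))]
    return origin + best_scale * d            # parameters the model would be set to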
def test_one_hot_basic():
    assert_equal(one_hot([1, 2]), [[0, 1, 0], [0, 0, 1]])
    assert_equal(one_hot([[1], [2], [1]], max_label=3),
                 [[0, 1, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0]])
def test_one_hot_out():
    out = np.empty((2, 3), dtype="uint8")
    assert_equal(one_hot([1, 2], out=out), [[0, 1, 0], [0, 0, 1]])
    assert_equal(out, [[0, 1, 0], [0, 0, 1]])
def test_one_hot_dtypes():
    int_dt = ["int8", "int16", "int32", "int64"]
    int_dt += ["u" + dt for dt in int_dt]
    float_dt = ["float64", "float32", "complex64", "complex128"]
    all_dt = int_dt + float_dt
    assert_(all(one_hot([5], dtype=dt).dtype == np.dtype(dt)
                for dt in all_dt))
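# Hedged reference sketch of the behaviour the tests above exercise (this is not
# pylearn2's implementation): each integer label becomes a row with a 1 in the
# matching column, the matrix has max_label + 1 columns, and dtype / out can be
# supplied by the caller. Argument names mirror the tests but are assumptions.
def _one_hot_sketch(labels, max_label=None, dtype='float64', out=None):
    import numpy as np

    labels = np.asarray(labels).ravel()
    if max_label is None:
        max_label = labels.max()
    if out is None:
        out = np.zeros((labels.shape[0], max_label + 1), dtype=dtype)
    else:
        out.fill(0)
    out[np.arange(labels.shape[0]), labels] = 1
    return out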
def setup_impl(self, model, dataset, algorithm):
    cost = algorithm.cost

    root = model.get_param_vector()
    dim = root.size

    rng = self.rng

    points = rng.randn(self.num_points, self.num_basis_vectors)
    points = points.astype(root.dtype)
    points *= self.scale

    if self.include_root:
        points[0, :] = 0.

    if not hasattr(self, 'cost_fn'):
        # Cargo-cult the boilerplate needed to evaluate the cost function
        # =======================================
        data_specs = cost.get_data_specs(model)
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

        # Build a flat tuple of Theano Variables, one for each space.
        # We want that so that if the same space/source is specified
        # more than once in data_specs, only one Theano Variable
        # is generated for it, and the corresponding value is passed
        # only once to the compiled Theano function.
        theano_args = []
        for space, source in safe_zip(space_tuple, source_tuple):
            name = '%s[%s]' % (self.__class__.__name__, source)
            arg = space.make_theano_batch(name=name,
                                          batch_size=self.batch_size)
            theano_args.append(arg)
        theano_args = tuple(theano_args)

        # Methods of `cost` need args to be passed in a format compatible
        # with data_specs
        nested_args = mapping.nest(theano_args)
        fixed_var_descr = cost.get_fixed_var_descr(model, nested_args)
        self.on_load_batch = fixed_var_descr.on_load_batch

        cost_value = cost.expr(model, nested_args,
                               **fixed_var_descr.fixed_vars)
        # End of cargo-culted block
        # ======================

        print "Compiling cost function..."
        cost_fn = function(theano_args, cost_value)
        self.cost_fn = cost_fn
    else:
        cost_fn = self.cost_fn

    cost_values = np.zeros(self.num_points)

    data = list(dataset.get_batch_design(self.batch_size,
                                         include_labels=True))
    from pylearn2.utils.one_hot import one_hot
    data[1] = one_hot(data[1])

    if self.method == 'gaussian':
        # sample a random Gaussian basis of shape (dim, num_basis_vectors);
        # rng.randn takes the shape directly, unlike rng.normal(loc, scale)
        basis = rng.randn(dim, self.num_basis_vectors).astype(root.dtype)
    elif self.method == 'element':
        basis = np.zeros((dim, self.num_basis_vectors)).astype(root.dtype)
        for i in xrange(self.num_basis_vectors):
            basis[rng.randint(dim), i] = 1.
    elif self.method == 'gradient':
        if not hasattr(self, 'grad_fn'):
            self.grad_fn = function(theano_args,
                                    grad(cost_value, model.get_params()))
        grad_fn = self.grad_fn

        basis = np.zeros((dim, self.num_basis_vectors)).astype(root.dtype)
        for i in xrange(self.num_basis_vectors):
            ipt = list(dataset.get_batch_design(1, include_labels=True))
            label = ipt[1]
            assert label.size == 1
            label = label[0]
            # NOTE: the number of classes (10) is hard-coded here;
            # use a separate name so the imported one_hot is not shadowed
            one_hot_label = np.zeros((1, 10,), dtype='float32')
            one_hot_label[0, label] = 1
            ipt[1] = one_hot_label
            g = grad_fn(*ipt)
            basis[:, i] = np.concatenate([e.reshape(e.size) for e in g],
                                         axis=0)
    else:
        assert False

    basis /= np.sqrt(np.square(basis).sum(axis=0))

    # Orthogonalize the basis (Gram-Schmidt against all previous vectors)
    for i in xrange(self.num_basis_vectors):
        v = basis[:, i].copy()
        for j in xrange(i):
            u = basis[:, j].copy()
            v -= np.dot(u, v) * u
        norm = np.sqrt(np.square(v).sum())
        assert norm > 1e-4
        v /= norm
        basis[:, i] = v

    for i in xrange(self.num_points):
        print "Evaluating cost at point ", i
        point = points[i, :]
        full_point = root + np.dot(basis, point)
        model.set_param_vector(full_point)
        cost_values[i] = cost_fn(*data)
        print cost_values[i]

    from pylearn2.utils import sharedX
    import theano.tensor as T

    print "!!!!!!!! FITTING THE QUADRATIC FUNCTION !!!!!!!!!!!!!!!!!!!"
    if not hasattr(self, 'fit_quad'):
        points = sharedX(points)
        #from theano import config
        #config.compute_test_value = 'raise'
        cost_values = sharedX(cost_values)
        A = sharedX(np.zeros((self.num_basis_vectors,
                              self.num_basis_vectors)))
        if self.psd:
            mat = T.dot(A.T, A)
        else:
            mat = A
        b = sharedX(np.zeros(self.num_basis_vectors))
        c = sharedX(0.)

        half_quad = T.dot(points, mat)
        quad = (points * half_quad).sum(axis=1)
        lin = T.dot(points, b)
        pred = quad + lin + c

        from pylearn2.optimization.batch_gradient_descent import BatchGradientDescent

        mse = T.square(pred - cost_values).mean()
        mae = abs(pred - cost_values).mean()

        obj = locals()[self.fitting_cost]

        fit_quad = BatchGradientDescent(obj, params=[A, b, c],
                                        max_iter=self.num_basis_vectors ** 2,
                                        verbose=3, tol=None,
                                        init_alpha=None,
                                        min_init_alpha=1e-7,
                                        reset_alpha=False, conjugate=True,
                                        reset_conjugate=False,
                                        line_search_mode='exhaustive')
        self.fit_quad = fit_quad
        self.A = A
        self.b = b
        self.c = c
        self.points = points
        self.cost_values = cost_values
    else:
        self.A.set_value(.001 * np.identity(self.A.get_value().shape[0],
                                            dtype=self.A.dtype))
        self.b.set_value(self.b.get_value() * 0.)
        self.c.set_value(self.c.get_value() * 0.)
        self.points.set_value(points)
        self.cost_values.set_value(cost_values.astype(self.cost_values.dtype))

    self.fit_quad.minimize()

    print "!!!!!!!!!!!!! FINDING ITS MINIMUM !!!!!!!!!!!!!!!!!!!!!!!!!!!"

    if self.use_solver:
        if self.psd:
            Av = self.A.get_value()
            mat_v = np.dot(Av.T, Av)
        else:
            mat_v = self.A.get_value()
        bv = self.b.get_value()

        # minimize x^T A x + b^T x + c
        # -> solve 2 A x + b = 0
        # -> A x = - b / 2
        print "********** mat_v", mat_v.min(), mat_v.max()
        x, ignored_residuals, ignored_rank, ignored_singular_values = \
            np.linalg.lstsq(mat_v, - 0.5 * bv)
        print "********** soln: ", x.min(), x.mean(), x.max()
        print "********** SVs: ", ignored_singular_values.min(), \
            ignored_singular_values.max()
        assert x.ndim == 1, x.shape
        prod = np.dot(basis, x)
        norm = np.sqrt(np.square(prod).sum())
        print "*************** Moving params by ", norm
        vector = root + prod
        model.set_param_vector(vector)
    else:
        # use the gradient-descent minimizer
        if not hasattr(self, 'fit_params'):
            self.vector = sharedX(points.get_value().mean(axis=0))
            vector = self.vector
            obj = T.dot(T.dot(mat, vector), vector) + T.dot(b, vector)

            def constrain(d):
                assert vector in d
                n = d[vector]
                norm = T.sqrt(T.square(n).sum())
                desired_norm = T.clip(norm, 0., self.max_jump_norm)
                d[vector] = n * desired_norm / norm

            self.fit_params = BatchGradientDescent(obj, params=[vector],
                                                   max_iter=self.num_basis_vectors,
                                                   verbose=3, tol=None,
                                                   param_constrainers=[constrain],
                                                   init_alpha=None,
                                                   min_init_alpha=1e-3,
                                                   reset_alpha=False,
                                                   conjugate=True,
                                                   reset_conjugate=False,
                                                   line_search_mode='exhaustive')
        else:
            self.vector.set_value(points.mean(axis=0).astype(self.vector.dtype))

        self.fit_params.minimize()
        model.set_param_vector(root + np.dot(basis, self.vector.get_value()))
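# Hedged sketch (illustration only): the closed-form step used in the solver
# branch above. For a fitted quadratic x^T M x + b^T x + c with symmetric M,
# the stationary point satisfies 2 M x + b = 0, i.e. M x = -b / 2. The toy
# matrix and vector below are assumptions, not values from the original code.
def _demo_quadratic_minimum_sketch():
    import numpy as np

    A = np.array([[2.0, 0.0], [1.0, 1.0]])
    M = np.dot(A.T, A)                     # PSD matrix, as in the self.psd branch
    b = np.array([-4.0, 2.0])

    x, _residuals, _rank, _svals = np.linalg.lstsq(M, -0.5 * b)

    # the gradient 2 M x + b vanishes at the solution
    grad = 2.0 * np.dot(M, x) + b
    assert np.allclose(grad, 0.0)
    return x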
def __init__(self,
             path, suffix='',            # required data file parameters
             subjects='all',             # optional selector (list) or 'all'
             start_sample=0,
             stop_sample=None,           # optional for selection of sub-sequences
             frame_size=-1,
             hop_size=-1,                # values > 0 will lead to windowing
             label_mode='tempo',
             name='',                    # optional name
             n_fft=0,
             n_freq_bins=None,
             save_matrix_path=None,
             channels=None,
             resample=None,
             stimulus_id_filter=None,
             keep_metadata=False,
             spectrum_log_amplitude=False,
             spectrum_normalization_mode=None,
             ):
    '''
    Constructor
    '''
    self.name = name

    self.spectrum_normalization_mode = spectrum_normalization_mode
    self.spectrum_log_amplitude = spectrum_log_amplitude

    self.datafiles = []
    subject_paths = glob.glob(os.path.join(path, 'Sub*'))
    for path in subject_paths:
        dataset_filename = os.path.join(path, 'dataset' + suffix + '.pklz')
        if os.path.isfile(dataset_filename):
            log.debug('adding {}'.format(dataset_filename))
            self.datafiles.append(dataset_filename)
        else:
            log.warn('file does not exist {}'.format(dataset_filename))
    self.datafiles.sort()

    if subjects == 'all':
        subjects = np.arange(0, len(self.datafiles))
    assert subjects is not None and len(subjects) > 0

    self.label_mode = label_mode
    self.label_converter = LabelConverter()

    if stimulus_id_filter is None:
        stimulus_id_filter = []
    self.stimulus_id_filter = stimulus_id_filter

    self.subject_partitions = []    # used to keep track of original subjects
    self.sequence_partitions = []   # used to keep track of original sequences
    self.trial_partitions = []      # keeps track of original trials

    # metadata: [subject, trial_no, stimulus, channel, start, ]
    self.metadata = []

    sequences = []
    labels = []
    n_sequences = 0
    last_raw_label = -1
    for i in xrange(len(self.datafiles)):
        if i in subjects:
            with log_timing(log, 'loading data from {}'.format(self.datafiles[i])):

                # save start of next subject
                self.subject_partitions.append(n_sequences)

                subject_sequences, subject_labels, channel_meta = load(self.datafiles[i])

                subject_trial_no = -1

                for j in xrange(len(subject_sequences)):
                    l = subject_labels[j]   # get raw label

                    if l in stimulus_id_filter:
                        # log.debug('skipping stimulus {}'.format(l))
                        continue

                    c = channel_meta[j][0]

                    # apply optional channel filter
                    if channels is not None and c not in channels:
                        log.debug('skipping channel {}'.format(c))
                        continue

                    # save start of next sequence
                    self.sequence_partitions.append(n_sequences)

                    if l != last_raw_label:                         # if raw label changed...
                        self.trial_partitions.append(n_sequences)   # ...save start of next trial
                        subject_trial_no += 1                       # increment trial counter

                    last_raw_label = l

                    # convert to label_mode view
                    l = self.label_converter.get_label(l[0], self.label_mode)

                    s = subject_sequences[j]
                    s = s[start_sample:stop_sample]     # get sub-sequence in original space

                    # down-sample if requested
                    if resample is not None and resample[0] != resample[1]:
                        s = librosa.resample(s, resample[0], resample[1])

                    if n_fft is not None and n_fft > 0:
                        # optionally transform to spectrogram
                        hop_length = n_fft / 4
                        '''
                        from http://theremin.ucsd.edu/~bmcfee/librosadoc/librosa.html
                        >>> # Get a power spectrogram from a waveform y
                        >>> S = np.abs(librosa.stft(y)) ** 2
                        >>> log_S = librosa.logamplitude(S)
                        '''
                        s = np.abs(librosa.core.stft(s,
                                                     n_fft=n_fft,
                                                     hop_length=hop_length)) ** 2

                        if n_freq_bins is not None:
                            s = s[0:n_freq_bins, :]     # optionally cut off high bands

                        if self.spectrum_log_amplitude:
                            s = librosa.logamplitude(s)

                        '''
                        NOTE on normalization:
                        It depends on the structure of the neural network and, even more,
                        on the properties of the data. There is no single best
                        normalization scheme; if there were, it would be the default
                        everywhere. In theory the data does not have to be normalized at
                        all, but in practice convergence can take very long if the input
                        is spread out too much. The simplest approach is to scale the data
                        to (-1, 1) (or (0, 1), depending on the activation function), and
                        in most cases that works. If the algorithm converges well, that is
                        the answer; if not, the remaining options depend on the actual data.
                        '''

                        ## normalize to mean 0, std 1
                        if self.spectrum_normalization_mode == 'mean0_std1':
                            # s = preprocessing.scale(s, axis=0)
                            mean = np.mean(s)
                            std = np.std(s)
                            s = (s - mean) / std

                        ## normalize by linear transform to [0,1]
                        elif self.spectrum_normalization_mode == 'linear_0_1':
                            s = s / np.max(s)

                        ## normalize by linear transform to [-1,1]
                        elif self.spectrum_normalization_mode == 'linear_-1_1':
                            s = -1 + 2 * (s - np.min(s)) / (np.max(s) - np.min(s))

                        elif self.spectrum_normalization_mode is not None:
                            raise ValueError('unsupported spectrum normalization mode {}'.format(
                                self.spectrum_normalization_mode))

                        # print s.mean(axis=0)
                        # print s.std(axis=0)

                        # transpose to fit pylearn2 layout
                        s = np.transpose(s)
                    else:
                        # normalize to max amplitude 1
                        s = librosa.util.normalize(s)

                    s = np.asfarray(s, dtype='float32')

                    if frame_size > 0 and hop_size > 0:
                        s, l = self._split_sequence(s, l, frame_size, hop_size)

                    # print s.shape
                    n_sequences += len(s)

                    sequences.append(s)
                    labels.extend(l)

                    if keep_metadata:
                        self.metadata.append({
                            'subject': i,                               # subject
                            'trial_no': subject_trial_no,               # trial_no
                            'stimulus': last_raw_label[0],              # stimulus
                            'channel': c,                               # channel
                            'start': self.sequence_partitions[-1],      # start
                            'stop': n_sequences                         # stop
                        })

    # turn into numpy arrays
    sequences = np.vstack(sequences)
    # print sequences.shape

    labels = np.hstack(labels)

    one_hot_y = one_hot(labels)
    self.labels = labels    # save for later

    if n_fft > 0:
        sequences = np.array([sequences])

        # re-arrange dimensions
        sequences = sequences.swapaxes(0, 1).swapaxes(1, 2).swapaxes(2, 3)

        log.debug('final dataset shape: {} (b,0,1,c)'.format(sequences.shape))
        super(EEGDataset, self).__init__(topo_view=sequences,
                                         y=one_hot_y,
                                         axes=['b', 0, 1, 'c'])
    else:
        super(EEGDataset, self).__init__(X=sequences,
                                         y=one_hot_y,
                                         axes=['b', 0, 1, 'c'])

    log.debug('generated dataset "{}" with shape X={} y={} labels={}'.format(
        self.name, self.X.shape, self.y.shape, self.labels.shape))

    if save_matrix_path is not None:
        matrix = DenseDesignMatrix(X=sequences, y=one_hot_y)
        with log_timing(log, 'saving DenseDesignMatrix to {}'.format(save_matrix_path)):
            serial.save(save_matrix_path, matrix)
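# Hedged usage sketch (added for illustration): the directory, suffix and
# parameter values below are assumptions, not taken from the original code. It
# builds a windowed log-spectrogram dataset in 'tempo' label mode with the
# constructor above; the Sub* folders must already contain dataset<suffix>.pklz
# files for this to run.
def _demo_build_eeg_dataset_sketch():
    return EEGDataset(
        path='data/eeg',                 # directory containing Sub* subject folders
        suffix='_cleaned',               # loads dataset_cleaned.pklz per subject
        subjects='all',
        label_mode='tempo',
        frame_size=512, hop_size=256,    # values > 0 enable windowing
        n_fft=256, n_freq_bins=64,       # values > 0 enable the spectrogram branch
        spectrum_log_amplitude=True,
        spectrum_normalization_mode='mean0_std1',
        keep_metadata=True)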