import numpy as np
import theano
import theano.tensor

# gammatone_matrix and erb_space are the repo's gammatone dictionary
# helpers, assumed importable from the surrounding module.


def gen_phone(mdl):
    """Synthesize a waveform from a trained model by iteratively
    predicting sparse gammatone frames conditioned on a phone sequence."""
    X = theano.tensor.dmatrix('X')
    P = theano.tensor.dmatrix('P')
    y = mdl.fprop([X, P])
    predict = theano.function([X, P], y)

    # Multi-scale gammatone dictionary: one block of atoms per channel
    resolution = 1600
    step = 64
    b = 1.019
    n_channels = 64
    D_multi = np.r_[tuple(gammatone_matrix(b, fc, resolution, step)
                          for fc in erb_space(150, 8000, n_channels))]

    phones = np.load('test_phones_1600.npy')
    X = np.asmatrix(np.zeros((len(phones), 1536)))
    phone_code = np.asmatrix(np.zeros((len(phones), 3 * 62)))
    for pi, p in enumerate(phones):
        # One-hot encoding of (previous, current, next) phone, 62 labels each
        phone_code[pi, [p[0], p[1] + 62, p[2] + 2 * 62]] = 1

    # Overlap-add synthesis: 1600-sample frames with a 200-sample hop
    out = np.zeros(1600 + 200 * (len(phones) - 1))
    step = 200
    for k in range(1, len(phones)):
        idx = range(k * step, k * step + 1600)
        X[k] = predict(X[k - 1], phone_code[k])
        out[idx] += np.asarray(np.dot(X[k], D_multi)).ravel()
    out_scaled = np.asarray(out / max(abs(out)), dtype='float32')
    return out_scaled
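# Usage sketch (assumption, not part of the repo): load a trained pylearn2
# model pickle and write the generated waveform to disk. The model path,
# output filename, and 16 kHz rate are hypothetical.
if __name__ == '__main__':
    from pylearn2.utils import serial
    from scipy.io import wavfile

    mdl = serial.load('mdl.pkl')  # hypothetical path to a trained model
    out = gen_phone(mdl)
    # gen_phone returns float32 scaled to [-1, 1]; TIMIT audio is 16 kHz
    wavfile.write('generated.wav', 16000, out)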
def __init__(self, which_set, frame_length, overlap=0,
             frames_per_example=1, start=0, stop=None, audio_only=False,
             n_prev_phones=0, n_next_phones=0, samples_to_predict=1,
             filter_fn=None, rng=_default_seed, b=1.019, step=8,
             n_channels=50):
    """
    Parameters
    ----------
    which_set : str
        Either "train", "valid" or "test"
    frame_length : int
        Number of acoustic samples contained in a frame
    overlap : int, optional
        Number of overlapping acoustic samples for two consecutive frames.
        Defaults to 0, meaning frames don't overlap.
    frames_per_example : int, optional
        Number of frames in a training example. Defaults to 1.
    start : int, optional
        Starting index of the sequences to use. Defaults to 0.
    stop : int, optional
        Ending index of the sequences to use. Defaults to `None`, meaning
        sequences are selected all the way to the end of the array.
    audio_only : bool, optional
        Whether to load only the raw audio and no auxiliary information.
        Defaults to `False`.
    n_prev_phones : int, optional
        Number of preceding phone frames to include as context.
        Defaults to 0.
    n_next_phones : int, optional
        Number of following phone frames to include as context.
        Defaults to 0.
    samples_to_predict : int, optional
        Number of acoustic samples to predict per example. Defaults to 1.
    filter_fn : str, optional
        String containing a Python expression that evaluates to a
        function; given the current speaker's info, it returns the
        indexes of the sequences to keep. Defaults to `None`, meaning no
        filtering.
    rng : object, optional
        A random number generator used for picking random indices into the
        design matrix when choosing minibatches.
    b : float, optional
        Bandwidth scale factor of the gammatone filters. Defaults to 1.019.
    step : int, optional
        Sample step between atom onsets in the gammatone dictionary.
        Defaults to 8.
    n_channels : int, optional
        Number of frequency channels in the gammatone dictionary.
        Defaults to 50.
    """
    self.frame_length = frame_length
    self.overlap = overlap
    self.frames_per_example = frames_per_example
    self.offset = self.frame_length - self.overlap
    self.audio_only = audio_only
    self.n_prev_phones = n_prev_phones
    self.n_next_phones = n_next_phones
    self.samples_to_predict = samples_to_predict
    self.b = b
    self.step = step
    self.n_channels = n_channels

    # Initializing the dictionary
    self.D = numpy.r_[tuple(gammatone_matrix(self.b, fc,
                                             self.frame_length, self.step)
                            for fc in erb_space(150, 8000,
                                                self.n_channels))]
    #self.coder = SparseCoder(dictionary=self.D,
    #                         transform_n_nonzero_coefs=None,
    #                         transform_alpha=1, transform_algorithm='omp')

    # RNG initialization
    if hasattr(rng, 'random_integers'):
        self.rng = rng
    else:
        self.rng = numpy.random.RandomState(rng)

    # Load data from disk
    self._load_data(which_set)

    # Standardize data
    for i, sequence in enumerate(self.raw_wav):
        self.raw_wav[i] = self.scaler.transform(sequence)

    if filter_fn is not None:
        # filter_fn arrives as a string (e.g. from a YAML config) and is
        # eval'd into a function that selects sequences by speaker info
        filter_fn = eval(filter_fn)
        indexes = filter_fn(self.speaker_info_list[self.speaker_id])
        self.raw_wav = self.raw_wav[indexes]
        if not self.audio_only:
            self.phones = self.phones[indexes]
            self.phonemes = self.phonemes[indexes]
            self.words = self.words[indexes]

    # Slice data
    if stop is not None:
        self.raw_wav = self.raw_wav[start:stop]
        if not self.audio_only:
            self.phones = self.phones[start:stop]
            self.phonemes = self.phonemes[start:stop]
            self.words = self.words[start:stop]
    else:
        self.raw_wav = self.raw_wav[start:]
        if not self.audio_only:
            self.phones = self.phones[start:]
            self.phonemes = self.phonemes[start:]
            self.words = self.words[start:]

    examples_per_sequence = [0]
    for sequence_id, samples_sequence in enumerate(self.raw_wav):
        if not self.audio_only:
            # Phones segmentation: label each frame with the mode of the
            # sample-level phone labels it contains, including context
            phones_sequence = self.phones[sequence_id]
            phones_segmented_sequence = segment_axis(phones_sequence,
                                                     frame_length, overlap)
            phones_mode = numpy.concatenate(
                [scipy.stats.mode(
                    phones_segmented_sequence[k - self.n_prev_phones:
                                              k + self.n_next_phones + 1],
                    axis=1)[0].T
                 for k in range(self.n_prev_phones,
                                len(phones_segmented_sequence) -
                                self.n_next_phones)])
            self.phones[sequence_id] = numpy.asarray(phones_mode,
                                                     dtype=numpy.int16)

            # Phonemes segmentation
            phonemes_sequence = self.phonemes[sequence_id]
            phonemes_segmented_sequence = segment_axis(phonemes_sequence,
                                                       frame_length,
                                                       overlap)
            if self.n_next_phones == 0:
                self.phonemes[sequence_id] = \
                    phonemes_segmented_sequence[self.n_prev_phones:]
            else:
                self.phonemes[sequence_id] = \
                    phonemes_segmented_sequence[self.n_prev_phones:
                                                -self.n_next_phones]

            # Words segmentation
            words_sequence = self.words[sequence_id]
            words_segmented_sequence = segment_axis(words_sequence,
                                                    frame_length, overlap)
            if self.n_next_phones == 0:
                self.words[sequence_id] = \
                    words_segmented_sequence[self.n_prev_phones:]
            else:
                self.words[sequence_id] = \
                    words_segmented_sequence[self.n_prev_phones:
                                             -self.n_next_phones]

            # Trim the audio frames to align with the phone context window
            if self.n_next_phones == 0:
                self.raw_wav[sequence_id] = \
                    self.raw_wav[sequence_id][self.n_prev_phones:]
            else:
                self.raw_wav[sequence_id] = \
                    self.raw_wav[sequence_id][self.n_prev_phones:
                                              -self.n_next_phones]

        # TODO: change me
        # Generate features/targets/phones/phonemes/words map
        num_frames = self.raw_wav[sequence_id].shape[0]
        num_examples = num_frames - self.frames_per_example
        examples_per_sequence.append(num_examples)

    self.cumulative_example_indexes = numpy.cumsum(examples_per_sequence)
    self.samples_sequences = self.raw_wav
    #numpy.save('/home/jfsantos/data/%s_sparse_frames.npy' % which_set,
    #           self.samples_sequences)
    if not self.audio_only:
        self.phones_sequences = self.phones
        self.phonemes_sequences = self.phonemes
        self.words_sequences = self.words
    self.num_examples = self.cumulative_example_indexes[-1]

    # DataSpecs
    features_space = VectorSpace(
        dim=self.D.shape[0] * self.frames_per_example
    )
    features_source = 'features'

    def features_map_fn(indexes):
        rval = []
        for sequence_index, example_index in self._fetch_index(indexes):
            rval.append(self.samples_sequences[sequence_index][
                example_index:
                example_index + self.frames_per_example].todense())
        return rval

    targets_space = VectorSpace(dim=self.D.shape[0])
    targets_source = 'targets'

    def targets_map_fn(indexes):
        rval = []
        for sequence_index, example_index in self._fetch_index(indexes):
            rval.append(self.samples_sequences[sequence_index][
                example_index + self.frames_per_example].todense())
        return rval

    space_components = [features_space, targets_space]
    source_components = [features_source, targets_source]
    map_fn_components = [features_map_fn, targets_map_fn]
    batch_components = [None, None]

    if not self.audio_only:
        num_phones = numpy.max([numpy.max(sequence)
                                for sequence in self.phones]) + 1
        phones_space = IndexSpace(
            max_labels=num_phones,
            dim=1 + self.n_prev_phones + self.n_next_phones,
            dtype=str(self.phones_sequences[0].dtype))
        phones_source = 'phones'

        def phones_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(indexes):
                rval.append(self.phones_sequences[sequence_index][
                    example_index].ravel())
            return rval

        num_phonemes = numpy.max([numpy.max(sequence)
                                  for sequence in self.phonemes]) + 1
        phonemes_space = IndexSpace(
            max_labels=num_phonemes, dim=1,
            dtype=str(self.phonemes_sequences[0].dtype))
        phonemes_source = 'phonemes'

        def phonemes_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(indexes):
                rval.append(self.phonemes_sequences[sequence_index][
                    example_index + self.frames_per_example].ravel())
            return rval

        num_words = numpy.max([numpy.max(sequence)
                               for sequence in self.words]) + 1
        words_space = IndexSpace(
            max_labels=num_words, dim=1,
            dtype=str(self.words_sequences[0].dtype))
        words_source = 'words'

        def words_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(indexes):
                rval.append(self.words_sequences[sequence_index][
                    example_index + self.frames_per_example].ravel())
            return rval

        space_components.extend([phones_space, phonemes_space,
                                 words_space])
        source_components.extend([phones_source, phonemes_source,
                                  words_source])
        map_fn_components.extend([phones_map_fn, phonemes_map_fn,
                                  words_map_fn])
        batch_components.extend([None, None, None])

    space = CompositeSpace(space_components)
    source = tuple(source_components)
    self.data_specs = (space, source)
    self.map_functions = tuple(map_fn_components)
    self.batch_buffers = batch_components

    # Defaults for iterators
    self._iter_mode = resolve_iterator_class('shuffled_sequential')
    self._iter_data_specs = (CompositeSpace((features_space,
                                             targets_space)),
                             (features_source, targets_source))
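# Usage sketch (hypothetical): construct the dataset with one phone of
# context on each side. `TIMITGammatone` stands in for whatever class this
# __init__ belongs to; the frame and overlap sizes are only illustrative.
#
#     train = TIMITGammatone('train', frame_length=160, overlap=80,
#                            frames_per_example=1, n_prev_phones=1,
#                            n_next_phones=1, stop=100)
#     it = train.iterator(mode='shuffled_sequential', batch_size=128,
#                         data_specs=train.data_specs)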
def __init__(self, which_set, frame_length, overlap=0.5,
             frames_per_example=1, start=0, stop=None, audio_only=True,
             n_prev_phones=0, n_next_phones=0, samples_to_predict=1,
             filter_fn=None, rng=_default_seed, b=1.019, step=64,
             n_channels=64):
    """
    Parameters
    ----------
    which_set : str
        Either "train", "valid" or "test"
    frame_length : int
        Number of acoustic samples contained in a frame
    overlap : int or float, optional
        Number of overlapping acoustic samples for two consecutive frames.
        Values below 1.0 are interpreted as a fraction of `frame_length`.
        Defaults to 0.5, i.e. 50% overlap.
    frames_per_example : int, optional
        Number of frames in a training example. Defaults to 1.
    start : int, optional
        Starting index of the sequences to use. Defaults to 0.
    stop : int, optional
        Ending index of the sequences to use. Defaults to `None`, meaning
        sequences are selected all the way to the end of the array.
    audio_only : bool, optional
        Whether to load only the raw audio and no auxiliary information.
        Defaults to `True`.
    rng : object, optional
        A random number generator used for picking random indices into the
        design matrix when choosing minibatches.
    b : float, optional
        Bandwidth scale factor of the gammatone filters. Defaults to 1.019.
    step : int, optional
        Sample step between atom onsets in the gammatone dictionary.
        Defaults to 64.
    n_channels : int, optional
        Number of frequency channels in the gammatone dictionary.
        Defaults to 64.
    """
    self.frame_length = frame_length
    if overlap < 1.0:
        # Fractional overlap is converted to a number of samples
        self.overlap = overlap * frame_length
    else:
        self.overlap = overlap
    self.frames_per_example = frames_per_example
    self.offset = self.frame_length - self.overlap
    self.audio_only = audio_only
    self.n_prev_phones = n_prev_phones
    self.n_next_phones = n_next_phones
    self.samples_to_predict = samples_to_predict
    self.b = b
    self.step = step
    self.n_channels = n_channels
    print "Frame length %d, overlap %d" % (self.frame_length, self.overlap)

    # Initializing the dictionary
    self.D = numpy.r_[tuple(gammatone_matrix(self.b, fc,
                                             self.frame_length, self.step)
                            for fc in erb_space(150, 8000,
                                                self.n_channels))]
    print "Using dictionary with shape", self.D.shape

    self.coder = SparseCoder(dictionary=self.D,
                             transform_n_nonzero_coefs=None,
                             transform_alpha=None,
                             transform_algorithm="omp")

    # RNG initialization
    if hasattr(rng, "random_integers"):
        self.rng = rng
    else:
        self.rng = numpy.random.RandomState(rng)

    # Load data from disk
    self._load_data(which_set)

    examples_per_sequence = [0]
    for sequence_id, samples_sequence in enumerate(self.raw_wav):
        print "Sentence %d/%d" % (sequence_id, len(self.raw_wav))
        # Segment into overlapping frames (in samples, not fractions),
        # apply a Hanning window, and sparse-code each frame on the
        # gammatone dictionary
        X = segment_axis(samples_sequence, frame_length,
                         int(self.overlap), end="pad")
        X = numpy.hanning(self.frame_length) * X
        self.raw_wav[sequence_id] = scipy.sparse.csr_matrix(
            self.coder.transform(X))

        # TODO: change me
        # Generate features/targets map
        num_frames = self.raw_wav[sequence_id].shape[0]
        num_examples = num_frames - self.frames_per_example
        examples_per_sequence.append(num_examples)

    self.cumulative_example_indexes = numpy.cumsum(examples_per_sequence)
    self.samples_sequences = self.raw_wav
    numpy.save("%s_sparse_frames.npy" % which_set, self.samples_sequences)
    self.num_examples = self.cumulative_example_indexes[-1]

    # DataSpecs
    features_space = VectorSpace(
        dim=self.D.shape[0] * self.frames_per_example
    )
    features_source = "features"

    def features_map_fn(indexes):
        rval = []
        for sequence_index, example_index in self._fetch_index(indexes):
            rval.append(self.samples_sequences[sequence_index][
                example_index:
                example_index + self.frames_per_example].todense())
        return rval

    targets_space = VectorSpace(dim=self.D.shape[0])
    targets_source = "targets"

    def targets_map_fn(indexes):
        rval = []
        for sequence_index, example_index in self._fetch_index(indexes):
            rval.append(self.samples_sequences[sequence_index][
                example_index + self.frames_per_example].todense())
        return rval

    space_components = [features_space, targets_space]
    source_components = [features_source, targets_source]
    map_fn_components = [features_map_fn, targets_map_fn]
    batch_components = [None, None]

    space = CompositeSpace(space_components)
    source = tuple(source_components)
    self.data_specs = (space, source)
    self.map_functions = tuple(map_fn_components)
    self.batch_buffers = batch_components

    # Defaults for iterators
    self._iter_mode = resolve_iterator_class("shuffled_sequential")
    self._iter_data_specs = (CompositeSpace((features_space,
                                             targets_space)),
                             (features_source, targets_source))
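# Usage sketch (hypothetical): with a fractional overlap, the hop between
# frames is frame_length - overlap * frame_length; e.g. frame_length=1600
# and overlap=0.875 give a 200-sample hop, matching the synthesis step in
# gen_phone. `TIMITSparse` stands in for whatever class this __init__
# belongs to.
#
#     train = TIMITSparse('train', frame_length=1600, overlap=0.875,
#                         frames_per_example=1, stop=10)
#     print train.num_examples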