def _generate_data(self):
    """
    Generates the (X, y) pair used to initialize the DenseDesignMatrix:
    frames of a Henon-map series (x' = 1 - alpha*x**2 + y, y' = beta*x)
    and, for each frame, the state that starts the next frame.
    """
    X = numpy.zeros((self.samples + 1, 2))
    X[0, :] = self.init_state
    for i in range(1, X.shape[0]):
        X[i, 0] = 1 - self.alpha * X[i - 1, 0] ** 2 + X[i - 1, 1]
        X[i, 1] = self.beta * X[i - 1, 0]
    last_target = X[-1, :]
    X = X[:-1, :]
    # Note: segment_axis flattens its input when no axis is given, so no
    # explicit reshape of X is needed before framing.
    Z = segment_axis(X, length=self.frame_length, overlap=0)
    y = numpy.zeros((Z.shape[0], 2))
    y[:-1, :] = Z[1:, 0:2]
    y[-1, :] = last_target  # the state right after the trimmed X[-1, :]
    return (Z, y)
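# The recurrence above is the Henon map (x' = 1 - alpha*x**2 + y,
# y' = beta*x; the classic chaotic regime uses alpha=1.4, beta=0.3). A
# minimal self-contained sketch of the same generator, outside the class;
# the function name henon_series is illustrative only:
import numpy

def henon_series(n, alpha=1.4, beta=0.3, init_state=(0.0, 0.0)):
    """Iterate the Henon map for n steps and return the (x, y) states."""
    X = numpy.zeros((n, 2))
    X[0] = init_state
    for i in range(1, n):
        X[i, 0] = 1 - alpha * X[i - 1, 0] ** 2 + X[i - 1, 1]
        X[i, 1] = beta * X[i - 1, 0]
    return X

X = henon_series(1000)
print(X[:5])  # first few states of the chaotic series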
def __init__(self, which_set, frame_length, overlap=0,
             frames_per_example=1, start=0, stop=None, audio_only=False,
             n_prev_phones=0, n_next_phones=0, samples_to_predict=1,
             filter_fn=None, rng=_default_seed, b=1.019, step=8,
             n_channels=50):
    """
    Parameters
    ----------
    which_set : str
        Either "train", "valid" or "test"
    frame_length : int
        Number of acoustic samples contained in a frame
    overlap : int, optional
        Number of overlapping acoustic samples for two consecutive
        frames. Defaults to 0, meaning frames don't overlap.
    frames_per_example : int, optional
        Number of frames in a training example. Defaults to 1.
    start : int, optional
        Starting index of the sequences to use. Defaults to 0.
    stop : int, optional
        Ending index of the sequences to use. Defaults to `None`,
        meaning sequences are selected all the way to the end of the
        array.
    audio_only : bool, optional
        Whether to load only the raw audio and no auxiliary
        information. Defaults to `False`.
    n_prev_phones : int, optional
        Number of preceding frames whose phone label is attached to
        each example. Defaults to 0.
    n_next_phones : int, optional
        Number of following frames whose phone label is attached to
        each example. Defaults to 0.
    samples_to_predict : int, optional
        Number of acoustic samples to predict. Defaults to 1.
    filter_fn : str, optional
        Source string for a callable used to select sequences from the
        speaker information. Defaults to `None`, meaning no filtering.
    rng : object, optional
        A random number generator used for picking random indices
        into the design matrix when choosing minibatches.
    b : float, optional
        Bandwidth parameter of the gammatone dictionary atoms.
    step : int, optional
        Temporal step between successive atoms in the dictionary.
    n_channels : int, optional
        Number of ERB-spaced gammatone center frequencies.
    """
    self.frame_length = frame_length
    self.overlap = overlap
    self.frames_per_example = frames_per_example
    self.offset = self.frame_length - self.overlap
    self.audio_only = audio_only
    self.n_prev_phones = n_prev_phones
    self.n_next_phones = n_next_phones
    self.samples_to_predict = samples_to_predict
    self.b = b
    self.step = step
    self.n_channels = n_channels

    # Initializing the dictionary
    self.D = numpy.r_[tuple(gammatone_matrix(self.b, fc,
                                             self.frame_length,
                                             self.step)
                            for fc in erb_space(150, 8000,
                                                self.n_channels))]
    # self.coder = SparseCoder(dictionary=self.D,
    #                          transform_n_nonzero_coefs=None,
    #                          transform_alpha=1,
    #                          transform_algorithm='omp')

    # RNG initialization
    if hasattr(rng, 'random_integers'):
        self.rng = rng
    else:
        self.rng = numpy.random.RandomState(rng)

    # Load data from disk
    self._load_data(which_set)
    # Standardize data
    for i, sequence in enumerate(self.raw_wav):
        self.raw_wav[i] = self.scaler.transform(sequence)

    if filter_fn is not None:
        # Note: `eval` trusts the caller; the string must evaluate to a
        # callable taking the speaker info rows and returning indexes.
        filter_fn = eval(filter_fn)
        indexes = filter_fn(self.speaker_info_list[self.speaker_id])
        self.raw_wav = self.raw_wav[indexes]
        if not self.audio_only:
            self.phones = self.phones[indexes]
            self.phonemes = self.phonemes[indexes]
            self.words = self.words[indexes]

    # Slice data
    if stop is not None:
        self.raw_wav = self.raw_wav[start:stop]
        if not self.audio_only:
            self.phones = self.phones[start:stop]
            self.phonemes = self.phonemes[start:stop]
            self.words = self.words[start:stop]
    else:
        self.raw_wav = self.raw_wav[start:]
        if not self.audio_only:
            self.phones = self.phones[start:]
            self.phonemes = self.phonemes[start:]
            self.words = self.words[start:]

    examples_per_sequence = [0]

    for sequence_id, samples_sequence in enumerate(self.raw_wav):
        if not self.audio_only:
            # Phones segmentation: most frequent phone per frame, with
            # a context window of n_prev_phones/n_next_phones frames
            phones_sequence = self.phones[sequence_id]
            phones_segmented_sequence = segment_axis(phones_sequence,
                                                     frame_length,
                                                     overlap)
            phones_mode = numpy.concatenate(
                [scipy.stats.mode(
                    phones_segmented_sequence[k - self.n_prev_phones:
                                              k + self.n_next_phones + 1],
                    axis=1)[0].T
                 for k in range(self.n_prev_phones,
                                len(phones_segmented_sequence) -
                                self.n_next_phones)])
            self.phones[sequence_id] = numpy.asarray(phones_mode,
                                                     dtype=numpy.int16)
            # Phonemes segmentation
            phonemes_sequence = self.phonemes[sequence_id]
            phonemes_segmented_sequence = segment_axis(phonemes_sequence,
                                                       frame_length,
                                                       overlap)
            if self.n_next_phones == 0:
                self.phonemes[sequence_id] = \
                    phonemes_segmented_sequence[self.n_prev_phones:]
            else:
                self.phonemes[sequence_id] = \
                    phonemes_segmented_sequence[self.n_prev_phones:
                                                -self.n_next_phones]
            # Words segmentation
            words_sequence = self.words[sequence_id]
            words_segmented_sequence = segment_axis(words_sequence,
                                                    frame_length,
                                                    overlap)
            if self.n_next_phones == 0:
                self.words[sequence_id] = \
                    words_segmented_sequence[self.n_prev_phones:]
            else:
                self.words[sequence_id] = \
                    words_segmented_sequence[self.n_prev_phones:
                                             -self.n_next_phones]
        if self.n_next_phones == 0:
            self.raw_wav[sequence_id] = \
                self.raw_wav[sequence_id][self.n_prev_phones:]
        else:
            self.raw_wav[sequence_id] = \
                self.raw_wav[sequence_id][self.n_prev_phones:
                                          -self.n_next_phones]

        # TODO: change me
        # Generate features/targets/phones/phonemes/words map
        num_frames = self.raw_wav[sequence_id].shape[0]
        num_examples = num_frames - self.frames_per_example
        examples_per_sequence.append(num_examples)

    self.cumulative_example_indexes = numpy.cumsum(examples_per_sequence)
    self.samples_sequences = self.raw_wav
    # numpy.save('/home/jfsantos/data/%s_sparse_frames.npy' % which_set,
    #            self.samples_sequences)
    if not self.audio_only:
        self.phones_sequences = self.phones
        self.phonemes_sequences = self.phonemes
        self.words_sequences = self.words
    self.num_examples = self.cumulative_example_indexes[-1]

    # DataSpecs
    features_space = VectorSpace(
        dim=self.D.shape[0] * self.frames_per_example
    )
    features_source = 'features'

    def features_map_fn(indexes):
        rval = []
        for sequence_index, example_index in self._fetch_index(indexes):
            rval.append(self.samples_sequences[sequence_index][
                example_index:example_index +
                self.frames_per_example].todense())
        return rval

    targets_space = VectorSpace(dim=self.D.shape[0])
    targets_source = 'targets'

    def targets_map_fn(indexes):
        rval = []
        for sequence_index, example_index in self._fetch_index(indexes):
            rval.append(self.samples_sequences[sequence_index][
                example_index + self.frames_per_example].todense())
        return rval

    space_components = [features_space, targets_space]
    source_components = [features_source, targets_source]
    map_fn_components = [features_map_fn, targets_map_fn]
    batch_components = [None, None]

    if not self.audio_only:
        num_phones = numpy.max([numpy.max(sequence) for sequence
                                in self.phones]) + 1
        phones_space = IndexSpace(
            max_labels=num_phones,
            dim=1 + self.n_prev_phones + self.n_next_phones,
            dtype=str(self.phones_sequences[0].dtype))
        phones_source = 'phones'

        def phones_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                rval.append(self.phones_sequences[sequence_index][
                    example_index].ravel())
            return rval

        num_phonemes = numpy.max([numpy.max(sequence) for sequence
                                  in self.phonemes]) + 1
        phonemes_space = IndexSpace(
            max_labels=num_phonemes, dim=1,
            dtype=str(self.phonemes_sequences[0].dtype))
        phonemes_source = 'phonemes'

        def phonemes_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                rval.append(self.phonemes_sequences[sequence_index][
                    example_index + self.frames_per_example].ravel())
            return rval

        num_words = numpy.max([numpy.max(sequence) for sequence
                               in self.words]) + 1
        words_space = IndexSpace(
            max_labels=num_words, dim=1,
            dtype=str(self.words_sequences[0].dtype))
        words_source = 'words'

        def words_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                rval.append(self.words_sequences[sequence_index][
                    example_index + self.frames_per_example].ravel())
            return rval

        space_components.extend([phones_space, phonemes_space,
                                 words_space])
        source_components.extend([phones_source, phonemes_source,
                                  words_source])
        map_fn_components.extend([phones_map_fn, phonemes_map_fn,
                                  words_map_fn])
        batch_components.extend([None, None, None])

    space = CompositeSpace(space_components)
    source = tuple(source_components)
    self.data_specs = (space, source)
    self.map_functions = tuple(map_fn_components)
    self.batch_buffers = batch_components

    # Defaults for iterators
    self._iter_mode = resolve_iterator_class('shuffled_sequential')
    self._iter_data_specs = (CompositeSpace((features_space,
                                             targets_space)),
                             (features_source, targets_source))
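# self.D above stacks time-shifted gammatone atoms at ERB-spaced center
# frequencies. gammatone_matrix and erb_space come from elsewhere in the
# repo; the sketch below is a simplified stand-in (4th-order gammatone
# envelope, ERB bandwidth per Glasberg & Moore, and linearly spaced
# centers instead of erb_space) showing the dictionary's layout only:
import numpy

def erb(fc):
    # Equivalent rectangular bandwidth (Glasberg & Moore, 1990)
    return 24.7 * (4.37 * fc / 1000.0 + 1.0)

def gammatone_atom(fc, fs, length, b=1.019, order=4):
    t = numpy.arange(length, dtype=float) / fs
    env = t ** (order - 1) * numpy.exp(-2 * numpy.pi * b * erb(fc) * t)
    atom = env * numpy.cos(2 * numpy.pi * fc * t)
    return atom / numpy.linalg.norm(atom)

def toy_gammatone_matrix(b, fc, frame_length, step, fs=16000):
    # One row per time shift of the atom within the frame
    rows = []
    for shift in range(0, frame_length, step):
        row = numpy.zeros(frame_length)
        row[shift:] = gammatone_atom(fc, fs, frame_length - shift, b=b)
        rows.append(row)
    return numpy.vstack(rows)

centers = numpy.linspace(150, 8000, 50)  # stand-in for erb_space(150, 8000, 50)
D = numpy.r_[tuple(toy_gammatone_matrix(1.019, fc, 160, 8)
                   for fc in centers)]
print(D.shape)  # (n_atoms, frame_length)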
def __init__(self, which_set, frame_length, start=0, stop=None,
             audio_only=False, rng=_default_seed):
    """
    Parameters
    ----------
    which_set : str
        Either "train", "valid" or "test"
    frame_length : int
        Number of acoustic samples contained in the sliding window
    start : int, optional
        Starting index of the sequences to use. Defaults to 0.
    stop : int, optional
        Ending index of the sequences to use. Defaults to `None`,
        meaning sequences are selected all the way to the end of the
        array.
    audio_only : bool, optional
        Whether to load only the raw audio and no auxiliary
        information. Defaults to `False`.
    rng : object, optional
        A random number generator used for picking random indices
        into the design matrix when choosing minibatches.
    """
    self.frame_length = frame_length
    self.audio_only = audio_only

    # RNG initialization
    if hasattr(rng, 'random_integers'):
        self.rng = rng
    else:
        self.rng = numpy.random.RandomState(rng)

    # Load data from disk
    self._load_data(which_set)
    # Standardize data
    for i, sequence in enumerate(self.raw_wav):
        self.raw_wav[i] = (sequence - TIMIT._mean) / TIMIT._std

    if not self.audio_only:
        self.num_phones = numpy.max(
            [numpy.max(sequence) for sequence in self.phones]) + 1
        self.num_phonemes = numpy.max(
            [numpy.max(sequence) for sequence in self.phonemes]) + 1
        self.num_words = numpy.max(
            [numpy.max(sequence) for sequence in self.words]) + 1

    # Slice data
    if stop is not None:
        self.raw_wav = self.raw_wav[start:stop]
        if not self.audio_only:
            self.phones = self.phones[start:stop]
            self.phonemes = self.phonemes[start:stop]
            self.words = self.words[start:stop]
    else:
        self.raw_wav = self.raw_wav[start:]
        if not self.audio_only:
            self.phones = self.phones[start:]
            self.phonemes = self.phonemes[start:]
            self.words = self.words[start:]

    samples_sequences = []
    targets_sequences = []
    phones_sequences = []
    phonemes_sequences = []
    words_sequences = []
    for sequence_id, samples_sequence in enumerate(self.raw_wav):
        # Sequence segmentation: a sliding window advancing one sample
        # at a time; the last frame is dropped so every frame has a
        # next-sample target.
        samples_segmented_sequence = segment_axis(samples_sequence,
                                                  frame_length,
                                                  frame_length - 1)[:-1]
        samples_sequences.append(samples_segmented_sequence)
        targets_sequences.append(samples_sequence[frame_length:].reshape(
            (samples_sequence[frame_length:].shape[0], 1)))
        if not self.audio_only:
            target_phones = self.phones[sequence_id][frame_length:]
            phones_sequences.append(
                target_phones.reshape((target_phones.shape[0], 1)))
            target_phonemes = self.phonemes[sequence_id][frame_length:]
            phonemes_sequences.append(
                target_phonemes.reshape((target_phonemes.shape[0], 1)))
            target_words = self.words[sequence_id][frame_length:]
            words_sequences.append(
                target_words.reshape((target_words.shape[0], 1)))

    del self.raw_wav
    self.samples_sequences = samples_sequences
    self.targets_sequences = targets_sequences
    self.data = [samples_sequences, targets_sequences]
    if not self.audio_only:
        del self.phones
        del self.phonemes
        del self.words
        self.phones_sequences = phones_sequences
        self.phonemes_sequences = phonemes_sequences
        self.words_sequences = words_sequences
        self.data.extend(
            [phones_sequences, phonemes_sequences, words_sequences])
    self.num_examples = len(samples_sequences)

    # DataSpecs
    features_space = VectorSequenceSpace(dim=self.frame_length)
    features_source = 'features'

    targets_space = VectorSequenceSpace(dim=1)
    targets_source = 'targets'

    space_components = [features_space, targets_space]
    source_components = [features_source, targets_source]
    batch_components = [None, None]

    if not self.audio_only:
        phones_space = IndexSequenceSpace(
            max_labels=self.num_phones,
            dim=1,
            dtype=str(self.phones_sequences[0].dtype))
        phones_source = 'phones'

        phonemes_space = IndexSequenceSpace(
            max_labels=self.num_phonemes,
            dim=1,
            dtype=str(self.phonemes_sequences[0].dtype))
        phonemes_source = 'phonemes'

        words_space = IndexSequenceSpace(
            max_labels=self.num_words,
            dim=1,
            dtype=str(self.words_sequences[0].dtype))
        words_source = 'words'

        space_components.extend(
            [phones_space, phonemes_space, words_space])
        source_components.extend(
            [phones_source, phonemes_source, words_source])
        batch_components.extend([None, None, None])

    space = CompositeSpace(space_components)
    source = tuple(source_components)
    self.data_specs = (space, source)
    self.batch_buffers = batch_components

    # Defaults for iterators
    self._iter_mode = resolve_iterator_class('shuffled_sequential')
    self._iter_data_specs = (CompositeSpace(
        (features_space, targets_space)),
        (features_source, targets_source))
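# The segmentation above pairs each length-frame_length window with the
# acoustic sample that immediately follows it. A pure-numpy sketch of the
# same alignment, with a stand-in for segment_axis(x, L, L - 1):
import numpy

def sliding_frames(x, frame_length):
    # Every frame starts one sample after the previous one.
    n = len(x) - frame_length + 1
    idx = numpy.arange(frame_length)[None, :] + numpy.arange(n)[:, None]
    return x[idx]

x = numpy.arange(10.0)
L = 3
features = sliding_frames(x, L)[:-1]  # frames x[t:t+L]
targets = x[L:].reshape(-1, 1)        # the sample right after each frame
assert len(features) == len(targets)
print(features[0], targets[0])        # [0. 1. 2.] [3.]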
def __init__(self, which_set, frame_length, overlap=0,
             frames_per_example=1, start=0, stop=None, audio_only=False,
             rng=_default_seed):
    """
    Parameters
    ----------
    which_set : str
        Either "train", "valid" or "test"
    frame_length : int
        Number of acoustic samples contained in a frame
    overlap : int, optional
        Number of overlapping acoustic samples for two consecutive
        frames. Defaults to 0, meaning frames don't overlap.
    frames_per_example : int, optional
        Number of frames in a training example. Defaults to 1.
    start : int, optional
        Starting index of the sequences to use. Defaults to 0.
    stop : int, optional
        Ending index of the sequences to use. Defaults to `None`,
        meaning sequences are selected all the way to the end of the
        array.
    audio_only : bool, optional
        Whether to load only the raw audio and no auxiliary
        information. Defaults to `False`.
    rng : object, optional
        A random number generator used for picking random indices
        into the design matrix when choosing minibatches.
    """
    self.frame_length = frame_length
    self.overlap = overlap
    self.frames_per_example = frames_per_example
    self.offset = self.frame_length - self.overlap
    self.audio_only = audio_only

    # RNG initialization
    if hasattr(rng, 'random_integers'):
        self.rng = rng
    else:
        self.rng = numpy.random.RandomState(rng)

    # Load data from disk
    self._load_data(which_set)
    # Standardize data
    for i, sequence in enumerate(self.raw_wav):
        self.raw_wav[i] = (sequence - TIMIT._mean) / TIMIT._std

    if not self.audio_only:
        self.num_phones = numpy.max(
            [numpy.max(sequence) for sequence in self.phones]) + 1
        self.num_phonemes = numpy.max(
            [numpy.max(sequence) for sequence in self.phonemes]) + 1
        self.num_words = numpy.max(
            [numpy.max(sequence) for sequence in self.words]) + 1
        # The following is hard coded. However, the way it is done
        # above could be problematic if a max value (the max over the
        # whole dataset (train + valid + test)) is not present in at
        # least one of the three subsets. This is the case for
        # speakers. This is not the case for phones.
        self.num_speakers = 630

    # Slice data
    if stop is not None:
        self.raw_wav = self.raw_wav[start:stop]
        if not self.audio_only:
            self.phones = self.phones[start:stop]
            self.phonemes = self.phonemes[start:stop]
            self.words = self.words[start:stop]
            self.speaker_id = self.speaker_id[start:stop]
    else:
        self.raw_wav = self.raw_wav[start:]
        if not self.audio_only:
            self.phones = self.phones[start:]
            self.phonemes = self.phonemes[start:]
            self.words = self.words[start:]
            self.speaker_id = self.speaker_id[start:]

    examples_per_sequence = [0]

    for sequence_id, samples_sequence in enumerate(self.raw_wav):
        if not self.audio_only:
            # Phones segmentation
            phones_sequence = self.phones[sequence_id]
            phones_segmented_sequence = segment_axis(
                phones_sequence, frame_length, overlap)
            self.phones[sequence_id] = phones_segmented_sequence
            # phones_segmented_sequence = scipy.stats.mode(
            #     phones_segmented_sequence, axis=1)[0].flatten()
            # phones_segmented_sequence = numpy.asarray(
            #     phones_segmented_sequence, dtype='int')
            # phones_sequence_list.append(phones_segmented_sequence)
            # Phonemes segmentation
            phonemes_sequence = self.phonemes[sequence_id]
            phonemes_segmented_sequence = segment_axis(
                phonemes_sequence, frame_length, overlap)
            self.phonemes[sequence_id] = phonemes_segmented_sequence
            # phonemes_segmented_sequence = scipy.stats.mode(
            #     phonemes_segmented_sequence, axis=1)[0].flatten()
            # phonemes_segmented_sequence = numpy.asarray(
            #     phonemes_segmented_sequence, dtype='int')
            # phonemes_sequence_list.append(phonemes_segmented_sequence)
            # Words segmentation
            words_sequence = self.words[sequence_id]
            words_segmented_sequence = segment_axis(
                words_sequence, frame_length, overlap)
            self.words[sequence_id] = words_segmented_sequence
            # words_segmented_sequence = scipy.stats.mode(
            #     words_segmented_sequence, axis=1)[0].flatten()
            # words_segmented_sequence = numpy.asarray(
            #     words_segmented_sequence, dtype='int')
            # words_sequence_list.append(words_segmented_sequence)
        # TODO: look at this, does it force copying the data?
        # Sequence segmentation
        samples_segmented_sequence = segment_axis(samples_sequence,
                                                  frame_length,
                                                  overlap)
        self.raw_wav[sequence_id] = samples_segmented_sequence

        # TODO: change me
        # Generate features/targets/phones/phonemes/words map
        num_frames = samples_segmented_sequence.shape[0]
        num_examples = num_frames - self.frames_per_example
        examples_per_sequence.append(num_examples)

    self.cumulative_example_indexes = numpy.cumsum(examples_per_sequence)
    self.samples_sequences = self.raw_wav
    if not self.audio_only:
        self.phones_sequences = self.phones
        self.phonemes_sequences = self.phonemes
        self.words_sequences = self.words
    self.num_examples = self.cumulative_example_indexes[-1]

    # DataSpecs
    features_space = VectorSpace(dim=self.frame_length *
                                 self.frames_per_example)
    features_source = 'features'

    def features_map_fn(indexes):
        rval = []
        for sequence_index, example_index in self._fetch_index(indexes):
            rval.append(self.samples_sequences[sequence_index]
                        [example_index:example_index +
                         self.frames_per_example].ravel())
        return rval

    targets_space = VectorSpace(dim=self.frame_length)
    targets_source = 'targets'

    def targets_map_fn(indexes):
        rval = []
        for sequence_index, example_index in self._fetch_index(indexes):
            rval.append(self.samples_sequences[sequence_index][
                example_index + self.frames_per_example].ravel())
        return rval

    space_components = [features_space, targets_space]
    source_components = [features_source, targets_source]
    map_fn_components = [features_map_fn, targets_map_fn]
    batch_components = [None, None]

    if not self.audio_only:
        phones_space = IndexSpace(max_labels=self.num_phones, dim=1,
                                  dtype=str(
                                      self.phones_sequences[0].dtype))
        phones_source = 'phones'

        def phones_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                rval.append(self.phones_sequences[sequence_index][
                    example_index + self.frames_per_example].ravel())
            return rval

        phonemes_space = IndexSpace(max_labels=self.num_phonemes, dim=1,
                                    dtype=str(
                                        self.phonemes_sequences[0].dtype))
        phonemes_source = 'phonemes'

        def phonemes_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                rval.append(self.phonemes_sequences[sequence_index][
                    example_index + self.frames_per_example].ravel())
            return rval

        words_space = IndexSpace(max_labels=self.num_words, dim=1,
                                 dtype=str(self.words_sequences[0].dtype))
        words_source = 'words'

        def words_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                rval.append(self.words_sequences[sequence_index][
                    example_index + self.frames_per_example].ravel())
            return rval

        speaker_id_space = IndexSpace(max_labels=self.num_speakers,
                                      dim=1,
                                      dtype=str(self.speaker_id.dtype))
        speaker_id_source = 'speaker_id'

        def speaker_id_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                rval.append(self.speaker_id[sequence_index].ravel())
            return rval

        dialect_space = IndexSpace(max_labels=8, dim=1, dtype='int32')
        dialect_source = 'dialect'

        def dialect_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                info = self.speaker_info_list[
                    self.speaker_id[sequence_index]]
                rval.append(index_from_one_hot(info[1:9]))
            return rval

        education_space = IndexSpace(max_labels=6, dim=1, dtype='int32')
        education_source = 'education'

        def education_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                info = self.speaker_info_list[
                    self.speaker_id[sequence_index]]
                rval.append(index_from_one_hot(info[9:15]))
            return rval

        race_space = IndexSpace(max_labels=8, dim=1, dtype='int32')
        race_source = 'race'

        def race_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                info = self.speaker_info_list[
                    self.speaker_id[sequence_index]]
                rval.append(index_from_one_hot(info[16:24]))
            return rval

        gender_space = IndexSpace(max_labels=2, dim=1, dtype='int32')
        gender_source = 'gender'

        def gender_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                info = self.speaker_info_list[
                    self.speaker_id[sequence_index]]
                rval.append(index_from_one_hot(info[24:]))
            return rval

        space_components.extend([
            phones_space, phonemes_space, words_space, speaker_id_space,
            dialect_space, education_space, race_space, gender_space
        ])
        source_components.extend([
            phones_source, phonemes_source, words_source,
            speaker_id_source, dialect_source, education_source,
            race_source, gender_source
        ])
        map_fn_components.extend([
            phones_map_fn, phonemes_map_fn, words_map_fn,
            speaker_id_map_fn, dialect_map_fn, education_map_fn,
            race_map_fn, gender_map_fn
        ])
        batch_components.extend(
            [None, None, None, None, None, None, None, None])

    space = CompositeSpace(space_components)
    source = tuple(source_components)
    self.data_specs = (space, source)
    self.map_functions = tuple(map_fn_components)
    self.batch_buffers = batch_components

    # Defaults for iterators
    self._iter_mode = resolve_iterator_class('shuffled_sequential')
    self._iter_data_specs = (CompositeSpace(
        (features_space, targets_space)),
        (features_source, targets_source))
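# The dialect/education/race/gender map functions above slice one-hot
# blocks out of the speaker-info vector and decode them with
# index_from_one_hot, which is defined elsewhere in the repo. A plausible
# minimal equivalent, assuming argmax semantics:
import numpy

def index_from_one_hot(one_hot):
    # Assumed semantics: position of the single nonzero entry.
    return int(numpy.argmax(one_hot))

info = numpy.zeros(26)
info[1 + 3] = 1  # dialect region 4 within the info[1:9] slice
print(index_from_one_hot(info[1:9]))  # 3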
def __init__(self, which_set, frame_length, overlap=0,
             frames_per_example=1, start=0, stop=None, audio_only=False,
             rng=_default_seed, noise=False):
    """
    Parameters
    ----------
    which_set : str
        Either "train", "valid" or "test"
    frame_length : int
        Number of acoustic samples contained in a frame
    overlap : int, optional
        Number of overlapping acoustic samples for two consecutive
        frames. Defaults to 0, meaning frames don't overlap.
    frames_per_example : int, optional
        Number of frames in a training example. Defaults to 1.
    start : int, optional
        Starting index of the sequences to use. Defaults to 0.
    stop : int, optional
        Ending index of the sequences to use. Defaults to `None`,
        meaning sequences are selected all the way to the end of the
        array.
    audio_only : bool, optional
        Whether to load only the raw audio and no auxiliary
        information. Defaults to `False`.
    rng : object, optional
        A random number generator used for picking random indices
        into the design matrix when choosing minibatches.
    noise : bool, optional
        Whether to add the per-epoch noise stored in
        `noise_this_epoch` to the features. Defaults to `False`.
    """
    self.frame_length = frame_length
    self.overlap = overlap
    self.frames_per_example = frames_per_example
    self.offset = self.frame_length - self.overlap
    self.audio_only = audio_only
    self.noise = noise

    # RNG initialization
    if hasattr(rng, 'random_integers'):
        self.rng = rng
    else:
        self.rng = numpy.random.RandomState(rng)

    # Load data from disk
    self._load_data(which_set)
    # Standardize data
    for i, sequence in enumerate(self.raw_wav):
        self.raw_wav[i] = (sequence - TIMIT._mean) / TIMIT._std

    # Slice data
    if stop is not None:
        self.raw_wav = self.raw_wav[start:stop]
        if not self.audio_only:
            self.phones = self.phones[start:stop]
            self.phonemes = self.phonemes[start:stop]
            self.words = self.words[start:stop]
    else:
        self.raw_wav = self.raw_wav[start:]
        if not self.audio_only:
            self.phones = self.phones[start:]
            self.phonemes = self.phonemes[start:]
            self.words = self.words[start:]

    examples_per_sequence = [0]

    for sequence_id, samples_sequence in enumerate(self.raw_wav):
        if not self.audio_only:
            # Phones segmentation
            phones_sequence = self.phones[sequence_id]
            phones_segmented_sequence = segment_axis(phones_sequence,
                                                     frame_length,
                                                     overlap)
            self.phones[sequence_id] = phones_segmented_sequence
            # phones_segmented_sequence = scipy.stats.mode(
            #     phones_segmented_sequence, axis=1)[0].flatten()
            # phones_segmented_sequence = numpy.asarray(
            #     phones_segmented_sequence, dtype='int')
            # phones_sequence_list.append(phones_segmented_sequence)
            # Phonemes segmentation
            phonemes_sequence = self.phonemes[sequence_id]
            phonemes_segmented_sequence = segment_axis(phonemes_sequence,
                                                       frame_length,
                                                       overlap)
            self.phonemes[sequence_id] = phonemes_segmented_sequence
            # phonemes_segmented_sequence = scipy.stats.mode(
            #     phonemes_segmented_sequence, axis=1)[0].flatten()
            # phonemes_segmented_sequence = numpy.asarray(
            #     phonemes_segmented_sequence, dtype='int')
            # phonemes_sequence_list.append(phonemes_segmented_sequence)
            # Words segmentation
            words_sequence = self.words[sequence_id]
            words_segmented_sequence = segment_axis(words_sequence,
                                                    frame_length,
                                                    overlap)
            self.words[sequence_id] = words_segmented_sequence
            # words_segmented_sequence = scipy.stats.mode(
            #     words_segmented_sequence, axis=1)[0].flatten()
            # words_segmented_sequence = numpy.asarray(
            #     words_segmented_sequence, dtype='int')
            # words_sequence_list.append(words_segmented_sequence)
        # TODO: look at this, does it force copying the data?
        # Sequence segmentation
        samples_segmented_sequence = segment_axis(samples_sequence,
                                                  frame_length,
                                                  overlap)
        self.raw_wav[sequence_id] = samples_segmented_sequence

        # TODO: change me
        # Generate features/targets/phones/phonemes/words map
        num_frames = samples_segmented_sequence.shape[0]
        num_examples = num_frames - self.frames_per_example
        examples_per_sequence.append(num_examples)

    self.cumulative_example_indexes = numpy.cumsum(examples_per_sequence)
    self.samples_sequences = self.raw_wav
    if not self.audio_only:
        self.phones_sequences = self.phones
        self.phonemes_sequences = self.phonemes
        self.words_sequences = self.words
    self.num_examples = self.cumulative_example_indexes[-1]

    # DataSpecs
    features_space = VectorSpace(
        dim=self.frame_length * self.frames_per_example
    )
    features_source = 'features'

    def features_map_fn(indexes):
        rval = []
        for sequence_index, example_index in self._fetch_index(indexes):
            rval.append(self.samples_sequences[sequence_index][
                example_index:example_index +
                self.frames_per_example].ravel())
        return rval

    def features_map_fn_noise(indexes):
        rval = []
        for sequence_index, example_index in self._fetch_index(indexes):
            rval.append(
                (self.samples_sequences[sequence_index][
                    example_index:example_index +
                    self.frames_per_example] +
                 self.noise_this_epoch[sequence_index][
                     example_index:example_index +
                     self.frames_per_example]).ravel())
        return rval

    targets_space = VectorSpace(dim=self.frame_length)
    targets_source = 'targets'

    def targets_map_fn(indexes):
        rval = []
        for sequence_index, example_index in self._fetch_index(indexes):
            rval.append(self.samples_sequences[sequence_index][
                example_index + self.frames_per_example].ravel())
        return rval

    space_components = [features_space, targets_space]
    source_components = [features_source, targets_source]
    if not self.noise:
        map_fn_components = [features_map_fn, targets_map_fn]
    else:
        map_fn_components = [features_map_fn_noise, targets_map_fn]
    batch_components = [None, None]

    if not self.audio_only:
        num_phones = numpy.max([numpy.max(sequence) for sequence
                                in self.phones]) + 1
        phones_space = IndexSpace(max_labels=num_phones, dim=1,
                                  dtype=str(
                                      self.phones_sequences[0].dtype))
        phones_source = 'phones'

        def phones_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                rval.append(self.phones_sequences[sequence_index][
                    example_index + self.frames_per_example].ravel())
            return rval

        num_phonemes = numpy.max([numpy.max(sequence) for sequence
                                  in self.phonemes]) + 1
        phonemes_space = IndexSpace(max_labels=num_phonemes, dim=1,
                                    dtype=str(
                                        self.phonemes_sequences[0].dtype))
        phonemes_source = 'phonemes'

        def phonemes_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                rval.append(self.phonemes_sequences[sequence_index][
                    example_index + self.frames_per_example].ravel())
            return rval

        num_words = numpy.max([numpy.max(sequence) for sequence
                               in self.words]) + 1
        words_space = IndexSpace(max_labels=num_words, dim=1,
                                 dtype=str(self.words_sequences[0].dtype))
        words_source = 'words'

        def words_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                rval.append(self.words_sequences[sequence_index][
                    example_index + self.frames_per_example].ravel())
            return rval

        space_components.extend([phones_space, phonemes_space,
                                 words_space])
        source_components.extend([phones_source, phonemes_source,
                                  words_source])
        map_fn_components.extend([phones_map_fn, phonemes_map_fn,
                                  words_map_fn])
        batch_components.extend([None, None, None])

    space = CompositeSpace(space_components)
    source = tuple(source_components)
    self.data_specs = (space, source)
    self.map_functions = tuple(map_fn_components)
    self.batch_buffers = batch_components

    # Defaults for iterators
    self._iter_mode = resolve_iterator_class('shuffled_sequential')
    self._iter_data_specs = (CompositeSpace((features_space,
                                             targets_space)),
                             (features_source, targets_source))
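# features_map_fn_noise above reads self.noise_this_epoch, which is not
# created in this constructor; presumably it is refreshed between epochs
# elsewhere in the codebase. A hedged sketch of what such a refresh could
# look like (the function name, sigma, the Gaussian choice and the
# refresh schedule are all assumptions):
import numpy

def resample_noise(dataset, sigma=0.01, rng=None):
    # Draw one noise array per segmented sequence, shaped like the clean
    # frames, so features_map_fn_noise can add the two frame-for-frame.
    if rng is None:
        rng = numpy.random.RandomState(0)
    dataset.noise_this_epoch = [sigma * rng.standard_normal(seq.shape)
                                for seq in dataset.samples_sequences]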
def __init__(self, which_set, frame_length, overlap=0.5,
             frames_per_example=1, start=0, stop=None, audio_only=True,
             n_prev_phones=0, n_next_phones=0, samples_to_predict=1,
             filter_fn=None, rng=_default_seed, b=1.019, step=64,
             n_channels=64):
    """
    Parameters
    ----------
    which_set : str
        Either "train", "valid" or "test"
    frame_length : int
        Number of acoustic samples contained in a frame
    overlap : int or float, optional
        Number of overlapping acoustic samples for two consecutive
        frames. Values below 1.0 are interpreted as a fraction of
        `frame_length`. Defaults to 0.5, i.e. half-overlapping frames.
    frames_per_example : int, optional
        Number of frames in a training example. Defaults to 1.
    start : int, optional
        Starting index of the sequences to use. Defaults to 0.
    stop : int, optional
        Ending index of the sequences to use. Defaults to `None`,
        meaning sequences are selected all the way to the end of the
        array.
    audio_only : bool, optional
        Whether to load only the raw audio and no auxiliary
        information. Defaults to `True`.
    rng : object, optional
        A random number generator used for picking random indices
        into the design matrix when choosing minibatches.
    b : float, optional
        Bandwidth parameter of the gammatone dictionary atoms.
    step : int, optional
        Temporal step between successive atoms in the dictionary.
    n_channels : int, optional
        Number of ERB-spaced gammatone center frequencies.
    """
    self.frame_length = frame_length
    if overlap < 1.0:
        # Fractional overlap: convert to a number of samples
        self.overlap = overlap * frame_length
    else:
        self.overlap = overlap
    self.frames_per_example = frames_per_example
    self.offset = self.frame_length - self.overlap
    self.audio_only = audio_only
    self.n_prev_phones = n_prev_phones
    self.n_next_phones = n_next_phones
    self.samples_to_predict = samples_to_predict
    self.b = b
    self.step = step
    self.n_channels = n_channels

    print "Frame length %d, overlap %d" % (self.frame_length,
                                           self.overlap)

    # Initializing the dictionary
    self.D = numpy.r_[tuple(gammatone_matrix(self.b, fc,
                                             self.frame_length,
                                             self.step)
                            for fc in erb_space(150, 8000,
                                                self.n_channels))]
    print "Using dictionary with shape", self.D.shape

    self.coder = SparseCoder(dictionary=self.D,
                             transform_n_nonzero_coefs=None,
                             transform_alpha=None,
                             transform_algorithm="omp")

    # RNG initialization
    if hasattr(rng, "random_integers"):
        self.rng = rng
    else:
        self.rng = numpy.random.RandomState(rng)

    # Load data from disk
    self._load_data(which_set)

    examples_per_sequence = [0]

    for sequence_id, samples_sequence in enumerate(self.raw_wav):
        print "Sentence %d/%d" % (sequence_id, len(self.raw_wav))
        # Window the signal and encode each frame on the gammatone
        # dictionary. Note: `self.overlap` (in samples) is used here;
        # passing the raw `overlap` argument would hand segment_axis a
        # fraction whenever overlap < 1.0.
        X = segment_axis(samples_sequence, frame_length,
                         int(self.overlap), end="pad")
        X = numpy.hanning(self.frame_length) * X
        self.raw_wav[sequence_id] = scipy.sparse.csr_matrix(
            self.coder.transform(X))

        # TODO: change me
        # Generate features/targets/phones/phonemes/words map
        num_frames = self.raw_wav[sequence_id].shape[0]
        num_examples = num_frames - self.frames_per_example
        examples_per_sequence.append(num_examples)

    self.cumulative_example_indexes = numpy.cumsum(examples_per_sequence)
    self.samples_sequences = self.raw_wav
    numpy.save("%s_sparse_frames.npy" % which_set,
               self.samples_sequences)
    self.num_examples = self.cumulative_example_indexes[-1]

    # DataSpecs
    features_space = VectorSpace(dim=self.D.shape[0] *
                                 self.frames_per_example)
    features_source = "features"

    def features_map_fn(indexes):
        rval = []
        for sequence_index, example_index in self._fetch_index(indexes):
            rval.append(self.samples_sequences[sequence_index][
                example_index:example_index +
                self.frames_per_example].todense())
        return rval

    targets_space = VectorSpace(dim=self.D.shape[0])
    targets_source = "targets"

    def targets_map_fn(indexes):
        rval = []
        for sequence_index, example_index in self._fetch_index(indexes):
            rval.append(self.samples_sequences[sequence_index][
                example_index + self.frames_per_example].todense())
        return rval

    space_components = [features_space, targets_space]
    source_components = [features_source, targets_source]
    map_fn_components = [features_map_fn, targets_map_fn]
    batch_components = [None, None]

    space = CompositeSpace(space_components)
    source = tuple(source_components)
    self.data_specs = (space, source)
    self.map_functions = tuple(map_fn_components)
    self.batch_buffers = batch_components

    # Defaults for iterators
    self._iter_mode = resolve_iterator_class("shuffled_sequential")
    self._iter_data_specs = (CompositeSpace((features_space,
                                             targets_space)),
                             (features_source, targets_source))
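# A toy, self-contained version of the windowing + OMP encoding pipeline
# above, with a random unit-norm dictionary standing in for the
# gammatone one (all sizes here are illustrative):
import numpy
from sklearn.decomposition import SparseCoder

rng = numpy.random.RandomState(0)
frame_length = 64
D = rng.standard_normal((256, frame_length))
D /= numpy.linalg.norm(D, axis=1, keepdims=True)  # OMP expects unit atoms

coder = SparseCoder(dictionary=D, transform_n_nonzero_coefs=10,
                    transform_algorithm='omp')

signal = rng.standard_normal(1000)
# Hanning-windowed, half-overlapping frames, as in the constructor above
hop = frame_length // 2
frames = numpy.stack([signal[i:i + frame_length] *
                      numpy.hanning(frame_length)
                      for i in range(0, len(signal) - frame_length, hop)])
codes = coder.transform(frames)  # one sparse code row per frame
print(codes.shape)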
def __init__(self, which_set, frame_length, overlap=0,
             frames_per_example=1, start=0, stop=None, audio_only=False,
             rng=_default_seed):
    """
    Parameters
    ----------
    which_set : str
        Either "train", "valid" or "test"
    frame_length : int
        Number of acoustic samples contained in a frame
    overlap : int, optional
        Number of overlapping acoustic samples for two consecutive
        frames. Defaults to 0, meaning frames don't overlap.
    frames_per_example : int, optional
        Number of frames in a training example. Defaults to 1.
    start : int, optional
        Starting index of the sequences to use. Defaults to 0.
    stop : int, optional
        Ending index of the sequences to use. Defaults to `None`,
        meaning sequences are selected all the way to the end of the
        array.
    audio_only : bool, optional
        Whether to load only the raw audio and no auxiliary
        information. Defaults to `False`.
    rng : object, optional
        A random number generator used for picking random indices
        into the design matrix when choosing minibatches.
    """
    self.frame_length = frame_length
    self.overlap = overlap
    self.frames_per_example = frames_per_example
    self.offset = self.frame_length - self.overlap
    self.audio_only = audio_only

    # RNG initialization
    if hasattr(rng, 'random_integers'):
        self.rng = rng
    else:
        self.rng = numpy.random.RandomState(rng)

    # Load data from disk
    self._load_data(which_set)
    # Standardize data
    for i, sequence in enumerate(self.raw_wav):
        self.raw_wav[i] = (sequence - TIMIT._mean) / TIMIT._std

    if not self.audio_only:
        self.num_phones = numpy.max(
            [numpy.max(sequence) for sequence in self.phones]) + 1
        self.num_phonemes = numpy.max(
            [numpy.max(sequence) for sequence in self.phonemes]) + 1
        self.num_words = numpy.max(
            [numpy.max(sequence) for sequence in self.words]) + 1
        # The following is hard coded. However, the way it is done
        # above could be problematic if a max value (the max over the
        # whole dataset (train + valid + test)) is not present in at
        # least one of the three subsets. This is the case for
        # speakers. This is not the case for phones.
        self.num_speakers = 630

    # Slice data
    if stop is not None:
        self.raw_wav = self.raw_wav[start:stop]
        if not self.audio_only:
            self.phones = self.phones[start:stop]
            self.phonemes = self.phonemes[start:stop]
            self.words = self.words[start:stop]
            self.speaker_id = self.speaker_id[start:stop]
    else:
        self.raw_wav = self.raw_wav[start:]
        if not self.audio_only:
            self.phones = self.phones[start:]
            self.phonemes = self.phonemes[start:]
            self.words = self.words[start:]
            self.speaker_id = self.speaker_id[start:]

    examples_per_sequence = [0]

    for sequence_id, samples_sequence in enumerate(self.raw_wav):
        if not self.audio_only:
            # Phones segmentation
            phones_sequence = self.phones[sequence_id]
            phones_segmented_sequence = segment_axis(phones_sequence,
                                                     frame_length,
                                                     overlap)
            self.phones[sequence_id] = phones_segmented_sequence
            # phones_segmented_sequence = scipy.stats.mode(
            #     phones_segmented_sequence, axis=1)[0].flatten()
            # phones_segmented_sequence = numpy.asarray(
            #     phones_segmented_sequence, dtype='int')
            # phones_sequence_list.append(phones_segmented_sequence)
            # Phonemes segmentation
            phonemes_sequence = self.phonemes[sequence_id]
            phonemes_segmented_sequence = segment_axis(phonemes_sequence,
                                                       frame_length,
                                                       overlap)
            self.phonemes[sequence_id] = phonemes_segmented_sequence
            # phonemes_segmented_sequence = scipy.stats.mode(
            #     phonemes_segmented_sequence, axis=1)[0].flatten()
            # phonemes_segmented_sequence = numpy.asarray(
            #     phonemes_segmented_sequence, dtype='int')
            # phonemes_sequence_list.append(phonemes_segmented_sequence)
            # Words segmentation
            words_sequence = self.words[sequence_id]
            words_segmented_sequence = segment_axis(words_sequence,
                                                    frame_length,
                                                    overlap)
            self.words[sequence_id] = words_segmented_sequence
            # words_segmented_sequence = scipy.stats.mode(
            #     words_segmented_sequence, axis=1)[0].flatten()
            # words_segmented_sequence = numpy.asarray(
            #     words_segmented_sequence, dtype='int')
            # words_sequence_list.append(words_segmented_sequence)
        # TODO: look at this, does it force copying the data?
        # Sequence segmentation
        samples_segmented_sequence = segment_axis(samples_sequence,
                                                  frame_length,
                                                  overlap)
        self.raw_wav[sequence_id] = samples_segmented_sequence

        # TODO: change me
        # Generate features/targets/phones/phonemes/words map
        num_frames = samples_segmented_sequence.shape[0]
        num_examples = num_frames - self.frames_per_example
        examples_per_sequence.append(num_examples)

    self.cumulative_example_indexes = numpy.cumsum(examples_per_sequence)
    self.samples_sequences = self.raw_wav
    if not self.audio_only:
        self.phones_sequences = self.phones
        self.phonemes_sequences = self.phonemes
        self.words_sequences = self.words
    self.num_examples = self.cumulative_example_indexes[-1]

    # DataSpecs
    features_space = VectorSpace(
        dim=self.frame_length * self.frames_per_example
    )
    features_source = 'features'

    def features_map_fn(indexes):
        rval = []
        for sequence_index, example_index in self._fetch_index(indexes):
            rval.append(self.samples_sequences[sequence_index][
                example_index:example_index +
                self.frames_per_example].ravel())
        return rval

    targets_space = VectorSpace(dim=self.frame_length)
    targets_source = 'targets'

    def targets_map_fn(indexes):
        rval = []
        for sequence_index, example_index in self._fetch_index(indexes):
            rval.append(self.samples_sequences[sequence_index][
                example_index + self.frames_per_example].ravel())
        return rval

    space_components = [features_space, targets_space]
    source_components = [features_source, targets_source]
    map_fn_components = [features_map_fn, targets_map_fn]
    batch_components = [None, None]

    if not self.audio_only:
        phones_space = IndexSpace(max_labels=self.num_phones, dim=1,
                                  dtype=str(
                                      self.phones_sequences[0].dtype))
        phones_source = 'phones'

        def phones_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                rval.append(self.phones_sequences[sequence_index][
                    example_index + self.frames_per_example].ravel())
            return rval

        phonemes_space = IndexSpace(max_labels=self.num_phonemes, dim=1,
                                    dtype=str(
                                        self.phonemes_sequences[0].dtype))
        phonemes_source = 'phonemes'

        def phonemes_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                rval.append(self.phonemes_sequences[sequence_index][
                    example_index + self.frames_per_example].ravel())
            return rval

        words_space = IndexSpace(max_labels=self.num_words, dim=1,
                                 dtype=str(self.words_sequences[0].dtype))
        words_source = 'words'

        def words_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                rval.append(self.words_sequences[sequence_index][
                    example_index + self.frames_per_example].ravel())
            return rval

        speaker_id_space = IndexSpace(max_labels=self.num_speakers,
                                      dim=1,
                                      dtype=str(self.speaker_id.dtype))
        speaker_id_source = 'speaker_id'

        def speaker_id_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                rval.append(self.speaker_id[sequence_index].ravel())
            return rval

        dialect_space = IndexSpace(max_labels=8, dim=1, dtype='int32')
        dialect_source = 'dialect'

        def dialect_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                info = self.speaker_info_list[
                    self.speaker_id[sequence_index]]
                rval.append(index_from_one_hot(info[1:9]))
            return rval

        education_space = IndexSpace(max_labels=6, dim=1, dtype='int32')
        education_source = 'education'

        def education_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                info = self.speaker_info_list[
                    self.speaker_id[sequence_index]]
                rval.append(index_from_one_hot(info[9:15]))
            return rval

        race_space = IndexSpace(max_labels=8, dim=1, dtype='int32')
        race_source = 'race'

        def race_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                info = self.speaker_info_list[
                    self.speaker_id[sequence_index]]
                rval.append(index_from_one_hot(info[16:24]))
            return rval

        gender_space = IndexSpace(max_labels=2, dim=1, dtype='int32')
        gender_source = 'gender'

        def gender_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                info = self.speaker_info_list[
                    self.speaker_id[sequence_index]]
                rval.append(index_from_one_hot(info[24:]))
            return rval

        space_components.extend([phones_space, phonemes_space,
                                 words_space, speaker_id_space,
                                 dialect_space, education_space,
                                 race_space, gender_space])
        source_components.extend([phones_source, phonemes_source,
                                  words_source, speaker_id_source,
                                  dialect_source, education_source,
                                  race_source, gender_source])
        map_fn_components.extend([phones_map_fn, phonemes_map_fn,
                                  words_map_fn, speaker_id_map_fn,
                                  dialect_map_fn, education_map_fn,
                                  race_map_fn, gender_map_fn])
        batch_components.extend([None, None, None, None, None, None,
                                 None, None])

    space = CompositeSpace(space_components)
    source = tuple(source_components)
    self.data_specs = (space, source)
    self.map_functions = tuple(map_fn_components)
    self.batch_buffers = batch_components

    # Defaults for iterators
    self._iter_mode = resolve_iterator_class('shuffled_sequential')
    self._iter_data_specs = (CompositeSpace((features_space,
                                             targets_space)),
                             (features_source, targets_source))
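# All the map functions above rely on self._fetch_index, which is not
# shown in this section. Given the cumulative_example_indexes array built
# in the constructor, a plausible sketch is a searchsorted lookup that
# maps flat example ids back to (sequence_index, example_index) pairs:
import numpy

def fetch_index(cumulative_example_indexes, indexes):
    seq = numpy.searchsorted(cumulative_example_indexes, indexes,
                             side='right') - 1
    return [(s, i - cumulative_example_indexes[s])
            for s, i in zip(seq, indexes)]

cum = numpy.cumsum([0, 5, 3, 7])  # three sequences: 5, 3 and 7 examples
print(fetch_index(cum, [0, 4, 5, 14]))
# [(0, 0), (0, 4), (1, 0), (2, 6)]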
def __init__(self, which_set, frame_length, start=0, stop=None,
             audio_only=False, rng=_default_seed):
    """
    Parameters
    ----------
    which_set : str
        Either "train", "valid" or "test"
    frame_length : int
        Number of acoustic samples contained in the sliding window
    start : int, optional
        Starting index of the sequences to use. Defaults to 0.
    stop : int, optional
        Ending index of the sequences to use. Defaults to `None`,
        meaning sequences are selected all the way to the end of the
        array.
    audio_only : bool, optional
        Whether to load only the raw audio and no auxiliary
        information. Defaults to `False`.
    rng : object, optional
        A random number generator used for picking random indices
        into the design matrix when choosing minibatches.
    """
    self.frame_length = frame_length
    self.audio_only = audio_only

    # RNG initialization
    if hasattr(rng, 'random_integers'):
        self.rng = rng
    else:
        self.rng = numpy.random.RandomState(rng)

    # Load data from disk
    self._load_data(which_set)
    # Standardize data
    for i, sequence in enumerate(self.raw_wav):
        self.raw_wav[i] = (sequence - TIMIT._mean) / TIMIT._std

    if not self.audio_only:
        self.num_phones = numpy.max([numpy.max(sequence) for sequence
                                     in self.phones]) + 1
        self.num_phonemes = numpy.max([numpy.max(sequence) for sequence
                                       in self.phonemes]) + 1
        self.num_words = numpy.max([numpy.max(sequence) for sequence
                                    in self.words]) + 1

    # Slice data
    if stop is not None:
        self.raw_wav = self.raw_wav[start:stop]
        if not self.audio_only:
            self.phones = self.phones[start:stop]
            self.phonemes = self.phonemes[start:stop]
            self.words = self.words[start:stop]
    else:
        self.raw_wav = self.raw_wav[start:]
        if not self.audio_only:
            self.phones = self.phones[start:]
            self.phonemes = self.phonemes[start:]
            self.words = self.words[start:]

    samples_sequences = []
    targets_sequences = []
    phones_sequences = []
    phonemes_sequences = []
    words_sequences = []
    for sequence_id, samples_sequence in enumerate(self.raw_wav):
        # Sequence segmentation
        samples_segmented_sequence = segment_axis(samples_sequence,
                                                  frame_length,
                                                  frame_length - 1)[:-1]
        samples_sequences.append(samples_segmented_sequence)
        targets_sequences.append(samples_sequence[frame_length:].reshape(
            (samples_sequence[frame_length:].shape[0], 1)
        ))
        if not self.audio_only:
            target_phones = self.phones[sequence_id][frame_length:]
            phones_sequences.append(target_phones.reshape(
                (target_phones.shape[0], 1)
            ))
            target_phonemes = self.phonemes[sequence_id][frame_length:]
            phonemes_sequences.append(target_phonemes.reshape(
                (target_phonemes.shape[0], 1)
            ))
            target_words = self.words[sequence_id][frame_length:]
            words_sequences.append(target_words.reshape(
                (target_words.shape[0], 1)
            ))

    del self.raw_wav
    self.samples_sequences = samples_sequences
    self.targets_sequences = targets_sequences
    self.data = [samples_sequences, targets_sequences]
    if not self.audio_only:
        del self.phones
        del self.phonemes
        del self.words
        self.phones_sequences = phones_sequences
        self.phonemes_sequences = phonemes_sequences
        self.words_sequences = words_sequences
        self.data.extend([phones_sequences, phonemes_sequences,
                          words_sequences])
    self.num_examples = len(samples_sequences)

    # DataSpecs
    features_space = VectorSequenceSpace(dim=self.frame_length)
    features_source = 'features'

    targets_space = VectorSequenceSpace(dim=1)
    targets_source = 'targets'

    space_components = [features_space, targets_space]
    source_components = [features_source, targets_source]
    batch_components = [None, None]

    if not self.audio_only:
        phones_space = IndexSequenceSpace(
            max_labels=self.num_phones,
            dim=1,
            dtype=str(self.phones_sequences[0].dtype)
        )
        phones_source = 'phones'

        phonemes_space = IndexSequenceSpace(
            max_labels=self.num_phonemes,
            dim=1,
            dtype=str(self.phonemes_sequences[0].dtype)
        )
        phonemes_source = 'phonemes'

        words_space = IndexSequenceSpace(
            max_labels=self.num_words,
            dim=1,
            dtype=str(self.words_sequences[0].dtype)
        )
        words_source = 'words'

        space_components.extend([phones_space, phonemes_space,
                                 words_space])
        source_components.extend([phones_source, phonemes_source,
                                  words_source])
        batch_components.extend([None, None, None])

    space = CompositeSpace(space_components)
    source = tuple(source_components)
    self.data_specs = (space, source)
    self.batch_buffers = batch_components

    # Defaults for iterators
    self._iter_mode = resolve_iterator_class('shuffled_sequential')
    self._iter_data_specs = (CompositeSpace((features_space,
                                             targets_space)),
                             (features_source, targets_source))
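# A hedged usage sketch for this constructor. The dataset object is
# assumed to be an instance of the class this __init__ belongs to, and
# the iterator call assumes the standard pylearn2 Dataset.iterator
# signature; both are illustrative only:
def first_sequence_batch(dataset):
    it = dataset.iterator(mode='sequential', batch_size=1,
                          data_specs=dataset._iter_data_specs)
    features, targets = next(it)  # one whole utterance per batch
    return features, targets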
def get_markov_frames(self, subset, id):
    """
    Given the subset and an id, this method returns the list
    [input_frames, input_phonemes, input_words, output_phoneme,
    output_word, spkr_info, output_frame, ending_phoneme,
    ending_word].
    """
    assert subset + "_intervals_seq" in self.__dict__.keys()
    assert id < self.__dict__[subset + "_intervals_seq"][-1]

    n_frames_in = self.__dict__[subset + "_n_frames_in"]
    frame_length = self.__dict__[subset + "_frame_length"]
    overlap = self.__dict__[subset + "_overlap"]
    wav_length = self.__dict__[subset + "_wav_length"]
    intervals_seq = self.__dict__[subset + "_intervals_seq"]

    # Find the acoustic samples sequence we are looking for
    seq_id = np.digitize([id], intervals_seq) - 1
    seq_id = seq_id[0]

    # Find the position in this sequence
    idx_in_seq = id - intervals_seq[seq_id] - (wav_length - frame_length
                                               + overlap)

    # Get the sequence
    wav_seq = self.__dict__[subset + "_raw_wav"][seq_id]

    # Get the phonemes
    phn_l_start = self.__dict__[subset + "_seq_to_phn"][seq_id][0]
    phn_l_end = self.__dict__[subset + "_seq_to_phn"][seq_id][1]
    phn_start_end = self.__dict__[subset + "_phn"][phn_l_start:phn_l_end]
    phn_seq = np.zeros_like(wav_seq)
    # Some timestamp does not correspond to any phoneme so 0 is
    # the index for "NO_PHONEME" and the other index are shifted by one
    for (phn_start, phn_end, phn) in phn_start_end:
        phn_seq[phn_start:phn_end] = phn + 1

    # Get the words
    wrd_l_start = self.__dict__[subset + "_seq_to_wrd"][seq_id][0]
    wrd_l_end = self.__dict__[subset + "_seq_to_wrd"][seq_id][1]
    wrd_start_end = self.__dict__[subset + "_wrd"][wrd_l_start:wrd_l_end]
    wrd_seq = np.zeros_like(wav_seq)
    # Some timestamp does not correspond to any word so 0 is
    # the index for "NO_WORD" and the other index are shifted by one
    for (wrd_start, wrd_end, wrd) in wrd_start_end:
        wrd_seq[wrd_start:wrd_end] = wrd + 1

    # Binary variable announcing the end of the word or phoneme
    end_phn = np.zeros_like(phn_seq)
    end_wrd = np.zeros_like(wrd_seq)

    for i in range(len(phn_seq) - 1):
        if phn_seq[i] != phn_seq[i + 1]:
            end_phn[i] = 1
        if wrd_seq[i] != wrd_seq[i + 1]:
            end_wrd[i] = 1
    end_phn[-1] = 1
    end_wrd[-1] = 1

    # Find the speaker id
    spkr_id = self.__dict__[subset + "_spkr"][seq_id]
    # Find the speaker info
    spkr_info = self.spkrinfo[spkr_id]

    # Pick the selected segment
    padded_wav_seq = np.zeros(wav_length)
    if idx_in_seq < 0:
        padded_wav_seq[-idx_in_seq:] = wav_seq[0:(wav_length + idx_in_seq)]
    else:
        padded_wav_seq = wav_seq[idx_in_seq:(idx_in_seq + wav_length)]

    padded_phn_seq = np.zeros(wav_length)
    if idx_in_seq < 0:
        padded_phn_seq[-idx_in_seq:] = phn_seq[0:(wav_length + idx_in_seq)]
    else:
        padded_phn_seq = phn_seq[idx_in_seq:(idx_in_seq + wav_length)]

    padded_wrd_seq = np.zeros(wav_length)
    if idx_in_seq < 0:
        padded_wrd_seq[-idx_in_seq:] = wrd_seq[0:(wav_length + idx_in_seq)]
    else:
        padded_wrd_seq = wrd_seq[idx_in_seq:(idx_in_seq + wav_length)]

    # Segment into frames
    wav_seq = segment_axis(padded_wav_seq, frame_length, overlap)

    # Take the most occurring phoneme in a sequence
    phn_seq = segment_axis(padded_phn_seq, frame_length, overlap)
    phn_seq = scipy.stats.mode(phn_seq, axis=1)[0].flatten()
    phn_seq = np.asarray(phn_seq, dtype='int')

    # Take the most occurring word in a sequence
    wrd_seq = segment_axis(padded_wrd_seq, frame_length, overlap)
    wrd_seq = scipy.stats.mode(wrd_seq, axis=1)[0].flatten()
    wrd_seq = np.asarray(wrd_seq, dtype='int')

    # Announce the end if and only if it was announced in the current
    # frame
    end_phn = segment_axis(end_phn, frame_length, overlap)
    end_phn = end_phn.max(axis=1)
    end_wrd = segment_axis(end_wrd, frame_length, overlap)
    end_wrd = end_wrd.max(axis=1)

    # Put names on the output
    input_frames = wav_seq[:-1]
    input_phonemes = phn_seq[:-1]
    input_words = wrd_seq[:-1]
    output_phoneme = phn_seq[-1]
    output_word = wrd_seq[-1]
    output_frame = wav_seq[-1]
    ending_phoneme = end_phn[-1]
    ending_word = end_wrd[-1]

    return [input_frames, input_phonemes, input_words, output_phoneme,
            output_word, spkr_info, output_frame, ending_phoneme,
            ending_word]
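# The label pipeline used above (intervals -> per-sample labels ->
# per-frame mode) in a tiny self-contained example; the list
# comprehension stands in for segment_axis with end='cut':
import numpy
import scipy.stats

wav_len, frame_length, overlap = 12, 4, 2
phn_start_end = [(0, 5, 0), (5, 12, 1)]  # toy (start, end, label) rows

phn_seq = numpy.zeros(wav_len)
for (phn_start, phn_end, phn) in phn_start_end:
    phn_seq[phn_start:phn_end] = phn + 1  # 0 is reserved for NO_PHONEME

hop = frame_length - overlap
frames = numpy.stack([phn_seq[i:i + frame_length]
                      for i in range(0, wav_len - frame_length + 1, hop)])
frame_labels = scipy.stats.mode(frames, axis=1)[0].flatten().astype(int)
print(frame_labels)  # most frequent label per frame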
def get_raw_seq(self, subset, seq_id, frame_length, overlap):
    """
    Given the id of the subset, the id of the sequence, the frame
    length and the overlap between frames, this method returns the
    frame sequence of that utterance, the associated phone and word
    sequences (each with a binary end-of-unit indicator) and the
    speaker information vector.
    """
    self.check_subset_value(subset)
    self.check_subset_presence(subset)

    # Check if the id is valid
    n_seq = self.__dict__[subset + "_n_seq"]
    if seq_id >= n_seq:
        raise ValueError("This sequence does not exist.")

    # Get the sequence
    wav_seq = self.__dict__[subset + "_raw_wav"][seq_id]

    # Get the phonemes
    phn_l_start = self.__dict__[subset + "_seq_to_phn"][seq_id][0]
    phn_l_end = self.__dict__[subset + "_seq_to_phn"][seq_id][1]
    phn_start_end = self.__dict__[subset + "_phn"][phn_l_start:phn_l_end]
    phn_seq = np.zeros_like(wav_seq)
    # Some timestamp does not correspond to any phoneme so 0 is
    # the index for "NO_PHONEME" and the other index are shifted by one
    for (phn_start, phn_end, phn) in phn_start_end:
        phn_seq[phn_start:phn_end] = phn + 1

    # Get the words
    wrd_l_start = self.__dict__[subset + "_seq_to_wrd"][seq_id][0]
    wrd_l_end = self.__dict__[subset + "_seq_to_wrd"][seq_id][1]
    wrd_start_end = self.__dict__[subset + "_wrd"][wrd_l_start:wrd_l_end]
    wrd_seq = np.zeros_like(wav_seq)
    # Some timestamp does not correspond to any word so 0 is
    # the index for "NO_WORD" and the other index are shifted by one
    for (wrd_start, wrd_end, wrd) in wrd_start_end:
        wrd_seq[wrd_start:wrd_end] = wrd + 1

    # Binary variable announcing the end of the word or phoneme
    end_phn = np.zeros_like(phn_seq)
    end_wrd = np.zeros_like(wrd_seq)

    for i in range(len(phn_seq) - 1):
        if phn_seq[i] != phn_seq[i + 1]:
            end_phn[i] = 1
        if wrd_seq[i] != wrd_seq[i + 1]:
            end_wrd[i] = 1
    end_phn[-1] = 1
    end_wrd[-1] = 1

    # Find the speaker id
    spkr_id = self.__dict__[subset + "_spkr"][seq_id]
    # Find the speaker info
    spkr_info = self.spkrinfo[spkr_id]

    # Segment into frames
    wav_seq = segment_axis(wav_seq, frame_length, overlap)

    # Take the most occurring phoneme in a sequence
    phn_seq = segment_axis(phn_seq, frame_length, overlap)
    phn_seq = scipy.stats.mode(phn_seq, axis=1)[0].flatten()
    phn_seq = np.asarray(phn_seq, dtype='int')

    # Take the most occurring word in a sequence
    wrd_seq = segment_axis(wrd_seq, frame_length, overlap)
    wrd_seq = scipy.stats.mode(wrd_seq, axis=1)[0].flatten()
    wrd_seq = np.asarray(wrd_seq, dtype='int')

    # Announce the end if and only if it was announced in the current
    # frame
    end_phn = segment_axis(end_phn, frame_length, overlap)
    end_phn = end_phn.max(axis=1)
    end_wrd = segment_axis(end_wrd, frame_length, overlap)
    end_wrd = end_wrd.max(axis=1)

    return [wav_seq, phn_seq, end_phn, wrd_seq, end_wrd, spkr_info]
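# A hedged usage sketch for get_raw_seq. `dataset` is assumed to expose
# the method above with the 'train' subset already loaded; frame_length
# and overlap values are illustrative:
def show_first_utterance(dataset):
    wav_seq, phn_seq, end_phn, wrd_seq, end_wrd, spkr_info = \
        dataset.get_raw_seq('train', seq_id=0, frame_length=240,
                            overlap=120)
    print(wav_seq.shape)  # (n_frames, 240): one 240-sample frame per row
    print(phn_seq[:10])   # most frequent phone per frame (0 = NO_PHONEME)
    print(end_phn[:10])   # 1 where a phone ends inside the frame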
def __init__(self, which_set, frame_length, overlap=0,
             frames_per_example=1, start=0, stop=None,
             rng=_default_seed):
    """
    Parameters
    ----------
    which_set : str
        Either "train", "valid" or "test"
    frame_length : int
        Number of acoustic samples contained in a frame
    overlap : int, optional
        Number of overlapping acoustic samples for two consecutive
        frames. Defaults to 0, meaning frames don't overlap.
    frames_per_example : int, optional
        Number of frames in a training example. Defaults to 1.
    start : int, optional
        Starting index of the sequences to use. Defaults to 0.
    stop : int, optional
        Ending index of the sequences to use. Defaults to `None`,
        meaning sequences are selected all the way to the end of the
        array.
    rng : object, optional
        A random number generator used for picking random indices
        into the design matrix when choosing minibatches.
    """
    self.frame_length = frame_length
    self.overlap = overlap
    self.frames_per_example = frames_per_example
    self.offset = self.frame_length - self.overlap

    if hasattr(rng, 'random_integers'):
        self.rng = rng
    else:
        self.rng = numpy.random.RandomState(rng)

    # Load data from disk
    self._load_data(which_set)

    if stop is not None:
        self.raw_wav = self.raw_wav[start:stop]
    else:
        self.raw_wav = self.raw_wav[start:]

    features_map = []
    targets_map = []
    phones_map = []
    words_map = []

    n_seq = len(self.raw_wav)
    self.phones_seq = []
    self.phonemes_seq = []
    self.wrd_seq = []
    for sequence_id in range(len(self.raw_wav)):
        # Get the phonemes
        phn_l_start = self.sequences_to_phonemes[sequence_id][0]
        phn_l_end = self.sequences_to_phonemes[sequence_id][1]
        phonemes_start_end = self.phonemes[phn_l_start:phn_l_end]
        phones_start_end = self.phones[phn_l_start:phn_l_end]
        phonemes_sequence = numpy.zeros(len(self.raw_wav[sequence_id]))
        phones_sequence = numpy.zeros(len(self.raw_wav[sequence_id]))
        # Some timestamp does not correspond to any phoneme so 0 is
        # the index for "NO_PHONEME" and the other index are shifted
        # by one
        for (phn_start, phn_end, phn) in phonemes_start_end:
            phonemes_sequence[phn_start:phn_end] = phn + 1
        for (phn_start, phn_end, phn) in phones_start_end:
            phones_sequence[phn_start:phn_end] = phn + 1

        phonemes_segmented_sequence = segment_axis(phonemes_sequence,
                                                   frame_length, overlap)
        phonemes_segmented_sequence = scipy.stats.mode(
            phonemes_segmented_sequence, axis=1)[0].flatten()
        phonemes_segmented_sequence = numpy.asarray(
            phonemes_segmented_sequence, dtype='int')
        self.phonemes_seq.append(phonemes_segmented_sequence)

        phones_segmented_sequence = segment_axis(phones_sequence,
                                                 frame_length, overlap)
        phones_segmented_sequence = scipy.stats.mode(
            phones_segmented_sequence, axis=1)[0].flatten()
        phones_segmented_sequence = numpy.asarray(
            phones_segmented_sequence, dtype='int')
        self.phones_seq.append(phones_segmented_sequence)

        # Get the words
        wrd_l_start = self.sequences_to_words[sequence_id][0]
        wrd_l_end = self.sequences_to_words[sequence_id][1]
        wrd_start_end = self.words[wrd_l_start:wrd_l_end]
        wrd_sequence = numpy.zeros(len(self.raw_wav[sequence_id]))
        # Some timestamp does not correspond to any word so 0 is
        # the index for "NO_WORD" and the other index are shifted by one
        for (wrd_start, wrd_end, wrd) in wrd_start_end:
            wrd_sequence[wrd_start:wrd_end] = wrd + 1

        wrd_segmented_sequence = segment_axis(wrd_sequence, frame_length,
                                              overlap)
        wrd_segmented_sequence = scipy.stats.mode(
            wrd_segmented_sequence, axis=1)[0].flatten()
        wrd_segmented_sequence = numpy.asarray(wrd_segmented_sequence,
                                               dtype='int')
        self.wrd_seq.append(wrd_segmented_sequence)

    self.phones_seq = numpy.array(self.phones_seq)
    self.phonemes_seq = numpy.array(self.phonemes_seq)
    self.wrd_seq = numpy.array(self.wrd_seq)

    for sequence_id, sequence in enumerate(self.raw_wav):
        segmented_sequence = segment_axis(sequence, frame_length,
                                          overlap)
        self.raw_wav[sequence_id] = segmented_sequence
        num_frames = segmented_sequence.shape[0]
        num_examples = num_frames - self.frames_per_example
        for example_id in xrange(num_examples):
            features_map.append([sequence_id, example_id,
                                 example_id + self.frames_per_example])
            targets_map.append([sequence_id,
                                example_id + self.frames_per_example])
            phones_map.append([sequence_id, example_id])
            words_map.append([sequence_id, example_id])

    features_map = numpy.asarray(features_map)
    targets_map = numpy.asarray(targets_map)
    phones_map = numpy.asarray(phones_map)
    words_map = numpy.asarray(words_map)

    self.num_examples = features_map.shape[0]

    # DataSpecs
    features_space = VectorSpace(
        dim=self.frame_length * self.frames_per_example
    )
    features_source = 'features'
    features_dtype = self.raw_wav[0].dtype
    features_map_fn = lambda indexes: [
        self.raw_wav[index[0]][index[1]:index[2]].ravel()
        for index in features_map[indexes]
    ]

    targets_space = VectorSpace(dim=self.frame_length)
    targets_source = 'targets'
    targets_dtype = self.raw_wav[0].dtype
    targets_map_fn = lambda indexes: [
        self.raw_wav[index[0]][index[1]]
        for index in targets_map[indexes]
    ]

    phones_space = VectorSpace(dim=1)
    phones_source = 'phones'
    phones_dtype = self.phones_seq[0].dtype
    phones_map_fn = lambda indexes: [
        self.phones_seq[index[0]][index[1]]
        for index in phones_map[indexes]
    ]

    phonemes_space = VectorSpace(dim=1)
    phonemes_source = 'phonemes'
    phonemes_dtype = self.phonemes_seq[0].dtype
    # Phonemes reuse `phones_map`: no separate map is built, and the
    # (sequence_id, example_id) pairs are the same for both sources.
    phonemes_map_fn = lambda indexes: [
        self.phonemes_seq[index[0]][index[1]]
        for index in phones_map[indexes]
    ]

    words_space = VectorSpace(dim=1)
    words_source = 'words'
    words_dtype = self.wrd_seq[0].dtype
    words_map_fn = lambda indexes: [
        self.wrd_seq[index[0]][index[1]]
        for index in words_map[indexes]
    ]

    space = CompositeSpace((features_space, targets_space, phones_space,
                            phonemes_space, words_space))
    source = (features_source, targets_source, phones_source,
              phonemes_source, words_source)
    self.data_specs = (space, source)
    self.dtypes = (features_dtype, targets_dtype, phones_dtype,
                   phonemes_dtype, words_dtype)
    self.map_functions = (features_map_fn, targets_map_fn, phones_map_fn,
                          phonemes_map_fn, words_map_fn)

    # Defaults for iterators
    self._iter_mode = resolve_iterator_class('shuffled_sequential')
    self._iter_data_specs = (CompositeSpace((features_space,
                                             targets_space)),
                             (features_source, targets_source))
def get_markov_frames(self, subset, id):
    """
    Given the subset and an id, this method returns the list
    [input_frames, input_phonemes, input_words, output_phoneme,
    output_word, spkr_info, output_frame, ending_phoneme, ending_word].
    """
    assert subset + "_intervals_seq" in self.__dict__.keys()
    assert id < self.__dict__[subset + "_intervals_seq"][-1]

    n_frames_in = self.__dict__[subset + "_n_frames_in"]
    frame_length = self.__dict__[subset + "_frame_length"]
    overlap = self.__dict__[subset + "_overlap"]
    wav_length = self.__dict__[subset + "_wav_length"]
    intervals_seq = self.__dict__[subset + "_intervals_seq"]

    # Find the acoustic samples sequence we are looking for
    seq_id = np.digitize([id], intervals_seq) - 1
    seq_id = seq_id[0]

    # Find the position in this sequence
    idx_in_seq = id - intervals_seq[seq_id] - (wav_length - frame_length +
                                               overlap)

    # Get the sequence
    wav_seq = self.__dict__[subset + "_raw_wav"][seq_id]

    # Get the phonemes
    phn_l_start = self.__dict__[subset + "_seq_to_phn"][seq_id][0]
    phn_l_end = self.__dict__[subset + "_seq_to_phn"][seq_id][1]
    phn_start_end = self.__dict__[subset + "_phn"][phn_l_start:phn_l_end]
    phn_seq = np.zeros_like(wav_seq)
    # Some timestamps do not correspond to any phoneme, so 0 is the
    # index for "NO_PHONEME" and the other indices are shifted by one
    for (phn_start, phn_end, phn) in phn_start_end:
        phn_seq[phn_start:phn_end] = phn + 1

    # Get the words
    wrd_l_start = self.__dict__[subset + "_seq_to_wrd"][seq_id][0]
    wrd_l_end = self.__dict__[subset + "_seq_to_wrd"][seq_id][1]
    wrd_start_end = self.__dict__[subset + "_wrd"][wrd_l_start:wrd_l_end]
    wrd_seq = np.zeros_like(wav_seq)
    # Some timestamps do not correspond to any word, so 0 is the index
    # for "NO_WORD" and the other indices are shifted by one
    for (wrd_start, wrd_end, wrd) in wrd_start_end:
        wrd_seq[wrd_start:wrd_end] = wrd + 1

    # Binary variables announcing the end of the word or phoneme
    end_phn = np.zeros_like(phn_seq)
    end_wrd = np.zeros_like(wrd_seq)
    for i in range(len(phn_seq) - 1):
        if phn_seq[i] != phn_seq[i + 1]:
            end_phn[i] = 1
        if wrd_seq[i] != wrd_seq[i + 1]:
            end_wrd[i] = 1
    end_phn[-1] = 1
    end_wrd[-1] = 1

    # Find the speaker id
    spkr_id = self.__dict__[subset + "_spkr"][seq_id]
    # Find the speaker info
    spkr_info = self.spkrinfo[spkr_id]

    # Pick the selected segment, zero-padding on the left when the
    # segment starts before the beginning of the sequence
    padded_wav_seq = np.zeros(wav_length)
    if idx_in_seq < 0:
        padded_wav_seq[-idx_in_seq:] = wav_seq[0:(wav_length + idx_in_seq)]
    else:
        padded_wav_seq = wav_seq[idx_in_seq:(idx_in_seq + wav_length)]

    padded_phn_seq = np.zeros(wav_length)
    if idx_in_seq < 0:
        padded_phn_seq[-idx_in_seq:] = phn_seq[0:(wav_length + idx_in_seq)]
    else:
        padded_phn_seq = phn_seq[idx_in_seq:(idx_in_seq + wav_length)]

    padded_wrd_seq = np.zeros(wav_length)
    if idx_in_seq < 0:
        padded_wrd_seq[-idx_in_seq:] = wrd_seq[0:(wav_length + idx_in_seq)]
    else:
        padded_wrd_seq = wrd_seq[idx_in_seq:(idx_in_seq + wav_length)]

    # Segment into frames
    wav_seq = segment_axis(padded_wav_seq, frame_length, overlap)

    # Take the most frequent phoneme in each frame
    phn_seq = segment_axis(padded_phn_seq, frame_length, overlap)
    phn_seq = scipy.stats.mode(phn_seq, axis=1)[0].flatten()
    phn_seq = np.asarray(phn_seq, dtype='int')

    # Take the most frequent word in each frame
    wrd_seq = segment_axis(padded_wrd_seq, frame_length, overlap)
    wrd_seq = scipy.stats.mode(wrd_seq, axis=1)[0].flatten()
    wrd_seq = np.asarray(wrd_seq, dtype='int')

    # Announce the end if and only if it was announced in the current
    # frame
    end_phn = segment_axis(end_phn, frame_length, overlap)
    end_phn = end_phn.max(axis=1)
    end_wrd = segment_axis(end_wrd, frame_length, overlap)
    end_wrd = end_wrd.max(axis=1)

    # Put names on the outputs
    input_frames = wav_seq[:-1]
    input_phonemes = phn_seq[:-1]
    input_words = wrd_seq[:-1]
    output_phoneme = phn_seq[-1]
    output_word = wrd_seq[-1]
    output_frame = wav_seq[-1]
    ending_phoneme = end_phn[-1]
    ending_word = end_wrd[-1]

    return [input_frames, input_phonemes, input_words, output_phoneme,
            output_word, spkr_info, output_frame, ending_phoneme,
            ending_word]
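# Hedged usage sketch (illustrative; not from the original source). The
# variable `dataset`, the subset name "train" and the id 0 are assumptions:
#
#     (input_frames, input_phonemes, input_words, output_phoneme,
#      output_word, spkr_info, output_frame, ending_phoneme,
#      ending_word) = dataset.get_markov_frames("train", 0)
#
# input_frames/input_phonemes/input_words hold every frame but the last;
# output_frame (with output_phoneme and output_word) is the final frame the
# model is asked to predict, and ending_phoneme/ending_word flag whether a
# phoneme or word boundary falls in that last frame.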
def get_raw_seq(self, subset, seq_id, frame_length, overlap):
    """
    Given the subset, the id of a sequence, the frame length and the
    overlap between frames, this method returns the segmented frame
    sequence from that subset, the associated phoneme and word sequences
    (each with a binary variable indicating a phoneme or word change)
    and the speaker information vector.
    """
    self.check_subset_value(subset)
    self.check_subset_presence(subset)

    # Check if the id is valid
    n_seq = self.__dict__[subset + "_n_seq"]
    if seq_id >= n_seq:
        raise ValueError("This sequence does not exist.")

    # Get the sequence
    wav_seq = self.__dict__[subset + "_raw_wav"][seq_id]

    # Get the phonemes
    phn_l_start = self.__dict__[subset + "_seq_to_phn"][seq_id][0]
    phn_l_end = self.__dict__[subset + "_seq_to_phn"][seq_id][1]
    phn_start_end = self.__dict__[subset + "_phn"][phn_l_start:phn_l_end]
    phn_seq = np.zeros_like(wav_seq)
    # Some timestamps do not correspond to any phoneme, so 0 is the
    # index for "NO_PHONEME" and the other indices are shifted by one
    for (phn_start, phn_end, phn) in phn_start_end:
        phn_seq[phn_start:phn_end] = phn + 1

    # Get the words
    wrd_l_start = self.__dict__[subset + "_seq_to_wrd"][seq_id][0]
    wrd_l_end = self.__dict__[subset + "_seq_to_wrd"][seq_id][1]
    wrd_start_end = self.__dict__[subset + "_wrd"][wrd_l_start:wrd_l_end]
    wrd_seq = np.zeros_like(wav_seq)
    # Some timestamps do not correspond to any word, so 0 is the index
    # for "NO_WORD" and the other indices are shifted by one
    for (wrd_start, wrd_end, wrd) in wrd_start_end:
        wrd_seq[wrd_start:wrd_end] = wrd + 1

    # Binary variables announcing the end of the word or phoneme
    end_phn = np.zeros_like(phn_seq)
    end_wrd = np.zeros_like(wrd_seq)
    for i in range(len(phn_seq) - 1):
        if phn_seq[i] != phn_seq[i + 1]:
            end_phn[i] = 1
        if wrd_seq[i] != wrd_seq[i + 1]:
            end_wrd[i] = 1
    end_phn[-1] = 1
    end_wrd[-1] = 1

    # Find the speaker id
    spkr_id = self.__dict__[subset + "_spkr"][seq_id]
    # Find the speaker info
    spkr_info = self.spkrinfo[spkr_id]

    # Segment into frames
    wav_seq = segment_axis(wav_seq, frame_length, overlap)

    # Take the most frequent phoneme in each frame
    phn_seq = segment_axis(phn_seq, frame_length, overlap)
    phn_seq = scipy.stats.mode(phn_seq, axis=1)[0].flatten()
    phn_seq = np.asarray(phn_seq, dtype='int')

    # Take the most frequent word in each frame
    wrd_seq = segment_axis(wrd_seq, frame_length, overlap)
    wrd_seq = scipy.stats.mode(wrd_seq, axis=1)[0].flatten()
    wrd_seq = np.asarray(wrd_seq, dtype='int')

    # Announce the end if and only if it was announced in the current
    # frame
    end_phn = segment_axis(end_phn, frame_length, overlap)
    end_phn = end_phn.max(axis=1)
    end_wrd = segment_axis(end_wrd, frame_length, overlap)
    end_wrd = end_wrd.max(axis=1)

    return [wav_seq, phn_seq, end_phn, wrd_seq, end_wrd, spkr_info]
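# Hedged usage sketch (illustrative; not from the original source). The
# variable `dataset` and the argument values are assumptions:
#
#     wav_seq, phn_seq, end_phn, wrd_seq, end_wrd, spkr_info = \
#         dataset.get_raw_seq("train", 0, frame_length=240, overlap=120)
#
# wav_seq has shape (n_frames, frame_length); phn_seq and wrd_seq hold one
# integer label per frame (0 meaning NO_PHONEME/NO_WORD); end_phn and
# end_wrd are 1 for frames containing a phoneme or word boundary.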