def _generate_data(self):
        """
        Generates X matrix for DenseDesignMatrix initialization
        function.
        """
        X = numpy.zeros((self.samples+1, 2))
        X[0, :] = self.init_state

        for i in range(1, X.shape[0]):
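            # Henon map step: x_{n+1} = 1 - alpha * x_n**2 + y_n,
            #                 y_{n+1} = beta * x_n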
            X[i, 0] = 1 - self.alpha*X[i-1, 0]**2 + X[i-1, 1]
            X[i, 1] = self.beta*X[i-1, 0]

        last_target = X[-1, :]
        X = X[:-1, :]
        X = X.reshape(self.samples * 2)  # Flatten; reshape is not in-place

        Z = segment_axis(X, length=self.frame_length, overlap=0)

        y = numpy.zeros((Z.shape[0], 2))
        y[:-1, :] = Z[1:, 0:2]
        y[-1, :] = last_target  # X[-1, :]

        return (Z, y)
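
# Illustration (not from the original source): `segment_axis` splits a signal
# into frames of `length` samples where consecutive frames share `overlap`
# samples. A minimal, self-contained numpy sketch of the assumed semantics:
#
#     import numpy
#
#     def frames(a, length, overlap=0):
#         step = length - overlap
#         n = 1 + (len(a) - length) // step
#         return numpy.array([a[i * step:i * step + length]
#                             for i in range(n)])
#
#     frames(numpy.arange(6), length=4, overlap=2)
#     # array([[0, 1, 2, 3],
#     #        [2, 3, 4, 5]])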
Example #2
    def __init__(self, which_set, frame_length, overlap=0,
                 frames_per_example=1, start=0, stop=None,
                 audio_only=False, n_prev_phones=0, n_next_phones=0,
                 samples_to_predict=1, filter_fn=None,
                 rng=_default_seed, b=1.019, step=8, n_channels=50):
        """
        Parameters
        ----------
        which_set : str
            Either "train", "valid" or "test"
        frame_length : int
            Number of acoustic samples contained in a frame
        overlap : int, optional
            Number of overlapping acoustic samples for two consecutive frames.
            Defaults to 0, meaning frames don't overlap.
        frames_per_example : int, optional
            Number of frames in a training example. Defaults to 1.
        start : int, optional
            Starting index of the sequences to use. Defaults to 0.
        stop : int, optional
            Ending index of the sequences to use. Defaults to `None`, meaning
            sequences are selected all the way to the end of the array.
        audio_only : bool, optional
            Whether to load only the raw audio and no auxiliary information.
            Defaults to `False`.
        n_prev_phones : int, optional
            Number of preceding frames of phone context attached to each
            example. Defaults to 0.
        n_next_phones : int, optional
            Number of following frames of phone context attached to each
            example. Defaults to 0.
        samples_to_predict : int, optional
            Number of acoustic samples to predict. Defaults to 1.
        filter_fn : str, optional
            String that evaluates (via `eval`) to a callable selecting
            sequence indexes from the speaker information. Defaults to
            `None`, meaning no filtering.
        rng : object, optional
            A random number generator used for picking random indices into the
            design matrix when choosing minibatches.
        b : float, optional
            Bandwidth parameter of the gammatone filters used to build the
            dictionary. Defaults to 1.019.
        step : int, optional
            Shift, in samples, between successive gammatone atoms in the
            dictionary. Defaults to 8.
        n_channels : int, optional
            Number of ERB-spaced gammatone channels (150 Hz to 8000 Hz) used
            to build the dictionary. Defaults to 50.
        """
        self.frame_length = frame_length
        self.overlap = overlap
        self.frames_per_example = frames_per_example
        self.offset = self.frame_length - self.overlap
        self.audio_only = audio_only
        self.n_prev_phones = n_prev_phones
        self.n_next_phones = n_next_phones
        self.samples_to_predict = samples_to_predict
        self.b = b
        self.step = step
        self.n_channels = n_channels

        # Initializing the dictionary
        self.D = numpy.r_[tuple(gammatone_matrix(self.b, fc,
                                                 self.frame_length,
                                                 self.step) for fc in
                                erb_space(150, 8000,
                                          self.n_channels))]
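        # Rows of D are time-shifted gammatone atoms, one group per ERB-spaced
        # center frequency; D.shape[0] is therefore the dimensionality of the
        # sparse code used for the feature and target spaces below.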

        # self.coder = SparseCoder(dictionary=self.D,
        #                          transform_n_nonzero_coefs=None,
        #                          transform_alpha=1,
        #                          transform_algorithm='omp')

        # RNG initialization
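        # (duck-typed: anything exposing `random_integers` is used directly,
        # otherwise `rng` is treated as a seed)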
        if hasattr(rng, 'random_integers'):
            self.rng = rng
        else:
            self.rng = numpy.random.RandomState(rng)

        # Load data from disk
        self._load_data(which_set)
        # Standardize data
        for i, sequence in enumerate(self.raw_wav):
            self.raw_wav[i] = self.scaler.transform(sequence)

        if filter_fn is not None:
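            # `filter_fn` arrives as a string and is eval'd into a callable
            # that picks out sequence indexes from the speaker information.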
            filter_fn = eval(filter_fn)
            indexes = filter_fn(self.speaker_info_list[self.speaker_id])
            self.raw_wav = self.raw_wav[indexes]
            if not self.audio_only:
                self.phones = self.phones[indexes]
                self.phonemes = self.phonemes[indexes]
                self.words = self.words[indexes]

        # Slice data
        if stop is not None:
            self.raw_wav = self.raw_wav[start:stop]
            if not self.audio_only:
                self.phones = self.phones[start:stop]
                self.phonemes = self.phonemes[start:stop]
                self.words = self.words[start:stop]
        else:
            self.raw_wav = self.raw_wav[start:]
            if not self.audio_only:
                self.phones = self.phones[start:]
                self.phonemes = self.phonemes[start:]
                self.words = self.words[start:]

        examples_per_sequence = [0]

        for sequence_id, samples_sequence in enumerate(self.raw_wav):
            if not self.audio_only:
                # Phones segmentation
                phones_sequence = self.phones[sequence_id]
                phones_segmented_sequence = segment_axis(phones_sequence,
                                                         frame_length,
                                                         overlap)
                # Most common phone within each frame, stacked with the modes
                # of the n_prev_phones preceding and n_next_phones following
                # frames.
                phones_mode = numpy.concatenate(
                    [scipy.stats.mode(
                        phones_segmented_sequence[k - self.n_prev_phones:
                                                  k + self.n_next_phones + 1],
                        axis=1)[0].T
                     for k in range(self.n_prev_phones,
                                    len(phones_segmented_sequence)
                                    - self.n_next_phones)])
                self.phones[sequence_id] = numpy.asarray(phones_mode,
                                                         dtype=numpy.int16)
                
                # Phonemes segmentation
                phonemes_sequence = self.phonemes[sequence_id]
                phonemes_segmented_sequence = segment_axis(phonemes_sequence,
                                                           frame_length,
                                                           overlap)
                if self.n_next_phones == 0:
                    self.phonemes[sequence_id] = phonemes_segmented_sequence[self.n_prev_phones:]
                else:
                    self.phonemes[sequence_id] = phonemes_segmented_sequence[self.n_prev_phones:-self.n_next_phones]
                
                # Words segmentation
                words_sequence = self.words[sequence_id]
                words_segmented_sequence = segment_axis(words_sequence,
                                                        frame_length,
                                                        overlap)
                if self.n_next_phones == 0:
                    self.words[sequence_id] = words_segmented_sequence[self.n_prev_phones:]
                else:
                    self.words[sequence_id] = words_segmented_sequence[self.n_prev_phones:-self.n_next_phones]

            if self.n_next_phones == 0:
                self.raw_wav[sequence_id] = self.raw_wav[sequence_id][self.n_prev_phones:]
            else:
                self.raw_wav[sequence_id] = self.raw_wav[sequence_id][self.n_prev_phones:-self.n_next_phones]

            # TODO: change me
            # Generate features/targets/phones/phonemes/words map
            num_frames = self.raw_wav[sequence_id].shape[0]
            num_examples = num_frames - self.frames_per_example
            examples_per_sequence.append(num_examples)

        self.cumulative_example_indexes = numpy.cumsum(examples_per_sequence)
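        # cumulative_example_indexes[i] is the flat index of the first example
        # drawn from sequence i; _fetch_index (not shown here) presumably maps
        # a flat example index back to a (sequence, offset) pair.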
        self.samples_sequences = self.raw_wav
        #numpy.save('/home/jfsantos/data/%s_sparse_frames.npy' % which_set, self.samples_sequences)
        if not self.audio_only:
            self.phones_sequences = self.phones
            self.phonemes_sequences = self.phonemes
            self.words_sequences = self.words
        self.num_examples = self.cumulative_example_indexes[-1]

        # DataSpecs
        features_space = VectorSpace(
            dim=self.D.shape[0] * self.frames_per_example
        )
        features_source = 'features'
        def features_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(indexes):
                rval.append(self.samples_sequences[sequence_index][
                    example_index:example_index +
                    self.frames_per_example].todense())
            return rval

        targets_space = VectorSpace(dim=self.D.shape[0])
        targets_source = 'targets'
        def targets_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(indexes):
                rval.append(self.samples_sequences[sequence_index][
                    example_index + self.frames_per_example].todense())
            return rval
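        # Each feature example is `frames_per_example` consecutive sparse-coded
        # frames, densified on the fly; its target is the sparse code of the
        # frame that follows.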

        space_components = [features_space, targets_space]
        source_components = [features_source, targets_source]
        map_fn_components = [features_map_fn, targets_map_fn]
        batch_components = [None, None]

        if not self.audio_only:
            num_phones = numpy.max([numpy.max(sequence) for sequence
                                    in self.phones]) + 1
            phones_space = IndexSpace(
                max_labels=num_phones,
                dim=1 + self.n_prev_phones + self.n_next_phones,
                dtype=str(self.phones_sequences[0].dtype))
            phones_source = 'phones'
            def phones_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(indexes):
                    rval.append(self.phones_sequences[sequence_index][example_index].ravel())
                return rval

            num_phonemes = numpy.max([numpy.max(sequence) for sequence
                                      in self.phonemes]) + 1
            phonemes_space = IndexSpace(max_labels=num_phonemes, dim=1,
                                        dtype=str(self.phonemes_sequences[0].dtype))
            phonemes_source = 'phonemes'
            def phonemes_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(indexes):
                    rval.append(self.phonemes_sequences[sequence_index][example_index
                        + self.frames_per_example].ravel())
                return rval

            num_words = numpy.max([numpy.max(sequence) for sequence
                                   in self.words]) + 1
            words_space = IndexSpace(max_labels=num_words, dim=1,
                                     dtype=str(self.words_sequences[0].dtype))
            words_source = 'words'
            def words_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(indexes):
                    rval.append(self.words_sequences[sequence_index][example_index
                        + self.frames_per_example].ravel())
                return rval

            space_components.extend([phones_space, phonemes_space,
                                     words_space])
            source_components.extend([phones_source, phonemes_source,
                                     words_source])
            map_fn_components.extend([phones_map_fn, phonemes_map_fn,
                                     words_map_fn])
            batch_components.extend([None, None, None])

        space = CompositeSpace(space_components)
        source = tuple(source_components)
        self.data_specs = (space, source)
        self.map_functions = tuple(map_fn_components)
        self.batch_buffers = batch_components

        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('shuffled_sequential')
        self._iter_data_specs = (CompositeSpace((features_space,
                                                 targets_space)),
                                 (features_source, targets_source))
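
# Hypothetical usage of the constructor above (not from the original source;
# the enclosing class name is not shown in this listing, so `TIMIT` is an
# assumed placeholder and the argument values are illustrative):
#
#     dataset = TIMIT('train', frame_length=160, frames_per_example=4,
#                     stop=100, audio_only=True)
#     space, sources = dataset.data_specs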
Example #3
    def __init__(self,
                 which_set,
                 frame_length,
                 start=0,
                 stop=None,
                 audio_only=False,
                 rng=_default_seed):
        """
        Parameters
        ----------
        which_set : str
            Either "train", "valid" or "test"
        frame_length : int
            Number of acoustic samples contained in the sliding window
        start : int, optional
            Starting index of the sequences to use. Defaults to 0.
        stop : int, optional
            Ending index of the sequences to use. Defaults to `None`, meaning
            sequences are selected all the way to the end of the array.
        audio_only : bool, optional
            Whether to load only the raw audio and no auxiliary information.
            Defaults to `False`.
        rng : object, optional
            A random number generator used for picking random indices into the
            design matrix when choosing minibatches.
        """
        self.frame_length = frame_length
        self.audio_only = audio_only

        # RNG initialization
        if hasattr(rng, 'random_integers'):
            self.rng = rng
        else:
            self.rng = numpy.random.RandomState(rng)

        # Load data from disk
        self._load_data(which_set)
        # Standardize data
        for i, sequence in enumerate(self.raw_wav):
            self.raw_wav[i] = (sequence - TIMIT._mean) / TIMIT._std

        if not self.audio_only:
            self.num_phones = numpy.max(
                [numpy.max(sequence) for sequence in self.phones]) + 1
            self.num_phonemes = numpy.max(
                [numpy.max(sequence) for sequence in self.phonemes]) + 1
            self.num_words = numpy.max(
                [numpy.max(sequence) for sequence in self.words]) + 1

        # Slice data
        if stop is not None:
            self.raw_wav = self.raw_wav[start:stop]
            if not self.audio_only:
                self.phones = self.phones[start:stop]
                self.phonemes = self.phonemes[start:stop]
                self.words = self.words[start:stop]
        else:
            self.raw_wav = self.raw_wav[start:]
            if not self.audio_only:
                self.phones = self.phones[start:]
                self.phonemes = self.phonemes[start:]
                self.words = self.words[start:]

        samples_sequences = []
        targets_sequences = []
        phones_sequences = []
        phonemes_sequences = []
        words_sequences = []
        for sequence_id, samples_sequence in enumerate(self.raw_wav):
            # Sequence segmentation
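            # Hop of one sample (overlap = frame_length - 1); the last frame
            # is dropped so every remaining frame has a next-sample target.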
            samples_segmented_sequence = segment_axis(samples_sequence,
                                                      frame_length,
                                                      frame_length - 1)[:-1]
            samples_sequences.append(samples_segmented_sequence)
            targets_sequences.append(samples_sequence[frame_length:].reshape(
                (samples_sequence[frame_length:].shape[0], 1)))
            if not self.audio_only:
                target_phones = self.phones[sequence_id][frame_length:]
                phones_sequences.append(
                    target_phones.reshape((target_phones.shape[0], 1)))
                target_phonemes = self.phonemes[sequence_id][frame_length:]
                phonemes_sequences.append(
                    target_phonemes.reshape((target_phonemes.shape[0], 1)))
                target_words = self.words[sequence_id][frame_length:]
                words_sequences.append(
                    target_words.reshape((target_words.shape[0], 1)))

        del self.raw_wav
        self.samples_sequences = samples_sequences
        self.targets_sequences = targets_sequences
        self.data = [samples_sequences, targets_sequences]
        if not self.audio_only:
            del self.phones
            del self.phonemes
            del self.words
            self.phones_sequences = phones_sequences
            self.phonemes_sequences = phonemes_sequences
            self.words_sequences = words_sequences
            self.data.extend(
                [phones_sequences, phonemes_sequences, words_sequences])
        self.num_examples = len(samples_sequences)

        # DataSpecs
        features_space = VectorSequenceSpace(dim=self.frame_length)
        features_source = 'features'

        targets_space = VectorSequenceSpace(dim=1)
        targets_source = 'targets'

        space_components = [features_space, targets_space]
        source_components = [features_source, targets_source]
        batch_components = [None, None]

        if not self.audio_only:
            phones_space = IndexSequenceSpace(
                max_labels=self.num_phones,
                dim=1,
                dtype=str(self.phones_sequences[0].dtype))
            phones_source = 'phones'

            phonemes_space = IndexSequenceSpace(
                max_labels=self.num_phonemes,
                dim=1,
                dtype=str(self.phonemes_sequences[0].dtype))
            phonemes_source = 'phonemes'

            words_space = IndexSequenceSpace(
                max_labels=self.num_words,
                dim=1,
                dtype=str(self.words_sequences[0].dtype))
            words_source = 'words'

            space_components.extend(
                [phones_space, phonemes_space, words_space])
            source_components.extend(
                [phones_source, phonemes_source, words_source])
            batch_components.extend([None, None, None])

        space = CompositeSpace(space_components)
        source = tuple(source_components)
        self.data_specs = (space, source)
        self.batch_buffers = batch_components

        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('shuffled_sequential')
        self._iter_data_specs = (CompositeSpace(
            (features_space, targets_space)), (features_source,
                                               targets_source))
Example #4
    def __init__(self,
                 which_set,
                 frame_length,
                 overlap=0,
                 frames_per_example=1,
                 start=0,
                 stop=None,
                 audio_only=False,
                 rng=_default_seed):
        """
        Parameters
        ----------
        which_set : str
            Either "train", "valid" or "test"
        frame_length : int
            Number of acoustic samples contained in a frame
        overlap : int, optional
            Number of overlapping acoustic samples for two consecutive frames.
            Defaults to 0, meaning frames don't overlap.
        frames_per_example : int, optional
            Number of frames in a training example. Defaults to 1.
        start : int, optional
            Starting index of the sequences to use. Defaults to 0.
        stop : int, optional
            Ending index of the sequences to use. Defaults to `None`, meaning
            sequences are selected all the way to the end of the array.
        audio_only : bool, optional
            Whether to load only the raw audio and no auxiliary information.
            Defaults to `False`.
        rng : object, optional
            A random number generator used for picking random indices into the
            design matrix when choosing minibatches.
        """
        self.frame_length = frame_length
        self.overlap = overlap
        self.frames_per_example = frames_per_example
        self.offset = self.frame_length - self.overlap
        self.audio_only = audio_only

        # RNG initialization
        if hasattr(rng, 'random_integers'):
            self.rng = rng
        else:
            self.rng = numpy.random.RandomState(rng)

        # Load data from disk
        self._load_data(which_set)
        # Standardize data
        for i, sequence in enumerate(self.raw_wav):
            self.raw_wav[i] = (sequence - TIMIT._mean) / TIMIT._std

        if not self.audio_only:
            self.num_phones = numpy.max(
                [numpy.max(sequence) for sequence in self.phones]) + 1
            self.num_phonemes = numpy.max(
                [numpy.max(sequence) for sequence in self.phonemes]) + 1
            self.num_words = numpy.max(
                [numpy.max(sequence) for sequence in self.words]) + 1
            # The following is hard coded. However, the way it is done above
            # could be problematic if a max value (the max over the whole
            # dataset (train + valid + test)) is not present in at least one
            # one of the three subsets. This is the case for speakers. This is
            # not the case for phones.
            self.num_speakers = 630

        # Slice data
        if stop is not None:
            self.raw_wav = self.raw_wav[start:stop]
            if not self.audio_only:
                self.phones = self.phones[start:stop]
                self.phonemes = self.phonemes[start:stop]
                self.words = self.words[start:stop]
                self.speaker_id = self.speaker_id[start:stop]
        else:
            self.raw_wav = self.raw_wav[start:]
            if not self.audio_only:
                self.phones = self.phones[start:]
                self.phonemes = self.phonemes[start:]
                self.words = self.words[start:]
                self.speaker_id = self.speaker_id[start:]

        examples_per_sequence = [0]

        for sequence_id, samples_sequence in enumerate(self.raw_wav):
            if not self.audio_only:
                # Phones segmentation
                phones_sequence = self.phones[sequence_id]
                phones_segmented_sequence = segment_axis(
                    phones_sequence, frame_length, overlap)
                self.phones[sequence_id] = phones_segmented_sequence
                # phones_segmented_sequence = scipy.stats.mode(
                #     phones_segmented_sequence,
                #     axis=1
                # )[0].flatten()
                # phones_segmented_sequence = numpy.asarray(
                #     phones_segmented_sequence,
                #     dtype='int'
                # )
                # phones_sequence_list.append(phones_segmented_sequence)
                # Phonemes segmentation
                phonemes_sequence = self.phonemes[sequence_id]
                phonemes_segmented_sequence = segment_axis(
                    phonemes_sequence, frame_length, overlap)
                self.phonemes[sequence_id] = phonemes_segmented_sequence
                # phonemes_segmented_sequence = scipy.stats.mode(
                #     phonemes_segmented_sequence,
                #     axis=1
                # )[0].flatten()
                # phonemes_segmented_sequence = numpy.asarray(
                #     phonemes_segmented_sequence,
                #     dtype='int'
                # )
                # phonemes_sequence_list.append(phonemes_segmented_sequence)
                # Words segmentation
                words_sequence = self.words[sequence_id]
                words_segmented_sequence = segment_axis(
                    words_sequence, frame_length, overlap)
                self.words[sequence_id] = words_segmented_sequence
                # words_segmented_sequence = scipy.stats.mode(
                #     words_segmented_sequence,
                #     axis=1
                # )[0].flatten()
                # words_segmented_sequence = numpy.asarray(words_segmented_sequence,
                #                                          dtype='int')
                # words_sequence_list.append(words_segmented_sequence)

            # TODO: look at this, does it force copying the data?
            # Sequence segmentation
            samples_segmented_sequence = segment_axis(samples_sequence,
                                                      frame_length, overlap)
            self.raw_wav[sequence_id] = samples_segmented_sequence

            # TODO: change me
            # Generate features/targets/phones/phonemes/words map
            num_frames = samples_segmented_sequence.shape[0]
            num_examples = num_frames - self.frames_per_example
            examples_per_sequence.append(num_examples)

        self.cumulative_example_indexes = numpy.cumsum(examples_per_sequence)
        self.samples_sequences = self.raw_wav
        if not self.audio_only:
            self.phones_sequences = self.phones
            self.phonemes_sequences = self.phonemes
            self.words_sequences = self.words
        self.num_examples = self.cumulative_example_indexes[-1]

        # DataSpecs
        features_space = VectorSpace(dim=self.frame_length *
                                     self.frames_per_example)
        features_source = 'features'

        def features_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(indexes):
                rval.append(self.samples_sequences[sequence_index]
                            [example_index:example_index +
                             self.frames_per_example].ravel())
            return rval

        targets_space = VectorSpace(dim=self.frame_length)
        targets_source = 'targets'

        def targets_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(indexes):
                rval.append(self.samples_sequences[sequence_index][
                    example_index + self.frames_per_example].ravel())
            return rval

        space_components = [features_space, targets_space]
        source_components = [features_source, targets_source]
        map_fn_components = [features_map_fn, targets_map_fn]
        batch_components = [None, None]

        if not self.audio_only:
            phones_space = IndexSpace(max_labels=self.num_phones,
                                      dim=1,
                                      dtype=str(
                                          self.phones_sequences[0].dtype))
            phones_source = 'phones'

            def phones_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(
                        indexes):
                    rval.append(self.phones_sequences[sequence_index][
                        example_index + self.frames_per_example].ravel())
                return rval

            phonemes_space = IndexSpace(max_labels=self.num_phonemes,
                                        dim=1,
                                        dtype=str(
                                            self.phonemes_sequences[0].dtype))
            phonemes_source = 'phonemes'

            def phonemes_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(
                        indexes):
                    rval.append(self.phonemes_sequences[sequence_index][
                        example_index + self.frames_per_example].ravel())
                return rval

            words_space = IndexSpace(max_labels=self.num_words,
                                     dim=1,
                                     dtype=str(self.words_sequences[0].dtype))
            words_source = 'words'

            def words_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(
                        indexes):
                    rval.append(self.words_sequences[sequence_index][
                        example_index + self.frames_per_example].ravel())
                return rval

            speaker_id_space = IndexSpace(max_labels=self.num_speakers,
                                          dim=1,
                                          dtype=str(self.speaker_id.dtype))
            speaker_id_source = 'speaker_id'

            def speaker_id_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(
                        indexes):
                    rval.append(self.speaker_id[sequence_index].ravel())
                return rval
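            # speaker_info_list rows are assumed to encode speaker metadata as
            # concatenated one-hot blocks; the fixed slices below (1:9 dialect,
            # 9:15 education, 16:24 race, 24: gender) select each block and
            # index_from_one_hot converts it back to an integer label.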

            dialect_space = IndexSpace(max_labels=8, dim=1, dtype='int32')
            dialect_source = 'dialect'

            def dialect_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(
                        indexes):
                    info = self.speaker_info_list[
                        self.speaker_id[sequence_index]]
                    rval.append(index_from_one_hot(info[1:9]))
                return rval

            education_space = IndexSpace(max_labels=6, dim=1, dtype='int32')
            education_source = 'education'

            def education_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(
                        indexes):
                    info = self.speaker_info_list[
                        self.speaker_id[sequence_index]]
                    rval.append(index_from_one_hot(info[9:15]))
                return rval

            race_space = IndexSpace(max_labels=8, dim=1, dtype='int32')
            race_source = 'race'

            def race_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(
                        indexes):
                    info = self.speaker_info_list[
                        self.speaker_id[sequence_index]]
                    rval.append(index_from_one_hot(info[16:24]))
                return rval

            gender_space = IndexSpace(max_labels=2, dim=1, dtype='int32')
            gender_source = 'gender'

            def gender_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(
                        indexes):
                    info = self.speaker_info_list[
                        self.speaker_id[sequence_index]]
                    rval.append(index_from_one_hot(info[24:]))
                return rval

            space_components.extend([
                phones_space, phonemes_space, words_space, speaker_id_space,
                dialect_space, education_space, race_space, gender_space
            ])
            source_components.extend([
                phones_source, phonemes_source, words_source,
                speaker_id_source, dialect_source, education_source,
                race_source, gender_source
            ])
            map_fn_components.extend([
                phones_map_fn, phonemes_map_fn, words_map_fn,
                speaker_id_map_fn, dialect_map_fn, education_map_fn,
                race_map_fn, gender_map_fn
            ])
            batch_components.extend(
                [None, None, None, None, None, None, None, None])

        space = CompositeSpace(space_components)
        source = tuple(source_components)
        self.data_specs = (space, source)
        self.map_functions = tuple(map_fn_components)
        self.batch_buffers = batch_components

        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('shuffled_sequential')
        self._iter_data_specs = (CompositeSpace(
            (features_space, targets_space)), (features_source,
                                               targets_source))
Example #5
    def __init__(self, which_set, frame_length, overlap=0,
                 frames_per_example=1, start=0, stop=None, audio_only=False,
                 rng=_default_seed,
                 noise = False ):
        """
        Parameters
        ----------
        which_set : str
            Either "train", "valid" or "test"
        frame_length : int
            Number of acoustic samples contained in a frame
        overlap : int, optional
            Number of overlapping acoustic samples for two consecutive frames.
            Defaults to 0, meaning frames don't overlap.
        frames_per_example : int, optional
            Number of frames in a training example. Defaults to 1.
        start : int, optional
            Starting index of the sequences to use. Defaults to 0.
        stop : int, optional
            Ending index of the sequences to use. Defaults to `None`, meaning
            sequences are selected all the way to the end of the array.
        audio_only : bool, optional
            Whether to load only the raw audio and no auxiliary information.
            Defaults to `False`.
        rng : object, optional
            A random number generator used for picking random indices into the
            design matrix when choosing minibatches.
        """
        self.frame_length = frame_length
        self.overlap = overlap
        self.frames_per_example = frames_per_example
        self.offset = self.frame_length - self.overlap
        self.audio_only = audio_only
        self.noise = noise

        # RNG initialization
        if hasattr(rng, 'random_integers'):
            self.rng = rng
        else:
            self.rng = numpy.random.RandomState(rng)

        # Load data from disk
        self._load_data(which_set)
        # Standardize data
        for i, sequence in enumerate(self.raw_wav):
            self.raw_wav[i] = (sequence - TIMIT._mean) / TIMIT._std

        # Slice data
        if stop is not None:
            self.raw_wav = self.raw_wav[start:stop]
            if not self.audio_only:
                self.phones = self.phones[start:stop]
                self.phonemes = self.phonemes[start:stop]
                self.words = self.words[start:stop]
        else:
            self.raw_wav = self.raw_wav[start:]
            if not self.audio_only:
                self.phones = self.phones[start:]
                self.phonemes = self.phonemes[start:]
                self.words = self.words[start:]

        examples_per_sequence = [0]

        for sequence_id, samples_sequence in enumerate(self.raw_wav):
            if not self.audio_only:
                # Phones segmentation
                phones_sequence = self.phones[sequence_id]
                phones_segmented_sequence = segment_axis(phones_sequence,
                                                         frame_length,
                                                         overlap)
                self.phones[sequence_id] = phones_segmented_sequence
                # phones_segmented_sequence = scipy.stats.mode(
                #     phones_segmented_sequence,
                #     axis=1
                # )[0].flatten()
                # phones_segmented_sequence = numpy.asarray(
                #     phones_segmented_sequence,
                #     dtype='int'
                # )
                # phones_sequence_list.append(phones_segmented_sequence)
                # Phonemes segmentation
                phonemes_sequence = self.phonemes[sequence_id]
                phonemes_segmented_sequence = segment_axis(phonemes_sequence,
                                                           frame_length,
                                                           overlap)
                self.phonemes[sequence_id] = phonemes_segmented_sequence
                # phonemes_segmented_sequence = scipy.stats.mode(
                #     phonemes_segmented_sequence,
                #     axis=1
                # )[0].flatten()
                # phonemes_segmented_sequence = numpy.asarray(
                #     phonemes_segmented_sequence,
                #     dtype='int'
                # )
                # phonemes_sequence_list.append(phonemes_segmented_sequence)
                # Words segmentation
                words_sequence = self.words[sequence_id]
                words_segmented_sequence = segment_axis(words_sequence,
                                                        frame_length,
                                                        overlap)
                self.words[sequence_id] = words_segmented_sequence
                # words_segmented_sequence = scipy.stats.mode(
                #     words_segmented_sequence,
                #     axis=1
                # )[0].flatten()
                # words_segmented_sequence = numpy.asarray(words_segmented_sequence,
                #                                          dtype='int')
                # words_sequence_list.append(words_segmented_sequence)

            # TODO: look at this, does it force copying the data?
            # Sequence segmentation
            samples_segmented_sequence = segment_axis(samples_sequence,
                                                      frame_length,
                                                      overlap)
            self.raw_wav[sequence_id] = samples_segmented_sequence

            # TODO: change me
            # Generate features/targets/phones/phonemes/words map
            num_frames = samples_segmented_sequence.shape[0]
            num_examples = num_frames - self.frames_per_example
            examples_per_sequence.append(num_examples)
        
        self.cumulative_example_indexes = numpy.cumsum(examples_per_sequence)
        self.samples_sequences = self.raw_wav
        if not self.audio_only:
            self.phones_sequences = self.phones
            self.phonemes_sequences = self.phonemes
            self.words_sequences = self.words
        self.num_examples = self.cumulative_example_indexes[-1]

        # DataSpecs
        features_space = VectorSpace(
            dim=self.frame_length * self.frames_per_example
        )
        features_source = 'features'
        def features_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(indexes):
                rval.append(self.samples_sequences[sequence_index][example_index:example_index
                    + self.frames_per_example].ravel())
            return rval
            
        def features_map_fn_noise(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(indexes):
                rval.append(
                    (self.samples_sequences[sequence_index][example_index:example_index + self.frames_per_example]
                    + self.noise_this_epoch[sequence_index][example_index:example_index + self.frames_per_example]).ravel())
            return rval
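        # `noise_this_epoch` is not defined in this snippet; presumably it is
        # resampled elsewhere once per epoch and added to the clean input
        # frames here, while the targets below stay clean.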
            

        targets_space = VectorSpace(dim=self.frame_length)
        targets_source = 'targets'
        def targets_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(indexes):
                rval.append(self.samples_sequences[sequence_index][example_index
                    + self.frames_per_example].ravel())
            return rval

        space_components = [features_space, targets_space]
        source_components = [features_source, targets_source]
        if not self.noise:
            map_fn_components = [features_map_fn, targets_map_fn]
        else:
            map_fn_components = [features_map_fn_noise, targets_map_fn]
        batch_components = [None, None]

        if not self.audio_only:
            num_phones = numpy.max([numpy.max(sequence) for sequence
                                    in self.phones]) + 1
            phones_space = IndexSpace(max_labels=num_phones, dim=1,
                                      dtype=str(self.phones_sequences[0].dtype))
            phones_source = 'phones'
            def phones_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(indexes):
                    rval.append(self.phones_sequences[sequence_index][example_index
                        + self.frames_per_example].ravel())
                return rval

            num_phonemes = numpy.max([numpy.max(sequence) for sequence
                                      in self.phonemes]) + 1
            phonemes_space = IndexSpace(max_labels=num_phonemes, dim=1,
                                        dtype=str(self.phonemes_sequences[0].dtype))
            phonemes_source = 'phonemes'
            def phonemes_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(indexes):
                    rval.append(self.phonemes_sequences[sequence_index][example_index
                        + self.frames_per_example].ravel())
                return rval

            num_words = numpy.max([numpy.max(sequence) for sequence
                                   in self.words]) + 1
            words_space = IndexSpace(max_labels=num_words, dim=1,
                                     dtype=str(self.words_sequences[0].dtype))
            words_source = 'words'
            def words_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(indexes):
                    rval.append(self.words_sequences[sequence_index][example_index
                        + self.frames_per_example].ravel())
                return rval

            space_components.extend([phones_space, phonemes_space,
                                     words_space])
            source_components.extend([phones_source, phonemes_source,
                                     words_source])            
            map_fn_components.extend([phones_map_fn, phonemes_map_fn,
                                     words_map_fn])
            batch_components.extend([None, None, None])

        space = CompositeSpace(space_components)
        source = tuple(source_components)
        self.data_specs = (space, source)
        self.map_functions = tuple(map_fn_components)
        self.batch_buffers = batch_components

        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('shuffled_sequential')
        self._iter_data_specs = (CompositeSpace((features_space,
                                                 targets_space)),
                                 (features_source, targets_source))
Example #6
    def __init__(
        self,
        which_set,
        frame_length,
        overlap=0.5,
        frames_per_example=1,
        start=0,
        stop=None,
        audio_only=True,
        n_prev_phones=0,
        n_next_phones=0,
        samples_to_predict=1,
        filter_fn=None,
        rng=_default_seed,
        b=1.019,
        step=64,
        n_channels=64,
    ):
        """
        Parameters
        ----------
        which_set : str
            Either "train", "valid" or "test"
        frame_length : int
            Number of acoustic samples contained in a frame
        overlap : int or float, optional
            Number of overlapping acoustic samples for two consecutive frames;
            a value below 1 is interpreted as a fraction of `frame_length`.
            Defaults to 0.5.
        frames_per_example : int, optional
            Number of frames in a training example. Defaults to 1.
        start : int, optional
            Starting index of the sequences to use. Defaults to 0.
        stop : int, optional
            Ending index of the sequences to use. Defaults to `None`, meaning
            sequences are selected all the way to the end of the array.
        audio_only : bool, optional
            Whether to load only the raw audio and no auxiliary information.
            Defaults to `True`.
        rng : object, optional
            A random number generator used for picking random indices into the
            design matrix when choosing minibatches.
        """
        self.frame_length = frame_length
        if overlap < 1.0:
            # An overlap below 1.0 is read as a fraction of the frame length;
            # segment_axis expects an integer number of samples.
            self.overlap = int(overlap * frame_length)
        else:
            self.overlap = overlap
        self.frames_per_example = frames_per_example
        self.offset = self.frame_length - self.overlap
        self.audio_only = audio_only
        self.n_prev_phones = n_prev_phones
        self.n_next_phones = n_next_phones
        self.samples_to_predict = samples_to_predict
        self.b = b
        self.step = step
        self.n_channels = n_channels

        print "Frame length %d, overlap %d" % (self.frame_length, self.overlap)

        # Initializing the dictionary
        self.D = numpy.r_[
            tuple(
                gammatone_matrix(self.b, fc, self.frame_length, self.step)
                for fc in erb_space(150, 8000, self.n_channels)
            )
        ]
        print "Using dictionary with shape", self.D.shape

        self.coder = SparseCoder(
            dictionary=self.D, transform_n_nonzero_coefs=None, transform_alpha=None, transform_algorithm="omp"
        )

        # RNG initialization
        if hasattr(rng, "random_integers"):
            self.rng = rng
        else:
            self.rng = numpy.random.RandomState(rng)

        # Load data from disk
        self._load_data(which_set)

        examples_per_sequence = [0]

        for sequence_id, samples_sequence in enumerate(self.raw_wav):
            print "Sentence %d/%d" % (sequence_id, len(self.raw_wav))
            X = segment_axis(samples_sequence, frame_length, self.overlap,
                             end="pad")
            X = numpy.hanning(self.frame_length) * X
            self.raw_wav[sequence_id] = scipy.sparse.csr_matrix(self.coder.transform(X))
            # TODO: change me
            # Generate features/targets/phones/phonemes/words map
            num_frames = self.raw_wav[sequence_id].shape[0]
            num_examples = num_frames - self.frames_per_example
            examples_per_sequence.append(num_examples)

        self.cumulative_example_indexes = numpy.cumsum(examples_per_sequence)
        self.samples_sequences = self.raw_wav
        numpy.save("%s_sparse_frames.npy" % which_set, self.samples_sequences)
        self.num_examples = self.cumulative_example_indexes[-1]

        # DataSpecs
        features_space = VectorSpace(dim=self.D.shape[0] * self.frames_per_example)
        features_source = "features"

        def features_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(indexes):
                rval.append(
                    self.samples_sequences[sequence_index][
                        example_index : example_index + self.frames_per_example
                    ].todense()
                )
            return rval

        targets_space = VectorSpace(dim=self.D.shape[0])
        targets_source = "targets"

        def targets_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(indexes):
                rval.append(self.samples_sequences[sequence_index][example_index + self.frames_per_example].todense())
            return rval

        space_components = [features_space, targets_space]
        source_components = [features_source, targets_source]
        map_fn_components = [features_map_fn, targets_map_fn]
        batch_components = [None, None]

        space = CompositeSpace(space_components)
        source = tuple(source_components)
        self.data_specs = (space, source)
        self.map_functions = tuple(map_fn_components)
        self.batch_buffers = batch_components

        # Defaults for iterators
        self._iter_mode = resolve_iterator_class("shuffled_sequential")
        self._iter_data_specs = (CompositeSpace((features_space, targets_space)), (features_source, targets_source))
Example #7
    def __init__(self, which_set, frame_length, overlap=0,
                 frames_per_example=1, start=0, stop=None, audio_only=False,
                 rng=_default_seed):
        """
        Parameters
        ----------
        which_set : str
            Either "train", "valid" or "test"
        frame_length : int
            Number of acoustic samples contained in a frame
        overlap : int, optional
            Number of overlapping acoustic samples for two consecutive frames.
            Defaults to 0, meaning frames don't overlap.
        frames_per_example : int, optional
            Number of frames in a training example. Defaults to 1.
        start : int, optional
            Starting index of the sequences to use. Defaults to 0.
        stop : int, optional
            Ending index of the sequences to use. Defaults to `None`, meaning
            sequences are selected all the way to the end of the array.
        audio_only : bool, optional
            Whether to load only the raw audio and no auxiliary information.
            Defaults to `False`.
        rng : object, optional
            A random number generator used for picking random indices into the
            design matrix when choosing minibatches.
        """
        self.frame_length = frame_length
        self.overlap = overlap
        self.frames_per_example = frames_per_example
        self.offset = self.frame_length - self.overlap
        self.audio_only = audio_only

        # RNG initialization
        if hasattr(rng, 'random_integers'):
            self.rng = rng
        else:
            self.rng = numpy.random.RandomState(rng)

        # Load data from disk
        self._load_data(which_set)
        # Standardize data
        for i, sequence in enumerate(self.raw_wav):
            self.raw_wav[i] = (sequence - TIMIT._mean) / TIMIT._std

        if not self.audio_only:
            self.num_phones = numpy.max([numpy.max(sequence) for sequence
                                         in self.phones]) + 1
            self.num_phonemes = numpy.max([numpy.max(sequence) for sequence
                                           in self.phonemes]) + 1
            self.num_words = numpy.max([numpy.max(sequence) for sequence
                                        in self.words]) + 1
            # The following is hard coded. However, the way it is done above
            # could be problematic if a max value (the max over the whole
            # dataset (train + valid + test)) is not present in at least one
            # one of the three subsets. This is the case for speakers. This is
            # not the case for phones.
            self.num_speakers = 630

        # Slice data
        if stop is not None:
            self.raw_wav = self.raw_wav[start:stop]
            if not self.audio_only:
                self.phones = self.phones[start:stop]
                self.phonemes = self.phonemes[start:stop]
                self.words = self.words[start:stop]
                self.speaker_id = self.speaker_id[start:stop]
        else:
            self.raw_wav = self.raw_wav[start:]
            if not self.audio_only:
                self.phones = self.phones[start:]
                self.phonemes = self.phonemes[start:]
                self.words = self.words[start:]
                self.speaker_id = self.speaker_id[start:]

        examples_per_sequence = [0]

        for sequence_id, samples_sequence in enumerate(self.raw_wav):
            if not self.audio_only:
                # Phones segmentation
                phones_sequence = self.phones[sequence_id]
                phones_segmented_sequence = segment_axis(phones_sequence,
                                                         frame_length,
                                                         overlap)
                self.phones[sequence_id] = phones_segmented_sequence
                # phones_segmented_sequence = scipy.stats.mode(
                #     phones_segmented_sequence,
                #     axis=1
                # )[0].flatten()
                # phones_segmented_sequence = numpy.asarray(
                #     phones_segmented_sequence,
                #     dtype='int'
                # )
                # phones_sequence_list.append(phones_segmented_sequence)
                # Phonemes segmentation
                phonemes_sequence = self.phonemes[sequence_id]
                phonemes_segmented_sequence = segment_axis(phonemes_sequence,
                                                           frame_length,
                                                           overlap)
                self.phonemes[sequence_id] = phonemes_segmented_sequence
                # phonemes_segmented_sequence = scipy.stats.mode(
                #     phonemes_segmented_sequence,
                #     axis=1
                # )[0].flatten()
                # phonemes_segmented_sequence = numpy.asarray(
                #     phonemes_segmented_sequence,
                #     dtype='int'
                # )
                # phonemes_sequence_list.append(phonemes_segmented_sequence)
                # Words segmentation
                words_sequence = self.words[sequence_id]
                words_segmented_sequence = segment_axis(words_sequence,
                                                        frame_length,
                                                        overlap)
                self.words[sequence_id] = words_segmented_sequence
                # words_segmented_sequence = scipy.stats.mode(
                #     words_segmented_sequence,
                #     axis=1
                # )[0].flatten()
                # words_segmented_sequence = numpy.asarray(words_segmented_sequence,
                #                                          dtype='int')
                # words_sequence_list.append(words_segmented_sequence)

            # TODO: look at this, does it force copying the data?
            # Sequence segmentation
            samples_segmented_sequence = segment_axis(samples_sequence,
                                                      frame_length,
                                                      overlap)
            self.raw_wav[sequence_id] = samples_segmented_sequence

            # TODO: change me
            # Generate features/targets/phones/phonemes/words map
            num_frames = samples_segmented_sequence.shape[0]
            num_examples = num_frames - self.frames_per_example
            examples_per_sequence.append(num_examples)

        self.cumulative_example_indexes = numpy.cumsum(examples_per_sequence)
        self.samples_sequences = self.raw_wav
        if not self.audio_only:
            self.phones_sequences = self.phones
            self.phonemes_sequences = self.phonemes
            self.words_sequences = self.words
        self.num_examples = self.cumulative_example_indexes[-1]

        # DataSpecs
        features_space = VectorSpace(
            dim=self.frame_length * self.frames_per_example
        )
        features_source = 'features'
        def features_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(indexes):
                rval.append(self.samples_sequences[sequence_index][example_index:example_index
                    + self.frames_per_example].ravel())
            return rval

        targets_space = VectorSpace(dim=self.frame_length)
        targets_source = 'targets'
        def targets_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(indexes):
                rval.append(self.samples_sequences[sequence_index][example_index
                    + self.frames_per_example].ravel())
            return rval

        space_components = [features_space, targets_space]
        source_components = [features_source, targets_source]
        map_fn_components = [features_map_fn, targets_map_fn]
        batch_components = [None, None]

        if not self.audio_only:
            phones_space = IndexSpace(max_labels=self.num_phones, dim=1,
                                      dtype=str(self.phones_sequences[0].dtype))
            phones_source = 'phones'
            def phones_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(indexes):
                    rval.append(self.phones_sequences[sequence_index][example_index
                        + self.frames_per_example].ravel())
                return rval

            phonemes_space = IndexSpace(max_labels=self.num_phonemes, dim=1,
                                        dtype=str(self.phonemes_sequences[0].dtype))
            phonemes_source = 'phonemes'
            def phonemes_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(indexes):
                    rval.append(self.phonemes_sequences[sequence_index][example_index
                        + self.frames_per_example].ravel())
                return rval

            words_space = IndexSpace(max_labels=self.num_words, dim=1,
                                     dtype=str(self.words_sequences[0].dtype))
            words_source = 'words'
            def words_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(indexes):
                    rval.append(self.words_sequences[sequence_index][example_index
                        + self.frames_per_example].ravel())
                return rval

            speaker_id_space = IndexSpace(max_labels=self.num_speakers, dim=1,
                                          dtype=str(self.speaker_id.dtype))
            speaker_id_source = 'speaker_id'
            def speaker_id_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(indexes):
                    rval.append(self.speaker_id[sequence_index].ravel())
                return rval

            dialect_space = IndexSpace(max_labels=8, dim=1, dtype='int32')
            dialect_source = 'dialect'
            def dialect_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(indexes):
                    info = self.speaker_info_list[self.speaker_id[sequence_index]]
                    rval.append(index_from_one_hot(info[1:9]))
                return rval

            education_space = IndexSpace(max_labels=6, dim=1, dtype='int32')
            education_source = 'education'
            def education_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(indexes):
                    info = self.speaker_info_list[self.speaker_id[sequence_index]]
                    rval.append(index_from_one_hot(info[9:15]))
                return rval

            race_space = IndexSpace(max_labels=8, dim=1, dtype='int32')
            race_source = 'race'
            def race_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(indexes):
                    info = self.speaker_info_list[self.speaker_id[sequence_index]]
                    rval.append(index_from_one_hot(info[16:24]))
                return rval
              
            gender_space = IndexSpace(max_labels=2, dim=1, dtype='int32')
            gender_source = 'gender'
            def gender_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(indexes):
                    info = self.speaker_info_list[self.speaker_id[sequence_index]]
                    rval.append(index_from_one_hot(info[24:]))
                return rval

            space_components.extend([phones_space, phonemes_space,
                                     words_space, speaker_id_space,
                                     dialect_space, education_space,
                                     race_space, gender_space])
            source_components.extend([phones_source, phonemes_source,
                                     words_source, speaker_id_source,
                                     dialect_source, education_source,
                                     race_source, gender_source])
            map_fn_components.extend([phones_map_fn, phonemes_map_fn,
                                     words_map_fn, speaker_id_map_fn,
                                     dialect_map_fn, education_map_fn,
                                     race_map_fn, gender_map_fn])
            batch_components.extend([None] * 8)

        space = CompositeSpace(space_components)
        source = tuple(source_components)
        self.data_specs = (space, source)
        self.map_functions = tuple(map_fn_components)
        self.batch_buffers = batch_components

        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('shuffled_sequential')
        self._iter_data_specs = (CompositeSpace((features_space,
                                                 targets_space)),
                                 (features_source, targets_source))
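The data_specs / map_functions pair built above is the contract a pylearn2 iterator consumes: the iterator draws flat example indexes, and each map function materializes one source for those indexes. A minimal usage sketch, assuming ds is an instance of the class above (the name ds and the manual index draw are illustrative, not part of the original code):

    # Draw 32 random example indexes and materialize every source by hand,
    # mimicking what the 'shuffled_sequential' iterator does internally.
    indexes = ds.rng.random_integers(0, ds.num_examples - 1, 32)
    batch = tuple(fn(indexes) for fn in ds.map_functions)
    # batch[k] lines up with ds.data_specs[1][k], e.g. 'features', 'targets'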
Example #8
    def __init__(self, which_set, frame_length, start=0, stop=None,
                 audio_only=False, rng=_default_seed):
        """
        Parameters
        ----------
        which_set : str
            Either "train", "valid" or "test"
        frame_length : int
            Number of acoustic samples contained in the sliding window
        start : int, optional
            Starting index of the sequences to use. Defaults to 0.
        stop : int, optional
            Ending index of the sequences to use. Defaults to `None`, meaning
            sequences are selected all the way to the end of the array.
        audio_only : bool, optional
            Whether to load only the raw audio and no auxiliary information.
            Defaults to `False`.
        rng : object, optional
            A random number generator used for picking random indices into the
            design matrix when choosing minibatches.
        """
        self.frame_length = frame_length
        self.audio_only = audio_only

        # RNG initialization
        if hasattr(rng, 'random_integers'):
            self.rng = rng
        else:
            self.rng = numpy.random.RandomState(rng)

        # Load data from disk
        self._load_data(which_set)
        # Standardize data
        for i, sequence in enumerate(self.raw_wav):
            self.raw_wav[i] = (sequence - TIMIT._mean) / TIMIT._std

        if not self.audio_only:
            self.num_phones = numpy.max([numpy.max(sequence) for sequence
                                         in self.phones]) + 1
            self.num_phonemes = numpy.max([numpy.max(sequence) for sequence
                                           in self.phonemes]) + 1
            self.num_words = numpy.max([numpy.max(sequence) for sequence
                                        in self.words]) + 1

        # Slice data
        if stop is not None:
            self.raw_wav = self.raw_wav[start:stop]
            if not self.audio_only:
                self.phones = self.phones[start:stop]
                self.phonemes = self.phonemes[start:stop]
                self.words = self.words[start:stop]
        else:
            self.raw_wav = self.raw_wav[start:]
            if not self.audio_only:
                self.phones = self.phones[start:]
                self.phonemes = self.phonemes[start:]
                self.words = self.words[start:]

        samples_sequences = []
        targets_sequences = []
        phones_sequences = []
        phonemes_sequences = []
        words_sequences = []
        for sequence_id, samples_sequence in enumerate(self.raw_wav):
            # Sequence segmentation
            samples_segmented_sequence = segment_axis(samples_sequence,
                                                      frame_length,
                                                      frame_length - 1)[:-1]
            samples_sequences.append(samples_segmented_sequence)
            targets_sequences.append(samples_sequence[frame_length:].reshape(
                (samples_sequence[frame_length:].shape[0], 1)
            ))
            if not self.audio_only:
                target_phones = self.phones[sequence_id][frame_length:]
                phones_sequences.append(target_phones.reshape(
                    (target_phones.shape[0], 1)
                ))
                target_phonemes = self.phonemes[sequence_id][frame_length:]
                phonemes_sequences.append(target_phonemes.reshape(
                    (target_phonemes.shape[0], 1)
                ))
                target_words = self.words[sequence_id][frame_length:]
                words_sequences.append(target_words.reshape(
                    (target_words.shape[0], 1)
                ))

        del self.raw_wav
        self.samples_sequences = samples_sequences
        self.targets_sequences = targets_sequences
        self.data = [samples_sequences, targets_sequences]
        if not self.audio_only:
            del self.phones
            del self.phonemes
            del self.words
            self.phones_sequences = phones_sequences
            self.phonemes_sequences = phonemes_sequences
            self.words_sequences = words_sequences
            self.data.extend([phones_sequences, phonemes_sequences,
                              words_sequences])
        self.num_examples = len(samples_sequences)

        # DataSpecs
        features_space = VectorSequenceSpace(dim=self.frame_length)
        features_source = 'features'

        targets_space = VectorSequenceSpace(dim=1)
        targets_source = 'targets'

        space_components = [features_space, targets_space]
        source_components = [features_source, targets_source]
        batch_components = [None, None]

        if not self.audio_only:
            phones_space = IndexSequenceSpace(
                max_labels=self.num_phones,
                dim=1,
                dtype=str(self.phones_sequences[0].dtype)
            )
            phones_source = 'phones'

            phonemes_space = IndexSequenceSpace(
                max_labels=self.num_phonemes,
                dim=1,
                dtype=str(self.phonemes_sequences[0].dtype)
            )
            phonemes_source = 'phonemes'

            words_space = IndexSequenceSpace(
                max_labels=self.num_words,
                dim=1,
                dtype=str(self.words_sequences[0].dtype)
            )
            words_source = 'words'

            space_components.extend([phones_space, phonemes_space,
                                     words_space])
            source_components.extend([phones_source, phonemes_source,
                                     words_source])
            batch_components.extend([None, None, None])

        space = CompositeSpace(space_components)
        source = tuple(source_components)
        self.data_specs = (space, source)
        self.batch_buffers = batch_components

        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('shuffled_sequential')
        self._iter_data_specs = (CompositeSpace((features_space,
                                                 targets_space)),
                                 (features_source, targets_source))
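In the segmentation loop above, segment_axis(samples_sequence, frame_length, frame_length - 1) slides the window one sample at a time, and dropping the last frame pairs every window with the single following sample as its target. A standalone toy check of that alignment, with a list comprehension standing in for segment_axis:

    import numpy

    x = numpy.arange(10.0)
    frame_length = 3
    # One frame per start position (hop of one sample), last frame dropped:
    frames = numpy.array([x[i:i + frame_length]
                          for i in range(len(x) - frame_length)])
    targets = x[frame_length:].reshape((-1, 1))
    assert frames.shape == (7, 3) and targets.shape == (7, 1)
    # frames[0] is [0., 1., 2.] and its target is [3.]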
Example #9
    def get_markov_frames(self, subset, id):
        """
        Given the subset and an id, this method returns the list
        [input_frames, input_phonemes, input_words, output_phoneme,
        output_word, spkr_info, output_frame, ending_phoneme, ending_word].
        """
        assert subset + "_intervals_seq" in self.__dict__.keys()
        assert id < self.__dict__[subset + "_intervals_seq"][-1]

        n_frames_in = self.__dict__[subset + "_n_frames_in"]
        frame_length = self.__dict__[subset + "_frame_length"]
        overlap = self.__dict__[subset + "_overlap"]
        wav_length = self.__dict__[subset + "_wav_length"]
        intervals_seq = self.__dict__[subset + "_intervals_seq"]

        # Find the acoustic samples sequence we are looking for
        seq_id = np.digitize([id], intervals_seq) - 1
        seq_id = seq_id[0]

        # Find the position in this sequence
        idx_in_seq = id - intervals_seq[seq_id] - (wav_length - frame_length
                                                   + overlap)

        # Get the sequence
        wav_seq = self.__dict__[subset + "_raw_wav"][seq_id]

        # Get the phonemes
        phn_l_start = self.__dict__[subset + "_seq_to_phn"][seq_id][0]
        phn_l_end = self.__dict__[subset + "_seq_to_phn"][seq_id][1]
        phn_start_end = self.__dict__[subset + "_phn"][phn_l_start:phn_l_end]
        phn_seq = np.zeros_like(wav_seq)
        # Some timestamps do not correspond to any phoneme, so 0 is the
        # index for "NO_PHONEME" and the other indices are shifted by one
        for (phn_start, phn_end, phn) in phn_start_end:
            phn_seq[phn_start:phn_end] = phn + 1

        # Get the words
        wrd_l_start = self.__dict__[subset + "_seq_to_wrd"][seq_id][0]
        wrd_l_end = self.__dict__[subset + "_seq_to_wrd"][seq_id][1]
        wrd_start_end = self.__dict__[subset + "_wrd"][wrd_l_start:wrd_l_end]
        wrd_seq = np.zeros_like(wav_seq)
        # Some timestamps do not correspond to any word, so 0 is the
        # index for "NO_WORD" and the other indices are shifted by one
        for (wrd_start, wrd_end, wrd) in wrd_start_end:
            wrd_seq[wrd_start:wrd_end] = wrd + 1

        # Binary variable announcing the end of the word or phoneme
        end_phn = np.zeros_like(phn_seq)
        end_wrd = np.zeros_like(wrd_seq)

        for i in range(len(phn_seq) - 1):
            if phn_seq[i] != phn_seq[i + 1]:
                end_phn[i] = 1
            if wrd_seq[i] != wrd_seq[i + 1]:
                end_wrd[i] = 1

        end_phn[-1] = 1
        end_wrd[-1] = 1

        # Find the speaker id
        spkr_id = self.__dict__[subset + "_spkr"][seq_id]
        # Find the speaker info
        spkr_info = self.spkrinfo[spkr_id]

        # Pick the selected segment
        padded_wav_seq = np.zeros(wav_length)
        if idx_in_seq < 0:
            padded_wav_seq[-idx_in_seq:] = wav_seq[0:(wav_length + idx_in_seq)]
        else:
            padded_wav_seq = wav_seq[idx_in_seq:(idx_in_seq + wav_length)]

        padded_phn_seq = np.zeros(wav_length)
        if idx_in_seq < 0:
            padded_phn_seq[-idx_in_seq:] = phn_seq[0:(wav_length + idx_in_seq)]
        else:
            padded_phn_seq = phn_seq[idx_in_seq:(idx_in_seq + wav_length)]

        padded_wrd_seq = np.zeros(wav_length)
        if idx_in_seq < 0:
            padded_wrd_seq[-idx_in_seq:] = wrd_seq[0:(wav_length + idx_in_seq)]
        else:
            padded_wrd_seq = wrd_seq[idx_in_seq:(idx_in_seq + wav_length)]

        # Segment into frames
        wav_seq = segment_axis(padded_wav_seq, frame_length, overlap)

        # Keep the most frequent phoneme in each frame
        phn_seq = segment_axis(padded_phn_seq, frame_length, overlap)
        phn_seq = scipy.stats.mode(phn_seq, axis=1)[0].flatten()
        phn_seq = np.asarray(phn_seq, dtype='int')

        # Keep the most frequent word in each frame
        wrd_seq = segment_axis(padded_wrd_seq, frame_length, overlap)
        wrd_seq = scipy.stats.mode(wrd_seq, axis=1)[0].flatten()
        wrd_seq = np.asarray(wrd_seq, dtype='int')

        # Announce the end if and only if it was announced in the current frame
        end_phn = segment_axis(end_phn, frame_length, overlap)
        end_phn = end_phn.max(axis=1)
        end_wrd = segment_axis(end_wrd, frame_length, overlap)
        end_wrd = end_wrd.max(axis=1)

        # Put names on the output
        input_frames = wav_seq[:-1]
        input_phonemes = phn_seq[:-1]
        input_words = wrd_seq[:-1]
        output_phoneme = phn_seq[-1]
        output_word = wrd_seq[-1]
        output_frame = wav_seq[-1]
        ending_phoneme = end_phn[-1]
        ending_word = end_wrd[-1]

        return [input_frames, input_phonemes, input_words, output_phoneme,
                output_word, spkr_info, output_frame, ending_phoneme,
                ending_word]
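The end_phn / end_wrd loop above marks a 1 wherever the label changes between consecutive samples and forces a 1 at the last position. An equivalent vectorized form, shown only to make explicit what the loop computes:

    import numpy as np

    phn_seq = np.array([1, 1, 2, 2, 2, 3])
    end_phn = np.zeros_like(phn_seq)
    end_phn[:-1] = phn_seq[1:] != phn_seq[:-1]  # 1 at each label change
    end_phn[-1] = 1
    # end_phn is [0, 1, 0, 0, 1, 1], matching the explicit loop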
Example #10
    def get_raw_seq(self, subset, seq_id, frame_length, overlap):
        """
        Given the subset id, the sequence id, the frame length and the
        overlap between frames, this method returns the sequence of frames
        from that subset, the associated phoneme and word sequences (each
        with a binary variable indicating a change) and the speaker
        information vector.
        """
        self.check_subset_value(subset)
        self.check_subset_presence(subset)

        # Check if the id is valid
        n_seq = self.__dict__[subset + "_n_seq"]
        if seq_id >= n_seq:
            raise ValueError("This sequence does not exist.")

        # Get the sequence
        wav_seq = self.__dict__[subset + "_raw_wav"][seq_id]

        # Get the phonemes
        phn_l_start = self.__dict__[subset + "_seq_to_phn"][seq_id][0]
        phn_l_end = self.__dict__[subset + "_seq_to_phn"][seq_id][1]
        phn_start_end = self.__dict__[subset + "_phn"][phn_l_start:phn_l_end]
        phn_seq = np.zeros_like(wav_seq)
        # Some timestamps do not correspond to any phoneme, so 0 is the
        # index for "NO_PHONEME" and the other indices are shifted by one
        for (phn_start, phn_end, phn) in phn_start_end:
            phn_seq[phn_start:phn_end] = phn + 1

        # Get the words
        wrd_l_start = self.__dict__[subset + "_seq_to_wrd"][seq_id][0]
        wrd_l_end = self.__dict__[subset + "_seq_to_wrd"][seq_id][1]
        wrd_start_end = self.__dict__[subset + "_wrd"][wrd_l_start:wrd_l_end]
        wrd_seq = np.zeros_like(wav_seq)
        # Some timestamps do not correspond to any word, so 0 is the
        # index for "NO_WORD" and the other indices are shifted by one
        for (wrd_start, wrd_end, wrd) in wrd_start_end:
            wrd_seq[wrd_start:wrd_end] = wrd + 1

        # Binary variable announcing the end of the word or phoneme
        end_phn = np.zeros_like(phn_seq)
        end_wrd = np.zeros_like(wrd_seq)

        for i in range(len(phn_seq) - 1):
            if phn_seq[i] != phn_seq[i + 1]:
                end_phn[i] = 1
            if wrd_seq[i] != wrd_seq[i + 1]:
                end_wrd[i] = 1

        end_phn[-1] = 1
        end_wrd[-1] = 1

        # Find the speaker id
        spkr_id = self.__dict__[subset + "_spkr"][seq_id]
        # Find the speaker info
        spkr_info = self.spkrinfo[spkr_id]

        # Segment into frames
        wav_seq = segment_axis(wav_seq, frame_length, overlap)

        # Keep the most frequent phoneme in each frame
        phn_seq = segment_axis(phn_seq, frame_length, overlap)
        phn_seq = scipy.stats.mode(phn_seq, axis=1)[0].flatten()
        phn_seq = np.asarray(phn_seq, dtype='int')

        # Keep the most frequent word in each frame
        wrd_seq = segment_axis(wrd_seq, frame_length, overlap)
        wrd_seq = scipy.stats.mode(wrd_seq, axis=1)[0].flatten()
        wrd_seq = np.asarray(wrd_seq, dtype='int')

        # Announce the end if and only if it was announced in the current frame
        end_phn = segment_axis(end_phn, frame_length, overlap)
        end_phn = end_phn.max(axis=1)
        end_wrd = segment_axis(end_wrd, frame_length, overlap)
        end_wrd = end_wrd.max(axis=1)

        return [wav_seq, phn_seq, end_phn, wrd_seq, end_wrd, spkr_info]
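The scipy.stats.mode calls above implement a per-frame majority vote: after segment_axis, each row holds frame_length labels, and the mode collapses the row to its most frequent label. A toy illustration, with a reshape standing in for segment_axis output:

    import numpy as np
    import scipy.stats

    labels = np.array([1, 1, 1, 2, 2, 3, 3, 3])
    frames = labels.reshape((2, 4))  # two frames of four labels each
    majority = scipy.stats.mode(frames, axis=1)[0].flatten()
    # majority is [1, 3]: each frame keeps its most frequent label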
Example #11
    def __init__(self, which_set, frame_length, overlap=0,
                 frames_per_example=1, start=0, stop=None, rng=_default_seed):
        """
        Parameters
        ----------
        which_set : str
            Either "train", "valid" or "test"
        frame_length : int
            Number of acoustic samples contained in a frame
        overlap : int, optional
            Number of overlapping acoustic samples for two consecutive frames.
            Defaults to 0, meaning frames don't overlap.
        frames_per_example : int, optional
            Number of frames in a training example. Defaults to 1.
        start : int, optional
            Starting index of the sequences to use. Defaults to 0.
        stop : int, optional
            Ending index of the sequences to use. Defaults to `None`, meaning
            sequences are selected all the way to the end of the array.
        rng : object, optional
            A random number generator used for picking random indices into the
            design matrix when choosing minibatches.
        """
        self.frame_length = frame_length
        self.overlap = overlap
        self.frames_per_example = frames_per_example
        self.offset = self.frame_length - self.overlap
        if hasattr(rng, 'random_integers'):
            self.rng = rng
        else:
            self.rng = numpy.random.RandomState(rng)

        # Load data from disk
        self._load_data(which_set)

        if stop is not None:
            self.raw_wav = self.raw_wav[start:stop]
        else:
            self.raw_wav = self.raw_wav[start:]

        features_map = []
        targets_map = []
        phones_map = []
        words_map = []

        n_seq = len(self.raw_wav)
        self.phones_seq = []
        self.phonemes_seq = []
        self.wrd_seq = []
        for sequence_id in range(len(self.raw_wav)):
            # Get the phonemes
            phn_l_start = self.sequences_to_phonemes[sequence_id][0]
            phn_l_end = self.sequences_to_phonemes[sequence_id][1]
            phonemes_start_end = self.phonemes[phn_l_start:phn_l_end]
            phones_start_end = self.phones[phn_l_start:phn_l_end]
            phonemes_sequence = numpy.zeros(len(self.raw_wav[sequence_id]))
            phones_sequence = numpy.zeros(len(self.raw_wav[sequence_id]))
            # Some timestamps do not correspond to any phoneme, so 0 is the
            # index for "NO_PHONEME" and the other indices are shifted by one
            for (phn_start, phn_end, phn) in phonemes_start_end:
                phonemes_sequence[phn_start:phn_end] = phn+1
            
            for (phn_start, phn_end, phn) in phones_start_end:
                phones_sequence[phn_start:phn_end] = phn+1

            phonemes_segmented_sequence = segment_axis(phonemes_sequence, frame_length, overlap)
            phonemes_segmented_sequence = scipy.stats.mode(phonemes_segmented_sequence, axis=1)[0].flatten()
            phonemes_segmented_sequence = numpy.asarray(phonemes_segmented_sequence, dtype='int')
            self.phonemes_seq.append(phonemes_segmented_sequence)

            phones_segmented_sequence = segment_axis(phones_sequence, frame_length, overlap)
            phones_segmented_sequence = scipy.stats.mode(phones_segmented_sequence, axis=1)[0].flatten()
            phones_segmented_sequence = numpy.asarray(phones_segmented_sequence, dtype='int')
            self.phones_seq.append(phones_segmented_sequence)

            # Get the words
            wrd_l_start = self.sequences_to_words[sequence_id][0]
            wrd_l_end = self.sequences_to_words[sequence_id][1]
            wrd_start_end = self.words[wrd_l_start:wrd_l_end]
            wrd_sequence = numpy.zeros(len(self.raw_wav[sequence_id]))
            # Some timestamps do not correspond to any word, so 0 is the
            # index for "NO_WORD" and the other indices are shifted by one
            for (wrd_start, wrd_end, wrd) in wrd_start_end:
                wrd_sequence[wrd_start:wrd_end] = wrd+1

            wrd_segmented_sequence = segment_axis(wrd_sequence, frame_length, overlap)
            wrd_segmented_sequence = scipy.stats.mode(wrd_segmented_sequence, axis=1)[0].flatten()
            wrd_segmented_sequence = numpy.asarray(wrd_segmented_sequence, dtype='int')
            self.wrd_seq.append(wrd_segmented_sequence)

        self.phones_seq = numpy.array(self.phones_seq)
        self.phonemes_seq = numpy.array(self.phonemes_seq)
        self.wrd_seq = numpy.array(self.wrd_seq)

        for sequence_id, sequence in enumerate(self.raw_wav):
            segmented_sequence = segment_axis(sequence, frame_length, overlap)
            self.raw_wav[sequence_id] = segmented_sequence

            num_frames = segmented_sequence.shape[0]
            num_examples = num_frames - self.frames_per_example
            for example_id in range(num_examples):
                features_map.append([sequence_id, example_id,
                                     example_id + self.frames_per_example])
                targets_map.append([sequence_id,
                                    example_id + self.frames_per_example])
                phones_map.append([sequence_id, example_id])
                words_map.append([sequence_id, example_id])

        features_map = numpy.asarray(features_map)
        targets_map = numpy.asarray(targets_map)
        phones_map = numpy.asarray(phones_map)
        words_map = numpy.asarray(words_map)

        self.num_examples = features_map.shape[0]

        # DataSpecs
        features_space = VectorSpace(
            dim=self.frame_length * self.frames_per_example
        )
        features_source = 'features'
        features_dtype = self.raw_wav[0].dtype
        features_map_fn = lambda indexes: [
            self.raw_wav[index[0]][index[1]:index[2]].ravel()
            for index in features_map[indexes]
        ]

        targets_space = VectorSpace(dim=self.frame_length)
        targets_source = 'targets'
        targets_dtype = self.raw_wav[0].dtype
        targets_map_fn = lambda indexes: [
            self.raw_wav[index[0]][index[1]]
            for index in targets_map[indexes]
        ]

        phones_space = VectorSpace(dim=1)
        phones_source = 'phones'
        phones_dtype = self.phones_seq[0].dtype
        phones_map_fn = lambda indexes: [
            self.phones_seq[index[0]][index[1]]
            for index in phones_map[indexes]
        ]

        phonemes_space = VectorSpace(dim=1)
        phonemes_source = 'phonemes'
        phonemes_dtype = self.phonemes_seq[0].dtype
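        # Note: phonemes_map_fn reuses the rows of phones_map below; a
        # separate phonemes_map is never built because the (sequence, frame)
        # index pairs are identical for phones and phonemes.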
        phonemes_map_fn = lambda indexes: [
            self.phonemes_seq[index[0]][index[1]]
            for index in phones_map[indexes]
        ]

        words_space = VectorSpace(dim=1)
        words_source = 'words'
        words_dtype = self.wrd_seq[0].dtype
        words_map_fn = lambda indexes: [
            self.wrd_seq[index[0]][index[1]]
            for index in words_map[indexes]
        ]

        space = CompositeSpace((features_space, targets_space, phones_space,
                                phonemes_space, words_space))
        source = (features_source, targets_source, phones_source,
                  phonemes_source, words_source)
        self.data_specs = (space, source)
        self.dtypes = (features_dtype, targets_dtype, phones_dtype,
                       phonemes_dtype, words_dtype)
        self.map_functions = (features_map_fn, targets_map_fn, phones_map_fn,
                              phonemes_map_fn, words_map_fn)

        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('shuffled_sequential')
        self._iter_data_specs = (CompositeSpace((features_space,
                                                 targets_space)),
                                 (features_source, targets_source))
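Each row of features_map above is [sequence_id, first_frame, stop_frame], and the corresponding feature vector is the flattened run of frames_per_example consecutive frames; targets_map rows are [sequence_id, target_frame]. A sketch of resolving one row by hand inside __init__, mirroring features_map_fn and targets_map_fn (illustrative only):

    # Resolve example 0: a window of frames and the frame that follows it.
    seq_id, lo, hi = features_map[0]
    example = self.raw_wav[seq_id][lo:hi].ravel()
    assert example.shape == (self.frame_length * self.frames_per_example,)
    target = self.raw_wav[targets_map[0][0]][targets_map[0][1]]
    assert target.shape == (self.frame_length,)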