示例#1
0
def gen_phone(mdl):
    """Synthesize a waveform by running ``mdl`` frame by frame over phones.

    The phone labels are read from ``test_phones_1600.npy`` in the current
    working directory.  Each row of that array is indexed as ``p[0]``,
    ``p[1]``, ``p[2]`` and turned into three concatenated one-hot codes of
    width 62.  Starting from an all-zero coefficient frame, every following
    frame is predicted from the previous frame and the current phone code,
    projected through a gammatone dictionary, and overlap-added into the
    output with a hop of 200 samples.

    Parameters
    ----------
    mdl : object
        Model exposing ``fprop([X, P])``, where ``X`` and ``P`` are theano
        ``dmatrix`` variables (coefficient frame and phone code).

    Returns
    -------
    numpy.ndarray
        1-D ``float32`` signal of length ``1600 + 200 * (len(phones) - 1)``,
        scaled so its largest absolute value is 1.  An all-zero signal is
        returned unscaled instead of being divided by zero.
    """
    frames_var = theano.tensor.dmatrix('X')
    phones_var = theano.tensor.dmatrix('P')
    y = mdl.fprop([frames_var, phones_var])
    predict = theano.function([frames_var, phones_var], y)

    resolution = 1600      # samples produced per frame
    dict_step = 64         # step handed to gammatone_matrix
    b = 1.019
    n_channels = 64
    n_coefs = 1536         # width of one coefficient frame
    n_phone_classes = 62   # width of each of the three one-hot groups
    hop = 200              # output samples between consecutive frames

    D_multi = np.r_[tuple(gammatone_matrix(b, fc, resolution, dict_step)
                          for fc in erb_space(150, 8000, n_channels))]

    phones = np.load('test_phones_1600.npy')
    n_frames = len(phones)

    # One row per frame.  The zeros must be allocated with the full 2-D shape;
    # wrapping ``(n_frames, zeros(...))`` in a matrix does not do that.
    X = np.asmatrix(np.zeros((n_frames, n_coefs)))
    phone_code = np.asmatrix(np.zeros((n_frames, 3 * n_phone_classes)))

    for pi, p in enumerate(phones):
        # one-hot encoding: three labels, each in its own 62-wide group
        phone_code[pi, [p[0],
                        p[1] + n_phone_classes,
                        p[2] + 2 * n_phone_classes]] = 1

    out = np.zeros(resolution + hop * (n_frames - 1))
    for k in range(1, n_frames):
        X[k] = predict(X[k - 1], phone_code[k])
        begin = k * hop
        # X[k] is a 1-row matrix, so flatten the projection before adding it
        frame = np.asarray(np.dot(X[k], D_multi)).ravel()
        out[begin:begin + resolution] += frame

    peak = np.abs(out).max()
    if peak > 0:
        out = out / peak
    return np.asarray(out, dtype='float32')
示例#2
0
    def __init__(self, which_set, frame_length, overlap=0,
                 frames_per_example=1, start=0, stop=None,
                 audio_only=False, n_prev_phones=0, n_next_phones=0,
                 samples_to_predict=1, filter_fn=None,
                 rng=_default_seed, b=1.019, step=8, n_channels=50):
        """
        Load the requested set, standardize and slice it, segment the
        annotations, and build the data specs and per-source map functions.

        Parameters
        ----------
        which_set : str
            Either "train", "valid" or "test"
        frame_length : int
            Number of acoustic samples contained in a frame
        overlap : int, optional
            Number of overlapping acoustic samples for two consecutive frames.
            Defaults to 0, meaning frames don't overlap.
        frames_per_example : int, optional
            Number of frames in a training example. Defaults to 1.
        start : int, optional
            Starting index of the sequences to use. Defaults to 0.
        stop : int, optional
            Ending index of the sequences to use. Defaults to `None`, meaning
            sequences are selected all the way to the end of the array.
        audio_only : bool, optional
            Whether to load only the raw audio and no auxiliary information.
            Defaults to `False`.
        n_prev_phones : int, optional
            Number of preceding frames whose phone label accompanies each
            frame.  The same number of leading entries is dropped from every
            sequence.  Defaults to 0.
        n_next_phones : int, optional
            Number of following frames whose phone label accompanies each
            frame.  The same number of trailing entries is dropped from every
            sequence.  Defaults to 0.
        samples_to_predict : int, optional
            Stored on the instance; not otherwise used by this constructor.
        filter_fn : str, optional
            Python expression that is passed to `eval` to obtain a callable.
            The callable receives `self.speaker_info_list[self.speaker_id]`
            and returns an index selecting the sequences to keep.  Because
            the string is evaluated, it must come from a trusted source.
        rng : object, optional
            A random number generator used for picking random indices into the
            design matrix when choosing minibatches.  Anything without a
            `random_integers` attribute is used as a seed for
            `numpy.random.RandomState`.
        b : float, optional
            First argument forwarded to `gammatone_matrix`.
        step : int, optional
            Last argument forwarded to `gammatone_matrix`.
        n_channels : int, optional
            Number of centre frequencies requested from
            `erb_space(150, 8000, n_channels)`.
        """
        self.frame_length = frame_length
        self.overlap = overlap
        self.frames_per_example = frames_per_example
        self.offset = self.frame_length - self.overlap
        self.audio_only = audio_only
        self.n_prev_phones = n_prev_phones
        self.n_next_phones = n_next_phones
        self.samples_to_predict = samples_to_predict
        self.b = b
        self.step = step
        self.n_channels = n_channels

        # Initializing the dictionary: one gammatone block per centre
        # frequency, stacked along the first axis.
        self.D = numpy.r_[tuple(gammatone_matrix(self.b, fc,
                                                 self.frame_length,
                                                 self.step)
                                for fc in erb_space(150, 8000,
                                                    self.n_channels))]

        # RNG initialization
        if hasattr(rng, 'random_integers'):
            self.rng = rng
        else:
            self.rng = numpy.random.RandomState(rng)

        # Load data from disk
        self._load_data(which_set)
        # Standardize data
        for i, sequence in enumerate(self.raw_wav):
            self.raw_wav[i] = self.scaler.transform(sequence)

        if filter_fn is not None:
            # NOTE(security): evaluates an arbitrary expression; never feed
            # this from untrusted input.
            filter_fn = eval(filter_fn)
            indexes = filter_fn(self.speaker_info_list[self.speaker_id])
            self.raw_wav = self.raw_wav[indexes]
            if not self.audio_only:
                self.phones = self.phones[indexes]
                self.phonemes = self.phonemes[indexes]
                self.words = self.words[indexes]

        # Slice data.  ``seq[start:None]`` equals ``seq[start:]``, so one
        # slice covers both the bounded and the open-ended case.
        self.raw_wav = self.raw_wav[start:stop]
        if not self.audio_only:
            self.phones = self.phones[start:stop]
            self.phonemes = self.phonemes[start:stop]
            self.words = self.words[start:stop]

        # Bounds used to trim every per-sequence array.  A zero
        # ``n_next_phones`` must map to ``None``: ``seq[a:-0]`` would be empty.
        trim_start = self.n_prev_phones
        if self.n_next_phones == 0:
            trim_stop = None
        else:
            trim_stop = -self.n_next_phones

        examples_per_sequence = [0]

        for sequence_id, _ in enumerate(self.raw_wav):
            if not self.audio_only:
                # Phones segmentation: for every frame k, take the mode of
                # each frame in the window [k - n_prev, k + n_next].
                phones_segmented_sequence = segment_axis(
                    self.phones[sequence_id], frame_length, overlap)
                last_k = (len(phones_segmented_sequence)
                          - self.n_next_phones)
                phones_mode = numpy.concatenate([
                    scipy.stats.mode(
                        phones_segmented_sequence[
                            k - self.n_prev_phones:
                            k + self.n_next_phones + 1],
                        axis=1)[0].T
                    for k in range(self.n_prev_phones, last_k)])
                self.phones[sequence_id] = numpy.asarray(phones_mode,
                                                         dtype=numpy.int16)

                # Phonemes segmentation
                phonemes_segmented_sequence = segment_axis(
                    self.phonemes[sequence_id], frame_length, overlap)
                self.phonemes[sequence_id] = \
                    phonemes_segmented_sequence[trim_start:trim_stop]

                # Words segmentation
                words_segmented_sequence = segment_axis(
                    self.words[sequence_id], frame_length, overlap)
                self.words[sequence_id] = \
                    words_segmented_sequence[trim_start:trim_stop]

            self.raw_wav[sequence_id] = \
                self.raw_wav[sequence_id][trim_start:trim_stop]

            # TODO: change me
            # Generate features/targets/phones/phonemes/words map
            num_frames = self.raw_wav[sequence_id].shape[0]
            examples_per_sequence.append(num_frames - self.frames_per_example)

        self.cumulative_example_indexes = numpy.cumsum(examples_per_sequence)
        self.samples_sequences = self.raw_wav
        if not self.audio_only:
            self.phones_sequences = self.phones
            self.phonemes_sequences = self.phonemes
            self.words_sequences = self.words
        self.num_examples = self.cumulative_example_indexes[-1]

        # DataSpecs
        # NOTE(review): the ``.todense()`` calls below imply the stored
        # sequences are scipy.sparse matrices -- confirm against _load_data.
        features_space = VectorSpace(
            dim=self.D.shape[0] * self.frames_per_example
        )
        features_source = 'features'

        def features_map_fn(indexes):
            # ``frames_per_example`` consecutive frames starting at the index
            return [
                self.samples_sequences[sequence_index][
                    example_index:example_index + self.frames_per_example
                ].todense()
                for sequence_index, example_index
                in self._fetch_index(indexes)
            ]

        targets_space = VectorSpace(dim=self.D.shape[0])
        targets_source = 'targets'

        def targets_map_fn(indexes):
            # The single frame that follows the feature window
            return [
                self.samples_sequences[sequence_index][
                    example_index + self.frames_per_example
                ].todense()
                for sequence_index, example_index
                in self._fetch_index(indexes)
            ]

        space_components = [features_space, targets_space]
        source_components = [features_source, targets_source]
        map_fn_components = [features_map_fn, targets_map_fn]
        batch_components = [None, None]

        if not self.audio_only:
            num_phones = numpy.max([numpy.max(sequence) for sequence
                                    in self.phones]) + 1
            phones_space = IndexSpace(
                max_labels=num_phones,
                dim=1 + self.n_prev_phones + self.n_next_phones,
                dtype=str(self.phones_sequences[0].dtype))
            phones_source = 'phones'

            def phones_map_fn(indexes):
                return [
                    self.phones_sequences[sequence_index][
                        example_index].ravel()
                    for sequence_index, example_index
                    in self._fetch_index(indexes)
                ]

            num_phonemes = numpy.max([numpy.max(sequence) for sequence
                                      in self.phonemes]) + 1
            phonemes_space = IndexSpace(
                max_labels=num_phonemes, dim=1,
                dtype=str(self.phonemes_sequences[0].dtype))
            phonemes_source = 'phonemes'

            def phonemes_map_fn(indexes):
                return [
                    self.phonemes_sequences[sequence_index][
                        example_index + self.frames_per_example].ravel()
                    for sequence_index, example_index
                    in self._fetch_index(indexes)
                ]

            num_words = numpy.max([numpy.max(sequence) for sequence
                                   in self.words]) + 1
            words_space = IndexSpace(
                max_labels=num_words, dim=1,
                dtype=str(self.words_sequences[0].dtype))
            words_source = 'words'

            def words_map_fn(indexes):
                return [
                    self.words_sequences[sequence_index][
                        example_index + self.frames_per_example].ravel()
                    for sequence_index, example_index
                    in self._fetch_index(indexes)
                ]

            space_components.extend([phones_space, phonemes_space,
                                     words_space])
            source_components.extend([phones_source, phonemes_source,
                                      words_source])
            map_fn_components.extend([phones_map_fn, phonemes_map_fn,
                                      words_map_fn])
            batch_components.extend([None, None, None])

        space = CompositeSpace(space_components)
        source = tuple(source_components)
        self.data_specs = (space, source)
        self.map_functions = tuple(map_fn_components)
        self.batch_buffers = batch_components

        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('shuffled_sequential')
        self._iter_data_specs = (CompositeSpace((features_space,
                                                 targets_space)),
                                 (features_source, targets_source))
    def __init__(
        self,
        which_set,
        frame_length,
        overlap=0.5,
        frames_per_example=1,
        start=0,
        stop=None,
        audio_only=True,
        n_prev_phones=0,
        n_next_phones=0,
        samples_to_predict=1,
        filter_fn=None,
        rng=_default_seed,
        b=1.019,
        step=64,
        n_channels=64,
    ):
        """
        Load the requested set, sparse-code every windowed frame against a
        gammatone dictionary, save the result, and build the data specs.

        As a side effect the coded sequences are written with `numpy.save`
        to "<which_set>_sparse_frames.npy" in the current working directory.

        Parameters
        ----------
        which_set : str
            Either "train", "valid" or "test"
        frame_length : int
            Number of acoustic samples contained in a frame
        overlap : int or float, optional
            Overlap between two consecutive frames.  A value below 1 is
            read as a fraction of `frame_length` and converted to a whole
            number of samples; any other value is taken as a sample count.
            Defaults to 0.5, i.e. half a frame.
        frames_per_example : int, optional
            Number of frames in a training example. Defaults to 1.
        start : int, optional
            Accepted but not used by this constructor.
        stop : int, optional
            Accepted but not used by this constructor.
        audio_only : bool, optional
            Stored on the instance.  Defaults to `True`.  This constructor
            only ever builds the "features" and "targets" sources.
        n_prev_phones : int, optional
            Stored on the instance; not otherwise used by this constructor.
        n_next_phones : int, optional
            Stored on the instance; not otherwise used by this constructor.
        samples_to_predict : int, optional
            Stored on the instance; not otherwise used by this constructor.
        filter_fn : optional
            Accepted but not used by this constructor.
        rng : object, optional
            A random number generator used for picking random indices into the
            design matrix when choosing minibatches.  Anything without a
            `random_integers` attribute is used as a seed for
            `numpy.random.RandomState`.
        b : float, optional
            First argument forwarded to `gammatone_matrix`.
        step : int, optional
            Last argument forwarded to `gammatone_matrix`.
        n_channels : int, optional
            Number of centre frequencies requested from
            `erb_space(150, 8000, n_channels)`.
        """
        self.frame_length = frame_length
        if overlap < 1.0:
            # Fractional overlap: convert to a whole number of samples so it
            # can be used as a segment length.
            self.overlap = int(overlap * frame_length)
        else:
            self.overlap = overlap
        self.frames_per_example = frames_per_example
        self.offset = self.frame_length - self.overlap
        self.audio_only = audio_only
        self.n_prev_phones = n_prev_phones
        self.n_next_phones = n_next_phones
        self.samples_to_predict = samples_to_predict
        self.b = b
        self.step = step
        self.n_channels = n_channels

        # Single-argument print calls produce the same text under the
        # Python 2 print statement and the Python 3 print function.
        print("Frame length %d, overlap %d" % (self.frame_length, self.overlap))

        # Initializing the dictionary: one gammatone block per centre
        # frequency, stacked along the first axis.
        self.D = numpy.r_[
            tuple(
                gammatone_matrix(self.b, fc, self.frame_length, self.step)
                for fc in erb_space(150, 8000, self.n_channels)
            )
        ]
        print("Using dictionary with shape %s" % (self.D.shape,))

        self.coder = SparseCoder(
            dictionary=self.D, transform_n_nonzero_coefs=None, transform_alpha=None, transform_algorithm="omp"
        )

        # RNG initialization
        if hasattr(rng, "random_integers"):
            self.rng = rng
        else:
            self.rng = numpy.random.RandomState(rng)

        # Load data from disk
        self._load_data(which_set)

        examples_per_sequence = [0]
        window = numpy.hanning(self.frame_length)  # same for every sequence

        for sequence_id, samples_sequence in enumerate(self.raw_wav):
            print("Sentence %d/%d" % (sequence_id, len(self.raw_wav)))
            # Segment with the overlap in samples (self.overlap), not the raw
            # argument, which may still be a fraction such as 0.5.
            X = segment_axis(samples_sequence, self.frame_length, self.overlap, end="pad")
            X = window * X
            self.raw_wav[sequence_id] = scipy.sparse.csr_matrix(self.coder.transform(X))
            # TODO: change me
            # Generate features/targets/phones/phonemes/words map
            num_frames = self.raw_wav[sequence_id].shape[0]
            num_examples = num_frames - self.frames_per_example
            examples_per_sequence.append(num_examples)

        self.cumulative_example_indexes = numpy.cumsum(examples_per_sequence)
        self.samples_sequences = self.raw_wav
        numpy.save("%s_sparse_frames.npy" % which_set, self.samples_sequences)
        self.num_examples = self.cumulative_example_indexes[-1]

        # DataSpecs
        features_space = VectorSpace(dim=self.D.shape[0] * self.frames_per_example)
        features_source = "features"

        def features_map_fn(indexes):
            # ``frames_per_example`` consecutive coded frames from the index
            rval = []
            for sequence_index, example_index in self._fetch_index(indexes):
                rval.append(
                    self.samples_sequences[sequence_index][
                        example_index : example_index + self.frames_per_example
                    ].todense()
                )
            return rval

        targets_space = VectorSpace(dim=self.D.shape[0])
        targets_source = "targets"

        def targets_map_fn(indexes):
            # The single coded frame that follows the feature window
            rval = []
            for sequence_index, example_index in self._fetch_index(indexes):
                rval.append(self.samples_sequences[sequence_index][example_index + self.frames_per_example].todense())
            return rval

        space_components = [features_space, targets_space]
        source_components = [features_source, targets_source]
        map_fn_components = [features_map_fn, targets_map_fn]
        batch_components = [None, None]

        space = CompositeSpace(space_components)
        source = tuple(source_components)
        self.data_specs = (space, source)
        self.map_functions = tuple(map_fn_components)
        self.batch_buffers = batch_components

        # Defaults for iterators
        self._iter_mode = resolve_iterator_class("shuffled_sequential")
        self._iter_data_specs = (CompositeSpace((features_space, targets_space)), (features_source, targets_source))