示例#1
0
 def init_conv(self):
     """Create the convolutional front-end and keep it on ``self.conv``.

     The network is a ``CONV`` built from ``CONV_PARAMS`` and the current
     value of ``self.image_size``.
     """
     conv_net = CONV(CONV_PARAMS, self.image_size)
     self.conv = conv_net
示例#2
0
class LSTM_GMM:
    def __init__(self,
                 dims_tuple,
                 lstm_dim_list,
                 gmm_dim,
                 learning_rate=0.0000001,
                 samplerate=48000,
                 with_conv=False):
        self.debug = 0
        self.lr = learning_rate  # this is useless as we use Adam
        self.orth_scale = 0.9
        self.samplerate = samplerate

        self.time_dim = dims_tuple[0]
        self.batch_dim = dims_tuple[1]
        self.input_dim = dims_tuple[2]
        self.sequence_dim = dims_tuple[3]
        self.output_dim = dims_tuple[4]
        self.gmm_dim = gmm_dim

        self.lstm_layers_dim = lstm_dim_list

        assert self.gmm_dim * 3 == self.output_dim

        if with_conv:
            self.image_size = IMAGE_SIZE
        else:
            self.image_size = None

    def init_conv(self):
        """Create the convolutional front-end and keep it on ``self.conv``.

        The network is a ``CONV`` built from ``CONV_PARAMS`` and the current
        value of ``self.image_size``.
        """
        conv_net = CONV(CONV_PARAMS, self.image_size)
        self.conv = conv_net

    def build_theano_functions(self):
        x = T.fmatrix('time_sequence')
        x = x.reshape((self.batch_dim, self.sequence_dim, self.time_dim))

        y = x[:, 1:self.sequence_dim, :]
        x = x[:, :self.sequence_dim - 1, :]

        # if we try to include the spectrogram features
        spec_dims = 0
        if self.image_size is not None:
            print "Convolution activated"
            self.init_conv()
            spec = T.ftensor4('spectrogram')
            spec_features, spec_dims = self.conv.build_conv_layers(spec)
            print "Conv final dims =", spec_dims
            spec_dims = np.prod(spec_dims)
            spec_features = spec_features.reshape(
                (self.batch_dim, self.sequence_dim - 1, spec_dims))
            x = T.concatenate([x, spec_features], axis=2)

        layers_input = [x]
        dims = np.array([self.time_dim + spec_dims])
        for dim in self.lstm_layers_dim:
            dims = np.append(dims, dim)
        print "Dimensions =", dims

        # layer is just an index of the layer
        for layer in range(len(self.lstm_layers_dim)):

            # before the cell, input, forget and output gates, x needs to
            # be transformed
            linear = Linear(dims[layer],
                            dims[layer + 1] * 4,
                            weights_init=Orthogonal(self.orth_scale),
                            biases_init=Constant(0),
                            name="linear" + str(layer))
            linear.initialize()
            lstm_input = linear.apply(layers_input[layer])

            # the lstm wants batch X sequence X time
            lstm = LSTM(dim=dims[layer + 1],
                        weights_init=IsotropicGaussian(mean=0., std=0.5),
                        biases_init=Constant(1),
                        name="lstm" + str(layer))
            lstm.initialize()
            # hack to use Orthogonal on lstm w_state
            lstm.W_state.set_value(
                self.orth_scale *
                Orthogonal().generate(np.random,
                                      lstm.W_state.get_value().shape))
            h, _dummy = lstm.apply(lstm_input)

            layers_input.append(h)

        # this is where Alex Graves' paper starts
        print "Last linear transform dim :", dims[1:].sum()
        output_transform = Linear(dims[1:].sum(),
                                  self.output_dim,
                                  weights_init=Orthogonal(self.orth_scale),
                                  use_bias=False,
                                  name="output_transform")
        output_transform.initialize()
        if len(self.lstm_layers_dim) == 1:
            print "hallo there, only one layer speaking"
            y_hat = output_transform.apply(layers_input[-1])
        else:
            y_hat = output_transform.apply(
                T.concatenate(layers_input[1:], axis=2))

        # transforms to find each gmm params (mu, pi, sig)
        # small hack to softmax a 3D tensor
        pis = T.reshape(
            T.nnet.softmax(
                T.reshape(
                    y_hat[:, :, :self.gmm_dim],
                    ((self.sequence_dim - 1) * self.batch_dim, self.gmm_dim))),
            (self.batch_dim, (self.sequence_dim - 1), self.gmm_dim))
        sig = T.exp(y_hat[:, :, self.gmm_dim:self.gmm_dim * 2]) + 1e-6
        mus = y_hat[:, :, self.gmm_dim * 2:]

        pis = pis[:, :, :, np.newaxis]
        mus = mus[:, :, :, np.newaxis]
        sig = sig[:, :, :, np.newaxis]
        y = y[:, :, np.newaxis, :]

        y = T.patternbroadcast(y, (False, False, True, False))
        mus = T.patternbroadcast(mus, (False, False, False, True))
        sig = T.patternbroadcast(sig, (False, False, False, True))

        # sum likelihood with targets
        # see blog for this crazy Pr() = sum log sum prod
        # axes :: (batch, sequence, mixture, time)
        expo_term = -0.5 * ((y - mus)**2) / sig**2
        coeff = T.log(T.maximum(1. / (T.sqrt(2. * np.pi) * sig), EPS))
        #coeff = T.log(1./(T.sqrt(2.*np.pi)*sig))
        sequences = coeff + expo_term
        log_sequences = T.log(pis + EPS) + T.sum(
            sequences, axis=3, keepdims=True)

        log_sequences_max = T.max(log_sequences, axis=2, keepdims=True)

        LL = -(log_sequences_max + T.log(EPS + T.sum(
            T.exp(log_sequences - log_sequences_max), axis=2, keepdims=True))
               ).mean()
        LL.name = "summed_likelihood"

        model = Model(LL)
        self.model = model
        parameters = model.parameters

        algorithm = GradientDescent(cost=LL,
                                    parameters=model.parameters,
                                    step_rule=Adam())

        f = theano.function([x], [pis, sig, mus])

        return algorithm, f

    def train(self):
        print "Loading data"
        datafile = self.get_datafile()
        nbexamples = datafile.num_examples
        nbexamples -= nbexamples % (self.sequence_dim * self.time_dim)

        train_stream = ReshapeTransformer(
            DataStream(dataset=datafile,
                       iteration_scheme=ShuffledBatchChunkScheme(
                           nbexamples, self.sequence_dim * self.time_dim)),
            self.sequence_dim, self.time_dim)

        if self.image_size is not None:
            train_stream = Mapping(train_stream,
                                   spec_mapping,
                                   add_sources=['spectrogram'])

        print "Building Theano Graph"
        algorithm, self.fprop = self.build_theano_functions()

        main_loop = MainLoop(algorithm=algorithm,
                             data_stream=train_stream,
                             model=self.model,
                             extensions=[
                                 FinishAfter(after_n_epochs=EPOCHS),
                                 TrainingDataMonitoring(
                                     [aggregation.mean(self.model.outputs[0])],
                                     prefix="train",
                                     after_epoch=True),
                                 Printing(),
                                 SaveParams(EXP_PATH + NAME, after_epoch=True)
                             ])

        main_loop.run()

    def load_model(self):
        model_path = EXP_PATH + NAME + "_params.pkl"
        print "Loading model at", model_path
        f = open(model_path)
        params = pkl.load(f)
        f.close()
        algorithm, self.fprop = self.build_theano_functions()
        self.model.set_parameter_values(params)

    def generate(self, seed=None, minutes=0.5):
        print "Generating module"
        timestep = self.time_dim * (self.sequence_dim - 1)
        samples = minutes * self.samplerate * 60
        song = np.zeros(samples, dtype=np.float32)

        if seed is None:
            datafile = self.get_datafile()
            seed = datafile.get_data(None, range(timestep))
            seed = seed[0].flatten()

        song[:timestep] = seed

        print
        for i in range(0, len(song) - self.time_dim - timestep, self.time_dim):
            sys.stdout.write('\rGenerating %d/%d samples' % (i, samples))
            sys.stdout.flush()

            params = self.fprop(song[i:i + timestep].reshape(
                (self.batch_dim, self.sequence_dim - 1, self.time_dim)))
            try:
                song[i + timestep:i + timestep +
                     self.time_dim] = self.sample_from_gmm(params)
            except ValueError:
                import ipdb
                ipdb.set_trace()

        write(EXP_PATH + "generation.wav", self.samplerate, song)

    def sample_from_gmm(self, params):
        """Draw ``self.time_dim`` samples from the mixture of the last frame.

        Args:
            params: sequence ``[pis, sig, mus]`` in the order returned by
                ``self.fprop``. ``build_theano_functions`` gives each of
                them the axes (batch, sequence, mixture, 1).

        Returns:
            The flattened result of ``gmm.sample(self.time_dim)``.
        """
        # There is one set of mixture param for every timestep;
        # only batch entry 0 and the last sequence position are used
        pis = np.array(params[0])
        sig = np.array(params[1])
        mus = np.array(params[2])

        gmm = GMM(self.gmm_dim, covariance_type='spherical', init_params='')
        gmm.weights_ = pis[0, -1, :]
        gmm.means_ = mus[0, -1, :]
        # sig is a standard deviation in the training likelihood (it enters
        # the Gaussian exponent as sig**2) while covars_ holds variances
        # NOTE(review): assumes GMM is scikit-learn's legacy mixture.GMM
        gmm.covars_ = sig[0, -1, :] ** 2

        return gmm.sample(self.time_dim).flatten()

    def get_datafile(self):
        try:
            datafile = H5PYDataset(DATAPATH,
                                   which_sets=('train', ),
                                   sources=['time_sequence'],
                                   load_in_memory=True)
        except IOError:
            print "Could not find the hdf5 file. Will try to generate it"
            raise NotImplementedError

        if self.image_size is not None:
            print "Image size attribute is not None, need to infer the image size of the spectrogram"
            # temporarly create all the streams and stuff to make one mapping, inside the
            # mapping are the image size. Probably a cleaner way to do this.
            nbexamples = datafile.num_examples
            nbexamples -= nbexamples % (self.sequence_dim * self.time_dim)
            dummy_stream = ReshapeTransformer(
                DataStream(dataset=datafile,
                           iteration_scheme=ShuffledBatchChunkScheme(
                               nbexamples, self.sequence_dim * self.time_dim)),
                self.sequence_dim, self.time_dim)
            dummy_stream = Mapping(dummy_stream,
                                   spec_mapping,
                                   add_sources=['spectrogram'])
            dummy_epoch_iterator = dummy_stream.get_epoch_iterator()
            dummy_data = next(dummy_epoch_iterator)
            dummy_data = dummy_data[1]
            self.image_size = (dummy_data.shape[2], dummy_data.shape[3])
            print "Img size found, it should be =", self.image_size

            del nbexamples
            del dummy_stream
            del dummy_epoch_iterator
            del dummy_data

        return datafile
示例#3
0
class LSTM_GMM :
    def __init__(self, dims_tuple, lstm_dim_list, gmm_dim, learning_rate=0.0000001, samplerate=48000, with_conv=False) :
        self.debug = 0
        self.lr = learning_rate # this is useless as we use Adam
        self.orth_scale = 0.9
        self.samplerate = samplerate

        self.time_dim = dims_tuple[0]
        self.batch_dim = dims_tuple[1]
        self.input_dim = dims_tuple[2]
        self.sequence_dim = dims_tuple[3]
        self.output_dim = dims_tuple[4]
        self.gmm_dim = gmm_dim

        self.lstm_layers_dim = lstm_dim_list

        assert self.gmm_dim*3 == self.output_dim

        if with_conv :
            self.image_size = IMAGE_SIZE
        else :
            self.image_size = None


    def init_conv(self):
        """Create the convolutional front-end and keep it on ``self.conv``.

        The network is a ``CONV`` built from ``CONV_PARAMS`` and the current
        value of ``self.image_size``.
        """
        conv_net = CONV(CONV_PARAMS, self.image_size)
        self.conv = conv_net


    def build_theano_functions(self):
        x = T.fmatrix('time_sequence')
        x = x.reshape((self.batch_dim, self.sequence_dim, self.time_dim))

        y = x[:,1:self.sequence_dim,:]
        x = x[:,:self.sequence_dim-1,:]

        # if we try to include the spectrogram features
        spec_dims = 0
        if self.image_size is not None :
            print "Convolution activated"
            self.init_conv()
            spec = T.ftensor4('spectrogram')
            spec_features, spec_dims = self.conv.build_conv_layers(spec)
            print "Conv final dims =", spec_dims
            spec_dims = np.prod(spec_dims)
            spec_features = spec_features.reshape(
                (self.batch_dim, self.sequence_dim-1, spec_dims))
            x = T.concatenate([x, spec_features], axis=2)

        layers_input = [x]
        dims =np.array([self.time_dim + spec_dims])
        for dim in self.lstm_layers_dim :
            dims = np.append(dims, dim)
        print "Dimensions =", dims

        # layer is just an index of the layer
        for layer in range(len(self.lstm_layers_dim)) :

            # before the cell, input, forget and output gates, x needs to
            # be transformed
            linear = Linear(dims[layer],
                            dims[layer+1]*4,
                            weights_init=Orthogonal(self.orth_scale),
                            biases_init=Constant(0),
                            name="linear"+str(layer))
            linear.initialize()
            lstm_input = linear.apply(layers_input[layer])

            # the lstm wants batch X sequence X time
            lstm = LSTM(
                dim=dims[layer+1],
                weights_init=IsotropicGaussian(mean=0.,std=0.5),
                biases_init=Constant(1),
                name="lstm"+str(layer))
            lstm.initialize()
            # hack to use Orthogonal on lstm w_state
            lstm.W_state.set_value(
                self.orth_scale*Orthogonal().generate(np.random, lstm.W_state.get_value().shape))
            h, _dummy = lstm.apply(lstm_input)

            layers_input.append(h)

        # this is where Alex Graves' paper starts
        print "Last linear transform dim :", dims[1:].sum()
        output_transform = Linear(dims[1:].sum(),
                                  self.output_dim,
                                  weights_init=Orthogonal(self.orth_scale),
                                  use_bias=False,
                                  name="output_transform")
        output_transform.initialize()
        if len(self.lstm_layers_dim) == 1 :
            print "hallo there, only one layer speaking"
            y_hat = output_transform.apply(layers_input[-1])
        else :
            y_hat = output_transform.apply(T.concatenate(layers_input[1:], axis=2))

        # transforms to find each gmm params (mu, pi, sig)
        # small hack to softmax a 3D tensor
        pis = T.reshape(
                    T.nnet.softmax(
                        T.reshape(y_hat[:,:,:self.gmm_dim], ((self.sequence_dim-1)*self.batch_dim, self.gmm_dim))),
                    (self.batch_dim, (self.sequence_dim-1), self.gmm_dim))
        sig = T.exp(y_hat[:,:,self.gmm_dim:self.gmm_dim*2])+1e-6
        mus = y_hat[:,:,self.gmm_dim*2:]

        pis = pis[:,:,:,np.newaxis]
        mus = mus[:,:,:,np.newaxis]
        sig = sig[:,:,:,np.newaxis]
        y = y[:,:,np.newaxis,:]

        y = T.patternbroadcast(y, (False, False, True, False))
        mus = T.patternbroadcast(mus, (False, False, False, True))
        sig = T.patternbroadcast(sig, (False, False, False, True))

        # sum likelihood with targets
        # see blog for this crazy Pr() = sum log sum prod
        # axes :: (batch, sequence, mixture, time)
        expo_term = -0.5*((y-mus)**2)/sig**2
        coeff = T.log(T.maximum(1./(T.sqrt(2.*np.pi)*sig), EPS))
        #coeff = T.log(1./(T.sqrt(2.*np.pi)*sig))
        sequences = coeff + expo_term
        log_sequences = T.log(pis + EPS) + T.sum(sequences, axis=3, keepdims=True)

        log_sequences_max = T.max(log_sequences, axis=2, keepdims=True)

        LL = -(log_sequences_max + T.log(EPS + T.sum(T.exp(log_sequences - log_sequences_max), axis=2, keepdims=True))).mean()
        LL.name = "summed_likelihood"

        model = Model(LL)
        self.model = model
        parameters = model.parameters

        algorithm = GradientDescent(
            cost=LL,
            parameters=model.parameters,
            step_rule=Adam())

        f = theano.function([x],[pis, sig, mus])

        return algorithm, f


    def train(self):
        print "Loading data"
        datafile = self.get_datafile()
        nbexamples = datafile.num_examples
        nbexamples -= nbexamples%(self.sequence_dim*self.time_dim)

        train_stream = ReshapeTransformer(
            DataStream(
                dataset=datafile,
                iteration_scheme=ShuffledBatchChunkScheme(
                    nbexamples, self.sequence_dim*self.time_dim)),
            self.sequence_dim,
            self.time_dim)

        if self.image_size is not None :
            train_stream = Mapping(train_stream, spec_mapping, add_sources=['spectrogram'])

        print "Building Theano Graph"
        algorithm, self.fprop = self.build_theano_functions()

        main_loop = MainLoop(
            algorithm=algorithm,
            data_stream=train_stream,
            model=self.model,
            extensions=[
                FinishAfter(after_n_epochs=EPOCHS),
                TrainingDataMonitoring(
                    [aggregation.mean(self.model.outputs[0])],
                    prefix="train",
                    after_epoch=True),
                Printing(),
                SaveParams(EXP_PATH+NAME, after_epoch=True)
            ])

        main_loop.run()


    def load_model(self) :
        model_path = EXP_PATH+NAME+"_params.pkl"
        print "Loading model at", model_path
        f = open(model_path)
        params = pkl.load(f)
        f.close()
        algorithm, self.fprop = self.build_theano_functions()
        self.model.set_parameter_values(params)


    def generate(self, seed=None, minutes=0.5):
        print "Generating module"
        timestep = self.time_dim*(self.sequence_dim-1)
        samples = minutes*self.samplerate*60
        song = np.zeros(samples, dtype=np.float32)

        if seed is None :
            datafile = self.get_datafile()
            seed = datafile.get_data(None, range(timestep))
            seed = seed[0].flatten()

        song[:timestep] = seed

        print
        for i in range(0, len(song)-self.time_dim-timestep, self.time_dim) :
            sys.stdout.write('\rGenerating %d/%d samples'%(i, samples))
            sys.stdout.flush()

            params = self.fprop(song[i:i+timestep].reshape(
                (self.batch_dim, self.sequence_dim-1, self.time_dim)))
            try :
                song[i+timestep:i+timestep+self.time_dim] = self.sample_from_gmm(params)
            except ValueError :
                import ipdb ; ipdb.set_trace()

        write(EXP_PATH+"generation.wav", self.samplerate, song)


    def sample_from_gmm(self, params):
        """Draw ``self.time_dim`` samples from the mixture of the last frame.

        Args:
            params: sequence ``[pis, sig, mus]`` in the order returned by
                ``self.fprop``. ``build_theano_functions`` gives each of
                them the axes (batch, sequence, mixture, 1).

        Returns:
            The flattened result of ``gmm.sample(self.time_dim)``.
        """
        # There is one set of mixture param for every timestep;
        # only batch entry 0 and the last sequence position are used
        pis = np.array(params[0])
        sig = np.array(params[1])
        mus = np.array(params[2])

        gmm = GMM(self.gmm_dim, covariance_type='spherical', init_params='')
        gmm.weights_ = pis[0, -1, :]
        gmm.means_ = mus[0, -1, :]
        # sig is a standard deviation in the training likelihood (it enters
        # the Gaussian exponent as sig**2) while covars_ holds variances
        # NOTE(review): assumes GMM is scikit-learn's legacy mixture.GMM
        gmm.covars_ = sig[0, -1, :] ** 2

        return gmm.sample(self.time_dim).flatten()


    def get_datafile(self) :
        try :
            datafile = H5PYDataset(DATAPATH, which_sets=('train', ),
                                   sources=['time_sequence'], load_in_memory=True)
        except IOError :
            print "Could not find the hdf5 file. Will try to generate it"
            raise NotImplementedError

        if self.image_size is not None :
            print "Image size attribute is not None, need to infer the image size of the spectrogram"
            # temporarly create all the streams and stuff to make one mapping, inside the
            # mapping are the image size. Probably a cleaner way to do this.
            nbexamples = datafile.num_examples
            nbexamples -= nbexamples%(self.sequence_dim*self.time_dim)
            dummy_stream = ReshapeTransformer(
                DataStream(
                    dataset=datafile,
                    iteration_scheme=ShuffledBatchChunkScheme(
                        nbexamples, self.sequence_dim*self.time_dim)),
                self.sequence_dim,
                self.time_dim)
            dummy_stream = Mapping(dummy_stream, spec_mapping, add_sources=['spectrogram'])
            dummy_epoch_iterator = dummy_stream.get_epoch_iterator()
            dummy_data = next(dummy_epoch_iterator)
            dummy_data = dummy_data[1]
            self.image_size = (dummy_data.shape[2], dummy_data.shape[3])
            print "Img size found, it should be =", self.image_size

            del nbexamples
            del dummy_stream
            del dummy_epoch_iterator
            del dummy_data

        return datafile
示例#4
0
 def init_conv(self):
     """Create the convolutional front-end and keep it on ``self.conv``.

     The network is a ``CONV`` built from ``CONV_PARAMS`` and the current
     value of ``self.image_size``.
     """
     conv_net = CONV(CONV_PARAMS, self.image_size)
     self.conv = conv_net