Example #1
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

# Layer and Base_VAE are assumed to be defined elsewhere in this project.


class M2_VAE(Base_VAE):
    def __init__(self,
                 hyper_params=None,
                 optimize_params=None,
                 model_params=None):
        super(M2_VAE, self).__init__(hyper_params,
                                     optimize_params,
                                     model_params,
                                     model_name='M2')

    def init_model_params(self, dim_x, dim_y):
        print 'Initializing M2 model parameters'

        dim_z = self.hyper_params['dim_z']
        n_hidden = self.hyper_params['n_hidden']  # [500, 500, 500]
        n_hidden_recognize = n_hidden
        n_hidden_generate = n_hidden[::-1]

        self.type_px = self.hyper_params['type_px']

        activation = {
            'tanh': T.tanh,
            'relu': self.relu,
            'softplus': self.softplus,
            'sigmoid': T.nnet.sigmoid,
            'none': self.identify,
        }

        self.nonlinear_q = activation[self.hyper_params['nonlinear_q']]
        self.nonlinear_p = activation[self.hyper_params['nonlinear_p']]
        if self.type_px == 'bernoulli':
            output_f = activation['sigmoid']
        elif self.type_px == 'gaussian':
            output_f = activation['none']

        # Recognize model
        self.recognize_layers = [
            Layer(param_shape=(dim_x, n_hidden_recognize[0]),
                  function=self.identify,
                  nonbias=True),
            Layer(param_shape=(dim_y, n_hidden_recognize[0]),
                  function=self.identify)
        ]
        if len(n_hidden_recognize) > 1:
            self.recognize_layers += [
                Layer(param_shape=shape,
                      function=self.nonlinear_q) for shape in zip(
                          n_hidden_recognize[:-1], n_hidden_recognize[1:])
            ]
        self.recognize_mean_layer = Layer(param_shape=(n_hidden_recognize[-1],
                                                       dim_z),
                                          function=self.identify)
        self.recognize_log_var_layer = Layer(
            param_shape=(n_hidden_recognize[-1], dim_z),
            function=self.identify,
            w_zero=True,
            b_zero=True)

        # Generate Model
        self.generate_layers = [
            Layer((dim_z, n_hidden_generate[0]),
                  function=self.identify,
                  nonbias=True),
            Layer((dim_y, n_hidden_generate[0]), function=self.identify),
        ]
        if len(n_hidden) > 1:
            self.generate_layers += [
                Layer(param_shape=shape, function=self.nonlinear_p)
                for shape in zip(n_hidden_generate[:-1], n_hidden_generate[1:])
            ]
        self.generate_mean_layer = Layer(param_shape=(n_hidden_generate[-1],
                                                      dim_x),
                                         function=output_f)
        self.generate_log_var_layer = Layer(param_shape=(n_hidden_generate[-1],
                                                         dim_x),
                                            function=self.identify,
                                            b_zero=True)

        # Add all parameters
        self.model_params_ = (
            [param for layer in self.recognize_layers for param in layer.params] +
            self.recognize_mean_layer.params +
            self.recognize_log_var_layer.params +
            [param for layer in self.generate_layers for param in layer.params] +
            self.generate_mean_layer.params)

        if self.type_px == 'gaussian':
            self.model_params_ += self.generate_log_var_layer.params

    def recognize_model(self, X, Y):
        for i, layer in enumerate(self.recognize_layers):
            if i == 0:
                layer_out = layer.fprop(X)
            elif i == 1:
                layer_out += layer.fprop(Y)
                layer_out = self.nonlinear_q(layer_out)
            else:
                layer_out = layer.fprop(layer_out)

        q_mean = self.recognize_mean_layer.fprop(layer_out)
        q_log_var = self.recognize_log_var_layer.fprop(layer_out)

        return {
            'q_mean': q_mean,
            'q_log_var': q_log_var,
        }

    def generate_model(self, Z, Y):
        for i, layer in enumerate(self.generate_layers):
            if i == 0:
                layer_out = layer.fprop(Z)
            elif i == 1:
                layer_out += layer.fprop(Y)
                layer_out = self.nonlinear_p(layer_out)
            else:
                layer_out = layer.fprop(layer_out)

        p_mean = self.generate_mean_layer.fprop(layer_out)
        p_log_var = self.generate_log_var_layer.fprop(layer_out)

        return {'p_mean': p_mean, 'p_log_var': p_log_var}

    def encode(self, x, y):
        if self.encode_main is None:
            X = T.matrix()
            Y = T.matrix()
            self.encode_main = theano.function(inputs=[X, Y],
                                               outputs=self.recognize_model(
                                                   X, Y)['q_mean'])
        return self.encode_main(x, y)

    def decode(self, z, y):
        if self.decode_main is None:
            Z = T.matrix()
            Y = T.matrix()
            self.decode_main = theano.function(inputs=[Z, Y],
                                               outputs=self.generate_model(
                                                   Z, Y)['p_mean'])
        return self.decode_main(z, y)

    def get_expr_lbound(self, X, Y):
        n_samples = X.shape[0]

        recognized_zs = self.recognize_model(X, Y)
        q_mean = recognized_zs['q_mean']
        q_log_var = recognized_zs['q_log_var']

        eps = self.rng_noise.normal(avg=0., std=1., size=q_mean.shape).astype(
            theano.config.floatX)
        # T.exp(0.5 * q_log_var) = std
        # z = mean_z + std * epsilon
        z_tilda = q_mean + T.exp(0.5 * q_log_var) * eps

        generated_x = self.generate_model(z_tilda, Y)
        p_mean = generated_x['p_mean']
        p_log_var = generated_x['p_log_var']

        if self.type_px == 'gaussian':
            # log N(x; mu, sigma^2) = -0.5*log(2*pi) - 0.5*log(sigma^2) - (x - mu)^2 / (2*sigma^2)
            log_p_x_given_z = (-0.5 * np.log(2 * np.pi) - 0.5 * p_log_var -
                               0.5 * (X - p_mean)**2 / T.exp(p_log_var))
        elif self.type_px == 'bernoulli':
            # log_p_x_given_z = X * T.log(p_mean) + (1 - X) * T.log(1 - p_mean)
            log_p_x_given_z = -T.nnet.binary_crossentropy(p_mean, X)

        logqz = -0.5 * (np.log(2 * np.pi) + 1 + q_log_var)
        logpz = -0.5 * (np.log(2 * np.pi) + q_mean**2 + T.exp(q_log_var))
        # logqz = - 0.5 * T.sum(np.log(2 * np.pi) + 1 + q_log_var, axis=1)
        # logpz = - 0.5 * T.sum(np.log(2 * np.pi) + q_mean ** 2 + T.exp(q_log_var), axis=1)
        D_KL = T.sum(logpz - logqz)
        recon_error = T.sum(log_p_x_given_z)

        return D_KL, recon_error
        # return log_p_x_given_z, logpz, logqz

    def fit(self, x_datas, y_labels):
        X = T.matrix()
        Y = T.matrix()
        self.rng_noise = RandomStreams(self.hyper_params['rng_seed'])
        self.init_model_params(dim_x=x_datas.shape[1], dim_y=y_labels.shape[1])

        D_KL, recon_error = self.get_expr_lbound(X, Y)
        L = D_KL + recon_error

        print 'start fitting'
        gparams = T.grad(cost=L, wrt=self.model_params_)

        optimizer = {
            'sgd': self.sgd,
            'adagrad': self.adagrad,
            'adadelta': self.adaDelta,
            'rmsprop': self.rmsProp,
            'adam': self.adam
        }

        updates = optimizer[self.hyper_params['optimizer']](
            self.model_params_, gparams, self.optimize_params)
        self.hist = self.early_stopping(
            # self.hist = self.optimize(
            X,
            Y,
            x_datas,
            y_labels,
            self.optimize_params,
            L,
            updates,
            self.rng,
            D_KL,
            recon_error,
        )

    def optimize(self, X, Y, x_datas, y_labels, hyper_params, cost, updates,
                 rng, D_KL, recon_error):
        n_iters = hyper_params['n_iters']
        minibatch_size = hyper_params['minibatch_size']
        n_mod_history = hyper_params['n_mod_history']

        train_x = x_datas[:50000]
        valid_x = x_datas[50000:]

        train_y = y_labels[:50000]
        valid_y = y_labels[50000:]

        train = theano.function(inputs=[X, Y],
                                outputs=[cost, D_KL, recon_error],
                                updates=updates)

        validate = theano.function(inputs=[X, Y],
                                   outputs=[cost, D_KL, recon_error])

        n_samples = train_x.shape[0]
        cost_history = []

        total_cost = 0
        total_dkl = 0
        total_recon_error = 0
        for i in xrange(n_iters):
            ixs = rng.permutation(n_samples)
            for j in xrange(0, n_samples, minibatch_size):
                cost, D_KL, recon_error = train(
                    train_x[ixs[j:j + minibatch_size]],
                    train_y[ixs[j:j + minibatch_size]])
                # print np.sum(hoge(train_x[:1])[0])
                total_cost += cost
                total_dkl += D_KL
                total_recon_error += recon_error

            if np.mod(i, n_mod_history) == 0:
                num = n_samples / minibatch_size
                print(
                    '%d epoch train D_KL error: %.3f, Reconstruction error: %.3f, total error: %.3f'
                    % (i, total_dkl / num, total_recon_error / num,
                       total_cost / num))
                total_cost = 0
                total_dkl = 0
                total_recon_error = 0
                valid_error, valid_dkl, valid_recon_error = validate(
                    valid_x, valid_y)
                print '\tvalid D_KL error: %.3f, Reconstruction error: %.3f, total error: %.3f' % (
                    valid_dkl, valid_recon_error, valid_error)
                cost_history.append((i, valid_error))
        return cost_history

    def early_stopping(self, X, Y, x_datas, y_labels, hyper_params, cost,
                       updates, rng, D_KL, recon_error):
        minibatch_size = hyper_params['minibatch_size']

        train_x = x_datas[:50000]
        valid_x = x_datas[50000:]

        train_y = y_labels[:50000]
        valid_y = y_labels[50000:]

        train = theano.function(inputs=[X, Y],
                                outputs=[cost, D_KL, recon_error],
                                updates=updates)

        validate = theano.function(
            inputs=[X, Y],
            outputs=cost,
        )

        n_samples = train_x.shape[0]
        cost_history = []
        best_params = None
        valid_best_error = -np.inf
        best_epoch = 0
        patience = 5000
        patience_increase = 2
        improvement_threshold = 1.005

        done_looping = False

        for i in xrange(1000000):
            if done_looping: break
            ixs = rng.permutation(n_samples)
            for j in xrange(0, n_samples, minibatch_size):
                cost, D_KL, recon_error = train(
                    train_x[ixs[j:j + minibatch_size]],
                    train_y[ixs[j:j + minibatch_size]])

                iter = i * (n_samples / minibatch_size) + j / minibatch_size

                if (iter + 1) % 50 == 0:
                    valid_error = 0.
                    for _ in xrange(3):
                        valid_error += validate(valid_x, valid_y)
                    valid_error /= 3
                    if i % 100 == 0:
                        print 'epoch %d, minibatch %d/%d, valid total error: %.3f' % (
                            i, j / minibatch_size + 1,
                            n_samples / minibatch_size, valid_error)
                    cost_history.append((i * j, valid_error))
                    if valid_error > valid_best_error:
                        if valid_error > valid_best_error * improvement_threshold:
                            patience = max(patience, iter * patience_increase)
                        # Snapshot current parameter values; assigning the list
                        # directly would keep referencing the same shared variables.
                        best_params = [param.get_value(borrow=False)
                                       for param in self.model_params_]
                        valid_best_error = valid_error
                        best_epoch = i

                if patience <= iter:
                    done_looping = True
                    break
        if best_params is not None:
            # Restore the best parameter values found during training.
            for param, best_value in zip(self.model_params_, best_params):
                param.set_value(best_value)
        print 'epoch %d, minibatch %d/%d, valid total error: %.3f' % (
            best_epoch, j / minibatch_size + 1, n_samples / minibatch_size,
            valid_best_error)
        return cost_history
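
For reference, a minimal usage sketch of M2_VAE follows. The hyperparameter values and toy data are illustrative assumptions only; Layer, Base_VAE, and the optimizer routines it names (self.adam, self.sgd, ...) come from elsewhere in this project, and the Base_VAE optimizer may read additional settings (such as a learning rate) from optimize_params.

# Illustrative configuration and toy data; adjust to the project's real settings.
rng = np.random.RandomState(0)
x_datas = (rng.rand(60000, 784) > 0.5).astype(theano.config.floatX)                 # binary inputs
y_labels = np.eye(10)[rng.randint(0, 10, size=60000)].astype(theano.config.floatX)  # one-hot labels

hyper_params = {
    'dim_z': 50,                # latent dimensionality
    'n_hidden': [500, 500],     # encoder hidden sizes (reversed for the decoder)
    'type_px': 'bernoulli',     # observation model: 'bernoulli' or 'gaussian'
    'nonlinear_q': 'softplus',  # encoder nonlinearity
    'nonlinear_p': 'softplus',  # decoder nonlinearity
    'optimizer': 'adam',        # 'sgd', 'adagrad', 'adadelta', 'rmsprop' or 'adam'
    'rng_seed': 1234,
}
optimize_params = {
    'minibatch_size': 100,      # early_stopping() reads this; the optimizer may need more keys
}

model = M2_VAE(hyper_params=hyper_params, optimize_params=optimize_params)
model.fit(x_datas, y_labels)            # first 50000 rows train, the rest validate
z = model.encode(x_datas, y_labels)     # posterior means of q(z | x, y)
x_rec = model.decode(z, y_labels)       # means of p(x | z, y)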
Example #2
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

# Layer is assumed to be defined elsewhere in this project.


class M1_GVAE(object):
    def __init__(self,
                 hyper_params=None,
                 sgd_params=None,
                 adagrad_params=None,
                 model_params=None):

        if (sgd_params is not None) and (adagrad_params is not None):
            raise ValueError('Error: select only one algorithm')

        self.hyper_params = hyper_params
        self.sgd_params = sgd_params
        self.adagrad_params = adagrad_params
        self.model_params = model_params

        self.rng = np.random.RandomState(hyper_params['rng_seed'])

        self.model_params_ = None
        self.decode_main = None
        self.encode_main = None

    def init_model_params(self, dim_x):
        print 'Initializing M1 model parameters'
        dim_z = self.hyper_params['dim_z']
        n_hidden = self.hyper_params['n_hidden']  # [500, 500, 500]
        self.type_px = self.hyper_params['type_px']

        def relu(x):
            return x * (x > 0) + 0.01 * x

        def softplus(x):
            return T.log(T.exp(x) + 1)

        activation = {
            'tanh': T.tanh,
            'relu': relu,
            'softplus': softplus,
            'sigmoid': T.nnet.sigmoid,
            'none': None
        }
        nonlinear_q = activation[self.hyper_params['nonlinear_q']]
        nonlinear_p = activation[self.hyper_params['nonlinear_p']]
        if self.type_px == 'bernoulli':
            output_f = activation['sigmoid']
        elif self.type_px == 'gaussian':
            output_f = activation['none']

        # Recognize model
        self.recognize_layers = [
            Layer((dim_x, n_hidden[0]), function=nonlinear_q)
        ]
        if len(n_hidden) > 1:
            self.recognize_layers += [
                Layer(shape, function=nonlinear_q)
                for shape in zip(n_hidden[:-1], n_hidden[1:])
            ]
        self.recognize_mean_layer = Layer((n_hidden[-1], dim_z), function=None)
        self.recognize_log_sigma_layer = Layer((n_hidden[-1], dim_z),
                                               function=None,
                                               w_zero=True,
                                               b_zero=True)

        # Generate Model
        self.generate_layers = [
            Layer((dim_z, n_hidden[0]), function=nonlinear_p)
        ]
        if len(n_hidden) > 1:
            self.generate_layers += [
                Layer(shape, function=nonlinear_p)
                for shape in zip(n_hidden[:-1], n_hidden[1:])
            ]
        self.generate_mean_layer = Layer((n_hidden[-1], dim_x),
                                         function=output_f)
        self.generate_log_sigma_layer = Layer((n_hidden[-1], dim_x),
                                              function=None,
                                              b_zero=True)

        self.model_params_ = (
            [param for layer in self.generate_layers for param in layer.params] +
            self.recognize_mean_layer.params +
            self.recognize_log_sigma_layer.params +
            [param for layer in self.recognize_layers for param in layer.params] +
            self.generate_mean_layer.params)

        if self.type_px == 'gaussian':
            self.model_params_ += self.generate_log_sigma_layer.params

    def generate_model(self, Z):

        for i, layer in enumerate(self.generate_layers):
            if i == 0:
                layer_out = layer.fprop(Z)
            else:
                layer_out = layer.fprop(layer_out)

        p_mean = self.generate_mean_layer.fprop(layer_out)
        p_log_var = self.generate_log_sigma_layer.fprop(layer_out)

        return {
            # 'mu': 0.5 * (T.tanh(p_mean) + 1), # 0 <= mu <= 1
            # 'log_sigma': 3 * T.tanh(p_log_var) - 1, # -4 <= log sigma **2 <= 2
            # 'mu': T.clip(p_mean, 0., 1.),
            # 'log_sigma': T.clip(p_log_var, -4., 2.)
            'mu': p_mean,
            'log_sigma': p_log_var
        }

    def recognize_model(self, X):

        for i, layer in enumerate(self.recognize_layers):
            if i == 0:
                layer_out = layer.fprop(X)
            else:
                layer_out = layer.fprop(layer_out)

        q_mean = self.recognize_mean_layer.fprop(layer_out)
        q_log_var = self.recognize_log_sigma_layer.fprop(layer_out)

        return {
            'mu': q_mean,
            # 'log_sigma': 3 * T.tanh(q_log_var) - 1,
            # 'log_sigma': T.clip(q_log_var, -4., 2.)
            'log_sigma': q_log_var
        }

    def decode(self, z):
        if self.decode_main is None:
            Z = T.matrix()
            self.decode_main = theano.function(
                inputs=[Z], outputs=self.generate_model(Z)['mu'])
        return self.decode_main(z)

    def encode(self, x):
        if self.encode_main is None:
            X = T.matrix()
            self.encode_main = theano.function(
                inputs=[X], outputs=self.recognize_model(X)['mu'])
        return self.encode_main(x)

    def get_expr_lbound(self, X):
        n_mc_sampling = self.hyper_params['n_mc_sampling']
        n_samples = X.shape[0]
        dim_z = self.hyper_params['dim_z']

        stats_z = self.recognize_model(X)
        q_mean = stats_z['mu']
        q_log_var = stats_z['log_sigma']

        eps = self.rng_noise.normal(size=(n_mc_sampling, n_samples, dim_z))
        z_tilda = q_mean + T.exp(0.5 * q_log_var) * eps

        stats_x = self.generate_model(z_tilda)
        p_mean = stats_x['mu']
        p_log_var = stats_x['log_sigma']

        if self.type_px == 'gaussian':
            # log N(x; mu, sigma^2) = -0.5*log(2*pi) - 0.5*log(sigma^2) - (x - mu)^2 / (2*sigma^2)
            log_p_x_given_z = (-0.5 * np.log(2 * np.pi) - 0.5 * p_log_var -
                               0.5 * (X - p_mean)**2 / T.exp(p_log_var))
        elif self.type_px == 'bernoulli':
            log_p_x_given_z = X * T.log(p_mean) + (1 - X) * T.log(1 - p_mean)

        logqz = -0.5 * T.sum(np.log(2 * np.pi) + 1 + q_log_var)
        logpz = -0.5 * T.sum(np.log(2 * np.pi) + q_mean**2 + T.exp(q_log_var))
        consts = []

        return (T.sum(log_p_x_given_z) / n_mc_sampling +
                (logpz - logqz)) / n_samples, consts

    def fit(self, x_datas):
        X = T.matrix()
        self.rng_noise = RandomStreams(self.hyper_params['rng_seed'])
        self.init_model_params(dim_x=x_datas.shape[1])

        lbound, consts = self.get_expr_lbound(X)
        cost = -lbound

        print 'start fitting'
        self.hist = self.adam_calc(x_datas, cost, consts, X,
                                   self.model_params_, self.adagrad_params,
                                   self.rng)

    def adagrad_calc(self, x_datas, cost, consts, X, model_params,
                     hyper_params, rng):
        n_iters = hyper_params['n_iters']
        learning_rate = hyper_params['learning_rate']
        minibatch_size = hyper_params['minibatch_size']
        n_mod_history = hyper_params['n_mod_history']
        calc_history = hyper_params['calc_history']

        hs = [
            theano.shared(
                np.ones(param.get_value(borrow=True).shape).astype(
                    theano.config.floatX)) for param in model_params
        ]

        gparams = T.grad(cost=cost, wrt=model_params, consider_constant=consts)
        updates = [(param, param - learning_rate / (T.sqrt(h)) * gparam)
                   for param, gparam, h in zip(model_params, gparams, hs)]
        updates += [(h, h + gparam**2) for gparam, h in zip(gparams, hs)]

        train = theano.function(inputs=[X], outputs=cost, updates=updates)

        validate = theano.function(inputs=[X], outputs=cost)

        n_samples = x_datas.shape[0]
        cost_history = []

        for i in xrange(n_iters):
            ixs = rng.permutation(n_samples)[:minibatch_size]
            minibatch_cost = train(x_datas[ixs])
            # print minibatch_cost

            if np.mod(i, n_mod_history) == 0:
                print '%d epoch error: %f' % (i, minibatch_cost)
                if calc_history == 'minibatch':
                    cost_history.append((i, minibatch_cost))
                else:
                    cost_history.append((i, validate(x_datas[ixs])))
        return cost_history

    def adam_calc(self, x_datas, cost, consts, X, model_params, hyper_params,
                  rng):
        n_iters = hyper_params['n_iters']
        learning_rate = hyper_params['learning_rate']
        minibatch_size = hyper_params['minibatch_size']
        n_mod_history = hyper_params['n_mod_history']
        calc_history = hyper_params['calc_history']

        rs = [
            theano.shared(
                np.ones(param.get_value(borrow=True).shape).astype(
                    theano.config.floatX)) for param in model_params
        ]
        vs = [
            theano.shared(
                np.ones(param.get_value(borrow=True).shape).astype(
                    theano.config.floatX)) for param in model_params
        ]
        ts = [
            theano.shared(
                np.ones(param.get_value(borrow=True).shape).astype(
                    theano.config.floatX)) for param in model_params
        ]

        gnma = 0.999
        beta = 0.9
        weight_decay = 1000 / 50000.

        gparams = T.grad(cost=cost, wrt=model_params, consider_constant=consts)

        updates = [(param, param - learning_rate /
                    (T.sqrt(r / (1 - gnma**t))) * v / (1 - beta**t))
                   for param, r, v, t in zip(model_params, rs, vs, ts)]
        updates += [
            (r, gnma * r + (1 - gnma) * (gparam - weight_decay * param)**2)
            for param, gparam, r in zip(model_params, gparams, rs)
        ]
        updates += [(v,
                     beta * v + (1 - beta) * (gparam - weight_decay * param))
                    for param, gparam, v in zip(model_params, gparams, vs)]
        updates += [(t, t + 1) for t in ts]

        train = theano.function(inputs=[X], outputs=cost, updates=updates)

        validate = theano.function(inputs=[X], outputs=cost)

        n_samples = x_datas.shape[0]
        cost_history = []

        for i in xrange(n_iters):
            ixs = rng.permutation(n_samples)[:minibatch_size]
            minibatch_cost = train(x_datas[ixs])
            # print minibatch_cost

            if np.mod(i, n_mod_history) == 0:
                print '%d epoch error: %f' % (i, minibatch_cost)
                if calc_history == 'minibatch':
                    cost_history.append((i, minibatch_cost))
                else:
                    cost_history.append((i, validate(x_datas[ixs])))
        return cost_history
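
Similarly, a minimal usage sketch of M1_GVAE under the same caveats (illustrative values and toy data; Layer is defined elsewhere in this project; note that fit() runs adam_calc but takes its settings from adagrad_params):

# Illustrative configuration and toy data; adjust to the project's real settings.
rng = np.random.RandomState(0)
x_datas = (rng.rand(10000, 784) > 0.5).astype(theano.config.floatX)   # binary inputs

hyper_params = {
    'dim_z': 50,
    'n_hidden': [500, 500],
    'type_px': 'bernoulli',
    'nonlinear_q': 'softplus',
    'nonlinear_p': 'softplus',
    'n_mc_sampling': 1,          # Monte Carlo samples of z per data point
    'rng_seed': 1234,
}
adagrad_params = {               # fit() passes this dict to adam_calc()
    'n_iters': 1000,
    'learning_rate': 0.01,
    'minibatch_size': 100,
    'n_mod_history': 100,
    'calc_history': 'minibatch',
}

model = M1_GVAE(hyper_params=hyper_params, adagrad_params=adagrad_params)
model.fit(x_datas)
z = model.encode(x_datas)        # posterior means of q(z | x)
x_rec = model.decode(z)          # means of p(x | z)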
Example #3
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

# Layer is assumed to be defined elsewhere in this project.


class M1_GVAE(object):
    def __init__(
        self,
        hyper_params=None,
        sgd_params=None,
        adagrad_params=None,
        model_params=None
    ):

        if (sgd_params is not None) and (adagrad_params is not None):
            raise ValueError('Error: select only one algorithm')

        self.hyper_params = hyper_params
        self.sgd_params = sgd_params
        self.adagrad_params = adagrad_params
        self.model_params = model_params

        self.rng = np.random.RandomState(hyper_params['rng_seed'])

        self.model_params_ = None
        self.decode_main = None
        self.encode_main = None



    def init_model_params(self, dim_x):
        print 'Initializing M1 model parameters'
        dim_z = self.hyper_params['dim_z']
        n_hidden = self.hyper_params['n_hidden'] # [500, 500, 500]
        self.type_px = self.hyper_params['type_px']

        def relu(x):
            return x * (x > 0) + 0.01 * x

        def softplus(x):
            return T.log(T.exp(x) + 1)

        activation = {'tanh': T.tanh, 'relu': relu, 'softplus': softplus,
                      'sigmoid': T.nnet.sigmoid, 'none': None}
        nonlinear_q = activation[self.hyper_params['nonlinear_q']]
        nonlinear_p = activation[self.hyper_params['nonlinear_p']]
        if self.type_px == 'bernoulli':
            output_f = activation['sigmoid']
        elif self.type_px == 'gaussian':
            output_f = activation['none']

        # Recognize model
        self.recognize_layers = [Layer((dim_x, n_hidden[0]), function=nonlinear_q)]
        if len(n_hidden) > 1:
            self.recognize_layers += [Layer(shape, function=nonlinear_q) for shape in zip(n_hidden[:-1], n_hidden[1:])]
        self.recognize_mean_layer = Layer((n_hidden[-1], dim_z), function=None)
        self.recognize_log_sigma_layer = Layer((n_hidden[-1], dim_z), function=None, w_zero=True, b_zero=True)


        # Generate Model
        self.generate_layers = [Layer((dim_z, n_hidden[0]), function=nonlinear_p)]
        if len(n_hidden) > 1:
            self.generate_layers += [Layer(shape, function=nonlinear_p) for shape in zip(n_hidden[:-1], n_hidden[1:])]
        self.generate_mean_layer = Layer((n_hidden[-1], dim_x), function=output_f)
        self.generate_log_sigma_layer = Layer((n_hidden[-1], dim_x), function=None, b_zero=True)


        self.model_params_ = (
            [param for layer in self.generate_layers for param in layer.params] +
            self.recognize_mean_layer.params +
            self.recognize_log_sigma_layer.params +
            [param for layer in self.recognize_layers for param in layer.params] +
            self.generate_mean_layer.params
        )

        if self.type_px == 'gaussian':
            self.model_params_ += self.generate_log_sigma_layer.params

    def generate_model(self, Z):

        for i, layer in enumerate(self.generate_layers):
            if i == 0:
                layer_out = layer.fprop(Z)
            else:
                layer_out = layer.fprop(layer_out)

        p_mean = self.generate_mean_layer.fprop(layer_out)
        p_log_var = self.generate_log_sigma_layer.fprop(layer_out)

        return {
            # 'mu': 0.5 * (T.tanh(p_mean) + 1), # 0 <= mu <= 1
            # 'log_sigma': 3 * T.tanh(p_log_var) - 1, # -4 <= log sigma **2 <= 2
            # 'mu': T.clip(p_mean, 0., 1.),
            # 'log_sigma': T.clip(p_log_var, -4., 2.)
            'mu': p_mean,
            'log_sigma': p_log_var
        }

    def recognize_model(self, X):

        for i, layer in enumerate(self.recognize_layers):
            if i == 0:
                layer_out = layer.fprop(X)
            else:
                layer_out = layer.fprop(layer_out)

        q_mean = self.recognize_mean_layer.fprop(layer_out)
        q_log_var = self.recognize_log_sigma_layer.fprop(layer_out)

        return {
            'mu': q_mean,
            # 'log_sigma': 3 * T.tanh(q_log_var) - 1,
            # 'log_sigma': T.clip(q_log_var, -4., 2.)
            'log_sigma': q_log_var
        }

    def decode(self, z):
        if self.decode_main is None:
            Z = T.matrix()
            self.decode_main = theano.function(
                inputs=[Z],
                outputs=self.generate_model(Z)['mu']
            )
        return self.decode_main(z)

    def encode(self, x):
        if self.encode_main is None:
            X = T.matrix()
            self.encode_main = theano.function(
                inputs=[X],
                outputs=self.recognize_model(X)['mu']
            )
        return self.encode_main(x)

    def get_expr_lbound(self, X):
        n_mc_sampling = self.hyper_params['n_mc_sampling']
        n_samples = X.shape[0]
        dim_z = self.hyper_params['dim_z']

        stats_z = self.recognize_model(X)
        q_mean = stats_z['mu']
        q_log_var = stats_z['log_sigma']

        eps = self.rng_noise.normal(size=(n_mc_sampling, n_samples, dim_z))
        z_tilda = q_mean + T.exp(0.5 * q_log_var) * eps

        stats_x = self.generate_model(z_tilda)
        p_mean = stats_x['mu']
        p_log_var = stats_x['log_sigma']

        if self.type_px == 'gaussian':
            # log N(x; mu, sigma^2) = -0.5*log(2*pi) - 0.5*log(sigma^2) - (x - mu)^2 / (2*sigma^2)
            log_p_x_given_z = (
                -0.5 * np.log(2 * np.pi) - 0.5 * p_log_var - 0.5 * (X - p_mean) ** 2 / T.exp(p_log_var)
            )
        elif self.type_px == 'bernoulli':
            log_p_x_given_z = X * T.log(p_mean) + (1 - X) * T.log(1 - p_mean)

        logqz = - 0.5 * T.sum(np.log(2 * np.pi) + 1 + q_log_var)
        logpz = - 0.5 * T.sum(np.log(2 * np.pi) + q_mean ** 2 + T.exp(q_log_var))
        consts = []

        return (T.sum(log_p_x_given_z) / n_mc_sampling + (logpz - logqz)) / n_samples, consts


    def fit(self, x_datas):
        X = T.matrix()
        self.rng_noise = RandomStreams(self.hyper_params['rng_seed'])
        self.init_model_params(dim_x=x_datas.shape[1])

        lbound, consts = self.get_expr_lbound(X)
        cost = -lbound

        print 'start fitting'
        self.hist = self.adam_calc(
            x_datas,
            cost,
            consts,
            X,
            self.model_params_,
            self.adagrad_params,
            self.rng
        )

    def adagrad_calc(self, x_datas, cost, consts, X, model_params, hyper_params, rng):
        n_iters = hyper_params['n_iters']
        learning_rate = hyper_params['learning_rate']
        minibatch_size = hyper_params['minibatch_size']
        n_mod_history = hyper_params['n_mod_history']
        calc_history = hyper_params['calc_history']

        hs = [theano.shared(np.ones(
                    param.get_value(borrow=True).shape
                ).astype(theano.config.floatX))
            for param in model_params]

        gparams = T.grad(
            cost=cost,
            wrt=model_params,
            consider_constant=consts
        )
        updates = [(param, param - learning_rate / (T.sqrt(h)) * gparam)
                    for param, gparam, h in zip(model_params, gparams, hs)]
        updates += [(h, h + gparam ** 2) for gparam, h in zip(gparams, hs)]

        train = theano.function(
            inputs=[X],
            outputs=cost,
            updates=updates
        )

        validate = theano.function(
            inputs=[X],
            outputs=cost
        )

        n_samples = x_datas.shape[0]
        cost_history = []

        for i in xrange(n_iters):
            ixs = rng.permutation(n_samples)[:minibatch_size]
            minibatch_cost = train(x_datas[ixs])
            # print minibatch_cost

            if np.mod(i, n_mod_history) == 0:
                print '%d epoch error: %f' % (i, minibatch_cost)
                if calc_history == 'minibatch':
                    cost_history.append((i, minibatch_cost))
                else:
                    cost_history.append((i, validate(x_datas[ixs])))
        return cost_history


    def adam_calc(self, x_datas, cost, consts, X, model_params, hyper_params, rng):
        n_iters = hyper_params['n_iters']
        learning_rate = hyper_params['learning_rate']
        minibatch_size = hyper_params['minibatch_size']
        n_mod_history = hyper_params['n_mod_history']
        calc_history = hyper_params['calc_history']

        rs = [theano.shared(np.ones(
                    param.get_value(borrow=True).shape
                ).astype(theano.config.floatX))
            for param in model_params]
        vs = [theano.shared(np.ones(
                    param.get_value(borrow=True).shape
                ).astype(theano.config.floatX))
            for param in model_params]
        ts = [theano.shared(np.ones(
                    param.get_value(borrow=True).shape
                ).astype(theano.config.floatX))
            for param in model_params]

        gnma = 0.999
        beta = 0.9
        weight_decay = 1000 / 50000.

        gparams = T.grad(
            cost=cost,
            wrt=model_params,
            consider_constant=consts
        )


        updates = [(param, param - learning_rate /
                    (T.sqrt(r / (1 - gnma ** t))) * v / (1 - beta ** t))
                   for param, r, v, t in zip(model_params, rs, vs, ts)]
        updates += [(r, gnma * r + (1 - gnma) * (gparam - weight_decay * param) ** 2)
                    for param, gparam, r in zip(model_params, gparams, rs)]
        updates += [(v, beta * v + (1 - beta) * (gparam - weight_decay * param))
                    for param, gparam, v in zip(model_params, gparams, vs)]
        updates += [(t, t + 1) for t in ts]


        train = theano.function(
            inputs=[X],
            outputs=cost,
            updates=updates
        )

        validate = theano.function(
            inputs=[X],
            outputs=cost
        )

        n_samples = x_datas.shape[0]
        cost_history = []

        for i in xrange(n_iters):
            ixs = rng.permutation(n_samples)[:minibatch_size]
            minibatch_cost = train(x_datas[ixs])
            # print minibatch_cost

            if np.mod(i, n_mod_history) == 0:
                print '%d epoch error: %f' % (i, minibatch_cost)
                if calc_history == 'minibatch':
                    cost_history.append((i, minibatch_cost))
                else:
                    cost_history.append((i, validate(x_datas[ixs])))
        return cost_history