def build_model(self, train_set, test_set, validation_set=None):
    """
    Build the variational lower bound from the initialized layers and
    compile it into training, test and validation functions.

    Besides returning the compiled functions, this method creates the
    symbolic scalar ``self.sym_warmup`` and fills ``self.train_args``,
    ``self.test_args`` and ``self.validate_args`` with default input
    values and output format strings.

    :param train_set: Train set containing variables x, t.
    :param test_set: Test set containing variables x, t.
    :param validation_set: Validation set containing variables x, t.
        When None, no validation function is compiled.
    :return: train, test, validation function (None without a validation
        set) and dicts of arguments.
    """
    super(CVAE, self).build_model(train_set, test_set, validation_set)

    n = self.sh_train_x.shape[0].astype(
        theano.config.floatX)  # no. of data points

    # Define the layers for the density estimation used in the lower bound.
    l_log_qz = GaussianLogDensityLayer(self.l_qz, self.l_qz_mu,
                                       self.l_qz_logvar)
    l_log_pz = StandardNormalLogDensityLayer(self.l_qz)

    # The input is flattened to (-1, seq_length * n_x) and the decoder
    # outputs are reshaped to match before the densities are evaluated.
    l_x_in = ReshapeLayer(self.l_x_in, (-1, self.seq_length * self.n_x))
    if self.x_dist == 'bernoulli':
        l_px = ReshapeLayer(
            self.l_px,
            (-1, self.sym_samples, 1, self.seq_length * self.n_x))
        l_log_px = BernoulliLogDensityLayer(l_px, l_x_in)
    elif self.x_dist == 'multinomial':
        l_px = ReshapeLayer(
            self.l_px,
            (-1, self.sym_samples, 1, self.seq_length * self.n_x))
        l_log_px = MultinomialLogDensityLayer(l_px, l_x_in)
    elif self.x_dist == 'gaussian':
        l_px_mu = ReshapeLayer(
            self.l_px_mu,
            (-1, self.sym_samples, 1, self.seq_length * self.n_x))
        l_px_logvar = ReshapeLayer(
            self.l_px_logvar,
            (-1, self.sym_samples, 1, self.seq_length * self.n_x))
        l_log_px = GaussianLogDensityLayer(l_x_in, l_px_mu, l_px_logvar)
    elif self.x_dist == 'linear':
        # No density layer: the raw decoder output is turned into a
        # reconstruction error further below.
        l_log_px = self.l_px

    self.sym_warmup = T.fscalar('warmup')

    def lower_bound(log_pz, log_qz, log_px):
        # The (log p(z) - log q(z)) term is weighted by (0.9 - warmup).
        # NOTE(review): confirm that the constant offset is intended.
        return log_px + (log_pz - log_qz) * (1. - self.sym_warmup - 0.1)

    # Lower bound
    out_layers = [l_log_pz, l_log_qz, l_log_px]
    inputs = {self.l_x_in: self.sym_x}
    out = get_output(out_layers, inputs,
                     batch_norm_update_averages=False,
                     batch_norm_use_averages=False)
    log_pz, log_qz, log_px = out

    # If the decoder output is linear we need the reconstruction error
    if self.x_dist == 'linear':
        log_px = -aggregate(
            squared_error(log_px.mean(axis=(1, 2)), self.sym_x),
            mode='mean')

    lb = lower_bound(log_pz, log_qz, log_px)
    lb = lb.mean(axis=(1, 2))  # Mean over the sampling dimensions

    if self.batchnorm:
        # TODO: implement the BN layer correctly.
        # The result of this call is discarded.
        inputs = {self.l_x_in: self.sym_x}
        get_output(out_layers, inputs, weighting=None,
                   batch_norm_update_averages=True,
                   batch_norm_use_averages=False)

    # Regularizing with weight priors p(theta|N(0,1)), collecting and
    # clipping gradients. Only parameters whose string form contains 'W'
    # contribute to the prior.
    weight_priors = 0.0
    for p in self.trainable_model_params:
        if 'W' not in str(p):
            continue
        weight_priors += log_normal(p, 0, 1).sum()

    # Collect the lower bound and scale it with the weight priors.
    # Dividing by -n turns the maximisation into a cost to minimise.
    elbo = lb.mean()
    cost = (elbo * n + weight_priors) / -n

    grads_collect = T.grad(cost, self.trainable_model_params)
    sym_beta1 = T.scalar('beta1')
    sym_beta2 = T.scalar('beta2')
    clip_grad, max_norm = 1, 5
    # Rescale by total norm first, then clip element-wise.
    mgrads = total_norm_constraint(grads_collect, max_norm=max_norm)
    mgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]
    updates = adam(mgrads, self.trainable_model_params, self.sym_lr,
                   sym_beta1, sym_beta2)
    # updates = rmsprop(mgrads, self.trainable_model_params, self.sym_lr + (0*sym_beta1*sym_beta2))

    # Training function
    x_batch = self.sh_train_x[self.batch_slice]
    if self.x_dist == 'bernoulli':  # Sample bernoulli input.
        x_batch = self._srng.binomial(size=x_batch.shape, n=1, p=x_batch,
                                      dtype=theano.config.floatX)

    givens = {self.sym_x: x_batch}
    inputs = [
        self.sym_index, self.sym_batchsize, self.sym_lr, sym_beta1,
        sym_beta2, self.sym_samples, self.sym_warmup
    ]
    outputs = [
        log_px.mean(), log_pz.mean(), log_qz.mean(), elbo, self.sym_warmup
    ]
    f_train = theano.function(inputs=inputs, outputs=outputs,
                              givens=givens, updates=updates)

    # Default training args. Note that these can be changed during or
    # prior to training.
    self.train_args['inputs']['batchsize'] = 100
    self.train_args['inputs']['learningrate'] = 1e-4
    self.train_args['inputs']['beta1'] = 0.9
    self.train_args['inputs']['beta2'] = 0.999
    self.train_args['inputs']['samples'] = 1
    self.train_args['inputs']['warmup'] = 0.1
    self.train_args['outputs']['log p(x)'] = '%0.6f'
    self.train_args['outputs']['log p(z)'] = '%0.6f'
    self.train_args['outputs']['log q(z)'] = '%0.6f'
    self.train_args['outputs']['elbo train'] = '%0.6f'
    self.train_args['outputs']['warmup'] = '%0.3f'

    # Validation and test function
    givens = {self.sym_x: self.sh_test_x}
    f_test = theano.function(inputs=[self.sym_samples, self.sym_warmup],
                             outputs=[elbo], givens=givens)

    # Test args. Note that these can be changed during or prior to
    # training.
    self.test_args['inputs']['samples'] = 1
    self.test_args['inputs']['warmup'] = 0.1
    self.test_args['outputs']['elbo test'] = '%0.6f'

    f_validate = None
    if validation_set is not None:
        givens = {self.sym_x: self.sh_valid_x}
        # elbo depends on sym_warmup, so it must be an explicit input here
        # exactly as it is for f_test; otherwise the graph has an
        # unprovided input and cannot be compiled.
        f_validate = theano.function(
            inputs=[self.sym_samples, self.sym_warmup],
            outputs=[elbo], givens=givens)

    # Default validation args. Note that these can be changed during or
    # prior to training.
    self.validate_args['inputs']['samples'] = 1
    self.validate_args['inputs']['warmup'] = 0.1
    self.validate_args['outputs']['elbo validation'] = '%0.6f'

    return f_train, f_test, f_validate, self.train_args, self.test_args, self.validate_args
def build_model(self,
                train_set_unlabeled,
                train_set_labeled,
                test_set,
                validation_set=None):
    """
    Build the auxiliary deep generative model from the initialized
    hyperparameters. Define the lower bound term and compile it into a
    training function.

    The labeled set is copied into new shared variables here; the unlabeled
    set, test set and validation set are handed to the parent class.
    Default input values and output format strings are written into
    ``self.train_args``, ``self.test_args`` and ``self.validate_args``.

    :param train_set_unlabeled: Unlabeled train set containing variables x, t.
    :param train_set_labeled: Labeled train set containing variables x, t.
    :param test_set: Test set containing variables x, t.
    :param validation_set: Validation set containing variables x, t.
    :return: train, test, validation function and dicts of arguments.
    """
    super(CSDGM, self).build_model(train_set_unlabeled, test_set,
                                   validation_set)

    # Element 0 of the labeled set is used as x, element 1 as t.
    sh_train_x_l = theano.shared(np.asarray(train_set_labeled[0],
                                            dtype=theano.config.floatX),
                                 borrow=True)
    sh_train_t_l = theano.shared(np.asarray(train_set_labeled[1],
                                            dtype=theano.config.floatX),
                                 borrow=True)
    n = self.sh_train_x.shape[0].astype(
        theano.config.floatX)  # no. of data points
    n_l = sh_train_x_l.shape[0].astype(
        theano.config.floatX)  # no. of labeled data points

    # Define the layers for the density estimation used in the lower bound.
    l_log_qa = GaussianLogDensityLayer(self.l_qa, self.l_qa_mu,
                                       self.l_qa_logvar)
    l_log_qz = GaussianLogDensityLayer(self.l_qz, self.l_qz_mu,
                                       self.l_qz_logvar)
    l_log_qy = MultinomialLogDensityLayer(self.l_qy, self.l_y_in, eps=1e-8)

    l_log_pz = StandardNormalLogDensityLayer(self.l_qz)
    l_log_pa = GaussianLogDensityLayer(self.l_qa, self.l_pa_mu,
                                       self.l_pa_logvar)

    l_x_in = ReshapeLayer(self.l_x_in, (-1, self.n_l * self.n_c))
    l_px = DimshuffleLayer(self.l_px, (0, 3, 1, 2, 4))
    l_px = ReshapeLayer(l_px, (-1, self.sym_samples, 1, self.n_c))
    if self.x_dist == 'bernoulli':
        # NOTE(review): this branch uses the un-reshaped self.l_px and
        # self.l_x_in rather than the local l_px / l_x_in built above;
        # confirm that this is intended.
        l_log_px = BernoulliLogDensityLayer(self.l_px, self.l_x_in)
    elif self.x_dist == 'multinomial':
        l_log_px = MultinomialLogDensityLayer(l_px, l_x_in)
        l_log_px = ReshapeLayer(l_log_px, (-1, self.n_l, 1, 1, 1))
        l_log_px = MeanLayer(l_log_px, axis=1)
    elif self.x_dist == 'gaussian':
        l_px_mu = ReshapeLayer(
            DimshuffleLayer(self.l_px_mu, (0, 2, 3, 1, 4)),
            (-1, self.sym_samples, 1, self.n_l * self.n_c))
        l_px_logvar = ReshapeLayer(
            DimshuffleLayer(self.l_px_logvar, (0, 2, 3, 1, 4)),
            (-1, self.sym_samples, 1, self.n_l * self.n_c))
        l_log_px = GaussianLogDensityLayer(l_x_in, l_px_mu, l_px_logvar)

    def lower_bound(log_pa, log_qa, log_pz, log_qz, log_py, log_px):
        # The prior/posterior terms are weighted by (1.1 - warmup); with
        # the default warmup of 0.1 set below the weight is 1.
        lb = log_px + log_py + (log_pz + log_pa - log_qa -
                                log_qz) * (1.1 - self.sym_warmup)
        return lb

    # Lower bound for labeled data
    out_layers = [
        l_log_pa, l_log_pz, l_log_qa, l_log_qz, l_log_px, l_log_qy
    ]
    inputs = {self.l_x_in: self.sym_x_l, self.l_y_in: self.sym_t_l}
    out = get_output(out_layers,
                     inputs,
                     batch_norm_update_averages=False,
                     batch_norm_use_averages=False)
    log_pa_l, log_pz_l, log_qa_x_l, log_qz_axy_l, log_px_zy_l, log_qy_ax_l = out

    # Prior p(y) expecting that all classes are evenly distributed
    # (softmax of an all-zero matrix).
    py_l = softmax(T.zeros((self.sym_x_l.shape[0], self.n_y)))
    log_py_l = -categorical_crossentropy(py_l, self.sym_t_l).reshape(
        (-1, 1)).dimshuffle((0, 'x', 'x', 1))
    lb_l = lower_bound(log_pa_l, log_qa_x_l, log_pz_l, log_qz_axy_l,
                       log_py_l, log_px_zy_l)
    lb_l = lb_l.mean(axis=(1, 2))  # Mean over the sampling dimensions
    log_qy_ax_l *= (
        self.sym_beta * (n / n_l)
    )  # Scale the supervised term with beta and the total/labeled ratio
    lb_l += log_qy_ax_l.mean(axis=(
        1, 2
    ))  # Collect the lower bound term and mean over sampling dimensions

    # Lower bound for unlabeled data
    bs_u = self.sym_x_u.shape[0]

    # For the integrating out approach, we repeat the input matrix x, and construct a target (bs * n_y) x n_y
    # Example of input and target matrix for a 3 class problem and batch_size=2. 2D tensors of the form
    #               x_repeat                     t_repeat
    #  [[x[0,0], x[0,1], ..., x[0,n_x]]         [[1, 0, 0]
    #   [x[1,0], x[1,1], ..., x[1,n_x]]          [1, 0, 0]
    #   [x[0,0], x[0,1], ..., x[0,n_x]]          [0, 1, 0]
    #   [x[1,0], x[1,1], ..., x[1,n_x]]          [0, 1, 0]
    #   [x[0,0], x[0,1], ..., x[0,n_x]]          [0, 0, 1]
    #   [x[1,0], x[1,1], ..., x[1,n_x]]]         [0, 0, 1]]
    t_eye = T.eye(self.n_y, k=0)
    t_u = t_eye.reshape((self.n_y, 1, self.n_y)).repeat(bs_u,
                                                        axis=1).reshape(
                                                            (-1, self.n_y))
    x_u = self.sym_x_u.reshape(
        (1, bs_u, self.n_l, self.n_c)).repeat(self.n_y, axis=0).reshape(
            (-1, self.n_l, self.n_c))

    # Since the expectation of var a is outside the integration we calculate E_q(a|x) first
    a_x_u = get_output(self.l_qa,
                       self.sym_x_u,
                       batch_norm_update_averages=True,
                       batch_norm_use_averages=False)
    a_x_u_rep = a_x_u.reshape(
        (1, bs_u * self.sym_samples, self.n_a)).repeat(self.n_y,
                                                       axis=0).reshape(
                                                           (-1, self.n_a))
    out_layers = [l_log_pa, l_log_pz, l_log_qa, l_log_qz, l_log_px]
    inputs = {self.l_x_in: x_u, self.l_y_in: t_u, self.l_a_in: a_x_u_rep}
    out = get_output(out_layers,
                     inputs,
                     batch_norm_update_averages=False,
                     batch_norm_use_averages=False)
    log_pa_u, log_pz_u, log_qa_x_u, log_qz_axy_u, log_px_zy_u = out

    # Prior p(y) expecting that all classes are evenly distributed
    py_u = softmax(T.zeros((bs_u * self.n_y, self.n_y)))
    log_py_u = -categorical_crossentropy(py_u, t_u).reshape(
        (-1, 1)).dimshuffle((0, 'x', 'x', 1))
    lb_u = lower_bound(log_pa_u, log_qa_x_u, log_pz_u, log_qz_axy_u,
                       log_py_u, log_px_zy_u)
    # Regroup the class-major rows into one row per example with one
    # column per class.
    lb_u = lb_u.reshape(
        (self.n_y, 1, 1, bs_u)).transpose(3, 1, 2, 0).mean(axis=(1, 2))
    inputs = {
        self.l_x_in: self.sym_x_u,
        self.l_a_in: a_x_u.reshape((-1, self.n_a))
    }
    y_u = get_output(self.l_qy,
                     inputs,
                     batch_norm_update_averages=True,
                     batch_norm_use_averages=False).mean(axis=(1, 2))
    y_u += 1e-8  # Ensure that we get no NANs when calculating the entropy
    y_u /= T.sum(y_u, axis=1, keepdims=True)
    # Weight the per-class bound by q(y|...) and add the entropy term.
    lb_u = (y_u * (lb_u - T.log(y_u))).sum(axis=1)

    # Regularizing with weight priors p(theta|N(0,1)), collecting and clipping gradients
    # Only parameters whose string form contains 'W' contribute.
    weight_priors = 0.0
    for p in self.trainable_model_params:
        if 'W' not in str(p):
            continue
        weight_priors += log_normal(p, 0, 1).sum()

    # Collect the lower bound and scale it with the weight priors.
    # Dividing by -n makes this the cost that is minimised.
    elbo = ((lb_l.mean() + lb_u.mean()) * n + weight_priors) / -n
    lb_labeled = -lb_l.mean()
    lb_unlabeled = -lb_u.mean()
    # Monitoring values: labeled and unlabeled means are summed.
    log_px = log_px_zy_l.mean() + log_px_zy_u.mean()
    log_pz = log_pz_l.mean() + log_pz_u.mean()
    log_qz = log_qz_axy_l.mean() + log_qz_axy_u.mean()
    log_pa = log_pa_l.mean() + log_pa_u.mean()
    log_qa = log_qa_x_l.mean() + log_qa_x_u.mean()

    grads_collect = T.grad(elbo, self.trainable_model_params)
    params_collect = self.trainable_model_params
    sym_beta1 = T.scalar('beta1')
    sym_beta2 = T.scalar('beta2')
    clip_grad, max_norm = 1, 5
    # Rescale by total norm first, then clip element-wise.
    mgrads = total_norm_constraint(grads_collect, max_norm=max_norm)
    mgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]
    updates = adam(mgrads, params_collect, self.sym_lr, sym_beta1,
                   sym_beta2)

    # Training function
    # Labeled examples are drawn at random without replacement; unlabeled
    # examples come from the batch slice.
    indices = self._srng.choice(size=[self.sym_bs_l],
                                a=sh_train_x_l.shape[0],
                                replace=False)
    x_batch_l = sh_train_x_l[indices]
    t_batch_l = sh_train_t_l[indices]
    x_batch_u = self.sh_train_x[self.batch_slice]
    if self.x_dist == 'bernoulli':  # Sample bernoulli input.
        x_batch_u = self._srng.binomial(size=x_batch_u.shape,
                                        n=1,
                                        p=x_batch_u,
                                        dtype=theano.config.floatX)
        x_batch_l = self._srng.binomial(size=x_batch_l.shape,
                                        n=1,
                                        p=x_batch_l,
                                        dtype=theano.config.floatX)

    givens = {
        self.sym_x_l: x_batch_l,
        self.sym_x_u: x_batch_u,
        self.sym_t_l: t_batch_l
    }
    inputs = [
        self.sym_index, self.sym_batchsize, self.sym_bs_l, self.sym_beta,
        self.sym_lr, sym_beta1, sym_beta2, self.sym_samples,
        self.sym_warmup
    ]
    outputs = [
        elbo, lb_labeled, lb_unlabeled, log_px, log_pz, log_qz, log_pa,
        log_qa
    ]
    f_train = theano.function(inputs=inputs,
                              outputs=outputs,
                              givens=givens,
                              updates=updates)

    # Default training args. Note that these can be changed during or prior to training.
    self.train_args['inputs']['batchsize_unlabeled'] = 100
    self.train_args['inputs']['batchsize_labeled'] = 100
    self.train_args['inputs']['beta'] = 0.1
    self.train_args['inputs']['learningrate'] = 3e-4
    self.train_args['inputs']['beta1'] = 0.9
    self.train_args['inputs']['beta2'] = 0.999
    self.train_args['inputs']['samples'] = 1
    self.train_args['inputs']['warmup'] = 0.1
    self.train_args['outputs']['lb'] = '%0.3f'
    self.train_args['outputs']['lb-l'] = '%0.3f'
    self.train_args['outputs']['lb-u'] = '%0.3f'
    self.train_args['outputs']['px'] = '%0.3f'
    self.train_args['outputs']['pz'] = '%0.3f'
    self.train_args['outputs']['qz'] = '%0.3f'
    self.train_args['outputs']['pa'] = '%0.3f'
    self.train_args['outputs']['qa'] = '%0.3f'

    # Validation and test function
    # Both report the classification error in percent.
    y = get_output(self.l_qy, self.sym_x_l,
                   deterministic=True).mean(axis=(1, 2))
    class_err = (1. - categorical_accuracy(y, self.sym_t_l).mean()) * 100
    givens = {self.sym_x_l: self.sh_test_x, self.sym_t_l: self.sh_test_t}
    f_test = theano.function(inputs=[self.sym_samples],
                             outputs=[class_err],
                             givens=givens)

    # Test args.  Note that these can be changed during or prior to training.
    self.test_args['inputs']['samples'] = 1
    self.test_args['outputs']['test'] = '%0.2f%%'

    f_validate = None
    if validation_set is not None:
        givens = {
            self.sym_x_l: self.sh_valid_x,
            self.sym_t_l: self.sh_valid_t
        }
        f_validate = theano.function(inputs=[self.sym_samples],
                                     outputs=[class_err],
                                     givens=givens)
    # Default validation args. Note that these can be changed during or prior to training.
    self.validate_args['inputs']['samples'] = 1
    self.validate_args['outputs']['validation'] = '%0.2f%%'

    return f_train, f_test, f_validate, self.train_args, self.test_args, self.validate_args
def build_model(self, train_set_unlabeled, train_set_labeled, test_set, validation_set=None):
    """
    Build the auxiliary deep generative model from the initialized hyperparameters.
    Define the lower bound term and compile it into a training function.

    The labeled set is copied into new shared variables here; the unlabeled
    set, test set and validation set are handed to the parent class.
    Default input values and output format strings are written into
    ``self.train_args``, ``self.test_args`` and ``self.validate_args``.

    :param train_set_unlabeled: Unlabeled train set containing variables x, t.
    :param train_set_labeled: Labeled train set containing variables x, t.
    :param test_set: Test set containing variables x, t.
    :param validation_set: Validation set containing variables x, t.
    :return: train, test, validation function and dicts of arguments.
    """
    super(SDGMSSL, self).build_model(train_set_unlabeled, test_set, validation_set)

    # Element 0 of the labeled set is used as x, element 1 as t.
    sh_train_x_l = theano.shared(np.asarray(train_set_labeled[0], dtype=theano.config.floatX), borrow=True)
    sh_train_t_l = theano.shared(np.asarray(train_set_labeled[1], dtype=theano.config.floatX), borrow=True)
    n = self.sh_train_x.shape[0].astype(theano.config.floatX)  # no. of data points
    n_l = sh_train_x_l.shape[0].astype(theano.config.floatX)  # no. of labeled data points

    # Define the layers for the density estimation used in the lower bound.
    l_log_qa = GaussianLogDensityLayer(self.l_qa, self.l_qa_mu, self.l_qa_logvar)
    l_log_qz = GaussianLogDensityLayer(self.l_qz, self.l_qz_mu, self.l_qz_logvar)
    l_log_qy = MultinomialLogDensityLayer(self.l_qy, self.l_y_in, eps=1e-8)

    l_log_pz = StandardNormalLogDensityLayer(self.l_qz)
    l_log_pa = GaussianLogDensityLayer(self.l_qa, self.l_pa_mu, self.l_pa_logvar)
    if self.x_dist == 'bernoulli':
        l_log_px = BernoulliLogDensityLayer(self.l_px, self.l_x_in)
    elif self.x_dist == 'multinomial':
        l_log_px = MultinomialLogDensityLayer(self.l_px, self.l_x_in)
    elif self.x_dist == 'gaussian':
        l_log_px = GaussianLogDensityLayer(self.l_x_in, self.l_px_mu, self.l_px_logvar)

    def lower_bound(log_pa, log_qa, log_pz, log_qz, log_py, log_px):
        lb = log_px + log_py + log_pz + log_pa - log_qa - log_qz
        return lb

    # Lower bound for labeled data
    out_layers = [l_log_pa, l_log_pz, l_log_qa, l_log_qz, l_log_px, l_log_qy]
    inputs = {self.l_x_in: self.sym_x_l, self.l_y_in: self.sym_t_l}
    out = get_output(out_layers, inputs, batch_norm_update_averages=False, batch_norm_use_averages=False)
    log_pa_l, log_pz_l, log_qa_x_l, log_qz_axy_l, log_px_zy_l, log_qy_ax_l = out

    # Prior p(y) expecting that all classes are evenly distributed
    # (softmax of an all-zero matrix).
    py_l = softmax(T.zeros((self.sym_x_l.shape[0], self.n_y)))
    log_py_l = -categorical_crossentropy(py_l, self.sym_t_l).reshape((-1, 1)).dimshuffle((0, 'x', 'x', 1))
    lb_l = lower_bound(log_pa_l, log_qa_x_l, log_pz_l, log_qz_axy_l, log_py_l, log_px_zy_l)
    lb_l = lb_l.mean(axis=(1, 2))  # Mean over the sampling dimensions
    # Scale the supervised term with beta and the total/labeled ratio.
    log_qy_ax_l *= (self.sym_beta * (n / n_l))
    # The classifier log-density is ADDED to the bound: the bound is
    # maximised (the cost below divides by -n), so subtracting it would push
    # q(y|a,x) away from the labels.
    # NOTE(review): assumes MultinomialLogDensityLayer yields a log-density,
    # as its use for log_px above implies; confirm against the layer.
    lb_l += log_qy_ax_l.mean(axis=(1, 2))  # Collect the lower bound term and mean over sampling dimensions

    # Lower bound for unlabeled data
    bs_u = self.sym_x_u.shape[0]

    # For the integrating out approach, we repeat the input matrix x, and construct a target (bs * n_y) x n_y
    # Example of input and target matrix for a 3 class problem and batch_size=2. 2D tensors of the form
    #               x_repeat                     t_repeat
    #  [[x[0,0], x[0,1], ..., x[0,n_x]]         [[1, 0, 0]
    #   [x[1,0], x[1,1], ..., x[1,n_x]]          [1, 0, 0]
    #   [x[0,0], x[0,1], ..., x[0,n_x]]          [0, 1, 0]
    #   [x[1,0], x[1,1], ..., x[1,n_x]]          [0, 1, 0]
    #   [x[0,0], x[0,1], ..., x[0,n_x]]          [0, 0, 1]
    #   [x[1,0], x[1,1], ..., x[1,n_x]]]         [0, 0, 1]]
    t_eye = T.eye(self.n_y, k=0)
    t_u = t_eye.reshape((self.n_y, 1, self.n_y)).repeat(bs_u, axis=1).reshape((-1, self.n_y))
    x_u = self.sym_x_u.reshape((1, bs_u, self.n_x)).repeat(self.n_y, axis=0).reshape((-1, self.n_x))

    # Since the expectation of var a is outside the integration we calculate E_q(a|x) first
    a_x_u = get_output(self.l_qa, self.sym_x_u, batch_norm_update_averages=True, batch_norm_use_averages=False)
    a_x_u_rep = a_x_u.reshape((1, bs_u * self.sym_samples, self.n_a)).repeat(self.n_y, axis=0).reshape(
        (-1, self.n_a))
    out_layers = [l_log_pa, l_log_pz, l_log_qa, l_log_qz, l_log_px]
    inputs = {self.l_x_in: x_u, self.l_y_in: t_u, self.l_a_in: a_x_u_rep}
    out = get_output(out_layers, inputs, batch_norm_update_averages=False, batch_norm_use_averages=False)
    log_pa_u, log_pz_u, log_qa_x_u, log_qz_axy_u, log_px_zy_u = out

    # Prior p(y) expecting that all classes are evenly distributed
    py_u = softmax(T.zeros((bs_u * self.n_y, self.n_y)))
    log_py_u = -categorical_crossentropy(py_u, t_u).reshape((-1, 1)).dimshuffle((0, 'x', 'x', 1))
    lb_u = lower_bound(log_pa_u, log_qa_x_u, log_pz_u, log_qz_axy_u, log_py_u, log_px_zy_u)
    # Regroup the class-major rows into one row per example with one
    # column per class.
    lb_u = lb_u.reshape((self.n_y, 1, 1, bs_u)).transpose(3, 1, 2, 0).mean(axis=(1, 2))
    inputs = {self.l_x_in: self.sym_x_u, self.l_a_in: a_x_u.reshape((-1, self.n_a))}
    y_u = get_output(self.l_qy, inputs, batch_norm_update_averages=True, batch_norm_use_averages=False).mean(
        axis=(1, 2))
    y_u += 1e-8  # Ensure that we get no NANs when calculating the entropy
    y_u /= T.sum(y_u, axis=1, keepdims=True)
    # Weight the per-class bound by q(y|...) and add the entropy term.
    lb_u = (y_u * (lb_u - T.log(y_u))).sum(axis=1)

    if self.batchnorm:
        # TODO: implement the BN layer correctly.
        # The result of this call is discarded.
        inputs = {self.l_x_in: self.sym_x_u, self.l_y_in: y_u, self.l_a_in: a_x_u}
        get_output(out_layers, inputs, weighting=None, batch_norm_update_averages=True,
                   batch_norm_use_averages=False)

    # Regularizing with weight priors p(theta|N(0,1)), collecting and clipping gradients
    # Only parameters whose string form contains 'W' contribute.
    weight_priors = 0.0
    for p in self.trainable_model_params:
        if 'W' not in str(p):
            continue
        weight_priors += log_normal(p, 0, 1).sum()

    # Collect the lower bound and scale it with the weight priors.
    # Dividing by -n makes this the cost that is minimised.
    elbo = ((lb_l.mean() + lb_u.mean()) * n + weight_priors) / -n
    lb_labeled = -lb_l.mean()
    lb_unlabeled = -lb_u.mean()

    grads_collect = T.grad(elbo, self.trainable_model_params)
    params_collect = self.trainable_model_params
    sym_beta1 = T.scalar('beta1')
    sym_beta2 = T.scalar('beta2')
    clip_grad, max_norm = 1, 5
    # Rescale by total norm first, then clip element-wise.
    mgrads = total_norm_constraint(grads_collect, max_norm=max_norm)
    mgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]
    updates = adam(mgrads, params_collect, self.sym_lr, sym_beta1, sym_beta2)

    # Training function
    # Labeled examples are drawn at random without replacement; unlabeled
    # examples come from the batch slice.
    indices = self._srng.choice(size=[self.sym_bs_l], a=sh_train_x_l.shape[0], replace=False)
    x_batch_l = sh_train_x_l[indices]
    t_batch_l = sh_train_t_l[indices]
    x_batch_u = self.sh_train_x[self.batch_slice]
    if self.x_dist == 'bernoulli':  # Sample bernoulli input.
        x_batch_u = self._srng.binomial(size=x_batch_u.shape, n=1, p=x_batch_u, dtype=theano.config.floatX)
        x_batch_l = self._srng.binomial(size=x_batch_l.shape, n=1, p=x_batch_l, dtype=theano.config.floatX)

    givens = {self.sym_x_l: x_batch_l,
              self.sym_x_u: x_batch_u,
              self.sym_t_l: t_batch_l}
    inputs = [self.sym_index, self.sym_batchsize, self.sym_bs_l, self.sym_beta,
              self.sym_lr, sym_beta1, sym_beta2, self.sym_samples]
    outputs = [elbo, lb_labeled, lb_unlabeled]
    f_train = theano.function(inputs=inputs, outputs=outputs, givens=givens, updates=updates)

    # Default training args. Note that these can be changed during or prior to training.
    self.train_args['inputs']['batchsize_unlabeled'] = 100
    self.train_args['inputs']['batchsize_labeled'] = 100
    self.train_args['inputs']['beta'] = 0.1
    self.train_args['inputs']['learningrate'] = 3e-4
    self.train_args['inputs']['beta1'] = 0.9
    self.train_args['inputs']['beta2'] = 0.999
    self.train_args['inputs']['samples'] = 1
    self.train_args['outputs']['lb'] = '%0.4f'
    self.train_args['outputs']['lb-labeled'] = '%0.4f'
    self.train_args['outputs']['lb-unlabeled'] = '%0.4f'

    # Validation and test function
    # Both report the classification error in percent.
    y = get_output(self.l_qy, self.sym_x_l, deterministic=True).mean(axis=(1, 2))
    class_err = (1. - categorical_accuracy(y, self.sym_t_l).mean()) * 100
    givens = {self.sym_x_l: self.sh_test_x,
              self.sym_t_l: self.sh_test_t}
    f_test = theano.function(inputs=[self.sym_samples], outputs=[class_err], givens=givens)

    # Test args.  Note that these can be changed during or prior to training.
    self.test_args['inputs']['samples'] = 1
    self.test_args['outputs']['test'] = '%0.2f%%'

    f_validate = None
    if validation_set is not None:
        givens = {self.sym_x_l: self.sh_valid_x,
                  self.sym_t_l: self.sh_valid_t}
        f_validate = theano.function(inputs=[self.sym_samples], outputs=[class_err], givens=givens)
    # Default validation args. Note that these can be changed during or prior to training.
    self.validate_args['inputs']['samples'] = 1
    self.validate_args['outputs']['validation'] = '%0.2f%%'

    return f_train, f_test, f_validate, self.train_args, self.test_args, self.validate_args
def build_model(self, train_set, test_set, validation_set=None):
    """
    Build the auxiliary deep generative model from the initialized hyperparameters.
    Define the lower bound term and compile it into a training function.

    Gradients are taken separately for ``self.y_params`` and
    ``self.xhat_params`` and then combined into one update. Default input
    values and output format strings are written into ``self.train_args``,
    ``self.test_args`` and ``self.validate_args``.

    :param train_set: Train set containing variables x, t.
    for the unlabeled data in the train set, we define 0's in t.
    :param test_set: Test set containing variables x, t.
    :param validation_set: Validation set containing variables x, t.
    :return: train, test, validation function and dicts of arguments.
    """
    super(ADGMSSL, self).build_model(train_set, test_set, validation_set)

    # Define the layers for the density estimation used in the lower bound.
    l_log_pa = GaussianMarginalLogDensityLayer(self.l_a_mu, self.l_a_logvar)
    l_log_pz = GaussianMarginalLogDensityLayer(self.l_z_mu, self.l_z_logvar)

    l_log_qa_x = GaussianMarginalLogDensityLayer(1, self.l_a_logvar)
    l_log_qz_xy = GaussianMarginalLogDensityLayer(1, self.l_z_logvar)
    l_log_qy_ax = MultinomialLogDensityLayer(self.l_y, self.l_y_in, eps=1e-8)

    if self.x_dist == 'bernoulli':
        l_px_zy = BernoulliLogDensityLayer(self.l_xhat, self.l_x_in)
    elif self.x_dist == 'multinomial':
        l_px_zy = MultinomialLogDensityLayer(self.l_xhat, self.l_x_in)
    elif self.x_dist == 'gaussian':
        l_px_zy = GaussianLogDensityLayer(self.l_x_in, self.l_xhat_mu, self.l_xhat_logvar)

    ### Compute lower bound for labeled data ###
    out_layers = [l_log_pa, l_log_pz, l_log_qa_x, l_log_qz_xy, l_px_zy, l_log_qy_ax]
    inputs = {self.l_x_in: self.sym_x_l, self.l_y_in: self.sym_t_l}
    log_pa_l, log_pz_l, log_qa_x_l, log_qz_axy_l, log_px_zy_l, log_qy_ax_l = get_output(out_layers, inputs)
    py_l = softmax(T.zeros((self.sym_x_l.shape[0], self.n_y)))  # non-informative prior
    log_py_l = -categorical_crossentropy(py_l, self.sym_t_l).reshape((-1, 1)).dimshuffle((0, 'x', 'x', 1))
    lb_l = log_pa_l + log_pz_l + log_py_l + log_px_zy_l - log_qa_x_l - log_qz_axy_l
    # Upscale the discriminative term with a weight.
    log_qy_ax_l *= self.sym_beta
    # The xhat gradients are taken before the discriminative term is added
    # to the bound; the y gradients come from that term alone.
    xhat_grads_l = T.grad(lb_l.mean(axis=(1, 2)).sum(), self.xhat_params)
    y_grads_l = T.grad(log_qy_ax_l.mean(axis=(1, 2)).sum(), self.y_params)
    lb_l += log_qy_ax_l
    lb_l = lb_l.mean(axis=(1, 2))

    ### Compute lower bound for unlabeled data ###
    bs_u = self.sym_x_u.shape[0]  # size of the unlabeled batch.
    t_eye = T.eye(self.n_y, k=0)  # identity matrix (n_y x n_y).
    # repeat unlabeled t the number of classes for integration (bs * n_y) x n_y.
    t_u = t_eye.reshape((self.n_y, 1, self.n_y)).repeat(bs_u, axis=1).reshape((-1, self.n_y))
    # repeat unlabeled x the number of classes for integration (bs * n_y) x n_x
    x_u = self.sym_x_u.reshape((1, bs_u, self.n_x)).repeat(self.n_y, axis=0).reshape((-1, self.n_x))
    out_layers = [l_log_pa, l_log_pz, l_log_qa_x, l_log_qz_xy, l_px_zy]
    inputs = {self.l_x_in: x_u, self.l_y_in: t_u}
    log_pa_u, log_pz_u, log_qa_x_u, log_qz_axy_u, log_px_zy_u = get_output(out_layers, inputs)
    py_u = softmax(T.zeros((bs_u * self.n_y, self.n_y)))  # non-informative prior.
    log_py_u = -categorical_crossentropy(py_u, t_u).reshape((-1, 1)).dimshuffle((0, 'x', 'x', 1))
    lb_u = log_pa_u + log_pz_u + log_py_u + log_px_zy_u - log_qa_x_u - log_qz_axy_u
    lb_u = lb_u.reshape((self.n_y, self.sym_samples, 1, bs_u)).transpose(3, 1, 2, 0).mean(
        axis=(1, 2))  # mean over samples.
    y_ax_u = get_output(self.l_y, self.sym_x_u)
    y_ax_u = y_ax_u.mean(axis=(1, 2))  # bs x n_y
    y_ax_u += 1e-8  # ensure that we get no NANs.
    y_ax_u /= T.sum(y_ax_u, axis=1, keepdims=True)
    xhat_grads_u = T.grad((y_ax_u * lb_u).sum(axis=1).sum(), self.xhat_params)
    # Weight the per-class bound by q(y|...) and add the entropy term.
    lb_u = (y_ax_u * (lb_u - T.log(y_ax_u))).sum(axis=1)
    y_grads_u = T.grad(lb_u.sum(), self.y_params)

    # Loss - regularizing with weight priors p(theta|N(0,1)) and clipping gradients
    # Only parameters whose string form contains 'W' contribute.
    y_weight_priors = 0.0
    for p in self.y_params:
        if 'W' not in str(p):
            continue
        y_weight_priors += log_normal(p, 0, 1).sum()
    y_weight_priors_grad = T.grad(y_weight_priors, self.y_params, disconnected_inputs='ignore')

    xhat_weight_priors = 0.0
    for p in self.xhat_params:
        if 'W' not in str(p):
            continue
        xhat_weight_priors += log_normal(p, 0, 1).sum()
    xhat_weight_priors_grad = T.grad(xhat_weight_priors, self.xhat_params, disconnected_inputs='ignore')

    n = self.sh_train_x.shape[0].astype(theano.config.floatX)  # no. of data points in train set
    n_b = n / self.sym_batchsize.astype(theano.config.floatX)  # no. of batches in train set

    # Per parameter: sum the labeled and unlabeled gradients, scale by the
    # number of batches, add the prior gradient and divide by -n so the
    # optimiser minimises.
    y_grads = [
        ((g_l + g_u) * n_b + g_prior) / -n
        for g_l, g_u, g_prior in zip(y_grads_l, y_grads_u, y_weight_priors_grad)
    ]
    xhat_grads = [
        ((g_l + g_u) * n_b + g_prior) / -n
        for g_l, g_u, g_prior in zip(xhat_grads_l, xhat_grads_u, xhat_weight_priors_grad)
    ]

    params = self.y_params + self.xhat_params
    grads = y_grads + xhat_grads

    # Collect the lower bound and scale it with the weight priors.
    elbo = ((lb_l.sum() + lb_u.sum()) * n_b + y_weight_priors + xhat_weight_priors) / -n

    # Avoid vanishing and exploding gradients.
    # Rescale by total norm first, then clip element-wise.
    clip_grad, max_norm = 1, 5
    mgrads = total_norm_constraint(grads, max_norm=max_norm)
    mgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]
    sym_beta1 = T.scalar('beta1')
    sym_beta2 = T.scalar('beta2')
    updates = adam(mgrads, params, self.sym_lr, sym_beta1, sym_beta2)

    ### Compile training function ###
    # The first sym_bs_l rows of each batch slice are the labeled part and
    # the remaining rows the unlabeled part.
    x_batch_l = self.sh_train_x[self.batch_slice][:self.sym_bs_l]
    x_batch_u = self.sh_train_x[self.batch_slice][self.sym_bs_l:]
    t_batch_l = self.sh_train_t[self.batch_slice][:self.sym_bs_l]
    if self.x_dist == 'bernoulli':  # Sample bernoulli input.
        x_batch_u = self._srng.binomial(size=x_batch_u.shape, n=1, p=x_batch_u, dtype=theano.config.floatX)
        x_batch_l = self._srng.binomial(size=x_batch_l.shape, n=1, p=x_batch_l, dtype=theano.config.floatX)
    givens = {self.sym_x_l: x_batch_l,
              self.sym_x_u: x_batch_u,
              self.sym_t_l: t_batch_l}
    inputs = [self.sym_index, self.sym_batchsize, self.sym_bs_l, self.sym_beta,
              self.sym_lr, sym_beta1, sym_beta2, self.sym_samples]
    f_train = theano.function(inputs=inputs, outputs=[elbo], givens=givens, updates=updates)
    # Default training args. Note that these can be changed during or prior to training.
    self.train_args['inputs']['batchsize'] = 200
    self.train_args['inputs']['batchsize_labeled'] = 100
    self.train_args['inputs']['beta'] = 1200.
    self.train_args['inputs']['learningrate'] = 3e-4
    self.train_args['inputs']['beta1'] = 0.9
    self.train_args['inputs']['beta2'] = 0.999
    self.train_args['inputs']['samples'] = 1
    self.train_args['outputs']['lb'] = '%0.4f'

    ### Compile testing function ###
    class_err_test = self._classification_error(self.sym_x_l, self.sym_t_l)
    givens = {self.sym_x_l: self.sh_test_x,
              self.sym_t_l: self.sh_test_t}
    f_test = theano.function(inputs=[self.sym_samples], outputs=[class_err_test], givens=givens)
    # Testing args.  Note that these can be changed during or prior to training.
    self.test_args['inputs']['samples'] = 1
    self.test_args['outputs']['err'] = '%0.2f%%'

    ### Compile validation function ###
    f_validate = None
    if validation_set is not None:
        class_err_valid = self._classification_error(self.sym_x_l, self.sym_t_l)
        givens = {self.sym_x_l: self.sh_valid_x,
                  self.sym_t_l: self.sh_valid_t}
        f_validate = theano.function(inputs=[self.sym_samples], outputs=[class_err_valid], givens=givens)
    # Default validation args. Note that these can be changed during or prior to training.
    self.validate_args['inputs']['samples'] = 1
    self.validate_args['outputs']['err'] = '%0.2f%%'

    return f_train, f_test, f_validate, self.train_args, self.test_args, self.validate_args
def build_model(self, train_set, test_set, validation_set=None):
    """
    Build the auxiliary deep generative model from the initialized hyperparameters.
    Define the lower bound term and compile it into a training function.

    The training objective is assembled from a labeled and an unlabeled lower
    bound. Gradients are taken separately for ``self.y_params`` and
    ``self.xhat_params``, combined with the gradients of N(0, 1) weight priors,
    norm-constrained, clipped and passed to Adam.

    :param train_set: Train set containing variables x, t.
        For the unlabeled data in the train set, we define 0's in t.
    :param test_set: Test set containing variables x, t.
    :param validation_set: Validation set containing variables x, t.
    :return: train, test, validation function and dicts of arguments.
        The validation function is None when no validation set is given.
    :raises ValueError: if ``self.x_dist`` is not one of 'bernoulli',
        'multinomial' or 'gaussian'.
    """
    super(ADGMSSL, self).build_model(train_set, test_set, validation_set)

    # Define the layers for the density estimation used in the lower bound.
    l_log_pa = GaussianMarginalLogDensityLayer(self.l_a_mu, self.l_a_logvar)
    l_log_pz = GaussianMarginalLogDensityLayer(self.l_z_mu, self.l_z_logvar)
    l_log_qa_x = GaussianMarginalLogDensityLayer(1, self.l_a_logvar)
    l_log_qz_xy = GaussianMarginalLogDensityLayer(1, self.l_z_logvar)
    l_log_qy_ax = MultinomialLogDensityLayer(self.l_y, self.l_y_in, eps=1e-8)
    if self.x_dist == 'bernoulli':
        l_px_zy = BernoulliLogDensityLayer(self.l_xhat, self.l_x_in)
    elif self.x_dist == 'multinomial':
        l_px_zy = MultinomialLogDensityLayer(self.l_xhat, self.l_x_in)
    elif self.x_dist == 'gaussian':
        l_px_zy = GaussianLogDensityLayer(self.l_x_in, self.l_xhat_mu,
                                          self.l_xhat_logvar)
    else:
        # Fail here with a clear message instead of hitting an unbound
        # ``l_px_zy`` further down.
        raise ValueError(
            "Unsupported x_dist %r; expected 'bernoulli', 'multinomial' "
            "or 'gaussian'." % (self.x_dist,))

    ### Compute lower bound for labeled data ###
    out_layers = [
        l_log_pa, l_log_pz, l_log_qa_x, l_log_qz_xy, l_px_zy, l_log_qy_ax
    ]
    inputs = {self.l_x_in: self.sym_x_l, self.l_y_in: self.sym_t_l}
    (log_pa_l, log_pz_l, log_qa_x_l, log_qz_axy_l, log_px_zy_l,
     log_qy_ax_l) = get_output(out_layers, inputs)
    py_l = softmax(T.zeros(
        (self.sym_x_l.shape[0], self.n_y)))  # non-informative prior
    log_py_l = -categorical_crossentropy(py_l, self.sym_t_l).reshape(
        (-1, 1)).dimshuffle((0, 'x', 'x', 1))
    lb_l = (log_pa_l + log_pz_l + log_py_l + log_px_zy_l
            - log_qa_x_l - log_qz_axy_l)

    # Upscale the discriminative term with a weight.
    log_qy_ax_l *= self.sym_beta
    # The generative and discriminative gradients are taken before the
    # discriminative term is folded into the labeled bound.
    xhat_grads_l = T.grad(lb_l.mean(axis=(1, 2)).sum(), self.xhat_params)
    y_grads_l = T.grad(log_qy_ax_l.mean(axis=(1, 2)).sum(), self.y_params)
    lb_l += log_qy_ax_l
    lb_l = lb_l.mean(axis=(1, 2))

    ### Compute lower bound for unlabeled data ###
    bs_u = self.sym_x_u.shape[0]  # size of the unlabeled batch.
    t_eye = T.eye(self.n_y, k=0)  # identity matrix (n_y x n_y): one one-hot row per class.
    # repeat unlabeled t the number of classes for integration (bs * n_y) x n_y.
    t_u = t_eye.reshape((self.n_y, 1, self.n_y)).repeat(
        bs_u, axis=1).reshape((-1, self.n_y))
    # repeat unlabeled x the number of classes for integration (bs * n_y) x n_x
    x_u = self.sym_x_u.reshape((1, bs_u, self.n_x)).repeat(
        self.n_y, axis=0).reshape((-1, self.n_x))
    out_layers = [l_log_pa, l_log_pz, l_log_qa_x, l_log_qz_xy, l_px_zy]
    inputs = {self.l_x_in: x_u, self.l_y_in: t_u}
    log_pa_u, log_pz_u, log_qa_x_u, log_qz_axy_u, log_px_zy_u = get_output(
        out_layers, inputs)
    py_u = softmax(T.zeros(
        (bs_u * self.n_y, self.n_y)))  # non-informative prior.
    log_py_u = -categorical_crossentropy(py_u, t_u).reshape(
        (-1, 1)).dimshuffle((0, 'x', 'x', 1))
    lb_u = (log_pa_u + log_pz_u + log_py_u + log_px_zy_u
            - log_qa_x_u - log_qz_axy_u)
    lb_u = lb_u.reshape(
        (self.n_y, self.sym_samples, 1, bs_u)).transpose(3, 1, 2, 0).mean(
            axis=(1, 2))  # mean over samples.
    y_ax_u = get_output(self.l_y, self.sym_x_u)
    y_ax_u = y_ax_u.mean(axis=(1, 2))  # bs x n_y
    y_ax_u += 1e-8  # ensure that we get no NANs.
    y_ax_u /= T.sum(y_ax_u, axis=1, keepdims=True)
    xhat_grads_u = T.grad((y_ax_u * lb_u).sum(axis=1).sum(),
                          self.xhat_params)
    lb_u = (y_ax_u * (lb_u - T.log(y_ax_u))).sum(axis=1)
    y_grads_u = T.grad(lb_u.sum(), self.y_params)

    # Loss - regularizing with weight priors p(theta|N(0,1)) and clipping gradients
    def weight_prior(params):
        # Sum log N(p; 0, 1) over the parameters whose string form contains 'W'.
        prior = 0.0
        for p in params:
            if 'W' not in str(p):
                continue
            prior += log_normal(p, 0, 1).sum()
        return prior

    y_weight_priors = weight_prior(self.y_params)
    y_weight_priors_grad = T.grad(y_weight_priors, self.y_params,
                                  disconnected_inputs='ignore')
    xhat_weight_priors = weight_prior(self.xhat_params)
    xhat_weight_priors_grad = T.grad(xhat_weight_priors, self.xhat_params,
                                     disconnected_inputs='ignore')

    n = self.sh_train_x.shape[0].astype(
        theano.config.floatX)  # no. of data points in train set
    n_b = n / self.sym_batchsize.astype(
        theano.config.floatX)  # no. of batches in train set

    def scaled_grads(grads_l, grads_u, prior_grads):
        # Per parameter: scale the batch gradient by the number of batches,
        # add the prior gradient and divide by -n.
        return [((g_l + g_u) * n_b + g_p) / -n
                for g_l, g_u, g_p in zip(grads_l, grads_u, prior_grads)]

    y_grads = scaled_grads(y_grads_l, y_grads_u, y_weight_priors_grad)
    xhat_grads = scaled_grads(xhat_grads_l, xhat_grads_u,
                              xhat_weight_priors_grad)
    params = self.y_params + self.xhat_params
    grads = y_grads + xhat_grads

    # Collect the lower bound and scale it with the weight priors.
    elbo = ((lb_l.sum() + lb_u.sum()) * n_b + y_weight_priors +
            xhat_weight_priors) / -n

    # Avoid vanishing and exploding gradients.
    clip_grad, max_norm = 1, 5
    mgrads = total_norm_constraint(grads, max_norm=max_norm)
    mgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]
    sym_beta1 = T.scalar('beta1')
    sym_beta2 = T.scalar('beta2')
    updates = adam(mgrads, params, self.sym_lr, sym_beta1, sym_beta2)

    ### Compile training function ###
    # The first sym_bs_l rows of each batch slice are used as labeled data,
    # the remaining rows as unlabeled data.
    x_batch_l = self.sh_train_x[self.batch_slice][:self.sym_bs_l]
    x_batch_u = self.sh_train_x[self.batch_slice][self.sym_bs_l:]
    t_batch_l = self.sh_train_t[self.batch_slice][:self.sym_bs_l]
    if self.x_dist == 'bernoulli':  # Sample bernoulli input.
        x_batch_u = self._srng.binomial(size=x_batch_u.shape, n=1,
                                        p=x_batch_u,
                                        dtype=theano.config.floatX)
        x_batch_l = self._srng.binomial(size=x_batch_l.shape, n=1,
                                        p=x_batch_l,
                                        dtype=theano.config.floatX)
    givens = {
        self.sym_x_l: x_batch_l,
        self.sym_x_u: x_batch_u,
        self.sym_t_l: t_batch_l
    }
    inputs = [
        self.sym_index, self.sym_batchsize, self.sym_bs_l, self.sym_beta,
        self.sym_lr, sym_beta1, sym_beta2, self.sym_samples
    ]
    f_train = theano.function(inputs=inputs, outputs=[elbo], givens=givens,
                              updates=updates)

    # Default training args. Note that these can be changed during or prior to training.
    self.train_args['inputs']['batchsize'] = 200
    self.train_args['inputs']['batchsize_labeled'] = 100
    self.train_args['inputs']['beta'] = 1200.
    self.train_args['inputs']['learningrate'] = 3e-4
    self.train_args['inputs']['beta1'] = 0.9
    self.train_args['inputs']['beta2'] = 0.999
    self.train_args['inputs']['samples'] = 1
    self.train_args['outputs']['lb'] = '%0.4f'

    ### Compile testing function ###
    class_err_test = self._classification_error(self.sym_x_l, self.sym_t_l)
    givens = {self.sym_x_l: self.sh_test_x, self.sym_t_l: self.sh_test_t}
    f_test = theano.function(inputs=[self.sym_samples],
                             outputs=[class_err_test], givens=givens)

    # Testing args. Note that these can be changed during or prior to training.
    self.test_args['inputs']['samples'] = 1
    self.test_args['outputs']['err'] = '%0.2f%%'

    ### Compile validation function ###
    f_validate = None
    if validation_set is not None:
        class_err_valid = self._classification_error(
            self.sym_x_l, self.sym_t_l)
        givens = {
            self.sym_x_l: self.sh_valid_x,
            self.sym_t_l: self.sh_valid_t
        }
        f_validate = theano.function(inputs=[self.sym_samples],
                                     outputs=[class_err_valid],
                                     givens=givens)

    # Default validation args. Note that these can be changed during or prior to training.
    self.validate_args['inputs']['samples'] = 1
    self.validate_args['outputs']['err'] = '%0.2f%%'

    return (f_train, f_test, f_validate, self.train_args, self.test_args,
            self.validate_args)
def __init__(self, feature_shape, latent_size, hidden_structure,
             reconstruction_distribution=None,
             number_of_reconstruction_classes=None,
             use_count_sum=False, use_batch_norm=False):
    """
    Set up the encoder and decoder networks and compile the Theano functions
    ``f_train``, ``f_eval``, ``f_z``, ``f_sample`` and ``f_recon``.

    :param feature_shape: Number of input features; used as the width of the
        encoder input layer and of each decoder output layer.
    :param latent_size: Number of units in the latent mean/log-variance layers.
    :param hidden_structure: Sequence of hidden layer sizes for the encoder;
        the decoder uses the same sizes in reverse order.
    :param reconstruction_distribution: Either a key into
        ``reconstruction_distributions`` (str), or a mapping providing the
        entries "parameters", "activation functions", "function", "mean" and
        "preprocess". If falsy, a Gaussian with parameters "mu" and "sigma"
        is used.
    :param number_of_reconstruction_classes: If positive and the distribution
        is given by name, the "softmax_"-prefixed distribution is looked up
        and instantiated with ``k_max = number_of_reconstruction_classes - 1``.
        None (the default) is treated as "no classes".
    :param use_count_sum: If true, the decoder takes an extra (None, 1) input.
        Ignored when ``reconstruction_distribution`` is "bernoulli".
    :param use_batch_norm: If true, dense layers are wrapped in ``batch_norm``.
    """
    self.use_count_sum = use_count_sum and \
        (reconstruction_distribution != "bernoulli")
    self.use_batch_norm = use_batch_norm

    # The default for number_of_reconstruction_classes is None, and comparing
    # None with 0 raises TypeError on Python 3, so guard the comparison once.
    has_reconstruction_classes = (
        number_of_reconstruction_classes is not None
        and number_of_reconstruction_classes > 0)
    distribution_is_named = isinstance(reconstruction_distribution, str)

    print("Setting up model.")
    print(" feature size: {}".format(feature_shape))
    print(" latent size: {}".format(latent_size))
    print(" hidden structure: {}".format(", ".join(
        map(str, hidden_structure))))
    if distribution_is_named:
        print(" reconstruction distribution: " +
              reconstruction_distribution)
    else:
        print(" reconstruction distribution: custom")
    if has_reconstruction_classes:
        print(
            " reconstruction classes: {}".format(
                number_of_reconstruction_classes), " (including 0s)")
    if self.use_count_sum:
        print(" using count sums")
    if self.use_batch_norm:
        print(" using batch normalisation of each layer.")
    print("")

    # Setup
    super(VariationalAutoEncoderForCounts, self).__init__()
    self.feature_shape = feature_shape
    self.latent_size = latent_size
    self.hidden_structure = hidden_structure

    symbolic_x = T.matrix('x')  # counts
    symbolic_z = T.matrix('z')  # latent variable
    self.number_of_epochs_trained = 0
    symbolic_learning_rate = T.scalar("epsilon")
    # Warm-up weight; passed as ``beta`` to lowerBound for the training
    # objective only.
    symbolic_warm_up_weight = T.scalar("beta")

    self.learning_curves = {
        "training": {
            "LB": [],
            "ENRE": [],
            "KL": [],
            "KL_all": []
        },
        "validation": {
            "LB": [],
            "ENRE": [],
            "KL": []
        }
    }

    if reconstruction_distribution:
        if distribution_is_named:
            if has_reconstruction_classes:
                # The softmax variants are factories taking k_max.
                reconstruction_distribution = "softmax_" + \
                    reconstruction_distribution
                self.k_max = number_of_reconstruction_classes - 1
                reconstruction_distribution = \
                    reconstruction_distributions[reconstruction_distribution]
                reconstruction_distribution = \
                    reconstruction_distribution(self.k_max)
            else:
                reconstruction_distribution = \
                    reconstruction_distributions[reconstruction_distribution]
        self.x_parameters = reconstruction_distribution["parameters"]
        self.reconstruction_activation_functions = \
            reconstruction_distribution["activation functions"]
        self.expectedNegativeReconstructionError = \
            reconstruction_distribution["function"]
        self.meanOfReconstructionDistribution = \
            reconstruction_distribution["mean"]
        self.preprocess = reconstruction_distribution["preprocess"]
    else:
        reconstruction_distribution = "Gaussian (default)"
        # Use a Gaussian distribution as standard
        self.x_parameters = ["mu", "sigma"]
        self.reconstruction_activation_functions = {
            "mu": identity,
            "sigma": identity
        }
        self.expectedNegativeReconstructionError = \
            lambda x, x_theta, eps=0.0: \
            log_normal(x, x_theta["mu"], x_theta["sigma"], eps)
        self.meanOfReconstructionDistribution = \
            lambda x_theta: x_theta["mu"]
        self.preprocess = lambda x: x

    if self.use_count_sum:
        symbolic_n = T.matrix('n')  # sum of counts

    def maybe_batch_norm(layer):
        # Wrap the layer in batch normalisation only when it is enabled.
        return batch_norm(layer) if self.use_batch_norm else layer

    # Models

    ## Recognition model q(z|x)
    l_enc_in = InputLayer(shape=(None, feature_shape), name="ENC_INPUT")
    l_enc = l_enc_in
    for i, hidden_size in enumerate(hidden_structure):
        l_enc = maybe_batch_norm(
            DenseLayer(l_enc, num_units=hidden_size, nonlinearity=rectify,
                       name='ENC_DENSE{:d}'.format(i + 1)))

    l_z_mu = maybe_batch_norm(
        DenseLayer(l_enc, num_units=latent_size, nonlinearity=None,
                   name='ENC_Z_MU'))
    # The log-variance output is clipped to [-10, 10].
    l_z_log_var = maybe_batch_norm(
        DenseLayer(l_enc, num_units=latent_size,
                   nonlinearity=lambda x: T.clip(x, -10, 10),
                   name='ENC_Z_LOG_VAR'))

    # Sample a latent representation z \sim q(z|x) = N(mu(x), logvar(x))
    l_z = SimpleSampleLayer(mean=l_z_mu, log_var=l_z_log_var,
                            name="ENC_SAMPLE")
    self.encoder = l_z

    ## Generative model p(x|z)
    l_dec_z_in = InputLayer(shape=(None, latent_size), name="DEC_INPUT")
    if self.use_count_sum:
        l_dec_n_in = InputLayer(shape=(None, 1), name="DEC_N_INPUT")
        l_dec = ConcatLayer([l_dec_z_in, l_dec_n_in], axis=1,
                            name="DEC_MERGE_INPUT")
    else:
        l_dec = l_dec_z_in

    # Decoder layers mirror the encoder; names count down so that
    # DEC_DENSEk has the same size as ENC_DENSEk.
    for i, hidden_size in enumerate(reversed(hidden_structure)):
        l_dec = maybe_batch_norm(
            DenseLayer(
                l_dec, num_units=hidden_size, nonlinearity=rectify,
                name='DEC_DENSE{:d}'.format(len(hidden_structure) - i)))

    l_x_theta = {}
    for p in self.x_parameters:
        p_name = 'DEC_X_' + p.upper()
        activation = self.reconstruction_activation_functions[p]
        if activation == softmax:
            # One (k_max + 1)-way softmax per feature: project, fold the
            # features into the batch axis, apply the softmax layer, unfold.
            number_of_classes = self.k_max + 1
            l_dense = maybe_batch_norm(
                DenseLayer(l_dec,
                           num_units=feature_shape * number_of_classes,
                           nonlinearity=identity,
                           name=p_name + "_DENSE"))
            l_reshape = ReshapeLayer(l_dense, (-1, number_of_classes))
            l_softmax = maybe_batch_norm(
                DenseLayer(l_reshape, num_units=number_of_classes,
                           nonlinearity=softmax,
                           name=p_name + "_SOFTMAX"))
            l_x_theta[p] = ReshapeLayer(
                l_softmax, (-1, feature_shape, number_of_classes))
        else:
            l_x_theta[p] = maybe_batch_norm(
                DenseLayer(l_dec, num_units=feature_shape,
                           nonlinearity=activation, name=p_name))

    self.decoder = {p: l_x_theta[p] for p in self.x_parameters}
    x_theta_layers = [l_x_theta[p] for p in self.x_parameters]

    def decoder_outputs(z, deterministic):
        # Run the decoder on z (plus the count sums when enabled) and return
        # its outputs keyed by reconstruction parameter name.
        decoder_inputs = {l_dec_z_in: z}
        if self.use_count_sum:
            decoder_inputs[l_dec_n_in] = symbolic_n
        outputs = get_output(x_theta_layers, decoder_inputs,
                             deterministic=deterministic)
        return dict(zip(self.x_parameters, outputs))

    ## Get outputs from models
    encoder_layers = [l_z, l_z_mu, l_z_log_var]

    ## Training outputs
    z_train, z_mu_train, z_log_var_train = get_output(
        encoder_layers, {l_enc_in: symbolic_x}, deterministic=False)
    x_theta_train = decoder_outputs(z_train, deterministic=False)

    ## Evaluation outputs
    z_eval, z_mu_eval, z_log_var_eval = get_output(
        encoder_layers, {l_enc_in: symbolic_x}, deterministic=True)
    x_theta_eval = decoder_outputs(z_eval, deterministic=True)

    ## Sample outputs
    x_theta_sample = decoder_outputs(symbolic_z, deterministic=True)

    # Likelihood
    lower_bound_train, log_p_x_train, KL__train, KL__train_all = \
        self.lowerBound(symbolic_x, x_theta_train, z_mu_train,
                        z_log_var_train, beta=symbolic_warm_up_weight)
    lower_bound_eval, log_p_x_eval, KL__eval, KL__eval_all = \
        self.lowerBound(symbolic_x, x_theta_eval, z_mu_eval,
                        z_log_var_eval)

    all_parameters = get_all_params([l_z] + x_theta_layers, trainable=True)

    print("Parameters to train:")
    for parameter in all_parameters:
        print(" {}: {}".format(parameter, parameter.get_value().shape))

    # Let Theano do its magic and get all the gradients we need for training
    all_gradients = T.grad(-lower_bound_train, all_parameters)

    # Set the update function for parameters. The Adam optimizer works really well with VAEs.
    update_expressions = updates.adam(all_gradients, all_parameters,
                                      learning_rate=symbolic_learning_rate)

    # The count sums are an extra input to every compiled function that
    # runs the decoder, but only when count sums are enabled.
    count_sum_inputs = [symbolic_n] if self.use_count_sum else []

    self.f_train = theano.function(
        inputs=[symbolic_x] + count_sum_inputs +
        [symbolic_learning_rate, symbolic_warm_up_weight],
        outputs=[
            lower_bound_train, log_p_x_train, KL__train, KL__train_all
        ],
        updates=update_expressions)

    self.f_eval = theano.function(
        inputs=[symbolic_x] + count_sum_inputs,
        outputs=[lower_bound_eval, log_p_x_eval, KL__eval, KL__eval_all])

    self.f_z = theano.function(inputs=[symbolic_x], outputs=[z_eval])

    self.f_sample = theano.function(
        inputs=[symbolic_z] + count_sum_inputs,
        outputs=[x_theta_sample[p] for p in self.x_parameters])

    self.f_recon = theano.function(
        inputs=[symbolic_x] + count_sum_inputs,
        outputs=[x_theta_eval[p] for p in self.x_parameters])