def latent_gaussian_x_gaussian(z, z_mu, z_log_var, x_mu, x_log_var, x, eq_samples, iw_samples, epsilon=1e-6):
    """
    Latent z : gaussian with standard normal prior
    decoder output : gaussian (x_mu, x_log_var)
    """
    # reshape the variables so batch_size, eq_samples and iw_samples are separate dimensions
    z = z.reshape((-1, eq_samples, iw_samples, latent_size))
    x_mu = x_mu.reshape((-1, eq_samples, iw_samples, num_features))
    x_log_var = x_log_var.reshape((-1, eq_samples, iw_samples, num_features))

    # dimshuffle x, z_mu and z_log_var since we need to broadcast them when calculating the pdfs
    x = x.reshape((-1, num_features))
    x = x.dimshuffle(0, 'x', 'x', 1)                  # size: (batch_size, eq_samples, iw_samples, num_features)
    z_mu = z_mu.dimshuffle(0, 'x', 'x', 1)            # size: (batch_size, eq_samples, iw_samples, num_latent)
    z_log_var = z_log_var.dimshuffle(0, 'x', 'x', 1)  # size: (batch_size, eq_samples, iw_samples, num_latent)

    # calculate LL components, note that the log_xyz() functions return log prob. for independent components
    # separately, so we sum over feature/latent dimensions for multivariate pdfs
    log_qz_given_x = log_normal2(z, z_mu, z_log_var).sum(axis=3)
    log_pz = log_stdnormal(z).sum(axis=3)
    #log_px_given_z = log_bernoulli(x, T.clip(x_mu, epsilon, 1 - epsilon)).sum(axis=3)
    log_px_given_z = log_normal2(x, x_mu, x_log_var).sum(axis=3)

    # all log_*** should have dimension (batch_size, eq_samples, iw_samples)
    # Calculate the LL using log-sum-exp to avoid underflow
    a = log_pz + log_px_given_z - log_qz_given_x  # size: (batch_size, eq_samples, iw_samples)
    a_max = T.max(a, axis=2, keepdims=True)       # size: (batch_size, eq_samples, 1)
    LL = T.mean(a_max) + T.mean(T.log(T.mean(T.exp(a - a_max), axis=2)))

    return LL, T.mean(log_qz_given_x), T.mean(log_pz), T.mean(log_px_given_z)
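# The helpers log_normal2, log_stdnormal and log_bernoulli used throughout this file are
# imported from elsewhere (e.g. a parmesan-style distributions module). A minimal sketch of
# what they are assumed to compute is given below -- elementwise log densities, with the
# Gaussian parameterized by its log variance; the real implementations may differ in
# signature details and numerical safeguards.
import math
import theano.tensor as T

_c = -0.5 * math.log(2 * math.pi)

def log_stdnormal(x):
    # elementwise log N(x; 0, 1)
    return _c - x ** 2 / 2.0

def log_normal2(x, mean, log_var, eps=0.0):
    # elementwise log N(x; mean, exp(log_var))
    return _c - log_var / 2.0 - (x - mean) ** 2 / (2.0 * T.exp(log_var) + eps)

def log_bernoulli(x, p, eps=0.0):
    # elementwise log Bernoulli(x; p), with p the sigmoid output of the decoder
    p = T.clip(p, eps, 1.0 - eps)
    return x * T.log(p) + (1.0 - x) * T.log(1.0 - p)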
def latent_gaussian_x_bernoulli(z, z_mu, psi_u_list, z_log_var, x_mu, x, eq_samples, iw_samples, epsilon=1e-6):
    """
    Latent z : gaussian with standard normal prior
    decoder output : bernoulli

    When the output is bernoulli then the output from the decoder
    should be sigmoid.

    The sizes of the inputs are
    z: (batch_size*eq_samples*iw_samples, num_latent)
    z_mu: (batch_size, num_latent)
    z_log_var: (batch_size, num_latent)
    x_mu: (batch_size*eq_samples*iw_samples, num_features)
    x: (batch_size, num_features)

    Reference: Burda et al. 2015 "Importance Weighted Autoencoders"
    """
    # reshape the variables so batch_size, eq_samples and iw_samples are separate dimensions
    z = z.reshape((-1, eq_samples, iw_samples, latent_size))
    x_mu = x_mu.reshape((-1, eq_samples, iw_samples, num_features))

    # dimshuffle x since we need to broadcast it when calculating the binary cross-entropy
    x = x.dimshuffle(0, 'x', 'x', 1)  # x: (batch_size, eq_samples, iw_samples, num_features)

    for i in range(len(psi_u_list)):
        psi_u_list[i] = psi_u_list[i].reshape((-1, eq_samples, iw_samples))

    # calculate LL components, note that we sum over the feature/num_unit dimension
    z_mu = z_mu.dimshuffle(0, 'x', 'x', 1)            # mean: (batch_size, 1, 1, num_latent)
    z_log_var = z_log_var.dimshuffle(0, 'x', 'x', 1)  # logvar: (batch_size, 1, 1, num_latent)
    log_qz_given_x = log_normal2(z, z_mu, z_log_var).sum(axis=3)
    log_pz = log_stdnormal(z).sum(axis=3)
    log_px_given_z = log_bernoulli(x, T.clip(x_mu, epsilon, 1 - epsilon)).sum(axis=3)

    # normalizing flow loss: sum of log|1 + psi_u| terms, one per flow step
    sum_log_psiu = 0
    for psi_u in psi_u_list:
        sum_log_psiu += T.log(T.abs_(1 + psi_u))

    # all log_*** should have dimension (batch_size, eq_samples, iw_samples)
    # Calculate the LL using log-sum-exp to avoid underflow
    a = log_pz + log_px_given_z - log_qz_given_x + sum_log_psiu  # size: (batch_size, eq_samples, iw_samples)
    a_max = T.max(a, axis=2, keepdims=True)                      # size: (batch_size, eq_samples, 1)

    # LL is calculated using Eq (8) in Burda et al.
    # Working from the inside out of the calculation below:
    # T.exp(a-a_max): (batch_size, eq_samples, iw_samples)
    # -> subtract a_max to avoid overflow. a_max is specific for each set of
    #    importance samples and is broadcasted over the last dimension.
    #
    # T.log( T.mean(T.exp(a-a_max), axis=2) ): (batch_size, eq_samples)
    # -> this is the log of the mean over the importance weighted samples
    #
    # Lastly we add T.mean(a_max) to correct for the log-sum-exp trick
    LL = T.mean(a_max) + T.mean(T.log(T.mean(T.exp(a - a_max), axis=2)))

    return LL, T.mean(log_qz_given_x), T.mean(log_pz), T.mean(log_px_given_z)
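# The psi_u_list entries consumed above are assumed to be the per-step quantities
# u^T psi(z) of a planar normalizing flow (Rezende & Mohamed 2015), so that each
# T.log(T.abs_(1 + psi_u)) term is that step's log |det Jacobian|. A hypothetical
# single flow step producing such a term (w, u, b are assumed flow parameters; the
# usual reparameterization of u that guarantees invertibility is omitted here):
import theano.tensor as T

def planar_flow_step(z, u, w, b):
    # f(z) = z + u * tanh(w^T z + b); z: (batch, latent), u/w: (latent,), b: scalar
    lin = T.dot(z, w) + b                    # (batch,)
    h = T.tanh(lin)
    f_z = z + u.dimshuffle('x', 0) * h.dimshuffle(0, 'x')
    psi_u = (1.0 - h ** 2) * T.dot(w, u)     # u^T psi(z), shape (batch,)
    return f_z, psi_u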
def log_likelihood(z, z_mu, z_log_var, x_mu, x, analytic_kl_term):
    if analytic_kl_term:
        kl_term = kl_normal2_stdnormal(z_mu, z_log_var).sum(axis=1)
        log_px_given_z = log_bernoulli(x, x_mu, eps=1e-6).sum(axis=1)
        LL = T.mean(-kl_term + log_px_given_z)
    else:
        log_qz_given_x = log_normal2(z, z_mu, z_log_var).sum(axis=1)
        log_pz = log_stdnormal(z).sum(axis=1)
        log_px_given_z = log_bernoulli(x, x_mu, eps=1e-6).sum(axis=1)
        LL = T.mean(log_pz + log_px_given_z - log_qz_given_x)
    return LL
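# kl_normal2_stdnormal above is the closed-form KL between the diagonal Gaussian posterior
# q(z|x) = N(z_mu, exp(z_log_var)) and the standard normal prior. A minimal sketch of what
# it is assumed to compute (elementwise, summed over axis=1 above):
import theano.tensor as T

def kl_normal2_stdnormal(mean, log_var):
    # KL( N(mean, exp(log_var)) || N(0, 1) ) per latent dimension
    return -0.5 * (1.0 + log_var - mean ** 2 - T.exp(log_var))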
def latent_gaussian_x_bernoulli(z, z_mu, z_log_var, x_mu, x, eq_samples, iw_samples, epsilon=1e-6):
    """
    Latent z : gaussian with standard normal prior
    decoder output : bernoulli

    When the output is bernoulli then the output from the decoder
    should be sigmoid.

    The sizes of the inputs are
    z: (batch_size*eq_samples*iw_samples, num_latent)
    z_mu: (batch_size, num_latent)
    z_log_var: (batch_size, num_latent)
    x_mu: (batch_size*eq_samples*iw_samples, num_features)
    x: (batch_size, num_features)

    Reference: Burda et al. 2015 "Importance Weighted Autoencoders"
    """
    # reshape the variables so batch_size, eq_samples and iw_samples are separate dimensions
    z = z.reshape((-1, eq_samples, iw_samples, latent_size))
    x_mu = x_mu.reshape((-1, eq_samples, iw_samples, num_features))

    # dimshuffle x, z_mu and z_log_var since we need to broadcast them when calculating the pdfs
    x = x.dimshuffle(0, 'x', 'x', 1)                  # size: (batch_size, eq_samples, iw_samples, num_features)
    z_mu = z_mu.dimshuffle(0, 'x', 'x', 1)            # size: (batch_size, eq_samples, iw_samples, num_latent)
    z_log_var = z_log_var.dimshuffle(0, 'x', 'x', 1)  # size: (batch_size, eq_samples, iw_samples, num_latent)

    # calculate LL components, note that the log_xyz() functions return log prob. for independent components
    # separately, so we sum over feature/latent dimensions for multivariate pdfs
    log_qz_given_x = log_normal2(z, z_mu, z_log_var).sum(axis=3)
    log_pz = log_stdnormal(z).sum(axis=3)
    log_px_given_z = log_bernoulli(x, x_mu, eps=epsilon).sum(axis=3)

    # all log_*** should have dimension (batch_size, eq_samples, iw_samples)
    # Calculate the LL using log-sum-exp to avoid underflow
    a = log_pz + log_px_given_z - log_qz_given_x  # size: (batch_size, eq_samples, iw_samples)
    a_max = T.max(a, axis=2, keepdims=True)       # size: (batch_size, eq_samples, 1)

    # LL is calculated using Eq (8) in Burda et al.
    # Working from the inside out of the calculation below:
    # T.exp(a-a_max): (batch_size, eq_samples, iw_samples)
    # -> subtract a_max to avoid overflow. a_max is specific for each set of
    #    importance samples and is broadcasted over the last dimension.
    #
    # T.log( T.mean(T.exp(a-a_max), axis=2) ): (batch_size, eq_samples)
    # -> this is the log of the mean over the importance weighted samples
    #
    # The outer T.mean() computes the mean over eq_samples and batch_size
    #
    # Lastly we add T.mean(a_max) to correct for the log-sum-exp trick
    LL = T.mean(a_max) + T.mean(T.log(T.mean(T.exp(a - a_max), axis=2)))

    return LL, T.mean(log_qz_given_x), T.mean(log_pz), T.mean(log_px_given_z)
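# For reference, the estimator assembled above is the importance-weighted bound of
# Burda et al. 2015, Eq (8):
#
#   L_k(x) = E_{z_1..z_k ~ q(z|x)}[ log (1/k) * sum_i p(x, z_i) / q(z_i|x) ]
#
# with k = iw_samples; the outer T.mean() additionally averages this estimate over the
# eq_samples independent draws used for the outer expectation and over the minibatch.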
def latent_gaussian_x_bernoulli(z, z_mu, z_log_var, x_mu, x, eq_samples, iw_samples, epsilon=1e-6):
    """
    Latent z : gaussian with standard normal prior
    decoder output : bernoulli

    When the output is bernoulli then the output from the decoder
    should be sigmoid.

    The sizes of the inputs are
    z: (batch_size*eq_samples*iw_samples, num_latent)
    z_mu: (batch_size, num_latent)
    z_log_var: (batch_size, num_latent)
    x_mu: (batch_size*eq_samples*iw_samples, num_features)
    x: (batch_size, num_features)

    Reference: Burda et al. 2015 "Importance Weighted Autoencoders"
    """
    # reshape the variables so batch_size, eq_samples and iw_samples are separate dimensions
    z = z.reshape((-1, eq_samples, iw_samples, latent_size))
    x_mu = x_mu.reshape((-1, eq_samples, iw_samples, num_features))

    # dimshuffle x, z_mu and z_log_var since we need to broadcast them when calculating the pdfs
    x = x.dimshuffle(0, 'x', 'x', 1)                  # size: (batch_size, eq_samples, iw_samples, num_features)
    z_mu = z_mu.dimshuffle(0, 'x', 'x', 1)            # size: (batch_size, eq_samples, iw_samples, num_latent)
    z_log_var = z_log_var.dimshuffle(0, 'x', 'x', 1)  # size: (batch_size, eq_samples, iw_samples, num_latent)

    # calculate LL components, note that the log_xyz() functions return log prob. for independent components
    # separately, so we sum over feature/latent dimensions for multivariate pdfs
    log_qz_given_x = log_normal2(z, z_mu, z_log_var).sum(axis=3)
    log_pz = log_stdnormal(z).sum(axis=3)
    log_px_given_z = log_bernoulli(x, T.clip(x_mu, epsilon, 1 - epsilon)).sum(axis=3)

    # all log_*** should have dimension (batch_size, eq_samples, iw_samples)
    # Calculate the LL using log-sum-exp to avoid underflow
    a = log_pz + log_px_given_z - log_qz_given_x  # size: (batch_size, eq_samples, iw_samples)
    a_max = T.max(a, axis=2, keepdims=True)       # size: (batch_size, eq_samples, 1)

    # LL is calculated using Eq (8) in Burda et al.
    # Working from the inside out of the calculation below:
    # T.exp(a-a_max): (batch_size, eq_samples, iw_samples)
    # -> subtract a_max to avoid overflow. a_max is specific for each set of
    #    importance samples and is broadcasted over the last dimension.
    #
    # T.log( T.mean(T.exp(a-a_max), axis=2) ): (batch_size, eq_samples)
    # -> this is the log of the mean over the importance weighted samples
    #
    # The outer T.mean() computes the mean over eq_samples and batch_size
    #
    # Lastly we add T.mean(a_max) to correct for the log-sum-exp trick
    LL = T.mean(a_max) + T.mean(T.log(T.mean(T.exp(a - a_max), axis=2)))

    return LL, T.mean(log_qz_given_x), T.mean(log_pz), T.mean(log_px_given_z)
def latent_gaussian_x_bernoulli(z0, zk, z0_mu, z0_log_var, logdet_J_list, x_mu, x, eq_samples, iw_samples, epsilon=1e-6):
    """
    Latent z : gaussian with standard normal prior
    decoder output : bernoulli

    When the output is bernoulli then the output from the decoder
    should be sigmoid.

    The sizes of the inputs are
    z0: (batch_size*eq_samples*iw_samples, num_latent)
    zk: (batch_size*eq_samples*iw_samples, num_latent)
    z0_mu: (batch_size, num_latent)
    z0_log_var: (batch_size, num_latent)
    logdet_J_list: list of `nflows` elements, each with shape (batch_size*eq_samples*iw_samples)
    x_mu: (batch_size*eq_samples*iw_samples, num_features)
    x: (batch_size, num_features)

    Reference: Burda et al. 2015 "Importance Weighted Autoencoders"
    """
    # reshape the variables so batch_size, eq_samples and iw_samples are separate dimensions
    z0 = z0.reshape((-1, eq_samples, iw_samples, latent_size))
    zk = zk.reshape((-1, eq_samples, iw_samples, latent_size))
    x_mu = x_mu.reshape((-1, eq_samples, iw_samples, num_features))
    for i in range(len(logdet_J_list)):
        logdet_J_list[i] = logdet_J_list[i].reshape((-1, eq_samples, iw_samples))

    # dimshuffle x, z0_mu and z0_log_var since we need to broadcast them when calculating the pdfs
    x = x.dimshuffle(0, 'x', 'x', 1)                        # size: (batch_size, eq_samples, iw_samples, num_features)
    z0_mu = z0_mu.dimshuffle(0, 'x', 'x', 1)                # size: (batch_size, eq_samples, iw_samples, num_latent)
    z0_log_var = z0_log_var.dimshuffle(0, 'x', 'x', 1)      # size: (batch_size, eq_samples, iw_samples, num_latent)

    # calculate LL components, note that the log_xyz() functions return log prob. for independent components
    # separately, so we sum over feature/latent dimensions for multivariate pdfs
    log_q0z0_given_x = log_normal2(z0, z0_mu, z0_log_var).sum(axis=3)
    log_pzk = log_stdnormal(zk).sum(axis=3)
    log_px_given_zk = log_bernoulli(x, x_mu, epsilon).sum(axis=3)

    # normalizing flow loss: sum of log-determinant terms, one per flow step
    sum_logdet_J = 0
    for logdet_J_k in logdet_J_list:
        sum_logdet_J += logdet_J_k

    # Calculate the LL using log-sum-exp to avoid underflow
    # all log_*** -> shape: (batch_size, eq_samples, iw_samples)
    LL = log_mean_exp(log_pzk + log_px_given_zk - log_q0z0_given_x + sum_logdet_J, axis=2)  # log-mean-exp over iw_samples dimension -> shape: (batch_size, eq_samples)
    LL = T.mean(LL)  # average over eq_samples and batch_size dimensions -> shape: ()

    return LL, T.mean(log_q0z0_given_x), T.mean(sum_logdet_J), T.mean(log_pzk), T.mean(log_px_given_zk)
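# log_mean_exp is not defined in this file; it is assumed to be the numerically stable
# log-mean-exp that replaces the explicit a_max bookkeeping used in the other bound
# implementations above. A minimal sketch:
import theano.tensor as T

def log_mean_exp(a, axis):
    # log(mean(exp(a), axis)) computed stably: subtract the per-row max before
    # exponentiating, then add it back outside the log
    a_max = T.max(a, axis=axis, keepdims=True)
    return T.log(T.mean(T.exp(a - a_max), axis=axis)) + T.max(a, axis=axis)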
def latent_gaussian_x_bernoulli(z, z_mu, z_log_var, x, x_mu, analytic_kl_term):
    """
    Latent z : gaussian with standard normal prior
    decoder output : bernoulli

    When the output is bernoulli then the output from the decoder
    should be sigmoid.
    """
    if analytic_kl_term:
        kl_term = kl_normal2_stdnormal(z_mu, z_log_var).sum(axis=1)
        log_px_given_z = log_bernoulli(x, x_mu).sum(axis=1)
        LL = T.mean(-kl_term + log_px_given_z)
    else:
        log_qz_given_x = log_normal2(z, z_mu, z_log_var).sum(axis=1)
        log_pz = log_stdnormal(z).sum(axis=1)
        log_px_given_z = log_bernoulli(x, x_mu).sum(axis=1)
        LL = T.mean(log_pz + log_px_given_z - log_qz_given_x)
    return LL
def lowerbound_for_reinforce(z, z_mu, z_log_var, x_mu, x, num_features, num_labelled, num_classes, epsilon=1e-6):
    x = x.reshape((-1, num_features))
    x_mu = x_mu.reshape((-1, num_features))

    log_qz_given_xy = log_normal2(z, z_mu, z_log_var).sum(axis=1)
    log_pz = log_stdnormal(z).sum(axis=1)
    log_py = T.log(1.0 / num_classes)  # uniform prior over the class label y
    log_px_given_zy = log_bernoulli(x, T.clip(x_mu, epsilon, 1 - epsilon)).sum(axis=1)
    ll_xy = log_px_given_zy + log_pz + log_py - log_qz_given_xy

    # return the per-example bound for the unlabelled part of the batch
    # (the first `num_labelled` rows are assumed to be the labelled examples)
    return ll_xy[num_labelled:]
def latent_gaussian_x_bernoulli(z, z_mu, z_log_var, x_mu, x, analytic_kl_term):
    """
    Latent z : gaussian with standard normal prior
    decoder output : bernoulli

    When the output is bernoulli then the output from the decoder
    should be sigmoid.

    The sizes of the inputs are
    z: (batch_size, num_latent)
    z_mu: (batch_size, num_latent)
    z_log_var: (batch_size, num_latent)
    x_mu: (batch_size, num_features)
    x: (batch_size, num_features)
    """
    if analytic_kl_term:
        kl_term = kl_normal2_stdnormal(z_mu, z_log_var).sum(axis=1)
        log_px_given_z = log_bernoulli(x, x_mu).sum(axis=1)
        LL = T.mean(-kl_term + log_px_given_z)
    else:
        log_qz_given_x = log_normal2(z, z_mu, z_log_var).sum(axis=1)
        log_pz = log_stdnormal(z).sum(axis=1)
        log_px_given_z = log_bernoulli(x, x_mu).sum(axis=1)
        LL = T.mean(log_pz + log_px_given_z - log_qz_given_x)
    return LL
def latent_gaussian_x_bernoulli(z, z_mu, z_log_var, x_mu, x, analytic_kl_term):
    """
    Latent z : gaussian with standard normal prior
    decoder output : bernoulli

    When the output is bernoulli then the output from the decoder
    should be sigmoid.

    The sizes of the inputs are
    z: (batch_size, num_latent)
    z_mu: (batch_size, num_latent)
    z_log_var: (batch_size, num_latent)
    x_mu: (batch_size, num_features)
    x: (batch_size, num_features)
    """
    if analytic_kl_term:
        kl_term = kl_normal2_stdnormal(z_mu, z_log_var).sum(axis=1)
        log_px_given_z = log_bernoulli(x, x_mu, eps=1e-6).sum(axis=1)
        LL = T.mean(-kl_term + log_px_given_z)
    else:
        log_qz_given_x = log_normal2(z, z_mu, z_log_var).sum(axis=1)
        log_pz = log_stdnormal(z).sum(axis=1)
        log_px_given_z = log_bernoulli(x, x_mu, eps=1e-6).sum(axis=1)
        LL = T.mean(log_pz + log_px_given_z - log_qz_given_x)
    return LL
def latent_gaussian_x_bernoulli(z, z_mu, z_log_var, x_mu, x, eq_samples, iw_samples, epsilon=1e-6):
    """
    Latent z : gaussian with standard normal prior
    decoder output : bernoulli

    When the output is bernoulli then the output from the decoder
    should be sigmoid.

    The sizes of the inputs are
    z: (batch_size*eq_samples*iw_samples, num_latent)
    z_mu: (batch_size, num_latent)
    z_log_var: (batch_size, num_latent)
    x_mu: (batch_size*eq_samples*iw_samples, num_features)
    x: (batch_size, num_features)

    Reference: Burda et al. 2015 "Importance Weighted Autoencoders"
    """
    # reshape the variables so batch_size, eq_samples and iw_samples are separate dimensions
    z = z.reshape((-1, eq_samples, iw_samples, latent_size))
    x_mu = x_mu.reshape((-1, eq_samples, iw_samples, num_features))

    # dimshuffle x, z_mu and z_log_var since we need to broadcast them when calculating the pdfs
    x = x.dimshuffle(0, 'x', 'x', 1)                  # size: (batch_size, eq_samples, iw_samples, num_features)
    z_mu = z_mu.dimshuffle(0, 'x', 'x', 1)            # size: (batch_size, eq_samples, iw_samples, num_latent)
    z_log_var = z_log_var.dimshuffle(0, 'x', 'x', 1)  # size: (batch_size, eq_samples, iw_samples, num_latent)

    # calculate LL components, note that the log_xyz() functions return log prob. for independent components
    # separately, so we sum over feature/latent dimensions for multivariate pdfs
    log_qz_given_x = log_normal2(z, z_mu, z_log_var).sum(axis=3)
    log_pz = log_stdnormal(z).sum(axis=3)
    log_px_given_z = log_bernoulli(x, x_mu, epsilon).sum(axis=3)

    # Calculate the LL using log-sum-exp to avoid underflow
    # all log_*** -> shape: (batch_size, eq_samples, iw_samples)
    LL = log_mean_exp(log_pz + log_px_given_z - log_qz_given_x, axis=2)  # log-mean-exp over iw_samples dimension -> shape: (batch_size, eq_samples)
    LL = T.mean(LL)  # average over eq_samples and batch_size dimensions -> shape: ()

    return LL, T.mean(log_qz_given_x), T.mean(log_pz), T.mean(log_px_given_z)
def latent_gaussian_x_bernoulli(z, z_mu_q, z_logvar_q, z_mu_p, z_logvar_p, x_mu, x, eq_samples, iw_samples,
                                latent_sizes, num_features, epsilon=1e-6, reverse_z=False, clip_val=None,
                                temp=None, epoch=None):
    """
    Latent z : gaussian, with a standard normal prior on the top layer
    decoder output : bernoulli

    When the output is bernoulli then the output from the decoder should be sigmoid.

    z, z_mu_q, z_logvar_q are lists with one entry per stochastic layer;
    z_mu_p, z_logvar_p have one entry per layer except the top one (standard normal prior).
    Samples have shape (batch_size*eq_samples*iw_samples, latent_sizes[i]).
    x_mu: (batch_size*eq_samples*iw_samples, num_features)
    """
    if reverse_z:
        # for ladder-like VAE where x->z3->z2->z1
        z = [z_.reshape((-1, eq_samples, iw_samples, ls)) for z_, ls in zip(z, latent_sizes)]
        z_mu_q = [z_.reshape((-1, eq_samples, iw_samples, ls)) for z_, ls in zip(z_mu_q[:-1], latent_sizes[:-1])] + [z_mu_q[-1].dimshuffle((0, 'x', 'x', 1))]
        z_logvar_q = [z_.reshape((-1, eq_samples, iw_samples, ls)) for z_, ls in zip(z_logvar_q[:-1], latent_sizes[:-1])] + [z_logvar_q[-1].dimshuffle((0, 'x', 'x', 1))]
    else:
        # for normal VAE where x->z1->z2->z3
        z = [z_.reshape((-1, eq_samples, iw_samples, ls)) for z_, ls in zip(z, latent_sizes)]
        z_mu_q = [z_mu_q[0].dimshuffle((0, 'x', 'x', 1))] + [z_.reshape((-1, eq_samples, iw_samples, ls)) for z_, ls in zip(z_mu_q[1:], latent_sizes[1:])]
        z_logvar_q = [z_logvar_q[0].dimshuffle((0, 'x', 'x', 1))] + [z_.reshape((-1, eq_samples, iw_samples, ls)) for z_, ls in zip(z_logvar_q[1:], latent_sizes[1:])]

    z_mu_p = [z_.reshape((-1, eq_samples, iw_samples, ls)) for z_, ls in zip(z_mu_p, latent_sizes[:-1])]
    z_logvar_p = [z_.reshape((-1, eq_samples, iw_samples, ls)) for z_, ls in zip(z_logvar_p, latent_sizes[:-1])]

    x_mu = x_mu.reshape((-1, eq_samples, iw_samples, num_features))
    x = x.dimshuffle((0, 'x', 'x', 1))

    log_pz = [log_normal2(z_, mu_, logvar_) for z_, mu_, logvar_ in zip(z[:-1], z_mu_p, z_logvar_p)] + [log_stdnormal(z[-1])]
    log_qz = [log_normal2(z_, mu_, logvar_) for z_, mu_, logvar_ in zip(z, z_mu_q, z_logvar_q)]
    log_px = -T.nnet.binary_crossentropy(T.clip(x_mu, epsilon, 1 - epsilon), x)

    if clip_val is not None:
        log_pz = [T.clip(lp, clip_val, 0) for lp in log_pz]
        log_qz = [T.clip(lp, clip_val, 0) for lp in log_qz]

    # after summing over the feature/latent dimension, all log_*** have
    # dimension (batch_size, eq_samples, iw_samples)
    nlayers = len(log_qz)
    if temp is None:
        temp = [1.0 for _ in range(nlayers)]
    else:
        temp_step = (nlayers + 1) * temp / 100.0
        temp = [T.maximum((i + 1) * temp - epoch * temp_step, 0.0) for i in range(nlayers)]

    a = log_px.sum(axis=3) + sum([p.sum(axis=3) * t for p, t in zip(log_pz, temp)]) - sum([p.sum(axis=3) for p in log_qz])
    a_max = T.max(a, axis=2, keepdims=True)  # (batch_size, eq_samples, 1)

    # It is important that a_max is inside the mean since it is sample specific.
    # T.exp(a-a_max): (batch_size, eq_samples, iw_samples)
    # -> subtract a_max to avoid overflow. a_max is specific for each set of
    #    importance samples and is broadcasted over the last dimension.
    #
    # T.log( T.mean(T.exp(a-a_max), axis=2) ): (batch_size, eq_samples)
    # -> this is the log of the mean over the importance weighted samples
    #
    # Adding back T.mean(a_max) gives the LL estimator, Eq (8) in Burda et al. 2015,
    # where iw_samples is used to estimate the inner expectation; the estimator is
    # then averaged over eq_samples and the batch.
    lower_bound = T.mean(a_max) + T.mean(T.log(T.mean(T.exp(a - a_max), axis=2)))

    return lower_bound, log_px, log_pz, log_qz
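# Numeric illustration of the per-layer temperature schedule above (a sketch only,
# assuming temp=1.0, three stochastic layers and a plain integer epoch counter; with
# symbolic inputs the same expressions are built with T.maximum):
nlayers, temp0 = 3, 1.0
temp_step = (nlayers + 1) * temp0 / 100.0  # = 0.04 per epoch
for epoch in (0, 25, 50, 100):
    temps = [max((i + 1) * temp0 - epoch * temp_step, 0.0) for i in range(nlayers)]
    print(epoch, temps)
# epoch 0   -> [1.0, 2.0, 3.0]
# epoch 25  -> [0.0, 1.0, 2.0]
# epoch 50  -> [0.0, 0.0, 1.0]
# epoch 100 -> [0.0, 0.0, 0.0]  (the weights on the log p(z) terms ramp linearly to zero)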
def latent_gaussian_x_bernoulli(z0, zk, z0_mu, z0_log_var, logdet_J_list, x_mu, x, eq_samples, iw_samples, epsilon=1e-6):
    """
    Latent z : gaussian with standard normal prior
    decoder output : bernoulli

    When the output is bernoulli then the output from the decoder
    should be sigmoid.

    The sizes of the inputs are
    z0: (batch_size*eq_samples*iw_samples, num_latent)
    zk: (batch_size*eq_samples*iw_samples, num_latent)
    z0_mu: (batch_size, num_latent)
    z0_log_var: (batch_size, num_latent)
    logdet_J_list: list of `nflows` elements, each with shape (batch_size*eq_samples*iw_samples)
    x_mu: (batch_size*eq_samples*iw_samples, num_features)
    x: (batch_size, num_features)

    Reference: Burda et al. 2015 "Importance Weighted Autoencoders"
    """
    # reshape the variables so batch_size, eq_samples and iw_samples are separate dimensions
    z0 = z0.reshape((-1, eq_samples, iw_samples, latent_size))
    zk = zk.reshape((-1, eq_samples, iw_samples, latent_size))
    x_mu = x_mu.reshape((-1, eq_samples, iw_samples, num_features))
    for i in range(len(logdet_J_list)):
        logdet_J_list[i] = logdet_J_list[i].reshape((-1, eq_samples, iw_samples))

    # dimshuffle x, z0_mu and z0_log_var since we need to broadcast them when calculating the pdfs
    x = x.dimshuffle(0, 'x', 'x', 1)                    # size: (batch_size, eq_samples, iw_samples, num_features)
    z0_mu = z0_mu.dimshuffle(0, 'x', 'x', 1)            # size: (batch_size, eq_samples, iw_samples, num_latent)
    z0_log_var = z0_log_var.dimshuffle(0, 'x', 'x', 1)  # size: (batch_size, eq_samples, iw_samples, num_latent)

    # calculate LL components, note that the log_xyz() functions return log prob. for independent components
    # separately, so we sum over feature/latent dimensions for multivariate pdfs
    log_q0z0_given_x = log_normal2(z0, z0_mu, z0_log_var).sum(axis=3)
    log_pzk = log_stdnormal(zk).sum(axis=3)
    log_px_given_zk = log_bernoulli(x, x_mu, epsilon).sum(axis=3)

    # normalizing flow loss: sum of log-determinant terms, one per flow step
    sum_logdet_J = 0
    for logdet_J_k in logdet_J_list:
        sum_logdet_J += logdet_J_k

    # Calculate the LL using log-sum-exp to avoid underflow
    # all log_*** -> shape: (batch_size, eq_samples, iw_samples)
    LL = log_mean_exp(log_pzk + log_px_given_zk - log_q0z0_given_x + sum_logdet_J, axis=2)  # log-mean-exp over iw_samples dimension -> shape: (batch_size, eq_samples)
    LL = T.mean(LL)  # average over eq_samples and batch_size dimensions -> shape: ()

    return LL, T.mean(log_q0z0_given_x), T.mean(sum_logdet_J), T.mean(log_pzk), T.mean(log_px_given_zk)
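# Hypothetical usage sketch for the flow variant above (a sketch only: it assumes this
# latent_gaussian_x_bernoulli is the definition in scope and that the module-level globals
# latent_size and num_features match the symbolic inputs, which would normally come from an
# encoder/decoder built elsewhere).
import theano
import theano.tensor as T

eq_samples, iw_samples, nflows = 1, 5, 2

z0_sym, zk_sym = T.matrix('z0'), T.matrix('zk')
z0_mu_sym, z0_log_var_sym = T.matrix('z0_mu'), T.matrix('z0_log_var')
x_mu_sym, x_sym = T.matrix('x_mu'), T.matrix('x')
logdet_syms = [T.vector('logdet_%d' % k) for k in range(nflows)]

LL, log_qz0, logdet, log_pzk, log_px = latent_gaussian_x_bernoulli(
    z0_sym, zk_sym, z0_mu_sym, z0_log_var_sym, list(logdet_syms),
    x_mu_sym, x_sym, eq_samples, iw_samples)

# -LL would be the training cost; here we only compile an evaluation function
eval_fn = theano.function(
    [z0_sym, zk_sym, z0_mu_sym, z0_log_var_sym] + logdet_syms + [x_mu_sym, x_sym],
    [LL, log_qz0, logdet, log_pzk, log_px])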