Example #1
def latent_gaussian_x_gaussian(z, z_mu, z_log_var, x_mu, x_log_var, x, eq_samples, iw_samples, epsilon=1e-6):
    # reshape the variables so batch_size, eq_samples and iw_samples are separate dimensions
    z = z.reshape((-1, eq_samples, iw_samples, latent_size))
    x_mu = x_mu.reshape((-1, eq_samples, iw_samples, num_features))
    x_log_var = x_log_var.reshape((-1, eq_samples, iw_samples, num_features))

    # dimshuffle x, z_mu and z_log_var since we need to broadcast them when calculating the pdfs
    x = x.reshape((-1,num_features))
    x = x.dimshuffle(0, 'x', 'x', 1)                    # size: (batch_size, eq_samples, iw_samples, num_features)
    z_mu = z_mu.dimshuffle(0, 'x', 'x', 1)              # size: (batch_size, eq_samples, iw_samples, num_latent)
    z_log_var = z_log_var.dimshuffle(0, 'x', 'x', 1)    # size: (batch_size, eq_samples, iw_samples, num_latent)

    # calculate LL components, note that the log_xyz() functions return log prob. for independent components separately
    # so we sum over feature/latent dimensions for multivariate pdfs
    log_qz_given_x = log_normal2(z, z_mu, z_log_var).sum(axis=3)
    log_pz = log_stdnormal(z).sum(axis=3)
    #log_px_given_z = log_bernoulli(x, T.clip(x_mu, epsilon, 1 - epsilon)).sum(axis=3)
    log_px_given_z = log_normal2(x, x_mu, x_log_var).sum(axis=3)

    #all log_*** should have dimension (batch_size, eq_samples, iw_samples)
    # Calculate the LL using log-sum-exp to avoid underflow
    a = log_pz + log_px_given_z - log_qz_given_x    # size: (batch_size, eq_samples, iw_samples)
    a_max = T.max(a, axis=2, keepdims=True)         # size: (batch_size, eq_samples, 1)

    LL = T.mean(a_max) + T.mean( T.log( T.mean(T.exp(a-a_max), axis=2) ) )

    return LL, T.mean(log_qz_given_x), T.mean(log_pz), T.mean(log_px_given_z)
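The snippets in this section assume Parmesan-style elementwise log-density helpers (log_normal2, log_stdnormal, log_bernoulli) imported elsewhere. For reference, here is a minimal sketch that is consistent with the call signatures used above and assumes the standard closed-form densities; it is not the original implementation.

import math
import theano.tensor as T

c = -0.5 * math.log(2 * math.pi)

def log_stdnormal(x):
    # elementwise log N(x; 0, 1)
    return c - x ** 2 / 2

def log_normal2(x, mean, log_var):
    # elementwise log N(x; mean, exp(log_var)), parameterised by the log-variance
    return c - log_var / 2 - (x - mean) ** 2 / (2 * T.exp(log_var))

def log_bernoulli(x, p, eps=0.0):
    # elementwise log Bernoulli(x; p); eps keeps p away from 0 and 1
    p = T.clip(p, eps, 1 - eps)
    return x * T.log(p) + (1 - x) * T.log(1 - p)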
Example #2
def latent_gaussian_x_bernoulli(z, z_mu, psi_u_list, z_log_var, x_mu, x, eq_samples, iw_samples, epsilon=1e-6):
    """
    Latent z       : gaussian with standard normal prior
    decoder output : bernoulli

    When the output is bernoulli then the output from the decoder
    should be sigmoid. The sizes of the inputs are
    z: (batch_size*eq_samples*iw_samples, num_latent)
    z_mu: (batch_size, num_latent)
    z_log_var: (batch_size, num_latent)
    psi_u_list: list of per-flow-step tensors, each with shape (batch_size*eq_samples*iw_samples)
    x_mu: (batch_size*eq_samples*iw_samples, num_features)
    x: (batch_size, num_features)

    Reference: Burda et al. 2015 "Importance Weighted Autoencoders"

    """

    # reshape the variables so batch_size, eq_samples and iw_samples are separate dimensions
    z = z.reshape((-1, eq_samples, iw_samples,  latent_size))
    x_mu = x_mu.reshape((-1, eq_samples, iw_samples,  num_features))

    # dimshuffle x since we need to broadcast it when calculating the binary
    # cross-entropy
    x = x.dimshuffle(0,'x','x',1) # x: (batch_size, eq_samples, iw_samples, num_features)

    for i in range(len(psi_u_list)):
        psi_u_list[i] = psi_u_list[i].reshape((-1, eq_samples, iw_samples))


    #calculate LL components, note that we sum over the feature/num_unit dimension
    z_mu = z_mu.dimshuffle(0,'x','x',1) # size: (batch_size, eq_samples, iw_samples, num_latent)
    z_log_var = z_log_var.dimshuffle(0,'x','x',1) # size: (batch_size, eq_samples, iw_samples, num_latent)
    log_qz_given_x = log_normal2(z, z_mu, z_log_var).sum(axis=3)
    log_pz = log_stdnormal(z).sum(axis=3)
    log_px_given_z = log_bernoulli(x, T.clip(x_mu,epsilon,1-epsilon)).sum(axis=3)

    #normalizing flow loss
    sum_log_psiu = 0
    for psi_u in psi_u_list:
        sum_log_psiu +=  T.log(T.abs_(1+psi_u))

    #all log_*** should have dimension (batch_size, eq_samples, iw_samples)
    # Calculate the LL using log-sum-exp to avoid underflow
    a = log_pz + log_px_given_z - log_qz_given_x + sum_log_psiu    # size: (batch_size, eq_samples, iw_samples)
    a_max = T.max(a, axis=2, keepdims=True)                        # size: (batch_size, eq_samples, 1)

    # LL is calculated using Eq (8) in Burda et al.
    # Working through the calculation below from the inside out:
    # T.exp(a-a_max): (batch_size, eq_samples, iw_samples)
    # -> subtract a_max to avoid overflow. a_max is specific to each set of
    # importance samples and is broadcasted over the last dimension.
    #
    # T.log( T.mean(T.exp(a-a_max), axis=2) ): (batch_size, eq_samples)
    # -> This is the log of the mean over the importance weighted samples
    #
    # Lastly we add T.mean(a_max) to correct for the log-sum-exp trick
    LL = T.mean(a_max) + T.mean( T.log( T.mean(T.exp(a-a_max), axis=2)))

    return LL, T.mean(log_qz_given_x), T.mean(log_pz), T.mean(log_px_given_z)
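The psi_u_list terms summed above as log|1 + psi_u| correspond to the log-det-Jacobian of a planar normalizing flow (Rezende & Mohamed, 2015), log|1 + u^T psi(z)|. The sketch below only illustrates where such a term could come from; planar_flow_step and its parameters u, w, b are hypothetical names and not part of the original code.

import theano.tensor as T

def planar_flow_step(z, u, w, b):
    # One planar flow transform z_k = z + u * tanh(w^T z + b).
    # Returns the transformed sample and psi_u = psi(z)^T u per row, whose
    # log|1 + psi_u| is the per-step term accumulated in sum_log_psiu above.
    # Shapes: z (n, num_latent); u, w (num_latent,); b scalar.
    wz_b = T.dot(z, w) + b                                                   # (n,)
    z_k = z + u.dimshuffle('x', 0) * T.tanh(wz_b).dimshuffle(0, 'x')         # (n, num_latent)
    psi = (1 - T.tanh(wz_b) ** 2).dimshuffle(0, 'x') * w.dimshuffle('x', 0)  # psi(z) = h'(w^T z + b) * w
    psi_u = T.dot(psi, u)                                                    # (n,)
    return z_k, psi_u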
Example #3
def log_likelihood(z, z_mu, z_log_var, x_mu, x, analytic_kl_term):
    if analytic_kl_term:
        kl_term = kl_normal2_stdnormal(z_mu, z_log_var).sum(axis=1)
        log_px_given_z = log_bernoulli(x, x_mu, eps=1e-6).sum(axis=1)
        LL = T.mean(-kl_term + log_px_given_z)
    else:
        log_qz_given_x = log_normal2(z, z_mu, z_log_var).sum(axis=1)
        log_pz = log_stdnormal(z).sum(axis=1)
        log_px_given_z = log_bernoulli(x, x_mu, eps=1e-6).sum(axis=1)
        LL = T.mean(log_pz + log_px_given_z - log_qz_given_x)
    return LL
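The analytic branch relies on a kl_normal2_stdnormal helper: the closed-form KL divergence between the diagonal Gaussian posterior N(z_mu, exp(z_log_var)) and the standard normal prior. A minimal sketch, assuming the usual elementwise expression (the caller sums it over the latent dimension):

import theano.tensor as T

def kl_normal2_stdnormal(mean, log_var):
    # elementwise KL( N(mean, exp(log_var)) || N(0, 1) )
    # = -0.5 * (1 + log_var - mean^2 - exp(log_var))
    return -0.5 * (1 + log_var - mean ** 2 - T.exp(log_var))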
Example #4
def latent_gaussian_x_bernoulli(z, z_mu, z_log_var, x_mu, x, eq_samples, iw_samples, epsilon=1e-6):
    """
    Latent z       : gaussian with standard normal prior
    decoder output : bernoulli

    When the output is bernoulli then the output from the decoder
    should be sigmoid. The sizes of the inputs are
    z: (batch_size*eq_samples*iw_samples, num_latent)
    z_mu: (batch_size, num_latent)
    z_log_var: (batch_size, num_latent)
    x_mu: (batch_size*eq_samples*iw_samples, num_features)
    x: (batch_size, num_features)

    Reference: Burda et al. 2015 "Importance Weighted Autoencoders"
    """

    # reshape the variables so batch_size, eq_samples and iw_samples are separate dimensions
    z = z.reshape((-1, eq_samples, iw_samples, latent_size))
    x_mu = x_mu.reshape((-1, eq_samples, iw_samples, num_features))

    # dimshuffle x, z_mu and z_log_var since we need to broadcast them when calculating the pdfs
    x = x.dimshuffle(0, 'x', 'x', 1)                    # size: (batch_size, eq_samples, iw_samples, num_features)
    z_mu = z_mu.dimshuffle(0, 'x', 'x', 1)              # size: (batch_size, eq_samples, iw_samples, num_latent)
    z_log_var = z_log_var.dimshuffle(0, 'x', 'x', 1)    # size: (batch_size, eq_samples, iw_samples, num_latent)

    # calculate LL components, note that the log_xyz() functions return log prob. for independent components separately
    # so we sum over feature/latent dimensions for multivariate pdfs
    log_qz_given_x = log_normal2(z, z_mu, z_log_var).sum(axis=3)
    log_pz = log_stdnormal(z).sum(axis=3)
    log_px_given_z = log_bernoulli(x, x_mu, eps=epsilon).sum(axis=3)

    #all log_*** should have dimension (batch_size, eq_samples, iw_samples)
    # Calculate the LL using log-sum-exp to avoid underflow
    a = log_pz + log_px_given_z - log_qz_given_x    # size: (batch_size, eq_samples, iw_samples)
    a_max = T.max(a, axis=2, keepdims=True)         # size: (batch_size, eq_samples, 1)

    # LL is calculated using Eq (8) in Burda et al.
    # Working through the calculation below from the inside out:
    # T.exp(a-a_max): (batch_size, eq_samples, iw_samples)
    # -> subtract a_max to avoid overflow. a_max is specific to each set of
    # importance samples and is broadcasted over the last dimension.
    #
    # T.log( T.mean(T.exp(a-a_max), axis=2) ): (batch_size, eq_samples)
    # -> This is the log of the mean over the importance weighted samples
    #
    # The outer T.mean() computes the mean over eq_samples and batch_size
    #
    # Lastly we add T.mean(a_max) to correct for the log-sum-exp trick
    LL = T.mean(a_max) + T.mean( T.log( T.mean(T.exp(a-a_max), axis=2) ) )

    return LL, T.mean(log_qz_given_x), T.mean(log_pz), T.mean(log_px_given_z)
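To see why the a_max shift matters numerically, the following NumPy check (illustrative only, not part of the original code) compares the naive estimator with the shifted one on log-weights small enough for exp() to underflow:

import numpy as np

rng = np.random.RandomState(0)
batch_size, eq_samples, iw_samples = 4, 3, 5

# fake log importance weights, shifted down far enough that exp() underflows
a = rng.randn(batch_size, eq_samples, iw_samples) - 1000.0

# naive estimator: exp(a) underflows to 0 and the log becomes -inf
naive = np.mean(np.log(np.mean(np.exp(a), axis=2)))

# log-sum-exp trick, exactly as in the LL line above
a_max = np.max(a, axis=2, keepdims=True)
stable = np.mean(a_max) + np.mean(np.log(np.mean(np.exp(a - a_max), axis=2)))

print(naive)   # -inf (plus a divide-by-zero warning from np.log)
print(stable)  # a finite value around -1000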
Example #5
def latent_gaussian_x_bernoulli(z, z_mu, z_log_var, x_mu, x, eq_samples, iw_samples, epsilon=1e-6):
    """
    Latent z       : gaussian with standard normal prior
    decoder output : bernoulli

    When the output is bernoulli then the output from the decoder
    should be sigmoid. The sizes of the inputs are
    z: (batch_size*eq_samples*iw_samples, num_latent)
    z_mu: (batch_size, num_latent)
    z_log_var: (batch_size, num_latent)
    x_mu: (batch_size*eq_samples*iw_samples, num_features)
    x: (batch_size, num_features)

    Reference: Burda et al. 2015 "Importance Weighted Autoencoders"
    """

    # reshape the variables so batch_size, eq_samples and iw_samples are separate dimensions
    z = z.reshape((-1, eq_samples, iw_samples, latent_size))
    x_mu = x_mu.reshape((-1, eq_samples, iw_samples, num_features))

    # dimshuffle x, z_mu and z_log_var since we need to broadcast them when calculating the pdfs
    x = x.dimshuffle(0, 'x', 'x', 1)                    # size: (batch_size, eq_samples, iw_samples, num_features)
    z_mu = z_mu.dimshuffle(0, 'x', 'x', 1)              # size: (batch_size, eq_samples, iw_samples, num_latent)
    z_log_var = z_log_var.dimshuffle(0, 'x', 'x', 1)    # size: (batch_size, eq_samples, iw_samples, num_latent)

    # calculate LL components, note that the log_xyz() functions return log prob. for independent components separately
    # so we sum over feature/latent dimensions for multivariate pdfs
    log_qz_given_x = log_normal2(z, z_mu, z_log_var).sum(axis=3)
    log_pz = log_stdnormal(z).sum(axis=3)
    log_px_given_z = log_bernoulli(x, T.clip(x_mu, epsilon, 1 - epsilon)).sum(axis=3)

    #all log_*** should have dimension (batch_size, eq_samples, iw_samples)
    # Calculate the LL using log-sum-exp to avoid underflow
    a = log_pz + log_px_given_z - log_qz_given_x    # size: (batch_size, eq_samples, iw_samples)
    a_max = T.max(a, axis=2, keepdims=True)         # size: (batch_size, eq_samples, 1)

    # LL is calculated using Eq (8) in Burda et al.
    # Working through the calculation below from the inside out:
    # T.exp(a-a_max): (batch_size, eq_samples, iw_samples)
    # -> subtract a_max to avoid overflow. a_max is specific to each set of
    # importance samples and is broadcasted over the last dimension.
    #
    # T.log( T.mean(T.exp(a-a_max), axis=2) ): (batch_size, eq_samples)
    # -> This is the log of the mean over the importance weighted samples
    #
    # The outer T.mean() computes the mean over eq_samples and batch_size
    #
    # Lastly we add T.mean(a_max) to correct for the log-sum-exp trick
    LL = T.mean(a_max) + T.mean( T.log( T.mean(T.exp(a-a_max), axis=2) ) )

    return LL, T.mean(log_qz_given_x), T.mean(log_pz), T.mean(log_px_given_z)
Example #6
def latent_gaussian_x_bernoulli(z0, zk, z0_mu, z0_log_var, logdet_J_list, x_mu, x, eq_samples, iw_samples, epsilon=1e-6):
    """
    Latent z       : gaussian with standard normal prior
    decoder output : bernoulli

    When the output is bernoulli then the output from the decoder
    should be sigmoid. The sizes of the inputs are
    z0: (batch_size*eq_samples*iw_samples, num_latent)
    zk: (batch_size*eq_samples*iw_samples, num_latent)
    z0_mu: (batch_size, num_latent)
    z0_log_var: (batch_size, num_latent)
    logdet_J_list: list of `nflows` elements, each with shape (batch_size*eq_samples*iw_samples)
    x_mu: (batch_size*eq_samples*iw_samples, num_features)
    x: (batch_size, num_features)

    Reference: Burda et al. 2015 "Importance Weighted Autoencoders"
    """

    # reshape the variables so batch_size, eq_samples and iw_samples are separate dimensions
    z0 = z0.reshape((-1, eq_samples, iw_samples, latent_size))
    zk = zk.reshape((-1, eq_samples, iw_samples, latent_size))
    x_mu = x_mu.reshape((-1, eq_samples, iw_samples, num_features))

    for i in range(len(logdet_J_list)):
        logdet_J_list[i] = logdet_J_list[i].reshape((-1, eq_samples, iw_samples))

    # dimshuffle x, z_mu and z_log_var since we need to broadcast them when calculating the pdfs
    x = x.dimshuffle(0, 'x', 'x', 1)                    # size: (batch_size, eq_samples, iw_samples, num_features)
    z0_mu = z0_mu.dimshuffle(0, 'x', 'x', 1)            # size: (batch_size, eq_samples, iw_samples, num_latent)
    z0_log_var = z0_log_var.dimshuffle(0, 'x', 'x', 1)  # size: (batch_size, eq_samples, iw_samples, num_latent)

    # calculate LL components, note that the log_xyz() functions return log prob. for independent components separately
    # so we sum over feature/latent dimensions for multivariate pdfs
    log_q0z0_given_x = log_normal2(z0, z0_mu, z0_log_var).sum(axis=3)
    log_pzk = log_stdnormal(zk).sum(axis=3)
    log_px_given_zk = log_bernoulli(x, x_mu, epsilon).sum(axis=3)

    #normalizing flow loss
    sum_logdet_J = 0
    for logdet_J_k in logdet_J_list:
        sum_logdet_J += logdet_J_k

    # Calculate the LL using log-sum-exp to avoid underflow                                       all log_***                                       -> shape: (batch_size, eq_samples, iw_samples)
    LL = log_mean_exp(log_pzk + log_px_given_zk - log_q0z0_given_x + sum_logdet_J, axis=2)      # log-mean-exp over iw_samples dimension            -> shape: (batch_size, eq_samples)
    LL = T.mean(LL)                                                                             # average over eq_samples, batch_size dimensions    -> shape: ()

    return LL, T.mean(log_q0z0_given_x), T.mean(sum_logdet_J), T.mean(log_pzk), T.mean(log_px_given_zk)
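This example (and #11 and #13 below) calls a log_mean_exp helper that is not shown in the snippets. A minimal sketch consistent with how it is used (reduces the given axis, numerically stable), assuming it applies the same max-shift trick the earlier examples spell out explicitly:

import theano.tensor as T

def log_mean_exp(a, axis):
    # numerically stable log(mean(exp(a), axis)): shift by the max along `axis`
    # before exponentiating, then add it back after the log
    a_max = T.max(a, axis=axis, keepdims=True)
    return T.log(T.mean(T.exp(a - a_max), axis=axis)) + T.max(a, axis=axis)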
Example #7
def latent_gaussian_x_bernoulli(z, z_mu, z_log_var, x, x_mu, analytic_kl_term):
    """
    Latent z       : gaussian with standard normal prior
    decoder output : bernoulli

    When the output is bernoulli then the output from the decoder
    should be sigmoid.
    """
    if analytic_kl_term:
        kl_term = kl_normal2_stdnormal(z_mu, z_log_var).sum(axis=1)
        log_px_given_z = log_bernoulli(x, x_mu).sum(axis=1)
        LL = T.mean(-kl_term + log_px_given_z)
    else:
        log_qz_given_x = log_normal2(z, z_mu, z_log_var).sum(axis=1)
        log_pz = log_stdnormal(z).sum(axis=1)
        log_px_given_z = log_bernoulli(x, x_mu).sum(axis=1)
        LL = T.mean(log_pz + log_px_given_z - log_qz_given_x)
    return LL
Example #8
def lowerbound_for_reinforce(z,
                             z_mu,
                             z_log_var,
                             x_mu,
                             x,
                             num_features,
                             num_labelled,
                             num_classes,
                             epsilon=1e-6):
    x = x.reshape((-1, num_features))
    x_mu = x_mu.reshape((-1, num_features))

    log_qz_given_xy = log_normal2(z, z_mu, z_log_var).sum(axis=1)
    log_pz = log_stdnormal(z).sum(axis=1)
    log_py = T.log(1.0 / num_classes)
    log_px_given_zy = log_bernoulli(x, T.clip(x_mu, epsilon,
                                              1 - epsilon)).sum(axis=1)
    ll_xy = log_px_given_zy + log_pz + log_py - log_qz_given_xy
    return ll_xy[num_labelled:]
Example #9
def latent_gaussian_x_bernoulli(z, z_mu, z_log_var, x_mu, x, analytic_kl_term):
    """
    Latent z       : gaussian with standard normal prior
    decoder output : bernoulli

    When the output is bernoulli then the output from the decoder
    should be sigmoid. The sizes of the inputs are
    z: (batch_size, num_latent)
    z_mu: (batch_size, num_latent)
    z_log_var: (batch_size, num_latent)
    x_mu: (batch_size, num_features)
    x: (batch_size, num_features)
    """
    if analytic_kl_term:
        kl_term = kl_normal2_stdnormal(z_mu, z_log_var).sum(axis=1)
        log_px_given_z = log_bernoulli(x, x_mu).sum(axis=1)
        LL = T.mean(-kl_term + log_px_given_z)
    else:
        log_qz_given_x = log_normal2(z, z_mu, z_log_var).sum(axis=1)
        log_pz = log_stdnormal(z).sum(axis=1)
        log_px_given_z = log_bernoulli(x, x_mu).sum(axis=1)
        LL = T.mean(log_pz + log_px_given_z - log_qz_given_x)
    return LL
Example #10
def latent_gaussian_x_bernoulli(z, z_mu, z_log_var, x_mu, x, analytic_kl_term):
    """
    Latent z       : gaussian with standard normal prior
    decoder output : bernoulli

    When the output is bernoulli then the output from the decoder
    should be sigmoid. The sizes of the inputs are
    z: (batch_size, num_latent)
    z_mu: (batch_size, num_latent)
    z_log_var: (batch_size, num_latent)
    x_mu: (batch_size, num_features)
    x: (batch_size, num_features)
    """
    if analytic_kl_term:
        kl_term = kl_normal2_stdnormal(z_mu, z_log_var).sum(axis=1)
        log_px_given_z = log_bernoulli(x, x_mu, eps=1e-6).sum(axis=1)
        LL = T.mean(-kl_term + log_px_given_z)
    else:
        log_qz_given_x = log_normal2(z, z_mu, z_log_var).sum(axis=1)
        log_pz = log_stdnormal(z).sum(axis=1)
        log_px_given_z = log_bernoulli(x, x_mu, eps=1e-6).sum(axis=1)
        LL = T.mean(log_pz + log_px_given_z - log_qz_given_x)
    return LL
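The two branches compute the same bound: the analytic_kl_term path uses the closed-form KL, while the other path estimates E_q[log q(z|x) - log p(z)] from the sampled z. A small NumPy check (illustrative only, not from the original code) that the sampled estimate agrees with the closed form on average:

import numpy as np

rng = np.random.RandomState(0)
mu, log_var = 0.7, -0.3

# analytic KL( N(mu, exp(log_var)) || N(0, 1) ) for one dimension
kl_analytic = -0.5 * (1 + log_var - mu ** 2 - np.exp(log_var))

# Monte Carlo estimate: E_q[ log q(z) - log p(z) ] with z ~ N(mu, exp(log_var))
z = mu + np.exp(0.5 * log_var) * rng.randn(100000)
log_q = -0.5 * np.log(2 * np.pi) - 0.5 * log_var - (z - mu) ** 2 / (2 * np.exp(log_var))
log_p = -0.5 * np.log(2 * np.pi) - z ** 2 / 2
kl_mc = np.mean(log_q - log_p)

print(kl_analytic, kl_mc)  # the two agree up to Monte Carlo error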
Example #11
def latent_gaussian_x_bernoulli(z, z_mu, z_log_var, x_mu, x, eq_samples, iw_samples, epsilon=1e-6):
    """
    Latent z       : gaussian with standard normal prior
    decoder output : bernoulli

    When the output is bernoulli then the output from the decoder
    should be sigmoid. The sizes of the inputs are
    z: (batch_size*eq_samples*iw_samples, num_latent)
    z_mu: (batch_size, num_latent)
    z_log_var: (batch_size, num_latent)
    x_mu: (batch_size*eq_samples*iw_samples, num_features)
    x: (batch_size, num_features)

    Reference: Burda et al. 2015 "Importance Weighted Autoencoders"
    """

    # reshape the variables so batch_size, eq_samples and iw_samples are separate dimensions
    z = z.reshape((-1, eq_samples, iw_samples, latent_size))
    x_mu = x_mu.reshape((-1, eq_samples, iw_samples, num_features))

    # dimshuffle x, z_mu and z_log_var since we need to broadcast them when calculating the pdfs
    x = x.dimshuffle(0, 'x', 'x', 1)                    # size: (batch_size, eq_samples, iw_samples, num_features)
    z_mu = z_mu.dimshuffle(0, 'x', 'x', 1)              # size: (batch_size, eq_samples, iw_samples, num_latent)
    z_log_var = z_log_var.dimshuffle(0, 'x', 'x', 1)    # size: (batch_size, eq_samples, iw_samples, num_latent)

    # calculate LL components, note that the log_xyz() functions return log prob. for independent components separately
    # so we sum over feature/latent dimensions for multivariate pdfs
    log_qz_given_x = log_normal2(z, z_mu, z_log_var).sum(axis=3)
    log_pz = log_stdnormal(z).sum(axis=3)
    log_px_given_z = log_bernoulli(x, x_mu, epsilon).sum(axis=3)

    # Calculate the LL using log-sum-exp to avoid underflow                   all log_***                                       -> shape: (batch_size, eq_samples, iw_samples)
    LL = log_mean_exp(log_pz + log_px_given_z - log_qz_given_x, axis=2)     # log-mean-exp over iw_samples dimension            -> shape: (batch_size, eq_samples)
    LL = T.mean(LL)                                                         # average over eq_samples, batch_size dimensions    -> shape: ()

    return LL, T.mean(log_qz_given_x), T.mean(log_pz), T.mean(log_px_given_z)
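The flattened shapes in the docstring assume the caller draws eq_samples*iw_samples reparameterized posterior samples per datapoint and folds them into the leading dimension. An illustrative NumPy shape check (hypothetical numbers, not from the original code):

import numpy as np

batch_size, eq_samples, iw_samples, latent_size = 2, 3, 4, 5
z_mu = np.zeros((batch_size, latent_size))
z_log_var = np.zeros((batch_size, latent_size))

# reparameterization: one noise draw per (datapoint, eq_sample, iw_sample)
eps = np.random.randn(batch_size, eq_samples, iw_samples, latent_size)
z = z_mu[:, None, None, :] + np.exp(0.5 * z_log_var[:, None, None, :]) * eps

# flatten to the (batch_size*eq_samples*iw_samples, num_latent) layout the function expects
z = z.reshape(batch_size * eq_samples * iw_samples, latent_size)
print(z.shape)  # (24, 5)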
Example #12
def latent_gaussian_x_bernoulli(z, z_mu_q, z_logvar_q, z_mu_p, z_logvar_p, x_mu, x, eq_samples, iw_samples, latent_sizes, num_features, epsilon=1e-6,reverse_z=False,clip_val=None, temp=None, epoch=None):
    """
    Latent z       : gaussian with standard normal prior
    decoder output : bernoulli

    When the output is bernoulli then the output from the decoder
    should be sigmoid.
    z: list of per-layer latent samples, each (batch_size*eq_samples*iw_samples, latent_size)
    x_mu: (batch_size*eq_samples*iw_samples, num_features)
    """
    if reverse_z:
        #for ladder like VAE where x->z3->z2->z1
        z = [z_.reshape((-1, eq_samples, iw_samples, ls)) for z_,ls in zip(z,latent_sizes)]
        z_mu_q = [z_.reshape((-1, eq_samples, iw_samples, ls)) for z_,ls in zip(z_mu_q[:-1],latent_sizes[:-1])] + [z_mu_q[-1].dimshuffle((0,'x','x',1))]
        z_logvar_q = [z_.reshape((-1, eq_samples, iw_samples, ls)) for z_,ls in zip(z_logvar_q[:-1],latent_sizes[:-1])] + [z_logvar_q[-1].dimshuffle((0,'x','x',1))]
    else:
        #for normal VAE where x->z1->z2->z3
        z = [z_.reshape((-1, eq_samples, iw_samples, ls)) for z_,ls in zip(z,latent_sizes)]
        z_mu_q = [z_mu_q[0].dimshuffle((0,'x','x',1))] + [z_.reshape((-1, eq_samples, iw_samples, ls)) for z_,ls in zip(z_mu_q[1:],latent_sizes[1:])]
        z_logvar_q =  [z_logvar_q[0].dimshuffle((0,'x','x',1))] + [z_.reshape((-1, eq_samples, iw_samples, ls)) for z_,ls in zip(z_logvar_q[1:],latent_sizes[1:])]

    z_mu_p = [z_.reshape((-1, eq_samples, iw_samples, ls)) for z_,ls in zip(z_mu_p,latent_sizes[:-1])]
    z_logvar_p = [z_.reshape((-1, eq_samples, iw_samples, ls)) for z_,ls in zip(z_logvar_p,latent_sizes[:-1])]

    x_mu = x_mu.reshape((-1, eq_samples, iw_samples,  num_features))
    x = x.dimshuffle((0,'x','x',1))

    log_pz =  [log_normal2(z_, mu_, logvar_) for z_, mu_, logvar_ in zip(z[:-1],z_mu_p,z_logvar_p)] + [log_stdnormal(z[-1])]
    log_qz = [log_normal2(z_, mu_, logvar_) for z_, mu_, logvar_ in zip(z, z_mu_q,z_logvar_q)]
    log_px = -T.nnet.binary_crossentropy(T.clip(x_mu,epsilon,1-epsilon), x)

    if clip_val is not None:
        log_pz = [T.clip(lp,clip_val,0) for lp in log_pz]
        log_qz = [T.clip(lp,clip_val,0) for lp in log_qz]

    #all log_*** should have dimension (batch_size, eq_samples, iw_samples)


    nlayers = len(log_qz)

    if temp is None:
        temp = [1.0 for _ in range(nlayers)]
    else:
        temp_step = (nlayers+1)*temp/float(100)
        temp = [T.maximum((i+1)*temp-epoch*temp_step, 0.0) for i in range(nlayers)]

    a = log_px.sum(axis=3) + sum([p.sum(axis=3)*t for p,t in zip(log_pz,temp)]) - sum([p.sum(axis=3) for p in log_qz])
    a_max = T.max(a, axis=2, keepdims=True) #(batch_size, eq_samples, 1)
    #It is important that a_max is inside the mean since it is sample specific

    # T.exp(a-a_max): (batch_size, eq_samples, iw_samples)
    # -> subtract a_max to avoid overflow. a_max is specific to each set of
    # importance samples and is broadcasted over the last dimension.
    #
    # T.log( T.mean(T.exp(a-a_max), axis=2) ): (batch_size, eq_samples)
    # -> This is the log of the mean over the importance weighted samples
    #
    # T.mean(a_max) is added to correct for the log-sum-exp trick, giving the
    # LL estimator of Eq (8) in Burda et al. 2015, where iw_samples estimates the
    # inner expectation; the result is then averaged over eq_samples and batch_size.
    lower_bound = T.mean( a_max) + T.mean( T.log( T.mean(T.exp(a-a_max), axis=2)))
    return lower_bound, log_px, log_pz, log_qz
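The temp/epoch branch above builds a per-layer temperature schedule that weights each layer's log p(z) term and anneals it away over training. The arithmetic of that schedule, written out in plain Python for illustration (layer_temperatures is a hypothetical name; the original works on Theano expressions):

def layer_temperatures(nlayers, temp, epoch):
    # layer i starts at (i+1)*temp and decays linearly, hitting zero
    # at epoch (i+1)*100/(nlayers+1), so later entries stay non-zero longer
    temp_step = (nlayers + 1) * temp / 100.0
    return [max((i + 1) * temp - epoch * temp_step, 0.0) for i in range(nlayers)]

print(layer_temperatures(3, temp=1.0, epoch=0))    # [1.0, 2.0, 3.0]
print(layer_temperatures(3, temp=1.0, epoch=50))   # [0.0, 0.0, 1.0] (up to float rounding)
print(layer_temperatures(3, temp=1.0, epoch=100))  # [0.0, 0.0, 0.0]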
Example #13
def latent_gaussian_x_bernoulli(z0,
                                zk,
                                z0_mu,
                                z0_log_var,
                                logdet_J_list,
                                x_mu,
                                x,
                                eq_samples,
                                iw_samples,
                                epsilon=1e-6):
    """
    Latent z       : gaussian with standard normal prior
    decoder output : bernoulli

    When the output is bernoulli then the output from the decoder
    should be sigmoid. The sizes of the inputs are
    z0: (batch_size*eq_samples*iw_samples, num_latent)
    zk: (batch_size*eq_samples*iw_samples, num_latent)
    z0_mu: (batch_size, num_latent)
    z0_log_var: (batch_size, num_latent)
    logdet_J_list: list of `nflows` elements, each with shape (batch_size*eq_samples*iw_samples)
    x_mu: (batch_size*eq_samples*iw_samples, num_features)
    x: (batch_size, num_features)

    Reference: Burda et al. 2015 "Importance Weighted Autoencoders"
    """

    # reshape the variables so batch_size, eq_samples and iw_samples are separate dimensions
    z0 = z0.reshape((-1, eq_samples, iw_samples, latent_size))
    zk = zk.reshape((-1, eq_samples, iw_samples, latent_size))
    x_mu = x_mu.reshape((-1, eq_samples, iw_samples, num_features))

    for i in range(len(logdet_J_list)):
        logdet_J_list[i] = logdet_J_list[i].reshape(
            (-1, eq_samples, iw_samples))

    # dimshuffle x, z_mu and z_log_var since we need to broadcast them when calculating the pdfs
    x = x.dimshuffle(0, 'x', 'x', 1)                    # size: (batch_size, eq_samples, iw_samples, num_features)
    z0_mu = z0_mu.dimshuffle(0, 'x', 'x', 1)            # size: (batch_size, eq_samples, iw_samples, num_latent)
    z0_log_var = z0_log_var.dimshuffle(0, 'x', 'x', 1)  # size: (batch_size, eq_samples, iw_samples, num_latent)

    # calculate LL components, note that the log_xyz() functions return log prob. for independent components separately
    # so we sum over feature/latent dimensions for multivariate pdfs
    log_q0z0_given_x = log_normal2(z0, z0_mu, z0_log_var).sum(axis=3)
    log_pzk = log_stdnormal(zk).sum(axis=3)
    log_px_given_zk = log_bernoulli(x, x_mu, epsilon).sum(axis=3)

    #normalizing flow loss
    sum_logdet_J = 0
    for logdet_J_k in logdet_J_list:
        sum_logdet_J += logdet_J_k

    # Calculate the LL using log-sum-exp to avoid underflow                                       all log_***                                       -> shape: (batch_size, eq_samples, iw_samples)
    LL = log_mean_exp(log_pzk + log_px_given_zk - log_q0z0_given_x + sum_logdet_J, axis=2)  # log-mean-exp over iw_samples dimension -> shape: (batch_size, eq_samples)
    LL = T.mean(LL)                                                                         # average over eq_samples, batch_size dimensions -> shape: ()

    return LL, T.mean(log_q0z0_given_x), T.mean(sum_logdet_J), T.mean(log_pzk), T.mean(log_px_given_zk)