Example #1
class Joint:
    '''
    Wrapper to handle calculating the log p(y, w | X) = log [ p(y | X, w) * p(w) ]
    for a given sample of w.
    Should be the same as the slow version but vectorized and therefore faster.
    '''
    def __init__(self, Xtrain, ytrain, sess):
        self.Xtrain = Xtrain
        self.ytrain = ytrain
        self.sess = sess

        self.n_samples = 1000  # TODO this is hard coded and must be matched in elbo and fc.
        N, D = Xtrain.shape
        self.w = tf.placeholder(tf.float32, [D, self.n_samples])
        self.X = tf.placeholder(tf.float32, [N, D])
        #self.y = Bernoulli(logits=ed.dot(self.X, self.w))
        self.y = Bernoulli(logits=tf.matmul(self.X, self.w))
        self.prior = Normal(loc=tf.zeros([self.n_samples, D]),
                            scale=1.0 *
                            tf.ones([self.n_samples, D]))  # TODO hard coded

    def log_prob(self, samples):
        """Return log p(y, w | X) for each row of the (n_samples, D) array `samples`."""
        # Repeat y across the sample dimension: (N,) -> (N, n_samples).
        copied_ytrain = np.repeat(self.ytrain[:, np.newaxis],
                                  self.n_samples,
                                  axis=1)
        per_sample = self.sess.run(self.y.log_prob(copied_ytrain),
                                   feed_dict={
                                       self.X: self.Xtrain,
                                       self.w: samples.T
                                   }).astype(np.float32)
        lik = np.sum(per_sample, axis=0)
        prior = np.sum(self.prior.log_prob(samples).eval(), axis=1)
        return lik + prior
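
A minimal usage sketch for the class above, assuming synthetic data and a (1000, D) array of weight draws to match the hard-coded `self.n_samples`:

import numpy as np
import tensorflow as tf
from edward.models import Bernoulli, Normal  # assumed imports for the class above

N, D = 50, 3
rng = np.random.RandomState(0)
Xtrain = rng.randn(N, D).astype(np.float32)
ytrain = rng.randint(0, 2, size=N).astype(np.float32)
w_samples = rng.randn(1000, D).astype(np.float32)  # 1000 must match self.n_samples

sess = tf.InteractiveSession()
joint = Joint(Xtrain, ytrain, sess)
log_p = joint.log_prob(w_samples)  # numpy array of shape (1000,): log p(y, w | X) per sample
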
class Joint:
    '''Wrapper to handle joint probability p(UV, R_train)
    
        log p(UV, R_train) = log [ p(R_train | UV) * p(UV) ]
    '''
    def __init__(self, R_true, I_train, sess, D, N, M):
        """
        Args:
            R_true: full matrix
            I_train: training mask
        """
        self.n_samples = FLAGS.n_monte_carlo_samples
        self.R = tf.constant(R_true, dtype=tf.float32)
        self.I = tf.constant(I_train, dtype=tf.float32)
        self.D = D
        self.N = N
        self.M = M
        scale_uv = tf.concat([tf.ones([D, N]), tf.ones([D, M])], axis=1)
        mean_uv = tf.concat([tf.zeros([D, N]), tf.zeros([D, M])], axis=1)

        self.prior_UV = Normal(loc=mean_uv, scale=scale_uv)  # (D, N + M)

    def log_lik(self, sample_uv):
        """
        Args:
            sample_uv: single (D, (N + M)) samples from qUV
        Returns:
            tensor scalar of log likelihood
        """
        # constructed matrix dist. R ~ N(U'V, 1)
        pR = Normal(loc=tf.matmul(tf.transpose(sample_uv[:, :self.N]),
                                  sample_uv[:, self.N:]),
                    scale=tf.ones([self.N, self.M]))  # dist (N, M)
        full_log_likelihood = pR.log_prob(self.R)  # (N, M)
        train_log_likelihood = full_log_likelihood * self.I  # (N, M), masked to training entries
        log_lik = tf.reduce_sum(train_log_likelihood)  # ()
        return log_lik

    def log_prob(self, sample_uv):
        """
        Args:
            sample_uv: single (D, (N + M)) samples from qUV
        Returns:
            tensor scalar of log_prob
        """
        prior_batch = self.prior_UV.log_prob(sample_uv)  # (D, N + M)
        prior = tf.reduce_sum(prior_batch)
        ll = self.log_lik(sample_uv)
        p_joint = prior + ll
        return p_joint

    def log_prob_batch(self, samples):
        """
            samples: (n_samples, D, N + M) tensor
        """
        raise NotImplementedError('what to do here? just run in a loop?')
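
One possible way to resolve the TODO in `log_prob_batch`, sketched as a free function with `tf.map_fn` instead of a Python loop (an assumption, not necessarily how the original resolves it):

import tensorflow as tf

def log_prob_batch(joint, samples):
    """Sketch: `samples` is an (n_samples, D, N + M) float32 tensor; apply
    `joint.log_prob` to each (D, N + M) slice and stack the scalar results
    into an (n_samples,) tensor."""
    return tf.map_fn(joint.log_prob, samples, dtype=tf.float32)
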
class Joint:
    '''Wrapper to handle calculating the joint probability of data

    log p(y, w | X) = log [ p(y | X, w) * p(w) ]
    '''
    def __init__(self, X, y, sess, n_samples, logger=None):
        """Initialize the distribution.

            Constructs the graph for evaluation of joint probabilities
            of data X and weights (latent vars) w
        
            Args:
                X:  [N x D] data
                y:  [N] binary target variable
                sess: tensorflow session
                n_samples: number of monte carlo samples to compute expectation
        """
        self.sess = sess
        self.n_samples = n_samples
        # (N, ) -> (N, n_samples)
        # np.tile(y[:, np.newaxis], (1, self.n_samples))
        y_matrix = np.repeat(y[:, np.newaxis], self.n_samples, axis=1)
        if logger is not None: self.logger = logger

        # Define the model graph
        N, D = X.shape
        self.X = tf.convert_to_tensor(X, dtype=tf.float32)
        self.Y = tf.convert_to_tensor(y_matrix, dtype=tf.float32)
        self.W = tf.get_variable('samples', (self.n_samples, D),
                                 tf.float32,
                                 initializer=tf.zeros_initializer())
        # (N, n_samples)
        self.py = Bernoulli(logits=tf.matmul(self.X, tf.transpose(self.W)))
        self.w_prior = Normal(loc=tf.zeros([self.n_samples, D], tf.float32),
                              scale=tf.ones([self.n_samples, D], tf.float32))
        # prior log probability is summed across the D features:
        # [n_samples, D] -> [n_samples]
        self.prior = tf.reduce_sum(self.w_prior.log_prob(self.W), axis=1)
        log_likelihoods = self.py.log_prob(self.Y)  # (N, n_samples)
        self.ll = tf.reduce_sum(log_likelihoods, axis=0)  # (n_samples, )
        self.joint = self.ll + self.prior

    def log_prob(self, samples):
        """Log probability of samples.
        
        X and y are fixed, so the samples passed in (whether drawn from the
        target distribution, from the base distributions of the approximation,
        or individual atoms) are always samples of w.

        Args:
            samples: [self.n_samples x D] tensor
        Returns:
            [self.n_samples, ] joint log probability of samples, X, y
        """
        assert samples.shape[0] == self.n_samples, 'Different number of samples'
        self.sess.run(self.W.assign(samples))
        return self.joint
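
A minimal usage sketch for this class, assuming synthetic data and S weight samples drawn elsewhere:

import numpy as np
import tensorflow as tf

N, D, S = 100, 4, 32
rng = np.random.RandomState(0)
X = rng.randn(N, D).astype(np.float32)
y = rng.randint(0, 2, size=N).astype(np.float32)
w_samples = rng.randn(S, D).astype(np.float32)

sess = tf.InteractiveSession()
joint = Joint(X, y, sess, n_samples=S)
sess.run(tf.global_variables_initializer())  # initializes the 'samples' variable
log_p = sess.run(joint.log_prob(w_samples))  # (S,) joint log-probabilities
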
Example #4
def _test(mu, sigma, n):
    rv = Normal(mu=mu, sigma=sigma)
    rv_sample = rv.sample(n)
    x = rv_sample.eval()
    x_tf = tf.constant(x, dtype=tf.float32)
    mu = mu.eval()
    sigma = sigma.eval()
    assert np.allclose(
        rv.log_prob(x_tf).eval(), stats.norm.logpdf(x, mu, sigma))
Example #5
def _test(mu, sigma, n):
  rv = Normal(mu=mu, sigma=sigma)
  rv_sample = rv.sample(n)
  x = rv_sample.eval()
  x_tf = tf.constant(x, dtype=tf.float32)
  mu = mu.eval()
  sigma = sigma.eval()
  assert np.allclose(rv.log_prob(x_tf).eval(),
                     stats.norm.logpdf(x, mu, sigma))
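
A possible invocation of the test above, assuming the old Edward API shown here, `from scipy import stats`, and TF 1.x (the constants are illustrative):

import tensorflow as tf

with tf.Session():
    _test(tf.constant(0.0), tf.constant(1.0), [5])
    _test(tf.constant([-1.0, 2.0]), tf.constant([0.5, 3.0]), [10])
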
Example #6
    def clustering(self, x_data):
        mu_sample = self.qmu.sample(100)
        sigmasq_sample = self.qsigmasq.sample(100)
        x_post = Normal(loc=tf.ones([self.N, 1, 1, 1]) * mu_sample,
                        scale=tf.ones([self.N, 1, 1, 1]) *
                        tf.sqrt(sigmasq_sample))
        x_broadcasted = tf.tile(tf.reshape(x_data, [self.N, 1, 1, self.D]),
                                [1, 100, self.K, 1])

        log_liks = x_post.log_prob(x_broadcasted)
        log_liks = tf.reduce_sum(log_liks, 3)
        log_liks = tf.reduce_mean(log_liks, 1)

        self.clusters = tf.argmax(log_liks, 1).eval()
init.run()

for _ in range(inference.n_iter):
    info_dict = inference.update()
    inference.print_progress(info_dict)
    t = info_dict['t']
    if t % inference.n_print == 0:
        print("Inferred cluster means:")
        print(sess.run(qmu.mean()))

# Calculate likelihood for each data point and cluster assignment,
# averaged over many posterior samples. ``x_post`` has shape (N, 100, K, D).
mu_sample = qmu.sample(100)
sigma_sample = qsigma.sample(100)
x_post = Normal(mu=tf.ones([N, 1, 1, 1]) * mu_sample,
                sigma=tf.ones([N, 1, 1, 1]) * sigma_sample)
x_broadcasted = tf.tile(tf.reshape(x_train, [N, 1, 1, D]), [1, 100, K, 1])

# Sum over latent dimension, then average over posterior samples.
# ``log_liks`` ends up with shape (N, K).
log_liks = x_post.log_prob(x_broadcasted)
log_liks = tf.reduce_sum(log_liks, 3)
log_liks = tf.reduce_mean(log_liks, 1)

# Choose the cluster with the highest likelihood for each data point.
clusters = tf.argmax(log_liks, 1).eval()
plt.scatter(x_train[:, 0], x_train[:, 1], c=clusters, cmap=cm.bwr)
plt.axis([-3, 3, -3, 3])
plt.title("Predicted cluster assignments")
plt.show()
Example #9
 data = {x1: x_ph_bin, x2: x_ph_cont, y: y_ph, qt: t_ph, t: t_ph, qy: y_ph}
 # sample posterior predictive for p(y|z,t)
 y_post = ed.copy(y, {z: qz, t: t_ph}, scope='y_post')
 # crude approximation of the above
 y_post_mean = ed.copy(y, {z: qz.mean(), t: t_ph}, scope='y_post_mean')
 # construct a deterministic version (i.e. use the mean of the approximate posterior) of the lower bound
 # for early stopping according to a validation set
 y_post_eval = ed.copy(y, {z: qz.mean(), qt: t_ph, qy: y_ph, t: t_ph}, scope='y_post_eval')
 x1_post_eval = ed.copy(x1, {z: qz.mean(), qt: t_ph, qy: y_ph}, scope='x1_post_eval')
 x2_post_eval = ed.copy(x2, {z: qz.mean(), qt: t_ph, qy: y_ph}, scope='x2_post_eval')
 t_post_eval = ed.copy(t, {z: qz.mean(), qt: t_ph, qy: y_ph}, scope='t_post_eval')
 # losses
 logp_valid = tf.reduce_mean(tf.reduce_sum(y_post_eval.log_prob(y_ph) + t_post_eval.log_prob(t_ph), axis=1) +
                             tf.reduce_sum(x1_post_eval.log_prob(x_ph_bin), axis=1) +
                             tf.reduce_sum(x2_post_eval.log_prob(x_ph_cont), axis=1) +
                             tf.reduce_sum(z.log_prob(qz.mean()) - qz.log_prob(qz.mean()), axis=1))
 inference = ed.KLqp({z: qz}, data)
 optimizer = tf.train.AdamOptimizer(learning_rate=lr)
 inference.initialize(optimizer=optimizer)
 # saver and initializer before experiment
 saver = tf.train.Saver(tf.contrib.slim.get_variables())
 tf.global_variables_initializer().run()
 # Load existing model
 if load_model:
     print("Load model from: {}".format(load_model + '/{}-{}'.format(task, i)))
     saver.restore(sess, load_model + '/{}-{}'.format(task, i))
 n_epoch, n_iter_per_epoch, idx = epochs, max(10 * int(xtr.shape[0] / batch_size), 1), np.arange(xtr.shape[0])
 # dictionaries needed for evaluation
 tr0, tr1 = np.zeros((xalltr.shape[0], 1)), np.ones((xalltr.shape[0], 1))
 tr0t, tr1t = np.zeros((xte.shape[0], 1)), np.ones((xte.shape[0], 1))
 f1 = {x_ph_bin: xalltr[:, 0:len(binfeats)], x_ph_cont: xalltr[:, len(binfeats):], t_ph: tr1}
Example #10
        yi_post_eval = ed.copy(yi, {zi: qzi.mean(), qti: ti_ph, qyi: yi_ph, ti: ti_ph}, scope='yi_post_eval')
        yj_post_eval = ed.copy(yj, {zj: qzj.mean(), qtj: tj_ph, qyj: yj_ph, tj: tj_ph}, scope='yj_post_eval')

        xi1_post_eval = ed.copy(xi1, {zi: qzi.mean(), qti: ti_ph, qyi: yi_ph}, scope='xi1_post_eval')
        xi2_post_eval = ed.copy(xi2, {zi: qzi.mean(), qti: ti_ph, qyi: yi_ph}, scope='xi2_post_eval')

        xj1_post_eval = ed.copy(xj1, {zj: qzj.mean(), qtj: tj_ph, qyj: yj_ph}, scope='xj1_post_eval')
        xj2_post_eval = ed.copy(xj2, {zj: qzj.mean(), qtj: tj_ph, qyj: yj_ph}, scope='xj2_post_eval')
        
        ti_post_eval = ed.copy(ti, {zi: qzi.mean(), qti: ti_ph, qyi: yi_ph}, scope='ti_post_eval')
        tj_post_eval = ed.copy(tj, {zj: qzj.mean(), qtj: tj_ph, qyj: yj_ph}, scope='tj_post_eval')

        logp_valid = tf.reduce_mean(tf.reduce_sum(yi_post_eval.log_prob(yi_ph) + ti_post_eval.log_prob(ti_ph), axis=1) +
                                    tf.reduce_sum(xi1_post_eval.log_prob(xi_ph_bin), axis=1) +
                                    tf.reduce_sum(xi2_post_eval.log_prob(xi_ph_cont), axis=1) +
                                    tf.reduce_sum(zi.log_prob(qzi.mean()) - qzi.log_prob(qzi.mean()), axis=1)
                                    + tf.reduce_sum(yj_post_eval.log_prob(yj_ph) + tj_post_eval.log_prob(tj_ph), axis=1) +
                                    tf.reduce_sum(xj1_post_eval.log_prob(xj_ph_bin), axis=1) +
                                    tf.reduce_sum(xj2_post_eval.log_prob(xj_ph_cont), axis=1) +
                                    tf.reduce_sum(zj.log_prob(qzj.mean()) - qzj.log_prob(qzj.mean()), axis=1))

        #TODO: negative sampling...
        
        # inference = ed.KLqp({zi: qzi, zj: qzj, zi: qzj, zj: qzi}, data)
        # NOTE: `zi` appears twice in this dict literal; the later entry wins,
        # so this is effectively ed.KLqp({zi: qzj, zj: qzj}, data).
        inference = ed.KLqp({zi: qzi, zj: qzj, zi: qzj}, data)
        optimizer = tf.train.AdamOptimizer(learning_rate=args.lr)
        inference.initialize(optimizer=optimizer)

        saver = tf.train.Saver(tf.contrib.slim.get_variables())
        tf.global_variables_initializer().run()
Example #11
def cevae_tf(X, T, Y, n_epochs=100, early_stop = 10, d_cevae=20):

    T, Y = T.reshape((-1,1)), Y.reshape((-1,1))
    args = dict()
    args['earl'] = early_stop
    args['lr'] = 0.001
    args['opt'] = 'adam'
    args['epochs'] = n_epochs
    args['print_every'] = 10
    args['true_post'] = True

    M = None  # batch size during training
    d = d_cevae  # latent dimension
    lamba = 1e-4  # weight decay
    nh, h = 3, 200  # number and size of hidden layers

    contfeats = list(range(X.shape[1])) # all continuous
    binfeats = []
    
    # need for early stopping
    xtr, xva, ttr, tva, ytr, yva = train_test_split(X, T, Y)

    # zero mean, unit variance for y during training
    ym, ys = np.mean(Y), np.std(Y)
    ytr, yva = (ytr - ym) / ys, (yva - ym) / ys
    best_logpvalid = - np.inf

    with tf.Graph().as_default():
        sess = tf.InteractiveSession()

        ed.set_seed(1)
        np.random.seed(1)
        tf.set_random_seed(1)

        # x_ph_bin = tf.placeholder(tf.float32, [M, len(binfeats)], name='x_bin')  # binary inputs
        x_ph_cont = tf.placeholder(tf.float32, [M, len(contfeats)], name='x_cont')  # continuous inputs
        t_ph = tf.placeholder(tf.float32, [M, 1])
        y_ph = tf.placeholder(tf.float32, [M, 1])

        # x_ph = tf.concat([x_ph_bin, x_ph_cont], 1)
        x_ph = x_ph_cont
        activation = tf.nn.elu

        # CEVAE model (decoder)
        # p(z)
        z = Normal(loc=tf.zeros([tf.shape(x_ph)[0], d]), scale=tf.ones([tf.shape(x_ph)[0], d]))

        # p(x|z)
        hx = fc_net(z, (nh - 1) * [h], [], 'px_z_shared', lamba=lamba, activation=activation)
        # logits = fc_net(hx, [h], [[len(binfeats), None]], 'px_z_bin', lamba=lamba, activation=activation)
        # x1 = Bernoulli(logits=logits, dtype=tf.float32, name='bernoulli_px_z')

        mu, sigma = fc_net(hx, [h], [[len(contfeats), None], [len(contfeats), tf.nn.softplus]], 'px_z_cont', lamba=lamba,
                        activation=activation)
        x2 = Normal(loc=mu, scale=sigma, name='gaussian_px_z')

        # p(t|z)
        logits = fc_net(z, [h], [[1, None]], 'pt_z', lamba=lamba, activation=activation)
        t = Bernoulli(logits=logits, dtype=tf.float32)

        # p(y|t,z)
        mu2_t0 = fc_net(z, nh * [h], [[1, None]], 'py_t0z', lamba=lamba, activation=activation)
        mu2_t1 = fc_net(z, nh * [h], [[1, None]], 'py_t1z', lamba=lamba, activation=activation)
        y = Normal(loc=t * mu2_t1 + (1. - t) * mu2_t0, scale=tf.ones_like(mu2_t0))

        # CEVAE variational approximation (encoder)
        # q(t|x)
        logits_t = fc_net(x_ph, [d], [[1, None]], 'qt', lamba=lamba, activation=activation)
        qt = Bernoulli(logits=logits_t, dtype=tf.float32)
        # q(y|x,t)
        hqy = fc_net(x_ph, (nh - 1) * [h], [], 'qy_xt_shared', lamba=lamba, activation=activation)
        mu_qy_t0 = fc_net(hqy, [h], [[1, None]], 'qy_xt0', lamba=lamba, activation=activation)
        mu_qy_t1 = fc_net(hqy, [h], [[1, None]], 'qy_xt1', lamba=lamba, activation=activation)
        qy = Normal(loc=qt * mu_qy_t1 + (1. - qt) * mu_qy_t0, scale=tf.ones_like(mu_qy_t0))
        # q(z|x,t,y)
        inpt2 = tf.concat([x_ph, qy], 1)
        hqz = fc_net(inpt2, (nh - 1) * [h], [], 'qz_xty_shared', lamba=lamba, activation=activation)
        muq_t0, sigmaq_t0 = fc_net(hqz, [h], [[d, None], [d, tf.nn.softplus]], 'qz_xt0', lamba=lamba,
                                activation=activation)
        muq_t1, sigmaq_t1 = fc_net(hqz, [h], [[d, None], [d, tf.nn.softplus]], 'qz_xt1', lamba=lamba,
                                activation=activation)
        qz = Normal(loc=qt * muq_t1 + (1. - qt) * muq_t0, scale=qt * sigmaq_t1 + (1. - qt) * sigmaq_t0)

        # Create data dictionary for edward
        data = {x2: x_ph_cont, y: y_ph, qt: t_ph, t: t_ph, qy: y_ph}

        # sample posterior predictive for p(y|z,t)
        y_post = ed.copy(y, {z: qz, t: t_ph}, scope='y_post')
        # crude approximation of the above
        y_post_mean = ed.copy(y, {z: qz.mean(), t: t_ph}, scope='y_post_mean')
        # construct a deterministic version (i.e. use the mean of the approximate posterior) of the lower bound
        # for early stopping according to a validation set
        y_post_eval = ed.copy(y, {z: qz.mean(), qt: t_ph, qy: y_ph, t: t_ph}, scope='y_post_eval')
        # x1_post_eval = ed.copy(x1, {z: qz.mean(), qt: t_ph, qy: y_ph}, scope='x1_post_eval')
        x2_post_eval = ed.copy(x2, {z: qz.mean(), qt: t_ph, qy: y_ph}, scope='x2_post_eval')
        t_post_eval = ed.copy(t, {z: qz.mean(), qt: t_ph, qy: y_ph}, scope='t_post_eval')
        logp_valid = tf.reduce_mean(tf.reduce_sum(y_post_eval.log_prob(y_ph) + t_post_eval.log_prob(t_ph), axis=1) +
                                    tf.reduce_sum(x2_post_eval.log_prob(x_ph_cont), axis=1) +
                                    tf.reduce_sum(z.log_prob(qz.mean()) - qz.log_prob(qz.mean()), axis=1))

        inference = ed.KLqp({z: qz}, data)
        optimizer = tf.train.AdamOptimizer(learning_rate=args['lr'])
        inference.initialize(optimizer=optimizer)

        saver = tf.train.Saver(tf.contrib.slim.get_variables())
        tf.global_variables_initializer().run()

        n_epoch, n_iter_per_epoch, idx = args['epochs'], 10 * int(xtr.shape[0] / 100), np.arange(xtr.shape[0])

        # # dictionaries needed for evaluation
        t0, t1 = np.zeros((X.shape[0], 1)), np.ones((X.shape[0], 1))
        # tr0t, tr1t = np.zeros((xte.shape[0], 1)), np.ones((xte.shape[0], 1))
        f1 = {x_ph_cont: X, t_ph: t1}
        f0 = {x_ph_cont: X, t_ph: t0}
        # f1t = {x_ph_bin: xte[:, 0:len(binfeats)], x_ph_cont: xte[:, len(binfeats):], t_ph: tr1t}
        # f0t = {x_ph_bin: xte[:, 0:len(binfeats)], x_ph_cont: xte[:, len(binfeats):], t_ph: tr0t}

        for epoch in range(n_epoch):
            avg_loss = 0.0

            
            widgets = ["epoch #%d|" % epoch, Percentage(), Bar(), ETA()]
            pbar = ProgressBar(n_iter_per_epoch, widgets=widgets)
            pbar.start()
            np.random.shuffle(idx)
            for j in range(n_iter_per_epoch):
                # print('j', j)
                # pbar.update(j)
                batch = np.random.choice(idx, 100)
                x_train, y_train, t_train = xtr[batch], ytr[batch], ttr[batch]
                info_dict = inference.update(feed_dict={x_ph_cont: x_train,
                                                        t_ph: t_train, y_ph: y_train})
                avg_loss += info_dict['loss']

            avg_loss = avg_loss / n_iter_per_epoch
            avg_loss = avg_loss / 100

            if epoch % args['earl'] == 0 or epoch == (n_epoch - 1):
                logpvalid = sess.run(logp_valid, feed_dict={x_ph_cont: xva,
                                                            t_ph: tva, y_ph: yva})
                if logpvalid >= best_logpvalid:
                    print('Improved validation bound, old: {:0.3f}, new: {:0.3f}'.format(best_logpvalid, logpvalid))
                    best_logpvalid = logpvalid
                    saver.save(sess, 'data/cevae_models/dlvm')


        saver.restore(sess, 'data/cevae_models/dlvm')
        y0, y1 = get_y0_y1(sess, y_post, f0, f1, shape=Y.shape, L=100)
        y0, y1 = y0 * ys + ym, y1 * ys + ym
        
        sess.close()

    return y0.reshape((-1)), y1.reshape((-1))
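
A minimal usage sketch for `cevae_tf`, assuming synthetic covariates, treatments, and outcomes in place of real data, and that the helpers the snippet relies on (fc_net, get_y0_y1, the progressbar widgets) are importable:

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(500, 10)
T = rng.binomial(1, 0.5, size=500)
Y = rng.randn(500) + T

y0_hat, y1_hat = cevae_tf(X, T, Y, n_epochs=20)
ate_hat = np.mean(y1_hat - y0_hat)  # estimated average treatment effect
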
Example #12
            data = {x1: x_ph_bin, x2: x_ph_cont, y: y_ph, qt: t_ph, t: t_ph, qy: y_ph}

            # sample posterior predictive for p(y|z,t)
            y_post = ed.copy(y, {z: qz, t: t_ph}, scope='y_post')
            # crude approximation of the above
            y_post_mean = ed.copy(y, {z: qz.mean(), t: t_ph}, scope='y_post_mean')
            # construct a deterministic version (i.e. use the mean of the approximate posterior) of the lower bound
            # for early stopping according to a validation set
            y_post_eval = ed.copy(y, {z: qz.mean(), qt: t_ph, qy: y_ph, t: t_ph}, scope='y_post_eval')
            x1_post_eval = ed.copy(x1, {z: qz.mean(), qt: t_ph, qy: y_ph}, scope='x1_post_eval')
            x2_post_eval = ed.copy(x2, {z: qz.mean(), qt: t_ph, qy: y_ph}, scope='x2_post_eval')
            t_post_eval = ed.copy(t, {z: qz.mean(), qt: t_ph, qy: y_ph}, scope='t_post_eval')
            logp_valid = tf.reduce_mean(tf.reduce_sum(y_post_eval.log_prob(y_ph) + t_post_eval.log_prob(t_ph), axis=1) +
                                        tf.reduce_sum(x1_post_eval.log_prob(x_ph_bin), axis=1) +
                                        tf.reduce_sum(x2_post_eval.log_prob(x_ph_cont), axis=1) +
                                        tf.reduce_sum(z.log_prob(qz.mean()) - qz.log_prob(qz.mean()), axis=1))

            z_learned = ed.copy(qz, {x1: x_ph_bin, x2: x_ph_cont})  # for matching
            inference = ed.KLqp({z: qz}, data)
        # -------------------------------------------------------------------------------------------------------------
        elif model_type == 'separated':
            # CEVAE model (decoder)

            n_ph = tf.shape(x_ph)[0]  # number of samples fed to placeholders

            latent_dims = (z_t_dim, z_y_dim)
            # prior over latent variables:
            # p(zx) -
            # zx = Normal(loc=tf.zeros([n_ph, z_x_dim]), scale=tf.ones([n_ph, z_x_dim]))
            # p(zt) -
            zt = Normal(loc=tf.zeros([n_ph, z_t_dim]), scale=tf.ones([n_ph, z_t_dim]))
Example #13
    def __init__(self, n, xdim, n_mixtures=5, mc_samples=500):
        # Compute the shape dynamically from placeholders
        self.x_ph = tf.placeholder(tf.float32, [None, xdim])
        self.k = k = n_mixtures
        self.batch_size = n
        self.d = d = xdim
        self.sample_size = tf.placeholder(tf.int32, ())

        # Build the priors over membership probabilities and mixture parameters
        with tf.variable_scope("priors"):
            pi = Dirichlet(tf.ones(k))

            mu = Normal(tf.zeros(d), tf.ones(d), sample_shape=k)
            sigmasq = InverseGamma(tf.ones(d), tf.ones(d), sample_shape=k)

        # Build the conditional mixture model
        with tf.variable_scope("likelihood"):
            x = ParamMixture(pi, {'loc': mu, 'scale_diag': tf.sqrt(sigmasq)},
                             MultivariateNormalDiag,
                             sample_shape=n)
            z = x.cat

        # Build approximate posteriors as Empirical samples
        t = mc_samples
        with tf.variable_scope("posteriors_samples"):
            qpi = Empirical(tf.get_variable(
                "qpi/params", [t, k],
                initializer=tf.constant_initializer(1.0 / k)))
            qmu = Empirical(tf.get_variable(
                "qmu/params", [t, k, d],
                initializer=tf.zeros_initializer()))
            qsigmasq = Empirical(tf.get_variable(
                "qsigmasq/params", [t, k, d],
                initializer=tf.ones_initializer()))
            qz = Empirical(tf.get_variable(
                "qz/params", [t, n],
                initializer=tf.zeros_initializer(),
                dtype=tf.int32))

        # Build inference graph using Gibbs and conditionals
        with tf.variable_scope("inference"):
            self.inference = ed.Gibbs({
                pi: qpi,
                mu: qmu,
                sigmasq: qsigmasq,
                z: qz
            }, data={
                x: self.x_ph
            })
            self.inference.initialize()

        # Build predictive posterior graph by taking samples
        n_samples = self.sample_size
        with tf.variable_scope("posterior"):
            mu_smpl = qmu.sample(n_samples)  # shape: [n_samples, k, d]
            sigmasq_smpl = qsigmasq.sample(n_samples)

            x_post = Normal(
                loc=tf.ones((n, 1, 1, 1)) * mu_smpl,
                scale=tf.ones((n, 1, 1, 1)) * tf.sqrt(sigmasq_smpl)
            )
            # NOTE: x_ph has shape [n, d]
            x_broadcasted = tf.tile(
                tf.reshape(self.x_ph, (n, 1, 1, d)),
                (1, n_samples, k, 1)
            )

            x_ll = x_post.log_prob(x_broadcasted)
            x_ll = tf.reduce_sum(x_ll, axis=3)
            x_ll = tf.reduce_mean(x_ll, axis=1)

        self.sample_t_ph = tf.placeholder(tf.int32, ())
        self.eval_ops = {
            'generative_post': x_post,
            'qmu': qmu,
            'qsigma': qsigmasq,
            'post_running_mu': tf.reduce_mean(
                qmu.params[:self.sample_t_ph],
                axis=0
            ),
            'post_log_prob': x_ll,
        }
Example #14
        # for early stopping according to a validation set
        y_post_eval = ed.copy(y, {
            z: qz.mean(),
            y: y_ph,
            t: t_ph
        },
                              scope='y_post_eval')

        t_post_eval = ed.copy(t, {z: qz.mean(), y: y_ph}, scope='t_post_eval')

        log_valid = tf.reduce_mean(
            tf.reduce_sum(y_post_eval.log_prob(y_ph) +
                          t_post_eval.log_prob(t_ph),
                          axis=1) +
            tf.reduce_sum(z.log_prob(qz.mean()) - qz.log_prob(qz.mean()),
                          axis=1))

        tf.global_variables_initializer().run()

        # Information bottleneck control parameter
        BETA = 16671.79  #257.83 #2753.05 #9268.75 #4806.3 #16671.79

        # Latent Loss
        info_loss = tf.reduce_sum(tf.contrib.distributions.kl_divergence(
            qz, z))

        # Log-Likelihood
        class_loss = -BETA * tf.reduce_sum(
            y_post.log_prob(y_ph) + t_post.log_prob(t_ph), axis=1)
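
The snippet stops before the two losses are combined; one conventional way to combine them, assuming `class_loss` already carries the `BETA` weight (not necessarily what the original goes on to do), is:

# Assumption: the combined information-bottleneck style objective is simply
# the mean per-example class loss plus the latent KL term.
total_loss = tf.reduce_mean(class_loss) + info_loss
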
Example #15
Gibbs_inference_elapsedTime = time.time() - Gibbs_inference_startTime
posterior_mu = qmu.params.eval().mean(axis=0)

# Calculate likelihood for each data point and cluster assignment,
# averaged over many posterior samples. ``x_post`` has shape (N, 100, K, D).
print("Sampling from Posterior...")
mu_sample = qmu.sample(M)
sigmasq_sample = qsigma.sample(M)
pi_sample = qpi.sample(M)
x_post = Normal(loc=tf.ones([N, 1, 1, 1]) * mu_sample,
                scale=tf.ones([N, 1, 1, 1]) * tf.sqrt(sigmasq_sample))
x_broadcasted = tf.tile(tf.reshape(train_img, [N, 1, 1, D]), [1, M, K, 1])
x_broadcasted = tf.cast(x_broadcasted, dtype=tf.float32)
# Sum over latent dimension, then average over posterior samples.
# ``log_liks`` ends up with shape (N, K).
log_liks = tf.reduce_mean(tf.reduce_sum(x_post.log_prob(x_broadcasted), 3), 1)

print("Calculating Cluster Assignment...")
clusters = tf.argmax(log_liks, 1).eval()

result_img_dirs = '../tmp/img_result/{}'.format(current_time)
os.makedirs(result_img_dirs)
plt.hist(clusters)
plt.savefig(
    '../tmp/img_result/{}/cluster_dist_img={}_K={}_T={}_Time={}.png'.format(
        current_time, img_no, K, T, current_time))
result_cluster_assign_dirs = '../tmp/log/cluster_assign_matrix'
if not os.path.isdir(result_cluster_assign_dirs):
    os.makedirs(result_cluster_assign_dirs)
np.save(
    result_cluster_assign_dirs +
Example #16
    info_dict = inference.update()
    inference.print_progress(info_dict)
    t = info_dict['t']
    if t % inference.n_print == 0:
        print("Inferred cluster means:")
        print(sess.run(qmu.value()))

# Average per-cluster and per-data point likelihood over many posterior samples.
log_liks = []
for _ in range(100):
    mu_sample = qmu.sample()
    sigma_sample = qsigma.sample()
    # Take per-cluster and per-data point likelihood.
    log_lik = []
    for k in range(K):
        x_post = Normal(mu=tf.ones([N, 1]) * tf.gather(mu_sample, k),
                        sigma=tf.ones([N, 1]) * tf.gather(sigma_sample, k))
        log_lik.append(tf.reduce_sum(x_post.log_prob(x_train), 1))

    log_lik = tf.pack(log_lik)  # has shape (K, N)
    log_liks.append(log_lik)

log_liks = tf.reduce_mean(log_liks, 0)

# Choose the cluster with the highest likelihood for each data point.
clusters = tf.argmax(log_liks, 0).eval()
plt.scatter(x_train[:, 0], x_train[:, 1], c=clusters, cmap=cm.bwr)
plt.axis([-3, 3, -3, 3])
plt.title("Predicted cluster assignments")
plt.show()
Example #17
    def __init__(self, d, K, sig, sess, logdir):
        self.K = K
        self.sig = sig
        self.sess = sess
        self.logdir = logdir

        with tf.name_scope('model'):
            # Data Placeholder
            with tf.name_scope('input'):
                self.placeholders = tf.placeholder(tf.int32)
                self.words = self.placeholders

            # Index Masks
            with tf.name_scope('context_mask'):
                self.p_mask = tf.cast(
                    tf.range(d.cs / 2, d.n_minibatch + d.cs / 2), tf.int32)
                rows = tf.cast(
                    tf.tile(tf.expand_dims(tf.range(0, d.cs / 2), [0]),
                            [d.n_minibatch, 1]), tf.int32)
                columns = tf.cast(
                    tf.tile(tf.expand_dims(tf.range(0, d.n_minibatch), [1]),
                            [1, d.cs / 2]), tf.int32)
                self.ctx_mask = tf.concat(
                    [rows + columns, rows + columns + d.cs / 2 + 1], 1)

            with tf.name_scope('embeddings'):
                # Embedding vectors
                self.rho = tf.Variable(tf.random_normal([d.L, self.K]) /
                                       self.K,
                                       name='rho')

                # Context vectors
                self.alpha = tf.Variable(tf.random_normal([d.L, self.K]) /
                                         self.K,
                                         name='alpha')

                with tf.name_scope('priors'):
                    prior = Normal(loc=0.0, scale=self.sig)
                    self.log_prior = tf.reduce_sum(
                        prior.log_prob(self.rho) + prior.log_prob(self.alpha))

            with tf.name_scope('natural_param'):
                # Target and Context Indices
                with tf.name_scope('target_word'):
                    self.p_idx = tf.gather(self.words, self.p_mask)
                    self.p_rho = tf.squeeze(tf.gather(self.rho, self.p_idx))

                # Negative samples
                with tf.name_scope('negative_samples'):
                    unigram_logits = tf.tile(
                        tf.expand_dims(tf.log(tf.constant(d.unigram)), [0]),
                        [d.n_minibatch, 1])
                    self.n_idx = tf.multinomial(unigram_logits, d.ns)
                    self.n_rho = tf.gather(self.rho, self.n_idx)

                with tf.name_scope('context'):
                    self.ctx_idx = tf.squeeze(
                        tf.gather(self.words, self.ctx_mask))
                    self.ctx_alphas = tf.gather(self.alpha, self.ctx_idx)

                # Natural parameter
                ctx_sum = tf.reduce_sum(self.ctx_alphas, [1])
                self.p_eta = tf.expand_dims(
                    tf.reduce_sum(tf.multiply(self.p_rho, ctx_sum), -1), 1)
                self.n_eta = tf.reduce_sum(
                    tf.multiply(
                        self.n_rho,
                        tf.tile(tf.expand_dims(ctx_sum, 1), [1, d.ns, 1])), -1)

            # Conditional likelihood
            self.y_pos = Bernoulli(logits=self.p_eta)
            self.y_neg = Bernoulli(logits=self.n_eta)

            self.ll_pos = tf.reduce_sum(self.y_pos.log_prob(1.0))
            self.ll_neg = tf.reduce_sum(self.y_neg.log_prob(0.0))

            self.log_likelihood = self.ll_pos + self.ll_neg

            scale = 1.0 * d.N / d.n_minibatch
            self.loss = -(scale * self.log_likelihood + self.log_prior)

            # Training
            optimizer = tf.train.AdamOptimizer()
            self.train = optimizer.minimize(self.loss)
            with self.sess.as_default():
                tf.global_variables_initializer().run()
            variable_summaries('rho', self.rho)
            variable_summaries('alpha', self.alpha)
            with tf.name_scope('objective'):
                tf.summary.scalar('loss', self.loss)
                tf.summary.scalar('priors', self.log_prior)
                tf.summary.scalar('ll_pos', self.ll_pos)
                tf.summary.scalar('ll_neg', self.ll_neg)
            self.summaries = tf.summary.merge_all()
            self.train_writer = tf.summary.FileWriter(self.logdir,
                                                      self.sess.graph)
            self.saver = tf.train.Saver()
            config = projector.ProjectorConfig()

            alpha = config.embeddings.add()
            alpha.tensor_name = 'model/embeddings/alpha'
            alpha.metadata_path = '../vocab.tsv'
            rho = config.embeddings.add()
            rho.tensor_name = 'model/embeddings/rho'
            rho.metadata_path = '../vocab.tsv'
            projector.visualize_embeddings(self.train_writer, config)
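
A minimal driver sketch for the model above, assuming a hypothetical class name `EmbeddingModel`, a data object `d` like the one the constructor expects, and a hypothetical `d.batch()` method yielding `d.n_minibatch + d.cs` token ids per step; only the attributes used below appear in the original snippet:

import tensorflow as tf

model = EmbeddingModel(d, K=100, sig=10.0, sess=tf.Session(), logdir='log/')

for step in range(10000):
    words = d.batch()  # hypothetical: one minibatch of token ids
    _, summary = model.sess.run([model.train, model.summaries],
                                feed_dict={model.placeholders: words})
    model.train_writer.add_summary(summary, step)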