Example #1
File: models.py Project: kzky/works
 def __call__(self, d_gen, d=None):
     bs_gen = d_gen.shape[0]
     if d is not None:
         bs = d.shape[0]
         return F.sum(F.log(d)) / bs + F.sum(F.log(1 - d_gen)) / bs_gen
     else:
         return F.sum(F.log(1 - d_gen)) / bs_gen
Example #2
def get_nll_gaussian(preds, target, variance, add_const=False):
    neg_log_p = ((preds - target) ** 2 / (2 * variance))
    if add_const:
        const = 0.5 * (F.log(2 * preds.xp.array(preds.xp.pi, dtype=preds.dtype)) + \
                       F.log(preds.xp.array(variance, preds.dtype)))
        neg_log_p += const
    ret = F.sum(neg_log_p) / (target.shape[0] * target.shape[1])
    return ret
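A minimal usage sketch for the helper above (illustrative shapes; assumes a Chainer version that broadcasts the scalar constant, as the snippet itself requires):

import numpy as np
import chainer

preds = chainer.Variable(np.random.randn(8, 4).astype(np.float32))
target = np.random.randn(8, 4).astype(np.float32)
nll = get_nll_gaussian(preds, target, variance=0.1, add_const=True)
print(nll)  # 0-dimensional Variable: average negative log-likelihood per element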
Example #3
File: losses.py Project: kzky/works
 def __call__(self, d_x_gen, d_x_real=None):
     bs_d_x_gen = d_x_gen.shape[0]
     if d_x_real is not None:
         bs_d_x_real = d_x_real.shape[0]
         loss = F.sum(F.log(d_x_real)) / bs_d_x_real \
                + F.sum(F.log(1 - d_x_gen)) / bs_d_x_gen
         return - loss  # to minimize
         
     else:
         loss = F.sum(F.log(d_x_gen)) / bs_d_x_gen
         return - loss  # to minimize (reverse trick)
Example #4
File: losses.py Project: kzky/works
 def __call__(self, d_x_gen, d_x=None):
     #TODO: reverse trick
     bs_d_x_gen = d_x_gen.shape[0]
     if d_x is not None:
         bs_d_x = d_x.shape[0]
         loss = F.sum(F.log(F.sigmoid(d_x))) / bs_d_x \
                + F.sum(F.log(1 - F.sigmoid(d_x_gen))) / bs_d_x_gen
         return - loss  # to minimize
         
     else:
         loss = F.sum(F.log(1 - F.sigmoid(d_x_gen))) / bs_d_x_gen
         return loss
Example #5
 def kld(self, vec_true, vec_compare):
     ind = vec_true.data * vec_compare.data > 0
     ind_var = chainer.Variable(ind)
     include_nan = vec_true * F.log(vec_true / vec_compare)
     z = chainer.Variable(np.zeros((len(ind), 1), dtype=np.float32))
     # return np.nansum(vec_true * np.log(vec_true / vec_compare))
     return F.sum(F.where(ind_var, include_nan, z))
Example #6
 def _log_prob_words(self, context, temperature=1.0):
     """ This calculates an softmax over the vocabulary as a function
     of the dot product of context and word.
     """
     dot = F.matmul(context, F.transpose(self.vocab.W))
     prob = F.softmax(dot / temperature)
     return F.log(prob)
Example #7
def dirichlet_likelihood(weights, alpha=None):
    """ Calculate the log likelihood of the observed topic proportions.
    The returned value is the negative log likelihood under a Dirichlet prior.

    Args:
        weights (chainer.Variable): Unnormalized weight vector. The vector
            will be passed through a softmax function that will map the input
            onto a probability simplex.
        alpha (float): The Dirichlet concentration parameter. Alpha
            greater than 1.0 results in very dense topic weights such
            that each document belongs to many topics. Alpha < 1.0 results
            in sparser topic weights. The default is to set alpha to
            1.0 / n_topics, effectively enforcing the prior belief that a
            document belongs to very few topics at once.

    Returns:
        ~chainer.Variable: Output loss variable.
    """
    if type(weights) is Variable:
        n_topics = weights.data.shape[1]
    else:
        n_topics = weights.W.data.shape[1]
    if alpha is None:
        alpha = 1.0 / n_topics
    if type(weights) is Variable:
        proportions = F.softmax(weights)
    else:
        proportions = F.softmax(weights.W)
    loss = (alpha - 1.0) * F.log(proportions + 1e-8)
    return -F.sum(loss)
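A hypothetical usage sketch, passing the weights as a plain chainer.Variable so the first branch of each type check is taken:

import numpy as np
from chainer import Variable

weights = Variable(np.random.randn(4, 10).astype(np.float32))  # 4 documents, 10 topics
loss = dirichlet_likelihood(weights)  # alpha defaults to 1.0 / 10
print(loss)  # scalar Variable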
Example #8
def cosine_similarity(x, y, eps=1e-6):
    n1, n2, n3 = x.data.shape
    _, m2, _ = y.data.shape
    z = F.batch_matmul(x, y, transb=True)
    x2 = F.broadcast_to(F.reshape(F.sum(x * x, axis=2), (n1, n2, 1)), (n1, n2, m2))
    y2 = F.broadcast_to(F.reshape(F.sum(y * y, axis=2), (n1, 1, m2)), (n1, n2, m2))
    z /= F.exp(F.log(x2 * y2 + eps) / 2)
    return z
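A small shape check (illustrative values only):

import numpy as np
import chainer

x = chainer.Variable(np.random.randn(2, 3, 5).astype(np.float32))  # 2 batches of 3 vectors
y = chainer.Variable(np.random.randn(2, 4, 5).astype(np.float32))  # 2 batches of 4 vectors
z = cosine_similarity(x, y)
print(z.shape)  # (2, 3, 4): pairwise cosine similarities within each batch element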
Example #9
def ordinal_loss(y, mask):
    xp = cuda.get_array_module(y.data)
    volatile = y.volatile
    b, c, n = y.data.shape
    max_y = F.broadcast_to(F.max(y, axis=1, keepdims=True), y.data.shape)
    y = y - max_y
    sum_y = F.broadcast_to(F.expand_dims(F.sum(y, axis=1), 1), y.data.shape)
    down_tri = np.tri(c, dtype=np.float32)
    up_tri = down_tri.T
    w1 = Variable(xp.asarray(down_tri.reshape(c, c, 1, 1)), volatile=volatile)
    w2 = Variable(xp.asarray(up_tri.reshape(c, c, 1, 1)), volatile=volatile)
    h = F.exp(F.expand_dims(y, -1))
    h1 = F.convolution_2d(h, w1)
    h1 = F.convolution_2d(F.log(h1), w1)
    h2 = F.convolution_2d(h, w2)
    h2 = F.convolution_2d(F.log(h2), w2)
    h = F.reshape(h1 + h2, (b, c, n))
    return F.sum((h - sum_y - y) * mask) / b
Example #10
    def read(address):
        # map from the reals to the hypercube of dimension n
        index = F.tanh(address)

        # map from a point to the nearest corner of the hypercube
        f = lambda x: x > 0
        mainIndex = np.vectorize(f, cache=True)(index.data)

        mainValue = F.select_item(array, lookup(mainIndex))
        scaleFactor = F.exp(F.sum(F.log(F.absolute(x))))

        return mainValue * scaleFactor
Example #11
File: net.py Project: szdr/RankNet
 def __call__(self, x_i, x_j, t_i, t_j):
     s_i = self.predictor(x_i)
     s_j = self.predictor(x_j)
     s_diff = s_i - s_j
     if t_i.data > t_j.data:
         S_ij = 1
     elif t_i.data < t_j.data:
         S_ij = -1
     else:
         S_ij = 0
     self.loss = (1 - S_ij) * s_diff / 2. + F.log(1 + F.exp(-s_diff))
     return self.loss
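For intuition, the pairwise term can be checked in isolation with illustrative values (in the real code s_diff comes from self.predictor):

import numpy as np
import chainer
import chainer.functions as F

s_diff = chainer.Variable(np.array([[2.0]], dtype=np.float32))
S_ij = 1  # item i is labelled more relevant than item j
loss = (1 - S_ij) * s_diff / 2. + F.log(1 + F.exp(-s_diff))
print(loss)  # ~0.127, i.e. -log(sigmoid(2.0))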
Example #12
def calc_log_posterior(theta, x, n=None):
    """Calculate unnormalized log posterior, ``log p(theta | x) + C``

    Args:
        theta(chainer.Variable): model parameters
        x(numpy.ndarray): sample data
        n(int): total data size
    Returns:
        chainer.Variable: Variable holding the unnormalized log posterior,
        ``log p(theta | x) + C`` of shape ``()``
    """

    theta1, theta2 = F.split_axis(theta, 2, 0)
    log_prior1 = F.sum(F.log(gaussian.gaussian_likelihood(theta1, 0, VAR1)))
    log_prior2 = F.sum(F.log(gaussian.gaussian_likelihood(theta2, 0, VAR2)))
    prob1 = gaussian.gaussian_likelihood(x, theta1, VAR_X)
    prob2 = gaussian.gaussian_likelihood(x, theta1 + theta2, VAR_X)
    log_likelihood = F.sum(F.log(prob1 / 2 + prob2 / 2))
    if n is not None:
        log_likelihood *= n / len(x)
    return log_prior1 + log_prior2 + log_likelihood
Example #13
 def _additional_score(self, y, a, src):
     batch_size = len(y.data)
     vocab_size = self._output
     xp         = self._xp
     src_len    = len(self.prob_dict)
     # Calculating dict prob
     y_dict = F.reshape(F.batch_matmul(self.prob_dict, a, transa=True), (batch_size, vocab_size))
     is_prob = False
     # Using dict prob
     if self._method == "bias":
         yp = y + F.log(eps + y_dict)
     elif self._method == "linear":
         yp = self.LI(y_dict, F.softmax(y))
         is_prob = True
     else:
         raise ValueError("Unrecognized dictionary method:", self._method)
     return yp, is_prob
Example #14
def choose_var_of_type(spec, context, scope, type_def):
    compatible_scope = [var for var in scope if var.type_def.can_be(type_def)]
    scope = list(scope)

    var_ndxs = [i for i in range(len(scope)) if scope[i].type_def.can_be(type_def)]
    var_embeddings = [scope[i].vec for i in var_ndxs]
    var_lprobs = [F.matmul(vec, F.transpose(context['state'])) for vec in var_embeddings]
    normalizer = Variable(np.array([[0]], dtype=np.float32))
    for vlp in var_lprobs:
        normalizer = normalizer + F.exp(vlp)
    normalizer = F.log(normalizer)
    var_lprobs = [vlp - normalizer for vlp in var_lprobs]
    vlp_data = np.array([vlp.data for vlp in var_lprobs])[:,0,0]
    ps = np.exp(vlp_data)
    ps /= np.sum(ps)

    ndx = np.random.choice(range(len(ps)), p=ps)
    lp = var_lprobs[ndx]
    var = scope[var_ndxs[ndx]]
    context['lp'] += lp[:,0]
    return var, context
Example #15
    def free_energy(self, v):
        """
        :param Variable (batch_size, in_channels, image_height, image_width) - input data (training data)
        :return: scalar
        """
        batch_size = v.data.shape[0]
        in_channels = self.in_channels
        real = self.real
        if real == 0:
            '''
            visible layer is 0, 1 (bit)
            vbias_term = 1 * SUM(a(i) * v(i))
            '''
            v_sum = F.sum(v, axis=(2, 3))  # sum over image_height & image_width
            # Strictly, this should return a per-sample sum,
            # but a single scalar (summed over the batch) works here because everything is summed again at the end.
            vbias_term = F.sum(F.matmul(v_sum, self.conv.a))
            wx_b = self.conv(v)

        else:
            '''
            visible layer takes real value
            vbias_term = 0.5 * SUM((v(i)-a(i)) * (v(i) - a(i)))
            '''
            #TODO: check
            #m = Variable(xp.ones((batch_size, 1), dtype=xp.float32))
            n = F.reshape(self.conv.a, (1, in_channels, 1, 1))
            xp = cuda.get_array_module(n.data)
            std_ch = xp.reshape(self.std, (1, in_channels, 1, 1))

            #v_ = v - F.matmul(m, n)
            v_ = (v - F.broadcast_to(n, v.data.shape)) / std_ch
            vbias_term = F.sum(0.5 * v_ * v_)
            wx_b = self.conv(v / std_ch)


        hidden_term = F.sum(F.log(1 + F.exp(wx_b)))
        # print('vbias = ', vbias_term.data, ', hidden = ', hidden_term.data, 'F.exp(wx_b) = ', F.exp(wx_b).data)
        return - vbias_term - hidden_term
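Side note: log(1 + exp(w)) is the softplus function, so the hidden term could also be written with Chainer's built-in F.softplus, which avoids overflow for large activations. A small illustration (not part of the original code):

import numpy as np
import chainer
import chainer.functions as F

w = chainer.Variable(np.array([-5.0, 0.0, 100.0], dtype=np.float32))
print(F.log(1 + F.exp(w)).data)  # the last entry overflows to inf in float32
print(F.softplus(w).data)        # [0.0067, 0.693, 100.0]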
Example #16
def focal_loss(predictions,
               actual_obj_ids,
               gamma=2.0,
               alpha=0.25,
               class_weight=None,
               xp=numpy):
    pred_probas = F.softmax(predictions)
    actual_probas = xp.eye(predictions.shape[-1])[actual_obj_ids]

    pt_positive = actual_probas * pred_probas
    pt_negative = (1. - actual_probas) * (1. - pred_probas)
    pt = pt_positive + pt_negative

    at_positive = actual_probas * alpha
    at_negative = (1. - actual_probas) * (1. - alpha)
    at = at_positive + at_negative

    fl = -at * (1. - pt)**gamma * F.log(pt)
    if class_weight is not None:
        #print(fl.shape, class_weight.shape)
        weights = xp.array(class_weight.reshape(fl.shape))
        fl = weights * fl
    return F.mean(fl)
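A minimal usage sketch (illustrative shapes; float64 inputs so the dtype matches xp.eye's default, with xp left at numpy):

import numpy as np
import chainer

predictions = chainer.Variable(np.random.randn(5, 3))  # 5 samples, 3 classes
actual_obj_ids = np.array([0, 2, 1, 1, 0])
loss = focal_loss(predictions, actual_obj_ids)
print(loss)  # scalar Variable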
Example #17
def listnet(x, t, nr_docs):
    """
    The Top-1 approximated ListNet loss as in Cao et al. (2007), Learning to
    Rank: From Pairwise Approach to Listwise Approach

    :param x: The activation of the previous layer
    :type x: chainer.Variable

    :param t: The target labels
    :type t: chainer.Variable

    :param nr_docs: The number of documents per query
    :type nr_docs: chainer.Variable

    :return: The Top-1 listnet loss
    :rtype: chainer.Variable
    """
    t, nr_docs = as_variable(t), as_variable(nr_docs)
    t = t.data.astype(x.dtype)
    st = cf.softmax(t, axis=1)
    sx = cf.softmax(x, axis=1)
    sce = -cf.mean(st * cf.log(sx), axis=1)
    return cf.mean(sce)
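A hypothetical usage sketch, assuming the module's as_variable and cf refer to chainer.as_variable and chainer.functions:

import numpy as np
import chainer

x = chainer.Variable(np.random.randn(2, 5).astype(np.float32))      # scores: 2 queries x 5 docs
t = np.array([[3, 2, 0, 0, 1], [1, 0, 2, 0, 0]], dtype=np.float32)  # relevance labels
loss = listnet(x, t, nr_docs=np.array([5, 5], dtype=np.float32))
print(loss)  # scalar Variable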
Example #18
def get_normalized_image_variable(time, wavelength):
    img = get_sun_image(time, wavelength)
    if img is None:
        return None

    img = img[np.newaxis, np.newaxis, :, :]
    img = img.astype(np.float32)
    x = Variable(img)
    if gpuid >= 0:
        x.to_gpu()

    if wavelength == 'hmi':
        ret = x / 300
    elif wavelength == 211:
        ret = F.sigmoid(x / 100)
    elif wavelength == 193:
        ret = F.sigmoid(x / 300)
    elif wavelength == 94:
        ret = F.sigmoid(x / 30)
    else:
        ret = F.log(1 + F.relu(x))

    return ret
Example #19
    def decode(self, input_id, teacher_id, label_id, word_th, train=True):
        """
        :param input_id: batch of word ID by output of decoder
        :param teacher_id : batch of correct ID
        :param label_id :
        :param word_th : batch of correct at label
        :param train: True or false
        :return: decoded embed vector
        """
        batch_word = chainer.Variable(xp.array(input_id, dtype=xp.int32))
        batch_label = chainer.Variable(xp.array(label_id, dtype=xp.int32))
        predict_mat, predict_at, self.c_batch, self.h_batch = self.dec(
            batch_word, batch_label, self.c_batch, self.h_batch, train=train)
        if train:
            t = xp.array(teacher_id, dtype=xp.int32)
            t = chainer.Variable(t)

            predict_ids = xp.argmax(predict_mat.data, axis=1)
            correct_at = xp.zeros((1, predict_ids.shape[0]), dtype=xp.float32)
            for ind in range(predict_ids.shape[0]):
                # right answer
                if predict_ids[ind] < word_th and teacher_id[ind] < word_th:
                    correct_at[0, ind] = 1.0
                elif predict_ids[ind] > word_th and teacher_id[ind] > word_th:
                    correct_at[0, ind] = 1.0
                # wrong answer
                else:
                    correct_at[0, ind] = 0.0
            correct_at = chainer.Variable(
                correct_at.reshape(predict_ids.shape[0], 1))
            at_loss = -F.sum(F.log(predict_at) * correct_at) / self.batch_size
            # if at_loss.data > 0:
            #     print(at_loss.data)
            return F.softmax_cross_entropy(predict_mat,
                                           t) + at_loss, predict_mat
        else:
            return predict_mat
Example #20
def get_dealer_sampling(N_pic=100, imgH=64, imgW=64, N_card=4):

    thres = [0.99995, 0.9999, 0.9998, 0.9995]  # at *512, corresponds to about 13, 26, 52, 131 points

    # <generate random-dot images>
    img_r = xp.random.rand(N_pic,
                           imgW * imgH).astype(np.float32)  # uniform [0, 1) random numbers for N_pic images
    img_p = xp.zeros(
        (N_card, N_pic, imgW * imgH)).astype(np.float32)  # allocate memory for N_card * N_pic images

    for i, thre in enumerate(thres):  # set 1 only where the random value exceeds the threshold
        img_p[i][img_r >= thre] = 1

    # reshape the dot images: (N_card, N_pic, imgW*imgH) -> (N_pic, imgW*imgH, N_card)
    img_p = chainer.Variable(img_p.transpose((1, 2, 0)))

    # <generate sampling coefficients>
    # create N_pic ones
    x_one = xp.ones((N_pic, 1), dtype=np.float32)
    # pass the ones through the dealer, then softmax into probabilities in [0, 1]
    card_prob = F.softmax(Md['de'](x_one))
    # sample via gumbel_softmax
    card_gum = F.gumbel_softmax(F.log(card_prob), tau=0.2)
    # broadcast the sampling coefficients to image shape: (N_pic, N_card) -> (N_pic, imgW*imgH, N_card)
    card_gum_b = F.broadcast_to(F.reshape(card_gum, (N_pic, 1, N_card)),
                                img_p.shape)

    # <combine the random-dot images with the sampling coefficients>
    # multiply the dot images by the sampling coefficients, sum them, and reshape into 2-D images
    img_p_sum = F.reshape(F.sum(img_p * card_gum_b, axis=2),
                          (N_pic, 1, imgH, imgW))

    # turn the dots into Gaussian blobs
    img_core = Md['decon_core'](img_p_sum) * 255
    img_core = F.broadcast_to(img_core, (N_pic, 3, imgH, imgW))

    return img_core
Example #21
 def regular_graph_output(
     self, f_A, f_G
 ):  # f_A is appearance feature shape = (N,D), f_G is geometry feature shape = (N,4)
     assert f_A.shape[0] == f_G.shape[0]
     if self.add_self:
         assert f_A.shape[1] == self.out_size
     N = f_G.shape[0]
     geo_dim = f_G.shape[1]
     f_R = []
     for nr in range(self.num_relations):
         f_G = F.tile(f_G, (1, N))  # shape = (N, 4 * N)
         f_G_1 = F.reshape(
             f_G, (N * N, geo_dim))  # after tile: N x (4 x N), then N^2 x 4
         f_G_2 = F.tile(f_G, (N, 1))  # shape = (N*N, 4)
         encoded_offset = self.encode_box_offset(f_G_1,
                                                 f_G_2)  # shape = (N*N, 4)
         # paper formula (5), shape = (N,N)
         w_G = F.relu(
             getattr(self, self.W_G_lst[nr])(self.position_encoding(
                 encoded_offset, self.d_g)))
         w_G = F.reshape(w_G, shape=(N, N))
         # paper formula (4), shape = (N,N)
         w_K_result = getattr(self,
                              self.W_K_lst[nr])(f_A)  # shape = (N, d_k)
         w_Q_transpose_result = F.transpose(
             getattr(self, self.W_Q_lst[nr])(f_A))  # shape = (d_k, N)
         w_A = F.matmul(w_K_result, w_Q_transpose_result)  # shape = (N,N)
         # paper formula (3), shape = (N,N)
         w_A = w_A + F.log(w_G)
         w = F.softmax(w_A, axis=1)
         # w = w_G * F.exp(w_A) / F.sum(w_G * F.exp(w_A), axis=1) # denominator shape = (N,1) numerator shape = (N,N)
         # paper formula (2), weight sum = matmul:(N,N) x (N, out_size//nr) = (N, out_size//nr)
         f_R_nr = F.matmul(w, getattr(self, self.W_V_lst[nr])(f_A))
         f_R.append(f_R_nr)
     if self.add_self:
         return f_A + F.concat(f_R, axis=1)
     return F.concat(f_R, axis=1)
Example #22
File: lstm.py Project: musyoku/NLP
    def train(self, seq_batch, test=False):
        self.reset_state()
        forward_h, backward_h = self.scan(seq_batch, test=test)
        xp = self.xp
        sum_loss = 0
        seq_batch = seq_batch.T

        for i in xrange(len(forward_h)):
            fh = forward_h[i]
            bh = backward_h[i]
            c = seq_batch[i]
            c = Variable(xp.asanyarray(c, dtype=np.int32))
            if fh is None:
                out = bh
            elif bh is None:
                out = fh
            else:
                h = F.concat((fh, bh))
                forget = self.forget(h, test=test)
                forget = F.softmax(forget)
                out = apply_attention(fh, forget, 0) + apply_attention(
                    bh, forget, 1)
            if self.fc is not None:
                out = self.fc(out, test=test)
            entropy = 0
            if fh is not None and bh is not None:
                entropy = -forget * F.log(forget + 1e-6)
                entropy = F.sum(entropy)
            loss = F.softmax_cross_entropy(out, c) + entropy
            sum_loss += loss

        self.zero_grads()
        sum_loss.backward()
        self.update()
        if self.gpu:
            sum_loss.to_cpu()
        return sum_loss.data
Example #23
        def lf(z_t, z_t_plus_1, action, done_label, reset=True):
            k = self.k
            output_dim = self.output_dim
            if reset:
                self.reset_state()

            output = self.fprop(F.concat((z_t, action)))
            if self.predict_done:
                coef, mu, ln_var, done = output
            else:
                coef, mu, ln_var = output

            coef = F.reshape(coef, (-1, output_dim, k))
            coef = F.softmax(coef, axis=2)
            mu = F.reshape(mu, (-1, output_dim, k))
            ln_var = F.reshape(ln_var, (-1, output_dim, k))

            z_t_plus_1 = F.repeat(z_t_plus_1, k, 1).reshape(-1, output_dim, k)

            normals = F.sum(
                coef *
                F.exp(-F.gaussian_nll(z_t_plus_1, mu, ln_var, reduce='no')),
                axis=2)
            densities = F.sum(normals, axis=1)
            nll = -F.log(densities)

            loss = F.sum(nll)

            if self.predict_done:
                done_loss = F.sigmoid_cross_entropy(done.reshape(-1, 1),
                                                    done_label,
                                                    reduce="no")
                done_loss *= (1. + done_label.astype("float32") * 9.)
                done_loss = F.mean(done_loss)
                loss = loss + done_loss

            return loss
Example #24
    def compute_logits(self, new_states, concatenated, attn):
        new_output_state = new_states[-1]

        all_concatenated = F.concat((concatenated, new_output_state))
        logits = self.decoder_chain.lin_o(self.decoder_chain.maxo(all_concatenated))

        if self.lexicon_probability_matrix is not None:
            current_mb_size = new_output_state.data.shape[0]
            assert self.mb_size is None or current_mb_size <= self.mb_size
            lexicon_probability_matrix = self.lexicon_probability_matrix[:current_mb_size]

            # Just making sure data shape is as expected
            attn_mb_size, max_source_length_attn = attn.data.shape
            assert attn_mb_size == current_mb_size
            lex_mb_size, max_source_length_lexicon, v_size_lexicon = lexicon_probability_matrix.shape
            assert max_source_length_lexicon == max_source_length_attn
            assert logits.data.shape == (current_mb_size, v_size_lexicon)

            if self.demux:
                assert lex_mb_size == 1
                weighted_lex_probs = F.reshape(
                    matmul_constant(attn, lexicon_probability_matrix.reshape(lexicon_probability_matrix.shape[1],
                                                                             lexicon_probability_matrix.shape[2])),
                    logits.data.shape)
            else:
                assert lex_mb_size == current_mb_size

    #                 weighted_lex_probs = F.reshape(
    #                         F.batch_matmul(attn, ConstantFunction(lexicon_probability_matrix)(), transa = True),
    #                                                logits.data.shape)

                weighted_lex_probs = F.reshape(
                    batch_matmul_constant(attn, lexicon_probability_matrix, transa=True),
                    logits.data.shape)

            logits += F.log(weighted_lex_probs + self.lex_epsilon)
        return logits
Example #25
    def _compute_loss(self, exp_batch, errors_out=None):
        """Compute a loss of categorical DQN."""
        y, t = self._compute_y_and_t(exp_batch)
        # Minimize the cross entropy
        # y is clipped to avoid log(0)
        eltwise_loss = -t * F.log(F.clip(y, 1e-10, 1.))

        if errors_out is not None:
            del errors_out[:]
            delta = F.sum(eltwise_loss, axis=1)
            delta = cuda.to_cpu(delta.array)
            for e in delta:
                errors_out.append(e)

        if 'weights' in exp_batch:
            return compute_weighted_value_loss(
                y,
                t,
                exp_batch['weights'],
                batch_accumulator=self.batch_accumulator)
        else:
            return compute_value_loss(y,
                                      t,
                                      batch_accumulator=self.batch_accumulator)
Example #26
File: lstm.py Project: musyoku/NLP
	def train(self, seq_batch, test=False):
		self.reset_state()
		forward_h, backward_h = self.scan(seq_batch, test=test)
		xp = self.xp
		sum_loss = 0
		seq_batch = seq_batch.T

		for i in xrange(len(forward_h)):
			fh = forward_h[i]
			bh = backward_h[i]
			c = seq_batch[i]
			c = Variable(xp.asanyarray(c, dtype=np.int32))
			if fh is None:
				out = bh
			elif bh is None:
				out = fh
			else:
				h = F.concat((fh, bh))
				forget = self.forget(h, test=test)
				forget = F.softmax(forget)
				out = apply_attention(fh, forget, 0) + apply_attention(bh, forget, 1)
			if self.fc is not None:
				out = self.fc(out, test=test)
			entropy = 0
			if fh is not None and bh is not None:
				entropy = -forget * F.log(forget + 1e-6)
				entropy = F.sum(entropy)
			loss = F.softmax_cross_entropy(out, c) + entropy
			sum_loss += loss

		self.zero_grads()
		sum_loss.backward()
		self.update()
		if self.gpu:
			sum_loss.to_cpu()
		return sum_loss.data
Example #27
import numpy as np
import chainer
from chainer import functions as F

a = chainer.Variable(np.array(6.))
b = chainer.Variable(np.array(4.))
c = chainer.Variable(np.array(2.))

r = (a + b) * (F.log(b) + c)
# r = (a + b) * (F.log(np.array(4.)) + c)

r.grad = np.array(5.0)
r.backward()
print(a.grad, b.grad, c.grad)
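For reference, the printed gradients should be approximately 16.93, 29.43 and 50.0: dr/da = log(b) + c ≈ 3.386, dr/db = (log(b) + c) + (a + b)/b ≈ 5.886, and dr/dc = a + b = 10, each scaled by r.grad = 5.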
Example #28
def concat_losses(p, e):
    loss_x = -F.sum(F.log(sum_axis(p))) / numpy.float32(p.data.shape[0])
    loss_e = F.sigmoid_cross_entropy(*e)
    return loss_x + loss_e
Example #29
 def _loss(self, context, target, weight):
     _context = F.dropout(context, ratio=self.dropout_ratio)
     _word = F.dropout(self.vocab(target), ratio=self.dropout_ratio)
     dot = -F.log(F.sigmoid(F.sum(_context * _word, axis=1)) + 1e-9)
     return F.sum(dot * weight)
Example #30
def log():
    x = rand((1, 8, 8, 8), bias=1e-10)
    y = F.log(x)
    return {'input': x}, {'out': y}
Example #31
 def __init__(self, mean, var):
     self.mean = _wrap_by_variable(mean)
     self.var = _wrap_by_variable(var)
     self.ln_var = F.log(var)
Example #32
 def all_log_prob(self):
     with chainer.force_backprop_mode():
         if self.min_prob > 0:
             return F.log(self.all_prob)
         else:
             return F.log_softmax(self.beta * self.logits)
Example #33
 def softplus(self, x):
     return F.log(F.exp(x) + 1)
Example #34
 def __call__(self, x):
     x = F.log(x) + 13.0
     h = F.leaky_relu(self.l1(x))
     h = F.leaky_relu(self.l2(h))
     h = F.leaky_relu(self.l3(h))
     return F.exp(self.l9(h)-13.0)
Example #35
def myCrossEntropyError(m, y):
    DELTA = 1e-7  # add a tiny value so that the log never produces negative infinity
    return -F.sum(y * F.log(m + DELTA) + (1 - y) * F.log(1 - m + DELTA))
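A quick numerical check with illustrative values:

import numpy as np
import chainer

m = chainer.Variable(np.array([0.9, 0.2], dtype=np.float32))  # predicted probabilities
y = np.array([1.0, 0.0], dtype=np.float32)                    # binary targets
print(myCrossEntropyError(m, y))  # ~0.33 = -(log 0.9 + log 0.8)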
Example #36
def logsumexp(x, mask, zero_pad, axis):
    x_exp = F.where(mask, F.exp(x), zero_pad)
    return F.log(F.sum(x_exp, axis=axis))
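A small usage sketch; the mask marks valid entries and zero_pad replaces the masked-out exponentials:

import numpy as np
import chainer

x = chainer.Variable(np.array([[0.5, 1.5, -2.0]], dtype=np.float32))
mask = np.array([[True, True, False]])  # the last entry is padding
zero_pad = chainer.Variable(np.zeros((1, 3), dtype=np.float32))
print(logsumexp(x, mask, zero_pad, axis=1))  # log(e^0.5 + e^1.5) ~ 1.81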
Example #37
 def _encode(self, xs):
     exs = self.embed_mat(xs)
     h = F.tanh(self.l1(exs))
     logits = F.softplus(self.l2(h))
     logits = F.log(logits + 1e-10).reshape(-1, self.M, self.K)
     return logits, exs
Example #38
File: adgm.py Project: musyoku/adgm
	def compute_lower_bound(self, x_l_cpu_data, y_l_cpu_data, x_u_cpu_data, test=False):
		assert(isinstance(x_l_cpu_data, np.ndarray))

		def lower_bound(log_px, log_py, log_pa, log_pz, log_qz, log_qa):
			return log_px + log_py + log_pa + log_pz - log_qz - log_qa


		# _l: labeled
		# _u: unlabeled
		batchsize_l = x_l_cpu_data.shape[0]
		batchsize_u = x_u_cpu_data.shape[0]
		ndim_x = x_u_cpu_data.shape[1]
		n_types_of_label = y_l_cpu_data.shape[1]
		num_mc_samples = self.config.num_mc_samples
		xp = self.xp

		### lower bound of labeled data ###
		# repeat num_mc_samples times
		if num_mc_samples == 1:
			x_l = self.to_variable(x_l_cpu_data)
			y_l = self.to_variable(y_l_cpu_data)
		else:
			x_l = self.to_variable(np.repeat(x_l_cpu_data, num_mc_samples, axis=0))
			y_l = self.to_variable(np.repeat(y_l_cpu_data, num_mc_samples, axis=0))

		a_mean_l, a_ln_var_l = self.q_a_x(x_l, test=test)
		a_l = F.gaussian(a_mean_l, a_ln_var_l)
		z_mean_l, z_ln_var_l = self.q_z_axy(a_l, x_l, y_l, test=test)
		z_l = F.gaussian(z_mean_l, z_ln_var_l)

		# compute lower bound
		log_pa_l = self.log_pa(a_l, x_l, y_l, z_l, test=test)
		log_px_l = self.log_px(a_l, x_l, y_l, z_l, test=test)
		log_py_l = self.log_py(y_l)
		log_pz_l = self.log_pz(z_l)
		log_qa_l = -self.gaussian_nll_keepbatch(a_l, a_mean_l, a_ln_var_l)	# 'gaussian_nll_keepbatch' returns the negative log-likelihood
		log_qz_l = -self.gaussian_nll_keepbatch(z_l, z_mean_l, z_ln_var_l)
		lower_bound_l = lower_bound(log_px_l, log_py_l, log_pa_l, log_pz_l, log_qz_l, log_qa_l)

		# take the average
		if num_mc_samples > 1:
			lower_bound_l /= num_mc_samples

		### lower bound of unlabeled data ###
		if batchsize_u > 0:
			# To marginalize y, we repeat unlabeled x, and construct a target (batchsize_u * n_types_of_label) x n_types_of_label
			# Example of n-dimensional x and target matrix for a 3 class problem and batch_size=2.
			#       x_u              y_repeat
			#  [[x0[0], x0[1], ..., x0[n]]         [[1, 0, 0]
			#   [x1[0], x1[1], ..., x1[n]]          [1, 0, 0]
			#   [x0[0], x0[1], ..., x0[n]]          [0, 1, 0]
			#   [x1[0], x1[1], ..., x1[n]]          [0, 1, 0]
			#   [x0[0], x0[1], ..., x0[n]]          [0, 0, 1]
			#   [x1[0], x1[1], ..., x1[n]]]         [0, 0, 1]]

			# marginalize x and y
			x_u_marg = np.broadcast_to(x_u_cpu_data, (n_types_of_label, batchsize_u, ndim_x)).reshape((batchsize_u * n_types_of_label, ndim_x))
			y_u_marg = np.repeat(np.identity(n_types_of_label, dtype=np.float32), batchsize_u, axis=0)

			# repeat num_mc_samples times
			x_u = x_u_marg
			y_u = y_u_marg
			if num_mc_samples > 1:
				n_rows_marg = x_u_marg.shape[0]
				n_rows = n_rows_marg * num_mc_samples
				x_u = np.repeat(x_u_marg, num_mc_samples, axis=0)
				y_u = np.repeat(y_u_marg, num_mc_samples, axis=0)

			x_u = self.to_variable(x_u)
			y_u = self.to_variable(y_u)

			a_mean_u, a_ln_var_u = self.q_a_x(x_u, test=test)
			a_u = F.gaussian(a_mean_u, a_ln_var_u)
			z_mean_u, z_ln_var_u = self.q_z_axy(a_u, x_u, y_u, test=test)
			z_u = F.gaussian(z_mean_u, z_ln_var_u)

			# compute lower bound
			log_pa_u = self.log_pa(a_u, x_u, y_u, z_u, test=test)
			log_px_u = self.log_px(a_u, x_u, y_u, z_u, test=test)
			log_py_u = self.log_py(y_u)
			log_pz_u = self.log_pz(z_u)
			log_qa_u = -self.gaussian_nll_keepbatch(a_u, a_mean_u, a_ln_var_u)	# 'gaussian_nll_keepbatch' returns the negative log-likelihood
			log_qz_u = -self.gaussian_nll_keepbatch(z_u, z_mean_u, z_ln_var_u)
			lower_bound_u = lower_bound(log_px_u, log_py_u, log_pa_u, log_pz_u, log_qz_u, log_qa_u)

			# Compute sum_y{q(y|x){-L(x,y) + H(q(y|x))}}
			# Let LB(xn, y) be the lower bound for an input image xn and a label y (y = 0, 1, ..., 9).
			# Let bs be the batchsize.
			# 
			# lower_bound_u is a vector and it looks like...
			# [LB(x0,0), LB(x1,0), ..., LB(x_bs,0), LB(x0,1), LB(x1,1), ..., LB(x_bs,1), ..., LB(x0,9), LB(x1,9), ..., LB(x_bs,9)]
			# 
			# After reshaping. (axis 1 corresponds to label, axis 2 corresponds to batch)
			# [[LB(x0,0), LB(x1,0), ..., LB(x_bs,0)],
			#  [LB(x0,1), LB(x1,1), ..., LB(x_bs,1)],
			#                   .
			#                   .
			#                   .
			#  [LB(x0,9), LB(x1,9), ..., LB(x_bs,9)]]
			# 
			# After transposing. (axis 1 corresponds to batch)
			# [[LB(x0,0), LB(x0,1), ..., LB(x0,9)],
			#  [LB(x1,0), LB(x1,1), ..., LB(x1,9)],
			#                   .
			#                   .
			#                   .
			#  [LB(x_bs,0), LB(x_bs,1), ..., LB(x_bs,9)]]
			if num_mc_samples == 1:
				lower_bound_u = F.transpose(F.reshape(lower_bound_u, (n_types_of_label, -1)))
			else:
				lower_bound_u = F.reshape(lower_bound_u, (n_types_of_label, num_mc_samples * batchsize_u))
				lower_bound_u = F.transpose(lower_bound_u)

			# take expectations w.r.t y
			if num_mc_samples == 1:
				x_u = self.to_variable(x_u_cpu_data)
			else:
				x_u = self.to_variable(np.repeat(x_u_cpu_data, num_mc_samples, axis=0))

			a_mean_u, a_ln_var_u = self.q_a_x(x_u, test=test)
			a_u = F.gaussian(a_mean_u, a_ln_var_u)
			y_distribution = F.softmax(self.q_y_ax(a_u, x_u, test=test))

			lower_bound_u = y_distribution * (lower_bound_u - F.log(y_distribution + 1e-6))

			# take the average
			if num_mc_samples > 1:
				lower_bound_u /= num_mc_samples

			lb_labeled = F.sum(lower_bound_l) / batchsize_l
			lb_unlabeled = F.sum(lower_bound_u) / batchsize_u
			lower_bound = lb_labeled + lb_unlabeled
		else:
			lb_unlabeled = None
			lb_labeled = F.sum(lower_bound_l) / batchsize_l
			lower_bound = lb_labeled

		return lower_bound, lb_labeled, lb_unlabeled
Example #39
 def forward(self, x):
     y1 = F.log(x)
     return y1
Example #40
def get_kl_categorical_uniform(preds, num_atoms, num_edge_types, add_const=False, eps=1e-16):
    kl_div = preds * F.log(preds + eps)
    if add_const:
        const = F.log(preds.xp.array(num_edge_types, dtype=preds.dtype))
        kl_div += const
    return F.sum(kl_div) / (num_atoms * preds.shape[0])
Example #41
#setup optimizer
optimizerG = optimizers.Adam(alpha=0.001)
optimizerG.setup(G)
optimizerG.add_hook(chainer.optimizer.WeightDecay(0.0005))
optimizerD = optimizers.Adam(alpha=0.001)
optimizerD.setup(D)
optimizerD.add_hook(chainer.optimizer.WeightDecay(0.0005))

batch_size = args.batch
for i in xrange(args.iter):
    for k in xrange(args.ksteps):
        optimizerD.zero_grads()
        z_batch = gen.sample(batch_size)
        x_batch = data.sample(batch_size)
        # check the sign
        loss_d = F.sum(-F.log(D(x_batch)) -
                       F.log(np.ones([batch_size, 1]) -
                             D(G(z_batch)))) / batch_size
        loss_d.backward()

        optimizerD.update()

    optimizerG.zero_grads()
    z_batch = gen.sample(batch_size)
    x_batch = data.sample(batch_size)
    #loss_g=F.sum( F.log(np.ones([batch_size,1])-D(G(z_batch))) )/batch_size
    loss_g = -F.sum(F.log(D(G(z_batch)))) / batch_size
    loss_g.backward()
    optimizerG.update()

    if i != 0 and i % 100 == 0:
Example #42
def sigmoid_cross_entropy(x, z):
    return F.relu(x) - x * z + F.log(1 + F.exp(-abs(x)))
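A quick numerical check against Chainer's built-in, assuming a version whose F.sigmoid_cross_entropy supports reduce='no':

import numpy as np
import chainer
import chainer.functions as F

x = chainer.Variable(np.array([[2.0, -1.0]], dtype=np.float32))  # logits
z = np.array([[1.0, 0.0]], dtype=np.float32)                     # targets
manual = sigmoid_cross_entropy(x, z)
builtin = F.sigmoid_cross_entropy(x, z.astype(np.int32), reduce='no')
print(manual.data)   # [[0.127 0.313]]
print(builtin.data)  # same values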
Example #43
 def square_norm(x, y):
     return F.sum((F.log(x) - F.log(y)) ** 2) / batchsize
Example #44
def entropy(p):
    return -F.sum(F.log(p) * p)
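A quick check (a uniform distribution over four outcomes has entropy log 4):

import numpy as np
import chainer

p = chainer.Variable(np.full(4, 0.25, dtype=np.float32))
print(entropy(p))  # ~1.386 == log(4)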
Example #45
 def soft_cross_entropy_loss(self, x, p):
     # add a small value inside log() to guard against numerical instability
     loss = -F.mean(F.sum(p * F.log(1e-30 + self.predict_proba(x)), axis=1))
     # reporting loss
     reporter.report({'loss': loss}, self)
     return loss
Example #46
 def compute_entropy(self, p):
     if p.ndim == 2:
         return -F.sum(p * F.log(p + 1e-16), axis=1)
     return -F.sum(p * F.log(p + 1e-16))
Example #47
 def __init__(self, mean, var):
     self.mean = distribution._wrap_by_variable(mean)
     self.var = distribution._wrap_by_variable(var)
     self.ln_var = F.log(var)
Example #48
 def compute_kld(self, p, q):
     assert self.get_batchsize(p) == self.get_batchsize(q)
     return F.reshape(
         F.sum(p * (F.log(p + 1e-16) - F.log(q + 1e-16)), axis=1), (-1, 1))
Example #49
def logsumexp(x, mask, zero_pad, axis):
    x_exp = F.where(mask, F.exp(x), zero_pad)
    return F.log(F.sum(x_exp, axis=axis))
Example #50
def oneplus(x):
    return 1 + F.log(1 + F.exp(x))
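For reference, this is 1 + softplus(x); a sketch of an equivalent form using Chainer's built-in softplus, which avoids overflowing exp(x) for large inputs:

import numpy as np
import chainer
import chainer.functions as F

def oneplus_stable(x):
    # same function, written with the built-in softplus
    return 1 + F.softplus(x)

x = chainer.Variable(np.array([-3.0, 0.0, 3.0], dtype=np.float32))
print(oneplus(x).data, oneplus_stable(x).data)  # both ~[1.049 1.693 4.049]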
Example #51
 def all_log_prob(self):
     with chainer.force_backprop_mode():
         return F.log(self.all_prob)
Example #52
 def square_norm(x, y):
     return F.sum((F.log(x) - F.log(y))**2) / batchsize
Example #53
def kl_divergence(y, t):
    entropy = -F.sum(t[t.nonzero()] * F.log(t[t.nonzero()]))
    cross_entropy = -F.sum(t * F.log_softmax(y))

    return (cross_entropy - entropy) / y.shape[0]
Example #54
def distance(y0, y1):
    p0 = F.sigmoid(y0)
    p1 = F.sigmoid(y1)
    return F.sum(p0 * F.log((p0 + 1e-8) / (p1 + 1e-8)) + (1 - p0) * F.log((1 - p0 + 1e-8) / (1 - p1 + 1e-8))) / \
           p0.data.shape[0]
Example #55
    def test_parameterized_softmax_distr(self):
        import numpy as np
        """test output dimension"""
        segs = (tuple(), (12, 3, 11), (12, ))
        x = np.arange(39, dtype=np.float32)
        x2 = 2 * x
        logits = F.stack((x, x2))
        beta = 1.0

        parameterized_distr = ParamSoftmaxDistribution(logits, segs, beta)

        p = parameterized_distr.all_prob
        log_p = parameterized_distr.all_log_prob

        # sanity check
        assert p.shape[1] == 1 + 12 * 3 * 11 + 12
        assert log_p.shape[1] == 1 + 12 * 3 * 11 + 12
        """test probabilities and log probabilities calculated"""
        x = np.arange(15, dtype=np.float32)
        x2 = 2 * x
        logits = F.stack((x, x2))

        num_of_cols = 6

        def probs():
            action_type = F.softmax(beta * logits[:, :3])
            back_prob = action_type[:, 0:1]
            filter_prob = action_type[:, 1:2]
            group_prob = action_type[:, 2:3]
            filter_col_prob = F.softmax(beta * logits[:, 3:3 + num_of_cols])
            filter_col_prob = F.broadcast_to(
                filter_prob, filter_col_prob.shape) * filter_col_prob
            group_col_prob = F.softmax(
                beta *
                logits[:, 3 + num_of_cols:3 + num_of_cols + num_of_cols])
            group_col_prob = F.broadcast_to(
                group_prob, group_col_prob.shape) * group_col_prob

            res = F.concat((back_prob, filter_col_prob, group_col_prob))
            # sanity check, sum(result, axis=1) == 1
            assert np.all(F.sum(res, axis=1).data == 1.0)

            return res

        def log_probs():
            action_type = F.log_softmax(beta * logits[:, :3])
            back_prob = action_type[:, 0:1]
            filter_prob = action_type[:, 1:2]
            group_prob = action_type[:, 2:3]
            filter_col_prob = F.log_softmax(beta *
                                            logits[:, 3:3 + num_of_cols])
            filter_col_prob = F.broadcast_to(
                filter_prob, filter_col_prob.shape) + filter_col_prob
            group_col_prob = F.log_softmax(
                beta *
                logits[:, 3 + num_of_cols:3 + num_of_cols + num_of_cols])
            group_col_prob = F.broadcast_to(
                group_prob, group_col_prob.shape) + group_col_prob

            res = F.concat((back_prob, filter_col_prob, group_col_prob))
            return res

        segs = (tuple(), (num_of_cols, ), (num_of_cols, ))
        parameterized_distr = ParamSoftmaxDistribution(logits, segs, beta)

        p = probs()
        p2 = parameterized_distr.all_prob

        # sanity check 2: probs() == generic_probs()
        assert np.allclose(p.data, p2.data)

        log_p = log_probs()
        log_p2 = parameterized_distr.all_log_prob

        # sanity check 3: probs() == generic_probs()
        assert np.allclose(log_p.data, log_p2.data)

        # sanity check 4: log(p)==log_p
        assert np.allclose(F.log(p).data, log_p.data)
Example #56
    def loss_softmax_cross_entropy(self, predict, ground_truth):
        eps = 1e-16
        cross_entropy = -F.mean(F.log(predict + eps) * ground_truth)

        return cross_entropy
Example #57
def forward(model, batch, num_samples, word_keep_rate, UNK, train=True):
    
    batch_size = batch.shape[0]

    xp = model.xp
    use_gpu = (xp == cuda.cupy)
    if use_gpu:
        batch = cuda.to_gpu(batch)

    model.reset_state()
    model.zerograds()

    # encode
    batch_length = len(batch[0])-1
    for i in range(batch_length):
        w = Variable(batch[:, i])
        model.encode(w, train=train)
    
    # infer q(z|x)
    model.infer(train=train)

    # compute KL
    KL = 0
    for i in range(model.num_layers):
        # h
        mu, sigma = model.hmus[i], model.hsigmas[i]
        KL += -F.sum((1 + 2 * F.log(sigma) - sigma*sigma - mu*mu) / 2)

        # c
        mu, sigma = model.cmus[i], model.csigmas[i]
        KL += -F.sum((1 + 2 * F.log(sigma) - sigma*sigma - mu*mu) / 2)

    KL /= batch_size
    # draw and decode
    cross_entropies = []
    if not train:
        ys, ts = [], []

    UNKs = np.array([UNK for _ in range(batch_size)], dtype=np.int32)
    if use_gpu:
        UNKs = cuda.to_gpu(UNKs)
    for _ in range(num_samples):

        cross_entropies.append(0)
        if not train:
            ys.append([])
            ts.append([])

        if train == True:
            model.set_by_sample(train=train)
        else:
            model.set_by_MLE(train=train)

        last_w = None
        for i in range(batch_length):
            w, next_w = Variable(batch[:, i]), Variable(batch[:, i+1])
            # word dropout
            masked_w = batch[:, i]
            if np.random.uniform() > word_keep_rate:
                enable = (masked_w != -1)
                masked_w = F.where(enable, masked_w, UNKs)
            y = model.decode(masked_w, train=train)
            cross_entropies[-1] += F.softmax_cross_entropy(y, next_w)
            if not train:
                ys[-1].append(xp.argmax(y.data, axis=1))
                ts[-1].append(next_w.data)
            last_w = next_w

        if not train:
            ys[-1] = xp.vstack(ys[-1]).T
            ts[-1] = xp.vstack(ts[-1]).T
            if use_gpu:
                ys[-1] = cuda.to_cpu(ys[-1])
                ts[-1] = cuda.to_cpu(ts[-1])

    if train:
        return (KL, cross_entropies)
    else:
        assert(len(cross_entropies) == 1 and len(ys) == 1 and len(ts) == 1)
        return (KL, (cross_entropies, ys, ts))
Example #58
def _safe_log(x):
    """Logarithm function that won't backprop inf to input."""
    return F.log(F.where(x.data > 0, x, x.data))
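A small demonstration of the behaviour (the zero entry receives no gradient instead of an infinite one):

import numpy as np
import chainer
import chainer.functions as F

x = chainer.Variable(np.array([0.0, 0.5], dtype=np.float32))
y = F.sum(_safe_log(x))
y.backward()
print(x.grad)  # [0. 2.]: no inf is backpropagated to the zero entry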
Example #59
File: model.py Project: emakryo/mdn
 def __call__(self, *args):
     density = self.predictor(*args)
     nll = -F.sum(F.log(density))
     report({'nll': nll}, self)
     return nll
Example #60
	def compute_lower_bound_loss(self, labeled_x, labeled_y, label_ids, unlabeled_x, test=False):

		def lower_bound(log_px_zy, log_py, log_pz, log_qz_xy):
			lb = log_px_zy + log_py + log_pz - log_qz_xy
			return lb

		# _l: labeled
		# _u: unlabeled
		batchsize_l = labeled_x.data.shape[0]
		batchsize_u = unlabeled_x.data.shape[0]
		num_types_of_label = labeled_y.data.shape[1]
		xp = self.xp

		### Lower bound for labeled data ###
		# Compute eq.6 -L(x,y)
		z_mean_l, z_ln_var_l = self.encoder_xy_z(labeled_x, labeled_y, test=test, apply_f=False)
		z_l = F.gaussian(z_mean_l, z_ln_var_l)
		log_px_zy_l = self.log_px_zy(labeled_x, z_l, labeled_y, test=test)
		log_py_l = self.log_py(labeled_y, test=test)
		if False:
			log_pz_l = self.log_pz(z_l, z_mean_l, z_ln_var_l, test=test)
			log_qz_xy_l = self.log_qz_xy(z_l, z_mean_l, z_ln_var_l, test=test)
			lower_bound_l = lower_bound(log_px_zy_l, log_py_l, log_pz_l, log_qz_xy_l)
		else:
			lower_bound_l = log_px_zy_l + log_py_l - self.gaussian_kl_divergence_keepbatch(z_mean_l, z_ln_var_l)

		if batchsize_u > 0:
			### Lower bound for unlabeled data ###
			# To marginalize y, we repeat unlabeled x, and construct a target (batchsize_u * num_types_of_label) x num_types_of_label
			# Example of n-dimensional x and target matrix for a 3 class problem and batch_size=2.
			#         unlabeled_x_ext                 y_ext
			#  [[x0[0], x0[1], ..., x0[n]]         [[1, 0, 0]
			#   [x1[0], x1[1], ..., x1[n]]          [1, 0, 0]
			#   [x0[0], x0[1], ..., x0[n]]          [0, 1, 0]
			#   [x1[0], x1[1], ..., x1[n]]          [0, 1, 0]
			#   [x0[0], x0[1], ..., x0[n]]          [0, 0, 1]
			#   [x1[0], x1[1], ..., x1[n]]]         [0, 0, 1]]

			unlabeled_x_ext = xp.zeros((batchsize_u * num_types_of_label, unlabeled_x.data.shape[1]), dtype=xp.float32)
			y_ext = xp.zeros((batchsize_u * num_types_of_label, num_types_of_label), dtype=xp.float32)
			for n in xrange(num_types_of_label):
				y_ext[n * batchsize_u:(n + 1) * batchsize_u,n] = 1
				unlabeled_x_ext[n * batchsize_u:(n + 1) * batchsize_u] = unlabeled_x.data
			y_ext = Variable(y_ext)
			unlabeled_x_ext = Variable(unlabeled_x_ext)

			# Compute eq.6 -L(x,y) for unlabeled data
			z_mean_u_ext, z_mean_ln_var_u_ext = self.encoder_xy_z(unlabeled_x_ext, y_ext, test=test, apply_f=False)
			z_u_ext = F.gaussian(z_mean_u_ext, z_mean_ln_var_u_ext)
			log_px_zy_u = self.log_px_zy(unlabeled_x_ext, z_u_ext, y_ext, test=test)
			log_py_u = self.log_py(y_ext, test=test)
			if False:
				log_pz_u = self.log_pz(z_u_ext, z_mean_u_ext, z_mean_ln_var_u_ext, test=test)
				log_qz_xy_u = self.log_qz_xy(z_u_ext, z_mean_u_ext, z_mean_ln_var_u_ext, test=test)
				lower_bound_u = lower_bound(log_px_zy_u, log_py_u, log_pz_u, log_qz_xy_u)
			else:
				lower_bound_u = log_px_zy_u + log_py_u - self.gaussian_kl_divergence_keepbatch(z_mean_u_ext, z_mean_ln_var_u_ext)

			# Compute eq.7 sum_y{q(y|x){-L(x,y) + H(q(y|x))}}
			# Let LB(xn, y) be the lower bound for an input image xn and a label y (y = 0, 1, ..., 9).
			# Let bs be the batchsize.
			# 
			# lower_bound_u is a vector and it looks like...
			# [LB(x0,0), LB(x1,0), ..., LB(x_bs,0), LB(x0,1), LB(x1,1), ..., LB(x_bs,1), ..., LB(x0,9), LB(x1,9), ..., LB(x_bs,9)]
			# 
			# After reshaping. (axis 1 corresponds to label, axis 2 corresponds to batch)
			# [[LB(x0,0), LB(x1,0), ..., LB(x_bs,0)],
			#  [LB(x0,1), LB(x1,1), ..., LB(x_bs,1)],
			#                   .
			#                   .
			#                   .
			#  [LB(x0,9), LB(x1,9), ..., LB(x_bs,9)]]
			# 
			# After transposing. (axis 1 corresponds to batch)
			# [[LB(x0,0), LB(x0,1), ..., LB(x0,9)],
			#  [LB(x1,0), LB(x1,1), ..., LB(x1,9)],
			#                   .
			#                   .
			#                   .
			#  [LB(x_bs,0), LB(x_bs,1), ..., LB(x_bs,9)]]
			lower_bound_u = F.transpose(F.reshape(lower_bound_u, (num_types_of_label, batchsize_u)))
			
			y_distribution = self.encoder_x_y(unlabeled_x, test=test, softmax=True)
			lower_bound_u = y_distribution * (lower_bound_u - F.log(y_distribution + 1e-6))

			loss_labeled = -F.sum(lower_bound_l) / batchsize_l
			loss_unlabeled = -F.sum(lower_bound_u) / batchsize_u
			loss = loss_labeled + loss_unlabeled
		else:
			loss_unlabeled = None
			loss_labeled = -F.sum(lower_bound_l) / batchsize_l
			loss = loss_labeled

		return loss, loss_labeled, loss_unlabeled