Example no. 1
    def nodeforward(self, x, cs, hs, ctx):
        x = nd.reshape(x, (self.dim_h, ))
        _Ui = nd.zeros((self.dim_h, ), ctx=ctx)
        _Uo = nd.zeros((self.dim_h, ), ctx=ctx)
        _Uu = nd.zeros((self.dim_h, ), ctx=ctx)
        _Uf = [nd.zeros((self.dim_h, ), ctx=ctx) for i in range(len(cs))]

        for idx in range(len(cs)):
            _Ui = nd.add(_Ui, nd.dot(self.Uis[idx].data(), hs[idx]))
            _Uo = nd.add(_Uo, nd.dot(self.Uos[idx].data(), hs[idx]))
            _Uu = nd.add(_Uu, nd.dot(self.Uus[idx].data(), hs[idx]))
            for j in range(len(cs)):
                _Uf[idx] = nd.add(_Uf[idx],
                                  nd.dot(self.Ufs[idx][j].data(), hs[j]))

        i = nd.sigmoid(
            nd.add(nd.add(nd.dot(self.Wi.data(), x), _Ui), self.bi.data()))
        o = nd.sigmoid(
            nd.add(nd.add(nd.dot(self.Wo.data(), x), _Uo), self.bo.data()))
        f = [
            nd.sigmoid(
                nd.add(nd.add(nd.dot(self.Wf.data(), x), _Uf[idx]),
                       self.bf.data())) for idx in range(len(cs))
        ]
        u = nd.tanh(
            nd.add(nd.add(nd.dot(self.Wu.data(), x), _Uu), self.bu.data()))

        c = nd.zeros((self.dim_h, ), ctx=ctx)
        for idx in range(len(cs)):
            c = nd.add(c, nd.multiply(f[idx], cs[idx]))
        c = nd.add(nd.multiply(i, u), c)

        h = nd.multiply(o, nd.tanh(c))
        return c, h
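
A note on what the loops compute: this is the N-ary Tree-LSTM node update over the child hidden states hs and child cells cs. Written out with the code's own symbols (a reading of the code above, not the original author's documentation):

i   = \sigma\big(W_i x + \sum_k U_i^{(k)} h_k + b_i\big)
o   = \sigma\big(W_o x + \sum_k U_o^{(k)} h_k + b_o\big)
u   = \tanh\big(W_u x + \sum_k U_u^{(k)} h_k + b_u\big)
f_k = \sigma\big(W_f x + \sum_j U_f^{(k,j)} h_j + b_f\big)
c   = i \odot u + \sum_k f_k \odot c_k
h   = o \odot \tanh(c)
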
Example no. 2
    def where(self, mask, tensor_in_1, tensor_in_2):
        """
        Apply a boolean selection mask to the elements of the input tensors.

        Example::

            >>> where(
                astensor([1, 0, 1]),
                astensor([1, 1, 1]),
                astensor([2, 2, 2]))
            [1. 2. 1.]

        Args:
            mask (bool): Boolean mask (boolean or tensor object of booleans)
            tensor_in_1 (Tensor): Tensor object
            tensor_in_2 (Tensor): Tensor object

        Returns:
            MXNet NDArray: The result of the mask being applied to the tensors.
        """
        mask = self.astensor(mask)
        tensor_in_1 = self.astensor(tensor_in_1)
        tensor_in_2 = self.astensor(tensor_in_2)
        return nd.add(nd.multiply(mask, tensor_in_1),
                      nd.multiply(nd.subtract(1, mask), tensor_in_2))
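
A minimal standalone sketch of the same arithmetic selection, using only mxnet.ndarray (the surrounding backend class is not required); the values mirror the docstring example:

from mxnet import nd

mask = nd.array([1, 0, 1])
a = nd.array([1, 1, 1])
b = nd.array([2, 2, 2])
# mask * a + (1 - mask) * b keeps elements of a where mask == 1 and elements of b elsewhere
result = nd.add(nd.multiply(mask, a), nd.multiply(nd.subtract(1, mask), b))
print(result)  # [1. 2. 1.]
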
Example no. 3
    def where(self, mask, tensor_in_1, tensor_in_2):
        """
        Apply a boolean selection mask to the elements of the input tensors.

        Example:

            >>> import pyhf
            >>> pyhf.set_backend(pyhf.tensor.mxnet_backend())
            >>> pyhf.tensorlib.where(
            ...   pyhf.tensorlib.astensor([1, 0, 1]),
            ...   pyhf.tensorlib.astensor([1, 1, 1]),
            ...   pyhf.tensorlib.astensor([2, 2, 2]))
            ...
            <BLANKLINE>
            [1. 2. 1.]
            <NDArray 3 @cpu(0)>

        Args:
            mask (bool): Boolean mask (boolean or tensor object of booleans)
            tensor_in_1 (Tensor): Tensor object
            tensor_in_2 (Tensor): Tensor object

        Returns:
            MXNet NDArray: The result of the mask being applied to the tensors.
        """
        mask = self.astensor(mask)
        tensor_in_1 = self.astensor(tensor_in_1)
        tensor_in_2 = self.astensor(tensor_in_2)
        return nd.add(
            nd.multiply(mask, tensor_in_1),
            nd.multiply(nd.subtract(1, mask), tensor_in_2),
        )
Example no. 4
 def feature_detect(self, tag_inputs, word_inputs, bert):
     is_train = autograd.is_training()
     batch_size = word_inputs.shape[1]
     seq_len = word_inputs.shape[0]
     # unked_words = np.where(word_inputs < self._vocab.words_in_train, word_inputs, self._vocab.UNK)
     if self.pret_word_embs is not None:
         word_embs = self.pret_word_embs(nd.array(word_inputs))
         if bert is not None:
             word_embs = nd.concat(word_embs, nd.array(bert), dim=2)
     else:
         word_embs = nd.array(bert)
     tag_embs = self.tag_embs(nd.array(tag_inputs)) if self.tag_embs is not None else None
     # Dropout
     if is_train:
         wm, tm = self.generate_emb_mask(seq_len, batch_size)
         if self.tag_embs is not None:
             emb_inputs = nd.concat(nd.multiply(wm, word_embs), nd.multiply(tm, tag_embs), dim=2)
         else:
             emb_inputs = nd.multiply(wm, word_embs)
     else:
         if self.tag_embs is not None:
             emb_inputs = nd.concat(word_embs, tag_embs, dim=2)  # seq_len x batch_size
         else:
             emb_inputs = word_embs
     top_recur = biLSTM(self.f_lstm, self.b_lstm, emb_inputs, batch_size,
                        dropout_x=self.dropout_lstm_input if is_train else 0)
     return top_recur
Example no. 5
 def forward(self, x):
     with x.context:
         c = nd.softmax(self.b.data(), axis=1)
         u = nd.dot(x, self.w.data())
         s = nd.multiply(c, u)
         s_nrm = nd.sum(s*s)
         fact = s_nrm / ( 1. + s_nrm)
         v = fact * s / nd.sqrt(s_nrm)
         self.u_v = nd.sum(nd.multiply(u, v))
         return u
Example no. 6
 def hybrid_forward(self, F, X, *args):
     """
     This method closely follows the formulas in RNN/lstm_formulas.png
     """
     h, c = args[0], args[1]
     f_t = (self.W_f(X) + self.U_f(h)).sigmoid()
     i_t = (self.W_i(X) + self.U_i(h)).sigmoid()
     o_t = (self.W_o(X) + self.U_o(h)).sigmoid()
     c_tilde_t = (self.W_c(X) + self.U_c(h)).tanh()  # candidate cell state uses tanh in the standard LSTM
     new_c = nd.multiply(f_t, c) + nd.multiply(i_t, c_tilde_t)
     new_h = nd.multiply(o_t, new_c.tanh())  # output gate applied to tanh of the updated cell state
     return new_h, new_c
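
For reference, the two corrected lines above follow the textbook LSTM update (the contents of RNN/lstm_formulas.png are not shown in this listing, so this is the standard formulation rather than the author's figure; bias terms are folded into the Dense layers W_* and U_*):

f_t = \sigma(W_f x_t + U_f h_{t-1}), \quad i_t = \sigma(W_i x_t + U_i h_{t-1}), \quad o_t = \sigma(W_o x_t + U_o h_{t-1})
\tilde{c}_t = \tanh(W_c x_t + U_c h_{t-1})
c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c}_t
h_t = o_t \odot \tanh(c_t)
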
Example no. 7
def get_xps(weight_denominator, weight_numerator, z):
    xps = list()
    xps.append(z)
    for _ in range(max(len(weight_numerator), len(weight_denominator))):
        xps.append(nd.multiply(xps[-1], z))
    xps.insert(0, nd.ones_like(z))
    return xps
Example no. 8
def Rational_MXNET_A_F(x, weight_numerator, weight_denominator, training):
    # P(X) / Q(X) = (a_0 + a_1 * X + ... + a_n * X^n) /
    #               (1 + |b_0 * X| + |b_1 * X^2| + ... + |b_i * X^{i + 1}|)

    z = nd.reshape(x, shape=(-1, ))

    xps = get_xps(weight_denominator, weight_numerator, z)

    numerator = nd.array([0], dtype='float32')
    for i, w_n in enumerate(weight_numerator):
        numerator = numerator + nd.multiply(w_n, xps[i])

    denominator = nd.array([1.0], dtype='float32')
    for j, w_d in enumerate(weight_denominator):
        denominator = denominator + nd.abs(nd.multiply(w_d, xps[j + 1]))

    return nd.divide(numerator, denominator).reshape(x.shape)
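
A minimal sketch of evaluating the version-A rational activation on a toy tensor; the coefficient lists below are arbitrary placeholder values (the real code passes learned weights), so this only illustrates the call:

from mxnet import nd

x = nd.array([[-1.0, 0.0, 1.0]])
weight_numerator = [0.0, 1.0, 0.5]   # toy a_0, a_1, a_2
weight_denominator = [0.5]           # toy b_0
y = Rational_MXNET_A_F(x, weight_numerator, weight_denominator, training=False)
print(y.shape)  # same shape as x
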
Example no. 9
 def _clip_px_gradients(self, batch_grads, px_clipping_factors):
     # hacky workaround for not knowing how to multiply a (b,) shape array with a (b, x) or (b, x, y) shape array
     expanded_batch_clipping_factors = nd.expand_dims(
         px_clipping_factors, 1)
     if len(batch_grads.shape) == 3:
         expanded_batch_clipping_factors = nd.expand_dims(
             expanded_batch_clipping_factors, 1)
     return nd.multiply(batch_grads, expanded_batch_clipping_factors)
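
A broadcasting-based sketch of the same per-example scaling, assuming the first axis of batch_grads is the batch axis: reshaping the (b,) factors to (b, 1, ..., 1) lets nd.multiply broadcast over any number of trailing axes, so the explicit shape check above is not needed. The helper name is illustrative only.

from mxnet import nd

def clip_px_gradients_broadcast(batch_grads, px_clipping_factors):
    # (b,) -> (b, 1, ..., 1) so the element-wise product broadcasts per example
    target_shape = (batch_grads.shape[0],) + (1,) * (len(batch_grads.shape) - 1)
    return nd.multiply(batch_grads, px_clipping_factors.reshape(target_shape))
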
Example no. 10
def Rational_MXNET_C_F(x, weight_numerator, weight_denominator, training):
    # P(X) / Q(X) = (a_0 + a_1 * X + ... + a_n * X^n) /
    #               (eps + |b_0 + b_1 * X + ... + b_{n-1} * X^{n-1}|), with eps = 0.1 below

    z = nd.reshape(x, shape=(-1, ))

    xps = get_xps(weight_denominator, weight_numerator, z)

    numerator = nd.array([0], dtype='float32')
    for i, w_n in enumerate(weight_numerator):
        numerator = numerator + nd.multiply(w_n, xps[i])

    denominator = nd.array([0], dtype='float32')
    for j, w_d in enumerate(weight_denominator):
        denominator = denominator + nd.multiply(w_d, xps[j])

    return nd.divide(numerator, (0.1 + nd.abs(denominator))).reshape(x.shape)
Example no. 11
def Rational_MXNET_D_F(x,
                       weight_numerator,
                       weight_denominator,
                       training,
                       random_deviation=0.1):
    # P(X) / Q(X) = (noised(a_0) + noised(a_1) * X + noised(a_2) * X^2 + ... + noised(a_n) * X^n) /
    #               (1 + |noised(b_0) * X + noised(b_1) * X^2 + ... + noised(b_{n-1}) * X^n|)
    # Noised parameters carry uniform noise so that each lies in [(1 - random_deviation) * parameter, (1 + random_deviation) * parameter].

    if not training:
        # do not add noise
        return Rational_MXNET_B_F(x, weight_numerator, weight_denominator,
                                  training)

    z = nd.reshape(x, shape=(-1, ))
    lower_bound = nd.array([1 - random_deviation])
    upper_bound = nd.array([1 + random_deviation])

    xps = get_xps(weight_denominator, weight_numerator, z)

    numerator = nd.array([0], dtype='float32')
    for i, w_n in enumerate(weight_numerator):
        w_n_noised = nd.multiply(
            w_n,
            nd.sample_uniform(low=lower_bound,
                              high=upper_bound,
                              shape=z.shape,
                              dtype='float32'))
        numerator = numerator + nd.multiply(w_n_noised, xps[i])

    denominator = nd.array([0], dtype='float32')
    for j, w_d in enumerate(weight_denominator):
        w_d_noised = nd.multiply(
            w_d,
            nd.sample_uniform(low=lower_bound,
                              high=upper_bound,
                              shape=z.shape,
                              dtype='float32'))
        denominator = denominator + nd.multiply(w_d_noised, xps[j + 1])

    return nd.divide(numerator, (1 + nd.abs(denominator))).reshape(x.shape)
Example no. 12
 def make_std_mask(trg, pad, ctx):
     """
     Create a mask to hide padding and future words.
     Compare each element of trg_mask and sub_mask: (1, 1) -> 1, otherwise -> 0.
     MXNet has no bitwise AND for NDArrays, so element-wise multiplication is used instead.
     """
     trg_mask = (trg != pad).expand_dims(axis = -2)
     trg_mask = nd.repeat(trg_mask, repeats = trg_mask.shape[-1], axis = -2)
     sub_mask = subsequent_mask(trg.shape[-1])
     sub_mask = nd.repeat(sub_mask, repeats = trg_mask.shape[0], axis = 0)
     trg_mask = nd.multiply(trg_mask, sub_mask.as_in_context(ctx))
     return trg_mask
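
make_std_mask calls a subsequent_mask helper that is not part of this listing; a minimal sketch of what it is assumed to return, the usual lower-triangular look-ahead mask of shape (1, size, size), is:

import numpy as np
from mxnet import nd

def subsequent_mask(size):
    # 1s on and below the diagonal: position i may attend to positions <= i only
    return nd.array(np.tril(np.ones((1, size, size), dtype='float32')))
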
Example no. 13
    def dense_bw(self, input_layer, input_error):
        """Fully connected layer backward process"""
        self.d_act_z = self.d_act(self.z)

        self.delta_b = nd.multiply(input_error, self.d_act_z)
        x = nd.transpose(input_layer)

        self.delta_W = nd.dot(x, self.delta_b)

        output_bp = nd.dot(self.delta_b, nd.transpose(self.W))

        self.delta_b = nd.sum(self.delta_b, axis=0)
        assert self.batch_size == input_error.shape[0]

        self.W = nd.subtract(
            self.W, self.delta_W * (self.learning_rate / self.batch_size))
        self.b = nd.subtract(
            self.b, self.delta_b * (self.learning_rate / self.batch_size))

        return output_bp
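
Written out, the backward pass above computes the following, where act is the layer's activation, z = xW + b, eta is the learning rate and m the batch size (a reading of the code):

\delta = \text{input\_error} \odot \text{act}'(z)
\partial L / \partial W = x^{\top} \delta, \qquad \partial L / \partial b = \textstyle\sum_{\text{batch}} \delta
\text{output\_bp} = \delta W^{\top}
W \leftarrow W - (\eta / m)\, \partial L / \partial W, \qquad b \leftarrow b - (\eta / m)\, \partial L / \partial b
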
Example no. 14
def partial_trim(epoch, v, net, f):
    # apply partial knowledge trimmed mean attack

    vi_shape = v[0].shape

    #first compute the distribution parameters
    all_grads = nd.concat(*v, dim=1)
    adv_grads = all_grads[:, :f]
    e_mu = nd.mean(adv_grads, axis=1)  # mean
    e_sigma = nd.sqrt(
        nd.sum(nd.square(nd.subtract(adv_grads, e_mu.reshape(-1, 1))), axis=1)
        / f)  # standard deviation

    for i in range(f):
        # apply attack to compromised worker devices with randomness
        v[i] = (
            e_mu - nd.multiply(e_sigma, nd.sign(e_mu)) *
            (3. + nd.random.uniform(shape=e_sigma.shape))).reshape(vi_shape)

    return v
Example no. 15
    def _concrete_dropout(self, x):
        """Forward pass for dropout layer
        """
        with autograd.record():
            eps = 1e-7
            temp = 0.1

            self.p = nd.sigmoid(self.p_logit.data())

            # Check if batch size is the same as unif_noise, if not take care
            unif_noise = nd.array(np.random.uniform(size=tuple(x.shape)))

            drop_prob = (nd.log(self.p + eps) - nd.log(1 - self.p + eps) +
                         nd.log(unif_noise + eps) -
                         nd.log(1 - unif_noise + eps))
            drop_prob = nd.sigmoid(drop_prob / temp)
            random_tensor = 1 - drop_prob
            retain_prob = 1 - self.p
            x = nd.multiply(x, random_tensor)
            x = x / retain_prob
        self.all_p.append(self.p)
        return x
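
The drop_prob computation above is the Concrete relaxation of Bernoulli dropout: with temperature t = 0.1, uniform noise u and drop probability p, it evaluates (a reading of the code)

\tilde{z} = \sigma\big((\log p - \log(1 - p) + \log u - \log(1 - u)) / t\big)
\text{output} = x \odot (1 - \tilde{z}) / (1 - p)

so the dropout mask stays differentiable with respect to p.
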
Example no. 16
    def forward(self, x):
        embeds = self.embed(x)  # batch * time step * embedding
        x_i = embeds.expand_dims(1)
        x_i = nd.repeat(x_i, repeats=self.sentence_length,
                        axis=1)  # batch * time step * time step * embedding
        x_j = embeds.expand_dims(2)
        x_j = nd.repeat(x_j, repeats=self.sentence_length,
                        axis=2)  # batch * time step * time step * embedding
        x_full = nd.concat(
            x_i, x_j, dim=3)  # batch * time step * time step * (2 * embedding)
        # New input data
        _x = x_full.reshape((-1, 2 * self.emb_dim))

        # Network for attention
        _attn = self.attn(_x)
        _att = _attn.reshape((-1, self.sentence_length, self.sentence_length))
        _att = nd.sigmoid(_att)
        att = nd.softmax(_att, axis=1)

        _x = self.g_fc1(_x)  # (batch * time step * time step) * hidden_dim
        _x = self.g_fc2(_x)  # (batch * time step * time step) * hidden_dim
        # aggregate all sentence_length * sentence_length pairwise results into a single sentence representation

        x_g = _x.reshape(
            (-1, self.sentence_length, self.sentence_length, self.hidden_dim))

        _inflated_att = _att.expand_dims(axis=-1)
        _inflated_att = nd.repeat(_inflated_att,
                                  repeats=self.hidden_dim,
                                  axis=3)

        x_q = nd.multiply(_inflated_att, x_g)

        sentence_rep = nd.mean(x_q.reshape(shape=(-1, self.sentence_length**2,
                                                  self.hidden_dim)),
                               axis=1)
        return sentence_rep, att
Example no. 17
    def cross_entropy(self, o, y):
        """
        o is the output from the fully connected layer (num_examples x num_classes)
        y is the one-hot encoded labels (num_examples x num_classes)
        Integer class labels can be converted with nd.one_hot before calling
        this method if required.
        """
        m = y.shape[0]
        p = self.softmax(o)
        # Multiplying by the one-hot labels keeps only the softmax probability
        # of the correct class in each row; the row-wise max then extracts it.
        k = nd.multiply(y, p)
        log_likelihood = -nd.log(nd.max(k, axis=1))
        loss = nd.sum(log_likelihood) / m
        return loss
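
A minimal standalone check of the same computation, using nd.softmax in place of the class's own softmax helper and hypothetical toy values:

from mxnet import nd

o = nd.array([[2.0, 0.5, 0.1],
              [0.2, 1.5, 0.3]])   # raw class scores
y = nd.array([[1, 0, 0],
              [0, 1, 0]])         # one-hot labels
p = nd.softmax(o, axis=1)
loss = nd.sum(-nd.log(nd.max(nd.multiply(y, p), axis=1))) / y.shape[0]
print(loss)  # mean negative log-likelihood over the two examples
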
Example no. 18
    def run(self,
            word_inputs,
            tag_inputs,
            arc_targets=None,
            rel_targets=None,
            is_train=True):
        """
        Train or test
        :param word_inputs: seq_len x batch_size
        :param tag_inputs: seq_len x batch_size
        :param arc_targets: seq_len x batch_size
        :param rel_targets: seq_len x batch_size
        :param is_train: whether to run in training mode (otherwise evaluation)
        :return:
        """

        # return 0, 0, 0, nd.dot(self.junk.data(), nd.ones((3, 1))).sum()

        def flatten_numpy(ndarray):
            """
            Flatten an nd-array to a 1-d vector in column-major (Fortran) order
            :param ndarray:
            :return:
            """
            return np.reshape(ndarray, (-1, ), 'F')

        batch_size = word_inputs.shape[1]
        seq_len = word_inputs.shape[0]
        mask = np.greater(word_inputs, self._vocab.ROOT).astype(np.float32)
        num_tokens = int(np.sum(mask))  # non padding, non root token number

        if is_train or arc_targets is not None:
            mask_1D = flatten_numpy(mask)
            # mask_1D_tensor = nd.inputTensor(mask_1D, batched=True)
            mask_1D_tensor = nd.array(mask_1D)

            #  if batched=True, the last dimension is used as a batch dimension if arr is a list of numpy ndarrays

        unked_words = np.where(word_inputs < self._vocab.words_in_train,
                               word_inputs, self._vocab.UNK)
        word_embs = self.word_embs(nd.array(unked_words, dtype='int'))
        if self.pret_word_embs:
            word_embs = word_embs + self.pret_word_embs(nd.array(word_inputs))
        tag_embs = self.tag_embs(nd.array(tag_inputs))

        # Dropout
        if is_train:
            wm, tm = self.generate_emb_mask(seq_len, batch_size)
            emb_inputs = nd.concat(nd.multiply(wm, word_embs),
                                   nd.multiply(tm, tag_embs),
                                   dim=2)
        else:
            emb_inputs = nd.concat(word_embs, tag_embs,
                                   dim=2)  # seq_len x batch_size

        top_recur = biLSTM(self.bi_lstm, emb_inputs, batch_size)
        # if is_train:
        #     top_recur = nd.Dropout(data=top_recur, axes=[0], p=self.dropout_mlp)

        W_dep, b_dep = self.mlp_dep_W.data(), self.mlp_dep_b.data()
        W_head, b_head = self.mlp_head_W.data(), self.mlp_head_b.data()
        dep, head = leaky_relu(nd.dot(top_recur, W_dep.T) + b_dep), leaky_relu(
            nd.dot(top_recur, W_head.T) + b_head)
        # if is_train:
        #     dep, head = nd.Dropout(data=dep, axes=[0], p=self.dropout_mlp), nd.Dropout(data=head, axes=[0],
        #                                                                                p=self.dropout_mlp)
        dep, head = nd.transpose(dep, axes=[2, 0,
                                            1]), nd.transpose(head,
                                                              axes=[2, 0, 1])
        dep_arc, dep_rel = dep[:self.mlp_arc_size], dep[self.mlp_arc_size:]
        head_arc, head_rel = head[:self.mlp_arc_size], head[self.mlp_arc_size:]
        # return 0, 0, 0, dep_arc.sum() + head_arc.sum()

        W_arc = self.arc_W.data()
        arc_logits = bilinear(dep_arc,
                              W_arc,
                              head_arc,
                              self.mlp_arc_size,
                              seq_len,
                              batch_size,
                              num_outputs=1,
                              bias_x=True,
                              bias_y=False)
        # return 0, 0, 0, arc_logits.sum()
        # (#head x #dep) x batch_size

        flat_arc_logits = reshape_fortran(arc_logits,
                                          (seq_len, seq_len * batch_size))
        # (#head ) x (#dep x batch_size)

        arc_preds = arc_logits.argmax(0)
        if len(arc_preds.shape) == 1:  # dynet did unnecessary jobs
            arc_preds = np.expand_dims(arc_preds, axis=1)
        # seq_len x batch_size

        if is_train or arc_targets is not None:
            correct = np.equal(arc_preds.asnumpy(), arc_targets)
            arc_correct = correct.astype(np.float32) * mask
            arc_accuracy = np.sum(arc_correct) / num_tokens
            targets_1D = flatten_numpy(arc_targets)
            losses = self.softmax_loss(flat_arc_logits, nd.array(targets_1D))
            arc_loss = nd.sum(losses * mask_1D_tensor) / num_tokens

        if not is_train:
            arc_probs = np.transpose(
                np.reshape(
                    nd.softmax(flat_arc_logits).asnumpy(),
                    (seq_len, seq_len, batch_size), 'F'))
        # #batch_size x #dep x #head

        W_rel = self.rel_W.data()
        # dep_rel = nd.concat([dep_rel, nd.inputTensor(np.ones((1, seq_len),dtype=np.float32))])
        # head_rel = nd.concat([head_rel, nd.inputTensor(np.ones((1, seq_len), dtype=np.float32))])
        rel_logits = bilinear(dep_rel,
                              W_rel,
                              head_rel,
                              self.mlp_rel_size,
                              seq_len,
                              batch_size,
                              num_outputs=self._vocab.rel_size,
                              bias_x=True,
                              bias_y=True)
        # (#head x rel_size x #dep) x batch_size

        flat_rel_logits = reshape_fortran(
            rel_logits, (seq_len, self._vocab.rel_size, seq_len * batch_size))
        # (#head x rel_size) x (#dep x batch_size)

        _target_vec = nd.array(
            targets_1D if is_train else flatten_numpy(arc_preds)).reshape(
                seq_len * batch_size, 1)
        _target_mat = _target_vec * nd.ones((1, self._vocab.rel_size))

        partial_rel_logits = nd.pick(flat_rel_logits, _target_mat.T, axis=0)
        # (rel_size) x (#dep x batch_size)

        if is_train or arc_targets is not None:
            rel_preds = partial_rel_logits.argmax(0)
            targets_1D = flatten_numpy(rel_targets)
            rel_correct = np.equal(rel_preds.asnumpy(), targets_1D).astype(
                np.float32) * mask_1D
            rel_accuracy = np.sum(rel_correct) / num_tokens
            losses = self.softmax_loss(partial_rel_logits,
                                       nd.array(targets_1D))
            rel_loss = nd.sum(losses * mask_1D_tensor) / num_tokens

        if not is_train:
            rel_probs = np.transpose(
                np.reshape(
                    nd.softmax(nd.transpose(flat_rel_logits)).asnumpy(),
                    (self._vocab.rel_size, seq_len, seq_len, batch_size), 'F'))
        # batch_size x #dep x #head x #nclasses

        if is_train or arc_targets is not None:
            loss = arc_loss + rel_loss
            correct = rel_correct * flatten_numpy(arc_correct)
            overall_accuracy = np.sum(correct) / num_tokens

        if is_train:
            return arc_accuracy, rel_accuracy, overall_accuracy, loss

        outputs = []

        for msk, arc_prob, rel_prob in zip(np.transpose(mask), arc_probs,
                                           rel_probs):
            # parse sentences one by one
            msk[0] = 1.
            sent_len = int(np.sum(msk))
            arc_pred = arc_argmax(arc_prob, sent_len, msk)
            rel_prob = rel_prob[np.arange(len(arc_pred)), arc_pred]
            rel_pred = rel_argmax(rel_prob, sent_len)
            outputs.append((arc_pred[1:sent_len], rel_pred[1:sent_len]))

        if arc_targets is not None:
            return arc_accuracy, rel_accuracy, overall_accuracy, outputs
        return outputs
Example no. 19
    def forward(self, x):

        self.batch_size, self.input_dim_vector, self.input_num_capsule = x.shape

        assert (self.batch_size, self.input_dim_vector,
                self.input_num_capsule) == (self.batch_size, 8, 1152)

        x_exp = x.expand_dims(axis=1)
        x_exp = x_exp.expand_dims(axis=4)
        assert x_exp.shape == (self.batch_size, 1, 8, 1152, 1)

        x_tile = x_exp.tile(reps=[1, self.num_capsule, 1, 1, 1])
        assert x_tile.shape == (self.batch_size, 10, 8, 1152, 1)

        x_trans = x_tile.transpose(axes=(0, 3, 1, 2, 4))
        assert x_trans.shape == (self.batch_size, 1152, 10, 8, 1)

        # W = self.W_ij.data()
        print(self.W_ij.data()[0, 0, 0, 0])
        # W = self.routing_weight
        # print('W',W[0,0,0,0])

        W = self.W_ij.data().tile(reps=[self.batch_size, 1, 1, 1, 1])

        assert W.shape == (self.batch_size, 1152, 10, 8, 16)

        # [8, 16].T x [8, 1] => [16, 1]
        x_dot = x_trans.reshape(shape=(-1, self.input_dim_vector, 1))  #(8,1)
        W_dot = W.reshape(shape=(-1, self.input_dim_vector,
                                 self.dim_vector))  #(8,16)

        u_hat = nd.batch_dot(W_dot, x_dot, transpose_a=True)

        u_hat = u_hat.reshape(shape=(self.batch_size, self.input_num_capsule,
                                     self.num_capsule, self.dim_vector, -1))
        assert u_hat.shape == (self.batch_size, 1152, 10, 16, 1)

        b_IJ = nd.zeros(
            (self.batch_size, self.input_num_capsule, self.num_capsule, 1, 1),
            ctx=self.context)

        assert b_IJ.shape == ((self.batch_size, 1152, 10, 1, 1))

        u_hat_stopped = nd.stop_gradient(u_hat, name='stop_gradient')

        for r_iter in range(self.iter_routing):
            c_IJ = nd.softmax(b_IJ, axis=2)

            s_J = nd.multiply(c_IJ, u_hat)
            s_J = s_J.sum(axis=1, keepdims=True)
            # print('s_J',s_J[0,0,0])

            assert s_J.shape == (self.batch_size, 1, 10, 16, 1)

            v_J = self.squash(s_J, axis=3)

            assert v_J.shape == (self.batch_size, 1, 10, 16, 1)

            v_J_tiled = v_J.tile(reps=[1, 1152, 1, 1, 1])

            if self.iter_routing > 1:
                # u_hat_stopped  (self.batch_size, 1152, 10, 16, 1)
                # v_J_tiled (self.batch_size, 1152, 10, 16, 1)
                # u_hat_stopped = u_hat_stopped.reshape(shape=(-1,self.dim_vector,1))
                # v_J_tiled = v_J_tiled.reshape(shape=(-1,self.dim_vector,1))
                #
                # agreement between u_hat and v: element-wise product summed
                # over the capsule (dim_vector) axis, giving (b, 1152, 10, 1, 1)
                u_produce_v = nd.stop_gradient(
                    nd.multiply(u_hat_stopped, v_J_tiled).sum(axis=3,
                                                              keepdims=True))

                # u_produce_v = u_produce_v.reshape(shape=(self.batch_size, self.input_num_capsule, self.num_capsule, 1, 1))
                assert u_produce_v.shape == (self.batch_size, 1152, 10, 1, 1)

                b_IJ = nd.stop_gradient(b_IJ + u_produce_v, name="update_b_IJ")

        #(batch_size,1,10,16,1)
        assert v_J.shape == (self.batch_size, 1, self.num_capsule,
                             self.dim_vector, 1)
        # print('v_J',v_J[0,0,0,0])
        return v_J
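
The routing loop above follows the usual dynamic-routing update between capsules; per iteration, using the code's variable names (a reading of the code, with the softmax taken over the output-capsule axis):

c_{ij} = \mathrm{softmax}_j(b_{ij})
s_j = \textstyle\sum_i c_{ij}\, \hat{u}_{j|i}
v_j = \mathrm{squash}(s_j)
b_{ij} \leftarrow b_{ij} + \hat{u}_{j|i} \cdot v_j
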