Example No. 1
    def __call__(self, hs, ys):
        """CTC forward.

        Args:
            hs (list of chainer.Variable | N-dimension array): Input variable from encoder.
            ys (list of chainer.Variable | N-dimension array): Input variable of decoder.

        Returns:
            chainer.Variable: A variable holding a scalar value of the CTC loss.

        """
        self.loss = None
        ilens = [x.shape[0] for x in hs]
        olens = [x.shape[0] for x in ys]

        # zero padding for hs
        y_hat = linear_tensor(self.ctc_lo, F.dropout(
            F.pad_sequence(hs), ratio=self.dropout_rate))
        y_hat = F.separate(y_hat, axis=1)  # ilen list of batch x hdim

        # zero padding for ys
        y_true = F.pad_sequence(ys, padding=-1)  # batch x olen

        # get length info
        input_length = chainer.Variable(self.xp.array(ilens, dtype=np.int32))
        label_length = chainer.Variable(self.xp.array(olens, dtype=np.int32))
        logging.info(self.__class__.__name__ + ' input lengths:  ' + str(input_length.data))
        logging.info(self.__class__.__name__ + ' output lengths: ' + str(label_length.data))

        # get ctc loss
        self.loss = F.connectionist_temporal_classification(
            y_hat, y_true, 0, input_length, label_length)
        logging.info('ctc loss:' + str(self.loss.data))

        return self.loss
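A minimal, self-contained sketch (not part of the source) of how the padded activations and labels above feed Chainer's built-in CTC loss; the vocabulary size, sequence lengths, and random data are assumptions made only for illustration.

import numpy as np
import chainer
import chainer.functions as F

vocab = 5                                        # assumed vocabulary size, blank symbol = index 0
xs = [np.random.randn(t, vocab).astype(np.float32) for t in (7, 5)]            # fake frame activations
ys = [np.array([1, 2, 3], dtype=np.int32), np.array([2, 4], dtype=np.int32)]   # fake label sequences

ilens = np.array([x.shape[0] for x in xs], dtype=np.int32)
olens = np.array([y.shape[0] for y in ys], dtype=np.int32)

y_hat = F.separate(F.pad_sequence(xs), axis=1)   # tuple of max_ilen items, each batch x vocab
y_true = F.pad_sequence(ys, padding=-1)          # batch x max_olen; padded labels masked via olens

loss = F.connectionist_temporal_classification(
    y_hat, y_true, 0,
    input_length=chainer.Variable(ilens),
    label_length=chainer.Variable(olens))
print(loss.data)                                 # scalar loss averaged over the batch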
Example No. 2
    def __call__(self, hs, ys):
        """Core function of the Warp-CTC layer.

        Args:
            hs (iterable of chainer.Variable | N-dimension array): Input variable from encoder.
            ys (iterable of chainer.Variable | N-dimension array): Input variable of decoder.

        Returns:
            chainer.Variable: A variable holding a scalar value of the CTC loss.

        """
        self.loss = None
        ilens = [x.shape[0] for x in hs]
        olens = [x.shape[0] for x in ys]

        # zero padding for hs
        y_hat = linear_tensor(self.ctc_lo, F.dropout(
            F.pad_sequence(hs), ratio=self.dropout_rate))
        y_hat = F.transpose(y_hat, (1, 0, 2))  # frames x batch x hdim

        # get length info
        logging.info(self.__class__.__name__ + ' input lengths:  ' + str(ilens))
        logging.info(self.__class__.__name__ + ' output lengths: ' + str(olens))

        # get ctc loss
        from chainer_ctc.warpctc import ctc as warp_ctc
        self.loss = warp_ctc(y_hat, ilens, [cuda.to_cpu(l.data) for l in ys])[0]
        logging.info('ctc loss:' + str(self.loss.data))

        return self.loss
Example No. 3
    def __call__(self, hs, ys):
        """CTC forward

        :param hs:
        :param ys:
        :return:
        """
        self.loss = None
        ilens = [x.shape[0] for x in hs]
        olens = [x.shape[0] for x in ys]

        # zero padding for hs
        y_hat = linear_tensor(self.ctc_lo, F.dropout(
            F.pad_sequence(hs), ratio=self.dropout_rate))
        y_hat = F.transpose(y_hat, (1, 0, 2))  # frames x batch x hdim

        # get length info
        logging.info(self.__class__.__name__ + ' input lengths:  ' + str(ilens))
        logging.info(self.__class__.__name__ + ' output lengths: ' + str(olens))

        # get ctc loss
        from chainer_ctc.warpctc import ctc as warp_ctc
        self.loss = warp_ctc(y_hat, ilens, [cuda.to_cpu(l.data) for l in ys])[0]
        logging.info('ctc loss:' + str(self.loss.data))

        return self.loss
Example No. 4
    def log_softmax(self, hs):
        """log_softmax of frame activations

        :param hs:
        :return:
        """
        y_hat = linear_tensor(self.ctc_lo, F.pad_sequence(hs))
        return F.log_softmax(y_hat.reshape(-1, y_hat.shape[-1])).reshape(y_hat.shape)
Example No. 5
    def log_softmax(self, hs):
        """Log_softmax of frame activations.

        Args:
            hs (list of chainer.Variable | N-dimension array): Input variable from encoder.

        Returns:
            chainer.Variable: An N-dimension float array of log-softmax frame activations.

        """
        y_hat = linear_tensor(self.ctc_lo, F.pad_sequence(hs))
        return F.log_softmax(y_hat.reshape(-1, y_hat.shape[-1])).reshape(y_hat.shape)
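The padded log-probabilities returned above can be decoded frame by frame. The helper below is a hypothetical greedy CTC decoder (argmax per frame, collapse repeats, drop blanks), assuming blank index 0 as in the CTC loss calls above; it is not part of the source.

import numpy as np

def greedy_ctc_decode(log_probs, ilens, blank=0):
    # log_probs: (batch, max_frames, vocab) array of frame-wise log probabilities
    # ilens:     true frame count per utterance, so padded frames are ignored
    hyps = []
    for b, ilen in enumerate(ilens):
        best = np.argmax(log_probs[b, :ilen], axis=1)   # best symbol per frame
        hyp, prev = [], blank
        for token in best:
            if token != blank and token != prev:        # collapse repeats, then drop blanks
                hyp.append(int(token))
            prev = token
        hyps.append(hyp)
    return hyps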
Example No. 6
    def __call__(self, enc_hs, dec_z, att_prev, scaling=2.0):
        """Compute AttDot forward layer.

        Args:
            enc_hs (chainer.Variable | N-dimensional array): Input variable from encoder.
            dec_z (chainer.Variable | N-dimensional array): Input variable of decoder.
            att_prev (chainer.Variable | None): Previous attention weight (not used by this layer).
            scaling (float): Scaling weight to make attention sharp.

        Returns:
            chainer.Variable: Weighted sum over frames.
            chainer.Variable: Attention weight.

        """
        batch = len(enc_hs)
        # pre-compute all h outside the decoder loop
        if self.pre_compute_enc_h is None:
            self.enc_h = F.pad_sequence(enc_hs)  # utt x frame x hdim
            self.h_length = self.enc_h.shape[1]
            # utt x frame x att_dim
            self.pre_compute_enc_h = F.tanh(
                linear_tensor(self.mlp_enc, self.enc_h))

        if dec_z is None:
            dec_z = chainer.Variable(
                self.xp.zeros((batch, self.dunits), dtype=np.float32))
        else:
            dec_z = F.reshape(dec_z, (batch, self.dunits))

        # <phi (h_t), psi (s)> for all t
        u = F.broadcast_to(F.expand_dims(F.tanh(self.mlp_dec(dec_z)), 1),
                           self.pre_compute_enc_h.shape)
        e = F.sum(self.pre_compute_enc_h * u, axis=2)  # utt x frame
        # NOTE: masking the padded area with a large negative value before the
        # softmax only degraded performance, so no padding mask is applied here
        # apply scaling to sharpen the attention distribution
        w = F.softmax(scaling * e)
        # weighted sum over frames
        # utt x hdim
        c = F.sum(self.enc_h *
                  F.broadcast_to(F.expand_dims(w, 2), self.enc_h.shape),
                  axis=1)

        return c, w
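A minimal sketch of the scoring step above on dummy data: the frame-wise dot product between the projected encoder states and the projected decoder state, a sharpened softmax, and the weighted sum. The random tensors stand in for tanh(mlp_enc(enc_h)) and tanh(mlp_dec(dec_z)); all shapes are assumptions.

import numpy as np
import chainer
import chainer.functions as F

batch, frames, att_dim, hdim = 2, 6, 4, 8
enc_h = chainer.Variable(np.random.randn(batch, frames, hdim).astype(np.float32))     # padded encoder states
phi_h = chainer.Variable(np.random.randn(batch, frames, att_dim).astype(np.float32))  # stands in for tanh(mlp_enc(enc_h))
psi_s = chainer.Variable(np.random.randn(batch, att_dim).astype(np.float32))          # stands in for tanh(mlp_dec(dec_z))

u = F.broadcast_to(F.expand_dims(psi_s, 1), phi_h.shape)   # utt x frame x att_dim
e = F.sum(phi_h * u, axis=2)                                # utt x frame scores
w = F.softmax(2.0 * e)                                      # sharpened attention weights
c = F.sum(enc_h * F.broadcast_to(F.expand_dims(w, 2), enc_h.shape), axis=1)  # utt x hdim context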
Example No. 7
    def __call__(self, enc_hs, dec_z, att_prev, scaling=2.0):
        """Compute AttLoc forward layer.

        Args:
            enc_hs (chainer.Variable | N-dimensional array): Input variable from encoders.
            dec_z (chainer.Variable | N-dimensional array): Input variable of decoder.
            att_prev (chainer.Variable | None): Attention weight.
            scaling (float): Scaling weight to make attention sharp.

        Returns:
            chainer.Variable: Weighted sum over frames.
            chainer.Variable: Attention weight.

        """
        batch = len(enc_hs)
        # pre-compute all h outside the decoder loop
        if self.pre_compute_enc_h is None:
            self.enc_h = F.pad_sequence(enc_hs)  # utt x frame x hdim
            self.h_length = self.enc_h.shape[1]
            # utt x frame x att_dim
            self.pre_compute_enc_h = linear_tensor(self.mlp_enc, self.enc_h)

        if dec_z is None:
            dec_z = chainer.Variable(
                self.xp.zeros((batch, self.dunits), dtype=np.float32))
        else:
            dec_z = F.reshape(dec_z, (batch, self.dunits))

        # initialize attention weight with uniform dist.
        if att_prev is None:
            att_prev = [
                self.xp.full(hh.shape[0], 1.0 / hh.shape[0], dtype=np.float32)
                for hh in enc_hs
            ]
            att_prev = [chainer.Variable(att) for att in att_prev]
            att_prev = F.pad_sequence(att_prev)

        # TODO(watanabe) use <chainer variable>.reshape(), instead of F.reshape()
        # att_prev: utt x frame -> utt x 1 x 1 x frame -> utt x att_conv_chans x 1 x frame
        att_conv = self.loc_conv(
            F.reshape(att_prev, (batch, 1, 1, self.h_length)))
        # att_conv: utt x att_conv_chans x 1 x frame -> utt x frame x att_conv_chans
        att_conv = F.swapaxes(F.squeeze(att_conv, axis=2), 1, 2)
        # att_conv: utt x frame x att_conv_chans -> utt x frame x att_dim
        att_conv = linear_tensor(self.mlp_att, att_conv)

        # dec_z_tiled: utt x frame x att_dim
        dec_z_tiled = F.broadcast_to(F.expand_dims(self.mlp_dec(dec_z), 1),
                                     self.pre_compute_enc_h.shape)

        # dot with gvec
        # utt x frame x att_dim -> utt x frame
        # TODO(watanabe) use batch_matmul
        e = F.squeeze(linear_tensor(
            self.gvec,
            F.tanh(att_conv + self.pre_compute_enc_h + dec_z_tiled)),
                      axis=2)
        # NOTE: masking the padded area with a large negative value before the
        # softmax only degraded performance, so no padding mask is applied here
        # apply scaling to sharpen the attention distribution
        w = F.softmax(scaling * e)

        # weighted sum over frames
        # utt x hdim
        c = F.sum(self.enc_h *
                  F.broadcast_to(F.expand_dims(w, 2), self.enc_h.shape),
                  axis=1)

        return c, w
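The sketch below isolates the location-aware term: the previous attention weights are run through a 2-D convolution and reshaped to frame-major form, mirroring the loc_conv call above. The channel count and filter width are assumed values, not the ones used in the source.

import numpy as np
import chainer.links as L
import chainer.functions as F

batch, frames = 2, 6
chans, half_width = 10, 5                      # assumed att_conv_chans and half filter width
loc_conv = L.Convolution2D(1, chans, ksize=(1, 2 * half_width + 1), pad=(0, half_width))

att_prev = np.full((batch, frames), 1.0 / frames, dtype=np.float32)   # uniform initial weights

# utt x frame -> utt x 1 x 1 x frame -> utt x chans x 1 x frame
att_conv = loc_conv(att_prev.reshape(batch, 1, 1, frames))
# utt x chans x 1 x frame -> utt x frame x chans
att_conv = F.swapaxes(F.squeeze(att_conv, axis=2), 1, 2)
print(att_conv.shape)                          # (2, 6, 10)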