Example #1
0
    def encode(self, xs, task='all', flip=False, use_cache=False, streaming=False):
        """Encode acoustic or text features.

        Args:
            xs (list): A list of length `[B]`, which contains Tensor of size `[T, input_dim]`
            task (str): all/ys*/ys_sub1*/ys_sub2*
            flip (bool): if True, flip acoustic features in the time-dimension
            use_cache (bool): use the cached forward encoder state in the previous chunk as the initial state
            streaming (bool): streaming encoding
        Returns:
            eout_dict (dict):

        """
        if self.input_type == 'speech':
            # Stack adjacent frames to shorten the input sequence
            if self.n_stacks > 1:
                xs = [stack_frame(x, self.n_stacks, self.n_skips) for x in xs]
            # Concatenate neighboring frames (splicing)
            if self.n_splices > 1:
                xs = [splice(x, self.n_splices, self.n_stacks) for x in xs]
            # Lengths are taken after stacking/splicing, before padding
            xlens = torch.IntTensor([len(x) for x in xs])

            # Convert to device tensors, optionally reversing the time axis
            if flip:
                xs = [torch.from_numpy(np.flip(x, axis=0).copy()).float().cuda(self.device_id) for x in xs]
            else:
                xs = [np2tensor(x, self.device_id).float() for x in xs]
            xs = pad_list(xs, 0.0)

            if self.use_specaug and self.training:
                xs = self.specaug(xs)  # SpecAugment (training only)
            if self.gaussian_noise:
                xs = add_gaussian_noise(xs)  # Gaussian noise injection
            if self.ssn is not None:
                xs += self.ssn(xs, xlens)  # add sequence summary network output

        elif self.input_type == 'text':
            xlens = torch.IntTensor([len(x) for x in xs])
            xs = [np2tensor(np.fromiter(x, dtype=np.int64), self.device_id) for x in xs]
            xs = pad_list(xs, self.pad)
            xs = self.dropout_emb(self.embed(xs))
            # TODO(hirofumi): fix for Transformer

        # Run the encoder; only the prefix of `task` before '.' selects the branch
        eout_dict = self.enc(xs, xlens, task.split('.')[0], use_cache, streaming)

        # For these encoder types the sub-task outputs mirror the main output
        if self.main_weight < 1 and self.enc_type in ['conv', 'tds', 'gated_conv', 'transformer', 'conv_transformer']:
            for sub in ('sub1', 'sub2'):
                eout_dict['ys_' + sub]['xs'] = eout_dict['ys']['xs'].clone()
                eout_dict['ys_' + sub]['xlens'] = eout_dict['ys']['xlens'][:]

        return eout_dict
Example #2
0
    def encode(self, xs, task='all', flip=False):
        """Encode acoustic or text features.

        Args:
            xs (list): A list of length `[B]`, which contains Tensor of size `[T, input_dim]`
            task (str): all or ys* or ys_sub1* or ys_sub2*
            flip (bool): if True, flip acoustic features in the time-dimension
        Returns:
            enc_outs (dict):

        """
        if self.input_type == 'speech':
            # Lengths are recorded before padding
            xlens = torch.IntTensor([len(x) for x in xs])

            # Convert to device tensors, optionally reversing the time axis
            if flip:
                xs = [torch.from_numpy(np.flip(x, axis=0).copy()).float().cuda(self.device_id) for x in xs]
            else:
                xs = [np2tensor(x, self.device_id).float() for x in xs]
            xs = pad_list(xs, 0.0)

            if self.is_specaug and self.training:
                xs = self.specaug(xs)  # SpecAugment (training only)
            if self.gaussian_noise:
                xs = add_gaussian_noise(xs)  # Gaussian noise injection
            if self.ssn is not None:
                xs += self.ssn(xs, xlens)  # add sequence summary network output

        elif self.input_type == 'text':
            xlens = torch.IntTensor([len(x) for x in xs])
            xs = [np2tensor(np.fromiter(x, dtype=np.int64), self.device_id) for x in xs]
            xs = pad_list(xs, self.pad)
            xs = self.embed(xs)

        # Run the encoder; only the prefix of `task` before '.' selects the branch
        enc_outs = self.enc(xs, xlens, task.split('.')[0])

        # For these encoder types the sub-task outputs mirror the main output
        if self.main_weight < 1 and self.enc_type in ['conv', 'tds', 'gated_conv', 'transformer', 'conv_transformer']:
            for sub in ('sub1', 'sub2'):
                enc_outs['ys_' + sub]['xs'] = enc_outs['ys']['xs'].clone()
                enc_outs['ys_' + sub]['xlens'] = enc_outs['ys']['xlens'][:]
        del xs  # drop the reference to the padded input before returning
        return enc_outs