def encode(self, xs, task='all', flip=False, use_cache=False, streaming=False):
    """Encode acoustic or text features into encoder outputs.

    Args:
        xs (list): A list of length `[B]`, which contains Tensor of size `[T, input_dim]`
        task (str): all/ys*/ys_sub1*/ys_sub2*
        flip (bool): if True, flip acoustic features in the time-dimension
        use_cache (bool): use the cached forward encoder state in the previous chunk as the initial state
        streaming (bool): streaming encoding
    Returns:
        eout_dict (dict): encoder outputs keyed by task ('ys', 'ys_sub1', 'ys_sub2')
    """
    if self.input_type == 'speech':
        # Optional frame stacking and splicing before padding.
        if self.n_stacks > 1:
            xs = [stack_frame(x, self.n_stacks, self.n_skips) for x in xs]
        if self.n_splices > 1:
            xs = [splice(x, self.n_splices, self.n_stacks) for x in xs]
        # Lengths are taken AFTER stacking/splicing so they match `xs`.
        xlens = torch.IntTensor([len(x) for x in xs])

        if flip:
            # Reverse each utterance along the time axis, then move to GPU.
            # NOTE(review): this path assumes a CUDA device is available — confirm.
            xs = [torch.from_numpy(np.flip(x, axis=0).copy()).float().cuda(self.device_id)
                  for x in xs]
        else:
            xs = [np2tensor(x, self.device_id).float() for x in xs]
        xs = pad_list(xs, 0.)

        # Data augmentation / regularization (training only for SpecAugment).
        if self.use_specaug and self.training:
            xs = self.specaug(xs)
        if self.gaussian_noise:
            xs = add_gaussian_noise(xs)

        # Sequence summary network: added (in-place) onto the features.
        if self.ssn is not None:
            xs += self.ssn(xs, xlens)

    elif self.input_type == 'text':
        xlens = torch.IntTensor([len(x) for x in xs])
        xs = pad_list([np2tensor(np.fromiter(x, dtype=np.int64), self.device_id)
                       for x in xs], self.pad)
        xs = self.dropout_emb(self.embed(xs))
        # TODO(hirofumi): fix for Transformer

    # encoder forward
    eout_dict = self.enc(xs, xlens, task.split('.')[0], use_cache, streaming)

    # Single-stream encoders share the main output with the sub-task slots.
    if self.main_weight < 1 and self.enc_type in ['conv', 'tds', 'gated_conv',
                                                  'transformer', 'conv_transformer']:
        for sub in ('sub1', 'sub2'):
            eout_dict['ys_' + sub]['xs'] = eout_dict['ys']['xs'].clone()
            eout_dict['ys_' + sub]['xlens'] = eout_dict['ys']['xlens'][:]
    return eout_dict
def encode(self, xs, task='all', streaming=False, lookback=False, lookahead=False):
    """Encode acoustic or text features into encoder outputs.

    Args:
        xs (list): A list of length `[B]`, which contains Tensor of size `[T, input_dim]`
        task (str): all/ys*/ys_sub1*/ys_sub2*
        streaming (bool): streaming encoding
        lookback (bool): truncate leftmost frames for lookback in CNN context
        lookahead (bool): truncate rightmost frames for lookahead in CNN context
    Returns:
        eout_dict (dict): encoder outputs keyed by task ('ys', 'ys_sub1', 'ys_sub2')
    """
    if self.input_type == 'speech':
        # Optional frame stacking and splicing before padding.
        if self.n_stacks > 1:
            xs = [stack_frame(x, self.n_stacks, self.n_skips) for x in xs]
        if self.n_splices > 1:
            xs = [splice(x, self.n_splices, self.n_stacks) for x in xs]
        # Lengths are taken AFTER stacking/splicing so they match `xs`.
        xlens = torch.IntTensor([len(x) for x in xs])
        xs = pad_list([np2tensor(x, self.device).float() for x in xs], 0.)

        # Data augmentation / regularization (training only).
        if self.specaug is not None and self.training:
            xs = self.specaug(xs)
        if self.weight_noise_std > 0 and self.training:
            # Perturbs model weights in place; does not modify `xs`.
            self.add_weight_noise(std=self.weight_noise_std)
        if self.input_noise_std > 0 and self.training:
            xs = add_input_noise(xs, std=self.input_noise_std)

        # Sequence summary network replaces the padded features.
        if self.ssn is not None:
            xs = self.ssn(xs, xlens)

    elif self.input_type == 'text':
        xlens = torch.IntTensor([len(x) for x in xs])
        xs = pad_list([np2tensor(np.fromiter(x, dtype=np.int64), self.device)
                       for x in xs], self.pad)
        xs = self.dropout_emb(self.embed(xs))
        # TODO(hirofumi): fix for Transformer

    # encoder forward
    eout_dict = self.enc(xs, xlens, task.split('.')[0], streaming, lookback, lookahead)

    # Single-stream encoders share the main output with the sub-task slots.
    if self.main_weight < 1 and self.enc_type in ['conv', 'tds', 'gated_conv']:
        for sub in ('sub1', 'sub2'):
            eout_dict['ys_' + sub]['xs'] = eout_dict['ys']['xs'].clone()
            eout_dict['ys_' + sub]['xlens'] = eout_dict['ys']['xlens'][:]
    return eout_dict
def collate_fn(self, batch):
    """Merge a list of per-utterance samples into a mini-batch dict.

    Args:
        batch (list): list of dicts from the dataset, each carrying
            single-element containers for 'xs', 'xlens', 'ys', 'ys_hist',
            'utt_ids', 'speakers', 'sessions' (unwrapped via `[0]`) and
            plain values for 'ys_sub1', 'ys_sub2', 'text'.
    Returns:
        data (dict): batched fields, with frame stacking/splicing applied
            to 'xs' when configured.
    """
    xs = [item['xs'][0] for item in batch]
    xlens = [item['xlens'][0] for item in batch]
    ys = [item['ys'][0] for item in batch]
    ys_hist = [item['ys_hist'][0] for item in batch]
    ys_sub1 = [item['ys_sub1'] for item in batch]
    ys_sub2 = [item['ys_sub2'] for item in batch]
    utt_ids = [item['utt_ids'][0] for item in batch]
    speakers = [item['speakers'][0] for item in batch]
    sessions = [item['sessions'][0] for item in batch]
    text = [item['text'] for item in batch]

    # Frame stacking
    if self.num_stacks > 1:
        xs = [stack_frame(x, self.num_stacks, self.num_skips) for x in xs]
    # Splicing
    if self.num_splices > 1:
        xs = [splice(x, self.num_splices, self.num_stacks) for x in xs]
    if self.num_stacks > 1 or self.num_splices > 1:
        # BUG FIX: stacking/splicing changes the number of frames, so the
        # lengths taken from the raw items would no longer match `xs`.
        # Recompute them, mirroring how `encode` derives xlens after stacking.
        xlens = [len(x) for x in xs]

    data = {
        'xs': xs,
        'xlens': xlens,
        'ys': ys,
        'ys_hist': ys_hist,
        'ys_sub1': ys_sub1,
        'ys_sub2': ys_sub2,
        'utt_ids': utt_ids,
        'speakers': speakers,
        'sessions': sessions,
        'text': text
    }
    return data
def encode(self, xs, task='all', flip=False):
    """Encode acoustic or text features into encoder outputs.

    Args:
        xs (list): A list of length `[B]`, which contains Tensor of size `[T, input_dim]`
        task (str): all or ys* or ys_sub1* or ys_sub2*
        flip (bool): if True, flip acoustic features in the time-dimension
    Returns:
        enc_outs (dict): encoder outputs keyed by task ('ys', 'ys_sub1', 'ys_sub2')
    """
    # LM-objective tasks bypass the encoder entirely.
    if 'lmobj' in task:
        return {key: {'xs': None, 'xlens': None}
                for key in ('ys', 'ys_sub1', 'ys_sub2')}

    if self.input_type == 'speech':
        # Optional frame stacking and splicing before padding.
        if self.n_stacks > 1:
            xs = [stack_frame(x, self.n_stacks, self.n_skips) for x in xs]
        if self.n_splices > 1:
            xs = [splice(x, self.n_splices, self.n_stacks) for x in xs]
        # Lengths are taken AFTER stacking/splicing so they match `xs`.
        xlens = torch.IntTensor([len(x) for x in xs])

        if flip:
            # Reverse each utterance along the time axis, then move to GPU.
            # NOTE(review): this path assumes a CUDA device is available — confirm.
            xs = [torch.from_numpy(np.flip(x, axis=0).copy()).float().cuda(self.device_id)
                  for x in xs]
        else:
            xs = [np2tensor(x, self.device_id).float() for x in xs]
        xs = pad_list(xs, 0.0)

        # Data augmentation / regularization (training only for SpecAugment).
        if self.is_specaug and self.training:
            xs = self.specaug(xs)
        if self.gaussian_noise:
            xs = add_gaussian_noise(xs)

        # Sequence summary network: added (in-place) onto the features.
        if self.ssn is not None:
            xs += self.ssn(xs, xlens)

    elif self.input_type == 'text':
        xlens = torch.IntTensor([len(x) for x in xs])
        xs = pad_list([np2tensor(np.fromiter(x, dtype=np.int64), self.device_id)
                       for x in xs], self.pad)
        xs = self.embed(xs)

    # encoder forward
    enc_outs = self.enc(xs, xlens, task.split('.')[0])

    # Single-stream encoders share the main output with the sub-task slots.
    if self.main_weight < 1 and self.enc_type in ['conv', 'tds', 'gated_conv',
                                                  'transformer', 'conv_transformer']:
        for sub in ('sub1', 'sub2'):
            enc_outs['ys_' + sub]['xs'] = enc_outs['ys']['xs'].clone()
            enc_outs['ys_' + sub]['xlens'] = enc_outs['ys']['xlens'][:]
    return enc_outs