def append_sos_eos(self, ys, bwd=False, replace_sos=False):
    """Build padded decoder inputs/targets by attaching <sos>/<eos>.

    The index `self.eos` is used both as the start symbol prepended to the
    inputs and as the end symbol appended to the targets.

    Args:
        ys (list): A list of length `[B]`, which contains a list of size `[L]`
        bwd (bool): if True, reverse every sequence before padding
        replace_sos (bool): if True, feed every sequence as-is as the input
            and drop its first token from the target instead of prepending
            the start symbol
    Returns:
        ys_in_pad (LongTensor): `[B, L]`
        ys_out_pad (LongTensor): `[B, L]`
        ylens (IntTensor): `[B]`

    """
    ref_param = next(self.parameters())
    eos = ref_param.new_zeros(1).fill_(self.eos).long()

    tokens = []
    for y in ys:
        seq = y[::-1] if bwd else y
        tokens.append(np2tensor(np.fromiter(seq, dtype=np.int64), self.device_id))

    if replace_sos:
        lengths = [y[1:].size(0) + 1 for y in tokens]  # +1 for <eos>
        inputs = list(tokens)
        targets = [torch.cat([y[1:], eos], dim=0) for y in tokens]
    else:
        lengths = [y.size(0) + 1 for y in tokens]  # +1 for <eos>
        inputs = [torch.cat([eos, y], dim=0) for y in tokens]
        targets = [torch.cat([y, eos], dim=0) for y in tokens]

    ylens = np2tensor(np.fromiter(lengths, dtype=np.int32))
    ys_in_pad = pad_list(inputs, self.pad)
    ys_out_pad = pad_list(targets, self.pad)
    return ys_in_pad, ys_out_pad, ylens
def test_forward(args):
    """Check the output shape of frame stacking followed by splicing."""
    args = make_args(**args)

    device = "cpu"
    batch_size = 4
    xmax = 40
    n_stacks = args['n_stacks']
    n_splices = args['n_splices']

    # One utterance per length in [xmax - batch_size, xmax)
    xs = []
    for xlen in range(xmax - batch_size, xmax):
        xs.append(np.random.randn(xlen, args['input_dim']).astype(np.float32))
    xs_pad = pad_list([np2tensor(x, device).float() for x in xs], 0.)

    stack_module = importlib.import_module(
        'neural_sp.models.seq2seq.frontends.frame_stacking')
    splice_module = importlib.import_module(
        'neural_sp.models.seq2seq.frontends.splicing')

    stacked = [stack_module.stack_frame(x, n_stacks, n_stacks) for x in xs]
    spliced = [splice_module.splice(x, n_splices, n_stacks) for x in stacked]
    out_pad = pad_list([np2tensor(x, device).float() for x in spliced], 0.)

    assert out_pad.size(0) == xs_pad.size(0)
    assert out_pad.size(1) == math.ceil(xs_pad.size(1) / n_stacks)
    assert out_pad.size(2) == xs_pad.size(2) * n_splices * n_stacks
def forward_att(self, eouts, elens, ys):
    """Compute XE loss for the sequence-to-sequence model.

    Args:
        eouts (FloatTensor): `[B, T, d_model]`
        elens (list): A list of length `[B]`
        ys (list): A list of length `[B]`, which contains a list of size `[L]`
    Returns:
        loss (FloatTensor): `[1]`
        acc (float): token-level accuracy in teacher-forcing
        ppl (float): exponential of the loss value

    """
    batch_size = eouts.size(0)
    use_adaptive_softmax = self.adaptive_softmax is not None

    # Inputs start with the start symbol, targets end with <eos>
    # (both are the index `self.eos`)
    eos = eouts.new_zeros((1,)).fill_(self.eos).long()
    ylens = [len(y) for y in ys]
    tokens = []
    for y in ys:
        seq = y[::-1] if self.backward else y
        tokens.append(np2tensor(np.fromiter(seq, dtype=np.int64), self.device_id).long())
    ys_in = [torch.cat([eos, y], dim=0) for y in tokens]
    ys_out = [torch.cat([y, eos], dim=0) for y in tokens]
    ys_in_pad = pad_list(ys_in, self.pad)
    ys_out_pad = pad_list(ys_out, self.pad)

    # Scaled token embedding (+ positional embedding if configured)
    ys_emb = self.embed(ys_in_pad) * (self.d_model ** 0.5)
    if self.pe_type:
        ys_emb = self.pos_emb_out(ys_emb)

    # The attention weights returned by each layer are not used here
    for layer_idx in range(self.n_layers):
        ys_emb, _, _ = self.layers[layer_idx](eouts, elens, ys_emb, ylens)
    logits = self.norm_top(ys_emb)

    if use_adaptive_softmax:
        loss = self.adaptive_softmax(logits.view((-1, logits.size(2))),
                                     ys_out_pad.view(-1)).loss
        log_probs = self.adaptive_softmax.log_prob(logits.view((-1, logits.size(2))))
        acc = compute_accuracy(log_probs, ys_out_pad, pad=self.pad)
    else:
        logits = self.output(logits)
        if self.lsm_prob > 0:
            # Label smoothing
            loss = cross_entropy_lsm(logits, ys_out_pad,
                                     ylens=[y.size(0) for y in ys_out],
                                     lsm_prob=self.lsm_prob,
                                     size_average=False) / batch_size
        else:
            loss = F.cross_entropy(logits.view((-1, logits.size(2))),
                                   ys_out_pad.view(-1),
                                   ignore_index=self.pad,
                                   size_average=False) / batch_size
        acc = compute_accuracy(logits, ys_out_pad, pad=self.pad)

    ppl = min(np.exp(loss.item()), np.inf)
    return loss, acc, ppl
def lm_rescoring(hyps, lm, lm_weight, reverse=False, length_norm=False, tag=''):
    """Add an external LM score to every hypothesis (in place).

    Args:
        hyps (list): hypotheses; each is a dict with keys 'hyp' (token indices
            including <sos>) and 'score'
        lm: language model exposing `device` and `predict()`; if None, `hyps`
            is returned untouched
        lm_weight (float): weight applied to the LM score before it is added
            to 'score'
        reverse (bool): if True, reverse the token order before scoring
        length_norm (bool): if True, divide the LM score by the number of
            scored tokens
        tag (str): suffix of the 'score_lm_<tag>' key that stores the LM score
    Returns:
        hyps (list): the same list, NOT re-sorted

    """
    if lm is None:
        return hyps

    score_key = 'score_lm_' + tag
    for hyp in hyps:
        tokens = hyp['hyp']  # include <sos>
        if reverse:
            tokens = tokens[::-1]
        seq = np2tensor(np.fromiter(tokens, dtype=np.int64), lm.device)
        ys_in = pad_list([seq[:-1]], -1)  # `[1, L-1]`
        ys_out = pad_list([seq[1:]], -1)  # `[1, L-1]`

        if ys_in.size(1) > 0:
            n_tokens = ys_out.size(1)
            _, _, scores_lm = lm.predict(ys_in, None)
            score_lm = sum([scores_lm[0, t, ys_out[0, t]] for t in range(n_tokens)])
            if length_norm:
                score_lm /= n_tokens  # normalize by length
        else:
            # nothing to score
            score_lm = 0

        hyp['score'] += score_lm * lm_weight
        hyp[score_key] = score_lm

    # DO NOT sort here !!!
    return hyps
def encode(self, xs, task='all', flip=False):
    """Encode acoustic or text features.

    Args:
        xs (list): A list of length `[B]`, which contains features of size
            `[T, input_dim]` (speech) or token index sequences (text)
        task (str): all or ys* or ys_sub1* or ys_sub2* or ys_sub3*
        flip (bool): if True, flip acoustic features in the time-dimension
    Returns:
        enc_outs (dict): maps 'ys'/'ys_sub1'/'ys_sub2'/'ys_sub3' to a dict
            with keys 'xs' and 'xlens'. Every value is None when `task`
            contains 'lmobj'.

    """
    if 'lmobj' in task:
        # The encoder is skipped for the LM objective: return placeholders only
        return {'ys': {'xs': None, 'xlens': None},
                'ys_sub1': {'xs': None, 'xlens': None},
                'ys_sub2': {'xs': None, 'xlens': None},
                'ys_sub3': {'xs': None, 'xlens': None}}

    if self.input_type == 'speech':
        # Frame stacking
        if self.n_stacks > 1:
            xs = [stack_frame(x, self.n_stacks, self.n_skips) for x in xs]

        # Splicing
        if self.n_splices > 1:
            xs = [splice(x, self.n_splices, self.n_stacks) for x in xs]

        xlens = [len(x) for x in xs]

        # Flip acoustic features in the reverse order.
        # np.flip returns a negatively-strided view, hence the copy before
        # the tensor conversion.
        if flip:
            xs = [np.flip(x, axis=0).copy() for x in xs]
        # Device placement always goes through np2tensor so that the flipped
        # path also works without a GPU (previously it called .cuda()
        # unconditionally).
        xs = [np2tensor(x, self.device_id).float() for x in xs]
        xs = pad_list(xs, 0.0)

    elif self.input_type == 'text':
        xlens = [len(x) for x in xs]
        xs = [np2tensor(np.fromiter(x, dtype=np.int64), self.device_id).long() for x in xs]
        xs = pad_list(xs, self.pad)
        xs = self.embed_in(xs)

    # Only the part of `task` before the first '.' is passed to the encoder
    enc_outs = self.enc(xs, xlens, task.split('.')[0])

    # For these encoder types the sub-task outputs are copies of the main output
    if self.main_weight < 1 and self.enc_type in ['cnn', 'tds']:
        for sub in ['sub1', 'sub2', 'sub3']:
            enc_outs['ys_' + sub]['xs'] = enc_outs['ys']['xs'].clone()
            enc_outs['ys_' + sub]['xlens'] = enc_outs['ys']['xlens'][:]

    # Bridge between the encoder and decoder
    if self.main_weight > 0 and self.is_bridge and (task in ['all', 'ys']):
        enc_outs['ys']['xs'] = self.bridge(enc_outs['ys']['xs'])
    if self.sub1_weight > 0 and self.is_bridge and (task in ['all', 'ys_sub1']):
        enc_outs['ys_sub1']['xs'] = self.bridge_sub1(enc_outs['ys_sub1']['xs'])
    if self.sub2_weight > 0 and self.is_bridge and (task in ['all', 'ys_sub2']):
        enc_outs['ys_sub2']['xs'] = self.bridge_sub2(enc_outs['ys_sub2']['xs'])
    if self.sub3_weight > 0 and self.is_bridge and (task in ['all', 'ys_sub3']):
        enc_outs['ys_sub3']['xs'] = self.bridge_sub3(enc_outs['ys_sub3']['xs'])

    return enc_outs
def encode(self, xs, task='all', flip=False, use_cache=False, streaming=False):
    """Encode acoustic or text features.

    Args:
        xs (list): A list of length `[B]`, which contains features of size
            `[T, input_dim]` (speech) or token index sequences (text)
        task (str): all/ys*/ys_sub1*/ys_sub2*
        flip (bool): if True, flip acoustic features in the time-dimension
        use_cache (bool): use the cached forward encoder state in the previous chunk as the initial state
        streaming (bool): streaming encoding
    Returns:
        eout_dict (dict): maps 'ys'/'ys_sub1'/'ys_sub2' to a dict with keys
            'xs' and 'xlens'

    """
    if self.input_type == 'speech':
        # Frame stacking
        if self.n_stacks > 1:
            xs = [stack_frame(x, self.n_stacks, self.n_skips) for x in xs]

        # Splicing
        if self.n_splices > 1:
            xs = [splice(x, self.n_splices, self.n_stacks) for x in xs]
        xlens = torch.IntTensor([len(x) for x in xs])

        # Flip acoustic features in the reverse order.
        # np.flip returns a negatively-strided view, hence the copy before
        # the tensor conversion.
        if flip:
            xs = [np.flip(x, axis=0).copy() for x in xs]
        # Device placement always goes through np2tensor so that the flipped
        # path also works without a GPU (previously it called .cuda()
        # unconditionally).
        xs = [np2tensor(x, self.device_id).float() for x in xs]
        xs = pad_list(xs, 0.)

        # SpecAugment
        if self.use_specaug and self.training:
            xs = self.specaug(xs)

        # Gaussian noise injection
        if self.gaussian_noise:
            xs = add_gaussian_noise(xs)

        # Sequence summary network (its output is added to the input in place)
        if self.ssn is not None:
            xs += self.ssn(xs, xlens)

    elif self.input_type == 'text':
        xlens = torch.IntTensor([len(x) for x in xs])
        xs = [np2tensor(np.fromiter(x, dtype=np.int64), self.device_id) for x in xs]
        xs = pad_list(xs, self.pad)
        xs = self.dropout_emb(self.embed(xs))
        # TODO(hirofumi): fix for Transformer

    # encoder (only the part of `task` before the first '.' is passed on)
    eout_dict = self.enc(xs, xlens, task.split('.')[0], use_cache, streaming)

    # For these encoder types the sub-task outputs are copies of the main output
    if self.main_weight < 1 and self.enc_type in ['conv', 'tds', 'gated_conv', 'transformer', 'conv_transformer']:
        for sub in ['sub1', 'sub2']:
            eout_dict['ys_' + sub]['xs'] = eout_dict['ys']['xs'].clone()
            eout_dict['ys_' + sub]['xlens'] = eout_dict['ys']['xlens'][:]

    return eout_dict
def forward_rnnt(self, eouts, elens, ys):
    """Compute the Transducer loss.

    Args:
        eouts (FloatTensor): `[B, T, dec_n_units]`
        elens (IntTensor): `[B]`
        ys (list): A list of length `[B]`, which contains a list of size `[L]`
    Returns:
        loss (FloatTensor): `[1]`

    """
    # The prediction network input starts with the index `self.eos`
    eos = eouts.new_zeros(1).fill_(self.eos).long()

    # With end-pointing, <eos> is also appended to every target sequence
    targets = []
    for y in ys:
        seq = (y + [self.eos]) if self.end_pointing else y
        targets.append(np2tensor(np.fromiter(seq, dtype=np.int64), self.device_id))

    ylens = np2tensor(np.fromiter([t.size(0) for t in targets], dtype=np.int32))
    ys_in_pad = pad_list([torch.cat([eos, t], dim=0) for t in targets], self.pad)
    ys_out_pad = pad_list(targets, 0).int()  # int for warprnnt_loss

    # Update prediction network
    pred_out, _ = self.recurrency(self.embed(ys_in_pad), None)

    # Compute output distribution
    logits = self.joint(eouts, pred_out)
    log_probs = F.log_softmax(logits, dim=-1)

    # Move the loss arguments to the GPU when one is used
    if self.device_id >= 0:
        ys_out_pad = ys_out_pad.cuda(self.device_id)
        elens = elens.cuda(self.device_id)
        ylens = ylens.cuda(self.device_id)

    # The label axis of the joint output has one more step than the targets
    assert log_probs.size(2) == ys_out_pad.size(1) + 1

    # NOTE: Transducer loss has already been normalized by bs
    # NOTE: index 0 is reserved for blank in warprnnt_pytorch
    # TODO(hirofumi): label smoothing for Transducer is disabled because it
    # leads to out of memory
    loss = self.warprnnt_loss(log_probs, ys_out_pad.int(), elens, ylens)
    return loss
def encode(self, xs, task='all', streaming=False, lookback=False, lookahead=False):
    """Encode acoustic or text features.

    Args:
        xs (list): A list of length `[B]`, which contains Tensor of size `[T, input_dim]`
        task (str): all/ys*/ys_sub1*/ys_sub2*
        streaming (bool): streaming encoding
        lookback (bool): truncate leftmost frames for lookback in CNN context
        lookahead (bool): truncate rightmost frames for lookahead in CNN context
    Returns:
        eout_dict (dict): maps 'ys'/'ys_sub1'/'ys_sub2' to a dict with keys
            'xs' and 'xlens'

    """
    if self.input_type == 'speech':
        # Optional frame stacking, then splicing
        if self.n_stacks > 1:
            xs = [stack_frame(x, self.n_stacks, self.n_skips) for x in xs]
        if self.n_splices > 1:
            xs = [splice(x, self.n_splices, self.n_stacks) for x in xs]

        xlens = torch.IntTensor([len(x) for x in xs])
        feats = [np2tensor(x, self.device).float() for x in xs]
        xs = pad_list(feats, 0.)

        # SpecAugment (training only)
        if self.specaug is not None and self.training:
            xs = self.specaug(xs)

        # Weight noise injection (training only)
        if self.weight_noise_std > 0 and self.training:
            self.add_weight_noise(std=self.weight_noise_std)

        # Input Gaussian noise injection (training only)
        if self.input_noise_std > 0 and self.training:
            xs = add_input_noise(xs, std=self.input_noise_std)

        # Sequence summary network
        if self.ssn is not None:
            xs = self.ssn(xs, xlens)

    elif self.input_type == 'text':
        xlens = torch.IntTensor([len(x) for x in xs])
        token_ids = [np2tensor(np.fromiter(x, dtype=np.int64), self.device) for x in xs]
        xs = self.dropout_emb(self.embed(pad_list(token_ids, self.pad)))
        # TODO(hirofumi): fix for Transformer

    # encoder (only the part of `task` before the first '.' is passed on)
    enc_task = task.split('.')[0]
    eout_dict = self.enc(xs, xlens, enc_task, streaming, lookback, lookahead)

    # For these encoder types the sub-task outputs are copies of the main output
    if self.main_weight < 1 and self.enc_type in ['conv', 'tds', 'gated_conv']:
        main_out = eout_dict['ys']
        for sub in ['sub1', 'sub2']:
            sub_key = 'ys_' + sub
            eout_dict[sub_key]['xs'] = main_out['xs'].clone()
            eout_dict[sub_key]['xlens'] = main_out['xlens'][:]

    return eout_dict
def forward_transducer(self, eouts, elens, ys):
    """Compute Transducer loss.

    Args:
        eouts (FloatTensor): `[B, T, enc_n_units]`
        elens (IntTensor): `[B]`
        ys (list): length `B`, each of which contains a list of size `[L]`
    Returns:
        loss (FloatTensor): `[1]`

    """
    device = eouts.device

    # Prediction network input is the index `self.eos` followed by y;
    # the loss target is y itself, padded with the blank index
    targets = [np2tensor(np.fromiter(y, dtype=np.int64), device) for y in ys]
    ylens = np2tensor(np.fromiter([t.size(0) for t in targets], dtype=np.int32))
    eos = eouts.new_zeros((1, ), dtype=torch.int64).fill_(self.eos)
    ys_in = pad_list([torch.cat([eos, t], dim=0) for t in targets], self.pad)  # `[B, L+1]`
    ys_out = pad_list(targets, self.blank)  # `[B, L]`

    # Update prediction network
    dout, _ = self.recurrency(self.dropout_emb(self.embed(ys_in)), None)

    # Compute output distribution
    logits = self.joint(eouts, dout)  # `[B, T, L+1, vocab]`
    log_probs = torch.log_softmax(logits, dim=-1)
    assert log_probs.size(2) == ys_out.size(1) + 1

    # NOTE: Transducer loss has already been normalized by bs
    # NOTE: index 0 is reserved for blank in warprnnt_pytorch
    if self.device_id < 0:
        # CPU path: warprnnt_pytorch (the loss module is re-created on every call)
        import warprnnt_pytorch
        self.warprnnt_loss = warprnnt_pytorch.RNNTLoss()
        return self.warprnnt_loss(log_probs, ys_out.int(), elens, ylens)

    # GPU path: warp_rnnt, with every argument on the same device as eouts
    ys_out = ys_out.to(device)
    elens = elens.to(device)
    ylens = ylens.to(device)
    import warp_rnnt
    return warp_rnnt.rnnt_loss(log_probs, ys_out.int(), elens, ylens,
                               average_frames=False,
                               reduction='mean',
                               gather=False)
def encode(self, xs, task='all', flip=False):
    """Encode acoustic or text features.

    Args:
        xs (list): A list of length `[B]`, which contains features of size
            `[T, input_dim]` (speech) or token index sequences (text)
        task (str): all or ys* or ys_sub1* or ys_sub2*
        flip (bool): if True, flip acoustic features in the time-dimension
    Returns:
        enc_outs (dict): maps 'ys'/'ys_sub1'/'ys_sub2' to a dict with keys
            'xs' and 'xlens'

    """
    if self.input_type == 'speech':
        xlens = torch.IntTensor([len(x) for x in xs])

        # Flip acoustic features in the reverse order.
        # np.flip returns a negatively-strided view, hence the copy before
        # the tensor conversion.
        if flip:
            xs = [np.flip(x, axis=0).copy() for x in xs]
        # Device placement always goes through np2tensor so that the flipped
        # path also works without a GPU (previously it called .cuda()
        # unconditionally).
        xs = [np2tensor(x, self.device_id).float() for x in xs]
        xs = pad_list(xs, 0.0)

        # SpecAugment
        if self.is_specaug and self.training:
            xs = self.specaug(xs)

        # Gaussian noise injection
        if self.gaussian_noise:
            xs = add_gaussian_noise(xs)

        # Sequence summary network (its output is added to the input in place)
        if self.ssn is not None:
            xs += self.ssn(xs, xlens)

    elif self.input_type == 'text':
        xlens = torch.IntTensor([len(x) for x in xs])
        xs = [np2tensor(np.fromiter(x, dtype=np.int64), self.device_id) for x in xs]
        xs = pad_list(xs, self.pad)
        xs = self.embed(xs)

    # encoder (only the part of `task` before the first '.' is passed on)
    enc_outs = self.enc(xs, xlens, task.split('.')[0])

    # For these encoder types the sub-task outputs are copies of the main output
    if self.main_weight < 1 and self.enc_type in ['conv', 'tds', 'gated_conv', 'transformer', 'conv_transformer']:
        for sub in ['sub1', 'sub2']:
            enc_outs['ys_' + sub]['xs'] = enc_outs['ys']['xs'].clone()
            enc_outs['ys_' + sub]['xlens'] = enc_outs['ys']['xlens'][:]

    # Drop the local reference to the padded inputs before returning
    del xs

    return enc_outs
def test_forward(args):
    """Check the batch and time dimensions of the Conformer encoder outputs."""
    args = make_args(**args)

    device_id = -1
    batch_size = 4
    # Longer inputs are used when chunk_size_left is set (i.e. not -1)
    if args['chunk_size_left'] == -1:
        xmaxs = [40, 45]
    else:
        xmaxs = [1600, 1655]

    module = importlib.import_module(
        'neural_sp.models.seq2seq.encoders.conformer')
    enc = module.ConformerEncoder(**args)

    for xmax in xmaxs:
        feats = np.random.randn(batch_size, xmax, args['input_dim']).astype(np.float32)
        xlens = torch.IntTensor([len(x) for x in feats])
        xs = pad_list([np2tensor(x, device_id).float() for x in feats], 0.)

        enc_out_dict = enc(xs, xlens, task='all')

        # The sub-task outputs are checked only when their layers exist
        keys = ['ys']
        if args['n_layers_sub1'] > 0:
            keys.append('ys_sub1')
        if args['n_layers_sub2'] > 0:
            keys.append('ys_sub2')

        for key in keys:
            out = enc_out_dict[key]
            assert out['xs'].size(0) == batch_size, xs.size()
            assert out['xs'].size(1) == out['xlens'][0], xs.size()
def lm_rescoring(self, hyps, lm, lm_weight, reverse=False, tag=''):
    """Add a length-normalized external LM score to every hypothesis (in place).

    Args:
        hyps (list): hypotheses; each is a dict with keys 'hyp' (token indices
            including <sos>) and 'score'
        lm: language model exposing `predict()`; the third value it returns
            is indexed as `[batch, step, token]`
        lm_weight (float): weight applied to the LM score before it is added
            to 'score'
        reverse (bool): if True, reverse the token order before scoring
        tag (str): suffix of the 'score_lm_<tag>' key that stores the LM score

    """
    score_key = 'score_lm_' + tag
    for hyp in hyps:
        tokens = hyp['hyp']  # include <sos>
        if reverse:
            tokens = tokens[::-1]
        ys = [np2tensor(np.fromiter(tokens, dtype=np.int64), self.device_id)]
        ys_in = pad_list([y[:-1] for y in ys], -1)  # `[1, L-1]`
        ys_out = pad_list([y[1:] for y in ys], -1)  # `[1, L-1]`

        if ys_in.size(1) > 0:
            n_tokens = ys_out.size(1)
            _, _, scores_lm = lm.predict(ys_in, None)
            score_lm = sum(
                [scores_lm[0, t, ys_out[0, t]] for t in range(n_tokens)])
            score_lm /= n_tokens  # normalize by length
        else:
            # Nothing to score (e.g. the hypothesis holds only <sos>): skip the
            # LM call and avoid dividing by a zero length
            score_lm = 0

        hyp['score'] += score_lm * lm_weight
        hyp[score_key] = score_lm
def __call__(self, logits, elens, ys, ylens):
    """Forced alignment with references.

    NOTE: `logits` is modified in place (positions selected by the padding
    mask are overwritten with `self.log0`).

    Args:
        logits (FloatTensor): `[B, T, vocab]`
        elens: length `[B]` (must support `.to(device)`)
        ys (List): length `[B]`, each of which contains a list of size `[L]`
        ylens (List): length `[B]`
    Returns:
        trigger_points (IntTensor): `[B, L]`

    """
    with torch.no_grad():
        device = logits.device
        refs = [np2tensor(np.fromiter(y, dtype=np.int64), device) for y in ys]
        ys_in_pad = pad_list(refs, 0)

        # Overwrite the positions where the mask is 0 with log0 before
        # normalizing
        pad_mask = make_pad_mask(elens.to(device)).unsqueeze(2).expand_as(logits)
        logits = logits.masked_fill_(pad_mask == 0, self.log0)

        log_probs = torch.log_softmax(logits, dim=-1)
        log_probs = log_probs.transpose(0, 1)  # `[T, B, vocab]`

        trigger_points = self.align(log_probs, elens, ys_in_pad, ylens)
    return trigger_points
def test_forward(args):
    """Check output shapes and subsampled lengths of the RNN encoder."""
    args = make_args(**args)

    device = "cpu"
    batch_size = 4
    # Longer inputs are used when the first field of chunk_size_left is not -1
    if int(args['chunk_size_left'].split('_')[0]) == -1:
        xmaxs = [40, 45]
    else:
        xmaxs = [400, 455]

    module = importlib.import_module('neural_sp.models.seq2seq.encoders.rnn')
    enc = module.RNNEncoder(**args)
    enc = enc.to(device)

    def expected_len(xlen, factor):
        # These configurations are expected to round the length up,
        # all the others round it down
        if 'conv' in args['enc_type'] or args['subsample_type'] in ['max_pool', '1dconv', 'drop', 'add']:
            return math.ceil(xlen / factor)
        return xlen // factor

    def check_output(out, in_lens, factor):
        assert out['xs'].size(0) == batch_size
        assert out['xs'].size(1) == out['xlens'].max()
        for b in range(batch_size):
            assert out['xlens'][b].item() == expected_len(in_lens[b].item(), factor)

    for xmax in xmaxs:
        xs = np.random.randn(batch_size, xmax, args['input_dim']).astype(np.float32)
        xlens = torch.IntTensor([len(x) - i * enc.subsampling_factor for i, x in enumerate(xs)])

        # shuffle
        perm_ids = torch.randperm(batch_size)
        xs = xs[perm_ids]
        xlens = xlens[perm_ids]

        xs = pad_list([np2tensor(x, device).float() for x in xs], 0.)

        enc_out_dict = enc(xs, xlens, task='all')
        check_output(enc_out_dict['ys'], xlens, enc.subsampling_factor)

        if args['n_layers_sub1'] > 0:
            # all outputs
            check_output(enc_out_dict['ys_sub1'], xlens, enc.subsampling_factor_sub1)

            # single output
            enc_out_dict_sub1 = enc(xs, xlens, task='ys_sub1')
            assert enc_out_dict_sub1['ys_sub1']['xs'].size(0) == batch_size
            assert enc_out_dict_sub1['ys_sub1']['xs'].size(1) == enc_out_dict['ys_sub1']['xlens'].max()

        if args['n_layers_sub2'] > 0:
            # all outputs
            check_output(enc_out_dict['ys_sub2'], xlens, enc.subsampling_factor_sub2)

            # single output
            enc_out_dict_sub2 = enc(xs, xlens, task='ys_sub2')
            assert enc_out_dict_sub2['ys_sub2']['xs'].size(0) == batch_size
            assert enc_out_dict_sub2['ys_sub2']['xs'].size(1) == enc_out_dict_sub2['ys_sub2']['xlens'].max()
def test_blockwise(args):
    """Check that the center regions of the blocks reproduce the input."""
    args = make_args(**args)

    device_id = -1
    batch_size = 4
    xmaxs = [1600, 1655]

    module = importlib.import_module(
        'neural_sp.models.seq2seq.encoders.transformer')

    n_left = args['chunk_size_left']
    n_current = args['chunk_size_current']
    n_right = args['chunk_size_right']

    for xmax in xmaxs:
        feats = np.random.randn(batch_size, xmax, args['input_dim']).astype(np.float32)
        xs = pad_list([np2tensor(x, device_id).float() for x in feats], 0.)

        blocks = module.blockwise(xs, n_left, n_current, n_right)

        # Extract the center region
        center = blocks[:, n_left:n_left + n_current]  # `[B * n_blocks, N_c, input_dim]`

        # Concatenate the centers of each utterance again and cut the tail
        # back to the original length
        restored = center.contiguous().view(batch_size, -1, center.size(2))
        restored = restored[:, :xmax]

        assert restored.size() == xs.size()
        assert torch.equal(restored, xs)
def test_transformer_forward(args):
    """Check the batch and time dimensions of the Transformer encoder outputs."""
    args = make_transformer_args(**args)

    device_id = -1
    batch_size = 4
    # A longer input is used when chunk_size_left is set (i.e. not -1)
    xmax = 40 if args['chunk_size_left'] == -1 else 1600

    feats = np.random.randn(batch_size, xmax, args['input_dim']).astype(np.float32)
    xlens = torch.IntTensor([len(x) for x in feats])
    xs = pad_list([np2tensor(x, device_id).float() for x in feats], 0.)

    transformer = importlib.import_module(
        'neural_sp.models.seq2seq.encoders.transformer')
    enc = transformer.TransformerEncoder(**args)
    enc_out_dict = enc(xs, xlens, task='all')

    # The sub-task outputs are checked only when their layers exist
    keys = ['ys']
    if args['n_layers_sub1'] > 0:
        keys.append('ys_sub1')
    if args['n_layers_sub2'] > 0:
        keys.append('ys_sub2')

    for key in keys:
        out = enc_out_dict[key]
        assert out['xs'].size(0) == batch_size
        assert out['xs'].size(1) == out['xlens'][0]
def test_forward_2d(args):
    """Check the output shape of the convolutional encoder with a 2D config."""
    args = make_args_2d(**args)

    device = "cpu"
    batch_size = 4
    xmaxs = [40, 45]

    module = importlib.import_module('neural_sp.models.seq2seq.encoders.conv')

    # Only the 1dconv flag of the parsed configuration is inspected
    parsed = module.parse_cnn_config(args['channels'],
                                     args['kernel_sizes'],
                                     args['strides'],
                                     args['poolings'])
    (_channels, _kernel_sizes, _strides, _poolings), is_1dconv = parsed
    assert not is_1dconv

    enc = module.ConvEncoder(**args)
    enc = enc.to(device)

    for xmax in xmaxs:
        feats = np.random.randn(batch_size, xmax, args['input_dim']).astype(np.float32)
        in_lens = torch.IntTensor([len(x) for x in feats])
        padded = pad_list([np2tensor(x, device).float() for x in feats], 0.)

        xs, xlens = enc(padded, in_lens)

        assert xs.size(0) == batch_size, xs.size()
        assert xs.size(1) == xlens[0], xs.size()
def test_forward(args):
    """Check the loss and observation returned by the RNN decoder."""
    args = make_args(**args)

    device = "cpu"
    batch_size = 4
    emax = 40

    # Random encoder outputs of identical length
    enc_feats = np.random.randn(batch_size, emax, ENC_N_UNITS).astype(np.float32)
    elens = torch.IntTensor([len(x) for x in enc_feats])
    eouts = pad_list([np2tensor(x, device).float() for x in enc_feats], 0.)

    # Random references of different lengths
    ylens = [4, 5, 3, 7]
    ys = []
    for ylen in ylens:
        ys.append(np.random.randint(0, VOCAB, ylen).astype(np.int32))

    # An external LM is needed for LM initialization and LM fusion
    if args['lm_init'] or args['lm_fusion']:
        args_lm = make_args_rnnlm()
        module_rnnlm = importlib.import_module('neural_sp.models.lm.rnnlm')
        args['external_lm'] = module_rnnlm.RNNLM(args_lm).to(device)

    module = importlib.import_module('neural_sp.models.seq2seq.decoders.las')
    dec = module.RNNDecoder(**args).to(device)

    loss, observation = dec(eouts, elens, ys, task='all')

    assert loss.dim() == 1
    assert loss.size(0) == 1
    assert loss.item() >= 0
    assert isinstance(observation, dict)
def test_forward(args):
    """Check the batch and time dimensions of the Transformer encoder outputs."""
    args = make_args(**args)

    device = "cpu"
    batch_size = 4
    # Longer inputs are used when chunk_size_left is set (i.e. not -1)
    if args['chunk_size_left'] == -1:
        xmaxs = [40, 45]
    else:
        xmaxs = [400, 455]

    module = importlib.import_module('neural_sp.models.seq2seq.encoders.transformer')
    enc = module.TransformerEncoder(**args)
    enc = enc.to(device)

    # 'eval' is left out because it is too slow
    modes = ['train']

    for xmax in xmaxs:
        feats = np.random.randn(batch_size, xmax, args['input_dim']).astype(np.float32)
        xlens = torch.IntTensor([len(x) - i * enc.subsampling_factor for i, x in enumerate(feats)])
        xs = pad_list([np2tensor(x, device).float() for x in feats], 0.)

        for mode in modes:
            if mode == 'train':
                enc.train()
                enc_out_dict = enc(xs, xlens, task='all')
            elif mode == 'eval':
                enc.eval()
                with torch.no_grad():
                    enc_out_dict = enc(xs, xlens, task='all')
                    # enc._plot_attention()  # too slow

            # The sub-task outputs are checked only when their layers exist
            keys = ['ys']
            if args['n_layers_sub1'] > 0:
                keys.append('ys_sub1')
            if args['n_layers_sub2'] > 0:
                keys.append('ys_sub2')

            for key in keys:
                out = enc_out_dict[key]
                assert out['xs'].size(0) == batch_size, xs.size()
                assert out['xs'].size(1) == out['xlens'][0], xs.size()
def test_fixed_config_forward(args):
    """Check that every fixed SpecAugment setting preserves the input shape."""
    args = make_args(**args)

    device = "cpu"
    batch_size = 4
    xmax = 400
    input_dim = 80

    feats = np.random.randn(batch_size, xmax, input_dim).astype(np.float32)
    xs = pad_list([np2tensor(x, device).float() for x in feats], 0.)

    module = importlib.import_module(
        'neural_sp.models.seq2seq.frontends.spec_augment')
    specaug = module.SpecAugment(**args)

    # fixed setting: switch to each preset in turn and run one forward pass
    presets = ('librispeech_basic',
               'librispeech_double',
               'switchboard_mild',
               'switchboard_strong')
    for preset in presets:
        getattr(specaug, preset)()
        out = specaug(xs)
        assert out.size() == xs.size()
def generate_probs(self, batch, lm=None, lm_weight=0, temperature=1):
    """Generate teacher probabilities from the forward decoder.

    Args:
        batch (dict): mini-batch with keys 'xs' (used for speech input),
            'ys_sub1' (used as the encoder input otherwise) and 'ys'
        lm: external LM; when given with `lm_weight > 0`, its distribution
            is interpolated with the decoder distribution
        lm_weight (float): interpolation weight of the LM distribution
        temperature (float): divisor applied to the decoder logits before softmax
    Returns:
        teacher_probs (FloatTensor): distribution over the last axis of the
            decoder logits

    """
    # Encode input features
    enc_input = batch['xs'] if self.input_type == 'speech' else batch['ys_sub1']
    enc_outs = self.encode(enc_input, task='ys')
    main_out = enc_outs['ys']

    # for the forward decoder in the main task
    logits = self.dec_fwd.forward_att(main_out['xs'], main_out['xlens'], batch['ys'],
                                      return_logits=True)
    teacher_probs = torch.softmax(logits / temperature, dim=-1).data

    if lm is None or not lm_weight > 0:
        return teacher_probs

    # The LM input is every reference prefixed with the index `self.eos`
    eos = logits.new_zeros(1).fill_(self.eos).long()
    refs = [np2tensor(np.fromiter(y, dtype=np.int64), self.device_id) for y in batch['ys']]
    ys_in_pad = pad_list([torch.cat([eos, y], dim=0) for y in refs], self.pad)

    lmout, _ = lm.decode(lm.encode(ys_in_pad), None)
    lm_probs = torch.softmax(lm.generate(lmout), dim=-1).data

    # Linear interpolation of both distributions
    teacher_probs = (1 - lm_weight) * teacher_probs + lm_weight * lm_probs
    return teacher_probs
def _forward(self, ys):
    """Compute the XE loss of the RNN language model in teacher-forcing.

    Args:
        ys (list): A list of length `[B]`, which contains token index
            sequences (reversed first when `self.backward` is set)
    Returns:
        loss (FloatTensor): XE loss (padded positions are ignored)
        observation (dict): 'loss', 'acc' (token-level accuracy in %) and 'ppl'

    """
    tokens = []
    for y in ys:
        seq = y[::-1] if self.backward else y
        tokens.append(np2tensor(np.fromiter(seq, dtype=np.int64), self.device_id).long())
    ys_pad = pad_list(tokens, self.pad)

    # Predict token t+1 from the tokens up to t
    ys_in = ys_pad[:, :-1]
    ys_out = ys_pad[:, 1:]

    # Path through embedding
    hidden = self.embed(ys_in)

    if self.fast_impl:
        hidden, _ = self.rnn(hidden, hx=None)
        hidden = self.dropout_top(hidden)
    else:
        lower = None
        for layer_idx in range(self.nlayers):
            # Path through RNN
            hidden, _ = self.rnn[layer_idx](hidden, hx=None)
            hidden = self.dropout[layer_idx](hidden)

            # Residual connection
            # NOTE: Exclude residual connection from the raw inputs
            if self.residual and layer_idx > 0:
                hidden += lower
            lower = hidden

    logits = self.output(hidden)

    # Compute XE sequence loss
    loss = F.cross_entropy(logits.view((-1, logits.size(2))),
                           ys_out.contiguous().view(-1),
                           ignore_index=self.pad, size_average=True)

    # Compute token-level accuracy in teacher-forcing (padding excluded)
    preds = logits.view(ys_out.size(0), ys_out.size(1), logits.size(-1)).argmax(2)
    not_pad = ys_out != self.pad
    n_correct = torch.sum(preds.masked_select(not_pad) == ys_out.masked_select(not_pad))
    n_tokens = torch.sum(not_pad)
    acc = float(n_correct) * 100 / float(n_tokens)

    observation = {
        'loss': loss.item(),
        'acc': acc,
        'ppl': math.exp(loss.item()),
    }
    return loss, observation
def generate_lm_logits(self, ys, lm, temperature=5.0):
    """Compute the logits of an external LM for the references.

    Args:
        ys (list): A list of length `[B]`, which contains token index sequences
        lm: external LM exposing `parameters()`, `decode()` and `output()`
        temperature (float): not used inside this method
    Returns:
        logits: output of `lm.output()` for the references prefixed with the
            index `self.eos`

    """
    # The LM input is every reference prefixed with the index `self.eos`
    lm_param = next(lm.parameters())
    eos = lm_param.new_zeros(1).fill_(self.eos).long()

    refs = [np2tensor(np.fromiter(y, dtype=np.int64), self.device) for y in ys]
    prefixed = [torch.cat([eos, y], dim=0) for y in refs]
    ys_in = pad_list(prefixed, self.pad)

    lmout, _ = lm.decode(ys_in, None)
    return lm.output(lmout)
def forward_lmobj(self, ys):
    """Compute XE loss for LM objective.

    Args:
        ys (list): A list of length `[B]`, which contains a list of size `[L]`
    Returns:
        loss (FloatTensor): `[1]`
        acc (float): accuracy
        ppl (float): perplexity

    """
    w = next(self.parameters())

    # Append <sos> and <eos>
    # `new_zeros` inherits the dtype of the model parameter, so the tensor is
    # cast to int64 explicitly: it is concatenated with int64 token indices
    # and then used as embedding input and as cross-entropy target.
    eos = w.new_zeros(1).fill_(self.eos).long()
    ys = [np2tensor(np.fromiter(y, dtype=np.int64), self.device_id) for y in ys]
    ylens = np2tensor(np.fromiter([y.size(0) + 1 for y in ys], dtype=np.int32))  # +1 for <eos>
    ys_in_pad = pad_list([torch.cat([eos, y], dim=0) for y in ys], self.pad)
    ys_out_pad = pad_list([torch.cat([y, eos], dim=0) for y in ys], self.pad)

    # Update prediction network
    dout, _ = self.recurrency(self.embed(ys_in_pad), None)
    logits = self.output_lmobj(dout)

    # Compute XE loss for LM objective (padded positions are ignored)
    loss = F.cross_entropy(logits.view((-1, logits.size(2))),
                           ys_out_pad.view(-1),
                           ignore_index=self.pad, size_average=True)

    # Compute token-level accuracy in teacher-forcing
    acc = compute_accuracy(logits, ys_out_pad, self.pad)
    # The perplexity is taken from the loss before it is rescaled below
    ppl = min(np.exp(loss.item()), np.inf)

    # scale loss for CTC
    loss *= ylens.float().mean()

    return loss, acc, ppl
def test_streaming_decoding(params):
    """Check block-synchronous beam search over consecutive encoder chunks."""
    args = make_args(attn_type='mocha')
    params = make_decode_params(**params)

    device = "cpu"
    batch_size = params['recog_batch_size']
    emax = 400

    enc_feats = np.random.randn(batch_size, emax, ENC_N_UNITS).astype(np.float32)
    eouts = pad_list([np2tensor(x, device).float() for x in enc_feats], 0.)

    # CTC log-probabilities are passed only when the CTC weight is positive
    ctc_log_probs = None
    if params['recog_ctc_weight'] > 0:
        ctc_log_probs = torch.FloatTensor(batch_size, emax, VOCAB, device=device)

    args_lm = make_args_rnnlm()
    module_rnnlm = importlib.import_module('neural_sp.models.lm.rnnlm')
    lm = None
    if params['recog_lm_weight'] > 0:
        lm = module_rnnlm.RNNLM(args_lm).to(device)
    if args['lm_fusion']:
        args['external_lm'] = module_rnnlm.RNNLM(args_lm).to(device)

    module = importlib.import_module('neural_sp.models.seq2seq.decoders.las')
    dec = module.RNNDecoder(**args).to(device)

    module_bs = importlib.import_module(
        'neural_sp.models.seq2seq.decoders.beam_search')
    helper = module_bs.BeamSearch(params['recog_beam_width'],
                                  dec.eos,
                                  params['recog_ctc_weight'],
                                  params['recog_lm_weight'],
                                  device)

    chunk_size = 5
    hyps = None  # active hypotheses carried over to the next chunk

    dec.eval()
    with torch.no_grad():
        for start in range(0, emax, chunk_size):
            eouts_chunk = eouts[:, start:start + chunk_size]
            out = dec.beam_search_block_sync(eouts_chunk, params, helper,
                                             idx2token, hyps, lm,
                                             ctc_log_probs=ctc_log_probs)
            assert len(out) == 3
            end_hyps, hyps, _ = out
            assert isinstance(end_hyps, list)
            assert isinstance(hyps, list)
def test_forward():
    """Check that input noise injection preserves the input shape."""
    device = "cpu"
    batch_size, xmax, input_dim = 4, 40, 80

    feats = np.random.randn(batch_size, xmax, input_dim).astype(np.float32)
    xs = pad_list([np2tensor(x, device).float() for x in feats], 0.)

    noisy = add_input_noise(xs, std=0.075)

    assert noisy.size() == xs.size()
def test_forward(args):
    """Check output shapes and subsampled lengths of the RNN encoder."""
    args = make_args(**args)

    device_id = -1
    batch_size = 4
    # Longer inputs are used when chunk_size_left is set (i.e. not -1)
    if args['chunk_size_left'] == -1:
        xmaxs = [40, 45]
    else:
        xmaxs = [800, 855]

    module = importlib.import_module('neural_sp.models.seq2seq.encoders.rnn')
    enc = module.RNNEncoder(**args)

    def expected_len(xlen):
        # These configurations are expected to round the length up,
        # all the others round it down
        if 'conv' in args['rnn_type'] or args['subsample_type'] in ['max_pool', '1dconv']:
            return math.ceil(xlen / enc.subsampling_factor)
        return math.floor(xlen / enc.subsampling_factor)

    def check_output(out, in_lens):
        assert out['xs'].size(0) == batch_size
        assert out['xs'].size(1) == out['xlens'].max()
        for b in range(batch_size):
            assert out['xlens'][b].item() == expected_len(in_lens[b].item())

    for xmax in xmaxs:
        feats = np.random.randn(batch_size, xmax, args['input_dim']).astype(np.float32)
        xlens = torch.IntTensor(
            [len(x) - i * enc.subsampling_factor for i, x in enumerate(feats)])
        xs = pad_list([np2tensor(x, device_id).float() for x in feats], 0.)

        enc_out_dict = enc(xs, xlens, task='all')

        # The sub-task outputs are checked (against the same subsampling
        # factor as the main output) only when their layers exist
        check_output(enc_out_dict['ys'], xlens)
        if args['n_layers_sub1'] > 0:
            check_output(enc_out_dict['ys_sub1'], xlens)
        if args['n_layers_sub2'] > 0:
            check_output(enc_out_dict['ys_sub2'], xlens)
def forward(self, eouts, elens, ys, forced_align=False):
    """Compute CTC loss.

    Args:
        eouts (FloatTensor): `[B, T, dec_n_units]`
        elens: encoder output lengths of size B (must support `.cpu()`)
        ys (list): A list of length B, which contains a list of size `[L]`
        forced_align (bool): if True, also run forced alignment with the references
    Returns:
        loss (FloatTensor): CTC loss (mixed with the label smoothing term
            when `self.lsm_prob > 0`)
        trigger_points: output of the forced aligner, or None when
            `forced_align` is False

    """
    # Concatenate all elements in ys for warpctc_pytorch
    # NOTE: do not copy to GPUs here
    ylens = np2tensor(np.fromiter([len(y) for y in ys], dtype=np.int32))
    flat_refs = []
    for y in ys:
        seq = y[::-1] if self.bwd else y
        flat_refs.append(np2tensor(np.fromiter(seq, dtype=np.int32)))
    ys_ctc = torch.cat(flat_refs, dim=0)

    # Compute CTC loss
    # NOTE: ctc loss has already been normalized by bs
    # NOTE: index 0 is reserved for blank in warpctc_pytorch
    logits = self.output(eouts)
    time_major_logits = logits.transpose(1, 0)
    loss = self.warpctc_loss(time_major_logits, ys_ctc, elens.cpu(), ylens)
    if self.device_id >= 0:
        loss = loss.cuda(self.device_id)

    # Label smoothing for CTC
    if self.lsm_prob > 0:
        loss = loss * (1 - self.lsm_prob) + kldiv_lsm_ctc(logits, elens) * self.lsm_prob

    if not forced_align:
        return loss, None

    # The aligner works on a copy of the logits and zero-padded references
    refs = [np2tensor(np.fromiter(y, dtype=np.int64), self.device_id) for y in ys]
    ys_in_pad = pad_list(refs, 0)  # pad by zero
    trigger_points = self.forced_aligner.align(logits.clone(), elens, ys_in_pad, ylens)
    return loss, trigger_points
def test_forward(args):
    """Check that SpecAugment preserves the input shape."""
    args = make_args(**args)

    device = "cpu"
    batch_size, xmax, input_dim = 4, 400, 80

    feats = np.random.randn(batch_size, xmax, input_dim).astype(np.float32)
    xs = pad_list([np2tensor(x, device).float() for x in feats], 0.)

    module = importlib.import_module(
        'neural_sp.models.seq2seq.frontends.spec_augment')
    specaug = module.SpecAugment(**args)

    out = specaug(xs)
    assert out.size() == xs.size()
def forced_align(self, logits, elens, ys, ylens):
    """Forced alignment with references.

    The aligner receives a copy of `logits`, so the argument itself is not
    handed over.

    Args:
        logits (FloatTensor): `[B, T, vocab]`
        elens (List): length `B`
        ys (List): length `B`, each of which contains a list of size `[L]`
        ylens (List): length `B`
    Returns:
        trigger_points (IntTensor): `[B, L]`

    """
    with torch.no_grad():
        device = logits.device
        refs = []
        for y in ys:
            refs.append(np2tensor(np.fromiter(y, dtype=np.int64), device))
        ys_in_pad = pad_list(refs, 0)  # pad by zero
        trigger_points = self.forced_aligner.align(logits.clone(), elens, ys_in_pad, ylens)
    return trigger_points