def prepare_inputs(bs, idim, odim, maxin_len, maxout_len,
                   spk_embed_dim=None, spc_dim=None, device=torch.device('cpu')):
    """Build a random padded TTS batch (inputs, targets, stop labels) for testing."""
    # lengths in descending order, as sorted-batch RNN code expects
    in_lens = np.sort(np.random.randint(1, maxin_len, bs))[::-1].tolist()
    out_lens = np.sort(np.random.randint(1, maxout_len, bs))[::-1].tolist()
    ilens = torch.LongTensor(in_lens).to(device)
    olens = torch.LongTensor(out_lens).to(device)
    xs = pad_list(
        [torch.from_numpy(np.random.randint(0, idim, n)).long() for n in ilens], 0
    ).to(device)
    ys = pad_list(
        [torch.from_numpy(np.random.randn(n, odim)).float() for n in olens], 0
    ).to(device)
    # stop-token labels: 1 from the final frame onwards
    labels = ys.new_zeros(ys.size(0), ys.size(1))
    for i, n in enumerate(olens):
        labels[i, n - 1:] = 1
    batch = {"xs": xs, "ilens": ilens, "ys": ys, "labels": labels, "olens": olens}
    if spk_embed_dim is not None:
        batch["spembs"] = torch.from_numpy(
            np.random.randn(bs, spk_embed_dim)
        ).float().to(device)
    if spc_dim is not None:
        batch["spcs"] = pad_list(
            [torch.from_numpy(np.random.randn(n, spc_dim)).float() for n in olens], 0
        ).to(device)
    return batch
def calculate_all_attentions(self, hs_pad, hlen, ys_pad, strm_idx=0):
    """Calculate attention weights for every output step.

    :param torch.Tensor hs_pad: batch of padded hidden state sequences (B, Tmax, D)
    :param torch.Tensor hlen: batch of lengths of hidden state sequences (B)
    :param torch.Tensor ys_pad: batch of padded character id sequences (B, Lmax)
    :param int strm_idx: stream index for parallel speaker attention
        in the multi-speaker case
    :return: attention weights; (B, H, Lmax, Tmax) in the multi-head case,
        (B, Lmax, Tmax) otherwise
    :rtype: float ndarray
    """
    # TODO(kan-bayashi): need to make more smart way
    ys = [y[y != self.ignore_id] for y in ys_pad]  # strip padding
    att_idx = min(strm_idx, len(self.att) - 1)

    # attention modules expect plain python ints for lengths
    hlen = list(map(int, hlen))

    self.loss = None
    # surround each target with sos/eos, then re-pad
    eos = ys[0].new([self.eos])
    sos = ys[0].new([self.sos])
    ys_in_pad = pad_list([torch.cat([sos, y], dim=0) for y in ys], self.eos)
    ys_out_pad = pad_list([torch.cat([y, eos], dim=0) for y in ys], self.ignore_id)
    olength = ys_out_pad.size(1)

    # initialize per-layer decoder states
    c_list = [self.zero_state(hs_pad) for _ in six.moves.range(self.dlayers)]
    z_list = [self.zero_state(hs_pad) for _ in six.moves.range(self.dlayers)]
    att_w = None
    att_ws = []
    self.att[att_idx].reset()  # reset pre-computation of h

    # pre-compute embeddings for the whole target sequence
    eys = self.dropout_emb(self.embed(ys_in_pad))  # utt x olen x zdim

    # decode step by step, recording attention weights
    for i in six.moves.range(olength):
        att_c, att_w = self.att[att_idx](
            hs_pad, hlen, self.dropout_dec[0](z_list[0]), att_w
        )
        ey = torch.cat((eys[:, i, :], att_c), dim=1)  # utt x (zdim + hdim)
        z_list, c_list = self.rnn_forward(ey, z_list, c_list, z_list, c_list)
        att_ws.append(att_w)

    # convert to numpy array with the shape (B, Lmax, Tmax)
    return att_to_numpy(att_ws, self.att[att_idx])
def mask_uniform(ys_pad, mask_token, eos, ignore_id):
    """Replace random tokens with <mask> and build the prediction targets.

    The number of masked positions is drawn uniformly between one and the
    target sequence's length (duplicate draws may reduce the effective count).

    :param torch.Tensor ys_pad: batch of padded target sequences (B, Lmax)
    :param int mask_token: index of <mask>
    :param int eos: index of <eos>
    :param int ignore_id: index of padding
    :return: masked input tensor (B, Lmax) and target tensor (B, Lmax)
    :rtype: tuple of torch.Tensor
    """
    from espnet.nets.pytorch_backend.nets_utils import pad_list

    ys = [y[y != ignore_id] for y in ys_pad]  # strip padding
    # targets default to ignore_id; only masked positions are predicted
    ys_out = [y.new(y.size()).fill_(ignore_id) for y in ys]
    ys_in = [y.clone() for y in ys]
    for i, y in enumerate(ys):
        num_samples = numpy.random.randint(1, len(y) + 1)
        idx = numpy.random.choice(len(y), num_samples)
        ys_in[i][idx] = mask_token
        ys_out[i][idx] = y[idx]
    return pad_list(ys_in, eos), pad_list(ys_out, ignore_id)
def convert_batch(batch, backend="pytorch", is_cuda=False, idim=2, odim=2, num_inputs=2):
    """Create a random multi-input batch matching the shapes recorded in ``batch``."""
    ilens_list = [
        np.array([sample[1]["input"][idx]["shape"][0] for sample in batch])
        for idx in range(num_inputs)
    ]
    olens = np.array([sample[1]["output"][0]["shape"][0] for sample in batch])
    xs_list = [
        [np.random.randn(ilen, idim).astype(np.float32) for ilen in ilens]
        for ilens in ilens_list
    ]
    ys = [np.random.randint(1, odim, olen).astype(np.int32) for olen in olens]

    if backend == "pytorch":
        xs_list = [
            pad_list([torch.from_numpy(x).float() for x in xs], 0) for xs in xs_list
        ]
        ilens_list = [torch.from_numpy(ilens).long() for ilens in ilens_list]
        ys = pad_list([torch.from_numpy(y).long() for y in ys], -1)
        if is_cuda:
            xs_list = [xs.cuda() for xs in xs_list]
            ilens_list = [ilens.cuda() for ilens in ilens_list]
            ys = ys.cuda()
    return xs_list, ilens_list, ys
def prepare_inputs(idim, odim, ilens, olens, spk_embed_dim=None,
                   device=torch.device("cpu")):
    """Build a random padded TTS batch for the given input/output lengths."""
    xs = [np.random.randint(0, idim, n) for n in ilens]
    ys = [np.random.randn(n, odim) for n in olens]
    ilens = torch.LongTensor(ilens).to(device)
    olens = torch.LongTensor(olens).to(device)
    xs = pad_list([torch.from_numpy(x).long() for x in xs], 0).to(device)
    ys = pad_list([torch.from_numpy(y).float() for y in ys], 0).to(device)
    # stop-token labels: 1 from the final frame onwards
    labels = ys.new_zeros(ys.size(0), ys.size(1))
    for i, n in enumerate(olens):
        labels[i, n - 1:] = 1
    batch = {"xs": xs, "ilens": ilens, "ys": ys, "labels": labels, "olens": olens}
    if spk_embed_dim is not None:
        batch["spembs"] = torch.FloatTensor(
            np.random.randn(len(ilens), spk_embed_dim)
        ).to(device)
    return batch
def decoder_forward(self, hs_pad, hlens, ys_pad):
    """Run the transducer decoder and joint network, returning loss inputs."""
    ys = [y[y != self.ignore_id] for y in ys_pad]  # strip padding
    hlens = list(map(int, hlens))

    # prepend blank to every target sequence
    blank = ys[0].new([self.blank])
    ys_in_pad = pad_list([torch.cat([blank, y], dim=0) for y in ys], self.blank)
    olength = ys_in_pad.size(1)

    z_list, c_list = self.zero_state(hs_pad)
    eys = self.dropout_embed(self.embed(ys_in_pad))

    z_all = []
    for i in six.moves.range(olength):
        y, (z_list, c_list) = self.rnn_forward(eys[:, i, :], (z_list, c_list))
        z_all.append(y)
    h_dec = torch.stack(z_all, dim=1)

    # broadcast encoder over the label axis and decoder over the time axis
    z = self.joint(hs_pad.unsqueeze(2), h_dec.unsqueeze(1))

    y = pad_list(ys, self.blank).type(torch.int32)
    z_len = to_device(self, torch.IntTensor(hlens))
    y_len = to_device(self, torch.IntTensor([seq.size(0) for seq in ys]))
    return z, y, z_len, y_len
def convert_batch(batch, backend="pytorch", is_cuda=False, idim=40, odim=5):
    """Create a random batch with the input/output shapes recorded in ``batch``."""
    ilens = np.array([sample[1]['input'][0]['shape'][0] for sample in batch])
    olens = np.array([sample[1]['output'][0]['shape'][0] for sample in batch])
    xs = [np.random.randn(ilen, idim).astype(np.float32) for ilen in ilens]
    ys = [np.random.randint(1, odim, olen).astype(np.int32) for olen in olens]

    if backend == "pytorch":
        xs = pad_list([torch.from_numpy(x).float() for x in xs], 0)
        ilens = torch.from_numpy(ilens).long()
        ys = pad_list([torch.from_numpy(y).long() for y in ys], -1)
        if is_cuda:
            xs = xs.cuda()
            ilens = ilens.cuda()
            ys = ys.cuda()
    else:
        # chainer backend: keep variable-length lists of Variables
        if is_cuda:
            xp = importlib.import_module('cupy')
            xs = [chainer.Variable(xp.array(x)) for x in xs]
            ys = [chainer.Variable(xp.array(y)) for y in ys]
            ilens = xp.array(ilens)
        else:
            xs = [chainer.Variable(x) for x in xs]
            ys = [chainer.Variable(y) for y in ys]
    return xs, ilens, ys
def prepare_inputs(mode, ilens=[20, 15], olens_tgt=[4, 3], olens_src=[3, 2],
                   is_cuda=False):
    """Build a deterministic random batch with target and source label sequences."""
    np.random.seed(1)
    assert len(ilens) == len(olens_tgt)
    xs = [np.random.randn(ilen, 40).astype(np.float32) for ilen in ilens]
    ys_tgt = [np.random.randint(1, 5, olen).astype(np.int32) for olen in olens_tgt]
    ys_src = [np.random.randint(1, 5, olen).astype(np.int32) for olen in olens_src]
    ilens = np.array([x.shape[0] for x in xs], dtype=np.int32)

    if mode == "chainer":
        raise NotImplementedError
    if mode == "pytorch":
        ilens = torch.from_numpy(ilens).long()
        xs_pad = pad_list([torch.from_numpy(x).float() for x in xs], 0)
        ys_pad_tgt = pad_list([torch.from_numpy(y).long() for y in ys_tgt], -1)
        ys_pad_src = pad_list([torch.from_numpy(y).long() for y in ys_src], -1)
        if is_cuda:
            xs_pad = xs_pad.cuda()
            ilens = ilens.cuda()
            ys_pad_tgt = ys_pad_tgt.cuda()
            ys_pad_src = ys_pad_src.cuda()
        return xs_pad, ilens, ys_pad_tgt, ys_pad_src
    raise ValueError("Invalid mode")
def prepare_inputs(mode, ilens=[150, 100], olens=[4, 3], is_cuda=False):
    """Build a deterministic random batch for the chainer or pytorch backend."""
    np.random.seed(1)
    assert len(ilens) == len(olens)
    xs = [np.random.randn(ilen, 40).astype(np.float32) for ilen in ilens]
    ys = [np.random.randint(1, 5, olen).astype(np.int32) for olen in olens]
    ilens = np.array([x.shape[0] for x in xs], dtype=np.int32)

    if mode == "chainer":
        if is_cuda:
            xp = importlib.import_module('cupy')
            xs = [chainer.Variable(xp.array(x)) for x in xs]
            ys = [chainer.Variable(xp.array(y)) for y in ys]
            ilens = xp.array(ilens)
        else:
            xs = [chainer.Variable(x) for x in xs]
            ys = [chainer.Variable(y) for y in ys]
        return xs, ilens, ys
    if mode == "pytorch":
        ilens = torch.from_numpy(ilens).long()
        xs_pad = pad_list([torch.from_numpy(x).float() for x in xs], 0)
        ys_pad = pad_list([torch.from_numpy(y).long() for y in ys], -1)
        if is_cuda:
            xs_pad = xs_pad.cuda()
            ilens = ilens.cuda()
            ys_pad = ys_pad.cuda()
        return xs_pad, ilens, ys_pad
    raise ValueError("Invalid mode")
def prepare_inputs(mode, num_encs=2, is_cuda=False):
    """Build a deterministic random multi-encoder batch (pytorch only)."""
    ilens_list = [[3, 2] for _ in range(num_encs)]
    olens = [2, 1]
    np.random.seed(1)
    assert len(ilens_list[0]) == len(ilens_list[1]) == len(olens)
    xs_list = [
        [np.random.randn(ilen, 2).astype(np.float32) for ilen in ilens]
        for ilens in ilens_list
    ]
    ys = [np.random.randint(1, 2, olen).astype(np.int32) for olen in olens]
    ilens_list = [
        np.array([x.shape[0] for x in xs], dtype=np.int32) for xs in xs_list
    ]

    if mode != "pytorch":
        raise ValueError("Invalid mode")
    ilens_list = [torch.from_numpy(ilens).long() for ilens in ilens_list]
    xs_pad_list = [
        pad_list([torch.from_numpy(x).float() for x in xs], 0) for xs in xs_list
    ]
    ys_pad = pad_list([torch.from_numpy(y).long() for y in ys], -1)
    if is_cuda:
        xs_pad_list = [xs_pad.cuda() for xs_pad in xs_pad_list]
        ilens_list = [ilens.cuda() for ilens in ilens_list]
        ys_pad = ys_pad.cuda()
    return xs_pad_list, ilens_list, ys_pad
def convert_batch(batch, backend="pytorch", is_cuda=False, idim=40, odim=5):
    """Create a random batch with target and source outputs (pytorch only)."""
    ilens = np.array([sample[1]["input"][0]["shape"][0] for sample in batch])
    olens_tgt = np.array([sample[1]["output"][0]["shape"][0] for sample in batch])
    olens_src = np.array([sample[1]["output"][1]["shape"][0] for sample in batch])
    xs = [np.random.randn(ilen, idim).astype(np.float32) for ilen in ilens]
    ys_tgt = [np.random.randint(1, odim, olen).astype(np.int32) for olen in olens_tgt]
    ys_src = [np.random.randint(1, odim, olen).astype(np.int32) for olen in olens_src]

    if backend != "pytorch":
        raise NotImplementedError
    xs = pad_list([torch.from_numpy(x).float() for x in xs], 0)
    ilens = torch.from_numpy(ilens).long()
    ys_tgt = pad_list([torch.from_numpy(y).long() for y in ys_tgt], -1)
    ys_src = pad_list([torch.from_numpy(y).long() for y in ys_src], -1)
    if is_cuda:
        xs = xs.cuda()
        ilens = ilens.cuda()
        ys_tgt = ys_tgt.cuda()
        ys_src = ys_src.cuda()
    return xs, ilens, ys_tgt, ys_src
def add_sos_eos(self, ys_pad):
    """Prepend <sos> / append <eos> to each unpadded target sequence.

    :param torch.Tensor ys_pad: batch of padded target sequences (B, Lmax)
    :return: decoder input (padded with eos) and decoder target
        (padded with ignore_id)
    :rtype: tuple of torch.Tensor
    """
    from espnet.nets.pytorch_backend.nets_utils import pad_list

    eos = ys_pad.new([self.eos])
    sos = ys_pad.new([self.sos])
    ys = [y[y != self.ignore_id] for y in ys_pad]  # strip padding
    ys_in = [torch.cat([sos, y], dim=0) for y in ys]
    ys_out = [torch.cat([y, eos], dim=0) for y in ys]
    return pad_list(ys_in, self.eos), pad_list(ys_out, self.ignore_id)
def forward(self, hs_pad, hlens, ys_pad):
    """Decoder forward.

    Args:
        hs_pad (torch.Tensor): batch of padded hidden state sequences (B, Tmax, D)
        hlens (torch.Tensor): batch of lengths of hidden state sequences (B)
        ys_pad (torch.Tensor): batch of padded character id sequence tensor (B, Lmax)

    Returns:
        loss (torch.Tensor): rnnt-att loss value
    """
    ys = [y[y != self.ignore_id] for y in ys_pad]  # strip padding
    hlens = list(map(int, hlens))

    # prepend blank to every target sequence
    blank = ys[0].new([self.blank])
    ys_in_pad = pad_list([torch.cat([blank, y], dim=0) for y in ys], self.blank)
    olength = ys_in_pad.size(1)

    # initialize per-layer decoder states
    c_list = [self.zero_state(hs_pad) for _ in six.moves.range(self.dlayers)]
    z_list = [self.zero_state(hs_pad) for _ in six.moves.range(self.dlayers)]
    att_w = None
    self.att[0].reset()

    eys = self.dropout_emb(self.embed(ys_in_pad))

    z_all = []
    for i in six.moves.range(olength):
        att_c, att_w = self.att[0](
            hs_pad, hlens, self.dropout_dec[0](z_list[0]), att_w
        )
        ey = torch.cat((eys[:, i, :], att_c), dim=1)
        z_list, c_list = self.rnn_forward(ey, z_list, c_list, z_list, c_list)
        z_all.append(self.dropout_dec[-1](z_list[-1]))
    h_dec = torch.stack(z_all, dim=1)

    # broadcast encoder over the label axis and decoder over the time axis
    z = self.joint(hs_pad.unsqueeze(2), h_dec.unsqueeze(1))

    y = pad_list(ys, self.blank).type(torch.int32)
    z_len = to_device(self, torch.IntTensor(hlens))
    y_len = to_device(self, torch.IntTensor([seq.size(0) for seq in ys]))

    loss = to_device(self, self.rnnt_loss(z, y, z_len, y_len))
    return loss
def forward(self, xs, ilens, ys, olens, spembs=None):
    """Calculate forward propagation.

    Args:
        xs (Tensor): Batch of the padded sequences of character ids (B, Tmax).
        ilens (Tensor): Batch of lengths of each input sequence (B,).
        ys (Tensor): Batch of the padded sequence of target features (B, Lmax, odim).
        olens (Tensor): Batch of lengths of each output sequence (B,).
        spembs (Tensor, optional): Batch of speaker embedding vectors
            (B, spk_embed_dim).

    Returns:
        Tensor: Batch of durations (B, Tmax).
    """
    att_ws = self._calculate_encoder_decoder_attentions(
        xs, ilens, ys, olens, spembs=spembs
    )
    # TODO(kan-bayashi): fix this issue
    # this does not work in multi-gpu case. registered buffer is not saved.
    if int(self.diag_head_idx) == -1:
        self._init_diagonal_head(att_ws)

    # keep only the most diagonal attention head
    att_ws = att_ws[:, self.diag_head_idx]

    durations = [
        self._calculate_duration(att_w, ilen, olen)
        for att_w, ilen, olen in zip(att_ws, ilens, olens)
    ]
    return pad_list(durations, 0)
def _generate_pseudo_label(self, encoder_out, text, text_lengths):
    """Replace targets with CTC-argmax pseudo labels from the ASR branch."""
    from itertools import groupby

    from espnet.nets.pytorch_backend.nets_utils import pad_list

    # greedy CTC decoding on the encoder output
    ys_hat = self.ctc.argmax(encoder_out).data

    text_hat = []
    text_hat_lengths = []
    for y in ys_hat.cpu():
        # collapse repeated frames, then drop blank/padding symbols
        collapsed = [token[0] for token in groupby(y)]
        cleaned = []
        for idx in collapsed:
            idx = int(idx)
            if idx != self.ignore_id and idx != self.error_calculator.idx_blank:
                cleaned.append(idx)
        # drop the final token  # NOTE(review): presumably <eos>; an empty
        # hypothesis would give a negative length — confirm upstream guarantees
        text_hat.append(torch.tensor(cleaned).to(text.device)[:-1])
        text_hat_lengths.append(len(cleaned) - 1)

    text_hat = pad_list(text_hat, self.ignore_id)
    text_hat_lengths = torch.tensor(text_hat_lengths).to(text_lengths.device)

    # detach so gradients do not flow through the pseudo labels
    text = text_hat.detach()
    text_lengths = text_hat_lengths.detach()
    return text, text_lengths
def test_pad_list():
    """pad_list pads each sequence with the fill value to the max length."""
    seqs = [torch.LongTensor(x) for x in ([1, 2, 3], [1, 2], [1, 2, 3, 4])]
    padded = pad_list(seqs, -1)
    expected = [[1, 2, 3, -1], [1, 2, -1, -1], [1, 2, 3, 4]]
    assert padded.data.tolist() == expected
def recognize_batch(self, xs, recog_args, char_list, rnnlm=None):
    """E2E beam search.

    :param list xs: list of input acoustic feature arrays [(T_1, D), (T_2, D), ...]
    :param Namespace recog_args: argument Namespace containing options
    :param list char_list: list of characters
    :param torch.nn.Module rnnlm: language model module
    :return: N-best decoding results
    :rtype: list
    """
    prev = self.training
    self.eval()
    ilens = np.fromiter((xx.shape[0] for xx in xs), dtype=np.int64)

    # subsample frame
    xs = [xx[:: self.subsample[0], :] for xx in xs]
    xs = [to_device(self, to_torch_tensor(xx).float()) for xx in xs]
    xs_pad = pad_list(xs, 0.0)

    # 0. Frontend
    if self.frontend is not None:
        enhanced, hlens, mask = self.frontend(xs_pad, ilens)
        hs_pad, hlens = self.feature_transform(enhanced, hlens)
    else:
        hs_pad, hlens = xs_pad, ilens

    batchsize = hs_pad.size(0)

    # 1. Encoder
    hyps, hlens, _ = self.enc(hs_pad, hlens)
    hyps = hyps.view(batchsize, -1, self.odim)

    # restore training mode (was missing; the other *_batch/enhance
    # methods in this file all restore it)
    if prev:
        self.train()
    return hyps
def translate_batch(self, xs, trans_args, char_list, rnnlm=None):
    """E2E batch beam search.

    :param list xs: list of input acoustic feature arrays [(T_1, D), (T_2, D), ...]
    :param Namespace trans_args: argument Namespace containing options
    :param list char_list: list of characters
    :param torch.nn.Module rnnlm: language model module
    :return: N-best decoding results
    :rtype: list
    """
    prev = self.training
    self.eval()
    ilens = np.fromiter((feat.shape[0] for feat in xs), dtype=np.int64)

    # subsample frames and move them onto the model's device
    xs = [feat[::self.subsample[0], :] for feat in xs]
    xs = [to_device(self, to_torch_tensor(feat).float()) for feat in xs]
    xs_pad = pad_list(xs, 0.0)

    # 1. Encoder
    hs_pad, hlens, _ = self.enc(xs_pad, ilens)

    # 2. Decoder
    hlens = torch.tensor(list(map(int, hlens)))  # make sure hlens is tensor
    y = self.dec.recognize_beam_batch(
        hs_pad, hlens, None, trans_args, char_list, rnnlm
    )

    if prev:
        self.train()
    return y
def translate_batch(self, xs, trans_args, char_list, rnnlm=None):
    """E2E batch beam search.

    :param list xs: list of input source text feature arrays [(T_1, D), (T_2, D), ...]
    :param Namespace trans_args: argument Namespace containing options
    :param list char_list: list of characters
    :param torch.nn.Module rnnlm: language model module
    :return: N-best decoding results
    :rtype: list
    """
    prev = self.training
    self.eval()

    # 1. Encoder
    if self.multilingual:
        # drop the first token (presumably a language id — verify against caller)
        ilens = np.fromiter((len(xx[1:]) for xx in xs), dtype=np.int64)
        hs = [to_device(self, torch.from_numpy(xx[1:])) for xx in xs]
    else:
        ilens = np.fromiter((len(xx) for xx in xs), dtype=np.int64)
        hs = [to_device(self, torch.from_numpy(xx)) for xx in xs]
    xpad = pad_list(hs, self.pad)
    hs_pad, hlens, _ = self.enc(self.dropout(self.embed(xpad)), ilens)

    # 2. Decoder
    hlens = torch.tensor(list(map(int, hlens)))  # make sure hlens is tensor
    y = self.dec.recognize_beam_batch(
        hs_pad, hlens, None, trans_args, char_list, rnnlm
    )

    if prev:
        self.train()
    return y
def test_length_regulator():
    """LengthRegulator expands inputs to the max total duration in the batch."""
    idim = 5
    ilens = [10, 5, 3]
    xs = pad_list([torch.randn((ilen, idim)) for ilen in ilens], 0.0)
    ds = pad_list([torch.arange(ilen) for ilen in ilens], 0)

    length_regulator = LengthRegulator()

    # non-zero durations
    expanded = length_regulator(xs, ds, ilens)
    assert int(expanded.shape[1]) == int(ds.sum(dim=-1).max())

    # durations containing zeros are handled too
    ds[:, 2] = 0
    expanded = length_regulator(xs, ds, ilens)
    assert int(expanded.shape[1]) == int(ds.sum(dim=-1).max())
def forward_mt(self, xs_pad, ys_in_pad, ys_out_pad, ys_mask):
    """Forward pass in the auxiliary MT task.

    :param torch.Tensor xs_pad: batch of padded source sequences (B, Tmax, idim)
    :param torch.Tensor ys_in_pad: batch of padded target sequences (B, Lmax)
    :param torch.Tensor ys_out_pad: batch of padded target sequences (B, Lmax)
    :param torch.Tensor ys_mask: batch of input token mask (B, Lmax)
    :return: MT loss value
    :rtype: torch.Tensor
    :return: accuracy in MT decoder
    :rtype: float
    """
    loss, acc = 0.0, None
    # skip entirely when the auxiliary task is disabled
    if self.mt_weight == 0:
        return loss, acc

    ilens = torch.sum(xs_pad != self.ignore_id, dim=1).cpu().numpy()
    # NOTE: xs_pad is padded with -1
    xs = [x[x != self.ignore_id] for x in xs_pad]  # strip padding
    xs_zero_pad = pad_list(xs, self.pad)  # re-pad with zero
    xs_zero_pad = xs_zero_pad[:, : max(ilens)]  # for data parallel
    src_mask = (
        make_non_pad_mask(ilens.tolist()).to(xs_zero_pad.device).unsqueeze(-2)
    )
    hs_pad, hs_mask = self.encoder_mt(xs_zero_pad, src_mask)
    pred_pad, _ = self.decoder(ys_in_pad, ys_mask, hs_pad, hs_mask)
    loss = self.criterion(pred_pad, ys_out_pad)
    acc = th_accuracy(
        pred_pad.view(-1, self.odim), ys_out_pad, ignore_label=self.ignore_id
    )
    return loss, acc
def forward(self, xs, ds, alpha=1.0):
    """Calculate forward propagation.

    Args:
        xs (Tensor): Batch of sequences of char or phoneme embeddings (B, Tmax, D).
        ds (LongTensor): Batch of durations of each frame (B, T).
        alpha (float, optional): Alpha value to control speed of speech.

    Returns:
        Tensor: replicated input tensor based on durations (B, T*, D).
    """
    if alpha != 1.0:
        assert alpha > 0
        ds = torch.round(ds.float() * alpha).long()

    if ds.sum() == 0:
        logging.warning("predicted durations includes all 0 sequences. "
                        "fill the first element with 1.")
        # NOTE(kan-bayashi): This case must not be happend in teacher forcing.
        # It will be happened in inference with a bad duration predictor.
        # So we do not need to care the padded sequence case here.
        ds[ds.sum(dim=1).eq(0)] = 1

    expanded = [torch.repeat_interleave(x, d, dim=0) for x, d in zip(xs, ds)]
    return pad_list(expanded, self.pad_value)
def enhance(self, xs):
    """Forward only the frontend stage.

    Args:
        xs (ndarray): input acoustic feature (T, C, F)

    Returns:
        enhanced (ndarray):
        mask (torch.Tensor):
        ilens (torch.Tensor): batch of lengths of input sequences (B)

    Raises:
        RuntimeError: if this model has no frontend.
    """
    if self.frontend is None:
        # fixed typo ("does't") and made the message match the sibling enhance()
        raise RuntimeError('Frontend doesn\'t exist')
    prev = self.training
    self.eval()
    ilens = np.fromiter((xx.shape[0] for xx in xs), dtype=np.int64)

    # subsample frame
    xs = [xx[::self.subsample[0], :] for xx in xs]
    xs = [to_device(self, to_torch_tensor(xx).float()) for xx in xs]
    xs_pad = pad_list(xs, 0.0)
    enhanced, hlensm, mask = self.frontend(xs_pad, ilens)
    if prev:
        self.train()
    return enhanced.cpu().numpy(), mask.cpu().numpy(), ilens
def enhance(self, xs):
    """Forward only the frontend stage.

    :param ndarray xs: input acoustic feature (T, C, F)
    """
    if self.frontend is None:
        raise RuntimeError('Frontend doesn\'t exist')
    prev = self.training
    self.eval()
    ilens = np.fromiter((feat.shape[0] for feat in xs), dtype=np.int64)

    # subsample frames and move them onto the model's device
    xs = [feat[::self.subsample[0], :] for feat in xs]
    xs = [to_device(self, to_torch_tensor(feat).float()) for feat in xs]
    xs_pad = pad_list(xs, 0.0)
    enhanced, hlensm, mask = self.frontend(xs_pad, ilens)
    if prev:
        self.train()

    # multi-speaker frontends return one enhanced signal / mask per speaker
    if isinstance(enhanced, (tuple, list)):
        enhanced = list(enhanced)
        mask = list(mask)
        for idx in range(len(enhanced)):  # number of speakers
            enhanced[idx] = enhanced[idx].cpu().numpy()
            mask[idx] = mask[idx].cpu().numpy()
        return enhanced, mask, ilens
    return enhanced.cpu().numpy(), mask.cpu().numpy(), ilens
def common_collate_fn(
    data: Collection[Tuple[str, Dict[str, np.ndarray]]],
    float_pad_value: Union[float, int] = 0.0,
    int_pad_value: int = -32768,
    not_sequence: Collection[str] = (),
) -> Tuple[List[str], Dict[str, torch.Tensor]]:
    """Concatenate ndarray-list to an array and convert to torch.Tensor.

    Examples:
        >>> from espnet2.samplers.constant_batch_sampler import ConstantBatchSampler
        >>> import espnet2.tasks.abs_task
        >>> from espnet2.train.dataset import ESPnetDataset
        >>> sampler = ConstantBatchSampler(...)
        >>> dataset = ESPnetDataset(...)
        >>> keys = next(iter(sampler))
        >>> batch = [dataset[key] for key in keys]
        >>> batch = common_collate_fn(batch)
        >>> model(**batch)

        Note that the dict-keys of batch are propagated from
        that of the dataset as they are.
    """
    assert check_argument_types()
    uttids = [u for u, _ in data]
    data = [d for _, d in data]

    assert all(set(data[0]) == set(d) for d in data), "dict-keys mismatching"
    assert all(
        not k.endswith("_lengths") for k in data[0]
    ), f"*_lengths is reserved: {list(data[0])}"

    output = {}
    for key in data[0]:
        # NOTE(kamo):
        # Each models, which accepts these values finally, are responsible
        # to repaint the pad_value to the desired value for each tasks.
        if data[0][key].dtype.kind == "i":
            pad_value = int_pad_value
        else:
            pad_value = float_pad_value

        array_list = [d[key] for d in data]

        # Assume the first axis is length:
        # tensor_list: Batch x (Length, ...)
        tensor_list = [torch.from_numpy(a) for a in array_list]
        # tensor: (Batch, Length, ...)
        output[key] = pad_list(tensor_list, pad_value)

        # lens: (Batch,)
        if key not in not_sequence:
            lens = torch.tensor([d[key].shape[0] for d in data], dtype=torch.long)
            output[key + "_lengths"] = lens

    output = (uttids, output)
    assert check_return_type(output)
    return output
def prepare_inputs(idim, odim, ilens, olens, is_cuda=False):
    """Build a deterministic random padded ASR batch."""
    np.random.seed(1)
    xs = [np.random.randn(ilen, idim).astype(np.float32) for ilen in ilens]
    ys = [np.random.randint(1, odim, olen).astype(np.int32) for olen in olens]
    ilens = np.array([x.shape[0] for x in xs], dtype=np.int32)

    xs_pad = pad_list([torch.from_numpy(x).float() for x in xs], 0)
    ys_pad = pad_list([torch.from_numpy(y).long() for y in ys], -1)
    ilens = torch.from_numpy(ilens).long()
    if is_cuda:
        xs_pad = xs_pad.cuda()
        ys_pad = ys_pad.cuda()
        ilens = ilens.cuda()
    return xs_pad, ilens, ys_pad
def forward(self, hs_pad, hlens, ys_pad):
    """Forward function for transducer.

    Args:
        hs_pad (torch.Tensor): batch of padded hidden state sequences (B, Tmax, D)
        hlens (torch.Tensor): batch of lengths of hidden state sequences (B)
        ys_pad (torch.Tensor): batch of padded character id sequence tensor (B, Lmax)

    Returns:
        loss (float): rnnt loss value
    """
    ys = [y[y != self.ignore_id] for y in ys_pad]  # strip padding
    hlens = list(map(int, hlens))

    # prepend blank to every target sequence
    blank = ys[0].new([self.blank])
    ys_in_pad = pad_list([torch.cat([blank, y], dim=0) for y in ys], self.blank)
    olength = ys_in_pad.size(1)

    z_list, c_list = self.zero_state(hs_pad)
    eys = self.dropout_embed(self.embed(ys_in_pad))

    z_all = []
    for i in six.moves.range(olength):
        y, (z_list, c_list) = self.rnn_forward(eys[:, i, :], (z_list, c_list))
        z_all.append(y)
    h_dec = torch.stack(z_all, dim=1)

    # broadcast encoder over the label axis and decoder over the time axis
    z = self.joint(hs_pad.unsqueeze(2), h_dec.unsqueeze(1))

    y = pad_list(ys, self.blank).type(torch.int32)
    z_len = to_device(self, torch.IntTensor(hlens))
    y_len = to_device(self, torch.IntTensor([seq.size(0) for seq in ys]))

    loss = to_device(self, self.rnnt_loss(z, y, z_len, y_len))
    return loss
def prepare_inputs(idim, odim, ilens, olens, is_cuda=False):
    """Build a deterministic random padded batch of features and labels."""
    np.random.seed(1)
    feats = [np.random.randn(ilen, idim).astype(np.float32) for ilen in ilens]
    labels = [np.random.randint(1, odim, olen).astype(np.int32) for olen in olens]
    feats_len = np.array([f.shape[0] for f in feats], dtype=np.int32)

    feats = pad_list([torch.from_numpy(f).float() for f in feats], 0)
    labels = pad_list([torch.from_numpy(lab).long() for lab in labels], -1)
    feats_len = torch.from_numpy(feats_len).long()
    if is_cuda:
        feats = feats.cuda()
        labels = labels.cuda()
        feats_len = feats_len.cuda()
    return feats, feats_len, labels
def __call__(self, batch, device):
    """Pad a TTS mini-batch, build stop labels, and move everything to ``device``."""
    # batch should be located in list
    assert len(batch) == 1
    xs, ys, spembs, spcs = batch[0]

    # get list of lengths (must be tensor for DataParallel)
    ilens = torch.from_numpy(np.array([x.shape[0] for x in xs])).long().to(device)
    olens = torch.from_numpy(np.array([y.shape[0] for y in ys])).long().to(device)

    # perform padding and conversion to tensor
    xs = pad_list([torch.from_numpy(x).long() for x in xs], 0).to(device)
    ys = pad_list([torch.from_numpy(y).float() for y in ys], 0).to(device)

    # make labels for stop prediction: 1.0 from the final frame onwards
    labels = ys.new_zeros(ys.size(0), ys.size(1))
    for i, n in enumerate(olens):
        labels[i, n - 1:] = 1.0

    new_batch = {"xs": xs, "ilens": ilens, "ys": ys, "labels": labels, "olens": olens}

    # load second target
    if spcs is not None:
        new_batch["spcs"] = pad_list(
            [torch.from_numpy(spc).float() for spc in spcs], 0
        ).to(device)
    # load speaker embedding
    if spembs is not None:
        new_batch["spembs"] = torch.from_numpy(np.array(spembs)).float().to(device)
    return new_batch
def prepare_loss_inputs(ys_pad, hlens, blank_id=0, ignore_id=-1):
    """Prepare tensors for transducer loss computation.

    Args:
        ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax)
        hlens (torch.Tensor): batch of hidden sequence lengths (B)
            or batch of masks (B, 1, Tmax)
        blank_id (int): index of blank label
        ignore_id (int): index of initial padding

    Returns:
        ys_in_pad (torch.Tensor): batch of padded target sequences + blank
            (B, Lmax + 1)
        target (torch.Tensor): batch of padded target sequences (B, Lmax)
        pred_len (torch.Tensor): batch of hidden sequence lengths (B)
        target_len (torch.Tensor): batch of output sequence lengths (B)
    """
    device = ys_pad.device

    ys = [y[y != ignore_id] for y in ys_pad]  # strip padding
    blank = ys[0].new([blank_id])

    ys_in_pad = pad_list([torch.cat([blank, y], dim=0) for y in ys], blank_id)
    target = pad_list(ys, blank_id).type(torch.int32)
    target_len = torch.IntTensor([y.size(0) for y in ys])

    if torch.is_tensor(hlens):
        if hlens.dim() > 1:
            # mask input: a length is the number of non-zero entries per utt
            hlens = [int(h[h != 0].size(0)) for h in hlens]
        else:
            hlens = list(map(int, hlens))
    pred_len = torch.IntTensor(hlens).to(device)

    target = target.to(device)
    target_len = target_len.to(device)

    return ys_in_pad, target, pred_len, target_len