def __call__(self, batch):
    """Transform a batch and send it to the converter's device.

    Args:
        batch (tuple): Pair ``(xs, ys)``. ``xs`` holds the input feature
            arrays; each element of ``ys`` is indexed with ``[2]`` to obtain
            its label array.

    Returns:
        tuple(torch.Tensor, torch.Tensor, torch.Tensor): Padded inputs,
            input lengths and padded targets, all moved to ``self.device``.

    """
    feats, targets = batch
    targets = list(targets)

    # A size mismatch between inputs and targets is only reported on stdout;
    # the conversion carries on regardless.
    if len(feats) != len(targets):
        print("error uttr")
        print(feats[0])

    # keep every ``subsampling_factor``-th frame along the first axis
    step = self.subsampling_factor
    if step > 1:
        feats = [feat[::step, :] for feat in feats]

    # length of every input sequence (size of the first axis)
    frame_counts = np.array([feat.shape[0] for feat in feats])
    ilens = torch.from_numpy(frame_counts).to(self.device)

    # inputs are padded with 0, targets with ``self.ignore_id``
    feat_tensors = [torch.from_numpy(feat).float() for feat in feats]
    xs_pad = pad_list(feat_tensors, 0).to(self.device, dtype=self.dtype)

    label_tensors = [torch.from_numpy(target[2]) for target in targets]
    ys_pad = pad_list(label_tensors, self.ignore_id).long().to(self.device)

    return xs_pad, ilens, ys_pad
def __call__(self, batch, device):
    """Transform a batch and send it to a device.

    :param list batch: single-element list whose item is the pair ``(xs, ys)``
    :param torch.device device: the device to send the tensors to
    :return: a tuple xs_pad, ilens, ys_pad
    :rtype: (torch.Tensor, torch.Tensor, torch.Tensor)
    """
    # the whole mini-batch is wrapped in a one-element list
    assert len(batch) == 1
    feats, labels = batch[0]

    # keep every ``subsampling_factor``-th frame along the first axis
    if self.subsampling_factor > 1:
        feats = [feat[::self.subsampling_factor, :] for feat in feats]

    # length of every input sequence
    lengths = np.array([feat.shape[0] for feat in feats])

    # pad (inputs with 0, targets with ``self.ignore_id``) and move to device
    feat_tensors = [torch.from_numpy(feat).float() for feat in feats]
    label_tensors = [torch.from_numpy(label).long() for label in labels]

    xs_pad = pad_list(feat_tensors, 0).to(device)
    ilens = torch.from_numpy(lengths).to(device)
    ys_pad = pad_list(label_tensors, self.ignore_id).to(device)
    return xs_pad, ilens, ys_pad
def __call__(self, batch, device):
    """Transform a multi-encoder batch and send it to a device.

    Args:
        batch (list): Single-element list. Its item holds one list of input
            arrays per encoder in the first ``self.num_encs`` positions and
            the targets in the last position.
        device (torch.device): The device to send to.

    Returns:
        tuple(list(torch.Tensor), list(torch.Tensor), torch.Tensor):
            Padded inputs per encoder, input lengths per encoder, and the
            padded targets.

    """
    # batch should be located in list
    assert len(batch) == 1
    xs_list = batch[0][:self.num_encs]
    ys = batch[0][-1]

    # perform subsampling
    # NOTE: the factors are read once from the attribute that the
    # unconditional check below has always required ("subsamping", sic).
    # Previously the slicing used a differently spelled attribute, so the
    # subsampling branch depended on a second attribute being present.
    factors = self.subsamping_factors
    if np.sum(factors) > self.num_encs:
        xs_list = [
            [x[::factors[i], :] for x in xs_list[i]]
            for i in range(self.num_encs)
        ]

    # get batch of lengths of input sequences
    ilens_list = [
        np.array([x.shape[0] for x in xs_list[i]])
        for i in range(self.num_encs)
    ]

    # perform padding and convert to tensor
    # currently only support real number
    xs_list_pad = [
        pad_list([torch.from_numpy(x).float() for x in xs_list[i]], 0).to(
            device, dtype=self.dtype
        )
        for i in range(self.num_encs)
    ]

    ilens_list = [
        torch.from_numpy(ilens_list[i]).to(device)
        for i in range(self.num_encs)
    ]

    # NOTE: this is for multi-task learning (e.g., speech translation):
    # a tuple target contributes only its first element.
    ys_pad = pad_list(
        [
            torch.from_numpy(np.array(y[0]) if isinstance(y, tuple) else y).long()
            for y in ys
        ],
        self.ignore_id,
    ).to(device)

    return xs_list_pad, ilens_list, ys_pad
def __call__(self, batch, device=torch.device("cpu")):
    """Transform a batch and send it to a device.

    Args:
        batch (list): Single-element list whose item is the pair
            ``(xs, ys)``; both are converted to integer (long) tensors.
        device (torch.device): The device to send to.

    Returns:
        tuple(torch.Tensor, torch.Tensor, torch.Tensor): Padded inputs,
            input lengths and padded targets.

    """
    # the whole mini-batch is wrapped in a one-element list
    assert len(batch) == 1
    sources, targets = batch[0]

    # length of every input sequence
    lengths = np.array([src.shape[0] for src in sources])

    # inputs are padded with ``self.pad``, targets with ``self.ignore_id``
    source_tensors = [torch.from_numpy(src).long() for src in sources]
    target_tensors = [torch.from_numpy(tgt).long() for tgt in targets]

    xs_pad = pad_list(source_tensors, self.pad).to(device)
    ilens = torch.from_numpy(lengths).to(device)
    ys_pad = pad_list(target_tensors, self.ignore_id).to(device)
    return xs_pad, ilens, ys_pad
def __call__(self, batch):
    """Transform a batch and move it to the CUDA device ``self.device``.

    Args:
        batch (tuple): Pair ``(xs, ys)`` of input arrays and targets.

    Returns:
        tuple: ``(xs_pad, ilens, ys_pad)`` when ``self.task`` is ``"asr"``,
            ``(xs_pad, ilens, ys_pad, ys_pad_asr)`` when it is ``"st"``.
            For complex-valued inputs ``xs_pad`` is a dict with the keys
            ``'real'`` and ``'imag'``.

    Raises:
        ValueError: If ``self.task`` is neither ``"asr"`` nor ``"st"``.

    """

    def _to_gpu(tensor):
        # non-blocking copy to the configured CUDA device
        return tensor.cuda(self.device, non_blocking=True)

    def _pad_features(arrays):
        # float conversion, zero padding, cast to ``self.dtype``, then copy
        tensors = [torch.from_numpy(arr).float() for arr in arrays]
        return _to_gpu(pad_list(tensors, 0).to(self.dtype))

    feats, targets = batch
    targets = list(targets)

    # keep every ``subsampling_factor``-th frame along the first axis
    if self.subsampling_factor > 1:
        feats = [feat[::self.subsampling_factor, :] for feat in feats]

    # length of every input sequence
    lengths = np.array([feat.shape[0] for feat in feats])

    if feats[0].dtype.kind == 'c':
        # Note(kamo):
        # {'real': ..., 'imag': ...} will be changed to ComplexTensor in E2E.
        # Don't create ComplexTensor and give it to E2E here
        # because torch.nn.DataParallel can't handle it.
        xs_pad = {
            'real': _pad_features([feat.real for feat in feats]),
            'imag': _pad_features([feat.imag for feat in feats]),
        }
    else:
        xs_pad = _pad_features(feats)

    ilens = _to_gpu(torch.from_numpy(lengths))

    # NOTE: this is for multi-task learning (e.g., speech translation):
    # a tuple target contributes its first element to ``ys_pad``.
    main_labels = []
    for target in targets:
        raw = np.array(target[0]) if isinstance(target, tuple) else target
        main_labels.append(torch.from_numpy(raw).long())
    ys_pad = _to_gpu(pad_list(main_labels, self.ignore_id))

    if self.task == "asr":
        return xs_pad, ilens, ys_pad
    if self.task != "st":
        raise ValueError('Support only asr and st data')

    # second element of every target, padded with 0
    asr_labels = [torch.from_numpy(np.array(target[1])).long() for target in targets]
    ys_pad_asr = _to_gpu(pad_list(asr_labels, 0))
    return xs_pad, ilens, ys_pad, ys_pad_asr
def __call__(self, batch, device=torch.device("cpu")):
    """Transform a batch and send it to a device.

    Args:
        batch (list): Single-element list whose item is the pair ``(xs, ys)``.
        device (torch.device): The device to send to.

    Returns:
        tuple(torch.Tensor, torch.Tensor, torch.Tensor): Padded inputs,
            input lengths and padded targets. For complex-valued inputs the
            first item is a dict with the keys ``"real"`` and ``"imag"``.

    """

    def _pad_features(arrays):
        # float conversion, zero padding, then move/cast in a single step
        tensors = [torch.from_numpy(arr).float() for arr in arrays]
        return pad_list(tensors, 0).to(device, dtype=self.dtype)

    # the whole mini-batch is wrapped in a one-element list
    assert len(batch) == 1
    feats, targets = batch[0]

    # keep every ``subsampling_factor``-th frame along the first axis
    if self.subsampling_factor > 1:
        feats = [feat[:: self.subsampling_factor, :] for feat in feats]

    # length of every input sequence
    lengths = np.array([feat.shape[0] for feat in feats])

    if feats[0].dtype.kind == "c":
        # Note(kamo):
        # {'real': ..., 'imag': ...} will be changed to ComplexTensor in E2E.
        # Don't create ComplexTensor and give it to E2E here
        # because torch.nn.DataParallel can't handle it.
        xs_pad = {
            "real": _pad_features([feat.real for feat in feats]),
            "imag": _pad_features([feat.imag for feat in feats]),
        }
    else:
        xs_pad = _pad_features(feats)

    ilens = torch.from_numpy(lengths).to(device)

    # NOTE: this is for multi-output (e.g., speech translation):
    # a tuple target contributes only its first element.
    label_tensors = []
    for target in targets:
        raw = np.array(target[0][:]) if isinstance(target, tuple) else target
        label_tensors.append(torch.from_numpy(raw).long())
    ys_pad = pad_list(label_tensors, self.ignore_id).to(device)

    return xs_pad, ilens, ys_pad
def __call__(self, batch, device, evaluation=False):
    """Transform a batch and send it to a device.

    :param list batch: single-element list whose item is the pair ``(xs, ys)``
    :param torch.device device: the device to send the tensors to
    :param bool evaluation: when true, ``specaug`` is never applied
    :return: a tuple xs_pad, ilens, ys_pad
    :rtype: (torch.Tensor, torch.Tensor, torch.Tensor)
    """
    # the whole mini-batch is wrapped in a one-element list
    assert len(batch) == 1
    feats, targets = batch[0]

    # keep every ``subsampling_factor``-th frame along the first axis
    if self.subsampling_factor > 1:
        feats = [feat[::self.subsampling_factor, :] for feat in feats]

    # length of every input sequence
    lengths = np.array([feat.shape[0] for feat in feats])

    if feats[0].dtype.kind == 'c':
        real_parts = [torch.from_numpy(feat.real).float() for feat in feats]
        imag_parts = [torch.from_numpy(feat.imag).float() for feat in feats]
        # Note(kamo):
        # {'real': ..., 'imag': ...} will be changed to ComplexTensor in E2E.
        # Don't create ComplexTensor and give it to E2E here
        # because torch.nn.DataParallel can't handle it.
        xs_pad = {
            'real': pad_list(real_parts, 0).to(device),
            'imag': pad_list(imag_parts, 0).to(device),
        }
    else:
        feat_tensors = [torch.from_numpy(feat).float() for feat in feats]
        # augmentation is applied to real-valued inputs only, and never
        # when ``evaluation`` is set
        if self.use_specaug and not evaluation:
            feat_tensors = [specaug(tensor) for tensor in feat_tensors]
        xs_pad = pad_list(feat_tensors, 0).to(device)

    ilens = torch.from_numpy(lengths).to(device)

    # NOTE: this is for multi-task learning (e.g., speech translation):
    # a tuple target contributes only its first element.
    label_tensors = []
    for target in targets:
        raw = np.array(target[0]) if isinstance(target, tuple) else target
        label_tensors.append(torch.from_numpy(raw).long())
    ys_pad = pad_list(label_tensors, self.ignore_id).to(device)

    return xs_pad, ilens, ys_pad
def __call__(self, batch, device):
    """Transform a batch and send it to a device.

    Args:
        batch (list(tuple(str, dict[str, dict[str, Any]]))): The batch to
            transform; a single-element list whose item unpacks to
            ``(xs, ys)``.
        device (torch.device): The device to send to.

    Returns:
        tuple(torch.Tensor, torch.Tensor, torch.Tensor): Transformed batch.
            When the targets are not plain arrays, the two label sequences
            of every utterance are stacked along a second axis of size 2.

    """

    def _pad_features(arrays):
        # float conversion, zero padding, then move/cast in a single step
        tensors = [torch.from_numpy(arr).float() for arr in arrays]
        return pad_list(tensors, 0).to(device, dtype=self.dtype)

    # the whole mini-batch is wrapped in a one-element list
    assert len(batch) == 1
    feats, targets = batch[0]
    # a zip object (python 3.x) has to be materialised before indexing
    targets = list(targets)

    # keep every ``subsampling_factor``-th frame along the first axis
    if self.subsampling_factor > 1:
        feats = [feat[::self.subsampling_factor, :] for feat in feats]

    # length of every input sequence
    lengths = np.array([feat.shape[0] for feat in feats])

    if feats[0].dtype.kind == 'c':
        # Note(kamo):
        # {'real': ..., 'imag': ...} will be changed to ComplexTensor in E2E.
        # Don't create ComplexTensor and give it to E2E here
        # because torch.nn.DataParallel can't handle it.
        xs_pad = {
            'real': _pad_features([feat.real for feat in feats]),
            'imag': _pad_features([feat.imag for feat in feats]),
        }
    else:
        xs_pad = _pad_features(feats)

    ilens = torch.from_numpy(lengths).to(device)

    # TODO(Xuankai): try to make this neat
    if isinstance(targets[0], np.ndarray):
        single = [torch.from_numpy(target).long() for target in targets]
        ys_pad = pad_list(single, self.ignore_id).to(device)
    else:
        # All first label sequences followed by all second ones are padded
        # together, then split back into two groups.
        first = [torch.from_numpy(target[0]).long() for target in targets]
        second = [torch.from_numpy(target[1]).long() for target in targets]
        joint = pad_list(first + second, self.ignore_id)
        # view -> (2, B, Tmax); transpose(0, 1) -> (B, 2, Tmax)
        ys_pad = joint.view(2, -1, joint.size(1)).transpose(0, 1).to(device)

    return xs_pad, ilens, ys_pad
def __call__(self, batch, device=torch.device('cpu')):
    """Transform a batch and send it to a device.

    Args:
        batch (list): The batch to transform; ``batch[0]`` unpacks to
            ``(xs, ys)``.
        device (torch.device): The device to send to.

    Returns:
        tuple: ``(xs_pad, ilens, ys_pad, ys_pad_asr)``. The first three come
            from the parent converter; ``ys_pad_asr`` is ``None`` unless
            ``self.asr_task`` is set.

    """

    def _asr_labels(target):
        # Same test order as the original chained conditional: the second
        # element is inspected first.
        if isinstance(target[1], tuple):
            # prepend ``target[1][0]`` to the sequence ``target[1][1]``
            return np.insert(target[1][1], 0, target[1][0])
        if isinstance(target, tuple):
            return np.array(target[1][:])
        return target

    _, targets = batch[0]
    # NOTE(review): the targets are deep-copied before delegating,
    # presumably so the parent call cannot affect them - confirm.
    asr_targets = copy.deepcopy(targets)
    xs_pad, ilens, ys_pad = super().__call__(batch, device)

    if not self.asr_task:
        return xs_pad, ilens, ys_pad, None

    asr_tensors = [
        torch.from_numpy(_asr_labels(target)).long() for target in asr_targets
    ]
    ys_pad_asr = pad_list(asr_tensors, self.ignore_id).to(device)
    return xs_pad, ilens, ys_pad, ys_pad_asr
def forward(self, xs, labels=None):
    """Embed a batch of inputs with ``self.slu`` and classify the embeddings.

    Args:
        xs: Iterable of input sequences; the size of the first axis of each
            is used as its length.
        labels: Forwarded unchanged to ``self.classifier`` (default ``None``).

    Returns:
        Whatever ``self.classifier`` returns for the embeddings and labels.

    """
    lengths = np.fromiter((seq.shape[0] for seq in xs), dtype=np.int64)

    # float tensors placed on the same device as the SLU model
    tensors = []
    for seq in xs:
        tensors.append(to_device(self.slu, to_torch_tensor(seq).float()))

    padded = pad_list(tensors, 0.0)
    embedded = self.slu(padded, lengths, None)
    return self.classifier(embedded, labels)
def test_train_acc():
    """Check that ``th_accuracy`` agrees with chainer's ``F.accuracy``.

    Random predictions and random targets (terminated by an eos symbol) are
    scored by both implementations; the results must be close.
    """
    n_out = 7
    n_batch = 3
    eos_id = n_out - 1
    label_length = numpy.array([4, 2, 3], dtype=numpy.int32)
    # one extra step per sequence for the appended eos symbol
    max_olen = max(label_length) + 1

    np_pred = numpy.random.rand(n_batch, max_olen, n_out).astype(numpy.float32)
    # NOTE: 0 is only used for CTC, never appeared in attn target
    np_target = [
        numpy.random.randint(1, n_out - 1, size=ol, dtype=numpy.int32)
        for ol in label_length
    ]
    eos = numpy.array([eos_id], 'i')

    # chainer side: pad the targets with -1
    # NOTE: -1 is default ignore index for chainer
    ys_out = [F.concat([target, eos], axis=0) for target in np_target]
    pad_ys_out = F.pad_sequence(ys_out, padding=-1)  # utt x olen
    y_all = F.reshape(np_pred, (n_batch * max_olen, n_out))
    flat_targets = F.concat(pad_ys_out, axis=0)
    ch_acc = F.accuracy(y_all, flat_targets, ignore_label=-1)

    # torch side
    # NOTE: this index 0 is only for CTC not attn. so it can be ignored
    # unfortunately, torch cross_entropy does not accept out-of-bound ids
    th_ignore = 0
    th_pred = torch.from_numpy(y_all.data)
    th_ys = [
        torch.from_numpy(numpy.append(target, eos)).long() for target in np_target
    ]
    th_target = pad_list(th_ys, th_ignore)
    th_acc = th_accuracy(th_pred, th_target, th_ignore)

    numpy.testing.assert_allclose(ch_acc.data, th_acc)
def __call__(self, batch, device):
    """Convert a batch into a dict of padded tensors on ``device``.

    Args:
        batch (list): Single-element list whose item unpacks to
            ``(xs, ys, spembs, spcs)``. ``spembs`` and ``spcs`` may be
            ``None``.
        device (torch.device): The device to send to.

    Returns:
        dict: Keys ``"xs"``, ``"ilens"``, ``"ys"``, ``"labels"`` and
            ``"olens"``, plus ``"spcs"`` and ``"spembs"`` when the
            corresponding input is not ``None``.

    """
    # the whole mini-batch is wrapped in a one-element list
    assert len(batch) == 1
    xs, ys, spembs, spcs = batch[0]

    # lengths (must be tensor for DataParallel)
    in_lengths = np.array([x.shape[0] for x in xs])
    out_lengths = np.array([y.shape[0] for y in ys])
    ilens = torch.from_numpy(in_lengths).long().to(device)
    olens = torch.from_numpy(out_lengths).long().to(device)

    # zero-padded tensors: integer inputs, float targets
    xs = pad_list([torch.from_numpy(x).long() for x in xs], 0).to(device)
    ys = pad_list([torch.from_numpy(y).float() for y in ys], 0).to(device)

    # stop-prediction labels: 1 from the last valid step of each target on
    labels = ys.new_zeros(ys.size(0), ys.size(1))
    for row, olen in enumerate(olens):
        labels[row, olen - 1:] = 1.0

    new_batch = {
        "xs": xs,
        "ilens": ilens,
        "ys": ys,
        "labels": labels,
        "olens": olens,
    }

    # optional second target
    if spcs is not None:
        spc_tensors = [torch.from_numpy(spc).float() for spc in spcs]
        new_batch["spcs"] = pad_list(spc_tensors, 0).to(device)

    # optional speaker embedding
    if spembs is not None:
        new_batch["spembs"] = torch.from_numpy(np.array(spembs)).float().to(device)

    return new_batch
def loss_fn_asr(self, best_x):
    """Return the flattened mean of the first hypothesis' character scores.

    Args:
        best_x: Nested sequence of hypothesis dicts. Every dict provides a
            ``'yseq'`` entry, and ``best_x[0][0]`` additionally provides a
            ``'char_score'`` sequence of tensors.

    Returns:
        torch.Tensor: ``char_score`` of ``best_x[0][0]`` stacked, averaged
            over the first axis and flattened to one dimension.

    """
    ys = [torch.tensor(y['yseq']).long() for x in best_x for y in x]

    # NOTE(review): the padded label tensor does not contribute to the
    # returned score. The reshape is kept because view() raises when the
    # number of hypotheses cannot be arranged as (batch, nbest, -1) -
    # confirm whether it can be dropped.
    ys_asr = pad_list(ys, -1).to(self.device)
    batch = int(ys_asr.size(0) / self.nbest)
    ys_asr = ys_asr.view(batch, self.nbest, -1)

    char_scores = torch.stack(best_x[0][0]['char_score'])
    return char_scores.mean(0).view(-1)
def __call__(self, batch, device=torch.device("cpu")):
    """Transform a batch and send it to a device.

    Args:
        batch (list): Single-element list whose item unpacks to
            ``(xs, ys, ys_src)``.
        device (torch.device): The device to send to.

    Returns:
        tuple: ``(xs_pad, ilens, ys_pad, ys_pad_src)``; ``ys_pad_src`` is
            ``None`` unless ``self.use_source_text`` is set.

    """

    def _pad_labels(sequences):
        # int64 conversion, padding with ``self.ignore_id``, then move
        tensors = [
            torch.from_numpy(np.array(seq, dtype=np.int64)) for seq in sequences
        ]
        return pad_list(tensors, self.ignore_id).to(device)

    # the whole mini-batch is wrapped in a one-element list
    assert len(batch) == 1
    feats, targets, source_targets = batch[0]

    # length of every input sequence
    lengths = np.array([feat.shape[0] for feat in feats])
    ilens = torch.from_numpy(lengths).to(device)

    feat_tensors = [torch.from_numpy(feat).float() for feat in feats]
    xs_pad = pad_list(feat_tensors, 0).to(device, dtype=self.dtype)

    ys_pad = _pad_labels(targets)
    ys_pad_src = _pad_labels(source_targets) if self.use_source_text else None

    return xs_pad, ilens, ys_pad, ys_pad_src
def test_attn_loss():
    """Check torch cross-entropy against chainer's softmax cross-entropy.

    Both losses are computed on the same random predictions and eos-terminated
    random targets and must agree within a relative tolerance of 0.05.
    """
    n_out = 7
    n_batch = 3
    eos_id = n_out - 1
    label_length = numpy.array([4, 2, 3], dtype=numpy.int32)
    # one extra step per sequence for the appended eos symbol
    max_olen = max(label_length) + 1

    np_pred = numpy.random.rand(n_batch, max_olen, n_out).astype(numpy.float32)
    # NOTE: 0 is only used for CTC, never appeared in attn target
    np_target = [
        numpy.random.randint(1, n_out - 1, size=ol, dtype=numpy.int32)
        for ol in label_length
    ]
    eos = numpy.array([eos_id], 'i')

    # chainer side: pad the targets with -1
    # NOTE: -1 is default ignore index for chainer
    ys_out = [F.concat([target, eos], axis=0) for target in np_target]
    pad_ys_out = F.pad_sequence(ys_out, padding=-1)  # utt x olen
    y_all = F.reshape(np_pred, (n_batch * max_olen, n_out))
    ch_loss = F.softmax_cross_entropy(y_all, F.concat(pad_ys_out, axis=0))

    # torch side
    # NOTE: this index 0 is only for CTC not attn. so it can be ignored
    # unfortunately, torch cross_entropy does not accept out-of-bound ids
    th_ignore = 0
    th_pred = torch.from_numpy(y_all.data)
    th_ys = [torch.from_numpy(seq.data).long() for seq in ys_out]
    th_target = pad_list(th_ys, th_ignore)

    # the name of the averaging reduction changed with torch 1.0
    is_legacy_torch = LooseVersion(torch.__version__) < LooseVersion('1.0')
    reduction_str = 'elementwise_mean' if is_legacy_torch else 'mean'

    th_loss = torch.nn.functional.cross_entropy(
        th_pred,
        th_target.view(-1),
        ignore_index=th_ignore,
        reduction=reduction_str,
    )
    print(ch_loss)
    print(th_loss)

    loss_data = float(th_loss)
    numpy.testing.assert_allclose(loss_data, ch_loss.data, 0.05)
def test_ctc_loss(in_length, out_length, use_warpctc):
    """Compare a torch-side CTC loss with chainer's CTC loss on random data.

    Args:
        in_length: Input lengths, one per utterance.
        out_length: Label lengths, one per utterance.
        use_warpctc: Use ``warpctc_pytorch.CTCLoss`` when true, otherwise
            ``torch.nn.CTCLoss`` (skipped for torch < 1.0).
    """
    pytest.importorskip("torch")

    if use_warpctc:
        pytest.importorskip("warpctc_pytorch")
        import warpctc_pytorch

        torch_ctcloss = warpctc_pytorch.CTCLoss(size_average=True)
    else:
        if LooseVersion(torch.__version__) < LooseVersion("1.0"):
            pytest.skip("pytorch < 1.0 doesn't support CTCLoss")
        summed_ctc = torch.nn.CTCLoss(reduction="sum")

        def torch_ctcloss(th_pred, th_target, th_ilen, th_olen):
            log_probs = th_pred.log_softmax(2)
            total = summed_ctc(log_probs, th_target, th_ilen, th_olen)
            # Batch-size average
            return total / log_probs.size(1)

    n_out = 7
    input_length = numpy.array(in_length, dtype=numpy.int32)
    label_length = numpy.array(out_length, dtype=numpy.int32)

    # random predictions first, then random targets
    np_pred = [
        numpy.random.rand(il, n_out).astype(numpy.float32) for il in input_length
    ]
    np_target = [
        numpy.random.randint(0, n_out, size=ol, dtype=numpy.int32)
        for ol in label_length
    ]

    # chainer side
    # NOTE: np_pred[i] seems to be transposed and used axis=-1 in e2e_asr.py
    ch_pred = F.separate(F.pad_sequence(np_pred), axis=-2)
    ch_target = F.pad_sequence(np_target, padding=-1)
    ch_loss = F.connectionist_temporal_classification(
        ch_pred, ch_target, 0, input_length, label_length
    ).data

    # torch side: padded predictions with the first two axes swapped,
    # targets concatenated into one flat tensor
    pred_tensors = [torch.from_numpy(pred) for pred in np_pred]
    th_pred = pad_list(pred_tensors, 0.0).transpose(0, 1)
    th_target = torch.from_numpy(numpy.concatenate(np_target))
    th_ilen = torch.from_numpy(input_length)
    th_olen = torch.from_numpy(label_length)
    th_loss = torch_ctcloss(th_pred, th_target, th_ilen, th_olen).numpy()

    numpy.testing.assert_allclose(th_loss, ch_loss, 0.05)
def __call__(self, batch, device):
    """Transform a batch and send it to a device.

    Args:
        batch (list): The batch to transform; ``batch[0]`` unpacks to
            ``(xs, ys)``.
        device (torch.device): The device to send to.

    Returns:
        tuple: ``(xs_pad, ilens, ys_pad, ys_pad_asr)``. The first three come
            from the parent converter; ``ys_pad_asr`` is ``None`` unless
            ``self.asr_task`` is set.

    """
    _, targets = batch[0]
    xs_pad, ilens, ys_pad = super().__call__(batch, device)

    if not self.asr_task:
        return xs_pad, ilens, ys_pad, None

    # second element of every target, padded with ``self.ignore_id``
    asr_tensors = [
        torch.from_numpy(np.array(target[1])).long() for target in targets
    ]
    ys_pad_asr = pad_list(asr_tensors, self.ignore_id).to(device)
    return xs_pad, ilens, ys_pad, ys_pad_asr
def random_sampler(self, hyps, xlens, xs, spembs):
    """Re-arrange hypotheses and features into a TTS-style batch.

    The hypotheses become the TTS inputs, while the original features
    (processed by ``mask_by_length_and_multiply`` with ``self.nbest``) become
    the TTS targets.

    Args:
        hyps: Sequence of hypothesis tensors.
        xlens: Lengths of the original features.
        xs: Original features.
        spembs: Speaker embedding tensor.

    Returns:
        tuple: ``(xs_tts, xlens_tts, ys_tts, labels, ylens_tts, spembs)``.

    """
    # TTS inputs: hypothesis lengths in descending order and the
    # zero-padded hypotheses themselves
    hyp_lengths = np.array([hyp.shape[0] for hyp in hyps])
    xlens_tts = torch.from_numpy(hyp_lengths).long().to(self.device)
    xlens_tts = sorted(xlens_tts, reverse=True)
    xs_tts = pad_list([hyp.long() for hyp in hyps], 0).to(self.device)

    # TTS targets are derived from the original features
    xs, xlens = mask_by_length_and_multiply(xs, xlens, 0, self.nbest)

    # speaker embeddings go through the same helper as length-1 sequences
    onelens = np.ones(len(spembs), dtype=np.int64)
    spembs, _ = mask_by_length_and_multiply(
        spembs.unsqueeze(1), torch.tensor(onelens), 0, self.nbest
    )
    spembs = spembs.squeeze(1)

    # every target is given the maximum length
    max_per_item = [torch.max(xlens) for _ in range(len(xlens))]
    ylens_tts = torch.Tensor(max_per_item).type(xlens.dtype)
    ys_tts = xs

    # stop-prediction labels: 1 from the last step of each target on
    labels = ys_tts.new_zeros(ys_tts.size(0), ys_tts.size(1))
    for row, ylen in enumerate(ylens_tts):
        labels[row, ylen - 1:] = 1.0

    return xs_tts, xlens_tts, ys_tts, labels, ylens_tts, spembs
def asr_to_tts(self, hyps, xlens, xs):
    """Turn n-best ASR hypotheses and their features into a TTS-style batch.

    The hypothesised label sequences become the TTS inputs, while the original
    features (processed by ``mask_by_length_and_multiply`` with the number of
    hypotheses per utterance) become the TTS targets.

    Args:
        hyps: Nested sequence of hypothesis dicts, each with a ``'yseq'``
            entry.
        xlens: Lengths of the original features.
        xs: Original features.

    Returns:
        tuple: ``(xs_tts, xlens_tts, ys_tts, labels, ylens_tts)``.

    """
    # flatten the n-best lists into one list of label tensors
    seqs = []
    for nbest in hyps:
        for hyp in nbest:
            seqs.append(torch.tensor(hyp['yseq']))

    # log every hypothesis as text, skipping the -1 entries
    for i, seq in enumerate(seqs):
        chars = [self.char_list[int(idx)] for idx in seq if int(idx) != -1]
        seq_hat_text = "".join(chars).replace('<space>', ' ')
        logging.info("prediction[%d]: " % i + seq_hat_text)

    # TTS inputs: hypothesis lengths in descending order and the
    # zero-padded hypotheses themselves
    seq_lengths = np.array([seq.shape[0] for seq in seqs])
    xlens_tts = torch.from_numpy(seq_lengths).long().to(self.device)
    xlens_tts = sorted(xlens_tts, reverse=True)
    xs_tts = pad_list([seq.long() for seq in seqs], 0).to(self.device)

    # number of hypotheses actually present for the first utterance
    reduced_best = len(hyps[0])
    logging.info("nbest is %d", reduced_best)

    # TTS targets are derived from the original features
    xs, xlens = mask_by_length_and_multiply(xs, xlens, 0, reduced_best)
    ylens_tts = xlens
    ys_tts = xs

    # stop-prediction labels: 1 from the last valid step of each target on
    labels = ys_tts.new_zeros(ys_tts.size(0), ys_tts.size(1))
    for row, ylen in enumerate(ylens_tts):
        labels[row, ylen - 1:] = 1.0

    return xs_tts, xlens_tts, ys_tts, labels, ylens_tts
def __call__(self, batch, device=torch.device("cpu")):
    """Transform a batch and send it to a device.

    Args:
        batch (list): The batch to transform; ``batch[0]`` unpacks to
            ``(xs, ys)``.
        device (torch.device): The device to send to.

    Returns:
        tuple: ``(xs_pad, ilens, ys_pad, ys_pad_asr)``. The first three come
            from the parent converter; ``ys_pad_asr`` is ``None`` unless
            ``self.use_source_text`` is set.

    """
    _, targets = batch[0]
    # NOTE(review): the targets are deep-copied before delegating,
    # presumably so the parent call cannot affect them - confirm.
    source_targets = copy.deepcopy(targets)
    xs_pad, ilens, ys_pad = super().__call__(batch, device)

    if not self.use_source_text:
        return xs_pad, ilens, ys_pad, None

    # second element of every target, padded with ``self.ignore_id``
    source_tensors = [
        torch.from_numpy(np.array(target[1])).long() for target in source_targets
    ]
    ys_pad_asr = pad_list(source_tensors, self.ignore_id).to(device)
    return xs_pad, ilens, ys_pad, ys_pad_asr
def test_ctc_loss(in_length, out_length, ctc_type):
    """Compare a torch-side CTC loss with chainer's CTC loss on random data.

    Args:
        in_length: Input lengths, one per utterance.
        out_length: Label lengths, one per utterance.
        ctc_type: One of ``"warpctc"``, ``"builtin"``, ``"cudnnctc"`` or
            ``"gtnctc"``; selects the torch-side implementation.
    """
    pytest.importorskip("torch")

    if ctc_type == "warpctc":
        pytest.importorskip("warpctc_pytorch")
        import warpctc_pytorch

        torch_ctcloss = warpctc_pytorch.CTCLoss(size_average=True)
    elif ctc_type in ("builtin", "cudnnctc"):
        if LooseVersion(torch.__version__) < LooseVersion("1.0"):
            pytest.skip("pytorch < 1.0 doesn't support CTCLoss")
        summed_ctc = torch.nn.CTCLoss(reduction="sum")

        def torch_ctcloss(th_pred, th_target, th_ilen, th_olen):
            log_probs = th_pred.log_softmax(2)
            total = summed_ctc(log_probs, th_target, th_ilen, th_olen)
            # Batch-size average
            return total / log_probs.size(1)

    elif ctc_type == "gtnctc":
        pytest.importorskip("gtn")
        from espnet.nets.pytorch_backend.gtn_ctc import GTNCTCLossFunction

        gtn_ctc = GTNCTCLossFunction.apply

        def torch_ctcloss(th_pred, th_target, th_ilen, th_olen):
            label_lists = [target.tolist() for target in th_target]
            log_probs = torch.nn.functional.log_softmax(th_pred, dim=2)
            return gtn_ctc(log_probs, label_lists, th_ilen, 0, "none")

    n_out = 7
    input_length = numpy.array(in_length, dtype=numpy.int32)
    label_length = numpy.array(out_length, dtype=numpy.int32)

    # random predictions first, then random targets
    np_pred = [
        numpy.random.rand(il, n_out).astype(numpy.float32) for il in input_length
    ]
    np_target = [
        numpy.random.randint(0, n_out, size=ol, dtype=numpy.int32)
        for ol in label_length
    ]

    # chainer side
    # NOTE: np_pred[i] seems to be transposed and used axis=-1 in e2e_asr.py
    ch_pred = F.separate(F.pad_sequence(np_pred), axis=-2)
    ch_target = F.pad_sequence(np_target, padding=-1)
    ch_loss = F.connectionist_temporal_classification(
        ch_pred, ch_target, 0, input_length, label_length
    ).data

    # torch side: padded predictions with the first two axes swapped
    pred_tensors = [torch.from_numpy(pred) for pred in np_pred]
    th_pred = pad_list(pred_tensors, 0.0).transpose(0, 1)
    if ctc_type == "gtnctc":
        # gtn implementation expects targets as list
        th_target = np_target
        # keep as B x T x H for gtn
        th_pred = th_pred.transpose(0, 1)
    else:
        th_target = torch.from_numpy(numpy.concatenate(np_target))
    th_ilen = torch.from_numpy(input_length)
    th_olen = torch.from_numpy(label_length)
    th_loss = torch_ctcloss(th_pred, th_target, th_ilen, th_olen).numpy()

    numpy.testing.assert_allclose(th_loss, ch_loss, 0.05)
def __call__(self, batch, device=torch.device("cpu")):
    """Transform a batch into model inputs, shifted-frame targets and labels.

    Args:
        batch (list): Single-element list whose item is the pair ``(xs, ys)``.
            When ``xs`` has exactly two elements they are treated as separate
            input and output feature collections.
        device (torch.device): The device to send to.

    Returns:
        tuple: ``(xs_pad_in, xs_pad_out, ilens, ys_pad)``. ``ilens`` is
            reduced by ``self.tnum``.

    """
    tnum = self.tnum

    def _to_dev(tensor):
        return tensor.to(device, dtype=self.dtype)

    def _shifted_frames(x):
        # stack ``tnum`` copies of ``x`` along a new axis 1, the i-th one
        # advanced by ``i + 1`` frames
        windows = []
        for i in range(tnum):
            stop = i + 1 - tnum
            # a stop index of 0 would give an empty slice, so run to the end
            window = x[i + 1:] if stop == 0 else x[i + 1:stop]
            windows.append(torch.from_numpy(window).float())
        return torch.stack(windows, dim=1)

    # the whole mini-batch is wrapped in a one-element list
    assert len(batch) == 1
    xs, ys = batch[0]

    # keep every ``subsampling_factor``-th frame along the first axis
    if self.subsampling_factor > 1:
        xs = [x[::self.subsampling_factor, :] for x in xs]

    if len(xs) == 2:
        logging.info("input and target are different form by transform")
        xs_in, xs_out = xs[0], xs[1]
    else:
        xs_in = xs_out = xs

    # length of every input sequence
    lengths = np.array([x.shape[0] for x in xs_in])

    if tnum > 0:
        # inputs lose their last ``tnum`` frames
        in_tensors = [torch.from_numpy(x[:-tnum]).float() for x in xs_in]
        out_tensors = [_shifted_frames(x) for x in xs_out]
    else:
        in_tensors = [torch.from_numpy(x).float() for x in xs_in]
        out_tensors = [torch.from_numpy(x).float().unsqueeze(1) for x in xs_out]

    xs_pad_in = _to_dev(pad_list(in_tensors, 0))
    xs_pad_out = _to_dev(pad_list(out_tensors, 0))

    ilens = torch.from_numpy(lengths).to(device) - tnum

    # NOTE: this is for multi-output (e.g., speech translation):
    # a tuple target contributes only its first element.
    label_tensors = []
    for y in ys:
        raw = np.array(y[0][:]) if isinstance(y, tuple) else y
        label_tensors.append(torch.from_numpy(raw).long())
    ys_pad = pad_list(label_tensors, self.ignore_id).to(device)

    return xs_pad_in, xs_pad_out, ilens, ys_pad
def recog(args):
    """Decode with the given args.

    Loads the trained model (and optional character/word language models),
    decodes every utterance listed in ``args.recog_json`` and writes the
    results as JSON to ``args.result_label``.

    Args:
        args (namespace): The program arguments. The fields read here are
            ``model``, ``rnnlm``, ``rnnlm_conf``, ``word_rnnlm``,
            ``word_rnnlm_conf``, ``ngpu``, ``recog_json``,
            ``preprocess_conf``, ``batchsize``, ``streaming_mode``,
            ``streaming_window``, ``streaming_min_blank_dur``, ``nbest`` and
            ``result_label``.

    """
    set_deterministic_pytorch(args)
    model, train_args = load_trained_model(args.model)
    assert isinstance(model, ASRInterface)
    model.recog_args = args
    model.eval()

    # read rnnlm
    if args.rnnlm:
        rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
        rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(
                len(train_args.char_list), rnnlm_args.layer, rnnlm_args.unit
            )
        )
        torch_load(args.rnnlm, rnnlm)
        rnnlm.eval()
    else:
        rnnlm = None

    if args.word_rnnlm:
        rnnlm_args = get_model_conf(args.word_rnnlm, args.word_rnnlm_conf)
        word_dict = rnnlm_args.char_list_dict
        char_dict = {x: i for i, x in enumerate(train_args.char_list)}
        word_rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(len(word_dict), rnnlm_args.layer, rnnlm_args.unit)
        )
        torch_load(args.word_rnnlm, word_rnnlm)
        word_rnnlm.eval()

        # combine with the character LM when one was loaded, otherwise use
        # the word LM alone
        if rnnlm is not None:
            rnnlm = lm_pytorch.ClassifierWithState(
                extlm_pytorch.MultiLevelLM(
                    word_rnnlm.predictor, rnnlm.predictor, word_dict, char_dict
                )
            )
        else:
            rnnlm = lm_pytorch.ClassifierWithState(
                extlm_pytorch.LookAheadWordLM(
                    word_rnnlm.predictor, word_dict, char_dict
                )
            )

    # gpu
    if args.ngpu == 1:
        gpu_id = list(range(args.ngpu))
        logging.info('gpu id: ' + str(gpu_id))
        model.cuda()
        if rnnlm:
            rnnlm.cuda()

    # read json data
    with open(args.recog_json, 'rb') as f:
        js = json.load(f)['utts']
    new_js = {}

    # the command-line preprocess_conf takes precedence over the one stored
    # with the trained model
    load_inputs_and_targets = LoadInputsAndTargets(
        mode='asr',
        load_output=False,
        sort_in_input_length=False,
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None
        else args.preprocess_conf,
        preprocess_args={'train': False},
    )

    if args.batchsize == 0:
        # utterance-by-utterance decoding
        with torch.no_grad():
            for idx, name in enumerate(js.keys(), 1):
                # The name is passed as a logging argument rather than being
                # concatenated into the format string, so a '%' inside an
                # utterance name cannot break the message formatting.
                logging.info('(%d/%d) decoding %s', idx, len(js.keys()), name)
                batch = [(name, js[name])]
                feat = load_inputs_and_targets(batch)[0][0]
                if args.streaming_mode == 'window':
                    logging.info(
                        'Using streaming recognizer with window size %d frames',
                        args.streaming_window)
                    se2e = WindowStreamingE2E(
                        e2e=model, recog_args=args, rnnlm=rnnlm)
                    for i in range(0, feat.shape[0], args.streaming_window):
                        logging.info('Feeding frames %d - %d', i,
                                     i + args.streaming_window)
                        se2e.accept_input(feat[i:i + args.streaming_window])
                    logging.info('Running offline attention decoder')
                    se2e.decode_with_attention_offline()
                    logging.info('Offline attention decoder finished')
                    nbest_hyps = se2e.retrieve_recognition()
                elif args.streaming_mode == 'segment':
                    logging.info(
                        'Using streaming recognizer with threshold value %d',
                        args.streaming_min_blank_dur)
                    # accumulators for the per-segment hypotheses
                    nbest_hyps = []
                    for n in range(args.nbest):
                        nbest_hyps.append({'yseq': [], 'score': 0.0})
                    se2e = SegmentStreamingE2E(
                        e2e=model, recog_args=args, rnnlm=rnnlm)
                    # features are fed in chunks of prod(model.subsample) frames
                    r = np.prod(model.subsample)
                    for i in range(0, feat.shape[0], r):
                        hyps = se2e.accept_input(feat[i:i + r])
                        if hyps is not None:
                            # first and last symbols of the best hypothesis
                            # are not included in the logged text
                            text = ''.join([
                                train_args.char_list[int(x)]
                                for x in hyps[0]['yseq'][1:-1]
                                if int(x) != -1
                            ])
                            text = text.replace(
                                '\u2581', ' ').strip()  # for SentencePiece
                            text = text.replace(model.space, ' ')
                            text = text.replace(model.blank, '')
                            logging.info(text)
                            for n in range(args.nbest):
                                nbest_hyps[n]['yseq'].extend(hyps[n]['yseq'])
                                nbest_hyps[n]['score'] += hyps[n]['score']
                else:
                    nbest_hyps = model.recognize(
                        feat, args, train_args.char_list, rnnlm)
                new_js[name] = add_results_to_json(
                    js[name], nbest_hyps, train_args.char_list)
    else:
        def grouper(n, iterable, fillvalue=None):
            # chunk ``iterable`` into tuples of ``n`` items, padding the last
            # tuple with ``fillvalue``
            kargs = [iter(iterable)] * n
            return zip_longest(*kargs, fillvalue=fillvalue)

        # sort data: longest input first
        keys = list(js.keys())
        feat_lens = [js[key]['input'][0]['shape'][0] for key in keys]
        sorted_index = sorted(range(len(feat_lens)),
                              key=lambda i: -feat_lens[i])
        keys = [keys[i] for i in sorted_index]

        with torch.no_grad():
            for names in grouper(args.batchsize, keys, None):
                # drop the fill values of the last, incomplete group
                names = [name for name in names if name]
                batch = [(name, js[name]) for name in names]
                feats = load_inputs_and_targets(batch)[0]
                if train_args.slu_model:
                    # SLU models produce one embedding per utterance, which
                    # is stored in place of recognition results
                    xs = feats
                    ilens = np.fromiter((xx.shape[0] for xx in xs),
                                        dtype=np.int64)
                    xs = [
                        to_device(model, to_torch_tensor(xx).float())
                        for xx in xs
                    ]
                    xs_pad = pad_list(xs, 0.0)
                    embeddings = model(xs_pad, ilens, None).cpu().numpy()
                    for i in range(len(batch)):
                        new_js[batch[i][0]] = embeddings[i].tolist()
                else:
                    nbest_hyps = model.recognize_batch(
                        feats, args, train_args.char_list, rnnlm=rnnlm)
                    for i, nbest_hyp in enumerate(nbest_hyps):
                        name = names[i]
                        new_js[name] = add_results_to_json(
                            js[name], nbest_hyp, train_args.char_list)

    with open(args.result_label, 'wb') as f:
        f.write(
            json.dumps({
                'utts': new_js
            }, indent=4, ensure_ascii=False, sort_keys=True).encode('utf_8'))