def score(self, ys, state, x):
    """Score a token prefix ``ys`` given a single encoder output ``x``."""
    # Causal mask over the current prefix; add a batch dimension.
    ys_mask = subsequent_mask(len(ys), device=x.device).unsqueeze(0)
    logp, state = self.forward_one_step(
        ys.unsqueeze(0), ys_mask, x.unsqueeze(0), cache=state
    )
    return logp.squeeze(0), state

def score(self, ys, state, x):
    """Score."""
    ys_mask = subsequent_mask(len(ys), device=x.device).unsqueeze(0)
    if self.selfattention_layer_type != "selfattn":
        # TODO(karita): implement cache
        logging.warning(
            f"{self.selfattention_layer_type} does not support cached decoding."
        )
        state = None
    logp, state = self.forward_one_step(
        ys.unsqueeze(0), ys_mask, x.unsqueeze(0), cache=state
    )
    return logp.squeeze(0), state

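# Illustrative sketch, not part of the source: a greedy decoding loop that
# threads the cache returned by ``score`` from step to step. ``decoder``,
# ``enc_out``, ``sos_id``, ``eos_id``, and ``max_len`` are assumed names for an
# instance exposing the method above, one utterance's encoder output of shape
# (T, D), the start/end token ids, and a step limit. For non-"selfattn" layer
# types the method drops the cache, so every step recomputes the full prefix.
import torch

ys = torch.tensor([sos_id], dtype=torch.long, device=enc_out.device)
state = None  # no cache yet on the first step
for _ in range(max_len):
    # log-probabilities of the next token and the updated per-layer cache
    logp, state = decoder.score(ys, state, enc_out)
    next_token = int(logp.argmax())
    ys = torch.cat([ys, ys.new_tensor([next_token])])
    if next_token == eos_id:
        break
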
def forward(self, text, speaker_embedding=None):
    self.eval()
    x = text

    # forward encoder
    xs = x.unsqueeze(0)
    hs, _ = self.encoder(xs, None)

    # integrate speaker embedding
    if self.spk_embed_dim is not None:
        speaker_embeddings = speaker_embedding.unsqueeze(0)
        hs = self._integrate_with_spk_embed(hs, speaker_embeddings)

    # set limits of decoding length
    maxlen = int(hs.size(1) * 10.0 / self.reduction_factor)
    minlen = int(hs.size(1) * 0.0 / self.reduction_factor)

    # initialize with a single all-zero frame
    idx = 0
    ys = hs.new_zeros(1, 1, self.odim)
    outs, probs = [], []

    # forward decoder step by step
    z_cache = self.decoder.init_state(x)
    while True:
        # update index
        idx += 1

        # calculate output and stop probability at idx-th step
        y_masks = subsequent_mask(idx).unsqueeze(0).to(x.device)
        z, z_cache = self.decoder.forward_one_step(ys, y_masks, hs, cache=z_cache)
        outs += [self.feat_out(z).view(self.reduction_factor, self.odim)]
        probs += [torch.sigmoid(self.prob_out(z))[0]]

        # update next inputs with the last generated frame
        ys = torch.cat((ys, outs[-1][-1].view(1, 1, self.odim)), dim=1)

        # accumulate source-attention weights of the newest position
        att_ws_ = []
        for name, m in self.named_modules():
            if isinstance(m, MultiHeadedAttention) and "src" in name:
                att_ws_ += [m.attn[0, :, -1].unsqueeze(1)]
        if idx == 1:
            att_ws = att_ws_
        else:
            att_ws = [
                torch.cat([att_w, att_w_], dim=1)
                for att_w, att_w_ in zip(att_ws, att_ws_)
            ]

        # check whether to finish generation
        if int(sum(probs[-1] >= 0.5)) > 0 or idx >= maxlen:
            # check minimum length
            if idx < minlen:
                continue
            outs = torch.cat(outs, dim=0).unsqueeze(0).transpose(1, 2)  # (1, odim, L)
            if self.postnet is not None:
                outs = outs + self.postnet(outs)
            outs = outs.transpose(2, 1).squeeze(0)  # (L, odim)
            break

    return outs

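# Illustrative sketch, not part of the source: generating a spectrogram with the
# inference method above. ``tts_model``, ``text_ids``, and ``spk_emb`` are assumed
# names for an instance of the class above, a 1-D LongTensor of input token ids,
# and a speaker-embedding vector of size spk_embed_dim.
import torch

with torch.no_grad():
    mel = tts_model(text_ids, speaker_embedding=spk_emb)
# ``mel`` has shape (frames, odim); generation stops once any stop-token
# probability reaches 0.5 or the 10x-encoder-length cap is hit. A vocoder
# (not shown here) would then turn the spectrogram into a waveform.
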
def batch_score(self, ys, states, xs):
    """Score new token batch (required).

    Args:
        ys (torch.Tensor): torch.int64 prefix tokens (n_batch, ylen).
        states (List[Any]): Scorer states for prefix tokens.
        xs (torch.Tensor): The encoder feature that generates ys
            (n_batch, xlen, n_feat).

    Returns:
        tuple[torch.Tensor, List[Any]]: Tuple of batchified scores for the
            next token with shape `(n_batch, n_vocab)` and the next state
            list for ys.

    """
    # merge states
    n_batch = len(ys)
    n_layers = len(self.decoders)
    if states[0] is None:
        batch_state = None
    else:
        # transpose state of [batch, layer] into [layer, batch]
        batch_state = [
            torch.stack([states[b][i] for b in range(n_batch)])
            for i in range(n_layers)
        ]

    # batch decoding
    ys_mask = subsequent_mask(ys.size(-1), device=xs.device).unsqueeze(0)
    logp, states = self.forward_one_step(ys, ys_mask, xs, cache=batch_state)

    # transpose state of [layer, batch] back into [batch, layer]
    state_list = [[states[i][b] for i in range(n_layers)] for b in range(n_batch)]
    return logp, state_list

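# Illustrative sketch, not part of the source: driving ``batch_score`` for a set
# of hypotheses that share one encoder output, as a batched beam search would.
# ``decoder``, ``enc_out``, ``sos_id``, and ``n_hyps`` are assumed names.
import torch

xs = enc_out.unsqueeze(0).expand(n_hyps, -1, -1)  # (n_batch, xlen, n_feat)
ys = torch.full(
    (n_hyps, 1), sos_id, dtype=torch.long, device=enc_out.device
)  # (n_batch, ylen=1)
states = [None] * n_hyps  # one state slot per hypothesis, empty at the first step

logp, states = decoder.batch_score(ys, states, xs)  # logp: (n_batch, n_vocab)
# ``states`` is now indexed as [batch][layer]; after pruning or reordering
# hypotheses, pass the matching subset of these states into the next call.
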
def _target_mask(self, olens):
    # (B, Lmax): True at non-padded output positions
    y_masks = make_non_pad_mask(olens).to(olens.device)
    # (1, Lmax, Lmax): causal (subsequent) mask
    s_masks = subsequent_mask(y_masks.size(-1), device=y_masks.device).unsqueeze(0)
    # (B, Lmax, Lmax): decoder self-attention mask combining padding and causality
    return y_masks.unsqueeze(-2) & s_masks

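# Standalone illustration, using plain torch rather than the project's helpers,
# of the mask that ``_target_mask`` produces for output lengths [4, 2]:
import torch

olens = torch.tensor([4, 2])
L = int(olens.max())
non_pad = torch.arange(L).unsqueeze(0) < olens.unsqueeze(1)  # (B, L) non-pad mask
causal = torch.ones(L, L).tril().bool()                      # (L, L) subsequent mask
mask = non_pad.unsqueeze(-2) & causal.unsqueeze(0)           # (B, L, L)
print(mask[1].int())
# tensor([[1, 0, 0, 0],
#         [1, 1, 0, 0],
#         [1, 1, 0, 0],
#         [1, 1, 0, 0]], dtype=torch.int32)
# Row t allows attention only to positions <= t, and padded key positions
# (>= olens[b]) are masked out in every row.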