Example #1
    def inference(self, x, lengths=None):
        """Inference step

        Find the most likely mean and variance

        Args:
            x (torch.Tensor): the input tensor
            lengths (torch.Tensor): the lengths of the input tensor

        Returns:
            tuple: mean and variance of the output features
        """
        log_pi, log_sigma, mu = self.forward(x, lengths)
        sigma, mu = mdn_get_most_probable_sigma_and_mu(log_pi, log_sigma, mu)
        return mu, sigma
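
For reference, here is a minimal sketch of what mdn_get_most_probable_sigma_and_mu computes (an illustration, not the nnsvs implementation): per frame, the mixture component with the largest log weight is selected, and its sigma and mu are returned. Shapes are assumed to be log_pi: (B, T, G) and log_sigma/mu: (B, T, G, D_out).

import torch

def most_probable_sigma_and_mu(log_pi, log_sigma, mu):
    B, T, G, D = mu.shape
    # Index of the most probable mixture component per frame: (B, T)
    idx = log_pi.argmax(dim=2)
    # Expand for gathering along the mixture axis: (B, T, 1, D)
    idx = idx[:, :, None, None].expand(B, T, 1, D)
    sigma = log_sigma.exp().gather(2, idx).squeeze(2)  # (B, T, D)
    mu = mu.gather(2, idx).squeeze(2)                  # (B, T, D)
    return sigma, mu
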
Example #2
    def inference(self, x, lengths=None):
        """Inference step

        Args:
            x (torch.Tensor): input features
            lengths (torch.Tensor): lengths of input features

        Returns:
            tuple: (mu, sigma) if use_mdn, the output features otherwise
        """
        if self.use_mdn:
            (log_pi, log_sigma, mu), _ = self(x, lengths)
            sigma, mu = mdn_get_most_probable_sigma_and_mu(log_pi, log_sigma, mu)
            return mu, sigma
        else:
            return self(x, lengths)[0]
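
A hypothetical call site for the method above (model and feats are assumed names; feats is a (1, T, D_in) tensor):

with torch.no_grad():
    if model.use_mdn:
        mu, sigma = model.inference(feats, [feats.shape[1]])
    else:
        out_feats = model.inference(feats, [feats.shape[1]])
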
Example #3
    def test_mdn_get_most_probable_sigma_and_mu(self):
        self.test_mdn_loss()

        pi, sigma, mu = self.model(
            torch.from_numpy(
                self.x_test.reshape(1, -1, self.d_in)).to(self.device))
        _, max_mu = mdn.mdn_get_most_probable_sigma_and_mu(pi, sigma, mu)
        max_mu = max_mu.squeeze(0).cpu().detach().numpy()
        print(max_mu.shape)

        for i, sample in enumerate(max_mu):
            lower_limit = self.y_test_range[i][0]
            upper_limit = self.y_test_range[i][1]
            assert lower_limit < sample < upper_limit
            print(
                f"sample: {sample}, lower_limit: {lower_limit}, upper_limit: {upper_limit}"
            )
Example #4
    def inference(self, x, lengths=None):
        log_pi, log_sigma, mu = self.forward(x, lengths)
        sigma, mu = mdn_get_most_probable_sigma_and_mu(log_pi, log_sigma, mu)
        return mu, sigma
Example #5
def predict_timelag(device,
                    labels,
                    timelag_model,
                    timelag_config,
                    timelag_in_scaler,
                    timelag_out_scaler,
                    binary_dict,
                    continuous_dict,
                    pitch_indices=None,
                    log_f0_conditioning=True,
                    allowed_range=[-20, 20],
                    allowed_range_rest=[-40, 40]):
    # round start/end times just in case.
    labels.round_()

    # Extract note-level labels
    note_indices = get_note_indices(labels)
    note_labels = labels[note_indices]

    # Extract musical/linguistic context
    timelag_linguistic_features = fe.linguistic_features(
        note_labels,
        binary_dict,
        continuous_dict,
        add_frame_features=False,
        subphone_features=None).astype(np.float32)

    # Adjust input features if we use log-f0 conditioning
    if log_f0_conditioning:
        if pitch_indices is None:
            raise ValueError("Pitch feature indices must be specified!")
        for idx in pitch_indices:
            timelag_linguistic_features[:, idx] = interp1d(
                _midi_to_hz(timelag_linguistic_features, idx,
                            log_f0_conditioning),
                kind="slinear")

    # Normalization
    timelag_linguistic_features = timelag_in_scaler.transform(
        timelag_linguistic_features)
    if isinstance(timelag_in_scaler, MinMaxScaler):
        # clip to feature range
        timelag_linguistic_features = np.clip(
            timelag_linguistic_features, timelag_in_scaler.feature_range[0],
            timelag_in_scaler.feature_range[1])

    # Convert to a batched torch tensor: (1, T, D_in)
    x = torch.from_numpy(timelag_linguistic_features).unsqueeze(0).to(device)

    # Run model
    if timelag_model.prediction_type() == PredictionType.PROBABILISTIC:
        # (B, T, D_out)
        log_pi, log_sigma, mu = timelag_model.inference(x, [x.shape[1]])
        if np.any(timelag_config.has_dynamic_features):
            max_sigma, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = max_sigma.squeeze(
                0).cpu().data.numpy()**2 * timelag_out_scaler.var_
            max_mu = timelag_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
            # (T, D_out) -> (T, static_dim)
            pred_timelag = multi_stream_mlpg(
                max_mu, max_sigma_sq, get_windows(timelag_config.num_windows),
                timelag_config.stream_sizes,
                timelag_config.has_dynamic_features)
        else:
            _, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            pred_timelag = timelag_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
    else:
        # (T, D_out)
        pred_timelag = timelag_model.inference(
            x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        # Apply denormalization
        pred_timelag = timelag_out_scaler.inverse_transform(pred_timelag)
        if np.any(timelag_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_timelag = multi_stream_mlpg(
                pred_timelag, timelag_out_scaler.var_,
                get_windows(timelag_config.num_windows),
                timelag_config.stream_sizes,
                timelag_config.has_dynamic_features)

    # Rounding
    pred_timelag = np.round(pred_timelag)

    # Clip to the allowed range
    for idx in range(len(pred_timelag)):
        if _is_silence(note_labels.contexts[idx]):
            pred_timelag[idx] = np.clip(pred_timelag[idx],
                                        allowed_range_rest[0],
                                        allowed_range_rest[1])
        else:
            pred_timelag[idx] = np.clip(pred_timelag[idx], allowed_range[0],
                                        allowed_range[1])

    # frames -> 100 ns
    pred_timelag *= 50000

    return pred_timelag
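
The final factor converts frames to the 100 ns units used by HTS-style full-context labels: with the 5 ms frame shift assumed throughout these examples, one frame is 5 ms = 50,000 x 100 ns. A quick sanity check:

# 5 ms frame shift expressed in 100 ns units
frame_shift_sec = 0.005
assert round(frame_shift_sec / 100e-9) == 50000
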
Example #6
def predict_acoustic(device,
                     labels,
                     acoustic_model,
                     acoustic_config,
                     acoustic_in_scaler,
                     acoustic_out_scaler,
                     binary_dict,
                     continuous_dict,
                     subphone_features="coarse_coding",
                     pitch_indices=None,
                     log_f0_conditioning=True):

    # Musical/linguistic features
    linguistic_features = fe.linguistic_features(
        labels,
        binary_dict,
        continuous_dict,
        add_frame_features=True,
        subphone_features=subphone_features)

    if log_f0_conditioning:
        if pitch_indices is None:
            raise ValueError("Pitch feature indices must be specified!")
        for idx in pitch_indices:
            linguistic_features[:, idx] = interp1d(
                _midi_to_hz(linguistic_features, idx, log_f0_conditioning),
                kind="slinear")

    # Apply normalization
    linguistic_features = acoustic_in_scaler.transform(linguistic_features)
    if isinstance(acoustic_in_scaler, MinMaxScaler):
        # clip to feature range
        linguistic_features = np.clip(linguistic_features,
                                      acoustic_in_scaler.feature_range[0],
                                      acoustic_in_scaler.feature_range[1])

    # Predict acoustic features
    x = torch.from_numpy(linguistic_features).float().to(device)
    x = x.view(1, -1, x.size(-1))

    if acoustic_model.prediction_type() == PredictionType.PROBABILISTIC:
        log_pi, log_sigma, mu = acoustic_model.inference(x, [x.shape[1]])
        if np.any(acoustic_config.has_dynamic_features):
            # (B, T, D_out)
            max_sigma, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)

            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = max_sigma.squeeze(
                0).cpu().data.numpy()**2 * acoustic_out_scaler.var_
            max_mu = acoustic_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())

            # (T, D_out) -> (T, static_dim)
            pred_acoustic = multi_stream_mlpg(
                max_mu, max_sigma_sq, get_windows(acoustic_config.num_windows),
                acoustic_config.stream_sizes,
                acoustic_config.has_dynamic_features)
        else:
            _, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            pred_acoustic = acoustic_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
    else:
        # (T, D_out)
        pred_acoustic = acoustic_model.inference(
            x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        # Apply denormalization
        pred_acoustic = acoustic_out_scaler.inverse_transform(pred_acoustic)
        if np.any(acoustic_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_acoustic = multi_stream_mlpg(
                pred_acoustic, acoustic_out_scaler.var_,
                get_windows(acoustic_config.num_windows),
                acoustic_config.stream_sizes,
                acoustic_config.has_dynamic_features)

    return pred_acoustic
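
When a stream carries dynamic features, multi_stream_mlpg runs MLPG per stream using static/delta/delta-delta windows. A plausible sketch of the window setup consumed by MLPG, in nnmnkwii-style (pad_left, pad_right, coeffs) triples (illustrative; get_windows may differ):

import numpy as np
from nnmnkwii import paramgen

windows = [
    (0, 0, np.array([1.0])),             # static
    (1, 1, np.array([-0.5, 0.0, 0.5])),  # delta
    (1, 1, np.array([1.0, -2.0, 1.0])),  # delta-delta
]
# For a single stream with means/variances of shape (T, 3 * static_dim):
# static_traj = paramgen.mlpg(means, variances, windows)  # (T, static_dim)
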
Example #7
def predict_duration(device,
                     labels,
                     duration_model,
                     duration_config,
                     duration_in_scaler,
                     duration_out_scaler,
                     lag,
                     binary_dict,
                     continuous_dict,
                     pitch_indices=None,
                     log_f0_conditioning=True):
    # Extract musical/linguistic features
    duration_linguistic_features = fe.linguistic_features(
        labels,
        binary_dict,
        continuous_dict,
        add_frame_features=False,
        subphone_features=None).astype(np.float32)

    if log_f0_conditioning:
        if pitch_indices is None:
            raise ValueError("Pitch feature indices must be specified!")
        for idx in pitch_indices:
            duration_linguistic_features[:, idx] = interp1d(
                _midi_to_hz(duration_linguistic_features, idx,
                            log_f0_conditioning),
                kind="slinear")

    # Apply normalization
    duration_linguistic_features = duration_in_scaler.transform(
        duration_linguistic_features)
    if isinstance(duration_in_scaler, MinMaxScaler):
        # clip to feature range
        duration_linguistic_features = np.clip(
            duration_linguistic_features, duration_in_scaler.feature_range[0],
            duration_in_scaler.feature_range[1])

    # Apply model
    x = torch.from_numpy(duration_linguistic_features).float().to(device)
    x = x.view(1, -1, x.size(-1))

    if duration_model.prediction_type() == PredictionType.PROBABILISTIC:
        # (B, T, D_out)
        log_pi, log_sigma, mu = duration_model.inference(x, [x.shape[1]])
        if np.any(duration_config.has_dynamic_features):
            max_sigma, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = max_sigma.squeeze(
                0).cpu().data.numpy()**2 * duration_out_scaler.var_
            max_mu = duration_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())

            # (T, D_out) -> (T, static_dim)
            pred_durations = multi_stream_mlpg(
                max_mu, max_sigma_sq, get_windows(duration_config.num_windows),
                duration_config.stream_sizes,
                duration_config.has_dynamic_features)
        else:
            _, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            pred_durations = duration_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
    else:
        # (T, D_out)
        pred_durations = duration_model.inference(
            x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        # Apply denormalization
        pred_durations = duration_out_scaler.inverse_transform(pred_durations)
        if np.any(duration_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_durations = multi_stream_mlpg(
                pred_durations, duration_out_scaler.var_,
                get_windows(duration_config.num_windows),
                duration_config.stream_sizes,
                duration_config.has_dynamic_features)

    # Ensure at least one frame for each phone
    pred_durations[pred_durations <= 0] = 1
    pred_durations = np.round(pred_durations)

    return pred_durations
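
Taken together, the three predict_* functions above form the prediction pipeline of an nnsvs-style system; a hypothetical call order (argument lists abbreviated):

# lag = predict_timelag(device, labels, timelag_model, timelag_config, ...)
# durations = predict_duration(device, labels, duration_model, ..., lag, ...)
# acoustic = predict_acoustic(device, modified_labels, acoustic_model, ...)
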
Example #8
def train_step(
    model,
    model_config,
    optimizer,
    grad_scaler,
    train,
    in_feats,
    out_feats,
    lengths,
    out_scaler,
    feats_criterion="mse",
    pitch_reg_dyn_ws=1.0,
    pitch_reg_weight=1.0,
):
    model.train() if train else model.eval()
    optimizer.zero_grad()
    log_metrics = {}

    if feats_criterion in ["l2", "mse"]:
        criterion = nn.MSELoss(reduction="none")
    elif feats_criterion in ["l1", "mae"]:
        criterion = nn.L1Loss(reduction="none")
    else:
        raise RuntimeError(f"Unsupported criterion: {feats_criterion}")

    prediction_type = (
        model.module.prediction_type()
        if isinstance(model, nn.DataParallel)
        else model.prediction_type()
    )

    # Apply preprocess if required (e.g., FIR filter for shallow AR)
    # defaults to no-op
    if isinstance(model, nn.DataParallel):
        out_feats = model.module.preprocess_target(out_feats)
    else:
        out_feats = model.preprocess_target(out_feats)

    # Run forward
    with autocast(enabled=grad_scaler is not None):
        outs = model(in_feats, lengths, out_feats)
        if isinstance(outs, tuple) and len(outs) == 2:
            pred_out_feats, lf0_residual = outs
        else:
            pred_out_feats, lf0_residual = outs, None

    # Mask (B, T, 1)
    mask = make_non_pad_mask(lengths).unsqueeze(-1).to(in_feats.device)

    # Compute loss
    if prediction_type == PredictionType.PROBABILISTIC:
        pi, sigma, mu = pred_out_feats

        # (B, max(T)) or (B, max(T), D_out)
        mask_ = mask if len(pi.shape) == 4 else mask.squeeze(-1)
        # Compute loss and apply mask
        with autocast(enabled=grad_scaler is not None):
            loss_feats = mdn_loss(pi, sigma, mu, out_feats, reduce=False)
            loss_feats = loss_feats.masked_select(mask_).mean()
    else:
        with autocast(enabled=grad_scaler is not None):
            # NOTE: multiple predictions
            if isinstance(pred_out_feats, list):
                loss_feats = 0
                for pred_out_feats_ in pred_out_feats:
                    loss_feats += criterion(
                        pred_out_feats_.masked_select(mask),
                        out_feats.masked_select(mask),
                    ).mean()
            else:
                loss_feats = criterion(
                    pred_out_feats.masked_select(mask), out_feats.masked_select(mask)
                ).mean()

    # Pitch regularization
    # NOTE: L1 loss seems to work better than MSE loss in my experiments;
    # L2 loss could also be used, as suggested in the Sinsy paper
    if lf0_residual is not None:
        with autocast(enabled=grad_scaler is not None):
            if isinstance(lf0_residual, list):
                loss_pitch = 0
                for lf0_residual_ in lf0_residual:
                    loss_pitch += (
                        (pitch_reg_dyn_ws * lf0_residual_.abs())
                        .masked_select(mask)
                        .mean()
                    )
            else:
                loss_pitch = (
                    (pitch_reg_dyn_ws * lf0_residual.abs()).masked_select(mask).mean()
                )
    else:
        loss_pitch = torch.tensor(0.0).to(in_feats.device)

    loss = loss_feats + pitch_reg_weight * loss_pitch

    if prediction_type == PredictionType.PROBABILISTIC:
        with torch.no_grad():
            pred_out_feats_ = mdn_get_most_probable_sigma_and_mu(pi, sigma, mu)[1]
    else:
        if isinstance(pred_out_feats, list):
            pred_out_feats_ = pred_out_feats[-1]
        else:
            pred_out_feats_ = pred_out_feats
    distortions = compute_distortions(
        pred_out_feats_, out_feats, lengths, out_scaler, model_config
    )

    if train:
        if grad_scaler is not None:
            grad_scaler.scale(loss).backward()
            grad_scaler.step(optimizer)
            grad_scaler.update()
        else:
            loss.backward()
            optimizer.step()

    log_metrics.update(distortions)
    log_metrics.update(
        {
            "Loss": loss.item(),
            "Loss_Feats": loss_feats.item(),
            "Loss_Pitch": loss_pitch.item(),
        }
    )

    return loss, log_metrics
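
For context, a minimal sketch of an MDN negative log-likelihood with diagonal Gaussians, illustrating what mdn_loss(pi, sigma, mu, out_feats, reduce=False) computes per frame (assuming pi and sigma are in the log domain, as in the inference examples above; not the nnsvs implementation):

import math
import torch

def mdn_nll(log_pi, log_sigma, mu, target):
    # log_pi: (B, T, G), log_sigma/mu: (B, T, G, D), target: (B, T, D)
    t = target.unsqueeze(2)  # (B, T, 1, D)
    # Diagonal Gaussian log-density per component, summed over D
    log_prob = -0.5 * (
        ((t - mu) / log_sigma.exp()) ** 2 + 2 * log_sigma + math.log(2 * math.pi)
    ).sum(-1)
    # Mixture log-likelihood via log-sum-exp over components: (B, T)
    return -torch.logsumexp(log_pi + log_prob, dim=2)
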
Example #9
def train_step(
    model,
    optimizer,
    grad_scaler,
    train,
    in_feats,
    out_feats,
    lengths,
    out_scaler,
    feats_criterion="mse",
    stream_wise_loss=False,
    stream_weights=None,
    stream_sizes=None,
):
    model.train() if train else model.eval()
    optimizer.zero_grad()

    if feats_criterion in ["l2", "mse"]:
        criterion = nn.MSELoss(reduction="none")
    elif feats_criterion in ["l1", "mae"]:
        criterion = nn.L1Loss(reduction="none")
    else:
        raise RuntimeError(f"Unsupported criterion: {feats_criterion}")

    prediction_type = (model.module.prediction_type() if isinstance(
        model, nn.DataParallel) else model.prediction_type())

    # Apply preprocess if required (e.g., FIR filter for shallow AR)
    # defaults to no-op
    if isinstance(model, nn.DataParallel):
        out_feats = model.module.preprocess_target(out_feats)
    else:
        out_feats = model.preprocess_target(out_feats)

    # Run forward
    with autocast(enabled=grad_scaler is not None):
        pred_out_feats = model(in_feats, lengths)

    # Mask (B, T, 1)
    mask = make_non_pad_mask(lengths).unsqueeze(-1).to(in_feats.device)

    # Compute loss
    if prediction_type == PredictionType.PROBABILISTIC:
        pi, sigma, mu = pred_out_feats
        # (B, max(T)) or (B, max(T), D_out)
        mask_ = mask if len(pi.shape) == 4 else mask.squeeze(-1)
        # Compute loss and apply mask
        with autocast(enabled=grad_scaler is not None):
            loss = mdn_loss(pi, sigma, mu, out_feats, reduce=False)
        loss = loss.masked_select(mask_).mean()
    else:
        if stream_wise_loss:
            w = get_stream_weight(stream_weights,
                                  stream_sizes).to(in_feats.device)
            streams = split_streams(out_feats, stream_sizes)
            pred_streams = split_streams(pred_out_feats, stream_sizes)
            loss = 0
            for pred_stream, stream, sw in zip(pred_streams, streams, w):
                with autocast(enabled=grad_scaler is not None):
                    loss += (sw * criterion(pred_stream.masked_select(mask),
                                            stream.masked_select(mask)).mean())
        else:
            with autocast(enabled=grad_scaler is not None):
                loss = criterion(pred_out_feats.masked_select(mask),
                                 out_feats.masked_select(mask)).mean()

    if prediction_type == PredictionType.PROBABILISTIC:
        with torch.no_grad():
            pred_out_feats_ = mdn_get_most_probable_sigma_and_mu(
                pi, sigma, mu)[1]
    else:
        pred_out_feats_ = pred_out_feats
    distortions = compute_distortions(pred_out_feats_, out_feats, lengths,
                                      out_scaler)

    if train:
        if grad_scaler is not None:
            grad_scaler.scale(loss).backward()
            grad_scaler.step(optimizer)
            grad_scaler.update()
        else:
            loss.backward()
            optimizer.step()

    return loss, distortions
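
The stream-wise branch relies on split_streams to cut the feature axis into per-stream chunks; a plausible sketch (illustrative, not necessarily the nnsvs code), where stream_sizes could be e.g. [180, 3, 1, 15] for mgc/lf0/vuv/bap streams:

import torch

def split_streams(feats, stream_sizes):
    # (B, T, D) -> list of (B, T, D_stream) tensors
    return list(torch.split(feats, stream_sizes, dim=-1))
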
Example #10
File: train_util.py  Project: r9y9/nnsvs
def eval_spss_model(
    step,
    netG,
    in_feats,
    out_feats,
    lengths,
    model_config,
    out_scaler,
    writer,
    sr,
    trajectory_smoothing=True,
    trajectory_smoothing_cutoff=50,
):
    # make sure to be in eval mode
    netG.eval()
    is_autoregressive = (netG.module.is_autoregressive() if isinstance(
        netG, nn.DataParallel) else netG.is_autoregressive())
    prediction_type = (netG.module.prediction_type() if isinstance(
        netG, nn.DataParallel) else netG.prediction_type())
    utt_indices = [-1, -2, -3]
    utt_indices = utt_indices[:min(3, len(in_feats))]

    if np.any(model_config.has_dynamic_features):
        static_stream_sizes = get_static_stream_sizes(
            model_config.stream_sizes,
            model_config.has_dynamic_features,
            model_config.num_windows,
        )
    else:
        static_stream_sizes = model_config.stream_sizes

    for utt_idx in utt_indices:
        out_feats_denorm_ = out_scaler.inverse_transform(
            out_feats[utt_idx, :lengths[utt_idx]].unsqueeze(0))
        mgc, lf0, vuv, bap = get_static_features(
            out_feats_denorm_,
            model_config.num_windows,
            model_config.stream_sizes,
            model_config.has_dynamic_features,
        )[:4]
        mgc = mgc.squeeze(0).cpu().numpy()
        lf0 = lf0.squeeze(0).cpu().numpy()
        vuv = vuv.squeeze(0).cpu().numpy()
        bap = bap.squeeze(0).cpu().numpy()

        f0, spectrogram, aperiodicity = gen_world_params(
            mgc, lf0, vuv, bap, sr)
        wav = pyworld.synthesize(f0, spectrogram, aperiodicity, sr, 5)
        group = f"utt{np.abs(utt_idx)}_reference"
        wav = wav / np.abs(wav).max() if np.abs(wav).max() > 1.0 else wav
        writer.add_audio(group, wav, step, sr)

        # Run forward
        if is_autoregressive:
            outs = netG(
                in_feats[utt_idx, :lengths[utt_idx]].unsqueeze(0),
                [lengths[utt_idx]],
                out_feats[utt_idx, :lengths[utt_idx]].unsqueeze(0),
            )
        else:
            outs = netG(in_feats[utt_idx, :lengths[utt_idx]].unsqueeze(0),
                        [lengths[utt_idx]])

        # ResF0 case
        if isinstance(outs, tuple) and len(outs) == 2:
            outs, _ = outs

        if prediction_type == PredictionType.PROBABILISTIC:
            pi, sigma, mu = outs
            pred_out_feats = mdn_get_most_probable_sigma_and_mu(pi, sigma,
                                                                mu)[1]
        else:
            pred_out_feats = outs
        # NOTE: multiple outputs
        if isinstance(pred_out_feats, list):
            pred_out_feats = pred_out_feats[-1]
        if isinstance(pred_out_feats, tuple):
            pred_out_feats = pred_out_feats[0]

        if not isinstance(pred_out_feats, list):
            pred_out_feats = [pred_out_feats]

        # Run inference
        if prediction_type == PredictionType.PROBABILISTIC:
            inference_out_feats, _ = netG.inference(
                in_feats[utt_idx, :lengths[utt_idx]].unsqueeze(0),
                [lengths[utt_idx]])
        else:
            inference_out_feats = netG.inference(
                in_feats[utt_idx, :lengths[utt_idx]].unsqueeze(0),
                [lengths[utt_idx]])
        pred_out_feats.append(inference_out_feats)

        # Plot normalized input/output
        in_feats_ = in_feats[utt_idx, :lengths[utt_idx]].cpu().numpy()
        out_feats_ = out_feats[utt_idx, :lengths[utt_idx]].cpu().numpy()
        fig, ax = plt.subplots(3, 1, figsize=(8, 8))
        ax[0].set_title("Reference features")
        ax[1].set_title("Input features")
        ax[2].set_title("Predicted features")
        mesh = librosa.display.specshow(out_feats_.T,
                                        x_axis="frames",
                                        y_axis="frames",
                                        ax=ax[0],
                                        cmap="viridis")
        # NOTE: assuming normalized to N(0, 1)
        mesh.set_clim(-4, 4)
        fig.colorbar(mesh, ax=ax[0])
        mesh = librosa.display.specshow(in_feats_.T,
                                        x_axis="frames",
                                        y_axis="frames",
                                        ax=ax[1],
                                        cmap="viridis")
        mesh.set_clim(-4, 4)
        fig.colorbar(mesh, ax=ax[1])
        mesh = librosa.display.specshow(
            inference_out_feats.squeeze(0).cpu().numpy().T,
            x_axis="frames",
            y_axis="frames",
            ax=ax[2],
            cmap="viridis",
        )
        mesh.set_clim(-4, 4)
        fig.colorbar(mesh, ax=ax[2])
        for ax_ in ax:
            ax_.set_ylabel("Feature")
        plt.tight_layout()
        group = f"utt{np.abs(utt_idx)}_inference"
        writer.add_figure(f"{group}/Input-Output", fig, step)
        plt.close()

        assert len(pred_out_feats) == 2
        for idx, pred_out_feats_ in enumerate(pred_out_feats):
            pred_out_feats_ = pred_out_feats_.squeeze(0).cpu().numpy()
            pred_out_feats_denorm = (out_scaler.inverse_transform(
                torch.from_numpy(pred_out_feats_).to(
                    in_feats.device)).cpu().numpy())
            if np.any(model_config.has_dynamic_features):
                # (T, D_out) -> (T, static_dim)
                pred_out_feats_denorm = multi_stream_mlpg(
                    pred_out_feats_denorm,
                    (out_scaler.scale_**2).cpu().numpy(),
                    get_windows(model_config.num_windows),
                    model_config.stream_sizes,
                    model_config.has_dynamic_features,
                )
            pred_mgc, pred_lf0, pred_vuv, pred_bap = split_streams(
                pred_out_feats_denorm, static_stream_sizes)[:4]

            # Remove high-frequency components of mgc/bap
            # NOTE: It seems to be effective to suppress artifacts of GAN-based post-filtering
            if trajectory_smoothing:
                # modulation frequency sampling rate: 1 / 5 ms = 200 Hz
                modfs = int(1 / 0.005)
                for d in range(pred_mgc.shape[1]):
                    pred_mgc[:, d] = lowpass_filter(
                        pred_mgc[:, d],
                        modfs,
                        cutoff=trajectory_smoothing_cutoff)
                for d in range(pred_bap.shape[1]):
                    pred_bap[:, d] = lowpass_filter(
                        pred_bap[:, d],
                        modfs,
                        cutoff=trajectory_smoothing_cutoff)

            # Generated sample
            f0, spectrogram, aperiodicity = gen_world_params(
                pred_mgc, pred_lf0, pred_vuv, pred_bap, sr)
            wav = pyworld.synthesize(f0, spectrogram, aperiodicity, sr, 5)
            wav = wav / np.abs(wav).max() if np.abs(wav).max() > 1.0 else wav
            if idx == 1:
                group = f"utt{np.abs(utt_idx)}_inference"
            else:
                group = f"utt{np.abs(utt_idx)}_forward"
            writer.add_audio(group, wav, step, sr)
            plot_spsvs_params(
                step,
                writer,
                mgc,
                lf0,
                vuv,
                bap,
                pred_mgc,
                pred_lf0,
                pred_vuv,
                pred_bap,
                group=group,
                sr=sr,
            )
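
The lowpass_filter used for trajectory smoothing can be realized as a zero-phase low-pass over each parameter trajectory, sampled at the 200 Hz modulation frequency (1 / 5 ms). A minimal sketch under that assumption (nnsvs's actual implementation may differ):

from scipy.signal import butter, filtfilt

def lowpass_filter(x, fs, cutoff=50, order=5):
    # Zero-phase Butterworth low-pass; cutoff is normalized by the Nyquist rate
    b, a = butter(order, cutoff / (0.5 * fs), btype="low")
    return filtfilt(b, a, x)
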