def inference(self, x, lengths=None):
    """Inference step.

    Runs the forward pass and picks the mean/variance of the most
    probable mixture component.

    Args:
        x (torch.Tensor): the input tensor
        lengths (torch.Tensor): the lengths of the input tensor

    Returns:
        tuple: (mu, sigma) of the most probable mixture component
    """
    # Forward pass yields the MDN parameters (log_pi, log_sigma, mu).
    mdn_params = self.forward(x, lengths)
    max_sigma, max_mu = mdn_get_most_probable_sigma_and_mu(*mdn_params)
    return max_mu, max_sigma
def inference(self, x, lengths=None):
    """Inference step.

    Args:
        x (torch.Tensor): input features
        lengths (torch.Tensor): lengths of input features

    Returns:
        tuple: (mu, sigma) of the most probable mixture component when
        ``use_mdn`` is set; otherwise the first element of the model output.
    """
    outs = self(x, lengths)
    # Non-MDN case: the model output's first element is the prediction.
    if not self.use_mdn:
        return outs[0]
    # MDN case: first element holds (log_pi, log_sigma, mu).
    mdn_params, _ = outs
    max_sigma, max_mu = mdn_get_most_probable_sigma_and_mu(*mdn_params)
    return max_mu, max_sigma
def test_mdn_get_most_probable_sigma_and_mu(self):
    """Check that the most probable mu lies within the expected range."""
    # Reuse the loss test as training setup for self.model.
    self.test_mdn_loss()
    x = torch.from_numpy(self.x_test.reshape(1, -1, self.d_in)).to(self.device)
    pi, sigma, mu = self.model(x)
    max_mu = mdn.mdn_get_most_probable_sigma_and_mu(pi, sigma, mu)[1]
    max_mu = max_mu.squeeze(0).cpu().detach().numpy()
    print(max_mu.shape)
    for i, sample in enumerate(max_mu):
        lower_limit = self.y_test_range[i][0]
        upper_limit = self.y_test_range[i][1]
        # Each predicted sample must fall strictly inside its target range.
        assert lower_limit < sample < upper_limit
        print(
            f"sample: {sample}, lower_limit: {lower_limit}, upper_limit: {upper_limit}"
        )
def inference(self, x, lengths=None):
    """Inference step.

    Runs the forward pass and returns the mean/variance of the most
    probable mixture component.

    Args:
        x (torch.Tensor): input tensor
        lengths (torch.Tensor): lengths of the input tensor

    Returns:
        tuple: (mu, sigma) of the most probable mixture component
    """
    mdn_params = self.forward(x, lengths)
    max_sigma, max_mu = mdn_get_most_probable_sigma_and_mu(*mdn_params)
    return max_mu, max_sigma
def predict_timelag(device, labels, timelag_model, timelag_config,
                    timelag_in_scaler, timelag_out_scaler, binary_dict,
                    continuous_dict, pitch_indices=None,
                    log_f0_conditioning=True, allowed_range=[-20, 20],
                    allowed_range_rest=[-40, 40]):
    """Predict note-level time-lag from labels.

    Extracts note-level linguistic features, normalizes them, runs the
    time-lag model, denormalizes the output (with MLPG when dynamic
    features are configured), then rounds and clips the predicted lags.

    Args:
        device: torch device to run the model on.
        labels: full-context labels (HTS-style label object).
        timelag_model: trained time-lag model.
        timelag_config: time-lag feature configuration.
        timelag_in_scaler: scaler for input linguistic features.
        timelag_out_scaler: scaler for output time-lag features.
        binary_dict, continuous_dict: question dictionaries for feature extraction.
        pitch_indices (list): indices of pitch features; required when
            ``log_f0_conditioning`` is True.
        log_f0_conditioning (bool): whether to convert pitch features to
            interpolated log-F0.
        allowed_range (list): clip range (in frames) for non-silence notes.
            NOTE(review): mutable list defaults are never mutated here, but
            tuples would be safer.
        allowed_range_rest (list): clip range (in frames) for silence notes.

    Returns:
        numpy.ndarray: predicted time-lags in 100 ns units.

    Raises:
        ValueError: if log_f0_conditioning is True but pitch_indices is None.
    """
    # round start/end times just in case.
    labels.round_()
    # Extract note-level labels (one entry per note rather than per phone)
    note_indices = get_note_indices(labels)
    note_labels = labels[note_indices]
    # Extract musical/linguistic context
    timelag_linguistic_features = fe.linguistic_features(
        note_labels, binary_dict, continuous_dict,
        add_frame_features=False,
        subphone_features=None).astype(np.float32)
    # Adjust input features if we use log-f0 conditioning
    if log_f0_conditioning:
        if pitch_indices is None:
            raise ValueError("Pitch feature indices must be specified!")
        for idx in pitch_indices:
            # Replace midi-like pitch with interpolated log-F0
            timelag_linguistic_features[:, idx] = interp1d(_midi_to_hz(
                timelag_linguistic_features, idx, log_f0_conditioning),
                kind="slinear")
    # Normalization
    timelag_linguistic_features = timelag_in_scaler.transform(
        timelag_linguistic_features)
    if isinstance(timelag_in_scaler, MinMaxScaler):
        # clip to feature range
        timelag_linguistic_features = np.clip(
            timelag_linguistic_features,
            timelag_in_scaler.feature_range[0],
            timelag_in_scaler.feature_range[1])
    # To tensor: (T, D) -> (1, T, D)
    x = torch.from_numpy(timelag_linguistic_features).unsqueeze(0).to(device)
    # Run model
    if timelag_model.prediction_type() == PredictionType.PROBABILISTIC:
        # (B, T, D_out)
        log_pi, log_sigma, mu = timelag_model.inference(x, [x.shape[1]])
        if np.any(timelag_config.has_dynamic_features):
            max_sigma, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            # variance is denormalized by the scaler's variance
            max_sigma_sq = max_sigma.squeeze(
                0).cpu().data.numpy()**2 * timelag_out_scaler.var_
            max_mu = timelag_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
            # (T, D_out) -> (T, static_dim)
            pred_timelag = multi_stream_mlpg(
                max_mu, max_sigma_sq,
                get_windows(timelag_config.num_windows),
                timelag_config.stream_sizes,
                timelag_config.has_dynamic_features)
        else:
            _, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            pred_timelag = timelag_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
    else:
        # (T, D_out)
        pred_timelag = timelag_model.inference(
            x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        # Apply denormalization
        pred_timelag = timelag_out_scaler.inverse_transform(pred_timelag)
        if np.any(timelag_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_timelag = multi_stream_mlpg(
                pred_timelag, timelag_out_scaler.var_,
                get_windows(timelag_config.num_windows),
                timelag_config.stream_sizes,
                timelag_config.has_dynamic_features)
    # Rounding
    pred_timelag = np.round(pred_timelag)
    # Clip to the allowed range; rest (silence) notes get a wider range
    for idx in range(len(pred_timelag)):
        if _is_silence(note_labels.contexts[idx]):
            pred_timelag[idx] = np.clip(pred_timelag[idx],
                                        allowed_range_rest[0],
                                        allowed_range_rest[1])
        else:
            pred_timelag[idx] = np.clip(pred_timelag[idx],
                                        allowed_range[0],
                                        allowed_range[1])
    # frames -> 100 ns
    pred_timelag *= 50000
    return pred_timelag
def predict_acoustic(device, labels, acoustic_model, acoustic_config,
                     acoustic_in_scaler, acoustic_out_scaler, binary_dict,
                     continuous_dict, subphone_features="coarse_coding",
                     pitch_indices=None, log_f0_conditioning=True):
    """Predict acoustic features from full-context labels.

    Extracts frame-level linguistic features, normalizes them, runs the
    acoustic model and denormalizes its output (applying MLPG when the
    feature config contains dynamic features).

    Args:
        device: torch device to run the model on.
        labels: full-context labels (HTS-style label object).
        acoustic_model: trained acoustic model.
        acoustic_config: acoustic feature configuration.
        acoustic_in_scaler: scaler for input linguistic features.
        acoustic_out_scaler: scaler for output acoustic features.
        binary_dict, continuous_dict: question dictionaries for feature extraction.
        subphone_features (str): subphone feature type for frame features.
        pitch_indices (list): indices of pitch features; required when
            ``log_f0_conditioning`` is True.
        log_f0_conditioning (bool): whether to convert pitch features to
            interpolated log-F0.

    Returns:
        numpy.ndarray: predicted (denormalized) acoustic features;
        (T, static_dim) when dynamic features are used, else (T, D_out).

    Raises:
        ValueError: if log_f0_conditioning is True but pitch_indices is None.
    """
    # Musical/linguistic features
    linguistic_features = fe.linguistic_features(
        labels, binary_dict, continuous_dict,
        add_frame_features=True,
        subphone_features=subphone_features)
    if log_f0_conditioning:
        # Fail with a clear error (consistent with predict_timelag) instead
        # of a raw TypeError when iterating over None.
        if pitch_indices is None:
            raise ValueError("Pitch feature indices must be specified!")
        for idx in pitch_indices:
            linguistic_features[:, idx] = interp1d(_midi_to_hz(
                linguistic_features, idx, log_f0_conditioning),
                kind="slinear")
    # Apply normalization
    linguistic_features = acoustic_in_scaler.transform(linguistic_features)
    if isinstance(acoustic_in_scaler, MinMaxScaler):
        # clip to feature range
        linguistic_features = np.clip(linguistic_features,
                                      acoustic_in_scaler.feature_range[0],
                                      acoustic_in_scaler.feature_range[1])
    # Predict acoustic features: (T, D) -> (1, T, D)
    x = torch.from_numpy(linguistic_features).float().to(device)
    x = x.view(1, -1, x.size(-1))
    if acoustic_model.prediction_type() == PredictionType.PROBABILISTIC:
        log_pi, log_sigma, mu = acoustic_model.inference(x, [x.shape[1]])
        if np.any(acoustic_config.has_dynamic_features):
            # (B, T, D_out)
            max_sigma, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = max_sigma.squeeze(
                0).cpu().data.numpy()**2 * acoustic_out_scaler.var_
            max_mu = acoustic_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
            # (T, D_out) -> (T, static_dim)
            pred_acoustic = multi_stream_mlpg(
                max_mu, max_sigma_sq,
                get_windows(acoustic_config.num_windows),
                acoustic_config.stream_sizes,
                acoustic_config.has_dynamic_features)
        else:
            _, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            pred_acoustic = acoustic_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
    else:
        # (T, D_out)
        pred_acoustic = acoustic_model.inference(
            x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        # Apply denormalization
        pred_acoustic = acoustic_out_scaler.inverse_transform(pred_acoustic)
        if np.any(acoustic_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_acoustic = multi_stream_mlpg(
                pred_acoustic, acoustic_out_scaler.var_,
                get_windows(acoustic_config.num_windows),
                acoustic_config.stream_sizes,
                acoustic_config.has_dynamic_features)
    return pred_acoustic
def predict_duration(device, labels, duration_model, duration_config,
                     duration_in_scaler, duration_out_scaler, lag,
                     binary_dict, continuous_dict, pitch_indices=None,
                     log_f0_conditioning=True):
    """Predict phoneme durations (in frames) from full-context labels.

    Extracts phone-level linguistic features, normalizes them, runs the
    duration model and denormalizes its output (applying MLPG when the
    feature config contains dynamic features). Durations are clamped to
    be at least one frame and rounded.

    Args:
        device: torch device to run the model on.
        labels: full-context labels (HTS-style label object).
        duration_model: trained duration model.
        duration_config: duration feature configuration.
        duration_in_scaler: scaler for input linguistic features.
        duration_out_scaler: scaler for output duration features.
        lag: predicted time-lags. NOTE(review): unused in this function;
            kept for interface compatibility with callers.
        binary_dict, continuous_dict: question dictionaries for feature extraction.
        pitch_indices (list): indices of pitch features; required when
            ``log_f0_conditioning`` is True.
        log_f0_conditioning (bool): whether to convert pitch features to
            interpolated log-F0.

    Returns:
        numpy.ndarray: predicted durations (frames), rounded and >= 1.

    Raises:
        ValueError: if log_f0_conditioning is True but pitch_indices is None.
    """
    # Extract musical/linguistic features
    duration_linguistic_features = fe.linguistic_features(
        labels, binary_dict, continuous_dict,
        add_frame_features=False,
        subphone_features=None).astype(np.float32)
    if log_f0_conditioning:
        # Fail with a clear error (consistent with predict_timelag) instead
        # of a raw TypeError when iterating over None.
        if pitch_indices is None:
            raise ValueError("Pitch feature indices must be specified!")
        for idx in pitch_indices:
            duration_linguistic_features[:, idx] = interp1d(_midi_to_hz(
                duration_linguistic_features, idx, log_f0_conditioning),
                kind="slinear")
    # Apply normalization
    duration_linguistic_features = duration_in_scaler.transform(
        duration_linguistic_features)
    if isinstance(duration_in_scaler, MinMaxScaler):
        # clip to feature range
        duration_linguistic_features = np.clip(
            duration_linguistic_features,
            duration_in_scaler.feature_range[0],
            duration_in_scaler.feature_range[1])
    # Apply model: (T, D) -> (1, T, D)
    x = torch.from_numpy(duration_linguistic_features).float().to(device)
    x = x.view(1, -1, x.size(-1))
    if duration_model.prediction_type() == PredictionType.PROBABILISTIC:
        # (B, T, D_out)
        log_pi, log_sigma, mu = duration_model.inference(x, [x.shape[1]])
        if np.any(duration_config.has_dynamic_features):
            max_sigma, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = max_sigma.squeeze(
                0).cpu().data.numpy()**2 * duration_out_scaler.var_
            max_mu = duration_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
            # (T, D_out) -> (T, static_dim)
            pred_durations = multi_stream_mlpg(
                max_mu, max_sigma_sq,
                get_windows(duration_config.num_windows),
                duration_config.stream_sizes,
                duration_config.has_dynamic_features)
        else:
            _, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            pred_durations = duration_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
    else:
        # (T, D_out)
        pred_durations = duration_model.inference(
            x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        # Apply denormalization
        pred_durations = duration_out_scaler.inverse_transform(pred_durations)
        if np.any(duration_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_durations = multi_stream_mlpg(
                pred_durations, duration_out_scaler.var_,
                get_windows(duration_config.num_windows),
                duration_config.stream_sizes,
                duration_config.has_dynamic_features)
    # Every phoneme must last at least one frame
    pred_durations[pred_durations <= 0] = 1
    pred_durations = np.round(pred_durations)
    return pred_durations
def train_step(
    model,
    model_config,
    optimizer,
    grad_scaler,
    train,
    in_feats,
    out_feats,
    lengths,
    out_scaler,
    feats_criterion="mse",
    pitch_reg_dyn_ws=1.0,
    pitch_reg_weight=1.0,
):
    """Run one training (or evaluation) step with pitch regularization.

    Computes the feature loss (MDN negative log-likelihood for
    probabilistic models, otherwise the masked MSE/L1 loss), adds a
    weighted L1 pitch-residual regularization term, and backpropagates
    when ``train`` is True (optionally with AMP via ``grad_scaler``).

    Args:
        model: the model (possibly wrapped in nn.DataParallel).
        model_config: model configuration passed to compute_distortions.
        optimizer: optimizer to step when training.
        grad_scaler: AMP GradScaler; autocast/AMP is enabled iff not None.
        train (bool): True for a training step, False for evaluation.
        in_feats (torch.Tensor): padded input features (B, T, D_in).
        out_feats (torch.Tensor): padded target features (B, T, D_out).
        lengths (torch.Tensor): valid lengths per batch item.
        out_scaler: output scaler used for distortion computation.
        feats_criterion (str): "mse"/"l2" or "mae"/"l1".
        pitch_reg_dyn_ws: per-frame weights for the pitch regularization.
        pitch_reg_weight (float): global pitch regularization weight.

    Returns:
        tuple: (loss, log_metrics dict with distortions and loss terms)

    Raises:
        RuntimeError: for an unsupported ``feats_criterion``.
    """
    model.train() if train else model.eval()
    optimizer.zero_grad()
    log_metrics = {}
    if feats_criterion in ["l2", "mse"]:
        criterion = nn.MSELoss(reduction="none")
    elif feats_criterion in ["l1", "mae"]:
        criterion = nn.L1Loss(reduction="none")
    else:
        raise RuntimeError("not supported criterion")
    # DataParallel hides the real model behind .module
    prediction_type = (
        model.module.prediction_type()
        if isinstance(model, nn.DataParallel)
        else model.prediction_type()
    )
    # Apply preprocess if required (e.g., FIR filter for shallow AR)
    # defaults to no-op
    if isinstance(model, nn.DataParallel):
        out_feats = model.module.preprocess_target(out_feats)
    else:
        out_feats = model.preprocess_target(out_feats)
    # Run forward
    with autocast(enabled=grad_scaler is not None):
        outs = model(in_feats, lengths, out_feats)
        # ResF0-style models also return the lf0 residual
        if isinstance(outs, tuple) and len(outs) == 2:
            pred_out_feats, lf0_residual = outs
        else:
            pred_out_feats, lf0_residual = outs, None
    # Mask (B, T, 1)
    mask = make_non_pad_mask(lengths).unsqueeze(-1).to(in_feats.device)
    # Compute loss
    if prediction_type == PredictionType.PROBABILISTIC:
        pi, sigma, mu = pred_out_feats
        # (B, max(T)) or (B, max(T), D_out)
        mask_ = mask if len(pi.shape) == 4 else mask.squeeze(-1)
        # Compute loss and apply mask
        with autocast(enabled=grad_scaler is not None):
            loss_feats = mdn_loss(pi, sigma, mu, out_feats, reduce=False)
            loss_feats = loss_feats.masked_select(mask_).mean()
    else:
        with autocast(enabled=grad_scaler is not None):
            # NOTE: multiple predictions
            if isinstance(pred_out_feats, list):
                loss_feats = 0
                for pred_out_feats_ in pred_out_feats:
                    loss_feats += criterion(
                        pred_out_feats_.masked_select(mask),
                        out_feats.masked_select(mask),
                    ).mean()
            else:
                loss_feats = criterion(
                    pred_out_feats.masked_select(mask), out_feats.masked_select(mask)
                ).mean()
    # Pitch regularization
    # NOTE: l1 loss seems to be better than mse loss in my experiments
    # we could use l2 loss as suggested in the sinsy's paper
    if lf0_residual is not None:
        with autocast(enabled=grad_scaler is not None):
            if isinstance(lf0_residual, list):
                loss_pitch = 0
                for lf0_residual_ in lf0_residual:
                    loss_pitch += (
                        (pitch_reg_dyn_ws * lf0_residual_.abs())
                        .masked_select(mask)
                        .mean()
                    )
            else:
                loss_pitch = (
                    (pitch_reg_dyn_ws * lf0_residual.abs()).masked_select(mask).mean()
                )
    else:
        # No residual available: regularization term contributes nothing
        loss_pitch = torch.tensor(0.0).to(in_feats.device)
    loss = loss_feats + pitch_reg_weight * loss_pitch
    # Pick point predictions for distortion metrics (no grad needed)
    if prediction_type == PredictionType.PROBABILISTIC:
        with torch.no_grad():
            pred_out_feats_ = mdn_get_most_probable_sigma_and_mu(pi, sigma, mu)[1]
    else:
        if isinstance(pred_out_feats, list):
            pred_out_feats_ = pred_out_feats[-1]
        else:
            pred_out_feats_ = pred_out_feats
    distortions = compute_distortions(
        pred_out_feats_, out_feats, lengths, out_scaler, model_config
    )
    if train:
        if grad_scaler is not None:
            # AMP path: scale loss, step through the scaler
            grad_scaler.scale(loss).backward()
            grad_scaler.step(optimizer)
            grad_scaler.update()
        else:
            loss.backward()
            optimizer.step()
    log_metrics.update(distortions)
    log_metrics.update(
        {
            "Loss": loss.item(),
            "Loss_Feats": loss_feats.item(),
            "Loss_Pitch": loss_pitch.item(),
        }
    )
    return loss, log_metrics
def train_step(
    model,
    optimizer,
    grad_scaler,
    train,
    in_feats,
    out_feats,
    lengths,
    out_scaler,
    feats_criterion="mse",
    stream_wise_loss=False,
    stream_weights=None,
    stream_sizes=None,
):
    """Run one training (or evaluation) step.

    Computes the MDN negative log-likelihood for probabilistic models,
    otherwise a masked MSE/L1 loss (optionally weighted per feature
    stream), and backpropagates when ``train`` is True (optionally with
    AMP via ``grad_scaler``).

    Args:
        model: the model (possibly wrapped in nn.DataParallel).
        optimizer: optimizer to step when training.
        grad_scaler: AMP GradScaler; autocast/AMP is enabled iff not None.
        train (bool): True for a training step, False for evaluation.
        in_feats (torch.Tensor): padded input features (B, T, D_in).
        out_feats (torch.Tensor): padded target features (B, T, D_out).
        lengths (torch.Tensor): valid lengths per batch item.
        out_scaler: output scaler used for distortion computation.
        feats_criterion (str): "mse"/"l2" or "mae"/"l1".
        stream_wise_loss (bool): compute the loss per feature stream.
        stream_weights: per-stream loss weights (used if stream_wise_loss).
        stream_sizes: per-stream feature dimensions (used if stream_wise_loss).

    Returns:
        tuple: (loss, distortion metrics dict)

    Raises:
        RuntimeError: for an unsupported ``feats_criterion``.
    """
    model.train() if train else model.eval()
    optimizer.zero_grad()
    if feats_criterion in ["l2", "mse"]:
        criterion = nn.MSELoss(reduction="none")
    elif feats_criterion in ["l1", "mae"]:
        criterion = nn.L1Loss(reduction="none")
    else:
        raise RuntimeError("not supported criterion")
    # DataParallel hides the real model behind .module
    prediction_type = (model.module.prediction_type() if isinstance(
        model, nn.DataParallel) else model.prediction_type())
    # Apply preprocess if required (e.g., FIR filter for shallow AR)
    # defaults to no-op
    if isinstance(model, nn.DataParallel):
        out_feats = model.module.preprocess_target(out_feats)
    else:
        out_feats = model.preprocess_target(out_feats)
    # Run forward
    with autocast(enabled=grad_scaler is not None):
        pred_out_feats = model(in_feats, lengths)
    # Mask (B, T, 1)
    mask = make_non_pad_mask(lengths).unsqueeze(-1).to(in_feats.device)
    # Compute loss
    if prediction_type == PredictionType.PROBABILISTIC:
        pi, sigma, mu = pred_out_feats
        # (B, max(T)) or (B, max(T), D_out)
        mask_ = mask if len(pi.shape) == 4 else mask.squeeze(-1)
        # Compute loss and apply mask
        with autocast(enabled=grad_scaler is not None):
            loss = mdn_loss(pi, sigma, mu, out_feats, reduce=False)
            loss = loss.masked_select(mask_).mean()
    else:
        if stream_wise_loss:
            # Weighted sum of per-stream losses
            w = get_stream_weight(stream_weights, stream_sizes).to(in_feats.device)
            streams = split_streams(out_feats, stream_sizes)
            pred_streams = split_streams(pred_out_feats, stream_sizes)
            loss = 0
            for pred_stream, stream, sw in zip(pred_streams, streams, w):
                with autocast(enabled=grad_scaler is not None):
                    loss += (sw * criterion(pred_stream.masked_select(mask),
                                            stream.masked_select(mask)).mean())
        else:
            with autocast(enabled=grad_scaler is not None):
                loss = criterion(pred_out_feats.masked_select(mask),
                                 out_feats.masked_select(mask)).mean()
    # Pick point predictions for distortion metrics (no grad needed)
    if prediction_type == PredictionType.PROBABILISTIC:
        with torch.no_grad():
            pred_out_feats_ = mdn_get_most_probable_sigma_and_mu(
                pi, sigma, mu)[1]
    else:
        pred_out_feats_ = pred_out_feats
    distortions = compute_distortions(pred_out_feats_, out_feats, lengths,
                                      out_scaler)
    if train:
        if grad_scaler is not None:
            # AMP path: scale loss, step through the scaler
            grad_scaler.scale(loss).backward()
            grad_scaler.step(optimizer)
            grad_scaler.update()
        else:
            loss.backward()
            optimizer.step()
    return loss, distortions
def eval_spss_model(
    step,
    netG,
    in_feats,
    out_feats,
    lengths,
    model_config,
    out_scaler,
    writer,
    sr,
    trajectory_smoothing=True,
    trajectory_smoothing_cutoff=50,
):
    """Evaluate an SPSS model and log audio/figures to TensorBoard.

    For up to three utterances at the end of the batch, this synthesizes
    the reference waveform from the target features, runs both a forward
    pass and the model's inference path, synthesizes waveforms from both
    predictions with WORLD, and writes audio, spectrogram figures, and
    parameter plots via ``writer``.

    Args:
        step (int): global step used for TensorBoard logging.
        netG: generator model (possibly wrapped in nn.DataParallel).
        in_feats (torch.Tensor): padded input features (B, T, D_in).
        out_feats (torch.Tensor): padded target features (B, T, D_out).
        lengths: valid lengths per batch item.
        model_config: stream configuration (stream_sizes, num_windows, ...).
        out_scaler: output scaler for denormalization.
        writer: TensorBoard SummaryWriter.
        sr (int): sampling rate for synthesis/logging.
        trajectory_smoothing (bool): low-pass filter mgc/bap trajectories.
        trajectory_smoothing_cutoff (int): low-pass cutoff frequency.
    """
    # make sure to be in eval mode
    netG.eval()
    # DataParallel hides the real model behind .module
    is_autoregressive = (netG.module.is_autoregressive() if isinstance(
        netG, nn.DataParallel) else netG.is_autoregressive())
    prediction_type = (netG.module.prediction_type() if isinstance(
        netG, nn.DataParallel) else netG.prediction_type())
    # Evaluate the last (up to) three utterances of the batch
    utt_indices = [-1, -2, -3]
    utt_indices = utt_indices[:min(3, len(in_feats))]
    if np.any(model_config.has_dynamic_features):
        static_stream_sizes = get_static_stream_sizes(
            model_config.stream_sizes,
            model_config.has_dynamic_features,
            model_config.num_windows,
        )
    else:
        static_stream_sizes = model_config.stream_sizes
    for utt_idx in utt_indices:
        # Reference: denormalize targets and synthesize with WORLD
        out_feats_denorm_ = out_scaler.inverse_transform(
            out_feats[utt_idx, :lengths[utt_idx]].unsqueeze(0))
        mgc, lf0, vuv, bap = get_static_features(
            out_feats_denorm_,
            model_config.num_windows,
            model_config.stream_sizes,
            model_config.has_dynamic_features,
        )[:4]
        mgc = mgc.squeeze(0).cpu().numpy()
        lf0 = lf0.squeeze(0).cpu().numpy()
        vuv = vuv.squeeze(0).cpu().numpy()
        bap = bap.squeeze(0).cpu().numpy()
        f0, spectrogram, aperiodicity = gen_world_params(
            mgc, lf0, vuv, bap, sr)
        wav = pyworld.synthesize(f0, spectrogram, aperiodicity, sr, 5)
        group = f"utt{np.abs(utt_idx)}_reference"
        # Peak-normalize only when clipping would occur
        # NOTE(review): np.max(wav) > 1.0 misses negative peaks < -1.0;
        # np.abs(wav).max() > 1.0 would be safer — confirm intent.
        wav = wav / np.abs(wav).max() if np.max(wav) > 1.0 else wav
        writer.add_audio(group, wav, step, sr)
        # Run forward
        if is_autoregressive:
            # Autoregressive models also take the target features
            outs = netG(
                in_feats[utt_idx, :lengths[utt_idx]].unsqueeze(0),
                [lengths[utt_idx]],
                out_feats[utt_idx, :lengths[utt_idx]].unsqueeze(0),
            )
        else:
            outs = netG(in_feats[utt_idx, :lengths[utt_idx]].unsqueeze(0),
                        [lengths[utt_idx]])
        # ResF0 case
        if isinstance(outs, tuple) and len(outs) == 2:
            outs, _ = outs
        if prediction_type == PredictionType.PROBABILISTIC:
            pi, sigma, mu = outs
            pred_out_feats = mdn_get_most_probable_sigma_and_mu(pi, sigma, mu)[1]
        else:
            pred_out_feats = outs
        # NOTE: multiple outputs
        if isinstance(pred_out_feats, list):
            pred_out_feats = pred_out_feats[-1]
        if isinstance(pred_out_feats, tuple):
            pred_out_feats = pred_out_feats[0]
        # Keep forward and inference outputs in one list
        if not isinstance(pred_out_feats, list):
            pred_out_feats = [pred_out_feats]
        # Run inference
        if prediction_type == PredictionType.PROBABILISTIC:
            inference_out_feats, _ = netG.inference(
                in_feats[utt_idx, :lengths[utt_idx]].unsqueeze(0),
                [lengths[utt_idx]])
        else:
            inference_out_feats = netG.inference(
                in_feats[utt_idx, :lengths[utt_idx]].unsqueeze(0),
                [lengths[utt_idx]])
        pred_out_feats.append(inference_out_feats)
        # Plot normalized input/output
        in_feats_ = in_feats[utt_idx, :lengths[utt_idx]].cpu().numpy()
        out_feats_ = out_feats[utt_idx, :lengths[utt_idx]].cpu().numpy()
        fig, ax = plt.subplots(3, 1, figsize=(8, 8))
        ax[0].set_title("Reference features")
        ax[1].set_title("Input features")
        ax[2].set_title("Predicted features")
        mesh = librosa.display.specshow(out_feats_.T,
                                        x_axis="frames",
                                        y_axis="frames",
                                        ax=ax[0],
                                        cmap="viridis")
        # NOTE: assuming normalized to N(0, 1)
        mesh.set_clim(-4, 4)
        fig.colorbar(mesh, ax=ax[0])
        mesh = librosa.display.specshow(in_feats_.T,
                                        x_axis="frames",
                                        y_axis="frames",
                                        ax=ax[1],
                                        cmap="viridis")
        mesh.set_clim(-4, 4)
        fig.colorbar(mesh, ax=ax[1])
        mesh = librosa.display.specshow(
            inference_out_feats.squeeze(0).cpu().numpy().T,
            x_axis="frames",
            y_axis="frames",
            ax=ax[2],
            cmap="viridis",
        )
        mesh.set_clim(-4, 4)
        fig.colorbar(mesh, ax=ax[2])
        for ax_ in ax:
            ax_.set_ylabel("Feature")
        plt.tight_layout()
        group = f"utt{np.abs(utt_idx)}_inference"
        writer.add_figure(f"{group}/Input-Output", fig, step)
        plt.close()
        # Exactly two predictions: forward (idx 0) and inference (idx 1)
        assert len(pred_out_feats) == 2
        for idx, pred_out_feats_ in enumerate(pred_out_feats):
            pred_out_feats_ = pred_out_feats_.squeeze(0).cpu().numpy()
            pred_out_feats_denorm = (out_scaler.inverse_transform(
                torch.from_numpy(pred_out_feats_).to(
                    in_feats.device)).cpu().numpy())
            if np.any(model_config.has_dynamic_features):
                # (T, D_out) -> (T, static_dim)
                pred_out_feats_denorm = multi_stream_mlpg(
                    pred_out_feats_denorm,
                    (out_scaler.scale_**2).cpu().numpy(),
                    get_windows(model_config.num_windows),
                    model_config.stream_sizes,
                    model_config.has_dynamic_features,
                )
            pred_mgc, pred_lf0, pred_vuv, pred_bap = split_streams(
                pred_out_feats_denorm, static_stream_sizes)[:4]
            # Remove high-frequency components of mgc/bap
            # NOTE: It seems to be effective to suppress artifacts of GAN-based post-filtering
            if trajectory_smoothing:
                # presumably 5 ms frame shift -> 200 Hz modulation sampling
                # frequency — TODO confirm against feature extraction config
                modfs = int(1 / 0.005)
                for d in range(pred_mgc.shape[1]):
                    pred_mgc[:, d] = lowpass_filter(
                        pred_mgc[:, d], modfs, cutoff=trajectory_smoothing_cutoff)
                for d in range(pred_bap.shape[1]):
                    pred_bap[:, d] = lowpass_filter(
                        pred_bap[:, d], modfs, cutoff=trajectory_smoothing_cutoff)
            # Generated sample
            f0, spectrogram, aperiodicity = gen_world_params(
                pred_mgc, pred_lf0, pred_vuv, pred_bap, sr)
            wav = pyworld.synthesize(f0, spectrogram, aperiodicity, sr, 5)
            wav = wav / np.abs(wav).max() if np.max(wav) > 1.0 else wav
            if idx == 1:
                group = f"utt{np.abs(utt_idx)}_inference"
            else:
                group = f"utt{np.abs(utt_idx)}_forward"
            writer.add_audio(group, wav, step, sr)
            plot_spsvs_params(
                step,
                writer,
                mgc,
                lf0,
                vuv,
                bap,
                pred_mgc,
                pred_lf0,
                pred_vuv,
                pred_bap,
                group=group,
                sr=sr,
            )