def postprocess_duration(labels, pred_durations, lag):
    note_indices = get_note_indices(labels)
    # append the end of note
    note_indices.append(len(labels))

    output_labels = hts.HTSLabelFile()

    for i in range(1, len(note_indices)):
        # Apply time lag
        p = labels[note_indices[i-1]:note_indices[i]]
        p.start_times = np.minimum(
            np.asarray(p.start_times) + lag[i-1].reshape(-1),
            np.asarray(p.end_times) - 50000 * len(p))
        p.start_times = np.maximum(p.start_times, 0)
        if len(output_labels) > 0:
            p.start_times = np.maximum(
                p.start_times, output_labels.start_times[-1] + 50000)

        # Compute normalized phoneme durations
        d = fe.duration_features(p)
        d_hat = pred_durations[note_indices[i-1]:note_indices[i]]
        d_norm = d[0] * d_hat / d_hat.sum()
        d_norm = np.round(d_norm)
        d_norm[d_norm <= 0] = 1

        # TODO: better way to adjust?
        if d_norm.sum() != d[0]:
            d_norm[-1] += d[0] - d_norm.sum()
        p.set_durations(d_norm)

        if len(output_labels) > 0:
            output_labels.end_times[-1] = p.start_times[0]
        for n in p:
            output_labels.append(n)

    return output_labels
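# To make the rounding adjustment in postprocess_duration concrete, here is
# a minimal, self-contained numpy demo of the normalization step above; the
# numbers are made up for illustration and are not from the library.
import numpy as np

note_frames = 40                    # note duration d[0], in frames
d_hat = np.array([5.0, 5.0, 5.0])   # raw per-phoneme duration predictions

# Scale predictions so they sum to the note duration, then round.
d_norm = np.round(note_frames * d_hat / d_hat.sum())   # [13. 13. 13.]
d_norm[d_norm <= 0] = 1
# Rounding can break the sum; push the residual onto the last phoneme,
# mirroring the TODO above.
if d_norm.sum() != note_frames:
    d_norm[-1] += note_frames - d_norm.sum()
print(d_norm, d_norm.sum())         # [13. 13. 14.] 40.0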
def predict_timelag(device, labels, timelag_model, timelag_in_scaler,
                    timelag_out_scaler, binary_dict, continuous_dict,
                    pitch_indices=None, log_f0_conditioning=True,
                    allowed_range=[-30, 30]):
    # round start/end times just in case.
    labels.round_()

    # Extract note-level labels
    note_indices = get_note_indices(labels)
    note_labels = labels[note_indices]

    # Extract musical/linguistic context
    timelag_linguistic_features = fe.linguistic_features(
        note_labels, binary_dict, continuous_dict,
        add_frame_features=False, subphone_features=None).astype(np.float32)

    # Adjust input features if we use log-f0 conditioning
    if log_f0_conditioning:
        if pitch_indices is None:
            raise ValueError("Pitch feature indices must be specified!")
        for idx in pitch_indices:
            timelag_linguistic_features[:, idx] = interp1d(
                _midi_to_hz(timelag_linguistic_features, idx,
                            log_f0_conditioning),
                kind="slinear")

    # Normalization
    timelag_linguistic_features = timelag_in_scaler.transform(
        timelag_linguistic_features)

    # Run model
    x = torch.from_numpy(timelag_linguistic_features).unsqueeze(0).to(device)
    y = timelag_model(x, [x.shape[1]]).squeeze(0).cpu()

    # De-normalization and rounding
    lag = np.round(timelag_out_scaler.inverse_transform(y.data.numpy()))

    # Clip to the allowed range
    lag = np.clip(lag, allowed_range[0], allowed_range[1])

    # frames -> 100 ns
    lag *= 50000

    return lag
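# _midi_to_hz is not shown in this section. A plausible sketch, assuming the
# pitch columns hold MIDI note numbers (0 for rests) and the standard
# A4 = 440 Hz reference, might look like the following; treat the signature
# and the zero-check as assumptions, not the library's implementation.
import numpy as np

def _midi_to_hz(features, idx, log_scale=False):
    """Convert a MIDI note-number column to (log-)Hz, leaving rests at 0."""
    m = features[:, idx]
    hz = np.zeros_like(m)
    voiced = m > 0
    # Equal-temperament conversion: MIDI 69 (A4) = 440 Hz.
    hz[voiced] = 440.0 * 2.0 ** ((m[voiced] - 69.0) / 12.0)
    if log_scale:
        hz[voiced] = np.log(hz[voiced])
    return hz
# The zeros left at rests are presumably what the slinear interp1d call
# above interpolates over.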
def predict_duration(device, labels, duration_model, duration_in_scaler,
                     duration_out_scaler, lag, binary_dict, continuous_dict,
                     pitch_indices=None, log_f0_conditioning=True):
    # Get note indices
    note_indices = get_note_indices(labels)
    # append the end of note
    note_indices.append(len(labels))

    # Extract musical/linguistic features
    duration_linguistic_features = fe.linguistic_features(
        labels, binary_dict, continuous_dict,
        add_frame_features=False, subphone_features=None).astype(np.float32)

    if log_f0_conditioning:
        for idx in pitch_indices:
            duration_linguistic_features[:, idx] = interp1d(
                _midi_to_hz(duration_linguistic_features, idx,
                            log_f0_conditioning),
                kind="slinear")

    # Apply normalization
    duration_linguistic_features = duration_in_scaler.transform(
        duration_linguistic_features)

    # Apply model
    x = torch.from_numpy(duration_linguistic_features).float().to(device)
    x = x.view(1, -1, x.size(-1))
    pred_durations = duration_model(
        x, [x.shape[1]]).squeeze(0).cpu().data.numpy()

    # Apply denormalization
    pred_durations = duration_out_scaler.inverse_transform(pred_durations)
    # Round first, then clamp; clamping before rounding would let values
    # in (0, 0.5) round back down to zero.
    pred_durations = np.round(pred_durations)
    pred_durations[pred_durations <= 0] = 1

    return pred_durations
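# For orientation, a hedged sketch of how the three helpers compose end to
# end; every input here (device, models, scalers, feature dictionaries,
# pitch_indices) is a placeholder name, not something defined in this section.
labels = hts.load("song.lab")  # score-derived HTS labels

lag = predict_timelag(device, labels, timelag_model,
                      timelag_in_scaler, timelag_out_scaler,
                      binary_dict, continuous_dict,
                      pitch_indices=pitch_indices)
durations = predict_duration(device, labels, duration_model,
                             duration_in_scaler, duration_out_scaler,
                             lag, binary_dict, continuous_dict,
                             pitch_indices=pitch_indices)
# Merge time-lag and duration predictions into one consistent label file.
aligned_labels = postprocess_duration(labels, durations, lag)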
print("Prepare data for time-lag models") full_lab_align_files = sorted(glob(join(full_align_dir, "*.lab"))) full_lab_score_files = sorted(glob(join(full_score_dir, "*.lab"))) for lab_align_path, lab_score_path in zip(full_lab_align_files, full_lab_score_files): name = basename(lab_align_path) lab_align = hts.load(lab_align_path) lab_score = hts.load(lab_score_path) # this may harm for computing offset lab_align = remove_sil_and_pau(lab_align) lab_score = remove_sil_and_pau(lab_score) # Extract note onsets and let's compute a offset note_indices = get_note_indices(lab_score) onset_align = np.asarray(lab_align[note_indices].start_times) onset_score = np.asarray(lab_score[note_indices].start_times) global_offset = (onset_align - onset_score).mean() global_offset = int(round(global_offset / 50000) * 50000) # Apply offset correction only when there is a big gap apply_offset_correction = np.abs( global_offset * 1e-7) > offset_correction_threshold if apply_offset_correction: print(f"{name}: Global offset (in sec): {global_offset * 1e-7}") lab_score.start_times = list( np.asarray(lab_score.start_times) + global_offset) lab_score.end_times = list(
def predict_timelag(
    device,
    labels,
    timelag_model,
    timelag_config,
    timelag_in_scaler,
    timelag_out_scaler,
    binary_dict,
    numeric_dict,
    pitch_indices=None,
    log_f0_conditioning=True,
    allowed_range=None,
    allowed_range_rest=None,
    force_clip_input_features=False,
):
    """Predict time-lag from HTS labels

    Args:
        device (torch.device): device
        labels (nnmnkwii.io.hts.HTSLabelFile): HTS-style labels
        timelag_model (nn.Module): time-lag model
        timelag_config (dict): time-lag model config
        timelag_in_scaler (sklearn.preprocessing.MinMaxScaler): input scaler
        timelag_out_scaler (sklearn.preprocessing.MinMaxScaler): output scaler
        binary_dict (dict): binary feature dict
        numeric_dict (dict): numeric feature dict
        pitch_indices (list): indices of pitch features
        log_f0_conditioning (bool): whether to condition on log f0
        allowed_range (list): allowed range of time-lag
        allowed_range_rest (list): allowed range of time-lag for rest
        force_clip_input_features (bool): whether to clip input features

    Returns:
        ndarray: time-lag predictions
    """
    if allowed_range is None:
        allowed_range = [-20, 20]
    if allowed_range_rest is None:
        allowed_range_rest = [-40, 40]

    # round start/end times just in case.
    labels.round_()

    # Extract note-level labels
    note_indices = get_note_indices(labels)
    note_labels = labels[note_indices]

    # Extract musical/linguistic context
    timelag_linguistic_features = fe.linguistic_features(
        note_labels,
        binary_dict,
        numeric_dict,
        add_frame_features=False,
        subphone_features=None,
    ).astype(np.float32)

    # Adjust input features if we use log-f0 conditioning
    if log_f0_conditioning:
        if pitch_indices is None:
            raise ValueError("Pitch feature indices must be specified!")
        for idx in pitch_indices:
            timelag_linguistic_features[:, idx] = interp1d(
                _midi_to_hz(timelag_linguistic_features, idx, log_f0_conditioning),
                kind="slinear",
            )

    # Normalization
    timelag_linguistic_features = timelag_in_scaler.transform(
        timelag_linguistic_features
    )
    if force_clip_input_features and isinstance(timelag_in_scaler, MinMaxScaler):
        # clip to feature range (except for pitch-related features)
        non_pitch_indices = [
            idx
            for idx in range(timelag_linguistic_features.shape[1])
            if idx not in pitch_indices
        ]
        timelag_linguistic_features[:, non_pitch_indices] = np.clip(
            timelag_linguistic_features[:, non_pitch_indices],
            timelag_in_scaler.feature_range[0],
            timelag_in_scaler.feature_range[1],
        )

    # Run model
    x = torch.from_numpy(timelag_linguistic_features).unsqueeze(0).to(device)
    if timelag_model.prediction_type() == PredictionType.PROBABILISTIC:
        # (B, T, D_out)
        max_mu, max_sigma = timelag_model.inference(x, [x.shape[1]])
        if np.any(timelag_config.has_dynamic_features):
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = (
                max_sigma.squeeze(0).cpu().data.numpy() ** 2 * timelag_out_scaler.var_
            )
            max_sigma_sq = np.maximum(max_sigma_sq, 1e-14)
            max_mu = timelag_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy()
            )
            # (T, D_out) -> (T, static_dim)
            pred_timelag = multi_stream_mlpg(
                max_mu,
                max_sigma_sq,
                get_windows(timelag_config.num_windows),
                timelag_config.stream_sizes,
                timelag_config.has_dynamic_features,
            )
        else:
            # Apply denormalization
            pred_timelag = timelag_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy()
            )
    else:
        # (T, D_out)
        pred_timelag = (
            timelag_model.inference(x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        )
        # Apply denormalization
        pred_timelag = timelag_out_scaler.inverse_transform(pred_timelag)
        if np.any(timelag_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_timelag = multi_stream_mlpg(
                pred_timelag,
                timelag_out_scaler.var_,
                get_windows(timelag_config.num_windows),
                timelag_config.stream_sizes,
                timelag_config.has_dynamic_features,
            )

    # Rounding
    pred_timelag = np.round(pred_timelag)

    # Clip to the allowed range
    for idx in range(len(pred_timelag)):
        if _is_silence(note_labels.contexts[idx]):
            pred_timelag[idx] = np.clip(
                pred_timelag[idx], allowed_range_rest[0], allowed_range_rest[1]
            )
        else:
            pred_timelag[idx] = np.clip(
                pred_timelag[idx], allowed_range[0], allowed_range[1]
            )

    # frames -> 100 ns
    pred_timelag *= 50000

    return pred_timelag
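# get_windows is defined elsewhere in the library. As an illustrative
# assumption, a window set for MLPG over static/delta/delta-delta streams
# typically follows the nnmnkwii (left, right, coefficients) convention;
# the helper below is a sketch, not the library's definition.
import numpy as np

def get_windows(num_windows):
    """Return the first num_windows MLPG windows (sketch)."""
    windows = [
        (0, 0, np.array([1.0])),             # static
        (1, 1, np.array([-0.5, 0.0, 0.5])),  # delta
        (1, 1, np.array([1.0, -2.0, 1.0])),  # delta-delta
    ]
    return windows[:num_windows]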
def postprocess_duration(labels, pred_durations, lag):
    """Post-process durations based on predicted time-lag

    Ref: https://arxiv.org/abs/2108.02776

    Args:
        labels (HTSLabelFile): HTS labels
        pred_durations (array or tuple): predicted durations for non-MDN,
            mean and variance for MDN
        lag (array): predicted time-lag

    Returns:
        HTSLabelFile: labels with adjusted durations
    """
    note_indices = get_note_indices(labels)
    # append the end of note
    note_indices.append(len(labels))

    is_mdn = isinstance(pred_durations, tuple) and len(pred_durations) == 2

    output_labels = hts.HTSLabelFile()

    for i in range(1, len(note_indices)):
        p = labels[note_indices[i - 1] : note_indices[i]]

        # Compute note duration with time-lag
        # eq (11)
        L = int(fe.duration_features(p)[0])
        if i < len(note_indices) - 1:
            L_hat = L - (lag[i - 1] - lag[i]) / 50000
        else:
            L_hat = L - lag[i - 1] / 50000

        # Prevent negative duration
        L_hat = max(L_hat, 1)

        # adjust the start time of the note
        p.start_times = np.minimum(
            np.asarray(p.start_times) + lag[i - 1].reshape(-1),
            np.asarray(p.end_times) - 50000 * len(p),
        )
        p.start_times = np.maximum(p.start_times, 0)
        if len(output_labels) > 0:
            p.start_times = np.maximum(
                p.start_times, output_labels.start_times[-1] + 50000
            )

        # Compute normalized phoneme durations
        if is_mdn:
            mu = pred_durations[0][note_indices[i - 1] : note_indices[i]]
            sigma_sq = pred_durations[1][note_indices[i - 1] : note_indices[i]]
            # eq (17)
            rho = (L_hat - mu.sum()) / sigma_sq.sum()
            # eq (16)
            d_norm = mu + rho * sigma_sq

            if np.any(d_norm <= 0):
                # eq (12) (using mu as d_hat)
                print(
                    f"Negative phoneme durations are predicted at {i}-th note. "
                    "The note duration: "
                    f"{round(float(L) * 0.005, 3)} sec -> "
                    f"{round(float(L_hat) * 0.005, 3)} sec"
                )
                print(
                    "It's likely that the model couldn't predict correct durations "
                    "for short notes."
                )
                print(
                    f"Variance scaling based durations (in frames):\n"
                    f"{mu + rho * sigma_sq}"
                )
                print(
                    f"Fallback to uniform scaling (in frames):\n"
                    f"{L_hat * mu / mu.sum()}"
                )
                d_norm = L_hat * mu / mu.sum()
        else:
            # eq (12)
            d_hat = pred_durations[note_indices[i - 1] : note_indices[i]]
            d_norm = L_hat * d_hat / d_hat.sum()

        d_norm = np.round(d_norm)
        d_norm[d_norm <= 0] = 1
        p.set_durations(d_norm)

        if len(output_labels) > 0:
            output_labels.end_times[-1] = p.start_times[0]
        for n in p:
            output_labels.append(n)

    return output_labels
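# The variance-scaling step (eqs (16) and (17)) is easiest to see with
# numbers: phonemes with larger predicted variance absorb more of the
# correction. A minimal demo with made-up MDN outputs (in frames):
import numpy as np

mu = np.array([4.0, 18.0, 10.0])      # predicted means, sum = 32
sigma_sq = np.array([1.0, 4.0, 3.0])  # predicted variances
L_hat = 40                            # target note duration after time-lag

rho = (L_hat - mu.sum()) / sigma_sq.sum()  # eq (17): rho = 1.0
d_norm = mu + rho * sigma_sq               # eq (16): [ 5. 22. 13.]
print(d_norm, d_norm.sum())                # sums to L_hat = 40.0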
def predict_timelag(device, labels, timelag_model, timelag_config,
                    timelag_in_scaler, timelag_out_scaler, binary_dict,
                    continuous_dict, pitch_indices=None,
                    log_f0_conditioning=True, allowed_range=[-20, 20],
                    allowed_range_rest=[-40, 40]):
    # round start/end times just in case.
    labels.round_()

    # Extract note-level labels
    note_indices = get_note_indices(labels)
    note_labels = labels[note_indices]

    # Extract musical/linguistic context
    timelag_linguistic_features = fe.linguistic_features(
        note_labels, binary_dict, continuous_dict,
        add_frame_features=False, subphone_features=None).astype(np.float32)

    # Adjust input features if we use log-f0 conditioning
    if log_f0_conditioning:
        if pitch_indices is None:
            raise ValueError("Pitch feature indices must be specified!")
        for idx in pitch_indices:
            timelag_linguistic_features[:, idx] = interp1d(
                _midi_to_hz(timelag_linguistic_features, idx,
                            log_f0_conditioning),
                kind="slinear")

    # Normalization
    timelag_linguistic_features = timelag_in_scaler.transform(
        timelag_linguistic_features)
    if isinstance(timelag_in_scaler, MinMaxScaler):
        # clip to feature range
        timelag_linguistic_features = np.clip(
            timelag_linguistic_features,
            timelag_in_scaler.feature_range[0],
            timelag_in_scaler.feature_range[1])

    # Run model
    x = torch.from_numpy(timelag_linguistic_features).unsqueeze(0).to(device)
    if timelag_model.prediction_type() == PredictionType.PROBABILISTIC:
        # (B, T, D_out)
        log_pi, log_sigma, mu = timelag_model.inference(x, [x.shape[1]])
        if np.any(timelag_config.has_dynamic_features):
            max_sigma, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = max_sigma.squeeze(
                0).cpu().data.numpy() ** 2 * timelag_out_scaler.var_
            max_mu = timelag_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
            # (T, D_out) -> (T, static_dim)
            pred_timelag = multi_stream_mlpg(
                max_mu, max_sigma_sq,
                get_windows(timelag_config.num_windows),
                timelag_config.stream_sizes,
                timelag_config.has_dynamic_features)
        else:
            _, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            pred_timelag = timelag_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
    else:
        # (T, D_out)
        pred_timelag = timelag_model.inference(
            x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        # Apply denormalization
        pred_timelag = timelag_out_scaler.inverse_transform(pred_timelag)
        if np.any(timelag_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_timelag = multi_stream_mlpg(
                pred_timelag, timelag_out_scaler.var_,
                get_windows(timelag_config.num_windows),
                timelag_config.stream_sizes,
                timelag_config.has_dynamic_features)

    # Rounding
    pred_timelag = np.round(pred_timelag)

    # Clip to the allowed range
    for idx in range(len(pred_timelag)):
        if _is_silence(note_labels.contexts[idx]):
            pred_timelag[idx] = np.clip(pred_timelag[idx],
                                        allowed_range_rest[0],
                                        allowed_range_rest[1])
        else:
            pred_timelag[idx] = np.clip(pred_timelag[idx],
                                        allowed_range[0],
                                        allowed_range[1])

    # frames -> 100 ns
    pred_timelag *= 50000

    return pred_timelag
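# _is_silence is also defined elsewhere. A plausible sketch, assuming HTS
# full-context labels where the current phoneme sits between "-" and "+",
# is shown below; the exact matching logic is an assumption.
import re

def _is_silence(context):
    """Guess whether an HTS full-context label denotes a rest/silence."""
    m = re.search(r"\-(.*?)\+", context)
    return m is not None and m.group(1) in ("sil", "pau")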