def postprocess_duration(labels, pred_durations, lag):
    """Shift note onsets by the predicted lag and rescale durations per note.

    The labels are processed one note at a time (note boundaries come from
    ``get_note_indices``). For every note the start times are moved by the
    matching ``lag`` entry, and the predicted durations of that note are
    rescaled so that they add up to the first entry of the note's duration
    features, then rounded and forced to be at least 1.

    Args:
        labels: HTS labels; must support slicing, ``len`` and iteration.
        pred_durations: predicted durations, sliceable with the same
            positions as ``labels``.
        lag: predicted time-lags, one entry per note; each entry must
            provide ``reshape`` (e.g. a NumPy array).

    Returns:
        hts.HTSLabelFile: a new label file holding the adjusted labels.
    """
    note_indices = get_note_indices(labels)
    # Append the end of the last note so that consecutive pairs of indices
    # delimit every note.
    note_indices.append(len(labels))

    output_labels = hts.HTSLabelFile()

    for i in range(1, len(note_indices)):
        begin, end = note_indices[i - 1], note_indices[i]
        p = labels[begin:end]

        # Apply time lag, but keep each start at least 50000 * len(p) before
        # its end time so there is room left for every label of the note.
        # NOTE(review): 50000 is presumably one frame (5 ms in HTS 100 ns
        # time units) -- confirm against the label configuration.
        p.start_times = np.minimum(
            np.asarray(p.start_times) + lag[i - 1].reshape(-1),
            np.asarray(p.end_times) - 50000 * len(p),
        )
        p.start_times = np.maximum(p.start_times, 0)
        if len(output_labels) > 0:
            # Never start before (or at) the start of the previously emitted
            # label.
            p.start_times = np.maximum(
                p.start_times, output_labels.start_times[-1] + 50000
            )

        # Compute normalized phoneme durations. The duration features are
        # taken *after* the start times were shifted above.
        d = fe.duration_features(p)
        note_length = d[0]
        d_hat = pred_durations[begin:end]
        d_norm = note_length * d_hat / d_hat.sum()
        d_norm = np.round(d_norm)
        d_norm[d_norm <= 0] = 1

        # Put the rounding residual on the last phoneme so the note keeps its
        # length, but never let that phoneme drop below 1: on very short
        # notes the clamp above can push the sum past the note length, and
        # an unbounded correction would yield a zero or negative duration.
        residual = note_length - d_norm.sum()
        if residual != 0:
            d_norm[-1] = np.maximum(d_norm[-1] + residual, 1)
        p.set_durations(d_norm)

        # Close the previously emitted label exactly where this note starts.
        if len(output_labels) > 0:
            output_labels.end_times[-1] = p.start_times[0]
        for n in p:
            output_labels.append(n)

    return output_labels
def test_linguistic_and_duration_features_for_duration_model():
    """Compare extracted features for a state-aligned label with stored references.

    Both the linguistic features (no frame or sub-phone features) and the
    numerical duration features are checked against binary float32 fixtures
    under ``DATA_DIR``.
    """
    question_path = join(DATA_DIR, "questions-radio_dnn_416.hed")
    binary_dict, continuous_dict = hts.load_question_set(question_path)

    state_label_path = join(DATA_DIR, "label_state_align", "arctic_a0001.lab")

    # --- Linguistic features (phone level) ---
    labels = hts.load(state_label_path)
    assert labels.is_state_alignment_label()
    linguistic = fe.linguistic_features(
        labels,
        binary_dict,
        continuous_dict,
        add_frame_features=False,
        subphone_features=None,
    )
    linguistic_ref_path = join(DATA_DIR, "binary_label_416", "arctic_a0001.lab")
    # The reference file is a flat float32 dump; restore the feature dimension.
    linguistic_ref = np.fromfile(linguistic_ref_path, dtype=np.float32)
    linguistic_ref = linguistic_ref.reshape(-1, linguistic.shape[-1])
    assert np.allclose(linguistic, linguistic_ref)

    # --- Duration features ---
    # The label file is loaded again rather than reusing the object above.
    labels = hts.load(state_label_path)
    durations = fe.duration_features(
        labels,
        feature_type="numerical",
        unit_size="state",
        feature_size="phoneme",
    )
    duration_ref_path = join(DATA_DIR, "duration_untrimmed", "arctic_a0001.dur")
    duration_ref = np.fromfile(duration_ref_path, dtype=np.float32)
    duration_ref = duration_ref.reshape(-1, durations.shape[-1])
    assert np.allclose(durations, duration_ref)
def collect_features(self, path):
    """Return the float32 duration features of ``path`` without silence rows.

    Args:
        path: location of an HTS label file readable by ``hts.load``.

    Returns:
        numpy.ndarray: duration features with the rows listed by
        ``silence_phone_indices()`` removed, cast to ``float32``.
    """
    hts_labels = hts.load(path)
    durations = fe.duration_features(hts_labels)
    silence_rows = hts_labels.silence_phone_indices()
    non_silent = np.delete(durations, silence_rows, axis=0)
    return non_silent.astype(np.float32)
def _process_feature(out_dir, index, label_path):
    """Extract duration features for one label file and save them as ``.npy``.

    Args:
        out_dir: directory the feature file is written to.
        index: number used to build the output file name.
        label_path: HTS label file to read.

    Returns:
        tuple: ``(file name, row count before silence removal,
        row count after silence removal)``.
    """
    hts_labels = hts.load(label_path)
    all_durations = fe.duration_features(hts_labels)
    # Counted before the silence rows are dropped.
    n_frames = len(all_durations)

    silence_rows = hts_labels.silence_phone_indices()
    kept_durations = np.delete(all_durations, silence_rows, axis=0)
    voiced_frames = kept_durations.shape[0]

    # Write the duration to disk:
    duration_filename = 'arctic_%05d.npy' % index
    target_path = os.path.join(out_dir, duration_filename)
    np.save(target_path, kept_durations.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (duration_filename, n_frames, voiced_frames)
def test_state_alignment_label_file():
    """Round-trip checks on a state-aligned label file.

    The loaded labels must print back to the exact file content, report five
    states, and stay unchanged when their own durations are written back
    onto a deep copy.
    """
    label_path = join(DATA_DIR, "label_state_align", "arctic_a0001.lab")
    labels = hts.load(label_path)

    with open(label_path) as label_file:
        raw_text = label_file.read()
    assert raw_text == str(labels)

    state_count = labels.num_states()
    print(state_count)
    assert state_count == 5

    # Get and restore durations
    restored = copy.deepcopy(labels)
    restored.set_durations(fe.duration_features(labels))

    assert str(labels) == str(restored)
def test_phone_alignment_label():
    """Features extracted from a phone-aligned label file must all be finite."""
    question_path = join(DATA_DIR, "questions-radio_dnn_416.hed")
    binary_dict, continuous_dict = hts.load_question_set(question_path)

    phone_label_path = join(DATA_DIR, "label_phone_align", "arctic_a0001.lab")
    labels = hts.load(phone_label_path)

    # Linguistic features without frame or sub-phone features.
    phone_level = fe.linguistic_features(
        labels,
        binary_dict,
        continuous_dict,
        add_frame_features=False,
        subphone_features=None,
    )
    assert not labels.is_state_alignment_label()
    assert np.all(np.isfinite(phone_level))

    # Frame-level linguistic features for each sub-phone feature type.
    for subphone_kind in ("coarse_coding", "minimal_phoneme"):
        frame_level = fe.linguistic_features(
            labels,
            binary_dict,
            continuous_dict,
            add_frame_features=True,
            subphone_features=subphone_kind,
        )
        assert np.all(np.isfinite(frame_level))

    # Duration features with the default options.
    durations = fe.duration_features(labels)
    assert np.all(np.isfinite(durations))
def collect_features(self, path):
    """Load the HTS label file at ``path`` and return its duration features.

    Args:
        path: location of an HTS label file readable by ``hts.load``.

    Returns:
        numpy.ndarray: duration features cast to ``float32``.
    """
    durations = fe.duration_features(hts.load(path))
    return durations.astype(np.float32)
def postprocess_duration(labels, pred_durations, lag):
    """Rescale predicted phoneme durations note by note using the time-lag.

    Ref : https://arxiv.org/abs/2108.02776

    Args:
        labels (HTSLabelFile): HTS labels
        pred_durations (array or tuple): predicted durations for non-MDN,
            a ``(mean, variance)`` pair for MDN
        lag (array): predicted time-lag, indexed per note

    Returns:
        HTSLabelFile: labels with adjusted durations
    """
    note_indices = get_note_indices(labels)
    # Sentinel: with the total length appended, consecutive index pairs
    # delimit every note, the last one included.
    note_indices.append(len(labels))
    last_note = len(note_indices) - 1

    # A 2-tuple is interpreted as (mean, variance) coming from an MDN.
    is_mdn = isinstance(pred_durations, tuple) and len(pred_durations) == 2

    output_labels = hts.HTSLabelFile()

    for i in range(1, len(note_indices)):
        span = slice(note_indices[i - 1], note_indices[i])
        p = labels[span]

        # eq (11): note length corrected by the lag of this note and of the
        # following one; the final note has no successor. The length is read
        # before the start times are modified below.
        L = int(fe.duration_features(p)[0])
        lag_delta = lag[i - 1] - lag[i] if i < last_note else lag[i - 1]
        L_hat = L - lag_delta / 50000

        # Prevent negative duration
        L_hat = max(L_hat, 1)

        # Move the note onset by the lag, keeping every start at least
        # 50000 * len(p) ahead of its end time and never below zero.
        shifted_starts = np.asarray(p.start_times) + lag[i - 1].reshape(-1)
        latest_starts = np.asarray(p.end_times) - 50000 * len(p)
        p.start_times = np.minimum(shifted_starts, latest_starts)
        p.start_times = np.maximum(p.start_times, 0)
        if len(output_labels) > 0:
            # Stay strictly after the start of the previously emitted label.
            p.start_times = np.maximum(
                p.start_times, output_labels.start_times[-1] + 50000
            )

        # Compute normalized phoneme durations
        if is_mdn:
            mu = pred_durations[0][span]
            sigma_sq = pred_durations[1][span]
            # eq (17)
            rho = (L_hat - mu.sum()) / sigma_sq.sum()
            # eq (16)
            d_norm = mu + rho * sigma_sq

            if np.any(d_norm <= 0):
                # eq (12) (using mu as d_hat)
                print(
                    f"Negative phoneme durations are predicted at {i}-th note. "
                    "The note duration: ",
                    f"{round(float(L)*0.005,3)} sec -> {round(float(L_hat)*0.005,3)} sec",
                )
                print(
                    "It's likely that the model couldn't predict correct durations "
                    "for short notes."
                )
                print(
                    f"Variance scaling based durations (in frame):\n{(mu + rho * sigma_sq)}"
                )
                print(
                    f"Fallback to uniform scaling (in frame):\n{(L_hat * mu / mu.sum())}"
                )
                d_norm = L_hat * mu / mu.sum()
        else:
            # eq (12)
            d_hat = pred_durations[span]
            d_norm = L_hat * d_hat / d_hat.sum()

        # Whole, strictly positive durations only.
        d_norm = np.round(d_norm)
        d_norm[d_norm <= 0] = 1

        p.set_durations(d_norm)

        # Close the previously emitted label exactly where this note starts.
        if len(output_labels) > 0:
            output_labels.end_times[-1] = p.start_times[0]
        for n in p:
            output_labels.append(n)

    return output_labels
def get_duration(lab_path):
    """Return the float32 duration features of a label file, silence excluded.

    Args:
        lab_path: location of an HTS label file readable by ``hts.load``.

    Returns:
        numpy.ndarray: duration features with the rows listed by
        ``silence_phone_indices()`` removed, cast to ``float32``.
    """
    hts_labels = hts.load(lab_path)
    durations = fe.duration_features(hts_labels)
    silence_rows = hts_labels.silence_phone_indices()
    return np.delete(durations, silence_rows, axis=0).astype(np.float32)
def collect_features(self, path):
    """Extract duration features from ``path`` and drop the silence rows.

    Args:
        path: location of an HTS label file readable by ``hts.load``.

    Returns:
        numpy.ndarray: ``float32`` duration features without the rows
        reported by ``silence_phone_indices()``.
    """
    loaded = hts.load(path)
    raw_durations = fe.duration_features(loaded)
    to_drop = loaded.silence_phone_indices()
    trimmed = np.delete(raw_durations, to_drop, axis=0)
    return trimmed.astype(np.float32)
def __test(labels, unit_size, feature_size):
    """Run duration feature extraction for one unit/feature size combination.

    The result is discarded; the call only has to complete without raising.
    """
    options = {"unit_size": unit_size, "feature_size": feature_size}
    fe.duration_features(labels, **options)