def fix_mono_lab_before_align(lab):
    """Copy ``lab`` entry by entry into a fresh ``HTSLabelFile``.

    No label is altered at this stage. The first entry is appended with the
    default ``strict`` setting of ``append``; every following entry is
    appended with ``strict=False``.

    Args:
        lab: label sequence supporting ``len()`` and integer indexing
            (presumably an ``HTSLabelFile``). It must hold at least one
            entry because the first one is read unconditionally.

    Returns:
        HTSLabelFile: a new label file containing the entries of ``lab``.
    """
    copied = hts.HTSLabelFile()
    copied.append(lab[0])

    total = len(lab)
    position = 1
    while position < total:
        # nothing to repair before alignment; carry the entry over as-is
        copied.append(lab[position], strict=False)
        position += 1

    return copied
def test_hts_append():
    """Exercise ``HTSLabelFile.append``: round trip and time validation."""
    source_path = join(DATA_DIR, "BASIC5000_0001.lab")
    test_labels = hts.load(source_path)
    print("\n{}".format(test_labels))

    # Rebuilding the file one label at a time must give the same string
    # representation as the loaded file.
    rebuilt = hts.HTSLabelFile()
    assert str(rebuilt) == ""
    for entry in test_labels:
        rebuilt.append(entry)
    assert str(test_labels) == str(rebuilt)

    @raises(ValueError)
    def test_invalid_start_time():
        # start time (100000) lies after the end time (0)
        lab_file = hts.HTSLabelFile()
        lab_file.append((100000, 0, "NG"))

    def test_succeeding_times():
        # the second label starts exactly where the first one ends
        lab_file = hts.HTSLabelFile()
        lab_file.append((0, 1000000, "OK"))
        lab_file.append((1000000, 2000000, "OK"))

    @raises(ValueError)
    def test_non_succeeding_times():
        # the second label starts later than the first one ends
        lab_file = hts.HTSLabelFile()
        lab_file.append((0, 1000000, "OK"))
        lab_file.append((1500000, 2000000, "NG"))

    test_invalid_start_time()
    test_succeeding_times()
    test_non_succeeding_times()
def fix_mono_lab_after_align(lab):
    """Rebuild aligned monophone labels, repairing three timing problems.

    Walking through ``lab`` from the second entry on, each entry is compared
    with the last entry already written to the output:

    * two neighbouring ``pau``/``sil`` entries: the previous entry is cut to
      half of its length and the current one starts at the new boundary;
    * same context with identical start and end times: the span is split in
      half between the previous and the current entry;
    * the previous end time differs from the current start time: the current
      entry is stretched back so that it starts at the previous end time.

    Any other entry is appended unchanged with ``strict=False``. A message is
    printed for every repair.

    Args:
        lab (HTSLabelFile): aligned labels exposing ``start_times``,
            ``end_times`` and ``contexts``; must not be empty.

    Returns:
        HTSLabelFile: the repaired labels.
    """
    silences = ("pau", "sil")
    duplicate_msg = (
        "{} and {} have the same start_time {} and end_time {}. "
        "There seems to be a missing phoneme in mono_dtw."
    )
    gap_msg = (
        "end_time {} of the phoneme {} and start_time {} of the phoneme {} "
        "is not the same. "
        "There seems to be a missing phoneme in sinsy_mono_round."
    )

    fixed = hts.HTSLabelFile()
    fixed.append(lab[0])

    for idx in range(1, len(lab)):
        prev_ctx = fixed.contexts[-1]
        cur_start = lab.start_times[idx]
        cur_end = lab.end_times[idx]
        cur_ctx = lab.contexts[idx]

        # consecutive pau/sil: shorten the previous one to half its length
        if prev_ctx in silences and cur_ctx in silences:
            print("Consecutive pau/sil-s are detected.")
            half = round((fixed.end_times[-1] - fixed.start_times[-1]) / 2)
            fixed.end_times[-1] = fixed.start_times[-1] + half
            fixed.append((fixed.end_times[-1], cur_end, cur_ctx))
            continue

        # duplicated vowel before "cl"?
        same_span = (
            fixed.start_times[-1] == cur_start and fixed.end_times[-1] == cur_end
        )
        if prev_ctx == cur_ctx and same_span:
            print(
                duplicate_msg.format(
                    prev_ctx, cur_ctx, fixed.start_times[-1], fixed.end_times[-1]
                )
            )
            print()
            half = round((cur_end - cur_start) / 2)
            fixed.end_times[-1] = fixed.start_times[-1] + half
            fixed.append((fixed.end_times[-1], cur_end, cur_ctx))
            continue

        # gap between the end of the last phoneme and the start of this one
        if fixed.end_times[-1] != cur_start:
            print(gap_msg.format(fixed.end_times[-1], prev_ctx, cur_start, cur_ctx))
            # move the start of the current entry back to the previous end
            fixed.append((fixed.end_times[-1], cur_end, cur_ctx))
            continue

        fixed.append(lab[idx], strict=False)

    return fixed
def postprocess_duration(labels, pred_durations, lag):
    """Apply predicted time-lags and phoneme durations note by note.

    For every note (a run of labels delimited by ``get_note_indices``) the
    start times are shifted by the note's time-lag, then the predicted
    phoneme durations are rescaled so that they sum to the note length given
    by ``fe.duration_features``.

    Args:
        labels (HTSLabelFile): input labels.
        pred_durations (array): predicted phoneme durations, indexable with
            the same positions as ``labels``.
        lag (array): predicted time-lag, one entry per note.

    Returns:
        HTSLabelFile: labels with adjusted start times and durations.
    """
    note_indices = get_note_indices(labels)
    # sentinel marking the end of the last note
    note_indices.append(len(labels))

    output_labels = hts.HTSLabelFile()

    for note in range(1, len(note_indices)):
        begin, end = note_indices[note - 1], note_indices[note]
        phonemes = labels[begin:end]
        has_output = len(output_labels) > 0

        # Apply time lag, leaving at least 50000 time units per phoneme
        # before the end time and never going below zero.
        shifted = np.asarray(phonemes.start_times) + lag[note - 1].reshape(-1)
        latest = np.asarray(phonemes.end_times) - 50000 * len(phonemes)
        starts = np.maximum(np.minimum(shifted, latest), 0)
        if has_output:
            # keep the previously emitted label at least 50000 units long
            starts = np.maximum(starts, output_labels.start_times[-1] + 50000)
        phonemes.start_times = starts

        # Compute normalized phoneme durations
        note_length = fe.duration_features(phonemes)[0]
        predicted = pred_durations[begin:end]
        scaled = np.round(note_length * predicted / predicted.sum())
        scaled[scaled <= 0] = 1

        # TODO: better way to adjust?
        # Push any rounding residue onto the last phoneme.
        total = scaled.sum()
        if total != note_length:
            scaled[-1] += note_length - total
        phonemes.set_durations(scaled)

        if has_output:
            output_labels.end_times[-1] = phonemes.start_times[0]
        for entry in phonemes:
            output_labels.append(entry)

    return output_labels
def _fix_mono_lab_after_align_default(lab):
    """Repair consecutive pau/sil entries and timing gaps in aligned labels.

    Each entry after the first is compared with the last entry written so
    far. Two neighbouring ``pau``/``sil`` entries cause the previous one to
    be cut to half its length; a mismatch between the previous end time and
    the current start time causes the current entry to start at the previous
    end time. Everything else is appended unchanged with ``strict=False``.

    Args:
        lab (HTSLabelFile): aligned labels exposing ``start_times``,
            ``end_times`` and ``contexts``; must not be empty.

    Returns:
        HTSLabelFile: the repaired labels.
    """
    pause_like = ("pau", "sil")

    out = hts.HTSLabelFile()
    out.append(lab[0])

    for idx in range(1, len(lab)):
        next_start = lab.start_times[idx]
        next_end = lab.end_times[idx]
        next_ctx = lab.contexts[idx]
        last_ctx = out.contexts[-1]

        if last_ctx in pause_like and next_ctx in pause_like:
            # fix consecutive pau/sil: halve the previous entry
            print("Consecutive pau/sil-s are detected.")
            half = round((out.end_times[-1] - out.start_times[-1]) / 2)
            out.end_times[-1] = out.start_times[-1] + half
            out.append((out.end_times[-1], next_end, next_ctx))
        elif out.end_times[-1] != next_start:
            # The previous entry does not end where the current one starts.
            mismatch = (
                "end_time {} of the phoneme {} and start_time {} "
                "of the phoneme {} is not the same."
            )
            print(mismatch.format(out.end_times[-1], last_ctx, next_start, next_ctx))
            print("There seems to be a missing phoneme in generated_mono_round.")
            # move the start of the current entry back to the previous end
            out.append((out.end_times[-1], next_end, next_ctx))
        else:
            out.append(lab[idx], strict=False)

    return out
def remove_sil_and_pau(lab):
    """Return a copy of ``lab`` without silence and pause entries.

    An entry is dropped when its last field (the context string) contains
    ``"-sil"`` or ``"-pau"``. The remaining entries are appended with
    ``strict=False``.

    Args:
        lab: iterable of label entries whose last element is the context.

    Returns:
        HTSLabelFile: the entries that were kept, in their original order.
    """
    kept = hts.HTSLabelFile()
    for entry in lab:
        context = entry[-1]
        if "-sil" in context or "-pau" in context:
            continue
        kept.append(entry, strict=False)
    return kept
def merge_sil(lab):
    """Collapse runs of consecutive ``sil`` entries into a single entry.

    The first entry decides how silence is recognised: if its context
    contains ``"@"`` the labels are treated as full-context and a silence is
    any context containing ``"-sil"``; otherwise the context must equal
    ``"sil"``. When the last written entry and the current entry are both
    silence, the written entry's end time is extended instead of appending.

    Args:
        lab (HTSLabelFile): labels to process; must not be empty.

    Returns:
        HTSLabelFile: labels with adjacent silences merged.
    """
    merged = hts.HTSLabelFile()
    merged.append(lab[0], strict=False)
    full_context = "@" in lab[0][-1]

    for idx in range(1, len(lab)):
        current = lab[idx]
        written_ctx = merged[-1][-1]
        current_ctx = current[-1]

        if full_context:
            both_sil = "-sil" in written_ctx and "-sil" in current_ctx
        else:
            both_sil = written_ctx == "sil" and current_ctx == "sil"

        if both_sil:
            # extend sil
            merged.end_times[-1] = current[1]
        else:
            merged.append(current, strict=False)

    return merged
def _fix_mono_lab_after_align_default(lab):
    """Split duplicated ``pau`` entries that occupy the same time span.

    When the last written entry and the current entry are both ``pau`` with
    identical start and end times, the span is halved: the written entry
    keeps the first half and the current entry is appended for the rest.
    All other entries are appended unchanged with ``strict=False``.

    Args:
        lab (HTSLabelFile): aligned labels exposing ``start_times``,
            ``end_times`` and ``contexts``; must not be empty.

    Returns:
        HTSLabelFile: the repaired labels.
    """
    fixed = hts.HTSLabelFile()
    fixed.append(lab[0])

    for idx in range(1, len(lab)):
        duplicated_pau = (
            fixed.contexts[-1] == "pau"
            and lab.contexts[idx] == "pau"
            and fixed.start_times[-1] == lab.start_times[idx]
            and fixed.end_times[-1] == lab.end_times[idx]
        )
        if not duplicated_pau:
            fixed.append(lab[idx], strict=False)
            continue

        # fix contiguous pau: share the span equally between the two entries
        half = round((lab.end_times[idx] - lab.start_times[idx]) / 2)
        fixed.end_times[-1] = fixed.start_times[-1] + half
        fixed.append((fixed.end_times[-1], lab.end_times[idx], lab.contexts[idx]))

    return fixed
os.makedirs(d, exist_ok=True) sinsy = pysinsy.sinsy.Sinsy() assert sinsy.setLanguages("j", pysinsy.get_default_dic_dir()) mono_lab_files = sorted(glob(join(args.pjs_root, "**/*.lab"))) muxicxml_files = sorted(glob(join(args.pjs_root, "**/*.musicxml"))) assert len(mono_lab_files) == len(muxicxml_files) for mono_path, xml_path in zip(mono_lab_files, muxicxml_files): align_mono_lab = hts.load(mono_path) name = basename(mono_path) assert sinsy.loadScoreFromMusicXML(xml_path) # check if sinsy's phoneme output is same as the provided alignment format sinsy_labels = sinsy.createLabelData(True, 1, 1).getData() sinsy_mono_lab = hts.HTSLabelFile() for label in sinsy_labels: sinsy_mono_lab.append(label.split(), strict=False) assert len(align_mono_lab) == len(sinsy_mono_lab) assert (np.asarray(align_mono_lab.contexts) == np.asarray( sinsy_mono_lab.contexts)).all() # rounding has_too_short_ph = False for idx in range(len(align_mono_lab)): b, e = align_mono_lab.start_times[idx], align_mono_lab.end_times[idx] bb, ee = round(b / 50000) * 50000, round(e / 50000) * 50000 # TODO: better way if bb == ee: # ensure minimum frame length 1
def test_non_succeeding_times_wo_strict():
    """Append two non-contiguous labels with ``strict=False``.

    The second label starts at 1500000 although the first ends at 1000000;
    the test passes as long as neither ``append`` call raises.
    """
    lab_file = hts.HTSLabelFile()
    entries = [(0, 1000000, "OK"), (1500000, 2000000, "OK")]
    for entry in entries:
        lab_file.append(entry, strict=False)
def test_non_succeeding_times():
    """Append a label that does not start where the previous one ended.

    NOTE(review): the "NG" marker suggests the second ``append`` is expected
    to fail under the default ``strict`` setting; any decorator asserting
    that is not visible in this snippet - confirm against the full file.
    """
    first = (0, 1000000, "OK")
    second = (1500000, 2000000, "NG")

    lab_file = hts.HTSLabelFile()
    lab_file.append(first)
    lab_file.append(second)
def test_invalid_start_time():
    """Append a label whose start time (100000) is after its end time (0).

    NOTE(review): the "NG" marker suggests this ``append`` is expected to
    fail; any decorator asserting that is not visible in this snippet -
    confirm against the full file.
    """
    reversed_span = (100000, 0, "NG")

    lab_file = hts.HTSLabelFile()
    lab_file.append(reversed_span)
def postprocess_duration(labels, pred_durations, lag):
    """Post-process durations based on predicted time-lag

    Labels are processed note by note (note boundaries come from
    ``get_note_indices``). For each note the target length is corrected by
    the time-lag, the start times are shifted, and the predicted phoneme
    durations are rescaled to fill the target length.

    Ref : https://arxiv.org/abs/2108.02776

    Args:
        labels (HTSLabelFile): HTS labels
        pred_durations (array or tuple): predicted durations for non-MDN,
            mean and variance for MDN. A 2-tuple is treated as
            ``(mean, variance)``; anything else as plain durations.
        lag (array): predicted time-lag, indexed per note

    Returns:
        HTSLabelFile: labels with adjusted durations
    """
    note_indices = get_note_indices(labels)
    # append the end of note
    note_indices.append(len(labels))

    # MDN output is recognised purely by its shape: a tuple of two items
    is_mdn = isinstance(pred_durations, tuple) and len(pred_durations) == 2

    output_labels = hts.HTSLabelFile()

    for i in range(1, len(note_indices)):
        # labels belonging to the (i-1)-th note
        p = labels[note_indices[i - 1] : note_indices[i]]

        # Compute note duration with time-lag
        # eq (11)
        # NOTE(review): 50000 looks like one 5 ms frame expressed in label
        # time units (the log messages below convert frames with 0.005 sec)
        # - confirm against the frame shift used by the caller.
        L = int(fe.duration_features(p)[0])
        if i < len(note_indices) - 1:
            # inner notes: correct by this note's lag and the next note's lag
            L_hat = L - (lag[i - 1] - lag[i]) / 50000
        else:
            # last note: there is no following lag to take into account
            L_hat = L - (lag[i - 1]) / 50000

        # Prevent negative duration
        L_hat = max(L_hat, 1)

        # adjust the start time of the note
        # The shifted start is capped so that 50000 units per phoneme remain
        # before the end time, and is never allowed to go below zero.
        p.start_times = np.minimum(
            np.asarray(p.start_times) + lag[i - 1].reshape(-1),
            np.asarray(p.end_times) - 50000 * len(p),
        )
        p.start_times = np.maximum(p.start_times, 0)
        if len(output_labels) > 0:
            # keep the previously emitted label at least 50000 units long
            p.start_times = np.maximum(
                p.start_times, output_labels.start_times[-1] + 50000
            )

        # Compute normalized phoneme durations
        if is_mdn:
            mu = pred_durations[0][note_indices[i - 1] : note_indices[i]]
            sigma_sq = pred_durations[1][note_indices[i - 1] : note_indices[i]]
            # eq (17)
            rho = (L_hat - mu.sum()) / sigma_sq.sum()
            # eq (16)
            d_norm = mu + rho * sigma_sq

            if np.any(d_norm <= 0):
                # Variance scaling produced a non-positive duration: report
                # it and fall back to scaling the means uniformly.
                # eq (12) (using mu as d_hat)
                print(
                    f"Negative phoneme durations are predicted at {i}-th note. "
                    "The note duration: ",
                    f"{round(float(L)*0.005,3)} sec -> {round(float(L_hat)*0.005,3)} sec",
                )
                print(
                    "It's likely that the model couldn't predict correct durations "
                    "for short notes."
                )
                print(
                    f"Variance scaling based durations (in frame):\n{(mu + rho * sigma_sq)}"
                )
                print(
                    f"Fallback to uniform scaling (in frame):\n{(L_hat * mu / mu.sum())}"
                )
                d_norm = L_hat * mu / mu.sum()
        else:
            # eq (12)
            d_hat = pred_durations[note_indices[i - 1] : note_indices[i]]
            d_norm = L_hat * d_hat / d_hat.sum()

        # Round the durations and force every phoneme to a value of at
        # least 1.
        d_norm = np.round(d_norm)
        d_norm[d_norm <= 0] = 1

        p.set_durations(d_norm)

        if len(output_labels) > 0:
            # close the previous label exactly where this note begins
            output_labels.end_times[-1] = p.start_times[0]
        for n in p:
            output_labels.append(n)

    return output_labels
def _process_utterance(out_dir, index, speaker_id, wav_path, lab_path,
                       binary_dict, continuous_dict, text):
    """Preprocess one utterance and store its audio and context features.

    The waveform is loaded, optionally rescaled, and encoded according to
    ``hparams.input_type``. Linguistic features with frame-level information
    are computed from the label file. The audio is truncated to a whole
    number of hops, and both arrays are saved as ``.npy`` files in
    ``out_dir``.

    Args:
        out_dir: directory receiving the ``.npy`` files.
        index: not used inside this function.
        speaker_id: passed through to the returned tuple.
        wav_path: path of the audio file.
        lab_path: path of the HTS label file.
        binary_dict: question set passed to ``fe.linguistic_features``.
        continuous_dict: question set passed to ``fe.linguistic_features``.
        text: passed through to the returned tuple.

    Returns:
        tuple: ``(audio_filename, context_filename, timesteps, text,
        speaker_id)`` where ``timesteps`` is the length of the saved audio.
    """
    # Load the audio to a numpy array. Resampled if needed
    wav = audio.load_wav(wav_path)

    # The utterance id is the file name without its extension
    utt_id = os.path.splitext(os.path.basename(wav_path))[0]

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    input_type = hparams.input_type
    channels = hparams.quantize_channels
    # NOTE: constant_values is computed in every branch but never read below.
    if is_mulaw_quantize(input_type):
        # Mu-law quantize: [0, quantize_channels)
        out = P.mulaw_quantize(wav, channels)
        constant_values = P.mulaw_quantize(0, channels)
        out_dtype = np.int16
    elif is_mulaw(input_type):
        # Mu-law without quantization: [-1, 1]
        out = P.mulaw(wav, channels)
        constant_values = P.mulaw(0.0, channels)
        out_dtype = np.float32
    else:
        # Raw waveform: [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # time-aligned context
    if hparams.frame_shift_ms is not None:
        frame_shift = hparams.frame_shift_ms * 10000
    else:
        # derive the frame shift from hop size and sampling rate
        frame_shift = (hparams.hop_size * 10000000) // hparams.sample_rate
    labels = hts.HTSLabelFile(frame_shift)
    labels.load(lab_path)
    context = fe.linguistic_features(
        labels,
        binary_dict,
        continuous_dict,
        add_frame_features=True,
        frame_shift_in_micro_sec=frame_shift,
    )

    # Drop trailing samples that do not fill a complete hop
    hop = audio.get_hop_size()
    out = out[:(len(out) // hop) * hop]
    timesteps = len(out)

    # Write the audio and the context features to disk
    audio_filename = "audio-" + utt_id + ".npy"
    context_filename = "context-" + utt_id + ".npy"
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, context_filename),
            context.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, context_filename, timesteps, text, speaker_id)
if config is None: print(f"Cannot read config file: {sys.argv[1]}.") sys.exit(-1) print("Copy original label files.") files = sorted( glob(join(expanduser(config["db_root"]), "**/*.lab"), recursive=True)) dst_dir = join(config["out_dir"], "mono_label") os.makedirs(dst_dir, exist_ok=True) for m in tqdm(files): if config["spk"] == "natsumeyuuri": # natsume_singing name = splitext(basename(m))[0] if name in config["exclude_songs"]: continue h = hts.HTSLabelFile() with open(m) as f: for label in f: s, e, lab = label.strip().split() if config["label_time_unit"] == "sec": s, e = int(float(s) * 1e7), int(float(e) * 1e7) h.append((s, e, lab)) with open(join(dst_dir, basename(m)), "w") as of: of.write(str(fix_mono_lab_before_align(h, config["spk"]))) else: # ofuton_p_utagoe_db, oniku_kurumi_utagoe_db name = splitext(basename(m))[0] if name in config["exclude_songs"]: continue f = hts.load(m) with open(join(dst_dir, basename(m)), "w") as of:
def segment_labels(lab, strict=True, threshold=1.0, min_duration=5.0,
                   force_split_threshold=10.0):
    """Segment labels based on sil/pau

    Example:

    [a b c sil d e f pau g h i sil j k l]
    ->
    [a b c] [d e f] [g h i] [j k l]

    Label lengths are converted with ``(end - start) * 1e-7`` before being
    compared with the thresholds, so the thresholds are in seconds provided
    label times use 100 ns units (NOTE(review): confirm the time unit).

    Args:
        lab (HTSLabelFile): labels to split.
        strict (bool): forwarded to ``HTSLabelFile.append`` while the
            temporary segments are built.
        threshold (float): a silence longer than this splits the current
            segment, provided the segment's non-silent duration already
            exceeds ``min_duration``.
        min_duration (float): minimum non-silent duration of a segment. A
            trailing segment shorter than this is merged into the previous
            one, if there is one.
        force_split_threshold (float): any label longer than this forces a
            split regardless of ``min_duration``.

    Returns:
        tuple: ``(segments, start_indices, end_indices)``. ``segments`` holds
        the trimmed segments returned by ``trim_long_sil_and_pau``; the two
        index lists give, for each segment, its first and last position in
        ``lab`` (both inclusive) after applying the trim offsets.
    """
    seg = hts.HTSLabelFile()
    start_indices = []
    end_indices = []
    si = 0
    large_silence_detected = False
    last_idx = len(lab) - 1

    for idx, (s, e, context) in enumerate(lab):
        d = (e - s) * 1e-7
        is_silence = _is_silence(context)

        # Compute duration except for long silences
        seg_d = compute_nosil_duration(seg) if len(seg) > 0 else 0

        # let's try to split
        # if we find large silence, force split regardless min_duration
        forced = d > force_split_threshold
        if forced or (is_silence and d > threshold and seg_d > min_duration):
            if idx != last_idx and len(seg) > 0:
                large_silence_detected = forced
                start_indices.append(si)
                end_indices.append(idx - 1)
                seg = hts.HTSLabelFile()
            # the splitting label itself never becomes part of a segment
            continue

        if len(seg) == 0:
            si = idx
        seg.append((s, e, context), strict)

    if len(seg) > 0:
        seg_d = compute_nosil_duration(seg)
        # If the last segment is short, combine with the previous segment.
        # Without a previous segment there is nothing to merge into, so the
        # short segment is kept on its own instead of indexing an empty list.
        if seg_d < min_duration and not large_silence_detected and end_indices:
            end_indices[-1] = si + len(seg) - 1
        else:
            start_indices.append(si)
            end_indices.append(si + len(seg) - 1)

    # Trim large sil for each segment
    segments2 = []
    start_indices_new, end_indices_new = [], []
    for first, last in zip(start_indices, end_indices):
        seg = lab[first:last + 1]

        # ignore "sil" or "pau" only segment
        if len(seg) == 1 and _is_silence(seg.contexts[0]):
            continue
        seg2, forward, backward = trim_long_sil_and_pau(seg, return_indices=True)

        # the trim offsets are relative to the segment; map them back to lab
        start_indices_new.append(first + forward)
        end_indices_new.append(first + backward)

        segments2.append(seg2)

    return segments2, start_indices_new, end_indices_new
assert sinsy.setLanguages("j", config["sinsy_dic"]) # generate full/mono labels by sinsy print("Convert musicxml to label files.") files = sorted(glob(join(expanduser(config["db_root"]), "**/*.*xml"), recursive=True)) for path in tqdm(files): name = splitext(basename(path))[0] if name in config["exclude_songs"]: continue assert sinsy.loadScoreFromMusicXML(path) for is_mono in [True, False]: n = "generated_mono" if is_mono else "generated_full" labels = sinsy.createLabelData(is_mono, 1, 1).getData() lab = hts.HTSLabelFile() for label in labels: lab.append(label.split(), strict=False) lab = merge_sil(lab) dst_dir = join(config["out_dir"], f"{n}") os.makedirs(dst_dir, exist_ok=True) with open(join(dst_dir, name + ".lab"), "w") as f: f.write(str(lab)) sinsy.clearScore() print("Copy original label files.") files = sorted(glob(join(expanduser(config["db_root"]), "**/*.lab"), recursive=True)) dst_dir = join(config["out_dir"], "mono_label") os.makedirs(dst_dir, exist_ok=True) for m in tqdm(files): if config["spk"] == "natsumeyuuri":
from tqdm import tqdm global DATA_ROOT from sklearn.preprocessing import StandardScaler,MinMaxScaler sample_rate=22050 hop_size=256 frame_period =1000*hop_size/sample_rate #5 #hop_size=int(frame_period*sample_rate/1000) frame_shift_in_micro_sec=int(frame_period*10000) fft_len=1024 mel_dim=80 window='hann' fmin=50 fmax=7600 _hts=hts.HTSLabelFile(frame_shift_in_micro_sec=frame_shift_in_micro_sec) def is_outlier(x, p25, p75): """Check if value is an outlier.""" lower = p25 - 1.5 * (p75 - p25) upper = p75 + 1.5 * (p75 - p25) return x <= lower or x >= upper def remove_outlier(x, p_bottom: int = 25, p_top: int = 75): """Remove outlier from x.""" p_bottom = np.percentile(x, p_bottom) p_top = np.percentile(x, p_top) indices_of_outliers = [] for ind, value in enumerate(x): if is_outlier(value, p_bottom, p_top):