def save_and_plot_fn(args, log_dir, step, loss, prefix):
    idx, (seq, spec, align) = args

    # Output paths for the reconstructed waveform and the alignment plot.
    audio_path = os.path.join(
        log_dir, '{}-step-{:09d}-audio{:03d}.wav'.format(prefix, step, idx))
    align_path = os.path.join(
        log_dir, '{}-step-{:09d}-align{:03d}.png'.format(prefix, step, idx))

    # Invert the (transposed) spectrogram back to a waveform and save it.
    waveform = inv_spectrogram(spec.T)
    save_audio(waveform, audio_path)

    info_text = 'step={:d}, loss={:.5f}'.format(step, loss)
    if 'korean_cleaners' in [x.strip() for x in hparams.cleaners.split(',')]:
        # Korean text: recombine jamo before rendering the alignment labels.
        log('Training korean : Use jamo')
        plot.plot_alignment(
            align, align_path, info=info_text,
            text=sequence_to_text(seq,
                                  skip_eos_and_pad=True,
                                  combine_jamo=True),
            isKorean=True)
    else:
        log('Training non-korean : X use jamo')
        plot.plot_alignment(
            align, align_path, info=info_text,
            text=sequence_to_text(seq,
                                  skip_eos_and_pad=True,
                                  combine_jamo=False),
            isKorean=False)
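# Illustrative usage sketch (not part of the original source): save_and_plot_fn
# is written to be mapped over enumerated (sequence, spectrogram, alignment)
# triples, typically via functools.partial. The helper name below and the
# 'eval' prefix are hypothetical.
def _example_save_and_plot(sequences, spectrograms, alignments, log_dir, step, loss):
    from functools import partial

    fn = partial(save_and_plot_fn,
                 log_dir=log_dir, step=step, loss=loss, prefix='eval')
    for args in enumerate(zip(sequences, spectrograms, alignments)):
        fn(args)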
def split_on_silence_with_librosa(
        audio_path, top_db=40, frame_length=1024, hop_length=256,
        skip_idx=0, out_ext="wav",
        min_segment_length=3, max_segment_length=8,
        pre_silence_length=0, post_silence_length=0):

    filename = os.path.basename(audio_path).split('.', 1)[0]

    audio = load_audio(audio_path)

    # First pass: find non-silent intervals on the raw audio.
    edges = librosa.effects.split(
        audio, top_db=top_db,
        frame_length=frame_length, hop_length=hop_length)

    # Remove breath noise inside each non-silent interval, save the cleaned
    # file, then re-run silence detection on the cleaned audio.
    new_audio = np.zeros_like(audio)
    for idx, (start, end) in enumerate(edges[skip_idx:]):
        new_audio[start:end] = remove_breath(audio[start:end])

    save_audio(new_audio, add_postfix(audio_path, "no_breath"))
    audio = new_audio

    edges = librosa.effects.split(
        audio, top_db=top_db,
        frame_length=frame_length, hop_length=hop_length)

    audio_paths = []
    for idx, (start, end) in enumerate(edges[skip_idx:]):
        segment = audio[start:end]
        duration = get_duration(segment)

        # Keep only segments between the min and max length (in seconds).
        if duration <= min_segment_length or duration >= max_segment_length:
            continue

        output_path = "{}/{}.{:04d}.{}".format(
            os.path.dirname(audio_path), filename, idx, out_ext)

        # Pad the segment with leading/trailing silence before saving.
        padded_segment = np.concatenate([
            get_silence(pre_silence_length),
            segment,
            get_silence(post_silence_length),
        ])

        save_audio(padded_segment, output_path)
        audio_paths.append(output_path)

    return audio_paths
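# Illustrative usage sketch (not part of the original source): split a long
# recording into 3-8 second clips written next to the input file. The input
# path below is hypothetical.
def _example_split_on_silence():
    segment_paths = split_on_silence_with_librosa(
        "datasets/son/audio/lecture_01.wav",
        top_db=40,
        min_segment_length=3,
        max_segment_length=8)
    print("wrote {} segments".format(len(segment_paths)))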
def save_and_plot_fn(args, log_dir, step, loss, prefix):
    idx, (seq, spec, align) = args

    audio_path = os.path.join(
        log_dir, '{}-step-{:09d}-audio{:03d}.wav'.format(prefix, step, idx))
    align_path = os.path.join(
        log_dir, '{}-step-{:09d}-align{:03d}.png'.format(prefix, step, idx))

    # Invert the spectrogram to audio and plot the attention alignment.
    waveform = inv_spectrogram(spec.T)
    save_audio(waveform, audio_path)

    info_text = 'step={:d}, loss={:.5f}'.format(step, loss)
    plot.plot_alignment(
        align, align_path, info=info_text,
        text=sequence_to_text(seq,
                              skip_eos_and_pad=True,
                              combine_jamo=True))
def text_recognition(path, config):
    root, ext = os.path.splitext(path)
    txt_path = root + ".txt"

    # Reuse an existing transcription saved next to the audio file.
    if os.path.exists(txt_path):
        with open(txt_path) as f:
            return json.loads(f.read())

    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    out = {}
    error_count = 0

    tmp_path = os.path.splitext(path)[0] + ".tmp.wav"
    while True:
        try:
            client = speech.SpeechClient()

            content = load_audio(
                path, pre_silence_length=config.pre_silence_length,
                post_silence_length=config.post_silence_length)

            max_duration = config.max_duration - \
                config.pre_silence_length - config.post_silence_length
            audio_duration = get_duration(content)

            if audio_duration >= max_duration:
                print(" [!] Skip {} because of duration: {} > {}".format(
                    path, audio_duration, max_duration))
                return {}

            # Resample to the target rate and write a temporary wav for the API.
            content = resample_audio(content, config.sample_rate)
            save_audio(content, tmp_path, config.sample_rate)

            with io.open(tmp_path, 'rb') as f:
                audio = types.RecognitionAudio(content=f.read())

            # Use a separate name so retries keep access to the original config.
            recognition_config = types.RecognitionConfig(
                encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
                sample_rate_hertz=config.sample_rate,
                language_code='ko-KR')

            response = client.recognize(recognition_config, audio)
            if len(response.results) > 0:
                alternatives = response.results[0].alternatives
                results = [
                    alternative.transcript for alternative in alternatives
                ]
                assert len(results) == 1, \
                    "More than 1 results: {}".format(results)

                out = {path: "" if len(results) == 0 else results[0]}
                print(path, results[0])
            break
        except Exception as err:
            # Retry a few times before giving up on this file.
            error_count += 1
            print("Skip warning for {} for {} times ({})".format(
                path, error_count, err))

            if error_count > 5:
                break
            else:
                continue

    remove_file(tmp_path)
    with open(txt_path, 'w') as f:
        json.dump(out, f, indent=2, ensure_ascii=False)

    return out
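# Illustrative usage sketch (not part of the original source): text_recognition
# only needs a config object exposing pre_silence_length, post_silence_length,
# max_duration and sample_rate (normally the parsed CLI arguments). All values
# and the audio path below are hypothetical.
def _example_text_recognition():
    import argparse

    config = argparse.Namespace(
        pre_silence_length=0.1,
        post_silence_length=0.1,
        max_duration=60,
        sample_rate=16000)
    print(text_recognition("datasets/son/audio/lecture_01.0001.wav", config))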
def plot_graph_and_save_audio(args,
                              base_path=None,
                              start_of_sentence=None, end_of_sentence=None,
                              pre_word_num=0, post_word_num=0,
                              pre_surplus_idx=0, post_surplus_idx=1,
                              use_short_concat=False,
                              use_manual_attention=False,
                              save_alignment=False,
                              librosa_trim=False, attention_trim=False,
                              time_str=None, isKorean=True):
    idx, (wav, alignment, path, text, sequence) = args

    if base_path:
        plot_path = "{}/{}.png".format(base_path, get_time())
    elif path:
        plot_path = path.rsplit('.', 1)[0] + ".png"
    else:
        plot_path = None

    #plot_path = add_prefix(plot_path, time_str)
    if use_manual_attention:
        plot_path = add_postfix(plot_path, "manual")

    if plot_path:
        plot.plot_alignment(alignment, plot_path, text=text, isKorean=isKorean)

    if use_short_concat:
        wav = short_concat(
            wav, alignment, text,
            start_of_sentence, end_of_sentence,
            pre_word_num, post_word_num,
            pre_surplus_idx, post_surplus_idx)

    if attention_trim and end_of_sentence:
        # Cut the spectrogram once the attention has settled on the last
        # input token, dropping trailing frames generated after the sentence.
        end_idx_counter = 0
        attention_argmax = alignment.argmax(0)
        end_idx = min(len(sequence) - 1, max(attention_argmax))
        max_counter = min((attention_argmax == end_idx).sum(), 5)

        for jdx, attend_idx in enumerate(attention_argmax):
            if len(attention_argmax) > jdx + 1:
                if attend_idx == end_idx:
                    end_idx_counter += 1

                if attend_idx == end_idx and attention_argmax[jdx + 1] > end_idx:
                    break

                if end_idx_counter >= max_counter:
                    break
            else:
                break

        spec_end_idx = hparams.reduction_factor * jdx + 3
        wav = wav[:spec_end_idx]

    audio_out = inv_spectrogram(wav.T)

    if librosa_trim and end_of_sentence:
        # Trim trailing silence from the synthesized waveform.
        yt, index = librosa.effects.trim(
            audio_out, frame_length=5120, hop_length=256, top_db=50)
        audio_out = audio_out[:index[-1]]

    if save_alignment:
        alignment_path = "{}/{}.npy".format(base_path, idx)
        np.save(alignment_path, alignment, allow_pickle=False)

    if path or base_path:
        if path:
            current_path = add_postfix(path, idx)
        elif base_path:
            current_path = plot_path.replace(".png", ".wav")

        save_audio(audio_out, current_path)
        return True
    else:
        # No output path given: return the encoded audio bytes instead.
        io_out = io.BytesIO()
        save_audio(audio_out, io_out)
        result = io_out.getvalue()
        return result
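# Illustrative usage sketch (not part of the original source): synthesis code
# typically calls plot_graph_and_save_audio once per generated sentence with
# the predicted spectrogram, attention alignment, output path, input text and
# token sequence. The argument names and output path below are hypothetical.
def _example_plot_and_save(wav, alignment, text, sequence):
    return plot_graph_and_save_audio(
        (0, (wav, alignment, "samples/output.wav", text, sequence)),
        end_of_sentence=True,
        librosa_trim=True,
        attention_trim=True,
        isKorean=True)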