def _process_utterance(out_dir, index, tar_cd_path, in_jd_path, in_cg_path):
    '''Preprocesses one target wav together with its two input wavs.

    Writes the linear and mel spectrograms of the target, and the mel
    spectrograms of both inputs, to out_dir as transposed float32 .npy files.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      tar_cd_path: Path to the target audio file
      in_jd_path: Path to the first input audio file
      in_cg_path: Path to the second input audio file

    Returns:
      A (tar_cd_spectrogram_filename, tar_cd_mel_filename, n_frames,
      in_jd_mel_spectrogram_filename, in_cg_mel_spectrogram_filename) tuple,
      where n_frames is shape[1] of the target linear spectrogram.
    '''
    def _mel(samples):
        return audio.melspectrogram(samples).astype(np.float32)

    # Target: linear-scale and mel-scale features.
    target_wav = audio.load_wav(tar_cd_path)
    target_linear = audio.spectrogram(target_wav).astype(np.float32)
    n_frames = target_linear.shape[1]
    target_mel = _mel(target_wav)

    # Inputs: only mel features are computed; no linear spectrogram is needed.
    jd_wav = audio.load_wav(in_jd_path)
    cg_wav = audio.load_wav(in_cg_path)
    jd_mel = _mel(jd_wav)
    cg_mel = _mel(cg_wav)

    jd_mel_name = 'Imuspeech-in_jd_mel_spec-%05d.npy' % index
    cg_mel_name = 'Imuspeech-in_cg_mel_spec-%05d.npy' % index
    target_linear_name = 'Imuspeech-tar_cd_spec-%05d.npy' % index
    target_mel_name = 'Imuspeech-tar_cd_mel-%05d.npy' % index

    # Every array is stored transposed.
    outputs = (
        (jd_mel_name, jd_mel),
        (cg_mel_name, cg_mel),
        (target_linear_name, target_linear),
        (target_mel_name, target_mel),
    )
    for filename, feature in outputs:
        np.save(os.path.join(out_dir, filename), feature.T, allow_pickle=False)

    return (target_linear_name, target_mel_name, n_frames, jd_mel_name, cg_mel_name)
def _process_utterance(out_dir, index, src_path, tgt_path):
    '''Preprocesses a single source/target audio pair.

    This writes the mel and linear scale spectrograms of both wavs to disk and
    returns a tuple to write to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      src_path: Path to the source audio file
      tgt_path: Path to the target audio file

    Returns:
      A (tgt_spectrogram_filename, tgt_mel_filename, tgt_n_frames,
      src_spectrogram_filename) tuple to write to train.txt. The source mel
      file is written to disk as well, but its name is not part of the tuple.
    '''
    # Load the audio to numpy arrays:
    src_wav = audio.load_wav(src_path)
    tgt_wav = audio.load_wav(tgt_path)

    # Compute the linear-scale spectrograms. The source uses its own frequency
    # count and frame length taken from hparams.
    src_spectrogram = audio.spectrogram(
        src_wav,
        num_src_freq=hparams.num_src_freq,
        frame_length_ms=hparams.src_frame_length_ms).astype(np.float32)
    tgt_spectrogram = audio.spectrogram(tgt_wav).astype(np.float32)
    # Only the target frame count is reported; the unused source frame count
    # local was removed.
    tgt_n_frames = tgt_spectrogram.shape[1]

    # Compute the mel-scale spectrograms:
    src_mel_spectrogram = audio.melspectrogram(src_wav).astype(np.float32)
    tgt_mel_spectrogram = audio.melspectrogram(tgt_wav).astype(np.float32)

    # Write the spectrograms to disk (transposed):
    src_spectrogram_filename = 'wav2wav_src-spec-%05d.npy' % index
    src_mel_filename = 'wav2wav_src-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, src_spectrogram_filename), src_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, src_mel_filename), src_mel_spectrogram.T, allow_pickle=False)

    tgt_spectrogram_filename = 'wav2wav_tgt-spec-%05d.npy' % index
    tgt_mel_filename = 'wav2wav_tgt-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, tgt_spectrogram_filename), tgt_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, tgt_mel_filename), tgt_mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (tgt_spectrogram_filename, tgt_mel_filename, tgt_n_frames, src_spectrogram_filename)
def _process_utterance(out_dir, index, wav_path_neutral, wav_path_happy):
    '''Preprocesses a single neutral/happy audio pair.

    This writes the mel and linear scale spectrograms of both wavs to disk and
    returns a tuple to write to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      wav_path_neutral: Path to the neutral audio file
      wav_path_happy: Path to the happy audio file

    Returns:
      A (spectrogram_neutral_filename, mel_neutral_filename,
      spectrogram_happy_filename, mel_happy_filename, n_frames) tuple to write
      to train.txt. n_frames is shape[1] of the happy linear spectrogram.
    '''
    # Load the audio to numpy arrays:
    wav_neutral = audio.load_wav(wav_path_neutral)
    wav_happy = audio.load_wav(wav_path_happy)

    # Neutral features:
    spectrogram_neutral = audio.spectrogram(wav_neutral).astype(np.float32)
    mel_spectrogram_neutral = audio.melspectrogram(wav_neutral).astype(np.float32)

    # Happy features. The returned frame count has always been the happy one:
    # the neutral count was computed and then overwritten, so that dead store
    # is gone.
    spectrogram_happy = audio.spectrogram(wav_happy).astype(np.float32)
    n_frames = spectrogram_happy.shape[1]
    mel_spectrogram_happy = audio.melspectrogram(wav_happy).astype(np.float32)

    # Write the spectrograms to disk (transposed):
    spectrogram_neutral_filename = 'neutral-spec-%05d.npy' % index
    mel_neutral_filename = 'neutral-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_neutral_filename), spectrogram_neutral.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_neutral_filename), mel_spectrogram_neutral.T, allow_pickle=False)

    spectrogram_happy_filename = 'happy-spec-%05d.npy' % index
    mel_happy_filename = 'happy-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_happy_filename), spectrogram_happy.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_happy_filename), mel_spectrogram_happy.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_neutral_filename, mel_neutral_filename,
            spectrogram_happy_filename, mel_happy_filename, n_frames)
def _process_utterance(out_dir, index, source_wav_path, target_wav_path):
    '''Preprocesses one source/target audio pair.

    Saves the source mel spectrogram plus the target linear and mel
    spectrograms to out_dir as transposed float32 .npy files. No linear
    spectrogram is produced for the source.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      source_wav_path: Path to the source audio file
      target_wav_path: Path to the target audio file

    Returns:
      A (source_mel_filename, n_frames, target_spectrogram_filename,
      target_mel_filename) tuple to write to train.txt, where n_frames is
      shape[1] of the target linear spectrogram.
    '''
    src_samples = audio.load_wav(source_wav_path)
    tgt_samples = audio.load_wav(target_wav_path)

    # Linear-scale features are only needed for the target.
    tgt_linear = audio.spectrogram(tgt_samples).astype(np.float32)
    n_frames = tgt_linear.shape[1]

    # Mel-scale features for both sides.
    src_mel = audio.melspectrogram(src_samples).astype(np.float32)
    tgt_mel = audio.melspectrogram(tgt_samples).astype(np.float32)

    src_mel_name = 'source-mel-%05d.npy' % index
    tgt_linear_name = 'target-spec-%05d.npy' % index
    tgt_mel_name = 'target-mel-%05d.npy' % index

    to_write = (
        (src_mel_name, src_mel),
        (tgt_linear_name, tgt_linear),
        (tgt_mel_name, tgt_mel),
    )
    for filename, feature in to_write:
        np.save(os.path.join(out_dir, filename), feature.T, allow_pickle=False)

    return (src_mel_name, n_frames, tgt_linear_name, tgt_mel_name)
def run_eval(args):
    '''Synthesizes every line of examples_test.txt with one reference audio.

    For each line, the text after the last '|' is synthesized and written to
    '<i+6>_ref-<reference basename>.wav' in the current working directory.

    Args:
      args: Parsed arguments. Reads args.model and args.reference, and
        overwrites args.text with the text of each processed line.

    Raises:
      ValueError: If args.reference is None.
    '''
    # Fail fast: reject a missing reference before the model is loaded
    # instead of after.
    if args.reference is None:
        raise ValueError("You must set the reference audio.")

    synth = Synthesizer(teacher_forcing_generating=False)
    synth.load(args.model, args.reference)

    ref_wav = audio.load_wav(args.reference)
    reference_mel = audio.melspectrogram(ref_wav).astype(np.float32).T
    path = 'ref-%s.wav' % (os.path.splitext(os.path.basename(args.reference))[0])

    with open('examples_test.txt', 'r') as fs:
        lines = fs.readlines()

    for i, line in enumerate(lines):
        args.text = line.strip().split('|')[-1]
        # NOTE(review): the +6 offset is kept as found; its reason is not
        # visible in this function.
        new_path = '%d_' % (i + 6) + path
        print('Synthesizing: %s' % args.text)
        print('Output wav file: %s' % new_path)
        # Synthesize first, so a failing synthesis does not leave an empty
        # wav file behind.
        wav_bytes = synth.synthesize(args.text, reference_mel=reference_mel)
        with open(new_path, 'wb') as f:
            f.write(wav_bytes)
def _process_utterance(out_dir, index, wav_path, labels_path, text, person_id=1):
    '''Preprocesses one labelled utterance and writes its spectrograms to disk.

    The wav is cut to the (start, end) offsets returned by _parse_labels
    before the features are computed.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input
      labels_path: Path passed to _parse_labels to obtain the trim offsets
      text: The text spoken in the input audio file
      person_id: Speaker identifier passed through to the result (default 1)

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, text, person_id) tuple,
      or None when the trimmed audio is longer than the maximum allowed length.
    '''
    # Load the wav file and trim silence from the ends:
    wav = audio.load_wav(wav_path)
    start_offset, end_offset = _parse_labels(labels_path)
    # Offsets are scaled by the sample rate to get sample indices
    # (presumably they are in seconds; verify against _parse_labels).
    start = int(start_offset * hparams.sample_rate)
    # With no end label, slice to the real end. The previous sentinel of -1
    # silently dropped the last sample.
    end = int(end_offset * hparams.sample_rate) if end_offset is not None else None
    wav = wav[start:end]

    # Skip utterances longer than the maximum output length.
    max_samples = _max_out_length * hparams.frame_shift_ms / 1000 * hparams.sample_rate
    if len(wav) > max_samples:
        return None

    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk (transposed):
    spectrogram_filename = 'blizzard-spec-%05d.npy' % index
    mel_filename = 'blizzard-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    return (spectrogram_filename, mel_filename, n_frames, text, person_id)
def _process_utterance(out_dir, index, wav_path, text):
    '''Turns one wav/text pair into spectrogram files plus a metadata row.

    The linear and mel spectrograms are stored in out_dir as transposed
    float32 .npy files.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
    '''
    samples = audio.load_wav(wav_path)

    # Linear-scale features; shape[1] is reported as the frame count.
    linear = audio.spectrogram(samples).astype(np.float32)
    n_frames = linear.shape[1]

    # Mel-scale features.
    mel = audio.melspectrogram(samples).astype(np.float32)

    linear_name = 'ljspeech-spec-%05d.npy' % index
    mel_name = 'ljspeech-mel-%05d.npy' % index
    for filename, feature in ((linear_name, linear), (mel_name, mel)):
        np.save(os.path.join(out_dir, filename), feature.T, allow_pickle=False)

    return (linear_name, mel_name, n_frames, text)
def process_utterance(out_path, index, wav_path, text):
    '''Computes and stores the linear and mel spectrograms of one wav.

    Both arrays are saved under out_path as transposed float32 .npy files
    (pickling disabled).

    Args:
      out_path: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file
      text: The text paired with the audio; passed through unchanged

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, text) tuple
    '''
    signal = audio.load_wav(wav_path)

    linear_spec = audio.spectrogram(signal).astype(np.float32)
    n_frames = linear_spec.shape[1]
    mel_spec = audio.melspectrogram(signal).astype(np.float32)

    linear_file = 'ljspeech-spec-%05d.npy' % index
    mel_file = 'ljspeech-mel-%05d.npy' % index

    linear_target = os.path.join(out_path, linear_file)
    mel_target = os.path.join(out_path, mel_file)
    np.save(linear_target, linear_spec.T, allow_pickle=False)
    np.save(mel_target, mel_spec.T, allow_pickle=False)

    return (linear_file, mel_file, n_frames, text)
def run_eval(args):
    '''Synthesizes args.text, optionally conditioned on a reference audio.

    Args:
      args: Parsed arguments. Reads args.mel_targets (optional path to a .npy
        file; enables teacher forcing), args.checkpoint, args.reference_audio
        (optional) and args.text.

    Raises:
      ValueError: If no reference audio is given and hparams.use_gst is false.
    '''
    print(hparams_debug_string())

    mel_targets = None
    if args.mel_targets is not None:
        mel_targets = np.load(args.mel_targets)
    is_teacher_force = mel_targets is not None

    synth = Synthesizer(teacher_forcing_generating=is_teacher_force)
    synth.load(args.checkpoint, args.reference_audio)
    base_path = get_output_base_path(args.checkpoint)

    reference_mel = None
    if args.reference_audio is not None:
        ref_wav = audio.load_wav(args.reference_audio)
        reference_mel = audio.melspectrogram(ref_wav).astype(np.float32).T
        path = '%s_ref-%s.wav' % (base_path, os.path.splitext(os.path.basename(args.reference_audio))[0])
    else:
        if hparams.use_gst:
            print("*******************************")
            print("TODO: add style weights when there is no reference audio. Now we use random weights, " +
                  "which may generate unintelligible audio sometimes.")
            print("*******************************")
            path = '%s_ref-randomWeight.wav' % (base_path)
        else:
            raise ValueError("You must set the reference audio if you don't want to use GSTs.")

    print('Synthesizing: %s' % args.text)
    print('Output wav file: %s' % path)

    # The loaded mel targets turned teacher forcing on but were never handed
    # to the synthesizer. They are forwarded only when present, so the call
    # is unchanged when no targets are given.
    # NOTE(review): assumes Synthesizer.synthesize accepts a `mel_targets`
    # keyword - confirm against the Synthesizer class.
    extra_kwargs = {} if mel_targets is None else {'mel_targets': mel_targets}

    # Synthesize before opening the output, so a failure leaves no empty file.
    wav_bytes = synth.synthesize(args.text, reference_mel=reference_mel, **extra_kwargs)
    with open(path, 'wb') as f:
        f.write(wav_bytes)
def _process_utterance(out_dir, prompt_id, wav_path, text):
    '''Trims one utterance and writes its linear and mel spectrograms to disk.

    Args:
      out_dir: The directory to write the spectrograms into
      prompt_id: Identifier inserted (as a string) into the output filenames
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, text) tuple
    '''
    raw = audio.load_wav(wav_path)

    # Cut a fixed margin (a tenth of hparams.sample_rate samples) off both
    # ends, then let librosa trim the remaining quiet edges.
    edge = int(hparams.sample_rate * 0.1)
    clipped = raw[edge:-edge]
    trimmed, _ = librosa.effects.trim(clipped, top_db=40, frame_length=1024, hop_length=256)

    linear = audio.spectrogram(trimmed).astype(np.float32)
    n_frames = linear.shape[1]
    mel = audio.melspectrogram(trimmed).astype(np.float32)

    linear_name = 'amy-spec-%s.npy' % prompt_id
    mel_name = 'amy-mel-%s.npy' % prompt_id
    for filename, feature in ((linear_name, linear), (mel_name, mel)):
        np.save(os.path.join(out_dir, filename), feature.T, allow_pickle=False)

    return (linear_name, mel_name, n_frames, text)
def _process_utterance(out_dir, index, wav_path, pinyin):
    '''Turns one wav/pinyin pair into spectrogram files plus a metadata row.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input
      pinyin: The pinyin of Chinese spoken in the input audio file

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, pinyin) tuple to write to train.txt
    '''
    samples = audio.load_wav(wav_path)

    linear = audio.spectrogram(samples).astype(np.float32)
    n_frames = linear.shape[1]
    mel = audio.melspectrogram(samples).astype(np.float32)

    names = {
        'spec': 'femalemandarin-spec-%05d.npy' % index,
        'mel': 'femalemandarin-mel-%05d.npy' % index,
    }
    # Arrays are stored transposed.
    np.save(os.path.join(out_dir, names['spec']), linear.T, allow_pickle=False)
    np.save(os.path.join(out_dir, names['mel']), mel.T, allow_pickle=False)

    return (names['spec'], names['mel'], n_frames, pinyin)
def synthesize(self, path_in, path_re, mel_targets=None, reference_mel=None, alignment_path=None):
    '''Converts an input wav with the loaded model and saves the result.

    Args:
      path_in: Path to the input audio file fed as self.model.inputs
      path_re: Path to the second audio file fed as self.model.inputs_jp
      mel_targets: Currently unused; kept for interface compatibility
      reference_mel: Currently unused; kept for interface compatibility
      alignment_path: Currently unused; kept for interface compatibility

    Returns:
      An (out_path, out_name) tuple: the path of the written wav under
      static/out and its bare filename.
    '''
    import os  # local import: a file-level `import os` is not visible from this block

    wav_in = audio.load_wav(path_in)
    wav_re = audio.load_wav(path_re)
    mel_in = audio.melspectrogram(wav_in).astype(np.float32)
    mel_re = audio.melspectrogram(wav_re).astype(np.float32)

    feed_dict = {
        self.model.inputs: [mel_in.T],
        # The model is fed mel_in.T, so the sequence length is the second
        # axis of mel_in. The previous len(mel_in) reported the first axis.
        self.model.input_lengths: np.asarray([mel_in.shape[1]], dtype=np.int32),
        self.model.inputs_jp: [mel_re.T],
    }
    wav_out, alignments = self.session.run(
        [self.wav_output, self.alignments], feed_dict=feed_dict)

    wav = audio.inv_preemphasis(wav_out)
    end_point = audio.find_endpoint(wav)
    wav = wav[:end_point]

    # Output name: timestamp plus a random suffix padded to at least two
    # digits. The previous `<= 10` test also padded 10, giving "010".
    now_time = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    unique_num = now_time + '%02d' % random.randint(0, 100)
    out_name = unique_num + ".wav"
    # os.path.join keeps the Windows result identical to the old hard-coded
    # "static\\out\\" prefix and gives a real sub-directory path elsewhere.
    out_path = os.path.join("static", "out", out_name)
    audio.save_wav(wav, out_path)

    # The extra in-memory BytesIO encode was never used and is gone.
    return out_path, out_name
def convert_file(audio_path):
    '''Loads a wav, optionally peak-normalizes it, and computes its features.

    The signal is rescaled to a peak of 0.9 when hp.peak_norm is set or when
    the peak exceeds 1.0.

    Args:
      audio_path: Path to the audio file

    Returns:
      A (mel, linear) tuple of float32 arrays.
    '''
    y = audio.load_wav(audio_path)
    peak = np.abs(y).max()
    # An all-zero signal has peak == 0. Scaling it would divide by zero and
    # turn every sample into NaN, so silence is left untouched.
    if peak > 0 and (hp.peak_norm or peak > 1.0):
        y *= (0.9 / peak)
    linear = audio.spectrogram(y)
    mel = audio.melspectrogram(y)
    return mel.astype(np.float32), linear.astype(np.float32)
def preprocess_utterance(wav_file, input_path, output_path):
    '''Computes the mel spectrogram of one wav and saves it in a mirrored tree.

    The output directory is the wav's directory with input_path replaced by
    output_path. The transposed float32 mel spectrogram is saved there under
    the wav's name with a .npy extension, and the filename and shape[1] of
    the mel spectrogram are printed.

    Args:
      wav_file: Path to the audio file
      input_path: Substring of the wav directory to replace
      output_path: Replacement for input_path in the output directory
    '''
    wav = audio.load_wav(wav_file)
    wav_path, name = os.path.split(wav_file)
    out_dir = wav_path.replace(input_path, output_path)
    # exist_ok avoids the exists()/makedirs() race when several workers hit
    # the same directory. An empty out_dir means the current directory.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    if name.endswith('.wav'):
        # Swap only the trailing extension; str.replace would also rewrite a
        # '.wav' that appears in the middle of the name.
        mel_filename = name[:-len('.wav')] + '.npy'
    else:
        # Same result as before for names that do not end in '.wav'.
        mel_filename = name.replace('.wav', '.npy')

    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    print(mel_filename, mel_spectrogram.shape[1])
def _process_utterance(out_dir, index, wav_path, text):
    '''Writes the linear and mel spectrograms of one utterance to disk.

    Args:
      out_dir: The directory to write the spectrograms into
      index: Index for the filenames; converted with int() before formatting
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, text) tuple
    '''
    samples = audio.load_wav(wav_path)

    linear = audio.spectrogram(samples).astype(np.float32)
    n_frames = linear.shape[1]
    mel = audio.melspectrogram(samples).astype(np.float32)

    linear_name = 'selvas-spec-%04d.npy' % int(index)
    mel_name = 'selvas-mel-%04d.npy' % int(index)

    # Arrays are stored transposed.
    for filename, feature in ((linear_name, linear), (mel_name, mel)):
        np.save(os.path.join(out_dir, filename), feature.T, allow_pickle=False)

    return (linear_name, mel_name, n_frames, text)
def _extract_features_std(self, wav, text):
    '''Builds model inputs and spectrogram targets from one wav/text pair.

    Args:
      wav: Audio samples; pre-emphasis is applied before feature extraction
      text: UTF-8 encoded bytes of the transcript

    Returns:
      A tuple (input_data, [input_length], mel_target.T, linear_target.T,
      [linear_length]), where the lengths are int32 scalars wrapped in
      one-element lists and linear_length is len() of the transposed linear
      target.
    '''
    emphasized = audio.preemphasis(wav)
    linear = audio.spectrogram(emphasized).astype(sp.float32)
    mel = audio.melspectrogram(emphasized).astype(sp.float32)

    # Decode the transcript, then map it to an int32 id sequence.
    decoded = str(text, encoding='utf8')
    token_ids = sp.asarray(text_to_sequence(decoded, self._cleaner_names), dtype=sp.int32)
    n_tokens = sp.int32(len(token_ids))

    linear_t = linear.T
    n_linear = sp.int32(len(linear_t))
    return token_ids, [n_tokens], mel.T, linear_t, [n_linear]
def run_eval(args):
    '''Synthesizes every entry of `sentences` using one reference audio.

    Output files are named '<base_path>-<i>.wav', where base_path comes from
    get_output_base_path(args.checkpoint).

    Args:
      args: Parsed arguments; reads args.checkpoint and args.reference_audio.
    '''
    print(hparams_debug_string())

    synthesizer = Synthesizer()
    synthesizer.load(args.checkpoint)
    output_prefix = get_output_base_path(args.checkpoint)

    # The transposed mel spectrogram of the reference is given to every
    # synthesis call.
    reference_wav = load_wav(args.reference_audio)
    reference_mel = melspectrogram(reference_wav).transpose()

    for position, sentence in enumerate(sentences):
        out_path = '%s-%d.wav' % (output_prefix, position)
        print('Synthesizing: %s' % out_path)
        with open(out_path, 'wb') as out_file:
            audio_bytes = synthesizer.synthesize(sentence, reference_mel)
            out_file.write(audio_bytes)
def _process_utterance(out_dir, index, wav_path):
    '''Preprocesses a single audio file cut or padded to a fixed duration.

    The wav is fixed to hparams.sample_rate * hparams.duration samples, then
    its linear and mel spectrograms are written to out_dir as transposed
    float32 .npy files named after the wav file.

    Args:
      out_dir: The directory to write the spectrograms into
      index: Unused; the filenames are derived from wav_path instead
      wav_path: Path to the audio file

    Returns:
      A (spectrogram_filename, mel_filename, n_frames) tuple
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Cut or pad the wav to the configured length. `size` is passed by
    # keyword because newer librosa releases make it keyword-only, and the
    # length is forced to an int in case hparams.duration is a float.
    length = int(round(hparams.sample_rate * hparams.duration))
    wav = librosa.util.fix_length(wav, size=length)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Filenames use the part of the basename before the first '.'.
    # NOTE(review): names such as 'a.b.wav' and 'a.c.wav' both map to 'a'
    # and would overwrite each other - confirm the dataset has no such names.
    wav_name = os.path.basename(wav_path).split('.')[0]
    spectrogram_filename = 'spec-%s.npy' % wav_name
    mel_filename = 'mel-%s.npy' % wav_name

    # Write the spectrograms to disk (transposed):
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames)
def _process_utterance(out_dir, index, wav_path, text):
    '''Writes the linear and mel spectrograms of one utterance to disk.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, text) tuple
    '''
    # Here load_wav returns a pair; only its first element is used.
    loaded = audio.load_wav(wav_path)
    samples, _unused = loaded

    linear = audio.spectrogram(samples).astype(np.float32)  # (1025, frame)
    n_frames = linear.shape[1]
    mel = audio.melspectrogram(samples).astype(np.float32)  # (80, frame)

    linear_name = 'kss-spec-%05d.npy' % index
    mel_name = 'kss-mel-%05d.npy' % index

    # Saved transposed: (frame, 1025) and (frame, 80).
    for filename, feature in ((linear_name, linear), (mel_name, mel)):
        np.save(os.path.join(out_dir, filename), feature.T, allow_pickle=False)

    return (linear_name, mel_name, n_frames, text)
def _process_utterance(out_dir, index, wav_path, text, person_id):
    '''Writes the linear and mel spectrograms of one utterance to disk.

    No trimming or length filtering is applied to the loaded audio.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file
      person_id: Speaker identifier passed through to the result

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, text, person_id) tuple
    '''
    samples = audio.load_wav(wav_path)

    linear = audio.spectrogram(samples).astype(np.float32)
    n_frames = linear.shape[1]
    mel = audio.melspectrogram(samples).astype(np.float32)

    linear_name = 'arctic-spec-%05d.npy' % index
    mel_name = 'arctic-mel-%05d.npy' % index

    linear_target = os.path.join(out_dir, linear_name)
    mel_target = os.path.join(out_dir, mel_name)
    np.save(linear_target, linear.T, allow_pickle=False)
    np.save(mel_target, mel.T, allow_pickle=False)

    return (linear_name, mel_name, n_frames, text, person_id)
def _process_utterance(out_dir, name, wav_path, text):
    '''Writes the linear and mel spectrograms of one utterance to disk.

    Args:
      out_dir: The directory to write the spectrograms into
      name: Identifier inserted (as a string) into the output filenames
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file; returned unchanged

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, text) tuple
    '''
    samples = audio.load_wav(wav_path)

    linear = audio.spectrogram(samples).astype(np.float32)
    n_frames = linear.shape[1]
    mel = audio.melspectrogram(samples).astype(np.float32)

    linear_name = 'bznsyp-spec-%s.npy' % name
    mel_name = 'bznsyp-mel-%s.npy' % name

    # Arrays are stored transposed.
    for filename, feature in ((linear_name, linear), (mel_name, mel)):
        np.save(os.path.join(out_dir, filename), feature.T, allow_pickle=False)

    return (linear_name, mel_name, n_frames, text)
def synthesize(self, input_path):
    '''Runs the model on the mel spectrogram of an audio file.

    Args:
      input_path: Path to the audio file, read with sf.read. The sample rate
        returned by the reader is not used.

    Returns:
      The bytes written by audio.save_wav for the generated waveform, after
      inverse pre-emphasis and cutting at audio.find_endpoint.
    '''
    samples, _sample_rate = sf.read(input_path)
    mel_frames = audio.melspectrogram(samples).astype(np.float32).T

    # Single-item batch; the length is the first axis of the transposed mel.
    feeds = {
        self.model.inputs: [np.asarray(mel_frames, dtype=np.float32)],
        self.model.input_lengths: np.asarray([mel_frames.shape[0]], dtype=np.int32),
    }
    generated = self.session.run(self.wav_output, feed_dict=feeds)

    generated = audio.inv_preemphasis(generated)
    cut = audio.find_endpoint(generated)
    trimmed = generated[:cut]

    buffer = io.BytesIO()
    audio.save_wav(trimmed, buffer)
    return buffer.getvalue()
def __generate_spectrograms(file_path, category, index, out_dir):
    '''Writes the linear and mel spectrograms of one wav to disk.

    Files are named '<category>spec<index>.npy' and '<category>mel<index>.npy'
    and hold the transposed float32 arrays. Nothing is returned.

    Args:
      file_path: Path to the audio file
      category: Prefix used in both filenames
      index: Suffix used in both filenames
      out_dir: The directory to write the spectrograms into
    '''
    samples = audio.load_wav(file_path)

    linear = audio.spectrogram(samples).astype(np.float32)
    mel = audio.melspectrogram(samples).astype(np.float32)

    linear_name = '{}spec{}.npy'.format(category, index)
    mel_name = '{}mel{}.npy'.format(category, index)

    for filename, feature in ((linear_name, linear), (mel_name, mel)):
        np.save(os.path.join(out_dir, filename), feature.T, allow_pickle=False)
def get_wav_linear_and_mel_targert(wav_path, set_spec_length=None):
    '''Loads a wav and returns its transposed linear and mel spectrograms.

    Args:
      wav_path: Path to the audio file
      set_spec_length: When given, both transposed spectrograms are cut to
        their first set_spec_length rows and the wav is left out of the result

    Returns:
      (wav, linear.T, mel.T, n_frames) when set_spec_length is None, otherwise
      (linear.T[:set_spec_length], mel.T[:set_spec_length], n_frames). In both
      cases n_frames is shape[1] of the uncut linear spectrogram. Note that
      the two forms have different tuple lengths.
    '''
    samples = audio.load_wav(wav_path)

    linear = audio.spectrogram(samples).astype(np.float32)
    n_frames = linear.shape[1]
    mel = audio.melspectrogram(samples).astype(np.float32)

    if set_spec_length is None:
        return (samples, linear.T, mel.T, n_frames)

    linear_cut = linear.T[:set_spec_length]
    mel_cut = mel.T[:set_spec_length]
    return (linear_cut, mel_cut, n_frames)
def _process_utterance(out_dir, index, wav_path, text):
    '''Turns one wav/text pair into spectrogram files, printing progress.

    Prints the wav path, wav shape, frame count, output filenames and output
    directory while working.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
    '''
    print('wave_path :', wav_path)
    samples = audio.load_wav(wav_path)
    print('wav :', samples.shape, 'sr:')

    linear = audio.spectrogram(samples).astype(np.float32)
    n_frames = linear.shape[1]
    print('n_frames : ', n_frames)

    mel = audio.melspectrogram(samples).astype(np.float32)

    linear_name = 'ljspeech-spec-%05d.npy' % index
    mel_name = 'ljspeech-mel-%05d.npy' % index
    print('spectrogram_filename:', linear_name)
    print('mel_filename:', mel_name)
    print('out_dir: ', out_dir)

    # Arrays are stored transposed.
    for filename, feature in ((linear_name, linear), (mel_name, mel)):
        np.save(os.path.join(out_dir, filename), feature.T, allow_pickle=False)

    return (linear_name, mel_name, n_frames, text)
def eval_step(sess, global_step, model, plot_dir, wav_dir, summary_writer, hparams, model_name):
    '''Evaluate model during training.

    Supposes that model variables are averaged. Runs one evaluation pass,
    logs timing and loss, writes the wave plot, the mel comparison plot and
    the upsampled-feature plot to plot_dir, saves predicted and target audio
    to wav_dir, and records the loss through add_test_stats.
    '''
    started = time.time()
    fetches = [
        model.tower_y_hat[0],
        model.tower_y_target[0],
        model.eval_loss,
        model.tower_eval_c[0],
        model.tower_eval_upsampled_local_features[0],
    ]
    y_hat, y_target, loss, input_mel, upsampled_features = sess.run(fetches)
    elapsed = time.time() - started
    log('Time Evaluation: Generation of {} audio frames took {:.3f} sec ({:.3f} frames/sec)'.format(
        len(y_target), elapsed, len(y_target) / elapsed))

    def _step_file(directory, template):
        # Every artifact name embeds the global step.
        return os.path.join(directory, template.format(global_step))

    # Audio and plot destinations.
    pred_wav_path = _step_file(wav_dir, 'step-{}-pred.wav')
    target_wav_path = _step_file(wav_dir, 'step-{}-real.wav')
    plot_path = _step_file(plot_dir, 'step-{}-waveplot.png')
    mel_path = _step_file(plot_dir, 'step-{}-reconstruction-mel-spectrogram.png')
    upsampled_path = _step_file(plot_dir, 'step-{}-upsampled-features.png')

    # Waveform comparison figure.
    wave_title = '{}, {}, step={}, loss={:.5f}'.format(model_name, time_string(), global_step, loss)
    util.waveplot(plot_path, y_hat, y_target, model._hparams, title=wave_title)
    log('Eval loss for global step {}: {:.3f}'.format(global_step, loss))

    # Compare the mel of the generated wav with the input mel to judge the
    # reconstruction. The mel is rescaled with _interp to the output range
    # selected by hparams.symmetric_mels.
    if hparams.symmetric_mels:
        T2_output_range = (-hparams.max_abs_value, hparams.max_abs_value)
    else:
        T2_output_range = (0, hparams.max_abs_value)
    generated_mel = _interp(melspectrogram(y_hat, hparams).T, T2_output_range)

    mel_title = 'Local Condition vs Reconst. Mel-Spectrogram, step={}, loss={:.5f}'.format(
        global_step, loss)
    util.plot_spectrogram(generated_mel, mel_path, title=mel_title, target_spectrogram=input_mel.T)

    upsampled_title = 'Upsampled Local Condition features, step={}, loss={:.5f}'.format(
        global_step, loss)
    util.plot_spectrogram(upsampled_features.T, upsampled_path, title=upsampled_title, auto_aspect=True)

    # Save the predicted and the target audio.
    for signal, destination in ((y_hat, pred_wav_path), (y_target, target_wav_path)):
        save_wavenet_wav(signal, destination, sr=hparams.sample_rate,
                         inv_preemphasize=hparams.preemphasize, k=hparams.preemphasis)

    # Write eval summary to tensorboard
    log('Writing eval summary!')
    add_test_stats(summary_writer, global_step, loss, hparams=hparams)
def _process_utterance(out_dir, index, wav_path, labels_path, text):
    '''Preprocesses one labelled utterance and writes its spectrograms to disk.

    The wav is cut to the (start, end) offsets returned by _parse_labels
    before the features are computed.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input
      labels_path: Path passed to _parse_labels to obtain the trim offsets
      text: The text spoken in the input audio file

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, text) tuple, or None
      when the trimmed audio is longer than the maximum allowed length.
    '''
    # Load the wav file and trim silence from the ends:
    wav = audio.load_wav(wav_path)
    start_offset, end_offset = _parse_labels(labels_path)
    # Offsets are scaled by the sample rate to get sample indices
    # (presumably they are in seconds; verify against _parse_labels).
    start = int(start_offset * hparams.sample_rate)
    # With no end label, slice to the real end. The previous sentinel of -1
    # silently dropped the last sample.
    end = int(end_offset * hparams.sample_rate) if end_offset is not None else None
    wav = wav[start:end]

    # Skip utterances longer than the maximum output length.
    max_samples = _max_out_length * hparams.frame_shift_ms / 1000 * hparams.sample_rate
    if len(wav) > max_samples:
        return None

    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk (transposed):
    spectrogram_filename = 'blizzard-spec-%05d.npy' % index
    mel_filename = 'blizzard-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    return (spectrogram_filename, mel_filename, n_frames, text)
def _process_utterance(wav_path, text, id):
    '''Loads one utterance and computes its spectrograms in memory.

    Nothing is written to disk by this function.

    Args:
      wav_path: Path to the audio file containing the speech input
      text: The text in the input audio file; returned unchanged
      id: Identity value; returned unchanged

    Returns:
      A (wav, spectrogram, mel_spectrogram, text, id) tuple, where both
      spectrograms are transposed float32 arrays.
    '''
    samples = audio.load_wav(wav_path)

    linear = audio.spectrogram(samples).astype(np.float32)
    linear_t = linear.T

    mel = audio.melspectrogram(samples).astype(np.float32)
    mel_t = mel.T

    return samples, linear_t, mel_t, text, id
def run_eval(args):
    '''Synthesizes every entry of `sentences`, optionally with a reference mel.

    Output files are named '<base_path>_gst_<gst_index>_<gst_scale>_<i>.wav',
    using hparams.gst_index and hparams.gst_scale.

    Args:
      args: Parsed arguments; reads args.checkpoint and args.reference_audio
        (which may be None, in which case reference_mel is None).
    '''
    print(hparams_debug_string())

    synthesizer = Synthesizer()
    synthesizer.load(args.checkpoint, args.reference_audio)

    reference_mel = None
    if args.reference_audio is not None:
        reference_wav = audio.load_wav(args.reference_audio)
        reference_mel = audio.melspectrogram(reference_wav).astype(np.float32).T

    output_prefix = get_output_base_path(args.checkpoint)
    for position, sentence in enumerate(sentences):
        out_path = '%s_%d_%.1f_%d.wav' % (
            output_prefix + '_gst', hparams.gst_index, hparams.gst_scale, position)
        print('Synthesizing: %s' % out_path)
        with open(out_path, 'wb') as out_file:
            audio_bytes = synthesizer.synthesize(sentence, reference_mel=reference_mel)
            out_file.write(audio_bytes)
def _process_utterance(out_dir, index, wav_path, text):
    '''Turns one wav/text pair into spectrogram files after trimming silence.

    The audio is loaded with librosa at hparams.sample_rate and trimmed with
    librosa.effects.trim (top_db=45) before the features are computed.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
    '''
    loaded, _sample_rate = librosa.load(wav_path, sr=hparams.sample_rate)

    # trim() returns a pair; only the trimmed signal (first element) is kept.
    trim_result = librosa.effects.trim(loaded, top_db=45)
    samples = trim_result[0]

    linear = audio.spectrogram(samples).astype(np.float32)
    n_frames = linear.shape[1]
    mel = audio.melspectrogram(samples).astype(np.float32)

    linear_name = 'ljspeech-spec-%05d.npy' % index
    mel_name = 'ljspeech-mel-%05d.npy' % index
    for filename, feature in ((linear_name, linear), (mel_name, mel)):
        np.save(os.path.join(out_dir, filename), feature.T, allow_pickle=False)

    return (linear_name, mel_name, n_frames, text)
def save_log(sess, global_step, model, plot_dir, wav_dir, hparams, model_name):
    '''Saves plots and audio for the first item of the first tower.

    Fetches prediction, target, loss, length and conditioning features, zeroes
    both signals past `length`, then writes the wave plot, the mel comparison
    plot and the upsampled-feature plot to plot_dir and the predicted and
    target audio to wav_dir.
    '''
    log('\nSaving intermediate states at step {}'.format(global_step))

    idx = 0
    fetches = [
        model.tower_y_hat_log[0][idx],
        model.tower_y_log[0][idx],
        model.loss,
        model.tower_input_lengths[0][idx],
        model.tower_c[0][idx],
        model.tower_upsampled_local_features[0][idx],
    ]
    y_hat, y, loss, length, input_pml_features, upsampled_features = sess.run(fetches)

    # Mask by length: everything past the valid length is zeroed in place.
    for signal in (y_hat, y):
        signal[length:] = 0

    def _step_file(directory, template):
        # Every artifact name embeds the global step.
        return os.path.join(directory, template.format(global_step))

    # Audio and plot destinations.
    pred_wav_path = _step_file(wav_dir, 'step-{}-pred.wav')
    target_wav_path = _step_file(wav_dir, 'step-{}-real.wav')
    plot_path = _step_file(plot_dir, 'step-{}-waveplot.png')
    mel_path = _step_file(plot_dir, 'step-{}-reconstruction-mel-spectrogram.png')
    upsampled_path = _step_file(plot_dir, 'step-{}-upsampled-features.png')

    # Waveform comparison figure.
    wave_title = '{}, {}, step={}, loss={:.5f}'.format(model_name, time_string(), global_step, loss)
    util.waveplot(plot_path, y_hat, y, hparams, title=wave_title)

    # Compare the mel of the generated wav with the input features to judge
    # the reconstruction. The mel is rescaled with _interp to the output
    # range selected by hparams.symmetric_mels.
    if hparams.symmetric_mels:
        T2_output_range = (-hparams.max_abs_value, hparams.max_abs_value)
    else:
        T2_output_range = (0, hparams.max_abs_value)
    generated_mel = _interp(melspectrogram(y_hat, hparams).T, T2_output_range)

    mel_title = 'Local Condition vs Reconst. Mel-Spectrogram, step={}, loss={:.5f}'.format(
        global_step, loss)
    util.plot_spectrogram(generated_mel, mel_path, title=mel_title,
                          target_spectrogram=input_pml_features.T)

    upsampled_title = 'Upsampled Local Condition features, step={}, loss={:.5f}'.format(
        global_step, loss)
    util.plot_spectrogram(upsampled_features.T, upsampled_path, title=upsampled_title, auto_aspect=True)

    # Save the predicted and the target audio.
    for signal, destination in ((y_hat, pred_wav_path), (y, target_wav_path)):
        save_wavenet_wav(signal, destination, sr=hparams.sample_rate,
                         inv_preemphasize=hparams.preemphasize, k=hparams.preemphasis)
def _process_utterance(out_dir, index, wav_path, pinyin):
    '''Writes the spectrograms of one utterance unless it has too many frames.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input
      pinyin: The pinyin paired with the audio; returned unchanged

    Returns:
      A (spectrogram_filename, mel_filename, n_frame, pinyin) tuple, or None
      when the frame count exceeds hp.max_frame_num (nothing is written then).
    '''
    samples = audio.load_wav(wav_path)

    linear = audio.spectrogram(samples).astype(np.float32)
    n_frame = linear.shape[1]
    too_long = n_frame > hp.max_frame_num
    if too_long:
        return None

    mel = audio.melspectrogram(samples).astype(np.float32)

    linear_name = 'thchs30-spec-%05d.npy' % index
    mel_name = 'thchs30-mel-%05d.npy' % index

    # Arrays are stored transposed.
    for filename, feature in ((linear_name, linear), (mel_name, mel)):
        np.save(os.path.join(out_dir, filename), feature.T, allow_pickle=False)

    return (linear_name, mel_name, n_frame, pinyin)