import os
from concurrent.futures import ProcessPoolExecutor
from functools import partial


def build_from_path(input_dir, out_dir, n_jobs=4, tqdm=lambda x: x):
    """Preprocesses the LJ Speech dataset from a given input path to a given
    output directory.

    Args:
        - input_dir: input directory that contains the files to preprocess
        - out_dir: output directory of the preprocessed LJ dataset
        - n_jobs: Optional, number of worker processes to parallelize across
        - tqdm: Optional, provides a nice progress bar

    Returns:
        - A list of tuples describing the training examples. This should be
          written to train.txt.
    """
    # We use ProcessPoolExecutor to parallelize across processes; this is just
    # an optimization and can be omitted.
    executor = ProcessPoolExecutor(max_workers=n_jobs)
    futures = []
    index = 1
    with open(os.path.join(input_dir, 'wavs.txt'), encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('<------>')
            wav_path = os.path.join(input_dir, 'wavs', '{}.wav'.format(parts[0]))
            text1 = parts[1]
            text = ch2p(text1)  # convert the raw Chinese text to pinyin
            print(str(text1) + "====>" + str(text))
            futures.append(executor.submit(
                partial(_process_utterance, out_dir, index, wav_path, text)))
            index += 1
    return [future.result() for future in tqdm(futures)]
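# ch2p is the project's Chinese-to-pinyin converter and is not shown above.
# A minimal sketch of what it might look like, assuming the pypinyin package;
# the real implementation may differ (e.g. tone style, punctuation handling):
from pypinyin import lazy_pinyin, Style


def ch2p(text):
    # Convert Chinese characters to tone-numbered pinyin,
    # e.g. '你好' -> 'ni3 hao3'.
    return ' '.join(lazy_pinyin(text, style=Style.TONE3))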
def tts_synthesize(self, get_txt, res):
    # Relies on module-level imports (os, uuid) and on the project helpers
    # process_txt, ch2p, synthesizer and vad_check_wav.
    import datetime

    txt = process_txt(get_txt)
    split_sentence = txt.split(',')
    # Accumulate raw PCM bytes instead of a pydub AudioSegment
    # (the earlier AudioSegment.silent(duration=50) approach is disabled).
    result = b''
    start_time = datetime.datetime.now()
    app_logger.info('synthesizing ...')
    inputs = [ch2p(x) for x in split_sentence]  # one pinyin string per chunk
    app_logger.info('to pinyin: ' + str([len(j) for j in inputs]) + ' ' + str(inputs))
    out = synthesizer.synthesize(inputs)
    for wav in out:
        # Trim leading/trailing silence with VAD, then concatenate.
        _, wav = vad_check_wav(wav_path=wav)
        result += wav
    uuid_str = str(uuid.uuid1()).replace('-', '')
    tmp_fn = os.path.realpath('./tmp/%s.wav' % uuid_str)
    tmp_path = os.path.dirname(tmp_fn)
    app_logger.info('tmp_path: ' + str(tmp_path))
    self.write_wave_vad(wav_path=tmp_fn, audio=result, sample_rate=16000)
    app_logger.info('self.vol: ' + str(self.vol))
    total_len = sum(len(j) for j in split_sentence)
    app_logger.info('total_len: ' + str(total_len))
    new_fn = self.handle_wav(tmp_fn)  # (char_len=total_len disabled)
    app_logger.info('new_fn after handle_wav: ' + str(new_fn))
    with open(new_fn, 'rb') as fp:
        res.data = fp.read()
    end_time = datetime.datetime.now()
    used_time_ms = (end_time - start_time).total_seconds() * 1000
    app_logger.info('used_time_ms: ' + str(used_time_ms))
    res.content_type = 'audio/wav'
    # Clean up the temporary wav files for this request.
    rm_wav = True
    if rm_wav:
        os.system('rm -f %s/%s*.wav' % (tmp_path, uuid_str))
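# write_wave_vad is not shown in this class. A minimal sketch using the stdlib
# wave module, under the assumption that `audio` holds raw 16-bit mono PCM
# bytes; the real method may do more (e.g. additional VAD trimming):
import wave


def write_wave_vad(self, wav_path, audio, sample_rate):
    with wave.open(wav_path, 'wb') as wf:
        wf.setnchannels(1)            # mono
        wf.setsampwidth(2)            # 16-bit samples
        wf.setframerate(sample_rate)  # e.g. 16000 Hz as used above
        wf.writeframes(audio)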
def generate(self, text=None):
    # Relies on numpy (np), torch and the Tacotron 2 helpers text_to_sequence,
    # TacotronSTFT and griffin_lim imported at module level.
    text = ch2p(text)  # Chinese -> pinyin
    sequence = np.array(text_to_sequence(text, ['basic_cleaners']))[None, :]
    # The torch.autograd.Variable wrapper is unnecessary on PyTorch >= 0.4.
    sequence = torch.from_numpy(sequence).cuda().long()
    mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference(sequence)
    taco_stft = TacotronSTFT(
        self.hparams.filter_length, self.hparams.hop_length,
        self.hparams.win_length, sampling_rate=self.hparams.sampling_rate)
    # Undo the log-compression of the predicted mel spectrogram ...
    mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
    mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
    # ... and project it back to a linear-frequency spectrogram via the mel basis.
    spec_from_mel_scaling = 1000
    spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
    spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
    spec_from_mel = spec_from_mel * spec_from_mel_scaling
    # Reconstruct the phase with 60 Griffin-Lim iterations.
    waveform = griffin_lim(spec_from_mel[:, :, :-1], taco_stft.stft_fn, 60)
    return waveform
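# griffin_lim is not defined here. The NVIDIA Tacotron 2 reference code
# implements it roughly as sketched below: start from random phase, then
# alternate between inverse STFT and re-estimating the phase from the fixed
# magnitude spectrogram. The stft_fn transform/inverse interface is an
# assumption carried over from that codebase.
def griffin_lim(magnitudes, stft_fn, n_iters=30):
    angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
    angles = torch.from_numpy(angles.astype(np.float32))
    signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
    for _ in range(n_iters):
        # Keep the magnitudes fixed; only the phase estimate is updated.
        _, angles = stft_fn.transform(signal)
        signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
    return signal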
# Variant of build_from_path used for vocoder-style preprocessing: the same
# metadata walk as above, but _process_utterance also receives a silence
# threshold and an FFT size. Reuses os/ProcessPoolExecutor/partial from above.
from multiprocessing import cpu_count


def build_from_path(in_dir, out_dir, silence_threshold, fft_size,
                    num_workers=cpu_count(), tqdm=lambda x: x):
    executor = ProcessPoolExecutor(max_workers=num_workers)
    futures = []
    index = 1
    with open(os.path.join(in_dir, 'wavs.txt'), encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('<------>')
            wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0])
            text1 = parts[1]
            text = ch2p(text1)  # Chinese -> pinyin
            print("%s.wav: %s ===> %s" % (parts[0], text1, text))
            futures.append(executor.submit(partial(
                _process_utterance, out_dir, index, wav_path, text,
                silence_threshold, fft_size)))
            index += 1
    return [future.result() for future in tqdm(futures)]
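# A hypothetical driver showing how this variant might be invoked from a
# preprocessing script. The paths, the silence_threshold and fft_size values,
# and the pipe-separated train.txt layout are placeholders, not the project's
# confirmed settings.
from tqdm import tqdm

if __name__ == '__main__':
    metadata = build_from_path('./data', './training_data',
                               silence_threshold=2, fft_size=1024,
                               tqdm=tqdm)
    with open(os.path.join('./training_data', 'train.txt'), 'w',
              encoding='utf-8') as f:
        for m in metadata:
            f.write('|'.join(map(str, m)) + '\n')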