import hashlib
import io
import os
import traceback

import cv2
import librosa
import numpy as np
from flask import jsonify, send_file

# Project-local helpers (add_postfix, makedirs, load_audio, save_audio,
# get_duration, get_silence, remove_breath, plot, synthesizer, hparams, ...)
# are assumed to be in scope, as in the original modules.


def generate_audio_response(text, speaker_id):
    global global_config

    model_name = os.path.basename(global_config.load_path)
    isKorean = global_config.is_korean

    hashed_text = hashlib.md5(text.encode('utf-8')).hexdigest()

    relative_dir_path = os.path.join(AUDIO_DIR, model_name)
    relative_audio_path = os.path.join(
            relative_dir_path, "{}.{}.wav".format(hashed_text, speaker_id))
    real_path = os.path.join(ROOT_PATH, relative_audio_path)
    makedirs(os.path.dirname(real_path))

    # Synthesize only when the clip is not already cached on disk.
    if not os.path.exists(add_postfix(real_path, 0)):
        try:
            audio = synthesizer.synthesize(
                    [text], paths=[real_path], speaker_ids=[speaker_id],
                    attention_trim=True, isKorean=isKorean)[0]
        except Exception:
            traceback.print_exc()
            return jsonify(success=False), 400

    return send_file(
            add_postfix(relative_audio_path, 0),
            mimetype="audio/wav",
            as_attachment=True,
            attachment_filename=hashed_text + ".wav")
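
# Hypothetical usage sketch: exposing generate_audio_response through a Flask
# route. The route path, JSON field names, and the `app` instance are
# assumptions for illustration, not the project's actual wiring.
from flask import Flask, request

app = Flask(__name__)

@app.route("/generate_audio", methods=["POST"])
def generate_audio():
    payload = request.get_json(force=True)  # e.g. {"text": "...", "speaker_id": 0}
    return generate_audio_response(payload["text"], int(payload["speaker_id"]))
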
def split_on_silence_with_librosa(
        audio_path, top_db=40, frame_length=1024, hop_length=256,
        skip_idx=0, out_ext="wav",
        min_segment_length=3, max_segment_length=8,
        pre_silence_length=0, post_silence_length=0):

    filename = os.path.basename(audio_path).split('.', 1)[0]
    in_ext = audio_path.rsplit(".", 1)[1]

    audio = load_audio(audio_path)

    # First pass: locate non-silent intervals and suppress breath noise in them.
    edges = librosa.effects.split(
            audio, top_db=top_db,
            frame_length=frame_length, hop_length=hop_length)

    new_audio = np.zeros_like(audio)
    for idx, (start, end) in enumerate(edges[skip_idx:]):
        new_audio[start:end] = remove_breath(audio[start:end])

    save_audio(new_audio, add_postfix(audio_path, "no_breath"))
    audio = new_audio

    # Second pass: re-split the cleaned audio and save the kept segments.
    edges = librosa.effects.split(
            audio, top_db=top_db,
            frame_length=frame_length, hop_length=hop_length)

    audio_paths = []
    for idx, (start, end) in enumerate(edges[skip_idx:]):
        segment = audio[start:end]
        duration = get_duration(segment)

        if duration <= min_segment_length or duration >= max_segment_length:
            continue

        output_path = "{}/{}.{:04d}.{}".format(
                os.path.dirname(audio_path), filename, idx, out_ext)

        padded_segment = np.concatenate([
                get_silence(pre_silence_length),
                segment,
                get_silence(post_silence_length),
        ])

        save_audio(padded_segment, output_path)
        audio_paths.append(output_path)

    return audio_paths
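
# A minimal usage sketch, assuming a mono source recording; segments whose
# duration falls strictly between 3 and 8 seconds are written next to the
# source file. The input path is hypothetical.
if __name__ == "__main__":
    segment_paths = split_on_silence_with_librosa(
            "datasets/son/audio/lecture.wav",
            top_db=40, min_segment_length=3, max_segment_length=8)
    print("saved {} segments".format(len(segment_paths)))
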
def _save_data_core(self, phase='face'):
    data_dir = os.path.join(data_param['data_save_dir'], phase)
    create_dir(data_dir)

    for index in range(len(self.faces)):
        name = self.names[index]
        img_path = os.path.join(data_dir, add_postfix(name, "_{}".format(phase)))
        cv2.imwrite(img_path, self.faces[index])
        np.savetxt(os.path.splitext(img_path)[0] + ".pts",
                   self.aug_landmarks[index], fmt="%.4f")
        np.savetxt(os.path.splitext(img_path)[0] + ".opts",
                   self.occlusions[index], fmt="%d")
def _split_core(self, x, y, mode, phase):
    data_dir = os.path.join(data_param['data_save_dir'], mode)

    for index in range(len(x)):
        img = x[index][0]
        name = x[index][1]
        landmark = y[index][0] * self.img_size  # de-normalize to pixel coordinates
        occlusion = y[index][1]

        # save data
        img_path = os.path.join(data_dir, add_postfix(name, "_{}".format(phase)))
        cv2.imwrite(img_path, img)
        np.savetxt(os.path.splitext(img_path)[0] + ".pts", landmark, fmt="%.4f")
        np.savetxt(os.path.splitext(img_path)[0] + ".opts", occlusion, fmt="%d")
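
# Round-trip sketch for the label files written above (this helper is
# illustrative, not part of the original code): np.loadtxt restores the arrays
# that np.savetxt wrote alongside each image.
def load_labels(img_path):
    base = os.path.splitext(img_path)[0]
    landmark = np.loadtxt(base + ".pts")               # landmark coordinates
    occlusion = np.loadtxt(base + ".opts", dtype=int)  # per-point occlusion flags
    return landmark, occlusion
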
def _balance(self, balanced_num=None):
    """Balance the dataset.

    Adds `balanced_num` gaussian-noise copies of every occluded sample
    (so each grows to balanced_num + 1 instances in total).

    :param balanced_num: copies to add per occluded sample; defaults to
        int(1 / occluded_ratio)
    """
    count = 0
    for index in range(self.data_size):
        if np.sum(self.occlusions[index]) > 0:
            count += 1
    ratio = float(count) / self.data_size
    balanced_num = int(float(1) / ratio) if balanced_num is None else balanced_num

    occlusions_add = []
    heatmaps_add = []
    faces_add = []
    names_add = []
    landmarks_add = []
    for index in range(len(self.occlusions)):
        if np.sum(self.occlusions[index]) > 0:
            for num in range(balanced_num):
                heatmap = gaussian_noise(self.heat_maps[index], color=self.color)
                heatmaps_add.append(heatmap)
                face = gaussian_noise(self.faces[index], color=self.color)
                faces_add.append(face)
                occlusions_add.append(self.occlusions[index])
                landmarks_add.append(self.aug_landmarks[index])
                names_add.append(
                        add_postfix(self.names[index], "_gaussian_{}".format(num)))
        if self.print_debug and (index + 1) % 500 == 0:
            logger("data aug phase 2 processed {} images".format(index + 1))

    self.faces = extend(self.faces, faces_add)
    self.occlusions.extend(occlusions_add)
    self.heat_maps.extend(heatmaps_add)
    self.aug_landmarks.extend(landmarks_add)
    self.names.extend(names_add)
    self.data_size = len(self.occlusions)
    logger("length of imgs and occlusions is {}".format(self.data_size))
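
# Illustrative helper (not in the original code) restating the default
# oversampling factor: with 200 occluded samples out of 1000, ratio is 0.2 and
# balanced_num becomes int(1 / 0.2) == 5, so each occluded sample gains five
# gaussian-noise copies and the occluded class grows from 200 to 1200.
def _default_balanced_num(n_occluded, n_total):
    ratio = float(n_occluded) / n_total
    return int(1.0 / ratio)

assert _default_balanced_num(200, 1000) == 5
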
def plot_graph_and_save_audio(args,
                              base_path=None,
                              start_of_sentence=None, end_of_sentence=None,
                              pre_word_num=0, post_word_num=0,
                              pre_surplus_idx=0, post_surplus_idx=1,
                              use_short_concat=False,
                              use_manual_attention=False,
                              save_alignment=False,
                              librosa_trim=False,
                              attention_trim=False,
                              time_str=None,
                              isKorean=True):

    idx, (wav, alignment, path, text, sequence) = args

    if base_path:
        plot_path = "{}/{}.png".format(base_path, get_time())
    elif path:
        plot_path = path.rsplit('.', 1)[0] + ".png"
    else:
        plot_path = None

    #plot_path = add_prefix(plot_path, time_str)
    if use_manual_attention:
        plot_path = add_postfix(plot_path, "manual")

    if plot_path:
        plot.plot_alignment(alignment, plot_path, text=text, isKorean=isKorean)

    if use_short_concat:
        wav = short_concat(
                wav, alignment, text,
                start_of_sentence, end_of_sentence,
                pre_word_num, post_word_num,
                pre_surplus_idx, post_surplus_idx)

    if attention_trim and end_of_sentence:
        # Once attention has reached the last character, drop the frames
        # generated after that point.
        end_idx_counter = 0
        attention_argmax = alignment.argmax(0)
        end_idx = min(len(sequence) - 1, max(attention_argmax))
        max_counter = min((attention_argmax == end_idx).sum(), 5)

        for jdx, attend_idx in enumerate(attention_argmax):
            if len(attention_argmax) > jdx + 1:
                if attend_idx == end_idx:
                    end_idx_counter += 1

                if attend_idx == end_idx and attention_argmax[jdx + 1] > end_idx:
                    break

                if end_idx_counter >= max_counter:
                    break
            else:
                break

        spec_end_idx = hparams.reduction_factor * jdx + 3
        wav = wav[:spec_end_idx]

    audio_out = inv_spectrogram(wav.T)

    if librosa_trim and end_of_sentence:
        yt, index = librosa.effects.trim(
                audio_out, frame_length=5120, hop_length=256, top_db=50)
        audio_out = audio_out[:index[-1]]

    if save_alignment:
        alignment_path = "{}/{}.npy".format(base_path, idx)
        np.save(alignment_path, alignment, allow_pickle=False)

    if path or base_path:
        if path:
            current_path = add_postfix(path, idx)
        elif base_path:
            current_path = plot_path.replace(".png", ".wav")

        save_audio(audio_out, current_path)
        return True
    else:
        io_out = io.BytesIO()
        save_audio(audio_out, io_out)
        result = io_out.getvalue()
        return result
def plot_graph_and_save_audio(args,
                              base_path=None,
                              start_of_sentence=None, end_of_sentence=None,
                              pre_word_num=0, post_word_num=0,
                              pre_surplus_idx=0, post_surplus_idx=1,
                              save_alignment=False,
                              librosa_trim=False, attention_trim=False,
                              time_str=None, isKorean=True, config=None):

    idx, (wav, alignment, path, text, sequence, mel) = args

    if base_path:
        plot_path = "{}/{}_{}.png".format(base_path, config.file.split('.')[0], idx)
    elif path:
        plot_path = path.rsplit('.', 1)[0] + ".png"
    else:
        plot_path = None

    if plot_path:
        plot.plot_alignment(alignment, plot_path, text=text, isKorean=isKorean)

    if attention_trim and end_of_sentence:
        # If attention has reached the end of the text, discard everything
        # generated after that point.
        end_idx_counter = 0
        # alignment: (text length(encoder), target length(decoder))
        # ==> argmax over axis 0 gives the attended text index per decoder step.
        attention_argmax = alignment.argmax(0)
        end_idx = min(len(sequence) - 1, max(attention_argmax))
        max_counter = min((attention_argmax == end_idx).sum(), 5)

        for jdx, attend_idx in enumerate(attention_argmax):
            if len(attention_argmax) > jdx + 1:
                if attend_idx == end_idx:
                    end_idx_counter += 1

                if attend_idx == end_idx and attention_argmax[jdx + 1] > end_idx:
                    break

                if end_idx_counter >= max_counter:
                    break
            else:
                break

        spec_end_idx = hparams.reduction_factor * jdx + 3
        wav = wav[:spec_end_idx]
        mel = mel[:spec_end_idx]

    audio_out = inv_linear_spectrogram(wav.T, hparams)

    if librosa_trim and end_of_sentence:
        yt, index = librosa.effects.trim(
                audio_out, frame_length=5120, hop_length=256, top_db=50)
        audio_out = audio_out[:index[-1]]
        mel = mel[:index[-1] // hparams.hop_size]

    if save_alignment:
        alignment_path = "{}/{}.npy".format(base_path, idx)
        np.save(alignment_path, alignment, allow_pickle=False)

    if path or base_path:
        if path:
            current_path = add_postfix(path, idx)
        elif base_path:
            current_path = plot_path.replace(".png", ".wav")

        save_wav(audio_out, current_path, hparams.sample_rate)  # hccho

        mel_path = current_path.replace(".wav", ".npy")
        np.save(mel_path, mel)
        return current_path
    else:
        io_out = io.BytesIO()
        save_wav(audio_out, io_out, hparams.sample_rate)
        result = io_out.getvalue()
        return result  # the raw wav bytes, not the BytesIO wrapper
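
# Worked check of the audio/mel trim above (illustrative numbers only): with
# hparams.hop_size == 256, trimming audio_out to index[-1] == 51200 samples
# keeps 51200 // 256 == 200 mel frames, so waveform and mel stay aligned.
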
def plot_graph_and_save_audio(args,
                              base_path=None,
                              start_of_sentence=None, end_of_sentence=None,
                              pre_word_num=0, post_word_num=0,
                              pre_surplus_idx=0, post_surplus_idx=1,
                              use_short_concat=False,
                              save_alignment=False,
                              librosa_trim=False, attention_trim=False,
                              time_str=None, isKorean=True):

    idx, (wav, alignment, path, text, sequence, mel) = args

    if base_path:
        plot_path = "{}/{}.png".format(base_path, get_time())
    elif path:
        plot_path = path.rsplit('.', 1)[0] + ".png"
    else:
        plot_path = None

    if plot_path:
        plot.plot_alignment(alignment, plot_path, text=text, isKorean=isKorean)

    if use_short_concat:
        wav = short_concat(
                wav, alignment, text,
                start_of_sentence, end_of_sentence,
                pre_word_num, post_word_num,
                pre_surplus_idx, post_surplus_idx)

    if attention_trim and end_of_sentence:
        # If attention has reached the end of the text, discard everything
        # generated after that point.
        end_idx_counter = 0
        # alignment: (text length(encoder), target length(decoder))
        # ==> argmax over axis 0 gives the attended text index per decoder step.
        attention_argmax = alignment.argmax(0)
        end_idx = min(len(sequence) - 1, max(attention_argmax))

        # max_counter = min((attention_argmax == end_idx).sum(), 5) + 1
        # 20200612: the logic above set max_counter to the min of 5 and the
        # number of frames in attention_argmax equal to end_idx (the actual
        # ending). Korean speakers tend to hold the final sound, so rather
        # than capping at 5 we keep every frame actually spent on the ending;
        # hence the replacement below. (Why the original design capped it at
        # 5 is unknown.)
        max_counter = (attention_argmax == end_idx).sum()

        for jdx, attend_idx in enumerate(attention_argmax):
            if len(attention_argmax) > jdx + 1:
                if attend_idx == end_idx:
                    end_idx_counter += 1

                if attend_idx == end_idx and attention_argmax[jdx + 1] > end_idx:
                    break

                if end_idx_counter >= max_counter:
                    break
            else:
                break

        spec_end_idx = hparams.reduction_factor * jdx + 3
        wav = wav[:spec_end_idx]
        mel = mel[:spec_end_idx]

    audio_out = inv_linear_spectrogram(wav.T, hparams)

    if librosa_trim and end_of_sentence:
        yt, index = librosa.effects.trim(
                audio_out, frame_length=5120, hop_length=256, top_db=50)
        audio_out = audio_out[:index[-1]]
        mel = mel[:index[-1] // hparams.hop_size]

    if save_alignment:
        alignment_path = "{}/{}.npy".format(base_path, idx)
        np.save(alignment_path, alignment, allow_pickle=False)

    if path or base_path:
        if path:
            current_path = add_postfix(path, idx)
        elif base_path:
            current_path = plot_path.replace(".png", ".wav")

        save_wav(audio_out, current_path, hparams.sample_rate)  # hccho

        mel_path = current_path.replace(".wav", ".npy")
        np.save(mel_path, mel)
        #return True
        return audio_out
    else:
        io_out = io.BytesIO()
        save_wav(audio_out, io_out, hparams.sample_rate)
        result = io_out.getvalue()  # note: the encoded bytes are discarded below
        return audio_out            # raw samples are returned in both branches
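
# Standalone sketch of the attention-trim rule above on toy data, assuming
# reduction_factor == 5 (illustrative only, not the project's hparams). Per
# decoder step, argmax over the encoder axis gives the attended character
# index; decoding is cut shortly after attention settles on, or jumps past,
# the final character.
def _toy_attention_trim_end(attention_argmax, sequence_len, reduction_factor=5):
    """Return the last spectrogram frame index to keep (mirrors the loop above)."""
    end_idx = min(sequence_len - 1, int(max(attention_argmax)))
    max_counter = min(sum(1 for a in attention_argmax if a == end_idx), 5)
    end_idx_counter = 0
    jdx = 0
    for jdx, attend_idx in enumerate(attention_argmax):
        if len(attention_argmax) > jdx + 1:
            if attend_idx == end_idx:
                end_idx_counter += 1
            if attend_idx == end_idx and attention_argmax[jdx + 1] > end_idx:
                break
            if end_idx_counter >= max_counter:
                break
        else:
            break
    return reduction_factor * jdx + 3

# e.g. a decoder that attends 0,0,1,1 then dwells on the last of 3 characters
# for four frames stops at jdx == 7, keeping 5 * 7 + 3 == 38 frames:
assert _toy_attention_trim_end([0, 0, 1, 1, 2, 2, 2, 2], 3) == 38
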
def generate_audio_response(textList, speaker_id, alarm_id):
    #global global_config
    #model_name = os.path.basename(global_config.load_path)
    #iskorean = global_config.is_korean

    audio_clear()

    global member_id, method_id

    # Reload the synthesizer only when the requested speaker needs a
    # different checkpoint than the one currently loaded (member_id == -1
    # means nothing is loaded yet).
    if member_id != speaker_id:
        if speaker_id == 0:
            if not (member_id == 0):
                if member_id != -1:
                    synthesizer.close()
                synthesizer.load('logs/backup_log/son+yuinna', 2)
        elif speaker_id == 3:
            if not (member_id == 3):
                if member_id != -1:
                    synthesizer.close()
                synthesizer.load('logs/backup_log/new_inna+kss+leejh+nandong2', 4)
        else:
            if not (member_id == 1 or member_id == 2 or member_id == 4):
                if member_id != -1:
                    synthesizer.close()
                synthesizer.load('logs/backup_log/new_inna+kss+leejh', 3)
        member_id = speaker_id

    # Map the requested speaker to a display name and the speaker index
    # within the loaded model.
    if speaker_id == 0:
        model_name = '손석희'
        #speaker_id = 0
    elif speaker_id == 1:
        model_name = '유인나'
        speaker_id = 0
    elif speaker_id == 2:
        model_name = '코퍼스'  # Korean corpus
        speaker_id = 1
    elif speaker_id == 3:
        model_name = '김난희'
        #speaker_id = 3
    else:
        model_name = '이주형'
        speaker_id = 2

    ###########################################################################
    # Synthesize each text in the list in turn.
    textcnt = 0      # index of the current text
    audio_list = []  # kept for debugging
    print(textList)
    for text in textList:
        # hashed_text = hashlib.md5(text.encode('utf-8')).hexdigest()  # hash of the text
        hashed_text = "{}".format(str(textcnt))

        # Build the output paths for this clip.
        relative_dir_path = os.path.join(AUDIO_DIR, model_name)
        relative_audio_path = os.path.join(
                relative_dir_path, "{}.{}.wav".format(hashed_text, speaker_id))
        real_path = os.path.join(ROOT_PATH, relative_audio_path)
        makedirs(os.path.dirname(real_path))

        if not os.path.exists(add_postfix(real_path, 0)):
            try:
                # `audio` is the saved file name.
                audio = synthesizer.synthesize(
                        [text], paths=[real_path], speaker_ids=[speaker_id],
                        attention_trim=True)[0]
                audio_list.append(audio)
            except Exception:
                return jsonify(success=False), 400
        textcnt += 1

    ###########################################################################
    # Combine the clips; the combined file is named 'output.wav'.
    CUR_PATH = os.getcwd()
    #print(CUR_PATH)   # for checking the audio path
    FILE_PATH = os.path.join(AUDIO_PATH, model_name)
    #print(FILE_PATH)  # for checking the audio path

    print("running method {}".format(method_id))
    alarm_type = 0
    alarm_id -= 1
    if (method_id == 1) or (method_id == 2):  # basic
        combine_audio(os.path.join(CUR_PATH, FILE_PATH))
    elif method_id == 3:  # morning_call
        combine_audio(os.path.join(CUR_PATH, FILE_PATH))  # web\audio\model_name\output.wav
        if alarm_id == 0 or alarm_id == 1 or alarm_id == 2 or alarm_id == 3:
            alarm_type = 0
        else:
            alarm_id = (alarm_id - 4)
            alarm_type = 1
        create_alarm(alarm_id, model_name, alarm_type)  # bgm_select, model_name, type
    elif method_id == 4:  # briefing
        combine_audio(os.path.join(CUR_PATH, FILE_PATH))  # web\audio\model_name\output.wav
        create_briefing(alarm_id, model_name)  # bgm_select, model_name  # 0 1 2 3
    elif method_id == 5:  # birthday
        combine_audio(os.path.join(CUR_PATH, FILE_PATH))  # web\audio\model_name\output.wav
        create_birthday(0, model_name)  # bgm_select, model_name  # 0 1 2 3

    #print(os.path.join(CUR_PATH, FILE_PATH))
    #print(TEST_PATH)

    ###########################################################################
    return send_file(
            os.path.join('audio', model_name, 'output.wav'),
            mimetype="audio/wav",
            as_attachment=True,
            attachment_filename=hashed_text + ".wav")
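
# The project's combine_audio is not shown here; below is a minimal sketch of
# what it is assumed to do: concatenate the numbered clips in a directory into
# 'output.wav'. Sample width/rate handling is simplified, and all clips are
# assumed to share one format.
import wave

def combine_audio_sketch(dir_path, out_name="output.wav"):
    clips = sorted(f for f in os.listdir(dir_path)
                   if f.endswith(".wav") and f != out_name)
    frames, params = [], None
    for name in clips:
        with wave.open(os.path.join(dir_path, name), "rb") as clip:
            if params is None:
                params = clip.getparams()  # take the format from the first clip
            frames.append(clip.readframes(clip.getnframes()))
    with wave.open(os.path.join(dir_path, out_name), "wb") as out:
        out.setparams(params)
        for chunk in frames:
            out.writeframes(chunk)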