def save_spectrogram_tdsv():
    """Select text-specific utterances and perform STFT on each audio file.

    Spectrograms are saved one numpy file per speaker: the first 90% of
    speakers go to ``config.train_path`` and the remaining 10% to
    ``config.test_path``.

    Need : utterance data set (VCTK), reachable through the module-level
    ``audio_paths`` list of dataset root directories.

    NOTE(review): this file defines ``save_spectrogram_tdsv`` more than once;
    later definitions shadow this one at import time — confirm which variant
    callers actually need.
    """
    print("start text dependent utterance selection")
    os.makedirs(config.train_path, exist_ok=True)  # make folder to save train file
    os.makedirs(config.test_path, exist_ok=True)   # make folder to save test file

    # Count speakers across every dataset root so the split is by speaker,
    # not by utterance: first 90% train, last 10% test.
    total_speaker_num = sum(len(os.listdir(p)) for p in audio_paths)
    train_speaker_num = (total_speaker_num // 10) * 9
    print("total speaker number : %d" % total_speaker_num)
    print("train : %d, test : %d" % (train_speaker_num, total_speaker_num - train_speaker_num))

    i = 0  # running speaker index across all dataset roots
    for audio_path in audio_paths:
        for folder in os.listdir(audio_path):
            speaker_path = os.path.join(audio_path, folder)  # path of each speaker
            print("%dth speaker processing..." % i)
            utterances_spec = []
            for utter_name in os.listdir(speaker_path):
                utter_path = os.path.join(speaker_path, utter_name)  # path of each utterance
                try:
                    utter, sr = librosa.core.load(utter_path, config.sr)  # load the utterance audio
                    # Trim the beginning and end blank (silence).
                    utter_trim, index = librosa.effects.trim(utter, top_db=14)
                    # If the trimmed file is too short to yield config.tdsv_frame
                    # frames, then pass.
                    if utter_trim.shape[0] / sr <= config.hop * (config.tdsv_frame + 2):
                        print(os.path.basename(utter_path), "voice trim fail")
                        continue
                    S = librosa.core.stft(y=utter_trim, n_fft=config.nfft,
                                          win_length=int(config.window * sr),
                                          hop_length=int(config.hop * sr))  # perform STFT
                    # Keyword spot (for now, just slice last 80 frames which
                    # contains "Call Stella").
                    S = keyword_spot(S)
                    utterances_spec.append(S)  # make spectrograms list
                except Exception as e:
                    # Best-effort: skip unreadable/broken audio files, but
                    # report why instead of silently swallowing every error.
                    print(os.path.basename(utter_path), "skipped:", e)
                    continue

            utterances_spec = np.array(utterances_spec)  # list to numpy array
            print(utterances_spec.shape)
            if i < train_speaker_num:  # save spectrogram as numpy file
                np.save(os.path.join(config.train_path, "speaker%d.npy" % i), utterances_spec)
            else:
                np.save(os.path.join(config.test_path, "speaker%d.npy" % (i - train_speaker_num)), utterances_spec)
            i += 1
def save_spectrogram_tdsv():
    """Select text specific utterance and perform STFT with the audio file.

    Audio spectrogram files are divided as train set and test set and saved
    as numpy file (``train.npy`` / ``test.npy``, split 90/10 after shuffling).

    Need : utterance data set (VTCK), reachable through the module-level
    ``audio_path`` directory (one sub-folder per speaker).
    """
    print("start text dependent utterance selection")
    os.makedirs(config.train_path, exist_ok=True)  # make folder to save train file
    os.makedirs(config.test_path, exist_ok=True)   # make folder to save test file

    utterances_spec = []
    for speaker_folder in os.listdir(audio_path):
        speaker_dir = os.path.join(audio_path, speaker_folder)
        file_names = sorted(os.listdir(speaker_dir))
        # Only the alphabetically first utterance of each speaker is used.
        utter_path = os.path.join(speaker_dir, file_names[0])
        stem = os.path.splitext(os.path.basename(utter_path))[0]
        if stem[-3:] != '001':
            # The text-specific utterance doesn't exist for this speaker.
            print(os.path.basename(utter_path)[:4], "001 file doesn't exist")
            continue

        # Load the utterance audio and trim the beginning and end blank.
        utter, sr = librosa.core.load(utter_path, config.sr)
        utter_trim, index = librosa.effects.trim(utter, top_db=14)

        # If the trimmed file is too short, then pass.
        min_duration = config.hop * (config.tdsv_frame + 2)
        if utter_trim.shape[0] / sr <= min_duration:
            print(os.path.basename(utter_path), "voice trim fail")
            continue

        # Perform STFT, then keyword spot (for now, just slice last 80 frames
        # which contains "Call Stella").
        spec = librosa.core.stft(y=utter_trim,
                                 n_fft=config.nfft,
                                 win_length=int(config.window * sr),
                                 hop_length=int(config.hop * sr))
        utterances_spec.append(keyword_spot(spec))  # make spectrograms list

    utterances_spec = np.array(utterances_spec)  # list to numpy array
    np.random.shuffle(utterances_spec)           # shuffle spectrogram (by person)

    total_num = utterances_spec.shape[0]
    train_num = (total_num // 10) * 9  # split total data 90% train and 10% test
    print("selection is end")
    print("total utterances number : %d" % total_num, ", shape : ", utterances_spec.shape)
    print("train : %d, test : %d" % (train_num, total_num - train_num))

    # Save spectrogram as numpy file.
    np.save(os.path.join(config.train_path, "train.npy"), utterances_spec[:train_num])
    np.save(os.path.join(config.test_path, "test.npy"), utterances_spec[train_num:])
def save_spectrogram_tdsv(path, data_type):
    """Select text specific utterance and perform STFT with the audio file.

    Spectrograms for all speakers under *path* are stacked, shuffled, and
    saved as a single numpy file named ``<data_type>.npy`` inside *path*.

    Need : utterance data set (VTCK); *path* holds one sub-folder per speaker.
    """
    print('Preprocess ' + data_type)

    utterances_spec = []
    for entry in os.listdir(path):
        speaker_dir = os.path.join(path, entry)
        if not os.path.isdir(speaker_dir):
            continue  # skip stray files at the top level

        # Only the alphabetically first utterance of each speaker is used.
        first_audio = sorted(os.listdir(speaker_dir))[0]
        utter_path = os.path.join(speaker_dir, first_audio)
        stem = os.path.splitext(os.path.basename(utter_path))[0]
        if config.train and stem[-3:] != '001':
            # In training mode the text-specific utterance must exist.
            print(os.path.basename(utter_path)[:4], "001 file doesn't exist")
            continue

        # Load the utterance audio and trim the beginning and end blank.
        utter, sr = librosa.core.load(utter_path, config.sr)
        utter_trim, index = librosa.effects.trim(utter, top_db=14)

        # If the trimmed file is too short, then pass.
        if utter_trim.shape[0] / sr <= config.hop * (config.tdsv_frame + 2):
            print(os.path.basename(utter_path), "voice trim fail")
            continue

        # Perform STFT, then keyword spot (for now, just slice last 80 frames
        # which contains "Call Stella").
        spec = librosa.core.stft(y=utter_trim,
                                 n_fft=config.nfft,
                                 win_length=int(config.window * sr),
                                 hop_length=int(config.hop * sr))
        utterances_spec.append(keyword_spot(spec))  # make spectrograms list

    utterances_spec = np.array(utterances_spec)  # list to numpy array
    np.random.shuffle(utterances_spec)           # shuffle spectrogram (by person)

    total_num = utterances_spec.shape[0]
    print("Speaker number : %d" % total_num, ", shape : ", utterances_spec.shape)

    # Save spectrogram as numpy file.
    np.save(os.path.join(path, data_type + ".npy"), utterances_spec)