def process_ted(csv_file, category):
    """Extract MFCC features for every segment of a TEDLIUM release 2 subset.

    Every file under ``<category>/stm/`` is read and each of its records
    yields one audio segment of the matching ``sph/<name>.sph.wav`` file.
    A segment whose label is shorter than its number of MFCC frames gets one
    CSV row (segment id followed by the label indices) and one ``.npy`` file
    under ``asset/data/preprocess/mfcc/``.

    Args:
        csv_file: open, writable file object that receives the meta rows.
        category: sub-directory of ``TEDLIUM_release2`` to process.
    """
    corpus_dir = _data_path + 'TEDLIUM_release2/' + category + '/'
    meta_writer = csv.writer(csv_file, delimiter=',')

    # one (wave path, label, offset, duration) tuple per STM record
    segments = []
    for stm_path in glob.glob(corpus_dir + 'stm/*'):
        with open(stm_path, 'rt') as stm:
            for line in stm.readlines():
                cols = line.split()
                # column 0 names the recording, columns 3 and 4 are the
                # start/end times, columns 6.. are the text to index
                wav_path = corpus_dir + 'sph/%s.sph.wav' % cols[0]
                indices = data.str2index(' '.join(cols[6:]))
                begin, finish = float(cols[3]), float(cols[4])
                segments.append((wav_path, indices, begin, finish - begin))

    total = len(segments)
    for idx, (wav_path, indices, begin, length) in enumerate(segments):
        print("TEDLIUM corpus preprocessing (%d / %d) - '%s-%.2f]" % (idx, total, wav_path, begin))

        # load only this segment of the recording, at its native sample rate
        wave, sr = librosa.load(wav_path, mono=True, sr=None, offset=begin, duration=length)
        feats = librosa.feature.mfcc(wave, sr=16000)

        # exclude segments with too few frames for their label (CTC loss)
        if len(indices) >= feats.shape[1]:
            continue

        segment_id = "%s-%.2f" % (wav_path.split('/')[-1], begin)
        meta_writer.writerow([segment_id] + indices)
        np.save('asset/data/preprocess/mfcc/' + segment_id + '.npy', feats, allow_pickle=False)
def process_ted(csv_file, category):
    """Extract MFCC features for a TEDLIUM release 2 subset, resumably.

    Segments are read from every file under ``<category>/stm/``. A segment
    whose ``.npy`` output already exists is skipped. When the ``.sph.wav``
    file is missing it is produced from the ``.sph`` file via
    ``convert_sph``. Segments whose label is shorter than their number of
    MFCC frames get one CSV row and one ``.npy`` feature file.

    Args:
        csv_file: open, writable file object that receives the meta rows.
        category: sub-directory of ``TEDLIUM_release2`` to process.

    Raises:
        RuntimeError: neither the wave file nor its ``.sph`` source exists.
    """
    corpus_dir = _data_path + 'TEDLIUM_release2/' + category + '/'
    meta_writer = csv.writer(csv_file, delimiter=',')

    # one (wave path, label, offset, duration) tuple per STM record
    segments = []
    for stm_path in glob.glob(corpus_dir + 'stm/*'):
        with open(stm_path, 'rt') as stm:
            for line in stm.readlines():
                cols = line.split()
                wav_path = corpus_dir + 'sph/%s.sph.wav' % cols[0]
                indices = data.str2index(' '.join(cols[6:]))
                begin, finish = float(cols[3]), float(cols[4])
                segments.append((wav_path, indices, begin, finish - begin))

    total = len(segments)
    for idx, (wav_path, indices, begin, length) in enumerate(segments):
        segment_id = "%s-%.2f" % (wav_path.split('/')[-1], begin)
        npy_path = 'asset/data/preprocess/mfcc/' + segment_id + '.npy'
        # already processed in an earlier run
        if os.path.exists(npy_path):
            continue

        print("TEDLIUM corpus preprocessing (%d / %d) - '%s-%.2f]" % (idx, total, wav_path, begin))

        if not os.path.exists(wav_path):
            # dropping the trailing '.wav' gives the path of the sph source
            sph_path = wav_path.rsplit('.', 1)[0]
            if not os.path.exists(sph_path):
                raise RuntimeError("Missing sph file from TedLium corpus at %s" % (sph_path))
            convert_sph(sph_path, wav_path)

        # load only this segment of the recording, at its native sample rate
        wave, sr = librosa.load(wav_path, mono=True, sr=None, offset=begin, duration=length)
        feats = librosa.feature.mfcc(wave, sr=16000)

        # exclude segments with too few frames for their label (CTC loss)
        if len(indices) >= feats.shape[1]:
            continue
        meta_writer.writerow([segment_id] + indices)
        np.save(npy_path, feats, allow_pickle=False)
def save_recording(wave_file_name, wave, sentence, csv_file_name):
    """Store the MFCC features of one recording and append its CSV entry.

    The feature file is named after the part of ``wave_file_name`` before its
    first ``.``, followed by a UTC timestamp, and is written to
    ``FOLDER_MFCC``. The CSV line holds that file name and the label indices
    of ``sentence`` joined by spaces.

    Args:
        wave_file_name: name of the recording; only the text before the
            first ``.`` is used.
        wave: audio samples handed to ``librosa.feature.mfcc``.
        sentence: transcript; transliterated with ``unidecode`` before being
            converted by ``data.str2index``.
        csv_file_name: path of the CSV file to append to.
    """
    mfcc = librosa.feature.mfcc(y=wave, sr=16000)

    time_id = strftime("%d_%b_%Y_%H_%M_%S", gmtime())
    target_file_name = wave_file_name.split(".")[0] + time_id + ".npy"

    print(sentence)
    label = " ".join(map(str, data.str2index(unidecode(sentence))))

    # the context manager closes the file even when the write fails
    with open(csv_file_name, "a") as file_csv:
        file_csv.write(",".join([target_file_name, label]) + "\n")

    np.save(FOLDER_MFCC + "/" + target_file_name, mfcc, allow_pickle=False)
def process_commonvoice(csv_file, category):
    """Extract MFCC features for one split of the Common Voice v1 corpus.

    ``cv_corpus_v1/<category>.csv`` lists the clips. Clips whose ``.npy``
    output already exists are skipped, and clips that cannot be loaded or
    converted are reported and skipped. Clips whose label is shorter than
    their number of MFCC frames get one CSV row and one ``.npy`` file under
    ``asset/data/preprocess/mfcc/``.

    Args:
        csv_file: open, writable file object that receives the meta rows.
        category: base name of the corpus CSV file (without ``.csv``).
    """
    parent_path = _data_path + 'cv_corpus_v1/'
    labels, wave_files = [], []

    # create csv writer
    writer = csv.writer(csv_file, delimiter=',')

    with open(parent_path + category + '.csv') as csvfile:
        # columns: filename,text,up_votes,down_votes,age,gender,accent,duration
        for row in csv.DictReader(csvfile):
            # the extra '/' is kept: it is part of the generated output name
            wave_files.append(parent_path + '/' + row['filename'] + '.wav')
            labels.append(data.str2index(row['text']))

    total = len(wave_files)
    for i, (wave_file, label) in enumerate(zip(wave_files, labels)):
        # the whole path, with '/' replaced, is the output name
        fn = wave_file.replace('/', '-')
        target_filename = 'asset/data/preprocess/mfcc/' + fn + '.npy'
        if os.path.exists(target_filename):
            continue

        # print info
        print("CommonVoice corpus preprocessing (%d / %d) - '%s']" % (i, total, wave_file))

        # best effort: an unreadable clip must not abort the whole run
        try:
            wave, sr = librosa.load(wave_file, mono=True, sr=None)
            # keep every third sample ( 48K -> 16K ), no filtering applied
            wave = wave[::3]
            mfcc = librosa.feature.mfcc(y=wave, sr=16000)
        except Exception as e:
            print('Failed for  %s %s' % (wave_file, e))
            continue

        # save result ( exclude small mfcc data to prevent ctc loss )
        if len(label) < mfcc.shape[1]:
            writer.writerow([fn] + label)
            np.save(target_filename, mfcc, allow_pickle=False)
def process_vctk(csv_file):
    """Extract MFCC features for every utterance of the VCTK corpus.

    Speaker ids come from ``VCTK-Corpus/speaker-info.txt``; every transcript
    under ``txt/p<ID>/`` is paired with the wave file of the same name under
    ``wav48/``. Utterances whose label is shorter than their number of MFCC
    frames get one CSV row and one ``.npy`` file under
    ``asset/data/preprocess/mfcc/``.

    Args:
        csv_file: open, writable file object that receives the meta rows.
    """
    # create csv writer
    writer = csv.writer(csv_file, delimiter=',')

    # read label-info
    df = pd.read_table(_data_path + 'VCTK-Corpus/speaker-info.txt', usecols=['ID'],
                       index_col=False, delim_whitespace=True)

    # read file IDs: the 8 characters before '.txt' of each transcript path
    file_ids = []
    for d in [_data_path + 'VCTK-Corpus/txt/p%d/' % uid for uid in df.ID.values]:
        file_ids.extend([f[-12:-4] for f in sorted(glob.glob(d + '*.txt'))])

    total = len(file_ids)
    for i, f in enumerate(file_ids):
        # the first four characters of the id name the speaker directory
        wave_file = _data_path + 'VCTK-Corpus/wav48/%s/' % f[:4] + f + '.wav'

        # print info
        print("VCTK corpus preprocessing (%d / %d) - '%s']" % (i, total, wave_file))

        # load wave file
        wave, sr = librosa.load(wave_file, mono=True, sr=None)

        # keep every third sample ( 48K -> 16K ), no filtering applied
        wave = wave[::3]

        # get mfcc feature
        mfcc = librosa.feature.mfcc(y=wave, sr=16000)

        # get label index; the context manager closes each transcript file
        with open(_data_path + 'VCTK-Corpus/txt/%s/' % f[:4] + f + '.txt') as txt_file:
            label = data.str2index(txt_file.read())

        # save result ( exclude small mfcc data to prevent ctc loss )
        if len(label) < mfcc.shape[1]:
            fn = wave_file.split('/')[-1]
            writer.writerow([fn] + label)
            np.save('asset/data/preprocess/mfcc/' + fn + '.npy', mfcc, allow_pickle=False)
def process_new_data(csv_file, category):
    """Extract MFCC features for one split of the ``FINAL_DATA`` corpus.

    ``<category>/text.txt`` holds ``|``-separated records whose first field
    is the clip id and whose second field is the text. Records without a
    matching ``audio/<id>.wav`` are ignored, and clips whose ``.npy`` output
    already exists are skipped. Clips whose label is shorter than their
    number of MFCC frames get one CSV row and one ``.npy`` file.

    Args:
        csv_file: open, writable file object that receives the meta rows.
        category: sub-directory of ``FINAL_DATA`` to process.
    """
    parent_path = _data_path + 'FINAL_DATA/' + category + '/'
    labels, wave_files = [], []

    # create csv writer
    writer = csv.writer(csv_file, delimiter=',')

    # read label text file list
    with open(parent_path + 'text.txt', 'rt') as f:
        records = f.readlines()

    # List the audio directory once instead of once per record. As before,
    # it is only listed when there is at least one record to look up.
    available = set(os.listdir(parent_path + 'audio/')) if records else set()

    for record in records:
        # parsing record
        field = record.split('|')  # split by '|'
        if field[0] + '.wav' in available:
            wave_files.append(parent_path + 'audio/' + '%s.wav' % field[0])
            # the second field is the text label
            labels.append(data.str2index(field[1]))

    total = len(wave_files)
    for i, (wave_file, label) in enumerate(zip(wave_files, labels)):
        fn = wave_file.split('/')[-1]
        target_filename = 'asset/data/preprocess/mfcc/' + fn + '.npy'
        if os.path.exists(target_filename):
            continue

        # print info
        print("new_data corpus preprocessing (%d / %d) - '%s']" % (i, total, wave_file))

        # load wave file; unlike the sibling corpora, no decimation is applied
        wave, sr = librosa.load(wave_file, mono=True, sr=None)

        # get mfcc feature
        mfcc = librosa.feature.mfcc(y=wave, sr=16000)

        # save result ( exclude small mfcc data to prevent ctc loss )
        if len(label) < mfcc.shape[1]:
            writer.writerow([fn] + label)
            np.save(target_filename, mfcc, allow_pickle=False)
def process_oneword(csv_file):
    """Extract MFCC features for the wave files found in ``../audios``.

    The label of ``<name>.wav`` is read from ``../audios_labels/<name>.txt``,
    where ``<name>`` is the file name up to its first ``.``. Clips whose
    ``.npy`` output already exists are skipped. Clips whose label is shorter
    than their number of MFCC frames get one CSV row and one ``.npy`` file
    under ``asset/data/preprocess/mfcc-one/``.

    Args:
        csv_file: open, writable file object that receives the meta rows.
    """
    # create csv writer
    writer = csv.writer(csv_file, delimiter=',')

    oneword_dir = "../audios"
    file_ids = glob.glob(os.path.join(oneword_dir, "*.wav"))

    total = len(file_ids)
    for i, wave_file in enumerate(file_ids):
        fn = wave_file.split('/')[-1]
        target_filename = 'asset/data/preprocess/mfcc-one/' + fn + '.npy'
        if os.path.exists(target_filename):
            continue

        # print info
        print("One word corpus preprocessing (%d / %d) - '%s']" % (i, total, wave_file))

        # load wave file
        wave, sr = librosa.load(wave_file, mono=True, sr=None)

        # keep every third sample ( 48K -> 16K ), no filtering applied
        wave = wave[::3]

        # get mfcc feature
        mfcc = librosa.feature.mfcc(y=wave, sr=16000)

        # get label index; the context manager closes each label file
        label_fn = fn.split(".")[0]
        with open('../audios_labels/%s' % (label_fn + '.txt')) as label_file:
            label = data.str2index(label_file.read())

        # save result ( exclude small mfcc data to prevent ctc loss )
        if len(label) < mfcc.shape[1]:
            writer.writerow([fn] + label)
            np.save(target_filename, mfcc, allow_pickle=False)
def process_tidigits(csv_file):
    """Extract MFCC features for the ``train/woman/ac`` part of TIDIGITS.

    Clips whose ``.npy`` output already exists are skipped. Every other clip
    gets one CSV row and one ``.npy`` file under
    ``asset/data/preprocess/mfcc/``. Unlike the sibling corpora, there is no
    check that the label is shorter than the number of MFCC frames.

    Args:
        csv_file: open, writable file object that receives the meta rows.
    """
    # create csv writer
    writer = csv.writer(csv_file, delimiter=',')

    # read file IDs just for woman ac
    d = _data_path + 'TIDIGITS/train/woman/ac/'
    file_ids = sorted(glob.glob(d + '*.wav'))

    total = len(file_ids)
    for i, wave_file in enumerate(file_ids):
        fn = wave_file.split('/')[-1]
        target_filename = 'asset/data/preprocess/mfcc/' + fn + '.npy'
        if os.path.exists(target_filename):
            continue

        # print info
        print("TIDIGITS corpus preprocessing (%d / %d) - '%s']" % (i, total, wave_file))

        # load wave file
        wave, sr = librosa.load(wave_file, mono=True, sr=None)

        # keep every third sample ( 48K -> 16K ), no filtering applied
        wave = wave[::3]

        # get mfcc feature
        mfcc = librosa.feature.mfcc(y=wave, sr=16000)

        # get label index: strip only the trailing '.wav' extension (the dot
        # is escaped and anchored so "<char>wav" elsewhere is left alone)
        # NOTE(review): the label is derived from the full path, directories
        # included, not just the file name - confirm this is intended.
        gold_text = re.sub(r"\.wav$", "", wave_file)
        label = data.str2index(gold_text)

        # save meta info
        writer.writerow([fn] + label)

        # save mfcc
        np.save(target_filename, mfcc, allow_pickle=False)
def process_vctk(csv_file):
    """Extract MFCC features for every utterance of the VCTK corpus, resumably.

    Speaker ids come from ``VCTK-Corpus/speaker-info.txt``; every transcript
    under ``txt/p<ID>/`` is paired with the wave file of the same name under
    ``wav48/``. Utterances whose ``.npy`` output already exists are skipped.
    Utterances whose label is shorter than their number of MFCC frames get
    one CSV row and one ``.npy`` file under ``asset/data/preprocess/mfcc/``.

    Args:
        csv_file: open, writable file object that receives the meta rows.
    """
    # create csv writer
    writer = csv.writer(csv_file, delimiter=',')

    # read label-info
    df = pd.read_table(_data_path + 'VCTK-Corpus/speaker-info.txt', usecols=['ID'],
                       index_col=False, delim_whitespace=True)

    # read file IDs: the 8 characters before '.txt' of each transcript path
    file_ids = []
    for d in [_data_path + 'VCTK-Corpus/txt/p%d/' % uid for uid in df.ID.values]:
        file_ids.extend([f[-12:-4] for f in sorted(glob.glob(d + '*.txt'))])

    total = len(file_ids)
    for i, f in enumerate(file_ids):
        # the first four characters of the id name the speaker directory
        wave_file = _data_path + 'VCTK-Corpus/wav48/%s/' % f[:4] + f + '.wav'
        fn = wave_file.split('/')[-1]
        target_filename = 'asset/data/preprocess/mfcc/' + fn + '.npy'
        if os.path.exists(target_filename):
            continue

        # print info
        print("VCTK corpus preprocessing (%d / %d) - '%s']" % (i, total, wave_file))

        # load wave file
        wave, sr = librosa.load(wave_file, mono=True, sr=None)

        # keep every third sample ( 48K -> 16K ), no filtering applied
        wave = wave[::3]

        # get mfcc feature
        mfcc = librosa.feature.mfcc(y=wave, sr=16000)

        # get label index; the context manager closes each transcript file
        with open(_data_path + 'VCTK-Corpus/txt/%s/' % f[:4] + f + '.txt') as txt_file:
            label = data.str2index(txt_file.read())

        # save result ( exclude small mfcc data to prevent ctc loss )
        if len(label) < mfcc.shape[1]:
            writer.writerow([fn] + label)
            np.save(target_filename, mfcc, allow_pickle=False)
def process_libri(csv_file, category):
    """Extract MFCC features for one LibriSpeech subset, resumably.

    The ``<speaker>/<chapter>/*.txt`` files are read; each record has the
    form ``<speaker>-<chapter>-<utterance> <text>`` and refers to the flac
    file of the same id. Utterances whose ``.npy`` output already exists are
    skipped. Utterances whose label is shorter than their number of MFCC
    frames get one CSV row and one ``.npy`` file under
    ``asset/data/preprocess/mfcc/``.

    Args:
        csv_file: open, writable file object that receives the meta rows.
        category: sub-directory of ``LibriSpeech`` to process.
    """
    parent_path = _data_path + 'LibriSpeech/' + category + '/'
    labels, wave_files = [], []

    # create csv writer
    writer = csv.writer(csv_file, delimiter=',')

    # walk speaker directories, then chapter directories, then text files
    for spk in glob.glob(parent_path + '*'):
        for chap in glob.glob(spk + '/*/'):
            for txt in glob.glob(chap + '/*.txt'):
                with open(txt, 'rt') as f:
                    records = f.readlines()
                for record in records:
                    # Split on the first two '-' only, so that a hyphen
                    # inside the transcript does not cut the text short.
                    speaker, chapter, rest = record.split('-', 2)
                    field = rest.split()
                    utterance = field[0]  # first column is utterance id

                    # wave file name
                    wave_files.append(parent_path + '%s/%s/%s-%s-%s.flac' %
                                      (speaker, chapter, speaker, chapter, utterance))

                    # label index: the remaining columns are the text
                    labels.append(data.str2index(' '.join(field[1:])))

    total = len(wave_files)
    for i, (wave_file, label) in enumerate(zip(wave_files, labels)):
        fn = wave_file.split('/')[-1]
        target_filename = 'asset/data/preprocess/mfcc/' + fn + '.npy'
        if os.path.exists(target_filename):
            continue

        # print info
        print("LibriSpeech corpus preprocessing (%d / %d) - '%s']" % (i, total, wave_file))

        # load flac file
        wave, sr, _ = scikits.audiolab.flacread(wave_file)

        # get mfcc feature
        mfcc = librosa.feature.mfcc(y=wave, sr=16000)

        # save result ( exclude small mfcc data to prevent ctc loss )
        if len(label) < mfcc.shape[1]:
            writer.writerow([fn] + label)
            np.save(target_filename, mfcc, allow_pickle=False)
def process_libri(csv_file, category):
    """Preprocess one LibriSpeech subset into MFCC ``.npy`` files and CSV rows.

    Records of the form ``<speaker>-<chapter>-<utterance> <text>`` are read
    from every ``<speaker>/<chapter>/*.txt`` file; the audio is the flac
    file with the same id. An utterance is skipped when its ``.npy`` output
    already exists, and dropped when its label is not shorter than its
    number of MFCC frames.

    Args:
        csv_file: open, writable file object that receives the meta rows.
        category: sub-directory of ``LibriSpeech`` to process.
    """
    parent_path = _data_path + 'LibriSpeech/' + category + '/'
    labels, wave_files = [], []

    # create csv writer
    writer = csv.writer(csv_file, delimiter=',')

    # read directory list by speaker
    speaker_list = glob.glob(parent_path + '*')
    for spk in speaker_list:

        # read directory list by chapter
        chapter_list = glob.glob(spk + '/*/')
        for chap in chapter_list:

            # read label text file list
            txt_list = glob.glob(chap + '/*.txt')
            for txt in txt_list:
                with open(txt, 'rt') as f:
                    records = f.readlines()

                for record in records:
                    # maxsplit=2 keeps hyphens that occur in the transcript;
                    # an unlimited split would drop the text after them
                    field = record.split('-', 2)
                    speaker = field[0]
                    chapter = field[1]
                    field = field[2].split()  # utterance id, then the words
                    utterance = field[0]

                    # wave file name
                    wave_file = parent_path + '%s/%s/%s-%s-%s.flac' % \
                        (speaker, chapter, speaker, chapter, utterance)
                    wave_files.append(wave_file)

                    # label index
                    labels.append(data.str2index(' '.join(field[1:])))

    # save results
    n_files = len(wave_files)
    for i, (wave_file, label) in enumerate(zip(wave_files, labels)):
        fn = wave_file.split('/')[-1]
        target_filename = 'asset/data/preprocess/mfcc/' + fn + '.npy'
        if os.path.exists(target_filename):
            continue

        # print info
        print("LibriSpeech corpus preprocessing (%d / %d) - '%s']" % (i, n_files, wave_file))

        # load flac file
        wave, sr, _ = scikits.audiolab.flacread(wave_file)

        # get mfcc feature
        mfcc = librosa.feature.mfcc(y=wave, sr=16000)

        # save result ( exclude small mfcc data to prevent ctc loss )
        if len(label) < mfcc.shape[1]:
            # save meta info
            writer.writerow([fn] + label)
            # save mfcc
            np.save(target_filename, mfcc, allow_pickle=False)
def process_voxforge(csv_file):
    """Extract MFCC features for the VoxForge corpus, resumably.

    For each speaker directory the prompts are read from
    ``etc/prompts-original`` or, failing that, ``etc/prompts.txt``; speakers
    with neither file are ignored. Each prompt line names a wave file under
    ``wav/`` followed by its text. Prompt lines that cannot be parsed and
    clips that cannot be loaded are skipped, as are clips whose ``.npy``
    output already exists. Clips whose label is shorter than their number
    of MFCC frames get one CSV row and one ``.npy`` file under
    ``asset/data/preprocess/mfcc/``.

    Args:
        csv_file: open, writable file object that receives the meta rows.
    """
    parent_path = _data_path + 'voxforge_corpus/'
    labels, wave_files = [], []

    # create csv writer
    writer = csv.writer(csv_file, delimiter=',')

    for spk in glob.glob(parent_path + '*'):
        prompts_file = spk + '/' + 'etc/prompts-original'
        if not os.path.exists(prompts_file):
            prompts_file = spk + '/' + 'etc/prompts.txt'
            if not os.path.exists(prompts_file):
                continue

        with open(prompts_file, 'rt') as f:
            records = f.readlines()

        for record in records:
            # best effort: a malformed prompt line is skipped
            try:
                field = record.split()
                wave_file = spk + '/' + 'wav/' + field[0] + '.wav'
                label = data.str2index(' '.join(field[1:]))
            except Exception:
                continue
            # Append both only after both were computed. Appending the path
            # first and then failing on the label would leave the two lists
            # misaligned and pair every later clip with the wrong label.
            wave_files.append(wave_file)
            labels.append(label)

    total = len(wave_files)
    for i, (wave_file, label) in enumerate(zip(wave_files, labels)):
        # the whole path, with '/' replaced, is the output name
        fn = wave_file.replace('/', '-')
        target_filename = 'asset/data/preprocess/mfcc/' + fn + '.npy'
        if os.path.exists(target_filename):
            continue

        # print info
        print("VoxForge corpus preprocessing (%d / %d) - '%s']" % (i, total, wave_file))

        # best effort: an unreadable clip must not abort the whole run
        try:
            wave, sr = librosa.load(wave_file, mono=True, sr=None)
            # keep every third sample ( 48K -> 16K ), no filtering applied
            wave = wave[::3]
            mfcc = librosa.feature.mfcc(y=wave, sr=16000)
        except Exception as e:
            print('failed for  %s %s' % (wave_file, e))
            continue

        # save result ( exclude small mfcc data to prevent ctc loss )
        if len(label) < mfcc.shape[1]:
            writer.writerow([fn] + label)
            np.save(target_filename, mfcc, allow_pickle=False)
def process_libri(csv_file, category):
    """Extract normalized MFCC + delta features for one LibriSpeech subset.

    Records of the form ``<speaker>-<chapter>-<utterance> <text>`` are read
    from every ``<speaker>/<chapter>/*.txt`` file under
    ``_data_path + category``; the audio is the flac file with the same id.
    Utterances whose ``.npy`` output already exists are skipped.

    For each utterance, 13 MFCCs and their first- and second-order deltas
    are stacked and transposed to ``(frames, 13, 3)``, then normalized with
    the mean and standard deviation of the whole array. Utterances whose
    label is shorter than the number of frames get one CSV row and one
    ``.npy`` file under ``_root_path + 'preprocess/mfcc/'``.

    Args:
        csv_file: open, writable file object that receives the meta rows.
        category: sub-directory of ``_data_path`` to process.
    """
    parent_path = _data_path + category + '/'
    labels, wave_files = [], []

    # create csv writer
    writer = csv.writer(csv_file, delimiter=',')

    # walk speaker directories, then chapter directories, then text files
    for spk in glob.glob(parent_path + '*'):
        for chap in glob.glob(spk + '/*/'):
            for txt in glob.glob(chap + '/*.txt'):
                with open(txt, 'rt') as f:
                    records = f.readlines()
                for record in records:
                    # Split on the first two '-' only, so that a hyphen
                    # inside the transcript does not cut the text short.
                    speaker, chapter, rest = record.split('-', 2)
                    field = rest.split()
                    utterance = field[0]  # first column is utterance id

                    # wave file name
                    wave_files.append(parent_path + '%s/%s/%s-%s-%s.flac' %
                                      (speaker, chapter, speaker, chapter, utterance))

                    # label index: the remaining columns are the text
                    labels.append(data.str2index(' '.join(field[1:])))

    # analysis window and stride in samples
    n_fft = 400       # 25 ms at 16 kHz
    hop_length = 160  # 10 ms at 16 kHz

    total = len(wave_files)
    for i, (wave_file, label) in enumerate(zip(wave_files, labels)):
        fn = wave_file.split('/')[-1]  # extract file name
        target_filename = _root_path + 'preprocess/mfcc/' + fn + '.npy'
        if os.path.exists(target_filename):
            continue

        # print info
        print("LibriSpeech corpus preprocessing (%d / %d) - '%s']" % (i, total, wave_file))

        # load flac file
        wave, sr, _ = scikits.audiolab.flacread(wave_file)

        # Each call returns an array of shape (13, t), where t is the number
        # of frames: t = seconds * sample_rate / hop_length.
        mfcc = librosa.feature.mfcc(y=wave, sr=16000, n_fft=n_fft,
                                    hop_length=hop_length, n_mfcc=13)
        mfcc_delta = librosa.feature.delta(mfcc)
        mfcc_delta2 = librosa.feature.delta(mfcc, order=2)

        # stack to (3, 13, t), then reorder to (t, 13, 3)
        mfcc = np.asarray([mfcc, mfcc_delta, mfcc_delta2])
        mfcc_ = np.transpose(mfcc, axes=[2, 1, 0])

        # normalize with the statistics of the whole utterance
        mean = np.mean(mfcc_)
        std = np.std(mfcc_)
        mfcc_ = (mfcc_ - mean) / std

        # save result ( exclude small mfcc data to prevent ctc loss ):
        # a label at least as long as the frame count would need more than
        # one character per 10 ms hop, which cannot be separated
        if len(label) < mfcc_.shape[0]:
            writer.writerow([fn] + label)
            np.save(target_filename, mfcc_, allow_pickle=False)
decoded, _ = tf.nn.ctc_beam_search_decoder(logit.sg_transpose(perm=[1, 0, 2]), seq_len, merge_repeated=False) # to dense tensor pred = tf.sparse_to_dense(decoded[0].indices, decoded[0].dense_shape, decoded[0].values) + 1 targ = tf.placeholder(dtype=tf.int32, shape=(1, None)) #corpus.label.shape)#corpus.label loss = logit.sg_ctc(target=targ, seq_len=seq_len) opt = tf.train.GradientDescentOptimizer(learning_rate=lr) optimizer = opt.minimize(loss, var_list=(noise, )) new_target = np.array(str2index(fool)) # run network with tf.Session() as sess: # init variables tf.sg_init(sess) all_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) + \ tf.get_collection(tf.GraphKeys.SAVEABLE_OBJECTS) vars_to_train = [el for el in all_vars if 'noise' not in el.name] # restore parameters saver = tf.train.Saver(vars_to_train) saver.restore(sess, tf.train.latest_checkpoint('asset/train'))