def audio_split(path, min_sl=300, sth=-40):
    """Split audio into chunks on silence.

    Args:
        path: filesystem path to an audio file, or an AudioSegment
            instance to split directly.
        min_sl: minimum silence length in ms that triggers a split.
        sth: silence threshold in dBFS; anything quieter is silence.

    Returns:
        list of AudioSegment chunks.

    Exits the process (sys.exit) when the input is neither a supported
    audio file nor an AudioSegment.
    """
    # Check for AudioSegment first: os.path.isfile() raises TypeError on a
    # non-path object, so the original order crashed for AudioSegment input.
    if isinstance(path, pydub.audio_segment.AudioSegment):
        return split_on_silence(path, min_silence_len=min_sl, silence_thresh=sth)
    if os.path.isfile(path):
        dp = os.path.splitext(path)
        if dp[1] in ['.mp3', '.wav', '.flv', '.ogg', '.raw', '.m4a']:
            sgm = AudioSegment.from_file(path, format=dp[1].replace('.', ''))
            return split_on_silence(sgm, min_silence_len=min_sl, silence_thresh=sth)
        print('%s is not audio file,Please input audio file....' % path)
        sys.exit()
    print('Input is not audio file or AudioSegment')
    sys.exit()
def split_audio(self, path):
    """Extract mono 16 kHz audio from a video, split it on silence,
    export the chunks next to the video and return the raw samples.

    Args:
        path: path to the input video file.

    Returns:
        numpy array of samples read from the extracted audio.wav
        (the wav itself is deleted afterwards).
    """
    path = str(path)
    dirc = os.path.dirname(path)
    base = os.path.basename(dirc)
    ########CONVERT VIDEO TO AUDIO#########
    svideo = path
    saudio = dirc + '/' + 'audio.wav'
    # BUGFIX: the command was a tuple passed with shell=True, which on
    # POSIX executes only the first element ('ffmpeg' with no arguments).
    # Pass the argv list with shell=False so every argument is used.
    command = ['ffmpeg', '-i', svideo, '-ar', '16000', '-ac', '1', saudio]
    subprocess.call(command)
    ########SPLIT AUDIO#########
    sound = AudioSegment.from_wav(dirc + '/' + 'audio.wav')
    # 'Video LK' recordings use a less strict silence threshold.
    if (base == 'Video LK'):
        chunks = split_on_silence(sound, min_silence_len=200, silence_thresh=-40)
    else:
        chunks = split_on_silence(sound, min_silence_len=200, silence_thresh=-50)
    for i, chunk in enumerate(chunks):
        chunk.export(dirc + '/' + 'chunk{0}.wav'.format(i), format="wav")
    time.sleep(2)
    samplerate, data = wavfile.read(dirc + '/' + 'audio.wav')
    os.remove(dirc + '/' + 'audio.wav')
    return data
def slice_cut_silence(audio_path_str):
    """Split a wav on silence (re-splitting any chunk longer than 60 s)
    and export each piece as <stem>_silence<i>.wav next to the input.

    Returns:
        list of exported file paths (as str).
    """
    # Load your audio.
    song = AudioSegment.from_wav(audio_path_str)
    audio_path = Path(audio_path_str)
    dir = audio_path.parent  # NOTE: shadows the builtin `dir`
    stem = audio_path.stem
    print("song dBFS: {}".format(song.dBFS))
    # Split track where the silence is 2 seconds or more and get chunks using
    # the imported function.
    chunks = split_on_silence(
        # Use the loaded audio.
        song,
        # Specify that a silent chunk must be at least 1 seconds or 1000 ms long.
        min_silence_len=400,
        # Consider a chunk silent if it's quieter than (the max. amplitude of track - 15) dBFS.
        silence_thresh=song.dBFS - 15,
        keep_silence=400)
    print(len(chunks))
    audio_slices = []
    # setting minimum length of each chunk to 10 seconds
    min_length = 3 * 1000  # NOTE(review): computed but never used
    output_chunks = []
    for chunk in chunks:
        # Chunks longer than 60 s get a finer second-pass split.
        if len(chunk) > 60 * 1000:
            sub_chunks = split_on_silence(chunk,
                                          min_silence_len=200,
                                          keep_silence=100,
                                          silence_thresh=song.dBFS - 15)
            print(len(sub_chunks))
            output_chunks.extend(sub_chunks)
        else:
            # if the last output chunk is longer than the target length,
            # we can start a new one
            output_chunks.append(chunk)
    # Process each chunk with your parameters
    for i, chunk in enumerate(output_chunks):
        # Create a silence chunk that's 0.5 seconds (or 500 ms) long for padding.
        # set frame rate as the original track
        # silence_chunk = AudioSegment.silent(duration=200, frame_rate=song.frame_rate)
        # Add the padding chunk to beginning and end of the entire chunk.
        # audio_chunk = silence_chunk + chunk + silence_chunk
        # Normalize the entire chunk. To amplify the audio
        # normalized_chunk = match_target_amplitude(chunk, -20.0)
        # Export the audio chunk
        out_path = dir / "{}_silence{}.wav".format(stem, i)
        # print("Exporting {} dBFS {}.".format(out_path, normalized_chunk.dBFS))
        chunk.export(out_path, format="wav")
        audio_slices.append(str(out_path))
    return audio_slices
def audio_split_by_silence(self, pathname, j):
    """Split a wav into <=10 s silence-delimited chunks and export them.

    Chunks longer than 10 s are re-split once with the same parameters;
    pieces still longer than 10 s are silently discarded.

    Args:
        pathname: input wav path (mixed down to mono).
        j: identifier used as the exported file-name prefix.
    """
    total = 0
    song = AudioSegment.from_wav(pathname).set_channels(1)
    chunks = split_on_silence(
        song,
        # split on silences longer than the configured length
        min_silence_len=self.min_silent,
        # anything under -16 dBFS is considered silence
        silence_thresh=song.dBFS - 16,
        keep_silence=False)
    base = os.path.basename(pathname)
    basefilename = os.path.splitext(base)[0]
    i = 0
    for chunk in tqdm(chunks):
        if math.ceil(chunk.duration_seconds) <= 10:
            total += chunk.duration_seconds
            FileName = '{3}/{0}_{1:0>3}_{2}.wav'.format(
                j, i, str(math.ceil(chunk.duration_seconds)), self.outpathname)
            chunk.export(FileName, format="wav")
            i += 1
        else:
            # Too long: re-split this chunk on silence once more.
            # BUGFIX: the inner loop used to rebind `chunks`/`chunk`,
            # shadowing the outer iteration variables; use fresh names.
            song1 = AudioSegment.from_mono_audiosegments(chunk)
            sub_chunks = split_on_silence(
                song1,
                min_silence_len=self.min_silent,
                silence_thresh=song.dBFS - 16,
                keep_silence=False)
            for sub_chunk in sub_chunks:
                if math.ceil(sub_chunk.duration_seconds) <= 10:
                    total += sub_chunk.duration_seconds
                    FileName = '{3}/{0}_{1:0>3}_{2}.wav'.format(
                        j, i, str(math.ceil(sub_chunk.duration_seconds)), self.outpathname)
                    sub_chunk.export(FileName, format="wav")
                    i += 1
    print("Total duration we extract:%s" % time.strftime('%H:%M:%S', time.gmtime(total)))
    print("Total duration of file:%s" % time.strftime(
        '%H:%M:%S', time.gmtime(round(song.duration_seconds, 2))))
def contact(folder): file_path = './input/{0}/{0}'.format(folder) # 处理excel file = docx.Document(file_path + ".docx") sentences = [] for para in file.paragraphs: sen = para.text.split('.')[0].split('?')[0].split('!')[0].strip() sentences.append(sen) #处理音频 woman_sound = AudioSegment.from_file(file_path + '_女.mp3', format="mp3") woman_chunks = split_on_silence(woman_sound, min_silence_len=1000, silence_thresh=-55) # exportChunks(woman_chunks) man_sound = AudioSegment.from_file(file_path + '_男.mp3', format="mp3") man_chunks = split_on_silence(man_sound, min_silence_len=500, silence_thresh=-55) # exportChunks(man_chunks) print("word中共{0}个句子".format(len(sentences))) print("女生 音频中共划分出{0}个音频".format(len(woman_chunks))) # print("女生慢 音频中共划分出{0}个音频".format(len(woman_slow_chunks))) print("男生 音频中共划分出{0}个音频".format(len(man_chunks))) # 开始输出 count = 0 for i in range(1, len(sentences) + 1): sentence = sentences[-i] sentence = processSentence(sentence) path = "./output/{0}/{1}.mp3".format(folder, sentence) if os.path.exists(path): print("{0}已经存在".format(sentence)) continue chinese_chunk = increaseDB(woman_chunks[-i * 3]) man_chunk = increaseDB(man_chunks[-i]) woman_slow_chunk = increaseDB(woman_chunks[-i * 3 + 2]) contacted_chunk = silence_sound * 2 + chinese_chunk + silence_sound * 3 + man_chunk + silence_sound * 3 + woman_slow_chunk + silence_sound * 2 contacted_chunk.export(path, format="mp3") count = count + 1 print("此次共生成{0}个单词音频".format(count))
def test_final(test_files_folder):
    """Run digit recognition over every test wav in test_files_folder.

    Each file is expected to contain exactly 10 spoken digits; they are
    separated on silence, each digit is scored against every model in
    the module-level num_models list, and the winning labels are joined
    into a phone-number string.

    Returns:
        (filenames_all, phone_numbers_all): parallel lists of file names
        and detected number strings.

    Raises:
        ValueError: when a file does not split into exactly 10 utterances
        even after retrying with a stricter threshold.
    """
    filenames_all = []
    phone_numbers_all = []
    # loop for each of the test file (sorted numerically by stem)
    for filename in sorted(os.listdir(test_files_folder), key=lambda x: int(os.path.splitext(x)[0])):
        # only if it is wave file
        if filename.endswith(".wav") and "_" not in filename:
            # full file path
            current_file = os.path.join(test_files_folder, filename)
            # print("Looping for file ", current_file)
            # In this file, take out all 10 utterances
            all_utterances = asg.from_file(current_file)
            silence_len = 205  # (in ms) minimum length of a silence to be used for a split
            thresh = -70  # (in dBFS) anything quieter than this will be considered silence. default=-16
            separate_utterances = split_on_silence(all_utterances, min_silence_len=silence_len, silence_thresh=thresh)
            # fail-safe: retry with a stricter threshold if we did not get 10
            if len(separate_utterances) != 10:
                thresh = -55  # (in dBFS) anything quieter than this will be considered silence. default=-16
                separate_utterances = split_on_silence(
                    all_utterances, min_silence_len=silence_len, silence_thresh=thresh)
                if len(separate_utterances) != 10:
                    raise ValueError("Error in ", filename)
            # string to hold the detected phone number
            output = ""
            # for each of the numeral in this file
            for numeral in separate_utterances:
                # apply all 10 models to this numeral
                numeral_path = "test_attempt" + filename[:-4] + "_delete.wav"
                numeral.export(numeral_path, format="wav")
                max_score = -float("inf")
                output_label = None
                for item in num_models:
                    trained_model, label = item
                    current_score = score_one_word(trained_model, numeral_path)
                    if current_score > max_score:
                        max_score = current_score
                        output_label = label
                output += output_label
                os.remove(numeral_path)
            filenames_all.append(filename)
            phone_numbers_all.append(output)
    return filenames_all, phone_numbers_all
def ReadAudioFile(fname, p=PATH):
    """Read an audio file, normalising its position in time.

    The recording is padded with 500 ms of silence on both sides and
    then trimmed back to its first non-silent chunk, so every file in
    the system starts at a comparable offset.

    Returns:
        (rate, data) as produced by scipy's wavfile.read.
    """
    from pydub import AudioSegment
    from pydub.silence import split_on_silence
    from scipy.io import wavfile

    pad = AudioSegment.silent(500)
    padded = pad + AudioSegment.from_wav(p + '/' + fname) + pad
    first_chunk = split_on_silence(padded, min_silence_len=175, silence_thresh=-50)[0]
    first_chunk.export(fname, format='wav')
    converted = ConvertSampleRate(fname)
    rate, data = wavfile.read(converted)
    ### CLEAN-UP ###
    os.remove(fname)
    if converted != fname:
        # A new file was also created. Delete it too.
        os.remove(converted)
    return rate, data
def segmentate(audios_path):
    """Split every animal wav under audios_path on silence and export
    chunks of at least 0.5 s to splitAudio/<animal>/.

    Directory entries containing a space are skipped (word sets, not
    animals), as are non-wav files and files pydub fails to split.
    """
    classes = listdir(audios_path)
    for i, animal in enumerate(classes):
        if animal.find(' ') != -1:
            continue  # not an animal but a set of words
        current_path = path.join(audios_path, animal)
        sounds = listdir(current_path)
        for sound in sounds:
            if not sound.lower().endswith('.wav'):
                continue
            AUDIO_FILE = path.join(current_path, sound)
            sound_file = AudioSegment.from_wav(AUDIO_FILE)
            try:
                audio_chunks = split_on_silence(
                    sound_file,
                    min_silence_len=2,
                    # consider it silent if quieter than -16 dBFS
                    silence_thresh=min(-16, sound_file.dBFS*7),
                    keep_silence=0
                )
            # BUGFIX: a bare `except:` also swallowed SystemExit and
            # KeyboardInterrupt; catch only ordinary errors.
            except Exception:
                continue
            for j, chunk in enumerate(audio_chunks):
                out_file = "splitAudio/{1}/{2}chunk{0}.wav".format(j, animal, sound[:-4])
                if chunk.duration_seconds < 0.5:
                    continue
                if not exists("splitAudio/{0}".format(animal)):
                    makedirs("splitAudio/{0}".format(animal))
                chunk.export(out_file, format="wav")
def xcut(wavfn, outfd):
    # Cut one wav into silence-delimited chunks and export the interior
    # ones (head and tail chunks are dropped) to outfd at 16 kHz mono.
    # Python 2 syntax (print statements); relies on module constants
    # LENGTH_MIN, CUT_START, CUT_TAIL, EFFECT_MIN, EFFECT_MAX.
    cnt = 0
    sound = AudioSegment.from_wav(wavfn)
    print len(sound)
    length = len(sound)/1000.0 #length in second
    print "length=",length
    if length < LENGTH_MIN:
        print "too short, drop"
        return
    # Trim the configured lead-in/lead-out before splitting.
    sound = sound[CUT_START:-CUT_TAIL]
    print len(sound)
    chunks = split_on_silence(sound, min_silence_len=500, silence_thresh=-36, keep_silence=400)#silence >=500ms and silence_dBFS<-36dBFS
    print "chunks num =", len(chunks)
    if len(chunks) <= 2:
        print "no un-head-tail chunk, drop"
        return
    chunks = chunks[1:-1]
    for ck in chunks:
        nfn = outfd + "/" + (wavfn.split('/')[-1][:-4]) + ('_%d.wav'%cnt)
        nck = ck.set_frame_rate(16000)
        nck = nck.set_channels(1)
        # Keep only chunks within the effective length window.
        if len(nck) < EFFECT_MIN:
            pass
        elif len(nck) > EFFECT_MAX:
            pass
        else:
            nck.export(nfn, format="wav")
            cnt = cnt + 1
            print "chunk name= %s length= %d" % (nfn, len(nck))
def get_large_audio_transcription(path):
    """
    Splitting the large audio file into chunks
    and apply speech recognition on each of these chunks
    """
    sound = AudioSegment.from_wav(path)
    chunks = split_on_silence(
        sound,
        min_silence_len=500,
        # quieter than 14 dB below the file's average loudness is silence
        silence_thresh=sound.dBFS - 14,
        keep_silence=500,
    )
    folder_name = "audio-chunks"
    if not os.path.isdir(folder_name):
        os.mkdir(folder_name)
    whole_text = ""
    print('[+] Processing Audio')
    for i, audio_chunk in enumerate(chunks, start=1):
        chunk_filename = os.path.join(folder_name, f"chunk{i}.wav")
        audio_chunk.export(chunk_filename, format="wav")
        with sr.AudioFile(chunk_filename) as source:
            audio_listened = r.record(source)
            try:
                text = r.recognize_google(audio_listened)
            except sr.UnknownValueError as e:
                print("Error:", str(e))
            else:
                text = f"{text.capitalize()}. "
                print(chunk_filename, ":", text)
                whole_text += text
    # BUGFIX: os.rmdir() fails on a non-empty directory, so the cleanup
    # always raised once chunks were exported; remove the tree instead.
    import shutil
    shutil.rmtree(folder_name)
    return whole_text
def segmentation(min_silence_len,silence_thresh,chunk_silent): song = AudioSegment.from_file('test212.wav',format="wav") #print(song.frame_rate) shutil.rmtree("audio_chunks") chunk_dur=[] #print(4) chunks,silent_ranges= split_on_silence(song,min_silence_len = min_silence_len,silence_thresh = silence_thresh) #print(6) try: os.mkdir('audio_chunks') except(FileExistsError): pass i=0 chunk_silent = AudioSegment.silent(duration = 100) for chunk in chunks: audio_chunk = chunk_silent + chunk + chunk_silent audio_chunk.export("./audio_chunks/chunk{0}.wav".format(i), bitrate ='64k', format ="wav") i+=1 filename = './audio_chunks/chunk'+str(i-1)+'.wav' with cl.closing(wave.open(filename,'r')) as f: rate1 = f.getframerate() chunk_rate = rate1 nf_chunks=i return chunk_rate,nf_chunks,silent_ranges
def SplitAudio(fname):
    """
    Splits given audio file on silence and exports the separated chunks.

    Each chunk is written to Words/word<i>.wav, low-pass filtered with a
    4th-order Butterworth filter, and collected into a DataFrame with
    'rate' and 'data' columns.
    """
    from pydub import AudioSegment
    from pydub.silence import split_on_silence
    from scipy.io import wavfile
    from scipy import signal
    silence = AudioSegment.silent(500)
    audio = AudioSegment.from_wav(fname)
    # Pad both ends so the first/last words are not clipped by the split.
    audio = silence + audio + silence
    chunks = split_on_silence(audio, min_silence_len=175, silence_thresh=-50)
    wavs = defaultdict(list)
    # 4th-order low-pass Butterworth filter, normalized cutoff 0.1.
    b, a = signal.butter(4, 0.1, analog=False)
    for i, c in enumerate(chunks):
        fname = 'Words/word{}.wav'.format(i)  # NOTE: rebinds the parameter
        c.export(fname, format='wav')
        rate, data = wavfile.read(fname)
        if len(data.shape) == 2:
            # Two channels were found (stereo), but we need only one
            data = data[:, 0]
        data = signal.filtfilt(b, a, data)  # Filter signal
        wavs['rate'].append(rate)
        wavs['data'].append(data)
    return pd.DataFrame(wavs)
def crop2chunk(filename):
    """Split an mp3 on silence and export the 2–10 s chunks to
    ./static/segmentFile (which is wiped first)."""
    # clear folder firstly
    if os.path.exists('./static/segmentFile') == False:
        os.mkdir('./static/segmentFile')
    else:
        shutil.rmtree('./static/segmentFile')
        os.mkdir('./static/segmentFile')
    sound = AudioSegment.from_mp3(filename)
    loudness = sound.dBFS
    chunks = split_on_silence(
        sound,
        #must be silent for at least 430 ms
        min_silence_len=430,
        #consider it silent if quieter than -45 dBFS
        silence_thresh=-45,
        keep_silence=400)
    # Drop recordings shorter than 2 s or longer than 10 s
    # (iterate in reverse so pop() does not shift pending indices).
    for i in list(range(len(chunks)))[::-1]:
        if len(chunks[i]) <= 2000 or len(chunks[i]) >= 10000:
            chunks.pop(i)
    print('取有效分段(大于2s小于10s):', len(chunks))
    '''
    for x in range(0,int(len(sound)/1000)):
        print(x,sound[x*1000:(x+1)*1000].max_dBFS)
    '''
    for i, chunk in enumerate(chunks):
        chunk.export("./static/segmentFile/chunk{0}.wav".format(i), format="wav")
def split_aud_by_dir(src_dir, out_dir):
    """Split every mp3 in src_dir on silence, exporting chunks to out_dir.

    When out_dir already contains output, processing resumes from the
    last source file that produced output (matched by its first eight
    characters). Returns 0 on completion.
    """
    l = os.listdir(src_dir)
    m = os.listdir(out_dir)
    if len(m) != 0:
        # Resume: skip source files that were already processed.
        m = [i[:8] for i in m]
        l = l[l.index(m[-1] + ".mp3"):]
    for j in l:
        try:
            sound_file = AudioSegment.from_mp3(src_dir + j)
            # sound_file = librosa.core.amplitude_to_db(sound_file0)
            audio_chunks = split_on_silence(
                sound_file,
                # must be silent for at least 25 ms
                min_silence_len=25,
                # consider it silent if quieter than -90 dBFS
                silence_thresh=-90)
            for i, chunk in enumerate(audio_chunks):
                out_file = out_dir + j.replace(".mp3", "") + "_{0}.mp3".format(i)
                # print "exporting", out_file
                chunk.export(out_file, format="mp3")
        # BUGFIX: bare `except:` also caught SystemExit/KeyboardInterrupt.
        except Exception:
            print("error with " + j)
    return 0
# NOTE(review): this block is corrupted — it mixes a Django TemplateView
# class header, a fragment of a docstring, and statements from a
# transcription/translation routine whose enclosing def is missing.
# Left byte-for-byte as found; it does not compile and needs manual
# reconstruction from the original source.
class Home(TemplateView): template_name = 'home.html' Splitting the large audio file into chunks and apply speech recognition on each of these chunks """ # open the audio file using pydub sound = AudioSegment.from_wav(path) # split audio sound where silence is 700 miliseconds or more and get chunks chunks = split_on_silence(sound, # experiment with this value for your target audio file min_silence_len = 500, # adjust this per requirement silence_thresh = sound.dBFS-14, # keep the silence for 1 second, adjustable as well keep_silence=500, ) folder_name = "audio-chunks" # create a directory to store the audio chunks if not os.path.isdir(folder_name): recognize_google(audio_listened,language="fr-FR") except sr.UnknownValueError as e: print("Error:", str(e)) else: #text = f"{text.capitalize()}. " print(chunk_filename, ":", text) text+="." inter_text = translator.translate(text,dest = 'hi') print(inter_text.text) hindi_text+=inter_text.text whole_text += text
def slice_clip(inFile, outDir, minSilence, threshold, verbose):
    """Split each mono channel of an mp3 on silence and export normalized
    wav slices named c<channel>s<slice>.wav into outDir.

    Args:
        inFile: input mp3 path.
        outDir: output directory, with or without a trailing slash.
        minSilence: minimum silence length in ms for a split.
        threshold: silence threshold in dBFS.
        verbose: when truthy, log each exported slice and its length.
    """
    print("Loading input file...")
    sound = AudioSegment.from_mp3(inFile)
    print("Setting Channels to mono...")
    channels = sound.split_to_mono()
    # Normalize the output directory once, instead of re-checking (and
    # re-appending the slash) inside the per-slice loop as before.
    if outDir[-1] != "/":
        outDir += "/"
    for c, channel in enumerate(channels):
        print("Processing Channel {}".format(c))
        print("Creating Slices...")
        chunks = split_on_silence(channel, min_silence_len=minSilence, silence_thresh=threshold)
        print("Saving slices to disk...")
        for i, chunk in enumerate(chunks):
            # Normalize the entire chunk.
            normalized_chunk = SU.match_target_amplitude(chunk, -20.0)
            fileName = "c{}s{}.wav".format(c, i)
            # Export the audio chunk with new bitrate.
            if verbose:
                print("Exporting {}...".format(fileName))
                print("\tChunk Size: {}".format(SU.ms2hms(len(chunk))))
            normalized_chunk.export(outDir + fileName, format="wav")
def extractWordFiles(filename, filedir, subdir, purge=False):
    """Split a wav into word chunks and export them under filedir+subdir.

    When purge is True, existing files in the target directory are
    deleted first; otherwise new chunks are numbered after the existing
    ones so nothing is overwritten.
    """
    target = filedir + subdir
    words = split_on_silence(
        AudioSegment.from_wav(filename),
        # must be silent for at least 150 ms
        min_silence_len=150,
        # consider it silent if quieter than -50 dBFS
        silence_thresh=-50)
    if not os.path.exists(target):
        os.makedirs(target)
    existing = glob.glob(target + '*')
    filecount = 0
    if purge:
        for stale in existing:
            os.remove(stale)
    else:
        filecount = len(existing)
    for idx, word in enumerate(words):
        out_file = target + "chunk{0}.wav".format(idx + filecount)
        print("exporting: ", out_file)
        word.export(out_file, format="wav")
def split_slience(audio_file):
    """Split an AudioSegment on silences of 1 s or more (30 dB below its
    average loudness), keeping the silence attached to the chunks."""
    return split_on_silence(
        audio_file,
        min_silence_len=1000,
        silence_thresh=audio_file.dBFS - 30,
        keep_silence=True)
def cutOnBestSilence(bestSilence, file, targetFolder):
    """Normalize a wav to -20 dBFS, split it on silences of at least
    bestSilence ms (threshold -45 dBFS) and export the chunks as
    targetFolder/chunkNNNNNN.wav.

    Chunks beyond index 99999 are skipped, matching the original
    fixed-width naming scheme.
    """
    song = AudioSegment.from_wav(file)
    normalized_sound = match_target_amplitude(song, -20.0)
    chunks = split_on_silence(normalized_sound,
                              min_silence_len=bestSilence,
                              silence_thresh=-45)
    # print(str(len(chunks)) + " Chunks")
    for i, chunk in enumerate(chunks):
        # The five cascaded range checks amounted to zero-padding the
        # index to six digits; zfill expresses that directly.
        if i <= 99999:
            chunk.export(targetFolder + '/chunk' + str(i).zfill(6) + '.wav', format="wav")
# cutOnBestSilence(222, "test.wav", r"C:\Pro\Py\MySpeechRecognizer\toAnalyse\SplittedFiles\out")
def cut_audio_chunks(loadpath, savepath, min_silence_len, silence_thresh, audioformat='mp3'):
    '''Function to split raw audio into chunks corresponding to isolated events
    takes a specified loading path, a saving path, the minimum silence time
    length in ms, and the threshold for silence in dB.'''
    if audioformat == 'mp3':
        sound_file = AudioSegment.from_mp3(loadpath)
    elif audioformat == 'wav':
        sound_file = AudioSegment.from_wav(loadpath)
    # Make sure the directory exists to store the segmented audio
    # (the check was duplicated verbatim in the original).
    if not os.path.exists(savepath):
        os.makedirs(savepath)
    # split audio
    audio_chunks = split_on_silence(
        sound_file,
        # must be silent for at least min_silence_len ms
        min_silence_len=min_silence_len,
        # consider it silent if quieter than silence_thresh
        silence_thresh=silence_thresh)
    # store the audio chunks
    for i, chunk in enumerate(audio_chunks):
        # BUGFIX: `savepath + i` raised TypeError (str + int); the index
        # must be converted to str before concatenation.
        out_file = savepath + str(i) + ".wav"
        print("exporting ", out_file)
        chunk.export(out_file, format="wav")
def gapProc(self):
    """Split self.fname on silence and return the list of chunk lengths (ms).

    NOTE(review): silence_thresh=8 dBFS is above digital full scale, so
    effectively every sample counts as silence — confirm intent.
    The mean/stdev computed below are never returned or stored.
    """
    #def gapProc(self , lowest):
    sound_file = AudioSegment.from_wav(self.fname)
    audio_chunks = split_on_silence(sound_file,
                                    # must be silent for at least 1 ms
                                    min_silence_len=1,
                                    # consider it silent if quieter than 8 dBFS
                                    silence_thresh=8)
    # List made to store all of the silence .wav chunks
    waveAry = []
    # List made to store the lengths of the silence chunks
    chunkLengthArray = []
    for i, chunk in enumerate(audio_chunks):
        out_file = ".//splitAudio//chunk{0}.wav".format(i)  # NOTE(review): never used
        #waveAry.append(chunk)
        chunkLengthArray.append(len(chunk))
    #If there were no silences, set the mean variable to 0
    if len(chunkLengthArray) == 0:
        avgChunkLength = 0
        stdevChunkLength = 0
    # If thee is exactly 1 silence, set the stdev to 0
    # and the average chunk length to the value of the only silence
    elif len(chunkLengthArray) == 1:
        stdevChunkLength = 0
        avgChunkLength = chunkLengthArray[0]
    # Otherwise calculate the mean gap and stdev of the gaps and store
    # them in variables
    else:
        avgChunkLength = mean(chunkLengthArray)
        stdevChunkLength = stdev(chunkLengthArray)
    # Return the array containing the lengths of the gaps
    return(chunkLengthArray)
def btnOpen_Click(self): path, _ = QFileDialog.getOpenFileName(None, "Open Audio files", "","Sound Files (*.wav)") # audio = path.replace("/","\\") # path='audiototext.wav' if path != "": self.lbl_audio.setText("Converting audio file to text...") r = sr.Recognizer() sound = AudioSegment.from_wav(path) chunks = split_on_silence(sound, min_silence_len = 500, silence_thresh = sound.dBFS-14, keep_silence=500, ) folder_name = "audio-data" if not os.path.isdir(folder_name): os.mkdir(folder_name) whole_text = "" for i, audio_chunk in enumerate(chunks, start=1): chunk_filename = os.path.join(folder_name, f"audio{i}.wav") audio_chunk.export(chunk_filename, format="wav") with sr.AudioFile(chunk_filename) as source: audio_listened = r.record(source) try: text = r.recognize_google(audio_listened,language="vi-VI") except Exception as ex: self.txtData.setText("False Loading!!!\n" + str(ex) + "\n" + path + "\n") else: text = f"{text.capitalize()}. " # print(chunk_filename, ":", text) print(text) whole_text += text self.txtData.setText(whole_text) shutil.rmtree(folder_name) self.lbl_audio.setText("Successfuly!") self.btnSaveAs.setEnabled(True)
def split(
    file,
    min_silence_len=500,
    silence_thresh=-20,
    max_len=7,
    keep_silence=1000,
):
    """Split an mp3 on silence and regroup the chunks into segments of
    roughly max_len seconds.

    Returns:
        (audios, audio): the regrouped segments and the full recording.
    """
    audio = AudioSegment.from_mp3(file)
    audio_chunks = split_on_silence(
        audio,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
        keep_silence=keep_silence,
    )
    audios = []
    pending, pending_len = [], 0
    for piece in audio_chunks:
        # Flush the pending group before it would exceed max_len seconds.
        if pending and pending_len + piece.duration_seconds >= max_len:
            audios.append(sum(pending))
            pending, pending_len = [], 0
        pending.append(piece)
        pending_len += piece.duration_seconds
    if pending:
        audios.append(sum(pending))
    return audios, audio
def split_and_recognize(binary):
    """
    Performs automated speech recognition on input WAV audio bytes.
    Uses CMUSphinx for ASR, see https://cmusphinx.github.io/.
    Requires speech_recognition and pocketsphix python packages.
    Requires swig to be installed. Also possibly libpulse-dev libasound2-dev.
    """
    r = sr.Recognizer()
    bigWav = AudioSegment(binary)
    chunks = split_on_silence(bigWav,
                              min_silence_len=500,
                              silence_thresh=bigWav.dBFS-14,
                              keep_silence=500)
    whole_text = ""
    num = len(chunks)
    for i, audio_chunk in enumerate(chunks):
        # Export the chunk to an in-memory wav for the recognizer.
        chunk_audio = audio_chunk.export(io.BytesIO(), format="wav")
        chunk_audio.seek(0)
        with sr.AudioFile(chunk_audio) as source:
            audio = r.record(source)
        try:
            text = r.recognize_sphinx(audio)
            whole_text += f"{text} "
        # BUGFIX: a bare `except:` swallowed even KeyboardInterrupt;
        # catch Exception so genuine interrupts still propagate.
        except Exception:
            msg = "<inaudible>"
            whole_text += msg
        chunk_audio.close()
    return whole_text
def text_from_large_clip(audio_file):
    """Transcribe a long wav from wav_outputs/ by splitting it on 1 s
    silences and running Google Speech Recognition on each chunk.

    Returns the concatenated transcript; chunks that fail recognition
    are skipped with a console message.
    """
    r = sr.Recognizer()
    # BUGFIX: the first assignment (a getcwd()-absolute path) was dead —
    # it was immediately overwritten; keep only the effective one.
    audio_path = os.path.join("wav_outputs", audio_file)
    audio = AudioSegment.from_wav(audio_path)
    # in order to split the audio we have to do in chunks (these chunks
    # will be split by the silence found for at least 1 second)
    audio_chunks = split_on_silence(audio,
                                    min_silence_len=1000,  # 1 second is also 1000 milliseconds
                                    silence_thresh=audio.dBFS - 16  # silence: under 16 dBFS below the average level
                                    )
    # speech_recognition listens to files, so save the chunks to a folder
    save_chunks_to_folder(audio_chunks)
    audio_files = os.listdir(os.getcwd() + r"/audio_chunks")
    chunks_folder = os.path.join(os.getcwd() + r"/audio_chunks")
    translated_text = ""
    for chunk_file in audio_files:  # renamed from the confusing `audio_fi1e`
        with sr.AudioFile(os.path.join(chunks_folder, chunk_file)) as source:
            audio_data = r.record(source)
        try:
            audio_chunk_text = r.recognize_google(audio_data)
            audio_chunk_text = audio_chunk_text.capitalize() + "."
            translated_text += audio_chunk_text
        # BUGFIX: bare `except:` replaced so KeyboardInterrupt still works.
        except Exception:
            print("No Text Detected in the Audio!")
    return translated_text
def splitWavFileAndStore(filename, minsillen=50, silthresh=-60):
    """Split a wav on silence, filter chunks by word size, and export the
    survivors using the module-level DEFAULT_CHUNKNAME pattern.

    Returns the total number of chunks produced by the split (i + 1),
    not the number actually exported.

    NOTE(review): the rejectedOffset increment is commented out, so
    rejected chunks currently leave gaps in the output numbering.
    Relies on module-level checkChunk, minimumWordSize, maximumWordSize,
    DEFAULT_CHUNKNAME and fileOffset.
    """
    # minsillen= 100, silthresh = -60
    line = AudioSegment.from_wav(filename)
    audio_chunks = split_on_silence(
        line, min_silence_len=minsillen, silence_thresh=silthresh)
    # isolation of words is done here
    rejectedOffset = 0
    for i, chunk in enumerate(audio_chunks):  # audio_chunks is a python list
        if (checkChunk(chunk, i, minimumWordSize, maximumWordSize)):
            # rejectedOffset = rejectedOffset + 1
            continue
        out_file = DEFAULT_CHUNKNAME.format(i - rejectedOffset + fileOffset)
        print("size of chunk{}: {} ".format(i - rejectedOffset + fileOffset, len(chunk)))
        print("exporting", out_file)
        chunk.export(out_file, format="wav")
    print("done exporting...")
    temp = i
    print("Total number of files:", temp + 1)
    return temp + 1
def get_large_audio_transcription(path):
    """Transcribe a long wav by splitting it on silence and running
    Google Speech Recognition (en-US) over each exported chunk.

    Chunk files are written to ./audio-chunks and left in place.
    """
    recording = AudioSegment.from_wav(path)
    pieces = split_on_silence(
        recording,
        min_silence_len=500,
        silence_thresh=recording.dBFS - 14,
        keep_silence=500,
    )
    folder_name = "audio-chunks"
    if not os.path.isdir(folder_name):
        os.mkdir(folder_name)
    whole_text = ""
    for index, piece in enumerate(pieces, start=1):
        chunk_filename = os.path.join(folder_name, f"chunk{index}.wav")
        piece.export(chunk_filename, format="wav")
        with sr.AudioFile(chunk_filename) as source:
            heard = r.record(source)
            try:
                text = r.recognize_google(heard, language="en-US")
            except sr.UnknownValueError:
                pass  # unintelligible chunk: skip it
            else:
                whole_text += f"{text.capitalize()}. "
    return whole_text
def split(filepath, save_path, time_length):
    """Split a wav on silence, then merge adjacent chunks until each
    output piece reaches time_length seconds; export them as <i>.wav
    into save_path (which is recreated from scratch).

    Returns:
        the number of exported pieces.
    """
    recording = AudioSegment.from_wav(filepath)
    dBFS = recording.dBFS
    pieces = split_on_silence(
        recording,
        min_silence_len=500,
        silence_thresh=dBFS - 16,
        keep_silence=250  # optional
    )
    # Re-group: grow the current output chunk until it hits the target.
    target_length = time_length * 1000
    merged = [pieces[0]]
    for piece in pieces[1:]:
        if len(merged[-1]) < target_length:
            merged[-1] += piece
        else:
            merged.append(piece)
    # Attention! save_path is wiped and recreated.
    if os.path.exists(save_path):
        shutil.rmtree(save_path)
    os.mkdir(save_path)
    for index, piece in enumerate(merged):
        piece.export(os.path.join(save_path, "{0}.wav".format(index)), format="wav")
    return len(merged)
def split_silence_hm(audio_dir, split_silence_dir, sum_dir): ''' Args : audio_dir : 여러 오디오('wav')가 있는 파일경로 split_silence_dir : 묵음 부분 마다 자른 오디오 파일을 저장할 파일 경로 sum_dir : 묵음 부분 마다 자른 오디오 파일을 합쳐서 저장할 파일경로 ''' # audio_dir에 있는 모든 파일을 가져온다. audio_dir = librosa.util.find_files(audio_dir, ext=['wav']) # 폴더 생성하기 def createFolder(directory): try: if not os.path.exists(directory): os.makedirs(directory) except OSError: print('Error: Creating directory. ' + directory) # audio_dir에 있는 파일을 하나 씩 불러온다. for path in audio_dir: print("묵음을 없앨 파일 ", path) # 오디오 불러오기 sound_file = AudioSegment.from_wav(path) # 파일 이름만 가져오기 _, w_id = os.path.split(path) w_id = w_id[:-4] # 가장 최소의 dbfs가 무엇인지 # dbfs : 아날로그 db과는 다른 디지털에서의 db 단위, 0일 때가 최고 높은 레벨 dbfs = sound_file.dBFS # silence 부분 마다 자른다. audio_chunks = split_on_silence( sound_file, min_silence_len=200, silence_thresh=dbfs - 16, # keep_silence= 100 keep_silence=0) # 파일 명으로 새로운 폴더를 생성한다. createFolder(split_silence_dir + w_id) # silence 부분 마다 자른 거 wav로 저장 for i, chunk in enumerate(audio_chunks): out_file = split_silence_dir + w_id + "\\" + w_id + f"_{i}.wav" # print ("exporting", out_file) chunk.export(out_file, format="wav") # 묵음을 기준으로 자른 오디오 파일을 하나의 파일로 합친다. path_wav = split_silence_dir + w_id + "\\" print("묵음으로 잘린 파일이 저장된 곳", path_wav) path_out = sum_dir + w_id + '_silence_total.wav' print("오디오 합친 파일 경로 ", path_out) voice_sum(form='wav', audio_dir=path_wav, save_dir=None, out_dir=path_out)
def split_words(folder):
    """Split a word-list mp3 on silence and export one mp3 per word,
    named after the words listed in the matching Excel sheet."""
    file_path = './input/{0}/{0}'.format(folder)
    sound = AudioSegment.from_file(file_path + '.mp3', format="mp3")
    chunks = split_on_silence(sound, min_silence_len=1000, silence_thresh=-50)
    loc = (file_path + ".xls")
    wb = xlrd.open_workbook(loc)
    sheet = wb.sheet_by_index(0)
    print("音频中共划分出{0}个单音频".format(len(chunks)))
    print("excel中共{0}个单词".format(sheet.nrows - 1))
    # Check whether any word appears twice in the sheet.
    word_exist = {}
    for i in range(1, sheet.nrows):
        word = sheet.cell_value(i, 2)
        if word_exist.get(word):
            print("{0}重复了".format(word))
        else:
            word_exist.setdefault(word, True)
    # Export: walk the sheet from the bottom, pairing row -i with chunk -i.
    count = 0
    for i in range(1, sheet.nrows):
        word = sheet.cell_value(sheet.nrows - i, 2).strip()
        path = "./output/{0}/{1}.mp3".format(folder, word)
        if os.path.exists(path):
            print("{0}已经存在".format(word))
        else:
            chunk = addSilence(increaseDB(chunks[-i]))
            chunk.export(path, format="mp3")
            # print(chunk.max)
            count = count + 1
    print("此次共生成{0}个单词音频".format(count))
def split_silence_hm(audio_dir, split_silence_dir, sum_dir):
    """Split every wav under audio_dir on silence, save the chunks under
    split_silence_dir/<name>/, then re-join them into
    sum_dir/<name>_silence_total.wav via voice_sum.

    Args:
        audio_dir: directory containing the input wav files.
        split_silence_dir: root directory for the per-file chunk folders.
        sum_dir: directory receiving the re-joined audio.
    """
    audio_dir = librosa.util.find_files(audio_dir, ext=['wav'])  # every wav under audio_dir
    for path in audio_dir:  # process the files one at a time
        sound_file = AudioSegment.from_wav(path)
        _, w_id = os.path.split(path)
        w_id = w_id[:-4]
        dbfs = sound_file.dBFS
        audio_chunks = split_on_silence(
            sound_file,  # cut at every silent stretch
            silence_thresh=dbfs - 16,  # level (dB) below which counts as silence
            min_silence_len=200,  # minimum silent duration that triggers a cut
            keep_silence=0  # how much silence to keep at the chunk edges
        )
        # BUGFIX: the chunk folder was never created, so the exports
        # below failed on a fresh run (the sibling implementation of this
        # function creates it via createFolder).
        os.makedirs(split_silence_dir + w_id, exist_ok=True)
        for i, chunk in enumerate(audio_chunks):  # save each chunk as wav
            out_file = split_silence_dir + w_id + "\\" + w_id + f"_{i}.wav"
            chunk.export(out_file, format="wav")
        path_wav = split_silence_dir + w_id + "\\"  # where the chunks were saved
        path_out = sum_dir + w_id + '_silence_total.wav'  # joined output path
        voice_sum(form='wav', audio_dir=path_wav, save_dir=None, out_dir=path_out)
def multi_split_on_silence(wav_file, output_dir, min_silence_len, silence_thresh, pass_first_time=0, seek_step=110, keep_silence=250):
    """Split a wav on silence and export numbered 16 kHz mono chunks.

    Args:
        wav_file: input wav path.
        output_dir: directory for the exported chunks.
        min_silence_len: minimum silence length in ms for a split.
        silence_thresh: silence threshold in dBFS.
        pass_first_time: skip leading chunks until this many seconds of
            audio have been consumed from the start.
        seek_step: silence-scan step in ms.
        keep_silence: silence (ms) to keep around each chunk.

    NOTE(review): the loop stops at len(chunks)-3, silently dropping the
    last three chunks — confirm this is intentional.
    """
    speech = AudioSegment.from_wav(wav_file)
    file_name = os.path.basename(wav_file)
    # print(file_name)
    chunks = split_on_silence(speech, min_silence_len=min_silence_len, silence_thresh=silence_thresh, seek_step=seek_step, keep_silence=keep_silence)
    curr_pass_time = 0.
    for i in range(0, len(chunks)-3):
        # Skip chunks until pass_first_time seconds have been consumed.
        if curr_pass_time < pass_first_time:
            curr_pass_time += len(chunks[i])/1000
            continue
        # print(len(chunks[i]), os.path.join(output_dir, file_name[:-4]+'_'+str(i+1).zfill(4)+'.wav'))
        chunks[i].export(os.path.join(output_dir, file_name[:-4]+'_'+str(i+1).zfill(4)+'.wav'), format='wav', parameters=["-ar", "16000", "-ac", "1"])
def cutbySilence(self, min_silence_len=1000, r=1, segment=None):
    """Split audio on silence, recursively re-splitting (once, with a
    relaxed threshold) any piece still longer than 20 s, then dropping
    pieces shorter than one second.

    Args:
        min_silence_len: minimum silence length in ms for a split.
        r: recursion level (1 = first pass, 2 = relaxed second pass).
        segment: AudioSegment to split; defaults to self.audio. Added
            (backward-compatibly) so the recursive call can target a
            sub-segment — previously the segment was passed where
            min_silence_len was expected and the recursion re-split
            self.audio instead of the long piece.

    Returns:
        list of AudioSegment splits.
    """
    audio = self.audio if segment is None else segment
    # Use dBFS to normalize the silence threshold across files; the
    # relaxed pass (r=2) tolerates a higher overall level.
    # BUGFIX: this computed threshold was previously discarded and a
    # hard-coded -16 passed to split_on_silence instead.
    silence_thresh = audio.dBFS - 5/r
    audio_splits = silence.split_on_silence(audio,
                                            min_silence_len=min_silence_len,
                                            keep_silence=150,
                                            silence_thresh=silence_thresh)
    # Cuts that are still too long, maybe an area of higher overall dBFS.
    long_splits = [split for split in audio_splits if math.floor(split.duration_seconds) > 20]
    if r != 2:
        for split in long_splits:
            audio_splits.remove(split)
            # Cut recursively with the relaxed threshold.
            new_splits = self.cutbySilence(min_silence_len, r=r+1, segment=split)
            for ns in new_splits:
                audio_splits.append(ns)
    # Clean the cuts of anything too short.
    audio_splits = [split for split in audio_splits if math.floor(split.duration_seconds) > .5]
    return audio_splits
def split_song(f_name, length_val, threshold_val, overwrite=True, export_all=False):
    # Split an mp3 on silence (Python 2 syntax).
    # export_all=True exports every segment as <prefix>_part_<i>.mp3;
    # otherwise only the first segment is kept (overwriting the source
    # when overwrite=True). threshold_val is negated, so callers pass a
    # positive dB value.
    f_prefix = f_name[0:-4]
    sound = AudioSegment.from_mp3(f_name)
    print '%s was loaded.' % f_name
    print 'Splitting by silence STARTED.'
    chunks = split_on_silence(sound, min_silence_len=length_val, silence_thresh=-threshold_val)
    print 'Splitting by silence FINISHED.'
    if not len(chunks)==1:
        if export_all:
            # Export every segment.
            for i, chunk in enumerate(chunks):
                chunk.export('%s_part_%i.mp3' % (f_prefix, i), format="mp3")
            print 'Segments were saved to %i mp3 files.' % len(chunks)
        else:
            # Keep only the first segment.
            if overwrite:
                chunks[0].export(f_name, format="mp3")
            else:
                chunks[0].export('%s_part_0.mp3' % f_prefix, format="mp3")
            print 'Splitting %s DONE' % f_name
    else:
        print 'No splitting was needed for %s!' % f_name
def main():
    """Build a narrated 'dream scene': TTS reading overlaid with per-noun sfx
    and an ambient bed, exported to scenes/<description>.mp3."""
    voice = "Kate"
    description = "fern_hill"
    silence_length = 145
    words_per_minute = 140  # maybe this should be slower
    # TODO: maybe make it so the ambient is same bpm as wpm
    # FIX: parenthesized prints -- valid on both Python 2 and 3, consistent
    # with the Python 3 snippets elsewhere in this file.
    print("Doing text-to-speech synthesis.")
    generate_audio(description, voice, words_per_minute)
    print("Getting nouns")
    with open("descriptions/%s.txt" % description, "r") as f:
        text = f.read()
    # NOTE(review): str.decode only exists on Python 2 byte strings, so this
    # function remains Python-2-only at runtime as written.
    text = text.decode('utf-8')
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    tokenized_text = tokenizer.tokenize(text)
    nouns = get_nouns(tokenized_text)
    print("Getting sfx for nouns")
    sfx = find_sfx(nouns)
    try:
        # +19 dB gain on the reading.
        reading = AudioSegment.from_file("/Users/restd/PycharmProjects/dreamScene/readings/%s.wav" % description, format="wav") + 19
    except IOError:
        # No wav yet: derive one from the mp3/aiff pair, then bail out so the
        # user can re-run once the reading exists.
        reading1 = AudioSegment.from_file("/Users/restd/PycharmProjects/dreamScene/readings/%s.mp3" % description, format="mp3")
        process_reading("/Users/restd/PycharmProjects/dreamScene/readings/%s.aiff" % description, len(reading1))
        sys.exit(0)
    # TODO: make this less of a guessing game
    word_chunks = split_on_silence(reading, min_silence_len=silence_length, silence_thresh=-16)
    print("Adding sfx to reading")
    scene = add_sfx_to_reading(word_chunks, tokenized_text, nouns, sfx, silence_length, reading)
    print("Getting and adding related (rough) ambient music.")
    ambient = get_ambient(len(scene), nouns, description)
    scene = scene.overlay(ambient - 16, loop=True)
    scene.export("scenes/%s.mp3" % description, format="mp3")
def split(input_path, output_directory=None):
    """Split a wav file on silence and export each chunk as
    <output_directory><basename>_<i>.wav.  Returns the number of chunks.

    NOTE(review): output_directory defaults to None, which the format string
    turns into the literal prefix "None" -- callers should always pass it.
    """
    # FIX: parenthesized prints -- valid on both Python 2 and 3.
    print("Start cutting")
    sound = AudioSegment.from_wav(input_path)
    chunks = split_on_silence(sound,
                              # must be silent for at least half a second
                              min_silence_len=500,
                              # consider it silent if quieter than -33 dBFS
                              # (FIX: the original comment claimed -50)
                              silence_thresh=-33)
    # Basename without directory or ".wav" extension.
    slash_index = input_path.rfind("/") + 1
    filename = input_path[slash_index:-4]
    for i, chunk in enumerate(chunks):
        chunk.export("{dir}{name}_{count}.wav".format(
            dir=output_directory, name=filename, count=i), format="wav")
    # FIX: report len(chunks) instead of i + 1 -- the original pre-seeded
    # i = 0 and returned 1 even when no chunks were produced.
    print('There are splited into {number} files'.format(number=len(chunks)))
    return len(chunks)
# 404 for LiveATC is 168 bytes if os.path.getsize(mp3) == 168: print "Not available yet" os.remove(mp3) print "Waiting 10 minutes to ask again" time.sleep(600) continue else: file_name = next_filename print "Creating audio segment from " + file_name podcast = AudioSegment.from_mp3(file_name) print "Chunking based on silence" chunks = split_on_silence(podcast, min_silence_len=500, silence_thresh=-50) output_directory = file_name[0:-4] os.mkdir(output_directory) print "Exporting chunks" for i, chunk in enumerate(chunks): chunk.export(output_directory + "/chunk{0}.mp3".format(i), format="mp3") print "Removing " + files_to_get.pop(0) + " from list of files to get" time.sleep(600)
"""Cut an mp3 into word chunks on silence, shuffle them, and export the
shuffled sequence (with 1 s gaps) as a new mp3 whose tags record the order."""
from pydub import AudioSegment
from pydub.silence import split_on_silence
import random
import sys, os

name = '01.mp3'
path = '/Users/syslot/Desktop'
file_name = os.path.join(path, name)

sound = AudioSegment.from_mp3(file_name)
# silence time: 700 ms and silence dBFS < -70 dBFS
chunks = split_on_silence(sound, min_silence_len=700, silence_thresh=-70)
words = chunks[2:]  # first and second are not words

len1 = len(words)
new = AudioSegment.empty()
silence = AudioSegment.silent(duration=1000)  # 1000 ms silence

# BUG FIX: on Python 3 (this script already uses print()), range() is not a
# list and random.shuffle() on it raises TypeError -- materialize it first.
order = list(range(len1))
random.shuffle(order)
print(order)

comments = ""
for i in order:
    new += words[i] + silence
    comments += str(i) + ","

# e.g. 01-random4.mp3; the shuffled order is kept in the 'comments' tag.
save_name = file_name.split(".")[0] + "-random{0}.".format(random.randrange(0, 9)) + file_name.split(".")[1]
new.export(save_name, format="mp3", tags={'artist': 'AppLeU0', 'album': file_name, 'comments': comments[:-1]})
"""Split a wav file on silence, driven by command-line arguments:
<script> <wav file> <min silence len ms> <silence thresh dBFS>"""
from pydub import AudioSegment
from pydub.silence import split_on_silence
import sys

params = {
    'file': sys.argv[1],
    'min_silence_len': sys.argv[2],
    'silence_thresh': sys.argv[3],
}

sound = AudioSegment.from_wav(params['file'])
chunks = split_on_silence(sound,
                          # must be silent for at least this many ms
                          # (FIX: argv values are strings -- convert to int)
                          min_silence_len=int(params['min_silence_len']),
                          # consider it silent if quieter than this dBFS
                          # BUG FIX: the original passed the literal list
                          # ['silence_thresh'] instead of the parameter value
                          silence_thresh=float(params['silence_thresh']))

for i, chunk in enumerate(chunks):
    chunk.export("./cutups/chunk{0}.wav".format(i), format="wav")
# NOTE(review): script fragment -- ``dadaFFT`` and ``db_session`` are defined
# elsewhere, and the final ``except`` clause is truncated in this chunk, so
# the code is documented in place rather than restructured.  Python 2
# (print statement).
from dadasql.model import Line, Fundamental, DBFS, Duration
from sqlalchemy.orm.exc import NoResultFound
import random, math
from pydub import AudioSegment, silence

path = '/root/dada-dial/sounds/'
filename = 'user.wav'

# create pydub audio file
user_audio = AudioSegment.from_wav(path+filename)

# this is a hacky way to get rests that mimic the users
user_rests = silence.detect_silence(user_audio)
# Keep each rest's length in ms (end - start), skipping rests that end within
# the last few ms of the clip.
user_rests_len = [s[1]-s[0] for s in user_rests if (user_audio.duration_seconds*1000 - s[1])>3]
user_rest_segments = [AudioSegment.silent(duration=rest_len) for rest_len in user_rests_len]
print [r.duration_seconds for r in user_rest_segments]

# Per-chunk features of the user's speech: rounded-up duration in seconds,
# loudness in dBFS, and fundamental frequency via an FFT of a temp wav export.
user_splits = silence.split_on_silence(user_audio)
split_durations = [math.ceil(s.duration_seconds) for s in user_splits]
split_dbfs = [int(s.dBFS) for s in user_splits]
split_fundamentals = []
for s in user_splits:
    s.export(path + 'temp.wav', format='wav')
    s_fft = dadaFFT(path+'temp.wav')
    fundamental, power = s_fft.get_fundamental()
    split_fundamentals.append(int(fundamental))

# got all the user input information, now we need to find lines that match
# match on duration
duration_results = []
for d in split_durations:
    try:
        # NOTE(review): the comprehension variable shadows the loop variable
        # ``d`` (and leaks out of the comprehension on Python 2).  The filter
        # value is evaluated before the shadowing starts, but this is fragile.
        duration_results.append([d[0] for d in db_session.query(Line.id).join(Duration.lines).filter(Duration.duration==d).all()])
    except NoResultFound:
# NOTE(review): script fragment -- ``file`` is defined outside this chunk,
# and the code that uses ``wushiyintu`` is cut off below.
EXPORT_PATH = '/home/gswewf/data/五十音图'

time_start = "00:16"
time_end = "01:35"

song = AudioSegment.from_mp3(file)
start = (int(time_start.split(':')[0])*60 + int(time_start.split(':')[1]))*1000
end = (int(time_end.split(':')[0])*60 + int(time_end.split(':')[1]))*1000
# print(start, end)
# Cut times are in ms, so the "MM:SS" strings are converted down to
# millisecond precision before slicing.
word = song[start:end]

# silence_thresh treats anything below -42 dBFS as silence; a stretch quieter
# than that lasting more than 700 ms triggers a split.
# The key is choosing those two values.  Use foobar's volume meter
# (View -> Visualisations) to inspect a clip's dBFS: normal speech sits
# around -25..-10 dBFS (the scale runs from -96 dBFS up to 0 dBFS; closer to
# 0 is louder).  Here anything below -42 dBFS counts as silence.  foobar
# suggests the gap between syllables is about 900 ms (0.9 s); we split on a
# slightly smaller 0.7 s.
words = split_on_silence(word, min_silence_len=700, silence_thresh=-42)

# Next: generate a shuffled sequence, map the syllables onto it, and insert
# 1 s of blank silence between them.
silent = AudioSegment.silent(duration=1000)
print("共分割出{}个音".format(len(words)))

# The fifty sounds (gojuuon) of the Japanese kana chart, in chart order.
wushiyintu = ['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', 'す', 'せ', 'そ', 'た', 'ち', 'つ', 'て', 'と', 'な', 'に', 'ぬ', 'ね', 'の', 'は', 'ひ', 'ふ', 'へ', 'ほ', 'ま', 'み', 'む', 'め', 'も', 'や', 'ゆ', 'よ', 'ら', 'り', 'る', 'れ', 'ろ', 'わ', 'を', 'ん']