def get_audio(self, word: str) -> str:
    """
    Downloads the audio to the path and returns the Anki-formatted audio name
    if the download was successful.
    :param word: The word for which the audio will be fetched
    :return: The Anki-formatted audio string, e.g. [sound:word.ogg],
             or an empty string if the download failed
    """
    try:
        link = self.__get_audio_link(word)
        r = requests.get(link)
        if not r.ok:
            return ''
    except Exception:
        return ''
    file_path = os.path.join(self.path, f'{word}.ogg')
    with open(file_path, 'wb') as f:
        f.write(r.content)
    if self.normalize:
        # pass format='ogg' explicitly; pydub's export defaults to mp3
        effects.normalize(AudioSegment.from_ogg(file_path)).export(file_path, format='ogg')
    return f'[sound:{word}.ogg]'
def MP3ToWav(self, inDir, outDir, minTrackLength=0):
    mp3FilePath = inDir + '*'
    files = glob.glob(mp3FilePath)
    for inFile in files:
        if not inFile.endswith('.mp3'):
            continue
        filename = os.path.basename(inFile)
        outFile = outDir + filename.replace('.mp3', '.wav')
        # convert mp3 to wav
        print('Computing track length for file:', inFile)
        sound = AudioSegment.from_mp3(inFile)
        trackLength = len(sound) / 1000
        if trackLength < minTrackLength:
            print('Skipping mp3 conversion for file:', inFile,
                  ' Track length', trackLength,
                  '(sec) is less than minimum track length', minTrackLength, '(sec)')
            continue
        print('Creating:', outFile, ' audio track length:', trackLength, '(sec)')
        goodVolumeSound = effects.normalize(sound)
        goodVolumeSound.export(outFile, format='wav')
def normaliser(path: PathOrStr, fmt: str) -> None:
    """normalise every file of a particular audio format in a directory"""
    for p in Path(path).iterdir():
        if p.is_file() and p.suffix.strip('.') == fmt:
            clip = effects.normalize(AudioSegment.from_file(p.as_posix(), fmt))
            clip.export(p.as_posix(), format=fmt)
            print(f'normalised: {p.as_posix()}')
def audioProcessing(sample_list):
    procs = []
    for s in sample_list:
        sample = AudioSegment.from_file(s, s.split('.')[1])
        if settings['NORMALIZE']:
            sample = normalize(sample)
        if settings['SILENCE']:
            sample = strip_silence(sample, 1000, silence_thresh=(sample.dBFS - 6))
        if settings['PADDING']:
            sample += AudioSegment.silent(duration=settings['PADDING'])
        if settings['PB_SPEED'] != 1:
            sample = speedup(sample, settings['PB_SPEED'])
        if not settings['CONCAT']:
            # Exporting individual samples
            sample_name = s.split('/')[-1].split('.')[0]
            export(sample, sample_name, settings['OUTPUT_FORMAT'], settings['OUTPUT_BITRATE'])
        else:
            procs.append(sample)
    if settings['CONCAT']:
        # Exporting sample chain
        concatSamples(procs, raw_input('Enter sample chain name: '))
    if settings['REM_INDIVIDUAL_SAMPLES']:
        removeIndividualSamples(sample_list)
def daily():
    sumAttenuation = config.getint('vereinheiter', 'dailiesSumAttenuation')
    daily = AudioSegment.silent(100)
    for einheit in range(1, 16):
        [clips, sums] = database.readEinheit(einheit)
        if len(clips) > 0:
            audio = [AudioSegment.from_wav(clip) for clip in clips]
            audio = panClips(audio)
            sumsAudio = [AudioSegment.from_wav(sum) for sum in sums]
            sumsAudio = panClips(sumsAudio)
            for sum in sumsAudio:
                audio.append(sum - sumAttenuation)
            duration = max([len(clip) for clip in audio])
            result = AudioSegment.silent(duration=duration)
            for clip in audio:
                result = result.overlay(clip - 30,
                                        position=randint(0, duration - len(clip)))
            result = effects.normalize(result, 0.5)
            result = result.remove_dc_offset()
            daily = daily.append(result)
    if len(daily) > 100:
        name = os.path.join(dailiesDir, date())
        daily.export(name + '.wav', format='wav')
        daily.export(name + '.mp3', format='mp3', bitrate="128k")
        database.writeDaily(date(), name + '.wav')
        os.system(config.get('vereinheiter', 'dailiesSyncCmd'))
        print('new daily:', name + '.mp3')
    else:
        database.writeDaily(date(), '')
        print('skip empty daily')
def addClipsToSums(einheit, clips, sums):
    numSpeakers = config.getint('vereinheiter', 'numSpeakers')
    attenuation = config.getint('vereinheiter', 'sumAttenuation')
    clipsAudio = [AudioSegment.from_wav(clip) for clip in clips]
    sumsAudio = [AudioSegment.from_wav(sum) for sum in sums]
    while len(sumsAudio) < numSpeakers:
        sumsAudio.append(AudioSegment.silent(1))
    newSums = []
    for i in range(0, numSpeakers):
        sum = sumsAudio[i]
        clip = clipsAudio[i]
        duration = max(len(clip), len(sum))
        result = AudioSegment.silent(duration=duration)
        result = result.overlay(clip - 6, position=randint(0, duration - len(clip)))
        result = result.overlay(sum - 6 - attenuation,
                                position=randint(0, duration - len(sum)))
        result = effects.normalize(result)
        result = result.remove_dc_offset()
        sumName = os.path.join(
            sumsDir, str(einheit) + '_' + datetime.datetime.now().isoformat() + '.wav')
        result.export(sumName, format='wav')
        newSums.append(sumName)
    print('new sums for einheit', einheit)
    for sum in newSums:
        print(' ', sum)
    return newSums
def load_audio_from_webm_blob(opus_audio_bytes, channels=1, frame_rate=16000):
    bio = BytesIO(opus_audio_bytes)
    sound = pydub.AudioSegment.from_file(bio, codec='opus')
    sound = normalize(sound)
    sound = sound.set_frame_rate(frame_rate)
    sound = sound.set_channels(channels)
    return np.array(sound.get_array_of_samples())
def _normalized_sound(audio_file):
    '''Normalize volume'''
    audio = AudioSegment.from_wav(audio_file)
    normalizedsound = effects.normalize(audio)
    return normalizedsound
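# For reference, effects.normalize() as used above is *peak* normalization: it
# applies a constant gain so the loudest sample lands `headroom` dB below full
# scale. A minimal equivalent sketch using only pydub's public API (behaviour
# summarized from the examples in this section, not copied from pydub's source):
from pydub import AudioSegment

def peak_normalize(seg: AudioSegment, headroom: float = 0.1) -> AudioSegment:
    """Boost the clip so its peak sits `headroom` dB below full scale."""
    if seg.max == 0:  # digital silence: nothing to scale
        return seg
    # max_dBFS is the current peak level relative to full scale (always <= 0)
    return seg.apply_gain(-headroom - seg.max_dBFS)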
def load_audio():
    forwards = []
    wc = os.path.join(in_folder, "*.mp3")
    for song in glob.glob(wc):
        audio = AudioSegment.from_mp3(song)
        forwards.append(effects.normalize(audio))
    return forwards
def AudioStandarize(audio_file, sr=32000, device=None, high_pass=0, ultrasonic=False):
    if not device:
        device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    filext = audio_file[-3:].lower()
    if filext == "mp3":
        sound = AudioSegment.from_mp3(audio_file)
    elif filext == "wma":
        sound = AudioSegment.from_file(audio_file, "wma")
    elif filext == "m4a":
        sound = AudioSegment.from_file(audio_file, "m4a")
    elif filext == "ogg":
        sound = AudioSegment.from_ogg(audio_file)
    elif filext == "wav":
        sound = AudioSegment.from_wav(audio_file)
    elif filext in ["mp4", "wma", "aac"]:
        sound = AudioSegment.from_file(audio_file, filext)
    else:
        print('Sorry, this file type is not permitted. '
              'The legal extensions are: wav, mp3, wma, m4a, ogg, mp4, aac.')
        return None
    original_metadata = {
        'channel': sound.channels,
        'sample_rate': sound.frame_rate,
        'sample_size': len(sound.get_array_of_samples()),
        'duration': sound.duration_seconds
    }
    print('Original audio: channel = %s, sample_rate = %s Hz, sample_size = %s, duration = %s s'
          % (original_metadata['channel'], original_metadata['sample_rate'],
             original_metadata['sample_size'], original_metadata['duration']))
    if ultrasonic:
        if sound.frame_rate > 100000:  # ultrasonic recording: slow down 12x
            sound = speed_change(sound, 1 / 12)
        else:
            return False
    if sound.frame_rate > sr:
        sound = scipy_effects.low_pass_filter(sound, sr / 2)
    if sound.frame_rate != sr:
        sound = sound.set_frame_rate(sr)
    if sound.channels > 1:
        sound = sound.split_to_mono()[0]
    if not sound.sample_width == 2:
        sound = sound.set_sample_width(2)
    if high_pass:
        sound = sound.high_pass_filter(high_pass)
    sound = effects.normalize(sound)  # normalize peak amplitude to 0 dBFS
    songdata = np.array(sound.get_array_of_samples())
    duration = round(songdata.shape[0] / sound.frame_rate * 1000)  # ms
    audiodata = torch.tensor(songdata, device=device).float()
    print('Standardized audio: channel = %s, sample_rate = %s Hz, sample_size = %s, duration = %s s'
          % (sound.channels, sound.frame_rate, songdata.shape[0], sound.duration_seconds))
    return sound.frame_rate, audiodata, duration, sound, original_metadata
async def normscmd(self, m):
    """.norms <reply to audio> Normalize sound (from quiet to normal)"""
    audio = await get_audio(m, "Normalization")
    if not audio:
        return
    out = effects.normalize(audio.audio)
    await go_out(m, audio, out, audio.pref, audio.pref)
def doScan(path):
    target = '/share/bot/mod/' + random.choice(os.listdir('/share/bot/mod/'))
    os.system('ffmpeg -i ' + path + ' ' + path + '.wav')
    path = path + '.wav'
    effects.normalize(AudioSegment.from_wav(path)).export(path, format='wav')
    song = AudioSegment.from_wav(path)
    chunks = split_on_silence(song, min_silence_len=100, silence_thresh=-16)
    effects.normalize(random.choice(chunks)).export(path + '_source.wav', format='wav')
    os.system('/share/bot/genmod ' + path + '_source.wav "' + target + '" > ' + path + '.pcm')
    os.system('ffmpeg -f s16le -ar 44.1k -ac 2 -i ' + path + '.pcm -y -t 35 ' + path + '_result.mp3')
    os.remove(path + '_source.wav')
    os.remove(path + '.pcm')
    os.remove(path)
    return path + '_result.mp3'
async def normscmd(self, m):
    """.norms <reply to audio> Normalize sound (from quiet to normal)"""
    audio = await get_audio(self, m, "Normalization")
    if not audio:
        return
    out = effects.normalize(audio.audio)
    await go_out(self, audio.message, audio, out, audio.pref, audio.pref)
def normalize(self):
    sound = AudioSegment.from_mp3(self.path)
    change_in_dBFS = -20.0 - sound.dBFS
    # AudioSegments are immutable: apply_gain and set_frame_rate return new
    # segments, so the results must be reassigned
    sound = sound.apply_gain(change_in_dBFS)
    sound = sound.set_frame_rate(44100)
    sound = effects.normalize(sound)
    self.out_path = self.out_path.replace("songs", "wavs")
    sound.export(self.out_path, format="wav", bitrate="16k")
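# The gain step in the example above targets *average* loudness (dBFS is
# RMS-based), while effects.normalize targets the *peak*. A minimal sketch of
# the loudness-matching idiom on its own, with the -20 dBFS target as an
# illustrative assumption:
from pydub import AudioSegment

def match_target_amplitude(seg: AudioSegment, target_dBFS: float = -20.0) -> AudioSegment:
    """Apply constant gain so the clip's average loudness hits target_dBFS."""
    return seg.apply_gain(target_dBFS - seg.dBFS)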
def add_background_noise(file: str, poison_filename: str, poison_prefix: str = 'poison'):
    poison_sound = effects.normalize(AudioSegment.from_file(poison_filename))
    poison_sound = poison_sound + (poison_sound.dBFS * 2)  # attenuate (dBFS is negative)
    original_sound = AudioSegment.from_file(file)
    mixed = original_sound.overlay(poison_sound)
    full_file = f'{file[:-4]}_{poison_prefix}.wav'
    mixed.export(full_file, format='wav')
    return full_file
def normalize_audio(sound_file):
    wavefile = AudioSegment.from_wav(sound_file)
    normalized_wavefile = effects.normalize(wavefile, headroom=0.2)
    # normalized_wavefile = effects.compress_dynamic_range(normalized_wavefile)
    # normalized_wavefile = effects.compress_dynamic_range(wavefile, threshold=-10.0, ratio=2.0, attack=2.5, release=25.0)
    normalized_wavefile.export(sound_file, format='wav')
    return sound_file
def load_file(filename, file_format, frame_rate=16000):
    sound = pydub.AudioSegment.from_file(filename, file_format)
    sound = sound.set_frame_rate(frame_rate)
    sound = sound.set_channels(1)
    sound = sound.set_sample_width(2)
    sound = sound.remove_dc_offset()
    sound = effects.normalize(sound)
    return np.array(sound.get_array_of_samples())
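# Follow-up to the loader above: with sample_width 2, get_array_of_samples()
# yields raw 16-bit integers. A hedged sketch of the scaling most ML front ends
# expect (float32 in [-1.0, 1.0]); the helper name is hypothetical:
import numpy as np

def to_float32(samples: np.ndarray, sample_width: int = 2) -> np.ndarray:
    full_scale = float(1 << (8 * sample_width - 1))  # 32768 for 16-bit audio
    return samples.astype(np.float32) / full_scale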
def normalize_single_file_volume(self, track):
    # adjust and normalize audio within a single mp3 file
    s = AudioSegment.from_mp3(self.playlist[track])
    normalized_s = effects.normalize(s)
    normalized_s.export(str(self.playlist[track]), format="mp3",
                        tags=mediainfo(str(self.playlist[track])).get('TAG', {}))
def normalise(*paths: PathOrStr) -> None:
    """normalise an arbitrary number of audio files"""
    for path in paths:
        path = Path(path)
        suffix = path.suffix.strip('.')
        clip = effects.normalize(AudioSegment.from_file(path.as_posix(), suffix))
        clip.export(path.as_posix(), format=suffix)
        print(f'normalised: {path.as_posix()}')
async def normscmd(self, m):
    """.norms <reply to audio> Normalize sound (from quiet to normal)"""
    pref = "Normalization"
    reply = await m.get_reply_message()
    audio = await get_audio(m, reply, pref)
    if not audio:
        return
    await m.edit(f"[{pref}] Working...")
    out = effects.normalize(audio[0])
    await go_out(m, reply, audio, out, pref, pref)
def _load_sound_bytes(sound_path):
    buffer = io.BytesIO()
    try:
        sound = AudioSegment.from_file(sound_path)
    except Exception:
        return None
    sound = effects.normalize(sound)
    sound.export(buffer, format='wav', parameters=['-ac', '2', '-ar', '44100'])
    return buffer.getvalue()
def createClipFromRecording(fileName):
    fade = config.getfloat('vereinheiter', 'fadeDur') * 1000
    silenceLen = int(config.getfloat('vereinheiter', 'silenceLen') * 1000)
    silenceTresh = config.getfloat('vereinheiter', 'silenceTresh')
    treshold = config.getfloat('vereinheiter', 'compressorTreshold')
    ratio = config.getfloat('vereinheiter', 'compressorRatio')
    attack = config.getfloat('vereinheiter', 'compressorAttack')
    release = config.getfloat('vereinheiter', 'compressorRelease')
    frames = AudioSegment.from_wav(os.path.join(recordingsDir, fileName))
    if frames.dBFS < config.getfloat('vereinheiter', 'recordingMinDB'):
        print('recording level too low:', fileName)
        return [None, None]
    frames = frames.set_sample_width(4)
    frames = frames.fade_in(fade)
    frames = frames.fade_out(fade)
    frames = effects.normalize(frames)
    frames = frames.remove_dc_offset()
    nonsilent = silence.detect_nonsilent(frames, silenceLen, silenceTresh)
    chunks = [frames[chunk[0]:chunk[1]] for chunk in nonsilent]
    frames = AudioSegment.silent(100)
    for chunk in chunks:
        if len(chunk) > 100:
            chunk = chunk.fade_in(20)
            chunk = chunk.fade_out(20)
            frames = frames.append(AudioSegment.silent(250))
            frames = frames.append(chunk)
    frames = effects.compress_dynamic_range(frames, threshold=treshold, ratio=ratio,
                                            attack=attack, release=release)
    frames = effects.normalize(frames)
    if len(frames) > config.getfloat('vereinheiter', 'minClipLen') * 1000:
        clipName = os.path.join(clipsDir, fileName)
        frames.export(clipName, format='wav')
        einheit = os.path.basename(fileName).split('_')[0]
        database.writeClip(einheit, clipName)
        print('new clip:', clipName)
        return [einheit, clipName]
    else:
        print('recording too short:', fileName)
        return [None, None]
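# Side note on the compressor used above: compress_dynamic_range attenuates
# audio above the threshold, after which normalize restores the lost level.
# A standalone usage sketch; the file name and parameter values are
# illustrative, mirroring the config keys read above:
from pydub import AudioSegment, effects

clip = AudioSegment.from_wav('recording.wav')  # hypothetical input
clip = effects.compress_dynamic_range(
    clip,
    threshold=-20.0,  # dBFS level where compression kicks in
    ratio=4.0,        # 4:1 gain reduction above the threshold
    attack=5.0,       # ms to reach full gain reduction
    release=50.0)     # ms to let it go
clip = effects.normalize(clip)  # bring the peak back up after compression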
def play_sound(sound_path, samplerate=16000):
    sound = pydub.AudioSegment.from_file(sound_path, 'wav')
    sound = sound.set_frame_rate(samplerate)
    sound = sound.set_channels(1)
    sound = sound.set_sample_width(2)
    sound = sound.remove_dc_offset()
    sound = effects.normalize(sound)
    # scale the raw 16-bit samples into [-0.5, 0.5) floats for playback
    signal = [i / 65536 for i in sound.get_array_of_samples()]
    ds = sc.default_speaker()
    ds.play(signal, samplerate)
def splitTracks(file, stepDir):
    dir = join(stepDir, file[14:-4])
    full = AudioSegment.from_file(file)
    trackA = join(dir, "trackA.wav")
    trackB = join(dir, "trackB.wav")
    if not exists(dir):
        makedirs(dir)
    seperated = full.split_to_mono()
    effects.normalize(seperated[0]).export(trackA, format="wav")
    effects.normalize(seperated[1]).export(trackB, format="wav")
    wavA = np.array(seperated[0].get_array_of_samples())
    wavB = np.array(seperated[1].get_array_of_samples())
    specA = getSpectrum(wavA, spectrumHz, frequencies)
    specB = getSpectrum(wavB, spectrumHz, frequencies)
    with open(join(dir, "specA.pck"), 'wb') as f:
        pickle.dump(specA, f)
    with open(join(dir, "specB.pck"), 'wb') as f:
        pickle.dump(specB, f)
    return {"original": file, "trackA": trackA, "trackB": trackB,
            "specA": join(dir, "specA.pck"), "specB": join(dir, "specB.pck")}
def make_mp3(filename, directory):
    base_filename = os.path.splitext(filename)[0]
    wav = os.path.join(directory, base_filename) + '.wav'
    subprocess.call(['timidity', os.path.join(directory, filename), '-Ow', '-o', wav])
    mp3 = os.path.join(directory, base_filename) + '.mp3'
    snd = AudioSegment.from_file(wav, 'wav')
    snd = effects.normalize(snd)
    snd.export(mp3, format='mp3')
def normalize(sdcard_root, filename, backup, headroom=constants.DEFAULT_HEADROOM):
    full_path = os.path.join(sdcard_root, filename)
    if backup:
        copyfile(full_path, full_path + '_bak')
    sound = AudioSegment.from_file(full_path, 'wav')
    sound = effects.normalize(sound, headroom)
    sound.export(full_path, format='wav')
def load_backing_tracks():
    backing_tracks = []
    track_names = os.listdir(TRACKS_DIR)
    print("Loading backing tracks...")
    for name in track_names:
        print(f'↳ Load & normalize \'{name}\'')
        # load from the same directory that was listed above
        track = AudioSegment.from_mp3(os.path.join(TRACKS_DIR, name))
        track = effects.normalize(track, headroom=6)
        backing_tracks.append(track)
    print("Done")
    return backing_tracks
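# Usage note for the headroom argument above: a larger headroom leaves the
# normalized peak further below 0 dBFS, which is useful when tracks will be
# mixed afterwards. A quick hedged check ('example.mp3' is a placeholder):
from pydub import AudioSegment, effects

track = AudioSegment.from_mp3('example.mp3')
loud = effects.normalize(track)              # peak at ~-0.1 dBFS (default headroom)
safe = effects.normalize(track, headroom=6)  # peak at ~-6 dBFS, room for mixing
print(round(loud.max_dBFS, 1), round(safe.max_dBFS, 1))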
def speech_to_text(url):
    text = ''
    try:
        if os.path.exists('audio'):
            shutil.rmtree('audio')
        os.mkdir('audio')
        # Extract audio from YouTube
        os.system('youtube-dl -f bestaudio --extract-audio --audio-format mp3 '
                  '--output audio/test.mp3 --audio-quality 0 ' + url)
        # Convert the mp3 file to wav, boost and normalize it
        sound = AudioSegment.from_file("audio/test.mp3")
        sound.export("audio/test.wav", format="wav")
        s = AudioSegment.from_file("audio/test.wav")
        s = s + 10
        s = normalize(s)
        s.export("audio/test.wav", format="wav")
        # Transcribe the audio file
        AUDIO_FILE = "audio/test.wav"
        # use the audio file as the audio source
        r = sr.Recognizer()
        with sr.AudioFile(AUDIO_FILE) as source:
            audio = r.record(source)  # read the entire audio file
        text = r.recognize_google(audio)
        with open('transcript.txt', 'w') as f:
            f.write(text)
        return text
    except RequestError:
        print("Recognition request failed: Bad Request")
    except PermissionError as p:
        print(p, "\nRe-run your program")
    except Exception as e:
        print("Error in Google Speech to Text")
        print(e)
        traceback.print_exc()
    finally:
        if os.path.exists('audio'):
            shutil.rmtree('audio')
    return text
def split_fragments(text_file, audio_file, data_out, init_uid):
    """Create the phrase/transcription pairs in the dataset format from given audio/text files

    Args:
        text_file (str): Path to text file
        audio_file (str): Path to audio file
        data_out (str): Path to dataset output
        init_uid (int): Initial value for unique identifier

    Returns:
        metadata: Dataframe with all info of the dataset
        uid (int): Last used unique identifier
    """
    # Load subtitle
    text = json.load(open(text_file, encoding='utf8'))
    fixed_text = fix_durations(text['original'])
    # Load sound
    sound = AudioSegment.from_file(audio_file)
    transcriptions = []
    ids = []
    uid = init_uid
    # Split by text sentence
    for sentence in fixed_text:
        # silence = AudioSegment.silent(duration=100)
        start = int(float(sentence['start']) * 1000) - 300
        end = int(float(sentence['start']) * 1000 + float(sentence['dur']) * 1000)
        fragment = normalize(sound[start:end])
        # Check audio length
        if (fragment.duration_seconds > 20) or (fragment.duration_seconds < .4):
            continue
        uid += 1
        str_id = "{:0>6}".format(uid)
        ids.append(str_id)
        transcriptions.append(sentence['text'])
        # Set frame rate and channels
        fragment = fragment.set_frame_rate(TARGET_SAMPLE_RATE).set_channels(1)
        fragment.export(os.path.join(data_out, "wav", str_id + ".wav"),
                        format="wav", bitrate=BIT_RATE)
    # Create dataset
    metadata = pd.DataFrame(columns=["id", "transcription"])
    metadata["id"] = ids
    metadata["transcription"] = transcriptions
    return (metadata, uid)
def say(text):
    # convert text to phonemes
    if text == 'QUIT':
        exit()
    g2p = G2p()
    out = g2p(text)
    # identify sounds from phoneme names
    output = AudioSegment.empty()
    for i, pho in enumerate(out):
        if pho == 'HH':
            pho = 'H'
        elif pho == 'NX':
            pho = 'NG'
        elif pho == 'TH':
            pho = 'DH'
        if not pho[-1].isalpha():  # strip the stress digit, e.g. 'AH0' -> 'AH'
            pho = pho[:-1]
        if (out[i].isspace() or out[i] == '' or out[i] == "'" or out[i] == "-"
                or out[i] == '.' or out[i] == ',' or out[i] == '!' or out[i] == '?'):
            audio = AudioSegment.silent(duration=300)
            # fades return new segments, so the result must be reassigned
            audio = audio.fade_in(duration=300).fade_out(duration=300)
            output = output.append(audio, crossfade=10)
        else:
            phonemes[pho] = phonemes[pho].fade_in(duration=5)
            phonemes[pho] = phonemes[pho].fade_out(duration=5)
            phonemes[pho] = normalize(phonemes[pho])
            output = output.append(phonemes[pho], crossfade=0)
    # output += AudioSegment.silent(duration=300)
    output = normalize(output)
    output = output.set_frame_rate(44100)  # set_frame_rate also returns a new segment
    play(output)
    print(text)
def play(pattern, bpm=80, metronome=None, dondokos=None):
    tick = 1000 * 60 / float(bpm * 4)  # dokodoko = 1 beat = 4 ticks
    print('BPM: %s, tick: %s' % (bpm, tick))
    beats = (len(pattern) + 3) // 4
    METRONOME_EARLY = 4
    if metronome is not None:
        beats += METRONOME_EARLY
    if dondokos is not None:
        DONDOKOS_EARLY = dondokos
        beats += DONDOKOS_EARLY
    ticks = beats * 4
    print('Beats: %d, ticks: %d' % (beats, ticks))
    tick_times = [x * tick for x in range(ticks)]

    # exiftool my_set/* | grep Duration | sort -nr | head -n1
    song = AudioSegment.silent(duration=(ticks * tick) + 2000)

    # add metronome
    if metronome is not None:
        print('Inserting metronome clicks every %s ticks.' % metronome)
        for cnt, t in enumerate(tick_times):
            if not (cnt % metronome):
                song = song.overlay(click, position=t)
        tick_times = tick_times[METRONOME_EARLY * metronome:]

    # add dondokos
    if dondokos is not None:
        print('Inserting dondokos (%s before start of pattern).' % DONDOKOS_EARLY)
        dondoko = AudioSegment.silent(duration=4 * tick + 2)
        dondoko = dondoko.overlay(my_sample('shime1.wav'), position=0)
        dondoko = dondoko.overlay(my_sample('shime2.wav') - 6, position=2 * tick)
        dondoko = dondoko.overlay(my_sample('shime3.wav') - 6, position=3 * tick)
        dondoko = dondoko - 16
        for cnt, t in enumerate(tick_times):
            if not (cnt % 4):
                song = song.overlay(dondoko, position=t)
        tick_times = tick_times[DONDOKOS_EARLY * 4:]

    if not 'implemented':  # placeholder branch, never runs
        dokodoko = AudioSegment.silent(duration=4 * tick + 2)
        dokodoko = dokodoko.overlay(do, position=0)
        dokodoko = dokodoko.overlay(ko, position=tick)
        dokodoko = dokodoko.overlay(do, position=2 * tick)
        dokodoko = dokodoko.overlay(ko, position=3 * tick)
        dokodoko = dokodoko - 16

        dokonko = AudioSegment.silent(duration=4 * tick + 2)
        dokonko = dokonko.overlay(do, position=0)
        dokonko = dokonko.overlay(ko, position=tick)
        dokonko = dokonko.overlay(ko, position=3 * tick)
        dokonko = dokonko - 16

    print('Assembling song:')
    char_accu = ''
    for cnt, tup in enumerate(zip(tick_times, pattern)):
        t, char = tup
        song = song.overlay(char_map.get(char, pause), position=t)
        char_accu += char
        if not (cnt + 1) % 8:
            print(char_accu)
            char_accu = ''
    if char_accu:
        print(char_accu)
    return effects.normalize(song)
# merge short silences inside sentences
print('phase 3, merging short silences inside sentences')
fragment_list = merge_short_sils(fragment_list)
print(len(fragment_list), 'fragments after merging')

# choke silences, adjust sentences
print('phase 4, choke and adjust')
for idx, track_segm in enumerate(fragment_list):
    if is_silence(track_segm):
        action = choke
    else:
        action = adjustment
    # bring the fragments back together
    try:
        res_track += action(track_segm)
    except NameError:
        res_track = action(track_segm)

# normalize
if normalizing:
    res_track = effects.normalize(res_track)

# save result
res_track.export(sys.argv[2], format='wav')
print('done:', sys.argv[2])