def adjust_wav_amplitude(self, wav_file, rms_amplitude):
    """Calls normalize-audio to adjust amplitude of WAV file

    Args:
      :wav_file WAV file to adjust in place
      :rms_amplitude target RMS amplitude (values above 1.0 are capped)

    Raises:
      RuntimeError: when the normalize tool exits with non-zero code
    """
    # normalize-audio rejects amplitudes above 1.0, so cap the target there
    target_amplitude = min(rms_amplitude, 1.0)

    voice_gain_cmd = [
        self.__tools.get_tool(Tools.KEY_NORMALIZE),
        '-a', str(target_amplitude),
        wav_file,
    ]
    if Util.execute_rc(voice_gain_cmd) != 0:
        raise RuntimeError('Failed to adjust voice overlay volume')
def to_wav(self, output_file_name):
    """Converts source audio track to WAV format

    Converts source mp3 to wav. This is required for many reasons:
    * we need to adjust voice overlay amplitude to match MP3 file level
      and to do that we use "sox" too which cannot deal with MP3 directly.
    * there are some odd issues with "ffmpeg" failing during mixing phase
      when source is mp3 file. Blind guess for now is that it's due to some
      structure mismatch between MP3 file (i.e. having cover image) and
      speech segments being just plain WAV. Most likely this can be solved
      better way but we need WAV anyway so no point wasting time at the
      moment for further research.

    Args:
      :output_file_name destination WAV file path

    Raises:
      RuntimeError: when ffmpeg exits with non-zero code
    """
    # '-y' keeps ffmpeg non-interactive when the target file already exists,
    # matching how ffmpeg is invoked everywhere else in this tool
    wav_cmd = ['ffmpeg', '-y', '-i', self.file_name, output_file_name]
    if Util.execute_rc(wav_cmd) != 0:
        raise RuntimeError('Failed to convert to WAV file')
def check_env(self):
    """Checks if all external tools we need are already available and in $PATH

    Aborts the whole program (via Util.abort) when a required tool is missing.
    Sets self.__check_env_called on success.
    """
    if sys.platform == 'win32':
        self.__tools = {
            self.KEY_FFMPEG: 'ffmpeg.exe',
            self.KEY_SOX: 'sox.exe',
            self.KEY_ESPEAK: 'espeak.exe',
        }
    else:
        self.__tools = {
            self.KEY_FFMPEG: 'ffmpeg',
            self.KEY_SOX: 'sox',
            self.KEY_ESPEAK: 'espeak',
        }

    # BUGFIX: 'failed' was previously re-initialized inside the loop and never
    # set to True, so missing tools were logged but never aborted on.
    failed = False
    for tool in self.__tools.values():
        if Util.which(tool) is None:
            Log.e("'{}' not found.".format(tool))
            failed = True
    if failed:
        Util.abort('Required tools not found. See documentation for installation guidelines.')

    # sometimes normalize is called normalize-audio (i.e. in Debian/Ubuntu)
    # so we do special checks just for this one particular tool
    normalize_check_result = False
    if sys.platform == 'win32':
        candidates = ['normalize.exe']
    else:
        candidates = ['normalize', 'normalize-audio']
    for normalize in candidates:
        if Util.which(normalize) is not None:
            self.__tools[self.KEY_NORMALIZE] = normalize
            normalize_check_result = True
            break

    if not normalize_check_result:
        Util.abort('2: "{}" not found. See documentation for installation guidelines.'.format('normalize'))

    self.__check_env_called = True
def __create_voice_wav(self, segments, speech_wav_file_name):
    """Synthesizes all text segments and merges them into a single padded WAV.

    Each segment is spoken into its own numbered WAV, then ffmpeg pads every
    part to a fixed sample count (title length for segment 0, tick interval
    for the rest) and concatenates them into speech_wav_file_name.
    """
    # synthesize every text segment into its own numbered WAV file
    for idx, segment_text in enumerate(segments):
        segment_file_name = os.path.join(self.__tmp_dir, '{}.wav'.format(idx))
        if not self.speak_to_wav(segment_text, segment_file_name):
            raise RuntimeError(
                'Failed to save speak "{0}" into "{1}".'.format(
                    segment_text, segment_file_name))

    # we need to get the frequency of speech waveform generated by espeak to
    # later be able to tell ffmpeg how to pad/clip the part
    import wave
    wav = wave.open(os.path.join(self.__tmp_dir, '0.wav'), 'rb')
    speech_frame_rate = wav.getframerate()
    wav.close()

    # samples = rate_per_second * seconds * interval_in_minutes
    tick_sample_count = speech_frame_rate * 60 * self.__config.tick_interval
    title_sample_count = speech_frame_rate * 60 * self.__config.tick_offset

    # merge voice overlay segments into one file with needed padding
    # http://ffmpeg.org/ffmpeg-filters.html#Filtergraph-description
    concat_cmd = [self.__tools.get_tool(Tools.KEY_FFMPEG), '-y']
    pad_filters = []
    concat_inputs = ''
    for idx in range(len(segments)):
        concat_cmd.extend(
            ['-i', os.path.join(self.__tmp_dir, '{}.wav'.format(idx))])
        whole_len = title_sample_count if idx == 0 else tick_sample_count
        pad_filters.append('[{}]apad=whole_len={}[g{}]'.format(idx, whole_len, idx))
        concat_inputs += '[g{}]'.format(idx)

    filter_complex = ';'.join(pad_filters)
    filter_complex += ';' + concat_inputs + 'concat=n={}:v=0:a=1'.format(len(segments))

    concat_cmd.extend(['-filter_complex', filter_complex])
    concat_cmd.append(speech_wav_file_name)
    if Util.execute_rc(concat_cmd) != 0:
        raise RuntimeError('Failed to merge voice segments')
def mix_wav_tracks(self, file_out, encoding_quality, wav_files):
    """Mixes given WAV tracks together

    Args:
      :file_out target MP3 file path
      :encoding_quality LAME encoder quality parameter
      :wav_files list of WAV files to mix

    Raises:
      RuntimeError: when ffmpeg exits with non-zero code
    """
    merge_cmd = [self.__tools.get_tool(Tools.KEY_FFMPEG), '-y']
    # plain loop instead of a side-effect list comprehension
    for wav in wav_files:
        merge_cmd.extend(['-i', wav])
    merge_cmd.extend([
        '-filter_complex', 'amerge',
        '-ac', '2',
        '-c:a', 'libmp3lame',
        '-q:a', str(encoding_quality),
        file_out])
    if Util.execute_rc(merge_cmd) != 0:
        raise RuntimeError('Failed to create final MP3 file')
def calculate_rms_amplitude(self, wav_file):
    """Calls SOX to get the RMS amplitude of WAV file

    Args:
      :wav_file

    Returns:
      float
    """
    src_amplitude_cmd = [self.__tools.get_tool(Tools.KEY_SOX),
                         wav_file, '-n', 'stat']
    rc, _, err = Util.execute(src_amplitude_cmd)
    if rc != 0:
        raise RuntimeError('Failed to calculate RMS amplitude of "{}"'.format(wav_file))

    # let's check what "sox" figured out; each stderr line looks like
    # "RMS     amplitude:     0.123456" -> key "rms_amplitude"
    sox_results = {}
    for line in err:
        fields = line.split(':')
        key = re.sub(' +', '_', fields[0].strip().lower())
        sox_results[key] = fields[1].strip()

    return float(sox_results['rms_amplitude'])
def speak_to_wav(self, text, out_file_name):
    """Feeds the given text to espeak and writes spoken result to a WAV file.

    Args:
      :text text to speak (str or bytes)
      :out_file_name destination WAV file path

    Returns:
      True when espeak succeeded, False otherwise. On success the temporary
      text file is removed (unless no_cleanup is set); on failure it is
      intentionally kept for debugging.
    """
    # noinspection PyProtectedMember
    text_tmp_file = os.path.join(
        self.__tmp_dir, next(tempfile._get_candidate_names()) + '.txt')
    # file is opened in binary mode, so encode str input; bytes pass through
    payload = text.encode('utf-8') if isinstance(text, str) else text
    with open(text_tmp_file, "wb+") as fh:
        fh.write(payload)

    rc = Util.execute_rc([
        self.__tools.get_tool(Tools.KEY_ESPEAK),
        '-s', str(self.__config.speech_speed),
        '-z',
        '-w', out_file_name,
        '-f', text_tmp_file
    ], debug=self.__config.debug)

    if rc == 0 and not self.__config.no_cleanup:
        os.remove(text_tmp_file)

    return rc == 0
def get_out_file_name(self, music_track):
    """Build out file name based on provided template and music_track data"""
    base_name, base_ext = Util.split_file_name(music_track.file_name)
    formatted_file_name = self.__config.file_out_format.format(
        name=base_name, ext=base_ext)

    # Fallback value; it survives only when file_out is set but points to
    # something that is neither an existing file nor a directory.
    out_file_name = os.path.basename(music_track.file_name)
    if self.__config.file_out is None:
        # no explicit target: place formatted name next to the source track
        out_file_name = os.path.join(
            os.path.dirname(music_track.file_name), formatted_file_name)
    elif os.path.isfile(self.__config.file_out):
        # explicit target file: use it verbatim
        out_file_name = self.__config.file_out
    elif os.path.isdir(self.__config.file_out):
        # target directory: place formatted name inside it
        out_file_name = os.path.join(self.__config.file_out,
                                     formatted_file_name)
    return out_file_name
def __init__(self, file_name):
    """Reads MP3 metadata of given file.

    Args:
      :file_name path to existing MP3 file

    Raises:
      OSError: when file does not exist
    """
    if not os.path.isfile(file_name):
        raise OSError('File not found: "{}"'.format(file_name))

    mp3 = MP3(file_name)

    base_name, _ = Util.split_file_name(file_name)
    self.base_name = base_name
    self.file_name = file_name

    # we round up duration to full minutes: downstream tick generation uses
    # range(tick_offset, duration, tick_interval) which needs whole minutes
    # (mp3.info.length is a float number of seconds)
    full_minutes, remainder = divmod(mp3.info.length, 60)
    self.duration = int(full_minutes) + (1 if remainder else 0)
    self.bitrate = mp3.info.bitrate

    # get track title either from tag, or from filename
    self.title = self.__get_tag(mp3, self.TAG_TITLE)
    self.artist = self.__get_tag(mp3, self.TAG_ARTIST)
    self.album_artist = self.__get_tag(mp3, self.TAG_ALBUM_ARTIST)
    self.album_title = self.__get_tag(mp3, self.TAG_ALBUM_TITLE)
    self.composer = self.__get_tag(mp3, self.TAG_COMPOSER)
    self.performer = self.__get_tag(mp3, self.TAG_PERFORMER)
    self.comment = self.__get_tag(mp3, self.TAG_COMMENT)
    self.track_number = self.__get_tag(mp3, self.TAG_TRACK_NUMBER)
def voice_stamp(self, mp3_file_name):
    """Processes a single MP3: generates speech overlay and mixes it in.

    Args:
      :mp3_file_name path to source MP3 file

    Returns:
      True on success, False when a RuntimeError occurred (and debug is off).
    """
    result = True
    try:
        Log.level_push('Processing "{}"'.format(mp3_file_name))

        music_track = Mp3FileInfo(mp3_file_name)

        # some sanity checks first
        min_track_length = 1 + self.__config.tick_offset
        if music_track.duration < min_track_length:
            raise ValueError(
                'Track too short (min. {}, current len {})'.format(
                    min_track_length, music_track.duration))

        # check if we can create output file too
        if not self.__config.dry_run_mode:
            if os.path.exists(self.get_out_file_name(
                    music_track)) and not self.__config.force_overwrite:
                raise OSError(
                    'Target "{}" already exists. Use -f to force overwrite.'
                    .format(self.get_out_file_name(music_track)))

        # create temporary folder
        self.__make_temp_dir()

        # let's now create WAVs with our spoken parts.
        ticks = range(self.__config.tick_offset, music_track.duration,
                      self.__config.tick_interval)

        extras = {'config_name': self.__config.name}

        # First goes track title, then time ticks
        # NOTE: we will generate title WAV even if i.e. title_format is empty. This is intentional, to keep
        #       further logic simpler, because if both title and tick formats would be empty, then skipping
        #       WAV generation would left us with no speech overlay file for processing and mixing.
        #       I do not want to have the checks for such case
        track_title_to_speak = Util.prepare_for_speak(
            Util.process_placeholders(
                self.__config.title_format,
                Util.merge_dicts(music_track.get_placeholders(), extras)))
        Log.i('Announced as "{}"'.format(track_title_to_speak))
        Log.v('Announcement format "{}"'.format(
            self.__config.title_format))

        segments = [track_title_to_speak]
        if self.__config.tick_format != '':
            for time_marker in ticks:
                minutes = time_marker + self.__config.tick_add
                extras = {
                    'minutes': minutes,
                    'minutes_digits': Util.separate_chars(minutes),
                }
                tick_string = Util.process_placeholders(
                    self.__config.tick_format,
                    Util.merge_dicts(music_track.get_placeholders(), extras))
                segments.append(Util.prepare_for_speak(tick_string))

        if self.__config.dry_run_mode:
            Log.i('Duration {} mins, tick count: {}'.format(
                music_track.duration, (len(segments) - 1)))
            Log.v('Tick format "{}"'.format(self.__config.tick_format))

        if not self.__config.dry_run_mode:
            speech_wav_full = os.path.join(self.__tmp_dir, 'speech.wav')
            self.__create_voice_wav(segments, speech_wav_full)

            # convert source music track to WAV
            music_wav_full_path = os.path.join(
                self.__tmp_dir,
                os.path.basename(music_track.file_name) + '.wav')
            music_track.to_wav(music_wav_full_path)

            # calculate RMS amplitude of music track as reference to gain voice to match
            rms_amplitude = self.__audio.calculate_rms_amplitude(
                music_wav_full_path)
            target_speech_rms_amplitude = rms_amplitude * self.__config.speech_volume_factor
            # BUGFIX: the gain must be applied to the speech overlay, not to
            # the music track itself (it previously normalized the music WAV)
            self.__audio.adjust_wav_amplitude(speech_wav_full,
                                              target_speech_rms_amplitude)

        # mix all stuff together
        file_out = self.get_out_file_name(music_track)
        if not self.__config.dry_run_mode:
            Log.i('Writing: "{}"'.format(file_out))

            # noinspection PyProtectedMember
            self.__tmp_mp3_file = os.path.join(
                os.path.dirname(file_out),
                next(tempfile._get_candidate_names()) + '.mp3')
            # noinspection PyUnboundLocalVariable
            self.__audio.mix_wav_tracks(
                self.__tmp_mp3_file,
                music_track.get_encoding_quality_for_lame_encoder(),
                [music_wav_full_path, speech_wav_full])

            # copy some ID tags to newly create MP3 file
            music_track.write_id3_tags(self.__tmp_mp3_file)

            # replace the target atomically-ish: remove old, rename temp
            if os.path.exists(file_out):
                os.remove(file_out)
            os.rename(self.__tmp_mp3_file, file_out)
            self.__tmp_mp3_file = None
        else:
            output_file_msg = 'Output file "{}"'.format(file_out)
            if os.path.exists(self.get_out_file_name(music_track)):
                output_file_msg += ' *** TARGET FILE ALREADY EXISTS ***'
            Log.i(output_file_msg)
            Log.v('Output file name format "{}"'.format(
                self.__config.file_out_format))
        Log.i('')
    except RuntimeError as ex:
        if not self.__config.debug:
            Log.e(ex)
        else:
            raise
        result = False
    finally:
        Log.level_pop()
        self.__cleanup()

    return result