def do_transform(self, in_buffer, out_buffer):
    """Run the data of ``in_buffer`` through the
    :class:`~audiotsm.base.tsm.TSM` object and write the output to
    ``out_buffer``.

    The presentation timestamp and duration of ``out_buffer`` are set from
    the running output position, which is then advanced by that duration.

    :param in_buffer: a ``Gst.Buffer`` containing the input data.
    :param out_buffer: a ``Gst.Buffer`` where the output data will be
        written.
    :returns: ``Gst.FlowReturn.OK``.
    """
    # There is a bug that increases the refcount of out_buffer, making it
    # non writable (https://bugzilla.gnome.org/show_bug.cgi?id=727702#c4).
    # Set the refcount to 1 to fix this.
    refcount = out_buffer.mini_object.refcount
    out_buffer.mini_object.refcount = 1

    try:
        # Set the position of the output buffer
        out_buffer.pts = self._position

        # Run the TSM procedure
        reader = ArrayReader(self._gstbuffer_to_ndarray(in_buffer))
        writer = ArrayWriter(self._channels)

        self._tsm.run(reader, writer, flush=False)

        self._ndarray_to_gstbuffer(out_buffer, writer.data)
        out_buffer.duration = (
            (out_buffer.get_size() * Gst.SECOND) //
            (self._bps * self._samplerate)
        )
        self._position += out_buffer.duration
    finally:
        # Reset the refcount on every exit path, so that the temporary
        # override never outlives this call even if the TSM procedure or a
        # buffer conversion raises.
        out_buffer.mini_object.refcount = refcount

    return Gst.FlowReturn.OK
def test_cbuffer_read_from(in_buffer, write_buffer, out_n, out_data):
    """Run tests for the CBuffer.read_from method."""
    source = ArrayReader(np.array(write_buffer))

    # Fill the buffer under test from the reader and record how many
    # samples it reports having read.
    read_count = in_buffer.read_from(source)

    assert read_count == out_n
    assert_almost_equal(in_buffer.to_array(), out_data)
def wsola_sample(sample, speed):
    """Scale ``sample`` by ``speed`` with WSOLA.

    :param sample: the samples to process; reshaped to a single row of
        ``len(sample)`` values before being read.
    :param speed: the speed factor passed to ``wsola``.
    :returns: the first (only) channel of the data collected by the writer.
    """
    single_channel = sample.reshape(1, len(sample))

    source = ArrayReader(single_channel)
    sink = ArrayWriter(channels=1)
    wsola(channels=1, speed=speed).run(source, sink)

    return sink.data[0]
def time_stretch(data, speed):
    """Time-stretch ``data`` with a single-channel WSOLA procedure.

    :param data: the samples to process; reshaped to one row before being
        read.
    :param speed: the speed factor passed to ``wsola``.
    :returns: the writer's data, transposed, made contiguous and flattened
        to one dimension.
    """
    single_channel = data.reshape(1, -1)

    source = ArrayReader(single_channel)
    sink = ArrayWriter(channels=1)
    wsola(channels=1, speed=speed).run(source, sink)

    return np.ascontiguousarray(sink.data.T).flatten()
def test_skip(data_in, n_in, n_out, data_out):
    """Run tests for the ArrayReader.skip method."""
    reader = ArrayReader(np.array(data_in))

    skipped = reader.skip(n_in)
    assert skipped == n_out

    # Whatever was not skipped must still come out of the reader
    remaining = np.zeros_like(data_out)
    reader.read(remaining)
    assert_almost_equal(remaining, data_out)

    # A further read must leave the buffer untouched and report 0 samples
    leftover = np.zeros_like(data_in)
    read_count = reader.read(leftover)
    assert not leftover.any()
    assert read_count == 0
def execute(self):
    """Re-time the audio of each edit point and pass it to the output.

    For every edit point the matching slice of ``parameter.audio_data`` is
    run through a phase vocoder at the speed selected by the edit point's
    ``should_keep`` flag, then handed to the output object together with
    the output frame range it covers. The output is closed at the end.
    """
    # get values of audio frames, 0 for silence, 1 for loudness.
    loud_flags = self.get_loud_frame()
    # get edit points of silence and loudness.
    edit_points = self.get_edit_points(loud_flags)

    cursor = 0  # running position in the output audio, in samples
    output = self.get_output()

    for edit_point in edit_points:
        params = self.parameter
        first_sample = int(edit_point.start_frame * params.samples_per_frame)
        last_sample = int(edit_point.end_frame * params.samples_per_frame)
        segment = params.audio_data[first_sample:last_sample]

        # need channels * frames, transpose data first.
        reader = ArrayReader(np.transpose(segment))
        writer = ArrayWriter(reader.channels)
        speed = params.new_speed[int(edit_point.should_keep)]
        phasevocoder(reader.channels, speed=speed).run(reader, writer)

        stretched = np.transpose(writer.data)
        stretched_length = stretched.shape[0]

        if stretched_length >= params.audio_fade_envelope_size:
            self.fade_out_silence(stretched)
        else:
            # audio is less than 0.01 sec, let's just remove it.
            stretched[:] = 0

        next_cursor = cursor + stretched_length
        start_output_frame = int(
            math.ceil(cursor / params.samples_per_frame))
        end_output_frame = int(
            math.ceil(next_cursor / params.samples_per_frame))

        output.apply_edit_point(edit_point, stretched,
                                start_output_frame, end_output_frame)
        cursor = next_cursor

    output.close()
def test_read(data_in, read_out, n_out, data_out):
    """Run tests for the ArrayReader.read method."""
    reader = ArrayReader(np.array(data_in))

    first_read = np.zeros_like(read_out, dtype=np.float32)
    read_count = reader.read(first_read)
    assert_almost_equal(first_read, read_out)
    assert read_count == n_out

    # Whatever the first read did not consume must still come out
    remaining = np.zeros_like(data_out)
    reader.read(remaining)
    assert_almost_equal(remaining, data_out)

    # A further read must leave the buffer untouched and report 0 samples
    leftover = np.zeros_like(data_in)
    read_count = reader.read(leftover)
    assert not leftover.any()
    assert read_count == 0
""" # pylint: disable=invalid-name import numpy as np import sounddevice as sd from audiotsm import wsola from audiotsm.io.array import ArrayReader, ArrayWriter # The parameters of the input signal length = 1 # in seconds samplerate = 44100 # in Hz frequency = 440 # an A4 # Generate the input signal time = np.linspace(0, length, int(length * samplerate)) input_signal = np.sin(np.pi * frequency * time).reshape((1, -1)) # Run the TSM procedure reader = ArrayReader(input_signal) writer = ArrayWriter(channels=1) tsm = wsola(channels=1, speed=0.5) tsm.run(reader, writer) # Play the output # This example was written to show how to use an ArrayWriter. If you want to # play the output of a TSM procedure you should use an # audiotsm.io.stream.StreamWriter. sd.play(np.ascontiguousarray(writer.data.T), samplerate, blocking=True)
def speed_up_video(input_file: str,
                   output_file: str = None,
                   frame_rate: float = 30,
                   sample_rate: int = 44100,
                   silent_threshold: float = 0.03,
                   silent_speed: float = 5.0,
                   sounded_speed: float = 1.0,
                   frame_spreadage: int = 1,
                   audio_fade_envelope_size: int = 400,
                   temp_folder: str = 'TEMP') -> None:
    """
    Speeds up a video file with different speeds for the silent and loud sections in the video.

    The function shells out to ``ffprobe`` and ``ffmpeg`` and uses ``temp_folder`` as scratch space: an existing
    folder of that name is deleted first, and the folder is deleted again once the output has been generated.

    :param input_file: The file name of the video to be sped up.
    :param output_file: The file name of the output file. If not given will be 'input_file'_ALTERED.ext.
    :param frame_rate: The frame rate of the given video. Only needed if not extractable through ffmpeg.
    :param sample_rate: The sample rate of the audio in the video.
    :param silent_threshold: The threshold when a chunk counts towards being a silent chunk.
                             Value ranges from 0 (nothing) - 1 (max volume).
    :param silent_speed: The speed of the silent chunks.
    :param sounded_speed: The speed of the loud chunks.
    :param frame_spreadage: How many silent frames adjacent to sounded frames should be included to provide context.
    :param audio_fade_envelope_size: Audio transition smoothing duration in samples.
    :param temp_folder: The file path of the temporary working folder.
    """
    # Set output file name based on input file name if none was given
    if output_file is None:
        output_file = _input_to_output_filename(input_file)

    # Create Temp Folder
    if os.path.exists(temp_folder):
        _delete_path(temp_folder)
    _create_path(temp_folder)

    audio_path = temp_folder + '/audio.wav'
    new_audio_path = temp_folder + '/audioNew.wav'
    filter_graph_path = temp_folder + '/filterGraph.txt'

    # Find out framerate and duration of the input video.
    # The arguments are passed as a list (no shell) so that a file name
    # containing quotes or shell metacharacters cannot alter the command.
    probe_command = [
        'ffprobe', '-i', input_file, '-hide_banner', '-loglevel', 'error',
        '-select_streams', 'v',
        '-show_entries', 'format=duration:stream=avg_frame_rate',
    ]
    try:
        probe = subprocess.run(probe_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                               universal_newlines=True)
        std_out = probe.stdout
    except OSError:
        # ffprobe could not be started: keep going with the caller-supplied
        # frame rate and an unknown duration, as with empty probe output.
        std_out = ''

    match_frame_rate = re.search(r'frame_rate=(\d+)/(\d+)', std_out)
    if match_frame_rate is not None:
        numerator = float(match_frame_rate.group(1))
        denominator = float(match_frame_rate.group(2))
        # A rate such as "0/0" is unusable; keep the caller-supplied
        # frame_rate instead of dividing by zero here or further down.
        if numerator > 0 and denominator > 0:
            frame_rate = numerator / denominator

    match_duration = re.search(r'duration=(\d+(?:\.\d+)?)', std_out)
    original_duration = 0.0
    if match_duration is not None:
        original_duration = float(match_duration.group(1))

    # Extract the audio (the output path is quoted like every other path so
    # that a temp folder containing spaces works)
    command = 'ffmpeg -i "{}" -ab 160k -ac 2 -ar {} -vn "{}" -hide_banner' \
        .format(input_file, sample_rate, audio_path)

    _run_timed_ffmpeg_command(command, total=int(original_duration * frame_rate), unit='frames',
                              desc='Extracting audio:')

    wav_sample_rate, audio_data = wavfile.read(audio_path)
    audio_sample_count = audio_data.shape[0]
    max_audio_volume = _get_max_volume(audio_data)
    samples_per_frame = wav_sample_rate / frame_rate
    audio_frame_count = int(math.ceil(audio_sample_count / samples_per_frame))

    # Find frames with loud audio
    has_loud_audio = np.zeros(audio_frame_count, dtype=bool)

    for i in range(audio_frame_count):
        start = int(i * samples_per_frame)
        end = min(int((i + 1) * samples_per_frame), audio_sample_count)
        audio_chunk = audio_data[start:end]
        chunk_max_volume = float(_get_max_volume(audio_chunk)) / max_audio_volume
        if chunk_max_volume >= silent_threshold:
            has_loud_audio[i] = True

    # Chunk the frames together that are quiet or loud.
    # Each chunk is [start_frame, end_frame, is_included]; the leading
    # [0, 0, 0] is a sentinel so chunks[-1][1] is always defined, and it is
    # dropped once the list is complete.
    chunks = [[0, 0, 0]]
    should_include_frame = np.zeros(audio_frame_count, dtype=bool)
    for i in tqdm(range(audio_frame_count), desc='Finding chunks:', unit='frames'):
        start = int(max(0, i - frame_spreadage))
        end = int(min(audio_frame_count, i + 1 + frame_spreadage))
        should_include_frame[i] = np.any(has_loud_audio[start:end])
        if i >= 1 and should_include_frame[i] != should_include_frame[i - 1]:  # Did we flip?
            chunks.append([chunks[-1][1], i, should_include_frame[i - 1]])

    chunks.append([chunks[-1][1], audio_frame_count, should_include_frame[audio_frame_count - 1]])
    chunks = chunks[1:]

    # Generate audio data with varying speed for each chunk
    new_speeds = [silent_speed, sounded_speed]
    output_pointer = 0
    audio_buffers = []
    for index, chunk in tqdm(enumerate(chunks), total=len(chunks), desc='Changing audio:', unit='chunks'):
        audio_chunk = audio_data[int(chunk[0] * samples_per_frame):int(chunk[1] * samples_per_frame)]

        # The reader is given the transposed chunk
        reader = ArrayReader(np.transpose(audio_chunk))
        writer = ArrayWriter(reader.channels)
        tsm = phasevocoder(reader.channels, speed=new_speeds[int(chunk[2])])
        tsm.run(reader, writer)
        altered_audio_data = np.transpose(writer.data)

        # smooth out transition's audio by quickly fading in/out
        if altered_audio_data.shape[0] < audio_fade_envelope_size:
            altered_audio_data[:] = 0  # audio is less than 0.01 sec, let's just remove it.
        else:
            premask = np.arange(audio_fade_envelope_size) / audio_fade_envelope_size
            mask = np.repeat(premask[:, np.newaxis], 2, axis=1)  # make the fade-envelope mask stereo
            altered_audio_data[:audio_fade_envelope_size] *= mask
            altered_audio_data[-audio_fade_envelope_size:] *= 1 - mask

        audio_buffers.append(altered_audio_data / max_audio_volume)

        end_pointer = output_pointer + altered_audio_data.shape[0]
        start_output_frame = int(math.ceil(output_pointer / samples_per_frame))
        end_output_frame = int(math.ceil(end_pointer / samples_per_frame))
        # From here on a chunk is [start_frame, end_frame, start_output_frame, end_output_frame]
        chunks[index] = chunk[:2] + [start_output_frame, end_output_frame]
        output_pointer = end_pointer

    output_audio_data = np.concatenate(audio_buffers)
    wavfile.write(new_audio_path, sample_rate, output_audio_data)

    # Cut the video parts to length
    expression = _get_tree_expression(chunks)

    # The context manager closes the file even if a write fails
    with open(filter_graph_path, 'w') as filter_graph_file:
        filter_graph_file.write(f'fps=fps={frame_rate},setpts=')
        filter_graph_file.write(expression.replace(',', '\\,'))

    command = 'ffmpeg -i "{}" -i "{}" -filter_script:v "{}" -map 0 -map -0:a -map 1:a -c:a aac "{}"' \
              ' -loglevel warning -stats -y -hide_banner' \
        .format(input_file, new_audio_path, filter_graph_path, output_file)

    _run_timed_ffmpeg_command(command, total=chunks[-1][3], unit='frames', desc='Generating final:')

    _delete_path(temp_folder)
def apply_speed_to_audio(audio, speed):
    """Change the speed of ``audio`` with a phase vocoder.

    :param audio: the array handed to ``ArrayReader``.
        NOTE(review): presumably shaped (channels, samples) - confirm
        against the callers.
    :param speed: the speed factor passed to ``phasevocoder``.
    :returns: the data collected by the ``ArrayWriter``.
    """
    reader = ArrayReader(audio)
    # Take the channel count from the reader instead of hard-coding 2, so
    # the reader, the TSM object and the writer always agree on it.
    writer = ArrayWriter(reader.channels)
    tsm = phasevocoder(reader.channels, speed=speed)
    tsm.run(reader, writer)
    return writer.data