def skip_frames_ffmpeg(filename, skip=0):
    """
    Time-shrinks the video by skipping (discarding) every n frames determined
    by `skip`. To discard half of the frames (i.e. double the speed of the
    video) use `skip=1`.

    Args:
        filename (str): Path to the video file to process.
        skip (int, optional): Discard `skip` frames before keeping one.
            Defaults to 0 (no-op).

    Outputs:
        `filename`_skip.<original file extension>
    """
    # Nothing to do when no frames are skipped.
    if skip == 0:
        return
    import os
    of, fex = os.path.splitext(filename)
    # Speed up the video timestamps and the audio tempo by the same factor
    # (skip + 1) so both streams stay in sync.
    pts_ratio = 1 / (skip + 1)
    atempo_ratio = skip + 1
    outname = of + '_skip' + fex
    if has_audio(filename):
        # Video and audio are filtered together; '-shortest' trims the output
        # to the shorter of the two mapped streams.
        cmd = ['ffmpeg', '-y', '-i', filename, '-filter_complex',
               f'[0:v]setpts={pts_ratio}*PTS[v];[0:a]atempo={atempo_ratio}[a]',
               '-map', '[v]', '-map', '[a]', '-q:v', '3', '-shortest', outname]
    else:
        # No audio track: only retime the video stream.
        cmd = ['ffmpeg', '-y', '-i', filename, '-filter_complex',
               f'[0:v]setpts={pts_ratio}*PTS[v]',
               '-map', '[v]', '-q:v', '3', outname]
    # NOTE(review): ffmpeg's atempo filter only accepts factors in [0.5, 100];
    # presumably skip stays below 100 in practice — confirm with callers.
    ffmpeg_cmd(cmd, get_length(filename), pb_prefix='Skipping frames:')
def skip_frames_ffmpeg(filename, skip=0):
    """
    Time-shrinks the video by skipping (discarding) every n frames determined
    by `skip`. To discard half of the frames (i.e. double the speed of the
    video) use `skip=1`.

    Args:
        filename (str): Path to the video to process.
        skip (int, optional): Discard `skip` frames before keeping one.
            Defaults to 0.

    Outputs:
        `filename`_skip.<file extension>
    """
    # skip == 0 means "keep every frame": nothing to do.
    if skip == 0:
        return
    import os
    base, ext = os.path.splitext(filename)
    target = base + '_skip' + ext
    # Video timestamps are compressed and audio tempo stretched by the same
    # factor so the two streams remain in sync.
    speedup = skip + 1
    video_filter = f'[0:v]setpts={1 / speedup}*PTS[v]'
    cmd = ['ffmpeg', '-y', '-i', filename]
    if has_audio(filename):
        cmd += ['-filter_complex',
                video_filter + f';[0:a]atempo={speedup}[a]',
                '-map', '[v]', '-map', '[a]',
                '-q:v', '3', '-shortest', target]
    else:
        cmd += ['-filter_complex', video_filter,
                '-map', '[v]', '-q:v', '3', target]
    ffmpeg_cmd(cmd, get_length(filename), pb_prefix='Skipping frames:')
def mg_videoreader(filename, starttime=0, endtime=0, skip=0, rotate=0, contrast=0, brightness=0, crop='None', color=True, keep_all=False, returned_by_process=False):
    """
    Reads in a video file, and optionally apply several different processes on it.
    These include:
    - trimming,
    - skipping,
    - rotating,
    - applying brightness and contrast,
    - cropping,
    - converting to grayscale.

    Parameters
    ----------
    - filename : str
        Path to the input video file.
    - starttime : int or float, optional
        Trims the video from this start time (s).
    - endtime : int or float, optional
        Trims the video until this end time (s).
    - skip : int, optional
        Time-shrinks the video by skipping (discarding) every n frames
        determined by `skip`.
    - rotate : int or float, optional
        Rotates the video by a `rotate` degrees.
    - contrast : int or float, optional
        Applies +/- 100 contrast to video.
    - brightness : int or float, optional
        Applies +/- 100 brightness to video.
    - crop : {'none', 'manual', 'auto'}, optional
        If `manual`, opens a window displaying the first frame of the input
        video file, where the user can draw a rectangle to which cropping is
        applied. If `auto` the cropping function attempts to determine the
        area of significant motion and applies the cropping to that area.
    - color : bool, optional
        Default is `True`. If `False`, converts the video to grayscale and
        sets every method in grayscale mode.
    - keep_all : bool, optional
        Default is `False`. If `True`, preserves an output video file after
        each used preprocessing stage.
    - returned_by_process : bool, optional
        For internal use only; skips the grayscale conversion step.
        Default is `False`.

    Outputs
    -------
    - A video file with the applied processes. The name of the file will be
      `filename` + a suffix for each process.

    Returns
    -------
    - length : int
        The number of frames in the output video file.
    - width : int
        The pixel width of the output video file.
    - height : int
        The pixel height of the output video file.
    - fps : int
        The FPS (frames per second) of the output video file.
    - endtime : float
        The length of the output video file in seconds.
    - of : str
        The path to the output video file without its extension. The file
        name gets a suffix for each used process.
    - fex : str
        The file extension of the output video file.
    - video_has_audio_track : bool
        Whether the output video file has an audio track.
    """
    # Separate filename from file extension
    of, fex = os.path.splitext(filename)
    # Track which stages ran, so intermediate files can be cleaned up
    # (unless keep_all is set).
    trimming = False
    skipping = False
    rotating = False
    cbing = False
    cropping = False
    # Cut out relevant bit of video using starttime and endtime
    if starttime != 0 or endtime != 0:
        extract_subclip(filename, starttime, endtime,
                        targetname=of + '_trim' + fex)
        of = of + '_trim'
        trimming = True
    # To skip ahead a few frames before the next sample set skip to a value above 0
    if skip != 0:
        skip_frames_ffmpeg(of + fex, skip)
        if not keep_all and trimming:
            os.remove(of + fex)
        of = of + '_skip'
        skipping = True
    length = get_framecount(of + fex)
    fps = get_fps(of + fex)
    # Overwrite the input value for endtime not to cut the video at 0
    if endtime == 0:
        endtime = length / fps
    if rotate != 0:
        rotate_video(of + fex, rotate)
        if not keep_all and (skipping or trimming):
            os.remove(of + fex)
        of = of + '_rot'
        rotating = True
    # Apply contrast/brightness before the motion analysis
    if contrast != 0 or brightness != 0:
        contrast_brightness_ffmpeg(of + fex,
                                   contrast=contrast,
                                   brightness=brightness)
        if not keep_all and (rotating or skipping or trimming):
            os.remove(of + fex)
        of = of + '_cb'
        cbing = True
    # Crops video either manually or automatically
    if crop.lower() != 'none':
        mg_cropvideo_ffmpeg(of + fex, crop_movement=crop)
        if not keep_all and (cbing or rotating or skipping or trimming):
            os.remove(of + fex)
        of = of + '_crop'
        cropping = True
    if color == False and returned_by_process == False:
        of_gray, fex = convert_to_grayscale(of + fex)
        if not keep_all and (cropping or cbing or rotating or skipping or trimming):
            os.remove(of + fex)
        of = of_gray
    width, height = get_widthheight(of + fex)
    video_has_audio_track = has_audio(of + fex)
    return length, width, height, fps, endtime, of, fex, video_has_audio_track
def mg_audio_tempogram(filename=None, window_size=4096, overlap=8, mel_filters=512, power=2, dpi=300, autoshow=True):
    """
    Renders a figure with a plots of onset strength and tempogram of the
    video/audio file.

    Parameters
    ----------
    - filename : str, optional
        Path to the audio/video file to be processed.
    - window_size : int, optional
        The size of the FFT frame. Default is 4096.
    - overlap : int, optional
        The window overlap. The hop size is window_size / overlap.
        Example: window_size=1024, overlap=4 -> hop=256
    - mel_filters : int, optional
        The number of filters to use for filtering the frequency domain.
        Affects the vertical resolution (sharpness) of the spectrogram.
        NB: Too high values with relatively small window sizes can result in
        artifacts (typically black lines) in the resulting image.
        Default is 512.
    - power : int, float, optional
        The steepness of the curve for the color mapping. Default is 2.
    - dpi : int, optional
        Image quality of the rendered figure. Default is 300 DPI.
    - autoshow: bool, optional
        Whether to show the resulting figure automatically. Default is `True`
        (figure is shown).

    Outputs
    -------
    - `filename` + '_tempogram.png'

    Returns
    -------
    - MgFigure
        An MgFigure object referring to the internal figure and its data.
    """
    if filename == None:
        print("No filename was given.")
        return
    # The tempogram is computed from the audio track only.
    if not has_audio(filename):
        print('The video has no audio track.')
        return
    of, fex = os.path.splitext(filename)
    hop_size = int(window_size / overlap)
    # sr=None keeps the file's native sampling rate.
    y, sr = librosa.load(filename, sr=None)
    oenv = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_size)
    tempogram = librosa.feature.tempogram(onset_envelope=oenv,
                                          sr=sr,
                                          hop_length=hop_size)
    # Estimate the global tempo for display purposes
    tempo = librosa.beat.tempo(onset_envelope=oenv, sr=sr,
                               hop_length=hop_size)[0]
    # Two stacked subplots sharing the time axis: onset strength on top,
    # tempogram below.
    fig, ax = plt.subplots(nrows=2, figsize=(10, 6), dpi=dpi, sharex=True)
    times = librosa.times_like(oenv, sr=sr, hop_length=hop_size)
    ax[0].plot(times, oenv, label='Onset strength')
    ax[0].label_outer()
    ax[0].legend(frameon=True)
    librosa.display.specshow(tempogram,
                             sr=sr,
                             hop_length=hop_size,
                             x_axis='time',
                             y_axis='tempo',
                             cmap='magma',
                             ax=ax[1])
    # Mark the estimated global tempo as a dashed horizontal line.
    ax[1].axhline(tempo,
                  color='w',
                  linestyle='--',
                  alpha=1,
                  label='Estimated tempo={:g}'.format(tempo))
    ax[1].legend(loc='upper right')
    ax[1].set(title='Tempogram')
    plt.savefig('%s_tempogram.png' % of, format='png')
    if not autoshow:
        plt.close()
    # create MgFigure
    data = {
        "hop_size": hop_size,
        "sr": sr,
        "of": of,
        "times": times,
        "onset_env": oenv,
        "tempogram": tempogram,
        "tempo": tempo
    }
    mgf = MgFigure(figure=fig,
                   figure_type='audio.tempogram',
                   data=data,
                   layers=None,
                   image=of + '_tempogram.png')
    return mgf
def mg_audio_descriptors(filename=None, window_size=4096, overlap=8, mel_filters=512, power=2, dpi=300, autoshow=True):
    """
    Renders a figure of plots showing spectral/loudness descriptors, including
    RMS energy, spectral flatness, centroid, bandwidth, rolloff of the
    video/audio file.

    Parameters
    ----------
    - filename : str, optional
        Path to the audio/video file to be processed.
    - window_size : int, optional
        The size of the FFT frame. Default is 4096.
    - overlap : int, optional
        The window overlap. The hop size is window_size / overlap.
        Example: window_size=1024, overlap=4 -> hop=256
    - mel_filters : int, optional
        The number of filters to use for filtering the frequency domain.
        Affects the vertical resolution (sharpness) of the spectrogram.
        NB: Too high values with relatively small window sizes can result in
        artifacts (typically black lines) in the resulting image.
        Default is 512.
    - power : int, float, optional
        The steepness of the curve for the color mapping. Default is 2.
    - dpi : int, optional
        Image quality of the rendered figure. Default is 300 DPI.
    - autoshow: bool, optional
        Whether to show the resulting figure automatically. Default is `True`
        (figure is shown).

    Outputs
    -------
    - `filename` + '_descriptors.png'

    Returns
    -------
    - MgFigure
        An MgFigure object referring to the internal figure and its data.
    """
    if filename == None:
        print("No filename was given.")
        return
    # Descriptors are computed from the audio track only.
    if not has_audio(filename):
        print('The video has no audio track.')
        return
    of, fex = os.path.splitext(filename)
    hop_size = int(window_size / overlap)
    # sr=None keeps the file's native sampling rate.
    y, sr = librosa.load(filename, sr=None)
    cent = librosa.feature.spectral_centroid(y=y,
                                             sr=sr,
                                             n_fft=window_size,
                                             hop_length=hop_size)
    spec_bw = librosa.feature.spectral_bandwidth(y=y,
                                                 sr=sr,
                                                 n_fft=window_size,
                                                 hop_length=hop_size)
    flatness = librosa.feature.spectral_flatness(y=y,
                                                 n_fft=window_size,
                                                 hop_length=hop_size)
    # Two roll-off curves bracketing nearly all of the spectral energy.
    rolloff = librosa.feature.spectral_rolloff(y=y,
                                               sr=sr,
                                               n_fft=window_size,
                                               hop_length=hop_size,
                                               roll_percent=0.99)
    rolloff_min = librosa.feature.spectral_rolloff(y=y,
                                                   sr=sr,
                                                   n_fft=window_size,
                                                   hop_length=hop_size,
                                                   roll_percent=0.01)
    rms = librosa.feature.rms(y=y,
                              frame_length=window_size,
                              hop_length=hop_size)
    S = librosa.feature.melspectrogram(y=y,
                                       sr=sr,
                                       n_mels=mel_filters,
                                       fmax=sr / 2,
                                       n_fft=window_size,
                                       hop_length=hop_size,
                                       power=power)
    # Three stacked subplots sharing the time axis:
    # RMS (top), flatness (middle), mel spectrogram + descriptors (bottom).
    fig, ax = plt.subplots(figsize=(12, 8), dpi=dpi, nrows=3, sharex=True)
    img = librosa.display.specshow(librosa.power_to_db(S,
                                                       ref=np.max,
                                                       top_db=120),
                                   sr=sr,
                                   y_axis='mel',
                                   fmax=sr / 2,
                                   x_axis='time',
                                   hop_length=hop_size,
                                   ax=ax[2])
    # get rid of "default" ticks
    ax[2].yaxis.set_minor_locator(matplotlib.ticker.NullLocator())
    ax[0].set(title=os.path.basename(filename))
    length = get_length(filename)
    plot_xticks = np.arange(0, length + 0.1, length / 20)
    ax[2].set(xticks=plot_xticks)
    # Build a quasi-logarithmic set of frequency ticks (x1.5 steps from
    # 250 Hz up to the Nyquist frequency), rounded to the nearest 10 Hz.
    freq_ticks = [250]
    freq = 500
    while freq < sr / 2:
        freq_ticks.append(freq)
        freq *= 1.5
    freq_ticks = [round(elem, -1) for elem in freq_ticks]
    freq_ticks_labels = [
        str(round(elem / 1000, 1)) + 'k' if elem > 1000 else int(round(elem))
        for elem in freq_ticks
    ]
    ax[2].set(yticks=(freq_ticks))
    ax[2].set(yticklabels=(freq_ticks_labels))
    times = librosa.times_like(cent,
                               sr=sr,
                               n_fft=window_size,
                               hop_length=hop_size)
    # Shade the band centroid +/- bandwidth on top of the spectrogram.
    ax[2].fill_between(times,
                       cent[0] - spec_bw[0],
                       cent[0] + spec_bw[0],
                       alpha=0.5,
                       label='Centroid +- bandwidth')
    ax[2].plot(times, cent.T, label='Centroid', color='y')
    ax[2].plot(times, rolloff[0], label='Roll-off frequency (0.99)')
    ax[2].plot(times,
               rolloff_min[0],
               color='r',
               label='Roll-off frequency (0.01)')
    ax[2].legend(loc='upper right')
    ax[1].plot(times, flatness.T, label='Flatness', color='y')
    ax[1].legend(loc='upper right')
    ax[0].semilogy(times, rms[0], label='RMS Energy')
    ax[0].legend(loc='upper right')
    plt.tight_layout()
    plt.savefig('%s_descriptors.png' % of, format='png')
    if not autoshow:
        plt.close()
    # create MgFigure
    data = {
        "hop_size": hop_size,
        "sr": sr,
        "of": of,
        "times": times,
        "S": S,
        "length": length,
        "cent": cent,
        "spec_bw": spec_bw,
        "rolloff": rolloff,
        "rolloff_min": rolloff_min,
        "flatness": flatness,
        "rms": rms
    }
    mgf = MgFigure(figure=fig,
                   figure_type='audio.descriptors',
                   data=data,
                   layers=None,
                   image=of + '_descriptors.png')
    return mgf
def mg_audio_spectrogram(filename=None, window_size=4096, overlap=8, mel_filters=512, power=2, dpi=300, autoshow=True):
    """
    Renders a figure showing the mel-scaled spectrogram of the video/audio
    file.

    Parameters
    ----------
    - filename : str, optional
        Path to the audio/video file to be processed.
    - window_size : int, optional
        The size of the FFT frame. Default is 4096.
    - overlap : int, optional
        The window overlap. The hop size is window_size / overlap.
        Example: window_size=1024, overlap=4 -> hop=256
    - mel_filters : int, optional
        The number of filters to use for filtering the frequency domain.
        Affects the vertical resolution (sharpness) of the spectrogram.
        NB: Too high values with relatively small window sizes can result in
        artifacts (typically black lines) in the resulting image.
        Default is 512.
    - power : int, float, optional
        The steepness of the curve for the color mapping. Default is 2.
    - dpi : int, optional
        Image quality of the rendered figure. Default is 300 DPI.
    - autoshow: bool, optional
        Whether to show the resulting figure automatically. Default is `True`
        (figure is shown).

    Outputs
    -------
    - `filename` + '_spectrogram.png'

    Returns
    -------
    - MgFigure
        An MgFigure object referring to the internal figure and its data.
    """
    if filename == None:
        print("No filename was given.")
        return
    # The spectrogram is computed from the audio track only.
    if not has_audio(filename):
        print('The video has no audio track.')
        return
    of, fex = os.path.splitext(filename)
    hop_size = int(window_size / overlap)
    # sr=None keeps the file's native sampling rate.
    y, sr = librosa.load(filename, sr=None)
    S = librosa.feature.melspectrogram(y=y,
                                       sr=sr,
                                       n_mels=mel_filters,
                                       fmax=sr / 2,
                                       n_fft=window_size,
                                       hop_length=hop_size,
                                       power=power)
    fig, ax = plt.subplots(figsize=(12, 6), dpi=dpi)
    img = librosa.display.specshow(librosa.power_to_db(S,
                                                       ref=np.max,
                                                       top_db=120),
                                   sr=sr,
                                   y_axis='mel',
                                   fmax=sr / 2,
                                   x_axis='time',
                                   hop_length=hop_size,
                                   ax=ax)
    colorbar_ticks = range(-120, 1, 10)
    fig.colorbar(img, format='%+2.0f dB', ticks=colorbar_ticks)
    # get rid of "default" ticks
    ax.yaxis.set_minor_locator(matplotlib.ticker.NullLocator())
    ax.set(title=os.path.basename(filename))
    length = get_length(filename)
    plot_xticks = np.arange(0, length + 0.1, length / 20)
    ax.set(xticks=plot_xticks)
    # Build a quasi-logarithmic set of frequency ticks (x1.3 steps from
    # 100 Hz up to and including the Nyquist frequency), rounded to 100 Hz.
    freq_ticks = []
    freq = 100
    while freq < sr / 2:
        freq_ticks.append(freq)
        freq *= 1.3
    freq_ticks = [round(elem, -2) for elem in freq_ticks]
    freq_ticks.append(sr / 2)
    freq_ticks_labels = [
        str(round(elem / 1000, 1)) + 'k' if elem > 1000 else int(round(elem))
        for elem in freq_ticks
    ]
    ax.set(yticks=(freq_ticks))
    ax.set(yticklabels=(freq_ticks_labels))
    plt.tight_layout()
    plt.savefig('%s_spectrogram.png' % of, format='png')
    if not autoshow:
        plt.close()
    # create MgFigure
    data = {"hop_size": hop_size, "sr": sr, "of": of, "S": S, "length": length}
    mgf = MgFigure(figure=fig,
                   figure_type='audio.spectrogram',
                   data=data,
                   layers=None,
                   image=of + '_spectrogram.png')
    return mgf
def mg_videoreader(filename, starttime=0, endtime=0, skip=0, rotate=0, contrast=0, brightness=0, crop='None', color=True, keep_all=False, returned_by_process=False): """ Reads in a video file, and optionally apply several different processes on it. These include: - trimming, - skipping, - rotating, - applying brightness and contrast, - cropping, - converting to grayscale. Args: filename (str): Path to the input video file. starttime (int or float, optional): Trims the video from this start time (s). Defaults to 0. endtime (int or float, optional): Trims the video until this end time (s). Defaults to 0 (which will make the algorithm use the full length of the input video instead). skip (int, optional): Time-shrinks the video by skipping (discarding) every n frames determined by `skip`. Defaults to 0. rotate (int or float, optional): Rotates the video by a `rotate` degrees. Defaults to 0. contrast (int or float, optional): Applies +/- 100 contrast to video. Defaults to 0. brightness (int or float, optional): Applies +/- 100 brightness to video. Defaults to 0. crop (str, optional): If 'manual', opens a window displaying the first frame of the input video file, where the user can draw a rectangle to which cropping is applied. If 'auto' the cropping function attempts to determine the area of significant motion and applies the cropping to that area. Defaults to 'None'. color (bool, optional): If False, converts the video to grayscale and sets every method in grayscale mode. Defaults to True. keep_all (bool, optional): If True, preserves an output video file after each used preprocessing stage. Defaults to False. returned_by_process (bool, optional): This parameter is only for internal use, do not use it. Defaults to False. Outputs: A video file with the applied processes. The name of the file will be `filename` + a suffix for each process. Returns: int: The number of frames in the output video file. int: The pixel width of the output video file. 
int: The pixel height of the output video file. int: The FPS (frames per second) of the output video file. float: The length of the output video file in seconds. str: The path to the output video file without its extension. The file name gets a suffix for each used process. str: The file extension of the output video file. bool: Whether the video has an audio track. """ # Separate filename from file extension of, fex = os.path.splitext(filename) trimming = False skipping = False rotating = False cbing = False cropping = False # Cut out relevant bit of video using starttime and endtime if starttime != 0 or endtime != 0: extract_subclip(filename, starttime, endtime, targetname=of + '_trim' + fex) of = of + '_trim' trimming = True if skip != 0: skip_frames_ffmpeg(of + fex, skip) if not keep_all and trimming: os.remove(of + fex) of = of + '_skip' skipping = True length = get_framecount(of + fex) fps = get_fps(of + fex) # 0 means full length if endtime == 0: endtime = length / fps if rotate != 0: rotate_video(of + fex, rotate) if not keep_all and (skipping or trimming): os.remove(of + fex) of = of + '_rot' rotating = True # Apply contrast/brightness before the motion analysis if contrast != 0 or brightness != 0: contrast_brightness_ffmpeg(of + fex, contrast=contrast, brightness=brightness) if not keep_all and (rotating or skipping or trimming): os.remove(of + fex) of = of + '_cb' cbing = True # Crops video either manually or automatically if crop.lower() != 'none': mg_cropvideo_ffmpeg(of + fex, crop_movement=crop) if not keep_all and (cbing or rotating or skipping or trimming): os.remove(of + fex) of = of + '_crop' cropping = True if color == False and returned_by_process == False: of_gray, fex = convert_to_grayscale(of + fex) if not keep_all and (cropping or cbing or rotating or skipping or trimming): os.remove(of + fex) of = of_gray width, height = get_widthheight(of + fex) video_has_audio_track = has_audio(of + fex) return length, width, height, fps, endtime, of, 
fex, video_has_audio_track
def mg_audio_tempogram(filename=None, window_size=4096, overlap=8, mel_filters=512, power=2, dpi=300, autoshow=True, title=None):
    """
    Renders a figure with a plots of onset strength and tempogram of the
    video/audio file.

    Args:
        filename (str, optional): Path to the audio/video file to be
            processed. Defaults to None.
        window_size (int, optional): The size of the FFT frame. Defaults to
            4096.
        overlap (int, optional): The window overlap. The hop size is
            window_size / overlap. Example: window_size=1024, overlap=4 ->
            hop=256. Defaults to 8.
        mel_filters (int, optional): The number of filters to use for
            filtering the frequency domain. Affects the vertical resolution
            (sharpness) of the spectrogram. NB: Too high values with
            relatively small window sizes can result in artifacts (typically
            black lines) in the resulting image. Defaults to 512.
        power (float, optional): The steepness of the curve for the color
            mapping. Defaults to 2.
        dpi (int, optional): Image quality of the rendered figure in DPI.
            Defaults to 300.
        autoshow (bool, optional): Whether to show the resulting figure
            automatically. Defaults to True.
        title (str, optional): Optionally add title to the figure. Defaults
            to None, which uses the file name as a title.

    Outputs:
        `filename`_tempogram.png

    Returns:
        MgFigure: An MgFigure object referring to the internal figure and its
        data.
    """
    if filename == None:
        print("No filename was given.")
        return
    # The tempogram is computed from the audio track only.
    if not has_audio(filename):
        print('The video has no audio track.')
        return
    of, fex = os.path.splitext(filename)
    hop_size = int(window_size / overlap)
    # sr=None keeps the file's native sampling rate.
    y, sr = librosa.load(filename, sr=None)
    oenv = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_size)
    tempogram = librosa.feature.tempogram(onset_envelope=oenv,
                                          sr=sr,
                                          hop_length=hop_size)
    # Estimate the global tempo for display purposes
    tempo = librosa.beat.tempo(onset_envelope=oenv, sr=sr,
                               hop_length=hop_size)[0]
    fig, ax = plt.subplots(nrows=2, figsize=(10, 6), dpi=dpi, sharex=True)
    # make sure background is white
    fig.patch.set_facecolor('white')
    fig.patch.set_alpha(1)
    # add title
    if title == None:
        title = os.path.basename(filename)
    fig.suptitle(title, fontsize=16)
    times = librosa.times_like(oenv, sr=sr, hop_length=hop_size)
    ax[0].plot(times, oenv, label='Onset strength')
    ax[0].label_outer()
    ax[0].legend(frameon=True)
    librosa.display.specshow(tempogram,
                             sr=sr,
                             hop_length=hop_size,
                             x_axis='time',
                             y_axis='tempo',
                             cmap='magma',
                             ax=ax[1])
    # Mark the estimated global tempo as a dashed horizontal line.
    ax[1].axhline(tempo,
                  color='w',
                  linestyle='--',
                  alpha=1,
                  label='Estimated tempo={:g}'.format(tempo))
    ax[1].legend(loc='upper right')
    ax[1].set(title='Tempogram')
    plt.savefig('%s_tempogram.png' % of, format='png', transparent=False)
    if not autoshow:
        plt.close()
    # create MgFigure
    data = {
        "hop_size": hop_size,
        "sr": sr,
        "of": of,
        "times": times,
        "onset_env": oenv,
        "tempogram": tempogram,
        "tempo": tempo
    }
    mgf = MgFigure(figure=fig,
                   figure_type='audio.tempogram',
                   data=data,
                   layers=None,
                   image=of + '_tempogram.png')
    return mgf
def spectrogram(self, window_size=4096, overlap=8, mel_filters=512, power=2, dpi=300, autoshow=True, title=None):
    """
    Renders a figure showing the mel-scaled spectrogram of the video/audio
    file.

    Args:
        window_size (int, optional): The size of the FFT frame. Defaults to
            4096.
        overlap (int, optional): The window overlap. The hop size is
            window_size / overlap. Example: window_size=1024, overlap=4 ->
            hop=256. Defaults to 8.
        mel_filters (int, optional): The number of filters to use for
            filtering the frequency domain. Affects the vertical resolution
            (sharpness) of the spectrogram. NB: Too high values with
            relatively small window sizes can result in artifacts (typically
            black lines) in the resulting image. Defaults to 512.
        power (float, optional): The steepness of the curve for the color
            mapping. Defaults to 2.
        dpi (int, optional): Image quality of the rendered figure in DPI.
            Defaults to 300.
        autoshow (bool, optional): Whether to show the resulting figure
            automatically. Defaults to True.
        title (str, optional): Optionally add title to the figure. Defaults
            to None, which uses the file name as a title.

    Outputs:
        `self.filename`_spectrogram.png

    Returns:
        MgFigure: An MgFigure object referring to the internal figure and its
        data.
    """
    # The spectrogram is computed from the audio track only.
    if not has_audio(self.filename):
        print('The video has no audio track.')
        return
    hop_size = int(window_size / overlap)
    # sr=None keeps the file's native sampling rate.
    y, sr = librosa.load(self.filename, sr=None)
    S = librosa.feature.melspectrogram(y=y,
                                       sr=sr,
                                       n_mels=mel_filters,
                                       fmax=sr / 2,
                                       n_fft=window_size,
                                       hop_length=hop_size,
                                       power=power)
    # Fix: honor the dpi parameter (was hardcoded to 300).
    fig, ax = plt.subplots(figsize=(12, 6), dpi=dpi)
    # make sure background is white
    fig.patch.set_facecolor('white')
    fig.patch.set_alpha(1)
    # add title
    if title == None:
        title = os.path.basename(self.filename)
    fig.suptitle(title, fontsize=16)
    img = librosa.display.specshow(librosa.power_to_db(S,
                                                       ref=np.max,
                                                       top_db=120),
                                   sr=sr,
                                   y_axis='mel',
                                   fmax=sr / 2,
                                   x_axis='time',
                                   hop_length=hop_size,
                                   ax=ax)
    colorbar_ticks = range(-120, 1, 10)
    fig.colorbar(img, format='%+2.0f dB', ticks=colorbar_ticks)
    # get rid of "default" ticks
    ax.yaxis.set_minor_locator(matplotlib.ticker.NullLocator())
    length = get_length(self.filename)
    plot_xticks = np.arange(0, length + 0.1, length / 20)
    ax.set(xticks=plot_xticks)
    # Build a quasi-logarithmic set of frequency ticks (x1.3 steps from
    # 100 Hz up to and including the Nyquist frequency), rounded to 100 Hz.
    freq_ticks = []
    freq = 100
    while freq < sr / 2:
        freq_ticks.append(freq)
        freq *= 1.3
    freq_ticks = [round(elem, -2) for elem in freq_ticks]
    freq_ticks.append(sr / 2)
    freq_ticks_labels = [
        str(round(elem / 1000, 1)) + 'k' if elem > 1000 else int(round(elem))
        for elem in freq_ticks
    ]
    ax.set(yticks=(freq_ticks))
    ax.set(yticklabels=(freq_ticks_labels))
    plt.tight_layout()
    plt.savefig('%s_spectrogram.png' % self.of, format='png',
                transparent=False)
    if not autoshow:
        plt.close()
    # create MgFigure
    data = {
        "hop_size": hop_size,
        "sr": sr,
        "of": self.of,
        "S": S,
        "length": length
    }
    mgf = MgFigure(figure=fig,
                   figure_type='audio.spectrogram',
                   data=data,
                   layers=None,
                   image=self.of + '_spectrogram.png')
    return mgf