def process_online(processor, infile, outfile, **kwargs):
    """
    Process a file or audio stream with the given Processor.

    Parameters
    ----------
    processor : :class:`Processor` instance
        Processor to be applied.
    infile : str or file handle, optional
        Input file (handle). If none is given, the stream present at the
        system's audio input is used. Additional keyword arguments can be
        used to influence the frame size and hop size.
    outfile : str or file handle
        Output file (handle).
    kwargs : dict, optional
        Keyword arguments passed to :class:`.audio.signal.Stream` if
        `infile` is 'None'.

    Notes
    -----
    Right now there is no way to determine if a processor is online-capable
    or not. Thus, calling any processor with this function may not produce
    the expected results.

    """
    from madmom.audio.signal import Stream, FramedSignal
    # set default values
    kwargs['sample_rate'] = kwargs.get('sample_rate', 44100)
    kwargs['num_channels'] = kwargs.get('num_channels', 1)
    # if no input file is given, create a Stream with the given arguments
    if infile is None:
        # open a stream and start it if not running already
        stream = Stream(**kwargs)
        if not stream.is_running():
            stream.start()
    # otherwise use the input file
    else:
        # set parameters for opening the file
        frame_size = kwargs.get('frame_size', 2048)
        hop_size = kwargs.get('hop_size', 441)
        fps = kwargs.get('fps')
        num_channels = kwargs.get('num_channels')
        # FIXME: overwrite the frame size with the maximum value of all used
        #        processors. This is needed if multiple frame sizes are used
        import warnings
        warnings.warn('make sure that the `frame_size` (%d) is equal to the '
                      'maximum value used by any `FramedSignalProcessor`.' %
                      frame_size)
        # Note: origin must be 'online' and num_frames 'None' to behave
        #       exactly the same as with live input
        stream = FramedSignal(infile, frame_size=frame_size,
                              hop_size=hop_size, fps=fps, origin='online',
                              num_frames=None, num_channels=num_channels)
    # set arguments for online processing
    # Note: pass only certain arguments, because these will be passed to the
    #       processors at every time step (kwargs contains file handles etc.)
    process_args = {'reset': False}  # do not reset stateful processors
    # process everything frame-by-frame
    for frame in stream:
        _process((processor, frame, outfile, process_args))
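# Usage sketch (not part of the source above): wrap an online-capable
# processor in an IOProcessor so each frame's result is written to `outfile`.
# The file name 'audio.wav' and the `write_activation` writer are
# hypothetical stand-ins.
import sys
from madmom.processors import IOProcessor
from madmom.features.beats import RNNBeatProcessor

def write_activation(data, out):
    # hypothetical per-frame writer
    out.write('{}\n'.format(data))

io_processor = IOProcessor(RNNBeatProcessor(online=True, origin='online'),
                           write_activation)
process_online(io_processor, infile='audio.wav', outfile=sys.stdout, fps=100)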
def __init__(self, audiofilename, midifilename, instrument, instruments,
             context, audio_options):
    super().__init__()
    self.audiofilename = audiofilename
    self.midifilename = midifilename
    self.instrument = instrument
    self.instruments = instruments

    self.instrument_number_onehot = torch.zeros(
        1, context['frame_size'], len(self.instruments))
    self.instrument_number_onehot[0, :, self.instruments[self.instrument]] = 1.

    self.audio_options = deepcopy(audio_options)

    spectrogram, y_frames, y_velocity = get_xy_from_file(
        self.audiofilename, self.midifilename, self.audio_options)

    self.spectrogram = FramedSignal(
        spectrogram,
        frame_size=context['frame_size'],
        hop_size=context['hop_size'],
        origin=context['origin'],
    )
    self.y_frames = FramedSignal(
        y_frames,
        frame_size=context['frame_size'],
        hop_size=context['hop_size'],
        origin=context['origin'],
    )
    self.y_velocity = FramedSignal(
        y_velocity,
        frame_size=context['frame_size'],
        hop_size=context['hop_size'],
        origin=context['origin'],
    )
    self.fixed_noise = FramedSignal(
        # the noise should be strictly positive ...
        np.abs(np.random.normal(0, 1, (len(spectrogram), 7))),
        frame_size=context['frame_size'],
        hop_size=context['hop_size'],
        origin=context['origin'],
    )

    if (len(self.spectrogram) != len(self.y_frames) or
            len(self.spectrogram) != len(self.y_velocity)):
        raise RuntimeError('x and y do not have the same length.')
def preprocess_sig(sig, frame_size):
    frames = FramedSignal(sig, frame_size=frame_size, fps=100)
    stft = ShortTimeFourierTransform(frames)
    filt = FilteredSpectrogram(stft, filterbank=MelFilterbank, num_bands=80,
                               fmin=27.5, fmax=16000, norm_filters=True,
                               unique_filters=False)
    log_filt = LogarithmicSpectrogram(filt, log=np.log, add=np.spacing(1))
    return log_filt
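# Usage sketch (the file name is a placeholder): load a mono signal and
# compute its 80-band logarithmic Mel spectrogram at 100 frames per second.
from madmom.audio.signal import Signal

sig = Signal('audio.wav', sample_rate=44100, num_channels=1)
log_spec = preprocess_sig(sig, frame_size=2048)
print(log_spec.shape)  # e.g. (num_frames, 80)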
def get_spectrogram(path, sample_rate=None, fps=None, window=np.hanning,
                    fft_sizes=[1024], filtered=True,
                    filterbank=LogarithmicFilterbank, num_bands=12,
                    fmin=30, fmax=17000):
    '''
    path: single file path
    filtered: generate a FilteredSpectrogram or a normal one

    return: numpy array shaped (Frequencies, Timeframes, Channels)
            (log-spaced (Filtered)Spectrogram from madmom)
    '''
    spectros = []
    max_fft_size = np.max(fft_sizes)
    # sample_rate=None keeps the original sample rate
    # only take 30s snippets to align data
    signal = Signal(path, sample_rate=sample_rate, start=0, stop=30)
    frames = FramedSignal(signal, fps=fps)
    channel_num = 0
    for fft_size in fft_sizes:
        stft = ShortTimeFourierTransform(frames, window=window,
                                         fft_size=fft_size,
                                         circular_shift=True)
        spectro = LogarithmicSpectrogram(stft)
        if filtered:
            filtered_spectro = FilteredSpectrogram(spectro,
                                                   filterbank=filterbank,
                                                   num_bands=num_bands,
                                                   fmin=fmin, fmax=fmax)
            spectros.append(filtered_spectro)
        else:
            spectros.append(spectro)

    # bring all spectros to the same shape, concatenate and return them
    num_frequencies = max([spectro.shape[1] for spectro in spectros])
    num_channels = len(spectros)
    num_timestamps = spectros[0].shape[0]
    final_spectro = np.zeros([num_frequencies, num_timestamps, num_channels])
    for channel, spectro in enumerate(spectros):
        final_spectro[:spectro.shape[1], :, channel] = spectro.T
    return final_spectro
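# Usage sketch (placeholder path): stack three FFT resolutions as channels of
# one log-filtered spectrogram array.
spec = get_spectrogram('audio.wav', sample_rate=44100, fps=100,
                       fft_sizes=[1024, 2048, 4096], filtered=True)
print(spec.shape)  # (frequencies, time frames, 3)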
def test_process_lstm(self):
    # load uni-directional RNN models
    self.processor = RNNBeatProcessor(online=True, origin='online')
    # process the whole sequence at once
    result = self.processor(sample_file)
    self.assertTrue(np.allclose(result, sample_lstm_act, atol=1e-5))
    # result must be the same if processed a second time
    result_1 = self.processor(sample_file)
    self.assertTrue(np.allclose(result, result_1))
    # result must be the same if processed frame-by-frame
    frames = FramedSignal(sample_file, origin='online')
    self.processor = RNNBeatProcessor(online=True, num_frames=1,
                                      origin='future')
    result_2 = np.hstack([self.processor(f, reset=False) for f in frames])
    self.assertTrue(np.allclose(result, result_2))
    # result must be different without resetting
    result_3 = np.hstack([self.processor(f, reset=False) for f in frames])
    self.assertFalse(np.allclose(result, result_3))
def preprocess_sig(sig, frame_size):
    frames = FramedSignal(sig, frame_size=frame_size, fps=100)
    stft = ShortTimeFourierTransform(frames)
    filt = FilteredSpectrogram(stft, num_bands=6)
    spec = np.log10(5 * filt + 1)
    # Calculate difference spectrogram with ratio 0.25
    diff_frames = _diff_frames(0.25, frame_size=frame_size, hop_size=441,
                               window=np.hanning)
    init = np.empty((diff_frames, spec.shape[1]))
    init[:] = np.inf
    spec = np.insert(spec, 0, init, axis=0)
    diff_spec = spec[diff_frames:] - spec[:-diff_frames]
    np.maximum(diff_spec, 0, out=diff_spec)
    diff_spec[np.isinf(diff_spec)] = 0
    diff_spec = np.hstack((spec[diff_frames:], diff_spec))
    return diff_spec
def __init__(self, audiofilename, midifilename, start_end=None):
    self.metadata = dict(
        audiofilename=audiofilename,
        midifilename=midifilename
    )
    x, y = get_xy_from_file(audiofilename, midifilename)
    if start_end is not None:
        start, end = start_end
        x = x[start:end]
        y = y[start:end]

    self.x = FramedSignal(
        x,
        frame_size=5,
        hop_size=1,
        origin='center'
    )
    self.y = y
    _, self.w, self.h = self.x.shape
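# Sketch of the context-window pattern used above (the input array is
# synthetic): FramedSignal can also slice a 2-D feature matrix into
# overlapping 5-frame windows with a hop size of 1, centred on each frame.
import numpy as np
from madmom.audio.signal import FramedSignal

features = np.random.rand(100, 40)             # (frames, feature bins)
windows = FramedSignal(features, frame_size=5, hop_size=1, origin='center')
print(len(windows), windows[0].shape)          # 100 windows of shape (5, 40)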
def get_spectrogram(path, filtered=True, window=np.hanning, fft_size=1024,
                    sample_rate=None):
    '''
    path: single file path
    filtered: generate a FilteredSpectrogram or a normal one

    return: log-magnitude (Filtered)Spectrogram from madmom,
            shaped (Timeframes, Frequencies)
    '''
    # sample_rate=None keeps the original sample rate
    signal = Signal(path, sample_rate=sample_rate)
    frames = FramedSignal(signal)
    stft = ShortTimeFourierTransform(frames, window=window, fft_size=fft_size)
    spectro = LogarithmicSpectrogram(stft)
    if filtered:
        return FilteredSpectrogram(spectro)
    else:
        return spectro
def __init__(self, audio_filename, midi_filename, input_context,
             target_maxfilter, audio_options, start_end=None,
             offset_suppression=None):
    self.metadata = dict(audio_filename=audio_filename,
                         midi_filename=midi_filename)
    self.audio_options = deepcopy(audio_options)

    x, y_onsets, y_frames, y_offsets = get_xy_from_file_subsampled(
        self.metadata['audio_filename'],
        self.metadata['midi_filename'],
        self.audio_options,
        start_end)

    self.y_onsets = widen(y_onsets, target_maxfilter['y_onsets'])
    self.y_frames = widen(y_frames, target_maxfilter['y_frames'])

    if offset_suppression is not None:
        # this gets passed the widened *onsets* already
        y_offsets = suppress_offets(y_onsets, y_offsets)

    # widen *after* suppression
    self.y_offsets = widen(y_offsets, target_maxfilter['y_offsets'])

    self.x = FramedSignal(
        x,
        frame_size=input_context['frame_size'],
        hop_size=input_context['hop_size'],
        origin=input_context['origin'],
    )

    if (len(self.x) != len(self.y_onsets) or
            len(self.x) != len(self.y_frames) or
            len(self.x) != len(self.y_offsets)):
        raise RuntimeError('x and y do not have the same length.')
def pre_process_cwt(onsets_images_dir, non_onsets_images_dir, audio_files, ann_files):
    # onsets_images_dir = join('dataset_transformed', 'train')  # , 'onsets')
    # non_onsets_images_dir = join('dataset_transformed', 'train')  # , 'non-onsets')
    onsets_images_dir = 'dataset_transformed'
    non_onsets_images_dir = 'dataset_transformed'
    dataset_dir = 'dataset'
    audio_files = list_audio_files(dataset_dir)
    ann_files = list_annotation_files(dataset_dir)
    frame_size = 1024
    sample_rate = 44100
    t = frame_size / sample_rate
    # t = 0.09287981859410431  # seconds for frame_size = 4096
    time = np.arange(frame_size, dtype=np.float16)
    scales = np.arange(1, 81)  # scaleogram with 80 rows
    print(f'There are {str(len(audio_files))} audio files and {str(len(ann_files))} annotation files')
    i = 0
    for audio_file in audio_files:
        file_name = basename(audio_file)
        print(f'Pre-processing file {str(i+1)}/{str(len(audio_files))}: {file_name}')
        # Read audio file
        sig = Signal(audio_file, sample_rate, num_channels=1)
        # Split audio signal into frames of the same size
        frames = FramedSignal(sig, frame_size, hop_size=frame_size)
        print(f'There are {str(len(frames))} frames')
        # Read onset annotations for current audio file
        onset_file = ann_files[i]
        onsets = np.loadtxt(onset_file)
        print(f'Onsets read from {onset_file}')
        number_of_onsets = len(onsets)
        print(f'There are {str(number_of_onsets)} onsets')
        # Check if we already generated the correct amount of frames for that file before
        matching_files = glob.glob('dataset_transformed/' + '*' + file_name + '*')
        if len(matching_files) > 0:
            if len(frames) == len(matching_files):
                print(f'Skipping file {str(i)}/{str(len(audio_files))}: {file_name}')
                i += 1
                continue
        start = 0
        end = t
        f = 0
        onsets_found_this_file = 0
        for frame in frames:
            # Plot frame
            # plt.plot(frame)
            # plt.show()
            # Check if the frame contains an onset
            start = f * t
            end = start + t
            f += 1
            hasOnset = False
            for onset in onsets:
                if start <= onset and end >= onset:
                    hasOnset = True
                    onsets_found_this_file += 1
            if hasOnset:
                print(f'There is an onset within the range: {str(start)} to {str(end)} seconds')
            else:
                print(f'There are no onsets within the range: {str(start)} to {str(end)} seconds')
            # Apply CWT
            cwt = scg.CWT(time, frame, scales, wavelet='cmor1.5-1.0')
            # print(cwt.coefs.shape)
            # Get scaleogram
            ax = scg.cws(cwt, yaxis='frequency', wavelet='cmor1.5-1.0', cbar=None, coi=False)
            # Available wavelets:
            # ['cgau1 :\tComplex Gaussian wavelets', 'cgau2 :\tComplex Gaussian wavelets',
            #  'cgau3 :\tComplex Gaussian wavelets', 'cgau4 :\tComplex Gaussian wavelets',
            #  'cgau5 :\tComplex Gaussian wavelets', 'cgau6 :\tComplex Gaussian wavelets',
            #  'cgau7 :\tComplex Gaussian wavelets', 'cgau8 :\tComplex Gaussian wavelets',
            #  'cmor1.5-1.0 :\tComplex Morlet wavelets', 'fbsp1-1.5-1.0 :\tFrequency B-Spline wavelets',
            #  'gaus1 :\tGaussian', 'gaus2 :\tGaussian', 'gaus3 :\tGaussian', 'gaus4 :\tGaussian',
            #  'gaus5 :\tGaussian', 'gaus6 :\tGaussian', 'gaus7 :\tGaussian', 'gaus8 :\tGaussian',
            #  'mexh :\tMexican hat wavelet', 'morl :\tMorlet wavelet', 'shan1.5-1.0 :\tShannon wavelets']
            # Remove axis from image
            plt.subplots_adjust(bottom=0, top=1, left=0, right=1)
            # plt.show()
            # Get image from matplotlib and process it
            fig = plt.gcf()
            plot_img_np = get_img_from_fig(fig)
            image = Image.fromarray(plot_img_np).convert('RGB').resize((15, 80))  # TODO try PIL.Image.LANCZOS
            # Save image
            label = '1' if hasOnset else '0'
            image.save(join(onsets_images_dir, f'{label}-{file_name}-F{str(f)}.png'))
            plt.close()
        if number_of_onsets != onsets_found_this_file:
            print(f'It was supposed to have {str(number_of_onsets)} onsets. '
                  f'Found {str(onsets_found_this_file)} instead. Exiting...')
            exit()
        i += 1
def get_cwt_dataset(split_file):
    audio_files = list_audio_files('dataset')
    ann_files = list_annotation_files('dataset')
    split = np.loadtxt(split_file, dtype=str)
    frame_size = 1024
    sample_rate = 44100
    t = 0.01
    time = np.arange(frame_size, dtype=np.float16)
    scales = np.arange(1, 81)  # scaleogram with 80 rows
    i = 0
    train_features, train_labels = [], []            # spectrograms
    validation_features, validation_labels = [], []  # spectrograms
    for audio_file in audio_files:
        file_name = basename(audio_file)
        print(f'Pre-processing file {str(i+1)}/{str(len(audio_files))}: {file_name}')
        # Read audio file
        sig = Signal(audio_file, sample_rate, num_channels=1)
        frames = FramedSignal(sig, frame_size, hop_size=frame_size / 2)
        # Read onset annotations for current audio file
        onset_file = ann_files[i]
        onsets = np.loadtxt(onset_file)
        print(f'Onsets read from {onset_file}')
        number_of_onsets = len(onsets)
        print(f'There are {str(number_of_onsets)} onsets')
        start = 0
        end = t
        f = 0
        for frame in frames:
            cwt = scg.CWT(time, frame, scales, wavelet='cmor1.5-1.0')
            # ax = scg.cws(cwt, yaxis='frequency', wavelet='cmor1.5-1.0', cbar=None, coi=False)
            # plt.subplots_adjust(bottom=0, top=1, left=0, right=1)
            # fig = plt.gcf()
            # plot_img_np = get_img_from_fig(fig)
            rgb_frame = Image.fromarray(cwt.coefs.astype(np.uint8)).convert('RGB').resize((15, 80), Image.LANCZOS)
            rgb_frame = np.asarray(rgb_frame)
            plt.close()
            # Check if the frame contains an onset
            start = f * t
            end = start + t
            f += 1
            label = 0
            for onset in onsets:
                if start <= onset and end >= onset:
                    label = 1
            if audio_file in split:
                validation_features.append(rgb_frame)
                validation_labels.append(label)
            else:
                train_features.append(rgb_frame)
                train_labels.append(label)
        i += 1
        if i == 10:
            break
    # Post process
    train_features = np.array(train_features)
    validation_features = np.array(validation_features)
    train_features = train_features.astype('float32') / 255.
    validation_features = validation_features.astype('float32') / 255.
    train_labels = np.array(train_labels, dtype=int)
    validation_labels = np.array(validation_labels, dtype=int)
    return train_features, train_labels, validation_features, validation_labels
def get_ffts_dataset(split_file):
    audio_files = list_audio_files('dataset')
    ann_files = list_annotation_files('dataset')
    split = np.loadtxt(split_file, dtype=str)
    frame_sizes = [2048, 1024, 4096]
    sample_rate = 44100
    t = 0.01
    i = 0
    train_features, train_labels = [], []            # spectrograms
    validation_features, validation_labels = [], []  # spectrograms
    for audio_file in audio_files:
        file_name = basename(audio_file)
        print(f'Pre-processing file {str(i+1)}/{str(len(audio_files))}: {file_name}')
        # Read audio file
        sig = Signal(audio_file, sample_rate, num_channels=1)
        all_spectograms = []
        for frame_size in frame_sizes:
            frames = FramedSignal(sig, frame_size, fps=100, hop_size=441)
            stft = ShortTimeFourierTransform(frames)
            filt = FilteredSpectrogram(stft, filterbank=MelFilterbank,
                                       num_bands=80, fmin=27.5, fmax=16000,
                                       norm_filters=True, unique_filters=False)
            log_filt = LogarithmicSpectrogram(filt, log=np.log, add=np.spacing(1))
            all_spectograms.append(log_filt.T.astype(np.uint8))
        # Stack all spectrograms along a new (channel) axis
        final_spectogram = np.dstack(all_spectograms)
        # Read onset annotations for current audio file
        onset_file = ann_files[i]
        onsets = np.loadtxt(onset_file)
        print(f'Onsets read from {onset_file}')
        number_of_onsets = len(onsets)
        print(f'There are {str(number_of_onsets)} onsets')
        start = 0
        end = t + 0.14
        f = 0
        for a in range(7, final_spectogram.shape[1] - 7):
            final_frame = final_spectogram[:, a - 7:a + 8]  # +8, but numpy does not include the 8th element
            # Check if the window contains an onset
            start = f * t
            end = start + (t * 15)
            f += 1
            label = 0
            onset_frame_start = start + (t * 5)
            onset_frame_end = onset_frame_start + (t * 5)
            # if f == 20:
            #     exit()
            for onset in onsets:
                # if start <= onset and end >= onset:
                if onset_frame_start <= onset and onset_frame_end >= onset:
                    label = 1
            # if label == 1:
            #     print(f'There is an onset within the range: {str(onset_frame_start)} to {str(onset_frame_end)} ms')
            # else:
            #     print(f'There are no onsets within the range: {str(onset_frame_start)} to {str(onset_frame_end)} ms')
            if audio_file in split:
                validation_features.append(final_frame)
                validation_labels.append(label)
            else:
                train_features.append(final_frame)
                train_labels.append(label)
        i += 1
    # Post process
    train_features = np.array(train_features)
    validation_features = np.array(validation_features)
    train_features = train_features.astype('float32') / 255.
    validation_features = validation_features.astype('float32') / 255.
    train_labels = np.array(train_labels, dtype=int)
    validation_labels = np.array(validation_labels, dtype=int)
    return train_features, train_labels, validation_features, validation_labels
def pre_process_fft(onsets_images_dir, non_onsets_images_dir, audio_files, ann_files):
    frame_sizes = [2048, 1024, 4096]
    sample_rate = 44100
    t = 0.01
    i = 0
    for audio_file in audio_files:
        file_name = basename(audio_file)
        print(f'Pre-processing file {str(i+1)}/{str(len(audio_files))}: {file_name}')
        # Read audio file
        sig = Signal(audio_file, sample_rate, num_channels=1)
        all_spectograms = []
        for frame_size in frame_sizes:
            frames = FramedSignal(sig, frame_size, fps=100, hop_size=441)
            stft = ShortTimeFourierTransform(frames)
            filt = FilteredSpectrogram(stft, filterbank=MelFilterbank,
                                       num_bands=80, fmin=27.5, fmax=16000,
                                       norm_filters=True, unique_filters=False)
            log_filt = LogarithmicSpectrogram(filt, log=np.log, add=np.spacing(1))
            all_spectograms.append(log_filt.T.astype(np.uint8))
        # Stack all in different axis
        final_spectogram = np.dstack(all_spectograms)
        # image = Image.fromarray((final_spectogram))
        # image.save(join(onsets_images_dir, f'zzzz.png'))
        # Read onset annotations for current audio file
        onset_file = ann_files[i]
        onsets = np.loadtxt(onset_file)
        print(f'Onsets read from {onset_file}')
        number_of_onsets = len(onsets)
        print(f'There are {str(number_of_onsets)} onsets')
        # Split audio signal into frames of same size
        frames = FramedSignal(sig, frame_size, fps=100, hop_size=441)
        print(f'There are {str(len(frames))} frames')
        # Check if we already generated the correct amount of frames for that file before
        matching_files = glob.glob('dataset_transformed/' + '*' + file_name + '*')
        if len(matching_files) > 0:
            if len(frames) == len(matching_files):
                print(f'Skipping file {str(i)}/{str(len(audio_files))}: {file_name}')
                i += 1
                continue
        start = 0
        end = t + 0.14
        f = 0
        onsets_found_this_file = 0
        for a in range(final_spectogram.shape[1] - 15):
            final_frame = final_spectogram[:, a:a + 15]
            # Check if contains onset
            start = f * t
            end = start + t + 0.14
            f += 1
            hasOnset = False
            for onset in onsets:
                if start <= onset and end >= onset:
                    hasOnset = True
                    onsets_found_this_file += 1
            # if hasOnset:
            #     print(f'There is an onset within the range: {str(start)} to {str(end)} ms')
            # else:
            #     print(f'There are no onsets within the range: {str(start)} to {str(end)} ms')
            image = Image.fromarray(final_frame)
            # Save image
            if hasOnset:
                image.save(join(onsets_images_dir, f'1-{file_name}-F{str(f)}.png'))
            else:
                image.save(join(non_onsets_images_dir, f'0-{file_name}-F{str(f)}.png'))
        i += 1
def getPCPHistogram(filename, fs=8192, show=False):
    res = {}
    sig = Signal(filename, num_channels=1)
    fsig = FramedSignal(sig, frame_size=fs)
    stft = ShortTimeFourierTransform(fsig)
    spec = Spectrogram(stft)

    # standard pitch class profile
    chroma = PitchClassProfile(spec, num_classes=12)
    hist = [0 for i in range(12)]
    hist_f = [0 for i in range(12)]
    for f in range(len(chroma)):
        wf = chroma[f]
        hist = list(map(sum, zip(hist, wf)))
        flat = flatness(wf)
        hist_f = list(map(sum, zip(hist_f, [w * flat for w in wf])))
    s = sum(hist)
    hist = [x / s for x in hist]
    C_hist = [hist[i - 9] for i in range(12)]
    res['standard'] = C_hist
    s_f = sum(hist_f)
    hist_f = [x / s_f for x in hist_f]
    C_hist_f = [hist_f[i - 9] for i in range(12)]
    res['standard_f'] = C_hist_f

    # pitch class profile of the harmonic part only
    hpss = HarmonicPercussiveSourceSeparation()
    h, _ = hpss.process(spec)
    chroma = PitchClassProfile(h, num_classes=12)
    hist = [0 for i in range(12)]
    hist_f = [0 for i in range(12)]
    for f in range(len(chroma)):
        wf = chroma[f]
        hist = list(map(sum, zip(hist, wf)))
        flat = flatness(wf)
        hist_f = list(map(sum, zip(hist_f, [w * flat for w in wf])))
    s = sum(hist)
    hist = [x / s for x in hist]
    C_hist = [hist[i - 9] for i in range(12)]
    res['hpss'] = C_hist
    s_f = sum(hist_f)
    hist_f = [x / s_f for x in hist_f]
    C_hist_f = [hist_f[i - 9] for i in range(12)]
    res['hpss_f'] = C_hist_f

    # deep chroma
    dcp = DeepChromaProcessor()
    deepchroma = dcp(filename)
    hist = [0 for i in range(12)]
    hist_f = [0 for i in range(12)]
    for f in range(len(deepchroma)):
        wf = deepchroma[f]
        hist = list(map(sum, zip(hist, wf)))
        flat = flatness(wf)
        hist_f = list(map(sum, zip(hist_f, [w * flat for w in wf])))
    s = sum(hist)
    hist = [x / s for x in hist]
    res['deep'] = hist
    s_f = sum(hist_f)
    hist_f = [x / s_f for x in hist_f]
    res['deep_f'] = hist_f

    if show:
        plt.subplot(131)
        plt.barh(range(12), res['standard'])
        plt.subplot(132)
        plt.barh(range(12), res['hpss'])
        plt.subplot(133)
        plt.barh(range(12), res['deep'])
        plt.show()
    return res