Example #1
def load_audio_samples(audio_cache, sample_sources):
	# Create samples from audio_cache
	_AudioSamples = []

	for audio_source in sample_sources:
		filename = audio_source['filename']

		if filename not in audio_cache:
			print("Audio file not in cache: {}. Skipping...".format(filename))
			continue

		audio_data, audio_rate = audio_cache[filename]

		time_start = float(audio_source['start'])
		time_end = float(audio_source['end'])
		label = audio_source['label']

		start_sample = librosa.time_to_samples(time_start, sr=audio_rate)
		end_sample = librosa.time_to_samples(time_end, sr=audio_rate)

		# todo trim noise
		# NumPy slicing never raises IndexError, so check the bounds explicitly
		if end_sample > len(audio_data):
			print("Error in getting part of audio file: Out of Range error")
			continue

		cutout_data = audio_data[start_sample:end_sample]

		sample = create_sample(label, audio_rate, cutout_data)
		_AudioSamples.append(sample)

	return _AudioSamples
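For orientation before the remaining examples: librosa.time_to_samples is essentially a vectorized conversion from seconds to sample indices at a given sampling rate (roughly time multiplied by sr, truncated to an integer). A minimal, hedged sketch of that idea, assuming the librosa default rate of 22050 Hz:

import numpy as np
import librosa

sr = 22050
times = np.array([0.0, 0.5, 1.0])

# library call: converts each time in seconds to a sample index
print(librosa.time_to_samples(times, sr=sr))  # [    0 11025 22050]

# rough hand-rolled equivalent for a scalar time, for intuition only
def time_to_samples_sketch(t, sr=22050):
    return int(t * sr)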
Example #2
def build_song_segments(
    target_song, segments_df, songs_df
):  # segments_df = all Taylor Swift scenes; songs_df = a list of all Taylor Swift songs in the DB
    matches = []
    wave_segments = []
    target_remaining_samples = target_song
    while len(target_remaining_samples) > 10000:
        # TODO: only necessary to calculate onsets up to max length of segment, not entire remaining song
        # fix if too slow
        target_onset_envelope = librosa.onset.onset_strength(
            target_remaining_samples)
        # dot prod - correlate segments, penalize by sqrt of length, which makes longer segments better
        onset_silly_match = segments_df.onset_envelope.map(
            silly_segment_matcher(target_onset_envelope))
        best_match_index = onset_silly_match.idxmax()
        best_match = segments_df.loc[best_match_index]
        matching_song = songs_df[songs_df['name'] == best_match['name']]
        wave = matching_song.song_wave.values[0]
        start_times = matching_song.scene_start_times_sec.values[0]
        start_time = start_times[best_match.index_in_song]
        # end of song if index_in_song is the last one
        end_time = start_times[best_match.index_in_song + 1] if best_match.index_in_song + 1 < len(start_times) \
                    else librosa.samples_to_time(len(wave))
        start_sample = librosa.time_to_samples(start_time)
        end_sample = librosa.time_to_samples(end_time)

        matches.append((best_match['name'], start_time, end_time))
        consumed_samples = end_sample - start_sample
        target_remaining_samples = target_remaining_samples[consumed_samples:]
        # don't reuse samples
        segments_df = segments_df.drop(best_match_index)
        wave_segments.append(wave[start_sample:end_sample])
    return matches, np.concatenate(wave_segments)
Example #3
    def play(self, dur):
        times = sm.waves.tspan(dur)
        waves = []

        for start, wdur, lf, hf, amp, amp_freq, amp_phase in zip(
            self.starts, self.durs, self.low_freqs, self.high_freqs, self.amps, self.amp_freqs, self.amp_phases
        ):
            ss = lr.time_to_samples(start, sm.sound.SAMPLE_RATE)
            es = lr.time_to_samples(start + wdur, sm.sound.SAMPLE_RATE)
            es = min(es, self.so.samples.size - 1)
            samps = self.so.samples[ss:es]
            if samps.size <= 0:
                continue
            if hf < lf:
                hf, lf = lf, hf
            if es - ss > 0.1 * sm.sound.SAMPLE_RATE:
                samps = sm.effects.band_pass(SoundObject(samps), lf, hf).samples
            peak = np.max(np.abs(samps))
            if peak > 0.0:
                samps /= peak
            samp_dur = lr.samples_to_time(samps.size, sm.sound.SAMPLE_RATE)
            wave = np.interp(
                times, 
                np.linspace(0, samp_dur, samps.size),
                samps,
                period=samp_dur
            ) * amp
            mod = sm.waves.sin(times, amp_freq, 1.0, amp_phase)
            wave *= mod
            waves.append(SoundObject(wave))

        return sm.sound.join(waves)
Example #4
def generate_features(albums_dict, album_label_dict):
    features = []
    counter = 0
    for album in albums_dict:
        album_title = path_to_album(album)
        for song in albums_dict[album]:
            counter += 1
            song_path = os.path.join(album, song["filename"])
            song_title = filename_to_title(song["filename"])
            print(str(counter) + "th song: " + song_title)
            data, sr = librosa.load(song_path)
            if album_title in album_label_dict:
                if song_title in album_label_dict[album_title]:
                    for intervals in album_label_dict[album_title][song_title]:
                        start, end, chord = intervals[0], intervals[1], intervals[2]
                        if end > start:
                            start_index = librosa.time_to_samples(start)
                            end_index = librosa.time_to_samples(end)
                            audio_slice = data[int(start_index):int(end_index)]
                            if len(audio_slice) == 0:
                                continue
                            mfccs = librosa.feature.mfcc(y=audio_slice,
                                                         sr=sr,
                                                         n_mfcc=40)
                            mfccs_processed = np.mean(mfccs.T, axis=0)
                            features.append([mfccs_processed, chord])
    return features
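A hedged sketch of how the feature list returned above might be turned into training arrays; the two dictionaries are the same (assumed available) arguments described by the function signature:

import numpy as np

features = generate_features(albums_dict, album_label_dict)
X = np.vstack([mfcc_vec for mfcc_vec, _ in features])  # (n_slices, 40) mean-MFCC vectors
y = np.array([chord for _, chord in features])         # chord labels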
Example #5
def parse_grid(grid_path):
    '''
    Parse a TextGrid and split the wav file that matches it.

    Args:
        grid_path: path of the TextGrid to parse
    '''

    dir_path = os.path.dirname(grid_path)
    grid_filename = os.path.basename(grid_path)
    name = os.path.splitext(grid_filename)[0]
    wav_path = os.path.join(dir_path, name + ".wav")
    target_tier = "comma"

    tg_obj = tgt.read_textgrid(grid_path)
    # get the objects (TextGrid tier & wav) that match the grid path
    tier_obj = tg_obj.get_tier_by_name(target_tier)
    wav_obj, sr = librosa.load(wav_path, sr=None)

    for idx in range(len(tier_obj)):
        part = tier_obj[idx]
        time_s = librosa.time_to_samples(part.start_time, sr)
        time_e = librosa.time_to_samples(part.end_time, sr)
        librosa.output.write_wav('{}_{}.wav'.format(name, idx),
                                 wav_obj[time_s:time_e], sr)
        with open("{}_{}.txt".format(name, idx), "w") as f:
            f.write(part.text)
Example #6
def synthesize(inputs):
    """
    Generate new Audio objects for output or further remixing.

    Parameters
    ----------

    inputs: generator, list, or tuple.
        See _format_inputs for details on parsing inputs.

    Returns
    ------
    An Audio object
    """
    # First we organize our inputs.
    inputs = _format_inputs(inputs)

    max_time = 0.0
    sample_rate = 44100
    array_length = 20 * 60  # 20 minutes!
    array_shape = (2, sample_rate * array_length)
    sparse_array = lil_matrix(array_shape)

    initial_offset = 0
    for i, (time_slice, start_time) in enumerate(inputs):
        # if we have a mono file, we return stereo here.
        resampled_audio, left_offset, right_offset = time_slice.get_samples()

        # set the initial offset, so we don't miss the start of the array
        if i == 0:
            initial_offset = max(left_offset * -1, right_offset * -1)

        # get the target start and duration
        start_time = start_time.delta * 1e-9
        duration = time_slice.duration.delta * 1e-9

        # find the max time
        if start_time + duration > max_time:
            max_time = start_time + duration
        # error if we'd go too far
        if start_time + duration > array_length:
            raise SynthesizeError("Amen can only synthesize up to 20 minutes of audio.")

        # get the target start and end samples
        starting_sample, _ = librosa.time_to_samples(
            [start_time, start_time + duration], sr=time_slice.audio.sample_rate
        )

        # figure out the actual starting and ending samples for each channel
        left_start = starting_sample + left_offset + initial_offset
        right_start = starting_sample + right_offset + initial_offset

        # add the data from each channel to the array
        sparse_array[0, left_start : left_start + len(resampled_audio[0])] += resampled_audio[0]
        sparse_array[1, right_start : right_start + len(resampled_audio[1])] += resampled_audio[1]

    max_samples = librosa.time_to_samples([max_time], sr=sample_rate)
    truncated_array = sparse_array[:, 0:max_samples].toarray()

    return Audio(raw_samples=truncated_array)
Example #7
def chromaplot(y,
               rate,
               start_t=0,
               stop_t=None,
               play=True,
               harmonic_input=False):

    start = librosa.time_to_samples(start_t)

    stop = None
    if stop_t is not None:
        stop = librosa.time_to_samples(stop_t)

    if harmonic_input is False:
        h, p = librosa.effects.hpss(y[start:stop])
    else:
        h = y[start:stop]

    C = librosa.feature.chroma_cqt(y=h, sr=rate)

    plt.figure(figsize=(12, 4))
    librosa.display.specshow(C,
                             sr=rate,
                             x_axis='time',
                             y_axis='chroma',
                             vmin=0,
                             vmax=1)

    plt.title('Chromagram')
    plt.colorbar()
    plt.tight_layout()
    plt.show()

    if play:
        return play_button(y, rate, start_t, stop_t)
Example #8
def synthesize(inputs):
    """
    Generate new Audio objects for output or further remixing.

    Parameters
    ----------

    inputs: generator, list, or tuple.
        See _format_inputs for details on parsing inputs.

    Returns
    ------
    An Audio object
    """
    # First we organize our inputs.
    inputs = _format_inputs(inputs)

    max_time = 0.0
    sample_rate = 44100
    array_length = 20 * 60 # 20 minutes!
    array_shape = (2, sample_rate * array_length)
    sparse_array = lil_matrix(array_shape)

    initial_offset = 0
    for i, (time_slice, start_time) in enumerate(inputs):
        # if we have a mono file, we return stereo here.
        resampled_audio, left_offset, right_offset = time_slice.get_samples()

        # set the initial offset, so we don't miss the start of the array
        if i == 0:
            initial_offset = max(left_offset * -1, right_offset * -1)

        # get the target start and duration
        start_time = start_time.delta * 1e-9
        duration = time_slice.duration.delta * 1e-9

        # find the max time
        if start_time + duration > max_time:
            max_time = start_time + duration
        # error if we'd go too far
        if start_time + duration > array_length:
            raise SynthesizeError("Amen can only synthesize up to 20 minutes of audio.")

        # get the target start and end samples
        starting_sample, _ = librosa.time_to_samples([start_time, start_time + duration],
                                                     sr=time_slice.audio.sample_rate)

        # figure out the actual starting and ending samples for each channel
        left_start = starting_sample + left_offset + initial_offset
        right_start = starting_sample + right_offset + initial_offset

        # add the data from each channel to the array
        sparse_array[0, left_start:left_start + len(resampled_audio[0])] += resampled_audio[0]
        sparse_array[1, right_start:right_start + len(resampled_audio[1])] += resampled_audio[1]

    max_samples = librosa.time_to_samples([max_time], sr=sample_rate)
    truncated_array = sparse_array[:, 0:max_samples].toarray()

    return Audio(raw_samples=truncated_array, sample_rate=sample_rate)
Example #9
def pipeline(path, frame_ms=64, hop_ms=64):
    sig, rate = speech.read_soundfile(path)
    fsize = librosa.time_to_samples(float(frame_ms)/1000, rate)[0]
    hop = librosa.time_to_samples(float(hop_ms)/1000, rate)[0]
    frames = librosa.util.frame(sig, fsize, hop)
    rms = np.apply_along_axis(speech.rms, 0, frames)
    H, p = spectral_entropy(frames, rate, fsize)
    return sig, rate, frames, fsize, rms, H, p
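Example #9 converts frame and hop lengths given in milliseconds into sample counts before framing the signal. A hedged, self-contained sketch of the same conversion (a 16 kHz noise signal stands in for the project-specific speech.read_soundfile; newer librosa returns a scalar for scalar input, so no [0] indexing is needed):

import numpy as np
import librosa

sr = 16000
frame_ms, hop_ms = 64, 64
frame_length = int(librosa.time_to_samples(frame_ms / 1000, sr=sr))  # 1024 samples
hop_length = int(librosa.time_to_samples(hop_ms / 1000, sr=sr))      # 1024 samples

y = np.random.randn(sr)  # one second of noise as a stand-in signal
frames = librosa.util.frame(y, frame_length=frame_length, hop_length=hop_length)
print(frames.shape)  # (1024, n_frames)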
Example #10
def maptask_to_tacotron(output_path,
                        timed_units_path,
                        mono_path,
                        pause_time=1,
                        sr=20000):
    '''
    Extract the text and audio parts of utterances separated by $pause_time
    from $person in the Maptask dataset.

    The audio snippets are cut out and stored as wav files named
    sequentially (e.g. q1ec1-0001, q1ec1-0002, ...).

    Each line in the produced txt file has the following form:

    name_of_audio|utterance (string)
    name_of_audio|utterance (string)
    '''
    mono_file_names = os.listdir(mono_path)
    mono_file_names.sort()  # Nice to do in order when debugging

    file_txt = join(output_path, 'maptask')
    file_f = open(file_txt+'.f.txt', "w")
    file_g = open(file_txt+'.g.txt', "w")

    wavs_path = join(output_path, 'wavs')
    if not exists(wavs_path):
        pathlib.Path(wavs_path).mkdir(parents=True, exist_ok=True)

    # Iterate through all (mono) audiofiles, chop the audio in to utterences
    for mono_wav in tqdm(mono_file_names):
        if '.wav' in mono_wav:  # failsafe
            # mono_wav: q1ec1.f.wav, q1ec1.g.wav, ...
            fpath = join(mono_path, mono_wav) # Full path to file

            # Load audio file
            sr, y = read(fpath)

            # get time and words from timed-units
            tu_data = get_time_filename_utterence(mono_wav, timed_units_path)

            for d in tu_data:
                start, end = d['time']  # time
                start = librosa.time_to_samples(start, sr=sr)
                end = librosa.time_to_samples(end, sr=sr)
                y_tmp = y[start:end]

                # write chopped audio to disk
                tmp_path = join(wavs_path, d['name']+'.wav')
                write(filename=tmp_path, rate=sr, data=y_tmp)

                # write corresponding row in txt
                s = d['name'] + '|' + d['words'] + '\n'
                if '.f.' in mono_wav:
                    file_f.write(s)
                else:
                    file_g.write(s)
    file_f.close()
    file_g.close()
Example #11
def silence_fillers(file_path, file_name, start_time, end_time, sil_start_time,
                    sil_end_time):
    org_track = AudioSegment.from_file(os.path.join(file_path, file_name))
    org_track = org_track.set_frame_rate(22050)
    sample_rate = org_track.frame_rate
    sample_width = org_track.frame_width
    samples = np.array(org_track.get_array_of_samples())

    for idx in range(len(start_time)):

        pydub_start_t = start_time[idx] * 1000  #pydub works in ms
        pydub_end_t = end_time[idx] * 1000
        start_sample_id = librosa.time_to_samples(start_time[idx],
                                                  sr=sample_rate)[0]
        end_sample_id = librosa.time_to_samples(end_time[idx],
                                                sr=sample_rate)[0]

        temp_track = AudioSegment.silent(duration=(pydub_end_t -
                                                   pydub_start_t),
                                         frame_rate=sample_rate)
        temp_array = np.array(temp_track.get_array_of_samples())
        samples[start_sample_id:start_sample_id +
                temp_array.shape[0]] = temp_array

        # do a median filtering on samples
        samples[start_sample_id - 7:start_sample_id + temp_array.shape[0] +
                7] = scipy.signal.medfilt(
                    samples[start_sample_id - 7:start_sample_id +
                            temp_array.shape[0] + 7],
                    kernel_size=3)

    #to deal with the noise in the silences
    for idx in range(len(sil_start_time)):
        pydub_start_t = float(sil_start_time[idx]) * 1000  #pydub works in ms
        pydub_end_t = float(sil_end_time[idx]) * 1000
        start_sample_id = librosa.time_to_samples(float(sil_start_time[idx]),
                                                  sr=sample_rate)[0]
        end_sample_id = librosa.time_to_samples(float(sil_end_time[idx]),
                                                sr=sample_rate)[0]

        temp_track = AudioSegment.silent(duration=(pydub_end_t -
                                                   pydub_start_t),
                                         frame_rate=sample_rate)
        temp_array = np.array(temp_track.get_array_of_samples())
        samples[start_sample_id:start_sample_id +
                temp_array.shape[0]] = temp_array

        # do a median filtering on samples
        samples[start_sample_id - 7:start_sample_id + temp_array.shape[0] +
                7] = scipy.signal.medfilt(
                    samples[start_sample_id - 7:start_sample_id +
                            temp_array.shape[0] + 7],
                    kernel_size=3)

    sf.write(os.path.join(file_path, 'new_' + file_name),
             samples.astype('int16'), sample_rate)
Example #12
def get_label_data(annotation, audio, label, sr):
    start_samples_indxs = librosa.time_to_samples(annotation[annotation.label ==\
                                                              label].start.values, sr)
    finish_samples_indxs = librosa.time_to_samples(annotation[annotation.label ==\
                                                               label].finish.values, sr)
    
    data = []
    for s, f in zip(start_samples_indxs, finish_samples_indxs):
        data.append(audio[np.arange(s, f)])
    return data
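Example #12 assumes annotation is a pandas DataFrame with label, start and finish columns given in seconds. A hedged usage sketch (the annotation values and file path are illustrative); note that recent librosa releases make sr keyword-only, so the positional sr in the function above only works on older versions:

import pandas as pd
import librosa

annotation = pd.DataFrame({
    'label':  ['speech', 'noise', 'speech'],
    'start':  [0.0, 1.5, 3.0],
    'finish': [1.2, 2.8, 4.0],
})
audio, sr = librosa.load('recording.wav', sr=None)  # placeholder path
speech_chunks = get_label_data(annotation, audio, 'speech', sr)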
Example #13
def play_button(y, rate, start_t=0, stop_t=None):
    '''Insert a play button that clips the audio between start and stop times. 
    By default, play the entire audio file.'''
    start = librosa.time_to_samples(start_t)

    stop = None
    if stop_t is not None:
        stop = librosa.time_to_samples(stop_t)

    return IPython.display.display(
        IPython.display.Audio(data=y[start:stop], rate=rate))
Example #14
    def sound_data(self):
        (sound_data, sr) = self.recording.data
        if self.start_t is None:
            self.start_t = 0
        if self.end_t is None:
            self.end_t = librosa.samples_to_time(sound_data.size, sr)

        start_i = librosa.time_to_samples(self.start_t, sr)
        end_i = librosa.time_to_samples(self.end_t, sr)

        return sound_data[start_i:end_i]
Example #15
def loss(drone: Drone, so: SoundObject):
    out = 0
    for _ in range(TRIALS):
        drone_wave = drone.play(SAMPLE_DURATION)
        st = random.uniform(0, so.duration - SAMPLE_DURATION)
        et = st + SAMPLE_DURATION
        ss = lr.time_to_samples(st, sm.sound.SAMPLE_RATE)
        es = lr.time_to_samples(et, sm.sound.SAMPLE_RATE)
        stft1 = lr.stft(so.samples[ss:es])
        stft2 = lr.stft(drone_wave.samples)
        out += np.mean(np.abs(stft1 - stft2))
    return out / TRIALS
Example #16
def get_unlabel_data(annotation, audio, labels, sr):
    start_samples_indxs = librosa.time_to_samples(\
                          annotation[annotation.label.isin(labels)].start.values, sr)
    finish_samples_indxs = librosa.time_to_samples(\
                           annotation[annotation.label.isin(labels)].finish.values, sr)
    finish_samples_indxs = np.hstack([[0], finish_samples_indxs])
    start_samples_indxs = np.hstack([start_samples_indxs, [len(audio)]])  # pad with the end of the audio
    
    data = []
    for s, f in zip(finish_samples_indxs, start_samples_indxs):
        data.append(audio[np.arange(s, f)])
    return data
Example #17
    def process_recording(self, recording):
        import librosa
        import soundfile

        original_file = self.project.get_recording_path(
            recording["recording_filename"], self.input_profile
        )

        destination_file = os.path.join(
            self.output_directory(),
            os.path.splitext(recording["recording_filename"])[0] + ".wav",
        )

        os.makedirs(name=os.path.dirname(destination_file), exist_ok=True)

        vettoed_segments = self.segments[
            self.segments["recording_filename"] == recording["recording_filename"]
        ]

        signal, sr = librosa.load(original_file, sr=None, mono=False)

        onsets = librosa.time_to_samples(
            times=vettoed_segments["segment_onset"].values / 1000, sr=sr
        )
        offsets = librosa.time_to_samples(
            times=vettoed_segments["segment_offset"].values / 1000, sr=sr
        )

        if signal.ndim == 1:
            for i in range(len(onsets)):
                signal[onsets[i] : offsets[i]] = 0

            soundfile.write(destination_file, signal, samplerate=sr)
        else:
            for i in range(len(onsets)):
                signal[:, onsets[i] : offsets[i]] = 0

            soundfile.write(destination_file, np.transpose(signal), samplerate=sr)

        return pd.DataFrame(
            [
                {
                    "original_filename": recording["recording_filename"],
                    "converted_filename": os.path.splitext(
                        recording["recording_filename"]
                    )[0]
                    + ".wav",
                    "success": True,
                }
            ]
        )
Example #18
    def get_chords_in_interval(self, audio, chord_intervals, interval):
        start_index = librosa.time_to_samples(interval[0])
        end_index = librosa.time_to_samples(interval[1])
        audio_slice = audio[int(start_index):int(end_index)]
        ref_start, ref_end = interval[0], interval[1]

        chords = []
        curr_interval = chord_intervals[0]
        index = 0
        while curr_interval[0] < ref_end and index < len(chord_intervals):
            curr_interval = chord_intervals[index]
            if curr_interval[1] > ref_start:
                chords.append(curr_interval[2])
            index += 1
        return audio_slice, chords
Example #19
def _trim(raw_audio, sr=read_audio.DEFAULT_SR):
    '''
    Finds the first onset of the sound, returns a good start time and end time that isolates the sound
    :param raw_audio: np array of audio data, from librosa.load
    :param sr: sample rate
    :return: dict with 'start' and 'end', in seconds
    '''
    start = 0.0
    end = None

    # Add an empty second so that the beginning onset is recognized
    silence_to_add = 1.0
    raw_audio = np.append(np.zeros(int(silence_to_add * sr)), raw_audio)

    # Spectral flux
    hop_length = int(librosa.time_to_samples(1. / 200, sr=sr))
    onsets = librosa.onset.onset_detect(y=raw_audio, sr=sr, hop_length=hop_length, units='time')

    if len(onsets) == 0:
        return {'start': start, 'end': end}
    elif len(onsets) > 1:
        # If there are multiple onsets, cut it off just before the second one
        end = onsets[1] - (silence_to_add + 0.01)

    start = max(onsets[0] - (silence_to_add + 0.01), 0.0)
    return {'start': start, 'end': end}
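The hop length of time_to_samples(1/200, sr) is a 5 ms hop, i.e. an onset envelope sampled at roughly 200 frames per second. A hedged sketch of applying the returned start/end times back to the audio (the file path is a placeholder):

import librosa

raw_audio, sr = librosa.load('example.wav', sr=None)  # placeholder path
bounds = _trim(raw_audio, sr=sr)

start_sample = librosa.time_to_samples(bounds['start'], sr=sr)
end_sample = None if bounds['end'] is None else librosa.time_to_samples(bounds['end'], sr=sr)
trimmed = raw_audio[start_sample:end_sample]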
Example #20
def default_onset(y, fs, window_length=51, polyorder=3):
    # These parameters are taken directly from the paper
    n_fft = 1024
    hop_length = int(librosa.time_to_samples(1. / 200, sr=fs))
    n_mels = 138
    fmin = 27.5
    fmax = 16000.
    # The paper uses a log-frequency representation,
    # but for simplicity, we'll use a Mel spectrogram instead.
    S = librosa.feature.melspectrogram(y,
                                       sr=fs,
                                       n_fft=n_fft,
                                       hop_length=hop_length,
                                       fmin=fmin,
                                       fmax=fmax,
                                       n_mels=n_mels)

    # compute the onset strength envelope
    # onset events using the librosa defaults.
    env_default = librosa.onset.onset_strength(y=y,
                                               sr=fs,
                                               hop_length=hop_length)
    env_default = smoothing.smooth(
        env_default, window_length,
        polyorder)  # window size 51, polynomial order 3
    onset_def = librosa.onset.onset_detect(onset_envelope=env_default,
                                           sr=fs,
                                           hop_length=hop_length,
                                           units='time')
    return onset_def
Example #21
def makeAudio(events, iteration, stimdir, spatial_flag=False):
    eventsinsamples = librosa.time_to_samples(events,sr=int(sr_audio))
    # audio buffers for spatial and mono audio
    audiobuffer_L = np.zeros(max(eventsinsamples) + largestsampnum)
    audiobuffer_R = np.zeros(max(eventsinsamples) + largestsampnum)    
    y_mono = y
    
    for startpos in eventsinsamples:
        random_deg = np.random.randint(N)
        y_l = samples[random_deg][0]
        y_r = samples[random_deg][1]

        if spatial_flag == True:
            audiobuffer_L[startpos:(startpos + len(y_l))] = audiobuffer_L[startpos:(startpos + len(y_l))] + y_l
            audiobuffer_R[startpos:(startpos + len(y_r))] = audiobuffer_R[startpos:(startpos + len(y_r))] + y_r
        
        if spatial_flag == False:
            audiobuffer_L[startpos:(startpos + len(y_mono))] = audiobuffer_L[startpos:(startpos + len(y_mono))] + y_mono
            audiobuffer_R[startpos:(startpos + len(y_mono))] = audiobuffer_R[startpos:(startpos + len(y_mono))] + y_mono

    audio_l = 0.8*audiobuffer_L/max(audiobuffer_L)
    audio_r = 0.8*audiobuffer_R/max(audiobuffer_R)
    
    audio = np.array([audio_l, audio_r])
    audiofi = os.path.join(stimdir, config.dist_type + '_' + config.strs['quantize'] + '-' + str(config.qsteps) + '_' + config.strs['binaural'][0] + '_' + str(config.N) + '_' + iteration + '.wav')
    sf.write(audiofi, audio.T, samplerate=int(sr_audio))
    print('creating', audiofi)
    return audio
Example #22
def makeAudio(events, iteration, stimdir, spatial_flag=False):   
    eventsinsamples = librosa.time_to_samples(events,sr=sr_audio)   
    # audio buffers for spatial and mono audio
    audiobuffer_L = np.zeros(max(eventsinsamples) + largestsampnum)
    audiobuffer_R = np.zeros(max(eventsinsamples) + largestsampnum)    
    y_mono = y
    
    for startpos in eventsinsamples:
        random_deg = np.random.randint(100)
        y_l = samples[random_deg][0]
        y_r = samples[random_deg][1]

        if spatial_flag == True:
            audiobuffer_L[startpos:(startpos + len(y_l))] = audiobuffer_L[startpos:(startpos + len(y_l))] + y_l
            audiobuffer_R[startpos:(startpos + len(y_r))] = audiobuffer_R[startpos:(startpos + len(y_r))] + y_r
        
        if spatial_flag == False:
            audiobuffer_L[startpos:(startpos + len(y_mono))] = audiobuffer_L[startpos:(startpos + len(y_mono))] + y_mono
            audiobuffer_R[startpos:(startpos + len(y_mono))] = audiobuffer_R[startpos:(startpos + len(y_mono))] + y_mono

    #audio_l = np.sum(audiobuffer_L, axis=0)
    #audio_r = np.sum(audiobuffer_R, axis=0)
    
    audio_l = 0.8*audiobuffer_L/max(audiobuffer_L)
    audio_r = 0.8*audiobuffer_R/max(audiobuffer_R)
    
    audio = np.array([audio_l, audio_r])
    audiofi = os.path.join(stimdir, dist_type[0] + '_' + binaural_str[0] + '_' + str(N) + '_' + str(np.round(iteration,2)) + '.wav')
    sf.write(audiofi, audio.T, samplerate=sr_audio)
    print('creating', audiofi)
    return audio
Example #23
def split_audio(audio, beats, sr):
    beats_sample = librosa.time_to_samples(beats, sr=sr)
    audio_split = [
        audio[beats_sample[i]:beats_sample[i + 1]]
        for i in range(len(beats_sample) - 1)
    ]
    return audio_split
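split_audio expects beat positions in seconds. A hedged usage sketch pairing it with librosa's beat tracker (the file path is a placeholder):

import librosa

y, sr = librosa.load('song.wav')  # placeholder path
tempo, beat_times = librosa.beat.beat_track(y=y, sr=sr, units='time')
per_beat_chunks = split_audio(y, beat_times, sr)
print(len(per_beat_chunks), 'beat-length segments')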
Example #24
def run_algorithm(audio_file,
                  n_templates=[0, 0, 0],
                  output_savename="extracted_loop"):
    """Complete pipeline of algorithm.

    Parameters
    ----------
    audio_file : string
        Path to audio file to be loaded and analysed.
    n_templates : list of length 3
        The number of sound, rhythm and loop templates.
        Default value (0,0,0) causes the script to estimate reasonable values.
    output_savename : string
        Base string for saved output filenames.

    Returns
    -------
    A set of files containing the extracted loops.

    Examples
    --------
    >>> run_algorithm("example_song.mp3", [40,20,7], "extracted_loop")
    
    See also
    --------
    tensorly.decomposition.non_negative_tucker
    """
    assert os.path.exists(audio_file)
    assert len(n_templates) == 3
    assert type(n_templates) is list
    # Load mono audio:
    signal_mono, fs = librosa.load(audio_file, sr=None, mono=True)
    # Use madmom to estimate the downbeat times:
    downbeat_times = get_downbeats(signal_mono)
    # Convert times to samples so we can segment the signal:
    downbeat_frames = librosa.time_to_samples(downbeat_times, sr=fs)
    # Create spectral cube out of signal:
    spectral_cube = make_spectral_cube(signal_mono, downbeat_frames)
    # Validate the input n_templates (inventing new ones if any is wrong):
    n_sounds, n_rhythms, n_loops = validate_template_sizes(
        spectral_cube, n_templates)
    # Use TensorLy to do the non-negative Tucker decomposition:
    core, factors = tld.non_negative_tucker(np.abs(spectral_cube),
                                            [n_sounds, n_rhythms, n_loops],
                                            n_iter_max=500,
                                            verbose=True)
    # Reconstruct each loop:
    for ith_loop in range(n_loops):
        # Multiply templates together to get real loop spectrum:
        loop_spectrum = create_loop_spectrum(factors[0], factors[1],
                                             core[:, :, ith_loop])
        # Choose best bar to reconstruct from (we will use its phase):
        bar_ind = choose_bar_to_reconstruct(factors[2], ith_loop)
        # Reconstruct loop signal by masking original spectrum:
        ith_loop_signal = get_loop_signal(loop_spectrum,
                                          spectral_cube[:, :, bar_ind])
        # Write signal to disk:
        librosa.output.write_wav(
            "{0}_{1}.wav".format(output_savename, ith_loop), ith_loop_signal,
            fs)
Example #25
    def get_start_sample(self) -> int:
        if Beat.INDEX_VALUE == 'samples':
            return self.index
        elif Beat.INDEX_VALUE == 'time':
            return librosa.time_to_samples(self.index, sr=util.SAMPLE_RATE)
        else:
            raise NotImplementedError("Only samples and time are supported")
Example #26
def test_tempo():
    def __test(tempo, sr, hop_length, ac_size, aggregate, y):

        tempo_est = librosa.beat.tempo(y=y,
                                       sr=sr,
                                       hop_length=hop_length,
                                       ac_size=ac_size,
                                       aggregate=aggregate)

        # Being within 5% for the stable frames is close enough
        if aggregate is None:
            win_size = int(ac_size * sr // hop_length)
            assert np.all(
                np.abs(tempo_est[win_size:-win_size] - tempo) <= 0.05 * tempo)
        else:
            assert np.abs(tempo_est - tempo) <= 0.05 * tempo, (tempo,
                                                               tempo_est)

    for sr in [22050, 44100]:
        for tempo in [40, 60, 80, 110, 160]:
            # Make a pulse train at the target tempo
            y = np.zeros(20 * sr)
            delay = int(librosa.time_to_samples(60. / tempo, sr=sr))
            y[::delay] = 1
            for hop_length in [512, 1024]:
                for ac_size in [4, 8]:
                    for aggregate in [None, np.mean]:
                        yield __test, tempo, sr, hop_length, ac_size, aggregate, y
Example #27
def main(args):
    outname = args.outdir
    y, sr = librosa.load(args.wavpath, sr=None)
    adjusted_onset_jam = jams.load(args.onsetjams)
    ann = adjusted_onset_jam.search(namespace='onset')[0]
    adjusted_onset_times = ann.to_event_values()[0]
    adj_on_samps = librosa.time_to_samples(adjusted_onset_times, sr=sr)
    y_chopt = chop_sig(y, adj_on_samps)
    print("about to clear csv files")

    with open(outname + '_pt.csv', 'w') as csvfile:
        pass
    with open(outname + '_onoff.csv', 'w') as csvfile:
        pass

    k = 0
    for seg, seg_start_time in zip(y_chopt, adjusted_onset_times):
        if k % 20 == 0:
            print(k, len(y_chopt))
        k += 1

        offset_time, pitch_track, t_step = segment_offset(
            seg, sr, seg_start_time)
        with open(outname + '_pt.csv', 'a') as pt:
            writer = csv.writer(pt, delimiter=',')
            for i, f in enumerate(pitch_track):
                if f > 0:
                    writer.writerow([seg_start_time + i * float(t_step), f])
        with open(outname + '_onoff.csv', 'a') as onoff:
            writer = csv.writer(onoff, delimiter=',')
            writer.writerow([seg_start_time, offset_time])

    return 0
Example #28
def find_bounds_of_chord(song_analysis, chord_regex, rate):
    #bounds_chord = [librosa.time_to_samples(t, song[1]) for t in timestamps_chord]

    pairs = list(zip(song_analysis[:-1], song_analysis[1:]))
    time_bounds = [(float(chord['timestamp']), float(next_chord['timestamp'])) for chord, next_chord in pairs if re.match(chord_regex, chord['label'])]
    bounds = [librosa.time_to_samples(t, rate) for t in time_bounds]
    return bounds
Example #29
    def __init__(self,
                 dataset,
                 sr=22050,
                 frameSize=2048,
                 hopSize=512,
                 transform=None,
                 cacheSize=4):
        self.dataset = dataset
        self.sr = sr
        self.frameSize = frameSize
        self.hopSize = hopSize
        self.transform = transform
        self.cacheSize = cacheSize
        self.frameDt = float(frameSize) / sr

        # count frames in dataset
        nFramesList = []
        for pathPair in dataset.pathPairs:
            wavPath = pathPair.wav
            duration = librosa.get_duration(filename=wavPath)
            nSamples = librosa.time_to_samples(duration, sr=self.sr)
            nFrames = 1 + int(
                (nSamples - self.frameSize) / float(self.hopSize))
            nFramesList.append(nFrames)
            # check validation
            sStart = librosa.frames_to_samples(nFrames - 1,
                                               hop_length=self.hopSize)
            sEnd = sStart + self.frameSize
            assert (nSamples > 0) and (
                sEnd <= nSamples), f'{nFrames}:{sStart}_{sEnd}, {nSamples}'
        self.frameCumsum = np.cumsum(nFramesList)

        # FIFO cache
        self._sampleCache = deque(maxlen=cacheSize)
        self._sampleIdxCache = deque(maxlen=cacheSize)
Example #30
    def _slice_audio_by_interval(y: np.ndarray,
                                 sr: float,
                                 hop_length: int = 512,
                                 segmentation_interval_s: float = 1.0,
                                 **_kwargs) -> Tuple[np.ndarray, np.ndarray]:
        interval_samples: int = librosa.time_to_samples(
            segmentation_interval_s, sr=sr)
        total_samples: int = y.size  # y is monophonic
        num_segments: int = np.ceil(total_samples / interval_samples)
        onset_samples: np.ndarray = interval_samples * np.arange(num_segments)
        onset_frames: np.ndarray = librosa.samples_to_frames(
            onset_samples, hop_length=hop_length)

        duration_samples: np.ndarray = interval_samples * np.ones_like(
            onset_frames)

        # adjust duration of last fragment to end of file
        remainder = total_samples % interval_samples

        if remainder == 0:
            # `total_samples` is divisible by `interval_samples`: ceil operation above was not needed
            pass
        else:
            # `total_samples` is not divisible by `interval_samples`: last slice is shorter
            duration_samples[-1] = remainder

        duration_frames: np.ndarray = librosa.samples_to_frames(
            duration_samples, hop_length=hop_length)

        return onset_frames, duration_frames
Example #31
def get_onset(wav_path):
    y, sr = librosa.core.load(wav_path, sr=None)
    sos = signal.butter(25, 100, btype='highpass', fs=sr, output='sos')
    wav_data = signal.sosfilt(sos, y)
    wav_data = normalize(wav_data)

    sodf = SpectralOnsetProcessor(onset_method='complex_flux',
                                  fps=50,
                                  filterbank=LogarithmicFilterbank,
                                  fmin=100,
                                  num_bands=24,
                                  norm=True)
    from madmom.audio.signal import Signal
    onset_strength = (sodf(Signal(data=wav_data, sample_rate=sr)))
    onset_strength = librosa.util.normalize(onset_strength)
    h_length = int(librosa.time_to_samples(1. / 50, sr=sr))

    onset_times = librosa.onset.onset_detect(onset_envelope=onset_strength,
                                             sr=sr,
                                             hop_length=h_length,
                                             units='time',
                                             pre_max=5,
                                             post_max=5,
                                             pre_avg=5,
                                             post_avg=5)
    with open(onset_path, 'w') as f:
        for x in onset_times:
            f.write(f"{x}\n")
    return onset_times
Example #32
def test_tempo():

    def __test(tempo, sr, hop_length, ac_size, aggregate, y):

        tempo_est = librosa.beat.tempo(y=y, sr=sr, hop_length=hop_length,
                                       ac_size=ac_size,
                                       aggregate=aggregate)

        # Being within 5% for the stable frames is close enough
        if aggregate is None:
            win_size = int(ac_size * sr // hop_length)
            assert np.all(np.abs(tempo_est[win_size:-win_size] - tempo) <= 0.05 * tempo)
        else:
            assert np.abs(tempo_est - tempo) <= 0.05 * tempo, (tempo, tempo_est)

    for sr in [22050, 44100]:
        for tempo in [40, 60, 80, 110, 160]:
            # Make a pulse train at the target tempo
            y = np.zeros(20 * sr)
            delay = int(librosa.time_to_samples(60./tempo, sr=sr))
            y[::delay] = 1
            for hop_length in [512, 1024]:
                for ac_size in [4, 8]:
                    for aggregate in [None, np.mean]:
                        yield __test, tempo, sr, hop_length, ac_size, aggregate, y
Example #33
File: ltsd.py Project: jlep/vad
def test(filename=None):
    import random, os
    import matplotlib.pyplot as plt
    from sys import argv
    #signal, params = read_signal(sound,WINSIZE)
    scenario=None
    if filename != None:
        scene = os.path.basename(filename)[0]
    else:
        filename = random.choice([x for x in os.listdir("tmp/") if os.path.splitext(x)[1] == ".flac"])
        scene = filename[0]
        filename = "tmp/"+filename
    print(filename)
    truths = vad.load_truths()
    signal,rate = speech.read_soundfile(filename)
    seconds = float(len(signal))/rate
    winsize = librosa.time_to_samples(float(WINMS)/1000, rate)[0]
    window = sp.hanning(winsize)
    ltsd = LTSD(winsize,window,5)
    res, threshold,nstart,nend =  ltsd.compute(signal)
    segments = ltsd.segments(res, threshold)
    #print(float(len(signal))/rate, librosa.core.frames_to_time(len(res), 8000, winsize/2))
    segments = librosa.core.frames_to_time(segments, rate, winsize/2)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    #ax.plot((signal/np.max(signal))*np.mean(res)+np.mean(res))
    ax.plot(np.linspace(0,seconds, len(res)), res)
    ax.plot([0, seconds], [threshold, threshold])
    vad.plot_segments(truths[scene]['combined'], segments, ax)
    n1 = float(nstart)/rate
    n2 = float(nend)/rate
    ax.vlines([n1,n2], -20,20)
    plt.show()
Example #34
def pipeline(path, frame_ms=30, hop_ms=15):
    print("load")
    #sig, rate = librosa.load(path)
    #sig2, rate2 = ad.read_file(path)
    sig, rate = soundfile.read(path)
    sig = signal.wiener(sig)
    print("rate", rate)
    fsize = librosa.time_to_samples(float(frame_ms)/1000, rate)[0]
    hop = librosa.time_to_samples(float(hop_ms)/1000, rate)[0]
    print("frame size", fsize, "hop", hop)
    frames = librosa.util.frame(sig, fsize, hop)
    w = signal.hann(fsize)
    #frames_W = np.zeros_like(frames)
    #print(frames.shape)
    #frames = frames.T
    #print(w.shape)
    print("windowing function")
    frames_w = np.apply_along_axis(lambda x,w: x*w, 0, frames, w)
    frames = frames_w
    print("window suppression")
    frames = np.apply_along_axis(lambda x,w: x/(w+1e-15), 0, frames, w)
    #    frames_W[i] = signal.convolve(frames[i],w, mode='same')
    #frames = frames_W.T
    #w = signal.correlate(w,w,mode='full')
    #w = w[w.size/2:]
    #print(frames.shape)
    #frames = sigutil.enframe(sig, fsize, hop, signal.hann)
    print("normalized autocorrelation")
    naccs = np.apply_along_axis(nacc, 0, frames)
    print("trimming")
    naccs = np.apply_along_axis(trim_frame, 0, naccs)
    print(naccs.shape)
    minacs = np.zeros_like(naccs)
    for i in range(len(naccs.T)):
        minacs[:,i] = min_ac(naccs.T, i)
    print(minacs.shape)
    print("variances")
    #acvars = np.apply_along_axis(acvar, 0, naccs2)
    acvars = np.apply_along_axis(acvar, 0, minacs)
    print("ltacs")
    ltacs = np.zeros_like(acvars)
    for i in range(len(acvars)):
        ltacs[i] = ltac(acvars, i)
    return sig, rate, frames, fsize, minacs, acvars, ltacs
Example #35
    def __test():
        beat = audio.timings['beats'][0]

        start = beat.time.delta * 1e-9
        duration = beat.duration.delta * 1e-9
        starting_sample, ending_sample = librosa.time_to_samples([start, start + duration], beat.audio.sample_rate)

        samples, left_offset, right_offset = beat.get_samples()
        left_offsets, right_offsets = beat._get_offsets(starting_sample, ending_sample, beat.audio.num_channels)

        duration = beat.duration.delta * 1e-9
        starting_sample, ending_sample = librosa.time_to_samples([0, duration], audio.sample_rate)

        initial_length = ending_sample - starting_sample
        left_offset_length = initial_length - left_offsets[0] + left_offsets[1]
        right_offset_length = initial_length - right_offsets[0] + right_offsets[1]

        assert(len(samples[0]) == left_offset_length)
        assert(len(samples[1]) == right_offset_length)
Example #36
    def __test():
        beat = audio.timings['beats'][0]
        samples, left_offset, right_offset = beat.get_samples()

        start = beat.time.delta * 1e-9
        duration = beat.duration.delta * 1e-9
        starting_sample, ending_sample = librosa.time_to_samples([start, start + duration], beat.audio.sample_rate)
        left_offsets, right_offsets = beat._get_offsets(starting_sample, ending_sample, beat.audio.num_channels)

        start_sample = left_offsets[0] * -1
        end_sample = len(samples[0]) - left_offsets[1]
        reset_samples = samples[0][start_sample : end_sample]

        original_samples = audio.raw_samples[0, starting_sample : ending_sample]

        assert(np.array_equiv(reset_samples, original_samples))
Example #37
    def get_samples(self):
        """
        Gets the samples corresponding to this TimeSlice from the parent audio object.
        """
        start = self.time.delta * 1e-9
        duration = self.duration.delta * 1e-9
        starting_sample, ending_sample = librosa.time_to_samples([start, start + duration],
                                                                 self.audio.sample_rate)

        left_offsets, right_offsets = self._get_offsets(starting_sample,
                                                        ending_sample,
                                                        self.audio.num_channels)

        samples = self._offset_samples(starting_sample, ending_sample,
                                       left_offsets, right_offsets,
                                       self.audio.num_channels)

        return samples, left_offsets[0], right_offsets[0]
Example #38
File: ltsd.py Project: jlep/vad
def vad(soundfile, noisefile=None):
    signal,rate = speech.read_soundfile(soundfile)
    if noisefile != None:
        noise,nrate = speech.read_soundfile(noisefile)
        print("found noisefile: "+noisefile)
    else:
        noise = None
    seconds = float(len(signal))/rate
    winsize = librosa.time_to_samples(float(WINMS)/1000, rate)[0]
    window = sp.hanning(winsize)
    ltsd = LTSD(winsize,window,5, init_noise=noise)
    res, threshold,nstart,nend =  ltsd.compute(signal)
    segments,  = ltsd.segments(res, threshold)
    #print(float(len(signal))/rate, librosa.core.frames_to_time(len(res), 8000, winsize/2))
    segments = librosa.core.samples_to_time(segments, rate).tolist()
    indexes = []
    for s in segments:
        indexes += s
    indexes.append(seconds)
    return indexes
Example #39
    def __test(times, frames, sr, hop_length, click_freq, click_duration, click, length):

        y = librosa.clicks(times=times,
                           frames=frames,
                           sr=sr,
                           hop_length=hop_length,
                           click_freq=click_freq,
                           click_duration=click_duration,
                           click=click,
                           length=length)

        if times is not None:
            nmax = librosa.time_to_samples(times, sr=sr).max()
        else:
            nmax = librosa.frames_to_samples(frames, hop_length=hop_length).max()

        if length is not None:
            assert len(y) == length
        elif click is not None:
            assert len(y) == nmax + len(click)
Example #40
def segment(audio_file, mode, db_delta_thresh=2.5, **kwargs):
    x, fs = claudio.read(audio_file, samplerate=22050, channels=1, bytedepth=2)

    if mode == 'hll':
        onset_times = hll_onsets(audio_file)
    else:
        onset_times = ONSETS.get(mode)(x, fs, **kwargs)

    onset_idx = librosa.time_to_samples(onset_times, fs)

    log_env_lpf = log_envelope(x, fs, 100)
    recs = []
    for time, idx in zip(onset_times, onset_idx):
        x_m = log_env_lpf[idx: idx + int(fs)]
        rec = dict(time=time, env_max=x_m.max(),
                   env_mean=x_m.mean(), env_std=x_m.std(),
                   env_delta=x_m.max() - log_env_lpf.mean())
        if rec['env_delta'] > db_delta_thresh:
            recs += [rec]

    return pd.DataFrame.from_records(recs)
Example #41
    def __test(sr):
        assert np.allclose(librosa.time_to_samples([0, 1, 2], sr=sr),
                           [0, sr, 2 * sr])
Example #42
def pipeline(path, frame_ms=30, hop_ms=15, filt=True, noisy=True, shift=True, snr=30):
    #sig, rate = librosa.load(path)
    #sig2, rate2 = ad.read_file(path)
    sig, rate = speech.read_soundfile(path)
    sig = signal.wiener(sig)
    fsize = librosa.time_to_samples(float(frame_ms)/1000, rate)[0]
    hop = librosa.time_to_samples(float(hop_ms)/1000, rate)[0]
    if filt:
        sig = bp_filter(sig)
    if noisy:
        sig = speech.add_noise(sig, "noise8k/white.flac", snr)
    frames = librosa.util.frame(sig, fsize, hop)
    w = signal.hann(fsize)
    #frames_W = np.zeros_like(frames)
    #print(frames.shape)
    #frames = frames.T
    #print(w.shape)
    frames_w = np.apply_along_axis(lambda x,w: x*w, 0, frames, w)
    frames = frames_w
    frames = np.apply_along_axis(lambda x,w: x/(w+1e-15), 0, frames, w)
    #    frames_W[i] = signal.convolve(frames[i],w, mode='same')
    #frames = frames_W.T
    #w = signal.correlate(w,w,mode='full')
    #w = w[w.size/2:]
    #print(frames.shape)
    #frames = sigutil.enframe(sig, fsize, hop, signal.hann)
    #print("normalized autocorrelation")
    naccs = np.apply_along_axis(nacc, 0, frames)
    #print("trimming")
    naccs = np.apply_along_axis(trim_frame, 0, naccs)
    lags = np.zeros(len(naccs.T))
    acf_n = np.zeros(len(naccs.T))
    for i in range(len(naccs.T)):
        frame = naccs.T[i]
        relmax = signal.argrelmax(frame)[0]
        if len(relmax)>0:
            argmax2 = relmax[0] + np.argmax(frame[relmax[0]:])
        else:
            argmax2 = np.argmax(frame)
        #print(relmax)
        """
        if len(relmax)>=2:
            #print(relmax[0], relmax[1], relmax[1]-relmax[0])
            lags[i] = relmax[1]-relmax[0]
        elif len(relmax) == 1:
            lags[i] = relmax[0]
        """
        lags[i] = argmax2
        acf_n[i] = len(relmax)
        #print(lags[i], len(relmax))
        naccs.T[i] = np.roll(frame, -1*argmax2)
    #minacs = np.zeros_like(naccs)
    #for i in range(len(naccs.T)):
    #    minacs[:,i] = min_ac(naccs.T, i)
    meanacs = np.zeros_like(naccs)
    for i in range(len(naccs.T)):
        meanacs[:,i] = mean_ac(naccs.T, i)
    #print(naccs.shape)
    #print(meanacs.shape)
    #print("lags")
    #print("variances")
    #acvars = np.apply_along_axis(acvar, 0, naccs2)
    acvars = np.apply_along_axis(acvar, 0, meanacs)
    #print("ltacs")
    ltacs = np.zeros_like(acvars)
    for i in range(len(acvars)):
        ltacs[i] = ltac(acvars, i)
    print("done: "+path)
    return sig, rate, frames, fsize, meanacs, acvars, ltacs, (lags, acf_n)
Example #43
######################################################
# We'll load in a five-second clip of a track that has
# noticeable vocal vibrato.
# The method works fine for longer signals, but the
# results are harder to visualize.
y, sr = librosa.load('audio/Karissa_Hobbs_-_09_-_Lets_Go_Fishin.mp3',
                     sr=44100,
                     duration=5,
                     offset=35)


####################################################
# These parameters are taken directly from the paper
n_fft = 1024
hop_length = int(librosa.time_to_samples(1./200, sr=sr))
lag = 2
n_mels = 138
fmin = 27.5
fmax = 16000.
max_size = 3


########################################################
# The paper uses a log-frequency representation, but for
# simplicity, we'll use a Mel spectrogram instead.
S = librosa.feature.melspectrogram(y, sr=sr, n_fft=n_fft,
                                   hop_length=hop_length,
                                   fmin=fmin,
                                   fmax=fmax,
                                   n_mels=n_mels)
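A hedged continuation of this snippet: computing the superflux-style onset envelope from the Mel spectrogram with the lag and max_size parameters set above, following librosa's documented onset_strength interface:

import numpy as np

odf_sf = librosa.onset.onset_strength(S=librosa.power_to_db(S, ref=np.max),
                                      sr=sr,
                                      hop_length=hop_length,
                                      lag=lag,
                                      max_size=max_size)

The resulting envelope can then be passed to librosa.onset.onset_detect(onset_envelope=odf_sf, sr=sr, hop_length=hop_length, units='time') to obtain onset times, much as Example #31 does with a madmom envelope.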
Example #44
def meshuggahme(input_file, features, improve_func, onset_dicts, onset_dir,
                metric='cosine', output_file='output.wav', original_w=9.5):
    """Converts the given input file into a Meshuggah track and saves it into
    disk as a wav file.

    Parameters
    ----------
    input_file : str
        Path to the input audio file to be converted.
    features : np.array
        Model of features to use (MFCCs, CQT, or chroma)
    improve_func : function
        One of the _improve_ functions (see above)
    onset_dicts : dictionary
        Onsets model (see generate_data script)
    onset_dir : str
        Path to directory with meshuggah onset audio files
    metric : str
        One of the scipy.spatial.distance functions
    output_file : str
        Path to the output wav file
    original_w : float
        Weight of the original file (the higher the more original audio we'll
        get)
    """
    y, onset_times, mfcc_sync, cqt_sync, chroma_sync = \
        compute_features(input_file)
    assert mfcc_sync.shape[1] == cqt_sync.shape[1] and \
        cqt_sync.shape[1] == chroma_sync.shape[1]

    # Select feature
    if features.shape[0] == CQT_BINS:
        feat_sync = cqt_sync
    elif features.shape[0] == N_MFCC:
        feat_sync = mfcc_sync
    elif features.shape[0] == N_CHROMA:
        feat_sync = chroma_sync

    # Improve features
    features = improve_func(features.T)

    # Construct
    for feat, (start, end) in zip(feat_sync.T, zip(onset_times[:-1],
                                                   onset_times[1:])):
        # Get start and end times in samples
        start_end_samples = librosa.time_to_samples(np.array([start, end]),
                                                    sr=SRATE)

        # Compute minimum distance from all the matrix of onsets
        D = distance.cdist(features, improve_func(feat.reshape((1, -1))),
                           metric=metric)
        argsorted = np.argsort(D.flatten())

        # Find onset id with at least the same duration as the meshuggah onset
        sort_idx = 0
        dur = 0
        while True:
            onset_id = argsorted[sort_idx]

            # Get dictionary
            onset_dict = onset_dicts[onset_id]

            # Increase index to go to the next closest meshuggah onset
            sort_idx += 1

            # Try to concatenate
            x, sr = librosa.load(os.path.join(onset_dir,
                                              onset_dict["onset_file"]),
                                 sr=SRATE)
            if len(y[start_end_samples[0]:start_end_samples[1]]) <= len(x):
                break

        # Concatenate new audio
        w = np.min([D[onset_id][0] * original_w, 1])  # Rebalance weight
        y[start_end_samples[0]:start_end_samples[1]] = \
            y[start_end_samples[0]:start_end_samples[1]] * w + \
            x[:start_end_samples[1] - start_end_samples[0]] * (1 - w)

    # Write new audio file
    librosa.output.write_wav(output_file, y, sr=SRATE)