def analyse(filename, resample_to=2756, bt_hop_length=128,
            chroma_hop_length=512, chroma_n_fft=1024):
    samples, sampleRate = librosa.load(filename)
    length = float(len(samples))/sampleRate
    if resample_to:
        samples = librosa.resample(samples, sampleRate, resample_to)
        sampleRate = resample_to
    newSampleRate = 2756
    samples = librosa.resample(samples, sampleRate, newSampleRate)
    sampleRate = newSampleRate
    tempo, beats = librosa.beat.beat_track(samples, sampleRate,
                                           hop_length=bt_hop_length)
    beat_times = librosa.frames_to_time(beats, sampleRate,
                                        hop_length=bt_hop_length)
    chromagram = librosa.feature.chromagram(samples, sampleRate,
                                            hop_length=chroma_hop_length,
                                            n_fft=chroma_n_fft)
    chromagram = numpy.transpose(chromagram)
    distances = scipy.spatial.distance.cdist(chromagram, CHORDS, "cosine")
    chords = distances.argmin(axis=1)
    chords = scipy.signal.medfilt(chords, 11)
    chord_frames = numpy.array(numpy.where(numpy.diff(chords) != 0))
    chords = chords[chord_frames][0].astype(int)
    chord_times = librosa.frames_to_time(chord_frames, sampleRate,
                                         hop_length=chroma_hop_length,
                                         n_fft=chroma_n_fft)[0]
    chord_names = CHORD_NAMES[chords]
    return {"beats": list(beat_times),
            "chords": [{"chord": chord_name, "time": chord_time} for chord_name, chord_time in zip(chord_names, chord_times)],
            "tempo": tempo}
Example #2
def test_resample_scikitsamplerate():
    warnings.resetwarnings()
    warnings.simplefilter('always')
    with warnings.catch_warnings(record=True) as out:

        librosa.resample(np.zeros(1000), 1000, 500, res_type='sinc_best')

        assert len(out) > 0
        assert out[0].category is DeprecationWarning
        assert 'deprecated' in str(out[0].message).lower()
Example #3
def lpc_formants(signal, sr, num_formants, max_freq, time_step,
                 win_len, window_shape='gaussian'):
    output = {}
    new_sr = 2 * max_freq
    alpha = np.exp(-2 * np.pi * 50 * (1 / new_sr))
    proc = lfilter([1., -alpha], 1, signal)
    if sr > new_sr:
        proc = librosa.resample(proc, sr, new_sr)
    nperseg = int(win_len * new_sr)
    nperstep = int(time_step * new_sr)
    if window_shape == 'gaussian':
        window = gaussian(nperseg + 2, 0.45 * (nperseg - 1) / 2)[1:nperseg + 1]
    else:
        window = np.hanning(nperseg + 2)[1:nperseg + 1]
    indices = np.arange(int(nperseg / 2), proc.shape[0] - int(nperseg / 2) + 1, nperstep)
    num_frames = len(indices)
    for i in range(num_frames):
        if nperseg % 2 != 0:
            X = proc[indices[i] - int(nperseg / 2):indices[i] + int(nperseg / 2) + 1]
        else:
            X = proc[indices[i] - int(nperseg / 2):indices[i] + int(nperseg / 2)]
        frqs, bw = process_frame(X, window, num_formants, new_sr)
        formants = []
        for j, f in enumerate(frqs):
            if f < 50:
                continue
            if f > max_freq - 50:
                continue
            formants.append((f.item(), bw[j].item()))
        missing = num_formants - len(formants)
        if missing:
            formants += [(None, None)] * missing
        output[indices[i] / new_sr] = formants
    return output
Example #4
def read_song(source_id):
    song_data_raw, source_sr = lr.load(data_source[source_id])
    song_data = lr.resample(song_data_raw, source_sr, target_sr, scale=True)
    song_data = song_data[:song_data.shape[0]//10]
    song_data, data_denom = norm(song_data)
    
    return song_data, source_sr, data_denom
Example #5
def get_best_fs_ratio(a, b, max_drift, steps, max_offset, correlation_size, center=1):
    '''
    Given two signals with components in common, tries to estimate the clock drift and offset of b vs a
    
    :parameters:
        - a : np.ndarray
            Some signal
        - b : np.ndarray
            Some other signal
        - max_drift : float
            max sample rate drift, in percent, e.g. .02 = 2% clock drift
        - steps : int
            Number of sample rates to consider, between -max_drift and max_drift
        - max_offset : int
            Maximum expected offset of the signals
        - correlation_size : int
            Number of samples to use in each correlate
        - center : float
            Ratio to deviate from - default 1
    Output:
        fs_ratio - fs ratio to make b line up well with a
    '''
    # Sample rate ratios to try
    fs_ratios = center + np.linspace(-max_drift, max_drift, steps + 1)
    # The max correlation value for each fs ratio
    corr_max = np.zeros(fs_ratios.shape)
    for n, ratio in enumerate(fs_ratios):
        # Resample b with this fs ratio
        b_resampled = librosa.resample(b, 1, ratio)
        # Compute the max correlation
        _, corr = align_over_window(a, b_resampled, max_offset, correlation_size)
        corr_max[n] = corr.max()
    # Choose ratio with the highest correlation value
    return fs_ratios[np.argmax(corr_max)]
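Usage sketch (not part of the original source): the signals below are synthetic stand-ins for two real recordings of the same material, and align_over_window is assumed to be defined as in the snippet above.

import numpy as np
import librosa

sr = 22050
t = np.arange(10 * sr) / sr
a = np.sin(2 * np.pi * 440 * t)               # reference signal
b = librosa.resample(a, sr, int(sr * 1.005))  # copy with ~0.5% simulated clock drift
ratio = get_best_fs_ratio(a, b, max_drift=0.02, steps=20,
                          max_offset=sr // 10, correlation_size=sr)
b_aligned = librosa.resample(b, 1, ratio)     # undo the estimated drift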
Example #6
def read_song(source_id, target_sr):
    song_data_raw, source_sr = lr.load(data_source[source_id])
    song_data = lr.resample(song_data_raw, source_sr, target_sr, scale=True)
    song_data = song_data[1500:(1500+2*seq_size)] #song_data.shape[0]/10]
    song_data, data_denom = norm(song_data, return_denom=True)

    return song_data, source_sr, data_denom
Example #7
    def save_waveform_as(self, waveform, data_id, dst):
        source_sr, data_denom = self.get_data_info(data_id)

        waveform *= data_denom
        waveform_resampled = lr.resample(waveform, self.cfg.target_sr, source_sr, scale=True)

        logging.info("Saving waveform as {}".format(dst))
        lr.output.write_wav(dst, waveform_resampled, source_sr)
Example #8
def compute_pcen(audio, sr):
    # Load settings.
    pcen_settings = get_pcen_settings()

    # Map to the range [-2**31, 2**31[
    audio = (audio * (2**31)).astype('float32')

    # Resample to 22,050 Hz
    if not sr == pcen_settings["sr"]:
        audio = librosa.resample(audio, sr, pcen_settings["sr"])
        sr = pcen_settings["sr"]

    # Compute Short-Term Fourier Transform (STFT).
    stft = librosa.stft(
        audio,
        n_fft=pcen_settings["n_fft"],
        win_length=pcen_settings["win_length"],
        hop_length=pcen_settings["hop_length"],
        window=pcen_settings["window"])

    # Compute squared magnitude coefficients.
    abs2_stft = (stft.real*stft.real) + (stft.imag*stft.imag)

    # Gather frequency bins according to the Mel scale.
    # NB: as of librosa v0.6.2, melspectrogram is type-instable and thus
    # returns 64-bit output even with a 32-bit input. Therefore, we need
    # to convert PCEN to single precision eventually. This might not be
    # necessary in the future, if the whole PCEN pipeline is kept type-stable.
    melspec = librosa.feature.melspectrogram(
        y=None,
        S=abs2_stft,
        sr=pcen_settings["sr"],
        n_fft=pcen_settings["n_fft"],
        n_mels=pcen_settings["n_mels"],
        htk=True,
        fmin=pcen_settings["fmin"],
        fmax=pcen_settings["fmax"])

    # Compute PCEN.
    pcen = librosa.pcen(
        melspec,
        sr=pcen_settings["sr"],
        hop_length=pcen_settings["hop_length"],
        gain=pcen_settings["pcen_norm_exponent"],
        bias=pcen_settings["pcen_delta"],
        power=pcen_settings["pcen_power"],
        time_constant=pcen_settings["pcen_time_constant"])

    # Convert to single floating-point precision.
    pcen = pcen.astype('float32')

    # Truncate spectrum to range 2-10 kHz.
    pcen = pcen[:pcen_settings["top_freq_id"], :]

    # Return.
    return pcen
Example #9
def slice_clip(filename, start, stop, n_samples, sr, mono=True):
    '''Slice a fragment of audio from a file.

    This uses pysoundfile to efficiently seek without
    loading the entire stream.

    Parameters
    ----------
    filename : str
        Path to the input file

    start : int
        The sample index of `filename` at which the audio fragment should start

    stop : int
        The sample index of `filename` at which the audio fragment should stop (e.g. y = audio[start:stop])

    n_samples : int > 0
        The number of samples to load

    sr : int > 0
        The target sampling rate

    mono : bool
        Ensure monophonic audio

    Returns
    -------
    y : np.ndarray [shape=(n_samples,)]
        A fragment of audio sampled from `filename`

    Raises
    ------
    ValueError
        If the source file is shorter than the requested length

    '''

    with psf.SoundFile(str(filename), mode='r') as soundf:
        n_target = stop - start

        soundf.seek(start)

        y = soundf.read(n_target).T

        if mono:
            y = librosa.to_mono(y)

        # Resample to the target sr
        y = librosa.resample(y, soundf.samplerate, sr)

        # Clip to the target length exactly
        y = librosa.util.fix_length(y, n_samples)

        return y
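Usage sketch (an assumption, not from the source): slice one second of audio starting at sample 22050 of a placeholder file and return exactly 16000 samples at 16 kHz.

y = slice_clip('clip.wav', start=22050, stop=44100, n_samples=16000, sr=16000)
assert y.shape == (16000,)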
Example #10
 def _read_song(self, fname, proportion=None):
     logging.info("Reading {}".format(fname))
     song_data_raw, source_sr = lr.load(fname)
     logging.info("Got sampling rate {}, resampling to {} ...".format(source_sr, self.cfg.target_sr))
     song_data = lr.resample(song_data_raw, source_sr, self.cfg.target_sr, scale=True)
     logging.info("Normalizing with l2 norm ...")
     if proportion:
         song_data = song_data[: int(proportion * len(song_data)),]
     song_data, data_denom = norm(song_data)
     logging.info("Done")
     return song_data, source_sr, data_denom
Example #11
def sample_clip(filename, n_samples, sr, mono=True):
    '''Sample a fragment of audio from a file.

    This uses pysoundfile to efficiently seek without
    loading the entire stream.

    Parameters
    ----------
    filename : str
        Path to the input file

    n_samples : int > 0
        The number of samples to load

    sr : int > 0
        The target sampling rate

    mono : bool
        Ensure monophonic audio

    Returns
    -------
    y : np.ndarray [shape=(n_samples,)]
        A fragment of audio sampled randomly from `filename`

    Raises
    ------
    ValueError
        If the source file is shorter than the requested length

    '''

    with psf.SoundFile(str(filename), mode='r') as soundf:

        n_target = int(np.ceil(n_samples * soundf.samplerate / sr))

        # Draw a random clip
        start = np.random.randint(0, len(soundf) - n_target)

        soundf.seek(start)

        y = soundf.read(n_target).T

        if mono:
            y = librosa.to_mono(y)

        # Resample to the target sr
        y = librosa.resample(y, soundf.samplerate, sr)

        # Clip to the target length exactly
        y = librosa.util.fix_length(y, n_samples)

        return y
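Because sample_clip draws a random clip on every call, a training loop can stack several draws into a batch. A sketch with a placeholder path:

import numpy as np

batch = np.stack([sample_clip('song.wav', n_samples=22050, sr=22050)
                  for _ in range(4)])
print(batch.shape)  # (4, 22050)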
Example #12
def activation_upsample(A_low, sr):

    n, k, _, sr_old = A_low.shape

    A = np.zeros((n, k, 1, sr))

    for i in range(n):
        for j in range(k):
            act_res = librosa.resample(A_low[i, j, :, :].squeeze(), orig_sr=sr_old, target_sr=sr)

            # Local-max filter act_res and write it into the output buffer
            act_res = librosa.util.localmax(act_res) * act_res
            A[i, j, :, : min(sr, len(act_res))] = act_res[:sr]

    return A
Example #13
    def __test(sr_in, sr_out, res_type, y):

        y2 = librosa.resample(y, sr_in, sr_out,
                              res_type=res_type,
                              scale=True)

        # First, check that the audio is valid
        librosa.util.valid_audio(y2, mono=True)

        n_orig = np.sqrt(np.sum(np.abs(y)**2))
        n_res = np.sqrt(np.sum(np.abs(y2)**2))

        # If it's a no-op, make sure the signal is untouched
        assert np.allclose(n_orig, n_res, atol=1e-2), (n_orig, n_res)
Example #14
def analyse(filename, resample_to=2756, bt_hop_length=128, chroma_hop_length=512, chroma_n_fft=1024):
  #load the audio file -> returns a floating-point time series (array) and its sampling rate (int > 0)
  samples, sampleRate = librosa.load(filename)
  length = float(len(samples))/sampleRate
  if resample_to:
    #resample time series from sampleRate to resample_to rate
    samples = librosa.resample(samples, sampleRate, resample_to)
    sampleRate = resample_to
  newSampleRate = 2756
  #resample time series from sampleRate to newSampleRate(2756)
  samples = librosa.resample(samples, sampleRate, newSampleRate)
  sampleRate = newSampleRate
  #track the beats in the time series, advancing hop_length samples per frame
  # --> returns tempo (estimated global tempo in BPM) and beats (frame indices of estimated beat events)
  tempo, beats = librosa.beat.beat_track(samples, sampleRate, hop_length=bt_hop_length)
  #convert the frame counts in 'beats' to time (seconds)
  beat_times = librosa.frames_to_time(beats, sampleRate, hop_length=bt_hop_length)
  #draw a chromagram using the data from samples, sampleRate, hop_length, and window size specified by n_fft
  chromagram = librosa.feature.chromagram(samples, sampleRate, hop_length=chroma_hop_length, n_fft=chroma_n_fft)
  #transpose the chromagram so rows are frames and columns are pitch classes
  chromagram = numpy.transpose(chromagram)
  #compute the cosine distance between each chromagram frame and the CHORDS templates
  distances = scipy.spatial.distance.cdist(chromagram, CHORDS, "cosine")
  #for each frame (row), take the index of the nearest chord template (argmin along axis=1)
  chords = distances.argmin(axis=1)
  #apply a median filter with a kernel of size 11 to smooth the chord sequence
  chords = scipy.signal.medfilt(chords, 11)
  #find the frame indices where the chord changes (discrete difference along the sequence is nonzero)
  chord_frames = numpy.array(numpy.where(numpy.diff(chords) != 0))
  chords = chords[chord_frames][0].astype(int)
  #convert the frame counts in 'chord_frames' to time (seconds)
  chord_times = librosa.frames_to_time(chord_frames, sampleRate, hop_length=chroma_hop_length, n_fft=chroma_n_fft)[0]
  chord_names = CHORD_NAMES[chords]

  return {"beats": list(beat_times),
          "chords": [{"chord": chord_name, "time": chord_time} for chord_name, chord_time in zip(chord_names, chord_times)],
          "tempo": tempo}
Example #15
    def __init__(self, file_path=None, raw_samples=None, convert_to_mono=False,
                 sample_rate=44100, analysis_sample_rate=22050):
        """
        Audio constructor.
        Opens a file path, loads the audio with librosa, and prepares the features

        Parameters
        ----------

        file_path: string
            path to the audio file to load

        raw_samples: np.array
            samples to use for audio output

        convert_to_mono: boolean
            (optional) converts the file to mono on loading

        sample_rate: number > 0 [scalar]
            (optional) sample rate to pass to librosa.

        analysis_sample_rate: number > 0 [scalar]
            (optional) sample rate used for the mono analysis signal.

        Returns
        ------
        An Audio object
        """

        if file_path:
            y, sr = librosa.load(file_path, mono=convert_to_mono, sr=sample_rate)
        elif raw_samples is not None:
            # This assumes that we're passing in raw_samples
            # directly from another Audio's raw_samples.
            y = raw_samples
            sr = sample_rate

        self.file_path = file_path
        self.sample_rate = float(sr)
        self.analysis_sample_rate = float(analysis_sample_rate)
        self.num_channels = y.ndim
        self.duration = librosa.get_duration(y=y, sr=sr)

        self.analysis_samples = librosa.resample(librosa.to_mono(y),
                                                 sr, self.analysis_sample_rate,
                                                 res_type='kaiser_best')
        self.raw_samples = np.atleast_2d(y)

        self.zero_indexes = self._create_zero_indexes()
        self.features = self._create_features()
        self.timings = self._create_timings()
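Construction sketch (hypothetical path and values): build one Audio object from a file, then a second from the first one's raw samples, exercising the constructor's two code paths.

a = Audio(file_path='song.wav', convert_to_mono=True)
b = Audio(raw_samples=a.raw_samples, sample_rate=int(a.sample_rate))
print(a.duration, b.num_channels)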
Example #16
    def __test(res_type):
        y_native, sr = librosa.load(librosa.util.example_audio_file(),
                                    sr=None,
                                    offset=offset,
                                    duration=duration,
                                    res_type=res_type)

        y2 = librosa.resample(y_native, sr, sr_target, res_type=res_type)

        y, _ = librosa.load(librosa.util.example_audio_file(),
                            sr=sr_target,
                            offset=offset,
                            duration=duration,
                            res_type=res_type)

        assert np.allclose(y2, y)
Example #17
def signal_to_formants(signal, sr, freq_lims, win_len,
                    time_step, num_formants, window_shape = 'gaussian',
                    begin = None, padding = None):
    rep = {}
    new_sr = 2 * freq_lims[1]
    alpha = np.exp(-2 * np.pi * 50 * (1 / new_sr))
    proc = lfilter([1., -alpha], 1, signal)
    proc = librosa.resample(proc, sr, new_sr)
    nperseg = int(win_len*new_sr)
    nperstep = int(time_step*new_sr)
    if window_shape == 'gaussian':
        window = gaussian(nperseg + 2, 0.45 * (nperseg - 1) / 2)[1:nperseg + 1]
    else:
        window = hanning(nperseg + 2)[1:nperseg+1]
    indices = np.arange(int(nperseg / 2), proc.shape[0] - int(nperseg / 2) + 1, nperstep)
    num_frames = len(indices)
    for i in range(num_frames):
        if nperseg % 2 != 0:
            X = proc[indices[i] - int(nperseg / 2):indices[i] + int(nperseg / 2) + 1]
        else:
            X = proc[indices[i] - int(nperseg / 2):indices[i] + int(nperseg / 2)]
        frqs, bw = process_frame(X, window, num_formants, new_sr)
        formants = []
        for j,f in enumerate(frqs):
            if f < 50:
                continue
            if f > freq_lims[1] - 50:
                continue
            formants.append((f.item(), bw[j].item()))
        missing = num_formants - len(formants)
        if missing:
            formants += [(None,None)] * missing
        rep[indices[i]/new_sr] = formants

    duration = signal.shape[0] / sr
    if begin is not None:
        if padding is not None:
            begin -= padding
        real_output = {}
        for k,v in rep.items():
            if padding is not None and (k < padding or k > duration - padding):
                continue
            t = float(k + begin)
            real_output[t] = v
        return real_output
    return rep
Example #18
def get_enrate(signal = None, sr = None, filepath = None, downsample=100):
  """Computes the enrate as described in
    ARGS
      signal: audio signal <number array>
      sr: sampling rate <int>
      filepath: fullpath of audio file <str>
      downsample: sampling rate to downsample signal <int>
    RETURN
      enrate: proportional to speaking rate <float>
  """
  if signal is None:
    signal, sr = load_signal(filepath)

  # FFT data
  n_fft = downsample
  hop_length = int(0.8*downsample)

  # Half-wave rectify the signal waveform
  signal[signal<0] = 0

  # Low-pass filter
  numtaps=2
  cutoff=16.0
  nyq=sr/2.0

  # firwin's nyq argument handles normalization, so pass the cutoff in Hz
  transfer_function = scipy.signal.firwin(numtaps=numtaps, cutoff=cutoff, nyq=nyq)
  signal = scipy.signal.lfilter(transfer_function, 1.0, signal)

  # Downsample to 100 Hz
  signal = librosa.resample(signal, sr, downsample)

  # Hamming window 1-2 seconds with > 75% overlap
  fft_window = scipy.signal.hamming(downsample, sym=False)

  # FFT, ignore values above 16 hz
  magnitudes = np.abs(librosa.stft(signal, n_fft, hop_length, window=fft_window))

  bin_count, freq_res = get_bin_count_and_frequency_resolution(n_fft, downsample)
  lowest_fbin_idx = int(1/freq_res)
  highest_fbin_idx = int(16/freq_res)

  # Compute the spectral moment ( index weight each power spectral value and sum )
  enrate = np.sum(magnitudes[lowest_fbin_idx:highest_fbin_idx].T * np.array(range(lowest_fbin_idx, highest_fbin_idx)))
  return enrate
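Usage sketch (placeholder path; load_signal is assumed from the snippet's module):

rate = get_enrate(filepath='speech.wav')  # larger values ~ faster speaking rate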
Example #19
def wav_data_to_samples(wav_data, sample_rate):
  """Read PCM-formatted WAV data and return a NumPy array of samples.

  Uses scipy to read and librosa to process WAV data. Audio will be converted to
  mono if necessary.

  Args:
    wav_data: WAV audio data to read.
    sample_rate: The number of samples per second at which the audio will be
        returned. Resampling will be performed if necessary.

  Returns:
    A numpy array of audio samples, single-channel (mono) and sampled at the
    specified rate, in float32 format.

  Raises:
    AudioIOReadError: If scipy is unable to read the WAV data.
    AudioIOError: If audio processing fails.
  """
  try:
    # Read the wav file, converting sample rate & number of channels.
    native_sr, y = scipy.io.wavfile.read(six.BytesIO(wav_data))
  except Exception as e:  # pylint: disable=broad-except
    raise AudioIOReadError(e)

  if y.dtype == np.int16:
    # Convert to float32.
    y = int16_samples_to_float32(y)
  elif y.dtype == np.float32:
    # Already float32.
    pass
  else:
    raise AudioIOError(
        'WAV file not 16-bit or 32-bit float PCM, unsupported')
  try:
    # Convert to mono and the desired sample rate.
    if y.ndim == 2 and y.shape[1] == 2:
      y = y.T
      y = librosa.to_mono(y)
    if native_sr != sample_rate:
      y = librosa.resample(y, native_sr, sample_rate)
  except Exception as e:  # pylint: disable=broad-except
    raise AudioIOError(e)
  return y
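Usage sketch with a placeholder 16-bit PCM file:

with open('audio.wav', 'rb') as f:
    samples = wav_data_to_samples(f.read(), sample_rate=16000)
# samples is mono float32 at 16 kHz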
Example #20
    def __test(y, sr_in, sr_out, res_type, fix):

        y2 = librosa.resample(y, sr_in, sr_out,
                              res_type=res_type,
                              fix=fix)

        # First, check that the audio is valid
        librosa.util.valid_audio(y2, mono=True)

        # If it's a no-op, make sure the signal is untouched
        if sr_out == sr_in:
            assert np.allclose(y, y2)

        # Check buffer contiguity
        assert y2.flags['C_CONTIGUOUS']

        # Check that we're within one sample of the target length
        target_length = y.shape[-1] * sr_out // sr_in
        assert np.abs(y2.shape[-1] - target_length) <= 1
Example #21
    def __test(infile):
        DATA    = load(infile)
        
        # load the wav file
        (y_in, sr_in) = librosa.load(DATA['wavfile'][0], sr=None, mono=True)

        # Resample it to the target rate
        y_out = librosa.resample(y_in, DATA['sr_in'], DATA['sr_out'])

        # Are we the same length?
        if len(y_out) == len(DATA['y_out']):
            # Is the data close?
            assert np.allclose(y_out, DATA['y_out'])
        elif len(y_out) == len(DATA['y_out']) - 1:
            assert (np.allclose(y_out, DATA['y_out'][:-1,0]) or
                    np.allclose(y_out, DATA['y_out'][1:,0]))
        elif len(y_out) == len(DATA['y_out']) + 1:
            assert (np.allclose(y_out[1:], DATA['y_out']) or
                    np.allclose(y_out[:-2], DATA['y_out']))
        else:
            assert False
        pass
Example #22
def resample_all():

	audio_folder = 'scenes_stereo/'
	subsamp_folder = 'scenes_mono_8k/'

	chdir(audio_folder)
	mkdir(subsamp_folder)

	for sub_folder in glob('*'):

		mkdir(subsamp_folder + sub_folder)

		for filename in glob(sub_folder + '/*.wav'):

			print(filename)

			[fs, sig] = read(filename)

			sig = to_mono(sig.T)
			sig = resample(sig, fs, 8000)
		
			write(subsamp_folder + filename, 8000, sig)
Example #23
def apply_offsets_resample(b, offset_locations, offsets):
    '''
    Adjust a signal b according to local offset estimations using resampling
    
    :parameters:
        - b : np.ndarray
            Some signal
        - offset_locations : np.ndarray
            locations, in samples, of each local offset estimation
        - offsets : np.ndarray
            local offset for the corresponding sample in offset_locations
    :returns:
        - b_aligned : np.ndarray
            b with offsets applied
    ''' 
    assert offset_locations.shape[0] == offsets.shape[0]
    # Include signal boundaries in offset locations
    offset_locations = np.append(0, np.append( offset_locations, b.shape[0]-100 ))
    # Allocate output signal
    b_aligned = np.zeros(int(np.sum(np.diff(offset_locations)) + np.max(np.abs(offsets))))
    # Set last offset to whatever the second to last one was
    offsets = np.append(offsets, offsets[-1])
    current = 0
    # !!!!!!!!!!!!!!!!!!
    # Should zip here
    # !!!!!!!!!!!!!!!!!!
    for n, offset in enumerate(offsets):
        start = offset_locations[n]
        end = offset_locations[n + 1]
        # Compute the necessary resampling ratio to compensate for this offset
        ratio = 1 + (-offset + start - current)/(end - start)
        # Resample this portion of the signal, with some padding at the end
        resampled = librosa.resample(b[start:end + 100], 1, ratio)
        # Compute length and place the signal
        length = int(end - current - offset)
        b_aligned[current:current + length] = resampled[:length]
        current += length
    return b_aligned
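A sketch of applying the correction; the offset locations and values below are invented stand-ins for estimates produced elsewhere (e.g. by align_over_window):

import numpy as np

b = np.random.randn(5 * 44100)                # placeholder signal
offset_locations = np.array([44100, 132300])  # sample positions of the estimates
offsets = np.array([12, 25])                  # local offsets, in samples
b_aligned = apply_offsets_resample(b, offset_locations, offsets)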
Example #24
    def __test(infile, scipy_resample):
        DATA = load(infile)

        # load the wav file
        (y_in, sr_in) = librosa.load(DATA['wavfile'][0], sr=None, mono=True)

        # Resample it to the target rate
        y_out = librosa.resample(y_in, DATA['sr_in'], DATA['sr_out'],
                                 scipy_resample=scipy_resample)

        # Are we the same length?
        if len(y_out) == len(DATA['y_out']):
            # Is the data close?
            assert np.allclose(y_out, DATA['y_out'])
        elif len(y_out) == len(DATA['y_out']) - 1:
            assert (np.allclose(y_out, DATA['y_out'][:-1, 0]) or
                    np.allclose(y_out, DATA['y_out'][1:, 0]))
        elif len(y_out) == len(DATA['y_out']) + 1:
            assert (np.allclose(y_out[1:], DATA['y_out']) or
                    np.allclose(y_out[:-2], DATA['y_out']))
        else:
            assert False
        pass
Example #25
    def read_csv(self):
        waves = []
        labels = {}

        with open(self.dataset_path, "r") as file:
            csv_reader = csv.reader(file)

            for row in csv_reader:
                if row:
                    label = row[-1]
                    wav_path = f"{CURRENT_DIR}/{row[0]}" if row[0][
                        0] != '/' else f"{CURRENT_DIR}{row[0]}"

                    signal, sr = librosa.load(wav_path, sr=self.dim)
                    signal = librosa.resample(signal, sr, self.dim)

                    if signal.shape[0] != self.dim:
                        continue

                    waves.append(wav_path)
                    labels[wav_path] = label

        return waves, labels
Example #26
def resample(y, src_sr, target_sr, mode='kaiser_fast'):
    if mode == 'kaiser_best':
        warnings.warn(
            f'Using kaiser_best to resample {src_sr}=>{target_sr}. This mode is '
            'pretty slow; we recommend kaiser_fast for large-scale audio training')

    assert type(y) == np.ndarray, 'currently only numpy data are supported'
    assert mode in __resample_mode__, f'resample mode must in {__resample_mode__}'

    assert type(
        src_sr
    ) == int and src_sr > 0 and src_sr <= 48000, 'make sure type(sr) == int and sr > 0 and sr <= 48000,'
    assert type(
        target_sr
    ) == int and target_sr > 0 and target_sr <= 48000, 'make sure type(sr) == int and sr > 0 and sr <= 48000,'

    if has_resampy:
        return resampy.resample(y, src_sr, target_sr, filter=mode)

    if has_librosa:
        return librosa.resample(y, src_sr, target_sr, res_type=mode)

    assert False, 'requires librosa or resampy to do resampling, pip install resampy'
Example #27
    def on_post(self, request, response):
        # NB: uses middleware to pull out data.
        form_data = request.params['audio_data'].file
        data, samplerate = soundfile.read(form_data)

        # For debugging browser input, uncomment the following line:
        # scipy.io.wavfile.write('browser_input_audio.wav', samplerate, data)

        # NB: Convert the input stereo signal into mono.
        # In the future the frontend should be responsible for sampling details.
        mono = data[:, 0]

        # NB: We must downsample to the rate that the network is trained on.
        downsampled = librosa.resample(mono, samplerate, 16000)

        # Evaluate the model
        print(">>> Converting...")
        results = converter.convert(downsampled, conversion_direction = 'A2B')

        temp_dir = tempfile.TemporaryDirectory(prefix='tmp_ml_audio')
        temp_file = tempfile.NamedTemporaryFile(suffix='.wav')

        temp_file.write(results.read())

        out_file = temp_dir.name + '/output.ogg'

        # NB: Browsers have a great deal of trouble decoding WAV files unless they are in the
        # narrow slice of the WAV spec expected. None of the {librosa, scipy, soundfile} python
        # tools do a good job of this, so here we shell out to ffmpeg and generate OGG.
        # It's lazy and messy, but it works for now.
        # See https://github.com/librosa/librosa/issues/361 for a survey of the library landscape
        # See https://bugzilla.mozilla.org/show_bug.cgi?id=523837 for one of dozens of browser codec bugs
        _stdout = subprocess.check_output(['ffmpeg', '-i', temp_file.name, '-acodec', 'libvorbis', out_file])

        response.content_type = 'audio/ogg'
        with open(out_file, mode='rb') as f:
            response.data = f.read()
Example #28
def resample2(data, sr):
    '''Resample if required and drop to disc resampled data '''

    #     now = datetime.datetime.now()
    #     time = now.strftime("%H:%M:%S")
    #     print(time, ': Resample all recordings to target sampling rate if required')

    downsample = data.loc[data['sample_rate'] != sr]
    #print(file, ': Resampling records:', len(downsample.index))
    resampled = []
    indexes = []
    length = []
    srs = []
    samples = []

    for index, row in downsample.iterrows():
        sample = row['raw_sounds']
        sro = row['sample_rate']
        y = librosa.resample(sample, sro, sr)
        l = len(y) / sr
        resampled.append(y)
        indexes.append(index)
        length.append(l)
        samples.append(len(y))
        srs.append(sr)

    output = pd.DataFrame(list(zip(resampled, srs, indexes, length, samples)),
                          columns=[
                              'raw_sounds', 'sample_rate', 'index', 'length',
                              'sample_count'
                          ]).set_index('index')
    data.update(output)  # Join resampled recordings to raw frame
    #    data.to_pickle(p / file.name)

    #file_list = [p/(file.name) for file in file_list]

    return data
Example #29
    def process_uid(uid):
        audio_path = os.path.join(audio_files_dir, uid)
        sound_class = uid.split('-')[-1].split('.wav')[0]
        if sound_class not in sound_classes:
            raise ValueError('Sound Class: {} must be in Classes: {}'.format(
                sound_class, sound_classes))
        y, sr = librosa.load(audio_path, sr=44100)
        y_8k = librosa.resample(y, sr, 8000)

        if y_8k.shape[0] < min_wav_samples:
            return
        wav = torch.tensor(y_8k, dtype=torch.float32).unsqueeze(0)
        norm_wav = torch.tensor(normalize_wav(y_8k),
                                dtype=torch.float32).unsqueeze(0)
        output_uid_folder = os.path.join(output_dirpath, sound_class,
                                         uid.split('.wav')[0])

        data = {
            'wav': wav,
            'wav_norm': norm_wav,
        }

        # Append metadata to the written data
        for meta_label, meta_label_val in metadata_dict[uid].items():
            if meta_label in data:
                raise IndexError('Trying to override essential '
                                 'information about files by '
                                 'assigning metalabel: {} in data '
                                 'dictionary: {}'.format(meta_label, data))
            data[meta_label] = meta_label_val

        if not os.path.exists(output_uid_folder):
            os.makedirs(output_uid_folder)

        for k, v in data.items():
            file_path = os.path.join(output_uid_folder, k)
            joblib.dump(v, file_path, compress=0)
Example #30
    def load_audio(self):
        """
        Reads the wav file named in the csv, resamples audio to 8000 Hz, fixes length to 1 second
        :return: numpy array of stereo audio, DOA from file
        """
        df = pd.read_csv("{dir}/iteration_{iter}.csv".format(dir=self.directory, iter=self.iteration),
                         usecols=[1, 2])

        wav_name = df.iloc[0][0]
        filename = "{wav_name}".format(wav_name=wav_name)

        y, sr = librosa.load(filename, mono=False)

        y_8k = librosa.resample(y, sr, 8000)

        o_env = librosa.onset.onset_strength(y_8k[0], sr=8000)

        peaks = librosa.util.peak_pick(o_env, 3, 3, 3, 5, 0.25, 5)

        times = librosa.frames_to_time(np.arange(len(o_env)),
                                       sr=8000, hop_length=512)

        peak_times = times[peaks]

        time = 0
        for i in range(1, len(peak_times) + 1):
            if 3 - peak_times[-i] >= 0.75:
                time = peak_times[-i] - 0.25
                break

        sample = librosa.time_to_samples(np.array([time]), sr=8000)

        sliced_y = np.array([y_8k[0][sample[0]:], y_8k[1][sample[0]:]])

        y_out = librosa.util.fix_length(sliced_y, 8000)

        return y_out
Example #31
def import_to_mel(filepath, sample_rate):
    '''
	Import target audio file and pre process.
	input:
		filepath to target audio
		sample_rate of the current model
	output:
		a mel spectrogram of the loaded audiofile, normalised and limited to 4 seconds. 
	'''
    # mel settings
    n_mels = 128
    n_fft = 2048
    hop_length = 512  # the paper says 2048, but then the output matrix is the wrong size

    # import, convert to mono, normalise
    waveform, sr = torchaudio.load(filepath)
    waveform = waveform.numpy()
    if (sr != sample_rate):
        waveform = librosa.resample(waveform, sr, sample_rate)
    if (waveform.shape[0] > 1):
        waveform = librosa.to_mono(waveform).reshape(1, len(waveform[0]))
    waveform[0] = waveform[0] * (1.0 / np.max(waveform[0]))  # normalise
    waveform = torch.from_numpy(waveform)

    # copy to a tensor of specific size
    waveform_4s = torch.zeros(1, sample_rate * 4)
    iter_len = min(sample_rate * 4, waveform.shape[1])
    for i in range(iter_len):
        waveform_4s[0][i] = waveform[0][i]

    # generate mel
    spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate=sample_rate,
        n_mels=n_mels,
        n_fft=n_fft,
        hop_length=hop_length)(waveform_4s)
    return spectrogram
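Usage sketch (placeholder path; sample_rate should match the model the spectrogram feeds):

mel = import_to_mel('target.wav', sample_rate=22050)
print(mel.shape)  # roughly (1, 128, 173) for 4 s of audio at hop 512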
Example #32
    def audio_strip(self, orig_x, orig_sr) -> np.array:
        """
        Trim silence from both side
        """
        # Noise reduction
        if self._do_noise_reduction:
            orig_x = nr.reduce_noise(audio_clip=orig_x, noise_clip=orig_x)

        # Resampling
        x = librosa.resample(orig_x,
                             orig_sr=orig_sr,
                             target_sr=self.SAMPLE_RATE)

        # Voice activity detection
        start_sample = 0
        stop_sample = x.shape[0]
        # begin
        for i in range(x.shape[0] // self.VAD_CHUNK):
            frame_buf = self.float_to_pcm16(x[self.VAD_CHUNK *
                                              i:self.VAD_CHUNK * (i + 1)])
            if self._vad.is_speech(frame_buf, self.SAMPLE_RATE):
                start_sample = (i + 1) * self.VAD_CHUNK
                break
        # end
        for i in range(x.shape[0] // self.VAD_CHUNK - 1,
                       start_sample // self.VAD_CHUNK, -1):
            frame_buf = self.float_to_pcm16(x[self.VAD_CHUNK *
                                              i:self.VAD_CHUNK * (i + 1)])
            if self._vad.is_speech(frame_buf, self.SAMPLE_RATE):
                stop_sample = i * self.VAD_CHUNK
                break
        # less than 120 ms
        if stop_sample - start_sample <= self.MINIMUM_DIFF * (
                self.SAMPLE_RATE // 1000):
            return None

        return x[start_sample:stop_sample]
Example #33
def preprocess(raw_path, clean_path):
    raw_path = os.path.join(BASE_DIR, raw_path)
    clean_path = os.path.join(BASE_DIR, clean_path)
    if not os.path.isdir(raw_path):
        print('Path to Raw dataset is invalid!')
        return
    if not os.path.isdir(clean_path):
        os.mkdir(clean_path)
    for cls in os.scandir(raw_path):
        if cls.is_dir():
            for item in tqdm(os.scandir(cls), total=len(os.listdir(cls))):
                if item.name.endswith('.wav'):
                    audio = item.path
                    sr, signal = wavfile.read(audio)
                    signal = signal.astype(np.float32).T
                    if signal.shape[0] == 2:
                        signal = to_mono(signal)
                    elif signal.shape[0] == 1:
                        signal = to_mono(signal.reshape(-1))
                    signal = resample(signal, sr, SR)
                    sr = SR
                    signal = signal.astype(np.int16)
                    mask = purifier(signal, sr, 100)
                    signal = signal[mask]
                    if signal.shape[
                            0] < DURATION:  #if the audio after purification is shorter than 2s, pad with zeros
                        rectified_signal = np.zeros((DURATION, ),
                                                    dtype=np.int16)
                        rectified_signal[:signal.shape[0]] = signal
                        save_file(rectified_signal, sr, clean_path, cls.name,
                                  item.name, 0)
                    else:
                        trunc = signal.shape[0] % DURATION
                        for i, j in enumerate(
                                range(0, signal.shape[0] - trunc, DURATION)):
                            strip = signal[j:j + DURATION]
                            save_file(strip, sr, clean_path, cls.name,
                                      item.name, i)
Example #34
def load_wav_file(file_path,
                  sample_rate,
                  mono=True,
                  resample_type="kaiser_best"):
    """Load a wav audio file as a floating point time series. Significantly faster than
    load_sound_file."""

    actual_sample_rate, samples = wavfile.read(file_path)

    if samples.dtype != np.float32:
        assert samples.dtype == np.int16
        samples = np.true_divide(
            samples, 32768,
            dtype=np.float32)  # ends up roughly between -1 and 1

    if mono and len(samples.shape) > 1:
        if samples.shape[1] == 1:
            samples = samples[:, 0]
        else:
            samples = np.mean(samples, axis=1)

    if sample_rate is not None and actual_sample_rate != sample_rate:
        if resample_type == "auto":
            resample_type = ("kaiser_fast" if actual_sample_rate < sample_rate
                             else "kaiser_best")

        samples = librosa.resample(samples,
                                   actual_sample_rate,
                                   sample_rate,
                                   res_type=resample_type)
        warnings.warn(
            "{} had to be resampled from {} hz to {} hz. This hurt execution time."
            .format(str(file_path), actual_sample_rate, sample_rate))

    actual_sample_rate = actual_sample_rate if sample_rate is None else sample_rate

    return samples, actual_sample_rate
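Usage sketch with a placeholder path:

samples, sr = load_wav_file('input.wav', sample_rate=16000,
                            resample_type='auto')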
Example #35
 def generator(self,
               data_dir,
               tmp_dir,
               dataset,
               eos_list=None,
               start_from=0,
               how_many=0):
     del eos_list
     i = 0
     data_tuples = _collect_data(tmp_dir)
     encoders = self.feature_encoders(data_dir)
     audio_encoder = encoders["waveforms"]
     text_encoder = encoders["targets"]
     for utt_id, media_file, text_data, speaker, utt_dataset in tqdm(
             sorted(data_tuples)[start_from:]):
         if dataset != utt_dataset:
             continue
         if how_many > 0 and i == how_many:
             return
         i += 1
         try:
             wav_data = audio_encoder.encode(media_file)
         except AssertionError:
             audio, sr = librosa.load(media_file)
             data_resampled = librosa.resample(audio, sr, SAMPLE_RATE)
             with tempfile.NamedTemporaryFile(suffix='.wav') as fid:
                 librosa.output.write_wav(fid.name, data_resampled,
                                          SAMPLE_RATE)
                 wav_data = audio_encoder.encode(fid.name)
         yield {
             "waveforms": wav_data,
             "waveform_lens": [len(wav_data)],
             "targets": text_encoder.encode(text_data),
             "raw_transcript": [text_data],
             "utt_id": [utt_id],
             "spk_id": [speaker],
         }
Example #36
def _get_spectrograms(fpath, require_sr, preemphasis, n_fft, hop_length,
                      win_length, max_db, ref_db):
    '''Parse the wave file in `fpath` and
    return the normalized linear (magnitude) spectrogram.
    Args:
      fpath: A string. The full path of a sound file.
    Returns:
      mag: A 2d array of shape (T, 1+n_fft/2) and dtype of float32.
    '''
    # Loading sound file
    y, sr = librosa.load(fpath, sr=None)
    if sr != require_sr:
        y = librosa.resample(y, sr, require_sr)

    # Preemphasis
    y = np.append(y[0], y[1:] - preemphasis * y[:-1])

    # stft
    linear = librosa.stft(y=y,
                          n_fft=n_fft,
                          hop_length=hop_length,
                          win_length=win_length)

    # magnitude spectrogram
    mag = np.abs(linear)  # (1+n_fft//2, T)

    # to decibel
    mag = 20 * np.log10(np.maximum(1e-5, mag))

    # normalize
    mag = np.clip((mag - ref_db + max_db) / max_db, 1e-8, 1)

    # Transpose
    mag = mag.T.astype(np.float32)  # (T, 1+n_fft//2)

    return mag
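Call sketch; the parameter values below are illustrative, not the snippet's own configuration:

mag = _get_spectrograms('utt.wav', require_sr=22050, preemphasis=0.97,
                        n_fft=1024, hop_length=256, win_length=1024,
                        max_db=100, ref_db=20)
# mag: shape (T, 513), float32, clipped to [1e-8, 1]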
Example #37
def wav_data_to_samples(wav_data, sample_rate):
  """Read PCM-formatted WAV data and return a NumPy array of samples.

  Uses scipy to read and librosa to process WAV data. Audio will be converted to
  mono if necessary.

  Args:
    wav_data: WAV audio data to read.
    sample_rate: The number of samples per second at which the audio will be
        returned. Resampling will be performed if necessary.

  Returns:
    A numpy array of audio samples, single-channel (mono) and sampled at the
    specified rate, in float32 format.

  Raises:
    AudioIOReadException: If scipy is unable to read the WAV data.
    AudioIOException: If audio processing fails.
  """
  try:
    # Read the wav file, converting sample rate & number of channels.
    native_sr, y = scipy.io.wavfile.read(six.BytesIO(wav_data))
  except Exception as e:  # pylint: disable=broad-except
    raise AudioIOReadException(e)
  if y.dtype != np.int16:
    raise AudioIOException('WAV file not 16-bit PCM, unsupported')
  try:
    # Convert to float, mono, and the desired sample rate.
    y = int16_samples_to_float32(y)
    if y.ndim == 2 and y.shape[1] == 2:
      y = y.T
      y = librosa.to_mono(y)
    if native_sr != sample_rate:
      y = librosa.resample(y, native_sr, sample_rate)
  except Exception as e:  # pylint: disable=broad-except
    raise AudioIOException(e)
  return y
Example #38
def asr_transcript(model, tokenizer, input_file):

    if not os.path.isfile(input_file):
        raise FileNotFoundError

    # tokenizer, model = load_model()

    speech, fs = sf.read(input_file)

    if len(speech.shape) > 1:
        speech = speech[:, 0] + speech[:, 1]

    if fs != 16000:
        speech = librosa.resample(speech, fs, 16000)

    input_values = tokenizer(speech, return_tensors="pt").input_values
    input_values = input_values.to(device)
    logits = model(input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)

    transcription = tokenizer.decode(predicted_ids[0])

    return correct_sentence(transcription.lower())
Example #39
def supercompression(filename, mono, resample):
    y_file, fs = librosa.load(filename, mono=mono)
    if resample:
        targetsample = 2000
        y_file = librosa.resample(y_file, target_sr=targetsample, orig_sr=fs)
    D = librosa.stft(y_file)
    S_db = np.transpose(
        np.array(librosa.amplitude_to_db(np.abs(D), ref=np.max))).tolist()
    edited = []
    average = 0
    count = 0
    for line in S_db:
        temp = []
        for data in line:
            data += 80
            if data > 0:
                average += (data)
                count += 1
    average = average / count
    for line in S_db:
        temp = []
        for data in line:
            data += 80
            if data == 0:
                category = 0
            elif data < average * 0.5:
                category = 1
            elif data < average:
                category = 2
            elif data < average * 1.5:
                category = 3
            else:
                category = 4
            temp.append(category)
        edited.append(temp)
    return edited
Example #40
def read_audio():
    # Record the audio and run prediction on it
    audio = sd.rec(duration * fs, samplerate=fs, channels=4, dtype='float64')

    # print("Recording Audio")
    sd.wait()
    audio = np.multiply(audio, negative)
    audio = np.multiply(audio, mul)
    audio = np.multiply(audio, negative)
    # sf.write('./test audio file/False_alarm.wav', audio, fs)
    # print('recorded!')

    audio = audio.T
    y = librosa.resample(audio, fs, 16000)
    y = np.asfortranarray(y)
    fe1, fe2, fe3, fe4 = extract_feature(y, sr=16000)
    rr1, rr2 = prediction(fe1, fe2, fe3, fe4)
    result = ''
    if not rr2:
        result = rr1
    else:
        result = rr1 + ", " + rr2

    return result
Example #41
    def __getitem__(self, idx): 
        hop_length = 1024
        # open audio
        file_path = self.data[idx]
        signal, sampling_rate = open_audio(file_path)
        if len(signal.shape) > 1: 
            signal = np.mean(signal, axis = 1)
        if sampling_rate != 44100:
            signal = librosa.resample(signal, sampling_rate, 44100)
            sampling_rate = 44100
            
            
        # get 30 second chunk
        len_index_30_sec = int(30 / (1 / sampling_rate))
        # trim first and last 30 seconds 
        signal = signal[len_index_30_sec:-len_index_30_sec]
        # random start index
        start_index = np.random.randint(low = 0, high = len(signal) - len_index_30_sec)
        signal = signal[start_index:start_index + len_index_30_sec]
        # if training change pitch randomly
        if self.train:
            n_steps = np.random.randint(low = -4, high=4) 
            signal = librosa.effects.pitch_shift(signal, sampling_rate, n_steps=n_steps)
        # extract harmonic 
        data_h = librosa.effects.harmonic(signal)
        # cqt transform
        S = np.real(librosa.cqt(data_h, sr=sampling_rate, hop_length=hop_length)).astype(np.float32)

        
        d = torch.from_numpy(np.expand_dims(S, axis = 0)).type(torch.FloatTensor) 
        # normalize 
        d = F.normalize(d)
        l = torch.from_numpy(np.array(self.labels[idx])).type(torch.LongTensor)
#         print(d.shape, sampling_rate, file_path)

        return d,l
Example #42
def generate_amplitude_envelopes(signal, sr, num_bands, min_frequency, max_frequency, mode='downsample'):
    signal = preemphasize(signal, 0.97)
    proc = signal / np.sqrt(np.mean(signal ** 2)) * 0.03

    band_mins = [min_frequency * np.exp(np.log(max_frequency / min_frequency) / num_bands) ** x
                 for x in range(num_bands)]
    band_maxes = [min_frequency * np.exp(np.log(max_frequency / min_frequency) / num_bands) ** (x + 1)
                  for x in range(num_bands)]

    envs = []
    for i in range(num_bands):
        b, a = butter(2, (band_mins[i] / (sr / 2), band_maxes[i] / (sr / 2)), btype='bandpass')
        env = filtfilt(b, a, proc)
        env = abs(hilbert(env))
        if mode == 'downsample':
            env = resample(env, sr, 120)
        envs.append(env)
    envs = np.array(envs).T
    if mode == 'downsample':
        sr = 120
    output = dict()
    for i in range(envs.shape[0]):
        output[i / sr] = envs[i, :]
    return output
Example #43
def preprocess_wav(fpath_or_wav, source_sr = None):
    """
    Applies the preprocessing operations used in training the Speaker Encoder to a waveform 
    either on disk or in memory. The waveform will be resampled to match the data hyperparameters.

    :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not 
    just .wav), either the waveform as a numpy array of floats.
    :param source_sr: if passing an audio waveform, the sampling rate of the waveform before 
    preprocessing. After preprocessing, the waveform's sampling rate will match the data 
    hyperparameters. If passing a filepath, the sampling rate will be automatically detected and 
    this argument will be ignored.
    """
    # Load the wav from disk if needed
    if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
        wav, source_sr = librosa.load(fpath_or_wav, sr=None)
    else:
        wav = fpath_or_wav
    # Resample the wav if needed
    if source_sr is not None and source_sr != sampling_rate:
        wav = librosa.resample(wav, source_sr, sampling_rate)
    ## Apply the preprocessing: normalize volume and shorten long silences 
    #wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
    #wav = trim_long_silences(wav)
    return wav
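Both call styles from the docstring, sketched with placeholders; sampling_rate is the module-level hyperparameter the snippet assumes.

import numpy as np

wav_from_disk = preprocess_wav('speaker.flac')         # sr auto-detected
raw = np.random.randn(44100).astype(np.float32)        # stand-in waveform
wav_from_array = preprocess_wav(raw, source_sr=44100)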
Example #44
def pool_msr(path, dirname, mrs_dir, msf_dir, ftype = 3):
    dirs = os.listdir("%s/%s"%(path, dirname))
    for file in dirs:
        if file.endswith('.wav'):
            print('%s/%s/%s' % (path, dirname, file))
            if dirname == "":
                sample_rate, signal = scipy.io.wavfile.read("%s/%s" % (path, file))
            else:
                signal, fs = librosa.load("%s/%s/%s" % (path, dirname, file))
                signal = librosa.resample(signal, fs, 16000)
            feat1 = logfbank(signal)
            feat2 = ssc(signal)

            mrs = feat1
            write_file(mrs_dir, file, mrs, ftype)

            msf = feat2
            write_file(msf_dir, file, msf, ftype)
Example #45
def preview(filename, directory):
    print('...Previewing Accompaniment')
    FNULL = open(os.devnull, 'w')
    subprocess.call([
        'fluidsynth', '-T', 'wav', '-F', directory + "/" + filename[:-4] +
        '.raw', '-ni', directory[:-10] + 'lib/sf2/sf.sf2',
        directory + "/" + filename[:-4] + '.mid', '-g', '0.8', '-r', '22050'
    ],
                    stdout=FNULL,
                    stderr=subprocess.STDOUT)
    subprocess.call([
        'sox', '-t', 'raw', '-r', '22050', '-e', 'signed', '-b', '16', '-c',
        '1', directory + "/" + filename[:-4] + '.raw',
        directory + "/" + filename[:-4] + '_midi.wav'
    ])

    y, sr = librosa.load(directory + "/" + filename)
    z, sr2 = librosa.load(directory + "/" + filename[:-4] + '_midi.wav')
    y = librosa.resample(y, sr, sr * 2)
    mix = np.zeros(max(len(y), len(z)), dtype=float)
    mix[:len(y)] += y / 2
    mix[:len(z)] += z / 2
    mix = np.int16(mix / np.max(np.abs(mix)) * 16383)
    write(directory + "/" + filename[:-4] + '_mix.wav', 44100, mix)
Example #46
def process_wav_file(wav, orig_d_path, target_d_path, sample_duration,
                     sample_rate, channels_to_extract):
    # read the orig wav file, and resample with given sample_rate
    wav_p = os.path.join(orig_d_path, wav)
    data, sr = sf.read(wav_p, dtype=np.float32)
    if data.shape[1] < channels_to_extract:
        raise ValueError('Not enough channels')
    # reduce it to a single channel and resample
    data = np.concatenate(data[:, :channels_to_extract])
    data = librosa.resample(data, sr, sample_rate)
    # split it into the required duration
    frames_per_sample = int(sample_duration * sample_rate)
    # write the audio to the given directory
    idx = 0
    start = 0
    end = frames_per_sample
    while end < data.shape[0]:
        target_wav = wav[:-4] + '_' + str(idx) + '.wav'
        target_wav_p = os.path.join(target_d_path, target_wav)
        wavfile.write(target_wav_p, sample_rate, data[start:end])

        idx += 1
        start = end
        end += frames_per_sample
Example #47
def get_spectrogram_feature(filepath):
    if filepath.split('/')[1] == 'TIMIT':
        sig = np.fromfile(filepath, dtype=np.int16)[512:].reshape((-1, 1))
    else:
        (rate, width, sig) = wavio.readwav(filepath)
    sig = sig.ravel().astype(float) / 32767
    sig = librosa.resample(sig, 16000, 8000) * 32767
    sig = sig.astype(np.int16)

    stft = torch.stft(torch.FloatTensor(sig),
                      N_FFT,
                      hop_length=int(0.01 * SAMPLE_RATE),
                      win_length=int(0.03 * SAMPLE_RATE),
                      window=torch.hamming_window(int(0.03 * SAMPLE_RATE)),
                      center=False,
                      normalized=False,
                      onesided=True)

    stft = (stft[:, :, 0].pow(2) + stft[:, :, 1].pow(2)).pow(0.5)
    amag = stft.numpy()
    feat = torch.FloatTensor(amag)
    feat = torch.FloatTensor(feat).transpose(0, 1)

    return feat
Example #48
def get_x_y(i, fs, resample, directory, input_data_frame):
    """
    returns tuple of x, y  data pair at sample rate fs
    written to ensure that parallel computations have the correct label

    """

    file_path = "{directory}/{filename}".format(
        directory=directory, filename=input_data_frame.iloc[i][0])

    y, sr = librosa.load(file_path, mono=False)

    if resample:
        y_8k = librosa.resample(y, sr, fs)
        result_x = librosa.util.fix_length(y_8k, fs)

    else:

        y_8k = y
        result_x = librosa.util.fix_length(y_8k, sr)

    result_y = get_y(i, input_data_frame)

    return result_x, result_y
Example #49
def load_audio(audiofile):
    try:
        audio, sr = soundfile.read(audiofile)
        if audio.ndim > 1:
            audio = librosa.to_mono(audio.T)
        if sr != 16000:
            audio = librosa.resample(audio, sr, 16000)
    except Exception:
        path_audio = Path(audiofile)
        filetype = path_audio.suffix
        assert filetype in ['.mp3', '.ogg', '.flac', '.wav', '.m4a',
                            '.mp4'], filetype
        with tempfile.TemporaryDirectory() as tempdir:
            tempwav = Path(tempdir) / (path_audio.stem + '_temp' + '.flac')
            command = [
                'ffmpeg', '-i', audiofile, '-af', 'aformat=s16:16000', '-ac',
                '1', tempwav
            ]
            process = subprocess.Popen(command,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
            stdout, stderr = process.communicate()
            audio, sr = soundfile.read(tempwav)
    return audio
Example #50
def spoil_audio(sample, sr=8000, rem_coef=0.1):
    """Spoil the wav audio file by randomly removing samples.
    Inputs:
        sample (numpy.ndarray): wav file array;
        sr=8000 (int): incoming sample rate;
        rem_coef=0.1 (float): removing coefficient
    Return:
        numpy.ndarray: reprocessed sample wav file array"""

    new_sample_rate = sr + sr * rem_coef
    sample_filtered = librosa.resample(sample,
                                       orig_sr=sr,
                                       target_sr=new_sample_rate)

    # Build a pandas Series from the wav numpy.ndarray
    s1 = Series(sample_filtered)

    # Randomly choose sr indices to keep
    rand_ind = np.random.choice(s1.index.values, sr, replace=False)

    # Keep only the chosen indices, restoring temporal order
    return s1.loc[rand_ind].sort_index().values
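Usage sketch (placeholder file; the clip must be at least one second long so that sr samples can be drawn without replacement):

import librosa

y, sr = librosa.load('voice.wav', sr=8000)
y_spoiled = spoil_audio(y, sr=sr, rem_coef=0.1)  # keeps exactly sr samples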
Example #51
def _resample_and_cut(cache_path, spec, example, label):
    """Resample and cut the audio files into snippets."""
    def _new_path(i):
        # Note that `splitext` handles hidden files differently and here we just
        # assume that audio files are not hidden files.
        # os.path.splitext('/root/path/.wav') => ('/root/path/.wav', '')
        # instead of ('/root/path/', '.wav') as this code expects.
        filename_without_extension = os.path.splitext(
            os.path.basename(example))[0]
        new_filename = filename_without_extension + '_%d.wav' % i
        return os.path.join(cache_path, label, new_filename)

    sampling_rate, xs = wavfile.read(example)
    if xs.dtype != np.int16:
        raise ValueError(
            'DataLoader expects 16 bit PCM encoded WAV files, but {} has type {}'
            .format(example, xs.dtype))

    # Extract snippets.
    n_samples_per_snippet = int(spec.snippet_duration_sec * sampling_rate)
    begin_index = 0
    count = 0
    while begin_index + n_samples_per_snippet <= len(xs):
        snippet = xs[begin_index:begin_index + n_samples_per_snippet]
        if spec.target_sample_rate != sampling_rate:
            # Resample, librosa.resample only works with float32.
            # Ref: https://github.com/bmcfee/resampy/issues/44
            snippet = snippet.astype(np.float32)
            snippet = librosa.resample(
                snippet,
                orig_sr=sampling_rate,
                target_sr=spec.target_sample_rate).astype(np.int16)
        wavfile.write(_new_path(count), spec.target_sample_rate, snippet)
        begin_index += n_samples_per_snippet
        count += 1
    return count
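
# Hedged usage sketch: `spec` only needs the two attributes read above, so a
# namedtuple stands in here (assumed values; the real spec type is not shown).
# Note the cache_path/label directory must already exist for wavfile.write.
from collections import namedtuple

Spec = namedtuple('Spec', ['snippet_duration_sec', 'target_sample_rate'])
n = _resample_and_cut('cache', Spec(1.0, 16000), 'speech.wav', 'yes')
print(n, 'snippets written')
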
Example #52
def get_audio(config, mp3_path=None, array=None, array_sr=None):
    if mp3_path:
        array, sr_in = librosa.core.load(mp3_path, sr=None, mono=False)
    elif array is not None:
        array = array.astype(np.float32)
        sr_in = array_sr
    array = librosa.core.to_mono(array)
    array = librosa.resample(array, sr_in, config.sr)

    array = librosa.core.power_to_db(
        librosa.feature.melspectrogram(array, config.sr, n_mels=config.n_mels))
    array = array.astype(np.float32)
    # normalization
    mean, variance = tf.nn.moments(tf.constant(array),
                                   axes=[0, 1],
                                   keepdims=True)
    array = tf.nn.batch_normalization(array,
                                      mean,
                                      variance,
                                      offset=0,
                                      scale=1,
                                      variance_epsilon=.000001).numpy()

    return array
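
# Hedged usage sketch: `config` only needs `sr` and `n_mels` here, so a bare
# namespace stands in (assumed values, not the project's real config object).
from types import SimpleNamespace

cfg = SimpleNamespace(sr=16000, n_mels=96)
log_mel = get_audio(cfg, mp3_path='track.mp3')
print(log_mel.shape)  # (n_mels, n_frames), zero mean / unit variance overall
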
Example #53
def __onset_time(song_path):
    '''
    Loads the song at song_path, computes onsets, and returns the array of onset
    times together with the harmonic component of the source-separated audio.

    ~~~~ ARGUMENTS ~~~~
    - song_path (Path or str): path to audio file

    ~~~~ RETURNS ~~~~
    - y_harmonic (1D numpy array): numpy representation of audio, sr=22050, with
                                   percussive components of the audio removed
    - onset_times (list of float): list of onset times corresponding to audio in seconds
    '''
    # Load the song (librosa.load defaults to sr=22050, mono)
    y, sr = librosa.load(song_path)

    # resample the song if it isn't sr=22050 (for consistent sizing)
    if not sr == 22050:
        y = librosa.resample(y, sr, 22050)
        sr = 22050

    # source separation; margin can be tuned
    y_harmonic, _ = librosa.effects.hpss(y, margin=2.0)

    # Set Hop_len
    hop_len = 512

    onset_frame_backtrack = librosa.onset.onset_detect(y_harmonic,
                                                       sr=sr,
                                                       hop_length=hop_len,
                                                       backtrack=True)
    onset_times = librosa.frames_to_time(onset_frame_backtrack, sr=sr,
                                         hop_length=hop_len)

    return y_harmonic, onset_times
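
# Hedged usage sketch (assumed filename): onsets are detected on 512-sample
# hops, i.e. roughly 23 ms resolution at sr=22050.
y_harm, onsets = __onset_time('song.mp3')
print(len(onsets), 'onsets detected')
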
def resample(sample_rate=None, dir=None, csv_path=None):

    clips = []
    start_time = time.time()

    # List all clips that appear on the csv (train, eval or test)

    if csv_path != 'test':
        with open(csv_path, 'r') as csvFile:
            reader = csv.reader(csvFile)
            for row in reader:
                clips.append(row[0])

        clips.remove('fname')  # drop the CSV header row (the `with` block already closed the file)
    else:
        clips = os.listdir(dir)

    if os.path.exists(dir+'/resampled/'):
        shutil.rmtree(dir+'/resampled', ignore_errors=True)  # ignore errors with read-only files

    os.mkdir(dir+'/resampled')

    for clip in clips:
        # Audio clip is read
        data, sr = sf.read(dir+'/'+clip)
        data = data.T
        # Audio data is resampled to the desired sample_rate
        if sr != sample_rate:
            data_resampled = librosa.resample(data, sr, sample_rate)
        else:
            data_resampled = data
        # Processed data is saved into a directory under the clip directory;
        # transpose back to (frames, channels) for soundfile
        sf.write(dir+'/resampled/'+clip, data_resampled.T, sample_rate, subtype='PCM_16')

    print('Audio data has been resampled successfully')
    elapsed_time = time.time() - start_time
    print('Elapsed time ' + str(elapsed_time) + ' seconds')
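
# Hedged usage sketch (assumed paths): resample every clip listed in train.csv
# to 16 kHz, writing PCM_16 copies under audio_train/resampled/.
resample(sample_rate=16000, dir='audio_train', csv_path='train.csv')
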
Example #55
def k_filter(data, fs):
    """
    data: audio data in samples across time
    fs: sample rate of data

    TEMPORARY FUNCTION UNTIL THE LOUDNESS FUNCTION IS FIXED FOR ACTIVITY

    return: K-filtered data AND the new 48 kHz fs
    """
    # Convert to 48 kHz, since the ITU-R BS.1770 K-filter coefficients
    # below are specified at that rate
    if fs != 48000:
        data = librosa.resample(data, fs, 48000)
        fs = 48000

    # Stage 1: high-shelf boost of +4 dB around 1681 Hz
    a1 = [1.0, -1.69065929318241, 0.73248077421585]
    b1 = [1.53512485958697, -2.69169618940638, 1.19839281085285]

    # Stage 2: high-pass roll-off at 38 Hz
    a2 = [1.0, -1.99004745483398, 0.99007225036621]
    b2 = [1.0, -2.0, 1.0]

    # Filter in succession
    return lfilter(b2, a2, lfilter(b1, a1, data)), fs
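
# Hedged follow-up sketch: with the K-filtered signal, an ungated mono loudness
# estimate follows the ITU-R BS.1770 formula (the full standard adds block
# gating, omitted here for brevity).
def loudness_lkfs(data, fs):
    z, _ = k_filter(data, fs)
    return -0.691 + 10 * np.log10(np.mean(z ** 2))
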
Example #56
def analyze_sound(event, context):
    file_data = event

    file_name = file_data['name']
    bucket_name = file_data['bucket']

    blob = storage_client.bucket(bucket_name).get_blob(file_name)

    blob_name = blob.name
    _, temp_local_filename = tempfile.mkstemp()

    # Download file from bucket.
    blob.download_to_filename(temp_local_filename)

    # librosa.load already returns mono audio at sr=22050 by default, so the
    # two conversions below are effectively no-ops kept as a safeguard
    y, sr = librosa.load(temp_local_filename)

    y_mono = librosa.to_mono(y)
    y_mono_22050 = librosa.resample(y_mono, sr, 22050)
    mfccs = librosa.feature.mfcc(y=y_mono_22050, sr=22050, n_mfcc=12)

    mean_mfccs = [np.mean(mfcc) for mfcc in mfccs]

    print(f'Audio file name: {file_name}')
    print(f'Audio file is {len(y)} samples long.')

    uid = uuid.uuid4()
    doc_ref = db.collection(u'sounds').document(str(uid))
    doc_ref.set({
        u'uid': str(uid),
        u'blob_name': blob_name,
        u'file_name': file_name,
        u'length': len(y),
        u'mean_mfccs': mean_mfccs
    })
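
# Hedged local sketch: the same 12 mean MFCCs can be reproduced outside the
# cloud function (assumed local file; no storage or Firestore clients needed).
y, sr = librosa.load('sample.wav')
mfccs = librosa.feature.mfcc(y=librosa.to_mono(y), sr=sr, n_mfcc=12)
print([float(np.mean(m)) for m in mfccs])
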
Example #57
filters_init = filters.copy()

use_gpu = True
filters_gpu = tf.placeholder(tf.float32, shape=filters.shape, name="Filters")
Rn_gpu = tf.placeholder(tf.float32, shape=(batch_size, filter_size), name="Rn")
prods_gpu = math_ops.matmul(filters_gpu, tf.transpose(Rn_gpu))
gpu_session = tf.Session()

assert not use_gpu or jobs == 1, "Can't use gpu with multiple processes"

data_source = [data_source[1]]

for source_id, source_filename in enumerate(data_source):
    data, source_sr = lr.load(source_filename)
    data = data[:500000]
    data = lr.resample(data, source_sr, target_sr, scale=True)

    data_test = lr.resample(data, target_sr, source_sr, scale=True)
    lr.output.write_wav("/home/alexeyche/Music/ml/test.wav", data_test, source_sr)

    data_denom = np.sqrt(np.sum(data ** 2))
    data = data/data_denom
    data = np.concatenate([data, np.zeros(filter_size)])

    print "Source with id {} and file {}".format(source_id, source_filename)
    processes = []
    records = []

    def sync(wait_all=True):
        global dfilters, records, processes, filters
        while len(processes)>0:
Example #58
    def __init__(self,
                 db,                # data source
                 name = '',         # optional name

                 selectors = dict(),

                 partitioner = None,

                 meta_sources = [],     # optional sources other than 'features' and 'targets' from metadata

                 channel_filter = NoChannelFilter(),   # optional channel filter, default: keep all
                 channel_names = None,  # optional channel names (for metadata)

                 label_attribute = 'label', # metadata attribute to be used as label
                 label_map = None,      # optional conversion of labels
                 use_targets = True,    # use targets if provided, otherwise labels are used

                 remove_dc_offset = False,  # optional subtraction of channel mean, usually done already earlier
                 resample = None,       # optional down-sampling
                 normalize = True,      # normalize to max=1

                 # optional sub-sequences selection
                 start_sample = 0,
                 stop_sample  = None,   # optional for selection of sub-sequences
                 zero_padding = True,   # if True (default), trials that are too short will be padded with zeros;
                                        # otherwise they will be rejected.

                 # optional signal filter to be applied before splitting the signal
                 signal_filter = None,

                 trial_processors = [],     # optional processing of the trials
                 target_processor = None,   # optional processing of the targets, e.g. zero-padding
                 transformers = [],         # optional transformations of the dataset

                 layout='tf',       # (0,1)-axes layout tf=time x features or ft=features x time

                 debug=False,
                 ):
        '''
        Constructor
        '''

        # save params
        self.params = locals().copy()
        del self.params['self']
        # print self.params

        self.name = name
        self.debug = debug

        metadb = DatasetMetaDB(db.metadata, selectors.keys())

        if partitioner is not None:
            pass # FIXME

        selected_trial_ids = metadb.select(selectors)
        log.info('selectors: {}'.format(selectors))
        log.info('selected trials: {}'.format(selected_trial_ids))

        if normalize:
            log.info('Data will be normalized to max amplitude 1 per channel (normalize=True).')

        trials = list()
        labels = list()
        targets = list()
        meta = list()

        if stop_sample == 'auto-min':
            stop_sample = np.min([db.data[trial_i].shape[-1] for trial_i in selected_trial_ids])
            log.info('Using minimum trial length. stop_sample={}'.format(stop_sample))
        elif stop_sample ==  'auto-max':
            stop_sample = np.max([db.data[trial_i].shape[-1] for trial_i in selected_trial_ids])
            log.info('Using maximum trial length. stop_sample={}'.format(stop_sample))

        for trial_i in selected_trial_ids:

            trial_meta = db.metadata[trial_i]

            if use_targets:
                # check the data source (the local `targets` list is never None)
                if db.targets is None:
                    target = None
                else:
                    target = db.targets[trial_i]
                    assert not np.isnan(np.sum(target))

                if target_processor is not None:
                    target = target_processor.process(target, trial_meta)

                    assert not np.isnan(np.sum(target))
            else:
                # get and process label
                label = db.metadata[trial_i][label_attribute]
                if label_map is not None:
                    label = label_map[label]

            processed_trial = []

            trial = db.data[trial_i]

            if np.isnan(np.sum(trial)):
                print trial_i, trial

            assert not np.isnan(np.sum(trial))

            rejected = False # flag for trial rejection

            trial = np.atleast_2d(trial)

            # process 1 channel at a time
            for channel in xrange(trial.shape[0]):
                # filter channels
                if not channel_filter.keep_channel(channel):
                    continue

                samples = trial[channel, :]

                # subtract channel mean
                if remove_dc_offset:
                    samples -= samples.mean()

                # down-sample if requested
                if resample is not None and resample[0] != resample[1]:
                    # note: res_type='sinc_best' is deprecated in newer librosa releases
                    samples = librosa.resample(samples, resample[0], resample[1], res_type='sinc_best')

                # apply optional signal filter after down-sampling -> requires lower order
                if signal_filter is not None:
                    samples = signal_filter.process(samples)

                # get sub-sequence in resampled space
                # log.info('using samples {}..{} of {}'.format(start_sample,stop_sample, samples.shape))

                if stop_sample is not None and stop_sample > len(samples):
                    if zero_padding:
                        tmp = np.zeros(stop_sample)
                        tmp[:len(samples)] = samples
                        samples = tmp
                    else:
                        rejected = True
                        break # stop processing this trial

                s = samples[start_sample:stop_sample]

                # TODO optional channel processing

                # normalize to max amplitude 1
                if normalize:
                    s = librosa.util.normalize(s)

                # add 2nd data dimension
                s = s.reshape(s.shape[0], 1)
                # print s.shape

                s = np.asfarray(s, dtype=theano.config.floatX)

                processed_trial.append(s)

                ### end of channel iteration ###

            if rejected:
                continue    # next trial

            processed_trial = np.asfarray([processed_trial], dtype=theano.config.floatX)

            # processed_trial = processed_trial.reshape((1, processed_trial.shape))
            processed_trial = np.rollaxis(processed_trial, 1, 4)

            # optional (external) trial processing, e.g. windowing
            # trials will be in b01c format with tf layout for 01-axes
            for trial_processor in trial_processors:
                processed_trial = trial_processor.process(processed_trial, trial_meta)

            trials.append(processed_trial)

            for k in range(len(processed_trial)):
                meta.append(trial_meta)

                if use_targets:
                    targets.append(target)
                else:
                    labels.append(label)

        ### end of datafile iteration ###

        # turn into numpy arrays
        self.trials = np.vstack(trials)

        assert not np.isnan(np.sum(self.trials))

        # prepare targets / labels
        if use_targets:
            self.targets = np.vstack(targets)
            assert not np.isnan(np.sum(self.targets))
        else:
            labels = np.hstack(labels)
            if label_map is None:
                one_hot_formatter = OneHotFormatter(max(labels) + 1)
            else:
                one_hot_formatter = OneHotFormatter(max(label_map.values()) + 1)
            one_hot_y = one_hot_formatter.format(labels)
            self.targets = one_hot_y

        self.metadata = meta

        if layout == 'ft': # swap axes to (batch, feature, time, channels)
            self.trials = self.trials.swapaxes(1, 2)

        # transform after finalizing the data structure
        for transformer in transformers:
            self.trials, self.targets = transformer.process(self.trials, self.targets)

        self.trials = np.asarray(self.trials, dtype=theano.config.floatX)

        log.debug('final dataset shape: {} (b,0,1,c)'.format(self.trials.shape))
        # super(EEGEpochsDataset, self).__init__(topo_view=self.trials, y=self.targets, axes=['b', 0, 1, 'c'])

        self.X = self.trials.reshape(self.trials.shape[0], np.prod(self.trials.shape[1:]))
        self.y = self.targets
        log.info('generated dataset "{}" with shape X={}={} y={} targets={} '.
                 format(self.name, self.X.shape, self.trials.shape, self.y.shape, self.targets.shape))


        # determine data specs
        features_space = Conv2DSpace(
            shape=[self.trials.shape[1], self.trials.shape[2]],
            num_channels=self.trials.shape[3]
        )
        features_source = 'features'

        targets_space = VectorSpace(dim=self.targets.shape[-1])
        targets_source = 'targets'

        space_components = [features_space, targets_space]
        source_components = [features_source, targets_source]

        # additional support for meta information
        self.meta_maps = dict()
        for meta_source in meta_sources:
            self.meta_maps[meta_source] = sorted(list(set([m[meta_source] for m in self.metadata])))
            space_components.append(VectorSpace(dim=1))
            source_components.append(meta_source)
            log.info('Generated meta-source "{}" with value map: {}'
                     .format(meta_source, self.meta_maps[meta_source]))

        space = CompositeSpace(space_components)
        source = tuple(source_components)
        self.data_specs = (space, source)
        log.debug('data specs: {}'.format(self.data_specs))
Example #59
# tvars = tf.trainable_variables()
# grads_raw = tf.gradients(cost, tvars)
# grads, _ = tf.clip_by_global_norm(grads_raw, 5.0)

# apply_grads = optimizer.apply_gradients(zip(grads, tvars))

##################################
# DATA

fname = env.dataset([f for f in os.listdir(env.dataset()) if f.endswith(".wav")][0])
df = env.run("test_data.pkl")

if not os.path.exists(df):
    song_data_raw, source_sr = lr.load(fname)
    print "Got sampling rate {}, resampling to {} ...".format(source_sr, target_sr)
    song_data = lr.resample(song_data_raw, source_sr, target_sr, scale=True)
    song_data = song_data[:30000]

    np.save(open(df, "wb"), song_data)
else:
    song_data = np.load(open(df, "rb"))

inputs_v, data_denom = norm(song_data)


##################################
# EVALUATION

sess = tf.Session()
Example #60
    def __init__(self, 
                 path,
                 name = '',         # optional name
                 
                 # selectors
                 subjects='all',        # optional selector (list) or 'all'
                 trial_types='all',     # optional selector (list) or 'all'
                 trial_numbers='all',   # optional selector (list) or 'all'
                 conditions='all',      # optional selector (list) or 'all'     
                 
                 partitioner = None,            
                 
                 channel_filter = NoChannelFilter(),   # optional channel filter, default: keep all
                 channel_names = None,  # optional channel names (for metadata)
                 
                 label_map = None,      # optional conversion of labels

                 remove_dc_offset = False,  # optional subtraction of channel mean, usually done already earlier
                 resample = None,       # optional down-sampling

                 # optional sub-sequences selection
                 start_sample = 0,
                 stop_sample  = None,   # optional for selection of sub-sequences

                 # optional signal filter to be applied before splitting the signal
                 signal_filter = None,

                 # windowing parameters
                 frame_size = -1,
                 hop_size   = -1,       # values > 0 will lead to windowing
                 hop_fraction = None,   # alternative to specifying absolute hop_size
                 
                 # optional spectrum parameters, n_fft = 0 keeps raw data
                 n_fft = 0,
                 n_freq_bins = None,
                 spectrum_log_amplitude = False,
                 spectrum_normalization_mode = None,
                 include_phase = False,

                 flatten_channels=False,
                 layout='tf',       # (0,1)-axes layout tf=time x features or ft=features x time

                 save_matrix_path = None,
                 keep_metadata = False,
                 ):
        '''
        Constructor
        '''

        # save params
        self.params = locals().copy()
        del self.params['self']
        # print self.params
        
        # TODO: get the whole filtering into an extra class
        
        datafiles_metadata, metadb = load_datafiles_metadata(path)
        
#         print datafiles_metadata
        
        def apply_filters(filters, node):            
            if isinstance(node, dict):            
                filtered = []
                keepkeys = filters[0]
                for key, value in node.items():
                    if keepkeys == 'all' or key in keepkeys:
                        filtered.extend(apply_filters(filters[1:], value))
                return filtered
            else:
                return node # [node]
            
        
        # keep only files that match the metadata filters
        self.datafiles = apply_filters([subjects,trial_types,trial_numbers,conditions], datafiles_metadata)
        
        # copy metadata for retained files
        self.metadb = {}
        for datafile in self.datafiles:
            self.metadb[datafile] = metadb[datafile]
        
#         print self.datafiles
#         print self.metadb
        
        self.name = name

        if partitioner is not None:
            self.datafiles = partitioner.get_partition(self.name, self.metadb)
        
        self.include_phase = include_phase
        self.spectrum_normalization_mode = spectrum_normalization_mode
        self.spectrum_log_amplitude = spectrum_log_amplitude

        self.sequence_partitions = [] # used to keep track of original sequences
        
        # metadata: [subject, trial_no, stimulus, channel, start, ]
        self.metadata = []
        
        sequences = []
        labels = []
        n_sequences = 0

        if frame_size > 0 and hop_size == -1 and hop_fraction is not None:
            # float() guards against Python 2 integer division; frame() needs an int hop
            hop_size = int(np.ceil(float(frame_size) / hop_fraction))

        for i in xrange(len(self.datafiles)):        
            with log_timing(log, 'loading data from {}'.format(self.datafiles[i])): 

                # save start of next sequence
                self.sequence_partitions.append(n_sequences)

                data, metadata = load(os.path.join(path, self.datafiles[i]))

                label = metadata['label']
                if label_map is not None:
                    label = label_map[label]

                multi_channel_frames = []

                # process 1 channel at a time
                for channel in xrange(data.shape[1]):
                    # filter channels
                    if not channel_filter.keep_channel(channel):
                        continue

                    samples = data[:, channel]

                    # subtract channel mean
                    if remove_dc_offset:
                        samples -= samples.mean()

                    # down-sample if requested
                    if resample is not None and resample[0] != resample[1]:
                        samples = librosa.resample(samples, resample[0], resample[1])

                    # apply optional signal filter after down-sampling -> requires lower order
                    if signal_filter is not None:
                        samples = signal_filter.process(samples)

                    # get sub-sequence in resampled space
                    # log.info('using samples {}..{} of {}'.format(start_sample,stop_sample, samples.shape))
                    samples = samples[start_sample:stop_sample]

                    if n_fft is not None and n_fft > 0:
                        ### frequency spectrum branch ###

                        # transform to spectogram
                        hop_length = n_fft // 4
            
                        '''
                        from http://theremin.ucsd.edu/~bmcfee/librosadoc/librosa.html
                        >>> # Get a power spectrogram from a waveform y
                        >>> S       = np.abs(librosa.stft(y)) ** 2
                        >>> log_S   = librosa.logamplitude(S)
                        '''                                     
                             
                        S = librosa.core.stft(samples, n_fft=n_fft, hop_length=hop_length)
                        # mag = np.abs(S)        # magnitude spectrum
                        mag = np.abs(S)**2       # power spectrum
                        
                        # include phase information if requested
                        if self.include_phase:
                            # phase = np.unwrap(np.angle(S))
                            phase = np.angle(S)

                        # Optionally: cut off high bands
                        if n_freq_bins is not None:
                            mag = mag[0:n_freq_bins, :]
                            if self.include_phase:
                                phase = phase[0:n_freq_bins, :]
                                                  
                        if self.spectrum_log_amplitude:      
                            mag = librosa.logamplitude(mag)
                            
                        s = mag # for normalization
                                                    
                        '''
                        NOTE on normalization:
                        It depends on the structure of the neural network and (even more)
                        on the properties of the data. There is no best normalization algorithm,
                        because if there were one, it would be used everywhere by default...
                    
                        In theory, there is no requirement for the data to be normalized at all. 
                        This is a purely practical thing because in practice convergence could 
                        take forever if your input is spread out too much. The simplest would be 
                        to just normalize it by scaling your data to (-1,1) (or (0,1) depending 
                        on activation function), and in most cases it does work. If your 
                        algorithm converges well, then this is your answer. If not, there are 
                        too many possible problems and methods to outline here without knowing 
                        the actual data.
                        '''
    
                        ## normalize to mean 0, std 1
                        if self.spectrum_normalization_mode == 'mean0_std1':
                            # s = preprocessing.scale(s, axis=0);
                            mean = np.mean(s)
                            std = np.std(s)
                            s = (s - mean) / std
                        
                        ## normalize by linear transform to [0,1]
                        elif self.spectrum_normalization_mode == 'linear_0_1':
                            s = s / np.max(s)
                        
                        ## normalize by linear transform to [-1,1]
                        elif self.spectrum_normalization_mode == 'linear_-1_1':
                            s = -1 + 2 * (s - np.min(s)) / (np.max(s) - np.min(s))
                            
                        elif self.spectrum_normalization_mode is not None:
                            raise ValueError(
                                'unsupported spectrum normalization mode {}'.format(
                                    self.spectrum_normalization_mode)
                             )
                        
                        #print s.mean(axis=0)
                        #print s.std(axis=0)
    
                        # include phase information if requested
                        if self.include_phase:
                            # normalize phase to [-1, 1]
                            phase = phase / np.pi
                            s = np.vstack([s, phase])
                        
                        # transpose to fit pylearn2 layout
                        s = np.transpose(s)
                        # print s.shape

                        ### end of frequency spectrum branch ###
                    else:
                        ### raw waveform branch ###

                        # normalize to max amplitude 1
                        s = librosa.util.normalize(samples)

                        # add 2nd data dimension
                        s = s.reshape(s.shape[0], 1)
                        # print s.shape

                        ### end of raw waveform branch ###

                    s = np.asfarray(s, dtype='float32')

                    if frame_size > 0 and hop_size > 0:
                        s = s.copy() # FIXME: THIS IS NECESSARY IN MultiChannelEEGSequencesDataset - OTHERWISE, THE FOLLOWING OP DOES NOT WORK!!!!
                        frames = frame(s, frame_length=frame_size, hop_length=hop_size)
                    else:
                        frames = s
                    del s
                    # print frames.shape

                    if flatten_channels:
                        # add artificial channel dimension
                        frames = frames.reshape((frames.shape[0], frames.shape[1], frames.shape[2], 1))
                        # print frames.shape

                        sequences.append(frames)

                        # increment counter by new number of frames
                        n_sequences += frames.shape[0]

                        if keep_metadata:
                            # determine channel name
                            channel_name = None
                            if channel_names is not None:
                                channel_name = channel_names[channel]
                            elif 'channels' in metadata:
                                channel_name = metadata['channels'][channel]

                            self.metadata.append({
                                        'subject'   : metadata['subject'],            # subject
                                        'trial_type': metadata['trial_type'],         # trial_type
                                        'trial_no'  : metadata['trial_no'],           # trial_no
                                        'condition' : metadata['condition'],          # condition
                                        'channel'   : channel,                        # channel
                                        'channel_name' : channel_name,
                                        'start'     : self.sequence_partitions[-1],   # start
                                        'stop'      : n_sequences                     # stop
                                    })

                        for _ in xrange(frames.shape[0]):
                            labels.append(label)
                    else:
                        multi_channel_frames.append(frames)

                    ### end of channel iteration ###


                if not flatten_channels:
                    # turn list into array
                    multi_channel_frames = np.asfarray(multi_channel_frames, dtype='float32')
                    # [channels x frames x time x freq] -> cb01
                    # [channels x frames x time x 1] -> cb0.

                    # move channel dimension to end
                    multi_channel_frames = np.rollaxis(multi_channel_frames, 0, 4)
                    # print multi_channel_frames.shape
                    # log.debug(multi_channel_frames.shape)

                    sequences.append(multi_channel_frames)

                    # increment counter by new number of frames
                    n_sequences += multi_channel_frames.shape[0]

                    if keep_metadata:
                        self.metadata.append({
                                    'subject'   : metadata['subject'],            # subject
                                    'trial_type': metadata['trial_type'],         # trial_type
                                    'trial_no'  : metadata['trial_no'],           # trial_no
                                    'condition' : metadata['condition'],          # condition
                                    'channel'   : 'all',                          # channel
                                    'start'     : self.sequence_partitions[-1],   # start
                                    'stop'      : n_sequences                     # stop
                                })

                    for _ in xrange(multi_channel_frames.shape[0]):
                        labels.append(label)

                ### end of datafile iteration ###
      
        # turn into numpy arrays
        sequences = np.vstack(sequences)
        # print sequences.shape;
        
        labels = np.hstack(labels)
        
        # one_hot_y = one_hot(labels)
        one_hot_formatter = OneHotFormatter(labels.max() + 1) # FIXME!
        one_hot_y = one_hot_formatter.format(labels)
                
        self.labels = labels

        if layout == 'ft': # swap axes to (batch, feature, time, channels)
            sequences = sequences.swapaxes(1, 2)
            
        log.debug('final dataset shape: {} (b,0,1,c)'.format(sequences.shape))
        super(MultiChannelEEGDataset, self).__init__(topo_view=sequences, y=one_hot_y, axes=['b', 0, 1, 'c'])
        
        log.info('generated dataset "{}" with shape X={}={} y={} labels={} '.
                 format(self.name, self.X.shape, sequences.shape, self.y.shape, self.labels.shape))

        if save_matrix_path is not None:
            matrix = DenseDesignMatrix(topo_view=sequences, y=one_hot_y, axes=['b', 0, 1, 'c'])
            with log_timing(log, 'saving DenseDesignMatrix to {}'.format(save_matrix_path)):
                serial.save(save_matrix_path, matrix)