Example #1
def get_tempogram_features(feats):
    # Median-filtered tempogram: subtract a per-lag median to suppress
    # sustained energy, keep only the positive residue, then normalize.
    tempogram = feats['tempogram']
    tgf = util.normalize(
        np.maximum(
            0.0,
            tempogram - scipy.ndimage.median_filter(tempogram, size=(9, 1))))
    tr = util.normalize(tempogram_ratio(tgf))
    return (np.median(tgf, axis=1),
            np.median(tr, axis=1),
            tempo(tgf, aggregate=np.median))
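
A hedged usage sketch: it builds a plausible `feats` dict from a synthetic click track via librosa's public API. The module-level names `util`, `tempogram_ratio`, and `tempo` come from the snippet's own imports, which are not shown here; everything below is illustrative, not from the original source.

import numpy as np
import librosa

# 120 BPM click track, so no audio file is needed
sr = 22050
y = librosa.clicks(times=np.arange(0, 10, 0.5), sr=sr)
onset_env = librosa.onset.onset_strength(y=y, sr=sr)
feats = {'tempogram': librosa.feature.tempogram(onset_envelope=onset_env, sr=sr)}

# assumes util/tempogram_ratio/tempo resolve to the snippet's imports
tgf_med, tr_med, est_tempo = get_tempogram_features(feats)
print(est_tempo)  # should land near 120 BPM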
Example #2
def tonnetz(y=None, sr=22050, chroma=None):

    if y is None and chroma is None:
        raise ParameterError(
            'Either the audio samples or the chromagram must be '
            'passed as an argument.')

    if chroma is None:
        chroma = chroma_cqt(y=y, sr=sr)

    # Generate Transformation matrix
    dim_map = np.linspace(0, 12, num=chroma.shape[0], endpoint=False)

    scale = np.asarray([7. / 6, 7. / 6, 3. / 2, 3. / 2, 2. / 3, 2. / 3])

    V = np.multiply.outer(scale, dim_map)

    # Even rows compute sin()
    V[::2] -= 0.5

    R = np.array([
        1,
        1,  # Fifths
        1,
        1,  # Minor
        0.5,
        0.5
    ])  # Major

    phi = R[:, np.newaxis] * np.cos(np.pi * V)

    # Do the transform to tonnetz
    return phi.dot(util.normalize(chroma, norm=1, axis=0))
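
A quick shape check through librosa's public `tonnetz`, which implements the same transform; the synthetic tone avoids loading audio. Illustrative only, not from the original source.

import numpy as np
import librosa

sr = 22050
t = np.arange(sr * 2) / sr
y = 0.5 * np.sin(2 * np.pi * 220.0 * t)  # 220 Hz tone (pitch class A)

tn = librosa.feature.tonnetz(y=y, sr=sr)
print(tn.shape)  # (6, n_frames): fifths x/y, minor thirds x/y, major thirds x/y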
Example #3
def inference(a, with_postnet=False):
    generator = Generator(hp.model.in_channels).to(device)

    state_dict_g = load_checkpoint(a.checkpoint_file, device)
    generator.load_state_dict(state_dict_g['generator'])

    filelist = os.listdir(a.input_wavs_dir)

    os.makedirs(a.output_dir, exist_ok=True)

    generator.eval()
    #generator.remove_weight_norm()
    with torch.no_grad():
        for i, filename in enumerate(filelist):
            wav, sr = load_wav(os.path.join(a.input_wavs_dir, filename))
            wav = wav / MAX_WAV_VALUE
            wav = normalize(wav) * 0.95
            wav = torch.FloatTensor(wav)
            wav = wav.reshape((1, 1, wav.shape[0],)).to(device)
            before_y_g_hat, y_g_hat = generator(wav, with_postnet)
            audio = before_y_g_hat.reshape((before_y_g_hat.shape[2],))
            audio = audio * MAX_WAV_VALUE
            audio = audio.cpu().numpy().astype('int16')
            output_file = os.path.join(
                a.output_dir,
                os.path.splitext(filename)[0] + '_generated.wav'
            )
            write(output_file, hp.audio.sampling_rate, audio)
            print(output_file)
Example #4
    def extract(audio_fn):
        # Read and Resample the audio
        try:
            data, _ = librosa.core.load(audio_fn, sr=sampling_rate)
            data = normalize(data)
        except Exception as e:
            logging.exception(e)
            return None

        # ensure length
        if len(data) > duration:
            data = data[:duration]
        elif len(data) < duration:
            # pad at the end so the result is exactly `duration` samples;
            # the original one-element pad width (n,) pads both sides,
            # which overshoots the target length
            data = np.pad(data, (0, duration - len(data)),
                          mode='constant',
                          constant_values=0)

        # spectrogram
        f, t, Sxx = sp.signal.spectrogram(data,
                                          fs=sampling_rate,
                                          window=window,
                                          nperseg=frame_length,
                                          noverlap=overlap_length,
                                          nfft=nfft)

        if mel_scale:
            # spectrogram -> log mel fb
            f_to_mel = filters.mel(sr=sampling_rate,
                                   n_fft=nfft,
                                   n_mels=n_freq_bins)
            Sxx = f_to_mel.dot(Sxx)

        Sxx = np.expand_dims(np.log(1e-8 + Sxx), axis=-1)

        return Sxx
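
`extract` closes over several module-level settings the excerpt doesn't show. A sketch of plausible values that makes the closure runnable; every name, value, and the file path below is an assumption, not from the source.

import logging
import numpy as np
import scipy as sp
import scipy.signal
import librosa
from librosa import filters
from librosa.util import normalize

# assumed module-level configuration for the closure above
sampling_rate = 22050
duration = 2 * sampling_rate      # target length in samples
window = 'hann'
frame_length = 1024
overlap_length = 512
nfft = 1024
mel_scale = True
n_freq_bins = 64

Sxx = extract('clip.wav')         # hypothetical path
if Sxx is not None:
    print(Sxx.shape)              # (n_freq_bins, n_frames, 1) with mel_scale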
Example #5
def spectral_bandwidth(y=None, sr=22050, S=None, n_fft=2048, hop_length=512,
                       freq=None, centroid=None, norm=True, p=2):

    S, n_fft = _spectrogram(y=y, S=S, n_fft=n_fft, hop_length=hop_length)

    if not np.isrealobj(S):
        raise ParameterError('Spectral bandwidth is only defined '
                             'with real-valued input')
    elif np.any(S < 0):
        raise ParameterError('Spectral bandwidth is only defined '
                             'with non-negative energies')

    if centroid is None:
        centroid = spectral_centroid(y=y, sr=sr, S=S,
                                     n_fft=n_fft,
                                     hop_length=hop_length,
                                     freq=freq)

    # Compute the center frequencies of each bin
    if freq is None:
        freq = fft_frequencies(sr=sr, n_fft=n_fft)

    if freq.ndim == 1:
        deviation = np.abs(np.subtract.outer(freq, centroid[0]))
    else:
        deviation = np.abs(freq - centroid[0])

    # Column-normalize S
    if norm:
        S = util.normalize(S, norm=1, axis=0)

    return np.sum(S * deviation**p, axis=0, keepdims=True)**(1./p)
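
With `norm=True` and `p=2`, the returned value reduces to the energy-weighted standard deviation of frequency around the centroid. A minimal NumPy check of just that reduction, for illustration:

import numpy as np

freq = np.linspace(0, 11025, 1025)   # bin center frequencies, one frame
S = np.random.rand(1025)             # non-negative spectral magnitudes
w = S / S.sum()                      # the norm=1 column normalization
centroid = np.sum(freq * w)
bandwidth = np.sqrt(np.sum(w * (freq - centroid) ** 2))
print(centroid, bandwidth)           # weighted mean and std of frequency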
Example #6
 def LPC(self):
     fft = self.fft(self.windowed_x)
     self.Phase(fft)
     self.Spectrum()
     invert = self.ISTFT(self.magnitude_spectrum)
     invert = np.array(invert).T
     self.correlation = util.normalize(invert, norm=np.inf)
     subslice = [slice(None)] * np.array(self.correlation).ndim
     subslice[0] = slice(self.windowed_x.shape[0])
     self.correlation = np.array(self.correlation)[subslice]
     if not np.iscomplexobj(self.correlation):
         self.correlation = self.correlation.real  # autocorrelation of the frame
     self.correlation.flags.writeable = False
     E = np.copy(self.correlation[0])
     corr = np.copy(self.correlation)
     p = 14  # prediction order
     reflection = np.zeros(p)
     lpc = np.zeros(p + 1)
     lpc[0] = 1
     temp = np.zeros(p)
     for i in range(1, p + 1):
         k = float(self.correlation[i])
         for j in range(i):
             k += self.correlation[i - j] * lpc[j]
         k /= E
         reflection[i - 1] = k
         lpc[i] = -k
         for j in range(1, i):
             temp[j] = lpc[j] - k * lpc[i - j]
         for j in range(1, i):
             lpc[j] = temp[j]
         E *= (1 - pow(k, 2))
     return lpc[1:]
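
The inner loop is the classic Levinson-Durbin recursion. A self-contained sketch (NumPy only, sign convention x[n] ≈ -Σ a[k]·x[n-k]) that recovers the coefficients of a known AR(2) process; this is a re-derivation for illustration, not the class method above.

import numpy as np

def levinson_durbin(r, order):
    # Solve the Toeplitz normal equations given autocorrelation r[0..order].
    a = np.zeros(order + 1)
    a[0] = 1.0
    err = r[0]
    for i in range(1, order + 1):
        acc = r[i] + np.dot(a[1:i], r[i - 1:0:-1])
        k = -acc / err                   # reflection coefficient
        a[1:i] = a[1:i] + k * a[i - 1:0:-1]
        a[i] = k
        err *= (1.0 - k * k)             # prediction error shrinks each order
    return a[1:], err

# Recover a known AR(2): x[n] = 0.75*x[n-1] - 0.5*x[n-2] + e[n]
rng = np.random.default_rng(0)
N = 100000
x = np.zeros(N)
e = rng.standard_normal(N)
for n in range(2, N):
    x[n] = 0.75 * x[n - 1] - 0.5 * x[n - 2] + e[n]
r = np.array([np.dot(x[:N - k], x[k:]) for k in range(3)]) / N
a, _ = levinson_durbin(r, 2)
print(a)  # ≈ [-0.75, 0.5]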
Example #7
def fbank(path, fft_span, hop_span, n_mels, fmin, fmax, affichage=False):
    """
    :param path: location of the file
    :param fft_span: size of the Fourier-transform window, in seconds
    :param hop_span: step between two samples, in seconds
    :param n_mels: number of mel frequency bands
    :param fmin: minimum frequency of the decomposition
    :param fmax: maximum frequency of the decomposition
    :param affichage: True to display the spectrogram
    :return: the fbank vectors representing the signal:
             X, a matrix giving the fbank decomposition over time (one row = one
             decomposition of size n_mels per hop_span period)
    """

    # First way to open a file:
    # wav_signal = scipy.io.wavfile.read(path)
    # wav = np.array(wav_signal[1])
    # s_rate = wav_signal[0]
    # Second way to open a file:
    wav, s_rate = librosa.load(path)

    X = feature.melspectrogram(y=util.normalize(wav), sr=s_rate, S=None, n_fft=int(np.floor(fft_span * s_rate)),
                               hop_length=int(np.floor(hop_span * s_rate)), n_mels=n_mels, fmin=fmin, fmax=fmax)
    # # Check the number of samples (one every 10 ms):
    # size = X.shape
    # print('Output matrix size', size)
    # print('Size of a 10 ms chunk of signal obtained', len(wav) / size[1])
    # print('Theoretical size of a chunk of signal', 0.01 * s_rate)
    # print('s_rate', s_rate)
    # print('length', wav.shape)
    # print(wav.shape[0] / s_rate)
    X = np.log(X)
    if affichage:
      afficherSpec(X, s_rate, hop_span)
    return np.transpose(X)
Example #8
def audio_to_array(audio):

    #extract audio data and sampling rate from file
    data, fs = sf.read(audio)

    #convert to wav file at correct sampling rate
    sf.write(audio, data, fs)

    #read the audio sample
    audio = read(audio)

    #[removed]
    #y, sr = load(audio, offset=30, duration=5)
    #audio_arr = mfcc(y=y, sr=sr)

    #convert the audio to an array
    audio_arr = np.array(audio[1],dtype=float)

    #normalize
    audio_arr = normalize(audio_arr, np.inf, 0)

    #short-time Fourier transform
    audio_arr = np.abs(stft(audio_arr))

    #[removed]
    #Mel - frequency cepstral coefficients(MFCCs)
    #audio_arr = np.abs(mfcc(audio_arr))
    #audio_arr = mfcc(audio_arr, sr=44100)

    #reduce number of dimensions
    pca = PCA(n_components=5)
    audio_arr = pca.fit_transform(audio_arr)
    return audio_arr
Example #9
 def LPC(self):
     fft = self.fft(self.windowed_x)
     self.Phase(fft)
     self.Spectrum()
     invert = self.ISTFT(self.magnitude_spectrum)
     invert = np.array(invert).T
     self.correlation = util.normalize(invert, norm=np.inf)
     subslice = [slice(None)] * np.array(self.correlation).ndim
     subslice[0] = slice(self.windowed_x.shape[0])
     self.correlation = np.array(self.correlation)[subslice]
     if not np.iscomplexobj(self.correlation):
         self.correlation = self.correlation.real  #compute autocorrelation of the frame
     self.correlation.flags.writeable = False
     E = np.copy(self.correlation[0])
     corr = np.copy(self.correlation)
     p = 14
     reflection = np.zeros(p)
     lpc = np.zeros(p + 1)
     lpc[0] = 1
     temp = np.zeros(p)
     for i in range(1, p + 1):
         k = float(self.correlation[i])
         for j in range(i):
             k += self.correlation[i - j] * lpc[j]
         k /= E
         reflection[i - 1] = k
         lpc[i] = -k
         for j in range(1, i):
             temp[j] = lpc[j] - k * lpc[i - j]
         for j in range(1, i):
             lpc[j] = temp[j]
         E *= (1 - pow(k, 2))
     return lpc[1:]
Example #10
def chroma_cqt(y=None, sr=22050, C=None, hop_length=512, fmin=None,
               norm=np.inf, threshold=0.0, tuning=None, n_chroma=12,
               n_octaves=7, window=None, bins_per_octave=None, cqt_mode='full'):

    cqt_func = {'full': cqt, 'hybrid': hybrid_cqt}

    if bins_per_octave is None:
        bins_per_octave = n_chroma

    # Build the CQT if we don't have one already
    if C is None:
        C = np.abs(cqt_func[cqt_mode](y, sr=sr,
                                      hop_length=hop_length,
                                      fmin=fmin,
                                      n_bins=n_octaves * bins_per_octave,
                                      bins_per_octave=bins_per_octave,
                                      tuning=tuning))

    # Map to chroma
    cq_to_chr = filters.cq_to_chroma(C.shape[0],
                                     bins_per_octave=bins_per_octave,
                                     n_chroma=n_chroma,
                                     fmin=fmin,
                                     window=window)
    chroma = cq_to_chr.dot(C)

    if threshold is not None:
        chroma[chroma < threshold] = 0.0

    # Normalize
    if norm is not None:
        chroma = util.normalize(chroma, norm=norm, axis=0)

    return chroma
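
A sanity check via the public API: the chroma of a pure tone should peak at its pitch class (A4 maps to index 9, with C as index 0). Illustrative only.

import numpy as np
import librosa

sr = 22050
t = np.arange(sr * 2) / sr
y = 0.5 * np.sin(2 * np.pi * 440.0 * t)   # A4

chroma = librosa.feature.chroma_cqt(y=y, sr=sr)
print(chroma.mean(axis=1).argmax())       # expected: 9 (pitch class A)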
Example #11
def spectral_centroid(y=None,
                      sr=22050,
                      S=None,
                      n_fft=2048,
                      hop_length=512,
                      freq=None):

    S, n_fft = _spectrogram(y=y, S=S, n_fft=n_fft, hop_length=hop_length)

    if not np.isrealobj(S):
        raise ParameterError('Spectral centroid is only defined '
                             'with real-valued input')
    elif np.any(S < 0):
        raise ParameterError('Spectral centroid is only defined '
                             'with non-negative energies')

    # Compute the center frequencies of each bin
    if freq is None:
        freq = fft_frequencies(sr=sr, n_fft=n_fft)

    if freq.ndim == 1:
        freq = freq.reshape((-1, 1))

    # Column-normalize S
    return np.sum(freq * util.normalize(S, norm=1, axis=0),
                  axis=0,
                  keepdims=True)
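
Since the centroid is a magnitude-weighted mean frequency, a pure tone's centroid should sit near the tone's frequency (spectral leakage nudges it slightly). A quick check through the public API, for illustration:

import numpy as np
import librosa

sr = 22050
t = np.arange(sr) / sr
y = np.sin(2 * np.pi * 1000.0 * t)        # 1 kHz tone

cent = librosa.feature.spectral_centroid(y=y, sr=sr)
print(float(cent.mean()))                 # ≈ 1000 Hz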
Example #12
def window_sumsquare(window,
                     n_frames,
                     hop_length=200,
                     win_length=800,
                     n_fft=800,
                     dtype=np.float32,
                     norm=None):

    if win_length is None:
        win_length = n_fft

    n = n_fft + hop_length * (n_frames - 1)
    x = np.zeros(n, dtype=dtype)

    # Compute the squared window at the desired length
    win_sq = get_window(window, win_length, fftbins=True)
    win_sq = librosa_util.normalize(win_sq, norm=norm)**2
    win_sq = librosa_util.pad_center(win_sq, n_fft)

    # Fill the envelope
    for i in range(n_frames):
        sample = i * hop_length
        x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
    return x
Example #13
def ChordSignature7thbass(chordid, feature, sevenths=True, inv=True):
    if chordid == const.N_CHORDS - 1:
        return "N"
    chroma = normalize(feature[:, 12:24], axis=1)
    bass_chroma = normalize(feature[:, :12], axis=1)
    root_note = chordid % 12
    third_note = (root_note + 4 - (chordid // 12)) % 12
    fifth_note = (root_note + 7) % 12
    seventh_note = (root_note + 10) % 12
    majseventh_note = (root_note + 11) % 12
    mean_root = np.mean(bass_chroma[:, root_note])
    mean_3rd = np.mean(bass_chroma[:, third_note])
    mean_5th = np.mean(bass_chroma[:, fifth_note])
    mean_7th = np.mean(chroma[:, seventh_note])
    mean_maj7th = np.mean(chroma[:, majseventh_note])
    root = PitchChr[root_note]
    quality = OutputQualityList[chordid // 12]
    bass = ""
    #determine seventh
    if sevenths:
        if (mean_7th > 0.5) or (mean_maj7th > 0.5):
            if mean_7th >= mean_maj7th:
                if quality == "min":
                    quality = "min7"
                else:
                    quality = "7"
            else:
                if quality == "maj":
                    quality = "maj7"
                else:
                    quality = "minmaj7"
    #determine bass
    if inv:
        if (mean_3rd > 0.6
                and mean_3rd > mean_root) or (mean_5th > 0.6
                                              and mean_5th > mean_root):
            if mean_3rd > mean_5th:
                if (quality == "min") or (quality == "min7"):
                    bass = "b3"
                else:
                    bass = "3"
            else:
                bass = "5"
    sign = "%s:%s" % (root, quality)
    if bass != "":
        sign += ("/" + bass)
    return sign
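
The pitch-class arithmetic is easier to see in isolation. A hedged sketch: it assumes, as the code above suggests, that `chordid // 12` indexes a quality block (0 = major, 1 = minor) and `chordid % 12` the root. `chord_tones` and this spelling of `PitchChr` are illustrative guesses, not part of the original codebase.

PitchChr = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']

def chord_tones(chordid):
    root = chordid % 12
    third = (root + 4 - (chordid // 12)) % 12   # block 0: major third, block 1: minor third
    fifth = (root + 7) % 12
    return PitchChr[root], PitchChr[third], PitchChr[fifth]

print(chord_tones(0))        # C major -> ('C', 'E', 'G')
print(chord_tones(12 + 9))   # A minor -> ('A', 'C', 'E')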
Example #14
def window_sumsquare(window,
                     n_frames,
                     hop_length=200,
                     win_length=800,
                     n_fft=800,
                     dtype=np.float32,
                     norm=None):
    # win_length/n_fft default to 800 samples; n_frames is how many frames to analyze
    """
    # from librosa 0.6
    Compute the sum-square envelope of a window function at a given hop length.

    This is used to estimate modulation effects induced by windowing
    observations in short-time fourier transforms.

    Parameters
    ----------
    window : string, tuple, number, callable, or list-like
        Window specification, as in `get_window`

    n_frames : int > 0
        The number of analysis frames

    hop_length : int > 0
        The number of samples to advance between frames

    win_length : [optional]
        The length of the window function.  By default, this matches `n_fft`.

    n_fft : int > 0
        The length of each analysis frame.

    dtype : np.dtype
        The data type of the output

    Returns
    -------
    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
        The sum-squared envelope of the window function
    """
    if win_length is None:
        win_length = n_fft

    n = n_fft + hop_length * (n_frames - 1)  # total output length
    x = np.zeros(n, dtype=dtype)

    # Compute the squared window at the desired length
    win_sq = get_window(window, win_length, fftbins=True)  # sample the window function
    win_sq = librosa_util.normalize(win_sq, norm=norm)**2  # square it
    win_sq = librosa_util.pad_center(win_sq, n_fft)  # zero-pad to n_fft; this only
    # has an effect when win_length is shorter than n_fft

    # Fill the envelope: each iteration shifts the squared window right by
    # hop_length and accumulates it, hence "sum-square"
    for i in range(n_frames):  # hop_length is the advance between successive frames
        sample = i * hop_length
        x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
    return x
Example #15
def window_sumsquare(
    window,
    n_frames,
    hop_length,
    win_length,
    n_fft,
    dtype=np.float32,
    norm=None,
):
    """
    # from librosa 0.6
    Compute the sum-square envelope of a window function at a given hop length.

    This is used to estimate modulation effects induced by windowing
    observations in short-time fourier transforms.

    Parameters
    ----------
    window : string, tuple, number, callable, or list-like
        Window specification, as in `get_window`

    n_frames : int > 0
        The number of analysis frames

    hop_length : int > 0
        The number of samples to advance between frames

    win_length : [optional]
        The length of the window function.  By default, this matches `n_fft`.

    n_fft : int > 0
        The length of each analysis frame.

    dtype : np.dtype
        The data type of the output

    Returns
    -------
    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
        The sum-squared envelope of the window function
    """
    if win_length is None:
        win_length = n_fft

    n = n_fft + hop_length * (n_frames - 1)
    x = np.zeros(n, dtype=dtype)

    # Compute the squared window at the desired length
    win_sq = get_window(window, win_length, fftbins=True)
    win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2
    win_sq = librosa_util.pad_center(win_sq, n_fft)

    # Fill the envelope
    for i in range(n_frames):
        sample = i * hop_length
        x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))]
    return x
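
What the envelope is for: with a periodic Hann window at 75% overlap, the squared-window sum is flat (exactly 1.5) away from the edges, which is the factor ISTFT normalization divides out. A quick check using the function above; it relies on the same `get_window`/`librosa_util` imports the snippet assumes.

wss = window_sumsquare('hann', n_frames=50, hop_length=200,
                       win_length=800, n_fft=800)
interior = wss[800:-800]               # skip the ramp-up/ramp-down edges
print(interior.min(), interior.max())  # both ≈ 1.5 for Hann at 75% overlap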
Example #16
    def load_wav_to_torch(self, full_path):
        """
        Loads wav data into a torch tensor.
        """
        data, sampling_rate = load(full_path, sr=None)
        data = 0.95 * normalize(data)
        data = torch.from_numpy(data).float()
        if self.augs is not None:
            data = self.augs(data)
        return data, sampling_rate
Example #17
    def __getitem__(self, index):
        filename = self.audio_files[index]
        if self._cache_ref_count == 0:
            audio, sampling_rate = load_wav(filename)
            audio = audio / MAX_WAV_VALUE
            if not self.fine_tuning:
                audio = normalize(audio) * 0.95
            self.cached_wav = audio
            if sampling_rate != self.sampling_rate:
                raise ValueError("{} SR doesn't match target {} SR".format(
                    sampling_rate, self.sampling_rate))
            self._cache_ref_count = self.n_cache_reuse
        else:
            audio = self.cached_wav
            self._cache_ref_count -= 1

        audio = torch.FloatTensor(audio)
        audio = audio.unsqueeze(0)

        if not self.fine_tuning:
            if self.split:
                if audio.size(1) >= self.segment_size:
                    max_audio_start = audio.size(1) - self.segment_size
                    audio_start = random.randint(0, max_audio_start)
                    audio = audio[:, audio_start:audio_start+self.segment_size]
                else:
                    audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant')

            mel = mel_spectrogram(audio, self.n_fft, self.num_mels,
                                  self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax,
                                  center=False)
        else:
            mel = np.load(
                os.path.join(self.base_mels_path, os.path.splitext(filename)[0] + '.npy'))
            mel = torch.from_numpy(mel)

            if len(mel.shape) < 3:
                mel = mel.unsqueeze(0)

            if self.split:
                frames_per_seg = math.ceil(self.segment_size / self.hop_size)

                if audio.size(1) >= self.segment_size:
                    mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1)
                    mel = mel[:, :, mel_start:mel_start + frames_per_seg]
                    audio = audio[:, mel_start * self.hop_size:(mel_start + frames_per_seg) * self.hop_size]
                else:
                    mel = torch.nn.functional.pad(mel, (0, frames_per_seg - mel.size(2)), 'constant')
                    audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant')

        mel_loss = mel_spectrogram(audio, self.n_fft, self.num_mels,
                                   self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax_loss,
                                   center=False)

        return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze())
Example #18
    def autocorrelation(self):
        self.N = (self.windowed_x[:, 0].shape[0] + 1) * 2
        corr = ifft(fft(self.windowed_x, n=self.N))
        self.correlation = util.normalize(corr, norm=np.inf)
        subslice = [slice(None)] * np.array(self.correlation).ndim
        subslice[0] = slice(self.windowed_x.shape[0])

        self.correlation = np.array(self.correlation)[subslice]
        if not np.iscomplexobj(self.correlation):
            self.correlation = self.correlation.real
        return self.correlation
Example #19
def gen_win_sq(denoiser):
    window = denoiser.stft.window
    win_length = denoiser.stft.win_length
    n_fft = denoiser.stft.filter_length

    # Compute the squared window at the desired length
    win_sq = get_window(window, win_length, fftbins=True)
    win_sq = librosa_util.normalize(win_sq, norm=None)**2
    win_sq = librosa_util.pad_center(win_sq, n_fft)

    return win_sq
Example #20
 def autocorrelation(self):
     self.N = (self.windowed_x[:, 0].shape[0] + 1) * 2
     corr = ifft(fft(self.windowed_x, n=self.N))
     self.correlation = util.normalize(corr, norm=np.inf)
     subslice = [slice(None)] * np.array(self.correlation).ndim
     subslice[0] = slice(self.windowed_x.shape[0])

     self.correlation = np.array(self.correlation)[subslice]
     if not np.iscomplexobj(self.correlation):
         self.correlation = self.correlation.real
     return self.correlation
Example #21
    def smooth(self, feat, win_len_smooth=4):
        '''
        This code is similar to the one used in librosa to smooth CENS
        features:
        https://librosa.github.io/librosa/generated/librosa.feature.chroma_cens.html
        '''
        win = filters.get_window('hann', win_len_smooth + 2, fftbins=False)
        win /= np.sum(win)
        win = np.atleast_2d(win)

        feat = scipy.signal.convolve2d(feat, win, mode='same', boundary='fill')
        return util.normalize(feat, norm=2, axis=0)
Example #22
    def load_wav_to_torch(self, full_path):
        """
        Loads wav data into a torch tensor.
        """
        data, sampling_rate = load(full_path, sr=self.sampling_rate)
        data = 0.95 * normalize(data)

        if self.augment:
            amplitude = np.random.uniform(low=0.3, high=1.0)
            data = data * amplitude

        return torch.from_numpy(data).float(), sampling_rate
Example #23
def read_wav(
    fname: str, sr: int, norm: float = 0, pre_emphasis: bool = False
) -> np.ndarray:
    "Read a wave file into a normalized array"

    (S, _) = librosa.load(fname, sr=sr)
    (S, _) = effects.trim(S)
    if pre_emphasis:
        S[1:] -= S[:-1]
    if norm != 0:  # "is not" on an int literal is a bug; compare by value
        S = librosa_util.normalize(S, norm=norm)
    return S
Example #24
def add_noise(data, noise_ratio=.05):
    """
    adds randomness (white noise) to signal
    Args:
        data: array of audio file(s)
        noise_ratio: how much noise to add

    Returns: normalized audio file with white noise applied

    """
    noisy_data = data + noise_ratio * np.random.normal(loc=0.0, scale=1.0, size=data.shape)
    return normalize(noisy_data)
Example #25
    def __getitem__(self, index):
        filename = self.audio_files[index]
        if self._cache_ref_count == 0:
            audio, sampling_rate = load_wav(filename)
            audio = audio / MAX_WAV_VALUE
            audio = normalize(audio) * 0.95
            self.cached_wav = audio
            if sampling_rate != self.sampling_rate:
                raise ValueError("{} SR doesn't match target {} SR".format(
                    sampling_rate, self.sampling_rate))
            self._cache_ref_count = self.n_cache_reuse
        else:
            audio = self.cached_wav
            self._cache_ref_count -= 1

        audio = torch.FloatTensor(audio)
        audio = audio.unsqueeze(0)

        if self.split:
            if audio.size(1) >= self.segment_size:
                max_audio_start = audio.size(1) - self.segment_size
                audio_start = random.randint(0, max_audio_start)
                audio = audio[:, audio_start:audio_start + self.segment_size]
            else:
                audio = torch.nn.functional.pad(
                    audio, (0, self.segment_size - audio.size(1)), "constant")

        mel = mel_spectrogram(
            audio,
            self.n_fft,
            self.num_mels,
            self.sampling_rate,
            self.hop_size,
            self.win_size,
            self.fmin,
            self.fmax,
            center=False,
        )

        mel_loss = mel_spectrogram(
            audio,
            self.n_fft,
            self.num_mels,
            self.sampling_rate,
            self.hop_size,
            self.win_size,
            self.fmin,
            self.fmax_loss,
            center=False,
        )

        return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze())
Example #26
def getLocalFeatures(X, T=True):
  import librosa
  from librosa.util import normalize
  def MFCC(sig, sample_rate=22050):
    return librosa.feature.mfcc(y=np.array(sig), sr=sample_rate, n_mfcc=12)

  # list comprehensions replace the Python 2 map() calls
  X = np.asarray([[MFCC(sig) for sig in s] for s in X])
  X_flattened = [v for lis in X for v in lis]

  if T:
    X_flattened = np.asarray([np.transpose(clip) for clip in X_flattened])

  X_normalized = normalize(X_flattened, norm=2)
  X_flattened = np.asarray([x for clip in X_normalized for x in clip])
  return X_flattened
Example #27
    def load_wav_to_torch(self, full_path, offset):
        """
        Loads wav data into a torch tensor.
        """
        load_duration = self.segment_length / self.sampling_rate
        data, _ = load(
            full_path, sr=self.sampling_rate, offset=offset, duration=load_duration
        )
        data = 0.95 * normalize(data)

        if self.augment:
            amplitude = self.random_state.uniform(low=0.3, high=1.0)
            data = data * amplitude

        return torch.from_numpy(data).float()
Example #28
    def load_wav_to_torch(self, full_path):
        """
        Loads wav data into a torch tensor.
        """
        data, sampling_rate = load(full_path, sr=self.sampling_rate)
        #print(data.shape, flush=True)
        try:
            data = 0.95 * normalize(data)
        except Exception:
            print(full_path, flush=True)
            sys.exit(-1)
        if self.augment:
            amplitude = np.random.uniform(low=0.3, high=1.0)
            data = data * amplitude

        return torch.from_numpy(data).float(), sampling_rate
Example #29
  def featureExtraction(self, X, transpose=True):
    def MFCC(signal, sr=22050):
      return librosa.feature.mfcc(y=np.array(signal), sr=sr, n_mfcc=20)

    # list comprehension replaces the Python 2 map() call
    X = np.array([[MFCC(clip) for clip in song] for song in X])
    print("--> After MFCC X.shape", X.shape)
    X_train_flattened = [val for sublist in X for val in sublist]
    print("--> X_train_flattened.shape", np.array(X_train_flattened).shape)

    if transpose:
      X_train_flattened = np.array([np.transpose(clip).flatten() for clip in X_train_flattened])
      print("--> After transpose X_train_flattened.shape", X_train_flattened.shape)

    X_train_flattened_norm = normalize(X_train_flattened, norm=2)
    return X_train_flattened_norm
Example #30
def mix(x0, x1, snr):
    """Mix two signals

    Args:
        x0 (numpy.ndarray): signal (n_samples,)
        x1 (numpy.ndarray): signal (n_samples,)
        snr (float): mixing coefficient applied on x1 (dB)

    Returns:
        numpy.ndarray: mixed signal (n_samples,)
    """
    # apply
    x0 = _norm_n_weight(x0, 0)  # set this signal as `signal`
    x1 = _norm_n_weight(x1, -snr)  # treat this as `noise`
    y = normalize(x0 + x1)
    return y
Example #31
def gd_eval(codes, grain_is, lr=0.05, maxiter=20, verbose=0, **kwargs):
    code_dot = []
    for i, (src_i, tgt_i) in enumerate(grain_is):
        print("{}/{}: {}, {}".format(i + 1, len(grain_is), src_i, tgt_i))
        target = normalize(
            autocorr_t(codes.sample(tgt_i), codes.t(tgt_i)).feature, norm=2)
        trajectory = choose_molecule_pitch_opt(target,
                                               codes.acorr_coef(src_i),
                                               trace=True,
                                               lr=lr,
                                               maxiter=maxiter,
                                               verbose=verbose,
                                               **kwargs)
        code_dot.append(trajectory)
    return code_dot
Example #32
def butter_bandpass_filter(data, lowcut, highcut, fs, order=5, apply_norm=False):
    """
    applies a Butterworth bandpass filter to an audio array
    Args:
        data: 1D array audio file
        lowcut: low frequency cutoff point
        highcut: high frequency cutoff point
        fs: sample rate
        order: filter order (higher gives a steeper roll-off)
        apply_norm: if True, normalizes the output

    Returns: 1D array audio file with Butterworth bandpass filter applied

    """
    # the flag was renamed from `normalize`: the original parameter shadowed
    # librosa.util.normalize, so `normalize(y)` would have called a bool
    b, a = butter_bandpass(lowcut, highcut, fs, order=order)
    y = lfilter(b, a, data)
    if apply_norm:
        y = normalize(y)
    return y
Example #33
def _norm_n_weight(x, dB):
    """Normalize and weight the signal to a given dB level

    Args:
        x (numpy.ndarray): signal (n_samples,)
        dB (float): target level (dB)

    Returns:
        numpy.ndarray: processed signal (n_samples,)
    """
    # peak-normalize the signal
    x = normalize(x)

    # get the RMS of each signal
    rms = np.linalg.norm(x) / np.sqrt(len(x))

    # get the weight
    ratio = (10**(dB / 20)) / rms

    return x * ratio
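
The weighting math: after peak normalization, dividing by the RMS and multiplying by 10^(dB/20) forces the output RMS to exactly 10^(dB/20), so `mix` realizes the requested SNR between its two components. A small check, for illustration:

import numpy as np

rng = np.random.default_rng(1)
x0 = _norm_n_weight(rng.standard_normal(48000), 0)     # "signal" at 0 dB
x1 = _norm_n_weight(rng.standard_normal(48000), -10)   # "noise" at -10 dB

rms = lambda v: np.sqrt(np.mean(v ** 2))
print(20 * np.log10(rms(x0) / rms(x1)))                # 10.0 dB SNR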
Example #34
def window_sumsquare(window,
                     n_frames,
                     hop_length=120,
                     win_length=800,
                     n_fft=800,
                     dtype=float,
                     norm=None):
    if win_length is None:
        win_length = n_fft
    n = n_fft + hop_length * (n_frames - 1)
    x = np.zeros(n, dtype=dtype)

    win_sq = get_window(window, win_length, fftbins=True)
    win_sq = librosa_util.normalize(win_sq, norm=norm)**2
    win_sq = librosa_util.pad_center(win_sq, n_fft)

    for i in range(n_frames):
        sample = i * hop_length
        x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
    return x
Example #35
def fbank(path, fft_span, hop_span, n_mels, fmin, fmax, affichage=False):
    """
    :param path: location of the file
    :param fft_span: size of the Fourier-transform window, in seconds
    :param hop_span: step between two samples, in seconds
    :param n_mels: number of mel frequency bands
    :param fmin: minimum frequency of the decomposition
    :param fmax: maximum frequency of the decomposition
    :param affichage: True to display the spectrogram
    :return: the fbank vectors representing the signal:
             X, a matrix giving the fbank decomposition over time (one row = one
             decomposition of size n_mels per hop_span period)
    """

    # First way to open a file:
    # wav_signal = scipy.io.wavfile.read(path)
    # wav = np.array(wav_signal[1])
    # s_rate = wav_signal[0]
    # Second way to open a file:
    wav, s_rate = librosa.load(path)

    X = feature.melspectrogram(y=util.normalize(wav),
                               sr=s_rate,
                               S=None,
                               n_fft=int(np.floor(fft_span * s_rate)),
                               hop_length=int(np.floor(hop_span * s_rate)),
                               n_mels=n_mels,
                               fmin=fmin,
                               fmax=fmax)
    # # Check the number of samples (one every 10 ms):
    # size = X.shape
    # print('Output matrix size', size)
    # print('Size of a 10 ms chunk of signal obtained', len(wav) / size[1])
    # print('Theoretical size of a chunk of signal', 0.01 * s_rate)
    # print('s_rate', s_rate)
    # print('length', wav.shape)
    # print(wav.shape[0] / s_rate)
    X = np.log(X)
    if affichage:
        afficherSpec(X, s_rate, hop_span)
    return np.transpose(X)
Example #36
def loadSamples():
  collections = pickle.load(open('samples.in', 'rb'))  # binary mode for pickle
  X, y = [], []
  words = []

  keys = list(collections.keys())  # dict views are not indexable in Python 3
  for idx in range(len(keys)):
    samples = collections[keys[idx]]
    to_clustered = random.randint(0, len(samples) - 1)
    for i in range(len(samples)):
      if i == to_clustered:
        words.append([clip[:66058] for clip in samples[i]])
      else:
        X.append([clip[:66058] for clip in samples[i]])
        y.append(idx)

  X = [[runMFCC(window) for window in sample] for sample in X]
  words = [[runMFCC(window) for window in sample] for sample in words]

  normedData = np.asarray(normalize(X + words, norm=2))
  X, words = normedData[:30], normedData[30:]

  return np.asarray([flatten(s) for s in X]), np.asarray(y), np.asarray([flatten(w) for w in words])
Example #37
def featureExtraction(X, transpose=True):
  def MFCC(signal, sr=22050):
    return librosa.feature.mfcc(y=np.array(signal), sr=sr, n_mfcc=12)

  # list comprehension replaces the Python 2 map() call
  X = np.array([[MFCC(clip) for clip in song] for song in X])
  print("After MFCC X.shape", X.shape)
  X_train_flattened = [val for sublist in X for val in sublist]
  print("X_train_flattened.shape", np.array(X_train_flattened).shape)
  # librosa.display.specshow(X_train_flattened[0], x_axis='time')
  # plt.colorbar()
  # plt.title('MFCC X_train_flattened[0]')
  # plt.tight_layout()
  # plt.show()

  if transpose:
    X_train_flattened = np.array([np.transpose(clip) for clip in X_train_flattened])
    print("After transpose X_train_flattened.shape", X_train_flattened.shape)

  X_train_flattened_norm = normalize(X_train_flattened, norm=2)
  X_train_flattened_norm_final = np.array([mfcc for clip in X_train_flattened_norm for mfcc in clip])
  return X_train_flattened_norm_final
Example #38
  def featureExtraction(self, X, transpose=True):
    def MFCC(signal, sr=22050):
      y = np.array(signal, dtype=np.float64)

      y_h1, y_p1 = librosa.effects.hpss(y)
      y_h2, y_p2 = librosa.effects.hpss(y_h1)
      y_h3, y_p3 = librosa.effects.hpss(y_p1)

      y_h4, y_p4 = librosa.effects.hpss(y_h2)
      y_h5, y_p5 = librosa.effects.hpss(y_p2)
      y_h6, y_p6 = librosa.effects.hpss(y_h3)
      y_h7, y_p7 = librosa.effects.hpss(y_p3)

      mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
      y_h1 = librosa.feature.mfcc(y=y_h1, sr=sr, n_mfcc=20)
      y_p1 = librosa.feature.mfcc(y=y_p1, sr=sr, n_mfcc=20)
      y_h2 = librosa.feature.mfcc(y=y_h2, sr=sr, n_mfcc=20)
      y_p2 = librosa.feature.mfcc(y=y_p2, sr=sr, n_mfcc=20)
      y_h3 = librosa.feature.mfcc(y=y_h3, sr=sr, n_mfcc=20)
      y_p3 = librosa.feature.mfcc(y=y_p3, sr=sr, n_mfcc=20)
      y_h4 = librosa.feature.mfcc(y=y_h4, sr=sr, n_mfcc=20)
      y_p4 = librosa.feature.mfcc(y=y_p4, sr=sr, n_mfcc=20)
      y_h5 = librosa.feature.mfcc(y=y_h5, sr=sr, n_mfcc=20)
      y_p5 = librosa.feature.mfcc(y=y_p5, sr=sr, n_mfcc=20)
      y_h6 = librosa.feature.mfcc(y=y_h6, sr=sr, n_mfcc=20)
      y_p6 = librosa.feature.mfcc(y=y_p6, sr=sr, n_mfcc=20)
      y_h7 = librosa.feature.mfcc(y=y_h7, sr=sr, n_mfcc=20)
      y_p7 = librosa.feature.mfcc(y=y_p7, sr=sr, n_mfcc=20)

      return np.vstack([y_h1, y_p1, y_h2, y_p2, y_h3, y_p3, y_h4, y_p4, y_h5, y_p5, y_h6, y_p6, y_h7, y_p7])

      # y = np.array(signal, dtype=np.float64)
      # y_harmonic, y_percussive = librosa.effects.hpss(y)

      # mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
      # mfcc_h = librosa.feature.mfcc(y=y_harmonic, sr=sr, n_mfcc=20)
      # mfcc_p = librosa.feature.mfcc(y=y_percussive, sr=sr, n_mfcc=20)

      # delta_mfcc  = librosa.feature.delta(mfcc)
      # delta2_mfcc = librosa.feature.delta(mfcc, order=2)
      # delta_mfcc_h  = librosa.feature.delta(mfcc_h)
      # delta2_mfcc_h = librosa.feature.delta(mfcc_h, order=2)
      # delta_mfcc_p  = librosa.feature.delta(mfcc_p)
      # delta2_mfcc_p = librosa.feature.delta(mfcc_p, order=2)
      # return np.vstack([mfcc_h, delta_mfcc_h, delta2_mfcc_h, mfcc_p, delta_mfcc_p, delta2_mfcc_p])

      # y_harmonic, y_percussive = librosa.effects.hpss(np.array(signal, dtype=np.float64))
      # mfcc_orig = librosa.feature.mfcc(y=np.array(signal, dtype=np.float64), sr=sr, n_mfcc=20)
      # mfcc_h = librosa.feature.mfcc(y=y_harmonic, sr=sr, n_mfcc=20)
      # mfcc_p = librosa.feature.mfcc(y=y_percussive, sr=sr, n_mfcc=20)
      # return np.vstack([mfcc_h, mfcc_p])

      # S = librosa.feature.melspectrogram(np.array(signal), sr=sr, n_mels=20)
      # # Convert to log scale (dB). We'll use the peak power as reference.
      # log_S = librosa.logamplitude(S, ref_power=np.max)
      # return librosa.feature.mfcc(S=log_S, n_mfcc=20)


      # S = librosa.feature.melspectrogram(y=np.array(signal), sr=sr, n_mels=20,fmax=9000)
      # return librosa.logamplitude(S, ref_power=np.max)

      # return librosa.feature.mfcc(y=np.array(signal), sr=sr, n_mfcc=20)

    # list comprehension replaces the Python 2 map() call
    X = np.array([[MFCC(clip) for clip in song] for song in X])
    print("--> After MFCC X.shape", X.shape)
    X_train_flattened = [val for sublist in X for val in sublist]
    print("--> X_train_flattened.shape", np.array(X_train_flattened).shape)

    if transpose:
      X_train_flattened = np.array([np.transpose(clip) for clip in X_train_flattened])
      # X_train_flattened = np.array([np.transpose(clip).flatten() for clip in X_train_flattened])
      print("--> After transpose X_train_flattened.shape", X_train_flattened.shape)

    X_train_flattened_norm = normalize(X_train_flattened, norm=2)
    # return X_train_flattened_norm
    X_train_flattened_norm_final = np.array([mfcc for clip in X_train_flattened_norm for mfcc in clip])
    return X_train_flattened_norm_final
Example #39
  print "Perform beat_track and cqt"
  tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
  cqt = librosa.cqt(y=y, sr=sr)
  print "saving cqt and beats... "
  np.save("./tempArray/cqt.npy", cqt)
  np.save("./tempArray/beats.npy", beats)
else:
  print "Loading cqt_med and frameConversion... "
  cqt = np.load('./tempArray/cqt.npy')
  beats = np.load('./tempArray/beats.npy')
  sr = 44100

print "Perform sync ..."
cqt_med, frameConversion = librosaF.sync(cqt, beats, aggregate=np.median)
cqt_med = cqt_med.T
cqt_med = normalize(cqt_med, norm=2)

print "Perform loadInterval2Frame ..."
interval = librosaF.loadInterval2Frame("../data/anno/698/parsed/textfile1_uppercase.txt", sr, frameConversion)

print "Creating sigmas matrix ..."
sigmas = np.random.rand(cqt_med.shape[0], cqt_med.shape[0]) + 1e-7 #add a base in case of 0 sigma
sigmas = ((sigmas + sigmas.T)/2)

gm = RM.feature2GaussianMatrix(cqt_med, sigmas) #(nSample, nFeature)
L = scipy.sparse.csgraph.laplacian(gm, normed=True)
m_true = RM.label2RecurrenceMatrix("../data/2.jams", gm.shape[0], interval)
L_true = scipy.sparse.csgraph.laplacian(m_true, normed=True)
np.save("./tempArray/L_true.npy", L_true)

print "cqt_med [min, max]: %s" % str((cqt_med.min(), cqt_med.max()))