def audio_to_data(signal):

    if config.silence_thr_db is not None:
        signal, _ = trim(signal,
                         config.silence_thr_db,
                         frame_length=config.fft_bins,
                         hop_length=config.fft_hop_len)

    spec = abs(
        stft(signal, config.fft_bins, config.fft_hop_len,
             config.fft_window_len))

    # mfccs = mfcc(signal, config.sample_rate, n_mfcc=config.mfcc_bins)
    # chroma = chroma_stft(signal, config.sample_rate, n_fft=config.fft_bins, hop_length=config.fft_hop_len, win_length=config.fft_window_len)
    # show(specshow(spec, sr=config.sample_rate, hop_length=config.fft_hop_len))
    # show(specshow(mfccs, sr=config.sample_rate, hop_length=config.fft_hop_len))
    # show(plot(chroma))

    vector = deepcopy(spec)
    print('\tmax min initially:', max(vector), min(vector))

    vector = amplitude_to_db(vector)
    print('\tmax min in db:', max(vector), min(vector))

    # vector = concatenate([vector, chroma], 0)
    vector = vector.T
    print('\tfinal vector shape:', vector.shape)

    return vector
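The `audio_to_data` snippets above and in Example #19 read their FFT and trimming settings from a project-specific `config` object that is not shown. A minimal sketch of the fields these snippets assume (all names come from the snippets, all values are illustrative, not from the original project):

from types import SimpleNamespace

# Hypothetical settings matching the attribute names used in audio_to_data.
config = SimpleNamespace(
    sample_rate=22050,
    silence_thr_db=40,      # top_db passed to librosa.effects.trim
    fft_bins=2048,          # n_fft
    fft_hop_len=512,        # hop_length
    fft_window_len=2048,    # win_length
    mfcc_bins=20,
)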
Example #2
    def _adjust_time_resolution(self, batch, local_condition, max_time_steps):
        '''Adjust time resolution between audio and local condition.'''
        if local_condition:
            new_batch = []
            for b in batch:
                x, c, g, l = b
                self._assert_ready_for_upsample(x, c)
                if max_time_steps is not None:
                    max_steps = _ensure_divisible(max_time_steps,
                                                  self._hparams.hop_size, True)
                    if len(x) > max_time_steps:
                        max_time_frames = max_steps // self._hparams.hop_size
                        start = np.random.randint(0, len(c) - max_time_frames)
                        time_start = start * self._hparams.hop_size
                        x = x[time_start:time_start +
                              max_time_frames * self._hparams.hop_size]
                        c = c[start:start + max_time_frames, :]
                        self._assert_ready_for_upsample(x, c)

                new_batch.append((x, c, g, l))
            return new_batch
        else:
            new_batch = []
            for b in batch:
                x, c, g, l = b
                x = trim(x)
                if max_time_steps is not None and len(x) > max_time_steps:
                    start = np.random.randint(0, len(c) - max_time_steps)
                    x = x[start:start + max_time_steps]
                new_batch.append((x, c, g, l))
            return new_batch
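Example #2 also calls an `_ensure_divisible` helper that is not shown here. A minimal sketch, assuming from its usage that it rounds a length down (third argument True) or up to a multiple of the hop size:

def _ensure_divisible(length, divisible_by=256, lower=True):
    # Round `length` to a multiple of `divisible_by`: downward when
    # `lower` is True, upward otherwise.
    if length % divisible_by == 0:
        return length
    if lower:
        return length - length % divisible_by
    return length + (divisible_by - length % divisible_by)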
Example #3
def extract_features(audio, rate):
    audio = reduce_noise_power(audio, rate)

    audio, indexes = trim(audio)

    mfcc_feature = mfcc(y=audio,
                        sr=rate,
                        n_mfcc=13,
                        n_fft=int(0.025 * rate),
                        n_mels=40,
                        fmin=20,
                        hop_length=int(0.03 * rate))

    mfcc_feature = preprocessing.scale(mfcc_feature, axis=1)

    mfcc_feature = stats.zscore(mfcc_feature)

    pitches, magnitudes = pitch(y=audio,
                                sr=rate,
                                fmin=50,
                                fmax=400,
                                n_fft=int(0.025 * rate),
                                hop_length=int(0.03 * rate))

    #delta_f = delta(mfcc_feature)
    #d_delta_f = delta(mfcc_feature, order=2)
    combined = np.hstack((np.transpose(mfcc_feature), np.transpose(pitches)))
    return combined
Example #4
def trim(examples: Sequence[EmplacableExample], top_db: int = 40):
    return [
        ex.emplaced_audio_data(
            torch.from_numpy(
                effects.trim(ex.audio_data.cpu().numpy(), top_db=top_db)[0]))
        for ex in examples
    ]
Example #5
def Prepare(data):
    T = data.T
    if np.shape(T)[0] == 2:
        data = T[0]

    data = trim(data, 30)[0]

    return data
Example #6
    def load_wav(cls, file_path: str, is_trim: bool) -> np.ndarray:
        """Load waveform."""
        wav = load(file_path, sr=cls.sample_rate)[0]
        if is_trim:
            wav = trim(wav, top_db=cls.top_db)[0]
        wav = np.clip(wav, -1.0 + 1e-6, 1.0 - 1e-6)

        return wav
Example #7
    def load_wav(cls, file_path: str, is_trim: bool) -> np.ndarray:
        """Load waveform."""
        wav, _ = load(file_path, sr=cls.sample_rate, mono=True)
        # Trimming
        if is_trim:
            wav, _ = trim(wav)

        return wav
Example #8
    def load_wav(cls, file_path: str, is_trim: bool) -> np.ndarray:
        """Load waveform."""
        wav, _ = load(file_path, sr=cls.sample_rate)
        wav = wav / (np.abs(wav).max() + 1e-6)
        if is_trim:
            wav, _ = trim(wav, top_db=cls.top_db)

        return wav
Example #9
    def load_wav(cls, file_path: str, is_trim: bool) -> np.ndarray:
        """Load waveform."""
        wav = load(file_path, sr=cls.sample_rate)[0]
        wav = wav / (np.abs(wav).max() + 1e-6)
        if is_trim:
            wav = trim(wav, top_db=cls.top_db)[0]
        wav = filtfilt(*cls.butter_highpass(), wav)
        wav = wav * 0.96

        return wav
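Example #9 (and Example #14 below) unpacks the result of a `butter_highpass` class method into `scipy.signal.filtfilt`; the method itself is not shown. A minimal sketch using `scipy.signal.butter`, where the cutoff frequency, sample rate, and filter order are assumptions:

from scipy.signal import butter

def butter_highpass(cutoff=30, sample_rate=16000, order=5):
    # Design a Butterworth high-pass filter; returns the (b, a)
    # coefficients expected by filtfilt(*butter_highpass(), wav).
    normalized_cutoff = cutoff / (0.5 * sample_rate)
    return butter(order, normalized_cutoff, btype="high")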
Example #10
def load_wav_to_torch(full_path: str,
                      sr: Optional[int] = 24000) -> Tuple[torch.Tensor, int]:
    """Load audio file from `full_path` with optional resamplling to `sr`.
    Args:
        full_path (str): path to audio file.
        sr (int, optional): sample rate to resample to.
    Returns:
        (torch.Tensor, sampling_rate)
    """

    data, sampling_rate = load(full_path, sr)
    return torch.from_numpy(trim(data)[0]), sampling_rate
Example #11
def read_wav(
    fname: str, sr: int, norm: float = 0, pre_emphasis: bool = False
) -> np.ndarray:
    "Read a wave file into a normalized array"

    (S, _) = librosa.load(fname, sr=sr)
    (S, _) = effects.trim(S)
    if pre_emphasis:
        S[1:] -= S[:-1]
    if norm != 0:
        S = librosa_util.normalize(S, norm=norm)
    return S
Example #12
    def guess(self):
        wav, _ = load(self.WAVE_OUTPUT_FILENAME, sr=self.sr)
        wav, _ = trim(wav, top_db=self.top_db)
        write_wav(self.WAVE_OUTPUT_FILENAME, wav, self.sr)
        print(">> save as", self.WAVE_OUTPUT_FILENAME)

        #dtw recognition
        x = self.getMfcc(wav, self.sr)
        res = self.recognition(x)
        print(res)

        self.audio_num = self.audio_num + 1
        self.WAVE_OUTPUT_FILENAME = "./saved/" + str(self.audio_num) + ".wav"
Example #13
    def __init__(self, file, dim=dim):
        self.directory = file
        # print(file)
        self.sound = sa.WaveObject.from_wave_file(self.directory)

        # self.sound = QtCore.QObject.QtMultimedia.QSound(file)
        self._raw, self.rate = load(file)
        trimmed, ind = trim(self._raw, top_db=50)
        # print(index[0])
        if ((ind[1] + int(0.03 * self.rate) - ind[0]) % 2):
            self.data = self._raw[ind[0]:ind[1] + int(0.03 * self.rate) - 1]
        else:
            self.data = self._raw[ind[0]:ind[1] + int(0.03 * self.rate)]
        self.seg = floor(self.data.shape[0] / dim)
        self.length = self._raw.shape[0]
Example #14
    def file2spectrogram(cls, file_path):
        """Load audio file and create spectrogram."""

        wav = load(file_path, sr=cls.sample_rate)[0]
        wav = trim(wav, top_db=cls.top_db)[0]
        wav = filtfilt(*cls.butter_highpass(), wav)
        wav = wav * 0.96

        d_mag = cls.short_time_fourier_transform(wav)
        d_mel = np.dot(d_mag.T, cls.mel_basis)

        db_val = 20 * np.log10(np.maximum(cls.min_level, d_mel))
        db_scaled = db_val - cls.ref_db
        db_normalized = (db_scaled + cls.max_db) / cls.max_db

        return np.clip(db_normalized, 0, 1).astype(np.float32)
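Example #14 also multiplies the magnitude spectrogram by a precomputed `cls.mel_basis`. Since `d_mag.T` has shape (frames, 1 + n_fft // 2), the basis must have shape (1 + n_fft // 2, n_mels); a sketch with `librosa.filters.mel`, where n_fft and n_mels are assumptions:

import librosa

# librosa.filters.mel returns (n_mels, 1 + n_fft // 2); transpose it so that
# np.dot(d_mag.T, mel_basis) yields a (frames, n_mels) mel spectrogram.
mel_basis = librosa.filters.mel(sr=16000, n_fft=1024, n_mels=80).T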
Example #15
def trim_signal_length(signal, sample_rate, length=SAMPLE_DURATION):
    # Trim leading and trailing silence
    signal, i = trim(y=signal)

    # Replace zero values with small value to avoid divide by zero error later on
    signal[signal == 0] = 0.0001

    # Extend audio to be 3.5 seconds long if needed
    target = int(sample_rate * length)

    signal_length = len(signal)

    # If it's longer, clip to the target length
    if signal_length > target:
        return signal[0:target]
    else:
        return np.pad(signal, (0, target - signal_length), 'wrap')
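A usage sketch for Example #15, assuming `trim_signal_length` (and the SAMPLE_DURATION constant it defaults to) is importable from the source project; the sample rate and clip lengths are illustrative:

import numpy as np

sr = 16000
short_clip = 0.5 * np.random.randn(sr).astype(np.float32)      # 1 s of noise
long_clip = 0.5 * np.random.randn(5 * sr).astype(np.float32)   # 5 s of noise

print(len(trim_signal_length(short_clip, sr, length=3.5)))     # padded up to 56000 samples
print(len(trim_signal_length(long_clip, sr, length=3.5)))      # clipped down to 56000 samples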
Example #16
File: util.py  Project: tanooj-s/fourier
def load_sample(path, upper_bound, lower_bound):
    samples, rate = lc.load(path=path, mono=True, duration=upper_bound)
    samples, _ = le.trim(samples[int(rate * lower_bound):int(rate * upper_bound)],
                         top_db=20)
    return samples, rate
Example #17
def trim_data(data, threshold):
    trimmed = []
    for record in data:
        trimmed.append((trim(record, top_db=threshold))[0])  # trim function from librosa
    return trimmed
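A usage sketch for Example #17, with synthetic records and a hypothetical 30 dB threshold:

import numpy as np

# Two records: a short 440 Hz tone padded with digital silence on both sides.
tone = 0.5 * np.sin(2 * np.pi * 440 * np.arange(16000) / 16000).astype(np.float32)
records = [np.pad(tone, 8000), np.pad(tone, 4000)]

trimmed = trim_data(records, threshold=30)
print([len(r) for r in records], [len(t) for t in trimmed])   # silence removed from each record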
Example #18
    # add the 7-frame buffer back to the audio
    smples = int(sr * 7 / fps)
    aud = np.pad(np.float32(aud), (smples, smples), 'constant', constant_values=(0, 0))

    if (opt==1 or opt ==2):
        #image is of shape timepts x landmark pts x 1 x coordinates e.g. (28, 68, 2)
        image1=np.load(vfile)
    else:
        #image is of shape timepts x landmark pts x 1 x coordinates e.g. (28, 68, 2)
        video=spio.loadmat(vfile)
        image1=video['joint_mean']

    # PREPROCESSING AUDIO BELOW:
    #remove silence from the audio
    aud, index=effects.trim(np.float32(aud), top_db=freq)
   
    #16000 13.44 40.32 7168 21504 14336
    #compute video frame numbers from index
    stframe=np.floor(fps*index[0]/sr)
    endframe=np.ceil(fps*index[1]/sr)
    
    #trim video based on audio
    image1 = image1[int(stframe):int(endframe), :, :]

 
    image=[]
    image_aug=[]
    for tp in range(np.shape(image1)[0]):
        #tp x landmarks x coordinates
        # measurements based on nose center point 33
Example #19
def audio_to_data(signal, song_id):

    meta = [song_id]

    if config.silence_thr_db:
        signal, _ = trim(signal,
                         config.silence_thr_db,
                         frame_length=config.fft_bins,
                         hop_length=config.fft_hop_len)

    spec = abs(
        stft(signal, config.fft_bins, config.fft_hop_len,
             config.fft_window_len))
    # mfccs = mfcc(signal, config.sample_rate, n_mfcc=config.mfcc_bins)
    # chroma = chroma_stft(signal, config.sample_rate, n_fft=config.fft_bins, hop_length=config.fft_hop_len, win_length=config.fft_window_len)

    # rows-frequencies cols-times
    # show(specshow(spec, sr=config.sample_rate, hop_length=config.fft_hop_len))
    # show(specshow(mfccs, sr=config.sample_rate, hop_length=config.fft_hop_len))
    # show(plot(chroma))

    spec_mod = deepcopy(spec)
    print('\tmax min initially:', max(spec_mod), min(spec_mod))

    spec_mod = stack([
        spec_mod[config.frequencies_of_bins.index(i), :]
        for i in config.frequencies_to_pick
    ], 0)
    print('\tmax min after bandpass:', max(spec_mod), min(spec_mod))

    spec_mod = amplitude_to_db(spec_mod)
    print('\tmax min in db:', max(spec_mod), min(spec_mod))

    # spec_mod = clip(spec_mod, config.amp_min_thr_db, config.amp_max_thr_db)
    # print('db clipped.')

    if config.zscore_scale:

        mean = spec_mod.mean()
        std = spec_mod.std()
        spec_mod -= mean
        spec_mod /= std

        print('\tmax min after std:', max(spec_mod), min(spec_mod))

        scale = max([abs(max(spec_mod)), abs(min(spec_mod))])
        spec_mod /= scale

        meta.extend([mean, std, scale])

    elif config.minmax_scale:

        spec_min = min(spec_mod)
        spec_max = max(spec_mod)
        spec_mod -= spec_min
        spec_mod /= spec_max - spec_min

        print('\tmax min after min/max:', max(spec_mod), min(spec_mod))

        meta.extend([spec_min, spec_max])

    elif config.log_scale:

        spec_mod = log(spec_mod + 1e-10)

        print('\tmax min after log:', max(spec_mod), min(spec_mod))

    vector = spec_mod
    # vector = concatenate([vector, chroma], 0)
    vector = vector.T  # now first index time, second index frequency

    print('\tfinal vector shape:', vector.shape)

    return vector, meta
Example #20
def index():

    if request.method == 'POST':
        curr_time = str(int(time.time()))
        data = request.get_json()
        lowerbound = data['lowerbound']
        upperbound = data['upperbound']
        fft_on = data['fft_on']
        audiofile = data['uploaded_audiofile']
        prefix = "data:audio/wav;base64,"  # handle other audio file formats
        audiofile = audiofile.replace(prefix, "")
        original_filename = "upload_" + curr_time + ".wav"

        try:
            audiofile = base64.b64decode(audiofile)
            with open(original_filename, "wb") as f:
                f.write(audiofile)
        except Exception as e:
            return (str(e))

        # can these be read directly from the base64 payload sent in, without saving it?
        samples, rate = lc.load(original_filename)
        samples, _ = le.trim(samples[int(rate * lowerbound):int(rate * upperbound)],
                             top_db=20)
        harmonic_samples, percussive_samples = split_hp(samples)

        response = dict()

        response['original_file'] = original_filename
        response['harmonic_file'] = write_wav(path="h_" + curr_time + ".wav",
                                              y=harmonic_samples,
                                              sr=rate)
        response['percussive_file'] = write_wav(path="p_" + curr_time + ".wav",
                                                y=percussive_samples,
                                                sr=rate)

        # get full working directory for these
        response['original_samples'] = [str(s) for s in samples]
        response['harmonic_samples'] = [str(s) for s in harmonic_samples]
        response['percussive_samples'] = [str(s) for s in percussive_samples]

        waveplot_encoding = "data:image/png;base64," + str(
            generate_waveplot(
                samples,
                rate))  # this should return an image of the matplotlib plot
        response['waveplot'] = waveplot_encoding

        spectrogram_encoding = "data:image/png;base64," + str(
            generate_spectrogram(samples, rate, opt=0))
        response['spectrogram'] = spectrogram_encoding

        h_spectrogram_encoding = "data:image/png;base64," + str(
            generate_spectrogram(harmonic_samples, rate, opt=1))
        response['h_spectrogram'] = h_spectrogram_encoding
        p_spectrogram_encoding = "data:image/png;base64," + str(
            generate_spectrogram(percussive_samples, rate, opt=2))
        response['p_spectrogram'] = p_spectrogram_encoding

        if fft_on:
            fourier_transform = get_fft(samples, rate)
            fftplot_encoding = "data:image/png;base64," + str(
                generate_fft_plot(fourier_transform['x'],
                                  fourier_transform['y']))
            response['fftplot'] = fftplot_encoding

        return jsonify(response)  # dict of lists

    return render_template("index.html")