Пример #1
0
    def pre_sff(self):
        """Extract log-magnitude STFT features for the train and test splits.

        For every row of ``self.dataset.train_data`` and
        ``self.dataset.test_data`` the WAV file named by ``row['cur_name']``
        (joined onto ``self.dataset['data_path']``) is read with
        ``wavfile.read``, transformed with ``stft`` and cut down to its first
        200 frequency bins. The log-magnitude spectrogram is then shifted so
        that its minimum is zero.

        The per-split features, labels (``self._build_multilabel(row)``) and
        file names are passed to ``self._save_pickles`` together with the
        output directory ``<feature_path>/pre_sff``.
        """
        feature_path = os.path.join(self.dataset['feature_path'], 'pre_sff')
        # exist_ok avoids the check-then-create race, and makedirs also
        # creates a missing parent directory instead of failing.
        os.makedirs(feature_path, exist_ok=True)

        def collect(tag, table):
            """Return (features, labels, file names) for one data split."""
            features = []
            labels = []
            names = []
            for i, row in table.iterrows():
                print('[{}] {}) Getting pre_sff from {}...'.format(
                    tag, i, row['cur_name']),
                      end='')
                wav_name = os.path.join(self.dataset['data_path'],
                                        row['cur_name'])
                # The sample rate reported by the file is not needed here.
                _, wav_data = wavfile.read(wav_name)

                # Keep only the lowest 200 frequency bins. The original note
                # says "up to 4 kHz", which holds for 16 kHz audio --
                # TODO confirm the sample rate of the dataset.
                spec = stft(buf_to_float(wav_data),
                            n_fft=800,
                            hop_length=160,
                            win_length=320)[:200, :]
                # The small epsilon keeps log() finite for zero-magnitude bins.
                spec = np.log(np.abs(spec) + 1e-10)
                spec -= np.min(spec)

                features.append(spec)
                labels.append(self._build_multilabel(row))
                names.append(row['cur_name'])
                print('done.')
            return features, labels, names

        x_train, y_train, f_train = collect('Train', self.dataset.train_data)
        x_test, y_test, f_test = collect('Test', self.dataset.test_data)

        self._save_pickles(feature_path, x_train, y_train, f_train, x_test,
                           y_test, f_test)
Пример #2
0
    def extract_feat_frame(self,
                           frames,
                           mode=0,
                           file=None,
                           sr=16000,
                           n_fft=512,
                           hop_length=512,
                           n_mels=40,
                           fmax=8000,
                           convert_16I_to_32F=True):
        """Gain-normalise ``frames`` and run the feature extractor for ``mode``.

        Parameters
        ----------
        frames
            When ``convert_16I_to_32F`` is true, an iterable whose items are
            each converted with ``buf_to_float(item, 2, np.float32)`` and then
            joined into one contiguous float32 array. Otherwise it is handed
            to ``self.gain_norm`` unchanged.
        mode : int
            0 selects ``self.extract_melspec_frame``, 1 selects
            ``self.extract_wav_frame``, 2 selects
            ``self.extract_log_spectrogram_frame``.
        file
            Forwarded untouched to the selected extractor.
        sr, n_mels
            Forwarded to the mode 0 and mode 2 extractors.
        n_fft, hop_length, fmax
            Accepted but not used inside this method.
        convert_16I_to_32F : bool
            Whether to convert the incoming buffers (16-bit integers,
            according to the original comment) to float32 first.

        Returns
        -------
        The selected extractor's return value. For any other ``mode`` a
        message is printed and ``None`` is returned.
        """
        if convert_16I_to_32F:
            # 16-bit integer buffers -> one contiguous float32 signal.
            as_float = [
                buf_to_float(chunk, 2, np.float32) for chunk in frames
            ]
            frames = np.ascontiguousarray(np.concatenate(as_float),
                                          np.float32)

        normalised = self.gain_norm(frames)

        if mode == 0:
            return self.extract_melspec_frame(normalised,
                                              file=file,
                                              n_mels=n_mels,
                                              sr=sr)
        if mode == 1:
            return self.extract_wav_frame(normalised, file=file)
        if mode == 2:
            return self.extract_log_spectrogram_frame(normalised,
                                                      file=file,
                                                      n_mels=n_mels,
                                                      sr=sr)

        print("non-supported feature type")
        return None
Пример #3
0
 def get_features(self, sr):
     """
     Estimate the tuning of the audio held in ``self.bytes`` using librosa.

     documentation https://librosa.github.io/librosa/

     ``sr`` is the sampling rate handed to ``estimate_tuning``. The return
     value is a one-element list holding that estimate. Tempo extraction
     is currently disabled, so no tempo value is returned.
     """
     timeseries = buf_to_float(self.bytes)
     # Pass y/sr by keyword: assuming this is librosa's estimate_tuning,
     # releases >= 0.10 reject positional arguments here, while older
     # releases accept the keyword form as well.
     pitch = estimate_tuning(y=timeseries, sr=sr)
     return [pitch]
Пример #4
0
def load(path,
         sr=22050,
         mono=True,
         offset=0.0,
         duration=None,
         dtype=np.float32,
         res_type='kaiser_best'):
    """Load an audio file as a floating point time series.

    Parameters
    ----------
    path : string
        Path to the input file. It is resolved with ``os.path.realpath`` and
        opened with ``audioread.audio_open``.

    sr : number > 0 or None
        Target sampling rate. ``None`` keeps the file's native rate.

    mono : bool
        Down-mix multi-channel audio with ``to_mono``.

    offset : float
        Start reading after this time (in seconds).

    duration : float or None
        Only load up to this much audio (in seconds). ``None`` reads to the
        end of the file.

    dtype : numeric type
        Data type of the returned samples.

    res_type : str
        Resampling method, forwarded to ``resample``.

    Returns
    -------
    y : np.ndarray
        Contiguous array of samples with dtype ``dtype``. It is empty (shape
        ``(0,)``) when no frame fell inside the requested window.

    sr : number > 0
        Sampling rate of ``y``.
    """
    y = []
    with audioread.audio_open(os.path.realpath(path)) as input_file:
        sr_native = input_file.samplerate
        n_channels = input_file.channels

        # Positions are counted in interleaved samples, hence * n_channels.
        s_start = int(np.round(sr_native * offset)) * n_channels

        if duration is None:
            s_end = np.inf
        else:
            s_end = s_start + (int(np.round(sr_native * duration)) *
                               n_channels)

        n = 0

        for frame in input_file:
            frame = util.buf_to_float(frame, dtype=dtype)
            n_prev = n
            n = n + len(frame)

            if n < s_start:
                # offset is after the current frame
                # keep reading
                continue

            if s_end < n_prev:
                # we're off the end.  stop reading
                break

            if s_end < n:
                # the end is in this frame.  crop.
                frame = frame[:(s_end - n_prev)]

            if n_prev <= s_start <= n:
                # beginning is in this frame
                frame = frame[(s_start - n_prev):]

            # tack on the current frame
            y.append(frame)

    if y:
        y1 = np.concatenate(y)
        # Progress marker kept from the original implementation.
        print(".")

        if n_channels > 1:
            # De-interleave into a channels-first array.
            y1 = y1.reshape((-1, n_channels)).T
            if mono:
                y1 = to_mono(y1)

        if sr is not None:
            y1 = resample(y1, sr_native, sr, res_type=res_type)
        else:
            sr = sr_native
    else:
        # No frame fell inside the requested window (empty file, or an
        # offset past the end). Previously this path hit an unbound local;
        # return an empty signal instead.
        y1 = y
        if sr is None:
            sr = sr_native

    # Final cleanup for dtype and contiguity
    y2 = np.ascontiguousarray(y1, dtype=dtype)

    return y2, sr
Пример #5
0
def stream_to_np(bytes_io,
                 sr=22050,
                 mono=True,
                 offset=0.0,
                 duration=None,
                 dtype=np.float32,
                 res_type='kaiser_best'):
    """
    Rewrite of librosa.load that reads from a BytesIO object.

    The file argument is replaced by a BytesIO stream, and
    audioread.audio_open is replaced by the custom RawAudioStream class,
    because the former requires a file path as its argument.

    :param bytes_io: stream handed to RawAudioStream
    :param sr: target sampling rate; None keeps the stream's native rate
    :param mono: down-mix multi-channel audio with to_mono
    :param offset: start reading after this time (in seconds)
    :param duration: only load up to this much audio (in seconds)
    :param dtype: data type of the returned samples
    :param res_type: resampling method, forwarded to resample
    :return: tuple (samples as a contiguous array, sampling rate)
    """
    pieces = []

    with RawAudioStream(bytes_io) as input_file:
        sr_native = input_file.samplerate
        n_channels = input_file.channels

        # Window bounds, counted in interleaved samples.
        first = int(np.round(sr_native * offset)) * n_channels
        if duration is None:
            last = np.inf
        else:
            last = first + (int(np.round(sr_native * duration)) *
                            n_channels)

        position = 0

        for raw in input_file:
            samples = util.buf_to_float(raw, dtype=dtype)
            frame_begin = position
            position = position + len(samples)

            if position < first:
                # The whole frame lies before the requested offset.
                continue

            if frame_begin > last:
                # The whole frame lies after the requested window.
                break

            if position > last:
                # The window ends inside this frame: drop the tail.
                samples = samples[:last - frame_begin]

            if frame_begin <= first <= position:
                # The window starts inside this frame: drop the head.
                samples = samples[(first - frame_begin):]

            pieces.append(samples)

        if pieces:
            signal = np.concatenate(pieces)

            if n_channels > 1:
                # De-interleave into a channels-first array.
                signal = signal.reshape((-1, n_channels)).T
                if mono:
                    signal = to_mono(signal)

            if sr is None:
                sr = sr_native
            else:
                signal = resample(signal, sr_native, sr, res_type=res_type)
        else:
            signal = pieces

        # Enforce the requested dtype and a contiguous memory layout.
        signal = np.ascontiguousarray(signal, dtype=dtype)

        return signal, sr
def load_yield_chunks(path,
                      sr=22050,
                      mono=True,
                      offset=0.0,
                      duration=None,
                      dtype=np.float32,
                      res_type='kaiser_best',
                      choplenspls=0,
                      hoplenspls=0):
    """Yield fixed-length chunks of an audio file as floating point arrays.

    This is MODIFIED from librosa's own load() function, to yield chunks
    one-by-one so they never all need to be loaded into memory.

    Parameters
    ----------
    path : string
        path to the input file.

        Any format supported by `audioread` will work.

    sr   : number > 0 [scalar]
        target sampling rate

        'None' uses the native sampling rate

    mono : bool
        convert signal to mono

    offset : float
        start reading after this time (in seconds)

    duration : float
        only load up to this much audio (in seconds)

    dtype : numeric type
        data type of the yielded chunks

    res_type : str
        resample type, forwarded to `resample`

    choplenspls : int > 0
        number of samples in each chunk to be yielded.

    hoplenspls : int
        number of samples to advance between consecutive chunks. Values
        that are falsy, non-positive or larger than `choplenspls` fall back
        to `choplenspls`, i.e. non-overlapping chunks.

    Yields
    ------
    y    : np.ndarray
        one chunk with `choplenspls` samples along its last axis. The chunk
        is a view into the internal buffer.

    sr   : number > 0 [scalar]
        sampling rate of `y`

    Raises
    ------
    ValueError
        if `choplenspls` is not a positive number. Because this is a
        generator, the error surfaces on the first iteration, not at call
        time.

    Notes
    -----
    Channel conversion and resampling are applied to each decoded frame
    separately rather than to the whole signal, so the result may differ
    from loading the whole file at once.

    Samples left over after the last complete chunk are not yielded; a
    warning with their count is printed instead.

    Example
    -------
    Iterate over one-second chunks with 50 % overlap::

        for chunk, rate in load_yield_chunks(filename, sr=22050,
                                             choplenspls=22050,
                                             hoplenspls=11025):
            process(chunk, rate)

    """
    # A non-positive chunk length can never drain the buffer below: the
    # chunking loop would yield empty chunks forever.
    if not choplenspls or choplenspls <= 0:
        raise ValueError(
            "choplenspls must be a positive number of samples, got %r" %
            (choplenspls, ))

    if not hoplenspls or (hoplenspls <= 0 or hoplenspls > choplenspls):
        hoplenspls = choplenspls

    # Buffer of processed samples not yet consumed; samples run along the
    # last axis so that mono (1-D) and multi-channel (2-D) data both work.
    y = None
    with audioread.audio_open(os.path.realpath(path)) as input_file:
        sr_native = input_file.samplerate
        n_channels = input_file.channels

        # Decide once whether to resample; 'None' means the native rate.
        do_resample = sr is not None
        if not do_resample:
            sr = sr_native

        # Positions are counted in interleaved samples, hence * n_channels.
        s_start = int(np.round(sr_native * offset)) * n_channels

        if duration is None:
            s_end = np.inf
        else:
            s_end = s_start + (int(np.round(sr_native * duration)) *
                               n_channels)

        n = 0

        for frame in input_file:
            frame = util.buf_to_float(frame, dtype=dtype)
            n_prev = n
            n = n + len(frame)

            if n < s_start:
                # offset is after the current frame
                # keep reading
                continue

            if s_end < n_prev:
                # we're off the end.  stop reading
                break

            if s_end < n:
                # the end is in this frame.  crop.
                frame = frame[:s_end - n_prev]

            if n_prev <= s_start <= n:
                # beginning is in this frame
                frame = frame[(s_start - n_prev):]

            # NB here we apply to one single frame, the postprocessing that
            # librosa applies to the whole file at the end
            if n_channels > 1:
                frame = frame.reshape((-1, n_channels)).T
                if mono:
                    frame = to_mono(frame)

            if do_resample:
                frame = resample(frame, sr_native, sr, res_type=res_type)

            # Final cleanup for dtype and contiguity
            frame = np.ascontiguousarray(frame, dtype=dtype)

            if y is None:
                y = frame
            else:
                y = np.concatenate((y, frame), axis=-1)

            while y.shape[-1] >= choplenspls:
                yield (y[..., :choplenspls], sr)
                y = y[..., hoplenspls:]

    if y is not None and y.shape[-1] != 0:
        print(
            "WARNING: load_yield_chunks() dropped %i final samples" %
            (y.shape[-1])
        )  # TODO can the final incomplete chunk be handled elegantly within the above loop?