def signal2features(signal, audio_parameters, features_param):
    if features_param['feature_type'] == 'mfcc':
        remove_energy = not features_param.get('energy', False)
        features = mfcc_spec(
            signal,
            audio_parameters['sample_rate'],
            (int(audio_parameters['window_t'] *
                 audio_parameters['sample_rate']),
             int(audio_parameters['hop_t'] * audio_parameters['sample_rate'])),
            num_filt=features_param['n_filt'],
            fft_size=features_param['n_fft'],
            num_coeffs=features_param['n_coef'] + remove_energy)
        if remove_energy:
            features = features[:, 1:]

    elif features_param['feature_type'] == 'lmfe':
        features = mfcc_spec(
            signal,
            audio_parameters['sample_rate'],
            (int(audio_parameters['window_t'] *
                 audio_parameters['sample_rate']),
             int(audio_parameters['hop_t'] * audio_parameters['sample_rate'])),
            num_filt=features_param['n_filt'],
            fft_size=features_param['n_fft'],
            num_coeffs=features_param['n_coef'],
            return_parts=True)[2][:, :features_param['n_coef']]
    # Insert new feature types here as additional elif branches
    else:
        raise ValueError(
            'Unknown feature_type: {}'.format(features_param['feature_type']))

    return features
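A minimal usage sketch for the function above; the parameter values (16 kHz audio, 100 ms windows, 50 ms hop) are assumptions for illustration, not values taken from the original project:

import numpy as np
from sonopy import mfcc_spec

audio_parameters = {'sample_rate': 16000, 'window_t': 0.1, 'hop_t': 0.05}
features_param = {'feature_type': 'mfcc', 'n_filt': 20, 'n_fft': 512,
                  'n_coef': 13, 'energy': False}

signal = np.random.randn(16000).astype(np.float32)  # 1 s of dummy audio
features = signal2features(signal, audio_parameters, features_param)
print(features.shape)  # (frames, n_coef); energy column already stripped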
Example #2
        def __get_data__(manifest, sr, max_frame_len, intencode, progress_bar=True):
            """Read (audio_path, labels_path) pairs from the manifest and
            return MFCC inputs with integer-encoded targets.

            :param progress_bar: wrap the loop in tqdm when True
            :return: (inputs, targets)
            """
            pg = tqdm if progress_bar else lambda x: x
            random.shuffle(manifest)
            inputs, targets = [], []
            for md in pg(manifest):
                audio_path = md[0]
                labels_path = md[1]
                _, audio = read(audio_path)
                # audio = load_audio(audio_path)
                labels = read_txt(labels_path)[0].replace(" ", "")
                mfccs = mfcc_spec(audio,
                                  sr,
                                  window_stride=(160, 80),
                                  fft_size=512,
                                  num_filt=20,
                                  num_coeffs=13)
                mfccs = normalize(mfccs)
                diff = max_frame_len - mfccs.shape[0]
                if diff >= 0:
                    mfccs = np.pad(mfccs, ((0, diff), (0, 0)), "constant")
                    target = intencode.convert_to_ints(labels)
                    if target is not None:
                        inputs.append(mfccs)
                        targets.append(target)
            return (inputs, targets)
Example #3
    def get_data(self, progress_bar=True):
        """
        Return the MFCC features of the audio and the corresponding labels.
        """
        pg = tqdm if progress_bar else lambda x: x

        inputs, targets, input_lengths = [], [], []
        meta_data = []
        for label in self.labels:
            path = os.listdir(os.path.join(self.data_path, label))
            for audio in path:
                audio_path = os.path.join(self.data_path, label, audio)
                meta_data.append((audio_path, label))
        
        random.shuffle(meta_data)  # shuffle the dataset

        for md in pg(meta_data):
            audio_path = md[0]
            label = md[1]
            _, audio = wavfile.read(audio_path)
            mfccs = mfcc_spec(
                audio, self.sr, window_stride=(160, 80),
                fft_size=512, num_filt=20, num_coeffs=13
            )
            mfccs = normalize(mfccs)
            diff = self.max_frame_len - mfccs.shape[0]
            input_lengths.append(mfccs.shape[0])
            mfccs = np.pad(mfccs, ((0, diff), (0, 0)), "constant")  # pad frames to max_frame_len
            inputs.append(mfccs)

            target = self.intencode.convert_to_ints(label)
            targets.append(target)
        return inputs, targets, input_lengths
Example #4
def wav2feat(wavPath):
    _, audio = read(wavPath)
    mfccs = mfcc_spec(
        audio, 16000, window_stride=(160, 80),
        fft_size=512, num_filt=20, num_coeffs=13
    )
    mfccs = normalize(mfccs)
    diff = 225 - mfccs.shape[0]  # pad up to the model's fixed 225-frame length
    mfccs = np.pad(mfccs, ((0, diff), (0, 0)), "constant")
    sample = torch.Tensor(mfccs)
    sample = sample.transpose(0, 1)
    return sample
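A short usage note: given the imports used in the earlier examples (scipy.io.wavfile's read, sonopy's mfcc_spec, torch), the function returns a (13, 225) tensor (coefficients x frames) for a 16 kHz file of at most 225 frames; the path below is made up:

sample = wav2feat('example.wav')  # hypothetical file path
print(sample.shape)  # torch.Size([13, 225])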
Example #5
    def get_data(self, progress_bar=True):
        """Currently returns mfccs and integer encoded data

        Returns:
            (list, list): 
                inputs shape (sample_size, frame_len, mfcc_features)
                targets shape (sample_size, seq_len)  seq_len is variable
        """
        pg = tqdm if progress_bar else lambda x: x

        inputs, targets = [], []
        meta_data = []
        for labels in self.labels:
            path = os.listdir(os.path.join(self.data_path, labels))
            for audio in path:
                audio_path = os.path.join(self.data_path, labels, audio)
                meta_data.append((audio_path, labels))
        
        random.shuffle(meta_data)

        errFileList = []

        for md in pg(meta_data):
            audio_path = md[0]
            labels = md[1]

            try:
                _, audio = read(audio_path)
            except ValueError:
                print(audio_path)
                errFileList.append(audio_path)
                continue

            mfccs = mfcc_spec(
                audio, self.sr, window_stride=(160, 80),
                fft_size=512, num_filt=20, num_coeffs=13
            )
            mfccs = normalize(mfccs)
            diff = self.max_frame_len - mfccs.shape[0]
            mfccs = np.pad(mfccs, ((0, diff), (0, 0)), "constant")
            inputs.append(mfccs)

            target = self.intencode.convert_to_ints(labels)
            targets.append(target)

            print(errFileList)

        return inputs, targets
Example #6
 def __init__(self,
              sample_rate,
              fft_size=400,
              window_stride=(400, 200),
              num_filt=40,
              num_coeffs=40):
     super(MFCC, self).__init__()
     self.sample_rate = sample_rate
     self.window_stride = window_stride
     self.fft_size = fft_size
     self.num_filt = num_filt
     self.num_coeffs = num_coeffs
     self.mfcc = lambda x: mfcc_spec(
         x, self.sample_rate, self.window_stride,
         self.fft_size, self.num_filt, self.num_coeffs)
Example #7
 def get_mfcc(self, path):
     audio, sr = librosa.load(path, sr=None)  # sr=None keeps the file's native sampling rate (here 16000)
     mfccs = mfcc_spec(audio, sr, window_stride=self.windows_stride,
                       fft_size=self.fft_size, num_filt=self.num_filt,
                       num_coeffs=self.num_coeffs)
     mfccs /= 16
     if self.positive_shift:
         mfccs += 1
         mfccs = np.where(mfccs < 0, 0, mfccs)
     if mfccs.shape[0] < self.max_len:
         pad = np.zeros((self.max_len - mfccs.shape[0], mfccs.shape[1],))
         mfccs = np.vstack((mfccs, pad))
     elif mfccs.shape[0] > self.max_len:
         mfccs = mfccs[:self.max_len, :]  # truncate along the frame axis
     return torch.FloatTensor(np.expand_dims(mfccs, axis=0))
Example #8
def sample_to_training_data(item):
    audio_fn = item["audio"]
    sr, y = wavfile.read(audio_fn)
    duration = y.shape[0] / sr

    mfcc = mfcc_spec(y, sr)
    activity = np.zeros((mfcc.shape[0]))
    N = mfcc.shape[0]
    for utt in item["json"]:
        if utt["word"] == "sil":
            continue
        start = utt["start"]
        end = utt["end"]
        # Map utterance times to frame indices, assuming frames evenly span the clip
        start_index = round(N * start / duration)
        end_index = round(N * end / duration)
        activity[start_index:end_index] = 1.
    return mfcc, activity
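A quick sanity check of the time-to-frame mapping above; the item layout (an "audio" path plus word timings under "json") mirrors what the function reads, with made-up values:

item = {
    'audio': 'clip.wav',  # hypothetical 16 kHz mono file
    'json': [
        {'word': 'sil', 'start': 0.0, 'end': 0.4},
        {'word': 'hello', 'start': 0.4, 'end': 0.9},
    ],
}
mfcc, activity = sample_to_training_data(item)
# activity is 1.0 on frames that overlap "hello" and 0.0 elsewhere
print(mfcc.shape, activity.sum())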
Example #9
def extractSoundFeatures(sig):
    """Per-channel MFCC statistics (std and sum) plus the raw MFCC frames."""
    allFeature = []
    for numberData in range(len(sig)):
        feature = []
        for channel in range(sig[numberData].shape[0]):
            x = sig[numberData][channel, :]
            # mfcc_spec's second positional argument is the sample rate (2048 Hz here)
            powers, filters, mels, mfccs = mfcc_spec(x,
                                                     2048,
                                                     return_parts=True,
                                                     num_coeffs=len(x))
            feature.append(np.std(mfccs, axis=0))
            feature.append(np.sum(mfccs, axis=0))
            feature.extend(mfccs)
            # feature.append(np.max(powers, axis=0))
            # feature.append(np.min(powers, axis=0))
        allFeature.append(np.asarray(feature))
    return np.asarray(allFeature)
Example #10
    def _process(self):
        self._processing = True
        features = mfcc_spec(
            self._buffer,
            sample_rate=self.mfccParams.sample_rate,
            window_stride=(self.mfccParams.window_l, self.mfccParams.stride_l),
            num_coeffs=self.mfccParams.n_coef + (not self.mfccParams.energy),
            num_filt=self.mfccParams.n_filt,
            fft_size=self.mfccParams.n_fft)
        if not self.mfccParams.energy:
            features = features[:, 1:]
        # Drop the consumed samples, keeping the tail for the next overlapping window
        self._buffer = self._buffer[len(features) * self.mfccParams.stride_l:]

        if self._consumer is not None:
            self._consumer.input(features)
        self._processing = False
        with self._condition:
            self._condition.notify()
Example #11
def vectorize_raw(audio: np.ndarray) -> np.ndarray:
    """Turns audio into feature vectors, without clipping for length"""
    if len(audio) == 0:
        raise InvalidAudio('Cannot vectorize empty audio!')

    sample_rate = 16000

    window_t = 0.1
    window_samples = int(sample_rate * window_t + 0.5)

    hop_t = 0.05
    hop_samples = int(sample_rate * hop_t + 0.5)

    n_filt = 20
    n_fft = 512
    n_mfcc = 13

    return mfcc_spec(audio,
                     sample_rate, (window_samples, hop_samples),
                     num_filt=n_filt,
                     fft_size=n_fft,
                     num_coeffs=n_mfcc)
Example #12
def mfcc_from_file(filename):
    sr, y = wavfile.read(filename)
    mfcc = mfcc_spec(y, sr)
    duration = y.shape[0] / sr
    return mfcc, duration
Example #13
import numpy as np
from sonopy import power_spec, mel_spec, mfcc_spec, filterbanks
from scipy.io import wavfile

sr, audio = wavfile.read('test.wav')

# powers = power_spec(audio, window_stride=(100, 50), fft_size=512)
# mels = mel_spec(audio, sr, window_stride=(1600, 800), fft_size=1024, num_filt=30)
mfccs = mfcc_spec(audio,
                  sr,
                  window_stride=(160, 80),
                  fft_size=512,
                  num_filt=20,
                  num_coeffs=13)
print(mfccs)
# filters = filterbanks(16000, 20, 257)  # Probably not ever useful

# powers, filters, mels, mfccs = mfcc_spec(audio, sr, return_parts=True)
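For scale: with window_stride=(160, 80) on 16 kHz audio (10 ms windows, 5 ms hop), one second of samples yields 199 frames of 13 coefficients. A quick shape check on a dummy signal (the noise input is an assumption, purely for illustration):

import numpy as np
from sonopy import mfcc_spec

dummy = np.random.randn(16000)  # 1 s of noise at an assumed 16 kHz rate
out = mfcc_spec(dummy, 16000, window_stride=(160, 80),
                fft_size=512, num_filt=20, num_coeffs=13)
print(out.shape)  # (199, 13): (16000 - 160) // 80 + 1 frames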
Example #14
from keras import backend as K

LOSS_BIAS = 0.9  # negative-class weight; mycroft-precise's default, assumed here


def weighted_log_loss(yt, yp):
    """Binary cross entropy weighted against false negatives (as in mycroft-precise)"""
    pos_loss = -yt * K.log(yp + K.epsilon())
    neg_loss = -(1 - yt) * K.log(1 - yp + K.epsilon())
    return LOSS_BIAS * K.mean(neg_loss) + (1. - LOSS_BIAS) * K.mean(pos_loss)


qqq = keras.models.load_model(
    "qqq.net", custom_objects={'weighted_log_loss': weighted_log_loss})

samples, sample_rate = librosa.load("testing/negative-00.wav",
                                    duration=2.0,
                                    sr=16000)
window_samples = int(sample_rate * 0.1 + 0.5)
hop_samples = int(sample_rate * 0.05 + 0.5)

mfccs = mfcc_spec(samples,
                  sample_rate, (window_samples, hop_samples),
                  num_filt=20,
                  fft_size=512,
                  num_coeffs=13)


def my_mfcc_spec(audio,
                 sample_rate,
                 window_stride=(160, 80),
                 fft_size=512,
                 num_filt=20,
                 num_coeffs=13,
                 return_parts=False):
    """Calculates mel frequency cepstrum coefficient spectrogram"""
    # The remaining steps mirror sonopy's published implementation; power_spec,
    # filterbanks and safe_log come from sonopy, dct from scipy.fftpack
    powers = power_spec(audio, window_stride, fft_size)
    if powers.size == 0:
        return np.empty((0, min(num_filt, num_coeffs)))
    filters = filterbanks(sample_rate, num_filt, powers.shape[1])
    mels = safe_log(np.dot(powers, filters.T))  # condensed mel spectrogram
    mfccs = dct(mels, norm='ortho')[:, :num_coeffs]  # decorrelate filterbank energies
    mfccs[:, 0] = safe_log(np.sum(powers, 1))  # first band carries log frame energy
    if return_parts:
        return powers, filters, mels, mfccs
    return mfccs
Example #15
from sonopy import mfcc_spec, mel_spec

inhibit_t = 0.4
inhibit_dist_t = 1.0
inhibit_hop_t = 0.1

vectorizers = {
    Vectorizer.mels:
    lambda x: mel_spec(x,
                       pr.sample_rate, (pr.window_samples, pr.hop_samples),
                       num_filt=pr.n_filt,
                       fft_size=pr.n_fft),
    Vectorizer.mfccs:
    lambda x: mfcc_spec(x,
                        pr.sample_rate, (pr.window_samples, pr.hop_samples),
                        num_filt=pr.n_filt,
                        fft_size=pr.n_fft,
                        num_coeffs=pr.n_mfcc),
    Vectorizer.speechpy_mfccs:
    lambda x: __import__('speechpy').feature.mfcc(
        x, pr.sample_rate, pr.window_t, pr.hop_t, pr.n_mfcc, pr.n_filt, pr.n_fft)
}


def vectorize_raw(audio: np.ndarray) -> np.ndarray:
    """Turns audio into feature vectors, without clipping for length"""
    if len(audio) == 0:
        raise InvalidAudio('Cannot vectorize empty audio!')
    return vectorizers[pr.vectorizer](audio)
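The snippet assumes a params object pr, a Vectorizer enum, and an InvalidAudio exception from the surrounding project (mycroft-precise). A minimal stand-in with assumed values (16 kHz, 100 ms window, 50 ms hop) would look like:

from enum import Enum
from types import SimpleNamespace

import numpy as np


class Vectorizer(Enum):
    mels = 1
    mfccs = 2
    speechpy_mfccs = 3


class InvalidAudio(ValueError):
    """Raised when there is nothing to vectorize"""


pr = SimpleNamespace(
    sample_rate=16000, window_t=0.1, hop_t=0.05,
    window_samples=1600, hop_samples=800,
    n_filt=20, n_fft=512, n_mfcc=13,
    vectorizer=Vectorizer.mfccs,
)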