def Prediction(self):
        """
        Makes a prediction based on the input audio frames
        transformed into MFCCs

        Args:
            None

        Returns:
            None

        """
        # converts the first 19 chunks of audio bytes into 16-bit int values
        # (np.frombuffer replaces the removed np.fromstring API)
        in_data = np.frombuffer(b''.join(self.frames[:19]), dtype=np.int16)

        # extract MFCCs from the 19 chunks of audio
        audio_sig = np.array([
            mfcc(in_data, self.rate, self.window, self.stride, self.mfcc,
                 self.filter_banks, self.fft_num, 0, None, True)
        ])

        # makes predictions
        prediction = self.ww_model.model.predict(audio_sig)

        if (self.print_pred):
            print(prediction)

        return prediction
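# A minimal standalone sketch of the byte-to-sample conversion used above,
# assuming `frames` holds equal-length chunks of 16-bit little-endian PCM as
# PyAudio delivers them (the byte values below are hypothetical):
import numpy as np

frames = [b'\x00\x00\x01\x00', b'\xff\x7f\x00\x80', b'\x10\x00\x20\x00']
in_data = np.frombuffer(b''.join(frames), dtype=np.int16)
print(in_data)  # [     0      1  32767 -32768     16     32]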
Example No. 2
def get_feature(file_path: str, mfcc_len: int = 39, flatten: bool = False):
    import librosa
    from speechpy.feature import mfcc
    # Reading some audio with scipy.io.wavfile raises an "Incomplete wav chunk"
    # error, apparently because scipy only reads PCM and float formats and some
    # wav files are neither.
    # fs, signal = wav.read(file_path)
    signal, fs = librosa.load(file_path)

    s_len = len(signal)

    # if the audio signal is shorter than mean_signal_length, pad it
    if s_len < mean_signal_length:
        pad_len = mean_signal_length - s_len
        pad_rem = pad_len % 2
        pad_len //= 2
        signal = np.pad(signal, (pad_len, pad_len + pad_rem),
                        'constant',
                        constant_values=0)

    # otherwise take a centered slice of it
    else:
        pad_len = s_len - mean_signal_length
        pad_len //= 2
        signal = signal[pad_len:pad_len + mean_signal_length]

    mel_coefficients = mfcc(signal, fs, num_cepstral=mfcc_len)
    # the data must be flattened when using SVM & MLP models
    if flatten:
        mel_coefficients = np.ravel(mel_coefficients)

    return mel_coefficients
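# A hedged usage sketch for get_feature; the file path is hypothetical, and
# mean_signal_length is a module-level constant the function reads (assumed
# value below):
mean_signal_length = 32000  # assumed target length, in samples

vec = get_feature('speech_sample.wav', mfcc_len=39, flatten=True)   # 1-D, for SVM/MLP
mat = get_feature('speech_sample.wav', mfcc_len=39, flatten=False)  # (frames, 39), for sequence models
print(vec.shape, mat.shape)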
def vectorize_raw(audio: np.ndarray) -> np.ndarray:
    """Turns audio into feature vectors, without clipping for length"""
    from speechpy.feature import mfcc
    if len(audio) == 0:
        raise ValueError('Cannot vectorize empty audio!')
    return mfcc(audio, pr.sample_rate, pr.window_t, pr.hop_t, pr.n_mfcc,
                pr.n_filt, pr.n_fft)
def get_feature_vector_from_mfcc(file_path: str, mfcc_len: int = 45):
    # fs, signal = wav.read(file_path)
    signal, fs = sf.read(file_path)
    s_len = len(signal)
    fs = 16000  # override the rate read from the file; features assume 16 kHz audio

    if s_len < mean_signal_length:
        pad_len = mean_signal_length - s_len
        pad_rem = pad_len % 2  # take the remainder before halving so the
        pad_len //= 2          # total padding equals the required deficit
        signal = np.pad(signal, (pad_len, pad_len + pad_rem),
                        'constant',
                        constant_values=0)

    else:
        # print("I am in the else loop")
        pad_len = s_len - mean_signal_length
        pad_len //= 2

        signal = signal[pad_len:pad_len + mean_signal_length]

    mel_coefficients = mfcc(signal, fs, num_cepstral=mfcc_len)
    mel_coefficients = np.ravel(mel_coefficients)
    normalize_feature_vector = min_max_scalling(mel_coefficients)

    return normalize_feature_vector
def get_feature_vector_from_mfcc(file_path: str,
                                 flatten: bool,
                                 mfcc_len: int = 39) -> np.ndarray:
    """
    Make feature vector from MFCC for the given wav file.

    Args:
        file_path (str): path to the .wav file that needs to be read.
        flatten (bool): whether to flatten the MFCC matrix into a 1-D vector.
        mfcc_len (int): number of cepstral coefficients to keep.

    Returns:
        numpy.ndarray: feature vector of the wav file made from mfcc.
    """
    fs, signal = wav.read(file_path)
    s_len = len(signal)
    # pad the signals to have same size if lesser than required
    # else slice them
    if s_len < mean_signal_length:
        pad_len = mean_signal_length - s_len
        pad_rem = pad_len % 2
        pad_len //= 2
        signal = np.pad(signal, (pad_len, pad_len + pad_rem),
                        'constant',
                        constant_values=0)
    else:
        pad_len = s_len - mean_signal_length
        pad_len //= 2
        signal = signal[pad_len:pad_len + mean_signal_length]
    mel_coefficients = mfcc(signal, fs, num_cepstral=mfcc_len)
    if flatten:
        # Flatten the data
        mel_coefficients = np.ravel(mel_coefficients)
    return mel_coefficients
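# A small worked example of the pad/slice arithmetic above (values are
# hypothetical). Taking pad_rem before halving keeps the total padding equal
# to the deficit, which is why that ordering matters:
import numpy as np

mean_signal_length = 10
short = np.arange(7)                        # s_len = 7 < 10, so pad
pad_len = mean_signal_length - len(short)   # 3
pad_rem = pad_len % 2                       # 1
pad_len //= 2                               # 1
print(np.pad(short, (pad_len, pad_len + pad_rem), 'constant'))  # length 10

long_sig = np.arange(14)                          # s_len = 14 > 10: centered slice
off = (len(long_sig) - mean_signal_length) // 2   # 2
print(long_sig[off:off + mean_signal_length])     # [ 2  3 ... 11]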
Example No. 6
def calculate_acoustic_features(args, waveform):
    n_fft = int(args.window*SAMPLE_RATE/1000.0)
    hop_length = int(args.step * SAMPLE_RATE / 1000.0)
    if 'mfe' == args.feature_type:
        if args.backend=='speechpy':
            log_cut = 1e-8
            spec, energy = mfe(waveform, SAMPLE_RATE, frame_length=args.window*1e-3,
                frame_stride=args.step*1e-3, num_filters=args.n_mels, fft_length=n_fft)
            acoustic_features = spec  # fix: define features even when energy is off
            if args.energy:
                acoustic_features = np.hstack((spec, energy[:, np.newaxis]))
            acoustic_features = np.log(acoustic_features + log_cut)
        else:
            spec = librosa.feature.melspectrogram(y=waveform, sr=SAMPLE_RATE, n_fft=n_fft, 
                hop_length=hop_length, n_mels=args.n_mels)
            acoustic_features = librosa.core.amplitude_to_db(spec).transpose()
            if args.energy:
                # note: librosa.feature.rmse was renamed librosa.feature.rms in librosa >= 0.8
                energy = librosa.feature.rmse(y=waveform, frame_length=n_fft, hop_length=hop_length).transpose()
                acoustic_features = np.hstack((acoustic_features, energy))
    elif 'mfcc' == args.feature_type:
        if args.backend=='speechpy':
            acoustic_features = mfcc(waveform, SAMPLE_RATE, frame_length=args.window*1e-3,
                frame_stride=args.step*1e-3, num_filters=args.n_mels, fft_length=n_fft,
                num_cepstral = args.n_mfcc)
        else:
            acoustic_features = librosa.feature.mfcc(y=waveform, sr=SAMPLE_RATE, n_mfcc=args.n_mfcc,
                n_fft=n_fft, hop_length=hop_length, n_mels=args.n_mels).transpose()
            if args.energy:
                energy = librosa.feature.rmse(y=waveform, frame_length=n_fft, hop_length=hop_length).transpose()
                acoustic_features = np.hstack((acoustic_features, energy))
    elif 'lyon' == args.feature_type:
        waveform /= np.abs(waveform).max()
        acoustic_features = lyon_calc.lyon_passive_ear(waveform[:, np.newaxis].astype(np.double),
                                                       SAMPLE_RATE, hop_length)
        max_val = acoustic_features.max()
        if max_val > 0:
            acoustic_features /= max_val
        acoustic_features = acoustic_features.astype(np.float32)
        if args.energy:
            energy = librosa.feature.rmse(y=waveform, frame_length=hop_length, hop_length=hop_length).transpose()
            energy /= energy.max()
            len_delta = acoustic_features.shape[0] - energy.shape[0]
            if len_delta > 0:
                energy = np.pad(energy, [(0, len_delta), (0, 0)], 'edge')
            else:
                energy = energy[:acoustic_features.shape[0], :]
            acoustic_features = np.hstack((acoustic_features, energy))
    else:
        raise ValueError('Unexpected features type.')
    if args.deltas:
        orig_shape = acoustic_features.shape
        if args.backend=='speechpy':
            acoustic_features = extract_derivative_feature(acoustic_features)
        else:
            delta = librosa.feature.delta(acoustic_features, axis=0)
            ddelta = librosa.feature.delta(acoustic_features, order=2, axis=0)
            acoustic_features = np.stack((acoustic_features[:, :, np.newaxis],
                delta[:, :, np.newaxis], ddelta[:, :, np.newaxis]), axis=-1)
        acoustic_features = np.reshape(acoustic_features, (-1, orig_shape[-1] * 3))
    return acoustic_features
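# A hedged sketch of driving calculate_acoustic_features; SimpleNamespace
# stands in for the real argparse result and all field values are illustrative:
import numpy as np
from types import SimpleNamespace

SAMPLE_RATE = 16000  # assumed module-level constant

args = SimpleNamespace(window=25, step=10,   # frame length / stride in ms
                       feature_type='mfcc',  # 'mfe', 'mfcc', or 'lyon'
                       backend='speechpy',
                       n_mels=40, n_mfcc=13,
                       energy=False, deltas=False)
waveform = np.random.randn(SAMPLE_RATE).astype(np.float32)  # 1 s of noise
print(calculate_acoustic_features(args, waveform).shape)    # (frames, n_mfcc)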
def get_data(data_path,
             flatten=True,
             mfcc_len=39,
             class_labels=("Neutral", "Angry", "Happy", "Sad")):
    """
    Process the data for training and testing.

    Perform the following steps.
    1. Read the files and get the audio frame.
    2. Perform the test-train split

    :param class_labels: class labels that we care about.
    :param data_path: path to the dataset folder
    :param mfcc_len: Number of mfcc features to take for each frame
    :param flatten: Boolean specifying whether to flatten the data or not
    :return: 4 arrays, x_train x_test y_train y_test
    """
    data = []
    labels = []
    max_fs = 0
    cur_dir = os.getcwd()
    sys.stderr.write('curdir: %s\n' % cur_dir)
    os.chdir(data_path)
    for i, directory in enumerate(class_labels):
        sys.stderr.write("started reading folder %s\n" % directory)
        os.chdir(directory)
        for filename in os.listdir('.'):
            fs, signal = read_wav(filename)
            max_fs = max(max_fs, fs)
            s_len = len(signal)
            # pad the signals to have same size if lesser than required
            # else slice them
            if s_len < mean_signal_length:
                pad_len = mean_signal_length - s_len
                pad_rem = pad_len % 2
                pad_len //= 2
                signal = np.pad(signal, (pad_len, pad_len + pad_rem),
                                'constant',
                                constant_values=0)
            else:
                pad_len = s_len - mean_signal_length
                pad_len //= 2
                signal = signal[pad_len:pad_len + mean_signal_length]
            mel_coefficients = mfcc(signal, fs, num_cepstral=mfcc_len)

            if flatten:
                # Flatten the data
                mel_coefficients = np.ravel(mel_coefficients)
            data.append(mel_coefficients)
            labels.append(i)
        sys.stderr.write("ended reading folder %s\n" % directory)
        os.chdir('..')
    os.chdir(cur_dir)
    x_train, x_test, y_train, y_test = train_test_split(data,
                                                        labels,
                                                        test_size=0.2,
                                                        random_state=42)
    return np.array(x_train), np.array(x_test), np.array(y_train), np.array(
        y_test)
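# A hedged usage sketch for get_data; the dataset path is hypothetical and is
# expected to contain one subfolder of wav files per class label:
x_train, x_test, y_train, y_test = get_data(
    'dataset/', flatten=True, mfcc_len=39,
    class_labels=("Neutral", "Angry", "Happy", "Sad"))
print(x_train.shape, y_train.shape)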
Example No. 8
def array_to_features(data: list, config) -> list:
    """Takes a list of normalized audio signal amplitude and returns a list of MFCC parameters"""
    features = mfcc(data,
                    int(config['sampling_rate']),
                    frame_length=float(config['mfcc_frame_duration']),
                    frame_stride=float(config['mfcc_frame_stride']),
                    num_cepstral=int(config['mfcc_num_cepstral']),
                    num_filters=int(config['mfcc_num_filters']),
                    fft_length=int(config['mfcc_fft_length']))
    return features
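# A hedged example of the config mapping array_to_features expects; the keys
# come from the calls above, the values are illustrative:
config = {
    'sampling_rate': '16000',
    'mfcc_frame_duration': '0.025',  # seconds
    'mfcc_frame_stride': '0.010',    # seconds
    'mfcc_num_cepstral': '13',
    'mfcc_num_filters': '40',
    'mfcc_fft_length': '512',
}
features = array_to_features(normalized_samples, config)  # normalized_samples is hypothetical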
    def run_interval(self, start, end):
        """Do fpd on an interval of the appropriate size."""
        # Extract MFCC values.
        sample = np.array(
            mfcc(np.frombuffer(self.raw_bytes[start:end], FORMAT), SAMPLE_RATE,
                 WINDOW, STRIDE, MFCC, FILTER_BANKS, FFT_NUM, 0, None, True))
        # If positively predicted, write the interval to a wav file.
        if self.model.predict(np.reshape(sample, (1, 46, 13))) >= \
                PREDICTION_THRESHOLD:
            self.write(start, end)
            self.n_false_pos += 1
Example No. 10
def get_mfcc(data, sfs):
    """
    load the wav data
    Args:
        data(np.array): audio files
        sfs(np.array(int)): frequencies of the audio data
    Returns:
        (np.array): mel-frequency cepstrum of the audio data
    """
    if isinstance(sfs, (int, np.int64)):
        sfs = [sfs for i in range(len(data))]
    ret = np.array([mfcc(x, sf, num_cepstral=39) for x, sf in zip(data, sfs)])
    return np.expand_dims(ret, axis=1)
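# A hedged usage sketch for get_mfcc with synthetic input; signals must share
# a length so the per-signal MFCC matrices stack into one array:
import numpy as np

fs = 16000
data = np.random.randn(4, fs)    # four hypothetical 1-second signals
print(get_mfcc(data, fs).shape)  # (4, 1, frames, 39); scalar fs is broadcast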
Example No. 11
    def Convert_To_MFCC(self, wf):
        '''
        Converts audio byte streams to MFCCs

        Args:
            wf - wave object

        Returns:
            MFCCs - list of float lists
        '''

        return mfcc(self.Read_Audio_Data(wf), RATE, WINDOW, STRIDE, MFCC,
                    FILTER_BANKS, FFT_NUM, 0, None, True).tolist()
Example No. 12
    def get_mfccs(self, signal):
        s_len = len(signal)

        if s_len < self.mean_signal_length:
            pad_len = self.mean_signal_length - s_len
            pad_rem = pad_len % 2
            pad_len //= 2
            signal = np.pad(signal, (pad_len, pad_len + pad_rem),
                            'constant', constant_values=0)
        else:
            pad_len = s_len - self.mean_signal_length
            pad_len //= 2
            signal = signal[pad_len:pad_len + self.mean_signal_length]
        mel_coefficients = mfcc(signal, 16000, num_cepstral=39)
        return mel_coefficients
Example No. 13
    def test_mfcc(self):
        num_cepstral = 13
        mfcc = feature.mfcc(signal, sampling_frequency=fs,
                            frame_length=0.020, num_cepstral=num_cepstral,
                            frame_stride=0.01, num_filters=num_filters,
                            fft_length=512, low_frequency=0,
                            high_frequency=None)

        # Shape matcher
        assert mfcc.shape[1] == num_cepstral
Example No. 14
def get_feature_vector_from_mfcc(file_path: str, flatten: bool,
                                 mfcc_len: int = 39) -> np.ndarray:
    fs, signal = wav.read(file_path)
    s_len = len(signal)
    if s_len < mean_signal_length:
        pad_len = mean_signal_length - s_len
        pad_rem = pad_len % 2
        pad_len //= 2
        signal = np.pad(signal, (pad_len, pad_len + pad_rem),
                        'constant', constant_values=0)
    else:
        pad_len = s_len - mean_signal_length
        pad_len //= 2
        signal = signal[pad_len:pad_len + mean_signal_length]
    mel_coefficients = mfcc(signal, fs, num_cepstral=mfcc_len)
    if flatten:
        mel_coefficients = np.ravel(mel_coefficients)
    return mel_coefficients
Example No. 15
def get_featurefrom_mfcc(file_path: str, mfcc_len: int):
    signal, speed = sf.read(file_path)
    signal_len = len(signal)
    speed = 48000
    meansignal_length = 176578

    if signal_len < meansignal_length:
        pad_len = meansignal_length - signal_len
        pad_rem = pad_len % 2
        pad_len //= 2
        new_signal = np.pad(signal, (pad_len, pad_len + pad_rem),
                            "constant",
                            constant_values=0)
    else:
        pad_len = signal_len - meansignal_length
        pad_len //= 2
        new_signal = signal[pad_len:pad_len + meansignal_length]
    mel_coefficients = mfcc(new_signal, speed, num_cepstral=mfcc_len)
    mel_coefficients = np.ravel(mel_coefficients)
    normalized_feature = min_max_scalling(mel_coefficients)
    return normalized_feature
def get_feature_vector_from_mfcc(file_path: str, mfcc_len: int):
    signal, fs = sf.read(file_path)
    s_len = len(signal)
    fs = 48000  # the highest sample rate in the data; override what the file reports

    # pad the signals to have the same size if shorter than required
    if s_len < mean_signal_length:
        pad_len = mean_signal_length - s_len
        pad_rem = pad_len % 2  # take the remainder before halving so the
        pad_len //= 2          # total padding equals the required deficit
        signal = np.pad(signal, (pad_len, pad_len + pad_rem),
                        'constant',
                        constant_values=0)
    else:
        pad_len = s_len - mean_signal_length
        pad_len //= 2
        signal = signal[pad_len:pad_len + mean_signal_length]

    mel_coefficients = mfcc(signal, fs, num_cepstral=mfcc_len)
    mel_coefficients = np.ravel(mel_coefficients)
    normalize_feature_vector = min_max_scalling(mel_coefficients)

    return normalize_feature_vector
Example No. 17
        noise = ''
        location = input("Location: ")
        description = input("Type of False Activation: ")

        while noise not in ('q', 'm', 'l'):
            noise = input("Noise Level - Quiet (Q) Moderate (M) Loud (L): ").lower()
print()

while True:

    data = stream.read(CHUNK)
    frames.append(data)

    if (len(frames) > 19):
        in_data = np.frombuffer(b''.join(frames[:19]), dtype=np.int16)  # frombuffer replaces the removed np.fromstring
        audio_sig = np.array([mfcc(in_data, RATE, WINDOW, STRIDE, MFCC, FILTER_BANKS, FFT_NUM, 0, None, True)])

        prediction = model.predict(audio_sig)

        print(prediction)

        if (prediction > CONFIDENCE):
            act_count += 1

            if (act_count >= ACTIVATIONS):
                act_count = 0
                print("NIMBUS", end = " ", flush = True)

                if(false_act == "y"):

                    file_name = "notww_" + description + "-false_"+ location + "_" + noise +"_" + datetime.now().strftime("%m%d%Y%H%M%S%f") + "_ewenike.wav" 
Example No. 18
    def Wake_Word(self):
        ''' 
        Runs the wake-word detector and calls the functions required for the end-to-end response

        Args: None

        Returns: None

        '''

        # reads chunk of audio
        data = self.istream.read(self.CHUNK, exception_on_overflow=False)

        # appends chunk to frame list
        self.frames.append(data)

        # begins making predictions after the first 2.5 seconds of audio is read
        if (len(self.frames) > 19):

            # converts the first 19 chunks of audio bytes into 16-bit int values
            # (np.frombuffer replaces the removed np.fromstring API)
            in_data = np.frombuffer(b''.join(self.frames[:19]), dtype=np.int16)

            # extract MFCCs from the 19 chunks of audio
            audio_sig = np.array([
                mfcc(in_data, self.RATE, self.WINDOW, self.STRIDE, self.MFCC,
                     self.FILTER_BANKS, self.FFT_NUM, 0, None, True)
            ])

            # makes predictions
            prediction = self.ww_model.model.predict(audio_sig)

            # if the prediction is larger than the defined confidence
            if (prediction > self.CONFIDENCE):

                # increment the activation counter
                self.act_count += 1

                # if the number of consecutive activations exceeds the activation value
                if (self.act_count >= self.ACTIVATIONS):

                    # resets the activation count
                    self.act_count = 0

                    # stops the input stream
                    self.istream.stop_stream()

                    # wake word audio prompt
                    # subprocess.Popen(['mpg123', '-q', os.getcwd() + '%sUtils%sData%sNimbus_Awakened.mp3' % (self.delim, self.delim, self.delim)])

                    # stalls the program as the audio is played
                    time.sleep(4)
                    print("\nNIMBUS Activated\n\n")

                    # obtains the string from the audio input
                    stt_result = self.Speech_To_Text()

                    print(stt_result)

                    answer = ""

                    best_stt_answer = ""

                    # determines the appropriate input for the NLP engine
                    if list(stt_result):
                        best_stt_answer = stt_result[0].alternatives[
                            0].transcript

                    else:
                        answer = "Sorry, I could not hear you. Please ask again."

                    # calls the NLP engine if speech was converted
                    if best_stt_answer != "":
                        answer = "Please ask again later."  #NLP(best_stt_answer)

                    # converts the NLP answer to audio
                    self.Text_To_Speech(answer)

                    # resets the wake word audio frames
                    self.frames = []

                    # opens the input stream for wake word detection
                    self.istream.start_stream()
            else:
                # print a period for non-wake-word predictions
                print('.', flush=True, end="")

                # reset the activation count
                self.act_count = 0

                # window the data stream
                self.frames = self.frames[1:]
Example No. 19
frames = processing.stack_frames(signal,
                                 sampling_frequency=fs,
                                 frame_length=0.020,
                                 frame_stride=0.01,
                                 filter=lambda x: np.ones((x, )),
                                 zero_padding=True)

# Extracting power spectrum
power_spectrum = processing.power_spectrum(frames, fft_points=512)
print('power spectrum shape=', power_spectrum.shape)

############# Extract MFCC features #############
mfcc = feature.mfcc(signal,
                    sampling_frequency=fs,
                    frame_length=0.020,
                    frame_stride=0.01,
                    num_filters=40,
                    fft_length=512,
                    low_frequency=0,
                    high_frequency=None)

# Cepstral mean variance normalization.
mfcc_cmvn = processing.cmvn(mfcc, variance_normalization=True)
print('mfcc(mean + variance normalized) feature shape=', mfcc_cmvn.shape)

# Extracting derivative features
mfcc_feature_cube = feature.extract_derivative_feature(mfcc)
print('mfcc feature cube shape=', mfcc_feature_cube.shape)

############# Extract logenergy features #############
logenergy = feature.lmfe(signal,
                         sampling_frequency=fs,
                         frame_length=0.020,
                         frame_stride=0.01,
                         num_filters=40,
                         fft_length=512,
                         low_frequency=0,
                         high_frequency=None)  # call completed to mirror the mfcc parameters above
print('logenergy features shape=', logenergy.shape)
def Convert_To_MFCC(wf):
    return mfcc(Read_Audio_Data(wf), RATE, WINDOW, STRIDE, MFCC, FILTER_BANKS,
                FFT_NUM, 0, None, True).tolist()
Example No. 21
    # reads chunk of audio
    data = stream.read(CHUNK)

    # appends chunk to frame list
    frames.append(data)

    # begins making predictions after the first 2.5 seconds of audio is read
    if (len(frames) > 19):

        # converts first 19 chunks of audio bytes into 16 bit int values
        in_data = np.frombuffer(b''.join(frames[:19]), dtype=np.int16)  # frombuffer replaces the removed np.fromstring

        # extract MFCCs from the 19 chunks of audio
        audio_sig = np.array([
            mfcc(in_data, RATE, WINDOW, STRIDE, MFCC, FILTER_BANKS, FFT_NUM, 0,
                 None, True)
        ])

        # makes predictions
        prediction = model.predict(audio_sig)

        # print(prediction)

        # if the prediction is larger than the defined confidence
        if (prediction > CONFIDENCE):

            # increment the activation counter
            act_count += 1

            # if the number of consecutive activations exceeds the activation value
            if (act_count >= ACTIVATIONS):
def ConvertToMFCC(fileName, path):
    return mfcc(readAudioData(path + fileName), RATE, WINDOW, STRIDE, MFCC,
                FILTER_BANKS, FFT_NUM, 0, None, True).tolist()
Example No. 23
def _calc_mfcc(wav):
    mfccs = mfcc(wav, 16000, num_cepstral=39)
    return mfccs
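# A quick sanity check for _calc_mfcc, using a synthetic 440 Hz tone at the
# 16 kHz rate the helper hard-codes:
import numpy as np

t = np.linspace(0, 1, 16000, endpoint=False)
print(_calc_mfcc(np.sin(2 * np.pi * 440 * t)).shape)  # (frames, 39)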