def __init__(self, parent=None):
    """ Class constructor. """
    QtGui.QWidget.__init__(self, parent)

    # UI setup
    self.ui = Ui_MusicSplitter()
    self.ui.setupUi(self)
    self.ui.browseButton.clicked.connect(self.HandleBrowseButton)
    self.ui.processButton.clicked.connect(self.HandleProcessButton)
    self.ui.saveButton.clicked.connect(self.HandleSaveButton)
    self.ui.tableView.resizeColumnsToContents()
    self.ui.tableView.setSelectionBehavior(QtGui.QTableView.SelectRows)
    self.ui.tableView.clicked.connect(self.HandleTableClicked)
    self.ui.tableView.setHorizontalScrollBarPolicy(
        QtCore.Qt.ScrollBarAlwaysOff)
    self.tableModel = None

    # VAD class
    self.vad = VAD()
    bandStart = self.ui.bandStartBox.value()
    self.vad.music_start_band = bandStart
    bandEnd = self.ui.bandEndBox.value()
    self.vad.music_end_band = bandEnd
    minSongLen = self.ui.songLenBox.value()
    self.vad.min_song_len = minSongLen
    self.foundSongs = None

    # Class members
    self.inputFile = None
def remove_silence(signal):
    """
    Detects the speech regions and removes the non-speech regions
    using VAD (Voice Activity Detection)

    :param signal: np.ndarray (n by 1): input audio signal from one speaker
    :return: without_silence: np.ndarray (n by 1): original signal with the
        silence regions removed
    """
    regions = VAD(signal, int(df.sample_rate.iloc[0]), nFFT=512,
                  win_length=0.02, hop_length=0.01, threshold=0.65)
    without_silence = np.array([signal[160 * i: 160 * (i + 1)]
                                for i in range(regions.shape[0])
                                if regions[i] > 0])
    without_silence = without_silence.flatten()
    return without_silence
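# --- Usage sketch (not from the original source) ---------------------------
# A minimal, hypothetical call of remove_silence(). The file name is made up,
# and a module-level pandas DataFrame `df` with a `sample_rate` column is
# assumed to exist, since the function reads the rate from df.sample_rate.
import numpy as np
import soundfile as sf

signal, _ = sf.read('speaker_01.wav')        # mono recording from one speaker
voiced = remove_silence(np.asarray(signal))
print('kept %d of %d samples' % (len(voiced), len(signal)))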
def extract_features(df, label2ix, spec_kwargs, vad_kwargs, stacksize=1,
                     frate=100, return_y=False):
    if return_y:
        return_y = 'label' in df
    X = {}
    if return_y:
        y = {}
    spectrum_encoder = Spectral(**spec_kwargs)
    vad_encoder = VAD(**vad_kwargs)
    for ix, fname in enumerate(df.filename.unique()):
        sig, fs = wavread(fname)
        if fs != spec_kwargs['fs']:
            raise ValueError('expected samplerate {}, got {}'.format(
                spec_kwargs['fs'], fs))

        # spectral features, normalized per dimension
        spec = spectrum_encoder.transform(sig)
        spec = (spec - spec.mean(0)) / spec.std(0)
        if stacksize > 1:
            spec = roll_array(spec, stacksize)

        # VAD activations, one row per frame
        vad = vad_encoder.activations(sig)
        vad = vad.reshape(vad.shape[0], -1)
        if stacksize > 1:
            vad = roll_array(vad, stacksize)

        X_curr = []
        if return_y:
            y_curr = []
        rows_iter = df[df.filename == fname].iterrows()
        for _, row in rows_iter:
            start = row.start
            end = row.end
            start_fr = int(start * frate)
            end_fr = int(end * frate)
            feat = np.hstack((spec[start_fr:end_fr], vad[start_fr:end_fr]))
            X_curr.append(feat.astype(np.float32))
            if return_y:
                y_curr.append(
                    np.ones(feat.shape[0], dtype=np.uint8) *
                    label2ix[row.label]
                )
        X[fname] = np.vstack(X_curr)
        if return_y:
            y[fname] = np.hstack(y_curr)
    if return_y:
        return X, y
    else:
        return X
def get_voice_times(frames, sample_rate, threshold=0, win_size=0.05,
                    hop_size=0.025):
    '''
    Get the times at which the VAD (Voice Activity Detector) detected voice.

    Returns:
        a list of (start, end) tuples, one per detected voice segment
    '''
    detector = VAD(fs=sample_rate, win_size_sec=win_size,
                   win_hop_sec=hop_size)
    decisions = list(detector.detect_speech(frames, threshold=threshold))

    # Smooth the binary hard decisions vector with a sliding average
    slide_size = int(SLIDING_AVERAGE_WINDOW_SIZE / 2)
    smooth_decisions = []
    for i in range(len(decisions)):
        if (i < slide_size) or (i >= len(decisions) - slide_size):
            smooth_decisions.append(False)
            continue
        # Majority vote over the surrounding window
        smooth_decisions.append(
            decisions[i - slide_size:i + slide_size + 1].count(True)
            > slide_size)

    # Extract speech segments from the smoothed hard decisions
    voice_times = []
    old_dec = False
    current_start = 0
    for i, dec in enumerate(smooth_decisions):
        if dec and not old_dec:
            # We want to ignore short non-speech segments, so if the previous
            # speech-end is too close to this speech-start, remove the last
            # speech segment and keep searching
            if voice_times and \
                    ((i * hop_size) - voice_times[-1][1]) < MIN_SILENT_SEGMENT_LEN_SEC:
                current_start = voice_times[-1][0]
                voice_times = voice_times[:-1]
            else:
                current_start = i * hop_size
        if old_dec and not dec:
            voice_times.append((current_start, i * hop_size))
        old_dec = dec
    return voice_times
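# --- Usage sketch (not from the original source) ---------------------------
# Hypothetical call of get_voice_times(). The file name is made up, and the
# constants SLIDING_AVERAGE_WINDOW_SIZE and MIN_SILENT_SEGMENT_LEN_SEC are
# assumed to be defined in the module that provides the function.
import soundfile as sf

frames, sample_rate = sf.read('interview.wav')
for start, end in get_voice_times(frames, sample_rate, threshold=0.5):
    print('speech from %.2fs to %.2fs' % (start, end))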
def decode_live(source, volume, aggressiveness, url, topic, broker):

    from pulserecorder import PulseRecorder
    from vad import VAD, BUFFER_DURATION

    stream_id = get_uuid()

    # create requests session for saving cookies
    session = requests.Session()

    try:
        # pulseaudio recorder
        rec = PulseRecorder(source_name=source, volume=volume)
        vad = VAD(aggressiveness=aggressiveness)

        rec.start_recording()
        logging.info("Start talking.")

        while True:
            samples = rec.get_samples()
            audio, finalize = vad.process_audio(samples)
            if not audio:
                continue

            data = {'audio':       audio,
                    'do_finalize': finalize,
                    'topic':       topic,
                    'broker':      broker,
                    'id':          stream_id,
                    'sample_rate': 16000}

            response = session.post(url, json=data)
            if not response.ok:
                logging.error(response.text)
            else:
                logging.info("\tPrediction : %s - %f" % (
                    response.json()['hstr'], response.json()['confidence']))

    except KeyboardInterrupt:
        logging.info("Keyboard Interrupt: stopping service")
        rec.stop_recording()
        session.close()

    except Exception as e:
        logging.critical(e)
        session.close()
        sys.exit(1)
import flask
import uuid
import time
import json
import io
import logging

from flask import Response
from flask import Flask
from flask import request
from flask_cors import CORS, cross_origin

import numpy as np

app = Flask(__name__)
CORS(app)

# VAD is assumed to be importable from the project's own module
# (its import is not part of this excerpt)
detector = VAD(frame_duration=0.5, model_path='models/vad')
SAMPLING_RATE = 44100


@app.route("/")
def homepage():
    return "Welcome to the REST API!"


@app.route("/predict", methods=["POST"])
def predict():
    # initialize the data dictionary that will be returned from the view
    result = {"success": False}

    # raw 16-bit PCM in the request body, scaled to [-1.0, 1.0)
    frames = flask.request.data
    array_frames = np.frombuffer(frames, dtype=np.int16)
    array_frames = array_frames.astype(np.float32, order='C') / 32768.0
parser.add_argument('--window', action='store', dest='window', type=float,
                    required=True)
parser.add_argument('--median-filter', action='store_true', dest='median',
                    required=False)
parser.add_argument('--no-median-filter', action='store_false', dest='median',
                    required=False)
r = parser.parse_args()

vad = VAD()
bandStart = 50
vad.music_start_band = bandStart
minSongLen = 120
vad.min_song_len = minSongLen
vad.music_end_band = r.band_end
vad.sil_len = r.sil_len
vad.thr = r.thr
vad.frame_window = r.window
vad.frame_overlap = r.window / 2
def __init__(self, session, nSampleRate=16000, datatype=np.int16,
             nNbrChannel=4, nEnergyThreshold=300,
             bActivateSpeechRecognition=True, bUseAnonymous=True,
             strUseLang="", bKeepAudioFiles=False,
             rTimePreVAD=0.150, rTimePostVAD=0.500):
    """
    Analyse chunks of data (must not use any robot- or naoqi-specific method).
    - nSampleRate: the sample rate of your sound
    - datatype: the way your sound is stored
    - nNbrChannel: ...
    - nEnergyThreshold: threshold for the sound to be analysed for sound reco
    - rVadThreshold: threshold for confidence of the VAD: currently not used
    - bActivateSpeechRecognition: do we send the interesting sound to the speech recognition?
    - bActivateSoundRecognition:
    - strUseLang: lang to use for speech recognition, eg: "fr-FR";
      if left as "": use the language currently set in the tts
    """
    self.session = session
    self.nSampleRate = nSampleRate
    self.datatype = datatype
    self.nNbrChannel = nNbrChannel
    self.bActivateSpeechRecognition = bActivateSpeechRecognition
    self.bUseAnonymous = bUseAnonymous
    self.rEnergyThreshold = nEnergyThreshold  # 60 # 10
    self.strUseLang = strUseLang
    self.rTimePreVAD = rTimePreVAD
    self.rTimePostVAD = rTimePostVAD
    self.rMfccWindowStepInSec = 0.01
    self.nSizePreBuffer = int(
        self.rTimePreVAD * nSampleRate)  # conversion from time to samples
    self.bStoringSpeech = False  # are we currently storing for speech reco?
    self.bStoringNoise = False  # are we currently storing for sound reco?
    self.aRecognizedSpeech = None
    self.bSpeechDetected = False
    self.bVisualFeedback = True
    self.bSpeechAnalysed = False

    # all sound buffers will be stored in mono
    self.aStoredDataSpeech = np.array(
        [], dtype=self.datatype)  # a numpy int16 array storing current sound
    self.aStoredDataNoise = np.array([], dtype=self.datatype)
    self.aStoredMfccSound = np.array([], dtype=np.float64)
    self.aStoredSoundPreBuffer = np.array([], dtype=self.datatype)
    self.createdFiles = []
    self.timeLastBufferReceived = time.time()
    self.timeLastPeak = time.time() - 1000
    self.timeLastVAD = self.timeLastPeak
    self.strLastRecognized = ""
    self.strDstPath = "/tmp/"
    self.debug_fileAllSpeech = None
    self.bKeepAudioFiles = bKeepAudioFiles
    self.bIsOnRobot = runner.is_on_robot()

    home = os.path.expanduser("~")
    self.storeDir = home + "/.abcdk/prevWavs"
    if not os.path.isdir(self.storeDir):
        os.makedirs(self.storeDir)

    self.vad = VAD(self.rTimePreVAD, self.rTimePostVAD)
    self.fs = freespeech.FreeSpeech(self.session)
    self.mem = self.session.service("ALMemory")
    self.leds = LedsDcm.LedsDcm(self.session)
    self.leds.createProxy()
    self.leds.createAliases()
    self.rEndLedLockTime = time.time()

    self.touch = self.mem.subscriber("TouchChanged")
    self.id_touch = self.touch.signal.connect(
        functools.partial(self.onTouch, "TouchChanged"))
    self.touched = False
    self.runningThread = True
    thread.start_new_thread(self.asrOnFile, ())
print('|{0:>15}|{1:<30.2f}|'.format('', phi))
print('+{0:->15}+{1:-<30}+'.format('-', '-'))

for wavPath in filePathes:
    t = time.time()
    head, fileName = os.path.split(wavPath)
    clean, sr = sf.read(wavPath)
    maxi = max(abs(clean))
    clean = [samp * (0.99 / maxi) for samp in clean]  # magnitude normalization
    noiseExt = np.resize(noise, len(clean))

    vad = VAD(np.asarray(clean[19960521:19960521 + 360 * sr]), sr,
              threshold=0.95)
    ratio = list(vad).count(1) / len(vad)
    magWave = np.sum(np.square(clean)) / (ratio * len(clean))
    # print(ratio, len(clean[19960521:19960521+120*sr]), len(vad), list(vad).count(1))

    print('|{0:>15}|{1:<30}|'.format('file name', fileName))
    print('|{0:>15}|{1:<30.4f}|'.format('vad ratio', ratio))
    print('|{0:>15}|{1:<30}|'.format('conv', 'start'))

    fakeMic1 = np.convolve(conv1, clean)
    fakeMic2 = np.convolve(conv2, clean)
    maxi = max(abs(fakeMic1))
    fakeMic1 = [samp * (0.99 / maxi) for samp in fakeMic1]  # magnitude normalization
    maxi = max(abs(fakeMic2))