Example No. 1
def __init__(self, parent=None):
    """ Class constructor. """
    QtGui.QWidget.__init__(self, parent)
    # UI setup
    self.ui = Ui_MusicSplitter()
    self.ui.setupUi(self)
    self.ui.browseButton.clicked.connect(self.HandleBrowseButton)
    self.ui.processButton.clicked.connect(self.HandleProcessButton)
    self.ui.saveButton.clicked.connect(self.HandleSaveButton)
    self.ui.tableView.resizeColumnsToContents()
    self.ui.tableView.setSelectionBehavior(QtGui.QTableView.SelectRows)
    self.ui.tableView.clicked.connect(self.HandleTableClicked)
    self.ui.tableView.setHorizontalScrollBarPolicy(
        QtCore.Qt.ScrollBarAlwaysOff)
    self.tableModel = None
    # VAD class
    self.vad = VAD()
    bandStart = self.ui.bandStartBox.value()
    self.vad.music_start_band = bandStart
    bandEnd = self.ui.bandEndBox.value()
    self.vad.music_end_band = bandEnd
    minSongLen = self.ui.songLenBox.value()
    self.vad.min_song_len = minSongLen
    self.foundSongs = None
    # Class members
    self.inputFile = None
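
The constructor above wires the generated Ui_MusicSplitter form to its button handlers and seeds a VAD instance from the spin-box values. A minimal sketch of how a widget with this constructor is typically launched; MusicSplitterWindow is an assumed name for the class that owns the __init__, and PyQt4 is assumed from the QtGui.QWidget / QtGui.QTableView usage:

import sys
from PyQt4 import QtGui  # assumption: PyQt4-era API, matching QtGui.QWidget above

if __name__ == "__main__":
    app = QtGui.QApplication(sys.argv)
    window = MusicSplitterWindow()  # hypothetical class name; only its __init__ is shown above
    window.show()
    sys.exit(app.exec_())
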
Example No. 2
def remove_silence(signal):
    """
    Detects the speech regions and removes the non-speech regions using VAD (Voice Activity Detection)
    :param signal: np.ndarray (n by 1): input audio signal from one speaker
    :return: without_silence: np.ndarray (n by 1): original signal with removed silence regions
    """
    # one binary decision per 0.01 s hop; the sample rate comes from a module-level DataFrame `df`
    regions = VAD(signal, int(df.sample_rate.iloc[0]), nFFT=512, win_length=0.02, hop_length=0.01, threshold=0.65)

    # keep the 160-sample chunk behind every positive decision (160 samples = 0.01 s only at 16 kHz)
    without_silence = np.array([signal[160 * i: 160 * (i + 1)] for i in range(regions.shape[0]) if regions[i] > 0])
    without_silence = without_silence.flatten()
    return without_silence
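
The reconstruction relies on each VAD decision covering 160 samples, i.e. the 0.01 s hop at a 16 kHz sample rate; the rate itself is read from a module-level df DataFrame. A small, self-contained illustration of that frame-to-sample bookkeeping with dummy data (all names here are made up for the example):

import numpy as np

sample_rate = 16000                      # assumed rate; 0.01 s hop -> 160 samples per decision
hop = int(0.01 * sample_rate)            # 160
signal = np.arange(10 * hop)             # dummy signal, 10 hops long
decisions = np.array([0, 1, 1, 0, 0, 1, 0, 1, 1, 0])  # dummy per-frame VAD output

kept = [signal[hop * i: hop * (i + 1)] for i, d in enumerate(decisions) if d > 0]
without_silence = np.concatenate(kept)   # 5 voiced frames -> 800 samples kept
print(without_silence.shape)             # (800,)

np.concatenate also tolerates a shorter final chunk, which the np.array(...).flatten() form above does not.
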
Example No. 3
def extract_features(df,
                     label2ix,
                     spec_kwargs,
                     vad_kwargs,
                     stacksize=1,
                     frate=100,
                     return_y=False):
    if return_y:
        return_y = 'label' in df
    X = {}
    if return_y:
        y = {}
    spectrum_encoder = Spectral(**spec_kwargs)
    vad_encoder = VAD(**vad_kwargs)
    for ix, fname in enumerate(df.filename.unique()):
        sig, fs = wavread(fname)
        if fs != spec_kwargs['fs']:
            raise ValueError('expected samplerate {}, got {}'.format(
                spec_kwargs['fs'], fs))
        spec = spectrum_encoder.transform(sig)
        spec = (spec - spec.mean(0)) / spec.std(0)
        if stacksize > 1:
            spec = roll_array(spec, stacksize)
        vad = vad_encoder.activations(sig)
        vad = vad.reshape(vad.shape[0], -1)
        if stacksize > 1:
            vad = roll_array(vad, stacksize)

        X_curr = []
        if return_y:
            y_curr = []

        rows_iter = df[df.filename == fname].iterrows()
        for _, row in rows_iter:
            start = row.start
            end = row.end

            start_fr = int(start * frate)
            end_fr = int(end * frate)

            feat = np.hstack((spec[start_fr:end_fr], vad[start_fr:end_fr]))
            X_curr.append(feat.astype(np.float32))
            if return_y:
                y_curr.append(
                    np.ones(feat.shape[0], dtype=np.uint8) * \
                    label2ix[row.label]
                )
        X[fname] = np.vstack(X_curr)
        if return_y:
            y[fname] = np.hstack(y_curr)
    if return_y:
        return X, y
    else:
        return X
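
extract_features stacks per-frame spectral and VAD features, slices them per annotated (start, end) segment, and groups the result per file. A runnable toy version of just that slicing and stacking bookkeeping, with random arrays standing in for the Spectral and VAD encoders (shapes and names here are illustrative only):

import numpy as np

frate = 100                               # frames per second, as in the snippet
spec = np.random.randn(500, 40)           # stand-in for spectrum_encoder.transform(sig)
vad = np.random.randn(500, 1)             # stand-in for vad_encoder.activations(sig)
segments = [(0.10, 0.55, 0), (1.20, 2.00, 1)]  # (start_s, end_s, label) rows

X_curr, y_curr = [], []
for start, end, label in segments:
    start_fr, end_fr = int(start * frate), int(end * frate)
    feat = np.hstack((spec[start_fr:end_fr], vad[start_fr:end_fr]))
    X_curr.append(feat.astype(np.float32))
    y_curr.append(np.full(feat.shape[0], label, dtype=np.uint8))

X = np.vstack(X_curr)                     # (45 + 80) frames x 41 features
y = np.hstack(y_curr)
print(X.shape, y.shape)                   # (125, 41) (125,)
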
Example No. 4
def get_voice_times(frames,
                    sample_rate,
                    threshold=0,
                    win_size=0.05,
                    hop_size=0.025):
    '''
    Get the times in which the VAD (Voice Activity Detector) detected voice

    Returns:
        a list of tuples each contains a voice-segment boundaries
    '''

    detector = VAD(fs=sample_rate, win_size_sec=win_size, win_hop_sec=hop_size)
    decisions = list(detector.detect_speech(frames, threshold=threshold))

    # Smooth the binary hard decisions vector with a sliding average
    slide_size = int(SLIDING_AVERAGE_WINDOW_SIZE / 2)
    smooth_decisions = []
    for i in range(len(decisions)):
        if (i < slide_size) or (i >= len(decisions) - slide_size):
            smooth_decisions.append(False)
            continue

        # Majority vote
        smooth_decisions.append(decisions[i - slide_size:i + slide_size +
                                          1].count(True) > slide_size)

    # Extract speech segments from the hard decisions
    voice_times = []
    old_dec = False
    current_start = 0
    for i, dec in enumerate(smooth_decisions):  # use the smoothed decisions computed above
        if dec and not old_dec:
            # We want to ignore short non-speech segments, so if the previous speech-end
            # is too close to this speech-start - remove the last speech segment and keep searching
            if voice_times and (
                (i * hop_size) -
                    voice_times[-1][1]) < MIN_SILENT_SEGMENT_LEN_SEC:
                current_start = voice_times[-1][0]
                voice_times = voice_times[:-1]
            else:
                current_start = i * hop_size
        if old_dec and not dec:
            voice_times.append((current_start, i * hop_size))

        old_dec = dec

    return voice_times
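
The second loop converts the binary decisions into (start, end) pairs and merges segments separated by a silence shorter than MIN_SILENT_SEGMENT_LEN_SEC. A self-contained sketch of that rising/falling-edge logic on a hand-made decision list (the hop size matches the defaults above; the minimum-silence value is illustrative):

hop_size = 0.025                # seconds per decision, matching the default hop above
min_silence = 0.075             # stand-in for MIN_SILENT_SEGMENT_LEN_SEC

decisions = [False, True, True, True, False, False, True, True, False]

voice_times, old_dec, current_start = [], False, 0.0
for i, dec in enumerate(decisions):
    if dec and not old_dec:                       # rising edge: speech starts
        if voice_times and (i * hop_size - voice_times[-1][1]) < min_silence:
            current_start = voice_times[-1][0]    # gap too short: merge with previous segment
            voice_times = voice_times[:-1]
        else:
            current_start = i * hop_size
    if old_dec and not dec:                       # falling edge: speech ends
        voice_times.append((current_start, i * hop_size))
    old_dec = dec

print(voice_times)   # [(0.025, 0.2)] -- the 0.05 s gap is merged away
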
Example No. 5
def decode_live(source, volume, aggressiveness, url, topic, broker):
    from pulserecorder import PulseRecorder
    from vad import VAD, BUFFER_DURATION

    stream_id = get_uuid()

    # create requests session for saving cookies
    session = requests.Session()

    try:
        # pulseaudio recorder
        rec = PulseRecorder(source_name=source, volume=volume)
        vad = VAD(aggressiveness=aggressiveness)

        rec.start_recording()
        logging.info("Start talking.")

        while True:
            samples = rec.get_samples()
            audio, finalize = vad.process_audio(samples)

            if not audio:
                continue

            data = {'audio'      : audio,
                    'do_finalize': finalize,
                    'topic'      : topic,
                    'broker'     : broker,
                    'id'         : stream_id,
                    'sample_rate': 16000}

            response = session.post(url, json=data)
            if not response.ok:
                logging.error(response.text)
            else:
                logging.info("\tPrediction    : %s - %f" % (response.json()['hstr'], response.json()['confidence']))
    except KeyboardInterrupt:
        logging.info("Keyboard Interrupt: stopping service")
        rec.stop_recording()
        session.close()
    except Exception as e:
        logging.critical(e)
        session.close()
        sys.exit(1)
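
Each pass through the loop above posts a JSON payload of samples plus metadata and reads hstr / confidence back from the decoder. A single-shot sketch of that exchange against a hypothetical endpoint; the URL and the dummy audio are stand-ins, and only the payload keys mirror the client code:

import uuid
import requests

URL = "http://localhost:8080/decode"      # hypothetical; the real endpoint is passed in as `url`

payload = {
    'audio':       [0] * 1600,            # dummy 0.1 s of silence at 16 kHz; normally from vad.process_audio()
    'do_finalize': True,
    'topic':       'test',
    'broker':      None,
    'id':          str(uuid.uuid4()),
    'sample_rate': 16000,
}

with requests.Session() as session:
    response = session.post(URL, json=payload)
    if response.ok:
        print(response.json()['hstr'], response.json()['confidence'])
    else:
        print(response.text)
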
Example No. 6
import flask
import uuid
import time
import json
import io
import logging
from flask import Response
from flask import Flask
from flask import request
from flask_cors import CORS, cross_origin
import numpy as np

app = Flask(__name__)
CORS(app)

detector = VAD(frame_duration=0.5, model_path='models/vad')
SAMPLING_RATE = 44100


@app.route("/")
def homepage():
    return "Welcome to the REST API!"


@app.route("/predict", methods=["POST"])
def predict():
    # initialize the data dictionary that will be returned from the view
    result = {"success": False}
    frames = flask.request.data
    array_frames = np.frombuffer(frames, dtype=np.int16)
    array_frames = array_frames.astype(np.float32, order='C') / 32768.0
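    # The handler is truncated above, right after converting the posted bytes to
    # float samples. One plausible continuation, sketched with an assumed
    # detector.predict() call (the snippet does not show the VAD model's real
    # interface) and the usual Flask JSON reply:
    try:
        segments = detector.predict(array_frames, sampling_rate=SAMPLING_RATE)  # assumed method and kwarg
        result["segments"] = segments
        result["success"] = True
    except Exception as exc:
        logging.exception("VAD prediction failed: %s", exc)

    return flask.jsonify(result)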
Example No. 7
    parser.add_argument('--window',
                        action='store',
                        dest='window',
                        type=float,
                        required=True)
    parser.add_argument('--median-filter',
                        action='store_true',
                        dest='median',
                        required=False)
    parser.add_argument('--no-median-filter',
                        action='store_false',
                        dest='median',
                        required=False)
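    # The lines below are a hypothetical sketch, not part of the original snippet:
    # the code after parse_args() reads r.band_end, r.sil_len and r.thr, so the
    # parser presumably also declares them, following the same pattern as --window.
    parser.add_argument('--band-end', action='store', dest='band_end',
                        type=int, required=True)
    parser.add_argument('--sil-len', action='store', dest='sil_len',
                        type=float, required=True)
    parser.add_argument('--thr', action='store', dest='thr',
                        type=float, required=True)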
    r = parser.parse_args()

    vad = VAD()

    bandStart = 50
    vad.music_start_band = bandStart

    minSongLen = 120
    vad.min_song_len = minSongLen

    vad.music_end_band = r.band_end

    vad.sil_len = r.sil_len

    vad.thr = r.thr

    vad.frame_window = r.window
    vad.frame_overlap = r.window / 2
Example No. 8
    def __init__(self,
                 session,
                 nSampleRate=16000,
                 datatype=np.int16,
                 nNbrChannel=4,
                 nEnergyThreshold=300,
                 bActivateSpeechRecognition=True,
                 bUseAnonymous=True,
                 strUseLang="",
                 bKeepAudioFiles=False,
                 rTimePreVAD=0.150,
                 rTimePostVAD=0.500):
        """
        analyse chunk of data (must have no specific to robot nor naoqi method)
        - nSampleRate: the sample rate of your sound
        - datatype: the way your sound is stored
        - nNbrChannel: ...
        - nEnergyThreshold: threshold for the sound to be analysed for sound reco
        - rVadThreshold: threshold for confidence of the VAD: Currently not used
        - bActivateSpeechRecognition: do we send the interesting sound to the speech recognition ?
        - bActivateSoundRecognition: 
        - strUseLang: lang to use for speech recognition, eg: "fr-FR", if leaved to "": use language currently in the tts
        """

        self.session = session
        self.nSampleRate = nSampleRate
        self.datatype = datatype
        self.nNbrChannel = nNbrChannel
        self.bActivateSpeechRecognition = bActivateSpeechRecognition
        self.bUseAnonymous = bUseAnonymous
        self.rEnergyThreshold = nEnergyThreshold
        # 60 # 10
        self.strUseLang = strUseLang

        self.rTimePreVAD = rTimePreVAD
        self.rTimePostVAD = rTimePostVAD

        self.rMfccWindowStepInSec = 0.01

        self.nSizePreBuffer = int(
            self.rTimePreVAD * nSampleRate)  # conversion from time to samples

        self.bStoringSpeech = False
        # are we currently storing for speech reco ?
        self.bStoringNoise = False
        # are we currently storing for sound reco ?
        self.aRecognizedSpeech = None
        self.bSpeechDetected = False
        self.bVisualFeedback = True
        self.bSpeechAnalysed = False

        # all sounds buffer will be stored in monochannel
        self.aStoredDataSpeech = np.array(
            [],
            dtype=self.datatype)  # a numpy int16 array storing current sound
        self.aStoredDataNoise = np.array([], dtype=self.datatype)
        self.aStoredMfccSound = np.array([], dtype=np.float64)
        self.aStoredSoundPreBuffer = np.array([], dtype=self.datatype)
        self.createdFiles = []

        self.timeLastBufferReceived = time.time()

        self.timeLastPeak = time.time() - 1000
        self.timeLastVAD = self.timeLastPeak

        self.strLastRecognized = ""
        self.strDstPath = "/tmp/"
        self.debug_fileAllSpeech = None
        self.bKeepAudioFiles = bKeepAudioFiles
        self.bIsOnRobot = runner.is_on_robot()
        home = os.path.expanduser("~")
        self.storeDir = home + "/.abcdk/prevWavs"
        if not os.path.isdir(self.storeDir):
            os.makedirs(self.storeDir)

        self.vad = VAD(self.rTimePreVAD, self.rTimePostVAD)
        self.fs = freespeech.FreeSpeech(self.session)
        self.mem = self.session.service("ALMemory")
        self.leds = LedsDcm.LedsDcm(self.session)
        self.leds.createProxy()
        self.leds.createAliases()
        self.rEndLedLockTime = time.time()
        self.touch = self.mem.subscriber("TouchChanged")
        self.id_touch = self.touch.signal.connect(
            functools.partial(self.onTouch, "TouchChanged"))
        self.touched = False
        self.runningThread = True
        thread.start_new_thread(self.asrOnFile, ())
Example No. 9
        print('|{0:>15}|{1:<30.2f}|'.format('', phi))
        print('+{0:->15}+{1:-<30}+'.format('-', '-'))
        for wavPath in filePathes:
            t = time.time()
            head, fileName = os.path.split(wavPath)

            clean, sr = sf.read(wavPath)

            maxi = max(abs(clean))
            clean = [samp * (0.99 / maxi)
                     for samp in clean]  # magnitude normalization.

            noiseExt = np.resize(noise, len(clean))

            vad = VAD(np.asarray(clean[19960521:19960521 + 360 * sr]),
                      sr,
                      threshold=0.95)
            ratio = list(vad).count(1) / len(vad)
            magWave = np.sum(np.square(clean)) / (ratio * len(clean))
            # print(ratio, len(clean[19960521:19960521+120*sr]), len(vad), list(vad).count(1))
            print('|{0:>15}|{1:<30}|'.format('file name', fileName))
            print('|{0:>15}|{1:<30.4f}|'.format('vad ratio', ratio))
            print('|{0:>15}|{1:<30}|'.format('conv', 'start'))

            fakeMic1 = np.convolve(conv1, clean)
            fakeMic2 = np.convolve(conv2, clean)

            maxi = max(abs(fakeMic1))
            fakeMic1 = [samp * (0.99 / maxi)
                        for samp in fakeMic1]  # magnitude normalization.
            maxi = max(abs(fakeMic2))