Example #1
    def __call__(self, context: SpeechContext, frame: np.ndarray) -> None:
        """Processes a single frame of audio to determine if voice is present

        Args:
            context (SpeechContext): State-based information that needs to be shared
                between pieces of the pipeline
            frame (np.ndarray): Single frame of PCM-16 audio from an input source

        """
        frame = frame.tobytes()
        result: bool = self._vad.is_speech(frame, self._sample_rate)

        raw = result > 0
        if raw == self._run_value:
            self._run_length += 1
        else:
            self._run_value = raw
            self._run_length = 1

        if self._run_value != context.is_speech:
            if self._run_value and self._run_length >= self._rise_length:
                context.is_speech = True
                _LOG.info("vad: true")
            if not self._run_value and self._run_length >= self._fall_length:
                context.is_speech = False
                _LOG.info("vad: false")
Example #2
    def __call__(self, context: SpeechContext, frame: np.ndarray) -> None:
        """Processes a single frame of audio to determine if voice is present

        Args:
            context (SpeechContext): State-based information that needs to be shared
                between pieces of the pipeline
            frame (np.ndarray): Single frame of PCM-16 audio from an input source

        """
        # validate dtype
        if not np.issubdtype(frame.dtype, np.signedinteger):
            raise TypeError("invalid_dtype")

        result: bool = self._vad.is_speech(frame)

        raw = result > 0
        if raw == self._run_value:
            self._run_length += 1
        else:
            self._run_value = raw
            self._run_length = 1

        if self._run_value != context.is_speech:
            if self._run_value and self._run_length >= self._rise_length:
                context.is_speech = True
                _LOG.info("vad: true")
            if not self._run_value and self._run_length >= self._fall_length:
                context.is_speech = False
                _LOG.info("vad: false")
Example #3
def test_reset():
    context = SpeechContext()
    recognizer = CloudSpeechRecognizer()
    recognizer._client._socket = mock.MagicMock()

    recognizer._client._socket.recv.return_value = json.dumps({
        "error": None,
        "final": False,
        "hypotheses": [{"confidence": 0.5, "transcript": "this is a test"}],
        "status": "ok",
    })

    # note: np.random.rand yields floats in [0, 1), so astype(np.int16)
    # truncates them to zeros; the content is irrelevant with the socket mocked
    frame = np.random.rand(160).astype(np.int16)

    # trigger _begin and first _send
    context.is_active = True
    recognizer(context, frame)

    # trigger _send
    recognizer(context, frame)

    # we haven't triggered _commit or sent the final frame
    # which means context is still active and _is_active is True
    recognizer.reset()

    assert not recognizer._is_active
    assert not recognizer._client.is_connected
Example #4
def test_detect_vad_inactive(_mock):
    context = SpeechContext()

    detector = WakewordTrigger(model_dir="wakeword_model")

    test_frame = np.random.rand(160).astype(np.float32)
    context.is_speech = False
    detector(context, test_frame)
    assert not context.is_active
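
The _mock argument in this and the following wakeword tests indicates the excerpts omit their patch decorators. A plausible setup (the patch target is an assumption, not shown in these excerpts) looks like:

from unittest import mock

# hypothetical patch target: wherever TFLiteModel is imported in your version;
# patching it keeps the test from loading real model files from "wakeword_model",
# and mock.patch injects the created mock as the _mock argument
@mock.patch("spokestack.wakeword.tflite.TFLiteModel")
def test_detect_vad_inactive(_mock):
    ...  # body as in Example #4 above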
Example #5
def test_detect_activate(_mock):
    context = SpeechContext()
    detector = WakewordTrigger(model_dir="wakeword_model")
    detector.detect_model.return_value[0][:] = 0.6

    test_frame = np.random.rand(512).astype(np.float32)
    context.is_speech = True
    detector(context, test_frame)
    context.is_speech = False
    assert context.is_active
Example #6
def test_detect_inactive_vad_deactivate(_mock):
    context = SpeechContext()
    detector = WakewordTrigger(model_dir="wakeword_model")

    for _ in range(3):
        test_frame = np.random.rand(160).astype(np.float32)
        context.is_speech = True
        detector(context, test_frame)
        context.is_speech = False
        assert not context.is_active
    detector(context, test_frame)
Example #7
def test_detect_manual_min_delay(_mock):
    context = SpeechContext()
    detector = WakewordTrigger(model_dir="wakeword_model")
    detector.detect_model.return_value[0][:] = 1

    context.is_active = True
    test_frame = np.random.rand(512).astype(np.float32)
    detector(context, test_frame)
    detector(context, test_frame)
    detector(context, test_frame)

    assert context.is_active
Example #8
def test_recognize():
    context = SpeechContext()
    recognizer = CloudSpeechRecognizer()
    recognizer._client._socket = mock.MagicMock()

    recognizer._client._socket.recv.return_value = json.dumps({
        "error": None,
        "final": False,
        "hypotheses": [{"confidence": 0.5, "transcript": "this is a test"}],
        "status": "ok",
    })

    frame = np.random.rand(160).astype(np.int16)
    # call with context active to test _begin and first _send
    context.is_active = True
    recognizer(context, frame)

    # call again to test with internal _is_active as True
    recognizer(context, frame)

    # call with context not active to test _commit
    context.is_active = False
    recognizer(context, frame)

    recognizer._client._socket.recv.return_value = json.dumps({
        "error": None,
        "final": True,
        "hypotheses": [{"confidence": 0.5, "transcript": "this is a test"}],
        "status": "ok",
    })

    # call with the client indicating it's the final frame to test _receive
    recognizer(context, frame)

    recognizer._client._socket.max_idle_time = 500
    # test timeout
    for i in range(501):
        recognizer(context, frame)

    assert not context.is_active
    assert not recognizer._client.is_connected
Example #9
def test_recognize(*args):
    context = SpeechContext()
    audio = np.zeros(160).astype(np.int16)
    recognizer = GoogleSpeechRecognizer(language="en-US", credentials="")

    context.is_active = True
    for i in range(10):
        if i > 3:
            context.is_active = False
        recognizer(context, audio)

    recognizer.reset()
    recognizer.close()
Example #10
def test_voice_activity_trigger():
    context = SpeechContext()
    trigger = VoiceActivityTrigger()

    frame = np.zeros(160, np.int16)

    trigger(context, frame)
    assert not context.is_active

    context.is_speech = True
    trigger(context, frame)
    assert context.is_active

    trigger.close()
Example #11
def test_context():
    context = SpeechContext()

    # test is_speech
    assert not context.is_speech
    context.is_speech = True
    assert context.is_speech

    # test is_active
    assert not context.is_active
    context.is_active = True
    assert context.is_active

    # test transcript
    assert not context.transcript
    context.transcript = "this is a test"
    assert context.transcript

    # test confidence
    assert context.confidence == 0.0
    context.confidence = 1.0
    assert context.confidence == 1.0

    # test reset
    context.reset()
    assert not context.is_speech
    assert not context.is_active
    assert not context.transcript
    assert context.confidence == 0.0
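
Together with test_handler (Example #29 below), this test pins down the context's surface: four mutable fields, a reset, and a small event bus. A minimal class that would satisfy these tests (an illustrative sketch, not spokestack's actual implementation):

from collections import defaultdict
from typing import Callable, DefaultDict, List


class MinimalSpeechContext:
    """Sketch of the state container the tests in this listing exercise."""

    def __init__(self) -> None:
        self.is_speech = False
        self.is_active = False
        self.transcript = ""
        self.confidence = 0.0
        self._handlers: DefaultDict[str, List[Callable]] = defaultdict(list)

    def add_handler(self, name: str, handler: Callable) -> None:
        self._handlers[name].append(handler)

    def event(self, name: str) -> None:
        # fire every handler registered under this event name
        for handler in self._handlers[name]:
            handler(self)

    def reset(self) -> None:
        self.is_speech = False
        self.is_active = False
        self.transcript = ""
        self.confidence = 0.0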
Example #12
def test_max_active():
    max_active = 500
    min_active = 20
    context = SpeechContext()
    timeout = ActivationTimeout(min_active=min_active, max_active=max_active)

    context.is_active = True

    # each timeout step is assumed to cover ~20 ms (inferred from this
    # arithmetic, not stated), so one extra step pushes past max_active
    steps_before_timeout = (max_active // 20) + 1
    for _ in range(steps_before_timeout):
        timeout(context)

    assert not context.is_active

    timeout.close()
Example #13
def test_processing():
    context = SpeechContext()

    sample_rate = 8000
    frequency = 2000

    agc = AutomaticGainControl(
        sample_rate=sample_rate,
        frame_width=10,
        target_level_dbfs=9,
        compression_gain_db=15,
    )

    # valid amplification
    frame = sin_frame(sample_rate, frequency, amplitude=0.08)
    level = rms(frame)
    agc(context, frame)
    assert rms(frame) > level

    # valid attenuation
    frame = sin_frame(sample_rate, frequency)
    level = rms(frame)
    agc(context, frame)
    assert rms(frame) < level

    agc.close()
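
sin_frame and rms are helpers not shown in these excerpts; plausible definitions (assumptions inferred from usage; the originals may differ) are:

import numpy as np

def sin_frame(sample_rate: int, frequency: float, amplitude: float = 0.8) -> np.ndarray:
    # hypothetical helper: one 10 ms sine frame scaled to PCM-16
    samples = sample_rate // 100
    t = np.arange(samples) / sample_rate
    return (np.sin(2 * np.pi * frequency * t) * amplitude * 32767).astype(np.int16)

def rms(frame: np.ndarray) -> float:
    # root-mean-square level of the frame
    return float(np.sqrt(np.mean(frame.astype(np.float64) ** 2)))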
Example #14
def test_invalid_frame_size():
    context = SpeechContext()
    agc = AutomaticGainControl()

    bad_frame = np.random.rand(100)
    with pytest.raises(ValueError):
        agc(context, bad_frame)
Example #15
def test_invalid_frame_dtype():
    context = SpeechContext()
    agc = AutomaticGainControl()

    bad_frame = np.random.rand(320)
    with pytest.raises(TypeError):
        agc(context, bad_frame)
Example #16
def test_invalid_dtype():
    context = SpeechContext()
    detector = VoiceActivityDetector()

    bad_frame = np.random.rand(160)
    with pytest.raises(Exception):
        detector(context, bad_frame)
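
Examples #14 through #16 pin down the validation contract: a wrong frame length raises ValueError, and a wrong dtype raises TypeError. A hypothetical helper enforcing both checks in that order (the name and the 320-sample default are assumptions):

import numpy as np

def validate_frame(frame: np.ndarray, frame_size: int = 320) -> None:
    # illustrative sketch of the contract the three tests above assert
    if len(frame) != frame_size:
        raise ValueError("invalid_frame_size")
    if not np.issubdtype(frame.dtype, np.signedinteger):
        raise TypeError("invalid_dtype")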
Example #17
def test_response():
    context = SpeechContext()
    recognizer = CloudSpeechRecognizer()
    recognizer._client._socket = mock.MagicMock()

    recognizer._client._socket.recv.return_value = json.dumps({
        "error": None,
        "final": False,
        "hypotheses": [{"confidence": 0.5, "transcript": "this is a test"}],
        "status": "ok",
    })

    frame = np.random.rand(160).astype(np.int16)

    # run through all the steps
    context.is_active = True
    recognizer(context, frame)
    recognizer(context, frame)
    context.is_active = False
    recognizer(context, frame)

    recognizer._client._socket.recv.return_value = json.dumps({
        "error": None,
        "final": True,
        "hypotheses": [{"confidence": 0.5, "transcript": "this is a test"}],
        "status": "ok",
    })
    # process the final frame with the final transcript
    recognizer(context, frame)

    assert context.transcript == "this is a test"
    assert context.confidence == 0.5

    recognizer.close()
Example #18
def test_recognize(*args):
    context = SpeechContext()
    recognizer = KeywordRecognizer(classes=["one", "two", "three"])

    test_frame = np.random.rand(160).astype(np.float32)

    context.is_active = True
    for i in range(10):
        recognizer(context, test_frame)
        recognizer(context, test_frame)

    context.is_active = False
    recognizer(context, test_frame)
    assert context.transcript == "one"

    recognizer.close()
Example #19
def test_timeout(*args):
    context = SpeechContext()
    recognizer = KeywordRecognizer(classes=["one", "two", "three"])
    recognizer.detect_model.return_value = [[[0.0, 0.0, 0.0]]]

    test_frame = np.random.rand(160).astype(np.float32)

    context.is_active = True
    for i in range(10):
        recognizer(context, test_frame)
        recognizer(context, test_frame)

    context.is_active = False
    recognizer(context, test_frame)
    assert not context.transcript

    recognizer.close()
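
Taken together, Examples #18 and #19 suggest the recognizer picks the class with the highest posterior and reports nothing when no class clears the threshold. A hypothetical sketch of that decision step (simplified; not the library's actual code):

import numpy as np

def classify(posteriors: np.ndarray, classes: list, threshold: float = 0.5) -> str:
    # highest-posterior class if it clears the threshold, else empty (timeout)
    best = int(np.argmax(posteriors))
    return classes[best] if posteriors[best] >= threshold else ""

classify(np.array([0.9, 0.05, 0.05]), ["one", "two", "three"])  # -> "one"
classify(np.array([0.0, 0.0, 0.0]), ["one", "two", "three"])    # -> ""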
Example #20
def test_vad_is_triggered(mock_class):
    context = SpeechContext()
    detector = VoiceActivityDetector(sample_rate=16000,
                                     frame_width=10,
                                     vad_rise_delay=0,
                                     vad_fall_delay=0)
    frame = np.zeros(160, np.int16)
    detector(context, frame)
    assert context.is_speech
    detector.close()
Example #21
def test_vad_rise_delay():
    context = SpeechContext()
    detector = VoiceActivityDetector(frame_width=10, vad_rise_delay=30)
    for i in range(3):
        frame = voice_frame()
        detector(context, frame)
        if i < 2:
            assert not context.is_speech
        else:
            assert context.is_speech
    detector.close()
Example #22
def test_vad_is_triggered():
    context = SpeechContext()
    detector = VoiceActivityDetector(frame_width=10)

    frame = silence_frame()
    detector(context, frame)
    assert not context.is_speech

    frame = voice_frame()
    detector(context, frame)
    assert context.is_speech

    detector.close()
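
voice_frame and silence_frame (used in Examples #21, #22, and #30) are likewise not shown; plausible definitions (assumptions; the originals may differ) are a loud tone and digital silence at 16 kHz with 10 ms frames, matching the frame_width=10 above:

import numpy as np

def voice_frame(sample_rate: int = 16000, frame_width_ms: int = 10) -> np.ndarray:
    # hypothetical helper: a full-scale 1 kHz tone the VAD should read as speech
    samples = sample_rate * frame_width_ms // 1000
    t = np.arange(samples) / sample_rate
    return (np.sin(2 * np.pi * 1000 * t) * 32767).astype(np.int16)

def silence_frame(sample_rate: int = 16000, frame_width_ms: int = 10) -> np.ndarray:
    # hypothetical helper: digital silence
    return np.zeros(sample_rate * frame_width_ms // 1000, np.int16)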
Example #23
    def _detect(self, context: SpeechContext) -> None:
        # read the full contents of the encode window and add a batch dimension,
        # then compute a scalar probability that the window contains the
        # wakeword with the detect model
        frame = self.encode_window.read_all()
        frame = np.expand_dims(frame, 0)
        posterior = self.detect_model(frame)[0][0][0]

        if posterior > self._posterior_max:
            self._posterior_max = posterior
        if posterior > self._posterior_threshold:
            context.is_active = True
            _LOG.info(f"wake: {self._posterior_max}")
Example #24
    def __call__(self, context: SpeechContext, frame: np.ndarray) -> None:
        """Activates speech context whenever speech is detected

        Args:
            context (SpeechContext): State-based information that needs to be shared
                between pieces of the pipeline
            frame (np.ndarray): Single frame of PCM-16 audio from an input source

        """
        if context.is_speech != self._is_speech:
            if context.is_speech:
                context.is_active = True
            self._is_speech = context.is_speech
Example #25
def test_vad_rise_delay(mock_class):
    context = SpeechContext()
    detector = VoiceActivityDetector(sample_rate=16000,
                                     frame_width=10,
                                     vad_rise_delay=30,
                                     vad_fall_delay=0)
    for i in range(3):
        frame = np.zeros(160, np.int16)
        detector(context, frame)
        if i < 2:
            assert not context.is_speech
        else:
            assert context.is_speech
    detector.close()
Example #26
    def _receive(self, context: SpeechContext) -> None:
        for response in self._client.streaming_recognize(
                self._config, self._drain()):
            for result in response.results[:1]:
                for alternative in result.alternatives[:1]:
                    context.transcript = alternative.transcript
                    context.confidence = alternative.confidence
                    if context.transcript:
                        context.event("partial_recognize")

                if result.is_final:
                    if context.transcript:
                        context.event("recognize")
                        _LOG.debug("recognize event")
                    else:
                        context.event("timeout")
                        _LOG.debug("timeout event")
Example #27
def test_receive(*args):
    context = SpeechContext()
    audio = np.zeros(160).astype(np.int16)
    recognizer = GoogleSpeechRecognizer(language="en-US", credentials="")
    recognizer._queue.put([audio, audio, audio])

    recognizer._client.streaming_recognize.return_value = [
        mock.Mock(
            results=[
                mock.Mock(alternatives=[mock.Mock(transcript="test", confidence=0.99)])
            ]
        )
    ]

    context.is_active = True
    for i in range(10):
        if i > 3:
            context.is_active = False
        recognizer(context, audio)

    recognizer._thread = mock.Mock()
    recognizer.reset()
    recognizer.close()
Example #28
    def _receive(self, context: SpeechContext) -> None:
        self._client.receive()
        hypotheses = self._client.response.get("hypotheses")
        if hypotheses:
            hypothesis = hypotheses[0]
            context.transcript = hypothesis["transcript"]
            context.confidence = hypothesis["confidence"]
            if context.transcript:
                context.event("partial_recognize")

        if self._client.is_final:
            if context.transcript:
                context.event("recognize")
                _LOG.debug("recognize event")
            else:
                context.event("timeout")
                _LOG.debug("timeout event")
Example #29
def test_handler():
    def on_speech(context):
        context.transcript = "event handled"

    context = SpeechContext()
    context.add_handler("recognize", on_speech)

    context.event("recognize")

    assert context.transcript == "event handled"
Example #30
def test_vad_fall_untriggered():
    context = SpeechContext()
    detector = VoiceActivityDetector(frame_width=10, vad_fall_delay=20)

    voice = voice_frame()
    silence = silence_frame()

    detector(context, voice)
    assert context.is_speech

    for i in range(10):
        detector(context, silence)
        assert context.is_speech

    detector(context, silence)
    assert not context.is_speech
    detector.close()