Example #1
def main(ARGS):
    # Load DeepSpeech model
    model_dir = "./models"
    ARGS.model = os.path.join(model_dir, 'output_graph.pbmm')
    ARGS.scorer = os.path.join(model_dir, 'kenlm.scorer')

    print('Initializing model...')
    logging.info("ARGS.model: %s", ARGS.model)
    model = Model(ARGS.model)

    # Start audio with VAD
    vad_audio = VADAudio(aggressiveness=ARGS.vad_aggressiveness,
                         device=ARGS.device,
                         input_rate=ARGS.rate,
                         file=ARGS.file)
    print("Listening (ctrl-C to exit)...")
    frames = vad_audio.vad_collector()

    # Stream from microphone to DeepSpeech using VAD
    spinner = None
    if not ARGS.nospinner:
        spinner = Halo(spinner='line')
    stream_context = model.createStream()
    wav_data = bytearray()
    for frame in frames:
        if frame is not None:
            if spinner: spinner.start()
            logging.debug("streaming frame")
            stream_context.feedAudioContent(np.frombuffer(frame, np.int16))
        else:
            if spinner: spinner.stop()
            logging.debug("end utterence")
            text = stream_context.finishStream()
            print("Recognized: %s" % text)
            stream_context = model.createStream()
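The snippet expects an ARGS namespace carrying vad_aggressiveness, device, rate, file and nospinner attributes (model and scorer are overwritten inside main()). A minimal sketch of the argparse glue that could supply it; the attribute names come from the code above, but the flags and defaults are assumptions rather than the original script's:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Stream from microphone to DeepSpeech using VAD")
    parser.add_argument('-v', '--vad_aggressiveness', type=int, default=3,
                        help="VAD aggressiveness, 0-3 (3 filters non-speech most aggressively)")
    parser.add_argument('-d', '--device', type=int, default=None,
                        help="PyAudio input device index (default: system default)")
    parser.add_argument('-r', '--rate', type=int, default=16000,
                        help="Input sample rate in Hz")
    parser.add_argument('-f', '--file', default=None,
                        help="Read audio from this .wav file instead of the microphone")
    parser.add_argument('--nospinner', action='store_true',
                        help="Disable the Halo progress spinner")
    main(parser.parse_args())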
Example #2
    def run(self, model, lmbin, trie):
        model = Model(model, BEAM_WIDTH)
        if lmbin:
            model.enableDecoderWithLM(lmbin, trie, LM_ALPHA, LM_BETA)
        stream = None

        while True:
            # Try to get the next command from our queue; use a timeout to check
            # periodically for a quit signal so the application doesn't hang on
            # exit.
            try:
                cmd, *data = self._in_queue.get(timeout=0.3)
            except queue.Empty:
                if self._should_quit:
                    break
                # If we haven't received a quit signal just continue trying to
                # get a command from the queue indefinitely
                continue

            if cmd == 'start':
                # 'start' means create a new stream
                stream = model.createStream()
            elif cmd == 'data':
                # 'data' means we received more audio data from the recorder
                if stream:
                    model.feedAudioContent(
                        stream, np.frombuffer(data[0].data(), np.int16))
            elif cmd == 'finish':
                # 'finish' means the caller wants the result of the current stream
                transcript = model.finishStream(stream)
                self.finished.emit(transcript)
                stream = None
Example #3
class InferenceThread(QObject):
    finished = Signal(str)

    def __init__(self):
        super(InferenceThread, self).__init__()
        self.in_queue = queue.Queue()
        self.should_quit = False
        self.worker = threading.Thread(target=self.run)

    def send_cmd(self, cmd):
        ''' Insert command in queue to be processed by the thread '''
        self.in_queue.put(cmd)

    def setQuit(self):
        ''' Signal to the thread that it should stop running '''
        self.should_quit = True

    def start(self):
        self.worker.start()

    def run(self):
        # Creating the model
        self.model = Model( os.path.join(os.path.dirname(__file__),
                     "deepspeech-0.6.1-models/output_graph.pbmm"),
                     BEAM_WIDTH)
        self.model.enableDecoderWithLM( os.path.join(os.path.dirname(__file__),
            "deepspeech-0.6.1-models/lm.binary"),
            os.path.join(os.path.dirname(__file__),
            "deepspeech-0.6.1-models/trie"), LM_ALPHA, LM_BETA)
        stream = None

        while True:
            # Try to get the next command from our queue; use a timeout to check
            # periodically for a quit signal so the application doesn't hang on
            # exit.
            try:
                cmd, *data = self.in_queue.get(timeout=0.3)
            except queue.Empty:
                if self.should_quit:
                    break
                # If we haven't received a quit signal just continue trying to
                # get a command from the queue indefinitely
                continue

            if cmd == "start":
                # "start" means create a new stream
                stream = self.model.createStream()
                logging.debug("Starts to process sound")
            elif cmd == "data":
                # "data" means we received more audio data from the recorder
                if stream:
                    self.model.feedAudioContent(stream,
                        np.frombuffer(data[0].data(), np.int16))
            elif cmd == "finish":
                # "finish" means the caller wants the result of the current stream
                transcript = self.model.finishStream(stream)
                self.finished.emit(transcript)
                stream = None
                logging.debug("Finishes to process sound")
Example #4
    def start(self,
              vad_aggressiveness: int = 3,
              callback: Callable[[str], None] = None):
        process = self.audio.connect(self.q)

        def read_audio():
            timeout = self.block_size * self.buffer_size / self.audio.sample_rate  # roughly the duration of audio the buffer holds
            while True:
                self.q.put(process.stdout.read(self.audio.read_size),
                           timeout=timeout)

        read_audio_thread = threading.Thread(target=read_audio, daemon=True)
        read_audio_thread.start()

        ds_model = Model(str(pbmm_path))
        ds_model.enableExternalScorer(str(scorer_path))
        stream_context = ds_model.createStream()

        vad_segmenter = VadSegmenter(vad_aggressiveness)
        previous_frame = None
        for frame in vad_segmenter.segmenter(
                self.q,
                block_size=self.block_size,
                sample_rate=self.sample_rate,
        ):
            if frame is not None:
                self.spinner.start()
                logger.debug("streaming frame")
                stream_context.feedAudioContent(np.frombuffer(frame, np.int16))
            else:
                self.spinner.stop()
                logger.debug("end utterence")

                text = stream_context.finishStream()
                logger.info(f"{text=}")
                if callback:
                    callback(text)
                else:
                    print("Recognized: %s" % text)
                stream_context = ds_model.createStream()
            if (frame is None) != (previous_frame is None):
                logger.info("Streaming frame"
                            if frame is not None else "End utterence")
                previous_frame = frame
Example #5
def main():
    parser = argparse.ArgumentParser(
        description='Running DeepSpeech inference.')
    parser.add_argument('--model',
                        required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument(
        '--alphabet',
        required=True,
        help=
        'Path to the configuration file specifying the alphabet used by the network'
    )
    parser.add_argument('--lm',
                        nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument(
        '--trie',
        nargs='?',
        help=
        'Path to the language model trie file created with native_client/generate_trie'
    )
    parser.add_argument('--audio1',
                        required=True,
                        help='First audio file to use in interleaved streams')
    parser.add_argument('--audio2',
                        required=True,
                        help='Second audio file to use in interleaved streams')
    args = parser.parse_args()

    ds = Model(args.model, args.alphabet, BEAM_WIDTH)

    if args.lm and args.trie:
        ds.enableDecoderWithLM(args.lm, args.trie, LM_ALPHA, LM_BETA)

    fin = wave.open(args.audio1, 'rb')
    fs1 = fin.getframerate()
    audio1 = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    fin.close()

    fin = wave.open(args.audio2, 'rb')
    fs2 = fin.getframerate()
    audio2 = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    fin.close()

    stream1 = ds.createStream()
    stream2 = ds.createStream()

    splits1 = np.array_split(audio1, 10)
    splits2 = np.array_split(audio2, 10)

    for part1, part2 in zip(splits1, splits2):
        ds.feedAudioContent(stream1, part1)
        ds.feedAudioContent(stream2, part2)

    print(ds.finishStream(stream1))
    print(ds.finishStream(stream2))
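This example targets an older DeepSpeech Python API in which the model object owns the streams (Model(model, alphabet, BEAM_WIDTH), enableDecoderWithLM, ds.feedAudioContent(stream, ...)). In newer releases (0.7 and later) the stream object carries its own methods; a sketch of the same interleaving under that API, reusing audio1 and audio2 from above (file names are placeholders):

ds = Model('deepspeech.pbmm')              # newer API: only the model path
ds.enableExternalScorer('deepspeech.scorer')

stream1 = ds.createStream()
stream2 = ds.createStream()

for part1, part2 in zip(np.array_split(audio1, 10), np.array_split(audio2, 10)):
    stream1.feedAudioContent(part1)        # streams own feed/finish in this API
    stream2.feedAudioContent(part2)

print(stream1.finishStream())
print(stream2.finishStream())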
Example #6
def app_sst_with_video(model_path: str, lm_path: str, lm_alpha: float,
                       lm_beta: float, beam: int):
    class AudioProcessor(AudioProcessorBase):
        frames_lock: threading.Lock
        frames: deque

        def __init__(self) -> None:
            self.frames_lock = threading.Lock()
            self.frames = deque([])

        async def recv_queued(self,
                              frames: List[av.AudioFrame]) -> List[av.AudioFrame]:
            with self.frames_lock:
                self.frames.extend(frames)

            # Return empty frames to be silent.
            new_frames = []
            for frame in frames:
                input_array = frame.to_ndarray()
                new_frame = av.AudioFrame.from_ndarray(
                    np.zeros(input_array.shape, dtype=input_array.dtype),
                    layout=frame.layout.name,
                )
                new_frame.sample_rate = frame.sample_rate
                new_frames.append(new_frame)

            return new_frames

    webrtc_ctx = webrtc_streamer(
        key="speech-to-text-w-video",
        mode=WebRtcMode.SENDRECV,
        audio_processor_factory=AudioProcessor,
        rtc_configuration={
            "iceServers": [{
                "urls": ["stun:stun.l.google.com:19302"]
            }]
        },
        media_stream_constraints={
            "video": True,
            "audio": True
        },
    )

    status_indicator = st.empty()

    if not webrtc_ctx.state.playing:
        return

    status_indicator.write("Loading...")
    text_output = st.empty()
    stream = None

    while True:
        if webrtc_ctx.audio_processor:
            if stream is None:
                from deepspeech import Model

                model = Model(model_path)
                model.enableExternalScorer(lm_path)
                model.setScorerAlphaBeta(lm_alpha, lm_beta)
                model.setBeamWidth(beam)

                stream = model.createStream()

                status_indicator.write("Model loaded.")

            sound_chunk = pydub.AudioSegment.empty()

            audio_frames = []
            with webrtc_ctx.audio_processor.frames_lock:
                while len(webrtc_ctx.audio_processor.frames) > 0:
                    frame = webrtc_ctx.audio_processor.frames.popleft()
                    audio_frames.append(frame)

            if len(audio_frames) == 0:
                time.sleep(0.1)
                status_indicator.write("No frame arrived.")
                continue

            status_indicator.write("Running. Say something!")

            for audio_frame in audio_frames:
                sound = pydub.AudioSegment(
                    data=audio_frame.to_ndarray().tobytes(),
                    sample_width=audio_frame.format.bytes,
                    frame_rate=audio_frame.sample_rate,
                    channels=len(audio_frame.layout.channels),
                )
                sound_chunk += sound

            if len(sound_chunk) > 0:
                sound_chunk = sound_chunk.set_channels(1).set_frame_rate(
                    model.sampleRate())
                buffer = np.array(sound_chunk.get_array_of_samples())
                stream.feedAudioContent(buffer)
                text = stream.intermediateDecode()
                text_output.markdown(f"**Text:** {text}")
        else:
            status_indicator.write("AudioReciver is not set. Abort.")
            break
Example #7
def app_sst(model_path: str, lm_path: str, lm_alpha: float, lm_beta: float,
            beam: int):
    webrtc_ctx = webrtc_streamer(
        key="speech-to-text",
        mode=WebRtcMode.SENDONLY,
        audio_receiver_size=1024,
        rtc_configuration={
            "iceServers": [{
                "urls": ["stun:stun.l.google.com:19302"]
            }]
        },
        media_stream_constraints={
            "video": False,
            "audio": True
        },
    )

    status_indicator = st.empty()

    if not webrtc_ctx.state.playing:
        return

    status_indicator.write("Loading...")
    text_output = st.empty()
    stream = None

    while True:
        if webrtc_ctx.audio_receiver:
            if stream is None:
                from deepspeech import Model

                model = Model(model_path)
                model.enableExternalScorer(lm_path)
                model.setScorerAlphaBeta(lm_alpha, lm_beta)
                model.setBeamWidth(beam)

                stream = model.createStream()

                status_indicator.write("Model loaded.")

            sound_chunk = pydub.AudioSegment.empty()
            try:
                audio_frames = webrtc_ctx.audio_receiver.get_frames(timeout=1)
            except queue.Empty:
                time.sleep(0.1)
                status_indicator.write("No frame arrived.")
                continue

            status_indicator.write("Running. Say something!")

            for audio_frame in audio_frames:
                sound = pydub.AudioSegment(
                    data=audio_frame.to_ndarray().tobytes(),
                    sample_width=audio_frame.format.bytes,
                    frame_rate=audio_frame.sample_rate,
                    channels=len(audio_frame.layout.channels),
                )
                sound_chunk += sound

            if len(sound_chunk) > 0:
                sound_chunk = sound_chunk.set_channels(1).set_frame_rate(
                    model.sampleRate())
                buffer = np.array(sound_chunk.get_array_of_samples())
                stream.feedAudioContent(buffer)
                text = stream.intermediateDecode()
                text_output.markdown(f"**Text:** {text}")
        else:
            status_indicator.write("AudioReciver is not set. Abort.")
            break
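Both Streamlit examples (app_sst_with_video and app_sst) rely on a common import header and on a caller that picks the model files; a minimal sketch of that glue is below. The paths, scorer weights and beam width are placeholders, not the original app's settings:

import queue
import threading
import time
from collections import deque
from typing import List

import av
import numpy as np
import pydub
import streamlit as st
from streamlit_webrtc import AudioProcessorBase, WebRtcMode, webrtc_streamer


def main():
    st.header("Real-time speech-to-text (DeepSpeech)")
    app_sst(
        model_path="models/deepspeech-0.9.3-models.pbmm",    # placeholder paths
        lm_path="models/deepspeech-0.9.3-models.scorer",
        lm_alpha=0.93,                                        # placeholder scorer weights
        lm_beta=1.18,
        beam=100,
    )


if __name__ == "__main__":
    main()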
Example #8
class DeepSpeechInput(AudioInput):
    """
    Input from DeepSpeech using the US English language model.
    """
    def __init__(self,
                 notifier,
                 rate=None,
                 wav_dir=None,
                 model=os.path.join(_MODEL_DIR, 'models.pbmm'),
                 scorer=os.path.join(_MODEL_DIR, 'models.scorer')):
        """
        @see AudioInput.__init__()

        :type  rate:
        :param rate:
            The override for the rate, if not the model's one.
        :type  wav_dir:
        :param wav_dir:
            Where to save the wave files, if anywhere.
        :type  model:
        :param model:
            The path to the DeepSpeech model file.
        :type  scorer:
        :param scorer:
            The path to the DeepSpeech scorer file.
        """
        # If these don't exist then DeepSpeech will segfault when inferring!
        if not os.path.exists(model):
            raise IOError("Not found: %s" % (model, ))

        # Load in and configure the model.
        LOG.info("Loading model from %s" % (model, ))
        self._model = Model(model)
        if os.path.exists(scorer):
            LOG.info("Loading scorer from %s" % (scorer, ))
            self._model.enableExternalScorer(scorer)

        # Handle any rate override
        if rate is None:
            rate = self._model.sampleRate()

        # We can now init the superclass
        super(DeepSpeechInput, self).__init__(notifier,
                                              format=pyaudio.paInt16,
                                              channels=1,
                                              rate=rate,
                                              wav_dir=wav_dir)

        # Where we put the stream context
        self._context = None

    def _feed_raw(self, data):
        """
        @see AudioInput._feed_raw()
        """
        if self._context is None:
            self._context = self._model.createStream()
        audio = numpy.frombuffer(data, numpy.int16)
        self._context.feedAudioContent(audio)

    def _decode(self):
        """
        @see AudioInput._decode()
        """
        if self._context is None:
            # No context means no tokens
            LOG.warning("Had no stream context to close")
            tokens = []
        else:
            # Finish up by finishing the decoding
            words = self._context.finishStream()
            LOG.info("Got: %s" % (words, ))
            self._context = None

            # And tokenize
            tokens = [
                Token(word.strip(), 1.0, True) for word in words.split(' ')
                if len(word.strip()) > 0
            ]
        return tokens
Example #9
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--scorer', required=False,
                        help='Path to the external scorer file')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--beam_width', type=int,
                        help='Beam width for the CTC decoder')
    parser.add_argument('--lm_alpha', type=float,
                        help='Language model weight (lm_alpha). If not specified, use default from the scorer package.')
    parser.add_argument('--lm_beta', type=float,
                        help='Word insertion bonus (lm_beta). If not specified, use default from the scorer package.')
    # parser.add_argument('--version', action=VersionAction,
    #                     help='Print version and exits')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    parser.add_argument('--json', required=False, action='store_true',
                        help='Output json from metadata with timestamp of each word')
    parser.add_argument('--candidate_transcripts', type=int, default=3,
                        help='Number of candidate transcripts to include in JSON output')
    parser.add_argument('--hot_words', type=str,
                        help='Hot-words and their boosts.')
    args = parser.parse_args()

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    ds = Model(args.model)
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.beam_width:
        ds.setBeamWidth(args.beam_width)

    desired_sample_rate = ds.sampleRate()

    if args.scorer:
        print('Loading scorer from files {}'.format(args.scorer), file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(args.scorer)
        scorer_load_end = timer() - scorer_load_start
        print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)

        if args.lm_alpha and args.lm_beta:
            ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

    if args.hot_words:
        print('Adding hot-words', file=sys.stderr)
        for word_boost in args.hot_words.split(','):
            word, boost = word_boost.split(':')
            ds.addHotWord(word, float(boost))

    fin = wave.open(args.audio, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
        print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs_orig, desired_sample_rate), file=sys.stderr)
        fs_new, audio = convert_samplerate(args.audio, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1/fs_orig)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    # sphinx-doc: python_ref_inference_start
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0]))
    elif args.json:
        print(metadata_json_output(ds.sttWithMetadata(audio, args.candidate_transcripts)))
    else:
        print(ds.stt(audio))
        test = ds.createStream().sentencefit(audio, "ka arohia katoatia te hāhi me ōna whakapono e te hapū o ōtākou")
        for t in test.tokens:
            print(f"letter: {t.letter}, confidence: {t.confidence}, timestep: {t.timestep}")

    # sphinx-doc: python_ref_inference_stop
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
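convert_samplerate(), metadata_to_string() and metadata_json_output() are helpers defined elsewhere in the client script. For reference, convert_samplerate() in the upstream DeepSpeech client resamples through SoX; a sketch along those lines (SoX must be on PATH):

import shlex
import subprocess
from shlex import quote

def convert_samplerate(audio_path, desired_sample_rate):
    sox_cmd = ('sox {} --type raw --bits 16 --channels 1 --rate {} '
               '--encoding signed-integer --endian little --compression 0.0 '
               '--no-dither - ').format(quote(audio_path), desired_sample_rate)
    try:
        output = subprocess.check_output(shlex.split(sox_cmd), stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        raise RuntimeError('SoX returned non-zero status: {}'.format(e.stderr))
    except OSError as e:
        raise OSError(e.errno,
                      'SoX not found, use {}hz files or install it: {}'.format(
                          desired_sample_rate, e.strerror))
    return desired_sample_rate, np.frombuffer(output, np.int16)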
Example #10
class SpeechToTextEngine:
    """ Class to perform speech-to-text transcription and related functionality """

    FORMAT = pyaudio.paInt16
    SAMPLE_RATE = 16000
    CHANNELS = 1
    BLOCKS_PER_SECOND = 50

    def __init__(self, scorer='deepspeech_model.scorer') -> None:
        """ Initializing the DeepSpeech model """

        self.model = Model(model_path=Path(__file__).parents[2].joinpath(
            'deepspeech_model.pbmm').absolute().as_posix())
        self.model.enableExternalScorer(scorer_path=Path(
            __file__).parents[2].joinpath(scorer).absolute().as_posix())
        self.vad = webrtcvad.Vad(mode=3)
        self.sample_rate = self.SAMPLE_RATE
        self.buffer_queue = queue.Queue()

    def run(self, audio) -> str:
        """ Receives the audio,  normalizes it and is sent to the model to be transcribed. Returns the result of the
        transcribe audio in string format."""

        normalized_audio = normalize_audio_input(audio)
        audio_streams = BytesIO(normalized_audio)
        with wave.Wave_read(audio_streams) as wav:
            audio_streams = np.frombuffer(wav.readframes(wav.getnframes()),
                                          np.int16)
        results = self.model.stt(audio_buffer=audio_streams)
        return results

    def run_with_metadata(self, audio) -> Metadata:
        normalized_audio = normalize_audio_input(audio)
        audio_streams = BytesIO(normalized_audio)
        with wave.Wave_read(audio_streams) as wav:
            audio_streams = np.frombuffer(wav.readframes(wav.getnframes()),
                                          np.int16)
        results = self.model.sttWithMetadata(audio_buffer=audio_streams)
        return results

    def add_hot_words(self, data) -> list:
        """ Receives data in form of hot-words and boosts, adds them to the language model and return the list of the
        added hot-words """

        all_hot_words = []
        try:
            logger.info('----------------------------------------------------')
            for hot_word in data:
                # Change all the characters of the hot-word to lower case
                word = hot_word.lower()

                # Get numeric value of the boost
                boost = float(data.get(hot_word))

                # Adding the hot-word and its boost to the language model
                self.model.addHotWord(word, boost)

                # Log the hot-word that was added
                logger.info(
                    f"`{word}` hot-word with boost `{boost}` was added.")
                all_hot_words.append(word)
            return all_hot_words
        except RuntimeError:
            return []

    def erase_hot_word(self, hot_words) -> None:
        try:
            for hot_word in hot_words:
                self.model.eraseHotWord(hot_word)
                logger.info(f"`{hot_word}` hot-word is erased.")
            logger.info('----------------------------------------------------')
        except RuntimeError:
            return

    def clear_hot_words(self) -> str:
        try:
            self.model.clearHotWords()
            return f"All hot-words were erased."
        except RuntimeError:
            return f"No more hot-words are left."

    def deep_stream(self):
        return self.model.createStream()

    def frame_generator(self, audio, sample_rate=16000, frame_duration_ms=30):
        """
        Takes the desired frame duration in milliseconds, the PCM data, and
        the sample rate. Yields Frames of the requested duration.
        """

        # audio = np.frombuffer(audio, np.int16)
        n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
        offset = 0
        timestamp = 0.0
        duration = (float(n) / sample_rate) / 2.0
        while offset + n < len(audio):
            yield Frame(audio[offset:offset + n], timestamp, duration)
            timestamp += duration
            offset += n
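# Note (not part of the original snippet): frame_generator() above yields Frame
# objects, but the Frame type itself is not shown. A minimal stand-in modelled on
# the webrtcvad example code, plus a hypothetical use with the class's own VAD:
from collections import namedtuple

Frame = namedtuple("Frame", ["bytes", "timestamp", "duration"])

# engine = SpeechToTextEngine()
# for frame in engine.frame_generator(pcm):   # pcm: 16 kHz mono 16-bit PCM bytes
#     print(frame.timestamp, engine.vad.is_speech(frame.bytes, engine.sample_rate))
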
class DetectAndDeter:
    CLASSIFICATION_COUNT = 5
    TELEMARKETER_THRESH = 0.3
    VALID_CALLER_THRESH = 0.1
    IN_AUDIO_RATE = 8000
    DS_AUDIO_RATE = 16000
    MOZILLA_TTS_AUDIO_RATE = 22050
    QUIET_THRESH = 150
    QUIET_LENGTH = 3000

    def __init__(self, name):
        self.name = name  # user's name  e.g. "Bob Ross"
        self.valid_caller_event = Event()
        self.caller_audio_chunk = np.array([], dtype='int16')

        self.audio_in_queue = Queue()
        self.stt_to_classification_queue = Queue()
        self.stt_to_chatbot_queue = Queue()
        self.chatbot_to_tts_queue = Queue()
        self.audio_out_queue = Queue()

        self.manager = Manager()
        self.transcript = self.manager.list()
        self.is_telemarketer = self.manager.Value("is_telemarketer", None)

        self.deep_speech = None
        self.mozilla_tts = None

        self.final_transcript = None
        self.final_predictions = None

        self.speech_to_text_thread = Process(target=self.speech_to_text)
        self.classify_text_thread = Process(target=self.classify_text)
        self.generate_response_thread = Process(target=self.generate_responses)
        self.text_to_speech_thread = Process(target=self.text_to_speech)

        self.log = {
            "start": None,
            "end": None,
            "version": CONFIG['version'],
            "transcript": [],
            "is_telemarketer": None,
            "caller": None
        }

    @property
    def queues(self):
        return self.audio_in_queue, self.audio_out_queue

    def start(self):
        self.speech_to_text_thread.start()
        self.classify_text_thread.start()
        self.generate_response_thread.start()
        self.text_to_speech_thread.start()

        self.log["start"] = dt.datetime.now().isoformat()

    def close(self):
        self.log["transcript"] = [value for value in self.transcript]
        self.log["is_telemarketer"] = self.is_telemarketer.value
        self.log["end"] = dt.datetime.now().isoformat()

        self.speech_to_text_thread.terminate()
        self.speech_to_text_thread.join()
        self.speech_to_text_thread.close()

        self.classify_text_thread.terminate()
        self.classify_text_thread.join()
        self.classify_text_thread.close()

        self.generate_response_thread.terminate()
        self.generate_response_thread.join()
        self.generate_response_thread.close()

        self.text_to_speech_thread.terminate()
        self.text_to_speech_thread.join()
        self.text_to_speech_thread.close()

    def fill_log_info(self, caller_number):
        self.log['caller'] = caller_number
        return self.log

    def classify_text(self):
        predictions = []
        while self.is_telemarketer.value is None:
            idx = self.stt_to_classification_queue.get()
            text = self.transcript[idx]['text']

            preds = model.predict(text)
            transcript_line = self.transcript[idx]
            transcript_line['analysis'] = {
                "prediction": str(preds[0]).lower(),
                "confidence": float(max(preds[2]))
            }
            self.transcript[idx] = transcript_line
            predictions.append(str(preds[0]).lower())

            maybe_telemarketer = predictions.count("persuasion") / len(
                predictions)

            if len(predictions) > self.CLASSIFICATION_COUNT:
                print("CLASS")
                print(maybe_telemarketer, self.TELEMARKETER_THRESH,
                      self.VALID_CALLER_THRESH)
                if maybe_telemarketer > self.TELEMARKETER_THRESH:
                    self.is_telemarketer.value = True
                    break
                elif maybe_telemarketer < self.VALID_CALLER_THRESH:
                    self.is_telemarketer.value = False
                    # self.is_telemarketer.set()
                    break

        if not self.is_telemarketer.value:
            self.valid_caller_event.set()

    def generate_responses(self):
        while True:
            text = self.stt_to_chatbot_queue.get()
            print("Generate Response:", text)
            response = str(chatbot.get_response(text))

            self.chatbot_to_tts_queue.put(response)

    def text_to_speech(self):
        tts_config = CONFIG['tts_config']
        models_folder = Path(tts_config['folder'])

        model_path = str(models_folder / tts_config['model'])
        model_config_path = str(models_folder / tts_config['model_config'])
        vocoder_path = str(models_folder / tts_config['vocoder'])
        vocoder_config_path = str(models_folder / tts_config['vocoder_config'])

        self.mozilla_tts = Synthesizer(model_path, model_config_path,
                                       vocoder_path, vocoder_config_path)

        while True:
            response = self.chatbot_to_tts_queue.get()
            print("TTS:", response)

            sound_arr = np.array(self.mozilla_tts.tts(response))

            sound_arr *= 2**15
            sound_arr = sound_arr.astype('int16')

            sound = bytes(sound_arr)
            sound, _ = audioop.ratecv(sound, 2, 1, self.MOZILLA_TTS_AUDIO_RATE,
                                      self.IN_AUDIO_RATE, None)

            ulaw_sound = audioop.lin2ulaw(sound, 2)

            chunk_len = 540
            chunks = len(ulaw_sound) // chunk_len
            extra = len(ulaw_sound) - (chunks * chunk_len)

            for c in range(chunks):
                chunk = ulaw_sound[c * chunk_len:c * chunk_len + chunk_len]
                self.audio_out_queue.put(
                    base64.b64encode(chunk).decode('utf-8'))

            if extra != 0:
                chunk = ulaw_sound[-extra:]
                self.audio_out_queue.put(
                    base64.b64encode(chunk).decode('utf-8'))

            self.transcript.append({
                "speaker": "self",
                "text": response,
                "datetime": dt.datetime.now().isoformat()
            })

    def speech_to_text(self):
        stt_config = CONFIG['stt_config']
        models_folder = Path(stt_config['folder'])
        model_path = str(models_folder / stt_config['model'])
        scorer_path = str(models_folder / stt_config['scorer'])

        self.deep_speech = Model(model_path)
        self.deep_speech.enableExternalScorer(scorer_path)

        stream = self.deep_speech.createStream()

        while True:
            speech = self.audio_in_queue.get()

            while not self.audio_in_queue.empty():
                speech += self.audio_in_queue.get()

            lin_speech = audioop.ulaw2lin(speech, 2)
            ds_speech, _ = audioop.ratecv(lin_speech, 2, 1, self.IN_AUDIO_RATE,
                                          self.DS_AUDIO_RATE, None)

            lin_speech_arr = np.frombuffer(lin_speech, np.int16)
            ds_speech_arr = np.frombuffer(ds_speech, np.int16)

            stream.feedAudioContent(ds_speech_arr)

            self.caller_audio_chunk = np.concatenate(
                (self.caller_audio_chunk, lin_speech_arr))

            chunk_idx = max(0,
                            len(self.caller_audio_chunk) - self.QUIET_LENGTH)
            quiet_chunk = self.caller_audio_chunk[chunk_idx:]
            if (quiet_chunk < self.QUIET_THRESH).all() and (
                    self.caller_audio_chunk > self.QUIET_THRESH).any():
                text = stream.intermediateDecode()

                if text.strip():
                    self.stt_to_chatbot_queue.put(text)

                    idx = len(self.transcript
                              )  # insert to avoid race conditions with indexes
                    self.transcript.insert(
                        idx, {
                            "speaker": "caller",
                            "text": text,
                            "datetime": dt.datetime.now().isoformat()
                        })
                    self.stt_to_classification_queue.put(idx)

                    stream.finishStream()
                    stream = self.deep_speech.createStream()

                self.caller_audio_chunk = np.array([], dtype='int16')

    def make_greeting(self, one_party_consent):
        self.chatbot_to_tts_queue.put(
            f"Hi. This is {self.name} how may I help you?")

        if not one_party_consent:
            self.chatbot_to_tts_queue.put("Keep in mind, I record all calls")