Example #1
def ModelInitiate(model_file_path, lm_file_path, lm_alpha, lm_beta, beam_width):
  model = Model(model_file_path)
  model.enableExternalScorer(lm_file_path)

  model.setScorerAlphaBeta(lm_alpha, lm_beta)
  model.setBeamWidth(beam_width)
  return model
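A minimal usage sketch for this helper; the file names and hyperparameter values are placeholders, not values from the original repository:

import wave
import numpy as np

model = ModelInitiate('deepspeech-0.9.3-models.pbmm',
                      'deepspeech-0.9.3-models.scorer',
                      lm_alpha=0.93, lm_beta=1.18, beam_width=500)
with wave.open('sample.wav', 'rb') as fin:  # hypothetical 16 kHz mono WAV
    audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
print(model.stt(audio))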
Example #2
def transcribe_many_parallel(args, filepaths):
    for index, filepath in enumerate(filepaths):
        ds = Model(args.model)

        if args.beam_width:
            ds.setBeamWidth(args.beam_width)

        if args.scorer:
            print('Loading scorer from files {}'.format(args.scorer),
                  file=sys.stderr)
            scorer_load_start = timer()
            ds.enableExternalScorer(args.scorer)
            scorer_load_end = timer() - scorer_load_start
            print('Loaded scorer in {:.3}s.'.format(scorer_load_end),
                  file=sys.stderr)

            if args.lm_alpha and args.lm_beta:
                ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

        if args.hot_words:
            print('Adding hot-words', file=sys.stderr)
            for word_boost in args.hot_words.split(','):
                word, boost = word_boost.split(':')
                ds.addHotWord(word, float(boost))
        p = Process(target=transcribe_file, args=(args, ds, filepath, index))
        p.start()
        p.join()  # joining here blocks until the worker finishes, so files are handled one at a time
        print('{}: Transcribed file {} of {} from "{}"'.format(
            time.strftime("%H:%M:%S", time.localtime()), index + 1,
            len(filepaths), filepath))
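Because p.join() is called inside the loop, the function above processes files one at a time despite its name. A sketch of a genuinely parallel variant, assuming transcribe_file is importable and reusing the create_deepspeech_model helper from Example #4 below; building the Model inside each worker avoids sharing a native handle across processes:

from multiprocessing import Process

def _transcribe_worker(args, filepath, index):
    ds = create_deepspeech_model(args)  # helper from Example #4
    transcribe_file(args, ds, filepath, index)

def transcribe_many_parallel_v2(args, filepaths):
    processes = [Process(target=_transcribe_worker, args=(args, fp, i))
                 for i, fp in enumerate(filepaths)]
    for p in processes:  # start every worker before joining any of them
        p.start()
    for p in processes:
        p.join()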
Example #3
def recognize_DS(audio1, data):
    beam_width = 500  # how many alternative word sequences the decoder keeps
    model_name = data['wake']['model name']
    ds = Model(model_name)
    ds.setBeamWidth(beam_width)
    audio1 = np.frombuffer(audio1.frame_data, np.int16)  # convert raw frames into a numpy array
    return ds.stt(audio1)  # return the predicted transcript
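A possible caller, assuming the config-dict layout used above and the speech_recognition package for capture (its AudioData objects expose raw PCM bytes as frame_data):

import speech_recognition as sr

config = {'wake': {'model name': 'deepspeech-0.9.3-models.pbmm'}}  # placeholder path
r = sr.Recognizer()
with sr.Microphone(sample_rate=16000) as source:  # DeepSpeech models expect 16 kHz mono
    captured = r.listen(source)
print(recognize_DS(captured, config))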
Example #4
def create_deepspeech_model(args):
    ds = Model(args.model)

    if args.beam_width:
        ds.setBeamWidth(args.beam_width)

    if args.scorer:
        print('Loading scorer from files {}'.format(args.scorer),
              file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(args.scorer)
        scorer_load_end = timer() - scorer_load_start
        print('Loaded scorer in {:.3}s.'.format(scorer_load_end),
              file=sys.stderr)

        if args.lm_alpha and args.lm_beta:
            ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

    if args.hot_words:
        print('Adding hot-words', file=sys.stderr)
        for word_boost in args.hot_words.split(','):
            word, boost = word_boost.split(':')
            ds.addHotWord(word, float(boost))

    return ds
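A hedged call sketch: argparse.Namespace stands in for parsed command-line arguments, the paths are placeholders, and the hot-word string follows the word:boost,word:boost format the loop above parses:

from argparse import Namespace

args = Namespace(model='deepspeech-0.9.3-models.pbmm',
                 scorer='deepspeech-0.9.3-models.scorer',
                 beam_width=500, lm_alpha=0.93, lm_beta=1.18,
                 hot_words='activate:10.0,stop:8.5')
ds = create_deepspeech_model(args)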
Example #5
def run():
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser(description='DeepSpeech Server')
    parser.add_argument('--port',
                        default=3337,
                        type=int,
                        help='Port to listen on')
    parser.add_argument('--model',
                        required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--scorer',
                        required=False,
                        help='Path to the external scorer file')
    parser.add_argument('--beam_width',
                        type=int,
                        help='Beam width for the CTC decoder')
    parser.add_argument(
        '--lm_alpha',
        type=float,
        help=
        'Language model weight (lm_alpha). If not specified, use default from the scorer package.'
    )
    parser.add_argument(
        '--lm_beta',
        type=float,
        help=
        'Word insertion bonus (lm_beta). If not specified, use default from the scorer package.'
    )
    parser.add_argument('--google_key',
                        help="Google Speech-Recognition API key.")
    args = parser.parse_args()

    ds = Model(args.model)
    if args.beam_width:
        ds.setBeamWidth(args.beam_width)
    if args.scorer:
        ds.enableExternalScorer(args.scorer)
        if args.lm_alpha and args.lm_beta:
            ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)
    handler_class = ReqHandlerFactory(ds, args.google_key)

    server_address = ('', args.port)
    httpd = HTTPServer(server_address, handler_class)
    logging.info('Starting httpd...\n')
    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
        pass
    httpd.server_close()
    logging.info('Stopping httpd...\n')
Example #6
def speech_to_text(input_file,
                   file_length,
                   return_speed_per_chunk=False,
                   chunk_size=10):
    """
    Compute the words pronounced in the input_file
    :param input_file: sound file path
    :param file_length: time length of the input file (in seconds)
    :param return_speed_per_chunk: if True, return a list with the words of each chunk; if False, return all the words as a single string
    :param chunk_size: chunk length in seconds
    :return: words as a string (or a list of strings per chunk)
    """
    # setup the model
    if return_speed_per_chunk:
        result = []
    else:
        result = ""
    recognizer = Model("models/deepspeech-0.8.2-models.pbmm")
    recognizer.setBeamWidth(2000)
    recognizer.enableExternalScorer("models/deepspeech-0.8.2-models.scorer")
    desired_sample_rate = recognizer.sampleRate()
    # convert input file into smaller audio chunks (apparently works better)
    CHUNK_SIZE = chunk_size
    n_chunks = int(file_length // CHUNK_SIZE)
    for i in range(n_chunks):
        tfm = sox.Transformer()
        tfm.trim(i * CHUNK_SIZE, (i + 1) * CHUNK_SIZE)
        tfm.set_output_format(channels=1)
        tfm.build(input_file, "temp_folder/chunked_file{}.wav".format(i))
        #cmb = sox.Combiner()
        input_list = [
            "audio-files/silence.wav",
            "temp_folder/chunked_file{}.wav".format(i),
            "audio-files/silence.wav"
        ]
        input_list_correct_sample_rate = list(
            map(lambda file: convert_samplerate(file, desired_sample_rate)[1],
                input_list))
        audio = np.concatenate(input_list_correct_sample_rate)
        #cmb.build(input_list, "temp_folder/chunked_file_with_silence{}.wav".format(i), combine_type="concatenate")
        #fs, audio = convert_samplerate("temp_folder/chunked_file_with_silence{}.wav".format(i), desired_sample_rate)
        if return_speed_per_chunk:
            result.append(recognizer.stt(audio))
        else:
            result += recognizer.stt(audio)
        os.remove("temp_folder/chunked_file{}.wav".format(i))
        #os.remove("temp_folder/chunked_file_with_silence{}.wav".format(i))
    print(result)
    return result
Example #7
def get_model(lang):
    ds_model = Model(DS_PARAM[lang]['model'])
    if DS_PARAM[lang].get('beam_width'):
        ds_model.setBeamWidth(DS_PARAM[lang]['beam_width'])
    if DS_PARAM[lang].get('scorer'):
        print('Loading scorer from files {}'.format(DS_PARAM[lang]['scorer']),
              file=sys.stderr)
        scorer_load_start = timer()
        ds_model.enableExternalScorer(DS_PARAM[lang]['scorer'])
        scorer_load_end = timer() - scorer_load_start
        print('Loaded scorer in {:.3}s.'.format(scorer_load_end),
              file=sys.stderr)
        if DS_PARAM[lang].get('lm_alpha') and DS_PARAM[lang].get('lm_beta'):
            ds_model.setScorerAlphaBeta(DS_PARAM[lang]['lm_alpha'],
                                        DS_PARAM[lang]['lm_beta'])
    return ds_model
Example #8
def get_model(lang):
    ds_model = Model(DS_PARAM[lang]["model"])
    if DS_PARAM[lang].get("beam_width"):
        ds_model.setBeamWidth(DS_PARAM[lang]["beam_width"])
    if DS_PARAM[lang].get("scorer"):
        print(
            "Loading scorer from files {}".format(DS_PARAM[lang]["scorer"]),
            file=sys.stderr,
        )
        scorer_load_start = timer()
        ds_model.enableExternalScorer(DS_PARAM[lang]["scorer"])
        scorer_load_end = timer() - scorer_load_start
        print("Loaded scorer in {:.3}s.".format(scorer_load_end),
              file=sys.stderr)
        if DS_PARAM[lang].get("lm_alpha") and DS_PARAM[lang].get("lm_beta"):
            ds_model.setScorerAlphaBeta(DS_PARAM[lang]["lm_alpha"],
                                        DS_PARAM[lang]["lm_beta"])
    return ds_model
Example #9
def stt(model_path,
        audio,
        beam_width=None,
        scorer_path=None,
        lm_alpha=None,
        lm_beta=None,
        hot_words=None):
    ds = Model(model_path)

    if beam_width:
        ds.setBeamWidth(beam_width)

    desired_sample_rate = ds.sampleRate()

    if scorer_path:
        ds.enableExternalScorer(scorer_path)

        if lm_alpha and lm_beta:
            ds.setScorerAlphaBeta(lm_alpha, lm_beta)

    # TODO
    # if hot_words:
    #     print('Adding hot-words', file=sys.stderr)
    #     for w in hot_words:
    #         ds.addHotWord(w, 6.2)

    fin = wave.open(audio, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
        print(
            f'ERROR: original sample rate ({fs_orig}) differs from the expected {desired_sample_rate} Hz.',
            file=sys.stderr)
        exit(1)

    audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    fin.close()

    print('Running inference.', file=sys.stderr)
    res = ds.sttWithMetadata(audio, 1)
    res = postprocess_metadata(res)
    return res
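A usage sketch with placeholder paths; postprocess_metadata is assumed to be defined alongside the function above:

result = stt('deepspeech-0.9.3-models.pbmm',
             'sample.wav',
             beam_width=500,
             scorer_path='deepspeech-0.9.3-models.scorer')
print(result)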
Example #10
def predict_speech_to_text(stream_file):
    alpha = 0.85
    beta = 1.75
    beam_width = 500

    # Initialize the model
    speech_model = Model(MODEL_PATH)

    # Enable language scorer to improve the accuracy
    speech_model.enableExternalScorer(SCORER_PATH)

    # Set beam width. A larger value can give better results at the cost of decoding time.
    speech_model.setBeamWidth(beam_width)

    # Set the scorer's language model weight (alpha) and word insertion weight (beta)
    speech_model.setScorerAlphaBeta(alpha, beta)

    # Use scipy to convert the wav file into a numpy array
    _, audio = wav.read(stream_file)
    return speech_model.stt(audio)
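For reference, a hedged call: scipy's wav.read returns a (rate, samples) pair, and since the function does not resample, the file should already be 16 kHz mono:

text = predict_speech_to_text('speech_stream.wav')  # hypothetical 16 kHz mono WAV
print(text)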
Example #11
def process_input_file(conn, options, out_queue, background=True):
    """Given socket/pipe process audio input and push to out_queue"""
    log.info("Starting recognition on %s", conn)
    model = Model(options.model,)
    if options.beam_width:
        model.setBeamWidth(options.beam_width)
    desired_sample_rate = model.sampleRate()
    if desired_sample_rate != defaults.SAMPLE_RATE:
        log.error("Model expects rate of %s", desired_sample_rate)
    # if options.scorer:
    #     model.enableExternalScorer(options.scorer)
    # else:
    log.info("Disabling the built-in scorer")
    model.disableExternalScorer()
    out_queue.put({'partial': False, 'final': False, 'message': ['Connected']})
    if background:
        thread = threading.Thread(target=run_recognition, args=(model, conn, out_queue))
        thread.daemon = background
        thread.start()
    else:
        run_recognition(model, conn, out_queue)
Example #12
def process_input_file(conn, options, out_queue, background=True):
    # TODO: allow socket connections from *clients* to choose
    # the model rather than setting it in the daemon...
    # to be clear, *output* clients, not audio sinks
    log.info("Starting recognition on %s", conn)
    model = Model(options.model,)
    if options.beam_width:
        model.setBeamWidth(options.beam_width)
    desired_sample_rate = model.sampleRate()
    if desired_sample_rate != defaults.SAMPLE_RATE:
        log.error("Model expects rate of %s", desired_sample_rate)
    if options.scorer:
        model.enableExternalScorer(options.scorer)
    else:
        log.info("Disabling the scorer")
        model.disableExternalScorer()
    if background:
        t = threading.Thread(target=run_recognition, args=(model, conn, out_queue))
        t.daemon = background
        t.start()
    else:
        run_recognition(model, conn, out_queue)
Example #13
    def load_deepspeech_model(self):
        model = os.path.join(self.deepspeech_models_folder,
                             "deepspeech-0.9.3-models.pbmm")
        scorer = os.path.join(self.deepspeech_models_folder,
                              "deepspeech-0.9.3-models.scorer")
        lm_alpha = 0.93
        lm_beta = 1.18
        beam_width = 100

        model_load_start = timer()
        deepspeech_model = Model(model)
        model_load_end = timer() - model_load_start
        logger.debug("Loaded model in %0.3fs." % (model_load_end))
        scorer_load_start = timer()

        deepspeech_model.enableExternalScorer(scorer)
        deepspeech_model.setScorerAlphaBeta(lm_alpha, lm_beta)
        deepspeech_model.setBeamWidth(beam_width)

        scorer_load_end = timer() - scorer_load_start
        logger.debug("Loaded external scorer in %0.3fs." % (scorer_load_end))

        return deepspeech_model
Example #14
def load(model, scorer, verbose=True, beam_width="", lm_alpha="", lm_beta="", hot_words=""):
    """ Load models"""

    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    ds = Model(model)
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
    if verbose:
        print('\nLoading model from files {}'.format(model), file=sys.stderr)
        print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if beam_width:
        ds.setBeamWidth(beam_width)

    desired_sample_rate = ds.sampleRate()

    if scorer:
        if verbose:
            print('Loading scorer from files {}'.format(scorer), file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(scorer)
        scorer_load_end = timer() - scorer_load_start
        if verbose:
            print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)

        if lm_alpha and lm_beta:
            ds.setScorerAlphaBeta(lm_alpha, lm_beta)

    if hot_words:
        if verbose:
            print('Adding hot-words', file=sys.stderr)
        for word_boost in hot_words.split(','):
            word, boost = word_boost.split(':')
            ds.addHotWord(word, float(boost))
    return ds, desired_sample_rate
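A hedged usage sketch with placeholder paths, showing the (model, sample rate) pair the function returns:

ds, desired_sample_rate = load('deepspeech-0.9.3-models.pbmm',
                               'deepspeech-0.9.3-models.scorer',
                               beam_width=500,
                               hot_words='activate:7.5')
print('model expects {} Hz input'.format(desired_sample_rate))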
Example #15
    def __init__(self):  # relies on a module-level argparse "args" namespace

        print('Loading model from file {}'.format(args.model), file=sys.stderr)
        model_load_start = timer()
        # sphinx-doc: python_ref_model_start
        model_path = os.path.dirname(os.path.abspath(__file__))

        ds = Model(os.path.join(model_path, args.model))
        # sphinx-doc: python_ref_model_stop
        model_load_end = timer() - model_load_start
        print('Loaded model in {:.3}s.'.format(model_load_end),
              file=sys.stderr)

        if args.beam_width:
            ds.setBeamWidth(args.beam_width)

        self.desired_sample_rate = ds.sampleRate()

        if args.scorer:
            print('Loading scorer from files {}'.format(args.scorer),
                  file=sys.stderr)
            scorer_load_start = timer()
            ds.enableExternalScorer(os.path.join(model_path, args.scorer))
            scorer_load_end = timer() - scorer_load_start
            print('Loaded scorer in {:.3}s.'.format(scorer_load_end),
                  file=sys.stderr)

            if args.lm_alpha and args.lm_beta:
                ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

        if args.hot_words:
            print('Adding hot-words', file=sys.stderr)
            for word_boost in args.hot_words.split(','):
                word, boost = word_boost.split(':')
                ds.addHotWord(word, float(boost))
        self.ds = ds
Example #16
        def setup_model(model_path, scorer, beam_width):
            log("creating model {} with scorer {}...".format(model_path, scorer))
            model = Model(model_path)

            if scorer.scorer is not None:
                model.enableExternalScorer(scorer.scorer)
                if scorer.lm_alpha is not None and scorer.lm_beta is not None:
                    if model.setScorerAlphaBeta(scorer.lm_alpha, scorer.lm_beta) != 0:
                        raise RuntimeError("Unable to set scorer parameters")

            if beam_width is not None:
                if model.setBeamWidth(beam_width) != 0:
                    raise RuntimeError("Unable to set beam width")

            log("model is ready.")
            return model
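The scorer argument only needs scorer, lm_alpha and lm_beta attributes, so a SimpleNamespace is enough for a quick sketch; the paths are placeholders and the log helper used above is assumed to be in scope:

from types import SimpleNamespace

scorer_cfg = SimpleNamespace(scorer='deepspeech-0.9.3-models.scorer',
                             lm_alpha=0.93, lm_beta=1.18)
model = setup_model('deepspeech-0.9.3-models.pbmm', scorer_cfg, beam_width=500)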
Example #17
def main():
    parser = argparse.ArgumentParser(
        description='Running DeepSpeech inference.')
    parser.add_argument('--model',
                        required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--scorer',
                        required=False,
                        help='Path to the external scorer file')
    parser.add_argument(
        '--prediction_in',
        required=True,
        help='Path to the directory with sound files (mp3/ogg/wav) to analyze')
    parser.add_argument(
        '--prediction_out',
        required=True,
        help='Path to the directory for moving the processed sound files to')
    parser.add_argument(
        '--prediction_tmp',
        required=False,
        help=
        'Path to the temp directory for storing the predictions initially before moving them to "--prediction_out"'
    )
    parser.add_argument(
        '--continuous',
        action='store_true',
        help='Whether to continuously load sound files and perform prediction',
        required=False,
        default=False)
    parser.add_argument(
        '--delete_input',
        action='store_true',
        help=
        'Whether to delete the input files rather than move them to "--prediction_out" directory',
        required=False,
        default=False)
    parser.add_argument('--beam_width',
                        type=int,
                        help='Beam width for the CTC decoder')
    parser.add_argument(
        '--lm_alpha',
        type=float,
        help=
        'Language model weight (lm_alpha). If not specified, use default from the scorer package.'
    )
    parser.add_argument(
        '--lm_beta',
        type=float,
        help=
        'Word insertion bonus (lm_beta). If not specified, use default from the scorer package.'
    )
    parser.add_argument(
        '--json',
        required=False,
        action='store_true',
        help='Output json from metadata with timestamp of each word')
    parser.add_argument(
        '--candidate_transcripts',
        type=int,
        default=3,
        help='Number of candidate transcripts to include in JSON output')
    parser.add_argument(
        '--normalize',
        required=False,
        action='store_true',
        help='Whether to apply standard amplitude normalization')
    parsed = parser.parse_args()

    print('Loading model from file {}'.format(parsed.model))
    ds = Model(parsed.model)
    if parsed.beam_width:
        ds.setBeamWidth(parsed.beam_width)

    if parsed.scorer:
        print('Loading scorer from file {}'.format(parsed.scorer))
        ds.enableExternalScorer(parsed.scorer)
        if parsed.lm_alpha and parsed.lm_beta:
            ds.setScorerAlphaBeta(parsed.lm_alpha, parsed.lm_beta)

    process(model=ds,
            prediction_in=parsed.prediction_in,
            prediction_out=parsed.prediction_out,
            prediction_tmp=parsed.prediction_tmp,
            continuous=parsed.continuous,
            delete_input=parsed.delete_input,
            json=parsed.json,
            candidate_transcripts=parsed.candidate_transcripts,
            normalize=parsed.normalize)
Example #18
parser.add_argument('--model', required=True,
                    help='Path to the .pbmm file')
parser.add_argument('--scorer', required=False,
                    help='Path to the .scorer file')
parser.add_argument('--beam_width', type=int, default=500,
                    help='Beam width for the CTC decoder')
parser.add_argument('--port', type=int, default=8008,
                    help='The port number to listen on')
args = parser.parse_args()

# Load in the model
logging.info("Loading model from %s" % args.model)
model = Model(args.model)

# Configure it
model.setBeamWidth(args.beam_width)
if args.scorer:
    logging.info("Loading scorer from %s" % (args.scorer,))
    model.enableExternalScorer(args.scorer)

# Set up the server socket
logging.info("Opening socket on port %d" % (args.port,))
sckt = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sckt.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
sckt.bind(('0.0.0.0', args.port))
sckt.listen(5)

# Do this forever
while True:
    try:
        # Get a connection
Example #19
def app_sst(model_path: str, lm_path: str, lm_alpha: float, lm_beta: float,
            beam: int):
    webrtc_ctx = webrtc_streamer(
        key="speech-to-text",
        mode=WebRtcMode.SENDONLY,
        audio_receiver_size=1024,
        rtc_configuration={
            "iceServers": [{
                "urls": ["stun:stun.l.google.com:19302"]
            }]
        },
        media_stream_constraints={
            "video": False,
            "audio": True
        },
    )

    status_indicator = st.empty()

    if not webrtc_ctx.state.playing:
        return

    status_indicator.write("Loading...")
    text_output = st.empty()
    stream = None

    while True:
        if webrtc_ctx.audio_receiver:
            if stream is None:
                from deepspeech import Model

                model = Model(model_path)
                model.enableExternalScorer(lm_path)
                model.setScorerAlphaBeta(lm_alpha, lm_beta)
                model.setBeamWidth(beam)

                stream = model.createStream()

                status_indicator.write("Model loaded.")

            sound_chunk = pydub.AudioSegment.empty()
            try:
                audio_frames = webrtc_ctx.audio_receiver.get_frames(timeout=1)
            except queue.Empty:
                time.sleep(0.1)
                status_indicator.write("No frame arrived.")
                continue

            status_indicator.write("Running. Say something!")

            for audio_frame in audio_frames:
                sound = pydub.AudioSegment(
                    data=audio_frame.to_ndarray().tobytes(),
                    sample_width=audio_frame.format.bytes,
                    frame_rate=audio_frame.sample_rate,
                    channels=len(audio_frame.layout.channels),
                )
                sound_chunk += sound

            if len(sound_chunk) > 0:
                sound_chunk = sound_chunk.set_channels(1).set_frame_rate(
                    model.sampleRate())
                buffer = np.array(sound_chunk.get_array_of_samples())
                stream.feedAudioContent(buffer)
                text = stream.intermediateDecode()
                text_output.markdown(f"**Text:** {text}")
        else:
            status_indicator.write("AudioReciver is not set. Abort.")
            break
Example #20
def app_sst_with_video(model_path: str, lm_path: str, lm_alpha: float,
                       lm_beta: float, beam: int):
    class AudioProcessor(AudioProcessorBase):
        frames_lock: threading.Lock
        frames: deque

        def __init__(self) -> None:
            self.frames_lock = threading.Lock()
            self.frames = deque([])

        async def recv_queued(self,
                              frames: List[av.AudioFrame]) -> av.AudioFrame:
            with self.frames_lock:
                self.frames.extend(frames)

            # Return empty frames to be silent.
            new_frames = []
            for frame in frames:
                input_array = frame.to_ndarray()
                new_frame = av.AudioFrame.from_ndarray(
                    np.zeros(input_array.shape, dtype=input_array.dtype),
                    layout=frame.layout.name,
                )
                new_frame.sample_rate = frame.sample_rate
                new_frames.append(new_frame)

            return new_frames

    webrtc_ctx = webrtc_streamer(
        key="speech-to-text-w-video",
        mode=WebRtcMode.SENDRECV,
        audio_processor_factory=AudioProcessor,
        rtc_configuration={
            "iceServers": [{
                "urls": ["stun:stun.l.google.com:19302"]
            }]
        },
        media_stream_constraints={
            "video": True,
            "audio": True
        },
    )

    status_indicator = st.empty()

    if not webrtc_ctx.state.playing:
        return

    status_indicator.write("Loading...")
    text_output = st.empty()
    stream = None

    while True:
        if webrtc_ctx.audio_processor:
            if stream is None:
                from deepspeech import Model

                model = Model(model_path)
                model.enableExternalScorer(lm_path)
                model.setScorerAlphaBeta(lm_alpha, lm_beta)
                model.setBeamWidth(beam)

                stream = model.createStream()

                status_indicator.write("Model loaded.")

            sound_chunk = pydub.AudioSegment.empty()

            audio_frames = []
            with webrtc_ctx.audio_processor.frames_lock:
                while len(webrtc_ctx.audio_processor.frames) > 0:
                    frame = webrtc_ctx.audio_processor.frames.popleft()
                    audio_frames.append(frame)

            if len(audio_frames) == 0:
                time.sleep(0.1)
                status_indicator.write("No frame arrived.")
                continue

            status_indicator.write("Running. Say something!")

            for audio_frame in audio_frames:
                sound = pydub.AudioSegment(
                    data=audio_frame.to_ndarray().tobytes(),
                    sample_width=audio_frame.format.bytes,
                    frame_rate=audio_frame.sample_rate,
                    channels=len(audio_frame.layout.channels),
                )
                sound_chunk += sound

            if len(sound_chunk) > 0:
                sound_chunk = sound_chunk.set_channels(1).set_frame_rate(
                    model.sampleRate())
                buffer = np.array(sound_chunk.get_array_of_samples())
                stream.feedAudioContent(buffer)
                text = stream.intermediateDecode()
                text_output.markdown(f"**Text:** {text}")
        else:
            status_indicator.write("AudioReciver is not set. Abort.")
            break
Example #21
# Word insertion bonus (lm_beta). If not specified, use default from the scorer package.
lm_beta = None  # float,

# Hot-words and their boosts.
hot_words = None  # str

print('Loading model from file {}'.format(model))
model_load_start = timer()
# sphinx-doc: python_ref_model_start
ds = Model(model)
# sphinx-doc: python_ref_model_stop
model_load_end = timer() - model_load_start
print('Loaded model in {:.3}s.'.format(model_load_end))

if beam_width:
    ds.setBeamWidth(beam_width)

desired_sample_rate = ds.sampleRate()

if scorer:
    print('Loading scorer from files {}'.format(scorer))
    scorer_load_start = timer()
    ds.enableExternalScorer(scorer)
    scorer_load_end = timer() - scorer_load_start
    print('Loaded scorer in {:.3}s.'.format(scorer_load_end))

    if lm_alpha and lm_beta:
        print("Set Scorer Alpha and Beta")
        ds.setScorerAlphaBeta(lm_alpha, lm_beta)

if hot_words:
Example #22
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--scorer', required=False,
                        help='Path to the external scorer file')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--beam_width', type=int,
                        help='Beam width for the CTC decoder')
    parser.add_argument('--lm_alpha', type=float,
                        help='Language model weight (lm_alpha). If not specified, use default from the scorer package.')
    parser.add_argument('--lm_beta', type=float,
                        help='Word insertion bonus (lm_beta). If not specified, use default from the scorer package.')
    parser.add_argument('--version', action=VersionAction,
                        help='Print version and exits')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    parser.add_argument('--json', required=False, action='store_true',
                        help='Output json from metadata with timestamp of each word')
    parser.add_argument('--candidate_transcripts', type=int, default=3,
                        help='Number of candidate transcripts to include in JSON output')
    args = parser.parse_args()

#     print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    ds = Model(args.model)
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
#     print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.beam_width:
        ds.setBeamWidth(args.beam_width)

    desired_sample_rate = ds.sampleRate()

    if args.scorer:
#         print('Loading scorer from files {}'.format(args.scorer), file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(args.scorer)
        scorer_load_end = timer() - scorer_load_start
#         print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)

        if args.lm_alpha and args.lm_beta:
            ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

    fin = wave.open(args.audio, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
#         print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs_orig, desired_sample_rate), file=sys.stderr)
        fs_new, audio = convert_samplerate(args.audio, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1/fs_orig)
    fin.close()

#     print('Running inference.', file=sys.stderr)
    inference_start = timer()
    # sphinx-doc: python_ref_inference_start
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0]))
    elif args.json:
        print(metadata_json_output(ds.sttWithMetadata(audio, args.candidate_transcripts)))
    else:
        print("Translation: "+ds.stt(audio))
    # sphinx-doc: python_ref_inference_stop
    inference_end = timer() - inference_start
Example #23
    def record_voice_and_predict_text(self):
        """Records the speech and predicts its text """
        #Recording the speech

        stream_file_name = 'AudioFile/speech_stream.wav'
        stream_format = pyaudio.paInt16  # Sampling size and format
        no_of_channels = 1  # Number of audio channels
        sampling_rate = 16000  # Sampling rate in Hertz
        frames_count = 1024  # Number of frames per buffer
        record_seconds = 5

        stream = pyaudio.PyAudio()

        stream_data = stream.open(format=stream_format,
                                  channels=no_of_channels,
                                  rate=sampling_rate,
                                  input=True,
                                  frames_per_buffer=frames_count)
        frames = [
            stream_data.read(frames_count)
            for i in range(0, int(sampling_rate / frames_count *
                                  record_seconds))
        ]
        stream_data.stop_stream()
        stream_data.close()
        stream.terminate()

        wave_file = wave.open(stream_file_name, 'wb')
        wave_file.setnchannels(no_of_channels)
        wave_file.setsampwidth(stream.get_sample_size(stream_format))
        wave_file.setframerate(sampling_rate)
        wave_file.writeframes(b''.join(frames))
        wave_file.close()

        try:
            self.label_info.setText('Recording completed.')
        except Exception:  # ignore if the UI label is unavailable
            pass

        #Text prediction Part
        alpha = 0.75
        beta = 1.85
        beam_width = 500

        # Initialize the model
        speech_model = Model(MODEL_PATH)

        # Set beam width. A larger value can give better results at the cost of decoding time.
        speech_model.setBeamWidth(beam_width)

        # Enable language scorer to improve the accuracy
        speech_model.enableExternalScorer(SCORER_PATH)
        # You can play with setting the model Beam Width, Scorer language model weight and word insertion weight

        # Set hyperparameters alpha and beta of the external scorer.
        # alpha: Language model weight.
        # beta: Word insertion weight
        speech_model.setScorerAlphaBeta(alpha, beta)

        # Use scipy to convert the wav file into a numpy array
        _, audio = wav.read(stream_file_name)
        text = speech_model.stt(audio)
        try:
            self.text_pred.setText(text)
        except Exception:  # ignore if the UI widget is unavailable
            pass
        show_images(text)
Example #24
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--scorer', required=False,
                        help='Path to the external scorer file')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--beam_width', type=int,
                        help='Beam width for the CTC decoder')
    parser.add_argument('--lm_alpha', type=float,
                        help='Language model weight (lm_alpha). If not specified, use default from the scorer package.')
    parser.add_argument('--lm_beta', type=float,
                        help='Word insertion bonus (lm_beta). If not specified, use default from the scorer package.')
    # parser.add_argument('--version', action=VersionAction,
    #                     help='Print version and exits')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    parser.add_argument('--json', required=False, action='store_true',
                        help='Output json from metadata with timestamp of each word')
    parser.add_argument('--candidate_transcripts', type=int, default=3,
                        help='Number of candidate transcripts to include in JSON output')
    parser.add_argument('--hot_words', type=str,
                        help='Hot-words and their boosts.')
    args = parser.parse_args()

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    ds = Model(args.model)
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.beam_width:
        ds.setBeamWidth(args.beam_width)

    desired_sample_rate = ds.sampleRate()

    if args.scorer:
        print('Loading scorer from files {}'.format(args.scorer), file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(args.scorer)
        scorer_load_end = timer() - scorer_load_start
        print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)

        if args.lm_alpha and args.lm_beta:
            ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

    if args.hot_words:
        print('Adding hot-words', file=sys.stderr)
        for word_boost in args.hot_words.split(','):
            word,boost = word_boost.split(':')
            ds.addHotWord(word,float(boost))

    fin = wave.open(args.audio, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
        print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs_orig, desired_sample_rate), file=sys.stderr)
        fs_new, audio = convert_samplerate(args.audio, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1/fs_orig)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    # sphinx-doc: python_ref_inference_start
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0]))
    elif args.json:
        print(metadata_json_output(ds.sttWithMetadata(audio, args.candidate_transcripts)))
    else:
        print(ds.stt(audio))
        test = ds.createStream().sentencefit(audio, "ka arohia katoatia te hāhi me ōna whakapono e te hapū o ōtākou")
        for t in test.tokens:
            print(f"letter: {t.letter}, confidence: {t.confidence}, timestep: {t.timestep}")
        
    # sphinx-doc: python_ref_inference_stop
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
Example #25
class DeepSpeech():
    def __init__(self, model_path, scorer_path, result_json_path,
                 result_txt_path, candidate_transcripts=3, beam_width=None):

        # Path to the Speech-To-Text model
        self.MODEL_PATH = model_path
        # Path to the scorer language model
        self.SCORER_PATH = scorer_path
        # Number of candidate transcripts to produce
        self.CANDIDATE_TRANSCRIPTS = candidate_transcripts

        self.result_json_path = result_json_path
        self.result_txt_path = result_txt_path

        self.beam_width = beam_width

        self._setup()

    def _setup(self):
        self.ds = Model(self.MODEL_PATH)  # Declare the model obj
        # Desired sample rate for the STT model (kept as a string for ffmpeg's ar argument)
        self.sample_rate = '16000'

        if self.beam_width:
            self.ds.setBeamWidth(self.beam_width)

        if self.SCORER_PATH:
            self.ds.enableExternalScorer(self.SCORER_PATH)

    def convert_samplerate(self, audio_path, desired_sample_rate):
        sox_cmd = 'sox {} --type raw --bits 16 --channels 1 --rate {}\
                   --encoding signed-integer --endian little\
                   --compression 0.0 --no-dither - '\
        .format(quote(audio_path), desired_sample_rate)
        try:
            output = subprocess.check_output(
                shlex.split(sox_cmd), stderr=subprocess.PIPE)
        except subprocess.CalledProcessError as e:
            raise RuntimeError(
                'SoX returned non-zero status: {}'.format(e.stderr))
        except OSError as e:
            raise OSError(e.errno,
                          'SoX not found, use {}hz files or install it: {}'
                          .format(desired_sample_rate, e.strerror))

        return desired_sample_rate, np.frombuffer(output, np.int16)

    def words_from_candidate_transcript(self, metadata):
        word = ""
        word_list = []
        word_start_time = 0
        # Loop through each character
        for i, token in enumerate(metadata.tokens):
            # Append character to word if it's not a space
            if token.text != " ":
                if len(word) == 0:
                    # Log the start time of the new word
                    word_start_time = token.start_time

                word = word + token.text
            # Word boundary is either a space or the last character in the arr
            if token.text == " " or i == len(metadata.tokens) - 1:
                word_duration = token.start_time - word_start_time

                if word_duration < 0:
                    word_duration = 0

                each_word = dict()
                each_word["word"] = word
                each_word["start_time "] = round(word_start_time, 4)
                each_word["duration"] = round(word_duration, 4)

                word_list.append(each_word)
                # Reset
                word = ""
                word_start_time = 0

        return word_list

    def metadata_json_output(self, metadata):
        json_result = dict()
        json_result["transcripts"] = [{
            "confidence": transcript.confidence,
            "words": self.words_from_candidate_transcript(transcript),
        } for transcript in metadata.transcripts]
        return json.dumps(json_result, indent=4)

    def take_audio_info(self):
        probe = ffmpeg.probe(self.FILE_PATH)
        self.audio_info = next(
            (stream for stream in probe['streams']
             if stream['codec_type'] == 'audio'), None)
        print(self.audio_info)
        return self.audio_info

    def take_audio(self):
        out, err = (
            ffmpeg
            .input(self.FILE_PATH)
            .output('-', format='s16le',
                    acodec='pcm_s16le', ac=1, ar=self.sample_rate)
            .run(capture_stdout=True, capture_stderr=True)
        )
        self.audio = np.frombuffer(out, np.int16)
        return self.audio

    def speech2text(self):
        metadata = self.ds.sttWithMetadata(
            self.audio, self.CANDIDATE_TRANSCRIPTS)
        json_result = self.metadata_json_output(metadata)

        with open(self.result_json_path, 'w') as outfile:
            outfile.write(json_result)

        dict_result = json.loads(json_result)
        word_list = [item["word"]
                     for item in dict_result["transcripts"][0]["words"]]

        sentence = " ".join(word_list)
        self.export2textfile(sentence)
        return sentence

    def export2textfile(self, sentence):
        txt_file = open(self.result_txt_path, "w")
        txt_file.writelines(sentence)
        txt_file.close()

    def set_file(self, filepath):
        self.FILE_PATH = filepath
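A hedged end-to-end sketch of the class above; the file names are placeholders:

ds = DeepSpeech('deepspeech-0.9.3-models.pbmm',
                'deepspeech-0.9.3-models.scorer',
                'result.json', 'result.txt')
ds.set_file('interview.mp3')   # any format ffmpeg can decode
ds.take_audio()                # decode to a 16 kHz int16 numpy array
print(ds.speech2text())        # writes result.json / result.txt, returns the top transcript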
Example #26
    p.terminate()
    print(" Recording complete.")
    audio_data = (np.frombuffer(b''.join(frames), dtype=np.int16) / 32767)
    bg_data = (np.frombuffer(b''.join(frames_bg), dtype=np.int16) / 32767)
    # denoised_data = removeNoise(audio_data, bg_data)#.astype('float32')
    return audio_data  #denoised_data


#######Deepspeech Voice-To-Text Parameters########
DS_FOLDER = 'deepspeech_data'
if not os.path.exists(DS_FOLDER):
    os.mkdir(DS_FOLDER)
DS_model_file_path = 'deepspeech_data/deepspeech-0.7.4-models.pbmm'
beam_width = 500
DS_model = Model(DS_model_file_path)
DS_model.setBeamWidth(beam_width)
DS_model.enableExternalScorer('deepspeech_data/deepspeech-0.7.4-models.scorer')


def get_text(data, model=DS_model):
    """
    Transcribe text from audio.

    data: audio data as in array read from librosa with sampling rate 16000.
    model: Deepspeech ASR model.
    """
    #     y , s = librosa.load(fpath, sr=16000)
    y = (data * 32767).astype('int16')
    text = model.stt(y)
    return text
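get_text expects float samples in [-1, 1] at 16 kHz, matching the commented librosa hint inside the function; a sketch with a placeholder file:

import librosa

y, s = librosa.load('sample.wav', sr=16000)  # float32 samples in [-1, 1]
print(get_text(y))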
Example #27
from deepspeech import Model
import gradio as gr
import numpy as np

model_file_path = "deepspeech-0.8.2-models.pbmm"
lm_file_path = "deepspeech-0.8.2-models.scorer"
beam_width = 100
lm_alpha = 0.93
lm_beta = 1.18

model = Model(model_file_path)
model.enableExternalScorer(lm_file_path)
model.setScorerAlphaBeta(lm_alpha, lm_beta)
model.setBeamWidth(beam_width)


def reformat_freq(sr, y):
    if sr not in (
            48000,
            16000,
    ):  # DeepSpeech only supports 16 kHz (we convert 48 kHz -> 16 kHz)
        raise ValueError("Unsupported rate", sr)
    if sr == 48000:
        y = (((y / max(np.max(y), 1)) * 32767).reshape(
            (-1, 3)).mean(axis=1).astype("int16"))
        sr = 16000
    return sr, y


def transcribe(speech, stream):
    _, y = reformat_freq(*speech)