Example No. 1
def main():
    parser = argparse.ArgumentParser(
        description='Running DeepSpeech inference.')
    parser.add_argument('--model',
                        required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument(
        '--alphabet',
        required=True,
        help='Path to the configuration file specifying the alphabet used by the network')
    parser.add_argument('--lm',
                        nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument(
        '--trie',
        nargs='?',
        help='Path to the language model trie file created with native_client/generate_trie')
    parser.add_argument('--audio1',
                        required=True,
                        help='First audio file to use in interleaved streams')
    parser.add_argument('--audio2',
                        required=True,
                        help='Second audio file to use in interleaved streams')
    args = parser.parse_args()

    ds = Model(args.model, args.alphabet, BEAM_WIDTH)

    if args.lm and args.trie:
        ds.enableDecoderWithLM(args.lm, args.trie, LM_ALPHA, LM_BETA)

    fin = wave.open(args.audio1, 'rb')
    fs1 = fin.getframerate()
    audio1 = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    fin.close()

    fin = wave.open(args.audio2, 'rb')
    fs2 = fin.getframerate()
    audio2 = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    fin.close()

    stream1 = ds.createStream()
    stream2 = ds.createStream()

    splits1 = np.array_split(audio1, 10)
    splits2 = np.array_split(audio2, 10)

    for part1, part2 in zip(splits1, splits2):
        ds.feedAudioContent(stream1, part1)
        ds.feedAudioContent(stream2, part2)

    print(ds.finishStream(stream1))
    print(ds.finishStream(stream2))
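
These snippets are excerpts and assume module-level imports and decoder constants that the page does not show. A minimal preamble that would make Example No. 1 self-contained; the constant values below are common DeepSpeech defaults, not taken from the original source:

import argparse
import wave

import numpy as np
from deepspeech import Model

BEAM_WIDTH = 500  # decoder beam width; a common default
LM_ALPHA = 0.75   # language model weight; a common default
LM_BETA = 1.85    # word insertion bonus; a common default

if __name__ == '__main__':
    main()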
Example No. 2
        def setup_model(model_path, alphabet, lm, trie, features):
            log("creating model {} {} with features {}...".format(
                model_path, alphabet, features))
            ds_model = Model(model_path, features.beam_width)

            if lm and trie:
                ds_model.enableDecoderWithLM(lm, trie, features.lm_alpha,
                                             features.lm_beta)
            log("model is ready.")
            return ds_model
def main(argv):
    if len(argv) < 1:
        print("No .wav File given.")
        return

    ds = Model(MODEL_FILE, 500)
    ds.enableDecoderWithLM(LANG_MODEL, TRIE_FILE, 1.50, 2.25)

    fs, audio = wav.read(argv[0])
    data = ds.stt(audio)
    print(data)
Example No. 4
def transcribe(args, filepath="", verbose=0):

    if verbose > 0:
        print('Loading model from file {}'.format(args.model), file=sys.stderr)
        model_load_start = timer()

    ds = Model(args.model, args.beam_width)
    if verbose > 0:
        model_load_end = timer() - model_load_start
        print('Loaded model in {:.3}s.'.format(
            model_load_end), file=sys.stderr)

    desired_sample_rate = ds.sampleRate()
    if args.lm and args.trie:
        if verbose > 0:
            print('Loading language model from files {} {}'.format(
                args.lm, args.trie), file=sys.stderr)
            lm_load_start = timer()
        ds.enableDecoderWithLM(args.lm, args.trie, args.lm_alpha, args.lm_beta)
        if verbose > 0:
            lm_load_end = timer() - lm_load_start
            print('Loaded language model in {:.3}s.'.format(
                lm_load_end), file=sys.stderr)

    fin = wave.open(filepath, 'rb')
    fs = fin.getframerate()
    if fs != desired_sample_rate:
        if verbose > 0:
            print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(
                fs, desired_sample_rate), file=sys.stderr)
        fs, audio = convert_samplerate(filepath, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1/fs)
    fin.close()

    if verbose > 0:
        print('Running inference.', file=sys.stderr)
        inference_start = timer()
    audio_metadata = ds.sttWithMetadata(audio)
    if verbose > 0:
        inference_end = timer() - inference_start
        print('Inference took %0.3fs for %0.3fs audio file.' %
              (inference_end, audio_length), file=sys.stderr)

    dict_result = dict()
    dict_result["sentence"] = "".join(
        item.character for item in audio_metadata.items)
    dict_result["words"] = words_from_metadata(audio_metadata)
    dict_result["characters"] = audio_metadata
    dict_result["confidence"] = audio_metadata.confidence

    return dict_result
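
A hypothetical call to transcribe() above, assuming an argparse-style namespace carrying the fields the function reads (all paths and values are placeholders):

from types import SimpleNamespace

args = SimpleNamespace(model='output_graph.pbmm', beam_width=500,
                       lm='lm.binary', trie='trie',
                       lm_alpha=0.75, lm_beta=1.85)
result = transcribe(args, filepath='audio.wav', verbose=1)
print(result['sentence'], result['confidence'])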
class Tester(BaseTester):

    name = 'DeepSpeech'

    audio_format = RATE16K_MONO_WAV

    def __init__(self, *args, **kwargs):
        super(Tester, self).__init__(*args, **kwargs)

        files = [
            args_lm,
            args_trie,
            args_model,
            # args_alphabet,
        ]
        for f in files:
            assert os.path.isfile(f), 'File %s does not exist.' % f

        print('Loading model from file %s' % (args_model), file=sys.stderr)
        model_load_start = timer()
        # self.ds = Model(args_model, N_FEATURES, N_CONTEXT, args_alphabet, BEAM_WIDTH)
        self.ds = Model(args_model, BEAM_WIDTH)
        model_load_end = timer() - model_load_start
        print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

        # if args_lm and args_trie:
        print('Loading language model from files %s %s' % (args_lm, args_trie),
              file=sys.stderr)
        lm_load_start = timer()
        # self.ds.enableDecoderWithLM(args_alphabet, args_lm, args_trie, LM_ALPHA, LM_BETA)
        self.ds.enableDecoderWithLM(args_lm, args_trie, LM_ALPHA, LM_BETA)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end),
              file=sys.stderr)

    def audio_to_text(self, fn):
        fin = wave.open(fn, 'rb')
        fs = fin.getframerate()
        assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
        audio_length = fin.getnframes() * (1. / fs)
        fin.close()

        print('Running inference.', file=sys.stderr)
        inference_start = timer()
        # text = self.ds.stt(audio, fs)
        text = self.ds.stt(audio)
        print('text:', text)
        inference_end = timer() - inference_start
        print('Inference took %0.3fs for %0.3fs audio file.' %
              (inference_end, audio_length),
              file=sys.stderr)
        return text
Example No. 6
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--alphabet', required=True,
                        help='Path to the configuration file specifying the alphabet used by the network')
    parser.add_argument('--lm', nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument('--trie', nargs='?',
                        help='Path to the language model trie file created with native_client/generate_trie')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--version', action=VersionAction,
                        help='Print version and exit')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    args = parser.parse_args()

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_ALPHA, LM_BETA)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

    fin = wave.open(args.audio, 'rb')
    fs = fin.getframerate()
    if fs != SAMPLE_RATE:
        print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs, SAMPLE_RATE), file=sys.stderr)
        fs, audio = convert_samplerate(args.audio)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1/SAMPLE_RATE)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio, fs)))
    else:
        print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
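
The examples on this page target different DeepSpeech releases, which is why the Model() and stt() signatures vary between snippets. A rough sketch of the two API shapes seen here; the version split is our reading of the snippets, not something the originals state:

# Pre-0.6-style API, as in this example: the alphabet path and feature
# geometry are passed explicitly, and stt() takes the sample rate.
ds = Model(model_path, N_FEATURES, N_CONTEXT, alphabet_path, BEAM_WIDTH)
ds.enableDecoderWithLM(alphabet_path, lm_path, trie_path, LM_ALPHA, LM_BETA)
text = ds.stt(audio, sample_rate)

# 0.6-style API, as in Examples No. 7, 8, and 12: the alphabet is baked into
# the model, and the expected sample rate is queried via ds.sampleRate().
ds = Model(model_path, BEAM_WIDTH)
ds.enableDecoderWithLM(lm_path, trie_path, LM_ALPHA, LM_BETA)
text = ds.stt(audio)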
Example No. 7
def load_models():
    model_load_start = timer()
    ds = Model(MODEL_FILE, BEAM_WIDTH)
    model_load_end = timer() - model_load_start

    logging.debug('Loaded model in %0.3fs.' % (model_load_end))

    lm_load_start = timer()
    ds.enableDecoderWithLM(LANGUAGE_MODEL, TRIE_FILE, LM_ALPHA, LM_BETA)
    lm_load_end = timer() - lm_load_start

    logging.debug('Loaded language model in %0.3fs.' % (lm_load_end))

    return ds
Example No. 8
def load_model(model_dir):
    BEAM_WIDTH = 500
    DEFAULT_SAMPLE_RATE = 16000
    LM_ALPHA = 0.75
    LM_BETA = 1.85

    model_path = os.path.join(model_dir, 'output_graph.pbmm')
    trie_path = os.path.join(model_dir, 'trie')
    lm_path = os.path.join(model_dir, 'lm.binary')

    model = Model(model_path, BEAM_WIDTH)
    model.enableDecoderWithLM(lm_path, trie_path, LM_ALPHA, LM_BETA)

    return model
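
A hypothetical use of load_model() above; the directory layout (output_graph.pbmm, lm.binary, trie) is the one the function assumes, and the file names here are placeholders:

import wave

import numpy as np

model = load_model('deepspeech-0.6.1-models')
with wave.open('audio.wav', 'rb') as fin:  # 16 kHz mono WAV assumed
    audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
print(model.stt(audio))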
Example No. 9
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--alphabet', required=True,
                        help='Path to the configuration file specifying the alphabet used by the network')
    parser.add_argument('--lm', nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument('--trie', nargs='?',
                        help='Path to the language model trie file created with native_client/generate_trie')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--version', action=VersionAction,
                        help='Print version and exit')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    args = parser.parse_args()

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_ALPHA, LM_BETA)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

    fin = wave.open(args.audio, 'rb')
    fs = fin.getframerate()
    if fs != 16000:
        print('Warning: original sample rate ({}) is different than 16kHz. Resampling might produce erratic speech recognition.'.format(fs), file=sys.stderr)
        fs, audio = convert_samplerate(args.audio)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1/16000)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio, fs)))
    else:
        print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
Example No. 10
def loadModel():
    print('Loading model from file {}'.format(modelFile), file=sys.stderr)
    model_load_start = timer()
    ds = Model(modelFile, N_FEATURES, N_CONTEXT, alphabetFile, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)
    print('Loading language model from files {} {}'.format(lmFile, trieFile),
          file=sys.stderr)
    lm_load_start = timer()
    ds.enableDecoderWithLM(alphabetFile, lmFile, trieFile, LM_WEIGHT,
                           VALID_WORD_COUNT_WEIGHT)
    lm_load_end = timer() - lm_load_start
    print('Loaded language model in {:.3}s.'.format(lm_load_end),
          file=sys.stderr)
    return ds
Example No. 11
def load_deepspeech_model(model='deepspeech-0.5.1-models/output_graph.pb',
                          alphabet='deepspeech-0.5.1-models/alphabet.txt',
                          lm='deepspeech-0.5.1-models/lm.binary',
                          trie='models/trie'):
    print('Loading model from file {}'.format(model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if lm and trie:
        print('Loading language model from files {} {}'.format(lm, trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(alphabet, lm, trie, LM_ALPHA, LM_BETA)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

    return ds
Example No. 12
def load_model(models, lm, trie):
    BEAM_WIDTH = 500
    LM_ALPHA = 0.75
    LM_BETA = 1.85

    ds = Model(models, BEAM_WIDTH)
    ds.enableDecoderWithLM(lm, trie, LM_ALPHA, LM_BETA)
    sample_rate = ds.sampleRate()

    return [ds, sample_rate]
class DeepSpeech:
    def __init__(self, model_path):
        self.model = model_path + '/output_graph.pbmm'
        self.alphabet = model_path + '/alphabet.txt'
        self.lm = model_path + '/lm.binary'
        self.trie = model_path + '/trie'
        #print('Loading model from file {}'.format(self.model), file=sys.stderr)
        #model_load_start = timer()
        self.ds = Model(self.model, N_FEATURES, N_CONTEXT, self.alphabet, BEAM_WIDTH)
        #model_load_end = timer() - model_load_start
        #print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

        if self.lm and self.trie:
            #print('Loading language model from files {} {}'.format(self.lm, self.trie), file=sys.stderr)
            #lm_load_start = timer()
            self.ds.enableDecoderWithLM(self.alphabet, self.lm, self.trie, LM_ALPHA, LM_BETA)
            #lm_load_end = timer() - lm_load_start
            #print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

    def recognize(self, wav_file):
        '''parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
        parser.add_argument('--model', required=True,
                            help='Path to the model (protocol buffer binary file)')
        parser.add_argument('--alphabet', required=True,
                            help='Path to the configuration file specifying the alphabet used by the network')
        parser.add_argument('--lm', nargs='?',
                            help='Path to the language model binary file')
        parser.add_argument('--trie', nargs='?',
                            help='Path to the language model trie file created with native_client/generate_trie')
        parser.add_argument('--audio', required=True,
                            help='Path to the audio file to run (WAV format)')
        parser.add_argument('--version', action=VersionAction,
                            help='Print version and exits')
        args = parser.parse_args()'''
        fin = wave.open(wav_file, 'rb')
        fs = fin.getframerate()
        if fs != 16000:
            #print('Warning: original sample rate ({}) is different than 16kHz. Resampling might produce erratic speech recognition.'.format(fs), file=sys.stderr)
            fs, audio = convert_samplerate(wav_file)
        else:
            audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

        #audio_length = fin.getnframes() * (1/16000)
        fin.close()

        #print('Running inference.', file=sys.stderr)
        #inference_start = timer()
        return self.ds.stt(audio, fs)
    def transcribe(self, audio):

        name = 'speech_server_main'
        conf = config.ConfigDeepSpeech()
        model = conf.get_config('model')
        print(model)
        # alphabet = conf.get_config('alphabet')
        # print(alphabet)
        lm = conf.get_config('lm')
        trie = conf.get_config('trie')
        print(trie)
        ds = Model(model, BEAM_WIDTH)
        if lm and trie:
            ds.enableDecoderWithLM(lm, trie, LM_ALPHA, LM_BETA)
        text = ds.stt(audio)
        return text
Example No. 15
def load_model(models, alphabet, lm, trie):
    BEAM_WIDTH = 500
    LM_ALPHA = 0.75
    LM_BETA = 1.85

    model_load_start = timer()
    ds = Model(models, alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    logging.debug("Loaded model in %0.3fs." % (model_load_end))

    lm_load_start = timer()
    ds.enableDecoderWithLM(lm, trie, LM_ALPHA, LM_BETA)
    lm_load_end = timer() - lm_load_start
    logging.debug('Loaded language model in %0.3fs.' % (lm_load_end))

    return [ds, model_load_end, lm_load_end]
    def build_model(self, model_path):

        # Build deepspeech model to use for adversarial sample evaluation
        BEAM_WIDTH = 500
        LM_ALPHA = 0.75
        LM_BETA = 1.85
        N_FEATURES = 26
        N_CONTEXT = 9
        MODEL_PATH = model_path + '/models/output_graph.pb'
        ALPHABET_PATH = model_path + '/models/alphabet.txt'
        LM_PATH = model_path + '/models/lm.binary'
        TRIE_PATH = model_path + '/models/trie'

        ds = Model(MODEL_PATH, N_FEATURES, N_CONTEXT, ALPHABET_PATH, BEAM_WIDTH)
        ds.enableDecoderWithLM(ALPHABET_PATH, LM_PATH, TRIE_PATH, LM_ALPHA, LM_BETA)

        return ds
Example No. 17
def load_ds_model(model_path,
                  alphabet_path,
                  lm_path=None,
                  trie_path=None,
                  n_features=26,
                  n_context=9,
                  beam_width=500,
                  lm_weight=1.50,
                  valid_word_count_weight=2.10):
    print(
        f'loading DeepSpeech model from {model_path}, using alphabet at {alphabet_path}, '
        f'LM at {lm_path} and trie at {trie_path}')
    ds = Model(model_path, n_features, n_context, alphabet_path, beam_width)
    if lm_path and trie_path:
        ds.enableDecoderWithLM(alphabet_path, lm_path, trie_path, lm_weight,
                               valid_word_count_weight)
    return ds
Example No. 18
def load_model():

    models = "models/output_graph.tflite"  #.tflite
    lm = "models/lm.binary"  # lm.binary
    trie = "models/trie"  # trie

    BEAM_WIDTH = 500
    LM_ALPHA = 0.75
    LM_BETA = 1.85

    ds = Model(models, BEAM_WIDTH)

    ds.enableDecoderWithLM(lm, trie, LM_ALPHA, LM_BETA)

    sample_rate = ds.sampleRate()

    return [ds, sample_rate]
Example No. 19
def load_model():
    # Load the pre-trained SincNet model and the DeepSpeech speech-to-text
    # model into the module-level globals used elsewhere
    global model
    global ds
    #model = predict_model("/home/mwang/Development/deep-learning/SincNet/exp/SincNet_lifesize/model_raw.pkl.lifesize")
    model = predict_model(
        "/home/mwang/Development/deep-learning/SincNet/model_raw.pkl.lifesize")
    #model = predict_model("/home/mwang/Development/deep-learning/SincNet/exp/SincNet_libri/model_raw.pkl.amazon")
    #model = predict_model("/home/mwang/Development/deep-learning/SincNet/exp/SincNet_libri/model_raw.pkl.my_desktop")
    ds_model = "/home/mwang/Development/deep-learning/stt/models/output_graph.pbmm"
    ds_alphabet = "/home/mwang/Development/deep-learning/stt/models/alphabet.txt"
    ds_lm = "/home/mwang/Development/deep-learning/stt/models/lm.binary"
    ds_trie = "/home/mwang/Development/deep-learning/stt/models/trie"
    ds = Model(ds_model, N_FEATURES, N_CONTEXT, ds_alphabet, BEAM_WIDTH)
    ds.enableDecoderWithLM(ds_alphabet, ds_lm, ds_trie, LM_ALPHA, LM_BETA)
Example No. 20
def load_model(models, alphabet, lm, trie):
    N_FEATURES = 26
    N_CONTEXT = 9
    BEAM_WIDTH = 500
    LM_WEIGHT = 1.50
    VALID_WORD_COUNT_WEIGHT = 2.10

    model_load_start = timer()
    ds = Model(models, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    logging.debug("Loaded model in %0.3fs." % (model_load_end))

    lm_load_start = timer()
    ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT, VALID_WORD_COUNT_WEIGHT)
    lm_load_end = timer() - lm_load_start
    logging.debug('Loaded language model in %0.3fs.' % (lm_load_end))

    return [ds, model_load_end, lm_load_end]
Example No. 21
def tflite_worker(model, alphabet, lm, trie, queue_in, queue_out, gpu_mask):
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_mask)
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    ds.enableDecoderWithLM(alphabet, lm, trie, LM_ALPHA, LM_BETA)

    while True:
        msg = queue_in.get()

        fin = wave.open(msg['filename'], 'rb')
        fs = fin.getframerate()
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
        audio_length = fin.getnframes() * (1/16000)
        fin.close()

        decoded = ds.stt(audio, fs)

        queue_out.put({'prediction': decoded, 'ground_truth': msg['transcript']})
        queue_in.task_done()
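
A hypothetical driver for tflite_worker() above. The worker loops forever, so it runs as a daemon process and is joined through the input queue; all file names are placeholders:

from multiprocessing import JoinableQueue, Process, Queue

queue_in, queue_out = JoinableQueue(), Queue()
worker = Process(target=tflite_worker,
                 args=('output_graph.tflite', 'alphabet.txt', 'lm.binary',
                       'trie', queue_in, queue_out, 0))
worker.daemon = True
worker.start()

queue_in.put({'filename': 'audio.wav', 'transcript': 'expected text'})
queue_in.join()         # returns once the worker calls task_done()
print(queue_out.get())  # {'prediction': ..., 'ground_truth': ...}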
Example No. 22
def load_model(models, alphabet, lm, trie):
    N_FEATURES = 26
    N_CONTEXT = 9
    BEAM_WIDTH = 500
    LM_ALPHA = 0.75
    LM_BETA = 1.85

    model_load_start = timer()
    ds = Model(models, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    logging.debug("Loaded model in %0.3fs." % (model_load_end))

    lm_load_start = timer()
    ds.enableDecoderWithLM(alphabet, lm, trie, LM_ALPHA, LM_BETA)
    lm_load_end = timer() - lm_load_start
    logging.debug('Loaded language model in %0.3fs.' % (lm_load_end))

    return [ds, model_load_end, lm_load_end]
Example No. 23
def main():

    # Initialize the data dictionary that will be returned from the view
    data = {"success": False}
    # Ensure that an audio file was properly uploaded to our endpoint
    if flask.request.method == "POST":
        if flask.request.files.get("audio"):
            fin = wave.open(flask.request.files["audio"], 'rb')
            fs = fin.getframerate()
            audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
            audio_length = fin.getnframes() * (1 / 16000)
            fin.close()

            print('Loading model from file', file=sys.stderr)
            model_load_start = timer()
            ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
            model_load_end = timer() - model_load_start
            print('Loaded model in {:.3}s.'.format(model_load_end),
                  file=sys.stderr)
            lm_load_start = timer()
            ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                                   VALID_WORD_COUNT_WEIGHT)
            lm_load_end = timer() - lm_load_start
            print('Loaded language model in {:.3}s.'.format(lm_load_end),
                  file=sys.stderr)

            print('Running inference.', file=sys.stderr)
            inference_start = timer()
            text = ds.stt(audio, fs)
            inference_end = timer() - inference_start
            print('Inference took %0.3fs for %0.3fs audio file.' %
                  (inference_end, audio_length),
                  file=sys.stderr)

            data["results"] = []

            data["results"].append(text)

            data["success"] = True

            data["sentiment"] = sentimentanalysis.get_score(text)

            return flask.jsonify(data)
Example No. 24
def load_model(models, lm, trie):
    BEAM_WIDTH = 500
    LM_ALPHA = 0.75
    LM_BETA = 1.85

    model_load_start = timer()
    ds = Model(models, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    logging.debug("Loaded model in %0.3fs." % (model_load_end))

    lm_load_start = timer()
    ds.enableDecoderWithLM(lm, trie, LM_ALPHA, LM_BETA)
    lm_load_end = timer() - lm_load_start
    logging.debug('Loaded language model in %0.3fs.' % (lm_load_end))

    sample_rate = ds.sampleRate()
    logging.debug('Loaded model sample rate: %dHz.' % (sample_rate))

    return [ds, model_load_end, lm_load_end, sample_rate]
Example No. 25
def speechRec(audio_data):

    # r = sr.Recognizer()
    # with sr.Microphone(sample_rate=sample_rate) as source:
    #     print("Say Something")
    #     audio = r.listen(source)
    #     fs = audio.sample_rate
    #     audio = np.frombuffer(audio.frame_data, np.int16)

    sample_rate = 16000
    beam_width = 500
    lm_alpha = 0.75
    lm_beta = 1.85
    n_features = 29
    n_context = 9

    data_folder = Path('deepspeech-0.6.1-models')
    model_name = str(data_folder / "output_graph.pbmm")
    alphabet = str(data_folder / "alphabet.txt")
    language_model = str(data_folder / "lm.binary")
    trie = str(data_folder / "trie")
    audio_file = 'temp.wav'

    with open(audio_file, 'wb') as f:
        f.write(audio_data)

    ds = Model(model_name, beam_width)
    ds.enableDecoderWithLM(language_model, trie, lm_alpha, lm_beta)
    # print(ds.sampleRate())

    with wave.open(audio_file, 'rb') as fin:
        fs = fin.getframerate()
        print("Framerate: ", fs)
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
        audio_length = fin.getnframes() * (1 / sample_rate)

    if os.path.exists(audio_file):
        os.remove(audio_file)
    else:
        sys.exit("The file {} does not exist".format(audio_file))
    # print("Infering {} file".format(audio_file))

    return ds.stt(audio)
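
A hypothetical call to speechRec() above: it takes raw WAV bytes, writes them to a temporary file, and returns the transcript (the recording name is a placeholder):

with open('recording.wav', 'rb') as f:
    print(speechRec(f.read()))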
Example No. 26
def DeepSpeech(Window, SpeechToNLPQueue):

    # Create Signal Object
    SpeechSignal = GUISignal()
    SpeechSignal.signal.connect(Window.UpdateSpeechBox)

    MsgSignal = GUISignal()
    MsgSignal.signal.connect(Window.UpdateMsgBox)

    # References to models:
    model = 'DeepSpeech_Models/output_graph.pbmm'
    alphabet = 'DeepSpeech_Models/alphabet.txt'
    lm = 'DeepSpeech_Models/lm.binary'
    trie = 'DeepSpeech_Models/trie'

    print('Loading model from file {}'.format(model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if lm and trie:
        print('Loading language model from files {} {}'.format(lm, trie),
              file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(alphabet, lm, trie, LM_ALPHA, LM_BETA)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end),
              file=sys.stderr)

    # Collect raw chunks from the microphone stream, then convert the joined
    # bytes into the 16-bit sample buffer stt() expects (this assumes the
    # generator yields byte strings).
    chunks = []
    with MicrophoneStream(Window, RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        for content in audio_generator:
            chunks.append(content)
    audio = np.frombuffer(b''.join(chunks), np.int16)

    result = ds.stt(audio, 16000)

    QueueItem = SpeechNLPItem(result, True, 0, 0, 'Speech')
    SpeechToNLPQueue.put(QueueItem)
    SpeechSignal.signal.emit([QueueItem])
Example No. 27
def tflite_worker(args, queue_in, queue_out, gpu_mask):
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_mask)
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, args.beam_width)
    ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, args.lm_alpha, args.lm_beta)

    while True:
        msg = queue_in.get()

        filename = msg['filename']
        wavname = os.path.splitext(os.path.basename(filename))[0]
        fin = wave.open(filename, 'rb')
        fs = fin.getframerate()
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
        fin.close()

        decoded = ds.stt(audio, fs)

        queue_out.put({'wav': wavname, 'prediction': decoded, 'ground_truth': msg['transcript']})
        print(queue_out.qsize(), end='\r') # Update the current progress
        queue_in.task_done()
class Tester(BaseTester):

    name = 'DeepSpeech'

    audio_format = RATE16K_MONO_WAV

    def __init__(self, *args, **kwargs):
        super(Tester, self).__init__(*args, **kwargs)

        files = [args_lm, args_trie, args_model, args_alphabet]
        for f in files:
            assert os.path.isfile(f)

        print('Loading model from file %s' % (args_model), file=sys.stderr)
        model_load_start = timer()
        self.ds = Model(args_model, N_FEATURES, N_CONTEXT, args_alphabet, BEAM_WIDTH)
        model_load_end = timer() - model_load_start
        print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

        if args_lm and args_trie:
            print('Loading language model from files %s %s' % (args_lm, args_trie), file=sys.stderr)
            lm_load_start = timer()
            self.ds.enableDecoderWithLM(args_alphabet, args_lm, args_trie, LM_ALPHA, LM_BETA)
            lm_load_end = timer() - lm_load_start
            print('Loaded language model in %0.3fs.' % (lm_load_end), file=sys.stderr)

    def audio_to_text(self, fn):
        fin = wave.open(fn, 'rb')
        fs = fin.getframerate()
        assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
        audio_length = fin.getnframes() * (1/16000)
        fin.close()

        print('Running inference.', file=sys.stderr)
        inference_start = timer()
        text = self.ds.stt(audio, fs)
        print('text:', text)
        inference_end = timer() - inference_start
        print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
        return text
Example No. 29
class Speech_Deepspeech:
    """
    Wrapper for deepspeech

    Input:
        - model: model path
        - alphabet: alphabet file
        - lm: lm file
        - trie: trie file
    """
    def __init__(self, model, alphabet, lm, trie):
        from deepspeech import Model as DSModel
        self.ds_model = DSModel(model, 26, 9, alphabet, 500)
        self.ds_model.enableDecoderWithLM(alphabet, lm, trie, 0.75, 1.85)


    def __call__(self, wavfile):
        with wave.open(wavfile, "rb") as fin:
            fs = fin.getframerate()
            audio = numpy.frombuffer(fin.readframes(fin.getnframes()), numpy.int16)
        return self.ds_model.stt(audio, fs)
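
A hypothetical use of the wrapper, with the four paths from its docstring as placeholders:

stt = Speech_Deepspeech('output_graph.pb', 'alphabet.txt', 'lm.binary', 'trie')
print(stt('audio.wav'))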
Example No. 30
def tflite_worker(model, alphabet, lm, trie, queue_in, queue_out, gpu_mask):
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_mask)
    #os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    ds.enableDecoderWithLM(alphabet, lm, trie, LM_ALPHA, LM_BETA)

    while True:
        msg = queue_in.get()

        filename = msg['filename']
        wavname = os.path.splitext(os.path.basename(filename))[0]
        fin = wave.open(filename, 'rb')
        fs = fin.getframerate()
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
        fin.close()

        decoded = ds.stt(audio, fs)

        queue_out.put({'wav': wavname, 'prediction': decoded})
        print(queue_out.qsize(), end='\r')  # Update the current progress
        queue_in.task_done()
Example No. 31
def load_model(models, alphabet, lm, trie):
    """
    Load the pre-trained model into the memory
    :param models: Output Graph Protocol Buffer file
    :param alphabet: Alphabet.txt file
    :param lm: Language model file
    :param trie: Trie file
    :return: DeepSpeech Model object
    """
    N_FEATURES = 26
    N_CONTEXT = 9
    BEAM_WIDTH = 500
    #LM_ALPHA = 0.75
    #LM_BETA = 1.85

    LM_ALPHA = 1
    LM_BETA = 1.85

    ds = Model(models, BEAM_WIDTH)
    ds.enableDecoderWithLM(lm, trie, LM_ALPHA, LM_BETA)
    return ds
Example No. 32
def create_model(path, config):
    # Extract config
    # Prefer the memory-mapped graph; fall back to the plain protobuf
    model = path.get('model') + 'output_graph.pbmm'
    if not os.path.isfile(model):
        model = path.get('model') + 'output_graph.pb'
    lm_path = path.get('lm_path')
    beam_width = config.get('beam_width')
    lm_weight = config.get('lm_weight')
    w_weight = config.get('w_weight')
    n_features = 26
    n_context = 9

    # Build the LM paths
    alphabet = os.path.join(lm_path, 'alphabet.txt')
    lm = os.path.join(lm_path, 'lm.binary')
    trie = os.path.join(lm_path, 'trie')

    # Build the model
    ds = Model(model, n_features, n_context, alphabet, beam_width)
    ds.enableDecoderWithLM(alphabet, lm, trie, lm_weight, w_weight)

    return ds
Example No. 33
def get_model(modeldir):
    args = AttrDict({
        'model': str.join('/', (modeldir, "models/output_graph.pbmm")),
        'alphabet': str.join('/', (modeldir, "models/alphabet.txt")),
        'lm': str.join('/', (modeldir, "models/lm.binary")),
        'trie': str.join('/', (modeldir, "models/trie")),
    })
    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)
    return ds