Exemplo n.º 1
0
def main_deepspeech(args):
    """Transcribe one WAV file with DeepSpeech and print the transcript.

    The acoustic model is always loaded; the language-model decoder is only
    enabled when both ``args.lm`` and ``args.trie`` are truthy.  The
    transcript goes to stdout, progress and timing messages go to stderr.

    :param args: object exposing ``model``, ``alphabet``, ``lm``, ``trie`` and
        ``audio`` attributes.  When ``None``, ``parse_args_deep()`` supplies it.
    :raises AssertionError: if the audio file is not sampled at 16000 Hz.
    """
    if args is None:
        args = parse_args_deep()

    print('Loading model from file %s' % (args.model), file=sys.stderr)
    started = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    elapsed = timer() - started
    print('Loaded model in %0.3fs.' % (elapsed), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files %s %s' % (args.lm, args.trie),
              file=sys.stderr)
        started = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        elapsed = timer() - started
        print('Loaded language model in %0.3fs.' % (elapsed),
              file=sys.stderr)

    # NOTE(review): the stdlib ``wave`` module has no ``read``; presumably
    # ``wave`` is an alias for scipy.io.wavfile here -- confirm the import.
    sample_rate, samples = wave.read(args.audio)
    # Duration is derived with a fixed 16 kHz rate, which the assertion
    # right below enforces.
    duration = len(samples) * (1 / 16000)
    assert sample_rate == 16000, "Only 16000Hz input WAV files are supported for now!"

    print('Running inference.', file=sys.stderr)
    started = timer()
    print(ds.stt(samples, sample_rate))
    elapsed = timer() - started
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (elapsed, duration),
          file=sys.stderr)
Exemplo n.º 2
0
def build_model(init_settings):
    """
    Create a DeepSpeech model with its language-model decoder enabled.

    :param init_settings: Configparser-like mapping with a ``'deepspeech'``
        section holding ``model_path``, ``N_FEATURES``, ``N_CONTEXT``,
        ``alphabet_path``, ``BEAM_WIDTH``, ``lm_path``, ``trie_path``,
        ``LM_WEIGHT``, ``WORD_COUNT_WEIGHT`` and ``VALID_WORD_COUNT_WEIGHT``.
    :return: the configured model, or ``None`` when anything raised during
        loading (the error is printed).
    """

    def setting(name, cast):
        # Look the value up lazily so settings are read in call order.
        return cast(init_settings['deepspeech'][name])

    print('Loading DeepSpeech Models')
    try:
        ds = Model(setting('model_path', str),
                   setting('N_FEATURES', int),
                   setting('N_CONTEXT', int),
                   setting('alphabet_path', str),
                   setting('BEAM_WIDTH', int))
        ds.enableDecoderWithLM(setting('alphabet_path', str),
                               setting('lm_path', str),
                               setting('trie_path', str),
                               setting('LM_WEIGHT', float),
                               setting('WORD_COUNT_WEIGHT', float),
                               setting('VALID_WORD_COUNT_WEIGHT', float))
    except Exception as e:
        print('Loading Error!')
        print(e)
        return None
    return ds
class DeepSpeechImp:
    """Owns a DeepSpeech model with its language-model decoder enabled.

    File paths and decoder weights are read from ``shared_params``;
    ``N_FEATURES`` and ``N_CONTEXT`` are module-level names.
    """

    # Class-level placeholder; __init__ replaces it with the loaded model.
    ds = None

    def __init__(self):
        """Load the acoustic model, then enable the language-model decoder.

        Both steps are timed and reported through ``logging.info``.
        """
        logging.info('Loading model from file %s' % (shared_params.DS_MODEL))
        started = timer()
        self.ds = Model(shared_params.DS_MODEL, N_FEATURES, N_CONTEXT,
                        shared_params.DS_ALPHABET, shared_params.BEAM_WIDTH)
        elapsed = timer() - started
        logging.info('Loaded model in %0.3fs.' % (elapsed))

        logging.info('Loading language model from files %s %s' %
                     (shared_params.DS_LANGUAGE_MODEL, shared_params.DS_TRIE))
        started = timer()
        self.ds.enableDecoderWithLM(shared_params.DS_ALPHABET,
                                    shared_params.DS_LANGUAGE_MODEL,
                                    shared_params.DS_TRIE,
                                    shared_params.LM_WEIGHT,
                                    shared_params.WORD_COUNT_WEIGHT,
                                    shared_params.VALID_WORD_COUNT_WEIGHT)
        elapsed = timer() - started
        logging.info('Loaded language model in %0.3fs.' % (elapsed))

    def process_audio(self, audio_path):
        """Transcribe the WAV file at ``audio_path``.

        :return: the transcript, or ``""`` when reading or inference raised
            (the error text is logged).
        """
        try:
            sample_rate, samples = wav.read(audio_path)
            transcript = self.ds.stt(samples, sample_rate)
        except Exception as ex:
            logging.error(str(ex))
            return ""
        return transcript

    def __del__(self):
        """Drop this instance's reference to the model."""
        del self.ds
Exemplo n.º 4
0
    def __init__(self,
                 model_path,
                 alphabet_path,
                 language_model_path=None,
                 trie_path=None):
        """
        Build the DeepSpeech model and, optionally, its language-model decoder.

        :param model_path: Absolute path to (acoustic) model file.
        :param alphabet_path: Absolute path to file containing alphabet.
        :param language_model_path: Absolute path to language model file.
        Optional; decoding with a language model is enabled only when this
        and ``trie_path`` are both given.
        :param trie_path: Absolute path to trie. Optional; see
        ``language_model_path``.
        """

        # Hyper-parameters follow the reference client:
        # https://github.com/mozilla/DeepSpeech/blob/master/native_client/python/client.py
        acoustic_options = dict(aModelPath=model_path,
                                aNCep=26,
                                aNContext=9,
                                aAlphabetConfigPath=alphabet_path,
                                aBeamWidth=500)
        self._model = Model(**acoustic_options)

        use_lm = language_model_path is not None and trie_path is not None
        if use_lm:
            decoder_options = dict(aAlphabetConfigPath=alphabet_path,
                                   aLMPath=language_model_path,
                                   aTriePath=trie_path,
                                   aLMWeight=1.75,
                                   aWordCountWeight=1.0,
                                   aValidWordCountWeight=1.0)
            self._model.enableDecoderWithLM(**decoder_options)
        # Recorded only after the decoder call, so a failure there leaves the
        # flag unset.
        self._with_language_model = use_lm
Exemplo n.º 5
0
def main():
    """Transcribe every Flickr8k WAV file into a tab-separated text file.

    Loads the DeepSpeech model and language-model decoder from the fixed
    ``models/`` paths, then writes one ``<basename>\\t<transcript>`` line per
    WAV file to ``flickr_audio_transcription.txt``.  Files whose sample rate
    is not 16 kHz are resampled with ``convert_samplerate`` first.  A
    ``ValueError`` raised while handling a file is printed and that file is
    skipped.
    """
    model = "models/output_graph.pb"
    alphabet = "models/alphabet.txt"
    lm = "models/lm.binary"
    trie = "models/trie"

    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)

    ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT, WORD_COUNT_WEIGHT,
                           VALID_WORD_COUNT_WEIGHT)

    with open("flickr_audio_transcription.txt", "w") as out:
        for audio_f in glob.glob(
                "/roaming/gchrupal/vgs/data/flickr8k/flickr_audio/wavs/*.wav"):
            print("Transcribing {}".format(audio_f))
            try:
                fs, audio = wav.read(audio_f)
                if fs != 16000:
                    if fs < 16000:
                        print(
                            'Warning: original sample rate (%d) is lower than 16kHz. Up-sampling might produce erratic speech recognition.'
                            % (fs),
                            file=sys.stderr)
                    # Resample the file being processed.  The previous code
                    # passed ``args.audio``, but no ``args`` exists in this
                    # function, and it would not be the current file anyway.
                    fs, audio = convert_samplerate(audio_f)
                basename = os.path.splitext(os.path.basename(audio_f))[0]
                out.write("{}\t{}\n".format(basename, ds.stt(audio, fs)))
                # Flush per file so partial results survive an interruption.
                out.flush()
            except ValueError as e:
                print("Error: {}".format(e))
Exemplo n.º 6
0
def recognize(model="../models/output_graph.pb",
              audio="../audio/2830-3980-0043.wav",
              alphabet="../models/alphabet.txt",
              lm="../models/lm.binary",
              trie="../models/trie"):
    """Transcribe a single WAV file and return the recognised text.

    The language-model decoder is enabled only when both ``lm`` and ``trie``
    are truthy.  All progress, timing and the transcript itself are printed
    to stderr.

    :param model: path to the acoustic model file.
    :param audio: path to the WAV file to transcribe.
    :param alphabet: path to the alphabet configuration file.
    :param lm: path to the language model binary.
    :param trie: path to the language model trie.
    :return: the transcript produced by the model.
    :raises AssertionError: if the WAV file is not sampled at 16000 Hz.
    """
    print('Loading model from file %s' % (model), file=sys.stderr)
    started = timer()
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    elapsed = timer() - started
    print('Loaded model in %0.3fs.' % (elapsed), file=sys.stderr)

    if lm and trie:
        print('Loading language model from files %s %s' % (lm, trie),
              file=sys.stderr)
        started = timer()
        ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        elapsed = timer() - started
        print('Loaded language model in %0.3fs.' % (elapsed),
              file=sys.stderr)

    sample_rate, samples = wav.read(audio)
    # Duration uses a fixed 16 kHz rate, enforced by the assertion below.
    duration = len(samples) * (1 / 16000)
    assert sample_rate == 16000, "Only 16000Hz input WAV files are supported for now!"

    print('Running inference.', file=sys.stderr)
    started = timer()
    result = ds.stt(samples, sample_rate)
    print(result, file=sys.stderr)
    # The timer stops after the transcript is printed, so the reported time
    # includes that print.
    elapsed = timer() - started
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (elapsed, duration),
          file=sys.stderr)
    return result
Exemplo n.º 7
0
def load_model():
    """Load the DeepSpeech model from the fixed ``./models`` paths.

    The language-model decoder is enabled as well (both paths are non-empty
    literals).  Progress and timing messages are printed to stderr.

    :return: the loaded model.
    """
    paths = {
        'model': './models/output_graph.pb',
        'alphabet': './models/alphabet.txt',
        'lm': './models/lm.binary',
        'trie': './models/trie',
        'audio': './sample_input.wav'
    }
    model_path, alphabet_path = paths['model'], paths['alphabet']
    lm_path, trie_path = paths['lm'], paths['trie']

    print('Loading model from file {}'.format(model_path), file=sys.stderr)
    started = timer()
    ds = Model(model_path, N_FEATURES, N_CONTEXT, alphabet_path,
               BEAM_WIDTH)
    elapsed = timer() - started
    print('Loaded model in {:.3}s.'.format(elapsed), file=sys.stderr)

    if lm_path and trie_path:
        print('Loading language model from files {} {}'.format(
            lm_path, trie_path),
              file=sys.stderr)
        started = timer()
        ds.enableDecoderWithLM(alphabet_path,
                               lm_path,
                               trie_path,
                               aLMWeight=LM_WEIGHT,
                               aValidWordCountWeight=VALID_WORD_COUNT_WEIGHT,
                               aWordCountWeight=WORD_COUNT_WEIGHT)
        elapsed = timer() - started
        print('Loaded language model in {:.3}s.'.format(elapsed),
              file=sys.stderr)
    return ds
Exemplo n.º 8
0
class DeepSpeech:
    """Thin wrapper around a DeepSpeech ``Model`` that transcribes WAV files."""

    def __init__(self, model, alphabet, lm=None, trie=None):
        """Load the acoustic model and, optionally, the language model.

        :param model: path to the acoustic model file.
        :param alphabet: path to the alphabet configuration file.
        :param lm: path to the language model binary; the decoder is enabled
            only when both ``lm`` and ``trie`` are not ``None``.
        :param trie: path to the language model trie.
        """
        print('Loading model from file %s' % (model), file=sys.stderr)
        model_load_start = timer()
        self.ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
        model_load_end = timer() - model_load_start
        print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

        if lm is not None and trie is not None:
            print('Loading language model from files %s %s' % (lm, trie),
                  file=sys.stderr)
            lm_load_start = timer()
            self.ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                                        WORD_COUNT_WEIGHT,
                                        VALID_WORD_COUNT_WEIGHT)
            lm_load_end = timer() - lm_load_start
            print('Loaded language model in %0.3fs.' % (lm_load_end),
                  file=sys.stderr)

    def stt(self, audio_file):
        """Transcribe ``audio_file`` and return the recognised text.

        The transcript is also printed to stdout; timing goes to stderr.

        :param audio_file: path (or file object) accepted by ``wav.read``.
        :return: the transcript produced by the model.
        """
        fs, audio = wav.read(audio_file)
        # Use the file's own sample rate for the reported duration: nothing
        # here checks that the input is 16 kHz, so a hard-coded 16000 would
        # misreport other rates.  A zero rate is guarded so that logging
        # cannot raise.
        audio_length = float(len(audio)) / fs if fs else 0.0
        print('Running inference.', file=sys.stderr)
        inference_start = timer()
        stt_result = self.ds.stt(audio, fs)
        print('Return result: ', stt_result)
        inference_end = timer() - inference_start
        print('Inference took %0.3fs for %0.3fs audio file.' %
              (inference_end, audio_length),
              file=sys.stderr)
        return stt_result
Exemplo n.º 9
0
 def __init__(self, modelPath, alphabet, lmPath, trie, numFeatures=26, numContext=9, beamWidth=500):
     """Load the acoustic model and report the load time on stderr.

     ``lmPath`` and ``trie`` are accepted but not used here: the
     language-model decoder is not enabled by this constructor.
     """
     print('Loading model from file %s' % modelPath, file=sys.stderr)
     started = timer()
     acoustic_args = (modelPath, numFeatures, numContext, alphabet, beamWidth)
     self.model = Model(*acoustic_args)
     elapsed = timer() - started
     print('Loaded model in %0.3fs.' % (elapsed), file=sys.stderr)
Exemplo n.º 10
0
    def __init__(self, vocabulary, graph="models/output_graph.pb",
                 alphabet="models/alphabet.txt"):
        """Create the logger and load the DeepSpeech model.

        :param vocabulary: accepted but not used in this method.
        :param graph: path to the model graph file.
        :param alphabet: path to the alphabet file.
        """
        log = logging.getLogger(__name__)
        self._logger = log
        log.debug(
            "Initializing DeepSpeech with graph '%s' and alphabet '%s'",
            graph, alphabet)
        # 26 / 9 / 500 are passed positionally as the model's feature count,
        # context size and beam width arguments.
        self._model = Model(graph, 26, 9, alphabet, 500)
class SpeechRecognizer:
    """Speech-to-text service backed by a DeepSpeech model with LM decoding."""

    def __init__(self):
        """Load the model and enable the language-model decoder.

        All paths and weights come from module-level constants.
        """
        self._model = Model(MODEL_PATH, N_FEATURES, N_CONTEXT, ALPHABET_PATH,
                            BEAM_WIDTH)

        decoder_args = (ALPHABET_PATH, LANGUAGE_MODEL_PATH, TRIE_PATH,
                        LM_WEIGHT, WORD_COUNT_WEIGHT,
                        VALID_WORD_COUNT_WEIGHT)
        self._model.enableDecoderWithLM(*decoder_args)

    def speech_to_text(self, audio_buffer, sample_rate):
        """Transcribe raw audio bytes and return the recognised text.

        :param audio_buffer: buffer interpreted as int16 samples.
        :param sample_rate: sample rate of ``audio_buffer``; audio at any
            other rate than ``SAMPLE_RATE`` is resampled first.
        """
        app.logger.info('processing audio file')
        samples = self._process_audio_data(audio_buffer, sample_rate)
        app.logger.info('starting recognition')

        began = time()
        transcript = self._model.stt(samples, SAMPLE_RATE)
        finished = time()
        app.logger.info('finished in {:.3f}s'.format(finished - began))

        return transcript

    def _process_audio_data(self, audio_buffer, original_sample_rate):
        """Decode the buffer to int16 samples, resampling when rates differ."""
        samples = np.frombuffer(audio_buffer, dtype=np.int16)
        if original_sample_rate == SAMPLE_RATE:
            return samples
        return self._resample(samples, original_sample_rate)

    def _resample(self, audio, original_sample_rate):
        """Resample ``audio`` to ``SAMPLE_RATE`` and return it as int16."""
        seconds = len(audio) / original_sample_rate
        target_count = int(seconds * SAMPLE_RATE)
        resampled = signal.resample(audio, target_count)
        return resampled.astype(np.int16)
Exemplo n.º 12
0
    def _worker_thread(self):
        """Load the model, then serve commands from ``self._queue`` until 'stop'.

        Each queue item is a sequence whose first element is the command:

        * ``'sample'`` -- the next element is a sample object with a
          ``wav_path``; the file is transcribed, timing is printed and
          ``self.inference_done`` is emitted with the sample and the result.
        * ``'stop'`` -- leaves the loop.

        Any other command is ignored.
        """
        print('restoring from {}'.format(model_file))
        model = Model(model_file, N_INPUT, N_CONTEXT, ALPHABET_CONFIG_PATH,
                      BEAM_WIDTH)
        model.enableDecoderWithLM(ALPHABET_CONFIG_PATH, LM_BINARY_PATH,
                                  LM_TRIE_PATH, LM_WEIGHT, WORD_COUNT_WEIGHT,
                                  VALID_WORD_COUNT_WEIGHT)

        while True:
            cmd, *args = self._queue.get()
            if cmd == 'sample':
                sample = args[0]
                # Close the WAV handle as soon as the frames are read; it was
                # previously left open, leaking one handle per sample.
                with wave.open(sample.wav_path) as wav_file:
                    audio = np.frombuffer(
                        wav_file.readframes(wav_file.getnframes()),
                        dtype=np.int16)
                    fs = wav_file.getframerate()
                start = time.time()
                result = model.stt(audio, fs)
                inference_time = time.time() - start
                wav_time = wav_length(sample.wav_path)
                # NOTE(review): '{:2f}' is a width of 2 with default
                # precision; '{:.2f}' may have been intended -- confirm.
                print('wav length: {}\ninference time: {}\nRTF: {:2f}'.format(
                    wav_time, inference_time, inference_time / wav_time))
                self.inference_done.emit(sample, result)
            elif cmd == 'stop':
                break

        # NOTE(review): ``sess`` is not defined in this method; presumably a
        # module-level session object -- verify it exists at this point.
        sess.close()
Exemplo n.º 13
0
class transciber(object):
    """Loads a DeepSpeech model with LM decoding and transcribes WAV files."""

    def __init__(self,
                 modelPath,
                 alphabet,
                 lmPath,
                 trie,
                 numFeatures=26,
                 numContext=9,
                 beamWidth=500):
        """Load the acoustic model and enable the language-model decoder.

        The reported load time covers both steps.

        :param modelPath: path to the acoustic model file.
        :param alphabet: path to the alphabet configuration file.
        :param lmPath: path to the language model binary.
        :param trie: path to the language model trie.
        :param numFeatures: feature-count argument passed to ``Model``.
        :param numContext: context-size argument passed to ``Model``.
        :param beamWidth: beam-width argument passed to ``Model``.
        """
        print('Loading model from file %s' % modelPath, file=sys.stderr)
        model_load_start = timer()
        self.model = Model(modelPath, numFeatures, numContext, alphabet,
                           beamWidth)
        self.model.enableDecoderWithLM(alphabet, lmPath, trie, LM_WEIGHT,
                                       WORD_COUNT_WEIGHT,
                                       VALID_WORD_COUNT_WEIGHT)
        model_load_end = timer() - model_load_start
        print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

    def transcribe(self, audioPath):
        """Transcribe the WAV file at ``audioPath``, print and return the text."""
        fs, audio = wav.read(audioPath)
        # The former ``audio_length`` computation was never used and has been
        # dropped.
        label = self.model.stt(audio, fs)
        print(label)
        return label
Exemplo n.º 14
0
def main():
    """Command-line entry point: transcribe one 16 kHz WAV file.

    Positional arguments, in order: ``model``, ``audio``, ``alphabet``,
    ``lm``, ``trie``.  Every one of them may be omitted, in which case its
    declared default path is used.  The transcript is printed to stdout;
    progress and timing go to stderr.

    :raises AssertionError: if the WAV file is not sampled at 16000 Hz.
    """
    parser = argparse.ArgumentParser(
        description='Benchmarking tooling for DeepSpeech native_client.')
    # argparse ignores ``default`` on a required positional, so the first
    # three arguments are made optional (nargs='?') to let their declared
    # defaults take effect.  Invocations that pass them keep working.
    parser.add_argument('model',
                        type=str,
                        nargs='?',
                        help='Path to the model (protocol buffer binary file)',
                        default="models/output_graph.pb")
    parser.add_argument('audio',
                        type=str,
                        nargs='?',
                        help='Path to the audio file to run (WAV format)',
                        default="sample_input.wav")
    parser.add_argument(
        'alphabet',
        type=str,
        nargs='?',
        help=
        'Path to the configuration file specifying the alphabet used by the network',
        default="models/alphabet.txt")
    parser.add_argument('lm',
                        type=str,
                        nargs='?',
                        help='Path to the language model binary file',
                        default="models/lm.binary")
    parser.add_argument(
        'trie',
        type=str,
        nargs='?',
        help=
        'Path to the language model trie file created with native_client/generate_trie',
        default="models/trie")
    args = parser.parse_args()

    print('Loading model from file %s' % (args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files %s %s' % (args.lm, args.trie),
              file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end),
              file=sys.stderr)

    fs, audio = wav.read(args.audio)
    # Duration uses a fixed 16 kHz rate, enforced by the assertion below.
    audio_length = len(audio) * (1 / 16000)
    assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length),
          file=sys.stderr)
    def __init__(self):
        """Load the model and enable the language-model decoder.

        All paths and weights come from module-level constants.
        """
        self._model = Model(MODEL_PATH, N_FEATURES, N_CONTEXT, ALPHABET_PATH,
                            BEAM_WIDTH)

        decoder_args = (ALPHABET_PATH, LANGUAGE_MODEL_PATH, TRIE_PATH,
                        LM_WEIGHT, WORD_COUNT_WEIGHT,
                        VALID_WORD_COUNT_WEIGHT)
        self._model.enableDecoderWithLM(*decoder_args)
Exemplo n.º 16
0
def main(model, alphabet, lm, trie, dest):
    """Transcribe the numbered clips in ``dest`` and return the transcripts.

    One clip is processed per directory entry of ``dest``; clip ``i`` is read
    from ``dest + str(i) + '.wav'``, so ``dest`` must end with a path
    separator.  Each transcript is printed to stdout; progress and timing go
    to stderr.

    :param model: path to the acoustic model file.
    :param alphabet: path to the alphabet configuration file.
    :param lm: path to the language model binary (decoder is enabled only
        when ``lm`` and ``trie`` are both truthy).
    :param trie: path to the language model trie.
    :param dest: directory prefix holding the clips.
    :return: list of transcripts, in clip-index order.
    """
    print('Loading model from file %s' % (model), file=sys.stderr)
    started = timer()
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    elapsed = timer() - started
    print('Loaded model in %0.3fs.' % (elapsed), file=sys.stderr)

    if lm and trie:
        print('Loading language model from files %s %s' % (lm, trie), file=sys.stderr)
        started = timer()
        ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        elapsed = timer() - started
        print('Loaded language model in %0.3fs.' % (elapsed), file=sys.stderr)

    print('Running inference.', file=sys.stderr)
    clip_count = len(os.listdir(dest))

    subs = []
    for index in range(clip_count):
        clip_path = dest + str(index) + '.wav'

        # This first read only feeds the low-sample-rate warning; its audio
        # data is replaced by the converted data below.
        fs, audio = wav.read(clip_path)
        if fs < 16000:
            print('Warning: original sample rate (%d) is lower than 16kHz. Up-sampling might produce erratic speech recognition.' % (fs), file=sys.stderr)

        # NOTE(review): conversion runs for every clip, including those that
        # are already 16 kHz -- confirm this is intended.
        fs, audio = convert_samplerate(clip_path)
        duration = len(audio) * (1 / 16000)

        started = timer()
        transcript = ds.stt(audio, fs)
        subs.append(transcript)
        print(transcript)

        elapsed = timer() - started
        print('Inference took %0.3fs for %0.3fs audio file.' % (elapsed, duration), file=sys.stderr)

    return subs
Exemplo n.º 17
0
def main():
    """Command-line entry point: transcribe one WAV file with DeepSpeech.

    Positional arguments, in order: ``model``, ``alphabet``, optional ``lm``,
    optional ``trie``, ``audio``.  Audio that is not 16 kHz is resampled with
    ``convert_samplerate``.  The transcript is printed to stdout; progress
    and timing go to stderr.
    """
    parser = argparse.ArgumentParser(
        description='Benchmarking tooling for DeepSpeech native_client.')
    # Declaration order matters: it defines the positional order on the
    # command line.
    argument_specs = (
        ('model', {
            'help': 'Path to the model (protocol buffer binary file)'
        }),
        ('alphabet', {
            'help':
            'Path to the configuration file specifying the alphabet used by the network'
        }),
        ('lm', {
            'nargs': '?',
            'help': 'Path to the language model binary file'
        }),
        ('trie', {
            'nargs': '?',
            'help':
            'Path to the language model trie file created with native_client/generate_trie'
        }),
        ('audio', {
            'help': 'Path to the audio file to run (WAV format)'
        }),
    )
    for name, options in argument_specs:
        parser.add_argument(name, **options)
    args = parser.parse_args()

    print('Loading model from file %s' % (args.model), file=sys.stderr)
    started = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    elapsed = timer() - started
    print('Loaded model in %0.3fs.' % (elapsed), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files %s %s' % (args.lm, args.trie),
              file=sys.stderr)
        started = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        elapsed = timer() - started
        print('Loaded language model in %0.3fs.' % (elapsed),
              file=sys.stderr)

    sample_rate, samples = wav.read(args.audio)
    if sample_rate != 16000:
        if sample_rate < 16000:
            print(
                'Warning: original sample rate (%d) is lower than 16kHz. Up-sampling might produce erratic speech recognition.'
                % (sample_rate),
                file=sys.stderr)
        sample_rate, samples = convert_samplerate(args.audio)
    duration = len(samples) * (1 / 16000)

    print('Running inference.', file=sys.stderr)
    started = timer()
    print(ds.stt(samples, sample_rate))
    elapsed = timer() - started
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (elapsed, duration),
          file=sys.stderr)
Exemplo n.º 18
0
    def setup_model(model_path, alphabet, lm, trie):
        """Create a DeepSpeech model, enabling LM decoding when possible.

        :return: the model, or ``None`` when ``model_path`` or ``alphabet``
            is falsy.
        """
        if not (model_path and alphabet):
            return None

        print("creating model {} {}".format(model_path, alphabet))
        ds_model = Model(model_path, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)

        # The decoder needs both the language model and its trie.
        if lm and trie:
            decoder_args = (alphabet, lm, trie, LM_WEIGHT,
                            WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
            ds_model.enableDecoderWithLM(*decoder_args)
        return ds_model
Exemplo n.º 19
0
 def __init__(self, *args, **kwargs):
     """Initialise the plugin base class and load the DeepSpeech model.

     The graph and alphabet paths are read from the ``'deepspeech'``
     section of ``self.profile``.
     """
     plugin.STTPlugin.__init__(self, *args, **kwargs)
     self._logger = logging.getLogger(__name__)
     config = self.profile['deepspeech']
     self._plugin_config = config
     graph_path = config['graph']
     alphabet_path = config['alphabet']
     self._logger.debug(
         "Initializing DeepSpeech with graph '%s' and alphabet '%s'",
         graph_path, alphabet_path)
     # 26 / 9 / 500 are passed positionally as the model's feature count,
     # context size and beam width arguments.
     self._model = Model(graph_path, 26, 9, alphabet_path, 500)
Exemplo n.º 20
0
def load_model():
    """Load the DeepSpeech model and LM decoder from the working directory.

    :return: the model with the language-model decoder enabled.
    """
    model_path, alphabet_path = 'output_graph.pb', 'alphabet.txt'
    lm_path, trie_path = 'lm.binary', 'trie'

    ds = Model(model_path, N_FEATURES, N_CONTEXT, alphabet_path, BEAM_WIDTH)

    decoder_args = (alphabet_path, lm_path, trie_path, LM_WEIGHT,
                    WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
    ds.enableDecoderWithLM(*decoder_args)
    return ds
Exemplo n.º 21
0
        def setup_model(model_path, alphabet, lm, trie):
            """Create a DeepSpeech model, enabling LM decoding when possible.

            The decoder is enabled only when both ``lm`` and ``trie`` are
            truthy.  Progress is reported through ``log``.

            :return: the created model.
            """
            log("creating model {} {}...".format(model_path, alphabet))
            ds_model = Model(model_path, N_FEATURES, N_CONTEXT, alphabet,
                             BEAM_WIDTH)

            use_lm = lm and trie
            if use_lm:
                decoder_args = (alphabet, lm, trie, LM_WEIGHT,
                                WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
                ds_model.enableDecoderWithLM(*decoder_args)
            log("model is ready.")
            return ds_model
Exemplo n.º 22
0
def main():
    """Command-line entry point: run DeepSpeech inference on one WAV file.

    Options: ``--model``, ``--alphabet``, ``--lm``, ``--trie``, ``--audio``
    and ``--version``.  With ``--version`` the versions are printed and the
    function returns 0 without loading anything.  Otherwise the transcript
    is printed to stdout and progress/timing go to stderr.  Audio that is
    not 16 kHz is resampled with ``convert_samplerate``.

    :return: 0 when ``--version`` was requested, otherwise ``None``.
    """
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model',
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--alphabet',
                        help='Path to the configuration file specifying the alphabet used by the network')
    parser.add_argument('--lm', nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument('--trie', nargs='?',
                        help='Path to the language model trie file created with native_client/generate_trie')
    parser.add_argument('--audio',
                        help='Path to the audio file to run (WAV format)')
    # nargs='?' with const lets a bare ``--version`` work as a flag; before,
    # it demanded a dummy value.  ``--version <anything>`` is still accepted.
    parser.add_argument('--version', nargs='?', const=True,
                        help='Print version and exits')
    args = parser.parse_args()

    if args.version:
        print_versions()
        return 0

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

    fin = wave.open(args.audio, 'rb')
    try:
        fs_orig = fin.getframerate()
        frame_count = fin.getnframes()
        if fs_orig != 16000:
            print('Warning: original sample rate ({}) is different than 16kHz. Resampling might produce erratic speech recognition.'.format(fs_orig), file=sys.stderr)
            fs, audio = convert_samplerate(args.audio)
        else:
            fs = fs_orig
            audio = np.frombuffer(fin.readframes(frame_count), np.int16)
    finally:
        # Release the handle even when reading or resampling raises.
        fin.close()

    # ``frame_count`` counts frames at the file's original rate, so divide
    # by that rate; a hard-coded 16000 misreports resampled input.  A zero
    # rate is guarded so that logging cannot raise.
    audio_length = float(frame_count) / fs_orig if fs_orig else 0.0

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
Exemplo n.º 23
0
 def load_model(self):
     """Load the model into ``self.model`` and optionally enable LM decoding.

     The decoder is enabled only when both ``self.lm`` and ``self.trie``
     are not ``None``.  Load times are printed to stdout.
     """
     began = timer()
     self.model = Model(self.graph, self.n_features, self.n_context, self.alphabet, self.beam_width)
     finished = timer()
     print('Loaded model in %0.3fs.' % (finished - began))

     if self.lm is None or self.trie is None:
         return

     began = timer()
     self.model.enableDecoderWithLM(self.alphabet, self.lm, self.trie,
                                    self.lm_weight, self.word_count_weight,
                                    self.valid_word_count_weight)
     finished = timer()
     print('Loaded language model in %0.3fs.' % (finished - began))
Exemplo n.º 24
0
def loadModel():
    """Load the DeepSpeech model and LM decoder into the global ``ds``.

    Paths and weights come from module-level constants; progress and timing
    messages are printed to stderr.
    """
    global ds

    print('Loading model from file %s' % (MODEL_FILE), file=sys.stderr)
    started = timer()
    ds = Model(MODEL_FILE, N_FEATURES, N_CONTEXT, ALPHABET_FILE, BEAM_WIDTH)
    elapsed = timer() - started
    print('Loaded model in %0.3fs.' % (elapsed), file=sys.stderr)

    print('Loading language model from files %s %s' %
          (LM_BINARY_FILE, TRIE_FILE),
          file=sys.stderr)
    started = timer()
    decoder_args = (ALPHABET_FILE, LM_BINARY_FILE, TRIE_FILE, LM_WEIGHT,
                    WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
    ds.enableDecoderWithLM(*decoder_args)
    elapsed = timer() - started
    print('Loaded language model in %0.3fs.' % (elapsed), file=sys.stderr)
Exemplo n.º 25
0
def stt(audioPath):
    """Transcribe the WAV file at ``audioPath`` and return the text.

    Model paths are read from ``conf.get_config``; a new model is created on
    every call.  The language-model decoder is enabled only when both the
    ``lm`` and ``trie`` settings are truthy.
    """
    model, alphabet, lm, trie = [
        conf.get_config(key) for key in ('model', 'alphabet', 'lm', 'trie')
    ]

    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    if lm and trie:
        decoder_args = (alphabet, lm, trie, LM_WEIGHT,
                        WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        ds.enableDecoderWithLM(*decoder_args)

    sample_rate, samples = wav.read(audioPath)
    return ds.stt(samples, sample_rate)
    def load_model(self):
        """Load the model into ``self.ds`` and enable the LM decoder.

        Paths and weights come from module-level constants; progress and
        timing messages are printed to stdout.
        """
        print('Loading model from file %s' % (MODEL_PATH))
        started = timer()
        self.ds = Model(MODEL_PATH, N_FEATURES, N_CONTEXT, ALPHABET_PATH,
                        BEAM_WIDTH)
        elapsed = timer() - started
        print('Loaded model in %0.3fs.' % (elapsed))

        print('Loading language model from files %s %s' % (LM_PATH, TRIE_PATH))
        started = timer()
        decoder_args = (ALPHABET_PATH, LM_PATH, TRIE_PATH, LM_WEIGHT,
                        WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        self.ds.enableDecoderWithLM(*decoder_args)
        elapsed = timer() - started
        print('Loaded language model in %0.3fs.' % (elapsed))
Exemplo n.º 27
0
class DeepSpeechSTT(plugin.STTPlugin):
    """
    Speech-to-Text plugin backed by a DeepSpeech model.
    """
    def __init__(self, *args, **kwargs):
        """
        Initialise the plugin base class and load the DeepSpeech model.

        The graph and alphabet paths are read from the ``'deepspeech'``
        section of ``self.profile``.
        """
        plugin.STTPlugin.__init__(self, *args, **kwargs)
        self._logger = logging.getLogger(__name__)
        config = self.profile['deepspeech']
        self._plugin_config = config
        graph_path = config['graph']
        alphabet_path = config['alphabet']
        self._logger.debug(
            "Initializing DeepSpeech with graph '%s' and alphabet '%s'",
            graph_path, alphabet_path)
        # 26 / 9 / 500 are passed positionally as the model's feature count,
        # context size and beam width arguments.
        self._model = Model(graph_path, 26, 9, alphabet_path, 500)

    def transcribe(self, fp):
        """
        Transcribe an audio file and return the recognised text.

        Arguments:
            fp -- a file object containing audio data
        """
        sample_rate, samples = wav.read(fp)
        transcript = self._model.stt(samples, sample_rate)
        return transcript
Exemplo n.º 28
0
def main():
    """Transcribe every WAV file matching a glob pattern with DeepSpeech.

    Positional command-line arguments: ``model``, ``audio`` (expanded with
    ``glob``), ``alphabet`` and, optionally, ``lm`` and ``trie``. The
    language-model decoder is enabled only when both ``lm`` and ``trie`` are
    given.

    Matching files are processed in sorted order. A file is skipped when a
    ``.txt`` file with the same base name already exists; otherwise its
    transcript is printed to stdout and written to that ``.txt`` file.
    Progress and timing messages go to stderr.

    Raises:
        AssertionError: if a WAV file's sample rate is not 16000 Hz.
    """
    parser = argparse.ArgumentParser(description='Benchmarking tooling for DeepSpeech native_client.')
    parser.add_argument('model', type=str,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('audio', type=str,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('alphabet', type=str,
                        help='Path to the configuration file specifying the alphabet used by the network')
    parser.add_argument('lm', type=str, nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument('trie', type=str, nargs='?',
                        help='Path to the language model trie file created with native_client/generate_trie')
    args = parser.parse_args()

    print('Loading model from file %s' % (args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files %s %s' % (args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end), file=sys.stderr)

    for path in sorted(glob.glob(args.audio)):
        target = os.path.splitext(path)[0] + '.txt'
        if os.path.exists(target):
            # A transcript for this file already exists; do not redo it.
            continue

        fs, audio = wav.read(path)
        if fs != 16000:
            # Raised explicitly rather than via `assert` so the check still
            # runs under `python -O`; otherwise a wrong-rate file would be
            # transcribed and its bad transcript written to disk, which
            # would also make later runs skip the file.
            raise AssertionError("Only 16000Hz input WAV files are supported for now!")
        # The rate was verified above, so the duration follows from 16kHz.
        audio_length = len(audio) * (1 / 16000)

        print('Running inference of %s.' % path, file=sys.stderr)
        inference_start = timer()
        text = ds.stt(audio, fs)
        # Stop the clock before printing so stdout I/O is not counted as
        # inference time.
        inference_end = timer() - inference_start
        print(text)
        print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)

        with open(target, 'w') as out:
            out.write(text)
Exemplo n.º 29
0
def main():
    """Transcribe the start of a fixed audio file with DeepSpeech.

    'data/sesq316qna.mp3' is loaded, set to a 16000 frame rate, cut to its
    first two minutes and exported as 'data/testing.wav'. That WAV file is
    read back and passed to the model. The transcript is printed to stdout;
    progress and timing messages go to stderr.
    """
    print('Loading model from file %s' % MODEL, file=sys.stderr)
    load_started = timer()
    ds = Model(MODEL, N_FEATURES, N_CONTEXT, ALPHABET, BEAM_WIDTH)
    load_elapsed = timer() - load_started
    print('Loaded model in %0.3fs.' % load_elapsed, file=sys.stderr)

    # Uncomment if you want to use a language model
    # =============================================

    # print('Loading language model from files %s %s' % (LANGUAGE_MODEL, TRIE), file=sys.stderr)
    # lm_load_start = timer()
    # ds.enableDecoderWithLM(ALPHABET, LANGUAGE_MODEL, TRIE, LM_WEIGHT,
    #                        WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
    # lm_load_end = timer() - lm_load_start
    # print('Loaded language model in %0.3fs.' % lm_load_end, file=sys.stderr)

    # Input recording and the WAV file it is converted into.
    source_path = 'data/sesq316qna.mp3'
    wav_path = 'data/testing.wav'

    # Resample to 16kHz, then keep only the first 2 minutes
    # (2 * 60 * 1000 = 120000).
    resampled = AudioSegment.from_file(source_path).set_frame_rate(16000)
    resampled[:120000].export(wav_path, format="wav")

    # Read the converted file back with the WAV reader.
    fs, audio = wav.read(wav_path)
    # The export above was done at 16kHz, so the duration is computed from
    # that rate and the rate itself is sanity-checked.
    audio_length = len(audio) * (1 / 16000)
    assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"

    print('Running inference.', file=sys.stderr)
    infer_started = timer()
    prediction_text = ds.stt(audio, fs)
    print(prediction_text)
    infer_elapsed = timer() - infer_started
    timing = (infer_elapsed, audio_length)
    print('Inference took %0.3fs for %0.3fs audio file.' % timing,
          file=sys.stderr)
Exemplo n.º 30
0
    def __init__(self, model, alphabet, lm=None, trie=None):
        """Create the DeepSpeech model and optionally enable its LM decoder.

        :param model: model file handed to ``Model``
        :param alphabet: alphabet handed to ``Model`` and to the decoder
        :param lm: language model file; the decoder is enabled only when
            both ``lm`` and ``trie`` are not None
        :param trie: trie file used together with ``lm``

        The model is stored on ``self.ds``. Progress and timing messages are
        written to stderr.
        """
        print('Loading model from file %s' % model, file=sys.stderr)
        started = timer()
        self.ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
        elapsed = timer() - started
        print('Loaded model in %0.3fs.' % elapsed, file=sys.stderr)

        # Without both language-model files there is nothing more to set up.
        if lm is None or trie is None:
            return

        print('Loading language model from files %s %s' % (lm, trie),
              file=sys.stderr)
        started = timer()
        self.ds.enableDecoderWithLM(
            alphabet, lm, trie,
            LM_WEIGHT, WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        elapsed = timer() - started
        print('Loaded language model in %0.3fs.' % elapsed, file=sys.stderr)
Exemplo n.º 31
0
class MozillaDeepSpeechASREngine(ASREngine):
    """ASR engine using Mozilla DeepSpeech (https://github.com/mozilla/DeepSpeech)."""
    def __init__(self,
                 model_path,
                 alphabet_path,
                 language_model_path=None,
                 trie_path=None):
        """
        Create the DeepSpeech model and, if requested, its language-model decoder.

        :param model_path: Absolute path to (acoustic) model file.
        :param alphabet_path: Absolute path to file containing alphabet.
        :param language_model_path: Absolute path to language model file. Optional; decoding with a language
        model is enabled only when this and ``trie_path`` are both given.
        :param trie_path: Absolute path to trie. Optional; see ``language_model_path``.
        """

        # Parameter values follow
        # https://github.com/mozilla/DeepSpeech/blob/master/native_client/python/client.py
        self._model = Model(
            aModelPath=model_path,
            aNCep=26,
            aNContext=9,
            aAlphabetConfigPath=alphabet_path,
            aBeamWidth=500,
        )

        use_language_model = (
            language_model_path is not None and trie_path is not None)
        if use_language_model:
            self._model.enableDecoderWithLM(
                aAlphabetConfigPath=alphabet_path,
                aLMPath=language_model_path,
                aTriePath=trie_path,
                aLMWeight=1.75,
                aWordCountWeight=1.0,
                aValidWordCountWeight=1.0,
            )
        # Recorded only after the decoder has been enabled successfully.
        self._with_language_model = use_language_model

    def transcribe(self, path):
        """Read the audio file at ``path`` and return the model's transcription."""
        samples, sample_rate = soundfile.read(path)
        # Scale by the int16 maximum and convert to int16 before inference
        # (presumably the samples are floats in [-1, 1] -- confirm against
        # soundfile's defaults).
        int16_peak = np.iinfo(np.int16).max
        pcm = (int16_peak * samples).astype(np.int16)

        return self._model.stt(pcm, aSampleRate=sample_rate)

    def __str__(self):
        """Return the engine name, noting when a language model is in use."""
        label = 'Mozilla DeepSpeech'
        if self._with_language_model:
            label += ' (with language model)'
        return label
Exemplo n.º 32
0
def main():
    """Transcribe one audio file with DeepSpeech from the command line.

    Positional arguments, in order: ``model``, ``alphabet``, optional ``lm``
    and ``trie``, and ``audio``. The language-model decoder is enabled only
    when both ``lm`` and ``trie`` are given. Audio whose sample rate is not
    16000 is passed through ``convert_samplerate`` first. The transcript is
    printed to stdout; progress, warnings and timing go to stderr.
    """
    parser = argparse.ArgumentParser(description='Benchmarking tooling for DeepSpeech native_client.')
    # (name, extra add_argument options, help text), in positional order.
    positionals = (
        ('model', {},
         'Path to the model (protocol buffer binary file)'),
        ('alphabet', {},
         'Path to the configuration file specifying the alphabet used by the network'),
        ('lm', {'nargs': '?'},
         'Path to the language model binary file'),
        ('trie', {'nargs': '?'},
         'Path to the language model trie file created with native_client/generate_trie'),
        ('audio', {},
         'Path to the audio file to run (WAV format)'),
    )
    for arg_name, extra, help_text in positionals:
        parser.add_argument(arg_name, type=str, help=help_text, **extra)
    args = parser.parse_args()

    print('Loading model from file %s' % (args.model), file=sys.stderr)
    started = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    elapsed = timer() - started
    print('Loaded model in %0.3fs.' % elapsed, file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files %s %s' % (args.lm, args.trie), file=sys.stderr)
        started = timer()
        ds.enableDecoderWithLM(
            args.alphabet, args.lm, args.trie,
            LM_WEIGHT, WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        elapsed = timer() - started
        print('Loaded language model in %0.3fs.' % elapsed, file=sys.stderr)

    fs, audio = wav.read(args.audio)
    if fs < 16000:
        print('Warning: original sample rate (%d) is lower than 16kHz. Up-sampling might produce erratic speech recognition.' % (fs), file=sys.stderr)
    if fs != 16000:
        # Any other rate is converted before inference.
        fs, audio = convert_samplerate(args.audio)
    # The duration is computed from the 16kHz rate.
    audio_length = len(audio) * (1 / 16000)

    print('Running inference.', file=sys.stderr)
    started = timer()
    print(ds.stt(audio, fs))
    elapsed = timer() - started
    print('Inference took %0.3fs for %0.3fs audio file.' % (elapsed, audio_length), file=sys.stderr)