class SpeechRecognizer:
    def __init__(self):
        self._model = Model(MODEL_PATH, N_FEATURES, N_CONTEXT, ALPHABET_PATH,
                            BEAM_WIDTH)

        self._model.enableDecoderWithLM(ALPHABET_PATH, LANGUAGE_MODEL_PATH,
                                        TRIE_PATH, LM_WEIGHT,
                                        WORD_COUNT_WEIGHT,
                                        VALID_WORD_COUNT_WEIGHT)

    def speech_to_text(self, audio_buffer, sample_rate):
        app.logger.info('processing audio file')
        audio = self._process_audio_data(audio_buffer, sample_rate)
        app.logger.info('starting recognition')

        start = time()
        text = self._model.stt(audio, SAMPLE_RATE)
        end = time()
        app.logger.info('finished in {:.3f}s'.format(end - start))

        return text

    def _process_audio_data(self, audio_buffer, original_sample_rate):
        audio = np.frombuffer(audio_buffer, dtype=np.int16)
        if original_sample_rate != SAMPLE_RATE:
            audio = self._resample(audio, original_sample_rate)
        return audio

    def _resample(self, audio, original_sample_rate):
        audio_length = len(audio) / original_sample_rate
        samples = int(audio_length * SAMPLE_RATE)
        return signal.resample(audio, samples).astype(np.int16)
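
A self-contained illustration of the resampling step above (not part of the original class): the same arithmetic as _resample(), applied to a synthetic 44.1 kHz int16 buffer. The 440 Hz tone is just placeholder input.

import numpy as np
from scipy import signal

SAMPLE_RATE = 16000
original_rate = 44100

# one second of a 440 Hz tone as 16-bit PCM, standing in for real audio
t = np.linspace(0, 1, original_rate, endpoint=False)
audio = (np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)

# scale the sample count by the rate ratio, exactly as _resample() does
n_samples = int(len(audio) / original_rate * SAMPLE_RATE)
resampled = signal.resample(audio, n_samples).astype(np.int16)
print(len(audio), '->', len(resampled))  # 44100 -> 16000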
Example 2
    def _worker_thread(self):
        print('restoring from {}'.format(model_file))
        model = Model(model_file, N_INPUT, N_CONTEXT, ALPHABET_CONFIG_PATH,
                      BEAM_WIDTH)
        model.enableDecoderWithLM(ALPHABET_CONFIG_PATH, LM_BINARY_PATH,
                                  LM_TRIE_PATH, LM_WEIGHT, WORD_COUNT_WEIGHT,
                                  VALID_WORD_COUNT_WEIGHT)

        while True:
            cmd, *args = self._queue.get()
            if cmd == 'sample':
                sample = args[0]
                file = wave.open(sample.wav_path)
                audio = np.frombuffer(file.readframes(file.getnframes()),
                                      dtype=np.int16)
                fs = file.getframerate()
                start = time.time()
                result = model.stt(audio, fs)
                inference_time = time.time() - start
                wav_time = wav_length(sample.wav_path)
                print('wav length: {}\ninference time: {}\nRTF: {:.2f}'.format(
                    wav_time, inference_time, inference_time / wav_time))
                self.inference_done.emit(sample, result)
            elif cmd == 'stop':
                break

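The wav_length() helper used above is not defined in this snippet; a plausible implementation (an assumption, not the original code) derives the duration from the frame count and frame rate:

import wave

def wav_length(path):
    # duration of a WAV file in seconds: frames / frames-per-second
    with wave.open(path) as f:
        return f.getnframes() / f.getframerate()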
Example 3
def main():
    model = "models/output_graph.pb"
    alphabet = "models/alphabet.txt"
    lm = "models/lm.binary"
    trie = "models/trie"

    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)

    ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT, WORD_COUNT_WEIGHT,
                           VALID_WORD_COUNT_WEIGHT)

    with open("flickr_audio_transcription.txt", "w") as out:
        for audio_f in glob.glob(
                "/roaming/gchrupal/vgs/data/flickr8k/flickr_audio/wavs/*.wav"):
            print("Transcribing {}".format(audio_f))
            try:
                fs, audio = wav.read(audio_f)
                if fs != 16000:
                    if fs < 16000:
                        print(
                            'Warning: original sample rate (%d) is lower than 16kHz. Up-sampling might produce erratic speech recognition.'
                            % (fs),
                            file=sys.stderr)
                    fs, audio = convert_samplerate(audio_f)
                audio_length = len(audio) * (1 / 16000)
                basename, ext = os.path.splitext(os.path.basename(audio_f))
                out.write("{}\t{}\n".format(basename, ds.stt(audio, fs)))
                out.flush()
            except ValueError as e:
                print("Error: {}".format(e))
Example 4
def recognize(model="../models/output_graph.pb",
              audio="../audio/2830-3980-0043.wav",
              alphabet="../models/alphabet.txt",
              lm="../models/lm.binary",
              trie="../models/trie"):
    print('Loading model from file %s' % (model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

    if lm and trie:
        print('Loading language model from files %s %s' % (lm, trie),
              file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end),
              file=sys.stderr)

    fs, audio = wav.read(audio)
    # We can assume 16kHz
    audio_length = len(audio) * (1 / 16000)
    assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    result = ds.stt(audio, fs)
    print(result, file=sys.stderr)
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length),
          file=sys.stderr)
    return result
Example 5
class DeepSpeechSTT(plugin.STTPlugin):
    """
    DeepSpeech Speech-to-Text implementation
    """
    def __init__(self, *args, **kwargs):
        plugin.STTPlugin.__init__(self, *args, **kwargs)
        self._logger = logging.getLogger(__name__)
        self._plugin_config = self.profile['deepspeech']
        graph = self._plugin_config['graph']
        alphabet = self._plugin_config['alphabet']
        self._logger.debug(
            "Initializing DeepSpeech with graph '%s' " + "and alphabet '%s'",
            graph, alphabet)
        self._model = Model(graph, 26, 9, alphabet, 500)

    def transcribe(self, fp):
        """
        Performs STT, transcribing an audio file and returning the result.

        Arguments:
            fp -- a file object containing audio data
        """

        fs, audio = wav.read(fp)
        return self._model.stt(audio, fs)
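A hypothetical driver for this plugin (the constructed instance and the WAV path are placeholders; scipy.io.wavfile.read accepts open file objects, which is why transcribe() takes a file object):

# assumes an already-constructed DeepSpeechSTT instance named `stt`
# and a 16 kHz WAV file; both are placeholders
with open('sample.wav', 'rb') as fp:
    print(stt.transcribe(fp))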
Example 6
class DeepSpeech:
    def __init__(self, model, alphabet, lm=None, trie=None):
        print('Loading model from file %s' % (model), file=sys.stderr)
        model_load_start = timer()
        self.ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
        model_load_end = timer() - model_load_start
        print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

        if lm is not None and trie is not None:
            print('Loading language model from files %s %s' % (lm, trie),
                  file=sys.stderr)
            lm_load_start = timer()
            self.ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                                        WORD_COUNT_WEIGHT,
                                        VALID_WORD_COUNT_WEIGHT)
            lm_load_end = timer() - lm_load_start
            print('Loaded language model in %0.3fs.' % (lm_load_end),
                  file=sys.stderr)

    def stt(self, audio_file):
        fs, audio = wav.read(audio_file)
        audio_length = len(audio) * (1 / 16000)
        print('Running inference.', file=sys.stderr)
        inference_start = timer()
        stt_result = self.ds.stt(audio, fs)
        print('Return result: ', stt_result)
        inference_end = timer() - inference_start
        print('Inference took %0.3fs for %0.3fs audio file.' %
              (inference_end, audio_length),
              file=sys.stderr)
        return stt_result
Example 7
def main_deepspeech(args):
    args = parse_args_deep() if args is None else args
    print('Loading model from file %s' % (args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files %s %s' % (args.lm, args.trie),
              file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end),
              file=sys.stderr)

    fs, audio = wave.read(args.audio)
    # We can assume 16kHz
    audio_length = len(audio) * (1 / 16000)
    assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length),
          file=sys.stderr)
Example 8
class Transcriber(object):
    def __init__(self,
                 modelPath,
                 alphabet,
                 lmPath,
                 trie,
                 numFeatures=26,
                 numContext=9,
                 beamWidth=500):
        print('Loading model from file %s' % modelPath, file=sys.stderr)
        model_load_start = timer()
        self.model = Model(modelPath, numFeatures, numContext, alphabet,
                           beamWidth)
        self.model.enableDecoderWithLM(alphabet, lmPath, trie, LM_WEIGHT,
                                       WORD_COUNT_WEIGHT,
                                       VALID_WORD_COUNT_WEIGHT)
        model_load_end = timer() - model_load_start
        print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

    def transcribe(self, audioPath):
        fs, audio = wav.read(audioPath)
        audio_length = len(audio) * (1 / 16000)
        label = self.model.stt(audio, fs)
        print(label)
        return label
Example 9
class DeepSpeechImp:
    ds = None

    def __init__(self):

        logging.info('Loading model from file %s' % (shared_params.DS_MODEL))
        model_load_start = timer()
        self.ds = Model(shared_params.DS_MODEL, N_FEATURES, N_CONTEXT,
                        shared_params.DS_ALPHABET, shared_params.BEAM_WIDTH)
        model_load_end = timer() - model_load_start
        logging.info('Loaded model in %0.3fs.' % (model_load_end))
        logging.info('Loading language model from files %s %s' %
                     (shared_params.DS_LANGUAGE_MODEL, shared_params.DS_TRIE))
        lm_load_start = timer()
        self.ds.enableDecoderWithLM(shared_params.DS_ALPHABET,
                                    shared_params.DS_LANGUAGE_MODEL,
                                    shared_params.DS_TRIE,
                                    shared_params.LM_WEIGHT,
                                    shared_params.WORD_COUNT_WEIGHT,
                                    shared_params.VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        logging.info('Loaded language model in %0.3fs.' % (lm_load_end))

    def process_audio(self, audio_path):
        try:
            fs, audio = wav.read(audio_path)
            return self.ds.stt(audio, fs)
        except Exception as ex:
            logging.error(str(ex))
            return ""

    def __del__(self):
        del self.ds
Example 10
def main():
    parser = argparse.ArgumentParser(
        description='Benchmarking tooling for DeepSpeech native_client.')
    parser.add_argument('model',
                        type=str,
                        help='Path to the model (protocol buffer binary file)',
                        default="models/output_graph.pb")
    parser.add_argument('audio',
                        type=str,
                        help='Path to the audio file to run (WAV format)',
                        default="sample_input.wav")
    parser.add_argument(
        'alphabet',
        type=str,
        help=
        'Path to the configuration file specifying the alphabet used by the network',
        default="models/alphabet.txt")
    parser.add_argument('lm',
                        type=str,
                        nargs='?',
                        help='Path to the language model binary file',
                        default="models/lm.binary")
    parser.add_argument(
        'trie',
        type=str,
        nargs='?',
        help=
        'Path to the language model trie file created with native_client/generate_trie',
        default="models/trie")
    args = parser.parse_args()

    print('Loading model from file %s' % (args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files %s %s' % (args.lm, args.trie),
              file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end),
              file=sys.stderr)

    fs, audio = wav.read(args.audio)
    # We can assume 16kHz
    audio_length = len(audio) * (1 / 16000)
    assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length),
          file=sys.stderr)
Example 11
def main(model, alphabet, lm, trie, dest):

	# parser = argparse.ArgumentParser(description='Benchmarking tooling for DeepSpeech native_client.')
	# parser.add_argument('model', type=str,
	# 					help='Path to the model (protocol buffer binary file)')
	# parser.add_argument('alphabet', type=str,
	# 					help='Path to the configuration file specifying the alphabet used by the network')
	# parser.add_argument('lm', type=str, nargs='?',
	# 					help='Path to the language model binary file')
	# parser.add_argument('trie', type=str, nargs='?',
	# 					help='Path to the language model trie file created with native_client/generate_trie')
	# parser.add_argument('audio', type=str,
	# 					help='Path to the audio file to run (WAV format)')
	# args = parser.parse_args()

	# print(args);

	print('Loading model from file %s' % (model), file=sys.stderr)
	model_load_start = timer()
	ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
	model_load_end = timer() - model_load_start
	print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

	if lm and trie:
		print('Loading language model from files %s %s' % (lm, trie), file=sys.stderr)
		lm_load_start = timer()
		ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
							   WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
		lm_load_end = timer() - lm_load_start
		print('Loaded language model in %0.3fs.' % (lm_load_end), file=sys.stderr)

	# fs, audio = read_video(args.audio) #wav.read(args.audio)
	# return ;
	print('Running inference.', file=sys.stderr)
	clips = os.listdir(dest)  # clips dir path

	subs = []

	for i, clip in enumerate(clips):
		fs, audio = wav.read(dest + str(i) + '.wav')

		if fs != 16000:
			if fs < 16000:
				print('Warning: original sample rate (%d) is lower than 16kHz. Up-sampling might produce erratic speech recognition.' % (fs), file=sys.stderr)
			fs, audio = convert_samplerate(dest + str(i) + '.wav')

		audio_length = len(audio) * (1 / 16000)

		inference_start = timer()
		subs.append(ds.stt(audio, fs))
		print(subs[-1])

		inference_end = timer() - inference_start
		print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)

	return subs
Example 12
def main():
    parser = argparse.ArgumentParser(
        description='Benchmarking tooling for DeepSpeech native_client.')
    parser.add_argument('model',
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument(
        'alphabet',
        help=
        'Path to the configuration file specifying the alphabet used by the network'
    )
    parser.add_argument('lm',
                        nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument(
        'trie',
        nargs='?',
        help=
        'Path to the language model trie file created with native_client/generate_trie'
    )
    parser.add_argument('audio',
                        help='Path to the audio file to run (WAV format)')
    args = parser.parse_args()

    print('Loading model from file %s' % (args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files %s %s' % (args.lm, args.trie),
              file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end),
              file=sys.stderr)

    fs, audio = wav.read(args.audio)
    if fs != 16000:
        if fs < 16000:
            print(
                'Warning: original sample rate (%d) is lower than 16kHz. Up-sampling might produce erratic speech recognition.'
                % (fs),
                file=sys.stderr)
        fs, audio = convert_samplerate(args.audio)
    audio_length = len(audio) * (1 / 16000)

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length),
          file=sys.stderr)
Example 13
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model',
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--alphabet',
                        help='Path to the configuration file specifying the alphabet used by the network')
    parser.add_argument('--lm', nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument('--trie', nargs='?',
                        help='Path to the language model trie file created with native_client/generate_trie')
    parser.add_argument('--audio',
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--version',
                        help='Print version and exits')
    args = parser.parse_args()

    if args.version:
        print_versions()
        return 0

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3f}s.'.format(model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3f}s.'.format(lm_load_end), file=sys.stderr)

    fin = wave.open(args.audio, 'rb')
    fs = fin.getframerate()
    if fs != 16000:
        print('Warning: original sample rate ({}) is different than 16kHz. Resampling might produce erratic speech recognition.'.format(fs), file=sys.stderr)
        fs, audio = convert_samplerate(args.audio)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1/16000)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
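For reference, the wave-module read path used above can be exercised on its own. This sketch ('sample.wav' is a placeholder, mono audio assumed) loads every frame into an int16 array exactly as the example does:

import wave
import numpy as np

with wave.open('sample.wav', 'rb') as fin:
    fs = fin.getframerate()
    audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
print(fs, len(audio) / fs)  # sample rate and duration in seconds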
Example 14
def stt(audioPath):

    model = conf.get_config('model')
    alphabet = conf.get_config('alphabet')
    lm = conf.get_config('lm')
    trie = conf.get_config('trie')

    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    if lm and trie:
        ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)

    fs, audio = wav.read(audioPath)
    text = ds.stt(audio, fs)

    return text
Example 15
def main():
    parser = argparse.ArgumentParser(description='Benchmarking tooling for DeepSpeech native_client.')
    parser.add_argument('model', type=str,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('audio', type=str,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('alphabet', type=str,
                        help='Path to the configuration file specifying the alphabet used by the network')
    parser.add_argument('lm', type=str, nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument('trie', type=str, nargs='?',
                        help='Path to the language model trie file created with native_client/generate_trie')
    args = parser.parse_args()

    print('Loading model from file %s' % (args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files %s %s' % (args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end), file=sys.stderr)

    for path in sorted(glob.glob(args.audio)):
        target = os.path.splitext(path)[0] + '.txt'
        if os.path.exists(target):
            continue

        fs, audio = wav.read(path)
        # We can assume 16kHz
        audio_length = len(audio) * (1 / 16000)
        assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"
    
        print('Running inference of %s.' % path, file=sys.stderr)
        inference_start = timer()
        text = ds.stt(audio, fs)
        print(text)
        inference_end = timer() - inference_start
        print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)

        with open(target, 'w') as out:
            out.write(text)
Example 16
def main():
    print('Loading model from file %s' % MODEL, file=sys.stderr)
    model_load_start = timer()
    ds = Model(MODEL, N_FEATURES, N_CONTEXT, ALPHABET, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % model_load_end, file=sys.stderr)

    # Uncomment if you want to use a language model
    # =============================================

    # print('Loading language model from files %s %s' % (LANGUAGE_MODEL, TRIE), file=sys.stderr)
    # lm_load_start = timer()
    # ds.enableDecoderWithLM(ALPHABET, LANGUAGE_MODEL, TRIE, LM_WEIGHT,
    #                        WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
    # lm_load_end = timer() - lm_load_start
    # print('Loaded language model in %0.3fs.' % lm_load_end, file=sys.stderr)

    # audio file
    path_to_audio = 'data/sesq316qna.mp3'

    # change rate of audio file to 16kHz
    call = AudioSegment.from_file(path_to_audio)
    call = call.set_frame_rate(16000)
    # only analyze the first 2 minutes (2 * 60 * 1000)
    segment = call[:120000]

    # declare the new name of the audio file
    path = 'data/testing.wav'

    # export the audio file to wav format
    segment.export(path, format="wav")

    # read the new file again with the wav reader
    fs, audio = wav.read(path)
    # We can assume 16kHz
    audio_length = len(audio) * (1 / 16000)
    assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    prediction_text = ds.stt(audio, fs)
    print(prediction_text)
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length),
          file=sys.stderr)
Example 17
class DeepSpeechSTT(AbstractSTTEngine):
    """
    DeepSpeech Speech-to-Text implementation
    """

    SLUG = 'deepspeech'

    def __init__(self, vocabulary, graph="models/output_graph.pb",
                 alphabet="models/alphabet.txt"):

        self._logger = logging.getLogger(__name__)
        self._logger.debug("Initializing DeepSpeech with graph '%s' " +
                           "and alphabet '%s'", graph, alphabet)
        self._model = Model(graph, 26, 9, alphabet, 500)

    @classmethod
    def get_config(cls):
        # FIXME: Replace this as soon as we have a config module
        config = {}
        profile_path = jasperpath.config('profile.yml')

        if os.path.exists(profile_path):
            with open(profile_path, 'r') as f:
                profile = yaml.safe_load(f)
                try:
                    config['graph'] = profile['deepspeech']['graph']
                    config['alphabet'] = profile['deepspeech']['alphabet']
                except KeyError:
                    pass

        return config

    def transcribe(self, fp):
        """
        Performs STT, transcribing an audio file and returning the result.

        Arguments:
            fp -- a file object containing audio data
        """

        fs, audio = wav.read(fp)
        return self._model.stt(audio, fs)

    @classmethod
    def is_available(cls):
        return diagnose.check_python_import('deepspeech')
Example 18
class MozillaDeepSpeechASREngine(ASREngine):
    """https://github.com/mozilla/DeepSpeech"""
    def __init__(self,
                 model_path,
                 alphabet_path,
                 language_model_path=None,
                 trie_path=None):
        """
        Constructor.

        :param model_path: Absolute path to (acoustic) model file.
        :param alphabet_path: Absolute path to file containing alphabet.
        :param language_model_path: Absolute path to language model file. This parameter is optional. Set to
        enable decoding with language model.
        :param trie_path: Absolute path to trie. This parameter is optional. Set to enable decoding with language model.
        """

        # https://github.com/mozilla/DeepSpeech/blob/master/native_client/python/client.py
        self._model = Model(aModelPath=model_path,
                            aNCep=26,
                            aNContext=9,
                            aAlphabetConfigPath=alphabet_path,
                            aBeamWidth=500)

        if language_model_path is not None and trie_path is not None:
            self._model.enableDecoderWithLM(aAlphabetConfigPath=alphabet_path,
                                            aLMPath=language_model_path,
                                            aTriePath=trie_path,
                                            aLMWeight=1.75,
                                            aWordCountWeight=1.0,
                                            aValidWordCountWeight=1.0)
            self._with_language_model = True
        else:
            self._with_language_model = False

    def transcribe(self, path):
        pcm, sample_rate = soundfile.read(path)
        pcm = (np.iinfo(np.int16).max * pcm).astype(np.int16)

        return self._model.stt(pcm, aSampleRate=sample_rate)

    def __str__(self):
        if self._with_language_model:
            return 'Mozilla DeepSpeech (with language model)'
        else:
            return 'Mozilla DeepSpeech'
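The transcribe() method above rescales soundfile's floating-point samples (in [-1.0, 1.0]) to 16-bit PCM before calling stt(), since DeepSpeech expects int16 input. The conversion in isolation:

import numpy as np

pcm_float = np.array([-1.0, -0.5, 0.0, 0.5, 1.0])
pcm_int16 = (np.iinfo(np.int16).max * pcm_float).astype(np.int16)
print(pcm_int16)  # -> [-32767 -16383 0 16383 32767]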
Example 19
def Deep():
    try:
        if tkMessageBox.askyesno("Confirmation", "Would you like to proceed?"):

            BEAM_WIDTH = 500
            LM_WEIGHT = 1.75
            WORD_COUNT_WEIGHT = 1.00
            VALID_WORD_COUNT_WEIGHT = 1.00
            N_FEATURES = 26
            N_CONTEXT = 9

            ds = Model('models/models.pb', N_FEATURES, N_CONTEXT,
                       'models/alphabet.txt', BEAM_WIDTH)

            fs, audio = wav.read(audiofile.get())

            if fs != 16000:
                cbn = sox.Combiner()
                cbn.convert(samplerate=16000, n_channels=1)
                # write the converted audio to a file, then read it back
                # ('converted.wav' is a placeholder output path)
                cbn.build([str(audiofile.get())], 'converted.wav', 'concatenate')
                fs, audio = wav.read('converted.wav')

            audio_length = len(audio) * (1 / 16000)

            resultpage = Toplevel(parent)
            resultpage.title("Result")
            result_border = ttk.Frame(resultpage, padding=(12, 12, 12, 12))
            result_border.pack()
            result_page = Frame(result_border, bg="white")
            result_page.pack()

            Tkinter.Label(result_page,
                          text="What I've heard from you:",
                          font=14,
                          bg="white").grid(row=1, column=1, sticky=E)
            Tkinter.Label(result_page, textvariable=word, font=12,
                          bg="white").grid(row=2, column=2, sticky=E)

            word.set(ds.stt(audio, fs))

    except ValueError:
        tkMessageBox.showerror("Error!", "Only 16000Hz WAV files supported!")
    except IOError:
        tkMessageBox.showerror("Error!", "No file uploaded!")
Example 20
def main():
    ds = Model('./output_graph.pb', N_FEATURES, N_CONTEXT, './alphabet.txt',
               BEAM_WIDTH)

    r = sr.Recognizer()
    r.energy_threshold = 500

    with sr.Microphone(sample_rate=16000) as source:
        print('Say something!', file=sys.stdout)
        headDisplay.display_image(
            "/home/team18/Grasp-Detector-master/sawyer_head/what_fruit_would_you_like.JPG"
        )
        audio_temp = r.listen(source)
    # fs=44100

    print('Recording done!!!')

    with open("microphone-results.wav", "wb") as f:
        f.write(audio_temp.get_wav_data())
    time.sleep(1)

    fs, audio = wav.read('microphone-results.wav')

    theText = ds.stt(audio, fs)
    print(theText)

    final_value = -1
    if "av" in theText:
        final_value = 2
    elif "ap" in theText:
        final_value = 1
    elif "ba" in theText:
        final_value = 3
    elif "or" in theText:
        final_value = 5
    else:
        final_value = 7

    with open("finalvalue.txt", "wb") as f:
        f.write(str(final_value))
    time.sleep(0.5)
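The chain of substring checks above maps recognized fruit names to IDs. An equivalent table-driven version, shown for comparison (same values and the same first-match priority as the branches above):

FRUIT_IDS = {'av': 2, 'ap': 1, 'ba': 3, 'or': 5}

def fruit_id(text, default=7):
    # dicts preserve insertion order, so 'av' is checked before 'ap', etc.
    for key, value in FRUIT_IDS.items():
        if key in text:
            return value
    return default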
Example 21
def main():
    parser = argparse.ArgumentParser(description='Benchmarking tooling for DeepSpeech native_client.')
    parser.add_argument('model', type=str,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('alphabet', type=str,
                        help='Path to the configuration file specifying the alphabet used by the network')
    parser.add_argument('lm', type=str, nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument('trie', type=str, nargs='?',
                        help='Path to the language model trie file created with native_client/generate_trie')
    parser.add_argument('audio', type=str,
                        help='Path to the audio file to run (WAV format)')
    args = parser.parse_args()

    print('Loading model from file %s' % (args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files %s %s' % (args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end), file=sys.stderr)

    fs, audio = wav.read(args.audio)
    if fs != 16000:
        if fs < 16000:
            print('Warning: original sample rate (%d) is lower than 16kHz. Up-sampling might produce erratic speech recognition.' % (fs), file=sys.stderr)
        fs, audio = convert_samplerate(args.audio)
    audio_length = len(audio) * ( 1 / 16000)

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
Example 22
class DeepSpeech:
    """Wrap DeepSpeech and provide the methods we need"""

    def __init__(self, settings):

        self.beam_width = 1024
        self.lm_weight = 1.75
        self.word_count_weight = 1.00
        self.valid_word_count_weight = 1.00
        self.n_features = 26
        self.n_context = 9
        self.alphabet = settings.get('alphabet')
        self.lm = settings.get('lm')
        self.trie = settings.get('trie')
        self.graph = settings.get('graph')

    def load_model(self):
        start = timer()
        self.model = Model(self.graph, self.n_features, self.n_context, self.alphabet, self.beam_width)
        end = timer()
        print('Loaded model in %0.3fs.' % (end - start))
        if self.lm is not None and self.trie is not None:
            start = timer()
            self.model.enableDecoderWithLM(
                self.alphabet, self.lm,
                self.trie, self.lm_weight,
                self.word_count_weight,
                self.valid_word_count_weight
            )
            end = timer()
            print('Loaded language model in %0.3fs.' % (end - start))

    def oneshoot(self, wav_file):
        fs, audio = wav.read(wav_file)
        start = timer()
        result = self.model.stt(audio, fs)
        latency = timer() - start
        audio_length = len(audio) * ( 1 / 16000)
        return result, latency
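A hypothetical usage sketch for this wrapper (all paths are placeholders):

settings = {'graph': 'models/output_graph.pb',
            'alphabet': 'models/alphabet.txt',
            'lm': 'models/lm.binary',
            'trie': 'models/trie'}
engine = DeepSpeech(settings)
engine.load_model()
text, latency = engine.oneshoot('sample.wav')
print('%s (%.3fs)' % (text, latency))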
Example 23
class SpeechToText():
    def __init__(self, model_path):
        # Defined constants. See https://github.com/mozilla/DeepSpeech/blob/master/native_client/python/client.py
        BEAM_WIDTH = 500
        LM_WEIGHT = 1.75
        WORD_COUNT_WEIGHT = 1.00
        VALID_WORD_COUNT_WEIGHT = 1.00
        N_FEATURES = 26
        N_CONTEXT = 9

        model = os.path.join(model_path, "output_graph.pb")
        alphabet = os.path.join(model_path, "alphabet.txt")
        lm = os.path.join(model_path, "lm.binary")
        trie = os.path.join(model_path, "trie")

        self.model = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
        self.model.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                                       WORD_COUNT_WEIGHT,
                                       VALID_WORD_COUNT_WEIGHT)

    def run(self, audio, fs):
        return self.model.stt(audio, fs)
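Hypothetical usage (the 'models' directory and 'sample.wav' are placeholders): read a 16 kHz WAV and pass the samples and rate through run():

import scipy.io.wavfile as wav

stt = SpeechToText('models')
fs, audio = wav.read('sample.wav')
print(stt.run(audio, fs))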
Example 24
def recognize_deepspeech(audio):

    model = path.join(path.dirname(path.realpath(__file__)),
                      'models/output_graph.pb')
    alphabet = path.join(path.dirname(path.realpath(__file__)),
                         'models/alphabet.txt')
    lm = path.join(path.dirname(path.realpath(__file__)), 'models/lm.binary')
    trie = path.join(path.dirname(path.realpath(__file__)), 'models/trie')

    #print('Loading model from file %s' % (model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    #print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

    if lm and trie:
        #print('Loading language model from files %s %s' % (lm, trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        #print('Loaded language model in %0.3fs.' % (lm_load_end), file=sys.stderr)

    fs, audio = wav.read(audio)
    if fs != 16000:
        if fs < 16000:
            print(
                'Warning: original sample rate (%d) is lower than 16kHz. Up-sampling might produce erratic speech recognition.'
                % (fs),
                file=sys.stderr)
        fs, audio = convert_samplerate(audio)
    audio_length = len(audio) * (1 / 16000)

    #print('Running inference.', file=sys.stderr)
    #inference_start = timer()
    #inference_end = timer() - inference_start
    return ds.stt(audio, fs)
Example 25
def main():
    parser = argparse.ArgumentParser(
        description='Benchmarking tooling for DeepSpeech native_client.')
    parser.add_argument('model',
                        type=str,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('audio',
                        type=str,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument(
        'alphabet',
        type=str,
        help=
        'Path to the configuration file specifying the alphabet used by the network'
    )
    parser.add_argument('lm',
                        type=str,
                        nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument(
        'trie',
        type=str,
        nargs='?',
        help=
        'Path to the language model trie file created with native_client/generate_trie'
    )
    args = parser.parse_args()

    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)

    if args.lm and args.trie:
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)

    fs, audio = wav.read(args.audio)
    print(ds.stt(audio, fs))
Example 26
    print('rank: ', str(rank))
    # Get Audio Filename
    vf = file_list[_file_id]
    print('file: ', vf)
    print(' ')
    file_path, file_name = os.path.split(vf)
    folder_name = audio_dir + "/rank_" + str(rank)
    try:
        os.makedirs(folder_name)
    except OSError:
        print("Directory %s exists \n" % folder_name)
    # model path and alphabet file
    ds = Model('/home/ubuntu/deepspeech/models/output_graph.pb', 26, 9,
               '/home/ubuntu/deepspeech/models/alphabet.txt', 500)
    fs, audio = wav.read(vf)
    processed_data = ds.stt(audio, fs)
    #processed_data=ds.stt(audio.flatten(),fs)

    separate_save = str(folder_name) + '-' + str(file_name) + '-data.txt'
    with open(separate_save, 'a+') as f:
        f.write(processed_data)  # save this file's transcription

    # Audio to text
    data_save = 'AudioData.txt'
    with open(data_save, 'a+') as f:
        f.write(processed_data + '\r\r')  # append to the combined transcript
    try:
        print('\nDeepSpeech says, "...' + str(processed_data) +
              '..."\n\nThe data has been stored in file: ' + str(data_save) +
              '\n')
    except:
        pass
Example 27
def deepspeech_main():
    # These constants control the beam search decoder

    # Beam width used in the CTC decoder when building candidate transcriptions
    BEAM_WIDTH = 500

    # The alpha hyperparameter of the CTC decoder. Language Model weight
    LM_WEIGHT = 1.75

    # The beta hyperparameter of the CTC decoder. Word insertion weight (penalty)
    WORD_COUNT_WEIGHT = 1.00

    # Valid word insertion weight. This is used to lessen the word insertion penalty
    # when the inserted word is part of the vocabulary
    VALID_WORD_COUNT_WEIGHT = 1.00

    # These constants are tied to the shape of the graph used (changing them changes
    # the geometry of the first layer), so make sure you use the same constants that
    # were used during training

    # Number of MFCC features to use
    N_FEATURES = 26

    # Size of the context window used for producing timesteps in the input vector
    N_CONTEXT = 9

    parser = argparse.ArgumentParser(
        description='Benchmarking tooling for DeepSpeech native_client.')
    parser.add_argument('model',
                        type=str,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('audio',
                        type=str,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument(
        'alphabet',
        type=str,
        help=
        'Path to the configuration file specifying the alphabet used by the network'
    )
    parser.add_argument('lm',
                        type=str,
                        nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument(
        'trie',
        type=str,
        nargs='?',
        help=
        'Path to the language model trie file created with native_client/generate_trie'
    )
    args = parser.parse_args()

    print('Loading model from file %s' % (args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files %s %s' % (args.lm, args.trie),
              file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end),
              file=sys.stderr)

    fs, audio = wav.read(args.audio)
    # We can assume 16kHz
    audio_length = len(audio) * (1 / 16000)
    assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length),
          file=sys.stderr)
Example 28
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function
import sys
import scipy.io.wavfile as wav
from deepspeech.model import Model
import time
print('imports ok')

model2 = '/home/nvidia/DeepSpeech/data/ldc93s1/model/output_graph.pb'
micro2 = '/home/nvidia/DeepSpeech/data/ldc93s1/LDC93S1.wav'

ds = Model(model2, 26, 9)  # model path, cepstrum count, context window
print('Model ok')

while True:
    print('reading wav')
    fs, audio = wav.read(micro2)
    print(ds.stt(audio, fs))
Example 29
def main(options):
    # Ensure ffmpeg is around
    if not run_ffmpeg(['-version']):
        log.error(
            "ffmpeg needs to be available to strip audio from the video file.")
        exit(1)

    with NamedTemporaryFile(delete=True) as vid_file:
        log.info("Downloading %s - this might take a while." % options.vid_url)
        response = get(options.vid_url, stream=True)
        total_length = response.headers.get("content-length")
        if total_length is None:  # no content length header
            log.info("Unknown length - can't predict how long this will take.")
            vid_file.write(response.content)
        else:
            bar = ProgressBar(max_value=int(total_length))
            dl = 0
            for data in response.iter_content(chunk_size=DOWNLOAD_CHUNK_SIZE):
                dl += len(data)
                vid_file.write(data)
                vid_file.flush()
                bar.update(dl)

        log.info("Download done. Stripping audio.")
        (wav_file, wav_file_name) = mkstemp('.wav')
        result = run_ffmpeg([
            "-y", "-i", vid_file.name, "-vn", "-acodec", "pcm_s16le", "-ar",
            "16000", "-ac", "1", wav_file_name
        ])
        if not result:
            close(wav_file)
            log.error("ffmpeg failed. Bailing.")
            exit(1)

        fs, audio = wav.read(wav_file_name)
        close(wav_file)

    log.info("Will write VTT to %s" % options.output)
    # Make sure the WAV is to code...
    log.info("Loading up WAV file...")

    if fs != 16000:
        log.error("Only 16000hz WAV files are usable.")
        exit(1)

    total_samples = len(audio)
    duration_hours, duration_minutes, duration_seconds = sample_index_to_time(
        len(audio))
    log.info("Approximate duration: %d:%02d:%02d" %
             (duration_hours, duration_minutes, duration_seconds))

    # Let's load up DeepSpeech and get it ready.
    log.info("Loading pre-trained DeepSpeech model...")
    root_model_dir = path.join(options.deepspeech_model_dir, MODEL_DIR)

    model = path.join(root_model_dir, MODEL_FILE)
    alphabet = path.join(root_model_dir, MODEL_ALPHABET)
    lang_model = path.join(root_model_dir, MODEL_LANG_MODEL)
    trie = path.join(root_model_dir, MODEL_TRIE)

    deepspeech = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    log.info("Done loading model.")

    log.info("Loading language model...")
    deepspeech.enableDecoderWithLM(alphabet, lang_model, trie, LM_WEIGHT,
                                   WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
    log.info("Done loading model.")

    playhead = 0

    out = WebVTTFile()

    bar = ProgressBar(max_value=total_samples)
    while playhead < (total_samples - 1):
        end_point = min(playhead + AUDIO_SEGMENT_SAMPLES, (total_samples - 1))
        segment = audio[playhead:end_point]
        inference = deepspeech.stt(segment, fs)
        log.debug("Inferred: %s" % inference)

        start_hours, start_minutes, start_seconds = sample_index_to_time(
            playhead)
        playhead = end_point
        end_hours, end_minutes, end_seconds = sample_index_to_time(playhead)

        if not inference or inference == "ah":
            continue

        for search, replace in INFERENCE_REPLACEMENTS.items():
            inference = sub(r"\b" + search + r"\b", replace, inference)

        inference = fill(inference, width=MAX_CAPTION_WIDTH)

        start = WebVTTTime(start_hours, start_minutes, start_seconds)
        end = WebVTTTime(end_hours, end_minutes, end_seconds)

        item = WebVTTItem(0, start, end, inference)
        out.append(item)
        bar.update(playhead)

        out.save(options.output, encoding="utf-8")

    out.clean_indexes()
    out.save(options.output, encoding="utf-8")
Example 30
def main():
    # Use the following for defaults
    #  model       /home/dalonlobo/deepspeech_models/models/output_graph.pb
    #  audio       /home/dalonlobo/deepspeech_models/models/2830-3980-0043.wav
    #  alphabet    /home/dalonlobo/deepspeech_models/lm_models/alphabet.txt
    #  lm          /home/dalonlobo/deepspeech_models/lm_models/lm_o5.binary
    #  trie        /home/dalonlobo/deepspeech_models/lm_models/o5_trie
    parser = argparse.ArgumentParser(
        description='Benchmarking tooling for DeepSpeech native_client.')
    parser.add_argument(
        'model',
        type=str,
        nargs='?',
        default='/home/dalonlobo/deepspeech_models/models/output_graph.pb',
        help='Path to the model (protocol buffer binary file)')
    parser.add_argument(
        'audio',
        type=str,
        nargs='?',
        default='/home/dalonlobo/deepspeech_models/models/2830-3980-0043.wav',
        help='Path to the audio file to run (WAV format)')
    parser.add_argument(
        'alphabet',
        type=str,
        nargs='?',
        default='/home/dalonlobo/deepspeech_models/lm_models/alphabet.txt',
        help=
        'Path to the configuration file specifying the alphabet used by the network'
    )
    parser.add_argument(
        'lm',
        type=str,
        nargs='?',
        default='/home/dalonlobo/deepspeech_models/lm_models/lm_o5.binary',
        help='Path to the language model binary file')
    parser.add_argument(
        'trie',
        type=str,
        nargs='?',
        default='/home/dalonlobo/deepspeech_models/lm_models/o5_trie',
        help=
        'Path to the language model trie file created with native_client/generate_trie'
    )
    args = parser.parse_args()

    print('Loading model from file %s' % (args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files %s %s' % (args.lm, args.trie),
              file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end),
              file=sys.stderr)

    fs, audio = wav.read(args.audio)
    # We can assume 16kHz
    audio_length = len(audio) * (1 / 16000)

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length),
          file=sys.stderr)
Example 31
class DeepSpeechSTTPlugin(plugin.STTPlugin):
    """
    Speech-To-Text implementation which relies on the DeepSpeech API.
    """
    def __init__(self, *args, **kwargs):
        """
        Create Plugin Instance
        """
        plugin.STTPlugin.__init__(self, *args, **kwargs)
        self._logger = logging.getLogger(__name__)
        self._logger.info("Init DeepSpeech")
        self._logger.debug(str(self.profile))

        if not deepspeech_available:
            self._logger.warning("DeepSpeech import error!")
        #    raise ImportError("DeepSpeech not installed!")

        self._logger.warning("This STT plugin doesn't have multilanguage " +
                             "support!")
        # Beam width used in the CTC decoder when building candidate
        # transcriptions
        try:
            self._BEAM_WIDTH = self.profile["deepspeech"]["beam_width"]
        except KeyError:
            self._BEAM_WIDTH = 500

        # The alpha hyperparameter of the CTC decoder. Language Model weight
        try:
            self._LM_WEIGHT = self.profile["deepspeech"]["lm_weight"]
        except KeyError:
            self._LM_WEIGHT = 1.75

        # The beta hyperparameter of the CTC decoder. Word insertion weight
        # (penalty)
        try:
            self._WORD_COUNT_WEIGHT = self.profile["deepspeech"][
                "word_count_weight"]
        except KeyError:
            self._WORD_COUNT_WEIGHT = 1.00

        # Valid word insertion weight. This is used to lessen the word
        # insertion penalty when the inserted word is part of the vocabulary
        try:
            self._VALID_WORD_COUNT_WEIGHT = self.profile["deepspeech"][
                "valid_word_count_weight"]
        except KeyError:
            self._VALID_WORD_COUNT_WEIGHT = 1.00

        # These constants are tied to the shape of the graph used (changing
        # them changes the geometry of the first layer), so make sure you
        # use the same constants that were used during training

        # Number of MFCC features to use
        try:
            self._N_FEATURES = self.profile["deepspeech"]["n_features"]
        except KeyError:
            self._N_FEATURES = 26

        # Size of the context window used for producing timesteps in the
        # input vector
        try:
            self._N_CONTEXT = self.profile["deepspeech"]["n_context"]
        except KeyError:
            self._N_CONTEXT = 9

        # Only 16KHz files are currently supported
        try:
            self._FS = self.profile["deepspeech"]["fs"]
        except KeyError:
            self._FS = 16000

        # These are paths. They are required

        # Path to the model (protocol buffer binary file)
        self._MODEL = self.profile["deepspeech"]["model"]
        if (not os.path.exists(self._MODEL)):
            msg = ("DeepSpeech model '%s' does not exist! " +
                   "Please make sure that you have set the " +
                   "correct deepspeech: model in your profile.") % self._MODEL
            self._logger.error(msg)
            raise RuntimeError(msg)

        # Path to the configuration file specifying the alphabet used
        self._ALPHABET = self.profile["deepspeech"]["alphabet"]
        if (not os.path.exists(self._ALPHABET)):
            msg = ("DeepSpeech alphabet '%s' does not exist! " +
                   "Please make sure that you have set the " +
                   "correct deepspeech: alphabet in your profile."
                   ) % self._ALPHABET
            self._logger.error(msg)
            raise RuntimeError(msg)

        # Path to the language model binary file
        self._LM = self.profile["deepspeech"]["language_model"]
        if (not os.path.exists(self._LM)):
            msg = ("DeepSpeech language model '%s' does not exist! " +
                   "Please make sure that you have set the correct " +
                   "deepspeech: language_model in your profile.") % self._LM
            self._logger.error(msg)
            raise RuntimeError(msg)

        # Path to the language model trie file created with
        # native_client/generate_trie
        self._TRIE = self.profile["deepspeech"]["trie"]
        if (not os.path.exists(self._TRIE)):
            msg = ("DeepSpeech trie '%s' does not exist! " +
                   "Please make sure that you have set the " +
                   "correct deepspeech: trie in your profile.") % self._TRIE
            self._logger.error(msg)
            raise RuntimeError(msg)
        self._ds = Model(self._MODEL, self._N_FEATURES, self._N_CONTEXT,
                         self._ALPHABET, self._BEAM_WIDTH)
        self._ds.enableDecoderWithLM(self._ALPHABET, self._LM, self._TRIE,
                                     self._LM_WEIGHT, self._WORD_COUNT_WEIGHT,
                                     self._VALID_WORD_COUNT_WEIGHT)

    def transcribe(self, fp):
        """
        transcribe given audio file object fp and return the result.
        """
        fp.seek(0)
        fs, audio = wav.read(fp)
        # We can assume 16kHz
        # audio_length = len(audio) * (1 / self._FS)
        assert fs == self._FS, (
            "Only %dHz input WAV files are supported for now!" % self._FS)

        text = self._ds.stt(audio, self._FS)

        transcribed = [text.upper()]

        return transcribed
Example 32
class DeepSpeechSTTPlugin(plugin.STTPlugin):
    """
    Speech-To-Text implementation which relies on the DeepSpeech API.
    """
    def __init__(self, *args, **kwargs):
        """
        Create Plugin Instance
        """
        plugin.STTPlugin.__init__(self, *args, **kwargs)
        self._logger = logging.getLogger(__name__)

        if not deepspeech_available:
            self._logger.warning("DeepSpeech import error!")
        #    raise ImportError("DeepSpeech not installed!")

        self._logger.warning("This STT plugin doesn't have multilanguage " +
                             "support!")
        # Beam width used in the CTC decoder when building candidate transcriptions
        try:
            self._BEAM_WIDTH = self.profile["deepspeech"]["beam_width"]
        except KeyError:
            self._BEAM_WIDTH = 500

        # The alpha hyperparameter of the CTC decoder. Language Model weight
        try:
            self._LM_WEIGHT = self.profile["deepspeech"]["lm_weight"]
        except KeyError:
            self._LM_WEIGHT = 1.75

        # The beta hyperparameter of the CTC decoder. Word insertion weight (penalty)
        try:
            self._WORD_COUNT_WEIGHT = self.profile["deepspeech"][
                "word_count_weight"]
        except KeyError:
            self._WORD_COUNT_WEIGHT = 1.00

        # Valid word insertion weight. This is used to lessen the word insertion penalty
        # when the inserted word is part of the vocabulary
        try:
            self._VALID_WORD_COUNT_WEIGHT = self.profile["deepspeech"][
                "valid_word_count_weight"]
        except KeyError:
            self._VALID_WORD_COUNT_WEIGHT = 1.00

        # These constants are tied to the shape of the graph used (changing them changes
        # the geometry of the first layer), so make sure you use the same constants that
        # were used during training

        # Number of MFCC features to use
        try:
            self._N_FEATURES = self.profile["deepspeech"]["n_features"]
        except KeyError:
            self._N_FEATURES = 26

        # Size of the context window used for producing timesteps in the input vector
        try:
            self._N_CONTEXT = self.profile["deepspeech"]["n_context"]
        except KeyError:
            self._N_CONTEXT = 9

        # Only 16KHz files are currently supported
        try:
            self._FS = self.profile["deepspeech"]["fs"]
        except KeyError:
            self._FS = 16000

        # Save the output for inspection?
        try:
            self._save_input = self.profile["deepspeech"]["save_input"]
        except KeyError:
            self._save_input = False

        # These are paths. They are required

        # Path to the model (protocol buffer binary file)
        self._MODEL = self.profile["deepspeech"]["model"]
        if (not os.path.exists(self._MODEL)):
            msg = (
                "DeepSpeech model '%s' does not exist! Please make sure that you "
                + "have set the correct deepspeech: model in your profile."
            ) % self._MODEL
            self._logger.error(msg)
            raise RuntimeError(msg)

        # Path to the configuration file specifying the alphabet used
        self._ALPHABET = self.profile["deepspeech"]["alphabet"]
        if (not os.path.exists(self._ALPHABET)):
            msg = (
                "DeepSpeech alphabet '%s' does not exist! Please make sure that you "
                + "have set the correct deepspeech: alphabet in your profile."
            ) % self._ALPHABET
            self._logger.error(msg)
            raise RuntimeError(msg)

        # Path to the language model binary file
        self._LM = self.profile["deepspeech"]["language_model"]
        if (not os.path.exists(self._LM)):
            msg = (
                "DeepSpeech language model '%s' does not exist! Please make sure that you "
                +
                "have set the correct deepspeech: language_model in your profile."
            ) % self._LM
            self._logger.error(msg)
            raise RuntimeError(msg)

        # Path to the language model trie file created with native_client/generate_trie
        self._TRIE = self.profile["deepspeech"]["trie"]
        if (not os.path.exists(self._TRIE)):
            msg = (
                "DeepSpeech trie '%s' does not exist! Please make sure that you "
                + "have set the correct deepspeech: trie in your profile."
            ) % self._TRIE
            self._logger.error(msg)
            raise RuntimeError(msg)
        self._ds = Model(self._MODEL, self._N_FEATURES, self._N_CONTEXT,
                         self._ALPHABET, self._BEAM_WIDTH)
        self._ds.enableDecoderWithLM(self._ALPHABET, self._LM, self._TRIE,
                                     self._LM_WEIGHT, self._WORD_COUNT_WEIGHT,
                                     self._VALID_WORD_COUNT_WEIGHT)
        # Create the audiolog if it does not exist
        self._audiolog = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "audiolog")
        if not os.path.exists(self._audiolog):
            os.makedirs(self._audiolog)
        # Clear the audiolog
        files = os.listdir(self._audiolog)
        for file in files:
            if file.endswith(".wav"):
                self._logger.info("to delete: %s" %
                                  os.path.join(self._audiolog, file))
                os.remove(os.path.join(self._audiolog, file))
        self._filecount = 0

    def transcribe(self, fp):
        """
        transcribe given audio file object fp and return the result.
        """

        fs, audio = wav.read(fp)
        # We can assume 16kHz
        audio_length = len(audio) * (1 / self._FS)
        assert fs == self._FS, "Only %dHz input WAV files are supported for now!" % self._FS

        text = self._ds.stt(audio, self._FS)

        transcribed = [text.upper()]
        print('>> %r' % transcribed)

        # write the output to a log file
        if (self._save_input and not transcribed == ['']):
            self._filecount += 1
            fp.seek(0)  # rewind so the raw WAV bytes can be copied out
            with open(
                    os.path.join(self._audiolog,
                                 "%d_%s.wav" % (self._filecount, text)),
                    "wb") as f:
                f.write(fp.read())

        return transcribed